# Scraper plugin for stories hosted on tedlouis.com.
import re
from datetime import datetime
from urllib.parse import urljoin

import httpx
from bs4 import BeautifulSoup, NavigableString

from .base import BaseScraper
# Root URL of the site this scraper targets.
TED_BASE = "https://tedlouis.com/"
class TedLouisScraper(BaseScraper):
    """Scraper for stories hosted on tedlouis.com.

    The site requires no authentication. A story's chapters are discovered
    by scanning the ``ul.story-index-list`` columns on the story index page;
    individual chapter text lives in ``div#chapter`` on each chapter page.
    """

    @classmethod
    def matches(cls, url: str) -> bool:
        """Return True when *url* points at tedlouis.com."""
        return "tedlouis.com" in url

    async def login(self, client: httpx.AsyncClient, username: str, password: str) -> bool:
        """Report success without making a request — the site is public."""
        return True  # no login required

    @staticmethod
    def _parse_updated_date(raw: str) -> str:
        """Normalize a human-readable date to an ISO string.

        Accepts ``"Month D, YYYY"`` (→ ``YYYY-MM-DD``) or ``"Month YYYY"``
        (→ ``YYYY-MM-01``). Returns ``""`` when *raw* matches neither format.
        """
        for in_fmt, out_fmt in (("%B %d, %Y", "%Y-%m-%d"), ("%B %Y", "%Y-%m-01")):
            try:
                return datetime.strptime(raw, in_fmt).strftime(out_fmt)
            except ValueError:
                continue
        return ""

    async def fetch_book_info(self, client: httpx.AsyncClient, url: str) -> dict:
        """Fetch the story index page at *url* and return book metadata.

        Returns a dict with title, author, status, updated date and the list
        of chapter links. Raises ValueError when *url* is a chapter page
        rather than the story index page.
        """
        r = await client.get(url)
        soup = BeautifulSoup(r.text, "html.parser")

        # A chapter page carries the story <h1> but lacks the index page's
        # <h2 class="story-page-title">; fail early with an actionable error.
        if soup.find("h1", class_="story-title") and not soup.find("h2", class_="story-page-title"):
            raise ValueError(
                "Voer de story index-URL in, geen chapter-URL. "
                "Kopieer de URL van de verhaal-indexpagina (de pagina met de hoofdstukkenlijst)."
            )

        # Title: only the direct NavigableString children of the <h2>, so
        # nested elements like the "Back" link and author byline are ignored.
        book_title = "Unknown title"
        title_el = soup.find("h2", class_="story-page-title")
        if title_el:
            parts = [
                str(c).strip()
                for c in title_el.children
                if isinstance(c, NavigableString) and str(c).strip()
            ]
            book_title = " ".join(parts) or title_el.get_text(strip=True)

        # Author: linked name inside the byline span (may sit inside the h2
        # or elsewhere on the page).
        author = "Unknown author"
        byline = soup.find("span", class_="story-author-by-line")
        if byline:
            a = byline.find("a")
            if a:
                author = a.get_text(strip=True)

        # Publication status, with any leading "Status:" label stripped.
        publication_status = ""
        status_el = soup.find("span", class_="story-status-text")
        if status_el:
            raw = status_el.get_text(strip=True)
            publication_status = re.sub(r"^Status:\s*", "", raw, flags=re.I).strip()

        # "Last Updated: Month D, YYYY" → "YYYY-MM-DD"; "" when absent or unparseable.
        updated_date = ""
        updated_el = soup.find("span", class_="story-last-updated")
        if updated_el:
            raw = re.sub(
                r"^Last\s+Updated:\s*", "", updated_el.get_text(strip=True), flags=re.I
            ).strip()
            updated_date = self._parse_updated_date(raw)

        # Chapter links: every anchor in every story-index-list column,
        # resolved against the final (post-redirect) URL and de-duplicated
        # while preserving first-seen order.
        actual_url = str(r.url)
        chapter_links: list[dict] = []
        seen: set[str] = set()
        for ul in soup.find_all("ul", class_="story-index-list"):
            for li in ul.find_all("li"):
                a = li.find("a", href=True)
                if not a:
                    continue
                full_url = urljoin(actual_url, a["href"])
                if full_url in seen:
                    continue
                seen.add(full_url)
                chapter_links.append({"url": full_url, "title": a.get_text(strip=True)})

        return {
            "title": book_title,
            "author": author,
            "publisher": "tedlouis.com",
            "series": "",
            "series_index_hint": 0,
            "genres": [],
            "subgenres": [],
            "tags": [],
            "description": "",
            "updated_date": updated_date,
            "publication_status": publication_status,
            "source_url": url,
            "chapters": chapter_links,
            "chapter_method": "html_scan",
            "index_image_url": None,
        }

    async def fetch_chapter(self, client: httpx.AsyncClient, ch: dict) -> dict:
        """Fetch one chapter page and return its title and content element.

        *ch* is one entry from ``fetch_book_info()``'s ``chapters`` list.
        ``content_el`` is the ``div#chapter`` bs4 element with headings and
        copyright boilerplate removed, or None when the div is missing.
        """
        cr = await client.get(ch["url"])
        csoup = BeautifulSoup(cr.text, "html.parser")
        title = ch["title"]

        # Prefer the on-page chapter title:
        # <h2 class="chapter-title"><span>…</span></h2>
        chapter_h2 = csoup.find("h2", class_="chapter-title")
        if chapter_h2:
            span = chapter_h2.find("span")
            refined = (span or chapter_h2).get_text(strip=True)
            if refined:
                title = refined

        content_el = csoup.find("div", id="chapter")

        if content_el:
            # Strip the story title, repeated chapter title, and copyright
            # blocks so only the chapter prose remains.
            for el in content_el.find_all("h1", class_="story-title"):
                el.decompose()
            for el in content_el.find_all("h2", class_="chapter-title"):
                el.decompose()
            for el in content_el.find_all("div", class_="chapter-copyright-line"):
                el.decompose()
            for el in content_el.find_all(
                "div", class_=re.compile(r"chapter-copyright-notice", re.I)
            ):
                el.decompose()

        return {
            "title": title,
            "content_el": content_el,
            "selector_id": "chapter",
            "selector_class": None,
        }
|