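"""Scraper for new.nifty.org story pages.

Builds book metadata from a story's index page and extracts chapter text from
the rendered <article> HTML, falling back to escaped paragraph markup in the
Next.js payload and, as a last resort, the ``?_rsc=1`` RSC stream. Boilerplate
paragraphs (disclaimers, donation appeals) are moved into HTML comments so the
chapter body stays clean without discarding them.
"""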
import json
import re
from html import unescape as html_unescape
from urllib.parse import urlparse

import httpx
from bs4 import BeautifulSoup, Comment

from .base import BaseScraper


class NiftyNewScraper(BaseScraper):
    _LEAD_MARKERS = (
        "notice this is a work of fiction",
        "if it is illegal to read stories",
        "if you enjoy this story",
        "for my other stories",
        "nifty archive",
        "code of conduct",
        "author note",
        "author's note",
        "disclaimer",
        "this story contains",
        "this story includes",
        "all characters are",
        "all characters depicted",
    )
    _TAIL_MARKERS = (
        "please remember to donate",
        "donate",
        "support nifty",
        "support the archive",
        "nifty archive alliance",
        "donate.nifty.org",
        "nifty.org/donate",
        "nifty.org/support",
        "patreon",
        "buy me a coffee",
        "tip jar",
        "become a supporter",
    )

    @classmethod
    def matches(cls, url: str) -> bool:
        return "new.nifty.org" in url

    async def login(self, client: httpx.AsyncClient, username: str, password: str) -> bool:
        return True  # no login required

    # ── Helpers ───────────────────────────────────────────────────────────────

    def _to_index_url(self, url: str) -> str:
        """Strip trailing chapter number, return story index URL.

        /stories/some-slug-83036/3 → /stories/some-slug-83036
        /stories/some-slug-83036 → /stories/some-slug-83036
        """
        parsed = urlparse(url)
        path = re.sub(r"/\d+$", "", parsed.path.rstrip("/"))
        return f"{parsed.scheme}://{parsed.netloc}{path}"

    def _parse_date(self, iso: str) -> str:
        """Return YYYY-MM-DD from an ISO datetime string, or ''."""
        if not iso:
            return ""
        return iso[:10]

    # ── BaseScraper interface ─────────────────────────────────────────────────

    async def fetch_book_info(self, client: httpx.AsyncClient, url: str) -> dict:
        index_url = self._to_index_url(url)
        r = await client.get(index_url)
        soup = BeautifulSoup(r.text, "html.parser")

        # Title: <h1>, fallback to <title> (strip "- … - Nifty Archive …" suffix)
        h1 = soup.find("h1")
        if h1:
            title = h1.get_text(strip=True)
        else:
            title_el = soup.find("title")
            raw = title_el.get_text(strip=True) if title_el else ""
            title = re.split(r"\s+[-–]\s+", raw)[0].strip() if raw else ""

        # Author: <strong itemprop="name"> inside /authors/ link
        author = "Unknown author"
        author_link = soup.find("a", href=re.compile(r"^/authors/\d+"))
        if author_link:
            name_el = author_link.find("strong", itemprop="name")
            if name_el:
                author = name_el.get_text(strip=True)

        # Dates: <time itemprop="datePublished/dateModified">
        pub_el = soup.find("time", itemprop="datePublished")
        mod_el = soup.find("time", itemprop="dateModified")
        pub_date = self._parse_date(pub_el.get("datetime", "") if pub_el else "")
        updated_date = self._parse_date(mod_el.get("datetime", "") if mod_el else "") or pub_date

        # Tags: from all <ul aria-label="Tags"> containers (category links + generated tags)
        tags: list[str] = []
        seen: set[str] = set()
        for ul in soup.find_all("ul", attrs={"aria-label": "Tags"}):
            for a in ul.find_all("a", href=True):
                label = a.get_text(strip=True)
                if label and label.lower() not in seen:
                    seen.add(label.lower())
                    tags.append(label)

        # Description: <meta name="description">
        desc = ""
        meta_desc = soup.find("meta", attrs={"name": "description"})
        if meta_desc and meta_desc.get("content"):
            desc = meta_desc["content"].strip()

        # Chapters: find /stories/{slug}/N links in the page HTML
        slug_path = urlparse(index_url).path  # e.g. /stories/some-slug-83036
        chapter_pattern = re.compile(r"^" + re.escape(slug_path) + r"/(\d+)$")

        chapter_nums: set[int] = set()
        for a in soup.find_all("a", href=True):
            m = chapter_pattern.match(a["href"])
            if m:
                chapter_nums.add(int(m.group(1)))

        # Fallback: scan RSC stream for chapter index values
        if not chapter_nums:
            for m in re.finditer(r'"index"\s*:\s*(\d+)', r.text):
                chapter_nums.add(int(m.group(1)))

        if not chapter_nums:
            chapter_nums = {1}

        chapters = [
            {"url": f"{index_url}/{i}", "title": f"Chapter {i}"}
            for i in range(1, max(chapter_nums) + 1)
        ]

        return {
            "title": title,
            "author": author,
            "publisher": "nifty.org",
            "series": "",
            "series_index_hint": 0,
            "genres": [],
            "subgenres": [],
            "tags": tags,
            "description": desc,
            "updated_date": updated_date,
            "publication_status": "",
            "source_url": index_url,
            "chapters": chapters,
            "chapter_method": "html_scan",
            "index_image_url": None,
        }

    # ── RSC parser ───────────────────────────────────────────────────────────

    def _parse_rsc_paragraphs(self, rsc_text: str) -> list[str]:
        """Extract story paragraph text from a Next.js RSC stream.

        The RSC format is a series of lines: ``{hex_id}:{json_value}``.
        Each line that represents a <p> element looks like:
            2c:["$","p",null,{"children":"Paragraph text."}]
        """
        paragraphs: list[str] = []
        for line in rsc_text.splitlines():
            colon = line.find(":")
            if colon < 0:
                continue
            try:
                node = json.loads(line[colon + 1:])
            except Exception:
                continue
            paragraphs.extend(self._rsc_find_paragraphs(node))
        return paragraphs

    def _rsc_find_paragraphs(self, node) -> list[str]:
        """Recursively find <p> text in an RSC component tree node."""
        if not isinstance(node, list):
            return []
        # React element: ["$", tagname, key, props]
        if len(node) >= 4 and node[0] == "$" and isinstance(node[1], str):
            tag = node[1]
            props = node[3] if isinstance(node[3], dict) else {}
            if tag == "p":
                text = self._rsc_text(props.get("children", ""))
                return [text] if text.strip() else []
            children = props.get("children")
            if children is not None:
                return self._rsc_find_paragraphs(children)
            return []
        # Plain list of child nodes
        result: list[str] = []
        for item in node:
            result.extend(self._rsc_find_paragraphs(item))
        return result

    def _rsc_text(self, children) -> str:
        """Flatten RSC children (string or nested array) into plain text."""
        if isinstance(children, str):
            return children if not children.startswith("$") else ""
        if isinstance(children, list):
            parts: list[str] = []
            for item in children:
                if isinstance(item, str) and not item.startswith("$"):
                    parts.append(item)
                elif isinstance(item, list) and len(item) >= 4 and item[0] == "$":
                    inner = item[3] if isinstance(item[3], dict) else {}
                    parts.append(self._rsc_text(inner.get("children", "")))
            return "".join(parts)
        return ""

    def _extract_escaped_html_paragraphs(self, text: str) -> list[str]:
        """Extract \\u003cp\\u003e...\\u003c/p\\u003e paragraphs from Next payload text."""
        paragraphs: list[str] = []
        for raw in re.findall(r"\\u003cp\\u003e(.*?)\\u003c/p\\u003e", text, flags=re.S):
            try:
                decoded = bytes(raw, "utf-8").decode("unicode_escape")
            except Exception:
                decoded = raw
            decoded = html_unescape(decoded)
            decoded = re.sub(r"\s+", " ", decoded).strip()
            if decoded:
                paragraphs.append(decoded)
        return paragraphs

    def _comment_safe(self, text: str) -> str:
        return text.replace("--", "- -")

    def _plain_text(self, text: str) -> str:
        # Some payload variants contain inline HTML inside paragraph text.
        # Convert to plain text before marker matching.
        if "<" in text and ">" in text:
            return BeautifulSoup(text, "html.parser").get_text(" ", strip=True)
        return text

    def _looks_like_lead_boilerplate(self, text: str) -> bool:
        t = re.sub(r"\s+", " ", self._plain_text(text).lower()).strip()
        if not t or len(t) > 4000:
            return False
        return any(m in t for m in self._LEAD_MARKERS)

    def _looks_like_tail_boilerplate(self, text: str) -> bool:
        t = re.sub(r"\s+", " ", self._plain_text(text).lower()).strip()
        if not t or len(t) > 4000:
            return False
        return any(m in t for m in self._TAIL_MARKERS)

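    # Illustration (hypothetical input): given paragraphs like
    #   ["DISCLAIMER: this is a work of fiction ...", "Actual story text.", "Please donate to Nifty"]
    # the method below returns the middle paragraph(s) as the visible chapter body and
    # the matched lead/tail paragraphs separately, so callers can preserve them as comments.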
    def _extract_hidden_boilerplate(self, paragraphs: list[str]) -> tuple[list[str], list[str], list[str]]:
        visible = list(paragraphs)
        leading: list[str] = []
        trailing: list[str] = []

        while visible and len(leading) < 6 and self._looks_like_lead_boilerplate(visible[0]):
            leading.append(visible.pop(0))
        while visible and len(trailing) < 6 and self._looks_like_tail_boilerplate(visible[-1]):
            trailing.insert(0, visible.pop())

        # Never return an empty chapter due to over-eager filtering.
        if not visible:
            return list(paragraphs), [], []
        return visible, leading, trailing

    async def fetch_chapter(self, client: httpx.AsyncClient, ch: dict) -> dict:
        # Primary path: fetch chapter HTML and read the rendered <article> content.
        r = await client.get(ch["url"])
        soup = BeautifulSoup(r.text, "html.parser")
        paragraphs: list[str] = []

        article = soup.find("article")
        if article:
            for p in article.find_all("p"):
                text = p.get_text(" ", strip=True)
                if text:
                    paragraphs.append(text)

        # Fallback: paragraph HTML may only appear escaped in Next payload scripts.
        if not paragraphs:
            paragraphs = self._extract_escaped_html_paragraphs(r.text)

        # Last fallback: request ?_rsc=1 and parse both RSC line format + escaped chunks.
        if not paragraphs:
            r_rsc = await client.get(ch["url"] + "?_rsc=1")
            paragraphs = self._parse_rsc_paragraphs(r_rsc.text)
            if not paragraphs:
                paragraphs = self._extract_escaped_html_paragraphs(r_rsc.text)

        paragraphs, hidden_lead, hidden_tail = self._extract_hidden_boilerplate(paragraphs)

        # Build a BeautifulSoup <div> with <p> elements.
        wrapper = BeautifulSoup("", "html.parser")
        div = wrapper.new_tag("div")
        if hidden_lead:
            lead_text = " || ".join(re.sub(r"\s+", " ", p).strip() for p in hidden_lead if p.strip())
            if lead_text:
                div.append(Comment(self._comment_safe(f"NIFTY_HIDDEN_LEAD: {lead_text}")))
        for text in paragraphs:
            p = wrapper.new_tag("p")
            p.string = text
            div.append(p)
        if hidden_tail:
            tail_text = " || ".join(re.sub(r"\s+", " ", p).strip() for p in hidden_tail if p.strip())
            if tail_text:
                div.append(Comment(self._comment_safe(f"NIFTY_HIDDEN_TAIL: {tail_text}")))

        return {
            "title": ch["title"],
            "content_el": div,
            "selector_id": None,
            "selector_class": None,
        }
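

# Minimal manual-test sketch, not part of the scraper's public interface. It assumes
# NiftyNewScraper can be constructed with no arguments (BaseScraper's __init__ is not
# shown in this module) and uses a made-up story URL; adjust both before running.
# Because of the relative import above, run it as a module: python -m <package>.<module>.
if __name__ == "__main__":
    import asyncio

    async def _demo() -> None:
        scraper = NiftyNewScraper()  # assumption: no constructor arguments required
        async with httpx.AsyncClient(follow_redirects=True) as client:
            info = await scraper.fetch_book_info(
                client, "https://new.nifty.org/stories/some-slug-83036/1"  # hypothetical URL
            )
            print(info["title"], "-", len(info["chapters"]), "chapters")
            first = await scraper.fetch_chapter(client, info["chapters"][0])
            # content_el is a BeautifulSoup <div>; print a short plain-text preview.
            print(first["content_el"].get_text(" ", strip=True)[:200])

    asyncio.run(_demo())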