novela/containers/novela/scrapers/nifty_new.py

import json
import re
from html import unescape as html_unescape
from urllib.parse import urlparse
import httpx
from bs4 import BeautifulSoup, Comment
from .base import BaseScraper


class NiftyNewScraper(BaseScraper):
    """Scraper for the Next.js frontend at new.nifty.org."""

    # Boilerplate paragraphs are detected by case-insensitive substring match
    # against these markers (see _looks_like_lead_boilerplate /
    # _looks_like_tail_boilerplate below).
    _LEAD_MARKERS = (
        "notice this is a work of fiction",
        "if it is illegal to read stories",
        "if you enjoy this story",
        "for my other stories",
        "nifty archive",
        "code of conduct",
        "author note",
        "author's note",
        "disclaimer",
        "this story contains",
        "this story includes",
        "all characters are",
        "all characters depicted",
    )
    _TAIL_MARKERS = (
        "please remember to donate",
        "donate",
        "support nifty",
        "support the archive",
        "nifty archive alliance",
        "donate.nifty.org",
        "nifty.org/donate",
        "nifty.org/support",
        "patreon",
        "buy me a coffee",
        "tip jar",
        "become a supporter",
    )

    @classmethod
    def matches(cls, url: str) -> bool:
        return "new.nifty.org" in url

    async def login(self, client: httpx.AsyncClient, username: str, password: str) -> bool:
        return True  # no login required

    # ── Helpers ───────────────────────────────────────────────────────────────

    def _to_index_url(self, url: str) -> str:
        """Strip a trailing chapter number and return the story index URL.

        /stories/some-slug-83036/3 → /stories/some-slug-83036
        /stories/some-slug-83036   → /stories/some-slug-83036
        """
        parsed = urlparse(url)
        path = re.sub(r"/\d+$", "", parsed.path.rstrip("/"))
        return f"{parsed.scheme}://{parsed.netloc}{path}"

    def _parse_date(self, iso: str) -> str:
        """Return YYYY-MM-DD from an ISO datetime string, or ''."""
        if not iso:
            return ""
        return iso[:10]

    # ── BaseScraper interface ─────────────────────────────────────────────────

    async def fetch_book_info(self, client: httpx.AsyncClient, url: str) -> dict:
        index_url = self._to_index_url(url)
        r = await client.get(index_url)
        soup = BeautifulSoup(r.text, "html.parser")

        # Title: <h1>, fallback to <title> (strip "- … - Nifty Archive …" suffix).
        h1 = soup.find("h1")
        if h1:
            title = h1.get_text(strip=True)
        else:
            title_el = soup.find("title")
            raw = title_el.get_text(strip=True) if title_el else ""
            title = re.split(r"\s+[-]\s+", raw)[0].strip() if raw else ""

        # Author: <strong itemprop="name"> inside an /authors/ link.
        author = "Unknown author"
        author_link = soup.find("a", href=re.compile(r"^/authors/\d+"))
        if author_link:
            name_el = author_link.find("strong", itemprop="name")
            if name_el:
                author = name_el.get_text(strip=True)

        # Dates: <time itemprop="datePublished"/"dateModified">.
        pub_el = soup.find("time", itemprop="datePublished")
        mod_el = soup.find("time", itemprop="dateModified")
        pub_date = self._parse_date(pub_el.get("datetime", "") if pub_el else "")
        updated_date = self._parse_date(mod_el.get("datetime", "") if mod_el else "") or pub_date

        # Tags: from all <ul aria-label="Tags"> containers (category links + generated tags).
        tags: list[str] = []
        seen: set[str] = set()
        for ul in soup.find_all("ul", attrs={"aria-label": "Tags"}):
            for a in ul.find_all("a", href=True):
                label = a.get_text(strip=True)
                if label and label.lower() not in seen:
                    seen.add(label.lower())
                    tags.append(label)

        # Description: <meta name="description">.
        desc = ""
        meta_desc = soup.find("meta", attrs={"name": "description"})
        if meta_desc and meta_desc.get("content"):
            desc = meta_desc["content"].strip()

        # Chapters: find /stories/{slug}/N links in the page HTML.
        slug_path = urlparse(index_url).path  # e.g. /stories/some-slug-83036
        chapter_pattern = re.compile(r"^" + re.escape(slug_path) + r"/(\d+)$")
        chapter_nums: set[int] = set()
        for a in soup.find_all("a", href=True):
            m = chapter_pattern.match(a["href"])
            if m:
                chapter_nums.add(int(m.group(1)))

        # Fallback: scan the RSC stream for chapter index values.
        if not chapter_nums:
            for m in re.finditer(r'"index"\s*:\s*(\d+)', r.text):
                chapter_nums.add(int(m.group(1)))
        # Drop any zero indices the RSC scan may pick up, so the range()
        # below cannot produce an empty chapter list.
        chapter_nums = {n for n in chapter_nums if n > 0}
        if not chapter_nums:
            chapter_nums = {1}
        # Chapters are assumed contiguous from 1 to the highest index seen.
        chapters = [
            {"url": f"{index_url}/{i}", "title": f"Chapter {i}"}
            for i in range(1, max(chapter_nums) + 1)
        ]

        return {
            "title": title,
            "author": author,
            "publisher": "nifty.org",
            "series": "",
            "series_index_hint": 0,
            "genres": [],
            "subgenres": [],
            "tags": tags,
            "description": desc,
            "updated_date": updated_date,
            "publication_status": "",
            "source_url": index_url,
            "chapters": chapters,
            "chapter_method": "html_scan",
            "index_image_url": None,
        }

    # ── RSC parser ───────────────────────────────────────────────────────────

    def _parse_rsc_paragraphs(self, rsc_text: str) -> list[str]:
        """Extract story paragraph text from a Next.js RSC stream.

        The RSC format is a series of lines: ``{hex_id}:{json_value}``.
        A line that renders a <p> element looks like:

            2c:["$","p",null,{"children":"Paragraph text."}]
        """
        paragraphs: list[str] = []
        for line in rsc_text.splitlines():
            colon = line.find(":")
            if colon < 0:
                continue
            try:
                node = json.loads(line[colon + 1:])
            except Exception:
                continue
            paragraphs.extend(self._rsc_find_paragraphs(node))
        return paragraphs

    def _rsc_find_paragraphs(self, node) -> list[str]:
        """Recursively find <p> text in an RSC component tree node."""
        if not isinstance(node, list):
            return []
        # React element: ["$", tagname, key, props]
        if len(node) >= 4 and node[0] == "$" and isinstance(node[1], str):
            tag = node[1]
            props = node[3] if isinstance(node[3], dict) else {}
            if tag == "p":
                text = self._rsc_text(props.get("children", ""))
                return [text] if text.strip() else []
            children = props.get("children")
            if children is not None:
                return self._rsc_find_paragraphs(children)
            return []
        # Plain list of child nodes
        result: list[str] = []
        for item in node:
            result.extend(self._rsc_find_paragraphs(item))
        return result

    def _rsc_text(self, children) -> str:
        """Flatten RSC children (string or nested array) into plain text."""
        if isinstance(children, str):
            # Strings starting with "$" are RSC references, not text.
            return children if not children.startswith("$") else ""
        if isinstance(children, list):
            parts: list[str] = []
            for item in children:
                if isinstance(item, str) and not item.startswith("$"):
                    parts.append(item)
                elif isinstance(item, list) and len(item) >= 4 and item[0] == "$":
                    inner = item[3] if isinstance(item[3], dict) else {}
                    parts.append(self._rsc_text(inner.get("children", "")))
            return "".join(parts)
        return ""

    def _extract_escaped_html_paragraphs(self, text: str) -> list[str]:
        """Extract \\u003cp\\u003e...\\u003c/p\\u003e paragraphs from Next payload text."""
        paragraphs: list[str] = []
        for raw in re.findall(r"\\u003cp\\u003e(.*?)\\u003c/p\\u003e", text, flags=re.S):
            try:
                decoded = bytes(raw, "utf-8").decode("unicode_escape")
            except Exception:
                decoded = raw
            decoded = html_unescape(decoded)
            decoded = re.sub(r"\s+", " ", decoded).strip()
            if decoded:
                paragraphs.append(decoded)
        return paragraphs
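
    # Worked example (synthetic): a payload fragment such as
    #   \u003cp\u003eHe said \u0026quot;hi\u0026quot;.\u003c/p\u003e
    # (as it appears escaped inside a <script> chunk) decodes to the single
    # paragraph 'He said "hi".' after unicode-unescaping and HTML-unescaping.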

    def _comment_safe(self, text: str) -> str:
        # "--" is not allowed inside an HTML comment; soften it.
        return text.replace("--", "- -")

    def _plain_text(self, text: str) -> str:
        # Some payload variants contain inline HTML inside paragraph text.
        # Convert to plain text before marker matching.
        if "<" in text and ">" in text:
            return BeautifulSoup(text, "html.parser").get_text(" ", strip=True)
        return text

    def _looks_like_lead_boilerplate(self, text: str) -> bool:
        t = re.sub(r"\s+", " ", self._plain_text(text).lower()).strip()
        if not t or len(t) > 4000:
            return False
        return any(m in t for m in self._LEAD_MARKERS)

    def _looks_like_tail_boilerplate(self, text: str) -> bool:
        t = re.sub(r"\s+", " ", self._plain_text(text).lower()).strip()
        if not t or len(t) > 4000:
            return False
        return any(m in t for m in self._TAIL_MARKERS)

    def _extract_hidden_boilerplate(self, paragraphs: list[str]) -> tuple[list[str], list[str], list[str]]:
        """Split boilerplate off both ends; returns (visible, leading, trailing)."""
        visible = list(paragraphs)
        leading: list[str] = []
        trailing: list[str] = []
        while visible and len(leading) < 6 and self._looks_like_lead_boilerplate(visible[0]):
            leading.append(visible.pop(0))
        while visible and len(trailing) < 6 and self._looks_like_tail_boilerplate(visible[-1]):
            trailing.insert(0, visible.pop())
        # Never return an empty chapter due to over-eager filtering.
        if not visible:
            return list(paragraphs), [], []
        return visible, leading, trailing
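
    # Worked example (synthetic): given
    #   ["Disclaimer: ...", "Story text.", "Please remember to donate!"]
    # this returns (["Story text."], ["Disclaimer: ..."], ["Please remember to donate!"]),
    # so the boilerplate survives only as HTML comments in the chapter output below.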

    async def fetch_chapter(self, client: httpx.AsyncClient, ch: dict) -> dict:
        # Primary path: fetch chapter HTML and read the rendered <article> content.
        r = await client.get(ch["url"])
        soup = BeautifulSoup(r.text, "html.parser")
        paragraphs: list[str] = []
        article = soup.find("article")
        if article:
            for p in article.find_all("p"):
                text = p.get_text(" ", strip=True)
                if text:
                    paragraphs.append(text)
        # Fallback: paragraph HTML may only appear escaped in Next payload scripts.
        if not paragraphs:
            paragraphs = self._extract_escaped_html_paragraphs(r.text)
        # Last fallback: request ?_rsc=1 and parse both the RSC line format and escaped chunks.
        if not paragraphs:
            r_rsc = await client.get(ch["url"] + "?_rsc=1")
            paragraphs = self._parse_rsc_paragraphs(r_rsc.text)
            if not paragraphs:
                paragraphs = self._extract_escaped_html_paragraphs(r_rsc.text)
        paragraphs, hidden_lead, hidden_tail = self._extract_hidden_boilerplate(paragraphs)

        # Build a BeautifulSoup <div> with <p> elements.
        wrapper = BeautifulSoup("", "html.parser")
        div = wrapper.new_tag("div")
        if hidden_lead:
            lead_text = " || ".join(re.sub(r"\s+", " ", p).strip() for p in hidden_lead if p.strip())
            if lead_text:
                div.append(Comment(self._comment_safe(f"NIFTY_HIDDEN_LEAD: {lead_text}")))
        for text in paragraphs:
            p = wrapper.new_tag("p")
            p.string = text
            div.append(p)
        if hidden_tail:
            tail_text = " || ".join(re.sub(r"\s+", " ", p).strip() for p in hidden_tail if p.strip())
            if tail_text:
                div.append(Comment(self._comment_safe(f"NIFTY_HIDDEN_TAIL: {tail_text}")))
        return {
            "title": ch["title"],
            "content_el": div,
            "selector_id": None,
            "selector_class": None,
        }
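

# Minimal usage sketch (assumption: BaseScraper takes no constructor arguments
# and fetch_book_info/fetch_chapter are the only entry points exercised here;
# adjust to the real BaseScraper contract):
#
#   import asyncio
#
#   async def main() -> None:
#       async with httpx.AsyncClient(follow_redirects=True) as client:
#           scraper = NiftyNewScraper()
#           info = await scraper.fetch_book_info(
#               client, "https://new.nifty.org/stories/some-slug-83036/3"
#           )
#           first = await scraper.fetch_chapter(client, info["chapters"][0])
#           print(info["title"], len(info["chapters"]), first["title"])
#
#   asyncio.run(main())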