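"""Scraper for new.nifty.org story pages.

Builds book metadata from a story's index page and extracts chapter text from
the rendered <article> HTML, falling back to escaped paragraph markup in the
Next.js payload and, as a last resort, the ``?_rsc=1`` RSC stream. Boilerplate
paragraphs (disclaimers, donation appeals) are moved into HTML comments so the
chapter body stays clean without discarding them.
"""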
import json
import re
from html import unescape as html_unescape
from urllib.parse import urlparse

import httpx
from bs4 import BeautifulSoup, Comment

from .base import BaseScraper


class NiftyNewScraper(BaseScraper):
    _LEAD_MARKERS = (
        "notice this is a work of fiction",
        "if it is illegal to read stories",
        "if you enjoy this story",
        "for my other stories",
        "nifty archive",
        "code of conduct",
        "author note",
        "author's note",
        "disclaimer",
        "this story contains",
        "this story includes",
        "all characters are",
        "all characters depicted",
    )
    _TAIL_MARKERS = (
        "please remember to donate",
        "donate",
        "support nifty",
        "support the archive",
        "nifty archive alliance",
        "donate.nifty.org",
        "nifty.org/donate",
        "nifty.org/support",
        "patreon",
        "buy me a coffee",
        "tip jar",
        "become a supporter",
    )

    @classmethod
    def matches(cls, url: str) -> bool:
        return "new.nifty.org" in url

    async def login(self, client: httpx.AsyncClient, username: str, password: str) -> bool:
        return True  # no login required

    # ── Helpers ───────────────────────────────────────────────────────────────

    def _to_index_url(self, url: str) -> str:
        """Strip trailing chapter number, return story index URL.

        /stories/some-slug-83036/3 → /stories/some-slug-83036
        /stories/some-slug-83036 → /stories/some-slug-83036
        """
        parsed = urlparse(url)
        path = re.sub(r"/\d+$", "", parsed.path.rstrip("/"))
        return f"{parsed.scheme}://{parsed.netloc}{path}"

    def _parse_date(self, iso: str) -> str:
        """Return YYYY-MM-DD from an ISO datetime string, or ''."""
        if not iso:
            return ""
        return iso[:10]

    # ── BaseScraper interface ─────────────────────────────────────────────────

    async def fetch_book_info(self, client: httpx.AsyncClient, url: str) -> dict:
        index_url = self._to_index_url(url)
        r = await client.get(index_url)
        soup = BeautifulSoup(r.text, "html.parser")

        # Title: <h1>, fallback to <title> (strip "- … - Nifty Archive …" suffix)
        h1 = soup.find("h1")
        if h1:
            title = h1.get_text(strip=True)
        else:
            title_el = soup.find("title")
            raw = title_el.get_text(strip=True) if title_el else ""
            title = re.split(r"\s+[-–]\s+", raw)[0].strip() if raw else ""

        # Author: <strong itemprop="name"> inside /authors/ link
        author = "Unknown author"
        author_link = soup.find("a", href=re.compile(r"^/authors/\d+"))
        if author_link:
            name_el = author_link.find("strong", itemprop="name")
            if name_el:
                author = name_el.get_text(strip=True)

        # Dates: <time itemprop="datePublished/dateModified">
        pub_el = soup.find("time", itemprop="datePublished")
        mod_el = soup.find("time", itemprop="dateModified")
        pub_date = self._parse_date(pub_el.get("datetime", "") if pub_el else "")
        updated_date = self._parse_date(mod_el.get("datetime", "") if mod_el else "") or pub_date

        # Tags: from all <ul aria-label="Tags"> containers (category links + generated tags)
        tags: list[str] = []
        seen: set[str] = set()
        for ul in soup.find_all("ul", attrs={"aria-label": "Tags"}):
            for a in ul.find_all("a", href=True):
                label = a.get_text(strip=True)
                if label and label.lower() not in seen:
                    seen.add(label.lower())
                    tags.append(label)

        # Description: <meta name="description">
        desc = ""
        meta_desc = soup.find("meta", attrs={"name": "description"})
        if meta_desc and meta_desc.get("content"):
            desc = meta_desc["content"].strip()

        # Chapters: find /stories/{slug}/N links in the page HTML
        slug_path = urlparse(index_url).path  # e.g. /stories/some-slug-83036
        chapter_pattern = re.compile(r"^" + re.escape(slug_path) + r"/(\d+)$")

        chapter_nums: set[int] = set()
        for a in soup.find_all("a", href=True):
            m = chapter_pattern.match(a["href"])
            if m:
                chapter_nums.add(int(m.group(1)))

        # Fallback: scan RSC stream for chapter index values
        if not chapter_nums:
            for m in re.finditer(r'"index"\s*:\s*(\d+)', r.text):
                chapter_nums.add(int(m.group(1)))

        if not chapter_nums:
            chapter_nums = {1}

        chapters = [
            {"url": f"{index_url}/{i}", "title": f"Chapter {i}"}
            for i in range(1, max(chapter_nums) + 1)
        ]

        return {
            "title": title,
            "author": author,
            "publisher": "nifty.org",
            "series": "",
            "series_index_hint": 0,
            "genres": [],
            "subgenres": [],
            "tags": tags,
            "description": desc,
            "updated_date": updated_date,
            "publication_status": "",
            "source_url": index_url,
            "chapters": chapters,
            "chapter_method": "html_scan",
            "index_image_url": None,
        }

    # ── RSC parser ───────────────────────────────────────────────────────────

    def _parse_rsc_paragraphs(self, rsc_text: str) -> list[str]:
        """Extract story paragraph text from a Next.js RSC stream.

        The RSC format is a series of lines: ``{hex_id}:{json_value}``.
        Each line that represents a <p> element looks like:
            2c:["$","p",null,{"children":"Paragraph text."}]
        """
        paragraphs: list[str] = []
        for line in rsc_text.splitlines():
            colon = line.find(":")
            if colon < 0:
                continue
            try:
                node = json.loads(line[colon + 1:])
            except Exception:
                continue
            paragraphs.extend(self._rsc_find_paragraphs(node))
        return paragraphs

    def _rsc_find_paragraphs(self, node) -> list[str]:
        """Recursively find <p> text in an RSC component tree node."""
        if not isinstance(node, list):
            return []
        # React element: ["$", tagname, key, props]
        if len(node) >= 4 and node[0] == "$" and isinstance(node[1], str):
            tag = node[1]
            props = node[3] if isinstance(node[3], dict) else {}
            if tag == "p":
                text = self._rsc_text(props.get("children", ""))
                return [text] if text.strip() else []
            children = props.get("children")
            if children is not None:
                return self._rsc_find_paragraphs(children)
            return []
        # Plain list of child nodes
        result: list[str] = []
        for item in node:
            result.extend(self._rsc_find_paragraphs(item))
        return result

    def _rsc_text(self, children) -> str:
        """Flatten RSC children (string or nested array) into plain text."""
        if isinstance(children, str):
            return children if not children.startswith("$") else ""
        if isinstance(children, list):
            parts: list[str] = []
            for item in children:
                if isinstance(item, str) and not item.startswith("$"):
                    parts.append(item)
                elif isinstance(item, list) and len(item) >= 4 and item[0] == "$":
                    inner = item[3] if isinstance(item[3], dict) else {}
                    parts.append(self._rsc_text(inner.get("children", "")))
            return "".join(parts)
        return ""

    def _extract_escaped_html_paragraphs(self, text: str) -> list[str]:
        """Extract \\u003cp\\u003e...\\u003c/p\\u003e paragraphs from Next payload text."""
        paragraphs: list[str] = []
        for raw in re.findall(r"\\u003cp\\u003e(.*?)\\u003c/p\\u003e", text, flags=re.S):
            try:
                decoded = bytes(raw, "utf-8").decode("unicode_escape")
            except Exception:
                decoded = raw
            decoded = html_unescape(decoded)
            decoded = re.sub(r"\s+", " ", decoded).strip()
            if decoded:
                paragraphs.append(decoded)
        return paragraphs

    def _comment_safe(self, text: str) -> str:
        return text.replace("--", "- -")

    def _plain_text(self, text: str) -> str:
        # Some payload variants contain inline HTML inside paragraph text.
        # Convert to plain text before marker matching.
        if "<" in text and ">" in text:
            return BeautifulSoup(text, "html.parser").get_text(" ", strip=True)
        return text

    def _looks_like_lead_boilerplate(self, text: str) -> bool:
        t = re.sub(r"\s+", " ", self._plain_text(text).lower()).strip()
        if not t or len(t) > 4000:
            return False
        return any(m in t for m in self._LEAD_MARKERS)

    def _looks_like_tail_boilerplate(self, text: str) -> bool:
        t = re.sub(r"\s+", " ", self._plain_text(text).lower()).strip()
        if not t or len(t) > 4000:
            return False
        return any(m in t for m in self._TAIL_MARKERS)

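    # Illustration (hypothetical input): given paragraphs like
    #   ["DISCLAIMER: this is a work of fiction ...", "Actual story text.", "Please donate to Nifty"]
    # the method below returns the middle paragraph(s) as the visible chapter body and
    # the matched lead/tail paragraphs separately, so callers can preserve them as comments.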
    def _extract_hidden_boilerplate(self, paragraphs: list[str]) -> tuple[list[str], list[str], list[str]]:
        visible = list(paragraphs)
        leading: list[str] = []
        trailing: list[str] = []

        while visible and len(leading) < 6 and self._looks_like_lead_boilerplate(visible[0]):
            leading.append(visible.pop(0))
        while visible and len(trailing) < 6 and self._looks_like_tail_boilerplate(visible[-1]):
            trailing.insert(0, visible.pop())

        # Never return an empty chapter due to over-eager filtering.
        if not visible:
            return list(paragraphs), [], []
        return visible, leading, trailing

    async def fetch_chapter(self, client: httpx.AsyncClient, ch: dict) -> dict:
        # Primary path: fetch chapter HTML and read the rendered <article> content.
        r = await client.get(ch["url"])
        soup = BeautifulSoup(r.text, "html.parser")
        paragraphs: list[str] = []

        article = soup.find("article")
        if article:
            for p in article.find_all("p"):
                text = p.get_text(" ", strip=True)
                if text:
                    paragraphs.append(text)

        # Fallback: paragraph HTML may only appear escaped in Next payload scripts.
        if not paragraphs:
            paragraphs = self._extract_escaped_html_paragraphs(r.text)

        # Last fallback: request ?_rsc=1 and parse both RSC line format + escaped chunks.
        if not paragraphs:
            r_rsc = await client.get(ch["url"] + "?_rsc=1")
            paragraphs = self._parse_rsc_paragraphs(r_rsc.text)
            if not paragraphs:
                paragraphs = self._extract_escaped_html_paragraphs(r_rsc.text)

        paragraphs, hidden_lead, hidden_tail = self._extract_hidden_boilerplate(paragraphs)

        # Build a BeautifulSoup <div> with <p> elements.
        wrapper = BeautifulSoup("", "html.parser")
        div = wrapper.new_tag("div")
        if hidden_lead:
            lead_text = " || ".join(re.sub(r"\s+", " ", p).strip() for p in hidden_lead if p.strip())
            if lead_text:
                div.append(Comment(self._comment_safe(f"NIFTY_HIDDEN_LEAD: {lead_text}")))
        for text in paragraphs:
            p = wrapper.new_tag("p")
            p.string = text
            div.append(p)
        if hidden_tail:
            tail_text = " || ".join(re.sub(r"\s+", " ", p).strip() for p in hidden_tail if p.strip())
            if tail_text:
                div.append(Comment(self._comment_safe(f"NIFTY_HIDDEN_TAIL: {tail_text}")))

        return {
            "title": ch["title"],
            "content_el": div,
            "selector_id": None,
            "selector_class": None,
        }
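

# Minimal manual-test sketch, not part of the scraper's public interface. It assumes
# NiftyNewScraper can be constructed with no arguments (BaseScraper's __init__ is not
# shown in this module) and uses a made-up story URL; adjust both before running.
# Because of the relative import above, run it as a module: python -m <package>.<module>.
if __name__ == "__main__":
    import asyncio

    async def _demo() -> None:
        scraper = NiftyNewScraper()  # assumption: no constructor arguments required
        async with httpx.AsyncClient(follow_redirects=True) as client:
            info = await scraper.fetch_book_info(
                client, "https://new.nifty.org/stories/some-slug-83036/1"  # hypothetical URL
            )
            print(info["title"], "-", len(info["chapters"]), "chapters")
            first = await scraper.fetch_chapter(client, info["chapters"][0])
            # content_el is a BeautifulSoup <div>; print a short plain-text preview.
            print(first["content_el"].get_text(" ", strip=True)[:200])

    asyncio.run(_demo())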