diff --git a/containers/novela/changelog.py b/containers/novela/changelog.py
index b57d900..2b40ce7 100644
--- a/containers/novela/changelog.py
+++ b/containers/novela/changelog.py
@@ -3,6 +3,179 @@ Changelog data for Novela
 """
 
 CHANGELOG = [
+    {
+        "version": "v0.1.12",
+        "date": "2026-04-15",
+        "summary": "Font size slider in the reader settings drawer.",
+        "sections": [
+            {
+                "title": "New features",
+                "type": "feature",
+                "changes": [
+                    "Reader: font size slider in the reading settings drawer — adjust text size from 80% to 150%; the setting is saved per device, so iPad and desktop each remember their own preference",
+                ],
+            },
+        ],
+    },
+    {
+        "version": "v0.1.11",
+        "date": "2026-04-13",
+        "summary": "Comma-separated values in genre, subgenre and tag inputs are now split into individual tags.",
+        "sections": [
+            {
+                "title": "Bug fixes",
+                "type": "bugfix",
+                "changes": [
+                    "Edit metadata: pasting or typing a comma-separated list in the genre, subgenre or tag input now adds each value as a separate tag instead of one combined tag",
+                ],
+            },
+        ],
+    },
+    {
+        "version": "v0.1.10",
+        "date": "2026-04-12",
+        "summary": "Series navigation in the reader, series_volume support for annual comics, archive a series in one click, and a TedLouis scraper fix.",
+        "sections": [
+            {
+                "title": "New features",
+                "type": "feature",
+                "changes": [
+                    "Reader: prev/next volume buttons in the header for books that are part of a series — buttons appear automatically when the book has adjacent volumes; the tooltip shows the volume number and title; marking a book as read redirects directly to the next volume in the reader instead of the book detail page",
+                    "Comics: series_volume field for annual series where issue numbers restart each year (e.g. Donald Duck (1982) [15]) — stored in the database and EPUB OPF; displayed as '(year)' after the series name on the book detail page; sorting respects series_volume before series_index; supported in Bulk Import via the %series_volume% placeholder and the 'Year/Vol.' shared field",
+                    "Library: archive or unarchive an entire series in one click — 'Archive series' / 'Unarchive series' button in the series detail view; updates all books in the series via a single SQL UPDATE and recalculates sidebar counters without a page reload",
+                ],
+            },
+            {
+                "title": "Bug fixes",
+                "type": "bugfix",
+                "changes": [
+                    "TedLouis scraper: title extraction no longer includes the 'Back' button text or the author byline — only direct text nodes of the title heading are used",
+                ],
+            },
+        ],
+    },
+    {
+        "version": "v0.1.9",
+        "date": "2026-04-08",
+        "summary": "Five new scrapers (classic Nifty, new.nifty.org, codeysworld.org, iomfats.org, tedlouis.com), break image settings, and bug fixes.",
+        "sections": [
+            {
+                "title": "New features",
+                "type": "feature",
+                "changes": [
+                    "New scraper: Nifty.org (classic) — scrapes plain-text email-format stories; email headers are stripped, boilerplate paragraphs are auto-detected and hidden, and scene-break patterns are converted to break images",
+                    "New scraper: new.nifty.org — scrapes the Next.js version of Nifty; reads chapter content from the RSC payload when the static HTML does not include it; boilerplate detection is shared with classic Nifty",
+                    "New scraper: codeysworld.org — single-file and multi-chapter stories; title and author extracted from heading elements; category from the URL path stored as a tag; navigation links and audio links stripped from chapter content",
+                    "New scraper: iomfats.org — all stories are listed on a single author page; provide any chapter URL and the scraper finds the correct story automatically; supports single stories and multi-part series (series name, book title, and series index derived from the page structure)",
+                    "New scraper: tedlouis.com — all pages use opaque token-based routing (?t=TOKEN); provide the story index URL and the scraper collects all chapter links from the three-column chapter list",
+                    "Settings: break image upload — upload a custom PNG/JPG/WebP to use as the scene break image in all converted books; stored in the imagestore and applied to both DB-stored and EPUB-format books",
+                    "Settings: develop mode toggle — shows a DEVELOP banner and updates the page title across all pages when enabled",
+                ],
+            },
+            {
+                "title": "Bug fixes",
+                "type": "bugfix",
+                "changes": [
+                    "Break images were not displayed in DB-stored books — the image path '../Images/break.png' is a relative EPUB path that does not exist for DB content; DB mode now uses '/static/break.png'",
+                    "Break images were silently lost during import — the image was decomposed before element_to_xhtml ran, leaving an empty wrapper; the wrapper is now replaced with the break image",
+        # Fall back to <body> if no candidate stood out among the
+        # children of body.
+        if not content_el or selected_p_count < 3 or selected_text_len < int(body_text_len * 0.35):
+            content_el = body
+
+        if not content_el:
+            content_el = body
+
+        # Strip site boilerplate: headings (title/byline), navigation links,
+        # audio links and empty nodes — anywhere in the content element.
+        if content_el:
+            # Remove all h1/h2 headings (title and "by Author")
+            for el in content_el.find_all(["h1", "h2"]):
+                el.decompose()
+
+            # Remove navigation links ("Back to …", "<< Back", "Home", etc.).
+            # Anchored at the start so that links like "Feedback" survive.
+            for el in content_el.find_all("a", href=True):
+                text = el.get_text(strip=True)
+                if re.search(r"^(?:<{1,2}\s*)?back\b|^home\b", text, re.I):
+                    parent = el.parent
+                    el.decompose()
+                    # Remove the parent too if it's now empty
+                    if parent and not parent.get_text(strip=True):
+                        parent.decompose()
+
+            # Remove audio links (links to .mp3 files or containing "listen"/"audio")
+            for el in content_el.find_all("a", href=True):
+                href = el.get("href", "")
+                text = el.get_text(strip=True)
+                if re.search(r"\.mp3$", href, re.I) or re.search(r"listen|audio", text, re.I):
+                    parent = el.parent
+                    el.decompose()
+                    if parent and not parent.get_text(strip=True):
+                        parent.decompose()
+
+            # Remove email links ("Email Author")
+            for el in content_el.find_all("a", href=re.compile(r"^mailto:", re.I)):
+                parent = el.parent
+                el.decompose()
+                if parent and not parent.get_text(strip=True):
+                    parent.decompose()
+
+        return {
+            "title": title,
+            "content_el": content_el,
+            "selector_id": content_el.get("id") if content_el else None,
+            "selector_class": " ".join(content_el.get("class", [])) if content_el else None,
+        }
diff --git a/containers/novela/scrapers/iomfats.py b/containers/novela/scrapers/iomfats.py
new file mode 100644
index 0000000..03ce395
--- /dev/null
+++ b/containers/novela/scrapers/iomfats.py
@@ -0,0 +1,267 @@
+import re
+from urllib.parse import urljoin, urlparse
+
+import httpx
+from bs4 import BeautifulSoup, NavigableString
+
+from .base import BaseScraper
+
+IOMFATS_BASE = "https://iomfats.org"
+
+
+class IomfatsScraper(BaseScraper):
+
+    @classmethod
+    def matches(cls, url: str) -> bool:
+        return "iomfats.org" in url
+
+    async def login(self, client: httpx.AsyncClient, username: str, password: str) -> bool:
+        return True  # no login required
+
+    def _author_page_url(self, url: str) -> str:
+        """Derive the author index page URL from any iomfats.org URL."""
+        parsed = urlparse(url)
+        parts = parsed.path.strip("/").split("/")
+        # Path: storyshelf/hosted/{author}/...
+        # Author page is the first 3 segments.
+        if len(parts) >= 3 and parts[0] == "storyshelf" and parts[1] == "hosted":
+            author_path = "/" + "/".join(parts[:3]) + "/"
+            return f"{parsed.scheme}://{parsed.netloc}{author_path}"
+        return url
+
+    def _is_author_page(self, url: str) -> bool:
+        parts = urlparse(url).path.strip("/").split("/")
+        return (
+            len(parts) <= 3
+            and len(parts) >= 2
+            and parts[0] == "storyshelf"
+            and parts[1] == "hosted"
+        )
+
+    def _story_folder(self, url: str) -> str | None:
+        """Return the story folder segment from a chapter URL, or None."""
+        parts = urlparse(url).path.strip("/").split("/")
+        # storyshelf/hosted/{author}/{story-folder}/{chapter}.html
+        if len(parts) >= 5:
+            return parts[3]
+        return None
+
+    async def fetch_book_info(self, client: httpx.AsyncClient, url: str) -> dict:
+        if self._is_author_page(url):
+            raise ValueError(
+                "Enter a chapter URL, not an author page. "
+                "Copy the URL of the first chapter of the story you want."
+            )
+
+        story_folder = self._story_folder(url)
+        if not story_folder:
+            raise ValueError(
+                "Unexpected URL format for iomfats.org. "
+                "Use the URL of a chapter, e.g. …/grasshopper/dreamchasers/01.html"
+            )
+
+        author_url = self._author_page_url(url)
+        r = await client.get(author_url)
+        soup = BeautifulSoup(r.text, "html.parser")
+        content = soup.find("div", id="content")
+        if not content:
+            raise ValueError("Cannot parse the author page (no #content element).")
+
+        # Author name from "by Name" heading
+        author = "Unknown author"
+        for el in content.find_all(["h2", "h3"]):
+            text = el.get_text(strip=True)
+            m = re.match(r"^by\s+(.+)$", text, re.I)
+            if m:
+                author = m.group(1).strip()
+                break
+        # Fallback: author slug from URL
+        if author == "Unknown author":
+            parts = urlparse(author_url).path.strip("/").split("/")
+            if len(parts) >= 3:
+                author = parts[2].replace("_", " ").title()
+
+        # Walk the content to find the story matching story_folder.
+        #
+        # Two structures on the author page:
+        #
+        # Single story:
+        #   a block with the title heading plus [status], and the
+        #   chapter <ul> inside this block.
+ html_parts.append(" {he(para)}, filtered by story_folder."""
+        out: list[dict] = []
+        for li in ul.find_all("li", recursive=False):
+            a = li.find("a", href=True)
+            if not a:
+                continue
+            full_url = urljoin(base_url, a["href"])
+            if story_folder not in urlparse(full_url).path:
+                continue
+            raw_title = a.get_text(strip=True)
+            title = f"Chapter {raw_title}" if re.match(r"^\d+$", raw_title) else raw_title
+            out.append({"url": full_url, "title": title})
+        return out
+
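# Illustrative sketch (editor's example, not part of the patch): bare numeric
# link text is promoted to a "Chapter N" title, while named links such as
# "Epilogue" are kept as-is.
import re

for raw in ["7", "Epilogue"]:
    print(f"Chapter {raw}" if re.match(r"^\d+$", raw) else raw)
# -> Chapter 7
# -> Epilogue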
+    async def fetch_chapter(self, client: httpx.AsyncClient, ch: dict) -> dict:
+        cr = await client.get(ch["url"])
+        csoup = BeautifulSoup(cr.text, "html.parser")
+        title = ch["title"]
+
+        content_el = csoup.find("div", id="content")
+        if not content_el:
+            content_el = csoup.find("body")
+
+        if content_el:
+            # Remove headings (story title, author, chapter number)
+            for el in content_el.find_all(["h2", "h3"]):
+                el.decompose()
+
+            # Remove chapter navigation divs
+            for el in content_el.find_all("div", class_=re.compile(r"chapternav", re.I)):
+                el.decompose()
+
+            # Remove footer elements (author note, forum button)
+            for el in content_el.find_all("div", class_="important"):
+                el.decompose()
+            for el in content_el.find_all("a", class_="styled-button"):
+                parent = el.parent
+                el.decompose()
+                if parent and not parent.get_text(strip=True):
+                    parent.decompose()
+
+            # Remove anchor tags used as page anchors (<a name="…"> without href)
+            for el in content_el.find_all("a", attrs={"name": True}):
+                if not el.get("href"):
+                    el.unwrap()
+
+        return {
+            "title": title,
+            "content_el": content_el,
+            "selector_id": "content",
+            "selector_class": None,
+        }
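# Illustrative sketch (editor's example, not part of the patch): how the URL
# helpers above carve up a chapter URL. Assumes a no-argument constructor; the
# path reuses the example from the error message.
s = IomfatsScraper()
url = "https://iomfats.org/storyshelf/hosted/grasshopper/dreamchasers/01.html"
print(s._author_page_url(url))  # https://iomfats.org/storyshelf/hosted/grasshopper/
print(s._story_folder(url))     # dreamchasers
print(s._is_author_page(url))   # False, this is a chapter URL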
diff --git a/containers/novela/scrapers/nifty.py b/containers/novela/scrapers/nifty.py
new file mode 100644
index 0000000..8c863e2
--- /dev/null
+++ b/containers/novela/scrapers/nifty.py
@@ -0,0 +1,358 @@
+import re
+from email.utils import parsedate
+from html import escape as he
+from time import mktime
+from datetime import date as _date
+from urllib.parse import urljoin, urlparse
+
+import httpx
+from bs4 import BeautifulSoup
+
+from .base import BaseScraper
+
+# Email header field names that appear at the top of Nifty classic chapters.
+_HEADER_RE = re.compile(
+    r"^(Date|From|Subject|Reply-To|Message-ID|MIME-Version|Content-Type|X-[\w-]+):",
+    re.I,
+)
+
+# Scene-break patterns in plain text (subset of xhtml.BREAK_PATTERNS for text matching).
+_BREAK_RE = re.compile(
+    r"^("
+    r"[\*\-]{3,}"
+    r"|[~=]{3,}"
+    r"|#{3,}"
+    r"|[·•◦‣⁃]\s*[·•◦‣⁃]\s*[·•◦‣⁃]"
+    r"|[-–—]\s*[oO0]\s*[-–—]"
+    r")$"
+)
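# Illustrative sketch (editor's example, not part of the patch): lines that
# _BREAK_RE treats as scene breaks versus ordinary prose.
for line in ["***", "~~~", "###", "- o -", "The end."]:
    print(repr(line), bool(_BREAK_RE.match(line.strip())))
# '***' True, '~~~' True, '###' True, '- o -' True, 'The end.' False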
+
+
+class NiftyScraper(BaseScraper):
+    _LEAD_MARKERS = (
+        "notice this is a work of fiction",
+        "if it is illegal to read stories",
+        "if you enjoy this story",
+        "for my other stories",
+        "nifty archive",
+        "code of conduct",
+        "author note",
+        "author's note",
+        "disclaimer",
+        "this story contains",
+        "this story includes",
+        "all characters are",
+        "all characters depicted",
+    )
+    _TAIL_MARKERS = (
+        "please remember to donate",
+        "donate",
+        "support nifty",
+        "support the archive",
+        "nifty archive alliance",
+        "donate.nifty.org",
+        "nifty.org/donate",
+        "nifty.org/support",
+        "patreon",
+        "buy me a coffee",
+        "tip jar",
+        "become a supporter",
+    )
+
+    @classmethod
+    def matches(cls, url: str) -> bool:
+        return "nifty.org" in url and "new.nifty.org" not in url
+
+    async def login(self, client: httpx.AsyncClient, username: str, password: str) -> bool:
+        return True  # no login required
+
+    # ── Helpers ───────────────────────────────────────────────────────────────
+
+    def _to_index_url(self, url: str) -> str:
+        """Return the story index URL for any Nifty URL (index or chapter).
+
+        Nifty path structure:
+            /nifty/{category}/{subcategory}/{story}/            ← index (4 segments)
+            /nifty/{category}/{subcategory}/{story}/{chapter}   ← chapter (5 segments)
+        """
+        parsed = urlparse(url)
+        parts = [p for p in parsed.path.split("/") if p]
+        if len(parts) >= 5:
+            path = "/" + "/".join(parts[:4]) + "/"
+        else:
+            path = parsed.path.rstrip("/") + "/"
+        return f"{parsed.scheme}://{parsed.netloc}{path}"
+
+    def _slug_to_title(self, slug: str) -> str:
+        return slug.replace("-", " ").title()
+
+    def _parse_date_header(self, text: str) -> str:
+        """Return YYYY-MM-DD from a 'Date: …' line, or ''."""
+        m = re.search(r"^Date:\s+(.+)$", text, re.M)
+        if not m:
+            return ""
+        try:
+            parsed = parsedate(m.group(1).strip())
+            if parsed:
+                return _date.fromtimestamp(mktime(parsed)).isoformat()
+        except Exception:
+            pass
+        return ""
+
+    def _parse_author_header(self, text: str) -> str:
+        """Return the author name from a 'From: Name <email>' header line, or ''."""
+        m = re.search(r"^From:\s+(.+)$", text, re.M)
+        if not m:
+            return ""
+        # Drop a trailing '<address>' part, keeping only the display name.
+        return re.sub(r"\s*<[^>]*>\s*$", "", m.group(1)).strip().strip('"')
+
+    def _parse_subject_header(self, text: str) -> str:
+        """Return the subject from a 'Subject: …' line, or ''."""
+        m = re.search(r"^Subject:\s+(.+)$", text, re.M)
+        return m.group(1).strip() if m else ""
+
+    def _normalize(self, text: str) -> str:
+        """Normalize a paragraph for comparison: collapse whitespace, lowercase."""
+        return re.sub(r"\s+", " ", self._plain_text(text)).strip().lower()
+
+    async def _get_text(self, client: httpx.AsyncClient, url: str) -> tuple[BeautifulSoup, str]:
+        """Fetch a chapter page and return (soup, raw text) from the <pre> element.
+
+        Falls back to the full body text if no <pre> is found.
+        """
+        r = await client.get(url)
+        soup = BeautifulSoup(r.text, "html.parser")
+        pre = soup.find("pre")
+        if pre:
+            raw = pre.get_text()
+        else:
+            body = soup.find("body")
+            raw = body.get_text("\n") if body else soup.get_text("\n")
+        return soup, raw
+
+    def _strip_email_headers(self, text: str) -> str:
+        """Remove the leading email header block (Date/From/Subject/…) from chapter text.
+
+        Tolerates blank lines between header fields — some Nifty pages place the
+        Subject on a separate line after a blank line:
+            Date: …
+            From: …
+
+            Subject: …
+        """
+        lines = text.splitlines()
+        i = 0
+        # Skip leading blank lines.
+        while i < len(lines) and not lines[i].strip():
+            i += 1
+        # Only strip if this actually looks like an email header block.
+        if not any(_HEADER_RE.match(lines[j]) for j in range(i, min(i + 12, len(lines)))):
+            return text
+        # Skip header lines, tolerating blank lines between them.
+        # A blank line ends the block only when no further header line follows.
+        while i < len(lines):
+            stripped = lines[i].strip()
+            if _HEADER_RE.match(stripped):
+                i += 1
+            elif not stripped:
+                # Peek ahead past any blank lines.
+                j = i + 1
+                while j < len(lines) and not lines[j].strip():
+                    j += 1
+                if j < len(lines) and _HEADER_RE.match(lines[j].strip()):
+                    i = j  # more headers follow — jump over the blank line(s)
+                else:
+                    i += 1
+                    break  # no more headers — end of block
+            else:
+                break  # non-header, non-blank line — end of block
+        # Skip blank lines immediately after the header block.
+        while i < len(lines) and not lines[i].strip():
+            i += 1
+        return "\n".join(lines[i:])
+
+    def _text_to_paragraphs(self, text: str) -> list[str]:
+        """Split plain text into paragraphs; join hard-wrapped lines within each paragraph.
+
+        Nifty classic stories are stored as email submissions: paragraphs are
+        separated by blank lines, and each line is wrapped at ~70 characters.
+        This function merges those wrapped lines back into a single line per
+        paragraph.
+        """
+        text = text.replace("\r\n", "\n").replace("\r", "\n")
+        blocks = re.split(r"\n{2,}", text)
+        result = []
+        for block in blocks:
+            lines = [l.strip() for l in block.splitlines() if l.strip()]
+            if lines:
+                result.append(" ".join(lines))
+        return result
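# Illustrative sketch (editor's example, not part of the patch): hard-wrapped
# email text collapses to one line per paragraph. Assumes a no-argument
# constructor.
wrapped = "He walked down\nto the shore.\n\nThe sun was setting."
print(NiftyScraper()._text_to_paragraphs(wrapped))
# -> ['He walked down to the shore.', 'The sun was setting.']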
+
+    def _comment_safe(self, text: str) -> str:
+        return text.replace("--", "- -")
+
+    def _plain_text(self, text: str) -> str:
+        if "<" in text and ">" in text:
+            return BeautifulSoup(text, "html.parser").get_text(" ", strip=True)
+        return text
+
+    def _looks_like_lead_boilerplate(self, text: str) -> bool:
+        t = re.sub(r"\s+", " ", self._plain_text(text).lower()).strip()
+        if not t or len(t) > 4000:
+            return False
+        return any(m in t for m in self._LEAD_MARKERS)
+
+    def _looks_like_tail_boilerplate(self, text: str) -> bool:
+        t = re.sub(r"\s+", " ", self._plain_text(text).lower()).strip()
+        if not t or len(t) > 4000:
+            return False
+        return any(m in t for m in self._TAIL_MARKERS)
+
+    def _extract_hidden_boilerplate(self, paragraphs: list[str]) -> tuple[list[str], list[str], list[str]]:
+        visible = list(paragraphs)
+        leading: list[str] = []
+        trailing: list[str] = []
+
+        while visible and len(leading) < 6 and self._looks_like_lead_boilerplate(visible[0]):
+            leading.append(visible.pop(0))
+        while visible and len(trailing) < 6 and self._looks_like_tail_boilerplate(visible[-1]):
+            trailing.insert(0, visible.pop())
+
+        if not visible:
+            return list(paragraphs), [], []
+        return visible, leading, trailing
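# Illustrative sketch (editor's example, not part of the patch): a leading
# disclaimer and a trailing donation plea are moved out of the visible
# paragraphs. Assumes a no-argument constructor.
paras = [
    "DISCLAIMER: this story contains adult themes.",
    "Chapter one begins here.",
    "Please remember to donate to the archive.",
]
visible, lead, tail = NiftyScraper()._extract_hidden_boilerplate(paras)
print(visible)  # ['Chapter one begins here.']
# lead and tail hold the hidden paragraphs; if hiding would leave the chapter
# empty, the original list is returned untouched.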
+
+    # ── BaseScraper interface ─────────────────────────────────────────────────
+
+    async def fetch_book_info(self, client: httpx.AsyncClient, url: str) -> dict:
+        index_url = self._to_index_url(url)
+        r = await client.get(index_url)
+        soup = BeautifulSoup(r.text, "html.parser")
+
+        # Title from URL slug.
+        slug = urlparse(index_url).path.rstrip("/").rsplit("/", 1)[-1]
+        book_title = self._slug_to_title(slug)
+
+        # Genres from URL path: /nifty/{category}/{subcategory}/{story}/
+        path_parts = [p for p in urlparse(index_url).path.split("/") if p]
+        category = self._slug_to_title(path_parts[1]) if len(path_parts) > 1 else ""
+        subcategory = self._slug_to_title(path_parts[2]) if len(path_parts) > 2 else ""
+
+        # Chapter links: all <a> tags pointing one level deeper than the index.
+        chapter_links: list[dict] = []
+        seen: set[str] = set()
+        for a in soup.find_all("a", href=True):
+            full = urljoin(index_url, a["href"])
+            if (
+                full.startswith(index_url)
+                and full.rstrip("/") != index_url.rstrip("/")
+                and full not in seen
+            ):
+                seen.add(full)
+                chapter_links.append({"url": full, "title": a.get_text(strip=True)})
+
+        # Sort by trailing chapter number.
+        def _num(ch: dict) -> int:
+            m = re.search(r"-(\d+)$", ch["url"].rstrip("/"))
+            return int(m.group(1)) if m else 0
+
+        chapter_links.sort(key=_num)
+        for i, ch in enumerate(chapter_links, 1):
+            ch["title"] = f"Chapter {i}"
+
+        # Author and dates: extract from email headers in first and last chapters.
+        author = "Unknown author"
+        updated_date = ""
+        preamble_count = 0
+
+        if chapter_links:
+            _, first_text = await self._get_text(client, chapter_links[0]["url"])
+            author = self._parse_author_header(first_text) or author
+            pub_date = self._parse_date_header(first_text)
+
+            if len(chapter_links) > 1:
+                _, last_text = await self._get_text(client, chapter_links[-1]["url"])
+                updated_date = self._parse_date_header(last_text) or pub_date
+            else:
+                updated_date = pub_date
+
+            # Boilerplate detection: compare leading paragraphs of chapters 1 and 2.
+            # Paragraphs present in both (after header strip) are repeated preamble.
+            if len(chapter_links) >= 2:
+                _, ch2_text = await self._get_text(client, chapter_links[1]["url"])
+                paras1 = self._text_to_paragraphs(self._strip_email_headers(first_text))
+                paras2 = self._text_to_paragraphs(self._strip_email_headers(ch2_text))
+                for p1, p2 in zip(paras1, paras2):
+                    if self._normalize(p1) == self._normalize(p2):
+                        preamble_count += 1
+                    else:
+                        break
+
+        for ch in chapter_links:
+            ch["preamble_count"] = preamble_count
+
+        return {
+            "title": book_title,
+            "author": author,
+            "publisher": "nifty.org",
+            "series": "",
+            "series_index_hint": 0,
+            "genres": [],
+            "subgenres": [],
+            "tags": [t for t in [category, subcategory] if t],
+            "description": "",
+            "updated_date": updated_date,
+            "publication_status": "",
+            "source_url": index_url,
+            "chapters": chapter_links,
+            "chapter_method": "html_scan",
+            "index_image_url": None,
+        }
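# Illustrative sketch (editor's example, not part of the patch): the
# trailing-number sort used above, so "-10" sorts after "-2" rather than
# lexicographically before it. The story URL is made up.
import re

def num(url: str) -> int:
    m = re.search(r"-(\d+)$", url.rstrip("/"))
    return int(m.group(1)) if m else 0

urls = ["https://www.nifty.org/nifty/gay/college/some-story/some-story-10",
        "https://www.nifty.org/nifty/gay/college/some-story/some-story-2"]
print(sorted(urls, key=num))  # …-2 first, …-10 second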
+
+    async def fetch_chapter(self, client: httpx.AsyncClient, ch: dict) -> dict:
+        _, raw_text = await self._get_text(client, ch["url"])
+
+        # Extract Subject before stripping headers; store as invisible comment.
+        subject = self._parse_subject_header(raw_text)
+
+        # Remove email header block.
+        story_text = self._strip_email_headers(raw_text)
+
+        # Convert hard-wrapped plain text to paragraphs.
+        paragraphs = self._text_to_paragraphs(story_text)
+
+        # Skip repeated boilerplate paragraphs at the top of each chapter.
+        preamble_count = ch.get("preamble_count", 0)
+        if preamble_count:
+            paragraphs = paragraphs[preamble_count:]
+        paragraphs, hidden_lead, hidden_tail = self._extract_hidden_boilerplate(paragraphs)
+
+        # Build an HTML fragment: subject as comment, scene-breaks as break
+        # markers, the rest as <p> paragraphs.
+        html_parts: list[str] = []
+        if subject:
+            html_parts.append(f"<!-- {self._comment_safe(subject)} -->")
+        if hidden_lead:
+            lead_text = " || ".join(re.sub(r"\s+", " ", p).strip() for p in hidden_lead if p.strip())
+            if lead_text:
+                html_parts.append(f"<!-- {self._comment_safe(lead_text)} -->")
+        for para in paragraphs:
+            if _BREAK_RE.match(para.strip()):
+                # Scene-break marker; converted to a break image downstream.
+                html_parts.append("<p>* * *</p>")
+            else:
+                html_parts.append(f"<p>{he(para)}</p>")