novela/containers/novela/scrapers/tedlouis.py
2026-04-15 21:39:20 +02:00

140 lines
5.2 KiB
Python

import re
from datetime import datetime
from urllib.parse import urljoin

import httpx
from bs4 import BeautifulSoup, NavigableString

from .base import BaseScraper
# Canonical site root. NOTE(review): currently unreferenced within this
# module — confirm external use before removing.
TED_BASE = "https://tedlouis.com/"
class TedLouisScraper(BaseScraper):
    """Scraper for stories hosted on tedlouis.com.

    The site requires no authentication; ``login`` is a no-op. The story
    index page supplies the metadata and the chapter list, and each chapter
    page supplies the prose content.
    """

    @classmethod
    def matches(cls, url: str) -> bool:
        """Return True when *url* belongs to tedlouis.com."""
        return "tedlouis.com" in url

    async def login(self, client: httpx.AsyncClient, username: str, password: str) -> bool:
        """The site has no account system; always report success."""
        return True  # no login required

    async def fetch_book_info(self, client: httpx.AsyncClient, url: str) -> dict:
        """Fetch the story index page and return book metadata plus chapters.

        Args:
            client: HTTP client used for the request.
            url: Story index-page URL (the page with the chapter list).

        Returns:
            A dict with title/author/status/date metadata and a
            ``"chapters"`` list of ``{"url", "title"}`` entries.

        Raises:
            ValueError: when *url* points at a chapter page instead of the
                story index page.
        """
        r = await client.get(url)
        soup = BeautifulSoup(r.text, "html.parser")
        # Detect chapter page (wrong entry point): chapter pages carry an
        # <h1 class="story-title"> but no <h2 class="story-page-title">.
        if soup.find("h1", class_="story-title") and not soup.find("h2", class_="story-page-title"):
            raise ValueError(
                "Voer de story index-URL in, geen chapter-URL. "
                "Kopieer de URL van de verhaal-indexpagina (de pagina met de hoofdstukkenlijst)."
            )
        # str(r.url) reflects any redirects, so relative chapter hrefs are
        # resolved against the page that actually served the index.
        actual_url = str(r.url)
        return {
            "title": self._extract_title(soup),
            "author": self._extract_author(soup),
            "publisher": "tedlouis.com",
            "series": "",
            "series_index_hint": 0,
            "genres": [],
            "subgenres": [],
            "tags": [],
            "description": "",
            "updated_date": self._extract_updated_date(soup),
            "publication_status": self._extract_status(soup),
            "source_url": url,
            "chapters": self._extract_chapter_links(soup, actual_url),
            "chapter_method": "html_scan",
            "index_image_url": None,
        }

    @staticmethod
    def _extract_title(soup: BeautifulSoup) -> str:
        """Title: only direct NavigableString children of the h2, ignoring
        nested elements like the "Back" link and author byline."""
        title_el = soup.find("h2", class_="story-page-title")
        if not title_el:
            return "Unknown title"
        parts = [
            str(c).strip()
            for c in title_el.children
            if isinstance(c, NavigableString) and str(c).strip()
        ]
        # Fall back to the full text if no direct string children were found.
        return " ".join(parts) or title_el.get_text(strip=True)

    @staticmethod
    def _extract_author(soup: BeautifulSoup) -> str:
        """Author: from the byline span (may be inside the h2 or elsewhere)."""
        byline = soup.find("span", class_="story-author-by-line")
        if byline:
            a = byline.find("a")
            if a:
                return a.get_text(strip=True)
        return "Unknown author"

    @staticmethod
    def _extract_status(soup: BeautifulSoup) -> str:
        """Publication status with the leading "Status:" label stripped."""
        status_el = soup.find("span", class_="story-status-text")
        if not status_el:
            return ""
        raw = status_el.get_text(strip=True)
        return re.sub(r"^Status:\s*", "", raw, flags=re.I).strip()

    @staticmethod
    def _extract_updated_date(soup: BeautifulSoup) -> str:
        """Updated date: "Last Updated: Month D, YYYY" -> "YYYY-MM-DD".

        A month-only value ("Month YYYY") normalizes to the first of the
        month; anything unparseable yields "".
        """
        updated_el = soup.find("span", class_="story-last-updated")
        if not updated_el:
            return ""
        raw = re.sub(r"^Last\s+Updated:\s*", "", updated_el.get_text(strip=True), flags=re.I).strip()
        # Try the full-date format first, then the month-only fallback.
        for in_fmt, out_fmt in (("%B %d, %Y", "%Y-%m-%d"), ("%B %Y", "%Y-%m-01")):
            try:
                return datetime.strptime(raw, in_fmt).strftime(out_fmt)
            except ValueError:
                continue
        return ""

    @staticmethod
    def _extract_chapter_links(soup: BeautifulSoup, base_url: str) -> list[dict]:
        """Collect de-duplicated chapter links from all story-index-list columns."""
        chapter_links: list[dict] = []
        seen: set[str] = set()
        for ul in soup.find_all("ul", class_="story-index-list"):
            for li in ul.find_all("li"):
                a = li.find("a", href=True)
                if not a:
                    continue
                full_url = urljoin(base_url, a["href"])
                if full_url in seen:
                    continue
                seen.add(full_url)
                chapter_links.append({"url": full_url, "title": a.get_text(strip=True)})
        return chapter_links

    async def fetch_chapter(self, client: httpx.AsyncClient, ch: dict) -> dict:
        """Download one chapter and return its title and cleaned content element.

        Args:
            client: HTTP client used for the request.
            ch: Chapter entry with ``"url"`` and ``"title"`` keys, as produced
                by ``fetch_book_info``.

        Returns:
            A dict with the (possibly refined) title, the content element
            (``None`` when the page has no ``div#chapter``), and the selector
            bookkeeping fields.
        """
        cr = await client.get(ch["url"])
        csoup = BeautifulSoup(cr.text, "html.parser")
        title = ch["title"]
        # Refine chapter title from <h2 class="chapter-title"><span>…</span></h2>
        chapter_h2 = csoup.find("h2", class_="chapter-title")
        if chapter_h2:
            span = chapter_h2.find("span")
            refined = (span or chapter_h2).get_text(strip=True)
            if refined:
                title = refined
        content_el = csoup.find("div", id="chapter")
        if content_el:
            # Remove story title, chapter title, and copyright blocks so only
            # the chapter prose remains.
            for el in content_el.find_all("h1", class_="story-title"):
                el.decompose()
            for el in content_el.find_all("h2", class_="chapter-title"):
                el.decompose()
            for el in content_el.find_all("div", class_="chapter-copyright-line"):
                el.decompose()
            for el in content_el.find_all("div", class_=re.compile(r"chapter-copyright-notice", re.I)):
                el.decompose()
        return {
            "title": title,
            "content_el": content_el,
            "selector_id": "chapter",
            "selector_class": None,
        }