# Scraper plugin for stories hosted on tedlouis.com.
import re
from datetime import datetime
from urllib.parse import urljoin

import httpx
from bs4 import BeautifulSoup, NavigableString

from .base import BaseScraper
# Root URL of the site this scraper targets.
TED_BASE = "https://tedlouis.com/"
class TedLouisScraper(BaseScraper):
    """Scraper for stories hosted on tedlouis.com.

    The site requires no authentication. A story's chapters are discovered
    by scanning the ``ul.story-index-list`` columns on the story index page;
    individual chapter text lives in ``div#chapter`` on each chapter page.
    """

    @classmethod
    def matches(cls, url: str) -> bool:
        """Return True when *url* points at tedlouis.com."""
        return "tedlouis.com" in url

    async def login(self, client: httpx.AsyncClient, username: str, password: str) -> bool:
        """Report success without making a request — the site is public."""
        return True  # no login required

    @staticmethod
    def _parse_updated_date(raw: str) -> str:
        """Normalize a human-readable date to an ISO string.

        Accepts ``"Month D, YYYY"`` (→ ``YYYY-MM-DD``) or ``"Month YYYY"``
        (→ ``YYYY-MM-01``). Returns ``""`` when *raw* matches neither format.
        """
        for in_fmt, out_fmt in (("%B %d, %Y", "%Y-%m-%d"), ("%B %Y", "%Y-%m-01")):
            try:
                return datetime.strptime(raw, in_fmt).strftime(out_fmt)
            except ValueError:
                continue
        return ""

    async def fetch_book_info(self, client: httpx.AsyncClient, url: str) -> dict:
        """Fetch the story index page at *url* and return book metadata.

        Returns a dict with title, author, status, updated date and the list
        of chapter links. Raises ValueError when *url* is a chapter page
        rather than the story index page.
        """
        r = await client.get(url)
        soup = BeautifulSoup(r.text, "html.parser")

        # A chapter page carries the story <h1> but lacks the index page's
        # <h2 class="story-page-title">; fail early with an actionable error.
        if soup.find("h1", class_="story-title") and not soup.find("h2", class_="story-page-title"):
            raise ValueError(
                "Voer de story index-URL in, geen chapter-URL. "
                "Kopieer de URL van de verhaal-indexpagina (de pagina met de hoofdstukkenlijst)."
            )

        # Title: only the direct NavigableString children of the <h2>, so
        # nested elements like the "Back" link and author byline are ignored.
        book_title = "Unknown title"
        title_el = soup.find("h2", class_="story-page-title")
        if title_el:
            parts = [
                str(c).strip()
                for c in title_el.children
                if isinstance(c, NavigableString) and str(c).strip()
            ]
            book_title = " ".join(parts) or title_el.get_text(strip=True)

        # Author: linked name inside the byline span (may sit inside the h2
        # or elsewhere on the page).
        author = "Unknown author"
        byline = soup.find("span", class_="story-author-by-line")
        if byline:
            a = byline.find("a")
            if a:
                author = a.get_text(strip=True)

        # Publication status, with any leading "Status:" label stripped.
        publication_status = ""
        status_el = soup.find("span", class_="story-status-text")
        if status_el:
            raw = status_el.get_text(strip=True)
            publication_status = re.sub(r"^Status:\s*", "", raw, flags=re.I).strip()

        # "Last Updated: Month D, YYYY" → "YYYY-MM-DD"; "" when absent or unparseable.
        updated_date = ""
        updated_el = soup.find("span", class_="story-last-updated")
        if updated_el:
            raw = re.sub(
                r"^Last\s+Updated:\s*", "", updated_el.get_text(strip=True), flags=re.I
            ).strip()
            updated_date = self._parse_updated_date(raw)

        # Chapter links: every anchor in every story-index-list column,
        # resolved against the final (post-redirect) URL and de-duplicated
        # while preserving first-seen order.
        actual_url = str(r.url)
        chapter_links: list[dict] = []
        seen: set[str] = set()
        for ul in soup.find_all("ul", class_="story-index-list"):
            for li in ul.find_all("li"):
                a = li.find("a", href=True)
                if not a:
                    continue
                full_url = urljoin(actual_url, a["href"])
                if full_url in seen:
                    continue
                seen.add(full_url)
                chapter_links.append({"url": full_url, "title": a.get_text(strip=True)})

        return {
            "title": book_title,
            "author": author,
            "publisher": "tedlouis.com",
            "series": "",
            "series_index_hint": 0,
            "genres": [],
            "subgenres": [],
            "tags": [],
            "description": "",
            "updated_date": updated_date,
            "publication_status": publication_status,
            "source_url": url,
            "chapters": chapter_links,
            "chapter_method": "html_scan",
            "index_image_url": None,
        }

    async def fetch_chapter(self, client: httpx.AsyncClient, ch: dict) -> dict:
        """Fetch one chapter page and return its title and content element.

        *ch* is one entry from ``fetch_book_info()``'s ``chapters`` list.
        ``content_el`` is the ``div#chapter`` bs4 element with headings and
        copyright boilerplate removed, or None when the div is missing.
        """
        cr = await client.get(ch["url"])
        csoup = BeautifulSoup(cr.text, "html.parser")
        title = ch["title"]

        # Prefer the on-page chapter title:
        # <h2 class="chapter-title"><span>…</span></h2>
        chapter_h2 = csoup.find("h2", class_="chapter-title")
        if chapter_h2:
            span = chapter_h2.find("span")
            refined = (span or chapter_h2).get_text(strip=True)
            if refined:
                title = refined

        content_el = csoup.find("div", id="chapter")

        if content_el:
            # Strip the story title, repeated chapter title, and copyright
            # blocks so only the chapter prose remains.
            for el in content_el.find_all("h1", class_="story-title"):
                el.decompose()
            for el in content_el.find_all("h2", class_="chapter-title"):
                el.decompose()
            for el in content_el.find_all("div", class_="chapter-copyright-line"):
                el.decompose()
            for el in content_el.find_all(
                "div", class_=re.compile(r"chapter-copyright-notice", re.I)
            ):
                el.decompose()

        return {
            "title": title,
            "content_el": content_el,
            "selector_id": "chapter",
            "selector_class": None,
        }
|