novela/containers/novela/xhtml.py
2026-03-26 10:24:57 +01:00

254 lines
10 KiB
Python

import re
from html import escape as he
from bs4 import NavigableString, Tag
# Built-in scene-break matchers.  Every expression is anchored at both ends
# (surrounding whitespace is tolerated), so a text only counts as a break when
# it consists of nothing but the marker.
BREAK_PATTERNS = list(map(re.compile, (
    r"^\s*[\*\-]{3,}\s*$",                    # *** or ---
    r"^\s*[·•◦‣⁃]\s*[·•◦‣⁃]\s*[·•◦‣⁃]\s*$",   # • • •
    r"^\s*~{2,}\s*$",                         # ~~
    r"^\s*={3,}\s*$",                         # ===
    r"^\s*#{3,}\s*$",                         # ###
    r"^\s*[oO0]{1,3}\s*$",                    # oOo
    r"^\s*[-–—]\s*[oO0]\s*[-–—]\s*$",         # -o- / —O—
    r"^\s*[<>]+\s*[·•*]\s*[<>]+\s*$",         # <<<<<·>>>>>
)))
# CSS class names that mark an element as a scene break.
BREAK_CSS_CLASSES = [
    "hr",
    "separator",
    "section-break",
    "divider",
    "break",
    "chapterbreak",
    "scene-break",
    "scenebreak",
]

# Hyphen-free form of the names above (they are already lowercase), used for
# exact-match lookups.  Substring matching was abandoned because it produced
# false positives: "ipsPageBreak", for example, contains "break" but is a
# layout class rather than a scene-break marker.
_BREAK_CSS_NORM = frozenset("".join(name.split("-")) for name in BREAK_CSS_CLASSES)
# ---------------------------------------------------------------------------
# Runtime-configurable overrides (populated from DB by main.py before scraping)
# ---------------------------------------------------------------------------
# Both values start as None, meaning "no override installed"; the accessor
# functions then fall back to the built-in defaults.

# Compiled regexes replacing BREAK_PATTERNS; None → fall back to BREAK_PATTERNS.
_active_patterns: list[re.Pattern[str]] | None = None
# Normalised class names replacing _BREAK_CSS_NORM; None → fall back to _BREAK_CSS_NORM.
_active_css_norm: frozenset[str] | None = None
def configure_break_patterns(regex_strings: list[str], css_classes: list[str]) -> None:
    """Override the active break patterns with values loaded from the database.

    Called by main.py before each scrape so user-edited patterns take effect
    without requiring a server restart.

    Args:
        regex_strings: Regular-expression sources.  Entries that are blank,
            not strings, or fail to compile are skipped (and logged) instead
            of aborting the whole configuration.
        css_classes: CSS class names.  Each is stripped, lower-cased and has
            its hyphens removed; names that end up empty are dropped.

    Note:
        After this call the overrides are always installed, even when every
        entry was rejected; in that case no text pattern / CSS class matches
        at all (the built-in defaults are only used while the overrides are
        still ``None``).
    """
    import logging  # local import keeps this block independent of the file's import header

    global _active_patterns, _active_css_norm

    log = logging.getLogger(__name__)

    compiled = []
    for source in regex_strings or ():
        # A blank pattern compiles fine but matches *every* text, which would
        # turn each element into a scene break -- reject it explicitly.
        if not isinstance(source, str) or not source.strip():
            log.warning("Ignoring blank or non-string break pattern: %r", source)
            continue
        try:
            compiled.append(re.compile(source))
        except re.error as exc:
            log.warning("Ignoring invalid break pattern %r: %s", source, exc)

    normalised = (
        name.strip().lower().replace("-", "")
        for name in css_classes or ()
        if isinstance(name, str)
    )

    _active_patterns = compiled
    _active_css_norm = frozenset(name for name in normalised if name)
def _get_patterns() -> list:
    """Return the regexes currently used for text-based break detection.

    The runtime override wins whenever one has been installed (even an empty
    list); otherwise the built-in ``BREAK_PATTERNS`` are returned.
    """
    if _active_patterns is None:
        return BREAK_PATTERNS
    return _active_patterns
def _get_css_norm() -> frozenset:
    """Return the normalised CSS class names currently treated as break markers.

    The runtime override wins whenever one has been installed (even an empty
    set); otherwise the built-in ``_BREAK_CSS_NORM`` is returned.
    """
    if _active_css_norm is None:
        return _BREAK_CSS_NORM
    return _active_css_norm
# Keywords that mark an <img> as a scene-break graphic when they occur anywhere
# in its src or alt text.
_IMG_BREAK_KEYWORDS = ("break", "divider", "separator")

# "hr" is too short for substring matching: it occurs inside "three", "chris",
# "chrome", ... so it must stand alone (not be surrounded by other letters).
_IMG_HR_TOKEN = re.compile(r"(?<![a-z])hr(?![a-z])")


def _looks_like_break_image(img) -> bool:
    """Return True when an <img>'s src or alt text suggests a break graphic."""
    src = (img.get("src") or "").lower()
    alt = (img.get("alt") or "").lower()
    if src.startswith("data:"):
        # An inline payload is encoded data, not a file name; keyword hits in
        # it would be coincidental.
        src = ""
    return any(
        any(keyword in text for keyword in _IMG_BREAK_KEYWORDS)
        or _IMG_HR_TOKEN.search(text) is not None
        for text in (src, alt)
    )


def is_break_element(el, empty_p_is_spacer: bool = False) -> bool:
    """Detect scene breaks based on tag, class, or text pattern.

    Args:
        el: A BeautifulSoup ``Tag`` or ``NavigableString``; any other object
            is never a break.
        empty_p_is_spacer: When True, an empty ``<p>`` is *not* treated as a
            break (for content that puts empty paragraphs between every
            paragraph).

    Returns:
        True when ``el`` is one of:
        * an ``<hr>``;
        * a tag carrying one of the active break CSS classes (exact match
          after lower-casing and removing hyphens);
        * an empty ``<p>`` (whitespace / non-breaking spaces only, no child
          tags), unless ``empty_p_is_spacer`` is set;
        * an ``<img>`` whose src or alt text looks like a break graphic;
        * a tag whose only non-whitespace child is such an ``<img>``;
        * a tag or string whose text matches one of the active break regexes.
    """
    patterns = _get_patterns()

    if isinstance(el, NavigableString):
        text = str(el)
        return any(pat.match(text) for pat in patterns)
    if not isinstance(el, Tag):
        return False

    if el.name == "hr":
        return True

    css_norm = _get_css_norm()
    classes = el.get("class") or []
    if isinstance(classes, str):
        # Some parsers expose ``class`` as one string instead of a list.
        classes = classes.split()
    if any(cls.lower().replace("-", "") in css_norm for cls in classes):
        return True

    # Empty paragraph (whitespace or &nbsp; only) counts as a break,
    # unless the content uses them as spacers between every paragraph.
    if el.name == "p" and not empty_p_is_spacer:
        has_child_tag = any(isinstance(c, Tag) for c in el.children)
        if not has_child_tag and not el.get_text().replace("\xa0", "").strip():
            return True

    # Image that represents a break
    if el.name == "img" and _looks_like_break_image(el):
        return True

    # Element containing only a single image: the image alone decides.
    significant = [
        c for c in el.children
        if not (isinstance(c, NavigableString) and not c.strip())
    ]
    if len(significant) == 1 and isinstance(significant[0], Tag) and significant[0].name == "img":
        return is_break_element(significant[0])

    # Text pattern
    text = el.get_text()
    return any(pat.match(text) for pat in patterns)
def element_to_xhtml(el, break_img_path: str = "../Images/break.png", empty_p_is_spacer: bool = False) -> str:
    """Convert a BeautifulSoup element to an XHTML fragment.

    Args:
        el: A ``Tag`` or ``NavigableString``.
        break_img_path: ``src`` used for the image that replaces scene breaks.
        empty_p_is_spacer: Forwarded to ``is_break_element``; when True, empty
            paragraphs are not treated as scene breaks.

    Returns:
        The XHTML markup for ``el`` (possibly an empty string):
        * scene breaks become a centred break image;
        * ``<p>``/``<div>`` become ``<p>`` (dropped when empty);
        * ``<em>``/``<i>`` become ``<em>``, ``<strong>``/``<b>`` become ``<strong>``;
        * ``<h1>``-``<h4>`` and ``<br>`` are kept;
        * ``<img>`` is kept when it has a ``src``;
        * ``<figure>`` is unwrapped without its ``<figcaption>``;
        * every other tag (including ``<a>``, ``<sup>``, ``<sub>``) is
          unwrapped to its rendered children.
    """

    def render_children(parent, skip_breaks: bool = False) -> str:
        # Render all children of ``parent``; optionally leave out the ones
        # that are themselves scene-break markers.
        return "".join(
            element_to_xhtml(child, break_img_path, empty_p_is_spacer)
            for child in parent.children
            if not (skip_breaks and is_break_element(child, empty_p_is_spacer))
        )

    if is_break_element(el, empty_p_is_spacer):
        result = f'<center><img src="{break_img_path}" style="height:15px;"/></center>'
        # HTML parsers (notably html.parser) can nest subsequent siblings inside
        # void elements like <hr>, so a break element may contain actual content
        # as children. Process those children so no text is silently discarded.
        if isinstance(el, Tag):
            # For anything but <hr>, a child that is itself a break is the very
            # marker that made ``el`` a break (e.g. the "***" text inside
            # <p>***</p>); rendering it again would emit a duplicate image.
            trailer = render_children(el, skip_breaks=el.name != "hr")
            if trailer.strip():
                result += "\n" + trailer
        return result

    if isinstance(el, NavigableString):
        text = str(el)
        return he(text) if text.strip() else ""

    name = el.name

    if name in ("p", "div"):
        inner = render_children(el).strip()
        return f"<p>{inner}</p>\n" if inner else ""

    if name in ("em", "i"):
        return f"<em>{render_children(el)}</em>"

    if name in ("strong", "b"):
        return f"<strong>{render_children(el)}</strong>"

    if name in ("h1", "h2", "h3", "h4"):
        return f"<{name}>{render_children(el)}</{name}>\n"

    if name == "br":
        return "<br />"

    if name == "img":
        src = el.get("src", "")
        alt = he(el.get("alt", ""))
        return f'<img src="{he(src)}" alt="{alt}"/>\n' if src else ""

    if name == "figure":
        # Keep the figure's content but drop its caption.
        return "".join(
            element_to_xhtml(child, break_img_path, empty_p_is_spacer)
            for child in el.children
            if not (isinstance(child, Tag) and child.name == "figcaption")
        )

    # <sup>, <sub>, <a> (links are stripped, their text kept) and every other
    # tag: unwrap to the rendered children.
    return render_children(el)
def normalize_wysiwyg_html(raw_html: str, break_img_path: str = "../Images/break.png") -> str:
    """Normalise HTML from the WYSIWYG editor into EPUB-compatible XHTML.

    Scene breaks (as classified by ``is_break_element`` with its default
    settings, so a completely empty ``<p>`` counts as a break) are replaced by
    the break image.  ``<strong>``, ``<em>``, ``<u>``, ``<br>``, ``<h1>``-``<h4>``,
    ``<img>`` and ``<blockquote>`` (including its ``class``, e.g.
    ``author-note``) are preserved, ``<b>``/``<i>`` are mapped to
    ``<strong>``/``<em>``, loose text is wrapped in ``<p>``, and blocks that
    render to nothing are dropped.

    Args:
        raw_html: Editor output; ``None`` or ``""`` yields an empty string.
        break_img_path: ``src`` used for the image that replaces scene breaks.

    Returns:
        The rendered top-level blocks joined with newlines.
    """
    from bs4 import BeautifulSoup, NavigableString, Tag

    break_markup = f'<center><img src="{break_img_path}" style="height:15px;"/></center>'
    # Inline tags that are kept, mapped to the tag emitted for them.
    inline_wrappers = {"strong": "strong", "b": "strong", "em": "em", "i": "em", "u": "u"}

    def process_inline(el) -> str:
        # Render inline content; unknown tags are unwrapped to their children.
        if isinstance(el, NavigableString):
            return he(str(el))
        if el.name == "br":
            return "<br />"
        inner = "".join(process_inline(c) for c in el.children)
        wrapper = inline_wrappers.get(el.name)
        return f"<{wrapper}>{inner}</{wrapper}>" if wrapper else inner

    def render_paragraph(el) -> str | None:
        # Render the inline content of ``el`` as one <p>; None when empty.
        inner = "".join(process_inline(c) for c in el.children).strip()
        return f"<p>{inner}</p>" if inner else None

    def process_block(el) -> str | None:
        # Render one block-level node; None means "emit nothing".
        if isinstance(el, NavigableString):
            text = str(el).strip()
            return f"<p>{he(text)}</p>" if text else None
        if not isinstance(el, Tag):
            return None

        # Covers <hr>, break-classed tags, break images and text markers, so
        # the branches below never see a break element.
        if is_break_element(el):
            return break_markup

        if el.name == "img":
            src = el.get("src", "")
            alt = he(el.get("alt", ""))
            return f'<img src="{he(src)}" alt="{alt}"/>' if src else None

        if el.name in ("p", "div"):
            return render_paragraph(el)

        if el.name in ("h1", "h2", "h3", "h4"):
            inner = "".join(process_inline(c) for c in el.children).strip()
            return f"<{el.name}>{inner}</{el.name}>" if inner else None

        if el.name == "blockquote":
            classes = el.get("class", [])
            # Escape the class list: it comes from editor content and is
            # written into an attribute value.
            css_class = he(" ".join(classes)) if classes else ""
            tag_open = f'<blockquote class="{css_class}">' if css_class else "<blockquote>"
            parts = []
            for child in el.children:
                if isinstance(child, Tag) and child.name in ("p", "div"):
                    # Paragraphs inside a quote are rendered inline-only.
                    rendered = render_paragraph(child)
                else:
                    rendered = process_block(child)
                if rendered:
                    parts.append(rendered)
            return f"{tag_open}{''.join(parts)}</blockquote>" if parts else None

        # Any other container: render its children as blocks.
        parts = [r for c in el.children if (r := process_block(c))]
        return "".join(parts) if parts else None

    soup = BeautifulSoup(raw_html or "", "html.parser")
    body = soup.find("body") or soup
    return "\n".join(r for child in body.children if (r := process_block(child)))