254 lines
10 KiB
Python
254 lines
10 KiB
Python
import logging
import re
from html import escape as he

from bs4 import Comment, NavigableString, Tag
|
|
|
|
# Default text markers for a scene break. Every pattern is anchored at both
# ends, so it only fires when the marker is the entire text (surrounding
# whitespace aside).
BREAK_PATTERNS = [
    re.compile(source)
    for source in (
        r"^\s*[\*\-]{3,}\s*$",                        # *** or ---
        r"^\s*[·•◦‣⁃]\s*[·•◦‣⁃]\s*[·•◦‣⁃]\s*$",  # • • •
        r"^\s*~{2,}\s*$",                             # ~~
        r"^\s*={3,}\s*$",                             # ===
        r"^\s*#{3,}\s*$",                             # ###
        r"^\s*[oO0]{1,3}\s*$",                        # oOo
        r"^\s*[-–—]\s*[oO0]\s*[-–—]\s*$",             # -o- / —O—
        r"^\s*[<>]+\s*[·•*]\s*[<>]+\s*$",             # <<<<<·>>>>>
    )
]
|
|
|
|
# CSS class names that mark an element as a scene break.
BREAK_CSS_CLASSES = [
    "hr",
    "separator",
    "section-break",
    "divider",
    "break",
    "chapterbreak",
    "scene-break",
    "scenebreak",
]

# Hyphen-free lookup set used for exact-match checks (the names above are
# already lowercase). Exact matching is deliberate: substring matching gave
# false positives, e.g. "ipsPageBreak" contains "break" but is a layout
# class, not a scene-break marker.
_BREAK_CSS_NORM = frozenset(
    class_name.replace("-", "") for class_name in BREAK_CSS_CLASSES
)
|
|
|
|
# ---------------------------------------------------------------------------
# Runtime-configurable overrides (populated from DB by main.py before scraping)
# ---------------------------------------------------------------------------

# Compiled regexes installed by configure_break_patterns(). None means
# "never configured": _get_patterns() then falls back to BREAK_PATTERNS.
_active_patterns: list | None = None
# Normalised (lowercase, hyphen-free) CSS class names installed by
# configure_break_patterns(). None means "never configured":
# _get_css_norm() then falls back to _BREAK_CSS_NORM.
_active_css_norm: frozenset | None = None
|
|
|
|
|
|
def configure_break_patterns(regex_strings: list[str], css_classes: list[str]) -> None:
    """Override the active break patterns with values loaded from the database.

    Called by main.py before each scrape so user-edited patterns take effect
    without requiring a server restart.

    Args:
        regex_strings: Regular-expression sources for text break markers.
            Entries that cannot be compiled (invalid syntax, non-string
            values, oversized repeat counts) are skipped and reported with a
            warning, so one bad row cannot abort the whole configuration.
        css_classes: CSS class names that mark a break. Each name is stripped,
            lowercased and has its hyphens removed; blank and non-string
            entries are ignored.

    Side effects:
        Replaces the module-level ``_active_patterns`` and
        ``_active_css_norm``. After this call the built-in defaults are no
        longer consulted, even when the supplied lists are empty.
    """
    global _active_patterns, _active_css_norm

    log = logging.getLogger(__name__)
    compiled = []
    for source in regex_strings or ():
        try:
            compiled.append(re.compile(source))
        except (re.error, TypeError, OverflowError) as exc:
            # Best effort: keep the valid patterns, but leave a trace of the
            # ones that were dropped instead of discarding them silently.
            log.warning("Ignoring invalid break pattern %r: %s", source, exc)
    _active_patterns = compiled

    # An empty normalised name would match a class such as "-", so it is
    # filtered out together with non-string rows.
    _active_css_norm = frozenset(
        normalised
        for name in css_classes or ()
        if isinstance(name, str)
        and (normalised := name.strip().lower().replace("-", ""))
    )
|
|
|
|
|
|
def _get_patterns() -> list:
    """Return the configured break patterns, or the defaults if none were set.

    An explicitly configured empty list is honoured as-is; only ``None``
    (never configured) selects ``BREAK_PATTERNS``.
    """
    if _active_patterns is None:
        return BREAK_PATTERNS
    return _active_patterns
|
|
|
|
|
|
def _get_css_norm() -> frozenset:
    """Return the configured normalised CSS class set, or the default set.

    An explicitly configured empty set is honoured as-is; only ``None``
    (never configured) selects ``_BREAK_CSS_NORM``.
    """
    if _active_css_norm is None:
        return _BREAK_CSS_NORM
    return _active_css_norm
|
|
|
|
|
|
# Words that mark an image as a break graphic when they occur anywhere in its
# src or alt text.
_BREAK_IMG_KEYWORDS = ("break", "divider", "separator")
# "hr" only counts as a stand-alone token ("hr.png", "hr-2.gif"). A plain
# substring test also fires on "three.png", "chris.jpg", "chrome", ...
_BREAK_IMG_HR_TOKEN = re.compile(r"(?<![a-z])hr(?![a-z])")


def _is_break_image(img) -> bool:
    """Return True when an <img> Tag's src or alt text names a break graphic."""
    src = (img.get("src") or "").strip().lower()
    # A data: URI carries encoded payload rather than a file name; scanning
    # it for keywords would match random base64 fragments.
    if src.startswith("data:"):
        src = ""
    alt = (img.get("alt") or "").lower()
    for value in (src, alt):
        if any(word in value for word in _BREAK_IMG_KEYWORDS):
            return True
        if _BREAK_IMG_HR_TOKEN.search(value):
            return True
    return False


def is_break_element(el, empty_p_is_spacer: bool = False) -> bool:
    """Detect scene breaks based on tag, class, or text pattern.

    Args:
        el: A BeautifulSoup ``Tag`` or ``NavigableString``. Any other object
            is never a break.
        empty_p_is_spacer: When True, an empty ``<p>`` is not treated as a
            break (for content that puts a blank paragraph between every
            pair of paragraphs).

    Returns:
        True when ``el`` is any of:
        * an ``<hr>``;
        * a tag carrying one of the active break CSS classes (exact match
          after lowercasing and removing hyphens);
        * an empty ``<p>`` (no child tags, only whitespace/&nbsp;), unless
          ``empty_p_is_spacer`` is set;
        * an ``<img>`` whose src/alt names a break graphic, or a tag whose
          only non-whitespace child is such an image;
        * a tag or string whose whole text matches an active break pattern.
    """
    patterns = _get_patterns()

    if isinstance(el, NavigableString):
        text = str(el)
        return any(pat.match(text) for pat in patterns)
    if not isinstance(el, Tag):
        return False

    if el.name == "hr":
        return True

    classes = el.get("class") or []
    if isinstance(classes, str):
        # Some tree builders return "class" as one string; split it so the
        # loop sees class names rather than single characters.
        classes = classes.split()
    css_norm = _get_css_norm()
    if any(cls.lower().replace("-", "") in css_norm for cls in classes):
        return True

    # Empty paragraph (whitespace or &nbsp; only) counts as a break,
    # unless the content uses them as spacers between every paragraph.
    if el.name == "p" and not empty_p_is_spacer:
        has_child_tag = any(isinstance(c, Tag) for c in el.children)
        if not has_child_tag and not el.get_text().replace("\xa0", "").strip():
            return True

    # Image that represents a break
    if el.name == "img" and _is_break_image(el):
        return True

    # Element containing only a single image: it is a break exactly when
    # that image is one.
    significant = [
        c for c in el.children
        if not (isinstance(c, NavigableString) and not c.strip())
    ]
    if (
        len(significant) == 1
        and isinstance(significant[0], Tag)
        and significant[0].name == "img"
    ):
        return is_break_element(significant[0])

    # Text pattern
    text = el.get_text()
    return any(pat.match(text) for pat in patterns)
|
|
|
|
|
|
def _render_children(el, break_img_path: str, empty_p_is_spacer: bool) -> str:
    """Concatenate the XHTML of every direct child of *el*."""
    return "".join(
        element_to_xhtml(child, break_img_path, empty_p_is_spacer)
        for child in el.children
    )


def _break_content_is_marker(el) -> bool:
    """Return True when a break Tag contains nothing but the marker itself.

    That is the case when its whole text matches a break pattern
    (``<p>***</p>``) or when its only non-whitespace child is a break image.
    Rendering such children again would emit the break a second time.
    """
    text = el.get_text()
    if any(pat.match(text) for pat in _get_patterns()):
        return True
    significant = [
        c for c in el.children
        if not (isinstance(c, NavigableString) and not c.strip())
    ]
    return (
        len(significant) == 1
        and isinstance(significant[0], Tag)
        and significant[0].name == "img"
        and is_break_element(significant[0])
    )


def element_to_xhtml(el, break_img_path: str = "../Images/break.png", empty_p_is_spacer: bool = False) -> str:
    """Convert a BeautifulSoup element to an XHTML fragment.

    Args:
        el: A ``Tag`` or ``NavigableString`` from the parsed source.
        break_img_path: ``src`` written into the image that replaces every
            scene break. Inserted verbatim (not escaped).
        empty_p_is_spacer: Forwarded to ``is_break_element``; when True an
            empty ``<p>`` is not treated as a scene break.

    Returns:
        The XHTML for ``el``; an empty string when nothing is rendered.
        Break elements become a centred break image. ``p``/``div`` become
        ``<p>``, ``em``/``i`` become ``<em>``, ``strong``/``b`` become
        ``<strong>``, ``h1``-``h4`` are kept, ``br`` becomes ``<br />``,
        images keep their escaped ``src``/``alt``, ``figcaption`` is dropped,
        and every other tag is replaced by its rendered children.
    """
    # Comments are NavigableString subclasses; without this guard their text
    # would be escaped and shown to the reader.
    if isinstance(el, Comment):
        return ""

    if is_break_element(el, empty_p_is_spacer):
        result = f'<center><img src="{break_img_path}" style="height:15px;"/></center>'
        # HTML parsers (notably html.parser) can nest subsequent siblings inside
        # void elements like <hr>, so a break element may contain actual content
        # as children. Process those children so no text is silently discarded,
        # but skip them when they are only the marker itself: rendering
        # "<p>***</p>" that way would emit a second break image.
        if isinstance(el, Tag) and not _break_content_is_marker(el):
            trailer = _render_children(el, break_img_path, empty_p_is_spacer)
            if trailer.strip():
                result += "\n" + trailer
        return result

    if isinstance(el, NavigableString):
        text = str(el)
        return he(text) if text.strip() else ""

    name = el.name

    if name == "br":
        return "<br />"

    if name == "img":
        src = el.get("src", "")
        if not src:
            return ""
        alt = he(el.get("alt", ""))
        return f'<img src="{he(src)}" alt="{alt}"/>\n'

    if name == "figure":
        # Keep the figure's content but drop its caption.
        return "".join(
            element_to_xhtml(c, break_img_path, empty_p_is_spacer)
            for c in el.children
            if not (isinstance(c, Tag) and c.name == "figcaption")
        )

    inner = _render_children(el, break_img_path, empty_p_is_spacer)

    if name in ("p", "div"):
        # NOTE(review): a div that wraps <p> children yields nested <p>
        # elements here; callers appear to pass leaf blocks - confirm.
        inner = inner.strip()
        return f"<p>{inner}</p>\n" if inner else ""

    if name in ("em", "i"):
        return f"<em>{inner}</em>"

    if name in ("strong", "b"):
        return f"<strong>{inner}</strong>"

    if name in ("h1", "h2", "h3", "h4"):
        return f"<{name}>{inner}</{name}>\n"

    # sup/sub, links (text kept, link stripped) and all other tags:
    # only the rendered children survive.
    return inner
|
|
|
|
|
|
def normalize_wysiwyg_html(raw_html: str, break_img_path: str = "../Images/break.png") -> str:
    """Normalise HTML from the WYSIWYG editor into EPUB-compatible XHTML.

    Replaces scene breaks with the break image, keeps <strong>, <em>, <u>,
    <blockquote> and <blockquote class="author-note">, wraps loose text in
    <p>, and drops paragraphs that render to nothing.

    Args:
        raw_html: Editor output; ``None`` or an empty string yields ``""``.
        break_img_path: ``src`` written into the break image. Inserted
            verbatim (not escaped).

    Returns:
        The top-level blocks joined with newlines.

    Notes:
        * Break detection uses ``is_break_element`` with its defaults, so a
          completely empty ``<p>`` becomes a break image rather than being
          dropped. NOTE(review): confirm this is intended for editor output.
        * A ``<div>`` that contains block-level children is processed block
          by block; flattening it would glue its paragraphs together.
        * HTML comments are discarded.
    """
    from bs4 import BeautifulSoup, Comment, NavigableString, Tag

    break_xhtml = f'<center><img src="{break_img_path}" style="height:15px;"/></center>'
    # Source tag -> tag written to the output; other inline tags are unwrapped.
    inline_wrappers = {"strong": "strong", "b": "strong", "em": "em", "i": "em", "u": "u"}
    block_tags = ("p", "div", "blockquote", "h1", "h2", "h3", "h4", "hr")

    def process_inline(el) -> str:
        """Render *el* as inline content (no paragraph wrapping)."""
        if isinstance(el, Comment):
            return ""
        if isinstance(el, NavigableString):
            return he(str(el))
        if el.name == "br":
            return "<br />"
        inner = "".join(process_inline(c) for c in el.children)
        wrapper = inline_wrappers.get(el.name)
        return f"<{wrapper}>{inner}</{wrapper}>" if wrapper else inner

    def process_children(el) -> str | None:
        """Render each child of *el* as its own block; None when all are empty."""
        parts = [r for c in el.children if (r := process_block(c))]
        return "".join(parts) if parts else None

    def process_block(el) -> str | None:
        """Render *el* as block-level XHTML, or None when nothing remains."""
        if isinstance(el, Comment):
            return None
        if isinstance(el, NavigableString):
            text = str(el).strip()
            return f"<p>{he(text)}</p>" if text else None
        if not isinstance(el, Tag):
            return None

        # Covers <hr>, break classes, break images, empty <p> and text
        # markers, so no branch below needs to re-check for breaks.
        if is_break_element(el):
            return break_xhtml

        if el.name == "img":
            src = el.get("src", "")
            if not src:
                return None
            alt = he(el.get("alt", ""))
            return f'<img src="{he(src)}" alt="{alt}"/>'

        if el.name in ("p", "div"):
            if el.name == "div" and any(
                isinstance(c, Tag) and c.name in block_tags for c in el.children
            ):
                # Wrapper div: keep its paragraphs separate.
                return process_children(el)
            inner = "".join(process_inline(c) for c in el.children).strip()
            return f"<p>{inner}</p>" if inner else None

        if el.name in ("h1", "h2", "h3", "h4"):
            inner = "".join(process_inline(c) for c in el.children).strip()
            return f"<{el.name}>{inner}</{el.name}>" if inner else None

        if el.name == "blockquote":
            classes = el.get("class") or []
            if isinstance(classes, str):
                classes = classes.split()
            css_class = " ".join(classes)
            # The class comes from user HTML; escape it so a quote in the
            # value cannot break out of the attribute.
            tag_open = f'<blockquote class="{he(css_class)}">' if css_class else "<blockquote>"
            parts = []
            for child in el.children:
                if isinstance(child, Tag) and child.name in ("p", "div"):
                    inner = "".join(process_inline(c) for c in child.children).strip()
                    rendered = f"<p>{inner}</p>" if inner else None
                else:
                    rendered = process_block(child)
                if rendered:
                    parts.append(rendered)
            return f"{tag_open}{''.join(parts)}</blockquote>" if parts else None

        # Any other tag: treat its children as blocks.
        return process_children(el)

    soup = BeautifulSoup(raw_html or "", "html.parser")
    body = soup.find("body") or soup
    return "\n".join(r for child in body.children if (r := process_block(child)))
|