novela/containers/novela/epub.py
2026-03-26 10:24:57 +01:00

579 lines
22 KiB
Python

import io
import re
import zipfile
from datetime import datetime, timezone
from html import escape as he
from pathlib import Path
def detect_image_format(data: bytes, base: str) -> tuple[str, str]:
    """Sniff an image type from its magic bytes.

    Args:
        data: Raw image bytes; only the first twelve bytes are inspected.
        base: Filename stem without extension, e.g. 'cover' or 'ch001_img002'.

    Returns:
        ``(filename_with_ext, media_type)``. Data that matches no known
        signature is reported as JPEG.
    """
    # Each entry: ((offset, magic), ...), extension, media type.
    # Every probe of an entry must match for that entry to be selected.
    signatures = (
        (((0, b"\xff\xd8"),), "jpg", "image/jpeg"),
        (((0, b"\x89PNG\r\n\x1a\n"),), "png", "image/png"),
        (((0, b"RIFF"), (8, b"WEBP")), "webp", "image/webp"),
        (((0, b"GIF"),), "gif", "image/gif"),
    )
    for probes, extension, media_type in signatures:
        if all(data[start:start + len(magic)] == magic for start, magic in probes):
            return f"{base}.{extension}", media_type
    # Unknown signature: fall back to JPEG.
    return f"{base}.jpg", "image/jpeg"
def add_cover_to_epub(epub_path, cover_data: bytes) -> None:
    """Replace (or add) the cover image in an existing EPUB, rewriting it in place.

    The OPF is located through ``META-INF/container.xml`` (falling back to
    ``OEBPS/content.opf``). The current cover is the first manifest item whose
    id starts with ``cover`` and whose media type is an image. The new cover
    is written into the same folder as the old one, or into ``Images/`` next
    to the OPF when no old cover exists, and the OPF is rewritten through
    ``_patch_opf``.

    Args:
        epub_path: Path of the EPUB file; read completely, then overwritten.
        cover_data: Raw bytes of the new cover image.

    Raises:
        KeyError: If the archive has no ``mimetype`` entry.
        UnicodeDecodeError: If the OPF is not valid UTF-8.
    """
    cover_filename, cover_media_type = detect_image_format(cover_data, "cover")

    with open(epub_path, "rb") as f:
        original = f.read()

    buf = io.BytesIO()
    with zipfile.ZipFile(io.BytesIO(original), "r") as zin:
        # Locate the OPF via META-INF/container.xml (best effort).
        opf_path = "OEBPS/content.opf"
        try:
            container = zin.read("META-INF/container.xml").decode("utf-8", errors="replace")
        except KeyError:
            container = ""
        m = re.search(r'full-path\s*=\s*["\']([^"\']+)["\']', container)
        if m:
            opf_path = m.group(1)
        opf_dir = opf_path.rsplit("/", 1)[0] if "/" in opf_path else ""

        # Parse the OPF to find the existing cover image path (best effort).
        try:
            opf_text = zin.read(opf_path).decode("utf-8", errors="replace")
        except KeyError:
            opf_text = ""
        old_cover_zip_path: str | None = None
        for m in re.finditer(
            r'<item\b[^>]+id=["\']cover[^"\']*["\'][^>]*/?>',
            opf_text,
        ):
            tag = m.group(0)
            # Only image items qualify: a cover *page* (e.g. id="cover",
            # application/xhtml+xml) must not be dropped from the archive.
            if not re.search(r'media-type=["\']image/', tag):
                continue
            href_m = re.search(r'href=["\']([^"\']+)["\']', tag)
            if not href_m:
                continue
            href = href_m.group(1)
            zip_path = (opf_dir + "/" + href).lstrip("/") if opf_dir else href
            # Normalise ../ segments
            resolved: list[str] = []
            for part in zip_path.split("/"):
                if part == ".." and resolved:
                    resolved.pop()
                else:
                    resolved.append(part)
            old_cover_zip_path = "/".join(resolved)
            break

        # Decide where to write the new cover (same folder as old, or Images/ next to OPF)
        if old_cover_zip_path:
            cover_dir = old_cover_zip_path.rsplit("/", 1)[0] if "/" in old_cover_zip_path else ""
        else:
            # _patch_opf references a brand-new cover as "Images/<file>"
            # relative to the OPF, so a root-level OPF needs plain "Images".
            cover_dir = (opf_dir + "/Images").lstrip("/") if opf_dir else "Images"
        new_cover_zip_path = (cover_dir + "/" + cover_filename).lstrip("/")

        # Entries that are not copied verbatim: mimetype is written first
        # below, the old cover is replaced, and anything already stored at the
        # new cover's path would otherwise become a duplicate zip member.
        skipped = {"mimetype", new_cover_zip_path}
        if old_cover_zip_path:
            skipped.add(old_cover_zip_path)

        # Rebuild the ZIP
        with zipfile.ZipFile(buf, "w", zipfile.ZIP_DEFLATED) as zout:
            zout.writestr(
                zipfile.ZipInfo("mimetype"),
                zin.read("mimetype"),
                compress_type=zipfile.ZIP_STORED,
            )
            for item in zin.infolist():
                if item.filename in skipped:
                    continue
                data = zin.read(item)
                if item.filename == opf_path:
                    data = _patch_opf(
                        data.decode("utf-8"),
                        cover_filename,
                        cover_media_type,
                        old_cover_zip_path,
                        opf_dir,
                    ).encode("utf-8")
                zout.writestr(item, data)
            # Write the new cover image
            zout.writestr(new_cover_zip_path, cover_data)

    with open(epub_path, "wb") as f:
        f.write(buf.getvalue())
def _patch_opf(
opf: str,
cover_filename: str,
cover_media_type: str,
old_cover_zip_path: str | None,
opf_dir: str,
) -> str:
"""Replace or insert the cover manifest item and cover meta in an OPF."""
# Remove "Cover Missing" dc:subject
opf = re.sub(r'\s*<dc:subject>Cover Missing</dc:subject>', '', opf)
# Remove existing cover manifest item(s) with id starting with "cover"
opf = re.sub(r'\s*<item\b[^>]+id=["\']cover[^"\']*["\'][^>]*/>', '', opf)
opf = re.sub(r'\s*<item\b[^>]+id=["\']cover[^"\']*["\'][^>]*></item>', '', opf)
# Remove existing <meta name="cover" .../>
opf = re.sub(r'\s*<meta\b[^>]+name=["\']cover["\'][^>]*/>', '', opf)
# Compute relative href from OPF dir to the new cover
# new cover is placed in the same folder as the old one, relative to OPF
cover_href = cover_filename # same dir as OPF → just the filename
if old_cover_zip_path:
old_dir = old_cover_zip_path.rsplit("/", 1)[0] if "/" in old_cover_zip_path else ""
if old_dir != opf_dir:
# Make relative: e.g. opf_dir=EPUB, old_dir=EPUB/images → href=images/cover.jpg
if opf_dir and old_dir.startswith(opf_dir + "/"):
cover_href = old_dir[len(opf_dir) + 1:] + "/" + cover_filename
else:
cover_href = cover_filename
else:
cover_href = cover_filename
else:
cover_href = "Images/" + cover_filename
cover_item = f'<item id="cover-img" href="{cover_href}" media-type="{cover_media_type}"/>'
opf = opf.replace("</manifest>", f' {cover_item}\n </manifest>')
cover_meta = '<meta name="cover" content="cover-img"/>'
opf = opf.replace("</metadata>", f' {cover_meta}\n </metadata>')
return opf
def make_chapter_xhtml(title: str, content_html: str, chapter_num: int) -> str:
    """Wrap chapter markup in a complete XHTML 1.0 Transitional document.

    Args:
        title: Chapter title; HTML-escaped and used for both ``<title>`` and
            the ``<h2 class="chapter-title">`` heading.
        content_html: Chapter body markup, inserted verbatim (not escaped).
        chapter_num: Accepted for the caller's convenience; not used here.

    Returns:
        The document text, terminated by a newline.
    """
    safe_title = he(title)
    document_lines = (
        '<?xml version="1.0" encoding="UTF-8"?>',
        '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"',
        '"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">',
        '<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en">',
        "<head>",
        '<meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>',
        f"<title>{safe_title}</title>",
        '<link rel="stylesheet" type="text/css" href="../Styles/style.css"/>',
        "</head>",
        "<body>",
        f'<h2 class="chapter-title">{safe_title}</h2>',
        content_html,
        "</body>",
        "</html>",
    )
    return "\n".join(document_lines) + "\n"
def make_intro_xhtml(book_title: str, author: str, book_info: dict) -> str:
    """Render the intro ("book info") page as an XHTML 1.0 Transitional document.

    Sections are emitted in a fixed order, each only when its ``book_info``
    value is truthy: index illustration, genres, sub-genres, tags,
    description (blank-line separated paragraphs between two rules), source
    URL and updated date. All inserted text is HTML-escaped.

    Args:
        book_title: Used for ``<title>`` and the ``<h1>`` heading.
        author: Shown in the "by ..." line.
        book_info: Metadata dict; the keys read are ``index_image_name``,
            ``genres``, ``subgenres``, ``tags``, ``description``,
            ``source_url`` and ``updated_date``.

    Returns:
        The document text, terminated by a newline.
    """
    body: list[str] = []

    # Optional illustration from the story index page
    image_name = book_info.get("index_image_name")
    if image_name:
        body.append(
            f'<div class="intro-image"><img src="../Images/{he(image_name)}"'
            ' alt="" style="max-width:100%;"/></div>'
        )

    for key, label in (("genres", "Genres"), ("subgenres", "Sub-genres"), ("tags", "Tags")):
        values = book_info.get(key)
        if values:
            body.append(f'<p><strong>{label}:</strong> {he(", ".join(values))}</p>')

    description = book_info.get("description")
    if description:
        body.append("<hr/>")
        body.extend(
            f"<p>{he(chunk.strip())}</p>"
            for chunk in description.split("\n\n")
            if chunk.strip()
        )
        body.append("<hr/>")

    for key, label in (("source_url", "Source"), ("updated_date", "Updated")):
        value = book_info.get(key)
        if value:
            body.append(f"<p><strong>{label}:</strong> {he(value)}</p>")

    safe_title = he(book_title)
    safe_author = he(author)
    document_lines = (
        '<?xml version="1.0" encoding="UTF-8"?>',
        '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"',
        '"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">',
        '<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en">',
        "<head>",
        '<meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>',
        f"<title>{safe_title}</title>",
        '<link rel="stylesheet" type="text/css" href="../Styles/style.css"/>',
        "</head>",
        "<body>",
        f"<h1>{safe_title}</h1>",
        f'<p class="author">by {safe_author}</p>',
        "\n".join(body),
        "</body>",
        "</html>",
    )
    return "\n".join(document_lines) + "\n"
def make_epub(
    book_title: str,
    author: str,
    chapters: list[dict],
    cover_data: bytes | None,
    break_img_data: bytes,
    book_id: str,
    book_info: dict | None = None,
) -> bytes:
    """Build a complete EPUB 2.0 in-memory and return the bytes.

    Args:
        book_title: Title used in the OPF and the NCX.
        author: Creator name written to the OPF and the intro page.
        chapters: One dict per chapter. ``xhtml`` (full document text) and
            ``title`` are required; ``images`` is an optional list of dicts
            with ``epub_path``, ``data`` and ``media_type``.
        cover_data: Cover image bytes, or ``None`` for a book without cover.
        break_img_data: Bytes stored as ``OEBPS/Images/break.png``.
        book_id: Unique identifier written to the OPF and the NCX.
        book_info: Optional metadata. Keys read here: ``genres``,
            ``subgenres``, ``tags``, ``description``, ``updated_date``,
            ``source_url``, ``publisher``, ``series``, ``series_index``,
            ``publication_status``, ``index_image_name``,
            ``index_image_data`` and ``index_image_mime``.

    Returns:
        The EPUB archive as bytes.

    Raises:
        OSError: If ``static/epub-style.css`` cannot be read.
        KeyError: If a chapter or chapter image lacks a required key.
    """
    info = book_info or {}
    # Escaped once; the id is interpolated into both the OPF and the NCX.
    safe_book_id = he(str(book_id))

    # The intro illustration is only usable when both its name and its bytes
    # are present; one flag drives the zip entry, the manifest item and the
    # <img> on the intro page so the three can never disagree.
    intro_img_name = info.get("index_image_name")
    intro_img_data = info.get("index_image_data")
    has_intro_img = bool(intro_img_name and intro_img_data)
    intro_info = info if has_intro_img else {
        key: value for key, value in info.items() if key != "index_image_name"
    }

    has_cover = cover_data is not None
    cover_filename = ""
    cover_media_type = ""
    if has_cover:
        cover_filename, cover_media_type = detect_image_format(cover_data, "cover")

    # Context manager so the stylesheet handle is closed deterministically.
    with open("static/epub-style.css", "r", encoding="utf-8") as css_file:
        css = css_file.read()

    buf = io.BytesIO()
    with zipfile.ZipFile(buf, "w", zipfile.ZIP_DEFLATED) as zf:
        # mimetype must be first and uncompressed
        zf.writestr(
            zipfile.ZipInfo("mimetype"),
            "application/epub+zip",
            compress_type=zipfile.ZIP_STORED,
        )
        zf.writestr(
            "META-INF/container.xml",
            '<?xml version="1.0"?>\n'
            '<container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container">\n'
            "<rootfiles>\n"
            '<rootfile full-path="OEBPS/content.opf" media-type="application/oebps-package+xml"/>\n'
            "</rootfiles>\n"
            "</container>",
        )
        zf.writestr("OEBPS/Styles/style.css", css)
        zf.writestr("OEBPS/Images/break.png", break_img_data)

        # Optional intro illustration (index page image)
        if has_intro_img:
            zf.writestr(f"OEBPS/Images/{intro_img_name}", intro_img_data)

        if has_cover:
            zf.writestr(f"OEBPS/Images/{cover_filename}", cover_data)

        zf.writestr("OEBPS/Text/intro.xhtml", make_intro_xhtml(book_title, author, intro_info))

        # Chapter images
        for ch in chapters:
            for img in ch.get("images", []):
                zf.writestr(img["epub_path"], img["data"])

        chapter_files = []
        for i, ch in enumerate(chapters, 1):
            fname = f"chapter{i:03d}.xhtml"
            zf.writestr(f"OEBPS/Text/{fname}", ch["xhtml"])
            chapter_files.append((fname, ch["title"]))

        # Manifest
        manifest_items = []
        if has_cover:
            manifest_items.append(
                f'<item id="cover-img" href="Images/{cover_filename}" media-type="{cover_media_type}"/>'
            )
        # Chapter images
        for ch in chapters:
            for img in ch.get("images", []):
                img_id = he(img["epub_path"].split("/")[-1].replace(".", "_"))
                # hrefs in the manifest are relative to the OPF inside OEBPS/
                img_href = he(img["epub_path"].removeprefix("OEBPS/"))
                manifest_items.append(
                    f'<item id="{img_id}" href="{img_href}"'
                    f' media-type="{he(img["media_type"])}"/>'
                )
        if has_intro_img:
            # Fall back to sniffing when the caller supplied no media type.
            intro_img_mime = (
                info.get("index_image_mime")
                or detect_image_format(intro_img_data, "intro")[1]
            )
            manifest_items.append(
                f'<item id="intro-img" href="Images/{he(intro_img_name)}"'
                f' media-type="{he(intro_img_mime)}"/>'
            )
        manifest_items.append('<item id="break-img" href="Images/break.png" media-type="image/png"/>')
        manifest_items.append('<item id="css" href="Styles/style.css" media-type="text/css"/>')
        manifest_items.append('<item id="intro" href="Text/intro.xhtml" media-type="application/xhtml+xml"/>')
        for i, (fname, _) in enumerate(chapter_files, 1):
            manifest_items.append(f'<item id="ch{i:03d}" href="Text/{fname}" media-type="application/xhtml+xml"/>')
        manifest_items.append('<item id="ncx" href="toc.ncx" media-type="application/x-dtbncx+xml"/>')

        spine_items = ['<itemref idref="intro"/>'] + [
            f'<itemref idref="ch{i:03d}"/>' for i in range(1, len(chapter_files) + 1)
        ]

        cover_meta = '<meta name="cover" content="cover-img"/>' if has_cover else ""
        # "or []" tolerates keys that are present but set to None.
        subjects = [
            subject
            for key in ("genres", "subgenres", "tags")
            for subject in (info.get(key) or [])
        ]
        subject_items = "".join(
            f"\n <dc:subject>{he(subject)}</dc:subject>" for subject in subjects
        )
        desc_item = (
            f"\n <dc:description>{he(info['description'].replace(chr(10), ' '))}</dc:description>"
            if info.get("description") else ""
        )
        date_item = (
            f"\n <dc:date opf:event=\"modification\">{he(info['updated_date'])}</dc:date>"
            if info.get("updated_date") else ""
        )
        source_item = (
            f"\n <dc:source>{he(info['source_url'])}</dc:source>"
            if info.get("source_url") else ""
        )
        publisher_item = (
            f"\n <dc:publisher>{he(info['publisher'])}</dc:publisher>"
            if info.get("publisher") else ""
        )
        series_items = ""
        if info.get("series"):
            s = he(info["series"])
            # An explicit None means "no index given"; 0 stays a valid index.
            raw_idx = info.get("series_index")
            idx = int(raw_idx) if raw_idx is not None else 1
            series_items = (
                f'\n <meta name="calibre:series" content="{s}"/>'
                f'\n <meta name="calibre:series_index" content="{idx}"/>'
            )
        status_item = (
            f'\n <meta name="publication_status" content="{he(info["publication_status"])}"/>'
            if info.get("publication_status") else ""
        )

        metadata_extras = "".join((
            cover_meta, subject_items, desc_item, date_item,
            source_item, publisher_item, series_items, status_item,
        ))
        opf = "\n".join((
            '<?xml version="1.0" encoding="UTF-8"?>',
            '<package xmlns="http://www.idpf.org/2007/opf" version="2.0" unique-identifier="BookId">',
            '<metadata xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:opf="http://www.idpf.org/2007/opf">',
            f"<dc:title>{he(book_title)}</dc:title>",
            f'<dc:creator opf:role="aut">{he(author)}</dc:creator>',
            "<dc:language>en</dc:language>",
            f'<dc:identifier id="BookId">{safe_book_id}</dc:identifier>',
            metadata_extras,
            "</metadata>",
            "<manifest>",
            "".join(manifest_items),
            "</manifest>",
            '<spine toc="ncx">',
            "".join(spine_items),
            "</spine>",
            "</package>",
        ))
        zf.writestr("OEBPS/content.opf", opf)

        # TOC NCX: the intro page takes playOrder 1, chapters follow from 2.
        nav_points = [
            ' <navPoint id="intro" playOrder="1">\n'
            "<navLabel><text>Book Info</text></navLabel>\n"
            '<content src="Text/intro.xhtml"/>\n'
            "</navPoint>"
        ]
        for i, (fname, title) in enumerate(chapter_files, 1):
            nav_points.append(
                f' <navPoint id="ch{i:03d}" playOrder="{i + 1}">\n'
                f"<navLabel><text>{he(title)}</text></navLabel>\n"
                f'<content src="Text/{fname}"/>\n'
                "</navPoint>"
            )
        ncx = "\n".join((
            '<?xml version="1.0" encoding="UTF-8"?>',
            '<!DOCTYPE ncx PUBLIC "-//NISO//DTD ncx 2005-1//EN"',
            '"http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">',
            '<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1">',
            "<head>",
            f'<meta name="dtb:uid" content="{safe_book_id}"/>',
            '<meta name="dtb:depth" content="1"/>',
            '<meta name="dtb:totalPageCount" content="0"/>',
            '<meta name="dtb:maxPageNumber" content="0"/>',
            "</head>",
            f"<docTitle><text>{he(book_title)}</text></docTitle>",
            "<navMap>",
            "".join(nav_points),
            "</navMap>",
            "</ncx>",
        ))
        zf.writestr("OEBPS/toc.ncx", ncx)

    return buf.getvalue()
def read_epub_file(epub_path, internal_path: str) -> str:
    """Return one member of the EPUB archive as text.

    The member's bytes are decoded as UTF-8; undecodable bytes become the
    replacement character instead of raising.
    """
    with zipfile.ZipFile(epub_path, "r") as archive:
        raw = archive.read(internal_path)
    return raw.decode("utf-8", errors="replace")
def write_epub_file(epub_path, internal_path: str, content: str) -> None:
    """Replace a single file inside the EPUB zip (full zip rewrite).

    If OEBPS/Images/break.png is missing from the zip it is added
    automatically from ``static/break.png``, so break-image inserts made in
    the editor render correctly in older EPUBs. A missing or unreadable
    ``static/break.png`` is ignored.

    Args:
        epub_path: Path of the EPUB; read completely, then overwritten.
        internal_path: Archive name of the member to replace. When no member
            has this name, the archive is rewritten without the new content.
        content: New text for the member, stored as UTF-8.

    Raises:
        KeyError: If the archive has no ``mimetype`` entry.
    """
    with open(epub_path, "rb") as f:
        original = f.read()

    break_img_path = "OEBPS/Images/break.png"
    buf = io.BytesIO()
    with zipfile.ZipFile(io.BytesIO(original), "r") as zin, \
            zipfile.ZipFile(buf, "w", zipfile.ZIP_DEFLATED) as zout:
        # mimetype must stay the first entry and be stored uncompressed
        zout.writestr(
            zipfile.ZipInfo("mimetype"), zin.read("mimetype"),
            compress_type=zipfile.ZIP_STORED,
        )
        has_break = False
        for item in zin.infolist():
            if item.filename == "mimetype":
                continue
            if item.filename == break_img_path:
                has_break = True
            if item.filename == internal_path:
                zout.writestr(item, content.encode("utf-8"))
            else:
                zout.writestr(item, zin.read(item))
        if not has_break:
            # Best effort: only file-system failures are tolerated, and the
            # handle is closed deterministically.
            try:
                with open("static/break.png", "rb") as break_file:
                    break_bytes = break_file.read()
            except OSError:
                break_bytes = None
            if break_bytes is not None:
                zout.writestr(break_img_path, break_bytes)

    with open(epub_path, "wb") as f:
        f.write(buf.getvalue())
def build_epub(
    title: str,
    author: str,
    publisher: str,
    chapters: list[dict],
) -> bytes:
    """Build an EPUB 2.0 file from builder data and return the raw bytes.

    Args:
        title: Book title (escaped for the OPF and the NCX).
        author: Author name (escaped for the OPF and the NCX).
        publisher: Publisher name; a falsy value yields an empty element.
        chapters: One dict per chapter. ``title`` falls back to
            "Hoofdstuk <n>" and ``content`` (inserted verbatim as body
            markup) falls back to an empty paragraph.

    Returns:
        The EPUB archive as bytes. A fresh UUID is generated as the book id
        on every call. ``static/break.png`` is bundled and listed in the
        manifest only when that file exists.
    """
    import uuid as _uuid

    book_id = str(_uuid.uuid4())
    now_str = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

    break_png_path = Path("static/break.png")
    # Checked once so the zip contents and the manifest cannot disagree.
    has_break = break_png_path.is_file()

    buf = io.BytesIO()
    with zipfile.ZipFile(buf, "w", zipfile.ZIP_DEFLATED) as z:
        # mimetype: uncompressed and the first entry
        mi = zipfile.ZipInfo("mimetype")
        mi.compress_type = zipfile.ZIP_STORED
        z.writestr(mi, "application/epub+zip")

        z.writestr(
            "META-INF/container.xml",
            '<?xml version="1.0" encoding="UTF-8"?>\n'
            '<container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container">\n'
            ' <rootfiles>\n'
            ' <rootfile full-path="OEBPS/content.opf"'
            ' media-type="application/oebps-package+xml"/>\n'
            ' </rootfiles>\n'
            '</container>\n',
        )

        style_css = (
            "body { font-family: Georgia, serif; font-size: 1em;"
            " line-height: 1.6; margin: 1em; }\n"
            "p { margin: 0 0 0.8em 0; text-indent: 1.2em; }\n"
            "p:first-child, h1 + p, h2 + p, h3 + p { text-indent: 0; }\n"
            "h1, h2, h3 { font-weight: bold; margin: 1.2em 0 0.4em; }\n"
            "blockquote { margin: 1em 2em; padding: 0.3em 0.8em;"
            " border-left: 3px solid #aaa; }\n"
            "blockquote.author-note { font-style: italic; color: #666;"
            " border-left: 3px solid #555; margin: 1.2em 2em;"
            " padding: 0.4em 1em; font-size: 0.92em; }\n"
            "center img { display: block; margin: 1em auto; }\n"
        )
        z.writestr("OEBPS/Styles/style.css", style_css)

        if has_break:
            z.write(str(break_png_path), "OEBPS/Images/break.png")

        manifest_items: list[str] = []
        spine_idrefs: list[str] = []
        ncx_nav_points: list[str] = []

        for n, ch in enumerate(chapters, 1):
            ch_id = f"chapter_{n:03d}"
            ch_filename = f"OEBPS/Text/{ch_id}.xhtml"
            ch_title = he(ch.get("title") or f"Hoofdstuk {n}")
            ch_content = ch.get("content") or "<p></p>"

            xhtml = (
                '<?xml version="1.0" encoding="UTF-8"?>\n'
                '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"\n'
                ' "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">\n'
                '<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="nl">\n'
                "<head>\n"
                ' <meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>\n'
                f" <title>{ch_title}</title>\n"
                ' <link rel="stylesheet" type="text/css" href="../Styles/style.css"/>\n'
                "</head>\n"
                "<body>\n"
                f" <h2 class=\"chapter-title\">{ch_title}</h2>\n"
                f" {ch_content}\n"
                "</body>\n"
                "</html>\n"
            )
            z.writestr(ch_filename, xhtml)

            manifest_items.append(
                f' <item id="{ch_id}" href="Text/{ch_id}.xhtml"'
                f' media-type="application/xhtml+xml"/>'
            )
            spine_idrefs.append(f' <itemref idref="{ch_id}"/>')
            # No intro navPoint is emitted here, so playOrder starts at 1
            # with the first chapter.
            ncx_nav_points.append(
                f' <navPoint id="navPoint-{n}" playOrder="{n}">\n'
                f' <navLabel><text>{ch_title}</text></navLabel>\n'
                f' <content src="Text/{ch_id}.xhtml"/>\n'
                f' </navPoint>'
            )

        safe_title = he(title)
        safe_author = he(author)

        ncx = (
            '<?xml version="1.0" encoding="UTF-8"?>\n'
            '<!DOCTYPE ncx PUBLIC "-//NISO//DTD ncx 2005-1//EN"\n'
            ' "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">\n'
            '<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1">\n'
            "<head>\n"
            f' <meta name="dtb:uid" content="{book_id}"/>\n'
            ' <meta name="dtb:depth" content="1"/>\n'
            ' <meta name="dtb:totalPageCount" content="0"/>\n'
            ' <meta name="dtb:maxPageNumber" content="0"/>\n'
            "</head>\n"
            f"<docTitle><text>{safe_title}</text></docTitle>\n"
            f"<docAuthor><text>{safe_author}</text></docAuthor>\n"
            "<navMap>\n"
            + "\n".join(ncx_nav_points)
            + "\n</navMap>\n</ncx>\n"
        )
        z.writestr("OEBPS/toc.ncx", ncx)

        opf = (
            '<?xml version="1.0" encoding="UTF-8"?>\n'
            '<package xmlns="http://www.idpf.org/2007/opf" version="2.0"'
            ' unique-identifier="BookId">\n'
            '<metadata xmlns:dc="http://purl.org/dc/elements/1.1/"'
            ' xmlns:opf="http://www.idpf.org/2007/opf">\n'
            f' <dc:title>{safe_title}</dc:title>\n'
            f' <dc:creator opf:role="aut">{safe_author}</dc:creator>\n'
            f' <dc:publisher>{he(publisher or "")}</dc:publisher>\n'
            f' <dc:identifier id="BookId" opf:scheme="UUID">{book_id}</dc:identifier>\n'
            f' <dc:date>{now_str}</dc:date>\n'
            ' <dc:language>nl</dc:language>\n'
            "</metadata>\n"
            "<manifest>\n"
            ' <item id="ncx" href="toc.ncx" media-type="application/x-dtbncx+xml"/>\n'
            ' <item id="style" href="Styles/style.css" media-type="text/css"/>\n'
            + (
                ' <item id="break-img" href="Images/break.png" media-type="image/png"/>\n'
                if has_break else ""
            )
            + "\n".join(manifest_items)
            + "\n</manifest>\n"
            '<spine toc="ncx">\n'
            + "\n".join(spine_idrefs)
            + "\n</spine>\n"
            "</package>\n"
        )
        z.writestr("OEBPS/content.opf", opf)

    return buf.getvalue()