# novela/containers/novela/epub.py

import io
import re
import zipfile
from datetime import datetime, timezone
from html import escape as he
from pathlib import Path


def detect_image_format(data: bytes, base: str) -> tuple[str, str]:
    """Sniff the image type from its magic bytes.

    data -- raw image bytes
    base -- filename stem without extension, e.g. 'cover' or 'ch001_img002'

    Returns (filename_with_ext, media_type). Data that matches none of the
    known signatures is reported as JPEG.
    """
    # Each entry: ((offset, expected bytes), ...), extension, media type.
    # Entries are probed in order; every probe of an entry must match.
    signatures = (
        (((0, b'\xff\xd8'),), "jpg", "image/jpeg"),
        (((0, b'\x89PNG\r\n\x1a\n'),), "png", "image/png"),
        (((0, b'RIFF'), (8, b'WEBP')), "webp", "image/webp"),
        (((0, b'GIF'),), "gif", "image/gif"),
    )
    for probes, ext, media_type in signatures:
        if all(data[pos:pos + len(magic)] == magic for pos, magic in probes):
            return f"{base}.{ext}", media_type
    # Unknown signature: fall back to JPEG
    return f"{base}.jpg", "image/jpeg"

def add_cover_to_epub(epub_path, cover_data: bytes) -> None:
    """Replace (or add) the cover image in an existing EPUB.

    The archive is read fully into memory, rebuilt with the new cover image
    and a patched OPF, and then written back over *epub_path*.

    epub_path  -- path of the EPUB file; it is overwritten in place
    cover_data -- raw bytes of the new cover image; its file extension and
                  media type are detected from the magic bytes

    The new image is stored next to the old cover image when one is found in
    the OPF manifest, otherwise in an ``Images`` folder next to the OPF.
    """
    cover_filename, cover_media_type = detect_image_format(cover_data, "cover")

    with open(epub_path, "rb") as f:
        original = f.read()

    buf = io.BytesIO()
    with zipfile.ZipFile(io.BytesIO(original), "r") as zin:
        # Locate the OPF via META-INF/container.xml, falling back to the
        # conventional location. Only a missing member is tolerated here: any
        # other read error would hit the same entry again in the copy loop.
        opf_path = "OEBPS/content.opf"
        try:
            container = zin.read("META-INF/container.xml").decode("utf-8", errors="replace")
        except KeyError:
            container = ""
        m = re.search(r'full-path\s*=\s*["\']([^"\']+)["\']', container)
        if m:
            opf_path = m.group(1)
        opf_dir = opf_path.rsplit("/", 1)[0] if "/" in opf_path else ""

        # Parse the OPF to find the existing cover image path
        try:
            opf_text = zin.read(opf_path).decode("utf-8", errors="replace")
        except KeyError:
            opf_text = ""

        old_cover_zip_path: str | None = None
        for m in re.finditer(
            r'<item\b[^>]+id=["\']cover[^"\']*["\'][^>]*/?>',
            opf_text,
        ):
            tag = m.group(0)
            # An id starting with "cover" is also commonly used for the cover
            # XHTML page; only an image item may be replaced by the new image.
            if not re.search(r'media-type\s*=\s*["\']image/', tag):
                continue
            href_m = re.search(r'href=["\']([^"\']+)["\']', tag)
            if not href_m:
                continue
            href = href_m.group(1)
            zip_path = (opf_dir + "/" + href).lstrip("/") if opf_dir else href
            # Normalise ../ segments
            resolved: list[str] = []
            for p in zip_path.split("/"):
                if p == ".." and resolved:
                    resolved.pop()
                else:
                    resolved.append(p)
            old_cover_zip_path = "/".join(resolved)
            break

        # Decide where to write the new cover (same folder as old, or Images/ next to OPF)
        if old_cover_zip_path:
            cover_dir = old_cover_zip_path.rsplit("/", 1)[0] if "/" in old_cover_zip_path else ""
        elif opf_dir:
            cover_dir = (opf_dir + "/Images").lstrip("/")
        else:
            # OPF at the archive root: the manifest href "Images/<file>" is
            # relative to the OPF, so the folder must live at the root too.
            cover_dir = "Images"
        new_cover_zip_path = (cover_dir + "/" + cover_filename).lstrip("/")

        # Entries that must not be copied verbatim: mimetype is written first
        # by hand, the old cover is dropped, and anything already stored at
        # the new cover's path would otherwise become a duplicate ZIP member.
        skipped = {"mimetype", new_cover_zip_path}
        if old_cover_zip_path:
            skipped.add(old_cover_zip_path)

        # Rebuild the ZIP
        with zipfile.ZipFile(buf, "w", zipfile.ZIP_DEFLATED) as zout:
            # mimetype must be the first entry and stored uncompressed
            zout.writestr(zipfile.ZipInfo("mimetype"), zin.read("mimetype"), compress_type=zipfile.ZIP_STORED)

            for item in zin.infolist():
                if item.filename in skipped:
                    continue
                data = zin.read(item.filename)
                if item.filename == opf_path:
                    data = _patch_opf(
                        data.decode("utf-8"),
                        cover_filename,
                        cover_media_type,
                        old_cover_zip_path,
                        opf_dir,
                    ).encode("utf-8")
                zout.writestr(item, data)

            # Write the new cover image
            zout.writestr(new_cover_zip_path, cover_data)

    # Only overwrite the file once the rebuilt archive is complete in memory
    with open(epub_path, "wb") as f:
        f.write(buf.getvalue())


def _patch_opf(
    opf: str,
    cover_filename: str,
    cover_media_type: str,
    old_cover_zip_path: str | None,
    opf_dir: str,
) -> str:
    """Replace or insert the cover manifest item and cover meta in an OPF."""
    # Remove "Cover Missing" dc:subject
    opf = re.sub(r'\s*<dc:subject>Cover Missing</dc:subject>', '', opf)

    # Remove existing cover manifest item(s) with id starting with "cover"
    opf = re.sub(r'\s*<item\b[^>]+id=["\']cover[^"\']*["\'][^>]*/>', '', opf)
    opf = re.sub(r'\s*<item\b[^>]+id=["\']cover[^"\']*["\'][^>]*></item>', '', opf)
    # Remove existing <meta name="cover" .../>
    opf = re.sub(r'\s*<meta\b[^>]+name=["\']cover["\'][^>]*/>', '', opf)

    # Compute relative href from OPF dir to the new cover
    # new cover is placed in the same folder as the old one, relative to OPF
    cover_href = cover_filename  # same dir as OPF → just the filename
    if old_cover_zip_path:
        old_dir = old_cover_zip_path.rsplit("/", 1)[0] if "/" in old_cover_zip_path else ""
        if old_dir != opf_dir:
            # Make relative: e.g. opf_dir=EPUB, old_dir=EPUB/images → href=images/cover.jpg
            if opf_dir and old_dir.startswith(opf_dir + "/"):
                cover_href = old_dir[len(opf_dir) + 1:] + "/" + cover_filename
            else:
                cover_href = cover_filename
        else:
            cover_href = cover_filename
    else:
        cover_href = "Images/" + cover_filename

    cover_item = f'<item id="cover-img" href="{cover_href}" media-type="{cover_media_type}"/>'
    opf = opf.replace("</manifest>", f'  {cover_item}\n  </manifest>')

    cover_meta = '<meta name="cover" content="cover-img"/>'
    opf = opf.replace("</metadata>", f'    {cover_meta}\n  </metadata>')

    return opf


def make_chapter_xhtml(title: str, content_html: str, chapter_num: int) -> str:
    """Wrap chapter markup in a complete XHTML 1.0 Transitional document.

    title        -- chapter title; HTML-escaped and used for <title> and the <h2>
    content_html -- body markup, inserted verbatim (not escaped)
    chapter_num  -- currently unused by this function
    """
    safe_title = he(title)
    return (
        '<?xml version="1.0" encoding="UTF-8"?>\n'
        '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"\n'
        '  "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">\n'
        '<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en">\n'
        '<head>\n'
        '  <meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>\n'
        '  <title>' + safe_title + '</title>\n'
        '  <link rel="stylesheet" type="text/css" href="../Styles/style.css"/>\n'
        '</head>\n'
        '<body>\n'
        '  <h2 class="chapter-title">' + safe_title + '</h2>\n'
        + content_html + '\n'
        '</body>\n'
        '</html>\n'
    )


def make_intro_xhtml(book_title: str, author: str, book_info: dict) -> str:
    """Render the intro page as a complete XHTML 1.0 Transitional document.

    book_title -- shown in <title> and the <h1>; HTML-escaped
    author     -- shown in the "by ..." line; HTML-escaped
    book_info  -- optional metadata; the keys read here are
                  index_image_name, genres, subgenres, tags, description,
                  source_url and updated_date. Missing or empty values are
                  skipped.
    """
    blocks: list[str] = []

    # Optional illustration from the story index page (e.g. awesomedude.org)
    image_name = book_info.get("index_image_name")
    if image_name:
        blocks.append(
            '<div class="intro-image"><img src="../Images/'
            + he(image_name)
            + '" alt="" style="max-width:100%;"/></div>'
        )

    # Comma-separated classification lines
    for key, label in (("genres", "Genres"), ("subgenres", "Sub-genres"), ("tags", "Tags")):
        values = book_info.get(key)
        if values:
            blocks.append(f'<p><strong>{label}:</strong> {he(", ".join(values))}</p>')

    # Description: one <p> per blank-line separated paragraph, after a rule
    description = book_info.get("description")
    if description:
        blocks.append("<hr/>")
        blocks.extend(
            f"<p>{he(chunk.strip())}</p>"
            for chunk in description.split("\n\n")
            if chunk.strip()
        )

    # This rule is always emitted, with or without a description above it
    blocks.append("<hr/>")

    for key, label in (("source_url", "Source"), ("updated_date", "Updated")):
        value = book_info.get(key)
        if value:
            blocks.append(f'<p><strong>{label}:</strong> {he(value)}</p>')

    body = "\n".join(blocks)
    safe_title = he(book_title)
    safe_author = he(author)
    return (
        '<?xml version="1.0" encoding="UTF-8"?>\n'
        '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"\n'
        '  "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">\n'
        '<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en">\n'
        '<head>\n'
        '  <meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>\n'
        '  <title>' + safe_title + '</title>\n'
        '  <link rel="stylesheet" type="text/css" href="../Styles/style.css"/>\n'
        '</head>\n'
        '<body>\n'
        '  <h1>' + safe_title + '</h1>\n'
        '  <p class="author">by ' + safe_author + '</p>\n'
        + body + '\n'
        '</body>\n'
        '</html>\n'
    )


def make_epub(
    book_title: str,
    author: str,
    chapters: list[dict],
    cover_data: bytes | None,
    break_img_data: bytes,
    book_id: str,
    book_info: dict | None = None,
) -> bytes:
    """Build a complete EPUB 2.0 in-memory and return the bytes.

    book_title     -- book title (escaped for dc:title, the NCX and the intro page)
    author         -- author name (escaped for dc:creator and the intro page)
    chapters       -- one dict per chapter with keys "xhtml" (complete chapter
                      document) and "title", plus an optional "images" list
                      whose entries carry "epub_path", "data" and "media_type"
    cover_data     -- raw cover image bytes, or None for a book without cover
    break_img_data -- bytes stored as OEBPS/Images/break.png
    book_id        -- unique identifier written to the OPF and the NCX
    book_info      -- optional metadata; keys read here are index_image_name,
                      index_image_data, index_image_mime, genres, subgenres,
                      tags, description, updated_date, source_url, publisher,
                      series, series_index and publication_status

    The stylesheet is read from "static/epub-style.css" relative to the
    current working directory; a missing file raises the usual OSError.
    """
    info = book_info or {}

    # Read the stylesheet with a context manager so the handle is closed
    with open("static/epub-style.css", "r", encoding="utf-8") as css_file:
        css = css_file.read()

    # The intro illustration is only usable when both its name and its bytes
    # are present; otherwise neither the file, the manifest item nor the
    # <img> on the intro page is emitted, so they can never disagree.
    index_image_name = info.get("index_image_name")
    has_index_image = bool(index_image_name and info.get("index_image_data"))
    intro_info = info
    if index_image_name and not has_index_image:
        intro_info = {k: v for k, v in info.items() if k != "index_image_name"}

    safe_book_id = he(book_id)

    buf = io.BytesIO()
    with zipfile.ZipFile(buf, "w", zipfile.ZIP_DEFLATED) as zf:
        # mimetype must be first and uncompressed
        zf.writestr(
            zipfile.ZipInfo("mimetype"),
            "application/epub+zip",
            compress_type=zipfile.ZIP_STORED,
        )

        zf.writestr(
            "META-INF/container.xml",
            """<?xml version="1.0"?>
<container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container">
  <rootfiles>
    <rootfile full-path="OEBPS/content.opf" media-type="application/oebps-package+xml"/>
  </rootfiles>
</container>""",
        )

        zf.writestr("OEBPS/Styles/style.css", css)
        zf.writestr("OEBPS/Images/break.png", break_img_data)

        # Optional intro illustration (e.g. index page image from awesomedude.org)
        if has_index_image:
            zf.writestr(f"OEBPS/Images/{index_image_name}", info["index_image_data"])

        has_cover = cover_data is not None
        cover_filename = ""
        cover_media_type = ""
        if has_cover:
            cover_filename, cover_media_type = detect_image_format(cover_data, "cover")
            zf.writestr(f"OEBPS/Images/{cover_filename}", cover_data)

        zf.writestr("OEBPS/Text/intro.xhtml", make_intro_xhtml(book_title, author, intro_info))

        # Chapter images ("images" may be absent or None)
        chapter_images = [img for ch in chapters for img in (ch.get("images") or [])]
        for img in chapter_images:
            zf.writestr(img["epub_path"], img["data"])

        chapter_files = []
        for i, ch in enumerate(chapters, 1):
            fname = f"chapter{i:03d}.xhtml"
            zf.writestr(f"OEBPS/Text/{fname}", ch["xhtml"])
            chapter_files.append((fname, ch["title"]))

        # Manifest -- every interpolated value is escaped because it ends up
        # inside an XML attribute
        manifest_items = []
        if has_cover:
            manifest_items.append(
                f'<item id="cover-img" href="Images/{cover_filename}" media-type="{cover_media_type}"/>'
            )
        for img in chapter_images:
            img_id = he(img["epub_path"].split("/")[-1].replace(".", "_"))
            # hrefs are relative to the OPF, which lives in OEBPS/
            img_href = he(img["epub_path"].removeprefix("OEBPS/"))
            manifest_items.append(
                f'<item id="{img_id}" href="{img_href}"'
                f' media-type="{he(img["media_type"])}"/>'
            )
        if has_index_image:
            # Fall back to sniffing the bytes when no media type was supplied
            index_mime = (
                info.get("index_image_mime")
                or detect_image_format(info["index_image_data"], "index")[1]
            )
            manifest_items.append(
                f'<item id="intro-img" href="Images/{he(index_image_name)}"'
                f' media-type="{he(index_mime)}"/>'
            )
        manifest_items.append('<item id="break-img" href="Images/break.png" media-type="image/png"/>')
        manifest_items.append('<item id="css" href="Styles/style.css" media-type="text/css"/>')
        manifest_items.append('<item id="intro" href="Text/intro.xhtml" media-type="application/xhtml+xml"/>')
        for i, (fname, _) in enumerate(chapter_files, 1):
            manifest_items.append(f'<item id="ch{i:03d}" href="Text/{fname}" media-type="application/xhtml+xml"/>')
        manifest_items.append('<item id="ncx" href="toc.ncx" media-type="application/x-dtbncx+xml"/>')

        spine_items = ['<itemref idref="intro"/>'] + [
            f'<itemref idref="ch{i:03d}"/>' for i in range(1, len(chapter_files) + 1)
        ]

        cover_meta = '<meta name="cover" content="cover-img"/>' if has_cover else ""

        # "or []" also covers keys that are present but set to None
        subjects = [
            subject
            for key in ("genres", "subgenres", "tags")
            for subject in (info.get(key) or [])
        ]
        subject_items = "".join(
            f"\n    <dc:subject>{he(subject)}</dc:subject>" for subject in subjects
        )
        desc_item = (
            f"\n    <dc:description>{he(info['description'].replace(chr(10), ' '))}</dc:description>"
            if info.get("description") else ""
        )
        date_item = (
            f"\n    <dc:date opf:event=\"modification\">{he(info['updated_date'])}</dc:date>"
            if info.get("updated_date") else ""
        )
        source_item = (
            f"\n    <dc:source>{he(info['source_url'])}</dc:source>"
            if info.get("source_url") else ""
        )
        publisher_item = (
            f"\n    <dc:publisher>{he(info['publisher'])}</dc:publisher>"
            if info.get("publisher") else ""
        )
        series_items = ""
        if info.get("series"):
            s = he(info["series"])
            raw_index = info.get("series_index")
            # A missing, None or empty index defaults to 1
            idx = int(raw_index) if raw_index not in (None, "") else 1
            series_items = (
                f'\n    <meta name="calibre:series" content="{s}"/>'
                f'\n    <meta name="calibre:series_index" content="{idx}"/>'
            )
        status_item = (
            f'\n    <meta name="publication_status" content="{he(info["publication_status"])}"/>'
            if info.get("publication_status") else ""
        )

        opf = f"""<?xml version="1.0" encoding="UTF-8"?>
<package xmlns="http://www.idpf.org/2007/opf" version="2.0" unique-identifier="BookId">
  <metadata xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:opf="http://www.idpf.org/2007/opf">
    <dc:title>{he(book_title)}</dc:title>
    <dc:creator opf:role="aut">{he(author)}</dc:creator>
    <dc:language>en</dc:language>
    <dc:identifier id="BookId">{safe_book_id}</dc:identifier>
    {cover_meta}{subject_items}{desc_item}{date_item}{source_item}{publisher_item}{series_items}{status_item}
  </metadata>
  <manifest>
    {"".join(manifest_items)}
  </manifest>
  <spine toc="ncx">
    {"".join(spine_items)}
  </spine>
</package>"""
        zf.writestr("OEBPS/content.opf", opf)

        # TOC NCX -- the intro page is entry 1, chapters follow from 2
        nav_points = [
            """    <navPoint id="intro" playOrder="1">
      <navLabel><text>Book Info</text></navLabel>
      <content src="Text/intro.xhtml"/>
    </navPoint>"""
        ]
        for i, (fname, title) in enumerate(chapter_files, 1):
            nav_points.append(
                f"""    <navPoint id="ch{i:03d}" playOrder="{i + 1}">
      <navLabel><text>{he(title)}</text></navLabel>
      <content src="Text/{fname}"/>
    </navPoint>"""
            )

        ncx = f"""<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE ncx PUBLIC "-//NISO//DTD ncx 2005-1//EN"
  "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">
<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1">
  <head>
    <meta name="dtb:uid" content="{safe_book_id}"/>
    <meta name="dtb:depth" content="1"/>
    <meta name="dtb:totalPageCount" content="0"/>
    <meta name="dtb:maxPageNumber" content="0"/>
  </head>
  <docTitle><text>{he(book_title)}</text></docTitle>
  <navMap>
{"".join(nav_points)}
  </navMap>
</ncx>"""
        zf.writestr("OEBPS/toc.ncx", ncx)

    return buf.getvalue()


def read_epub_file(epub_path, internal_path: str) -> str:
    """Return the text of one member of the EPUB archive.

    epub_path     -- the EPUB to open; passed straight to zipfile.ZipFile
    internal_path -- member name inside the archive

    The member is decoded as UTF-8; undecodable bytes become U+FFFD.
    """
    with zipfile.ZipFile(epub_path, mode="r") as archive:
        raw = archive.read(internal_path)
    return raw.decode("utf-8", "replace")


def write_epub_file(epub_path, internal_path: str, content: str) -> None:
    """Replace a single file inside the EPUB zip (full zip rewrite).

    epub_path     -- path of the EPUB file; it is overwritten in place
    internal_path -- archive member whose content is replaced
    content       -- new text for that member, stored UTF-8 encoded

    Only an existing member is replaced: when *internal_path* is not present
    in the archive, *content* is not written anywhere.

    If OEBPS/Images/break.png is missing from the zip it is added automatically
    from "static/break.png" (relative to the working directory), so break-image
    inserts made in the editor render correctly in older EPUBs. When that
    source image cannot be read, the archive is written without it.
    """
    with open(epub_path, "rb") as f:
        original = f.read()

    break_img_path = "OEBPS/Images/break.png"
    buf = io.BytesIO()
    with zipfile.ZipFile(io.BytesIO(original), "r") as zin, \
         zipfile.ZipFile(buf, "w", zipfile.ZIP_DEFLATED) as zout:

        # mimetype must stay the first entry and be stored uncompressed
        zout.writestr(
            zipfile.ZipInfo("mimetype"), zin.read("mimetype"),
            compress_type=zipfile.ZIP_STORED,
        )

        has_break = break_img_path in zin.namelist()

        for item in zin.infolist():
            if item.filename == "mimetype":
                continue
            if item.filename == internal_path:
                zout.writestr(item, content.encode("utf-8"))
            else:
                zout.writestr(item, zin.read(item.filename))

        if not has_break:
            # Best effort: only a missing/unreadable source image is ignored,
            # and the handle is closed by the context manager either way.
            try:
                with open("static/break.png", "rb") as break_file:
                    break_data = break_file.read()
            except OSError:
                break_data = None
            if break_data is not None:
                zout.writestr(break_img_path, break_data)

    # Only overwrite the file once the rebuilt archive is complete in memory
    with open(epub_path, "wb") as f:
        f.write(buf.getvalue())


def build_epub(
    title: str,
    author: str,
    publisher: str,
    chapters: list[dict],
) -> bytes:
    """Build an EPUB 2.0 file from builder data and return the raw bytes.

    title     -- book title (escaped for the OPF and the NCX)
    author    -- author name (escaped for the OPF and the NCX)
    publisher -- publisher name; an empty or None value yields an empty
                 dc:publisher element
    chapters  -- one dict per chapter; "title" falls back to a numbered
                 default and "content" (body markup, inserted verbatim)
                 falls back to an empty paragraph

    A fresh UUID is generated as the book identifier and the current UTC time
    is used as dc:date. "static/break.png" (relative to the working
    directory) is bundled as OEBPS/Images/break.png when it exists.
    """
    import uuid as _uuid
    book_id = str(_uuid.uuid4())
    now_str = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

    # Check once so the archive content and the manifest cannot disagree
    break_png_path = Path("static/break.png")
    has_break = break_png_path.exists()

    buf = io.BytesIO()
    with zipfile.ZipFile(buf, "w", zipfile.ZIP_DEFLATED) as z:

        # mimetype -- uncompressed and the first entry
        mi = zipfile.ZipInfo("mimetype")
        mi.compress_type = zipfile.ZIP_STORED
        z.writestr(mi, "application/epub+zip")

        z.writestr(
            "META-INF/container.xml",
            '<?xml version="1.0" encoding="UTF-8"?>\n'
            '<container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container">\n'
            '  <rootfiles>\n'
            '    <rootfile full-path="OEBPS/content.opf"'
            ' media-type="application/oebps-package+xml"/>\n'
            '  </rootfiles>\n'
            '</container>\n',
        )

        style_css = (
            "body { font-family: Georgia, serif; font-size: 1em;"
            " line-height: 1.6; margin: 1em; }\n"
            "p { margin: 0 0 0.8em 0; text-indent: 1.2em; }\n"
            "p:first-child, h1 + p, h2 + p, h3 + p { text-indent: 0; }\n"
            "h1, h2, h3 { font-weight: bold; margin: 1.2em 0 0.4em; }\n"
            "blockquote { margin: 1em 2em; padding: 0.3em 0.8em;"
            " border-left: 3px solid #aaa; }\n"
            "blockquote.author-note { font-style: italic; color: #666;"
            " border-left: 3px solid #555; margin: 1.2em 2em;"
            " padding: 0.4em 1em; font-size: 0.92em; }\n"
            "center img { display: block; margin: 1em auto; }\n"
        )
        z.writestr("OEBPS/Styles/style.css", style_css)

        if has_break:
            z.write(str(break_png_path), "OEBPS/Images/break.png")

        manifest_items: list[str] = []
        spine_idrefs: list[str] = []
        ncx_nav_points: list[str] = []

        for num, ch in enumerate(chapters, 1):
            ch_id = f"chapter_{num:03d}"
            ch_filename = f"OEBPS/Text/{ch_id}.xhtml"
            ch_title = he(ch.get("title") or f"Hoofdstuk {num}")
            ch_content = ch.get("content") or "<p></p>"

            xhtml = (
                '<?xml version="1.0" encoding="UTF-8"?>\n'
                '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"\n'
                '  "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">\n'
                '<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="nl">\n'
                "<head>\n"
                '  <meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>\n'
                f"  <title>{ch_title}</title>\n"
                '  <link rel="stylesheet" type="text/css" href="../Styles/style.css"/>\n'
                "</head>\n"
                "<body>\n"
                f"  <h2 class=\"chapter-title\">{ch_title}</h2>\n"
                f"  {ch_content}\n"
                "</body>\n"
                "</html>\n"
            )
            z.writestr(ch_filename, xhtml)

            manifest_items.append(
                f'    <item id="{ch_id}" href="Text/{ch_id}.xhtml"'
                f' media-type="application/xhtml+xml"/>'
            )
            spine_idrefs.append(f'    <itemref idref="{ch_id}"/>')
            # This NCX has no intro entry, so the first chapter is the first
            # navPoint and playOrder starts at 1.
            ncx_nav_points.append(
                f'  <navPoint id="navPoint-{num}" playOrder="{num}">\n'
                f'    <navLabel><text>{ch_title}</text></navLabel>\n'
                f'    <content src="Text/{ch_id}.xhtml"/>\n'
                f'  </navPoint>'
            )

        safe_title = he(title)
        safe_author = he(author)

        ncx = (
            '<?xml version="1.0" encoding="UTF-8"?>\n'
            '<!DOCTYPE ncx PUBLIC "-//NISO//DTD ncx 2005-1//EN"\n'
            '  "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">\n'
            '<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1">\n'
            "<head>\n"
            f'  <meta name="dtb:uid" content="{book_id}"/>\n'
            '  <meta name="dtb:depth" content="1"/>\n'
            '  <meta name="dtb:totalPageCount" content="0"/>\n'
            '  <meta name="dtb:maxPageNumber" content="0"/>\n'
            "</head>\n"
            f"<docTitle><text>{safe_title}</text></docTitle>\n"
            f"<docAuthor><text>{safe_author}</text></docAuthor>\n"
            "<navMap>\n"
            + "\n".join(ncx_nav_points)
            + "\n</navMap>\n</ncx>\n"
        )
        z.writestr("OEBPS/toc.ncx", ncx)

        opf = (
            '<?xml version="1.0" encoding="UTF-8"?>\n'
            '<package xmlns="http://www.idpf.org/2007/opf" version="2.0"'
            ' unique-identifier="BookId">\n'
            '<metadata xmlns:dc="http://purl.org/dc/elements/1.1/"'
            ' xmlns:opf="http://www.idpf.org/2007/opf">\n'
            f'  <dc:title>{safe_title}</dc:title>\n'
            f'  <dc:creator opf:role="aut">{safe_author}</dc:creator>\n'
            f'  <dc:publisher>{he(publisher or "")}</dc:publisher>\n'
            f'  <dc:identifier id="BookId" opf:scheme="UUID">{book_id}</dc:identifier>\n'
            f'  <dc:date>{now_str}</dc:date>\n'
            '  <dc:language>nl</dc:language>\n'
            "</metadata>\n"
            "<manifest>\n"
            '  <item id="ncx" href="toc.ncx" media-type="application/x-dtbncx+xml"/>\n'
            '  <item id="style" href="Styles/style.css" media-type="text/css"/>\n'
            + (
                '  <item id="break-img" href="Images/break.png" media-type="image/png"/>\n'
                if has_break else ""
            )
            + "\n".join(manifest_items)
            + "\n</manifest>\n"
            '<spine toc="ncx">\n'
            + "\n".join(spine_idrefs)
            + "\n</spine>\n"
            "</package>\n"
        )
        z.writestr("OEBPS/content.opf", opf)

    return buf.getvalue()