# novela/containers/novela/pdf.py

from pathlib import Path

import fitz
from PIL import Image, ImageOps

# Target size of generated cover thumbnails (width, height); passed as the
# size tuple to ImageOps.fit in _webp_thumb_from_image.
COVER_W = 300
COVER_H = 450


def pdf_page_count(path: Path) -> int:
    """Return the number of pages in the document at *path*.

    The document is opened with PyMuPDF and closed again before returning.
    """
    with fitz.open(path) as pdf:
        total_pages = pdf.page_count
    return total_pages


def pdf_render_page(path: Path, page_num: int, dpi: int = 150) -> bytes:
    """Render one page of the document at *path* and return it as PNG bytes.

    Args:
        path: Location of the document to open.
        page_num: Zero-based page index.
        dpi: Requested resolution; converted to a scale factor of ``dpi / 72``.

    Raises:
        IndexError: If *page_num* is negative or not below the page count.
    """
    with fitz.open(path) as pdf:
        if not 0 <= page_num < pdf.page_count:
            raise IndexError("Page out of range")
        # Same factor on both axes so the page keeps its aspect ratio.
        zoom = dpi / 72.0
        scale = fitz.Matrix(zoom, zoom)
        pixmap = pdf.load_page(page_num).get_pixmap(matrix=scale, alpha=False)
        return pixmap.tobytes("png")


def _webp_thumb_from_image(path: Path) -> bytes:
    """Build a COVER_W x COVER_H WebP thumbnail from the image at *path*.

    The image is rotated according to its EXIF orientation, converted to RGB
    unless it is already RGB or RGBA, then cropped and resized to the cover
    dimensions with Lanczos resampling. Returns the encoded WebP bytes.
    """
    from io import BytesIO

    target_size = (COVER_W, COVER_H)
    buffer = BytesIO()
    with Image.open(path) as source:
        upright = ImageOps.exif_transpose(source)
        if upright.mode != "RGB" and upright.mode != "RGBA":
            upright = upright.convert("RGB")
        cover = ImageOps.fit(upright, target_size, method=Image.Resampling.LANCZOS)
        cover.save(buffer, format="WEBP", quality=82, method=6)
    return buffer.getvalue()


def pdf_cover_thumb(path: Path) -> bytes:
    """Return a WebP cover thumbnail rendered from the first page of a PDF.

    The first page is rasterised at 1.5x scale, written to a scratch PNG and
    handed to ``_webp_thumb_from_image`` for cropping and encoding.

    Args:
        path: Location of the PDF to open.

    Returns:
        The encoded WebP thumbnail bytes.

    Raises:
        ValueError: If the document contains no pages.
    """
    import tempfile

    with fitz.open(path) as doc:
        if doc.page_count == 0:
            raise ValueError("PDF has no pages")
        page = doc.load_page(0)
        pix = page.get_pixmap(matrix=fitz.Matrix(1.5, 1.5), alpha=False)
        # Use a private scratch directory instead of a fixed-name file beside
        # the PDF: the library directory may be read-only, a sibling file with
        # that name could be overwritten, and concurrent calls for the same
        # PDF would otherwise race on (and delete) each other's temp file.
        with tempfile.TemporaryDirectory(prefix="novela-cover-") as scratch_dir:
            tmp = Path(scratch_dir) / "cover.png"
            pix.save(tmp)
            # The helper finishes reading the file before it returns, so the
            # directory can be removed as soon as this block exits.
            return _webp_thumb_from_image(tmp)


def pdf_scan_metadata(path: Path) -> dict:
    """Read the embedded metadata of a PDF into a flat dictionary.

    Args:
        path: Location of the PDF to open.

    Returns:
        A dict with the keys ``title``, ``author``, ``publisher``,
        ``description``, ``source_url``, ``series``, ``series_index``,
        ``publication_status``, ``has_cover``, ``subjects`` and
        ``publish_date``. Only the first four are taken from the document's
        metadata; ``has_cover`` is true when the document has at least one
        page, and the remaining keys are returned as empty defaults.
    """
    with fitz.open(path) as doc:
        meta = doc.metadata or {}
        # Strip before applying the fallback: a whitespace-only embedded title
        # is truthy and would otherwise produce an empty title instead of the
        # file name.
        title = (meta.get("title") or "").strip() or (path.stem or "").strip()
        return {
            "title": title,
            "author": (meta.get("author") or "").strip(),
            # NOTE(review): "producer" is presumably the PDF-writing software
            # rather than a book publisher; mapping kept as-is for callers.
            "publisher": (meta.get("producer") or "").strip(),
            "description": (meta.get("subject") or "").strip(),
            "source_url": "",
            "series": "",
            "series_index": 0,
            "publication_status": "",
            "has_cover": doc.page_count > 0,
            "subjects": [],
            "publish_date": "",
        }