novela/containers/novela/pdf.py

69 lines
2.1 KiB
Python

from pathlib import Path
import fitz
from PIL import Image, ImageOps
# Target size of generated cover thumbnails, passed as (width, height) to
# ImageOps.fit in _webp_thumb_from_image.
COVER_W = 300
COVER_H = 450
def pdf_page_count(path: Path) -> int:
    """Return the number of pages in the PDF stored at *path*.

    The document is opened only for the duration of the call and is closed
    again before the count is handed back to the caller.
    """
    with fitz.open(path) as pdf:
        total_pages = pdf.page_count
    return total_pages
def pdf_render_page(path: Path, page_num: int, dpi: int = 150) -> bytes:
    """Rasterize a single PDF page and return it as PNG-encoded bytes.

    Args:
        path: Location of the PDF file.
        page_num: Zero-based index of the page to render.
        dpi: Requested resolution; the page is scaled by ``dpi / 72`` on
            both axes.

    Returns:
        The rendered page as PNG data, without an alpha channel.

    Raises:
        IndexError: If *page_num* is negative or not less than the
            document's page count.
    """
    with fitz.open(path) as pdf:
        if page_num < 0 or pdf.page_count <= page_num:
            raise IndexError("Page out of range")
        target_page = pdf.load_page(page_num)
        zoom = dpi / 72.0
        pixmap = target_page.get_pixmap(
            matrix=fitz.Matrix(zoom, zoom),
            alpha=False,
        )
        return pixmap.tobytes("png")
def _webp_thumb_from_image(path: Path) -> bytes:
    """Turn the image file at *path* into a WebP cover thumbnail.

    The image is first rotated according to its EXIF orientation, converted
    to RGB unless it is already RGB or RGBA, then cropped and resized to
    exactly ``COVER_W`` x ``COVER_H`` with Lanczos resampling.

    Returns:
        The encoded WebP data (quality 82, encoder method 6).
    """
    from io import BytesIO

    with Image.open(path) as source:
        oriented = ImageOps.exif_transpose(source)
        if oriented.mode not in ("RGB", "RGBA"):
            oriented = oriented.convert("RGB")
        cover = ImageOps.fit(
            oriented,
            (COVER_W, COVER_H),
            method=Image.Resampling.LANCZOS,
        )
        buffer = BytesIO()
        cover.save(buffer, format="WEBP", quality=82, method=6)
        return buffer.getvalue()
def pdf_cover_thumb(path: Path) -> bytes:
    """Render the first page of a PDF as a WebP cover thumbnail.

    Args:
        path: Location of the PDF file.

    Returns:
        WebP-encoded thumbnail bytes produced by ``_webp_thumb_from_image``.

    Raises:
        ValueError: If the document contains no pages.
    """
    import tempfile

    with fitz.open(path) as doc:
        if doc.page_count == 0:
            raise ValueError("PDF has no pages")
        page = doc.load_page(0)
        pix = page.get_pixmap(matrix=fitz.Matrix(1.5, 1.5), alpha=False)

        # Write the intermediate PNG into a private temporary directory
        # rather than next to the source PDF: the PDF's directory may be
        # read-only, and a fixed sibling filename lets concurrent calls for
        # the same PDF overwrite or delete each other's scratch file. The
        # directory and its contents are removed on exit, even on error.
        with tempfile.TemporaryDirectory(prefix="novela-cover-") as tmp_dir:
            tmp = Path(tmp_dir) / "cover.png"
            pix.save(tmp)
            return _webp_thumb_from_image(tmp)
def pdf_scan_metadata(path: Path) -> dict:
    """Collect book metadata from a PDF's embedded document information.

    Args:
        path: Location of the PDF file.

    Returns:
        A dict with the keys ``title``, ``author``, ``publisher``,
        ``description``, ``source_url``, ``series``, ``series_index``,
        ``publication_status``, ``has_cover``, ``subjects`` and
        ``publish_date``. Only the first four and ``has_cover`` are derived
        from the file; the remaining keys are returned with empty defaults.
    """
    with fitz.open(path) as doc:
        meta = doc.metadata or {}

        def _field(key: str) -> str:
            # Missing and None entries both collapse to "".
            return (meta.get(key) or "").strip()

        return {
            # Strip before falling back so a whitespace-only embedded title
            # yields the file stem instead of an empty string.
            "title": _field("title") or path.stem.strip(),
            "author": _field("author"),
            # NOTE(review): "producer" usually names the software that wrote
            # the PDF rather than a publisher; confirm this mapping is intended.
            "publisher": _field("producer"),
            "description": _field("subject"),
            "source_url": "",
            "series": "",
            "series_index": 0,
            "publication_status": "",
            # Any PDF with at least one page can have a cover rendered from it.
            "has_cover": doc.page_count > 0,
            "subjects": [],
            "publish_date": "",
        }