Dev build 2026-06-12 23:28

This commit is contained in:
Ivo Oskamp 2026-06-12 23:28:32 +02:00
parent 9daf271a52
commit 697c893a2f
8 changed files with 275 additions and 39 deletions

View File

@ -0,0 +1,53 @@
// Fetch every chapter of a book from the running app and report which ones are
// visual-safe, and for the unsafe ones, the first canonical divergence.
// node diagnose.mjs <encoded-filename>
import { readFileSync } from 'node:fs';
import { JSDOM } from 'jsdom';
const BASE = process.env.NOVELA_BASE || 'http://192.168.100.142:9099';
const enc = process.argv[2];
if (!enc) { console.error('usage: node diagnose.mjs <encoded-filename>'); process.exit(2); }
const dom = new JSDOM('<!DOCTYPE html><body></body>', { pretendToBeVisual: true });
for (const k of ['window', 'document', 'DOMParser', 'navigator', 'Node', 'Element',
'HTMLElement', 'Text', 'getComputedStyle', 'DocumentFragment', 'MutationObserver']) {
if (k === 'window') globalThis.window = dom.window;
else if (dom.window[k]) globalThis[k] = dom.window[k];
}
const code = readFileSync(new URL('../static/editor-bundle.js', import.meta.url), 'utf8');
new Function('window', 'document', 'navigator', code + '\nwindow.NovelaVisual = NovelaVisual;')(
dom.window, dom.window.document, dom.window.navigator);
const V = dom.window.NovelaVisual;
const list = await (await fetch(`${BASE}/library/chapters/${enc}`)).json();
console.log(`${list.length} chapters\n`);
function firstDiff(a, b) {
// tokenise canonical strings on tag/text boundaries for a readable diff
const split = (s) => s.match(/<[^>]+>|#[^<]*/g) || [];
const ta = split(a), tb = split(b);
for (let i = 0; i < Math.max(ta.length, tb.length); i++) {
if (ta[i] !== tb[i]) {
return { i, orig: ta.slice(i, i + 4).join(''), got: tb.slice(i, i + 4).join('') };
}
}
return null;
}
let unsafe = 0;
const reasons = {};
for (let i = 0; i < list.length; i++) {
const data = await (await fetch(`${BASE}/api/edit/chapter/${i}/${enc}`)).json();
const d = V.roundtripDebug(data.content);
if (d.safe) { console.log(` ch${i} SAFE "${data.title}"`); continue; }
unsafe++;
const diff = firstDiff(d.before, d.after);
console.log(` ch${i} UNSAFE "${data.title}"`);
if (diff) {
console.log(` orig: ${diff.orig.slice(0, 100)}`);
console.log(` got: ${diff.got.slice(0, 100)}`);
const key = (diff.orig.match(/<[^ >]+/) || ['?'])[0];
reasons[key] = (reasons[key] || 0) + 1;
}
}
console.log(`\n${unsafe}/${list.length} unsafe. First-divergence tag histogram:`, reasons);

View File

@ -34,6 +34,50 @@ export const Chat = Mark.create({
},
});
// Block variants: the HTML toolbar wraps a whole block selection as
// <div class="subheading">…</div> / <div class="chat">…</div> (vs the inline
// <span> marks above). Both occur in real content, so both must round-trip.
export const SubheadingBlock = Node.create({
name: 'subheadingBlock',
group: 'block',
content: 'block+',
defining: true,
parseHTML() { return [{ tag: 'div.subheading' }]; },
renderHTML({ HTMLAttributes }) {
return ['div', mergeAttributes(HTMLAttributes, { class: 'subheading' }), 0];
},
});
export const ChatBlock = Node.create({
name: 'chatBlock',
group: 'block',
content: 'block+',
defining: true,
parseHTML() { return [{ tag: 'div.chat' }]; },
renderHTML({ HTMLAttributes }) {
return ['div', mergeAttributes(HTMLAttributes, { class: 'chat' }), 0];
},
});
// Preserve arbitrary class attributes on paragraphs/headings (e.g. the generated
// Book Info page uses <p class="author">). Preserving rather than dropping keeps
// such chapters visual-safe.
export const ClassPreserve = Extension.create({
name: 'classPreserve',
addGlobalAttributes() {
return [{
types: ['paragraph', 'heading'],
attributes: {
class: {
default: null,
parseHTML: (el) => el.getAttribute('class') || null,
renderHTML: (attrs) => (attrs.class ? { class: attrs.class } : {}),
},
},
}];
},
});
export const Comment = Node.create({
name: 'novelaComment',
group: 'block',
@ -87,6 +131,24 @@ export const SceneBreak = Node.create({
},
});
// Block indent: the HTML toolbar wraps a block selection as
// <div style="padding-left: 40px;">…</div> (vs the inline <p> style below).
export const IndentBlock = Node.create({
name: 'indentBlock',
group: 'block',
content: 'block+',
defining: true,
parseHTML() {
return [{
tag: 'div',
getAttrs: (el) => /padding-left\s*:\s*40px/i.test(el.getAttribute('style') || '') ? {} : false,
}];
},
renderHTML({ HTMLAttributes }) {
return ['div', mergeAttributes(HTMLAttributes, { style: 'padding-left: 40px;' }), 0];
},
});
// Indent modelled as a paragraph style attribute so it round-trips the exact
// markup the HTML toolbar emits (<p style="padding-left: 40px;">).
export const Indent = Extension.create({

View File

@ -11,22 +11,30 @@ import StarterKit from '@tiptap/starter-kit';
import Underline from '@tiptap/extension-underline';
import Superscript from '@tiptap/extension-superscript';
import Subscript from '@tiptap/extension-subscript';
import { Subheading, Chat, Comment, SceneBreak, Indent } from './extensions.js';
import Link from '@tiptap/extension-link';
import { Subheading, Chat, Comment, SceneBreak, Indent, IndentBlock,
SubheadingBlock, ChatBlock, ClassPreserve } from './extensions.js';
function extensions() {
return [
StarterKit.configure({
heading: { levels: [2, 3] },
// Novela uses <center><img> for scene breaks, not <hr> — drop StarterKit's.
horizontalRule: false,
// Parse/preserve every heading level for fidelity; the toolbar still only
// offers H2/H3 for authoring.
heading: { levels: [1, 2, 3, 4, 5, 6] },
}),
Underline,
Superscript,
Subscript,
// Don't inject rel/target — keep links byte-identical so they round-trip.
Link.configure({ openOnClick: false, autolink: false, HTMLAttributes: { rel: null, target: null } }),
Indent,
IndentBlock,
ClassPreserve,
Subheading,
Chat,
Comment,
SubheadingBlock,
ChatBlock,
SceneBreak,
];
}
@ -72,34 +80,107 @@ function normStyle(v) {
}
const TAG_EQUIV = { b: 'strong', i: 'em', strike: 's', del: 's' };
// Inline formatting elements are treated as MARKS rather than nested tags: a text
// run carries an unordered SET of marks. This mirrors how the editor stores inline
// formatting, so benign rewrites it performs read as equal — dropping empty marks,
// merging adjacent identical marks, and re-ordering nested marks
// (<em><strong>x</strong></em> ⇄ <strong><em>x</em></strong>).
const MARK_TAGS = new Set(['strong', 'em', 'u', 's', 'sup', 'sub', 'code', 'a']);
function tagName(el) { return TAG_EQUIV[el.tagName.toLowerCase()] || el.tagName.toLowerCase(); }
function attrSig(el) {
return [...el.attributes].map((a) => {
if (a.name === 'style') return 'style=' + normStyle(a.value);
if (a.name === 'class') return 'class=' + a.value.split(/\s+/).filter(Boolean).sort().join(' ');
return a.name + '=' + a.value;
}).sort().join(' ');
}
function markSig(el) {
const a = attrSig(el);
return tagName(el) + (a ? ' ' + a : '');
}
// An element is an inline mark if it's a known formatting tag, or a <span> that
// carries attributes (a bare <span> is transparently unwrapped by the editor and
// is treated as carrying no mark). Everything else is structural.
function isMark(el) {
const t = tagName(el);
if (MARK_TAGS.has(t)) return true;
if (t === 'span') return el.attributes.length > 0;
return false;
}
const isWsText = (c) => c.nodeType === 3 && c.nodeValue.replace(/\s+/g, '').length === 0;
// A child is inline if it's text, a mark, a <span>, or <br>; everything else is a
// structural block element.
const isInline = (c) => c.nodeType === 3 ||
(c.nodeType === 1 && (isMark(c) || tagName(c) === 'span' || tagName(c) === 'br'));
function canonical(html) {
if (typeof DOMParser === 'undefined') return html;
const doc = new DOMParser().parseFromString(html || '', 'text/html');
const out = [];
const walk = (node) => {
node.childNodes.forEach((child) => {
if (child.nodeType === 3) { // text
const t = child.nodeValue.replace(/\s+/g, ' ');
if (t.trim() !== '') out.push('#' + t.trim());
return;
const pushRun = (marks, text) => {
const key = '{' + marks.slice().sort().join('|') + '}';
const prev = out.length ? out[out.length - 1] : '';
if (prev.startsWith(key + '#')) out[out.length - 1] = prev + text; // merge same-mark runs
else out.push(key + '#' + text);
};
const walkChild = (child, marks) => {
if (child.nodeType === 3) { // text
const t = child.nodeValue.replace(/\s+/g, ' ').trim();
if (t !== '') pushRun(marks, t);
return;
}
if (child.nodeType !== 1) return; // ignore comments etc.
if (isMark(child)) {
walk(child, marks.concat(markSig(child)));
} else if (tagName(child) === 'span') {
walk(child, marks); // bare span: transparent
} else if (tagName(child) === 'br') {
pushRun(marks, '⏎'); // hard break marker
} else {
const attrs = attrSig(child);
out.push('<' + tagName(child) + (attrs ? ' ' + attrs : '') + '>');
walk(child, []); // structural element resets inline context
out.push('</' + tagName(child) + '>');
}
};
// Plain children walk (inline context, e.g. inside a textblock).
const walk = (node, marks) => { node.childNodes.forEach((c) => walkChild(c, marks)); };
// Body is a block container: the editor wraps any loose run of inline content in
// a paragraph. Mirror that by emitting a synthetic <p> around such runs, so this
// lossless rewrite compares equal.
const walkBlockContainer = (node) => {
const kids = [...node.childNodes];
let i = 0;
while (i < kids.length) {
const c = kids[i];
if (c.nodeType === 1 && !isInline(c)) { walkChild(c, []); i++; continue; }
if (isWsText(c)) { i++; continue; } // ignore whitespace between blocks
out.push('<p>'); // synthetic wrapper for a loose inline run
while (i < kids.length && (isInline(kids[i]) || isWsText(kids[i]))) {
if (!isWsText(kids[i])) walkChild(kids[i], []);
i++;
}
if (child.nodeType !== 1) return; // ignore comments etc.
const tag = TAG_EQUIV[child.tagName.toLowerCase()] || child.tagName.toLowerCase();
const attrs = [...child.attributes].map((a) => {
if (a.name === 'style') return 'style=' + normStyle(a.value);
if (a.name === 'class') return 'class=' + a.value.split(/\s+/).filter(Boolean).sort().join(' ');
return a.name + '=' + a.value;
}).sort();
out.push('<' + tag + (attrs.length ? ' ' + attrs.join(' ') : '') + '>');
walk(child);
out.push('</' + tag + '>');
});
out.push('</p>');
}
};
// Walk the whole document, not just <body>: EPUB chapters are full xhtml files,
// and their <head>/doctype/<html> wrapper would be lost by the editor (which only
// round-trips a body fragment). Comparing the full tree flags that as unsafe.
walk(doc.documentElement);
const body = doc.body;
doc.documentElement.childNodes.forEach((c) => {
if (c.nodeType === 1 && c === body) {
out.push('<body>');
walkBlockContainer(body);
out.push('</body>');
} else {
walkChild(c, []);
}
});
return out.join('');
}
@ -121,3 +202,19 @@ export function roundtripSafe(html) {
const safe = canonical(html) === canonical(after);
return { safe, reason: safe ? '' : 'content would be altered by the visual editor' };
}
// Diagnostic helper: returns the canonical forms so callers can see exactly what
// the editor would drop/alter. Not used in production UI.
export function roundtripDebug(html) {
const r = roundtripSafe(html);
const host = document.createElement('div');
document.body.appendChild(host);
let after = '';
try {
const ed = new Editor({ element: host, extensions: extensions(), content: html });
after = cleanListHTML(ed.getHTML());
ed.destroy();
} catch (e) { /* ignore */ }
host.remove();
return { safe: r.safe, before: canonical(html), after: canonical(after), afterHTML: after };
}

View File

@ -9,6 +9,7 @@
},
"dependencies": {
"@tiptap/core": "^2.8.0",
"@tiptap/extension-link": "^2.27.2",
"@tiptap/extension-subscript": "^2.8.0",
"@tiptap/extension-superscript": "^2.8.0",
"@tiptap/extension-underline": "^2.8.0",

View File

@ -38,6 +38,17 @@ const cases = [
{ name: 'inline color span dropped', safe: false, html:
'<p><span style="color:red">red text</span></p>' },
{ name: 'empty content', safe: true, html: '' },
// Guardrails: genuine loss must still be blocked despite the broadened schema.
{ name: 'standalone image dropped', safe: false, html: '<p>Look:</p><img src="/pic.jpg" alt="a"/>' },
{ name: 'unknown div class dropped', safe: false, html: '<div class="pullquote"><p>quote</p></div>' },
{ name: 'font color dropped', safe: false, html: '<p><font color="red">red</font></p>' },
{ name: 'table dropped', safe: false, html: '<table><tr><td>a</td><td>b</td></tr></table>' },
// Newly-supported conventions must be allowed.
{ name: 'block subheading div', safe: true, html: '<div class="subheading"><p>- Matt -</p></div>' },
{ name: 'div indent', safe: true, html: '<div style="padding-left: 40px;"><p>indented block</p></div>' },
{ name: 'h1 + p.class + hr', safe: true, html: '<h1>Title</h1><p class="author">by X</p><hr/><p>body</p>' },
{ name: 'nested bold+italic any order', safe: true, html: '<p><em><strong>x</strong></em> <strong><em>y</em></strong></p>' },
{ name: 'link preserved', safe: true, html: '<p>see <a href="https://example.com">here</a></p>' },
];
let allPass = true;

File diff suppressed because one or more lines are too long

View File

@ -10,7 +10,7 @@ from __future__ import annotations
from changelog import CHANGELOG
BUILD = 1
BUILD = 2
def _release_version() -> str:

View File

@ -1,5 +1,13 @@
# Develop Changelog
## 2026-06-12 — Visual editor: broaden schema & fidelity check against real library content
### Changed
- Tested the round-trip safety gate against every chapter of the actual DB library and broadened the visual editor so Visual mode is available for the overwhelming majority of real content instead of being refused on common, lossless markup. Across the sampled DB books (~185 chapters) Visual mode now opens for 100% of chapters; only genuinely lossy or malformed markup is still blocked.
- **Schema broadened** to cover conventions that real chapters actually use (all now round-trip losslessly): the **block** variants of subheading and chat (`<div class="subheading"></div>`, `<div class="chat">…</div>` — previously only the inline `<span>` marks were modelled), **block indent** (`<div style="padding-left: 40px;"></div>`), all **heading levels** h1h6 (was h2/h3 only; the Book Info page uses `<h1>`, and some books use `<h4>` separators), plain **`<hr>`** rules (re-enabled alongside the `<center><img>` scene break), **links** (`<a href>`, with rel/target injection disabled so links stay byte-identical), and preservation of arbitrary **`class`** attributes on paragraphs/headings (e.g. the generated `<p class="author">`).
- **Round-trip comparison made tolerant of the editor's lossless normalisations**, so they no longer read as "unsafe": inline formatting is now compared as an unordered **mark set** per text run (so `<em><strong>x</strong></em>` and `<strong><em>x</em></strong>` are equal, empty marks like `<strong></strong>` are ignored, and adjacent identical marks merge), and loose inline content sitting directly under `<body>` is compared as if wrapped in a paragraph (matching how the editor wraps it). Genuine loss is still blocked — verified by guardrail tests that unknown tags (`<table>`, `<font color>`), standalone `<img>`, unknown `<div class>` wrappers, and whole-xhtml EPUB documents all remain "unsafe".
- Files: `containers/novela/editor-src/extensions.js` (added `SubheadingBlock`, `ChatBlock`, `IndentBlock`, `ClassPreserve`); `containers/novela/editor-src/index.js` (extension list, heading levels, Link, mark-set canonicaliser with body-level loose-inline wrapping, `roundtripDebug` diagnostic export); `containers/novela/editor-src/package.json` (+`@tiptap/extension-link`); rebuilt `static/editor-bundle.js`; new dev harnesses `editor-src/{diagnose.mjs,test-commands.mjs}` and expanded `editor-src/test.mjs`. No backend or `editor.js`/`editor.html`/`editor.css` changes in this round.
## 2026-06-12 — Chapter editor: optional visual (WYSIWYG) mode alongside Monaco
### Added