diff --git a/src/core/annotation.js b/src/core/annotation.js index ebafeb92816148..ba6602b3f5e003 100644 --- a/src/core/annotation.js +++ b/src/core/annotation.js @@ -1639,7 +1639,7 @@ class MarkupAnnotation extends Annotation { } static async createNewAnnotation(xref, annotation, dependencies, params) { - const annotationRef = annotation.ref || xref.getNewTemporaryRef(); + const annotationRef = (annotation.ref ||= xref.getNewTemporaryRef()); const ap = await this.createNewAppearanceStream(annotation, xref, params); const buffer = []; let annotationDict; @@ -1652,6 +1652,9 @@ class MarkupAnnotation extends Annotation { } else { annotationDict = this.createNewDict(annotation, xref, {}); } + if (Number.isInteger(annotation.parentTreeId)) { + annotationDict.set("StructParent", annotation.parentTreeId); + } buffer.length = 0; await writeObject(annotationRef, annotationDict, buffer, xref); diff --git a/src/core/catalog.js b/src/core/catalog.js index f52357d6ae229f..8f354775c1c9a3 100644 --- a/src/core/catalog.js +++ b/src/core/catalog.js @@ -84,6 +84,10 @@ class Catalog { this.systemFontCache = new Map(); } + cloneDict() { + return this._catDict.clone(); + } + get version() { const version = this._catDict.get("Version"); if (version instanceof Name) { @@ -245,11 +249,13 @@ class Catalog { * @private */ _readStructTreeRoot() { - const obj = this._catDict.get("StructTreeRoot"); + const rawObj = this._catDict.getRaw("StructTreeRoot"); + const obj = this.xref.fetchIfRef(rawObj); if (!(obj instanceof Dict)) { return null; } - const root = new StructTreeRoot(obj); + + const root = new StructTreeRoot(obj, rawObj); root.init(); return root; } diff --git a/src/core/pdf_manager.js b/src/core/pdf_manager.js index ff3d3bf7544650..d57f4bc3e5f8b5 100644 --- a/src/core/pdf_manager.js +++ b/src/core/pdf_manager.js @@ -64,6 +64,10 @@ class BasePdfManager { return this._docBaseUrl; } + get catalog() { + return this.pdfDocument.catalog; + } + ensureDoc(prop, args) { return 
this.ensure(this.pdfDocument, prop, args); } diff --git a/src/core/primitives.js b/src/core/primitives.js index 30f81407b99961..0ada1849e08dbd 100644 --- a/src/core/primitives.js +++ b/src/core/primitives.js @@ -262,6 +262,14 @@ class Dict { return mergedDict.size > 0 ? mergedDict : Dict.empty; } + + clone() { + const dict = new Dict(this.xref); + for (const key of this.getKeys()) { + dict.set(key, this.getRaw(key)); + } + return dict; + } } class Ref { diff --git a/src/core/struct_tree.js b/src/core/struct_tree.js index eeebd119455c43..4ac5d8f57c1d41 100644 --- a/src/core/struct_tree.js +++ b/src/core/struct_tree.js @@ -16,6 +16,7 @@ import { AnnotationPrefix, stringToPDFString, warn } from "../shared/util.js"; import { Dict, isName, Name, Ref, RefSetCache } from "./primitives.js"; import { NumberTree } from "./name_number_tree.js"; +import { writeObject } from "./writer.js"; const MAX_DEPTH = 40; @@ -28,8 +29,9 @@ const StructElementType = { }; class StructTreeRoot { - constructor(rootDict) { + constructor(rootDict, rootRef) { this.dict = rootDict; + this.ref = rootRef instanceof Ref ? rootRef : null; this.roleMap = new Map(); this.structParentIds = null; } @@ -67,6 +69,419 @@ class StructTreeRoot { this.roleMap.set(key, value.name); }); } + + static async canCreateStructureTree({ + catalogRef, + pdfManager, + newAnnotationsByPage, + }) { + if (!(catalogRef instanceof Ref)) { + warn("Cannot save the struct tree: no catalog reference."); + return false; + } + + let nextKey = 0; + let hasNothingToUpdate = true; + + for (const [pageIndex, elements] of newAnnotationsByPage) { + const { ref: pageRef } = await pdfManager.getPage(pageIndex); + if (!(pageRef instanceof Ref)) { + warn(`Cannot save the struct tree: page ${pageIndex} has no ref.`); + hasNothingToUpdate = true; + break; + } + for (const element of elements) { + if (element.accessibilityData?.type) { + // Each tag must have a structure type. 
+ element.parentTreeId = nextKey++; + hasNothingToUpdate = false; + } + } + } + + if (hasNothingToUpdate) { + for (const elements of newAnnotationsByPage.values()) { + for (const element of elements) { + delete element.parentTreeId; + } + } + return false; + } + + return true; + } + + static async createStructureTree({ + newAnnotationsByPage, + xref, + catalogRef, + pdfManager, + newRefs, + }) { + const root = pdfManager.catalog.cloneDict(); + const structTreeRootRef = xref.getNewTemporaryRef(); + root.set("StructTreeRoot", structTreeRootRef); + + const buffer = []; + await writeObject(catalogRef, root, buffer, xref); + newRefs.push({ ref: catalogRef, data: buffer.join("") }); + + const structTreeRoot = new Dict(xref); + structTreeRoot.set("Type", Name.get("StructTreeRoot")); + const parentTreeRef = xref.getNewTemporaryRef(); + structTreeRoot.set("ParentTree", parentTreeRef); + const kids = []; + structTreeRoot.set("K", kids); + + const parentTree = new Dict(xref); + const nums = []; + parentTree.set("Nums", nums); + + const nextKey = await this.#writeKids({ + newAnnotationsByPage, + structTreeRootRef, + kids, + nums, + xref, + pdfManager, + newRefs, + buffer, + }); + structTreeRoot.set("ParentTreeNextKey", nextKey); + + buffer.length = 0; + await writeObject(parentTreeRef, parentTree, buffer, xref); + newRefs.push({ ref: parentTreeRef, data: buffer.join("") }); + + buffer.length = 0; + await writeObject(structTreeRootRef, structTreeRoot, buffer, xref); + newRefs.push({ ref: structTreeRootRef, data: buffer.join("") }); + } + + async canUpdateStructTree({ pdfManager, newAnnotationsByPage }) { + if (!this.ref) { + warn("Cannot update the struct tree: no root reference."); + return false; + } + + let nextKey = this.dict.get("ParentTreeNextKey"); + if (!Number.isInteger(nextKey) || nextKey < 0) { + warn("Cannot update the struct tree: invalid next key."); + return false; + } + + const parentTree = this.dict.get("ParentTree"); + if (!(parentTree instanceof Dict)) { + 
warn("Cannot update the struct tree: ParentTree isn't a dict."); + return false; + } + const nums = parentTree.get("Nums"); + if (!Array.isArray(nums)) { + warn("Cannot update the struct tree: nums isn't an array."); + return false; + } + + const { numPages } = pdfManager.catalog; + for (const pageIndex of newAnnotationsByPage.keys()) { + const { pageDict, ref: pageRef } = await pdfManager.getPage(pageIndex); + if (!(pageRef instanceof Ref)) { + warn(`Cannot save the struct tree: page ${pageIndex} has no ref.`); + return false; + } + const id = pageDict.get("StructParents"); + if (!Number.isInteger(id) || id < 0 || id >= numPages) { + warn(`Cannot save the struct tree: page ${pageIndex} has no id.`); + return false; + } + } + + let hasNothingToUpdate = true; + for (const [pageIndex, elements] of newAnnotationsByPage) { + const { pageDict } = await pdfManager.getPage(pageIndex); + StructTreeRoot.#collectParents({ + elements, + xref: this.dict.xref, + pageDict, + parentTree, + }); + + for (const element of elements) { + if (element.accessibilityData?.type) { + // Each tag must have a structure type. 
+ element.parentTreeId = nextKey++; + hasNothingToUpdate = false; + } + } + } + + if (hasNothingToUpdate) { + for (const elements of newAnnotationsByPage.values()) { + for (const element of elements) { + delete element.parentTreeId; + delete element.structTreeParent; + } + } + return false; + } + + return true; + } + + async updateStructureTree({ newAnnotationsByPage, pdfManager, newRefs }) { + const xref = this.dict.xref; + const structTreeRoot = this.dict.clone(); + const structTreeRootRef = this.ref; + + let parentTreeRef = structTreeRoot.getRaw("ParentTree"); + let parentTree; + if (!(parentTreeRef instanceof Ref)) { + parentTree = parentTreeRef; + parentTreeRef = xref.getNewTemporaryRef(); + structTreeRoot.set("ParentTree", parentTreeRef); + } else { + parentTree = xref.fetchIfRef(parentTreeRef); + } + parentTree = parentTree.clone(); + + let nums = parentTree.getRaw("Nums"); + let numsRef = null; + if (nums instanceof Ref) { + numsRef = nums; + nums = xref.fetch(numsRef); + } + nums = nums.slice(); + if (!numsRef) { + parentTree.set("Nums", nums); + } + + let kids = structTreeRoot.getRaw("K"); + let kidsRef = null; + if (!(kids instanceof Ref)) { + kidsRef = xref.getNewTemporaryRef(); + structTreeRoot.set("K", kidsRef); + } else { + kidsRef = kids; + kids = xref.fetch(kidsRef); + } + kids = Array.isArray(kids) ? 
kids.slice() : [kids]; + + const buffer = []; + const newNextkey = await StructTreeRoot.#writeKids({ + newAnnotationsByPage, + structTreeRootRef, + kids, + nums, + xref, + pdfManager, + newRefs, + buffer, + }); + structTreeRoot.set("ParentTreeNextKey", newNextkey); + + buffer.length = 0; + await writeObject(kidsRef, kids, buffer, xref); + newRefs.push({ ref: kidsRef, data: buffer.join("") }); + + if (numsRef) { + buffer.length = 0; + await writeObject(numsRef, nums, buffer, xref); + newRefs.push({ ref: numsRef, data: buffer.join("") }); + } + + buffer.length = 0; + await writeObject(parentTreeRef, parentTree, buffer, xref); + newRefs.push({ ref: parentTreeRef, data: buffer.join("") }); + + buffer.length = 0; + await writeObject(structTreeRootRef, structTreeRoot, buffer, xref); + newRefs.push({ ref: structTreeRootRef, data: buffer.join("") }); + } + + static async #writeKids({ + newAnnotationsByPage, + structTreeRootRef, + kids, + nums, + xref, + pdfManager, + newRefs, + buffer, + }) { + const objr = Name.get("OBJR"); + let nextKey = -Infinity; + + for (const [pageIndex, elements] of newAnnotationsByPage) { + const { ref: pageRef } = await pdfManager.getPage(pageIndex); + for (const { + accessibilityData: { type, title, lang, alt, expanded, actualText }, + ref, + parentTreeId, + structTreeParent, + } of elements.filter(el => el.accessibilityData?.type)) { + nextKey = Math.max(nextKey, parentTreeId); + + const tagRef = xref.getNewTemporaryRef(); + const tagDict = new Dict(xref); + + // The structure type is required.
+ tagDict.set("S", Name.get(type)); + + if (title) { + tagDict.set("T", title); + } + if (lang) { + tagDict.set("Lang", lang); + } + if (alt) { + tagDict.set("Alt", alt); + } + if (expanded) { + tagDict.set("E", expanded); + } + if (actualText) { + tagDict.set("ActualText", actualText); + } + + if (structTreeParent) { + await this.#updateParentTag({ + structTreeParent, + tagDict, + newTagRef: tagRef, + fallbackRef: structTreeRootRef, + xref, + newRefs, + buffer, + }); + } else { + tagDict.set("P", structTreeRootRef); + } + + const objDict = new Dict(xref); + tagDict.set("K", objDict); + objDict.set("Type", objr); + objDict.set("Pg", pageRef); + objDict.set("Obj", ref); + + buffer.length = 0; + await writeObject(tagRef, tagDict, buffer, xref); + newRefs.push({ ref: tagRef, data: buffer.join("") }); + + nums.push(parentTreeId, tagRef); + kids.push(tagRef); + } + } + return nextKey + 1; + } + + static #collectParents({ elements, xref, pageDict, parentTree }) { + const idToElement = new Map(); + for (const element of elements) { + if (element.structTreeParentId) { + const id = parseInt(element.structTreeParentId.split("_mc")[1], 10); + idToElement.set(id, element); + } + } + + const id = pageDict.get("StructParents"); + const numberTree = new NumberTree(parentTree, xref); + const parentArray = numberTree.get(id); + if (!Array.isArray(parentArray)) { + return; + } + const updateElement = (kid, pageKid, kidRef) => { + const element = idToElement.get(kid); + if (element) { + const parentRef = pageKid.getRaw("P"); + const parentDict = xref.fetchIfRef(parentRef); + if (parentRef instanceof Ref && parentDict instanceof Dict) { + // It should always be the case, but we check just in case.
+ element.structTreeParent = { ref: kidRef, dict: pageKid }; + } + return true; + } + return false; + }; + for (const kidRef of parentArray) { + if (!(kidRef instanceof Ref)) { + continue; + } + const pageKid = xref.fetch(kidRef); + const k = pageKid.get("K"); + if (Number.isInteger(k)) { + updateElement(k, pageKid, kidRef); + continue; + } + + if (!Array.isArray(k)) { + continue; + } + for (let kid of k) { + kid = xref.fetchIfRef(kid); + if (Number.isInteger(kid) && updateElement(kid, pageKid, kidRef)) { + break; + } + } + } + } + + static async #updateParentTag({ + structTreeParent: { ref, dict }, + tagDict, + newTagRef, + fallbackRef, + xref, + newRefs, + buffer, + }) { + // We get the parent of the tag. + const parentRef = dict.getRaw("P"); + let parentDict = xref.fetchIfRef(parentRef); + + tagDict.set("P", parentRef); + + // We get the kids in order to insert a new tag at the right position. + let saveParentDict = false; + let parentKids; + let parentKidsRef = parentDict.getRaw("K"); + if (!(parentKidsRef instanceof Ref)) { + parentKids = parentKidsRef; + parentKidsRef = xref.getNewTemporaryRef(); + parentDict = parentDict.clone(); + parentDict.set("K", parentKidsRef); + saveParentDict = true; + } else { + parentKids = xref.fetch(parentKidsRef); + } + + if (Array.isArray(parentKids)) { + const index = parentKids.indexOf(ref); + if (index >= 0) { + parentKids = parentKids.slice(); + parentKids.splice(index + 1, 0, newTagRef); + } else { + warn("Cannot update the struct tree: parent kid not found."); + tagDict.set("P", fallbackRef); + return; + } + } else if (parentKids instanceof Dict) { + parentKids = [parentKidsRef, newTagRef]; + parentKidsRef = xref.getNewTemporaryRef(); + parentDict.set("K", parentKidsRef); + saveParentDict = true; + } + + buffer.length = 0; + await writeObject(parentKidsRef, parentKids, buffer, xref); + newRefs.push({ ref: parentKidsRef, data: buffer.join("") }); + + if (!saveParentDict) { + return; + } + + buffer.length = 0; + await
writeObject(parentRef, parentDict, buffer, xref); + newRefs.push({ ref: parentRef, data: buffer.join("") }); + } } /** diff --git a/src/core/worker.js b/src/core/worker.js index d03c2b95770c25..745720486130e3 100644 --- a/src/core/worker.js +++ b/src/core/worker.js @@ -42,6 +42,7 @@ import { clearGlobalCaches } from "./cleanup_helper.js"; import { incrementalUpdate } from "./writer.js"; import { MessageHandler } from "../shared/message_handler.js"; import { PDFWorkerStream } from "./worker_stream.js"; +import { StructTreeRoot } from "./struct_tree.js"; class WorkerTask { constructor(name) { @@ -542,24 +543,54 @@ class WorkerMessageHandler { pdfManager.ensureDoc("startXRef"), pdfManager.ensureDoc("xref"), pdfManager.ensureDoc("linearization"), + pdfManager.ensureCatalog("structTreeRoot"), ]; const promises = []; const newAnnotationsByPage = !isPureXfa ? getNewAnnotationsMap(annotationStorage) : null; - const [stream, acroForm, acroFormRef, startXRef, xref, linearization] = - await Promise.all(globalPromises); + const [ + stream, + acroForm, + acroFormRef, + startXRef, + xref, + linearization, + _structTreeRoot, + ] = await Promise.all(globalPromises); + const catalogRef = xref.trailer.getRaw("Root") || null; + let structTreeRoot; if (newAnnotationsByPage) { + if (!_structTreeRoot) { + if ( + await StructTreeRoot.canCreateStructureTree({ + catalogRef, + pdfManager, + newAnnotationsByPage, + }) + ) { + structTreeRoot = null; + } + } else if ( + await _structTreeRoot.canUpdateStructTree({ + pdfManager, + newAnnotationsByPage, + }) + ) { + structTreeRoot = _structTreeRoot; + } + const imagePromises = AnnotationFactory.generateImages( annotationStorage.values(), xref, pdfManager.evaluatorOptions.isOffscreenCanvasSupported ); - + const newAnnotationPromises = + structTreeRoot === undefined ? 
promises : []; for (const [pageIndex, annotations] of newAnnotationsByPage) { - promises.push( + newAnnotationPromises.push( pdfManager.getPage(pageIndex).then(page => { const task = new WorkerTask(`Save (editor): page ${pageIndex}`); return page @@ -570,6 +601,32 @@ class WorkerMessageHandler { }) ); } + if (structTreeRoot === null) { + // No structTreeRoot exists, so we need to create one. + promises.push( + Promise.all(newAnnotationPromises).then(async newRefs => { + await StructTreeRoot.createStructureTree({ + newAnnotationsByPage, + xref, + catalogRef, + pdfManager, + newRefs, + }); + return newRefs; + }) + ); + } else if (structTreeRoot) { + promises.push( + Promise.all(newAnnotationPromises).then(async newRefs => { + await structTreeRoot.updateStructureTree({ + newAnnotationsByPage, + pdfManager, + newRefs, + }); + return newRefs; + }) + ); + } } if (isPureXfa) { @@ -643,7 +700,7 @@ class WorkerMessageHandler { } newXrefInfo = { - rootRef: xref.trailer.getRaw("Root") || null, + rootRef: catalogRef, encryptRef: xref.trailer.getRaw("Encrypt") || null, newRef: xref.getNewTemporaryRef(), infoRef: xref.trailer.getRaw("Info") || null, diff --git a/src/core/writer.js b/src/core/writer.js index e5193c141dbf05..26db83cd4fc0db 100644 --- a/src/core/writer.js +++ b/src/core/writer.js @@ -32,6 +32,8 @@ async function writeObject(ref, obj, buffer, { encrypt = null }) { await writeDict(obj, buffer, transform); } else if (obj instanceof BaseStream) { await writeStream(obj, buffer, transform); + } else if (Array.isArray(obj)) { + await writeArray(obj, buffer, transform); } buffer.push("\nendobj\n"); } @@ -233,11 +235,7 @@ async function updateAcroform({ return; } - // Clone the acroForm. - const dict = new Dict(xref); - for (const key of acroForm.getKeys()) { - dict.set(key, acroForm.getRaw(key)); - } + const dict = acroForm.clone(); if (hasXfa && !hasXfaDatasetsEntry) { // We've a XFA array which doesn't contain a datasets entry. 
diff --git a/test/unit/api_spec.js b/test/unit/api_spec.js index 0ad48ec99d5a4d..4f3334cf5cb640 100644 --- a/test/unit/api_spec.js +++ b/test/unit/api_spec.js @@ -2297,6 +2297,114 @@ describe("api", function () { await loadingTask.destroy(); }); + it("write a new stamp annotation in a tagged pdf, save and check that the structure tree", async function () { + if (isNodeJS) { + pending("Cannot create a bitmap from Node.js."); + } + + const TEST_IMAGES_PATH = "../images/"; + const filename = "firefox_logo.png"; + const path = new URL(TEST_IMAGES_PATH + filename, window.location).href; + + const response = await fetch(path); + const blob = await response.blob(); + const bitmap = await createImageBitmap(blob); + + let loadingTask = getDocument(buildGetDocumentParams("bug1823296.pdf")); + let pdfDoc = await loadingTask.promise; + pdfDoc.annotationStorage.setValue("pdfjs_internal_editor_0", { + annotationType: AnnotationEditorType.STAMP, + rect: [128, 400, 148, 420], + rotation: 0, + bitmap, + bitmapId: "im1", + pageIndex: 0, + structTreeParentId: "p3R_mc12", + accessibilityData: { + type: "Figure", + alt: "Hello World", + }, + }); + + const data = await pdfDoc.saveDocument(); + await loadingTask.destroy(); + + loadingTask = getDocument(data); + pdfDoc = await loadingTask.promise; + const page = await pdfDoc.getPage(1); + const tree = await page.getStructTree(); + const leaf = tree.children[0].children[6].children[1]; + + expect(leaf).toEqual({ + role: "Figure", + children: [ + { + type: "annotation", + id: "pdfjs_internal_id_477R", + }, + ], + alt: "Hello World", + }); + + await loadingTask.destroy(); + }); + + it("write a new stamp annotation in a non-tagged pdf, save and check that the structure tree", async function () { + if (isNodeJS) { + pending("Cannot create a bitmap from Node.js."); + } + + const TEST_IMAGES_PATH = "../images/"; + const filename = "firefox_logo.png"; + const path = new URL(TEST_IMAGES_PATH + filename, window.location).href; + + const response = 
await fetch(path); + const blob = await response.blob(); + const bitmap = await createImageBitmap(blob); + + let loadingTask = getDocument(buildGetDocumentParams("empty.pdf")); + let pdfDoc = await loadingTask.promise; + pdfDoc.annotationStorage.setValue("pdfjs_internal_editor_0", { + annotationType: AnnotationEditorType.STAMP, + rect: [128, 400, 148, 420], + rotation: 0, + bitmap, + bitmapId: "im1", + pageIndex: 0, + structTreeParentId: null, + accessibilityData: { + type: "Figure", + alt: "Hello World", + }, + }); + + const data = await pdfDoc.saveDocument(); + await loadingTask.destroy(); + + loadingTask = getDocument(data); + pdfDoc = await loadingTask.promise; + const page = await pdfDoc.getPage(1); + const tree = await page.getStructTree(); + + expect(tree).toEqual({ + children: [ + { + role: "Figure", + children: [ + { + type: "annotation", + id: "pdfjs_internal_id_18R", + }, + ], + alt: "Hello World", + }, + ], + role: "Root", + }); + + await loadingTask.destroy(); + }); + describe("Cross-origin", function () { let loadingTask; function _checkCanLoad(expectSuccess, filename, options) {