GH-314 image and text redaction support (#335)

pcorless · Feb 6, 2024 · 3f345f7 · 3f345f7
1 parent eea10fc
commit 3f345f7
Show file tree

Hide file tree

Showing 133 changed files with 4,903 additions and 745 deletions.
diff --git a/.gitattributes b/.gitattributes
@@ -0,0 +1,5 @@
+# Set the default behavior, in case people don't have core.autocrlf set.
+* text=auto
+# Denote all files that are truly binary and should not be modified.
+# stop messing up test data.
+*.pdf binary
diff --git a/core/core-awt/src/main/java/org/icepdf/core/pobjects/Document.java b/core/core-awt/src/main/java/org/icepdf/core/pobjects/Document.java
@@ -21,6 +21,7 @@
 import org.icepdf.core.pobjects.acroform.FieldDictionary;
 import org.icepdf.core.pobjects.acroform.InteractiveForm;
 import org.icepdf.core.pobjects.annotations.AbstractWidgetAnnotation;
+import org.icepdf.core.pobjects.annotations.RedactionAnnotation;
 import org.icepdf.core.pobjects.graphics.WatermarkCallback;
 import org.icepdf.core.pobjects.graphics.images.ImageUtility;
 import org.icepdf.core.pobjects.graphics.text.PageText;
@@ -567,7 +568,7 @@ public void dispose() {
      * @return The length of the PDF file copied
      * @throws IOException if there is some problem reading or writing the PDF data
      */
-    public long writeToOutputStream(OutputStream out) throws IOException {
+    public long writeToOutputStream(OutputStream out) throws IOException, InterruptedException {
         return writeToOutputStream(out, WriteMode.INCREMENT_UPDATE);
     }
 
@@ -581,7 +582,7 @@ public long writeToOutputStream(OutputStream out) throws IOException {
      * @return The length of the PDF file copied
      * @throws IOException if there is some problem reading or writing the PDF data
      */
-    public long writeToOutputStream(OutputStream out, WriteMode writeMode) throws IOException {
+    public long writeToOutputStream(OutputStream out, WriteMode writeMode) throws IOException, InterruptedException {
         if (documentFileChannel != null) {
             synchronized (library.getMappedFileByteBufferLock()) {
                 ByteBuffer documentByteBuffer = library.getMappedFileByteBuffer();
@@ -620,7 +621,7 @@ public long writeToOutputStream(OutputStream out, WriteMode writeMode) throws IO
      * @return The length of the PDF file saved
      * @throws IOException if there is some problem reading or writing the PDF data
      */
-    public long saveToOutputStream(OutputStream out) throws IOException {
+    public long saveToOutputStream(OutputStream out) throws IOException, InterruptedException {
         return writeToOutputStream(out, WriteMode.INCREMENT_UPDATE);
     }
 
@@ -633,7 +634,7 @@ public long saveToOutputStream(OutputStream out) throws IOException {
      * @return The length of the PDF file saved
      * @throws IOException if there is some problem reading or writing the PDF data
      */
-    public long saveToOutputStream(OutputStream out, WriteMode writeMode) throws IOException {
+    public long saveToOutputStream(OutputStream out, WriteMode writeMode) throws IOException, InterruptedException {
         return writeToOutputStream(out, writeMode);
     }
 
@@ -720,6 +721,26 @@ public PageText getPageViewText(int pageNumber) throws InterruptedException {
         }
     }
 
+    /**
+     * Checks if the document contains at least one redaction annotation.
+     *
+     * @return true if the state manager or any page in the page tree reports
+     * a redaction annotation, otherwise false.
+     */
+    public boolean hasRedactions() {
+        // check state manager first as this will be a bit cheaper than scanning each page in the document.
+        if (stateManager.hasRedactions()) {
+            return true;
+        }
+        PageTree pageTree = catalog.getPageTree();
+        for (int i = 0, max = pageTree.getNumberOfPages(); i < max; i++) {
+            List<RedactionAnnotation> redactions = pageTree.getPage(i).getRedactionAnnotations();
+            // a page with a single redaction annotation must count as well
+            if (redactions != null && !redactions.isEmpty()) {
+                return true;
+            }
+        }
+        return false;
+    }
+
+
     /**
      * Gets the security manager for this document. If the document has no
      * security manager null is returned.
@@ -883,7 +904,7 @@ public Catalog getCatalog() {
     }
 
     /**
-     * Sets the caching mode when handling file loaded by an URI.  If enabled
+     * Sets the caching mode when handling file loaded by a URI.  If enabled
      * URI streams will be cached to disk, otherwise they will be stored in
      * memory. This method must be set before a call to setByteArray() or
      * setInputStream() is called.

diff --git a/core/core-awt/src/main/java/org/icepdf/core/pobjects/Form.java b/core/core-awt/src/main/java/org/icepdf/core/pobjects/Form.java
@@ -20,6 +20,7 @@
 import org.icepdf.core.pobjects.graphics.Shapes;
 import org.icepdf.core.util.Library;
 import org.icepdf.core.util.parser.content.ContentParser;
+import org.icepdf.core.util.updater.callbacks.ContentStreamRedactorCallback;
 
 import java.awt.geom.AffineTransform;
 import java.awt.geom.Rectangle2D;
@@ -148,10 +149,16 @@ public void setParentResources(Resources parentResource) {
         this.parentResource = parentResource;
     }
 
+
+    /**
+     * Initializes this form without a content stream redactor callback by
+     * delegating to {@link #init(ContentStreamRedactorCallback)} with a null
+     * callback.
+     *
+     * @throws InterruptedException propagated from the delegated init call.
+     */
+    public synchronized void init() throws InterruptedException {
+        init(null);
+    }
+
     /**
      *
      */
-    public synchronized void init() throws InterruptedException {
+    public synchronized void init(ContentStreamRedactorCallback contentStreamRedactorCallback)
+            throws InterruptedException {
         if (inited) {
             return;
         }
@@ -171,13 +178,13 @@ public synchronized void init() throws InterruptedException {
         }
         // Build a new content parser for the content streams and apply the
         // content stream of the calling content stream.
-        ContentParser cp = new ContentParser(library, leafResources);
+        ContentParser cp = new ContentParser(library, leafResources, contentStreamRedactorCallback);
         cp.setGraphicsState(graphicsState);
         byte[] in = getDecodedStreamBytes();
         if (in != null) {
             try {
                 logger.log(Level.FINER, () -> "Parsing form " + getPObjectReference());
-                shapes = cp.parse(new byte[][]{in}, new Reference[]{this.getPObjectReference()}, null).getShapes();
+                shapes = cp.parse(Stream.fromByteArray(in, this.getPObjectReference()), null).getShapes();
                 inited = true;
             } catch (InterruptedException e) {
                 // the initialization was interrupted so we need to make sure we bubble up the exception

diff --git a/core/core-awt/src/main/java/org/icepdf/core/pobjects/LiteralStringObject.java b/core/core-awt/src/main/java/org/icepdf/core/pobjects/LiteralStringObject.java
@@ -165,15 +165,15 @@ public String getLiteralString() {
 
     /**
      * <p>Gets a literal String representation of this object's data using the
-     * specifed font and format.  The font is used to verify that the
-     * specific character codes can be rendered; if they cannot they may be
+     * specified font and format.  The font is used to verify that the
+     * specific character codes can be rendered; if they cannot, they may be
      * removed or combined with the next character code to get a displayable
      * character code.
      *
      * @param fontFormat the type of pdf font which will be used to display
      *                   the text.  Valid values are CID_FORMAT and SIMPLE_FORMAT for Adobe
      *                   Composite and Simple font types respectively
-     * @param font       font used to render the the literal string data.
+     * @param font       font used to render the literal string data.
      * @return StringBuffer which contains all renderable characters for the
      *         given font.
      */
@@ -217,7 +217,7 @@ public StringBuilder getLiteralStringBuffer(final int fontFormat, FontFile font)
     }
 
     /**
-     * The length of the the underlying object's data.
+     * The length of the underlying object's data.
      *
      * @return length of objcts data.
      */

diff --git a/core/core-awt/src/main/java/org/icepdf/core/pobjects/Page.java b/core/core-awt/src/main/java/org/icepdf/core/pobjects/Page.java
@@ -17,10 +17,7 @@
 
 import org.icepdf.core.events.*;
 import org.icepdf.core.io.SeekableInput;
-import org.icepdf.core.pobjects.annotations.Annotation;
-import org.icepdf.core.pobjects.annotations.MarkupAnnotation;
-import org.icepdf.core.pobjects.annotations.MarkupGlueAnnotation;
-import org.icepdf.core.pobjects.annotations.PopupAnnotation;
+import org.icepdf.core.pobjects.annotations.*;
 import org.icepdf.core.pobjects.graphics.Shapes;
 import org.icepdf.core.pobjects.graphics.WatermarkCallback;
 import org.icepdf.core.pobjects.graphics.text.GlyphText;
@@ -29,6 +26,7 @@
 import org.icepdf.core.pobjects.graphics.text.WordText;
 import org.icepdf.core.util.*;
 import org.icepdf.core.util.parser.content.ContentParser;
+import org.icepdf.core.util.updater.callbacks.ContentStreamRedactorCallback;
 import org.icepdf.core.util.updater.modifiers.AnnotationRemovalModifier;
 import org.icepdf.core.util.updater.modifiers.ModifierFactory;
 
@@ -333,7 +331,7 @@ else if (annotObj instanceof DictionaryEntries) { // HashMap lacks "Type"->"Anno
                             if (creator.equals(SystemProperties.USER_NAME)) {
                                 annotations.add(a);
                             } else {
-                                // other wise we skip it all together but make sure the popup is hidden.
+                                // otherwise we skip it altogether but make sure the popup is hidden.
                                 if (markupAnnotation.getPopupAnnotation() != null) {
                                     markupAnnotation.getPopupAnnotation().setOpen(false);
                                 }
@@ -352,7 +350,7 @@ else if (annotObj instanceof DictionaryEntries) { // HashMap lacks "Type"->"Anno
                     logger.log(Level.WARNING, e, () -> " " + finalA.getPObjectReference() + finalA.getEntries());
                 }
             }
-            //The popup annotations may not be referenced in the page annotations entry, we have to add them manually.
+            // The popup annotations may not be referenced in the page annotations entry, we have to add them manually.
             final Set<Annotation> annotSet = new HashSet<>(annotations);
             for (final Annotation annot : annotSet) {
                 if (annot instanceof MarkupAnnotation) {
@@ -380,6 +378,15 @@ public void resetInitializedState() {
      * child elements.  Once a page has been initialized, it can be painted.
      */
     public synchronized void init() throws InterruptedException {
+        init(null);
+    }
+
+    /**
+     * Initialize the Page object.  This method triggers the parsing of a page's
+     * child elements.  Once a page has been initialized, it can be painted.
+     * @param contentStreamRedactorCallback callback used to rewrite the content stream
+     */
+    public synchronized void init(ContentStreamRedactorCallback contentStreamRedactorCallback) throws InterruptedException {
         try {
             // make sure we are not revisiting this method
             if (inited) {
@@ -407,34 +414,31 @@ public synchronized void init() throws InterruptedException {
             }
 
             /*
-              Finally iterate through the contents vector and concat all of the
-              the resource streams together so that the content parser can
-              go to town and build all of the page's shapes.
+              Finally iterate through the contents vector and concat all the
+              resource streams together so that the content parser can
+              go to town and build all the page's shapes.
              */
             notifyPageInitializationStarted();
             if (contents != null) {
                 try {
-                    ContentParser cp = new ContentParser(library, resources);
-                    byte[][] streams = new byte[contents.size()][];
-                    byte[] stream;
-                    Reference[] references = new Reference[contents.size()];
+                    ContentParser cp = new ContentParser(library, resources, contentStreamRedactorCallback);
+                    Stream[] streams = new Stream[contents.size()];
+                    byte[] streamByte;
                     for (int i = 0, max = contents.size(); i < max; i++) {
-                        stream = contents.get(i).getDecodedStreamBytes();
-                        if (stream != null) {
-                            streams[i] = stream;
-                            references[i] = contents.get(i).pObjectReference;
+                        streamByte = contents.get(i).getDecodedStreamBytes();
+                        if (streamByte != null) {
+                            streams[i] = contents.get(i);
                         }
                     }
-                    // get any optional groups from the catalog, which control
-                    // visibility
+                    // get any optional groups from the catalog, which control visibility
                     OptionalContent optionalContent = library.getCatalog().getOptionalContent();
                     if (optionalContent != null) {
                         optionalContent.init();
                     }
 
                     // pass in option group references into parse.
                     if (streams.length > 0) {
-                        shapes = cp.parse(streams, references, this).getShapes();
+                        shapes = cp.parse(streams, this).getShapes();
                     }
                     // set the initiated flag, first as there are couple corner
                     // cases where the content parsing can call page.init() again
@@ -455,14 +459,18 @@ public synchronized void init() throws InterruptedException {
                 logger.log(Level.WARNING, "Error initializing Page, no page content.");
             }
         } catch (InterruptedException e) {
-            // keeps shapes vector so we can paint what we have but make init state as false
-            // so we can try to re parse it later.
+            // keeps shapes vector so that we can paint what we have but make init state as false
+            // so that we can try to reparse it later.
             inited = false;
             throw new InterruptedException(e.getMessage());
         }
         notifyPageInitializationEnded(inited);
     }
 
+    /**
+     * Gets the content streams held by this page.  The internal list is
+     * returned directly, no copy is made.
+     *
+     * @return the page's content stream list as currently held; may be null
+     * as other code in this class checks the field for null before use.
+     */
+    public List<Stream> getContentStreams() {
+        return contents;
+    }
+
     /**
      * Gets a Thumbnail object associated with this page.  If no Thumbnail
      * entry exists then null is returned.
@@ -679,6 +687,7 @@ private void paintPageContent(Graphics2D g2, int renderHintType, float userRotat
                         }
                     }
                 }
+                g2.setComposite(AlphaComposite.getInstance(AlphaComposite.SRC_OVER, 1.0f));
                 //g2.setComposite(BlendComposite.getInstance(BlendComposite.BlendingMode.NORMAL, 1.0f));
             }
         }
@@ -1296,9 +1305,27 @@ public List<Annotation> getAnnotations() {
         return annotations;
     }
 
+    /**
+     * Gets the redaction annotations associated with this page.  If the page
+     * annotations have not been loaded yet, an attempt is made to initialize
+     * them first.
+     *
+     * @return list of the page's annotations that are RedactionAnnotation
+     * instances, or null if the page annotations are still null after the
+     * initialization attempt.
+     */
+    public List<RedactionAnnotation> getRedactionAnnotations() {
+        if (annotations == null) {
+            try {
+                initPageAnnotations();
+            } catch (InterruptedException e) {
+                // interruption is only logged; we carry on with whatever state we have
+                logger.finer("Interrupt exception getting annotations. ");
+            }
+        }
+        if (annotations == null) {
+            return null;
+        }
+        // todo make this method more generic to any Annotation subtype
+        return annotations.stream()
+                .filter(annotation -> annotation instanceof RedactionAnnotation)
+                .map(annotation -> (RedactionAnnotation) annotation)
+                .collect(Collectors.toList());
+    }
+
     /**
      * Returns the decoded content stream for this page instance.  A page instance
-     * can have more then one content stream associated with it.
+     * can have more than one content stream associated with it.
      *
      * @return An array of decoded content stream.  Each index in the array
      * represents one content stream.  Null return and null String array
@@ -1379,7 +1406,7 @@ public Rectangle2D.Float getCropBox() {
         if (cropBox != null) {
             return cropBox;
         }
-        // add all of the pages crop box dimensions to a vector and process
+        // add all the pages crop box dimensions to a vector and process
         List boxDimensions = (List) (library.getObject(entries, CROPBOX_KEY));
         if (boxDimensions != null) {
             cropBox = new PRectangle(boxDimensions);
@@ -1532,9 +1559,9 @@ public synchronized PageText getText() throws InterruptedException {
         Shapes textBlockShapes = new Shapes();
 
         /*
-          Finally iterate through the contents vector and concat all of the
-          the resource streams together so that the content parser can
-          go to town and build all of the pages shapes.
+          Finally iterate through the contents array and concat all the
+          resource streams together so that the content parser can
+          go to town and build all the pages shapes.
          */
         if (contents == null) {
             // Get the value of the page's content entry
@@ -1547,12 +1574,9 @@ public synchronized PageText getText() throws InterruptedException {
         }
         if (contents != null) {
             try {
-
                 ContentParser cp = new ContentParser(library, resources);
-                byte[][] streams = new byte[contents.size()][];
-                for (int i = 0, max = contents.size(); i < max; i++) {
-                    streams[i] = contents.get(i).getDecodedStreamBytes();
-                }
+                Stream[] streams = new Stream[contents.size()];
+                contents.toArray(streams);
                 textBlockShapes = cp.parseTextBlocks(streams);
                 // print off any fuzz left on the stack
                 if (logger.isLoggable(Level.FINER)) {

diff --git a/core/core-awt/src/main/java/org/icepdf/core/pobjects/Resources.java b/core/core-awt/src/main/java/org/icepdf/core/pobjects/Resources.java
@@ -212,6 +212,10 @@ public Object getXObject(Name s) {
         return library.getObject(xobjects, s);
     }
 
+    /**
+     * Gets the XObject dictionary entries held by this resources instance.
+     * The internal dictionary is returned directly, no copy is made.
+     *
+     * @return the xobjects dictionary entries as currently held.
+     */
+    public DictionaryEntries getXObjects() {
+        return xobjects;
+    }
+
     /**
      * Gets a rough count of the images resources associated with this page. Does
      * not include inline images.

diff --git a/core/core-awt/src/main/java/org/icepdf/core/pobjects/StateManager.java b/core/core-awt/src/main/java/org/icepdf/core/pobjects/StateManager.java
@@ -15,6 +15,7 @@
  */
 package org.icepdf.core.pobjects;
 
+import org.icepdf.core.pobjects.annotations.RedactionAnnotation;
 import org.icepdf.core.pobjects.structure.CrossReferenceRoot;
 
 import java.util.*;
@@ -205,6 +206,18 @@ public CrossReferenceRoot getCrossReferenceRoot() {
         return crossReferenceRoot;
     }
 
+    /**
+     * Checks the tracked changes for a redaction annotation.
+     *
+     * @return true if the object of any tracked change is a
+     * RedactionAnnotation, false otherwise or when there are no changes.
+     */
+    public boolean hasRedactions() {
+        Collection<Change> pendingChanges = changes.values();
+        // anyMatch stops at the first hit and yields false for an empty collection
+        return pendingChanges.stream()
+                .map(change -> change.getPObject().getObject())
+                .anyMatch(RedactionAnnotation.class::isInstance);
+    }
+
     private static class PObjectComparatorByReferenceObjectNumber
             implements Comparator<Change> {
         public int compare(Change a, Change b) {