#196 - Add OCR Support for PDF Embedded Pdf files

formkiq · Dec 1, 2023 · 3fde82b · 3fde82b
1 parent 2cf924e
commit 3fde82b
Show file tree

Hide file tree

Showing 10 changed files with 248 additions and 71 deletions.
diff --git a/lambda-ocr-tesseract/build.gradle b/lambda-ocr-tesseract/build.gradle
@@ -27,9 +27,9 @@ dependencies {
 	testImplementation project(':fkq-test-utils')
 
 	testImplementation group: 'org.junit.jupiter', name: 'junit-jupiter-engine', version:'5.10.0'
-        testImplementation group: 'org.testcontainers', name: 'testcontainers', version: '1.19.0'
+    testImplementation group: 'org.testcontainers', name: 'testcontainers', version: '1.19.0'
 	testImplementation group: 'org.testcontainers', name: 'junit-jupiter', version: '1.19.0'
-        testImplementation group: 'org.testcontainers', name: 'localstack', version: '1.19.0'
+    testImplementation group: 'org.testcontainers', name: 'localstack', version: '1.19.0'
 }
 
 /*

diff --git a/lambda-ocr-tesseract/config/checkstyle/import-control.xml b/lambda-ocr-tesseract/config/checkstyle/import-control.xml
@@ -9,8 +9,38 @@
 
 		<subpackage name="ocr">
 
-			<subpackage name="tesseract">
+			<subpackage name="docx">
+				<allow pkg="java.io" />
+				<allow pkg="com.formkiq.aws.dynamodb.objects" />
+				<allow pkg="org.apache.poi.hwpf" />
+				<allow pkg="org.apache.poi.hwpf.extractor" />
 
+				<allow pkg="org.apache.poi.openxml4j.exceptions" />
+				<allow pkg="org.apache.poi.openxml4j.opc" />
+				<allow pkg="org.apache.poi.xwpf.extractor" />
+				<allow pkg="org.apache.poi.xwpf.usermodel" />
+
+				<allow pkg="com.formkiq.module.lambdaservices" />
+				<allow pkg="com.formkiq.module.ocr" />
+
+			</subpackage>
+
+			<subpackage name="pdf">
+				<allow pkg="java.io" />
+				<allow pkg="java.util" />
+
+				<allow pkg="com.formkiq.aws.dynamodb.objects" />
+
+				<allow pkg="org.apache.pdfbox.cos" />
+				<allow pkg="org.apache.pdfbox.pdmodel" />
+				<allow pkg="org.apache.pdfbox.text" />
+
+				<allow pkg="com.formkiq.module.lambdaservices" />
+				<allow pkg="com.formkiq.module.ocr" />
+			</subpackage>
+
+			<subpackage name="tesseract">
+
 				<allow pkg="com.formkiq.module.events" />
 				<allow pkg="java.io" />
 				<allow pkg="java.nio.charset" />
@@ -28,20 +58,11 @@
 				<allow pkg="com.formkiq.module.actions.services" />
 				<allow pkg="com.formkiq.module.lambdaservices" />
 				<allow pkg="com.formkiq.module.ocr" />
+				<allow pkg="com.formkiq.module.lambda.ocr.docx" />
+				<allow pkg="com.formkiq.module.lambda.ocr.pdf" />
 
 				<allow pkg="net.sourceforge.tess4j" />
 
-				<allow pkg="org.apache.poi.hwpf" />
-				<allow pkg="org.apache.poi.hwpf.extractor" />
-
-				<allow pkg="org.apache.poi.openxml4j.exceptions" />
-				<allow pkg="org.apache.poi.openxml4j.opc" />
-				<allow pkg="org.apache.poi.xwpf.extractor" />
-				<allow pkg="org.apache.poi.xwpf.usermodel" />
-
-				<allow pkg="org.apache.pdfbox.pdmodel" />
-				<allow pkg="org.apache.pdfbox.text" />
-
 				<allow pkg="software.amazon.awssdk.auth.credentials" />
 				<allow pkg="software.amazon.awssdk.regions" />
 				<allow pkg="software.amazon.awssdk.utils" />

diff --git a/...bda/ocr/tesseract/DocFormatConverter.java → ...e/lambda/ocr/docx/DocFormatConverter.java b/...bda/ocr/tesseract/DocFormatConverter.java → ...e/lambda/ocr/docx/DocFormatConverter.java
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-package com.formkiq.module.lambda.ocr.tesseract;
+package com.formkiq.module.lambda.ocr.docx;
 
 import java.io.File;
 import java.io.FileInputStream;

diff --git a/...da/ocr/tesseract/DocxFormatConverter.java → .../lambda/ocr/docx/DocxFormatConverter.java b/...da/ocr/tesseract/DocxFormatConverter.java → .../lambda/ocr/docx/DocxFormatConverter.java
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-package com.formkiq.module.lambda.ocr.tesseract;
+package com.formkiq.module.lambda.ocr.docx;
 
 import java.io.File;
 import java.io.IOException;

diff --git a/lambda-ocr-tesseract/src/main/java/com/formkiq/module/lambda/ocr/pdf/PdfFormatConverter.java b/lambda-ocr-tesseract/src/main/java/com/formkiq/module/lambda/ocr/pdf/PdfFormatConverter.java
@@ -0,0 +1,155 @@
+/**
+ * MIT License
+ * 
+ * Copyright (c) 2018 - 2020 FormKiQ
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+package com.formkiq.module.lambda.ocr.pdf;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+import java.util.Map.Entry;
+import org.apache.pdfbox.cos.COSDictionary;
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
+import org.apache.pdfbox.pdmodel.PDDocumentNameDictionary;
+import org.apache.pdfbox.pdmodel.PDEmbeddedFilesNameTreeNode;
+import org.apache.pdfbox.pdmodel.common.PDNameTreeNode;
+import org.apache.pdfbox.pdmodel.common.filespecification.PDComplexFileSpecification;
+import org.apache.pdfbox.pdmodel.common.filespecification.PDEmbeddedFile;
+import org.apache.pdfbox.text.PDFTextStripper;
+import com.formkiq.aws.dynamodb.objects.MimeType;
+import com.formkiq.module.lambdaservices.AwsServiceCache;
+import com.formkiq.module.ocr.FormatConverter;
+import com.formkiq.module.ocr.OcrSqsMessage;
+
+/**
+ * PDF {@link FormatConverter}.
+ *
+ * <p>
+ * Extracts text from a PDF using {@link PDFTextStripper}. When the document is a PDF Portfolio
+ * (its catalog has a {@code Collection} entry), the text of every embedded file whose name ends in
+ * {@code .pdf} (case-insensitive) is appended after the text of the root document.
+ */
+public class PdfFormatConverter implements FormatConverter {
+
+  /** Map key holding the name of the file the text was extracted from. */
+  private static final String KEY_FILENAME = "fileName";
+  /** Map key holding the extracted text. */
+  private static final String KEY_TEXT = "text";
+  /** File extension of embedded files that are processed. */
+  private static final String PDF_EXTENSION = ".pdf";
+
+  /**
+   * Convert a PDF {@link File} to text.
+   *
+   * @param awsServices {@link AwsServiceCache} (not used by this converter)
+   * @param sqsMessage {@link OcrSqsMessage} (not used by this converter)
+   * @param file PDF {@link File} to read
+   * @return text of the document, followed by the text of embedded PDFs for a PDF Portfolio
+   * @throws IOException if the document or one of its embedded PDFs cannot be read
+   */
+  @Override
+  public String convert(final AwsServiceCache awsServices, final OcrSqsMessage sqsMessage,
+      final File file) throws IOException {
+
+    PDFTextStripper pdfTextStripper = new PDFTextStripper();
+
+    try (PDDocument document = PDDocument.load(file)) {
+
+      if (!isPdfPortfolio(document)) {
+        return pdfTextStripper.getText(document);
+      }
+
+      StringBuilder sb = new StringBuilder();
+
+      for (Map<String, String> map : getPortfolioTextMap(pdfTextStripper, document)) {
+        sb.append(map.get(KEY_TEXT));
+      }
+
+      return sb.toString();
+    }
+  }
+
+  /**
+   * Extract the text of every embedded PDF found in a name tree leaf.
+   *
+   * @param pdfTextStripper {@link PDFTextStripper}
+   * @param names {@link Map} of embedded file name to {@link PDComplexFileSpecification}, may be
+   *        {@code null}
+   * @return {@link List} of {@link Map} with "fileName" and "text" entries, one per embedded PDF
+   * @throws IOException if an embedded PDF cannot be read
+   */
+  private List<Map<String, String>> extractFiles(final PDFTextStripper pdfTextStripper,
+      final Map<String, PDComplexFileSpecification> names) throws IOException {
+
+    List<Map<String, String>> list = new ArrayList<>();
+
+    if (names == null) {
+      return list;
+    }
+
+    for (Entry<String, PDComplexFileSpecification> e : names.entrySet()) {
+
+      String filename = e.getKey();
+      PDComplexFileSpecification fileSpec = e.getValue();
+
+      if (filename == null || fileSpec == null || !hasPdfExtension(filename)) {
+        continue;
+      }
+
+      // A file specification is not guaranteed to carry an embedded stream; skip it instead of
+      // failing the whole conversion with a NullPointerException.
+      PDEmbeddedFile embeddedFile = fileSpec.getEmbeddedFile();
+      if (embeddedFile == null) {
+        continue;
+      }
+
+      try (PDDocument document = PDDocument.load(embeddedFile.toByteArray())) {
+        list.add(Map.of(KEY_FILENAME, filename, KEY_TEXT, pdfTextStripper.getText(document)));
+      }
+    }
+
+    return list;
+  }
+
+  /**
+   * Walk an embedded files name tree and collect the text of all embedded PDFs.
+   *
+   * <p>
+   * A node either holds names directly or delegates to kid nodes, which may themselves be
+   * intermediate nodes, so the kids are visited recursively.
+   *
+   * @param pdfTextStripper {@link PDFTextStripper}
+   * @param node {@link PDNameTreeNode} to visit
+   * @param list {@link List} the extracted entries are added to
+   * @throws IOException if the tree or an embedded PDF cannot be read
+   */
+  private void collectEmbeddedFiles(final PDFTextStripper pdfTextStripper,
+      final PDNameTreeNode<PDComplexFileSpecification> node, final List<Map<String, String>> list)
+      throws IOException {
+
+    Map<String, PDComplexFileSpecification> namesMap = node.getNames();
+
+    if (namesMap != null) {
+      list.addAll(extractFiles(pdfTextStripper, namesMap));
+      return;
+    }
+
+    List<PDNameTreeNode<PDComplexFileSpecification>> kids = node.getKids();
+
+    if (kids != null) {
+      for (PDNameTreeNode<PDComplexFileSpecification> kid : kids) {
+        collectEmbeddedFiles(pdfTextStripper, kid, list);
+      }
+    }
+  }
+
+  /**
+   * Get the text of a Portfolio and of the PDF files embedded in it.
+   *
+   * @param pdfTextStripper {@link PDFTextStripper}
+   * @param document {@link PDDocument}
+   * @return {@link List} of {@link Map} with "fileName" and "text" entries; the first entry is the
+   *         root document (fileName "root"), followed by one entry per embedded PDF
+   * @throws IOException IOException
+   */
+  private List<Map<String, String>> getPortfolioTextMap(final PDFTextStripper pdfTextStripper,
+      final PDDocument document) throws IOException {
+
+    List<Map<String, String>> list = new ArrayList<>();
+
+    String text = pdfTextStripper.getText(document);
+    list.add(Map.of(KEY_FILENAME, "root", KEY_TEXT, text));
+
+    PDDocumentNameDictionary names = new PDDocumentNameDictionary(document.getDocumentCatalog());
+    PDEmbeddedFilesNameTreeNode efTree = names.getEmbeddedFiles();
+
+    if (efTree != null) {
+      collectEmbeddedFiles(pdfTextStripper, efTree, list);
+    }
+
+    return list;
+  }
+
+  /**
+   * Whether a file name ends with ".pdf", ignoring case.
+   *
+   * @param filename {@link String}
+   * @return boolean
+   */
+  private static boolean hasPdfExtension(final String filename) {
+    int offset = filename.length() - PDF_EXTENSION.length();
+    // regionMatches returns false for a negative offset, i.e. names shorter than the extension.
+    return filename.regionMatches(true, offset, PDF_EXTENSION, 0, PDF_EXTENSION.length());
+  }
+
+  /**
+   * Whether {@link PDDocument} is a Portfolio.
+   * 
+   * @param document {@link PDDocument}
+   * @return true when the document catalog contains a "Collection" entry
+   */
+  private boolean isPdfPortfolio(final PDDocument document) {
+    PDDocumentCatalog catalog = document.getDocumentCatalog();
+    COSDictionary cosObject = catalog.getCOSObject();
+    return cosObject.containsKey("Collection");
+  }
+
+  @Override
+  public boolean isSupported(final OcrSqsMessage sqsMessage, final MimeType mineType) {
+    return MimeType.MIME_PDF.equals(mineType);
+  }
+}
diff --git a/...esseract/src/main/java/com/formkiq/module/lambda/ocr/tesseract/OcrTesseractProcessor.java b/...esseract/src/main/java/com/formkiq/module/lambda/ocr/tesseract/OcrTesseractProcessor.java
@@ -58,6 +58,9 @@
 import com.formkiq.module.actions.services.ActionsServiceExtension;
 import com.formkiq.module.events.EventService;
 import com.formkiq.module.events.EventServiceSnsExtension;
+import com.formkiq.module.lambda.ocr.docx.DocFormatConverter;
+import com.formkiq.module.lambda.ocr.docx.DocxFormatConverter;
+import com.formkiq.module.lambda.ocr.pdf.PdfFormatConverter;
 import com.formkiq.module.lambdaservices.AwsServiceCache;
 import com.formkiq.module.lambdaservices.AwsServiceCacheBuilder;
 import com.formkiq.module.ocr.DocumentOcrService;

diff --git a/...r-tesseract/src/main/java/com/formkiq/module/lambda/ocr/tesseract/PdfFormatConverter.java b/...r-tesseract/src/main/java/com/formkiq/module/lambda/ocr/tesseract/PdfFormatConverter.java
diff --git a/lambda-ocr-tesseract/src/main/resources/cloudformation/template.yaml b/lambda-ocr-tesseract/src/main/resources/cloudformation/template.yaml
@@ -15,7 +15,7 @@ Parameters:
   LambdaMemory:
     Type: Number
     Description: The amount of memory used by lambda function (MB)
-    Default: 1024
+    Default: 2048
     MinValue: 128
     MaxValue: 3008
 

diff --git a/...ract/src/test/java/com/formkiq/module/lambda/ocr/tesseract/OcrTesseractProcessorTest.java b/...ract/src/test/java/com/formkiq/module/lambda/ocr/tesseract/OcrTesseractProcessorTest.java
@@ -53,6 +53,9 @@
 import com.formkiq.module.actions.ActionStatus;
 import com.formkiq.module.actions.ActionType;
 import com.formkiq.module.actions.services.ActionsService;
+import com.formkiq.module.lambda.ocr.docx.DocFormatConverter;
+import com.formkiq.module.lambda.ocr.docx.DocxFormatConverter;
+import com.formkiq.module.lambda.ocr.pdf.PdfFormatConverter;
 import com.formkiq.module.lambdaservices.AwsServiceCache;
 import com.formkiq.module.lambdaservices.AwsServiceCacheBuilder;
 import com.formkiq.module.ocr.DocumentOcrService;
@@ -396,4 +399,53 @@ void testHandleRequest06() throws Exception {
       assertEquals(ActionStatus.COMPLETE, actions.get(0).status());
     }
   }
+
+  /**
+   * Verifies that OCR of an application/pdf PDF Portfolio completes successfully.
+   *
+   * <p>
+   * For the default site and a random site: uploads "/collection.pdf", sends the OCR request
+   * through the processor, then checks the OCR status, the stored OCR text and the action status.
+   * 
+   * @throws Exception Exception
+   */
+  @Test
+  void testHandleRequest07() throws Exception {
+    // given
+    List<String> siteIds = Arrays.asList("default", UUID.randomUUID().toString());
+
+    for (String siteId : siteIds) {
+
+      String documentId = UUID.randomUUID().toString();
+      String jobId = UUID.randomUUID().toString();
+
+      // a running OCR action that the processor is expected to complete
+      Action runningOcr =
+          new Action().type(ActionType.OCR).status(ActionStatus.RUNNING).userId("joe");
+      actionsService.saveNewActions(siteId, documentId, Arrays.asList(runningOcr));
+
+      // upload the portfolio PDF as the document content
+      String documentS3Key = createS3Key(siteId, documentId);
+      try (InputStream pdf = LambdaContextRecorder.class.getResourceAsStream("/collection.pdf")) {
+        s3.putObject(BUCKET_NAME, documentS3Key, pdf, MimeType.MIME_PDF.getContentType());
+      }
+
+      Ocr requestedOcr = new Ocr().siteId(siteId).documentId(documentId).jobId(jobId)
+          .engine(OcrEngine.TESSERACT).status(OcrScanStatus.REQUESTED);
+      ocrService.save(requestedOcr);
+
+      // build the SQS event carrying the OCR request
+      Map<String, String> body = Map.of("siteId", siteId, "documentId", documentId, "jobId",
+          jobId, "contentType", MimeType.MIME_PDF.getContentType());
+      SqsMessageRecord record = new SqsMessageRecord().body(GSON.toJson(body));
+      SqsMessageRecords records = new SqsMessageRecords().records(Arrays.asList(record));
+
+      byte[] payload = GSON.toJson(records).getBytes(StandardCharsets.UTF_8);
+      InputStream request = new ByteArrayInputStream(payload);
+
+      // when
+      processor.handleRequest(request, null, this.context);
+
+      // then
+      DynamicObject ocrResult = ocrService.get(siteId, documentId);
+      assertEquals("successful", ocrResult.get("ocrStatus"));
+
+      String ocrS3Key = ocrService.getS3Key(siteId, documentId, jobId);
+      String ocrText = s3.getContentAsString(OCR_BUCKET_NAME, ocrS3Key, null);
+      assertTrue(ocrText.contains("And more text"));
+
+      List<Action> savedActions = actionsService.getActions(siteId, documentId);
+      assertEquals(ActionStatus.COMPLETE, savedActions.get(0).status());
+    }
+  }
 }
diff --git a/lambda-ocr-tesseract/src/test/resources/collection.pdf b/lambda-ocr-tesseract/src/test/resources/collection.pdf