Skip to content

Commit

Permalink
#196 - Add OCR support for PDF files embedded in a PDF
Browse files Browse the repository at this point in the history
  • Loading branch information
mfriesen committed Dec 1, 2023
1 parent 2cf924e commit 3fde82b
Show file tree
Hide file tree
Showing 10 changed files with 248 additions and 71 deletions.
4 changes: 2 additions & 2 deletions lambda-ocr-tesseract/build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,9 @@ dependencies {
testImplementation project(':fkq-test-utils')

testImplementation group: 'org.junit.jupiter', name: 'junit-jupiter-engine', version:'5.10.0'
testImplementation group: 'org.testcontainers', name: 'testcontainers', version: '1.19.0'
testImplementation group: 'org.testcontainers', name: 'testcontainers', version: '1.19.0'
testImplementation group: 'org.testcontainers', name: 'junit-jupiter', version: '1.19.0'
testImplementation group: 'org.testcontainers', name: 'localstack', version: '1.19.0'
testImplementation group: 'org.testcontainers', name: 'localstack', version: '1.19.0'
}

/*
Expand Down
45 changes: 33 additions & 12 deletions lambda-ocr-tesseract/config/checkstyle/import-control.xml
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,38 @@

<subpackage name="ocr">

<subpackage name="tesseract">
<subpackage name="docx">
<allow pkg="java.io" />
<allow pkg="com.formkiq.aws.dynamodb.objects" />
<allow pkg="org.apache.poi.hwpf" />
<allow pkg="org.apache.poi.hwpf.extractor" />

<allow pkg="org.apache.poi.openxml4j.exceptions" />
<allow pkg="org.apache.poi.openxml4j.opc" />
<allow pkg="org.apache.poi.xwpf.extractor" />
<allow pkg="org.apache.poi.xwpf.usermodel" />

<allow pkg="com.formkiq.module.lambdaservices" />
<allow pkg="com.formkiq.module.ocr" />

</subpackage>

<subpackage name="pdf">
<allow pkg="java.io" />
<allow pkg="java.util" />

<allow pkg="com.formkiq.aws.dynamodb.objects" />

<allow pkg="org.apache.pdfbox.cos" />
<allow pkg="org.apache.pdfbox.pdmodel" />
<allow pkg="org.apache.pdfbox.text" />

<allow pkg="com.formkiq.module.lambdaservices" />
<allow pkg="com.formkiq.module.ocr" />
</subpackage>

<subpackage name="tesseract">

<allow pkg="com.formkiq.module.events" />
<allow pkg="java.io" />
<allow pkg="java.nio.charset" />
Expand All @@ -28,20 +58,11 @@
<allow pkg="com.formkiq.module.actions.services" />
<allow pkg="com.formkiq.module.lambdaservices" />
<allow pkg="com.formkiq.module.ocr" />
<allow pkg="com.formkiq.module.lambda.ocr.docx" />
<allow pkg="com.formkiq.module.lambda.ocr.pdf" />

<allow pkg="net.sourceforge.tess4j" />

<allow pkg="org.apache.poi.hwpf" />
<allow pkg="org.apache.poi.hwpf.extractor" />

<allow pkg="org.apache.poi.openxml4j.exceptions" />
<allow pkg="org.apache.poi.openxml4j.opc" />
<allow pkg="org.apache.poi.xwpf.extractor" />
<allow pkg="org.apache.poi.xwpf.usermodel" />

<allow pkg="org.apache.pdfbox.pdmodel" />
<allow pkg="org.apache.pdfbox.text" />

<allow pkg="software.amazon.awssdk.auth.credentials" />
<allow pkg="software.amazon.awssdk.regions" />
<allow pkg="software.amazon.awssdk.utils" />
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
package com.formkiq.module.lambda.ocr.tesseract;
package com.formkiq.module.lambda.ocr.docx;

import java.io.File;
import java.io.FileInputStream;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
package com.formkiq.module.lambda.ocr.tesseract;
package com.formkiq.module.lambda.ocr.docx;

import java.io.File;
import java.io.IOException;
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,155 @@
/**
* MIT License
*
* Copyright (c) 2018 - 2020 FormKiQ
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
package com.formkiq.module.lambda.ocr.pdf;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import org.apache.pdfbox.cos.COSDictionary;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
import org.apache.pdfbox.pdmodel.PDDocumentNameDictionary;
import org.apache.pdfbox.pdmodel.PDEmbeddedFilesNameTreeNode;
import org.apache.pdfbox.pdmodel.common.PDNameTreeNode;
import org.apache.pdfbox.pdmodel.common.filespecification.PDComplexFileSpecification;
import org.apache.pdfbox.pdmodel.common.filespecification.PDEmbeddedFile;
import org.apache.pdfbox.text.PDFTextStripper;
import com.formkiq.aws.dynamodb.objects.MimeType;
import com.formkiq.module.lambdaservices.AwsServiceCache;
import com.formkiq.module.ocr.FormatConverter;
import com.formkiq.module.ocr.OcrSqsMessage;

/**
 * PDF {@link FormatConverter}.
 *
 * <p>
 * Extracts the text of a PDF using {@link PDFTextStripper}. When the document is a PDF Portfolio
 * (its catalog contains a {@code Collection} entry), the text of every embedded file whose name
 * ends with {@code .pdf} is appended after the text of the container document.
 */
public class PdfFormatConverter implements FormatConverter {

  /** File extension an embedded file must have to be processed. */
  private static final String PDF_EXTENSION = ".pdf";

  /**
   * Convert a PDF {@link File} to text.
   *
   * @param awsServices {@link AwsServiceCache} (not used by this converter)
   * @param sqsMessage {@link OcrSqsMessage} (not used by this converter)
   * @param file PDF {@link File} to read
   * @return text of the document, followed by the text of embedded PDFs for a portfolio
   * @throws IOException when the document or one of its embedded PDFs cannot be read
   */
  @Override
  public String convert(final AwsServiceCache awsServices, final OcrSqsMessage sqsMessage,
      final File file) throws IOException {

    StringBuilder sb = new StringBuilder();

    PDFTextStripper pdfTextStripper = new PDFTextStripper();

    try (PDDocument document = PDDocument.load(file)) {

      if (isPdfPortfolio(document)) {

        List<Map<String, String>> texts = getPortfolioTextMap(pdfTextStripper, document);

        for (Map<String, String> map : texts) {
          sb.append(map.get("text"));
        }

      } else {

        sb.append(pdfTextStripper.getText(document));
      }

      return sb.toString();
    }
  }

  /**
   * Extract the text of every embedded PDF found in a name tree leaf.
   *
   * @param pdfTextStripper {@link PDFTextStripper}
   * @param names {@link Map} of embedded file name to {@link PDComplexFileSpecification}, may be
   *        {@code null}
   * @return {@link List} of {@link Map} with "fileName" and "text" keys, never {@code null}
   * @throws IOException when an embedded PDF cannot be read
   */
  private List<Map<String, String>> extractFiles(final PDFTextStripper pdfTextStripper,
      final Map<String, PDComplexFileSpecification> names) throws IOException {

    List<Map<String, String>> list = new ArrayList<>();

    // A name tree node without a Names array yields null; treat it as "no files".
    if (names == null) {
      return list;
    }

    for (Entry<String, PDComplexFileSpecification> e : names.entrySet()) {

      String filename = e.getKey();
      PDComplexFileSpecification fileSpec = e.getValue();

      // A file specification may reference an external file and carry no embedded stream.
      PDEmbeddedFile embeddedFile = fileSpec != null ? fileSpec.getEmbeddedFile() : null;

      if (embeddedFile == null || !hasPdfExtension(filename)) {
        continue;
      }

      try (PDDocument document = PDDocument.load(embeddedFile.toByteArray())) {
        list.add(Map.of("fileName", filename, "text", pdfTextStripper.getText(document)));
      }
    }

    return list;
  }

  /**
   * Walk a name tree node, collecting the text of embedded PDFs from the node itself and from all
   * of its descendants.
   *
   * @param pdfTextStripper {@link PDFTextStripper}
   * @param node name tree node to visit
   * @param list {@link List} the results are appended to
   * @throws IOException when an embedded PDF cannot be read
   */
  private void collectEmbeddedFiles(final PDFTextStripper pdfTextStripper,
      final PDNameTreeNode<PDComplexFileSpecification> node, final List<Map<String, String>> list)
      throws IOException {

    list.addAll(extractFiles(pdfTextStripper, node.getNames()));

    // Intermediate nodes hold their entries in child nodes, which may themselves be nested.
    List<PDNameTreeNode<PDComplexFileSpecification>> kids = node.getKids();
    if (kids != null) {
      for (PDNameTreeNode<PDComplexFileSpecification> kid : kids) {
        if (kid != null) {
          collectEmbeddedFiles(pdfTextStripper, kid, list);
        }
      }
    }
  }

  /**
   * Get {@link List} of Portfolio file names and their text. The first element is the container
   * document itself, under the file name "root".
   *
   * @param pdfTextStripper {@link PDFTextStripper}
   * @param document {@link PDDocument}
   * @return {@link List} of {@link Map} with "fileName" and "text" keys
   * @throws IOException IOException
   */
  private List<Map<String, String>> getPortfolioTextMap(final PDFTextStripper pdfTextStripper,
      final PDDocument document) throws IOException {

    List<Map<String, String>> list = new ArrayList<>();

    String text = pdfTextStripper.getText(document);
    list.add(Map.of("fileName", "root", "text", text));

    PDDocumentNameDictionary names = new PDDocumentNameDictionary(document.getDocumentCatalog());
    PDEmbeddedFilesNameTreeNode efTree = names.getEmbeddedFiles();

    if (efTree != null) {
      collectEmbeddedFiles(pdfTextStripper, efTree, list);
    }

    return list;
  }

  /**
   * Whether the file name ends with ".pdf", ignoring case.
   *
   * @param filename file name, may be {@code null}
   * @return boolean
   */
  private boolean hasPdfExtension(final String filename) {
    if (filename == null) {
      return false;
    }

    // regionMatches returns false for a negative offset, i.e. names shorter than the extension.
    int offset = filename.length() - PDF_EXTENSION.length();
    return filename.regionMatches(true, offset, PDF_EXTENSION, 0, PDF_EXTENSION.length());
  }

  /**
   * Whether {@link PDDocument} is a Portfolio.
   *
   * @param document {@link PDDocument}
   * @return boolean
   */
  private boolean isPdfPortfolio(final PDDocument document) {
    PDDocumentCatalog catalog = document.getDocumentCatalog();
    COSDictionary cosObject = catalog.getCOSObject();
    return cosObject.containsKey("Collection");
  }

  /**
   * Whether this converter handles the given {@link MimeType}.
   *
   * @param sqsMessage {@link OcrSqsMessage} (not used by this converter)
   * @param mineType {@link MimeType} of the document
   * @return {@code true} only for {@link MimeType#MIME_PDF}
   */
  @Override
  public boolean isSupported(final OcrSqsMessage sqsMessage, final MimeType mineType) {
    return MimeType.MIME_PDF.equals(mineType);
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,9 @@
import com.formkiq.module.actions.services.ActionsServiceExtension;
import com.formkiq.module.events.EventService;
import com.formkiq.module.events.EventServiceSnsExtension;
import com.formkiq.module.lambda.ocr.docx.DocFormatConverter;
import com.formkiq.module.lambda.ocr.docx.DocxFormatConverter;
import com.formkiq.module.lambda.ocr.pdf.PdfFormatConverter;
import com.formkiq.module.lambdaservices.AwsServiceCache;
import com.formkiq.module.lambdaservices.AwsServiceCacheBuilder;
import com.formkiq.module.ocr.DocumentOcrService;
Expand Down

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ Parameters:
LambdaMemory:
Type: Number
Description: The amount of memory used by lambda function (MB)
Default: 1024
Default: 2048
MinValue: 128
MaxValue: 3008

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,9 @@
import com.formkiq.module.actions.ActionStatus;
import com.formkiq.module.actions.ActionType;
import com.formkiq.module.actions.services.ActionsService;
import com.formkiq.module.lambda.ocr.docx.DocFormatConverter;
import com.formkiq.module.lambda.ocr.docx.DocxFormatConverter;
import com.formkiq.module.lambda.ocr.pdf.PdfFormatConverter;
import com.formkiq.module.lambdaservices.AwsServiceCache;
import com.formkiq.module.lambdaservices.AwsServiceCacheBuilder;
import com.formkiq.module.ocr.DocumentOcrService;
Expand Down Expand Up @@ -396,4 +399,53 @@ void testHandleRequest06() throws Exception {
assertEquals(ActionStatus.COMPLETE, actions.get(0).status());
}
}

/**
 * A PDF Portfolio uploaded as application/pdf is processed successfully: the OCR record is marked
 * "successful", the stored OCR content contains the expected text and the OCR action completes.
 *
 * @throws Exception Exception
 */
@Test
void testHandleRequest07() throws Exception {
  final String pdfContentType = MimeType.MIME_PDF.getContentType();

  for (String siteId : Arrays.asList("default", UUID.randomUUID().toString())) {

    // given
    final String documentId = UUID.randomUUID().toString();
    final String jobId = UUID.randomUUID().toString();

    Action runningOcr = new Action().type(ActionType.OCR).status(ActionStatus.RUNNING).userId("joe");
    actionsService.saveNewActions(siteId, documentId, Arrays.asList(runningOcr));

    // upload the portfolio fixture as the document content
    final String sourceKey = createS3Key(siteId, documentId);
    try (InputStream pdf = LambdaContextRecorder.class.getResourceAsStream("/collection.pdf")) {
      s3.putObject(BUCKET_NAME, sourceKey, pdf, pdfContentType);
    }

    Ocr requested = new Ocr().siteId(siteId).documentId(documentId).jobId(jobId)
        .engine(OcrEngine.TESSERACT).status(OcrScanStatus.REQUESTED);
    ocrService.save(requested);

    Map<String, String> body = Map.of("siteId", siteId, "documentId", documentId, "jobId", jobId,
        "contentType", pdfContentType);
    SqsMessageRecord message = new SqsMessageRecord().body(GSON.toJson(body));
    SqsMessageRecords batch = new SqsMessageRecords().records(Arrays.asList(message));

    byte[] payload = GSON.toJson(batch).getBytes(StandardCharsets.UTF_8);
    InputStream request = new ByteArrayInputStream(payload);

    // when
    processor.handleRequest(request, null, this.context);

    // then
    DynamicObject stored = ocrService.get(siteId, documentId);
    assertEquals("successful", stored.get("ocrStatus"));

    String resultKey = ocrService.getS3Key(siteId, documentId, jobId);
    String ocrText = s3.getContentAsString(OCR_BUCKET_NAME, resultKey, null);
    assertTrue(ocrText.contains("And more text"));

    List<Action> savedActions = actionsService.getActions(siteId, documentId);
    assertEquals(ActionStatus.COMPLETE, savedActions.get(0).status());
  }
}
}
Binary file not shown.

0 comments on commit 3fde82b

Please sign in to comment.