Merge pull request #1510 from macb74/feature/add_psm_option

add "Preserve Interword Spacing" and "Page Seg Mode"
dadoonet · Sep 14, 2024 · 6fd3502 · 6fd3502
2 parents 669895d + 893a1e1
commit 6fd3502
Show file tree

Hide file tree

Showing 3 changed files with 95 additions and 19 deletions.
diff --git a/docs/source/user/ocr.rst b/docs/source/user/ocr.rst
@@ -17,21 +17,27 @@ OCR settings
 
 Here is a list of OCR settings (under ``fs.ocr`` prefix):
 
-+-------------------------+------------------+------------------------------------+
-| Name                    |   Default value  | Documentation                      |
-+=========================+==================+====================================+
-| ``fs.ocr.enabled``      | ``true``         | `Disable/Enable OCR`_              |
-+-------------------------+------------------+------------------------------------+
-| ``fs.ocr.language``     | ``"eng"``        | `OCR Language`_                    |
-+-------------------------+------------------+------------------------------------+
-| ``fs.ocr.path``         | ``null``         | `OCR Path`_                        |
-+-------------------------+------------------+------------------------------------+
-| ``fs.ocr.data_path``    | ``null``         | `OCR Data Path`_                   |
-+-------------------------+------------------+------------------------------------+
-| ``fs.ocr.output_type``  | ``txt``          | `OCR Output Type`_                 |
-+-------------------------+------------------+------------------------------------+
-| ``fs.ocr.pdf_strategy`` | ``ocr_and_text`` | `OCR PDF Strategy`_                |
-+-------------------------+------------------+------------------------------------+
++---------------------------------------+------------------+-----------------------------------+
+| Name                                  |   Default value  | Documentation                     |
++=======================================+==================+===================================+
+| ``fs.ocr.enabled``                    | ``true``         | `Disable/Enable OCR`_             |
++---------------------------------------+------------------+-----------------------------------+
+| ``fs.ocr.language``                   | ``"eng"``        | `OCR Language`_                   |
++---------------------------------------+------------------+-----------------------------------+
+| ``fs.ocr.path``                       | ``null``         | `OCR Path`_                       |
++---------------------------------------+------------------+-----------------------------------+
+| ``fs.ocr.data_path``                  | ``null``         | `OCR Data Path`_                  |
++---------------------------------------+------------------+-----------------------------------+
+| ``fs.ocr.output_type``                | ``txt``          | `OCR Output Type`_                |
++---------------------------------------+------------------+-----------------------------------+
+| ``fs.ocr.pdf_strategy``               | ``ocr_and_text`` | `OCR PDF Strategy`_               |
++---------------------------------------+------------------+-----------------------------------+
+| ``fs.ocr.page_seg_mode``              | ``1``            | `OCR Page Seg Mode`_              |
++---------------------------------------+------------------+-----------------------------------+
+| ``fs.ocr.preserve_interword_spacing`` | ``false``        | `OCR Preserve Interword Spacing`_ |
++---------------------------------------+------------------+-----------------------------------+
+
+
 
 Disable/Enable OCR
 ------------------
@@ -160,3 +166,28 @@ Supported strategies are:
 
 .. note:: When omitted, ``ocr_and_text`` value is used. If you have performance issues, it's worth using the ``auto`` option
     instead as only documents with barely no text will go through the OCR process.
+
+OCR Page Seg Mode
+-----------------
+
+Set Tesseract to only run a subset of layout analysis and assume a certain form of image. The supported values are:
+
+* ``0`` = Orientation and script detection (OSD) only.
+* ``1`` = Automatic page segmentation with OSD.
+* ``2`` = Automatic page segmentation, but no OSD, or OCR. (not implemented)
+* ``3`` = Fully automatic page segmentation, but no OSD.
+* ``4`` = Assume a single column of text of variable sizes.
+* ``5`` = Assume a single uniform block of vertically aligned text.
+* ``6`` = Assume a single uniform block of text.
+* ``7`` = Treat the image as a single text line.
+* ``8`` = Treat the image as a single word.
+* ``9`` = Treat the image as a single word in a circle.
+* ``10`` = Treat the image as a single character.
+* ``11`` = Sparse text. Find as much text as possible in no particular order.
+* ``12`` = Sparse text with OSD.
+* ``13`` = Raw line. Treat the image as a single text line, bypassing hacks that are Tesseract-specific.
+
+OCR Preserve Interword Spacing
+------------------------------
+
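+Set to ``true`` to ask Tesseract to preserve the spacing between words.
+With the default value (``false``), extra spaces between the words are not kept in the extracted text.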
diff --git a/settings/src/main/java/fr/pilato/elasticsearch/crawler/fs/settings/Ocr.java b/settings/src/main/java/fr/pilato/elasticsearch/crawler/fs/settings/Ocr.java
@@ -34,6 +34,9 @@ public class Ocr {
     private boolean enabled = true;
     // Pdf OCR Strategy
     private String pdfStrategy = "ocr_and_text";
+    // OCR Page Segmentation Mode and Preserve Interword Spacing (null: the setting is not passed to Tesseract)
+    private Integer pageSegMode = null;
+    private Boolean preserveInterwordSpacing = null;
 
     public static Builder builder() {
         return new Builder();
@@ -47,6 +50,8 @@ public static class Builder {
         private String outputType = null;
         private boolean enabled = true;
         private String pdfStrategy = "ocr_and_text";
+        private Integer pageSegMode = null;
+        private Boolean preserveInterwordSpacing = null;
 
         public Builder setLanguage(String language) {
             this.language = language;
@@ -73,6 +78,16 @@ public Builder setEnabled(boolean enabled) {
             return this;
         }
 
+        public Builder setPageSegMode(Integer pageSegMode) {
+            this.pageSegMode = pageSegMode;
+            return this;
+        }
+
+        public Builder setPreserveInterwordSpacing(Boolean preserveInterwordSpacing) {
+            this.preserveInterwordSpacing = preserveInterwordSpacing;
+            return this;
+        }
+
         /**
          * Set the PDF Strategy.
          * @param pdfStrategy the PDF Strategy. Could be "no_ocr", "auto", "ocr_only" or "ocr_and_text"
@@ -83,7 +98,7 @@ public Builder setPdfStrategy(String pdfStrategy) {
         }
 
         public Ocr build() {
-            return new Ocr(language, path, dataPath, outputType, pdfStrategy, enabled);
+            return new Ocr(language, path, dataPath, outputType, pdfStrategy, enabled, pageSegMode, preserveInterwordSpacing);
         }
 
     }
@@ -92,13 +107,15 @@ public Ocr( ) {
 
     }
 
-    private Ocr(String language, String path, String dataPath, String outputType, String pdfStrategy, boolean enabled) {
+    private Ocr(String language, String path, String dataPath, String outputType, String pdfStrategy, boolean enabled, Integer pageSegMode, Boolean preserveInterwordSpacing) {
         this.language = language;
         this.path = path;
         this.dataPath = dataPath;
         this.outputType = outputType;
         this.pdfStrategy = pdfStrategy;
         this.enabled = enabled;
+        this.pageSegMode = pageSegMode;
+        this.preserveInterwordSpacing = preserveInterwordSpacing;
     }
 
     public String getLanguage() {
@@ -141,6 +158,22 @@ public void setEnabled(boolean enabled) {
         this.enabled = enabled;
     }
 
+    public Integer getPageSegMode() {
+        return pageSegMode;
+    }
+
+    public Boolean getPreserveInterwordSpacing() {
+        return this.preserveInterwordSpacing;
+    }
+
+    public void setPreserveInterwordSpacing( Boolean preserveInterwordSpacing) {
+        this.preserveInterwordSpacing = preserveInterwordSpacing;
+    }
+
+    public void setPageSegMode( Integer pageSegMode) {
+        this.pageSegMode = pageSegMode;
+    }
+
     /**
      * Get the PDF Strategy. Could be "no_ocr", "auto", "ocr_only" or "ocr_and_text" (default)
      * @return the PDF Strategy
@@ -167,12 +200,14 @@ public boolean equals(Object o) {
                 Objects.equals(path, ocr.path) &&
                 Objects.equals(dataPath, ocr.dataPath) &&
                 Objects.equals(outputType, ocr.outputType) &&
-                Objects.equals(pdfStrategy, ocr.pdfStrategy);
+                Objects.equals(pdfStrategy, ocr.pdfStrategy) &&
+                Objects.equals(pageSegMode, ocr.pageSegMode) &&
+                Objects.equals(preserveInterwordSpacing, ocr.preserveInterwordSpacing);
     }
 
     @Override
     public int hashCode() {
-        return Objects.hash(language, path, dataPath, outputType, enabled, pdfStrategy);
+        return Objects.hash(language, path, dataPath, outputType, enabled, pdfStrategy, pageSegMode, preserveInterwordSpacing);
     }
 
     @Override
@@ -183,6 +218,8 @@ public String toString() {
                 ", outputType='" + outputType + '\'' +
                 ", enabled=" + enabled +
                 ", pdfStrategy='" + pdfStrategy + '\'' +
+                ", pageSegMode='" + pageSegMode + '\'' +
+                ", preserveInterwordSpacing='" + preserveInterwordSpacing + '\'' +
                 '}';
     }
 }
diff --git a/tika/src/main/java/fr/pilato/elasticsearch/crawler/fs/tika/TikaInstance.java b/tika/src/main/java/fr/pilato/elasticsearch/crawler/fs/tika/TikaInstance.java
@@ -180,6 +180,14 @@ private static void initContext(Fs fs) {
                 TesseractOCRConfig config = new TesseractOCRConfig();
                 logger.debug("Tesseract Language set to [{}].", fs.getOcr().getLanguage());
                 config.setLanguage(fs.getOcr().getLanguage());
+                if (fs.getOcr().getPageSegMode() != null) {
+                    logger.debug("Tesseract PageSegMode set to [{}].", fs.getOcr().getPageSegMode());
+                    config.setPageSegMode("" + fs.getOcr().getPageSegMode());
+                }
+                if (fs.getOcr().getPreserveInterwordSpacing() != null) {
+                    logger.debug("Tesseract preserveInterwordSpacing set to [{}].", fs.getOcr().getPreserveInterwordSpacing());
+                    config.setPreserveInterwordSpacing(fs.getOcr().getPreserveInterwordSpacing());
+                }
                 if (fs.getOcr().getOutputType() != null) {
                     logger.debug("Tesseract Output Type set to [{}].", fs.getOcr().getOutputType());
                     config.setOutputType(fs.getOcr().getOutputType());