Skip to content

Commit

Permalink
Merge pull request #1510 from macb74/feature/add_psm_option
Browse files Browse the repository at this point in the history
add "Preserve Interword Spacing" and "Page Seg Mode"
  • Loading branch information
dadoonet authored Sep 14, 2024
2 parents 669895d + 893a1e1 commit 6fd3502
Show file tree
Hide file tree
Showing 3 changed files with 95 additions and 19 deletions.
61 changes: 46 additions & 15 deletions docs/source/user/ocr.rst
Original file line number Diff line number Diff line change
Expand Up @@ -17,21 +17,27 @@ OCR settings

Here is a list of OCR settings (under ``fs.ocr`` prefix):

+-------------------------+------------------+------------------------------------+
| Name | Default value | Documentation |
+=========================+==================+====================================+
| ``fs.ocr.enabled`` | ``true`` | `Disable/Enable OCR`_ |
+-------------------------+------------------+------------------------------------+
| ``fs.ocr.language`` | ``"eng"`` | `OCR Language`_ |
+-------------------------+------------------+------------------------------------+
| ``fs.ocr.path`` | ``null`` | `OCR Path`_ |
+-------------------------+------------------+------------------------------------+
| ``fs.ocr.data_path`` | ``null`` | `OCR Data Path`_ |
+-------------------------+------------------+------------------------------------+
| ``fs.ocr.output_type`` | ``txt`` | `OCR Output Type`_ |
+-------------------------+------------------+------------------------------------+
| ``fs.ocr.pdf_strategy`` | ``ocr_and_text`` | `OCR PDF Strategy`_ |
+-------------------------+------------------+------------------------------------+
+---------------------------------------+------------------+-----------------------------------+
| Name                                  | Default value    | Documentation                     |
+=======================================+==================+===================================+
| ``fs.ocr.enabled``                    | ``true``         | `Disable/Enable OCR`_             |
+---------------------------------------+------------------+-----------------------------------+
| ``fs.ocr.language``                   | ``"eng"``        | `OCR Language`_                   |
+---------------------------------------+------------------+-----------------------------------+
| ``fs.ocr.path``                       | ``null``         | `OCR Path`_                       |
+---------------------------------------+------------------+-----------------------------------+
| ``fs.ocr.data_path``                  | ``null``         | `OCR Data Path`_                  |
+---------------------------------------+------------------+-----------------------------------+
| ``fs.ocr.output_type``                | ``txt``          | `OCR Output Type`_                |
+---------------------------------------+------------------+-----------------------------------+
| ``fs.ocr.pdf_strategy``               | ``ocr_and_text`` | `OCR PDF Strategy`_               |
+---------------------------------------+------------------+-----------------------------------+
| ``fs.ocr.page_seg_mode``              | ``1``            | `OCR Page Seg Mode`_              |
+---------------------------------------+------------------+-----------------------------------+
| ``fs.ocr.preserve_interword_spacing`` | ``false``        | `OCR Preserve Interword Spacing`_ |
+---------------------------------------+------------------+-----------------------------------+



Disable/Enable OCR
------------------
Expand Down Expand Up @@ -160,3 +166,28 @@ Supported strategies are:

.. note:: When omitted, ``ocr_and_text`` value is used. If you have performance issues, it's worth using the ``auto`` option
instead as only documents with barely any text will go through the OCR process.

OCR Page Seg Mode
-----------------

Set Tesseract to only run a subset of layout analysis and assume a certain form of image. Supported values for ``fs.ocr.page_seg_mode`` are:

* ``0`` = Orientation and script detection (OSD) only.
* ``1`` = Automatic page segmentation with OSD.
* ``2`` = Automatic page segmentation, but no OSD, or OCR. (not implemented)
* ``3`` = Fully automatic page segmentation, but no OSD.
* ``4`` = Assume a single column of text of variable sizes.
* ``5`` = Assume a single uniform block of vertically aligned text.
* ``6`` = Assume a single uniform block of text.
* ``7`` = Treat the image as a single text line.
* ``8`` = Treat the image as a single word.
* ``9`` = Treat the image as a single word in a circle.
* ``10`` = Treat the image as a single character.
* ``11`` = Sparse text. Find as much text as possible in no particular order.
* ``12`` = Sparse text with OSD.
* ``13`` = Raw line. Treat the image as a single text line, bypassing hacks that are Tesseract-specific.

OCR Preserve Interword Spacing
------------------------------

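When set to ``true``, Tesseract preserves the spacing it finds between words in the extracted text. When omitted or ``false``, this spacing is not preserved.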
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,9 @@ public class Ocr {
private boolean enabled = true;
// Pdf OCR Strategy
private String pdfStrategy = "ocr_and_text";
// Tesseract Page Seg Mode and Preserve Interword Spacing (null means "not set")
private Integer pageSegMode = null;
private Boolean preserveInterwordSpacing = null;

public static Builder builder() {
return new Builder();
Expand All @@ -47,6 +50,8 @@ public static class Builder {
private String outputType = null;
private boolean enabled = true;
private String pdfStrategy = "ocr_and_text";
private Integer pageSegMode = null;
private Boolean preserveInterwordSpacing = null;

public Builder setLanguage(String language) {
this.language = language;
Expand All @@ -73,6 +78,16 @@ public Builder setEnabled(boolean enabled) {
return this;
}

/**
* Set the Tesseract page segmentation mode.
* @param pageSegMode the page segmentation mode, or {@code null} to leave it unset
* @return this builder, so that calls can be chained
*/
public Builder setPageSegMode(Integer pageSegMode) {
this.pageSegMode = pageSegMode;
return this;
}

/**
* Set whether Tesseract should preserve the spacing between words.
* @param preserveInterwordSpacing the flag value, or {@code null} to leave it unset
* @return this builder, so that calls can be chained
*/
public Builder setPreserveInterwordSpacing(Boolean preserveInterwordSpacing) {
this.preserveInterwordSpacing = preserveInterwordSpacing;
return this;
}

/**
* Set the PDF Strategy.
* @param pdfStrategy the PDF Strategy. Could be "no_ocr", "auto", "ocr_only" or "ocr_and_text"
Expand All @@ -83,7 +98,7 @@ public Builder setPdfStrategy(String pdfStrategy) {
}

public Ocr build() {
return new Ocr(language, path, dataPath, outputType, pdfStrategy, enabled);
return new Ocr(language, path, dataPath, outputType, pdfStrategy, enabled, pageSegMode, preserveInterwordSpacing);
}

}
Expand All @@ -92,13 +107,15 @@ public Ocr( ) {

}

private Ocr(String language, String path, String dataPath, String outputType, String pdfStrategy, boolean enabled) {
private Ocr(String language, String path, String dataPath, String outputType, String pdfStrategy, boolean enabled, Integer pageSegMode, Boolean preserveInterwordSpacing) {
this.language = language;
this.path = path;
this.dataPath = dataPath;
this.outputType = outputType;
this.pdfStrategy = pdfStrategy;
this.enabled = enabled;
this.pageSegMode = pageSegMode;
this.preserveInterwordSpacing = preserveInterwordSpacing;
}

public String getLanguage() {
Expand Down Expand Up @@ -141,6 +158,22 @@ public void setEnabled(boolean enabled) {
this.enabled = enabled;
}

/**
* Get the Tesseract page segmentation mode.
* @return the page segmentation mode, or {@code null} when none has been set
*/
public Integer getPageSegMode() {
return pageSegMode;
}

/**
* Get the "preserve interword spacing" flag.
* @return the flag value, or {@code null} when none has been set
*/
public Boolean getPreserveInterwordSpacing() {
return this.preserveInterwordSpacing;
}

/**
* Set the "preserve interword spacing" flag.
* @param preserveInterwordSpacing the flag value, or {@code null} to leave it unset
*/
public void setPreserveInterwordSpacing( Boolean preserveInterwordSpacing) {
this.preserveInterwordSpacing = preserveInterwordSpacing;
}

/**
* Set the Tesseract page segmentation mode.
* @param pageSegMode the page segmentation mode, or {@code null} to leave it unset
*/
public void setPageSegMode( Integer pageSegMode) {
this.pageSegMode = pageSegMode;
}

/**
* Get the PDF Strategy. Could be "no_ocr", "auto", "ocr_only" or "ocr_and_text" (default)
* @return the PDF Strategy
Expand All @@ -167,12 +200,14 @@ public boolean equals(Object o) {
Objects.equals(path, ocr.path) &&
Objects.equals(dataPath, ocr.dataPath) &&
Objects.equals(outputType, ocr.outputType) &&
Objects.equals(pdfStrategy, ocr.pdfStrategy);
Objects.equals(pdfStrategy, ocr.pdfStrategy) &&
Objects.equals(pageSegMode, ocr.pageSegMode) &&
Objects.equals(preserveInterwordSpacing, ocr.preserveInterwordSpacing);
}

@Override
public int hashCode() {
return Objects.hash(language, path, dataPath, outputType, enabled, pdfStrategy);
return Objects.hash(language, path, dataPath, outputType, enabled, pdfStrategy, pageSegMode, preserveInterwordSpacing);
}

@Override
Expand All @@ -183,6 +218,8 @@ public String toString() {
", outputType='" + outputType + '\'' +
", enabled=" + enabled +
", pdfStrategy='" + pdfStrategy + '\'' +
", pageSegMode='" + pageSegMode + '\'' +
", preserveInterwordSpacing='" + preserveInterwordSpacing + '\'' +
'}';
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,14 @@ private static void initContext(Fs fs) {
TesseractOCRConfig config = new TesseractOCRConfig();
logger.debug("Tesseract Language set to [{}].", fs.getOcr().getLanguage());
config.setLanguage(fs.getOcr().getLanguage());
if (fs.getOcr().getPageSegMode() != null) {
logger.debug("Tesseract PageSegMode set to [{}].", fs.getOcr().getPageSegMode());
config.setPageSegMode("" + fs.getOcr().getPageSegMode());
}
if (fs.getOcr().getPreserveInterwordSpacing() != null) {
logger.debug("Tesseract preserveInterwordSpacing set to [{}].", fs.getOcr().getPreserveInterwordSpacing());
config.setPreserveInterwordSpacing(fs.getOcr().getPreserveInterwordSpacing());
}
if (fs.getOcr().getOutputType() != null) {
logger.debug("Tesseract Output Type set to [{}].", fs.getOcr().getOutputType());
config.setOutputType(fs.getOcr().getOutputType());
Expand Down

0 comments on commit 6fd3502

Please sign in to comment.