Skip to content

Commit

Permalink
[Reconstitution] Improve reconstitution (#1750)
Browse files Browse the repository at this point in the history
  • Loading branch information
felixdittrich92 authored Oct 21, 2024
1 parent 2ca3928 commit d5dbc73
Show file tree
Hide file tree
Showing 3 changed files with 198 additions and 72 deletions.
12 changes: 10 additions & 2 deletions doctr/io/elements.py
Original file line number Diff line number Diff line change
Expand Up @@ -310,6 +310,10 @@ def show(self, interactive: bool = True, preserve_aspect_ratio: bool = False, **
def synthesize(self, **kwargs) -> np.ndarray:
"""Synthesize the page from the predictions
Args:
----
**kwargs: keyword arguments passed to the `synthesize_page` method
Returns
-------
synthesized page
Expand Down Expand Up @@ -493,7 +497,7 @@ def synthesize(self, **kwargs) -> np.ndarray:
Args:
----
**kwargs: keyword arguments passed to the matplotlib.pyplot.show method
**kwargs: keyword arguments passed to the `synthesize_kie_page` method
Returns:
-------
Expand Down Expand Up @@ -603,11 +607,15 @@ def show(self, **kwargs) -> None:
def synthesize(self, **kwargs) -> List[np.ndarray]:
    """Synthesize all pages from their predictions

    Args:
    ----
        **kwargs: keyword arguments passed to the `Page.synthesize` method

    Returns:
    -------
        list of synthesized pages
    """
    # Forward the caller's options to every page so that they are not silently dropped
    return [page.synthesize(**kwargs) for page in self.pages]

def export_as_xml(self, **kwargs) -> List[Tuple[bytes, ET.ElementTree]]:
"""Export the document as XML (hOCR-format)
Expand Down
216 changes: 151 additions & 65 deletions doctr/utils/reconstitution.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

# This program is licensed under the Apache License 2.0.
# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
import logging
from typing import Any, Dict, Optional

import numpy as np
Expand All @@ -13,61 +14,163 @@
__all__ = ["synthesize_page", "synthesize_kie_page"]


# Module-level flag: flipped to True once the rotation warning has been logged
ROTATION_WARNING = False


def _warn_rotation(entry: Dict[str, Any]) -> None:  # pragma: no cover
    """Log a one-time warning when an entry carries a 4-point (polygon) geometry.

    Args:
    ----
        entry: exported element; its "geometry" value is only inspected while no warning has been logged yet
    """
    global ROTATION_WARNING
    # Nothing to do once the warning was emitted, or when the geometry is not made of 4 points
    if ROTATION_WARNING or len(entry["geometry"]) != 4:
        return
    logging.warning("Polygons with larger rotations will lead to inaccurate rendering")
    ROTATION_WARNING = True


def _synthesize(
    response: Image.Image,
    entry: Dict[str, Any],
    w: int,
    h: int,
    draw_proba: bool = False,
    font_family: Optional[str] = None,
    smoothing_factor: float = 0.75,
    min_font_size: int = 6,
    max_font_size: int = 50,
) -> Image.Image:
    """Draw the text of a single exported entry (word, line or KIE prediction) onto the page image.

    Args:
    ----
        response: image to draw on; it is modified in place and returned
        entry: exported element with a "geometry" (2 corner points or a polygon, in relative coordinates)
            and either a "value" or a list of "words"
        w: width of the page, used to scale relative x coordinates
        h: height of the page, used to scale relative y coordinates
        draw_proba: if True, outline the entry with a confidence-based color (blue: p=1, red: p=0)
            and write the confidence value above it
        font_family: family of the font
        smoothing_factor: multiplicative factor applied to the font size until the text fits its box
        min_font_size: lower bound used when shrinking the font size
        max_font_size: upper bound of the initial font size

    Returns:
    -------
        the image with the entry drawn on it
    """
    if len(entry["geometry"]) == 2:
        (xmin, ymin), (xmax, ymax) = entry["geometry"]
        polygon = [(xmin, ymin), (xmax, ymin), (xmax, ymax), (xmin, ymax)]
    else:
        polygon = entry["geometry"]

    # Calculate the bounding box of the word
    x_coords, y_coords = zip(*polygon)
    xmin, ymin, xmax, ymax = (
        int(round(w * min(x_coords))),
        int(round(h * min(y_coords))),
        int(round(w * max(x_coords))),
        int(round(h * max(y_coords))),
    )
    word_width = xmax - xmin
    word_height = ymax - ymin

    # If lines are provided instead of words, concatenate the word entries
    if "words" in entry:
        word_text = " ".join(word["value"] for word in entry["words"])
    else:
        word_text = entry["value"]

    # Find the optimal font size: start from the box height and shrink until the text fits
    try:
        font_size = min(word_height, max_font_size)
        font = get_font(font_family, font_size)
        text_width, text_height = font.getbbox(word_text)[2:4]

        while (text_width > word_width or text_height > word_height) and font_size > min_font_size:
            font_size = max(int(font_size * smoothing_factor), min_font_size)
            font = get_font(font_family, font_size)
            text_width, text_height = font.getbbox(word_text)[2:4]
    except ValueError:
        font = get_font(font_family, min_font_size)

    # Draw the word text
    d = ImageDraw.Draw(response)
    try:
        try:
            d.text((xmin, ymin), word_text, font=font, fill=(0, 0, 0), anchor="lt")
        except UnicodeEncodeError:
            # When a character cannot be encoded, use its anyascii version
            d.text((xmin, ymin), anyascii(word_text), font=font, fill=(0, 0, 0), anchor="lt")
    # Catch generic exceptions to avoid crashing the whole rendering
    except Exception:  # pragma: no cover
        logging.warning(f"Could not render word: {word_text}")

    if draw_proba:
        if "confidence" in entry:
            confidence = entry["confidence"]
        else:
            # Entries without their own confidence get the mean confidence of their words
            word_confidences = [word["confidence"] for word in entry["words"]]
            # No words means nothing to average: fall back to 0.0 instead of dividing by zero
            confidence = sum(word_confidences) / len(word_confidences) if word_confidences else 0.0
        p = int(255 * confidence)
        color = (255 - p, 0, p)  # Red to blue gradient based on probability
        d.rectangle([(xmin, ymin), (xmax, ymax)], outline=color, width=2)

        prob_font = get_font(font_family, 20)
        prob_text = f"{confidence:.2f}"
        prob_text_width, prob_text_height = prob_font.getbbox(prob_text)[2:4]

        # Position the probability slightly above the bounding box, without leaving the page
        prob_x_offset = (word_width - prob_text_width) // 2
        prob_y_offset = max(0, ymin - prob_text_height - 2)

        d.text((xmin + prob_x_offset, prob_y_offset), prob_text, font=prob_font, fill=color, anchor="lt")

    return response


def synthesize_page(
    page: Dict[str, Any],
    draw_proba: bool = False,
    font_family: Optional[str] = None,
    smoothing_factor: float = 0.95,
    min_font_size: int = 8,
    max_font_size: int = 50,
) -> np.ndarray:
    """Draw the content of the element page (OCR response) on a blank page.

    Args:
    ----
        page: exported Page object to represent
        draw_proba: if True, outline each rendered entry in a color representing its confidence
            (blue: p=1, red: p=0) and write the confidence value above it
        font_family: family of the font
        smoothing_factor: factor to smooth the font size
        min_font_size: minimum font size
        max_font_size: maximum font size

    Returns:
    -------
        the synthesized page
    """
    # Draw template
    h, w = page["dimensions"]
    response = Image.new("RGB", (w, h), color=(255, 255, 255))

    for block in page["blocks"]:
        lines = block["lines"]
        # If lines are provided use these to get better rendering results, otherwise draw each word
        if len(lines) > 1:
            entries = lines
        else:
            entries = [word for line in lines for word in line["words"]]

        # The warning depends only on the block, so one call per non-empty block is enough
        if lines:
            _warn_rotation(block)  # pragma: no cover

        for entry in entries:
            response = _synthesize(
                response=response,
                entry=entry,
                w=w,
                h=h,
                draw_proba=draw_proba,
                font_family=font_family,
                smoothing_factor=smoothing_factor,
                min_font_size=min_font_size,
                max_font_size=max_font_size,
            )

    return np.array(response, dtype=np.uint8)


def synthesize_kie_page(
Expand All @@ -81,46 +184,29 @@ def synthesize_kie_page(
----
page: exported Page object to represent
draw_proba: if True, draw words in colors to represent confidence. Blue: p=1, red: p=0
font_size: size of the font, default font = 13
font_family: family of the font
smoothing_factor: factor to smooth the font size
min_font_size: minimum font size
max_font_size: maximum font size
Returns:
-------
the synthesized page
"""
# Draw template
h, w = page["dimensions"]
response = 255 * np.ones((h, w, 3), dtype=np.int32)
response = Image.new("RGB", (w, h), color=(255, 255, 255))

# Draw each word
for predictions in page["predictions"].values():
for prediction in predictions:
# Get absolute word geometry
(xmin, ymin), (xmax, ymax) = prediction["geometry"]
xmin, xmax = int(round(w * xmin)), int(round(w * xmax))
ymin, ymax = int(round(h * ymin)), int(round(h * ymax))

# White drawing context adapted to font size, 0.75 factor to convert pts --> pix
font = get_font(font_family, int(0.75 * (ymax - ymin)))
img = Image.new("RGB", (xmax - xmin, ymax - ymin), color=(255, 255, 255))
d = ImageDraw.Draw(img)
# Draw in black the value of the word
try:
d.text((0, 0), prediction["value"], font=font, fill=(0, 0, 0))
except UnicodeEncodeError:
# When character cannot be encoded, use its anyascii version
d.text((0, 0), anyascii(prediction["value"]), font=font, fill=(0, 0, 0))

# Colorize if draw_proba
if draw_proba:
p = int(255 * prediction["confidence"])
mask = np.where(np.array(img) == 0, 1, 0)
proba: np.ndarray = np.array([255 - p, 0, p])
color = mask * proba[np.newaxis, np.newaxis, :]
white_mask = 255 * (1 - mask)
img = color + white_mask

# Write to response page
response[ymin:ymax, xmin:xmax, :] = np.array(img)

return response
_warn_rotation(prediction) # pragma: no cover
response = _synthesize(
response=response,
entry=prediction,
w=w,
h=h,
draw_proba=draw_proba,
font_family=font_family,
)
return np.array(response, dtype=np.uint8)
42 changes: 37 additions & 5 deletions tests/common/test_utils_reconstitution.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,44 @@
import numpy as np
from test_io_elements import _mock_pages
from test_io_elements import _mock_kie_pages, _mock_pages

from doctr.utils import reconstitution


def test_synthesize_page():
    """Check that `synthesize_page` returns a uint8 RGB array of the page size in every rendering mode."""
    pages = _mock_pages()
    expected_shape = (*pages[0].dimensions, 3)

    def _check_render(render):
        # Shared checks: the rendering is an RGB uint8 array matching the page dimensions
        assert isinstance(render, np.ndarray)
        assert render.shape == expected_shape
        assert render.dtype == np.uint8

    # Test without probability rendering
    _check_render(reconstitution.synthesize_page(pages[0].export(), draw_proba=False))

    # Test with probability rendering
    _check_render(reconstitution.synthesize_page(pages[0].export(), draw_proba=True))

    # Test with only one line
    pages_one_line = pages[0].export()
    pages_one_line["blocks"][0]["lines"] = [pages_one_line["blocks"][0]["lines"][0]]
    _check_render(reconstitution.synthesize_page(pages_one_line, draw_proba=True))

    # Test with polygons
    pages_poly = pages[0].export()
    pages_poly["blocks"][0]["lines"][0]["geometry"] = [(0, 0), (0, 1), (1, 1), (1, 0)]
    _check_render(reconstitution.synthesize_page(pages_poly, draw_proba=True))


def test_synthesize_kie_page():
    """Check that `synthesize_kie_page` returns an RGB array of the page size, with and without probabilities."""
    pages = _mock_kie_pages()
    # Exercise the rendering first without, then with the probability overlay
    for with_proba in (False, True):
        rendering = reconstitution.synthesize_kie_page(pages[0].export(), draw_proba=with_proba)
        assert isinstance(rendering, np.ndarray)
        assert rendering.shape == (*pages[0].dimensions, 3)

0 comments on commit d5dbc73

Please sign in to comment.