feat: Add Support for Layout Parser Documents (#334)
- Adds `Document.chunks` - an iterator of `documentai.Document.ChunkedDocument.Chunk`
- Adds `Document.document_layout_blocks` - a flattened (un-nested) iterator of `documentai.Document.DocumentLayout.DocumentLayoutBlock`
- Updates `Document.text` to support Document AI Layout Parser output (see the usage sketch below)
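
A brief usage sketch of the new surface; the input path is a placeholder, and the calls mirror the wrapper API shown in the diff below rather than a verified end-to-end flow:

```python
from google.cloud import documentai
from google.cloud.documentai_toolbox import document

# Placeholder path: any Document JSON produced by a Layout Parser processor.
with open("layout_parser_output.json", "r", encoding="utf-8") as f:
    docai_document = documentai.Document.from_json(f.read())

wrapped = document.Document.from_documentai_document(documentai_document=docai_document)

# New in this change: Layout Parser chunks and flattened layout blocks.
for chunk in wrapped.chunks:
    print(chunk.chunk_id, chunk.content[:40])

for block in wrapped.document_layout_blocks:
    if block.text_block:
        print(block.block_id, block.text_block.type_, block.text_block.text)
```
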
holtskinner committed Jul 16, 2024
1 parent 2352cae commit c877b22
Showing 6 changed files with 100 additions and 4 deletions.
44 changes: 42 additions & 2 deletions google/cloud/documentai_toolbox/wrappers/document.py
@@ -15,13 +15,14 @@
#
"""Wrappers for Document AI Document type."""

import collections
import copy
import dataclasses
from functools import cached_property
import glob
import os
import re
-from typing import Dict, List, Optional, Type, Union
+from typing import Dict, Iterator, List, Optional, Type, Union

from google.api_core.client_options import ClientOptions
from google.api_core.operation import from_gapic as operation_from_gapic
@@ -38,6 +39,33 @@
from google.cloud.documentai_toolbox.wrappers.page import FormField, Page


def _chunks_from_shards(
    shards: List[documentai.Document],
) -> Iterator[documentai.Document.ChunkedDocument.Chunk]:
    for shard in shards:
        for chunk in shard.chunked_document.chunks:
            yield chunk


def _document_layout_blocks_from_shards(
    shards: List[documentai.Document],
) -> Iterator[documentai.Document.DocumentLayout.DocumentLayoutBlock]:
    def extract_blocks(
        blocks: List[documentai.Document.DocumentLayout.DocumentLayoutBlock],
    ) -> Iterator[documentai.Document.DocumentLayout.DocumentLayoutBlock]:
        queue = collections.deque(blocks)

        while queue:
            block = queue.popleft()
            yield block
            # Push nested blocks onto the front of the deque, reversed, so the
            # traversal stays depth-first and preserves document order
            if block.text_block and block.text_block.blocks:
                queue.extendleft(reversed(block.text_block.blocks))

    for shard in shards:
        yield from extract_blocks(shard.document_layout.blocks)


def _entities_from_shards(
shards: List[documentai.Document],
) -> List[Entity]:
@@ -379,7 +407,11 @@ class Document:
        pages (List[Page]):
            A list of `Pages` in the `Document`.
        entities (List[Entity]):
-            A list of `Entities` in the `Document`.
+            A list of un-nested `Entities` in the `Document`.
        chunks (Iterator[documentai.Document.ChunkedDocument.Chunk]):
            An iterator of document chunks produced by the Layout Parser.
        document_layout_blocks (Iterator[documentai.Document.DocumentLayout.DocumentLayoutBlock]):
            An iterator of flattened (un-nested) document layout blocks produced by the Layout Parser.
        text (str):
            The full text of the `Document`.
    """
@@ -398,6 +430,14 @@ def pages(self):
    def entities(self):
        return _entities_from_shards(shards=self.shards)

    @cached_property
    def chunks(self):
        return _chunks_from_shards(shards=self.shards)

    @cached_property
    def document_layout_blocks(self):
        return _document_layout_blocks_from_shards(shards=self.shards)

    @cached_property
    def text(self):
        return "".join(shard.text for shard in self.shards)
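
The block flattening above is a pre-order, depth-first walk: `extendleft(reversed(...))` pushes a block's children onto the front of the deque so they are yielded before later siblings. A self-contained sketch of the same pattern on a hypothetical nested structure (not part of this change):

```python
from collections import deque
from dataclasses import dataclass, field
from typing import Iterator, List


@dataclass
class Node:
    """Hypothetical stand-in for a nested DocumentLayoutBlock."""

    name: str
    children: List["Node"] = field(default_factory=list)


def flatten(nodes: List[Node]) -> Iterator[Node]:
    queue = deque(nodes)
    while queue:
        node = queue.popleft()
        yield node
        # Reversed extendleft puts the first child at the front of the deque,
        # so the next popleft descends into it: pre-order, in document order.
        queue.extendleft(reversed(node.children))


tree = [Node("1", [Node("1.1"), Node("1.2", [Node("1.2.1")])]), Node("2")]
assert [n.name for n in flatten(tree)] == ["1", "1.1", "1.2", "1.2.1", "2"]
```
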
15 changes: 15 additions & 0 deletions samples/snippets/quickstart_sample.py
@@ -1,3 +1,4 @@
# flake8: noqa: C901
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -127,6 +128,20 @@ def quickstart_sample(
        if entity.normalized_text:
            print(f"\tNormalized Text: {entity.normalized_text}")

    # chunks and document_layout_blocks are only populated for Layout Parser output
    for chunk in wrapped_document.chunks:
        print(f"Chunk {chunk.chunk_id}: {chunk.content}")

    for block in wrapped_document.document_layout_blocks:
        print(f"Document Layout Block {block.block_id}")

        if block.text_block:
            print(f"{block.text_block.type_}: {block.text_block.text}")
        if block.list_block:
            print(f"{block.list_block.type_}: {block.list_block.list_entries}")
        if block.table_block:
            print(block.table_block.header_rows, block.table_block.body_rows)

    # [END documentai_toolbox_quickstart]

    return wrapped_document
2 changes: 1 addition & 1 deletion setup.py
@@ -60,7 +60,7 @@
"proto-plus>=1.22.3, <2.0.0dev",
"grpc-google-iam-v1>=0.12.6, <1.0.0dev",
"google-cloud-bigquery>=3.5.0, <4.0.0dev",
"google-cloud-documentai>=2.20.0, <3.0.0dev",
"google-cloud-documentai>=2.29.2, <3.0.0dev",
"google-cloud-storage>=1.31.0, <3.0.0dev",
"google-cloud-vision>=2.7.0, <4.0.0dev",
"numpy>=1.23.5, <2.0.0",
2 changes: 1 addition & 1 deletion testing/constraints-3.8.txt
@@ -10,7 +10,7 @@ pandas==2.0.0
proto-plus==1.22.3
grpc-google-iam-v1==0.12.6
google-cloud-bigquery==3.5.0
-google-cloud-documentai==2.20.0
+google-cloud-documentai==2.29.2
google-cloud-storage==2.7.0
pandas-gbq==0.21.0
numpy==1.23.5
1 change: 1 addition & 0 deletions tests/unit/resources/layout_parser/layout_parser.json

Large diffs are not rendered by default.
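
Since the fixture itself isn't rendered, here is a minimal in-memory sketch of the shape the new properties consume (hypothetical values, not the actual test resource; it assumes `from_documentai_document` accepts a bare in-memory `Document` and that proto-plus accepts dicts for nested message fields):

```python
from google.cloud import documentai
from google.cloud.documentai_toolbox import document

# Hypothetical miniature Layout Parser output: one chunk, one layout block.
layout_doc = documentai.Document(
    chunked_document={"chunks": [{"chunk_id": "c1", "content": "CHAPTER I"}]},
    document_layout={
        "blocks": [{"block_id": "1", "text_block": {"text": "CHAPTER I"}}]
    },
)

wrapped = document.Document.from_documentai_document(documentai_document=layout_doc)

assert [c.chunk_id for c in wrapped.chunks] == ["c1"]
assert [b.block_id for b in wrapped.document_layout_blocks] == ["1"]
```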

40 changes: 40 additions & 0 deletions tests/unit/test_document.py
@@ -395,6 +395,46 @@ def test_document_from_documentai_document_with_single_shard():
    assert len(actual.text) > 0


def test_document_from_documentai_document_layout_parser():
    with open(
        "tests/unit/resources/layout_parser/layout_parser.json", "r", encoding="utf-8"
    ) as f:
        doc = documentai.Document.from_json(f.read())

    actual = document.Document.from_documentai_document(documentai_document=doc)

    chunk_list = list(actual.chunks)
    assert len(chunk_list) == 2
    assert chunk_list[0].chunk_id == "c1"
    assert "CHAPTER I" in chunk_list[0].content
    assert chunk_list[0].page_span.page_start == 1
    assert chunk_list[0].page_span.page_end == 8

    assert chunk_list[1].chunk_id == "c2"
    assert "Was that me?" in chunk_list[1].content
    assert chunk_list[1].page_span.page_start == 8
    assert chunk_list[1].page_span.page_end == 15

    block_list = list(actual.document_layout_blocks)

    for i, block in enumerate(block_list, start=1):
        assert int(block.block_id) == i

    assert len(block_list) == 175
    assert block_list[0].block_id == "1"
    assert block_list[0].text_block.text == "CHAPTER I"
    assert block_list[0].text_block.type_ == "heading-1"
    assert block_list[0].text_block.blocks
    assert block_list[0].page_span.page_start == 1
    assert block_list[0].page_span.page_end == 8

    assert block_list[1].block_id == "2"
    assert block_list[1].text_block.text == "IN WHICH We Are Introduced to"
    assert block_list[1].text_block.type_ == "paragraph"
    assert block_list[1].page_span.page_start == 1
    assert block_list[1].page_span.page_end == 1


def test_document_from_gcs_with_single_shard(get_bytes_single_file_mock):
    actual = document.Document.from_gcs(
        gcs_bucket_name="test-directory", gcs_prefix="documentai/output/123456789/0/"
