-
Notifications
You must be signed in to change notification settings - Fork 1
/
up.py
145 lines (106 loc) · 4.01 KB
/
up.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
import fitz
from PIL import Image
from layoutparser import OCR, Document
# Render every page of the PDF to a JPEG file, then run OCR / layout
# extraction on the saved image and print the result per page.
#
# NOTE(review): `OCR` and `Document` are imported from layoutparser at the top
# of this file, but the published layoutparser API reference describes OCR
# agents such as `TesseractAgent` / `GCVAgent` and no `Document` class.
# Confirm these names (and `filter_blocks` / `set` used below) against the
# installed layoutparser version before relying on this snippet.
pdf_path = 'path/to/pdf/file.pdf'

# The render transform is identical for every page, so build it once.
# zoom=2.0 doubles the resolution on both axes.
zoom = 2.0
rotation = 0
trans = fitz.Matrix(zoom, zoom).prerotate(rotation)

# A single OCR agent is reused for all pages instead of one per page.
ocr_agent = OCR()

# The context manager closes the document even if a page fails to process.
with fitz.open(pdf_path) as doc:
    for page_number, page in enumerate(doc):
        # Rasterise the page without an alpha channel so the samples are
        # plain RGB bytes that PIL can wrap directly.
        pix = page.get_pixmap(matrix=trans, alpha=False)
        image = Image.frombytes('RGB', (pix.width, pix.height), pix.samples)
        image_path = f'page_{page_number+1}.jpg'
        image.save(image_path)

        # Extract layout information from the saved image.
        layout = ocr_agent.detect(image_path, return_response=True)

        # Wrap the layout in a document sized like the rendered image.
        document = Document(layout=layout, page_size=(image.width, image.height))

        # Second pass returns the text, which is attached to the text blocks.
        text = ocr_agent.detect(image_path)
        page_layout = layout[0]
        block = page_layout['layout']
        text_block = block.filter_blocks(lambda b, _: b['type'] == 'text')
        text_block.set(text=text, type='text')

        # Report the text and layout information for the page.
        print(f'Page {page_number + 1}:')
        print(f'Text: {text}')
        print(f'Layout: {page_layout}')
import fitz
import re
# Print every "key: value" pair found in the text of each PDF page.

# A key is a run of word characters followed by a colon. `.` does not match a
# newline, so a value stops at the end of its line. Note that `\s*` can span a
# line break, so a colon at the end of a line takes the next line as its value.
pattern = r'(\w+)\s*:\s*(.+)'

# Compile once up front rather than looking the pattern up for every page.
key_value_re = re.compile(pattern)

# The context manager closes the document when the scan is finished.
with fitz.open('path/to/pdf/file.pdf') as doc:
    for page in doc:
        # Get the text on the page.
        text = page.get_text()

        # Each match is a (key, value) tuple, one per capture group.
        matches = key_value_re.findall(text)
        for key, value in matches:
            print(f'{key}: {value}')
import textract
# Collect the "key: value" lines of the PDF's extracted text into a dict
# and print it.
pdf_path = "path/to/pdf/file.pdf"

# The extraction result is bytes; decode it once and split into lines.
text = textract.process(pdf_path)
lines = text.decode("utf-8").splitlines()

form_data = {}
for line in lines:
    # partition() cuts at the first colon only, so any further colons stay
    # in the value; an empty separator means the line has no colon at all.
    key, colon, value = line.partition(":")
    if not colon:
        continue
    # Surrounding whitespace is dropped from both sides of the pair; a
    # repeated key overwrites the earlier value.
    form_data[key.strip()] = value.strip()

# Show the extracted form data.
print(form_data)
import fitz
from layoutparser import OCR, Document
# Run OCR / layout extraction over a whole PDF and print the text and layout
# of each page.
#
# NOTE(review): `OCR` and `Document` are imported from layoutparser just above
# this snippet, but the published layoutparser API reference describes OCR
# agents such as `TesseractAgent` / `GCVAgent` (which take images, not a
# PyMuPDF document or page) and no `Document` class. Confirm these calls
# against the installed layoutparser version before relying on this snippet.
pdf_path = 'path/to/pdf/file.pdf'

ocr_agent = OCR()

# The context manager closes the document even if OCR fails part-way through.
with fitz.open(pdf_path) as doc:
    # Extract layout information for the whole document in one call.
    layout = ocr_agent.detect(doc, return_response=True)

    # The first page's rectangle is used as the page size for the document,
    # so this assumes the PDF has at least one page.
    first_page_rect = doc[0].rect
    document = Document(
        layout=layout,
        page_size=(first_page_rect.width, first_page_rect.height),
    )

    for page_number, page in enumerate(doc):
        # Get the text on the page using OCR.
        text = ocr_agent.detect(page)

        # Attach the text to the text blocks of this page's layout.
        page_layout = layout[page_number]
        block = page_layout['layout']
        text_block = block.filter_blocks(lambda b, _: b['type'] == 'text')
        text_block.set(text=text, type='text')

        # Report the text and layout information for the page.
        print(f'Page {page_number + 1}:')
        print(f'Text: {text}')
        print(f'Layout: {page_layout}')
from pdf2image import convert_from_path
# Convert a page range of the PDF to JPEG files named
# "<image_path_prefix>_<page number>.jpg".
pdf_path = '/path/to/pdf/file.pdf'
first_page = 1
last_page = 3
image_path_prefix = '/path/to/image/file'

images = convert_from_path(pdf_path, first_page=first_page, last_page=last_page)

# Number the files from first_page so each name carries the real PDF page
# number even when the range does not start at page 1. With first_page = 1
# the names are the same as counting from 1.
for page_no, image in enumerate(images, start=first_page):
    image_path = f"{image_path_prefix}_{page_no}.jpg"
    image.save(image_path, 'JPEG')