-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
data scraping, validation test, and California tables
- Loading branch information
yiboliang
committed
Jul 23, 2024
1 parent
ade495e
commit f9a073d
Showing
7 changed files
with
8,164 additions
and
42 deletions.
There are no files selected for viewing
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,152 @@ | ||
# AUTOGENERATED! DO NOT EDIT! File to edit: ../02_pdf.ipynb. | ||
|
||
# %% auto 0 | ||
__all__ = ['PDF'] | ||
|
||
# %% ../02_pdf.ipynb 2 | ||
from utilities import convert_page | ||
from PyPDF2 import PdfReader | ||
from multiprocessing import Pool, cpu_count | ||
import time | ||
from tqdm import tqdm | ||
import PIL | ||
import pandas as pd | ||
import os | ||
|
||
# %% ../02_pdf.ipynb 4 | ||
class PDF():
    """A PDF file on disk that can be rendered page-by-page into images.

    Rendered images and their target paths are memoized on the instance in
    the private attributes `_images` and `_image_paths`; these attributes
    only exist after `to_images` has run and are removed by `clear_images`.
    """
    def __init__(self, pdf_path, output_folder=None):
        self.pdf_path = pdf_path            # Path of the PDF file to read
        self.output_folder = output_folder  # Folder handed to `convert_page` for each page

    def get_length(self)->int:
        " Get the number of pages in the PDF (re-opens the file on every call). "
        pdf = PdfReader(self.pdf_path)
        return len(pdf.pages)

    # Expose the page count as the read-only attribute `length`
    length = property(get_length)

    # `len(pdf)` is the number of pages
    def __len__(self):
        return self.length

    def __repr__(self):
        return f"PDF({self.pdf_path})"

    def to_images(self,
                  start_page:int=None, # First page (1-based) to convert; None/0 means page 1
                  end_page:int=None, # Last page (inclusive) to convert; None/0 means the final page
                  debug:bool=False, # Set to True to print debug messages
                  cache_on_disk=False # Set to True to cache images on disk

                  )->list: # Return the list of converted images
        """Convert the PDF to images using multiprocessing.

        The conversion only runs when the instance holds no memoized result.
        Otherwise the memoized images are returned as-is, whatever
        `start_page` / `end_page` are passed; call `clear_images` first to
        force a new conversion. Pages for which `convert_page` returns a
        falsy path or image are skipped.
        """
        def debug_print(message):
            if debug:
                print(message)

        # Only run if the PDF has not already been converted to images
        if not hasattr(self, "_images") or not hasattr(self, "_image_paths"):
            # Pages are 1-based and both bounds are inclusive. The page count
            # is only looked up when no explicit end page is given.
            first_page = start_page if start_page else 1
            last_page = end_page if end_page else len(self)
            pages = range(first_page, last_page + 1)

            debug_print(f"Converting pages {list(pages)} of {self.pdf_path} to images")

            start_time = time.time()
            image_paths = []
            images = []
            with Pool(cpu_count()) as pool:
                # `convert_page` receives one tuple argument per page
                tasks = [pool.apply_async(convert_page, ((self.pdf_path, self.output_folder, page_num),)) for page_num in pages]
                # Results must be collected inside the `with`: leaving it terminates the pool.
                # Pair each task with its page so the debug output names the right page
                # even when an earlier page was skipped.
                for page_num, task in tqdm(zip(pages, tasks), total=len(tasks), desc="Converting Pages"):
                    image_path, image = task.get() # Block until each task is complete
                    if image_path and image:
                        image_paths.append(image_path)
                        images.append(image)
                        debug_print(f"Finished page {page_num}")

            debug_print(f"All tasks completed. Total processing time: {time.time() - start_time:.2f} seconds")

            self._image_paths = image_paths
            self._images = images
            if cache_on_disk:
                self._cache_images()
        else:
            debug_print("Images already converted")
        return self._images

    def _cache_images(self,
                      clear_images=False, # Set to True to clear images from memory after caching
                      debug=False # Set to True to print debug messages
                      )->None:
        """Save every memoized image to its path unless that file already exists.

        Raises AttributeError when `to_images` has not produced images yet.
        """
        # Test the private caches: `images` / `image_paths` are methods and always exist.
        if not hasattr(self, "_images") or not hasattr(self, "_image_paths"):
            raise AttributeError("No images to cache. Run the to_images method first.")
        # NOTE(review): `output_folder` is always set by __init__ (possibly to None),
        # so this check never fires; confirm whether None should be rejected here.
        if not hasattr(self, "output_folder"):
            raise AttributeError("No output folder specified. Set the output_folder attribute first.")
        for image, image_path in zip(self._images, self._image_paths):
            # Never overwrite a file that is already on disk
            if not os.path.exists(image_path):
                image.save(image_path)
                if debug:
                    print(f"Saved image to {image_path}")
            elif debug:
                print(f"Image already saved to {image_path}")
        if clear_images:
            if debug:
                print("Clearing images from memory")
            self.clear_images()

    def clear_images(self)->None:
        " Drop the memoized images and image paths; safe to call when nothing is cached. "
        for cached_attr in ("_images", "_image_paths"):
            # Only instance attributes can be deleted; skip the ones not set yet
            if cached_attr in self.__dict__:
                delattr(self, cached_attr)

    def images(self)->list:
        " Get the images from the PDF, converting it first if necessary. "
        if not hasattr(self, "_images"):
            self._images = self.to_images()
        return self._images

    def image_paths(self)->list:
        " Get the image paths from the PDF, converting it first if necessary. "
        if not hasattr(self, "_image_paths"):
            self.to_images()
        return self._image_paths
|
||
if __name__ == "__main__":
    # Batch job: convert every PDF whose publication ID is listed in the
    # California CSV and save the page images under `output_path`.
    cali_csv = "/home/waves/data/usgs_extract/csvs/cali.csv"
    pdf_dir = "/home/waves/data/usgs_extract/raw_pdf"
    output_path = "/home/waves/data/usgs_extract/cali_images"
    california_df = pd.read_csv(cali_csv)
    cali_id = california_df["Publication ID"]
    # One "<publication id>.pdf" per CSV row
    all_pdf_files = [os.path.join(pdf_dir, f"{pub_id}.pdf") for pub_id in cali_id]
    for p in all_pdf_files:
        try:
            pdf = PDF(p, output_folder=output_path)
            pdf.to_images(debug=False)
            pdf._cache_images()
        except Exception as e:
            # Best effort: keep going with the remaining files, but report why this one failed
            print(f"{p} failed to convert: {e!r}")
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,148 @@ | ||
# AUTOGENERATED! DO NOT EDIT! File to edit: ../02_pdf.ipynb. | ||
|
||
# %% auto 0 | ||
__all__ = ['PDF'] | ||
|
||
# %% ../02_pdf.ipynb 2 | ||
from utilities import convert_page | ||
from PyPDF2 import PdfReader | ||
from multiprocessing import Pool, cpu_count | ||
import time | ||
from tqdm import tqdm | ||
import PIL | ||
import os | ||
|
||
# %% ../02_pdf.ipynb 4 | ||
class PDF():
    """A PDF file on disk that can be rendered page-by-page into images.

    Rendered images and their target paths are memoized on the instance in
    the private attributes `_images` and `_image_paths`; these attributes
    only exist after `to_images` has run and are removed by `clear_images`.
    """
    def __init__(self, pdf_path, output_folder=None):
        self.pdf_path = pdf_path            # Path of the PDF file to read
        self.output_folder = output_folder  # Folder handed to `convert_page` for each page

    def get_length(self)->int:
        " Get the number of pages in the PDF (re-opens the file on every call). "
        pdf = PdfReader(self.pdf_path)
        return len(pdf.pages)

    # Expose the page count as the read-only attribute `length`
    length = property(get_length)

    # `len(pdf)` is the number of pages
    def __len__(self):
        return self.length

    def __repr__(self):
        return f"PDF({self.pdf_path})"

    def to_images(self,
                  start_page:int=None, # First page (1-based) to convert; None/0 means page 1
                  end_page:int=None, # Last page (inclusive) to convert; None/0 means the final page
                  debug:bool=False, # Set to True to print debug messages
                  cache_on_disk=False # Set to True to cache images on disk

                  )->list: # Return the list of converted images
        """Convert the PDF to images using multiprocessing.

        The conversion only runs when the instance holds no memoized result.
        Otherwise the memoized images are returned as-is, whatever
        `start_page` / `end_page` are passed; call `clear_images` first to
        force a new conversion. Pages for which `convert_page` returns a
        falsy path or image are skipped.
        """
        def debug_print(message):
            if debug:
                print(message)

        # Only run if the PDF has not already been converted to images
        if not hasattr(self, "_images") or not hasattr(self, "_image_paths"):
            # Pages are 1-based and both bounds are inclusive. The page count
            # is only looked up when no explicit end page is given.
            first_page = start_page if start_page else 1
            last_page = end_page if end_page else len(self)
            pages = range(first_page, last_page + 1)

            debug_print(f"Converting pages {list(pages)} of {self.pdf_path} to images")

            start_time = time.time()
            image_paths = []
            images = []
            with Pool(cpu_count()) as pool:
                # `convert_page` receives one tuple argument per page
                tasks = [pool.apply_async(convert_page, ((self.pdf_path, self.output_folder, page_num),)) for page_num in pages]
                # Results must be collected inside the `with`: leaving it terminates the pool.
                # Pair each task with its page so the debug output names the right page
                # even when an earlier page was skipped.
                for page_num, task in tqdm(zip(pages, tasks), total=len(tasks), desc="Converting Pages"):
                    image_path, image = task.get() # Block until each task is complete
                    if image_path and image:
                        image_paths.append(image_path)
                        images.append(image)
                        debug_print(f"Finished page {page_num}")

            debug_print(f"All tasks completed. Total processing time: {time.time() - start_time:.2f} seconds")

            self._image_paths = image_paths
            self._images = images
            if cache_on_disk:
                self._cache_images()
        else:
            debug_print("Images already converted")
        return self._images

    def _cache_images(self,
                      clear_images=False, # Set to True to clear images from memory after caching
                      debug=False # Set to True to print debug messages
                      )->None:
        """Save every memoized image to its path unless that file already exists.

        Raises AttributeError when `to_images` has not produced images yet.
        """
        # Test the private caches: `images` / `image_paths` are methods and always exist.
        if not hasattr(self, "_images") or not hasattr(self, "_image_paths"):
            raise AttributeError("No images to cache. Run the to_images method first.")
        # NOTE(review): `output_folder` is always set by __init__ (possibly to None),
        # so this check never fires; confirm whether None should be rejected here.
        if not hasattr(self, "output_folder"):
            raise AttributeError("No output folder specified. Set the output_folder attribute first.")
        for image, image_path in zip(self._images, self._image_paths):
            # Never overwrite a file that is already on disk
            if not os.path.exists(image_path):
                image.save(image_path)
                if debug:
                    print(f"Saved image to {image_path}")
            elif debug:
                print(f"Image already saved to {image_path}")
        if clear_images:
            if debug:
                print("Clearing images from memory")
            self.clear_images()

    def clear_images(self)->None:
        " Drop the memoized images and image paths; safe to call when nothing is cached. "
        for cached_attr in ("_images", "_image_paths"):
            # Only instance attributes can be deleted; skip the ones not set yet
            if cached_attr in self.__dict__:
                delattr(self, cached_attr)

    def images(self)->list:
        " Get the images from the PDF, converting it first if necessary. "
        if not hasattr(self, "_images"):
            self._images = self.to_images()
        return self._images

    def image_paths(self)->list:
        " Get the image paths from the PDF, converting it first if necessary. "
        if not hasattr(self, "_image_paths"):
            self.to_images()
        return self._image_paths
|
||
if __name__ == "__main__":
    # Batch job: convert every ".pdf" file found directly in `pdf_dir` and
    # save the page images under `output_path`.
    pdf_dir = "/home/waves/data/usgs_extract/validation_test/pdfs"
    output_path = "/home/waves/data/usgs_extract/validation_test/images"
    all_pdf_files = [os.path.join(pdf_dir, f) for f in os.listdir(pdf_dir) if f.endswith(".pdf")]
    for path in all_pdf_files:
        try:
            pdf = PDF(path, output_folder=output_path)
            pdf.to_images(debug=False)
            pdf._cache_images()
        except Exception as e:
            # Best effort: keep going with the remaining files, but report why this one failed
            print(f"{path} failed to convert: {e!r}")
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.