Skip to content

Commit

Permalink
data scraping, validation test, and California tables
Browse files Browse the repository at this point in the history
  • Loading branch information
yiboliang committed Jul 23, 2024
1 parent ade495e commit f9a073d
Show file tree
Hide file tree
Showing 7 changed files with 8,164 additions and 42 deletions.
7,554 changes: 7,553 additions & 1 deletion pdf/publications-water-terms1970.csv

Large diffs are not rendered by default.

152 changes: 152 additions & 0 deletions usgs_extract/california_image.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
# AUTOGENERATED! DO NOT EDIT! File to edit: ../02_pdf.ipynb.

# %% auto 0
__all__ = ['PDF']

# %% ../02_pdf.ipynb 2
from utilities import convert_page
from PyPDF2 import PdfReader
from multiprocessing import Pool, cpu_count
import time
from tqdm import tqdm
import PIL
import pandas as pd
import os

# %% ../02_pdf.ipynb 4
class PDF():
    """A thin wrapper around a PDF file that converts its pages to images.

    Parameters
    ----------
    pdf_path : str
        Path to the PDF file on disk.
    output_folder : str | None
        Folder where page images are written; required by `_cache_images`.
    """
    def __init__(self, pdf_path, output_folder=None):
        self.pdf_path = pdf_path
        self.output_folder = output_folder

    def get_length(self)->int:
        " Get the number of pages in the PDF. "
        pdf = PdfReader(self.pdf_path)
        return len(pdf.pages)

    # Expose the page count as a read-only property as well.
    length = property(get_length)

    # Let len(pdf) report the page count.
    def __len__(self):
        return self.length

    def __repr__(self):
        return f"PDF({self.pdf_path})"

    def to_images(self, # PDF object
                  start_page:int=None, # 1-based first page; None => from page 1
                  end_page:int=None, # 1-based last page; None => to last page
                  debug:bool=False, # Set to True to print debug messages
                  cache_on_disk=False # Set to True to cache images on disk
                  )->list: # Return a list of PIL images
        """Convert a PDF to images using multiprocessing.

        Results are memoized on `self._images` / `self._image_paths`, so
        repeated calls do no extra work.
        """
        def debug_print(message):
            if debug:
                print(message)
        # Only convert once; reuse the memoized result afterwards.
        if not hasattr(self, "_images") or not hasattr(self, "_image_paths"):
            # Determine the (1-based, inclusive) range of pages to convert.
            if start_page and end_page:
                pages = range(start_page, end_page + 1)
            elif start_page:
                # BUGFIX: was range(start_page, len(self)), which silently
                # dropped the final page.
                pages = range(start_page, len(self) + 1)
            elif end_page:
                pages = range(1, end_page + 1)
            else:
                pages = range(1, len(self) + 1)

            debug_print(f"Converting pages {list(pages)} of {self.pdf_path} to images")

            start_time = time.time()
            with Pool(cpu_count()) as pool:
                tasks = [pool.apply_async(convert_page, ((self.pdf_path, self.output_folder, page_num),)) for page_num in pages]
                image_paths = []
                images = []
                # Consume results in submission order, with a progress bar.
                # Zipping with `pages` keeps the debug page number correct
                # even when a page is skipped (old code derived it from
                # len(image_paths), which drifts on skips).
                for page_num, task in zip(pages, tqdm(tasks, total=len(tasks), desc="Converting Pages")):
                    image_path, image = task.get()  # Block until each task is complete
                    if image_path and image:
                        image_paths.append(image_path)
                        images.append(image)
                        debug_print(f"Finished page {page_num}")

            debug_print(f"All tasks completed. Total processing time: {time.time() - start_time:.2f} seconds")

            self._image_paths = image_paths
            self._images = images
            if cache_on_disk:
                self._cache_images()
        else:
            debug_print("Images already converted")
        return self._images

    def _cache_images(self,
                      clear_images=False, # Set to True to clear images from memory after caching
                      debug=False # Set to True to print debug messages
                      )->None:
        """Cache images on disk under `output_folder`.

        Raises AttributeError if `to_images` has not run yet or no output
        folder was provided.
        """
        # BUGFIX: old guards used hasattr(self, "image_paths") /
        # hasattr(self, "output_folder"), which are always true (the first
        # names a method, the second is set in __init__), so the checks
        # never fired. Test the actual preconditions instead.
        if not hasattr(self, "_image_paths"):
            raise AttributeError("No images to cache. Run the to_images method first.")
        if self.output_folder is None:
            raise AttributeError("No output folder specified. Set the output_folder attribute first.")
        for image, image_path in zip(self._images, self._image_paths):
            # Skip pages already written in a previous run.
            if not os.path.exists(image_path):
                image.save(image_path)
                if debug:
                    print(f"Saved image to {image_path}")
            elif debug:
                print(f"Image already saved to {image_path}")
        if clear_images:
            if debug:
                print("Clearing images from memory")
            self.clear_images()

    def clear_images(self)->None:
        " Clear the memoized images/paths so memory can be reclaimed. "
        # BUGFIX: old code deleted self.images / self.image_paths, which
        # name methods — `del` raised AttributeError and the cached data
        # on _images / _image_paths was never freed.
        if hasattr(self, "_images"):
            del self._images
        if hasattr(self, "_image_paths"):
            del self._image_paths

    def images(self)->list:
        " Get the images, converting the PDF first if necessary. "
        if not hasattr(self, "_images"):
            self.to_images()
        return self._images

    def image_paths(self)->list:
        " Get the image paths, converting the PDF first if necessary. "
        if not hasattr(self, "_image_paths"):
            self.to_images()
        return self._image_paths

if __name__ == "__main__":
    # Paths to the California publications CSV, the raw PDF folder, and the
    # folder that will receive the page images.
    cali_csv = "/home/waves/data/usgs_extract/csvs/cali.csv"
    pdf_dir = "/home/waves/data/usgs_extract/raw_pdf"
    output_path = "/home/waves/data/usgs_extract/cali_images"
    california_df = pd.read_csv(cali_csv)
    cali_id = california_df["Publication ID"]
    # One PDF per publication ID, named "<id>.pdf".  (Renamed loop variable:
    # the old name `id` shadowed the builtin.)
    all_pdf_files = [os.path.join(pdf_dir, str(pub_id) + ".pdf") for pub_id in cali_id]
    for p in all_pdf_files:
        try:
            pdf = PDF(p, output_folder=output_path)
            pdf.to_images(debug=False)
            pdf._cache_images()
        except Exception as e:
            # BUGFIX: the caught exception was previously discarded, hiding
            # the failure reason; include it in the message.
            print(f"{p} failed to convert. Reason: {e}")

148 changes: 148 additions & 0 deletions usgs_extract/get_images.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
# AUTOGENERATED! DO NOT EDIT! File to edit: ../02_pdf.ipynb.

# %% auto 0
__all__ = ['PDF']

# %% ../02_pdf.ipynb 2
from utilities import convert_page
from PyPDF2 import PdfReader
from multiprocessing import Pool, cpu_count
import time
from tqdm import tqdm
import PIL
import os

# %% ../02_pdf.ipynb 4
class PDF():
    """A thin wrapper around a PDF file that converts its pages to images.

    Parameters
    ----------
    pdf_path : str
        Path to the PDF file on disk.
    output_folder : str | None
        Folder where page images are written; required by `_cache_images`.
    """
    def __init__(self, pdf_path, output_folder=None):
        self.pdf_path = pdf_path
        self.output_folder = output_folder

    def get_length(self)->int:
        " Get the number of pages in the PDF. "
        pdf = PdfReader(self.pdf_path)
        return len(pdf.pages)

    # Expose the page count as a read-only property as well.
    length = property(get_length)

    # Let len(pdf) report the page count.
    def __len__(self):
        return self.length

    def __repr__(self):
        return f"PDF({self.pdf_path})"

    def to_images(self, # PDF object
                  start_page:int=None, # 1-based first page; None => from page 1
                  end_page:int=None, # 1-based last page; None => to last page
                  debug:bool=False, # Set to True to print debug messages
                  cache_on_disk=False # Set to True to cache images on disk
                  )->list: # Return a list of PIL images
        """Convert a PDF to images using multiprocessing.

        Results are memoized on `self._images` / `self._image_paths`, so
        repeated calls do no extra work.
        """
        def debug_print(message):
            if debug:
                print(message)
        # Only convert once; reuse the memoized result afterwards.
        if not hasattr(self, "_images") or not hasattr(self, "_image_paths"):
            # Determine the (1-based, inclusive) range of pages to convert.
            if start_page and end_page:
                pages = range(start_page, end_page + 1)
            elif start_page:
                # BUGFIX: was range(start_page, len(self)), which silently
                # dropped the final page.
                pages = range(start_page, len(self) + 1)
            elif end_page:
                pages = range(1, end_page + 1)
            else:
                pages = range(1, len(self) + 1)

            debug_print(f"Converting pages {list(pages)} of {self.pdf_path} to images")

            start_time = time.time()
            with Pool(cpu_count()) as pool:
                tasks = [pool.apply_async(convert_page, ((self.pdf_path, self.output_folder, page_num),)) for page_num in pages]
                image_paths = []
                images = []
                # Consume results in submission order, with a progress bar.
                # Zipping with `pages` keeps the debug page number correct
                # even when a page is skipped (old code derived it from
                # len(image_paths), which drifts on skips).
                for page_num, task in zip(pages, tqdm(tasks, total=len(tasks), desc="Converting Pages")):
                    image_path, image = task.get()  # Block until each task is complete
                    if image_path and image:
                        image_paths.append(image_path)
                        images.append(image)
                        debug_print(f"Finished page {page_num}")

            debug_print(f"All tasks completed. Total processing time: {time.time() - start_time:.2f} seconds")

            self._image_paths = image_paths
            self._images = images
            if cache_on_disk:
                self._cache_images()
        else:
            debug_print("Images already converted")
        return self._images

    def _cache_images(self,
                      clear_images=False, # Set to True to clear images from memory after caching
                      debug=False # Set to True to print debug messages
                      )->None:
        """Cache images on disk under `output_folder`.

        Raises AttributeError if `to_images` has not run yet or no output
        folder was provided.
        """
        # BUGFIX: old guards used hasattr(self, "image_paths") /
        # hasattr(self, "output_folder"), which are always true (the first
        # names a method, the second is set in __init__), so the checks
        # never fired. Test the actual preconditions instead.
        if not hasattr(self, "_image_paths"):
            raise AttributeError("No images to cache. Run the to_images method first.")
        if self.output_folder is None:
            raise AttributeError("No output folder specified. Set the output_folder attribute first.")
        for image, image_path in zip(self._images, self._image_paths):
            # Skip pages already written in a previous run.
            if not os.path.exists(image_path):
                image.save(image_path)
                if debug:
                    print(f"Saved image to {image_path}")
            elif debug:
                print(f"Image already saved to {image_path}")
        if clear_images:
            if debug:
                print("Clearing images from memory")
            self.clear_images()

    def clear_images(self)->None:
        " Clear the memoized images/paths so memory can be reclaimed. "
        # BUGFIX: old code deleted self.images / self.image_paths, which
        # name methods — `del` raised AttributeError and the cached data
        # on _images / _image_paths was never freed.
        if hasattr(self, "_images"):
            del self._images
        if hasattr(self, "_image_paths"):
            del self._image_paths

    def images(self)->list:
        " Get the images, converting the PDF first if necessary. "
        if not hasattr(self, "_images"):
            self.to_images()
        return self._images

    def image_paths(self)->list:
        " Get the image paths, converting the PDF first if necessary. "
        if not hasattr(self, "_image_paths"):
            self.to_images()
        return self._image_paths

if __name__ == "__main__":
    # Validation-set input PDFs and the folder receiving the page images.
    pdf_dir = "/home/waves/data/usgs_extract/validation_test/pdfs"
    output_path = "/home/waves/data/usgs_extract/validation_test/images"
    # Every PDF in the validation-set folder.
    all_pdf_files = [os.path.join(pdf_dir, f) for f in os.listdir(pdf_dir) if f.endswith(".pdf")]
    for path in all_pdf_files:
        try:
            pdf = PDF(path, output_folder=output_path)
            pdf.to_images(debug=False)
            pdf._cache_images()
        except Exception as e:
            # BUGFIX: the caught exception was previously discarded, hiding
            # the failure reason; include it in the message.
            print(f"{path} failed to convert. Reason: {e}")

28 changes: 13 additions & 15 deletions usgs_extract/getpdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,7 @@
import pandas as pd
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin
import sys

# data = pd.read_csv("./publications-water-terms1970.csv")
classurl = "usa-link Document"
url_folder = os.path.join(os.getcwd(), "../pdf/publications-water-terms1970.csv")
data = pd.read_csv(url_folder)
urls = data["URL"]
def get_urls_with_class(url, class_name):
try:
response = requests.get(url)
Expand All @@ -23,36 +17,40 @@ def get_urls_with_class(url, class_name):
return []

# Function to download a file from a given URL
def download_file(url, folder_path):
def download_file(url, folder_path, id):
try:
# Send a HTTP request to the URL
response = requests.get(url)
response.raise_for_status() # Check for HTTP errors

# Parse the URL to get the path and filename
parsed_url = urlparse(url)
name = parsed_url.path.split('/')
name = name[-1]
file_path = os.path.join(folder_path, name)
file_path = os.path.join(folder_path, f"{pdfID[id]}.pdf")

# Write the content to a file
with open(file_path, 'wb') as file:
file.write(response.content)
print(f"download success at {file_path}")
except requests.exceptions.RequestException as e:
print(f'Failed to download {url}. Reason: {e}')


if __name__ == "__main__":

download_folder = sys.argv[1]
download_folder = "/home/waves/data/usgs_extract/validation_test/pdfs"
classurl = "usa-link Document"
url_csv = "/home/waves/data/usgs_extract/validation_test/valiSet.csv"
data = pd.read_csv(url_csv)

urls = data["URL"]
pdfID = data["Publication ID"]

# Create the folder if it doesn't exist
if not os.path.exists(download_folder):
os.makedirs(download_folder)

# Download each file from the list of URLs
for url in urls:
for i, url in enumerate(urls):
pdfurls = get_urls_with_class(url, classurl)
for urlp in pdfurls:
download_file(urlp, download_folder, i)


download_file(urlp, download_folder)
3 changes: 1 addition & 2 deletions usgs_extract/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,7 @@ def __len__(self):
def __repr__(self):
return f"PDF({self.pdf_path})"

# %% ../02_pdf.ipynb 7
@patch

def to_images(self:PDF, # PDF object
start_page:int=None, # Set to None to convert all pages
end_page:int=None, # Set to None to convert all pages
Expand Down
Loading

0 comments on commit f9a073d

Please sign in to comment.