Skip to content

Commit

Permalink
data scraping, validation test, and California tables
Browse files Browse the repository at this point in the history
  • Loading branch information
yiboliang committed Jul 23, 2024
1 parent ade495e commit f9a073d
Show file tree
Hide file tree
Showing 7 changed files with 8,164 additions and 42 deletions.
7,554 changes: 7,553 additions & 1 deletion pdf/publications-water-terms1970.csv

Large diffs are not rendered by default.

152 changes: 152 additions & 0 deletions usgs_extract/california_image.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
# AUTOGENERATED! DO NOT EDIT! File to edit: ../02_pdf.ipynb.

# %% auto 0
__all__ = ['PDF']

# %% ../02_pdf.ipynb 2
from utilities import convert_page
from PyPDF2 import PdfReader
from multiprocessing import Pool, cpu_count
import time
from tqdm import tqdm
import PIL
import pandas as pd
import os

# %% ../02_pdf.ipynb 4
class PDF():
    """A thin wrapper around a PDF file that converts its pages to images.

    Parameters
    ----------
    pdf_path : str
        Path to the PDF file on disk.
    output_folder : str | None
        Folder where page images are written; required by `_cache_images`.
    """
    def __init__(self, pdf_path, output_folder=None):
        self.pdf_path = pdf_path
        self.output_folder = output_folder

    def get_length(self)->int:
        " Get the number of pages in the PDF. "
        pdf = PdfReader(self.pdf_path)
        return len(pdf.pages)

    # Expose the page count as a read-only property as well.
    length = property(get_length)

    # Let len(pdf) report the page count.
    def __len__(self):
        return self.length

    def __repr__(self):
        return f"PDF({self.pdf_path})"

    def to_images(self, # PDF object
                  start_page:int=None, # 1-based first page; None => from page 1
                  end_page:int=None, # 1-based last page; None => to last page
                  debug:bool=False, # Set to True to print debug messages
                  cache_on_disk=False # Set to True to cache images on disk
                  )->list: # Return a list of PIL images
        """Convert a PDF to images using multiprocessing.

        Results are memoized on `self._images` / `self._image_paths`, so
        repeated calls do no extra work.
        """
        def debug_print(message):
            if debug:
                print(message)
        # Only convert once; reuse the memoized result afterwards.
        if not hasattr(self, "_images") or not hasattr(self, "_image_paths"):
            # Determine the (1-based, inclusive) range of pages to convert.
            if start_page and end_page:
                pages = range(start_page, end_page + 1)
            elif start_page:
                # BUGFIX: was range(start_page, len(self)), which silently
                # dropped the final page.
                pages = range(start_page, len(self) + 1)
            elif end_page:
                pages = range(1, end_page + 1)
            else:
                pages = range(1, len(self) + 1)

            debug_print(f"Converting pages {list(pages)} of {self.pdf_path} to images")

            start_time = time.time()
            with Pool(cpu_count()) as pool:
                tasks = [pool.apply_async(convert_page, ((self.pdf_path, self.output_folder, page_num),)) for page_num in pages]
                image_paths = []
                images = []
                # Consume results in submission order, with a progress bar.
                # Zipping with `pages` keeps the debug page number correct
                # even when a page is skipped (old code derived it from
                # len(image_paths), which drifts on skips).
                for page_num, task in zip(pages, tqdm(tasks, total=len(tasks), desc="Converting Pages")):
                    image_path, image = task.get()  # Block until each task is complete
                    if image_path and image:
                        image_paths.append(image_path)
                        images.append(image)
                        debug_print(f"Finished page {page_num}")

            debug_print(f"All tasks completed. Total processing time: {time.time() - start_time:.2f} seconds")

            self._image_paths = image_paths
            self._images = images
            if cache_on_disk:
                self._cache_images()
        else:
            debug_print("Images already converted")
        return self._images

    def _cache_images(self,
                      clear_images=False, # Set to True to clear images from memory after caching
                      debug=False # Set to True to print debug messages
                      )->None:
        """Cache images on disk under `output_folder`.

        Raises AttributeError if `to_images` has not run yet or no output
        folder was provided.
        """
        # BUGFIX: old guards used hasattr(self, "image_paths") /
        # hasattr(self, "output_folder"), which are always true (the first
        # names a method, the second is set in __init__), so the checks
        # never fired. Test the actual preconditions instead.
        if not hasattr(self, "_image_paths"):
            raise AttributeError("No images to cache. Run the to_images method first.")
        if self.output_folder is None:
            raise AttributeError("No output folder specified. Set the output_folder attribute first.")
        for image, image_path in zip(self._images, self._image_paths):
            # Skip pages already written in a previous run.
            if not os.path.exists(image_path):
                image.save(image_path)
                if debug:
                    print(f"Saved image to {image_path}")
            elif debug:
                print(f"Image already saved to {image_path}")
        if clear_images:
            if debug:
                print("Clearing images from memory")
            self.clear_images()

    def clear_images(self)->None:
        " Clear the memoized images/paths so memory can be reclaimed. "
        # BUGFIX: old code deleted self.images / self.image_paths, which
        # name methods — `del` raised AttributeError and the cached data
        # on _images / _image_paths was never freed.
        if hasattr(self, "_images"):
            del self._images
        if hasattr(self, "_image_paths"):
            del self._image_paths

    def images(self)->list:
        " Get the images, converting the PDF first if necessary. "
        if not hasattr(self, "_images"):
            self.to_images()
        return self._images

    def image_paths(self)->list:
        " Get the image paths, converting the PDF first if necessary. "
        if not hasattr(self, "_image_paths"):
            self.to_images()
        return self._image_paths

if __name__ == "__main__":
    # Paths to the California publications CSV, the raw PDF folder, and the
    # folder that will receive the page images.
    cali_csv = "/home/waves/data/usgs_extract/csvs/cali.csv"
    pdf_dir = "/home/waves/data/usgs_extract/raw_pdf"
    output_path = "/home/waves/data/usgs_extract/cali_images"
    california_df = pd.read_csv(cali_csv)
    cali_id = california_df["Publication ID"]
    # One PDF per publication ID, named "<id>.pdf".  (Renamed loop variable:
    # the old name `id` shadowed the builtin.)
    all_pdf_files = [os.path.join(pdf_dir, str(pub_id) + ".pdf") for pub_id in cali_id]
    for p in all_pdf_files:
        try:
            pdf = PDF(p, output_folder=output_path)
            pdf.to_images(debug=False)
            pdf._cache_images()
        except Exception as e:
            # BUGFIX: the caught exception was previously discarded, hiding
            # the failure reason; include it in the message.
            print(f"{p} failed to convert. Reason: {e}")

148 changes: 148 additions & 0 deletions usgs_extract/get_images.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
# AUTOGENERATED! DO NOT EDIT! File to edit: ../02_pdf.ipynb.

# %% auto 0
__all__ = ['PDF']

# %% ../02_pdf.ipynb 2
from utilities import convert_page
from PyPDF2 import PdfReader
from multiprocessing import Pool, cpu_count
import time
from tqdm import tqdm
import PIL
import os

# %% ../02_pdf.ipynb 4
class PDF():
    """A thin wrapper around a PDF file that converts its pages to images.

    Parameters
    ----------
    pdf_path : str
        Path to the PDF file on disk.
    output_folder : str | None
        Folder where page images are written; required by `_cache_images`.
    """
    def __init__(self, pdf_path, output_folder=None):
        self.pdf_path = pdf_path
        self.output_folder = output_folder

    def get_length(self)->int:
        " Get the number of pages in the PDF. "
        pdf = PdfReader(self.pdf_path)
        return len(pdf.pages)

    # Expose the page count as a read-only property as well.
    length = property(get_length)

    # Let len(pdf) report the page count.
    def __len__(self):
        return self.length

    def __repr__(self):
        return f"PDF({self.pdf_path})"

    def to_images(self, # PDF object
                  start_page:int=None, # 1-based first page; None => from page 1
                  end_page:int=None, # 1-based last page; None => to last page
                  debug:bool=False, # Set to True to print debug messages
                  cache_on_disk=False # Set to True to cache images on disk
                  )->list: # Return a list of PIL images
        """Convert a PDF to images using multiprocessing.

        Results are memoized on `self._images` / `self._image_paths`, so
        repeated calls do no extra work.
        """
        def debug_print(message):
            if debug:
                print(message)
        # Only convert once; reuse the memoized result afterwards.
        if not hasattr(self, "_images") or not hasattr(self, "_image_paths"):
            # Determine the (1-based, inclusive) range of pages to convert.
            if start_page and end_page:
                pages = range(start_page, end_page + 1)
            elif start_page:
                # BUGFIX: was range(start_page, len(self)), which silently
                # dropped the final page.
                pages = range(start_page, len(self) + 1)
            elif end_page:
                pages = range(1, end_page + 1)
            else:
                pages = range(1, len(self) + 1)

            debug_print(f"Converting pages {list(pages)} of {self.pdf_path} to images")

            start_time = time.time()
            with Pool(cpu_count()) as pool:
                tasks = [pool.apply_async(convert_page, ((self.pdf_path, self.output_folder, page_num),)) for page_num in pages]
                image_paths = []
                images = []
                # Consume results in submission order, with a progress bar.
                # Zipping with `pages` keeps the debug page number correct
                # even when a page is skipped (old code derived it from
                # len(image_paths), which drifts on skips).
                for page_num, task in zip(pages, tqdm(tasks, total=len(tasks), desc="Converting Pages")):
                    image_path, image = task.get()  # Block until each task is complete
                    if image_path and image:
                        image_paths.append(image_path)
                        images.append(image)
                        debug_print(f"Finished page {page_num}")

            debug_print(f"All tasks completed. Total processing time: {time.time() - start_time:.2f} seconds")

            self._image_paths = image_paths
            self._images = images
            if cache_on_disk:
                self._cache_images()
        else:
            debug_print("Images already converted")
        return self._images

    def _cache_images(self,
                      clear_images=False, # Set to True to clear images from memory after caching
                      debug=False # Set to True to print debug messages
                      )->None:
        """Cache images on disk under `output_folder`.

        Raises AttributeError if `to_images` has not run yet or no output
        folder was provided.
        """
        # BUGFIX: old guards used hasattr(self, "image_paths") /
        # hasattr(self, "output_folder"), which are always true (the first
        # names a method, the second is set in __init__), so the checks
        # never fired. Test the actual preconditions instead.
        if not hasattr(self, "_image_paths"):
            raise AttributeError("No images to cache. Run the to_images method first.")
        if self.output_folder is None:
            raise AttributeError("No output folder specified. Set the output_folder attribute first.")
        for image, image_path in zip(self._images, self._image_paths):
            # Skip pages already written in a previous run.
            if not os.path.exists(image_path):
                image.save(image_path)
                if debug:
                    print(f"Saved image to {image_path}")
            elif debug:
                print(f"Image already saved to {image_path}")
        if clear_images:
            if debug:
                print("Clearing images from memory")
            self.clear_images()

    def clear_images(self)->None:
        " Clear the memoized images/paths so memory can be reclaimed. "
        # BUGFIX: old code deleted self.images / self.image_paths, which
        # name methods — `del` raised AttributeError and the cached data
        # on _images / _image_paths was never freed.
        if hasattr(self, "_images"):
            del self._images
        if hasattr(self, "_image_paths"):
            del self._image_paths

    def images(self)->list:
        " Get the images, converting the PDF first if necessary. "
        if not hasattr(self, "_images"):
            self.to_images()
        return self._images

    def image_paths(self)->list:
        " Get the image paths, converting the PDF first if necessary. "
        if not hasattr(self, "_image_paths"):
            self.to_images()
        return self._image_paths

if __name__ == "__main__":
    # Validation-set input PDFs and the folder receiving the page images.
    pdf_dir = "/home/waves/data/usgs_extract/validation_test/pdfs"
    output_path = "/home/waves/data/usgs_extract/validation_test/images"
    # Every PDF in the validation-set folder.
    all_pdf_files = [os.path.join(pdf_dir, f) for f in os.listdir(pdf_dir) if f.endswith(".pdf")]
    for path in all_pdf_files:
        try:
            pdf = PDF(path, output_folder=output_path)
            pdf.to_images(debug=False)
            pdf._cache_images()
        except Exception as e:
            # BUGFIX: the caught exception was previously discarded, hiding
            # the failure reason; include it in the message.
            print(f"{path} failed to convert. Reason: {e}")

28 changes: 13 additions & 15 deletions usgs_extract/getpdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,7 @@
import pandas as pd
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin
import sys

# data = pd.read_csv("./publications-water-terms1970.csv")
classurl = "usa-link Document"
url_folder = os.path.join(os.getcwd(), "../pdf/publications-water-terms1970.csv")
data = pd.read_csv(url_folder)
urls = data["URL"]
def get_urls_with_class(url, class_name):
try:
response = requests.get(url)
Expand All @@ -23,36 +17,40 @@ def get_urls_with_class(url, class_name):
return []

# Function to download a file from a given URL
def download_file(url, folder_path):
def download_file(url, folder_path, id):
try:
# Send a HTTP request to the URL
response = requests.get(url)
response.raise_for_status() # Check for HTTP errors

# Parse the URL to get the path and filename
parsed_url = urlparse(url)
name = parsed_url.path.split('/')
name = name[-1]
file_path = os.path.join(folder_path, name)
file_path = os.path.join(folder_path, f"{pdfID[id]}.pdf")

# Write the content to a file
with open(file_path, 'wb') as file:
file.write(response.content)
print(f"download success at {file_path}")
except requests.exceptions.RequestException as e:
print(f'Failed to download {url}. Reason: {e}')


if __name__ == "__main__":

download_folder = sys.argv[1]
download_folder = "/home/waves/data/usgs_extract/validation_test/pdfs"
classurl = "usa-link Document"
url_csv = "/home/waves/data/usgs_extract/validation_test/valiSet.csv"
data = pd.read_csv(url_csv)

urls = data["URL"]
pdfID = data["Publication ID"]

# Create the folder if it doesn't exist
if not os.path.exists(download_folder):
os.makedirs(download_folder)

# Download each file from the list of URLs
for url in urls:
for i, url in enumerate(urls):
pdfurls = get_urls_with_class(url, classurl)
for urlp in pdfurls:
download_file(urlp, download_folder, i)


download_file(urlp, download_folder)
3 changes: 1 addition & 2 deletions usgs_extract/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,7 @@ def __len__(self):
def __repr__(self):
return f"PDF({self.pdf_path})"

# %% ../02_pdf.ipynb 7
@patch

def to_images(self:PDF, # PDF object
start_page:int=None, # Set to None to convert all pages
end_page:int=None, # Set to None to convert all pages
Expand Down
Loading

0 comments on commit f9a073d

Please sign in to comment.