code update to enchance the text macthing

AshishMahendra · May 17, 2024 · 34ecc8b · 34ecc8b
1 parent 9cb18a4
commit 34ecc8b
Show file tree

Hide file tree

Showing 2 changed files with 56 additions and 38 deletions.
diff --git a/ingest.py b/ingest.py
@@ -16,6 +16,7 @@
 import difflib
 import pdfplumber
 import pdfplumber
+import re
 
 
 def file_log(logentry):
@@ -50,65 +51,76 @@ def load_pdf_document(file_content: BytesIO, pdf_path):
     return documents
 
 
-def preprocess_text(text):
-    """Lowercase, remove hyphenation and extra spaces from text."""
-    # Remove hyphens that might have been incorrectly added at line breaks
-    text = text.replace("-\n", "")
-    text = text.replace("\n", " ")
-    return " ".join(text.lower().strip().split())
-
-
 def debug_extract_words(page):
     """Debug function to extract words from the page, with added details."""
     words = page.extract_words(keep_blank_chars=True)
     # print(f"Extracted Words: {[word['text'] for word in words]}")
     return words
 
 
+def preprocess_text(text):
+    """Remove non-alphanumeric characters, except necessary punctuation, and normalize spaces and case."""
+    text = re.sub(r"[^\w\s-]", "", text)  # Keep alphanumeric, whitespace, hyphens
+    text = re.sub(r"\s+", " ", text)  # Reduce multiple spaces to a single space
+    return text.lower().strip()
+
+
 def find_text_positions(page, search_text):
-    """Find bounding boxes for text matches with improved matching logic."""
+    """Improved logic to handle text matches that start or end mid-word."""
+    # print("Original Search Text:", search_text)
     search_text = preprocess_text(search_text)
-    words = debug_extract_words(page)  # Debugging text extraction with visual output
+    # print("Preprocessed Search Text:", search_text)
+
+    words = debug_extract_words(page)  # Assuming this function returns word boundaries accurately
     page_text = preprocess_text(" ".join([word["text"] for word in words]))
-    matcher = difflib.SequenceMatcher(None, page_text, search_text)
-    matches = [match for match in matcher.get_matching_blocks() if match.size > 0]
+    # print("Full Page Text:", page_text)
 
-    bounding_boxes = []
-    current_word_index = 0
+    # Create a continuous index for word positions in the page text
+    word_positions = []
+    current_index = 0
     for word in words:
-        word_text = preprocess_text(word["text"])
-        word_len = len(word_text)
+        start = current_index
+        end = start + len(preprocess_text(word["text"]))
+        word_positions.append((start, end, word))
+        current_index = end + 1  # Account for space between words
 
-        # Check if current word is within any match range
-        if any(match.a <= current_word_index < match.a + match.size for match in matches):
-            bounding_boxes.append((word["x0"], word["top"], word["x1"], word["bottom"]))
+    # Use regex for finding overlapping matches
+    pattern = re.compile(re.escape(search_text), re.IGNORECASE)
+    matches = list(pattern.finditer(page_text))
 
-        current_word_index += len(preprocess_text(word["text"] + " "))
+    bounding_boxes = []
+    for match in matches:
+        match_start, match_end = match.start(), match.end()
+        # Collect words that overlap with the match
+        for start, end, word in word_positions:
+            if start < match_end and end > match_start:  # Check for any overlap
+                bounding_boxes.append((word["x0"], word["top"], word["x1"], word["bottom"]))
 
     return bounding_boxes
 
 
 def highlight_text_in_pdf(pdf_path, page_number, highlight_text):
-    """Highlight the specified text on the given page."""
+    """Highlight the specified text on the given page with enhanced debugging."""
     try:
         with pdfplumber.open(pdf_path) as pdf:
             page = pdf.pages[page_number - 1]
-            print(f"Highlighting text on Page {page_number}")
+            words = debug_extract_words(page)  # Extract words with visual debugging
 
-            # Extract bounding boxes for the specified text
+            # print(f"Page {page_number} - Extracted Text: {' '.join([word['text'] for word in words])}")
             bounding_boxes = find_text_positions(page, highlight_text)
-            page_image = page.to_image(resolution=400)
 
+            if not bounding_boxes:
+                print("No matching text found.")
+                return None
+
+            page_image = page.to_image(resolution=400)
             for box in bounding_boxes:
-                page_image.draw_rect(
-                    box,
-                    fill=(255, 255, 0, 64),  # Semi-transparent yellow fill
-                    stroke="orange",  # Vivid orange stroke
-                    stroke_width=3,  # Slightly thicker stroke
-                )
+                page_image.draw_rect(box, fill=(255, 255, 0, 64), stroke="orange", stroke_width=3)
+
             output_file_path = f"highlighted_page_{page_number}.png"
             page_image.save(output_file_path, quality=95)
             print(f"Highlighted text saved to {output_file_path}")
+
             return output_file_path
 
     except Exception as e:

diff --git a/local_gpt_fast_api.py b/local_gpt_fast_api.py
@@ -223,22 +223,28 @@ def highlight_pdf_endpoint(highlight_requests: List[HighlightRequest]):
             # Highlight the text in the PDF
             highlighted_pdf = highlight_text_in_pdf(pdf_path, request.page_number, request.highlight_text)
 
-            # Upload the highlighted image back to the S3 bucket
-            image_name = f"{key_prefix}/highlighted_page_{request.page_number}.png"
-            s3_image_url = upload_image_to_s3(highlighted_pdf, bucket, image_name)
-
-            # Add the result with the S3 URL of the highlighted image
-            results.append({"pdf_name": request.pdf_name, "highlighted_image": s3_image_url})
+            if highlighted_pdf:  # Check if a highlighted image was actually created
+                # Upload the highlighted image back to the S3 bucket
+                image_name = f"{key_prefix}/highlighted_page_{request.page_number}.png"
+                s3_image_url = upload_image_to_s3(highlighted_pdf, bucket, image_name)
+
+                # Add the result with the S3 URL of the highlighted image
+                results.append({"pdf_name": request.pdf_name, "highlighted_image": s3_image_url})
+            else:
+                # Handle case where no text was found or no image was created
+                results.append(
+                    {"pdf_name": request.pdf_name, "error": "No matching text found or failed to create highlight."}
+                )
 
         except Exception as e:
             logging.error(f"Error highlighting {request.pdf_name}: {e}")
             results.append({"pdf_name": request.pdf_name, "error": str(e)})
 
         finally:
-            # Clean up the temporary file after processing
+            # Clean up the temporary files after processing
             clean_temp = [pdf_path, highlighted_pdf]
             for c_image in clean_temp:
-                if pdf_path and os.path.exists(c_image):
+                if c_image and os.path.exists(c_image):
                     os.remove(c_image)
 
     return results