Commit

updated code to have a highlight PDF API
AshishMahendra committed May 12, 2024
1 parent 00f4890 commit 9cb18a4
Showing 2 changed files with 175 additions and 62 deletions.
169 changes: 111 additions & 58 deletions ingest.py
@@ -1,23 +1,20 @@
import logging
import os
import shutil
import tempfile
import httpx
from io import BytesIO
import traceback
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from utils import get_embeddings
from langchain.document_loaders import UnstructuredFileLoader
import re
from PyPDF2 import PdfReader
from constants import (
CHROMA_SETTINGS,
EMBEDDING_MODEL_NAME,
PERSIST_DIRECTORY,
)

import difflib
import pdfplumber


@@ -28,16 +25,6 @@ def file_log(logentry):
print(logentry + "\n")


def fix_broken_words(text):
hyphenated_words_pattern = r"(\w+-\n\w+)"

def _fix_match(match):
return match.group(1).replace("-\n", "")

fixed_text = re.sub(hyphenated_words_pattern, _fix_match, text)
return fixed_text


def clear_existing_data():
if os.path.exists(PERSIST_DIRECTORY):
for file in os.listdir(PERSIST_DIRECTORY):
@@ -49,20 +36,119 @@ def clear_existing_data():
logging.info("Cleared existing data from the database.")


def load_pdf_document(file_content: BytesIO, url) -> list[Document]:
def load_pdf_document(file_content: BytesIO, pdf_path):
"""Read PDF and return list of Document objects for all pages."""
documents = []
try:
with pdfplumber.open(file_content) as pdf:
for i, page in enumerate(pdf.pages):
text = page.extract_text()
if text:
doc = Document(page_content=text, metadata={"source": url, "page_number": i + 1})
documents.append(doc)
except Exception as ex:
logging.error(f"Error loading PDF document from {url}: {ex}")
with pdfplumber.open(file_content) as pdf:
for i, page in enumerate(pdf.pages):
text = page.extract_text()
if text:
normalized_text = preprocess_text(text)
documents.append(
Document(page_content=normalized_text, metadata={"source": pdf_path, "page_number": i + 1})
)
return documents
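# Illustrative usage sketch (hypothetical file name, not from the diff): load a
# local PDF into memory and get one Document per non-empty page, each tagged
# with {"source": <path>, "page_number": <n>} metadata.
#
#     with open("sample.pdf", "rb") as fh:
#         docs = load_pdf_document(BytesIO(fh.read()), "sample.pdf")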


def preprocess_text(text):
"""Lowercase, remove hyphenation and extra spaces from text."""
# Remove hyphens that might have been incorrectly added at line breaks
text = text.replace("-\n", "")
text = text.replace("\n", " ")
return " ".join(text.lower().strip().split())


def debug_extract_words(page):
"""Debug function to extract words from the page, with added details."""
words = page.extract_words(keep_blank_chars=True)
# print(f"Extracted Words: {[word['text'] for word in words]}")
return words


def find_text_positions(page, search_text):
"""Find bounding boxes for text matches with improved matching logic."""
search_text = preprocess_text(search_text)
words = debug_extract_words(page) # Debugging text extraction with visual output
page_text = preprocess_text(" ".join([word["text"] for word in words]))
matcher = difflib.SequenceMatcher(None, page_text, search_text)
matches = [match for match in matcher.get_matching_blocks() if match.size > 0]

bounding_boxes = []
current_word_index = 0
for word in words:
word_text = preprocess_text(word["text"])
word_len = len(word_text)

# Check if current word is within any match range
if any(match.a <= current_word_index < match.a + match.size for match in matches):
bounding_boxes.append((word["x0"], word["top"], word["x1"], word["bottom"]))

current_word_index += len(preprocess_text(word["text"] + " "))

return bounding_boxes
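# Sketch of the difflib alignment used above, on toy strings: each Match block
# carries .a (offset into the page text), .b (offset into the search text), and
# .size (length of the common run).
#
#     matcher = difflib.SequenceMatcher(None, "the quick brown fox", "brown fox")
#     blocks = [m for m in matcher.get_matching_blocks() if m.size > 0]
#     # blocks[0] == Match(a=10, b=0, size=9)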


def highlight_text_in_pdf(pdf_path, page_number, highlight_text):
"""Highlight the specified text on the given page."""
try:
with pdfplumber.open(pdf_path) as pdf:
page = pdf.pages[page_number - 1]
print(f"Highlighting text on Page {page_number}")

# Extract bounding boxes for the specified text
bounding_boxes = find_text_positions(page, highlight_text)
page_image = page.to_image(resolution=400)

for box in bounding_boxes:
page_image.draw_rect(
box,
fill=(255, 255, 0, 64), # Semi-transparent yellow fill
stroke="orange", # Vivid orange stroke
stroke_width=3, # Slightly thicker stroke
)
output_file_path = f"highlighted_page_{page_number}.png"
page_image.save(output_file_path, quality=95)
print(f"Highlighted text saved to {output_file_path}")
return output_file_path

except Exception as e:
print(f"An error occurred: {e}")
return str(e)
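# Illustrative call (hypothetical path, page, and phrase): renders the page at
# 400 dpi, draws a semi-transparent rectangle over each matched word, and writes
# highlighted_page_<n>.png to the current working directory (or returns the
# error message as a string on failure).
#
#     out = highlight_text_in_pdf("sample.pdf", 2, "total revenue for 2023")
#     # out == "highlighted_page_2.png" on success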


async def process_documents(file_urls, device_type):
results = []
full_texts = []
clear_existing_data()
async with httpx.AsyncClient() as client:
for url in file_urls:
try:
response = await client.get(url)
response.raise_for_status() # ensure successful response
with BytesIO(response.content) as pdf_file:
logging.info(f"Loading documents from {pdf_file}")
documents = load_pdf_document(pdf_file, url)
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
texts = text_splitter.split_documents(documents)
logging.info(f"Loaded {len(documents)} documents from {url}")
logging.info(f"Split into {len(texts)} chunks of text")
full_texts.extend(texts)
except Exception as e:
traceback.print_exc()
logging.error(f"Error processing PDF from {url}: {e}")
results.append({"url": url, "error": str(e)})

embeddings = get_embeddings(device_type)

logging.info(f"Loaded embeddings from {EMBEDDING_MODEL_NAME}")
db = Chroma.from_documents(
full_texts,
embeddings,
persist_directory=PERSIST_DIRECTORY,
client_settings=CHROMA_SETTINGS,
)
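# Illustrative driver (placeholder URL and device type): process_documents is a
# coroutine, so it needs an event loop; it clears PERSIST_DIRECTORY first and
# then embeds the split chunks into the Chroma store.
#
#     import asyncio
#     asyncio.run(process_documents(["https://example.com/sample.pdf"], "cpu"))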


# def load_pdf_document(file_content: str, url) -> list[Document]:
# documents = []
# try:
@@ -110,36 +196,3 @@ def load_pdf_document(file_content: BytesIO, url) -> list[Document]:
# os.unlink(tmp_file_path)

# return documents


68 changes: 64 additions & 4 deletions local_gpt_fast_api.py
@@ -9,20 +9,21 @@
from langchain.embeddings import HuggingFaceInstructEmbeddings
import traceback
from fastapi.middleware.cors import CORSMiddleware

from run_localGPT import load_model
from prompt_template_utils import get_prompt_template

from langchain.vectorstores import Chroma
from ingest import process_documents
from ingest import process_documents, highlight_text_in_pdf
from constants import (
CHROMA_SETTINGS,
EMBEDDING_MODEL_NAME,
PERSIST_DIRECTORY,
MODEL_ID,
MODEL_BASENAME,
)

import tempfile
import httpx
import boto3
from urllib.parse import urlparse
from threading import Lock
from pydantic import BaseModel, HttpUrl
from typing import List
@@ -184,5 +185,64 @@ def receive_feedback(feedback: FeedbackModel):
return {"message": "Thank you for your feedback!"}


class HighlightRequest(BaseModel):
pdf_name: str
page_number: int
highlight_text: str


def upload_image_to_s3(image_path, bucket, object_name):
s3_client = boto3.client("s3")
try:
s3_client.upload_file(image_path, bucket, object_name)
return f"https://{bucket}.s3.amazonaws.com/{object_name}"
except Exception as e:
logging.error(f"Failed to upload {image_path} to {bucket}/{object_name}: {e}")
return str(e)
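# Illustrative call (placeholder bucket and key): relies on boto3 picking up AWS
# credentials from the environment or instance profile, and returns either the
# object URL or the error message as a string.
#
#     url = upload_image_to_s3("highlighted_page_2.png", "my-bucket",
#                              "docs/highlighted_page_2.png")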


@app.post("/api/highlight_pdf")
def highlight_pdf_endpoint(highlight_requests: List[HighlightRequest]):
results = []

for request in highlight_requests:
pdf_path = None  # Initialize so cleanup in the `finally` block is safe
highlighted_pdf = None
try:
# Extract bucket and key information from the original URL
parsed_url = urlparse(request.pdf_name)
bucket = parsed_url.netloc.split(".")[0]
key_prefix = os.path.dirname(parsed_url.path).strip("/")

# Download the file from the provided URL to a temporary file
with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
response = httpx.get(request.pdf_name)
response.raise_for_status()
tmp_file.write(response.content)
pdf_path = tmp_file.name

# Highlight the text in the PDF
highlighted_pdf = highlight_text_in_pdf(pdf_path, request.page_number, request.highlight_text)

# Upload the highlighted image back to the S3 bucket
image_name = f"{key_prefix}/highlighted_page_{request.page_number}.png"
s3_image_url = upload_image_to_s3(highlighted_pdf, bucket, image_name)

# Add the result with the S3 URL of the highlighted image
results.append({"pdf_name": request.pdf_name, "highlighted_image": s3_image_url})

except Exception as e:
logging.error(f"Error highlighting {request.pdf_name}: {e}")
results.append({"pdf_name": request.pdf_name, "error": str(e)})

finally:
# Clean up the temporary download and any generated highlight image
for temp_file in (pdf_path, highlighted_pdf):
if temp_file and os.path.exists(temp_file):
os.remove(temp_file)

return results


if __name__ == "__main__":
uvicorn.run(app, host="0.0.0.0", port=8500)
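
A minimal client sketch for exercising the new highlight endpoint (hypothetical S3 URL, page number, and phrase; assumes the service is running locally on port 8500 as in the __main__ block above):

import httpx

# Payload mirrors the HighlightRequest model: pdf_name, page_number, highlight_text.
# The S3 URL, page, and phrase below are placeholders.
payload = [
    {
        "pdf_name": "https://my-bucket.s3.amazonaws.com/docs/sample.pdf",
        "page_number": 2,
        "highlight_text": "total revenue for 2023",
    }
]

response = httpx.post("http://localhost:8500/api/highlight_pdf", json=payload, timeout=120)
print(response.json())  # e.g. [{"pdf_name": "...", "highlighted_image": "https://..."}]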
