Commit

updated code to have a highlight PDF API
AshishMahendra committed May 12, 2024
1 parent 00f4890 commit 9cb18a4
Showing 2 changed files with 175 additions and 62 deletions.
169 changes: 111 additions & 58 deletions ingest.py
@@ -1,23 +1,20 @@
import logging
import os
import shutil
import tempfile
import httpx
from io import BytesIO
import traceback
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from utils import get_embeddings
from langchain.document_loaders import UnstructuredFileLoader
import re
from PyPDF2 import PdfReader
from constants import (
CHROMA_SETTINGS,
EMBEDDING_MODEL_NAME,
PERSIST_DIRECTORY,
)

import difflib
import pdfplumber


@@ -28,16 +25,6 @@ def file_log(logentry):
print(logentry + "\n")


def fix_broken_words(text):
hyphenated_words_pattern = r"(\w+-\n\w+)"

def _fix_match(match):
return match.group(1).replace("-\n", "")

fixed_text = re.sub(hyphenated_words_pattern, _fix_match, text)
return fixed_text


def clear_existing_data():
if os.path.exists(PERSIST_DIRECTORY):
for file in os.listdir(PERSIST_DIRECTORY):
@@ -49,20 +36,119 @@ def clear_existing_data():
logging.info("Cleared existing data from the database.")


def load_pdf_document(file_content: BytesIO, url) -> list[Document]:
def load_pdf_document(file_content: BytesIO, pdf_path):
"""Read PDF and return list of Document objects for all pages."""
documents = []
try:
with pdfplumber.open(file_content) as pdf:
for i, page in enumerate(pdf.pages):
text = page.extract_text()
if text:
doc = Document(page_content=text, metadata={"source": url, "page_number": i + 1})
documents.append(doc)
except Exception as ex:
logging.error(f"Error loading PDF document from {url}: {ex}")
with pdfplumber.open(file_content) as pdf:
for i, page in enumerate(pdf.pages):
text = page.extract_text()
if text:
normalized_text = preprocess_text(text)
documents.append(
Document(page_content=normalized_text, metadata={"source": pdf_path, "page_number": i + 1})
)
return documents
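# Illustrative usage sketch (hypothetical file name, not from the diff): load a
# local PDF into memory and get one Document per non-empty page, each tagged
# with {"source": <path>, "page_number": <n>} metadata.
#
#     with open("sample.pdf", "rb") as fh:
#         docs = load_pdf_document(BytesIO(fh.read()), "sample.pdf")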


def preprocess_text(text):
"""Lowercase, remove hyphenation and extra spaces from text."""
# Remove hyphens that might have been incorrectly added at line breaks
text = text.replace("-\n", "")
text = text.replace("\n", " ")
return " ".join(text.lower().strip().split())


def debug_extract_words(page):
"""Debug function to extract words from the page, with added details."""
words = page.extract_words(keep_blank_chars=True)
# print(f"Extracted Words: {[word['text'] for word in words]}")
return words


def find_text_positions(page, search_text):
"""Find bounding boxes for text matches with improved matching logic."""
search_text = preprocess_text(search_text)
words = debug_extract_words(page) # Debugging text extraction with visual output
page_text = preprocess_text(" ".join([word["text"] for word in words]))
matcher = difflib.SequenceMatcher(None, page_text, search_text)
matches = [match for match in matcher.get_matching_blocks() if match.size > 0]

bounding_boxes = []
current_word_index = 0
for word in words:
word_text = preprocess_text(word["text"])
word_len = len(word_text)

# Check if current word is within any match range
if any(match.a <= current_word_index < match.a + match.size for match in matches):
bounding_boxes.append((word["x0"], word["top"], word["x1"], word["bottom"]))

current_word_index += len(preprocess_text(word["text"] + " "))

return bounding_boxes
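# Sketch of the difflib alignment used above, on toy strings: each Match block
# carries .a (offset into the page text), .b (offset into the search text), and
# .size (length of the common run).
#
#     matcher = difflib.SequenceMatcher(None, "the quick brown fox", "brown fox")
#     blocks = [m for m in matcher.get_matching_blocks() if m.size > 0]
#     # blocks[0] == Match(a=10, b=0, size=9)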


def highlight_text_in_pdf(pdf_path, page_number, highlight_text):
"""Highlight the specified text on the given page."""
try:
with pdfplumber.open(pdf_path) as pdf:
page = pdf.pages[page_number - 1]
print(f"Highlighting text on Page {page_number}")

# Extract bounding boxes for the specified text
bounding_boxes = find_text_positions(page, highlight_text)
page_image = page.to_image(resolution=400)

for box in bounding_boxes:
page_image.draw_rect(
box,
fill=(255, 255, 0, 64), # Semi-transparent yellow fill
stroke="orange", # Vivid orange stroke
stroke_width=3, # Slightly thicker stroke
)
output_file_path = f"highlighted_page_{page_number}.png"
page_image.save(output_file_path, quality=95)
print(f"Highlighted text saved to {output_file_path}")
return output_file_path

except Exception as e:
print(f"An error occurred: {e}")
return str(e)
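# Illustrative call (hypothetical path, page, and phrase): renders the page at
# 400 dpi, draws a semi-transparent rectangle over each matched word, and writes
# highlighted_page_<n>.png to the current working directory (or returns the
# error message as a string on failure).
#
#     out = highlight_text_in_pdf("sample.pdf", 2, "total revenue for 2023")
#     # out == "highlighted_page_2.png" on success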


async def process_documents(file_urls, device_type):
results = []
full_texts = []
clear_existing_data()
async with httpx.AsyncClient() as client:
for url in file_urls:
try:
response = await client.get(url)
response.raise_for_status() # ensure successful response
with BytesIO(response.content) as pdf_file:
logging.info(f"Loading documents from {pdf_file}")
documents = load_pdf_document(pdf_file, url)
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
texts = text_splitter.split_documents(documents)
logging.info(f"Loaded {len(documents)} documents from {url}")
logging.info(f"Split into {len(texts)} chunks of text")
full_texts.extend(texts)
except Exception as e:
traceback.print_exc()
logging.error(f"Error processing PDF from {url}: {e}")
results.append({"url": url, "error": str(e)})

embeddings = get_embeddings(device_type)

logging.info(f"Loaded embeddings from {EMBEDDING_MODEL_NAME}")
db = Chroma.from_documents(
full_texts,
embeddings,
persist_directory=PERSIST_DIRECTORY,
client_settings=CHROMA_SETTINGS,
)
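# Illustrative driver (placeholder URL and device type): process_documents is a
# coroutine, so it needs an event loop; it clears PERSIST_DIRECTORY first and
# then embeds the split chunks into the Chroma store.
#
#     import asyncio
#     asyncio.run(process_documents(["https://example.com/sample.pdf"], "cpu"))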


# def load_pdf_document(file_content: str, url) -> list[Document]:
# documents = []
# try:
@@ -110,36 +196,3 @@ def load_pdf_document(file_content: BytesIO, url) -> list[Document]:
# os.unlink(tmp_file_path)

# return documents


68 changes: 64 additions & 4 deletions local_gpt_fast_api.py
@@ -9,20 +9,21 @@
from langchain.embeddings import HuggingFaceInstructEmbeddings
import traceback
from fastapi.middleware.cors import CORSMiddleware

from run_localGPT import load_model
from prompt_template_utils import get_prompt_template

from langchain.vectorstores import Chroma
from ingest import process_documents
from ingest import process_documents, highlight_text_in_pdf
from constants import (
CHROMA_SETTINGS,
EMBEDDING_MODEL_NAME,
PERSIST_DIRECTORY,
MODEL_ID,
MODEL_BASENAME,
)

import tempfile
import httpx
import boto3
from urllib.parse import urlparse
from threading import Lock
from pydantic import BaseModel, HttpUrl
from typing import List
@@ -184,5 +185,64 @@ def receive_feedback(feedback: FeedbackModel):
return {"message": "Thank you for your feedback!"}


class HighlightRequest(BaseModel):
pdf_name: str
page_number: int
highlight_text: str


def upload_image_to_s3(image_path, bucket, object_name):
s3_client = boto3.client("s3")
try:
s3_client.upload_file(image_path, bucket, object_name)
return f"https://{bucket}.s3.amazonaws.com/{object_name}"
except Exception as e:
logging.error(f"Failed to upload {image_path} to {bucket}/{object_name}: {e}")
return str(e)
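# Illustrative call (placeholder bucket and key): relies on boto3 picking up AWS
# credentials from the environment or instance profile, and returns either the
# object URL or the error message as a string.
#
#     url = upload_image_to_s3("highlighted_page_2.png", "my-bucket",
#                              "docs/highlighted_page_2.png")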


@app.post("/api/highlight_pdf")
def highlight_pdf_endpoint(highlight_requests: List[HighlightRequest]):
results = []

for request in highlight_requests:
pdf_path = None  # Initialize so cleanup in the `finally` block is safe
highlighted_pdf = None
try:
# Extract bucket and key information from the original URL
parsed_url = urlparse(request.pdf_name)
bucket = parsed_url.netloc.split(".")[0]
key_prefix = os.path.dirname(parsed_url.path).strip("/")

# Download the file from the provided URL to a temporary file
with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
response = httpx.get(request.pdf_name)
response.raise_for_status()
tmp_file.write(response.content)
pdf_path = tmp_file.name

# Highlight the text in the PDF
highlighted_pdf = highlight_text_in_pdf(pdf_path, request.page_number, request.highlight_text)

# Upload the highlighted image back to the S3 bucket
image_name = f"{key_prefix}/highlighted_page_{request.page_number}.png"
s3_image_url = upload_image_to_s3(highlighted_pdf, bucket, image_name)

# Add the result with the S3 URL of the highlighted image
results.append({"pdf_name": request.pdf_name, "highlighted_image": s3_image_url})

except Exception as e:
logging.error(f"Error highlighting {request.pdf_name}: {e}")
results.append({"pdf_name": request.pdf_name, "error": str(e)})

finally:
# Clean up the temporary download and any generated highlight image
for temp_file in (pdf_path, highlighted_pdf):
if temp_file and os.path.exists(temp_file):
os.remove(temp_file)

return results


if __name__ == "__main__":
uvicorn.run(app, host="0.0.0.0", port=8500)
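
A minimal client sketch for exercising the new highlight endpoint (hypothetical S3 URL, page number, and phrase; assumes the service is running locally on port 8500 as in the __main__ block above):

import httpx

# Payload mirrors the HighlightRequest model: pdf_name, page_number, highlight_text.
# The S3 URL, page, and phrase below are placeholders.
payload = [
    {
        "pdf_name": "https://my-bucket.s3.amazonaws.com/docs/sample.pdf",
        "page_number": 2,
        "highlight_text": "total revenue for 2023",
    }
]

response = httpx.post("http://localhost:8500/api/highlight_pdf", json=payload, timeout=120)
print(response.json())  # e.g. [{"pdf_name": "...", "highlighted_image": "https://..."}]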
