Skip to content

Commit

Permalink
openai updates
Browse files Browse the repository at this point in the history
  • Loading branch information
AshishMahendra committed Jul 4, 2024
1 parent 34ecc8b commit d18552d
Show file tree
Hide file tree
Showing 8 changed files with 452 additions and 0 deletions.
52 changes: 52 additions & 0 deletions ask_monk/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
# PDF Question Answering API

This project provides an API for processing PDF documents and answering questions based on their content.

## Features

- Ingest PDF documents and process them into semantic chunks.
- Answer questions based on the content of the ingested documents.
- Provide feedback on the answers.
- Highlight text in PDFs and return the modified files.

## Installation

1. Clone the repository:
```bash
git clone https://github.com/yourusername/yourrepository.git
```

2. Navigate to the project directory:
```bash
cd yourrepository
```

3. Install the dependencies:
```bash
pip install -r requirements.txt
```

4. Download the spaCy model:
```bash
python -m spacy download en_core_web_trf
```

## Usage

1. Start the FastAPI application:
```bash
uvicorn app:app --host 0.0.0.0 --port 8000
```

2. Use the API endpoints to process documents, ask questions, provide feedback, and highlight text in PDFs.

## Endpoints

- **POST /api/run_ingest**: Ingest and process PDF documents.
- **POST /api/prompt_route**: Ask questions based on the content of the ingested documents.
- **POST /api/feedback**: Provide feedback on the answers.
- **POST /api/highlight_pdf**: Highlight text in PDFs and return the modified files.

## License

This project is licensed under the MIT License.
145 changes: 145 additions & 0 deletions ask_monk/app.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
from typing import List
from utils.pdf_processing import process_documents
from utils.question_answering import answer_question_from_pdf
from utils.highlight import highlight_text_in_pdf
import os
import logging
import traceback
import tempfile
import httpx
import boto3
import uvicorn
from urllib.parse import urlparse
from threading import Lock


class FileInfo(BaseModel):
    """A single file entry in an ingest request payload."""

    id: int  # numeric identifier of the file record
    name: str  # display name of the file
    file: str  # URL of the file to download (models/schemas.py uses HttpUrl here)


class FolderInfo(BaseModel):
    """A folder of files to ingest.

    NOTE(review): duplicates models/schemas.py:FolderInfo but with plain str
    URLs instead of HttpUrl — consider importing the shared schema instead.
    """

    id: int  # numeric identifier of the folder record
    uid: str  # externally visible unique id of the folder
    name: str  # display name of the folder
    slug: str  # URL-friendly name of the folder
    user_id: int  # id of the owning user
    url: str  # URL of the folder resource
    files: List[FileInfo]  # files contained in this folder


class DocumentData(BaseModel):
    """Top-level request body for /api/run_ingest: wraps a folder of files."""

    data: FolderInfo  # the folder whose files should be ingested


class QuestionRequest(BaseModel):
    """Request body for /api/prompt_route."""

    question: str  # the user's question
    document_url: str  # URL of the (already ingested) document to answer from


class FeedbackModel(BaseModel):
    """Request body for /api/feedback."""

    user_prompt: str  # the prompt the feedback refers to
    feedback: str  # free-form feedback text


class HighlightRequest(BaseModel):
    """One highlight job for /api/highlight_pdf."""

    pdf_name: str  # URL of the PDF (parsed as an S3 URL by the endpoint)
    page_number: int  # 1-based page number to highlight
    highlight_text: str  # text to search for and highlight on that page


app = FastAPI()

# Allow cross-origin requests from any host.
# NOTE(review): wildcard origins combined with allow_credentials=True is very
# permissive — confirm this is intended outside of development.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Serializes /api/prompt_route so only one question is answered at a time.
request_lock = Lock()


@app.post("/api/run_ingest")
async def ingest_from_json(document_data: DocumentData):
    """Download and index every file referenced in the request payload.

    Returns a success message plus per-file results; any failure is surfaced
    as an HTTP 500 with the exception text.
    """
    urls = [entry.file for entry in document_data.data.files]
    try:
        results = await process_documents(urls)
    except Exception as exc:
        traceback.print_exc()
        raise HTTPException(status_code=500, detail=str(exc))
    return {"message": "Documents processed and vectorstore updated successfully", "results": results}


@app.post("/api/prompt_route")
def prompt_route(question_request: QuestionRequest):
    """Answer a question against a previously ingested document.

    BUGFIX: this was an ``async def`` that blocked on a ``threading.Lock`` and
    a synchronous ``answer_question_from_pdf`` call — both run directly on the
    event loop, freezing every other request while one question is processed.
    Declared as a plain ``def`` instead: FastAPI executes sync endpoints in a
    worker thread, so the lock still serializes question answering but no
    longer stalls the event loop.

    Raises HTTP 500 with the exception text on any failure.
    """
    with request_lock:
        try:
            answer, references = answer_question_from_pdf(question_request.document_url, question_request.question)
            return {"answer": answer, "references": references}
        except Exception as e:
            traceback.print_exc()
            raise HTTPException(status_code=500, detail=str(e))


@app.post("/api/feedback")
async def receive_feedback(feedback: FeedbackModel):
    """Acknowledge user feedback on a previous answer (logged to stdout only)."""
    message = f"Received feedback for '{feedback.user_prompt}': {feedback.feedback}"
    print(message)
    return {"message": "Thank you for your feedback!"}


@app.post("/api/highlight_pdf")
async def highlight_pdf_endpoint(highlight_requests: List[HighlightRequest]):
    """Download each PDF, highlight the requested text, upload the rendered
    page image to S3, and return one result entry per request.

    Each result is either ``{"pdf_name": ..., "highlighted_image": url}`` or
    ``{"pdf_name": ..., "error": message}``; one failing request does not
    abort the others.
    """
    results = []
    for request in highlight_requests:
        pdf_path = None
        # BUGFIX: highlighted_pdf must be bound before `finally` runs — an
        # exception raised before highlight_text_in_pdf() previously left it
        # undefined, so the cleanup itself crashed with UnboundLocalError and
        # that error escaped the per-request try, aborting the whole endpoint.
        highlighted_pdf = None
        try:
            # pdf_name is expected to be an S3 URL; derive the bucket and key
            # prefix from it so the highlighted image lands next to the PDF.
            parsed_url = urlparse(request.pdf_name)
            bucket = parsed_url.netloc.split(".")[0]
            key_prefix = os.path.dirname(parsed_url.path).strip("/")

            # Fetch the PDF into a local temp file for the highlighter to open.
            with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
                response = httpx.get(request.pdf_name)
                response.raise_for_status()
                tmp_file.write(response.content)
                pdf_path = tmp_file.name

            highlighted_pdf = highlight_text_in_pdf(pdf_path, request.page_number, request.highlight_text)

            if highlighted_pdf:
                image_name = f"{key_prefix}/highlighted_page_{request.page_number}.png"
                s3_image_url = upload_image_to_s3(highlighted_pdf, bucket, image_name)
                results.append({"pdf_name": request.pdf_name, "highlighted_image": s3_image_url})
            else:
                results.append(
                    {"pdf_name": request.pdf_name, "error": "No matching text found or failed to create highlight."}
                )
        except Exception as e:
            logging.error(f"Error highlighting {request.pdf_name}: {e}")
            results.append({"pdf_name": request.pdf_name, "error": str(e)})
        finally:
            # Always remove the temp download and the rendered image from disk.
            if pdf_path and os.path.exists(pdf_path):
                os.remove(pdf_path)
            if highlighted_pdf and os.path.exists(highlighted_pdf):
                os.remove(highlighted_pdf)

    return results


def upload_image_to_s3(image_path, bucket, object_name):
    """Upload a local file to S3 and return its public https URL.

    NOTE(review): on failure this returns the exception message string rather
    than raising or returning None — callers receive a truthy non-URL value.
    Confirm callers can tolerate that before relying on the return value.
    """
    s3_client = boto3.client("s3")
    try:
        s3_client.upload_file(image_path, bucket, object_name)
        return f"https://{bucket}.s3.amazonaws.com/{object_name}"
    except Exception as e:
        logging.error(f"Failed to upload {image_path} to {bucket}/{object_name}: {e}")
        return str(e)


if __name__ == "__main__":
    # Development entry point; matches the README's `uvicorn app:app` command.
    uvicorn.run(app, host="0.0.0.0", port=8000)
38 changes: 38 additions & 0 deletions ask_monk/models/schemas.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
from pydantic import BaseModel, HttpUrl
from typing import List


class FileInfo(BaseModel):
    """A single file within a folder; ``file`` is validated as a URL."""

    id: int  # numeric identifier of the file record
    name: str  # display name of the file
    file: HttpUrl  # validated download URL (app.py's copy uses plain str)


class FolderInfo(BaseModel):
    """A folder of files; URL fields are validated as HttpUrl."""

    id: int  # numeric identifier of the folder record
    uid: str  # externally visible unique id of the folder
    name: str  # display name of the folder
    slug: str  # URL-friendly name of the folder
    user_id: int  # id of the owning user
    url: HttpUrl  # validated URL of the folder resource
    files: List[FileInfo]  # files contained in this folder


class DocumentData(BaseModel):
    """Top-level ingest payload: wraps a folder of files."""

    data: FolderInfo  # the folder whose files should be ingested


class QuestionRequest(BaseModel):
    """A question to answer against an ingested document."""

    question: str  # the user's question
    document_url: str  # URL of the document to answer from


class FeedbackModel(BaseModel):
    """User feedback on a previous answer."""

    user_prompt: str  # the prompt the feedback refers to
    feedback: str  # free-form feedback text


class HighlightRequest(BaseModel):
    """One highlight job: which text to highlight on which page of which PDF."""

    pdf_name: str  # URL/name of the PDF
    page_number: int  # 1-based page number to highlight
    highlight_text: str  # text to search for and highlight
11 changes: 11 additions & 0 deletions ask_monk/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
fastapi
uvicorn
PyMuPDF  # provides the `fitz` import; the bare `fitz` package on PyPI is unrelated
pdfplumber  # imported by utils/highlight.py but previously missing
spacy
scikit-learn
sentence-transformers
openai
httpx
boto3
pydantic
langchain
32 changes: 32 additions & 0 deletions ask_monk/utils/embeddings.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
from openai import OpenAI
import os
from langchain.vectorstores import Chroma
from constants import CHROMA_SETTINGS, PERSIST_DIRECTORY

# Module-level OpenAI client; the API key is read from the OPENAI_API_KEY
# environment variable.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))


def get_openai_embeddings(text):
    """Return the embedding vector for *text* from text-embedding-ada-002."""
    result = client.embeddings.create(model="text-embedding-ada-002", input=text)
    return result.data[0].embedding


def get_embeddings(text_chunks):
    """Embed every chunk in *text_chunks*, one API call each, preserving order."""
    return [get_openai_embeddings(chunk) for chunk in text_chunks]


def save_embeddings(embeddings, texts):
    """Persist precomputed embeddings and their source texts to a Chroma store.

    Parameters: *embeddings* is a list of embedding vectors, *texts* the
    corresponding source strings.  The store is written under
    PERSIST_DIRECTORY using the module's CHROMA_SETTINGS.

    NOTE(review): ``Chroma.from_embeddings`` is not part of the current
    langchain Chroma API (``from_texts`` / ``from_documents`` are) — confirm
    this call against the pinned langchain version before shipping.
    """
    # exist_ok=True avoids the check-then-create race of the previous
    # os.path.exists() / os.makedirs() pair.
    os.makedirs(PERSIST_DIRECTORY, exist_ok=True)

    db = Chroma.from_embeddings(
        texts,
        embeddings,
        persist_directory=PERSIST_DIRECTORY,
        client_settings=CHROMA_SETTINGS,
    )
    db.persist()
57 changes: 57 additions & 0 deletions ask_monk/utils/highlight.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
import pdfplumber
from pdf_processing import preprocess_text
import re


def find_text_positions(page, search_text):
    """Return bounding boxes of the words on *page* that overlap *search_text*.

    Both the query and the page text are normalized with preprocess_text()
    before matching, which is what lets matches start or end mid-word.

    Parameters: *page* is a pdfplumber page; *search_text* is the raw query.
    Returns a list of (x0, top, x1, bottom) tuples, one per overlapping word
    per match; empty if the text is not found.
    """
    search_text = preprocess_text(search_text)
    words = page.extract_words(keep_blank_chars=True)
    # Page text reconstructed as the words joined by single spaces, then
    # normalized.  NOTE(review): the offsets below assume this equals the
    # individually preprocessed words joined by single spaces — verify that
    # preprocess_text preserves word boundaries and does not merge spaces.
    page_text = preprocess_text(" ".join([word["text"] for word in words]))

    # Map each word to its [start, end) character span within page_text.
    word_positions = []
    current_index = 0
    for word in words:
        start = current_index
        end = start + len(preprocess_text(word["text"]))
        word_positions.append((start, end, word))
        current_index = end + 1  # Account for space between words

    # Case-insensitive literal search (regex metacharacters are escaped).
    pattern = re.compile(re.escape(search_text), re.IGNORECASE)
    matches = list(pattern.finditer(page_text))

    # Collect the box of every word whose span intersects any match span.
    bounding_boxes = []
    for match in matches:
        match_start, match_end = match.start(), match.end()
        for start, end, word in word_positions:
            if start < match_end and end > match_start:  # Check for any overlap
                bounding_boxes.append((word["x0"], word["top"], word["x1"], word["bottom"]))

    return bounding_boxes


def highlight_text_in_pdf(pdf_path, page_number, highlight_text):
    """Render page *page_number* of *pdf_path* with *highlight_text* highlighted.

    Returns the path of the saved PNG on success, or None when the text is
    not found on the page or any error occurs.

    BUGFIX: the except branch previously returned ``str(e)`` — a truthy,
    non-path string that callers (which test the return value for truthiness
    and then treat it as a file path to upload and delete) would act on.  It
    now returns None, the same sentinel as the "no match" case.  Also removed
    an unused ``words`` local (the words are extracted inside
    find_text_positions).
    """
    try:
        with pdfplumber.open(pdf_path) as pdf:
            # pdfplumber pages are 0-indexed; the API takes a 1-based number.
            page = pdf.pages[page_number - 1]
            bounding_boxes = find_text_positions(page, highlight_text)

            if not bounding_boxes:
                print("No matching text found.")
                return None

            # Render at high resolution and draw one rectangle per matched word.
            page_image = page.to_image(resolution=400)
            for box in bounding_boxes:
                page_image.draw_rect(box, fill=(255, 255, 0, 64), stroke="orange", stroke_width=3)

            # NOTE(review): saved into the working directory — concurrent
            # requests for the same page number will overwrite each other;
            # consider a tempfile instead.
            output_file_path = f"highlighted_page_{page_number}.png"
            page_image.save(output_file_path, quality=95)
            print(f"Highlighted text saved to {output_file_path}")

            return output_file_path

    except Exception as e:
        print(f"An error occurred: {e}")
        return None
Loading

0 comments on commit d18552d

Please sign in to comment.