forked from PromtEngineer/localGPT
Commit d18552d (1 parent: 34ecc8b)
Showing 8 changed files with 452 additions and 0 deletions.
@@ -0,0 +1,52 @@
# PDF Question Answering API

This project provides an API for processing PDF documents and answering questions based on their content.

## Features

- Ingest PDF documents and process them into semantic chunks.
- Answer questions based on the content of the ingested documents.
- Provide feedback on the answers.
- Highlight text in PDFs and return images of the highlighted pages.

## Installation

1. Clone the repository:

   ```bash
   git clone https://github.com/yourusername/yourrepository.git
   ```

2. Navigate to the project directory:

   ```bash
   cd yourrepository
   ```

3. Install the dependencies:

   ```bash
   pip install -r requirements.txt
   ```

4. Download the spaCy model:

   ```bash
   python -m spacy download en_core_web_trf
   ```

## Usage

1. Start the FastAPI application:

   ```bash
   uvicorn app:app --host 0.0.0.0 --port 8000
   ```

2. Use the API endpoints to process documents, ask questions, provide feedback, and highlight text in PDFs (see the example request below).

## Endpoints

- **POST /api/run_ingest**: Ingest and process PDF documents.
- **POST /api/prompt_route**: Ask questions based on the content of the ingested documents.
- **POST /api/feedback**: Provide feedback on the answers.
- **POST /api/highlight_pdf**: Highlight text in PDFs and return links to images of the highlighted pages.
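## Example Request

For example, once documents have been ingested, a question can be sent to the prompt endpoint with `httpx`. This is a minimal sketch: the host, question, and document URL are placeholders to adapt to your deployment.

```python
import httpx

# Placeholder values; point these at your running server and an ingested document.
response = httpx.post(
    "http://localhost:8000/api/prompt_route",
    json={
        "question": "What does the agreement say about termination?",
        "document_url": "https://example.com/files/agreement.pdf",
    },
    timeout=120.0,
)
response.raise_for_status()
result = response.json()
print(result["answer"])      # generated answer
print(result["references"])  # supporting references
```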
## License

This project is licensed under the MIT License.
@@ -0,0 +1,145 @@
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
from typing import List
from utils.pdf_processing import process_documents
from utils.question_answering import answer_question_from_pdf
from utils.highlight import highlight_text_in_pdf
import os
import logging
import traceback
import tempfile
import httpx
import boto3
import uvicorn
from urllib.parse import urlparse
from threading import Lock


class FileInfo(BaseModel):
    id: int
    name: str
    file: str


class FolderInfo(BaseModel):
    id: int
    uid: str
    name: str
    slug: str
    user_id: int
    url: str
    files: List[FileInfo]


class DocumentData(BaseModel):
    data: FolderInfo


class QuestionRequest(BaseModel):
    question: str
    document_url: str


class FeedbackModel(BaseModel):
    user_prompt: str
    feedback: str


class HighlightRequest(BaseModel):
    pdf_name: str
    page_number: int
    highlight_text: str


app = FastAPI()

app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Serializes prompt handling; note that holding a threading.Lock inside an async
# endpoint blocks the event loop while a request is being processed.
request_lock = Lock()


@app.post("/api/run_ingest")
async def ingest_from_json(document_data: DocumentData):
    file_urls = [file.file for file in document_data.data.files]
    try:
        results = await process_documents(file_urls)
        return {"message": "Documents processed and vectorstore updated successfully", "results": results}
    except Exception as e:
        traceback.print_exc()
        raise HTTPException(status_code=500, detail=str(e))


@app.post("/api/prompt_route")
async def prompt_route(question_request: QuestionRequest):
    with request_lock:
        try:
            answer, references = answer_question_from_pdf(question_request.document_url, question_request.question)
            return {"answer": answer, "references": references}
        except Exception as e:
            traceback.print_exc()
            raise HTTPException(status_code=500, detail=str(e))


@app.post("/api/feedback")
async def receive_feedback(feedback: FeedbackModel):
    print(f"Received feedback for '{feedback.user_prompt}': {feedback.feedback}")
    return {"message": "Thank you for your feedback!"}


@app.post("/api/highlight_pdf")
async def highlight_pdf_endpoint(highlight_requests: List[HighlightRequest]):
    results = []
    for request in highlight_requests:
        pdf_path = None
        highlighted_pdf = None  # initialize so the finally block is safe if an error occurs early
        try:
            parsed_url = urlparse(request.pdf_name)
            bucket = parsed_url.netloc.split(".")[0]
            key_prefix = os.path.dirname(parsed_url.path).strip("/")

            # Download the PDF to a temporary file before highlighting.
            with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
                response = httpx.get(request.pdf_name)
                response.raise_for_status()
                tmp_file.write(response.content)
                pdf_path = tmp_file.name

            highlighted_pdf = highlight_text_in_pdf(pdf_path, request.page_number, request.highlight_text)

            if highlighted_pdf:
                image_name = f"{key_prefix}/highlighted_page_{request.page_number}.png"
                s3_image_url = upload_image_to_s3(highlighted_pdf, bucket, image_name)
                results.append({"pdf_name": request.pdf_name, "highlighted_image": s3_image_url})
            else:
                results.append(
                    {"pdf_name": request.pdf_name, "error": "No matching text found or failed to create highlight."}
                )
        except Exception as e:
            logging.error(f"Error highlighting {request.pdf_name}: {e}")
            results.append({"pdf_name": request.pdf_name, "error": str(e)})
        finally:
            # Clean up temporary files regardless of success or failure.
            if pdf_path and os.path.exists(pdf_path):
                os.remove(pdf_path)
            if highlighted_pdf and os.path.exists(highlighted_pdf):
                os.remove(highlighted_pdf)

    return results


def upload_image_to_s3(image_path, bucket, object_name):
    s3_client = boto3.client("s3")
    try:
        s3_client.upload_file(image_path, bucket, object_name)
        return f"https://{bucket}.s3.amazonaws.com/{object_name}"
    except Exception as e:
        logging.error(f"Failed to upload {image_path} to {bucket}/{object_name}: {e}")
        return str(e)


if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8000)
@@ -0,0 +1,38 @@
from pydantic import BaseModel, HttpUrl
from typing import List


class FileInfo(BaseModel):
    id: int
    name: str
    file: HttpUrl


class FolderInfo(BaseModel):
    id: int
    uid: str
    name: str
    slug: str
    user_id: int
    url: HttpUrl
    files: List[FileInfo]


class DocumentData(BaseModel):
    data: FolderInfo


class QuestionRequest(BaseModel):
    question: str
    document_url: str


class FeedbackModel(BaseModel):
    user_prompt: str
    feedback: str


class HighlightRequest(BaseModel):
    pdf_name: str
    page_number: int
    highlight_text: str
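These nested models define the body expected by `POST /api/run_ingest`: a `DocumentData` wrapper around a `FolderInfo` and its `files`. A minimal sketch of such a payload, in which every id, name, and URL is a placeholder, might look like this:

```python
import httpx

# Hypothetical payload; all ids, names, and URLs are placeholders.
payload = {
    "data": {
        "id": 1,
        "uid": "folder-abc123",
        "name": "Contracts",
        "slug": "contracts",
        "user_id": 42,
        "url": "https://example.com/folders/contracts",
        "files": [
            {"id": 10, "name": "agreement.pdf", "file": "https://example.com/files/agreement.pdf"}
        ],
    }
}

response = httpx.post("http://localhost:8000/api/run_ingest", json=payload, timeout=300.0)
print(response.json())  # {"message": "...", "results": ...}
```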
@@ -0,0 +1,11 @@
fastapi
uvicorn
fitz
spacy
scikit-learn
sentence-transformers
openai
httpx
boto3
pydantic
langchain
pdfplumber  # imported by the highlight utility
@@ -0,0 +1,32 @@
from openai import OpenAI
import os
from langchain.vectorstores import Chroma
from constants import CHROMA_SETTINGS, PERSIST_DIRECTORY

client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))


def get_openai_embeddings(text):
    response = client.embeddings.create(model="text-embedding-ada-002", input=text)
    return response.data[0].embedding


def get_embeddings(text_chunks):
    embeddings = []
    for chunk in text_chunks:
        embedding = get_openai_embeddings(chunk)
        embeddings.append(embedding)
    return embeddings


def save_embeddings(embeddings, texts):
    if not os.path.exists(PERSIST_DIRECTORY):
        os.makedirs(PERSIST_DIRECTORY)

    # Assumes a Chroma wrapper that exposes from_embeddings; most langchain releases
    # instead build the store with from_texts/from_documents and an embedding function.
    db = Chroma.from_embeddings(
        texts,
        embeddings,
        persist_directory=PERSIST_DIRECTORY,
        client_settings=CHROMA_SETTINGS,
    )
    db.persist()
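A quick sanity check of the embedding helpers above might look like the following sketch; it assumes `OPENAI_API_KEY` is set and the sample strings are placeholders.

```python
# Hypothetical check using get_embeddings from the module above.
chunks = ["The first semantic chunk.", "The second semantic chunk."]
vectors = get_embeddings(chunks)
print(len(vectors), len(vectors[0]))  # 2 vectors, each 1536-dimensional for text-embedding-ada-002
```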
@@ -0,0 +1,57 @@
import pdfplumber
from utils.pdf_processing import preprocess_text
import re


def find_text_positions(page, search_text):
    """Locate bounding boxes for a search string, handling matches that start or end mid-word."""
    search_text = preprocess_text(search_text)
    words = page.extract_words(keep_blank_chars=True)
    page_text = preprocess_text(" ".join([word["text"] for word in words]))

    # Record the character span each word occupies in the normalized page text.
    word_positions = []
    current_index = 0
    for word in words:
        start = current_index
        end = start + len(preprocess_text(word["text"]))
        word_positions.append((start, end, word))
        current_index = end + 1  # Account for the space between words

    pattern = re.compile(re.escape(search_text), re.IGNORECASE)
    matches = list(pattern.finditer(page_text))

    bounding_boxes = []
    for match in matches:
        match_start, match_end = match.start(), match.end()
        for start, end, word in word_positions:
            if start < match_end and end > match_start:  # Check for any overlap
                bounding_boxes.append((word["x0"], word["top"], word["x1"], word["bottom"]))

    return bounding_boxes


def highlight_text_in_pdf(pdf_path, page_number, highlight_text):
    """Highlight the specified text on the given page and save the page as a PNG image."""
    try:
        with pdfplumber.open(pdf_path) as pdf:
            page = pdf.pages[page_number - 1]
            bounding_boxes = find_text_positions(page, highlight_text)

            if not bounding_boxes:
                print("No matching text found.")
                return None

            page_image = page.to_image(resolution=400)
            for box in bounding_boxes:
                page_image.draw_rect(box, fill=(255, 255, 0, 64), stroke="orange", stroke_width=3)

            output_file_path = f"highlighted_page_{page_number}.png"
            page_image.save(output_file_path, quality=95)
            print(f"Highlighted text saved to {output_file_path}")

            return output_file_path

    except Exception as e:
        print(f"An error occurred: {e}")
        return None  # signal failure to the caller rather than returning an error string
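Outside the API, the utility can be exercised directly; in this sketch the file name, page number, and search text are placeholders.

```python
# Hypothetical local test; "sample.pdf" and the search text are placeholders.
image_path = highlight_text_in_pdf("sample.pdf", page_number=2, highlight_text="net revenue")
if image_path:
    print(f"Highlighted page image written to {image_path}")
else:
    print("No match found on that page.")
```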