Skip to content

Commit

Permalink
openai updates
Browse files Browse the repository at this point in the history
  • Loading branch information
AshishMahendra committed Jul 4, 2024
1 parent 34ecc8b commit d18552d
Show file tree
Hide file tree
Showing 8 changed files with 452 additions and 0 deletions.
52 changes: 52 additions & 0 deletions ask_monk/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
# PDF Question Answering API

This project provides an API for processing PDF documents and answering questions based on their content.

## Features

- Ingest PDF documents and process them into semantic chunks.
- Answer questions based on the content of the ingested documents.
- Provide feedback on the answers.
- Highlight text in PDFs and return the modified files.

## Installation

1. Clone the repository:
```bash
git clone https://github.com/yourusername/yourrepository.git
```

2. Navigate to the project directory:
```bash
cd yourrepository
```

3. Install the dependencies:
```bash
pip install -r requirements.txt
```

4. Download the spaCy model:
```bash
python -m spacy download en_core_web_trf
```

## Usage

1. Start the FastAPI application:
```bash
uvicorn app:app --host 0.0.0.0 --port 8000
```

2. Use the API endpoints to process documents, ask questions, provide feedback, and highlight text in PDFs.

## Endpoints

- **POST /api/run_ingest**: Ingest and process PDF documents.
- **POST /api/prompt_route**: Ask questions based on the content of the ingested documents.
- **POST /api/feedback**: Provide feedback on the answers.
- **POST /api/highlight_pdf**: Highlight text in PDFs and return the modified files.

## License

This project is licensed under the MIT License.
145 changes: 145 additions & 0 deletions ask_monk/app.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
from typing import List
from utils.pdf_processing import process_documents
from utils.question_answering import answer_question_from_pdf
from utils.highlight import highlight_text_in_pdf
import os
import logging
import traceback
import tempfile
import httpx
import boto3
import uvicorn
from urllib.parse import urlparse
from threading import Lock


class FileInfo(BaseModel):
    """A single file entry in an ingest request payload."""

    id: int  # numeric identifier of the file record
    name: str  # display name of the file
    file: str  # URL of the file to download (models/schemas.py uses HttpUrl here)


class FolderInfo(BaseModel):
    """A folder of files to ingest.

    NOTE(review): duplicates models/schemas.py:FolderInfo but with plain str
    URLs instead of HttpUrl — consider importing the shared schema instead.
    """

    id: int  # numeric identifier of the folder record
    uid: str  # externally visible unique id of the folder
    name: str  # display name of the folder
    slug: str  # URL-friendly name of the folder
    user_id: int  # id of the owning user
    url: str  # URL of the folder resource
    files: List[FileInfo]  # files contained in this folder


class DocumentData(BaseModel):
    """Top-level request body for /api/run_ingest: wraps a folder of files."""

    data: FolderInfo  # the folder whose files should be ingested


class QuestionRequest(BaseModel):
    """Request body for /api/prompt_route."""

    question: str  # the user's question
    document_url: str  # URL of the (already ingested) document to answer from


class FeedbackModel(BaseModel):
    """Request body for /api/feedback."""

    user_prompt: str  # the prompt the feedback refers to
    feedback: str  # free-form feedback text


class HighlightRequest(BaseModel):
    """One highlight job for /api/highlight_pdf."""

    pdf_name: str  # URL of the PDF (parsed as an S3 URL by the endpoint)
    page_number: int  # 1-based page number to highlight
    highlight_text: str  # text to search for and highlight on that page


app = FastAPI()

# Allow cross-origin requests from any host.
# NOTE(review): wildcard origins combined with allow_credentials=True is very
# permissive — confirm this is intended outside of development.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Serializes /api/prompt_route so only one question is answered at a time.
request_lock = Lock()


@app.post("/api/run_ingest")
async def ingest_from_json(document_data: DocumentData):
    """Download and index every file referenced in the request payload.

    Returns a success message plus per-file results; any failure is surfaced
    as an HTTP 500 with the exception text.
    """
    urls = [entry.file for entry in document_data.data.files]
    try:
        results = await process_documents(urls)
    except Exception as exc:
        traceback.print_exc()
        raise HTTPException(status_code=500, detail=str(exc))
    return {"message": "Documents processed and vectorstore updated successfully", "results": results}


@app.post("/api/prompt_route")
def prompt_route(question_request: QuestionRequest):
    """Answer a question against a previously ingested document.

    BUGFIX: this was an ``async def`` that blocked on a ``threading.Lock`` and
    a synchronous ``answer_question_from_pdf`` call — both run directly on the
    event loop, freezing every other request while one question is processed.
    Declared as a plain ``def`` instead: FastAPI executes sync endpoints in a
    worker thread, so the lock still serializes question answering but no
    longer stalls the event loop.

    Raises HTTP 500 with the exception text on any failure.
    """
    with request_lock:
        try:
            answer, references = answer_question_from_pdf(question_request.document_url, question_request.question)
            return {"answer": answer, "references": references}
        except Exception as e:
            traceback.print_exc()
            raise HTTPException(status_code=500, detail=str(e))


@app.post("/api/feedback")
async def receive_feedback(feedback: FeedbackModel):
    """Acknowledge user feedback on a previous answer (logged to stdout only)."""
    message = f"Received feedback for '{feedback.user_prompt}': {feedback.feedback}"
    print(message)
    return {"message": "Thank you for your feedback!"}


@app.post("/api/highlight_pdf")
async def highlight_pdf_endpoint(highlight_requests: List[HighlightRequest]):
    """Download each PDF, highlight the requested text, upload the rendered
    page image to S3, and return one result entry per request.

    Each result is either ``{"pdf_name": ..., "highlighted_image": url}`` or
    ``{"pdf_name": ..., "error": message}``; one failing request does not
    abort the others.
    """
    results = []
    for request in highlight_requests:
        pdf_path = None
        # BUGFIX: highlighted_pdf must be bound before `finally` runs — an
        # exception raised before highlight_text_in_pdf() previously left it
        # undefined, so the cleanup itself crashed with UnboundLocalError and
        # that error escaped the per-request try, aborting the whole endpoint.
        highlighted_pdf = None
        try:
            # pdf_name is expected to be an S3 URL; derive the bucket and key
            # prefix from it so the highlighted image lands next to the PDF.
            parsed_url = urlparse(request.pdf_name)
            bucket = parsed_url.netloc.split(".")[0]
            key_prefix = os.path.dirname(parsed_url.path).strip("/")

            # Fetch the PDF into a local temp file for the highlighter to open.
            with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
                response = httpx.get(request.pdf_name)
                response.raise_for_status()
                tmp_file.write(response.content)
                pdf_path = tmp_file.name

            highlighted_pdf = highlight_text_in_pdf(pdf_path, request.page_number, request.highlight_text)

            if highlighted_pdf:
                image_name = f"{key_prefix}/highlighted_page_{request.page_number}.png"
                s3_image_url = upload_image_to_s3(highlighted_pdf, bucket, image_name)
                results.append({"pdf_name": request.pdf_name, "highlighted_image": s3_image_url})
            else:
                results.append(
                    {"pdf_name": request.pdf_name, "error": "No matching text found or failed to create highlight."}
                )
        except Exception as e:
            logging.error(f"Error highlighting {request.pdf_name}: {e}")
            results.append({"pdf_name": request.pdf_name, "error": str(e)})
        finally:
            # Always remove the temp download and the rendered image from disk.
            if pdf_path and os.path.exists(pdf_path):
                os.remove(pdf_path)
            if highlighted_pdf and os.path.exists(highlighted_pdf):
                os.remove(highlighted_pdf)

    return results


def upload_image_to_s3(image_path, bucket, object_name):
    """Upload a local file to S3 and return its public https URL.

    NOTE(review): on failure this returns the exception message string rather
    than raising or returning None — callers receive a truthy non-URL value.
    Confirm callers can tolerate that before relying on the return value.
    """
    s3_client = boto3.client("s3")
    try:
        s3_client.upload_file(image_path, bucket, object_name)
        return f"https://{bucket}.s3.amazonaws.com/{object_name}"
    except Exception as e:
        logging.error(f"Failed to upload {image_path} to {bucket}/{object_name}: {e}")
        return str(e)


if __name__ == "__main__":
    # Development entry point; matches the README's `uvicorn app:app` command.
    uvicorn.run(app, host="0.0.0.0", port=8000)
38 changes: 38 additions & 0 deletions ask_monk/models/schemas.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
from pydantic import BaseModel, HttpUrl
from typing import List


class FileInfo(BaseModel):
    """A single file within a folder; ``file`` is validated as a URL."""

    id: int  # numeric identifier of the file record
    name: str  # display name of the file
    file: HttpUrl  # validated download URL (app.py's copy uses plain str)


class FolderInfo(BaseModel):
    """A folder of files; URL fields are validated as HttpUrl."""

    id: int  # numeric identifier of the folder record
    uid: str  # externally visible unique id of the folder
    name: str  # display name of the folder
    slug: str  # URL-friendly name of the folder
    user_id: int  # id of the owning user
    url: HttpUrl  # validated URL of the folder resource
    files: List[FileInfo]  # files contained in this folder


class DocumentData(BaseModel):
    """Top-level ingest payload: wraps a folder of files."""

    data: FolderInfo  # the folder whose files should be ingested


class QuestionRequest(BaseModel):
    """A question to answer against an ingested document."""

    question: str  # the user's question
    document_url: str  # URL of the document to answer from


class FeedbackModel(BaseModel):
    """User feedback on a previous answer."""

    user_prompt: str  # the prompt the feedback refers to
    feedback: str  # free-form feedback text


class HighlightRequest(BaseModel):
    """One highlight job: which text to highlight on which page of which PDF."""

    pdf_name: str  # URL/name of the PDF
    page_number: int  # 1-based page number to highlight
    highlight_text: str  # text to search for and highlight
11 changes: 11 additions & 0 deletions ask_monk/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
fastapi
uvicorn
PyMuPDF  # provides the `fitz` import; the bare `fitz` package on PyPI is unrelated
pdfplumber  # imported by utils/highlight.py but previously missing
spacy
scikit-learn
sentence-transformers
openai
httpx
boto3
pydantic
langchain
32 changes: 32 additions & 0 deletions ask_monk/utils/embeddings.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
from openai import OpenAI
import os
from langchain.vectorstores import Chroma
from constants import CHROMA_SETTINGS, PERSIST_DIRECTORY

# Module-level OpenAI client; the API key is read from the OPENAI_API_KEY
# environment variable.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))


def get_openai_embeddings(text):
    """Return the embedding vector for *text* from text-embedding-ada-002."""
    result = client.embeddings.create(model="text-embedding-ada-002", input=text)
    return result.data[0].embedding


def get_embeddings(text_chunks):
    """Embed every chunk in *text_chunks*, one API call each, preserving order."""
    return [get_openai_embeddings(chunk) for chunk in text_chunks]


def save_embeddings(embeddings, texts):
    """Persist precomputed embeddings and their source texts to a Chroma store.

    Parameters: *embeddings* is a list of embedding vectors, *texts* the
    corresponding source strings.  The store is written under
    PERSIST_DIRECTORY using the module's CHROMA_SETTINGS.

    NOTE(review): ``Chroma.from_embeddings`` is not part of the current
    langchain Chroma API (``from_texts`` / ``from_documents`` are) — confirm
    this call against the pinned langchain version before shipping.
    """
    # exist_ok=True avoids the check-then-create race of the previous
    # os.path.exists() / os.makedirs() pair.
    os.makedirs(PERSIST_DIRECTORY, exist_ok=True)

    db = Chroma.from_embeddings(
        texts,
        embeddings,
        persist_directory=PERSIST_DIRECTORY,
        client_settings=CHROMA_SETTINGS,
    )
    db.persist()
57 changes: 57 additions & 0 deletions ask_monk/utils/highlight.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
import pdfplumber
from pdf_processing import preprocess_text
import re


def find_text_positions(page, search_text):
    """Return bounding boxes of the words on *page* that overlap *search_text*.

    Both the query and the page text are normalized with preprocess_text()
    before matching, which is what lets matches start or end mid-word.

    Parameters: *page* is a pdfplumber page; *search_text* is the raw query.
    Returns a list of (x0, top, x1, bottom) tuples, one per overlapping word
    per match; empty if the text is not found.
    """
    search_text = preprocess_text(search_text)
    words = page.extract_words(keep_blank_chars=True)
    # Page text reconstructed as the words joined by single spaces, then
    # normalized.  NOTE(review): the offsets below assume this equals the
    # individually preprocessed words joined by single spaces — verify that
    # preprocess_text preserves word boundaries and does not merge spaces.
    page_text = preprocess_text(" ".join([word["text"] for word in words]))

    # Map each word to its [start, end) character span within page_text.
    word_positions = []
    current_index = 0
    for word in words:
        start = current_index
        end = start + len(preprocess_text(word["text"]))
        word_positions.append((start, end, word))
        current_index = end + 1  # Account for space between words

    # Case-insensitive literal search (regex metacharacters are escaped).
    pattern = re.compile(re.escape(search_text), re.IGNORECASE)
    matches = list(pattern.finditer(page_text))

    # Collect the box of every word whose span intersects any match span.
    bounding_boxes = []
    for match in matches:
        match_start, match_end = match.start(), match.end()
        for start, end, word in word_positions:
            if start < match_end and end > match_start:  # Check for any overlap
                bounding_boxes.append((word["x0"], word["top"], word["x1"], word["bottom"]))

    return bounding_boxes


def highlight_text_in_pdf(pdf_path, page_number, highlight_text):
    """Render page *page_number* of *pdf_path* with *highlight_text* highlighted.

    Returns the path of the saved PNG on success, or None when the text is
    not found on the page or any error occurs.

    BUGFIX: the except branch previously returned ``str(e)`` — a truthy,
    non-path string that callers (which test the return value for truthiness
    and then treat it as a file path to upload and delete) would act on.  It
    now returns None, the same sentinel as the "no match" case.  Also removed
    an unused ``words`` local (the words are extracted inside
    find_text_positions).
    """
    try:
        with pdfplumber.open(pdf_path) as pdf:
            # pdfplumber pages are 0-indexed; the API takes a 1-based number.
            page = pdf.pages[page_number - 1]
            bounding_boxes = find_text_positions(page, highlight_text)

            if not bounding_boxes:
                print("No matching text found.")
                return None

            # Render at high resolution and draw one rectangle per matched word.
            page_image = page.to_image(resolution=400)
            for box in bounding_boxes:
                page_image.draw_rect(box, fill=(255, 255, 0, 64), stroke="orange", stroke_width=3)

            # NOTE(review): saved into the working directory — concurrent
            # requests for the same page number will overwrite each other;
            # consider a tempfile instead.
            output_file_path = f"highlighted_page_{page_number}.png"
            page_image.save(output_file_path, quality=95)
            print(f"Highlighted text saved to {output_file_path}")

            return output_file_path

    except Exception as e:
        print(f"An error occurred: {e}")
        return None
Loading

0 comments on commit d18552d

Please sign in to comment.