Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added Functionality to read JSON files #772

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 40 additions & 3 deletions constants.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
import os

import json
# from dotenv import load_dotenv
from chromadb.config import Settings

# https://python.langchain.com/en/latest/modules/indexes/document_loaders/examples/excel.html?highlight=xlsx#microsoft-excel
from langchain.document_loaders import CSVLoader, PDFMinerLoader, TextLoader, UnstructuredExcelLoader, Docx2txtLoader
from langchain.document_loaders import CSVLoader, PDFMinerLoader, TextLoader, UnstructuredExcelLoader, Docx2txtLoader,JSONLoader
from langchain.document_loaders import UnstructuredFileLoader, UnstructuredMarkdownLoader
from langchain.document_loaders import UnstructuredHTMLLoader

Expand Down Expand Up @@ -40,7 +40,43 @@
### From experimenting with the Llama-2-7B-Chat-GGML model on 8GB VRAM, these values work:
# N_GPU_LAYERS = 20
# N_BATCH = 512

class JSONArrayLoader:
    """Load a JSON file, yielding one document per entry of a top-level array.

    When the file parses to a list of dictionaries, ``jq_schema`` is applied
    to every entry (via the jq array iterator ``.[]``) in a single
    ``JSONLoader`` pass. Any other file shape, or a failure of the per-entry
    pass, falls back to running ``JSONLoader`` over the whole file with
    ``jq_schema`` unchanged.

    Args:
        file_path: Path of the JSON file to read (decoded as UTF-8).
        jq_schema: jq expression selecting the content; ``"."`` selects the
            whole entry (or the whole file on the fallback path).
        text_content: Passed through to ``JSONLoader`` unchanged.
            NOTE(review): JSONLoader presumably rejects non-string content
            when this is true, so dictionary entries likely need
            ``text_content=False`` -- confirm against the installed
            langchain version.
    """

    def __init__(self, file_path: str, jq_schema: str = ".", text_content: bool = True):
        # Was misspelled ``_init_``, which Python never calls as a constructor,
        # so ``JSONArrayLoader(path)`` raised TypeError.
        self.file_path = file_path
        self.jq_schema = jq_schema
        self.text_content = text_content

    def _entry_schema(self) -> str:
        """Return the jq expression that applies ``jq_schema`` to each array entry."""
        if self.jq_schema == ".":
            return ".[]"
        return f".[] | {self.jq_schema}"

    def _run_loader(self, jq_schema: str) -> list:
        """Run ``JSONLoader`` over the file with ``jq_schema`` and return its documents."""
        loader = JSONLoader(
            file_path=self.file_path,
            jq_schema=jq_schema,
            text_content=self.text_content,
        )
        return loader.load()

    def load(self) -> list:
        """Return the documents extracted from the file.

        Returns:
            An empty list for an empty JSON array; otherwise whatever
            ``JSONLoader.load()`` returns for the chosen schema.

        Raises:
            OSError: If the file cannot be opened.
            Exception: Whatever ``JSONLoader`` raises on the fallback path.
        """
        # Read eagerly so the handle is closed before JSONLoader reopens the file.
        with open(self.file_path, encoding="utf-8") as json_file:
            raw_data = json_file.read()

        try:
            data = json.loads(raw_data)
        except (ValueError, RecursionError):
            # Not parseable here; let JSONLoader report on the file itself.
            data = None

        is_entry_list = isinstance(data, list) and all(
            isinstance(entry, dict) for entry in data
        )
        if not is_entry_list:
            return self._run_loader(self.jq_schema)
        if not data:
            return []

        try:
            # One pass over the file; the previous code reloaded the whole file
            # per entry and appended the same first document every time.
            return self._run_loader(self._entry_schema())
        except Exception:
            # Best-effort fallback kept from the original, narrowed from a bare
            # ``except`` so KeyboardInterrupt/SystemExit are no longer swallowed.
            return self._run_loader(self.jq_schema)


# https://python.langchain.com/en/latest/_modules/langchain/document_loaders/excel.html#UnstructuredExcelLoader
DOCUMENT_MAP = {
Expand All @@ -55,6 +91,7 @@
".xlsx": UnstructuredExcelLoader,
".docx": Docx2txtLoader,
".doc": Docx2txtLoader,
".json":JSONArrayLoader
}

# Default Instructor Model
Expand Down
1 change: 1 addition & 0 deletions ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import torch
from langchain.docstore.document import Document
from langchain.text_splitter import Language, RecursiveCharacterTextSplitter
from langchain_text_splitters import RecursiveJsonSplitter
from langchain.vectorstores import Chroma
from utils import get_embeddings

Expand Down