Reduce repeated code for init and minor change on web UI
linjungz committed Aug 5, 2023
1 parent 3b74a62 commit 705042a
Showing 3 changed files with 150 additions and 88 deletions.
chat_cli.py (4 changes: 2 additions & 2 deletions)
@@ -12,8 +12,8 @@
 VECTORDB_PATH = os.getenv("VECTORDB_PATH")
 
 if VECTORDB_PATH is None:
-    typer.echo(typer.style("VECTORDB_PATH environment variable not found.", fg=typer.colors.RED))
-    raise typer.Exit()
+    typer.echo(typer.style("VECTORDB_PATH environment variable not found; default path ./data/vector_store will be used.", fg=typer.colors.RED))
+    VECTORDB_PATH = "./data/vector_store"
 
 @app.command()
 def ingest(
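The new fallback keeps the CLI usable without configuration instead of exiting. For reference, os.getenv also accepts a default value, so the same fallback can be folded into one line; a minimal sketch, not the commit's code, and it omits the warning echo:

import os

# one-line equivalent of the None check above; a typer.echo warning
# can still be printed separately when the variable is unset
VECTORDB_PATH = os.getenv("VECTORDB_PATH", "./data/vector_store")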
chat_web_st.py (10 changes: 5 additions & 5 deletions)
@@ -8,22 +8,22 @@
 available_indexes = docChatBot.get_available_indexes("./data/vector_store")
 
 # Add a placeholder option ahead of the existing vector stores
-index_options = ["Uploaded File"] + available_indexes
+index_options = ["-- Existing Vector Stores --"] + available_indexes
 
 with st.sidebar:
     st.title("💬 Chat with Your Doc")
     st.write("Upload a document and ask questions about it.")
 
-    # Dropdown for selecting an index or uploaded file
-    selected_index = st.selectbox('Select an index or upload a file:', index_options)
-
     with st.form("Upload and Process", True):
+        # Dropdown for selecting an existing vector store or an uploaded file
+        selected_index = st.selectbox('Select an existing vector store or upload a file to create one, then press the Process button', index_options)
+
         uploaded_file = st.file_uploader("Upload documents", type=["pdf", "md", "txt", "docx", "csv", "xml"])
         submitted = st.form_submit_button("Process")
 
         if submitted:
             try:
-                if selected_index == "Uploaded File":
+                if selected_index == "-- Existing Vector Stores --":
                     if uploaded_file:
                         ext_name = os.path.splitext(uploaded_file.name)[-1]
                         if ext_name not in [".pdf", ".md", ".txt", ".docx", ".csv", ".xml"]:
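Moving the selectbox into st.form is a behavioral change as well as a cosmetic one: widget values inside a form are batched and delivered in the same rerun that the submit button triggers, so the selection, the uploaded file, and the Process click arrive together. A minimal sketch of the pattern with placeholder option names, assuming only streamlit (the bare True passed to st.form above is its clear_on_submit parameter):

import streamlit as st

# widgets inside a form do not rerun the script per interaction;
# their values arrive in one batch when the submit button fires
with st.form("demo", clear_on_submit=True):
    choice = st.selectbox("Pick a source:", ["-- none --", "store-a", "store-b"])
    uploaded = st.file_uploader("Upload", type=["pdf", "txt"])
    submitted = st.form_submit_button("Process")

if submitted:
    st.write(choice, uploaded.name if uploaded else "no file")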
chatbot.py (224 changes: 143 additions & 81 deletions)
@@ -20,6 +20,12 @@
 import streamlit
 import glob
 
+REQUEST_TIMEOUT_DEFAULT = 10
+TEMPERATURE_DEFAULT = 0.0
+CHAT_MODEL_NAME_DEFAULT = "gpt-3.5-turbo"
+OPENAI_EMBEDDING_DEPLOYMENT_NAME_DEFAULT = "text-embedding-ada-002"
+CHUNK_SIZE_DEFAULT = 1000
+CHUNK_OVERLAP_DEFAULT = 0
 
 class StreamHandler(BaseCallbackHandler):
     def __init__(self, container, initial_text=""):
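StreamHandler (its __init__ is shown above; the hunk cuts off the rest of its body) is a LangChain callback that pushes streamed tokens into a Streamlit container. A minimal sketch of the callback shape it builds on, using a hypothetical handler with no UI dependency (on_llm_new_token is the LangChain hook invoked once per streamed token):

from langchain.callbacks.base import BaseCallbackHandler

class TokenCollector(BaseCallbackHandler):
    # hypothetical minimal variant of StreamHandler: accumulate tokens as text
    def __init__(self, initial_text: str = ""):
        self.text = initial_text

    def on_llm_new_token(self, token: str, **kwargs) -> None:
        self.text += token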
@@ -38,104 +44,161 @@ class DocChatbot:
     vector_db: FAISS
     chatchain: BaseConversationalRetrievalChain
 
-    def __init__(self) -> None:
-        #init for LLM and Embeddings
-        load_dotenv()
-        assert(os.getenv("OPENAI_API_KEY") is not None)
-        api_key = str(os.getenv("OPENAI_API_KEY"))
-        embedding_deployment = os.getenv("OPENAI_EMBEDDING_DEPLOYMENT_NAME")
-        request_timeout = int(os.getenv("REQUEST_TIMEOUT"))
-        temperature = float(os.getenv("TEMPERATURE"))
-        model_name = str(os.getenv("CHAT_MODEL_NAME"))
-
-        #check if user is using API from openai.com or Azure OpenAI Service by inspecting the api key
-        if api_key.startswith("sk-"):
-            # user is using API from openai.com
-            assert(len(api_key) == 51)
+    # configuration for API calls
+    request_timeout: int
+    temperature: float
+    chat_model_name: str
+    api_key: str
+
+    def init_llm_openai(self, streaming: bool, condense_question_container=None, answer_container=None) -> None:
+        # init for LLM using openai.com api
+
+        self.llm = ChatOpenAI(
+            temperature=self.temperature,
+            openai_api_key=self.api_key,
+            request_timeout=self.request_timeout,
+            model=self.chat_model_name, # Model name is needed for openai.com only
+            streaming=streaming,
+            callbacks=[StreamHandler(answer_container)] if streaming else []
+            ) # type: ignore
+
-        self.llm = ChatOpenAI(
-            temperature=temperature,
-            openai_api_key=api_key,
-            request_timeout=request_timeout,
-            model=model_name
+        if streaming:
+            self.condense_question_llm = ChatOpenAI(
+                temperature=self.temperature,
+                openai_api_key=self.api_key,
+                request_timeout=self.request_timeout,
+                streaming=True,
+                model=self.chat_model_name,
+                callbacks=[StreamHandler(condense_question_container, "🤔...")]
+                ) # type: ignore
         else:
-            # user is using Azure OpenAI Service
-            assert(os.getenv("OPENAI_GPT_DEPLOYMENT_NAME") is not None)
-            assert(os.getenv("OPENAI_API_BASE") is not None)
-            assert(len(api_key) == 32)
+            self.condense_question_llm = self.llm
 
-            self.llm = AzureChatOpenAI(
+    def init_llm_azure(self, streaming: bool, condense_question_container=None, answer_container=None) -> None:
+        # init for LLM using Azure OpenAI Service
+
+        assert(os.getenv("OPENAI_GPT_DEPLOYMENT_NAME") is not None)
+        assert(os.getenv("OPENAI_API_BASE") is not None)
+        assert(os.getenv("OPENAI_EMBEDDING_DEPLOYMENT_NAME") is not None)
+        assert(len(self.api_key) == 32)
+
+        self.llm = AzureChatOpenAI(
             deployment_name=os.getenv("OPENAI_GPT_DEPLOYMENT_NAME"),
+            temperature=self.temperature,
+            openai_api_version="2023-05-15",
+            openai_api_type="azure",
+            openai_api_base=os.getenv("OPENAI_API_BASE"),
+            openai_api_key=self.api_key,
+            request_timeout=self.request_timeout,
+            streaming=streaming,
+            callbacks=[StreamHandler(answer_container)] if streaming else []
+            ) # type: ignore
+
+        if streaming:
+            self.condense_question_llm = AzureChatOpenAI(
+                deployment_name=os.getenv("OPENAI_GPT_DEPLOYMENT_NAME"),
-            temperature=temperature,
+                temperature=self.temperature,
                 openai_api_version="2023-05-15",
                 openai_api_type="azure",
                 openai_api_base=os.getenv("OPENAI_API_BASE"),
-            openai_api_key=api_key,
-            request_timeout=request_timeout,
-            model=model_name
+                openai_api_key=os.getenv("OPENAI_API_KEY"),
+                request_timeout=self.request_timeout,
+                model=self.chat_model_name,
+                streaming=True,
+                callbacks=[StreamHandler(condense_question_container, "🤔...")]
                 ) # type: ignore
+        else:
+            self.condense_question_llm = self.llm
 
+    def __init__(self) -> None:
+        #init for LLM and Embeddings, without support for streaming
+
-        embedding_deployment = os.getenv("OPENAI_EMBEDDING_DEPLOYMENT_NAME")
+        #load environment variables
+        load_dotenv()
+        assert(os.getenv("OPENAI_API_KEY") is not None)
+        self.api_key = str(os.getenv("OPENAI_API_KEY"))
+        self.request_timeout = REQUEST_TIMEOUT_DEFAULT if os.getenv("REQUEST_TIMEOUT") is None else int(os.getenv("REQUEST_TIMEOUT"))
+        self.temperature = TEMPERATURE_DEFAULT if os.getenv("TEMPERATURE") is None else float(os.getenv("TEMPERATURE"))
+        self.chat_model_name = CHAT_MODEL_NAME_DEFAULT if os.getenv("CHAT_MODEL_NAME") is None else str(os.getenv("CHAT_MODEL_NAME"))
+
-        self.condense_question_llm = self.llm
+        #check if user is using API from openai.com or Azure OpenAI Service by inspecting the api key
+        if self.api_key.startswith("sk-"):
+            # user is using API from openai.com
+            assert(len(self.api_key) == 51)
+            self.init_llm_openai(False)
+        else:
+            # user is using Azure OpenAI Service
+            self.init_llm_azure(False)
+
+        embedding_deployment = OPENAI_EMBEDDING_DEPLOYMENT_NAME_DEFAULT if os.getenv("OPENAI_EMBEDDING_DEPLOYMENT_NAME") is None else str(os.getenv("OPENAI_EMBEDDING_DEPLOYMENT_NAME"))
         self.embeddings = OpenAIEmbeddings(
             deployment=embedding_deployment,
             chunk_size=1
             ) # type: ignore
 
     def init_streaming(self, condense_question_container, answer_container) -> None:
-        api_key = str(os.getenv("OPENAI_API_KEY"))
-        temperature=float(os.getenv("TEMPERATURE"))
-        request_timeout=int(os.getenv("REQUEST_TIMEOUT"))
-        model_name=str(os.getenv("CHAT_MODEL_NAME"))
-        if api_key.startswith("sk-"):
-            # user is using API from openai.com
-            self.llm = ChatOpenAI(
-                temperature=temperature,
-                openai_api_key=api_key,
-                request_timeout=request_timeout,
-                streaming=True,
-                model=model_name,
-                callbacks=[StreamHandler(answer_container)]
-                ) # type: ignore
+        #init for LLM and Embeddings, with support for streaming
+
-            self.condense_question_llm = ChatOpenAI(
-                temperature=temperature,
-                openai_api_key=api_key,
-                request_timeout=request_timeout,
-                streaming=True,
-                model=model_name,
-                callbacks=[StreamHandler(condense_question_container, "🤔...")]
-                ) # type: ignore
+        if self.api_key.startswith("sk-"):
+            # user is using API from openai.com
+            self.init_llm_openai(True, condense_question_container, answer_container)
         else:
             # user is using Azure OpenAI Service
-            self.llm = AzureChatOpenAI(
-                deployment_name=os.getenv("OPENAI_GPT_DEPLOYMENT_NAME"),
-                temperature=temperature,
-                openai_api_version="2023-05-15",
-                openai_api_type="azure",
-                openai_api_base=os.getenv("OPENAI_API_BASE"),
-                openai_api_key=os.getenv("OPENAI_API_KEY"),
-                request_timeout=request_timeout,
-                model=model_name,
-                streaming=True,
-                callbacks=[StreamHandler(answer_container)]
-                ) # type: ignore
-
-            self.condense_question_llm = AzureChatOpenAI(
-                deployment_name=os.getenv("OPENAI_GPT_DEPLOYMENT_NAME"),
-                temperature=temperature,
-                openai_api_version="2023-05-15",
-                openai_api_type="azure",
-                openai_api_base=os.getenv("OPENAI_API_BASE"),
-                openai_api_key=os.getenv("OPENAI_API_KEY"),
-                request_timeout=request_timeout,
-                model=model_name,
-                streaming=True,
-                callbacks=[StreamHandler(condense_question_container, "🤔...")]
-                ) # type: ignore
+            self.init_llm_azure(True, condense_question_container, answer_container)
+
+    # def init_streaming(self, condense_question_container, answer_container) -> None:
+    #     #init for LLM and Embeddings, with support for streaming
+
+    #     api_key = str(os.getenv("OPENAI_API_KEY"))
+    #     temperature=float(os.getenv("TEMPERATURE"))
+    #     request_timeout=int(os.getenv("REQUEST_TIMEOUT"))
+    #     model_name=str(os.getenv("CHAT_MODEL_NAME"))
+    #     if api_key.startswith("sk-"):
+    #         # user is using API from openai.com
+    #         self.llm = ChatOpenAI(
+    #             temperature=temperature,
+    #             openai_api_key=api_key,
+    #             request_timeout=request_timeout,
+    #             streaming=True,
+    #             model=model_name,
+    #             callbacks=[StreamHandler(answer_container)]
+    #             ) # type: ignore
+
+    #         self.condense_question_llm = ChatOpenAI(
+    #             temperature=temperature,
+    #             openai_api_key=api_key,
+    #             request_timeout=request_timeout,
+    #             streaming=True,
+    #             model=model_name,
+    #             callbacks=[StreamHandler(condense_question_container, "🤔...")]
+    #             ) # type: ignore
+    #     else:
+    #         # user is using Azure OpenAI Service
+    #         self.llm = AzureChatOpenAI(
+    #             deployment_name=os.getenv("OPENAI_GPT_DEPLOYMENT_NAME"),
+    #             temperature=temperature,
+    #             openai_api_version="2023-05-15",
+    #             openai_api_type="azure",
+    #             openai_api_base=os.getenv("OPENAI_API_BASE"),
+    #             openai_api_key=os.getenv("OPENAI_API_KEY"),
+    #             request_timeout=request_timeout,
+    #             model=model_name,
+    #             streaming=True,
+    #             callbacks=[StreamHandler(answer_container)]
+    #             ) # type: ignore
+
+    #         self.condense_question_llm = AzureChatOpenAI(
+    #             deployment_name=os.getenv("OPENAI_GPT_DEPLOYMENT_NAME"),
+    #             temperature=temperature,
+    #             openai_api_version="2023-05-15",
+    #             openai_api_type="azure",
+    #             openai_api_base=os.getenv("OPENAI_API_BASE"),
+    #             openai_api_key=os.getenv("OPENAI_API_KEY"),
+    #             request_timeout=request_timeout,
+    #             model=model_name,
+    #             streaming=True,
+    #             callbacks=[StreamHandler(condense_question_container, "🤔...")]
+    #             ) # type: ignore
 
     def init_chatchain(self, chain_type : str = "stuff") -> None:
         # init for ConversationalRetrievalChain
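The new __init__ repeats the pattern "X_DEFAULT if os.getenv(...) is None else cast(...)" once per setting. A hypothetical helper, not in the commit, shows how those lines could collapse further:

import os

def env_or_default(name: str, default, cast=str):
    # return the environment value converted with cast, or the default when unset
    value = os.getenv(name)
    return default if value is None else cast(value)

# e.g. self.request_timeout = env_or_default("REQUEST_TIMEOUT", REQUEST_TIMEOUT_DEFAULT, int)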
@@ -211,8 +274,8 @@ def save_vector_db_to_local(self, path: str, index_name: str):

     # split documents, generate embeddings and ingest to vector db
     def init_vector_db_from_documents(self, file_list: List[str]):
-        chunk_size = int(os.getenv("CHUNK_SIZE"))
-        chunk_overlap = int(os.getenv("CHUNK_OVERLAP"))
+        chunk_size = CHUNK_SIZE_DEFAULT if os.getenv("CHUNK_SIZE") is None else int(os.getenv("CHUNK_SIZE"))
+        chunk_overlap = CHUNK_OVERLAP_DEFAULT if os.getenv("CHUNK_OVERLAP") is None else int(os.getenv("CHUNK_OVERLAP"))
         text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
 
         docs = []
@@ -226,7 +289,6 @@ def init_vector_db_from_documents(self, file_list: List[str]):
             elif ext_name == ".docx":
                 loader = UnstructuredWordDocumentLoader(file)
             elif ext_name == ".pdf":
-                print("it's pdf")
                 loader = PyPDFLoader(file)
             elif ext_name == ".csv":
                 loader = CSVLoader(file_path=file)
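Besides introducing defaults, the chunking change fixes a latent crash: the old int(os.getenv("CHUNK_SIZE")) raises TypeError when the variable is unset, because int(None) is invalid. A minimal sketch of the fixed behavior, assuming the 2023-era langchain import path this repo uses:

import os
from langchain.text_splitter import RecursiveCharacterTextSplitter

# fall back to 1000/0 instead of crashing when the variables are unset
chunk_size = 1000 if os.getenv("CHUNK_SIZE") is None else int(os.getenv("CHUNK_SIZE"))
chunk_overlap = 0 if os.getenv("CHUNK_OVERLAP") is None else int(os.getenv("CHUNK_OVERLAP"))
text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)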
