-
Notifications
You must be signed in to change notification settings - Fork 1
/
01_Ingest_Data.py
132 lines (117 loc) · 4.54 KB
/
01_Ingest_Data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
from os import path
import streamlit as st
import traceback
import requests
import urllib.parse
import sys
import logging
from batch.utilities.helpers.config.config_helper import ConfigHelper
from batch.utilities.helpers.env_helper import EnvHelper
from batch.utilities.helpers.azure_blob_storage_client import AzureBlobStorageClient
# Add the parent of this file's directory to the module search path.
# NOTE(review): this runs after the `batch.*` imports above, so it cannot
# influence how those imports are resolved -- confirm it is still needed.
_parent_dir = path.join(path.dirname(__file__), "..")
sys.path.append(_parent_dir)

# Shared module-level helper instance and logger used by the rest of the page.
env_helper: EnvHelper = EnvHelper()
logger = logging.getLogger(__name__)

# Page configuration: title, favicon, wide layout, no custom menu items.
_favicon = path.join("images", "favicon.ico")
st.set_page_config(
    page_title="Ingest Data",
    page_icon=_favicon,
    layout="wide",
    menu_items=None,
)

# CSS that hides the #MainMenu, footer and header elements of the page.
mod_page_style = """
<style>
#MainMenu {visibility: hidden;}
footer {visibility: hidden;}
header {visibility: hidden;}
</style>
"""
# The markup is raw HTML, so it has to be rendered with unsafe_allow_html.
st.markdown(mod_page_style, unsafe_allow_html=True)
def reprocess_all():
    """Ask the backend to reprocess all documents and report the outcome.

    Sends a POST to the ``/api/BatchStartProcessing`` endpoint resolved
    against ``env_helper.BACKEND_URL``. When ``env_helper.FUNCTION_KEY`` is
    set, the key and a fixed client id are sent as query parameters.

    A 200 response is shown as a success message, any other status is shown
    as an error with the response text, and any exception raised while making
    the request is rendered on the page as a traceback.
    """
    endpoint = urllib.parse.urljoin(
        env_helper.BACKEND_URL, "/api/BatchStartProcessing"
    )

    # Query parameters are only populated when a function key is configured.
    query = (
        {"code": env_helper.FUNCTION_KEY, "clientId": "clientKey"}
        if env_helper.FUNCTION_KEY is not None
        else {}
    )

    try:
        reply = requests.post(endpoint, params=query)
        if reply.status_code != 200:
            st.error(f"Error: {reply.text}")
            return
        st.success(
            f"{reply.text}\nPlease note this is an asynchronous process and may take a few minutes to complete."
        )
    except Exception:
        st.error(traceback.format_exc())
def add_urls():
    """Submit the URLs typed into the ``urls`` text area for embedding.

    Reads the text stored under ``st.session_state["urls"]``, treats every
    line as one URL and passes the resulting list to ``add_url_embeddings``.

    Surrounding whitespace is trimmed and blank lines are dropped, so an
    empty text area, a trailing newline or Windows line endings no longer
    produce a request for an empty URL. When no URL remains, a warning is
    shown and nothing is sent.
    """
    raw_text = st.session_state["urls"] or ""
    # splitlines() copes with "\n" and "\r\n"; the filter removes blank rows.
    urls = [line.strip() for line in raw_text.splitlines() if line.strip()]
    if not urls:
        st.warning("Please enter at least one URL.")
        return
    add_url_embeddings(urls)
def add_url_embeddings(urls: list[str]):
    """Request embeddings from the backend for each URL in ``urls``.

    Every non-blank URL is POSTed as JSON (``{"url": <url>}``) to the
    ``/api/AddURLEmbeddings`` endpoint resolved against
    ``env_helper.BACKEND_URL``. When ``env_helper.FUNCTION_KEY`` is set, the
    key and a fixed client id are sent as query parameters. A success message
    is shown on the page for each URL that the backend accepts.

    Args:
        urls: URLs to ingest. Entries are stripped of surrounding whitespace
            and entries that are empty afterwards are skipped.

    Raises:
        ValueError: On the first response whose status is not OK. URLs that
            come after the failing one are not processed.
    """
    params = {}
    if env_helper.FUNCTION_KEY is not None:
        params["code"] = env_helper.FUNCTION_KEY
        params["clientId"] = "clientKey"

    # The endpoint does not depend on the URL being ingested: build it once.
    backend_url = urllib.parse.urljoin(
        env_helper.BACKEND_URL, "/api/AddURLEmbeddings"
    )

    for raw_url in urls:
        url = raw_url.strip()
        if not url:
            # Blank entries (e.g. empty lines of a multi-line input) would
            # only make the backend reject the request.
            continue
        r = requests.post(url=backend_url, params=params, json={"url": url})
        if not r.ok:
            raise ValueError(f"Error {r.status_code}: {r.text}")
        st.success(f"Embeddings added successfully for {url}")
# Page body: a batch document uploader and a URL ingestion form. Any exception
# raised while building the page is rendered on the page as a traceback.
try:
    with st.expander("Add documents in Batch", expanded=True):
        config = ConfigHelper.get_active_config_or_default()
        # Limit the uploader to the document types listed in the active config.
        file_type = [
            processor.document_type for processor in config.document_processors
        ]
        uploaded_files = st.file_uploader(
            "Upload a document to add it to the Azure Storage Account, compute embeddings and add them to the Azure AI Search index. Check your configuration for available document processors.",
            type=file_type,
            accept_multiple_files=True,
        )
        blob_client = AzureBlobStorageClient()
        if uploaded_files is not None:
            # The script is re-executed on every interaction with the page.
            # Remembering only the last uploaded name would re-upload every
            # file on each rerun as soon as two or more files are selected,
            # so keep the full set of files already sent to storage.
            if "uploaded_file_keys" not in st.session_state:
                st.session_state["uploaded_file_keys"] = set()
            uploaded_keys = st.session_state["uploaded_file_keys"]

            for up in uploaded_files:
                file_key = (up.name, up.size)
                if file_key in uploaded_keys:
                    continue
                # Upload a new file; the bytes are only read when needed.
                # "filename" / "file_url" keep describing the latest upload.
                st.session_state["filename"] = up.name
                st.session_state["file_url"] = blob_client.upload_file(
                    up.getvalue(), up.name, metadata={"title": up.name}
                )
                # Record the file only after the upload call has returned.
                uploaded_keys.add(file_key)
            if len(uploaded_files) > 0:
                st.success(
                    f"{len(uploaded_files)} documents uploaded. Embeddings computation in progress. \nPlease note this is an asynchronous process and may take a few minutes to complete.\nYou can check for further details in the Azure Function logs."
                )

        col1, col2, col3 = st.columns([2, 1, 2])
        with col3:
            st.button(
                "Reprocess all documents in the Azure Storage account",
                on_click=reprocess_all,
            )

    with st.expander("Add URLs to the knowledge base", expanded=True):
        col1, col2 = st.columns([3, 1])
        with col1:
            # The label names the button that actually exists further down.
            st.text_area(
                "Add URLs and then click on 'Process and ingest web pages'",
                placeholder="PLACE YOUR URLS HERE SEPARATED BY A NEW LINE",
                height=100,
                key="urls",
            )

        with col2:
            # Read-only display of the configured embedding model.
            st.selectbox(
                "Embeddings models",
                [env_helper.AZURE_OPENAI_EMBEDDING_MODEL],
                disabled=True,
            )

        st.button(
            "Process and ingest web pages",
            on_click=add_urls,
            key="add_url",
        )
except Exception:
    st.error(traceback.format_exc())