-
Notifications
You must be signed in to change notification settings - Fork 0
/
web_chunking.py
146 lines (115 loc) · 4.44 KB
/
web_chunking.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
import requests
from bs4 import BeautifulSoup
from sentence_transformers import SentenceTransformer, util
import spacy
import ollama
# Sentence-embedding model used to score page sections against the user query.
EMBEDDING_MODEL_NAME = 'paraphrase-MiniLM-L6-v2'
model = SentenceTransformer(EMBEDDING_MODEL_NAME)

# spaCy English pipeline, retained for NER from the original code; nothing
# in this file currently calls it.
SPACY_MODEL_NAME = "en_core_web_sm"
nlp = spacy.load(SPACY_MODEL_NAME)
def fetch_and_chunk_by_sections(url, timeout=10):
    """
    Fetches a web page and splits its text into sections delimited by <h2> headings.

    Text from <p>, <ul> and <li> elements is accumulated into the section in
    progress. Each <h2> closes that section and starts a new one whose first
    line is the heading text. Every finished section is passed through
    remove_duplicates() before it is stored.

    Parameters:
        url (str): The URL to fetch content from.
        timeout (float): Seconds to wait for the server before giving up.

    Returns:
        list: A list of section strings, in document order.

    Raises:
        Exception: If the server answers with a status code other than 200.
    """
    # Without a timeout, requests waits indefinitely on an unresponsive server.
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        raise Exception(f"Error fetching the web page: {response.status_code}")

    soup = BeautifulSoup(response.content, 'html.parser')

    # Extract meaningful content based on <h2> sections
    sections = []
    current_section = ''
    for tag in soup.find_all(['h2', 'p', 'ul', 'li']):
        text = tag.get_text(separator=' ', strip=True)

        if tag.name == 'h2':
            # A heading always opens a new section; flush the previous one
            # only if something has been collected so far.
            if current_section:
                sections.append(remove_duplicates(current_section.strip()))
            current_section = text + '\n'
            continue

        # find_all() also yields elements nested inside a <p>, <ul> or <li>
        # that was already collected. The ancestor's get_text() contains
        # their text, so appending it again would duplicate the content.
        if tag.find_parent(['p', 'ul', 'li']) is not None:
            continue

        current_section += text + ' '

    if current_section:
        sections.append(remove_duplicates(current_section.strip()))

    return sections
def remove_duplicates(text):
    """
    Removes repeated sentences from the text, keeping the first occurrence.

    The text is split on '. ' (period followed by a space), so the unit of
    comparison is a sentence rather than a physical line. Sentences are
    compared without surrounding whitespace or trailing periods; this lets a
    repeated sentence in final position (which still carries its closing
    period after the split) be recognised as a duplicate.

    Parameters:
        text (str): The text to process.

    Returns:
        str: The text with duplicate sentences removed, in original order.
    """
    seen = set()
    kept = []
    for sentence in text.split('. '):
        # Normalised form used only for the duplicate check; the sentence
        # itself is kept verbatim in the output.
        key = sentence.strip().rstrip('.').rstrip()
        if key in seen:
            continue
        seen.add(key)
        kept.append(sentence)

    result = '. '.join(kept)
    # If the dropped duplicate was the last sentence, the closing period of
    # the text went with it; restore it so the output stays well terminated.
    if text.endswith('.') and not result.endswith('.'):
        result += '.'
    return result
def find_relevant_chunk(sections, user_query):
    """
    Finds the section most semantically similar to the user's query.

    Both the sections and the query are embedded with the module-level
    SentenceTransformer `model`, and the section with the highest cosine
    similarity to the query is returned.

    Parameters:
        sections (list): A non-empty list of content sections.
        user_query (str): The user's query.

    Returns:
        str: The most relevant section of content.

    Raises:
        ValueError: If `sections` is empty.
    """
    # With nothing to rank, the embedding/argmax calls below would fail with
    # an opaque library error; report the real problem instead.
    if not sections:
        raise ValueError("sections must contain at least one section to search")

    # Create embeddings for all sections and the user query
    section_embeddings = model.encode(sections, convert_to_tensor=True)
    query_embedding = model.encode(user_query, convert_to_tensor=True)

    # Compute cosine similarities between the query and every section
    cosine_scores = util.pytorch_cos_sim(query_embedding, section_embeddings)

    # Index of the highest-scoring section
    best_idx = cosine_scores.argmax().item()
    return sections[best_idx]
def _parse_parameter_value(value):
    """Converts a PARAMETER value to int or float when numeric, else returns it as a string."""
    for convert in (int, float):
        try:
            return convert(value)
        except ValueError:
            continue
    return value


def read_modelfile(filename):
    """
    Reads the Modelfile.txt and returns the parameters and system message.

    Lines starting with 'PARAMETER' are stored as `params[name] = value`,
    where numeric values become int or float and anything else (for example
    a quoted stop sequence) is kept as a string. A line starting with
    'SYSTEM' is stored under the 'system_message' key with its whitespace
    collapsed to single spaces. All other lines are ignored.

    Parameters:
        filename (str): The path to the Modelfile.txt

    Returns:
        dict: A dictionary with parameters and system message

    Raises:
        ValueError: If a PARAMETER line lacks a name or a value.
    """
    params = {}
    # Explicit encoding so parsing does not depend on the platform locale.
    with open(filename, 'r', encoding='utf-8') as file:
        for line in file:
            if line.startswith('PARAMETER'):
                # Split into at most three fields so a value containing
                # spaces stays intact instead of breaking the unpacking.
                parts = line.split(None, 2)
                if len(parts) < 3:
                    raise ValueError(
                        f"Malformed PARAMETER line: {line.rstrip()!r}"
                    )
                key = parts[1]
                params[key] = _parse_parameter_value(parts[2].strip())
            elif line.startswith('SYSTEM'):
                params['system_message'] = ' '.join(line.split()[1:])
    return params
# URL to fetch content from
url = 'https://www.canada.ca/en/immigration-refugees-citizenship/services/visit-canada/prepare-arrival.html'
# Modelfile holding the generation parameters and the system message
modelfile_path = './Modelfile.txt'


def main():
    """
    Runs one interactive, retrieval-augmented query against the Ollama model.

    Prompts for a query, fetches the page at `url`, picks the section most
    similar to the query, appends it to the query as additional context and
    streams the model's answer to standard output, followed by a pointer to
    the source page.
    """
    # User query
    user_query = input("Enter your query: ")

    # Fetch and chunk content from the URL by sections
    sections = fetch_and_chunk_by_sections(url)
    print(f"Fetched {len(sections)} sections of content.")

    # Find the most relevant section
    relevant_section = find_relevant_chunk(sections, user_query)
    print("Most relevant section:")
    print(relevant_section)

    # Prepare combined query
    augmented_query = f"{user_query} (Use this additional information to improve your answer if relevant: {relevant_section})"
    print("Augmented query:")
    print(augmented_query)

    # Read parameters and system message from Modelfile.txt
    params = read_modelfile(modelfile_path)

    # Prepare the messages with the system message
    messages = [
        {'role': 'system', 'content': params.get('system_message', '')},
        {'role': 'user', 'content': augmented_query}
    ]

    # Everything read from the Modelfile except the system message is a
    # generation parameter; forward those so they actually take effect.
    options = {
        key: value for key, value in params.items() if key != 'system_message'
    }

    # Query with the specified parameters
    stream = ollama.chat(
        model='ttl_llama3',
        messages=messages,
        stream=True,
        options=options or None,
    )
    for chunk in stream:
        print(chunk['message']['content'], end='', flush=True)
    print(' For more information, go to: ' + url)


# Guarded so importing this module for its helper functions does not start
# the interactive workflow.
if __name__ == "__main__":
    main()