-
Notifications
You must be signed in to change notification settings - Fork 0
/
deep_search.py
264 lines (233 loc) · 10.4 KB
/
deep_search.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Author : Jerry.Shi
# File : deep_search.py
# PythonVersion: python3.5
# Date : 2019/1/25 13:45
# Software: PyCharm
"""The main purpose of this application is introduce a new approach for text or query search and get search result.
More background can be found in [].
Notes:
1. This application requires the Faiss package to run.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import time
from tqdm import tqdm
import jieba
import json
try:
import faiss
except Exception:
print('Faiss package have not install, please check!')
class SearchModel(object):
    """Two-stage question/answer retrieval built on two Faiss indexes.

    A query is turned into an averaged word-embedding vector, the most similar
    stored questions are looked up, the answers attached to the questions that
    are labelled "1" are used as probes into the answer index, and the
    retrieved answers are ranked by a combination of both similarities.
    """

    def __init__(self, filename, embedding_file, emb_dim):
        """
        Instantiate a search model.
        :param filename: input dataset; each line is ``question \\t answer \\t label`` where the label
                         tells whether the answer is the most related reply ("1" means it is).
        :param embedding_file: pre-trained embedding file (one token followed by ``emb_dim`` numbers per line)
        :param emb_dim: embedding dimension
        """
        self.embedding_dict = dict()  # token (lower-cased) -> embedding vector
        self.emb_dim = emb_dim
        self.filename = filename
        # NOTE: the attribute name is misspelled; it is kept as-is so that
        # existing code reading ``embeding_file`` keeps working.
        self.embeding_file = embedding_file
        self.questions_text = []  # list to store question text
        self.question_vec = []
        # Both indexes use L2 distance; `search` converts distances back into
        # "larger is better" scores. Swap in faiss.IndexFlatIP to use the
        # inner product instead (and drop that conversion).
        self.question_index = faiss.IndexFlatL2(self.emb_dim)
        self.answers_text = []  # list to store answer text
        self.answer_vec = []
        self.answer_index = faiss.IndexFlatL2(self.emb_dim)
        self.labels = []

    def init(self):
        """Initialize the search model: load embeddings, parse the dataset, build both indexes."""
        self._load_embeddings()
        self._load_dataset()
        self._build_indexes()

    def _load_embeddings(self):
        """Fill ``self.embedding_dict`` from ``self.embeding_file`` and ensure '<UNK>' exists."""
        s_time = time.time()
        with open(self.embeding_file, 'r') as f:
            # Iterate the file lazily instead of materialising every line.
            for i, line in enumerate(f):
                arr = line.strip().split()
                # A usable line needs a token plus emb_dim numbers. This also
                # rejects blank lines and "<count> <dim>" style header lines.
                if len(arr) <= self.emb_dim:
                    print('Bad line {} found in embedding file.'.format(i))
                    continue
                word = "".join(arr[:-self.emb_dim])
                try:
                    vec = list(map(float, arr[-self.emb_dim:]))
                except ValueError:
                    # Skip the line; never store a stale/undefined vector.
                    print('Bad line {} found in embedding file.'.format(i))
                    continue
                self.embedding_dict[word.lower()] = vec
        if '<UNK>' not in self.embedding_dict:
            print('Add <UNK> into embdding dict')
            self.embedding_dict['<UNK>'] = np.random.uniform(-0.08, 0.08, self.emb_dim).tolist()
        e_time = time.time()
        print('Loaded word embedding from file:{} total words:{}, time cost:{}s'.format(
            self.embeding_file, len(self.embedding_dict), (e_time - s_time)))

    def _load_dataset(self):
        """Parse ``self.filename`` into parallel question/answer/label lists."""
        print('Start to parse info parts from dataset {}...'.format(self.filename))
        s_time = time.time()
        skipped = 0
        with open(self.filename, 'r') as f:
            for line in tqdm(f.readlines()):
                arr = line.strip().split('\t')
                if len(arr) < 3:
                    # Malformed row: skip it entirely so that the parallel
                    # lists (and therefore the index ids) stay aligned.
                    skipped += 1
                    continue
                # The text is expected to be whitespace-segmented already.
                q_tokens = arr[0].split()
                a_tokens = arr[1].split()
                self.questions_text.append("".join(q_tokens))
                self.question_vec.append(self.text_to_vec(q_tokens))
                self.answers_text.append("".join(a_tokens))
                self.answer_vec.append(self.text_to_vec(a_tokens))
                self.labels.append(arr[2])
        if skipped:
            print('Skipped {} malformed line(s) in dataset.'.format(skipped))
        e_time = time.time()
        print('Parse completed cost time {}s, start to build index with faiss...'.format(e_time - s_time))

    def _build_indexes(self):
        """Add the parsed question and answer vectors to their Faiss indexes."""
        # Note, faiss only can process float32 type
        s_time = time.time()
        if self.question_vec:
            self.question_index.add(np.ascontiguousarray(self.question_vec, dtype=np.float32))
        if self.answer_vec:
            self.answer_index.add(np.ascontiguousarray(self.answer_vec, dtype=np.float32))
        e_time = time.time()
        print('Completed build index time:{}s'.format(e_time - s_time))

    def text_to_vec(self, text, segment=False):
        """
        Convert text to a fixed dimension vector (mean of the token embeddings).
        :param text: input text or word list
        :param segment: bool, whether the input must first be segmented with jieba
        :return: float32 vector of length ``emb_dim``; all zeros when there are no tokens
        """
        if segment:
            text = list(jieba.cut(text))
        if not text:
            # Nothing to average; avoid a division by zero (NaN vector).
            return np.zeros(self.emb_dim, dtype=np.float32)
        vec = np.zeros(self.emb_dim, dtype=np.float64)
        for w in text:
            # Embedding keys are stored lower-cased, so look up with the same
            # key that is tested for membership.
            key = w.lower()
            if key not in self.embedding_dict:
                key = '<UNK>'
            vec += self.embedding_dict[key]
        return vec.astype(np.float32) / len(text)

    @staticmethod
    def _index_search(index, vec, topk):
        """
        Search topk most similar vectors in given Faiss index.
        :param index: built Faiss index
        :param vec: query vector(s); a single 1-D vector or a 2-D batch (array or nested list)
        :param topk: number of results wanted per query vector
        :return: tuple ``(sim, ids)`` exactly as returned by ``index.search``
        """
        # Faiss expects a C-contiguous float32 matrix of shape (n, dim).
        vec = np.ascontiguousarray(vec, dtype=np.float32)
        if vec.ndim == 1:
            vec = vec.reshape(1, -1)
        sim, ids = index.search(vec, topk)
        return sim, ids

    @staticmethod
    def _min_max_scale(values):
        """Min-max scale ``values`` into [0, 1]; a constant (or single) input maps to all ones."""
        values = np.asarray(values)
        if not np.issubdtype(values.dtype, np.floating):
            values = values.astype(np.float64)
        if values.size == 0:
            return values
        lo, hi = values.min(), values.max()
        if hi == lo:
            # No spread: every entry is equally good. Returning ones avoids the
            # 0/0 -> NaN that a plain min-max formula would produce.
            return np.ones_like(values)
        return (values - lo) / (hi - lo)

    def _rank(self, vec_sim, vec_ids, question_sim, question_ids=None):
        """
        Combine answer similarity with question similarity and rank the answers.
        :param vec_sim: answer similarity matrix, one row per related question (larger = more similar)
        :param vec_ids: answer id matrix, same shape as ``vec_sim``
        :param question_sim: similarity of each related question to the query (one value per row)
        :param question_ids: related question ids (currently unused)
        :return: list of ``(answer_id, score)`` sorted by score, best first
        """
        vec_sim = np.asarray(vec_sim)
        vec_ids = np.asarray(vec_ids)
        # first we check question number is the same as the answer number
        if np.shape(vec_sim) != np.shape(vec_ids) or np.shape(vec_sim)[0] != len(question_sim):
            raise ValueError('vec_sim, vec_ids and question_sim have inconsistent shapes')

        # Normalize. A single question (or all-equal questions) gets weight 1,
        # which also prevents a negative raw weight from inverting the ranking.
        weights = self._min_max_scale(question_sim)
        scaled = self._min_max_scale(vec_sim)
        # Weight every answer row by its question, then rescale to [0, 1].
        weighted = self._min_max_scale(scaled * weights[:, np.newaxis])

        # An answer reachable through several questions keeps the score of its
        # first occurrence (rows are visited in the order they were given).
        ans_score_dict = {}
        for row_ids, row_scores in zip(vec_ids, weighted):
            for raw_id, score in zip(row_ids, row_scores):
                ans_id = int(raw_id)
                if ans_id < 0:
                    # Faiss pads with -1 when it has fewer than topk results.
                    continue
                if ans_id not in ans_score_dict:
                    ans_score_dict[ans_id] = score
        # sort by score
        return sorted(ans_score_dict.items(), key=lambda kv: kv[1], reverse=True)

    def question_search(self, vec, question_topk=5):
        """
        Search the most related questions that have a correct ("1"-labelled) answer.
        :param vec: input question vector
        :param question_topk: number of nearest questions to inspect
        :return: tuple ``(id_result, sim_result, answer_vec)`` of parallel lists: question ids,
                 their index distances to the query, and the vectors of their answers
        """
        sim, qs_ids = self._index_search(index=self.question_index, vec=[vec], topk=question_topk)
        total = len(self.labels)
        id_result = []
        sim_result = []
        answer_vec = []
        # check question has the right answer
        for pos, raw_id in enumerate(qs_ids[0]):
            q_id = int(raw_id)
            # Reject Faiss padding (-1) and anything outside the dataset;
            # a negative id would otherwise silently index from the end.
            if q_id < 0 or q_id >= total:
                continue
            if self.labels[q_id] == "1":
                id_result.append(q_id)
                sim_result.append(sim[0][pos])
                answer_vec.append(self.answer_vec[q_id])
        return id_result, sim_result, answer_vec

    def search(self, query, topk=3, question_topk=5):
        """
        Given one query, get its related answers.
        :param query: input query
        :param topk: the number of related answers to return
        :param question_topk: number of most related questions to inspect
        :return: JSON string with keys ``query``, ``answers`` and ``answer_score``
        """
        result = dict()
        result['query'] = query
        result['answers'] = []
        result['answer_score'] = []
        # step1, convert text to vector
        vec = self.text_to_vec(query, segment=True)
        print('Input query:{} convert to vector completed!'.format(query))
        # step2, get the most related questions that have the right answer
        id_result, sim_result, answer_vec = self.question_search(vec, question_topk)
        if not id_result:
            # second search with a wider window
            id_result, sim_result, answer_vec = self.question_search(vec, question_topk + 5)
        print('After filter get related question number:{}'.format(len(id_result)))
        if not id_result:
            return json.dumps(result, ensure_ascii=False)
        # step3, get related answers
        vec_sim, vec_ids = self._index_search(index=self.answer_index, vec=answer_vec, topk=topk)
        # The indexes return L2 distances (smaller = closer); flip them so that
        # larger means more similar. `_rank` rescales, so only the order matters.
        sim_result = 1 - np.array(sim_result)
        vec_sim = 1 - vec_sim
        # rank and return results
        ans_sorted = self._rank(vec_sim, vec_ids, sim_result)[:topk]
        for ans_id, score in ans_sorted:
            if ans_id < 0 or ans_id >= len(self.answers_text):
                print('No match result found for id:{}'.format(ans_id))
                continue
            result['answers'].append(self.answers_text[ans_id])
            result['answer_score'].append(str(score))
        return json.dumps(result, ensure_ascii=False)
if __name__ == "__main__":
    # Demo entry point: build the model from the hard-coded dataset and
    # embedding paths, then print the JSON search result for sample queries.
    base_path = '/data/research/data/textMatch/'
    searcher = SearchModel(
        base_path + 'dev_jieba_seg.txt',
        base_path + 'output/vectors.txt',
        300,
    )
    searcher.init()
    for sample in ("银川百吉大酒店的招商银行叫啥网点", "人脸验证开通花呗"):
        print(searcher.search(sample))