-
Notifications
You must be signed in to change notification settings - Fork 0
/
build_inputs.py
106 lines (96 loc) · 3.27 KB
/
build_inputs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2020/9/22 15:03
# @Author : TheTAO
# @Site :
# @File : build_inputs.py
# @Software: PyCharm
import pickle
import jieba.posseg as psg
from cnradical import Radical, RunOption
# 全角转半角
def full_to_half(s):
    """Return ``s`` with full-width characters converted to half-width.

    The ideographic space (U+3000) becomes an ASCII space, and every code
    point in U+FF01..U+FF5E is shifted down by 0xFEE0 onto its ASCII
    counterpart. All other characters are passed through unchanged.

    :param s: text to normalise
    :return: the normalised text
    """
    def _narrow(code_point):
        # Full-width (ideographic) space has its own mapping.
        if code_point == 0x3000:
            return 32
        # Full-width printable forms sit at a fixed offset from ASCII.
        if 0xFF01 <= code_point <= 0xFF5E:
            return code_point - 0xfee0
        return code_point

    return ''.join(chr(_narrow(ord(ch))) for ch in s)
# 简单清理数据
def replace_html(stxt):
    """Lightly clean ``stxt`` by decoding common HTML entities and dropping
    a few punctuation marks.

    The entities ``&quot;``, ``&lt;``, ``&gt;``, ``&nbsp;`` and ``&amp;`` are
    replaced by the characters they stand for, curly double quotes and the
    em dash are removed, and the non-breaking space character is turned into
    a regular space.

    :param stxt: text to clean
    :return: the cleaned text
    """
    # '&amp;' is decoded last so that an escaped entity such as '&amp;lt;'
    # ends up as the literal text '&lt;' instead of being decoded twice.
    replacements = (
        ('&quot;', '"'),
        ('&lt;', '<'),
        ('&gt;', '>'),
        ('&nbsp;', ' '),
        ('&amp;', '&'),
        ("“", ""),
        ("”", ""),
        ("—", ""),
        ("\xa0", " "),
    )
    for old, new in replacements:
        stxt = stxt.replace(old, new)
    return stxt
def input_from_line_with_feature(line):
    """Convert one raw sentence into id-encoded features for entity recognition.

    The sentence is normalised with ``full_to_half`` and ``replace_html``.
    Six feature sequences are then built, in the order
    ``word, bound, flag, radical, pinyin, label``, and each one is mapped to
    ids using the mapping stored at index 2 of the matching entry in
    ``datas/prepare_data/dict.pkl``. Tokens missing from a mapping fall back
    to that mapping's ``'UNK'`` id.

    :param line: a single input sentence
    :return: a 2-tuple ``(cleaned_line, features)`` where ``features`` is
        ``[[word_ids], [bound_ids], [flag_ids], [radical_ids], [pinyin_ids],
        [label_ids]]``; ``label_ids`` is always empty because no labels are
        available for a sentence that is being predicted.
    """
    # NOTE(review): pickle.load can execute arbitrary code contained in the
    # file, so the dictionary file must come from a trusted source.
    with open('datas/prepare_data/dict.pkl', 'rb') as f:
        map_dict = pickle.load(f)

    def item2id(data, w2i):
        # Unknown tokens are mapped to the id of 'UNK'.
        return [w2i[x] if x in w2i else w2i['UNK'] for x in data]

    def char_feature(converter, text):
        # Look every character up once; unresolved characters become 'UNK'.
        looked_up = (converter.trans_ch(ch) for ch in text)
        return ['UNK' if value is None else value for value in looked_up]

    feature_names = ['word', 'bound', 'flag', 'radical', 'pinyin', 'label']

    line = full_to_half(line)
    line = replace_html(line)

    # Word-boundary tags (B/M/E for multi-character words, S for single
    # characters) and part-of-speech flags, one entry per character.
    word_bounds = ['M'] * len(line)
    word_flags = []
    for word, flag in psg.cut(line):
        start = len(word_flags)
        word_flags += [flag] * len(word)
        if len(word) == 1:
            word_bounds[start] = 'S'
        else:
            word_bounds[start] = 'B'
            # The last character of the word sits at the current end.
            word_bounds[len(word_flags) - 1] = 'E'

    # Radical and pinyin features from cnradical.
    radical = Radical(RunOption.Radical)
    pinyin = Radical(RunOption.Pinyin)

    # One sequence per entry of feature_names, in the same order. The label
    # sequence is empty because this path is used for prediction only.
    sequences = [
        list(line),
        word_bounds,
        word_flags,
        char_feature(radical, line),
        char_feature(pinyin, line),
        [],
    ]

    # Map every sequence to ids; each is wrapped in a one-element list so
    # the result keeps the single-sentence nesting callers receive.
    features = [
        [item2id(sequence, map_dict[feature][2])]
        for feature, sequence in zip(feature_names, sequences)
    ]
    return line, features
if __name__ == '__main__':
    # Smoke run: encode a sample sentence and print the first character of
    # the cleaned text returned alongside the feature ids.
    sample = '我是中国人'
    cleaned_text, _feature_ids = input_from_line_with_feature(sample)
    print(cleaned_text[0])