-
Notifications
You must be signed in to change notification settings - Fork 2
/
gen_data.py
68 lines (67 loc) · 3.23 KB
/
gen_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
#encoding=utf8
import os,jieba,csv
import jieba.posseg as pseg
# Build character-level BIO files (example.dev / example.test / example.train)
# from every "*txtoriginal.txt*" file found under ./source_data, using jieba
# part-of-speech segmentation with a user dictionary loaded from DICT_NOW.csv.

# Directory scanned for corpus files (trailing separator included so file
# names can be appended directly).
c_root = os.getcwd() + os.sep + "source_data" + os.sep

# Segmenter flags that are written out as entity labels (B-/I- prefixed);
# every other flag is written as 'O'.
biaoji = {
    'DIS', 'SYM', 'SGN', 'TES', 'DRU', 'SUR', 'PRE', 'PT', 'Dur',
    'TP', 'REG', 'ORG', 'AT', 'PSB', 'DEG', 'FW', 'CL',
}

# Characters after which an extra blank line (sentence break) is written.
# NOTE(review): the original literal listed '?' and '!' twice, which looks
# like the full-width forms were lost to character normalization; both the
# ASCII and the full-width forms are accepted here - confirm against the corpus.
fuhao = {'。', '?', '!', '\uff1f', '\uff01'}


def split_index(line_no):
    """Return the split code for the ``line_no``-th corpus line.

    Lines are dealt out in cycles of 15 by ``line_no % 15``:
    remainders 0-1 give '1' (dev), 2-3 give '2' (test) and 4-14 give
    '3' (train).
    """
    remainder = line_no % 15
    if remainder < 2:
        return '1'
    if remainder < 4:
        return '2'
    return '3'


def tag_lines(word, flag):
    """Return the "<char> <label>" output lines for one segmented token.

    * Nothing is produced when ``word`` or ``flag`` is blank.
    * A token whose flag is not in ``biaoji`` is labelled 'O' character by
      character; whitespace characters inside it are skipped, and a
      character in ``fuhao`` is followed by an extra blank line.
    * A token whose flag is in ``biaoji`` gets 'B-<flag>' on its first
      character and 'I-<flag>' on the remaining ones.
    """
    chars = word.strip()
    if not chars or not flag.strip():
        return []

    if flag not in biaoji:
        lines = []
        for ch in chars:
            if not ch.strip():
                continue  # whitespace inside a token is not emitted
            ending = "\n\n" if ch in fuhao else "\n"
            lines.append(ch + " O" + ending)
        return lines

    lines = []
    for position, ch in enumerate(chars):
        prefix = 'B-' if position == 0 else 'I-'
        lines.append(ch + ' ' + prefix + flag + '\n')
    return lines


def load_user_dict(path="DICT_NOW.csv"):
    """Register every two-column row of ``path`` as a jieba word with its tag.

    Rows that do not have exactly two columns are ignored.
    """
    with open(path, 'r', encoding='utf8') as handle:
        for row in csv.reader(handle):
            if len(row) == 2:
                jieba.add_word(row[0].strip(), tag=row[1].strip())


def main():
    """Segment the corpus under ``c_root`` and write the three split files.

    The line counter runs across all corpus files, and the target split is
    chosen once per line because it depends only on that counter.
    """
    # Load the dictionary before touching the outputs so that a missing
    # dictionary does not leave truncated output files behind.
    load_user_dict()

    with open("example.dev", 'w', encoding='utf8') as dev_out, \
            open("example.train", 'w', encoding='utf8') as train_out, \
            open("example.test", 'w', encoding='utf8') as test_out:
        outputs = {'1': dev_out, '2': test_out, '3': train_out}
        line_no = 0
        # os.listdir returns names in arbitrary order; sorting keeps the
        # dev/test/train assignment reproducible between runs and machines.
        for name in sorted(os.listdir(c_root)):
            if "txtoriginal.txt" not in name:
                continue
            with open(c_root + name, 'r', encoding='utf8') as corpus:
                for line in corpus:
                    line_no += 1
                    target = outputs[split_index(line_no)]
                    for word, flag in pseg.cut(line):
                        target.writelines(tag_lines(word, flag))


if __name__ == "__main__":
    main()