-
Notifications
You must be signed in to change notification settings - Fork 0
/
individual_words.py
141 lines (113 loc) · 3.57 KB
/
individual_words.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
import re

# Inputs: the .con file holds one annotation per line (c="phrase" ... t="tag");
# the .txt file holds the raw patient narrative.
with open("/home/asmita/NER/data/Patient2.con", 'r') as file_object:
    f = file_object.read()  # this contains the dictionary of phrases
with open("/home/asmita/NER/data/Patient2.txt", 'r') as file_object1:
    patients = file_object1.read()  # this contains the string

P = patients  # untouched copy of the original narrative
# Pad periods with spaces so they survive whitespace tokenization as tokens.
patients = patients.replace('.', ' . ')
# Strip punctuation other than . , !  — the original pattern "[^\w].,!"
# matched a literal four-character sequence (a non-word char followed by
# ".,!") and therefore removed almost nothing; the character class below is
# the intended behaviour.
wordList = re.sub(r"[^\w.,!]", " ", patients).split()
print(wordList)

l = len(f)
start = 0
condition_word_list = [""]
conditions = []
tags = []

# Emit one "( token), ()" line per whitespace token.
with open("tokenized.txt", "w+") as fff:
    wordlist = patients.split()
    for word in wordlist:
        fff.write("( ")
        fff.write(word)
        fff.write("), ()\n")
'''
doing some processing to get the dictionary of phrases as a list, the starting index and length of phrases [ O(n) ]
'''
# Each newline-terminated .con line is assumed to look like
# c="phrase" ... t="tag": the phrase sits between the 1st/2nd double quote
# (offset 3 skips `c="`), the tag between the 3rd/4th.
# NOTE(review): the dead statement `sub.find('\"')` (result discarded) was
# removed; slasharr already locates every quote.
for i in range(l):
    if f[i] == '\n':
        end = i
        sub = f[start:end]
        # positions of all double quotes in this annotation line
        slasharr = [pos for pos, char in enumerate(sub) if char == "\""]
        condition = sub[3:slasharr[1]]
        tag = sub[slasharr[2]+1:slasharr[3]]
        conditions.append(condition)
        tags.append(tag)
        # individual words of the phrase, punctuation stripped
        condition_words = re.sub(r"[^\w]", " ", condition).split()
        condition_word_list.extend(condition_words)
        start = i+1
''' *** This is the part at which you need to take a look *** '''
# Starting index of each phrase in the lower-cased narrative, and its length.
index_list = list(map(patients.lower().find, conditions))
length_of_conditions = [len(con) for con in conditions]

# phrase -> tag lookup (for duplicate phrases the last occurrence wins,
# matching the original assignment loop)
dict_tag = {}
for cond, tg in zip(conditions, tags):
    dict_tag[cond] = tg

# Deduplicate, then drop phrases shorter than 3 characters.
# BUG FIX: the original removed elements from `conditions` while iterating
# over it, which silently skips the element that follows every removal;
# collecting the removals first processes every short phrase.
conditions = list(set(conditions))
too_short = [c for c in conditions if len(c) < 3]
for c in too_short:
    conditions.remove(c)
    del dict_tag[c]

length_of_conditions = [len(con) for con in conditions]

# Map each surviving phrase to an unambiguous placeholder ***k*** so that a
# later whitespace split keeps a multi-word phrase as one token.
my_dict = {}
my_dict_tag = {}
counter = 1
for c in conditions:
    my_dict[c] = '***' + str(counter) + '***'
    counter += 1
# Substitute each phrase with its placeholder.  The original lower-cased
# `patients` on every iteration; since the first iteration already committed
# the lower-cased text back to `patients`, lower-casing once up front is
# equivalent and hoists the invariant out of the loop.
# NOTE(review): phrases containing uppercase letters can never match the
# lower-cased text (true of the original code as well) — confirm the .con
# phrases are lower-case.
patients = patients.lower()
for c in conditions:
    patients = patients.replace(c, my_dict[c])

# Whitespace-split, then map each placeholder token back to its phrase so a
# multi-word phrase occupies a single list slot.
split_p = patients.split()
final_dict = {}
for k, v in my_dict.items():
    # replacing stars with the conditions
    for n, tok in enumerate(split_p):
        if tok == v:
            split_p[n] = k
print('AAAAAAAA')
print(split_p)
print('heehaw')
# Write one CoNLL-style line per token: "<token> NNP B-NP <tag>", tag 'O'
# for tokens that are not annotated phrases.
# NOTE(review): when `word` is a multi-word phrase restored from a
# placeholder, the inner loop writes the SAME full-phrase line once per
# sub-word, producing duplicate lines in split_patients.txt — presumably
# each sub-word `r` was meant to be written instead; confirm intended
# output before changing.
ff = open("split_patients.txt","w+")
for word in split_p:
    if word in conditions:
        lll = word.split()
        for r in lll:
            final_dict[word] = dict_tag[word]
            s = word + " NNP" + " B-NP " + dict_tag[word]
            ff.write("%s\n" % s)
    else:
        s = word + " NNP" + " B-NP " + 'O'
        ff.write("%s\n" % s)
        final_dict[word] = 'O'
ff.close()
# Dead code kept for reference: an alternative writer that dumps final_dict
# to the BiLSTM-CNN training file.  Disabled by wrapping in a triple-quoted
# string, so nothing below executes.
'''
ff = open("/home/asmita/Downloads/Named-Entity-Recognition-with-Bidirectional-LSTM-CNNs-master/data/train.txt","w+")
for k,v in final_dict.items():
s = k + " NNP" + " B-NP " + v
print(s)
ff.write("%s\n" % s)
#ff.write("\n")
#ff.write("\n")
ff.close()
print(len(final_dict))
#print('\ufeff' in P.split())
#f=open("train_patient.txt", "r")
#contents = f.read()
'''