forked from jind11/word2vec-on-wikipedia
-
Notifications
You must be signed in to change notification settings - Fork 0
/
wiki-corpus-prepare.py
46 lines (42 loc) · 1.69 KB
/
wiki-corpus-prepare.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
## split texts to sentences
from pycorenlp import StanfordCoreNLP
import re
import time
import sys
stdout = sys.stdout
reload(sys)
sys.setdefaultencoding('utf-8')
sys.stdout = stdout
nlp = StanfordCoreNLP('http://localhost:9000')
infilePath = sys.argv[1]
outfilePath = sys.argv[2]
splitted_texts = ''
count_articles = 0
count_tokens = 0
start_time = time.time()
outfile = open(outfilePath, 'w')
with open(infilePath, 'r') as f:
for text in f:
text = text.strip()
if len(text) > 20:
if not text.startswith('<doc id='):
text = re.sub('\d', '0', text.lower())
output = nlp.annotate(text, properties={
'annotators': 'ssplit',
'outputFormat': 'json',
'threads': '24',
'tokenize.options': 'normalizeParentheses=false, normalizeOtherBrackets=false'
})
try:
sents = [[token['word'] for token in sent['tokens']] for sent in output['sentences'] if len(sent['tokens']) >= 5]
except Exception as e:
print(e.message, output)
else:
for sent in sents:
outfile.write(' '.join(sent)+'\n')
count_tokens += len(sent)
else:
count_articles += 1
if count_articles % 2000 == 0:
elapsed_time = time.time() - start_time
print (infilePath.split('/')[-1], 'Processed article number:', count_articles, 'Elapsed time:', elapsed_time, 's')