-
Notifications
You must be signed in to change notification settings - Fork 0
/
sentence_tokenisation.py
44 lines (34 loc) · 1.55 KB
/
sentence_tokenisation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
"""Data must first be converted from raw text files to tokenised files with one sentence per line."""
import codecs
import glob
import os
import re
from multiprocessing import Pool

import nltk
import tensorflow as tf
# Command-line flags: where the raw text lives and where the tokenised
# output is written. Arguments are passed positionally as
# (flag name, default value, help text).
tf.flags.DEFINE_string(
    'raw_text_dir',
    'data/webbase_all/',
    'Path to directory containing the raw text files for processing.')
tf.flags.DEFINE_string(
    'output_tokenised_dir',
    'data/tokenised_texts',
    'Path to directory to write files containing tokenised sentences.')

# Shared flag-values object through which the flags above are read.
FLAGS = tf.flags.FLAGS
# Characters removed before sentence splitting: everything except lowercase
# letters, spaces and the full stops used to find sentence boundaries.
_NOT_LETTER_SPACE_OR_STOP = re.compile('[^a-z. ]')
# Characters removed from each split sentence: everything except lowercase
# letters and spaces (this strips the remaining full stops).
_NOT_LETTER_OR_SPACE = re.compile('[^a-z ]')
# Sentences with fewer characters than this are discarded.
_MIN_SENTENCE_CHARS = 3


def process_input_file(input_file):
    """Tokenise one raw text file into a file with one sentence per line.

    Intended to be mapped over many files by a multiprocessing pool. The text
    is lowercased, newlines become spaces, every character other than
    ``a-z``, space and ``.`` is dropped, and the result is split with
    ``nltk.sent_tokenize``. Each sentence is then reduced to ``a-z`` and
    spaces; sentences shorter than three characters are discarded. The
    sentences are written, newline-separated and UTF-8 encoded, to a file
    with the same base name inside ``FLAGS.output_tokenised_dir``.

    Args:
        input_file: Path of the raw text file to process.

    Returns:
        None. Any error is caught and reported on stdout so that one bad
        file does not abort the whole pool run.
    """
    try:
        print('Processing raw text file {}'.format(input_file))
        with tf.gfile.GFile(input_file, mode="r") as f:
            text = f.read()
        # Text-mode reads give bytes on Python 2 but str on Python 3, where
        # an unconditional .decode() would raise AttributeError.
        if isinstance(text, bytes):
            text = text.decode('utf-8')

        text = text.replace('\n', ' ').lower()
        text = _NOT_LETTER_SPACE_OR_STOP.sub('', text)

        stripped = (_NOT_LETTER_OR_SPACE.sub('', sentence)
                    for sentence in nltk.sent_tokenize(text))
        sentences = [s for s in stripped if len(s) >= _MIN_SENTENCE_CHARS]

        output_path = os.path.join(FLAGS.output_tokenised_dir,
                                   os.path.basename(input_file))
        with codecs.open(output_path, "w", "utf-8") as f:
            f.write('\n'.join(sentences))
    except Exception as e:
        # Deliberately broad: this is the per-file worker boundary. Report
        # the file that failed (not the function object) and carry on.
        print('FAILURE: {} {}'.format(input_file, e))
if __name__ == '__main__':
    # Make sure the output directory exists before any worker tries to write
    # into it; MakeDirs succeeds if the directory is already there.
    tf.gfile.MakeDirs(FLAGS.output_tokenised_dir)

    # Every entry directly inside the raw text directory is treated as an
    # input file.
    input_files = glob.glob(os.path.join(FLAGS.raw_text_dir, '*'))

    # Process the files in parallel across 10 worker processes, and always
    # shut the pool down cleanly afterwards.
    pool = Pool(10)
    try:
        pool.map(process_input_file, input_files)
    finally:
        pool.close()
        pool.join()