-
Notifications
You must be signed in to change notification settings - Fork 9
/
tokenizer.py
66 lines (66 loc) · 2.29 KB
/
tokenizer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
from __future__ import unicode_literals
from ...errors import Errors, Warnings, deprecation_warning
from ...tokens import Doc
import regex as re
import pdb
class GreekTokenizer:
    """Rule-based tokenizer driven by special-case rules and affix matchers.

    The input is split on single spaces. Each chunk is then reduced by, in
    priority order: an exact special-case lookup, stripping one prefix,
    stripping one suffix, and splitting around infixes.

    Args:
        vocab: Vocabulary object handed unchanged to ``Doc``.
        rules: Optional mapping of an exact chunk to the list of tokens that
            should replace it.
        prefix_search: Optional callable returning a match object (or
            ``None``) for a prefix of the chunk.
        suffix_search: Optional callable returning a match object (or
            ``None``) for a suffix of the chunk.
        infix_finditer: Optional callable returning an iterable of match
            objects for infixes inside the chunk.
        token_match: Stored on the instance only.
    """

    def __init__(self, vocab, rules=None, prefix_search=None, suffix_search=None, infix_finditer=None, token_match=None):
        # NOTE(review): token_match is kept for API compatibility but
        # tokenize() never consults it -- confirm whether that is intended.
        self.token_match = token_match
        self.prefix_search = prefix_search
        self.suffix_search = suffix_search
        self.infix_finditer = infix_finditer
        self.vocab = vocab
        # Copy the rules so later mutation of the caller's mapping does not
        # leak into the tokenizer.
        self._rules = dict(sorted(rules.items())) if rules is not None else {}

    def __call__(self, string):
        """Tokenize ``string`` and wrap the tokens in a ``Doc``.

        Raises:
            ValueError: If ``string`` has 2**30 or more characters.
        """
        if len(string) >= 2 ** 30:
            raise ValueError(Errors.E025.format(length=len(string)))
        return Doc(self.vocab, self.tokenize(string))

    def tokenize(self, text):
        """Split ``text`` into a flat list of token strings.

        Whitespace is not preserved: chunks are obtained with
        ``text.split(' ')`` and empty chunks (from repeated spaces) are
        skipped. Zero-width prefix, suffix and infix matches are ignored so
        that every iteration consumes at least one character, and no
        empty-string token is ever emitted by infix splitting.

        Returns:
            list: Token strings in reading order.
        """
        tokens = []
        for substring in text.split(' '):
            # Suffixes are peeled outermost-first, so they are collected
            # separately and emitted reversed once the chunk is exhausted.
            suffixes = []
            while substring:
                if substring in self._rules:
                    tokens.extend(self._rules[substring])
                    break

                # A length of 0 (zero-width match) is treated like no match;
                # slicing by 0 would never shrink the chunk.
                prefix_len = self.find_prefix(substring)
                if prefix_len:
                    tokens.append(substring[:prefix_len])
                    substring = substring[prefix_len:]
                    continue

                # Likewise for suffixes: substring[-0:] is the whole chunk.
                suffix_len = self.find_suffix(substring)
                if suffix_len:
                    suffixes.append(substring[-suffix_len:])
                    substring = substring[:-suffix_len]
                    continue

                infixes = [
                    match for match in self.find_infix(substring)
                    if match.end() > match.start()
                ]
                if not infixes:
                    tokens.append(substring)
                    break

                offset = 0
                for match in infixes:
                    # Skip the empty piece before a leading infix or between
                    # two adjacent infixes.
                    if match.start() > offset:
                        tokens.append(substring[offset:match.start()])
                    tokens.append(substring[match.start():match.end()])
                    offset = match.end()
                # Whatever follows the last infix goes through the loop again.
                substring = substring[offset:]
            tokens.extend(reversed(suffixes))
        return tokens

    def find_infix(self, string):
        """Return the list of infix matches in ``string`` (empty if no matcher)."""
        if self.infix_finditer is None:
            return []
        return list(self.infix_finditer(string))

    def find_prefix(self, string):
        """Return the length of the prefix match, or ``None`` if there is none.

        The length is applied from the start of ``string`` by ``tokenize``;
        this presumes ``prefix_search`` is anchored at the start -- verify
        against the patterns supplied by the caller.
        """
        return self._match_length(self.prefix_search, string)

    def find_suffix(self, string):
        """Return the length of the suffix match, or ``None`` if there is none.

        The length is applied from the end of ``string`` by ``tokenize``;
        this presumes ``suffix_search`` is anchored at the end -- verify
        against the patterns supplied by the caller.
        """
        return self._match_length(self.suffix_search, string)

    @staticmethod
    def _match_length(search, string):
        """Return the span length of ``search(string)``, or ``None``."""
        if search is None:
            return None
        match = search(string)
        if match is None:
            return None
        return match.end() - match.start()