-
Notifications
You must be signed in to change notification settings - Fork 0
/
lemmaV.py
executable file
·131 lines (120 loc) · 5.28 KB
/
lemmaV.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
#!python3
from characters import VOWEL_SMALL_O as o#ɔ
from characters import VOWEL_SMALL_E as e#ɛ
from google.cloud import translate_v2 as translate
from google.oauth2 import service_account
import enchant
# Verb endings that mark a spoken complement form ('iɛ' / 'eɛ', built from imported vowel chars).
complement_word_ending = ['i'+e, 'e'+e]
# Negated surface forms whose roots do not follow the regular nasal-prefix rules.
irregular_verbs = {"nni": 'w'+o, 'mfa': 'de'}
# Google Cloud Translate client, authenticated from a local service-account key file.
credentials = service_account.Credentials.from_service_account_file("service-account.json")
translate_client = translate.Client(credentials=credentials)
# British-English dictionary (pyenchant) — presumably for loanword checks; unused in this chunk. TODO confirm
englishCheck = enchant.Dict("en_GB")
def filter(words: list[dict], firstAttempt: bool = True) -> list[dict]:
    """Check whether the candidate roots in `words` are actual Twi words
    (or loanwords from English) by round-tripping each root through Google
    Translate. It does not check that the roots are verbs.

    Each entry is a dict with at least "Twi" (surface form) and "Root"
    (candidate lemma). Entries that translate successfully gain an
    "English" key and are kept. If nothing matches in lowercase, the roots
    are retried capitalized; if that also fails, the last candidate is
    returned untranslated as a best-effort fallback (it may be a name).
    Returns [] for empty input.
    """
    if not words:
        # Guard: the give-up branch below indexes words[-1]; an empty input
        # previously raised IndexError (via the capitalized retry).
        return []
    new_list = []
    for wordObj in words:
        word = wordObj["Root"]
        if firstAttempt:
            word = word.lower()
        # hard code a few words that Google Translate handles poorly
        if word == 'de':
            wordObj["English"] = 'put'
            new_list.append(wordObj)
        elif word == 'mo':
            wordObj["English"] = "you"
            new_list.append(wordObj)
        else:
            res = translate_client.translate(word, source_language='ak', target_language='en', format_='text')
            translationAttempt = res['translatedText']
            if translationAttempt != word:  # if twi translation to english does not fail
                wordObj["English"] = translationAttempt
                new_list.append(wordObj)  # is an actual lemmatized word, add to list
    if not new_list and firstAttempt:  # if nothing with lowercase
        upped = []
        for poss in words:
            poss["Root"] = poss["Root"].capitalize()  # try capitalizing
            upped.append(poss)
        return filter(upped, firstAttempt=False)
    if not new_list and not firstAttempt:  # if no results from capitalizing as well
        words[-1]['Root'] = words[-1]['Root'].lower()
        # BUGFIX: original read the loop variable `wordObj` here, which is
        # merely the last element of `words` — say so explicitly.
        # give up and return the present without a translation [maybe its a name?]
        words[-1]["English"] = words[-1]["Twi"]
        return words[-1:]
    else:
        return new_list
def stdN(word: str):
    """Strip the nasal (n/m) prefix from `word` and return a list of
    guesses for the root spelling.

    Nasal assimilation means a surface 'mm' may come from an 'm'- or
    'b'-initial root, and a surface 'nn' from an 'n'- or 'd'-initial
    root; anything else just drops its single-letter prefix.
    """
    if word in irregular_verbs:
        return [irregular_verbs[word]]
    stem = word[2:]
    if word.startswith('mm'):   # surface mm <- root m or b
        return ['m' + stem, 'b' + stem]
    if word.startswith('nn'):   # surface nn <- root n or d
        return ['n' + stem, 'd' + stem]
    return [word[1:]]           # standard word: drop the prefix letter
def negationPrefix(word: str) -> bool:
    """Return whether `word` carries the Twi negation (nasal) prefix.

    The prefix surfaces as 'n', or as 'm' before labials ('mm', 'mp',
    'mf'). Returns False for the empty string (the original indexed
    word[0] and raised IndexError on '').
    """
    if not word:
        return False
    return word[:2] in ('mm', 'mp', 'mf') or word[0] == 'n'
def lemmatizeVerb(word: str) -> list[dict]:
    """Lemmatize `word` without context, assuming the part of speech is a verb.

    Generates every morphologically plausible (root, tense) analysis by
    stripping the Twi tense/negation affixes, then passes the candidates to
    `filter`, which queries Google Translate to keep only real words.
    Returns a list of dicts with keys "Twi", "Root", "Tense" (and "English"
    once filtered); [] for empty input.
    """
    if not word:
        # Guard: the affix checks below index word[0]/word[-1].
        return []
    results = []
    negated = negationPrefix(word)  # hoisted; reused by four analyses below
    # present perfect negative, with complement doubling: n..xx
    # BUGFIX: added len(word) >= 2 — a 1-char negation candidate like 'n'
    # previously raised IndexError on word[-2] (the parallel check for the
    # plain past tense further down already had this guard).
    if negated and len(word) >= 2 and word[-1] == word[-2]:
        for protoRoot in stdN(word[:-1]):
            results.append({"Twi": word, "Root": protoRoot, "Tense": 'PRP_N'})
    # present perfect negative, with complement ending: n...iɛ
    if negated and word[-2:] in complement_word_ending:
        for protoRoot in stdN(word[:-2]):
            results.append({"Twi": word, "Root": protoRoot, "Tense": 'PRP_N'})
    # future negative / present negative: n..
    if negated:
        for protoRoot in stdN(word):
            results.append({"Twi": word, "Root": protoRoot, "Tense": 'PRS_N/FUT_N'})
    # past negative: an..
    if word[0] == 'a' and len(word) > 2 and negationPrefix(word[1:]):
        for protoRoot in stdN(word[1:]):
            results.append({"Twi": word, "Root": protoRoot, "Tense": 'PST_N'})
    # progressive negative / immediate future negative: ren..
    if word[:2] == 're' and negationPrefix(word[2:]):
        for protoRoot in stdN(word[2:]):
            results.append({"Twi": word, "Root": protoRoot, "Tense": 'PRG_N/IMF_N'})
    # progressive: re..
    if word[:2] == 're':
        results.append({"Twi": word, "Root": word[2:], "Tense": 'PRG'})
    # future: bɛ..
    if word[:2] == 'bɛ':
        results.append({"Twi": word, "Root": word[2:], "Tense": 'FUT'})
    # immediate future: rebɛ..
    if word[:4] == 'rebɛ':
        results.append({"Twi": word, "Root": word[4:], "Tense": 'IMF'})
    # present perfect: a..
    if word[0] == 'a' and len(word) > 1:
        results.append({"Twi": word, "Root": word[1:], "Tense": 'PRP'})
    # past with complement ending: ..iɛ
    if word[-2:] in complement_word_ending:
        results.append({"Twi": word, "Root": word[:-2], "Tense": 'PST'})
    # past with doubled final consonant: ..xx
    if len(word) >= 2 and word[-1] == word[-2]:
        results.append({"Twi": word, "Root": word[:-1], "Tense": 'PST'})
    # the word unchanged may itself be the present-tense root
    results.append({"Twi": word, "Root": word, "Tense": 'PRS'})
    return filter(results)  # filter against dictionary
if __name__ == '__main__':
    # Simple REPL: read one word per line, print its candidate analyses.
    while True:
        print(lemmatizeVerb(input()))