-
Notifications
You must be signed in to change notification settings - Fork 0
/
languageModel.py
109 lines (89 loc) · 2.86 KB
/
languageModel.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
https://powcoder.com
代写代考加微信 powcoder
Assignment Project Exam Help
Add WeChat powcoder
import numpy as np
'''
Tuan Do, Kenneth Lai
'''
class LanguageModel(object) :
START = "<S>";
STOP = "</S>";
UNK = "<UNK>";
def __init__(self):
self.word_dict = {}
self.total = None
self.prob_counter = None
'''
Constructs a language model from a collection of sentences.
-----
trainingSentences = list of lists of strings (words)
'''
def train(self, trainingSentences):
pass
'''
Returns the probability, according to the model, of the word specified
by the argument sentence and index. Index ranges from 0 to len(sentence),
inclusive. If index==len(sentence), return P(STOP | context).
-----
sentence: list of strings (words)
index: index to calculate the probablity
'''
def getWordProbability(self, sentence, index):
pass
'''
Returns the set of tokens the model makes predictions for. This
includes STOP and UNK, but not START (because we do not need to
compute P(START | context)).
-----
Return: list of strings
'''
def getVocabulary(self):
return list(self.word_dict)
'''
Returns a random word sampled according to the model.
-----
Return: string
'''
def generateWord(self):
pass
'''
Returns a random sentence sampled according to the model.
-----
Return: list of strings
'''
def generateSentence(self):
result = []
# limit sentence length to 20
for i in range(20):
word = LanguageModel.UNK
while word == LanguageModel.UNK:
# make sure word != UNK
word = self.generateWord(result)
if word == LanguageModel.STOP:
break
result.append(word)
return result
#-----------------------------------------------------------------------
'''
Returns the probability, according to the model, of the specified
sentence. This is the product of the probabilities of each word in
the sentence (including a final stop token).
-----
sentence: list of strings
'''
def getSentenceLogProbability(self, sentence) :
logProbability = sum( np.log2(self.getWordProbability(sentence, i)) for i in range(len(sentence) + 1))
return logProbability
'''
Given a list of words, sums over the probabilities of every token that
could follow. If the model implements a valid probability
distribution, this should always sum to 1.
'''
def checkProbability(self, context):
modelsum = 0.0;
for token in self.getVocabulary():
context.append(token)
modelsum += self.getWordProbability(context, len(context) - 1)
del context[-1]
return modelsum