-
Notifications
You must be signed in to change notification settings - Fork 1
/
correct.py
executable file
·339 lines (277 loc) · 10.3 KB
/
correct.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
"""
Yfirlestur: Online spelling and grammar correction for Icelandic
High-level wrappers for checking grammar and spelling
Copyright (C) 2022 Miðeind ehf.
This software is licensed under the MIT License:
Permission is hereby granted, free of charge, to any person
obtaining a copy of this software and associated documentation
files (the "Software"), to deal in the Software without restriction,
including without limitation the rights to use, copy, modify, merge,
publish, distribute, sublicense, and/or sell copies of the Software,
and to permit persons to whom the Software is furnished to do so,
subject to the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
This module exports check_grammar(), a function called from main.py
to apply grammar and spelling annotations to user-supplied text.
"""
from typing import (
    Any,
    List,
    Sequence,
    Tuple,
    Iterator,
    Iterable,
    Optional,
    Callable,
    cast,
)

import hashlib
import os
import random
import secrets

from typing_extensions import TypedDict

from tokenizer import Tok
from reynir import Sentence
from reynir.bintokenizer import StringIterable
import reynir_correct
from reynir_correct import CorrectionPipeline, check_with_stats
from reynir_correct.annotation import Annotation

import nertokenizer
# True if running in a continuous integration (CI) test environment,
# signalled by a nonempty CI environment variable.
# (bool() of the looked-up value is equivalent to the old `> ""` string
# comparison, but states the intent directly.)
CI_RUN = bool(os.environ.get("CI"))

# Salts that are mixed into the hashed token returned with each
# annotated sentence; the client must echo the token back when
# giving feedback on an annotation
START_SALT = "*[GC start]*"
END_SALT = "*[GC end]*"
# Type definitions
class StatsDict(TypedDict):
    """Statistics returned from an annotation task"""

    # Total number of tokens in the checked text
    num_tokens: int
    # Total number of sentences found
    num_sentences: int
    # Number of sentences that were successfully parsed
    num_parsed: int
    # Number of characters processed (sum of original token lengths)
    num_chars: int
    # Parse ambiguity measure (presumably averaged per sentence — confirm upstream)
    ambiguity: float
class AnnDict(TypedDict):
    """A single annotation, as returned by the API"""

    # Index of the first token covered by this annotation
    start: int
    # Index of the last token covered (inclusive)
    end: int
    # Character offset of the annotation start within the original text
    start_char: int
    # Character offset of the last annotated character (inclusive)
    end_char: int
    # Reference strings associated with the annotation, if any
    references: Optional[List[str]]
    # Annotation (error/warning) code
    code: str
    # Short human-readable description of the annotation
    text: str
    # Longer detail text, if available
    detail: Optional[str]
    # A single replacement suggestion, if available
    suggest: Optional[str]
    # A list of replacement suggestions, if available
    suggestlist: Optional[List[str]]
class AnnTokenDict(TypedDict, total=False):
    """Type of the token dictionaries returned from check_grammar();
    total=False since the 'i' offset is assigned after construction"""

    # Token kind
    k: int
    # Token text
    x: str
    # Original text of token
    o: str
    # Character offset of token, indexed from the start of the checked text
    i: int
class AnnResultDict(TypedDict):
    """The annotation result for a sentence"""

    # The original sentence text, exactly as the tokenizer saw it
    original: str
    # The tokens of the sentence, with character offsets
    tokens: List[AnnTokenDict]
    # Hashed token that the client must echo back when giving feedback
    token: str
    # Nonce used together with the original text to generate the token
    nonce: str
    # The annotations that apply to this sentence
    annotations: List[AnnDict]
    # The corrected (tidied) sentence text
    corrected: str


# List of sentences, each having an associated list of AnnResultDict instances
CheckResult = Tuple[List[List[AnnResultDict]], StatsDict]
class RecognitionPipeline(CorrectionPipeline):
    """Derived class that adds a named entity recognition pass
    to the GreynirCorrect tokenization pipeline"""

    # Note: no __init__ override is needed; the previous one only
    # forwarded the text iterable to the base class constructor

    def recognize_entities(self, stream: Iterator[Tok]) -> Iterator[Tok]:
        """Recognize named entities using the nertokenizer module,
        but construct tokens using the Correct_TOK class from
        reynir_correct"""
        if CI_RUN:
            # Skip the recognize_entities pass if we are running in a
            # continuous integration environment, where we have no database
            return stream
        return nertokenizer.recognize_entities(
            stream, token_ctor=reynir_correct.Correct_TOK
        )
class NERCorrect(reynir_correct.GreynirCorrect):
    """Derived class to override the default tokenization of
    GreynirCorrect to perform named entity recognition"""

    # Note: no __init__ override is needed; the previous one only
    # passed **options through to the base class constructor

    def tokenize(self, text: StringIterable) -> Iterator[Tok]:
        """Use the recognizing & correcting tokenizer instead
        of the normal one"""
        pipeline = RecognitionPipeline(text)
        return pipeline.tokenize()
def generate_nonce() -> str:
    """Generate a random nonce, consisting of 8 digits.

    Uses the secrets module rather than random: the nonce feeds the
    salted feedback-token scheme, so it should not be produced by a
    predictable PRNG."""
    return "{0:08}".format(secrets.randbelow(10**8))
def generate_token(original: str, nonce: str) -> str:
    """Generate a 64-character token string using the original
    sentence string and the given nonce"""
    # Sandwich the nonce and original text between the two salts,
    # then hash; sha256 hexdigest is exactly 64 characters long
    payload = (START_SALT + nonce + original + END_SALT).encode("utf-8")
    digest = hashlib.sha256(payload).hexdigest()
    return digest[:64]
def validate_token_and_nonce(original: str, token: str, nonce: str) -> bool:
    """Check whether a given token/nonce combination corresponds to
    the original sentence given"""
    expected = generate_token(original, nonce)
    return expected == token
# Annotation codes that are ignored by default when checking grammar.
# Fix: the original list contained "P_WRONG_PLACE_PP" twice; each code
# now appears exactly once.
DEFAULT_IGNORED_RULES = [
    "ASLBRODDVANTAR",
    "ASLSTAFVANTAR",
    "ASLVITLSTAF",
    "ASLVIXL",
    "BEYGVILLA",
    "C005_w",
    "C005",
    "EY4EI",
    "P_DIR_LOC",
    "P_DOUBLE_DEFINITE",
    "P_MOOD_ACK",
    "P_MOOD_COND",
    "P_MOOD_PURP",
    "P_MOOD_REL",
    "P_MOOD_TEMP_w",
    "P_NT_Einkunn",
    "P_NT_EndingIR",
    "P_NT_FsMeðFallstjórn",
    "P_NT_ÍTölu",
    "P_NT_Manns",
    "P_NT_Né",
    "P_NT_Sem_w",
    "P_NT_ÞóAð",
    "P_SINGSUB_GEN",
    "P_VeraAð",
    "P_wrong_case",
    "P_WRONG_PLACE_PP",
    "P_WRONG_PREP_AÐ",
    "RFTGR",
    "S001",
    "W001_w",
    "Y001_w",
    "Ý4Í",
    "P_WRONG_CASE_nf_þf",
    "P_WRONG_CASE_nf_þgf",
    "P_WRONG_CASE_þgf_nf",
]
def check_grammar(
    text: str,
    *,
    progress_func: Optional[Callable[[float], None]] = None,
    split_paragraphs: bool = True,
    annotate_unparsed_sentences: bool = True,
    suppress_suggestions: bool = False,
    ignore_wordlist: Sequence[str] = (),
    ignore_rules: Sequence[str] = DEFAULT_IGNORED_RULES,
) -> CheckResult:
    """Check the grammar and spelling of the given text and return
    a list of annotated paragraphs, containing sentences, containing
    tokens. The progress_func, if given, will be called periodically
    during processing to indicate progress, with a ratio parameter
    which is a float in the range 0.0..1.0.

    Note: ignore_wordlist defaults to an immutable empty tuple
    (instead of a mutable list literal) to avoid the shared mutable
    default argument pitfall; the change is backward compatible
    since the value is only read, never mutated."""
    result = check_with_stats(
        text,
        progress_func=progress_func,
        split_paragraphs=split_paragraphs,
        annotate_unparsed_sentences=annotate_unparsed_sentences,
        suppress_suggestions=suppress_suggestions,
        ignore_wordlist=ignore_wordlist,
        ignore_rules=ignore_rules,
    )

    # Character index of each token within the submitted text,
    # counting from its beginning; advanced by encode_sentence()
    offset = 0

    def encode_sentence(sent: Sentence) -> AnnResultDict:
        """Map a reynir._Sentence object to a raw sentence dictionary
        expected by the web UI"""
        tokens: List[AnnTokenDict]
        if sent.tree is None:
            # Not parsed: use the raw token list
            tokens = [
                AnnTokenDict(k=d.kind, x=d.txt, o=d.original or d.txt)
                for d in sent.tokens
            ]
        else:
            # Successfully parsed: use the text from the terminals (where available)
            # since we have more info there, for instance on em/en dashes.
            # Create a map of token indices to corresponding terminal text
            assert sent.terminals is not None
            token_map = {t.index: t.text for t in sent.terminals}
            tokens = [
                AnnTokenDict(
                    k=d.kind, x=token_map.get(ix, d.txt), o=d.original or d.txt
                )
                for ix, d in enumerate(sent.tokens)
            ]
        # Add token character offsets to the annotation tokens
        nonlocal offset
        for ix, t in enumerate(sent.tokens):
            tokens[ix]["i"] = offset
            offset += len(t.original or "")
        # Annotations may be missing on unannotated sentence objects
        a: Iterable[Annotation] = getattr(
            sent, "annotations", cast(List[Annotation], [])
        )
        len_tokens = len(tokens)
        # Reassemble the original sentence text, as the tokenizer saw it
        original = "".join((t.original or "") for t in sent.tokens)
        # Create a nonce and a token that the user must return correctly
        # to give feedback on this annotation via the /feedback endpoint
        nonce = generate_nonce()
        token = generate_token(original, nonce)
        annotations: List[AnnDict] = [
            AnnDict(
                # Start token index of this annotation
                start=ann.start,
                # End token index (inclusive)
                end=ann.end,
                # Character offset of the start of the annotation in the original text
                start_char=tokens[ann.start].get("i", 0),
                # Character offset of the end of the annotation in the original text
                # (inclusive, i.e. the offset of the last character)
                end_char=(
                    tokens[ann.end + 1].get("i", 0)
                    if ann.end + 1 < len_tokens
                    else offset
                )
                - 1,
                references=ann.references,
                code=ann.code,
                text=ann.text,
                detail=ann.detail,
                suggest=ann.suggest,
                suggestlist=ann.suggestlist,
            )
            for ann in a
        ]
        return AnnResultDict(
            original=original,
            tokens=tokens,
            token=token,
            nonce=nonce,
            annotations=annotations,
            corrected=sent.tidy_text,
        )

    pglist = result["paragraphs"]
    pgs = [[encode_sentence(sent) for sent in pg] for pg in pglist]
    stats = StatsDict(
        num_tokens=result["num_tokens"],
        num_sentences=result["num_sentences"],
        num_parsed=result["num_parsed"],
        num_chars=offset,
        ambiguity=result["ambiguity"],
    )
    return pgs, stats