forked from chrisjbryant/errant
-
Notifications
You must be signed in to change notification settings - Fork 0
/
m2_to_m2.py
102 lines (99 loc) · 4.88 KB
/
m2_to_m2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
import argparse
import os
import spacy
from nltk.stem.lancaster import LancasterStemmer
import scripts.align_text as align_text
import scripts.cat_rules as cat_rules
import scripts.toolbox as toolbox
def main(args):
    """Extract and/or classify the edits in an m2 file and write a new m2 file.

    Depending on the parsed command line args, either reuses the existing gold
    edit alignments (-gold) or re-extracts edits automatically from the
    parallel sentences (-auto), optionally giving each edit an automatic
    error type unless -old_cats is set.

    Args:
        args: argparse.Namespace with at least: m2 (input path), out (output
            path), auto/gold (mode flags), max_edits, old_cats, lev, merge.
    """
    # Get base working directory.
    basename = os.path.dirname(os.path.realpath(__file__))
    print("Loading resources...")
    # Load Tokenizer and other resources
    nlp = spacy.load("en")
    # Lancaster Stemmer
    stemmer = LancasterStemmer()
    # GB English word list (inc -ise and -ize)
    gb_spell = toolbox.loadDictionary(basename+"/resources/en_GB-large.txt")
    # Part of speech map file
    tag_map = toolbox.loadTagMap(basename+"/resources/en-ptb_map")
    print("Processing files...")
    # Read the m2 file and split into sentence+edit chunks; the context
    # manager guarantees the input file handle is closed after reading.
    with open(args.m2) as m2_in:
        m2_file = m2_in.read().strip().split("\n\n")
    # Open the output m2 file; using a context manager ensures it is
    # flushed and closed even if processing raises (the original leaked it).
    with open(args.out, "w") as out_m2:
        for info in m2_file:
            # Get the original and corrected sentence + edits for each annotator.
            orig_sent, coder_dict = toolbox.processM2(info)
            # Write the orig_sent to the output m2 file.
            out_m2.write("S "+" ".join(orig_sent)+"\n")
            # Only process sentences with edits.
            if coder_dict:
                # Save marked up original sentence here, if required.
                proc_orig = ""
                # Loop through the annotators
                for coder, coder_info in sorted(coder_dict.items()):
                    cor_sent = coder_info[0]
                    gold_edits = coder_info[1]
                    # If there is only 1 edit and it is noop, just write it.
                    if gold_edits[0][2] == "noop":
                        out_m2.write(toolbox.formatEdit(gold_edits[0], coder)+"\n")
                        continue
                    # Markup the orig and cor sentence with spacy (assume tokenized)
                    # Orig is marked up only once for the first coder that needs it.
                    proc_orig = toolbox.applySpacy(orig_sent, nlp) if not proc_orig else proc_orig
                    proc_cor = toolbox.applySpacy(cor_sent, nlp)
                    # Loop through gold edits.
                    for gold_edit in gold_edits:
                        # Um and UNK edits (uncorrected errors) are always preserved.
                        if gold_edit[2] in {"Um", "UNK"}:
                            # Um should get changed to UNK unless using old categories.
                            if gold_edit[2] == "Um" and not args.old_cats: gold_edit[2] = "UNK"
                            out_m2.write(toolbox.formatEdit(gold_edit, coder)+"\n")
                        # Gold edits
                        elif args.gold:
                            # Minimise the edit; e.g. [has eaten -> was eaten] = [has -> was]
                            if not args.max_edits:
                                gold_edit = toolbox.minimiseEdit(gold_edit, proc_orig, proc_cor)
                                # If minimised to nothing, the edit disappears.
                                if not gold_edit: continue
                            # Give the edit an automatic error type.
                            if not args.old_cats:
                                cat = cat_rules.autoTypeEdit(gold_edit, proc_orig, proc_cor, gb_spell, tag_map, nlp, stemmer)
                                gold_edit[2] = cat
                            # Write the edit to the output m2 file.
                            out_m2.write(toolbox.formatEdit(gold_edit, coder)+"\n")
                    # Auto edits
                    if args.auto:
                        # Auto align the parallel sentences and extract the edits.
                        auto_edits = align_text.getAutoAlignedEdits(proc_orig, proc_cor, nlp, args)
                        # Loop through the edits.
                        for auto_edit in auto_edits:
                            # Give each edit an automatic error type.
                            cat = cat_rules.autoTypeEdit(auto_edit, proc_orig, proc_cor, gb_spell, tag_map, nlp, stemmer)
                            auto_edit[2] = cat
                            # Write the edit to the output m2 file.
                            out_m2.write(toolbox.formatEdit(auto_edit, coder)+"\n")
            # Write a newline when there are no more coders.
            out_m2.write("\n")
if __name__ == "__main__":
# Define and parse program input
parser = argparse.ArgumentParser(description="Automatically extract and/or type edits in an m2 file.",
formatter_class=argparse.RawTextHelpFormatter,
usage="%(prog)s [-h] (-auto | -gold) [options] m2 -out OUT")
parser.add_argument("m2", help="A path to an m2 file.")
type_group = parser.add_mutually_exclusive_group(required=True)
type_group.add_argument("-auto", help="Extract edits automatically.", action="store_true")
type_group.add_argument("-gold", help="Use existing edit alignments.", action="store_true")
parser.add_argument("-out", help="The output filepath.", required=True)
parser.add_argument("-max_edits", help="Do not minimise edit spans. (gold only)", action="store_true")
parser.add_argument("-old_cats", help="Do not reclassify the edits. (gold only)", action="store_true")
parser.add_argument("-lev", help="Use standard Levenshtein to align sentences.", action="store_true")
parser.add_argument("-merge", choices=["rules", "all-split", "all-merge", "all-equal"], default="rules",
help="Choose a merging strategy for automatic alignment.\n"
"rules: Use a rule-based merging strategy (default)\n"
"all-split: Merge nothing; e.g. MSSDI -> M, S, S, D, I\n"
"all-merge: Merge adjacent non-matches; e.g. MSSDI -> M, SSDI\n"
"all-equal: Merge adjacent same-type non-matches; e.g. MSSDI -> M, SS, D, I")
args = parser.parse_args()
main(args)