This repository has been archived by the owner on Jun 13, 2020. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
baselines.py
119 lines (90 loc) · 5.13 KB
/
baselines.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
# Copyright (C) 2018 Francisco de Borja Valero <franborjavalero@gmail.com>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import os
import sys
import argparse
import npostagging.baselines
# Command-line interface.
#
# Usage:
#   python baselines.py get source_directory
#   python baselines.py evaluate source_directory [--set SET]
#
# NOTE: the command line is parsed here, at module level, so it is parsed as
# soon as this file is executed or imported. The resulting module-level
# `args` namespace is read by get_arguments() and main() below.
parser = argparse.ArgumentParser()
parser.add_argument('mode', type=str, help='Modes: get, or evaluate')
parser.add_argument('source_directory', type=str, help='Directory source datasets')
# --set is only validated (against dev/test) when mode is "evaluate".
parser.add_argument('--set', default="test", type=str, help='Set to evaluate dev or test')
args = parser.parse_args()
def get_arguments():
    """Validate the parsed command-line arguments and return them.

    Reads the module-level ``args`` namespace and checks, in order:

    1. ``args.mode`` is ``"get"`` or ``"evaluate"``.
    2. ``args.source_directory`` is an existing directory.
    3. Every file required by the selected mode exists inside
       ``args.source_directory``.
    4. In ``"evaluate"`` mode only, ``args.set`` is ``"dev"`` or ``"test"``.

    On the first failed check an error message is printed and the process
    exits with status 1 (``sys.exit(1)``).

    Returns:
        The module-level ``args`` namespace, unchanged.
    """
    modes = ("get", "evaluate")
    sections = ("dev", "test")
    # Files that must already exist in source_directory for each mode.
    # Evaluate mode requires all three exported models because main()
    # loads every one of them when evaluating.
    required_files = {
        "get": (
            "train_disambiguate.txt",
            "ambiguity_matrix.npy",
            "tag_to_id.json",
            "word_to_id.json",
            "id_word_to_id_ambiguity.json",
        ),
        "evaluate": (
            "baseline_tag_level.npy",
            "baseline_ambiguity_level.npy",
            "baseline_word_level.npy",
        ),
    }

    if args.mode not in modes:
        # Report the valid *modes* here (not the dev/test sets).
        print("Error, expected mode {}".format(", ".join(modes)))
        sys.exit(1)

    if not os.path.isdir(args.source_directory):
        print("Error, source_directory {} does not exist".format(args.source_directory))
        sys.exit(1)

    for filename in required_files[args.mode]:
        path_file = os.path.join(args.source_directory, filename)
        if not os.path.exists(path_file):
            print("Error, file {} does not exist".format(path_file))
            sys.exit(1)

    # The --set option is only meaningful when evaluating.
    if args.mode == "evaluate" and args.set not in sections:
        print("Error, expected argument {}".format(", ".join(sections)))
        sys.exit(1)

    return args
def main():
    """Build and export the baseline models, or evaluate exported ones.

    The validated arguments come from get_arguments(). All files are
    looked up inside ``arguments.source_directory``.

    * ``get`` mode: constructs the tag-, ambiguity- and word-level baselines
      from ``train_disambiguate.txt`` (plus the id/ambiguity mapping files)
      and calls ``export`` on each, writing ``baseline_tag_level.npy``,
      ``baseline_ambiguity_level.npy`` and ``baseline_word_level.npy``.
    * ``evaluate`` mode: constructs the three baselines from those exported
      files, calls ``evaluate`` on the selected set (``arguments.set``) and
      prints the three values each call returns, labelled total_acc,
      ambiguity_acc and unk_acc.
    """
    arguments = get_arguments()
    source_directory = arguments.source_directory

    # Files shared by both modes.
    filename_model_tag_level = os.path.join(source_directory, "baseline_tag_level.npy")
    filename_model_ambiguity_level = os.path.join(source_directory, "baseline_ambiguity_level.npy")
    filename_model_word_level = os.path.join(source_directory, "baseline_word_level.npy")
    filename_ambiguity_matrix = os.path.join(source_directory, "ambiguity_matrix.npy")
    filename_tag_to_id = os.path.join(source_directory, "tag_to_id.json")
    filename_word_to_id = os.path.join(source_directory, "word_to_id.json")

    # Use the validated namespace returned by get_arguments() rather than
    # reaching for the module-level global.
    if arguments.mode == "get":
        filename_train = os.path.join(source_directory, "train_disambiguate.txt")
        filename_id_word_to_id_ambiguity = os.path.join(source_directory, "id_word_to_id_ambiguity.json")

        # NOTE(review): the first positional argument is the training corpus
        # here but the exported model file in evaluate mode - confirm this
        # against the npostagging.baselines constructors.
        tag_level = npostagging.baselines.BaselineTagLevel(filename_train, filename_ambiguity_matrix)
        ambiguity_level = npostagging.baselines.BaselineAmbiguityLevel(
            filename_train, filename_tag_to_id, filename_word_to_id, filename_id_word_to_id_ambiguity
        )
        word_level = npostagging.baselines.BaselineWordLevel(
            filename_tag_to_id, filename_word_to_id, disambiguate_corpus=filename_train
        )

        tag_level.export(filename_model_tag_level)
        ambiguity_level.export(filename_model_ambiguity_level)
        word_level.export(filename_model_word_level)
        return

    # evaluate mode
    filename_X_ambiguity = os.path.join(source_directory, "X_{}.txt".format(arguments.set))
    filename_X_words = os.path.join(source_directory, "{}_words.txt".format(arguments.set))
    filename_y = os.path.join(source_directory, "y_tag_{}.txt".format(arguments.set))

    # Each entry is (model name, total_acc, ambiguity_acc, unk_acc). Every
    # model is constructed and evaluated before the next one is constructed.
    results = []

    tag_level = npostagging.baselines.BaselineTagLevel(
        filename_model_tag_level, filename_ambiguity_matrix, filename_tag_to_id
    )
    total_acc, ambiguity_acc, unk_acc = tag_level.evaluate(filename_X_ambiguity, filename_y)
    results.append(("tag", total_acc, ambiguity_acc, unk_acc))

    ambiguity_level = npostagging.baselines.BaselineAmbiguityLevel(
        filename_model_ambiguity_level, filename_tag_to_id
    )
    total_acc, ambiguity_acc, unk_acc = ambiguity_level.evaluate(filename_X_ambiguity, filename_y)
    results.append(("ambiguity", total_acc, ambiguity_acc, unk_acc))

    # The word-level baseline is evaluated on the words file, not the X_ file.
    word_level = npostagging.baselines.BaselineWordLevel(
        filename_tag_to_id, filename_word_to_id, words_apparitions=filename_model_word_level
    )
    total_acc, ambiguity_acc, unk_acc = word_level.evaluate(filename_X_words, filename_y)
    results.append(("word", total_acc, ambiguity_acc, unk_acc))

    print("Results {} set".format(arguments.set))
    for model, total_acc, ambiguity_acc, unk_acc in results:
        print(
            "Baseline: {} level - total_acc: {:.3f} ambiguity_acc: {:.3f} unk_acc: {:.3f}".format(
                model, total_acc, ambiguity_acc, unk_acc
            )
        )


if __name__ == "__main__":
    main()