"""
Evaluate evidence predictions on the sentence level.
Usage:
evidence_eval_ambifc.py <directory> <predictions> <split> [--overwrite]
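
Example (hypothetical paths and file name, assuming predictions were written as JSONL):
    evidence_eval_ambifc.py ./predictions sentence-predictions.dev.jsonl dev --overwrite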
"""
import os
from os.path import join
from typing import Optional, List, Dict, Tuple, Iterable

from docopt import docopt

from ambifc.modeling.conf.train_data_config import TrainDataConfig
from ambifc.modeling.dataset.samples import get_samples_for_ambifc_subset
from ambifc.modeling.evaluate.eval import get_full_evidence_evaluation
from ambifc.modeling.prediction.sentence_predictions import get_non_empty_sentence_keys
from ambifc.util.fileutil import read_jsonl_from_dir, write_json

DEFAULT_PATH_DATA_DIR: str = './data'


def data_to_dict(
        samples: List[Dict],
        field_claim_id: str,
        field_passage_id: str,
        sentence_key: str
) -> Dict[Tuple[int, str, str], Dict]:
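    """Index samples by the (claim id, passage id, sentence key) triple taken from the given fields.

    Keys are assumed to be unique; a duplicate key raises an AssertionError.
    """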
    result: Dict[Tuple[int, str, str], Dict] = dict()
    for sample in samples:
        key: Tuple[int, str, str] = (sample[field_claim_id], sample[field_passage_id], sample[sentence_key])
        assert key not in result, f'Duplicate key: {key}'
        result[key] = sample
    return result


def get_gold_sentence_annotations(data_directory: str, ambifc_subset: str, split: str) -> List[Dict]:
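    """Build one gold record per non-empty evidence sentence for every sample in the split."""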
    return [
        {
            'claim_id': sample['claim_id'],
            'wiki_passage': sample['wiki_passage'],
            'sentence_key': sentence_key,
            'sentence_annotations': list(map(lambda x: x['annotation'], sample['sentence_annotations'][sentence_key])),
            'category': sample['category'],
            'passage_annotations': sample['passage_annotations']
        }
        for sample in get_samples_for_ambifc_subset(
            ambifc_subset=ambifc_subset,
            split=split,
            data_directory=data_directory
        )
        for sentence_key in get_non_empty_sentence_keys(sample)
    ]


def get_gold_sentence_annotations_as_dict(
        data_directory: str,
        ambifc_subset: str,
        split: str
) -> Dict[Tuple[int, str, str], Dict]:
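    """Gold sentence annotations keyed by (claim_id, wiki_passage, sentence_key)."""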
    sentence_annotations_gold: List[Dict] = get_gold_sentence_annotations(data_directory, ambifc_subset, split)
    return data_to_dict(
        sentence_annotations_gold,
        'claim_id',
        'wiki_passage',
        'sentence_key'
    )


def evaluate_all_evidence_prediction(
        prediction_directory: str,
        predictions_file: str,
        split: str,
        overwrite: bool = False,
        data_directory: Optional[str] = DEFAULT_PATH_DATA_DIR
) -> None:
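    """Evaluate sentence-level evidence predictions and write the metrics next to the predictions file.

    Metrics are computed over all samples and separately over the certain / uncertain subsets.
    """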
    gold_data: Dict[Tuple[int, str, str], Dict] = get_gold_sentence_annotations_as_dict(
        data_directory, TrainDataConfig.SUBSET_ALL_AMBIFC, split
    )

    keys_certain: List[Tuple[int, str, str]] = list(
        filter(lambda x: gold_data[x]['category'] == 'certain', gold_data.keys())
    )
    keys_uncertain: List[Tuple[int, str, str]] = list(
        filter(lambda x: gold_data[x]['category'] != 'certain', gold_data.keys())
    )
    keys_certain_5plus_annotations: List[Tuple[int, str, str]] = list(
        filter(lambda x: len(gold_data[x]['passage_annotations']) >= 5, keys_certain)
    )

    # Get separate evaluations for certain / uncertain samples.
    keys_to_evaluate: Iterable[Tuple[str, List[Tuple[int, str, str]]]] = [
        ('all', list(gold_data.keys())),
        ('certain', keys_certain),
        ('certain-5plus', keys_certain_5plus_annotations),
        ('uncertain', keys_uncertain)
    ]

    # Only keep categories that actually contain samples.
    keys_to_evaluate = list(
        filter(lambda x: len(x[1]) > 0, keys_to_evaluate)
    )
    for name, keys in keys_to_evaluate:
        print(f'{name}: evaluate {len(keys)} samples.')

    # Load predictions, keyed by the same (claim_id, passage, sentence_key) triples as the gold data.
    predicted_data: Dict[Tuple[int, str, str], Dict] = data_to_dict(
        list(read_jsonl_from_dir(prediction_directory, predictions_file)),
        'claim_id',
        'passage',
        'sentence_key'
    )

    all_metrics: Dict = {}
    for name, keys in keys_to_evaluate:
        metrics: Dict = get_full_evidence_evaluation(
            gold_samples={key: gold_data[key] for key in keys},
            predicted_samples={key: predicted_data[key] for key in keys}
        )
        all_metrics[name] = metrics

    metrics_file_name: str = 'evaluation-' + predictions_file.replace('.jsonl', '.json')
    dest_path: str = join(prediction_directory, metrics_file_name)

    # Remove any existing metrics file first when overwriting is requested.
    if overwrite and os.path.exists(dest_path):
        os.remove(dest_path)
    write_json(dest_path, all_metrics, pretty=True)
    print('Write:', dest_path)
    print('Done.')


def main(args) -> None:
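    """Parse the docopt arguments and run the evidence evaluation."""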
    directory: str = args['<directory>']
    predictions_file: str = args['<predictions>']
    split: str = args['<split>']
    overwrite: bool = args['--overwrite']
    assert os.path.exists(directory), f'Directory does not exist: {directory}'
    assert os.path.exists(join(directory, predictions_file)), f'Predictions file does not exist: {predictions_file}'
    assert split in ['train', 'dev', 'test'], f'Unknown split: {split}'

    evaluate_all_evidence_prediction(
        directory, predictions_file, split, overwrite=overwrite
    )


if __name__ == "__main__":
    args = docopt(__doc__)
    main(args)