From c26379db5a1acce755fcabc604052e9f47dc81d5 Mon Sep 17 00:00:00 2001 From: lilacheden Date: Fri, 8 Mar 2024 16:21:32 +0200 Subject: [PATCH 1/9] add processor predictions_yes_1_else_0 Signed-off-by: lilacheden --- prepare/processors/processors.py | 11 +++++++++++ .../processors/predictions_yes_1_else_0.json | 10 ++++++++++ src/unitxt/processors.py | 7 +++++++ tests/library/test_postprocessors.py | 16 ++++++++++++++++ 4 files changed, 44 insertions(+) create mode 100644 src/unitxt/catalog/processors/predictions_yes_1_else_0.json diff --git a/prepare/processors/processors.py b/prepare/processors/processors.py index baec959ad..643d2fd41 100644 --- a/prepare/processors/processors.py +++ b/prepare/processors/processors.py @@ -16,6 +16,7 @@ TakeFirstWord, ToYesOrNone, YesNoToInt, + YesToOneElseZero, ) logger = get_logger() @@ -160,6 +161,16 @@ overwrite=True, ) +add_to_catalog( + SequentialOperator( + steps=[ + YesToOneElseZero(field="prediction", process_every_value=False), + ] + ), + "processors.predictions_yes_1_else_0", + overwrite=True, +) + add_to_catalog( SequentialOperator( steps=[ diff --git a/src/unitxt/catalog/processors/predictions_yes_1_else_0.json b/src/unitxt/catalog/processors/predictions_yes_1_else_0.json new file mode 100644 index 000000000..0eef86eef --- /dev/null +++ b/src/unitxt/catalog/processors/predictions_yes_1_else_0.json @@ -0,0 +1,10 @@ +{ + "type": "sequential_operator", + "steps": [ + { + "type": "yes_to_one_else_zero", + "field": "prediction", + "process_every_value": false + } + ] +} diff --git a/src/unitxt/processors.py b/src/unitxt/processors.py index a45ab83f1..ddcd4a2e6 100644 --- a/src/unitxt/processors.py +++ b/src/unitxt/processors.py @@ -152,6 +152,13 @@ def process_value(self, text: Any) -> Any: return text +class YesToOneElseZero(FieldOperator): + def process_value(self, text: Any) -> Any: + if text == "yes": + return "1" + return "0" + + class StrToFloatFormat(FieldOperator): def process_value(self, text: Any) -> Any: try: diff --git a/tests/library/test_postprocessors.py b/tests/library/test_postprocessors.py index 7966467ab..26da90e6c 100644 --- a/tests/library/test_postprocessors.py +++ b/tests/library/test_postprocessors.py @@ -170,6 +170,22 @@ def test_to_yes_or_none(self): tester=self, ) + def test_predictions_yes_1_else_0(self): + parser, _ = fetch_artifact("processors.predictions_yes_1_else_0") + inputs = ["yes", "no", "yaa"] + targets = [ + {"references": ["yes"], "prediction": "1"}, + {"references": ["no"], "prediction": "0"}, + {"references": ["yaa"], "prediction": "0"}, + ] + + check_operator( + operator=parser, + inputs=list_to_stream_with_prediction_and_references(inputs), + targets=targets, + tester=self, + ) + def test_str_to_float_format(self): parser, _ = fetch_artifact("processors.str_to_float_format") inputs = ["-2.4", "5", "5a"] From fd2c24becd43923a2b2e4a538a52c7b135770d0a Mon Sep 17 00:00:00 2001 From: lilacheden Date: Tue, 12 Mar 2024 15:30:21 +0200 Subject: [PATCH 2/9] binary metrics accept real values predictions (with threshold 0.5) Signed-off-by: lilacheden --- src/unitxt/metrics.py | 34 +++++++++++++--------------------- tests/library/test_metrics.py | 23 ++--------------------- 2 files changed, 15 insertions(+), 42 deletions(-) diff --git a/src/unitxt/metrics.py b/src/unitxt/metrics.py index 631426e20..2d3234643 100644 --- a/src/unitxt/metrics.py +++ b/src/unitxt/metrics.py @@ -1098,11 +1098,6 @@ def get_str_id(self, str): self.id_to_str[id] = str return self.str_to_id[str] - def _labels_match_average_format( - self, 
references: List[List[str]], predictions: List[str] - ): - return True - def compute( self, references: List[List[str]], @@ -1112,8 +1107,6 @@ def compute( assert all( len(reference) == 1 for reference in references ), "Only a single reference per prediction is allowed in F1 metric" - if not self._labels_match_average_format(references, predictions): - return {self.main_score: np.nan} self.str_to_id = {} self.id_to_str = {} @@ -1155,21 +1148,20 @@ class F1Binary(F1): pos_classes = {"1", "1.0", "yes", "true"} def get_str_id(self, str): - if str.lower() in self.pos_classes: - return 1 - return 0 + return int(str) - # References and predictions must include up to 2 unique values, one of them in pos_classes - def _labels_match_average_format( - self, references: List[List[str]], predictions: List[str] - ): - classes = set(predictions + list(itertools.chain(*references))) - n_classes = len(classes) - if n_classes > 2: - return False - if n_classes == 2 and len(set(classes).difference(self.pos_classes)) == 0: - return False - return True + def compute( + self, + references: List[List[str]], + predictions: List[str], + task_data: List[Dict], + ) -> dict: + predictions_floats = [to_float_or_default(p) for p in predictions] + predictions = [str(int(p > 0.5)) for p in predictions_floats] + references = [ + ["1"] if r[0].lower() in self.pos_classes else "0" for r in references + ] + return super().compute(references, predictions, task_data) class RecallBinary(F1Binary): diff --git a/tests/library/test_metrics.py b/tests/library/test_metrics.py index 1ff4352a6..282515ae7 100644 --- a/tests/library/test_metrics.py +++ b/tests/library/test_metrics.py @@ -169,8 +169,8 @@ def test_f1_micro(self): def test_f1_binary(self): metric = F1Binary() - references = [["1"], ["0"], ["0"], ["0"], ["1"], ["1"]] - predictions = ["1", "1", "0", "0", "1", "1"] + references = [["1"], ["0"], ["0"], ["0"], ["Yes"], ["1"]] + predictions = ["0.8", "1", "0.2", "0", "0.6", "1"] global_target = 0.8571428571428 outputs = apply_metric( @@ -213,25 +213,6 @@ def test_recall_binary(self): self.assertEqual("recall_binary", outputs[0]["score"]["global"]["score_name"]) self.assertEqual("recall_binary", outputs[0]["score"]["instance"]["score_name"]) - def test_f1_binary_non_binary(self): - metric = F1Binary() - references = [["1"], ["0"], ["yes"], ["0"], ["1"], ["1"]] - predictions = ["1", "1", "0", "0", "1", "1"] - - outputs = apply_metric( - metric=metric, predictions=predictions, references=references - ) - self.assertTrue(isnan(outputs[0]["score"]["global"]["score"])) - - metric = F1Binary() - references = [["1"], ["yes"], ["1"], ["1"]] - predictions = ["1", "1", "1", "1"] - - outputs = apply_metric( - metric=metric, predictions=predictions, references=references - ) - self.assertTrue(isnan(outputs[0]["score"]["global"]["score"])) - def test_max_f1(self): metric = BinaryMaxF1() references = [["1"], ["0"], ["0"]] From 0e739be065ac275b45875fd5a890f00b4bc6bfa7 Mon Sep 17 00:00:00 2001 From: lilacheden Date: Tue, 12 Mar 2024 15:41:58 +0200 Subject: [PATCH 3/9] add [] Signed-off-by: lilacheden --- src/unitxt/metrics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/unitxt/metrics.py b/src/unitxt/metrics.py index 2d3234643..0cfa48fbb 100644 --- a/src/unitxt/metrics.py +++ b/src/unitxt/metrics.py @@ -1159,7 +1159,7 @@ def compute( predictions_floats = [to_float_or_default(p) for p in predictions] predictions = [str(int(p > 0.5)) for p in predictions_floats] references = [ - ["1"] if r[0].lower() in self.pos_classes 
else "0" for r in references + ["1"] if r[0].lower() in self.pos_classes else ["0"] for r in references ] return super().compute(references, predictions, task_data) From f0917a2c316592560ca2bfadb5b75dd4ba0e3ff9 Mon Sep 17 00:00:00 2001 From: lilacheden Date: Tue, 12 Mar 2024 17:59:16 +0200 Subject: [PATCH 4/9] update BinaryMaxF1 Signed-off-by: lilacheden --- src/unitxt/metrics.py | 22 ++-------------------- 1 file changed, 2 insertions(+), 20 deletions(-) diff --git a/src/unitxt/metrics.py b/src/unitxt/metrics.py index 0cfa48fbb..18dfc3ee6 100644 --- a/src/unitxt/metrics.py +++ b/src/unitxt/metrics.py @@ -1,4 +1,3 @@ -import itertools import re import string import uuid @@ -3091,31 +3090,14 @@ def compute( assert all( len(reference) == 1 for reference in references ), "Only a single reference per prediction is allowed in F1 metric" - classes = set(itertools.chain(*references)) - n_clases = len(classes) - assert len(classes) <= 2, "References of BinaryMaxF1 must be binary" - pos_classes = classes.intersection(self.pos_classes) - neg_classes = classes.difference(self.pos_classes) - n_pos_classes = len(pos_classes) - if n_clases == 2: - assert ( - n_pos_classes == 1 - ), "Only one positive class is allowed in BinaryMaxF1" - pos_class = next(iter(pos_classes)) if n_pos_classes > 0 else "1.0" - neg_class = next(iter(neg_classes)) if len(neg_classes) > 0 else "0.0" - float_predictions = [] - for prediction in predictions: - try: - float_predictions.append(float(prediction)) - except Exception: - float_predictions.append(0) + float_predictions = [to_float_or_default(p) for p in predictions] best_thr = -1 best_f1 = -1 for thr in set(float_predictions): new_predictions = [ - pos_class if float_prediction >= thr else neg_class + "1" if float_prediction >= thr else "0" for float_prediction in float_predictions ] f1 = super().compute(references, new_predictions, task_data)[ From cc2af9ffed6b42982e05121cf3831f0b8740fcba Mon Sep 17 00:00:00 2001 From: Ariel Gera Date: Tue, 12 Mar 2024 23:13:30 +0200 Subject: [PATCH 5/9] Add BinaryMaxAccuracy metric Signed-off-by: Ariel Gera --- prepare/metrics/accuracy.py | 5 ++- .../catalog/metrics/max_accuracy_binary.json | 3 ++ src/unitxt/metrics.py | 40 +++++++++++++++++++ tests/library/test_metrics.py | 19 +++++++++ 4 files changed, 66 insertions(+), 1 deletion(-) create mode 100644 src/unitxt/catalog/metrics/max_accuracy_binary.json diff --git a/prepare/metrics/accuracy.py b/prepare/metrics/accuracy.py index 8892cea7e..787711c8e 100644 --- a/prepare/metrics/accuracy.py +++ b/prepare/metrics/accuracy.py @@ -1,5 +1,5 @@ from src.unitxt import add_to_catalog -from src.unitxt.metrics import Accuracy +from src.unitxt.metrics import Accuracy, BinaryMaxAccuracy from src.unitxt.test_utils.metrics import test_metric metric = Accuracy() @@ -32,3 +32,6 @@ ) add_to_catalog(metric, "metrics.accuracy", overwrite=True) + +metric = BinaryMaxAccuracy() +add_to_catalog(metric, "metrics.max_accuracy_binary", overwrite=True) diff --git a/src/unitxt/catalog/metrics/max_accuracy_binary.json b/src/unitxt/catalog/metrics/max_accuracy_binary.json new file mode 100644 index 000000000..15097cf78 --- /dev/null +++ b/src/unitxt/catalog/metrics/max_accuracy_binary.json @@ -0,0 +1,3 @@ +{ + "type": "binary_max_accuracy" +} diff --git a/src/unitxt/metrics.py b/src/unitxt/metrics.py index 18dfc3ee6..538d65acb 100644 --- a/src/unitxt/metrics.py +++ b/src/unitxt/metrics.py @@ -3108,3 +3108,43 @@ def compute( best_thr = thr return {self.main_score: best_f1, "best_thr_maxf1": best_thr} + + +class 
BinaryMaxAccuracy(GlobalMetric): + process_single_instances = False + main_score = "max_accuracy_binary" + pos_classes = {"1", "1.0", "yes", "true"} + + def compute( + self, + references: List[List[str]], + predictions: List[List[str]], + task_data: List[Dict], + ) -> dict: + assert all( + len(reference) == 1 for reference in references + ), "Only a single reference per prediction is allowed in BinaryMaxAccuracy metric" + + float_predictions = [to_float_or_default(p) for p in predictions] + references = [ + ["1"] if r[0].lower() in self.pos_classes else ["0"] for r in references + ] + + best_thr = -1 + best_acc = -1 + for thr in set(float_predictions): + new_predictions = [ + "1" if float_prediction >= thr else "0" + for float_prediction in float_predictions + ] + acc = np.mean( + [ + [prediction] == reference + for prediction, reference in zip(new_predictions, references) + ] + ) + if acc > best_acc: + best_acc = acc + best_thr = thr + + return {self.main_score: best_acc, "best_thr_max_acc": best_thr} diff --git a/tests/library/test_metrics.py b/tests/library/test_metrics.py index 282515ae7..a11b9f22a 100644 --- a/tests/library/test_metrics.py +++ b/tests/library/test_metrics.py @@ -3,6 +3,7 @@ from src.unitxt.logging_utils import get_logger from src.unitxt.metrics import ( Accuracy, + BinaryMaxAccuracy, BinaryMaxF1, F1Binary, F1Macro, @@ -227,6 +228,24 @@ def test_max_f1(self): self.assertEqual("max_f1_binary", outputs[0]["score"]["global"]["score_name"]) self.assertEqual("max_f1_binary", outputs[0]["score"]["instance"]["score_name"]) + def test_binary_max_accuracy(self): + metric = BinaryMaxAccuracy() + references = [["1"], ["0"], ["0"], ["1"], ["0"]] + predictions = ["0.3", "0", "0.7", "1.0", "0.2"] + + global_target = 0.8 + outputs = apply_metric( + metric=metric, predictions=predictions, references=references + ) + + self.assertAlmostEqual(global_target, outputs[0]["score"]["global"]["score"]) + self.assertEqual( + "max_accuracy_binary", outputs[0]["score"]["global"]["score_name"] + ) + self.assertEqual( + "max_accuracy_binary", outputs[0]["score"]["instance"]["score_name"] + ) + def test_f1_macro(self): metric = F1Macro() references = [["cat"], ["dog"], ["dog"], ["dog"], ["cat"], ["cat"]] From c67e5f6017b47ac1ae7174b9308915defb6a1e78 Mon Sep 17 00:00:00 2001 From: Ariel Gera Date: Wed, 13 Mar 2024 00:31:17 +0200 Subject: [PATCH 6/9] Add Binary Accuracy metric (threshold 0.5) Signed-off-by: Ariel Gera --- prepare/metrics/accuracy.py | 5 +++- .../catalog/metrics/accuracy_binary.json | 3 +++ src/unitxt/metrics.py | 25 +++++++++++++++++++ tests/library/test_metrics.py | 22 ++++++++++++++++ 4 files changed, 54 insertions(+), 1 deletion(-) create mode 100644 src/unitxt/catalog/metrics/accuracy_binary.json diff --git a/prepare/metrics/accuracy.py b/prepare/metrics/accuracy.py index 787711c8e..374532432 100644 --- a/prepare/metrics/accuracy.py +++ b/prepare/metrics/accuracy.py @@ -1,5 +1,5 @@ from src.unitxt import add_to_catalog -from src.unitxt.metrics import Accuracy, BinaryMaxAccuracy +from src.unitxt.metrics import Accuracy, BinaryAccuracy, BinaryMaxAccuracy from src.unitxt.test_utils.metrics import test_metric metric = Accuracy() @@ -33,5 +33,8 @@ add_to_catalog(metric, "metrics.accuracy", overwrite=True) +metric = BinaryAccuracy() +add_to_catalog(metric, "metrics.accuracy_binary") + metric = BinaryMaxAccuracy() add_to_catalog(metric, "metrics.max_accuracy_binary", overwrite=True) diff --git a/src/unitxt/catalog/metrics/accuracy_binary.json 
b/src/unitxt/catalog/metrics/accuracy_binary.json new file mode 100644 index 000000000..7141514c6 --- /dev/null +++ b/src/unitxt/catalog/metrics/accuracy_binary.json @@ -0,0 +1,3 @@ +{ + "type": "binary_accuracy" +} diff --git a/src/unitxt/metrics.py b/src/unitxt/metrics.py index 538d65acb..d871709ba 100644 --- a/src/unitxt/metrics.py +++ b/src/unitxt/metrics.py @@ -3110,6 +3110,31 @@ def compute( return {self.main_score: best_f1, "best_thr_maxf1": best_thr} +class BinaryAccuracy(InstanceMetric): + """Calculate accuracy for a binary task, using 0.5 as the threshold in the case of float predictions.""" + + reduction_map = {"mean": ["accuracy_binary"]} + main_score = "accuracy_binary" + ci_scores = ["accuracy_binary"] + pos_classes = {"1", "1.0", "yes", "true"} + + def compute( + self, references: List[Any], prediction: Any, task_data: List[Dict] + ) -> dict: + assert ( + len(references) == 1 + ), "Only a single reference per prediction is allowed in Binary Accuracy metric" + + float_prediction = to_float_or_default(prediction) + prediction = str(int(float_prediction > 0.5)) + references = ["1"] if references[0].lower() in self.pos_classes else ["0"] + + result = {self.main_score: float([prediction] == references)} + result["score"] = result[self.main_score] + result["score_name"] = self.main_score + return result + + class BinaryMaxAccuracy(GlobalMetric): process_single_instances = False main_score = "max_accuracy_binary" diff --git a/tests/library/test_metrics.py b/tests/library/test_metrics.py index a11b9f22a..d35cd293c 100644 --- a/tests/library/test_metrics.py +++ b/tests/library/test_metrics.py @@ -3,6 +3,7 @@ from src.unitxt.logging_utils import get_logger from src.unitxt.metrics import ( Accuracy, + BinaryAccuracy, BinaryMaxAccuracy, BinaryMaxF1, F1Binary, @@ -228,6 +229,27 @@ def test_max_f1(self): self.assertEqual("max_f1_binary", outputs[0]["score"]["global"]["score_name"]) self.assertEqual("max_f1_binary", outputs[0]["score"]["instance"]["score_name"]) + def test_accuracy_binary(self): + metric = BinaryAccuracy() + references = [["1"], ["0"], ["0"], ["1"], ["0"]] + predictions = ["0.3", "0", "0.7", "1.0", "0.2"] + + expected_global_result = { + "accuracy_binary": 3 / 5, + "score": 3 / 5, + "score_name": "accuracy_binary", + } + + outputs = apply_metric( + metric=metric, predictions=predictions, references=references + ) + global_result = { + k: v + for k, v in outputs[0]["score"]["global"].items() + if k in expected_global_result + } + self.assertDictEqual(expected_global_result, global_result) + def test_binary_max_accuracy(self): metric = BinaryMaxAccuracy() references = [["1"], ["0"], ["0"], ["1"], ["0"]] From 681034642197e8d9fe3024c80820b8a7113e64eb Mon Sep 17 00:00:00 2001 From: lilacheden Date: Wed, 13 Mar 2024 09:51:16 +0200 Subject: [PATCH 7/9] threshold to metric parameter Signed-off-by: lilacheden --- src/unitxt/metrics.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/unitxt/metrics.py b/src/unitxt/metrics.py index d871709ba..f28ea550c 100644 --- a/src/unitxt/metrics.py +++ b/src/unitxt/metrics.py @@ -1141,10 +1141,13 @@ class F1Micro(F1): class F1Binary(F1): + """Calculate f1 for a binary task, using 0.5 as the threshold in the case of float predictions.""" + process_single_instances = False main_score = "f1_binary" average = "binary" pos_classes = {"1", "1.0", "yes", "true"} + threshold = 0.5 def get_str_id(self, str): return int(str) @@ -1156,7 +1159,7 @@ def compute( task_data: List[Dict], ) -> dict: predictions_floats = 
[to_float_or_default(p) for p in predictions] - predictions = [str(int(p > 0.5)) for p in predictions_floats] + predictions = [str(int(p > self.threshold)) for p in predictions_floats] references = [ ["1"] if r[0].lower() in self.pos_classes else ["0"] for r in references ] @@ -3117,6 +3120,7 @@ class BinaryAccuracy(InstanceMetric): main_score = "accuracy_binary" ci_scores = ["accuracy_binary"] pos_classes = {"1", "1.0", "yes", "true"} + threshold = 0.5 def compute( self, references: List[Any], prediction: Any, task_data: List[Dict] @@ -3126,7 +3130,7 @@ def compute( ), "Only a single reference per prediction is allowed in Binary Accuracy metric" float_prediction = to_float_or_default(prediction) - prediction = str(int(float_prediction > 0.5)) + prediction = str(int(float_prediction > self.threshold)) references = ["1"] if references[0].lower() in self.pos_classes else ["0"] result = {self.main_score: float([prediction] == references)} From ad5294db8f3d63714a742bcea883059a016307c2 Mon Sep 17 00:00:00 2001 From: lilacheden Date: Wed, 13 Mar 2024 11:49:57 +0200 Subject: [PATCH 8/9] add overwrite Signed-off-by: lilacheden --- prepare/metrics/accuracy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/prepare/metrics/accuracy.py b/prepare/metrics/accuracy.py index 374532432..7528e4adb 100644 --- a/prepare/metrics/accuracy.py +++ b/prepare/metrics/accuracy.py @@ -34,7 +34,7 @@ add_to_catalog(metric, "metrics.accuracy", overwrite=True) metric = BinaryAccuracy() -add_to_catalog(metric, "metrics.accuracy_binary") +add_to_catalog(metric, "metrics.accuracy_binary", overwrite=True) metric = BinaryMaxAccuracy() add_to_catalog(metric, "metrics.max_accuracy_binary", overwrite=True) From 3a9d93bd5a198b99fb22f5a8b65c19bc5a38d3e5 Mon Sep 17 00:00:00 2001 From: lilacheden Date: Wed, 13 Mar 2024 16:14:02 +0200 Subject: [PATCH 9/9] add doc Signed-off-by: lilacheden --- src/unitxt/metrics.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/unitxt/metrics.py b/src/unitxt/metrics.py index f28ea550c..51eb66f2c 100644 --- a/src/unitxt/metrics.py +++ b/src/unitxt/metrics.py @@ -3082,6 +3082,8 @@ class FixedGroupAbsvalNormHedgesGParaphraseStringContainment(StringContainment): class BinaryMaxF1(F1Binary): + """Calculate the maximal F1 and the decision threshold that achieves it for a binary task with float predictions.""" + main_score = "max_f1_binary" def compute( @@ -3140,6 +3142,8 @@ def compute( class BinaryMaxAccuracy(GlobalMetric): + """Calculate the maximal accuracy and the decision threshold that achieves it for a binary task with float predictions.""" + process_single_instances = False main_score = "max_accuracy_binary" pos_classes = {"1", "1.0", "yes", "true"}
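
A minimal usage sketch of the binary metrics introduced in this patch series, mirroring the style of the tests above. It assumes the `apply_metric` helper already used in `tests/library/test_metrics.py` (its import is not shown in this excerpt), and the scores and labels below are illustrative only:

    from src.unitxt.metrics import BinaryAccuracy, BinaryMaxAccuracy, F1Binary

    # Gold labels (any value in pos_classes, e.g. "1" or "yes", counts as positive)
    # and float model scores given as strings, one per instance.
    references = [["1"], ["0"], ["no"], ["yes"], ["0"]]
    predictions = ["0.9", "0.4", "0.7", "1.0", "0.2"]

    # F1Binary and BinaryAccuracy binarize each score at the default threshold of 0.5,
    # so these predictions are treated as "1", "0", "1", "1", "0".
    # apply_metric is assumed to be the same test helper used in the tests above.
    for metric in (F1Binary(), BinaryAccuracy()):
        outputs = apply_metric(
            metric=metric, predictions=predictions, references=references
        )
        print(
            outputs[0]["score"]["global"]["score_name"],
            outputs[0]["score"]["global"]["score"],
        )

    # BinaryMaxAccuracy instead sweeps every observed score as a candidate threshold
    # and reports the best achievable accuracy; its compute() also returns the chosen
    # threshold under "best_thr_max_acc".
    outputs = apply_metric(
        metric=BinaryMaxAccuracy(), predictions=predictions, references=references
    )
    print(
        outputs[0]["score"]["global"]["score"],
        outputs[0]["score"]["global"].get("best_thr_max_acc"),
    )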