Add raw predictions and references to results (#934)
* Add raw (textual) predictions and references

Now there are:
prediction - original model output
references - original references as text [list]

processed_prediction - model output after post processing
processed_references - references after post processing [list]

Updated the examples to use the new fields.

Added a check in test_api to verify that the expected fields are returned.

Signed-off-by: Yoav Katz <katz@il.ibm.com>

* Updated glossary

Signed-off-by: Yoav Katz <katz@il.ibm.com>

* Fixed expected results of the evaluate API

Signed-off-by: Yoav Katz <katz@il.ibm.com>

* Fixed LLMAsJudge to use the processed_prediction field

Signed-off-by: Yoav Katz <katz@il.ibm.com>

---------

Signed-off-by: Yoav Katz <katz@il.ibm.com>
yoavkatz authored and gitMichal committed Jul 15, 2024
1 parent 7654912 commit 8eabe10
Showing 7 changed files with 151 additions and 27 deletions.
42 changes: 37 additions & 5 deletions docs/docs/glossary.rst
@@ -109,6 +109,18 @@ the Instruction-User-Agent schema cues, and the two presented demonstrations.

The catalog contains predefined formats :ref:`here <catalog.formats>`.

.. _inference_engine:

Inference Engine
----------------

An inference engine in Unitxt is an object that performs model inference on Unitxt datasets.
Unitxt provides out-of-the-box inference engines that wrap Huggingface pipelines as well as the OpenAI and IBMGenAI APIs.
Since Unitxt has separate data preparation and evaluation pipelines, you can use any external code or engine to generate
model predictions. The built-in inference engines are simply more convenient to use,
and they also ensure that no sensitive data is passed to external services.
(`see code example here <https://github.com/IBM/unitxt/blob/main/examples/standalone_qa_evaluation.py>`_)
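
A minimal sketch of that flow, in the spirit of the linked example (the card and template names and the engine parameters here are illustrative, not prescribed by this commit):

    from unitxt.api import evaluate, load_dataset
    from unitxt.inference import HFPipelineBasedInferenceEngine

    # Build a Unitxt dataset (card/template names are illustrative).
    dataset = load_dataset(
        "card=cards.squad,template=templates.qa.with_context.simple,max_test_instances=20"
    )
    test_dataset = dataset["test"]

    # A built-in engine wrapping a Huggingface pipeline; any external code
    # producing a list of prediction strings could be used instead.
    inference_model = HFPipelineBasedInferenceEngine(
        model_name="google/flan-t5-small", max_new_tokens=32
    )
    predictions = inference_model.infer(test_dataset)

    evaluated_dataset = evaluate(predictions=predictions, data=test_dataset)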

.. _operator:

Operator
@@ -137,6 +149,22 @@ template could remove trailing whitespace, take the first word, convert `Yes`` t

.. _recipe:

Prediction and Processed Prediction
------------------------------------

A prediction is the output of the model on the input provided to it.
The inference process used to generate the prediction can be performed with a Unitxt :ref:`Inference Engine <inference_engine>` or any other
framework or code. The predictions over all instances are passed to the evaluation pipeline, together with the original dataset.

The textual predictions returned by the model are processed by the :ref:`Template <template>`'s :ref:`Post processors <post_processors>`
before being passed to the :ref:`Metrics <metric>`. The post processors convert the textual prediction to the
type required by the metrics. For example, `Yes` and `No` values could first be normalized to `yes` and `no` and then converted
into `0.0` and `1.0`.

After evaluation, the `prediction` field of each instance in the result dataset contains the prediction returned by the model, and
the `processed_prediction` field holds the prediction after post processing by the template.
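
For instance, after running `evaluate` as in the updated examples below, the two fields could be inspected like this (a sketch; `evaluated_dataset` stands for the list returned by `evaluate`):

    for instance in evaluated_dataset:
        # Raw model output, exactly as the model returned it.
        print(instance["prediction"])
        # The same output after the template's post processors, e.g. a float.
        print(instance["processed_prediction"])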


Recipe
------

@@ -147,21 +175,25 @@ This includes :ref:`DataTask card <data_task_card>`, :ref:`Template <template>`,

.. _references:

References
--------------------------
References and Processed References
------------------------------------

References are the "correct answers" for the task for a given instance.
They are stored as a list of strings in the `references` field of the generated Unitxt dataset.
For example, a reference for a binary classification task could be `Yes`` or `No`.
For example, a reference for a binary classification task could be `Yes` or `No`.

It is expected that the model will get a perfect score from the metrics if the model prediction
is equal to one of the references.

The textual references are processed by the :ref:`Template <template>`'s :ref:`Post processors <post_processors>`
before passed to the :ref:`Metrics <metric>`. The post processor de-verbalize the textual representation
of the references and converted it to the types required by the metric. For example, `Yes` and `No`
before being passed to the :ref:`Metrics <metric>`. The post processors convert the textual representation
of the references to the type required by the metrics. For example, `Yes` and `No`
values could be converted into `0.0` and `1.0`.

The `references` field of the dataset contains the textual references, and the result dataset after evaluation
contains an additional `processed_references` field with the references after post processing by the template.
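
As a rough illustration only (the real post processors are catalog operators such as `processors.take_first_non_empty_line`, not this function), the de-verbalization described above amounts to something like:

    # Hypothetical stand-in for the catalog post processors; shown only to
    # illustrate turning textual references into metric-ready values.
    def deverbalize(text: str) -> float:
        first_line = next((line for line in text.splitlines() if line.strip()), "")
        return 1.0 if first_line.strip().lower() == "yes" else 0.0

    references = ["Yes"]                                          # `references` field
    processed_references = [deverbalize(r) for r in references]   # -> [1.0]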


.. _target:

Target
4 changes: 2 additions & 2 deletions examples/evaluate_existing_dataset_no_install.py
@@ -22,7 +22,7 @@
# (f1_micro, f1_macro, accuracy, including confidence intervals)

metric = evaluate.load("unitxt/metric")
results = metric.compute(predictions=predictions, references=dataset["test"])
evaluated_dataset = metric.compute(predictions=predictions, references=dataset["test"])

# print the aggregated scores dictionary.
print_dict(results[0]["score"]["global"])
print_dict(evaluated_dataset[0]["score"]["global"])
25 changes: 16 additions & 9 deletions examples/qa_evaluation.py
@@ -58,18 +58,25 @@
# inference_model = OpenAiInferenceEngine(model_name=model_name, parameters=gen_params)
#
predictions = inference_model.infer(test_dataset)
dataset_with_scores = evaluate(predictions=predictions, data=test_dataset)
evaluated_dataset = evaluate(predictions=predictions, data=test_dataset)

# Print results
for sample, prediction in zip(dataset_with_scores, predictions):
for instance in evaluated_dataset:
logger.info("*" * 80)
logger.info(f"Model input:\n{sample['source']}")
logger.info(f"Model prediction (as returned by the model):\n{prediction}")
logger.info(f"Model prediction (after post processing):\n{sample['prediction']}")
logger.info(f"References:\n{sample['references']}")
score_name = sample["score"]["instance"]["score_name"]
score = sample["score"]["instance"]["score"]
logger.info(f"Model input:\n{instance['source']}")
logger.info(
f"Model prediction (as returned by the model):\n{instance['prediction']}"
)
logger.info(
f"Model prediction (after post processing):\n{instance['processed_prediction']}"
)
logger.info(f"References:\n{instance['references']}")
logger.info(
f"References (after post processing):\n{instance['processed_references']}"
)
score_name = instance["score"]["instance"]["score_name"]
score = instance["score"]["instance"]["score"]
logger.info(f"Sample score ({score_name}) : {score}")
global_score = dataset_with_scores[0]["score"]["global"]["score"]
global_score = evaluated_dataset[0]["score"]["global"]["score"]
logger.info("*" * 80)
logger.info(f"Aggregated score ({score_name}) : {global_score}")
25 changes: 16 additions & 9 deletions examples/standalone_qa_evaluation.py
@@ -68,18 +68,25 @@
# loader=LoadFromDictionary(data=data,data_classification_policy=["public"]),

predictions = inference_model.infer(test_dataset)
dataset_with_scores = evaluate(predictions=predictions, data=test_dataset)
evaluated_dataset = evaluate(predictions=predictions, data=test_dataset)

# Print results
for sample, prediction in zip(dataset_with_scores, predictions):
for instance in evaluated_dataset:
logger.info("*" * 80)
logger.info(f"Model input:\n{sample['source']}")
logger.info(f"Model prediction (as returned by the model):\n{prediction}")
logger.info(f"Model prediction (after post processing):\n{sample['prediction']}")
logger.info(f"References:\n{sample['references']}")
score_name = sample["score"]["instance"]["score_name"]
score = sample["score"]["instance"]["score"]
logger.info(f"Model input:\n{instance['source']}")
logger.info(
f"Model prediction (as returned by the model):\n{instance['prediction']}"
)
logger.info(
f"Model prediction (after post processing):\n{instance['processed_prediction']}"
)
logger.info(f"References:\n{instance['references']}")
logger.info(
f"References (after post processing):\n{instance['processed_references']}"
)
score_name = instance["score"]["instance"]["score_name"]
score = instance["score"]["instance"]["score"]
logger.info(f"Sample score ({score_name}) : {score}")
global_score = dataset_with_scores[0]["score"]["global"]["score"]
global_score = evaluated_dataset[0]["score"]["global"]["score"]
logger.info("*" * 80)
logger.info(f"Aggregated score ({score_name}) : {global_score}")
5 changes: 4 additions & 1 deletion src/unitxt/llm_as_judge.py
@@ -136,7 +136,10 @@ def compute(
verdicts = self.inference_model.infer(dataset)
meta_scores = evaluate(predictions=verdicts, data=dataset)
return [
{self.main_score: instance["prediction"], "judge_raw_output": verdict}
{
self.main_score: instance["processed_prediction"],
"judge_raw_output": verdict,
}
for instance in meta_scores
for verdict in verdicts
]
29 changes: 29 additions & 0 deletions src/unitxt/metric_utils.py
@@ -18,6 +18,7 @@
Copy,
FlattenInstances,
MergeStreams,
RenameFields,
SplitByNestedGroup,
)
from .register import _reset_env_local_catalogs, register_all_artifacts
@@ -155,13 +156,29 @@ def prepare(self):
self.steps = [
FromPredictionsAndOriginalData(),
LoadJson(field="task_data"),
Copy(
field="prediction",
to_field="raw_prediction",
),
Copy(
field="references",
to_field="raw_references",
),
Copy(
field="source",
to_field="task_data/source",
),
ApplyOperatorsField(
operators_field="postprocessors",
),
Copy(
field="prediction",
to_field="processed_prediction",
),
Copy(
field="references",
to_field="processed_references",
),
SplitByNestedGroup(
field_name_of_group="group",
number_of_fusion_generations=self.number_of_fusion_generations,
@@ -172,6 +189,18 @@
),
MultiStreamScoreMean(),
MergeStreams(),
RenameFields(
field="raw_prediction",
to_field="prediction",
),
RenameFields(
field="raw_references",
to_field="references",
),
Copy(
field="source",
to_field="task_data/source",
),
]
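
Read as a whole, the sequence above first sets the raw values aside, runs the post processors, records the processed values, scores, and then restores the raw values under their original names. A rough walk-through with plain dicts (not the actual operator classes, and with made-up values):

    # Hypothetical single instance, before post processing.
    instance = {"prediction": " Yes", "references": ["Yes"]}

    # Copy the raw values aside (Copy to raw_prediction / raw_references).
    instance["raw_prediction"] = instance["prediction"]
    instance["raw_references"] = list(instance["references"])

    # ApplyOperatorsField runs the template post processors in place.
    instance["prediction"] = 1.0
    instance["references"] = [1.0]

    # Keep the processed values under explicit names.
    instance["processed_prediction"] = instance["prediction"]
    instance["processed_references"] = instance["references"]

    # ... metrics are computed on the processed values ...

    # After scoring, restore the raw values under the original field names.
    instance["prediction"] = instance.pop("raw_prediction")
    instance["references"] = instance.pop("raw_references")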


48 changes: 47 additions & 1 deletion tests/library/test_api.py
@@ -1,3 +1,4 @@
import numpy as np
from unitxt.api import evaluate, load_dataset, produce
from unitxt.card import TaskCard
from unitxt.loaders import LoadHF
@@ -40,7 +41,52 @@ def test_evaluate(self):
)
predictions = ["2.5", "2.5", "2.2", "3", "4"]
results = evaluate(predictions, dataset["train"])
self.assertAlmostEqual(results[0]["score"]["global"]["score"], 0.026, 3)
instance_with_results = {
"metrics": ["metrics.spearman"],
"source": "Given this sentence: 'A plane is taking off.', on a scale of 1.0 to 5.0, what is the similarity to this text 'An air plane is taking off.'?\n",
"target": "5.0",
"references": ["5.0"],
"task_data": {
"text1": "A plane is taking off.",
"text2": "An air plane is taking off.",
"attribute_name": "similarity",
"min_value": 1.0,
"max_value": 5.0,
"attribute_value": 5.0,
"metadata": {"template": "templates.regression.two_texts.simple"},
"source": "Given this sentence: 'A plane is taking off.', on a scale of 1.0 to 5.0, what is the similarity to this text 'An air plane is taking off.'?\n",
},
"group": "unitxt",
"origin": "all~unitxt",
"postprocessors": [
"processors.take_first_non_empty_line",
"processors.cast_to_float_return_zero_if_failed",
],
"data_classification_policy": ["public"],
"prediction": "2.5",
"processed_prediction": 2.5,
"processed_references": [5.0],
"score": {
"global": {
"score": 0.026315789473684213,
"score_ci_high": 0.9639697714358006,
"score_ci_low": -0.970678676196682,
"score_name": "spearmanr",
"spearmanr": 0.026315789473684213,
"spearmanr_ci_high": 0.9639697714358006,
"spearmanr_ci_low": -0.970678676196682,
},
"instance": {
"score": np.nan,
"score_name": "spearmanr",
"spearmanr": np.nan,
},
},
}
# Processors are not serialized correctly yet
del results[0]["postprocessors"]
del instance_with_results["postprocessors"]
self.assertDictEqual(results[0], instance_with_results)

def test_evaluate_with_metrics_external_setup(self):
dataset = load_dataset(
