add LlamaIndex faithfulness metric (#971)
* add LlamaIndex faithfulness metric

Signed-off-by: Ariel Gera <ariel.gera1@ibm.com>

* share code between LlamaIndex metrics

Signed-off-by: Ariel Gera <ariel.gera1@ibm.com>

* use existing 'score_prefix' field

Signed-off-by: Ariel Gera <ariel.gera1@ibm.com>

* remove unused field

Signed-off-by: Ariel Gera <ariel.gera1@ibm.com>

---------

Signed-off-by: Ariel Gera <ariel.gera1@ibm.com>
arielge authored and gitMichal committed Jul 15, 2024
1 parent 1483095 commit 643a438
Showing 6 changed files with 143 additions and 94 deletions.
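For orientation, here is a minimal sketch (not part of the commit) of the naming scheme the refactor described above is expected to produce: each metric keeps the shared llama_index_by_<model>_judge main score from the new base class, and the existing score_prefix field distinguishes correctness from faithfulness. The loop only reproduces the string logic visible in the diff; the printed names match the catalog IDs and test targets in the preparation script below.

# Illustrative only: mirrors the naming logic in prepare/metrics/llama_index_metrics.py.
for metric_name in ("correctness", "faithfulness"):
    for model_name in ("gpt-3.5-turbo", "mock"):
        normalized = model_name.replace(".", "_").replace("-", "_")
        catalog_id = f"metrics.rag.{metric_name}.llama_index_by_{normalized}"
        score_name = f"{metric_name}_llama_index_by_{normalized}_judge"
        print(catalog_id, "->", score_name)
# e.g. metrics.rag.correctness.llama_index_by_gpt_3_5_turbo -> correctness_llama_index_by_gpt_3_5_turbo_judge
#      metrics.rag.faithfulness.llama_index_by_mock -> faithfulness_llama_index_by_mock_judge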
58 changes: 0 additions & 58 deletions prepare/metrics/llama_index_correctness.py

This file was deleted.

68 changes: 68 additions & 0 deletions prepare/metrics/llama_index_metrics.py
@@ -0,0 +1,68 @@
from unitxt import add_to_catalog
from unitxt.metrics import LlamaIndexCorrectness, LlamaIndexFaithfulness
from unitxt.test_utils.metrics import test_metric

# Test with mock
model_name = "mock"
model_name_normalized = model_name.replace(".", "_").replace("-", "_")

predictions = ["The right answer"]
references = [["The right answer", "The wrong answer"]]
task_data = [
    {
        "question": "question number 1",
        "contexts": ["context number 1"],
        # "reference_answers": ["The right answer", "The wrong answer"],
    },
]

metric_classes = {
    "correctness": LlamaIndexCorrectness,
    "faithfulness": LlamaIndexFaithfulness,
}

for metric_name, metric_class in metric_classes.items():
    metric = metric_class(model_name=model_name)

    score_name = f"{metric_name}_llama_index_by_{model_name_normalized}_judge"

    instance_targets = [
        {
            "score": 1.0,
            "score_name": score_name,
            score_name: 1.0,
            # "feedback": "The generated answer is fully correct and relevant to the user query, matching the reference answer exactly.",
        }
    ] * len(predictions)

    global_target = {
        "score": 1.0,
        "score_name": score_name,
        score_name: 1.0,
    }

    outputs = test_metric(
        metric=metric,
        predictions=predictions,
        references=references,
        task_data=task_data,
        instance_targets=instance_targets,
        global_target=global_target,
    )

    # GPT model to catalog
    model_names = ["gpt-3.5-turbo", "mock"]
    for model_name in model_names:
        model_name_normalized = model_name.replace(".", "_").replace("-", "_")

        metric = (
            metric_class(model_name=model_name, data_classification_policy=["public"])
            if model_name != "mock"
            else metric_class(model_name=model_name)
        )

        add_to_catalog(
            metric,
            f"metrics.rag.{metric_name}.llama_index_by_{model_name_normalized}",
            overwrite=True,
        )
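Once this script has run, the registered entries (shown as JSON below) can be pulled back from the local catalog. A minimal sketch, assuming unitxt's fetch_artifact helper and its tuple return value — both are assumptions about the library version, not part of this commit:

from unitxt.artifact import fetch_artifact  # assumption: helper exposed at this path in this unitxt version

# Assumed to return the instantiated metric plus catalog bookkeeping; only the metric is used here.
metric, _ = fetch_artifact("metrics.rag.faithfulness.llama_index_by_mock")
print(type(metric).__name__)  # expected: LlamaIndexFaithfulness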
@@ -1,4 +1,7 @@
{
    "__type__": "llama_index_correctness",
-    "model_name": "gpt-3.5-turbo"
+    "model_name": "gpt-3.5-turbo",
+    "data_classification_policy": [
+        "public"
+    ]
}
@@ -0,0 +1,7 @@
{
    "__type__": "llama_index_faithfulness",
    "model_name": "gpt-3.5-turbo",
    "data_classification_policy": [
        "public"
    ]
}
@@ -0,0 +1,4 @@
{
    "__type__": "llama_index_faithfulness",
    "model_name": "mock"
}
95 changes: 60 additions & 35 deletions src/unitxt/metrics.py
@@ -2134,9 +2134,7 @@ def compute(
        return self.pipe(predictions, batch_size=self.batch_size)


-class LlamaIndexCorrectness(InstanceMetric):
-    """LlamaIndex based metric class for evaluating correctness."""
-
+class LlamaIndexLLMMetric(InstanceMetric):
    model_name: str = ""
    main_score: str = ""
    prediction_type: str = "str"
@@ -2151,6 +2149,34 @@ class LlamaIndexCorrectness(InstanceMetric):

    _requirements_list: List[str] = ["llama_index"]

+    def prepare(self):
+        self.model_name_normalized = self.model_name.replace(".", "_").replace("-", "_")
+        self.main_score: str = f"llama_index_by_{self.model_name_normalized}_judge"
+
+        self.reduction_map: Dict[str, List[str]] = {"mean": [self.main_score]}
+
+        if self.model_name in self.openai_models:
+            from llama_index.llms.openai import OpenAI
+
+            self.llm = OpenAI("gpt-3.5-turbo")
+        elif self.model_name in self.mock_models:
+            from llama_index.core.llms.mock import MockLLM
+
+            self.llm = MockLLM(system_prompt="5")  # perfect score
+        else:
+            raise NotImplementedError(
+                f"LlamaIndexLLM metric does not support {self.model_name}, currently only gpt-3.5-turbo is supported"
+            )
+
+    def _model_using_extrnal_api(self):
+        return self.model_name in self.external_api_models
+
+
+class LlamaIndexCorrectness(LlamaIndexLLMMetric):
+    """LlamaIndex based metric class for evaluating correctness."""
+
+    score_prefix = "correctness_"
+
    @staticmethod
    def _custom_parser(eval_response: str):
        """Default parser function for evaluation response.
@@ -2174,37 +2200,14 @@ def _custom_parser(eval_response: str):
        reasoning = reasoning_str.lstrip("\n")
        return score, reasoning

-    def _model_using_extrnal_api(self):
-        return self.model_name in self.external_api_models
-
    def prepare(self):
        """Initialization method for the metric. Initializes the CorrectnessEvaluator with the OpenAI model."""
        super().prepare()

-        self.model_name_normalized = self.model_name.replace(".", "_").replace("-", "_")
-        self.main_score: str = (
-            f"correctness_llama_index_by_{self.model_name_normalized}_judge"
-        )
-
-        self.reduction_map: Dict[str, List[str]] = {"mean": [self.main_score]}
-
        from llama_index.core.evaluation import CorrectnessEvaluator

-        if self.model_name in self.openai_models:
-            from llama_index.llms.openai import OpenAI
-
-            llm = OpenAI("gpt-3.5-turbo")
-        elif self.model_name in self.mock_models:
-            from llama_index.core.llms.mock import MockLLM
-
-            llm = MockLLM(system_prompt="5")  # perfect score
-        else:
-            raise NotImplementedError(
-                f"LlamaIndexCorrectnessMetric does not support {self.model_name}, currently only gpt-3.5-turbo is supported"
-            )
-
        self.evaluator = CorrectnessEvaluator(
-            llm=llm, parser_function=self._custom_parser
+            llm=self.llm, parser_function=self._custom_parser
        )

    def compute(
@@ -2226,9 +2229,6 @@ def compute(
        Raises:
            AssertionError: If the input does not meet the expected format.
        """
-        # treat the references as the questions and the predictions as answers
-        # assume a single reference
-
        query = task_data["question"]

        contexts = None
@@ -2247,11 +2247,36 @@
        )
        result = max([results.score for results in per_reference_results])

-        return {
-            self.main_score: result / 5,
-            # "score_name": self.main_score,
-            # "feedback": result.feedback, # removed since this cannot be tested
-        }
+        return {self.main_score: result / 5}


+class LlamaIndexFaithfulness(LlamaIndexLLMMetric):
+    """LlamaIndex based metric class for evaluating faithfulness."""
+
+    score_prefix = "faithfulness_"
+
+    def prepare(self):
+        """Initialization method for the metric. Initializes the FaithfulnessEvaluator with the OpenAI model."""
+        super().prepare()
+
+        from llama_index.core.evaluation import FaithfulnessEvaluator
+
+        self.evaluator = FaithfulnessEvaluator(llm=self.llm)
+
+    def compute(
+        self,
+        references: List[str],
+        prediction: str,
+        task_data: Dict,
+    ) -> Dict[str, Any]:
+        result = self.evaluator.evaluate(
+            query=task_data["question"],
+            response=prediction,
+            contexts=task_data["contexts"],
+        )
+        score = result.score
+
+        return {self.main_score: score}


class Perplexity(BulkInstanceMetric):
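For reference, a minimal sketch (not part of the diff) of exercising the new faithfulness metric directly with the mock judge. The constructor argument, compute() signature, and task_data keys are taken from the changes above; calling prepare() explicitly is an assumption about driving the class outside the full unitxt pipeline, and the llama_index extra must be installed.

from unitxt.metrics import LlamaIndexFaithfulness

metric = LlamaIndexFaithfulness(model_name="mock")
metric.prepare()  # builds MockLLM(system_prompt="5") and the FaithfulnessEvaluator (the pipeline normally handles this)

scores = metric.compute(
    references=["The right answer"],  # not used by faithfulness; kept for the shared compute() signature
    prediction="The right answer",
    task_data={"question": "question number 1", "contexts": ["context number 1"]},
)
print(scores)  # expected, per the test targets above: {"llama_index_by_mock_judge": 1.0}
# The pipeline then applies score_prefix, reporting the score as "faithfulness_llama_index_by_mock_judge".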
