share code between LlamaIndex metrics
Signed-off-by: Ariel Gera <ariel.gera1@ibm.com>
arielge committed Jun 30, 2024
1 parent 864647b commit 1794321
Showing 1 changed file with 37 additions and 54 deletions.
src/unitxt/metrics.py: 37 additions & 54 deletions
@@ -2134,11 +2134,10 @@ def compute(
         return self.pipe(predictions, batch_size=self.batch_size)


-class LlamaIndexCorrectness(InstanceMetric):
-    """LlamaIndex based metric class for evaluating correctness."""
-
+class LlamaIndexLLMMetric(InstanceMetric):
     model_name: str = ""
     main_score: str = ""
+    metric_name_prefix: str = ""
     prediction_type: str = "str"
     reduction_map: Dict[str, List[str]] = None
     openai_models: List[str] = ["gpt-3.5-turbo"]
@@ -2151,6 +2150,34 @@ class LlamaIndexCorrectness(InstanceMetric):

     _requirements_list: List[str] = ["llama_index"]

+    def prepare(self):
+        self.model_name_normalized = self.model_name.replace(".", "_").replace("-", "_")
+        self.main_score: str = f"{self.metric_name_prefix}_llama_index_by_{self.model_name_normalized}_judge"
+
+        self.reduction_map: Dict[str, List[str]] = {"mean": [self.main_score]}
+
+        if self.model_name in self.openai_models:
+            from llama_index.llms.openai import OpenAI
+
+            self.llm = OpenAI("gpt-3.5-turbo")
+        elif self.model_name in self.mock_models:
+            from llama_index.core.llms.mock import MockLLM
+
+            self.llm = MockLLM(system_prompt="5")  # perfect score
+        else:
+            raise NotImplementedError(
+                f"LlamaIndexLLM metric does not support {self.model_name}, currently only gpt-3.5-turbo is supported"
+            )
+
+    def _model_using_extrnal_api(self):
+        return self.model_name in self.external_api_models
+
+
+class LlamaIndexCorrectness(LlamaIndexLLMMetric):
+    """LlamaIndex based metric class for evaluating correctness."""
+
+    metric_name_prefix = "correctness"
+
     @staticmethod
     def _custom_parser(eval_response: str):
         """Default parser function for evaluation response.
@@ -2174,37 +2201,14 @@ def _custom_parser(eval_response: str):
         reasoning = reasoning_str.lstrip("\n")
         return score, reasoning

-    def _model_using_extrnal_api(self):
-        return self.model_name in self.external_api_models
-
     def prepare(self):
         """Initialization method for the metric. Initializes the CorrectnessEvaluator with the OpenAI model."""
         super().prepare()

-        self.model_name_normalized = self.model_name.replace(".", "_").replace("-", "_")
-        self.main_score: str = (
-            f"correctness_llama_index_by_{self.model_name_normalized}_judge"
-        )
-
-        self.reduction_map: Dict[str, List[str]] = {"mean": [self.main_score]}
-
         from llama_index.core.evaluation import CorrectnessEvaluator

-        if self.model_name in self.openai_models:
-            from llama_index.llms.openai import OpenAI
-
-            llm = OpenAI("gpt-3.5-turbo")
-        elif self.model_name in self.mock_models:
-            from llama_index.core.llms.mock import MockLLM
-
-            llm = MockLLM(system_prompt="5")  # perfect score
-        else:
-            raise NotImplementedError(
-                f"LlamaIndexCorrectness metric does not support {self.model_name}, currently only gpt-3.5-turbo is supported"
-            )
-
         self.evaluator = CorrectnessEvaluator(
-            llm=llm, parser_function=self._custom_parser
+            llm=self.llm, parser_function=self._custom_parser
         )

     def compute(
@@ -2244,43 +2248,22 @@ def compute(
         )
         result = max([results.score for results in per_reference_results])

-        return {
-            self.main_score: result / 5,
-            # "score_name": self.main_score,
-            # "feedback": result.feedback, # removed since this cannot be tested
-        }
+        return {self.main_score: result / 5}


-class LlamaIndexFaithfulness(LlamaIndexCorrectness):
+class LlamaIndexFaithfulness(LlamaIndexLLMMetric):
     """LlamaIndex based metric class for evaluating faithfulness."""

+    metric_name_prefix = "faithfulness"
     reference_field = "contexts"  # metric doesn't require reference answers

     def prepare(self):
         """Initialization method for the metric. Initializes the FaithfulnessEvaluator with the OpenAI model."""
-        from llama_index.core.evaluation import FaithfulnessEvaluator
-
-        self.model_name_normalized = self.model_name.replace(".", "_").replace("-", "_")
-        self.main_score: str = (
-            f"faithfulness_llama_index_by_{self.model_name_normalized}_judge"
-        )
-
-        self.reduction_map: Dict[str, List[str]] = {"mean": [self.main_score]}
-
-        if self.model_name in self.openai_models:
-            from llama_index.llms.openai import OpenAI
-
-            llm = OpenAI("gpt-3.5-turbo")
-        elif self.model_name in self.mock_models:
-            from llama_index.core.llms.mock import MockLLM
+        super().prepare()

-            llm = MockLLM(system_prompt="5")  # perfect score
-        else:
-            raise NotImplementedError(
-                f"LlamaIndexFaithfulness metric does not support {self.model_name}, currently only gpt-3.5-turbo is supported"
-            )
+        from llama_index.core.evaluation import FaithfulnessEvaluator

-        self.evaluator = FaithfulnessEvaluator(llm=llm)
+        self.evaluator = FaithfulnessEvaluator(llm=self.llm)

     def compute(
         self,
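For readers skimming the diff, here is a minimal standalone sketch (not part of the commit) of the sharing pattern it introduces: LlamaIndexLLMMetric.prepare() derives main_score and reduction_map from metric_name_prefix and the normalized model_name, so a concrete metric such as LlamaIndexCorrectness only declares its prefix and builds its evaluator. The "Sketch" class names and the plain-class base are illustrative stand-ins for unitxt's InstanceMetric machinery and the LLM setup, not the library's API.

# Standalone sketch of the code-sharing pattern; names ending in "Sketch"
# are hypothetical and plain classes stand in for unitxt's InstanceMetric.
from typing import Dict, List


class LlamaIndexLLMMetricSketch:
    model_name: str = "mock"
    metric_name_prefix: str = ""
    openai_models: List[str] = ["gpt-3.5-turbo"]
    mock_models: List[str] = ["mock"]

    def prepare(self):
        # Shared logic mirrored from the new base class: build the score name
        # and reduction map from the metric prefix and normalized model name.
        model_name_normalized = self.model_name.replace(".", "_").replace("-", "_")
        self.main_score = (
            f"{self.metric_name_prefix}_llama_index_by_{model_name_normalized}_judge"
        )
        self.reduction_map: Dict[str, List[str]] = {"mean": [self.main_score]}


class CorrectnessSketch(LlamaIndexLLMMetricSketch):
    # A concrete metric now only picks its prefix; in unitxt it would also
    # build its LlamaIndex evaluator on top of the shared self.llm.
    metric_name_prefix = "correctness"


metric = CorrectnessSketch()
metric.prepare()
print(metric.main_score)  # correctness_llama_index_by_mock_judge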
