diff --git a/prepare/metrics/llama_index_correctness.py b/prepare/metrics/llama_index_correctness.py
deleted file mode 100644
index a81f2df3d..000000000
--- a/prepare/metrics/llama_index_correctness.py
+++ /dev/null
@@ -1,58 +0,0 @@
-from unitxt import add_to_catalog
-from unitxt.metrics import (
-    LlamaIndexCorrectness,
-)
-from unitxt.test_utils.metrics import test_metric
-
-# Test with mock
-model_name = "mock"
-model_name_normalized = model_name.replace(".", "_").replace("-", "_")
-metric = LlamaIndexCorrectness(model_name=model_name)
-
-predictions = ["The right answer"]
-references = [["The right answer", "The wrong answer"]]
-task_data = [
-    {
-        "question": "question number 1",
-        "contexts": "['context number 1']",
-        # "reference_answers": ["The right answer", "The wrong answer"],
-    },
-]
-
-score_name = f"correctness_llama_index_by_{model_name_normalized}_judge"
-
-instance_targets = [  # nDCG is undefined at instance level
-    {
-        "score": 1.0,
-        "score_name": score_name,
-        score_name: 1.0,
-        # "feedback": "The generated answer is fully correct and relevant to the user query, matching the reference answer exactly.",
-    }
-] * len(predictions)
-
-global_target = {
-    "score": 1.0,
-    "score_name": score_name,
-    score_name: 1.0,
-}
-
-outputs = test_metric(
-    metric=metric,
-    predictions=predictions,
-    references=references,
-    task_data=task_data,
-    instance_targets=instance_targets,
-    global_target=global_target,
-)
-
-# GPT model to catalog
-model_names = ["gpt-3.5-turbo", "mock"]
-for model_name in model_names:
-    model_name_normalized = model_name.replace(".", "_").replace("-", "_")
-    metric = LlamaIndexCorrectness(model_name=model_name)
-
-    add_to_catalog(
-        metric,
-        f"metrics.rag.correctness.llama_index_by_{model_name_normalized}",
-        overwrite=True,
-    )
diff --git a/prepare/metrics/llama_index_metrics.py b/prepare/metrics/llama_index_metrics.py
new file mode 100644
index 000000000..ac4f46ec0
--- /dev/null
+++ b/prepare/metrics/llama_index_metrics.py
@@ -0,0 +1,68 @@
+from unitxt import add_to_catalog
+from unitxt.metrics import LlamaIndexCorrectness, LlamaIndexFaithfulness
+from unitxt.test_utils.metrics import test_metric
+
+# Test with mock
+model_name = "mock"
+model_name_normalized = model_name.replace(".", "_").replace("-", "_")
+
+predictions = ["The right answer"]
+references = [["The right answer", "The wrong answer"]]
+task_data = [
+    {
+        "question": "question number 1",
+        "contexts": ["context number 1"],
+        # "reference_answers": ["The right answer", "The wrong answer"],
+    },
+]
+
+metric_classes = {
+    "correctness": LlamaIndexCorrectness,
+    "faithfulness": LlamaIndexFaithfulness,
+}
+
+for metric_name, metric_class in metric_classes.items():
+    metric = metric_class(model_name=model_name)
+
+    score_name = f"{metric_name}_llama_index_by_{model_name_normalized}_judge"
+
+    instance_targets = [
+        {
+            "score": 1.0,
+            "score_name": score_name,
+            score_name: 1.0,
+            # "feedback": "The generated answer is fully correct and relevant to the user query, matching the reference answer exactly.",
+        }
+    ] * len(predictions)
+
+    global_target = {
+        "score": 1.0,
+        "score_name": score_name,
+        score_name: 1.0,
+    }
+
+    outputs = test_metric(
+        metric=metric,
+        predictions=predictions,
+        references=references,
+        task_data=task_data,
+        instance_targets=instance_targets,
+        global_target=global_target,
+    )
+
+    # GPT model to catalog
+    model_names = ["gpt-3.5-turbo", "mock"]
+    for model_name in model_names:
+        model_name_normalized = model_name.replace(".", "_").replace("-", "_")
+
+        metric = (
+            metric_class(model_name=model_name, data_classification_policy=["public"])
+            if model_name != "mock"
+            else metric_class(model_name=model_name)
+        )
+
+        add_to_catalog(
+            metric,
+            f"metrics.rag.{metric_name}.llama_index_by_{model_name_normalized}",
+            overwrite=True,
+        )
diff --git a/src/unitxt/catalog/metrics/rag/correctness/llama_index_by_gpt_3_5_turbo.json b/src/unitxt/catalog/metrics/rag/correctness/llama_index_by_gpt_3_5_turbo.json
index c62c3aa5e..0cf813615 100644
--- a/src/unitxt/catalog/metrics/rag/correctness/llama_index_by_gpt_3_5_turbo.json
+++ b/src/unitxt/catalog/metrics/rag/correctness/llama_index_by_gpt_3_5_turbo.json
@@ -1,4 +1,7 @@
 {
     "__type__": "llama_index_correctness",
-    "model_name": "gpt-3.5-turbo"
+    "model_name": "gpt-3.5-turbo",
+    "data_classification_policy": [
+        "public"
+    ]
 }
diff --git a/src/unitxt/catalog/metrics/rag/faithfulness/llama_index_by_gpt_3_5_turbo.json b/src/unitxt/catalog/metrics/rag/faithfulness/llama_index_by_gpt_3_5_turbo.json
new file mode 100644
index 000000000..db8ea0601
--- /dev/null
+++ b/src/unitxt/catalog/metrics/rag/faithfulness/llama_index_by_gpt_3_5_turbo.json
@@ -0,0 +1,7 @@
+{
+    "__type__": "llama_index_faithfulness",
+    "model_name": "gpt-3.5-turbo",
+    "data_classification_policy": [
+        "public"
+    ]
+}
diff --git a/src/unitxt/catalog/metrics/rag/faithfulness/llama_index_by_mock.json b/src/unitxt/catalog/metrics/rag/faithfulness/llama_index_by_mock.json
new file mode 100644
index 000000000..7247e543d
--- /dev/null
+++ b/src/unitxt/catalog/metrics/rag/faithfulness/llama_index_by_mock.json
@@ -0,0 +1,4 @@
+{
+    "__type__": "llama_index_faithfulness",
+    "model_name": "mock"
+}
diff --git a/src/unitxt/metrics.py b/src/unitxt/metrics.py
index c6768a6e3..8a41bb85d 100644
--- a/src/unitxt/metrics.py
+++ b/src/unitxt/metrics.py
@@ -2134,9 +2134,7 @@ def compute(
         return self.pipe(predictions, batch_size=self.batch_size)
 
 
-class LlamaIndexCorrectness(InstanceMetric):
-    """LlamaIndex based metric class for evaluating correctness."""
-
+class LlamaIndexLLMMetric(InstanceMetric):
     model_name: str = ""
     main_score: str = ""
     prediction_type: str = "str"
@@ -2151,6 +2149,34 @@ class LlamaIndexCorrectness(InstanceMetric):
 
     _requirements_list: List[str] = ["llama_index"]
 
+    def prepare(self):
+        self.model_name_normalized = self.model_name.replace(".", "_").replace("-", "_")
+        self.main_score: str = f"llama_index_by_{self.model_name_normalized}_judge"
+
+        self.reduction_map: Dict[str, List[str]] = {"mean": [self.main_score]}
+
+        if self.model_name in self.openai_models:
+            from llama_index.llms.openai import OpenAI
+
+            self.llm = OpenAI("gpt-3.5-turbo")
+        elif self.model_name in self.mock_models:
+            from llama_index.core.llms.mock import MockLLM
+
+            self.llm = MockLLM(system_prompt="5")  # perfect score
+        else:
+            raise NotImplementedError(
+                f"LlamaIndexLLM metric does not support {self.model_name}, currently only gpt-3.5-turbo is supported"
+            )
+
+    def _model_using_extrnal_api(self):
+        return self.model_name in self.external_api_models
+
+
+class LlamaIndexCorrectness(LlamaIndexLLMMetric):
+    """LlamaIndex based metric class for evaluating correctness."""
+
+    score_prefix = "correctness_"
+
     @staticmethod
     def _custom_parser(eval_response: str):
         """Default parser function for evaluation response.
@@ -2174,37 +2200,14 @@ def _custom_parser(eval_response: str):
         reasoning = reasoning_str.lstrip("\n")
         return score, reasoning
 
-    def _model_using_extrnal_api(self):
-        return self.model_name in self.external_api_models
-
     def prepare(self):
         """Initialization method for the metric. Initializes the CorrectnessEvaluator with the OpenAI model."""
         super().prepare()
 
-        self.model_name_normalized = self.model_name.replace(".", "_").replace("-", "_")
-        self.main_score: str = (
-            f"correctness_llama_index_by_{self.model_name_normalized}_judge"
-        )
-
-        self.reduction_map: Dict[str, List[str]] = {"mean": [self.main_score]}
-
         from llama_index.core.evaluation import CorrectnessEvaluator
 
-        if self.model_name in self.openai_models:
-            from llama_index.llms.openai import OpenAI
-
-            llm = OpenAI("gpt-3.5-turbo")
-        elif self.model_name in self.mock_models:
-            from llama_index.core.llms.mock import MockLLM
-
-            llm = MockLLM(system_prompt="5")  # perfect score
-        else:
-            raise NotImplementedError(
-                f"LlamaIndexCorrectnessMetric does not support {self.model_name}, currently only gpt-3.5-turbo is supported"
-            )
-
         self.evaluator = CorrectnessEvaluator(
-            llm=llm, parser_function=self._custom_parser
+            llm=self.llm, parser_function=self._custom_parser
         )
 
 
@@ -2226,9 +2229,6 @@ def compute(
         Raises:
             AssertionError: If the input does not meet the expected format.
         """
-        # treat the references as the questions and the predictions as answers
-        # assume a single reference
-
         query = task_data["question"]
 
         contexts = None
@@ -2247,11 +2247,36 @@ def compute(
         )
 
         result = max([results.score for results in per_reference_results])
-        return {
-            self.main_score: result / 5,
-            # "score_name": self.main_score,
-            # "feedback": result.feedback,  # removed since this cannot be tested
-        }
+        return {self.main_score: result / 5}
+
+
+class LlamaIndexFaithfulness(LlamaIndexLLMMetric):
+    """LlamaIndex based metric class for evaluating faithfulness."""
+
+    score_prefix = "faithfulness_"
+
+    def prepare(self):
+        """Initialization method for the metric. Initializes the FaithfulnessEvaluator with the OpenAI model."""
+        super().prepare()
+
+        from llama_index.core.evaluation import FaithfulnessEvaluator
+
+        self.evaluator = FaithfulnessEvaluator(llm=self.llm)
+
+    def compute(
+        self,
+        references: List[str],
+        prediction: str,
+        task_data: Dict,
+    ) -> Dict[str, Any]:
+        result = self.evaluator.evaluate(
+            query=task_data["question"],
+            response=prediction,
+            contexts=task_data["contexts"],
+        )
+        score = result.score
+
+        return {self.main_score: score}
 
 
 class Perplexity(BulkInstanceMetric):
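
A minimal usage sketch of the new faithfulness judge, mirroring the prepare script in this patch; it assumes the "mock" backend, which the base class maps to a MockLLM that always returns a perfect 5/5 score, so no API key is needed:

    from unitxt.metrics import LlamaIndexFaithfulness
    from unitxt.test_utils.metrics import test_metric

    # The "mock" model is resolved in LlamaIndexLLMMetric.prepare().
    metric = LlamaIndexFaithfulness(model_name="mock")

    predictions = ["The right answer"]
    references = [["The right answer", "The wrong answer"]]
    task_data = [{"question": "question number 1", "contexts": ["context number 1"]}]

    # With score_prefix = "faithfulness_", the reported score name is:
    score_name = "faithfulness_llama_index_by_mock_judge"
    target = {"score": 1.0, "score_name": score_name, score_name: 1.0}

    # test_metric checks both per-instance and global scores against the targets.
    test_metric(
        metric=metric,
        predictions=predictions,
        references=references,
        task_data=task_data,
        instance_targets=[target] * len(predictions),
        global_target=target,
    )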