share code between LlamaIndex metrics
Signed-off-by: Ariel Gera <ariel.gera1@ibm.com>
arielge committed Jun 30, 2024
1 parent 864647b commit 1794321
Showing 1 changed file with 37 additions and 54 deletions.
src/unitxt/metrics.py: 37 additions & 54 deletions
@@ -2134,11 +2134,10 @@ def compute(
         return self.pipe(predictions, batch_size=self.batch_size)


-class LlamaIndexCorrectness(InstanceMetric):
-    """LlamaIndex based metric class for evaluating correctness."""
-
+class LlamaIndexLLMMetric(InstanceMetric):
     model_name: str = ""
     main_score: str = ""
+    metric_name_prefix: str = ""
     prediction_type: str = "str"
     reduction_map: Dict[str, List[str]] = None
     openai_models: List[str] = ["gpt-3.5-turbo"]
@@ -2151,6 +2150,34 @@ class LlamaIndexCorrectness(InstanceMetric):

     _requirements_list: List[str] = ["llama_index"]

+    def prepare(self):
+        self.model_name_normalized = self.model_name.replace(".", "_").replace("-", "_")
+        self.main_score: str = f"{self.metric_name_prefix}_llama_index_by_{self.model_name_normalized}_judge"
+
+        self.reduction_map: Dict[str, List[str]] = {"mean": [self.main_score]}
+
+        if self.model_name in self.openai_models:
+            from llama_index.llms.openai import OpenAI
+
+            self.llm = OpenAI("gpt-3.5-turbo")
+        elif self.model_name in self.mock_models:
+            from llama_index.core.llms.mock import MockLLM
+
+            self.llm = MockLLM(system_prompt="5")  # perfect score
+        else:
+            raise NotImplementedError(
+                f"LlamaIndexLLM metric does not support {self.model_name}, currently only gpt-3.5-turbo is supported"
+            )
+
+    def _model_using_extrnal_api(self):
+        return self.model_name in self.external_api_models
+
+
+class LlamaIndexCorrectness(LlamaIndexLLMMetric):
+    """LlamaIndex based metric class for evaluating correctness."""
+
+    metric_name_prefix = "correctness"
+
     @staticmethod
     def _custom_parser(eval_response: str):
         """Default parser function for evaluation response.
@@ -2174,37 +2201,14 @@ def _custom_parser(eval_response: str):
         reasoning = reasoning_str.lstrip("\n")
         return score, reasoning

-    def _model_using_extrnal_api(self):
-        return self.model_name in self.external_api_models
-
     def prepare(self):
         """Initialization method for the metric. Initializes the CorrectnessEvaluator with the OpenAI model."""
         super().prepare()

-        self.model_name_normalized = self.model_name.replace(".", "_").replace("-", "_")
-        self.main_score: str = (
-            f"correctness_llama_index_by_{self.model_name_normalized}_judge"
-        )
-
-        self.reduction_map: Dict[str, List[str]] = {"mean": [self.main_score]}
-
         from llama_index.core.evaluation import CorrectnessEvaluator

-        if self.model_name in self.openai_models:
-            from llama_index.llms.openai import OpenAI
-
-            llm = OpenAI("gpt-3.5-turbo")
-        elif self.model_name in self.mock_models:
-            from llama_index.core.llms.mock import MockLLM
-
-            llm = MockLLM(system_prompt="5")  # perfect score
-        else:
-            raise NotImplementedError(
-                f"LlamaIndexCorrectness metric does not support {self.model_name}, currently only gpt-3.5-turbo is supported"
-            )
-
         self.evaluator = CorrectnessEvaluator(
-            llm=llm, parser_function=self._custom_parser
+            llm=self.llm, parser_function=self._custom_parser
         )

     def compute(
@@ -2244,43 +2248,22 @@ def compute(
         )
         result = max([results.score for results in per_reference_results])

-        return {
-            self.main_score: result / 5,
-            # "score_name": self.main_score,
-            # "feedback": result.feedback, # removed since this cannot be tested
-        }
+        return {self.main_score: result / 5}


-class LlamaIndexFaithfulness(LlamaIndexCorrectness):
+class LlamaIndexFaithfulness(LlamaIndexLLMMetric):
     """LlamaIndex based metric class for evaluating faithfulness."""

+    metric_name_prefix = "faithfulness"
     reference_field = "contexts"  # metric doesn't require reference answers

     def prepare(self):
         """Initialization method for the metric. Initializes the FaithfulnessEvaluator with the OpenAI model."""
-        from llama_index.core.evaluation import FaithfulnessEvaluator
-
-        self.model_name_normalized = self.model_name.replace(".", "_").replace("-", "_")
-        self.main_score: str = (
-            f"faithfulness_llama_index_by_{self.model_name_normalized}_judge"
-        )
-
-        self.reduction_map: Dict[str, List[str]] = {"mean": [self.main_score]}
-
-        if self.model_name in self.openai_models:
-            from llama_index.llms.openai import OpenAI
-
-            llm = OpenAI("gpt-3.5-turbo")
-        elif self.model_name in self.mock_models:
-            from llama_index.core.llms.mock import MockLLM
+        super().prepare()

-            llm = MockLLM(system_prompt="5")  # perfect score
-        else:
-            raise NotImplementedError(
-                f"LlamaIndexFaithfulness metric does not support {self.model_name}, currently only gpt-3.5-turbo is supported"
-            )
+        from llama_index.core.evaluation import FaithfulnessEvaluator

-        self.evaluator = FaithfulnessEvaluator(llm=llm)
+        self.evaluator = FaithfulnessEvaluator(llm=self.llm)

     def compute(
         self,
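For readers skimming the diff, here is a minimal standalone sketch (not part of the commit) of the sharing pattern it introduces: LlamaIndexLLMMetric.prepare() derives main_score and reduction_map from metric_name_prefix and the normalized model_name, so a concrete metric such as LlamaIndexCorrectness only declares its prefix and builds its evaluator. The "Sketch" class names and the plain-class base are illustrative stand-ins for unitxt's InstanceMetric machinery and the LLM setup, not the library's API.

# Standalone sketch of the code-sharing pattern; names ending in "Sketch"
# are hypothetical and plain classes stand in for unitxt's InstanceMetric.
from typing import Dict, List


class LlamaIndexLLMMetricSketch:
    model_name: str = "mock"
    metric_name_prefix: str = ""
    openai_models: List[str] = ["gpt-3.5-turbo"]
    mock_models: List[str] = ["mock"]

    def prepare(self):
        # Shared logic mirrored from the new base class: build the score name
        # and reduction map from the metric prefix and normalized model name.
        model_name_normalized = self.model_name.replace(".", "_").replace("-", "_")
        self.main_score = (
            f"{self.metric_name_prefix}_llama_index_by_{model_name_normalized}_judge"
        )
        self.reduction_map: Dict[str, List[str]] = {"mean": [self.main_score]}


class CorrectnessSketch(LlamaIndexLLMMetricSketch):
    # A concrete metric now only picks its prefix; in unitxt it would also
    # build its LlamaIndex evaluator on top of the shared self.llm.
    metric_name_prefix = "correctness"


metric = CorrectnessSketch()
metric.prepare()
print(metric.main_score)  # correctness_llama_index_by_mock_judge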
