Add example of using LLM as a judge for summarization dataset. (#965)
eladven committed Jun 30, 2024
1 parent b155523 commit 80243f5
Showing 3 changed files with 92 additions and 4 deletions.
5 changes: 5 additions & 0 deletions docs/docs/examples.rst
@@ -37,4 +37,9 @@ Each example is a self contained python file that you can run and later modify.
- Demonstrates how to evaluate a user QA answering dataset in a standalone file using a user defined task and template. In addition, it shows how to define an LLM as a judge metric, specify the template it uses to produce the input to the judge, and select the judge model and platform.
- `code <https://github.com/IBM/unitxt/blob/main/examples/standalone_evaluation_llm_as_judge>`_
- | :ref:`LLM as a Judge Metrics Guide <llm_as_judge>`.
* - Evaluate your summarization dataset - using LLM as a judge
- Demonstrates how to evaluate a document summarization dataset by defining an LLM as a judge metric, specifying the template it uses to produce the input to the judge, and selecting the judge model and platform.
- `code <https://github.com/IBM/unitxt/blob/main/examples/evaluation_summarization_dataset_llm_as_judge>`_
- | :ref:`LLM as a Judge Metrics Guide <llm_as_judge>`.


80 changes: 80 additions & 0 deletions examples/evaluation_summarization_dataset_llm_as_judge.py
@@ -0,0 +1,80 @@
from unitxt import get_logger
from unitxt.api import evaluate, load_dataset
from unitxt.inference import (
HFPipelineBasedInferenceEngine,
)
from unitxt.llm_as_judge import LLMAsJudge
from unitxt.templates import InputOutputTemplate
from unitxt.text_utils import print_dict

logger = get_logger()
# First, we define the judge template.
judge_summary_rating_template = InputOutputTemplate(
instruction="Please act as an impartial judge and evaluate if the assistant's summary summarise well the given text.\n"
'You must respond according the following format: "[[rate]] - explanation".\n'
'Were the rate is a score between 0 to 10 (10 for great summary, 0 for a very poor one)".\n'
"The explanation describe shortly why you decided to give the rank you chosen.\n"
"Please make sure to start with your rank ([[rank]]) before anything else.\n"
"For example: [[9]] The summary catches the main text ideas."
".\n\n",
input_format="[Text:\n{question}\n\n" "Assistant's summary:\n{answer}\n",
output_format="[[{rating}]]",
postprocessors=[
r"processors.extract_mt_bench_rating_judgment",
],
)
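# Illustrative only (an assumed rendering, not produced by running this script): with this
# template, the judge receives the instruction above followed by the filled-in input_format,
# roughly
#
#   Text:
#   <original document>
#
#   Assistant's summary:
#   <summary produced by the evaluated model>
#
# and is expected to answer e.g. "[[7]] - covers the key points but omits ...", from which
# the postprocessor extracts the numeric rating.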

# Second, we define the inference engine used by the judge, with the preferred model and platform.
platform = "hf"
model_name = "google/flan-t5-large"
inference_model = HFPipelineBasedInferenceEngine(
model_name=model_name, max_new_tokens=256, use_fp16=True
)
# To run the judge with the IbmGenAI APIs instead, change to:
#
# platform = 'ibm_gen_ai'
# model_name = 'meta-llama/llama-3-70b-instruct'
# gen_params = IbmGenAiInferenceEngineParams(max_new_tokens=512)
# inference_model = IbmGenAiInferenceEngine(model_name="meta-llama/llama-3-70b-instruct", parameters=gen_params)
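# (Assumption: the IbmGenAI variant would also need its engine classes imported, presumably
# from the same module as HFPipelineBasedInferenceEngine, e.g.
# from unitxt.inference import IbmGenAiInferenceEngine, IbmGenAiInferenceEngineParams.)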

# Third, we define the LLM-as-a-judge metric, with the desired platform and model.
llm_judge_metric = LLMAsJudge(
inference_model=inference_model,
template=judge_summary_rating_template,
task="rating.single_turn",
main_score=f"llm_judge_{model_name.split('/')[1].replace('-', '_')}_{platform}",
strip_system_prompt_and_format_from_inputs=False,
)
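# With the HF settings above ("google/flan-t5-large" on platform "hf"),
# main_score resolves to "llm_judge_flan_t5_large_hf".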

# Load the XSUM dataset with the summarization template and the judge metric defined above.
dataset = load_dataset(
card="cards.xsum",
template="templates.summarization.abstractive.formal",
metrics=[llm_judge_metric],
loader_limit=20,
)

test_dataset = dataset["test"]

# Run a summarization model over the test set to produce the predictions to be judged.
model_name = "google/flan-t5-base"
inference_model = HFPipelineBasedInferenceEngine(
model_name=model_name, max_new_tokens=32
)
predictions = inference_model.infer(test_dataset)

# Evaluate the predictions using the defined metric.
evaluated_dataset = evaluate(predictions=predictions, data=test_dataset)

# Print results
for instance in evaluated_dataset:
print_dict(
instance,
keys_to_print=[
"source",
"prediction",
"processed_prediction",
"references",
"score",
],
)
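The aggregated judge score can also be read off the evaluation result. A minimal sketch, assuming unitxt's usual result layout in which aggregate metrics are stored under each instance's "score" -> "global" entry:

# Assumption: every evaluated instance carries the same aggregate ("global") scores,
# including the main score defined by the LLMAsJudge metric above.
global_scores = evaluated_dataset[0]["score"]["global"]
logger.info(f"{global_scores['score_name']}: {global_scores['score']}")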
11 changes: 7 additions & 4 deletions src/unitxt/standard.py
@@ -117,12 +117,15 @@ def prepare_metrics_and_postprocessors(self):
         postprocessors = self.postprocessors
 
         if self.metrics is None:
-            metrics = [
-                metric if isinstance(metric, str) else metric.to_json()
-                for metric in self.card.task.metrics
-            ]
+            metrics = self.card.task.metrics
         else:
             metrics = self.metrics
 
+        metrics = [
+            metric if isinstance(metric, str) else metric.to_json()
+            for metric in metrics
+        ]
+
         return metrics, postprocessors

def set_pipelines(self):
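With this refactor, metrics supplied directly to the recipe (such as the LLMAsJudge object passed via metrics=[...] in the example above) go through the same serialization as metrics declared on the card's task: strings are kept as catalog names, metric objects are converted with to_json(). A minimal sketch of that behavior, reusing llm_judge_metric from the example and a catalog metric name for illustration:

# Strings stay as catalog identifiers; Metric objects are serialized to JSON.
metrics = ["metrics.rouge", llm_judge_metric]
serialized = [m if isinstance(m, str) else m.to_json() for m in metrics]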
