Add simple LLM as a judge example, showing how to use it without installation
eladven committed Jul 1, 2024
1 parent 87c3802 commit af92b20
Showing 8 changed files with 212 additions and 77 deletions.
10 changes: 8 additions & 2 deletions docs/docs/examples.rst
@@ -48,13 +48,19 @@ Each example is a self contained python file that you can run and later modify.
- Description
- Link to code
- Related documentation
* - Evaluate your question-answering dataset
* - Evaluate an existing question-answering dataset from the Unitxt catalog
- Demonstrates how to evaluate an existing QA dataset (SQuAD) using the HuggingFace
datasets and evaluate APIs, with no installation required, by using a predefined LLM as a judge metric.
- `code <https://github.com/IBM/unitxt/blob/main/examples/evaluate_dataset_by_llm_as_judge_no_install.py>`_
- | :ref:`Evaluating datasets <evaluating_datasets>`.
| :ref:`LLM as a Judge Metrics Guide <llm_as_judge>`.
* - Evaluate your question-answering dataset
- Demonstrates how to evaluate a user-provided QA dataset in a standalone file, using a user-defined task and template. In addition, it shows how to define an LLM as a judge metric, specify the template it uses to produce the input to the judge, and select the judge model and platform.
- `code <https://github.com/IBM/unitxt/blob/main/examples/standalone_evaluation_llm_as_judge.py>`_
- | :ref:`LLM as a Judge Metrics Guide <llm_as_judge>`.
* - Evaluate an existing summarization dataset from the catalog with LLM as judge
- Demonstrates how to evaluate a document summarization dataset by defining an LLM as a judge metric, specifying the template it uses to produce the input to the judge, and selecting the judge model and platform.
- `code <https://github.com/IBM/unitxt/blob/main/examples/evaluation_summarization_dataset_llm_as_judge>`_
- `code <https://github.com/IBM/unitxt/blob/main/examples/evaluation_summarization_dataset_llm_as_judge.py>`_
- | :ref:`LLM as a Judge Metrics Guide <llm_as_judge>`.


43 changes: 43 additions & 0 deletions examples/evaluate_dataset_by_llm_as_judge_no_install.py
@@ -0,0 +1,43 @@
from datasets import load_dataset
from unitxt import get_logger, get_settings
from unitxt.api import evaluate
from unitxt.inference import (
HFPipelineBasedInferenceEngine,
)
from unitxt.text_utils import print_dict

logger = get_logger()
settings = get_settings()
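# Permit loading assets that unitxt cannot verify (an assumed explanation: the squad card
# loads a HF dataset script, hence also the trust_remote_code flag below).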
settings.allow_unverified_code = True

# Use the HF load_dataset API to load the SQuAD QA dataset with a standard QA template from the catalog.
# We set loader_limit to 20 to reduce download time.
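# The metrics field in the recipe string replaces the default metrics with a predefined
# LLM as a judge metric from the catalog (a Llama 3 70B judge served via IBM GenAI),
# so no metric needs to be defined locally.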
test_dataset = load_dataset(
"unitxt/data",
"card=cards.squad,template=templates.qa.with_context.simple,metrics=[metrics.llm_as_judge.rating.llama_3_70b_instruct_ibm_genai_template_generic_single_turn],loader_limit=20",
trust_remote_code=True,
split="test",
)

# Run inference with an HF model to get predictions.
model_name = "google/flan-t5-base"
inference_model = HFPipelineBasedInferenceEngine(
model_name=model_name, max_new_tokens=32
)
predictions = inference_model.infer(test_dataset)

# Evaluate the predictions using the defined metric.
evaluated_dataset = evaluate(predictions=predictions, data=test_dataset)

# Print results
for instance in evaluated_dataset:
    print_dict(
        instance,
        keys_to_print=[
            "source",
            "prediction",
            "processed_prediction",
            "references",
            "score",
        ],
    )
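Besides the per-instance printout, the aggregate judge score can be read from the global score dict attached to each evaluated instance; a minimal sketch, assuming the standard structure of unitxt's evaluation output:

# All instances carry the same global scores, so reading them from the first instance is enough.
global_scores = evaluated_dataset[0]["score"]["global"]
print(global_scores["score_name"], global_scores["score"])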
@@ -0,0 +1,84 @@
from unitxt import get_logger
from unitxt.api import evaluate, load_dataset
from unitxt.inference import (
HFPipelineBasedInferenceEngine,
IbmGenAiInferenceEngine,
IbmGenAiInferenceEngineParams,
)
from unitxt.llm_as_judge import LLMAsJudge
from unitxt.templates import InputOutputTemplate
from unitxt.text_utils import print_dict

logger = get_logger()
# First, we define the judge template.
judge_summary_rating_template = InputOutputTemplate(
instruction="Please act as an impartial judge and evaluate if the assistant's summary summarise well the given text.\n"
'You must respond according the following format: "[[rate]] - explanation".\n'
'Were the rate is a score between 0 to 10 (10 for great summary, 0 for a very poor one)".\n'
"The explanation describe shortly why you decided to give the rank you chosen.\n"
"Please make sure to start with your rank ([[rank]]) before anything else.\n"
"For example: [[9]] The summary catches the main text ideas."
".\n\n",
input_format="[Text:\n{model_input}\n\n" "Assistant's summary:\n{model_output}\n",
output_format="[[{rating}]]",
postprocessors=[
r"processors.extract_mt_bench_rating_judgment",
],
)

# Second, we define the inference engine used for the judge, with the preferred model and platform.
# To run the judge locally with a HuggingFace pipeline, uncomment these lines instead:
# platform = "hf"
# model_name = "google/flan-t5-large"
# inference_model = HFPipelineBasedInferenceEngine(
#     model_name=model_name, max_new_tokens=256, use_fp16=True
# )
#
# The configuration below runs the judge through the IBM GenAI APIs:
platform = "ibm_gen_ai"
model_name = "meta-llama/llama-3-70b-instruct"
gen_params = IbmGenAiInferenceEngineParams(max_new_tokens=512)
inference_model = IbmGenAiInferenceEngine(
model_name=model_name, parameters=gen_params
)
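# Note: running the IBM GenAI engine requires valid credentials, typically supplied through the
# GENAI_KEY environment variable (an assumption about the local setup, not shown in this example).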

# Third, we define the LLM as a judge metric, using the inference engine and judge template defined above.
llm_judge_metric = LLMAsJudge(
inference_model=inference_model,
template=judge_summary_rating_template,
task="rating.single_turn",
main_score=f"llm_judge_{model_name.split('/')[1].replace('-', '_')}_{platform}",
strip_system_prompt_and_format_from_inputs=False,
)
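# With the settings above, the resulting main score name is
# "llm_judge_llama_3_70b_instruct_ibm_gen_ai".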

# Load the XSUM summarization dataset from the catalog, with the LLM as a judge metric defined above.
dataset = load_dataset(
card="cards.xsum",
template="templates.summarization.abstractive.formal",
metrics=[llm_judge_metric],
loader_limit=20,
)

test_dataset = dataset["test"]

# Run inference with an HF model to get the summaries to be judged.
model_name = "google/flan-t5-base"
inference_model = HFPipelineBasedInferenceEngine(
model_name=model_name, max_new_tokens=32
)
predictions = inference_model.infer(test_dataset)

# Evaluate the predictions using the defined metric.
evaluated_dataset = evaluate(predictions=predictions, data=test_dataset)

# Print results
for instance in evaluated_dataset:
    print_dict(
        instance,
        keys_to_print=[
            "source",
            "prediction",
            "processed_prediction",
            "references",
            "score",
        ],
    )
75 changes: 0 additions & 75 deletions examples/use_llm_as_judge_metric.py

This file was deleted.

@@ -0,0 +1,31 @@
from unitxt import add_to_catalog
from unitxt.inference import (
IbmGenAiInferenceEngine,
IbmGenAiInferenceEngineParams,
)
from unitxt.llm_as_judge import LLMAsJudge

model = "meta-llama/llama-3-70b-instruct"
format = "formats.llama3_chat"
template = "templates.response_assessment.rating.generic_single_turn"
task = "rating.single_turn"

gen_params = IbmGenAiInferenceEngineParams(max_new_tokens=252)
inference_model = IbmGenAiInferenceEngine(model_name=model, parameters=gen_params)
model_label = model.split("/")[1].replace("-", "_").replace(".", ",").lower()
model_label = f"{model_label}_ibm_genai"
template_label = template.split(".")[-1]
metric_label = f"{model_label}_template_{template_label}"
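# For this model and template, metric_label resolves to
# "llama_3_70b_instruct_ibm_genai_template_generic_single_turn".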
metric = LLMAsJudge(
inference_model=inference_model,
template=template,
task=task,
format=format,
main_score=metric_label,
)

add_to_catalog(
metric,
f"metrics.llm_as_judge.rating.{model_label}_template_{template_label}",
overwrite=True,
)
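Once registered, the judge metric is addressed purely by its catalog name, which is how the no-install example above picks it up; a minimal sketch of that usage, mirroring the recipe string from the example:

from datasets import load_dataset

# The metrics field references the catalog entry added above; no local metric code is needed.
test_dataset = load_dataset(
    "unitxt/data",
    "card=cards.squad,template=templates.qa.with_context.simple,"
    "metrics=[metrics.llm_as_judge.rating.llama_3_70b_instruct_ibm_genai_template_generic_single_turn],"
    "loader_limit=20",
    trust_remote_code=True,
    split="test",
)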
@@ -0,0 +1,22 @@
from unitxt import add_to_catalog
from unitxt.templates import InputOutputTemplate

add_to_catalog(
InputOutputTemplate(
instruction="Please act as an impartial judge and evaluate the quality of the response provided"
" by an AI assistant to the user input displayed below. Your evaluation should consider"
" factors such as the helpfulness, relevance, accuracy, depth, creativity, and level of"
" detail of the response. Begin your evaluation by providing a short explanation. Be as"
" objective as possible. After providing your explanation, you must rate the response"
' on a scale of 1 to 10 by strictly following this format: "[[rating]]", for example:'
' "Rating: [[5]]".\n\n',
input_format="[User input]\n{question}\n\n"
"[Assistant's respond]\n{answer}\n[The End of Assistant's respond]",
output_format="[[{rating}]]",
postprocessors=[
r"processors.extract_mt_bench_rating_judgment",
],
),
"templates.response_assessment.rating.generic_single_turn",
overwrite=True,
)
@@ -0,0 +1,15 @@
{
"__type__": "llm_as_judge",
"inference_model": {
"__type__": "ibm_gen_ai_inference_engine",
"model_name": "meta-llama/llama-3-70b-instruct",
"parameters": {
"__type__": "ibm_gen_ai_inference_engine_params",
"max_new_tokens": 252
}
},
"template": "templates.response_assessment.rating.generic_single_turn",
"task": "rating.single_turn",
"format": "formats.llama3_chat",
"main_score": "llama_3_70b_instruct_ibm_genai_template_generic_single_turn"
}
@@ -0,0 +1,9 @@
{
"__type__": "input_output_template",
"instruction": "Please act as an impartial judge and evaluate the quality of the response provided by an AI assistant to the user input displayed below. Your evaluation should consider factors such as the helpfulness, relevance, accuracy, depth, creativity, and level of detail of the response. Begin your evaluation by providing a short explanation. Be as objective as possible. After providing your explanation, you must rate the response on a scale of 1 to 10 by strictly following this format: \"[[rating]]\", for example: \"Rating: [[5]]\".\n\n",
"input_format": "[User input]\n{question}\n\n[Assistant's respond]\n{answer}\n[The End of Assistant's respond]",
"output_format": "[[{rating}]]",
"postprocessors": [
"processors.extract_mt_bench_rating_judgment"
]
}
