Add simple LLM as a judge example of using it without installation (#968)

* Add simple LLM as a judge example of using it without installation

* Exclude example that requires GENAI key from tests
eladven authored and gitMichal committed Jul 15, 2024
1 parent 0c7bed5 commit a5360ed
Showing 8 changed files with 129 additions and 77 deletions.
10 changes: 8 additions & 2 deletions docs/docs/examples.rst
@@ -48,13 +48,19 @@ Each example is a self contained python file that you can run and later modify.
- Description
- Link to code
- Related documentation
* - Evaluate your question-answering dataset
* - Evaluate an existing question-answering dataset from the Unitxt catalog
- Demonstrates how to evaluate an existing QA dataset (SQuAD) using the Huggingface
datasets and evaluate APIs, with no installation required, by using a predefined LLM as a judge metric.
- `code <https://github.com/IBM/unitxt/blob/main/examples/evaluate_dataset_by_llm_as_judge_no_install.py>`_
- | :ref:`Evaluating datasets <evaluating_datasets>`.
| :ref:`LLM as a Judge Metrics Guide <llm_as_judge>`.
* - Evaluate your question-answering dataset
- Demonstrates how to evaluate a user QA dataset in a standalone file using a user-defined task and template. In addition, it shows how to define an LLM as a judge metric, specify the template it uses to produce the input to the judge, and select the judge model and platform.
- `code <https://github.com/IBM/unitxt/blob/main/examples/standalone_evaluation_llm_as_judge.py>`_
- | :ref:`LLM as a Judge Metrics Guide <llm_as_judge>`.
* - Evaluate an existing summarization dataset from the catalog with LLM as judge
- Demonstrates how to evaluate a document summarization dataset by defining an LLM as a judge metric, specifying the template it uses to produce the input to the judge, and selecting the judge model and platform.
- `code <https://github.com/IBM/unitxt/blob/main/examples/evaluation_summarization_dataset_llm_as_judge>`_
- `code <https://github.com/IBM/unitxt/blob/main/examples/evaluation_summarization_dataset_llm_as_judge.py>`_
- | :ref:`LLM as a Judge Metrics Guide <llm_as_judge>`.


43 changes: 43 additions & 0 deletions examples/evaluate_dataset_by_llm_as_judge_no_install.py
@@ -0,0 +1,43 @@
from datasets import load_dataset
from unitxt import get_logger, get_settings
from unitxt.api import evaluate
from unitxt.inference import (
HFPipelineBasedInferenceEngine,
)
from unitxt.text_utils import print_dict

logger = get_logger()
settings = get_settings()
settings.allow_unverified_code = True

# Use the HF load_dataset API to load the SQuAD QA dataset with the standard template from the catalog.
# We set loader_limit to 20 to reduce download time.
test_dataset = load_dataset(
"unitxt/data",
"card=cards.squad,template=templates.qa.with_context.simple,metrics=[metrics.llm_as_judge.rating.llama_3_70b_instruct_ibm_genai_template_generic_single_turn],loader_limit=20",
trust_remote_code=True,
split="test",
)

# Run inference with a model to get predictions.
model_name = "google/flan-t5-base"
inference_model = HFPipelineBasedInferenceEngine(
model_name=model_name, max_new_tokens=32
)
predictions = inference_model.infer(test_dataset)

# Evaluate the predictions using the defined metric.
evaluated_dataset = evaluate(predictions=predictions, data=test_dataset)

# Print results
for instance in evaluated_dataset:
print_dict(
instance,
keys_to_print=[
"source",
"prediction",
"processed_prediction",
"references",
"score",
],
)
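
As a side note on reading the results: below is a rough sketch (not part of the commit) of pulling the aggregated judge score out of the evaluated dataset. It continues the script above and assumes unitxt's usual score layout, in which each evaluated instance carries a "score" dict with "global" (aggregated) and "instance" (per-example) entries:

# Sketch only -- assumes unitxt's usual score layout, where every evaluated
# instance holds a "score" dict with "global" and "instance" sub-dicts.
global_scores = evaluated_dataset[0]["score"]["global"]
print("Aggregated judge score:", global_scores["score"])

for instance in evaluated_dataset:
    # Per-instance judge rating produced by the LLM-as-judge metric.
    print(instance["score"]["instance"]["score"])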
75 changes: 0 additions & 75 deletions examples/use_llm_as_judge_metric.py

This file was deleted.

@@ -0,0 +1,31 @@
from unitxt import add_to_catalog
from unitxt.inference import (
IbmGenAiInferenceEngine,
IbmGenAiInferenceEngineParams,
)
from unitxt.llm_as_judge import LLMAsJudge

model = "meta-llama/llama-3-70b-instruct"
format = "formats.llama3_chat"
template = "templates.response_assessment.rating.generic_single_turn"
task = "rating.single_turn"

gen_params = IbmGenAiInferenceEngineParams(max_new_tokens=252)
inference_model = IbmGenAiInferenceEngine(model_name=model, parameters=gen_params)
model_label = model.split("/")[1].replace("-", "_").replace(".", ",").lower()
model_label = f"{model_label}_ibm_genai"
template_label = template.split(".")[-1]
metric_label = f"{model_label}_template_{template_label}"
metric = LLMAsJudge(
inference_model=inference_model,
template=template,
task=task,
format=format,
main_score=metric_label,
)

add_to_catalog(
metric,
f"metrics.llm_as_judge.rating.{model_label}_template_{template_label}",
overwrite=True,
)
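
Once registered, the metric is addressable by the catalog name built above; the prepare script derives the label from the model and template names precisely so that the catalog path doubles as the metric identifier in recipes. A short sketch of referencing it, mirroring the recipe string from the no-install example earlier in this commit:

from datasets import load_dataset

# Sketch only: reuses the catalog name registered above, mirroring the recipe
# string from examples/evaluate_dataset_by_llm_as_judge_no_install.py.
dataset = load_dataset(
    "unitxt/data",
    "card=cards.squad,template=templates.qa.with_context.simple,"
    "metrics=[metrics.llm_as_judge.rating.llama_3_70b_instruct_ibm_genai_template_generic_single_turn],"
    "loader_limit=20",
    trust_remote_code=True,
    split="test",
)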
@@ -0,0 +1,22 @@
from unitxt import add_to_catalog
from unitxt.templates import InputOutputTemplate

add_to_catalog(
InputOutputTemplate(
instruction="Please act as an impartial judge and evaluate the quality of the response provided"
" by an AI assistant to the user input displayed below. Your evaluation should consider"
" factors such as the helpfulness, relevance, accuracy, depth, creativity, and level of"
" detail of the response. Begin your evaluation by providing a short explanation. Be as"
" objective as possible. After providing your explanation, you must rate the response"
' on a scale of 1 to 10 by strictly following this format: "[[rating]]", for example:'
' "Rating: [[5]]".\n\n',
input_format="[User input]\n{question}\n\n"
"[Assistant's respond]\n{answer}\n[The End of Assistant's respond]",
output_format="[[{rating}]]",
postprocessors=[
r"processors.extract_mt_bench_rating_judgment",
],
),
"templates.response_assessment.rating.generic_single_turn",
overwrite=True,
)
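
The template instructs the judge to emit its verdict as "[[rating]]", and the listed postprocessor, processors.extract_mt_bench_rating_judgment, turns that string into a numeric score. Purely as an illustration (this is not the processor's actual implementation), the extraction amounts to something like:

import re

def extract_rating(judgment: str) -> float:
    # Illustrative sketch only -- not the real implementation of
    # processors.extract_mt_bench_rating_judgment. It pulls the number out of
    # judge output such as "The answer is concise and correct. Rating: [[8]]".
    match = re.search(r"\[\[(\d+(?:\.\d+)?)\]\]", judgment)
    return float(match.group(1)) if match else 0.0

print(extract_rating("The answer is concise and correct. Rating: [[8]]"))  # 8.0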
@@ -0,0 +1,15 @@
{
"__type__": "llm_as_judge",
"inference_model": {
"__type__": "ibm_gen_ai_inference_engine",
"model_name": "meta-llama/llama-3-70b-instruct",
"parameters": {
"__type__": "ibm_gen_ai_inference_engine_params",
"max_new_tokens": 252
}
},
"template": "templates.response_assessment.rating.generic_single_turn",
"task": "rating.single_turn",
"format": "formats.llama3_chat",
"main_score": "llama_3_70b_instruct_ibm_genai_template_generic_single_turn"
}
@@ -0,0 +1,9 @@
{
"__type__": "input_output_template",
"instruction": "Please act as an impartial judge and evaluate the quality of the response provided by an AI assistant to the user input displayed below. Your evaluation should consider factors such as the helpfulness, relevance, accuracy, depth, creativity, and level of detail of the response. Begin your evaluation by providing a short explanation. Be as objective as possible. After providing your explanation, you must rate the response on a scale of 1 to 10 by strictly following this format: \"[[rating]]\", for example: \"Rating: [[5]]\".\n\n",
"input_format": "[User input]\n{question}\n\n[Assistant's respond]\n{answer}\n[The End of Assistant's respond]",
"output_format": "[[{rating}]]",
"postprocessors": [
"processors.extract_mt_bench_rating_judgment"
]
}
1 change: 1 addition & 0 deletions tests/library/test_examples.py
@@ -36,6 +36,7 @@ def test_examples(self):
"standalone_evaluation_llm_as_judge.py",
"evaluation_summarization_dataset_llm_as_judge.py",
"evaluate_different_formats.py",
"evaluate_dataset_by_llm_as_judge_no_install.py",
]
for file in all_example_files:
logger.info(
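
The test body itself is truncated in this diff, so the following is only a hypothetical sketch of how such an exclusion list is typically applied: example scripts that need external credentials (here, an IBM GenAI key) are skipped rather than executed. The names and structure below are assumed, not taken from the repository.

from pathlib import Path

# Hypothetical sketch -- the real test body is truncated in this diff.
excluded = {
    "standalone_evaluation_llm_as_judge.py",
    "evaluation_summarization_dataset_llm_as_judge.py",
    "evaluate_different_formats.py",
    "evaluate_dataset_by_llm_as_judge_no_install.py",
}

for path in sorted(Path("examples").glob("*.py")):
    if path.name in excluded:
        continue  # requires API credentials that are not available in CI
    print(f"would run {path}")  # the real test executes each remaining example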
