llama3 instruct and chat system prompts #950

Merged
14 commits merged on Jun 30, 2024
23 changes: 19 additions & 4 deletions docs/docs/examples.rst
@@ -7,7 +7,7 @@ Here you find complete examples showing how to perform different tasks using Unitxt.
Each example is a self-contained Python file that you can run and later modify.


.. list-table::
.. list-table:: Common Use Cases
:widths: 50 50 50 50
:header-rows: 1

@@ -33,11 +33,26 @@ Each example is a self-contained Python file that you can run and later modify.
- | :ref:`Add new dataset tutorial <adding_dataset>`.
| :ref:`Open QA task in catalog <catalog.tasks.qa.open>`.
| :ref:`Open QA template in catalog <catalog.templates.qa.open.title>`.
* - Evaluate your question-answering dataset - using LLM as a judge
* - Evaluate the impact of different formats and system prompts on the same task
- Demonstrates how different formats and system prompts affect the input provided to a llama3 chat model, and evaluates their impact on the obtained scores.
- `code <https://github.com/IBM/unitxt/blob/main/examples/evaluate_different_formats.py>`_
- | :ref:`Formatting tutorial <adding_format>`.



.. list-table:: LLM as a judge
:widths: 50 50 50 50
:header-rows: 1

* - What do you want to do?
- Description
- Link to code
- Related documentation
* - Evaluate your question-answering dataset
- Demonstrates how to evaluate a user question-answering dataset in a standalone file, using a user-defined task and template. In addition, it shows how to define an LLM as a judge metric, specify the template it uses to produce the input to the judge, and select the judge model and platform.
- `code <https://github.com/IBM/unitxt/blob/main/examples/standalone_evaluation_llm_as_judge>`_
- `code <https://github.com/IBM/unitxt/blob/main/examples/standalone_evaluation_llm_as_judge.py>`_
- | :ref:`LLM as a Judge Metrics Guide <llm_as_judge>`.
* - Evaluate your summarization dataset - using LLM as a judge
* - Evaluate an existing summarization dataset from the catalog with LLM as judge
- Demonstrates how to evaluate a document summarization dataset by defining an LLM as a judge metric, specifying the template it uses to produce the input to the judge, and selecting the judge model and platform.
- `code <https://github.com/IBM/unitxt/blob/main/examples/evaluation_summarization_dataset_llm_as_judge>`_
- | :ref:`LLM as a Judge Metrics Guide <llm_as_judge>`.
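For orientation, below is a minimal sketch of how such an LLM-as-judge metric is assembled from the pieces this PR touches. The catalog names and the inference-engine call are taken from the files changed in this PR; the exact LLMAsJudge constructor arguments (inference_model, template, task, format, main_score) are an assumption inferred from the catalog JSON entries further down, and the target catalog path and max_new_tokens value are illustrative.

from unitxt.catalog import add_to_catalog
from unitxt.inference import IbmGenAiInferenceEngine, IbmGenAiInferenceEngineParams
from unitxt.llm_as_judge import LLMAsJudge

# The judge model, served through IBM GenAI and wrapped as an inference engine
# (max_new_tokens is an arbitrary illustrative value).
gen_params = IbmGenAiInferenceEngineParams(max_new_tokens=256)
inference_model = IbmGenAiInferenceEngine(
    model_name="meta-llama/llama-3-70b-instruct", parameters=gen_params
)

# The judge metric: the template turns the judged instance into the judge's input,
# and the format wraps that input in the Llama 3 instruct chat structure.
metric = LLMAsJudge(
    inference_model=inference_model,
    template="templates.response_assessment.rating.mt_bench_single_turn",
    task="rating.single_turn",
    format="formats.llama3_instruct",
    main_score="llama_3_70b_instruct_ibm_genai_template_mt_bench_single_turn",
)

add_to_catalog(
    metric,
    "metrics.llm_as_judge.rating.llama_3_70b_instruct_ibm_genai_template_mt_bench_single_turn",
    overwrite=True,
)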
60 changes: 60 additions & 0 deletions examples/evaluate_different_formats.py
@@ -0,0 +1,60 @@
from unitxt import get_logger
from unitxt.api import evaluate, load_dataset
from unitxt.inference import IbmGenAiInferenceEngine, IbmGenAiInferenceEngineParams
from unitxt.text_utils import print_dict

logger = get_logger()


model_name = "meta-llama/llama-3-8b-instruct"
gen_params = IbmGenAiInferenceEngineParams(max_new_tokens=32)
inference_model = IbmGenAiInferenceEngine(model_name=model_name, parameters=gen_params)
card = "cards.boolq.classification"
template = "templates.classification.multi_class.relation.default"

all_scores = {}
for format in [
    "formats.llama3_instruct",
    "formats.empty",
    "formats.llama3_instruct_all_demos_in_one_turn",
]:
    for system_prompt in ["system_prompts.models.llama2", "system_prompts.empty"]:
        dataset = load_dataset(
            card=card,
            template=template,
            format=format,
            system_prompt=system_prompt,
            num_demos=2,
            demos_pool_size=100,
            loader_limit=1000,
            max_test_instances=300,
        )

        test_dataset = dataset["test"]

        predictions = inference_model.infer(test_dataset)
        evaluated_dataset = evaluate(predictions=predictions, data=test_dataset)

        logger.info(
            f"Sample input and output for format '{format}' and system prompt '{system_prompt}':"
        )
        print_dict(
            evaluated_dataset[0],
            keys_to_print=[
                "source",
                "prediction",
            ],
        )
        global_scores = evaluated_dataset[0]["score"]["global"]
        print_dict(
            global_scores,
            keys_to_print=["score_name", "score", "score_ci_low", "score_ci_high"],
        )
        all_scores[(format, system_prompt)] = global_scores


for (format, system_prompt), global_scores in all_scores.items():
    logger.info(f"**** score for format '{format}' and system prompt '{system_prompt}'")
    logger.info(
        f"**** {global_scores['score_name']} : {global_scores['score']} - 95% confidence interval [{global_scores['score_ci_low']},{global_scores['score_ci_high']}]"
    )
40 changes: 29 additions & 11 deletions prepare/formats/models/llama3.py
@@ -2,25 +2,43 @@
from unitxt.formats import SystemFormat

# see: https://llama.meta.com/docs/model-cards-and-prompt-formats/meta-llama-3/
# According to: https://huggingface.co/blog/llama3#how-to-prompt-llama-3
# The Instruct versions use the following conversation structure:
# <|begin_of_text|><|start_header_id|>system<|end_header_id|>
#
# {{ system_prompt }}<|eot_id|><|start_header_id|>user<|end_header_id|>
# {{ user_message }}<|eot_id|><|start_header_id|>assistant<|end_header_id|>
#
# {{ user_msg_1 }}<|eot_id|><|start_header_id|>assistant<|end_header_id|>
#
# {{ model_answer_1 }}<|eot_id|>
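# A quick way to sanity-check this layout (an illustrative sketch, assuming access
# to the gated Llama 3 tokenizer on the Hugging Face Hub):
#
#   from transformers import AutoTokenizer
#
#   tok = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")
#   messages = [
#       {"role": "system", "content": "{{ system_prompt }}"},
#       {"role": "user", "content": "{{ user_msg_1 }}"},
#   ]
#   # Renders the special tokens exactly as laid out above and, with
#   # add_generation_prompt=True, ends with the assistant header.
#   print(tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True))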

format = SystemFormat(
    demo_format="{source}\n\n{target_prefix}{target}\n\n",
    model_input_format="<|begin_of_text|><|eot_id|><|start_header_id|>user<|end_header_id|>\n\n"
    "{instruction}\\N{demos}{source}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
    "{target_prefix}",
    demo_format="<|start_header_id|>user<|end_header_id|>\n\n"
    "{source}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
    "{target_prefix}{target}<|eot_id|>",
    model_input_format="<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n"
    + "{system_prompt}{instruction}"
    + "<|eot_id|>{demos}<|start_header_id|>user<|end_header_id|>\n\n"
    "{source}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n{target_prefix}",
)

add_to_catalog(format, "formats.llama3_chat", overwrite=True)
add_to_catalog(
    format,
    "formats.llama3_instruct",
    overwrite=True,
)

format = SystemFormat(
    demo_format="{source}\n\n{target_prefix}{target}\n\n",
    model_input_format="<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n"
    "{system_prompt}<|eot_id|><|start_header_id|>user<|end_header_id|>\n"
    "{instruction}{demos}{source}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n"
    "{target_prefix}",
    model_input_format="<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n"
    "{system_prompt}{instruction}"
    "<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n"
    "{demos}"
    "{source}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n{target_prefix}",
)

add_to_catalog(format, "formats.llama3_chat_with_system_prompt", overwrite=True)
add_to_catalog(
    format,
    "formats.llama3_instruct_all_demos_in_one_turn",
    overwrite=True,
)
2 changes: 1 addition & 1 deletion prepare/metrics/llm_as_judge/llamaguard.py
@@ -9,7 +9,7 @@
"meta-llama/llama-3-8b-instruct",
"meta-llama/llama-3-70b-instruct",
] # will point to llamaguard2
format = "formats.llama3_chat"
format = "formats.llama3_instruct"
template = "templates.safety.unsafe_content"
task = "rating.single_turn"

@@ -6,7 +6,7 @@
from unitxt.llm_as_judge import LLMAsJudge

model_list = ["meta-llama/llama-3-8b-instruct", "meta-llama/llama-3-70b-instruct"]
format = "formats.llama3_chat"
format = "formats.llama3_instruct"
template = "templates.response_assessment.rating.mt_bench_single_turn"
task = "rating.single_turn"

11 changes: 11 additions & 0 deletions prepare/system_prompts/tasks/boolqa.py
@@ -0,0 +1,11 @@
from unitxt.catalog import add_to_catalog
from unitxt.system_prompts import TextualSystemPrompt

system_prompt = TextualSystemPrompt(
    "You are an agent in charge of answering a boolean (yes/no) question. The system presents "
    "you with a passage and a question. Read the passage carefully, and then answer yes or no. "
    "Think about your answer, and make sure it makes sense. Do not explain the answer. "
    "Only say yes or no."
)

add_to_catalog(system_prompt, "system_prompts.boolqa", overwrite=True)
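A hypothetical usage sketch for this new catalog entry, pairing it with the boolq card and the Llama 3 instruct format. The card, template, and format names are taken from this PR; the combination itself is only an illustration, mirroring the load_dataset call in examples/evaluate_different_formats.py.

from unitxt.api import load_dataset

dataset = load_dataset(
    card="cards.boolq.classification",
    template="templates.classification.multi_class.relation.default",
    format="formats.llama3_instruct",
    system_prompt="system_prompts.boolqa",
    num_demos=2,
    demos_pool_size=100,
    loader_limit=1000,
)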
5 changes: 0 additions & 5 deletions src/unitxt/catalog/formats/llama3_chat.json

This file was deleted.

This file was deleted.

5 changes: 5 additions & 0 deletions src/unitxt/catalog/formats/llama3_instruct.json
@@ -0,0 +1,5 @@
{
"__type__": "system_format",
"demo_format": "<|start_header_id|>user<|end_header_id|>\n\n{source}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n{target_prefix}{target}<|eot_id|>",
"model_input_format": "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n{system_prompt}{instruction}<|eot_id|>{demos}<|start_header_id|>user<|end_header_id|>\n\n{source}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n{target_prefix}"
}
@@ -0,0 +1,5 @@
{
"__type__": "system_format",
"demo_format": "{source}\n\n{target_prefix}{target}\n\n",
"model_input_format": "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n{system_prompt}{instruction}<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n{demos}{source}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n{target_prefix}"
}
@@ -10,6 +10,6 @@
},
"template": "templates.response_assessment.rating.mt_bench_single_turn",
"task": "rating.single_turn",
"format": "formats.llama3_chat",
"format": "formats.llama3_instruct",
"main_score": "llama_3_70b_instruct_ibm_genai_template_mt_bench_single_turn"
}
@@ -10,6 +10,6 @@
},
"template": "templates.response_assessment.rating.mt_bench_single_turn",
"task": "rating.single_turn",
"format": "formats.llama3_chat",
"format": "formats.llama3_instruct",
"main_score": "llama_3_8b_instruct_ibm_genai_template_mt_bench_single_turn"
}
@@ -10,6 +10,6 @@
},
"template": "templates.safety.unsafe_content",
"task": "rating.single_turn",
"format": "formats.llama3_chat",
"format": "formats.llama3_instruct",
"main_score": "llama_3_70b_instruct_ibm_genai_template_unsafe_content"
}
@@ -10,6 +10,6 @@
},
"template": "templates.safety.unsafe_content",
"task": "rating.single_turn",
"format": "formats.llama3_chat",
"format": "formats.llama3_instruct",
"main_score": "llama_3_8b_instruct_ibm_genai_template_unsafe_content"
}
4 changes: 4 additions & 0 deletions src/unitxt/catalog/system_prompts/boolqa.json
@@ -0,0 +1,4 @@
{
"__type__": "textual_system_prompt",
"text": "You are an agent in charge of answering a boolean (yes/no) question. The system presents you with a passage and a question. Read the passage carefully, and then answer yes or no. Think about your answer, and make sure it makes sense. Do not explain the answer. Only say yes or no."
}
11 changes: 9 additions & 2 deletions tests/library/test_examples.py
@@ -31,7 +31,12 @@ def test_examples(self):
times = {}
all_example_files.sort()

excluded_files = ["use_llm_as_judje_metric.py"]
excluded_files = [
"use_llm_as_judge_metric.py",
"standalone_evaluation_llm_as_judge.py",
"evaluation_summarization_dataset_llm_as_judge.py",
"evaluate_different_formats.py",
]
for file in all_example_files:
logger.info(
"\n_____________________________________________\n"
@@ -40,6 +45,8 @@
)
if Path(file).name in excluded_files:
logger.info("Skipping file because in exclude list")
continue

start_time = time.time()
with self.subTest(file=file):
import_module_from_file(file)
@@ -55,5 +62,5 @@
)

times[file] = formatted_time
logger.info("Examplexamples table:")
logger.info("Example table:")
print_dict(times)
2 changes: 1 addition & 1 deletion tests/library/test_metrics.py
@@ -1452,7 +1452,7 @@ def _test_grouped_instance_confidence_interval(

def test_llm_as_judge_metric(self):
model_id = "meta-llama/llama-3-8b-instruct"
format = "formats.llama3_chat"
format = "formats.llama3_instruct"
task = "rating.single_turn"
template = "templates.response_assessment.rating.mt_bench_single_turn"
