diff --git a/docs/docs/examples.rst b/docs/docs/examples.rst index ba1b5aaee..ce5c1c4f7 100644 --- a/docs/docs/examples.rst +++ b/docs/docs/examples.rst @@ -7,7 +7,7 @@ Here you find complete examples showing how to perform different tasks using Uni Each example is a self contained python file that you can run and later modify. -.. list-table:: +.. list-table:: Common Use Cases :widths: 50 50 50 50 :header-rows: 1 @@ -33,11 +33,26 @@ Each example is a self contained python file that you can run and later modify. - | :ref:`Add new dataset tutorial `. | :ref:`Open QA task in catalog `. | :ref:`Open QA template in catalog `. - * - Evaluate your question-answering dataset - using LLM as a judge + * - Evaluate the impact of different formats and system prompts on the same task + - Demonstrates how different formats and system prompts affect the input provided to a llama3 chat model, and evaluates their impact on the obtained scores. + - `code `_ + - | :ref:`Formatting tutorial `. + + + +.. list-table:: LLM as a judge + :widths: 50 50 50 50 + :header-rows: 1 + + * - What do you want to do? + - Description + - Link to code + - Related documentation + * - Evaluate your question-answering dataset - Demonstrates how to evaluate a user question-answering dataset in a standalone file using a user defined task and template. In addition, it shows how to define an LLM as a judge metric, specify the template it uses to produce the input to the judge, and select the judge model and platform. - - `code `_ + - `code `_ - | :ref:`LLM as a Judge Metrics Guide `. - * - Evaluate your summarization dataset - using LLM as a judge + * - Evaluate an existing summarization dataset from the catalog with LLM as judge - Demonstrates how to evaluate a document summarization dataset by defining an LLM as a judge metric, specifying the template it uses to produce the input to the judge, and selecting the judge model and platform. - `code `_ - | :ref:`LLM as a Judge Metrics Guide `.
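(For orientation: the LLM-as-judge rows above rely on a metric assembled roughly as sketched below. This is illustrative only, not part of the diff; the LLMAsJudge constructor arguments are assumed from the catalog JSON fields changed further down (inference_model, template, task, format, main_score), and max_new_tokens is an arbitrary placeholder.)

# Illustrative sketch, in the style of the prepare/metrics/llm_as_judge scripts touched in this diff.
# Assumption: LLMAsJudge accepts the same fields that appear in the catalog JSON below.
from unitxt.catalog import add_to_catalog
from unitxt.inference import IbmGenAiInferenceEngine, IbmGenAiInferenceEngineParams
from unitxt.llm_as_judge import LLMAsJudge

inference_model = IbmGenAiInferenceEngine(
    model_name="meta-llama/llama-3-70b-instruct",
    parameters=IbmGenAiInferenceEngineParams(max_new_tokens=256),  # arbitrary value for the sketch
)
metric = LLMAsJudge(
    inference_model=inference_model,
    template="templates.response_assessment.rating.mt_bench_single_turn",
    task="rating.single_turn",
    format="formats.llama3_instruct",  # the renamed format introduced in this diff
    main_score="llama_3_70b_instruct_ibm_genai_template_mt_bench_single_turn",
)
add_to_catalog(
    metric,
    "metrics.llm_as_judge.rating.llama_3_70b_instruct_ibm_genai_template_mt_bench_single_turn",
    overwrite=True,
)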
diff --git a/examples/evaluate_different_formats.py b/examples/evaluate_different_formats.py new file mode 100644 index 000000000..56646742b --- /dev/null +++ b/examples/evaluate_different_formats.py @@ -0,0 +1,60 @@ +from unitxt import get_logger +from unitxt.api import evaluate, load_dataset +from unitxt.inference import IbmGenAiInferenceEngine, IbmGenAiInferenceEngineParams +from unitxt.text_utils import print_dict + +logger = get_logger() + + +model_name = "meta-llama/llama-3-8b-instruct" +gen_params = IbmGenAiInferenceEngineParams(max_new_tokens=32) +inference_model = IbmGenAiInferenceEngine(model_name=model_name, parameters=gen_params) +card = "cards.boolq.classification" +template = "templates.classification.multi_class.relation.default" + +all_scores = {} +for format in [ + "formats.llama3_instruct", + "formats.empty", + "formats.llama3_instruct_all_demos_in_one_turn", +]: + for system_prompt in ["system_prompts.models.llama2", "system_prompts.empty"]: + dataset = load_dataset( + card=card, + template=template, + format=format, + system_prompt=system_prompt, + num_demos=2, + demos_pool_size=100, + loader_limit=1000, + max_test_instances=300, + ) + + test_dataset = dataset["test"] + + predictions = inference_model.infer(test_dataset) + evaluated_dataset = evaluate(predictions=predictions, data=test_dataset) + + logger.info( + f"Sample input and output for format '{format}' and system prompt '{system_prompt}':" + ) + print_dict( + evaluated_dataset[0], + keys_to_print=[ + "source", + "prediction", + ], + ) + global_scores = evaluated_dataset[0]["score"]["global"] + print_dict( + global_scores, + keys_to_print=["score_name", "score", "score_ci_low", "score_ci_high"], + ) + all_scores[(format, system_prompt)] = global_scores + + +for (format, system_prompt), global_scores in all_scores.items(): + logger.info(f"**** score for format '{format}' and system prompt '{system_prompt}'") + logger.info( + f"**** {global_scores['score_name']} : {global_scores['score']} - 95% confidence interval [{global_scores['score_ci_low']},{global_scores['score_ci_high']}]" + ) diff --git a/prepare/formats/models/llama3.py b/prepare/formats/models/llama3.py index dd0866278..36a53b66b 100644 --- a/prepare/formats/models/llama3.py +++ b/prepare/formats/models/llama3.py @@ -2,25 +2,43 @@ from unitxt.formats import SystemFormat # see: https://llama.meta.com/docs/model-cards-and-prompt-formats/meta-llama-3/ +# According to: https://huggingface.co/blog/llama3#how-to-prompt-llama-3 +# The Instruct versions use the following conversation structure: # <|begin_of_text|><|start_header_id|>system<|end_header_id|> +# # {{ system_prompt }}<|eot_id|><|start_header_id|>user<|end_header_id|> -# {{ user_message }}<|eot_id|><|start_header_id|>assistant<|end_header_id|> +# +# {{ user_msg_1 }}<|eot_id|><|start_header_id|>assistant<|end_header_id|> +# +# {{ model_answer_1 }}<|eot_id|> format = SystemFormat( - demo_format="{source}\n\n{target_prefix}{target}\n\n", - model_input_format="<|begin_of_text|><|eot_id|><|start_header_id|>user<|end_header_id|>\n\n" - "{instruction}\\N{demos}{source}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n" - "{target_prefix}", + demo_format="<|start_header_id|>user<|end_header_id|>\n\n" + "{source}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n" + "{target_prefix}{target}<|eot_id|>", + model_input_format="<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n" + + "{system_prompt}{instruction}" + + "<|eot_id|>{demos}<|start_header_id|>user<|end_header_id|>\n\n" + 
"{source}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n{target_prefix}", ) -add_to_catalog(format, "formats.llama3_chat", overwrite=True) +add_to_catalog( + format, + "formats.llama3_instruct", + overwrite=True, +) format = SystemFormat( demo_format="{source}\n\n{target_prefix}{target}\n\n", - model_input_format="<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n" - "{system_prompt}<|eot_id|><|start_header_id|>user<|end_header_id|>\n" - "{instruction}{demos}{source}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n" - "{target_prefix}", + model_input_format="<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n" + "{system_prompt}{instruction}" + "<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n" + "{demos}" + "{source}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n{target_prefix}", ) -add_to_catalog(format, "formats.llama3_chat_with_system_prompt", overwrite=True) +add_to_catalog( + format, + "formats.llama3_instruct_all_demos_in_one_turn", + overwrite=True, +) diff --git a/prepare/metrics/llm_as_judge/llamaguard.py b/prepare/metrics/llm_as_judge/llamaguard.py index 3d6fed44a..91eac36f7 100644 --- a/prepare/metrics/llm_as_judge/llamaguard.py +++ b/prepare/metrics/llm_as_judge/llamaguard.py @@ -9,7 +9,7 @@ "meta-llama/llama-3-8b-instruct", "meta-llama/llama-3-70b-instruct", ] # will point to llamaguard2 -format = "formats.llama3_chat" +format = "formats.llama3_instruct" template = "templates.safety.unsafe_content" task = "rating.single_turn" diff --git a/prepare/metrics/llm_as_judge/rating/llama_3_ibm_genai_mt_bench_template.py b/prepare/metrics/llm_as_judge/rating/llama_3_ibm_genai_mt_bench_template.py index c53fcc1a5..8716dda0d 100644 --- a/prepare/metrics/llm_as_judge/rating/llama_3_ibm_genai_mt_bench_template.py +++ b/prepare/metrics/llm_as_judge/rating/llama_3_ibm_genai_mt_bench_template.py @@ -6,7 +6,7 @@ from unitxt.llm_as_judge import LLMAsJudge model_list = ["meta-llama/llama-3-8b-instruct", "meta-llama/llama-3-70b-instruct"] -format = "formats.llama3_chat" +format = "formats.llama3_instruct" template = "templates.response_assessment.rating.mt_bench_single_turn" task = "rating.single_turn" diff --git a/prepare/system_prompts/tasks/boolqa.py b/prepare/system_prompts/tasks/boolqa.py new file mode 100644 index 000000000..8ee42dd47 --- /dev/null +++ b/prepare/system_prompts/tasks/boolqa.py @@ -0,0 +1,11 @@ +from unitxt.catalog import add_to_catalog +from unitxt.system_prompts import TextualSystemPrompt + +system_prompt = TextualSystemPrompt( + "You are an agent in charge of answering a boolean (yes/no) question. The system presents " + "you with a passage and a question. Read the passage carefully, and then answer yes or no. " + "Think about your answer, and make sure it makes sense. Do not explain the answer. " + "Only say yes or no." 
+) + +add_to_catalog(system_prompt, "system_prompts.boolqa", overwrite=True) diff --git a/src/unitxt/catalog/formats/llama3_chat.json b/src/unitxt/catalog/formats/llama3_chat.json deleted file mode 100644 index 28d5248ed..000000000 --- a/src/unitxt/catalog/formats/llama3_chat.json +++ /dev/null @@ -1,5 +0,0 @@ -{ - "__type__": "system_format", - "demo_format": "{source}\n\n{target_prefix}{target}\n\n", - "model_input_format": "<|begin_of_text|><|eot_id|><|start_header_id|>user<|end_header_id|>\n\n{instruction}\\N{demos}{source}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n{target_prefix}" -} diff --git a/src/unitxt/catalog/formats/llama3_chat_with_system_prompt.json b/src/unitxt/catalog/formats/llama3_chat_with_system_prompt.json deleted file mode 100644 index b3a8791fa..000000000 --- a/src/unitxt/catalog/formats/llama3_chat_with_system_prompt.json +++ /dev/null @@ -1,5 +0,0 @@ -{ - "__type__": "system_format", - "demo_format": "{source}\n\n{target_prefix}{target}\n\n", - "model_input_format": "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n{system_prompt}<|eot_id|><|start_header_id|>user<|end_header_id|>\n{instruction}{demos}{source}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n{target_prefix}" -} diff --git a/src/unitxt/catalog/formats/llama3_instruct.json b/src/unitxt/catalog/formats/llama3_instruct.json new file mode 100644 index 000000000..e006be2f3 --- /dev/null +++ b/src/unitxt/catalog/formats/llama3_instruct.json @@ -0,0 +1,5 @@ +{ + "__type__": "system_format", + "demo_format": "<|start_header_id|>user<|end_header_id|>\n\n{source}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n{target_prefix}{target}<|eot_id|>", + "model_input_format": "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n{system_prompt}{instruction}<|eot_id|>{demos}<|start_header_id|>user<|end_header_id|>\n\n{source}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n{target_prefix}" +} diff --git a/src/unitxt/catalog/formats/llama3_instruct_all_demos_in_one_turn.json b/src/unitxt/catalog/formats/llama3_instruct_all_demos_in_one_turn.json new file mode 100644 index 000000000..324f4c084 --- /dev/null +++ b/src/unitxt/catalog/formats/llama3_instruct_all_demos_in_one_turn.json @@ -0,0 +1,5 @@ +{ + "__type__": "system_format", + "demo_format": "{source}\n\n{target_prefix}{target}\n\n", + "model_input_format": "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n{system_prompt}{instruction}<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n{demos}{source}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n{target_prefix}" +} diff --git a/src/unitxt/catalog/metrics/llm_as_judge/rating/llama_3_70b_instruct_ibm_genai_template_mt_bench_single_turn.json b/src/unitxt/catalog/metrics/llm_as_judge/rating/llama_3_70b_instruct_ibm_genai_template_mt_bench_single_turn.json index 860eb19f3..1251f05b2 100644 --- a/src/unitxt/catalog/metrics/llm_as_judge/rating/llama_3_70b_instruct_ibm_genai_template_mt_bench_single_turn.json +++ b/src/unitxt/catalog/metrics/llm_as_judge/rating/llama_3_70b_instruct_ibm_genai_template_mt_bench_single_turn.json @@ -10,6 +10,6 @@ }, "template": "templates.response_assessment.rating.mt_bench_single_turn", "task": "rating.single_turn", - "format": "formats.llama3_chat", + "format": "formats.llama3_instruct", "main_score": "llama_3_70b_instruct_ibm_genai_template_mt_bench_single_turn" } diff --git a/src/unitxt/catalog/metrics/llm_as_judge/rating/llama_3_8b_instruct_ibm_genai_template_mt_bench_single_turn.json 
b/src/unitxt/catalog/metrics/llm_as_judge/rating/llama_3_8b_instruct_ibm_genai_template_mt_bench_single_turn.json index 5b3258e8c..44e356bd2 100644 --- a/src/unitxt/catalog/metrics/llm_as_judge/rating/llama_3_8b_instruct_ibm_genai_template_mt_bench_single_turn.json +++ b/src/unitxt/catalog/metrics/llm_as_judge/rating/llama_3_8b_instruct_ibm_genai_template_mt_bench_single_turn.json @@ -10,6 +10,6 @@ }, "template": "templates.response_assessment.rating.mt_bench_single_turn", "task": "rating.single_turn", - "format": "formats.llama3_chat", + "format": "formats.llama3_instruct", "main_score": "llama_3_8b_instruct_ibm_genai_template_mt_bench_single_turn" } diff --git a/src/unitxt/catalog/metrics/llm_as_judge/safety/llama_3_70b_instruct_ibm_genai_template_unsafe_content.json b/src/unitxt/catalog/metrics/llm_as_judge/safety/llama_3_70b_instruct_ibm_genai_template_unsafe_content.json index 5d5ea86ee..0d86bece2 100644 --- a/src/unitxt/catalog/metrics/llm_as_judge/safety/llama_3_70b_instruct_ibm_genai_template_unsafe_content.json +++ b/src/unitxt/catalog/metrics/llm_as_judge/safety/llama_3_70b_instruct_ibm_genai_template_unsafe_content.json @@ -10,6 +10,6 @@ }, "template": "templates.safety.unsafe_content", "task": "rating.single_turn", - "format": "formats.llama3_chat", + "format": "formats.llama3_instruct", "main_score": "llama_3_70b_instruct_ibm_genai_template_unsafe_content" } diff --git a/src/unitxt/catalog/metrics/llm_as_judge/safety/llama_3_8b_instruct_ibm_genai_template_unsafe_content.json b/src/unitxt/catalog/metrics/llm_as_judge/safety/llama_3_8b_instruct_ibm_genai_template_unsafe_content.json index 32344eda0..f6742d136 100644 --- a/src/unitxt/catalog/metrics/llm_as_judge/safety/llama_3_8b_instruct_ibm_genai_template_unsafe_content.json +++ b/src/unitxt/catalog/metrics/llm_as_judge/safety/llama_3_8b_instruct_ibm_genai_template_unsafe_content.json @@ -10,6 +10,6 @@ }, "template": "templates.safety.unsafe_content", "task": "rating.single_turn", - "format": "formats.llama3_chat", + "format": "formats.llama3_instruct", "main_score": "llama_3_8b_instruct_ibm_genai_template_unsafe_content" } diff --git a/src/unitxt/catalog/system_prompts/boolqa.json b/src/unitxt/catalog/system_prompts/boolqa.json new file mode 100644 index 000000000..85ed7442d --- /dev/null +++ b/src/unitxt/catalog/system_prompts/boolqa.json @@ -0,0 +1,4 @@ +{ + "__type__": "textual_system_prompt", + "text": "You are an agent in charge of answering a boolean (yes/no) question. The system presents you with a passage and a question. Read the passage carefully, and then answer yes or no. Think about your answer, and make sure it makes sense. Do not explain the answer. Only say yes or no." 
+} diff --git a/tests/library/test_examples.py b/tests/library/test_examples.py index ee6ccab72..be99c64b1 100644 --- a/tests/library/test_examples.py +++ b/tests/library/test_examples.py @@ -31,7 +31,12 @@ def test_examples(self): times = {} all_example_files.sort() - excluded_files = ["use_llm_as_judje_metric.py"] + excluded_files = [ + "use_llm_as_judge_metric.py", + "standalone_evaluation_llm_as_judge.py", + "evaluation_summarization_dataset_llm_as_judge.py", + "evaluate_different_formats.py", + ] for file in all_example_files: logger.info( "\n_____________________________________________\n" @@ -40,6 +45,8 @@ def test_examples(self): ) if Path(file).name in excluded_files: logger.info("Skipping file because in exclude list") + continue + start_time = time.time() with self.subTest(file=file): import_module_from_file(file) @@ -55,5 +62,5 @@ def test_examples(self): ) times[file] = formatted_time - logger.info("Examplexamples table:") + logger.info("Example table:") print_dict(times) diff --git a/tests/library/test_metrics.py b/tests/library/test_metrics.py index f094e8923..6cd386724 100644 --- a/tests/library/test_metrics.py +++ b/tests/library/test_metrics.py @@ -1452,7 +1452,7 @@ def _test_grouped_instance_confidence_interval( def test_llm_as_judge_metric(self): model_id = "meta-llama/llama-3-8b-instruct" - format = "formats.llama3_chat" + format = "formats.llama3_instruct" task = "rating.single_turn" template = "templates.response_assessment.rating.mt_bench_single_turn"
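(For orientation: an illustrative usage sketch, not part of the diff, combining the renamed formats.llama3_instruct format with the new system_prompts.boolqa prompt on the boolq card; all identifiers are taken from this change, and the flow mirrors examples/evaluate_different_formats.py.)

# Illustrative only: runs the boolq classification card with the renamed llama3
# instruct format and the new boolqa system prompt introduced in this diff.
from unitxt.api import evaluate, load_dataset
from unitxt.inference import IbmGenAiInferenceEngine, IbmGenAiInferenceEngineParams

dataset = load_dataset(
    card="cards.boolq.classification",
    template="templates.classification.multi_class.relation.default",
    format="formats.llama3_instruct",
    system_prompt="system_prompts.boolqa",
    num_demos=2,
    demos_pool_size=100,
    loader_limit=1000,
    max_test_instances=300,
)
test_dataset = dataset["test"]

inference_model = IbmGenAiInferenceEngine(
    model_name="meta-llama/llama-3-8b-instruct",
    parameters=IbmGenAiInferenceEngineParams(max_new_tokens=32),
)
predictions = inference_model.infer(test_dataset)
evaluated_dataset = evaluate(predictions=predictions, data=test_dataset)

# Print the main aggregated score for this format / system prompt combination.
print(evaluated_dataset[0]["score"]["global"]["score"])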