llama3 instruct and chat system prompts #950

Merged
14 commits merged on Jun 30, 2024
23 changes: 19 additions & 4 deletions docs/docs/examples.rst
@@ -7,7 +7,7 @@ Here you find complete examples showing how to perform different tasks using Unitxt.
Each example is a self-contained Python file that you can run and later modify.


.. list-table::
.. list-table:: Common Use Cases
:widths: 50 50 50 50
:header-rows: 1

@@ -33,11 +33,26 @@ Each example is a self-contained Python file that you can run and later modify.
- | :ref:`Add new dataset tutorial <adding_dataset>`.
| :ref:`Open QA task in catalog <catalog.tasks.qa.open>`.
| :ref:`Open QA template in catalog <catalog.templates.qa.open.title>`.
* - Evaluate your question-answering dataset - using LLM as a judge
* - Evaluate the impact of different formats and system prompts on the same task
- Demonstrates how different formats and system prompts affect the input provided to a llama3 chat model, and evaluates their impact on the obtained scores.
- `code <https://github.com/IBM/unitxt/blob/main/examples/evaluate_different_formats.py>`_
- | :ref:`Formatting tutorial <adding_format>`.



.. list-table:: LLM as a judge
:widths: 50 50 50 50
:header-rows: 1

* - What do you want to do?
- Description
- Link to code
- Related documentation
* - Evaluate your question-answering dataset
- Demonstrates how to evaluate a user question-answering dataset in a standalone file, using a user-defined task and template. In addition, it shows how to define an LLM as a judge metric, specify the template it uses to produce the input to the judge, and select the judge model and platform.
- `code <https://github.com/IBM/unitxt/blob/main/examples/standalone_evaluation_llm_as_judge>`_
- `code <https://github.com/IBM/unitxt/blob/main/examples/standalone_evaluation_llm_as_judge.py>`_
- | :ref:`LLM as a Judge Metrics Guide <llm_as_judge>`.
* - Evaluate your summarization dataset - using LLM as a judge
* - Evaluate an existing summarization dataset from the catalog with LLM as judge
- Demonstrates how to evaluate a document summarization dataset by defining an LLM as a judge metric, specifying the template it uses to produce the input to the judge, and selecting the judge model and platform.
- `code <https://github.com/IBM/unitxt/blob/main/examples/evaluation_summarization_dataset_llm_as_judge>`_
- | :ref:`LLM as a Judge Metrics Guide <llm_as_judge>`.
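For orientation, below is a minimal sketch of how such an LLM-as-judge metric is assembled from the pieces this PR touches. The catalog names and the inference-engine call are taken from the files changed in this PR; the exact LLMAsJudge constructor arguments (inference_model, template, task, format, main_score) are an assumption inferred from the catalog JSON entries further down, and the target catalog path and max_new_tokens value are illustrative.

from unitxt.catalog import add_to_catalog
from unitxt.inference import IbmGenAiInferenceEngine, IbmGenAiInferenceEngineParams
from unitxt.llm_as_judge import LLMAsJudge

# The judge model, served through IBM GenAI and wrapped as an inference engine
# (max_new_tokens is an arbitrary illustrative value).
gen_params = IbmGenAiInferenceEngineParams(max_new_tokens=256)
inference_model = IbmGenAiInferenceEngine(
    model_name="meta-llama/llama-3-70b-instruct", parameters=gen_params
)

# The judge metric: the template turns the judged instance into the judge's input,
# and the format wraps that input in the Llama 3 instruct chat structure.
metric = LLMAsJudge(
    inference_model=inference_model,
    template="templates.response_assessment.rating.mt_bench_single_turn",
    task="rating.single_turn",
    format="formats.llama3_instruct",
    main_score="llama_3_70b_instruct_ibm_genai_template_mt_bench_single_turn",
)

add_to_catalog(
    metric,
    "metrics.llm_as_judge.rating.llama_3_70b_instruct_ibm_genai_template_mt_bench_single_turn",
    overwrite=True,
)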
60 changes: 60 additions & 0 deletions examples/evaluate_different_formats.py
@@ -0,0 +1,60 @@
from unitxt import get_logger
from unitxt.api import evaluate, load_dataset
from unitxt.inference import IbmGenAiInferenceEngine, IbmGenAiInferenceEngineParams
from unitxt.text_utils import print_dict

logger = get_logger()


model_name = "meta-llama/llama-3-8b-instruct"
gen_params = IbmGenAiInferenceEngineParams(max_new_tokens=32)
inference_model = IbmGenAiInferenceEngine(model_name=model_name, parameters=gen_params)
card = "cards.boolq.classification"
template = "templates.classification.multi_class.relation.default"

all_scores = {}
for format in [
    "formats.llama3_instruct",
    "formats.empty",
    "formats.llama3_instruct_all_demos_in_one_turn",
]:
    for system_prompt in ["system_prompts.models.llama2", "system_prompts.empty"]:
        dataset = load_dataset(
            card=card,
            template=template,
            format=format,
            system_prompt=system_prompt,
            num_demos=2,
            demos_pool_size=100,
            loader_limit=1000,
            max_test_instances=300,
        )

        test_dataset = dataset["test"]

        predictions = inference_model.infer(test_dataset)
        evaluated_dataset = evaluate(predictions=predictions, data=test_dataset)

        logger.info(
            f"Sample input and output for format '{format}' and system prompt '{system_prompt}':"
        )
        print_dict(
            evaluated_dataset[0],
            keys_to_print=[
                "source",
                "prediction",
            ],
        )
        global_scores = evaluated_dataset[0]["score"]["global"]
        print_dict(
            global_scores,
            keys_to_print=["score_name", "score", "score_ci_low", "score_ci_high"],
        )
        all_scores[(format, system_prompt)] = global_scores


for (format, system_prompt), global_scores in all_scores.items():
    logger.info(f"**** score for format '{format}' and system prompt '{system_prompt}'")
    logger.info(
        f"**** {global_scores['score_name']} : {global_scores['score']} - 95% confidence interval [{global_scores['score_ci_low']},{global_scores['score_ci_high']}]"
    )
40 changes: 29 additions & 11 deletions prepare/formats/models/llama3.py
@@ -2,25 +2,43 @@
from unitxt.formats import SystemFormat

# see: https://llama.meta.com/docs/model-cards-and-prompt-formats/meta-llama-3/
# According to: https://huggingface.co/blog/llama3#how-to-prompt-llama-3
# The Instruct versions use the following conversation structure:
# <|begin_of_text|><|start_header_id|>system<|end_header_id|>
#
# {{ system_prompt }}<|eot_id|><|start_header_id|>user<|end_header_id|>
# {{ user_message }}<|eot_id|><|start_header_id|>assistant<|end_header_id|>
#
# {{ user_msg_1 }}<|eot_id|><|start_header_id|>assistant<|end_header_id|>
#
# {{ model_answer_1 }}<|eot_id|>
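# A quick way to sanity-check this layout (an illustrative sketch, assuming access
# to the gated Llama 3 tokenizer on the Hugging Face Hub):
#
#   from transformers import AutoTokenizer
#
#   tok = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")
#   messages = [
#       {"role": "system", "content": "{{ system_prompt }}"},
#       {"role": "user", "content": "{{ user_msg_1 }}"},
#   ]
#   # Renders the special tokens exactly as laid out above and, with
#   # add_generation_prompt=True, ends with the assistant header.
#   print(tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True))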

format = SystemFormat(
    demo_format="{source}\n\n{target_prefix}{target}\n\n",
    model_input_format="<|begin_of_text|><|eot_id|><|start_header_id|>user<|end_header_id|>\n\n"
    "{instruction}\\N{demos}{source}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
    "{target_prefix}",
    demo_format="<|start_header_id|>user<|end_header_id|>\n\n"
    "{source}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
    "{target_prefix}{target}<|eot_id|>",
    model_input_format="<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n"
    + "{system_prompt}{instruction}"
    + "<|eot_id|>{demos}<|start_header_id|>user<|end_header_id|>\n\n"
    "{source}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n{target_prefix}",
)

add_to_catalog(format, "formats.llama3_chat", overwrite=True)
add_to_catalog(
    format,
    "formats.llama3_instruct",
    overwrite=True,
)

format = SystemFormat(
    demo_format="{source}\n\n{target_prefix}{target}\n\n",
    model_input_format="<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n"
    "{system_prompt}<|eot_id|><|start_header_id|>user<|end_header_id|>\n"
    "{instruction}{demos}{source}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n"
    "{target_prefix}",
    model_input_format="<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n"
    "{system_prompt}{instruction}"
    "<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n"
    "{demos}"
    "{source}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n{target_prefix}",
)

add_to_catalog(format, "formats.llama3_chat_with_system_prompt", overwrite=True)
add_to_catalog(
    format,
    "formats.llama3_instruct_all_demos_in_one_turn",
    overwrite=True,
)
2 changes: 1 addition & 1 deletion prepare/metrics/llm_as_judge/llamaguard.py
@@ -9,7 +9,7 @@
"meta-llama/llama-3-8b-instruct",
"meta-llama/llama-3-70b-instruct",
] # will point to llamaguard2
format = "formats.llama3_chat"
format = "formats.llama3_instruct"
template = "templates.safety.unsafe_content"
task = "rating.single_turn"

@@ -6,7 +6,7 @@
from unitxt.llm_as_judge import LLMAsJudge

model_list = ["meta-llama/llama-3-8b-instruct", "meta-llama/llama-3-70b-instruct"]
format = "formats.llama3_chat"
format = "formats.llama3_instruct"
template = "templates.response_assessment.rating.mt_bench_single_turn"
task = "rating.single_turn"

11 changes: 11 additions & 0 deletions prepare/system_prompts/tasks/boolqa.py
@@ -0,0 +1,11 @@
from unitxt.catalog import add_to_catalog
from unitxt.system_prompts import TextualSystemPrompt

system_prompt = TextualSystemPrompt(
    "You are an agent in charge of answering a boolean (yes/no) question. The system presents "
    "you with a passage and a question. Read the passage carefully, and then answer yes or no. "
    "Think about your answer, and make sure it makes sense. Do not explain the answer. "
    "Only say yes or no."
)

add_to_catalog(system_prompt, "system_prompts.boolqa", overwrite=True)
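A hypothetical usage sketch for this new catalog entry, pairing it with the boolq card and the Llama 3 instruct format. The card, template, and format names are taken from this PR; the combination itself is only an illustration, mirroring the load_dataset call in examples/evaluate_different_formats.py.

from unitxt.api import load_dataset

dataset = load_dataset(
    card="cards.boolq.classification",
    template="templates.classification.multi_class.relation.default",
    format="formats.llama3_instruct",
    system_prompt="system_prompts.boolqa",
    num_demos=2,
    demos_pool_size=100,
    loader_limit=1000,
)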
5 changes: 0 additions & 5 deletions src/unitxt/catalog/formats/llama3_chat.json

This file was deleted.

This file was deleted.

5 changes: 5 additions & 0 deletions src/unitxt/catalog/formats/llama3_instruct.json
@@ -0,0 +1,5 @@
{
"__type__": "system_format",
"demo_format": "<|start_header_id|>user<|end_header_id|>\n\n{source}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n{target_prefix}{target}<|eot_id|>",
"model_input_format": "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n{system_prompt}{instruction}<|eot_id|>{demos}<|start_header_id|>user<|end_header_id|>\n\n{source}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n{target_prefix}"
}
@@ -0,0 +1,5 @@
{
"__type__": "system_format",
"demo_format": "{source}\n\n{target_prefix}{target}\n\n",
"model_input_format": "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n{system_prompt}{instruction}<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n{demos}{source}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n{target_prefix}"
}
@@ -10,6 +10,6 @@
},
"template": "templates.response_assessment.rating.mt_bench_single_turn",
"task": "rating.single_turn",
"format": "formats.llama3_chat",
"format": "formats.llama3_instruct",
"main_score": "llama_3_70b_instruct_ibm_genai_template_mt_bench_single_turn"
}
@@ -10,6 +10,6 @@
},
"template": "templates.response_assessment.rating.mt_bench_single_turn",
"task": "rating.single_turn",
"format": "formats.llama3_chat",
"format": "formats.llama3_instruct",
"main_score": "llama_3_8b_instruct_ibm_genai_template_mt_bench_single_turn"
}
@@ -10,6 +10,6 @@
},
"template": "templates.safety.unsafe_content",
"task": "rating.single_turn",
"format": "formats.llama3_chat",
"format": "formats.llama3_instruct",
"main_score": "llama_3_70b_instruct_ibm_genai_template_unsafe_content"
}
@@ -10,6 +10,6 @@
},
"template": "templates.safety.unsafe_content",
"task": "rating.single_turn",
"format": "formats.llama3_chat",
"format": "formats.llama3_instruct",
"main_score": "llama_3_8b_instruct_ibm_genai_template_unsafe_content"
}
4 changes: 4 additions & 0 deletions src/unitxt/catalog/system_prompts/boolqa.json
@@ -0,0 +1,4 @@
{
"__type__": "textual_system_prompt",
"text": "You are an agent in charge of answering a boolean (yes/no) question. The system presents you with a passage and a question. Read the passage carefully, and then answer yes or no. Think about your answer, and make sure it makes sense. Do not explain the answer. Only say yes or no."
}
11 changes: 9 additions & 2 deletions tests/library/test_examples.py
@@ -31,7 +31,12 @@ def test_examples(self):
times = {}
all_example_files.sort()

excluded_files = ["use_llm_as_judje_metric.py"]
excluded_files = [
"use_llm_as_judge_metric.py",
"standalone_evaluation_llm_as_judge.py",
"evaluation_summarization_dataset_llm_as_judge.py",
"evaluate_different_formats.py",
]
for file in all_example_files:
logger.info(
"\n_____________________________________________\n"
@@ -40,6 +45,8 @@
)
if Path(file).name in excluded_files:
logger.info("Skipping file because in exclude list")
continue

start_time = time.time()
with self.subTest(file=file):
import_module_from_file(file)
@@ -55,5 +62,5 @@
)

times[file] = formatted_time
logger.info("Examplexamples table:")
logger.info("Example table:")
print_dict(times)
2 changes: 1 addition & 1 deletion tests/library/test_metrics.py
@@ -1452,7 +1452,7 @@ def _test_grouped_instance_confidence_interval(

def test_llm_as_judge_metric(self):
model_id = "meta-llama/llama-3-8b-instruct"
format = "formats.llama3_chat"
format = "formats.llama3_instruct"
task = "rating.single_turn"
template = "templates.response_assessment.rating.mt_bench_single_turn"
