-
Notifications
You must be signed in to change notification settings - Fork 39
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add llm as judge mt-bench dataset and metrics (#791)
* add mt_bench_single_turn_gpt4_judge dataset * added typings to model_response_assessment task field * fixed output_format in mt_bench template * fixed output_format in mt_bench template * add llama3 format * temporal changes to the inference engines * add llama3_bam_mt_bench_prompt llm-as-judge metric * add assert to openai model recipe * update genai and openai inference apis * add model_response_assessment_chat task * add ChatTemplate * add model_response_assessment.json * fix model_response_assessment.json * add template and task of chat llm as judge * mt bench templates * mt bench templates * model assessment tasks * add InterleaveListsToDialogOperator operator * update dialog template * update mt bench template * update mt bench template update * update chat template * add mt bench datasets * small fixes * update metrics * update metrics * delete old files * update test requirements file * update test requirements file * update llam3 metric with correct format * add model assestmnt tasks with reference * update tasks * clear catalog * add tasks * update task * update templates * update * update * update * add mt bench pairwise proccessor * remove odl file * update * add model assesment pairwise comparison tass * add pairwise templates * fix pairwise templates * fix mt bench pairwise processor * fix template * add mt-bench pairwise dataset * llm as judge metric cards * add llama3 metrics * update * update * update prepare test python version * clean catalog * update templates * update tasks * update tasks * update templates * update cards * update cards * update templates * add cards * add cards for llm as judge metric * add cards for llm as judge metric * add metrics * merge * add mt becnh generation datasets * fix * fix * fix * fix * update python to 3.9 for catalog testing * remove old catalog items * update llm as a judge * update readme * update tests * update dynamic cards for llm as judge * update llm as jusge etric * update tests * add the ability to 
strip_system_prompt_and_format_from_inputs * update tests * update * update * update * update * update * update * update * update * update * update * update * update * add phi3 format * update readme * update readme * update readme * update readme * update readme * update readme * update readme * update readme * update readme * update readme * update readme * update readme * update readme * update readme * update readme * update readme * update readme * update readme * update readme * update readme * update cards with LiteralEval * update cards with LiteralEval * make llm judge dynamic fields * add json * update * update metric * update * fix * update readme * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * Update llm_as_judge.rst * update * update * update * update * update * Update llm_as_judge.rst (#847) * update * update * update * update * update * update * update * update * small fix * small fix --------- Co-authored-by: Yoav Katz <68273864+yoavkatz@users.noreply.github.com>
- Loading branch information
Showing
95 changed files
with
2,995 additions
and
383 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Large diffs are not rendered by default.
Oops, something went wrong.
15 changes: 15 additions & 0 deletions
15
prepare/cards/dynamic_cards_for_llm_judges/llm_as_judge_metrics.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
from unitxt.blocks import TaskCard
from unitxt.catalog import add_to_catalog

# Tasks for which a minimal "dynamic" judge card is published: no loader and
# no preprocessing steps, so the card is driven entirely by user-supplied data.
judge_tasks = [
    "tasks.response_assessment.rating.single_turn",
    "tasks.response_assessment.rating.single_turn_with_reference",
]

for judge_task in judge_tasks:
    dynamic_card = TaskCard(loader=None, preprocess_steps=[], task=judge_task)
    # The catalog entry is keyed by the last two segments of the task name,
    # e.g. "rating.single_turn".
    suffix = ".".join(judge_task.split(".")[-2:])
    add_to_catalog(
        dynamic_card,
        f"cards.dynamic_cards_for_llm_judges.{suffix}",
        overwrite=True,
    )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
from unitxt.blocks import TaskCard
from unitxt.catalog import add_to_catalog
from unitxt.loaders import LoadHF
from unitxt.operators import AddFields, CopyFields, RenameFields
from unitxt.splitters import RenameSplits
from unitxt.test_utils.card import test_card

# English MT-Bench questions, first turn only, exposed as a plain generation
# task so candidate models can produce answers to be judged later.
card = TaskCard(
    loader=LoadHF(path="dim/mt_bench_en", split="train"),
    preprocess_steps=[
        # The HF dataset ships a single "train" split; evaluate it as "test".
        RenameSplits({"train": "test"}),
        # Keep only the first turn of each conversation.
        CopyFields(field_to_field={"turns/0": "turns"}),
        RenameFields(field_to_field={"turns": "input", "category": "group"}),
        AddFields(
            fields={
                # NOTE(review): the literal string "None" (not the None object)
                # fills the unused gold-output slot — confirm this is intended.
                "output": "None",
                "type_of_input": "question",
                "type_of_output": "answer",
            }
        ),
    ],
    task="tasks.generation",
    templates=["templates.empty"],
)

test_card(card, demos_taken_from="test", strict=False)
add_to_catalog(
    card,
    "cards.mt_bench.generation.english_single_turn",
    overwrite=True,
)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
from unitxt.blocks import TaskCard
from unitxt.catalog import add_to_catalog
from unitxt.loaders import LoadHF
from unitxt.operators import AddFields, CopyFields, RenameFields
from unitxt.splitters import RenameSplits
from unitxt.test_utils.card import test_card

# Japanese MT-Bench questions, first turn only, exposed as a plain generation
# task so candidate models can produce answers to be judged later.
card = TaskCard(
    loader=LoadHF(path="shi3z/MTbenchJapanese", split="train"),
    preprocess_steps=[
        # The HF dataset ships a single "train" split; evaluate it as "test".
        RenameSplits({"train": "test"}),
        # Keep only the first turn of each conversation.
        CopyFields(field_to_field={"turns/0": "turns"}),
        RenameFields(field_to_field={"turns": "input", "category": "group"}),
        AddFields(
            fields={
                # NOTE(review): the literal string "None" (not the None object)
                # fills the unused gold-output slot — confirm this is intended.
                "output": "None",
                "type_of_input": "question",
                "type_of_output": "answer",
            }
        ),
    ],
    task="tasks.generation",
    templates=["templates.empty"],
)

test_card(card, demos_taken_from="test", strict=False)
add_to_catalog(
    card,
    "cards.mt_bench.generation.japanese_single_turn",
    overwrite=True,
)
62 changes: 62 additions & 0 deletions
62
prepare/cards/mt_bench/response_assessment/pairwise_comparison/multi_turn_gpt4_judgement.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,62 @@ | ||
from unitxt.blocks import TaskCard
from unitxt.catalog import add_to_catalog
from unitxt.loaders import LoadHF
from unitxt.operators import (
    FilterByCondition,
    InterleaveListsToDialogOperator,
    MapInstanceValues,
    RenameFields,
)
from unitxt.processors import LiteralEval
from unitxt.splitters import RenameSplits
from unitxt.test_utils.card import test_card

# MT-Bench pairwise comparison, multi-turn (turn == 2), no reference answers:
# GPT-4 verdicts over two model dialogs for the same user turns.
card = TaskCard(
    loader=LoadHF(
        path="OfirArviv/mt_bench_pairwise_comparison_gpt4_judgments", split="train"
    ),
    preprocess_steps=[
        # The HF dataset ships a single "train" split; evaluate it as "test".
        RenameSplits({"train": "test"}),
        # Keep second-turn judgments that carry no reference answers and whose
        # verdict names a winner or a tie.
        FilterByCondition(values={"turn": 2}, condition="eq"),
        FilterByCondition(values={"reference": "[]"}, condition="eq"),
        FilterByCondition(
            values={"winner": ["model_1", "tie", "model_2"]}, condition="in"
        ),
        # Map raw verdict labels onto the task's choice vocabulary.
        MapInstanceValues(
            mappers={
                "winner": {"model_1": "choice_a", "model_2": "choice_b", "tie": "tie"}
            }
        ),
        RenameFields(field_to_field={"category": "group"}),
        # These columns arrive as stringified Python lists; parse them in place.
        LiteralEval("model_input", to_field="model_input"),
        LiteralEval("model_1_output", to_field="model_1_output"),
        LiteralEval("model_2_output", to_field="model_2_output"),
        # Weave the user turns with each model's answers into two dialogs.
        InterleaveListsToDialogOperator(
            user_turns_field="model_input",
            assistant_turns_field="model_1_output",
            to_field="dialog_a",
        ),
        InterleaveListsToDialogOperator(
            user_turns_field="model_input",
            assistant_turns_field="model_2_output",
            to_field="dialog_b",
        ),
    ],
    task="tasks.response_assessment.pairwise_comparison.multi_turn",
    templates=[
        "templates.response_assessment.pairwise_comparison.mt_bench_multi_turn_with_shuffle"
    ],
)

test_card(card, demos_taken_from="test", strict=False)
add_to_catalog(
    card,
    "cards.mt_bench.response_assessment.pairwise_comparison.multi_turn_gpt4_judgement",
    overwrite=True,
)
64 changes: 64 additions & 0 deletions
64
...bench/response_assessment/pairwise_comparison/multi_turn_with_reference_gpt4_judgement.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,64 @@ | ||
from unitxt.blocks import TaskCard
from unitxt.catalog import add_to_catalog
from unitxt.loaders import LoadHF
from unitxt.operators import (
    FilterByCondition,
    InterleaveListsToDialogOperator,
    MapInstanceValues,
    RenameFields,
)
from unitxt.processors import LiteralEval
from unitxt.splitters import RenameSplits
from unitxt.test_utils.card import test_card

# MT-Bench pairwise comparison, multi-turn (turn == 2), WITH reference answers:
# GPT-4 verdicts over two model dialogs plus a gold reference dialog.
card = TaskCard(
    loader=LoadHF(
        path="OfirArviv/mt_bench_pairwise_comparison_gpt4_judgments", split="train"
    ),
    preprocess_steps=[
        # The HF dataset ships a single "train" split; evaluate it as "test".
        RenameSplits({"train": "test"}),
        # Keep second-turn judgments that DO carry reference answers
        # ("reference" != "[]") and whose verdict names a winner or a tie.
        FilterByCondition(values={"turn": 2}, condition="eq"),
        FilterByCondition(values={"reference": "[]"}, condition="ne"),
        FilterByCondition(
            values={"winner": ["model_1", "tie", "model_2"]}, condition="in"
        ),
        # Map raw verdict labels onto the task's choice vocabulary.
        MapInstanceValues(
            mappers={
                "winner": {"model_1": "choice_a", "model_2": "choice_b", "tie": "tie"}
            }
        ),
        RenameFields(field_to_field={"category": "group"}),
        # These columns arrive as stringified Python lists; parse them in place.
        LiteralEval("model_input", to_field="model_input"),
        LiteralEval("model_1_output", to_field="model_1_output"),
        LiteralEval("model_2_output", to_field="model_2_output"),
        LiteralEval("reference", to_field="reference"),
        # Weave the user turns with each answer list into three dialogs:
        # the two candidates and the gold reference.
        InterleaveListsToDialogOperator(
            user_turns_field="model_input",
            assistant_turns_field="model_1_output",
            to_field="dialog_a",
        ),
        InterleaveListsToDialogOperator(
            user_turns_field="model_input",
            assistant_turns_field="model_2_output",
            to_field="dialog_b",
        ),
        InterleaveListsToDialogOperator(
            user_turns_field="model_input",
            assistant_turns_field="reference",
            to_field="reference_dialog",
        ),
    ],
    task="tasks.response_assessment.pairwise_comparison.multi_turn_with_reference",
    templates=[
        "templates.response_assessment.pairwise_comparison.mt_bench_multi_turn_with_reference_with_shuffle"
    ],
)

# Referenced instances are sparse; raise the loader limit so the filters
# still leave enough instances for the card test.
test_card(card, demos_taken_from="test", strict=False, loader_limit=1000)
add_to_catalog(
    card,
    "cards.mt_bench.response_assessment.pairwise_comparison.multi_turn_with_reference_gpt4_judgement",
    overwrite=True,
)
58 changes: 58 additions & 0 deletions
58
prepare/cards/mt_bench/response_assessment/pairwise_comparison/single_turn_gpt4_judgement.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,58 @@ | ||
from unitxt.blocks import TaskCard
from unitxt.catalog import add_to_catalog
from unitxt.loaders import LoadHF
from unitxt.operators import (
    CopyFields,
    FilterByCondition,
    MapInstanceValues,
    RenameFields,
)
from unitxt.processors import LiteralEval
from unitxt.splitters import RenameSplits
from unitxt.test_utils.card import test_card

# MT-Bench pairwise comparison, single-turn (turn == 1), no reference answers:
# GPT-4 verdicts over two single model answers to the same question.
card = TaskCard(
    loader=LoadHF(
        path="OfirArviv/mt_bench_pairwise_comparison_gpt4_judgments", split="train"
    ),
    preprocess_steps=[
        # The HF dataset ships a single "train" split; evaluate it as "test".
        RenameSplits({"train": "test"}),
        # Keep first-turn judgments without reference answers and whose
        # verdict names a winner or a tie.
        FilterByCondition(values={"turn": 1}, condition="eq"),
        FilterByCondition(values={"reference": "[]"}, condition="eq"),
        FilterByCondition(
            values={"winner": ["model_1", "tie", "model_2"]}, condition="in"
        ),
        # Map raw verdict labels onto the task's choice vocabulary.
        MapInstanceValues(
            mappers={
                "winner": {"model_1": "choice_a", "model_2": "choice_b", "tie": "tie"}
            }
        ),
        RenameFields(
            field_to_field={
                "model_input": "question",
                "model_1_output": "answer_a",
                "model_2_output": "answer_b",
                "category": "group",
            }
        ),
        # Each column arrives as a stringified single-element list:
        # parse it, then unpack the lone element.
        LiteralEval("question", to_field="question"),
        CopyFields(field_to_field={"question/0": "question"}),
        LiteralEval("answer_a", to_field="answer_a"),
        CopyFields(field_to_field={"answer_a/0": "answer_a"}),
        LiteralEval("answer_b", to_field="answer_b"),
        CopyFields(field_to_field={"answer_b/0": "answer_b"}),
    ],
    task="tasks.response_assessment.pairwise_comparison.single_turn",
    templates=[
        "templates.response_assessment.pairwise_comparison.mt_bench_single_turn_with_shuffle"
    ],
)

test_card(card, demos_taken_from="test", strict=False)
add_to_catalog(
    card,
    "cards.mt_bench.response_assessment.pairwise_comparison.single_turn_gpt4_judgement",
    overwrite=True,
)
61 changes: 61 additions & 0 deletions
61
...ench/response_assessment/pairwise_comparison/single_turn_with_reference_gpt4_judgement.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,61 @@ | ||
from unitxt.blocks import TaskCard
from unitxt.catalog import add_to_catalog
from unitxt.loaders import LoadHF
from unitxt.operators import (
    CopyFields,
    FilterByCondition,
    MapInstanceValues,
    RenameFields,
)
from unitxt.processors import LiteralEval
from unitxt.splitters import RenameSplits
from unitxt.test_utils.card import test_card

# MT-Bench pairwise comparison, single-turn (turn == 1), WITH reference
# answers: GPT-4 verdicts over two model answers plus a gold reference.
card = TaskCard(
    loader=LoadHF(
        path="OfirArviv/mt_bench_pairwise_comparison_gpt4_judgments", split="train"
    ),
    preprocess_steps=[
        # The HF dataset ships a single "train" split; evaluate it as "test".
        RenameSplits({"train": "test"}),
        # Keep first-turn judgments that DO carry reference answers
        # ("reference" != "[]") and whose verdict names a winner or a tie.
        FilterByCondition(values={"turn": 1}, condition="eq"),
        FilterByCondition(values={"reference": "[]"}, condition="ne"),
        FilterByCondition(
            values={"winner": ["model_1", "tie", "model_2"]}, condition="in"
        ),
        # Map raw verdict labels onto the task's choice vocabulary.
        MapInstanceValues(
            mappers={
                "winner": {"model_1": "choice_a", "model_2": "choice_b", "tie": "tie"}
            }
        ),
        RenameFields(
            field_to_field={
                "model_input": "question",
                "model_1_output": "answer_a",
                "model_2_output": "answer_b",
                "reference": "reference_answer",
                "category": "group",
            }
        ),
        # Each column arrives as a stringified single-element list:
        # parse it, then unpack the lone element.
        LiteralEval("question", to_field="question"),
        CopyFields(field_to_field={"question/0": "question"}),
        LiteralEval("answer_a", to_field="answer_a"),
        CopyFields(field_to_field={"answer_a/0": "answer_a"}),
        LiteralEval("answer_b", to_field="answer_b"),
        CopyFields(field_to_field={"answer_b/0": "answer_b"}),
        LiteralEval("reference_answer", to_field="reference_answer"),
        CopyFields(field_to_field={"reference_answer/0": "reference_answer"}),
    ],
    task="tasks.response_assessment.pairwise_comparison.single_turn_with_reference",
    templates=[
        "templates.response_assessment.pairwise_comparison.mt_bench_single_turn_with_reference_with_shuffle"
    ],
)

# Referenced instances are sparse; raise the loader limit so the filters
# still leave enough instances for the card test.
test_card(card, demos_taken_from="test", strict=False, loader_limit=1000)
add_to_catalog(
    card,
    "cards.mt_bench.response_assessment.pairwise_comparison.single_turn_with_reference_gpt4_judgement",
    overwrite=True,
)
Oops, something went wrong.