Add LLM-as-judge MT-Bench dataset and metrics (#791)
* add mt_bench_single_turn_gpt4_judge dataset

* added typings to model_response_assessment task field

* fixed output_format in mt_bench template

* fixed output_format in mt_bench template

* add llama3 format

* temporary changes to the inference engines

* add llama3_bam_mt_bench_prompt llm-as-judge metric

* add assert to openai model recipe

* update genai and openai inference apis

* add model_response_assessment_chat task

* add ChatTemplate

* add model_response_assessment.json

* fix model_response_assessment.json

* add template and task of chat llm as judge

* mt bench templates

* mt bench templates

* model assessment tasks

* add InterleaveListsToDialogOperator operator

* update dialog template

* update mt bench template

* update mt bench template update

* update chat template

* add mt bench datasets

* small fixes

* update metrics

* update metrics

* delete old files

* update test requirements file

* update test requirements file

* update llama3 metric with correct format

* add model assessment tasks with reference

* update tasks

* clear catalog

* add tasks

* update task

* update templates

* update

* update

* update

* add mt bench pairwise processor

* remove old file

* update

* add model assessment pairwise comparison tasks

* add pairwise templates

* fix pairwise templates

* fix mt bench pairwise processor

* fix template

* add mt-bench pairwise dataset

* llm as judge metric cards

* add llama3 metrics

* update

* update

* update prepare test python version

* clean catalog

* update templates

* update tasks

* update tasks

* update templates

* update cards

* update cards

* update templates

* add cards

* add cards for llm as judge metric

* add cards for llm as judge metric

* add metrics

* merge

* add mt bench generation datasets

* fix

* fix

* fix

* fix

* update python to 3.9 for catalog testing

* remove old catalog items

* update llm as a judge

* update readme

* update tests

* update dynamic cards for llm as judge

* update llm as judge metric

* update tests

* add the ability to strip_system_prompt_and_format_from_inputs

* update tests

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* add phi3 format

* update readme

* update readme

* update readme

* update readme

* update readme

* update readme

* update readme

* update readme

* update readme

* update readme

* update readme

* update readme

* update readme

* update readme

* update readme

* update readme

* update readme

* update readme

* update readme

* update readme

* update cards with LiteralEval

* update cards with LiteralEval

* make llm judge dynamic fields

* add json

* update

* update metric

* update

* fix

* update readme

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* Update llm_as_judge.rst

* update

* update

* update

* update

* update

* Update llm_as_judge.rst (#847)

* update

* update

* update

* update

* update

* update

* update

* update

* small fix

* small fix

---------

Co-authored-by: Yoav Katz <68273864+yoavkatz@users.noreply.github.com>
OfirArviv and yoavkatz committed May 20, 2024
1 parent 4064804 commit d854109
Showing 95 changed files with 2,995 additions and 383 deletions.
4 changes: 3 additions & 1 deletion .github/workflows/catalog_consistency.yml
@@ -12,13 +12,15 @@ jobs:
     runs-on: ubuntu-latest
     env:
       OS: ubuntu-latest
+      GENAI_KEY: "dummy"
+      UNITXT_ALLOW_PASSING_DATA_TO_REMOTE_API: "True"

     steps:
       - uses: actions/checkout@v4

       - uses: actions/setup-python@v5
         with:
-          python-version: '3.8'
+          python-version: '3.9'
           cache: 'pip' # caching pip dependencies
       - run: pip install -r requirements/base.rqr
       - run: pip install -r requirements/tests.rqr
4 changes: 3 additions & 1 deletion .github/workflows/catalog_preparation.yml
@@ -12,13 +12,15 @@ jobs:
     runs-on: ubuntu-latest
    env:
       OS: ubuntu-latest
+      GENAI_KEY: "dummy"
+      UNITXT_ALLOW_PASSING_DATA_TO_REMOTE_API: "True"

     steps:
       - uses: actions/checkout@v4

       - uses: actions/setup-python@v5
         with:
-          python-version: '3.8'
+          python-version: '3.9'
           cache: 'pip' # caching pip dependencies
       - run: pip install -r requirements/base.rqr
       - run: pip install -r requirements/tests.rqr
2 changes: 1 addition & 1 deletion .github/workflows/library_tests.yml
@@ -18,7 +18,7 @@ jobs:

       - uses: actions/setup-python@v5
         with:
-          python-version: '3.8'
+          python-version: '3.9'
           cache: 'pip' # caching pip dependencies
       - run: pip install -r requirements/base.rqr
       - run: pip install -r requirements/tests.rqr
379 changes: 309 additions & 70 deletions docs/docs/llm_as_judge.rst

Large diffs are not rendered by default.

15 changes: 15 additions & 0 deletions prepare/cards/dynamic_cards_for_llm_judges/llm_as_judge_metrics.py
@@ -0,0 +1,15 @@
from unitxt.blocks import TaskCard
from unitxt.catalog import add_to_catalog

tasks = [
    "tasks.response_assessment.rating.single_turn",
    "tasks.response_assessment.rating.single_turn_with_reference",
]
for task in tasks:
    card = TaskCard(loader=None, preprocess_steps=[], task=task)
    sub_task = ".".join(task.split(".")[-2:])
    add_to_catalog(
        card,
        f"cards.dynamic_cards_for_llm_judges.{sub_task}",
        overwrite=True,
    )
42 changes: 42 additions & 0 deletions prepare/cards/mt_bench/generation/english_single_turn.py
@@ -0,0 +1,42 @@
from unitxt.blocks import (
    TaskCard,
)
from unitxt.catalog import add_to_catalog
from unitxt.loaders import LoadHF
from unitxt.operators import (
    AddFields,
    CopyFields,
    RenameFields,
)
from unitxt.splitters import RenameSplits
from unitxt.test_utils.card import test_card

card = TaskCard(
    loader=LoadHF(path="dim/mt_bench_en", split="train"),
    preprocess_steps=[
        RenameSplits({"train": "test"}),
        CopyFields(field_to_field={"turns/0": "turns"}),
        RenameFields(
            field_to_field={
                "turns": "input",
                "category": "group",
            }
        ),
        AddFields(
            fields={
                "output": "None",
                "type_of_input": "question",
                "type_of_output": "answer",
            }
        ),
    ],
    task="tasks.generation",
    templates=["templates.empty"],
)

test_card(card, demos_taken_from="test", strict=False)
add_to_catalog(
    card,
    "cards.mt_bench.generation.english_single_turn",
    overwrite=True,
)
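
Once registered in the catalog, this generation card can be pulled through unitxt's standard recipe API. A minimal sketch (recipe-string form, assuming the default splits produced by the card) might look like:

from unitxt import load_dataset

# Assumed usage: load the MT-Bench generation card registered above.
# "templates.empty" is the only template the card declares.
dataset = load_dataset(
    "card=cards.mt_bench.generation.english_single_turn,template=templates.empty"
)
print(dataset["test"][0]["source"])  # the raw MT-Bench question fed to the model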
42 changes: 42 additions & 0 deletions prepare/cards/mt_bench/generation/japanese_single_turn.py
@@ -0,0 +1,42 @@
from unitxt.blocks import (
    TaskCard,
)
from unitxt.catalog import add_to_catalog
from unitxt.loaders import LoadHF
from unitxt.operators import (
    AddFields,
    CopyFields,
    RenameFields,
)
from unitxt.splitters import RenameSplits
from unitxt.test_utils.card import test_card

card = TaskCard(
    loader=LoadHF(path="shi3z/MTbenchJapanese", split="train"),
    preprocess_steps=[
        RenameSplits({"train": "test"}),
        CopyFields(field_to_field={"turns/0": "turns"}),
        RenameFields(
            field_to_field={
                "turns": "input",
                "category": "group",
            }
        ),
        AddFields(
            fields={
                "output": "None",
                "type_of_input": "question",
                "type_of_output": "answer",
            }
        ),
    ],
    task="tasks.generation",
    templates=["templates.empty"],
)

test_card(card, demos_taken_from="test", strict=False)
add_to_catalog(
    card,
    "cards.mt_bench.generation.japanese_single_turn",
    overwrite=True,
)
@@ -0,0 +1,62 @@
from unitxt.blocks import (
    TaskCard,
)
from unitxt.catalog import add_to_catalog
from unitxt.loaders import LoadHF
from unitxt.operators import (
    FilterByCondition,
    InterleaveListsToDialogOperator,
    MapInstanceValues,
    RenameFields,
)
from unitxt.processors import LiteralEval
from unitxt.splitters import RenameSplits
from unitxt.test_utils.card import test_card

card = TaskCard(
    loader=LoadHF(
        path="OfirArviv/mt_bench_pairwise_comparison_gpt4_judgments", split="train"
    ),
    preprocess_steps=[
        RenameSplits({"train": "test"}),
        FilterByCondition(values={"turn": 2}, condition="eq"),
        FilterByCondition(values={"reference": "[]"}, condition="eq"),
        FilterByCondition(
            values={"winner": ["model_1", "tie", "model_2"]}, condition="in"
        ),
        MapInstanceValues(
            mappers={
                "winner": {"model_1": "choice_a", "model_2": "choice_b", "tie": "tie"}
            }
        ),
        RenameFields(
            field_to_field={
                "category": "group",
            }
        ),
        LiteralEval("model_input", to_field="model_input"),
        LiteralEval("model_1_output", to_field="model_1_output"),
        LiteralEval("model_2_output", to_field="model_2_output"),
        InterleaveListsToDialogOperator(
            user_turns_field="model_input",
            assistant_turns_field="model_1_output",
            to_field="dialog_a",
        ),
        InterleaveListsToDialogOperator(
            user_turns_field="model_input",
            assistant_turns_field="model_2_output",
            to_field="dialog_b",
        ),
    ],
    task="tasks.response_assessment.pairwise_comparison.multi_turn",
    templates=[
        "templates.response_assessment.pairwise_comparison.mt_bench_multi_turn_with_shuffle"
    ],
)

test_card(card, demos_taken_from="test", strict=False)
add_to_catalog(
    card,
    "cards.mt_bench.response_assessment.pairwise_comparison.multi_turn_gpt4_judgement",
    overwrite=True,
)
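
The InterleaveListsToDialogOperator introduced in this PR zips the list of user turns with the judged model's turns into a single dialog field (dialog_a / dialog_b above). Conceptually it behaves roughly like the standalone sketch below; this is an illustration of the interleaving idea, not the operator's actual source, and the (role, text) tuple output shape is an assumption:

from typing import List, Tuple


def interleave_to_dialog(
    user_turns: List[str], assistant_turns: List[str]
) -> List[Tuple[str, str]]:
    """Alternate user and assistant turns into a (role, text) dialog list."""
    dialog = []
    for user_text, assistant_text in zip(user_turns, assistant_turns):
        dialog.append(("user", user_text))
        dialog.append(("assistant", assistant_text))
    return dialog


# e.g. interleave_to_dialog(model_input, model_1_output) -> the dialog_a structure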
@@ -0,0 +1,64 @@
from unitxt.blocks import (
    TaskCard,
)
from unitxt.catalog import add_to_catalog
from unitxt.loaders import LoadHF
from unitxt.operators import (
    FilterByCondition,
    InterleaveListsToDialogOperator,
    MapInstanceValues,
    RenameFields,
)
from unitxt.processors import LiteralEval
from unitxt.splitters import RenameSplits
from unitxt.test_utils.card import test_card

card = TaskCard(
    loader=LoadHF(
        path="OfirArviv/mt_bench_pairwise_comparison_gpt4_judgments", split="train"
    ),
    preprocess_steps=[
        RenameSplits({"train": "test"}),
        FilterByCondition(values={"turn": 2}, condition="eq"),
        FilterByCondition(values={"reference": "[]"}, condition="ne"),
        FilterByCondition(
            values={"winner": ["model_1", "tie", "model_2"]}, condition="in"
        ),
        MapInstanceValues(
            mappers={
                "winner": {"model_1": "choice_a", "model_2": "choice_b", "tie": "tie"}
            }
        ),
        RenameFields(field_to_field={"category": "group"}),
        LiteralEval("model_input", to_field="model_input"),
        LiteralEval("model_1_output", to_field="model_1_output"),
        LiteralEval("model_2_output", to_field="model_2_output"),
        LiteralEval("reference", to_field="reference"),
        InterleaveListsToDialogOperator(
            user_turns_field="model_input",
            assistant_turns_field="model_1_output",
            to_field="dialog_a",
        ),
        InterleaveListsToDialogOperator(
            user_turns_field="model_input",
            assistant_turns_field="model_2_output",
            to_field="dialog_b",
        ),
        InterleaveListsToDialogOperator(
            user_turns_field="model_input",
            assistant_turns_field="reference",
            to_field="reference_dialog",
        ),
    ],
    task="tasks.response_assessment.pairwise_comparison.multi_turn_with_reference",
    templates=[
        "templates.response_assessment.pairwise_comparison.mt_bench_multi_turn_with_reference_with_shuffle"
    ],
)

test_card(card, demos_taken_from="test", strict=False, loader_limit=1000)
add_to_catalog(
    card,
    "cards.mt_bench.response_assessment.pairwise_comparison.multi_turn_with_reference_gpt4_judgement",
    overwrite=True,
)
@@ -0,0 +1,58 @@
from unitxt.blocks import (
    TaskCard,
)
from unitxt.catalog import add_to_catalog
from unitxt.loaders import LoadHF
from unitxt.operators import (
    CopyFields,
    FilterByCondition,
    MapInstanceValues,
    RenameFields,
)
from unitxt.processors import LiteralEval
from unitxt.splitters import RenameSplits
from unitxt.test_utils.card import test_card

card = TaskCard(
    loader=LoadHF(
        path="OfirArviv/mt_bench_pairwise_comparison_gpt4_judgments", split="train"
    ),
    preprocess_steps=[
        RenameSplits({"train": "test"}),
        FilterByCondition(values={"turn": 1}, condition="eq"),
        FilterByCondition(values={"reference": "[]"}, condition="eq"),
        FilterByCondition(
            values={"winner": ["model_1", "tie", "model_2"]}, condition="in"
        ),
        MapInstanceValues(
            mappers={
                "winner": {"model_1": "choice_a", "model_2": "choice_b", "tie": "tie"}
            }
        ),
        RenameFields(
            field_to_field={
                "model_input": "question",
                "model_1_output": "answer_a",
                "model_2_output": "answer_b",
                "category": "group",
            }
        ),
        LiteralEval("question", to_field="question"),
        CopyFields(field_to_field={"question/0": "question"}),
        LiteralEval("answer_a", to_field="answer_a"),
        CopyFields(field_to_field={"answer_a/0": "answer_a"}),
        LiteralEval("answer_b", to_field="answer_b"),
        CopyFields(field_to_field={"answer_b/0": "answer_b"}),
    ],
    task="tasks.response_assessment.pairwise_comparison.single_turn",
    templates=[
        "templates.response_assessment.pairwise_comparison.mt_bench_single_turn_with_shuffle"
    ],
)

test_card(card, demos_taken_from="test", strict=False)
add_to_catalog(
    card,
    "cards.mt_bench.response_assessment.pairwise_comparison.single_turn_gpt4_judgement",
    overwrite=True,
)
@@ -0,0 +1,61 @@
from unitxt.blocks import (
    TaskCard,
)
from unitxt.catalog import add_to_catalog
from unitxt.loaders import LoadHF
from unitxt.operators import (
    CopyFields,
    FilterByCondition,
    MapInstanceValues,
    RenameFields,
)
from unitxt.processors import LiteralEval
from unitxt.splitters import RenameSplits
from unitxt.test_utils.card import test_card

card = TaskCard(
    loader=LoadHF(
        path="OfirArviv/mt_bench_pairwise_comparison_gpt4_judgments", split="train"
    ),
    preprocess_steps=[
        RenameSplits({"train": "test"}),
        FilterByCondition(values={"turn": 1}, condition="eq"),
        FilterByCondition(values={"reference": "[]"}, condition="ne"),
        FilterByCondition(
            values={"winner": ["model_1", "tie", "model_2"]}, condition="in"
        ),
        MapInstanceValues(
            mappers={
                "winner": {"model_1": "choice_a", "model_2": "choice_b", "tie": "tie"}
            }
        ),
        RenameFields(
            field_to_field={
                "model_input": "question",
                "model_1_output": "answer_a",
                "model_2_output": "answer_b",
                "reference": "reference_answer",
                "category": "group",
            }
        ),
        LiteralEval("question", to_field="question"),
        CopyFields(field_to_field={"question/0": "question"}),
        LiteralEval("answer_a", to_field="answer_a"),
        CopyFields(field_to_field={"answer_a/0": "answer_a"}),
        LiteralEval("answer_b", to_field="answer_b"),
        CopyFields(field_to_field={"answer_b/0": "answer_b"}),
        LiteralEval("reference_answer", to_field="reference_answer"),
        CopyFields(field_to_field={"reference_answer/0": "reference_answer"}),
    ],
    task="tasks.response_assessment.pairwise_comparison.single_turn_with_reference",
    templates=[
        "templates.response_assessment.pairwise_comparison.mt_bench_single_turn_with_reference_with_shuffle"
    ],
)

test_card(card, demos_taken_from="test", strict=False, loader_limit=1000)
add_to_catalog(
    card,
    "cards.mt_bench.response_assessment.pairwise_comparison.single_turn_with_reference_gpt4_judgement",
    overwrite=True,
)
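
The commit history above also mentions an MT-Bench pairwise processor for the judge's verdicts. That processor is not part of this excerpt, but MT-Bench pairwise judging conventionally asks the judge to emit [[A]], [[B]], or [[C]], which then has to be mapped onto the same choice_a / choice_b / tie labels used by these cards. A hypothetical extraction step (an assumption, not the PR's actual processor) could look like:

import re


def extract_pairwise_verdict(judge_output: str) -> str:
    """Map an MT-Bench style verdict ([[A]]/[[B]]/[[C]]) to the card's labels."""
    match = re.search(r"\[\[([ABC])\]\]", judge_output)
    if match is None:
        return "tie"  # assumed fallback when no verdict marker is found
    return {"A": "choice_a", "B": "choice_b", "C": "tie"}[match.group(1)]


# e.g. extract_pairwise_verdict("... final verdict: [[A]]") -> "choice_a"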