Add simple LLM as a judge example of using it without installation (#968)

* Add simple LLM as a judge example of using it without installation

* Exclude example that requires GENAI key from tests
eladven authored and gitMichal committed Jul 15, 2024
1 parent 0c7bed5 commit a5360ed
Showing 8 changed files with 129 additions and 77 deletions.
10 changes: 8 additions & 2 deletions docs/docs/examples.rst
@@ -48,13 +48,19 @@ Each example is a self contained python file that you can run and later modify.
- Description
- Link to code
- Related documentation
* - Evaluate your question-answering dataset
* - Evaluate an existing question-answering dataset from the Unitxt catalog
- Demonstrates how to evaluate an existing QA dataset (SQuAD) using the Huggingface
datasets and evaluate APIs, with no installation required, by using a predefined LLM as a judge metric.
- `code <https://github.com/IBM/unitxt/blob/main/examples/evaluate_dataset_by_llm_as_judge_no_install.py>`_
- | :ref:`Evaluating datasets <evaluating_datasets>`.
| :ref:`LLM as a Judge Metrics Guide <llm_as_judge>`.
* - Evaluate your question-answering dataset
- Demonstrates how to evaluate a user QA dataset in a standalone file using a user-defined task and template. In addition, it shows how to define an LLM as a judge metric, specify the template it uses to produce the input to the judge, and select the judge model and platform.
- `code <https://github.com/IBM/unitxt/blob/main/examples/standalone_evaluation_llm_as_judge.py>`_
- | :ref:`LLM as a Judge Metrics Guide <llm_as_judge>`.
* - Evaluate an existing summarization dataset from the catalog with LLM as judge
- Demonstrates how to evaluate a document summarization dataset by defining an LLM as a judge metric, specifying the template it uses to produce the input to the judge, and selecting the judge model and platform.
- `code <https://github.com/IBM/unitxt/blob/main/examples/evaluation_summarization_dataset_llm_as_judge>`_
- `code <https://github.com/IBM/unitxt/blob/main/examples/evaluation_summarization_dataset_llm_as_judge.py>`_
- | :ref:`LLM as a Judge Metrics Guide <llm_as_judge>`.


43 changes: 43 additions & 0 deletions examples/evaluate_dataset_by_llm_as_judge_no_install.py
@@ -0,0 +1,43 @@
from datasets import load_dataset
from unitxt import get_logger, get_settings
from unitxt.api import evaluate
from unitxt.inference import (
HFPipelineBasedInferenceEngine,
)
from unitxt.text_utils import print_dict

logger = get_logger()
settings = get_settings()
settings.allow_unverified_code = True

# Use the HF load_dataset API to load the SQuAD QA dataset with the standard template from the catalog.
# We set loader_limit to 20 to reduce download time.
test_dataset = load_dataset(
"unitxt/data",
"card=cards.squad,template=templates.qa.with_context.simple,metrics=[metrics.llm_as_judge.rating.llama_3_70b_instruct_ibm_genai_template_generic_single_turn],loader_limit=20",
trust_remote_code=True,
split="test",
)

# Run inference with a model to get predictions.
model_name = "google/flan-t5-base"
inference_model = HFPipelineBasedInferenceEngine(
model_name=model_name, max_new_tokens=32
)
predictions = inference_model.infer(test_dataset)

# Evaluate the predictions using the defined metric.
evaluated_dataset = evaluate(predictions=predictions, data=test_dataset)

# Print results
for instance in evaluated_dataset:
print_dict(
instance,
keys_to_print=[
"source",
"prediction",
"processed_prediction",
"references",
"score",
],
)
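
As a side note on reading the results: below is a rough sketch (not part of the commit) of pulling the aggregated judge score out of the evaluated dataset. It continues the script above and assumes unitxt's usual score layout, in which each evaluated instance carries a "score" dict with "global" (aggregated) and "instance" (per-example) entries:

# Sketch only -- assumes unitxt's usual score layout, where every evaluated
# instance holds a "score" dict with "global" and "instance" sub-dicts.
global_scores = evaluated_dataset[0]["score"]["global"]
print("Aggregated judge score:", global_scores["score"])

for instance in evaluated_dataset:
    # Per-instance judge rating produced by the LLM-as-judge metric.
    print(instance["score"]["instance"]["score"])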
75 changes: 0 additions & 75 deletions examples/use_llm_as_judge_metric.py

This file was deleted.

@@ -0,0 +1,31 @@
from unitxt import add_to_catalog
from unitxt.inference import (
IbmGenAiInferenceEngine,
IbmGenAiInferenceEngineParams,
)
from unitxt.llm_as_judge import LLMAsJudge

model = "meta-llama/llama-3-70b-instruct"
format = "formats.llama3_chat"
template = "templates.response_assessment.rating.generic_single_turn"
task = "rating.single_turn"

gen_params = IbmGenAiInferenceEngineParams(max_new_tokens=252)
inference_model = IbmGenAiInferenceEngine(model_name=model, parameters=gen_params)
model_label = model.split("/")[1].replace("-", "_").replace(".", ",").lower()
model_label = f"{model_label}_ibm_genai"
template_label = template.split(".")[-1]
metric_label = f"{model_label}_template_{template_label}"
metric = LLMAsJudge(
inference_model=inference_model,
template=template,
task=task,
format=format,
main_score=metric_label,
)

add_to_catalog(
metric,
f"metrics.llm_as_judge.rating.{model_label}_template_{template_label}",
overwrite=True,
)
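
Once registered, the metric is addressable by the catalog name built above; the prepare script derives the label from the model and template names precisely so that the catalog path doubles as the metric identifier in recipes. A short sketch of referencing it, mirroring the recipe string from the no-install example earlier in this commit:

from datasets import load_dataset

# Sketch only: reuses the catalog name registered above, mirroring the recipe
# string from examples/evaluate_dataset_by_llm_as_judge_no_install.py.
dataset = load_dataset(
    "unitxt/data",
    "card=cards.squad,template=templates.qa.with_context.simple,"
    "metrics=[metrics.llm_as_judge.rating.llama_3_70b_instruct_ibm_genai_template_generic_single_turn],"
    "loader_limit=20",
    trust_remote_code=True,
    split="test",
)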
@@ -0,0 +1,22 @@
from unitxt import add_to_catalog
from unitxt.templates import InputOutputTemplate

add_to_catalog(
InputOutputTemplate(
instruction="Please act as an impartial judge and evaluate the quality of the response provided"
" by an AI assistant to the user input displayed below. Your evaluation should consider"
" factors such as the helpfulness, relevance, accuracy, depth, creativity, and level of"
" detail of the response. Begin your evaluation by providing a short explanation. Be as"
" objective as possible. After providing your explanation, you must rate the response"
' on a scale of 1 to 10 by strictly following this format: "[[rating]]", for example:'
' "Rating: [[5]]".\n\n',
input_format="[User input]\n{question}\n\n"
"[Assistant's respond]\n{answer}\n[The End of Assistant's respond]",
output_format="[[{rating}]]",
postprocessors=[
r"processors.extract_mt_bench_rating_judgment",
],
),
"templates.response_assessment.rating.generic_single_turn",
overwrite=True,
)
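
The template instructs the judge to emit its verdict as "[[rating]]", and the listed postprocessor, processors.extract_mt_bench_rating_judgment, turns that string into a numeric score. Purely as an illustration (this is not the processor's actual implementation), the extraction amounts to something like:

import re

def extract_rating(judgment: str) -> float:
    # Illustrative sketch only -- not the real implementation of
    # processors.extract_mt_bench_rating_judgment. It pulls the number out of
    # judge output such as "The answer is concise and correct. Rating: [[8]]".
    match = re.search(r"\[\[(\d+(?:\.\d+)?)\]\]", judgment)
    return float(match.group(1)) if match else 0.0

print(extract_rating("The answer is concise and correct. Rating: [[8]]"))  # 8.0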
@@ -0,0 +1,15 @@
{
"__type__": "llm_as_judge",
"inference_model": {
"__type__": "ibm_gen_ai_inference_engine",
"model_name": "meta-llama/llama-3-70b-instruct",
"parameters": {
"__type__": "ibm_gen_ai_inference_engine_params",
"max_new_tokens": 252
}
},
"template": "templates.response_assessment.rating.generic_single_turn",
"task": "rating.single_turn",
"format": "formats.llama3_chat",
"main_score": "llama_3_70b_instruct_ibm_genai_template_generic_single_turn"
}
@@ -0,0 +1,9 @@
{
"__type__": "input_output_template",
"instruction": "Please act as an impartial judge and evaluate the quality of the response provided by an AI assistant to the user input displayed below. Your evaluation should consider factors such as the helpfulness, relevance, accuracy, depth, creativity, and level of detail of the response. Begin your evaluation by providing a short explanation. Be as objective as possible. After providing your explanation, you must rate the response on a scale of 1 to 10 by strictly following this format: \"[[rating]]\", for example: \"Rating: [[5]]\".\n\n",
"input_format": "[User input]\n{question}\n\n[Assistant's respond]\n{answer}\n[The End of Assistant's respond]",
"output_format": "[[{rating}]]",
"postprocessors": [
"processors.extract_mt_bench_rating_judgment"
]
}
1 change: 1 addition & 0 deletions tests/library/test_examples.py
@@ -36,6 +36,7 @@ def test_examples(self):
"standalone_evaluation_llm_as_judge.py",
"evaluation_summarization_dataset_llm_as_judge.py",
"evaluate_different_formats.py",
"evaluate_dataset_by_llm_as_judge_no_install.py",
]
for file in all_example_files:
logger.info(
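
The test body itself is truncated in this diff, so the following is only a hypothetical sketch of how such an exclusion list is typically applied: example scripts that need external credentials (here, an IBM GenAI key) are skipped rather than executed. The names and structure below are assumed, not taken from the repository.

from pathlib import Path

# Hypothetical sketch -- the real test body is truncated in this diff.
excluded = {
    "standalone_evaluation_llm_as_judge.py",
    "evaluation_summarization_dataset_llm_as_judge.py",
    "evaluate_different_formats.py",
    "evaluate_dataset_by_llm_as_judge_no_install.py",
}

for path in sorted(Path("examples").glob("*.py")):
    if path.name in excluded:
        continue  # requires API credentials that are not available in CI
    print(f"would run {path}")  # the real test executes each remaining example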
