Add simple LLM as a judge example, showing how to use it without installation
eladven committed Jul 1, 2024
1 parent 87c3802 commit af92b20
Showing 8 changed files with 212 additions and 77 deletions.
10 changes: 8 additions & 2 deletions docs/docs/examples.rst
@@ -48,13 +48,19 @@ Each example is a self contained python file that you can run and later modify.
- Description
- Link to code
- Related documentation
* - Evaluate your question-answering dataset
* - Evaluate an existing question-answering dataset from the Unitxt catalog
- Demonstrates how to evaluate an existing QA dataset (SQuAD) using the HuggingFace
datasets and evaluate APIs, with no installation required, by using a predefined LLM as a judge metric.
- `code <https://github.com/IBM/unitxt/blob/main/examples/evaluate_dataset_by_llm_as_judge_no_install.py>`_
- | :ref:`Evaluating datasets <evaluating_datasets>`.
| :ref:`LLM as a Judge Metrics Guide <llm_as_judge>`.
* - Evaluate your question-answering dataset
- Demonstrates how to evaluate a user-provided QA dataset in a standalone file, using a user-defined task and template. In addition, it shows how to define an LLM as a judge metric, specify the template it uses to produce the input to the judge, and select the judge model and platform.
- `code <https://github.com/IBM/unitxt/blob/main/examples/standalone_evaluation_llm_as_judge.py>`_
- | :ref:`LLM as a Judge Metrics Guide <llm_as_judge>`.
* - Evaluate an existing summarization dataset from the catalog with LLM as judge
- Demonstrates how to evaluate a document summarization dataset by defining an LLM as a judge metric, specifying the template it uses to produce the input to the judge, and selecting the judge model and platform.
- `code <https://github.com/IBM/unitxt/blob/main/examples/evaluation_summarization_dataset_llm_as_judge>`_
- `code <https://github.com/IBM/unitxt/blob/main/examples/evaluation_summarization_dataset_llm_as_judge.py>`_
- | :ref:`LLM as a Judge Metrics Guide <llm_as_judge>`.


43 changes: 43 additions & 0 deletions examples/evaluate_dataset_by_llm_as_judge_no_install.py
@@ -0,0 +1,43 @@
from datasets import load_dataset
from unitxt import get_logger, get_settings
from unitxt.api import evaluate
from unitxt.inference import (
HFPipelineBasedInferenceEngine,
)
from unitxt.text_utils import print_dict

logger = get_logger()
settings = get_settings()
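# Permit loading assets that unitxt cannot verify (an assumed explanation: the squad card
# loads a HF dataset script, hence also the trust_remote_code flag below).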
settings.allow_unverified_code = True

# Use the HF load_dataset API to load the SQuAD QA dataset with a standard QA template from the catalog.
# We set loader_limit to 20 to reduce download time.
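# The metrics field in the recipe string replaces the default metrics with a predefined
# LLM as a judge metric from the catalog (a Llama 3 70B judge served via IBM GenAI),
# so no metric needs to be defined locally.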
test_dataset = load_dataset(
"unitxt/data",
"card=cards.squad,template=templates.qa.with_context.simple,metrics=[metrics.llm_as_judge.rating.llama_3_70b_instruct_ibm_genai_template_generic_single_turn],loader_limit=20",
trust_remote_code=True,
split="test",
)

# Run inference with an HF model to get predictions.
model_name = "google/flan-t5-base"
inference_model = HFPipelineBasedInferenceEngine(
model_name=model_name, max_new_tokens=32
)
predictions = inference_model.infer(test_dataset)

# Evaluate the predictions using the defined metric.
evaluated_dataset = evaluate(predictions=predictions, data=test_dataset)

# Print results
for instance in evaluated_dataset:
    print_dict(
        instance,
        keys_to_print=[
            "source",
            "prediction",
            "processed_prediction",
            "references",
            "score",
        ],
    )
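Besides the per-instance printout, the aggregate judge score can be read from the global score dict attached to each evaluated instance; a minimal sketch, assuming the standard structure of unitxt's evaluation output:

# All instances carry the same global scores, so reading them from the first instance is enough.
global_scores = evaluated_dataset[0]["score"]["global"]
print(global_scores["score_name"], global_scores["score"])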
@@ -0,0 +1,84 @@
from unitxt import get_logger
from unitxt.api import evaluate, load_dataset
from unitxt.inference import (
HFPipelineBasedInferenceEngine,
IbmGenAiInferenceEngine,
IbmGenAiInferenceEngineParams,
)
from unitxt.llm_as_judge import LLMAsJudge
from unitxt.templates import InputOutputTemplate
from unitxt.text_utils import print_dict

logger = get_logger()
# First, we define the judge template.
judge_summary_rating_template = InputOutputTemplate(
instruction="Please act as an impartial judge and evaluate if the assistant's summary summarise well the given text.\n"
'You must respond according the following format: "[[rate]] - explanation".\n'
'Were the rate is a score between 0 to 10 (10 for great summary, 0 for a very poor one)".\n'
"The explanation describe shortly why you decided to give the rank you chosen.\n"
"Please make sure to start with your rank ([[rank]]) before anything else.\n"
"For example: [[9]] The summary catches the main text ideas."
".\n\n",
input_format="[Text:\n{model_input}\n\n" "Assistant's summary:\n{model_output}\n",
output_format="[[{rating}]]",
postprocessors=[
r"processors.extract_mt_bench_rating_judgment",
],
)

# Second, we define the inference engine used for the judge, with the preferred model and platform.
# To run the judge locally with a HuggingFace pipeline, uncomment these lines instead:
# platform = "hf"
# model_name = "google/flan-t5-large"
# inference_model = HFPipelineBasedInferenceEngine(
#     model_name=model_name, max_new_tokens=256, use_fp16=True
# )
#
# The configuration below runs the judge through the IBM GenAI APIs:
platform = "ibm_gen_ai"
model_name = "meta-llama/llama-3-70b-instruct"
gen_params = IbmGenAiInferenceEngineParams(max_new_tokens=512)
inference_model = IbmGenAiInferenceEngine(
model_name=model_name, parameters=gen_params
)
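# Note: running the IBM GenAI engine requires valid credentials, typically supplied through the
# GENAI_KEY environment variable (an assumption about the local setup, not shown in this example).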

# Third, we define the LLM as a judge metric, using the inference engine and judge template defined above.
llm_judge_metric = LLMAsJudge(
inference_model=inference_model,
template=judge_summary_rating_template,
task="rating.single_turn",
main_score=f"llm_judge_{model_name.split('/')[1].replace('-', '_')}_{platform}",
strip_system_prompt_and_format_from_inputs=False,
)
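# With the settings above, the resulting main score name is
# "llm_judge_llama_3_70b_instruct_ibm_gen_ai".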

# Load the XSUM summarization dataset from the catalog, with the LLM as a judge metric defined above.
dataset = load_dataset(
card="cards.xsum",
template="templates.summarization.abstractive.formal",
metrics=[llm_judge_metric],
loader_limit=20,
)

test_dataset = dataset["test"]

# Run inference with an HF model to get the summaries to be judged.
model_name = "google/flan-t5-base"
inference_model = HFPipelineBasedInferenceEngine(
model_name=model_name, max_new_tokens=32
)
predictions = inference_model.infer(test_dataset)

# Evaluate the predictions using the defined metric.
evaluated_dataset = evaluate(predictions=predictions, data=test_dataset)

# Print results
for instance in evaluated_dataset:
    print_dict(
        instance,
        keys_to_print=[
            "source",
            "prediction",
            "processed_prediction",
            "references",
            "score",
        ],
    )
75 changes: 0 additions & 75 deletions examples/use_llm_as_judge_metric.py

This file was deleted.

@@ -0,0 +1,31 @@
from unitxt import add_to_catalog
from unitxt.inference import (
IbmGenAiInferenceEngine,
IbmGenAiInferenceEngineParams,
)
from unitxt.llm_as_judge import LLMAsJudge

model = "meta-llama/llama-3-70b-instruct"
format = "formats.llama3_chat"
template = "templates.response_assessment.rating.generic_single_turn"
task = "rating.single_turn"

gen_params = IbmGenAiInferenceEngineParams(max_new_tokens=252)
inference_model = IbmGenAiInferenceEngine(model_name=model, parameters=gen_params)
model_label = model.split("/")[1].replace("-", "_").replace(".", ",").lower()
model_label = f"{model_label}_ibm_genai"
template_label = template.split(".")[-1]
metric_label = f"{model_label}_template_{template_label}"
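# For this model and template, metric_label resolves to
# "llama_3_70b_instruct_ibm_genai_template_generic_single_turn".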
metric = LLMAsJudge(
inference_model=inference_model,
template=template,
task=task,
format=format,
main_score=metric_label,
)

add_to_catalog(
metric,
f"metrics.llm_as_judge.rating.{model_label}_template_{template_label}",
overwrite=True,
)
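Once registered, the judge metric is addressed purely by its catalog name, which is how the no-install example above picks it up; a minimal sketch of that usage, mirroring the recipe string from the example:

from datasets import load_dataset

# The metrics field references the catalog entry added above; no local metric code is needed.
test_dataset = load_dataset(
    "unitxt/data",
    "card=cards.squad,template=templates.qa.with_context.simple,"
    "metrics=[metrics.llm_as_judge.rating.llama_3_70b_instruct_ibm_genai_template_generic_single_turn],"
    "loader_limit=20",
    trust_remote_code=True,
    split="test",
)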
@@ -0,0 +1,22 @@
from unitxt import add_to_catalog
from unitxt.templates import InputOutputTemplate

add_to_catalog(
InputOutputTemplate(
instruction="Please act as an impartial judge and evaluate the quality of the response provided"
" by an AI assistant to the user input displayed below. Your evaluation should consider"
" factors such as the helpfulness, relevance, accuracy, depth, creativity, and level of"
" detail of the response. Begin your evaluation by providing a short explanation. Be as"
" objective as possible. After providing your explanation, you must rate the response"
' on a scale of 1 to 10 by strictly following this format: "[[rating]]", for example:'
' "Rating: [[5]]".\n\n',
input_format="[User input]\n{question}\n\n"
"[Assistant's respond]\n{answer}\n[The End of Assistant's respond]",
output_format="[[{rating}]]",
postprocessors=[
r"processors.extract_mt_bench_rating_judgment",
],
),
"templates.response_assessment.rating.generic_single_turn",
overwrite=True,
)
@@ -0,0 +1,15 @@
{
"__type__": "llm_as_judge",
"inference_model": {
"__type__": "ibm_gen_ai_inference_engine",
"model_name": "meta-llama/llama-3-70b-instruct",
"parameters": {
"__type__": "ibm_gen_ai_inference_engine_params",
"max_new_tokens": 252
}
},
"template": "templates.response_assessment.rating.generic_single_turn",
"task": "rating.single_turn",
"format": "formats.llama3_chat",
"main_score": "llama_3_70b_instruct_ibm_genai_template_generic_single_turn"
}
@@ -0,0 +1,9 @@
{
"__type__": "input_output_template",
"instruction": "Please act as an impartial judge and evaluate the quality of the response provided by an AI assistant to the user input displayed below. Your evaluation should consider factors such as the helpfulness, relevance, accuracy, depth, creativity, and level of detail of the response. Begin your evaluation by providing a short explanation. Be as objective as possible. After providing your explanation, you must rate the response on a scale of 1 to 10 by strictly following this format: \"[[rating]]\", for example: \"Rating: [[5]]\".\n\n",
"input_format": "[User input]\n{question}\n\n[Assistant's respond]\n{answer}\n[The End of Assistant's respond]",
"output_format": "[[{rating}]]",
"postprocessors": [
"processors.extract_mt_bench_rating_judgment"
]
}
