Commit: Add a simple LLM-as-a-judge example of using it without installation
Showing 8 changed files with 212 additions and 77 deletions.
@@ -0,0 +1,43 @@
from datasets import load_dataset
from unitxt import get_logger, get_settings
from unitxt.api import evaluate
from unitxt.inference import (
    HFPipelineBasedInferenceEngine,
)
from unitxt.text_utils import print_dict

logger = get_logger()
settings = get_settings()
settings.allow_unverified_code = True

# Use the HF load_dataset API to load the SQuAD QA dataset, using the standard template from the catalog.
# We set loader_limit to 20 to reduce download time.
test_dataset = load_dataset(
    "unitxt/data",
    "card=cards.squad,template=templates.qa.with_context.simple,metrics=[metrics.llm_as_judge.rating.llama_3_70b_instruct_ibm_genai_template_generic_single_turn],loader_limit=20",
    trust_remote_code=True,
    split="test",
)

# Run inference with a model to get predictions.
model_name = "google/flan-t5-base"
inference_model = HFPipelineBasedInferenceEngine(
    model_name=model_name, max_new_tokens=32
)
predictions = inference_model.infer(test_dataset)

# Evaluate the predictions using the metric defined above.
evaluated_dataset = evaluate(predictions=predictions, data=test_dataset)

# Print the results.
for instance in evaluated_dataset:
    print_dict(
        instance,
        keys_to_print=[
            "source",
            "prediction",
            "processed_prediction",
            "references",
            "score",
        ],
    )
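As a usage note (not part of this commit): the aggregate result can typically be read off any single evaluated instance. A minimal sketch, assuming unitxt's usual score layout, where each instance carries a "score" dict with "global" (dataset-level) and "instance" (per-example) entries:

# A sketch, assuming unitxt's usual score layout; field names are
# assumptions, not taken from this commit.
global_scores = evaluated_dataset[0]["score"]["global"]
print(global_scores["score_name"], global_scores["score"])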
examples/evaluation_summarization_dataset_llm_as_judge_with_refs.py (84 additions, 0 deletions)
@@ -0,0 +1,84 @@
from unitxt import get_logger
from unitxt.api import evaluate, load_dataset
from unitxt.inference import (
    HFPipelineBasedInferenceEngine,
    IbmGenAiInferenceEngine,
    IbmGenAiInferenceEngineParams,
)
from unitxt.llm_as_judge import LLMAsJudge
from unitxt.templates import InputOutputTemplate
from unitxt.text_utils import print_dict

logger = get_logger()

# First, we define the judge template.
judge_summary_rating_template = InputOutputTemplate(
    instruction="Please act as an impartial judge and evaluate whether the assistant's summary summarizes the given text well.\n"
    'You must respond according to the following format: "[[rating]] - explanation".\n'
    "The rating is a score between 0 and 10 (10 for a great summary, 0 for a very poor one).\n"
    "The explanation should briefly describe why you decided on the rating you chose.\n"
    "Please make sure to start with your rating ([[rating]]) before anything else.\n"
    "For example: [[9]] The summary captures the main ideas of the text.\n\n",
    input_format="Text:\n{model_input}\n\nAssistant's summary:\n{model_output}\n",
    output_format="[[{rating}]]",
    postprocessors=[
        r"processors.extract_mt_bench_rating_judgment",
    ],
)

# Second, we define the inference engine used by the judge, with the preferred model and platform.
# Below we infer with the IbmGenAI APIs; to run the judge locally instead,
# use the commented-out HF engine:
# platform = "hf"
# model_name = "google/flan-t5-large"
# inference_model = HFPipelineBasedInferenceEngine(
#     model_name=model_name, max_new_tokens=256, use_fp16=True
# )
platform = "ibm_gen_ai"
model_name = "meta-llama/llama-3-70b-instruct"
gen_params = IbmGenAiInferenceEngineParams(max_new_tokens=512)
inference_model = IbmGenAiInferenceEngine(model_name=model_name, parameters=gen_params)

# Third, we define the metric as LLM as a judge, with the desired platform and model.
llm_judge_metric = LLMAsJudge(
    inference_model=inference_model,
    template=judge_summary_rating_template,
    task="rating.single_turn",
    main_score=f"llm_judge_{model_name.split('/')[1].replace('-', '_')}_{platform}",
    strip_system_prompt_and_format_from_inputs=False,
)

# Load the XSUM dataset, with the above metric.
dataset = load_dataset(
    card="cards.xsum",
    template="templates.summarization.abstractive.formal",
    metrics=[llm_judge_metric],
    loader_limit=20,
)

test_dataset = dataset["test"]

# Run inference with a model to get predictions.
model_name = "google/flan-t5-base"
inference_model = HFPipelineBasedInferenceEngine(
    model_name=model_name, max_new_tokens=32
)
predictions = inference_model.infer(test_dataset)

# Evaluate the predictions using the defined metric.
evaluated_dataset = evaluate(predictions=predictions, data=test_dataset)

# Print the results.
for instance in evaluated_dataset:
    print_dict(
        instance,
        keys_to_print=[
            "source",
            "prediction",
            "processed_prediction",
            "references",
            "score",
        ],
    )
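A worked example of the main_score name built above (illustration, not part of the commit): the f-string is evaluated while model_name still holds the judge model, before it is reused for flan-t5, so the judge's rating is reported under:

model_name = "meta-llama/llama-3-70b-instruct"
platform = "ibm_gen_ai"
print(f"llm_judge_{model_name.split('/')[1].replace('-', '_')}_{platform}")
# -> llm_judge_llama_3_70b_instruct_ibm_gen_ai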
This file was deleted.
prepare/metrics/llm_as_judge/rating/llama_3_ibm_genai_generic_template.py (31 additions, 0 deletions)
@@ -0,0 +1,31 @@
from unitxt import add_to_catalog
from unitxt.inference import (
    IbmGenAiInferenceEngine,
    IbmGenAiInferenceEngineParams,
)
from unitxt.llm_as_judge import LLMAsJudge

model = "meta-llama/llama-3-70b-instruct"
format = "formats.llama3_chat"
template = "templates.response_assessment.rating.generic_single_turn"
task = "rating.single_turn"

gen_params = IbmGenAiInferenceEngineParams(max_new_tokens=252)
inference_model = IbmGenAiInferenceEngine(model_name=model, parameters=gen_params)

# Build the catalog label from the model and template names ("." is replaced
# with "," so it does not clash with the "." separator in catalog names).
model_label = model.split("/")[1].replace("-", "_").replace(".", ",").lower()
model_label = f"{model_label}_ibm_genai"
template_label = template.split(".")[-1]
metric_label = f"{model_label}_template_{template_label}"

metric = LLMAsJudge(
    inference_model=inference_model,
    template=template,
    task=task,
    format=format,
    main_score=metric_label,
)

add_to_catalog(
    metric,
    f"metrics.llm_as_judge.rating.{metric_label}",
    overwrite=True,
)
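As a worked example (values as in the script above), the label construction yields exactly the catalog entry this commit adds, which is also the metric referenced by name in the first example:

model = "meta-llama/llama-3-70b-instruct"
template = "templates.response_assessment.rating.generic_single_turn"
model_label = model.split("/")[1].replace("-", "_").replace(".", ",").lower() + "_ibm_genai"
template_label = template.split(".")[-1]
print(f"metrics.llm_as_judge.rating.{model_label}_template_{template_label}")
# -> metrics.llm_as_judge.rating.llama_3_70b_instruct_ibm_genai_template_generic_single_turn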
prepare/templates/response_assessment/rating/generic_single_turn.py (22 additions, 0 deletions)
@@ -0,0 +1,22 @@
from unitxt import add_to_catalog
from unitxt.templates import InputOutputTemplate

add_to_catalog(
    InputOutputTemplate(
        instruction="Please act as an impartial judge and evaluate the quality of the response provided"
        " by an AI assistant to the user input displayed below. Your evaluation should consider"
        " factors such as the helpfulness, relevance, accuracy, depth, creativity, and level of"
        " detail of the response. Begin your evaluation by providing a short explanation. Be as"
        " objective as possible. After providing your explanation, you must rate the response"
        ' on a scale of 1 to 10 by strictly following this format: "[[rating]]", for example:'
        ' "Rating: [[5]]".\n\n',
        input_format="[User input]\n{question}\n\n"
        "[Assistant's response]\n{answer}\n[The End of Assistant's response]",
        output_format="[[{rating}]]",
        postprocessors=[
            r"processors.extract_mt_bench_rating_judgment",
        ],
    ),
    "templates.response_assessment.rating.generic_single_turn",
    overwrite=True,
)
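As an aside (not part of the commit), the postprocessor above pulls the bracketed score out of the judge's reply. A minimal sketch of an equivalent extractor; the actual processors.extract_mt_bench_rating_judgment implementation may differ:

import re

def extract_rating(judgment: str) -> float:
    # Find an MT-Bench-style "[[5]]" marker and return it as a number;
    # fall back to 0.0 when no rating is found.
    match = re.search(r"\[\[(\d+\.?\d*)\]\]", judgment)
    return float(match.group(1)) if match else 0.0

print(extract_rating("Rating: [[5]] The response is adequate."))  # 5.0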
...rics/llm_as_judge/rating/llama_3_70b_instruct_ibm_genai_template_generic_single_turn.json (15 additions, 0 deletions)
@@ -0,0 +1,15 @@
{
    "__type__": "llm_as_judge",
    "inference_model": {
        "__type__": "ibm_gen_ai_inference_engine",
        "model_name": "meta-llama/llama-3-70b-instruct",
        "parameters": {
            "__type__": "ibm_gen_ai_inference_engine_params",
            "max_new_tokens": 252
        }
    },
    "template": "templates.response_assessment.rating.generic_single_turn",
    "task": "rating.single_turn",
    "format": "formats.llama3_chat",
    "main_score": "llama_3_70b_instruct_ibm_genai_template_generic_single_turn"
}
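A sketch of retrieving this catalog entry programmatically (not part of the commit; fetch_artifact and its tuple return value are assumptions about unitxt's artifact module):

# Assumed helper: unitxt.artifact.fetch_artifact, returning (artifact, source).
from unitxt.artifact import fetch_artifact

metric, _ = fetch_artifact(
    "metrics.llm_as_judge.rating.llama_3_70b_instruct_ibm_genai_template_generic_single_turn"
)
print(type(metric).__name__)  # expected: LLMAsJudge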
src/unitxt/catalog/templates/response_assessment/rating/generic_single_turn.json (9 additions, 0 deletions)
@@ -0,0 +1,9 @@
{
    "__type__": "input_output_template",
    "instruction": "Please act as an impartial judge and evaluate the quality of the response provided by an AI assistant to the user input displayed below. Your evaluation should consider factors such as the helpfulness, relevance, accuracy, depth, creativity, and level of detail of the response. Begin your evaluation by providing a short explanation. Be as objective as possible. After providing your explanation, you must rate the response on a scale of 1 to 10 by strictly following this format: \"[[rating]]\", for example: \"Rating: [[5]]\".\n\n",
    "input_format": "[User input]\n{question}\n\n[Assistant's response]\n{answer}\n[The End of Assistant's response]",
    "output_format": "[[{rating}]]",
    "postprocessors": [
        "processors.extract_mt_bench_rating_judgment"
    ]
}
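For illustration (not part of the commit), here is roughly what the input_format above renders to for one judged instance; the question and answer values are hypothetical:

input_format = (
    "[User input]\n{question}\n\n"
    "[Assistant's response]\n{answer}\n[The End of Assistant's response]"
)
print(input_format.format(
    question="What is the capital of France?",  # hypothetical values
    answer="Paris is the capital of France.",
))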