-
Notifications
You must be signed in to change notification settings - Fork 39
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add simple LLM as a judge example of using it without installation (#…
…968) * Add simple LLM as a judge example of using it without installation * Exclude example that requires GENAI key from tests
- Loading branch information
Showing
8 changed files
with
129 additions
and
77 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,43 @@ | ||
from datasets import load_dataset | ||
from unitxt import get_logger, get_settings | ||
from unitxt.api import evaluate | ||
from unitxt.inference import ( | ||
HFPipelineBasedInferenceEngine, | ||
) | ||
from unitxt.text_utils import print_dict | ||
|
||
# Set up logging and relax the safety check so catalog assets that ship
# remote code can be loaded.
logger = get_logger()
settings = get_settings()
settings.allow_unverified_code = True

# Load 20 instances of the squad QA dataset through the HF `load_dataset`
# API, rendered with the standard catalog template and scored by an
# LLM-as-judge rating metric.  loader_limit=20 keeps the download small.
dataset = load_dataset(
    "unitxt/data",
    "card=cards.squad,template=templates.qa.with_context.simple,metrics=[metrics.llm_as_judge.rating.llama_3_70b_instruct_ibm_genai_template_generic_single_turn],loader_limit=20",
    trust_remote_code=True,
    split="test",
)

# Generate predictions with a small local HF pipeline model.
engine = HFPipelineBasedInferenceEngine(
    model_name="google/flan-t5-base", max_new_tokens=32
)
predictions = engine.infer(dataset)

# Score the predictions with the metric attached to the dataset
# (the LLM-as-judge metric declared in the load_dataset recipe above).
results = evaluate(predictions=predictions, data=dataset)

# Report the fields of interest for every evaluated instance.
for instance in results:
    print_dict(
        instance,
        keys_to_print=[
            "source",
            "prediction",
            "processed_prediction",
            "references",
            "score",
        ],
    )
This file was deleted.
Oops, something went wrong.
31 changes: 31 additions & 0 deletions
31
prepare/metrics/llm_as_judge/rating/llama_3_ibm_genai_generic_template.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
from unitxt import add_to_catalog | ||
from unitxt.inference import ( | ||
IbmGenAiInferenceEngine, | ||
IbmGenAiInferenceEngineParams, | ||
) | ||
from unitxt.llm_as_judge import LLMAsJudge | ||
|
||
# Judge model and the catalog assets the metric is assembled from.
model = "meta-llama/llama-3-70b-instruct"
model_format = "formats.llama3_chat"  # chat format matching llama-3 instruct models
template = "templates.response_assessment.rating.generic_single_turn"
task = "rating.single_turn"

# Inference engine that runs the judge model on IBM GenAI.
gen_params = IbmGenAiInferenceEngineParams(max_new_tokens=252)
inference_model = IbmGenAiInferenceEngine(model_name=model, parameters=gen_params)

# Derive a catalog-safe label from the model id, e.g.
# "meta-llama/llama-3-70b-instruct" -> "llama_3_70b_instruct_ibm_genai".
# FIX: dots are mapped to "_" (the original mapped them to ",", which is
# not a valid character in a catalog identifier; no effect for this
# dot-free model name, but it breaks for names like "llama-3.1").
model_label = model.split("/")[1].replace("-", "_").replace(".", "_").lower()
model_label = f"{model_label}_ibm_genai"
template_label = template.split(".")[-1]
metric_label = f"{model_label}_template_{template_label}"

# LLM-as-judge metric: the judge rates each prediction on a 1-10 scale.
metric = LLMAsJudge(
    inference_model=inference_model,
    template=template,
    task=task,
    format=model_format,
    main_score=metric_label,
)

add_to_catalog(
    metric,
    f"metrics.llm_as_judge.rating.{metric_label}",
    overwrite=True,
)
22 changes: 22 additions & 0 deletions
22
prepare/templates/response_assessment/rating/generic_single_turn.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
from unitxt import add_to_catalog | ||
from unitxt.templates import InputOutputTemplate | ||
|
||
# Generic single-turn rating template for LLM-as-judge metrics: the judge
# scores one assistant response to one user input on a 1-10 scale.
judge_instruction = (
    "Please act as an impartial judge and evaluate the quality of the response provided"
    " by an AI assistant to the user input displayed below. Your evaluation should consider"
    " factors such as the helpfulness, relevance, accuracy, depth, creativity, and level of"
    " detail of the response. Begin your evaluation by providing a short explanation. Be as"
    " objective as possible. After providing your explanation, you must rate the response"
    ' on a scale of 1 to 10 by strictly following this format: "[[rating]]", for example:'
    ' "Rating: [[5]]".\n\n'
)

rating_template = InputOutputTemplate(
    instruction=judge_instruction,
    input_format=(
        "[User input]\n{question}\n\n"
        "[Assistant's respond]\n{answer}\n[The End of Assistant's respond]"
    ),
    output_format="[[{rating}]]",
    # Pulls the numeric "[[N]]" rating out of the judge's free-text verdict.
    postprocessors=[
        r"processors.extract_mt_bench_rating_judgment",
    ],
)

add_to_catalog(
    rating_template,
    "templates.response_assessment.rating.generic_single_turn",
    overwrite=True,
)
15 changes: 15 additions & 0 deletions
15
...rics/llm_as_judge/rating/llama_3_70b_instruct_ibm_genai_template_generic_single_turn.json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
{ | ||
"__type__": "llm_as_judge", | ||
"inference_model": { | ||
"__type__": "ibm_gen_ai_inference_engine", | ||
"model_name": "meta-llama/llama-3-70b-instruct", | ||
"parameters": { | ||
"__type__": "ibm_gen_ai_inference_engine_params", | ||
"max_new_tokens": 252 | ||
} | ||
}, | ||
"template": "templates.response_assessment.rating.generic_single_turn", | ||
"task": "rating.single_turn", | ||
"format": "formats.llama3_chat", | ||
"main_score": "llama_3_70b_instruct_ibm_genai_template_generic_single_turn" | ||
} |
9 changes: 9 additions & 0 deletions
9
src/unitxt/catalog/templates/response_assessment/rating/generic_single_turn.json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
{ | ||
"__type__": "input_output_template", | ||
"instruction": "Please act as an impartial judge and evaluate the quality of the response provided by an AI assistant to the user input displayed below. Your evaluation should consider factors such as the helpfulness, relevance, accuracy, depth, creativity, and level of detail of the response. Begin your evaluation by providing a short explanation. Be as objective as possible. After providing your explanation, you must rate the response on a scale of 1 to 10 by strictly following this format: \"[[rating]]\", for example: \"Rating: [[5]]\".\n\n", | ||
"input_format": "[User input]\n{question}\n\n[Assistant's respond]\n{answer}\n[The End of Assistant's respond]", | ||
"output_format": "[[{rating}]]", | ||
"postprocessors": [ | ||
"processors.extract_mt_bench_rating_judgment" | ||
] | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters