
Changes to support pass@k evaluation on the HumanEval dataset #1180
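
For context: pass@k, the metric this PR adds, is estimated as in the HumanEval paper by generating n >= k candidate solutions per task, counting the c candidates that pass the task's unit tests, and averaging 1 - C(n-c, k) / C(n, k) over tasks. Below is a minimal sketch of that per-task estimator (the helper name is mine and the snippet is illustrative only; in the PR the actual computation, including sandboxed execution of the tests, is delegated to the code_eval metric from Hugging Face evaluate):

from math import comb


def pass_at_k_estimate(n: int, c: int, k: int) -> float:
    """Unbiased pass@k estimator for a single task.

    n: number of generated candidate solutions (n >= k)
    c: number of those candidates that pass the task's unit tests
    """
    if n - c < k:
        # every possible size-k sample contains at least one passing candidate
        return 1.0
    return 1.0 - comb(n - c, k) / comb(n, k)


# Example: 10 candidates generated for a task, 3 of them pass the tests
print(pass_at_k_estimate(n=10, c=3, k=1))   # 0.3
print(pass_at_k_estimate(n=10, c=3, k=10))  # 1.0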

Draft: wants to merge 1 commit into base: main
168 changes: 165 additions & 3 deletions src/deepsparse/transformers/eval_downstream.py
@@ -73,6 +73,106 @@


from datasets import load_dataset, load_metric # isort: skip
from evaluate import load
import json
import re
import time

def truncate(completion):
    # completion is the prediction generated by the codegen pipeline; cut it off at
    # the second top-level print statement or function definition to drop extra code
    # the model keeps generating after the solution
    prints = list(re.finditer('^print', completion, re.MULTILINE))
    if len(prints) > 1:
        completion = completion[:prints[1].start()]

    defs = list(re.finditer('^def', completion, re.MULTILINE))
    if len(defs) > 1:
        completion = completion[:defs[1].start()]

    return completion


def process_unit_test(sample):
    # sample is one row from the HumanEval dataset; extract the name of the function
    # defined in the prompt (also available as sample["entry_point"]) and append a
    # call to the dataset's check() helper so the unit test actually runs
    unit_test = sample["test"]
    function_start = sample["prompt"].find("def") + 4
    function_end = sample["prompt"][function_start:].find("(")
    function_name = sample["prompt"][function_start:function_start + function_end]
    unit_test = unit_test + f"check({function_name})\n"

    return unit_test


def human_eval(args, dataset_name="openai_humaneval"):
    # n is the number of predictions to generate for each task, where n >= k
    # (in the original paper k = [1, 10, 100], with k <= n)

    # temperature is the sampling temperature to use for each prediction
    # (in the original paper temperatures = [0.2, 0.6, 0.8])

    # note: the original paper uses nucleus sampling, which is not supported here
    # but should be easy to implement

    import os

    os.environ["HF_ALLOW_CODE_EVAL"] = "1"
    os.environ["TOKENIZERS_PARALLELISM"] = "true"
    os.environ["CUDA_VISIBLE_DEVICES"] = "0"

    text_generation = Pipeline.create(
        task="codegen",
        model_path=args.model_path,
        engine_type=args.engine,
        num_cores=args.num_cores,
        sequence_length=428,
        prompt_processing_sequence_length=428,
        max_generated_tokens=256,
        deterministic=False,
        sampling_temperature=args.temperature,
    )

    # Load the full dataset in streaming mode to facilitate subset creation
    full_dataset = load_dataset(dataset_name, split="test", streaming=True)

    # Default to the total length of the dataset (164 tasks) if the user hasn't set a limit
    if not args.max_samples:
        args.max_samples = 164

    if args.benchmark_humaneval:
        # A selection of tasks from the HumanEval dataset for faster evaluation;
        # the selection tries to cover a variety of prompt token lengths
        benchmark_problems_tokenlen = {
            "HumanEval/83": 46, "HumanEval/35": 76, "HumanEval/22": 95,
            "HumanEval/146": 112, "HumanEval/77": 121, "HumanEval/33": 143,
            "HumanEval/41": 160, "HumanEval/113": 178, "HumanEval/72": 236,
            "HumanEval/115": 334, "HumanEval/129": 428,
        }
        print("Creating Benchmark Dataset")
        dataset_subset = full_dataset.filter(lambda x: x["task_id"] in benchmark_problems_tokenlen)
        dataset_subset_len = len(benchmark_problems_tokenlen)
    else:
        # Create a subset for evaluation, starting at the "start" index and taking
        # up to "max_samples" samples
        print("Creating Subset from Dataset")
        temp_dataset_subset = full_dataset.skip(args.start)
        dataset_subset = temp_dataset_subset.take(args.max_samples)
        dataset_subset_len = args.max_samples
    references = []
    predictions = []

    for idx, sample in _enumerate_progress(dataset_subset, dataset_subset_len):
        sample_prompt = sample["prompt"]
        # print(f"\n sample_prompt:\n {sample_prompt}")
        sample_test = process_unit_test(sample)
        sample_task_id = sample["task_id"]
        print(f"sample_task_id: {sample_task_id}")
        # print(f"\n sample_test: \n {sample_test}")
        sample_predictions = text_generation(sequences=[sample_prompt] * args.n_solutions)
        sample_predictions = sample_predictions.sequences
        for i in range(args.n_solutions):
            # prepend the prompt so each candidate is a complete, runnable function
            sample_predictions[i] = truncate(sample_prompt + sample_predictions[i])

        references.append(sample_test)
        predictions.append(sample_predictions)

    return references, predictions


def perplexity_eval(args, batch_size=16, dataset_name="openai_humaneval"):
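
To make the two new helpers above (truncate and process_unit_test) concrete, here is a hypothetical HumanEval-style row and what each helper produces for it. The field values are made up; only the structure mirrors the dataset:

# Hypothetical HumanEval-style row (values are illustrative, not taken from the dataset)
sample = {
    "task_id": "HumanEval/0",
    "prompt": 'def add(a: int, b: int) -> int:\n    """Return the sum of a and b."""\n',
    "test": "def check(candidate):\n    assert candidate(1, 2) == 3\n",
}

# process_unit_test extracts the function name from the prompt ("add") and appends
# a call to the dataset's check() helper so the unit test actually runs:
print(process_unit_test(sample))
# def check(candidate):
#     assert candidate(1, 2) == 3
# check(add)

# truncate cuts a generated completion at the second top-level def (or print),
# dropping extra functions the model keeps generating after the target solution:
completion = "    return a + b\n\ndef unrelated():\n    pass\n"
print(truncate(sample["prompt"] + completion))
# def add(a: int, b: int) -> int:
#     """Return the sum of a and b."""
#     return a + b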
@@ -109,6 +209,25 @@ def perplexity_eval(args, batch_size=16, dataset_name="openai_humaneval"):
    return perplexity_metrics


def select_openai_humaneval_method(args):
    if args.humaneval_method == "pass_at_k":
        start_time = time.time()
        references, predictions = human_eval(args)
        code_eval = load("code_eval")
        pass_at_k, results = code_eval.compute(
            references=references, predictions=predictions, k=[1, 2, 10]
        )
        print(f"\nopenai_humaneval evaluation results: {pass_at_k}")
        end_time = time.time()
        eval_time = f"Evaluation time: {end_time - start_time}\n"
        file_name_detailed_result = (
            f"result_humaneval_{args.start}_{args.start + args.max_samples - 1}"
        )
        with open(file_name_detailed_result, "w") as fp:
            fp.write(eval_time)
            json.dump(pass_at_k, fp)
            json.dump(results, fp)
    else:
        perplexity_metrics = perplexity_eval(args)
        return perplexity_metrics


def qa_eval(args, dataset_name="squad"):
    # load validation dataset and eval tool
    dataset = load_dataset(dataset_name)["validation"]
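
The code_eval metric used by select_openai_humaneval_method expects one reference test program per task and a list of candidate programs per task, and it only runs when HF_ALLOW_CODE_EVAL=1 is set because it executes model-generated code. Below is a minimal, self-contained sketch of the same call pattern (the task and candidate strings are toy examples, not from HumanEval):

import os

from evaluate import load

# code_eval executes model-generated code, so it must be explicitly enabled
os.environ["HF_ALLOW_CODE_EVAL"] = "1"

code_eval = load("code_eval")

# one reference test per task; one list of candidate solutions per task
references = ["assert add(1, 2) == 3"]
predictions = [[
    "def add(a, b):\n    return a + b",  # passes the test
    "def add(a, b):\n    return a - b",  # fails the test
]]

pass_at_k, results = code_eval.compute(
    references=references, predictions=predictions, k=[1, 2]
)
print(pass_at_k)  # expected: {'pass@1': 0.5, 'pass@2': 1.0}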
@@ -474,7 +593,9 @@ def _split_train_val(train_dataset, val_ratio, seed=42):
    "imdb": imdb_eval,
    "conll2003": conll2003_eval,
    "go_emotions": go_emotions_eval,
    "openai_humaneval": perplexity_eval,
    "openai_humaneval": select_openai_humaneval_method,
}


@@ -606,6 +727,47 @@ def parse_args():
        default=False,
    )

    parser.add_argument(
        "--humaneval-method",
        default="perplexity",
        choices=["perplexity", "pass_at_k"],
        help="Whether to run perplexity evaluation or pass@k evaluation on the "
        "openai_humaneval dataset. Default is perplexity.",
        type=str,
    )

    parser.add_argument(
        "--n-solutions",
        help="The total number of solutions to generate for each code prompt of the "
        "openai_humaneval dataset. Default is 1.",
        type=int,
        default=1,
    )

    parser.add_argument(
        "--temperature",
        help="Used with the openai_humaneval dataset: the sampling temperature to use. "
        "Default is 0.8.",
        type=float,
        default=0.8,
    )

    parser.add_argument(
        "--benchmark-humaneval",
        help="Set to evaluate on a small benchmark subset of the openai_humaneval "
        "dataset. Default is False.",
        default=False,
        action="store_true",
    )

    parser.add_argument(
        "--start",
        help="The dataset index to start evaluation from. Used only with the "
        "openai_humaneval dataset when the evaluation method is pass_at_k and "
        "parallel processing is needed. Default is 0.",
        type=int,
        default=0,
    )

    return parser.parse_args()


@@ -629,8 +791,8 @@ def _main(args):
print(f"\nmnli eval results: {mnli_metrics}")
else:
metrics = SUPPORTED_DATASETS[dataset](args)

print(f"\n{dataset} eval results: {metrics.compute()}")
if not (dataset == "openai_humaneval" and args.humaneval_method == "pass_at_k"):
print(f"\n{dataset} eval results: {metrics.compute()}")


def main():