
Changes to support pass@k evaluation on the HumanEval dataset #1180
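
For context: pass@k, the metric this PR adds, is estimated as in the HumanEval paper by generating n >= k candidate solutions per task, counting the c candidates that pass the task's unit tests, and averaging 1 - C(n-c, k) / C(n, k) over tasks. Below is a minimal sketch of that per-task estimator (the helper name is mine and the snippet is illustrative only; in the PR the actual computation, including sandboxed execution of the tests, is delegated to the code_eval metric from Hugging Face evaluate):

from math import comb


def pass_at_k_estimate(n: int, c: int, k: int) -> float:
    """Unbiased pass@k estimator for a single task.

    n: number of generated candidate solutions (n >= k)
    c: number of those candidates that pass the task's unit tests
    """
    if n - c < k:
        # every possible size-k sample contains at least one passing candidate
        return 1.0
    return 1.0 - comb(n - c, k) / comb(n, k)


# Example: 10 candidates generated for a task, 3 of them pass the tests
print(pass_at_k_estimate(n=10, c=3, k=1))   # 0.3
print(pass_at_k_estimate(n=10, c=3, k=10))  # 1.0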

Draft: wants to merge 1 commit into base: main
168 changes: 165 additions & 3 deletions src/deepsparse/transformers/eval_downstream.py
@@ -73,6 +73,106 @@


from datasets import load_dataset, load_metric # isort: skip
from evaluate import load
import json
import re
import time

def truncate(completion):
    # completion is the prediction generated by the codegen pipeline; cut it off at
    # the second top-level print statement or function definition to drop extra code
    # the model keeps generating after the solution
    prints = list(re.finditer('^print', completion, re.MULTILINE))
    if len(prints) > 1:
        completion = completion[:prints[1].start()]

    defs = list(re.finditer('^def', completion, re.MULTILINE))
    if len(defs) > 1:
        completion = completion[:defs[1].start()]

    return completion


def process_unit_test(sample):
    # sample is one row from the HumanEval dataset; extract the name of the function
    # defined in the prompt (also available as sample["entry_point"]) and append a
    # call to the dataset's check() helper so the unit test actually runs
    unit_test = sample["test"]
    function_start = sample["prompt"].find("def") + 4
    function_end = sample["prompt"][function_start:].find("(")
    function_name = sample["prompt"][function_start:function_start + function_end]
    unit_test = unit_test + f"check({function_name})\n"

    return unit_test


def human_eval(args, dataset_name="openai_humaneval"):
    # n is the number of predictions to generate for each task, where n >= k
    # (in the original paper k = [1, 10, 100], with k <= n)

    # temperature is the sampling temperature to use for each prediction
    # (in the original paper temperatures = [0.2, 0.6, 0.8])

    # note: the original paper uses nucleus sampling, which is not supported here
    # but should be easy to implement

    import os

    os.environ["HF_ALLOW_CODE_EVAL"] = "1"
    os.environ["TOKENIZERS_PARALLELISM"] = "true"
    os.environ["CUDA_VISIBLE_DEVICES"] = "0"

    text_generation = Pipeline.create(
        task="codegen",
        model_path=args.model_path,
        engine_type=args.engine,
        num_cores=args.num_cores,
        sequence_length=428,
        prompt_processing_sequence_length=428,
        max_generated_tokens=256,
        deterministic=False,
        sampling_temperature=args.temperature,
    )

    # Load the full dataset in streaming mode to facilitate subset creation
    full_dataset = load_dataset(dataset_name, split="test", streaming=True)

    # Default to the total length of the dataset (164 tasks) if the user hasn't set a limit
    if not args.max_samples:
        args.max_samples = 164

    if args.benchmark_humaneval:
        # A selection of tasks from the HumanEval dataset for faster evaluation;
        # the selection tries to cover a variety of prompt token lengths
        benchmark_problems_tokenlen = {
            "HumanEval/83": 46, "HumanEval/35": 76, "HumanEval/22": 95,
            "HumanEval/146": 112, "HumanEval/77": 121, "HumanEval/33": 143,
            "HumanEval/41": 160, "HumanEval/113": 178, "HumanEval/72": 236,
            "HumanEval/115": 334, "HumanEval/129": 428,
        }
        print("Creating Benchmark Dataset")
        dataset_subset = full_dataset.filter(lambda x: x["task_id"] in benchmark_problems_tokenlen)
        dataset_subset_len = len(benchmark_problems_tokenlen)
    else:
        # Create a subset for evaluation, starting at the "start" index and taking
        # up to "max_samples" samples
        print("Creating Subset from Dataset")
        temp_dataset_subset = full_dataset.skip(args.start)
        dataset_subset = temp_dataset_subset.take(args.max_samples)
        dataset_subset_len = args.max_samples
    references = []
    predictions = []

    for idx, sample in _enumerate_progress(dataset_subset, dataset_subset_len):
        sample_prompt = sample["prompt"]
        # print(f"\n sample_prompt:\n {sample_prompt}")
        sample_test = process_unit_test(sample)
        sample_task_id = sample["task_id"]
        print(f"sample_task_id: {sample_task_id}")
        # print(f"\n sample_test: \n {sample_test}")
        sample_predictions = text_generation(sequences=[sample_prompt] * args.n_solutions)
        sample_predictions = sample_predictions.sequences
        for i in range(args.n_solutions):
            # prepend the prompt so each candidate is a complete, runnable function
            sample_predictions[i] = truncate(sample_prompt + sample_predictions[i])

        references.append(sample_test)
        predictions.append(sample_predictions)

    return references, predictions


def perplexity_eval(args, batch_size=16, dataset_name="openai_humaneval"):
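
To make the two new helpers above (truncate and process_unit_test) concrete, here is a hypothetical HumanEval-style row and what each helper produces for it. The field values are made up; only the structure mirrors the dataset:

# Hypothetical HumanEval-style row (values are illustrative, not taken from the dataset)
sample = {
    "task_id": "HumanEval/0",
    "prompt": 'def add(a: int, b: int) -> int:\n    """Return the sum of a and b."""\n',
    "test": "def check(candidate):\n    assert candidate(1, 2) == 3\n",
}

# process_unit_test extracts the function name from the prompt ("add") and appends
# a call to the dataset's check() helper so the unit test actually runs:
print(process_unit_test(sample))
# def check(candidate):
#     assert candidate(1, 2) == 3
# check(add)

# truncate cuts a generated completion at the second top-level def (or print),
# dropping extra functions the model keeps generating after the target solution:
completion = "    return a + b\n\ndef unrelated():\n    pass\n"
print(truncate(sample["prompt"] + completion))
# def add(a: int, b: int) -> int:
#     """Return the sum of a and b."""
#     return a + b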
@@ -109,6 +209,25 @@ def perplexity_eval(args, batch_size=16, dataset_name="openai_humaneval"):
    return perplexity_metrics


def select_openai_humaneval_method(args):
    if args.humaneval_method == "pass_at_k":
        start_time = time.time()
        references, predictions = human_eval(args)
        code_eval = load("code_eval")
        pass_at_k, results = code_eval.compute(
            references=references, predictions=predictions, k=[1, 2, 10]
        )
        print(f"\nopenai_humaneval evaluation results: {pass_at_k}")
        end_time = time.time()
        eval_time = f"Evaluation time: {end_time - start_time}\n"
        file_name_detailed_result = (
            f"result_humaneval_{args.start}_{args.start + args.max_samples - 1}"
        )
        with open(file_name_detailed_result, "w") as fp:
            fp.write(eval_time)
            json.dump(pass_at_k, fp)
            json.dump(results, fp)
    else:
        perplexity_metrics = perplexity_eval(args)
        return perplexity_metrics


def qa_eval(args, dataset_name="squad"):
    # load validation dataset and eval tool
    dataset = load_dataset(dataset_name)["validation"]
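
The code_eval metric used by select_openai_humaneval_method expects one reference test program per task and a list of candidate programs per task, and it only runs when HF_ALLOW_CODE_EVAL=1 is set because it executes model-generated code. Below is a minimal, self-contained sketch of the same call pattern (the task and candidate strings are toy examples, not from HumanEval):

import os

from evaluate import load

# code_eval executes model-generated code, so it must be explicitly enabled
os.environ["HF_ALLOW_CODE_EVAL"] = "1"

code_eval = load("code_eval")

# one reference test per task; one list of candidate solutions per task
references = ["assert add(1, 2) == 3"]
predictions = [[
    "def add(a, b):\n    return a + b",  # passes the test
    "def add(a, b):\n    return a - b",  # fails the test
]]

pass_at_k, results = code_eval.compute(
    references=references, predictions=predictions, k=[1, 2]
)
print(pass_at_k)  # expected: {'pass@1': 0.5, 'pass@2': 1.0}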
@@ -474,7 +593,9 @@ def _split_train_val(train_dataset, val_ratio, seed=42):
    "imdb": imdb_eval,
    "conll2003": conll2003_eval,
    "go_emotions": go_emotions_eval,
    "openai_humaneval": perplexity_eval,
    "openai_humaneval": select_openai_humaneval_method,
}


@@ -606,6 +727,47 @@ def parse_args():
        default=False,
    )

    parser.add_argument(
        "--humaneval-method",
        default="perplexity",
        choices=["perplexity", "pass_at_k"],
        help="Whether to run perplexity evaluation or pass@k evaluation on the "
        "openai_humaneval dataset. Default is perplexity.",
        type=str,
    )

    parser.add_argument(
        "--n-solutions",
        help="The total number of solutions to generate for each code prompt of the "
        "openai_humaneval dataset. Default is 1.",
        type=int,
        default=1,
    )

    parser.add_argument(
        "--temperature",
        help="Used with the openai_humaneval dataset: the sampling temperature to use. "
        "Default is 0.8.",
        type=float,
        default=0.8,
    )

    parser.add_argument(
        "--benchmark-humaneval",
        help="Set to evaluate on a small benchmark subset of the openai_humaneval "
        "dataset. Default is False.",
        default=False,
        action="store_true",
    )

    parser.add_argument(
        "--start",
        help="The dataset index to start evaluation from. Used only with the "
        "openai_humaneval dataset when the evaluation method is pass_at_k and "
        "parallel processing is needed. Default is 0.",
        type=int,
        default=0,
    )

    return parser.parse_args()


@@ -629,8 +791,8 @@ def _main(args):
print(f"\nmnli eval results: {mnli_metrics}")
else:
metrics = SUPPORTED_DATASETS[dataset](args)

print(f"\n{dataset} eval results: {metrics.compute()}")
if not (dataset == "openai_humaneval" and args.humaneval_method == "pass_at_k"):
print(f"\n{dataset} eval results: {metrics.compute()}")


def main():