Merge pull request #223 from ganler/evalplus-maintain

refactor(evalplus): maintain mbpp+ v0.2.0
bigcode-project · Apr 19, 2024 · 1b0147c
2 parents 642c57f + ae6c309
commit 1b0147c
Show file tree

Hide file tree

Showing 2 changed files with 4 additions and 7 deletions.
diff --git a/bigcode_eval/tasks/humanevalplus.py b/bigcode_eval/tasks/humanevalplus.py
@@ -29,11 +29,11 @@ class GeneralHumanEvalPlus(GeneralHumanEval):
 
     DATASET_PATH = "evalplus/humanevalplus"
 
-    def __init__(self, strip_prompt, k=[1, 10, 100], num_workers=16, timeout=10.0):
-        if timeout < 10.0:
+    def __init__(self, strip_prompt, k=[1, 10, 100], num_workers=16, timeout=20.0):
+        if timeout < 20.0:
             warn(
                 "It is suggested to have a longer timeout as HumanEval+ has lots of tests. "
-                f"The current timeout is {timeout}s while the suggested timeout is 10s."
+                f"The current timeout is {timeout}s while the suggested timeout is 20s."
             )
         super().__init__(strip_prompt, k, num_workers, timeout)
 

diff --git a/bigcode_eval/tasks/mbppplus.py b/bigcode_eval/tasks/mbppplus.py
@@ -4,7 +4,7 @@
 The MBPP+ dataset is created by the EvalPlus framework which extends the original MBPP dataset
 by adding more automatically generated test cases to each problem. Note MBPP+ only includes 399
 tasks which are a subset of the original MBPP dataset. The subset is selected from the sanitized
-MBPP (a subset of manually examined tasks by the original MBPP authors) and EvalPlus further 
+MBPP (a subset of manually examined tasks by the original MBPP authors) and EvalPlus further
 removes low-quality and ill-formed tasks for benchmark quality control.
 
 Homepage: https://github.com/evalplus/evalplus
@@ -56,9 +56,6 @@ def get_reference(self, doc):
     def get_dataset(self):
         """Returns dataset for the task or an iterable of any object, that get_prompt can handle"""
         dataset = self.dataset["test"]
-        assert (
-            len(dataset) == 399
-        ), "MBPP+ only has 399 problems. Please retry by deleting its old cache"
         return dataset
 
     def process_results(self, generations, references):