From 037279e8df84596a88ff0433e27b5191c8e1287c Mon Sep 17 00:00:00 2001
From: Max Marion
Date: Tue, 13 Feb 2024 20:00:21 +0000
Subject: [PATCH 1/4] del fewshot_random default, fix hf_eval, fix gauntlet readme

---
 llmfoundry/utils/builders.py             | 10 ++---
 scripts/eval/eval.py                     |  1 -
 scripts/eval/local_data/EVAL_GAUNTLET.md | 52 ++----------------------
 scripts/eval/yamls/hf_eval.yaml          | 10 ++---
 tests/utils/test_builders.py             |  1 -
 5 files changed, 11 insertions(+), 63 deletions(-)

diff --git a/llmfoundry/utils/builders.py b/llmfoundry/utils/builders.py
index fb3a0d97f8..17d8a3c818 100644
--- a/llmfoundry/utils/builders.py
+++ b/llmfoundry/utils/builders.py
@@ -53,7 +53,6 @@ def build_evaluators(
     device_eval_batch_size: int,
     icl_seq_len: int,
     icl_subset_num_batches: Optional[int],
-    fewshot_random_seed: Optional[int] = 1234,
 ) -> Tuple[List[Evaluator], List[str], Optional[EvalGauntlet]]:
 
     evaluators = []
@@ -73,7 +72,6 @@
             tokenizer,
             device_eval_batch_size,
             icl_seq_len,
-            fewshot_random_seed,
             icl_subset_num_batches,
         )
         evaluators.extend(icl_evaluators)
@@ -130,7 +128,6 @@
     tokenizer: PreTrainedTokenizerBase,
     device_eval_batch_size: int,
     icl_seq_len: int,
-    fewshot_random_seed: Optional[int] = 1234,
     icl_subset_num_batches: Optional[int] = None
 ) -> Tuple[List[Evaluator], List[str], Optional[EvalGauntlet]]:
     icl_evaluators, logger_keys = build_icl_evaluators(
@@ -138,7 +135,6 @@
         tokenizer,
         icl_seq_len,
         device_eval_batch_size,
-        fewshot_random_seed=fewshot_random_seed,
         icl_subset_num_batches=icl_subset_num_batches)
     eval_gauntlet_cb = None
     if eval_gauntlet_config is not None:
@@ -446,7 +442,6 @@ def build_icl_evaluators(
     default_max_seq_len: int,
     default_batch_size: int,
     destination_dir: Optional[str] = None,
-    fewshot_random_seed: Optional[int] = 1234,
     icl_subset_num_batches: Optional[int] = None,
 ) -> Tuple[List[Evaluator], List[str]]:
     if destination_dir is None:
@@ -504,6 +499,7 @@ def _validate_cfg(icl_cfg: DictConfig):
             icl_cfg.pass_at_k = 1
         if 'num_beams' not in icl_cfg:
             icl_cfg.num_beams = 20
+        ## NOTE: This is one possible location to set the default
 
     for icl_cfg in icl_tasks_list:
         assert isinstance(icl_cfg, DictConfig)
@@ -547,8 +543,8 @@ def _validate_cfg(icl_cfg: DictConfig):
                 continuation_delimiter=icl_cfg.continuation_delimiter,
                 question_prelimiter=icl_cfg.get('question_prelimiter', ''),
                 destination_path=destination_path,
-                fewshot_random_seed=icl_cfg.get('fewshot_random_seed',
-                                                fewshot_random_seed),
+                ## NOTE: This is the other possible location to set the default
+                fewshot_random_seed=icl_cfg.get('fewshot_random_seed', 1234),
                 pass_at_k=icl_cfg.pass_at_k,
                 generations_per_sample=icl_cfg.num_beams,
                 has_categories=icl_cfg.get('has_categories', False),
diff --git a/scripts/eval/eval.py b/scripts/eval/eval.py
index 9c8dad0977..e36e08575b 100644
--- a/scripts/eval/eval.py
+++ b/scripts/eval/eval.py
@@ -95,7 +95,6 @@ def evaluate_model(
         tokenizer=tokenizer,
         device_eval_batch_size=device_eval_batch_size,
         icl_seq_len=max_seq_len,
-        fewshot_random_seed=seed,
         icl_subset_num_batches=icl_subset_num_batches,
     )
 
diff --git a/scripts/eval/local_data/EVAL_GAUNTLET.md b/scripts/eval/local_data/EVAL_GAUNTLET.md
index ab11ea71de..b857e1664e 100644
--- a/scripts/eval/local_data/EVAL_GAUNTLET.md
+++ b/scripts/eval/local_data/EVAL_GAUNTLET.md
@@ -233,65 +233,19 @@ Language understanding tasks evaluate the model’s ability to understand the st
   - Number of few shot examples: 5
   - Random baseline accuracy: 50%
 
-
-
-### Programming 
-Programming tasks evaluate the model's ability to understand code, write functionally correct code given a specification, simulate code, and document code. Right now we just have HumanEval but later versions will include more. By default the programming tasks are disabled in `scripts/eval/yamls/tasks.yaml` due to their long duration.
-
-36. HumanEval Python code generation
-   - Description: HumanEval Python consists of 164 python programming challenges, in which the model is presented with the method signature and docstring comment for a python program and is expected to complete the program. We then test the resultant code’s functional correctness on a number of test input/output pairs.
-   - Year released: 2022
-   - Number of few shot examples: 0
-   - Random baseline accuracy: 0%
-37. HumanEval C++ code generation
-   - Description: HumanEval C++ consists of 161 C++ programming challenges, in which the model is presented with the method signature and docstring comment for a C++ program and is expected to complete the program. We then test the resultant code’s functional correctness on a number of test input/output pairs. The C++ translation of HumanEval comes from the [CodeGeex](https://huggingface.co/datasets/THUDM/humaneval-x/viewer/cpp) project.
-   - Year released: 2022
-   - Number of few shot examples: 0
-   - Random baseline accuracy: 0%
-38. HumanEval JS code generation
-   - Description: HumanEval JS consists of 164 Javscript programming challenges, in which the model is presented with the method signature and docstring comment for a Javacript program and is expected to complete the program. We then test the resultant code’s functional correctness on a number of test input/output pairs. The JS translation of HumanEval comes from the [CodeGeex](https://huggingface.co/datasets/THUDM/humaneval-x/viewer/cpp) project.
-   - Year released: 2022
-   - Number of few shot examples: 0
-   - Random baseline accuracy: 0%
-54. HumanEval Python 25% code generation
-   - Description: HumanEval Python 25% is an easier variant of HumanEval Python in which in addition to the original method signature, the model is also provided 25% of the lines in the canonical solution and expected to complete the reaminder of the program. It consists of 164 samples.
-   - Year released: 2023
-   - Number of few shot examples: 0
-   - Random baseline accuracy: 0%
-55. HumanEval Python 50% code generation
-   - Description: HumanEval Python 50% is an easier variant of HumanEval Python in which in addition to the original method signature, the model is also provided 50% of the lines in the canonical solution and expected to complete the reaminder of the program. It consists of 164 samples.
-   - Year released: 2023
-   - Number of few shot examples: 0
-   - Random baseline accuracy: 0%
-56. HumanEval Python 75% code generation
-   - Description: HumanEval Python 75% is an easier variant of HumanEval Python in which in addition to the original method signature, the model is also provided 75% of the lines in the canonical solution and expected to complete the reaminder of the program. It consists of 164 samples.
-   - Year released: 2023
-   - Number of few shot examples: 0
-   - Random baseline accuracy: 0%
-57. HumanEval Python simple return statement code generation
-   - Description: HumanEval Python simple return statament is an easier variant of HumanEval Python in which the model is provided all of the canonical solution with the exception of the return statement and is expected to complete the return statement. Additionally, this set contains only the problems for which the canonical solution has a "simple" return statement consisting only of a line of the form `return VARIABLE\_NAME`. There are 37 samples.
-   - Year released: 2023
-   - Number of few shot examples: 0
-   - Random baseline accuracy: 0%
-58. HumanEval Python complex return statement code generation
-   - Description: HumanEval Pythom complex return statament is an easier variant of HumanEval Python in which the model is provided all of the canonical solution with the exception of the return statement and is expected to complete the return statement. Additionally, this set contains only the problems for which the canonical solution does not have a "simple" return statement as defined above. There are 127 samples.
-   - Year released: 2023
-   - Number of few shot examples: 0
-   - Random baseline accuracy: 0%
-
 ### Long Context Gauntlet
 
 We've included three different tasks for long (> 4000 tokens) context length evals. They are meant as litmus tests for a model's ability to properly utilize it's longer context length, which is often the result of fine-tuning after pre-training. For some of these datasets, we explicitly create sets where the required information is located in different sections of the input context, either the beginning, middle, or end of the input context.
 
 1. HotPotQAXL
-    - Description: (HotPotQA)[https://hotpotqa.github.io/] is originally a dataset of ten documents and a question requiring comprehension of one or more of the supplied documents. The non-related documents are completely unrelated and called "distractor" documents. To extend this to longer context lengths, we randomly sample documents from the full set of documents across the dataset, adding them to the current datapoint until the set of documents and its question fills the current context length. We insert the "gold" document(s) (the document(s) containing the information that answers the question) within the first third, second third, or last third of the context length.
+    - Description: [HotPotQA](https://hotpotqa.github.io/) is originally a dataset of ten documents and a question requiring comprehension of one or more of the supplied documents. The non-related documents are completely unrelated and called "distractor" documents. To extend this to longer context lengths, we randomly sample documents from the full set of documents across the dataset, adding them to the current datapoint until the set of documents and its question fills the current context length. We insert the "gold" document(s) (the document(s) containing the information that answers the question) within the first third, second third, or last third of the context length.
     - Lengths: 2k, 4k, 8k, 16k, 32k, 64k
     - Locations: beginning, middle, end
 2. Key Value Pairs (Needle In a Haystack)
-    - Description: We construct a `.json` of key value pairs, where both the key and value are random hashes, in the style of (Lost in the Middle)[https://github.com/nelson-liu/lost-in-the-middle]. We ask the model to produce a value given a key from a specific key value pair found int he json. The pair is correspondingly located in the first third, second third, or last third of the json.
+    - Description: We construct a `.json` of key value pairs, where both the key and value are random hashes, in the style of [Lost in the Middle](https://github.com/nelson-liu/lost-in-the-middle). We ask the model to produce a value given a key from a specific key value pair found int he json. The pair is correspondingly located in the first third, second third, or last third of the json.
     - Lengths: 2k, 4k, 8k, 16k, 32k, 64k
     - Locations: beginning, middle, end
 2. WikiQA Numeric
-    - Description: (WikiQA Numeric)[https://huggingface.co/datasets/abacusai/WikiQA-Altered_Numeric_QA] is a Wikipedia Question Answering dataset with a focus on questions with numeric answers. We preprocess the data only to easily parse it for our framework.
+    - Description: [WikiQA Numeric](https://huggingface.co/datasets/abacusai/WikiQA-Altered_Numeric_QA) is a Wikipedia Question Answering dataset with a focus on questions with numeric answers. We preprocess the data only to easily parse it for our framework.
     - Lengths: 2k, 4k, 8k, 16k
     - Locations: N/A
diff --git a/scripts/eval/yamls/hf_eval.yaml b/scripts/eval/yamls/hf_eval.yaml
index 9eb0245f9a..bf0426b357 100644
--- a/scripts/eval/yamls/hf_eval.yaml
+++ b/scripts/eval/yamls/hf_eval.yaml
@@ -37,11 +37,11 @@ models:
 device_eval_batch_size: 4
 
 # FSDP config for model sharding
-fsdp_config:
-  sharding_strategy: FULL_SHARD
-  mixed_precision: FULL
-  forward_prefetch: True
-  limit_all_gathers: True
+# fsdp_config:
+#   sharding_strategy: FULL_SHARD
+#   mixed_precision: FULL
+#   forward_prefetch: True
+#   limit_all_gathers: True
 
 icl_tasks: "eval/yamls/tasks_v0.3.yaml"
 eval_gauntlet: "eval/yamls/eval_gauntlet_v0.3.yaml"
diff --git a/tests/utils/test_builders.py b/tests/utils/test_builders.py
index 81d8a841c7..08c3504491 100644
--- a/tests/utils/test_builders.py
+++ b/tests/utils/test_builders.py
@@ -250,7 +250,6 @@ def test_build_evaluators_empty():
         None,
         tokenizer=None,  # type: ignore
         device_eval_batch_size=1,
-        fewshot_random_seed=1234,
         icl_seq_len=2,
         icl_subset_num_batches=3)
     assert evaluators == []

From d1587529af6a260ee13c394c4ddeb6a15fe24280 Mon Sep 17 00:00:00 2001
From: Max Marion
Date: Wed, 14 Feb 2024 17:20:10 +0000
Subject: [PATCH 2/4] set in cfg defaults area

---
 llmfoundry/utils/builders.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/llmfoundry/utils/builders.py b/llmfoundry/utils/builders.py
index 17d8a3c818..bb01b2962c 100644
--- a/llmfoundry/utils/builders.py
+++ b/llmfoundry/utils/builders.py
@@ -499,7 +499,8 @@ def _validate_cfg(icl_cfg: DictConfig):
             icl_cfg.pass_at_k = 1
         if 'num_beams' not in icl_cfg:
             icl_cfg.num_beams = 20
-        ## NOTE: This is one possible location to set the default
+        if 'fewshot_random_seed' not in icl_cfg:
+            icl_cfg.fewshot_random_seed = 1234
 
     for icl_cfg in icl_tasks_list:
         assert isinstance(icl_cfg, DictConfig)
@@ -543,8 +544,7 @@ def _validate_cfg(icl_cfg: DictConfig):
                 continuation_delimiter=icl_cfg.continuation_delimiter,
                 question_prelimiter=icl_cfg.get('question_prelimiter', ''),
                 destination_path=destination_path,
-                ## NOTE: This is the other possible location to set the default
-                fewshot_random_seed=icl_cfg.get('fewshot_random_seed', 1234),
+                fewshot_random_seed=icl_cfg.fewshot_random_seed,
                 pass_at_k=icl_cfg.pass_at_k,
                 generations_per_sample=icl_cfg.num_beams,
                 has_categories=icl_cfg.get('has_categories', False),

From 154339f9a1c0a322ccec7f9f73b28a1e616b7071 Mon Sep 17 00:00:00 2001
From: Max Marion
Date: Wed, 14 Feb 2024 17:39:32 +0000
Subject: [PATCH 3/4] fix the fix i applied that was actually not a fix

---
 scripts/eval/yamls/hf_eval.yaml | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/scripts/eval/yamls/hf_eval.yaml b/scripts/eval/yamls/hf_eval.yaml
index bf0426b357..7884e366d3 100644
--- a/scripts/eval/yamls/hf_eval.yaml
+++ b/scripts/eval/yamls/hf_eval.yaml
@@ -37,11 +37,13 @@ models:
 device_eval_batch_size: 4
 
 # FSDP config for model sharding
-# fsdp_config:
-#   sharding_strategy: FULL_SHARD
-#   mixed_precision: FULL
-#   forward_prefetch: True
-#   limit_all_gathers: True
+fsdp_config:
+  sharding_strategy: FULL_SHARD
+  mixed_precision: FULL
+  forward_prefetch: True
+  limit_all_gathers: True
+
+icl_subset_num_batch: 1
 
 icl_tasks: "eval/yamls/tasks_v0.3.yaml"
 eval_gauntlet: "eval/yamls/eval_gauntlet_v0.3.yaml"

From 12216f51ad232c080ae7490afe69b81d892cbb1b Mon Sep 17 00:00:00 2001
From: Max Marion
Date: Wed, 14 Feb 2024 17:40:12 +0000
Subject: [PATCH 4/4] rm num_batch from hf_eval

---
 scripts/eval/yamls/hf_eval.yaml | 2 --
 1 file changed, 2 deletions(-)

diff --git a/scripts/eval/yamls/hf_eval.yaml b/scripts/eval/yamls/hf_eval.yaml
index 7884e366d3..9eb0245f9a 100644
--- a/scripts/eval/yamls/hf_eval.yaml
+++ b/scripts/eval/yamls/hf_eval.yaml
@@ -43,7 +43,5 @@ fsdp_config:
   forward_prefetch: True
   limit_all_gathers: True
 
-icl_subset_num_batch: 1
-
 icl_tasks: "eval/yamls/tasks_v0.3.yaml"
 eval_gauntlet: "eval/yamls/eval_gauntlet_v0.3.yaml"
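
The sketch below is not part of the patch series; it is a minimal illustration of the behaviour the series converges on. After PATCH 2, `_validate_cfg` fills in `fewshot_random_seed = 1234` for any ICL task entry that does not set it, and an explicit per-task value still takes precedence. The task labels and dataset paths here are made-up placeholders, not entries from the repo's task YAMLs.

```python
from omegaconf import DictConfig, OmegaConf

# Two illustrative ICL task entries (placeholder labels and dataset paths).
icl_tasks_list = OmegaConf.create([
    {
        'label': 'my_lm_task',
        'dataset_uri': 'eval/local_data/my_lm_task.jsonl',
        'num_fewshot': [5],
        'icl_task_type': 'language_modeling',
        # No fewshot_random_seed here: the new default of 1234 gets filled in.
    },
    {
        'label': 'my_qa_task',
        'dataset_uri': 'eval/local_data/my_qa_task.jsonl',
        'num_fewshot': [3],
        'icl_task_type': 'question_answering',
        'fewshot_random_seed': 17,  # An explicit per-task value still wins.
    },
])

for icl_cfg in icl_tasks_list:
    assert isinstance(icl_cfg, DictConfig)
    # Mirrors the default-filling that PATCH 2 adds to _validate_cfg.
    if 'fewshot_random_seed' not in icl_cfg:
        icl_cfg.fewshot_random_seed = 1234

print([cfg.fewshot_random_seed for cfg in icl_tasks_list])  # -> [1234, 17]
```

Keeping the default next to the other `icl_cfg` defaults (`pass_at_k`, `num_beams`) means the builder functions no longer need to thread a `fewshot_random_seed` argument through their signatures, which is exactly what PATCH 1 removes.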
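
And a rough call sketch under the post-series signature, with illustrative values only: the tokenizer name, sequence length, batch size, and task entry below are assumptions for the example, and the repo's real task lists live in files like `eval/yamls/tasks_v0.3.yaml`.

```python
from omegaconf import OmegaConf
from transformers import AutoTokenizer

from llmfoundry.utils.builders import build_icl_evaluators

tokenizer = AutoTokenizer.from_pretrained('EleutherAI/gpt-neox-20b')  # placeholder tokenizer

# Placeholder task list; see the previous sketch for the entry shape.
icl_tasks = OmegaConf.create([{
    'label': 'my_lm_task',
    'dataset_uri': 'eval/local_data/my_lm_task.jsonl',
    'num_fewshot': [0, 5],
    'icl_task_type': 'language_modeling',
}])

# After PATCH 1 there is no fewshot_random_seed parameter to pass here; the
# per-task default (1234) is applied inside _validate_cfg instead.
evaluators, logger_keys = build_icl_evaluators(
    icl_tasks,
    tokenizer,
    default_max_seq_len=1024,
    default_batch_size=4,
)
```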