Model gauntlet #308

Merged
merged 39 commits on Jun 29, 2023
Changes from 24 commits
Commits
39 commits
aadaba4
fix ocfg
bmosaicml May 31, 2023
1e65ae9
finish postprocessing metrics
bmosaicml Jun 5, 2023
17132d6
foo
bmosaicml Jun 5, 2023
dd2915c
foo
bmosaicml Jun 5, 2023
5e396a0
model gauntlet
bmosaicml May 31, 2023
da0f111
finish up post-processing
bmosaicml Jun 6, 2023
75b5628
model gauntlet
bmosaicml May 31, 2023
71715c8
foo
bmosaicml Jun 6, 2023
aa6ac5d
subsample pubmedqa
bmosaicml Jun 6, 2023
ce0abf4
add callback for calculating scores
bmosaicml Jun 8, 2023
5a2d7ae
edit
bmosaicml Jun 12, 2023
b76c388
finish
bmosaicml Jun 12, 2023
25d352b
finish printing
bmosaicml Jun 13, 2023
88806bf
fix datasets to have multiple of 2 choices
bmosaicml Jun 14, 2023
c3aa1ab
fix spacing bug
bmosaicml Jun 15, 2023
284c9e2
fix merge
bmosaicml Jun 21, 2023
be0a675
modify coqa
bmosaicml Jun 21, 2023
730c89c
modify coqa
bmosaicml Jun 21, 2023
60dfc55
modify coqa
bmosaicml Jun 21, 2023
4b31ff0
clean up PR
bmosaicml Jun 22, 2023
b38c503
merge
bmosaicml Jun 22, 2023
8002258
Merge branch 'main' into model_gauntlet
bmosaicml Jun 27, 2023
cf1b793
Update llmfoundry/callbacks/__init__.py
bmosaicml Jun 28, 2023
3f22feb
adds image to model gauntlet link to eval readme
codestar12 Jun 28, 2023
266fcea
Merge branch 'main' into model_gauntlet
codestar12 Jun 28, 2023
be886da
scrubs cluster
codestar12 Jun 28, 2023
57e65e6
wrap lines of docstring for readability
codestar12 Jun 28, 2023
eb5c3aa
precommit hook
codestar12 Jun 28, 2023
9695692
yaml lint
codestar12 Jun 28, 2023
7e86f88
fix small bug
bmosaicml Jun 28, 2023
e69cff7
bug fix
bmosaicml Jun 28, 2023
e985c81
Merge branch 'main' into model_gauntlet
bmosaicml Jun 28, 2023
743518b
Merge branch 'model_gauntlet' of github.com:mosaicml/llm-foundry into…
codestar12 Jun 29, 2023
7295930
bug fix
bmosaicml Jun 28, 2023
90ff7dd
change key name from tasks to categories
bmosaicml Jun 29, 2023
2ccb445
Merge branch 'model_gauntlet' of github.com:mosaicml/llm-foundry into…
codestar12 Jun 29, 2023
9757424
fix lint
bmosaicml Jun 29, 2023
9a2f609
lint
codestar12 Jun 29, 2023
4c1e217
Merge branch 'model_gauntlet' of github.com:mosaicml/llm-foundry into…
codestar12 Jun 29, 2023
2 changes: 2 additions & 0 deletions llmfoundry/callbacks/__init__.py
@@ -4,6 +4,7 @@
try:
    from llmfoundry.callbacks.fdiff_callback import FDiffMetrics
    from llmfoundry.callbacks.generate_callback import Generate
    from llmfoundry.callbacks.model_gauntlet_callback import ModelGauntlet
    from llmfoundry.callbacks.monolithic_ckpt_callback import \
        MonolithicCheckpointSaver
    from llmfoundry.callbacks.resumption_callbacks import (GlobalLRScaling,
@@ -22,4 +23,5 @@
    'GlobalLRScaling',
    'LayerFreezing',
    'ScheduledGarbageCollector',
    'ModelGauntlet',
]
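With this change the new callback is exported from the package namespace, so downstream code can use, for example:

from llmfoundry.callbacks import ModelGauntlet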
152 changes: 152 additions & 0 deletions llmfoundry/callbacks/model_gauntlet_callback.py
@@ -0,0 +1,152 @@
# Copyright 2022 MosaicML LLM Foundry authors
# SPDX-License-Identifier: Apache-2.0

"""Aggregate ICL evals into composite scores."""

import math
import re
from enum import Enum
from typing import Optional, Union

from composer.core import Callback, State
from composer.loggers import Logger

__all__ = ['ModelGauntlet']


class Weighting(Enum):
    EQUAL = 1
    SAMPLE_SZ = 2
    LOG_SAMPLE_SZ = 3


class ModelGauntlet(Callback):
    """The ModelGauntlet aggregates ICL eval results.

    After `eval_end`, this callback inspects the logger for the individual ICL
    metrics and aggregates the scores according to the aggregation
    specification provided in the constructor.

    Args:
        logger_keys (dict): The exact keys under which the individual benchmark
            metrics are logged after eval.
        categories (dict): The list of categories, the subtasks within each one,
            the random baseline accuracy of each subtask, and the number of
            fewshot examples used for the task. See
            `llmfoundry/scripts/eval/yamls/model_gauntlet.yaml` for the structure.
        weighting (Weighting or str): The weighting scheme (or the name of one,
            e.g. ``'EQUAL'``) used to balance the tasks within each category:
            equal weight, weight proportional to the dataset size, or weight
            proportional to log2 of the dataset size.
        subtract_random_baseline (bool): Whether to subtract the random baseline
            accuracy from each benchmark's score before aggregating.
        rescale_accuracy (bool): Whether to rescale the accuracy on each benchmark
            by (1 - random_baseline_accuracy) before aggregating. Using this
            ensures that all benchmarks max out at 1.0.
        benchmark_sizes (Optional[dict]): Optional data on benchmark sizes, used
            when not relying on equal weighting.
    """

    def __init__(self,
                 logger_keys: dict,
                 categories: dict,
                 weighting: Union[str, Weighting] = Weighting.EQUAL,
                 subtract_random_baseline: bool = True,
                 rescale_accuracy: bool = True,
                 benchmark_sizes: Optional[dict] = None):
        # Accept either a Weighting member or its string name (as it appears in yaml configs).
        self.weighting = Weighting[weighting] if isinstance(
            weighting, str) else weighting

        if self.weighting != Weighting.EQUAL and benchmark_sizes is None:
            raise Exception(
                'When not using equal weighting, you must provide the benchmark sizes.'
            )

        if rescale_accuracy and not subtract_random_baseline:
            raise Exception(
                'Only use accuracy rescaling in conjunction with subtracting random baseline accuracy.'
            )

        self.categories = categories
        self.subtract_random_baseline = subtract_random_baseline
        self.rescale_accuracy = rescale_accuracy
        self.logger_keys = logger_keys

        # Precompute a weight for each benchmark according to the chosen scheme.
        for category in self.categories:
            for benchmark in category['benchmarks']:
                bench_name = f"{benchmark['name']}/{benchmark['num_fewshot']}-shot"
                cumulative_samples = max(
                    sum(count for name, count in (benchmark_sizes or {}).items()
                        if name.startswith(bench_name)), 1)

                if self.weighting == Weighting.EQUAL:
                    weight = 1
                elif self.weighting == Weighting.SAMPLE_SZ:
                    weight = cumulative_samples
                elif self.weighting == Weighting.LOG_SAMPLE_SZ:
                    weight = max(math.log(cumulative_samples, 2), 1)

                benchmark['weighting'] = weight

    def compute_averages(self, logger_data):
        # `logger_data` is expected to expose a `data` mapping from logged keys to
        # lists of (timestamp, value) pairs (e.g. an InMemoryLogger destination).
        results = {}
        pat = re.compile(
            r'metrics/(.*?)/(\d+)-shot(/.*?)?/InContextLearning(.*)')
        for key in self.logger_keys:
            match = pat.match(key)
            val = logger_data.data[key][0][1].item()

            if match:
                eval_name = match.group(1)
                num_shot = match.group(2)
                subcat = match.group(3)
                metric = match.group(4)

                if subcat is not None:
                    # Average subcategory scores into a single benchmark-level key.
                    subcat = subcat[1:]
                    agg_key = f'metrics/{eval_name}/{num_shot}-shot/InContextLearning{metric}'
                    if agg_key not in results:
                        results[agg_key] = []
                    results[agg_key].append(val)
                else:
                    results[key] = [val]
        return {k: sum(v) / len(v) for k, v in results.items()}

    def eval_end(self, state: State, logger: Logger):
        new_metrics = self.compute_averages(logger)
        composite_scores = {}
        for category in self.categories:
            composite_scores[category['name']] = []
            for benchmark in category['benchmarks']:
                key_pat = re.compile(
                    f"metrics/{benchmark['name']}/{benchmark['num_fewshot']}-shot/.*Accuracy"
                )

                matching_key = [
                    k for k in new_metrics.keys()
                    if key_pat.match(k) is not None
                ]
                if len(matching_key) == 0:
                    print(
                        f"Warning: couldn't find results for benchmark: {benchmark}"
                    )
                else:
                    score = new_metrics[matching_key[0]]

                    if self.subtract_random_baseline:
                        score -= benchmark['random_baseline']

                    if self.rescale_accuracy and self.subtract_random_baseline:
                        score /= 1.0 - benchmark['random_baseline']

                    composite_scores[category['name']].append({
                        'name': benchmark['name'],
                        'score': score,
                        'weighting': benchmark['weighting']
                    })

            # Collapse each category to a weighted average of its benchmark scores.
            total_weight = sum(
                k['weighting'] for k in composite_scores[category['name']])
            composite_scores[category['name']] = sum(
                k['score'] * (k['weighting'] / total_weight)
                for k in composite_scores[category['name']])

        composite_scores = {
            f'metrics/model_gauntlet/{k}': v
            for k, v in composite_scores.items()
        }

        # The overall gauntlet score is the unweighted mean of the category composites.
        composite_scores['metrics/model_gauntlet/average'] = sum(
            composite_scores.values()) / len(composite_scores.values())
        logger.log_metrics(composite_scores)

        return composite_scores
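A minimal construction sketch for the callback above (illustrative only, assuming `llmfoundry` is installed): the benchmark name, logger key, and sample count are hypothetical placeholders, and `weighting` is passed by its string name as in the eval yamls.

from llmfoundry.callbacks import ModelGauntlet

gauntlet = ModelGauntlet(
    logger_keys=[
        # hypothetical key, as logged by the ICL eval harness
        'metrics/some_qa_benchmark/10-shot/InContextLearningMultipleChoiceAccuracy',
    ],
    categories=[{
        'name': 'world_knowledge',
        'benchmarks': [{
            'name': 'some_qa_benchmark',
            'num_fewshot': 10,
            'random_baseline': 0.25,
        }],
    }],
    weighting='LOG_SAMPLE_SZ',
    benchmark_sizes={'some_qa_benchmark/10-shot': 1024},
)
# The instance would then be passed to the Trainer's `callbacks` so that
# `eval_end` can roll the logged ICL metrics up into composite category scores.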
4 changes: 2 additions & 2 deletions mcli/mcli-1b.yaml
@@ -1,7 +1,7 @@
integrations:
- integration_type: git_repo
  git_repo: mosaicml/llm-foundry
-  # git_branch: # use your branch
+  # git_branch: # use your branch
  # git_commit: # OR use your commit hash
  pip_install: -e .[gpu]
  ssh_clone: false  # Should be true if using a private repo
@@ -28,7 +28,7 @@ compute:
  gpus: 8  # Number of GPUs to use

  ## These configurations are optional
-  # cluster: TODO # Name of the cluster to use for this run
+  # cluster: # TODO # Name of the cluster to use for this run
  # gpu_type: a100_80gb # Type of GPU to use. We use a100_80gb in our experiments

