Add combine to compose multiple evaluations #150

Merged
merged 9 commits on Jun 29, 2022
Changes from 4 commits
2 changes: 1 addition & 1 deletion src/evaluate/__init__.py
@@ -31,7 +31,7 @@
from .info import EvaluationModuleInfo
from .inspect import inspect_evaluation_module, list_evaluation_modules
from .loading import load
from .module import EvaluationModule
from .module import AggregateEvaluation, EvaluationModule, combine
from .saving import save
from .utils import *
from .utils import gradio, logging
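
With this export, the new `combine` helper (and `AggregateEvaluation`) becomes reachable from the top-level package. A minimal usage sketch, reusing the modules from the docstring example further down:

>>> import evaluate
>>> clf_metrics = evaluate.combine(["accuracy", "f1"])
>>> clf_metrics.compute(predictions=[0, 1], references=[1, 1])  # returns a single dict with both scores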
120 changes: 120 additions & 0 deletions src/evaluate/module.py
@@ -14,6 +14,8 @@

# Lint as: python3
""" EvaluationModule base class."""
import collections
import itertools
import os
import types
import uuid
@@ -706,3 +708,121 @@ def _enforce_nested_string_type(self, schema, obj):
        elif isinstance(schema, Value):
            if pa.types.is_string(schema.pa_type) and not isinstance(obj, str):
                raise TypeError(f"Expected type str but got {type(obj)}.")


class AggregateEvaluation:
    def __init__(self, evaluation_modules, force_prefix=False):
        from .loading import load  # avoid circular imports

        self.evaluation_module_names = None
        if isinstance(evaluation_modules, list):
            self.evaluation_modules = evaluation_modules
        elif isinstance(evaluation_modules, dict):
            self.evaluation_modules = list(evaluation_modules.values())
            self.evaluation_module_names = list(evaluation_modules.keys())
        loaded_modules = []

        for module in self.evaluation_modules:
            if isinstance(module, str):
                module = load(module)
            loaded_modules.append(module)
        self.evaluation_modules = loaded_modules

        if self.evaluation_module_names is None:
            self.evaluation_module_names = [module.name for module in self.evaluation_modules]

        self.force_prefix = force_prefix

    def add(self, prediction=None, reference=None, **kwargs):
        """Add one prediction and reference for each evaluation module's stack.

        Args:
            prediction (list/array/tensor, optional): Predictions.
            reference (list/array/tensor, optional): References.
        """
        for evaluation_module in self.evaluation_modules:
            batch = {"predictions": prediction, "references": reference, **kwargs}
            batch = {input_name: batch[input_name] for input_name in evaluation_module._feature_names()}
            evaluation_module.add(**batch)

    def add_batch(self, predictions=None, references=None, **kwargs):
        """Add a batch of predictions and references for each evaluation module's stack.

        Args:
            predictions (list/array/tensor, optional): Predictions.
            references (list/array/tensor, optional): References.
        """
        for evaluation_module in self.evaluation_modules:
            batch = {"predictions": predictions, "references": references, **kwargs}
            batch = {input_name: batch[input_name] for input_name in evaluation_module._feature_names()}
            evaluation_module.add_batch(**batch)

    def compute(self, predictions=None, references=None, **kwargs):
        """Compute each evaluation module.

        Usage of positional arguments is not allowed to prevent mistakes.

        Args:
            predictions (list/array/tensor, optional): Predictions.
            references (list/array/tensor, optional): References.
            **kwargs (optional): Keyword arguments that will be forwarded to the evaluation module :meth:`_compute`
                method (see details in the docstring).

        Return:
            dict or None

            - Dictionary with the results if this evaluation module is run on the main process (``process_id == 0``).
            - None if the evaluation module is not run on the main process (``process_id != 0``).
        """
        results = []

        for evaluation_module in self.evaluation_modules:
            batch = {"predictions": predictions, "references": references, **kwargs}
            batch = {input_name: batch[input_name] for input_name in evaluation_module._feature_names()}
            results.append(evaluation_module.compute(**batch))

        return self._merge_results(results)

    def _merge_results(self, results):
        merged_results = {}
        results_keys = list(itertools.chain.from_iterable([r.keys() for r in results]))
        duplicate_keys = {item for item, count in collections.Counter(results_keys).items() if count > 1}

        duplicate_names = [
            item for item, count in collections.Counter(self.evaluation_module_names).items() if count > 1
        ]
        duplicate_counter = {name: 0 for name in duplicate_names}
Comment on lines +791 to +794
Member:

It's also possible to simply raise an error for now, if we need more time to think about how to handle this case.

As a user it's a bit unclear to me that it would automatically use prefixes when there are name conflicts, but no prefixes if there are no conflicts. This is the kind of black magic we usually try to avoid - though I don't have a strong opinion on this specific case, since it would be easy to explain in the docs. What do you think?

An alternative could be to ask users to name the metrics:

combine({
    "metric_1": metric_1,
    "metric_2": metric_2
})

Member Author:

What do you think about adding a warning when prefixes are added:

f"The key '{key}' occurs in several evaluation results. Prefixing with the key with '{prefix}'."

The user can already use a dict in the current state of the PR (see last example in initial comment). What do you think?

Member:

Actually I noticed that even if you provide a dictionary, it may not use a prefix. So the logic is always "use a prefix if there is a name conflict", even with a dictionary. Because of that I don't think a warning is necessary, since a warning would imply that the behavior does not fit the minimal expected behavior.

This needs to be stated explicitly in the docstring in this case, though.
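
For reference, a minimal sketch of the behaviour under discussion (the module names, dict keys, and output values are illustrative):

# no key conflict: the results stay unprefixed
combine(["accuracy", "f1"]).compute(predictions=[0, 1], references=[1, 1])
# {'accuracy': 0.5, 'f1': 0.66}

# both modules return an "accuracy" key, so the dict keys become prefixes
combine({"metric_1": "accuracy", "metric_2": "accuracy"}).compute(predictions=[0, 1], references=[1, 1])
# {'metric_1_accuracy': 0.5, 'metric_2_accuracy': 0.5}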


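        # Merge the per-module result dicts into a single dict:
        #   - keep a key unprefixed if it is unique across modules and force_prefix is off,
        #   - prefix with "<module_name>_<occurrence>_" if the same module name was passed more than once,
        #   - otherwise prefix with "<module_name>_".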
        for module_name, result in zip(self.evaluation_module_names, results):
            for k, v in result.items():
                if k not in duplicate_keys and not self.force_prefix:
                    merged_results[f"{k}"] = v
                elif module_name in duplicate_counter:
                    merged_results[f"{module_name}_{duplicate_counter[module_name]}_{k}"] = v
                else:
                    merged_results[f"{module_name}_{k}"] = v

            if module_name in duplicate_counter:
                duplicate_counter[module_name] += 1

        return merged_results


def combine(evaluation_modules, force_prefix=False):
    """A function to combine several metrics, comparisons, and measurements into a single `AggregateEvaluation` that
    can be used like a single evaluation module.

    Args:
        evaluation_modules (``Union[list, dict]``): A list or dictionary of evaluation modules. The modules can either be passed
            as strings or as loaded `EvaluationModule`s. If a dictionary is passed, its keys are used as the names and its values
            as the modules. The names are used as prefixes in case there are name overlaps in the returned results of the modules,
            or if `force_prefix=True`.
        force_prefix (``bool``, optional, defaults to `False`): If `True`, all scores from the modules are prefixed with their name.
            If a dictionary is passed, its keys are used as the names; otherwise, the module's name is used.

    Examples:

        >>> clf_metrics = combine(["accuracy", "f1", "precision", "recall"])
        >>> clf_metrics.compute(predictions=[0, 1], references=[1, 1])
        {'accuracy': 0.5, 'f1': 0.66, 'precision': 1.0, 'recall': 0.5}
    """

    return AggregateEvaluation(evaluation_modules, force_prefix=force_prefix)
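
To complement the list example in the docstring, a minimal sketch of the dict form with `force_prefix` (the "clf" name and the module choice are illustrative):

>>> clf_metrics = combine({"clf": "accuracy"}, force_prefix=True)
>>> clf_metrics.compute(predictions=[0, 1], references=[1, 1])
{'clf_accuracy': 0.5}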
97 changes: 96 additions & 1 deletion tests/test_metric.py
@@ -8,7 +8,7 @@
import pytest
from datasets.features import Features, Sequence, Value

from evaluate.module import EvaluationModule, EvaluationModuleInfo
from evaluate.module import EvaluationModule, EvaluationModuleInfo, combine

from .utils import require_tf, require_torch

@@ -70,6 +70,22 @@ def separate_expected_results(cls):
        return [{"accuracy": 1.0, "set_equality": True}, {"accuracy": 0.5, "set_equality": False}]


class AnotherDummyMetric(EvaluationModule):
    def _info(self):
        return EvaluationModuleInfo(
            description="another dummy metric for tests",
            citation="insert citation here",
            features=Features({"predictions": Value("int64"), "references": Value("int64")}),
        )

    def _compute(self, predictions, references):
        return {"set_equality": False}

    @classmethod
    def expected_results(cls):
        return {"set_equality": False}


def properly_del_metric(metric):
"""properly delete a metric on windows if the process is killed during multiprocessing"""
if metric is not None:
@@ -612,3 +628,82 @@ def test_metric_with_non_standard_feature_names_compute(tmp_path):
    metric = AccuracyWithNonStandardFeatureNames(cache_dir=cache_dir)
    results = metric.compute(inputs=inputs, targets=targets)
    assert results == AccuracyWithNonStandardFeatureNames.expected_results()


class TestEvaluationcombined_evaluation(TestCase):
    def test_single_module(self):
        preds, refs = DummyMetric.predictions_and_references()
        expected_results = DummyMetric.expected_results()

        combined_evaluation = combine([DummyMetric()])

        self.assertDictEqual(expected_results, combined_evaluation.compute(predictions=preds, references=refs))

    def test_add(self):
        preds, refs = DummyMetric.predictions_and_references()
        expected_results = DummyMetric.expected_results()

        combined_evaluation = combine([DummyMetric()])

        for pred, ref in zip(preds, refs):
            combined_evaluation.add(pred, ref)
        self.assertDictEqual(expected_results, combined_evaluation.compute())

    def test_add_batch(self):
        preds, refs = DummyMetric.predictions_and_references()
        expected_results = DummyMetric.expected_results()

        combined_evaluation = combine([DummyMetric()])

        combined_evaluation.add_batch(predictions=preds, references=refs)
        self.assertDictEqual(expected_results, combined_evaluation.compute())

    def test_force_prefix_with_dict(self):
        prefix = "test_prefix"
        preds, refs = DummyMetric.predictions_and_references()

        expected_results = DummyMetric.expected_results()
        expected_results[f"{prefix}_accuracy"] = expected_results.pop("accuracy")
        expected_results[f"{prefix}_set_equality"] = expected_results.pop("set_equality")

        combined_evaluation = combine({prefix: DummyMetric()}, force_prefix=True)

        self.assertDictEqual(expected_results, combined_evaluation.compute(predictions=preds, references=refs))

    def test_duplicate_module(self):
        preds, refs = DummyMetric.predictions_and_references()
        dummy_metric = DummyMetric()
        dummy_result = DummyMetric.expected_results()
        combined_evaluation = combine([dummy_metric, dummy_metric])

        expected_results = {}
        for i in range(2):
            for k in dummy_result:
                expected_results[f"{dummy_metric.name}_{i}_{k}"] = dummy_result[k]
        self.assertDictEqual(expected_results, combined_evaluation.compute(predictions=preds, references=refs))

    def test_two_modules_with_same_score_name(self):
Member:

You could also test the case where they have the same module name.

Member Author:

I think that is covered in test_duplicate_module, no?

Member Author:

I'll merge this now; if that's not the case I can add more tests in another PR.

        preds, refs = DummyMetric.predictions_and_references()
        dummy_metric = DummyMetric()
        another_dummy_metric = AnotherDummyMetric()

        dummy_result_1 = DummyMetric.expected_results()
        dummy_result_2 = AnotherDummyMetric.expected_results()

        dummy_result_1[dummy_metric.name + "_set_equality"] = dummy_result_1.pop("set_equality")
        dummy_result_1[another_dummy_metric.name + "_set_equality"] = dummy_result_2["set_equality"]

        combined_evaluation = combine([dummy_metric, another_dummy_metric])

        self.assertDictEqual(dummy_result_1, combined_evaluation.compute(predictions=preds, references=refs))

    def test_modules_from_string(self):
        expected_result = {"accuracy": 0.5, "recall": 0.5, "precision": 1.0}
        predictions = [0, 1]
        references = [1, 1]

        combined_evaluation = combine(["accuracy", "recall", "precision"])

        self.assertDictEqual(
            expected_result, combined_evaluation.compute(predictions=predictions, references=references)
        )