Add TokenClassificationEvaluator #167

Merged on Jul 21, 2022 (32 commits). Changes shown are from 27 of the 32 commits.

Commits (32)
1103674 added token-classification evaluator (fxmarty, Jun 29, 2022)
4fdcbbf refactor (fxmarty, Jun 29, 2022)
50906f1 style (fxmarty, Jun 29, 2022)
1e748a0 renaming (fxmarty, Jun 29, 2022)
0b4d295 renaming (fxmarty, Jun 29, 2022)
56e8baa tempo (fxmarty, Jul 1, 2022)
d8a5d88 rebased (fxmarty, Jul 18, 2022)
8e3afa6 added test back (fxmarty, Jul 18, 2022)
02e955f new version (fxmarty, Jul 18, 2022)
930c9bb add parity test (fxmarty, Jul 18, 2022)
5a5b0e9 better doc (fxmarty, Jul 18, 2022)
22b8c28 style (fxmarty, Jul 18, 2022)
b34cb05 better doc (fxmarty, Jul 18, 2022)
f41949a better doc (fxmarty, Jul 18, 2022)
2740fcf yet again better doc (fxmarty, Jul 18, 2022)
80d4859 fix doc (fxmarty, Jul 18, 2022)
fbfbec7 fix doc (fxmarty, Jul 18, 2022)
a6d0001 trigger build (fxmarty, Jul 18, 2022)
f00600d merged (fxmarty, Jul 19, 2022)
24349f8 fix with qa merged (fxmarty, Jul 19, 2022)
3750408 Update src/evaluate/evaluator/token_classification.py (fxmarty, Jul 19, 2022)
60b7a15 better doc (fxmarty, Jul 19, 2022)
791986c Merge branch 'main' into add-token-classification-evaluator (fxmarty, Jul 20, 2022)
9485c8d fix (fxmarty, Jul 20, 2022)
797919a Update src/evaluate/evaluator/token_classification.py (fxmarty, Jul 20, 2022)
bc1081c Update src/evaluate/evaluator/token_classification.py (fxmarty, Jul 20, 2022)
90e08e6 better doc (fxmarty, Jul 20, 2022)
7d81c46 Update src/evaluate/evaluator/token_classification.py (fxmarty, Jul 20, 2022)
69cee88 Update src/evaluate/evaluator/token_classification.py (fxmarty, Jul 20, 2022)
42ec88f Update src/evaluate/evaluator/token_classification.py (fxmarty, Jul 20, 2022)
9f40462 Merge branch 'main' into add-token-classification-evaluator (fxmarty, Jul 20, 2022)
728b81a hopefully we pass the tests this time (fxmarty, Jul 20, 2022)
5 changes: 5 additions & 0 deletions docs/source/package_reference/evaluator_classes.mdx
@@ -26,3 +26,8 @@ The base class for all evaluator classes:
### TextClassificationEvaluator

[[autodoc]] evaluate.TextClassificationEvaluator

### TokenClassificationEvaluator

[[autodoc]] evaluate.TokenClassificationEvaluator
- compute
1 change: 1 addition & 0 deletions src/evaluate/__init__.py
@@ -31,6 +31,7 @@
ImageClassificationEvaluator,
QuestionAnsweringEvaluator,
TextClassificationEvaluator,
TokenClassificationEvaluator,
evaluator,
)
from .hub import push_to_hub
7 changes: 7 additions & 0 deletions src/evaluate/evaluator/__init__.py
@@ -28,6 +28,7 @@
from .image_classification import ImageClassificationEvaluator
from .question_answering import QuestionAnsweringEvaluator
from .text_classification import TextClassificationEvaluator
from .token_classification import TokenClassificationEvaluator


SUPPORTED_EVALUATOR_TASKS = {
@@ -43,6 +44,10 @@
"implementation": QuestionAnsweringEvaluator,
"default_metric_name": "squad",
},
"token-classification": {
"implementation": TokenClassificationEvaluator,
"default_metric_name": "seqeval",
},
}


@@ -64,6 +69,7 @@ def check_task(task: str) -> Dict:
- `"image-classification"`
- `"question-answering"`
- `"text-classification"` (alias `"sentiment-analysis"` available)
- `"token-classification"`
Returns:
task_defaults: `dict`, contains the implementation class of a given Evaluator and the default metric name.
"""
@@ -87,6 +93,7 @@ def evaluator(task: str = None) -> Evaluator:
- `"image-classification"`: will return a [`ImageClassificationEvaluator`].
- `"question-answering"`: will return a [`QuestionAnsweringEvaluator`].
- `"text-classification"` (alias `"sentiment-analysis"` available): will return a [`TextClassificationEvaluator`].
- `"token-classification"`: will return a [`TokenClassificationEvaluator`].
Returns:
[`Evaluator`]: An evaluator suitable for the task.
Examples:
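As an illustrative aside (a minimal sketch, not part of the diff, assuming `evaluate` with this change plus `transformers` and `scipy` are installed), the registration above lets the `evaluator` factory dispatch to the new class:

```python
from evaluate import TokenClassificationEvaluator, evaluator

# "token-classification" now resolves to TokenClassificationEvaluator,
# with "seqeval" as its default metric (see SUPPORTED_EVALUATOR_TASKS above).
task_evaluator = evaluator("token-classification")
assert isinstance(task_evaluator, TokenClassificationEvaluator)
```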
4 changes: 2 additions & 2 deletions src/evaluate/evaluator/base.py
@@ -29,7 +29,7 @@

try:
import transformers
from transformers import FeatureExtractionMixin, pipeline
from transformers import pipeline

TRANSFORMERS_AVAILABLE = True
except ImportError:
@@ -141,7 +141,7 @@ def compute(
data: Union[str, Dataset] = None,
metric: Union[str, EvaluationModule] = None,
tokenizer: Optional[Union[str, "PreTrainedTokenizer"]] = None, # noqa: F821
feature_extractor: Optional[Union[str, "FeatureExtractionMixin"]] = None,
feature_extractor: Optional[Union[str, "FeatureExtractionMixin"]] = None, # noqa: F821
strategy: Literal["simple", "bootstrap"] = "simple",
confidence_level: float = 0.95,
n_resamples: int = 9999,
4 changes: 2 additions & 2 deletions src/evaluate/evaluator/image_classification.py
@@ -49,11 +49,11 @@ def compute(self, input_column: str = "image", *args, **kwargs) -> Tuple[Dict[st
metric (`str` or `EvaluationModule`, defaults to `None`):
Specifies the metric we use in evaluator. If it is of type `str`, we treat it as the metric name, and
load it. Otherwise we assume it represents a pre-loaded metric.
feature_extractor: (`str` or `FeatureExtractionMixin`, *optional*, defaults to `None`):
feature_extractor (`str` or `FeatureExtractionMixin`, *optional*, defaults to `None`):
Argument can be used to overwrite a default feature extractor if `model_or_pipeline` represents a model for
which we build a pipeline. If `model_or_pipeline` is `None` or a pre-initialized pipeline, we ignore
this argument.
strategy: (`Literal["simple", "bootstrap"]`, defaults to "simple"):
strategy (`Literal["simple", "bootstrap"]`, defaults to "simple"):
specifies the evaluation strategy. Possible values are:
- `"simple"` - we evaluate the metric and return the scores.
- `"bootstrap"` - on top of computing the metric scores, we calculate the confidence interval for each
4 changes: 2 additions & 2 deletions src/evaluate/evaluator/text_classification.py
@@ -53,11 +53,11 @@ def compute(self, *args, **kwargs) -> Tuple[Dict[str, float], Any]:
metric (`str` or `EvaluationModule`, defaults to `None`):
Specifies the metric we use in evaluator. If it is of type `str`, we treat it as the metric name, and
load it. Otherwise we assume it represents a pre-loaded metric.
tokenizer: (`str` or `PreTrainedTokenizer`, *optional*, defaults to `None`):
tokenizer (`str` or `PreTrainedTokenizer`, *optional*, defaults to `None`):
Argument can be used to overwrite a default tokenizer if `model_or_pipeline` represents a model for
which we build a pipeline. If `model_or_pipeline` is `None` or a pre-initialized pipeline, we ignore
this argument.
strategy: (`Literal["simple", "bootstrap"]`, defaults to "simple"):
strategy (`Literal["simple", "bootstrap"]`, defaults to "simple"):
specifies the evaluation strategy. Possible values are:
- `"simple"` - we evaluate the metric and return the scores.
- `"bootstrap"` - on top of computing the metric scores, we calculate the confidence interval for each
294 changes: 294 additions & 0 deletions src/evaluate/evaluator/token_classification.py
@@ -0,0 +1,294 @@
# Copyright 2022 The HuggingFace Evaluate Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Any, Callable, Dict, List, Optional, Tuple, Union

from datasets import ClassLabel, Dataset, Sequence
from typing_extensions import Literal

from .base import Evaluator


class TokenClassificationEvaluator(Evaluator):
"""
Token classification evaluator.

This token classification evaluator can currently be loaded from [`evaluator`] using the default task name
`token-classification`.

Methods in this class assume a data format compatible with the [`TokenClassificationPipeline`].
"""

PIPELINE_KWARGS = {"ignore_labels": []}

def __init__(self, task="token-classification", default_metric_name=None):
super().__init__(task, default_metric_name=default_metric_name)

def predictions_processor(self, predictions: List[List[Dict]], words: List[List[str]], join_by: str):
"""
Transform the pipeline predictions into a list of predicted labels of the same length as the true labels.

Args:
predictions (List[List[Dict]]): List of pipeline predictions, where each token has been labeled.
words (List[List[str]]): Original input data to the pipeline, used to build predicted labels of the same length.
join_by (str): String to use to join two words. In English, it will typically be " ".

Returns:
Dict: a dictionary holding the predictions
"""
preds = []

# iterate over the data rows
for i, prediction in enumerate(predictions):
pred_processed = []

# get a list of tuples giving the indexes of the start and end character of each word
words_offsets = self.words_to_offsets(words[i], join_by)

token_index = 0
for word_offset in words_offsets:
# for each word, we keep only the predicted label of its first token and discard the others
while prediction[token_index]["start"] < word_offset[0]:
token_index += 1

if prediction[token_index]["start"] > word_offset[0]:  # no token starts exactly at this word's first character
pred_processed.append("O")
elif prediction[token_index]["start"] == word_offset[0]:
pred_processed.append(prediction[token_index]["entity"])

preds.append(pred_processed)

return {"predictions": preds}

def words_to_offsets(self, words: List[str], join_by: str):
"""
Convert a list of words to a list of character offsets, where words are joined by `join_by`.

Args:
words (List[str]): List of words to get offsets from.
join_by (str): String to insert between words.

Returns:
List[Tuple[int, int]]: List of (start character index, end character index) tuples for each of the words.
"""
offsets = []

start = 0
for word in words:
end = start + len(word) - 1
offsets.append((start, end))
start = end + len(join_by) + 1

return offsets

def prepare_data(self, data: Union[str, Dataset], input_column: str, label_column: str, join_by: str):
super().prepare_data(data, input_column, label_column)

if not isinstance(data.features[input_column], Sequence) or not isinstance(
data.features[label_column], Sequence
):
raise ValueError(
"TokenClassificationEvaluator expects the input and label columns to be provided as lists."
)

# If the labels are of type ClassLabel, they are already integers and the feature itself stores the mapping from ids to label names.
# Otherwise, the labels are expected to already be strings.
labels_are_int = isinstance(data.features[label_column].feature, ClassLabel)
if labels_are_int:
label_list = data.features[label_column].feature.names # list of string labels
id_to_label = {i: label for i, label in enumerate(label_list)}
references = [[id_to_label[label_id] for label_id in label_ids] for label_ids in data[label_column]]
elif data.features[label_column].feature.dtype.startswith("int"):
raise NotImplementedError(
"References provided as integers, but the reference column is not a Sequence of ClassLabels."
)
else:
# If the labels are not a `Sequence[ClassLabel]`, we already have the labels as strings.
# An example is labels as ["PER", "PER", "O", "LOC", "O", "LOC", "O"], e.g. in the polyglot_ner dataset.
references = data[label_column]

metric_inputs = {"references": references}
pipeline_inputs = [join_by.join(element) for element in data[input_column]]

return metric_inputs, pipeline_inputs

def prepare_pipeline(
self,
model_or_pipeline: Union[str, "Pipeline", Callable, "PreTrainedModel", "TFPreTrainedModel"], # noqa: F821
tokenizer: Union["PreTrainedTokenizerBase", "FeatureExtractionMixin"] = None, # noqa: F821
feature_extractor: Union["PreTrainedTokenizerBase", "FeatureExtractionMixin"] = None, # noqa: F821
):
pipe = super().prepare_pipeline(model_or_pipeline, tokenizer, feature_extractor)

# check that the pipeline outputs start character indices in its predictions
dummy_output = pipe(["2003 New York Gregory"], **self.PIPELINE_KWARGS)
if dummy_output[0][0]["start"] is None:
raise ValueError(
"TokenClassificationEvaluator supports only pipelines giving 'start' index as a pipeline output (got None). "
"Transformers pipelines with a slow tokenizer will raise this error."
)

return pipe

def compute(
self,
model_or_pipeline: Union[
str, "Pipeline", Callable, "PreTrainedModel", "TFPreTrainedModel" # noqa: F821
] = None,
data: Union[str, Dataset] = None,
metric: Union[str, "EvaluationModule"] = None, # noqa: F821
tokenizer: Optional[Union[str, "PreTrainedTokenizer"]] = None, # noqa: F821
strategy: Literal["simple", "bootstrap"] = "simple",
confidence_level: float = 0.95,
n_resamples: int = 9999,
random_state: Optional[int] = None,
input_column: str = "tokens",
label_column: str = "ner_tags",
join_by: Optional[str] = " ",
) -> Tuple[Dict[str, float], Any]:
"""
Compute the metric for a given pipeline and dataset combination.

Args:
model_or_pipeline (`str` or `Pipeline` or `Callable` or `PreTrainedModel` or `TFPreTrainedModel`, defaults to `None`):
If the argument is not specified, we initialize the default pipeline for the task (in this case
`token-classification`). If the argument is of the type `str` or
is a model instance, we use it to initialize a new `Pipeline` with the given model. Otherwise we assume the
argument specifies a pre-initialized pipeline.
data (`str` or `Dataset`, defaults to `None`):
Specifies the dataset we will run evaluation on. If it is of type `str`, we treat it as the dataset
name, and load it. Otherwise we assume it represents a pre-loaded dataset.
metric (`str` or `EvaluationModule`, defaults to `None`):
Specifies the metric we use in evaluator. If it is of type `str`, we treat it as the metric name, and
load it. Otherwise we assume it represents a pre-loaded metric.
tokenizer (`str` or `PreTrainedTokenizer`, *optional*, defaults to `None`):
Argument can be used to overwrite a default tokenizer if `model_or_pipeline` represents a model for
which we build a pipeline. If `model_or_pipeline` is `None` or a pre-initialized pipeline, we ignore
this argument.
strategy (`Literal["simple", "bootstrap"]`, defaults to "simple"):
specifies the evaluation strategy. Possible values are:

- `"simple"` - we evaluate the metric and return the scores.
- `"bootstrap"` - on top of computing the metric scores, we calculate the confidence interval for each
of the returned metric keys, using `scipy`'s `bootstrap` method
https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.bootstrap.html.
confidence_level (`float`, defaults to `0.95`):
The `confidence_level` value passed to `bootstrap` if `"bootstrap"` strategy is chosen.
n_resamples (`int`, defaults to `9999`):
The `n_resamples` value passed to `bootstrap` if `"bootstrap"` strategy is chosen.
random_state (`int`, *optional*, defaults to `None`):
The `random_state` value passed to `bootstrap` if `"bootstrap"` strategy is chosen. Useful for
debugging.
input_column (`str`, defaults to `"tokens"`):
the name of the column containing the tokens feature in the dataset specified by `data`.
label_column (`str`, defaults to `"ner_tags"`):
the name of the column containing the labels in the dataset specified by `data`.
join_by (`str`, *optional*, defaults to `" "`):
This evaluator supports datasets whose input column is a list of words. This parameter specifies how to join
words to generate a string input. This is especially useful for languages that do not separate words by a space.

Return:
A `Dict`. The keys represent metric keys calculated for the `metric` specified in function arguments. For the
`"simple"` strategy, the value is the metric score. For the `"bootstrap"` strategy, the value is a `Dict`
containing the score, the confidence interval and the standard error calculated for each metric key.

The dataset input and label columns are expected to be formatted as a list of words and a list of labels respectively, following the [conll2003 dataset](https://huggingface.co/datasets/conll2003). Datasets whose inputs are single strings and whose labels are a list of offsets are not supported.

Examples:
```python
>>> from evaluate import evaluator
>>> from datasets import load_dataset
>>> task_evaluator = evaluator("token-classification")
>>> data = load_dataset("conll2003", split="validation[:2]")
>>> results = task_evaluator.compute(
>>> model_or_pipeline="elastic/distilbert-base-uncased-finetuned-conll03-english",
>>> data=data,
>>> metric="seqeval",
>>> )
```

<Tip>

For example, the following dataset format is accepted by the evaluator:

```python
dataset = Dataset.from_dict(
mapping={
"tokens": [["New", "York", "is", "a", "city", "and", "Felix", "a", "person", "."]],
"ner_tags": [[1, 2, 0, 0, 0, 0, 3, 0, 0, 0]],
},
features=Features({
"tokens": Sequence(feature=Value(dtype="string")),
"ner_tags": Sequence(feature=ClassLabel(names=["O", "B-LOC", "I-LOC", "B-PER", "I-PER"])),
}),
)
```

</Tip>

<Tip warning={true}>

For example, the following dataset format is **not** accepted by the evaluator:

```python
dataset = Dataset.from_dict(
mapping={
"tokens": [["New York is a city and Felix a person."]],
"starts": [[0, 23]],
"ends": [[7, 27]],
"ner_tags": [["LOC", "PER"]],
},
features=Features({
"tokens": Value(dtype="string"),
"starts": Sequence(feature=Value(dtype="int32")),
"ends": Sequence(feature=Value(dtype="int32")),
"ner_tags": Sequence(feature=Value(dtype="string")),
}),
)
```

</Tip>
"""
result = {}

# Prepare inputs
metric_inputs, pipe_inputs = self.prepare_data(
data=data, input_column=input_column, label_column=label_column, join_by=join_by
)
pipe = self.prepare_pipeline(model_or_pipeline=model_or_pipeline, tokenizer=tokenizer)
metric = self.prepare_metric(metric)

# Compute predictions
predictions, perf_results = self.call_pipeline(pipe, pipe_inputs)
predictions = self.predictions_processor(predictions, data[input_column], join_by)

metric_inputs.update(predictions)

# Compute metrics from references and predictions
metric_results = self.compute_metric(
metric=metric,
metric_inputs=metric_inputs,
strategy=strategy,
confidence_level=confidence_level,
n_resamples=n_resamples,
random_state=random_state,
)

result.update(metric_results)
result.update(perf_results)

return result
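To make the word-alignment logic above concrete, here is a minimal sketch (not part of the diff; the pipeline output below is hand-built for illustration, and it assumes `evaluate`, `transformers`, and `scipy` are installed) of `words_to_offsets` and `predictions_processor` on a row where the tokenizer splits "York" into two sub-tokens:

```python
from evaluate import TokenClassificationEvaluator

te = TokenClassificationEvaluator()
words = [["New", "York", "is", "great"]]  # one dataset row; joined by " " -> "New York is great"

print(te.words_to_offsets(words[0], " "))
# [(0, 2), (4, 7), (9, 10), (12, 16)]

# Hand-built stand-in for a pipeline output: one dict per token, carrying the
# start character index and the predicted entity label.
pipeline_predictions = [[
    {"start": 0, "entity": "B-LOC"},   # "New"
    {"start": 4, "entity": "I-LOC"},   # "Yo"
    {"start": 6, "entity": "I-LOC"},   # "##rk" -> dropped, only a word's first sub-token counts
    {"start": 9, "entity": "O"},       # "is"
    {"start": 12, "entity": "O"},      # "great"
]]

print(te.predictions_processor(pipeline_predictions, words, " "))
# {'predictions': [['B-LOC', 'I-LOC', 'O', 'O']]}
```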