huggingface · sashavor · Jul 6, 2022 · Jun 28, 2022 · Jun 28, 2022 · Jun 28, 2022
diff --git a/metrics/meteor/README.md b/metrics/meteor/README.md
@@ -42,9 +42,9 @@ METEOR is based on a generalized concept of unigram matching between the machine
 
 METEOR has two mandatory arguments:
 
-`predictions`: a list of predictions to score. Each prediction should be a string with tokens separated by spaces.
+`predictions`: a `list` of predictions to score. Each prediction should be a string with tokens separated by spaces.
 
-`references`: a list of references for each prediction. Each reference should be a string with tokens separated by spaces.
+`references`: a `list` of references (in the case of one `reference` per `prediction`), or a `list` of `lists` of references (in the case of multiple `references` per `prediction`). Each reference should be a string with tokens separated by spaces.
 
 It also has several optional parameters:
 
@@ -65,7 +65,10 @@ Refer to the [METEOR paper](https://aclanthology.org/W05-0909.pdf) for more info
 
 ## Output values
 
-The metric outputs a dictionary containing the METEOR score. Its values range from 0 to 1. 
+The metric outputs a dictionary containing the METEOR score. Its values range from 0 to 1, e.g.:
+```
+{'meteor': 0.9999142661179699}
+```
 
 
 ### Values from popular papers
@@ -74,34 +77,34 @@ The [METEOR paper](https://aclanthology.org/W05-0909.pdf) does not report METEOR
 
 ## Examples 
 
-Maximal values :
+One `reference` per `prediction`:
 
 ```python
 >>> meteor = evaluate.load('meteor')
 >>> predictions = ["It is a guide to action which ensures that the military always obeys the commands of the party"]
->>> references = ["It is a guide to action which ensures that the military always obeys the commands of the party"]
->>> results = meteor.compute(predictions=predictions, references=references)
+>>> reference = ["It is a guide to action which ensures that the military always obeys the commands of the party"]
+>>> results = meteor.compute(predictions=predictions, references=reference)
 >>> print(round(results['meteor'], 2))
 1.0
 ```
 
-Minimal values:
+Multiple `references` per `prediction`:
 
 ```python
 >>> meteor = evaluate.load('meteor')
 >>> predictions = ["It is a guide to action which ensures that the military always obeys the commands of the party"]
->>> references = ["Hello world"]
+>>> references = [['It is a guide to action which ensures that the military always obeys the commands of the party', 'It is the guiding principle which guarantees the military forces always being under the command of the Party', 'It is the practical guide for the army always to heed the directions of the party']]
 >>> results = meteor.compute(predictions=predictions, references=references)
 >>> print(round(results['meteor'], 2))
-0.0
+1.0
 ```
 
-Partial match:
+Multiple `references` per `prediction`, partial match:
 
 ```python
 >>> meteor = evaluate.load('meteor')
 >>> predictions = ["It is a guide to action which ensures that the military always obeys the commands of the party"]
->>> references = ["It is a guide to action that ensures that the military will forever heed Party commands"]
+>>> references = [['It is a guide to action that ensures that the military will forever heed Party commands', 'It is the guiding principle which guarantees the military forces always being under the command of the Party', 'It is the practical guide for the army always to heed the directions of the party']]
 >>> results = meteor.compute(predictions=predictions, references=references)
 >>> print(round(results['meteor'], 2))
 0.69

diff --git a/metrics/meteor/meteor.py b/metrics/meteor/meteor.py
@@ -89,12 +89,20 @@ def _info(self):
             description=_DESCRIPTION,
             citation=_CITATION,
             inputs_description=_KWARGS_DESCRIPTION,
-            features=datasets.Features(
-                {
-                    "predictions": datasets.Value("string", id="sequence"),
-                    "references": datasets.Value("string", id="sequence"),
-                }
-            ),
+            features=[
+                datasets.Features(
+                    {
+                        "predictions": datasets.Value("string", id="sequence"),
+                        "references": datasets.Sequence(datasets.Value("string", id="sequence"), id="references"),
+                    }
+                ),
+                datasets.Features(
+                    {
+                        "predictions": datasets.Value("string", id="sequence"),
+                        "references": datasets.Value("string", id="sequence"),
+                    }
+                ),
+            ],
             codebase_urls=["https://github.com/nltk/nltk/blob/develop/nltk/translate/meteor_score.py"],
             reference_urls=[
                 "https://www.nltk.org/api/nltk.translate.html#module-nltk.translate.meteor_score",
@@ -112,17 +120,55 @@ def _download_and_prepare(self, dl_manager):
             nltk.download("omw-1.4")
 
     def _compute(self, predictions, references, alpha=0.9, beta=3, gamma=0.5):
+        """Compute the mean METEOR score over all prediction/reference pairs.
+
+        Args:
+            predictions: list of prediction strings.
+            references: list of reference strings (one per prediction), or a
+                list of lists of reference strings (several per prediction).
+                The layout is detected from the type of the first element.
+            alpha, beta, gamma: METEOR parameters forwarded unchanged to NLTK.
+
+        Returns:
+            A dict ``{"meteor": <mean of the per-prediction scores>}``.
+        """
+        multiple_refs = isinstance(references[0], list)
         if NLTK_VERSION >= version.Version("3.6.5"):
-            scores = [
-                meteor_score.single_meteor_score(
-                    word_tokenize(ref), word_tokenize(pred), alpha=alpha, beta=beta, gamma=gamma
-                )
-                for ref, pred in zip(references, predictions)
-            ]
+            # the version of METEOR in NLTK version 3.6.5 and later expects tokenized inputs
+            if multiple_refs:
+                scores = [
+                    meteor_score.meteor_score(
+                        [word_tokenize(ref) for ref in refs],
+                        word_tokenize(pred),
+                        alpha=alpha,
+                        beta=beta,
+                        gamma=gamma,
+                    )
+                    for refs, pred in zip(references, predictions)
+                ]
+            else:
+                scores = [
+                    meteor_score.single_meteor_score(
+                        word_tokenize(ref), word_tokenize(pred), alpha=alpha, beta=beta, gamma=gamma
+                    )
+                    for ref, pred in zip(references, predictions)
+                ]
         else:
-            scores = [
-                meteor_score.single_meteor_score(ref, pred, alpha=alpha, beta=beta, gamma=gamma)
-                for ref, pred in zip(references, predictions)
-            ]
+            # NLTK releases before 3.6.5 are given raw strings, each prediction with its own reference group
+            if multiple_refs:
+                scores = [
+                    meteor_score.meteor_score(
+                        refs,
+                        pred,
+                        alpha=alpha,
+                        beta=beta,
+                        gamma=gamma,
+                    )
+                    for refs, pred in zip(references, predictions)
+                ]
+            else:
+                scores = [
+                    meteor_score.single_meteor_score(ref, pred, alpha=alpha, beta=beta, gamma=gamma)
+                    for ref, pred in zip(references, predictions)
+                ]
 
         return {"meteor": np.mean(scores)}