Add comparisons and measurements to code quality tests #204

Merged · 4 commits · Jul 26, 2022
4 changes: 2 additions & 2 deletions .circleci/config.yml
@@ -47,8 +47,8 @@ jobs:
steps:
- checkout
- run: sudo pip install .[quality]
- run: black --check --line-length 119 --target-version py36 tests src metrics
- run: isort --check-only tests src metrics
- run: black --check --line-length 119 --target-version py36 tests src metrics comparisons measurements
- run: isort --check-only tests src metrics comparisons measurements
- run: flake8 tests src metrics

workflows:
4 changes: 2 additions & 2 deletions Makefile
@@ -4,14 +4,14 @@

quality:
black --check --line-length 119 --target-version py36 tests src metrics comparisons measurements
isort --check-only tests src metrics
isort --check-only tests src metrics measurements
flake8 tests src metrics

# Format source code automatically

style:
black --line-length 119 --target-version py36 tests src metrics comparisons measurements
isort tests src metrics
isort tests src metrics measurements

# Run tests for the library

2 changes: 1 addition & 1 deletion measurements/perplexity/app.py
@@ -2,5 +2,5 @@
from evaluate.utils import launch_gradio_widget


module = evaluate.load("perplexity", module_type= "measurement")
module = evaluate.load("perplexity", module_type="measurement")
launch_gradio_widget(module)
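For context, app.py only wires the module into a Gradio widget; the same load call can be used directly. A minimal sketch follows, where the model id, the sample texts, and the compute keywords are illustrative assumptions rather than anything shown in this diff:

```python
import evaluate

# Same load call as app.py above.
perplexity = evaluate.load("perplexity", module_type="measurement")

# Hypothetical inputs: a small causal LM and a couple of texts to score.
# The `model_id` and `data` keywords are assumed to match the measurement's signature.
results = perplexity.compute(model_id="gpt2", data=["lorem ipsum", "Happy Birthday!"])
print(results)
```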
28 changes: 18 additions & 10 deletions measurements/text_duplicates/text_duplicates.py
@@ -12,10 +12,13 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import evaluate
import datasets
from collections import Counter
import hashlib
from collections import Counter

import datasets

import evaluate


logger = evaluate.logging.get_logger(__name__)

@@ -47,10 +50,13 @@

# TODO: Add BibTeX citation
_CITATION = ""


def get_hash(example):
"""Get the hash of a string"""
return hashlib.md5(example.strip().encode("utf-8")).hexdigest()


@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class TextDuplicates(evaluate.Measurement):
"""This measurement returns the duplicate strings contained in the input(s)."""
@@ -64,19 +70,21 @@ def _info(self):
citation=_CITATION,
inputs_description=_KWARGS_DESCRIPTION,
# This defines the format of each prediction and reference
features=datasets.Features({
'data': datasets.Value('string'),
})
features=datasets.Features(
{
"data": datasets.Value("string"),
}
),
)

def _compute(self, data, list_duplicates = False):
def _compute(self, data, list_duplicates=False):
"""Returns the duplicates contained in the input data and the number of times they are repeated."""
if list_duplicates == True:
logger.warning("This functionality can be memory-intensive for large datasets!")
n_dedup = len(set([get_hash(d) for d in data]))
c = Counter(data)
duplicates = {k: v for k, v in c.items() if v > 1}
return {"duplicate_fraction": 1 - (n_dedup/len(data)), "duplicates_list": duplicates}
return {"duplicate_fraction": 1 - (n_dedup / len(data)), "duplicates_list": duplicates}
else:
n_dedup = len(set([get_hash(d) for d in data]))
return {"duplicate_fraction": 1 - (n_dedup/len(data))}
n_dedup = len(set([get_hash(d) for d in data]))
return {"duplicate_fraction": 1 - (n_dedup / len(data))}
19 changes: 12 additions & 7 deletions measurements/word_count/word_count.py
@@ -12,10 +12,12 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import evaluate
import datasets
from sklearn.feature_extraction.text import CountVectorizer

import evaluate


_DESCRIPTION = """
Returns the total number of words, and the number of unique words in the input data.
"""
@@ -38,24 +40,27 @@
"""
_CITATION = ""


@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class WordCount(evaluate.Measurement):
"""This measurement returns the total number of words and the number of unique words
in the input string(s)."""
in the input string(s)."""

def _info(self):
return evaluate.MeasurementInfo(
# This is the description that will appear on the modules page.
module_type="measurement",
description=_DESCRIPTION,
citation = _CITATION,
citation=_CITATION,
inputs_description=_KWARGS_DESCRIPTION,
features=datasets.Features({
'data': datasets.Value('string'),
})
features=datasets.Features(
{
"data": datasets.Value("string"),
}
),
)

def _compute(self, data, max_vocab = None):
def _compute(self, data, max_vocab=None):
"""Returns the number of unique words in the input data"""
count_vectorizer = CountVectorizer(max_features=max_vocab)
document_matrix = count_vectorizer.fit_transform(data)
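The rest of `_compute` is collapsed in this diff, so the exact keys it returns are not visible here; a hedged usage sketch under that caveat:

```python
import evaluate

# Assumed loadable by name, like the other measurements in this PR.
word_count = evaluate.load("word_count", module_type="measurement")

# CountVectorizer builds the vocabulary; the optional max_vocab caps its size.
results = word_count.compute(data=["hello world", "hello evaluate"], max_vocab=None)
print(results)  # expected to include a total word count and a unique-word count
```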
16 changes: 11 additions & 5 deletions measurements/word_length/word_length.py
@@ -12,10 +12,12 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from statistics import mean

import datasets
from nltk import word_tokenize

import evaluate
import datasets
from statistics import mean


_DESCRIPTION = """
@@ -49,6 +51,7 @@
}
"""


@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class WordLength(evaluate.Measurement):
"""This measurement returns the average number of words in the input string(s)."""
@@ -62,13 +65,16 @@ def _info(self):
citation=_CITATION,
inputs_description=_KWARGS_DESCRIPTION,
# This defines the format of each prediction and reference
features=datasets.Features({
'data': datasets.Value('string'),
})
features=datasets.Features(
{
"data": datasets.Value("string"),
}
),
)

def _download_and_prepare(self, dl_manager):
import nltk

nltk.download("punkt")

def _compute(self, data, tokenizer=word_tokenize):
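`_compute` is likewise truncated here; a sketch of the intended call, assuming the default NLTK `word_tokenize` tokenizer imported above and the load-by-name pattern used elsewhere in this PR:

```python
import evaluate

# Assumed loadable by name; _download_and_prepare fetches NLTK's "punkt" data first.
word_length = evaluate.load("word_length", module_type="measurement")

results = word_length.compute(data=["hello world", "this is a longer sentence"])
print(results)  # expected to report the average described in the class docstring above
```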