From 99a94dc8c6ce41e079fb9092c0981680c2b8f028 Mon Sep 17 00:00:00 2001 From: Justus Schock <12886177+justusschock@users.noreply.github.com> Date: Wed, 1 Apr 2020 16:30:56 +0200 Subject: [PATCH 01/44] Create utils.py --- pytorch_lightning/metrics/utils.py | 131 +++++++++++++++++++++++++++++ 1 file changed, 131 insertions(+) create mode 100644 pytorch_lightning/metrics/utils.py diff --git a/pytorch_lightning/metrics/utils.py b/pytorch_lightning/metrics/utils.py new file mode 100644 index 0000000000000..9942545273546 --- /dev/null +++ b/pytorch_lightning/metrics/utils.py @@ -0,0 +1,131 @@ +import numbers +from typing import Union, Any, Optional + +import numpy as np +import torch +from torch.utils.data._utils.collate import default_convert + +from pytorch_lightning.utilities.apply_to_collection import apply_to_collection + + +def _apply_to_inputs(func_to_apply, *dec_args, **dec_kwargs): + def decorator_fn(func_to_decorate): + def new_func(*args, **kwargs): + args = func_to_apply(args, *dec_args, **dec_kwargs) + kwargs = func_to_apply(kwargs, *dec_args, **dec_kwargs) + return func_to_decorate(*args, **kwargs) + + return new_func + + return decorator_fn + + +def _apply_to_outputs(func_to_apply, *dec_args, **dec_kwargs): + def decorator_fn(function_to_decorate): + def new_func(*args, **kwargs): + result = function_to_decorate(*args, **kwargs) + return func_to_apply(result, *dec_args, **dec_kwargs) + + return new_func + + return decorator_fn + + +def _convert_to_tensor(data: Any) -> Any: + """ + Maps all kind of collections and numbers to tensors + + Args: + data: the data to convert to tensor + + Returns: + the converted data + + """ + if isinstance(data, numbers.Number): + return torch.tensor([data]) + + else: + return default_convert(data) + + +def _convert_to_numpy(data: Union[torch.Tensor, np.ndarray, numbers.Number]) -> np.ndarray: + """ + converts all tensors and numpy arrays to numpy arrays + Args: + data: the tensor or array to convert to numpy + + Returns: + the resulting numpy array + + """ + if isinstance(data, torch.Tensor): + return data.cpu().detach().numpy() + elif isinstance(data, numbers.Number): + return np.array([data]) + return data + + +def _numpy_metric_conversion(func_to_decorate): + # Applies collection conversion from tensor to numpy to all inputs + # we need to include numpy arrays here, since otherwise they will also be treated as sequences + func_convert_inputs = _apply_to_inputs( + apply_to_collection, (torch.Tensor, np.ndarray, numbers.Number), _convert_to_numpy)(func_to_decorate) + # converts all inputs back to tensors (device doesn't matter here, since this is handled by BaseMetric) + func_convert_in_out = _apply_to_outputs(_convert_to_tensor)(func_convert_inputs) + return func_convert_in_out + + +def _tensor_metric_conversion(func_to_decorate): + # Converts all inputs to tensor if possible + func_convert_inputs = _apply_to_inputs(_convert_to_tensor)(func_to_decorate) + # convert all outputs to tensor if possible + return _apply_to_outputs(_convert_to_tensor)(func_convert_inputs) + + +def _sync_ddp(result: Union[torch.Tensor], + group: Any = torch.distributed.group.WORLD, + reduce_op: torch.distributed.ReduceOp = torch.distributed.ReduceOp.SUM, + ) -> torch.Tensor: + """ + Function to reduce the tensors from several ddp processes to one master process + + Args: + result: the value to sync and reduce (typically tensor or number) + device: the device to put the synced and reduced value to + dtype: the datatype to convert the synced and reduced value to + group: the 
process group to gather results from. Defaults to all processes (world) + reduce_op: the reduction operation. Defaults to sum + + Returns: + reduced value + + """ + + if torch.distributed.is_available() and torch.distributed.is_initialized(): + # sync all processes before reduction + torch.distributed.barrier(group=group) + torch.distributed.all_reduce(result, op=reduce_op, group=group, + async_op=False) + + return result + + +def numpy_metric(group: Any = torch.distributed.group.WORLD, + reduce_op: torch.distributed.ReduceOp = torch.distributed.ReduceOp.SUM): + def decorator_fn(func_to_decorate): + return _apply_to_outputs(apply_to_collection, torch.Tensor, _sync_ddp, + group=group, + reduce_op=reduce_op)(_numpy_metric_conversion(func_to_decorate)) + + return decorator_fn + + +def tensor_metric(group: Any = torch.distributed.group.WORLD, + reduce_op: torch.distributed.ReduceOp = torch.distributed.ReduceOp.SUM): + def decorator_fn(func_to_decorate): + return _apply_to_outputs(apply_to_collection, torch.Tensor, _sync_ddp, + group=group, + reduce_op=reduce_op)(_tensor_metric_conversion(func_to_decorate)) + + return decorator_fn From 4f546b5d7c46e32faf8d9c8722a9ca6467a4c683 Mon Sep 17 00:00:00 2001 From: Justus Schock <12886177+justusschock@users.noreply.github.com> Date: Wed, 1 Apr 2020 16:32:01 +0200 Subject: [PATCH 02/44] Create __init__.py --- tests/metrics/__init__.py | 205 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 205 insertions(+) diff --git a/tests/metrics/__init__.py b/tests/metrics/__init__.py index e69de29bb2d1d..a6dfcf8be94a4 100644 --- a/tests/metrics/__init__.py +++ b/tests/metrics/__init__.py @@ -0,0 +1,205 @@ +import numpy as np +import pytest +import torch +import torch.distributed as dist + +import tests.base.utils as tutils +from pytorch_lightning.metrics.utils import _apply_to_inputs, _apply_to_outputs, \ + _convert_to_tensor, _convert_to_numpy, _numpy_metric_conversion, \ + _tensor_metric_conversion, _sync_ddp, tensor_metric, numpy_metric + + +def test_apply_to_inputs(): + def apply_fn(inputs, factor): + if isinstance(inputs, (float, int)): + return inputs * factor + elif isinstance(inputs, dict): + return {k: apply_fn(v, factor) for k, v in inputs.items()} + elif isinstance(inputs, (tuple, list)): + return [apply_fn(x, factor) for x in inputs] + + @_apply_to_inputs(apply_fn, factor=2.) + def test_fn(*args, **kwargs): + return args, kwargs + + for args in [[], [1., 2.]]: + for kwargs in [{}, {1., 2.}]: + result_args, result_kwargs = test_fn(*args, **kwargs) + assert isinstance(result_args, list) + assert isinstance(result_kwargs, dict) + assert len(result_args) == len(args) + assert len(result_kwargs) == len(kwargs) + assert all([k in result_kwargs for k in kwargs.keys()]) + for arg, result_arg in zip(args, result_args): + assert arg * 2. == result_arg + + for key in kwargs.keys(): + arg = kwargs[key], + result_arg = result_kwargs[key] + assert arg * 2. == result_arg + + +def test_apply_to_outputs(): + def apply_fn(inputs, additional_str): + return str(inputs) + additional_str + + @_apply_to_outputs(apply_fn, additional_str='_str') + def test_fn(*args, **kwargs): + return 'dummy' + + assert test_fn() == 'dummy_str' + + +def test_convert_to_tensor(): + for test_item in [1., np.array([1.])]: + assert isinstance(_convert_to_tensor(test_item), torch.Tensor) + assert test_item.item() == 1. 
+ + +def test_convert_to_numpy(): + for test_item in [1., torch.tensor([1.])]: + result = _convert_to_numpy(test_item) + assert isinstance(result, np.ndarray) + assert result.item() == 1. + + +def test_numpy_metric_conversion(): + @_numpy_metric_conversion + def numpy_test_metric(*args, **kwargs): + for arg in args: + assert isinstance(arg, np.ndarray) + + for v in kwargs.values(): + assert isinstance(v, np.ndarray) + + return 5. + + result = numpy_test_metric(torch.tensor([1.]), dummy_kwarg=2.) + assert isinstance(result, torch.Tensor) + assert result.item() == 5. + + +def test_tensor_metric_conversion(): + @_tensor_metric_conversion + def tensor_test_metric(*args, **kwargs): + for arg in args: + assert isinstance(arg, torch.Tensor) + + for v in kwargs.values(): + assert isinstance(v, torch.Tensor) + + return 5. + + result = tensor_test_metric(np.array([1.]), dummy_kwarg=2.) + assert isinstance(result, torch.Tensor) + assert result.item() == 5. + + +@pytest.mark.skipif(torch.cuda.device_count() < 2, "test requires multi-GPU machine") +def test_sync_reduce_ddp(): + """Make sure sync-reduce works with DDP""" + tutils.reset_seed() + tutils.set_random_master_port() + + dist.init_process_group('gloo') + + tensor = torch.tensor([1.], device='cuda:0') + + reduced_tensor = _sync_ddp(tensor) + + assert reduced_tensor.item() == dist.get_world_size(), \ + 'Sync-Reduce does not work properly with DDP and Tensors' + + number = 1. + reduced_number = _sync_ddp(number) + assert isinstance(reduced_number, torch.Tensor), 'When reducing a number we should get a tensor out' + assert reduced_number.item() == dist.get_world_size(), \ + 'Sync-Reduce does not work properly with DDP and Numbers' + + dist.destroy_process_group() + + +def test_sync_reduce_simple(): + """Make sure sync-reduce works without DDP""" + tensor = torch.tensor([1.], device='cpu') + + reduced_tensor = _sync_ddp(tensor) + + assert torch.allclose(tensor, + reduced_tensor), 'Sync-Reduce does not work properly without DDP and Tensors' + + number = 1. + + reduced_number = _sync_ddp(number) + assert isinstance(reduced_number, torch.Tensor), 'When reducing a number we should get a tensor out' + assert reduced_number.item() == number, 'Sync-Reduce does not work properly without DDP and Numbers' + + +def _test_tensor_metric(is_ddp: bool): + @tensor_metric() + def tensor_test_metric(*args, **kwargs): + for arg in args: + assert isinstance(arg, torch.Tensor) + + for v in kwargs.values(): + assert isinstance(v, torch.Tensor) + + return 5. + + if is_ddp: + factor = dist.get_world_size() + else: + factor = 1. + + result = tensor_test_metric(np.array([1.]), dummy_kwarg=2.) + assert isinstance(result, torch.Tensor) + assert result.item() == 5. * factor + + +@pytest.mark.skipif(torch.cuda.device_count() < 2, "test requires multi-GPU machine") +def test_tensor_metric_ddp(): + tutils.reset_seed() + tutils.set_random_master_port() + + dist.init_process_group('gloo') + _test_tensor_metric(True) + dist.destroy_process_group() + + +def test_tensor_metric_simple(): + _test_tensor_metric(False) + + +def _test_numpy_metric(is_ddp: bool): + @numpy_metric() + def numpy_test_metric(*args, **kwargs): + for arg in args: + assert isinstance(arg, np.ndarray) + + for v in kwargs.values(): + assert isinstance(v, np.ndarray) + + return 5. + + if is_ddp: + factor = dist.get_world_size() + else: + factor = 1. + + result = numpy_test_metric(torch.tensor([1.]), dummy_kwarg=2.) + assert isinstance(result, torch.Tensor) + assert result.item() == 5. 
* factor
+
+
+@pytest.mark.skipif(torch.cuda.device_count() < 2, "test requires multi-GPU machine")
+def test_numpy_metric_ddp():
+    tutils.reset_seed()
+    tutils.set_random_master_port()
+
+    dist.init_process_group('gloo')
+    _test_tensor_metric(True)
+    dist.destroy_process_group()
+
+
+def test_numpy_metric_simple():
+    _test_tensor_metric(False)

From ae19aa897fe50eda66964170e156d1bb50551127 Mon Sep 17 00:00:00 2001
From: Justus Schock
Date: Wed, 1 Apr 2020 16:36:58 +0200
Subject: [PATCH 03/44] redo sklearn metrics

---
 environment.yml                      |   2 +
 pytorch_lightning/metrics/sklearn.py | 141 +++++++++++++++++++++++++++
 requirements-extra.txt               |   3 +-
 3 files changed, 145 insertions(+), 1 deletion(-)
 create mode 100644 pytorch_lightning/metrics/sklearn.py

diff --git a/environment.yml b/environment.yml
index f2718a99c3a45..cad6c002d1a4d 100644
--- a/environment.yml
+++ b/environment.yml
@@ -26,6 +26,8 @@ dependencies:
   - autopep8
   - check-manifest
   - twine==1.13.0
+  - pillow<7.0.0
+  - scikit-learn>=0.16.1
   - pip:
     - test-tube>=0.7.5

diff --git a/pytorch_lightning/metrics/sklearn.py b/pytorch_lightning/metrics/sklearn.py
new file mode 100644
index 0000000000000..d4875b25bfe5b
--- /dev/null
+++ b/pytorch_lightning/metrics/sklearn.py
@@ -0,0 +1,131 @@
+from typing import Any, Optional, Union
+
+import numpy as np
+
+import torch
+
+from pytorch_lightning import _logger as lightning_logger
+from pytorch_lightning.metrics.metric import NumpyMetric
+
+
+class SklearnMetric(NumpyMetric):
+    def __init__(self, metric_name: str,
+                 reduce_group: Any = torch.distributed.group.WORLD,
+                 reduce_op: Any = torch.distributed.ReduceOp.SUM, **kwargs):
+        """
+        Bridge between PyTorch Lightning and scikit-learn metrics
+
+        .. warning::
+            Every metric call will cause a GPU synchronization, which may slow down your code
+
+        .. note::
+            The order of targets and predictions may be different from the order typically used in PyTorch
+
+        Args:
+            metric_name: the metric name to import and compute from scikit-learn.metrics
+            reduce_group: the process group for DDP reduces (only needed for DDP training).
+                Defaults to all processes (world)
+            reduce_op: the operation to perform during reduction within DDP (only needed for DDP training).
+                Defaults to sum.
+            **kwargs: additional keyword arguments (will be forwarded to metric call)
+        """
+        super().__init__(name=metric_name, reduce_group=reduce_group,
+                         reduce_op=reduce_op)
+
+        self.metric_kwargs = kwargs
+
+        lightning_logger.debug(
+            'Every metric call will cause a GPU synchronization, which may slow down your code')
+
+    @property
+    def metric_fn(self):
+        import sklearn.metrics
+        return getattr(sklearn.metrics, self.name)
+
+    def forward(self, *args, **kwargs) -> Union[np.ndarray, int, float]:
+        """
+        Carries the actual metric computation and therefore co
+        Args:
+            *args: Positional arguments forwarded to metric call (should be already converted to numpy)
+            **kwargs: keyword arguments forwarded to metric call (should be already converted to numpy)
+
+        Returns:
+            the metric value (will be converted to tensor by baseclass
+
+        """
+        return self.metric_fn(*args, **kwargs)
+
+
+# metrics : accuracy, auc, average_precision (AP), confusion_matrix, f1, fbeta, hamm, precision, recall, precision_recall_curve, roc, roc_auc, r2, jaccard
+
+class Accuracy(SklearnMetric):
+    def __init__(self, normalize: bool = True,
+                 reduce_group: Any = torch.distributed.group.WORLD,
+                 reduce_op: Any = torch.distributed.ReduceOp.SUM):
+        """
+        Calculates the Accuracy Score
+
+        .. warning::
+            Every metric call will cause a GPU synchronization, which may slow down your code
+
+        Args:
+            normalize: If ``False``, return the number of correctly classified samples.
+                Otherwise, return the fraction of correctly classified samples.
+            reduce_group: the process group for DDP reduces (only needed for DDP training).
+                Defaults to all processes (world)
+            reduce_op: the operation to perform during reduction within DDP (only needed for DDP training).
+                Defaults to sum.
+        """
+        super().__init__(metric_name='accuracy_score',
+                         reduce_group=reduce_group,
+                         reduce_op=reduce_op,
+                         normalize=normalize)
+
+    def forward(self, y_pred: np.ndarray, y_true: np.ndarray,
+                sample_weight: Optional[np.ndarray] = None) -> float:
+        """
+        Computes the accuracy
+        Args:
+            y_pred: the array containing the predictions (already in categorical form)
+            y_true: the array containing the targets (in categorical form)
+            sample_weight: Sample weights.
+
+        Returns:
+            Accuracy Score
+
+
+        """
+        return super().forward(y_pred=y_pred, y_true=y_true, sample_weight=sample_weight)
+
+class AUC(SklearnMetric):
+    def __init__(self, reorder: bool = False,
+                 reduce_group: Any = torch.distributed.group.WORLD,
+                 reduce_op: Any = torch.distributed.ReduceOp.SUM
+                 ):
+        """
+        Calculates the Area Under the Curve using the trapezoidal rule
+
+        .. warning::
+            Every metric call will cause a GPU synchronization, which may slow down your code
+
+        Args:
+            reorder: If ``True``, assume that the curve is ascending in the case of ties, as for an ROC curve.
+                If the curve is non-ascending, the result will be wrong.
+            reduce_group: the process group for DDP reduces (only needed for DDP training).
+                Defaults to all processes (world)
+            reduce_op: the operation to perform during reduction within DDP (only needed for DDP training).
+                Defaults to sum.
+        """
+
+        super().__init__(metric_name='auc',
+                         reduce_group=reduce_group,
+                         reduce_op=reduce_op,
+                         reorder=reorder)
+
+    def forward(self, x: np.ndarray, y: np.ndarray) -> float:
+        """
+        Computes the AUC
+        Args:
+            x: x coordinates.
+            y: y coordinates.
+ + Returns: + AUC calculated with trapezoidal rule + + """ + return super().forward(x=x, y=y) + + + + + diff --git a/requirements-extra.txt b/requirements-extra.txt index 30bc84ab5190b..cdc10043bc858 100644 --- a/requirements-extra.txt +++ b/requirements-extra.txt @@ -9,4 +9,5 @@ trains>=0.14.1 matplotlib>=3.1.1 # no need to install with [pytorch] as pytorch is already installed and torchvision is required only for Horovod examples horovod>=0.19.1 -omegaconf==2.0.0 \ No newline at end of file +omegaconf>=2.0.0 +scikit-learn>=0.16.1 From fec66b4237ec02e5182670b83910c9219a94d615 Mon Sep 17 00:00:00 2001 From: Justus Schock Date: Fri, 3 Apr 2020 14:06:55 +0200 Subject: [PATCH 04/44] add some more metrics --- pytorch_lightning/metrics/sklearn.py | 199 ++++++++++++++++++++++++++- 1 file changed, 193 insertions(+), 6 deletions(-) diff --git a/pytorch_lightning/metrics/sklearn.py b/pytorch_lightning/metrics/sklearn.py index d4875b25bfe5b..d9a8c24d2dd62 100644 --- a/pytorch_lightning/metrics/sklearn.py +++ b/pytorch_lightning/metrics/sklearn.py @@ -1,7 +1,6 @@ -from typing import Any, Optional, Union +from typing import Any, Optional, Union, Sequence import numpy as np - import torch from pytorch_lightning import _logger as lightning_logger @@ -44,7 +43,7 @@ def metric_fn(self): def forward(self, *args, **kwargs) -> Union[np.ndarray, int, float]: """ - Carries the actual metric computation and therefore co + Carries the actual metric computation Args: *args: Positional arguments forwarded to metric call (should be already converted to numpy) **kwargs: keyword arguments forwarded to metric call (should be already converted to numpy) @@ -53,10 +52,8 @@ def forward(self, *args, **kwargs) -> Union[np.ndarray, int, float]: the metric value (will be converted to tensor by baseclass """ - return self.metric_fn(*args, **kwargs) - + return self.metric_fn(*args, **kwargs, **self.metric_kwargs) -# metrics : accuracy, auc, average_precision (AP), confusion_matrix, f1, fbeta, hamm, precision, recall, precision_recall_curve, roc, roc_auc, r2, jaccard class Accuracy(SklearnMetric): def __init__(self, normalize: bool = True, @@ -97,6 +94,7 @@ def forward(self, y_pred: np.ndarray, y_true: np.ndarray, """ return super().forward(y_pred=y_pred, y_true=y_true, sample_weight=sample_weight) + class AUC(SklearnMetric): def __init__(self, reorder: bool = False, reduce_group: Any = torch.distributed.group.WORLD, @@ -136,6 +134,195 @@ def forward(self, x: np.ndarray, y: np.ndarray) -> float: return super().forward(x=x, y=y) +class AveragePrecision(SklearnMetric): + def __init__(self, average: Optional[str] = 'macro', + reduce_group: Any = torch.distributed.group.WORLD, + reduce_op: Any = torch.distributed.ReduceOp.SUM + ): + """ + Calculates the average precision (AP) score. + Args: + average: If None, the scores for each class are returned. Otherwise, this determines the type of + averaging performed on the data: + * If 'micro': Calculate metrics globally by considering each element of the label indicator + matrix as a label. + * If 'macro': Calculate metrics for each label, and find their unweighted mean. + This does not take label imbalance into account. + * If 'weighted': Calculate metrics for each label, and find their average, weighted by + support (the number of true instances for each label). + * If 'samples': Calculate metrics for each instance, and find their average. + reduce_group: the process group for DDP reduces (only needed for DDP training). 
+ Defaults to all processes (world) + reduce_op: the operation to perform during reduction within DDP (only needed for DDP training). + Defaults to sum. + """ + super().__init__('average_precision_score', + reduce_group=reduce_group, + reduce_op=reduce_op, + average=average) + + def forward(self, y_score: np.ndarray, y_true: np.ndarray, + sample_weight: Optional[np.ndarray] = None) -> float: + """ + + Args: + y_score: Target scores, can either be probability estimates of the positive class, + confidence values, or binary decisions. + y_true: True binary labels in binary label indicators. + sample_weight: Sample weights. + Returns: + average precision score + """ + return super().forward(y_score=y_score, y_true=y_true, + sample_weight=sample_weight) + + +class ConfusionMatric(SklearnMetric): + def __init__(self, labels: Optional[Sequence] = None, + reduce_group: Any = torch.distributed.group.WORLD, + reduce_op: Any = torch.distributed.ReduceOp.SUM + ): + """ + Compute confusion matrix to evaluate the accuracy of a classification + By definition a confusion matrix :math:`C` is such that :math:`C_{i, j}` + is equal to the number of observations known to be in group :math:`i` but + predicted to be in group :math:`j`. + + Args: + labels: List of labels to index the matrix. This may be used to reorder + or select a subset of labels. + If none is given, those that appear at least once + in ``y_true`` or ``y_pred`` are used in sorted order. + reduce_group: the process group for DDP reduces (only needed for DDP training). + Defaults to all processes (world) + reduce_op: the operation to perform during reduction within DDP (only needed for DDP training). + Defaults to sum. + """ + super().__init__('confusion_matrix', + reduce_group=reduce_group, + reduce_op=reduce_op, + labels=labels) + + def forward(self, y_pred: np.ndarray, y_true: np.ndarray) -> np.ndarray: + """ + + Args: + y_pred: Estimated targets as returned by a classifier. + y_true: Ground truth (correct) target values. + + Returns: Confusion matrix (array of shape [n_classes, n_classes]) + + """ + return super().forward(y_pred=y_pred, y_true=y_true) + + +class F1(SklearnMetric): + """ + Compute the F1 score, also known as balanced F-score or F-measure + The F1 score can be interpreted as a weighted average of the precision and + recall, where an F1 score reaches its best value at 1 and worst score at 0. + The relative contribution of precision and recall to the F1 score are + equal. The formula for the F1 score is:: + F1 = 2 * (precision * recall) / (precision + recall) + In the multi-class and multi-label case, this is the weighted average of + the F1 score of each class. + + References: + .. [1] `Wikipedia entry for the F1-score + `_ + """ + + def __init__(self, labels: Optional[Sequence] = None, + pos_labels: Union[str, int] = 1, + average: Optional[str] = 'binary', + reduce_group: Any = torch.distributed.group.WORLD, + reduce_op: Any = torch.distributed.ReduceOp.SUM): + """ + + Args: + labels: Integer array of labels. + pos_labels: The class to report if ``average='binary'``. + average: This parameter is required for multiclass/multilabel targets. + If ``None``, the scores for each class are returned. Otherwise, this + determines the type of averaging performed on the data: + ``'binary'``: + Only report results for the class specified by ``pos_label``. + This is applicable only if targets (``y_{true,pred}``) are binary. 
+ ``'micro'``: + Calculate metrics globally by counting the total true positives, + false negatives and false positives. + ``'macro'``: + Calculate metrics for each label, and find their unweighted + mean. This does not take label imbalance into account. + ``'weighted'``: + Calculate metrics for each label, and find their average, weighted + by support (the number of true instances for each label). This + alters 'macro' to account for label imbalance; it can result in an + F-score that is not between precision and recall. + ``'samples'``: + Calculate metrics for each instance, and find their average (only + meaningful for multilabel classification where this differs from + :func:`accuracy_score`). + Note that if ``pos_label`` is given in binary classification with + `average != 'binary'`, only that positive class is reported. This + behavior is deprecated and will change in version 0.18. + reduce_group: the process group for DDP reduces (only needed for DDP training). + Defaults to all processes (world) + reduce_op: the operation to perform during reduction within DDP (only needed for DDP training). + Defaults to sum. + """ + super().__init__('f1_score', + reduce_group=reduce_group, + reduce_op=reduce_op, + labels=labels, + pos_labels=pos_labels, + average=average) + + def forward(self, y_pred: np.ndarray, y_true: np.ndarray, + sample_weight: Optional[np.ndarray] = None) -> Union[np.ndarray, float]: + """ + + Args: + y_pred : Estimated targets as returned by a classifier. + y_true: Ground truth (correct) target values. + sample_weight: Sample weights. + + + Returns: F1 score of the positive class in binary classification or weighted + average of the F1 scores of each class for the multiclass task. + + """ + return super().forward(y_pred=y_pred, y_true=y_true, sample_weight=sample_weight) + + +class FBeta(SklearnMetric): + + pass + + +class Precision(SklearnMetric): + pass + + +class Recall(SklearnMetric): + pass + + +class PrecisionRecallCurve(SklearnMetric): + pass + + +class ROC(SklearnMetric): + pass + + +class AUROC(SklearnMetric): + pass + +class R2(SklearnMetric): + pass +class Jaccard(SklearnMetric): + pass From 08ad7b0756eeb2c11884fc632216cfcdaa6d4c76 Mon Sep 17 00:00:00 2001 From: Justus Schock Date: Mon, 13 Apr 2020 12:56:43 +0200 Subject: [PATCH 05/44] add sklearn metrics --- pytorch_lightning/metrics/sklearn.py | 385 ++++++++++++++++++++++++++- 1 file changed, 373 insertions(+), 12 deletions(-) diff --git a/pytorch_lightning/metrics/sklearn.py b/pytorch_lightning/metrics/sklearn.py index d9a8c24d2dd62..d084f18e810b8 100644 --- a/pytorch_lightning/metrics/sklearn.py +++ b/pytorch_lightning/metrics/sklearn.py @@ -177,7 +177,7 @@ def forward(self, y_score: np.ndarray, y_true: np.ndarray, sample_weight=sample_weight) -class ConfusionMatric(SklearnMetric): +class ConfusionMatrix(SklearnMetric): def __init__(self, labels: Optional[Sequence] = None, reduce_group: Any = torch.distributed.group.WORLD, reduce_op: Any = torch.distributed.ReduceOp.SUM @@ -296,33 +296,394 @@ def forward(self, y_pred: np.ndarray, y_true: np.ndarray, class FBeta(SklearnMetric): + """ + Compute the F-beta score.The `beta` parameter determines the weight of precision in the combined + score. ``beta < 1`` lends more weight to precision, while ``beta > 1`` + favors recall (``beta -> 0`` considers only precision, ``beta -> inf`` + only recall). + + References: + .. [1] R. Baeza-Yates and B. Ribeiro-Neto (2011). + Modern Information Retrieval. Addison Wesley, pp. 327-328. + .. 
[2] `Wikipedia entry for the F1-score + `_ + """ - pass + def __init__(self, beta: float, labels: Optional[Sequence] = None, + pos_labels: Union[str, int] = 1, + average: Optional[str] = 'binary', + reduce_group: Any = torch.distributed.group.WORLD, + reduce_op: Any = torch.distributed.ReduceOp.SUM): + """ + + Args: + beta: Weight of precision in harmonic mean. + labels: Integer array of labels. + pos_labels: The class to report if ``average='binary'``. + average: This parameter is required for multiclass/multilabel targets. + If ``None``, the scores for each class are returned. Otherwise, this + determines the type of averaging performed on the data: + ``'binary'``: + Only report results for the class specified by ``pos_label``. + This is applicable only if targets (``y_{true,pred}``) are binary. + ``'micro'``: + Calculate metrics globally by counting the total true positives, + false negatives and false positives. + ``'macro'``: + Calculate metrics for each label, and find their unweighted + mean. This does not take label imbalance into account. + ``'weighted'``: + Calculate metrics for each label, and find their average, weighted + by support (the number of true instances for each label). This + alters 'macro' to account for label imbalance; it can result in an + F-score that is not between precision and recall. + ``'samples'``: + Calculate metrics for each instance, and find their average (only + meaningful for multilabel classification where this differs from + :func:`accuracy_score`). + Note that if ``pos_label`` is given in binary classification with + `average != 'binary'`, only that positive class is reported. This + behavior is deprecated and will change in version 0.18. + reduce_group: the process group for DDP reduces (only needed for DDP training). + Defaults to all processes (world) + reduce_op: the operation to perform during reduction within DDP (only needed for DDP training). + Defaults to sum. + """ + super().__init__('fbeta_score', + reduce_group=reduce_group, + reduce_op=reduce_op, + beta=beta, + labels=labels, + pos_labels=pos_labels, + average=average) + + def forward(self, y_pred: np.ndarray, y_true: np.ndarray, + sample_weight: Optional[np.ndarray] = None) -> Union[np.ndarray, float]: + """ + + Args: + y_pred : Estimated targets as returned by a classifier. + y_true: Ground truth (correct) target values. + sample_weight: Sample weights. + + + Returns: FBeta score of the positive class in binary classification or weighted + average of the FBeta scores of each class for the multiclass task. + + """ + return super().forward(y_pred=y_pred, y_true=y_true, sample_weight=sample_weight) class Precision(SklearnMetric): - pass + """ + Compute the precision + The precision is the ratio ``tp / (tp + fp)`` where ``tp`` is the number of + true positives and ``fp`` the number of false positives. The precision is + intuitively the ability of the classifier not to label as positive a sample + that is negative. + The best value is 1 and the worst value is 0. + + """ + def __init__(self, labels: Optional[Sequence] = None, + pos_labels: Union[str, int] = 1, + average: Optional[str] = 'binary', + reduce_group: Any = torch.distributed.group.WORLD, + reduce_op: Any = torch.distributed.ReduceOp.SUM): + """ + + Args: + labels: Integer array of labels. + pos_labels: The class to report if ``average='binary'``. + average: This parameter is required for multiclass/multilabel targets. + If ``None``, the scores for each class are returned. 
Otherwise, this + determines the type of averaging performed on the data: + ``'binary'``: + Only report results for the class specified by ``pos_label``. + This is applicable only if targets (``y_{true,pred}``) are binary. + ``'micro'``: + Calculate metrics globally by counting the total true positives, + false negatives and false positives. + ``'macro'``: + Calculate metrics for each label, and find their unweighted + mean. This does not take label imbalance into account. + ``'weighted'``: + Calculate metrics for each label, and find their average, weighted + by support (the number of true instances for each label). This + alters 'macro' to account for label imbalance; it can result in an + F-score that is not between precision and recall. + ``'samples'``: + Calculate metrics for each instance, and find their average (only + meaningful for multilabel classification where this differs from + :func:`accuracy_score`). + Note that if ``pos_label`` is given in binary classification with + `average != 'binary'`, only that positive class is reported. This + behavior is deprecated and will change in version 0.18. + reduce_group: the process group for DDP reduces (only needed for DDP training). + Defaults to all processes (world) + reduce_op: the operation to perform during reduction within DDP (only needed for DDP training). + Defaults to sum. + """ + super().__init__('precision_score', + reduce_group=reduce_group, + reduce_op=reduce_op, + labels=labels, + pos_labels=pos_labels, + average=average) + + def forward(self, y_pred: np.ndarray, y_true: np.ndarray, + sample_weight: Optional[np.ndarray] = None) -> Union[np.ndarray, float]: + """ + + Args: + y_pred : Estimated targets as returned by a classifier. + y_true: Ground truth (correct) target values. + sample_weight: Sample weights. + + + Returns: Precision of the positive class in binary classification or weighted + average of the precision of each class for the multiclass task. + + """ + return super().forward(y_pred=y_pred, y_true=y_true, sample_weight=sample_weight) class Recall(SklearnMetric): - pass + """ + Compute the recall + The recall is the ratio ``tp / (tp + fn)`` where ``tp`` is the number of + true positives and ``fn`` the number of false negatives. The recall is + intuitively the ability of the classifier to find all the positive samples. + The best value is 1 and the worst value is 0. + + """ + + def __init__(self, labels: Optional[Sequence] = None, + pos_labels: Union[str, int] = 1, + average: Optional[str] = 'binary', + reduce_group: Any = torch.distributed.group.WORLD, + reduce_op: Any = torch.distributed.ReduceOp.SUM): + """ + + Args: + labels: Integer array of labels. + pos_labels: The class to report if ``average='binary'``. + average: This parameter is required for multiclass/multilabel targets. + If ``None``, the scores for each class are returned. Otherwise, this + determines the type of averaging performed on the data: + ``'binary'``: + Only report results for the class specified by ``pos_label``. + This is applicable only if targets (``y_{true,pred}``) are binary. + ``'micro'``: + Calculate metrics globally by counting the total true positives, + false negatives and false positives. + ``'macro'``: + Calculate metrics for each label, and find their unweighted + mean. This does not take label imbalance into account. + ``'weighted'``: + Calculate metrics for each label, and find their average, weighted + by support (the number of true instances for each label). 
This + alters 'macro' to account for label imbalance; it can result in an + F-score that is not between precision and recall. + ``'samples'``: + Calculate metrics for each instance, and find their average (only + meaningful for multilabel classification where this differs from + :func:`accuracy_score`). + Note that if ``pos_label`` is given in binary classification with + `average != 'binary'`, only that positive class is reported. This + behavior is deprecated and will change in version 0.18. + reduce_group: the process group for DDP reduces (only needed for DDP training). + Defaults to all processes (world) + reduce_op: the operation to perform during reduction within DDP (only needed for DDP training). + Defaults to sum. + """ + super().__init__('recall_score', + reduce_group=reduce_group, + reduce_op=reduce_op, + labels=labels, + pos_labels=pos_labels, + average=average) + + def forward(self, y_pred: np.ndarray, y_true: np.ndarray, + sample_weight: Optional[np.ndarray] = None) -> Union[np.ndarray, float]: + """ + + Args: + y_pred : Estimated targets as returned by a classifier. + y_true: Ground truth (correct) target values. + sample_weight: Sample weights. + + + Returns: Recall of the positive class in binary classification or weighted + average of the recall of each class for the multiclass task. + + """ + return super().forward(y_pred=y_pred, y_true=y_true, sample_weight=sample_weight) class PrecisionRecallCurve(SklearnMetric): - pass + """ + Compute precision-recall pairs for different probability thresholds + + Note: + this implementation is restricted to the binary classification task. + + The precision is the ratio ``tp / (tp + fp)`` where ``tp`` is the number of + true positives and ``fp`` the number of false positives. The precision is + intuitively the ability of the classifier not to label as positive a sample + that is negative. + The recall is the ratio ``tp / (tp + fn)`` where ``tp`` is the number of + true positives and ``fn`` the number of false negatives. The recall is + intuitively the ability of the classifier to find all the positive samples. + The last precision and recall values are 1. and 0. respectively and do not + have a corresponding threshold. This ensures that the graph starts on the + x axis. + + """ + + def __init__(self, + pos_labels: Union[str, int] = 1, + reduce_group: Any = torch.distributed.group.WORLD, + reduce_op: Any = torch.distributed.ReduceOp.SUM): + """ + + Args: + pos_labels: The class to report if ``average='binary'``. + reduce_group: the process group for DDP reduces (only needed for DDP training). + Defaults to all processes (world) + reduce_op: the operation to perform during reduction within DDP (only needed for DDP training). + Defaults to sum. + """ + super().__init__('precision_recall_curve', + reduce_group=reduce_group, + reduce_op=reduce_op, + pos_labels=pos_labels) + + def forward(self, probas_pred: np.ndarray, y_true: np.ndarray, + sample_weight: Optional[np.ndarray] = None) -> Union[np.ndarray, float]: + """ + + Args: + probas_pred : Estimated probabilities or decision function. + y_true: Ground truth (correct) target values. + sample_weight: Sample weights. + + + Returns: + precision: + Precision values such that element i is the precision of + predictions with score >= thresholds[i] and the last element is 1. + recall: + Decreasing recall values such that element i is the recall of + predictions with score >= thresholds[i] and the last element is 0. 
+ thresholds: + Increasing thresholds on the decision function used to compute + precision and recall. + + """ + return super().forward(probas_pred=probas_pred, y_true=y_true, sample_weight=sample_weight) class ROC(SklearnMetric): - pass + """ + Compute Receiver operating characteristic (ROC) + Note: + this implementation is restricted to the binary classification task. -class AUROC(SklearnMetric): - pass + """ + + def __init__(self, + pos_labels: Union[str, int] = 1, + reduce_group: Any = torch.distributed.group.WORLD, + reduce_op: Any = torch.distributed.ReduceOp.SUM): + """ + + Args: + pos_labels: The class to report if ``average='binary'``. + reduce_group: the process group for DDP reduces (only needed for DDP training). + Defaults to all processes (world) + reduce_op: the operation to perform during reduction within DDP (only needed for DDP training). + Defaults to sum. + References: + .. [1] `Wikipedia entry for the Receiver operating characteristic + `_ + """ + super().__init__('roc_curve', + reduce_group=reduce_group, + reduce_op=reduce_op, + pos_labels=pos_labels) -class R2(SklearnMetric): - pass + def forward(self, y_score: np.ndarray, y_true: np.ndarray, + sample_weight: Optional[np.ndarray] = None) -> Union[np.ndarray, float]: + """ + + Args: + y_score : Target scores, can either be probability estimates of the positive + class or confidence values. + y_true: Ground truth (correct) target values. + sample_weight: Sample weights. + + + Returns: + fpr: + Increasing false positive rates such that element i is the false + positive rate of predictions with score >= thresholds[i]. + tpr: + Increasing true positive rates such that element i is the true + positive rate of predictions with score >= thresholds[i]. + thresholds: + Decreasing thresholds on the decision function used to compute + fpr and tpr. `thresholds[0]` represents no instances being predicted + and is arbitrarily set to `max(y_score) + 1`. + + """ + return super().forward(y_score=y_score, y_true=y_true, sample_weight=sample_weight) + + +class AUROC(SklearnMetric): + """ + Compute Area Under the Curve (AUC) from prediction scores + Note: + this implementation is restricted to the binary classification task + or multilabel classification task in label indicator format. + """ + def __init__(self, average: Optional[str] = 'macro', + reduce_group: Any = torch.distributed.group.WORLD, + reduce_op: Any = torch.distributed.ReduceOp.SUM + ): + """ + Args: + average: If None, the scores for each class are returned. Otherwise, this determines the type of + averaging performed on the data: + * If 'micro': Calculate metrics globally by considering each element of the label indicator + matrix as a label. + * If 'macro': Calculate metrics for each label, and find their unweighted mean. + This does not take label imbalance into account. + * If 'weighted': Calculate metrics for each label, and find their average, weighted by + support (the number of true instances for each label). + * If 'samples': Calculate metrics for each instance, and find their average. + reduce_group: the process group for DDP reduces (only needed for DDP training). + Defaults to all processes (world) + reduce_op: the operation to perform during reduction within DDP (only needed for DDP training). + Defaults to sum. 
+ """ + super().__init__('roc_auc_score', + reduce_group=reduce_group, + reduce_op=reduce_op, + average=average) + def forward(self, y_score: np.ndarray, y_true: np.ndarray, + sample_weight: Optional[np.ndarray] = None) -> float: + """ -class Jaccard(SklearnMetric): - pass + Args: + y_score: Target scores, can either be probability estimates of the positive class, + confidence values, or binary decisions. + y_true: True binary labels in binary label indicators. + sample_weight: Sample weights. + Returns: + Area Under Receiver Operating Characteristic Curve + """ + return super().forward(y_score=y_score, y_true=y_true, + sample_weight=sample_weight) From 2722c08e9ffb0e4ac46519c8f916d3635a64b701 Mon Sep 17 00:00:00 2001 From: Justus Schock <12886177+justusschock@users.noreply.github.com> Date: Fri, 3 Apr 2020 21:10:40 +0200 Subject: [PATCH 06/44] New metric classes (#1326) * Create metrics package * Create metric.py * Create utils.py * Create __init__.py * add tests for metric utils * add docstrings for metrics utils * add function to recursively apply other function to collection * add tests for this function * update test * Update pytorch_lightning/metrics/metric.py Co-Authored-By: Jirka Borovec * update metric name * remove example docs * fix tests * add metric tests * fix to tensor conversion * fix apply to collection * Update CHANGELOG.md * Update pytorch_lightning/metrics/metric.py Co-Authored-By: Jirka Borovec * remove tests from init * add missing type annotations * rename utils to convertors * Create metrics.rst * Update index.rst * Update index.rst * Update pytorch_lightning/metrics/convertors.py Co-Authored-By: Jirka Borovec * Update pytorch_lightning/metrics/convertors.py Co-Authored-By: Jirka Borovec * Update pytorch_lightning/metrics/convertors.py Co-Authored-By: Jirka Borovec * Update pytorch_lightning/metrics/metric.py Co-Authored-By: Jirka Borovec * Update tests/utilities/test_apply_to_collection.py Co-Authored-By: Jirka Borovec * Update tests/utilities/test_apply_to_collection.py Co-Authored-By: Jirka Borovec * Update tests/metrics/convertors.py Co-Authored-By: Jirka Borovec * Apply suggestions from code review Co-Authored-By: Jirka Borovec * add doctest example * rename file and fix imports * added parametrized test * replace lambda with inlined function * rename apply_to_collection to apply_func * Separated class description from init args * Apply suggestions from code review Co-Authored-By: Jirka Borovec * adjust random values * suppress output when seeding * remove gpu from doctest * Add requested changes and add ellipsis for doctest * forgot to push these files... 
* add explicit check for dtype to convert to * fix ddp tests * remove explicit ddp destruction Co-authored-by: Jirka Borovec --- tests/metrics/__init__.py | 205 -------------------------------------- 1 file changed, 205 deletions(-) diff --git a/tests/metrics/__init__.py b/tests/metrics/__init__.py index a6dfcf8be94a4..e69de29bb2d1d 100644 --- a/tests/metrics/__init__.py +++ b/tests/metrics/__init__.py @@ -1,205 +0,0 @@ -import numpy as np -import pytest -import torch -import torch.distributed as dist - -import tests.base.utils as tutils -from pytorch_lightning.metrics.utils import _apply_to_inputs, _apply_to_outputs, \ - _convert_to_tensor, _convert_to_numpy, _numpy_metric_conversion, \ - _tensor_metric_conversion, _sync_ddp, tensor_metric, numpy_metric - - -def test_apply_to_inputs(): - def apply_fn(inputs, factor): - if isinstance(inputs, (float, int)): - return inputs * factor - elif isinstance(inputs, dict): - return {k: apply_fn(v, factor) for k, v in inputs.items()} - elif isinstance(inputs, (tuple, list)): - return [apply_fn(x, factor) for x in inputs] - - @_apply_to_inputs(apply_fn, factor=2.) - def test_fn(*args, **kwargs): - return args, kwargs - - for args in [[], [1., 2.]]: - for kwargs in [{}, {1., 2.}]: - result_args, result_kwargs = test_fn(*args, **kwargs) - assert isinstance(result_args, list) - assert isinstance(result_kwargs, dict) - assert len(result_args) == len(args) - assert len(result_kwargs) == len(kwargs) - assert all([k in result_kwargs for k in kwargs.keys()]) - for arg, result_arg in zip(args, result_args): - assert arg * 2. == result_arg - - for key in kwargs.keys(): - arg = kwargs[key], - result_arg = result_kwargs[key] - assert arg * 2. == result_arg - - -def test_apply_to_outputs(): - def apply_fn(inputs, additional_str): - return str(inputs) + additional_str - - @_apply_to_outputs(apply_fn, additional_str='_str') - def test_fn(*args, **kwargs): - return 'dummy' - - assert test_fn() == 'dummy_str' - - -def test_convert_to_tensor(): - for test_item in [1., np.array([1.])]: - assert isinstance(_convert_to_tensor(test_item), torch.Tensor) - assert test_item.item() == 1. - - -def test_convert_to_numpy(): - for test_item in [1., torch.tensor([1.])]: - result = _convert_to_numpy(test_item) - assert isinstance(result, np.ndarray) - assert result.item() == 1. - - -def test_numpy_metric_conversion(): - @_numpy_metric_conversion - def numpy_test_metric(*args, **kwargs): - for arg in args: - assert isinstance(arg, np.ndarray) - - for v in kwargs.values(): - assert isinstance(v, np.ndarray) - - return 5. - - result = numpy_test_metric(torch.tensor([1.]), dummy_kwarg=2.) - assert isinstance(result, torch.Tensor) - assert result.item() == 5. - - -def test_tensor_metric_conversion(): - @_tensor_metric_conversion - def tensor_test_metric(*args, **kwargs): - for arg in args: - assert isinstance(arg, torch.Tensor) - - for v in kwargs.values(): - assert isinstance(v, torch.Tensor) - - return 5. - - result = tensor_test_metric(np.array([1.]), dummy_kwarg=2.) - assert isinstance(result, torch.Tensor) - assert result.item() == 5. 
- - -@pytest.mark.skipif(torch.cuda.device_count() < 2, "test requires multi-GPU machine") -def test_sync_reduce_ddp(): - """Make sure sync-reduce works with DDP""" - tutils.reset_seed() - tutils.set_random_master_port() - - dist.init_process_group('gloo') - - tensor = torch.tensor([1.], device='cuda:0') - - reduced_tensor = _sync_ddp(tensor) - - assert reduced_tensor.item() == dist.get_world_size(), \ - 'Sync-Reduce does not work properly with DDP and Tensors' - - number = 1. - reduced_number = _sync_ddp(number) - assert isinstance(reduced_number, torch.Tensor), 'When reducing a number we should get a tensor out' - assert reduced_number.item() == dist.get_world_size(), \ - 'Sync-Reduce does not work properly with DDP and Numbers' - - dist.destroy_process_group() - - -def test_sync_reduce_simple(): - """Make sure sync-reduce works without DDP""" - tensor = torch.tensor([1.], device='cpu') - - reduced_tensor = _sync_ddp(tensor) - - assert torch.allclose(tensor, - reduced_tensor), 'Sync-Reduce does not work properly without DDP and Tensors' - - number = 1. - - reduced_number = _sync_ddp(number) - assert isinstance(reduced_number, torch.Tensor), 'When reducing a number we should get a tensor out' - assert reduced_number.item() == number, 'Sync-Reduce does not work properly without DDP and Numbers' - - -def _test_tensor_metric(is_ddp: bool): - @tensor_metric() - def tensor_test_metric(*args, **kwargs): - for arg in args: - assert isinstance(arg, torch.Tensor) - - for v in kwargs.values(): - assert isinstance(v, torch.Tensor) - - return 5. - - if is_ddp: - factor = dist.get_world_size() - else: - factor = 1. - - result = tensor_test_metric(np.array([1.]), dummy_kwarg=2.) - assert isinstance(result, torch.Tensor) - assert result.item() == 5. * factor - - -@pytest.mark.skipif(torch.cuda.device_count() < 2, "test requires multi-GPU machine") -def test_tensor_metric_ddp(): - tutils.reset_seed() - tutils.set_random_master_port() - - dist.init_process_group('gloo') - _test_tensor_metric(True) - dist.destroy_process_group() - - -def test_tensor_metric_simple(): - _test_tensor_metric(False) - - -def _test_numpy_metric(is_ddp: bool): - @numpy_metric() - def numpy_test_metric(*args, **kwargs): - for arg in args: - assert isinstance(arg, np.ndarray) - - for v in kwargs.values(): - assert isinstance(v, np.ndarray) - - return 5. - - if is_ddp: - factor = dist.get_world_size() - else: - factor = 1. - - result = numpy_test_metric(torch.tensor([1.]), dummy_kwarg=2.) - assert isinstance(result, torch.Tensor) - assert result.item() == 5. 
* factor - - -@pytest.mark.skipif(torch.cuda.device_count() < 2, "test requires multi-GPU machine") -def test_numpy_metric_ddp(): - tutils.reset_seed() - tutils.set_random_master_port() - - dist.init_process_group('gloo') - _test_tensor_metric(True) - dist.destroy_process_group() - - -def test_numpy_metric_simple(): - _test_tensor_metric(False) From d7bf19a7320f7b30d27368d4251d34bd9e9c712d Mon Sep 17 00:00:00 2001 From: Justus Schock <12886177+justusschock@users.noreply.github.com> Date: Wed, 1 Apr 2020 16:32:01 +0200 Subject: [PATCH 07/44] Create __init__.py --- tests/metrics/__init__.py | 205 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 205 insertions(+) diff --git a/tests/metrics/__init__.py b/tests/metrics/__init__.py index e69de29bb2d1d..a6dfcf8be94a4 100644 --- a/tests/metrics/__init__.py +++ b/tests/metrics/__init__.py @@ -0,0 +1,205 @@ +import numpy as np +import pytest +import torch +import torch.distributed as dist + +import tests.base.utils as tutils +from pytorch_lightning.metrics.utils import _apply_to_inputs, _apply_to_outputs, \ + _convert_to_tensor, _convert_to_numpy, _numpy_metric_conversion, \ + _tensor_metric_conversion, _sync_ddp, tensor_metric, numpy_metric + + +def test_apply_to_inputs(): + def apply_fn(inputs, factor): + if isinstance(inputs, (float, int)): + return inputs * factor + elif isinstance(inputs, dict): + return {k: apply_fn(v, factor) for k, v in inputs.items()} + elif isinstance(inputs, (tuple, list)): + return [apply_fn(x, factor) for x in inputs] + + @_apply_to_inputs(apply_fn, factor=2.) + def test_fn(*args, **kwargs): + return args, kwargs + + for args in [[], [1., 2.]]: + for kwargs in [{}, {1., 2.}]: + result_args, result_kwargs = test_fn(*args, **kwargs) + assert isinstance(result_args, list) + assert isinstance(result_kwargs, dict) + assert len(result_args) == len(args) + assert len(result_kwargs) == len(kwargs) + assert all([k in result_kwargs for k in kwargs.keys()]) + for arg, result_arg in zip(args, result_args): + assert arg * 2. == result_arg + + for key in kwargs.keys(): + arg = kwargs[key], + result_arg = result_kwargs[key] + assert arg * 2. == result_arg + + +def test_apply_to_outputs(): + def apply_fn(inputs, additional_str): + return str(inputs) + additional_str + + @_apply_to_outputs(apply_fn, additional_str='_str') + def test_fn(*args, **kwargs): + return 'dummy' + + assert test_fn() == 'dummy_str' + + +def test_convert_to_tensor(): + for test_item in [1., np.array([1.])]: + assert isinstance(_convert_to_tensor(test_item), torch.Tensor) + assert test_item.item() == 1. + + +def test_convert_to_numpy(): + for test_item in [1., torch.tensor([1.])]: + result = _convert_to_numpy(test_item) + assert isinstance(result, np.ndarray) + assert result.item() == 1. + + +def test_numpy_metric_conversion(): + @_numpy_metric_conversion + def numpy_test_metric(*args, **kwargs): + for arg in args: + assert isinstance(arg, np.ndarray) + + for v in kwargs.values(): + assert isinstance(v, np.ndarray) + + return 5. + + result = numpy_test_metric(torch.tensor([1.]), dummy_kwarg=2.) + assert isinstance(result, torch.Tensor) + assert result.item() == 5. + + +def test_tensor_metric_conversion(): + @_tensor_metric_conversion + def tensor_test_metric(*args, **kwargs): + for arg in args: + assert isinstance(arg, torch.Tensor) + + for v in kwargs.values(): + assert isinstance(v, torch.Tensor) + + return 5. + + result = tensor_test_metric(np.array([1.]), dummy_kwarg=2.) + assert isinstance(result, torch.Tensor) + assert result.item() == 5. 
+ + +@pytest.mark.skipif(torch.cuda.device_count() < 2, "test requires multi-GPU machine") +def test_sync_reduce_ddp(): + """Make sure sync-reduce works with DDP""" + tutils.reset_seed() + tutils.set_random_master_port() + + dist.init_process_group('gloo') + + tensor = torch.tensor([1.], device='cuda:0') + + reduced_tensor = _sync_ddp(tensor) + + assert reduced_tensor.item() == dist.get_world_size(), \ + 'Sync-Reduce does not work properly with DDP and Tensors' + + number = 1. + reduced_number = _sync_ddp(number) + assert isinstance(reduced_number, torch.Tensor), 'When reducing a number we should get a tensor out' + assert reduced_number.item() == dist.get_world_size(), \ + 'Sync-Reduce does not work properly with DDP and Numbers' + + dist.destroy_process_group() + + +def test_sync_reduce_simple(): + """Make sure sync-reduce works without DDP""" + tensor = torch.tensor([1.], device='cpu') + + reduced_tensor = _sync_ddp(tensor) + + assert torch.allclose(tensor, + reduced_tensor), 'Sync-Reduce does not work properly without DDP and Tensors' + + number = 1. + + reduced_number = _sync_ddp(number) + assert isinstance(reduced_number, torch.Tensor), 'When reducing a number we should get a tensor out' + assert reduced_number.item() == number, 'Sync-Reduce does not work properly without DDP and Numbers' + + +def _test_tensor_metric(is_ddp: bool): + @tensor_metric() + def tensor_test_metric(*args, **kwargs): + for arg in args: + assert isinstance(arg, torch.Tensor) + + for v in kwargs.values(): + assert isinstance(v, torch.Tensor) + + return 5. + + if is_ddp: + factor = dist.get_world_size() + else: + factor = 1. + + result = tensor_test_metric(np.array([1.]), dummy_kwarg=2.) + assert isinstance(result, torch.Tensor) + assert result.item() == 5. * factor + + +@pytest.mark.skipif(torch.cuda.device_count() < 2, "test requires multi-GPU machine") +def test_tensor_metric_ddp(): + tutils.reset_seed() + tutils.set_random_master_port() + + dist.init_process_group('gloo') + _test_tensor_metric(True) + dist.destroy_process_group() + + +def test_tensor_metric_simple(): + _test_tensor_metric(False) + + +def _test_numpy_metric(is_ddp: bool): + @numpy_metric() + def numpy_test_metric(*args, **kwargs): + for arg in args: + assert isinstance(arg, np.ndarray) + + for v in kwargs.values(): + assert isinstance(v, np.ndarray) + + return 5. + + if is_ddp: + factor = dist.get_world_size() + else: + factor = 1. + + result = numpy_test_metric(torch.tensor([1.]), dummy_kwarg=2.) + assert isinstance(result, torch.Tensor) + assert result.item() == 5. * factor + + +@pytest.mark.skipif(torch.cuda.device_count() < 2, "test requires multi-GPU machine") +def test_numpy_metric_ddp(): + tutils.reset_seed() + tutils.set_random_master_port() + + dist.init_process_group('gloo') + _test_tensor_metric(True) + dist.destroy_process_group() + + +def test_numpy_metric_simple(): + _test_tensor_metric(False) From ba2c6f7387fe66cf89c6b2589e6669baee0ec80e Mon Sep 17 00:00:00 2001 From: Justus Schock Date: Wed, 1 Apr 2020 16:36:58 +0200 Subject: [PATCH 08/44] redo sklearn metrics --- pytorch_lightning/metrics/sklearn.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pytorch_lightning/metrics/sklearn.py b/pytorch_lightning/metrics/sklearn.py index d084f18e810b8..c3daac3110ba5 100644 --- a/pytorch_lightning/metrics/sklearn.py +++ b/pytorch_lightning/metrics/sklearn.py @@ -384,6 +384,7 @@ class Precision(SklearnMetric): The best value is 1 and the worst value is 0. 
""" + def __init__(self, labels: Optional[Sequence] = None, pos_labels: Union[str, int] = 1, average: Optional[str] = 'binary', @@ -648,6 +649,7 @@ class AUROC(SklearnMetric): this implementation is restricted to the binary classification task or multilabel classification task in label indicator format. """ + def __init__(self, average: Optional[str] = 'macro', reduce_group: Any = torch.distributed.group.WORLD, reduce_op: Any = torch.distributed.ReduceOp.SUM From 6595de8a87aec43b536ba9a368201c828f31fbe7 Mon Sep 17 00:00:00 2001 From: Justus Schock Date: Mon, 13 Apr 2020 12:56:43 +0200 Subject: [PATCH 09/44] add sklearn metrics --- pytorch_lightning/metrics/sklearn.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pytorch_lightning/metrics/sklearn.py b/pytorch_lightning/metrics/sklearn.py index c3daac3110ba5..7f4832cd775e1 100644 --- a/pytorch_lightning/metrics/sklearn.py +++ b/pytorch_lightning/metrics/sklearn.py @@ -625,7 +625,6 @@ class or confidence values. y_true: Ground truth (correct) target values. sample_weight: Sample weights. - Returns: fpr: Increasing false positive rates such that element i is the false From 729690ea5ed5d39967d622283b7aaae2fa1ca0c1 Mon Sep 17 00:00:00 2001 From: Justus Schock Date: Sun, 26 Apr 2020 16:01:49 +0200 Subject: [PATCH 10/44] start adding sklearn tests --- tests/metrics/__init__.py | 205 -------------------------- tests/metrics/test_sklearn_metrics.py | 48 ++++++ 2 files changed, 48 insertions(+), 205 deletions(-) create mode 100644 tests/metrics/test_sklearn_metrics.py diff --git a/tests/metrics/__init__.py b/tests/metrics/__init__.py index a6dfcf8be94a4..e69de29bb2d1d 100644 --- a/tests/metrics/__init__.py +++ b/tests/metrics/__init__.py @@ -1,205 +0,0 @@ -import numpy as np -import pytest -import torch -import torch.distributed as dist - -import tests.base.utils as tutils -from pytorch_lightning.metrics.utils import _apply_to_inputs, _apply_to_outputs, \ - _convert_to_tensor, _convert_to_numpy, _numpy_metric_conversion, \ - _tensor_metric_conversion, _sync_ddp, tensor_metric, numpy_metric - - -def test_apply_to_inputs(): - def apply_fn(inputs, factor): - if isinstance(inputs, (float, int)): - return inputs * factor - elif isinstance(inputs, dict): - return {k: apply_fn(v, factor) for k, v in inputs.items()} - elif isinstance(inputs, (tuple, list)): - return [apply_fn(x, factor) for x in inputs] - - @_apply_to_inputs(apply_fn, factor=2.) - def test_fn(*args, **kwargs): - return args, kwargs - - for args in [[], [1., 2.]]: - for kwargs in [{}, {1., 2.}]: - result_args, result_kwargs = test_fn(*args, **kwargs) - assert isinstance(result_args, list) - assert isinstance(result_kwargs, dict) - assert len(result_args) == len(args) - assert len(result_kwargs) == len(kwargs) - assert all([k in result_kwargs for k in kwargs.keys()]) - for arg, result_arg in zip(args, result_args): - assert arg * 2. == result_arg - - for key in kwargs.keys(): - arg = kwargs[key], - result_arg = result_kwargs[key] - assert arg * 2. == result_arg - - -def test_apply_to_outputs(): - def apply_fn(inputs, additional_str): - return str(inputs) + additional_str - - @_apply_to_outputs(apply_fn, additional_str='_str') - def test_fn(*args, **kwargs): - return 'dummy' - - assert test_fn() == 'dummy_str' - - -def test_convert_to_tensor(): - for test_item in [1., np.array([1.])]: - assert isinstance(_convert_to_tensor(test_item), torch.Tensor) - assert test_item.item() == 1. 
- - -def test_convert_to_numpy(): - for test_item in [1., torch.tensor([1.])]: - result = _convert_to_numpy(test_item) - assert isinstance(result, np.ndarray) - assert result.item() == 1. - - -def test_numpy_metric_conversion(): - @_numpy_metric_conversion - def numpy_test_metric(*args, **kwargs): - for arg in args: - assert isinstance(arg, np.ndarray) - - for v in kwargs.values(): - assert isinstance(v, np.ndarray) - - return 5. - - result = numpy_test_metric(torch.tensor([1.]), dummy_kwarg=2.) - assert isinstance(result, torch.Tensor) - assert result.item() == 5. - - -def test_tensor_metric_conversion(): - @_tensor_metric_conversion - def tensor_test_metric(*args, **kwargs): - for arg in args: - assert isinstance(arg, torch.Tensor) - - for v in kwargs.values(): - assert isinstance(v, torch.Tensor) - - return 5. - - result = tensor_test_metric(np.array([1.]), dummy_kwarg=2.) - assert isinstance(result, torch.Tensor) - assert result.item() == 5. - - -@pytest.mark.skipif(torch.cuda.device_count() < 2, "test requires multi-GPU machine") -def test_sync_reduce_ddp(): - """Make sure sync-reduce works with DDP""" - tutils.reset_seed() - tutils.set_random_master_port() - - dist.init_process_group('gloo') - - tensor = torch.tensor([1.], device='cuda:0') - - reduced_tensor = _sync_ddp(tensor) - - assert reduced_tensor.item() == dist.get_world_size(), \ - 'Sync-Reduce does not work properly with DDP and Tensors' - - number = 1. - reduced_number = _sync_ddp(number) - assert isinstance(reduced_number, torch.Tensor), 'When reducing a number we should get a tensor out' - assert reduced_number.item() == dist.get_world_size(), \ - 'Sync-Reduce does not work properly with DDP and Numbers' - - dist.destroy_process_group() - - -def test_sync_reduce_simple(): - """Make sure sync-reduce works without DDP""" - tensor = torch.tensor([1.], device='cpu') - - reduced_tensor = _sync_ddp(tensor) - - assert torch.allclose(tensor, - reduced_tensor), 'Sync-Reduce does not work properly without DDP and Tensors' - - number = 1. - - reduced_number = _sync_ddp(number) - assert isinstance(reduced_number, torch.Tensor), 'When reducing a number we should get a tensor out' - assert reduced_number.item() == number, 'Sync-Reduce does not work properly without DDP and Numbers' - - -def _test_tensor_metric(is_ddp: bool): - @tensor_metric() - def tensor_test_metric(*args, **kwargs): - for arg in args: - assert isinstance(arg, torch.Tensor) - - for v in kwargs.values(): - assert isinstance(v, torch.Tensor) - - return 5. - - if is_ddp: - factor = dist.get_world_size() - else: - factor = 1. - - result = tensor_test_metric(np.array([1.]), dummy_kwarg=2.) - assert isinstance(result, torch.Tensor) - assert result.item() == 5. * factor - - -@pytest.mark.skipif(torch.cuda.device_count() < 2, "test requires multi-GPU machine") -def test_tensor_metric_ddp(): - tutils.reset_seed() - tutils.set_random_master_port() - - dist.init_process_group('gloo') - _test_tensor_metric(True) - dist.destroy_process_group() - - -def test_tensor_metric_simple(): - _test_tensor_metric(False) - - -def _test_numpy_metric(is_ddp: bool): - @numpy_metric() - def numpy_test_metric(*args, **kwargs): - for arg in args: - assert isinstance(arg, np.ndarray) - - for v in kwargs.values(): - assert isinstance(v, np.ndarray) - - return 5. - - if is_ddp: - factor = dist.get_world_size() - else: - factor = 1. - - result = numpy_test_metric(torch.tensor([1.]), dummy_kwarg=2.) - assert isinstance(result, torch.Tensor) - assert result.item() == 5. 
* factor - - -@pytest.mark.skipif(torch.cuda.device_count() < 2, "test requires multi-GPU machine") -def test_numpy_metric_ddp(): - tutils.reset_seed() - tutils.set_random_master_port() - - dist.init_process_group('gloo') - _test_tensor_metric(True) - dist.destroy_process_group() - - -def test_numpy_metric_simple(): - _test_tensor_metric(False) diff --git a/tests/metrics/test_sklearn_metrics.py b/tests/metrics/test_sklearn_metrics.py new file mode 100644 index 0000000000000..bc7c7b07cb540 --- /dev/null +++ b/tests/metrics/test_sklearn_metrics.py @@ -0,0 +1,48 @@ +import numbers +from collections import Mapping, Sequence + +import numpy as np +import pytest +import torch +from sklearn.metrics import accuracy_score, average_precision_score, auc + +from pytorch_lightning.metrics.converters import _convert_to_numpy +from pytorch_lightning.metrics.sklearn import Accuracy, AveragePrecision, AUC +from pytorch_lightning.utilities.apply_func import apply_to_collection + + +@pytest.mark.parametrize(['metric_class', 'sklearn_func', 'inputs'], [ + pytest.param(Accuracy(), accuracy_score, + {'y_pred': torch.randint(low=0, high=10, size=(10,)), + 'y_true': torch.randint(low=0, high=10, size=(10,))}), + pytest.param(AUC(), auc, {'x': torch.arange(10, dtype=torch.float)/10, + 'y': torch.tensor([0.2, 0.2, 0.2, 0.2, 0.2, + 0.2, 0.3, 0.5, 0.6, 0.7])}) +]) +def test_sklearn_metric(metric_class, sklearn_func, inputs: dict): + numpy_inputs = apply_to_collection( + inputs, (torch.Tensor, np.ndarray, numbers.Number), _convert_to_numpy) + + sklearn_result = sklearn_func(**numpy_inputs) + lightning_result = metric_class(**inputs) + + sklearn_result = apply_to_collection( + sklearn_result, (torch.Tensor, np.ndarray, numbers.Number), _convert_to_numpy) + + lightning_result = apply_to_collection( + lightning_result, (torch.Tensor, np.ndarray, numbers.Number), _convert_to_numpy) + + assert isinstance(lightning_result, type(sklearn_result)) + + if isinstance(lightning_result, np.ndarray): + assert np.allclose(lightning_result, sklearn_result) + elif isinstance(lightning_result, Mapping): + for key in lightning_result.keys(): + assert np.allclose(lightning_result[key], sklearn_result[key]) + + elif isinstance(lightning_result, Sequence): + for val_lightning, val_sklearn in zip(lightning_result, sklearn_result): + assert np.allclose(val_lightning, val_sklearn) + + else: + raise TypeError From 429dab6896ef536f74862f07674c1202654bf954 Mon Sep 17 00:00:00 2001 From: Justus Schock Date: Mon, 27 Apr 2020 08:38:58 +0200 Subject: [PATCH 11/44] fix typo --- pytorch_lightning/metrics/sklearn.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pytorch_lightning/metrics/sklearn.py b/pytorch_lightning/metrics/sklearn.py index 7f4832cd775e1..8fb7c5aa69223 100644 --- a/pytorch_lightning/metrics/sklearn.py +++ b/pytorch_lightning/metrics/sklearn.py @@ -233,7 +233,7 @@ class F1(SklearnMetric): """ def __init__(self, labels: Optional[Sequence] = None, - pos_labels: Union[str, int] = 1, + pos_label: Union[str, int] = 1, average: Optional[str] = 'binary', reduce_group: Any = torch.distributed.group.WORLD, reduce_op: Any = torch.distributed.ReduceOp.SUM): @@ -241,7 +241,7 @@ def __init__(self, labels: Optional[Sequence] = None, Args: labels: Integer array of labels. - pos_labels: The class to report if ``average='binary'``. + pos_label: The class to report if ``average='binary'``. average: This parameter is required for multiclass/multilabel targets. If ``None``, the scores for each class are returned. 
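Editor's note: the parametrised test above leans on `apply_to_collection` plus `_convert_to_numpy` to turn an arbitrary dict of tensors into numpy arrays before calling the reference scikit-learn function. Roughly, as a sketch of that helper combination on its own:

    import numbers

    import numpy as np
    import torch

    from pytorch_lightning.metrics.converters import _convert_to_numpy
    from pytorch_lightning.utilities.apply_func import apply_to_collection

    inputs = {'y_pred': torch.randint(low=0, high=10, size=(128,)),
              'y_true': torch.randint(low=0, high=10, size=(128,))}

    # every tensor/number leaf in the collection becomes a numpy array
    numpy_inputs = apply_to_collection(
        inputs, (torch.Tensor, np.ndarray, numbers.Number), _convert_to_numpy)
    assert all(isinstance(v, np.ndarray) for v in numpy_inputs.values())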
Otherwise, this determines the type of averaging performed on the data: @@ -275,7 +275,7 @@ def __init__(self, labels: Optional[Sequence] = None, reduce_group=reduce_group, reduce_op=reduce_op, labels=labels, - pos_labels=pos_labels, + pos_label=pos_label, average=average) def forward(self, y_pred: np.ndarray, y_true: np.ndarray, From 387b9b29c751493de6a03f5898980ba8b2d5565a Mon Sep 17 00:00:00 2001 From: Justus Schock Date: Mon, 27 Apr 2020 08:39:24 +0200 Subject: [PATCH 12/44] fix typo --- pytorch_lightning/metrics/sklearn.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pytorch_lightning/metrics/sklearn.py b/pytorch_lightning/metrics/sklearn.py index 8fb7c5aa69223..855fd5aeaf67c 100644 --- a/pytorch_lightning/metrics/sklearn.py +++ b/pytorch_lightning/metrics/sklearn.py @@ -310,7 +310,7 @@ class FBeta(SklearnMetric): """ def __init__(self, beta: float, labels: Optional[Sequence] = None, - pos_labels: Union[str, int] = 1, + pos_label: Union[str, int] = 1, average: Optional[str] = 'binary', reduce_group: Any = torch.distributed.group.WORLD, reduce_op: Any = torch.distributed.ReduceOp.SUM): @@ -319,7 +319,7 @@ def __init__(self, beta: float, labels: Optional[Sequence] = None, Args: beta: Weight of precision in harmonic mean. labels: Integer array of labels. - pos_labels: The class to report if ``average='binary'``. + pos_label: The class to report if ``average='binary'``. average: This parameter is required for multiclass/multilabel targets. If ``None``, the scores for each class are returned. Otherwise, this determines the type of averaging performed on the data: @@ -354,7 +354,7 @@ def __init__(self, beta: float, labels: Optional[Sequence] = None, reduce_op=reduce_op, beta=beta, labels=labels, - pos_labels=pos_labels, + pos_label=pos_label, average=average) def forward(self, y_pred: np.ndarray, y_true: np.ndarray, From 74ab62bfbae4be2dfc01018fcd55878c19b99c63 Mon Sep 17 00:00:00 2001 From: Justus Schock Date: Mon, 27 Apr 2020 08:42:29 +0200 Subject: [PATCH 13/44] fix typo --- pytorch_lightning/metrics/sklearn.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pytorch_lightning/metrics/sklearn.py b/pytorch_lightning/metrics/sklearn.py index 855fd5aeaf67c..5d605ae05b484 100644 --- a/pytorch_lightning/metrics/sklearn.py +++ b/pytorch_lightning/metrics/sklearn.py @@ -386,7 +386,7 @@ class Precision(SklearnMetric): """ def __init__(self, labels: Optional[Sequence] = None, - pos_labels: Union[str, int] = 1, + pos_label: Union[str, int] = 1, average: Optional[str] = 'binary', reduce_group: Any = torch.distributed.group.WORLD, reduce_op: Any = torch.distributed.ReduceOp.SUM): @@ -394,7 +394,7 @@ def __init__(self, labels: Optional[Sequence] = None, Args: labels: Integer array of labels. - pos_labels: The class to report if ``average='binary'``. + pos_label: The class to report if ``average='binary'``. average: This parameter is required for multiclass/multilabel targets. If ``None``, the scores for each class are returned. 
Otherwise, this determines the type of averaging performed on the data: @@ -428,7 +428,7 @@ def __init__(self, labels: Optional[Sequence] = None, reduce_group=reduce_group, reduce_op=reduce_op, labels=labels, - pos_labels=pos_labels, + pos_label=pos_label, average=average) def forward(self, y_pred: np.ndarray, y_true: np.ndarray, From 8e5f1d6b20436768abbae9c640f2c243fccba1db Mon Sep 17 00:00:00 2001 From: Justus Schock Date: Mon, 27 Apr 2020 08:42:48 +0200 Subject: [PATCH 14/44] fix typo --- pytorch_lightning/metrics/sklearn.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pytorch_lightning/metrics/sklearn.py b/pytorch_lightning/metrics/sklearn.py index 5d605ae05b484..66303b6b4fdee 100644 --- a/pytorch_lightning/metrics/sklearn.py +++ b/pytorch_lightning/metrics/sklearn.py @@ -459,7 +459,7 @@ class Recall(SklearnMetric): """ def __init__(self, labels: Optional[Sequence] = None, - pos_labels: Union[str, int] = 1, + pos_label: Union[str, int] = 1, average: Optional[str] = 'binary', reduce_group: Any = torch.distributed.group.WORLD, reduce_op: Any = torch.distributed.ReduceOp.SUM): @@ -467,7 +467,7 @@ def __init__(self, labels: Optional[Sequence] = None, Args: labels: Integer array of labels. - pos_labels: The class to report if ``average='binary'``. + pos_label: The class to report if ``average='binary'``. average: This parameter is required for multiclass/multilabel targets. If ``None``, the scores for each class are returned. Otherwise, this determines the type of averaging performed on the data: @@ -501,7 +501,7 @@ def __init__(self, labels: Optional[Sequence] = None, reduce_group=reduce_group, reduce_op=reduce_op, labels=labels, - pos_labels=pos_labels, + pos_label=pos_label, average=average) def forward(self, y_pred: np.ndarray, y_true: np.ndarray, From 50822687669bce4960c77cd49857679d18ff31b6 Mon Sep 17 00:00:00 2001 From: Justus Schock Date: Mon, 27 Apr 2020 09:01:36 +0200 Subject: [PATCH 15/44] return x and y only for curves --- pytorch_lightning/metrics/sklearn.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/pytorch_lightning/metrics/sklearn.py b/pytorch_lightning/metrics/sklearn.py index 66303b6b4fdee..22e380bea2a7e 100644 --- a/pytorch_lightning/metrics/sklearn.py +++ b/pytorch_lightning/metrics/sklearn.py @@ -542,13 +542,13 @@ class PrecisionRecallCurve(SklearnMetric): """ def __init__(self, - pos_labels: Union[str, int] = 1, + pos_label: Union[str, int] = 1, reduce_group: Any = torch.distributed.group.WORLD, reduce_op: Any = torch.distributed.ReduceOp.SUM): """ Args: - pos_labels: The class to report if ``average='binary'``. + pos_label: The class to report if ``average='binary'``. reduce_group: the process group for DDP reduces (only needed for DDP training). Defaults to all processes (world) reduce_op: the operation to perform during reduction within DDP (only needed for DDP training). @@ -557,7 +557,7 @@ def __init__(self, super().__init__('precision_recall_curve', reduce_group=reduce_group, reduce_op=reduce_op, - pos_labels=pos_labels) + pos_label=pos_label) def forward(self, probas_pred: np.ndarray, y_true: np.ndarray, sample_weight: Optional[np.ndarray] = None) -> Union[np.ndarray, float]: @@ -581,7 +581,10 @@ def forward(self, probas_pred: np.ndarray, y_true: np.ndarray, precision and recall. """ - return super().forward(probas_pred=probas_pred, y_true=y_true, sample_weight=sample_weight) + # only return x and y here, since for now we cannot auto-convert elements of multiple length. 
+ # Will be fixed in native implementation + return np.array( + super().forward(probas_pred=probas_pred, y_true=y_true, sample_weight=sample_weight)[:2]) class ROC(SklearnMetric): @@ -638,7 +641,7 @@ class or confidence values. and is arbitrarily set to `max(y_score) + 1`. """ - return super().forward(y_score=y_score, y_true=y_true, sample_weight=sample_weight) + return np.array(super().forward(y_score=y_score, y_true=y_true, sample_weight=sample_weight)[:2]) class AUROC(SklearnMetric): From 10cde374bd65aa3de0e36e36a8f6f2ef72022924 Mon Sep 17 00:00:00 2001 From: Justus Schock Date: Mon, 27 Apr 2020 09:01:51 +0200 Subject: [PATCH 16/44] fix typo --- pytorch_lightning/metrics/sklearn.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/metrics/sklearn.py b/pytorch_lightning/metrics/sklearn.py index 22e380bea2a7e..134f84dba2581 100644 --- a/pytorch_lightning/metrics/sklearn.py +++ b/pytorch_lightning/metrics/sklearn.py @@ -597,7 +597,7 @@ class ROC(SklearnMetric): """ def __init__(self, - pos_labels: Union[str, int] = 1, + pos_label: Union[str, int] = 1, reduce_group: Any = torch.distributed.group.WORLD, reduce_op: Any = torch.distributed.ReduceOp.SUM): """ @@ -616,7 +616,7 @@ def __init__(self, super().__init__('roc_curve', reduce_group=reduce_group, reduce_op=reduce_op, - pos_labels=pos_labels) + pos_label=pos_label) def forward(self, y_score: np.ndarray, y_true: np.ndarray, sample_weight: Optional[np.ndarray] = None) -> Union[np.ndarray, float]: From 1a1762d888f655e09794919091e332e8151be21e Mon Sep 17 00:00:00 2001 From: Justus Schock Date: Mon, 27 Apr 2020 09:03:15 +0200 Subject: [PATCH 17/44] add missing tests for sklearn funcs --- tests/metrics/test_sklearn_metrics.py | 52 +++++++++++++++++++++++---- 1 file changed, 46 insertions(+), 6 deletions(-) diff --git a/tests/metrics/test_sklearn_metrics.py b/tests/metrics/test_sklearn_metrics.py index bc7c7b07cb540..5009887cdfecc 100644 --- a/tests/metrics/test_sklearn_metrics.py +++ b/tests/metrics/test_sklearn_metrics.py @@ -1,23 +1,63 @@ import numbers from collections import Mapping, Sequence +from functools import partial import numpy as np import pytest import torch -from sklearn.metrics import accuracy_score, average_precision_score, auc +from sklearn.metrics import accuracy_score, average_precision_score, auc, confusion_matrix, f1_score, \ + fbeta_score, precision_score, recall_score, precision_recall_curve, roc_curve, roc_auc_score from pytorch_lightning.metrics.converters import _convert_to_numpy -from pytorch_lightning.metrics.sklearn import Accuracy, AveragePrecision, AUC +from pytorch_lightning.metrics.sklearn import Accuracy, AveragePrecision, AUC, ConfusionMatrix, F1, FBeta, \ + Precision, Recall, PrecisionRecallCurve, ROC, AUROC from pytorch_lightning.utilities.apply_func import apply_to_collection +def xy_only(func): + def new_func(*args, **kwargs): + return np.array(func(*args, **kwargs)[:2]) + + return new_func + + @pytest.mark.parametrize(['metric_class', 'sklearn_func', 'inputs'], [ pytest.param(Accuracy(), accuracy_score, - {'y_pred': torch.randint(low=0, high=10, size=(10,)), - 'y_true': torch.randint(low=0, high=10, size=(10,))}), - pytest.param(AUC(), auc, {'x': torch.arange(10, dtype=torch.float)/10, + {'y_pred': torch.randint(low=0, high=10, size=(128,)), + 'y_true': torch.randint(low=0, high=10, size=(128,))}), + pytest.param(AUC(), auc, {'x': torch.arange(10, dtype=torch.float) / 10, 'y': torch.tensor([0.2, 0.2, 0.2, 0.2, 0.2, - 0.2, 0.3, 0.5, 0.6, 0.7])}) + 0.2, 0.3, 0.5, 0.6, 
0.7])}), + pytest.param(AveragePrecision(), average_precision_score, + {'y_score': torch.randint(2, size=(128,)), + 'y_true': torch.randint(2, size=(128,))}), + pytest.param(ConfusionMatrix(), confusion_matrix, + {'y_pred': torch.randint(10, size=(128,)), + 'y_true': torch.randint(10, size=(128,))}), + pytest.param(F1(average='macro'), partial(f1_score, average='macro'), + {'y_pred': torch.randint(10, size=(128,)), + 'y_true': torch.randint(10, size=(128,))}), + pytest.param(FBeta(beta=0.5, average='macro'), partial(fbeta_score, + beta=0.5, + average='macro'), + {'y_pred': torch.randint(10, size=(128,)), + 'y_true': torch.randint(10, size=(128,))}), + pytest.param(Precision(average='macro'), partial(precision_score, + average='macro'), + {'y_pred': torch.randint(10, size=(128,)), + 'y_true': torch.randint(10, size=(128,))}), + pytest.param(Recall(average='macro'), partial(recall_score, average='macro'), + {'y_pred': torch.randint(10, size=(128,)), + 'y_true': torch.randint(10, size=(128,))}), + pytest.param(PrecisionRecallCurve(), xy_only(precision_recall_curve), + {'probas_pred': torch.rand(size=(128,)), + 'y_true': torch.randint(2, size=(128,))}), + pytest.param(ROC(), xy_only(roc_curve), + {'y_score': torch.rand(size=(128,)), + 'y_true': torch.randint(2, size=(128,))}), + pytest.param(AUROC(), roc_auc_score, + {'y_score': torch.rand(size=(128,)), + 'y_true': torch.randint(2, size=(128,))}), ]) def test_sklearn_metric(metric_class, sklearn_func, inputs: dict): numpy_inputs = apply_to_collection( From a698282e7e8fc806fbfd611ac737558307240794 Mon Sep 17 00:00:00 2001 From: Justus Schock Date: Mon, 27 Apr 2020 09:09:20 +0200 Subject: [PATCH 18/44] imports --- tests/metrics/test_sklearn_metrics.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tests/metrics/test_sklearn_metrics.py b/tests/metrics/test_sklearn_metrics.py index 5009887cdfecc..fc2f04dcdf479 100644 --- a/tests/metrics/test_sklearn_metrics.py +++ b/tests/metrics/test_sklearn_metrics.py @@ -5,12 +5,13 @@ import numpy as np import pytest import torch -from sklearn.metrics import accuracy_score, average_precision_score, auc, confusion_matrix, f1_score, \ - fbeta_score, precision_score, recall_score, precision_recall_curve, roc_curve, roc_auc_score +from sklearn.metrics import (accuracy_score, average_precision_score, auc, confusion_matrix, f1_score, + fbeta_score, precision_score, recall_score, precision_recall_curve, roc_curve, + roc_auc_score) from pytorch_lightning.metrics.converters import _convert_to_numpy -from pytorch_lightning.metrics.sklearn import Accuracy, AveragePrecision, AUC, ConfusionMatrix, F1, FBeta, \ - Precision, Recall, PrecisionRecallCurve, ROC, AUROC +from pytorch_lightning.metrics.sklearn import (Accuracy, AveragePrecision, AUC, ConfusionMatrix, F1, FBeta, + Precision, Recall, PrecisionRecallCurve, ROC, AUROC) from pytorch_lightning.utilities.apply_func import apply_to_collection From debd245745310f694b489c7e92d80fdd6af55323 Mon Sep 17 00:00:00 2001 From: Justus Schock Date: Mon, 27 Apr 2020 09:09:37 +0200 Subject: [PATCH 19/44] __all__ --- pytorch_lightning/metrics/sklearn.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pytorch_lightning/metrics/sklearn.py b/pytorch_lightning/metrics/sklearn.py index 134f84dba2581..8eb858021f045 100644 --- a/pytorch_lightning/metrics/sklearn.py +++ b/pytorch_lightning/metrics/sklearn.py @@ -6,6 +6,9 @@ from pytorch_lightning import _logger as lightning_logger from pytorch_lightning.metrics.metric import NumpyMetric +__all__ = 
['SklearnMetric', 'Accuracy', 'AveragePrecision', 'AUC', 'ConfusionMatrix', 'F1', 'FBeta', + 'Precision', 'Recall', 'PrecisionRecallCurve', 'ROC', 'AUROC'] + class SklearnMetric(NumpyMetric): def __init__(self, metric_name: str, From 04adf4b6875a29667af1c6b9f5bdbab62d44d7ef Mon Sep 17 00:00:00 2001 From: Justus Schock Date: Mon, 27 Apr 2020 09:11:19 +0200 Subject: [PATCH 20/44] imports --- pytorch_lightning/metrics/__init__.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pytorch_lightning/metrics/__init__.py b/pytorch_lightning/metrics/__init__.py index cd721851307df..cdc7f3f4d90fc 100644 --- a/pytorch_lightning/metrics/__init__.py +++ b/pytorch_lightning/metrics/__init__.py @@ -22,3 +22,9 @@ """ + +from pytorch_lightning.metrics.metric import Metric, TensorMetric, NumpyMetric +from pytorch_lightning.metrics.sklearn import (SklearnMetric, Accuracy, AveragePrecision, AUC, + ConfusionMatrix, F1, FBeta, + Precision, Recall, PrecisionRecallCurve, ROC, AUROC) +from pytorch_lightning.metrics.converters import numpy_metric, tensor_metric From c72bda94a5e397774ebe4ccac35c4c2fe24fd137 Mon Sep 17 00:00:00 2001 From: Justus Schock Date: Mon, 25 May 2020 08:54:01 +0200 Subject: [PATCH 21/44] fix sklearn arguments --- pytorch_lightning/metrics/sklearn.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/pytorch_lightning/metrics/sklearn.py b/pytorch_lightning/metrics/sklearn.py index 8eb858021f045..1fbb1bc124761 100644 --- a/pytorch_lightning/metrics/sklearn.py +++ b/pytorch_lightning/metrics/sklearn.py @@ -99,7 +99,7 @@ def forward(self, y_pred: np.ndarray, y_true: np.ndarray, class AUC(SklearnMetric): - def __init__(self, reorder: bool = False, + def __init__(self, reduce_group: Any = torch.distributed.group.WORLD, reduce_op: Any = torch.distributed.ReduceOp.SUM ): @@ -110,8 +110,6 @@ def __init__(self, reorder: bool = False, Every metric call will cause a GPU synchronization, which may slow down your code Args: - reorder: If ``True``, assume that the curve is ascending in the case of ties, as for an ROC curve. - If the curve is non-ascending, the result will be wrong. reduce_group: the process group for DDP reduces (only needed for DDP training). Defaults to all processes (world) reduce_op: the operation to perform during reduction within DDP (only needed for DDP training). 
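Editor's note: with `reorder` removed, the wrapped `sklearn.metrics.auc` integrates the curve exactly as given, so the x coordinates must already be monotonic. A small usage sketch of the wrapper (assuming the instance is called directly, as the other metrics are in the tests):

    import torch
    from pytorch_lightning.metrics.sklearn import AUC

    auc = AUC()
    x = torch.tensor([0.0, 0.1, 0.35, 0.8, 1.0])  # must already be sorted (ascending here)
    y = torch.tensor([0.0, 0.5, 0.75, 0.9, 1.0])
    area = auc(x=x, y=y)  # trapezoidal area under the piecewise-linear curve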
@@ -121,7 +119,7 @@ def __init__(self, reorder: bool = False, super().__init__(metric_name='auc', reduce_group=reduce_group, reduce_op=reduce_op, - reorder=reorder) + ) def forward(self, x: np.ndarray, y: np.ndarray) -> float: """ From ca471d663d833289446cba477c822c7899250481 Mon Sep 17 00:00:00 2001 From: Justus Schock Date: Mon, 25 May 2020 08:54:09 +0200 Subject: [PATCH 22/44] fix imports --- pytorch_lightning/metrics/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/metrics/utils.py b/pytorch_lightning/metrics/utils.py index 9942545273546..e284b9494d8b8 100644 --- a/pytorch_lightning/metrics/utils.py +++ b/pytorch_lightning/metrics/utils.py @@ -5,7 +5,7 @@ import torch from torch.utils.data._utils.collate import default_convert -from pytorch_lightning.utilities.apply_to_collection import apply_to_collection +from pytorch_lightning.utilities.apply_func import apply_to_collection def _apply_to_inputs(func_to_apply, *dec_args, **dec_kwargs): From e9f5faf8e56fc398b52aaf3df4df0958b23346e6 Mon Sep 17 00:00:00 2001 From: Justus Schock Date: Mon, 25 May 2020 09:12:07 +0200 Subject: [PATCH 23/44] update requirements --- environment.yml | 4 +++- requirements-extra.txt | 3 ++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/environment.yml b/environment.yml index cad6c002d1a4d..98f5fb81e1cdd 100644 --- a/environment.yml +++ b/environment.yml @@ -27,7 +27,9 @@ dependencies: - check-manifest - twine==1.13.0 - pillow<7.0.0 - - scikit-learn>=0.16.1 + - scipy>=0.13.3 + - scikit-learn>=0.20.0 + - pip: - test-tube>=0.7.5 diff --git a/requirements-extra.txt b/requirements-extra.txt index cdc10043bc858..1ce0aa550212f 100644 --- a/requirements-extra.txt +++ b/requirements-extra.txt @@ -10,4 +10,5 @@ matplotlib>=3.1.1 # no need to install with [pytorch] as pytorch is already installed and torchvision is required only for Horovod examples horovod>=0.19.1 omegaconf>=2.0.0 -scikit-learn>=0.16.1 +scipy>=0.13.3 +scikit-learn>=0.20.0 From 13f205a19a3c75a8ccae8e13af78ae6cfe32ad6f Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Mon, 25 May 2020 18:24:03 +0200 Subject: [PATCH 24/44] Update CHANGELOG.md --- CHANGELOG.md | 1 - 1 file changed, 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 80070aeb3dde6..b615abc47a4d1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,7 +4,6 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- ## [unreleased] - YYYY-MM-DD ### Added From b5dbdb8302857ec0034ab9607447bfe73aadabf5 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Mon, 25 May 2020 18:28:30 +0200 Subject: [PATCH 25/44] Update test_sklearn_metrics.py --- tests/metrics/test_sklearn_metrics.py | 29 ++++++++++++--------------- 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/tests/metrics/test_sklearn_metrics.py b/tests/metrics/test_sklearn_metrics.py index fc2f04dcdf479..e075330d60a3c 100644 --- a/tests/metrics/test_sklearn_metrics.py +++ b/tests/metrics/test_sklearn_metrics.py @@ -25,40 +25,37 @@ def new_func(*args, **kwargs): @pytest.mark.parametrize(['metric_class', 'sklearn_func', 'inputs'], [ pytest.param(Accuracy(), accuracy_score, {'y_pred': torch.randint(low=0, high=10, size=(128,)), - 'y_true': torch.randint(low=0, high=10, size=(128,))}), + 'y_true': torch.randint(low=0, high=10, size=(128,))}, id='Accuracy'), pytest.param(AUC(), auc, {'x': torch.arange(10, dtype=torch.float) / 10, 'y': torch.tensor([0.2, 0.2, 0.2, 0.2, 0.2, - 0.2, 0.3, 0.5, 0.6, 0.7])}), + 0.2, 0.3, 0.5, 0.6, 0.7])}, id='AUC'), pytest.param(AveragePrecision(), average_precision_score, {'y_score': torch.randint(2, size=(128,)), - 'y_true': torch.randint(2, size=(128,))}), + 'y_true': torch.randint(2, size=(128,))}, id='AveragePrecision'), pytest.param(ConfusionMatrix(), confusion_matrix, {'y_pred': torch.randint(10, size=(128,)), - 'y_true': torch.randint(10, size=(128,))}), + 'y_true': torch.randint(10, size=(128,))}, id='ConfusionMatrix'), pytest.param(F1(average='macro'), partial(f1_score, average='macro'), {'y_pred': torch.randint(10, size=(128,)), - 'y_true': torch.randint(10, size=(128,))}), - pytest.param(FBeta(beta=0.5, average='macro'), partial(fbeta_score, - beta=0.5, - average='macro'), + 'y_true': torch.randint(10, size=(128,))}, id='F1'), + pytest.param(FBeta(beta=0.5, average='macro'), partial(fbeta_score, beta=0.5, average='macro'), {'y_pred': torch.randint(10, size=(128,)), - 'y_true': torch.randint(10, size=(128,))}), - pytest.param(Precision(average='macro'), partial(precision_score, - average='macro'), + 'y_true': torch.randint(10, size=(128,))}, id='FBeta'), + pytest.param(Precision(average='macro'), partial(precision_score, average='macro'), {'y_pred': torch.randint(10, size=(128,)), - 'y_true': torch.randint(10, size=(128,))}), + 'y_true': torch.randint(10, size=(128,))}, id='Precision'), pytest.param(Recall(average='macro'), partial(recall_score, average='macro'), {'y_pred': torch.randint(10, size=(128,)), - 'y_true': torch.randint(10, size=(128,))}), + 'y_true': torch.randint(10, size=(128,))}, id='Recall'), pytest.param(PrecisionRecallCurve(), xy_only(precision_recall_curve), {'probas_pred': torch.rand(size=(128,)), - 'y_true': torch.randint(2, size=(128,))}), + 'y_true': torch.randint(2, size=(128,))}, id='PrecisionRecallCurve'), pytest.param(ROC(), xy_only(roc_curve), {'y_score': torch.rand(size=(128,)), - 'y_true': torch.randint(2, size=(128,))}), + 'y_true': torch.randint(2, size=(128,))}, id='ROC'), pytest.param(AUROC(), roc_auc_score, {'y_score': torch.rand(size=(128,)), - 'y_true': torch.randint(2, size=(128,))}), + 'y_true': torch.randint(2, size=(128,))}, id='AUROC'), ]) def test_sklearn_metric(metric_class, sklearn_func, inputs: dict): numpy_inputs = apply_to_collection( From a7e3e4f035f186ceceee80d714e3d2f12fc2fea5 Mon Sep 17 00:00:00 2001 From: Jirka Date: Mon, 25 May 2020 21:40:21 +0200 Subject: [PATCH 26/44] formatting --- pytorch_lightning/metrics/__init__.py | 6 ++--- 
pytorch_lightning/metrics/sklearn.py | 34 ++++++++++++++++++--------- 2 files changed, 26 insertions(+), 14 deletions(-) diff --git a/pytorch_lightning/metrics/__init__.py b/pytorch_lightning/metrics/__init__.py index cdc7f3f4d90fc..83446d11701f9 100644 --- a/pytorch_lightning/metrics/__init__.py +++ b/pytorch_lightning/metrics/__init__.py @@ -24,7 +24,7 @@ """ from pytorch_lightning.metrics.metric import Metric, TensorMetric, NumpyMetric -from pytorch_lightning.metrics.sklearn import (SklearnMetric, Accuracy, AveragePrecision, AUC, - ConfusionMatrix, F1, FBeta, - Precision, Recall, PrecisionRecallCurve, ROC, AUROC) +from pytorch_lightning.metrics.sklearn import ( + SklearnMetric, Accuracy, AveragePrecision, AUC, ConfusionMatrix, F1, FBeta, + Precision, Recall, PrecisionRecallCurve, ROC, AUROC) from pytorch_lightning.metrics.converters import numpy_metric, tensor_metric diff --git a/pytorch_lightning/metrics/sklearn.py b/pytorch_lightning/metrics/sklearn.py index 1fbb1bc124761..20b14980f942e 100644 --- a/pytorch_lightning/metrics/sklearn.py +++ b/pytorch_lightning/metrics/sklearn.py @@ -6,8 +6,20 @@ from pytorch_lightning import _logger as lightning_logger from pytorch_lightning.metrics.metric import NumpyMetric -__all__ = ['SklearnMetric', 'Accuracy', 'AveragePrecision', 'AUC', 'ConfusionMatrix', 'F1', 'FBeta', - 'Precision', 'Recall', 'PrecisionRecallCurve', 'ROC', 'AUROC'] +__all__ = [ + 'SklearnMetric', + 'Accuracy', + 'AveragePrecision', + 'AUC', + 'ConfusionMatrix', + 'F1', + 'FBeta', + 'Precision', + 'Recall', + 'PrecisionRecallCurve', + 'ROC', + 'AUROC' +] class SklearnMetric(NumpyMetric): @@ -143,15 +155,15 @@ def __init__(self, average: Optional[str] = 'macro', """ Calculates the average precision (AP) score. Args: - average: If None, the scores for each class are returned. Otherwise, this determines the type of - averaging performed on the data: - * If 'micro': Calculate metrics globally by considering each element of the label indicator - matrix as a label. - * If 'macro': Calculate metrics for each label, and find their unweighted mean. - This does not take label imbalance into account. - * If 'weighted': Calculate metrics for each label, and find their average, weighted by - support (the number of true instances for each label). - * If 'samples': Calculate metrics for each instance, and find their average. + average: If None, the scores for each class are returned. + Otherwise, this determines the type of averaging performed on the data: + * If 'micro': Calculate metrics globally by considering each element + of the label indicator matrix as a label. + * If 'macro': Calculate metrics for each label, and find their unweighted mean. + This does not take label imbalance into account. + * If 'weighted': Calculate metrics for each label, and find their average, + weighted by support (the number of true instances for each label). + * If 'samples': Calculate metrics for each instance, and find their average. reduce_group: the process group for DDP reduces (only needed for DDP training). Defaults to all processes (world) reduce_op: the operation to perform during reduction within DDP (only needed for DDP training). 
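Editor's note: the `average` options documented in the hunk above only produce different numbers once there is more than one label column, i.e. for multilabel (label-indicator) targets. A hedged sketch with random multilabel data, assuming the wrapper forwards `average` unchanged to `sklearn.metrics.average_precision_score`:

    import torch
    from pytorch_lightning.metrics.sklearn import AveragePrecision

    y_true = torch.randint(2, size=(64, 3))   # 3-label indicator targets
    y_score = torch.rand(64, 3)               # per-label scores

    ap_macro = AveragePrecision(average='macro')(y_score=y_score, y_true=y_true)  # unweighted mean over labels
    ap_micro = AveragePrecision(average='micro')(y_score=y_score, y_true=y_true)  # pool all label decisions first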
From cc9b1b315e0fbe33719bc7f08e6edd5abae16b84 Mon Sep 17 00:00:00 2001 From: Jirka Date: Mon, 25 May 2020 21:49:56 +0200 Subject: [PATCH 27/44] formatting --- pytorch_lightning/metrics/sklearn.py | 108 +++++++++++---------------- 1 file changed, 44 insertions(+), 64 deletions(-) diff --git a/pytorch_lightning/metrics/sklearn.py b/pytorch_lightning/metrics/sklearn.py index 20b14980f942e..af8cd3c02373b 100644 --- a/pytorch_lightning/metrics/sklearn.py +++ b/pytorch_lightning/metrics/sklearn.py @@ -23,18 +23,19 @@ class SklearnMetric(NumpyMetric): + """ + Bridge between PyTorch Lightning and scikit-learn metrics + + .. warning:: + Every metric call will cause a GPU synchronization, which may slow down your code + + .. note:: + The order of targets and predictions may be different from the order typically used in PyTorch + """ def __init__(self, metric_name: str, reduce_group: Any = torch.distributed.group.WORLD, reduce_op: Any = torch.distributed.ReduceOp.SUM, **kwargs): """ - Bridge between PyTorch Lightning and scikit-learn metrics - - .. warning:: - Every metric call will cause a GPU synchronization, which may slow down your code - - .. note:: - The order of targets and predictions may be different from the order typically used in PyTorch - Args: metric_name: the metric name to import anc compute from scikit-learn.metrics reduce_group: the process group for DDP reduces (only needed for DDP training). @@ -57,8 +58,8 @@ def metric_fn(self): return getattr(sklearn.metrics, self.name) def forward(self, *args, **kwargs) -> Union[np.ndarray, int, float]: - """ - Carries the actual metric computation + """ Carries the actual metric computation + Args: *args: Positional arguments forwarded to metric call (should be already converted to numpy) **kwargs: keyword arguments forwarded to metric call (should be already converted to numpy) @@ -71,15 +72,16 @@ def forward(self, *args, **kwargs) -> Union[np.ndarray, int, float]: class Accuracy(SklearnMetric): + """ + Calculates the Accuracy Score + + .. warning:: + Every metric call will cause a GPU synchronization, which may slow down your code + """ def __init__(self, normalize: bool = True, reduce_group: Any = torch.distributed.group.WORLD, reduce_op: Any = torch.distributed.ReduceOp.SUM): """ - Calculates the Accuracy Score - - .. warning:: - Every metric call will cause a GPU synchronization, which may slow down your code - Args: normalize: If ``False``, return the number of correctly classified samples. Otherwise, return the fraction of correctly classified samples. @@ -95,8 +97,8 @@ def __init__(self, normalize: bool = True, def forward(self, y_pred: np.ndarray, y_true: np.ndarray, sample_weight: Optional[np.ndarray] = None) -> float: - """ - Computes the accuracy + """ Computes the accuracy + Args: y_pred: the array containing the predictions (already in categorical form) y_true: the array containing the targets (in categorical form) @@ -104,23 +106,22 @@ def forward(self, y_pred: np.ndarray, y_true: np.ndarray, Returns: Accuracy Score - - """ return super().forward(y_pred=y_pred, y_true=y_true, sample_weight=sample_weight) class AUC(SklearnMetric): + """ + Calculates the Area Under the Curve using the trapoezoidal rule + + .. warning:: + Every metric call will cause a GPU synchronization, which may slow down your code + """ def __init__(self, reduce_group: Any = torch.distributed.group.WORLD, reduce_op: Any = torch.distributed.ReduceOp.SUM ): """ - Calculates the Area Under the Curve using the trapoezoidal rule - - .. 
warning:: - Every metric call will cause a GPU synchronization, which may slow down your code - Args: reduce_group: the process group for DDP reduces (only needed for DDP training). Defaults to all processes (world) @@ -134,8 +135,8 @@ def __init__(self, ) def forward(self, x: np.ndarray, y: np.ndarray) -> float: - """ - Computes the AUC + """ Computes the AUC + Args: x: x coordinates. y: y coordinates. @@ -148,12 +149,14 @@ def forward(self, x: np.ndarray, y: np.ndarray) -> float: class AveragePrecision(SklearnMetric): + """ + Calculates the average precision (AP) score. + """ def __init__(self, average: Optional[str] = 'macro', reduce_group: Any = torch.distributed.group.WORLD, reduce_op: Any = torch.distributed.ReduceOp.SUM ): """ - Calculates the average precision (AP) score. Args: average: If None, the scores for each class are returned. Otherwise, this determines the type of averaging performed on the data: @@ -177,7 +180,6 @@ def __init__(self, average: Optional[str] = 'macro', def forward(self, y_score: np.ndarray, y_true: np.ndarray, sample_weight: Optional[np.ndarray] = None) -> float: """ - Args: y_score: Target scores, can either be probability estimates of the positive class, confidence values, or binary decisions. @@ -191,16 +193,17 @@ def forward(self, y_score: np.ndarray, y_true: np.ndarray, class ConfusionMatrix(SklearnMetric): + """ + Compute confusion matrix to evaluate the accuracy of a classification + By definition a confusion matrix :math:`C` is such that :math:`C_{i, j}` + is equal to the number of observations known to be in group :math:`i` but + predicted to be in group :math:`j`. + """ def __init__(self, labels: Optional[Sequence] = None, reduce_group: Any = torch.distributed.group.WORLD, reduce_op: Any = torch.distributed.ReduceOp.SUM ): """ - Compute confusion matrix to evaluate the accuracy of a classification - By definition a confusion matrix :math:`C` is such that :math:`C_{i, j}` - is equal to the number of observations known to be in group :math:`i` but - predicted to be in group :math:`j`. - Args: labels: List of labels to index the matrix. This may be used to reorder or select a subset of labels. @@ -218,7 +221,6 @@ def __init__(self, labels: Optional[Sequence] = None, def forward(self, y_pred: np.ndarray, y_true: np.ndarray) -> np.ndarray: """ - Args: y_pred: Estimated targets as returned by a classifier. y_true: Ground truth (correct) target values. @@ -251,7 +253,6 @@ def __init__(self, labels: Optional[Sequence] = None, reduce_group: Any = torch.distributed.group.WORLD, reduce_op: Any = torch.distributed.ReduceOp.SUM): """ - Args: labels: Integer array of labels. pos_label: The class to report if ``average='binary'``. @@ -294,13 +295,11 @@ def __init__(self, labels: Optional[Sequence] = None, def forward(self, y_pred: np.ndarray, y_true: np.ndarray, sample_weight: Optional[np.ndarray] = None) -> Union[np.ndarray, float]: """ - Args: y_pred : Estimated targets as returned by a classifier. y_true: Ground truth (correct) target values. sample_weight: Sample weights. - Returns: F1 score of the positive class in binary classification or weighted average of the F1 scores of each class for the multiclass task. @@ -315,12 +314,12 @@ class FBeta(SklearnMetric): favors recall (``beta -> 0`` considers only precision, ``beta -> inf`` only recall). - References: - .. [1] R. Baeza-Yates and B. Ribeiro-Neto (2011). - Modern Information Retrieval. Addison Wesley, pp. 327-328. - .. [2] `Wikipedia entry for the F1-score - `_ - """ + References: + .. [1] R. 
Baeza-Yates and B. Ribeiro-Neto (2011). + Modern Information Retrieval. Addison Wesley, pp. 327-328. + .. [2] `Wikipedia entry for the F1-score + `_ + """ def __init__(self, beta: float, labels: Optional[Sequence] = None, pos_label: Union[str, int] = 1, @@ -328,7 +327,6 @@ def __init__(self, beta: float, labels: Optional[Sequence] = None, reduce_group: Any = torch.distributed.group.WORLD, reduce_op: Any = torch.distributed.ReduceOp.SUM): """ - Args: beta: Weight of precision in harmonic mean. labels: Integer array of labels. @@ -373,13 +371,11 @@ def __init__(self, beta: float, labels: Optional[Sequence] = None, def forward(self, y_pred: np.ndarray, y_true: np.ndarray, sample_weight: Optional[np.ndarray] = None) -> Union[np.ndarray, float]: """ - Args: y_pred : Estimated targets as returned by a classifier. y_true: Ground truth (correct) target values. sample_weight: Sample weights. - Returns: FBeta score of the positive class in binary classification or weighted average of the FBeta scores of each class for the multiclass task. @@ -395,7 +391,6 @@ class Precision(SklearnMetric): intuitively the ability of the classifier not to label as positive a sample that is negative. The best value is 1 and the worst value is 0. - """ def __init__(self, labels: Optional[Sequence] = None, @@ -404,7 +399,6 @@ def __init__(self, labels: Optional[Sequence] = None, reduce_group: Any = torch.distributed.group.WORLD, reduce_op: Any = torch.distributed.ReduceOp.SUM): """ - Args: labels: Integer array of labels. pos_label: The class to report if ``average='binary'``. @@ -447,16 +441,13 @@ def __init__(self, labels: Optional[Sequence] = None, def forward(self, y_pred: np.ndarray, y_true: np.ndarray, sample_weight: Optional[np.ndarray] = None) -> Union[np.ndarray, float]: """ - Args: y_pred : Estimated targets as returned by a classifier. y_true: Ground truth (correct) target values. sample_weight: Sample weights. - Returns: Precision of the positive class in binary classification or weighted average of the precision of each class for the multiclass task. - """ return super().forward(y_pred=y_pred, y_true=y_true, sample_weight=sample_weight) @@ -468,7 +459,6 @@ class Recall(SklearnMetric): true positives and ``fn`` the number of false negatives. The recall is intuitively the ability of the classifier to find all the positive samples. The best value is 1 and the worst value is 0. - """ def __init__(self, labels: Optional[Sequence] = None, @@ -477,7 +467,6 @@ def __init__(self, labels: Optional[Sequence] = None, reduce_group: Any = torch.distributed.group.WORLD, reduce_op: Any = torch.distributed.ReduceOp.SUM): """ - Args: labels: Integer array of labels. pos_label: The class to report if ``average='binary'``. @@ -520,16 +509,13 @@ def __init__(self, labels: Optional[Sequence] = None, def forward(self, y_pred: np.ndarray, y_true: np.ndarray, sample_weight: Optional[np.ndarray] = None) -> Union[np.ndarray, float]: """ - Args: y_pred : Estimated targets as returned by a classifier. y_true: Ground truth (correct) target values. sample_weight: Sample weights. - Returns: Recall of the positive class in binary classification or weighted average of the recall of each class for the multiclass task. - """ return super().forward(y_pred=y_pred, y_true=y_true, sample_weight=sample_weight) @@ -551,7 +537,6 @@ class PrecisionRecallCurve(SklearnMetric): The last precision and recall values are 1. and 0. respectively and do not have a corresponding threshold. This ensures that the graph starts on the x axis. 
- """ def __init__(self, @@ -559,7 +544,6 @@ def __init__(self, reduce_group: Any = torch.distributed.group.WORLD, reduce_op: Any = torch.distributed.ReduceOp.SUM): """ - Args: pos_label: The class to report if ``average='binary'``. reduce_group: the process group for DDP reduces (only needed for DDP training). @@ -575,13 +559,11 @@ def __init__(self, def forward(self, probas_pred: np.ndarray, y_true: np.ndarray, sample_weight: Optional[np.ndarray] = None) -> Union[np.ndarray, float]: """ - Args: probas_pred : Estimated probabilities or decision function. y_true: Ground truth (correct) target values. sample_weight: Sample weights. - Returns: precision: Precision values such that element i is the precision of @@ -606,7 +588,6 @@ class ROC(SklearnMetric): Note: this implementation is restricted to the binary classification task. - """ def __init__(self, @@ -614,7 +595,6 @@ def __init__(self, reduce_group: Any = torch.distributed.group.WORLD, reduce_op: Any = torch.distributed.ReduceOp.SUM): """ - Args: pos_labels: The class to report if ``average='binary'``. reduce_group: the process group for DDP reduces (only needed for DDP training). @@ -634,7 +614,6 @@ def __init__(self, def forward(self, y_score: np.ndarray, y_true: np.ndarray, sample_weight: Optional[np.ndarray] = None) -> Union[np.ndarray, float]: """ - Args: y_score : Target scores, can either be probability estimates of the positive class or confidence values. @@ -660,6 +639,7 @@ class or confidence values. class AUROC(SklearnMetric): """ Compute Area Under the Curve (AUC) from prediction scores + Note: this implementation is restricted to the binary classification task or multilabel classification task in label indicator format. @@ -693,12 +673,12 @@ def __init__(self, average: Optional[str] = 'macro', def forward(self, y_score: np.ndarray, y_true: np.ndarray, sample_weight: Optional[np.ndarray] = None) -> float: """ - Args: y_score: Target scores, can either be probability estimates of the positive class, confidence values, or binary decisions. y_true: True binary labels in binary label indicators. sample_weight: Sample weights. + Returns: Area Under Receiver Operating Characteristic Curve """ From c9908a1a644921348079870fde58eb157cd5e8eb Mon Sep 17 00:00:00 2001 From: Jirka Date: Mon, 25 May 2020 22:03:36 +0200 Subject: [PATCH 28/44] format --- pytorch_lightning/metrics/sklearn.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/pytorch_lightning/metrics/sklearn.py b/pytorch_lightning/metrics/sklearn.py index af8cd3c02373b..005c305adaa77 100644 --- a/pytorch_lightning/metrics/sklearn.py +++ b/pytorch_lightning/metrics/sklearn.py @@ -160,13 +160,13 @@ def __init__(self, average: Optional[str] = 'macro', Args: average: If None, the scores for each class are returned. Otherwise, this determines the type of averaging performed on the data: - * If 'micro': Calculate metrics globally by considering each element - of the label indicator matrix as a label. - * If 'macro': Calculate metrics for each label, and find their unweighted mean. - This does not take label imbalance into account. - * If 'weighted': Calculate metrics for each label, and find their average, - weighted by support (the number of true instances for each label). - * If 'samples': Calculate metrics for each instance, and find their average. + * If 'micro': Calculate metrics globally by considering each element + of the label indicator matrix as a label. + * If 'macro': Calculate metrics for each label, and find their unweighted mean. 
+ This does not take label imbalance into account. + * If 'weighted': Calculate metrics for each label, and find their average, + weighted by support (the number of true instances for each label). + * If 'samples': Calculate metrics for each instance, and find their average. reduce_group: the process group for DDP reduces (only needed for DDP training). Defaults to all processes (world) reduce_op: the operation to perform during reduction within DDP (only needed for DDP training). From 41e59717c1a540d4463d30c15fbf344176beb3c0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 26 May 2020 00:23:29 +0200 Subject: [PATCH 29/44] fix all warnings and formatting problems --- docs/source/conf.py | 1 + pytorch_lightning/metrics/sklearn.py | 259 +++++++++++++++------------ 2 files changed, 145 insertions(+), 115 deletions(-) diff --git a/docs/source/conf.py b/docs/source/conf.py index 4133571c65635..a084e5e349e39 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -90,6 +90,7 @@ 'sphinx.ext.linkcode', 'sphinx.ext.autosummary', 'sphinx.ext.napoleon', + 'sphinx.ext.imgmath', 'recommonmark', 'sphinx.ext.autosectionlabel', # 'm2r', diff --git a/pytorch_lightning/metrics/sklearn.py b/pytorch_lightning/metrics/sklearn.py index 005c305adaa77..16cfd24cd7f30 100644 --- a/pytorch_lightning/metrics/sklearn.py +++ b/pytorch_lightning/metrics/sklearn.py @@ -26,18 +26,18 @@ class SklearnMetric(NumpyMetric): """ Bridge between PyTorch Lightning and scikit-learn metrics - .. warning:: - Every metric call will cause a GPU synchronization, which may slow down your code + Warning: + Every metric call will cause a GPU synchronization, which may slow down your code - .. note:: - The order of targets and predictions may be different from the order typically used in PyTorch + Note: + The order of targets and predictions may be different from the order typically used in PyTorch """ def __init__(self, metric_name: str, reduce_group: Any = torch.distributed.group.WORLD, reduce_op: Any = torch.distributed.ReduceOp.SUM, **kwargs): """ Args: - metric_name: the metric name to import anc compute from scikit-learn.metrics + metric_name: the metric name to import and compute from scikit-learn.metrics reduce_group: the process group for DDP reduces (only needed for DDP training). Defaults to all processes (world) reduce_op: the operation to perform during reduction within DDP (only needed for DDP training). @@ -58,14 +58,15 @@ def metric_fn(self): return getattr(sklearn.metrics, self.name) def forward(self, *args, **kwargs) -> Union[np.ndarray, int, float]: - """ Carries the actual metric computation + """ + Carries the actual metric computation Args: *args: Positional arguments forwarded to metric call (should be already converted to numpy) **kwargs: keyword arguments forwarded to metric call (should be already converted to numpy) Returns: - the metric value (will be converted to tensor by baseclass + the metric value (will be converted to tensor by baseclass) """ return self.metric_fn(*args, **kwargs, **self.metric_kwargs) @@ -97,15 +98,17 @@ def __init__(self, normalize: bool = True, def forward(self, y_pred: np.ndarray, y_true: np.ndarray, sample_weight: Optional[np.ndarray] = None) -> float: - """ Computes the accuracy + """ + Computes the accuracy Args: y_pred: the array containing the predictions (already in categorical form) y_true: the array containing the targets (in categorical form) - sample_weight: + sample_weight: Sample weights. 
Returns: Accuracy Score + """ return super().forward(y_pred=y_pred, y_true=y_true, sample_weight=sample_weight) @@ -114,8 +117,8 @@ class AUC(SklearnMetric): """ Calculates the Area Under the Curve using the trapoezoidal rule - .. warning:: - Every metric call will cause a GPU synchronization, which may slow down your code + Warning: + Every metric call will cause a GPU synchronization, which may slow down your code """ def __init__(self, reduce_group: Any = torch.distributed.group.WORLD, @@ -135,7 +138,8 @@ def __init__(self, ) def forward(self, x: np.ndarray, y: np.ndarray) -> float: - """ Computes the AUC + """ + Computes the AUC Args: x: x coordinates. @@ -158,15 +162,17 @@ def __init__(self, average: Optional[str] = 'macro', ): """ Args: - average: If None, the scores for each class are returned. - Otherwise, this determines the type of averaging performed on the data: - * If 'micro': Calculate metrics globally by considering each element - of the label indicator matrix as a label. + average: If None, the scores for each class are returned. Otherwise, this determines the type of + averaging performed on the data: + + * If 'micro': Calculate metrics globally by considering each element of the label indicator + matrix as a label. * If 'macro': Calculate metrics for each label, and find their unweighted mean. - This does not take label imbalance into account. - * If 'weighted': Calculate metrics for each label, and find their average, - weighted by support (the number of true instances for each label). + This does not take label imbalance into account. + * If 'weighted': Calculate metrics for each label, and find their average, weighted by + support (the number of true instances for each label). * If 'samples': Calculate metrics for each instance, and find their average. + reduce_group: the process group for DDP reduces (only needed for DDP training). Defaults to all processes (world) reduce_op: the operation to perform during reduction within DDP (only needed for DDP training). @@ -185,6 +191,7 @@ def forward(self, y_score: np.ndarray, y_true: np.ndarray, confidence values, or binary decisions. y_true: True binary labels in binary label indicators. sample_weight: Sample weights. + Returns: average precision score """ @@ -225,26 +232,31 @@ def forward(self, y_pred: np.ndarray, y_true: np.ndarray) -> np.ndarray: y_pred: Estimated targets as returned by a classifier. y_true: Ground truth (correct) target values. - Returns: Confusion matrix (array of shape [n_classes, n_classes]) + Returns: + Confusion matrix (array of shape [n_classes, n_classes]) """ return super().forward(y_pred=y_pred, y_true=y_true) class F1(SklearnMetric): - """ + r""" Compute the F1 score, also known as balanced F-score or F-measure The F1 score can be interpreted as a weighted average of the precision and recall, where an F1 score reaches its best value at 1 and worst score at 0. The relative contribution of precision and recall to the F1 score are - equal. The formula for the F1 score is:: - F1 = 2 * (precision * recall) / (precision + recall) + equal. The formula for the F1 score is: + + .. math:: + + F_1 = 2 \cdot \frac{precision \cdot recall}{precision + recall} + In the multi-class and multi-label case, this is the weighted average of the F1 score of each class. - References: - .. 
[1] `Wikipedia entry for the F1-score - `_ + References + - [1] `Wikipedia entry for the F1-score + `_ """ def __init__(self, labels: Optional[Sequence] = None, @@ -259,24 +271,26 @@ def __init__(self, labels: Optional[Sequence] = None, average: This parameter is required for multiclass/multilabel targets. If ``None``, the scores for each class are returned. Otherwise, this determines the type of averaging performed on the data: - ``'binary'``: - Only report results for the class specified by ``pos_label``. - This is applicable only if targets (``y_{true,pred}``) are binary. - ``'micro'``: - Calculate metrics globally by counting the total true positives, - false negatives and false positives. - ``'macro'``: - Calculate metrics for each label, and find their unweighted - mean. This does not take label imbalance into account. - ``'weighted'``: - Calculate metrics for each label, and find their average, weighted - by support (the number of true instances for each label). This - alters 'macro' to account for label imbalance; it can result in an - F-score that is not between precision and recall. - ``'samples'``: - Calculate metrics for each instance, and find their average (only - meaningful for multilabel classification where this differs from - :func:`accuracy_score`). + + * ``'binary'``: + Only report results for the class specified by ``pos_label``. + This is applicable only if targets (``y_{true,pred}``) are binary. + * ``'micro'``: + Calculate metrics globally by counting the total true positives, + false negatives and false positives. + * ``'macro'``: + Calculate metrics for each label, and find their unweighted + mean. This does not take label imbalance into account. + * ``'weighted'``: + Calculate metrics for each label, and find their average, weighted + by support (the number of true instances for each label). This + alters 'macro' to account for label imbalance; it can result in an + F-score that is not between precision and recall. + * ``'samples'``: + Calculate metrics for each instance, and find their average (only + meaningful for multilabel classification where this differs from + :func:`accuracy_score`). + Note that if ``pos_label`` is given in binary classification with `average != 'binary'`, only that positive class is reported. This behavior is deprecated and will change in version 0.18. @@ -300,7 +314,8 @@ def forward(self, y_pred: np.ndarray, y_true: np.ndarray, y_true: Ground truth (correct) target values. sample_weight: Sample weights. - Returns: F1 score of the positive class in binary classification or weighted + Returns: + F1 score of the positive class in binary classification or weighted average of the F1 scores of each class for the multiclass task. """ @@ -309,16 +324,16 @@ def forward(self, y_pred: np.ndarray, y_true: np.ndarray, class FBeta(SklearnMetric): """ - Compute the F-beta score.The `beta` parameter determines the weight of precision in the combined + Compute the F-beta score. The `beta` parameter determines the weight of precision in the combined score. ``beta < 1`` lends more weight to precision, while ``beta > 1`` favors recall (``beta -> 0`` considers only precision, ``beta -> inf`` only recall). References: - .. [1] R. Baeza-Yates and B. Ribeiro-Neto (2011). - Modern Information Retrieval. Addison Wesley, pp. 327-328. - .. [2] `Wikipedia entry for the F1-score - `_ + - [1] R. Baeza-Yates and B. Ribeiro-Neto (2011). + Modern Information Retrieval. Addison Wesley, pp. 327-328. 
+ - [2] `Wikipedia entry for the F1-score + `_ """ def __init__(self, beta: float, labels: Optional[Sequence] = None, @@ -334,24 +349,26 @@ def __init__(self, beta: float, labels: Optional[Sequence] = None, average: This parameter is required for multiclass/multilabel targets. If ``None``, the scores for each class are returned. Otherwise, this determines the type of averaging performed on the data: - ``'binary'``: - Only report results for the class specified by ``pos_label``. - This is applicable only if targets (``y_{true,pred}``) are binary. - ``'micro'``: - Calculate metrics globally by counting the total true positives, - false negatives and false positives. - ``'macro'``: - Calculate metrics for each label, and find their unweighted - mean. This does not take label imbalance into account. - ``'weighted'``: - Calculate metrics for each label, and find their average, weighted - by support (the number of true instances for each label). This - alters 'macro' to account for label imbalance; it can result in an - F-score that is not between precision and recall. - ``'samples'``: - Calculate metrics for each instance, and find their average (only - meaningful for multilabel classification where this differs from - :func:`accuracy_score`). + + * ``'binary'``: + Only report results for the class specified by ``pos_label``. + This is applicable only if targets (``y_{true,pred}``) are binary. + * ``'micro'``: + Calculate metrics globally by counting the total true positives, + false negatives and false positives. + * ``'macro'``: + Calculate metrics for each label, and find their unweighted + mean. This does not take label imbalance into account. + * ``'weighted'``: + Calculate metrics for each label, and find their average, weighted + by support (the number of true instances for each label). This + alters 'macro' to account for label imbalance; it can result in an + F-score that is not between precision and recall. + * ``'samples'``: + Calculate metrics for each instance, and find their average (only + meaningful for multilabel classification where this differs from + :func:`accuracy_score`). + Note that if ``pos_label`` is given in binary classification with `average != 'binary'`, only that positive class is reported. This behavior is deprecated and will change in version 0.18. @@ -376,7 +393,9 @@ def forward(self, y_pred: np.ndarray, y_true: np.ndarray, y_true: Ground truth (correct) target values. sample_weight: Sample weights. - Returns: FBeta score of the positive class in binary classification or weighted + + Returns: + FBeta score of the positive class in binary classification or weighted average of the FBeta scores of each class for the multiclass task. """ @@ -405,24 +424,26 @@ def __init__(self, labels: Optional[Sequence] = None, average: This parameter is required for multiclass/multilabel targets. If ``None``, the scores for each class are returned. Otherwise, this determines the type of averaging performed on the data: - ``'binary'``: - Only report results for the class specified by ``pos_label``. - This is applicable only if targets (``y_{true,pred}``) are binary. - ``'micro'``: - Calculate metrics globally by counting the total true positives, - false negatives and false positives. - ``'macro'``: - Calculate metrics for each label, and find their unweighted - mean. This does not take label imbalance into account. - ``'weighted'``: - Calculate metrics for each label, and find their average, weighted - by support (the number of true instances for each label). 
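Editor's note: to make the beta weighting concrete, with the usual definition F_beta = (1 + beta^2) * precision * recall / (beta^2 * precision + recall), beta=1 reduces to the harmonic-mean F1 given in the docstring above, while beta=0.5 pulls the score towards precision. A small worked check with plain scikit-learn (example values are my own, not from the patch):

    from sklearn.metrics import f1_score, fbeta_score, precision_score, recall_score

    y_true = [1, 1, 1, 1, 0, 0]
    y_pred = [1, 1, 0, 0, 1, 0]

    # precision = 2/3, recall = 2/4
    print(precision_score(y_true, y_pred), recall_score(y_true, y_pred))
    print(f1_score(y_true, y_pred))               # 4/7 ~ 0.571, the harmonic mean of P and R
    print(fbeta_score(y_true, y_pred, beta=0.5))  # 0.625, closer to the precision of 0.667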
This - alters 'macro' to account for label imbalance; it can result in an - F-score that is not between precision and recall. - ``'samples'``: - Calculate metrics for each instance, and find their average (only - meaningful for multilabel classification where this differs from - :func:`accuracy_score`). + + * ``'binary'``: + Only report results for the class specified by ``pos_label``. + This is applicable only if targets (``y_{true,pred}``) are binary. + * ``'micro'``: + Calculate metrics globally by counting the total true positives, + false negatives and false positives. + * ``'macro'``: + Calculate metrics for each label, and find their unweighted + mean. This does not take label imbalance into account. + * ``'weighted'``: + Calculate metrics for each label, and find their average, weighted + by support (the number of true instances for each label). This + alters 'macro' to account for label imbalance; it can result in an + F-score that is not between precision and recall. + * ``'samples'``: + Calculate metrics for each instance, and find their average (only + meaningful for multilabel classification where this differs from + :func:`accuracy_score`). + Note that if ``pos_label`` is given in binary classification with `average != 'binary'`, only that positive class is reported. This behavior is deprecated and will change in version 0.18. @@ -446,8 +467,10 @@ def forward(self, y_pred: np.ndarray, y_true: np.ndarray, y_true: Ground truth (correct) target values. sample_weight: Sample weights. - Returns: Precision of the positive class in binary classification or weighted - average of the precision of each class for the multiclass task. + Returns: + Precision of the positive class in binary classification or weighted + average of the precision of each class for the multiclass task. + """ return super().forward(y_pred=y_pred, y_true=y_true, sample_weight=sample_weight) @@ -473,24 +496,26 @@ def __init__(self, labels: Optional[Sequence] = None, average: This parameter is required for multiclass/multilabel targets. If ``None``, the scores for each class are returned. Otherwise, this determines the type of averaging performed on the data: - ``'binary'``: - Only report results for the class specified by ``pos_label``. - This is applicable only if targets (``y_{true,pred}``) are binary. - ``'micro'``: - Calculate metrics globally by counting the total true positives, - false negatives and false positives. - ``'macro'``: - Calculate metrics for each label, and find their unweighted - mean. This does not take label imbalance into account. - ``'weighted'``: - Calculate metrics for each label, and find their average, weighted - by support (the number of true instances for each label). This - alters 'macro' to account for label imbalance; it can result in an - F-score that is not between precision and recall. - ``'samples'``: - Calculate metrics for each instance, and find their average (only - meaningful for multilabel classification where this differs from - :func:`accuracy_score`). + + * ``'binary'``: + Only report results for the class specified by ``pos_label``. + This is applicable only if targets (``y_{true,pred}``) are binary. + * ``'micro'``: + Calculate metrics globally by counting the total true positives, + false negatives and false positives. + * ``'macro'``: + Calculate metrics for each label, and find their unweighted + mean. This does not take label imbalance into account. 
+ * ``'weighted'``: + Calculate metrics for each label, and find their average, weighted + by support (the number of true instances for each label). This + alters 'macro' to account for label imbalance; it can result in an + F-score that is not between precision and recall. + * ``'samples'``: + Calculate metrics for each instance, and find their average (only + meaningful for multilabel classification where this differs from + :func:`accuracy_score`). + Note that if ``pos_label`` is given in binary classification with `average != 'binary'`, only that positive class is reported. This behavior is deprecated and will change in version 0.18. @@ -514,8 +539,10 @@ def forward(self, y_pred: np.ndarray, y_true: np.ndarray, y_true: Ground truth (correct) target values. sample_weight: Sample weights. - Returns: Recall of the positive class in binary classification or weighted - average of the recall of each class for the multiclass task. + Returns: + Recall of the positive class in binary classification or weighted + average of the recall of each class for the multiclass task. + """ return super().forward(y_pred=y_pred, y_true=y_true, sample_weight=sample_weight) @@ -525,7 +552,7 @@ class PrecisionRecallCurve(SklearnMetric): Compute precision-recall pairs for different probability thresholds Note: - this implementation is restricted to the binary classification task. + This implementation is restricted to the binary classification task. The precision is the ratio ``tp / (tp + fp)`` where ``tp`` is the number of true positives and ``fp`` the number of false positives. The precision is @@ -603,8 +630,8 @@ def __init__(self, Defaults to sum. References: - .. [1] `Wikipedia entry for the Receiver operating characteristic - `_ + - [1] `Wikipedia entry for the Receiver operating characteristic + `_ """ super().__init__('roc_curve', reduce_group=reduce_group, @@ -653,13 +680,15 @@ def __init__(self, average: Optional[str] = 'macro', Args: average: If None, the scores for each class are returned. Otherwise, this determines the type of averaging performed on the data: + * If 'micro': Calculate metrics globally by considering each element of the label indicator - matrix as a label. + matrix as a label. * If 'macro': Calculate metrics for each label, and find their unweighted mean. - This does not take label imbalance into account. + This does not take label imbalance into account. * If 'weighted': Calculate metrics for each label, and find their average, weighted by - support (the number of true instances for each label). + support (the number of true instances for each label). * If 'samples': Calculate metrics for each instance, and find their average. + reduce_group: the process group for DDP reduces (only needed for DDP training). Defaults to all processes (world) reduce_op: the operation to perform during reduction within DDP (only needed for DDP training). 
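The averaging semantics documented above mirror scikit-learn's, since these wrapper classes forward their inputs to the corresponding sklearn functions and only handle the tensor/numpy conversion around the call. As a rough usage sketch (class and argument names are taken from the diff above; the exact import path, defaults, and printed values are assumptions, not part of this patch), a multiclass F1/FBeta evaluation might look like:

    import numpy as np
    from pytorch_lightning.metrics.sklearn import F1, FBeta

    # toy 3-class predictions/targets; inputs are converted to numpy before the
    # sklearn call and the result is converted back to a tensor by the base class
    y_pred = np.array([0, 1, 2, 2, 1, 0])
    y_true = np.array([0, 2, 2, 2, 1, 1])

    f1 = F1(average='macro')                   # unweighted mean over the three classes
    fbeta = FBeta(beta=2.0, average='micro')   # global TP/FP/FN counts, recall-favoring beta

    print(f1(y_pred, y_true))      # tensor holding the macro-averaged F1 score
    print(fbeta(y_pred, y_true))   # tensor holding the micro-averaged F-beta score
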
From 82781e56b27186cbd6f2ce1f2031015d1c14fe12 Mon Sep 17 00:00:00 2001 From: Justus Schock <12886177+justusschock@users.noreply.github.com> Date: Tue, 26 May 2020 13:09:05 +0200 Subject: [PATCH 30/44] Update environment.yml --- environment.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/environment.yml b/environment.yml index 98f5fb81e1cdd..d610f0a1379b0 100644 --- a/environment.yml +++ b/environment.yml @@ -27,7 +27,7 @@ dependencies: - check-manifest - twine==1.13.0 - pillow<7.0.0 - - scipy>=0.13.3 + - scipy>=0.13.3,<1.4.0 - scikit-learn>=0.20.0 From 01c3e5714c44756b94cf69bac4f547f1eb3eba21 Mon Sep 17 00:00:00 2001 From: Justus Schock <12886177+justusschock@users.noreply.github.com> Date: Tue, 26 May 2020 13:09:31 +0200 Subject: [PATCH 31/44] Update requirements-extra.txt --- requirements-extra.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements-extra.txt b/requirements-extra.txt index 1ce0aa550212f..8598fb45b5029 100644 --- a/requirements-extra.txt +++ b/requirements-extra.txt @@ -10,5 +10,5 @@ matplotlib>=3.1.1 # no need to install with [pytorch] as pytorch is already installed and torchvision is required only for Horovod examples horovod>=0.19.1 omegaconf>=2.0.0 -scipy>=0.13.3 +scipy>=0.13.3, <1.4.0 scikit-learn>=0.20.0 From 6a674b6fb3e09a39c09359c598bfb6ced3140d46 Mon Sep 17 00:00:00 2001 From: Justus Schock <12886177+justusschock@users.noreply.github.com> Date: Tue, 26 May 2020 15:09:27 +0200 Subject: [PATCH 32/44] Update environment.yml --- environment.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/environment.yml b/environment.yml index d610f0a1379b0..98f5fb81e1cdd 100644 --- a/environment.yml +++ b/environment.yml @@ -27,7 +27,7 @@ dependencies: - check-manifest - twine==1.13.0 - pillow<7.0.0 - - scipy>=0.13.3,<1.4.0 + - scipy>=0.13.3 - scikit-learn>=0.20.0 From c96e3d778224140fa4455a9e4f8be0ae798aa158 Mon Sep 17 00:00:00 2001 From: Justus Schock <12886177+justusschock@users.noreply.github.com> Date: Tue, 26 May 2020 15:09:49 +0200 Subject: [PATCH 33/44] Update requirements-extra.txt --- requirements-extra.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements-extra.txt b/requirements-extra.txt index 8598fb45b5029..1ce0aa550212f 100644 --- a/requirements-extra.txt +++ b/requirements-extra.txt @@ -10,5 +10,5 @@ matplotlib>=3.1.1 # no need to install with [pytorch] as pytorch is already installed and torchvision is required only for Horovod examples horovod>=0.19.1 omegaconf>=2.0.0 -scipy>=0.13.3, <1.4.0 +scipy>=0.13.3 scikit-learn>=0.20.0 From b779e3792461d60a88de8161daf797dd3cdc0480 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Mon, 8 Jun 2020 15:23:39 +0200 Subject: [PATCH 34/44] Update CHANGELOG.md --- CHANGELOG.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b615abc47a4d1..ec6157d2bc1ed 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,7 +22,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
### Added - Remove explicit flush from tensorboard logger ([#2126](https://github.com/PyTorchLightning/pytorch-lightning/pull/2126)) -- Add Metric Base Classes ([#1326](https://github.com/PyTorchLightning/pytorch-lightning/pull/1326), [#1877](https://github.com/PyTorchLightning/pytorch-lightning/pull/1877)) +- Added metric Base classes ([#1326](https://github.com/PyTorchLightning/pytorch-lightning/pull/1326), [#1877](https://github.com/PyTorchLightning/pytorch-lightning/pull/1877)) +- Added Sklearn metrics classes ([#1327](https://github.com/PyTorchLightning/pytorch-lightning/pull/1327)) - Added type hints in `Trainer.fit()` and `Trainer.test()` to reflect that also a list of dataloaders can be passed in ([#1723](https://github.com/PyTorchLightning/pytorch-lightning/pull/1723)) - Allow dataloaders without sampler field present ([#1907](https://github.com/PyTorchLightning/pytorch-lightning/pull/1907)) - Added option `save_last` to save the model at the end of every epoch in `ModelCheckpoint` [(#1908)](https://github.com/PyTorchLightning/pytorch-lightning/pull/1908) From f6e6cecadaecefac46bbe9fd162278cc00a6d83e Mon Sep 17 00:00:00 2001 From: Jirka Date: Mon, 8 Jun 2020 15:28:43 +0200 Subject: [PATCH 35/44] docs --- .circleci/config.yml | 9 ++++++--- pytorch_lightning/metrics/sklearn.py | 28 ++++++++++++++-------------- requirements-extra.txt | 2 +- requirements.txt | 2 +- tests/requirements.txt | 2 +- 5 files changed, 23 insertions(+), 20 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 7bd1de8f6c947..fcff1e65feca1 100755 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -64,8 +64,11 @@ references: name: Make Documentation command: | # First run the same pipeline as Read-The-Docs - sudo apt-get update && sudo apt-get install -y cmake - sudo pip install -r docs/requirements.txt + # apt-get update && apt-get install -y cmake + # using: https://hub.docker.com/r/readthedocs/build + pyenv global 3.6.8 + python --version + pip install -r docs/requirements.txt cd docs; make clean; make html --debug --jobs 2 SPHINXOPTS="-W" test_docs: &test_docs @@ -81,7 +84,7 @@ jobs: Build-Docs: docker: - - image: circleci/python:3.7 + - image: readthedocs/build:latest steps: - checkout - *make_docs diff --git a/pytorch_lightning/metrics/sklearn.py b/pytorch_lightning/metrics/sklearn.py index 16cfd24cd7f30..fdc56884db48e 100644 --- a/pytorch_lightning/metrics/sklearn.py +++ b/pytorch_lightning/metrics/sklearn.py @@ -65,7 +65,7 @@ def forward(self, *args, **kwargs) -> Union[np.ndarray, int, float]: *args: Positional arguments forwarded to metric call (should be already converted to numpy) **kwargs: keyword arguments forwarded to metric call (should be already converted to numpy) - Returns: + Return: the metric value (will be converted to tensor by baseclass) """ @@ -76,7 +76,7 @@ class Accuracy(SklearnMetric): """ Calculates the Accuracy Score - .. warning:: + Warning: Every metric call will cause a GPU synchronization, which may slow down your code """ def __init__(self, normalize: bool = True, @@ -106,7 +106,7 @@ def forward(self, y_pred: np.ndarray, y_true: np.ndarray, y_true: the array containing the targets (in categorical form) sample_weight: Sample weights. - Returns: + Return: Accuracy Score """ @@ -145,7 +145,7 @@ def forward(self, x: np.ndarray, y: np.ndarray) -> float: x: x coordinates. y: y coordinates. 
- Returns: + Return: AUC calculated with trapezoidal rule """ @@ -192,7 +192,7 @@ def forward(self, y_score: np.ndarray, y_true: np.ndarray, y_true: True binary labels in binary label indicators. sample_weight: Sample weights. - Returns: + Return: average precision score """ return super().forward(y_score=y_score, y_true=y_true, @@ -232,7 +232,7 @@ def forward(self, y_pred: np.ndarray, y_true: np.ndarray) -> np.ndarray: y_pred: Estimated targets as returned by a classifier. y_true: Ground truth (correct) target values. - Returns: + Return: Confusion matrix (array of shape [n_classes, n_classes]) """ @@ -314,7 +314,7 @@ def forward(self, y_pred: np.ndarray, y_true: np.ndarray, y_true: Ground truth (correct) target values. sample_weight: Sample weights. - Returns: + Return: F1 score of the positive class in binary classification or weighted average of the F1 scores of each class for the multiclass task. @@ -394,9 +394,9 @@ def forward(self, y_pred: np.ndarray, y_true: np.ndarray, sample_weight: Sample weights. - Returns: + Return: FBeta score of the positive class in binary classification or weighted - average of the FBeta scores of each class for the multiclass task. + average of the FBeta scores of each class for the multiclass task. """ return super().forward(y_pred=y_pred, y_true=y_true, sample_weight=sample_weight) @@ -467,9 +467,9 @@ def forward(self, y_pred: np.ndarray, y_true: np.ndarray, y_true: Ground truth (correct) target values. sample_weight: Sample weights. - Returns: + Return: Precision of the positive class in binary classification or weighted - average of the precision of each class for the multiclass task. + average of the precision of each class for the multiclass task. """ return super().forward(y_pred=y_pred, y_true=y_true, sample_weight=sample_weight) @@ -539,9 +539,9 @@ def forward(self, y_pred: np.ndarray, y_true: np.ndarray, y_true: Ground truth (correct) target values. sample_weight: Sample weights. - Returns: + Return: Recall of the positive class in binary classification or weighted - average of the recall of each class for the multiclass task. + average of the recall of each class for the multiclass task. """ return super().forward(y_pred=y_pred, y_true=y_true, sample_weight=sample_weight) @@ -708,7 +708,7 @@ def forward(self, y_score: np.ndarray, y_true: np.ndarray, y_true: True binary labels in binary label indicators. sample_weight: Sample weights. 
- Returns: + Return: Area Under Receiver Operating Characteristic Curve """ return super().forward(y_score=y_score, y_true=y_true, diff --git a/requirements-extra.txt b/requirements-extra.txt index 1ce0aa550212f..ac12429220f82 100644 --- a/requirements-extra.txt +++ b/requirements-extra.txt @@ -11,4 +11,4 @@ matplotlib>=3.1.1 horovod>=0.19.1 omegaconf>=2.0.0 scipy>=0.13.3 -scikit-learn>=0.20.0 +scikit-learn>=0.19 diff --git a/requirements.txt b/requirements.txt index 0aa44aae24f4c..79899d1b9c71f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ # the default package dependencies +numpy>=1.11.0 tqdm>=4.41.0 -numpy>=1.16.4 torch>=1.3 tensorboard>=1.14 future>=0.17.1 # required for builtins in setup.py diff --git a/tests/requirements.txt b/tests/requirements.txt index fdf2e83337acb..2945bc5f968d2 100644 --- a/tests/requirements.txt +++ b/tests/requirements.txt @@ -8,4 +8,4 @@ flake8-black check-manifest twine==1.13.0 black==19.10b0 -pre-commit>=1.21.0 +pre-commit>=1.0 From 533bb7cd76b2ebfcbd55b791dc753dd28de5be9d Mon Sep 17 00:00:00 2001 From: Jirka Date: Tue, 9 Jun 2020 11:06:07 +0200 Subject: [PATCH 36/44] inherit --- pytorch_lightning/metrics/metric.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/metrics/metric.py b/pytorch_lightning/metrics/metric.py index 5247084498559..bd14655f30fa3 100644 --- a/pytorch_lightning/metrics/metric.py +++ b/pytorch_lightning/metrics/metric.py @@ -1,8 +1,9 @@ from abc import ABC, abstractmethod -from typing import Any, Optional, Union +from typing import Any, Optional import torch import torch.distributed +from torch.nn import Module from pytorch_lightning.metrics.converters import tensor_metric, numpy_metric from pytorch_lightning.utilities.apply_func import apply_to_collection @@ -11,7 +12,7 @@ __all__ = ['Metric', 'TensorMetric', 'NumpyMetric'] -class Metric(DeviceDtypeModuleMixin, torch.nn.Module, ABC): +class Metric(ABC, DeviceDtypeModuleMixin, Module): """ Abstract base class for metric implementation. 
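The base-class reorder in patch 36 changes the MRO but not the effective metaclass: ABC contributes ABCMeta in either ordering, while DeviceDtypeModuleMixin/Module supply the nn.Module machinery. A self-contained sketch (toy class, not the actual Metric, and without the device/dtype mixin) showing how that combination resolves:

    from abc import ABC, abstractmethod
    from torch.nn import Module

    class ToyMetric(ABC, Module):          # ABC listed first, as in this patch
        @abstractmethod
        def forward(self, *args, **kwargs):
            ...

    print(ToyMetric.__mro__)   # ToyMetric -> ABC -> Module -> object
    print(type(ToyMetric))     # abc.ABCMeta, the most derived metaclass of the bases
    # ToyMetric() raises TypeError until a subclass implements forward()
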
From ca117e4207235d823e328f49b5e8c1526dcd67b5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 26 May 2020 00:23:29 +0200 Subject: [PATCH 37/44] fix all warnings and formatting problems --- pytorch_lightning/metrics/sklearn.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/pytorch_lightning/metrics/sklearn.py b/pytorch_lightning/metrics/sklearn.py index fdc56884db48e..075949fe25988 100644 --- a/pytorch_lightning/metrics/sklearn.py +++ b/pytorch_lightning/metrics/sklearn.py @@ -26,11 +26,11 @@ class SklearnMetric(NumpyMetric): """ Bridge between PyTorch Lightning and scikit-learn metrics - Warning: - Every metric call will cause a GPU synchronization, which may slow down your code + Warning: + Every metric call will cause a GPU synchronization, which may slow down your code - Note: - The order of targets and predictions may be different from the order typically used in PyTorch + Note: + The order of targets and predictions may be different from the order typically used in PyTorch """ def __init__(self, metric_name: str, reduce_group: Any = torch.distributed.group.WORLD, @@ -77,7 +77,7 @@ class Accuracy(SklearnMetric): Calculates the Accuracy Score Warning: - Every metric call will cause a GPU synchronization, which may slow down your code + Every metric call will cause a GPU synchronization, which may slow down your code """ def __init__(self, normalize: bool = True, reduce_group: Any = torch.distributed.group.WORLD, @@ -117,8 +117,8 @@ class AUC(SklearnMetric): """ Calculates the Area Under the Curve using the trapoezoidal rule - Warning: - Every metric call will cause a GPU synchronization, which may slow down your code + Warning: + Every metric call will cause a GPU synchronization, which may slow down your code """ def __init__(self, reduce_group: Any = torch.distributed.group.WORLD, @@ -396,7 +396,7 @@ def forward(self, y_pred: np.ndarray, y_true: np.ndarray, Return: FBeta score of the positive class in binary classification or weighted - average of the FBeta scores of each class for the multiclass task. + average of the FBeta scores of each class for the multiclass task. """ return super().forward(y_pred=y_pred, y_true=y_true, sample_weight=sample_weight) @@ -469,7 +469,7 @@ def forward(self, y_pred: np.ndarray, y_true: np.ndarray, Return: Precision of the positive class in binary classification or weighted - average of the precision of each class for the multiclass task. + average of the precision of each class for the multiclass task. """ return super().forward(y_pred=y_pred, y_true=y_true, sample_weight=sample_weight) @@ -541,7 +541,7 @@ def forward(self, y_pred: np.ndarray, y_true: np.ndarray, Return: Recall of the positive class in binary classification or weighted - average of the recall of each class for the multiclass task. + average of the recall of each class for the multiclass task. """ return super().forward(y_pred=y_pred, y_true=y_true, sample_weight=sample_weight) From 36fa04c100297da4c16e653d44241101f311eb87 Mon Sep 17 00:00:00 2001 From: Jirka Date: Tue, 9 Jun 2020 11:26:34 +0200 Subject: [PATCH 38/44] docs inherit. 
--- .../computer_vision_fine_tuning.py | 12 +++++++----- pytorch_lightning/core/grads.py | 3 ++- pytorch_lightning/core/hooks.py | 3 ++- pytorch_lightning/core/lightning.py | 2 +- pytorch_lightning/metrics/metric.py | 2 +- .../utilities/device_dtype_mixin.py | 17 +++++++++-------- 6 files changed, 22 insertions(+), 17 deletions(-) diff --git a/pl_examples/domain_templates/computer_vision_fine_tuning.py b/pl_examples/domain_templates/computer_vision_fine_tuning.py index e2db1b98fdb09..703f1c9b02419 100644 --- a/pl_examples/domain_templates/computer_vision_fine_tuning.py +++ b/pl_examples/domain_templates/computer_vision_fine_tuning.py @@ -27,6 +27,8 @@ from tempfile import TemporaryDirectory from typing import Optional, Generator, Union +from torch.nn import Module + import pytorch_lightning as pl import torch import torch.nn.functional as F @@ -47,7 +49,7 @@ # --- Utility functions --- -def _make_trainable(module: torch.nn.Module) -> None: +def _make_trainable(module: Module) -> None: """Unfreezes a given module. Args: @@ -58,7 +60,7 @@ def _make_trainable(module: torch.nn.Module) -> None: module.train() -def _recursive_freeze(module: torch.nn.Module, +def _recursive_freeze(module: Module, train_bn: bool = True) -> None: """Freezes the layers of a given module. @@ -80,7 +82,7 @@ def _recursive_freeze(module: torch.nn.Module, _recursive_freeze(module=child, train_bn=train_bn) -def freeze(module: torch.nn.Module, +def freeze(module: Module, n: Optional[int] = None, train_bn: bool = True) -> None: """Freezes the layers up to index n (if n is not None). @@ -101,7 +103,7 @@ def freeze(module: torch.nn.Module, _make_trainable(module=child) -def filter_params(module: torch.nn.Module, +def filter_params(module: Module, train_bn: bool = True) -> Generator: """Yields the trainable parameters of a given module. @@ -124,7 +126,7 @@ def filter_params(module: torch.nn.Module, yield param -def _unfreeze_and_add_param_group(module: torch.nn.Module, +def _unfreeze_and_add_param_group(module: Module, optimizer: Optimizer, lr: Optional[float] = None, train_bn: bool = True): diff --git a/pytorch_lightning/core/grads.py b/pytorch_lightning/core/grads.py index cb2215002c7d8..f58bbdf25ec88 100644 --- a/pytorch_lightning/core/grads.py +++ b/pytorch_lightning/core/grads.py @@ -4,9 +4,10 @@ from typing import Dict, Union import torch +from torch.nn import Module -class GradInformation(torch.nn.Module): +class GradInformation(Module): def grad_norm(self, norm_type: Union[float, int, str]) -> Dict[str, float]: """Compute each parameter's gradient's norm and their overall norm. 
diff --git a/pytorch_lightning/core/hooks.py b/pytorch_lightning/core/hooks.py index 960c7124383b0..d3fea6d446845 100644 --- a/pytorch_lightning/core/hooks.py +++ b/pytorch_lightning/core/hooks.py @@ -2,6 +2,7 @@ import torch from torch import Tensor +from torch.nn import Module from torch.optim.optimizer import Optimizer from pytorch_lightning.utilities import move_data_to_device @@ -14,7 +15,7 @@ APEX_AVAILABLE = True -class ModelHooks(torch.nn.Module): +class ModelHooks(Module): # TODO: remove in v0.9.0 def on_sanity_check_start(self): diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py index 7632822e462c3..fa21f3da44560 100644 --- a/pytorch_lightning/core/lightning.py +++ b/pytorch_lightning/core/lightning.py @@ -32,7 +32,7 @@ XLA_AVAILABLE = True -class LightningModule(ABC, DeviceDtypeModuleMixin, GradInformation, ModelIO, ModelHooks, Module): +class LightningModule(ABC, DeviceDtypeModuleMixin, GradInformation, ModelIO, ModelHooks): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) diff --git a/pytorch_lightning/metrics/metric.py b/pytorch_lightning/metrics/metric.py index bd14655f30fa3..9fb86d4b46154 100644 --- a/pytorch_lightning/metrics/metric.py +++ b/pytorch_lightning/metrics/metric.py @@ -12,7 +12,7 @@ __all__ = ['Metric', 'TensorMetric', 'NumpyMetric'] -class Metric(ABC, DeviceDtypeModuleMixin, Module): +class Metric(ABC, DeviceDtypeModuleMixin): """ Abstract base class for metric implementation. diff --git a/pytorch_lightning/utilities/device_dtype_mixin.py b/pytorch_lightning/utilities/device_dtype_mixin.py index eb3faf54faf6e..48ccad5307552 100644 --- a/pytorch_lightning/utilities/device_dtype_mixin.py +++ b/pytorch_lightning/utilities/device_dtype_mixin.py @@ -1,9 +1,10 @@ from typing import Union, Optional import torch +from torch.nn import Module -class DeviceDtypeModuleMixin(torch.nn.Module): +class DeviceDtypeModuleMixin(Module): _device: ... _dtype: Union[str, torch.dtype] @@ -25,7 +26,7 @@ def device(self, new_device: Union[str, torch.device]): # Necessary to avoid infinite recursion raise RuntimeError('Cannot set the device explicitly. Please use module.to(new_device).') - def to(self, *args, **kwargs) -> torch.nn.Module: + def to(self, *args, **kwargs) -> Module: """Moves and/or casts the parameters and buffers. This can be called as @@ -91,7 +92,7 @@ def to(self, *args, **kwargs) -> torch.nn.Module: return super().to(*args, **kwargs) - def cuda(self, device: Optional[int] = None) -> torch.nn.Module: + def cuda(self, device: Optional[int] = None) -> Module: """Moves all model parameters and buffers to the GPU. This also makes associated parameters and buffers different objects. So it should be called before constructing optimizer if the module will @@ -108,7 +109,7 @@ def cuda(self, device: Optional[int] = None) -> torch.nn.Module: self._device = torch.device('cuda', index=device) return super().cuda(device=device) - def cpu(self) -> torch.nn.Module: + def cpu(self) -> Module: """Moves all model parameters and buffers to the CPU. Returns: Module: self @@ -116,7 +117,7 @@ def cpu(self) -> torch.nn.Module: self._device = torch.device('cpu') return super().cpu() - def type(self, dst_type: Union[str, torch.dtype]) -> torch.nn.Module: + def type(self, dst_type: Union[str, torch.dtype]) -> Module: """Casts all parameters and buffers to :attr:`dst_type`. 
Arguments: @@ -128,7 +129,7 @@ def type(self, dst_type: Union[str, torch.dtype]) -> torch.nn.Module: self._dtype = dst_type return super().type(dst_type=dst_type) - def float(self) -> torch.nn.Module: + def float(self) -> Module: """Casts all floating point parameters and buffers to float datatype. Returns: @@ -137,7 +138,7 @@ def float(self) -> torch.nn.Module: self._dtype = torch.float return super().float() - def double(self) -> torch.nn.Module: + def double(self) -> Module: """Casts all floating point parameters and buffers to ``double`` datatype. Returns: @@ -146,7 +147,7 @@ def double(self) -> torch.nn.Module: self._dtype = torch.double return super().double() - def half(self) -> torch.nn.Module: + def half(self) -> Module: """Casts all floating point parameters and buffers to ``half`` datatype. Returns: From 0a3a31aae1f043646cee318bf008d3671b4ea7b6 Mon Sep 17 00:00:00 2001 From: Jirka Date: Tue, 9 Jun 2020 11:34:27 +0200 Subject: [PATCH 39/44] docs --- .circleci/config.yml | 2 +- pytorch_lightning/core/lightning.py | 2 +- pytorch_lightning/metrics/metric.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index fcff1e65feca1..8a2f5dfb59c8e 100755 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -66,7 +66,7 @@ references: # First run the same pipeline as Read-The-Docs # apt-get update && apt-get install -y cmake # using: https://hub.docker.com/r/readthedocs/build - pyenv global 3.6.8 + pyenv global 3.7.3 python --version pip install -r docs/requirements.txt cd docs; make clean; make html --debug --jobs 2 SPHINXOPTS="-W" diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py index fa21f3da44560..7632822e462c3 100644 --- a/pytorch_lightning/core/lightning.py +++ b/pytorch_lightning/core/lightning.py @@ -32,7 +32,7 @@ XLA_AVAILABLE = True -class LightningModule(ABC, DeviceDtypeModuleMixin, GradInformation, ModelIO, ModelHooks): +class LightningModule(ABC, DeviceDtypeModuleMixin, GradInformation, ModelIO, ModelHooks, Module): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) diff --git a/pytorch_lightning/metrics/metric.py b/pytorch_lightning/metrics/metric.py index 9fb86d4b46154..bd14655f30fa3 100644 --- a/pytorch_lightning/metrics/metric.py +++ b/pytorch_lightning/metrics/metric.py @@ -12,7 +12,7 @@ __all__ = ['Metric', 'TensorMetric', 'NumpyMetric'] -class Metric(ABC, DeviceDtypeModuleMixin): +class Metric(ABC, DeviceDtypeModuleMixin, Module): """ Abstract base class for metric implementation. 
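Patch 39 above restores torch.nn.Module among the Metric (and LightningModule) bases and moves the docs build to Python 3.7. Keeping Module in the hierarchy is what lets a metric behave like any other submodule: it is registered when assigned to a module attribute, it moves with .to()/.cuda(), and it is invoked through __call__. A minimal illustration with invented toy classes (not part of the PR):

    import torch
    from torch.nn import Module

    class ToyAccuracy(Module):
        def forward(self, pred: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
            return (pred == target).float().mean()

    class ToyModel(Module):
        def __init__(self):
            super().__init__()
            self.layer = torch.nn.Linear(4, 2)
            self.acc = ToyAccuracy()     # registered as a child module

    model = ToyModel()
    print([name for name, _ in model.named_modules()])     # ['', 'layer', 'acc']
    print(model.acc(torch.tensor([1, 0, 1]), torch.tensor([1, 1, 1])))  # tensor(0.6667)
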
From c8d6cacf4c1b4e7d30f5c098fd6cef85e1935982 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Tue, 9 Jun 2020 11:38:26 +0200 Subject: [PATCH 40/44] Apply suggestions from code review Co-authored-by: Nicki Skafte --- pytorch_lightning/metrics/sklearn.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/metrics/sklearn.py b/pytorch_lightning/metrics/sklearn.py index 075949fe25988..45009b32e53e1 100644 --- a/pytorch_lightning/metrics/sklearn.py +++ b/pytorch_lightning/metrics/sklearn.py @@ -49,8 +49,9 @@ def __init__(self, metric_name: str, self.metric_kwargs = kwargs - lightning_logger.debug( - 'Every metric call will cause a GPU synchronization, which may slow down your code') + f'Metric {self.__class__.__name__} is using Sklearn as backend, meaning that' + ' every metric call will cause a GPU synchronization, which may slow down your code' + @property def metric_fn(self): From eeede877c9fe9b50415dd931ace844a7571aef0e Mon Sep 17 00:00:00 2001 From: Jirka Date: Tue, 9 Jun 2020 11:53:30 +0200 Subject: [PATCH 41/44] docs --- .circleci/config.yml | 1 + pytorch_lightning/metrics/sklearn.py | 4 ++-- requirements-extra.txt | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 8a2f5dfb59c8e..2b7f2ad578a32 100755 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -66,6 +66,7 @@ references: # First run the same pipeline as Read-The-Docs # apt-get update && apt-get install -y cmake # using: https://hub.docker.com/r/readthedocs/build + # we need to use py3.7 ot higher becase of an issue with metaclass inheritence pyenv global 3.7.3 python --version pip install -r docs/requirements.txt diff --git a/pytorch_lightning/metrics/sklearn.py b/pytorch_lightning/metrics/sklearn.py index 45009b32e53e1..60cc98c2c329f 100644 --- a/pytorch_lightning/metrics/sklearn.py +++ b/pytorch_lightning/metrics/sklearn.py @@ -48,10 +48,10 @@ def __init__(self, metric_name: str, reduce_op=reduce_op) self.metric_kwargs = kwargs - + lightning_logger.debug( f'Metric {self.__class__.__name__} is using Sklearn as backend, meaning that' ' every metric call will cause a GPU synchronization, which may slow down your code' - + ) @property def metric_fn(self): diff --git a/requirements-extra.txt b/requirements-extra.txt index ac12429220f82..1ce0aa550212f 100644 --- a/requirements-extra.txt +++ b/requirements-extra.txt @@ -11,4 +11,4 @@ matplotlib>=3.1.1 horovod>=0.19.1 omegaconf>=2.0.0 scipy>=0.13.3 -scikit-learn>=0.19 +scikit-learn>=0.20.0 From db8e7243cd1c88ce39ac3174af383a17e1272d5e Mon Sep 17 00:00:00 2001 From: Jirka Date: Tue, 9 Jun 2020 20:27:11 +0200 Subject: [PATCH 42/44] req --- .github/workflows/ci-testing.yml | 6 +++--- requirements-extra.txt | 2 +- requirements.txt | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/ci-testing.yml b/.github/workflows/ci-testing.yml index 29711c0c62295..d905df63dec61 100644 --- a/.github/workflows/ci-testing.yml +++ b/.github/workflows/ci-testing.yml @@ -68,9 +68,9 @@ jobs: - name: Set min. 
dependencies if: matrix.requires == 'minimal' run: | - python -c "req = open('requirements.txt').read().replace('>', '=') ; open('requirements.txt', 'w').write(req)" - python -c "req = open('requirements-extra.txt').read().replace('>', '=') ; open('requirements-extra.txt', 'w').write(req)" - python -c "req = open('tests/requirements-devel.txt').read().replace('>', '=') ; open('tests/requirements-devel.txt', 'w').write(req)" + python -c "req = open('requirements.txt').read().replace('>=', '==') ; open('requirements.txt', 'w').write(req)" + python -c "req = open('requirements-extra.txt').read().replace('>=', '==') ; open('requirements-extra.txt', 'w').write(req)" + python -c "req = open('tests/requirements-devel.txt').read().replace('>=', '==') ; open('tests/requirements-devel.txt', 'w').write(req)" # Note: This uses an internal pip API and may not always work # https://github.com/actions/cache/blob/master/examples.md#multiple-oss-in-a-workflow diff --git a/requirements-extra.txt b/requirements-extra.txt index 1ce0aa550212f..0fcd2f8a1bd92 100644 --- a/requirements-extra.txt +++ b/requirements-extra.txt @@ -10,5 +10,5 @@ matplotlib>=3.1.1 # no need to install with [pytorch] as pytorch is already installed and torchvision is required only for Horovod examples horovod>=0.19.1 omegaconf>=2.0.0 -scipy>=0.13.3 +# scipy>=0.13.3 scikit-learn>=0.20.0 diff --git a/requirements.txt b/requirements.txt index 79899d1b9c71f..8a3923894393b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ # the default package dependencies -numpy>=1.11.0 +numpy>=1.13.3 tqdm>=4.41.0 torch>=1.3 tensorboard>=1.14 From d49298aa948d9f70eea8636a4b0ca7c3d080b092 Mon Sep 17 00:00:00 2001 From: Jirka Date: Tue, 9 Jun 2020 21:25:14 +0200 Subject: [PATCH 43/44] min --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 8a3923894393b..62e723574a9bc 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ # the default package dependencies -numpy>=1.13.3 +numpy>=1.15 # because some BLAS compilation issues tqdm>=4.41.0 torch>=1.3 tensorboard>=1.14 From 363bd640207fc5a8352fe13cf282bc2989154ed4 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Wed, 10 Jun 2020 00:46:28 +0200 Subject: [PATCH 44/44] Apply suggestions from code review Co-authored-by: Tullie Murrell --- pytorch_lightning/metrics/utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pytorch_lightning/metrics/utils.py b/pytorch_lightning/metrics/utils.py index e284b9494d8b8..0829c9711cb44 100644 --- a/pytorch_lightning/metrics/utils.py +++ b/pytorch_lightning/metrics/utils.py @@ -44,7 +44,6 @@ def _convert_to_tensor(data: Any) -> Any: """ if isinstance(data, numbers.Number): return torch.tensor([data]) - else: return default_convert(data)
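As a closing note on the CI change in patch 42: narrowing the replacement from '>' to '>=' pins lower-bound specifiers to exact versions, presumably so that a strict '>' specifier cannot be mangled into an invalid '=' form. A standalone sketch of the same transformation on invented sample contents (not the real requirements files):

    # mirrors the ci-testing.yml one-liner, applied to an in-memory string
    req = "numpy>=1.15\ntorch>=1.3\npillow<7.0.0\n"

    pinned = req.replace('>=', '==')   # only '>=' specifiers become exact pins
    print(pinned)
    # numpy==1.15
    # torch==1.3
    # pillow<7.0.0   (other comparators are left untouched)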