Skip to content

Commit

Permalink
resample idx (#354)
Browse files Browse the repository at this point in the history
* Change `bootstrap` keyword to `iterations`
* Implement index-based resampling/bootstrapping for stats functions.
  • Loading branch information
aaronspring authored Apr 21, 2020
1 parent f956be3 commit 2bc9cd5
Show file tree
Hide file tree
Showing 24 changed files with 304 additions and 176 deletions.
9 changes: 9 additions & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,12 @@ What's New
climpred v2.1.0 (2020-04-##)
============================

Breaking change
---------------

- replace keyword `bootstrap` with `iterations` (:pr:`354`) `Aaron Spring`_.


New Features
------------

Expand Down Expand Up @@ -67,6 +73,9 @@ Internals/Minor Fixes
(:pr:`351`) `Aaron Spring`_.
- Require ``xskillscore v0.0.15`` and use their functions for effective sample
  size-based metrics. (:pr:`353`) `Riley X. Brady`_.
- Faster bootstrapping without replacement used in threshold functions of
climpred.stats (:pr:`354`) `Aaron Spring`_.



Documentation
Expand Down
49 changes: 26 additions & 23 deletions asv_bench/benchmarks/benchmarks_hindcast.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
# only take comparisons compatible with probabilistic metrics
HINDCAST_COMPARISONS = ['m2o']

BOOTSTRAP = 8
ITERATIONS = 8


class Generate:
Expand Down Expand Up @@ -78,15 +78,8 @@ def make_hind_obs(self):
)

self.uninit['var'] = xr.DataArray(
randn(
(self.ninit, self.nx, self.ny, self.nmember), frac_nan=FRAC_NAN
),
coords={
'lon': lons,
'lat': lats,
'time': inits,
'member': members,
},
randn((self.ninit, self.nx, self.ny, self.nmember), frac_nan=FRAC_NAN),
coords={'lon': lons, 'lat': lats, 'time': inits, 'member': members},
dims=('time', 'lon', 'lat', 'member'),
name='var',
attrs={'units': 'var units', 'description': 'a description'},
Expand All @@ -108,10 +101,7 @@ def time_compute_hindcast(self, metric, comparison):
"""Take time for `compute_hindcast`."""
ensure_loaded(
compute_hindcast(
self.hind,
self.observations,
metric=metric,
comparison=comparison,
self.hind, self.observations, metric=metric, comparison=comparison,
)
)

Expand All @@ -120,10 +110,7 @@ def peakmem_compute_hindcast(self, metric, comparison):
"""Take memory peak for `compute_hindcast`."""
ensure_loaded(
compute_hindcast(
self.hind,
self.observations,
metric=metric,
comparison=comparison,
self.hind, self.observations, metric=metric, comparison=comparison,
)
)

Expand All @@ -137,7 +124,7 @@ def time_bootstrap_hindcast(self, metric, comparison):
self.observations,
metric=metric,
comparison=comparison,
bootstrap=BOOTSTRAP,
iterations=ITERATIONS,
dim='member',
)
)
Expand All @@ -152,7 +139,7 @@ def peakmem_bootstrap_hindcast(self, metric, comparison):
self.observations,
metric=metric,
comparison=comparison,
bootstrap=BOOTSTRAP,
iterations=ITERATIONS,
dim='member',
)
)
Expand All @@ -168,8 +155,24 @@ def setup(self, *args, **kwargs):
# https://github.com/pydata/xarray/blob/stable/asv_bench/benchmarks/rolling.py
super().setup(**kwargs)
        # chunk along a spatial dimension to enable embarrassingly parallel computation
self.hind = self.hind['var'].chunk({'lon': self.nx // BOOTSTRAP})
self.hind = self.hind['var'].chunk({'lon': self.nx // ITERATIONS})
self.observations = self.observations['var'].chunk(
{'lon': self.nx // BOOTSTRAP}
{'lon': self.nx // ITERATIONS}
)
self.uninit = self.uninit['var'].chunk({'lon': self.nx // BOOTSTRAP})
self.uninit = self.uninit['var'].chunk({'lon': self.nx // ITERATIONS})


class ComputeSmall(Compute):
    def setup(self, *args, **kwargs):
        """Benchmark time and peak memory of `compute_hindcast` and
        `bootstrap_hindcast`. This executes the same tests as `Compute` but on 1D
        data."""
        # NOTE(review): requires_dask() is called even though this setup does no
        # chunking — confirm dask is actually needed for the 1D benchmarks.
        requires_dask()
        # magic taken from
        # https://github.com/pydata/xarray/blob/stable/asv_bench/benchmarks/rolling.py
        super().setup(**kwargs)
        # Collapse the spatial dimensions by averaging so the benchmark inputs
        # become 1D (time/lead/member only) — the small-data counterpart to the
        # chunked setup in ComputeDask.
        spatial_dims = ['lon', 'lat']
        self.hind = self.hind.mean(spatial_dims)
        self.observations = self.observations.mean(spatial_dims)
        self.uninit = self.uninit.mean(spatial_dims)
30 changes: 22 additions & 8 deletions asv_bench/benchmarks/benchmarks_perfect_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
# only take comparisons compatible with probabilistic metrics
PM_COMPARISONS = ['m2m', 'm2c']

BOOTSTRAP = 8
ITERATIONS = 8


class Generate:
Expand Down Expand Up @@ -118,7 +118,7 @@ def time_bootstrap_perfect_model(self, metric, comparison):
self.control,
metric=metric,
comparison=comparison,
bootstrap=BOOTSTRAP,
iterations=ITERATIONS,
)
)

Expand All @@ -131,20 +131,34 @@ def peakmem_bootstrap_perfect_model(self, metric, comparison):
self.control,
metric=metric,
comparison=comparison,
bootstrap=BOOTSTRAP,
iterations=ITERATIONS,
)
)


class ComputeDask(Compute):
def setup(self, *args, **kwargs):
"""Benchmark time and peak memory of `compute_hindcast` and
`bootstrap_hindcast`. This executes the same tests as `Compute` but on chunked
data."""
"""Benchmark time and peak memory of `compute_perfect_model` and
`bootstrap_perfect_model`. This executes the same tests as `Compute` but
on chunked data."""
requires_dask()
# magic taken from
# https://github.com/pydata/xarray/blob/stable/asv_bench/benchmarks/rolling.py
super().setup(**kwargs)
        # chunk along a spatial dimension to enable embarrassingly parallel computation
self.ds = self.ds['var'].chunk({'lon': self.nx // BOOTSTRAP})
self.control = self.control['var'].chunk({'lon': self.nx // BOOTSTRAP})
self.ds = self.ds['var'].chunk({'lon': self.nx // ITERATIONS})
self.control = self.control['var'].chunk({'lon': self.nx // ITERATIONS})


class ComputeSmall(Compute):
    def setup(self, *args, **kwargs):
        """Benchmark time and peak memory of `compute_perfect_model` and
        `bootstrap_perfect_model`. This executes the same tests as `Compute`
        but on 1D data."""
        # magic taken from
        # https://github.com/pydata/xarray/blob/stable/asv_bench/benchmarks/rolling.py
        super().setup(**kwargs)
        # Collapse the spatial dimensions by averaging so the benchmark inputs
        # become 1D (time/lead/member only) — the small-data counterpart to the
        # chunked setup in ComputeDask.
        spatial_dims = ['lon', 'lat']
        self.ds = self.ds.mean(spatial_dims)
        self.control = self.control.mean(spatial_dims)
Loading

0 comments on commit 2bc9cd5

Please sign in to comment.