Skip to content

Commit

Permalink
resample idx (#354)
Browse files Browse the repository at this point in the history
* Change `bootstrap` keyword to `iterations`
* Implement index-based resampling/bootstrapping for stats functions.
  • Loading branch information
aaronspring authored Apr 21, 2020
1 parent f956be3 commit 2bc9cd5
Show file tree
Hide file tree
Showing 24 changed files with 304 additions and 176 deletions.
9 changes: 9 additions & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,12 @@ What's New
climpred v2.1.0 (2020-04-##)
============================

Breaking change
---------------

- replace keyword `bootstrap` with `iterations` (:pr:`354`) `Aaron Spring`_.


New Features
------------

Expand Down Expand Up @@ -67,6 +73,9 @@ Internals/Minor Fixes
(:pr:`351`) `Aaron Spring`_.
- Require ``xskillscore v0.0.15`` and use their functions for effective sample
  size-based metrics. (:pr:`353`) `Riley X. Brady`_.
- Faster bootstrapping without replacement used in threshold functions of
climpred.stats (:pr:`354`) `Aaron Spring`_.



Documentation
Expand Down
49 changes: 26 additions & 23 deletions asv_bench/benchmarks/benchmarks_hindcast.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
# only take comparisons compatible with probabilistic metrics
HINDCAST_COMPARISONS = ['m2o']

BOOTSTRAP = 8
ITERATIONS = 8


class Generate:
Expand Down Expand Up @@ -78,15 +78,8 @@ def make_hind_obs(self):
)

self.uninit['var'] = xr.DataArray(
randn(
(self.ninit, self.nx, self.ny, self.nmember), frac_nan=FRAC_NAN
),
coords={
'lon': lons,
'lat': lats,
'time': inits,
'member': members,
},
randn((self.ninit, self.nx, self.ny, self.nmember), frac_nan=FRAC_NAN),
coords={'lon': lons, 'lat': lats, 'time': inits, 'member': members},
dims=('time', 'lon', 'lat', 'member'),
name='var',
attrs={'units': 'var units', 'description': 'a description'},
Expand All @@ -108,10 +101,7 @@ def time_compute_hindcast(self, metric, comparison):
"""Take time for `compute_hindcast`."""
ensure_loaded(
compute_hindcast(
self.hind,
self.observations,
metric=metric,
comparison=comparison,
self.hind, self.observations, metric=metric, comparison=comparison,
)
)

Expand All @@ -120,10 +110,7 @@ def peakmem_compute_hindcast(self, metric, comparison):
"""Take memory peak for `compute_hindcast`."""
ensure_loaded(
compute_hindcast(
self.hind,
self.observations,
metric=metric,
comparison=comparison,
self.hind, self.observations, metric=metric, comparison=comparison,
)
)

Expand All @@ -137,7 +124,7 @@ def time_bootstrap_hindcast(self, metric, comparison):
self.observations,
metric=metric,
comparison=comparison,
bootstrap=BOOTSTRAP,
iterations=ITERATIONS,
dim='member',
)
)
Expand All @@ -152,7 +139,7 @@ def peakmem_bootstrap_hindcast(self, metric, comparison):
self.observations,
metric=metric,
comparison=comparison,
bootstrap=BOOTSTRAP,
iterations=ITERATIONS,
dim='member',
)
)
Expand All @@ -168,8 +155,24 @@ def setup(self, *args, **kwargs):
# https://github.com/pydata/xarray/blob/stable/asv_bench/benchmarks/rolling.py
super().setup(**kwargs)
        # chunk along a spatial dimension to enable embarrassingly parallel computation
self.hind = self.hind['var'].chunk({'lon': self.nx // BOOTSTRAP})
self.hind = self.hind['var'].chunk({'lon': self.nx // ITERATIONS})
self.observations = self.observations['var'].chunk(
{'lon': self.nx // BOOTSTRAP}
{'lon': self.nx // ITERATIONS}
)
self.uninit = self.uninit['var'].chunk({'lon': self.nx // BOOTSTRAP})
self.uninit = self.uninit['var'].chunk({'lon': self.nx // ITERATIONS})


class ComputeSmall(Compute):
    def setup(self, *args, **kwargs):
        """Benchmark time and peak memory of `compute_hindcast` and
        `bootstrap_hindcast`. This executes the same tests as `Compute` but on 1D
        data."""
        # NOTE(review): requires_dask() is called even though this setup does no
        # chunking — confirm dask is actually needed for the 1D benchmarks.
        requires_dask()
        # magic taken from
        # https://github.com/pydata/xarray/blob/stable/asv_bench/benchmarks/rolling.py
        super().setup(**kwargs)
        # Collapse the spatial dimensions by averaging so the benchmark inputs
        # become 1D (time/lead/member only) — the small-data counterpart to the
        # chunked setup in ComputeDask.
        spatial_dims = ['lon', 'lat']
        self.hind = self.hind.mean(spatial_dims)
        self.observations = self.observations.mean(spatial_dims)
        self.uninit = self.uninit.mean(spatial_dims)
30 changes: 22 additions & 8 deletions asv_bench/benchmarks/benchmarks_perfect_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
# only take comparisons compatible with probabilistic metrics
PM_COMPARISONS = ['m2m', 'm2c']

BOOTSTRAP = 8
ITERATIONS = 8


class Generate:
Expand Down Expand Up @@ -118,7 +118,7 @@ def time_bootstrap_perfect_model(self, metric, comparison):
self.control,
metric=metric,
comparison=comparison,
bootstrap=BOOTSTRAP,
iterations=ITERATIONS,
)
)

Expand All @@ -131,20 +131,34 @@ def peakmem_bootstrap_perfect_model(self, metric, comparison):
self.control,
metric=metric,
comparison=comparison,
bootstrap=BOOTSTRAP,
iterations=ITERATIONS,
)
)


class ComputeDask(Compute):
def setup(self, *args, **kwargs):
"""Benchmark time and peak memory of `compute_hindcast` and
`bootstrap_hindcast`. This executes the same tests as `Compute` but on chunked
data."""
"""Benchmark time and peak memory of `compute_perfect_model` and
`bootstrap_perfect_model`. This executes the same tests as `Compute` but
on chunked data."""
requires_dask()
# magic taken from
# https://github.com/pydata/xarray/blob/stable/asv_bench/benchmarks/rolling.py
super().setup(**kwargs)
        # chunk along a spatial dimension to enable embarrassingly parallel computation
self.ds = self.ds['var'].chunk({'lon': self.nx // BOOTSTRAP})
self.control = self.control['var'].chunk({'lon': self.nx // BOOTSTRAP})
self.ds = self.ds['var'].chunk({'lon': self.nx // ITERATIONS})
self.control = self.control['var'].chunk({'lon': self.nx // ITERATIONS})


class ComputeSmall(Compute):
    def setup(self, *args, **kwargs):
        """Benchmark time and peak memory of `compute_perfect_model` and
        `bootstrap_perfect_model`. This executes the same tests as `Compute`
        but on 1D data."""
        # magic taken from
        # https://github.com/pydata/xarray/blob/stable/asv_bench/benchmarks/rolling.py
        super().setup(**kwargs)
        # Collapse the spatial dimensions by averaging so the benchmark inputs
        # become 1D (time/lead/member only) — the small-data counterpart to the
        # chunked setup in ComputeDask.
        spatial_dims = ['lon', 'lat']
        self.ds = self.ds.mean(spatial_dims)
        self.control = self.control.mean(spatial_dims)
Loading

0 comments on commit 2bc9cd5

Please sign in to comment.