Skip to content
This repository has been archived by the owner on Aug 29, 2023. It is now read-only.

Commit

Permalink
Pearson correlation scalar with N-D cubes
Browse files Browse the repository at this point in the history
pearson_correlation_scalar now works with N-D data variables, by using
xr.stack to flatten them. This should work well with Dask.

Closes #746
  • Loading branch information
Jānis Gailis committed Sep 27, 2018
1 parent 3762653 commit 2e5780a
Show file tree
Hide file tree
Showing 3 changed files with 58 additions and 31 deletions.
2 changes: 2 additions & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
## Version 2.0.0.dev21 (in development)

* Pearson correlation scalar operation now works on N-D data variables [#746](https://github.com/CCI-Tools/cate/issues/746)

## Version 2.0.0.dev20

* Changed order and names of new `data_frame_subset` operation inputs.
Expand Down
44 changes: 25 additions & 19 deletions cate/ops/correlation.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ def pearson_correlation_scalar(ds_x: DatasetLike.TYPE,
"""
Do product moment `Pearson's correlation <http://www.statsoft.com/Textbook/Statistics-Glossary/P/button/p#Pearson%20Correlation>`_ analysis.
Performs a simple correlation analysis on two timeseries and returns
Performs a simple correlation analysis on two data variables and returns
a correlation coefficient and the corresponding p_value.
Positive correlation implies that as x grows, so does y. Negative
Expand All @@ -80,7 +80,7 @@ def pearson_correlation_scalar(ds_x: DatasetLike.TYPE,
:param var_x: Dataset variable to use for correlation analysis in the 'variable' dataset
:param var_y: Dataset variable to use for correlation analysis in the 'dependent' dataset
:param monitor: a progress monitor.
:return: {'corr_coef': correlation coefficient, 'p_value': probability value}
:return: Data frame {'corr_coef': correlation coefficient, 'p_value': probability value}
"""
ds_x = DatasetLike.convert(ds_x)
ds_y = DatasetLike.convert(ds_y)
Expand All @@ -90,24 +90,30 @@ def pearson_correlation_scalar(ds_x: DatasetLike.TYPE,
array_y = ds_y[var_y]
array_x = ds_x[var_x]

if ((len(array_x.dims) != len(array_y.dims)) or
(len(array_x.dims) != 1)):
raise ValidationError('To calculate simple correlation, both provided'
' datasets should be simple 1d timeseries. To'
' create a map of correlation coefficients, use'
' pearson_correlation operation instead.')
if (array_x.dims != array_y.dims):
raise ValidationError('Both datasets should feature the same'
' dimensionality. Currently provided ds_x[var_x] '
f'has {array_x.dims}, provided ds_y[var_y]'
f' has {array_y.dims}')

if len(array_x['time']) != len(array_y['time']):
raise ValidationError('The length of the time dimension differs between'
' the given datasets. Can not perform the calculation'
', please review operation documentation.')
for dim in array_x.dims:
if len(array_x[dim]) != len(array_y[dim]):
raise ValidationError('All dimensions of both provided data variables'
f' must be the same length. Currently {dim} of ds_x[var_x]'
f' has {len(array_x[dim])} values, while'
f' {dim} of ds_y[var_y] has {len(array_y[dim])} values.'
' You may want to try to coregister the datasets beforehand.')

if len(array_x['time']) < 3:
raise ValidationError('The length of the time dimension should not be less'
' than three to run the calculation.')
n_vals = 1
for dim in array_x.dims:
n_vals = n_vals * len(array_x[dim])

if n_vals < 3:
raise ValidationError('There should be no less than 3 values in both data variables'
f' to perform the correlation. Currently there are {n_vals} values')

with monitor.observing("Calculate Pearson correlation"):
cc, pv = pearsonr(array_x.values, array_y.values)
cc, pv = pearsonr(array_x.stack(z=array_x.dims), array_y.stack(z=array_y.dims))

return pd.DataFrame({'corr_coef': [cc], 'p_value': [pv]})

Expand Down Expand Up @@ -173,10 +179,10 @@ def pearson_correlation(ds_x: DatasetLike.TYPE,
' is provided.')

if array_x.values.shape != array_y.values.shape:
raise ValidationError('The provided variables {} and {} do not have the'
raise ValidationError(f'The provided variables {var_x} and {var_y} do not have the'
' same shape, Pearson correlation can not be'
' performed. Please review operation'
' documentation'.format(var_x, var_y))
' documentation')

if (not ds_x['lat'].equals(ds_y['lat']) or
not ds_x['lon'].equals(ds_y['lon'])):
Expand Down Expand Up @@ -206,7 +212,7 @@ def pearson_correlation(ds_x: DatasetLike.TYPE,

# Do pixel by pixel correlation
retset = _pearsonr(array_x, array_y, monitor)
retset.attrs['Cate_Description'] = 'Correlation between {} {}'.format(var_y, var_x)
retset.attrs['Cate_Description'] = f'Correlation between {var_y} {var_x}'

return adjust_spatial_attrs(retset)

Expand Down
43 changes: 31 additions & 12 deletions test/ops/test_correlation.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,34 +52,53 @@ def test_error(self):
"""
Test error conditions
"""
# Test incompatible time dimension
# Test incompatible dimension length
ds1 = xr.Dataset({'first': ('time', np.linspace(0, 5, 6))})
ds2 = xr.Dataset({'first': ('time', np.linspace(0, 1, 2))})

with self.assertRaises(ValueError) as err:
pearson_correlation_scalar(ds1, ds2, 'first', 'first')
self.assertIn('dimension differs', str(err.exception))
self.assertIn('All dimensions', str(err.exception))

# Test incompatible time dimension
# Test not enough values to correlate
ds1 = xr.Dataset({'first': ('time', np.linspace(0, 1, 2))})
ds2 = xr.Dataset({'first': ('time', np.linspace(0, 1, 2))})

with self.assertRaises(ValueError) as err:
pearson_correlation_scalar(ds1, ds2, 'first', 'first')
self.assertIn('dimension should not be less', str(err.exception))
self.assertIn('no less than 3 values', str(err.exception))

# Test incompatible dimensionality
ds1 = xr.Dataset({
'first': (['time', 'lon', 'lat'], np.array([np.ones([8, 4])])),
'second': (['time', 'lon', 'lat'], np.array([np.ones([8, 4])])),
'lat': np.linspace(-67.5, 67.5, 4),
'lon': np.linspace(-157.5, 157.5, 8),
'time': np.array([1])}).chunk(chunks={'lat': 2, 'lon': 4})

ds2 = xr.Dataset({
'second': (['time', 'lat', 'lon'], np.array([np.ones([4, 8])])),
'first': (['time', 'lat', 'lon'], np.array([np.ones([4, 8]) * 2])),
'lat': np.linspace(-67.5, 67.5, 4),
'lon': np.linspace(-157.5, 157.5, 8),
'time': np.array([1])}).chunk(chunks={'lat': 2, 'lon': 4})

with self.assertRaises(ValueError) as err:
pearson_correlation_scalar(ds1, ds2, 'first', 'first')
self.assertIn('same dimensionality', str(err.exception))

def test_3D(self):
"""
Test nominal run
"""
# Test general functionality 3D dataset variables
ds1 = xr.Dataset({
'first': (['time', 'lat', 'lon'], np.array([np.ones([4, 8]),
np.ones([4, 8]) * 2,
np.ones([4, 8]) * 3])),
'second': (['time', 'lat', 'lon'], np.array([np.ones([4, 8]) * 2,
'second': (['time', 'lat', 'lon'], np.array([np.ones([4, 8]),
np.ones([4, 8]) * 3,
np.ones([4, 8])])),
np.ones([4, 8]) * 4])),
'first': (['time', 'lat', 'lon'], np.array([np.ones([4, 8]) * 2,
np.ones([4, 8]) * 3,
np.ones([4, 8])])),
'lat': np.linspace(-67.5, 67.5, 4),
'lon': np.linspace(-157.5, 157.5, 8),
'time': np.array([1, 2, 3])}).chunk(chunks={'lat': 2, 'lon': 4})
Expand All @@ -95,9 +114,9 @@ def test_3D(self):
'lon': np.linspace(-157.5, 157.5, 8),
'time': np.array([1, 2, 3])}).chunk(chunks={'lat': 2, 'lon': 4})

with self.assertRaises(ValueError) as err:
pearson_correlation_scalar(ds1, ds2, 'first', 'first')
self.assertIn('should be simple 1d', str(err.exception))
correlation = pearson_correlation_scalar(ds1, ds2, 'first', 'first')
self.assertTrue(correlation['corr_coef'][0] == 1.0)
self.assertTrue(correlation['p_value'][0] == 0.0)


class TestPearson(TestCase):
Expand Down

0 comments on commit 2e5780a

Please sign in to comment.