From 2e5780ab76088edb98f216d3d5f44573aea31457 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C4=81nis=20Gailis?= Date: Thu, 27 Sep 2018 11:11:01 +0200 Subject: [PATCH] Pearson correlation scalar with N-D cubes pearson_correlation_scalar now works with N-D data variables, by using xr.stack to flatten them. This should work well with Dask. Closes #746 --- CHANGES.md | 2 ++ cate/ops/correlation.py | 44 ++++++++++++++++++++---------------- test/ops/test_correlation.py | 43 +++++++++++++++++++++++++---------- 3 files changed, 58 insertions(+), 31 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index d307caf10..44c3b8d24 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,5 +1,7 @@ ## Version 2.0.0.dev21 (in development) +* Pearson correlation scalar operation now works on N-D data variables [#746](https://github.com/CCI-Tools/cate/issues/746) + ## Version 2.0.0.dev20 * Changed order and names of new `data_frame_subset` operation inputs. diff --git a/cate/ops/correlation.py b/cate/ops/correlation.py index b95e909d0..e852e8a25 100644 --- a/cate/ops/correlation.py +++ b/cate/ops/correlation.py @@ -65,7 +65,7 @@ def pearson_correlation_scalar(ds_x: DatasetLike.TYPE, """ Do product moment `Pearson's correlation `_ analysis. - Performs a simple correlation analysis on two timeseries and returns + Performs a simple correlation analysis on two data variables and returns a correlation coefficient and the corresponding p_value. Positive correlation implies that as x grows, so does y. Negative @@ -80,7 +80,7 @@ def pearson_correlation_scalar(ds_x: DatasetLike.TYPE, :param var_x: Dataset variable to use for correlation analysis in the 'variable' dataset :param var_y: Dataset variable to use for correlation analysis in the 'dependent' dataset :param monitor: a progress monitor. 
- :return: {'corr_coef': correlation coefficient, 'p_value': probability value} + :return: Data frame {'corr_coef': correlation coefficient, 'p_value': probability value} """ ds_x = DatasetLike.convert(ds_x) ds_y = DatasetLike.convert(ds_y) @@ -90,24 +90,30 @@ def pearson_correlation_scalar(ds_x: DatasetLike.TYPE, array_y = ds_y[var_y] array_x = ds_x[var_x] - if ((len(array_x.dims) != len(array_y.dims)) or - (len(array_x.dims) != 1)): - raise ValidationError('To calculate simple correlation, both provided' - ' datasets should be simple 1d timeseries. To' - ' create a map of correlation coefficients, use' - ' pearson_correlation operation instead.') + if (array_x.dims != array_y.dims): + raise ValidationError('Both datasets should feature the same' + ' dimensionality. Currently provided ds_x[var_x] ' + f'has {array_x.dims}, provided ds_y[var_y]' + f' has {array_y.dims}') - if len(array_x['time']) != len(array_y['time']): - raise ValidationError('The length of the time dimension differs between' - ' the given datasets. Can not perform the calculation' - ', please review operation documentation.') + for dim in array_x.dims: + if len(array_x[dim]) != len(array_y[dim]): + raise ValidationError('All dimensions of both provided data variables' + f' must be the same length. Currently {dim} of ds_x[var_x]' + f' has {len(array_x[dim])} values, while' + f' {dim} of ds_y[var_y] has {len(array_y[dim])} values.' + ' You may want to try to coregister the datasets beforehand.') - if len(array_x['time']) < 3: - raise ValidationError('The length of the time dimension should not be less' - ' than three to run the calculation.') + n_vals = 1 + for dim in array_x.dims: + n_vals = n_vals * len(array_x[dim]) + + if n_vals < 3: + raise ValidationError('There should be no less than 3 values in both data variables' + f' to perform the correlation. 
Currently there are {n_vals} values') with monitor.observing("Calculate Pearson correlation"): - cc, pv = pearsonr(array_x.values, array_y.values) + cc, pv = pearsonr(array_x.stack(z=array_x.dims), array_y.stack(z=array_y.dims)) return pd.DataFrame({'corr_coef': [cc], 'p_value': [pv]}) @@ -173,10 +179,10 @@ def pearson_correlation(ds_x: DatasetLike.TYPE, ' is provided.') if array_x.values.shape != array_y.values.shape: - raise ValidationError('The provided variables {} and {} do not have the' + raise ValidationError(f'The provided variables {var_x} and {var_y} do not have the' ' same shape, Pearson correlation can not be' ' performed. Please review operation' - ' documentation'.format(var_x, var_y)) + ' documentation') if (not ds_x['lat'].equals(ds_y['lat']) or not ds_x['lon'].equals(ds_y['lon'])): @@ -206,7 +212,7 @@ def pearson_correlation(ds_x: DatasetLike.TYPE, # Do pixel by pixel correlation retset = _pearsonr(array_x, array_y, monitor) - retset.attrs['Cate_Description'] = 'Correlation between {} {}'.format(var_y, var_x) + retset.attrs['Cate_Description'] = f'Correlation between {var_y} {var_x}' return adjust_spatial_attrs(retset) diff --git a/test/ops/test_correlation.py b/test/ops/test_correlation.py index 62416b254..8a54bf361 100644 --- a/test/ops/test_correlation.py +++ b/test/ops/test_correlation.py @@ -52,21 +52,40 @@ def test_error(self): """ Test error conditions """ - # Test incompatible time dimension + # Test incompatible dimension length ds1 = xr.Dataset({'first': ('time', np.linspace(0, 5, 6))}) ds2 = xr.Dataset({'first': ('time', np.linspace(0, 1, 2))}) with self.assertRaises(ValueError) as err: pearson_correlation_scalar(ds1, ds2, 'first', 'first') - self.assertIn('dimension differs', str(err.exception)) + self.assertIn('All dimensions', str(err.exception)) - # Test incompatible time dimension + # Test not enough values to correlate ds1 = xr.Dataset({'first': ('time', np.linspace(0, 1, 2))}) ds2 = xr.Dataset({'first': ('time', np.linspace(0, 1, 
2))}) with self.assertRaises(ValueError) as err: pearson_correlation_scalar(ds1, ds2, 'first', 'first') - self.assertIn('dimension should not be less', str(err.exception)) + self.assertIn('no less than 3 values', str(err.exception)) + + # Test incompatible dimensionality + ds1 = xr.Dataset({ + 'first': (['time', 'lon', 'lat'], np.array([np.ones([8, 4])])), + 'second': (['time', 'lon', 'lat'], np.array([np.ones([8, 4])])), + 'lat': np.linspace(-67.5, 67.5, 4), + 'lon': np.linspace(-157.5, 157.5, 8), + 'time': np.array([1])}).chunk(chunks={'lat': 2, 'lon': 4}) + + ds2 = xr.Dataset({ + 'second': (['time', 'lat', 'lon'], np.array([np.ones([4, 8])])), + 'first': (['time', 'lat', 'lon'], np.array([np.ones([4, 8]) * 2])), + 'lat': np.linspace(-67.5, 67.5, 4), + 'lon': np.linspace(-157.5, 157.5, 8), + 'time': np.array([1])}).chunk(chunks={'lat': 2, 'lon': 4}) + + with self.assertRaises(ValueError) as err: + pearson_correlation_scalar(ds1, ds2, 'first', 'first') + self.assertIn('same dimensionality', str(err.exception)) def test_3D(self): """ @@ -74,12 +93,12 @@ def test_3D(self): """ # Test general functionality 3D dataset variables ds1 = xr.Dataset({ - 'first': (['time', 'lat', 'lon'], np.array([np.ones([4, 8]), - np.ones([4, 8]) * 2, - np.ones([4, 8]) * 3])), - 'second': (['time', 'lat', 'lon'], np.array([np.ones([4, 8]) * 2, + 'second': (['time', 'lat', 'lon'], np.array([np.ones([4, 8]), np.ones([4, 8]) * 3, - np.ones([4, 8])])), + np.ones([4, 8]) * 4])), + 'first': (['time', 'lat', 'lon'], np.array([np.ones([4, 8]) * 2, + np.ones([4, 8]) * 3, + np.ones([4, 8])])), 'lat': np.linspace(-67.5, 67.5, 4), 'lon': np.linspace(-157.5, 157.5, 8), 'time': np.array([1, 2, 3])}).chunk(chunks={'lat': 2, 'lon': 4}) @@ -95,9 +114,9 @@ def test_3D(self): 'lon': np.linspace(-157.5, 157.5, 8), 'time': np.array([1, 2, 3])}).chunk(chunks={'lat': 2, 'lon': 4}) - with self.assertRaises(ValueError) as err: - pearson_correlation_scalar(ds1, ds2, 'first', 'first') - self.assertIn('should be 
simple 1d', str(err.exception)) + correlation = pearson_correlation_scalar(ds1, ds2, 'first', 'first') + self.assertTrue(correlation['corr_coef'][0] == 1.0) + self.assertTrue(correlation['p_value'][0] == 0.0) class TestPearson(TestCase):