From 2e5780ab76088edb98f216d3d5f44573aea31457 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C4=81nis=20Gailis?= Date: Thu, 27 Sep 2018 11:11:01 +0200 Subject: [PATCH] Pearson correlation scalar with N-D cubes pearson_correlation_scalar now works with N-D data variables, by using xr.stack to flatten them. This should work well with Dask. Closes #746 --- CHANGES.md | 2 ++ cate/ops/correlation.py | 44 ++++++++++++++++++++---------------- test/ops/test_correlation.py | 43 +++++++++++++++++++++++++---------- 3 files changed, 58 insertions(+), 31 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index d307caf10..44c3b8d24 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,5 +1,7 @@ ## Version 2.0.0.dev21 (in development) +* Pearson correlation scalar operation now works on N-D data variables [#746](https://github.com/CCI-Tools/cate/issues/746) + ## Version 2.0.0.dev20 * Changed order and names of new `data_frame_subset` operation inputs. diff --git a/cate/ops/correlation.py b/cate/ops/correlation.py index b95e909d0..e852e8a25 100644 --- a/cate/ops/correlation.py +++ b/cate/ops/correlation.py @@ -65,7 +65,7 @@ def pearson_correlation_scalar(ds_x: DatasetLike.TYPE, """ Do product moment `Pearson's correlation `_ analysis. - Performs a simple correlation analysis on two timeseries and returns + Performs a simple correlation analysis on two data variables and returns a correlation coefficient and the corresponding p_value. Positive correlation implies that as x grows, so does y. Negative @@ -80,7 +80,7 @@ def pearson_correlation_scalar(ds_x: DatasetLike.TYPE, :param var_x: Dataset variable to use for correlation analysis in the 'variable' dataset :param var_y: Dataset variable to use for correlation analysis in the 'dependent' dataset :param monitor: a progress monitor. 
- :return: {'corr_coef': correlation coefficient, 'p_value': probability value} + :return: Data frame {'corr_coef': correlation coefficient, 'p_value': probability value} """ ds_x = DatasetLike.convert(ds_x) ds_y = DatasetLike.convert(ds_y) @@ -90,24 +90,30 @@ def pearson_correlation_scalar(ds_x: DatasetLike.TYPE, array_y = ds_y[var_y] array_x = ds_x[var_x] - if ((len(array_x.dims) != len(array_y.dims)) or - (len(array_x.dims) != 1)): - raise ValidationError('To calculate simple correlation, both provided' - ' datasets should be simple 1d timeseries. To' - ' create a map of correlation coefficients, use' - ' pearson_correlation operation instead.') + if (array_x.dims != array_y.dims): + raise ValidationError('Both datasets should feature the same' + ' dimensionality. Currently provided ds_x[var_x] ' + f'has {array_x.dims}, provided ds_y[var_y]' + f' has {array_y.dims}') - if len(array_x['time']) != len(array_y['time']): - raise ValidationError('The length of the time dimension differs between' - ' the given datasets. Can not perform the calculation' - ', please review operation documentation.') + for dim in array_x.dims: + if len(array_x[dim]) != len(array_y[dim]): + raise ValidationError('All dimensions of both provided data variables' + f' must be the same length. Currently {dim} of ds_x[var_x]' + f' has {len(array_x[dim])} values, while' + f' {dim} of ds_y[var_y] has {len(array_y[dim])} values.' + ' You may want to try to coregister the datasets beforehand.') - if len(array_x['time']) < 3: - raise ValidationError('The length of the time dimension should not be less' - ' than three to run the calculation.') + n_vals = 1 + for dim in array_x.dims: + n_vals = n_vals * len(array_x[dim]) + + if n_vals < 3: + raise ValidationError('There should be no less than 3 values in both data variables' + f' to perform the correlation. 
Currently there are {n_vals} values') with monitor.observing("Calculate Pearson correlation"): - cc, pv = pearsonr(array_x.values, array_y.values) + cc, pv = pearsonr(array_x.stack(z=array_x.dims), array_y.stack(z=array_y.dims)) return pd.DataFrame({'corr_coef': [cc], 'p_value': [pv]}) @@ -173,10 +179,10 @@ def pearson_correlation(ds_x: DatasetLike.TYPE, ' is provided.') if array_x.values.shape != array_y.values.shape: - raise ValidationError('The provided variables {} and {} do not have the' + raise ValidationError(f'The provided variables {var_x} and {var_y} do not have the' ' same shape, Pearson correlation can not be' ' performed. Please review operation' - ' documentation'.format(var_x, var_y)) + ' documentation') if (not ds_x['lat'].equals(ds_y['lat']) or not ds_x['lon'].equals(ds_y['lon'])): @@ -206,7 +212,7 @@ def pearson_correlation(ds_x: DatasetLike.TYPE, # Do pixel by pixel correlation retset = _pearsonr(array_x, array_y, monitor) - retset.attrs['Cate_Description'] = 'Correlation between {} {}'.format(var_y, var_x) + retset.attrs['Cate_Description'] = f'Correlation between {var_y} {var_x}' return adjust_spatial_attrs(retset) diff --git a/test/ops/test_correlation.py b/test/ops/test_correlation.py index 62416b254..8a54bf361 100644 --- a/test/ops/test_correlation.py +++ b/test/ops/test_correlation.py @@ -52,21 +52,40 @@ def test_error(self): """ Test error conditions """ - # Test incompatible time dimension + # Test incompatible dimension length ds1 = xr.Dataset({'first': ('time', np.linspace(0, 5, 6))}) ds2 = xr.Dataset({'first': ('time', np.linspace(0, 1, 2))}) with self.assertRaises(ValueError) as err: pearson_correlation_scalar(ds1, ds2, 'first', 'first') - self.assertIn('dimension differs', str(err.exception)) + self.assertIn('All dimensions', str(err.exception)) - # Test incompatible time dimension + # Test not enough values to correlate ds1 = xr.Dataset({'first': ('time', np.linspace(0, 1, 2))}) ds2 = xr.Dataset({'first': ('time', np.linspace(0, 1, 
2))}) with self.assertRaises(ValueError) as err: pearson_correlation_scalar(ds1, ds2, 'first', 'first') - self.assertIn('dimension should not be less', str(err.exception)) + self.assertIn('no less than 3 values', str(err.exception)) + + # Test incompatible dimensionality + ds1 = xr.Dataset({ + 'first': (['time', 'lon', 'lat'], np.array([np.ones([8, 4])])), + 'second': (['time', 'lon', 'lat'], np.array([np.ones([8, 4])])), + 'lat': np.linspace(-67.5, 67.5, 4), + 'lon': np.linspace(-157.5, 157.5, 8), + 'time': np.array([1])}).chunk(chunks={'lat': 2, 'lon': 4}) + + ds2 = xr.Dataset({ + 'second': (['time', 'lat', 'lon'], np.array([np.ones([4, 8])])), + 'first': (['time', 'lat', 'lon'], np.array([np.ones([4, 8]) * 2])), + 'lat': np.linspace(-67.5, 67.5, 4), + 'lon': np.linspace(-157.5, 157.5, 8), + 'time': np.array([1])}).chunk(chunks={'lat': 2, 'lon': 4}) + + with self.assertRaises(ValueError) as err: + pearson_correlation_scalar(ds1, ds2, 'first', 'first') + self.assertIn('same dimensionality', str(err.exception)) def test_3D(self): """ @@ -74,12 +93,12 @@ def test_3D(self): """ # Test general functionality 3D dataset variables ds1 = xr.Dataset({ - 'first': (['time', 'lat', 'lon'], np.array([np.ones([4, 8]), - np.ones([4, 8]) * 2, - np.ones([4, 8]) * 3])), - 'second': (['time', 'lat', 'lon'], np.array([np.ones([4, 8]) * 2, + 'second': (['time', 'lat', 'lon'], np.array([np.ones([4, 8]), np.ones([4, 8]) * 3, - np.ones([4, 8])])), + np.ones([4, 8]) * 4])), + 'first': (['time', 'lat', 'lon'], np.array([np.ones([4, 8]) * 2, + np.ones([4, 8]) * 3, + np.ones([4, 8])])), 'lat': np.linspace(-67.5, 67.5, 4), 'lon': np.linspace(-157.5, 157.5, 8), 'time': np.array([1, 2, 3])}).chunk(chunks={'lat': 2, 'lon': 4}) @@ -95,9 +114,9 @@ def test_3D(self): 'lon': np.linspace(-157.5, 157.5, 8), 'time': np.array([1, 2, 3])}).chunk(chunks={'lat': 2, 'lon': 4}) - with self.assertRaises(ValueError) as err: - pearson_correlation_scalar(ds1, ds2, 'first', 'first') - self.assertIn('should be 
simple 1d', str(err.exception)) + correlation = pearson_correlation_scalar(ds1, ds2, 'first', 'first') + self.assertTrue(correlation['corr_coef'][0] == 1.0) + self.assertTrue(correlation['p_value'][0] == 0.0) class TestPearson(TestCase):