unbiased dcorr should not be square-rooted

neurodata · Aug 17, 2023 · 9e7fc39 · 9e7fc39
1 parent c18854a
commit 9e7fc39
Show file tree

Hide file tree

Showing 14 changed files with 27 additions and 21 deletions.
diff --git a/hyppo/conditional/FCIT.py b/hyppo/conditional/FCIT.py
@@ -164,8 +164,8 @@ def test(self, x, y, z=None):
         >>> model = DecisionTreeRegressor()
         >>> cv_grid = {"min_samples_split": [2, 8, 64, 512, 1e-2, 0.2, 0.4]}
         >>> stat, pvalue = FCIT(model=model, cv_grid=cv_grid).test(x1.T, y1.T, z1)
-        >>> '%.2f, %.3f' % (stat, pvalue)
-        '-3.59, 0.995'
+        >>> '%.1f, %.2f' % (stat, pvalue)
+        '-3.6, 1.00'
         """
 
         n_samples = x.shape[0]

diff --git a/hyppo/conditional/kci.py b/hyppo/conditional/kci.py
@@ -98,10 +98,10 @@ def test(self, x, y):
         >>> from hyppo.conditional import KCI
         >>> from hyppo.tools.indep_sim import linear
         >>> np.random.seed(123456789)
-        >>> x, y = linear(n, 1)
+        >>> x, y = linear(100, 1)
         >>> stat, pvalue = KCI().test(x, y)
-        >>> print("Statistic: ", stat)
-        >>> print("p-value: ", pvalue)
+        >>> '%.1f, %.2f' % (stat, pvalue)
+        '544.7, 0.00'
         """
 
         T = len(y)

diff --git a/hyppo/independence/dcorr.py b/hyppo/independence/dcorr.py
@@ -404,7 +404,9 @@ def _dcov(distx, disty, bias=False, only_dcov=True):  # pragma: no cover
     if only_dcov:
         N = distx.shape[0]
         if bias:
-            stat = 1 / (N**2) * stat
+            # only biased Dcov is square-rooted
+            # https://search.r-project.org/CRAN/refmans/energy/html/dcovu.html
+            stat = np.sqrt(1 / (N**2) * stat)
         else:
             stat = 1 / (N * (N - 3)) * stat
 
@@ -437,6 +439,10 @@ def _dcorr(distx, disty, bias=False, is_fast=False):  # pragma: no cover
 
     # calculate generalized test statistic
     else:
-        stat = np.sqrt(covar / np.real(np.sqrt(varx * vary)))
+        stat = covar / np.sqrt(np.real(varx * vary))
+        # only biased Dcov is square-rooted
+        # https://search.r-project.org/CRAN/refmans/energy/html/dcovu.html
+        if bias:
+            stat = np.sqrt(stat)
 
     return stat
diff --git a/hyppo/independence/kmerf.py b/hyppo/independence/kmerf.py
@@ -242,7 +242,7 @@ def test(self, x, y, reps=1000, workers=1, auto=True, random_state=None):
             stat = self.statistic(x, y)
             statx = _dcorr(self.distx, self.distx, bias=False, is_fast=False)
             staty = _dcorr(self.disty, self.disty, bias=False, is_fast=False)
-            pvalue = chi2.sf(stat ** 2 / np.sqrt(statx ** 2 * staty ** 2) * n + 1, 1)
+            pvalue = chi2.sf(stat / np.sqrt(statx * staty) * n + 1, 1)
             self.stat = stat
             self.pvalue = pvalue
             self.null_dist = None

diff --git a/hyppo/independence/tests/test_kmerf.py b/hyppo/independence/tests/test_kmerf.py
@@ -16,7 +16,7 @@ class TestKMERFStat(object):
             (linear, 1.0, 9.19834440770024e-24),  # test linear simulation
             (
                 spiral,
-                0.4284332761214592,
+                0.1835550720881665,
                 1.3087565060526607e-05,
             ),  # test spiral simulation
             (
@@ -34,7 +34,7 @@ def test_oned(self, sim, obs_stat, obs_pvalue):
 
         # test stat and pvalue
         stat1 = KMERF().statistic(x, y)
-        stat2, pvalue, _ = KMERF().test(x, y, reps=0)
+        stat2, pvalue, _ = KMERF().test(x, y)
         assert_approx_equal(stat1, obs_stat, significant=1)
         assert_approx_equal(stat2, obs_stat, significant=1)
         assert_approx_equal(pvalue, obs_pvalue, significant=1)

diff --git a/hyppo/independence/tests/test_maxmargin.py b/hyppo/independence/tests/test_maxmargin.py
@@ -8,7 +8,7 @@
 
 class TestMaxMarginStat:
     @pytest.mark.parametrize(
-        "n, obs_stat", [(100, 0.8356763870539561), (200, 0.8382397975358477)]
+        "n, obs_stat", [(100, 0.6983550238795534), (200, 0.7026459581729388)]
     )
     @pytest.mark.parametrize("obs_pvalue", [1 / 1000])
     def test_linear_oned(self, n, obs_stat, obs_pvalue):

diff --git a/hyppo/ksample/energy.py b/hyppo/ksample/energy.py
@@ -116,7 +116,7 @@ def statistic(self, x, y):
 
         # exact equivalence transformation Dcorr and Energy
         stat = (
-            _dcov(distx, disty, self.bias, only_dcov=True)
+            _dcov(distx, disty, bias=self.bias, only_dcov=True)
             * ((n + m) ** 4)
             / (2 * (n**2) * (m**2))
         )

diff --git a/hyppo/ksample/ksamp.py b/hyppo/ksample/ksamp.py
@@ -281,7 +281,7 @@ def test(self, *args, reps=1000, workers=1, auto=True, random_state=None):
         >>> z = np.arange(10)
         >>> stat, pvalue = KSample("Dcorr").test(x, y)
         >>> '%.3f, %.1f' % (stat, pvalue)
-        '-0.136, 1.0'
+        '0.000, 1.0'
         """
         inputs = list(args)
         check_input = _CheckInputs(

diff --git a/hyppo/ksample/mmd.py b/hyppo/ksample/mmd.py
@@ -159,7 +159,7 @@ def test(self, x, y, reps=1000, workers=1, auto=True, random_state=None):
         >>> y = x
         >>> stat, pvalue = MMD().test(x, y)
         >>> '%.3f, %.1f' % (stat, pvalue)
-        '-0.015, 1.0'
+        '0.000, 1.0'
         """
         check_input = _CheckInputs(
             inputs=[x, y],

diff --git a/hyppo/ksample/tests/test_energy.py b/hyppo/ksample/tests/test_energy.py
@@ -36,7 +36,7 @@ def test_against_dcor(self):
         y = x**2
         stat = Energy(bias=True).statistic(x.reshape(-1, 1), y.reshape(-1, 1))
 
-        assert_almost_equal(stat, 3146.5236, decimal=4)
+        assert_almost_equal(stat, 158.6574574358228, decimal=4)
 
 
 class TestEnergyTypeIError:

diff --git a/hyppo/ksample/tests/test_ksamp.py b/hyppo/ksample/tests/test_ksamp.py
@@ -9,7 +9,7 @@
 class TestKSample:
     @pytest.mark.parametrize(
         "n, obs_stat, obs_pvalue, indep_test",
-        [(1000, 4.28e-7, 1.0, "CCA"), (100, 0.2136515250373329, 0.001, "Dcorr")],
+        [(1000, 4.28e-7, 1.0, "CCA"), (100, 0.045646974150778084, 0.001, "Dcorr")],
     )
     def test_twosamp_linear_oned(self, n, obs_stat, obs_pvalue, indep_test):
         np.random.seed(123456789)
@@ -24,7 +24,7 @@ def test_rf(self):
         x, y = rot_ksamp("linear", 50, 1, k=2)
         stat, _, _ = KSample("KMERF").test(x, y, reps=0)
 
-        assert_almost_equal(stat, 0.5209659346243869, decimal=1)
+        assert_almost_equal(stat, 0.27140550503906097, decimal=1)
 
     def test_maxmargin(self):
         np.random.seed(123456789)

diff --git a/hyppo/time_series/dcorrx.py b/hyppo/time_series/dcorrx.py
@@ -151,7 +151,7 @@ def test(self, x, y, reps=1000, workers=1, random_state=None):
         >>> y = x
         >>> stat, pvalue, dcorrx_dict = DcorrX().test(x, y, reps = 100)
         >>> '%.1f, %.2f, %d' % (stat, pvalue, dcorrx_dict['opt_lag'])
-        '1.0, 0.04, 0'
+        '1.0, 0.05, 0'
         """
         check_input = _CheckInputs(
             x,

diff --git a/hyppo/time_series/mgcx.py b/hyppo/time_series/mgcx.py
@@ -171,7 +171,7 @@ def test(self, x, y, reps=1000, workers=1, random_state=None):
         >>> stat, pvalue, mgcx_dict = MGCX().test(x, y, reps = 100)
         >>> '%.1f, %.2f, [%d, %d]' % (stat, pvalue, mgcx_dict['opt_scale'][0],
         ... mgcx_dict['opt_scale'][1])
-        '1.0, 0.05, [7, 7]'
+        '1.0, 0.06, [7, 7]'
 
         The increasing the max_lag can increase the ability to identify dependence.
 
@@ -182,7 +182,7 @@ def test(self, x, y, reps=1000, workers=1, random_state=None):
         >>> y = np.roll(x, -1)
         >>> stat, pvalue, mgcx_dict = MGCX(max_lag=1).test(x, y, reps=1000)
         >>> '%.1f, %.2f, %d' % (stat, pvalue, mgcx_dict['opt_lag'])
-        '1.1, 0.01, 1'
+        '1.1, 0.00, 1'
         """
         check_input = _CheckInputs(
             x,

diff --git a/hyppo/tools/common.py b/hyppo/tools/common.py
@@ -678,6 +678,6 @@ def chi2_approx(calc_stat, x, y):
     """
     n = x.shape[0]
     stat = calc_stat(x, y)
-    pvalue = chi2.sf((stat ** 2) * n + 1, 1)
+    pvalue = chi2.sf(stat * n + 1, 1)
 
     return stat, pvalue