Skip to content

Commit

Permalink
unbiased dcorr should not be square-rooted
Browse files Browse the repository at this point in the history
  • Loading branch information
sampan501 committed Aug 17, 2023
1 parent c18854a commit 9e7fc39
Show file tree
Hide file tree
Showing 14 changed files with 27 additions and 21 deletions.
4 changes: 2 additions & 2 deletions hyppo/conditional/FCIT.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,8 +164,8 @@ def test(self, x, y, z=None):
>>> model = DecisionTreeRegressor()
>>> cv_grid = {"min_samples_split": [2, 8, 64, 512, 1e-2, 0.2, 0.4]}
>>> stat, pvalue = FCIT(model=model, cv_grid=cv_grid).test(x1.T, y1.T, z1)
>>> '%.2f, %.3f' % (stat, pvalue)
'-3.59, 0.995'
>>> '%.1f, %.2f' % (stat, pvalue)
'-3.6, 1.00'
"""

n_samples = x.shape[0]
Expand Down
6 changes: 3 additions & 3 deletions hyppo/conditional/kci.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,10 +98,10 @@ def test(self, x, y):
>>> from hyppo.conditional import KCI
>>> from hyppo.tools.indep_sim import linear
>>> np.random.seed(123456789)
>>> x, y = linear(n, 1)
>>> x, y = linear(100, 1)
>>> stat, pvalue = KCI().test(x, y)
>>> print("Statistic: ", stat)
>>> print("p-value: ", pvalue)
>>> '%.1f, %.2f' % (stat, pvalue)
'544.7, 0.00'
"""

T = len(y)
Expand Down
10 changes: 8 additions & 2 deletions hyppo/independence/dcorr.py
Original file line number Diff line number Diff line change
Expand Up @@ -404,7 +404,9 @@ def _dcov(distx, disty, bias=False, only_dcov=True): # pragma: no cover
if only_dcov:
N = distx.shape[0]
if bias:
stat = 1 / (N**2) * stat
# only biased Dcov is square-rooted
# https://search.r-project.org/CRAN/refmans/energy/html/dcovu.html
stat = np.sqrt(1 / (N**2) * stat)
else:
stat = 1 / (N * (N - 3)) * stat

Expand Down Expand Up @@ -437,6 +439,10 @@ def _dcorr(distx, disty, bias=False, is_fast=False): # pragma: no cover

# calculate generalized test statistic
else:
stat = np.sqrt(covar / np.real(np.sqrt(varx * vary)))
stat = covar / np.sqrt(np.real(varx * vary))
# only biased Dcov is square-rooted
# https://search.r-project.org/CRAN/refmans/energy/html/dcovu.html
if bias:
stat = np.sqrt(stat)

return stat
2 changes: 1 addition & 1 deletion hyppo/independence/kmerf.py
Original file line number Diff line number Diff line change
Expand Up @@ -242,7 +242,7 @@ def test(self, x, y, reps=1000, workers=1, auto=True, random_state=None):
stat = self.statistic(x, y)
statx = _dcorr(self.distx, self.distx, bias=False, is_fast=False)
staty = _dcorr(self.disty, self.disty, bias=False, is_fast=False)
pvalue = chi2.sf(stat ** 2 / np.sqrt(statx ** 2 * staty ** 2) * n + 1, 1)
pvalue = chi2.sf(stat / np.sqrt(statx * staty) * n + 1, 1)
self.stat = stat
self.pvalue = pvalue
self.null_dist = None
Expand Down
4 changes: 2 additions & 2 deletions hyppo/independence/tests/test_kmerf.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ class TestKMERFStat(object):
(linear, 1.0, 9.19834440770024e-24), # test linear simulation
(
spiral,
0.4284332761214592,
0.1835550720881665,
1.3087565060526607e-05,
), # test spiral simulation
(
Expand All @@ -34,7 +34,7 @@ def test_oned(self, sim, obs_stat, obs_pvalue):

# test stat and pvalue
stat1 = KMERF().statistic(x, y)
stat2, pvalue, _ = KMERF().test(x, y, reps=0)
stat2, pvalue, _ = KMERF().test(x, y)
assert_approx_equal(stat1, obs_stat, significant=1)
assert_approx_equal(stat2, obs_stat, significant=1)
assert_approx_equal(pvalue, obs_pvalue, significant=1)
Expand Down
2 changes: 1 addition & 1 deletion hyppo/independence/tests/test_maxmargin.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

class TestMaxMarginStat:
@pytest.mark.parametrize(
"n, obs_stat", [(100, 0.8356763870539561), (200, 0.8382397975358477)]
"n, obs_stat", [(100, 0.6983550238795534), (200, 0.7026459581729388)]
)
@pytest.mark.parametrize("obs_pvalue", [1 / 1000])
def test_linear_oned(self, n, obs_stat, obs_pvalue):
Expand Down
2 changes: 1 addition & 1 deletion hyppo/ksample/energy.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,7 @@ def statistic(self, x, y):

# exact equivalence transformation Dcorr and Energy
stat = (
_dcov(distx, disty, self.bias, only_dcov=True)
_dcov(distx, disty, bias=self.bias, only_dcov=True)
* ((n + m) ** 4)
/ (2 * (n**2) * (m**2))
)
Expand Down
2 changes: 1 addition & 1 deletion hyppo/ksample/ksamp.py
Original file line number Diff line number Diff line change
Expand Up @@ -281,7 +281,7 @@ def test(self, *args, reps=1000, workers=1, auto=True, random_state=None):
>>> z = np.arange(10)
>>> stat, pvalue = KSample("Dcorr").test(x, y)
>>> '%.3f, %.1f' % (stat, pvalue)
'-0.136, 1.0'
'0.000, 1.0'
"""
inputs = list(args)
check_input = _CheckInputs(
Expand Down
2 changes: 1 addition & 1 deletion hyppo/ksample/mmd.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,7 +159,7 @@ def test(self, x, y, reps=1000, workers=1, auto=True, random_state=None):
>>> y = x
>>> stat, pvalue = MMD().test(x, y)
>>> '%.3f, %.1f' % (stat, pvalue)
'-0.015, 1.0'
'0.000, 1.0'
"""
check_input = _CheckInputs(
inputs=[x, y],
Expand Down
2 changes: 1 addition & 1 deletion hyppo/ksample/tests/test_energy.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ def test_against_dcor(self):
y = x**2
stat = Energy(bias=True).statistic(x.reshape(-1, 1), y.reshape(-1, 1))

assert_almost_equal(stat, 3146.5236, decimal=4)
assert_almost_equal(stat, 158.6574574358228, decimal=4)


class TestEnergyTypeIError:
Expand Down
4 changes: 2 additions & 2 deletions hyppo/ksample/tests/test_ksamp.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
class TestKSample:
@pytest.mark.parametrize(
"n, obs_stat, obs_pvalue, indep_test",
[(1000, 4.28e-7, 1.0, "CCA"), (100, 0.2136515250373329, 0.001, "Dcorr")],
[(1000, 4.28e-7, 1.0, "CCA"), (100, 0.045646974150778084, 0.001, "Dcorr")],
)
def test_twosamp_linear_oned(self, n, obs_stat, obs_pvalue, indep_test):
np.random.seed(123456789)
Expand All @@ -24,7 +24,7 @@ def test_rf(self):
x, y = rot_ksamp("linear", 50, 1, k=2)
stat, _, _ = KSample("KMERF").test(x, y, reps=0)

assert_almost_equal(stat, 0.5209659346243869, decimal=1)
assert_almost_equal(stat, 0.27140550503906097, decimal=1)

def test_maxmargin(self):
np.random.seed(123456789)
Expand Down
2 changes: 1 addition & 1 deletion hyppo/time_series/dcorrx.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,7 +151,7 @@ def test(self, x, y, reps=1000, workers=1, random_state=None):
>>> y = x
>>> stat, pvalue, dcorrx_dict = DcorrX().test(x, y, reps = 100)
>>> '%.1f, %.2f, %d' % (stat, pvalue, dcorrx_dict['opt_lag'])
'1.0, 0.04, 0'
'1.0, 0.05, 0'
"""
check_input = _CheckInputs(
x,
Expand Down
4 changes: 2 additions & 2 deletions hyppo/time_series/mgcx.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,7 +171,7 @@ def test(self, x, y, reps=1000, workers=1, random_state=None):
>>> stat, pvalue, mgcx_dict = MGCX().test(x, y, reps = 100)
>>> '%.1f, %.2f, [%d, %d]' % (stat, pvalue, mgcx_dict['opt_scale'][0],
... mgcx_dict['opt_scale'][1])
'1.0, 0.05, [7, 7]'
'1.0, 0.06, [7, 7]'
The increasing the max_lag can increase the ability to identify dependence.
Expand All @@ -182,7 +182,7 @@ def test(self, x, y, reps=1000, workers=1, random_state=None):
>>> y = np.roll(x, -1)
>>> stat, pvalue, mgcx_dict = MGCX(max_lag=1).test(x, y, reps=1000)
>>> '%.1f, %.2f, %d' % (stat, pvalue, mgcx_dict['opt_lag'])
'1.1, 0.01, 1'
'1.1, 0.00, 1'
"""
check_input = _CheckInputs(
x,
Expand Down
2 changes: 1 addition & 1 deletion hyppo/tools/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -678,6 +678,6 @@ def chi2_approx(calc_stat, x, y):
"""
n = x.shape[0]
stat = calc_stat(x, y)
pvalue = chi2.sf((stat ** 2) * n + 1, 1)
pvalue = chi2.sf(stat * n + 1, 1)

return stat, pvalue

0 comments on commit 9e7fc39

Please sign in to comment.