diff --git a/doubleml/datasets.py b/doubleml/datasets.py
index b510f4b6..21585f15 100644
--- a/doubleml/datasets.py
+++ b/doubleml/datasets.py
@@ -1,5 +1,6 @@
 import pandas as pd
 import numpy as np
+import warnings
 from scipy.linalg import toeplitz
 from scipy.optimize import minimize_scalar
@@ -895,11 +896,11 @@ def f_ps(w, xi):
         raise ValueError('Invalid return_type.')
 
 
-def make_confounded_irm_data(n_obs=500, theta=5.0, cf_y=0.04, cf_d=0.04):
+def make_confounded_irm_data(n_obs=500, theta=0.0, gamma_a=0.127, beta_a=0.58, linear=False, **kwargs):
     """
     Generates counfounded data from an interactive regression model.
 
-    The data generating process is defined as follows (similar to the Monte Carlo simulation used
+    The data generating process is defined as follows (inspired by the Monte Carlo simulation used
     in Sant'Anna and Zhao (2020)).
 
     Let :math:`X= (X_1, X_2, X_3, X_4, X_5)^T \\sim \\mathcal{N}(0, \\Sigma)`, where :math:`\\Sigma` corresponds
@@ -924,14 +925,22 @@ def make_confounded_irm_data(n_obs=500, theta=5.0, cf_y=0.04, cf_d=0.04):
 
     .. math::
 
-        m(X, A) = P(D=1|X,A) = 0.5 + \\gamma_A \\cdot A
+        m(X, A) = P(D=1|X,A) = p(Z) + \\gamma_A \\cdot A
+
+    where
+
+    .. math::
+
+        p(Z) &= \\frac{\\exp(f_{ps}(Z))}{1 + \\exp(f_{ps}(Z))},
+
+        f_{ps}(Z) &= 0.75 \\cdot (-Z_1 + 0.1 \\cdot Z_2 - 0.25 \\cdot Z_3 - 0.1 \\cdot Z_4),
 
     and generate the treatment :math:`D = 1\\{m(X, A) \\ge U\\}` with :math:`U \\sim \\mathcal{U}[0, 1]`.
     Since :math:`A` is independent of :math:`X`, the short form of the propensity score is given as
 
     .. math::
 
-        P(D=1|X) = 0.5.
+        P(D=1|X) = p(Z).
 
     Further, generate the outcome of interest :math:`Y` as
 
@@ -939,7 +948,7 @@ def make_confounded_irm_data(n_obs=500, theta=5.0, cf_y=0.04, cf_d=0.04):
 
         Y &= \\theta \\cdot D (Z_5 + 1) + g(Z) + \\beta_A \\cdot A + \\varepsilon
 
-        g(Z) &= 210 + 27.4 \\cdot Z_1 +13.7 \\cdot (Z_2 + Z_3 + Z_4)
+        g(Z) &= 2.5 + 0.74 \\cdot Z_1 + 0.25 \\cdot Z_2 + 0.137 \\cdot (Z_3 + Z_4)
 
     where :math:`\\varepsilon \\sim \\mathcal{N}(0,5)`.
     This implies an average treatment effect of :math:`\\theta`. Additionally, the long and short forms of
@@ -952,13 +961,13 @@ def make_confounded_irm_data(n_obs=500, theta=5.0, cf_y=0.04, cf_d=0.04):
 
         \\mathbb{E}[Y|D, X] &= (\\theta + \\beta_A \\frac{\\mathrm{Cov}(A, D(Z_5 + 1))}{\\mathrm{Var}(D(Z_5 + 1))}) \\cdot D (Z_5 + 1) + g(Z).
 
-    Consequently, the strength of confounding is determined via :math:`\\gamma_A` and :math:`\\beta_A`.
-    Both are chosen to obtain the desired confounding of the outcome and Riesz Representer (in sample).
+    Consequently, the strength of confounding is determined via :math:`\\gamma_A` and :math:`\\beta_A`, which can be
+    set via the parameters ``gamma_a`` and ``beta_a``.
 
-    The observed data is given as :math:`W = (Y, D, X)`.
+    The observed data is given as :math:`W = (Y, D, Z)`.
     Further, orcale values of the confounder :math:`A`, the transformed covariated :math:`Z`,
-    the potential outcomes of :math:`Y`, the coefficients :math:`\\gamma_a`, :math:`\\beta_a`, the
-    long and short forms of the main regression and the propensity score
+    the potential outcomes of :math:`Y`, the long and short forms of the main regression and the propensity score, and
+    in-sample versions of the confounding parameters :math:`cf_d` and :math:`cf_y` (for ATE and ATTE)
     are returned in a dictionary.
 
     Parameters
     ----------
@@ -968,13 +977,16 @@ def make_confounded_irm_data(n_obs=500, theta=5.0, cf_y=0.04, cf_d=0.04):
     n_obs : int
         The number of observations to simulate.
         Default is ``500``.
     theta : float or int
        Average treatment effect.
-        Default is ``5.0``.
-    cf_y : float
-        Percentage of the residual variation of the outcome explained by latent/confounding variable.
-        Default is ``0.04``.
-    cf_d : float
-        Percentage gains in the variation of the Riesz Representer generated by latent/confounding variable.
-        Default is ``0.04``.
+        Default is ``0.0``.
+    gamma_a : float
+        Coefficient of the unobserved confounder in the propensity score.
+        Default is ``0.127``.
+    beta_a : float
+        Coefficient of the unobserved confounder in the outcome regression.
+        Default is ``0.58``.
+    linear : bool
+        If ``True``, :math:`Z` is set to :math:`X`, such that the underlying (short) models are linear/logistic.
+        Default is ``False``.
 
     Returns
     -------
@@ -988,82 +1000,122 @@ def make_confounded_irm_data(n_obs=500, theta=5.0, cf_y=0.04, cf_d=0.04):
         doi:`10.1016/j.jeconom.2020.06.003 <https://doi.org/10.1016/j.jeconom.2020.06.003>`_.
     """
     c = 0.0  # the confounding strength is only valid for c=0
-    dim_x = 5
+    xi = 0.75
+    dim_x = kwargs.get('dim_x', 5)
+    trimming_threshold = kwargs.get('trimming_threshold', 0.01)
+    var_eps_y = kwargs.get('var_eps_y', 1.0)
+
+    # Specification of the main regression function
+    def f_reg(w):
+        res = 2.5 + 0.74*w[:, 0] + 0.25*w[:, 1] + 0.137*(w[:, 2] + w[:, 3])
+        return res
+
+    # Specification of the propensity score function
+    def f_ps(w, xi):
+        res = xi*(-w[:, 0] + 0.1*w[:, 1] - 0.25*w[:, 2] - 0.1*w[:, 3])
+        return res
 
     # observed covariates
     cov_mat = toeplitz([np.power(c, k) for k in range(dim_x)])
     x = np.random.multivariate_normal(np.zeros(dim_x), cov_mat, size=[n_obs, ])
-
     z_tilde_1 = np.exp(0.5*x[:, 0])
     z_tilde_2 = 10 + x[:, 1] / (1 + np.exp(x[:, 0]))
     z_tilde_3 = (0.6 + x[:, 0] * x[:, 2]/25)**3
     z_tilde_4 = (20 + x[:, 1] + x[:, 3])**2
-
-    z_tilde = np.column_stack((z_tilde_1, z_tilde_2, z_tilde_3, z_tilde_4, x[:, 4:]))
+    z_tilde_5 = x[:, 4]
+    z_tilde = np.column_stack((z_tilde_1, z_tilde_2, z_tilde_3, z_tilde_4, z_tilde_5))
     z = (z_tilde - np.mean(z_tilde, axis=0)) / np.std(z_tilde, axis=0)
-    # error terms and unobserved confounder
-    var_eps_y = 5
     eps_y = np.random.normal(loc=0, scale=np.sqrt(var_eps_y), size=n_obs)
-    # unobserved confounder
     a_bounds = (-1, 1)
     a = np.random.uniform(low=a_bounds[0], high=a_bounds[1], size=n_obs)
+    var_a = np.square(a_bounds[1] - a_bounds[0]) / 12
 
-    # get the required impact of the confounder on the propensity score
-    possible_coefs = np.arange(0.001, 0.4999, 0.001)
-    gamma_a = possible_coefs[(np.arctanh(2*possible_coefs) / (2*possible_coefs)) - 1 - cf_d/(1 - cf_d) >= 0][0]
-
-    # compute short and long form of riesz representer
-    m_long = 0.5 + gamma_a*a
-    m_short = 0.5 * np.ones_like(m_long)
+    # Choose the features used in the models
+    if linear:
+        features_ps = x
+        features_reg = x
+    else:
+        features_ps = z
+        features_reg = z
+    p = np.exp(f_ps(features_ps, xi)) / (1 + np.exp(f_ps(features_ps, xi)))
+    # compute short and long form of the propensity score
+    m_long = p + gamma_a*a
+    m_short = p
+    # check propensity score bounds
+    if np.any(m_long < trimming_threshold) or np.any(m_long > 1.0 - trimming_threshold):
+        m_long = np.clip(m_long, trimming_threshold, 1.0 - trimming_threshold)
+        m_short = np.clip(m_short, trimming_threshold, 1.0 - trimming_threshold)
+        warnings.warn(f'Propensity score is close to 0 or 1. '
+                      f'Trimming is applied at {trimming_threshold} and {1.0 - trimming_threshold}.')
+    # generate treatment based on long form
     u = np.random.uniform(low=0, high=1, size=n_obs)
     d = 1.0 * (m_long >= u)
-
-    # short and long version of g
-    g_partial_reg = 210 + 27.4*z[:, 0] + 13.7*(z[:, 1] + z[:, 2] + z[:, 3])
-
-    dx = d * (x[:, 4] + 1)
-    d1x = x[:, 4] + 1
-    var_dx = np.var(dx)
-    cov_adx = np.cov(a, dx)[0, 1]
-
-    def f_g(beta_a):
-        g_diff = beta_a * (a - cov_adx / var_dx)
-        y_diff = eps_y + g_diff
-        return np.square(np.mean(np.square(g_diff)) / np.mean(np.square(y_diff)) - cf_y)
-    beta_a = minimize_scalar(f_g).x
-
+    # add treatment heterogeneity
+    d1x = z[:, 4] + 1
+    var_dx = np.var(d * d1x)
+    cov_adx = gamma_a * var_a
+    # Outcome regression
+    g_partial_reg = f_reg(features_reg)
+    # short model
     g_short_d0 = g_partial_reg
     g_short_d1 = (theta + beta_a * cov_adx / var_dx) * d1x + g_partial_reg
     g_short = d * g_short_d1 + (1.0-d) * g_short_d0
-
+    # long model
     g_long_d0 = g_partial_reg + beta_a * a
     g_long_d1 = theta * d1x + g_partial_reg + beta_a * a
     g_long = d * g_long_d1 + (1.0-d) * g_long_d0
-
-    y0 = g_long_d0 + eps_y
-    y1 = g_long_d1 + eps_y
-
-    y = d * y1 + (1.0-d) * y0
-
-    oracle_values = {'g_long': g_long,
-                     'g_short': g_short,
-                     'm_long': m_long,
-                     'm_short': m_short,
-                     'gamma_a': gamma_a,
-                     'beta_a': beta_a,
-                     'a': a,
-                     'y0': y0,
-                     'y1': y1,
-                     'z': z}
-
-    res_dict = {'x': x,
-                'y': y,
-                'd': d,
-                'oracle_values': oracle_values}
-
+    # Potential outcomes
+    y_0 = g_long_d0 + eps_y
+    y_1 = g_long_d1 + eps_y
+    # Realized outcome
+    y = d * y_1 + (1.0-d) * y_0
+    # In-sample values for confounding strength
+    explained_residual_variance = np.square(g_long - g_short)
+    residual_variance = np.square(y - g_short)
+    cf_y = np.mean(explained_residual_variance) / np.mean(residual_variance)
+    # compute the Riesz representations
+    treated_weight = d / np.mean(d)
+    untreated_weight = (1.0 - d) / np.mean(d)
+    # Odds ratios
+    propensity_ratio_long = m_long / (1.0 - m_long)
+    rr_long_ate = d / m_long - (1.0 - d) / (1.0 - m_long)
+    rr_long_atte = treated_weight - np.multiply(untreated_weight, propensity_ratio_long)
+    propensity_ratio_short = m_short / (1.0 - m_short)
+    rr_short_ate = d / m_short - (1.0 - d) / (1.0 - m_short)
+    rr_short_atte = treated_weight - np.multiply(untreated_weight, propensity_ratio_short)
+    cf_d_ate = (np.mean(1/(m_long * (1 - m_long))) - np.mean(1/(m_short * (1 - m_short)))) / np.mean(1/(m_long * (1 - m_long)))
+    cf_d_atte = (np.mean(propensity_ratio_long) - np.mean(propensity_ratio_short)) / np.mean(propensity_ratio_long)
+    if (beta_a == 0) or (gamma_a == 0):
+        rho_ate = 0.0
+        rho_atte = 0.0
+    else:
+        rho_ate = np.corrcoef((g_long - g_short), (rr_long_ate - rr_short_ate))[0, 1]
+        rho_atte = np.corrcoef((g_long - g_short), (rr_long_atte - rr_short_atte))[0, 1]
+    oracle_values = {
+        'g_long': g_long,
+        'g_short': g_short,
+        'm_long': m_long,
+        'm_short': m_short,
+        'gamma_a': gamma_a,
+        'beta_a': beta_a,
+        'a': a,
+        'y_0': y_0,
+        'y_1': y_1,
+        'z': z,
+        'cf_y': cf_y,
+        'cf_d_ate': cf_d_ate,
+        'cf_d_atte': cf_d_atte,
+        'rho_ate': rho_ate,
+        'rho_atte': rho_atte,
+    }
+    res_dict = {
+        'x': x,
+        'y': y,
+        'd': d,
+        'oracle_values': oracle_values
+    }
     return res_dict
diff --git a/doubleml/tests/test_datasets.py b/doubleml/tests/test_datasets.py
index d662cd07..d1227869 100644
--- a/doubleml/tests/test_datasets.py
+++ b/doubleml/tests/test_datasets.py
@@ -186,10 +186,16 @@ def test_make_did_SZ2020_return_types(cross_sectional, dgp_type):
         _ = make_did_SZ2020(n_obs=100, dgp_type="5", cross_sectional_data=cross_sectional, return_type='matrix')
 
 
+@pytest.fixture(scope='function',
+                params=[True, False])
+def linear(request):
+    return request.param
+
+
 @pytest.mark.ci
-def test_make_confounded_irm_data_return_types():
+def test_make_confounded_irm_data_return_types(linear):
     np.random.seed(3141)
-    res = make_confounded_irm_data()
+    res = make_confounded_irm_data(linear=linear)
     assert isinstance(res, dict)
     assert isinstance(res['x'], np.ndarray)
     assert isinstance(res['y'], np.ndarray)
@@ -203,9 +209,14 @@ def test_make_confounded_irm_data_return_types():
     assert isinstance(res['oracle_values']['gamma_a'], float)
     assert isinstance(res['oracle_values']['beta_a'], float)
     assert isinstance(res['oracle_values']['a'], np.ndarray)
-    assert isinstance(res['oracle_values']['y0'], np.ndarray)
-    assert isinstance(res['oracle_values']['y1'], np.ndarray)
+    assert isinstance(res['oracle_values']['y_0'], np.ndarray)
+    assert isinstance(res['oracle_values']['y_1'], np.ndarray)
     assert isinstance(res['oracle_values']['z'], np.ndarray)
+    assert isinstance(res['oracle_values']['cf_y'], float)
+    assert isinstance(res['oracle_values']['cf_d_ate'], float)
+    assert isinstance(res['oracle_values']['cf_d_atte'], float)
+    assert isinstance(res['oracle_values']['rho_ate'], float)
+    assert isinstance(res['oracle_values']['rho_atte'], float)
 
 
 @pytest.mark.ci
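A quick usage sketch (not part of the patch, and assuming the patched `doubleml` is importable): it shows how the new in-sample confounding diagnostics can be read from `oracle_values`. Note that `cf_y` and `cf_d` are no longer inputs; they are computed in sample from ``gamma_a`` and ``beta_a`` and returned as oracle values. The sample size `n_obs=10000` is an arbitrary choice for a stable illustration.

    import numpy as np
    from doubleml.datasets import make_confounded_irm_data

    np.random.seed(3141)
    # draw one confounded sample with the new defaults (theta=0.0, nonlinear features Z)
    res = make_confounded_irm_data(n_obs=10000, theta=0.0, gamma_a=0.127, beta_a=0.58, linear=False)
    y, d, oracle = res['y'], res['d'], res['oracle_values']

    # in-sample confounding strength implied by gamma_a and beta_a
    print(f"cf_y={oracle['cf_y']:.4f}, cf_d_ate={oracle['cf_d_ate']:.4f}, "
          f"cf_d_atte={oracle['cf_d_atte']:.4f}, rho_ate={oracle['rho_ate']:.4f}")

    # with theta=0.0 the oracle ATE is zero, while the naive contrast remains
    # confounded through both Z and the unobserved A
    oracle_ate = np.mean(oracle['y_1'] - oracle['y_0'])
    naive_diff = y[d == 1].mean() - y[d == 0].mean()
    print(f"oracle ATE={oracle_ate:.3f}, naive difference in means={naive_diff:.3f}")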