Merge pull request #263 from DoubleML/s-add-conf-irm
Update `make_confounded_irm_data`
SvenKlaassen authored Aug 2, 2024
2 parents 0511f6b + 727df17 commit dc5d347
Showing 2 changed files with 136 additions and 73 deletions.
190 changes: 121 additions & 69 deletions doubleml/datasets.py
@@ -1,5 +1,6 @@
import pandas as pd
import numpy as np
import warnings

from scipy.linalg import toeplitz
from scipy.optimize import minimize_scalar
@@ -895,11 +896,11 @@ def f_ps(w, xi):
raise ValueError('Invalid return_type.')


def make_confounded_irm_data(n_obs=500, theta=5.0, cf_y=0.04, cf_d=0.04):
def make_confounded_irm_data(n_obs=500, theta=0.0, gamma_a=0.127, beta_a=0.58, linear=False, **kwargs):
"""
Generates confounded data from an interactive regression model.
The data generating process is defined as follows (similar to the Monte Carlo simulation used
The data generating process is defined as follows (inspired by the Monte Carlo simulation used
in Sant'Anna and Zhao (2020)).
Let :math:`X= (X_1, X_2, X_3, X_4, X_5)^T \\sim \\mathcal{N}(0, \\Sigma)`, where :math:`\\Sigma` corresponds
@@ -924,22 +925,30 @@ def make_confounded_irm_data(n_obs=500, theta=5.0, cf_y=0.04, cf_d=0.04):
.. math::
m(X, A) = P(D=1|X,A) = 0.5 + \\gamma_A \\cdot A
m(X, A) = P(D=1|X,A) = p(Z) + \\gamma_A \\cdot A
where
.. math::
p(Z) &= \\frac{\\exp(f_{ps}(Z))}{1 + \\exp(f_{ps}(Z))},
f_{ps}(Z) &= 0.75 \\cdot (-Z_1 + 0.1 \\cdot Z_2 -0.25 \\cdot Z_3 - 0.1 \\cdot Z_4).
and generate the treatment :math:`D = 1\\{m(X, A) \\ge U\\}` with :math:`U \\sim \\mathcal{U}[0, 1]`.
Since :math:`A` is independent of :math:`X`, the short form of the propensity score is given as
.. math::
P(D=1|X) = 0.5.
P(D=1|X) = p(Z).
Further, generate the outcome of interest :math:`Y` as
.. math::
Y &= \\theta \\cdot D (Z_5 + 1) + g(Z) + \\beta_A \\cdot A + \\varepsilon
g(Z) &= 210 + 27.4 \\cdot Z_1 +13.7 \\cdot (Z_2 + Z_3 + Z_4)
g(Z) &= 2.5 + 0.74 \\cdot Z_1 + 0.25 \\cdot Z_2 + 0.137 \\cdot (Z_3 + Z_4)
where :math:`\\varepsilon \\sim \\mathcal{N}(0,5)`.
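For reference, the short form of the propensity score stated above follows by averaging the long form over the confounder. A brief sketch (ignoring trimming), using that :math:`U` is independent of :math:`(Z, A)`, that :math:`A` is independent of :math:`Z`, and that :math:`A \sim \mathcal{U}[-1, 1]` has mean zero, as in the implementation below:

.. math::

    P(D=1|Z) = \mathbb{E}[m(Z, A)|Z] = p(Z) + \gamma_A \cdot \mathbb{E}[A] = p(Z).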
This implies an average treatment effect of :math:`\\theta`. Additionally, the long and short forms of
@@ -952,13 +961,13 @@ def make_confounded_irm_data(n_obs=500, theta=5.0, cf_y=0.04, cf_d=0.04):
\\mathbb{E}[Y|D, X] &= (\\theta + \\beta_A \\frac{\\mathrm{Cov}(A, D(Z_5 + 1))}{\\mathrm{Var}(D(Z_5 + 1))})
\\cdot D (Z_5 + 1) + g(Z).
Consequently, the strength of confounding is determined via :math:`\\gamma_A` and :math:`\\beta_A`.
Both are chosen to obtain the desired confounding of the outcome and Riesz Representer (in sample).
Consequently, the strength of confounding is determined via :math:`\\gamma_A` and :math:`\\beta_A`, which can be
set via the parameters ``gamma_a`` and ``beta_a``.
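The short-form slope :math:`\theta + \beta_A \frac{\mathrm{Cov}(A, D(Z_5 + 1))}{\mathrm{Var}(D(Z_5 + 1))}` shown above arises from replacing :math:`\mathbb{E}[A|D, Z]` by the linear projection of :math:`A` on :math:`D(Z_5 + 1)`. A sketch of the covariance term (ignoring trimming), using the tower property, :math:`A` independent of :math:`Z`, :math:`\mathbb{E}[A] = 0`, and :math:`\mathbb{E}[Z_5] = 0` after standardization:

.. math::

    \mathrm{Cov}(A, D(Z_5 + 1)) = \mathbb{E}[A \cdot (Z_5 + 1) \cdot m(Z, A)] = \gamma_A \cdot \mathbb{E}[A^2] \cdot \mathbb{E}[Z_5 + 1] = \gamma_A \cdot \mathrm{Var}(A),

which is exactly the ``cov_adx = gamma_a * var_a`` term in the implementation, with :math:`\mathrm{Var}(A) = 1/3` for :math:`A \sim \mathcal{U}[-1, 1]`.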
The observed data is given as :math:`W = (Y, D, X)`.
The observed data is given as :math:`W = (Y, D, Z)`.
Further, oracle values of the confounder :math:`A`, the transformed covariates :math:`Z`,
the potential outcomes of :math:`Y`, the coefficients :math:`\\gamma_a`, :math:`\\beta_a`, the
long and short forms of the main regression and the propensity score
the potential outcomes of :math:`Y`, the long and short forms of the main regression and the propensity score, and
in-sample versions of the confounding parameters :math:`cf_d` and :math:`cf_y` (for ATE and ATTE)
are returned in a dictionary.
Parameters
@@ -968,13 +977,16 @@ def make_confounded_irm_data(n_obs=500, theta=5.0, cf_y=0.04, cf_d=0.04):
Default is ``500``.
theta : float or int
Average treatment effect.
Default is ``5.0``.
cf_y : float
Percentage of the residual variation of the outcome explained by latent/confounding variable.
Default is ``0.04``.
cf_d : float
Percentage gains in the variation of the Riesz Representer generated by latent/confounding variable.
Default is ``0.04``.
Default is ``0.0``.
gamma_a : float
Coefficient of the unobserved confounder in the propensity score.
Default is ``0.127``.
beta_a : float
Coefficient of the unobserved confounder in the outcome regression.
Default is ``0.58``.
linear : bool
If ``True``, ``Z`` will be set to ``X``, such that the underlying (short) models are linear/logistic.
Default is ``False``.
Returns
-------
@@ -988,82 +1000,122 @@ def make_confounded_irm_data(n_obs=500, theta=5.0, cf_y=0.04, cf_d=0.04):
doi:`10.1016/j.jeconom.2020.06.003 <https://doi.org/10.1016/j.jeconom.2020.06.003>`_.
"""
c = 0.0 # the confounding strength is only valid for c=0
dim_x = 5
xi = 0.75
dim_x = kwargs.get('dim_x', 5)
trimming_threshold = kwargs.get('trimming_threshold', 0.01)
var_eps_y = kwargs.get('var_eps_y', 1.0)

# Specification of main regression function
def f_reg(w):
res = 2.5 + 0.74*w[:, 0] + 0.25 * w[:, 1] + 0.137*(w[:, 2] + w[:, 3])
return res

# Specification of prop score function
def f_ps(w, xi):
res = xi*(-w[:, 0] + 0.1*w[:, 1] - 0.25*w[:, 2] - 0.1*w[:, 3])
return res
# observed covariates
cov_mat = toeplitz([np.power(c, k) for k in range(dim_x)])
x = np.random.multivariate_normal(np.zeros(dim_x), cov_mat, size=[n_obs, ])

z_tilde_1 = np.exp(0.5*x[:, 0])
z_tilde_2 = 10 + x[:, 1] / (1 + np.exp(x[:, 0]))
z_tilde_3 = (0.6 + x[:, 0] * x[:, 2]/25)**3
z_tilde_4 = (20 + x[:, 1] + x[:, 3])**2

z_tilde = np.column_stack((z_tilde_1, z_tilde_2, z_tilde_3, z_tilde_4, x[:, 4:]))
z_tilde_5 = x[:, 4]
z_tilde = np.column_stack((z_tilde_1, z_tilde_2, z_tilde_3, z_tilde_4, z_tilde_5))
z = (z_tilde - np.mean(z_tilde, axis=0)) / np.std(z_tilde, axis=0)

# error terms and unobserved confounder
var_eps_y = 5
eps_y = np.random.normal(loc=0, scale=np.sqrt(var_eps_y), size=n_obs)

# unobserved confounder
a_bounds = (-1, 1)
a = np.random.uniform(low=a_bounds[0], high=a_bounds[1], size=n_obs)
var_a = np.square(a_bounds[1] - a_bounds[0]) / 12

# get the required impact of the confounder on the propensity score
possible_coefs = np.arange(0.001, 0.4999, 0.001)
gamma_a = possible_coefs[(np.arctanh(2*possible_coefs) / (2*possible_coefs)) - 1 - cf_d/(1 - cf_d) >= 0][0]

# compute short and long form of riesz representer
m_long = 0.5 + gamma_a*a
m_short = 0.5 * np.ones_like(m_long)
# Choose the features used in the models
if linear:
features_ps = x
features_reg = x
else:
features_ps = z
features_reg = z

p = np.exp(f_ps(features_ps, xi)) / (1 + np.exp(f_ps(features_ps, xi)))
# compute short and long form of propensity score
m_long = p + gamma_a*a
m_short = p
# check propensity score bounds
if np.any(m_long < trimming_threshold) or np.any(m_long > 1.0 - trimming_threshold):
m_long = np.clip(m_long, trimming_threshold, 1.0 - trimming_threshold)
m_short = np.clip(m_short, trimming_threshold, 1.0 - trimming_threshold)
warnings.warn(f'Propensity score is close to 0 or 1. '
f'Trimming is applied at {trimming_threshold} and {1.0-trimming_threshold}.')
# generate treatment based on long form
u = np.random.uniform(low=0, high=1, size=n_obs)
d = 1.0 * (m_long >= u)

# short and long version of g
g_partial_reg = 210 + 27.4*z[:, 0] + 13.7*(z[:, 1] + z[:, 2] + z[:, 3])

dx = d * (x[:, 4] + 1)
d1x = x[:, 4] + 1
var_dx = np.var(dx)
cov_adx = np.cov(a, dx)[0, 1]

def f_g(beta_a):
g_diff = beta_a * (a - cov_adx / var_dx)
y_diff = eps_y + g_diff
return np.square(np.mean(np.square(g_diff)) / np.mean(np.square(y_diff)) - cf_y)
beta_a = minimize_scalar(f_g).x

# add treatment heterogeneity
d1x = z[:, 4] + 1
var_dx = np.var(d*(d1x))
cov_adx = gamma_a * var_a
# Outcome regression
g_partial_reg = f_reg(features_reg)
# short model
g_short_d0 = g_partial_reg
g_short_d1 = (theta + beta_a * cov_adx / var_dx) * d1x + g_partial_reg
g_short = d * g_short_d1 + (1.0-d) * g_short_d0

# long model
g_long_d0 = g_partial_reg + beta_a * a
g_long_d1 = theta * d1x + g_partial_reg + beta_a * a
g_long = d * g_long_d1 + (1.0-d) * g_long_d0

y0 = g_long_d0 + eps_y
y1 = g_long_d1 + eps_y

y = d * y1 + (1.0-d) * y0

oracle_values = {'g_long': g_long,
'g_short': g_short,
'm_long': m_long,
'm_short': m_short,
'gamma_a': gamma_a,
'beta_a': beta_a,
'a': a,
'y0': y0,
'y1': y1,
'z': z}

res_dict = {'x': x,
'y': y,
'd': d,
'oracle_values': oracle_values}

# Potential outcomes
y_0 = g_long_d0 + eps_y
y_1 = g_long_d1 + eps_y
# Realized outcome
y = d * y_1 + (1.0-d) * y_0
# In-sample values for confounding strength
explained_residual_variance = np.square(g_long - g_short)
residual_variance = np.square(y - g_short)
cf_y = np.mean(explained_residual_variance) / np.mean(residual_variance)
# compute the Riesz representation
treated_weight = d / np.mean(d)
untreated_weight = (1.0 - d) / np.mean(d)
# Odds ratios
propensity_ratio_long = m_long / (1.0 - m_long)
rr_long_ate = d / m_long - (1.0 - d) / (1.0 - m_long)
rr_long_atte = treated_weight - np.multiply(untreated_weight, propensity_ratio_long)
propensity_ratio_short = m_short / (1.0 - m_short)
rr_short_ate = d / m_short - (1.0 - d) / (1.0 - m_short)
rr_short_atte = treated_weight - np.multiply(untreated_weight, propensity_ratio_short)
cf_d_ate = (np.mean(1/(m_long * (1 - m_long))) - np.mean(1/(m_short * (1 - m_short)))) / np.mean(1/(m_long * (1 - m_long)))
cf_d_atte = (np.mean(propensity_ratio_long) - np.mean(propensity_ratio_short)) / np.mean(propensity_ratio_long)
if (beta_a == 0) | (gamma_a == 0):
rho_ate = 0.0
rho_atte = 0.0
else:
rho_ate = np.corrcoef((g_long - g_short), (rr_long_ate - rr_short_ate))[0, 1]
rho_atte = np.corrcoef((g_long - g_short), (rr_long_atte - rr_short_atte))[0, 1]
oracle_values = {
'g_long': g_long,
'g_short': g_short,
'm_long': m_long,
'm_short': m_short,
'gamma_a': gamma_a,
'beta_a': beta_a,
'a': a,
'y_0': y_0,
'y_1': y_1,
'z': z,
'cf_y': cf_y,
'cf_d_ate': cf_d_ate,
'cf_d_atte': cf_d_atte,
'rho_ate': rho_ate,
'rho_atte': rho_atte,
}
res_dict = {
'x': x,
'y': y,
'd': d,
'oracle_values': oracle_values
}
return res_dict
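Taken together, a minimal usage sketch of the updated generator (illustrative only: seed, sample size, and parameter values are arbitrary, and the in-sample confounding values vary by draw):

import numpy as np
from doubleml.datasets import make_confounded_irm_data

np.random.seed(3141)
# one confounded draw; gamma_a and beta_a steer the strength of confounding
res = make_confounded_irm_data(n_obs=5000, theta=5.0, gamma_a=0.127, beta_a=0.58, linear=False)
x, y, d = res['x'], res['y'], res['d']

# in-sample confounding strength and confounding correlations implied by the chosen coefficients
oracle = res['oracle_values']
print(oracle['cf_y'], oracle['cf_d_ate'], oracle['cf_d_atte'])
print(oracle['rho_ate'], oracle['rho_atte'])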


19 changes: 15 additions & 4 deletions doubleml/tests/test_datasets.py
@@ -186,10 +186,16 @@ def test_make_did_SZ2020_return_types(cross_sectional, dgp_type):
_ = make_did_SZ2020(n_obs=100, dgp_type="5", cross_sectional_data=cross_sectional, return_type='matrix')


@pytest.fixture(scope='function',
params=[True, False])
def linear(request):
return request.param


@pytest.mark.ci
def test_make_confounded_irm_data_return_types():
def test_make_confounded_irm_data_return_types(linear):
np.random.seed(3141)
res = make_confounded_irm_data()
res = make_confounded_irm_data(linear=linear)
assert isinstance(res, dict)
assert isinstance(res['x'], np.ndarray)
assert isinstance(res['y'], np.ndarray)
@@ -203,9 +209,14 @@ def test_make_confounded_irm_data_return_types():
assert isinstance(res['oracle_values']['gamma_a'], float)
assert isinstance(res['oracle_values']['beta_a'], float)
assert isinstance(res['oracle_values']['a'], np.ndarray)
assert isinstance(res['oracle_values']['y0'], np.ndarray)
assert isinstance(res['oracle_values']['y1'], np.ndarray)
assert isinstance(res['oracle_values']['y_0'], np.ndarray)
assert isinstance(res['oracle_values']['y_1'], np.ndarray)
assert isinstance(res['oracle_values']['z'], np.ndarray)
assert isinstance(res['oracle_values']['cf_y'], float)
assert isinstance(res['oracle_values']['cf_d_ate'], float)
assert isinstance(res['oracle_values']['cf_d_atte'], float)
assert isinstance(res['oracle_values']['rho_ate'], float)
assert isinstance(res['oracle_values']['rho_atte'], float)
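Along the same lines, a sanity check on the new oracle values could assert that they are valid shares and correlations; a hypothetical extension (not part of this commit) that reuses the module's imports and the ``linear`` fixture above:

@pytest.mark.ci
def test_make_confounded_irm_data_confounding_bounds(linear):
    np.random.seed(3141)
    res = make_confounded_irm_data(linear=linear)
    oracle = res['oracle_values']
    # cf_y and both cf_d versions are relative gains, so they lie in [0, 1]
    assert 0.0 <= oracle['cf_y'] <= 1.0
    assert 0.0 <= oracle['cf_d_ate'] <= 1.0
    assert 0.0 <= oracle['cf_d_atte'] <= 1.0
    # the rho values are correlations
    assert -1.0 <= oracle['rho_ate'] <= 1.0
    assert -1.0 <= oracle['rho_atte'] <= 1.0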


@pytest.mark.ci
