From 2fcc6bb58459243ff03e0b4d091e187fdf1b1759 Mon Sep 17 00:00:00 2001 From: SvenKlaassen Date: Mon, 5 Sep 2022 16:12:41 +0200 Subject: [PATCH 01/15] create DoubleMLIRMBLP class and basic unit tests Best linear Predictor to construct CATEs and GATEs --- doubleml/__init__.py | 4 +- doubleml/double_ml_blp.py | 185 ++++++++++++++++++++++++++++ doubleml/tests/_utils_blp_manual.py | 127 +++++++++++++++++++ doubleml/tests/test_blp.py | 92 ++++++++++++++ 4 files changed, 407 insertions(+), 1 deletion(-) create mode 100644 doubleml/double_ml_blp.py create mode 100644 doubleml/tests/_utils_blp_manual.py create mode 100644 doubleml/tests/test_blp.py diff --git a/doubleml/__init__.py b/doubleml/__init__.py index d5b7c420..ccd491ab 100644 --- a/doubleml/__init__.py +++ b/doubleml/__init__.py @@ -5,12 +5,14 @@ from .double_ml_irm import DoubleMLIRM from .double_ml_iivm import DoubleMLIIVM from .double_ml_data import DoubleMLData, DoubleMLClusterData +from .double_ml_blp import DoubleMLIRMBLP __all__ = ['DoubleMLPLR', 'DoubleMLPLIV', 'DoubleMLIRM', 'DoubleMLIIVM', 'DoubleMLData', - 'DoubleMLClusterData'] + 'DoubleMLClusterData', + 'DoubleMLIRMBLP'] __version__ = get_distribution('doubleml').version diff --git a/doubleml/double_ml_blp.py b/doubleml/double_ml_blp.py new file mode 100644 index 00000000..d3666e76 --- /dev/null +++ b/doubleml/double_ml_blp.py @@ -0,0 +1,185 @@ +import statsmodels.api as sm +import numpy as np +import pandas as pd +import doubleml as dml + +from scipy.stats import norm +from scipy.linalg import sqrtm + +class DoubleMLIRMBLP: + """Best Linear Predictor for DoubleML IRM models + + Parameters + ---------- + obj_dml_irm : :class:`DoubleMLIRM` object + The :class:`DoubleMLIRM` object providing the interactive regression model with fitted nuisance functions. + + basis : :class:`pandas.DataFrame` + The basis for estimating the best linear predictor. Has to correspond to the observations of the IRM model. + """ + + def __init__(self, + obj_dml_irm, + basis): + + # check and pick up obj_dml_irm + if not isinstance(obj_dml_irm, dml.DoubleMLIRM): + raise TypeError('The model must be of DoubleMLIRM type. ' + f'{str(obj_dml_irm)} of type {str(type(obj_dml_irm))} was passed.') + + if not isinstance(basis, pd.DataFrame): + raise TypeError('The basis must be of DataFrame type. ' + f'{str(basis)} of type {str(type(basis))} was passed.') + + self._dml_irm = obj_dml_irm + self._basis = basis + + # initialize the orthogonal signal, the score and the covariance + self._orth_signal = None + self._blp_model = None + self._blp_omega = None + + @property + def blp_model(self): + """ + Best-Linear-Predictor model. + """ + return self._blp_model + + @property + def orth_signal(self): + """ + Orthogonal signal. + """ + return self._orth_signal + + @property + def blp_omega(self): + """ + Covariance matrix. + """ + return self._blp_omega + + def fit(self): + """ + Estimate DoubleML models. + + Returns + ------- + self : object + """ + # get the orthogonal signal from the IRM model + self._orth_signal = self._dml_irm.psi_b.reshape(-1, 1) + # fit the best-linear-predictor of the orthogonal signal with respect to the grid + self._blp_model = sm.OLS(self._orth_signal, self._basis).fit() + self._blp_omega = self._blp_model.cov_HC0 + + return self + + def confint(self, basis, joint=False, level=0.95, n_rep_boot=500): + """ + Confidence intervals for BLP for DoubleML IRM. + + Parameters + ---------- + basis : :class:`pandas.DataFrame` + The basis for constructing the confidence interval. Has to have the same form as the basis from + the construction. + + joint : bool + Indicates whether joint confidence intervals are computed. + Default is ``False`` + + level : float + The confidence level. + Default is ``0.95``. + + n_rep_boot : int + The number of bootstrap repetitions (only relevant for joint confidence intervals). + Default is ``500``. + + Returns + ------- + df_ci : pd.DataFrame + A data frame with the confidence interval(s). + """ + alpha = 1 - level + # blp of the orthogonal signal + g_hat = self._blp_model.predict(basis) + + # calculate se for basis elements + # check this again (calculation of HC0 should include scaling with sample size) + # se_scaling = np.sqrt(self._dml_irm._dml_data.n_obs) + se_scaling = 1 + blp_se = np.sqrt((basis.to_numpy().dot(self._blp_omega) * basis.to_numpy()).sum(axis=1)) / se_scaling + + if joint: + # calculate the maximum t-statistic with bootstrap + normal_samples = np.random.normal(size=[basis.shape[1], n_rep_boot]) + bootstrap_samples = np.multiply(basis.to_numpy().dot(np.dot(sqrtm(self._blp_omega), normal_samples)).T, + (blp_se * se_scaling)) + + max_t_stat = np.quantile(np.max(np.abs(bootstrap_samples), axis=0), q=level) + + # Lower simultaneous CI + g_hat_lower = g_hat - max_t_stat * blp_se + # Upper simultaneous CI + g_hat_upper = g_hat + max_t_stat * blp_se + + else: + # Lower point-wise CI + g_hat_lower = g_hat + norm.ppf(q=alpha / 2) * blp_se + # Upper point-wise CI + g_hat_upper = g_hat + norm.ppf(q=1 - alpha / 2) * blp_se + + ci = np.vstack((g_hat_lower, g_hat, g_hat_upper)).T + df_ci = pd.DataFrame(ci, + columns=['{:.1f} %'.format(alpha/2 * 100), 'effect', '{:.1f} %'.format((1-alpha/2) * 100)], + index=basis.index) + return df_ci + + +from doubleml.tests._utils_blp_manual import create_spline_basis, create_synthetic_data + +# DGP constants +np.random.seed(123) +n = 2000 +n_w = 10 +support_size = 5 +n_x = 1 + +# Create data +data, covariates = create_synthetic_data(n=n, n_w=n_w, support_size=support_size, n_x=n_x, constant=True) +data_dml_base = dml.DoubleMLData(data, + y_col='y', + d_cols='t', + x_cols=covariates) + +# First stage estimation +# Lasso regression +from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor +randomForest_reg = RandomForestRegressor(n_estimators=500) +randomForest_class = RandomForestClassifier(n_estimators=500) + +np.random.seed(123) + +dml_irm = dml.DoubleMLIRM(data_dml_base, + ml_g=randomForest_reg, + ml_m=randomForest_class, + trimming_threshold=0.01, + n_folds=5 + ) +print("Training first stage") +dml_irm.fit(store_predictions=True) + +spline_basis = create_spline_basis(X=data["x"], knots=3, degree=2) + +cate = DoubleMLIRMBLP(dml_irm, basis=spline_basis).fit() + +print(cate.confint(spline_basis, joint=False)) + +groups = pd.DataFrame(np.vstack([data["x"] <= 0.2, (data["x"] >= 0.2) & (data["x"] <= 0.7), data["x"] >= 0.7]).T, + columns=['Group 1', 'Group 2', 'Group 3']) + +gate = DoubleMLIRMBLP(dml_irm, basis=groups).fit() +print(gate.confint(groups)) diff --git a/doubleml/tests/_utils_blp_manual.py b/doubleml/tests/_utils_blp_manual.py new file mode 100644 index 00000000..42cb37c9 --- /dev/null +++ b/doubleml/tests/_utils_blp_manual.py @@ -0,0 +1,127 @@ +import numpy as np +import statsmodels.api as sm +from scipy.linalg import sqrtm +from scipy.stats import norm +import pandas as pd +import patsy + + +def fit_blp(obj_dml_irm, basis): + orth_signal = obj_dml_irm.psi_b.reshape(-1, 1) + blp_model = sm.OLS(orth_signal, basis).fit() + + return blp_model + + +def blp_confint(obj_dml_irm, blp_model, basis, joint=False, level=0.95, n_rep_boot=500): + alpha = 1 - level + g_hat = blp_model.predict(basis) + + blp_omega = blp_model.cov_HC0 + + # check this again (calculation of HC0 should include scaling with sample size) + # se_scaling = np.sqrt(self._dml_irm._dml_data.n_obs) + se_scaling = 1 + blp_se = np.sqrt((basis.dot(blp_omega) * basis).sum(axis=1)) / se_scaling + + if joint: + # calculate the maximum t-statistic with bootstrap + normal_samples = np.random.normal(size=[basis.shape[1], n_rep_boot]) + bootstrap_samples = np.multiply(basis.dot(np.dot(sqrtm(blp_omega), normal_samples)).T, + (blp_se * se_scaling)) + + max_t_stat = np.quantile(np.max(np.abs(bootstrap_samples), axis=0), q=level) + + # Lower simultaneous CI + g_hat_lower = g_hat - max_t_stat * blp_se + # Upper simultaneous CI + g_hat_upper = g_hat + max_t_stat * blp_se + + else: + # Lower point-wise CI + g_hat_lower = g_hat + norm.ppf(q=alpha / 2) * blp_se + # Upper point-wise CI + g_hat_upper = g_hat + norm.ppf(q=1 - alpha / 2) * blp_se + + ci = np.vstack((g_hat_lower, g_hat, g_hat_upper)).T + df_ci = pd.DataFrame(ci, + columns=['{:.1f} %'.format(alpha / 2 * 100), 'effect', + '{:.1f} %'.format((1 - alpha / 2) * 100)], + index=basis.index) + return df_ci + + +def create_spline_basis(X, knots=5, degree=3): + if type(knots) == int: + X_splines = patsy.bs(X, df=knots + degree, degree=degree) + else: + X_splines = patsy.bs(X, knots=knots, degree=degree) + return X_splines + + +def create_synthetic_data(n=200, n_w=30, support_size=5, n_x=1, constant=True): + """ + Creates a synthetic example based on example 2 of https://github.com/microsoft/EconML/blob/master/notebooks/Double%20Machine%20Learning%20Examples.ipynb + + Parameters + ---------- + n_samples : int + Number of samples. + Default is ``200``. + + n_w : int + Dimension of covariates. + Default is ``30``. + + support_size : int + Number of relevant covariates. + Default is ``5``. + + n_x : int + Dimension of treatment variable. + Default is ``1``. + + Returns + ------- + data : pd.DataFrame + A data frame. + + """ + # Outcome support + # With the next two lines we are effectively choosing the matrix gamma in the example + support_Y = np.random.choice(np.arange(n_w), size=support_size, replace=False) + coefs_Y = np.random.uniform(0, 1, size=support_size) + # Define the function to generate the noise + epsilon_sample = lambda n: np.random.uniform(-1, 1, size=n) + # Treatment support + # Assuming the matrices gamma and beta have the same non-zero components + support_T = support_Y + coefs_T = np.random.uniform(0, 1, size=support_size) + # Define the function to generate the noise + eta_sample = lambda n: np.random.uniform(-1, 1, size=n) + + # Generate controls, covariates, treatments and outcomes + W = np.random.normal(0, 1, size=(n, n_w)) + X = np.random.uniform(0, 1, size=(n, n_x)) + # Heterogeneous treatment effects + if constant: + TE = np.array([3 for x_i in X]) + else: + TE = np.array([np.exp(4 + 2 * x_i) for x_i in X]) + # Define treatment + log_odds = np.dot(W[:, support_T], coefs_T) + eta_sample(n) + T_sigmoid = 1 / (1 + np.exp(-log_odds)) + T = np.array([np.random.binomial(1, p) for p in T_sigmoid]) + # Define the outcome + Y = TE * T + np.dot(W[:, support_Y], coefs_Y) + epsilon_sample(n) + + # Now we build the dataset + y_df = pd.DataFrame({'y': Y}) + x_df = pd.DataFrame({'x': X.reshape(-1)}) + t_df = pd.DataFrame({'t': T}) + w_df = pd.DataFrame(data=W, index=np.arange(W.shape[0]), columns=[f'w_{i}' for i in range(W.shape[1])]) + + data = pd.concat([y_df, x_df, t_df, w_df], axis=1) + + covariates = list(w_df.columns.values) + list(x_df.columns.values) + return data, covariates \ No newline at end of file diff --git a/doubleml/tests/test_blp.py b/doubleml/tests/test_blp.py new file mode 100644 index 00000000..0f75483b --- /dev/null +++ b/doubleml/tests/test_blp.py @@ -0,0 +1,92 @@ +import numpy as np +import pytest + +from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor +import doubleml as dml + +from ._utils_blp_manual import fit_blp, blp_confint, create_spline_basis, create_synthetic_data + + +@pytest.fixture(scope='module', + params=[True]) +def constant(request): + return request.param + + +@pytest.fixture(scope='module', + params=[True, False]) +def ci_joint(request): + return request.param + + +@pytest.fixture(scope='module') +def dml_blp_fixture(constant, ci_joint): + np.random.seed(42) + n = 200 + n_w = 10 + support_size = 5 + n_x = 1 + + # Create data + data, covariates = create_synthetic_data(n=n, n_w=n_w, support_size=support_size, n_x=n_x, constant=constant) + data_dml_base = dml.DoubleMLData(data, + y_col='y', + d_cols='t', + x_cols=covariates) + + # First stage estimation + ml_m = RandomForestRegressor(n_estimators=100) + ml_g = RandomForestClassifier(n_estimators=100) + + np.random.seed(42) + dml_irm = dml.DoubleMLIRM(data_dml_base, + ml_g=ml_m, + ml_m=ml_g, + trimming_threshold=0.05, + n_folds=5) + + dml_irm.fit(store_predictions=True) + + spline_basis = create_spline_basis(X=data["x"]) + + cate = dml.double_ml_blp.DoubleMLIRMBLP(dml_irm, basis=spline_basis).fit() + cate_manual = fit_blp(dml_irm, spline_basis) + + np.random.seed(42) + ci = cate.confint(spline_basis, joint=ci_joint, level=0.95, n_rep_boot=1000) + np.random.seed(42) + ci_manual = blp_confint(dml_irm, cate_manual, spline_basis, joint=ci_joint, level=0.95, n_rep_boot=1000) + + res_dict = {'coef': cate.blp_model.params, + 'coef_manual': cate_manual.params, + 'values': cate.blp_model.fittedvalues, + 'values_manual': cate_manual.fittedvalues, + 'omega': cate.blp_omega, + 'omega_manual': cate_manual.cov_HC0, + 'ci': ci, + 'ci_manual': ci_manual} + + return res_dict + + +def test_dml_blp_coef(dml_blp_fixture): + assert np.allclose(dml_blp_fixture['coef'], + dml_blp_fixture['coef_manual'], + rtol=1e-9, atol=1e-4) + + +def test_dml_blp_values(dml_blp_fixture): + assert np.allclose(dml_blp_fixture['values'], + dml_blp_fixture['values_manual'], + rtol=1e-9, atol=1e-4) + + +def test_dml_blp_omega(dml_blp_fixture): + assert np.allclose(dml_blp_fixture['omega'], + dml_blp_fixture['omega_manual'], + rtol=1e-9, atol=1e-4) + +def test_dml_blp_ci(dml_blp_fixture): + assert np.allclose(dml_blp_fixture['ci'], + dml_blp_fixture['ci_manual'], + rtol=1e-9, atol=1e-4) From a25d3bc1cfed55452cff0d527738a5fbb4e26ac9 Mon Sep 17 00:00:00 2001 From: SvenKlaassen Date: Tue, 6 Sep 2022 11:14:11 +0200 Subject: [PATCH 02/15] create cate and gate method for irm Use the DoubleMLIRMBLP class to create CATE and GATE estimates for DoubleMLIRM objects. --- doubleml/double_ml_blp.py | 90 +++++++++-------------------- doubleml/double_ml_irm.py | 65 ++++++++++++++++++++- doubleml/tests/_utils_blp_manual.py | 61 +++++++++++++++++-- doubleml/tests/test_blp.py | 11 +++- 4 files changed, 153 insertions(+), 74 deletions(-) diff --git a/doubleml/double_ml_blp.py b/doubleml/double_ml_blp.py index d3666e76..8644252a 100644 --- a/doubleml/double_ml_blp.py +++ b/doubleml/double_ml_blp.py @@ -1,7 +1,6 @@ import statsmodels.api as sm import numpy as np import pandas as pd -import doubleml as dml from scipy.stats import norm from scipy.linalg import sqrtm @@ -11,31 +10,43 @@ class DoubleMLIRMBLP: Parameters ---------- - obj_dml_irm : :class:`DoubleMLIRM` object - The :class:`DoubleMLIRM` object providing the interactive regression model with fitted nuisance functions. + orth_signal : :class:`numpy.array` + The orthogonal signal to be predicted. Has to be of shape (n,). basis : :class:`pandas.DataFrame` - The basis for estimating the best linear predictor. Has to correspond to the observations of the IRM model. + The basis for estimating the best linear predictor. Has to have the shape (n,d), + where d is the number of predictors. """ def __init__(self, - obj_dml_irm, + orth_signal, basis): - # check and pick up obj_dml_irm - if not isinstance(obj_dml_irm, dml.DoubleMLIRM): - raise TypeError('The model must be of DoubleMLIRM type. ' - f'{str(obj_dml_irm)} of type {str(type(obj_dml_irm))} was passed.') + if not isinstance(orth_signal, np.ndarray): + raise TypeError('The signal must be of np.ndarray type. ' + f'{str(orth_signal)} of type {str(type(orth_signal))} was passed.') + + if orth_signal.ndim != 1: + raise ValueError('The signal must be of one dimensional. ' + f'{str(orth_signal)} of dimensions {str(orth_signal.ndim)} was passed.') if not isinstance(basis, pd.DataFrame): raise TypeError('The basis must be of DataFrame type. ' f'{str(basis)} of type {str(type(basis))} was passed.') - self._dml_irm = obj_dml_irm + if not basis.columns.is_unique: + raise ValueError('Invalid pd.DataFrame: ' + 'Contains duplicate column names.') + + if orth_signal.shape[0] != basis.shape[0]: + raise ValueError('Incompatible input dimensions.' + f'{str(orth_signal)} of shape {str(orth_signal.shape)} and' + f'{str(basis)} of shape {str(basis.shape)} were passed.') + + self._orth_signal = orth_signal self._basis = basis - # initialize the orthogonal signal, the score and the covariance - self._orth_signal = None + # initialize the score and the covariance self._blp_model = None self._blp_omega = None @@ -68,8 +79,7 @@ def fit(self): ------- self : object """ - # get the orthogonal signal from the IRM model - self._orth_signal = self._dml_irm.psi_b.reshape(-1, 1) + # fit the best-linear-predictor of the orthogonal signal with respect to the grid self._blp_model = sm.OLS(self._orth_signal, self._basis).fit() self._blp_omega = self._blp_model.cov_HC0 @@ -84,7 +94,7 @@ def confint(self, basis, joint=False, level=0.95, n_rep_boot=500): ---------- basis : :class:`pandas.DataFrame` The basis for constructing the confidence interval. Has to have the same form as the basis from - the construction. + the construction. joint : bool Indicates whether joint confidence intervals are computed. @@ -108,8 +118,6 @@ def confint(self, basis, joint=False, level=0.95, n_rep_boot=500): g_hat = self._blp_model.predict(basis) # calculate se for basis elements - # check this again (calculation of HC0 should include scaling with sample size) - # se_scaling = np.sqrt(self._dml_irm._dml_data.n_obs) se_scaling = 1 blp_se = np.sqrt((basis.to_numpy().dot(self._blp_omega) * basis.to_numpy()).sum(axis=1)) / se_scaling @@ -136,50 +144,4 @@ def confint(self, basis, joint=False, level=0.95, n_rep_boot=500): df_ci = pd.DataFrame(ci, columns=['{:.1f} %'.format(alpha/2 * 100), 'effect', '{:.1f} %'.format((1-alpha/2) * 100)], index=basis.index) - return df_ci - - -from doubleml.tests._utils_blp_manual import create_spline_basis, create_synthetic_data - -# DGP constants -np.random.seed(123) -n = 2000 -n_w = 10 -support_size = 5 -n_x = 1 - -# Create data -data, covariates = create_synthetic_data(n=n, n_w=n_w, support_size=support_size, n_x=n_x, constant=True) -data_dml_base = dml.DoubleMLData(data, - y_col='y', - d_cols='t', - x_cols=covariates) - -# First stage estimation -# Lasso regression -from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor -randomForest_reg = RandomForestRegressor(n_estimators=500) -randomForest_class = RandomForestClassifier(n_estimators=500) - -np.random.seed(123) - -dml_irm = dml.DoubleMLIRM(data_dml_base, - ml_g=randomForest_reg, - ml_m=randomForest_class, - trimming_threshold=0.01, - n_folds=5 - ) -print("Training first stage") -dml_irm.fit(store_predictions=True) - -spline_basis = create_spline_basis(X=data["x"], knots=3, degree=2) - -cate = DoubleMLIRMBLP(dml_irm, basis=spline_basis).fit() - -print(cate.confint(spline_basis, joint=False)) - -groups = pd.DataFrame(np.vstack([data["x"] <= 0.2, (data["x"] >= 0.2) & (data["x"] <= 0.7), data["x"] >= 0.7]).T, - columns=['Group 1', 'Group 2', 'Group 3']) - -gate = DoubleMLIRMBLP(dml_irm, basis=groups).fit() -print(gate.confint(groups)) + return df_ci \ No newline at end of file diff --git a/doubleml/double_ml_irm.py b/doubleml/double_ml_irm.py index 4ecd2b78..85d449e7 100644 --- a/doubleml/double_ml_irm.py +++ b/doubleml/double_ml_irm.py @@ -3,8 +3,9 @@ from sklearn.utils.multiclass import type_of_target from .double_ml import DoubleML -from ._utils import _dml_cv_predict, _get_cond_smpls, _dml_tune, _check_finite_predictions +from .double_ml_blp import DoubleMLIRMBLP +from ._utils import _dml_cv_predict, _get_cond_smpls, _dml_tune, _check_finite_predictions class DoubleMLIRM(DoubleML): """Double machine learning for interactive regression models @@ -309,3 +310,65 @@ def _nuisance_tuning(self, smpls, param_grids, scoring_methods, n_folds_tune, n_ 'tune_res': tune_res} return res + + def cate(self, basis): + """ + Calculate conditional average treatment effects (CATE) for a given basis. + + Parameters + ---------- + basis : :class:`pandas.DataFrame` + The basis for estimating the best linear predictor. Has to have the shape (n,d), + where d is the number of predictors. + + Returns + ------- + model : :class:`doubleML.DoubleMLIRMBLP` + Best linear Predictor model. + """ + # define the orthogonal signal + orth_signal = self.psi_b.reshape(-1) + + model = DoubleMLIRMBLP(orth_signal, basis=basis).fit() + + return model + + def gate(self, groups, joint=False, level=0.95, n_rep_boot=500): + """ + Calculate group average treatment effects (GATE) for a given basis. + + Parameters + ---------- + groups : :class:`pandas.DataFrame` + The basis for estimating the best linear predictor. Has to have the shape (n,d), + where d is the number of groups. + + joint : bool + Indicates whether joint confidence intervals are computed. + Default is ``False`` + + level : float + The confidence level. + Default is ``0.95``. + + n_rep_boot : int + Number of bootstrap samples for joint confidence interval. + Default is ``500``. + + Returns + ------- + df_ci : pd.DataFrame + A data frame with the confidence interval(s). + """ + + + # define the orthogonal signal + orth_signal = self.psi_b.reshape(-1) + model = DoubleMLIRMBLP(orth_signal, basis=groups).fit() + + unique_groups = groups.drop_duplicates(keep='last') + df_ci = model.confint(unique_groups, joint, level, n_rep_boot) + df_ci.index = unique_groups.columns.values + + return df_ci + diff --git a/doubleml/tests/_utils_blp_manual.py b/doubleml/tests/_utils_blp_manual.py index 42cb37c9..2d560b70 100644 --- a/doubleml/tests/_utils_blp_manual.py +++ b/doubleml/tests/_utils_blp_manual.py @@ -6,21 +6,18 @@ import patsy -def fit_blp(obj_dml_irm, basis): - orth_signal = obj_dml_irm.psi_b.reshape(-1, 1) +def fit_blp(orth_signal, basis): blp_model = sm.OLS(orth_signal, basis).fit() return blp_model -def blp_confint(obj_dml_irm, blp_model, basis, joint=False, level=0.95, n_rep_boot=500): +def blp_confint(blp_model, basis, joint=False, level=0.95, n_rep_boot=500): alpha = 1 - level g_hat = blp_model.predict(basis) blp_omega = blp_model.cov_HC0 - # check this again (calculation of HC0 should include scaling with sample size) - # se_scaling = np.sqrt(self._dml_irm._dml_data.n_obs) se_scaling = 1 blp_se = np.sqrt((basis.dot(blp_omega) * basis).sum(axis=1)) / se_scaling @@ -124,4 +121,56 @@ def create_synthetic_data(n=200, n_w=30, support_size=5, n_x=1, constant=True): data = pd.concat([y_df, x_df, t_df, w_df], axis=1) covariates = list(w_df.columns.values) + list(x_df.columns.values) - return data, covariates \ No newline at end of file + return data, covariates + + +import doubleml as dml +from doubleml.tests._utils_blp_manual import create_spline_basis, create_synthetic_data + +# DGP constants +np.random.seed(123) +n = 2000 +n_w = 10 +support_size = 5 +n_x = 1 + +# Create data +data, covariates = create_synthetic_data(n=n, n_w=n_w, support_size=support_size, n_x=n_x, constant=True) +data_dml_base = dml.DoubleMLData(data, + y_col='y', + d_cols='t', + x_cols=covariates) + +# First stage estimation +# Lasso regression +from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor +randomForest_reg = RandomForestRegressor(n_estimators=500) +randomForest_class = RandomForestClassifier(n_estimators=500) + +np.random.seed(123) + +dml_irm = dml.DoubleMLIRM(data_dml_base, + ml_g=randomForest_reg, + ml_m=randomForest_class, + trimming_threshold=0.01, + n_folds=5 + ) +print("Training first stage") +dml_irm.fit(store_predictions=True) + +spline_basis = create_spline_basis(X=data["x"], knots=3, degree=2) + + + +# get the orthogonal signal from the IRM model +#orth_signal = dml_irm.psi_b.reshape(-1) +#cate = DoubleMLIRMBLP(orth_signal, basis=spline_basis).fit() +cate = dml_irm.cate(spline_basis) + +print(cate.confint(spline_basis, joint=False)) + +groups = pd.DataFrame(np.vstack([data["x"] <= 0.2, (data["x"] >= 0.2) & (data["x"] <= 0.7), data["x"] >= 0.7]).T, + columns=['Group 1', 'Group 2', 'Group 3']) + +gate = dml_irm.gate(groups) +print(gate) \ No newline at end of file diff --git a/doubleml/tests/test_blp.py b/doubleml/tests/test_blp.py index 0f75483b..6ad7217a 100644 --- a/doubleml/tests/test_blp.py +++ b/doubleml/tests/test_blp.py @@ -49,13 +49,17 @@ def dml_blp_fixture(constant, ci_joint): spline_basis = create_spline_basis(X=data["x"]) - cate = dml.double_ml_blp.DoubleMLIRMBLP(dml_irm, basis=spline_basis).fit() - cate_manual = fit_blp(dml_irm, spline_basis) + # cate + cate = dml_irm.cate(spline_basis) + + # get the orthogonal signal from the IRM model + orth_signal = dml_irm.psi_b.reshape(-1) + cate_manual = fit_blp(orth_signal, spline_basis) np.random.seed(42) ci = cate.confint(spline_basis, joint=ci_joint, level=0.95, n_rep_boot=1000) np.random.seed(42) - ci_manual = blp_confint(dml_irm, cate_manual, spline_basis, joint=ci_joint, level=0.95, n_rep_boot=1000) + ci_manual = blp_confint(cate_manual, spline_basis, joint=ci_joint, level=0.95, n_rep_boot=1000) res_dict = {'coef': cate.blp_model.params, 'coef_manual': cate_manual.params, @@ -90,3 +94,4 @@ def test_dml_blp_ci(dml_blp_fixture): assert np.allclose(dml_blp_fixture['ci'], dml_blp_fixture['ci_manual'], rtol=1e-9, atol=1e-4) + From fff77acf17f245587a0b0136638ee5d3a1d7e27b Mon Sep 17 00:00:00 2001 From: SvenKlaassen Date: Thu, 8 Sep 2022 15:49:13 +0200 Subject: [PATCH 03/15] restructure basis unit tests --- .gitignore | 1 + doubleml/double_ml_blp.py | 33 +++++-- doubleml/double_ml_irm.py | 23 +++-- doubleml/tests/_utils_blp_manual.py | 134 +--------------------------- doubleml/tests/test_blp.py | 58 ++++++------ 5 files changed, 74 insertions(+), 175 deletions(-) diff --git a/.gitignore b/.gitignore index 14d18a6a..69699106 100644 --- a/.gitignore +++ b/.gitignore @@ -26,3 +26,4 @@ share/python-wheels/ .installed.cfg *.egg MANIFEST +*.idea diff --git a/doubleml/double_ml_blp.py b/doubleml/double_ml_blp.py index 8644252a..642d1f40 100644 --- a/doubleml/double_ml_blp.py +++ b/doubleml/double_ml_blp.py @@ -5,6 +5,7 @@ from scipy.stats import norm from scipy.linalg import sqrtm + class DoubleMLIRMBLP: """Best Linear Predictor for DoubleML IRM models @@ -24,15 +25,15 @@ def __init__(self, if not isinstance(orth_signal, np.ndarray): raise TypeError('The signal must be of np.ndarray type. ' - f'{str(orth_signal)} of type {str(type(orth_signal))} was passed.') + f'Signal of type {str(type(orth_signal))} was passed.') if orth_signal.ndim != 1: raise ValueError('The signal must be of one dimensional. ' - f'{str(orth_signal)} of dimensions {str(orth_signal.ndim)} was passed.') + f'Signal of dimensions {str(orth_signal.ndim)} was passed.') if not isinstance(basis, pd.DataFrame): raise TypeError('The basis must be of DataFrame type. ' - f'{str(basis)} of type {str(type(basis))} was passed.') + f'Basis of type {str(type(basis))} was passed.') if not basis.columns.is_unique: raise ValueError('Invalid pd.DataFrame: ' @@ -40,8 +41,8 @@ def __init__(self, if orth_signal.shape[0] != basis.shape[0]: raise ValueError('Incompatible input dimensions.' - f'{str(orth_signal)} of shape {str(orth_signal.shape)} and' - f'{str(basis)} of shape {str(basis.shape)} were passed.') + f'Signal of shape {str(orth_signal.shape)} and' + f'basis of shape {str(basis.shape)} were passed.') self._orth_signal = orth_signal self._basis = basis @@ -64,6 +65,13 @@ def orth_signal(self): """ return self._orth_signal + @property + def basis(self): + """ + Basis. + """ + return self._basis + @property def blp_omega(self): """ @@ -113,6 +121,17 @@ def confint(self, basis, joint=False, level=0.95, n_rep_boot=500): df_ci : pd.DataFrame A data frame with the confidence interval(s). """ + if not isinstance(joint, bool): + raise TypeError('joint must be True or False. ' + f'Got {str(joint)}.') + + if not isinstance(level, float): + raise TypeError('The confidence level must be of float type. ' + f'{str(level)} of type {str(type(level))} was passed.') + if (level <= 0) | (level >= 1): + raise ValueError('The confidence level must be in (0,1). ' + f'{str(level)} was passed.') + alpha = 1 - level # blp of the orthogonal signal g_hat = self._blp_model.predict(basis) @@ -142,6 +161,6 @@ def confint(self, basis, joint=False, level=0.95, n_rep_boot=500): ci = np.vstack((g_hat_lower, g_hat, g_hat_upper)).T df_ci = pd.DataFrame(ci, - columns=['{:.1f} %'.format(alpha/2 * 100), 'effect', '{:.1f} %'.format((1-alpha/2) * 100)], - index=basis.index) + columns=['{:.1f} %'.format(alpha/2 * 100), 'effect', '{:.1f} %'.format((1-alpha/2) * 100)], + index=basis.index) return df_ci \ No newline at end of file diff --git a/doubleml/double_ml_irm.py b/doubleml/double_ml_irm.py index 85d449e7..329dacb1 100644 --- a/doubleml/double_ml_irm.py +++ b/doubleml/double_ml_irm.py @@ -1,4 +1,5 @@ import numpy as np +import pandas as pd from sklearn.utils import check_X_y from sklearn.utils.multiclass import type_of_target @@ -328,7 +329,7 @@ def cate(self, basis): """ # define the orthogonal signal orth_signal = self.psi_b.reshape(-1) - + # fit the best linear predictor model = DoubleMLIRMBLP(orth_signal, basis=basis).fit() return model @@ -340,8 +341,8 @@ def gate(self, groups, joint=False, level=0.95, n_rep_boot=500): Parameters ---------- groups : :class:`pandas.DataFrame` - The basis for estimating the best linear predictor. Has to have the shape (n,d), - where d is the number of groups. + The group indicator for estimating the best linear predictor. Has to have the shape (n,d), + where d is the number of groups or (n,1) and contain the corresponding groups. joint : bool Indicates whether joint confidence intervals are computed. @@ -361,14 +362,26 @@ def gate(self, groups, joint=False, level=0.95, n_rep_boot=500): A data frame with the confidence interval(s). """ + if not isinstance(groups, pd.DataFrame): + raise TypeError('Groups must be of DataFrame type. ' + f'Groups of type {str(type(groups))} was passed.') + + if not all(groups.dtypes == bool) or all(groups.dtypes == int): + if groups.shape[1] == 1: + groups = pd.get_dummies(groups, prefix='Group', prefix_sep='_') + else: + raise TypeError('Columns must be of of bool or int type or the data frame only should contain' + 'one column.') # define the orthogonal signal orth_signal = self.psi_b.reshape(-1) + # fit the best linear predictor model = DoubleMLIRMBLP(orth_signal, basis=groups).fit() - unique_groups = groups.drop_duplicates(keep='last') + # reduce to unique groups and create confidence interval + unique_groups = pd.DataFrame(np.diag(v=np.full((groups.shape[1]), True))) df_ci = model.confint(unique_groups, joint, level, n_rep_boot) - df_ci.index = unique_groups.columns.values + df_ci.index = groups.columns.values return df_ci diff --git a/doubleml/tests/_utils_blp_manual.py b/doubleml/tests/_utils_blp_manual.py index 2d560b70..213a2b16 100644 --- a/doubleml/tests/_utils_blp_manual.py +++ b/doubleml/tests/_utils_blp_manual.py @@ -18,14 +18,12 @@ def blp_confint(blp_model, basis, joint=False, level=0.95, n_rep_boot=500): blp_omega = blp_model.cov_HC0 - se_scaling = 1 - blp_se = np.sqrt((basis.dot(blp_omega) * basis).sum(axis=1)) / se_scaling + blp_se = np.sqrt((basis.dot(blp_omega) * basis).sum(axis=1)) if joint: # calculate the maximum t-statistic with bootstrap normal_samples = np.random.normal(size=[basis.shape[1], n_rep_boot]) - bootstrap_samples = np.multiply(basis.dot(np.dot(sqrtm(blp_omega), normal_samples)).T, - (blp_se * se_scaling)) + bootstrap_samples = np.multiply(basis.dot(np.dot(sqrtm(blp_omega), normal_samples)).T, blp_se) max_t_stat = np.quantile(np.max(np.abs(bootstrap_samples), axis=0), q=level) @@ -46,131 +44,3 @@ def blp_confint(blp_model, basis, joint=False, level=0.95, n_rep_boot=500): '{:.1f} %'.format((1 - alpha / 2) * 100)], index=basis.index) return df_ci - - -def create_spline_basis(X, knots=5, degree=3): - if type(knots) == int: - X_splines = patsy.bs(X, df=knots + degree, degree=degree) - else: - X_splines = patsy.bs(X, knots=knots, degree=degree) - return X_splines - - -def create_synthetic_data(n=200, n_w=30, support_size=5, n_x=1, constant=True): - """ - Creates a synthetic example based on example 2 of https://github.com/microsoft/EconML/blob/master/notebooks/Double%20Machine%20Learning%20Examples.ipynb - - Parameters - ---------- - n_samples : int - Number of samples. - Default is ``200``. - - n_w : int - Dimension of covariates. - Default is ``30``. - - support_size : int - Number of relevant covariates. - Default is ``5``. - - n_x : int - Dimension of treatment variable. - Default is ``1``. - - Returns - ------- - data : pd.DataFrame - A data frame. - - """ - # Outcome support - # With the next two lines we are effectively choosing the matrix gamma in the example - support_Y = np.random.choice(np.arange(n_w), size=support_size, replace=False) - coefs_Y = np.random.uniform(0, 1, size=support_size) - # Define the function to generate the noise - epsilon_sample = lambda n: np.random.uniform(-1, 1, size=n) - # Treatment support - # Assuming the matrices gamma and beta have the same non-zero components - support_T = support_Y - coefs_T = np.random.uniform(0, 1, size=support_size) - # Define the function to generate the noise - eta_sample = lambda n: np.random.uniform(-1, 1, size=n) - - # Generate controls, covariates, treatments and outcomes - W = np.random.normal(0, 1, size=(n, n_w)) - X = np.random.uniform(0, 1, size=(n, n_x)) - # Heterogeneous treatment effects - if constant: - TE = np.array([3 for x_i in X]) - else: - TE = np.array([np.exp(4 + 2 * x_i) for x_i in X]) - # Define treatment - log_odds = np.dot(W[:, support_T], coefs_T) + eta_sample(n) - T_sigmoid = 1 / (1 + np.exp(-log_odds)) - T = np.array([np.random.binomial(1, p) for p in T_sigmoid]) - # Define the outcome - Y = TE * T + np.dot(W[:, support_Y], coefs_Y) + epsilon_sample(n) - - # Now we build the dataset - y_df = pd.DataFrame({'y': Y}) - x_df = pd.DataFrame({'x': X.reshape(-1)}) - t_df = pd.DataFrame({'t': T}) - w_df = pd.DataFrame(data=W, index=np.arange(W.shape[0]), columns=[f'w_{i}' for i in range(W.shape[1])]) - - data = pd.concat([y_df, x_df, t_df, w_df], axis=1) - - covariates = list(w_df.columns.values) + list(x_df.columns.values) - return data, covariates - - -import doubleml as dml -from doubleml.tests._utils_blp_manual import create_spline_basis, create_synthetic_data - -# DGP constants -np.random.seed(123) -n = 2000 -n_w = 10 -support_size = 5 -n_x = 1 - -# Create data -data, covariates = create_synthetic_data(n=n, n_w=n_w, support_size=support_size, n_x=n_x, constant=True) -data_dml_base = dml.DoubleMLData(data, - y_col='y', - d_cols='t', - x_cols=covariates) - -# First stage estimation -# Lasso regression -from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor -randomForest_reg = RandomForestRegressor(n_estimators=500) -randomForest_class = RandomForestClassifier(n_estimators=500) - -np.random.seed(123) - -dml_irm = dml.DoubleMLIRM(data_dml_base, - ml_g=randomForest_reg, - ml_m=randomForest_class, - trimming_threshold=0.01, - n_folds=5 - ) -print("Training first stage") -dml_irm.fit(store_predictions=True) - -spline_basis = create_spline_basis(X=data["x"], knots=3, degree=2) - - - -# get the orthogonal signal from the IRM model -#orth_signal = dml_irm.psi_b.reshape(-1) -#cate = DoubleMLIRMBLP(orth_signal, basis=spline_basis).fit() -cate = dml_irm.cate(spline_basis) - -print(cate.confint(spline_basis, joint=False)) - -groups = pd.DataFrame(np.vstack([data["x"] <= 0.2, (data["x"] >= 0.2) & (data["x"] <= 0.7), data["x"] >= 0.7]).T, - columns=['Group 1', 'Group 2', 'Group 3']) - -gate = dml_irm.gate(groups) -print(gate) \ No newline at end of file diff --git a/doubleml/tests/test_blp.py b/doubleml/tests/test_blp.py index 6ad7217a..477437c1 100644 --- a/doubleml/tests/test_blp.py +++ b/doubleml/tests/test_blp.py @@ -1,65 +1,59 @@ import numpy as np +import pandas as pd import pytest from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor import doubleml as dml -from ._utils_blp_manual import fit_blp, blp_confint, create_spline_basis, create_synthetic_data +from ._utils_blp_manual import fit_blp, blp_confint @pytest.fixture(scope='module', - params=[True]) -def constant(request): + params=[True, False]) +def ci_joint(request): return request.param - @pytest.fixture(scope='module', - params=[True, False]) -def ci_joint(request): + params=[0.95, 0.9]) +def ci_level(request): return request.param @pytest.fixture(scope='module') -def dml_blp_fixture(constant, ci_joint): +def dml_blp_fixture(ci_joint, ci_level): np.random.seed(42) n = 200 - n_w = 10 - support_size = 5 - n_x = 1 - # Create data - data, covariates = create_synthetic_data(n=n, n_w=n_w, support_size=support_size, n_x=n_x, constant=constant) - data_dml_base = dml.DoubleMLData(data, - y_col='y', - d_cols='t', - x_cols=covariates) + # collect data + np.random.seed(42) + obj_dml_data = dml.datasets.make_irm_data(n_obs=n, dim_x=5) # First stage estimation ml_m = RandomForestRegressor(n_estimators=100) ml_g = RandomForestClassifier(n_estimators=100) - np.random.seed(42) - dml_irm = dml.DoubleMLIRM(data_dml_base, - ml_g=ml_m, - ml_m=ml_g, - trimming_threshold=0.05, - n_folds=5) + dml_irm_obj = dml.DoubleMLIRM(obj_dml_data, + ml_g=ml_m, + ml_m=ml_g, + trimming_threshold=0.05, + n_folds=5) - dml_irm.fit(store_predictions=True) + dml_irm_obj.fit() - spline_basis = create_spline_basis(X=data["x"]) + # create a random basis + random_basis = pd.DataFrame(np.random.normal(0, 1, size=(n, 5))) # cate - cate = dml_irm.cate(spline_basis) + cate = dml_irm_obj.cate(random_basis) # get the orthogonal signal from the IRM model - orth_signal = dml_irm.psi_b.reshape(-1) - cate_manual = fit_blp(orth_signal, spline_basis) + orth_signal = dml_irm_obj.psi_b.reshape(-1) + cate_manual = fit_blp(orth_signal, random_basis) np.random.seed(42) - ci = cate.confint(spline_basis, joint=ci_joint, level=0.95, n_rep_boot=1000) + ci = cate.confint(random_basis, joint=ci_joint, level=ci_level, n_rep_boot=1000) np.random.seed(42) - ci_manual = blp_confint(cate_manual, spline_basis, joint=ci_joint, level=0.95, n_rep_boot=1000) + ci_manual = blp_confint(cate_manual, random_basis, joint=ci_joint, level=ci_level, n_rep_boot=1000) res_dict = {'coef': cate.blp_model.params, 'coef_manual': cate_manual.params, @@ -73,23 +67,25 @@ def dml_blp_fixture(constant, ci_joint): return res_dict +@pytest.mark.ci def test_dml_blp_coef(dml_blp_fixture): assert np.allclose(dml_blp_fixture['coef'], dml_blp_fixture['coef_manual'], rtol=1e-9, atol=1e-4) - +@pytest.mark.ci def test_dml_blp_values(dml_blp_fixture): assert np.allclose(dml_blp_fixture['values'], dml_blp_fixture['values_manual'], rtol=1e-9, atol=1e-4) - +@pytest.mark.ci def test_dml_blp_omega(dml_blp_fixture): assert np.allclose(dml_blp_fixture['omega'], dml_blp_fixture['omega_manual'], rtol=1e-9, atol=1e-4) +@pytest.mark.ci def test_dml_blp_ci(dml_blp_fixture): assert np.allclose(dml_blp_fixture['ci'], dml_blp_fixture['ci_manual'], From fe3448c83b5e35487d1c7cf6b5ddb159152da54b Mon Sep 17 00:00:00 2001 From: SvenKlaassen Date: Thu, 8 Sep 2022 16:31:50 +0200 Subject: [PATCH 04/15] Update double_ml_blp.py --- doubleml/double_ml_blp.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/doubleml/double_ml_blp.py b/doubleml/double_ml_blp.py index 642d1f40..07c15f5c 100644 --- a/doubleml/double_ml_blp.py +++ b/doubleml/double_ml_blp.py @@ -132,6 +132,13 @@ def confint(self, basis, joint=False, level=0.95, n_rep_boot=500): raise ValueError('The confidence level must be in (0,1). ' f'{str(level)} was passed.') + if not isinstance(n_rep_boot, int): + raise TypeError('The number of bootstrap replications must be of int type. ' + f'{str(n_rep_boot)} of type {str(type(n_rep_boot))} was passed.') + if n_rep_boot < 1: + raise ValueError('The number of bootstrap replications must be positive. ' + f'{str(n_rep_boot)} was passed.') + alpha = 1 - level # blp of the orthogonal signal g_hat = self._blp_model.predict(basis) From bc52a5136e92ed6c556f8232cdaaaf811f341a7b Mon Sep 17 00:00:00 2001 From: SvenKlaassen <47529404+SvenKlaassen@users.noreply.github.com> Date: Fri, 9 Sep 2022 12:18:25 +0200 Subject: [PATCH 05/15] add unit tests add unit tests restructure unit tests for blp, cate and gate --- doubleml/double_ml_blp.py | 13 ++--- doubleml/double_ml_irm.py | 2 +- doubleml/tests/_utils_blp_manual.py | 1 - doubleml/tests/test_blp.py | 50 ++++++------------- doubleml/tests/test_doubleml_exceptions.py | 58 +++++++++++++++++++++- doubleml/tests/test_irm.py | 35 +++++++++++++ 6 files changed, 112 insertions(+), 47 deletions(-) diff --git a/doubleml/double_ml_blp.py b/doubleml/double_ml_blp.py index 07c15f5c..3465c5a8 100644 --- a/doubleml/double_ml_blp.py +++ b/doubleml/double_ml_blp.py @@ -39,11 +39,6 @@ def __init__(self, raise ValueError('Invalid pd.DataFrame: ' 'Contains duplicate column names.') - if orth_signal.shape[0] != basis.shape[0]: - raise ValueError('Incompatible input dimensions.' - f'Signal of shape {str(orth_signal.shape)} and' - f'basis of shape {str(basis.shape)} were passed.') - self._orth_signal = orth_signal self._basis = basis @@ -139,19 +134,21 @@ def confint(self, basis, joint=False, level=0.95, n_rep_boot=500): raise ValueError('The number of bootstrap replications must be positive. ' f'{str(n_rep_boot)} was passed.') + if self._blp_model is None: + raise ValueError('Apply fit() before confint().') + alpha = 1 - level # blp of the orthogonal signal g_hat = self._blp_model.predict(basis) # calculate se for basis elements - se_scaling = 1 - blp_se = np.sqrt((basis.to_numpy().dot(self._blp_omega) * basis.to_numpy()).sum(axis=1)) / se_scaling + blp_se = np.sqrt((basis.to_numpy().dot(self._blp_omega) * basis.to_numpy()).sum(axis=1)) if joint: # calculate the maximum t-statistic with bootstrap normal_samples = np.random.normal(size=[basis.shape[1], n_rep_boot]) bootstrap_samples = np.multiply(basis.to_numpy().dot(np.dot(sqrtm(self._blp_omega), normal_samples)).T, - (blp_se * se_scaling)) + blp_se) max_t_stat = np.quantile(np.max(np.abs(bootstrap_samples), axis=0), q=level) diff --git a/doubleml/double_ml_irm.py b/doubleml/double_ml_irm.py index 329dacb1..459de258 100644 --- a/doubleml/double_ml_irm.py +++ b/doubleml/double_ml_irm.py @@ -370,7 +370,7 @@ def gate(self, groups, joint=False, level=0.95, n_rep_boot=500): if groups.shape[1] == 1: groups = pd.get_dummies(groups, prefix='Group', prefix_sep='_') else: - raise TypeError('Columns must be of of bool or int type or the data frame only should contain' + raise TypeError('Columns must be of of bool or int type or the data frame only should contain ' 'one column.') # define the orthogonal signal diff --git a/doubleml/tests/_utils_blp_manual.py b/doubleml/tests/_utils_blp_manual.py index 213a2b16..aea79d66 100644 --- a/doubleml/tests/_utils_blp_manual.py +++ b/doubleml/tests/_utils_blp_manual.py @@ -3,7 +3,6 @@ from scipy.linalg import sqrtm from scipy.stats import norm import pandas as pd -import patsy def fit_blp(orth_signal, basis): diff --git a/doubleml/tests/test_blp.py b/doubleml/tests/test_blp.py index 477437c1..d932965a 100644 --- a/doubleml/tests/test_blp.py +++ b/doubleml/tests/test_blp.py @@ -2,7 +2,6 @@ import pandas as pd import pytest -from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor import doubleml as dml from ._utils_blp_manual import fit_blp, blp_confint @@ -21,46 +20,25 @@ def ci_level(request): @pytest.fixture(scope='module') def dml_blp_fixture(ci_joint, ci_level): + n = 50 np.random.seed(42) - n = 200 + random_basis = pd.DataFrame(np.random.normal(0, 1, size=(n, 3))) + random_signal = np.random.normal(0, 1, size=(n, )) - # collect data - np.random.seed(42) - obj_dml_data = dml.datasets.make_irm_data(n_obs=n, dim_x=5) - - # First stage estimation - ml_m = RandomForestRegressor(n_estimators=100) - ml_g = RandomForestClassifier(n_estimators=100) - - dml_irm_obj = dml.DoubleMLIRM(obj_dml_data, - ml_g=ml_m, - ml_m=ml_g, - trimming_threshold=0.05, - n_folds=5) - - dml_irm_obj.fit() - - # create a random basis - random_basis = pd.DataFrame(np.random.normal(0, 1, size=(n, 5))) - - # cate - cate = dml_irm_obj.cate(random_basis) - - # get the orthogonal signal from the IRM model - orth_signal = dml_irm_obj.psi_b.reshape(-1) - cate_manual = fit_blp(orth_signal, random_basis) + blp = dml.DoubleMLIRMBLP(random_signal, random_basis).fit() + blp_manual = fit_blp(random_signal, random_basis) np.random.seed(42) - ci = cate.confint(random_basis, joint=ci_joint, level=ci_level, n_rep_boot=1000) + ci = blp.confint(random_basis, joint=ci_joint, level=ci_level, n_rep_boot=1000) np.random.seed(42) - ci_manual = blp_confint(cate_manual, random_basis, joint=ci_joint, level=ci_level, n_rep_boot=1000) - - res_dict = {'coef': cate.blp_model.params, - 'coef_manual': cate_manual.params, - 'values': cate.blp_model.fittedvalues, - 'values_manual': cate_manual.fittedvalues, - 'omega': cate.blp_omega, - 'omega_manual': cate_manual.cov_HC0, + ci_manual = blp_confint(blp_manual, random_basis, joint=ci_joint, level=ci_level, n_rep_boot=1000) + + res_dict = {'coef': blp.blp_model.params, + 'coef_manual': blp_manual.params, + 'values': blp.blp_model.fittedvalues, + 'values_manual': blp_manual.fittedvalues, + 'omega': blp.blp_omega, + 'omega_manual': blp_manual.cov_HC0, 'ci': ci, 'ci_manual': ci_manual} diff --git a/doubleml/tests/test_doubleml_exceptions.py b/doubleml/tests/test_doubleml_exceptions.py index 875902d1..be2794cb 100644 --- a/doubleml/tests/test_doubleml_exceptions.py +++ b/doubleml/tests/test_doubleml_exceptions.py @@ -2,7 +2,7 @@ import pandas as pd import numpy as np -from doubleml import DoubleMLPLR, DoubleMLIRM, DoubleMLIIVM, DoubleMLPLIV, DoubleMLData, DoubleMLClusterData +from doubleml import DoubleMLPLR, DoubleMLIRM, DoubleMLIIVM, DoubleMLPLIV, DoubleMLData, DoubleMLClusterData, DoubleMLIRMBLP from doubleml.datasets import make_plr_CCDDHNR2018, make_irm_data, make_pliv_CHS2015, make_iivm_data, \ make_pliv_multiway_cluster_CKMS2021 @@ -652,3 +652,59 @@ def test_doubleml_nan_prediction(): msg = r'Predictions from learner LassoWithInfPred\(\) for ml_m are not finite.' with pytest.raises(ValueError, match=msg): _ = DoubleMLPLR(dml_data, ml_l, LassoWithInfPred()).fit() + +@pytest.mark.ci +def test_doubleml_exception_blp(): + random_basis = pd.DataFrame(np.random.normal(0, 1, size=(2, 3))) + signal = np.array([1, 2]) + + msg = "The signal must be of np.ndarray type. Signal of type was passed." + with pytest.raises(TypeError, match=msg): + DoubleMLIRMBLP(orth_signal=1, basis=random_basis) + msg = 'The signal must be of one dimensional. Signal of dimensions 2 was passed.' + with pytest.raises(ValueError, match=msg): + DoubleMLIRMBLP(orth_signal=np.array([[1], [2]]), basis=random_basis) + msg = "The basis must be of DataFrame type. Basis of type was passed." + with pytest.raises(TypeError, match=msg): + DoubleMLIRMBLP(orth_signal=signal, basis=1) + msg = 'Invalid pd.DataFrame: Contains duplicate column names.' + with pytest.raises(ValueError, match=msg): + DoubleMLIRMBLP(orth_signal=signal, basis=pd.DataFrame(np.array([[1, 2], [4, 5]]), + columns=['x_1', 'x_1'])) + + + dml_blp_confint = DoubleMLIRMBLP(orth_signal=signal, basis=random_basis) + msg = 'joint must be True or False. Got 1.' + with pytest.raises(TypeError, match=msg): + dml_blp_confint.confint(random_basis, joint=1) + msg = "The confidence level must be of float type. 5% of type was passed." + with pytest.raises(TypeError, match=msg): + dml_blp_confint.confint(random_basis, level='5%') + msg = r'The confidence level must be in \(0,1\). 0.0 was passed.' + with pytest.raises(ValueError, match=msg): + dml_blp_confint.confint(random_basis, level=0.) + msg = "The number of bootstrap replications must be of int type. 500 of type was passed." + with pytest.raises(TypeError, match=msg): + dml_blp_confint.confint(random_basis, n_rep_boot='500') + msg = 'The number of bootstrap replications must be positive. 0 was passed.' + with pytest.raises(ValueError, match=msg): + dml_blp_confint.confint(random_basis, n_rep_boot=0) + msg = r'Apply fit\(\) before confint\(\).' + with pytest.raises(ValueError, match=msg): + dml_blp_confint.confint(random_basis) + +@pytest.mark.ci +def test_doubleml_exception_gate(): + dml_irm_obj = DoubleMLIRM(dml_data_irm, + ml_g=Lasso(), + ml_m=LogisticRegression(), + trimming_threshold=0.05, + n_folds=5) + dml_irm_obj.fit() + + msg = "Groups must be of DataFrame type. Groups of type was passed." + with pytest.raises(TypeError, match=msg): + dml_irm_obj.gate(groups=2) + msg = 'Columns must be of of bool or int type or the data frame only should contain one column.' + with pytest.raises(TypeError, match=msg): + dml_irm_obj.gate(groups=pd.DataFrame(np.random.normal(0, 1, size=(50, 3)))) diff --git a/doubleml/tests/test_irm.py b/doubleml/tests/test_irm.py index 12370a4b..c4df18ec 100644 --- a/doubleml/tests/test_irm.py +++ b/doubleml/tests/test_irm.py @@ -1,4 +1,5 @@ import numpy as np +import pandas as pd import pytest import math @@ -118,3 +119,37 @@ def test_dml_irm_boot(dml_irm_fixture): assert np.allclose(dml_irm_fixture['boot_t_stat' + bootstrap], dml_irm_fixture['boot_t_stat' + bootstrap + '_manual'], rtol=1e-9, atol=1e-4) + + +@pytest.mark.ci +def test_dml_irm_cate_gate(): + n = 50 + # collect data + np.random.seed(42) + obj_dml_data = dml.datasets.make_irm_data(n_obs=n, dim_x=2) + + # First stage estimation + ml_g = RandomForestRegressor(n_estimators=10) + ml_m = RandomForestClassifier(n_estimators=10) + + dml_irm_obj = dml.DoubleMLIRM(obj_dml_data, + ml_m=ml_m, + ml_g=ml_g, + trimming_threshold=0.05, + n_folds=5) + + dml_irm_obj.fit() + # create a random basis + random_basis = pd.DataFrame(np.random.normal(0, 1, size=(n, 5))) + cate = dml_irm_obj.cate(random_basis) + assert isinstance(cate, dml.double_ml_blp.DoubleMLIRMBLP) + + groups_1 = pd.DataFrame(np.column_stack([obj_dml_data.data['X1'] <= 0, + obj_dml_data.data['X1'] > 0.2]), + columns=['Group 1', 'Group 2']) + assert isinstance(dml_irm_obj.gate(groups_1), pd.DataFrame) + + groups_2 = pd.DataFrame(np.random.choice(["1", "2"], n)) + assert isinstance(dml_irm_obj.gate(groups_2), pd.DataFrame) + + From fb0809c76326eef0d77f5af8dfc74772e86b9b63 Mon Sep 17 00:00:00 2001 From: SvenKlaassen <47529404+SvenKlaassen@users.noreply.github.com> Date: Fri, 9 Sep 2022 13:30:50 +0200 Subject: [PATCH 06/15] change to fit PEP8 standards --- doubleml/double_ml_blp.py | 2 +- doubleml/tests/test_blp.py | 5 ++++- doubleml/tests/test_doubleml_exceptions.py | 5 +++-- doubleml/tests/test_irm.py | 5 ++--- 4 files changed, 10 insertions(+), 7 deletions(-) diff --git a/doubleml/double_ml_blp.py b/doubleml/double_ml_blp.py index 3465c5a8..1e133d4c 100644 --- a/doubleml/double_ml_blp.py +++ b/doubleml/double_ml_blp.py @@ -167,4 +167,4 @@ def confint(self, basis, joint=False, level=0.95, n_rep_boot=500): df_ci = pd.DataFrame(ci, columns=['{:.1f} %'.format(alpha/2 * 100), 'effect', '{:.1f} %'.format((1-alpha/2) * 100)], index=basis.index) - return df_ci \ No newline at end of file + return df_ci diff --git a/doubleml/tests/test_blp.py b/doubleml/tests/test_blp.py index d932965a..5d60cf2f 100644 --- a/doubleml/tests/test_blp.py +++ b/doubleml/tests/test_blp.py @@ -12,6 +12,7 @@ def ci_joint(request): return request.param + @pytest.fixture(scope='module', params=[0.95, 0.9]) def ci_level(request): @@ -51,21 +52,23 @@ def test_dml_blp_coef(dml_blp_fixture): dml_blp_fixture['coef_manual'], rtol=1e-9, atol=1e-4) + @pytest.mark.ci def test_dml_blp_values(dml_blp_fixture): assert np.allclose(dml_blp_fixture['values'], dml_blp_fixture['values_manual'], rtol=1e-9, atol=1e-4) + @pytest.mark.ci def test_dml_blp_omega(dml_blp_fixture): assert np.allclose(dml_blp_fixture['omega'], dml_blp_fixture['omega_manual'], rtol=1e-9, atol=1e-4) + @pytest.mark.ci def test_dml_blp_ci(dml_blp_fixture): assert np.allclose(dml_blp_fixture['ci'], dml_blp_fixture['ci_manual'], rtol=1e-9, atol=1e-4) - diff --git a/doubleml/tests/test_doubleml_exceptions.py b/doubleml/tests/test_doubleml_exceptions.py index be2794cb..a8dfe4b5 100644 --- a/doubleml/tests/test_doubleml_exceptions.py +++ b/doubleml/tests/test_doubleml_exceptions.py @@ -653,6 +653,7 @@ def test_doubleml_nan_prediction(): with pytest.raises(ValueError, match=msg): _ = DoubleMLPLR(dml_data, ml_l, LassoWithInfPred()).fit() + @pytest.mark.ci def test_doubleml_exception_blp(): random_basis = pd.DataFrame(np.random.normal(0, 1, size=(2, 3))) @@ -670,8 +671,7 @@ def test_doubleml_exception_blp(): msg = 'Invalid pd.DataFrame: Contains duplicate column names.' with pytest.raises(ValueError, match=msg): DoubleMLIRMBLP(orth_signal=signal, basis=pd.DataFrame(np.array([[1, 2], [4, 5]]), - columns=['x_1', 'x_1'])) - + columns=['x_1', 'x_1'])) dml_blp_confint = DoubleMLIRMBLP(orth_signal=signal, basis=random_basis) msg = 'joint must be True or False. Got 1.' @@ -693,6 +693,7 @@ def test_doubleml_exception_blp(): with pytest.raises(ValueError, match=msg): dml_blp_confint.confint(random_basis) + @pytest.mark.ci def test_doubleml_exception_gate(): dml_irm_obj = DoubleMLIRM(dml_data_irm, diff --git a/doubleml/tests/test_irm.py b/doubleml/tests/test_irm.py index c4df18ec..52c9aeae 100644 --- a/doubleml/tests/test_irm.py +++ b/doubleml/tests/test_irm.py @@ -9,6 +9,7 @@ from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor import doubleml as dml +from doubleml.datasets import make_irm_data from ._utils import draw_smpls from ._utils_irm_manual import fit_irm, boot_irm @@ -126,7 +127,7 @@ def test_dml_irm_cate_gate(): n = 50 # collect data np.random.seed(42) - obj_dml_data = dml.datasets.make_irm_data(n_obs=n, dim_x=2) + obj_dml_data = make_irm_data(n_obs=n, dim_x=2) # First stage estimation ml_g = RandomForestRegressor(n_estimators=10) @@ -151,5 +152,3 @@ def test_dml_irm_cate_gate(): groups_2 = pd.DataFrame(np.random.choice(["1", "2"], n)) assert isinstance(dml_irm_obj.gate(groups_2), pd.DataFrame) - - From 756330f73dd26428ccb13cbdf8ed35f62caf0c9e Mon Sep 17 00:00:00 2001 From: SvenKlaassen Date: Mon, 12 Sep 2022 13:34:18 +0200 Subject: [PATCH 07/15] correct variance scaling for blp --- doubleml/double_ml_blp.py | 2 +- doubleml/double_ml_irm.py | 2 +- doubleml/tests/_utils_blp_manual.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/doubleml/double_ml_blp.py b/doubleml/double_ml_blp.py index 1e133d4c..a52e0939 100644 --- a/doubleml/double_ml_blp.py +++ b/doubleml/double_ml_blp.py @@ -148,7 +148,7 @@ def confint(self, basis, joint=False, level=0.95, n_rep_boot=500): # calculate the maximum t-statistic with bootstrap normal_samples = np.random.normal(size=[basis.shape[1], n_rep_boot]) bootstrap_samples = np.multiply(basis.to_numpy().dot(np.dot(sqrtm(self._blp_omega), normal_samples)).T, - blp_se) + (1.0 / blp_se)) max_t_stat = np.quantile(np.max(np.abs(bootstrap_samples), axis=0), q=level) diff --git a/doubleml/double_ml_irm.py b/doubleml/double_ml_irm.py index 459de258..dbb192dd 100644 --- a/doubleml/double_ml_irm.py +++ b/doubleml/double_ml_irm.py @@ -8,6 +8,7 @@ from ._utils import _dml_cv_predict, _get_cond_smpls, _dml_tune, _check_finite_predictions + class DoubleMLIRM(DoubleML): """Double machine learning for interactive regression models @@ -384,4 +385,3 @@ def gate(self, groups, joint=False, level=0.95, n_rep_boot=500): df_ci.index = groups.columns.values return df_ci - diff --git a/doubleml/tests/_utils_blp_manual.py b/doubleml/tests/_utils_blp_manual.py index aea79d66..3d9b5721 100644 --- a/doubleml/tests/_utils_blp_manual.py +++ b/doubleml/tests/_utils_blp_manual.py @@ -22,7 +22,7 @@ def blp_confint(blp_model, basis, joint=False, level=0.95, n_rep_boot=500): if joint: # calculate the maximum t-statistic with bootstrap normal_samples = np.random.normal(size=[basis.shape[1], n_rep_boot]) - bootstrap_samples = np.multiply(basis.dot(np.dot(sqrtm(blp_omega), normal_samples)).T, blp_se) + bootstrap_samples = np.multiply(basis.dot(np.dot(sqrtm(blp_omega), normal_samples)).T, (1.0 / blp_se)) max_t_stat = np.quantile(np.max(np.abs(bootstrap_samples), axis=0), q=level) From a83fe0f10a4e682ffcb731bf1368314e2dc36443 Mon Sep 17 00:00:00 2001 From: SvenKlaassen Date: Mon, 12 Sep 2022 14:14:34 +0200 Subject: [PATCH 08/15] correct typos --- doubleml/double_ml_blp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doubleml/double_ml_blp.py b/doubleml/double_ml_blp.py index a52e0939..7277eaf5 100644 --- a/doubleml/double_ml_blp.py +++ b/doubleml/double_ml_blp.py @@ -7,7 +7,7 @@ class DoubleMLIRMBLP: - """Best Linear Predictor for DoubleML IRM models + """Best linear predictor for DoubleML IRM models Parameters ---------- From 6385162b6c12888c613f4680c11b2cb2117a1657 Mon Sep 17 00:00:00 2001 From: SvenKlaassen Date: Fri, 4 Nov 2022 10:22:42 +0100 Subject: [PATCH 09/15] refactor BLP, CATE and GATE --- .coverage | Bin 0 -> 69632 bytes doubleml/__init__.py | 4 +- doubleml/double_ml_blp.py | 37 ++++++++++--- doubleml/double_ml_irm.py | 57 +++++++++----------- doubleml/tests/test_blp.py | 20 +++++-- doubleml/tests/test_doubleml_exceptions.py | 58 +++++++++++++++++---- doubleml/tests/test_irm.py | 20 +++++-- 7 files changed, 137 insertions(+), 59 deletions(-) create mode 100644 .coverage diff --git a/.coverage b/.coverage new file mode 100644 index 0000000000000000000000000000000000000000..c54090593472f4773cf5f4f99e12a7949991082a GIT binary patch literal 69632 zcmeHQ3sf7|nU)x#w`Pnn#(**M6R>$>-WdE&>^Se#-LA0}0RvVH64t|jZ7eJ4Imx!0 zwr6`{cC&TdY?4j0-Qb+0HF0cE&bCQULlT;_v6~Q}-K|YGF;)t-6%f+wf3HSD!ge|% zc2BZBS2p*KX6FCyf4~2~|IB@i=Kgzkne42-$zp9a+V#bf1rn)LvQe*>NF*xwOMt)7 zM*$nM&=<(15!<)gR7vu_zEj8SmT2WCB+NeD-!Qq_=XCj+kFz9!s<=tMpJ{)ZZbFP?fco#*$x}qh^~0B53YIW#5^pIwhX35 zkg~JQ0MTYOwHmFR`U7ld4nBb|&CYh%gF66(H8q>j8Tu?#Nn^-GrZeVReEwW6F z+3M^^oN7gKVxsg>7T3vWZGiupsM^4e*{bYl0-y$qwGnAwZ#9}5_OrGey)j(F1`Fuf zVGq{UYH7{4Tl6g^Gp?rDX0n?sW)3LJtNd`LZI| znH9c?B!4gR5b-Z5%PdyI{R1_H4l zbXt?c+<=;j4b`E+a)`AWo7p`6XfP~%*^UIRQ>cx5mvP#E4+yHezeT5OWq`mh7Pr|Mw`PL3M5|?BtsIjQ8Ls<_#;rAT!lP&Q`)Vn!=Gm; z8oQkN&7AmC>GAmG59Xa}OC?AeyupO;%{K5QBqP06jXf(Fi9S z>n#pD_FwcByC=H#%<3BinEf8&9?r_fr$h6Gf^3zcFZ#^=MjHw~woHA!#nQqW&B03o zoVddVMVV}RT=b0>quN;vTPMP3K9q@730+fAbbkQaTBupUMr*6l+{j<1(A77H3Lzi0 zNmNcSc7UKh6WXE;TIL+g5Ga2X1*#B!UK>z>P+N_S!B&oJ5rNpgpS=MtcoEuaw6k_o zE2~GRU}dmBwHs~v1}h5(;S!piNqv#^k&z6AC`G+wCAyJuWk?mt>()tK4!#Wr!;8TN z*YZ|lNNxUWT$FFjIXLSkcSD?oo194ZK-@*pfxZSyKrt<7!TKNzp|ki~9Cfx1w3ISF z3H&D?!T@1_FhCd}3=jqg1B3y>0AYYIKo}ql5C;DG7@(waQU&V&%b35BFyCh$W}0Aw zdJxBzYBW$sl&jbzOG66Ss8&&(g0QNa}?ClUq-1B3y>0AYYIKo}ql5C#YXgaN_; zVSq63B{HC;(xst+2bq#elA>`3N<*bd`9X#_4VA764KKulrfB^?d=eNRlSO;!bM(XX zPI{s4L){N`&ARnkpY~O4yS6|xq4|mCAU_VZAwnrtF$VM6jO>b zii3)^@+l!T%cp|j5B^Mq>%Wdlnf3HKM8#bI8+FGmzD+}M@ z`AvAn4n0i>&se}?TsC+@WLp&gGU5Tv7DI#8VzU{VOm=|TS_v>Z@i4897Q4w{N6(Z1 z%9aX1(TGQBXffJsrX~|>1&~eU0HP2KVsO|^EjCONJdEQowg5y~86|NDMnF^}c@V5A zg|ek$B~6D6c88e-`P)lCJ}CzIHlx*UGPW2Fhmdxy2c&etNMRO0mszYZAzO+8O|)Sm zv7mB0iU1^8uu43w5g;Am+TB?QP#J=uZdN-03s-Mz0lKE~?^8!r!# z=4ZT)+1hQ%1+@7XuLCN#JqJJ%1T#Ksyg7lQd>tie7Y_p(RIQ~XT_Poo4F}Nkt8JZ| zvq628VChJQ342LR7GMZ(+&o4@i^FEefoWSN$cqfMI6k%;I}Nii+t-Mj2>Bu>GMV1ptUK2mL4CB8C9e2zP8SqU&AeG|b%NBk84B0AzDn6Pcj zmWz(~AP+3o=%H-kP7$QhbSTW@mW&t(K=L+6eY4f#XhWvlz6?M_Hck@)nPZKu>3}0L z4q}`rb5t&+Bu4~07T-+ZVOw+uusIDh5Zz9(4q;oCq*9WDBFl%O%Tq*0CR8p+-nRsl z&K6uE_JPoio59A~4K`zI8$2s*Yl9y_KtIm`m|K&@n_?C-ba%INF`%TxsF{_uJFI4d zy|WG9Deg)FEa4ZW=vuZ~8rc>@Bim$z8$RT+w=V)Tkr&Hw{n(BMwhjMG0U%W;ijHHA zt)L5fx*DVxikAjSuo}L5PJmhn59z$Gqv9LjJ_~+8z|?@ezIFk?P~yxK1+a+$0MQY! zzNIZ90@9)*AjpIAg*r-N5}-RGoTsf!E2b>?-Ge6NO;A>uMvS1rU^1KR1_KgQ3lj{w zm;nsLx9w6DC9w&q9qx!B!5}N8O0jam>q(hHtfCM!4nYko<-(#a?}O0$LqR7PZ1UqN z2`iwIL0LqnOstlCb1R}K$pHZ+h3x=^mBxvsGT0d_lnQgqpqoIqdqH&^q*CS#>Vh-O z8C?lePbV_jx{v5P=tr0nx~EldYR{^+%a>^jRSzqA<3Ex;M!gU>Abmr(Upu4i&`fH6 zsyU!ZSHGwJwz^E^S4otwDs9SQ#k9hzSSKHsKPTT8e=@!yK23H(RzrPG?W0oSTydM? zl+xqUO39?;Ur@nc{JBTwS}F2&;|u%$)!St*CI-MccMetCWG-C{Fwy=0%B?b&CI*zK z+mDLPGM7R)2;UwMN#Q$)@^Z={906bHf-YB9D|1OjOF~DIzpF0YA#)|gB0t+_=aEY8 zl)2J{BZXOr=+s7Zkk`j#m`E&W$HhBku4LgV@w7&O%=Z6_?vlZei3o@z2*g%*v)V=W z{|jUC-YCtWeg*S2Ubz3CKY!yz^#AkbYdqBd&%I6NN)XQYtnuaqiku3nJq8%iU|luU zB~~)r|6jXB=28ilj&zuCRLB|0`~Pb;P)^~DV1ptU zKEAAup)yE(jytT1!8Z|1RK#ByQ^ZFwVcV_{AMrsRymGlXrwGy*(f`-SLcrYqe?}}W zag+Z4vKZpvjs5@hD(Z-E$Ksm_-~V44Q#+0B|EHBw2gQ~TmP{2NnfN|J%Kb7|w(t_M z4}|-&x9I;biNO@Jn4$iE@*Of)O01gA>HjYlcTtM20AYYIKo}ql5C#YXgaN_;VSq3| z82Bq>fTDCNH2=R*p^`9O=4XtSeuh3yAE1@GU+G@bwd*XpYVDBrm)i5%f73ppJ)$kr zyshceJgw=_+@&egWU6P>pQ%4o|A+cV>c3OBs<*0h)yXQK>i4P_RoyD9>LJzrs%llH zO067M{#N;#@}%-HrA4`0S*BzZ!-`)hdK8Bh&5C;z6^b>W0QnFG2m^!x!T@35%U~dF zqfDwQ-`I0`a;kdJHr75dUE%i!hP{64+4J!UwQz7%ZO;T8x_s?)U|(KxQVs0NsexUm zu1_C0%}w`rcXwN)NgL4a4d!>oI-eRmAGkI&=DTF=)Ff0RfNCCKlqg=hRltF@73Mw1o*J8+zBUw8yPzEQ)|Z?6x%L@fAkWM74!FJhM*5%VwXpu) z6I`doeYCTC&-A&`_iLv|@9Q1fy`~H#?kd}N0C0-Eo{N@I@4%&r6Q|C)M-9z>zrWAh z1zK99>7{U9bt$k_Jvdq29(ebt|7^hTJL2_Yb&^Zqa6w7Wlfc5*$zR{?_Z{>2j(y|A zp5*oD!1^bz_w`%42c8%_H*x9G`&{me>cnC=kX4K%YMwVZp_rV3+_WOtSzgpr0%Z)^ znmvE0x#n?m-u}`20jp+dAsnnMEFJd@d&ghq`hOG1<0ke^O<4b%JFz=Hu>j!}V7#-Z z-9RqF%SU+mLA>HQcxid)U|tZV0~XWQ5_1t;ZZup{4(!ay!4^E~ z{}l8)DXATt{3qa$x1D?~?55XFtjvb|_UzK}Q7>@tmUWDUp^t;stO0H2xsW#H4RKIGERU8zc44M$2> z2iZ93X>|5@{k_-=Ez(7+Q2DERo*bM!`m%HQoX68Wa49!wCG5>vS$80Sp&Y!mGgrXw zdsdVVyFGoq15V$e;pco0O+l5qoKuE=Lw~;C-}{rH@vivP<*3Bv`-Wz~ExzyPRk_q; z@VK4ndN@|C4{9@Z_LcU_fgUGRVLIsfOETbaQ3g_|y7tsrr@xPzLiM<$NLmI*a+YCL zF9)W<1#y#2N=LiXdrF*7jZOU_@ETt+^-|cqccy(9Yq<3oK% zx<{@Z-W%Wy{mxNCpqm?T|JghGexK*-bSj*>EcM-5=fLQ`KyF!53PMi_qQC9G);@{} z%31=u?^|*va5Qjod}@QwcV+5|`|V!0+xhOyB(x9z$SZFAsm)A=Gw(@0)6Y%ydnTuz zsxp|TMy&(f2`-o06WrVr$Q|`|r7ngOOBb83`(~gKz3v@3vaG*#=uGR7b7WYT1jm*p zo#`K&oSGcVXj~L8Rk?0oWWH|Ujz0O1S4Up!ogSI-`Hux=Mt!wz?v%6U6lde6fVn}x zm;0D|*WwBIoUizXhlZz`|NigFJiRH;4-fbKdi_ zGwzvbzbz4!LMNK1F4X09JKdvhZp?Yzd2Dcy8yxGY7#r+*XY!QqNn4=GTQYd++-1(| z898~HYd_LH*){6%*`6928TJSK=a?l6LHnYG=83VX$tiBe8K~ug|D1^la3m|?qvAuJ z^gDWbPkFOnj@!9l0qk6{VE1J8#n*g?U$hM5YZ%zGgsB<$+8y`SE}+q2x~F>3;^wSp z>++HfI;3cg?!w_X$??7C7D^;nCTUO4+vx4=5=r|JZYSP&sf*u`4DI2!Uhlo&wtgCT z0w0%jdHQ^(gFHHt&g*9rz}%X z6?s1#ow#%AUyt5@%riXB9yv2}*FL!hL}*Q1nHusowOJl4BA6ckjPtm!e89QAM{3{f zt?jHGVFrhWxaUXwN3Q+)9M|t2_U0~Afnbda1fM4{@A$2zzdJm*KfpE$U7r*oa;vj?Qd+q+*dDiB$l z701$QpOS`UU;m8rIej{aJ)<)LZuB$QxH@v=w(t0MyFZ=$^OSSsV_&VKd33n5Yh-8| zVtt?gS?4dl(>LOsKII8~opU?gAB=zP$8N8evA1gy<)B)Y{KW4kucmbuf6wb54RG$| z@vw7i{Pe`c%%#b*?Mz37cW>wL#k)uQ+}GDlQI~2bq|Gv<+Fi1%DOj(KGZ(pOjz-%v z=lf27@Zx(LpK-mJ`uLlrsgG~xde5GIk2~~u!2KTgV%hY?Kf15wzJAK@_kA$*kG+M# zdVLd0<~(@2|L`;1_~Y9zzPNj&X;A7~PeDl!@27e`*>#-THR1Dr{BC=9^UEI2&80oQ z{h9anuTTB^?axSl_sHW1{!0rx(uRNgTiB<2+{awKdNmIqM}}Jmuk5%_EqTdz@ois{ zMB?>%_eqWqk8ddd^P4^Kq7!hqK&VoaN-;Y~4DXtzC<=>};H6W#KF{6K8AI;B573oUK}gvz04xwqga& zmM_PdUXQbk44f@nhO_i^oGo37v$QmvrKaL6B?V_omf$Qo8E1(IHM?> z#l^{B2msIjOBQ^^U5uzl7$6J~1_%R$0m1-bfG|K9APf)&2m^!x!oa*3K=c2k|35DZ zi7;V+FhCd}3=jqg1B3y>0AYYIKo}ql5C*>T44~)#H__+N`v2?9pO`Dm81tXZFgyeB zYvx_%m(0%@H}e`i2k;W}W9DC(r0AYYIKp6OP7@#)6@7GfsQBsSN8kB56Ni|BU zP*RDK3Y3(iqzomcC@De7dXyBSqzEO2C@DZmK1%XXl8cfYl&nL^T9jm?Bnu^(C|QG& z)hJnol9eb~fs*AY(W4{-CCgBfj*_J)Nkd60N>Wg=1SQERS&Wh-lq^C?B1#sbBmpH0 zP{N>uMu`q3T9jx|qDF}dB}$YiP$EZ3JW6CJp->Ws68!uBaY~6^`i6vg7oOWcuPb5d z=|m=5_Yr*u{RneH_q6Iw?OD}!`7&*x>S0B1{716Ks2Ab}q;Kfi5*&R+p*#Dv9z{rA=9^m{wR7>*V9|=j8k1PsTUIr^znJYN*eteN;-ED{fPqQhHok zDVdb~%U^aKICrIx7bv3FuiEPHOIXpX?X4_)H?`}FGEHsQE!8;i0NIZSQ(?)VQc|C%Bd8|&ji0+KEw$K zmc{TkX!QQh&|9@jV|XhrmTxm!?IvT3;cy74MBLkY`L}}3!3w_~dp-5J(|hg{xO6yD-1%bsUEJ70lOo9U=G1~gb#O?8Q_g|Xp)v6%(owK2U{99JOh zCE4N{H}7-}Ee@NVwSo>=ifQRLMAiVu$97{Ugt!n)W(<}<@49a{+4mb-O*Wg!+-w7N z)`)-eHG;7_%!ts{F3fawri!5#I@R*RzznKC05mp~wMtg$SHt2nMqlsVD`zKI%fVXNbtNxj8_ z?f{m?)K0MuVOyq^QU`@Mvtap9bSm6@{YIn<0EZx%a=*-#ExbhR1L1es+gQ87W^8S1 zVGTBjH+DNTPrzIfgDGY)Lw9${cgS2Rv1(>z?GCHiVDD^Wk&`VJcTtM2Wviu;Z80>m zO-4tH9ib(~aIp;6kL_q++w3Nb86hnaAIBP7K^LRl2-1l$NP{F;4VAr6TomMe9TndI z_eNH>(bRyvK4GKGMTs<16u^R*B4B+>+iV161isQe*lMwLK^~N+izt&2-4WqDZ8}k9 zHCW6|s5~f3D@xE{Fqutug8>9J0tLhGoj0JE0)pzl^zwN}eGA*#Qa39YWJM)fE_gjr zidK}rJyHmay1WlsSsZkN!A4#{u|iB3j+ODEwd9+dte83=q@*yNU;-GYu!)i}KxYo* z;tDB~N|Qq+D;7T; z#1G8;pA5R9_5an9rzOlg%oFgt{#md-{~Y}=y^~(3`%w2oU9)bz)~9_{+paCpOlW?h zc}TNV{VVmi)TOE!)f+08YMYW%_A0H)BE^*AjN+hTt^A7oS@~Xhdi;m+KZvi7PnCTj zdsMc8@=`yd_EJmY-ikXOw=qs3{khaBt&mJW!GGyHprtp zxKb$l3+`e=BiPaRC$sbax0ir?lGu_XrfS18mWwBi<6|9%kE)9bBf~=!SFrlZL7pr z28qvchb=1sMx<{dnCOVV0zgDZd;}A=Z5gai6dCbB9=x(f4`mB?3fwO6Z4l4nmW&t( zm^=T!eHnm=Y@Aq$Io8;k4mcv?;Enfmm9U`mh+xO!n+ZSvzc~#w5Zz9(4q;oCz%tT< zBFhI$mcznRk&y|N3zGLO0j0AAmxz5JbSDrTTDo!me`~UMQ_Nz9=Kpst29%T-HJda4 zzbgr_gkO}RYk9-`|LuzaP2|NgT)(LK|7utgEHaKo%>P%zT4J%%vlH3d5}+2sLpty4 zH_ZRnE&v!xoSCBL|2HuJAUXm@&HtCd0&0;FFgVj)2#c;w0(6g<|1X0j+QQ3 zW!=IBBPMpsU@5o|!Jvy7U~rai+oiC4+$N-U7Bq5z1cR)U!isZoa>46K87xv4rzk)F zUkMA_g+yK62czcy^I<(aE1;4=S%`RLuy$UYmi+vG5v;dAAfTkM9iXsMSe`GA%3x=# z5LWaHam)xeLH+-D`m6-r_SecRqkjjR0AYYIKo}ql5C#YXgn_RP z1FJVtl6!C6uMVxAtA^hP5INd~CPw+O>N{&;lquuZrG&=kqGva6xq9x-n%LoPvU+aw S27nVjbd1L*QRO06&;1W$loSI1 literal 0 HcmV?d00001 diff --git a/doubleml/__init__.py b/doubleml/__init__.py index ccd491ab..c4aa5971 100644 --- a/doubleml/__init__.py +++ b/doubleml/__init__.py @@ -5,7 +5,7 @@ from .double_ml_irm import DoubleMLIRM from .double_ml_iivm import DoubleMLIIVM from .double_ml_data import DoubleMLData, DoubleMLClusterData -from .double_ml_blp import DoubleMLIRMBLP +from .double_ml_blp import DoubleMLBLP __all__ = ['DoubleMLPLR', 'DoubleMLPLIV', @@ -13,6 +13,6 @@ 'DoubleMLIIVM', 'DoubleMLData', 'DoubleMLClusterData', - 'DoubleMLIRMBLP'] + 'DoubleMLBLP'] __version__ = get_distribution('doubleml').version diff --git a/doubleml/double_ml_blp.py b/doubleml/double_ml_blp.py index 7277eaf5..c49730e5 100644 --- a/doubleml/double_ml_blp.py +++ b/doubleml/double_ml_blp.py @@ -6,8 +6,8 @@ from scipy.linalg import sqrtm -class DoubleMLIRMBLP: - """Best linear predictor for DoubleML IRM models +class DoubleMLBLP: + """Best linear predictor (BLP) for DoubleML with orthogonal signals. Parameters ---------- @@ -17,11 +17,16 @@ class DoubleMLIRMBLP: basis : :class:`pandas.DataFrame` The basis for estimating the best linear predictor. Has to have the shape (n,d), where d is the number of predictors. + + is_gate : bool + Indicates whether the basis is constructed for GATEs (dummy-basis). + Default is ``False``. """ def __init__(self, orth_signal, - basis): + basis, + is_gate=False): if not isinstance(orth_signal, np.ndarray): raise TypeError('The signal must be of np.ndarray type. ' @@ -41,6 +46,7 @@ def __init__(self, self._orth_signal = orth_signal self._basis = basis + self._is_gate = is_gate # initialize the score and the covariance self._blp_model = None @@ -89,15 +95,16 @@ def fit(self): return self - def confint(self, basis, joint=False, level=0.95, n_rep_boot=500): + def confint(self, basis=None, joint=False, level=0.95, n_rep_boot=500): """ - Confidence intervals for BLP for DoubleML IRM. + Confidence intervals for the BLP model. Parameters ---------- basis : :class:`pandas.DataFrame` The basis for constructing the confidence interval. Has to have the same form as the basis from - the construction. + the construction. If ``None`` the basis for the construction of the model is used. + Default is ``None`` joint : bool Indicates whether joint confidence intervals are computed. @@ -138,6 +145,20 @@ def confint(self, basis, joint=False, level=0.95, n_rep_boot=500): raise ValueError('Apply fit() before confint().') alpha = 1 - level + gate_names = None + # define basis if none is supplied + if basis is None: + if self._is_gate: + # reduce to unique groups + basis = pd.DataFrame(np.diag(v=np.full((self._basis.shape[1]), True))) + gate_names = list(self._basis.columns.values) + else: + basis = self._basis + elif not (basis.shape[1] == self._basis.shape[1]): + raise ValueError('Invalid basis: DataFrame has to have the exact same number and ordering of columns.') + elif not list(basis.columns.values) == list(self._basis.columns.values): + raise ValueError('Invalid basis: DataFrame has to have the exact same number and ordering of columns.') + # blp of the orthogonal signal g_hat = self._blp_model.predict(basis) @@ -167,4 +188,8 @@ def confint(self, basis, joint=False, level=0.95, n_rep_boot=500): df_ci = pd.DataFrame(ci, columns=['{:.1f} %'.format(alpha/2 * 100), 'effect', '{:.1f} %'.format((1-alpha/2) * 100)], index=basis.index) + + if self._is_gate and gate_names is not None: + df_ci.index = gate_names + return df_ci diff --git a/doubleml/double_ml_irm.py b/doubleml/double_ml_irm.py index 69d41fcd..aaf08b26 100644 --- a/doubleml/double_ml_irm.py +++ b/doubleml/double_ml_irm.py @@ -1,11 +1,11 @@ import numpy as np import pandas as pd +import warnings from sklearn.utils import check_X_y from sklearn.utils.multiclass import type_of_target from .double_ml import DoubleML -from .double_ml_blp import DoubleMLIRMBLP - +from .double_ml_blp import DoubleMLBLP from ._utils import _dml_cv_predict, _get_cond_smpls, _dml_tune, _check_finite_predictions @@ -334,43 +334,40 @@ def cate(self, basis): Returns ------- - model : :class:`doubleML.DoubleMLIRMBLP` + model : :class:`doubleML.DoubleMLBLP` Best linear Predictor model. """ + valid_score = ['ATE'] + if self.score not in valid_score: + raise ValueError('Invalid score ' + self.score + '. ' + + 'Valid score ' + ' or '.join(valid_score) + '.') + # define the orthogonal signal orth_signal = self.psi_b.reshape(-1) # fit the best linear predictor - model = DoubleMLIRMBLP(orth_signal, basis=basis).fit() + model = DoubleMLBLP(orth_signal, basis=basis).fit() return model - def gate(self, groups, joint=False, level=0.95, n_rep_boot=500): + def gate(self, groups): """ - Calculate group average treatment effects (GATE) for a given basis. + Calculate group average treatment effects (GATE) for mutually exclusive groups. Parameters ---------- groups : :class:`pandas.DataFrame` - The group indicator for estimating the best linear predictor. Has to have the shape (n,d), + The group indicator for estimating the best linear predictor. Has to be dummy coded with shape (n,d), where d is the number of groups or (n,1) and contain the corresponding groups. - joint : bool - Indicates whether joint confidence intervals are computed. - Default is ``False`` - - level : float - The confidence level. - Default is ``0.95``. - - n_rep_boot : int - Number of bootstrap samples for joint confidence interval. - Default is ``500``. - Returns ------- - df_ci : pd.DataFrame - A data frame with the confidence interval(s). + model : :class:`doubleML.DoubleMLBLPGATE` + Best linear Predictor model for Group Effects. """ + valid_score = ['ATE'] + if self.score not in valid_score: + raise ValueError('Invalid score ' + self.score + '. ' + + 'Valid score ' + ' or '.join(valid_score) + '.') if not isinstance(groups, pd.DataFrame): raise TypeError('Groups must be of DataFrame type. ' @@ -380,17 +377,15 @@ def gate(self, groups, joint=False, level=0.95, n_rep_boot=500): if groups.shape[1] == 1: groups = pd.get_dummies(groups, prefix='Group', prefix_sep='_') else: - raise TypeError('Columns must be of of bool or int type or the data frame only should contain ' - 'one column.') + raise TypeError('Columns of groups must be of bool type or int type (dummy coded). ' + 'Alternatively, groups should only contain one column.') + + if any(groups.sum(0) <= 5): + warnings.warn('At least one group effect is estimated with less than 6 observations.') # define the orthogonal signal orth_signal = self.psi_b.reshape(-1) - # fit the best linear predictor - model = DoubleMLIRMBLP(orth_signal, basis=groups).fit() + # fit the best linear predictor for GATE (different confint() method) + model = DoubleMLBLP(orth_signal, basis=groups, is_gate=True).fit() - # reduce to unique groups and create confidence interval - unique_groups = pd.DataFrame(np.diag(v=np.full((groups.shape[1]), True))) - df_ci = model.confint(unique_groups, joint, level, n_rep_boot) - df_ci.index = groups.columns.values - - return df_ci + return model diff --git a/doubleml/tests/test_blp.py b/doubleml/tests/test_blp.py index 5d60cf2f..2f125d50 100644 --- a/doubleml/tests/test_blp.py +++ b/doubleml/tests/test_blp.py @@ -26,11 +26,13 @@ def dml_blp_fixture(ci_joint, ci_level): random_basis = pd.DataFrame(np.random.normal(0, 1, size=(n, 3))) random_signal = np.random.normal(0, 1, size=(n, )) - blp = dml.DoubleMLIRMBLP(random_signal, random_basis).fit() + blp = dml.DoubleMLBLP(random_signal, random_basis).fit() blp_manual = fit_blp(random_signal, random_basis) np.random.seed(42) - ci = blp.confint(random_basis, joint=ci_joint, level=ci_level, n_rep_boot=1000) + ci_1 = blp.confint(random_basis, joint=ci_joint, level=ci_level, n_rep_boot=1000) + np.random.seed(42) + ci_2 = blp.confint(joint=ci_joint, level=ci_level, n_rep_boot=1000) np.random.seed(42) ci_manual = blp_confint(blp_manual, random_basis, joint=ci_joint, level=ci_level, n_rep_boot=1000) @@ -40,7 +42,8 @@ def dml_blp_fixture(ci_joint, ci_level): 'values_manual': blp_manual.fittedvalues, 'omega': blp.blp_omega, 'omega_manual': blp_manual.cov_HC0, - 'ci': ci, + 'ci_1': ci_1, + 'ci_2': ci_2, 'ci_manual': ci_manual} return res_dict @@ -68,7 +71,14 @@ def test_dml_blp_omega(dml_blp_fixture): @pytest.mark.ci -def test_dml_blp_ci(dml_blp_fixture): - assert np.allclose(dml_blp_fixture['ci'], +def test_dml_blp_ci_1(dml_blp_fixture): + assert np.allclose(dml_blp_fixture['ci_1'], + dml_blp_fixture['ci_2'], + rtol=1e-9, atol=1e-4) + + +@pytest.mark.ci +def test_dml_blp_ci_2(dml_blp_fixture): + assert np.allclose(dml_blp_fixture['ci_1'], dml_blp_fixture['ci_manual'], rtol=1e-9, atol=1e-4) diff --git a/doubleml/tests/test_doubleml_exceptions.py b/doubleml/tests/test_doubleml_exceptions.py index 47aed3d3..5c2b3b09 100644 --- a/doubleml/tests/test_doubleml_exceptions.py +++ b/doubleml/tests/test_doubleml_exceptions.py @@ -2,7 +2,7 @@ import pandas as pd import numpy as np -from doubleml import DoubleMLPLR, DoubleMLIRM, DoubleMLIIVM, DoubleMLPLIV, DoubleMLData, DoubleMLClusterData, DoubleMLIRMBLP +from doubleml import DoubleMLPLR, DoubleMLIRM, DoubleMLIIVM, DoubleMLPLIV, DoubleMLData, DoubleMLClusterData, DoubleMLBLP from doubleml.datasets import make_plr_CCDDHNR2018, make_irm_data, make_pliv_CHS2015, make_iivm_data, \ make_pliv_multiway_cluster_CKMS2021 @@ -664,19 +664,24 @@ def test_doubleml_exception_blp(): msg = "The signal must be of np.ndarray type. Signal of type was passed." with pytest.raises(TypeError, match=msg): - DoubleMLIRMBLP(orth_signal=1, basis=random_basis) + DoubleMLBLP(orth_signal=1, basis=random_basis) msg = 'The signal must be of one dimensional. Signal of dimensions 2 was passed.' with pytest.raises(ValueError, match=msg): - DoubleMLIRMBLP(orth_signal=np.array([[1], [2]]), basis=random_basis) + DoubleMLBLP(orth_signal=np.array([[1], [2]]), basis=random_basis) msg = "The basis must be of DataFrame type. Basis of type was passed." with pytest.raises(TypeError, match=msg): - DoubleMLIRMBLP(orth_signal=signal, basis=1) + DoubleMLBLP(orth_signal=signal, basis=1) msg = 'Invalid pd.DataFrame: Contains duplicate column names.' with pytest.raises(ValueError, match=msg): - DoubleMLIRMBLP(orth_signal=signal, basis=pd.DataFrame(np.array([[1, 2], [4, 5]]), - columns=['x_1', 'x_1'])) + DoubleMLBLP(orth_signal=signal, basis=pd.DataFrame(np.array([[1, 2], [4, 5]]), + columns=['a_1', 'a_1'])) - dml_blp_confint = DoubleMLIRMBLP(orth_signal=signal, basis=random_basis) + dml_blp_confint = DoubleMLBLP(orth_signal=signal, basis=random_basis) + msg = r'Apply fit\(\) before confint\(\).' + with pytest.raises(ValueError, match=msg): + dml_blp_confint.confint(random_basis) + + dml_blp_confint.fit() msg = 'joint must be True or False. Got 1.' with pytest.raises(TypeError, match=msg): dml_blp_confint.confint(random_basis, joint=1) @@ -692,9 +697,12 @@ def test_doubleml_exception_blp(): msg = 'The number of bootstrap replications must be positive. 0 was passed.' with pytest.raises(ValueError, match=msg): dml_blp_confint.confint(random_basis, n_rep_boot=0) - msg = r'Apply fit\(\) before confint\(\).' + msg = 'Invalid basis: DataFrame has to have the exact same number and ordering of columns.' with pytest.raises(ValueError, match=msg): - dml_blp_confint.confint(random_basis) + dml_blp_confint.confint(basis=pd.DataFrame(np.array([[1], [4]]), columns=['a_1'])) + msg = 'Invalid basis: DataFrame has to have the exact same number and ordering of columns.' + with pytest.raises(ValueError, match=msg): + dml_blp_confint.confint(basis=pd.DataFrame(np.array([[1, 2, 3], [4, 5, 6]]), columns=['x_1', 'x_2', 'x_3'])) @pytest.mark.ci @@ -709,6 +717,34 @@ def test_doubleml_exception_gate(): msg = "Groups must be of DataFrame type. Groups of type was passed." with pytest.raises(TypeError, match=msg): dml_irm_obj.gate(groups=2) - msg = 'Columns must be of of bool or int type or the data frame only should contain one column.' + msg = (r'Columns of groups must be of bool type or int type \(dummy coded\). ' + 'Alternatively, groups should only contain one column.') with pytest.raises(TypeError, match=msg): - dml_irm_obj.gate(groups=pd.DataFrame(np.random.normal(0, 1, size=(50, 3)))) + dml_irm_obj.gate(groups=pd.DataFrame(np.random.normal(0, 1, size=(dml_data_irm.n_obs, 3)))) + + dml_irm_obj = DoubleMLIRM(dml_data_irm, + ml_g=Lasso(), + ml_m=LogisticRegression(), + trimming_threshold=0.05, + n_folds=5, + score='ATTE') + dml_irm_obj.fit() + + msg = 'Invalid score ATTE. Valid score ATE.' + with pytest.raises(ValueError, match=msg): + dml_irm_obj.gate(groups=2) + + +@pytest.mark.ci +def test_doubleml_exception_cate(): + dml_irm_obj = DoubleMLIRM(dml_data_irm, + ml_g=Lasso(), + ml_m=LogisticRegression(), + trimming_threshold=0.05, + n_folds=5, + score='ATTE') + dml_irm_obj.fit() + + msg = 'Invalid score ATTE. Valid score ATE.' + with pytest.raises(ValueError, match=msg): + dml_irm_obj.cate(basis=2) diff --git a/doubleml/tests/test_irm.py b/doubleml/tests/test_irm.py index 52c9aeae..8609c55a 100644 --- a/doubleml/tests/test_irm.py +++ b/doubleml/tests/test_irm.py @@ -124,7 +124,7 @@ def test_dml_irm_boot(dml_irm_fixture): @pytest.mark.ci def test_dml_irm_cate_gate(): - n = 50 + n = 9 # collect data np.random.seed(42) obj_dml_data = make_irm_data(n_obs=n, dim_x=2) @@ -143,12 +143,24 @@ def test_dml_irm_cate_gate(): # create a random basis random_basis = pd.DataFrame(np.random.normal(0, 1, size=(n, 5))) cate = dml_irm_obj.cate(random_basis) - assert isinstance(cate, dml.double_ml_blp.DoubleMLIRMBLP) + assert isinstance(cate, dml.double_ml_blp.DoubleMLBLP) + assert isinstance(cate.confint(), pd.DataFrame) groups_1 = pd.DataFrame(np.column_stack([obj_dml_data.data['X1'] <= 0, obj_dml_data.data['X1'] > 0.2]), columns=['Group 1', 'Group 2']) - assert isinstance(dml_irm_obj.gate(groups_1), pd.DataFrame) + msg = ('At least one group effect is estimated with less than 6 observations.') + with pytest.warns(UserWarning, match=msg): + gate_1 = dml_irm_obj.gate(groups_1) + assert isinstance(gate_1, dml.double_ml_blp.DoubleMLBLP) + assert isinstance(gate_1.confint(), pd.DataFrame) + assert all(gate_1.confint().index == groups_1.columns) + np.random.seed(42) groups_2 = pd.DataFrame(np.random.choice(["1", "2"], n)) - assert isinstance(dml_irm_obj.gate(groups_2), pd.DataFrame) + msg = ('At least one group effect is estimated with less than 6 observations.') + with pytest.warns(UserWarning, match=msg): + gate_2 = dml_irm_obj.gate(groups_2) + assert isinstance(gate_2, dml.double_ml_blp.DoubleMLBLP) + assert isinstance(gate_2.confint(), pd.DataFrame) + assert all(gate_2.confint().index == ["Group_1", "Group_2"]) From e3270816305c0e1ef4e6d208621c2a378e1735c1 Mon Sep 17 00:00:00 2001 From: SvenKlaassen Date: Fri, 4 Nov 2022 11:53:31 +0100 Subject: [PATCH 10/15] extend unit tests --- .coverage | Bin 69632 -> 69632 bytes doubleml/tests/test_blp.py | 2 ++ 2 files changed, 2 insertions(+) diff --git a/.coverage b/.coverage index c54090593472f4773cf5f4f99e12a7949991082a..2424fd2d3e3d1decbb1d5b054b6bb1c95bb585e8 100644 GIT binary patch delta 38 ucmZozz|ydQWkan#tFf__spaM-{g7DJ(@%?b#!SB5^MonGWb?sZ`z8P)01!t2 delta 38 ucmZozz|ydQWkan#tD&)#vGL|6{g7DJqMb2%;*)RpJYh;O*nF_pz6k&es1C*e diff --git a/doubleml/tests/test_blp.py b/doubleml/tests/test_blp.py index 2f125d50..a0a2aaf6 100644 --- a/doubleml/tests/test_blp.py +++ b/doubleml/tests/test_blp.py @@ -42,6 +42,8 @@ def dml_blp_fixture(ci_joint, ci_level): 'values_manual': blp_manual.fittedvalues, 'omega': blp.blp_omega, 'omega_manual': blp_manual.cov_HC0, + 'basis': blp.basis, + 'signal': blp.orth_signal, 'ci_1': ci_1, 'ci_2': ci_2, 'ci_manual': ci_manual} From 4a1fa2c56e0a276e8a6c97097e7c95d428c24cd1 Mon Sep 17 00:00:00 2001 From: SvenKlaassen Date: Mon, 28 Nov 2022 16:34:48 +0100 Subject: [PATCH 11/15] adjust to PEP8 --- doubleml/double_ml_data.py | 7 +++---- doubleml/double_ml_iivm.py | 4 ++-- doubleml/double_ml_irm.py | 2 +- 3 files changed, 6 insertions(+), 7 deletions(-) diff --git a/doubleml/double_ml_data.py b/doubleml/double_ml_data.py index e2d6a36d..c37fa37b 100644 --- a/doubleml/double_ml_data.py +++ b/doubleml/double_ml_data.py @@ -725,10 +725,9 @@ def from_arrays(cls, x, y, d, cluster_vars, z=None, use_other_treat_as_covariate data = pd.concat((pd.DataFrame(cluster_vars, columns=cluster_cols), dml_data.data), axis=1) - return(cls(data, dml_data.y_col, dml_data.d_cols, - cluster_cols, - dml_data.x_cols, dml_data.z_cols, - dml_data.use_other_treat_as_covariate, dml_data.force_all_x_finite)) + return (cls(data, dml_data.y_col, dml_data.d_cols, cluster_cols, + dml_data.x_cols, dml_data.z_cols, + dml_data.use_other_treat_as_covariate, dml_data.force_all_x_finite)) @property def cluster_cols(self): diff --git a/doubleml/double_ml_iivm.py b/doubleml/double_ml_iivm.py index 75d498af..cab8d342 100644 --- a/doubleml/double_ml_iivm.py +++ b/doubleml/double_ml_iivm.py @@ -206,7 +206,7 @@ def _check_data(self, obj_dml_data): one_treat = (obj_dml_data.n_treat == 1) binary_treat = (type_of_target(obj_dml_data.d) == 'binary') zero_one_treat = np.all((np.power(obj_dml_data.d, 2) - obj_dml_data.d) == 0) - if not(one_treat & binary_treat & zero_one_treat): + if not (one_treat & binary_treat & zero_one_treat): raise ValueError('Incompatible data. ' 'To fit an IIVM model with DML ' 'exactly one binary variable with values 0 and 1 ' @@ -219,7 +219,7 @@ def _check_data(self, obj_dml_data): if one_instr: binary_instr = (type_of_target(obj_dml_data.z) == 'binary') zero_one_instr = np.all((np.power(obj_dml_data.z, 2) - obj_dml_data.z) == 0) - if not(one_instr & binary_instr & zero_one_instr): + if not (one_instr & binary_instr & zero_one_instr): raise ValueError(err_msg) else: raise ValueError(err_msg) diff --git a/doubleml/double_ml_irm.py b/doubleml/double_ml_irm.py index 17dee093..66673bc3 100644 --- a/doubleml/double_ml_irm.py +++ b/doubleml/double_ml_irm.py @@ -176,7 +176,7 @@ def _check_data(self, obj_dml_data): one_treat = (obj_dml_data.n_treat == 1) binary_treat = (type_of_target(obj_dml_data.d) == 'binary') zero_one_treat = np.all((np.power(obj_dml_data.d, 2) - obj_dml_data.d) == 0) - if not(one_treat & binary_treat & zero_one_treat): + if not (one_treat & binary_treat & zero_one_treat): raise ValueError('Incompatible data. ' 'To fit an IRM model with DML ' 'exactly one binary variable with values 0 and 1 ' From c9fc8b6f6fc5dd7eb9ad46c98d57709c23f7d043 Mon Sep 17 00:00:00 2001 From: SvenKlaassen Date: Mon, 28 Nov 2022 16:47:17 +0100 Subject: [PATCH 12/15] refactor for nonlinear score update --- doubleml/double_ml_irm.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doubleml/double_ml_irm.py b/doubleml/double_ml_irm.py index 66673bc3..93438931 100644 --- a/doubleml/double_ml_irm.py +++ b/doubleml/double_ml_irm.py @@ -352,7 +352,7 @@ def cate(self, basis): 'Valid score ' + ' or '.join(valid_score) + '.') # define the orthogonal signal - orth_signal = self.psi_b.reshape(-1) + orth_signal = self.psi_elements['psi_b'].reshape(-1) # fit the best linear predictor model = DoubleMLBLP(orth_signal, basis=basis).fit() @@ -393,7 +393,7 @@ def gate(self, groups): warnings.warn('At least one group effect is estimated with less than 6 observations.') # define the orthogonal signal - orth_signal = self.psi_b.reshape(-1) + orth_signal = self.psi_elements['psi_b'].reshape(-1) # fit the best linear predictor for GATE (different confint() method) model = DoubleMLBLP(orth_signal, basis=groups, is_gate=True).fit() From 369be3e5731204d31cfa42cb7e567e9ed761d6a0 Mon Sep 17 00:00:00 2001 From: SvenKlaassen Date: Mon, 28 Nov 2022 17:06:16 +0100 Subject: [PATCH 13/15] add warnings for multiple treatments --- doubleml/double_ml_irm.py | 8 +++++++ doubleml/tests/test_doubleml_exceptions.py | 25 ++++++++++++++++++++++ 2 files changed, 33 insertions(+) diff --git a/doubleml/double_ml_irm.py b/doubleml/double_ml_irm.py index 93438931..9b117f98 100644 --- a/doubleml/double_ml_irm.py +++ b/doubleml/double_ml_irm.py @@ -351,6 +351,10 @@ def cate(self, basis): raise ValueError('Invalid score ' + self.score + '. ' + 'Valid score ' + ' or '.join(valid_score) + '.') + if self.n_rep != 1: + raise NotImplementedError('Only implemented for one repetition. ' + + f'Number of repetitions is {str(self.n_rep)}.') + # define the orthogonal signal orth_signal = self.psi_elements['psi_b'].reshape(-1) # fit the best linear predictor @@ -378,6 +382,10 @@ def gate(self, groups): raise ValueError('Invalid score ' + self.score + '. ' + 'Valid score ' + ' or '.join(valid_score) + '.') + if self.n_rep != 1: + raise NotImplementedError('Only implemented for one repetition. ' + + f'Number of repetitions is {str(self.n_rep)}.') + if not isinstance(groups, pd.DataFrame): raise TypeError('Groups must be of DataFrame type. ' f'Groups of type {str(type(groups))} was passed.') diff --git a/doubleml/tests/test_doubleml_exceptions.py b/doubleml/tests/test_doubleml_exceptions.py index 41cd359c..f40c8ba5 100644 --- a/doubleml/tests/test_doubleml_exceptions.py +++ b/doubleml/tests/test_doubleml_exceptions.py @@ -752,6 +752,19 @@ def test_doubleml_exception_gate(): with pytest.raises(ValueError, match=msg): dml_irm_obj.gate(groups=2) + dml_irm_obj = DoubleMLIRM(dml_data_irm, + ml_g=Lasso(), + ml_m=LogisticRegression(), + trimming_threshold=0.05, + n_folds=5, + score='ATE', + n_rep=2) + dml_irm_obj.fit() + + msg = 'Only implemented for one repetition. Number of repetitions is 2.' + with pytest.raises(NotImplementedError, match=msg): + dml_irm_obj.gate(groups=2) + @pytest.mark.ci def test_doubleml_exception_cate(): @@ -766,3 +779,15 @@ def test_doubleml_exception_cate(): msg = 'Invalid score ATTE. Valid score ATE.' with pytest.raises(ValueError, match=msg): dml_irm_obj.cate(basis=2) + + dml_irm_obj = DoubleMLIRM(dml_data_irm, + ml_g=Lasso(), + ml_m=LogisticRegression(), + trimming_threshold=0.05, + n_folds=5, + score='ATE', + n_rep=2) + dml_irm_obj.fit() + msg = 'Only implemented for one repetition. Number of repetitions is 2.' + with pytest.raises(NotImplementedError, match=msg): + dml_irm_obj.cate(basis=2) From 9a18d0681a29cf7a9ddf5c4941f16ed3ece4b884 Mon Sep 17 00:00:00 2001 From: Philipp Bach Date: Mon, 5 Dec 2022 12:19:15 +0100 Subject: [PATCH 14/15] drop .coverage file --- .coverage | Bin 69632 -> 0 bytes 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 .coverage diff --git a/.coverage b/.coverage deleted file mode 100644 index 2424fd2d3e3d1decbb1d5b054b6bb1c95bb585e8..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 69632 zcmeHw3sf7|xvsJoc`esurt2ddOOpQjn$=ss1A7DdgJ8Wzdy5hk;xaOe~3$Q%e zGMJh`%Fea`M7!10X0&$c53*f(_yoQ*JKJdw?f?+h)M7?w=yQ1ST)mZTX05Ebk+lW2 z$T2nLs&gN8suiip$uh$kH2~!gJKEZjaX=ziu-$Sy$=k`=EqrAK z+vXOwjMO*AoFU4QMT1ffv+0n74I4jCACb(%)i7J~^r#Zx0jo9g)T(5u1iX^%v>j@N z8!v;=VYlG#1~9&%7!vJwl9YPSe zfmjeat=VC2L`}tp>d@PhXWhvts zMuzke6?P-uT70I6aiNeUZxKdAhuT=9&0!4%k}nIAA&E$o47Cyd2vjFuAy3_$al7j9 z=NXFm4W&yVQ>3P(NL|4ef#MSSON^-m+7oF4b1nqJ)o(Z259n{!9qk6ivlHd18&Ynk zI8wn-j8gfd4l-YVR@P_^ zUJ~HMoi-@SWYgoKZ@w7SMKEk#2&3h2HdZBcO+nH9A!uu%W(6B0AYYIKo}ql{OvJ7NfV?B)c=<;e(wE3~3wZ}!&e`ucC!U@b&_{ey>D zbAA2h_I6mVV63m-ZD};N*4N)>wX|4`ZE*a)F8cusoDs}Ri;F;{(Q0b9!?E@*J8QFN z=jpTa^I>rT+9=50r-mBIm=7e(2h3lXKQUv1D@aZx3=jqg1B3y>0AYYIKo}ql5C#YX zgaN_;Vc;udKucvxLjw;oC6yvY;|`RD%8>Gd3<(-4QxzIsNCZvM`hWN&F+L`T_R{C+ zN9bMjLfuEYAL&|j>$N`ZYuXNNk!Di!Q_aJgrRra+zpE}+&8ps1xm4SgoU%`8RhB5G z6=xNP6l>*I<6`fXBFO@Px?r4FJf92Q*s@jaG}zW@t9q0cKk@!05!ov^iSsCW9S4 zQvxVks{lnK9;LC>XtSA`O{^6_Hdg|OLNJKIVK=qfFiG$*j>Fgr5ET`a#32{~QH|t5 zu(lk^mWq`$9X8kWWQyBMId1Vi1bb^sQx-nJru6Yae*njzyA0?K@h*BLfm z0U*uKc%6~jZOsR?`53PgDz_sKK#~MA9x>jWKvB7ll5~iN0Sz{+r6k=VC5?>-(etbA zU0ZTNeU)J8Xom@VNo@{b2yfgxMq{hPX2*ePdp5|647E5ub{M-15ttoo#9P8-Z8LP3 z><0{OCJ2<~78LnwR#OtEU`DV(5ey%xwyzRj86-Z(9k#9n7?Hk-U}7Ww3IGuu@exee zwiU}oM|_Y67HjoTws5Bi(r7vy=5cFQ90VYFo1>w{YH_q9Q|?#>AR-&534zS9#n?1W$g~D z*b08Qk@GF(5lvypAbKT`lmHOZpm zSW_G5f}XAh>4oB@K@zNn@1B#O7Q#b1@9U`eM!3&{9}qA#BCoGo05Fs|GsOUGW&l8R z1Z-$+kBWe_=m-e%pnS28l9&YOjtJ*ztI&!m3x4;Y8F>?wRiP0hXfT+}CcD9a1l7U> zgDz$O1MzLUTt!K2LTZOQVn{H^O1V<3T=05Qp%AMm1dYQ`!)m#(sLT5x^!`xL2?m?O zL`uR6sANzU(J2$FCEwgiC`xirKuKXcKw;$xVyO&v#)_rF95d)9(CuDOlK`odIg7gB z40Bdj#x&5$Os?)@`Y!rW=A`Z!)mz$gsvYuW+G5osioV2;Wsg%YCJaj7)E&^yYCAPk znxAP7YBJUDtG}nNQ2A97QUr_t0v;hV3$!E)JO3{(tp0nM)G~ zO3dv?)fSmcAsmEn4~V4j9Ykd%&Q6Q22f5|;E_%RUyaRhTgxM z=>C6kT;3a_8Pu<6zQzmp{|o1Dyr}+v!F-K}`v3WN$XrRn8IKrmPN2xEqB`P$0S(sG zP~BoB!~OrYTV*blaOr4=2}gxoagCeDh;-a@;s~|3=>KQOVF}dNo!kFkvypNNX9OD* z!SL~AbsUvJ;&a?#RUE#FU}7Tv%D5svf(hGph4_dM^5B)r#W_Wg#;E?kJ{|(*_W!fu zafw^>|ChxP2XF5GXKtX53U@5NnehGprE#^>*#3V;Idw>E`C!R(@sWw|BcwejbL9#z z5&J;6FMFH*|B^UN5y1@g|5NXhxzgg*Y)=1wvABy;Y%Oo<|EI)pu?*KQrvJZ4d>o7F z|0l;G9qGs}6c+_~U%#pUpR`HlqC}c0rvJYnt_T>@|7XNUz+g|8E}=|9bdT!)>qM0m z)&JLu5{&BqYXl00`<Y$L4!ghedsJ})f^qA@Yns7;e|L@C3AIMR{0AYYIKo}ql5C#YX zgaN_;VSq3|7$6M%f6f2}s|9rE`F|-hD}n#yLl__o5C#YXgaN_;VSq3|7$6J~1_%R$ z0m8uFAOjSoQ=$3)O$wES@iISWwDhy|3Hl(d)csobvaUmC(bZ^&wZGC{(Ef+^N$pW> ziRK+mzvdZDr{*3_g(h1)tNvX5k@|nDf2{s{b(?ydI$xct@~QrydP&uzvZ@|dJ*cWt zWvkT63FYsUuPaX}A6HtGdz2MQMlquJrJ`4HMA4$SPf?{<0}7B2VSq3|7$6J~2EGag zGB(Mis>)5hSEi-EKocPhV;^$P`tdy#hME`s(=g zOP+4-___Mgghf?wU~QFo@A0R{r)I7X2h}dBguM-w<^ir_))y%7a(#nt@BYz&C;Kd{ zzwad1WpN+t>e)MUe(Zy~nXw1@hWD(g0Ev4l_8$bCQm^NdWz0KxdGh4xbM7%ii{J0> z_jZGp7HMWVoL5s0Y}E`+)pP{jJLW$Z@cWK>{aBsUGB{jR*83E&Fn;Pc_xgRuJ-*}L zI=MG>Jvy-dsT=(RmY%^Uht5x4zWf1~|B^bn6b|H+B8l1;3{EH}Z!kZj1a?-I^p-&x zL$(&rA8W6B+?;n{tZ>k(Sy~JStBcDgJR{zTSGa-S1`4>z{nL}y|I3}+lbBqD@QN_r zxifAc7vU8myuu(}=^VU_0(7t-h-W=>{Q2I&LeGsM}*Y%dw4Nd(EaLC(Ez7}@VYbRIcLVjm%`NWtPIQYrf+aGc-4RV)# zH+l@Cmrn))`4fkS)r)fAq_sKaQ{A57$vTdkhChxe$%ehn+2+0b3}By)Uf=b-1w)s{ z2DmY|e|XGmJ@>;i@Wnr2@m#iktJ%pxCS9@yWJ=Z)91MWZO#LeG*)<<>>F2L*NM8*{ z%2x;3IOS<__Imw&*b6PvMXONxt9qXrnmYE1bL70o(=&KEKV>EC&0AT2Fo2;PytT7e z!0!82l#jSQ{e6Q@-{FzxeGgAVmAakNh5^Gsq2J&4)8UEk#PsE;#O3>kXTdFg;OAAj z+-&f;otb(#R-+GUGk)&Xjw^v)Csbi3==n>s;BZM6QmCfx^f{-$pPNSYxU5K721oLi zVO6gLX21n;lTFD)yEA*soKKHW|1t17UorJk*u8k^WdE#3lbZqC4`q~(ILExB`vVih z{YQI7uOHbL;0y!KF+-q-8+8B0JN7}p=bLmooVqOiy*lUM*#1C%MM@e%PYa^I24K-=)ywqfV!h%N<= zEloK)Fg`UsHJ;V9C{e0%-MPqo!@?bV>YuNTzTP)8I_vWv56q7F>fGFEXYFau#!Ulr zLw+y!3HP4G6Yx1-^^FXVOt<{Q-&c70(q0%D>Hp38Yg+HIjbo>Lvu)hW^z^iMD&Y2V z+>~d+Guv@TGAe~mHcwxyFX(Z)$K2ew^M>>I&=5B?-di<3)cx+%Y2Q<}zy@#G(CPD6 zIIm~))ETbhXvb9dn8#;(dU$liAMl@NmMjGAOBR|Z$ET;JxmjnRjtl;CCMUs>oTQIS z4|_82>g_x2&3z?d*MbGGbH#!^Q#F@f_Z@l3GFYf#V9ye!cJLc_-CwtWMu+L%njwpu z8@y~iS8vcEMQe2zk0eM=>^r|uBDp$6dwSnNZ)cZCI*xL?@W#vC{Dx$BFTeFh-$l3e zv%r)1xTM?D?>iF|LOc2^UhD9_+SbRZi?vW>llCmzeC3|$d!}o-!m*39gB@>wl50C{ znSQ#&`_b6s-P8Ym?7`!nkqP$b+1Y#c%QYZEYZ5BdkhiPN@?a6c^u*_!$9?rf&h0%~ z_f}tBSKTNxG(60`FzP>g{Ws^i0r!YEf0+sdYgHil-1&cf_5B0u)!&AnaC^gleE7!B zecwHzDObXAvvQy`3i!{Qx6s!==Bu5VJU)B8lY7x~Va)rfgX?fQ*GWEqNP429=S8Ce zk+nr}JfrSuX;}7+&pDsdr-RrtHXGo^K8KBKqet)fN8cXzXH$Qfc8-4Ht8=uBjdXR7 z4$nZW@Ap6F{N+FPk9ucLdjj9&+)nq06W{oW+v{cQ9hzi0sFou?`G=`%89k-n_xi^I zoO^j9?A(?(GdVeXdFosT(^=)+*EMqK-m!l7jdj!1<+@2}iwvoDkL+3+)@#%3C2oeJ z(e~_x{xcuG^!}!2U2mm7@m6{I6Fa!Rb7$V?4nGlazt6o?F>~op?(6w)oc8;D9}fR> zUvaQr-+_`j58fU)@+>#;#EwfZ?HO$zlDgJYP|_m@sNPR^pWt>+`uv}~*U{7RiidM^ z8BgqZ7T8YzhaJyKe*fqb2mezGJ2FOo_dD39d&0+DyLPPrAV){qhOX{>KrMOMcj+Bp zibUe|dG||Bj7)5-{PfeTVEJ$Oe}CNTy>@Z(*wLAOIK?yY$-~VT`~&Xsi^IJ~Iy$6Z zcwsO4ym8Ir_n#k`9tn(I`0R$m@#ODocg}qA-FN%0aozyOrJSFcx#TV=NdO9)6Gq?q zrrKOri|U|8JtV zNZ_~s{=!T!SC}#8cg*|D+b|k%f%zHJ%e=t+i1`=he=^@=zQc4ghZzfVfH5-vgL#nI z&1`3CnF^+e$z@hB8O$O^$H-}p{(}B9{VDyQ^e6O(^n3I#={M+e^egmF=;!ED^hx@k z=*MXneT25rtu#yTr@u+xNACm$$cHdM7$6J~1_%R$0m1-bfG|K9APf)&zUB;|UcGeF zCY;sP;jFe6XB#)-tfmHM8#dsqx*BIyRXD4x#92iJ&dSSiR#t|y_3Lp~T8guh5}Xwm z6f@^H3p9nRLS#aV7H&T?{amYt2WHEVFTdNs~gt-{&Ll{j0m z0%yyY<4mu|SymR#mMz0sW+u*-F2z|!2F}vcah8^bvn5M#mYRyQ#fx#4l7h2Ei*S~V z$3>(I7vgV8NjO`u0A~z?Gn&SkPKPtC7H1j_&eUq0sZ=;qDsiSz;7l&ZSz;p2WHOvl z6wVS7WH1DP=l>-OzUD4Q)FTWK1_%R$0m1-bfG|K9APf)&2m^!x!T@1lUJRi5f71V- z7llNaFhCd}3=jqg1B3y>0AYYIKo}ql5C#YXUwa17^Z%Ra^JxA54d&0xRc4&|KgJ?2-;FBmuTIy?vPGV>GW-KI zH{lt9^Iv0AYYI@YOItZHC{kr#7La4kfiH*@%)F zlx#prHA<>bQi+lZl$4{S3?=JPQi_riloX?+2qlFmDL_d+O7c*$4kc?*l8cfYlw_l1 z4N6v{WEDzQqGSb1mZLWg=2qnoVS%{J( zlq^6AgAy7gI+SQpqCtroB`TCCQKCSJ93_b;k)eb_Ndijn@Bb$#C3@+b66QU4ZvTR= zjA@{gnOxn+^j-9$%t_rds<*V~R6FF$w8g4N6n%*w%O0m*Oc<2DsXL&Z)ply8G(XcE z)MTpPSAS1kq4KLF%GZ=OWvOCDVO6Y?PspE_?@v6H*qE3hyC|!rzM%F~X$h``%?V2B z32C)tO7d@i-F4vHl|o*ih+e;HtH&>4MX$EEvhdy1X0)@0W~-&mU<j`uGjz3}_34R$MQw8NVotK)bXGD3-Y8+27%??T1O7|jl2E9g;4rAdA+`0er` zPC&3Cj<-Rh_jiWgs$Cw(TXC^`yU}Vl8CwlULP%xe-qy>%6?6_(`1RQ9sV^ko7wvXX zudrS~PU$Y0D|J2~o#EP*+#_>k35U8>?EoxXy<*wL`Bks;2-L4=zQ*ee8Lx2u#_NpK zu0XzDDicl?@6J)i>x9bX$MK4B>`ik5MV|P#Y$F)ZU|kK>Ew&cM#)HNd7KGQv^1m_Y|42#_lj9LRZIe-9X}V++kH55g)N#PuAb*n9}fYh!*B(6G_+VPj&@|qtaw}kfy}YSvN*2dxH2*3$Q1Y{YQ%-Do^K`% z77MxqSQ=M5#X5v-nNdz165h;$HMM~*M!ONDljD#ENw69!d!e`}$oo1f zz7g(?tZb915qW*mCYg&8X{H!}1#v~dhSv5-1Y`uh(mmK}v2{Tnl&4E5lMvkz;XG|R zQDrq+%+07gC`&6!&|ol`O?HC;1T_K$!|$CppqK)J>c95#c}GJl+tyki5e%}T5-k_J zo+w2t%HJL-1V&xn2dyj)I>BHgFQQl>CJe{QMA2IE%}rKH9TZYhm`*SOj8oV|$rzwB z2XYC;lu0EZphvk0TK}(C9Fs8ri?K3!^dITdw1Hlu`-AQ&UA=CR_Fe7wwR^O*=2sf0 zrb>NH{i52S&Qe`bJ)_#ITBQ8B(y1&`{6X<70FV!1fG|K9APf)&2m^u`SVP_aC0BX4 zb9A7)&CTNP*U``}-`n0KZrl*x}S*rfE`n&3K)vW4Gl}oi<$tnAkR%MA|T5(o!NU>IaRsNiOpFA`1qr@L3HYBFY zK9oHs+emq-A5;6NB?)gQoJiP|ppgDT>XcSVCZXWJ_PMu4CULD4FsKpbz~~0%w)6kn zHvk|b9&pag!M19E(TRtNo&Voj1t=QvC^47!&6NP65DdbPhar-}GYA#1GR7eo0lPNH zqdT~ADEmw9VnZX?vG*sD`Tsl1Kt4rm$x&0a;TiJX>j5cKFjAO>7_2ZMTT1~=v|*yL zfSjEr0Fo+LC7#wOkjVW1-NgWvB^c^fwTqtr-&O>0qP;gpGpOI4g@7_226!+cE)1WE{Nto~{}ebRHG#SbQ_#=l{24fCi%5Db^uu%Q9F- zdPrpXV982Ycq%e7p>jd;fhC}HuHX`}4}|Unfo|NW@!F@*J40Pi&L{X z^Z&b30898qDYlk3&HvxI2+%}cEW`DSng6eWCBY)&Sk(M~4Xh;=D;=50-kt=t5FXNb zU%zSozit7*P~yxKGylJt0RYhvFlPS00v1q+%m9P4eA_OE<>NLXwX>j+10)z^r5skAi<1jpPby%Mx;RDo z`TuHI*e)dM@;(?d|6d5};aLHd49Y^ptAMrh; Date: Thu, 8 Dec 2022 10:39:33 +0100 Subject: [PATCH 15/15] add summary and fix docu --- doubleml/double_ml_blp.py | 39 +++++++++++++++++++++++++++++++++----- doubleml/double_ml_irm.py | 9 +++++---- doubleml/tests/test_blp.py | 8 +++++++- 3 files changed, 46 insertions(+), 10 deletions(-) diff --git a/doubleml/double_ml_blp.py b/doubleml/double_ml_blp.py index c49730e5..25aa8f25 100644 --- a/doubleml/double_ml_blp.py +++ b/doubleml/double_ml_blp.py @@ -12,11 +12,12 @@ class DoubleMLBLP: Parameters ---------- orth_signal : :class:`numpy.array` - The orthogonal signal to be predicted. Has to be of shape (n,). + The orthogonal signal to be predicted. Has to be of shape ``(n_obs,)``, + where ``n_obs`` is the number of observations. basis : :class:`pandas.DataFrame` - The basis for estimating the best linear predictor. Has to have the shape (n,d), - where d is the number of predictors. + The basis for estimating the best linear predictor. Has to have the shape ``(n_obs, d)``, + where ``n_obs`` is the number of observations and ``d`` is the number of predictors. is_gate : bool Indicates whether the basis is constructed for GATEs (dummy-basis). @@ -52,6 +53,14 @@ def __init__(self, self._blp_model = None self._blp_omega = None + def __str__(self): + class_name = self.__class__.__name__ + header = f'================== {class_name} Object ==================\n' + fit_summary = str(self.summary) + res = header + \ + '\n------------------ Fit summary ------------------\n' + fit_summary + return res + @property def blp_model(self): """ @@ -80,6 +89,25 @@ def blp_omega(self): """ return self._blp_omega + @property + def summary(self): + """ + A summary for the best linear predictor effect after calling :meth:`fit`. + """ + col_names = ['coef', 'std err', 't', 'P>|t|', '[0.025', '0.975]'] + if self.blp_model is None: + df_summary = pd.DataFrame(columns=col_names) + else: + summary_stats = {'coef': self.blp_model.params, + 'std err': self.blp_model.bse, + 't': self.blp_model.tvalues, + 'P>|t|': self.blp_model.pvalues, + '[0.025': self.blp_model.conf_int()[0], + '0.975]': self.blp_model.conf_int()[1]} + df_summary = pd.DataFrame(summary_stats, + columns=col_names) + return df_summary + def fit(self): """ Estimate DoubleML models. @@ -162,13 +190,14 @@ def confint(self, basis=None, joint=False, level=0.95, n_rep_boot=500): # blp of the orthogonal signal g_hat = self._blp_model.predict(basis) + np_basis = basis.to_numpy() # calculate se for basis elements - blp_se = np.sqrt((basis.to_numpy().dot(self._blp_omega) * basis.to_numpy()).sum(axis=1)) + blp_se = np.sqrt((np.dot(np_basis, self._blp_omega) * np_basis).sum(axis=1)) if joint: # calculate the maximum t-statistic with bootstrap normal_samples = np.random.normal(size=[basis.shape[1], n_rep_boot]) - bootstrap_samples = np.multiply(basis.to_numpy().dot(np.dot(sqrtm(self._blp_omega), normal_samples)).T, + bootstrap_samples = np.multiply(np.dot(np_basis, np.dot(sqrtm(self._blp_omega), normal_samples)).T, (1.0 / blp_se)) max_t_stat = np.quantile(np.max(np.abs(bootstrap_samples), axis=0), q=level) diff --git a/doubleml/double_ml_irm.py b/doubleml/double_ml_irm.py index 9b117f98..f39f57b3 100644 --- a/doubleml/double_ml_irm.py +++ b/doubleml/double_ml_irm.py @@ -338,8 +338,8 @@ def cate(self, basis): Parameters ---------- basis : :class:`pandas.DataFrame` - The basis for estimating the best linear predictor. Has to have the shape (n,d), - where d is the number of predictors. + The basis for estimating the best linear predictor. Has to have the shape ``(n_obs, d)``, + where ``n_obs`` is the number of observations and ``d`` is the number of predictors. Returns ------- @@ -369,8 +369,9 @@ def gate(self, groups): Parameters ---------- groups : :class:`pandas.DataFrame` - The group indicator for estimating the best linear predictor. Has to be dummy coded with shape (n,d), - where d is the number of groups or (n,1) and contain the corresponding groups. + The group indicator for estimating the best linear predictor. + Has to be dummy coded with shape ``(n_obs, d)``, where ``n_obs`` is the number of observations + and ``d`` is the number of groups or ``(n_obs, 1)`` and contain the corresponding groups. Returns ------- diff --git a/doubleml/tests/test_blp.py b/doubleml/tests/test_blp.py index a0a2aaf6..cdbc0081 100644 --- a/doubleml/tests/test_blp.py +++ b/doubleml/tests/test_blp.py @@ -46,7 +46,8 @@ def dml_blp_fixture(ci_joint, ci_level): 'signal': blp.orth_signal, 'ci_1': ci_1, 'ci_2': ci_2, - 'ci_manual': ci_manual} + 'ci_manual': ci_manual, + 'blp_model': blp} return res_dict @@ -84,3 +85,8 @@ def test_dml_blp_ci_2(dml_blp_fixture): assert np.allclose(dml_blp_fixture['ci_1'], dml_blp_fixture['ci_manual'], rtol=1e-9, atol=1e-4) + + +def test_dml_blp_return_types(dml_blp_fixture): + assert isinstance(dml_blp_fixture['blp_model'].__str__(), str) + assert isinstance(dml_blp_fixture['blp_model'].summary, pd.DataFrame) \ No newline at end of file