Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enabling higher-order feature importance for the F filter and LR filter #509

Merged
merged 2 commits into from
May 9, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
130 changes: 111 additions & 19 deletions causalml/feature_selection/filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import statsmodels.api as sm
from scipy import stats
from sklearn.impute import SimpleImputer
import warnings


class FilterSelect:
Expand All @@ -18,7 +19,7 @@ def __init__(self):
return

@staticmethod
def _filter_F_one_feature(data, treatment_indicator, feature_name, y_name):
def _filter_F_one_feature(data, treatment_indicator, feature_name, y_name, order=1):
"""
Conduct F-test of the interaction between treatment and one feature.

Expand All @@ -27,6 +28,8 @@ def _filter_F_one_feature(data, treatment_indicator, feature_name, y_name):
treatment_indicator (string): the column name for binary indicator of treatment (value 1) or control (value 0)
feature_name (string): feature name, as one column in the data DataFrame
y_name (string): name of the outcome variable
order (int): the order of feature to be evaluated with the treatment effect, order takes 3 values: 1,2,3. order = 1 corresponds to linear importance of the feature, order=2 corresponds to quadratic and linear importance of the feature,
order= 3 will calculate feature importance up to cubic forms.

Returns:
F_test_result : pd.DataFrame
Expand All @@ -39,26 +42,64 @@ def _filter_F_one_feature(data, treatment_indicator, feature_name, y_name):
[treatment_indicator, feature_name]
].product(axis=1)

if order not in [1, 2, 3]:
raise Exception("ValueError: order argument only takes value 1,2,3.")

if order == 1:
pass
elif order == 2:
x_tmp_name = "{}_o{}".format(feature_name, order)
X[x_tmp_name] = X[[feature_name]] ** order
X["{}-{}".format(treatment_indicator, x_tmp_name)] = X[
[treatment_indicator, x_tmp_name]
].product(axis=1)
elif order == 3:
x_tmp_name = "{}_o{}".format(feature_name, 2)
X[x_tmp_name] = X[[feature_name]] ** 2
X["{}-{}".format(treatment_indicator, x_tmp_name)] = X[
[treatment_indicator, x_tmp_name]
].product(axis=1)

x_tmp_name = "{}_o{}".format(feature_name, order)
X[x_tmp_name] = X[[feature_name]] ** order
X["{}-{}".format(treatment_indicator, x_tmp_name)] = X[
[treatment_indicator, x_tmp_name]
].product(axis=1)

model = sm.OLS(Y, X)
result = model.fit()

F_test = result.f_test(np.array([0, 0, 0, 1]))
if order == 1:
F_test = result.f_test(np.array([0, 0, 0, 1]))
elif order == 2:
F_test = result.f_test(np.array([[0, 0, 0, 1, 0, 0], [0, 0, 0, 0, 0, 1]]))
elif order == 3:
F_test = result.f_test(
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@zhenyuz0500 can you remind me why is the r_matrix configured in this way? Maybe we can add comment here too

Copy link
Collaborator Author

@zhenyuz0500 zhenyuz0500 May 6, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The r_matrix encodes the constraint that the linear combination of each row with the coefficient vector equals 0.
For example, when order=2, the linear model is: a + b1 * I_treatment + b2 * x + b3 * x * I_treatment + b4 * x^2 + b5 * x^2 * I_treatment.
We want to test H0: b3 == 0 and b5 == 0 vs H1: b3 != 0 or b5 != 0,
which translates to testing H0: [0, 0, 0, 1, 0, 0] * [a, b1, b2, b3, b4, b5]' = 0 and [0, 0, 0, 0, 0, 1] * [a, b1, b2, b3, b4, b5]' = 0.

reference: https://www.statsmodels.org/dev/generated/statsmodels.regression.linear_model.RegressionResults.f_test.html

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see, thanks!

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

thanks for the review!

np.array(
[
[0, 0, 0, 1, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 1, 0, 0],
[0, 0, 0, 0, 0, 0, 0, 1],
]
)
)

F_test_result = pd.DataFrame(
{
"feature": feature_name, # for the interaction, not the main effect
"method": "F-statistic",
"method": "F{} Filter".format(order),
"score": float(F_test.fvalue),
"p_value": F_test.pvalue,
"misc": "df_num: {}, df_denom: {}".format(
F_test.df_num, F_test.df_denom
"misc": "df_num: {}, df_denom: {}, order:{}".format(
F_test.df_num, F_test.df_denom, order
),
},
index=[0],
).reset_index(drop=True)

return F_test_result

def filter_F(self, data, treatment_indicator, features, y_name):
def filter_F(self, data, treatment_indicator, features, y_name, order=1):
"""
Rank features based on the F-statistics of the interaction.

Expand All @@ -67,18 +108,24 @@ def filter_F(self, data, treatment_indicator, features, y_name):
treatment_indicator (string): the column name for binary indicator of treatment (value 1) or control (value 0)
features (list of string): list of feature names, that are columns in the data DataFrame
y_name (string): name of the outcome variable
order (int): the order of feature to be evaluated with the treatment effect, order takes 3 values: 1,2,3. order = 1 corresponds to linear importance of the feature, order=2 corresponds to quadratic and linear importance of the feature,
order= 3 will calculate feature importance up to cubic forms.

Returns:
all_result : pd.DataFrame
a data frame containing the feature importance statistics
"""
if order not in [1, 2, 3]:
raise Exception("ValueError: order argument only takes value 1,2,3.")

all_result = pd.DataFrame()
for x_name_i in features:
one_result = self._filter_F_one_feature(
data=data,
treatment_indicator=treatment_indicator,
feature_name=x_name_i,
y_name=y_name,
order=order,
)
all_result = pd.concat([all_result, one_result])

Expand All @@ -89,7 +136,7 @@ def filter_F(self, data, treatment_indicator, features, y_name):

@staticmethod
def _filter_LR_one_feature(
data, treatment_indicator, feature_name, y_name, disp=True
data, treatment_indicator, feature_name, y_name, order=1, disp=True
):
"""
Conduct LR (Likelihood Ratio) test of the interaction between treatment and one feature.
Expand All @@ -99,6 +146,8 @@ def _filter_LR_one_feature(
treatment_indicator (string): the column name for binary indicator of treatment (value 1) or control (value 0)
feature_name (string): feature name, as one column in the data DataFrame
y_name (string): name of the outcome variable
order (int): the order of feature to be evaluated with the treatment effect, order takes 3 values: 1,2,3. order = 1 corresponds to linear importance of the feature, order=2 corresponds to quadratic and linear importance of the feature,
order= 3 will calculate feature importance up to cubic forms.

Returns:
LR_test_result : pd.DataFrame
Expand All @@ -107,17 +156,45 @@ def _filter_LR_one_feature(
Y = data[y_name]

# Restricted model
X_r = data[[treatment_indicator, feature_name]]
X_r = sm.add_constant(X_r)
model_r = sm.Logit(Y, X_r)
result_r = model_r.fit(disp=disp)
x_name_r = ["const", treatment_indicator, feature_name]
x_name_f = x_name_r.copy()
X = data[[treatment_indicator, feature_name]]
X = sm.add_constant(X)

# Full model (with interaction)
X_f = X_r.copy()
X_f["{}-{}".format(treatment_indicator, feature_name)] = X_f[
X["{}-{}".format(treatment_indicator, feature_name)] = X[
[treatment_indicator, feature_name]
].product(axis=1)
model_f = sm.Logit(Y, X_f)
x_name_f.append("{}-{}".format(treatment_indicator, feature_name))

if order == 2:
x_tmp_name = "{}_o{}".format(feature_name, order)
X[x_tmp_name] = X[[feature_name]] ** order
X["{}-{}".format(treatment_indicator, x_tmp_name)] = X[
[treatment_indicator, x_tmp_name]
].product(axis=1)
x_name_r.append(x_tmp_name)
x_name_f += [x_tmp_name, "{}-{}".format(treatment_indicator, x_tmp_name)]
elif order == 3:
x_tmp_name = "{}_o{}".format(feature_name, 2)
X[x_tmp_name] = X[[feature_name]] ** 2
X["{}-{}".format(treatment_indicator, x_tmp_name)] = X[
[treatment_indicator, x_tmp_name]
].product(axis=1)
x_name_r.append(x_tmp_name)
x_name_f += [x_tmp_name, "{}-{}".format(treatment_indicator, x_tmp_name)]
x_tmp_name = "{}_o{}".format(feature_name, order)
X[x_tmp_name] = X[[feature_name]] ** order
X["{}-{}".format(treatment_indicator, x_tmp_name)] = X[
[treatment_indicator, x_tmp_name]
].product(axis=1)
x_name_r.append(x_tmp_name)
x_name_f += [x_tmp_name, "{}-{}".format(treatment_indicator, x_tmp_name)]

# Full model (with interaction)
model_r = sm.Logit(Y, X[x_name_r])
result_r = model_r.fit(disp=disp)

model_f = sm.Logit(Y, X[x_name_f])
result_f = model_f.fit(disp=disp)

LR_stat = -2 * (result_r.llf - result_f.llf)
Expand All @@ -127,17 +204,19 @@ def _filter_LR_one_feature(
LR_test_result = pd.DataFrame(
{
"feature": feature_name, # for the interaction, not the main effect
"method": "LRT-statistic",
"method": "LR{} Filter".format(order),
"score": LR_stat,
"p_value": LR_pvalue,
"misc": "df: {}".format(LR_df),
"misc": "df: {}, order: {}".format(LR_df, order),
},
index=[0],
).reset_index(drop=True)

return LR_test_result

def filter_LR(self, data, treatment_indicator, features, y_name, disp=True):
def filter_LR(
self, data, treatment_indicator, features, y_name, order=1, disp=True
):
"""
Rank features based on the LRT-statistics of the interaction.

Expand All @@ -146,18 +225,24 @@ def filter_LR(self, data, treatment_indicator, features, y_name, disp=True):
treatment_indicator (string): the column name for binary indicator of treatment (value 1) or control (value 0)
feature_name (string): feature name, as one column in the data DataFrame
y_name (string): name of the outcome variable
order (int): the order of feature to be evaluated with the treatment effect, order takes 3 values: 1,2,3. order = 1 corresponds to linear importance of the feature, order=2 corresponds to quadratic and linear importance of the feature,
order= 3 will calculate feature importance up to cubic forms.

Returns:
all_result : pd.DataFrame
a data frame containing the feature importance statistics
"""
if order not in [1, 2, 3]:
raise Exception("ValueError: order argument only takes value 1,2,3.")

all_result = pd.DataFrame()
for x_name_i in features:
one_result = self._filter_LR_one_feature(
data=data,
treatment_indicator=treatment_indicator,
feature_name=x_name_i,
y_name=y_name,
order=order,
disp=disp,
)
all_result = pd.concat([all_result, one_result])
Expand Down Expand Up @@ -477,6 +562,8 @@ def get_importance(
treatment_group="treatment",
n_bins=5,
null_impute=None,
order=1,
disp=False,
):
"""
Rank features based on the chosen statistic of the interaction.
Expand All @@ -495,6 +582,9 @@ def get_importance(
treatment_group (string): name for treatment group, value in the experiment group column
n_bins (int, optional): number of bins to be used for bin-based uplift filter methods
null_impute (str, optional, default=None): impute np.nan present in the data taking on of the following strategy values {'mean', 'median', 'most_frequent', None}. If value is None and null is present then exception will be raised
order (int): the order of feature to be evaluated with the treatment effect for F filter and LR filter, order takes 3 values: 1,2,3. order = 1 corresponds to linear importance of the feature, order=2 corresponds to quadratic and linear importance of the feature,
order= 3 will calculate feature importance up to cubic forms.
disp (bool): Set to True to print convergence messages for Logistic regression convergence in LR method.

Returns:
all_result : pd.DataFrame
Expand All @@ -514,6 +604,7 @@ def get_importance(
treatment_indicator="treatment_indicator",
features=features,
y_name=y_name,
order=order,
)
elif method == "LR":
data = data[
Expand All @@ -525,10 +616,11 @@ def get_importance(
] = 1
all_result = self.filter_LR(
data=data,
disp=True,
disp=disp,
treatment_indicator="treatment_indicator",
features=features,
y_name=y_name,
order=order,
)
else:
all_result = self.filter_D(
Expand Down
Loading