"support valid error early stopping" #648

Merged
merged 17 commits into from
Aug 4, 2023
103 changes: 83 additions & 20 deletions causalml/inference/tree/uplift.pyx
@@ -220,6 +220,10 @@ class UpliftTreeClassifier:
n_reg: int, optional (default=100)
The regularization parameter defined in Rzepakowski et al. 2012, the weight (in terms of sample size) of the
parent node influence on the child node, only effective for 'KL', 'ED', 'Chi', 'CTS' methods.

early_stopping_eval_diff_scale: float, optional (default=1)
If the difference between the train and validation uplift scores exceeds
min(train_uplift_score, valid_uplift_score)/early_stopping_eval_diff_scale, the candidate split is rejected (early stopping).

control_name: string
The name of the control group (other experiment groups will be regarded as treatment groups).
@@ -240,12 +244,13 @@ class UpliftTreeClassifier:

"""
def __init__(self, control_name, max_features=None, max_depth=3, min_samples_leaf=100,
min_samples_treatment=10, n_reg=100, evaluationFunction='KL',
min_samples_treatment=10, n_reg=100, early_stopping_eval_diff_scale=1, evaluationFunction='KL',
normalization=True, honesty=False, estimation_sample_size=0.5, random_state=None):
self.max_depth = max_depth
self.min_samples_leaf = min_samples_leaf
self.min_samples_treatment = min_samples_treatment
self.n_reg = n_reg
self.early_stopping_eval_diff_scale = early_stopping_eval_diff_scale
self.max_features = max_features

assert evaluationFunction in ['KL', 'ED', 'Chi', 'CTS', 'DDP', 'IT', 'CIT', 'IDDP'], \
@@ -282,7 +287,7 @@ class UpliftTreeClassifier:
self.honesty = True


def fit(self, X, treatment, y):
def fit(self, X, treatment, y, X_val=None, treatment_val=None, y_val=None):
""" Fit the uplift model.

Args
@@ -306,14 +311,23 @@ class UpliftTreeClassifier:
X, y = check_X_y(X, y)
treatment = np.asarray(treatment)
assert len(y) == len(treatment), 'Data length must be equal for X, treatment, and y.'

if X_val is not None:
X_val, y_val = check_X_y(X_val, y_val)
treatment_val = np.asarray(treatment_val)
assert len(y_val) == len(treatment_val), 'Data length must be equal for X_val, treatment_val, and y_val.'

# Get treatment group keys. self.classes_[0] is reserved for the control group.
treatment_groups = sorted([x for x in list(set(treatment)) if x != self.control_name])
self.classes_ = [self.control_name]
treatment_idx = np.zeros_like(treatment, dtype=int)
treatment_val_idx = None
if treatment_val is not None:
treatment_val_idx = np.zeros_like(treatment_val, dtype=int)
for i, tr in enumerate(treatment_groups, 1):
self.classes_.append(tr)
treatment_idx[treatment == tr] = i
if treatment_val_idx is not None:
treatment_val_idx[treatment_val == tr] = i
self.n_class = len(self.classes_)

self.feature_imp_dict = defaultdict(float)
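As a quick illustration of the label-to-index encoding above (a standalone sketch with made-up group names, not code from this PR), validation treatments are mapped with the same integer indices as the training treatments, index 0 being reserved for the control group:

import numpy as np

treatment = np.array(["control", "t1", "t2", "t1"])
treatment_val = np.array(["t2", "control"])

# Control takes index 0; treatment groups get 1, 2, ... in sorted order.
classes_ = ["control"] + sorted(set(treatment) - {"control"})
treatment_idx = np.zeros_like(treatment, dtype=int)
treatment_val_idx = np.zeros_like(treatment_val, dtype=int)
for i, tr in enumerate(classes_[1:], 1):
    treatment_idx[treatment == tr] = i
    treatment_val_idx[treatment_val == tr] = i

# treatment_idx     -> [0, 1, 2, 1]
# treatment_val_idx -> [2, 0]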
@@ -333,8 +347,9 @@ class UpliftTreeClassifier:
random_state=self.random_state)

self.fitted_uplift_tree = self.growDecisionTreeFrom(
X, treatment_idx, y,
max_depth=self.max_depth, min_samples_leaf=self.min_samples_leaf,
X, treatment_idx, y, X_val, treatment_val_idx, y_val,
max_depth=self.max_depth, early_stopping_eval_diff_scale=self.early_stopping_eval_diff_scale,
min_samples_leaf=self.min_samples_leaf,
depth=1, min_samples_treatment=self.min_samples_treatment,
n_reg=self.n_reg, parentNodeSummary=None
)
@@ -1118,7 +1133,8 @@ class UpliftTreeClassifier:
res.append(p)
return res

def growDecisionTreeFrom(self, X, treatment_idx, y, max_depth=10,
def growDecisionTreeFrom(self, X, treatment_idx, y, X_val, treatment_val_idx, y_val,
early_stopping_eval_diff_scale=1, max_depth=10,
min_samples_leaf=100, depth=1,
min_samples_treatment=10, n_reg=100,
parentNodeSummary=None):
@@ -1133,6 +1149,12 @@ class UpliftTreeClassifier:
An array containing the treatment group idx for each unit.
y : array-like, shape = [num_samples]
An array containing the outcome of interest for each unit.
X_val : ndarray, shape = [num_samples, num_features]
An ndarray of the covariates used to validate the uplift model.
treatment_val_idx : array-like, shape = [num_samples]
An array containing the validation treatment group idx for each unit.
y_val : array-like, shape = [num_samples]
An array containing the validation outcome of interest for each unit.
max_depth: int, optional (default=10)
The maximum depth of the tree.
min_samples_leaf: int, optional (default=100)
@@ -1194,7 +1216,6 @@ class UpliftTreeClassifier:
else:
p_t = currentNodeSummary[suboptTreatment][0]
n_t = currentNodeSummary[suboptTreatment][1]

p_value = (1. - stats.norm.cdf(abs(p_c - p_t) / np.sqrt(p_t * (1 - p_t) / n_t + p_c * (1 - p_c) / n_c))) * 2
upliftScore = [maxDiff, p_value]

@@ -1223,6 +1244,7 @@ class UpliftTreeClassifier:

for value in lsUnique:
X_l, X_r, w_l, w_r, y_l, y_r = self.divideSet(X, treatment_idx, y, col, value)

# check the split validity on min_samples_leaf
if (len(X_l) < min_samples_leaf or len(X_r) < min_samples_leaf):
continue
@@ -1233,15 +1255,28 @@ class UpliftTreeClassifier:
min_samples_treatment=min_samples_treatment,
n_reg=n_reg,
parentNodeSummary=currentNodeSummary)

rightNodeSummary = self.tree_node_summary(w_r, y_r,
min_samples_treatment=min_samples_treatment,
min_samples_treatment=min_samples_treatment,
n_reg=n_reg,
parentNodeSummary=currentNodeSummary)

# check the split validity on min_samples_treatment
assert len(leftNodeSummary) == len(rightNodeSummary)

if X_val is not None:
X_val_l, X_val_r, w_val_l, w_val_r, y_val_l, y_val_r = self.divideSet(X_val, treatment_val_idx, y_val, col, value)
leftNodeSummary_val = self.tree_node_summary(w_val_l, y_val_l,
parentNodeSummary=currentNodeSummary)
rightNodeSummary_val = self.tree_node_summary(w_val_r, y_val_r,
parentNodeSummary=currentNodeSummary)
early_stopping_flag = False
for k in range(len(leftNodeSummary_val)):
if (abs(leftNodeSummary_val[k][0]-leftNodeSummary[k][0]) > min(leftNodeSummary_val[k][0],leftNodeSummary[k][0])/early_stopping_eval_diff_scale or
abs(rightNodeSummary_val[k][0]-rightNodeSummary[k][0]) > min(rightNodeSummary_val[k][0],rightNodeSummary[k][0])/early_stopping_eval_diff_scale):
early_stopping_flag = True
break
if early_stopping_flag:
continue

# check the split validity on min_samples_treatment
node_mst = min([stat[1] for stat in leftNodeSummary + rightNodeSummary])
if node_mst < min_samples_treatment:
continue
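To make the new early-stopping check concrete, here is a minimal standalone sketch of the criterion (illustrative only; should_stop_split is not a function in this PR). It assumes each node summary is a list of (probability, sample_count) pairs, one per treatment class, as produced by tree_node_summary; the PR applies the same test to both the left and right child summaries and skips the candidate split when either diverges.

def should_stop_split(train_summary, val_summary, eval_diff_scale=1.0):
    """Return True if any per-class probability diverges between train and
    validation by more than min(p_train, p_val) / eval_diff_scale."""
    for (p_train, _), (p_val, _) in zip(train_summary, val_summary):
        if abs(p_val - p_train) > min(p_val, p_train) / eval_diff_scale:
            return True
    return False

# Example: the child node looks different enough on validation data to skip the split.
train_summary = [(0.10, 400), (0.16, 380)]  # (probability, count) for control, treatment
val_summary = [(0.25, 100), (0.15, 95)]
assert should_stop_split(train_summary, val_summary)  # 0.15 > min(0.25, 0.10) / 1.0

Note that larger values of eval_diff_scale shrink the allowed train/validation gap, so splits are rejected more aggressively.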
@@ -1293,13 +1328,16 @@ class UpliftTreeClassifier:
norm_factor = self.normI(n_c, n_c_left, n_t, n_t_left, alpha=0.9)
else:
norm_factor = 1
gain = gain / norm_factor
gain = gain / norm_factor
if (gain > bestGain and len(X_l) > min_samples_leaf and len(X_r) > min_samples_leaf):
bestGain = gain
bestGainImp = gain_for_imp
bestAttribute = (col, value)
best_set_left = [X_l, w_l, y_l]
best_set_right = [X_r, w_r, y_r]
best_set_left = [X_l, w_l, y_l, None, None, None]
best_set_right = [X_r, w_r, y_r, None, None, None]
if X_val is not None:
best_set_left = [X_l, w_l, y_l, X_val_l, w_val_l, y_val_l]
best_set_right = [X_r, w_r, y_r, X_val_r, w_val_r, y_val_r]

dcY = {'impurity': '%.3f' % currentScore, 'samples': '%d' % len(X)}
# Add treatment size
@@ -1312,12 +1350,12 @@ class UpliftTreeClassifier:
if bestGain > 0 and depth < max_depth:
self.feature_imp_dict[bestAttribute[0]] += bestGainImp
trueBranch = self.growDecisionTreeFrom(
*best_set_left, max_depth, min_samples_leaf,
*best_set_left, self.early_stopping_eval_diff_scale, max_depth, min_samples_leaf,
depth + 1, min_samples_treatment=min_samples_treatment,
n_reg=n_reg, parentNodeSummary=currentNodeSummary
)
falseBranch = self.growDecisionTreeFrom(
*best_set_right, max_depth, min_samples_leaf,
*best_set_right, self.early_stopping_eval_diff_scale, max_depth, min_samples_leaf,
depth + 1, min_samples_treatment=min_samples_treatment,
n_reg=n_reg, parentNodeSummary=currentNodeSummary
)
@@ -1484,6 +1522,10 @@ class UpliftRandomForestClassifier:
weight (in terms of sample size) of the parent node influence on the
child node, only effective for 'KL', 'ED', 'Chi', 'CTS' methods.

early_stopping_eval_diff_scale: float, optional (default=1)
If the difference between the train and validation uplift scores exceeds
min(train_uplift_score, valid_uplift_score)/early_stopping_eval_diff_scale, the candidate split is rejected (early stopping).

control_name: string
The name of the control group (other experiment groups will be regarded as treatment groups)

@@ -1521,6 +1563,7 @@ class UpliftRandomForestClassifier:
min_samples_leaf=100,
min_samples_treatment=10,
n_reg=10,
early_stopping_eval_diff_scale=1,
evaluationFunction='KL',
normalization=True,
honesty=False,
@@ -1538,6 +1581,7 @@ class UpliftRandomForestClassifier:
self.min_samples_leaf = min_samples_leaf
self.min_samples_treatment = min_samples_treatment
self.n_reg = n_reg
self.early_stopping_eval_diff_scale = early_stopping_eval_diff_scale
self.evaluationFunction = evaluationFunction
self.control_name = control_name
self.normalization = normalization
@@ -1554,7 +1598,7 @@ class UpliftRandomForestClassifier:
if self.n_jobs == -1:
self.n_jobs = mp.cpu_count()

def fit(self, X, treatment, y):
def fit(self, X, treatment, y, X_val=None, treatment_val=None, y_val=None):
"""
Fit the UpliftRandomForestClassifier.

Expand All @@ -1568,6 +1612,15 @@ class UpliftRandomForestClassifier:

y : array-like, shape = [num_samples]
An array containing the outcome of interest for each unit.

X_val : ndarray, shape = [num_samples, num_features]
An ndarray of the covariates used to validate the uplift model.

treatment_val : array-like, shape = [num_samples]
An array containing the validation treatment group for each unit.

y_val : array-like, shape = [num_samples]
An array containing the validation outcome of interest for each unit.
"""
random_state = check_random_state(self.random_state)

@@ -1578,6 +1631,7 @@ class UpliftRandomForestClassifier:
min_samples_leaf=self.min_samples_leaf,
min_samples_treatment=self.min_samples_treatment,
n_reg=self.n_reg,
early_stopping_eval_diff_scale=self.early_stopping_eval_diff_scale,
evaluationFunction=self.evaluationFunction,
control_name=self.control_name,
normalization=self.normalization,
@@ -1595,21 +1649,30 @@ class UpliftRandomForestClassifier:

self.uplift_forest = (
Parallel(n_jobs=self.n_jobs, prefer=self.joblib_prefer)
(delayed(self.bootstrap)(X, treatment, y, tree) for tree in self.uplift_forest)
(delayed(self.bootstrap)(X, treatment, y, X_val, treatment_val, y_val, tree) for tree in self.uplift_forest)
)

all_importances = [tree.feature_importances_ for tree in self.uplift_forest]
self.feature_importances_ = np.mean(all_importances, axis=0)
self.feature_importances_ /= self.feature_importances_.sum() # normalize to add to 1

@staticmethod
def bootstrap(X, treatment, y, tree):
def bootstrap(X, treatment, y, X_val, treatment_val, y_val, tree):
random_state = check_random_state(tree.random_state)
bt_index = random_state.choice(len(X), len(X))
x_train_bt = X[bt_index]
y_train_bt = y[bt_index]
treatment_train_bt = treatment[bt_index]
tree.fit(X=x_train_bt, treatment=treatment_train_bt, y=y_train_bt)

if X_val is None:
tree.fit(X=x_train_bt, treatment=treatment_train_bt, y=y_train_bt)
else:
bt_val_index = random_state.choice(len(X_val), len(X_val))
x_val_bt = X_val[bt_val_index]
y_val_bt = y_val[bt_val_index]
treatment_val_bt = treatment_val[bt_val_index]

tree.fit(X=x_train_bt, treatment=treatment_train_bt, y=y_train_bt, X_val=x_val_bt, treatment_val=treatment_val_bt, y_val=y_val_bt)
return tree

@ignore_warnings(category=FutureWarning)
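For context, an illustrative end-to-end sketch of the new API (not part of the diff): fit the forest with a held-out validation set so each tree can stop splitting once its train and validation node summaries diverge. The data-generation helper and its default column names (treatment_group_key, conversion, control group "control") are assumptions based on causalml's synthetic-data utility.

from sklearn.model_selection import train_test_split
from causalml.dataset import make_uplift_classification
from causalml.inference.tree import UpliftRandomForestClassifier

# Synthetic uplift data with one control group and several treatment groups.
df, x_names = make_uplift_classification()
df_train, df_val = train_test_split(df, test_size=0.2, random_state=42)

clf = UpliftRandomForestClassifier(
    control_name="control",
    early_stopping_eval_diff_scale=1,
)
clf.fit(
    df_train[x_names].values,
    treatment=df_train["treatment_group_key"].values,
    y=df_train["conversion"].values,
    X_val=df_val[x_names].values,
    treatment_val=df_val["treatment_group_key"].values,
    y_val=df_val["conversion"].values,
)
uplift_preds = clf.predict(df_val[x_names].values)  # per-treatment uplift estimates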
40 changes: 31 additions & 9 deletions tests/test_uplift_trees.py
@@ -19,11 +19,24 @@ def test_make_uplift_classification(generate_classification_data):

@pytest.mark.parametrize("backend", ["loky", "threading", "multiprocessing"])
@pytest.mark.parametrize("joblib_prefer", ["threads", "processes"])
@pytest.mark.parametrize("early_stopping", ["true", "false"])
def test_UpliftRandomForestClassifier(
generate_classification_data, backend, joblib_prefer
generate_classification_data, backend, joblib_prefer, early_stopping
):
df, x_names = generate_classification_data()
df_train, df_test = train_test_split(df, test_size=0.2, random_state=RANDOM_SEED)
df_train, df_test, df_val = None, None, None

if early_stopping == "true":
df_train, df_test_val = train_test_split(
df, test_size=0.2, random_state=RANDOM_SEED
)
df_test, df_val = train_test_split(
df_test_val, test_size=0.5, random_state=RANDOM_SEED
)
else:
df_train, df_test = train_test_split(
df, test_size=0.2, random_state=RANDOM_SEED
)

with parallel_backend(backend):
# Train the UpLift Random Forest classifier
@@ -32,14 +45,23 @@ def test_UpliftRandomForestClassifier(
control_name=TREATMENT_NAMES[0],
random_state=RANDOM_SEED,
joblib_prefer=joblib_prefer,
early_stopping_eval_diff_scale=1,
)

uplift_model.fit(
df_train[x_names].values,
treatment=df_train["treatment_group_key"].values,
y=df_train[CONVERSION].values,
)

if early_stopping == "true":
uplift_model.fit(
df_train[x_names].values,
treatment=df_train["treatment_group_key"].values,
y=df_train[CONVERSION].values,
X_val=df_val[x_names].values,
treatment_val=df_val["treatment_group_key"].values,
y_val=df_val[CONVERSION].values,
)
else:
uplift_model.fit(
df_train[x_names].values,
treatment=df_train["treatment_group_key"].values,
y=df_train[CONVERSION].values,
)
predictions = {}
predictions["single"] = uplift_model.predict(df_test[x_names].values)
with parallel_backend("loky", n_jobs=2):