Skip to content

Commit

Permalink
limit pandas version to <=2.1.4 (pandas 2.2 introduced issues with the query function)
Browse files Browse the repository at this point in the history
  • Loading branch information
csinva committed Feb 29, 2024
1 parent b250a5f commit 928f2e5
Show file tree
Hide file tree
Showing 3 changed files with 34 additions and 17 deletions.
30 changes: 22 additions & 8 deletions imodels/rule_set/rule_fit.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,8 @@ def fit(self, X, y=None, feature_names=None):
self.feature_names = np.array(list(self.feature_dict_.values()))

extracted_rules = self._extract_rules(X, y)
self.rules_without_feature_names_, self.coef, self.intercept = self._score_rules(X, y, extracted_rules)
self.rules_without_feature_names_, self.coef, self.intercept = self._score_rules(
X, y, extracted_rules)
self.rules_ = [
replace_feature_name(rule, self.feature_dict_) for rule in self.rules_without_feature_names_
]
Expand Down Expand Up @@ -160,7 +161,8 @@ def predict_proba(self, X):
X = X.toarray()
X = check_array(X)
continuous_output = self._predict_continuous_output(X)
logits = np.vstack((1 - continuous_output, continuous_output)).transpose()
logits = np.vstack(
(1 - continuous_output, continuous_output)).transpose()
return softmax(logits, axis=1)

def transform(self, X=None, rules=None):
Expand All @@ -178,9 +180,15 @@ def transform(self, X=None, rules=None):
Transformed data set
"""
df = pd.DataFrame(X, columns=self.feature_placeholders)
print('df', df.dtypes, df.head())
X_transformed = np.zeros((X.shape[0], len(rules)))
for i, r in enumerate(rules):
features_r_uses = [term.split(' ')[0] for term in r.split(' and ')]
# print('r', r)
# print('feats', df[features_r_uses])
# print('ans', df[features_r_uses].query(r))
# print(
# 'tra', X_transformed[df[features_r_uses].query(r).index.values, i])
X_transformed[df[features_r_uses].query(r).index.values, i] = 1
return X_transformed

Expand Down Expand Up @@ -216,21 +224,26 @@ def _get_rules(self, exclude_zero_coef=False, subregion=None):
subregion = np.array(subregion)
importance = sum(abs(coef) * abs([x[i] for x in self.winsorizer.trim(subregion)] - self.mean[i])) / len(
subregion)
output_rules += [(self.feature_names[i], 'linear', coef, 1, importance)]
output_rules += [(self.feature_names[i],
'linear', coef, 1, importance)]

# Add rules
for i in range(0, len(self.rules_)):
rule = rule_ensemble[i]
coef = self.coef[i + n_features]

if subregion is None:
importance = abs(coef) * (rule.support * (1 - rule.support)) ** (1 / 2)
importance = abs(coef) * (rule.support *
(1 - rule.support)) ** (1 / 2)
else:
rkx = self.transform(subregion, [rule])[:, -1]
importance = sum(abs(coef) * abs(rkx - rule.support)) / len(subregion)
importance = sum(
abs(coef) * abs(rkx - rule.support)) / len(subregion)

output_rules += [(self.rules_[i].rule, 'rule', coef, rule.support, importance)]
rules = pd.DataFrame(output_rules, columns=["rule", "type", "coef", "support", "importance"])
output_rules += [(self.rules_[i].rule, 'rule',
coef, rule.support, importance)]
rules = pd.DataFrame(output_rules, columns=[
"rule", "type", "coef", "support", "importance"])
if exclude_zero_coef:
rules = rules.ix[rules.coef != 0]
return rules
Expand Down Expand Up @@ -292,7 +305,8 @@ def _score_rules(self, X, y, rules) -> Tuple[List[Rule], List[float], float]:
# no rules fit and self.include_linear == False
if X_concat.shape[1] == 0:
return [], [], 0
prediction_task = 'regression' if isinstance(self, RegressorMixin) else 'classification'
prediction_task = 'regression' if isinstance(
self, RegressorMixin) else 'classification'
return score_linear(X_concat, y, rules,
prediction_task=prediction_task,
max_rules=self.max_rules,
Expand Down
6 changes: 3 additions & 3 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,12 @@

required_pypi = [
'matplotlib',
'mlxtend>=0.18.0', # some lower version are missing fpgrowth
'mlxtend>=0.18.0', # some lower versions are missing fpgrowth
'numpy',
'pandas',
'pandas<=2.1.4', # pandas 2.2 introduced some issues with the query function
'requests', # used in c4.5
'scipy',
'scikit-learn>=1.2.0', # recently updates this
'scikit-learn>=1.2.0', # recently updated this
'tqdm', # used in BART
]

Expand Down
15 changes: 9 additions & 6 deletions tests/classification_continuous_inputs_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,18 +8,20 @@ class TestClassClassificationContinuousInputs:
'''Tests simple classification for different models. Note: still doesn't test all the models!
'''

def setup(self):
def setup_method(self):
np.random.seed(13)
random.seed(13)
self.n = 40
self.p = 2
self.X_classification_binary = np.random.randn(self.n, self.p)

# y = x0 > 0
self.y_classification_binary = (self.X_classification_binary[:, 0] > 0).astype(int)
self.y_classification_binary = (
self.X_classification_binary[:, 0] > 0).astype(int)

# flip labels for last few
self.y_classification_binary[-2:] = 1 - self.y_classification_binary[-2:]
self.y_classification_binary[-2:] = 1 - \
self.y_classification_binary[-2:]

def test_classification_binary(self):
'''Test imodels on basic binary classification task
Expand Down Expand Up @@ -58,7 +60,8 @@ def test_classification_binary(self):
preds_proba = m.predict_proba(X)
assert len(preds_proba.shape) == 2, 'preds_proba has 2 columns'
assert preds_proba.shape[1] == 2, 'preds_proba has 2 columns'
assert np.max(preds_proba) < 1.1, 'preds_proba has no values over 1'
assert np.max(
preds_proba) < 1.1, 'preds_proba has no values over 1'
assert (np.argmax(preds_proba, axis=1) == preds).all(), ("predict_proba and "
"predict agree")

Expand All @@ -70,5 +73,5 @@ def test_classification_binary(self):

if __name__ == '__main__':
t = TestClassClassificationContinuousInputs()
t.setup()
t.setup_method()
t.test_classification_binary()

0 comments on commit 928f2e5

Please sign in to comment.