diff --git a/imodels/experimental/figs_ensembles.py b/imodels/experimental/figs_ensembles.py
index 634e6048..ea6fa6c6 100644
--- a/imodels/experimental/figs_ensembles.py
+++ b/imodels/experimental/figs_ensembles.py
@@ -2,7 +2,6 @@
 import numpy as np
 from matplotlib import pyplot as plt
-import sklearn
 from sklearn import datasets
 from sklearn import tree
 from sklearn.base import BaseEstimator
@@ -73,22 +72,18 @@ def setattrs(self, **kwargs):
             setattr(self, k, v)
 
     def __str__(self):
-        try:
-            sklearn.utils.validation.check_is_fitted(self)
-            if self.split_or_linear == 'linear':
-                if self.is_root:
-                    return f'X_{self.feature} * {self.value:0.3f} (Tree #{self.tree_num} linear root)'
-                else:
-                    return f'X_{self.feature} * {self.value:0.3f} (linear)'
+        if self.split_or_linear == 'linear':
+            if self.is_root:
+                return f'X_{self.feature} * {self.value:0.3f} (Tree #{self.tree_num} linear root)'
             else:
-                if self.is_root:
-                    return f'X_{self.feature} <= {self.threshold:0.3f} (Tree #{self.tree_num} root)'
-                elif self.left is None and self.right is None:
-                    return f'Val: {self.value[0][0]:0.3f} (leaf)'
-                else:
-                    return f'X_{self.feature} <= {self.threshold:0.3f} (split)'
-        except ValueError:
-            return self.__class__.__name__
+                return f'X_{self.feature} * {self.value:0.3f} (linear)'
+        else:
+            if self.is_root:
+                return f'X_{self.feature} <= {self.threshold:0.3f} (Tree #{self.tree_num} root)'
+            elif self.left is None and self.right is None:
+                return f'Val: {self.value[0][0]:0.3f} (leaf)'
+            else:
+                return f'X_{self.feature} <= {self.threshold:0.3f} (split)'
 
     def __repr__(self):
         return self.__str__()
@@ -422,17 +417,13 @@ def _tree_to_str(self, root: Node, prefix=''):
                                 pprefix)
 
     def __str__(self):
-        try:
-            sklearn.utils.validation.check_is_fitted(self)
-            s = '------------\n' + \
-                '\n\t+\n'.join([self._tree_to_str(t) for t in self.trees_])
-            if hasattr(self, 'feature_names_') and self.feature_names_ is not None:
-                for i in range(len(self.feature_names_))[::-1]:
-                    s = s.replace(f'X_{i}', self.feature_names_[i])
-            return s
-        except ValueError:
-            return self.__class__.__name__
-
+        s = '------------\n' + \
+            '\n\t+\n'.join([self._tree_to_str(t) for t in self.trees_])
+        if hasattr(self, 'feature_names_') and self.feature_names_ is not None:
+            for i in range(len(self.feature_names_))[::-1]:
+                s = s.replace(f'X_{i}', self.feature_names_[i])
+        return s
+
     def predict(self, X):
         if self.posthoc_ridge and self.weighted_model_:  # note, during fitting don't use the weighted moel
             X_feats = self._extract_tree_predictions(X)
diff --git a/imodels/rule_list/corels_wrapper.py b/imodels/rule_list/corels_wrapper.py
index f968e3f5..39788df6 100644
--- a/imodels/rule_list/corels_wrapper.py
+++ b/imodels/rule_list/corels_wrapper.py
@@ -4,7 +4,6 @@
 import numpy as np
 import pandas as pd
-import sklearn
 from sklearn.preprocessing import KBinsDiscretizer
 
 from imodels.rule_list.greedy_rule_list import GreedyRuleListClassifier
@@ -234,18 +233,14 @@ def _traverse_rule(self, X: np.ndarray, y: np.ndarray, feature_names: List[str],
         self.str_print = str_print
 
     def __str__(self):
-        try:
-            sklearn.utils.validation.check_is_fitted(self)
-            if corels_supported:
-                if self.str_print is not None:
-                    return 'OptimalRuleList:\n\n' + self.str_print
-                else:
-                    return 'OptimalRuleList:\n\n' + self.rl_.__str__()
+        if corels_supported:
+            if self.str_print is not None:
+                return 'OptimalRuleList:\n\n' + self.str_print
             else:
-                return super().__str__()
-        except ValueError:
-            return self.__class__.__name__
-
+                return 'OptimalRuleList:\n\n' + self.rl_.__str__()
+        else:
+            return super().__str__()
+
     def _get_complexity(self):
         return sum([len(corule['antecedents']) for corule in self.rl_.rules])
 
diff --git a/imodels/rule_list/greedy_rule_list.py b/imodels/rule_list/greedy_rule_list.py
index 962f9998..0991e2f4 100644
--- a/imodels/rule_list/greedy_rule_list.py
+++ b/imodels/rule_list/greedy_rule_list.py
@@ -8,7 +8,6 @@
 from copy import deepcopy
 
 import numpy as np
-import sklearn
 from sklearn.base import BaseEstimator, ClassifierMixin
 from sklearn.utils.multiclass import unique_labels
 from sklearn.utils.validation import check_array, check_is_fitted
@@ -141,43 +140,48 @@ def predict(self, X):
         X = check_array(X)
         return np.argmax(self.predict_proba(X), axis=1)
 
+    """
+    def __str__(self):
+        # s = ''
+        # for rule in self.rules_:
+        #     s += f"mean {rule['val'].round(3)} ({rule['num_pts']} pts)\n"
+        #     if 'col' in rule:
+        #         s += f"if {rule['col']} >= {rule['cutoff']} then {rule['val_right'].round(3)} ({rule['num_pts_right']} pts)\n"
+        # return s
+    """
+
     def __str__(self):
         '''Print out the list in a nice way
         '''
-        try:
-            sklearn.utils.validation.check_is_fitted(self)
-            s = '> ------------------------------\n> Greedy Rule List\n> ------------------------------\n'
-
-            def red(s):
-                # return f"\033[91m{s}\033[00m"
-                return s
-
-            def cyan(s):
-                # return f"\033[96m{s}\033[00m"
-                return s
-
-            def rule_name(rule):
-                if rule['flip']:
-                    return '~' + rule['col']
-                return rule['col']
-
-            # rule = self.rules_[0]
-            # s += f"{red((100 * rule['val']).round(3))}% IwI ({rule['num_pts']} pts)\n"
-            for rule in self.rules_:
-                s += u'\u2193\n' + f"{cyan((100 * rule['val']).round(2))}% risk ({rule['num_pts']} pts)\n"
-                # s += f"\t{'Else':>45} => {cyan((100 * rule['val']).round(2)):>6}% IwI ({rule['val'] * rule['num_pts']:.0f}/{rule['num_pts']} pts)\n"
-                if 'col' in rule:
-                    # prefix = f"if {rule['col']} >= {rule['cutoff']}"
-                    prefix = f"if {rule_name(rule)}"
-                    val = f"{100 * rule['val_right'].round(3)}"
-                    s += f"\t{prefix} ==> {red(val)}% risk ({rule['num_pts_right']} pts)\n"
-            # rule = self.rules_[-1]
-            # s += f"{red((100 * rule['val']).round(3))}% IwI ({rule['num_pts']} pts)\n"
+        s = '> ------------------------------\n> Greedy Rule List\n> ------------------------------\n'
+
+        def red(s):
+            # return f"\033[91m{s}\033[00m"
+            return s
+
+        def cyan(s):
+            # return f"\033[96m{s}\033[00m"
             return s
-        except ValueError:
-            return self.__class__.__name__
-
+
+        def rule_name(rule):
+            if rule['flip']:
+                return '~' + rule['col']
+            return rule['col']
+
+        # rule = self.rules_[0]
+        # s += f"{red((100 * rule['val']).round(3))}% IwI ({rule['num_pts']} pts)\n"
+        for rule in self.rules_:
+            s += u'\u2193\n' + f"{cyan((100 * rule['val']).round(2))}% risk ({rule['num_pts']} pts)\n"
+            # s += f"\t{'Else':>45} => {cyan((100 * rule['val']).round(2)):>6}% IwI ({rule['val'] * rule['num_pts']:.0f}/{rule['num_pts']} pts)\n"
+            if 'col' in rule:
+                # prefix = f"if {rule['col']} >= {rule['cutoff']}"
+                prefix = f"if {rule_name(rule)}"
+                val = f"{100 * rule['val_right'].round(3)}"
+                s += f"\t{prefix} ==> {red(val)}% risk ({rule['num_pts_right']} pts)\n"
+        # rule = self.rules_[-1]
+        # s += f"{red((100 * rule['val']).round(3))}% IwI ({rule['num_pts']} pts)\n"
+        return s
+
     ######## HERE ONWARDS CUSTOM SPLITTING (DEPRECATED IN FAVOR OF SKLEARN STUMP) ########
     ######################################################################################
     def _find_best_split(self, x, y):
diff --git a/imodels/rule_set/brs.py b/imodels/rule_set/brs.py
index a65c2abb..933864ca 100644
--- a/imodels/rule_set/brs.py
+++ b/imodels/rule_set/brs.py
@@ -18,7 +18,6 @@
 from numpy.random import random
 from pandas import read_csv
 from scipy.sparse import csc_matrix
-import sklearn
 from sklearn.base import BaseEstimator, ClassifierMixin
 from sklearn.ensemble import RandomForestClassifier
 from sklearn.utils.multiclass import check_classification_targets
@@ -193,12 +192,8 @@ def fit(self, X, y, feature_names: list = None, init=[], verbose=False):
         return self
 
     def __str__(self):
-        try:
-            sklearn.utils.validation.check_is_fitted(self)
-            return ' '.join(str(r) for r in self.rules_)
-        except ValueError:
-            return self.__class__.__name__
-
+        return ' '.join(str(r) for r in self.rules_)
+
     def predict(self, X):
         check_is_fitted(self)
         if isinstance(X, np.ndarray):
diff --git a/imodels/rule_set/rule_fit.py b/imodels/rule_set/rule_fit.py
index a6d05d2f..dee403e7 100644
--- a/imodels/rule_set/rule_fit.py
+++ b/imodels/rule_set/rule_fit.py
@@ -13,7 +13,6 @@
 import pandas as pd
 import scipy
 from scipy.special import softmax
-import sklearn
 from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin
 from sklearn.base import TransformerMixin
 from sklearn.utils.multiclass import unique_labels
@@ -243,16 +242,12 @@ def visualize(self, decimals=2):
         return rules[['rule', 'coef']].round(decimals)
 
     def __str__(self):
-        try:
-            sklearn.utils.validation.check_is_fitted(self)
-            s = '> ------------------------------\n'
-            s += '> RuleFit:\n'
-            s += '> \tPredictions are made by summing the coefficients of each rule\n'
-            s += '> ------------------------------\n'
-            return s + self.visualize().to_string(index=False) + '\n'
-        except ValueError:
-            return self.__class__.__name__
-
+        s = '> ------------------------------\n'
+        s += '> RuleFit:\n'
+        s += '> \tPredictions are made by summing the coefficients of each rule\n'
+        s += '> ------------------------------\n'
+        return s + self.visualize().to_string(index=False) + '\n'
+
     def _extract_rules(self, X, y) -> List[str]:
         return extract_rulefit(X, y,
                                feature_names=self.feature_placeholders,
diff --git a/imodels/tree/cart_wrapper.py b/imodels/tree/cart_wrapper.py
index 2f6f7021..7bb9ec93 100644
--- a/imodels/tree/cart_wrapper.py
+++ b/imodels/tree/cart_wrapper.py
@@ -1,7 +1,6 @@
 # This is just a simple wrapper around sklearn decisiontree
 # https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html
 
-import sklearn
 from sklearn.tree import DecisionTreeClassifier, export_text, DecisionTreeRegressor
 
 from imodels.util.arguments import check_fit_arguments
@@ -49,18 +48,15 @@ def _set_complexity(self):
         self.complexity_ = compute_tree_complexity(self.tree_)
 
     def __str__(self):
-        try:
-            sklearn.utils.validation.check_is_fitted(self)
-            s = '> ------------------------------\n'
-            s += '> Greedy CART Tree:\n'
-            s += '> \tPrediction is made by looking at the value in the appropriate leaf of the tree\n'
-            s += '> ------------------------------' + '\n'
-            if hasattr(self, 'feature_names') and self.feature_names is not None:
-                return s + export_text(self, feature_names=self.feature_names, show_weights=True)
-            else:
-                return s + export_text(self, show_weights=True)
-        except ValueError:
-            return self.__class__.__name__
+        s = '> ------------------------------\n'
+        s += '> Greedy CART Tree:\n'
+        s += '> \tPrediction is made by looking at the value in the appropriate leaf of the tree\n'
+        s += '> ------------------------------' + '\n'
+        if hasattr(self, 'feature_names') and self.feature_names is not None:
+            return s + export_text(self, feature_names=self.feature_names, show_weights=True)
+        else:
+            return s + export_text(self, show_weights=True)
+
 
 class GreedyTreeRegressor(DecisionTreeRegressor):
     """Wrapper around sklearn greedy tree regressor
@@ -102,11 +98,7 @@ def _set_complexity(self):
         self.complexity_ = compute_tree_complexity(self.tree_)
 
     def __str__(self):
-        try:
-            sklearn.utils.validation.check_is_fitted(self)
-            if hasattr(self, 'feature_names') and self.feature_names is not None:
-                return 'GreedyTree:\n' + export_text(self, feature_names=self.feature_names, show_weights=True)
-            else:
-                return 'GreedyTree:\n' + export_text(self, show_weights=True)
-        except ValueError:
-            return self.__class__.__name__
+        if hasattr(self, 'feature_names') and self.feature_names is not None:
+            return 'GreedyTree:\n' + export_text(self, feature_names=self.feature_names, show_weights=True)
+        else:
+            return 'GreedyTree:\n' + export_text(self, show_weights=True)
\ No newline at end of file
diff --git a/imodels/tree/figs.py b/imodels/tree/figs.py
index 2c2e5104..7baacb9b 100644
--- a/imodels/tree/figs.py
+++ b/imodels/tree/figs.py
@@ -5,7 +5,6 @@
 import numpy as np
 import pandas as pd
 from scipy.special import expit
-import sklearn
 from sklearn import datasets
 from sklearn import tree
 from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin
@@ -52,17 +51,13 @@ def setattrs(self, **kwargs):
             setattr(self, k, v)
 
     def __str__(self):
-        try:
-            sklearn.utils.validation.check_is_fitted(self)
-            if self.is_root:
-                return f'X_{self.feature} <= {self.threshold:0.3f} (Tree #{self.tree_num} root)'
-            elif self.left is None and self.right is None:
-                return f'Val: {self.value[0][0]:0.3f} (leaf)'
-            else:
-                return f'X_{self.feature} <= {self.threshold:0.3f} (split)'
-        except ValueError:
-            return self.__class__.__name__
-
+        if self.is_root:
+            return f'X_{self.feature} <= {self.threshold:0.3f} (Tree #{self.tree_num} root)'
+        elif self.left is None and self.right is None:
+            return f'Val: {self.value[0][0]:0.3f} (leaf)'
+        else:
+            return f'X_{self.feature} <= {self.threshold:0.3f} (split)'
+
     def print_root(self, y):
         try:
             one_count = pd.Series(y).value_counts()[1.0]
@@ -77,6 +72,8 @@ def print_root(self, y):
         else:
             return f'X_{self.feature} <= {self.threshold:0.3f}' + one_proportion
 
+    def __repr__(self):
+        return self.__str__()
 
 class FIGS(BaseEstimator):
@@ -414,21 +411,17 @@ def _tree_to_str_with_data(self, X, y, root: Node, prefix=''):
                 self._tree_to_str_with_data(X[~left], y[~left], root.right,
                                             pprefix))
 
     def __str__(self):
-        try:
-            sklearn.utils.validation.check_is_fitted(self)
-            s = '> ------------------------------\n'
-            s += '> FIGS-Fast Interpretable Greedy-Tree Sums:\n'
-            s += '> \tPredictions are made by summing the "Val" reached by traversing each tree.\n'
-            s += '> \tFor classifiers, a sigmoid function is then applied to the sum.\n'
-            s += '> ------------------------------\n'
-            s += '\n\t+\n'.join([self._tree_to_str(t) for t in self.trees_])
-            if hasattr(self, 'feature_names_') and self.feature_names_ is not None:
-                for i in range(len(self.feature_names_))[::-1]:
-                    s = s.replace(f'X_{i}', self.feature_names_[i])
-            return s
-        except ValueError:
-            return self.__class__.__name__
-
+        s = '> ------------------------------\n'
+        s += '> FIGS-Fast Interpretable Greedy-Tree Sums:\n'
+        s += '> \tPredictions are made by summing the "Val" reached by traversing each tree.\n'
+        s += '> \tFor classifiers, a sigmoid function is then applied to the sum.\n'
+        s += '> ------------------------------\n'
+        s += '\n\t+\n'.join([self._tree_to_str(t) for t in self.trees_])
+        if hasattr(self, 'feature_names_') and self.feature_names_ is not None:
+            for i in range(len(self.feature_names_))[::-1]:
+                s = s.replace(f'X_{i}', self.feature_names_[i])
+        return s
+
     def print_tree(self, X, y, feature_names=None):
         s = '------------\n' + \
             '\n\t+\n'.join([self._tree_to_str_with_data(X, y, t)
diff --git a/imodels/tree/hierarchical_shrinkage.py b/imodels/tree/hierarchical_shrinkage.py
index bfe99c33..ac525438 100644
--- a/imodels/tree/hierarchical_shrinkage.py
+++ b/imodels/tree/hierarchical_shrinkage.py
@@ -1,33 +1,25 @@
+import time
 from copy import deepcopy
 from typing import List
 
 import numpy as np
-import sklearn
 from sklearn import datasets
-from sklearn.base import BaseEstimator, RegressorMixin, ClassifierMixin, clone
-from sklearn.metrics import r2_score
-from sklearn.model_selection import cross_val_score
+from sklearn.base import BaseEstimator, RegressorMixin, ClassifierMixin
+from sklearn.metrics import r2_score, mean_squared_error, log_loss
+from sklearn.model_selection import cross_val_score, KFold
 from sklearn.model_selection import train_test_split
 from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier, \
     export_text
-from sklearn.utils import check_X_y
-from sklearn.ensemble import GradientBoostingClassifier
+from sklearn.ensemble import GradientBoostingClassifier, RandomForestRegressor
 
 from imodels.util import checks
 from imodels.util.arguments import check_fit_arguments
 from imodels.util.tree import compute_tree_complexity
 
-# leading and traiing undescores
-# https://github.com/rasbt/python-machine-learning-book/blob/master/faq/underscore-convention.md
-# developer guideline
-# https://scikit-learn.org/stable/developers/contributing.html#estimated-attributes
-# https://scikit-learn.org/stable/developers/contributing.html
-
-
-class HSTree(BaseEstimator):
-    def __init__(self, estimator=None,
-                 reg_param: float = 1, shrinkage_scheme_: str = 'node_based'):
+class HSTree:
+    def __init__(self, estimator_: BaseEstimator = DecisionTreeClassifier(max_leaf_nodes=20),
+                 reg_param: float = 1, shrinkage_scheme_: str = 'node_based'):
         """HSTree (Tree with hierarchical shrinkage applied).
         Hierarchical shinkage is an extremely fast post-hoc regularization method which works on any decision tree (or tree-based ensemble, such as Random Forest).
         It does not modify the tree structure, and instead regularizes the tree by shrinking the prediction over each node towards the sample means of its ancestors (using a single regularization parameter).
@@ -42,7 +34,7 @@ def __init__(self, estimator=None,
 
         reg_param: float
             Higher is more regularization (can be arbitrarily large, should not be < 0)
-
+
         shrinkage_scheme: str
             Experimental: Used to experiment with different forms of shrinkage. options are:
             (i) node_based shrinks based on number of samples in parent node
@@ -51,35 +43,24 @@ def __init__(self, estimator=None,
         """
         super().__init__()
         self.reg_param = reg_param
-        self.estimator = estimator
+        self.estimator_ = estimator_
         self.shrinkage_scheme_ = shrinkage_scheme_
-
-
-    def _validate_estimator(self, default=None):
-        """Check the base estimator.
-
-        Sets the `estimator_` attributes.
-        """
-        if self.estimator is not None:
-            self.estimator_ = self.estimator
-        else:
-            self.estimator_ = default
-
-    def fit(self, X, y, sample_weight=None, *args, **kwargs):
-        self._validate_estimator()
-        if checks.check_is_fitted(self.estimator_):
-            self._shrink()
-        else:
-            # remove feature_names if it exists (note: only works as keyword-arg)
-            feature_names = kwargs.pop('feature_names', None)  # None returned if not passed
-            X, y, feature_names = check_fit_arguments(self, X, y, feature_names)
-            X, y = check_X_y(X,y)
-            self.estimator_.fit(X, y, *args, sample_weight=sample_weight, **kwargs)
-            self._shrink()
+    def get_params(self, deep=True):
+        if deep:
+            return deepcopy({'reg_param': self.reg_param, 'estimator_': self.estimator_,
+                             'shrinkage_scheme_': self.shrinkage_scheme_})
+        return {'reg_param': self.reg_param, 'estimator_': self.estimator_,
+                'shrinkage_scheme_': self.shrinkage_scheme_}
+
+    def fit(self, X, y, sample_weight=None, *args, **kwargs):
+        # remove feature_names if it exists (note: only works as keyword-arg)
+        feature_names = kwargs.pop('feature_names', None)  # None returned if not passed
+        X, y, feature_names = check_fit_arguments(self, X, y, feature_names)
+        self.estimator_ = self.estimator_.fit(X, y, *args, sample_weight=sample_weight, **kwargs)
+        self._shrink()
 
         # compute complexity
         if hasattr(self.estimator_, 'tree_'):
@@ -92,7 +73,6 @@ def fit(self, X, y, sample_weight=None, *args, **kwargs):
                 assert t.size == 1, 'multiple trees stored under tree_?'
                 t = t[0]
                 self.complexity_ += compute_tree_complexity(t.tree_)
-
         return self
 
     def _shrink_tree(self, tree, reg_param, i=0, parent_val=None, parent_num=None,
                      cum_sum=0):
@@ -106,7 +86,7 @@ def _shrink_tree(self, tree, reg_param, i=0, parent_val=None, parent_num=None, c
         n_samples = tree.weighted_n_node_samples[i]
         if isinstance(self, RegressorMixin) or isinstance(self.estimator_, GradientBoostingClassifier):
             val = deepcopy(tree.value[i, :, :])
-        else: # If classification, normalize to probability vector
+        else:  # If classification, normalize to probability vector
             val = tree.value[i, :, :] / n_samples
 
         # Step 1: Update cum_sum
@@ -120,15 +100,15 @@ def _shrink_tree(self, tree, reg_param, i=0, parent_val=None, parent_num=None, c
                 val_new = (val - parent_val) / (1 + reg_param / parent_num)
             elif self.shrinkage_scheme_ == 'constant':
                 val_new = (val - parent_val) / (1 + reg_param)
-            else: # leaf_based
+            else:  # leaf_based
                 val_new = 0
             cum_sum += val_new
 
         # Step 2: Update node values
         if self.shrinkage_scheme_ == 'node_based' or self.shrinkage_scheme_ == 'constant':
             tree.value[i, :, :] = cum_sum
-        else: # leaf_based
-            if is_leaf: # update node values if leaf_based
+        else:  # leaf_based
+            if is_leaf:  # update node values if leaf_based
                 root_val = tree.value[0, :, :]
                 tree.value[i, :, :] = root_val + (val - root_val) / (1 + reg_param / n_samples)
             else:
@@ -137,11 +117,11 @@ def _shrink_tree(self, tree, reg_param, i=0, parent_val=None, parent_num=None, c
         # Step 3: Recurse if not leaf
         if not is_leaf:
             self._shrink_tree(tree, reg_param, left,
-                          parent_val=val, parent_num=n_samples, cum_sum=deepcopy(cum_sum))
+                              parent_val=val, parent_num=n_samples, cum_sum=deepcopy(cum_sum))
             self._shrink_tree(tree, reg_param, right,
-                          parent_val=val, parent_num=n_samples, cum_sum=deepcopy(cum_sum))
+                              parent_val=val, parent_num=n_samples, cum_sum=deepcopy(cum_sum))
 
-        # edit the non-leaf nodes for later visualization (doesn't effect predictions)
+        # edit the non-leaf nodes for later visualization (doesn't affect predictions)
 
         return tree
 
@@ -171,88 +151,93 @@ def score(self, X, y, *args, **kwargs):
         return NotImplemented
 
     def __str__(self):
-        try:
-            sklearn.utils.validation.check_is_fitted(self)
-            s = '> ------------------------------\n'
-            s += '> Decision Tree with Hierarchical Shrinkage\n'
-            s += '> \tPrediction is made by looking at the value in the appropriate leaf of the tree\n'
-            s += '> ------------------------------' + '\n'
-            if hasattr(self, 'feature_names') and self.feature_names is not None:
-                return s + export_text(self.estimator_, feature_names=self.feature_names, show_weights=True)
-            else:
-                return s + export_text(self.estimator_, show_weights=True)
-        except:
-            return self.__class__.__name__
+        s = '> ------------------------------\n'
+        s += '> Decision Tree with Hierarchical Shrinkage\n'
+        s += '> \tPrediction is made by looking at the value in the appropriate leaf of the tree\n'
+        s += '> ------------------------------' + '\n'
+        if hasattr(self, 'feature_names') and self.feature_names is not None:
+            return s + export_text(self.estimator_, feature_names=self.feature_names, show_weights=True)
+        else:
+            return s + export_text(self.estimator_, show_weights=True)
 
     def __repr__(self):
-        try:
-            sklearn.utils.validation.check_is_fitted(self)
-            # s = self.__class__.__name__
-            # s += "("
-            # s += "estimator_="
-            # s += repr(self.estimator_)
-            # s += ", "
-            # s += "reg_param="
-            # s += str(self.reg_param)
-            # s += ", "
-            # s += "shrinkage_scheme_="
-            # s += self.shrinkage_scheme_
-            # s += ")"
-            # return s
-            attr_list = ["estimator_", "reg_param", "shrinkage_scheme_"]
-            s = self.__class__.__name__
-            s += "("
-            for attr in attr_list:
-                s += attr + "=" + repr(getattr(self, attr)) + ", "
-            s = s[:-2] + ")"
-            return s
-        except :
-            return self.__class__.__name__
+        # s = self.__class__.__name__
+        # s += "("
+        # s += "estimator_="
+        # s += repr(self.estimator_)
+        # s += ", "
+        # s += "reg_param="
+        # s += str(self.reg_param)
+        # s += ", "
+        # s += "shrinkage_scheme_="
+        # s += self.shrinkage_scheme_
+        # s += ")"
+        # return s
+        attr_list = ["estimator_", "reg_param", "shrinkage_scheme_"]
+        s = self.__class__.__name__
+        s += "("
+        for attr in attr_list:
+            s += attr + "=" + repr(getattr(self, attr)) + ", "
+        s = s[:-2] + ")"
+        return s
+
+
+class HSTreeRegressor(HSTree, RegressorMixin):
+    def __init__(self, estimator_: BaseEstimator = DecisionTreeRegressor(max_leaf_nodes=20),
+                 reg_param: float = 1, shrinkage_scheme_: str = 'node_based'):
+        super().__init__(estimator_=estimator_,
+                         reg_param=reg_param,
+                         shrinkage_scheme_=shrinkage_scheme_,
+                         )
 
 
 class HSTreeClassifier(HSTree, ClassifierMixin):
-    def __init__(self, estimator=None,
-                 reg_param: float = 1, shrinkage_scheme_: str = 'node_based'):
-        super().__init__(estimator=estimator,
-                         reg_param=reg_param,
-                         shrinkage_scheme_=shrinkage_scheme_,
-                         )
+    def __init__(self, estimator_: BaseEstimator = DecisionTreeClassifier(max_leaf_nodes=20),
+                 reg_param: float = 1, shrinkage_scheme_: str = 'node_based'):
+        super().__init__(estimator_=estimator_,
+                         reg_param=reg_param,
+                         shrinkage_scheme_=shrinkage_scheme_,
+                         )
 
-    def _validate_estimator(self):
-        """Check the estimator and set the estimator_ attribute."""
-        super()._validate_estimator(default=DecisionTreeClassifier(max_leaf_nodes=20))
 
-class HSTreeRegressor(HSTree, RegressorMixin):
-    def __init__(self, estimator=None,
-                 reg_param: float = 1, shrinkage_scheme_: str = 'node_based'):
-        super().__init__(estimator=estimator,
-                         reg_param=reg_param,
-                         shrinkage_scheme_=shrinkage_scheme_,
-                         )
-    def _validate_estimator(self):
-        """Check the estimator and set the estimator_ attribute."""
-        super()._validate_estimator(default=DecisionTreeRegressor(max_leaf_nodes=20))
+def _get_cv_criterion(scorer):
+    y_true = np.random.binomial(n=1, p=.5, size=100)
+
+    y_pred_good = y_true
+    y_pred_bad = np.random.uniform(0, 1, 100)
+
+    score_good = scorer(y_true, y_pred_good)
+    score_bad = scorer(y_true, y_pred_bad)
+
+    if score_good > score_bad:
+        return np.argmax
+    elif score_good < score_bad:
+        return np.argmin
 
 
 class HSTreeClassifierCV(HSTreeClassifier):
-    def __init__(self, estimator=None,
-                 reg_param_list: List[float] = [0.1, 1, 10, 50, 100, 500],
-                 shrinkage_scheme_: str = 'node_based',
-                 cv: int = 3, scoring=None):
+    def __init__(self, estimator_: BaseEstimator = None,
+                 reg_param_list: List[float] = [0, 0.1, 1, 10, 50, 100, 500],
+                 shrinkage_scheme_: str = 'node_based',
+                 max_leaf_nodes: int = 20,
+                 cv: int = 3, scoring=None, *args, **kwargs):
         """Cross-validation is used to select the best regularization parameter for hierarchical shrinkage.
 
-        Params
+        Params
         ------
         estimator_
             Sklearn estimator (already initialized).
             If no estimator_ is passed, sklearn decision tree is used
 
-        reg_param_list : list
+        max_leaf_nodes
             If estimator_ is None, then max_leaf_nodes is passed to the default decision tree
 
         args, kwargs
             Note: args, kwargs are not used but left so that imodels-experiments can still pass redundant args.
         """
-        super().__init__(estimator, reg_param=None)
+        if estimator_ is None:
+            estimator_ = DecisionTreeClassifier(max_leaf_nodes=max_leaf_nodes)
+        super().__init__(estimator_, reg_param=None)
         self.reg_param_list = np.array(reg_param_list)
         self.cv = cv
         self.scoring = scoring
@@ -263,26 +248,44 @@ def __init__(self, estimator=None,
         # raise Warning('Passed an already fitted estimator,'
         #               'but shrinking not applied until fit method is called.')
 
     def fit(self, X, y, *args, **kwargs):
-        self.scores_ = []
-        for reg_param in self.reg_param_list:
-            est = HSTreeClassifier(deepcopy(self.estimator), reg_param)
-            cv_scores = cross_val_score(est, X, y, cv=self.cv, scoring=self.scoring)
-            self.scores_.append(np.mean(cv_scores))
-        self.reg_param = self.reg_param_list[np.argmax(self.scores_)]
+        self.scores_ = [[] for _ in self.reg_param_list]
+        scorer = kwargs.get('scoring', log_loss)
+        kf = KFold(n_splits=self.cv)
+        for train_index, test_index in kf.split(X):
+            X_out, y_out = X[test_index, :], y[test_index]
+            X_in, y_in = X[train_index, :], y[train_index]
+            base_est = deepcopy(self.estimator_)
+            base_est.fit(X_in, y_in)
+            for i, reg_param in enumerate(self.reg_param_list):
+                est_hs = HSTreeClassifier(base_est, reg_param)
+                est_hs.fit(X_in, y_in)
+                self.scores_[i].append(scorer(y_out, est_hs.predict_proba(X_out)))
+        self.scores_ = [np.mean(s) for s in self.scores_]
+        cv_criterion = _get_cv_criterion(scorer)
+        self.reg_param = self.reg_param_list[cv_criterion(self.scores_)]
         super().fit(X=X, y=y, *args, **kwargs)
-        return self
+
+    def __repr__(self):
+        attr_list = ["estimator_", "reg_param_list", "shrinkage_scheme_",
+                     "cv", "scoring"]
+        s = self.__class__.__name__
+        s += "("
+        for attr in attr_list:
+            s += attr + "=" + repr(getattr(self, attr)) + ", "
+        s = s[:-2] + ")"
+        return s
 
 
 class HSTreeRegressorCV(HSTreeRegressor):
-    def __init__(self, estimator=None,
-                 reg_param_list: List[float] = [0.1, 1, 10, 50, 100, 500],
-                 shrinkage_scheme_: str = 'node_based',
-                 cv: int = 3, scoring=None):
+    def __init__(self, estimator_: BaseEstimator = None,
+                 reg_param_list: List[float] = [0, 0.1, 1, 10, 50, 100, 500],
+                 shrinkage_scheme_: str = 'node_based',
+                 max_leaf_nodes: int = 20,
+                 cv: int = 3, scoring=None, *args, **kwargs):
         """Cross-validation is used to select the best regularization parameter for hierarchical shrinkage.
 
-        Params
+        Params
         ------
         estimator_
             Sklearn estimator (already initialized).
@@ -294,7 +297,9 @@ def __init__(self, estimator=None,
         args, kwargs
             Note: args, kwargs are not used but left so that imodels-experiments can still pass redundant args.
         """
-        super().__init__(estimator, reg_param=None)
+        if estimator_ is None:
+            estimator_ = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes)
+        super().__init__(estimator_, reg_param=None)
         self.reg_param_list = np.array(reg_param_list)
         self.cv = cv
         self.scoring = scoring
@@ -306,14 +311,32 @@ def __init__(self, estimator=None,
         #               'but shrinking not applied until fit method is called.')
 
     def fit(self, X, y, *args, **kwargs):
-        self.scores_ = []
-        for reg_param in self.reg_param_list:
-            est = HSTreeRegressor(deepcopy(self.estimator), reg_param)
-            cv_scores = cross_val_score(est, X, y, cv=self.cv, scoring=self.scoring)
-            self.scores_.append(np.mean(cv_scores))
-        self.reg_param = self.reg_param_list[np.argmax(self.scores_)]
+        self.scores_ = [[] for _ in self.reg_param_list]
+        kf = KFold(n_splits=self.cv)
+        scorer = kwargs.get('scoring', mean_squared_error)
+        for train_index, test_index in kf.split(X):
+            X_out, y_out = X[test_index, :], y[test_index]
+            X_in, y_in = X[train_index, :], y[train_index]
+            base_est = deepcopy(self.estimator_)
+            base_est.fit(X_in, y_in)
+            for i, reg_param in enumerate(self.reg_param_list):
+                est_hs = HSTreeRegressor(base_est, reg_param)
+                est_hs.fit(X_in, y_in)
+                self.scores_[i].append(scorer(est_hs.predict(X_out), y_out))
+        self.scores_ = [np.mean(s) for s in self.scores_]
+        cv_criterion = _get_cv_criterion(scorer)
+        self.reg_param = self.reg_param_list[cv_criterion(self.scores_)]
         super().fit(X=X, y=y, *args, **kwargs)
-        return self
+
+    def __repr__(self):
+        attr_list = ["estimator_", "reg_param_list", "shrinkage_scheme_",
+                     "cv", "scoring"]
+        s = self.__class__.__name__
+        s += "("
+        for attr in attr_list:
+            s += attr + "=" + repr(getattr(self, attr)) + ", "
+        s = s[:-2] + ")"
+        return s
 
 
 if __name__ == '__main__':
@@ -330,9 +353,9 @@ def fit(self, X, y, *args, **kwargs):
     print('X.shape', X.shape)
     print('ys', np.unique(y_train))
 
-    # m = HSTree(estimator=DecisionTreeClassifier(), reg_param=0.1)
+    # m = HSTree(estimator_=DecisionTreeClassifier(), reg_param=0.1)
     # m = DecisionTreeClassifier(max_leaf_nodes = 20,random_state=1, max_features=None)
-    m = DecisionTreeRegressor(random_state=42, max_leaf_nodes=20)
+    m = DecisionTreeClassifier(random_state=42)
     # print('best alpha', m.reg_param)
     m.fit(X_train, y_train)
     # m.predict_proba(X_train)  # just run this
@@ -342,15 +365,14 @@ def fit(self, X, y, *args, **kwargs):
 
     # x = DecisionTreeRegressor(random_state = 42, ccp_alpha = 0.3)
     # x.fit(X_train,y_train)
-    # m = HSTree(estimator=DecisionTreeRegressor(random_state=42, max_features=None), reg_param=10)
-    # m = HSTree(estimator=DecisionTreeClassifier(random_state=42, max_features=None), reg_param=0)
-    m = HSTreeClassifierCV(estimator=DecisionTreeRegressor(max_leaf_nodes=10, random_state=1),
-                           shrinkage_scheme_='node_based',
-                           reg_param_list=[0.1, 1, 2, 5, 10, 25, 50, 100, 500])
-    print(m)
-    # m = ShrunkTreeCV(estimator=DecisionTreeClassifier())
+    # m = HSTree(estimator_=DecisionTreeRegressor(random_state=42, max_features=None), reg_param=10)
+    # m = HSTree(estimator_=DecisionTreeClassifier(random_state=42, max_features=None), reg_param=0)
+    m = HSTreeRegressorCV(estimator_=DecisionTreeClassifier(random_state=42),
+                          shrinkage_scheme_='node_based',
+                          reg_param_list=[0.1, 1, 2, 5, 10, 25, 50, 100, 500])
+    # m = ShrunkTreeCV(estimator_=DecisionTreeClassifier())
 
-    # m = HSTreeClassifier(estimator = GradientBoostingClassifier(random_state = 10),reg_param = 5)
+    # m = HSTreeClassifier(estimator_ = GradientBoostingClassifier(random_state = 10),reg_param = 5)
     m.fit(X_train, y_train)
     print('best alpha', m.reg_param)
    # m.predict_proba(X_train)  # just run this
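
Note on the shrinkage arithmetic in _shrink_tree: the node_based branch telescopes along each root-to-leaf path. Starting from the root mean, every split adds (val - parent_val) / (1 + reg_param / parent_num) to the running cum_sum, which becomes the node's new value. A minimal standalone sketch of that arithmetic follows (the node means, sample counts, and the helper name shrink_along_path are made up for illustration; the real method walks sklearn's tree arrays, and keeping the root mean unshrunk is the standard hierarchical-shrinkage behavior):

# Illustrative sketch of the node_based update (hypothetical values;
# shrink_along_path is not part of imodels).
def shrink_along_path(node_means, parent_counts, reg_param):
    cum_sum = node_means[0]  # the root mean is kept as-is
    for parent_val, val, parent_num in zip(node_means[:-1], node_means[1:], parent_counts):
        # splits supported by fewer samples are shrunk the hardest
        cum_sum += (val - parent_val) / (1 + reg_param / parent_num)
    return cum_sum

# root mean 0.5 (100 samples), child mean 0.7, leaf mean 0.9 (its parent had 40 samples)
print(shrink_along_path([0.5, 0.7, 0.9], [100, 40], reg_param=10))  # ~0.842
print(shrink_along_path([0.5, 0.7, 0.9], [100, 40], reg_param=0))   # ~0.9, i.e. no shrinkage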
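
Usage sketch for the revised CV API: the rewritten fit methods exploit that shrinkage is post hoc, so each fold fits the base tree once and then re-applies every reg_param to a copy of it, scoring with log_loss (classifier) or mean_squared_error (regressor) by default, with _get_cv_criterion deciding whether lower or higher scores win. The following is a minimal sketch under the post-patch signatures, not a definitive recipe; it assumes imodels exposes HSTreeClassifierCV at the package top level, and the breast-cancer dataset is only an example:

# Minimal usage sketch (illustrative, not part of the patch).
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

from imodels import HSTreeClassifierCV  # assumes the top-level export exists

X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

m = HSTreeClassifierCV(estimator_=DecisionTreeClassifier(max_leaf_nodes=20),
                       reg_param_list=[0, 0.1, 1, 10, 50, 100, 500],
                       cv=3)
m.fit(X_train, y_train)  # K-fold selection of reg_param, then a final fit + shrink
print('chosen reg_param:', m.reg_param)
print('test accuracy:', accuracy_score(y_test, m.predict(X_test)))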