feat: adaboost->generic classifier (default lgb), joblib->skops

martibosch · Mar 28, 2024 · c619239
1 parent 694966a
commit c619239
Show file tree

Hide file tree

Showing 15 changed files with 86 additions and 79 deletions.
diff --git a/detectree/classifier.py b/detectree/classifier.py
@@ -7,7 +7,6 @@
 import numpy as np
 import rasterio as rio
 from dask import diagnostics
-from sklearn import ensemble
 
 from . import pixel_features, pixel_response, settings, utils
 
@@ -22,15 +21,15 @@ class ClassifierTrainer:
     def __init__(
         self,
         *,
-        num_estimators=None,
         sigmas=None,
         num_orientations=None,
         neighborhood=None,
         min_neighborhood_range=None,
         num_neighborhoods=None,
         tree_val=None,
         nontree_val=None,
-        **adaboost_kws,
+        classifier_class=None,
+        **classifier_kws,
     ):
         """
         Initialize the classifier.
@@ -39,11 +38,6 @@ def __init__(
 
         Parameters
         ----------
-        num_estimators : int, optional
-            The maximum number of estimators at which boosting is terminated. Directly
-            passed to the `n_estimators` keyword argument of
-            `sklearn.ensemble.AdaBoostClassifier`. If no value is provided, the default
-            value set in `settings.CLF_DEFAULT_NUM_ESTIMATORS` will be taken.
         sigmas : list-like, optional
             The list of scale parameters (sigmas) to build the Gaussian filter bank that
             will be used to compute the pixel-level features. The provided argument will
@@ -84,16 +78,17 @@ def __init__(
             provided argument will be passed to the initialization method of the
             `PixelResponseBuilder` class. If no value is provided, the default value set
             in `settings.RESPONSE_DEFAULT_NONTREE_VAL` will be taken.
-        adaboost_kws : key-value pairings, optional
-            Keyword arguments that will be passed to
-            `sklearn.ensemble.AdaBoostClassifier`.
+        classifier_class : class, optional
+            The class of the classifier to be trained. It can be any scikit-learn
+            compatible estimator that implements the `fit`, `predict` and
+            `predict_proba` methods and that can be saved to and loaded from memory
+            using skops. If no value is provided, the default value set in
+            `settings.CLF_DEFAULT_CLASS` will be taken.
+        classifier_kws : key-value pairings, optional
+            Keyword arguments that will be passed to the initialization of
+            `classifier_class`. If no value is provided, the default value set in
+            `settings.CLF_DEFAULT_KWS` will be taken.
         """
-        super().__init__()
-
-        if num_estimators is None:
-            num_estimators = settings.CLF_DEFAULT_NUM_ESTIMATORS
-        self.num_estimators = num_estimators
-
         self.pixel_features_builder_kws = dict(
             sigmas=sigmas,
             num_orientations=num_orientations,
@@ -104,7 +99,12 @@ def __init__(
         self.pixel_response_builder_kws = dict(
             tree_val=tree_val, nontree_val=nontree_val
         )
-        self.adaboost_kws = adaboost_kws
+        if classifier_class is None:
+            classifier_class = settings.CLF_DEFAULT_CLASS
+        self.classifier_class = classifier_class
+        if not classifier_kws:
+            classifier_kws = dict(settings.CLF_DEFAULT_KWS)  # copy, don't alias
+        self.classifier_kws = classifier_kws
 
     def train_classifier(
         self,
@@ -155,8 +155,8 @@ def train_classifier(
 
         Returns
         -------
-        clf : scikit-learn AdaBoostClassifier
-            The trained classifier
+        clf : scikit-learn-like classifier
+            The trained classifier.
         """
         if split_df is None and response_img_filepaths is None:
             # this is the only case that needs argument tweaking: otherwise, if we pass
@@ -202,9 +202,7 @@ def train_classifier(
             img_cluster=img_cluster,
         )
 
-        clf = ensemble.AdaBoostClassifier(
-            n_estimators=self.num_estimators, **self.adaboost_kws
-        )
+        clf = self.classifier_class(**self.classifier_kws)
         clf.fit(X, y)
 
         return clf
@@ -227,7 +225,7 @@ def train_classifiers(self, split_df, response_img_dir):
         Returns
         -------
         clf_dict : dictionary
-            Dictionary mapping a scikit-learn AdaBoostClassifier to each first-level
+            Dictionary mapping a scikit-learn-like classifier to each first-level
             cluster label.
         """
         if "img_cluster" not in split_df:
@@ -331,7 +329,7 @@ def classify_img(self, img_filepath, clf, output_filepath=None):
             Path to a file, URI, file object opened in binary ('rb') mode, or a Path
             object representing the image to be classified. The value will be passed to
             `rasterio.open`.
-        clf : scikit-learn AdaBoostClassifier
+        clf : scikit-learn-like classifier
             Trained classifier.
         output_filepath : str, file object or pathlib.Path object, optional
             Path to a file, URI, file object opened in binary ('rb') mode, or a Path
@@ -361,8 +359,8 @@ def classify_img(self, img_filepath, clf, output_filepath=None):
             P_nontree = p_nontree.reshape(img_shape)
             P_tree = p_tree.reshape(img_shape)
 
-            # The AdaBoost probabilities are floats between 0 and 1, and the graph cuts
-            # algorithm requires an integer representation.  Therefore, we will multiply
+            # The classifier probabilities are floats between 0 and 1, and the graph
+            # cuts algorithm requires an integer representation. Therefore, we multiply
             # the probabilities by an arbitrary large number and then transform the
             # result to integers. For instance, we could use a `refine_int_rescale` of
             # `100` so that the probabilities are rescaled into integers between 0 and
@@ -447,10 +445,10 @@ def classify_imgs(
             Data frame with the train/test split.
         output_dir : str or pathlib.Path object
             Path to the directory where the predicted images are to be dumped.
-        clf : scikit-learn AdaBoostClassifier
+        clf : scikit-learn-like classifier
             Trained classifier.
         clf_dict : dictionary
-            Dictionary mapping a trained scikit-learn AdaBoostClassifier to each
+            Dictionary mapping a trained scikit-learn-like classifier to each
             first-level cluster label.
         method : {'cluster-I', 'cluster-II'}, optional
             Method used in the train/test split.

diff --git a/detectree/cli/main.py b/detectree/cli/main.py
@@ -3,10 +3,11 @@
 from os import path
 
 import click
-import joblib
 import pandas as pd
+from skops import io
 
 import detectree as dtr
+from detectree import settings
 
 
 # utils for the CLI
@@ -65,33 +66,32 @@ def _dict_from_kws(kws):
 
 
 def _init_classifier_trainer(
-    num_estimators,
     sigmas,
     num_orientations,
     min_neighborhood_range,
     num_neighborhoods,
     tree_val,
     nontree_val,
-    adaboost_kws,
+    classifier_kws,
 ):
     # pixel_features_builder_kws = _dict_from_kws(pixel_features_builder_kws)
     # pixel_response_builder_kws = _dict_from_kws(pixel_response_builder_kws)
-    adaboost_kws = _dict_from_kws(adaboost_kws)
+    classifier_kws = _dict_from_kws(classifier_kws)
 
     return dtr.ClassifierTrainer(
-        num_estimators=num_estimators,
         sigmas=sigmas,
         num_orientations=num_orientations,
         min_neighborhood_range=min_neighborhood_range,
         num_neighborhoods=num_neighborhoods,
         tree_val=tree_val,
         nontree_val=nontree_val,
-        **adaboost_kws,
+        **classifier_kws,
     )
 
 
 def _dump_clf(clf, output_filepath, logger):
-    joblib.dump(clf, output_filepath)
+    # persisted with skops; read it back via `io.load(path, trusted=...)`
+    io.dump(clf, output_filepath)
     logger.info("Dumped trained classifier to %s", output_filepath)
 
 
@@ -184,14 +184,13 @@ def train_test_split(
 @click.option("--img-filename-pattern")
 @click.option("--method")
 @click.option("--img-cluster", type=int)
-@click.option("--num-estimators", type=int)
 @click.option("--sigmas", cls=_OptionEatAll)
 @click.option("--num-orientations", type=int)
 @click.option("--min-neighborhood-range", type=int)
 @click.option("--num-neighborhoods", type=int)
 @click.option("--tree-val", type=int)
 @click.option("--nontree-val", type=int)
-@click.option("--adaboost-kws", cls=_OptionEatAll)
+@click.option("--classifier-kws", cls=_OptionEatAll)
 @click.option("--output-filepath", type=click.Path())
 def train_classifier(
     ctx,
@@ -203,14 +202,13 @@ def train_classifier(
     img_filename_pattern,
     method,
     img_cluster,
-    num_estimators,
     sigmas,
     num_orientations,
     min_neighborhood_range,
     num_neighborhoods,
     tree_val,
     nontree_val,
-    adaboost_kws,
+    classifier_kws,
     output_filepath,
 ):
     """Train a tree/non-tree pixel classifier."""
@@ -223,14 +221,13 @@ def train_classifier(
         split_df = None
 
     ct = _init_classifier_trainer(
-        num_estimators=num_estimators,
         sigmas=sigmas,
         num_orientations=num_orientations,
         min_neighborhood_range=min_neighborhood_range,
         num_neighborhoods=num_neighborhoods,
         tree_val=tree_val,
         nontree_val=nontree_val,
-        adaboost_kws=adaboost_kws,
+        classifier_kws=classifier_kws,
     )
     clf = ct.train_classifier(
         split_df=split_df,
@@ -244,7 +241,7 @@ def train_classifier(
     )
 
     if output_filepath is None:
-        output_filepath = "clf.joblib"
+        output_filepath = "clf.skops"
 
     _dump_clf(clf, output_filepath, logger)
 
@@ -253,27 +250,25 @@ def train_classifier(
 @click.pass_context
 @click.argument("split_filepath", type=click.Path(exists=True))
 @click.argument("response_img_dir", type=click.Path(exists=True))
-@click.option("--num-estimators", type=int)
 @click.option("--sigmas", cls=_OptionEatAll)
 @click.option("--num-orientations", type=int)
 @click.option("--min-neighborhood-range", type=int)
 @click.option("--num-neighborhoods", type=int)
 @click.option("--tree-val", type=int)
 @click.option("--nontree-val", type=int)
-@click.option("--adaboost-kws", cls=_OptionEatAll)
+@click.option("--classifier-kws", cls=_OptionEatAll)
 @click.option("--output-dir", type=click.Path(exists=True))
 def train_classifiers(
     ctx,
     split_filepath,
     response_img_dir,
-    num_estimators,
     sigmas,
     num_orientations,
     min_neighborhood_range,
     num_neighborhoods,
     tree_val,
     nontree_val,
-    adaboost_kws,
+    classifier_kws,
     output_dir,
 ):
     """Train tree/non-tree pixel classifier(s) for a given train/test split."""
@@ -286,14 +281,13 @@ def train_classifiers(
         split_df = None
 
     ct = _init_classifier_trainer(
-        num_estimators=num_estimators,
         sigmas=sigmas,
         num_orientations=num_orientations,
         min_neighborhood_range=min_neighborhood_range,
         num_neighborhoods=num_neighborhoods,
         tree_val=tree_val,
         nontree_val=nontree_val,
-        adaboost_kws=adaboost_kws,
+        classifier_kws=classifier_kws,
     )
     clfs_dict = ct.train_classifiers(split_df, response_img_dir)
 
@@ -303,7 +297,7 @@ def train_classifiers(
     for img_cluster in clfs_dict:
         _dump_clf(
             clfs_dict[img_cluster],
-            path.join(output_dir, f"{img_cluster}.joblib"),
+            path.join(output_dir, f"{img_cluster}.skops"),
             logger,
         )
 
@@ -350,7 +344,11 @@ def classify_img(
         filename, ext = path.splitext(path.basename(img_filepath))
         output_filepath = f"{filename}-pred{ext}"
 
-    c.classify_img(img_filepath, joblib.load(clf_filepath), output_filepath)
+    c.classify_img(
+        img_filepath,
+        io.load(clf_filepath, trusted=settings.SKOPS_DEFAULT_TRUSTED),
+        output_filepath,
+    )
     logger.info("Dumped predicted image to %s", output_filepath)
 
 
@@ -390,7 +388,7 @@ def classify_imgs(
 
     if clf_filepath is not None:
         clf_dict = None
-        clf = joblib.load(clf_filepath)
+        clf = io.load(clf_filepath, trusted=settings.SKOPS_DEFAULT_TRUSTED)
         logger.info(
             "Classifying images from %s with classifier of %s",
             split_filepath,
@@ -401,8 +399,9 @@ def classify_imgs(
         clf = None
         clf_dict = {}
         for img_cluster in split_df["img_cluster"].unique():
-            clf_dict[img_cluster] = joblib.load(
-                path.join(clf_dir, f"{img_cluster}.joblib")
+            clf_dict[img_cluster] = io.load(
+                path.join(clf_dir, f"{img_cluster}.skops"),
+                trusted=settings.SKOPS_DEFAULT_TRUSTED,
             )
 
     pixel_features_builder_kws = _dict_from_kws(pixel_features_builder_kws)

diff --git a/detectree/settings.py b/detectree/settings.py
@@ -1,6 +1,7 @@
 """detectree general settings."""
 import logging as lg
 
+import lightgbm as lgb
 import numpy as np
 
 # train/test split
@@ -20,12 +21,18 @@
 RESPONSE_DEFAULT_NONTREE_VAL = 0
 
 # classifier
-CLF_DEFAULT_NUM_ESTIMATORS = 200
+CLF_DEFAULT_CLASS = lgb.LGBMClassifier
+CLF_DEFAULT_KWS = {"n_estimators": 200}
 CLF_DEFAULT_TREE_VAL = 255
 CLF_DEFAULT_NONTREE_VAL = 0
 CLF_DEFAULT_REFINE = True
 CLF_DEFAULT_REFINE_BETA = 50
 CLF_DEFAULT_REFINE_INT_RESCALE = 10000
+SKOPS_DEFAULT_TRUSTED = [
+    "collections.defaultdict",
+    "lightgbm.basic.Booster",
+    "lightgbm.sklearn.LGBMClassifier",
+]
 
 # LIDAR
 LIDAR_TREE_THRESHOLD = 15

diff --git a/pyproject.toml b/pyproject.toml
@@ -26,6 +26,7 @@ dependencies = [
     "dask[delayed,distributed]",
     "joblib",
     "laspy >= 2.0.0",
+    "lightgbm",
     "numpy >= 1.15",
     "opencv-python >= 4.0.0",
     "pandas >= 0.23",
@@ -34,6 +35,7 @@ dependencies = [
     "scikit-image",
     "scikit-learn",
     "scipy >= 1.0.0",
+    "skops",
     "shapely",
     "tqdm",
 ]

diff --git a/tests/data/models/0.joblib b/tests/data/models/0.joblib
diff --git a/tests/data/models/0.skops b/tests/data/models/0.skops
diff --git a/tests/data/models/1.joblib b/tests/data/models/1.joblib
diff --git a/tests/data/models/1.skops b/tests/data/models/1.skops
diff --git a/tests/data/models/2.joblib b/tests/data/models/2.joblib
diff --git a/tests/data/models/2.skops b/tests/data/models/2.skops
diff --git a/tests/data/models/3.joblib b/tests/data/models/3.joblib
diff --git a/tests/data/models/3.skops b/tests/data/models/3.skops
diff --git a/tests/data/models/clf.joblib b/tests/data/models/clf.joblib
diff --git a/tests/data/models/clf.skops b/tests/data/models/clf.skops