Skip to content

Commit

Permalink
feat: adaboost->generic classifier (default lgb), joblib->skops
Browse files Browse the repository at this point in the history
  • Loading branch information
martibosch committed Mar 28, 2024
1 parent 694966a commit c619239
Show file tree
Hide file tree
Showing 15 changed files with 86 additions and 79 deletions.
56 changes: 27 additions & 29 deletions detectree/classifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
import numpy as np
import rasterio as rio
from dask import diagnostics
from sklearn import ensemble

from . import pixel_features, pixel_response, settings, utils

Expand All @@ -22,15 +21,15 @@ class ClassifierTrainer:
def __init__(
self,
*,
num_estimators=None,
sigmas=None,
num_orientations=None,
neighborhood=None,
min_neighborhood_range=None,
num_neighborhoods=None,
tree_val=None,
nontree_val=None,
**adaboost_kws,
classifier_class=None,
**classifier_kws,
):
"""
Initialize the classifier.
Expand All @@ -39,11 +38,6 @@ def __init__(
Parameters
----------
num_estimators : int, optional
The maximum number of estimators at which boosting is terminated. Directly
passed to the `n_estimators` keyword argument of
`sklearn.ensemble.AdaBoostClassifier`. If no value is provided, the default
value set in `settings.CLF_DEFAULT_NUM_ESTIMATORS` will be taken.
sigmas : list-like, optional
The list of scale parameters (sigmas) to build the Gaussian filter bank that
will be used to compute the pixel-level features. The provided argument will
Expand Down Expand Up @@ -84,16 +78,17 @@ def __init__(
provided argument will be passed to the initialization method of the
`PixelResponseBuilder` class. If no value is provided, the default value set
in `settings.RESPONSE_DEFAULT_NONTREE_VAL` will be taken.
adaboost_kws : key-value pairings, optional
Keyword arguments that will be passed to
`sklearn.ensemble.AdaBoostClassifier`.
classifier_class : class, optional
The class of the classifier to be trained. It can be any
scikit-learn-compatible estimator that implements the `fit`, `predict` and
`predict_proba` methods and that can be saved to and loaded from a file
using skops. If no value is provided, the default value set in
`settings.CLF_DEFAULT_CLASS` will be taken.
classifier_kws : key-value pairings, optional
Keyword arguments that will be passed to the initialization of
`classifier_class`. If no keyword arguments are provided, the default
keyword arguments set in `settings.CLF_DEFAULT_KWS` will be taken.
"""
super().__init__()

if num_estimators is None:
num_estimators = settings.CLF_DEFAULT_NUM_ESTIMATORS
self.num_estimators = num_estimators

self.pixel_features_builder_kws = dict(
sigmas=sigmas,
num_orientations=num_orientations,
Expand All @@ -104,7 +99,12 @@ def __init__(
self.pixel_response_builder_kws = dict(
tree_val=tree_val, nontree_val=nontree_val
)
self.adaboost_kws = adaboost_kws
if classifier_class is None:
classifier_class = settings.CLF_DEFAULT_CLASS
self.classifier_class = classifier_class
if classifier_kws == {}:
classifier_kws = settings.CLF_DEFAULT_KWS
self.classifier_kws = classifier_kws

def train_classifier(
self,
Expand Down Expand Up @@ -155,8 +155,8 @@ def train_classifier(
Returns
-------
clf : scikit-learn AdaBoostClassifier
The trained classifier
clf : scikit-learn-like classifier
The trained classifier.
"""
if split_df is None and response_img_filepaths is None:
# this is the only case that needs argument tweaking: otherwise, if we pass
Expand Down Expand Up @@ -202,9 +202,7 @@ def train_classifier(
img_cluster=img_cluster,
)

clf = ensemble.AdaBoostClassifier(
n_estimators=self.num_estimators, **self.adaboost_kws
)
clf = self.classifier_class(**self.classifier_kws)
clf.fit(X, y)

return clf
Expand All @@ -227,7 +225,7 @@ def train_classifiers(self, split_df, response_img_dir):
Returns
-------
clf_dict : dictionary
Dictionary mapping a scikit-learn AdaBoostClassifier to each first-level
Dictionary mapping a scikit-learn-like classifier to each first-level
cluster label.
"""
if "img_cluster" not in split_df:
Expand Down Expand Up @@ -331,7 +329,7 @@ def classify_img(self, img_filepath, clf, output_filepath=None):
Path to a file, URI, file object opened in binary ('rb') mode, or a Path
object representing the image to be classified. The value will be passed to
`rasterio.open`.
clf : scikit-learn AdaBoostClassifier
clf : scikit-learn-like classifier
Trained classifier.
output_filepath : str, file object or pathlib.Path object, optional
Path to a file, URI, file object opened in binary ('rb') mode, or a Path
Expand Down Expand Up @@ -361,8 +359,8 @@ def classify_img(self, img_filepath, clf, output_filepath=None):
P_nontree = p_nontree.reshape(img_shape)
P_tree = p_tree.reshape(img_shape)

# The AdaBoost probabilities are floats between 0 and 1, and the graph cuts
# algorithm requires an integer representation. Therefore, we will multiply
# The classifier probabilities are floats between 0 and 1, and the graph
# cuts algorithm requires an integer representation. Therefore, we multiply
the probabilities by an arbitrarily large number and then transform the
# result to integers. For instance, we could use a `refine_int_rescale` of
# `100` so that the probabilities are rescaled into integers between 0 and
Expand Down Expand Up @@ -447,10 +445,10 @@ def classify_imgs(
Data frame with the train/test split.
output_dir : str or pathlib.Path object
Path to the directory where the predicted images are to be dumped.
clf : scikit-learn AdaBoostClassifier
clf : scikit-learn-like classifier
Trained classifier.
clf_dict : dictionary
Dictionary mapping a trained scikit-learn AdaBoostClassifier to each
Dictionary mapping a trained scikit-learn-like classifier to each
first-level cluster label.
method : {'cluster-I', 'cluster-II'}, optional
Method used in the train/test split.
Expand Down
49 changes: 24 additions & 25 deletions detectree/cli/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,11 @@
from os import path

import click
import joblib
import pandas as pd
from skops import io

import detectree as dtr
from detectree import settings


# utils for the CLI
Expand Down Expand Up @@ -65,33 +66,32 @@ def _dict_from_kws(kws):


def _init_classifier_trainer(
num_estimators,
sigmas,
num_orientations,
min_neighborhood_range,
num_neighborhoods,
tree_val,
nontree_val,
adaboost_kws,
classifier_kws,
):
# pixel_features_builder_kws = _dict_from_kws(pixel_features_builder_kws)
# pixel_response_builder_kws = _dict_from_kws(pixel_response_builder_kws)
adaboost_kws = _dict_from_kws(adaboost_kws)
classifier_kws = _dict_from_kws(classifier_kws)

return dtr.ClassifierTrainer(
num_estimators=num_estimators,
sigmas=sigmas,
num_orientations=num_orientations,
min_neighborhood_range=min_neighborhood_range,
num_neighborhoods=num_neighborhoods,
tree_val=tree_val,
nontree_val=nontree_val,
**adaboost_kws,
**classifier_kws,
)


def _dump_clf(clf, output_filepath, logger):
joblib.dump(clf, output_filepath)
# joblib.dump(clf, output_filepath)
io.dump(clf, output_filepath)
logger.info("Dumped trained classifier to %s", output_filepath)


Expand Down Expand Up @@ -184,14 +184,13 @@ def train_test_split(
@click.option("--img-filename-pattern")
@click.option("--method")
@click.option("--img-cluster", type=int)
@click.option("--num-estimators", type=int)
@click.option("--sigmas", cls=_OptionEatAll)
@click.option("--num-orientations", type=int)
@click.option("--min-neighborhood-range", type=int)
@click.option("--num-neighborhoods", type=int)
@click.option("--tree-val", type=int)
@click.option("--nontree-val", type=int)
@click.option("--adaboost-kws", cls=_OptionEatAll)
@click.option("--classifier-kws", cls=_OptionEatAll)
@click.option("--output-filepath", type=click.Path())
def train_classifier(
ctx,
Expand All @@ -203,14 +202,13 @@ def train_classifier(
img_filename_pattern,
method,
img_cluster,
num_estimators,
sigmas,
num_orientations,
min_neighborhood_range,
num_neighborhoods,
tree_val,
nontree_val,
adaboost_kws,
classifier_kws,
output_filepath,
):
"""Train a tree/non-tree pixel classifier."""
Expand All @@ -223,14 +221,13 @@ def train_classifier(
split_df = None

ct = _init_classifier_trainer(
num_estimators=num_estimators,
sigmas=sigmas,
num_orientations=num_orientations,
min_neighborhood_range=min_neighborhood_range,
num_neighborhoods=num_neighborhoods,
tree_val=tree_val,
nontree_val=nontree_val,
adaboost_kws=adaboost_kws,
classifier_kws=classifier_kws,
)
clf = ct.train_classifier(
split_df=split_df,
Expand All @@ -244,7 +241,7 @@ def train_classifier(
)

if output_filepath is None:
output_filepath = "clf.joblib"
output_filepath = "clf.skops"

_dump_clf(clf, output_filepath, logger)

Expand All @@ -253,27 +250,25 @@ def train_classifier(
@click.pass_context
@click.argument("split_filepath", type=click.Path(exists=True))
@click.argument("response_img_dir", type=click.Path(exists=True))
@click.option("--num-estimators", type=int)
@click.option("--sigmas", cls=_OptionEatAll)
@click.option("--num-orientations", type=int)
@click.option("--min-neighborhood-range", type=int)
@click.option("--num-neighborhoods", type=int)
@click.option("--tree-val", type=int)
@click.option("--nontree-val", type=int)
@click.option("--adaboost-kws", cls=_OptionEatAll)
@click.option("--classifier-kws", cls=_OptionEatAll)
@click.option("--output-dir", type=click.Path(exists=True))
def train_classifiers(
ctx,
split_filepath,
response_img_dir,
num_estimators,
sigmas,
num_orientations,
min_neighborhood_range,
num_neighborhoods,
tree_val,
nontree_val,
adaboost_kws,
classifier_kws,
output_dir,
):
"""Train tree/non-tree pixel classifier(s) for a given train/test split."""
Expand All @@ -286,14 +281,13 @@ def train_classifiers(
split_df = None

ct = _init_classifier_trainer(
num_estimators=num_estimators,
sigmas=sigmas,
num_orientations=num_orientations,
min_neighborhood_range=min_neighborhood_range,
num_neighborhoods=num_neighborhoods,
tree_val=tree_val,
nontree_val=nontree_val,
adaboost_kws=adaboost_kws,
classifier_kws=classifier_kws,
)
clfs_dict = ct.train_classifiers(split_df, response_img_dir)

Expand All @@ -303,7 +297,7 @@ def train_classifiers(
for img_cluster in clfs_dict:
_dump_clf(
clfs_dict[img_cluster],
path.join(output_dir, f"{img_cluster}.joblib"),
path.join(output_dir, f"{img_cluster}.skops"),
logger,
)

Expand Down Expand Up @@ -350,7 +344,11 @@ def classify_img(
filename, ext = path.splitext(path.basename(img_filepath))
output_filepath = f"{filename}-pred{ext}"

c.classify_img(img_filepath, joblib.load(clf_filepath), output_filepath)
c.classify_img(
img_filepath,
io.load(clf_filepath, trusted=settings.SKOPS_DEFAULT_TRUSTED),
output_filepath,
)
logger.info("Dumped predicted image to %s", output_filepath)


Expand Down Expand Up @@ -390,7 +388,7 @@ def classify_imgs(

if clf_filepath is not None:
clf_dict = None
clf = joblib.load(clf_filepath)
clf = io.load(clf_filepath, settings.SKOPS_DEFAULT_TRUSTED)
logger.info(
"Classifying images from %s with classifier of %s",
split_filepath,
Expand All @@ -401,8 +399,9 @@ def classify_imgs(
clf = None
clf_dict = {}
for img_cluster in split_df["img_cluster"].unique():
clf_dict[img_cluster] = joblib.load(
path.join(clf_dir, f"{img_cluster}.joblib")
clf_dict[img_cluster] = io.load(
path.join(clf_dir, f"{img_cluster}.skops"),
settings.SKOPS_DEFAULT_TRUSTED,
)

pixel_features_builder_kws = _dict_from_kws(pixel_features_builder_kws)
Expand Down
9 changes: 8 additions & 1 deletion detectree/settings.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""detectree general settings."""
import logging as lg

import lightgbm as lgb
import numpy as np

# train/test split
Expand All @@ -20,12 +21,18 @@
RESPONSE_DEFAULT_NONTREE_VAL = 0

# classifier
CLF_DEFAULT_NUM_ESTIMATORS = 200
CLF_DEFAULT_CLASS = lgb.LGBMClassifier
CLF_DEFAULT_KWS = {"n_estimators": 200}
CLF_DEFAULT_TREE_VAL = 255
CLF_DEFAULT_NONTREE_VAL = 0
CLF_DEFAULT_REFINE = True
CLF_DEFAULT_REFINE_BETA = 50
CLF_DEFAULT_REFINE_INT_RESCALE = 10000
SKOPS_DEFAULT_TRUSTED = [
"collections.defaultdict",
"lightgbm.basic.Booster",
"lightgbm.sklearn.LGBMClassifier",
]

# LIDAR
LIDAR_TREE_THRESHOLD = 15
Expand Down
2 changes: 2 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ dependencies = [
"dask[delayed,distributed]",
"joblib",
"laspy >= 2.0.0",
"lightgbm",
"numpy >= 1.15",
"opencv-python >= 4.0.0",
"pandas >= 0.23",
Expand All @@ -34,6 +35,7 @@ dependencies = [
"scikit-image",
"scikit-learn",
"scipy >= 1.0.0",
"skops",
"shapely",
"tqdm",
]
Expand Down
Binary file removed tests/data/models/0.joblib
Binary file not shown.
Binary file added tests/data/models/0.skops
Binary file not shown.
Binary file removed tests/data/models/1.joblib
Binary file not shown.
Binary file added tests/data/models/1.skops
Binary file not shown.
Binary file removed tests/data/models/2.joblib
Binary file not shown.
Binary file added tests/data/models/2.skops
Binary file not shown.
Binary file removed tests/data/models/3.joblib
Binary file not shown.
Binary file added tests/data/models/3.skops
Binary file not shown.
Binary file removed tests/data/models/clf.joblib
Binary file not shown.
Binary file added tests/data/models/clf.skops
Binary file not shown.
Loading

0 comments on commit c619239

Please sign in to comment.