Commit
Fix: use uncalibrated probabilities for realized performance calculation in CBPE (#98)
nnansters committed Sep 19, 2022
1 parent cc1c866 commit 946a384
Showing 3 changed files with 60 additions and 16 deletions.
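
For context: CBPE calibrates the predicted probabilities internally before estimating performance, but the realized performance on reference data (which also feeds the alert thresholds) should be computed from the model's original scores. The change below therefore keeps an `uncalibrated_<column>` copy of every probability column before calibration and points the realized-performance calculations at that copy. A minimal, self-contained sketch of the pattern, using scikit-learn's isotonic regression as a stand-in for NannyML's internal calibrator and toy column names:

import numpy as np
import pandas as pd
from sklearn.isotonic import IsotonicRegression
from sklearn.metrics import roc_auc_score

# Toy reference chunk: raw model scores plus known targets (illustrative only).
rng = np.random.default_rng(0)
reference = pd.DataFrame({'y_pred_proba': rng.uniform(size=1_000)})
reference['y_true'] = (rng.uniform(size=1_000) < reference['y_pred_proba']).astype(int)

# Keep a copy of the raw scores before calibrating in place,
# mirroring the `uncalibrated_{column}` columns added in this commit.
reference['uncalibrated_y_pred_proba'] = reference['y_pred_proba']
calibrator = IsotonicRegression(out_of_bounds='clip')
reference['y_pred_proba'] = calibrator.fit_transform(
    reference['uncalibrated_y_pred_proba'], reference['y_true']
)

# Realized performance reflects what the model actually produced;
# isotonic calibration can introduce ties that would otherwise shift ROC AUC.
realized_auc = roc_auc_score(reference['y_true'], reference['uncalibrated_y_pred_proba'])
print(f'realized ROC AUC on raw scores: {realized_auc:.4f}')
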
@@ -67,6 +67,11 @@ def _fit(self, reference_data: pd.DataFrame, *args, **kwargs) -> CBPE:

_list_missing([self.y_true, self.y_pred_proba, self.y_pred], list(reference_data.columns))

+# We need uncalibrated data to calculate the realized performance on.
+# We need realized performance in threshold calculations.
+# https://github.com/NannyML/nannyml/issues/98
+reference_data[f'uncalibrated_{self.y_pred_proba}'] = reference_data[self.y_pred_proba]

for metric in self.metrics:
metric.fit(reference_data)

@@ -95,6 +100,10 @@ def _estimate(self, data: pd.DataFrame, *args, **kwargs) -> Result:

_list_missing([self.y_pred_proba, self.y_pred], list(data.columns))

+# We need uncalibrated data to calculate the realized performance on.
+# https://github.com/NannyML/nannyml/issues/98
+data[f'uncalibrated_{self.y_pred_proba}'] = data[self.y_pred_proba]

if self.needs_calibration:
data[self.y_pred_proba] = self.calibrator.calibrate(data[self.y_pred_proba])

@@ -70,6 +70,12 @@ def _fit(self, reference_data: pd.DataFrame, *args, **kwargs) -> CBPE:

_list_missing([self.y_true, self.y_pred] + model_output_column_names(self.y_pred_proba), reference_data)

+# We need uncalibrated data to calculate the realized performance on.
+# We need realized performance in threshold calculations.
+# https://github.com/NannyML/nannyml/issues/98
+for class_proba in model_output_column_names(self.y_pred_proba):
+    reference_data[f'uncalibrated_{class_proba}'] = reference_data[class_proba]

for metric in self.metrics:
metric.fit(reference_data)

@@ -84,6 +90,11 @@ def _estimate(self, data: pd.DataFrame, *args, **kwargs) -> Result:

_list_missing([self.y_pred] + model_output_column_names(self.y_pred_proba), data)

+# We need uncalibrated data to calculate the realized performance on.
+# https://github.com/NannyML/nannyml/issues/98
+for class_proba in model_output_column_names(self.y_pred_proba):
+    data[f'uncalibrated_{class_proba}'] = data[class_proba]

data = _calibrate_predicted_probabilities(data, self.y_true, self.y_pred_proba, self._calibrators)

chunks = self.chunker.split(data)
56 changes: 40 additions & 16 deletions nannyml/performance_estimation/confidence_based/metrics.py
@@ -57,6 +57,8 @@ def __init__(
self.lower_threshold: Optional[float] = None
self.confidence_deviation: Optional[float] = None

+self.uncalibrated_y_pred_proba = f'uncalibrated_{self.estimator.y_pred_proba}'

def __str__(self):
return self.column_name

@@ -153,10 +155,20 @@ def __eq__(self, other):
"""Establishes equality by comparing all properties."""
return self.display_name == other.display_name and self.column_name == other.column_name

-def _common_cleaning(self, data: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
+def _common_cleaning(
+    self, data: pd.DataFrame, y_pred_proba_column_name: str = None
+) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
+    if y_pred_proba_column_name is None:
+        if not isinstance(self.estimator.y_pred_proba, str):
+            raise InvalidArgumentsException(
+                f"'y_pred_proba' is of type '{type(self.estimator.y_pred_proba)}'. "
+                f"Binary use cases require 'y_pred_proba' to be a string."
+            )
+        y_pred_proba_column_name = self.estimator.y_pred_proba

clean_targets = self.estimator.y_true in data.columns and not data[self.estimator.y_true].isna().all()

-y_pred_proba = data[self.estimator.y_pred_proba]
+y_pred_proba = data[y_pred_proba_column_name]
y_pred = data[self.estimator.y_pred]

y_pred_proba.dropna(inplace=True)
@@ -243,7 +255,7 @@ def _estimate(self, data: pd.DataFrame):
return estimate_roc_auc(y_pred_proba)

def realized_performance(self, data: pd.DataFrame) -> float:
-y_pred_proba, _, y_true = self._common_cleaning(data)
+y_pred_proba, _, y_true = self._common_cleaning(data, y_pred_proba_column_name=self.uncalibrated_y_pred_proba)

if y_true is None:
return np.NaN
@@ -303,7 +315,7 @@ def _sampling_error(self, data: pd.DataFrame) -> float:
return bse.f1_sampling_error(self._sampling_error_components, data)

def realized_performance(self, data: pd.DataFrame) -> float:
-_, y_pred, y_true = self._common_cleaning(data)
+_, y_pred, y_true = self._common_cleaning(data, y_pred_proba_column_name=self.uncalibrated_y_pred_proba)

if y_true is None:
return np.NaN
@@ -345,7 +357,7 @@ def _sampling_error(self, data: pd.DataFrame) -> float:
return bse.precision_sampling_error(self._sampling_error_components, data)

def realized_performance(self, data: pd.DataFrame) -> float:
-_, y_pred, y_true = self._common_cleaning(data)
+_, y_pred, y_true = self._common_cleaning(data, y_pred_proba_column_name=self.uncalibrated_y_pred_proba)

if y_true is None:
return np.NaN
@@ -385,7 +397,7 @@ def _sampling_error(self, data: pd.DataFrame) -> float:
return bse.recall_sampling_error(self._sampling_error_components, data)

def realized_performance(self, data: pd.DataFrame) -> float:
-_, y_pred, y_true = self._common_cleaning(data)
+_, y_pred, y_true = self._common_cleaning(data, y_pred_proba_column_name=self.uncalibrated_y_pred_proba)

if y_true is None:
return np.NaN
@@ -425,7 +437,7 @@ def _sampling_error(self, data: pd.DataFrame) -> float:
return bse.specificity_sampling_error(self._sampling_error_components, data)

def realized_performance(self, data: pd.DataFrame) -> float:
-_, y_pred, y_true = self._common_cleaning(data)
+_, y_pred, y_true = self._common_cleaning(data, y_pred_proba_column_name=self.uncalibrated_y_pred_proba)

if y_true is None:
return np.NaN
@@ -470,7 +482,7 @@ def _sampling_error(self, data: pd.DataFrame) -> float:
return bse.accuracy_sampling_error(self._sampling_error_components, data)

def realized_performance(self, data: pd.DataFrame) -> float:
-_, y_pred, y_true = self._common_cleaning(data)
+_, y_pred, y_true = self._common_cleaning(data, y_pred_proba_column_name=self.uncalibrated_y_pred_proba)

if y_true is None:
return np.NaN
@@ -492,7 +504,7 @@ def _get_binarized_multiclass_predictions(data: pd.DataFrame, y_pred: str, y_pre
return y_preds, y_pred_probas, classes


-def _get_multiclass_predictions(data: pd.DataFrame, y_pred: str, y_pred_proba: ModelOutputsType):
+def _get_multiclass_uncalibrated_predictions(data: pd.DataFrame, y_pred: str, y_pred_proba: ModelOutputsType):
if not isinstance(y_pred_proba, dict):
raise CalculatorException(
"multiclass model outputs should be of type Dict[str, str].\n"
@@ -502,7 +514,7 @@ def _get_multiclass_predictions(data: pd.DataFrame, y_pred: str, y_pred_proba: M
labels, class_probability_columns = [], []
for label in sorted(y_pred_proba.keys()):
labels.append(label)
-class_probability_columns.append(y_pred_proba[label])
+class_probability_columns.append(f'uncalibrated_{y_pred_proba[label]}')
return data[y_pred], data[class_probability_columns], labels


@@ -541,7 +553,9 @@ def realized_performance(self, data: pd.DataFrame) -> float:
return np.NaN

y_true = data[self.estimator.y_true]
-_, y_pred_probas, labels = _get_multiclass_predictions(data, self.estimator.y_pred, self.estimator.y_pred_proba)
+_, y_pred_probas, labels = _get_multiclass_uncalibrated_predictions(
+    data, self.estimator.y_pred, self.estimator.y_pred_proba
+)

return roc_auc_score(y_true, y_pred_probas, multi_class='ovr', average='macro', labels=labels)

@@ -582,7 +596,9 @@ def realized_performance(self, data: pd.DataFrame) -> float:
return np.NaN

y_true = data[self.estimator.y_true]
-y_pred, _, labels = _get_multiclass_predictions(data, self.estimator.y_pred, self.estimator.y_pred_proba)
+y_pred, _, labels = _get_multiclass_uncalibrated_predictions(
+    data, self.estimator.y_pred, self.estimator.y_pred_proba
+)

return f1_score(y_true=y_true, y_pred=y_pred, average='macro', labels=labels)

@@ -623,7 +639,9 @@ def realized_performance(self, data: pd.DataFrame) -> float:
return np.NaN

y_true = data[self.estimator.y_true]
-y_pred, _, labels = _get_multiclass_predictions(data, self.estimator.y_pred, self.estimator.y_pred_proba)
+y_pred, _, labels = _get_multiclass_uncalibrated_predictions(
+    data, self.estimator.y_pred, self.estimator.y_pred_proba
+)

return precision_score(y_true=y_true, y_pred=y_pred, average='macro', labels=labels)

@@ -664,7 +682,9 @@ def realized_performance(self, data: pd.DataFrame) -> float:
return np.NaN

y_true = data[self.estimator.y_true]
-y_pred, _, labels = _get_multiclass_predictions(data, self.estimator.y_pred, self.estimator.y_pred_proba)
+y_pred, _, labels = _get_multiclass_uncalibrated_predictions(
+    data, self.estimator.y_pred, self.estimator.y_pred_proba
+)

return recall_score(y_true=y_true, y_pred=y_pred, average='macro', labels=labels)

@@ -705,7 +725,9 @@ def realized_performance(self, data: pd.DataFrame) -> float:
return np.NaN

y_true = data[self.estimator.y_true]
-y_pred, _, labels = _get_multiclass_predictions(data, self.estimator.y_pred, self.estimator.y_pred_proba)
+y_pred, _, labels = _get_multiclass_uncalibrated_predictions(
+    data, self.estimator.y_pred, self.estimator.y_pred_proba
+)

mcm = multilabel_confusion_matrix(y_true, y_pred, labels=labels)
tn_sum = mcm[:, 0, 0]
@@ -748,6 +770,8 @@ def realized_performance(self, data: pd.DataFrame) -> float:
return np.NaN

y_true = data[self.estimator.y_true]
-y_pred, _, labels = _get_multiclass_predictions(data, self.estimator.y_pred, self.estimator.y_pred_proba)
+y_pred, _, labels = _get_multiclass_uncalibrated_predictions(
+    data, self.estimator.y_pred, self.estimator.y_pred_proba
+)

return accuracy_score(y_true, y_pred)
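
For reference, a hedged usage sketch of where the change surfaces for users: the realized metric values reported for the reference period (and for analysis chunks that have targets) are now computed from the raw scores, while the estimation itself still uses the calibrated copies. Column names are illustrative, and the exact CBPE constructor arguments vary across NannyML versions (some releases also expect, for example, timestamp_column_name or problem_type):

import nannyml as nml
import pandas as pd

# Illustrative dataframes; substitute your own reference/analysis data.
reference_df = pd.read_parquet('reference.pq')   # contains 'y_true'
analysis_df = pd.read_parquet('analysis.pq')     # targets may be missing

estimator = nml.CBPE(
    y_true='y_true',
    y_pred='y_pred',
    y_pred_proba='y_pred_proba',
    metrics=['roc_auc', 'f1'],
    chunk_size=5000,
)

estimator.fit(reference_df)          # realized performance is computed on raw scores here
results = estimator.estimate(analysis_df)
print(results.data.head())           # result layout depends on the NannyML version
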
