Extend Scorer #1475

Merged · 22 commits · May 24, 2022
23 changes: 16 additions & 7 deletions CONTRIBUTING.md
@@ -61,7 +61,7 @@ Following that we'll tell you about how you can test your changes locally and th
# If you missed the --recurse-submodules arg during clone or need to install the
# submodule manually, then execute the following line:
#
- # git submodule udate --init --recursive
+ # git submodule update --init --recursive
```

The reason to create a new branch is twofold:
@@ -207,16 +207,16 @@ Sometimes, the new functionality isn't so clear from a simple parameter descript
Lastly, if the feature really is a game changer or you're very proud of it, consider making an `example_*.py` that will be run and rendered in the online docs!

## Testing
* Let's assume you've made some changes; now we have to make sure they work.
Begin by simply running all the tests.
If there are any errors, they'll pop up once it's complete.
```bash
pytest
```
* Note that these may take a while, so check out `pytest --help` to see how to run only previous failures or only certain tests.
This can help you try changes and get results faster.
Do however run one last full `pytest` once you are finished and happy!
* Here are some options we find particularly useful:
```bash
# Run tests in specific file like 'test_estimators.py'
pytest "test/test_automl/test_estimators.py"
# Exit on the first test failure
pytest -x
```
* More advanced editors like PyCharm may have built-in integrations which could be good to check out!
* Running all the unit tests will take a while; here's how you can run them in parallel (this uses the `pytest-xdist` plugin):
```bash
export OPENBLAS_NUM_THREADS=1
export MKL_NUM_THREADS=1
export OMP_NUM_THREADS=1

pytest -n 4
```


* Now we are going to use [sphinx](https://www.sphinx-doc.org/en/master/) to generate all the documentation and make sure there are no issues.
```bash
make doc
```
@@ -261,7 +270,7 @@ Lastly, if the feature really is a game changer or you're very proud of it, cons
xdg-open ./doc/build/html/index.html
```

* Once you've made all your changes and all the tests pass successfully, we need to make sure that the code fits a certain format and that the [typing](https://docs.python.org/3/library/typing.html) is correct.
* Formatting and import sorting can help keep things uniform across all coding styles. We use [`black`](https://black.readthedocs.io/en/stable/) and [`isort`](https://isort.readthedocs.io/en/latest/) to do this for us. To automatically run these formatters across the code base, just run the following command:
```bash
make format
2 changes: 1 addition & 1 deletion autosklearn/automl_common
1 change: 1 addition & 0 deletions autosklearn/data/target_validator.py
@@ -17,6 +17,7 @@
from autosklearn.util.logging_ import PickableLoggerAdapter

SUPPORTED_TARGET_TYPES = Union[List, pd.Series, pd.DataFrame, np.ndarray, spmatrix]
SUPPORTED_XDATA_TYPES = Union[pd.Series, pd.DataFrame, np.ndarray, spmatrix]


class TargetValidator(BaseEstimator):
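For context, a minimal sketch (not part of the diff) of how such a Union alias is typically used: any helper that must accept all of the supported feature containers can annotate its parameter with it. The `n_samples` helper below is hypothetical.

```python
from typing import Union

import numpy as np
import pandas as pd
from scipy.sparse import spmatrix

# Same alias as in the diff above: the feature-matrix containers accepted.
SUPPORTED_XDATA_TYPES = Union[pd.Series, pd.DataFrame, np.ndarray, spmatrix]


def n_samples(X: SUPPORTED_XDATA_TYPES) -> int:
    # All four container types expose .shape with the sample count first.
    return X.shape[0]
```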
16 changes: 14 additions & 2 deletions autosklearn/evaluation/abstract_evaluator.py
@@ -25,6 +25,10 @@
MULTIOUTPUT_REGRESSION,
REGRESSION_TASKS,
)
from autosklearn.data.target_validator import (
SUPPORTED_TARGET_TYPES,
SUPPORTED_XDATA_TYPES,
)
from autosklearn.metrics import Scorer, calculate_losses
from autosklearn.pipeline.components.base import ThirdPartyComponents, _addons
from autosklearn.pipeline.implementations.util import (
@@ -273,7 +277,8 @@ def __init__(
port=self.port,
)

-        self.Y_optimization: Optional[Union[List, np.ndarray]] = None
+        self.X_optimization: Optional[SUPPORTED_XDATA_TYPES] = None
+        self.Y_optimization: Optional[SUPPORTED_TARGET_TYPES] = None
self.Y_actual_train = None

self.budget = budget
@@ -328,6 +333,7 @@ def _loss(
self,
y_true: np.ndarray,
y_hat: np.ndarray,
X_data: Optional[SUPPORTED_XDATA_TYPES] = None,
) -> Dict[str, float]:
"""Auto-sklearn follows a minimization goal.
The calculate_loss internally translates a score function to
@@ -354,6 +360,7 @@
y_hat,
self.task_type,
self.metrics,
X_data=X_data,
scoring_functions=self.scoring_functions,
)

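To make the intent of threading `X_data` through `_loss` concrete: the point of this PR is to let a `Scorer` see the features at scoring time. A hedged sketch of such a metric follows; it assumes the extended `make_scorer` accepts a `needs_X` flag that routes `X_data` into the score function (the flag name and the metric itself are illustrative assumptions, not confirmed by this excerpt).

```python
import numpy as np

from autosklearn.metrics import make_scorer


def accuracy_on_large_inputs(y_true, y_pred, X_data):
    # Hypothetical metric that genuinely needs X: accuracy restricted to the
    # rows whose first feature lies above its median.
    X = np.asarray(X_data)
    mask = X[:, 0] > np.median(X[:, 0])
    return float(np.mean(np.asarray(y_true)[mask] == np.asarray(y_pred)[mask]))


conditional_accuracy = make_scorer(
    name="conditional_accuracy",
    score_func=accuracy_on_large_inputs,
    needs_X=True,  # assumed flag: forwards X_data when the scorer is called
)
```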
@@ -522,7 +529,12 @@ def file_output(
# This file can be written independently of the others down below
if "y_optimization" not in self.disable_file_output:
if self.output_y_hat_optimization:
-            self.backend.save_targets_ensemble(self.Y_optimization)
+            self.backend.save_additional_data(
+                self.Y_optimization, what="targets_ensemble"
+            )
+            self.backend.save_additional_data(
+                self.X_optimization, what="input_ensemble"
+            )

models: Optional[BaseEstimator] = None
if hasattr(self, "models"):
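The `save_additional_data` call above generalizes the old targets-only `save_targets_ensemble`, so the ensemble builder can also receive the optimization-fold inputs. As a rough illustration of the backend side of that pattern only: the real implementation lives in `automl_common`, and this toy stand-in almost certainly differs from it.

```python
import os
import pickle
from typing import Any


class ToyBackend:
    """Toy illustration of a `what`-keyed save API; not the real Backend."""

    def __init__(self, directory: str) -> None:
        self.directory = directory
        os.makedirs(directory, exist_ok=True)

    def save_additional_data(self, data: Any, what: str) -> None:
        # One file per kind of ensemble data, e.g. what="targets_ensemble"
        # or what="input_ensemble" as used in the diff above.
        with open(os.path.join(self.directory, f"{what}.pkl"), "wb") as f:
            pickle.dump(data, f)
```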
93 changes: 61 additions & 32 deletions autosklearn/evaluation/train_evaluator.py
@@ -7,6 +7,9 @@
import warnings

import numpy as np
import pandas
import pandas as pd
import scipy.sparse
from ConfigSpace import Configuration
from sklearn.base import BaseEstimator
from sklearn.model_selection import (
@@ -177,6 +180,26 @@ def _fit_with_budget(
raise ValueError(budget_type)


def concat_data(
data: List[Any], num_cv_folds: int
) -> Union[np.ndarray, pd.DataFrame, scipy.sparse.csr_matrix]:
if isinstance(data[0], np.ndarray):
return np.concatenate(
[data[i] for i in range(num_cv_folds) if data[i] is not None]
)
elif isinstance(data[0], scipy.sparse.spmatrix):
return scipy.sparse.vstack(
[data[i] for i in range(num_cv_folds) if data[i] is not None]
)
elif isinstance(data[0], pd.DataFrame):
return pd.concat(
[data[i] for i in range(num_cv_folds) if data[i] is not None],
axis=0,
)
else:
raise ValueError(f"Unknown datatype {type(data[0])}")

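A small usage sketch for `concat_data` (assuming it is in scope as defined above): per-fold results arrive as a list with one slot per CV fold, and unused folds stay `None`.

```python
import numpy as np
import pandas as pd

# Three folds, one of which produced no data.
array_folds = [np.array([[1.0], [2.0]]), None, np.array([[3.0]])]
print(concat_data(array_folds, num_cv_folds=3).shape)  # (3, 1)

frame_folds = [pd.DataFrame({"a": [1, 2]}), pd.DataFrame({"a": [3]}), None]
print(concat_data(frame_folds, num_cv_folds=3).shape)  # (3, 1)
```

Note that the function dispatches on `data[0]`, so every fold must use the same container type and the first slot must not be `None`; that holds here because the full-CV code paths fill every fold before concatenating.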

class TrainEvaluator(AbstractEvaluator):
def __init__(
self,
@@ -235,7 +258,7 @@ def __init__(
)
self.X_train = self.datamanager.data["X_train"]
self.Y_train = self.datamanager.data["Y_train"]
-        self.Y_optimization: Optional[SUPPORTED_TARGET_TYPES] = None
+        self.X_targets = [None] * self.num_cv_folds
self.Y_targets = [None] * self.num_cv_folds
self.Y_train_targets = np.ones(self.Y_train.shape) * np.NaN
self.models = [None] * self.num_cv_folds
@@ -265,6 +288,11 @@ def fit_predict_and_loss(self, iterative: bool = False) -> None:
self.Y_train,
groups=self.resampling_strategy_args.get("groups"),
):
self.X_optimization = (
self.X_train.iloc[test_split]
if hasattr(self.X_train, "iloc")
else self.X_train[test_split]
)
self.Y_optimization = self.Y_train[test_split]
self.Y_actual_train = self.Y_train[train_split]
self._partial_fit_and_predict_iterative(
@@ -359,6 +387,12 @@ def fit_predict_and_loss(self, iterative: bool = False) -> None:
if hasattr(self.Y_train, "iloc")
else self.Y_train[train_indices]
)
self.X_targets[i] = (
self.X_train.iloc[test_indices]
if hasattr(self.X_train, "iloc")
else self.X_train[test_indices]
)

self.Y_targets[i] = self.Y_train[test_indices]

Xt, fit_params = model.fit_transformer(
@@ -400,6 +434,7 @@ def fit_predict_and_loss(self, iterative: bool = False) -> None:
if hasattr(self.Y_train, "iloc")
else self.Y_train[train_indices],
train_pred,
X_data=Xt_array[i],
)
train_losses[i] = train_loss
# Number of training data points for this fold.
@@ -408,8 +443,7 @@ def fit_predict_and_loss(self, iterative: bool = False) -> None:

# Compute validation loss of this fold and store it.
optimization_loss = self._loss(
-                self.Y_targets[i],
-                opt_pred,
+                self.Y_targets[i], opt_pred, X_data=self.X_targets[i]
)
opt_losses[i] = optimization_loss
# number of optimization data points for this fold.
@@ -455,23 +489,15 @@ def fit_predict_and_loss(self, iterative: bool = False) -> None:
weights=opt_fold_weights_percentage,
)

X_targets = self.X_targets
Y_targets = self.Y_targets
Y_train_targets = self.Y_train_targets

-        Y_optimization_preds = np.concatenate(
-            [
-                Y_optimization_pred[i]
-                for i in range(self.num_cv_folds)
-                if Y_optimization_pred[i] is not None
-            ]
-        )
-        Y_targets = np.concatenate(
-            [
-                Y_targets[i]
-                for i in range(self.num_cv_folds)
-                if Y_targets[i] is not None
-            ]
-        )
+        Y_optimization_pred_concat = concat_data(
+            Y_optimization_pred, num_cv_folds=self.num_cv_folds
+        )
+        X_targets = concat_data(X_targets, num_cv_folds=self.num_cv_folds)
+        Y_targets = concat_data(Y_targets, num_cv_folds=self.num_cv_folds)

if self.X_valid is not None:
Y_valid_preds = np.array(
@@ -501,6 +527,7 @@ def fit_predict_and_loss(self, iterative: bool = False) -> None:
else:
Y_test_preds = None

self.X_optimization = X_targets
self.Y_optimization = Y_targets
self.Y_actual_train = Y_train_targets

@@ -516,7 +543,7 @@ def fit_predict_and_loss(self, iterative: bool = False) -> None:
self.finish_up(
loss=opt_loss,
train_loss=train_loss,
-            opt_pred=Y_optimization_preds,
+            opt_pred=Y_optimization_pred_concat,
valid_pred=Y_valid_preds,
test_pred=Y_test_preds,
additional_run_info=additional_run_info,
@@ -602,6 +629,9 @@ def fit_predict_and_loss(self, iterative: bool = False) -> None:
train_loss = self._loss(
self.Y_train_targets[train_split],
train_pred,
X_data=self.X_train.iloc[train_split]
if hasattr(self.X_train, "iloc")
else self.X_train[train_split],
)
train_losses.append(train_loss)
# number of training data points for this fold. Used for weighting
@@ -610,8 +640,7 @@ def fit_predict_and_loss(self, iterative: bool = False) -> None:

# Compute validation loss of this fold and store it.
optimization_loss = self._loss(
-                self.Y_targets[i],
-                opt_pred,
+                self.Y_targets[i], opt_pred, X_data=self.X_targets[i]
)
opt_losses.append(optimization_loss)
# number of optimization data points for this fold. Used for weighting
@@ -644,23 +673,15 @@ def fit_predict_and_loss(self, iterative: bool = False) -> None:
weights=opt_fold_weights,
)

X_targets = self.X_targets
Y_targets = self.Y_targets
Y_train_targets = self.Y_train_targets

-        Y_optimization_pred = np.concatenate(
-            [
-                Y_optimization_pred[i]
-                for i in range(self.num_cv_folds)
-                if Y_optimization_pred[i] is not None
-            ]
-        )
-        Y_targets = np.concatenate(
-            [
-                Y_targets[i]
-                for i in range(self.num_cv_folds)
-                if Y_targets[i] is not None
-            ]
-        )
+        Y_optimization_pred = concat_data(
+            Y_optimization_pred, num_cv_folds=self.num_cv_folds
+        )
+        X_targets = concat_data(X_targets, num_cv_folds=self.num_cv_folds)
+        Y_targets = concat_data(Y_targets, num_cv_folds=self.num_cv_folds)

if self.X_valid is not None:
Y_valid_pred = np.array(
@@ -686,6 +707,7 @@ def fit_predict_and_loss(self, iterative: bool = False) -> None:
if len(np.shape(Y_test_pred)) == 3:
Y_test_pred = np.nanmean(Y_test_pred, axis=0)

self.X_optimization = X_targets
self.Y_optimization = Y_targets
self.Y_actual_train = Y_train_targets

@@ -754,6 +776,7 @@ def partial_fit_predict_and_loss(self, fold: int, iterative: bool = False) -> No
break

if self.num_cv_folds > 1:
self.X_optimization = self.X_train[test_split]
self.Y_optimization = self.Y_train[test_split]
self.Y_actual_train = self.Y_train[train_split]

@@ -981,6 +1004,11 @@ def _partial_fit_and_predict_standard(
else:
self.models[fold] = model

self.X_targets[fold] = (
self.X_train.iloc[test_indices]
if hasattr(self.X_train, "iloc")
else self.X_train[test_indices]
)
self.Y_targets[fold] = (
self.Y_train.iloc[test_indices]
if hasattr(self.Y_train, "iloc")
@@ -1026,6 +1054,7 @@ def _partial_fit_and_predict_budget(

model = self._get_model()
self.indices[fold] = (train_indices, test_indices)
self.X_targets[fold] = self.X_train[test_indices]
self.Y_targets[fold] = self.Y_train[test_indices]
self.Y_train_targets[train_indices] = (
self.Y_train.iloc[train_indices]
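One recurring idiom in this diff is positional indexing that works across container types: `.iloc[...]` when available, plain `[...]` otherwise. A hypothetical helper, shown only to summarize the pattern:

```python
import numpy as np


def positional_subset(data, indices):
    # pandas Series/DataFrame require .iloc for positional indexing;
    # numpy arrays and scipy sparse matrices take plain [] indexing.
    if hasattr(data, "iloc"):
        return data.iloc[indices]
    return data[indices]


X = np.arange(12).reshape(4, 3)
print(positional_subset(X, [0, 2]))  # rows 0 and 2
```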