Merge pull request #29 from google:20240213-fsaad-fixes
PiperOrigin-RevId: 606725305
The bayesnf Authors committed Feb 13, 2024
2 parents 2a8f793 + 856c2b0 commit d9c9146
Showing 4 changed files with 127 additions and 47 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/pytest_and_autopublish.yml
@@ -51,4 +51,4 @@ jobs:
with:
pypi-token: ${{ secrets.PYPI_API_TOKEN }}
gh-token: ${{ secrets.GITHUB_TOKEN }}
-     parse-changelog: true
+     parse-changelog: false
9 changes: 6 additions & 3 deletions README.md
@@ -22,14 +22,17 @@ gradient descent for handling large-scale data.

## Installation

- `bayesnf` can be installed from the Python Package Index (PyPI) using:
+ `bayesnf` can be installed from the Python Package Index
+ ([PyPI](https://pypi.org/project/bayesnf/)) using:

```
python -m pip install .
```

- Typical install time is 1 minute. This software is tested on Python 3.9.
- Experiments were run using TPU accelerators.
+ The typical install time is 1 minute. This software is tested on Python 3.9
+ with a standard Debian GNU/Linux setup. The large-scale experiments in
+ `scripts/` were run using TPU v3-8 accelerators. For running BayesNF
+ locally on medium- to large-scale data, a GPU is required at a minimum.

## Documentation and Tutorials

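Note: the install command shown in the README (`python -m pip install .`) installs from a local checkout of the repository. Assuming the package name on the PyPI project page linked above, an install directly from PyPI would presumably be `python -m pip install bayesnf`.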
68 changes: 49 additions & 19 deletions scripts/README
@@ -6,30 +6,30 @@ Step 1. Obtain the training data
$ mkdir -p data/
$ tar -zxv -C data -f datasets.tar.gz

- Step 2. Run the scripts on the test files. Original experiments were run
- on a 2x2 TPU v5e accelerator, available on Google Cloud
- https://cloud.google.com/tpu/docs/supported-tpu-configurations.
+ Step 2. Run the BayesNF evaluation scripts. Original experiments
+ were run on a TPU v3-8 accelerator, available on Google Cloud
+ https://cloud.google.com/tpu/docs/supported-tpu-configurations#tpu-v3-config

For the command-line interface, run

$ python evaluate.py --help

- Evaluate ST-BNF.
- flags:
-
- evaluate.py:
-   --data_root: Location of input data.
-   --dataset: <air_quality|wind|air|chickenpox|coprecip|sst>: Dataset name
-   --num_particles: Override the number of particles for inference.
-     (an integer)
-   --objective: <map|mle|vi>: Training objective
-     (default: 'map')
-   --output_dir: Output directory.
-   --start_id: Run experiments on series with IDs >= this value.
-     (default: '5')
-     (an integer)
-   --stop_id: Run experiments on series with IDs < this value.
-     (an integer)
+ Evaluate BayesianNeuralField on spatiotemporal datasets.
+ flags:
+
+ evaluate.py:
+   --data_root: Location of input data.
+   --dataset: <air_quality|wind|air|chickenpox|coprecip|sst>: Dataset name
+   --num_particles: Override the number of particles for inference.
+     (an integer)
+   --objective: <map|mle|vi>: Training objective
+     (default: 'map')
+   --output_dir: Output directory.
+   --start_id: Run experiments on series with IDs >= this value.
+     (default: '5')
+     (an integer)
+   --stop_id: Run experiments on series with IDs < this value.
+     (an integer)

An example invocation is the following:

@@ -40,3 +40,33 @@ Step 2. Run the scripts on the test files. Original experiments were run
--objective=map \
--start_id=5 \
--stop_id=6
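
A complete invocation would presumably take the following form; the dataset
name and directory values here are illustrative assumptions, not taken from
the diff:

  $ python evaluate.py \
      --dataset=chickenpox \
      --data_root=data \
      --output_dir=results \
      --objective=map \
      --start_id=5 \
      --stop_id=6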

+ Step 3. Run the baseline evaluation scripts.
+
+ For the command-line interface, run
+
+   $ python evaluate_baseline.py --help
+
+ Evaluate baseline prediction methods on spatiotemporal datasets.
+ flags:
+
+ evaluate_baseline.py:
+   --algorithm: <SVGP|ST-SVGP|MF-ST-SVGP|RF|GBOOST|TSREG>: Algorithm name
+   --data_root: Location of input data.
+   --dataset: <air_quality|wind|air|chickenpox|coprecip|sst>: Dataset name
+   --gboost_estimators: Number of GBOOST estimators.
+     (default: '100')
+     (an integer)
+   --[no]gboost_featurize: Add Fourier features to GBOOST baseline.
+     (default: 'false')
+   --output_dir: Output directory.
+   --start_id: Run experiments on series with IDs >= this value.
+     (default: '5')
+     (an integer)
+   --stop_id: Run experiments on series with IDs < this value.
+     (an integer)
+   --svgp_num_z: SVGP number of inducing points.
+     (default: '2000')
+     (an integer)
+   --tsreg_method: <OLS|RIDGE|LASSO>: Method for trend-surface regression.
+     (default: 'OLS')
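
As with evaluate.py, an example invocation of the baseline script might look
like the following; the flag values are illustrative assumptions, not taken
from the diff:

  $ python evaluate_baseline.py \
      --algorithm=SVGP \
      --dataset=chickenpox \
      --data_root=data \
      --output_dir=results \
      --start_id=5 \
      --stop_id=6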
95 changes: 71 additions & 24 deletions scripts/evaluate_baseline.py
@@ -16,7 +16,7 @@

from collections.abc import Sequence
import os
- from pathlib import Path  # pylint:disable=g-importing-member
+ import pathlib
import time
import types

@@ -25,14 +25,13 @@
from absl import logging
from bayesnf.models import make_fourier_features
from bayesnf.models import make_seasonal_features
- import evaluate
from dataset_config import DATASET_CONFIG
from dataset_config import MODEL_CONFIG
import numpy as np
import pandas as pd
import scipy
- from scipy.cluster.vq import kmeans2  # pylint:disable=g-importing-member
- from tqdm import tqdm  # pylint:disable=g-importing-member
+ from scipy.cluster.vq import kmeans2  # pylint: disable=g-importing-member
+ from tqdm import tqdm  # pylint: disable=g-importing-member


_DATA_ROOT = flags.DEFINE_string(
@@ -152,6 +151,54 @@
}


+ def drop_nan(x, y):
+   """Drop elements of x and y at indexes where y is NaN."""
+   keep = ~np.isnan(y)
+   return (x[keep], y[keep])
+
+
+ def create_spatiotemporal_grid(x, y):
+   """Create a spatiotemporal grid from feature matrix x and data vector y.
+
+   Args:
+     x: Feature matrix. Rows are i.i.d. instances, first column is "time",
+       and remaining columns are "location" features.
+     y: Observations.
+
+   Returns:
+     Tuple (t, R, y) representing grid of time, space, and observations.
+     - t is a column vector of unique time points.
+     - R[i] is a 2D array of spatial locations at time t[i].
+     - y[i][j] are observations at time t[i] and location R[i][j].
+
+   Notes:
+     Generalizes create_spatiotemporal_grid from BayesNewton.
+     https://github.com/AaltoML/BayesNewton/blob/ad5679439e58b6f53bbcc9708dc43452af26b8ac/bayesnewton/utils.py#L271
+   """
+   if y.ndim < 2:
+     y = y[:, np.newaxis]
+   num_spatial_dims = x.shape[1] - 1
+   sort_ind = np.lexsort([x[:, i] for i in range(num_spatial_dims, -1, -1)])
+   x = x[sort_ind]
+   y = y[sort_ind]
+   unique_time = np.unique(x[:, 0])
+   unique_space = np.unique(x[:, 1:], axis=0)
+   n_t = unique_time.shape[0]
+   n_r = unique_space.shape[0]
+   r = np.tile(unique_space, [n_t] + [1] * num_spatial_dims)
+   r_flat = r.reshape(-1, num_spatial_dims)
+   y_dummy = np.nan * np.zeros([n_t * n_r, 1])
+   time_duplicate = np.tile(unique_time, [n_r, 1]).T.flatten()
+   x_dummy = np.block([time_duplicate[:, None], r_flat])
+   x_all = np.vstack([x, x_dummy])
+   y_all = np.vstack([y, y_dummy])
+   x_unique, ind = np.unique(x_all, axis=0, return_index=True)
+   y_unique = y_all[ind]
+   grid_shape = (unique_time.shape[0],) + unique_space.shape
+   r_grid = x_unique[:, 1:].reshape(grid_shape)
+   y_grid = y_unique.reshape(grid_shape[:-1] + (1,))
+   return unique_time[:, None], r_grid, y_grid
+
+
def get_dataset_tidy(
root,
dataset,
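
To make the new grid helper concrete, here is a minimal sketch of its behavior
on toy data; the arrays and values below are illustrative assumptions, not
part of the commit:

```python
import numpy as np

# Toy data: time in column 0, two spatial coordinates in columns 1-2.
# There is no observation at t=1 for location (2., 2.).
x = np.array([
    [0., 1., 1.],
    [0., 2., 2.],
    [1., 1., 1.],
])
y = np.array([10., 20., 30.])

t, R, Y = create_spatiotemporal_grid(x, y)
# t.shape == (2, 1): column vector of unique times [[0.], [1.]]
# R.shape == (2, 2, 2): both locations repeated at each time point
# Y.shape == (2, 2, 1): Y[1, 1, 0] is NaN, padding the missing observation
```

`drop_nan` plays the complementary role: baselines that cannot represent
missing observations filter NaN targets out before fitting, whereas
`create_spatiotemporal_grid` pads missing time/location pairs with NaN.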
@@ -343,16 +390,16 @@ def run_experiment_bayesnewton(
y_test_norm = (table.y_test - y_train_mu) / y_train_std

# Data for training.
-   (X, Y) = evaluate.drop_nan(table.x_train, y_train_norm)
-   t, R, Y = evaluate.create_spatiotemporal_grid(X, Y)
+   (X, Y) = drop_nan(table.x_train, y_train_norm)
+   t, R, Y = create_spatiotemporal_grid(X, Y)

# Data for RMSE scoring.
-   (X_test, Y_test) = evaluate.drop_nan(table.x_test, table.y_test)
-   t_test, R_test, Y_test = evaluate.create_spatiotemporal_grid(X_test, Y_test)
+   (X_test, Y_test) = drop_nan(table.x_test, table.y_test)
+   t_test, R_test, Y_test = create_spatiotemporal_grid(X_test, Y_test)

# Data for NLPD scoring.
-   Y_test_norm = evaluate.drop_nan(table.x_test, y_test_norm)[1]
-   Y_test_norm = evaluate.create_spatiotemporal_grid(X_test, Y_test_norm)[2]
+   Y_test_norm = drop_nan(table.x_test, y_test_norm)[1]
+   Y_test_norm = create_spatiotemporal_grid(X_test, Y_test_norm)[2]

var_f = 1.
opt_z = sparse
@@ -439,7 +486,7 @@ def compute_metrics(model):

# Write log history.
df = pd.DataFrame(dict(epoch=epoch, runtime=runtime, rmse=rmse, nlpd=nlpd))
-   Path(_OUTPUT_DIR.value).mkdir(parents=True, exist_ok=True)
+   pathlib.Path(_OUTPUT_DIR.value).mkdir(parents=True, exist_ok=True)
prefix = method.lower()
csv_path_log = os.path.join(
_OUTPUT_DIR.value,
@@ -454,7 +501,7 @@ def compute_metrics(model):
index_probe = np.concatenate((table.index_train, table.index_test))
x_probe = np.concatenate((table.x_train, table.x_test))
y_probe = np.concatenate((table.y_train, table.y_test))
-   t_probe, R_probe, _ = evaluate.create_spatiotemporal_grid(x_probe, y_probe)
+   t_probe, R_probe, _ = create_spatiotemporal_grid(x_probe, y_probe)
df_probe = pd.DataFrame(x_probe, index=index_probe)
df_probe.index.name = '__index__'
df_probe.reset_index(inplace=True)
@@ -524,8 +571,8 @@ def run_experiment_gpflow(
target_col=DATASET_CONFIG_BASELINE[dataset]['target_col'],
timetype=DATASET_CONFIG_BASELINE[dataset]['timetype'],
standardize=DATASET_CONFIG_BASELINE[dataset]['standardize'])
-   (x_train, y_train) = evaluate.drop_nan(table.x_train, table.y_train)
-   (x_test, y_test) = evaluate.drop_nan(table.x_test, table.y_test)
+   (x_train, y_train) = drop_nan(table.x_train, table.y_train)
+   (x_test, y_test) = drop_nan(table.x_test, table.y_test)

logging.info('x: %s', x_train.shape)

@@ -678,7 +725,7 @@ def train_step():
runtime=np.array(runtimes),
rmse=np.array(rmse),
nlpd=np.array(nlpd)))
-   Path(_OUTPUT_DIR.value).mkdir(parents=True, exist_ok=True)
+   pathlib.Path(_OUTPUT_DIR.value).mkdir(parents=True, exist_ok=True)
method = f'gpflow-svgp-{num_z}-{batch_size}'
csv_path_log = os.path.join(
_OUTPUT_DIR.value,
@@ -729,8 +776,8 @@ def run_experiment_rf(
timetype=DATASET_CONFIG_BASELINE[dataset]['timetype'],
standardize=DATASET_CONFIG_BASELINE[dataset]['standardize'],
)
-   (x_train, y_train) = evaluate.drop_nan(table.x_train, table.y_train)
-   (x_test, y_test) = evaluate.drop_nan(table.x_test, table.y_test)
+   (x_train, y_train) = drop_nan(table.x_train, table.y_train)
+   (x_test, y_test) = drop_nan(table.x_test, table.y_test)
start = time.time()
regressor = RandomForestRegressor().fit(x_train, y_train)
runtime = time.time() - start
@@ -741,7 +788,7 @@
df = pd.DataFrame(
dict(epoch=[0], runtime=[runtime], rmse=[rmse], nlpd=[np.nan])
)
-   Path(_OUTPUT_DIR.value).mkdir(parents=True, exist_ok=True)
+   pathlib.Path(_OUTPUT_DIR.value).mkdir(parents=True, exist_ok=True)
csv_path_log = os.path.join(
_OUTPUT_DIR.value, f'rf.{dataset}.{series_id}.log.csv'
)
@@ -834,8 +881,8 @@ def run_experiment_gboost(
for z in (table.x_train, table.x_test)
]

-   (x_train_drop, y_train_drop) = evaluate.drop_nan(x_train, table.y_train)
-   (x_test_drop, y_test_drop) = evaluate.drop_nan(x_test, table.y_test)
+   (x_train_drop, y_train_drop) = drop_nan(x_train, table.y_train)
+   (x_test_drop, y_test_drop) = drop_nan(x_test, table.y_test)

models = {}
common_params = dict(
@@ -861,7 +908,7 @@
df = pd.DataFrame(
dict(epoch=[0], runtime=[runtime], rmse=[rmse], nlpd=[np.nan])
)
-   Path(_OUTPUT_DIR.value).mkdir(parents=True, exist_ok=True)
+   pathlib.Path(_OUTPUT_DIR.value).mkdir(parents=True, exist_ok=True)
csv_path_log = os.path.join(
_OUTPUT_DIR.value,
f'gboost-{n_estimators}-{featurize}.{dataset}.{series_id}.log.csv',
@@ -935,8 +982,8 @@ def run_experiment_tsreg(
for z in (table.x_train, table.x_test)
]

-   (x_train_drop, y_train_drop) = evaluate.drop_nan(x_train, table.y_train)
-   (x_test_drop, y_test_drop) = evaluate.drop_nan(x_test, table.y_test)
+   (x_train_drop, y_train_drop) = drop_nan(x_train, table.y_train)
+   (x_test_drop, y_test_drop) = drop_nan(x_test, table.y_test)

# Fit regression.
if method == 'OLS':
@@ -962,7 +1009,7 @@
df = pd.DataFrame(
dict(epoch=[0], runtime=[runtime], rmse=[rmse], nlpd=[np.nan])
)
-   Path(_OUTPUT_DIR.value).mkdir(parents=True, exist_ok=True)
+   pathlib.Path(_OUTPUT_DIR.value).mkdir(parents=True, exist_ok=True)
csv_path_log = os.path.join(
_OUTPUT_DIR.value, f'tsreg-{method}.{dataset}.{series_id}.log.csv'
)
