Merge pull request #29 from google:20240213-fsaad-fixes
PiperOrigin-RevId: 606725305
The bayesnf Authors committed Feb 13, 2024
2 parents 2a8f793 + 856c2b0 commit d9c9146
Showing 4 changed files with 127 additions and 47 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/pytest_and_autopublish.yml
@@ -51,4 +51,4 @@ jobs:
with:
pypi-token: ${{ secrets.PYPI_API_TOKEN }}
gh-token: ${{ secrets.GITHUB_TOKEN }}
-     parse-changelog: true
+     parse-changelog: false
9 changes: 6 additions & 3 deletions README.md
@@ -22,14 +22,17 @@ gradient descent for handling large-scale data.

## Installation

- `bayesnf` can be installed from the Python Package Index (PyPI) using:
+ `bayesnf` can be installed from the Python Package Index
+ ([PyPI](https://pypi.org/project/bayesnf/)) using:

```
python -m pip install .
```

- Typical install time is 1 minute. This software is tested on Python 3.9.
- Experiments were run using TPU accelerators.
+ The typical install time is 1 minute. This software is tested on Python 3.9
+ with a standard Debian GNU/Linux setup. The large-scale experiments in
+ `scripts/` were run using TPU v3-8 accelerators. For running BayesNF
+ locally on medium- to large-scale data, a GPU is required at a minimum.

## Documentation and Tutorials

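Note: the install command shown in the README (`python -m pip install .`) installs from a local checkout of the repository. Assuming the package name on the PyPI project page linked above, an install directly from PyPI would presumably be `python -m pip install bayesnf`.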
68 changes: 49 additions & 19 deletions scripts/README
@@ -6,30 +6,30 @@ Step 1. Obtain the training data
$ mkdir -p data/
$ tar -zxv -C data -f datasets.tar.gz

- Step 2. Run the scripts on the test files. Original experiments were run
- on a 2x2 TPU v5e accelerator, available on Google Cloud
- https://cloud.google.com/tpu/docs/supported-tpu-configurations.
+ Step 2. Run the BayesNF evaluation scripts. Original experiments
+ were run on a TPU v3-8 accelerator, available on Google Cloud
+ https://cloud.google.com/tpu/docs/supported-tpu-configurations#tpu-v3-config

For the command-line interface, run

$ python evaluate.py --help

- Evaluate ST-BNF.
- flags:
-
- evaluate.py:
-   --data_root: Location of input data.
-   --dataset: <air_quality|wind|air|chickenpox|coprecip|sst>: Dataset name
-   --num_particles: Override the number of particles for inference.
-     (an integer)
-   --objective: <map|mle|vi>: Training objective
-     (default: 'map')
-   --output_dir: Output directory.
-   --start_id: Run experiments on series with IDs >= this value.
-     (default: '5')
-     (an integer)
-   --stop_id: Run experiments on series with IDs < this value.
-     (an integer)
+ Evaluate BayesianNeuralField on spatiotemporal datasets.
+ flags:
+
+ evaluate.py:
+   --data_root: Location of input data.
+   --dataset: <air_quality|wind|air|chickenpox|coprecip|sst>: Dataset name
+   --num_particles: Override the number of particles for inference.
+     (an integer)
+   --objective: <map|mle|vi>: Training objective
+     (default: 'map')
+   --output_dir: Output directory.
+   --start_id: Run experiments on series with IDs >= this value.
+     (default: '5')
+     (an integer)
+   --stop_id: Run experiments on series with IDs < this value.
+     (an integer)

An example invocation is the following:

@@ -40,3 +40,33 @@ Step 2. Run the scripts on the test files. Original experiments were run
--objective=map \
--start_id=5 \
--stop_id=6
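
A complete invocation would presumably take the following form; the dataset
name and directory values here are illustrative assumptions, not taken from
the diff:

  $ python evaluate.py \
      --dataset=chickenpox \
      --data_root=data \
      --output_dir=results \
      --objective=map \
      --start_id=5 \
      --stop_id=6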

+ Step 3. Run the baseline evaluation scripts.
+
+ For the command-line interface, run
+
+   $ python evaluate_baseline.py --help
+
+ Evaluate baseline prediction methods on spatiotemporal datasets.
+ flags:
+
+ evaluate_baseline.py:
+   --algorithm: <SVGP|ST-SVGP|MF-ST-SVGP|RF|GBOOST|TSREG>: Algorithm name
+   --data_root: Location of input data.
+   --dataset: <air_quality|wind|air|chickenpox|coprecip|sst>: Dataset name
+   --gboost_estimators: Number of GBOOST estimators.
+     (default: '100')
+     (an integer)
+   --[no]gboost_featurize: Add Fourier features to GBOOST baseline.
+     (default: 'false')
+   --output_dir: Output directory.
+   --start_id: Run experiments on series with IDs >= this value.
+     (default: '5')
+     (an integer)
+   --stop_id: Run experiments on series with IDs < this value.
+     (an integer)
+   --svgp_num_z: SVGP number of inducing points.
+     (default: '2000')
+     (an integer)
+   --tsreg_method: <OLS|RIDGE|LASSO>: Method for trend-surface regression.
+     (default: 'OLS')
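
As with evaluate.py, an example invocation of the baseline script might look
like the following; the flag values are illustrative assumptions, not taken
from the diff:

  $ python evaluate_baseline.py \
      --algorithm=SVGP \
      --dataset=chickenpox \
      --data_root=data \
      --output_dir=results \
      --start_id=5 \
      --stop_id=6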
95 changes: 71 additions & 24 deletions scripts/evaluate_baseline.py
@@ -16,7 +16,7 @@

from collections.abc import Sequence
import os
- from pathlib import Path  # pylint:disable=g-importing-member
+ import pathlib
import time
import types

@@ -25,14 +25,13 @@
from absl import logging
from bayesnf.models import make_fourier_features
from bayesnf.models import make_seasonal_features
- import evaluate
from dataset_config import DATASET_CONFIG
from dataset_config import MODEL_CONFIG
import numpy as np
import pandas as pd
import scipy
- from scipy.cluster.vq import kmeans2  # pylint:disable=g-importing-member
- from tqdm import tqdm  # pylint:disable=g-importing-member
+ from scipy.cluster.vq import kmeans2  # pylint: disable=g-importing-member
+ from tqdm import tqdm  # pylint: disable=g-importing-member


_DATA_ROOT = flags.DEFINE_string(
@@ -152,6 +151,54 @@
}


+ def drop_nan(x, y):
+   """Drop elements of x and y at indexes where y is NaN."""
+   keep = ~np.isnan(y)
+   return (x[keep], y[keep])
+
+
+ def create_spatiotemporal_grid(x, y):
+   """Create a spatiotemporal grid from feature matrix x and data vector y.
+
+   Args:
+     x: Feature matrix. Rows are i.i.d. instances, first column is "time",
+       and remaining columns are "location" features.
+     y: Observations.
+
+   Returns:
+     Tuple (t, R, y) representing grid of time, space, and observations.
+     - t is a column vector of unique time points.
+     - R[i] is a 2D array of spatial locations at time t[i].
+     - y[i][j] are observations at time t[i] and location R[i][j].
+
+   Notes:
+     Generalizes create_spatiotemporal_grid from BayesNewton.
+     https://github.com/AaltoML/BayesNewton/blob/ad5679439e58b6f53bbcc9708dc43452af26b8ac/bayesnewton/utils.py#L271
+   """
+   if y.ndim < 2:
+     y = y[:, np.newaxis]
+   num_spatial_dims = x.shape[1] - 1
+   sort_ind = np.lexsort([x[:, i] for i in range(num_spatial_dims, -1, -1)])
+   x = x[sort_ind]
+   y = y[sort_ind]
+   unique_time = np.unique(x[:, 0])
+   unique_space = np.unique(x[:, 1:], axis=0)
+   n_t = unique_time.shape[0]
+   n_r = unique_space.shape[0]
+   r = np.tile(unique_space, [n_t] + [1] * num_spatial_dims)
+   r_flat = r.reshape(-1, num_spatial_dims)
+   y_dummy = np.nan * np.zeros([n_t * n_r, 1])
+   time_duplicate = np.tile(unique_time, [n_r, 1]).T.flatten()
+   x_dummy = np.block([time_duplicate[:, None], r_flat])
+   x_all = np.vstack([x, x_dummy])
+   y_all = np.vstack([y, y_dummy])
+   x_unique, ind = np.unique(x_all, axis=0, return_index=True)
+   y_unique = y_all[ind]
+   grid_shape = (unique_time.shape[0],) + unique_space.shape
+   r_grid = x_unique[:, 1:].reshape(grid_shape)
+   y_grid = y_unique.reshape(grid_shape[:-1] + (1,))
+   return unique_time[:, None], r_grid, y_grid
+
+
def get_dataset_tidy(
root,
dataset,
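
To make the new grid helper concrete, here is a minimal sketch of its behavior
on toy data; the arrays and values below are illustrative assumptions, not
part of the commit:

```python
import numpy as np

# Toy data: time in column 0, two spatial coordinates in columns 1-2.
# There is no observation at t=1 for location (2., 2.).
x = np.array([
    [0., 1., 1.],
    [0., 2., 2.],
    [1., 1., 1.],
])
y = np.array([10., 20., 30.])

t, R, Y = create_spatiotemporal_grid(x, y)
# t.shape == (2, 1): column vector of unique times [[0.], [1.]]
# R.shape == (2, 2, 2): both locations repeated at each time point
# Y.shape == (2, 2, 1): Y[1, 1, 0] is NaN, padding the missing observation
```

`drop_nan` plays the complementary role: baselines that cannot represent
missing observations filter NaN targets out before fitting, whereas
`create_spatiotemporal_grid` pads missing time/location pairs with NaN.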
@@ -343,16 +390,16 @@ def run_experiment_bayesnewton(
y_test_norm = (table.y_test - y_train_mu) / y_train_std

# Data for training.
-   (X, Y) = evaluate.drop_nan(table.x_train, y_train_norm)
-   t, R, Y = evaluate.create_spatiotemporal_grid(X, Y)
+   (X, Y) = drop_nan(table.x_train, y_train_norm)
+   t, R, Y = create_spatiotemporal_grid(X, Y)

# Data for RMSE scoring.
-   (X_test, Y_test) = evaluate.drop_nan(table.x_test, table.y_test)
-   t_test, R_test, Y_test = evaluate.create_spatiotemporal_grid(X_test, Y_test)
+   (X_test, Y_test) = drop_nan(table.x_test, table.y_test)
+   t_test, R_test, Y_test = create_spatiotemporal_grid(X_test, Y_test)

# Data for NLPD scoring.
-   Y_test_norm = evaluate.drop_nan(table.x_test, y_test_norm)[1]
-   Y_test_norm = evaluate.create_spatiotemporal_grid(X_test, Y_test_norm)[2]
+   Y_test_norm = drop_nan(table.x_test, y_test_norm)[1]
+   Y_test_norm = create_spatiotemporal_grid(X_test, Y_test_norm)[2]

var_f = 1.
opt_z = sparse
@@ -439,7 +486,7 @@ def compute_metrics(model):

# Write log history.
df = pd.DataFrame(dict(epoch=epoch, runtime=runtime, rmse=rmse, nlpd=nlpd))
-   Path(_OUTPUT_DIR.value).mkdir(parents=True, exist_ok=True)
+   pathlib.Path(_OUTPUT_DIR.value).mkdir(parents=True, exist_ok=True)
prefix = method.lower()
csv_path_log = os.path.join(
_OUTPUT_DIR.value,
@@ -454,7 +501,7 @@ def compute_metrics(model):
index_probe = np.concatenate((table.index_train, table.index_test))
x_probe = np.concatenate((table.x_train, table.x_test))
y_probe = np.concatenate((table.y_train, table.y_test))
-   t_probe, R_probe, _ = evaluate.create_spatiotemporal_grid(x_probe, y_probe)
+   t_probe, R_probe, _ = create_spatiotemporal_grid(x_probe, y_probe)
df_probe = pd.DataFrame(x_probe, index=index_probe)
df_probe.index.name = '__index__'
df_probe.reset_index(inplace=True)
@@ -524,8 +571,8 @@ def run_experiment_gpflow(
target_col=DATASET_CONFIG_BASELINE[dataset]['target_col'],
timetype=DATASET_CONFIG_BASELINE[dataset]['timetype'],
standardize=DATASET_CONFIG_BASELINE[dataset]['standardize'])
-   (x_train, y_train) = evaluate.drop_nan(table.x_train, table.y_train)
-   (x_test, y_test) = evaluate.drop_nan(table.x_test, table.y_test)
+   (x_train, y_train) = drop_nan(table.x_train, table.y_train)
+   (x_test, y_test) = drop_nan(table.x_test, table.y_test)

logging.info('x: %s', x_train.shape)

@@ -678,7 +725,7 @@ def train_step():
runtime=np.array(runtimes),
rmse=np.array(rmse),
nlpd=np.array(nlpd)))
-   Path(_OUTPUT_DIR.value).mkdir(parents=True, exist_ok=True)
+   pathlib.Path(_OUTPUT_DIR.value).mkdir(parents=True, exist_ok=True)
method = f'gpflow-svgp-{num_z}-{batch_size}'
csv_path_log = os.path.join(
_OUTPUT_DIR.value,
@@ -729,8 +776,8 @@ def run_experiment_rf(
timetype=DATASET_CONFIG_BASELINE[dataset]['timetype'],
standardize=DATASET_CONFIG_BASELINE[dataset]['standardize'],
)
-   (x_train, y_train) = evaluate.drop_nan(table.x_train, table.y_train)
-   (x_test, y_test) = evaluate.drop_nan(table.x_test, table.y_test)
+   (x_train, y_train) = drop_nan(table.x_train, table.y_train)
+   (x_test, y_test) = drop_nan(table.x_test, table.y_test)
start = time.time()
regressor = RandomForestRegressor().fit(x_train, y_train)
runtime = time.time() - start
@@ -741,7 +788,7 @@
df = pd.DataFrame(
dict(epoch=[0], runtime=[runtime], rmse=[rmse], nlpd=[np.nan])
)
-   Path(_OUTPUT_DIR.value).mkdir(parents=True, exist_ok=True)
+   pathlib.Path(_OUTPUT_DIR.value).mkdir(parents=True, exist_ok=True)
csv_path_log = os.path.join(
_OUTPUT_DIR.value, f'rf.{dataset}.{series_id}.log.csv'
)
@@ -834,8 +881,8 @@ def run_experiment_gboost(
for z in (table.x_train, table.x_test)
]

-   (x_train_drop, y_train_drop) = evaluate.drop_nan(x_train, table.y_train)
-   (x_test_drop, y_test_drop) = evaluate.drop_nan(x_test, table.y_test)
+   (x_train_drop, y_train_drop) = drop_nan(x_train, table.y_train)
+   (x_test_drop, y_test_drop) = drop_nan(x_test, table.y_test)

models = {}
common_params = dict(
@@ -861,7 +908,7 @@
df = pd.DataFrame(
dict(epoch=[0], runtime=[runtime], rmse=[rmse], nlpd=[np.nan])
)
-   Path(_OUTPUT_DIR.value).mkdir(parents=True, exist_ok=True)
+   pathlib.Path(_OUTPUT_DIR.value).mkdir(parents=True, exist_ok=True)
csv_path_log = os.path.join(
_OUTPUT_DIR.value,
f'gboost-{n_estimators}-{featurize}.{dataset}.{series_id}.log.csv',
@@ -935,8 +982,8 @@ def run_experiment_tsreg(
for z in (table.x_train, table.x_test)
]

-   (x_train_drop, y_train_drop) = evaluate.drop_nan(x_train, table.y_train)
-   (x_test_drop, y_test_drop) = evaluate.drop_nan(x_test, table.y_test)
+   (x_train_drop, y_train_drop) = drop_nan(x_train, table.y_train)
+   (x_test_drop, y_test_drop) = drop_nan(x_test, table.y_test)

# Fit regression.
if method == 'OLS':
@@ -962,7 +1009,7 @@
df = pd.DataFrame(
dict(epoch=[0], runtime=[runtime], rmse=[rmse], nlpd=[np.nan])
)
-   Path(_OUTPUT_DIR.value).mkdir(parents=True, exist_ok=True)
+   pathlib.Path(_OUTPUT_DIR.value).mkdir(parents=True, exist_ok=True)
csv_path_log = os.path.join(
_OUTPUT_DIR.value, f'tsreg-{method}.{dataset}.{series_id}.log.csv'
)
