Sample from constrained kernel params before model fitting #297

Merged
Changes from 2 commits
120 changes: 102 additions & 18 deletions tests/unit/models/test_model_interfaces.py
@@ -361,19 +361,29 @@ def test_gaussian_process_regression_pairwise_covariance(gpr_interface_factory)
@unittest.mock.patch(
"trieste.models.model_interfaces.GaussianProcessRegression.find_best_model_initialization"
)
@pytest.mark.parametrize("d", [1, 2])
def test_gaussian_process_regression_correctly_counts_num_trainable_params_with_priors(
mocked_model_initializer, d, gpr_interface_factory
@pytest.mark.parametrize("d", [1, 3])
@pytest.mark.parametrize("prior_for_lengthscale", [True, False])
def test_gaussian_process_regression_correctly_counts_params_that_can_be_sampled(
mocked_model_initializer, d, prior_for_lengthscale, gpr_interface_factory
) -> None:
x = tf.constant(np.arange(1, 5 * d + 1).reshape(-1, d), dtype=tf.float64) # shape: [5, d]
model = gpr_interface_factory(x, _3x_plus_10(x))
model.model.kernel = gpflow.kernels.RBF(lengthscales=tf.ones([d], dtype=tf.float64))
model.model.likelihood.variance.assign(1.0)
gpflow.set_trainable(model.model.likelihood, True)

model.model.kernel.lengthscales.prior = tfp.distributions.LogNormal(
loc=tf.math.log(model.model.kernel.lengthscales), scale=tf.cast(1.0, dtype=tf.float64)
)
if prior_for_lengthscale:
model.model.kernel.lengthscales.prior = tfp.distributions.LogNormal(
loc=tf.math.log(model.model.kernel.lengthscales), scale=1.0
)

else:
upper = tf.cast([10.0] * d, dtype=tf.float64)
lower = upper / 100
model.model.kernel.lengthscales = gpflow.Parameter(
model.model.kernel.lengthscales, transform=tfp.bijectors.Sigmoid(low=lower, high=upper)
)

model.model.likelihood.variance.prior = tfp.distributions.LogNormal(
loc=tf.cast(-2.0, dtype=tf.float64), scale=tf.cast(5.0, dtype=tf.float64)
)
@@ -389,12 +399,13 @@ def test_gaussian_process_regression_correctly_counts_num_trainable_params_with_
npt.assert_array_equal(num_samples, tf.minimum(1000, 100 * (d + 1)))


def test_find_best_model_initialization_only_changes_params_with_priors(
gpr_interface_factory,
@pytest.mark.parametrize("dim", [1, 10])
def test_find_best_model_initialization_changes_params_with_priors(
gpr_interface_factory, dim: int
) -> None:
x = tf.constant(np.arange(1, 5).reshape(-1, 1), dtype=gpflow.default_float()) # shape: [4, 1]
model = gpr_interface_factory(x, _3x_plus_10(x))
model.model.kernel = gpflow.kernels.RBF()
model.model.kernel = gpflow.kernels.RBF(lengthscales=[0.2] * dim)

if isinstance(model, (VariationalGaussianProcess, SparseVariational)):
pytest.skip("find_best_model_initialization is only implemented for the GPR models.")
@@ -406,27 +417,62 @@ def test_find_best_model_initialization_only_changes_params_with_priors(
model.find_best_model_initialization(2)

npt.assert_allclose(1.0, model.model.kernel.variance)
npt.assert_array_equal(dim, model.model.kernel.lengthscales.shape)
npt.assert_raises(
AssertionError, npt.assert_allclose, [0.2] * dim, model.model.kernel.lengthscales
)


@pytest.mark.parametrize("dim", [1, 10])
def test_find_best_model_initialization_changes_params_with_sigmoid_bijectors(
gpr_interface_factory, dim: int
) -> None:
x = tf.constant(np.arange(1, 5).reshape(-1, 1), dtype=gpflow.default_float()) # shape: [4, 1]
model = gpr_interface_factory(x, _3x_plus_10(x))
model.model.kernel = gpflow.kernels.RBF(lengthscales=[0.2] * dim)

if isinstance(model, (VariationalGaussianProcess, SparseVariational)):
pytest.skip("find_best_model_initialization is only implemented for the GPR models.")

upper = tf.cast([10.0] * dim, dtype=tf.float64)
lower = upper / 100
model.model.kernel.lengthscales = gpflow.Parameter(
model.model.kernel.lengthscales, transform=tfp.bijectors.Sigmoid(low=lower, high=upper)
)

model.find_best_model_initialization(2)

npt.assert_allclose(1.0, model.model.kernel.variance)
npt.assert_array_equal(dim, model.model.kernel.lengthscales.shape)
npt.assert_raises(
AssertionError, npt.assert_allclose, [0.2] * dim, model.model.kernel.lengthscales
)


@random_seed
def test_find_best_model_initialization_improves_likelihood(gpr_interface_factory) -> None:
@pytest.mark.parametrize("dim", [1, 10])
def test_find_best_model_initialization_improves_likelihood(
gpr_interface_factory, dim: int
) -> None:
x = tf.constant(np.arange(1, 10).reshape(-1, 1), dtype=gpflow.default_float()) # shape: [9, 1]
model = gpr_interface_factory(x, _3x_plus_10(x))
model.model.kernel = gpflow.kernels.RBF()
model.model.kernel = gpflow.kernels.RBF(variance=1.0, lengthscales=[0.2] * dim)

if isinstance(model, (VariationalGaussianProcess, SparseVariational)):
pytest.skip("find_best_model_initialization is only implemented for the GPR models.")

model.model.kernel.lengthscales.prior = tfp.distributions.LogNormal(
loc=tf.math.log(model.model.kernel.lengthscales), scale=1.0
model.model.kernel.variance.prior = tfp.distributions.LogNormal(
loc=np.float64(-2.0), scale=np.float64(1.0)
)
upper = tf.cast([10.0] * dim, dtype=tf.float64)
lower = upper / 100
model.model.kernel.lengthscales = gpflow.Parameter(
model.model.kernel.lengthscales, transform=tfp.bijectors.Sigmoid(low=lower, high=upper)
)

pre_init_likelihood = model.model.maximum_log_likelihood_objective()
pre_init_likelihood = -model.model.training_loss()
model.find_best_model_initialization(10)
post_init_likelihood = model.model.maximum_log_likelihood_objective()
post_init_likelihood = -model.model.training_loss()

npt.assert_array_less(pre_init_likelihood, post_init_likelihood)

@@ -660,13 +706,51 @@ def test_sparse_variational_optimize(batcher, compile: bool) -> None:


@random_seed
def test_randomize_model_hyperparameters_only_randomize_kernel_parameters_with_priors() -> None:
kernel = gpflow.kernels.RBF(variance=1.0, lengthscales=[0.2, 0.2])
@pytest.mark.parametrize("dim", [1, 10])
def test_randomize_model_hyperparameters_randomize_kernel_parameters_with_priors(dim: int) -> None:
kernel = gpflow.kernels.RBF(variance=1.0, lengthscales=[0.2] * dim)
kernel.lengthscales.prior = tfp.distributions.LogNormal(
loc=tf.math.log(kernel.lengthscales), scale=1.0
)
randomize_model_hyperparameters(kernel)

npt.assert_allclose(1.0, kernel.variance)
npt.assert_array_equal(dim, kernel.lengthscales.shape)
npt.assert_raises(AssertionError, npt.assert_allclose, [0.2] * dim, kernel.lengthscales)


@random_seed
@pytest.mark.parametrize("dim", [1, 10])
def test_randomize_model_hyperparameters_randomizes_constrained_kernel_parameters(dim: int) -> None:
kernel = gpflow.kernels.RBF(variance=1.0, lengthscales=[0.2] * dim)
upper = tf.cast([10.0] * dim, dtype=tf.float64)
lower = upper / 100
kernel.lengthscales = gpflow.Parameter(
kernel.lengthscales, transform=tfp.bijectors.Sigmoid(low=lower, high=upper)
)

randomize_model_hyperparameters(kernel)

npt.assert_allclose(1.0, kernel.variance)
npt.assert_raises(AssertionError, npt.assert_allclose, [0.2, 0.2], kernel.lengthscales)
npt.assert_array_equal(dim, kernel.lengthscales.shape)
npt.assert_raises(AssertionError, npt.assert_allclose, [0.2] * dim, kernel.lengthscales)


@random_seed
@pytest.mark.parametrize("dim", [1, 10])
def test_randomize_model_hyperparameters_randomizes_kernel_parameters_with_constraints_or_priors(
dim: int,
) -> None:
kernel = gpflow.kernels.RBF(variance=1.0, lengthscales=[0.2] * dim)
upper = tf.cast([10.0] * dim, dtype=tf.float64)
lower = upper / 100
kernel.lengthscales = gpflow.Parameter(
kernel.lengthscales, transform=tfp.bijectors.Sigmoid(low=lower, high=upper)
)
kernel.variance.prior = tfp.distributions.LogNormal(loc=np.float64(-2.0), scale=np.float64(1.0))

randomize_model_hyperparameters(kernel)

npt.assert_raises(AssertionError, npt.assert_allclose, 1.0, kernel.variance)
npt.assert_array_equal(dim, kernel.lengthscales.shape)
npt.assert_raises(AssertionError, npt.assert_allclose, [0.2] * dim, kernel.lengthscales)
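
For reference, a minimal standalone sketch of the constrained-kernel pattern these tests exercise. It only uses calls that appear in the diff above; the dimensionality, bounds, and lengthscale values are illustrative:

import gpflow
import tensorflow as tf
import tensorflow_probability as tfp

from trieste.models.model_interfaces import randomize_model_hyperparameters

dim = 2
kernel = gpflow.kernels.RBF(variance=1.0, lengthscales=[0.2] * dim)
upper = tf.cast([10.0] * dim, dtype=tf.float64)
lower = upper / 100
# Constrain the lengthscales to (lower, upper) with a Sigmoid bijector ...
kernel.lengthscales = gpflow.Parameter(
    kernel.lengthscales, transform=tfp.bijectors.Sigmoid(low=lower, high=upper)
)
# ... and put a prior on the (unconstrained) variance, so that both
# sampling branches of randomize_model_hyperparameters are used.
kernel.variance.prior = tfp.distributions.LogNormal(
    loc=tf.cast(-2.0, dtype=tf.float64), scale=tf.cast(1.0, dtype=tf.float64)
)

# The lengthscales are resampled uniformly from (lower, upper); the
# variance is resampled from its LogNormal prior.
randomize_model_hyperparameters(kernel)
print(kernel.lengthscales.numpy(), kernel.variance.numpy())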
65 changes: 46 additions & 19 deletions trieste/models/model_interfaces.py
@@ -20,6 +20,7 @@

import gpflow
import tensorflow as tf
import tensorflow_probability as tfp
from gpflow.models import GPR, SGPR, SVGP, VGP, GPModel
from gpflow.utilities import multiple_assign, read_values

@@ -424,30 +425,47 @@ def optimize(self, dataset: Dataset) -> None:
"""
Optimize the model with the specified `dataset`.

If any of the kernel's trainable parameters have priors, then we begin model optimization
from the best of a random sample from these parameters' priors. For trainable parameters
without priors, we begin optimization from their initial values.
For :class:`GaussianProcessRegression`, we (optionally) try multiple randomly sampled
kernel parameter configurations as well as the configuration specified when initializing
the kernel. The best configuration is used as the starting point for model optimization.

For trainable parameters constrained to lie in a finite interval (through a sigmoid
bijector), we begin model optimization from the best of a random sample from these
parameters' acceptable domains.

For trainable parameters without constraints but with priors, we begin model optimization
from the best of a random sample from these parameters' priors.

For trainable parameters with neither priors nor constraints, we begin optimization from
their initial values.

:param dataset: The data with which to optimize the `model`.
"""

num_trainable_params_with_priors = tf.reduce_sum(
[tf.size(param) for param in self.model.trainable_parameters if param.prior is not None]
num_trainable_params_with_priors_or_constraints = tf.reduce_sum(
[
tf.size(param)
for param in self.model.trainable_parameters
if param.prior is not None or isinstance(param.bijector, tfp.bijectors.Sigmoid)
]
)

if num_trainable_params_with_priors >= 1: # Find a promising kernel initialization
num_prior_samples = tf.minimum(1000, 100 * num_trainable_params_with_priors)
self.find_best_model_initialization(num_prior_samples)
if (
num_trainable_params_with_priors_or_constraints >= 1
): # Find a promising kernel initialization
num_kernel_samples = tf.minimum(
1000, 100 * num_trainable_params_with_priors_or_constraints
)
self.find_best_model_initialization(num_kernel_samples)

self.optimizer.optimize(self.model, dataset)

def find_best_model_initialization(self, num_prior_samples) -> None:
def find_best_model_initialization(self, num_kernel_samples) -> None:
"""
Test `num_prior_samples` models with kernel parameters sampled from their
priors. The model's kernel parameters are then set to those achieving maximal
likelihood across the sample.
Test `num_kernel_samples` models with sampled kernel parameters. The model's kernel
parameters are then set to the sample achieving maximal likelihood.

:param num_prior_samples: Number of randomly sampled kernels to evaluate.
:param num_kernel_samples: Number of randomly sampled kernels to evaluate.
"""

@tf.function
@@ -460,10 +478,10 @@ def evaluate_likelihood_of_model_parameters() -> tf.Tensor:
self.model.maximum_log_likelihood_objective() + self.model.log_prior_density()
)

for _ in tf.range(num_prior_samples):
for _ in tf.range(num_kernel_samples):
try:
log_likelihood = evaluate_likelihood_of_model_parameters()
except tf.errors.InvalidArgumentError: # allow badly specified priors
except tf.errors.InvalidArgumentError: # allow badly specified kernel params
log_likelihood = -1e100

if log_likelihood > max_log_likelihood: # only keep best kernel params
@@ -589,10 +607,19 @@ def _assert_data_is_compatible(new_data: Dataset, existing_data: Dataset) -> Non

def randomize_model_hyperparameters(model: gpflow.models.GPModel) -> None:
"""
Sets hyperparameters to random samples from their prior distributions.
Sets hyperparameters to random samples from their constrained domains or (if no constraints
are available) their prior distributions.

:param model: Any GPModel from gpflow.
"""
for parameter in model.trainable_parameters:
if parameter.prior is not None:
parameter.assign(parameter.prior.sample())
for param in model.trainable_parameters:
if isinstance(param.bijector, tfp.bijectors.Sigmoid):
sample = tf.random.uniform(
[1],
minval=param.bijector.low,
maxval=param.bijector.high,
dtype=param.bijector.low.dtype,
)
param.assign(sample)
elif param.prior is not None:
param.assign(param.prior.sample())
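
Putting the pieces together, a minimal end-to-end sketch of how a constrained kernel parameter now feeds into optimize(). The Dataset and GaussianProcessRegression imports and the toy data are assumptions about trieste's public interface, not part of this diff:

import gpflow
import numpy as np
import tensorflow as tf
import tensorflow_probability as tfp

from trieste.data import Dataset
from trieste.models.model_interfaces import GaussianProcessRegression

# Toy 1-d regression data (illustrative).
x = np.linspace(0.0, 1.0, 20).reshape(-1, 1)
y = np.sin(10.0 * x)

kernel = gpflow.kernels.RBF(lengthscales=[0.5])
# Constrain the lengthscale to (0.01, 2.0): optimize() then starts from the
# best of a uniform sample over this interval rather than from 0.5 alone.
kernel.lengthscales = gpflow.Parameter(
    kernel.lengthscales,
    transform=tfp.bijectors.Sigmoid(
        low=tf.cast([0.01], tf.float64), high=tf.cast([2.0], tf.float64)
    ),
)

model = GaussianProcessRegression(gpflow.models.GPR((x, y), kernel))

# One constrained scalar parameter -> min(1000, 100 * 1) = 100 kernel samples
# are evaluated by find_best_model_initialization before gradient-based fitting.
model.optimize(Dataset(tf.constant(x), tf.constant(y)))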