
Commit

Add example about the behavior of var_names in `sample_posterior_predictive`
ricardoV94 committed Nov 17, 2023
1 parent cac99d9 commit f0dccc7
Showing 1 changed file with 88 additions and 8 deletions.
96 changes: 88 additions & 8 deletions pymc/sampling/forward.py
@@ -464,8 +464,10 @@ def sample_posterior_predictive(
model : Model (optional if in ``with`` context)
Model to be used to generate the posterior predictive samples. It will
generally be the model used to generate the ``trace``, but it doesn't need to be
(see the last example below).
var_names : Iterable[str], optional
Names of variables for which to compute the posterior predictive samples.
By default, only observed variables are sampled.
See the example below for what happens when this argument is customized.
sample_dims : list of str, optional
Dimensions over which to loop and generate posterior predictive samples.
When `sample_dims` is ``None`` (default) both "chain" and "draw" are considered sample
dimensions.
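
For instance, a minimal sketch (assuming ``idata`` holds a posterior sampled from
``model``, as in the examples below): stack "chain" and "draw" into a single custom
sample dimension and loop over that instead.

.. code:: python

    # Hypothetical usage: a posterior with one stacked "sample" dimension
    stacked = idata.posterior.stack(sample=("chain", "draw"))
    with model:
        pm.sample_posterior_predictive(stacked, sample_dims=["sample"])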
@@ -497,22 +499,100 @@ def sample_posterior_predictive(
Examples
--------
Posterior predictive samples are generated for any variable included in
`var_names`, conditioned on the values of the other random variables found
in the trace. To ensure the samples are internally consistent, any random
variable that depends on another random variable that is being sampled is
itself also sampled. The final list of variables being sampled is shown in
the log output.

Notice that if a random variable has no dependency on other random variables,
posterior predictive sampling is equivalent to prior sampling. Likewise, if
all random variables are being sampled, the behavior is equivalent to that of
``sample_prior_predictive``.

Notice how the behavior of the function changes with different `var_names` below:

.. code:: python

    from logging import getLogger

    import pymc as pm

    # Some environments like google colab suppress the default logging output of PyMC
    getLogger("pymc").setLevel("INFO")

    kwargs = {"progressbar": False, "random_seed": 0}

    with pm.Model() as model:
        x = pm.Normal("x")
        y = pm.Normal("y")
        z = pm.Normal("z", x + y**2)
        det = pm.Deterministic("det", pm.math.exp(z))
        obs = pm.Normal("obs", det, 1, observed=[20])

        idata = pm.sample(tune=10, draws=10, chains=2, **kwargs)

        # Default behavior
        # Posterior predictive draws of `obs`, conditioned on the posterior draws of `z`
        pm.sample_posterior_predictive(idata, var_names=["obs"], **kwargs)
        # Sampling: [obs]

        # Deterministic re-computation of `det` given the posterior draws of `z`
        pm.sample_posterior_predictive(idata, var_names=["det"], **kwargs)
        # Sampling: []

        # Posterior predictive draws of `z` and `det`, conditioned on the posterior draws of `x` and `y`
        # It does not matter that `z` was not an observed variable.
        pm.sample_posterior_predictive(idata, var_names=["z", "det"], **kwargs)
        # Sampling: [z]

        # Posterior predictive draws of `y`, `z` and `det`, conditioned on the posterior draws of `x`
        # Note that `y` samples are equivalent to the prior, since no other variables influence it.
        pm.sample_posterior_predictive(idata, var_names=["y", "z", "det"], **kwargs)
        # Sampling: [y, z]

        # Same as before, except `z` is not stored in the returned trace.
        # For computing `det` we still have to resample `z`, as it depends on `y`,
        # which is also being sampled.
        pm.sample_posterior_predictive(idata, var_names=["y", "det"], **kwargs)
        # Sampling: [y, z]

        # Everything is sampled, which is equivalent to calling `sample_prior_predictive`
        pm.sample_posterior_predictive(idata, var_names=["x", "y", "z", "det"], **kwargs)
        # Sampling: [x, y, z]

    # Notice that "sampling" a dependent deterministic doesn't force random variables to be sampled.
    with pm.Model() as model:
        x = pm.Normal("x")
        y = pm.Normal("y")
        det_xy = pm.Deterministic("det_xy", x + y**2)
        z = pm.Normal("z", x + y**2)
        det_z = pm.Deterministic("det_z", pm.math.exp(z))
        obs = pm.Normal("obs", det_z, 1, observed=[20])

        idata = pm.sample(tune=10, draws=10, chains=2, **kwargs)

        pm.sample_posterior_predictive(idata, var_names=["det_xy", "det_z"], **kwargs)
        # Sampling: [z]
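
        # `det_xy` is recomputed from the traced draws of `x` and `y`, so it matches
        # the values already stored in the posterior, while `det_z` does not, because
        # `z` is resampled. A quick sanity check (assumes `import numpy as np`):
        pp = pm.sample_posterior_predictive(idata, var_names=["det_xy", "det_z"], **kwargs)
        np.testing.assert_allclose(pp.posterior_predictive["det_xy"], idata.posterior["det_xy"])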

You can manipulate the InferenceData to control the number of samples:

.. code:: python

    import pymc as pm

    with pm.Model() as model:
        ...
        idata = pm.sample()

    # Generate 1 posterior predictive sample for every 5 posterior samples.
    thinned_idata = idata.sel(draw=slice(None, None, 5))
    with model:
        idata.extend(pm.sample_posterior_predictive(thinned_idata))

    # Generate 5 posterior predictive samples for every posterior sample.
    expanded_data = idata.posterior.expand_dims(pred_id=5)
    with model:
        idata.extend(pm.sample_posterior_predictive(expanded_data))
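
As noted in the description of the ``model`` parameter above, the trace does not
have to come from the same model object, as long as the relevant variable names
match. A minimal sketch (hypothetical models ``model_a`` and ``model_b``):

.. code:: python

    import pymc as pm

    with pm.Model() as model_a:
        mu = pm.Normal("mu")
        pm.Normal("obs", mu, 1, observed=[-1.2, 0.3])
        idata = pm.sample()

    # `model_b` shares the free variable name "mu", so the posterior draws of
    # `mu` in `idata` can condition its posterior predictive samples.
    with pm.Model() as model_b:
        mu = pm.Normal("mu")
        pm.Normal("obs", mu, 2, observed=[-1.2, 0.3])  # different likelihood scale
        pp = pm.sample_posterior_predictive(idata)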
"""

_trace: Union[MultiTrace, PointList]
