diff --git a/Project.toml b/Project.toml index fd175069..f2910362 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "TMLE" uuid = "8afdd2fb-6e73-43df-8b62-b1650cd9c8cf" authors = ["Olivier Labayle"] -version = "0.16.0" +version = "0.16.1" [deps] AbstractDifferentiation = "c29ec348-61ec-40c8-8164-b8c60e9d9f3d" diff --git a/docs/make.jl b/docs/make.jl index 1b5acea6..69c5463e 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -34,6 +34,7 @@ makedocs(; joinpath("examples", "super_learning.md"), joinpath("examples", "double_robustness.md") ], + "Estimators' Cheat Sheet" => "estimators_cheatsheet.md", "Resources" => "resources.md", "API Reference" => "api.md" ], diff --git a/docs/src/assets/sample_splitting.png b/docs/src/assets/sample_splitting.png new file mode 100644 index 00000000..b1d724d8 Binary files /dev/null and b/docs/src/assets/sample_splitting.png differ diff --git a/docs/src/estimators_cheatsheet.md b/docs/src/estimators_cheatsheet.md new file mode 100644 index 00000000..05c1d27c --- /dev/null +++ b/docs/src/estimators_cheatsheet.md @@ -0,0 +1,304 @@ +# Estimators' Cheatsheet + +This section is an effort to succinctly summarize the definition of the semi-parametric estimators available in this package. As such, it is not self-contained; rather, it is intended as a mathematical memo that can be quickly searched. Gradients, One-Step and Targeted Maximum-Likelihood estimators are provided for the Counterfactual Mean, Average Treatment Effect and Average Interaction Effect. Estimators are presented in both their canonical and cross-validated versions. + +One major difficulty I personally faced when entering the field was the overwhelming notational burden. Unfortunately, this burden is necessary to understand how the various mathematical objects are handled by the procedures presented below. It is thus worth the effort to make sure you understand what each notation means. The reward?
After reading this document, you should be able to implement any estimator presented on this page. + +Finally, if you find inconsistencies or imprecisions, please report them, so we can keep improving! + +## Where it all begins + +### Notations + +This is the notation we use throughout: + +- The observed data: We assume we observe the realization of a random vector ``\bold{Z}_n = (Z_1, ..., Z_n)``. The components of ``\bold{Z}_n`` are assumed independent and identically distributed according to ``\mathbb{P}``, i.e. ``\forall i \in \{1, ..., n\}, Z_i \sim \mathbb{P}``. Note that each ``Z_i`` is usually a vector as well, for us: ``Z_i = (W_i, T_i, Y_i)``. +- The Empirical Distribution: ``\mathbb{P}_n(A) = \frac{1}{n}\sum_{i=1}^n 1_A(Z_i)``. For each set ``A``, it computes the proportion of points falling into ``A``. +- Expected value of a function ``f`` of the data ``Z`` with respect to a distribution ``\mathbb{P}``: ``\mathbb{P}f \equiv \mathbb{E}_{\mathbb{P}}[f(Z)]``. Note that for the empirical distribution ``\mathbb{P}_n``, this is simply ``\mathbb{P}_nf = \frac{1}{n}\sum_{i=1}^nf(Z_i)``, in other words, the sample mean. +- The Estimand: The unknown finite dimensional quantity we are interested in. ``\Psi`` is defined as a functional, i.e. ``\mathbb{P} \mapsto \Psi(\mathbb{P}) \in \mathbb{R}^d``. In this document ``d=1``. +- The Gradient of ``\Psi`` at the distribution ``\mathbb{P}``: ``\phi_{\mathbb{P}}``, a function of the data, i.e. ``Z \mapsto \phi_{\mathbb{P}}(Z)``. The gradient satisfies two properties: (i) ``\mathbb{P}\phi_{\mathbb{P}} = 0``, and (ii) ``Var[\phi_{\mathbb{P}}] < \infty``. +- An estimator is a function of the data ``\bold{Z}_n``, and thus a random variable, that seeks to approximate an unknown quantity. For instance, ``\hat{\Psi}_n`` denotes an estimator for ``\Psi``. Notice that the empirical distribution is an estimator for ``\mathbb{P}``, but the hat is omitted to distinguish it from other estimators that will be defined later on.
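The operator notation ``\mathbb{P}_n f`` is used constantly below, so it is worth a quick sanity check. Here is a minimal Julia sketch of it (the helper name `empirical_mean` is ours for illustration, not part of TMLE.jl):

```julia
# Pₙf = (1/n) Σᵢ f(Zᵢ): the sample mean of f evaluated at each observation.
empirical_mean(f, Z) = sum(f(z) for z in Z) / length(Z)

# Three toy observations Zᵢ = (Wᵢ, Tᵢ, Yᵢ)
Z = [(W=0.1, T=1, Y=2.0), (W=0.3, T=0, Y=1.0), (W=0.2, T=1, Y=3.0)]

# Pₙ applied to the Y coordinate: (2.0 + 1.0 + 3.0) / 3
empirical_mean(z -> z.Y, Z)
```

The same operator applied to an indicator function recovers the empirical distribution ``\mathbb{P}_n(A)`` itself.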
+ +### The Counterfactual Mean + +Let ``\bold{Z}=(W, T, Y)_{i=1..n}`` be a dataset generated according to the following structural causal model: + +```math +\begin{aligned} +W &= f_W(U_W) \\ +T &= f_T(W, U_T) \\ +Y &= f_Y(T, W, U_Y) +\end{aligned} +``` + +This is certainly the most common scenario in causal inference where ``Y`` is an outcome of interest, ``T`` a set of treatment variables and ``W`` a set of confounding variables. We are generally interested in the effect of ``T`` on ``Y``. In this document we will consider the counterfactual mean as a workhorse. We will show that the estimators for the Average Treatment Effect and Average Interaction Effect can easily be derived from it. Under usual conditions, the counterfactual mean identifies to the following statistical estimand: + +```math +CM_t(\mathbb{P}) = \Psi_t(\mathbb{P}) = \mathbb{E}_{\mathbb{P}}[\mathbb{E}_{\mathbb{P}}[Y | W, T = t]] = \int \mathbb{E}_{\mathbb{P}}[Y | W=w, T = t] d\mathbb{P}(w) +``` + +From this definition, we can see that ``\Psi_t`` depends on ``\mathbb{P}`` only through two relevant factors: + +```math +\begin{aligned} +Q_Y(W, T) &= \mathbb{E}_{\mathbb{P}}[Y | W, T] \\ +Q_W(W) &= \mathbb{P}(W) +\end{aligned} +``` + +So that ``\Psi(\mathbb{P}) = \Psi(Q_Y, Q_W)`` (the ``t`` subscript is dropped as it is unambiguous). This makes explicit that we don't need to estimate ``\mathbb{P}`` but only the relevant factors ``(Q_Y, Q_W)`` to obtain a plugin estimate of ``\Psi(\mathbb{P})``. Finally, it will be useful to define an additional factor: + +```math +g(W, T) = \mathbb{P}(T | W) +``` + +This is because the gradient of ``\Psi``: + +```math +\phi_{CM_{t}, \mathbb{P}}(W, T, Y) = \frac{\mathbb{1}(T = t)}{g(W, t)}(Y − Q_Y(W, t)) + Q_Y(W, t) − \Psi(\mathbb{P}) +``` + +which is the foundation of semi-parametric estimation, depends on this so-called nuisance parameter. 
+ +### Average Treatment Effect and Average Interaction Effect + +They are simple linear combinations of counterfactual means and so are their gradients. + +#### Average Treatment Effect + +In all generality, for two values of a categorical treatment variable ``T: t_{control} \rightarrow t_{case}``, the Average Treatment Effect (ATE) is defined by: + +```math +ATE_{t_{case}, t_{control}}(\mathbb{P}) = (CM_{t_{case}} - CM_{t_{control}})(\mathbb{P}) +``` + +And it's associated gradient is: + +```math +\begin{aligned} +\phi_{ATE}(W, T, Y) &= (\phi_{CM_{t_{case}}} - \phi_{CM_{t_{control}}})(W, T, Y) \\ +&= H(W, T)(Y − Q_Y(W, T)) + C(W) − ATE_{t_{case}, t_{control}}(\mathbb{P}) +\end{aligned} +``` + +with: + +```math +\begin{aligned} + \begin{cases} + H(W, T) &= \frac{ \mathbb{1}(T = t_{case}) - \mathbb{1}(T = t_{control})}{g(W, T)} \\ + C(W) &= (Q_Y(W, t_{case}) - Q_Y(W, t_{control})) + \end{cases} +\end{aligned} +``` + +``H`` is also known as the clever covariate in the Targeted Learning literature (see later). + +#### Average Interaction Effect + +For simplicity, we only consider two treatments ``T = (T_1, T_2)`` such that ``T_1: t_{1,control} \rightarrow t_{1,case}`` and ``T_2: t_{2,control} \rightarrow t_{2,case}``. 
The Average Interaction Effect (AIE) of ``(T_1, T_2)`` is defined as: + +```math +\begin{aligned} +AIE(\mathbb{P}) &= (CM_{t_{1, case}, t_{2, case}} - CM_{t_{1, case}, t_{2, control}} \\ +&- CM_{t_{1, control}, t_{2, case}} + CM_{t_{1, control}, t_{2, control}})(\mathbb{P}) +\end{aligned} +``` + +And its gradient is given by: + +```math +\begin{aligned} +\phi_{AIE}(W, T, Y) &= (\phi_{CM_{t_{1,case}, t_{2,case}}} - \phi_{CM_{t_{1,case}, t_{2,control}}} \\ +&- \phi_{CM_{t_{1,control}, t_{2,case}}} + \phi_{CM_{t_{1,control}, t_{2,control}}})(W, T, Y) \\ +&= H(W, T)(Y − Q_Y(W, T)) + C(W) − AIE(\mathbb{P}) +\end{aligned} +``` + +with: + +```math +\begin{aligned} + \begin{cases} + H(W, T) &= \frac{(\mathbb{1}(T_1 = t_{1, case}) - \mathbb{1}(T_1 = t_{1, control}))(\mathbb{1}(T_2 = t_{2, case}) - \mathbb{1}(T_2 = t_{2, control}))}{g(W, T)} \\ + C(W) &= (Q_Y(W, t_{1, case}, t_{2, case}) - Q_Y(W, t_{1, case}, t_{2, control}) - Q_Y(W, t_{1, control}, t_{2, case}) + Q_Y(W, t_{1, control}, t_{2, control})) + \end{cases} +\end{aligned} +``` + +For higher-order interactions, the two factors ``H`` and ``C`` can be similarly inferred. + +## Asymptotic Analysis of Plugin Estimators + +The theory is based on the von Mises expansion (the functional equivalent of Taylor's expansion), which simplifies because the gradient is mean-zero at any distribution, ``\mathbb{E}_{\mathbb{P}}[\phi_{\mathbb{P}}(Z)] = 0``: + +```math +\begin{aligned} +\Psi(\hat{\mathbb{P}}) − \Psi(\mathbb{P}) &= \int \phi_{\hat{\mathbb{P}}}(z) \mathrm{d}(\hat{\mathbb{P}} − \mathbb{P})(z) + R_2(\hat{\mathbb{P}}, \mathbb{P}) \\ +&= - \int \phi_{\hat{\mathbb{P}}}(z) \mathrm{d}\mathbb{P}(z) + R_2(\hat{\mathbb{P}}, \mathbb{P}) +\end{aligned} +``` + +This suggests that a plugin estimator, one that simply evaluates ``\Psi`` at an estimator ``\hat{\mathbb{P}}`` of ``\mathbb{P}``, is biased.
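For intuition, the expansion above can be checked on the simplest possible functional, ``\Psi(\mathbb{P}) = \mathbb{E}_{\mathbb{P}}[Z]``, whose gradient is ``\phi_{\mathbb{P}}(z) = z - \Psi(\mathbb{P})`` and whose remainder ``R_2`` is exactly zero. A toy sketch (illustrative numbers, not TMLE.jl code):

```julia
# Ψ(P) = E_P[Z] depends on P only through its mean, and φ_P(z) = z - Ψ(P).
Ψ(mean_P) = mean_P
ϕ(z, mean_P) = z - mean_P

true_mean = 1.5       # Ψ(P), unknown in practice
estimated_mean = 1.2  # Ψ(P̂), from some estimate P̂ of P

# -∫ φ_P̂(z) dP(z) = -(E_P[Z] - Ψ(P̂)) = Ψ(P̂) - Ψ(P): the expansion holds with R₂ = 0.
first_order_term = -(true_mean - estimated_mean)
Ψ(estimated_mean) - Ψ(true_mean) == first_order_term
```

For general estimands ``R_2 \neq 0`` and the first-order term is unknown, which is precisely the bias the estimators below must handle.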
This bias can be elegantly decomposed into four terms by reworking the previous expression: + +```math +\Psi(\hat{\mathbb{P}}) − \Psi(\mathbb{P}) = \mathbb{P}_n\phi_{\mathbb{P}}(Z) - \mathbb{P}_n\phi_{\mathbb{\hat{P}}}(Z) + (\mathbb{P}_n − \mathbb{P})(\phi_{\mathbb{\hat{P}}}(Z) − \phi_{\mathbb{P}}(Z)) + R_2(\mathbb{\hat{P}}, \mathbb{P}) +``` + +### 1. The Asymptotically Linear Term + +```math +\mathbb{P}_n\phi_{\mathbb{P}}(Z) +``` + +By the central limit theorem, it is asymptotically normal with variance ``Var[\phi]/n``; it will be used to construct a confidence interval for our final estimate. + +### 2. The First-Order Bias Term + +```math +- \mathbb{P}_n\phi_{\mathbb{\hat{P}}}(Z) +``` + +This is the term both the One-Step estimator and the Targeted Maximum-Likelihood estimator deal with. + +### 3. The Empirical Process Term + +```math +(\mathbb{P}_n − \mathbb{P})(\phi_{\mathbb{\hat{P}}}(Z) − \phi_{\mathbb{P}}(Z)) +``` + +This can be shown to be of order ``o_{\mathbb{P}}(\frac{1}{\sqrt{n}})`` if ``\phi_{\hat{\mathbb{P}}}`` converges to ``\phi_{\mathbb{P}}`` in ``L_2(\mathbb{P})`` norm, that is: + +```math +\int (\phi_{\hat{\mathbb{P}}}(Z) - \phi_{\mathbb{P}}(Z) )^2 d\mathbb{P}(Z) = o_{\mathbb{P}}(1) +``` + +and any of the following holds: + +- ``\phi`` (or equivalently its components) is [Donsker](https://en.wikipedia.org/wiki/Donsker_classes), i.e., not too complex. +- The estimator is constructed using sample-splitting (see cross-validated estimators). + +### 4. The Exact Second-Order Remainder Term + +```math +R_2(\mathbb{\hat{P}}, \mathbb{P}) +``` + +This term is usually more complex to analyse.
Note, however, that it is entirely defined by the von Mises expansion, and for the counterfactual mean, it can be shown that if ``g(W,T) \geq \frac{1}{\eta}`` (positivity constraint): + +```math +|R_2(\mathbb{\hat{P}}, \mathbb{P})| \leq \eta ||\hat{Q}_{n, Y} - Q_{Y}|| \cdot ||\hat{g}_{n} - g|| +``` + +and thus, if the estimators ``\hat{Q}_{n, Y}`` and ``\hat{g}_{n}`` converge at a rate ``o_{\mathbb{P}}(n^{-\frac{1}{4}})``, the second-order remainder will be ``o_{\mathbb{P}}(\frac{1}{\sqrt{n}})``. This is the case for many popular estimators like random forests, neural networks, etc. + +## Asymptotic Linearity and Inference + +According to the previous section, the OSE and TMLE will be asymptotically linear, with efficient influence curve equal to the gradient ``\phi``: + +```math +\sqrt{n}(\hat{\Psi} - \Psi) = \frac{1}{\sqrt{n}} \sum_{i=1}^n \phi(Z_i) + o_{\mathbb{P}}(1) +``` + +By the [Central Limit Theorem](https://en.wikipedia.org/wiki/Central_limit_theorem) and [Slutsky's Theorem](https://en.wikipedia.org/wiki/Slutsky%27s_theorem) we have: + +```math +\sqrt{n}(\hat{\Psi} - \Psi) \leadsto \mathcal{N}(0, Var[\phi]) +``` + +Now, if we consider ``S_n = \hat{Std}(\phi_{\hat{\mathbb{P}}})`` as a consistent estimator for ``Std(\phi)``, we have, by Slutsky's Theorem again, the following pivot: + +```math +\frac{\sqrt{n}(\hat{\Psi} - \Psi)}{S_n} \leadsto \mathcal{N}(0, 1) +``` + +which can be used to build confidence intervals: + +```math +\underset{n \to \infty}{\lim} P(\hat{\Psi}_n - \frac{S_n}{\sqrt{n}}z_{\alpha} \leq \Psi \leq \hat{\Psi}_n + \frac{S_n}{\sqrt{n}}z_{\alpha}) = 1 - 2\alpha +``` + +Here, ``z_{\alpha}`` denotes the ``(1-\alpha)``-quantile of the standard normal distribution. + +## One-Step Estimator + +### Canonical OSE + +The One-Step estimator is very intuitive: it simply corrects the initial plugin estimator by removing the estimated first-order bias term. As such, it corrects for the bias in the estimand's space.
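Numerically, this correction and the Wald interval above take only a few lines. A sketch with made-up numbers (`Ψ̂_plugin` and `ϕ̂` are hypothetical placeholder values, not TMLE.jl output):

```julia
using Statistics: mean, std

Ψ̂_plugin = 0.8                 # hypothetical plugin estimate Ψ(P̂)
ϕ̂ = [0.05, -0.02, 0.03, 0.02]  # estimated gradient evaluated at each sample
n = length(ϕ̂)

# One-step correction: add Pₙφ_P̂ back to the plugin estimate
Ψ̂_ose = Ψ̂_plugin + mean(ϕ̂)

# 95% Wald-type confidence interval based on asymptotic linearity
Sₙ = std(ϕ̂)
ci = (Ψ̂_ose - 1.96 * Sₙ / sqrt(n), Ψ̂_ose + 1.96 * Sₙ / sqrt(n))
```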
Let ``\hat{P} = (\hat{Q}_{n,Y}, \hat{Q}_{n,W})`` be an estimator of the relevant factors of ``\mathbb{P}``, and let ``\hat{g}_n`` be an estimator of the nuisance function ``g``. The OSE is: + +```math +\hat{\Psi}_{n, OSE} = \Psi(\hat{P}) + \mathbb{P}_n{\phi_{\hat{P}}} +``` + +### CV-OSE + +Instead of assuming Donsker conditions limiting the complexity of the algorithms used, we can use sample-splitting techniques. For this, we split the data into ``K`` folds of (roughly) equal size. For a given sample ``i``, we denote by ``k(i)`` the fold it belongs to (called the validation fold/set) and by ``-k(i)`` the union of all remaining folds (called the training set). Similarly, we denote by ``\hat{Q}^{k}`` an estimator for ``Q`` obtained from samples in the validation fold ``k`` and ``\hat{Q}^{-k}`` an estimator for ``Q`` obtained from samples in the (training) folds ``\{1, ..., K\}-\{k\}``. + +The cross-validated One-Step estimator can be compactly written as an average over the folds of sub one-step estimators: + +```math +\begin{aligned} +\hat{\Psi}_{n, CV-OSE} &= \sum_{k=1}^K \frac{N_k}{n} (\Psi(\hat{Q}_Y^{-k}, \hat{Q}_W^{k}) + \hat{\mathbb{P}}_n^k \phi^{-k}) \\ +&= \frac{1}{n} \sum_{k=1}^K \sum_{\{i: k(i) = k\}} (\hat{Q}_Y^{-k}(W_i, T_i) + \phi^{-k}(W_i, T_i, Y_i)) +\end{aligned} +``` + +The first equation is the general form of the estimator, while the second corresponds to the counterfactual mean. The important thing to note is that, for each sub one-step estimator, the sum runs over the validation samples while ``\hat{Q}_Y^{-k}`` and ``\phi^{-k}`` are estimated using the training samples. + +## Targeted Maximum-Likelihood Estimator + +Unlike the One-Step estimator, the Targeted Maximum-Likelihood Estimator corrects the bias term in distribution space. That is, it moves the initial estimate ``\hat{\mathbb{P}}^0=\hat{\mathbb{P}}`` to a corrected ``\hat{\mathbb{P}}^*`` (notice the new superscript notation).
Then the plugin principle can be applied and the targeted estimator is simply ``\hat{\Psi}_{n, TMLE} = \Psi(\hat{\mathbb{P}}^*)``. This means TMLE always respects the natural range of the estimand, giving it an upper hand over the One-Step estimator. + +### Canonical TMLE + +The way ``\hat{\mathbb{P}}`` is modified is by means of a parametric sub-model, also known as a fluctuation. The choice of fluctuation depends on the target parameter of interest. It can be shown that for the counterfactual mean, it is sufficient to fluctuate ``\hat{Q}_{n, Y}`` only once using the following fluctuations: + +- ``\hat{Q}_{Y, \epsilon}(W, T) = \hat{Q}_{n, Y}(T, W) + \epsilon \hat{H}(T, W)``, for continuous outcomes ``Y``. +- ``\hat{Q}_{Y, \epsilon}(W, T) = \frac{1}{1 + e^{-(logit(\hat{Q}_{n, Y}(T, W)) + \epsilon \hat{H}(T, W))}}``, for binary outcomes ``Y``. + +where ``\hat{H}(T, W) = \frac{\mathbb{1}(T=t)}{\hat{g}_n(W, t)}`` is known as the clever covariate. The value of ``\epsilon`` is obtained by minimizing the loss ``L`` associated with ``Q_Y``, that is the mean-squared error for continuous outcomes and the negative log-likelihood for binary outcomes. This can easily be done via linear and logistic regression respectively, using the initial fit as offset. + +!!!
note + For the ATE and AIE, just as the gradient is obtained by linearity, the clever covariate used to fluctuate the initial ``\hat{Q}_{n, Y}`` is the one presented in [Average Treatment Effect and Average Interaction Effect](@ref). + +If we denote by ``\epsilon^*`` the value of ``\epsilon`` minimizing the loss, the TMLE is: + +```math +\hat{\Psi}_{n, TMLE} = \Psi(\hat{Q}_{n, Y, \epsilon^*}, \hat{Q}_{n, W}) +``` + +### CV-TMLE + +Using the same notation as the cross-validated One-Step estimator, the fluctuated distribution is obtained by solving: + +```math +\epsilon^* = \underset{\epsilon}{\arg \min} \frac{1}{n} \sum_{k=1}^K \sum_{\{i: k(i) = k\}} L(Y_i, \hat{Q}_{Y, \epsilon}^{-k}(W_i, T_i)) +``` + +where ``\hat{Q}_{Y, \epsilon}`` and ``L`` are the fluctuation and loss corresponding to the outcome type (continuous or binary). This leads to a targeted ``\hat{Q}_{n,Y}^{*}`` such that: + +```math +\forall i \in \{1, ..., n\}, \hat{Q}_{n, Y}^{*}(W_i, T_i) = \hat{Q}_{Y, \epsilon^*}^{-k(i)}(W_i, T_i) +``` + +That is, the predictions of ``\hat{Q}_{n, Y}^{*}`` for sample ``i`` are based on the out-of-fold predictions of ``\hat{Q}_{Y}^{-k(i)}`` and the "pooled" fluctuation given by ``\epsilon^*``. + +Then, the CV-TMLE is: + +```math +\begin{aligned} +\hat{\Psi}_{n, CV-TMLE} &= \sum_{k=1}^K \frac{N_k}{n} \Psi(\hat{Q}_{n, Y}^{*}, \hat{Q}_W^{k}) \\ +&= \frac{1}{n} \sum_{k=1}^K \sum_{\{i: k(i) = k\}} \hat{Q}_{n, Y}^{*}(W_i, T_i) +\end{aligned} +``` + +Notice that, while ``\hat{\Psi}_{n, CV-TMLE}`` is no longer a plugin estimator, it still respects the natural range of the parameter because it is an average of plugin estimators. + +## References + +The content of this page is largely inspired by: + +- [Semiparametric doubly robust targeted double machine learning: a review](https://arxiv.org/pdf/2203.06469.pdf). +- [Introduction to Modern Causal Inference](https://alejandroschuler.github.io/mci/).
+- [Targeted Learning, Causal Inference for Observational and Experimental Data](https://link.springer.com/book/10.1007/978-1-4419-9782-1). +- [STATS 361: Causal Inference](https://web.stanford.edu/~swager/stats361.pdf) diff --git a/docs/src/index.md b/docs/src/index.md index 5fcbd2ad..faa470c2 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -6,7 +6,7 @@ CurrentModule = TMLE ## Overview -TMLE.jl is a Julia implementation of the Targeted Minimum Loss-Based Estimation ([TMLE](https://link.springer.com/book/10.1007/978-1-4419-9782-1)) framework. If you are interested in efficient and unbiased estimation of causal effects, you are in the right place. Since TMLE uses machine-learning methods to estimate nuisance estimands, the present package is based upon [MLJ](https://alan-turing-institute.github.io/MLJ.jl/dev/). +TMLE.jl is a Julia implementation of the Targeted Minimum Loss-Based Estimation ([TMLE](https://link.springer.com/book/10.1007/978-1-4419-9782-1)) framework. If you are interested in leveraging the power of modern machine-learning methods while preserving interpretability and statistical inference guarantees, you are in the right place. TMLE.jl is compatible with any [MLJ](https://alan-turing-institute.github.io/MLJ.jl/dev/) compliant algorithm and any dataset respecting the [Tables](https://tables.juliadata.org/stable/) interface. ## Installation @@ -20,7 +20,7 @@ Pkg> add TMLE To run an estimation procedure, we need 3 ingredients: -1. A dataset: here a simulation dataset. +### 1. A dataset: here a simulation dataset For illustration, assume we know the actual data generating process is as follows: @@ -52,7 +52,7 @@ dataset = (Y=Y, T=categorical(T), W=W) nothing # hide ``` -2. A quantity of interest: here the Average Treatment Effect (ATE). +### 2. 
A quantity of interest: here the Average Treatment Effect (ATE) The Average Treatment Effect of ``T`` on ``Y`` confounded by ``W`` is defined as: @@ -64,7 +64,7 @@ The Average Treatment Effect of ``T`` on ``Y`` confounded by ``W`` is defined as ) ``` -3. An estimator: here a Targeted Maximum Likelihood Estimator (TMLE). +### 3. An estimator: here a Targeted Maximum Likelihood Estimator (TMLE) ```@example quick-start tmle = TMLEE() @@ -79,3 +79,35 @@ using Test # hide @test pvalue(OneSampleTTest(result, 2.5)) > 0.05 # hide nothing # hide ``` + +## Scope and Distinguishing Features + +The goal of this package is to provide an entry point for semi-parametric, asymptotically unbiased and efficient estimation in Julia. The two main general estimators that are known to achieve these properties are the One-Step estimator and the Targeted Maximum-Likelihood estimator. Most of the current effort has been centered around estimands that are composites of the counterfactual mean. + +Distinguishing Features: + +- Estimands: Counterfactual Mean, Average Treatment Effect, Interactions, and any composition thereof +- Estimators: TMLE, One-Step, in both canonical and cross-validated versions. +- Machine-Learning: Any [MLJ](https://alan-turing-institute.github.io/MLJ.jl/stable/) compatible model +- Dataset: Any dataset respecting the [Tables](https://tables.juliadata.org/stable/) interface (e.g. [DataFrames.jl](https://dataframes.juliadata.org/stable/)) +- Factorial Treatment Variables: + - Multiple treatments + - Categorical treatment values + +## Citing TMLE.jl + +If you use TMLE.jl for your own work and would like to cite us, here are the BibTeX and APA formats: + +- BibTeX + +```bibtex +@software{Labayle_TMLE_jl, + author = {Labayle, Olivier and Beentjes, Sjoerd and Khamseh, Ava and Ponting, Chris}, + title = {{TMLE.jl}}, + url = {https://github.com/olivierlabayle/TMLE.jl} +} +``` + +- APA + +Labayle, O., Beentjes, S., Khamseh, A., & Ponting, C. TMLE.jl [Computer software].
https://github.com/olivierlabayle/TMLE.jl \ No newline at end of file diff --git a/docs/src/resources.md b/docs/src/resources.md index ea3542cd..f39d4901 100644 --- a/docs/src/resources.md +++ b/docs/src/resources.md @@ -9,9 +9,15 @@ These are two very clear introductions to causal inference and semi-parametric e - [Introduction to Modern Causal Inference](https://alejandroschuler.github.io/mci/) (Alejandro Schuler, Mark J. van der Laan). - [A Ride in Targeted Learning Territory](https://achambaz.github.io/tlride/) (David Benkeser, Antoine Chambaz). -## Text Books +## Youtube + +- [Targeted Learning Webinar Series](https://youtube.com/playlist?list=PLy_CaFomwGGGH10tbq9zSyfHVrdklMaLe&si=BfJZ2fvDtGUZwELy) +- [TL Briefs](https://youtube.com/playlist?list=PLy_CaFomwGGFMxFtf4gkmC70dP9J6Q3Wa&si=aBZUnjJtOidIjhwR) + +## Books and Lecture Notes - [Targeted Learning](https://link.springer.com/book/10.1007/978-1-4419-9782-1) (Mark J. van der Laan, Sherri Rose). +- [STATS 361: Causal Inference](https://web.stanford.edu/~swager/stats361.pdf) ## Journal articles diff --git a/docs/src/user_guide/estimation.md b/docs/src/user_guide/estimation.md index 61bd2a86..f676f48c 100644 --- a/docs/src/user_guide/estimation.md +++ b/docs/src/user_guide/estimation.md @@ -4,7 +4,7 @@ CurrentModule = TMLE # Estimation -## Estimating a single Estimand +## Constructing and Using Estimators ```@setup estimation using Random @@ -51,11 +51,12 @@ scm = SCM([ ) ``` -Once a statistical estimand has been defined, we can proceed with estimation. At the moment, we provide 3 main types of estimators: +Once a statistical estimand has been defined, we can proceed with estimation. 
There are two semi-parametric efficient estimators in TMLE.jl: + +- The Targeted Maximum-Likelihood Estimator (`TMLEE`) +- The One-Step Estimator (`OSE`) + +While they have similar asymptotic properties, their finite sample performance may be different. They also have one distinguishing feature: the TMLE is a plugin estimator, which means it respects the natural bounds of the estimand of interest. In contrast, the OSE may in theory report values outside these bounds. In practice this rarely happens, and the estimand of interest may not impose any restriction on its domain anyway. Drawing from the example dataset and `SCM` from the Walk Through section, we can estimate the ATE for `T₁`. Let's use TMLE: @@ -72,27 +73,25 @@ result₁ nothing # hide ``` -We see that both models corresponding to variables `Y` and `T₁` were fitted in the process but that the model for `T₂` was not because it was not necessary to estimate this estimand. - -The `cache` contains estimates for the nuisance functions that were necessary to estimate the ATE. For instance, we can see what is the value of ``\epsilon`` corresponding to the clever covariate. +The `cache` (see below) contains estimates for the nuisance functions that were necessary to estimate the ATE. For instance, we can inspect the value of ``\epsilon`` corresponding to the clever covariate. ```@example estimation ϵ = last_fluctuation_epsilon(cache) ``` -The `result₁` structure corresponds to the estimation result and should report 3 main elements: +The `result₁` structure corresponds to the estimation result and will display the result of a T-Test including: - A point estimate. - A 95% confidence interval. - A p-value (Corresponding to the test that the estimand is different than 0).
-This is only summary statistics but since both the TMLE and OSE are asymptotically linear estimators, standard Z/T tests from [HypothesisTests.jl](https://juliastats.org/HypothesisTests.jl/stable/) can be performed. +Both the TMLE and OSE are asymptotically linear estimators, so standard Z/T tests from [HypothesisTests.jl](https://juliastats.org/HypothesisTests.jl/stable/) can be performed, and the `confint` and `pvalue` methods used. ```@example estimation -tmle_test_result₁ = OneSampleTTest(result₁) +tmle_test_result₁ = pvalue(OneSampleTTest(result₁)) ``` -We could now get an interest in the Average Treatment Effect of `T₂` that we will estimate with an `OSE`: +Let us now turn to the Average Treatment Effect of `T₂`, which we will estimate with an `OSE`: ```@example estimation Ψ₂ = ATE( @@ -109,24 +108,73 @@ nothing # hide Again, required nuisance functions are fitted and stored in the cache. -## CV-Estimation +## Specifying Models -Both TMLE and OSE can be used with sample-splitting, which, for an additional computational cost, further reduces the assumptions we need to make regarding our data generating process ([see here](https://arxiv.org/abs/2203.06469)). Note that this sample-splitting procedure should not be confused with the sample-splitting happening in Super Learning. Using both CV-TMLE and Super-Learning will result in two nested sample-splitting loops. +By default, TMLE.jl uses generalized linear models for the estimation of relevant and nuisance factors such as the outcome mean and the propensity score. However, this is not the recommended usage since the estimators' performance is closely related to how well we can estimate these factors. More sophisticated models can be provided using the `models` keyword argument of each estimator, which is essentially a `NamedTuple` mapping variables' names to their respective models.
-To leverage sample-splitting, simply specify a `resampling` strategy when building an estimator: + +Rather than specifying a model for each variable, it may be easier to override the default models using the `default_models` function: + +For example, one can override all default models with XGBoost models from `MLJXGBoostInterface`: ```@example estimation -cvtmle = TMLEE(resampling=CV()) -cvresult₁, _ = cvtmle(Ψ₁, dataset); +using MLJXGBoostInterface +xgboost_regressor = XGBoostRegressor() +xgboost_classifier = XGBoostClassifier() +models = default_models( + Q_binary=xgboost_classifier, + Q_continuous=xgboost_regressor, + G=xgboost_classifier +) +tmle_gboost = TMLEE(models=models) ``` -Similarly, one could build CV-OSE: +The advantage of using `default_models` is that it will automatically prepend each model with a [ContinuousEncoder](https://alan-turing-institute.github.io/MLJ.jl/dev/transformers/#MLJModels.ContinuousEncoder) to make sure the correct types are passed to the downstream models. + +Super Learning ([Stack](https://alan-turing-institute.github.io/MLJ.jl/dev/model_stacking/#Model-Stacking)) and variable-specific models can be defined as well. Here is a more customized version: + +```@example estimation +lr = LogisticClassifier(lambda=0.) +stack_binary = Stack( + metalearner=lr, + xgboost=xgboost_classifier, + lr=lr +) + +models = ( + T₁ = with_encoder(xgboost_classifier), # T₁ with XGBoost prepended with a Continuous Encoder + default_models( # For all other variables use the following defaults + Q_binary=stack_binary, # A Super Learner + Q_continuous=xgboost_regressor, # An XGBoost + # Unspecified G defaults to Logistic Regression + )... +) + +tmle_custom = TMLEE(models=models) +``` + +Notice that `with_encoder` is simply a shorthand to construct a pipeline with a `ContinuousEncoder` and that the resulting `models` is just a `NamedTuple`.
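Since the `models` argument is an ordinary `NamedTuple`, the merging pattern above is plain Julia. Here is a stripped-down sketch where symbols stand in for actual MLJ models (the values and keys are purely illustrative):

```julia
# Placeholder symbols stand in for MLJ models; only the NamedTuple mechanics matter here.
defaults = (Q_binary = :logistic, Q_continuous = :linear, G = :logistic)

# A variable-specific entry followed by splatted defaults
models = (T₁ = :xgboost, defaults...)

models.T₁  # variable-specific override
models.G   # inherited default
```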
+ +## CV-Estimation + +Canonical TMLE/OSE essentially use the dataset twice: once for the estimation of the nuisance functions and once for the estimation of the parameter of interest. This means that there is a risk of over-fitting and residual bias ([see here](https://arxiv.org/abs/2203.06469) for some discussion). One way to address this limitation is to use a technique called sample-splitting / cross-validation. In order to activate the sample-splitting mode, simply provide an `MLJ.ResamplingStrategy` using the `resampling` keyword argument: + +```@example estimation +TMLEE(resampling=StratifiedCV()); +``` + +or + ```julia -cvose = OSE(resampling=CV(nfolds=3)) +OSE(resampling=StratifiedCV(nfolds=3)); ``` -## Caching model fits +There are some practical considerations: + +- Choice of `resampling` Strategy: The theory behind sample-splitting requires the nuisance functions to be sufficiently well estimated on **each and every** fold. A practical aspect of it is that each fold should contain a sample representative of the dataset. In particular, when the treatment and outcome variables are categorical it is important to make sure the proportions are preserved. This is typically done using `StratifiedCV`. +- Computational Complexity: Sample-splitting results in ``K`` fits of the nuisance functions, drastically increasing computational complexity. In particular, if the nuisance functions are estimated using (P-fold) Super-Learning, this will result in two nested cross-validation loops and ``K \times P`` fits. +- Caching of Nuisance Functions: Because the `resampling` strategy typically needs to preserve the outcome and treatment proportions, very little reuse of cached models is possible (see [Caching Models](@ref)). + +## Caching Models Let's now see how the `cache` can be reused with a new estimand, say the Total Average Treatment Effect of both `T₁` and `T₂`.
diff --git a/src/counterfactual_mean_based/estimators.jl b/src/counterfactual_mean_based/estimators.jl index e06fb92e..2df3944b 100644 --- a/src/counterfactual_mean_based/estimators.jl +++ b/src/counterfactual_mean_based/estimators.jl @@ -221,7 +221,7 @@ function (tmle::TMLEE)(Ψ::StatisticalCMCompositeEstimand, dataset; cache=Dict() machine_cache=tmle.machine_cache ) # Estimation results after TMLE - IC, Ψ̂ = gradient_and_estimate(Ψ, targeted_factors_estimate, nomissing_dataset; ps_lowerbound=ps_lowerbound) + IC, Ψ̂ = gradient_and_estimate(tmle, Ψ, targeted_factors_estimate, nomissing_dataset; ps_lowerbound=ps_lowerbound) σ̂ = std(IC) n = size(IC, 1) verbosity >= 1 && @info "Done." @@ -229,6 +229,9 @@ function (tmle::TMLEE)(Ψ::StatisticalCMCompositeEstimand, dataset; cache=Dict() return TMLEstimate(Ψ, Ψ̂, σ̂, n, IC), cache end +gradient_and_estimate(::TMLEE, Ψ, factors, dataset; ps_lowerbound=1e-8) = + gradient_and_plugin_estimate(Ψ, factors, dataset; ps_lowerbound=ps_lowerbound) + ##################################################################### ### OSE ### ##################################################################### @@ -267,14 +270,14 @@ ose = OSE() OSE(;models=default_models(), resampling=nothing, ps_lowerbound=1e-8, machine_cache=false) = OSE(models, resampling, ps_lowerbound, machine_cache) -function (estimator::OSE)(Ψ::StatisticalCMCompositeEstimand, dataset; cache=Dict(), verbosity=1) +function (ose::OSE)(Ψ::StatisticalCMCompositeEstimand, dataset; cache=Dict(), verbosity=1) # Check the estimand against the dataset check_treatment_levels(Ψ, dataset) # Initial fit of the SCM's relevant factors initial_factors = get_relevant_factors(Ψ) nomissing_dataset = nomissing(dataset, variables(initial_factors)) - initial_factors_dataset = choose_initial_dataset(dataset, nomissing_dataset, estimator.resampling) - initial_factors_estimator = CMRelevantFactorsEstimator(estimator.resampling, estimator.models) + initial_factors_dataset = 
choose_initial_dataset(dataset, nomissing_dataset, ose.resampling) + initial_factors_estimator = CMRelevantFactorsEstimator(ose.resampling, ose.models) initial_factors_estimate = initial_factors_estimator( initial_factors, initial_factors_dataset; @@ -283,16 +286,21 @@ function (estimator::OSE)(Ψ::StatisticalCMCompositeEstimand, dataset; cache=Dic ) # Get propensity score truncation threshold n = nrows(nomissing_dataset) - ps_lowerbound = ps_lower_bound(n, estimator.ps_lowerbound) + ps_lowerbound = ps_lower_bound(n, ose.ps_lowerbound) # Gradient and estimate - IC, Ψ̂ = gradient_and_estimate(Ψ, initial_factors_estimate, nomissing_dataset; ps_lowerbound=ps_lowerbound) - IC_mean = mean(IC) - IC .-= IC_mean + IC, Ψ̂ = gradient_and_estimate(ose, Ψ, initial_factors_estimate, nomissing_dataset; ps_lowerbound=ps_lowerbound) σ̂ = std(IC) n = size(IC, 1) verbosity >= 1 && @info "Done." - return OSEstimate(Ψ, Ψ̂ + IC_mean, σ̂, n, IC), cache + return OSEstimate(Ψ, Ψ̂, σ̂, n, IC), cache +end + +function gradient_and_estimate(::OSE, Ψ, factors, dataset; ps_lowerbound=1e-8) + IC, Ψ̂ = gradient_and_plugin_estimate(Ψ, factors, dataset; ps_lowerbound=ps_lowerbound) + IC_mean = mean(IC) + IC .-= IC_mean + return IC, Ψ̂ + IC_mean end ##################################################################### diff --git a/src/counterfactual_mean_based/gradient.jl b/src/counterfactual_mean_based/gradient.jl index 882e325e..0cbf7424 100644 --- a/src/counterfactual_mean_based/gradient.jl +++ b/src/counterfactual_mean_based/gradient.jl @@ -24,10 +24,7 @@ function counterfactual_aggregate(Ψ::StatisticalCMCompositeEstimand, Q, dataset return ctf_agg end -compute_estimate(ctf_aggregate, ::Nothing) = mean(ctf_aggregate) - -compute_estimate(ctf_aggregate, train_validation_indices) = - mean(compute_estimate(ctf_aggregate[val_indices], nothing) for (_, val_indices) in train_validation_indices) +plugin_estimate(ctf_aggregate) = mean(ctf_aggregate) """ @@ -53,11 +50,11 @@ function 
∇YX(Ψ::StatisticalCMCompositeEstimand, Q, G, dataset; ps_lowerbound= end -function gradient_and_estimate(Ψ::StatisticalCMCompositeEstimand, factors, dataset; ps_lowerbound=1e-8) +function gradient_and_plugin_estimate(Ψ::StatisticalCMCompositeEstimand, factors, dataset; ps_lowerbound=1e-8) Q = factors.outcome_mean G = factors.propensity_score ctf_agg = counterfactual_aggregate(Ψ, Q, dataset) - Ψ̂ = compute_estimate(ctf_agg, train_validation_indices_from_factors(factors)) + Ψ̂ = plugin_estimate(ctf_agg) IC = ∇YX(Ψ, Q, G, dataset; ps_lowerbound = ps_lowerbound) .+ ∇W(ctf_agg, Ψ̂) return IC, Ψ̂ end diff --git a/src/estimates.jl b/src/estimates.jl index f49eee45..175b5706 100644 --- a/src/estimates.jl +++ b/src/estimates.jl @@ -50,14 +50,70 @@ string_repr(estimate::SampleSplitMLConditionalDistribution) = Base.typename(typeof(first(estimate.machines).model)).wrapper ) +""" +Prediction for the subset of X identified by idx and fold. +""" +function fold_prediction(estimate::SampleSplitMLConditionalDistribution, X, idx, fold) + Xval = selectrows(X, idx) + return predict(estimate.machines[fold], Xval) +end + +""" +In the case where newpreds is a UnivariateFiniteVector, we update the probability matrix. +""" +function update_preds!(probs::Matrix, newpreds::UnivariateFiniteVector, idx) + for (key, vals) in newpreds.prob_given_ref + probs[idx, key] = vals + end +end + +update_preds!(ŷ, preds, idx) = ŷ[idx] = preds + +""" +In the case where predictions are a UnivariateFiniteVector, we store a Matrix of probabilities. +""" +initialize_cv_preds(first_preds::UnivariateFiniteVector, n) = + Matrix{Float64}(undef, n, length(first_preds.prob_given_ref)) + +""" +As a default, we initialize predictions with a Vector of the type corresponding to the +predictions from the first machine. 
+""" +initialize_cv_preds(first_preds, n) = + Vector{eltype(first_preds)}(undef, n) + +""" +In the case where predictions are a UnivariateFiniteVector, we create a special +UnivariateFinite vector for downstream optimizaton. +""" +finalize_cv_preds(probs, first_preds::UnivariateFiniteVector) = UnivariateFinite(support(first_preds), probs) + +""" +As a default we simply return the vector +""" +finalize_cv_preds(ŷ, first_preds) = ŷ + +""" +Out of fold prediction, predictions for fold k are made from machines trained on fold k̄. +We distinguish the case where preidctions are a UnivariateFiniteVector that requires specific attention. +""" +function cv_predict(estimate, X) + fold_to_val_idx = [(fold, val_idx) for (fold, (_, val_idx)) in enumerate(estimate.train_validation_indices)] + first_fold, first_val_idx = first(fold_to_val_idx) + first_preds = fold_prediction(estimate, X, first_val_idx, first_fold) + + ŷ = initialize_cv_preds(first_preds, nrows(X)) + update_preds!(ŷ, first_preds, first_val_idx) + for (fold, val_idx) in fold_to_val_idx[2:end] + preds = fold_prediction(estimate, X, val_idx, fold) + update_preds!(ŷ, preds, val_idx) + end + return finalize_cv_preds(ŷ, first_preds) +end + function MLJBase.predict(estimate::SampleSplitMLConditionalDistribution, dataset) X = selectcols(dataset, estimate.estimand.parents) - ŷs = [] - for (fold, (_, validation_indices)) in enumerate(estimate.train_validation_indices) - Xval = selectrows(X, validation_indices) - push!(ŷs, predict(estimate.machines[fold], Xval)) - end - return vcat(ŷs...) 
+    return cv_predict(estimate, X)
 end ##################################################################### @@ -76,7 +132,7 @@ function likelihood(estimate::ConditionalDistributionEstimate, dataset) return pdf.(ŷ, y) end -function compute_offset(ŷ::UnivariateFiniteVector{<:Union{OrderedFactor{2}, Multiclass{2}}}) +function compute_offset(ŷ::AbstractVector{<:UnivariateFinite{<:Union{OrderedFactor{2}, Multiclass{2}}}}) μy = expected_value(ŷ) logit!(μy) return μy @@ -84,7 +140,7 @@ end compute_offset(ŷ::AbstractVector{<:Distributions.UnivariateDistribution}) = expected_value(ŷ) -compute_offset(ŷ::AbstractVector{<:Real}) = expected_value(ŷ) +compute_offset(ŷ::AbstractVector{T}) where T<:Real = expected_value(ŷ) function compute_offset(estimate::ConditionalDistributionEstimate, X) ŷ = predict(estimate, X) diff --git a/src/utils.jl b/src/utils.jl index 2a3a6e52..c23d9aa1 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -9,7 +9,18 @@ key(estimand, estimator) = (key(estimand), key(estimator)) unique_sorted_tuple(iter) = Tuple(sort(unique(Symbol(x) for x in iter))) +"""
+For "vanilla" estimators, missingness management is deferred to the nuisance function estimators.
+This is in order to maximize data usage.
+"""
 choose_initial_dataset(dataset, nomissing_dataset, resampling::Nothing) = dataset
+
+"""
+For cross-validated estimators, missing data are removed early on, based on all columns relevant to the estimand.
+This is to avoid the complications of:
+    - Equally distributing missing values across folds
+    - Tracking sample_ids
+"""
 choose_initial_dataset(dataset, nomissing_dataset, resampling) = nomissing_dataset selectcols(data, cols) = data |> TableOperations.select(cols...) 
|> Tables.columntable @@ -46,7 +57,7 @@ function indicator_values(indicators, T) return indic end -expected_value(ŷ::UnivariateFiniteVector{<:Union{OrderedFactor{2}, Multiclass{2}}}) = pdf.(ŷ, levels(first(ŷ))[2]) +expected_value(ŷ::AbstractArray{<:UnivariateFinite{<:Union{OrderedFactor{2}, Multiclass{2}}}}) = pdf.(ŷ, levels(first(ŷ))[2]) expected_value(ŷ::AbstractVector{<:Distributions.UnivariateDistribution}) = mean.(ŷ) expected_value(ŷ::AbstractVector{<:Real}) = ŷ diff --git a/test/counterfactual_mean_based/3points_interactions.jl b/test/counterfactual_mean_based/3points_interactions.jl index 436aaa6d..9f032841 100644 --- a/test/counterfactual_mean_based/3points_interactions.jl +++ b/test/counterfactual_mean_based/3points_interactions.jl @@ -37,7 +37,7 @@ end treatment_confounders = (T₁=[:W], T₂=[:W], T₃=[:W]) ) models = ( - Y = TreatmentTransformer() |> InteractionTransformer(order=3) |> LinearRegressor(), + Y = with_encoder(InteractionTransformer(order=3) |> LinearRegressor()), T₁ = LogisticClassifier(lambda=0), T₂ = LogisticClassifier(lambda=0), T₃ = LogisticClassifier(lambda=0) diff --git a/test/counterfactual_mean_based/double_robustness_ate.jl b/test/counterfactual_mean_based/double_robustness_ate.jl index f87dbc4b..acc73e4d 100644 --- a/test/counterfactual_mean_based/double_robustness_ate.jl +++ b/test/counterfactual_mean_based/double_robustness_ate.jl @@ -153,7 +153,7 @@ end Y = with_encoder(ConstantClassifier()), T = with_encoder(LogisticClassifier(lambda=0)) ) - dr_estimators = double_robust_estimators(models) + dr_estimators = double_robust_estimators(models, resampling=StratifiedCV()) results, cache = test_coverage_and_get_results(dr_estimators, Ψ, Ψ₀, dataset; verbosity=0) test_mean_inf_curve_almost_zero(results.tmle; atol=1e-6) test_mean_inf_curve_almost_zero(results.ose; atol=1e-6) @@ -166,7 +166,7 @@ end Y = with_encoder(LogisticClassifier(lambda=0)), T = with_encoder(ConstantClassifier()) ) - dr_estimators = double_robust_estimators(models) + 
dr_estimators = double_robust_estimators(models, resampling=StratifiedCV()) results, cache = test_coverage_and_get_results(dr_estimators, Ψ, Ψ₀, dataset; verbosity=0) test_mean_inf_curve_almost_zero(results.tmle; atol=1e-6) end diff --git a/test/counterfactual_mean_based/double_robustness_iate.jl b/test/counterfactual_mean_based/double_robustness_iate.jl index 349471e5..9afabba5 100644 --- a/test/counterfactual_mean_based/double_robustness_iate.jl +++ b/test/counterfactual_mean_based/double_robustness_iate.jl @@ -182,7 +182,7 @@ end T₁ = with_encoder(LogisticClassifier(lambda=0)), T₂ = with_encoder(LogisticClassifier(lambda=0)), ) - dr_estimators = double_robust_estimators(models) + dr_estimators = double_robust_estimators(models, resampling=StratifiedCV()) results, cache = test_coverage_and_get_results(dr_estimators, Ψ, Ψ₀, dataset; verbosity=0) test_mean_inf_curve_almost_zero(results.tmle; atol=1e-9) test_mean_inf_curve_almost_zero(results.ose; atol=1e-9) @@ -197,7 +197,7 @@ end T₁ = with_encoder(ConstantClassifier()), T₂ = with_encoder(ConstantClassifier()), ) - dr_estimators = double_robust_estimators(models) + dr_estimators = double_robust_estimators(models, resampling=StratifiedCV()) results, cache = test_coverage_and_get_results(dr_estimators, Ψ, Ψ₀, dataset; verbosity=0) test_mean_inf_curve_almost_zero(results.tmle; atol=1e-9) test_mean_inf_curve_almost_zero(results.ose; atol=1e-9) @@ -267,7 +267,7 @@ end T₁ = with_encoder(LogisticClassifier(lambda=0)), T₂ = with_encoder(LogisticClassifier(lambda=0)) ) - dr_estimators = double_robust_estimators(models) + dr_estimators = double_robust_estimators(models, resampling=StratifiedCV()) results, cache = test_coverage_and_get_results(dr_estimators, Ψ, Ψ₀, dataset; verbosity=0) test_mean_inf_curve_almost_zero(results.tmle; atol=1e-5) test_mean_inf_curve_almost_zero(results.ose; atol=1e-10) @@ -283,7 +283,7 @@ end T₁ = with_encoder(ConstantClassifier()), T₂ = with_encoder(ConstantClassifier()), ) - dr_estimators = 
double_robust_estimators(models) + dr_estimators = double_robust_estimators(models, resampling=StratifiedCV()) results, cache = test_coverage_and_get_results(dr_estimators, Ψ, Ψ₀, dataset; verbosity=0) test_mean_inf_curve_almost_zero(results.tmle; atol=1e-5) test_mean_inf_curve_almost_zero(results.ose; atol=1e-10) diff --git a/test/counterfactual_mean_based/fluctuation.jl b/test/counterfactual_mean_based/fluctuation.jl index 4ea5461b..6f263c33 100644 --- a/test/counterfactual_mean_based/fluctuation.jl +++ b/test/counterfactual_mean_based/fluctuation.jl @@ -30,6 +30,7 @@ using DataFrames η̂ₙ = η̂(η, dataset, verbosity = 0) X = dataset[!, collect(η̂ₙ.outcome_mean.estimand.parents)] y = dataset[!, η̂ₙ.outcome_mean.estimand.outcome] + mse_initial = sum((TMLE.expected_value(η̂ₙ.outcome_mean, X) .- y).^2) ps_lowerbound = 1e-8 # Weighted fluctuation @@ -42,6 +43,9 @@ using DataFrames cache=true ) fitresult, cache, report = MLJBase.fit(fluctuation, 0, X, y) + fluctuation_mean = TMLE.expected_value(MLJBase.predict(fluctuation, fitresult, X)) + mse_fluct = sum((fluctuation_mean .- y).^2) + @test mse_fluct < mse_initial @test fitted_params(fitresult.one_dimensional_path).features == [:covariate] @test cache.weighted_covariate == expected_weights .* expected_covariate @test cache.training_expected_value isa AbstractVector diff --git a/test/counterfactual_mean_based/gradient.jl b/test/counterfactual_mean_based/gradient.jl index f610ad7a..8a2e7f7a 100644 --- a/test/counterfactual_mean_based/gradient.jl +++ b/test/counterfactual_mean_based/gradient.jl @@ -24,7 +24,7 @@ function one_treatment_dataset(;n=100) ) end -@testset "Test gradient_and_estimate" begin +@testset "Test gradient_and_plugin_estimate" begin ps_lowerbound = 1e-8 Ψ = ATE( outcome = :Y, @@ -60,8 +60,8 @@ end expectedΨ̂ = mean(ctf_agg) ∇W = TMLE.∇W(ctf_agg, expectedΨ̂) @test ∇W == ctf_agg .- expectedΨ̂ - # gradient_and_estimate - IC, Ψ̂ = TMLE.gradient_and_estimate(Ψ, η̂ₙ, dataset; ps_lowerbound=ps_lowerbound) + # 
gradient_and_plugin_estimate + IC, Ψ̂ = TMLE.gradient_and_plugin_estimate(Ψ, η̂ₙ, dataset; ps_lowerbound=ps_lowerbound) @test expectedΨ̂ == Ψ̂ @test IC == ∇YX .+ ∇W end diff --git a/test/estimators_and_estimates.jl b/test/estimators_and_estimates.jl index f7fb5c23..f0c146f7 100644 --- a/test/estimators_and_estimates.jl +++ b/test/estimators_and_estimates.jl @@ -7,127 +7,159 @@ using DataFrames using MLJGLMInterface using MLJModels using LogExpFunctions +using Distributions +using MLJLinearModels verbosity = 1 n = 100 X, y = make_moons(n) -dataset = DataFrame(Y=y, X₁=X.x1, X₂=X.x2) +binary_dataset = DataFrame(Y=y, X₁=X.x1, X₂=X.x2) + +X, y = make_regression(n) +continuous_dataset = DataFrame(Y=y, X₁=X.x1, X₂=X.x2) + estimand = TMLE.ConditionalDistribution(:Y, [:X₁, :X₂]) fit_log = string("Estimating: ", TMLE.string_repr(estimand)) reuse_log = string("Reusing estimate for: ", TMLE.string_repr(estimand)) -@testset "Test MLConditionalDistributionEstimator" begin +@testset "Test MLConditionalDistributionEstimator: binary outcome" begin + # Check predict / expected_value / compute_offset estimator = TMLE.MLConditionalDistributionEstimator(LinearBinaryClassifier()) # Fitting with no cache cache = Dict() - conditional_density_estimate = @test_logs (:info, fit_log) estimator(estimand, dataset; cache=cache, verbosity=verbosity) + conditional_density_estimate = @test_logs (:info, fit_log) estimator(estimand, binary_dataset; cache=cache, verbosity=verbosity) expected_features = collect(estimand.parents) @test conditional_density_estimate isa TMLE.MLConditionalDistribution @test fitted_params(conditional_density_estimate.machine).features == expected_features - ŷ = predict(conditional_density_estimate, dataset) - mach_ŷ = predict(conditional_density_estimate.machine, dataset[!, expected_features]) + ŷ = predict(conditional_density_estimate, binary_dataset) + mach_ŷ = predict(conditional_density_estimate.machine, binary_dataset[!, expected_features]) @test 
all(ŷ[i].prob_given_ref == mach_ŷ[i].prob_given_ref for i in eachindex(ŷ)) - μ̂ = TMLE.expected_value(conditional_density_estimate, dataset) + μ̂ = TMLE.expected_value(conditional_density_estimate, binary_dataset) @test μ̂ == [ŷ[i].prob_given_ref[2] for i in eachindex(ŷ)] - @test all(0. <= x <= 1. for x in TMLE.likelihood(conditional_density_estimate, dataset)) # The pdf is not necessarily between 0 and 1 - # Uses the cache instead of fitting + offset = TMLE.compute_offset(conditional_density_estimate, binary_dataset) + @test offset == logit.(μ̂) + @test all(0. <= x <= 1. for x in TMLE.likelihood(conditional_density_estimate, binary_dataset)) # The pdf is not necessarily between 0 and 1 + # Check cache management + ## Uses the cache instead of fitting new_estimator = TMLE.MLConditionalDistributionEstimator(LinearBinaryClassifier()) @test TMLE.key(new_estimator) == TMLE.key(estimator) - @test_logs (:info, reuse_log) estimator(estimand, dataset; cache=cache, verbosity=verbosity) - # Changing the model leads to refit + @test_logs (:info, reuse_log) estimator(estimand, binary_dataset; cache=cache, verbosity=verbosity) + ## Changing the model leads to refit new_estimator = TMLE.MLConditionalDistributionEstimator(LinearBinaryClassifier(fit_intercept=false)) @test TMLE.key(new_estimator) != TMLE.key(estimator) - @test_logs (:info, fit_log) new_estimator(estimand, dataset; cache=cache, verbosity=verbosity) + @test_logs (:info, fit_log) new_estimator(estimand, binary_dataset; cache=cache, verbosity=verbosity) +end + +@testset "Test MLConditionalDistributionEstimator: continuous outcome" begin + # Check predict / expected_value / compute_offset + ## Probabilistic Model + model = MLJGLMInterface.LinearRegressor() + estimator = TMLE.MLConditionalDistributionEstimator(model) + conditional_density_estimate = @test_logs (:info, fit_log) estimator(estimand, continuous_dataset; cache=Dict(), verbosity=verbosity) + ŷ = predict(conditional_density_estimate, continuous_dataset) + 
@test ŷ isa Vector{Normal{Float64}} + μ̂ = TMLE.expected_value(conditional_density_estimate, continuous_dataset) + @test [ŷᵢ.μ for ŷᵢ in ŷ] == μ̂ + offset = TMLE.compute_offset(conditional_density_estimate, continuous_dataset) + @test offset == μ̂ + + ## Deterministic Model + model = MLJLinearModels.LinearRegressor() + estimator = TMLE.MLConditionalDistributionEstimator(model) + conditional_density_estimate = estimator(estimand, continuous_dataset; cache=Dict(), verbosity=verbosity) + ŷ = predict(conditional_density_estimate, continuous_dataset) + @test ŷ isa Vector{Float64} + μ̂ = TMLE.expected_value(conditional_density_estimate, continuous_dataset) + @test ŷ == μ̂ + offset = TMLE.compute_offset(conditional_density_estimate, continuous_dataset) + @test offset == μ̂ end -@testset "Test SampleSplitMLConditionalDistributionEstimator" begin +@testset "Test SampleSplitMLConditionalDistributionEstimator: Binary outcome" begin + # Check predict / expected_value / compute_offset nfolds = 3 - train_validation_indices = Tuple(MLJBase.train_test_pairs(CV(nfolds=nfolds), 1:n, dataset)) + train_validation_indices = Tuple(MLJBase.train_test_pairs(StratifiedCV(nfolds=nfolds), 1:n, binary_dataset, binary_dataset.Y)) model = LinearBinaryClassifier() estimator = TMLE.SampleSplitMLConditionalDistributionEstimator( model, train_validation_indices ) cache = Dict() - conditional_density_estimate = @test_logs (:info, fit_log) estimator(estimand, dataset;cache=cache, verbosity=verbosity) + conditional_density_estimate = @test_logs (:info, fit_log) estimator(estimand, binary_dataset;cache=cache, verbosity=verbosity) @test conditional_density_estimate isa TMLE.SampleSplitMLConditionalDistribution expected_features = collect(estimand.parents) @test all(fitted_params(mach).features == expected_features for mach in conditional_density_estimate.machines) - ŷ = predict(conditional_density_estimate, dataset) - μ̂ = TMLE.expected_value(conditional_density_estimate, dataset) + ŷ = 
predict(conditional_density_estimate, binary_dataset) + @test ŷ isa UnivariateFiniteVector for foldid in 1:nfolds train, val = train_validation_indices[foldid] # The predictions on validation samples are made from # the machine trained on the train sample - ŷfold = predict(conditional_density_estimate.machines[foldid], dataset[val, expected_features]) + ŷfold = predict(conditional_density_estimate.machines[foldid], binary_dataset[val, expected_features]) @test [ŷᵢ.prob_given_ref for ŷᵢ ∈ ŷ[val]] == [ŷᵢ.prob_given_ref for ŷᵢ ∈ ŷfold] - @test μ̂[val] == [ŷᵢ.prob_given_ref[2] for ŷᵢ ∈ ŷfold] end - @test all(0. <= x <= 1. for x in TMLE.likelihood(conditional_density_estimate, dataset)) - # Uses the cache instead of fitting + μ̂ = TMLE.expected_value(conditional_density_estimate, binary_dataset) + @test [ŷᵢ.prob_given_ref[2] for ŷᵢ ∈ ŷ] == μ̂ + offset = TMLE.compute_offset(conditional_density_estimate, binary_dataset) + @test offset == logit.(μ̂) + @test all(0. <= x <= 1. for x in TMLE.likelihood(conditional_density_estimate, binary_dataset)) + # Check cache management + ## Uses the cache instead of fitting new_estimator = TMLE.SampleSplitMLConditionalDistributionEstimator( LinearBinaryClassifier(), train_validation_indices ) @test TMLE.key(new_estimator) == TMLE.key(estimator) - @test_logs (:info, reuse_log) estimator(estimand, dataset;cache=cache, verbosity=verbosity) - # Changing the model leads to refit + @test_logs (:info, reuse_log) estimator(estimand, binary_dataset;cache=cache, verbosity=verbosity) + ## Changing the model leads to refit new_model = LinearBinaryClassifier(fit_intercept=false) new_estimator = TMLE.SampleSplitMLConditionalDistributionEstimator( new_model, train_validation_indices ) - @test_logs (:info, fit_log) new_estimator(estimand, dataset; cache=cache, verbosity=verbosity) - # Changing the train/validation splits leads to refit - train_validation_indices = Tuple(MLJBase.train_test_pairs(CV(nfolds=4), 1:n, dataset)) + @test_logs (:info, fit_log) 
new_estimator(estimand, binary_dataset; cache=cache, verbosity=verbosity) + ## Changing the train/validation splits leads to refit + train_validation_indices = Tuple(MLJBase.train_test_pairs(CV(nfolds=4), 1:n, binary_dataset)) new_estimator = TMLE.SampleSplitMLConditionalDistributionEstimator( new_model, train_validation_indices ) - @test_logs (:info, fit_log) new_estimator(estimand, dataset; cache=cache, verbosity=verbosity) + @test_logs (:info, fit_log) new_estimator(estimand, binary_dataset; cache=cache, verbosity=verbosity) end -@testset "Test compute_offset MLConditionalDistributionEstimator" begin - dataset = ( - T = categorical(["a", "b", "c", "a", "a", "b", "a"]), - Ycont = [1., 2., 3, 4, 5, 6, 7], - Ycat = categorical([1, 0, 0, 1, 1, 1, 0]), - W = rand(7), - ) - μYcont = mean(dataset.Ycont) - μYcat = mean(float(dataset.Ycat)) - # The model is probabilistic continuous, the offset is the mean of - # the conditional distribution - conditional_density_estimate = TMLE.MLConditionalDistributionEstimator(ConstantRegressor())( - TMLE.ConditionalDistribution(:Ycont, [:W, :T]), - dataset, - verbosity=0 - ) - offset = TMLE.compute_offset(conditional_density_estimate, dataset) - @test offset == mean.(predict(conditional_density_estimate, dataset)) - @test offset == repeat([μYcont], 7) - # The model is deterministic, the offset is simply the output - # of the predict function which is assumed to correspond to the mean - # if the squared loss was optimized for by the underlying model - conditional_density_estimate = TMLE.MLConditionalDistributionEstimator(DeterministicConstantRegressor())( - TMLE.ConditionalDistribution(:Ycont, [:W, :T]), - dataset, - verbosity=0 +@testset "Test SampleSplitMLConditionalDistributionEstimator: Continuous outcome" begin + # Check predict / expected_value / compute_offset + nfolds = 3 + train_validation_indices = Tuple(MLJBase.train_test_pairs(CV(nfolds=nfolds), 1:n, continuous_dataset)) + ## Probabilistic Model + model = 
MLJGLMInterface.LinearRegressor() + estimator = TMLE.SampleSplitMLConditionalDistributionEstimator( + model, + train_validation_indices ) - offset = TMLE.compute_offset(conditional_density_estimate, dataset) - @test offset == predict(conditional_density_estimate, dataset) - @test offset == repeat([μYcont], 7) - # The model is probabilistic binary, the offset is the logit - # of the mean of the conditional distribution - conditional_density_estimate = TMLE.MLConditionalDistributionEstimator(ConstantClassifier())( - TMLE.ConditionalDistribution(:Ycat, [:W, :T]), - dataset, - verbosity=0 + conditional_density_estimate = estimator(estimand, continuous_dataset; verbosity=verbosity) + ŷ = predict(conditional_density_estimate, continuous_dataset) + @test ŷ isa Vector{Distributions.Normal{Float64}} + μ̂ = TMLE.expected_value(conditional_density_estimate, continuous_dataset) + @test μ̂ isa Vector{Float64} + @test μ̂ == [ŷᵢ.μ for ŷᵢ in ŷ] + offset = TMLE.compute_offset(conditional_density_estimate, continuous_dataset) + @test offset == μ̂ + + ## Deterministic Model + model = MLJLinearModels.LinearRegressor() + estimator = TMLE.SampleSplitMLConditionalDistributionEstimator( + model, + train_validation_indices ) - offset = TMLE.compute_offset(conditional_density_estimate, dataset) - @test offset == repeat([logit(μYcat)], 7) + conditional_density_estimate = estimator(estimand, continuous_dataset; verbosity=verbosity) + ŷ = predict(conditional_density_estimate, continuous_dataset) + @test ŷ isa Vector{Float64} + μ̂ = TMLE.expected_value(conditional_density_estimate, continuous_dataset) + @test μ̂ == ŷ + offset = TMLE.compute_offset(conditional_density_estimate, continuous_dataset) + @test offset == μ̂ end - end true \ No newline at end of file
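Closing reviewer note: the refactor splits `gradient_and_estimate` by estimator type, and the OSE branch now applies the classic one-step correction: the plug-in estimate is shifted by the mean of the influence curve, and the reported influence curve is recentred to mean zero. A minimal, self-contained sketch of just that arithmetic (toy numbers, no TMLE.jl dependencies):

```julia
# One-step correction, mirroring the new `gradient_and_estimate(::OSE, ...)`:
# shift the plug-in estimate by mean(IC) and recentre IC to mean zero.
function one_step_correction(Ψ̂_plugin, IC)
    IC_mean = sum(IC) / length(IC)
    return IC .- IC_mean, Ψ̂_plugin + IC_mean
end

IC, Ψ̂ = one_step_correction(1.0, [0.2, -0.1, 0.5])
# Ψ̂ ≈ 1.2 and IC is now mean-zero
```

Recentring changes nothing for the TMLE branch (its IC is already approximately mean-zero after targeting), which is why the plug-in path could be factored out as `gradient_and_plugin_estimate`.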