From 4a74b4011cd2b5aea9869888f6e480016f0e54ab Mon Sep 17 00:00:00 2001 From: TensorFlow Lattice Authors Date: Fri, 22 Mar 2024 15:28:13 -0700 Subject: [PATCH] Internal change PiperOrigin-RevId: 618303175 Change-Id: I7d5db653bc2d804502cae70ea4d06b58c536e705 --- docs/_book.yaml | 4 - docs/_index.yaml | 3 +- docs/install.md | 4 +- docs/overview.md | 7 +- .../tutorials/aggregate_function_models.ipynb | 109 +- docs/tutorials/canned_estimators.ipynb | 724 ------ docs/tutorials/custom_estimators.ipynb | 409 ---- docs/tutorials/keras_layers.ipynb | 150 +- docs/tutorials/premade_models.ipynb | 77 +- docs/tutorials/shape_constraints.ipynb | 916 ++++---- .../shape_constraints_for_ethics.ipynb | 726 ++++--- examples/BUILD | 24 - examples/canned_estimators_uci_heart.py | 327 --- examples/custom_estimators_uci_heart.py | 179 -- examples/keras_functional_uci_heart.py | 13 +- examples/keras_sequential_uci_heart.py | 13 +- setup.py | 10 +- tensorflow_lattice/BUILD | 2 - tensorflow_lattice/__init__.py | 2 - tensorflow_lattice/python/BUILD | 56 - .../python/aggregation_layer.py | 24 +- tensorflow_lattice/python/aggregation_test.py | 18 +- .../python/categorical_calibration_layer.py | 13 +- .../python/categorical_calibration_test.py | 29 +- tensorflow_lattice/python/cdf_layer.py | 27 +- tensorflow_lattice/python/cdf_test.py | 53 +- tensorflow_lattice/python/configs.py | 23 +- tensorflow_lattice/python/estimators.py | 1934 ----------------- tensorflow_lattice/python/estimators_test.py | 810 ------- .../kronecker_factored_lattice_layer.py | 24 +- .../python/kronecker_factored_lattice_test.py | 137 +- tensorflow_lattice/python/lattice_layer.py | 14 +- tensorflow_lattice/python/lattice_test.py | 231 +- tensorflow_lattice/python/linear_layer.py | 26 +- tensorflow_lattice/python/linear_test.py | 58 +- .../python/parallel_combination_layer.py | 10 +- .../python/parallel_combination_test.py | 17 +- tensorflow_lattice/python/premade.py | 54 +- tensorflow_lattice/python/premade_lib.py | 69 +- tensorflow_lattice/python/premade_test.py | 102 +- .../python/pwl_calibration_layer.py | 16 +- .../python/pwl_calibration_test.py | 127 +- tensorflow_lattice/python/rtl_layer.py | 28 +- tensorflow_lattice/python/rtl_test.py | 55 +- tensorflow_lattice/python/test_utils.py | 23 +- tensorflow_lattice/python/visualization.py | 609 ------ 46 files changed, 1664 insertions(+), 6622 deletions(-) delete mode 100644 docs/tutorials/canned_estimators.ipynb delete mode 100644 docs/tutorials/custom_estimators.ipynb delete mode 100644 examples/canned_estimators_uci_heart.py delete mode 100644 examples/custom_estimators_uci_heart.py delete mode 100644 tensorflow_lattice/python/estimators.py delete mode 100644 tensorflow_lattice/python/estimators_test.py delete mode 100644 tensorflow_lattice/python/visualization.py diff --git a/docs/_book.yaml b/docs/_book.yaml index c70873c..19f9a8c 100644 --- a/docs/_book.yaml +++ b/docs/_book.yaml @@ -26,10 +26,6 @@ upper_tabs: path: /lattice/tutorials/keras_layers - title: Keras Premade Models path: /lattice/tutorials/premade_models - - title: Canned Estimators - path: /lattice/tutorials/canned_estimators - - title: Custom Estimators - path: /lattice/tutorials/custom_estimators - title: Aggregate Function Models path: /lattice/tutorials/aggregate_function_models diff --git a/docs/_index.yaml b/docs/_index.yaml index ee8c54e..8b5692c 100644 --- a/docs/_index.yaml +++ b/docs/_index.yaml @@ -15,8 +15,7 @@ landing_page: shape constraints. 
This is done using a collection of Keras layers that can satisfy constraints such as monotonicity, convexity and how features interact. The library also
-provides easy to setup premade models and
-canned estimators.
+provides easy to setup premade models.

With TF Lattice you can use domain knowledge to better extrapolate to the parts of the input space not covered by the training dataset. This helps avoid unexpected model behaviour when the serving distribution is different from the training distribution.
diff --git a/docs/install.md b/docs/install.md index 9937349..27c3e19 100644 --- a/docs/install.md +++ b/docs/install.md @@ -5,7 +5,7 @@ There are several ways to set up your environment to use TensorFlow Lattice * The easiest way to learn and use TFL requires no installation: run the any of the tutorials (e.g. - [canned estimators tutorial](tutorials/canned_estimators.ipynb)). + [premade models](tutorials/premade_models.ipynb)). * To use TFL on a local machine, install the `tensorflow-lattice` pip package. * If you have a unique machine configuration, you can build the package from source. @@ -18,6 +18,8 @@ Install using pip. pip install --upgrade tensorflow-lattice ``` +Note that you will need to have `tf_keras` package installed as well. + ## Build from source Clone the github repo: diff --git a/docs/overview.md b/docs/overview.md index 0b36838..cb6c20a 100644 --- a/docs/overview.md +++ b/docs/overview.md @@ -6,7 +6,7 @@ knowledge into the learning process through common-sense or policy-driven [shape constraints](tutorials/shape_constraints.ipynb). This is done using a collection of [Keras layers](tutorials/keras_layers.ipynb) that can satisfy constraints such as monotonicity, convexity and pairwise trust. The library also -provides easy to setup [canned estimators](tutorials/canned_estimators.ipynb). +provides easy to setup [premade models](tutorials/premade_models.ipynb). ## Concepts @@ -59,7 +59,7 @@ following show examples such calibrations functions with 10 keypoints:
It is often a good idea to use the quantiles of the features as input keypoints. -TensorFlow Lattice [canned estimators](tutorials/canned_estimators.ipynb) can +TensorFlow Lattice [premade models](tutorials/premade_models.ipynb) can automatically set the input keypoints to the feature quantiles. For categorical features, TensorFlow Lattice provides categorical calibration @@ -209,8 +209,7 @@ include embeddings or other Keras layers. ## Tutorials and API docs For common model architectures, you can use -[Keras premade models](tutorials/premade_models.ipynb) or -[canned Estimators](tutorials/canned_estimators.ipynb). You can also create +[Keras premade models](tutorials/premade_models.ipynb). You can also create custom models using [TF Lattice Keras layers](tutorials/keras_layers.ipynb) or mix and match with other Keras layers. Check out the [full API docs](https://www.tensorflow.org/lattice/api_docs/python/tfl) for diff --git a/docs/tutorials/aggregate_function_models.ipynb b/docs/tutorials/aggregate_function_models.ipynb index e42dcb9..de2224e 100644 --- a/docs/tutorials/aggregate_function_models.ipynb +++ b/docs/tutorials/aggregate_function_models.ipynb @@ -3,7 +3,6 @@ { "cell_type": "markdown", "metadata": { - "colab_type": "text", "id": "RYmPh1qB_KO2" }, "source": [ @@ -12,10 +11,8 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { - "colab": {}, - "colab_type": "code", "id": "oMRm3czy9tLh" }, "outputs": [], @@ -36,7 +33,6 @@ { "cell_type": "markdown", "metadata": { - "colab_type": "text", "id": "ooXoR4kx_YL9" }, "source": [ @@ -46,7 +42,6 @@ { "cell_type": "markdown", "metadata": { - "colab_type": "text", "id": "BR6XNYEXEgSU" }, "source": [ @@ -69,19 +64,17 @@ { "cell_type": "markdown", "metadata": { - "colab_type": "text", "id": "-ZfQWUmfEsyZ" }, "source": [ "## Overview\n", "\n", - "TFL Premade Aggregate Function Models are quick and easy ways to build TFL `tf.keras.model` instances for learning complex aggregation functions. This guide outlines the steps needed to construct a TFL Premade Aggregate Function Model and train/test it. " + "TFL Premade Aggregate Function Models are quick and easy ways to build TFL `keras.Model` instances for learning complex aggregation functions. This guide outlines the steps needed to construct a TFL Premade Aggregate Function Model and train/test it." 
] }, { "cell_type": "markdown", "metadata": { - "colab_type": "text", "id": "L0lgWoB6Gmk1" }, "source": [ @@ -92,22 +85,19 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { - "colab": {}, - "colab_type": "code", "id": "ivwKrEdLGphZ" }, "outputs": [], "source": [ "#@test {\"skip\": true}\n", - "!pip install tensorflow-lattice pydot" + "!pip install --pre -U tensorflow tf-keras tensorflow-lattice pydot graphviz" ] }, { "cell_type": "markdown", "metadata": { - "colab_type": "text", "id": "VQsRKS4wGrMu" }, "source": [ @@ -116,10 +106,8 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { - "colab": {}, - "colab_type": "code", "id": "j41-kd4MGtDS" }, "outputs": [], @@ -135,10 +123,25 @@ "logging.disable(sys.maxsize)" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "HlJH1SMx3Vul" + }, + "outputs": [], + "source": [ + "# Use Keras 2.\n", + "version_fn = getattr(tf.keras, \"version\", None)\n", + "if version_fn and version_fn().startswith(\"3.\"):\n", + " import tf_keras as keras\n", + "else:\n", + " keras = tf.keras" + ] + }, { "cell_type": "markdown", "metadata": { - "colab_type": "text", "id": "ZHPohKjBIFG5" }, "source": [ @@ -147,10 +150,8 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { - "colab": {}, - "colab_type": "code", "id": "VjYHpw2dSfHH" }, "outputs": [], @@ -162,10 +163,8 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { - "colab": {}, - "colab_type": "code", "id": "UOsgu3eIEur6" }, "outputs": [], @@ -178,7 +177,6 @@ { "cell_type": "markdown", "metadata": { - "colab_type": "text", "id": "XG7MPCyzVr22" }, "source": [ @@ -187,10 +185,8 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { - "colab": {}, - "colab_type": "code", "id": "bYdJicq5bBuz" }, "outputs": [], @@ -213,10 +209,8 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { - "colab": {}, - "colab_type": "code", "id": "kx0ZX2HR-4qb" }, "outputs": [], @@ -276,10 +270,8 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { - "colab": {}, - "colab_type": "code", "id": "Nd6j_J5CbNiz" }, "outputs": [], @@ -290,10 +282,8 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { - "colab": {}, - "colab_type": "code", "id": "KfHHhCRsHejl" }, "outputs": [], @@ -306,7 +296,6 @@ { "cell_type": "markdown", "metadata": { - "colab_type": "text", "id": "9TwqlRirIhAq" }, "source": [ @@ -315,10 +304,8 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { - "colab": {}, - "colab_type": "code", "id": "GckmXFzRIhdD" }, "outputs": [], @@ -335,7 +322,6 @@ { "cell_type": "markdown", "metadata": { - "colab_type": "text", "id": "TpDKon4oIh2W" }, "source": [ @@ -349,7 +335,6 @@ { "cell_type": "markdown", "metadata": { - "colab_type": "text", "id": "_IMwcDh7Xs5n" }, "source": [ @@ -360,10 +345,8 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { - "colab": {}, - "colab_type": "code", "id": "l0uYl9ZpXtW1" }, "outputs": [], @@ -396,7 +379,6 @@ { "cell_type": "markdown", "metadata": { - "colab_type": "text", "id": "9oYZdVeWEhf2" }, "source": [ @@ -407,10 +389,8 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { - "colab": {}, - "colab_type": "code", "id": "rEYlSXhTEmoh" 
}, "outputs": [], @@ -470,7 +450,6 @@ { "cell_type": "markdown", "metadata": { - "colab_type": "text", "id": "9zoPJRBvPdcH" }, "source": [ @@ -481,10 +460,8 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { - "colab": {}, - "colab_type": "code", "id": "l_4J7EjSPiP3" }, "outputs": [], @@ -507,14 +484,13 @@ "aggregate_function_model = tfl.premade.AggregateFunction(\n", " aggregate_function_model_config)\n", "# Let's plot our model.\n", - "tf.keras.utils.plot_model(\n", + "keras.utils.plot_model(\n", " aggregate_function_model, show_layer_names=False, rankdir='LR')" ] }, { "cell_type": "markdown", "metadata": { - "colab_type": "text", "id": "4F7AwiXgWhe2" }, "source": [ @@ -523,10 +499,8 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { - "colab": {}, - "colab_type": "code", "id": "UM7XF6UIWo4T" }, "outputs": [], @@ -535,33 +509,30 @@ " layer for layer in aggregate_function_model.layers\n", " if isinstance(layer, tfl.layers.Aggregation)\n", "]\n", - "tf.keras.utils.plot_model(\n", + "keras.utils.plot_model(\n", " aggregation_layers[0].model, show_layer_names=False, rankdir='LR')" ] }, { "cell_type": "markdown", "metadata": { - "colab_type": "text", "id": "0ohYOftgTZhq" }, "source": [ - "Now, as with any other [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model), we compile and fit the model to our data." + "Now, as with any other [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model), we compile and fit the model to our data." ] }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { - "colab": {}, - "colab_type": "code", "id": "uB9di3-lTfMy" }, "outputs": [], "source": [ "aggregate_function_model.compile(\n", " loss='mae',\n", - " optimizer=tf.keras.optimizers.Adam(LEARNING_RATE))\n", + " optimizer=keras.optimizers.Adam(LEARNING_RATE))\n", "aggregate_function_model.fit(\n", " train_xs, train_ys, epochs=NUM_EPOCHS, batch_size=BATCH_SIZE, verbose=False)" ] @@ -569,7 +540,6 @@ { "cell_type": "markdown", "metadata": { - "colab_type": "text", "id": "pwZtGDR-Tzur" }, "source": [ @@ -578,10 +548,8 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { - "colab": {}, - "colab_type": "code", "id": "RWj1YfubT0NE" }, "outputs": [], @@ -593,7 +561,6 @@ ], "metadata": { "colab": { - "collapsed_sections": [], "name": "aggregate_function_models.ipynb", "private_outputs": true, "provenance": [ diff --git a/docs/tutorials/canned_estimators.ipynb b/docs/tutorials/canned_estimators.ipynb deleted file mode 100644 index b4ad6a2..0000000 --- a/docs/tutorials/canned_estimators.ipynb +++ /dev/null @@ -1,724 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "7765UFHoyGx6" - }, - "source": [ - "##### Copyright 2020 The TensorFlow Authors." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "cellView": "form", - "id": "KsOkK8O69PyT" - }, - "outputs": [], - "source": [ - "#@title Licensed under the Apache License, Version 2.0 (the \"License\");\n", - "# you may not use this file except in compliance with the License.\n", - "# You may obtain a copy of the License at\n", - "#\n", - "# https://www.apache.org/licenses/LICENSE-2.0\n", - "#\n", - "# Unless required by applicable law or agreed to in writing, software\n", - "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", - "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", - "# See the License for the specific language governing permissions and\n", - "# limitations under the License." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ZS8z-_KeywY9" - }, - "source": [ - "# TF Lattice Canned Estimators" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "r61fkA2i9Y3_" - }, - "source": [ - "\u003ctable class=\"tfo-notebook-buttons\" align=\"left\"\u003e\n", - " \u003ctd\u003e\n", - " \u003ca target=\"_blank\" href=\"https://www.tensorflow.org/lattice/tutorials/canned_estimators\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/tf_logo_32px.png\" /\u003eView on TensorFlow.org\u003c/a\u003e\n", - " \u003c/td\u003e\n", - " \u003ctd\u003e\n", - " \u003ca target=\"_blank\" href=\"https://colab.research.google.com/github/tensorflow/lattice/blob/master/docs/tutorials/canned_estimators.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" /\u003eRun in Google Colab\u003c/a\u003e\n", - " \u003c/td\u003e\n", - " \u003ctd\u003e\n", - " \u003ca target=\"_blank\" href=\"https://github.com/tensorflow/lattice/blob/master/docs/tutorials/canned_estimators.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" /\u003eView source on GitHub\u003c/a\u003e\n", - " \u003c/td\u003e\n", - " \u003ctd\u003e\n", - " \u003ca href=\"https://storage.googleapis.com/tensorflow_docs/lattice/docs/tutorials/canned_estimators.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/download_logo_32px.png\" /\u003eDownload notebook\u003c/a\u003e\n", - " \u003c/td\u003e\n", - "\u003c/table\u003e" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "kQRNKVzRR9pa" - }, - "source": [ - "\u003e Warning: Estimators are not recommended for new code. Estimators run `v1. Session`-style code which is more difficult to write correctly, and can behave unexpectedly, especially when combined with TF 2 code. Estimators do fall under our [compatibility guarantees](https://tensorflow.org/guide/versions), but will receive no fixes other than security vulnerabilities. See the [migration guide](https://tensorflow.org/guide/migrate) for details." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "WCpl-9WDVq9d" - }, - "source": [ - "## Overview\n", - "\n", - "Canned estimators are quick and easy ways to train TFL models for typical use cases. This guide outlines the steps needed to create a TFL canned estimator." 
- ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "x769lI12IZXB" - }, - "source": [ - "## Setup" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "fbBVAR6UeRN5" - }, - "source": [ - "Installing TF Lattice package:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "bpXjJKpSd3j4" - }, - "outputs": [], - "source": [ - "#@test {\"skip\": true}\n", - "!pip install tensorflow-lattice" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "jSVl9SHTeSGX" - }, - "source": [ - "Importing required packages:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "cellView": "both", - "id": "FbZDk8bIx8ig" - }, - "outputs": [], - "source": [ - "import tensorflow as tf\n", - "\n", - "import copy\n", - "import logging\n", - "import numpy as np\n", - "import pandas as pd\n", - "import sys\n", - "import tensorflow_lattice as tfl\n", - "from tensorflow import feature_column as fc\n", - "logging.disable(sys.maxsize)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "svPuM6QNxlrH" - }, - "source": [ - "Downloading the UCI Statlog (Heart) dataset:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "cellView": "both", - "id": "j-k1qTR_yvBl" - }, - "outputs": [], - "source": [ - "csv_file = tf.keras.utils.get_file(\n", - " 'heart.csv', 'http://storage.googleapis.com/download.tensorflow.org/data/heart.csv')\n", - "df = pd.read_csv(csv_file)\n", - "target = df.pop('target')\n", - "train_size = int(len(df) * 0.8)\n", - "train_x = df[:train_size]\n", - "train_y = target[:train_size]\n", - "test_x = df[train_size:]\n", - "test_y = target[train_size:]\n", - "df.head()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "nKkAw12SxvGG" - }, - "source": [ - "Setting the default values used for training in this guide:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "cellView": "both", - "id": "1T6GFI9F6mcG" - }, - "outputs": [], - "source": [ - "LEARNING_RATE = 0.01\n", - "BATCH_SIZE = 128\n", - "NUM_EPOCHS = 500\n", - "PREFITTING_NUM_EPOCHS = 10" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "0TGfzhPHzpix" - }, - "source": [ - "## Feature Columns\n", - "\n", - "As for any other TF estimator, data needs to be passed to the estimator, which is typically via an input_fn and parsed using [FeatureColumns](https://www.tensorflow.org/guide/feature_columns)." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "DCIUz8apzs0l" - }, - "outputs": [], - "source": [ - "# Feature columns.\n", - "# - age\n", - "# - sex\n", - "# - cp chest pain type (4 values)\n", - "# - trestbps resting blood pressure\n", - "# - chol serum cholestoral in mg/dl\n", - "# - fbs fasting blood sugar \u003e 120 mg/dl\n", - "# - restecg resting electrocardiographic results (values 0,1,2)\n", - "# - thalach maximum heart rate achieved\n", - "# - exang exercise induced angina\n", - "# - oldpeak ST depression induced by exercise relative to rest\n", - "# - slope the slope of the peak exercise ST segment\n", - "# - ca number of major vessels (0-3) colored by flourosopy\n", - "# - thal 3 = normal; 6 = fixed defect; 7 = reversable defect\n", - "feature_columns = [\n", - " fc.numeric_column('age', default_value=-1),\n", - " fc.categorical_column_with_vocabulary_list('sex', [0, 1]),\n", - " fc.numeric_column('cp'),\n", - " fc.numeric_column('trestbps', default_value=-1),\n", - " fc.numeric_column('chol'),\n", - " fc.categorical_column_with_vocabulary_list('fbs', [0, 1]),\n", - " fc.categorical_column_with_vocabulary_list('restecg', [0, 1, 2]),\n", - " fc.numeric_column('thalach'),\n", - " fc.categorical_column_with_vocabulary_list('exang', [0, 1]),\n", - " fc.numeric_column('oldpeak'),\n", - " fc.categorical_column_with_vocabulary_list('slope', [0, 1, 2]),\n", - " fc.numeric_column('ca'),\n", - " fc.categorical_column_with_vocabulary_list(\n", - " 'thal', ['normal', 'fixed', 'reversible']),\n", - "]" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "hEZstmtT2CA3" - }, - "source": [ - "TFL canned estimators use the type of the feature column to decide what type of calibration layer to use. We use a `tfl.layers.PWLCalibration` layer for numeric feature columns and a `tfl.layers.CategoricalCalibration` layer for categorical feature columns.\n", - "\n", - "Note that categorical feature columns are not wrapped by an embedding feature column. They are directly fed into the estimator." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "H_LoW_9m5OFL" - }, - "source": [ - "## Creating input_fn\n", - "\n", - "As for any other estimator, you can use an input_fn to feed data to the model for training and evaluation. TFL estimators can automatically calculate quantiles of the features and use them as input keypoints for the PWL calibration layer. To do so, they require passing a `feature_analysis_input_fn`, which is similar to the training input_fn but with a single epoch or a subsample of the data." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "lFVy1Efy5NKD" - }, - "outputs": [], - "source": [ - "train_input_fn = tf.compat.v1.estimator.inputs.pandas_input_fn(\n", - " x=train_x,\n", - " y=train_y,\n", - " shuffle=False,\n", - " batch_size=BATCH_SIZE,\n", - " num_epochs=NUM_EPOCHS,\n", - " num_threads=1)\n", - "\n", - "# feature_analysis_input_fn is used to collect statistics about the input.\n", - "feature_analysis_input_fn = tf.compat.v1.estimator.inputs.pandas_input_fn(\n", - " x=train_x,\n", - " y=train_y,\n", - " shuffle=False,\n", - " batch_size=BATCH_SIZE,\n", - " # Note that we only need one pass over the data.\n", - " num_epochs=1,\n", - " num_threads=1)\n", - "\n", - "test_input_fn = tf.compat.v1.estimator.inputs.pandas_input_fn(\n", - " x=test_x,\n", - " y=test_y,\n", - " shuffle=False,\n", - " batch_size=BATCH_SIZE,\n", - " num_epochs=1,\n", - " num_threads=1)\n", - "\n", - "# Serving input fn is used to create saved models.\n", - "serving_input_fn = (\n", - " tf.estimator.export.build_parsing_serving_input_receiver_fn(\n", - " feature_spec=fc.make_parse_example_spec(feature_columns)))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "uQlzREcm2Wbj" - }, - "source": [ - "## Feature Configs\n", - "\n", - "Feature calibration and per-feature configurations are set using `tfl.configs.FeatureConfig`. Feature configurations include monotonicity constraints, per-feature regularization (see `tfl.configs.RegularizerConfig`), and lattice sizes for lattice models.\n", - "\n", - "If no configuration is defined for an input feature, the default configuration in `tfl.config.FeatureConfig` is used." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "vD0tNpiO3p9c" - }, - "outputs": [], - "source": [ - "# Feature configs are used to specify how each feature is calibrated and used.\n", - "feature_configs = [\n", - " tfl.configs.FeatureConfig(\n", - " name='age',\n", - " lattice_size=3,\n", - " # By default, input keypoints of pwl are quantiles of the feature.\n", - " pwl_calibration_num_keypoints=5,\n", - " monotonicity='increasing',\n", - " pwl_calibration_clip_max=100,\n", - " # Per feature regularization.\n", - " regularizer_configs=[\n", - " tfl.configs.RegularizerConfig(name='calib_wrinkle', l2=0.1),\n", - " ],\n", - " ),\n", - " tfl.configs.FeatureConfig(\n", - " name='cp',\n", - " pwl_calibration_num_keypoints=4,\n", - " # Keypoints can be uniformly spaced.\n", - " pwl_calibration_input_keypoints='uniform',\n", - " monotonicity='increasing',\n", - " ),\n", - " tfl.configs.FeatureConfig(\n", - " name='chol',\n", - " # Explicit input keypoint initialization.\n", - " pwl_calibration_input_keypoints=[126.0, 210.0, 247.0, 286.0, 564.0],\n", - " monotonicity='increasing',\n", - " # Calibration can be forced to span the full output range by clamping.\n", - " pwl_calibration_clamp_min=True,\n", - " pwl_calibration_clamp_max=True,\n", - " # Per feature regularization.\n", - " regularizer_configs=[\n", - " tfl.configs.RegularizerConfig(name='calib_hessian', l2=1e-4),\n", - " ],\n", - " ),\n", - " tfl.configs.FeatureConfig(\n", - " name='fbs',\n", - " # Partial monotonicity: output(0) \u003c= output(1)\n", - " monotonicity=[(0, 1)],\n", - " ),\n", - " tfl.configs.FeatureConfig(\n", - " name='trestbps',\n", - " pwl_calibration_num_keypoints=5,\n", - " monotonicity='decreasing',\n", - " ),\n", - " tfl.configs.FeatureConfig(\n", - " name='thalach',\n", - " pwl_calibration_num_keypoints=5,\n", - " 
monotonicity='decreasing',\n", - " ),\n", - " tfl.configs.FeatureConfig(\n", - " name='restecg',\n", - " # Partial monotonicity: output(0) \u003c= output(1), output(0) \u003c= output(2)\n", - " monotonicity=[(0, 1), (0, 2)],\n", - " ),\n", - " tfl.configs.FeatureConfig(\n", - " name='exang',\n", - " # Partial monotonicity: output(0) \u003c= output(1)\n", - " monotonicity=[(0, 1)],\n", - " ),\n", - " tfl.configs.FeatureConfig(\n", - " name='oldpeak',\n", - " pwl_calibration_num_keypoints=5,\n", - " monotonicity='increasing',\n", - " ),\n", - " tfl.configs.FeatureConfig(\n", - " name='slope',\n", - " # Partial monotonicity: output(0) \u003c= output(1), output(1) \u003c= output(2)\n", - " monotonicity=[(0, 1), (1, 2)],\n", - " ),\n", - " tfl.configs.FeatureConfig(\n", - " name='ca',\n", - " pwl_calibration_num_keypoints=4,\n", - " monotonicity='increasing',\n", - " ),\n", - " tfl.configs.FeatureConfig(\n", - " name='thal',\n", - " # Partial monotonicity:\n", - " # output(normal) \u003c= output(fixed)\n", - " # output(normal) \u003c= output(reversible) \n", - " monotonicity=[('normal', 'fixed'), ('normal', 'reversible')],\n", - " ),\n", - "]" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "LKBULveZ4mr3" - }, - "source": [ - "## Calibrated Linear Model\n", - "\n", - "To construct a TFL canned estimator, construct a model configuration from `tfl.configs`. A calibrated linear model is constructed using `tfl.configs.CalibratedLinearConfig`. It applies piecewise-linear and categorical calibration on the input features, followed by a linear combination and an optional output piecewise-linear calibration. When using output calibration or when output bounds are specified, the linear layer will apply weighted averaging on calibrated inputs.\n", - "\n", - "This example creates a calibrated linear model on the first 5 features. We use\n", - "`tfl.visualization` to plot the model graph with the calibrator plots." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "diRRozio4sAL" - }, - "outputs": [], - "source": [ - "# Model config defines the model structure for the estimator.\n", - "model_config = tfl.configs.CalibratedLinearConfig(\n", - " feature_configs=feature_configs,\n", - " use_bias=True,\n", - " output_calibration=True,\n", - " regularizer_configs=[\n", - " # Regularizer for the output calibrator.\n", - " tfl.configs.RegularizerConfig(name='output_calib_hessian', l2=1e-4),\n", - " ])\n", - "# A CannedClassifier is constructed from the given model config.\n", - "estimator = tfl.estimators.CannedClassifier(\n", - " feature_columns=feature_columns[:5],\n", - " model_config=model_config,\n", - " feature_analysis_input_fn=feature_analysis_input_fn,\n", - " optimizer=tf.keras.optimizers.legacy.Adam(LEARNING_RATE),\n", - " config=tf.estimator.RunConfig(tf_random_seed=42))\n", - "estimator.train(input_fn=train_input_fn)\n", - "results = estimator.evaluate(input_fn=test_input_fn)\n", - "print('Calibrated linear test AUC: {}'.format(results['auc']))\n", - "saved_model_path = estimator.export_saved_model(estimator.model_dir,\n", - " serving_input_fn)\n", - "model_graph = tfl.estimators.get_model_graph(saved_model_path)\n", - "tfl.visualization.draw_model_graph(model_graph)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "zWzPM2_p977t" - }, - "source": [ - "## Calibrated Lattice Model\n", - "\n", - "A calibrated lattice model is constructed using `tfl.configs.CalibratedLatticeConfig`. 
A calibrated lattice model applies piecewise-linear and categorical calibration on the input features, followed by a lattice model and an optional output piecewise-linear calibration.\n", - "\n", - "This example creates a calibrated lattice model on the first 5 features.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "C6EvVpKW4BbC" - }, - "outputs": [], - "source": [ - "# This is calibrated lattice model: Inputs are calibrated, then combined\n", - "# non-linearly using a lattice layer.\n", - "model_config = tfl.configs.CalibratedLatticeConfig(\n", - " feature_configs=feature_configs,\n", - " regularizer_configs=[\n", - " # Torsion regularizer applied to the lattice to make it more linear.\n", - " tfl.configs.RegularizerConfig(name='torsion', l2=1e-4),\n", - " # Globally defined calibration regularizer is applied to all features.\n", - " tfl.configs.RegularizerConfig(name='calib_hessian', l2=1e-4),\n", - " ])\n", - "# A CannedClassifier is constructed from the given model config.\n", - "estimator = tfl.estimators.CannedClassifier(\n", - " feature_columns=feature_columns[:5],\n", - " model_config=model_config,\n", - " feature_analysis_input_fn=feature_analysis_input_fn,\n", - " optimizer=tf.keras.optimizers.legacy.Adam(LEARNING_RATE),\n", - " config=tf.estimator.RunConfig(tf_random_seed=42))\n", - "estimator.train(input_fn=train_input_fn)\n", - "results = estimator.evaluate(input_fn=test_input_fn)\n", - "print('Calibrated lattice test AUC: {}'.format(results['auc']))\n", - "saved_model_path = estimator.export_saved_model(estimator.model_dir,\n", - " serving_input_fn)\n", - "model_graph = tfl.estimators.get_model_graph(saved_model_path)\n", - "tfl.visualization.draw_model_graph(model_graph)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "9494K_ZBKFcm" - }, - "source": [ - "## Calibrated Lattice Ensemble\n", - "\n", - "When the number of features is large, you can use an ensemble model, which creates multiple smaller lattices for subsets of the features and averages their output instead of creating just a single huge lattice. Ensemble lattice models are constructed using `tfl.configs.CalibratedLatticeEnsembleConfig`. A calibrated lattice ensemble model applies piecewise-linear and categorical calibration on the input feature, followed by an ensemble of lattice models and an optional output piecewise-linear calibration.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "KjrzziMFKuCB" - }, - "source": [ - "### Random Lattice Ensemble\n", - "\n", - "The following model config uses a random subset of features for each lattice." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "YBSS7dLjKExq" - }, - "outputs": [], - "source": [ - "# This is random lattice ensemble model with separate calibration:\n", - "# model output is the average output of separately calibrated lattices.\n", - "model_config = tfl.configs.CalibratedLatticeEnsembleConfig(\n", - " feature_configs=feature_configs,\n", - " num_lattices=5,\n", - " lattice_rank=3)\n", - "# A CannedClassifier is constructed from the given model config.\n", - "estimator = tfl.estimators.CannedClassifier(\n", - " feature_columns=feature_columns,\n", - " model_config=model_config,\n", - " feature_analysis_input_fn=feature_analysis_input_fn,\n", - " optimizer=tf.keras.optimizers.legacy.Adam(LEARNING_RATE),\n", - " config=tf.estimator.RunConfig(tf_random_seed=42))\n", - "estimator.train(input_fn=train_input_fn)\n", - "results = estimator.evaluate(input_fn=test_input_fn)\n", - "print('Random ensemble test AUC: {}'.format(results['auc']))\n", - "saved_model_path = estimator.export_saved_model(estimator.model_dir,\n", - " serving_input_fn)\n", - "model_graph = tfl.estimators.get_model_graph(saved_model_path)\n", - "tfl.visualization.draw_model_graph(model_graph, calibrator_dpi=15)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "7uyO8s97FGJM" - }, - "source": [ - "### RTL Layer Random Lattice Ensemble\n", - "\n", - "The following model config uses a `tfl.layers.RTL` layer that uses a random subset of features for each lattice. We note that `tfl.layers.RTL` only supports monotonicity constraints and must have the same lattice size for all features and no per-feature regularization. Note that using a `tfl.layers.RTL` layer lets you scale to much larger ensembles than using separate `tfl.layers.Lattice` instances." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "8v7dKg-FF7iz" - }, - "outputs": [], - "source": [ - "# Make sure our feature configs have the same lattice size, no per-feature\n", - "# regularization, and only monotonicity constraints.\n", - "rtl_layer_feature_configs = copy.deepcopy(feature_configs)\n", - "for feature_config in rtl_layer_feature_configs:\n", - " feature_config.lattice_size = 2\n", - " feature_config.unimodality = 'none'\n", - " feature_config.reflects_trust_in = None\n", - " feature_config.dominates = None\n", - " feature_config.regularizer_configs = None\n", - "# This is RTL layer ensemble model with separate calibration:\n", - "# model output is the average output of separately calibrated lattices.\n", - "model_config = tfl.configs.CalibratedLatticeEnsembleConfig(\n", - " lattices='rtl_layer',\n", - " feature_configs=rtl_layer_feature_configs,\n", - " num_lattices=5,\n", - " lattice_rank=3)\n", - "# A CannedClassifier is constructed from the given model config.\n", - "estimator = tfl.estimators.CannedClassifier(\n", - " feature_columns=feature_columns,\n", - " model_config=model_config,\n", - " feature_analysis_input_fn=feature_analysis_input_fn,\n", - " optimizer=tf.keras.optimizers.legacy.Adam(LEARNING_RATE),\n", - " config=tf.estimator.RunConfig(tf_random_seed=42))\n", - "estimator.train(input_fn=train_input_fn)\n", - "results = estimator.evaluate(input_fn=test_input_fn)\n", - "print('Random ensemble test AUC: {}'.format(results['auc']))\n", - "saved_model_path = estimator.export_saved_model(estimator.model_dir,\n", - " serving_input_fn)\n", - "model_graph = tfl.estimators.get_model_graph(saved_model_path)\n", - "tfl.visualization.draw_model_graph(model_graph, calibrator_dpi=15)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "LSXEaYAULRvf" - }, - "source": [ - "### Crystals Lattice Ensemble\n", - "\n", - "TFL also provides a heuristic feature arrangement algorithm, called [Crystals](https://papers.nips.cc/paper/6377-fast-and-flexible-monotonic-functions-with-ensembles-of-lattices). The Crystals algorithm first trains a *prefitting model* that estimates pairwise feature interactions. It then arranges the final ensemble such that features with more non-linear interactions are in the same lattices.\n", - "\n", - "For Crystals models, you will also need to provide a `prefitting_input_fn` that is used to train the prefitting model, as described above. The prefitting model does not need to be fully trained, so a few epochs should be enough.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "FjQKh9saMaFu" - }, - "outputs": [], - "source": [ - "prefitting_input_fn = tf.compat.v1.estimator.inputs.pandas_input_fn(\n", - " x=train_x,\n", - " y=train_y,\n", - " shuffle=False,\n", - " batch_size=BATCH_SIZE,\n", - " num_epochs=PREFITTING_NUM_EPOCHS,\n", - " num_threads=1)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "fVnZpwX8MtPi" - }, - "source": [ - "You can then create a Crystal model by setting `lattice='crystals'` in the model config." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "f4awRMDe-eMv" - }, - "outputs": [], - "source": [ - "# This is Crystals ensemble model with separate calibration: model output is\n", - "# the average output of separately calibrated lattices.\n", - "model_config = tfl.configs.CalibratedLatticeEnsembleConfig(\n", - " feature_configs=feature_configs,\n", - " lattices='crystals',\n", - " num_lattices=5,\n", - " lattice_rank=3)\n", - "# A CannedClassifier is constructed from the given model config.\n", - "estimator = tfl.estimators.CannedClassifier(\n", - " feature_columns=feature_columns,\n", - " model_config=model_config,\n", - " feature_analysis_input_fn=feature_analysis_input_fn,\n", - " # prefitting_input_fn is required to train the prefitting model.\n", - " prefitting_input_fn=prefitting_input_fn,\n", - " optimizer=tf.keras.optimizers.legacy.Adam(LEARNING_RATE),\n", - " prefitting_optimizer=tf.keras.optimizers.legacy.Adam(LEARNING_RATE),\n", - " config=tf.estimator.RunConfig(tf_random_seed=42))\n", - "estimator.train(input_fn=train_input_fn)\n", - "results = estimator.evaluate(input_fn=test_input_fn)\n", - "print('Crystals ensemble test AUC: {}'.format(results['auc']))\n", - "saved_model_path = estimator.export_saved_model(estimator.model_dir,\n", - " serving_input_fn)\n", - "model_graph = tfl.estimators.get_model_graph(saved_model_path)\n", - "tfl.visualization.draw_model_graph(model_graph, calibrator_dpi=15)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Isb2vyLAVBM1" - }, - "source": [ - "You can plot feature calibrators with more details using the `tfl.visualization` module." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "DJPaREuWS2sg" - }, - "outputs": [], - "source": [ - "_ = tfl.visualization.plot_feature_calibrator(model_graph, \"age\")\n", - "_ = tfl.visualization.plot_feature_calibrator(model_graph, \"restecg\")" - ] - } - ], - "metadata": { - "colab": { - "collapsed_sections": [], - "name": "canned_estimators.ipynb", - "private_outputs": true, - "provenance": [], - "toc_visible": true - }, - "kernelspec": { - "display_name": "Python 3", - "name": "python3" - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} diff --git a/docs/tutorials/custom_estimators.ipynb b/docs/tutorials/custom_estimators.ipynb deleted file mode 100644 index 86f6c22..0000000 --- a/docs/tutorials/custom_estimators.ipynb +++ /dev/null @@ -1,409 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "7765UFHoyGx6" - }, - "source": [ - "##### Copyright 2020 The TensorFlow Authors." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "cellView": "form", - "id": "KsOkK8O69PyT" - }, - "outputs": [], - "source": [ - "#@title Licensed under the Apache License, Version 2.0 (the \"License\");\n", - "# you may not use this file except in compliance with the License.\n", - "# You may obtain a copy of the License at\n", - "#\n", - "# https://www.apache.org/licenses/LICENSE-2.0\n", - "#\n", - "# Unless required by applicable law or agreed to in writing, software\n", - "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", - "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", - "# See the License for the specific language governing permissions and\n", - "# limitations under the License." 
- ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ZS8z-_KeywY9" - }, - "source": [ - "# TF Lattice Custom Estimators" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "r61fkA2i9Y3_" - }, - "source": [ - "\u003ctable class=\"tfo-notebook-buttons\" align=\"left\"\u003e\n", - " \u003ctd\u003e\n", - " \u003ca target=\"_blank\" href=\"https://www.tensorflow.org/lattice/tutorials/custom_estimators\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/tf_logo_32px.png\" /\u003eView on TensorFlow.org\u003c/a\u003e\n", - " \u003c/td\u003e\n", - " \u003ctd\u003e\n", - " \u003ca target=\"_blank\" href=\"https://colab.research.google.com/github/tensorflow/lattice/blob/master/docs/tutorials/custom_estimators.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" /\u003eRun in Google Colab\u003c/a\u003e\n", - " \u003c/td\u003e\n", - " \u003ctd\u003e\n", - " \u003ca target=\"_blank\" href=\"https://github.com/tensorflow/lattice/blob/master/docs/tutorials/custom_estimators.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" /\u003eView source on GitHub\u003c/a\u003e\n", - " \u003c/td\u003e\n", - " \u003ctd\u003e\n", - " \u003ca href=\"https://storage.googleapis.com/tensorflow_docs/lattice/docs/tutorials/custom_estimators.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/download_logo_32px.png\" /\u003eDownload notebook\u003c/a\u003e\n", - " \u003c/td\u003e\n", - "\u003c/table\u003e" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "rVdTRCQiSGAb" - }, - "source": [ - "\u003e Warning: Estimators are not recommended for new code. Estimators run `v1. Session`-style code which is more difficult to write correctly, and can behave unexpectedly, especially when combined with TF 2 code. Estimators do fall under our [compatibility guarantees] (https://tensorflow.org/guide/versions), but they will not receive any additional features, and there will be no fixes other than to security vulnerabilities. See the [migration guide](https://tensorflow.org/guide/migrate) for details." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Ur6yCw7YVvr8" - }, - "source": [ - "## Overview\n", - "\n", - "You can use custom estimators to create arbitrarily monotonic models using TFL layers. This guide outlines the steps needed to create such estimators." 
- ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "x769lI12IZXB" - }, - "source": [ - "## Setup" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "fbBVAR6UeRN5" - }, - "source": [ - "Installing TF Lattice package:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "bpXjJKpSd3j4" - }, - "outputs": [], - "source": [ - "#@test {\"skip\": true}\n", - "!pip install tensorflow-lattice" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "jSVl9SHTeSGX" - }, - "source": [ - "Importing required packages:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "cellView": "both", - "id": "P9rMpg1-ASY3" - }, - "outputs": [], - "source": [ - "import tensorflow as tf\n", - "\n", - "import logging\n", - "import numpy as np\n", - "import pandas as pd\n", - "import sys\n", - "import tensorflow_lattice as tfl\n", - "from tensorflow import feature_column as fc\n", - "\n", - "from tensorflow_estimator.python.estimator.canned import optimizers\n", - "from tensorflow_estimator.python.estimator.head import binary_class_head\n", - "logging.disable(sys.maxsize)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "svPuM6QNxlrH" - }, - "source": [ - "Downloading the UCI Statlog (Heart) dataset:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "cellView": "both", - "id": "M0CmH1gPASZF" - }, - "outputs": [], - "source": [ - "csv_file = tf.keras.utils.get_file(\n", - " 'heart.csv', 'http://storage.googleapis.com/download.tensorflow.org/data/heart.csv')\n", - "df = pd.read_csv(csv_file)\n", - "target = df.pop('target')\n", - "train_size = int(len(df) * 0.8)\n", - "train_x = df[:train_size]\n", - "train_y = target[:train_size]\n", - "test_x = df[train_size:]\n", - "test_y = target[train_size:]\n", - "df.head()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "nKkAw12SxvGG" - }, - "source": [ - "Setting the default values used for training in this guide:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "cellView": "both", - "id": "1T6GFI9F6mcG" - }, - "outputs": [], - "source": [ - "LEARNING_RATE = 0.1\n", - "BATCH_SIZE = 128\n", - "NUM_EPOCHS = 1000" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "0TGfzhPHzpix" - }, - "source": [ - "## Feature Columns\n", - "\n", - "As for any other TF estimator, data needs to be passed to the estimator, which is typically via an input_fn and parsed using [FeatureColumns](https://www.tensorflow.org/guide/feature_columns)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "DCIUz8apzs0l" - }, - "outputs": [], - "source": [ - "# Feature columns.\n", - "# - age\n", - "# - sex\n", - "# - ca number of major vessels (0-3) colored by flourosopy\n", - "# - thal 3 = normal; 6 = fixed defect; 7 = reversable defect\n", - "feature_columns = [\n", - " fc.numeric_column('age', default_value=-1),\n", - " fc.categorical_column_with_vocabulary_list('sex', [0, 1]),\n", - " fc.numeric_column('ca'),\n", - " fc.categorical_column_with_vocabulary_list(\n", - " 'thal', ['normal', 'fixed', 'reversible']),\n", - "]" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "hEZstmtT2CA3" - }, - "source": [ - "Note that categorical features do not need to be wrapped by a dense feature column, since `tfl.laysers.CategoricalCalibration` layer can directly consume category indices." 
- ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "H_LoW_9m5OFL" - }, - "source": [ - "## Creating input_fn\n", - "\n", - "As for any other estimator, you can use input_fn to feed data to the model for training and evaluation." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "lFVy1Efy5NKD" - }, - "outputs": [], - "source": [ - "train_input_fn = tf.compat.v1.estimator.inputs.pandas_input_fn(\n", - " x=train_x,\n", - " y=train_y,\n", - " shuffle=True,\n", - " batch_size=BATCH_SIZE,\n", - " num_epochs=NUM_EPOCHS,\n", - " num_threads=1)\n", - "\n", - "test_input_fn = tf.compat.v1.estimator.inputs.pandas_input_fn(\n", - " x=test_x,\n", - " y=test_y,\n", - " shuffle=False,\n", - " batch_size=BATCH_SIZE,\n", - " num_epochs=1,\n", - " num_threads=1)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "kbrgSr9KaRg0" - }, - "source": [ - "## Creating model_fn\n", - "\n", - "There are several ways to create a custom estimator. Here we will construct a `model_fn` that calls a Keras model on the parsed input tensors. To parse the input features, you can use `tf.feature_column.input_layer`, `tf.keras.layers.DenseFeatures`, or `tfl.estimators.transform_features`. If you use the latter, you will not need to wrap categorical features with dense feature columns, and the resulting tensors will not be concatenated, which makes it easier to use the features in the calibration layers.\n", - "\n", - "To construct a model, you can mix and match TFL layers or any other Keras layers. Here we create a calibrated lattice Keras model out of TFL layers and impose several monotonicity constraints. We then use the Keras model to create the custom estimator.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "n2Zrv6OPaQO2" - }, - "outputs": [], - "source": [ - "def model_fn(features, labels, mode, config):\n", - " \"\"\"model_fn for the custom estimator.\"\"\"\n", - " del config\n", - " input_tensors = tfl.estimators.transform_features(features, feature_columns)\n", - " inputs = {\n", - " key: tf.keras.layers.Input(shape=(1,), name=key) for key in input_tensors\n", - " }\n", - "\n", - " lattice_sizes = [3, 2, 2, 2]\n", - " lattice_monotonicities = ['increasing', 'none', 'increasing', 'increasing']\n", - " lattice_input = tf.keras.layers.Concatenate(axis=1)([\n", - " tfl.layers.PWLCalibration(\n", - " input_keypoints=np.linspace(10, 100, num=8, dtype=np.float32),\n", - " # The output range of the calibrator should be the input range of\n", - " # the following lattice dimension.\n", - " output_min=0.0,\n", - " output_max=lattice_sizes[0] - 1.0,\n", - " monotonicity='increasing',\n", - " )(inputs['age']),\n", - " tfl.layers.CategoricalCalibration(\n", - " # Number of categories including any missing/default category.\n", - " num_buckets=2,\n", - " output_min=0.0,\n", - " output_max=lattice_sizes[1] - 1.0,\n", - " )(inputs['sex']),\n", - " tfl.layers.PWLCalibration(\n", - " input_keypoints=[0.0, 1.0, 2.0, 3.0],\n", - " output_min=0.0,\n", - " output_max=lattice_sizes[0] - 1.0,\n", - " # You can specify TFL regularizers as tuple\n", - " # ('regularizer name', l1, l2).\n", - " kernel_regularizer=('hessian', 0.0, 1e-4),\n", - " monotonicity='increasing',\n", - " )(inputs['ca']),\n", - " tfl.layers.CategoricalCalibration(\n", - " num_buckets=3,\n", - " output_min=0.0,\n", - " output_max=lattice_sizes[1] - 1.0,\n", - " # Categorical monotonicity can be partial order.\n", - " # (i, j) indicates that we must have output(i) 
\u003c= output(j).\n", - " # Make sure to set the lattice monotonicity to 'increasing' for this\n", - " # dimension.\n", - " monotonicities=[(0, 1), (0, 2)],\n", - " )(inputs['thal']),\n", - " ])\n", - " output = tfl.layers.Lattice(\n", - " lattice_sizes=lattice_sizes, monotonicities=lattice_monotonicities)(\n", - " lattice_input)\n", - "\n", - " training = (mode == tf.estimator.ModeKeys.TRAIN)\n", - " model = tf.keras.Model(inputs=inputs, outputs=output)\n", - " logits = model(input_tensors, training=training)\n", - "\n", - " if training:\n", - " optimizer = optimizers.get_optimizer_instance_v2('Adagrad', LEARNING_RATE)\n", - " else:\n", - " optimizer = None\n", - "\n", - " head = binary_class_head.BinaryClassHead()\n", - " return head.create_estimator_spec(\n", - " features=features,\n", - " mode=mode,\n", - " labels=labels,\n", - " optimizer=optimizer,\n", - " logits=logits,\n", - " trainable_variables=model.trainable_variables,\n", - " update_ops=model.updates)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "mng-VtsSbVtQ" - }, - "source": [ - "## Training and Estimator\n", - "\n", - "Using the `model_fn` we can create and train the estimator." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "j38GaEbKbZju" - }, - "outputs": [], - "source": [ - "estimator = tf.estimator.Estimator(model_fn=model_fn)\n", - "estimator.train(input_fn=train_input_fn)\n", - "results = estimator.evaluate(input_fn=test_input_fn)\n", - "print('AUC: {}'.format(results['auc']))" - ] - } - ], - "metadata": { - "colab": { - "collapsed_sections": [], - "name": "custom_estimators.ipynb", - "private_outputs": true, - "provenance": [], - "toc_visible": true - }, - "kernelspec": { - "display_name": "Python 3", - "name": "python3" - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} diff --git a/docs/tutorials/keras_layers.ipynb b/docs/tutorials/keras_layers.ipynb index e7fc513..b030176 100644 --- a/docs/tutorials/keras_layers.ipynb +++ b/docs/tutorials/keras_layers.ipynb @@ -3,7 +3,6 @@ { "cell_type": "markdown", "metadata": { - "colab_type": "text", "id": "7765UFHoyGx6" }, "source": [ @@ -12,11 +11,9 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "cellView": "form", - "colab": {}, - "colab_type": "code", "id": "KsOkK8O69PyT" }, "outputs": [], @@ -37,7 +34,6 @@ { "cell_type": "markdown", "metadata": { - "colab_type": "text", "id": "ZS8z-_KeywY9" }, "source": [ @@ -47,7 +43,6 @@ { "cell_type": "markdown", "metadata": { - "colab_type": "text", "id": "r61fkA2i9Y3_" }, "source": [ @@ -70,7 +65,6 @@ { "cell_type": "markdown", "metadata": { - "colab_type": "text", "id": "ecLbJCvJSSCd" }, "source": [ @@ -84,7 +78,6 @@ { "cell_type": "markdown", "metadata": { - "colab_type": "text", "id": "x769lI12IZXB" }, "source": [ @@ -94,7 +87,6 @@ { "cell_type": "markdown", "metadata": { - "colab_type": "text", "id": "fbBVAR6UeRN5" }, "source": [ @@ -103,22 +95,19 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { - "colab": {}, - "colab_type": "code", "id": "bpXjJKpSd3j4" }, "outputs": [], "source": [ "#@test {\"skip\": true}\n", - "!pip install tensorflow-lattice pydot" + "!pip install --pre -U tensorflow tf-keras tensorflow-lattice pydot graphviz" ] }, { "cell_type": "markdown", "metadata": { - "colab_type": "text", "id": "jSVl9SHTeSGX" }, "source": [ @@ -127,11 +116,9 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "cellView": "both", - "colab": {}, 
- "colab_type": "code", "id": "pm0LD8iyIZXF" }, "outputs": [], @@ -147,10 +134,25 @@ "logging.disable(sys.maxsize)" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "m8TsvLIe4Az-" + }, + "outputs": [], + "source": [ + "# Use Keras 2.\n", + "version_fn = getattr(tf.keras, \"version\", None)\n", + "if version_fn and version_fn().startswith(\"3.\"):\n", + " import tf_keras as keras\n", + "else:\n", + " keras = tf.keras" + ] + }, { "cell_type": "markdown", "metadata": { - "colab_type": "text", "id": "svPuM6QNxlrH" }, "source": [ @@ -159,17 +161,15 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "cellView": "both", - "colab": {}, - "colab_type": "code", "id": "PG3pFtK-IZXM" }, "outputs": [], "source": [ "# UCI Statlog (Heart) dataset.\n", - "csv_file = tf.keras.utils.get_file(\n", + "csv_file = keras.utils.get_file(\n", " 'heart.csv', 'http://storage.googleapis.com/download.tensorflow.org/data/heart.csv')\n", "training_data_df = pd.read_csv(csv_file).sample(\n", " frac=1.0, random_state=41).reset_index(drop=True)\n", @@ -179,7 +179,6 @@ { "cell_type": "markdown", "metadata": { - "colab_type": "text", "id": "nKkAw12SxvGG" }, "source": [ @@ -188,11 +187,9 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "cellView": "both", - "colab": {}, - "colab_type": "code", "id": "krAJBE-yIZXR" }, "outputs": [], @@ -205,7 +202,6 @@ { "cell_type": "markdown", "metadata": { - "colab_type": "text", "id": "0TGfzhPHzpix" }, "source": [ @@ -218,10 +214,8 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { - "colab": {}, - "colab_type": "code", "id": "nOQWqPAbQS3o" }, "outputs": [], @@ -233,7 +227,6 @@ { "cell_type": "markdown", "metadata": { - "colab_type": "text", "id": "W3DnEKWvQYXm" }, "source": [ @@ -242,10 +235,8 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { - "colab": {}, - "colab_type": "code", "id": "o_hyk5GkQfl8" }, "outputs": [], @@ -256,7 +247,6 @@ { "cell_type": "markdown", "metadata": { - "colab_type": "text", "id": "BPZsSUZiQiwc" }, "source": [ @@ -265,10 +255,8 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { - "colab": {}, - "colab_type": "code", "id": "DXPc6rSGxzFZ" }, "outputs": [], @@ -350,7 +338,7 @@ " # You can specify list of regularizers. You are not limited to TFL\n", " # regularizrs. 
Feel free to use any :)\n", " kernel_regularizer=[('laplacian', 0.0, 1e-4),\n", - " tf.keras.regularizers.l1_l2(l1=0.001)])\n", + " keras.regularizers.l1_l2(l1=0.001)])\n", "combined_calibrators.append(calibrator)\n", "\n", "# ############### fbs ###############\n", @@ -367,7 +355,7 @@ " monotonicities=[(0, 1)],\n", " # This initializer is identical to default one('uniform'), but has fixed\n", " # seed in order to simplify experimentation.\n", - " kernel_initializer=tf.keras.initializers.RandomUniform(\n", + " kernel_initializer=keras.initializers.RandomUniform(\n", " minval=0.0, maxval=lattice_sizes[5] - 1.0, seed=1))\n", "combined_calibrators.append(calibrator)\n", "\n", @@ -379,7 +367,7 @@ " # Categorical monotonicity can be partial order.\n", " monotonicities=[(0, 1), (0, 2)],\n", " # Categorical calibration layer supports standard Keras regularizers.\n", - " kernel_regularizer=tf.keras.regularizers.l1_l2(l1=0.001),\n", + " kernel_regularizer=keras.regularizers.l1_l2(l1=0.001),\n", " kernel_initializer='constant')\n", "combined_calibrators.append(calibrator)" ] @@ -387,7 +375,6 @@ { "cell_type": "markdown", "metadata": { - "colab_type": "text", "id": "inyNlSBeQyp7" }, "source": [ @@ -398,10 +385,8 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { - "colab": {}, - "colab_type": "code", "id": "DNCc9oBTRo6w" }, "outputs": [], @@ -419,7 +404,6 @@ { "cell_type": "markdown", "metadata": { - "colab_type": "text", "id": "T5q2InayRpDr" }, "source": [ @@ -428,15 +412,13 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { - "colab": {}, - "colab_type": "code", "id": "xX6lroYZQy3L" }, "outputs": [], "source": [ - "model = tf.keras.models.Sequential()\n", + "model = keras.models.Sequential()\n", "model.add(combined_calibrators)\n", "model.add(lattice)" ] @@ -444,7 +426,6 @@ { "cell_type": "markdown", "metadata": { - "colab_type": "text", "id": "W3UFxD3fRzIC" }, "source": [ @@ -453,10 +434,8 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { - "colab": {}, - "colab_type": "code", "id": "2jz4JvI-RzSj" }, "outputs": [], @@ -467,8 +446,8 @@ "target = training_data_df[['target']].values.astype(np.float32)\n", "\n", "model.compile(\n", - " loss=tf.keras.losses.mean_squared_error,\n", - " optimizer=tf.keras.optimizers.Adagrad(learning_rate=LEARNING_RATE))\n", + " loss=keras.losses.mean_squared_error,\n", + " optimizer=keras.optimizers.Adagrad(learning_rate=LEARNING_RATE))\n", "model.fit(\n", " features,\n", " target,\n", @@ -484,7 +463,6 @@ { "cell_type": "markdown", "metadata": { - "colab_type": "text", "id": "RTHoW_5lxwT5" }, "source": [ @@ -497,10 +475,8 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { - "colab": {}, - "colab_type": "code", "id": "gJjUYvBuW1qE" }, "outputs": [], @@ -512,7 +488,6 @@ { "cell_type": "markdown", "metadata": { - "colab_type": "text", "id": "Z03qY5MYW1yT" }, "source": [ @@ -521,10 +496,8 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { - "colab": {}, - "colab_type": "code", "id": "DCIUz8apzs0l" }, "outputs": [], @@ -532,7 +505,7 @@ "model_inputs = []\n", "lattice_inputs = []\n", "# ############### age ###############\n", - "age_input = tf.keras.layers.Input(shape=[1], name='age')\n", + "age_input = keras.layers.Input(shape=[1], name='age')\n", "model_inputs.append(age_input)\n", "age_calibrator = tfl.layers.PWLCalibration(\n", " # Every PWLCalibration layer 
must have keypoints of piecewise linear\n", @@ -556,7 +529,7 @@ "# ############### sex ###############\n", "# For boolean features simply specify CategoricalCalibration layer with 2\n", "# buckets.\n", - "sex_input = tf.keras.layers.Input(shape=[1], name='sex')\n", + "sex_input = keras.layers.Input(shape=[1], name='sex')\n", "model_inputs.append(sex_input)\n", "sex_calibrator = tfl.layers.CategoricalCalibration(\n", " num_buckets=2,\n", @@ -570,7 +543,7 @@ "lattice_inputs.append(sex_calibrator)\n", "\n", "# ############### cp ###############\n", - "cp_input = tf.keras.layers.Input(shape=[1], name='cp')\n", + "cp_input = keras.layers.Input(shape=[1], name='cp')\n", "model_inputs.append(cp_input)\n", "cp_calibrator = tfl.layers.PWLCalibration(\n", " # Here instead of specifying dtype of layer we convert keypoints into\n", @@ -587,7 +560,7 @@ "lattice_inputs.append(cp_calibrator)\n", "\n", "# ############### trestbps ###############\n", - "trestbps_input = tf.keras.layers.Input(shape=[1], name='trestbps')\n", + "trestbps_input = keras.layers.Input(shape=[1], name='trestbps')\n", "model_inputs.append(trestbps_input)\n", "trestbps_calibrator = tfl.layers.PWLCalibration(\n", " # Alternatively, you might want to use quantiles as keypoints instead of\n", @@ -612,7 +585,7 @@ "lattice_inputs.append(trestbps_calibrator)\n", "\n", "# ############### chol ###############\n", - "chol_input = tf.keras.layers.Input(shape=[1], name='chol')\n", + "chol_input = keras.layers.Input(shape=[1], name='chol')\n", "model_inputs.append(chol_input)\n", "chol_calibrator = tfl.layers.PWLCalibration(\n", " # Explicit input keypoint initialization.\n", @@ -629,14 +602,14 @@ " # You can specify list of regularizers. You are not limited to TFL\n", " # regularizrs. Feel free to use any :)\n", " kernel_regularizer=[('laplacian', 0.0, 1e-4),\n", - " tf.keras.regularizers.l1_l2(l1=0.001)],\n", + " keras.regularizers.l1_l2(l1=0.001)],\n", " name='chol_calib',\n", ")(\n", " chol_input)\n", "lattice_inputs.append(chol_calibrator)\n", "\n", "# ############### fbs ###############\n", - "fbs_input = tf.keras.layers.Input(shape=[1], name='fbs')\n", + "fbs_input = keras.layers.Input(shape=[1], name='fbs')\n", "model_inputs.append(fbs_input)\n", "fbs_calibrator = tfl.layers.CategoricalCalibration(\n", " num_buckets=2,\n", @@ -651,7 +624,7 @@ " monotonicities=[(0, 1)],\n", " # This initializer is identical to default one ('uniform'), but has fixed\n", " # seed in order to simplify experimentation.\n", - " kernel_initializer=tf.keras.initializers.RandomUniform(\n", + " kernel_initializer=keras.initializers.RandomUniform(\n", " minval=0.0, maxval=lattice_sizes[5] - 1.0, seed=1),\n", " name='fbs_calib',\n", ")(\n", @@ -659,7 +632,7 @@ "lattice_inputs.append(fbs_calibrator)\n", "\n", "# ############### restecg ###############\n", - "restecg_input = tf.keras.layers.Input(shape=[1], name='restecg')\n", + "restecg_input = keras.layers.Input(shape=[1], name='restecg')\n", "model_inputs.append(restecg_input)\n", "restecg_calibrator = tfl.layers.CategoricalCalibration(\n", " num_buckets=3,\n", @@ -668,7 +641,7 @@ " # Categorical monotonicity can be partial order.\n", " monotonicities=[(0, 1), (0, 2)],\n", " # Categorical calibration layer supports standard Keras regularizers.\n", - " kernel_regularizer=tf.keras.regularizers.l1_l2(l1=0.001),\n", + " kernel_regularizer=keras.regularizers.l1_l2(l1=0.001),\n", " kernel_initializer='constant',\n", " name='restecg_calib',\n", ")(\n", @@ -679,7 +652,6 @@ { "cell_type": "markdown", "metadata": { - 
"colab_type": "text", "id": "Fr0k8La_YgQG" }, "source": [ @@ -690,10 +662,8 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { - "colab": {}, - "colab_type": "code", "id": "X15RE0NybNbU" }, "outputs": [], @@ -714,7 +684,6 @@ { "cell_type": "markdown", "metadata": { - "colab_type": "text", "id": "31VzsnMCA9dh" }, "source": [ @@ -723,10 +692,8 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { - "colab": {}, - "colab_type": "code", "id": "efCP3Yx2A9n7" }, "outputs": [], @@ -741,7 +708,6 @@ { "cell_type": "markdown", "metadata": { - "colab_type": "text", "id": "1SURnNl8bNgw" }, "source": [ @@ -750,24 +716,21 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { - "colab": {}, - "colab_type": "code", "id": "7gY-VXuYbZLa" }, "outputs": [], "source": [ - "model = tf.keras.models.Model(\n", + "model = keras.models.Model(\n", " inputs=model_inputs,\n", " outputs=model_output)\n", - "tf.keras.utils.plot_model(model, rankdir='LR')" + "keras.utils.plot_model(model, rankdir='LR')" ] }, { "cell_type": "markdown", "metadata": { - "colab_type": "text", "id": "tvFJTs94bZXK" }, "source": [ @@ -776,10 +739,8 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { - "colab": {}, - "colab_type": "code", "id": "vMQTGbFAYgYS" }, "outputs": [], @@ -792,8 +753,8 @@ "target = training_data_df[['target']].values.astype(np.float32)\n", "\n", "model.compile(\n", - " loss=tf.keras.losses.mean_squared_error,\n", - " optimizer=tf.keras.optimizers.Adagrad(LEARNING_RATE))\n", + " loss=keras.losses.mean_squared_error,\n", + " optimizer=keras.optimizers.Adagrad(LEARNING_RATE))\n", "model.fit(\n", " features,\n", " target,\n", @@ -809,7 +770,6 @@ ], "metadata": { "colab": { - "collapsed_sections": [], "name": "keras_layers.ipynb", "private_outputs": true, "provenance": [], diff --git a/docs/tutorials/premade_models.ipynb b/docs/tutorials/premade_models.ipynb index 4a5a68a..2d72a7d 100644 --- a/docs/tutorials/premade_models.ipynb +++ b/docs/tutorials/premade_models.ipynb @@ -70,7 +70,7 @@ "source": [ "## Overview\n", "\n", - "Premade Models are quick and easy ways to build TFL `tf.keras.model` instances for typical use cases. This guide outlines the steps needed to construct a TFL Premade Model and train/test it. " + "Premade Models are quick and easy ways to build TFL `keras.Model` instances for typical use cases. This guide outlines the steps needed to construct a TFL Premade Model and train/test it." 
] }, { @@ -93,7 +93,7 @@ "outputs": [], "source": [ "#@test {\"skip\": true}\n", - "!pip install tensorflow-lattice pydot" + "!pip install --pre -U tensorflow tf-keras tensorflow-lattice pydot graphviz" ] }, { @@ -124,6 +124,22 @@ "logging.disable(sys.maxsize)" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "k-AAoRho3x5N" + }, + "outputs": [], + "source": [ + "# Use Keras 2.\n", + "version_fn = getattr(tf.keras, \"version\", None)\n", + "if version_fn and version_fn().startswith(\"3.\"):\n", + " import tf_keras as keras\n", + "else:\n", + " keras = tf.keras" + ] + }, { "cell_type": "markdown", "metadata": { @@ -164,7 +180,7 @@ }, "outputs": [], "source": [ - "heart_csv_file = tf.keras.utils.get_file(\n", + "heart_csv_file = keras.utils.get_file(\n", " 'heart.csv',\n", " 'http://storage.googleapis.com/download.tensorflow.org/data/heart.csv')\n", "heart_df = pd.read_csv(heart_csv_file)\n", @@ -433,7 +449,7 @@ "# A CalibratedLinear premade model constructed from the given model config.\n", "linear_model = tfl.premade.CalibratedLinear(linear_model_config)\n", "# Let's plot our model.\n", - "tf.keras.utils.plot_model(linear_model, show_layer_names=False, rankdir='LR')" + "keras.utils.plot_model(linear_model, show_layer_names=False, rankdir='LR')" ] }, { @@ -442,7 +458,7 @@ "id": "3MC3-AyX00-A" }, "source": [ - "Now, as with any other [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model), we compile and fit the model to our data." + "Now, as with any other [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model), we compile and fit the model to our data." ] }, { @@ -454,9 +470,9 @@ "outputs": [], "source": [ "linear_model.compile(\n", - " loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),\n", - " metrics=[tf.keras.metrics.AUC(from_logits=True)],\n", - " optimizer=tf.keras.optimizers.Adam(LEARNING_RATE))\n", + " loss=keras.losses.BinaryCrossentropy(from_logits=True),\n", + " metrics=[keras.metrics.AUC(from_logits=True)],\n", + " optimizer=keras.optimizers.Adam(LEARNING_RATE))\n", "linear_model.fit(\n", " heart_train_xs[:5],\n", " heart_train_ys,\n", @@ -522,7 +538,7 @@ "# A CalibratedLattice premade model constructed from the given model config.\n", "lattice_model = tfl.premade.CalibratedLattice(lattice_model_config)\n", "# Let's plot our model.\n", - "tf.keras.utils.plot_model(lattice_model, show_layer_names=False, rankdir='LR')" + "keras.utils.plot_model(lattice_model, show_layer_names=False, rankdir='LR')" ] }, { @@ -543,9 +559,9 @@ "outputs": [], "source": [ "lattice_model.compile(\n", - " loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),\n", - " metrics=[tf.keras.metrics.AUC(from_logits=True)],\n", - " optimizer=tf.keras.optimizers.Adam(LEARNING_RATE))\n", + " loss=keras.losses.BinaryCrossentropy(from_logits=True),\n", + " metrics=[keras.metrics.AUC(from_logits=True)],\n", + " optimizer=keras.optimizers.Adam(LEARNING_RATE))\n", "lattice_model.fit(\n", " heart_train_xs[:5],\n", " heart_train_ys,\n", @@ -602,7 +618,7 @@ "explicit_ensemble_model = tfl.premade.CalibratedLatticeEnsemble(\n", " explicit_ensemble_model_config)\n", "# Let's plot our model.\n", - "tf.keras.utils.plot_model(\n", + "keras.utils.plot_model(\n", " explicit_ensemble_model, show_layer_names=False, rankdir='LR')" ] }, @@ -624,9 +640,9 @@ "outputs": [], "source": [ "explicit_ensemble_model.compile(\n", - " loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),\n", - " metrics=[tf.keras.metrics.AUC(from_logits=True)],\n", - " 
optimizer=tf.keras.optimizers.Adam(LEARNING_RATE))\n", + " loss=keras.losses.BinaryCrossentropy(from_logits=True),\n", + " metrics=[keras.metrics.AUC(from_logits=True)],\n", + " optimizer=keras.optimizers.Adam(LEARNING_RATE))\n", "explicit_ensemble_model.fit(\n", " heart_train_xs,\n", " heart_train_ys,\n", @@ -673,7 +689,7 @@ "random_ensemble_model = tfl.premade.CalibratedLatticeEnsemble(\n", " random_ensemble_model_config)\n", "# Let's plot our model.\n", - "tf.keras.utils.plot_model(\n", + "keras.utils.plot_model(\n", " random_ensemble_model, show_layer_names=False, rankdir='LR')" ] }, @@ -695,9 +711,9 @@ "outputs": [], "source": [ "random_ensemble_model.compile(\n", - " loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),\n", - " metrics=[tf.keras.metrics.AUC(from_logits=True)],\n", - " optimizer=tf.keras.optimizers.Adam(LEARNING_RATE))\n", + " loss=keras.losses.BinaryCrossentropy(from_logits=True),\n", + " metrics=[keras.metrics.AUC(from_logits=True)],\n", + " optimizer=keras.optimizers.Adam(LEARNING_RATE))\n", "random_ensemble_model.fit(\n", " heart_train_xs,\n", " heart_train_ys,\n", @@ -755,7 +771,7 @@ "rtl_layer_ensemble_model = tfl.premade.CalibratedLatticeEnsemble(\n", " rtl_layer_ensemble_model_config)\n", "# Let's plot our model.\n", - "tf.keras.utils.plot_model(\n", + "keras.utils.plot_model(\n", " rtl_layer_ensemble_model, show_layer_names=False, rankdir='LR')" ] }, @@ -777,9 +793,9 @@ "outputs": [], "source": [ "rtl_layer_ensemble_model.compile(\n", - " loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),\n", - " metrics=[tf.keras.metrics.AUC(from_logits=True)],\n", - " optimizer=tf.keras.optimizers.Adam(LEARNING_RATE))\n", + " loss=keras.losses.BinaryCrossentropy(from_logits=True),\n", + " metrics=[keras.metrics.AUC(from_logits=True)],\n", + " optimizer=keras.optimizers.Adam(LEARNING_RATE))\n", "rtl_layer_ensemble_model.fit(\n", " heart_train_xs,\n", " heart_train_ys,\n", @@ -832,8 +848,8 @@ " prefitting_model_config)\n", "# We can compile and train our prefitting model as we like.\n", "prefitting_model.compile(\n", - " loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),\n", - " optimizer=tf.keras.optimizers.Adam(LEARNING_RATE))\n", + " loss=keras.losses.BinaryCrossentropy(from_logits=True),\n", + " optimizer=keras.optimizers.Adam(LEARNING_RATE))\n", "prefitting_model.fit(\n", " heart_train_xs,\n", " heart_train_ys,\n", @@ -849,7 +865,7 @@ "crystals_ensemble_model = tfl.premade.CalibratedLatticeEnsemble(\n", " crystals_ensemble_model_config)\n", "# Let's plot our model.\n", - "tf.keras.utils.plot_model(\n", + "keras.utils.plot_model(\n", " crystals_ensemble_model, show_layer_names=False, rankdir='LR')" ] }, @@ -871,9 +887,9 @@ "outputs": [], "source": [ "crystals_ensemble_model.compile(\n", - " loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),\n", - " metrics=[tf.keras.metrics.AUC(from_logits=True)],\n", - " optimizer=tf.keras.optimizers.Adam(LEARNING_RATE))\n", + " loss=keras.losses.BinaryCrossentropy(from_logits=True),\n", + " metrics=[keras.metrics.AUC(from_logits=True)],\n", + " optimizer=keras.optimizers.Adam(LEARNING_RATE))\n", "crystals_ensemble_model.fit(\n", " heart_train_xs,\n", " heart_train_ys,\n", @@ -887,7 +903,6 @@ ], "metadata": { "colab": { - "collapsed_sections": [], "name": "premade_models.ipynb", "private_outputs": true, "provenance": [], diff --git a/docs/tutorials/shape_constraints.ipynb b/docs/tutorials/shape_constraints.ipynb index 39bad36..6da8f87 100644 --- a/docs/tutorials/shape_constraints.ipynb +++ 
b/docs/tutorials/shape_constraints.ipynb @@ -70,7 +70,7 @@ "source": [ "## Overview\n", "\n", - "This tutorial is an overview of the constraints and regularizers provided by the TensorFlow Lattice (TFL) library. Here we use TFL canned estimators on synthetic datasets, but note that everything in this tutorial can also be done with models constructed from TFL Keras layers.\n", + "This tutorial is an overview of the constraints and regularizers provided by the TensorFlow Lattice (TFL) library. Here we use TFL premade models on synthetic datasets, but note that everything in this tutorial can also be done with models constructed from TFL Keras layers.\n", "\n", "Before proceeding, make sure your runtime has all required packages installed (as imported in the code cells below)." ] @@ -102,7 +102,7 @@ "outputs": [], "source": [ "#@test {\"skip\": true}\n", - "!pip install tensorflow-lattice tensorflow_decision_forests" + "!pip install --pre -U tensorflow tf-keras tensorflow-lattice tensorflow_decision_forests pydot graphviz" ] }, { @@ -128,7 +128,7 @@ "import tensorflow_decision_forests as tfdf\n", "\n", "from IPython.core.pylabtools import figsize\n", - "import itertools\n", + "import functools\n", "import logging\n", "import matplotlib\n", "from matplotlib import pyplot as plt\n", @@ -139,6 +139,22 @@ "logging.disable(sys.maxsize)" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "8dsfk2oNlakY" + }, + "outputs": [], + "source": [ + "# Use Keras 2.\n", + "version_fn = getattr(tf.keras, \"version\", None)\n", + "if version_fn and version_fn().startswith(\"3.\"):\n", + " import tf_keras as keras\n", + "else:\n", + " keras = tf.keras" + ] + }, { "cell_type": "markdown", "metadata": { @@ -187,10 +203,10 @@ "$$\n", "where $b(\\cdot)$ translates each `dollar_rating` to a baseline value:\n", "$$\n", - "\\mbox{D}\\to 3,\\ \\mbox{DD}\\to 2,\\ \\mbox{DDD}\\to 4,\\ \\mbox{DDDD}\\to 4.5. \n", + "\\mbox{D}\\to 3,\\ \\mbox{DD}\\to 2,\\ \\mbox{DDD}\\to 4,\\ \\mbox{DDDD}\\to 4.5.\n", "$$\n", "\n", - "This formula reflects typical user patterns. e.g. given everything else fixed, users prefer restaurants with higher star ratings, and \"\\\\$\\\\$\" restaurants will receive more clicks than \"\\\\$\", followed by \"\\\\$\\\\$\\\\$\" and \"\\\\$\\\\$\\\\$\\\\$\". " + "This formula reflects typical user patterns. e.g. given everything else fixed, users prefer restaurants with higher star ratings, and \"\\\\$\\\\$\" restaurants will receive more clicks than \"\\\\$\", followed by \"\\\\$\\\\$\\\\$\" and \"\\\\$\\\\$\\\\$\\\\$\"." 
] }, { @@ -201,6 +217,7 @@ }, "outputs": [], "source": [ + "dollar_ratings_vocab = [\"D\", \"DD\", \"DDD\", \"DDDD\"]\n", "def click_through_rate(avg_ratings, num_reviews, dollar_ratings):\n", " dollar_rating_baseline = {\"D\": 3, \"DD\": 2, \"DDD\": 4, \"DDDD\": 4.5}\n", " return 1 / (1 + np.exp(\n", @@ -234,32 +251,23 @@ " return bar\n", "\n", "\n", - "def plot_fns(fns, split_by_dollar=False, res=25):\n", + "def plot_fns(fns, res=25):\n", " \"\"\"Generates contour plots for a list of (name, fn) functions.\"\"\"\n", " num_reviews, avg_ratings = np.meshgrid(\n", " np.linspace(0, 200, num=res),\n", " np.linspace(1, 5, num=res),\n", " )\n", - " if split_by_dollar:\n", - " dollar_rating_splits = [\"D\", \"DD\", \"DDD\", \"DDDD\"]\n", - " else:\n", - " dollar_rating_splits = [None]\n", - " if len(fns) == 1:\n", - " fig, axes = plt.subplots(2, 2, sharey=True, tight_layout=False)\n", - " else:\n", - " fig, axes = plt.subplots(\n", - " len(dollar_rating_splits), len(fns), sharey=True, tight_layout=False)\n", + " figsize(13, 3.5 * len(fns))\n", + " fig, axes = plt.subplots(\n", + " len(fns), len(dollar_ratings_vocab), sharey=True, layout=\"constrained\"\n", + " )\n", " axes = axes.flatten()\n", " axes_index = 0\n", - " for dollar_rating_split in dollar_rating_splits:\n", - " for title, fn in fns:\n", - " if dollar_rating_split is not None:\n", - " dollar_ratings = np.repeat(dollar_rating_split, res**2)\n", - " values = fn(avg_ratings.flatten(), num_reviews.flatten(),\n", - " dollar_ratings)\n", - " title = \"{}: dollar_rating={}\".format(title, dollar_rating_split)\n", - " else:\n", - " values = fn(avg_ratings.flatten(), num_reviews.flatten())\n", + " for fn_name, fn in fns:\n", + " for dollar_rating_split in dollar_ratings_vocab:\n", + " dollar_ratings = np.repeat(dollar_rating_split, res**2)\n", + " values = fn(avg_ratings.flatten(), num_reviews.flatten(), dollar_ratings)\n", + " title = \"{}: dollar_rating={}\".format(fn_name, dollar_rating_split)\n", " subplot = axes[axes_index]\n", " axes_index += 1\n", " subplot.contourf(\n", @@ -267,17 +275,24 @@ " num_reviews,\n", " np.reshape(values, (res, res)),\n", " vmin=0,\n", - " vmax=1)\n", + " vmax=1,\n", + " )\n", " subplot.title.set_text(title)\n", " subplot.set(xlabel=\"Average Rating\")\n", " subplot.set(ylabel=\"Number of Reviews\")\n", " subplot.set(xlim=(1, 5))\n", "\n", - " _ = fig.colorbar(color_bar(), cax=fig.add_axes([0.95, 0.2, 0.01, 0.6]))\n", + " if len(fns) \u003c= 2:\n", + " cax = fig.add_axes([\n", + " axes[-1].get_position().x1 + 0.11,\n", + " axes[-1].get_position().y0,\n", + " 0.02,\n", + " 0.8,\n", + " ])\n", + " _ = fig.colorbar(color_bar(), cax=cax)\n", "\n", "\n", - "figsize(11, 11)\n", - "plot_fns([(\"CTR\", click_through_rate)], split_by_dollar=True)" + "plot_fns([(\"CTR\", click_through_rate)])" ] }, { @@ -309,7 +324,7 @@ "def sample_restaurants(n):\n", " avg_ratings = np.random.uniform(1.0, 5.0, n)\n", " num_reviews = np.round(np.exp(np.random.uniform(0.0, np.log(200), n)))\n", - " dollar_ratings = np.random.choice([\"D\", \"DD\", \"DDD\", \"DDDD\"], n)\n", + " dollar_ratings = np.random.choice(dollar_ratings_vocab, n)\n", " ctr_labels = click_through_rate(avg_ratings, num_reviews, dollar_ratings)\n", " return avg_ratings, num_reviews, dollar_ratings, ctr_labels\n", "\n", @@ -318,7 +333,8 @@ "avg_ratings, num_reviews, dollar_ratings, ctr_labels = sample_restaurants(2000)\n", "\n", "figsize(5, 5)\n", - "fig, axs = plt.subplots(1, 1, sharey=False, tight_layout=False)\n", + "fig, axs = plt.subplots(1, 1, sharey=False, 
layout=\"constrained\")\n", + "\n", "for rating, marker in [(\"D\", \"o\"), (\"DD\", \"^\"), (\"DDD\", \"+\"), (\"DDDD\", \"x\")]:\n", " plt.scatter(\n", " x=avg_ratings[np.where(dollar_ratings == rating)],\n", @@ -333,7 +349,7 @@ "plt.legend()\n", "plt.xlim((1, 5))\n", "plt.title(\"Distribution of restaurants\")\n", - "_ = fig.colorbar(color_bar(), cax=fig.add_axes([0.95, 0.2, 0.01, 0.6]))" + "_ = fig.colorbar(color_bar(), cax=fig.add_axes([1.05, 0.1, 0.05, 0.85]))" ] }, { @@ -342,7 +358,7 @@ "id": "tRetsfLv_JSR" }, "source": [ - "Let's produce the training, validation and testing datasets. When a restaurant is viewed in the search results, we can record user's engagement (click or no click) as a sample point. \n", + "Let's produce the training, validation and testing datasets. When a restaurant is viewed in the search results, we can record user's engagement (click or no click) as a sample point.\n", "\n", "In practice, users often do not go through all search results. This means that users will likely only see restaurants already considered \"good\" by the current ranking model in use. As a result, \"good\" restaurants are more frequently impressed and over-represented in the training datasets. When using more features, the training dataset can have large gaps in \"bad\" parts of the feature space.\n", "\n", @@ -372,7 +388,7 @@ " \"avg_rating\": np.repeat(avg_ratings, num_views),\n", " \"num_reviews\": np.repeat(num_reviews, num_views),\n", " \"dollar_rating\": np.repeat(dollar_ratings, num_views),\n", - " \"clicked\": np.random.binomial(n=1, p=np.repeat(ctr_labels, num_views))\n", + " \"clicked\": np.random.binomial(n=1, p=np.repeat(ctr_labels, num_views)),\n", " })\n", "\n", "\n", @@ -382,11 +398,30 @@ "data_val = sample_dataset(500, testing_set=False)\n", "data_test = sample_dataset(500, testing_set=True)\n", "\n", + "ds_train = tfdf.keras.pd_dataframe_to_tf_dataset(\n", + " data_train, label=\"clicked\", batch_size=BATCH_SIZE\n", + ")\n", + "ds_val = tfdf.keras.pd_dataframe_to_tf_dataset(\n", + " data_val, label=\"clicked\", batch_size=BATCH_SIZE\n", + ")\n", + "ds_test = tfdf.keras.pd_dataframe_to_tf_dataset(\n", + " data_test, label=\"clicked\", batch_size=BATCH_SIZE\n", + ")\n", + "\n", + "# feature_analysis_data is used to find quantiles of featurse.\n", + "feature_analysis_data = data_train.copy()\n", + "feature_analysis_data[\"dollar_rating\"] = feature_analysis_data[\n", + " \"dollar_rating\"\n", + "].map({v: i for i, v in enumerate(dollar_ratings_vocab)})\n", + "feature_analysis_data = dict(feature_analysis_data)\n", + "\n", "# Plotting dataset densities.\n", "figsize(12, 5)\n", "fig, axs = plt.subplots(1, 2, sharey=False, tight_layout=False)\n", - "for ax, data, title in [(axs[0], data_train, \"training\"),\n", - " (axs[1], data_test, \"testing\")]:\n", + "for ax, data, title in [\n", + " (axs[0], data_train, \"training\"),\n", + " (axs[1], data_test, \"testing\"),\n", + "]:\n", " _, _, _, density = ax.hist2d(\n", " x=data[\"avg_rating\"],\n", " y=data[\"num_reviews\"],\n", @@ -401,57 +436,6 @@ " _ = fig.colorbar(density, ax=ax)" ] }, - { - "cell_type": "markdown", - "metadata": { - "id": "4fVyLgpCT1nW" - }, - "source": [ - "Defining input_fns used for training and evaluation:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "DYzRTRR2GKoS" - }, - "outputs": [], - "source": [ - "train_input_fn = tf.compat.v1.estimator.inputs.pandas_input_fn(\n", - " x=data_train,\n", - " y=data_train[\"clicked\"],\n", - " batch_size=BATCH_SIZE,\n", - " 
num_epochs=NUM_EPOCHS,\n", - " shuffle=False,\n", - ")\n", - "\n", - "# feature_analysis_input_fn is used for TF Lattice estimators.\n", - "feature_analysis_input_fn = tf.compat.v1.estimator.inputs.pandas_input_fn(\n", - " x=data_train,\n", - " y=data_train[\"clicked\"],\n", - " batch_size=BATCH_SIZE,\n", - " num_epochs=1,\n", - " shuffle=False,\n", - ")\n", - "\n", - "val_input_fn = tf.compat.v1.estimator.inputs.pandas_input_fn(\n", - " x=data_val,\n", - " y=data_val[\"clicked\"],\n", - " batch_size=BATCH_SIZE,\n", - " num_epochs=1,\n", - " shuffle=False,\n", - ")\n", - "\n", - "test_input_fn = tf.compat.v1.estimator.inputs.pandas_input_fn(\n", - " x=data_test,\n", - " y=data_test[\"clicked\"],\n", - " batch_size=BATCH_SIZE,\n", - " num_epochs=1,\n", - " shuffle=False,\n", - ")" - ] - }, { "cell_type": "markdown", "metadata": { @@ -467,71 +451,46 @@ "id": "ZklNowexE3wB" }, "source": [ - "Let's start off with only two features: `avg_rating` and `num_reviews`.\n", - "\n", - "We create a few auxillary functions for plotting and calculating validation and test metrics." + "We first create a few auxillary functions for plotting and calculating validation and test metrics." ] }, { "cell_type": "code", "execution_count": null, "metadata": { - "id": "SX6rARJWURWl" + "id": "3BqGqScQzlYf" }, "outputs": [], "source": [ - "def analyze_two_d_estimator(estimator, name):\n", - " # Extract validation metrics.\n", - " if isinstance(estimator, tf.estimator.Estimator):\n", - " metric = estimator.evaluate(input_fn=val_input_fn)\n", - " else:\n", - " metric = estimator.evaluate(\n", - " tfdf.keras.pd_dataframe_to_tf_dataset(data_val, label=\"clicked\"),\n", - " return_dict=True,\n", - " verbose=0)\n", - " print(\"Validation AUC: {}\".format(metric[\"auc\"]))\n", + "def pred_fn(model, from_logits, avg_ratings, num_reviews, dollar_rating):\n", + " preds = model.predict(\n", + " tf.data.Dataset.from_tensor_slices({\n", + " \"avg_rating\": avg_ratings,\n", + " \"num_reviews\": num_reviews,\n", + " \"dollar_rating\": dollar_rating,\n", + " }).batch(1),\n", + " verbose=0,\n", + " )\n", + " if from_logits:\n", + " preds = tf.math.sigmoid(preds)\n", + " return preds\n", "\n", - " if isinstance(estimator, tf.estimator.Estimator):\n", - " metric = estimator.evaluate(input_fn=test_input_fn)\n", - " else:\n", - " metric = estimator.evaluate(\n", - " tfdf.keras.pd_dataframe_to_tf_dataset(data_test, label=\"clicked\"),\n", - " return_dict=True,\n", - " verbose=0)\n", - " print(\"Testing AUC: {}\".format(metric[\"auc\"]))\n", "\n", - " def two_d_pred(avg_ratings, num_reviews):\n", - " if isinstance(estimator, tf.estimator.Estimator):\n", - " results = estimator.predict(\n", - " tf.compat.v1.estimator.inputs.pandas_input_fn(\n", - " x=pd.DataFrame({\n", - " \"avg_rating\": avg_ratings,\n", - " \"num_reviews\": num_reviews,\n", - " }),\n", - " shuffle=False,\n", - " ))\n", - " return [x[\"logistic\"][0] for x in results]\n", - " else:\n", - " return estimator.predict(\n", - " tfdf.keras.pd_dataframe_to_tf_dataset(\n", - " pd.DataFrame({\n", - " \"avg_rating\": avg_ratings,\n", - " \"num_reviews\": num_reviews,\n", - " })),\n", - " verbose=0)\n", + "def analyze_model(models, from_logits=False, print_metrics=True):\n", + " pred_fns = []\n", + " for model, name in models:\n", + " if print_metrics:\n", + " metric = model.evaluate(ds_val, return_dict=True, verbose=0)\n", + " print(\"Validation AUC: {}\".format(metric[\"auc\"]))\n", + " metric = model.evaluate(ds_test, return_dict=True, verbose=0)\n", + " print(\"Testing AUC: 
{}\".format(metric[\"auc\"]))\n", "\n", - " def two_d_click_through_rate(avg_ratings, num_reviews):\n", - " return np.mean([\n", - " click_through_rate(avg_ratings, num_reviews,\n", - " np.repeat(d, len(avg_ratings)))\n", - " for d in [\"D\", \"DD\", \"DDD\", \"DDDD\"]\n", - " ],\n", - " axis=0)\n", + " pred_fns.append(\n", + " (\"{} pCTR\".format(name), functools.partial(pred_fn, model, from_logits))\n", + " )\n", "\n", - " figsize(11, 5)\n", - " plot_fns([(\"{} Estimated CTR\".format(name), two_d_pred),\n", - " (\"CTR\", two_d_click_through_rate)],\n", - " split_by_dollar=False)" + " pred_fns.append((\"CTR\", click_through_rate))\n", + " plot_fns(pred_fns)" ] }, { @@ -554,7 +513,8 @@ "gbt_model = tfdf.keras.GradientBoostedTreesModel(\n", " features=[\n", " tfdf.keras.FeatureUsage(name=\"num_reviews\"),\n", - " tfdf.keras.FeatureUsage(name=\"avg_rating\")\n", + " tfdf.keras.FeatureUsage(name=\"avg_rating\"),\n", + " tfdf.keras.FeatureUsage(name=\"dollar_rating\"),\n", " ],\n", " exclude_non_specified_features=True,\n", " num_threads=1,\n", @@ -565,13 +525,9 @@ " random_seed=42,\n", " temp_directory=tempfile.mkdtemp(),\n", ")\n", - "gbt_model.compile(metrics=[tf.keras.metrics.AUC(name=\"auc\")])\n", - "gbt_model.fit(\n", - " tfdf.keras.pd_dataframe_to_tf_dataset(data_train, label=\"clicked\"),\n", - " validation_data=tfdf.keras.pd_dataframe_to_tf_dataset(\n", - " data_val, label=\"clicked\"),\n", - " verbose=0)\n", - "analyze_two_d_estimator(gbt_model, \"GBT\")" + "gbt_model.compile(metrics=[keras.metrics.AUC(name=\"auc\")])\n", + "gbt_model.fit(ds_train, validation_data=ds_val, verbose=0)\n", + "analyze_model([(gbt_model, \"GBT\")])" ] }, { @@ -600,30 +556,61 @@ "id": "_s2aT3x0E_tF" }, "source": [ - "We can repeat the same steps with a DNN classifier. We can observe a similar pattern: not having enough sample points with small number of reviews results in nonsensical extrapolation. Note that even though the validation metric is better than the tree solution, the testing metric is much worse." + "We can repeat the same steps with a DNN classifier. We can observe a similar pattern: not having enough sample points with small number of reviews results in nonsensical extrapolation." 
] }, { "cell_type": "code", "execution_count": null, "metadata": { - "id": "gFUeG6kLDNhO" + "id": "WKZzCY-UkZX-" }, "outputs": [], "source": [ - "feature_columns = [\n", - " tf.feature_column.numeric_column(\"num_reviews\"),\n", - " tf.feature_column.numeric_column(\"avg_rating\"),\n", - "]\n", - "dnn_estimator = tf.estimator.DNNClassifier(\n", - " feature_columns=feature_columns,\n", - " # Hyper-params optimized on validation set.\n", - " hidden_units=[16, 8, 8],\n", - " optimizer=tf.keras.optimizers.legacy.Adam(learning_rate=LEARNING_RATE),\n", - " config=tf.estimator.RunConfig(tf_random_seed=42),\n", + "keras.utils.set_random_seed(42)\n", + "inputs = {\n", + " \"num_reviews\": keras.Input(shape=(1,), dtype=tf.float32),\n", + " \"avg_rating\": keras.Input(shape=(1), dtype=tf.float32),\n", + " \"dollar_rating\": keras.Input(shape=(1), dtype=tf.string),\n", + "}\n", + "inputs_flat = keras.layers.Concatenate()([\n", + " inputs[\"num_reviews\"],\n", + " inputs[\"avg_rating\"],\n", + " keras.layers.StringLookup(\n", + " vocabulary=dollar_ratings_vocab,\n", + " num_oov_indices=0,\n", + " output_mode=\"one_hot\",\n", + " )(inputs[\"dollar_rating\"]),\n", + "])\n", + "dense_layers = keras.Sequential(\n", + " [\n", + " keras.layers.Dense(16, activation=\"relu\"),\n", + " keras.layers.Dense(16, activation=\"relu\"),\n", + " keras.layers.Dense(1, activation=None),\n", + " ],\n", + " name=\"dense_layers\",\n", ")\n", - "dnn_estimator.train(input_fn=train_input_fn)\n", - "analyze_two_d_estimator(dnn_estimator, \"DNN\")" + "dnn_model = keras.Model(inputs=inputs, outputs=dense_layers(inputs_flat))\n", + "keras.utils.plot_model(\n", + " dnn_model, expand_nested=True, show_layer_names=False, rankdir=\"LR\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "6zFqu6wf1I30" + }, + "outputs": [], + "source": [ + "dnn_model.compile(\n", + " loss=keras.losses.BinaryCrossentropy(from_logits=True),\n", + " metrics=[keras.metrics.AUC(from_logits=True, name=\"auc\")],\n", + " optimizer=keras.optimizers.Adam(LEARNING_RATE),\n", + ")\n", + "dnn_model.fit(ds_train, epochs=200, verbose=0)\n", + "analyze_model([(dnn_model, \"DNN\")], from_logits=True)" ] }, { @@ -641,16 +628,15 @@ "id": "3ExyethCFBrP" }, "source": [ - "TensorFlow Lattice (TFL) is focused on enforcing shape constraints to safeguard model behavior beyond the training data. These shape constraints are applied to TFL Keras layers. Their details can be found in [our JMLR paper](http://jmlr.org/papers/volume17/15-243/15-243.pdf). \n", + "TensorFlow Lattice (TFL) is focused on enforcing shape constraints to safeguard model behavior beyond the training data. These shape constraints are applied to TFL Keras layers. 
Their details can be found in [our JMLR paper](http://jmlr.org/papers/volume17/15-243/15-243.pdf).\n", "\n", - "In this tutorial we use TF canned estimators to cover various shape constraints, but note that all these steps can be done with models created from TFL Keras layers.\n", + "In this tutorial we use TFL premade models to cover various shape constraints, but note that all these steps can be done with models created from TFL Keras layers.\n", "\n", - "As with any other TensorFlow estimator, TFL canned estimators use [feature columns](https://www.tensorflow.org/api_docs/python/tf/feature_column) to define the input format and use a training input_fn to pass in the data.\n", - "Using TFL canned estimators also requires:\n", + "Using TFL premade models also requires:\n", "- a *model config*: defining the model architecture and per-feature shape constraints and regularizers.\n", - "- a *feature analysis input_fn*: a TF input_fn passing data for TFL initialization.\n", + "- a *feature analysis dataset*: a dataset used for TFL initialization (feature quantile calculation).\n", "\n", - "For a more thorough description, please refer to the canned estimators tutorial or the API docs." + "For a more thorough description, please refer to the premade models tutorial or the API docs." ] }, { "cell_type": "markdown", "metadata": { @@ -660,7 +646,7 @@ }, "source": [ "### Monotonicity\n", - "We first address the monotonicity concerns by adding monotonicity shape constraints to both features.\n", + "We first address the monotonicity concerns by adding monotonicity shape constraints to the continuous features. We use a calibrated lattice model with added output calibration: each feature is calibrated using categorical or piecewise-linear calibrators, then fed into a lattice model, followed by an output piecewise-linear calibrator.\n", "\n", "To instruct TFL to enforce shape constraints, we specify the constraints in the *feature configs*. 
The following code shows how we can require the output to be monotonically increasing with respect to both `num_reviews` and `avg_rating` by setting `monotonicity=\"increasing\"`.\n" ] @@ -669,38 +655,101 @@ "cell_type": "code", "execution_count": null, "metadata": { - "id": "FCm1lOjmwur_" + "id": "hFlkZs5RgFcP" }, "outputs": [], "source": [ - "feature_columns = [\n", - " tf.feature_column.numeric_column(\"num_reviews\"),\n", - " tf.feature_column.numeric_column(\"avg_rating\"),\n", - "]\n", "model_config = tfl.configs.CalibratedLatticeConfig(\n", " feature_configs=[\n", " tfl.configs.FeatureConfig(\n", " name=\"num_reviews\",\n", - " lattice_size=2,\n", + " lattice_size=3,\n", " monotonicity=\"increasing\",\n", - " pwl_calibration_num_keypoints=20,\n", + " pwl_calibration_num_keypoints=32,\n", " ),\n", " tfl.configs.FeatureConfig(\n", " name=\"avg_rating\",\n", - " lattice_size=2,\n", + " lattice_size=3,\n", " monotonicity=\"increasing\",\n", - " pwl_calibration_num_keypoints=20,\n", - " )\n", - " ])\n", - "tfl_estimator = tfl.estimators.CannedClassifier(\n", - " feature_columns=feature_columns,\n", - " model_config=model_config,\n", - " feature_analysis_input_fn=feature_analysis_input_fn,\n", - " optimizer=tf.keras.optimizers.legacy.Adam(learning_rate=LEARNING_RATE),\n", - " config=tf.estimator.RunConfig(tf_random_seed=42),\n", + " pwl_calibration_num_keypoints=32,\n", + " ),\n", + " tfl.configs.FeatureConfig(\n", + " name=\"dollar_rating\",\n", + " lattice_size=3,\n", + " pwl_calibration_num_keypoints=4,\n", + " vocabulary_list=dollar_ratings_vocab,\n", + " num_buckets=len(dollar_ratings_vocab),\n", + " ),\n", + " ],\n", + " output_calibration=True,\n", + " output_initialization=np.linspace(-2, 2, num=5),\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "GOlzuyQsGre5" + }, + "source": [ + "We now use the `feature_analysis_data` to find and set the quantile values for the input features. These values can be pre-calculated and set explicitly in the feature config depending on the training pipeline." 
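If the quantiles are already known (for example, computed offline elsewhere in the training pipeline), a hedged sketch of setting them explicitly, instead of deriving them from `feature_analysis_data`, might look like the following; the quantile grid below is illustrative rather than the values used in this tutorial.

```python
# Illustrative sketch only: pre-computed quantiles can be assigned directly to
# the feature config instead of calling compute_feature_keypoints.
precomputed_keypoints = np.quantile(
    data_train["num_reviews"], np.linspace(0.0, 1.0, num=32)
)
# feature_configs[0] is the "num_reviews" config defined above.
model_config.feature_configs[0].pwl_calibration_input_keypoints = precomputed_keypoints
```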
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "f-bTmfBnghuX" + }, + "outputs": [], + "source": [ + "feature_analysis_data = data_train.copy()\n", + "feature_analysis_data[\"dollar_rating\"] = feature_analysis_data[\n", + " \"dollar_rating\"\n", + "].map({v: i for i, v in enumerate(dollar_ratings_vocab)})\n", + "feature_analysis_data = dict(feature_analysis_data)\n", + "\n", + "feature_keypoints = tfl.premade_lib.compute_feature_keypoints(\n", + " feature_configs=model_config.feature_configs, features=feature_analysis_data\n", ")\n", - "tfl_estimator.train(input_fn=train_input_fn)\n", - "analyze_two_d_estimator(tfl_estimator, \"TF Lattice\")" + "tfl.premade_lib.set_feature_keypoints(\n", + " feature_configs=model_config.feature_configs,\n", + " feature_keypoints=feature_keypoints,\n", + " add_missing_feature_configs=False,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "FCm1lOjmwur_" + }, + "outputs": [], + "source": [ + "keras.utils.set_random_seed(42)\n", + "inputs = {\n", + " \"num_reviews\": keras.Input(shape=(1,), dtype=tf.float32),\n", + " \"avg_rating\": keras.Input(shape=(1), dtype=tf.float32),\n", + " \"dollar_rating\": keras.Input(shape=(1), dtype=tf.string),\n", + "}\n", + "ordered_inputs = [\n", + " inputs[\"num_reviews\"],\n", + " inputs[\"avg_rating\"],\n", + " keras.layers.StringLookup(\n", + " vocabulary=dollar_ratings_vocab,\n", + " num_oov_indices=0,\n", + " output_mode=\"int\",\n", + " )(inputs[\"dollar_rating\"]),\n", + "]\n", + "outputs = tfl.premade.CalibratedLattice(\n", + " model_config=model_config, name=\"CalibratedLattice\"\n", + ")(ordered_inputs)\n", + "tfl_model_0 = keras.Model(inputs=inputs, outputs=outputs)\n", + "\n", + "keras.utils.plot_model(\n", + " tfl_model_0, expand_nested=True, show_layer_names=False, rankdir=\"LR\"\n", + ")" ] }, { @@ -709,30 +758,24 @@ "id": "ubNRBCWW5wQ9" }, "source": [ - "Using a `CalibratedLatticeConfig` creates a canned classifier that first applies a *calibrator* to each input (a piece-wise linear function for numeric features) followed by a *lattice* layer to non-linearly fuse the calibrated features. We can use `tfl.visualization` to visualize the model. In particular, the following plot shows the two trained calibrators included in the canned classifier. \n" + "Using a `CalibratedLatticeConfig` creates a premade classifier that first applies a *calibrator* to each input (a piece-wise linear function for numeric features) followed by a *lattice* layer to non-linearly fuse the calibrated features. 
We have also enabled output piece-wise linear calibration.\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { - "cellView": "both", - "id": "C0py9Q6OBRBE" + "id": "Am1OwtzzU7no" }, "outputs": [], "source": [ - "def save_and_visualize_lattice(tfl_estimator):\n", - " saved_model_path = tfl_estimator.export_saved_model(\n", - " \"/tmp/TensorFlow_Lattice_101/\",\n", - " tf.estimator.export.build_parsing_serving_input_receiver_fn(\n", - " feature_spec=tf.feature_column.make_parse_example_spec(\n", - " feature_columns)))\n", - " model_graph = tfl.estimators.get_model_graph(saved_model_path)\n", - " figsize(8, 8)\n", - " tfl.visualization.draw_model_graph(model_graph)\n", - " return model_graph\n", - "\n", - "_ = save_and_visualize_lattice(tfl_estimator)" + "tfl_model_0.compile(\n", + " loss=keras.losses.BinaryCrossentropy(from_logits=True),\n", + " metrics=[keras.metrics.AUC(from_logits=True, name=\"auc\")],\n", + " optimizer=keras.optimizers.Adam(LEARNING_RATE),\n", + ")\n", + "tfl_model_0.fit(ds_train, epochs=100, verbose=0)\n", + "analyze_model([(tfl_model_0, \"TFL0\")], from_logits=True)" ] }, { @@ -747,63 +790,85 @@ { "cell_type": "markdown", "metadata": { - "id": "RfniRZCHIvfK" + "id": "pSUd6aFlpYz4" }, "source": [ - "### Diminishing Returns\n", - "[Diminishing returns](https://en.wikipedia.org/wiki/Diminishing_returns) means that the marginal gain of increasing a certain feature value will decrease as we increase the value. In our case we expect that the `num_reviews` feature follows this pattern, so we can configure its calibrator accordingly. Notice that we can decompose diminishing returns into two sufficient conditions:\n", - "\n", - "- the calibrator is monotonicially increasing, and\n", - "- the calibrator is concave.\n" + "### Partial Monotonicity for Categorical Calibration\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "CnPiqf4rq6kJ" + }, + "source": [ + "To use constraints on the third feature, `dollar_rating`, we should recall that categorical features require a slightly different treatment in TFL. Here we enforce the partial monotonicity constraint that outputs for \"DD\" restaurants should be larger than \"D\" restaurants when all other inputs are fixed. This is done using the `monotonicity` setting in the feature config. We also need to use `tfl.premade_lib.set_categorical_monotonicities` to convert the constrains specified in string values into the numerical format understood by the library." 
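As a rough illustration of that conversion (an assumption based on the vocabulary order above, applied to a `model_config` whose `dollar_rating` feature config sets `monotonicity=[("D", "DD")]` as in the next cell), the string pair is expected to be mapped to vocabulary indices:

```python
# Hedged sketch: with vocabulary_list=["D", "DD", "DDD", "DDDD"], a constraint
# written as [("D", "DD")] is expected to become the index pair [(0, 1)].
tfl.premade_lib.set_categorical_monotonicities(model_config.feature_configs)
print(model_config.feature_configs[-1].monotonicity)  # expected: [(0, 1)]
```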
] }, { "cell_type": "code", "execution_count": null, "metadata": { - "id": "XQrM9BskY-wx" + "id": "FH2ItfsTsE3S" }, "outputs": [], "source": [ - "feature_columns = [\n", - " tf.feature_column.numeric_column(\"num_reviews\"),\n", - " tf.feature_column.numeric_column(\"avg_rating\"),\n", - "]\n", + "keras.utils.set_random_seed(42)\n", "model_config = tfl.configs.CalibratedLatticeConfig(\n", " feature_configs=[\n", " tfl.configs.FeatureConfig(\n", " name=\"num_reviews\",\n", - " lattice_size=2,\n", + " lattice_size=3,\n", " monotonicity=\"increasing\",\n", " pwl_calibration_convexity=\"concave\",\n", - " pwl_calibration_num_keypoints=20,\n", + " pwl_calibration_num_keypoints=32,\n", " ),\n", " tfl.configs.FeatureConfig(\n", " name=\"avg_rating\",\n", - " lattice_size=2,\n", + " lattice_size=3,\n", " monotonicity=\"increasing\",\n", - " pwl_calibration_num_keypoints=20,\n", - " )\n", - " ])\n", - "tfl_estimator = tfl.estimators.CannedClassifier(\n", - " feature_columns=feature_columns,\n", - " model_config=model_config,\n", - " feature_analysis_input_fn=feature_analysis_input_fn,\n", - " optimizer=tf.keras.optimizers.legacy.Adam(learning_rate=LEARNING_RATE),\n", - " config=tf.estimator.RunConfig(tf_random_seed=42),\n", + " pwl_calibration_num_keypoints=32,\n", + " ),\n", + " tfl.configs.FeatureConfig(\n", + " name=\"dollar_rating\",\n", + " lattice_size=3,\n", + " pwl_calibration_num_keypoints=4,\n", + " vocabulary_list=dollar_ratings_vocab,\n", + " num_buckets=len(dollar_ratings_vocab),\n", + " monotonicity=[(\"D\", \"DD\")],\n", + " ),\n", + " ],\n", + " output_calibration=True,\n", + " output_initialization=np.linspace(-2, 2, num=5),\n", ")\n", - "tfl_estimator.train(input_fn=train_input_fn)\n", - "analyze_two_d_estimator(tfl_estimator, \"TF Lattice\")\n", - "_ = save_and_visualize_lattice(tfl_estimator)" + "\n", + "tfl.premade_lib.set_feature_keypoints(\n", + " feature_configs=model_config.feature_configs,\n", + " feature_keypoints=feature_keypoints,\n", + " add_missing_feature_configs=False,\n", + ")\n", + "tfl.premade_lib.set_categorical_monotonicities(model_config.feature_configs)\n", + "\n", + "outputs = tfl.premade.CalibratedLattice(\n", + " model_config=model_config, name=\"CalibratedLattice\"\n", + ")(ordered_inputs)\n", + "tfl_model_1 = keras.Model(inputs=inputs, outputs=outputs)\n", + "tfl_model_1.compile(\n", + " loss=keras.losses.BinaryCrossentropy(from_logits=True),\n", + " metrics=[keras.metrics.AUC(from_logits=True, name=\"auc\")],\n", + " optimizer=keras.optimizers.Adam(LEARNING_RATE),\n", + ")\n", + "tfl_model_1.fit(ds_train, epochs=100, verbose=0)\n", + "analyze_model([(tfl_model_1, \"TFL1\")], from_logits=True)" ] }, { "cell_type": "markdown", "metadata": { - "id": "LSmzHkPUo9u5" + "id": "gdIzhYL79_Pp" }, "source": [ - "Notice how the testing metric improves by adding the concavity constraint. The prediction plot also better resembles the ground truth." + "Here we also plot the predicted CTR of this model conditioned on `dollar_rating`. Notice that all the constraints we required are fulfilled in each of the slices." ] }, { @@ -813,7 +878,7 @@ }, "source": [ "### 2D Shape Constraint: Trust\n", - "A 5-star rating for a restaurant with only one or two reviews is likely an unreliable rating (the restaurant might not actually be good), whereas a 4-star rating for a restaurant with hundreds of reviews is much more reliable (the restaurant is likely good in this case). We can see that the number of reviews of a restaurant affects how much trust we place in its average rating. 
\n", + "A 5-star rating for a restaurant with only one or two reviews is likely an unreliable rating (the restaurant might not actually be good), whereas a 4-star rating for a restaurant with hundreds of reviews is much more reliable (the restaurant is likely good in this case). We can see that the number of reviews of a restaurant affects how much trust we place in its average rating.\n", "\n", "We can exercise TFL trust constraints to inform the model that the larger (or smaller) value of one feature indicates more reliance or trust of another feature. This is done by setting `reflects_trust_in` configuration in the feature config." ] @@ -826,41 +891,58 @@ }, "outputs": [], "source": [ - "feature_columns = [\n", - " tf.feature_column.numeric_column(\"num_reviews\"),\n", - " tf.feature_column.numeric_column(\"avg_rating\"),\n", - "]\n", + "keras.utils.set_random_seed(42)\n", "model_config = tfl.configs.CalibratedLatticeConfig(\n", " feature_configs=[\n", " tfl.configs.FeatureConfig(\n", " name=\"num_reviews\",\n", - " lattice_size=2,\n", + " lattice_size=3,\n", " monotonicity=\"increasing\",\n", - " pwl_calibration_convexity=\"concave\",\n", - " pwl_calibration_num_keypoints=20,\n", + " pwl_calibration_num_keypoints=32,\n", " # Larger num_reviews indicating more trust in avg_rating.\n", " reflects_trust_in=[\n", " tfl.configs.TrustConfig(\n", - " feature_name=\"avg_rating\", trust_type=\"edgeworth\"),\n", + " feature_name=\"avg_rating\", trust_type=\"edgeworth\"\n", + " ),\n", " ],\n", " ),\n", " tfl.configs.FeatureConfig(\n", " name=\"avg_rating\",\n", - " lattice_size=2,\n", + " lattice_size=3,\n", " monotonicity=\"increasing\",\n", - " pwl_calibration_num_keypoints=20,\n", - " )\n", - " ])\n", - "tfl_estimator = tfl.estimators.CannedClassifier(\n", - " feature_columns=feature_columns,\n", - " model_config=model_config,\n", - " feature_analysis_input_fn=feature_analysis_input_fn,\n", - " optimizer=tf.keras.optimizers.legacy.Adam(learning_rate=LEARNING_RATE),\n", - " config=tf.estimator.RunConfig(tf_random_seed=42),\n", + " pwl_calibration_num_keypoints=32,\n", + " ),\n", + " tfl.configs.FeatureConfig(\n", + " name=\"dollar_rating\",\n", + " lattice_size=3,\n", + " pwl_calibration_num_keypoints=4,\n", + " vocabulary_list=dollar_ratings_vocab,\n", + " num_buckets=len(dollar_ratings_vocab),\n", + " monotonicity=[(\"D\", \"DD\")],\n", + " ),\n", + " ],\n", + " output_calibration=True,\n", + " output_initialization=np.linspace(-2, 2, num=5),\n", + ")\n", + "\n", + "tfl.premade_lib.set_feature_keypoints(\n", + " feature_configs=model_config.feature_configs,\n", + " feature_keypoints=feature_keypoints,\n", + " add_missing_feature_configs=False,\n", + ")\n", + "tfl.premade_lib.set_categorical_monotonicities(model_config.feature_configs)\n", + "\n", + "outputs = tfl.premade.CalibratedLattice(\n", + " model_config=model_config, name=\"CalibratedLattice\"\n", + ")(ordered_inputs)\n", + "tfl_model_2 = keras.Model(inputs=inputs, outputs=outputs)\n", + "tfl_model_2.compile(\n", + " loss=keras.losses.BinaryCrossentropy(from_logits=True),\n", + " metrics=[keras.metrics.AUC(from_logits=True, name=\"auc\")],\n", + " optimizer=keras.optimizers.Adam(LEARNING_RATE),\n", ")\n", - "tfl_estimator.train(input_fn=train_input_fn)\n", - "analyze_two_d_estimator(tfl_estimator, \"TF Lattice\")\n", - "model_graph = save_and_visualize_lattice(tfl_estimator)" + "tfl_model_2.fit(ds_train, epochs=100, verbose=0)\n", + "analyze_model([(tfl_model_2, \"TFL2\")], from_logits=True)" ] }, { @@ -881,321 +963,231 @@ }, 
"outputs": [], "source": [ - "lat_mesh_n = 12\n", - "lat_mesh_x, lat_mesh_y = tfl.test_utils.two_dim_mesh_grid(\n", - " lat_mesh_n**2, 0, 0, 1, 1)\n", - "lat_mesh_fn = tfl.test_utils.get_hypercube_interpolation_fn(\n", - " model_graph.output_node.weights.flatten())\n", - "lat_mesh_z = [\n", - " lat_mesh_fn([lat_mesh_x.flatten()[i],\n", - " lat_mesh_y.flatten()[i]]) for i in range(lat_mesh_n**2)\n", - "]\n", - "trust_plt = tfl.visualization.plot_outputs(\n", - " (lat_mesh_x, lat_mesh_y),\n", - " {\"Lattice Lookup\": lat_mesh_z},\n", - " figsize=(6, 6),\n", + "lattice_params = tfl_model_2.layers[-1].layers[-2].weights[0].numpy()\n", + "lat_mesh_x, lat_mesh_y = np.meshgrid(\n", + " np.linspace(0, 1, num=3),\n", + " np.linspace(0, 1, num=3),\n", ")\n", - "trust_plt.title(\"Trust\")\n", - "trust_plt.xlabel(\"Calibrated avg_rating\")\n", - "trust_plt.ylabel(\"Calibrated num_reviews\")\n", - "trust_plt.show()" + "lat_mesh_z = np.reshape(np.asarray(lattice_params[0::3]), (3, 3))\n", + "\n", + "figure = plt.figure(figsize=(6, 6))\n", + "axes = figure.add_subplot(projection=\"3d\")\n", + "axes.plot_wireframe(lat_mesh_x, lat_mesh_y, lat_mesh_z, color=\"dodgerblue\")\n", + "plt.legend([\"Lattice Lookup\"])\n", + "plt.title(\"Trust\")\n", + "plt.xlabel(\"Calibrated avg_rating\")\n", + "plt.ylabel(\"Calibrated num_reviews\")\n", + "plt.show()" ] }, { "cell_type": "markdown", "metadata": { - "id": "SKe3UHX6pUjw" + "id": "RfniRZCHIvfK" }, "source": [ - "### Smoothing Calibrators\n", - "Let's now take a look at the calibrator of `avg_rating`. Though it is monotonically increasing, the changes in its slopes are abrupt and hard to interpret. That suggests we might want to consider smoothing this calibrator using a regularizer setup in the `regularizer_configs`.\n", + "### Diminishing Returns\n", + "[Diminishing returns](https://en.wikipedia.org/wiki/Diminishing_returns) means that the marginal gain of increasing a certain feature value will decrease as we increase the value. In our case we expect that the `num_reviews` feature follows this pattern, so we can configure its calibrator accordingly. Notice that we can decompose diminishing returns into two sufficient conditions:\n", "\n", - "Here we apply a `wrinkle` regularizer to reduce changes in the curvature. 
You can also use the `laplacian` regularizer to flatten the calibrator and the `hessian` regularizer to make it more linear.\n" + "- the calibrator is monotonicially increasing, and\n", + "- the calibrator is concave (setting `pwl_calibration_convexity=\"concave\"`).\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { - "id": "qxFHH3hSpWfq" + "id": "XQrM9BskY-wx" }, "outputs": [], "source": [ - "feature_columns = [\n", - " tf.feature_column.numeric_column(\"num_reviews\"),\n", - " tf.feature_column.numeric_column(\"avg_rating\"),\n", - "]\n", + "keras.utils.set_random_seed(42)\n", "model_config = tfl.configs.CalibratedLatticeConfig(\n", " feature_configs=[\n", " tfl.configs.FeatureConfig(\n", " name=\"num_reviews\",\n", - " lattice_size=2,\n", + " lattice_size=3,\n", " monotonicity=\"increasing\",\n", " pwl_calibration_convexity=\"concave\",\n", - " pwl_calibration_num_keypoints=20,\n", - " regularizer_configs=[\n", - " tfl.configs.RegularizerConfig(name=\"calib_wrinkle\", l2=1.0),\n", - " ],\n", + " pwl_calibration_num_keypoints=32,\n", " reflects_trust_in=[\n", " tfl.configs.TrustConfig(\n", - " feature_name=\"avg_rating\", trust_type=\"edgeworth\"),\n", + " feature_name=\"avg_rating\", trust_type=\"edgeworth\"\n", + " ),\n", " ],\n", " ),\n", " tfl.configs.FeatureConfig(\n", " name=\"avg_rating\",\n", - " lattice_size=2,\n", + " lattice_size=3,\n", " monotonicity=\"increasing\",\n", - " pwl_calibration_num_keypoints=20,\n", - " regularizer_configs=[\n", - " tfl.configs.RegularizerConfig(name=\"calib_wrinkle\", l2=1.0),\n", - " ],\n", - " )\n", - " ])\n", - "tfl_estimator = tfl.estimators.CannedClassifier(\n", - " feature_columns=feature_columns,\n", - " model_config=model_config,\n", - " feature_analysis_input_fn=feature_analysis_input_fn,\n", - " optimizer=tf.keras.optimizers.legacy.Adam(learning_rate=LEARNING_RATE),\n", - " config=tf.estimator.RunConfig(tf_random_seed=42),\n", + " pwl_calibration_num_keypoints=32,\n", + " ),\n", + " tfl.configs.FeatureConfig(\n", + " name=\"dollar_rating\",\n", + " lattice_size=3,\n", + " pwl_calibration_num_keypoints=4,\n", + " vocabulary_list=dollar_ratings_vocab,\n", + " num_buckets=len(dollar_ratings_vocab),\n", + " monotonicity=[(\"D\", \"DD\")],\n", + " ),\n", + " ],\n", + " output_calibration=True,\n", + " output_initialization=np.linspace(-2, 2, num=5),\n", ")\n", - "tfl_estimator.train(input_fn=train_input_fn)\n", - "analyze_two_d_estimator(tfl_estimator, \"TF Lattice\")\n", - "_ = save_and_visualize_lattice(tfl_estimator)" + "\n", + "tfl.premade_lib.set_feature_keypoints(\n", + " feature_configs=model_config.feature_configs,\n", + " feature_keypoints=feature_keypoints,\n", + " add_missing_feature_configs=False,\n", + ")\n", + "tfl.premade_lib.set_categorical_monotonicities(model_config.feature_configs)\n", + "\n", + "outputs = tfl.premade.CalibratedLattice(\n", + " model_config=model_config, name=\"CalibratedLattice\"\n", + ")(ordered_inputs)\n", + "tfl_model_3 = keras.Model(inputs=inputs, outputs=outputs)\n", + "tfl_model_3.compile(\n", + " loss=keras.losses.BinaryCrossentropy(from_logits=True),\n", + " metrics=[keras.metrics.AUC(from_logits=True, name=\"auc\")],\n", + " optimizer=keras.optimizers.Adam(LEARNING_RATE),\n", + ")\n", + "tfl_model_3.fit(\n", + " ds_train,\n", + " epochs=100,\n", + " verbose=0\n", + ")\n", + "analyze_model([(tfl_model_3, \"TFL3\")], from_logits=True)" ] }, { "cell_type": "markdown", "metadata": { - "id": "HHpp4goLvuPi" + "id": "LSmzHkPUo9u5" }, "source": [ - "The calibrators are now smooth, and 
the overall estimated CTR better matches the ground truth. This is reflected both in the testing metric and in the contour plots." + "Notice how the testing metric improves by adding the concavity constraint. The prediction plot also better resembles the ground truth." ] }, { "cell_type": "markdown", "metadata": { - "id": "pSUd6aFlpYz4" - }, - "source": [ - "### Partial Monotonicity for Categorical Calibration\n", - "So far we have been using only two of the numeric features in the model. Here we will add a third feature using a categorical calibration layer. Again we start by setting up helper functions for plotting and metric calculation." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "5tLDKwTmjrLw" + "id": "SKe3UHX6pUjw" }, - "outputs": [], "source": [ - "def analyze_three_d_estimator(estimator, name):\n", - " # Extract validation metrics.\n", - " metric = estimator.evaluate(input_fn=val_input_fn)\n", - " print(\"Validation AUC: {}\".format(metric[\"auc\"]))\n", - " metric = estimator.evaluate(input_fn=test_input_fn)\n", - " print(\"Testing AUC: {}\".format(metric[\"auc\"]))\n", - "\n", - " def three_d_pred(avg_ratings, num_reviews, dollar_rating):\n", - " results = estimator.predict(\n", - " tf.compat.v1.estimator.inputs.pandas_input_fn(\n", - " x=pd.DataFrame({\n", - " \"avg_rating\": avg_ratings,\n", - " \"num_reviews\": num_reviews,\n", - " \"dollar_rating\": dollar_rating,\n", - " }),\n", - " shuffle=False,\n", - " ))\n", - " return [x[\"logistic\"][0] for x in results]\n", + "### Smoothing Calibrators\n", + "We notice in the prediction curves above that even though the output is monotonic in specified features, the changes in the slopes are abrupt and hard to interpret. That suggests we might want to consider smoothing this calibrator using a regularizer setup in the `regularizer_configs`.\n", "\n", - " figsize(11, 22)\n", - " plot_fns([(\"{} Estimated CTR\".format(name), three_d_pred),\n", - " (\"CTR\", click_through_rate)],\n", - " split_by_dollar=True)\n", - " " - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "CnPiqf4rq6kJ" - }, - "source": [ - "To involve the third feature, `dollar_rating`, we should recall that categorical features require a slightly different treatment in TFL, both as a feature column and as a feature config. Here we enforce the partial monotonicity constraint that outputs for \"DD\" restaurants should be larger than \"D\" restaurants when all other inputs are fixed. This is done using the `monotonicity` setting in the feature config." + "Here we apply a `hessian` regularizer to make the calibration more linear. 
You can also use the `laplacian` regularizer to flatten the calibrator and the `wrinkle` regularizer to reduce changes in the curvature.\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { - "id": "m-w7iGEEpgGt" + "id": "CxcCNxhkqC7u" }, "outputs": [], "source": [ - "feature_columns = [\n", - " tf.feature_column.numeric_column(\"num_reviews\"),\n", - " tf.feature_column.numeric_column(\"avg_rating\"),\n", - " tf.feature_column.categorical_column_with_vocabulary_list(\n", - " \"dollar_rating\",\n", - " vocabulary_list=[\"D\", \"DD\", \"DDD\", \"DDDD\"],\n", - " dtype=tf.string,\n", - " default_value=0),\n", - "]\n", + "keras.utils.set_random_seed(42)\n", "model_config = tfl.configs.CalibratedLatticeConfig(\n", " feature_configs=[\n", " tfl.configs.FeatureConfig(\n", " name=\"num_reviews\",\n", - " lattice_size=2,\n", + " lattice_size=3,\n", " monotonicity=\"increasing\",\n", " pwl_calibration_convexity=\"concave\",\n", - " pwl_calibration_num_keypoints=20,\n", + " pwl_calibration_num_keypoints=32,\n", " regularizer_configs=[\n", - " tfl.configs.RegularizerConfig(name=\"calib_wrinkle\", l2=1.0),\n", + " tfl.configs.RegularizerConfig(name=\"calib_hessian\", l2=0.5),\n", " ],\n", " reflects_trust_in=[\n", " tfl.configs.TrustConfig(\n", - " feature_name=\"avg_rating\", trust_type=\"edgeworth\"),\n", + " feature_name=\"avg_rating\", trust_type=\"edgeworth\"\n", + " ),\n", " ],\n", " ),\n", " tfl.configs.FeatureConfig(\n", " name=\"avg_rating\",\n", - " lattice_size=2,\n", + " lattice_size=3,\n", " monotonicity=\"increasing\",\n", - " pwl_calibration_num_keypoints=20,\n", + " pwl_calibration_num_keypoints=32,\n", " regularizer_configs=[\n", - " tfl.configs.RegularizerConfig(name=\"calib_wrinkle\", l2=1.0),\n", + " tfl.configs.RegularizerConfig(name=\"calib_hessian\", l2=0.5),\n", " ],\n", " ),\n", " tfl.configs.FeatureConfig(\n", " name=\"dollar_rating\",\n", - " lattice_size=2,\n", + " lattice_size=3,\n", " pwl_calibration_num_keypoints=4,\n", - " # Here we only specify one monotonicity:\n", - " # `D` resturants has smaller value than `DD` restaurants\n", + " vocabulary_list=dollar_ratings_vocab,\n", + " num_buckets=len(dollar_ratings_vocab),\n", " monotonicity=[(\"D\", \"DD\")],\n", " ),\n", - " ])\n", - "tfl_estimator = tfl.estimators.CannedClassifier(\n", - " feature_columns=feature_columns,\n", - " model_config=model_config,\n", - " feature_analysis_input_fn=feature_analysis_input_fn,\n", - " optimizer=tf.keras.optimizers.legacy.Adam(learning_rate=LEARNING_RATE),\n", - " config=tf.estimator.RunConfig(tf_random_seed=42),\n", + " ],\n", + " output_calibration=True,\n", + " output_initialization=np.linspace(-2, 2, num=5),\n", + " regularizer_configs=[\n", + " tfl.configs.RegularizerConfig(name=\"calib_hessian\", l2=0.1),\n", + " ],\n", ")\n", - "tfl_estimator.train(input_fn=train_input_fn)\n", - "analyze_three_d_estimator(tfl_estimator, \"TF Lattice\")\n", - "_ = save_and_visualize_lattice(tfl_estimator)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "gdIzhYL79_Pp" - }, - "source": [ - "This categorical calibrator shows the preference of the model output: DD \u003e D \u003e DDD \u003e DDDD, which is consistent with our setup. Notice there is also a column for missing values. Though there is no missing feature in our training and testing data, the model provides us with an imputation for the missing value should it happen during downstream model serving.\n", "\n", - "Here we also plot the predicted CTR of this model conditioned on `dollar_rating`. 
Notice that all the constraints we required are fulfilled in each of the slices." + "tfl.premade_lib.set_feature_keypoints(\n", + " feature_configs=model_config.feature_configs,\n", + " feature_keypoints=feature_keypoints,\n", + " add_missing_feature_configs=False,\n", + ")\n", + "tfl.premade_lib.set_categorical_monotonicities(model_config.feature_configs)\n", + "\n", + "outputs = tfl.premade.CalibratedLattice(\n", + " model_config=model_config, name=\"CalibratedLattice\"\n", + ")(ordered_inputs)\n", + "tfl_model_4 = keras.Model(inputs=inputs, outputs=outputs)\n", + "tfl_model_4.compile(\n", + " loss=keras.losses.BinaryCrossentropy(from_logits=True),\n", + " metrics=[keras.metrics.AUC(from_logits=True, name=\"auc\")],\n", + " optimizer=keras.optimizers.Adam(LEARNING_RATE),\n", + ")\n", + "tfl_model_4.fit(ds_train, epochs=100, verbose=0)\n", + "analyze_model([(tfl_model_4, \"TFL4\")], from_logits=True)" ] }, { "cell_type": "markdown", "metadata": { - "id": "rh0H2b6l_rwZ" + "id": "HHpp4goLvuPi" }, "source": [ - "### Output Calibration" + "The calibrators are now smooth, and the overall estimated CTR better matches the ground truth. This is reflected both in the testing metric and in the contour plots." ] }, { "cell_type": "markdown", "metadata": { - "id": "KPb2ri4e7HXF" + "id": "TLOGDrYY0hH7" }, "source": [ - "For all the TFL models we have trained so far, the lattice layer (indicated as \"Lattice\" in the model graph) directly outputs the model prediction. Sometimes we are not sure whether the lattice output should be rescaled to emit model outputs:\n", - "- the features are $log$ counts while the labels are counts.\n", - "- the lattice is configured to have very few vertices but the label distribution is relatively complicated.\n", - "\n", - "In those cases we can add another calibrator between the lattice output and the model output to increase model flexibility. Here let's add a calibrator layer with 5 keypoints to the model we just built. We also add a regularizer for the output calibrator to keep the function smooth.\n" + "Here you can see the results of each step as we added domain-specific constraints and regularizers to the model." 
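For reference, the constraint and regularizer settings discussed above can be expressed compactly in a single premade config. The following is a minimal standalone sketch, not taken verbatim from the notebook: the feature names follow the tutorial, but the keypoint counts and regularizer strengths are illustrative. It shows per-feature `calib_hessian` regularizers for smoother calibrators plus an output calibrator with its own regularizer.

import numpy as np
import tensorflow_lattice as tfl

model_config = tfl.configs.CalibratedLatticeConfig(
    feature_configs=[
        tfl.configs.FeatureConfig(
            name="num_reviews",
            lattice_size=3,
            monotonicity="increasing",
            pwl_calibration_convexity="concave",
            pwl_calibration_num_keypoints=32,
            regularizer_configs=[
                # calib_hessian penalizes changes in calibrator slope,
                # pulling the piecewise-linear calibration toward a line.
                tfl.configs.RegularizerConfig(name="calib_hessian", l2=0.5),
            ],
        ),
        tfl.configs.FeatureConfig(
            name="avg_rating",
            lattice_size=3,
            monotonicity="increasing",
            pwl_calibration_num_keypoints=32,
            regularizer_configs=[
                tfl.configs.RegularizerConfig(name="calib_hessian", l2=0.5),
            ],
        ),
    ],
    # Adds a piecewise-linear calibrator on the lattice output, with its own
    # smoothing regularizer.
    output_calibration=True,
    output_initialization=np.linspace(-2, 2, num=5),
    regularizer_configs=[
        tfl.configs.RegularizerConfig(name="calib_hessian", l2=0.1),
    ],
)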
] }, { "cell_type": "code", "execution_count": null, "metadata": { - "id": "k5Sg_gUj_0i4" + "id": "nUEuihX815ix" }, "outputs": [], "source": [ - "feature_columns = [\n", - " tf.feature_column.numeric_column(\"num_reviews\"),\n", - " tf.feature_column.numeric_column(\"avg_rating\"),\n", - " tf.feature_column.categorical_column_with_vocabulary_list(\n", - " \"dollar_rating\",\n", - " vocabulary_list=[\"D\", \"DD\", \"DDD\", \"DDDD\"],\n", - " dtype=tf.string,\n", - " default_value=0),\n", - "]\n", - "model_config = tfl.configs.CalibratedLatticeConfig(\n", - " output_calibration=True,\n", - " output_calibration_num_keypoints=5,\n", - " regularizer_configs=[\n", - " tfl.configs.RegularizerConfig(name=\"output_calib_wrinkle\", l2=0.1),\n", + "analyze_model(\n", + " [\n", + " (tfl_model_0, \"TFL0\"),\n", + " (tfl_model_1, \"TFL1\"),\n", + " (tfl_model_2, \"TFL2\"),\n", + " (tfl_model_3, \"TFL3\"),\n", + " (tfl_model_4, \"TFL4\"),\n", " ],\n", - " feature_configs=[\n", - " tfl.configs.FeatureConfig(\n", - " name=\"num_reviews\",\n", - " lattice_size=2,\n", - " monotonicity=\"increasing\",\n", - " pwl_calibration_convexity=\"concave\",\n", - " pwl_calibration_num_keypoints=20,\n", - " regularizer_configs=[\n", - " tfl.configs.RegularizerConfig(name=\"calib_wrinkle\", l2=1.0),\n", - " ],\n", - " reflects_trust_in=[\n", - " tfl.configs.TrustConfig(\n", - " feature_name=\"avg_rating\", trust_type=\"edgeworth\"),\n", - " ],\n", - " ),\n", - " tfl.configs.FeatureConfig(\n", - " name=\"avg_rating\",\n", - " lattice_size=2,\n", - " monotonicity=\"increasing\",\n", - " pwl_calibration_num_keypoints=20,\n", - " regularizer_configs=[\n", - " tfl.configs.RegularizerConfig(name=\"calib_wrinkle\", l2=1.0),\n", - " ],\n", - " ),\n", - " tfl.configs.FeatureConfig(\n", - " name=\"dollar_rating\",\n", - " lattice_size=2,\n", - " pwl_calibration_num_keypoints=4,\n", - " # Here we only specify one monotonicity:\n", - " # `D` resturants has smaller value than `DD` restaurants\n", - " monotonicity=[(\"D\", \"DD\")],\n", - " ),\n", - "])\n", - "tfl_estimator = tfl.estimators.CannedClassifier(\n", - " feature_columns=feature_columns,\n", - " model_config=model_config,\n", - " feature_analysis_input_fn=feature_analysis_input_fn,\n", - " optimizer=tf.keras.optimizers.legacy.Adam(learning_rate=LEARNING_RATE),\n", - " config=tf.estimator.RunConfig(tf_random_seed=42),\n", - ")\n", - "tfl_estimator.train(input_fn=train_input_fn)\n", - "analyze_three_d_estimator(tfl_estimator, \"TF Lattice\")\n", - "_ = save_and_visualize_lattice(tfl_estimator)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "TLOGDrYY0hH7" - }, - "source": [ - "The final testing metric and plots show how using common-sense constraints can help the model avoid unexpected behaviour and extrapolate better to the entire input space." 
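The partial monotonicity on `dollar_rating` described above (the output for "DD" must be at least as high as for "D", all else fixed) can also be written directly at the layer level. A minimal sketch, assuming four vocabulary buckets indexed 0..3 in the order D, DD, DDD, DDDD:

import tensorflow as tf
import tensorflow_lattice as tfl

# Categorical calibrator over the four dollar_rating buckets.
dollar_calibrator = tfl.layers.CategoricalCalibration(
    num_buckets=4,  # "D", "DD", "DDD", "DDDD"
    output_min=0.0,
    output_max=1.0,
    # Each pair (i, j) constrains output(i) <= output(j); here only D <= DD.
    monotonicities=[(0, 1)],
)

# Inputs are integer bucket indices of shape (batch, 1); the constraint is
# enforced by projection during training, not at initialization.
print(dollar_calibrator(tf.constant([[0], [1], [2], [3]])))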
+ " from_logits=True,\n", + " print_metrics=False,\n", + ")" ] } ], @@ -1203,6 +1195,8 @@ "colab": { "collapsed_sections": [], "name": "shape_constraints.ipynb", + "private_outputs": true, + "provenance": [], "toc_visible": true }, "kernelspec": { diff --git a/docs/tutorials/shape_constraints_for_ethics.ipynb b/docs/tutorials/shape_constraints_for_ethics.ipynb index 0c8d438..f23bb1e 100644 --- a/docs/tutorials/shape_constraints_for_ethics.ipynb +++ b/docs/tutorials/shape_constraints_for_ethics.ipynb @@ -78,7 +78,7 @@ "by Serena Wang and Maya Gupta, published at\n", "[AISTATS 2020](https://www.aistats.org/).\n", "\n", - "We will use TFL canned estimators on public datasets, but note that\n", + "We will use TFL premade models on public datasets, but note that\n", "everything in this tutorial can also be done with models constructed from TFL\n", "Keras layers.\n", "\n", @@ -113,7 +113,7 @@ "outputs": [], "source": [ "#@test {\"skip\": true}\n", - "!pip install tensorflow-lattice tensorflow_decision_forests seaborn" + "!pip install --pre -U tensorflow tf-keras tensorflow-lattice tensorflow_decision_forests seaborn pydot graphviz" ] }, { @@ -149,6 +149,22 @@ "logging.disable(sys.maxsize)" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "csVitiM20zAY" + }, + "outputs": [], + "source": [ + "# Use Keras 2.\n", + "version_fn = getattr(tf.keras, \"version\", None)\n", + "if version_fn and version_fn().startswith(\"3.\"):\n", + " import tf_keras as keras\n", + "else:\n", + " keras = tf.keras" + ] + }, { "cell_type": "markdown", "metadata": { @@ -166,12 +182,10 @@ }, "outputs": [], "source": [ - "# List of learning rate hyperparameters to try.\n", - "# For a longer list of reasonable hyperparameters, try [0.001, 0.01, 0.1].\n", - "LEARNING_RATES = [0.01]\n", - "# Default number of training epochs and batch sizes.\n", - "NUM_EPOCHS = 1000\n", - "BATCH_SIZE = 1000\n", + "# Default number of training epochs, batch sizes and learning rate.\n", + "NUM_EPOCHS = 256\n", + "BATCH_SIZE = 256\n", + "LEARNING_RATES = 0.01\n", "# Directory containing dataset files.\n", "DATA_DIR = 'https://github.com/raw/serenalwang/shape_constraints_for_ethics/master'" ] @@ -284,13 +298,25 @@ "def split_dataset(input_df, random_state=888):\n", " \"\"\"Splits an input dataset into train, val, and test sets.\"\"\"\n", " train_df, test_val_df = train_test_split(\n", - " input_df, test_size=0.3, random_state=random_state)\n", + " input_df, test_size=0.3, random_state=random_state\n", + " )\n", " val_df, test_df = train_test_split(\n", - " test_val_df, test_size=0.66, random_state=random_state)\n", + " test_val_df, test_size=0.66, random_state=random_state\n", + " )\n", " return train_df, val_df, test_df\n", "\n", "\n", - "law_train_df, law_val_df, law_test_df = split_dataset(law_df)" + "dataframes = {}\n", + "datasets = {}\n", + "\n", + "(dataframes['law_train'], dataframes['law_val'], dataframes['law_test']) = (\n", + " split_dataset(law_df)\n", + ")\n", + "\n", + "for df_name, df in dataframes.items():\n", + " datasets[df_name] = tf.data.Dataset.from_tensor_slices(\n", + " ((df[['ugpa']], df[['lsat']]), df[['pass_bar']])\n", + " ).batch(BATCH_SIZE)" ] }, { @@ -342,17 +368,7 @@ "source": [ "law_df_pos = law_df[law_df[LAW_LABEL] == 1]\n", "plot_dataset_contour(\n", - " law_df_pos, title='Distribution of students that passed the bar')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "ct-2tEedU0aO" - }, - "outputs": [], - "source": [ + " law_df_pos, 
title='Distribution of students that passed the bar')\n", "law_df_neg = law_df[law_df[LAW_LABEL] == 0]\n", "plot_dataset_contour(\n", " law_df_neg, title='Distribution of students that failed the bar')" @@ -364,289 +380,200 @@ "id": "6grrFEMPfPjk" }, "source": [ - "## Train calibrated linear model to predict bar exam passage\n", + "## Train calibrated lattice model to predict bar exam passage\n", "\n", - "Next, we will train a *calibrated linear model* from TFL to predict whether or\n", + "Next, we will train a *calibrated lattice model* from TFL to predict whether or\n", "not a student will pass the bar. The two input features will be LSAT score and\n", "undergraduate GPA, and the training label will be whether the student passed the\n", "bar.\n", "\n", - "We will first train a calibrated linear model without any constraints. Then, we\n", - "will train a calibrated linear model with monotonicity constraints and observe\n", + "We will first train a calibrated lattice model without any constraints. Then, we\n", + "will train a calibrated lattice model with monotonicity constraints and observe\n", "the difference in the model output and accuracy." ] }, { "cell_type": "markdown", "metadata": { - "id": "vrUZvP8V736o" + "id": "HSfAwgiO_6YA" }, "source": [ - "### Helper functions for training a TFL calibrated linear estimator\n", - "\n", - "These functions will be used for this law school case study, as well as the\n", - "credit default case study below." + "### Helper functions for visualization of trained model outputs" ] }, { "cell_type": "code", "execution_count": null, "metadata": { - "id": "ollW4xAZ72kz" + "id": "aw28Xc7IS6vR" }, "outputs": [], "source": [ - "def train_tfl_estimator(train_df, monotonicity, learning_rate, num_epochs,\n", - " batch_size, get_input_fn,\n", - " get_feature_columns_and_configs):\n", - " \"\"\"Trains a TFL calibrated linear estimator.\n", - "\n", - " Args:\n", - " train_df: pandas dataframe containing training data.\n", - " monotonicity: if 0, then no monotonicity constraints. If 1, then all\n", - " features are constrained to be monotonically increasing.\n", - " learning_rate: learning rate of Adam optimizer for gradient descent.\n", - " num_epochs: number of training epochs.\n", - " batch_size: batch size for each epoch. 
None means the batch size is the full\n", - " dataset size.\n", - " get_input_fn: function that returns the input_fn for a TF estimator.\n", - " get_feature_columns_and_configs: function that returns TFL feature columns\n", - " and configs.\n", - "\n", - " Returns:\n", - " estimator: a trained TFL calibrated linear estimator.\n", - "\n", - " \"\"\"\n", - " feature_columns, feature_configs = get_feature_columns_and_configs(\n", - " monotonicity)\n", - "\n", - " model_config = tfl.configs.CalibratedLinearConfig(\n", - " feature_configs=feature_configs, use_bias=False)\n", - "\n", - " estimator = tfl.estimators.CannedClassifier(\n", - " feature_columns=feature_columns,\n", - " model_config=model_config,\n", - " feature_analysis_input_fn=get_input_fn(input_df=train_df, num_epochs=1),\n", - " optimizer=tf.keras.optimizers.legacy.Adam(learning_rate))\n", + "def plot_model_contour(model, from_logits=False, num_keypoints=20):\n", + " x = np.linspace(min(law_df['ugpa']), max(law_df['ugpa']), num_keypoints)\n", + " y = np.linspace(min(law_df['lsat']), max(law_df['lsat']), num_keypoints)\n", "\n", - " estimator.train(\n", - " input_fn=get_input_fn(\n", - " input_df=train_df, num_epochs=num_epochs, batch_size=batch_size))\n", - " return estimator\n", + " x_grid, y_grid = np.meshgrid(x, y)\n", "\n", + " positions = np.vstack([x_grid.ravel(), y_grid.ravel()])\n", + " plot_df = pd.DataFrame(positions.T, columns=['ugpa', 'lsat'])\n", + " plot_df[LAW_LABEL] = np.ones(len(plot_df))\n", + " predictions = model.predict((plot_df[['ugpa']], plot_df[['lsat']]))\n", + " if from_logits:\n", + " predictions = tf.math.sigmoid(predictions)\n", + " grid_predictions = np.reshape(predictions, x_grid.shape)\n", "\n", - "def optimize_learning_rates(\n", - " train_df,\n", - " val_df,\n", - " test_df,\n", - " monotonicity,\n", - " learning_rates,\n", - " num_epochs,\n", - " batch_size,\n", - " get_input_fn,\n", - " get_feature_columns_and_configs,\n", - "):\n", - " \"\"\"Optimizes learning rates for TFL estimators.\n", + " plt.rcParams['font.family'] = ['serif']\n", + " plt.contour(\n", + " x_grid,\n", + " y_grid,\n", + " grid_predictions,\n", + " colors=('k',),\n", + " levels=np.linspace(0, 1, 11),\n", + " )\n", + " plt.contourf(\n", + " x_grid,\n", + " y_grid,\n", + " grid_predictions,\n", + " cmap=plt.cm.bone,\n", + " levels=np.linspace(0, 1, 11),\n", + " )\n", + " plt.xticks(fontsize=20)\n", + " plt.yticks(fontsize=20)\n", "\n", - " Args:\n", - " train_df: pandas dataframe containing training data.\n", - " val_df: pandas dataframe containing validation data.\n", - " test_df: pandas dataframe containing test data.\n", - " monotonicity: if 0, then no monotonicity constraints. If 1, then all\n", - " features are constrained to be monotonically increasing.\n", - " learning_rates: list of learning rates to try.\n", - " num_epochs: number of training epochs.\n", - " batch_size: batch size for each epoch. 
None means the batch size is the full\n", - " dataset size.\n", - " get_input_fn: function that returns the input_fn for a TF estimator.\n", - " get_feature_columns_and_configs: function that returns TFL feature columns\n", - " and configs.\n", + " cbar = plt.colorbar()\n", + " cbar.ax.set_ylabel('Model score', fontsize=20)\n", + " cbar.ax.tick_params(labelsize=20)\n", "\n", - " Returns:\n", - " A single TFL estimator that achieved the best validation accuracy.\n", - " \"\"\"\n", - " estimators = []\n", - " train_accuracies = []\n", - " val_accuracies = []\n", - " test_accuracies = []\n", - " for lr in learning_rates:\n", - " estimator = train_tfl_estimator(\n", - " train_df=train_df,\n", - " monotonicity=monotonicity,\n", - " learning_rate=lr,\n", - " num_epochs=num_epochs,\n", - " batch_size=batch_size,\n", - " get_input_fn=get_input_fn,\n", - " get_feature_columns_and_configs=get_feature_columns_and_configs)\n", - " estimators.append(estimator)\n", - " train_acc = estimator.evaluate(\n", - " input_fn=get_input_fn(train_df, num_epochs=1))['accuracy']\n", - " val_acc = estimator.evaluate(\n", - " input_fn=get_input_fn(val_df, num_epochs=1))['accuracy']\n", - " test_acc = estimator.evaluate(\n", - " input_fn=get_input_fn(test_df, num_epochs=1))['accuracy']\n", - " print('accuracies for learning rate %f: train: %f, val: %f, test: %f' %\n", - " (lr, train_acc, val_acc, test_acc))\n", - " train_accuracies.append(train_acc)\n", - " val_accuracies.append(val_acc)\n", - " test_accuracies.append(test_acc)\n", - " max_index = val_accuracies.index(max(val_accuracies))\n", - " return estimators[max_index]" + " plt.xlabel('Undergraduate GPA', fontsize=20)\n", + " plt.ylabel('LSAT score', fontsize=20)" ] }, { "cell_type": "markdown", "metadata": { - "id": "jeEfKSA7_aOg" + "id": "fAMSCaRHIn1w" }, "source": [ - "### Helper functions for configuring law school dataset features\n", - "\n", - "These helper functions are specific to the law school case study." + "## Train unconstrained (non-monotonic) calibrated lattice model" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "mK7RWDJ5ugdd" + }, + "source": [ + "We create a TFL premade model using a '`CalibratedLatticeConfig`. This model is a calibrated lattice model with an output calibration." 
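A minimal end-to-end sketch of this setup on synthetic data (the feature names follow the tutorial, but the data, keypoint counts, and training settings here are illustrative):

import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_lattice as tfl

# Synthetic stand-in for the law school data.
rng = np.random.default_rng(0)
ugpa = rng.uniform(1.5, 4.0, size=(1000, 1)).astype("float32")
lsat = rng.uniform(120.0, 180.0, size=(1000, 1)).astype("float32")
pass_bar = (
    ugpa[:, 0] + 0.05 * lsat[:, 0] + rng.normal(size=1000) > 11.0
).astype("float32")

model_config = tfl.configs.CalibratedLatticeConfig(
    feature_configs=[
        tfl.configs.FeatureConfig(
            name="ugpa", lattice_size=3, pwl_calibration_num_keypoints=16),
        tfl.configs.FeatureConfig(
            name="lsat", lattice_size=3, pwl_calibration_num_keypoints=16),
    ],
    output_calibration=True,
    output_initialization=np.linspace(-2, 2, num=8),
)

# Populate calibrator input keypoints from data quantiles before the model
# is constructed.
feature_keypoints = tfl.premade_lib.compute_feature_keypoints(
    feature_configs=model_config.feature_configs,
    features=pd.DataFrame({"ugpa": ugpa[:, 0], "lsat": lsat[:, 0]}),
)
tfl.premade_lib.set_feature_keypoints(
    feature_configs=model_config.feature_configs,
    feature_keypoints=feature_keypoints,
    add_missing_feature_configs=False,
)

model = tfl.premade.CalibratedLattice(model_config=model_config)
model.compile(
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
    optimizer=tf.keras.optimizers.Adam(0.01),
)
model.fit([ugpa, lsat], pass_bar, epochs=5, batch_size=128, verbose=0)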
] }, { "cell_type": "code", "execution_count": null, "metadata": { - "id": "B6NU6EEKIYMJ" + "id": "J16TOicHQ1sM" }, "outputs": [], "source": [ - "def get_input_fn_law(input_df, num_epochs, batch_size=None):\n", - " \"\"\"Gets TF input_fn for law school models.\"\"\"\n", - " return tf.compat.v1.estimator.inputs.pandas_input_fn(\n", - " x=input_df[['ugpa', 'lsat']],\n", - " y=input_df['pass_bar'],\n", - " num_epochs=num_epochs,\n", - " batch_size=batch_size or len(input_df),\n", - " shuffle=False)\n", - "\n", - "\n", - "def get_feature_columns_and_configs_law(monotonicity):\n", - " \"\"\"Gets TFL feature configs for law school models.\"\"\"\n", - " feature_columns = [\n", - " tf.feature_column.numeric_column('ugpa'),\n", - " tf.feature_column.numeric_column('lsat'),\n", - " ]\n", - " feature_configs = [\n", - " tfl.configs.FeatureConfig(\n", - " name='ugpa',\n", - " lattice_size=2,\n", - " pwl_calibration_num_keypoints=20,\n", - " monotonicity=monotonicity,\n", - " pwl_calibration_always_monotonic=False),\n", - " tfl.configs.FeatureConfig(\n", - " name='lsat',\n", - " lattice_size=2,\n", - " pwl_calibration_num_keypoints=20,\n", - " monotonicity=monotonicity,\n", - " pwl_calibration_always_monotonic=False),\n", - " ]\n", - " return feature_columns, feature_configs" + "model_config = tfl.configs.CalibratedLatticeConfig(\n", + " feature_configs=[\n", + " tfl.configs.FeatureConfig(\n", + " name='ugpa',\n", + " lattice_size=3,\n", + " pwl_calibration_num_keypoints=16,\n", + " monotonicity=0,\n", + " pwl_calibration_always_monotonic=False,\n", + " ),\n", + " tfl.configs.FeatureConfig(\n", + " name='lsat',\n", + " lattice_size=3,\n", + " pwl_calibration_num_keypoints=16,\n", + " monotonicity=0,\n", + " pwl_calibration_always_monotonic=False,\n", + " ),\n", + " ],\n", + " output_calibration=True,\n", + " output_initialization=np.linspace(-2, 2, num=8),\n", + ")" ] }, { "cell_type": "markdown", "metadata": { - "id": "HSfAwgiO_6YA" + "id": "jt1Rm6qCuuat" }, "source": [ - "### Helper functions for visualization of trained model outputs" + "We calculate and populate feature quantiles in the feature configs using the `premade_lib` API." 
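For intuition, "quantile" keypoints place the calibrator's input keypoints at evenly spaced quantiles of the training data, so data-dense regions get more resolution. A rough standalone illustration with NumPy follows; the data is synthetic and the computation is only an approximation of the keypoint selection performed by the `premade_lib` utilities.

import numpy as np

# Synthetic LSAT-like scores.
lsat_scores = np.random.default_rng(0).normal(loc=150.0, scale=10.0, size=5000)

num_keypoints = 16
quantiles = np.linspace(0.0, 1.0, num=num_keypoints)
input_keypoints = np.quantile(lsat_scores, quantiles)

# Increasing keypoint locations, denser where the data is denser.
print(input_keypoints)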
] }, { "cell_type": "code", "execution_count": null, "metadata": { - "id": "HESNIC5H-1dz" + "id": "eSELqBdURE0F" }, "outputs": [], "source": [ - "def get_predicted_probabilities(estimator, input_df, get_input_fn):\n", - " if isinstance(estimator, tf.estimator.Estimator):\n", - " predictions = estimator.predict(\n", - " input_fn=get_input_fn(input_df=input_df, num_epochs=1))\n", - " return [prediction['probabilities'][1] for prediction in predictions]\n", - " else:\n", - " return estimator.predict(tfdf.keras.pd_dataframe_to_tf_dataset(input_df))\n", - "\n", - "\n", - "def plot_model_contour(estimator, input_df, num_keypoints=20):\n", - " x = np.linspace(min(input_df['ugpa']), max(input_df['ugpa']), num_keypoints)\n", - " y = np.linspace(min(input_df['lsat']), max(input_df['lsat']), num_keypoints)\n", - "\n", - " x_grid, y_grid = np.meshgrid(x, y)\n", - "\n", - " positions = np.vstack([x_grid.ravel(), y_grid.ravel()])\n", - " plot_df = pd.DataFrame(positions.T, columns=['ugpa', 'lsat'])\n", - " plot_df[LAW_LABEL] = np.ones(len(plot_df))\n", - " predictions = get_predicted_probabilities(\n", - " estimator=estimator, input_df=plot_df, get_input_fn=get_input_fn_law)\n", - " grid_predictions = np.reshape(predictions, x_grid.shape)\n", - "\n", - " plt.rcParams['font.family'] = ['serif']\n", - " plt.contour(\n", - " x_grid,\n", - " y_grid,\n", - " grid_predictions,\n", - " colors=('k',),\n", - " levels=np.linspace(0, 1, 11))\n", - " plt.contourf(\n", - " x_grid,\n", - " y_grid,\n", - " grid_predictions,\n", - " cmap=plt.cm.bone,\n", - " levels=np.linspace(0, 1, 11)) # levels=np.linspace(0,1,8));\n", - " plt.xticks(fontsize=20)\n", - " plt.yticks(fontsize=20)\n", - "\n", - " cbar = plt.colorbar()\n", - " cbar.ax.set_ylabel('Model score', fontsize=20)\n", - " cbar.ax.tick_params(labelsize=20)\n", - "\n", - " plt.xlabel('Undergraduate GPA', fontsize=20)\n", - " plt.ylabel('LSAT score', fontsize=20)" + "feature_keypoints = tfl.premade_lib.compute_feature_keypoints(\n", + " feature_configs=model_config.feature_configs,\n", + " features=dataframes['law_train'][['ugpa', 'lsat', 'pass_bar']],\n", + ")\n", + "tfl.premade_lib.set_feature_keypoints(\n", + " feature_configs=model_config.feature_configs,\n", + " feature_keypoints=feature_keypoints,\n", + " add_missing_feature_configs=False,\n", + ")" ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": null, "metadata": { - "id": "fAMSCaRHIn1w" + "id": "ahV2Sn0Xz1aO" }, + "outputs": [], "source": [ - "## Train unconstrained (non-monotonic) calibrated linear model" + "nomon_lattice_model = tfl.premade.CalibratedLattice(model_config=model_config)\n", + "keras.utils.plot_model(\n", + " nomon_lattice_model, expand_nested=True, show_layer_names=False, rankdir=\"LR\"\n", + ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": { - "id": "Iff8omH3Ij_x" + "id": "Oc5f-6zNtyxr" }, "outputs": [], "source": [ - "nomon_linear_estimator = optimize_learning_rates(\n", - " train_df=law_train_df,\n", - " val_df=law_val_df,\n", - " test_df=law_test_df,\n", - " monotonicity=0,\n", - " learning_rates=LEARNING_RATES,\n", - " batch_size=BATCH_SIZE,\n", - " num_epochs=NUM_EPOCHS,\n", - " get_input_fn=get_input_fn_law,\n", - " get_feature_columns_and_configs=get_feature_columns_and_configs_law)" + "nomon_lattice_model.compile(\n", + " loss=keras.losses.BinaryCrossentropy(from_logits=True),\n", + " metrics=[\n", + " keras.metrics.BinaryAccuracy(name='accuracy'),\n", + " ],\n", + " optimizer=keras.optimizers.Adam(LEARNING_RATES),\n", + ")\n", + 
"nomon_lattice_model.fit(datasets['law_train'], epochs=NUM_EPOCHS, verbose=0)\n", + "\n", + "train_acc = nomon_lattice_model.evaluate(datasets['law_train'])[1]\n", + "val_acc = nomon_lattice_model.evaluate(datasets['law_val'])[1]\n", + "test_acc = nomon_lattice_model.evaluate(datasets['law_test'])[1]\n", + "print(\n", + " 'accuracies for train: %f, val: %f, test: %f'\n", + " % (train_acc, val_acc, test_acc)\n", + ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": { - "id": "Gxfv8hXMh4_E" + "id": "LuFxP9lDTZup" }, "outputs": [], "source": [ - "plot_model_contour(nomon_linear_estimator, input_df=law_df)" + "plot_model_contour(nomon_lattice_model, from_logits=True)" ] }, { @@ -655,7 +582,28 @@ "id": "eKVkjHg_LaWb" }, "source": [ - "## Train monotonic calibrated linear model" + "## Train monotonic calibrated lattice model" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "W42OXWLVwx3w" + }, + "source": [ + "We can get a monotonic model by setting the monotonicity constraints in feature configs." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "XeOKlPRc0BQe" + }, + "outputs": [], + "source": [ + "model_config.feature_configs[0].monotonicity = 1\n", + "model_config.feature_configs[1].monotonicity = 1" ] }, { @@ -666,16 +614,24 @@ }, "outputs": [], "source": [ - "mon_linear_estimator = optimize_learning_rates(\n", - " train_df=law_train_df,\n", - " val_df=law_val_df,\n", - " test_df=law_test_df,\n", - " monotonicity=1,\n", - " learning_rates=LEARNING_RATES,\n", - " batch_size=BATCH_SIZE,\n", - " num_epochs=NUM_EPOCHS,\n", - " get_input_fn=get_input_fn_law,\n", - " get_feature_columns_and_configs=get_feature_columns_and_configs_law)" + "mon_lattice_model = tfl.premade.CalibratedLattice(model_config=model_config)\n", + "\n", + "mon_lattice_model.compile(\n", + " loss=keras.losses.BinaryCrossentropy(from_logits=True),\n", + " metrics=[\n", + " keras.metrics.BinaryAccuracy(name='accuracy'),\n", + " ],\n", + " optimizer=keras.optimizers.Adam(LEARNING_RATES),\n", + ")\n", + "mon_lattice_model.fit(datasets['law_train'], epochs=NUM_EPOCHS, verbose=0)\n", + "\n", + "train_acc = mon_lattice_model.evaluate(datasets['law_train'])[1]\n", + "val_acc = mon_lattice_model.evaluate(datasets['law_val'])[1]\n", + "test_acc = mon_lattice_model.evaluate(datasets['law_test'])[1]\n", + "print(\n", + " 'accuracies for train: %f, val: %f, test: %f'\n", + " % (train_acc, val_acc, test_acc)\n", + ")" ] }, { @@ -686,7 +642,17 @@ }, "outputs": [], "source": [ - "plot_model_contour(mon_linear_estimator, input_df=law_df)" + "plot_model_contour(mon_lattice_model, from_logits=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "GWzBEV_p0WE-" + }, + "source": [ + "We demonstrated that TFL calibrated lattice models could be trained to be\n", + "monotonic in both LSAT score and GPA without too big of a sacrifice in accuracy." ] }, { @@ -697,10 +663,7 @@ "source": [ "## Train other unconstrained models\n", "\n", - "We demonstrated that TFL calibrated linear models could be trained to be\n", - "monotonic in both LSAT score and GPA without too big of a sacrifice in accuracy.\n", - "\n", - "But, how does the calibrated linear model compare to other types of models, like\n", + "How does the calibrated lattice model compare to other types of models, like\n", "deep neural networks (DNNs) or gradient boosted trees (GBTs)? Do DNNs and GBTs\n", "appear to have reasonably fair outputs? 
To address this question, we will next\n", "train an unconstrained DNN and GBT. In fact, we will observe that the DNN and\n", @@ -726,27 +689,35 @@ }, "outputs": [], "source": [ - "feature_names = ['ugpa', 'lsat']\n", - "\n", - "dnn_estimator = tf.estimator.DNNClassifier(\n", - " feature_columns=[\n", - " tf.feature_column.numeric_column(feature) for feature in feature_names\n", + "keras.utils.set_random_seed(42)\n", + "inputs = [\n", + " keras.Input(shape=(1,), dtype=tf.float32),\n", + " keras.Input(shape=(1), dtype=tf.float32),\n", + "]\n", + "inputs_flat = keras.layers.Concatenate()(inputs)\n", + "dense_layers = keras.Sequential(\n", + " [\n", + " keras.layers.Dense(64, activation='relu'),\n", + " keras.layers.Dense(32, activation='relu'),\n", + " keras.layers.Dense(1, activation=None),\n", " ],\n", - " hidden_units=[100, 100],\n", - " optimizer=tf.keras.optimizers.legacy.Adam(learning_rate=0.008),\n", - " activation_fn=tf.nn.relu)\n", + " name='dense_layers',\n", + ")\n", + "dnn_model = keras.Model(inputs=inputs, outputs=dense_layers(inputs_flat))\n", + "dnn_model.compile(\n", + " loss=keras.losses.BinaryCrossentropy(from_logits=True),\n", + " metrics=[keras.metrics.BinaryAccuracy(name='accuracy')],\n", + " optimizer=keras.optimizers.Adam(LEARNING_RATES),\n", + ")\n", + "dnn_model.fit(datasets['law_train'], epochs=NUM_EPOCHS, verbose=0)\n", "\n", - "dnn_estimator.train(\n", - " input_fn=get_input_fn_law(\n", - " law_train_df, batch_size=BATCH_SIZE, num_epochs=NUM_EPOCHS))\n", - "dnn_train_acc = dnn_estimator.evaluate(\n", - " input_fn=get_input_fn_law(law_train_df, num_epochs=1))['accuracy']\n", - "dnn_val_acc = dnn_estimator.evaluate(\n", - " input_fn=get_input_fn_law(law_val_df, num_epochs=1))['accuracy']\n", - "dnn_test_acc = dnn_estimator.evaluate(\n", - " input_fn=get_input_fn_law(law_test_df, num_epochs=1))['accuracy']\n", - "print('accuracies for DNN: train: %f, val: %f, test: %f' %\n", - " (dnn_train_acc, dnn_val_acc, dnn_test_acc))" + "train_acc = dnn_model.evaluate(datasets['law_train'])[1]\n", + "val_acc = dnn_model.evaluate(datasets['law_val'])[1]\n", + "test_acc = dnn_model.evaluate(datasets['law_test'])[1]\n", + "print(\n", + " 'accuracies for train: %f, val: %f, test: %f'\n", + " % (train_acc, val_acc, test_acc)\n", + ")" ] }, { @@ -757,7 +728,7 @@ }, "outputs": [], "source": [ - "plot_model_contour(dnn_estimator, input_df=law_df)" + "plot_model_contour(dnn_model, from_logits=True)" ] }, { @@ -775,19 +746,12 @@ "cell_type": "code", "execution_count": null, "metadata": { - "id": "mFaI9hB-rgoL" + "id": "6UrCJHqhgd3o" }, "outputs": [], "source": [ - "law_train_ds = tfdf.keras.pd_dataframe_to_tf_dataset(\n", - " law_train_df, label='pass_bar')\n", - "law_test_ds = tfdf.keras.pd_dataframe_to_tf_dataset(\n", - " law_test_df, label='pass_bar')\n", - "law_val_ds = tfdf.keras.pd_dataframe_to_tf_dataset(law_val_df, label='pass_bar')\n", - "\n", "tree_model = tfdf.keras.GradientBoostedTreesModel(\n", - " features=[tfdf.keras.FeatureUsage(name=name) for name in feature_names],\n", - " exclude_non_specified_features=True,\n", + " exclude_non_specified_features=False,\n", " num_threads=1,\n", " num_trees=20,\n", " max_depth=4,\n", @@ -796,13 +760,17 @@ " temp_directory=tempfile.mkdtemp(),\n", ")\n", "tree_model.compile(metrics=[tf.keras.metrics.BinaryAccuracy(name='accuracy')])\n", - "tree_model.fit(law_train_ds, validation_data=law_val_ds, verbose=0)\n", + "tree_model.fit(\n", + " datasets['law_train'], validation_data=datasets['law_val'], verbose=0\n", + ")\n", "\n", - 
"tree_train_acc = tree_model.evaluate(law_train_ds, verbose=0)[1]\n", - "tree_val_acc = tree_model.evaluate(law_val_ds, verbose=0)[1]\n", - "tree_test_acc = tree_model.evaluate(law_test_ds, verbose=0)[1]\n", - "print('accuracies for GBT: train: %f, val: %f, test: %f' %\n", - " (tree_train_acc, tree_val_acc, tree_test_acc))" + "tree_train_acc = tree_model.evaluate(datasets['law_train'], verbose=0)[1]\n", + "tree_val_acc = tree_model.evaluate(datasets['law_val'], verbose=0)[1]\n", + "tree_test_acc = tree_model.evaluate(datasets['law_test'], verbose=0)[1]\n", + "print(\n", + " 'accuracies for GBT: train: %f, val: %f, test: %f'\n", + " % (tree_train_acc, tree_val_acc, tree_test_acc)\n", + ")" ] }, { @@ -813,7 +781,7 @@ }, "outputs": [], "source": [ - "plot_model_contour(tree_model, input_df=law_df)" + "plot_model_contour(tree_model)" ] }, { @@ -891,7 +859,17 @@ }, "outputs": [], "source": [ - "credit_train_df, credit_val_df, credit_test_df = split_dataset(credit_df)" + "dfs = {}\n", + "datasets = {}\n", + "\n", + "dfs[\"credit_train\"], dfs[\"credit_val\"], dfs[\"credit_test\"] = split_dataset(\n", + " credit_df\n", + ")\n", + "\n", + "for df_name, df in dfs.items():\n", + " datasets[df_name] = tf.data.Dataset.from_tensor_slices(\n", + " ((df[['MARRIAGE']], df[['PAY_0']]), df[['default']])\n", + " ).batch(BATCH_SIZE)" ] }, { @@ -974,8 +952,13 @@ }, "outputs": [], "source": [ - "plot_2d_means_credit(credit_train_df, 'PAY_0', 'default',\n", - " 'Repayment Status (April)', 'Observed default rate')" + "plot_2d_means_credit(\n", + " dfs['credit_train'],\n", + " 'PAY_0',\n", + " 'default',\n", + " 'Repayment Status (April)',\n", + " 'Observed default rate',\n", + ")" ] }, { @@ -984,127 +967,134 @@ "id": "4hnZBigB7kzY" }, "source": [ - "## Train calibrated linear model to predict credit default rate\n", + "## Train calibrated lattice model to predict credit default rate\n", "\n", - "Next, we will train a *calibrated linear model* from TFL to predict whether or\n", + "Next, we will train a *calibrated lattice model* from TFL to predict whether or\n", "not a person will default on a loan. The two input features will be the person's\n", "marital status and how many months the person is behind on paying back their\n", "loans in April (repayment status). The training label will be whether or not the\n", "person defaulted on a loan.\n", "\n", - "We will first train a calibrated linear model without any constraints. Then, we\n", - "will train a calibrated linear model with monotonicity constraints and observe\n", + "We will first train a calibrated lattice model without any constraints. Then, we\n", + "will train a calibrated lattice model with monotonicity constraints and observe\n", "the difference in the model output and accuracy." ] }, { "cell_type": "markdown", "metadata": { - "id": "UEcHW1u3Jk_2" + "id": "iwxnlRrQPdTg" }, "source": [ - "### Helper functions for configuring credit default dataset features\n", - "\n", - "These helper functions are specific to the credit default case study." 
+ "### Helper functions for visualization of trained model outputs" ] }, { "cell_type": "code", "execution_count": null, "metadata": { - "id": "QBa-hczLi7DM" + "id": "zVGxEfbhPZ5H" }, "outputs": [], "source": [ - "def get_input_fn_credit(input_df, num_epochs, batch_size=None):\n", - " \"\"\"Gets TF input_fn for credit default models.\"\"\"\n", - " return tf.compat.v1.estimator.inputs.pandas_input_fn(\n", - " x=input_df[['MARRIAGE', 'PAY_0']],\n", - " y=input_df['default'],\n", - " num_epochs=num_epochs,\n", - " batch_size=batch_size or len(input_df),\n", - " shuffle=False)\n", - "\n", - "\n", - "def get_feature_columns_and_configs_credit(monotonicity):\n", - " \"\"\"Gets TFL feature configs for credit default models.\"\"\"\n", - " feature_columns = [\n", - " tf.feature_column.numeric_column('MARRIAGE'),\n", - " tf.feature_column.numeric_column('PAY_0'),\n", - " ]\n", - " feature_configs = [\n", - " tfl.configs.FeatureConfig(\n", - " name='MARRIAGE',\n", - " lattice_size=2,\n", - " pwl_calibration_num_keypoints=3,\n", - " monotonicity=monotonicity,\n", - " pwl_calibration_always_monotonic=False),\n", - " tfl.configs.FeatureConfig(\n", - " name='PAY_0',\n", - " lattice_size=2,\n", - " pwl_calibration_num_keypoints=10,\n", - " monotonicity=monotonicity,\n", - " pwl_calibration_always_monotonic=False),\n", - " ]\n", - " return feature_columns, feature_configs" + "def plot_predictions_credit(\n", + " input_df,\n", + " model,\n", + " x_col,\n", + " x_label='Repayment Status (April)',\n", + " y_label='Predicted default probability',\n", + "):\n", + " predictions = model.predict((input_df[['MARRIAGE']], input_df[['PAY_0']]))\n", + " predictions = tf.math.sigmoid(predictions)\n", + " new_df = input_df.copy()\n", + " new_df.loc[:, 'predictions'] = predictions\n", + " plot_2d_means_credit(new_df, x_col, 'predictions', x_label, y_label)" ] }, { "cell_type": "markdown", "metadata": { - "id": "iwxnlRrQPdTg" + "id": "UMIpywE1P07H" }, "source": [ - "### Helper functions for visualization of trained model outputs" + "## Train unconstrained (non-monotonic) calibrated lattice model" ] }, { "cell_type": "code", "execution_count": null, "metadata": { - "id": "zVGxEfbhPZ5H" + "id": "cxGu3gBOApOm" }, "outputs": [], "source": [ - "def plot_predictions_credit(input_df,\n", - " estimator,\n", - " x_col,\n", - " x_label='Repayment Status (April)',\n", - " y_label='Predicted default probability'):\n", - " predictions = get_predicted_probabilities(\n", - " estimator=estimator, input_df=input_df, get_input_fn=get_input_fn_credit)\n", - " new_df = input_df.copy()\n", - " new_df.loc[:, 'predictions'] = predictions\n", - " plot_2d_means_credit(new_df, x_col, 'predictions', x_label, y_label)" + "model_config = tfl.configs.CalibratedLatticeConfig(\n", + " feature_configs=[\n", + " tfl.configs.FeatureConfig(\n", + " name='MARRIAGE',\n", + " lattice_size=3,\n", + " pwl_calibration_num_keypoints=2,\n", + " monotonicity=0,\n", + " pwl_calibration_always_monotonic=False,\n", + " ),\n", + " tfl.configs.FeatureConfig(\n", + " name='PAY_0',\n", + " lattice_size=3,\n", + " pwl_calibration_num_keypoints=16,\n", + " monotonicity=0,\n", + " pwl_calibration_always_monotonic=False,\n", + " ),\n", + " ],\n", + " output_calibration=True,\n", + " output_initialization=np.linspace(-2, 2, num=8),\n", + ")" ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": null, "metadata": { - "id": "UMIpywE1P07H" + "id": "cVZKH36LA8BQ" }, + "outputs": [], "source": [ - "## Train unconstrained (non-monotonic) calibrated linear 
model" + "feature_keypoints = tfl.premade_lib.compute_feature_keypoints(\n", + " feature_configs=model_config.feature_configs,\n", + " features=dfs[\"credit_train\"][['MARRIAGE', 'PAY_0', 'default']],\n", + ")\n", + "tfl.premade_lib.set_feature_keypoints(\n", + " feature_configs=model_config.feature_configs,\n", + " feature_keypoints=feature_keypoints,\n", + " add_missing_feature_configs=False,\n", + ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": { - "id": "FfXUKns5cPEw" + "id": "2It6hvNRA8Bi" }, "outputs": [], "source": [ - "nomon_linear_estimator = optimize_learning_rates(\n", - " train_df=credit_train_df,\n", - " val_df=credit_val_df,\n", - " test_df=credit_test_df,\n", - " monotonicity=0,\n", - " learning_rates=LEARNING_RATES,\n", - " batch_size=BATCH_SIZE,\n", - " num_epochs=NUM_EPOCHS,\n", - " get_input_fn=get_input_fn_credit,\n", - " get_feature_columns_and_configs=get_feature_columns_and_configs_credit)" + "nomon_lattice_model = tfl.premade.CalibratedLattice(model_config=model_config)\n", + "\n", + "nomon_lattice_model.compile(\n", + " loss=keras.losses.BinaryCrossentropy(from_logits=True),\n", + " metrics=[\n", + " keras.metrics.BinaryAccuracy(name='accuracy'),\n", + " ],\n", + " optimizer=keras.optimizers.Adam(LEARNING_RATES),\n", + ")\n", + "nomon_lattice_model.fit(datasets['credit_train'], epochs=NUM_EPOCHS, verbose=0)\n", + "\n", + "train_acc = nomon_lattice_model.evaluate(datasets['credit_train'])[1]\n", + "val_acc = nomon_lattice_model.evaluate(datasets['credit_val'])[1]\n", + "test_acc = nomon_lattice_model.evaluate(datasets['credit_test'])[1]\n", + "print(\n", + " 'accuracies for train: %f, val: %f, test: %f'\n", + " % (train_acc, val_acc, test_acc)\n", + ")" ] }, { @@ -1115,7 +1105,7 @@ }, "outputs": [], "source": [ - "plot_predictions_credit(credit_train_df, nomon_linear_estimator, 'PAY_0')" + "plot_predictions_credit(dfs['credit_train'], nomon_lattice_model, 'PAY_0')" ] }, { @@ -1124,7 +1114,19 @@ "id": "0aokp7qLQBIr" }, "source": [ - "## Train monotonic calibrated linear model" + "## Train monotonic calibrated lattice model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "MbB2ixYMC6Za" + }, + "outputs": [], + "source": [ + "model_config.feature_configs[0].monotonicity = 1\n", + "model_config.feature_configs[1].monotonicity = 1" ] }, { @@ -1135,16 +1137,24 @@ }, "outputs": [], "source": [ - "mon_linear_estimator = optimize_learning_rates(\n", - " train_df=credit_train_df,\n", - " val_df=credit_val_df,\n", - " test_df=credit_test_df,\n", - " monotonicity=1,\n", - " learning_rates=LEARNING_RATES,\n", - " batch_size=BATCH_SIZE,\n", - " num_epochs=NUM_EPOCHS,\n", - " get_input_fn=get_input_fn_credit,\n", - " get_feature_columns_and_configs=get_feature_columns_and_configs_credit)" + "mon_lattice_model = tfl.premade.CalibratedLattice(model_config=model_config)\n", + "\n", + "mon_lattice_model.compile(\n", + " loss=keras.losses.BinaryCrossentropy(from_logits=True),\n", + " metrics=[\n", + " keras.metrics.BinaryAccuracy(name='accuracy'),\n", + " ],\n", + " optimizer=keras.optimizers.Adam(LEARNING_RATES),\n", + ")\n", + "mon_lattice_model.fit(datasets['credit_train'], epochs=NUM_EPOCHS, verbose=0)\n", + "\n", + "train_acc = mon_lattice_model.evaluate(datasets['credit_train'])[1]\n", + "val_acc = mon_lattice_model.evaluate(datasets['credit_val'])[1]\n", + "test_acc = mon_lattice_model.evaluate(datasets['credit_test'])[1]\n", + "print(\n", + " 'accuracies for train: %f, val: %f, test: %f'\n", + " % (train_acc, val_acc, 
test_acc)\n", + ")" ] }, { @@ -1155,7 +1165,7 @@ }, "outputs": [], "source": [ - "plot_predictions_credit(credit_train_df, mon_linear_estimator, 'PAY_0')" + "plot_predictions_credit(dfs['credit_train'], mon_lattice_model, 'PAY_0')" ] } ], @@ -1163,6 +1173,8 @@ "colab": { "collapsed_sections": [], "name": "shape_constraints_for_ethics.ipynb", + "private_outputs": true, + "provenance": [], "toc_visible": true }, "kernelspec": { diff --git a/examples/BUILD b/examples/BUILD index 6d6ca86..1b170b7 100644 --- a/examples/BUILD +++ b/examples/BUILD @@ -23,18 +23,6 @@ package( ], ) -py_binary( - name = "canned_estimators_uci_heart", - srcs = ["canned_estimators_uci_heart.py"], - python_version = "PY3", - deps = [ - # tensorflow dep, - # tensorflow:tensorflow_compat_v1_estimator dep, - # tensorflow:tensorflow_estimator dep, - "//tensorflow_lattice", - ], -) - py_binary( name = "keras_sequential_uci_heart", srcs = ["keras_sequential_uci_heart.py"], @@ -54,15 +42,3 @@ py_binary( "//tensorflow_lattice", ], ) - -py_binary( - name = "custom_estimators_uci_heart", - srcs = ["custom_estimators_uci_heart.py"], - python_version = "PY3", - deps = [ - # tensorflow dep, - # tensorflow:tensorflow_compat_v1_estimator dep, - # tensorflow:tensorflow_estimator dep, - "//tensorflow_lattice", - ], -) diff --git a/examples/canned_estimators_uci_heart.py b/examples/canned_estimators_uci_heart.py deleted file mode 100644 index a83c2d1..0000000 --- a/examples/canned_estimators_uci_heart.py +++ /dev/null @@ -1,327 +0,0 @@ -# Copyright 2019 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Example usage of TFL canned estimators. - -This example trains several TFL canned estimators on the UCI heart dataset. - -Example usage: -canned_estimators_uci_heart --config_updates=feature__age__lattice_size=4 -""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import re -from absl import app -from absl import flags -import pandas as pd -import tensorflow as tf -from tensorflow import estimator as tf_estimator -from tensorflow import feature_column as fc -from tensorflow.compat.v1 import estimator as tf_compat_v1_estimator -from tensorflow_lattice import configs -from tensorflow_lattice import estimators - -FLAGS = flags.FLAGS -flags.DEFINE_float('learning_rate', 0.1, 'Learning rate.') -flags.DEFINE_integer('batch_size', 100, 'Batch size.') -flags.DEFINE_integer('num_epochs', 50, 'Number of training epoch.') -flags.DEFINE_integer('prefitting_num_epochs', 10, 'Prefitting epochs.') -flags.DEFINE_list( - 'config_updates', '', - 'Comma separated list of updates to model configs in name=value format.' - 'See tfl.configs.apply_updates().') - - -def main(_): - # Parse configs updates from command line flags. - config_updates = [] - for update in FLAGS.config_updates: - config_updates.extend(re.findall(r'(\S*)\s*=\s*(\S*)', update)) - - # UCI Statlog (Heart) dataset. 
- csv_file = tf.keras.utils.get_file( - 'heart.csv', - 'http://storage.googleapis.com/download.tensorflow.org/data/heart.csv') - df = pd.read_csv(csv_file) - target = df.pop('target') - train_size = int(len(df) * 0.8) - train_x = df[:train_size] - train_y = target[:train_size] - test_x = df[train_size:] - test_y = target[train_size:] - - # feature_analysis_input_fn is used to collect statistics about the input - # features, thus requiring only one loop of the dataset. - # - # feature_analysis_input_fn is required if you have at least one FeatureConfig - # with "pwl_calibration_input_keypoints='quantiles'". Note that 'quantiles' is - # default keypoints configuration so most likely you'll need it. - feature_analysis_input_fn = tf_compat_v1_estimator.inputs.pandas_input_fn( - x=train_x, - y=train_y, - shuffle=False, - batch_size=FLAGS.batch_size, - num_epochs=1, - num_threads=1) - - # prefitting_input_fn is used to prefit an initial ensemble that is used to - # estimate feature interactions. This prefitting step does not need to fully - # converge and thus requiring fewer epochs than the main training. - # - # prefitting_input_fn is only required if your model_config is - # CalibratedLatticeEnsembleConfig with "lattices='crystals'" - prefitting_input_fn = tf_compat_v1_estimator.inputs.pandas_input_fn( - x=train_x, - y=train_y, - shuffle=True, - batch_size=FLAGS.batch_size, - num_epochs=FLAGS.prefitting_num_epochs, - num_threads=1) - - train_input_fn = tf_compat_v1_estimator.inputs.pandas_input_fn( - x=train_x, - y=train_y, - shuffle=True, - batch_size=FLAGS.batch_size, - num_epochs=FLAGS.num_epochs, - num_threads=1) - - test_input_fn = tf_compat_v1_estimator.inputs.pandas_input_fn( - x=test_x, - y=test_y, - shuffle=False, - batch_size=FLAGS.batch_size, - num_epochs=FLAGS.num_epochs, - num_threads=1) - - # Feature columns. - # - age - # - sex - # - cp chest pain type (4 values) - # - trestbps resting blood pressure - # - chol serum cholestoral in mg/dl - # - fbs fasting blood sugar > 120 mg/dl - # - restecg resting electrocardiographic results (values 0,1,2) - # - thalach maximum heart rate achieved - # - exang exercise induced angina - # - oldpeak ST depression induced by exercise relative to rest - # - slope the slope of the peak exercise ST segment - # - ca number of major vessels (0-3) colored by flourosopy - # - thal 3 = normal; 6 = fixed defect; 7 = reversable defect - feature_columns = [ - fc.numeric_column('age', default_value=-1), - fc.categorical_column_with_vocabulary_list('sex', [0, 1]), - fc.numeric_column('cp'), - fc.numeric_column('trestbps', default_value=-1), - fc.numeric_column('chol'), - fc.categorical_column_with_vocabulary_list('fbs', [0, 1]), - fc.categorical_column_with_vocabulary_list('restecg', [0, 1, 2]), - fc.numeric_column('thalach'), - fc.categorical_column_with_vocabulary_list('exang', [0, 1]), - fc.numeric_column('oldpeak'), - fc.categorical_column_with_vocabulary_list('slope', [0, 1, 2]), - fc.numeric_column('ca'), - fc.categorical_column_with_vocabulary_list( - 'thal', ['normal', 'fixed', 'reversible']), - ] - - # Feature configs are used to specify how each feature is calibrated and used. - feature_configs = [ - configs.FeatureConfig( - name='age', - lattice_size=3, - # By default, input keypoints of pwl are quantiles of the feature. - pwl_calibration_num_keypoints=5, - monotonicity='increasing', - pwl_calibration_clip_max=100, - ), - configs.FeatureConfig( - name='cp', - pwl_calibration_num_keypoints=4, - # Keypoints can be uniformly spaced. 
- pwl_calibration_input_keypoints='uniform', - monotonicity='increasing', - ), - configs.FeatureConfig( - name='chol', - # Explicit input keypoint initialization. - pwl_calibration_input_keypoints=[126.0, 210.0, 247.0, 286.0, 564.0], - monotonicity='increasing', - pwl_calibration_clip_min=130, - # Calibration can be forced to span the full output range by clamping. - pwl_calibration_clamp_min=True, - pwl_calibration_clamp_max=True, - # Per feature regularization. - regularizer_configs=[ - configs.RegularizerConfig(name='calib_hessian', l2=1e-4), - ], - ), - configs.FeatureConfig( - name='fbs', - # Monotonicity: output for 1 should be larger than output for 0. - monotonicity=[(0, 1)], - ), - configs.FeatureConfig( - name='trestbps', - pwl_calibration_num_keypoints=5, - monotonicity='decreasing', - ), - configs.FeatureConfig( - name='thalach', - pwl_calibration_num_keypoints=5, - monotonicity='decreasing', - ), - configs.FeatureConfig( - name='restecg', - # Categorical monotonicity can be partial order. - monotonicity=[(0, 1), (0, 2)], - ), - configs.FeatureConfig( - name='exang', - monotonicity=[(0, 1)], - ), - configs.FeatureConfig( - name='oldpeak', - pwl_calibration_num_keypoints=5, - monotonicity='increasing', - ), - configs.FeatureConfig( - name='slope', - monotonicity=[(0, 1), (1, 2)], - ), - configs.FeatureConfig( - name='ca', - pwl_calibration_num_keypoints=4, - monotonicity='increasing', - ), - configs.FeatureConfig( - name='thal', - monotonicity=[('normal', 'fixed'), ('normal', 'reversible')], - ), - ] - - # Serving input fn is used to create saved models. - serving_input_fn = ( - tf_estimator.export.build_parsing_serving_input_receiver_fn( - feature_spec=fc.make_parse_example_spec(feature_columns))) - - # Model config defines the model strcutre for the estimator. - # This is calibrated linear model with outputput calibration: Inputs are - # calibrated, linearly combined and the output of the linear layer is - # calibrated again using a PWL function. - model_config = configs.CalibratedLinearConfig( - feature_configs=feature_configs, - use_bias=True, - output_calibration=True, - regularizer_configs=[ - # Regularizer for the output calibrator. - configs.RegularizerConfig(name='output_calib_hessian', l2=1e-4), - ]) - # Update model configuration. - # See tfl.configs.apply_updates for details. - configs.apply_updates(model_config, config_updates) - estimator = estimators.CannedClassifier( - feature_columns=feature_columns, - model_config=model_config, - feature_analysis_input_fn=feature_analysis_input_fn, - optimizer=tf.keras.optimizers.legacy.Adam(FLAGS.learning_rate)) - estimator.train(input_fn=train_input_fn) - results = estimator.evaluate(input_fn=test_input_fn) - print('Calibrated linear results: {}'.format(results)) - print('Calibrated linear model exported to {}'.format( - estimator.export_saved_model(estimator.model_dir, serving_input_fn))) - - # This is calibrated lattice model: Inputs are calibrated, then combined - # non-linearly using a lattice layer. - model_config = configs.CalibratedLatticeConfig( - feature_configs=feature_configs, - regularizer_configs=[ - # Torsion regularizer applied to the lattice to make it more linear. - configs.RegularizerConfig(name='torsion', l2=1e-4), - # Globally defined calibration regularizer is applied to all features. 
- configs.RegularizerConfig(name='calib_hessian', l2=1e-4), - ]) - estimator = estimators.CannedClassifier( - feature_columns=feature_columns, - model_config=model_config, - feature_analysis_input_fn=feature_analysis_input_fn, - optimizer=tf.keras.optimizers.legacy.Adam(FLAGS.learning_rate)) - estimator.train(input_fn=train_input_fn) - results = estimator.evaluate(input_fn=test_input_fn) - print('Calibrated lattice results: {}'.format(results)) - print('Calibrated lattice model exported to {}'.format( - estimator.export_saved_model(estimator.model_dir, serving_input_fn))) - - # This is random lattice ensemble model with separate calibration: - # model output is the average output of separately calibrated lattices. - model_config = configs.CalibratedLatticeEnsembleConfig( - feature_configs=feature_configs, - num_lattices=6, - lattice_rank=5, - separate_calibrators=True, - regularizer_configs=[ - # Torsion regularizer applied to the lattice to make it more linear. - configs.RegularizerConfig(name='torsion', l2=1e-4), - # Globally defined calibration regularizer is applied to all features. - configs.RegularizerConfig(name='calib_hessian', l2=1e-4), - ]) - configs.apply_updates(model_config, config_updates) - estimator = estimators.CannedClassifier( - feature_columns=feature_columns, - model_config=model_config, - feature_analysis_input_fn=feature_analysis_input_fn, - optimizer=tf.keras.optimizers.legacy.Adam(FLAGS.learning_rate)) - estimator.train(input_fn=train_input_fn) - results = estimator.evaluate(input_fn=test_input_fn) - print('Random ensemble results: {}'.format(results)) - print('Random ensemble model exported to {}'.format( - estimator.export_saved_model(estimator.model_dir, serving_input_fn))) - - # This is Crystals ensemble model with separate calibration: model output is - # the average output of separately calibrated lattices. - # Crystals algorithm first trains a prefitting model and uses the interactions - # between features to form the final lattice ensemble. - model_config = configs.CalibratedLatticeEnsembleConfig( - feature_configs=feature_configs, - # Using Crystals algorithm. - lattices='crystals', - num_lattices=6, - lattice_rank=5, - separate_calibrators=True, - regularizer_configs=[ - # Torsion regularizer applied to the lattice to make it more linear. - configs.RegularizerConfig(name='torsion', l2=1e-4), - # Globally defined calibration regularizer is applied to all features. - configs.RegularizerConfig(name='calib_hessian', l2=1e-4), - ]) - configs.apply_updates(model_config, config_updates) - estimator = estimators.CannedClassifier( - feature_columns=feature_columns, - model_config=model_config, - feature_analysis_input_fn=feature_analysis_input_fn, - # prefitting_input_fn is required to train the prefitting model. 
- prefitting_input_fn=prefitting_input_fn, - optimizer=tf.keras.optimizers.legacy.Adam(FLAGS.learning_rate)) - estimator.train(input_fn=train_input_fn) - results = estimator.evaluate(input_fn=test_input_fn) - print('Crystals ensemble results: {}'.format(results)) - print('Crystals ensemble model exported to {}'.format( - estimator.export_saved_model(estimator.model_dir, serving_input_fn))) - - -if __name__ == '__main__': - app.run(main) diff --git a/examples/custom_estimators_uci_heart.py b/examples/custom_estimators_uci_heart.py deleted file mode 100644 index 58c092f..0000000 --- a/examples/custom_estimators_uci_heart.py +++ /dev/null @@ -1,179 +0,0 @@ -# Copyright 2019 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Example usage of TFL layers in custom estimators. - -This example trains a TFL custom estimators on the UCI heart dataset. - -Example usage: -custom_estimators_uci_heart --num_epochs=5000 -""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from absl import app -from absl import flags -import numpy as np -import pandas as pd -import tensorflow as tf -from tensorflow import estimator as tf_estimator -from tensorflow import feature_column as fc -from tensorflow.compat.v1 import estimator as tf_compat_v1_estimator -import tensorflow_lattice as tfl -from tensorflow_estimator.python.estimator.canned import optimizers -from tensorflow_estimator.python.estimator.head import binary_class_head - -FLAGS = flags.FLAGS -flags.DEFINE_float('learning_rate', 0.01, 'Learning rate.') -flags.DEFINE_integer('batch_size', 100, 'Batch size.') -flags.DEFINE_integer('num_epochs', 2000, 'Number of training epoch.') - - -def main(_): - # UCI Statlog (Heart) dataset. - csv_file = tf.keras.utils.get_file( - 'heart.csv', - 'http://storage.googleapis.com/download.tensorflow.org/data/heart.csv') - df = pd.read_csv(csv_file) - target = df.pop('target') - train_size = int(len(df) * 0.8) - train_x = df[:train_size] - train_y = target[:train_size] - test_x = df[train_size:] - test_y = target[train_size:] - - train_input_fn = tf_compat_v1_estimator.inputs.pandas_input_fn( - x=train_x, - y=train_y, - shuffle=True, - batch_size=FLAGS.batch_size, - num_epochs=FLAGS.num_epochs, - num_threads=1) - - test_input_fn = tf_compat_v1_estimator.inputs.pandas_input_fn( - x=test_x, - y=test_y, - shuffle=False, - batch_size=FLAGS.batch_size, - num_epochs=FLAGS.num_epochs, - num_threads=1) - - # Feature columns. 
- # - age - # - sex - # - cp chest pain type (4 values) - # - trestbps resting blood pressure - # - chol serum cholestoral in mg/dl - # - fbs fasting blood sugar > 120 mg/dl - # - restecg resting electrocardiographic results (values 0,1,2) - # - thalach maximum heart rate achieved - # - exang exercise induced angina - # - oldpeak ST depression induced by exercise relative to rest - # - slope the slope of the peak exercise ST segment - # - ca number of major vessels (0-3) colored by flourosopy - # - thal 3 = normal; 6 = fixed defect; 7 = reversable defect - feature_columns = [ - fc.numeric_column('age', default_value=-1), - fc.categorical_column_with_vocabulary_list('sex', [0, 1]), - fc.numeric_column('ca'), - fc.categorical_column_with_vocabulary_list( - 'thal', ['normal', 'fixed', 'reversible']), - ] - - def model_fn(features, labels, mode, config): - """model_fn for the custom estimator.""" - del config - input_tensors = tfl.estimators.transform_features(features, feature_columns) - inputs = { - key: tf.keras.layers.Input(shape=(1,), name=key) - for key in input_tensors - } - - lattice_sizes = [3, 2, 2, 2] - lattice_monotonicities = ['increasing', 'none', 'increasing', 'increasing'] - lattice_input = tf.keras.layers.Concatenate(axis=1)([ - tfl.layers.PWLCalibration( - input_keypoints=np.linspace(10, 100, num=8, dtype=np.float32), - # The output range of the calibrator should be the input range of - # the following lattice dimension. - output_min=0.0, - output_max=lattice_sizes[0] - 1.0, - monotonicity='increasing', - )(inputs['age']), - tfl.layers.CategoricalCalibration( - # Number of categories including any missing/default category. - num_buckets=2, - output_min=0.0, - output_max=lattice_sizes[1] - 1.0, - )(inputs['sex']), - tfl.layers.PWLCalibration( - input_keypoints=[0.0, 1.0, 2.0, 3.0], - output_min=0.0, - output_max=lattice_sizes[2] - 1.0, - # You can specify TFL regularizers as tuple - # ('regularizer name', l1, l2). - kernel_regularizer=('hessian', 0.0, 1e-4), - monotonicity='increasing', - )(inputs['ca']), - tfl.layers.CategoricalCalibration( - num_buckets=3, - output_min=0.0, - output_max=lattice_sizes[3] - 1.0, - # Categorical monotonicity can be partial order. - # (i, j) indicates that we must have output(i) <= output(i). - # Make sure to set the lattice monotonicity to 1 for this dimension. - monotonicities=[(0, 1), (0, 2)], - )(inputs['thal']), - ]) - output = tfl.layers.Lattice( - lattice_sizes=lattice_sizes, - monotonicities=lattice_monotonicities, - # Add a kernel_initializer so that the Lattice is not initialized as a - # flat plane. The output_min and output_max could be arbitrary, as long - # as output_min < output_max. 
- kernel_initializer=tfl.lattice_layer.RandomMonotonicInitializer( - lattice_sizes=lattice_sizes, output_min=-10, output_max=10), - )( - lattice_input) - - training = (mode == tf_estimator.ModeKeys.TRAIN) - model = tf.keras.Model(inputs=inputs, outputs=output) - logits = model(input_tensors, training=training) - - if training: - optimizer = optimizers.get_optimizer_instance_v2('Adam', - FLAGS.learning_rate) - else: - optimizer = None - - head = binary_class_head.BinaryClassHead() - return head.create_estimator_spec( - features=features, - mode=mode, - labels=labels, - optimizer=optimizer, - logits=logits, - trainable_variables=model.trainable_variables, - update_ops=model.updates) - - estimator = tf_estimator.Estimator(model_fn=model_fn) - estimator.train(input_fn=train_input_fn) - results = estimator.evaluate(input_fn=test_input_fn) - print('Results: {}'.format(results)) - - -if __name__ == '__main__': - app.run(main) diff --git a/examples/keras_functional_uci_heart.py b/examples/keras_functional_uci_heart.py index 8031241..e20959a 100644 --- a/examples/keras_functional_uci_heart.py +++ b/examples/keras_functional_uci_heart.py @@ -70,8 +70,14 @@ import pandas as pd import tensorflow as tf -from tensorflow import keras import tensorflow_lattice as tfl +# pylint: disable=g-import-not-at-top +# Use Keras 2. +version_fn = getattr(tf.keras, 'version', None) +if version_fn and version_fn().startswith('3.'): + import tf_keras as keras +else: + keras = tf.keras FLAGS = flags.FLAGS flags.DEFINE_integer('num_epochs', 200, 'Number of training epoch.') @@ -79,9 +85,10 @@ def main(_): # UCI Statlog (Heart) dataset. - csv_file = tf.keras.utils.get_file( + csv_file = keras.utils.get_file( 'heart.csv', - 'http://storage.googleapis.com/download.tensorflow.org/data/heart.csv') + 'http://storage.googleapis.com/download.tensorflow.org/data/heart.csv', + ) training_data_df = pd.read_csv(csv_file).sample( frac=1.0, random_state=41).reset_index(drop=True) diff --git a/examples/keras_sequential_uci_heart.py b/examples/keras_sequential_uci_heart.py index 7570ce9..e4f5b48 100644 --- a/examples/keras_sequential_uci_heart.py +++ b/examples/keras_sequential_uci_heart.py @@ -64,8 +64,14 @@ import pandas as pd import tensorflow as tf -from tensorflow import keras import tensorflow_lattice as tfl +# pylint: disable=g-import-not-at-top +# Use Keras 2. +version_fn = getattr(tf.keras, 'version', None) +if version_fn and version_fn().startswith('3.'): + import tf_keras as keras +else: + keras = tf.keras FLAGS = flags.FLAGS flags.DEFINE_integer('num_epochs', 200, 'Number of training epoch.') @@ -73,9 +79,10 @@ def main(_): # UCI Statlog (Heart) dataset. - csv_file = tf.keras.utils.get_file( + csv_file = keras.utils.get_file( 'heart.csv', - 'http://storage.googleapis.com/download.tensorflow.org/data/heart.csv') + 'http://storage.googleapis.com/download.tensorflow.org/data/heart.csv', + ) training_data_df = pd.read_csv(csv_file).sample( frac=1.0, random_state=41).reset_index(drop=True) diff --git a/setup.py b/setup.py index 464a719..54a3708 100644 --- a/setup.py +++ b/setup.py @@ -27,7 +27,7 @@ # This version number should always be that of the *next* (unreleased) version. # Immediately after uploading a package to PyPI, you should increment the # version number and push to gitHub. 
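The custom-estimator example deleted above wires PWL and categorical calibrators into a `tfl.layers.Lattice` through the Keras functional API; the same wiring works in a plain Keras model, which is the pattern the remaining examples keep. Below is a minimal sketch using only the layer arguments shown in the deleted code; the feature names and keypoints are illustrative, and a Keras 2 environment is assumed (TF releases that bundle Keras 3 need the `tf_keras` shim introduced in this change).

```python
import numpy as np
import tensorflow as tf
import tensorflow_lattice as tfl

lattice_sizes = [3, 2]

age_input = tf.keras.layers.Input(shape=(1,), name='age')
sex_input = tf.keras.layers.Input(shape=(1,), name='sex', dtype='int32')

lattice_input = tf.keras.layers.Concatenate(axis=1)([
    tfl.layers.PWLCalibration(
        input_keypoints=np.linspace(10, 100, num=8, dtype=np.float32),
        # The calibrator output range must match the lattice input range.
        output_min=0.0,
        output_max=lattice_sizes[0] - 1.0,
        monotonicity='increasing',
    )(age_input),
    tfl.layers.CategoricalCalibration(
        num_buckets=2,
        output_min=0.0,
        output_max=lattice_sizes[1] - 1.0,
    )(sex_input),
])
output = tfl.layers.Lattice(
    lattice_sizes=lattice_sizes,
    monotonicities=['increasing', 'none'],
)(lattice_input)

model = tf.keras.Model(inputs=[age_input, sex_input], outputs=output)
model.compile(loss='mse', optimizer=tf.keras.optimizers.Adam(0.01))
```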
-__version__ = "2.0.13" +__version__ = "2.1.0" if "--release" in sys.argv: sys.argv.remove("--release") @@ -45,6 +45,7 @@ "scikit-learn", "matplotlib", "graphviz", + "tf-keras", ] # Part of the visualization code uses colabtools and IPython libraries. These @@ -77,10 +78,9 @@ _long_description = """\ TensorFlow Lattice is a library that implements fast-to-evaluate and interpretable (optionally monotonic) lattice based models, which are also known -as *interpolated look-up tables*. The library includes a collection of -Estimators, which operate like any TensorFlow Estimator. It also includes -Keras layers for lattices and feature calibration that can be composed -into custom models. +as *interpolated look-up tables*. The library includes a collection of Keras +layers for lattices and feature calibration that can be composed into custom +models or used inside generic premade models. """ setup( diff --git a/tensorflow_lattice/BUILD b/tensorflow_lattice/BUILD index 8e13739..512d8f2 100644 --- a/tensorflow_lattice/BUILD +++ b/tensorflow_lattice/BUILD @@ -38,7 +38,6 @@ py_library( "//tensorflow_lattice/python:conditional_cdf", "//tensorflow_lattice/python:conditional_pwl_calibration", "//tensorflow_lattice/python:configs", - "//tensorflow_lattice/python:estimators", "//tensorflow_lattice/python:kronecker_factored_lattice_layer", "//tensorflow_lattice/python:kronecker_factored_lattice_lib", "//tensorflow_lattice/python:lattice_layer", @@ -54,6 +53,5 @@ py_library( "//tensorflow_lattice/python:rtl_layer", "//tensorflow_lattice/python:test_utils", "//tensorflow_lattice/python:utils", - "//tensorflow_lattice/python:visualization", ], ) diff --git a/tensorflow_lattice/__init__.py b/tensorflow_lattice/__init__.py index f1259aa..ba7b625 100644 --- a/tensorflow_lattice/__init__.py +++ b/tensorflow_lattice/__init__.py @@ -27,7 +27,6 @@ from tensorflow_lattice.python import conditional_cdf from tensorflow_lattice.python import conditional_pwl_calibration from tensorflow_lattice.python import configs -from tensorflow_lattice.python import estimators from tensorflow_lattice.python import kronecker_factored_lattice_layer from tensorflow_lattice.python import kronecker_factored_lattice_lib from tensorflow_lattice.python import lattice_layer @@ -42,4 +41,3 @@ from tensorflow_lattice.python import pwl_calibration_lib from tensorflow_lattice.python import test_utils from tensorflow_lattice.python import utils -from tensorflow_lattice.python import visualization diff --git a/tensorflow_lattice/python/BUILD b/tensorflow_lattice/python/BUILD index c0f95fe..df3f448 100644 --- a/tensorflow_lattice/python/BUILD +++ b/tensorflow_lattice/python/BUILD @@ -140,46 +140,6 @@ py_test( ], ) -py_library( - name = "estimators", - srcs = ["estimators.py"], - srcs_version = "PY2AND3", - deps = [ - ":categorical_calibration_layer", - ":configs", - ":kronecker_factored_lattice_layer", - ":lattice_layer", - ":linear_layer", - ":model_info", - ":premade", - ":premade_lib", - ":pwl_calibration_layer", - ":rtl_layer", - # absl/logging dep, - # tensorflow dep, - # tensorflow:tensorflow_estimator dep, - ], -) - -py_test( - name = "estimators_test", - size = "enormous", - srcs = ["estimators_test.py"], - python_version = "PY3", - # shard_count = 10, - srcs_version = "PY2AND3", - deps = [ - ":configs", - ":estimators", - ":model_info", - # absl/logging dep, - # sklearn dep, - # tensorflow dep, - # tensorflow:tensorflow_compat_v1_estimator dep, - # tensorflow:tensorflow_estimator dep, - ], -) - py_library( name = "internal_utils", srcs = 
["internal_utils.py"], @@ -348,7 +308,6 @@ py_test( deps = [ ":lattice_layer", ":parallel_combination_layer", - # absl/logging dep, # absl/testing:parameterized dep, # numpy dep, # tensorflow dep, @@ -393,7 +352,6 @@ py_library( # numpy dep, # six dep, # tensorflow dep, - # tensorflow:tensorflow_estimator dep, ], ) @@ -488,7 +446,6 @@ py_test( ":linear_layer", ":pwl_calibration_layer", ":rtl_layer", - # absl/logging dep, # absl/testing:parameterized dep, # numpy dep, # tensorflow dep, @@ -500,7 +457,6 @@ py_library( srcs = ["test_utils.py"], srcs_version = "PY2AND3", deps = [ - ":visualization", # absl/logging dep, # numpy dep, ], @@ -562,15 +518,3 @@ py_test( # tensorflow dep, ], ) - -py_library( - name = "visualization", - srcs = ["visualization.py"], - srcs_version = "PY2AND3", - deps = [ - ":model_info", - # graphviz dep, - # matplotlib dep, - # numpy dep, - ], -) diff --git a/tensorflow_lattice/python/aggregation_layer.py b/tensorflow_lattice/python/aggregation_layer.py index d59f17e..a4a33a4 100644 --- a/tensorflow_lattice/python/aggregation_layer.py +++ b/tensorflow_lattice/python/aggregation_layer.py @@ -24,7 +24,13 @@ from __future__ import print_function import tensorflow as tf -from tensorflow import keras +# pylint: disable=g-import-not-at-top +# Use Keras 2. +version_fn = getattr(tf.keras, 'version', None) +if version_fn and version_fn().startswith('3.'): + import tf_keras as keras +else: + keras = tf.keras class Aggregation(keras.layers.Layer): @@ -46,7 +52,7 @@ class Aggregation(keras.layers.Layer): Example: ```python - model = tf.keras.Model(inputs=inputs, outputs=outputs) + model = keras.Model(inputs=inputs, outputs=outputs) layer = tfl.layers.Aggregation(model) ``` """ @@ -56,14 +62,14 @@ def __init__(self, model, **kwargs): """initializes an instance of `Aggregation`. Args: - model: A tf.keras.Model instance. - **kwargs: Other args passed to `tf.keras.layers.Layer` initializer. + model: A keras.Model instance. + **kwargs: Other args passed to `keras.layers.Layer` initializer. Raises: - ValueError: if model is not at `tf.keras.Model` instance. + ValueError: if model is not at `keras.Model` instance. """ - if not isinstance(model, tf.keras.Model): - raise ValueError('Model must be a tf.keras.Model instance.') + if not isinstance(model, keras.Model): + raise ValueError('Model must be a keras.Model instance.') super(Aggregation, self).__init__(**kwargs) # This flag enables inputs to be Ragged Tensors self._supports_ragged_inputs = True @@ -77,13 +83,13 @@ def get_config(self): """Standard Keras get_config() method.""" config = super(Aggregation, self).get_config().copy() config.update( - {'model': tf.keras.utils.legacy.serialize_keras_object(self.model)} + {'model': keras.utils.legacy.serialize_keras_object(self.model)} ) return config @classmethod def from_config(cls, config, custom_objects=None): - model = tf.keras.utils.legacy.deserialize_keras_object( + model = keras.utils.legacy.deserialize_keras_object( config.pop('model'), custom_objects=custom_objects ) return cls(model, **config) diff --git a/tensorflow_lattice/python/aggregation_test.py b/tensorflow_lattice/python/aggregation_test.py index 69e3921..e7d17c2 100644 --- a/tensorflow_lattice/python/aggregation_test.py +++ b/tensorflow_lattice/python/aggregation_test.py @@ -19,6 +19,14 @@ import tensorflow as tf from tensorflow_lattice.python import aggregation_layer +# pylint: disable=g-import-not-at-top +# Use Keras 2. 
+version_fn = getattr(tf.keras, 'version', None) +if version_fn and version_fn().startswith('3.'): + import tf_keras as keras +else: + keras = tf.keras + test_input = [ tf.ragged.constant([[1, 2], [1, 2, 3], [3]]), @@ -32,14 +40,14 @@ class AggregationTest(tf.test.TestCase): def testAggregationLayer(self): - # First we test our assertion that the model must be a tf.keras.Model + # First we test our assertion that the model must be a keras.Model with self.assertRaisesRegex(ValueError, - 'Model must be a tf.keras.Model instance.'): + 'Model must be a keras.Model instance.'): aggregation_layer.Aggregation(None) # Now let's make sure our layer aggregates properly. - inputs = [tf.keras.Input(shape=()) for _ in range(len(test_input))] - output = tf.keras.layers.multiply(inputs) - model = tf.keras.Model(inputs=inputs, outputs=output) + inputs = [keras.Input(shape=()) for _ in range(len(test_input))] + output = keras.layers.multiply(inputs) + model = keras.Model(inputs=inputs, outputs=output) agg_layer = aggregation_layer.Aggregation(model) self.assertAllEqual(agg_layer(test_input), expected_output) diff --git a/tensorflow_lattice/python/categorical_calibration_layer.py b/tensorflow_lattice/python/categorical_calibration_layer.py index 996695e..30fe72c 100644 --- a/tensorflow_lattice/python/categorical_calibration_layer.py +++ b/tensorflow_lattice/python/categorical_calibration_layer.py @@ -21,10 +21,15 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function - -from . import categorical_calibration_lib import tensorflow as tf -from tensorflow import keras +# pylint: disable=g-import-not-at-top +# Use Keras 2. +version_fn = getattr(tf.keras, "version", None) +if version_fn and version_fn().startswith("3."): + import tf_keras as keras +else: + keras = tf.keras +from . import categorical_calibration_lib DEFAULT_INPUT_VALUE_NAME = "default_input_value" CATEGORICAL_CALIBRATION_KERNEL_NAME = "categorical_calibration_kernel" @@ -124,7 +129,7 @@ def __init__(self, be treated as default and mapped to the last bucket. split_outputs: Whether to split the output tensor into a list of outputs for each unit. Ignored if units < 2. - **kwargs: Other args passed to `tf.keras.layers.Layer` initializer. + **kwargs: Other args passed to `keras.layers.Layer` initializer. Raises: ValueError: If layer hyperparameters are invalid. diff --git a/tensorflow_lattice/python/categorical_calibration_test.py b/tensorflow_lattice/python/categorical_calibration_test.py index a0f0e08..ccd4e82 100644 --- a/tensorflow_lattice/python/categorical_calibration_test.py +++ b/tensorflow_lattice/python/categorical_calibration_test.py @@ -24,10 +24,16 @@ from absl.testing import parameterized import numpy as np import tensorflow as tf -from tensorflow import keras from tensorflow_lattice.python import categorical_calibration_layer as categorical_calibraion from tensorflow_lattice.python import parallel_combination_layer as parallel_combination from tensorflow_lattice.python import test_utils +# pylint: disable=g-import-not-at-top +# Use Keras 2. 
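The aggregation test above exercises `tfl.layers.Aggregation` end to end: a small Keras model is applied at each position of the ragged inputs and the per-position outputs are combined into one value per example. A hedged sketch of the same usage outside the test harness follows; the averaging behaviour and the three-input setup are inferred from the layer docstring and the (truncated) expected output in the test.

```python
import tensorflow as tf
import tensorflow_lattice as tfl

# Middle model: multiplies its scalar inputs elementwise.
inputs = [tf.keras.Input(shape=()) for _ in range(3)]
output = tf.keras.layers.multiply(inputs)
middle_model = tf.keras.Model(inputs=inputs, outputs=output)

# Aggregation applies middle_model at every ragged position and aggregates
# the per-position results into one output per example.
agg_layer = tfl.layers.Aggregation(middle_model)

ragged_inputs = [
    tf.ragged.constant([[1, 2], [1, 2, 3], [3]]),
    tf.ragged.constant([[4, 5], [4, 5, 6], [6]]),
    tf.ragged.constant([[7, 8], [7, 8, 9], [9]]),
]
print(agg_layer(ragged_inputs))  # One aggregated value per example (3 total).
```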
+version_fn = getattr(tf.keras, "version", None) +if version_fn and version_fn().startswith("3."): + import tf_keras as keras +else: + keras = tf.keras class CategoricalCalibrationLayerTest(parameterized.TestCase, tf.test.TestCase): @@ -37,7 +43,7 @@ def setUp(self): self._disable_all = False self._loss_eps = 1e-2 self._loss_diff_eps = 1e-4 - tf.keras.utils.set_random_seed(42) + keras.utils.set_random_seed(42) def _ResetAllBackends(self): keras.backend.clear_session() @@ -78,14 +84,12 @@ def _SetDefaults(self, config): config.setdefault("model_dir", "/tmp/test_pwl_model_dir/") return config - def _TrainModel(self, config, plot_path=None): + def _TrainModel(self, config): """Trains model and returns loss. Args: config: Layer config internal for this test which specifies params of piecewise linear layer to train. - plot_path: if specified - png file name to save visualization. See - test_utils.run_training_loop() for more details. Returns: Training loss. @@ -150,13 +154,12 @@ def _TrainModel(self, config, plot_path=None): loss=keras.losses.mean_squared_error, optimizer=config["optimizer"](learning_rate=config["learning_rate"])) - training_data = (training_inputs, training_labels, training_inputs) + training_data = (training_inputs, training_labels) loss = test_utils.run_training_loop( config=config, training_data=training_data, keras_model=model, - plot_path=plot_path, input_dtype=np.int32) assetion_ops = [] @@ -176,7 +179,7 @@ def testUnconstrainedNoMissingValue(self, y_function): config = { "num_training_records": 200, "num_training_epoch": 500, - "optimizer": tf.keras.optimizers.Adam, + "optimizer": keras.optimizers.Adam, "learning_rate": 0.15, "x_generator": self._ScatterXUniformly, "y_function": y_function, @@ -201,7 +204,7 @@ def testUnconstrainedWithMissingValue(self, y_function): config = { "num_training_records": 200, "num_training_epoch": 500, - "optimizer": tf.keras.optimizers.Adam, + "optimizer": keras.optimizers.Adam, "learning_rate": 0.15, "x_generator": self._ScatterXUniformly, "y_function": y_function, @@ -238,7 +241,7 @@ def testConstraints(self, output_min, output_max, monotonicities, config = { "num_training_records": 1000, "num_training_epoch": 1000, - "optimizer": tf.keras.optimizers.Adam, + "optimizer": keras.optimizers.Adam, "learning_rate": 1.0, "x_generator": self._ScatterXUniformly, "y_function": np.mean, @@ -275,7 +278,7 @@ def testCircularMonotonicites(self): config = { "num_training_records": 200, "num_training_epoch": 500, - "optimizer": tf.keras.optimizers.Adam, + "optimizer": keras.optimizers.Adam, "learning_rate": 0.15, "x_generator": self._ScatterXUniformly, "y_function": float, @@ -301,7 +304,7 @@ def testRegularizers(self, regularizer): config = { "num_training_records": 20, "num_training_epoch": 0, - "optimizer": tf.keras.optimizers.Adam, + "optimizer": keras.optimizers.Adam, "learning_rate": 1.0, "x_generator": self._ScatterXUniformly, "y_function": lambda _: 2.0, @@ -322,7 +325,7 @@ def testOutputShape(self): # Not Splitting units = 10 input_shape, output_shape = (units,), (None, units) - input_a = tf.keras.layers.Input(shape=input_shape) + input_a = keras.layers.Input(shape=input_shape) cat_cal_0 = categorical_calibraion.CategoricalCalibration( num_buckets=3, units=units) output = cat_cal_0(input_a) diff --git a/tensorflow_lattice/python/cdf_layer.py b/tensorflow_lattice/python/cdf_layer.py index a82dc15..4da29b1 100644 --- a/tensorflow_lattice/python/cdf_layer.py +++ b/tensorflow_lattice/python/cdf_layer.py @@ -21,11 +21,18 @@ from __future__ import 
division from __future__ import print_function -from . import utils import tensorflow as tf +# pylint: disable=g-import-not-at-top +# Use Keras 2. +version_fn = getattr(tf.keras, "version", None) +if version_fn and version_fn().startswith("3."): + import tf_keras as keras +else: + keras = tf.keras +from . import utils -class CDF(tf.keras.layers.Layer): +class CDF(keras.layers.Layer): # pyformat: disable """Cumulative Distribution Function (CDF) layer. @@ -128,7 +135,7 @@ def __init__(self, - `'random_uniform'`: initializes parameters as uniform random functions in the range [0, 1]. - Any Keras initializer object. - **kwargs: Any additional `tf.keras.layers.Layer` arguments. + **kwargs: Any additional `keras.layers.Layer` arguments. """ # pyformat: enable super(CDF, self).__init__(**kwargs) @@ -179,15 +186,15 @@ def build(self, input_shape): elif self.input_scaling_type == "learned_shared": self.input_scaling = self.add_weight( "input_scaling", - initializer=tf.keras.initializers.Constant(self.input_scaling_init), - constraint=tf.keras.constraints.NonNeg() + initializer=keras.initializers.Constant(self.input_scaling_init), + constraint=keras.constraints.NonNeg() if self.input_scaling_monotonicity else None, shape=[1]) elif self.input_scaling_type == "learned_per_input": self.input_scaling = self.add_weight( "input_scaling", - initializer=tf.keras.initializers.Constant(self.input_scaling_init), - constraint=tf.keras.constraints.NonNeg() + initializer=keras.initializers.Constant(self.input_scaling_init), + constraint=keras.constraints.NonNeg() if self.input_scaling_monotonicity else None, shape=[1, input_dim, 1, 1]) else: @@ -255,7 +262,7 @@ def get_config(self): "sparsity_factor": self.sparsity_factor, "kernel_initializer": - tf.keras.initializers.serialize( + keras.initializers.serialize( self.kernel_initializer, use_legacy_format=True), } config.update(super(CDF, self).get_config()) @@ -276,6 +283,6 @@ def create_kernel_initializer(kernel_initializer_id): The Keras initializer object for the `tfl.layers.CDF` kernel variable. """ if kernel_initializer_id in ["random_uniform", "RandomUniform"]: - return tf.keras.initializers.RandomUniform(0.0, 1.0) + return keras.initializers.RandomUniform(0.0, 1.0) else: - return tf.keras.initializers.get(kernel_initializer_id) + return keras.initializers.get(kernel_initializer_id) diff --git a/tensorflow_lattice/python/cdf_test.py b/tensorflow_lattice/python/cdf_test.py index 91c8b28..625b5dd 100644 --- a/tensorflow_lattice/python/cdf_test.py +++ b/tensorflow_lattice/python/cdf_test.py @@ -21,6 +21,13 @@ import tensorflow as tf from tensorflow_lattice.python import cdf_layer from tensorflow_lattice.python import test_utils +# pylint: disable=g-import-not-at-top +# Use Keras 2. +version_fn = getattr(tf.keras, "version", None) +if version_fn and version_fn().startswith("3."): + import tf_keras as keras +else: + keras = tf.keras class CdfLayerTest(parameterized.TestCase, tf.test.TestCase): @@ -30,10 +37,10 @@ def setUp(self): self.disable_all = False self.loss_eps = 0.001 self.small_eps = 1e-6 - tf.keras.utils.set_random_seed(42) + keras.utils.set_random_seed(42) def _ResetAllBackends(self): - tf.keras.backend.clear_session() + keras.backend.clear_session() tf.compat.v1.reset_default_graph() def _SetDefaults(self, config): @@ -115,10 +122,8 @@ def _GetTrainingInputsAndLabels(self, config): config: Dictionary with config for this unit test. 
Returns: - Tuple `(training_inputs, training_labels, raw_training_inputs)` where - `training_inputs` and `training_labels` are data for training and - `raw_training_inputs` are representation of training_inputs for - visualization. + Tuple `(training_inputs, training_labels)` where + `training_inputs` and `training_labels` are data for training. """ raw_training_inputs = config["x_generator"]( num_points=config["num_training_records"], @@ -132,15 +137,15 @@ def _GetTrainingInputsAndLabels(self, config): training_inputs = raw_training_inputs training_labels = [config["y_function"](x) for x in training_inputs] - return training_inputs, training_labels, raw_training_inputs + return training_inputs, training_labels - def _TrainModel(self, config, plot_path=None): + def _TrainModel(self, config): logging.info("Testing config:") logging.info(config) config = self._SetDefaults(config) self._ResetAllBackends() - training_inputs, training_labels, raw_training_inputs = ( + training_inputs, training_labels = ( self._GetTrainingInputsAndLabels(config)) keras_layer = cdf_layer.CDF( @@ -154,25 +159,23 @@ def _TrainModel(self, config, plot_path=None): kernel_initializer=config["kernel_initializer"], input_shape=(config["input_dims"],), dtype=tf.float32) - model = tf.keras.models.Sequential() + model = keras.models.Sequential() model.add(keras_layer) # When we have multi-unit output, we average across the output units for # testing. if config["units"] > 1: model.add( - tf.keras.layers.Lambda( + keras.layers.Lambda( lambda x: tf.reduce_mean(x, axis=-1, keepdims=True))) optimizer = config["optimizer"](learning_rate=config["learning_rate"]) model.compile(loss="mse", optimizer=optimizer) - training_data = (training_inputs, training_labels, raw_training_inputs) + training_data = (training_inputs, training_labels) loss = test_utils.run_training_loop( - config=config, - training_data=training_data, - keras_model=model, - plot_path=plot_path) + config=config, training_data=training_data, keras_model=model + ) if tf.executing_eagerly(): tf.print("final weights: ", keras_layer.kernel) @@ -203,7 +206,7 @@ def test1Dim(self, activation, reduction, input_scaling_type, expected_loss): "input_scaling_type": input_scaling_type, "num_training_records": 128, "num_training_epoch": 50, - "optimizer": tf.keras.optimizers.legacy.Adagrad, + "optimizer": keras.optimizers.legacy.Adagrad, "learning_rate": 1.0, "x_generator": self._ScatterXUniformly, "y_function": self._SinPlusX, @@ -235,7 +238,7 @@ def test2Dim(self, activation, reduction, input_scaling_type, expected_loss): "input_scaling_type": input_scaling_type, "num_training_records": 900, "num_training_epoch": 100, - "optimizer": tf.keras.optimizers.legacy.Adagrad, + "optimizer": keras.optimizers.legacy.Adagrad, "learning_rate": 1.0, "x_generator": self._TwoDMeshGrid, "y_function": self._SinPlusXNd, @@ -268,7 +271,7 @@ def test5DimScaledSum(self, activation, reduction, input_scaling_type, "input_scaling_type": input_scaling_type, "num_training_records": 200, "num_training_epoch": 200, - "optimizer": tf.keras.optimizers.legacy.Adagrad, + "optimizer": keras.optimizers.legacy.Adagrad, "learning_rate": 1.0, "x_generator": self._ScatterXUniformly, "y_function": self._ScaledSum, @@ -301,7 +304,7 @@ def test5DimSinOfSum(self, activation, reduction, input_scaling_type, "input_scaling_type": input_scaling_type, "num_training_records": 200, "num_training_epoch": 200, - "optimizer": tf.keras.optimizers.legacy.Adagrad, + "optimizer": keras.optimizers.legacy.Adagrad, "learning_rate": 
1.0, "x_generator": self._ScatterXUniformly, "y_function": self._SinOfSum, @@ -334,7 +337,7 @@ def test1DimInputOutOfBounds(self, activation, reduction, input_scaling_type, "input_scaling_type": input_scaling_type, "num_training_records": 100, "num_training_epoch": 20, - "optimizer": tf.keras.optimizers.legacy.Adagrad, + "optimizer": keras.optimizers.legacy.Adagrad, "learning_rate": 1.0, "x_generator": self._ScatterXUniformlyExtendedRange, "y_function": self._Sin, @@ -367,7 +370,7 @@ def test2DimInputOutOfBounds(self, activation, reduction, input_scaling_type, "input_scaling_type": input_scaling_type, "num_training_records": 100, "num_training_epoch": 20, - "optimizer": tf.keras.optimizers.legacy.Adagrad, + "optimizer": keras.optimizers.legacy.Adagrad, "learning_rate": 1.0, "x_generator": self._TwoDMeshGridExtendedRange, "y_function": self._SinOfSum, @@ -395,7 +398,7 @@ def testMultiUnitOutputSparsity(self, input_dims, units, activation, if self.disable_all: return # Set the random seed for the initializer for consistent results. - kernel_initializer = tf.keras.initializers.RandomUniform(0.0, 1.0, seed=42) + kernel_initializer = keras.initializers.RandomUniform(0.0, 1.0, seed=42) config = { "input_dims": input_dims, "units": units, @@ -406,7 +409,7 @@ def testMultiUnitOutputSparsity(self, input_dims, units, activation, "kernel_initializer": kernel_initializer, "num_training_records": 100, "num_training_epoch": 20, - "optimizer": tf.keras.optimizers.legacy.Adagrad, + "optimizer": keras.optimizers.legacy.Adagrad, "learning_rate": 1.0, "x_generator": self._ScatterXUniformly, "y_function": self._Square, @@ -440,7 +443,7 @@ def testInputScalingInit(self, activation, reduction, input_scaling_init, "input_scaling_type": input_scaling_type, "num_training_records": 900, "num_training_epoch": 100, - "optimizer": tf.keras.optimizers.legacy.Adagrad, + "optimizer": keras.optimizers.legacy.Adagrad, "learning_rate": 1.0, "x_generator": self._TwoDMeshGrid, "y_function": self._SinPlusXNd, diff --git a/tensorflow_lattice/python/configs.py b/tensorflow_lattice/python/configs.py index f049efb..e66454f 100644 --- a/tensorflow_lattice/python/configs.py +++ b/tensorflow_lattice/python/configs.py @@ -65,6 +65,13 @@ from absl import logging import tensorflow as tf +# pylint: disable=g-import-not-at-top +# Use Keras 2. 
+version_fn = getattr(tf.keras, "version", None) +if version_fn and version_fn().startswith("3."): + import tf_keras as keras +else: + keras = tf.keras _HPARAM_FEATURE_PREFIX = 'feature' _HPARAM_REGULARIZER_PREFIX = 'regularizer' @@ -92,24 +99,24 @@ def get_config(self): config.pop('__class__') if 'feature_configs' in config and config['feature_configs'] is not None: config['feature_configs'] = [ - tf.keras.utils.legacy.serialize_keras_object(feature_config) + keras.utils.legacy.serialize_keras_object(feature_config) for feature_config in config['feature_configs'] ] if 'regularizer_configs' in config and config[ 'regularizer_configs'] is not None: config['regularizer_configs'] = [ - tf.keras.utils.legacy.serialize_keras_object(regularizer_config) + keras.utils.legacy.serialize_keras_object(regularizer_config) for regularizer_config in config['regularizer_configs'] ] if ('reflects_trust_in' in config and config['reflects_trust_in'] is not None): config['reflects_trust_in'] = [ - tf.keras.utils.legacy.serialize_keras_object(trust_config) + keras.utils.legacy.serialize_keras_object(trust_config) for trust_config in config['reflects_trust_in'] ] if 'dominates' in config and config['dominates'] is not None: config['dominates'] = [ - tf.keras.utils.legacy.serialize_keras_object(dominance_config) + keras.utils.legacy.serialize_keras_object(dominance_config) for dominance_config in config['dominates'] ] return config @@ -120,7 +127,7 @@ def deserialize_nested_configs(cls, config, custom_objects=None): config = copy.deepcopy(config) if 'feature_configs' in config and config['feature_configs'] is not None: config['feature_configs'] = [ - tf.keras.utils.legacy.deserialize_keras_object( + keras.utils.legacy.deserialize_keras_object( feature_config, custom_objects=custom_objects ) for feature_config in config['feature_configs'] @@ -128,7 +135,7 @@ def deserialize_nested_configs(cls, config, custom_objects=None): if 'regularizer_configs' in config and config[ 'regularizer_configs'] is not None: config['regularizer_configs'] = [ - tf.keras.utils.legacy.deserialize_keras_object( + keras.utils.legacy.deserialize_keras_object( regularizer_config, custom_objects=custom_objects ) for regularizer_config in config['regularizer_configs'] @@ -136,14 +143,14 @@ def deserialize_nested_configs(cls, config, custom_objects=None): if ('reflects_trust_in' in config and config['reflects_trust_in'] is not None): config['reflects_trust_in'] = [ - tf.keras.utils.legacy.deserialize_keras_object( + keras.utils.legacy.deserialize_keras_object( trust_config, custom_objects=custom_objects ) for trust_config in config['reflects_trust_in'] ] if 'dominates' in config and config['dominates'] is not None: config['dominates'] = [ - tf.keras.utils.legacy.deserialize_keras_object( + keras.utils.legacy.deserialize_keras_object( dominance_config, custom_objects=custom_objects ) for dominance_config in config['dominates'] diff --git a/tensorflow_lattice/python/estimators.py b/tensorflow_lattice/python/estimators.py deleted file mode 100644 index 01a3ca3..0000000 --- a/tensorflow_lattice/python/estimators.py +++ /dev/null @@ -1,1934 +0,0 @@ -# Copyright 2019 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
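The `get_config` and `deserialize_nested_configs` changes above route the nested feature, regularizer, trust and dominance configs through the legacy Keras object serializers. A rough round-trip sketch is shown below; feeding the deserialized mapping back to the config constructor is an assumption made for illustration and is not part of this change.

```python
import tensorflow_lattice as tfl

model_config = tfl.configs.CalibratedLatticeConfig(
    feature_configs=[
        tfl.configs.FeatureConfig(name='age', monotonicity='increasing'),
        tfl.configs.FeatureConfig(name='thal', num_buckets=3),
    ])

# Nested FeatureConfig objects are serialized with the legacy Keras helpers.
serialized = model_config.get_config()

# deserialize_nested_configs() rebuilds the nested config objects; passing the
# result back to the constructor is assumed here, not taken from this change.
kwargs = tfl.configs.CalibratedLatticeConfig.deserialize_nested_configs(serialized)
restored = tfl.configs.CalibratedLatticeConfig(**kwargs)
```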
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""TF Lattice canned estimators implement typical monotonic model architectures. - -You can use TFL canned estimators to easily construct commonly used monotonic -model architectures. To construct a TFL canned estimator, construct a model -configuration from `tfl.configs` and pass it to the canned estimator -constructor. To use automated quantile calculation, canned estimators also -require passing a *feature_analysis_input_fn* which is similar to the one used -for training, but with a single epoch or a subset of the data. To create a -Crystals ensemble model using `tfl.configs.CalibratedLatticeEnsembleConfig`, you -will also need to provide a *prefitting_input_fn* to the estimator constructor. - -```python -feature_columns = ... -model_config = tfl.configs.CalibratedLatticeConfig(...) -feature_analysis_input_fn = create_input_fn(num_epochs=1, ...) -train_input_fn = create_input_fn(num_epochs=100, ...) -estimator = tfl.estimators.CannedClassifier( - feature_columns=feature_columns, - model_config=model_config, - feature_analysis_input_fn=feature_analysis_input_fn) -estimator.train(input_fn=train_input_fn) -``` - -Supported models are defined in `tfl.configs`. Each model architecture can be -used for: - -* **Classification** using `tfl.estimators.CannedClassifier` with standard - classification head (softmax cross-entropy loss). - -* **Regression** using `tfl.estimators.CannedRegressor` with standard - regression head (squared loss). - -* **Custom head** using `tfl.estimators.CannedEstimator` with any custom head - and loss. - -This module also provides `tfl.estimators.get_model_graph` as a mechanism to -extract abstract model graphs and layer parameters from saved models. The -resulting graph (not a TF graph) can be used by the `tfl.visualization` module -for plotting and other visualization and analysis. - -```python -model_graph = estimators.get_model_graph(saved_model_path) -visualization.plot_feature_calibrator(model_graph, "feature_name") -visualization.plot_all_calibrators(model_graph) -visualization.draw_model_graph(model_graph) -``` - -""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import collections -import copy -import json -import os -import re -import time - -from . import categorical_calibration_layer -from . import configs -from . import kronecker_factored_lattice_layer as kfll -from . import lattice_layer -from . import linear_layer -from . import model_info -from . import premade -from . import premade_lib -from . import pwl_calibration_layer -from . 
import rtl_layer - -from absl import logging -import numpy as np -import six -import tensorflow as tf -from tensorflow import estimator as tf_estimator - -from tensorflow.python.feature_column import feature_column as fc # pylint: disable=g-direct-tensorflow-import -from tensorflow.python.feature_column import feature_column_v2 as fc2 # pylint: disable=g-direct-tensorflow-import -from tensorflow.python.training import training_util # pylint: disable=g-direct-tensorflow-import -from tensorflow_estimator.python.estimator import estimator as estimator_lib -from tensorflow_estimator.python.estimator.canned import optimizers -from tensorflow_estimator.python.estimator.head import binary_class_head -from tensorflow_estimator.python.estimator.head import multi_class_head -from tensorflow_estimator.python.estimator.head import regression_head - -# TODO: support multi dim inputs. -# TODO: support multi dim output. -# TODO: add linear layer regularizers. -# TODO: add examples in docs. -# TODO: make _REPEATED_PAIR_DISCOUNT_IN_CRYSTALS_SCORE config param - -# Feed and fetch names for the model. -FEATURES_SCOPE = 'features' -OUTPUT_NAME = 'output' - -# File to store and load feature keypoints. -_KEYPOINTS_FILE = 'keypoints.json' - -# File to store and load lattice ensemble structure. -_ENSEMBLE_STRUCTURE_FILE = 'ensemble_structure.json' - -# Name for label keypoints in keypoints file. -_LABEL_FEATURE_NAME = '__label__' - -# Pooling interval and maximum wait time for workers waiting for files. -_MAX_WAIT_TIME = 2400 -_POLL_INTERVAL_SECS = 10 - - -class WaitTimeOutError(Exception): - """Timeout error when waiting for a file.""" - pass - - -def _poll_for_file(filename): - """Waits and polls for a file until it exists.""" - start = time.time() - while not tf.io.gfile.exists(filename): - time.sleep(_POLL_INTERVAL_SECS) - if time.time() - start > _MAX_WAIT_TIME: - raise WaitTimeOutError('Waiting for file {} timed-out'.format(filename)) - - -def transform_features(features, feature_columns=None): - """Parses the input features using the given feature columns. - - This function can be used to parse input features when constructing a custom - estimator. When using this function, you will not need to wrap categorical - features with dense feature embeddings, and the resulting tensors will not be - concatenated, making it easier to use the features in the calibration layers. - - Args: - features: A dict from feature names to tensors. - feature_columns: A list of FeatureColumn objects to be used for parsing. If - not provided, the input features are assumed to be already parsed. - - Returns: - collections.OrderedDict mapping feature names to parsed tensors. - """ - with tf.name_scope('transform'): - if feature_columns: - parsed_features = collections.OrderedDict() - for feature_column in feature_columns: - # pylint: disable=protected-access - if (isinstance(feature_column, fc._DenseColumn) or - isinstance(feature_column, fc2.DenseColumn)): - parsed_features[ - feature_column.name] = feature_column._transform_feature(features) - elif (isinstance(feature_column, fc._CategoricalColumn) or - isinstance(feature_column, fc2.CategoricalColumn)): - if feature_column.num_oov_buckets: - # If oov buckets are used, missing values are assigned to the last - # oov bucket. 
- default_value = feature_column.num_buckets - 1 - else: - default_value = feature_column.default_value - parsed_features[feature_column.name] = tf.reshape( - tf.sparse.to_dense( - sp_input=feature_column._transform_feature(features), - default_value=default_value), - shape=[-1, 1]) - else: - raise ValueError( - 'Unsupported feature_column: {}'.format(feature_column)) - # pylint: enable=protected-access - else: - parsed_features = collections.OrderedDict(features) - - for name, tensor in parsed_features.items(): - if len(tensor.shape) == 1: - parsed_features[name] = tf.expand_dims(tensor, 1) - elif len(tensor.shape) > 2 or tensor.shape[1] != 1: - raise ValueError('Only 1-d inputs are supported: {}'.format(tensor)) - - with tf.name_scope(FEATURES_SCOPE): - for name, tensor in parsed_features.items(): - parsed_features[name] = tf.identity(parsed_features[name], name=name) - - return parsed_features - - -def _materialize_locally(tensors, max_elements=1e6): - """Materialize the given tensors locally, during initialization. - - Assumes non-distributed environment (uses SingularMonitoredSession). - - Args: - tensors: A dict of name to feed tensors to be materialized. - max_elements: Data is read and accmulated from tensors until end-of-input is - reached or when we have at least max_elements collected. - - Returns: - Materialized tensors as dict. - """ - # tf.compat.v1.train.SingularMonitoredSession silently catches - # tf.errors.OutOfRangeError, and we want to expose it to detect end of the - # data from the given feed tensors. - with tf.compat.v1.train.SingularMonitoredSession() as sess: - splits = [] - count = 0 - try: - while count < max_elements: - materialized_tensors = sess.run(tensors) - values = list(materialized_tensors.values()) - if not values: - break - count += len(values[0]) - splits.append(materialized_tensors) - except (tf.errors.OutOfRangeError, StopIteration): - pass - concatenated_tensors = {} - for k in tensors: - k_tensors = [split[k] for split in splits if split[k].size > 0] - # If k_tensors is empty, the np.concat call below would raise - # an exception below. So we test this condition here to provide - # a better error message. - if not k_tensors: - raise ValueError('Did not find any values for key: {}'.format(k)) - concatenated_tensors[k] = np.concatenate(k_tensors) - return concatenated_tensors - - -def _finalize_keypoints(model_config, config, feature_columns, - feature_analysis_input_fn, - feature_analysis_weight_column, - feature_analysis_weight_reduction, logits_output): - """Calculates and sets keypoints for input and output calibration. - - Input and label keypoints are calculated, stored in a file and also set in the - model_config to be used for model construction. - - Args: - model_config: Model config to be updated. - config: A `tf.RunConfig` to indicate if worker is chief. - feature_columns: A list of FeatureColumn's to use for feature parsing. - feature_analysis_input_fn: An input_fn used to collect feature statistics. - feature_analysis_weight_column: None or weight column to use. - feature_analysis_weight_reduction: Reduction applied to weights for repeated - values. Must be either 'mean' or 'sum'. - logits_output: A boolean indicating if model outputs logits. - - Raises: - ValueError: If keypoints mode is invalid. 
- """ - if not feature_analysis_input_fn: - return - - keypoints_filename = os.path.join(config.model_dir, _KEYPOINTS_FILE) - if ((config is None or config.is_chief) and - not tf.io.gfile.exists(keypoints_filename)): - # As the chief worker, calculate and store the keypoints. - with tf.Graph().as_default(): - ds = feature_analysis_input_fn() - if isinstance(ds, tf.data.Dataset): - ds = tf.compat.v1.data.make_one_shot_iterator(ds).get_next() - features, labels = ds - - # Add weights and labels to the feature list to be materialized. - if feature_analysis_weight_column is not None: - if isinstance(feature_analysis_weight_column, str): - feature_analysis_weight_column = tf.feature_column.numeric_column( - feature_analysis_weight_column) - feature_columns = feature_columns + [feature_analysis_weight_column] - - values = transform_features(features, feature_columns) - values[_LABEL_FEATURE_NAME] = labels - values = _materialize_locally(values) - for feature_name in values: - values[feature_name] = values[feature_name].flatten() - - # Pop out label and weight values. - labels = values.pop(_LABEL_FEATURE_NAME) - if feature_analysis_weight_column is not None: - weights = values.pop(feature_analysis_weight_column.name) - else: - weights = None - - keypoints = premade_lib.compute_feature_keypoints( - feature_configs=model_config.feature_configs, - features=values, - weights=weights, - weight_reduction=feature_analysis_weight_reduction) - keypoints[_LABEL_FEATURE_NAME] = premade_lib.compute_label_keypoints( - model_config=model_config, - labels=labels, - logits_output=logits_output, - weights=weights, - weight_reduction=feature_analysis_weight_reduction) - - # Convert to python float type for serialization. - keypoints = { - k: [float(v) for v in vs] for k, vs in six.iteritems(keypoints) - } - - # Save keypoints to file as the chief worker. - tmp_keypoints_filename = keypoints_filename + 'tmp' - with tf.io.gfile.GFile(tmp_keypoints_filename, 'w') as keypoints_file: - keypoints_file.write(json.dumps(keypoints, indent=2)) - tf.io.gfile.rename(tmp_keypoints_filename, keypoints_filename) - else: - # Non-chief workers read the keypoints from file. - _poll_for_file(keypoints_filename) - with tf.io.gfile.GFile(keypoints_filename) as keypoints_file: - keypoints = json.loads(keypoints_file.read()) - - # Set the keypoint values in the model config. - label_keypoints = keypoints.pop(_LABEL_FEATURE_NAME) - premade_lib.set_feature_keypoints( - feature_configs=model_config.feature_configs, - feature_keypoints=keypoints, - add_missing_feature_configs=True) - premade_lib.set_label_keypoints( - model_config=model_config, label_keypoints=label_keypoints) - - -def _fix_ensemble_for_2d_constraints(model_config, feature_names): - """Fixes 2d constraint violations by adding missing features to some lattices. - - Some 2d shape constraints require lattices from ensemble to either contain - both constrained features or none of them, e.g. trapezoid trust constraint - requires a lattice that has the "conditional" feature to include the "main" - feature. - - Args: - model_config: Model config to be updated. - feature_names: List of feature names. 
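`_finalize_keypoints` above was estimator glue around the keypoint utilities in `premade_lib`, which remain in the library. A hedged sketch of computing and setting keypoints directly from in-memory data, using only the calls and arguments visible in the deleted code (the feature values and config here are illustrative placeholders):

```python
import numpy as np
import tensorflow_lattice as tfl

model_config = tfl.configs.CalibratedLatticeConfig(
    feature_configs=[tfl.configs.FeatureConfig(name='age'),
                     tfl.configs.FeatureConfig(name='ca')])

features = {'age': np.random.uniform(20, 80, size=1000),
            'ca': np.random.uniform(0, 3, size=1000)}
labels = np.random.randint(0, 2, size=1000)

feature_keypoints = tfl.premade_lib.compute_feature_keypoints(
    feature_configs=model_config.feature_configs,
    features=features,
    weights=None,
    weight_reduction='mean')
label_keypoints = tfl.premade_lib.compute_label_keypoints(
    model_config=model_config,
    labels=labels,
    logits_output=True,
    weights=None,
    weight_reduction='mean')

tfl.premade_lib.set_feature_keypoints(
    feature_configs=model_config.feature_configs,
    feature_keypoints=feature_keypoints,
    add_missing_feature_configs=True)
tfl.premade_lib.set_label_keypoints(
    model_config=model_config, label_keypoints=label_keypoints)
```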
- """ - must_include_features = collections.defaultdict(set) - for feature_name in feature_names: - feature_config = model_config.feature_config_by_name(feature_name) - for trust_config in feature_config.reflects_trust_in or []: - if trust_config.trust_type == 'trapezoid': - must_include_features[feature_name].add(trust_config.feature_name) - for dominance_config in feature_config.dominates or []: - must_include_features[dominance_config.feature_name].add(feature_name) - - fixed_lattices = [] - for idx, lattice in enumerate(model_config.lattices): - fixed_lattice = set() - for feature_name in lattice: - fixed_lattice.add(feature_name) - fixed_lattice.update(must_include_features[feature_name]) - assert len(lattice) <= len(fixed_lattice) - fixed_lattices.append(list(fixed_lattice)) - if len(lattice) < len(fixed_lattice): - logging.info( - 'Fixed 2d constraint violations in lattices[%d]. Lattice rank ' - 'increased from %d to %d.', idx, len(lattice), len(fixed_lattice)) - - model_config.lattices = fixed_lattices - - -def _set_crystals_lattice_ensemble(model_config, feature_names, label_dimension, - feature_columns, head, prefitting_input_fn, - prefitting_optimizer, prefitting_steps, - config, dtype): - """Sets the lattice ensemble in model_config using the crystals algorithm.""" - if prefitting_input_fn is None: - raise ValueError('prefitting_input_fn must be set for crystals models') - - # Get prefitting model config. - prefitting_model_config = premade_lib.construct_prefitting_model_config( - model_config, feature_names) - - def prefitting_model_fn(features, labels, mode, config): - return _calibrated_lattice_ensemble_model_fn( - features=features, - labels=labels, - label_dimension=label_dimension, - feature_columns=feature_columns, - mode=mode, - head=head, - model_config=prefitting_model_config, - optimizer=prefitting_optimizer, - config=config, - dtype=dtype) - - config = tf_estimator.RunConfig( - keep_checkpoint_max=1, - save_summary_steps=0, - save_checkpoints_steps=10000000, - tf_random_seed=config.tf_random_seed if config is not None else 42) - logging.info('Creating the prefitting estimator.') - prefitting_estimator = tf_estimator.Estimator( - model_fn=prefitting_model_fn, config=config) - logging.info('Training the prefitting estimator.') - prefitting_estimator.train( - input_fn=prefitting_input_fn, steps=prefitting_steps) - premade_lib.set_crystals_lattice_ensemble( - model_config=model_config, - prefitting_model_config=prefitting_model_config, - prefitting_model=prefitting_estimator, - feature_names=feature_names) - logging.info('Finished training the prefitting estimator.') - - # Cleanup model_dir since we might be reusing it for the main estimator. - # Note that other workers are blocked until model structure file is - # generated by the chief worker, so modifying files here should be safe. 
- remove_list = [ - os.path.join(prefitting_estimator.model_dir, 'graph.pbtxt'), - os.path.join(prefitting_estimator.model_dir, 'checkpoint'), - ] - remove_list.extend( - tf.io.gfile.glob(prefitting_estimator.latest_checkpoint() + '*')) - for file_path in remove_list: - tf.io.gfile.remove(file_path) - - -def _finalize_model_structure(model_config, label_dimension, feature_columns, - head, prefitting_input_fn, prefitting_optimizer, - prefitting_steps, model_dir, config, - warm_start_from, dtype): - """Sets up the lattice ensemble in model_config with requested algorithm.""" - if (not isinstance(model_config, configs.CalibratedLatticeEnsembleConfig) or - isinstance(model_config.lattices, list)): - return - - # TODO: If warmstarting, look for the previous ensemble file. - if warm_start_from: - raise ValueError('Warm starting lattice ensembles without explicitly ' - 'defined lattices is not supported yet.') - - if feature_columns: - feature_names = [feature_column.name for feature_column in feature_columns] - else: - feature_names = [ - feature_config.name for feature_config in model_config.feature_configs - ] - - if model_config.lattice_rank > len(feature_names): - raise ValueError( - 'lattice_rank {} cannot be larger than the number of features: {}' - .format(model_config.lattice_rank, feature_names)) - - if model_config.num_lattices * model_config.lattice_rank < len(feature_names): - raise ValueError( - 'Model with {}x{}d lattices is not large enough for all features: {}' - .format(model_config.num_lattices, model_config.lattice_rank, - feature_names)) - - ensemble_structure_filename = os.path.join(model_dir, - _ENSEMBLE_STRUCTURE_FILE) - if ((config is None or config.is_chief) and - not tf.io.gfile.exists(ensemble_structure_filename)): - if model_config.lattices not in ['random', 'crystals', 'rtl_layer']: - raise ValueError('Unsupported ensemble structure: {}'.format( - model_config.lattices)) - if model_config.lattices == 'random': - premade_lib.set_random_lattice_ensemble(model_config, feature_names) - elif model_config.lattices == 'crystals': - _set_crystals_lattice_ensemble( - feature_names=feature_names, - label_dimension=label_dimension, - feature_columns=feature_columns, - head=head, - model_config=model_config, - prefitting_input_fn=prefitting_input_fn, - prefitting_optimizer=prefitting_optimizer, - prefitting_steps=prefitting_steps, - config=config, - dtype=dtype) - if (model_config.fix_ensemble_for_2d_constraints and - model_config.lattices != 'rtl_layer'): - # Note that we currently only support monotonicity and bound constraints - # for RTL. - _fix_ensemble_for_2d_constraints(model_config, feature_names) - - # Save lattices to file as the chief worker. - tmp_ensemble_structure_filename = ensemble_structure_filename + 'tmp' - with tf.io.gfile.GFile(tmp_ensemble_structure_filename, - 'w') as ensemble_structure_file: - ensemble_structure_file.write(json.dumps(model_config.lattices, indent=2)) - tf.io.gfile.rename(tmp_ensemble_structure_filename, - ensemble_structure_filename) - else: - # Non-chief workers read the lattices from file. 
- _poll_for_file(ensemble_structure_filename) - with tf.io.gfile.GFile( - ensemble_structure_filename) as ensemble_structure_file: - model_config.lattices = json.loads(ensemble_structure_file.read()) - - logging.info('Finalized model structure: %s', str(model_config.lattices)) - - -def _verify_config(model_config, feature_columns): - """Verifies that the config is setup correctly and ready for model_fn.""" - if feature_columns: - feature_configs = [ - model_config.feature_config_by_name(feature_column.name) - for feature_column in feature_columns - ] - else: - feature_configs = model_config.feature_configs or [] - - for feature_config in feature_configs: - if not feature_config.num_buckets: - if (not np.iterable(feature_config.pwl_calibration_input_keypoints) or - any(not isinstance(x, float) - for x in feature_config.pwl_calibration_input_keypoints)): - raise ValueError( - 'Input keypoints are invalid for feature {}: {}'.format( - feature_config.name, - feature_config.pwl_calibration_input_keypoints)) - - if (not np.iterable(model_config.output_initialization) or any( - not isinstance(x, float) for x in model_config.output_initialization)): - raise ValueError('Output initilization is invalid: {}'.format( - model_config.output_initialization)) - - -def _update_by_feature_columns(model_config, feature_columns): - """Updates a model config with the given feature columns.""" - for feature_column in feature_columns or []: - feature_config = model_config.feature_config_by_name(feature_column.name) - # pylint: disable=protected-access - if (isinstance(feature_column, fc._DenseColumn) or - isinstance(feature_column, fc2.DenseColumn)): - feature_config.default_value = feature_column.default_value - elif (isinstance(feature_column, fc._VocabularyListCategoricalColumn) or - isinstance(feature_column, fc2.VocabularyListCategoricalColumn)): - feature_config.vocabulary_list = feature_column.vocabulary_list - feature_config.num_buckets = feature_column.num_buckets - if feature_column.num_oov_buckets: - # A positive num_oov_buckets can not be specified with default_value. - # See tf.feature_column.categorical_column_with_vocabulary_list. - feature_config.default_value = None - else: - # We add a bucket at the end for the default_value, since num_buckets - # does not include the default value (but includes oov buckets). - feature_config.default_value = feature_column.default_value - feature_config.num_buckets += 1 - else: - raise ValueError('Unsupported feature_column: {}'.format(feature_column)) - # pylint: enable=protected-access - - # Change categorical monotonicities to indices. - premade_lib.set_categorical_monotonicities(model_config.feature_configs) - - -def _calibrated_lattice_ensemble_model_fn(features, labels, label_dimension, - feature_columns, mode, head, - model_config, optimizer, config, - dtype): - """Calibrated Lattice Ensemble Model.""" - del config - if label_dimension != 1: - raise ValueError('Only 1-dimensional output is supported.') - - # Get input tensors and corresponding feature configs. 
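Outside the estimator, the ensemble-structure step done by `_finalize_model_structure` above corresponds to calling `tfl.premade_lib.set_random_lattice_ensemble` (or the crystals variant, which additionally needs a prefitting pass) on the config before building the premade model. A small sketch with an illustrative config and explicit keypoints:

```python
import tensorflow_lattice as tfl

model_config = tfl.configs.CalibratedLatticeEnsembleConfig(
    feature_configs=[
        tfl.configs.FeatureConfig(
            name='age', pwl_calibration_input_keypoints=[20.0, 40.0, 60.0, 80.0]),
        tfl.configs.FeatureConfig(
            name='ca', pwl_calibration_input_keypoints=[0.0, 1.0, 2.0, 3.0]),
        tfl.configs.FeatureConfig(name='thal', num_buckets=3),
    ],
    lattices='random',
    num_lattices=2,
    lattice_rank=2,
    output_initialization=[-2.0, 2.0])

# Replaces the 'random' placeholder with explicit per-lattice feature lists,
# e.g. [['age', 'thal'], ['ca', 'age']].
tfl.premade_lib.set_random_lattice_ensemble(model_config, ['age', 'ca', 'thal'])
print(model_config.lattices)

model = tfl.premade.CalibratedLatticeEnsemble(model_config=model_config)
```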
- transformed_features = transform_features(features, feature_columns) - feature_names = list(transformed_features.keys()) - input_tensors = [ - transformed_features[feature_name] for feature_name in feature_names - ] - # Reconstruct feature_config in order of feature_names - feature_configs = [ - model_config.feature_config_by_name(feature_name) - for feature_name in feature_names - ] - del model_config.feature_configs[:] - model_config.feature_configs.extend(feature_configs) - - training = (mode == tf_estimator.ModeKeys.TRAIN) - model = premade.CalibratedLatticeEnsemble( - model_config=model_config, dtype=dtype) - logits = tf.identity( - model(input_tensors, training=training), name=OUTPUT_NAME) - - if training: - optimizer = optimizers.get_optimizer_instance_v2(optimizer) - optimizer.iterations = training_util.get_or_create_global_step() - else: - optimizer = None - - return head.create_estimator_spec( - features=features, - mode=mode, - labels=labels, - optimizer=optimizer, - logits=logits, - trainable_variables=model.trainable_variables, - update_ops=model.updates, - regularization_losses=model.losses or None) - - -def _calibrated_lattice_model_fn(features, labels, label_dimension, - feature_columns, mode, head, model_config, - optimizer, config, dtype): - """Calibrated Lattice Model.""" - del config - if label_dimension != 1: - raise ValueError('Only 1-dimensional output is supported.') - - # Get input tensors and corresponding feature configs. - transformed_features = transform_features(features, feature_columns) - feature_names = list(transformed_features.keys()) - input_tensors = [ - transformed_features[feature_name] for feature_name in feature_names - ] - # Reconstruct feature_config in order of feature_names - feature_configs = [ - model_config.feature_config_by_name(feature_name) - for feature_name in feature_names - ] - del model_config.feature_configs[:] - model_config.feature_configs.extend(feature_configs) - - training = (mode == tf_estimator.ModeKeys.TRAIN) - model = premade.CalibratedLattice(model_config=model_config, dtype=dtype) - logits = tf.identity( - model(input_tensors, training=training), name=OUTPUT_NAME) - - if training: - optimizer = optimizers.get_optimizer_instance_v2(optimizer) - optimizer.iterations = training_util.get_or_create_global_step() - else: - optimizer = None - - return head.create_estimator_spec( - features=features, - mode=mode, - labels=labels, - optimizer=optimizer, - logits=logits, - trainable_variables=model.trainable_variables, - update_ops=model.updates, - regularization_losses=model.losses or None) - - -def _calibrated_linear_model_fn(features, labels, label_dimension, - feature_columns, mode, head, model_config, - optimizer, config, dtype): - """Calibrated Linear Model.""" - del config - if label_dimension != 1: - raise ValueError('Only 1-dimensional output is supported.') - - # Get input tensors and corresponding feature configs. 
- transformed_features = transform_features(features, feature_columns) - feature_names = list(transformed_features.keys()) - input_tensors = [ - transformed_features[feature_name] for feature_name in feature_names - ] - # Reconstruct feature_config in order of feature_names - feature_configs = [ - model_config.feature_config_by_name(feature_name) - for feature_name in feature_names - ] - del model_config.feature_configs[:] - model_config.feature_configs.extend(feature_configs) - - training = (mode == tf_estimator.ModeKeys.TRAIN) - model = premade.CalibratedLinear(model_config=model_config, dtype=dtype) - logits = tf.identity( - model(input_tensors, training=training), name=OUTPUT_NAME) - - if training: - optimizer = optimizers.get_optimizer_instance_v2(optimizer) - optimizer.iterations = training_util.get_or_create_global_step() - - return head.create_estimator_spec( - features=features, - mode=mode, - labels=labels, - optimizer=optimizer, - logits=logits, - trainable_variables=model.trainable_variables, - update_ops=model.updates, - regularization_losses=model.losses or None) - - -def _get_model_fn(label_dimension, feature_columns, head, model_config, - optimizer, dtype): - """Returns the model_fn for the given model_config.""" - if isinstance(model_config, configs.CalibratedLatticeConfig): - - def calibrated_lattice_model_fn(features, labels, mode, config): - return _calibrated_lattice_model_fn( - features=features, - labels=labels, - label_dimension=label_dimension, - feature_columns=feature_columns, - mode=mode, - head=head, - model_config=model_config, - optimizer=optimizer, - config=config, - dtype=dtype) - - return calibrated_lattice_model_fn - elif isinstance(model_config, configs.CalibratedLinearConfig): - - def calibrated_linear_model_fn(features, labels, mode, config): - return _calibrated_linear_model_fn( - features=features, - labels=labels, - label_dimension=label_dimension, - feature_columns=feature_columns, - mode=mode, - head=head, - model_config=model_config, - optimizer=optimizer, - config=config, - dtype=dtype) - - return calibrated_linear_model_fn - if isinstance(model_config, configs.CalibratedLatticeEnsembleConfig): - - def calibrated_lattice_ensemble_model_fn(features, labels, mode, config): - return _calibrated_lattice_ensemble_model_fn( - features=features, - labels=labels, - label_dimension=label_dimension, - feature_columns=feature_columns, - mode=mode, - head=head, - model_config=model_config, - optimizer=optimizer, - config=config, - dtype=dtype) - - return calibrated_lattice_ensemble_model_fn - else: - raise ValueError('Unsupported model type: {}'.format(type(model_config))) - - -class CannedEstimator(estimator_lib.EstimatorV2): - """An estimator for TensorFlow lattice models. - - Creates an estimator with a custom head for the model architecutre specified - by the `model_config`, which should be one of those defined in `tfl.configs`. - Calculation of feature quantiles for input keypoint initialization is done - using `feature_analysis_input_fn`. If this auxiliary input fn is not provided, - all keypoint values should be explicitly provided via the `model_config`. - - Example: - - ```python - model_config = tfl.configs.CalibratedLatticeConfig(...) - feature_analysis_input_fn = create_input_fn(num_epochs=1, ...) - train_input_fn = create_input_fn(num_epochs=100, ...) - head = ... 
- estimator = tfl.estimators.CannedEstimator( - feature_columns=feature_columns, - model_config=model_config, - feature_analysis_input_fn=feature_analysis_input_fn - head=head) - estimator.train(input_fn=train_input_fn) - ``` - """ - - def __init__(self, - head, - model_config, - feature_columns, - feature_analysis_input_fn=None, - feature_analysis_weight_column=None, - feature_analysis_weight_reduction='mean', - prefitting_input_fn=None, - model_dir=None, - label_dimension=1, - optimizer='Adagrad', - prefitting_optimizer='Adagrad', - prefitting_steps=None, - config=None, - warm_start_from=None, - dtype=tf.float32): - """Initializes a `CannedEstimator` instance. - - Args: - head: A `_Head` instance constructed with a method such as - `tf.contrib.estimator.multi_label_head`. - model_config: Model configuration object describing model architecutre. - Should be one of the model configs in `tfl.configs`. - feature_columns: An iterable containing all the feature columns used by - the model. - feature_analysis_input_fn: An input_fn used to calculate statistics about - features and labels in order to setup calibration keypoint and values. - feature_analysis_weight_column: A string or a `_NumericColumn` created by - `tf.feature_column.numeric_column` defining feature column representing - weights used for calculating weighted feature statistics (quantiles). - Can be the same as `weight_column`. - feature_analysis_weight_reduction: Reduction used on weights when - aggregating repeated values during feature analysis. Can be either 'sum' - or 'mean'. - prefitting_input_fn: An input_fn used in the pre fitting stage to estimate - non-linear feature interactions. Required for crystals models. - Prefitting typically uses the same dataset as the main training, but - with fewer epochs. - model_dir: Directory to save model parameters, graph and etc. This can - also be used to load checkpoints from the directory into a estimator to - continue training a previously saved model. - label_dimension: Number of regression targets per example. This is the - size of the last dimension of the labels and logits `Tensor` objects - (typically, these have shape `[batch_size, label_dimension]`). - optimizer: An instance of `tf.Optimizer` used to train the model. Can also - be a string (one of 'Adagrad', 'Adam', 'Ftrl', 'RMSProp', 'SGD'), or - callable. Defaults to Adagrad optimizer. - prefitting_optimizer: An instance of `tf.Optimizer` used to train the - model during the pre-fitting stage. Can also be a string (one of - 'Adagrad', 'Adam', 'Ftrl', 'RMSProp', 'SGD'), or callable. Defaults to - Adagrad optimizer. - prefitting_steps: Number of steps for which to pretraing train the model - during the prefitting stage. If None, train forever or train until - prefitting_input_fn generates the tf.errors.OutOfRange error or - StopIteration exception. - config: `RunConfig` object to configure the runtime settings. - warm_start_from: A string filepath to a checkpoint to warm-start from, or - a `WarmStartSettings` object to fully configure warm-starting. If the - string filepath is provided instead of a `WarmStartSettings`, then all - weights are warm-started, and it is assumed that vocabularies and Tensor - names are unchanged. - dtype: dtype of layers used in the model. 
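The canned estimators above are thin wrappers around the `tfl.premade` Keras models constructed in the model_fns earlier in this file. For comparison, a minimal sketch of training such a premade model directly, assuming two illustrative numeric features with hand-picked calibration keypoints (feature names, keypoints, and data below are made up):

```python
import numpy as np
import tensorflow as tf
import tensorflow_lattice as tfl

# Keypoints are given explicitly, so no feature-analysis pass is needed.
model_config = tfl.configs.CalibratedLatticeConfig(feature_configs=[
    tfl.configs.FeatureConfig(
        name='age',
        monotonicity='increasing',
        pwl_calibration_input_keypoints=[20.0, 30.0, 45.0, 60.0, 80.0]),
    tfl.configs.FeatureConfig(
        name='chol',
        pwl_calibration_input_keypoints=[126.0, 210.0, 247.0, 286.0, 564.0]),
])
model = tfl.premade.CalibratedLattice(model_config=model_config)
model.compile(
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
    optimizer=tf.keras.optimizers.Adam(0.01))

# Premade models take one input per feature, in feature_configs order.
xs = [np.random.uniform(20.0, 80.0, size=(1000, 1)),
      np.random.uniform(126.0, 564.0, size=(1000, 1))]
ys = np.random.randint(0, 2, size=(1000, 1)).astype(np.float32)
model.fit(xs, ys, epochs=5, batch_size=100, verbose=0)
```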
- """ - config = estimator_lib.maybe_overwrite_model_dir_and_session_config( - config, model_dir) - model_dir = config.model_dir - - model_config = copy.deepcopy(model_config) - _update_by_feature_columns(model_config, feature_columns) - - _finalize_keypoints( - model_config=model_config, - config=config, - feature_columns=feature_columns, - feature_analysis_input_fn=feature_analysis_input_fn, - feature_analysis_weight_column=feature_analysis_weight_column, - feature_analysis_weight_reduction=feature_analysis_weight_reduction, - logits_output=True) - - _verify_config(model_config, feature_columns) - - _finalize_model_structure( - label_dimension=label_dimension, - feature_columns=feature_columns, - head=head, - model_config=model_config, - prefitting_input_fn=prefitting_input_fn, - prefitting_optimizer=prefitting_optimizer, - prefitting_steps=prefitting_steps, - model_dir=model_dir, - config=config, - warm_start_from=warm_start_from, - dtype=dtype) - - model_fn = _get_model_fn( - label_dimension=label_dimension, - feature_columns=feature_columns, - head=head, - model_config=model_config, - optimizer=optimizer, - dtype=dtype) - - super(CannedEstimator, self).__init__( - model_fn=model_fn, - model_dir=model_dir, - config=config, - warm_start_from=warm_start_from) - - -class CannedClassifier(estimator_lib.EstimatorV2): - """Canned classifier for TensorFlow lattice models. - - Creates a classifier for the model architecutre specified by the - `model_config`, which should be one of those defined in `tfl.configs`. - Calclulation of feature quantiles for input keypoint initialization is done - using `feature_analysis_input_fn`. If this auxiliary input fn is not provided, - all keypoint values should be explicitly provided via the `model_config`. - - Training loss is softmax cross-entropy as defined for the default - TF classificaiton head. - - Example: - - ```python - model_config = tfl.configs.CalibratedLatticeConfig(...) - feature_analysis_input_fn = create_input_fn(num_epochs=1, ...) - train_input_fn = create_input_fn(num_epochs=100, ...) - estimator = tfl.estimators.CannedClassifier( - feature_columns=feature_columns, - model_config=model_config, - feature_analysis_input_fn=feature_analysis_input_fn) - estimator.train(input_fn=train_input_fn) - ``` - """ - - def __init__(self, - model_config, - feature_columns, - feature_analysis_input_fn=None, - feature_analysis_weight_column=None, - feature_analysis_weight_reduction='mean', - prefitting_input_fn=None, - model_dir=None, - n_classes=2, - weight_column=None, - label_vocabulary=None, - optimizer='Adagrad', - prefitting_optimizer='Adagrad', - prefitting_steps=None, - config=None, - warm_start_from=None, - loss_reduction=tf.keras.losses.Reduction.SUM_OVER_BATCH_SIZE, - loss_fn=None, - dtype=tf.float32): - """Initializes a `CannedClassifier` instance. - - Args: - model_config: Model configuration object describing model architecutre. - Should be one of the model configs in `tfl.configs`. - feature_columns: An iterable containing all the feature columns used by - the model. - feature_analysis_input_fn: An input_fn used to calculate statistics about - features and labels in order to setup calibration keypoint and values. - feature_analysis_weight_column: A string or a `_NumericColumn` created by - `tf.feature_column.numeric_column` defining feature column representing - weights used for calculating weighted feature statistics (quantiles). - Can be the same as `weight_column`. 
-      feature_analysis_weight_reduction: Reduction used on weights when
-        aggregating repeated values during feature analysis. Can be either 'sum'
-        or 'mean'.
-      prefitting_input_fn: An input_fn used in the pre-fitting stage to estimate
-        non-linear feature interactions. Required for crystals models.
-        Prefitting typically uses the same dataset as the main training, but
-        with fewer epochs.
-      model_dir: Directory to save model parameters, graph, etc. This can
-        also be used to load checkpoints from the directory into an estimator to
-        continue training a previously saved model.
-      n_classes: Number of label classes. Defaults to 2, namely binary
-        classification. Must be > 1.
-      weight_column: A string or a `_NumericColumn` created by
-        `tf.feature_column.numeric_column` defining feature column representing
-        weights. It is only used by the estimator head to down weight or boost
-        examples during training. It will be multiplied by the loss of the
-        example. If it is a string, it is used as a key to fetch the weight
-        tensor from the `features` dictionary output of the input function. If
-        it is a `_NumericColumn`, a raw tensor is fetched by key
-        `weight_column.key`, then weight_column.normalizer_fn is applied on it
-        to get the weight tensor. Note that in both cases 'weight_column' should
-        *not* be a member of the 'feature_columns' parameter to the constructor
-        since these will be used for both serving and training.
-      label_vocabulary: A list of strings representing possible label values. If
-        given, labels must be of string type and take values in
-        `label_vocabulary`. If it is not given, labels must already be encoded
-        as integers or floats within [0, 1] for `n_classes=2`, or as integer
-        values in {0, 1, ..., n_classes-1} for `n_classes` > 2. An error is
-        raised if no vocabulary is provided and labels are strings.
-      optimizer: An instance of `tf.Optimizer` used to train the model. Can also
-        be a string (one of 'Adagrad', 'Adam', 'Ftrl', 'RMSProp', 'SGD'), or
-        callable. Defaults to Adagrad optimizer.
-      prefitting_optimizer: An instance of `tf.Optimizer` used to train the
-        model during the pre-fitting stage. Can also be a string (one of
-        'Adagrad', 'Adam', 'Ftrl', 'RMSProp', 'SGD'), or callable. Defaults to
-        Adagrad optimizer.
-      prefitting_steps: Number of steps for which to pretrain the model
-        during the prefitting stage. If None, train forever or train until
-        prefitting_input_fn generates the tf.errors.OutOfRange error or
-        StopIteration exception.
-      config: `RunConfig` object to configure the runtime settings.
-      warm_start_from: A string filepath to a checkpoint to warm-start from, or
-        a `WarmStartSettings` object to fully configure warm-starting. If the
-        string filepath is provided instead of a `WarmStartSettings`, then all
-        weights are warm-started, and it is assumed that vocabularies and Tensor
-        names are unchanged.
-      loss_reduction: One of `tf.losses.Reduction` except `NONE`. Describes how
-        to reduce training loss over batch. Defaults to `SUM_OVER_BATCH_SIZE`.
-      loss_fn: Optional loss function.
-      dtype: dtype of layers used in the model.
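`feature_analysis_input_fn` exists so that calibration input keypoints can be picked from data, typically as evenly spaced quantiles. A rough NumPy sketch of that computation, whose result can instead be fed straight into `pwl_calibration_input_keypoints` (the feature name and data are made up):

```python
import numpy as np
import tensorflow_lattice as tfl

def quantile_keypoints(values, num_keypoints=10):
  """Evenly spaced quantiles of `values`, deduplicated, as input keypoints."""
  quantiles = np.quantile(values, np.linspace(0.0, 1.0, num_keypoints))
  return np.unique(quantiles).tolist()

ages = np.random.uniform(29.0, 77.0, size=1000)
age_config = tfl.configs.FeatureConfig(
    name='age',
    monotonicity='increasing',
    pwl_calibration_input_keypoints=quantile_keypoints(ages, num_keypoints=5))
```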
- """ - config = estimator_lib.maybe_overwrite_model_dir_and_session_config( - config, model_dir) - model_dir = config.model_dir - - if n_classes == 2: - head = binary_class_head.BinaryClassHead( - weight_column=weight_column, - label_vocabulary=label_vocabulary, - loss_reduction=loss_reduction, - loss_fn=loss_fn) - else: - head = multi_class_head.MultiClassHead( - n_classes, - weight_column=weight_column, - label_vocabulary=label_vocabulary, - loss_reduction=loss_reduction, - loss_fn=loss_fn) - - label_dimension = 1 if n_classes == 2 else n_classes - - model_config = copy.deepcopy(model_config) - _update_by_feature_columns(model_config, feature_columns) - - _finalize_keypoints( - model_config=model_config, - config=config, - feature_columns=feature_columns, - feature_analysis_input_fn=feature_analysis_input_fn, - feature_analysis_weight_column=feature_analysis_weight_column, - feature_analysis_weight_reduction=feature_analysis_weight_reduction, - logits_output=True) - - _verify_config(model_config, feature_columns) - - _finalize_model_structure( - label_dimension=label_dimension, - feature_columns=feature_columns, - head=head, - model_config=model_config, - prefitting_input_fn=prefitting_input_fn, - prefitting_optimizer=prefitting_optimizer, - prefitting_steps=prefitting_steps, - model_dir=model_dir, - config=config, - warm_start_from=warm_start_from, - dtype=dtype) - - model_fn = _get_model_fn( - label_dimension=label_dimension, - feature_columns=feature_columns, - head=head, - model_config=model_config, - optimizer=optimizer, - dtype=dtype) - - super(CannedClassifier, self).__init__( - model_fn=model_fn, - model_dir=model_dir, - config=config, - warm_start_from=warm_start_from) - - -class CannedRegressor(estimator_lib.EstimatorV2): - """A regressor for TensorFlow lattice models. - - Creates a regressor for the model architecutre specified by the - `model_config`, which should be one of those defined in `tfl.configs`. - Calclulation of feature quantiles for input keypoint initialization is done - using `feature_analysis_input_fn`. If this auxiliary input fn is not provided, - all keypoint values should be explicitly provided via the `model_config`. - - Training loss is squared error as defined for the default TF regression head. - - Example: - - ```python - model_config = tfl.configs.CalibratedLatticeConfig(...) - feature_analysis_input_fn = create_input_fn(num_epochs=1, ...) - train_input_fn = create_input_fn(num_epochs=100, ...) - estimator = tfl.estimators.CannedRegressor( - feature_columns=feature_columns, - model_config=model_config, - feature_analysis_input_fn=feature_analysis_input_fn) - estimator.train(input_fn=train_input_fn) - ``` - """ - - def __init__(self, - model_config, - feature_columns, - feature_analysis_input_fn=None, - feature_analysis_weight_column=None, - feature_analysis_weight_reduction='mean', - prefitting_input_fn=None, - model_dir=None, - label_dimension=1, - weight_column=None, - optimizer='Adagrad', - prefitting_optimizer='Adagrad', - prefitting_steps=None, - config=None, - warm_start_from=None, - loss_reduction=tf.keras.losses.Reduction.SUM_OVER_BATCH_SIZE, - loss_fn=None, - dtype=tf.float32): - """Initializes a `CannedRegressor` instance. - - Args: - model_config: Model configuration object describing model architecutre. - Should be one of the model configs in `tfl.configs`. - feature_columns: An iterable containing all the feature columns used by - the model. 
-      feature_analysis_input_fn: An input_fn used to calculate statistics about
-        features and labels in order to set up calibration keypoints and values.
-      feature_analysis_weight_column: A string or a `_NumericColumn` created by
-        `tf.feature_column.numeric_column` defining feature column representing
-        weights used for calculating weighted feature statistics (quantiles).
-        Can be the same as `weight_column`.
-      feature_analysis_weight_reduction: Reduction used on weights when
-        aggregating repeated values during feature analysis. Can be either 'sum'
-        or 'mean'.
-      prefitting_input_fn: An input_fn used in the pre-fitting stage to estimate
-        non-linear feature interactions. Required for crystals models.
-        Prefitting typically uses the same dataset as the main training, but
-        with fewer epochs.
-      model_dir: Directory to save model parameters, graph, etc. This can
-        also be used to load checkpoints from the directory into an estimator to
-        continue training a previously saved model.
-      label_dimension: Number of regression targets per example. This is the
-        size of the last dimension of the labels and logits `Tensor` objects
-        (typically, these have shape `[batch_size, label_dimension]`).
-      weight_column: A string or a `_NumericColumn` created by
-        `tf.feature_column.numeric_column` defining feature column representing
-        weights. It is only used by the estimator head to down weight or boost
-        examples during training. It will be multiplied by the loss of the
-        example. If it is a string, it is used as a key to fetch the weight
-        tensor from the `features` dictionary output of the input function. If
-        it is a `_NumericColumn`, a raw tensor is fetched by key
-        `weight_column.key`, then weight_column.normalizer_fn is applied on it
-        to get the weight tensor. Note that in both cases 'weight_column' should
-        *not* be a member of the 'feature_columns' parameter to the constructor
-        since these will be used for both serving and training.
-      optimizer: An instance of `tf.Optimizer` used to train the model. Can also
-        be a string (one of 'Adagrad', 'Adam', 'Ftrl', 'RMSProp', 'SGD'), or
-        callable. Defaults to Adagrad optimizer.
-      prefitting_optimizer: An instance of `tf.Optimizer` used to train the
-        model during the pre-fitting stage. Can also be a string (one of
-        'Adagrad', 'Adam', 'Ftrl', 'RMSProp', 'SGD'), or callable. Defaults to
-        Adagrad optimizer.
-      prefitting_steps: Number of steps for which to pretrain the model
-        during the prefitting stage. If None, train forever or train until
-        prefitting_input_fn generates the tf.errors.OutOfRange error or
-        StopIteration exception.
-      config: `RunConfig` object to configure the runtime settings.
-      warm_start_from: A string filepath to a checkpoint to warm-start from, or
-        a `WarmStartSettings` object to fully configure warm-starting. If the
-        string filepath is provided instead of a `WarmStartSettings`, then all
-        weights are warm-started, and it is assumed that vocabularies and Tensor
-        names are unchanged.
-      loss_reduction: One of `tf.losses.Reduction` except `NONE`. Describes how
-        to reduce training loss over batch. Defaults to `SUM_OVER_BATCH_SIZE`.
-      loss_fn: Optional loss function. Defaults to `mean_squared_error`.
-      dtype: dtype of layers used in the model.
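The `create_input_fn` helpers referenced in the docstring examples are not part of the library. A minimal sketch of one built on `tf.data`, mirroring the pattern used by the tests later in this patch; it assumes a pandas DataFrame with a 'target' column (both assumptions):

```python
import pandas as pd
import tensorflow as tf

def create_input_fn(df: pd.DataFrame, num_epochs: int, batch_size: int = 100):
  """Returns an input_fn yielding (feature_dict, label) batches."""
  features = df.drop(columns=['target'])
  labels = df['target'].values

  def _input_fn():
    return (tf.data.Dataset
            .from_tensor_slices((features.to_dict('list'), labels))
            .batch(batch_size)
            .repeat(num_epochs))

  return _input_fn
```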
- """ - config = estimator_lib.maybe_overwrite_model_dir_and_session_config( - config, model_dir) - model_dir = config.model_dir - head = regression_head.RegressionHead( - label_dimension=label_dimension, - weight_column=weight_column, - loss_reduction=loss_reduction, - loss_fn=loss_fn) - - model_config = copy.deepcopy(model_config) - _update_by_feature_columns(model_config, feature_columns) - - _finalize_keypoints( - model_config=model_config, - config=config, - feature_columns=feature_columns, - feature_analysis_input_fn=feature_analysis_input_fn, - feature_analysis_weight_column=feature_analysis_weight_column, - feature_analysis_weight_reduction=feature_analysis_weight_reduction, - logits_output=False) - - _verify_config(model_config, feature_columns) - - _finalize_model_structure( - label_dimension=label_dimension, - feature_columns=feature_columns, - head=head, - model_config=model_config, - prefitting_input_fn=prefitting_input_fn, - prefitting_optimizer=prefitting_optimizer, - prefitting_steps=prefitting_steps, - model_dir=model_dir, - config=config, - warm_start_from=warm_start_from, - dtype=dtype) - - model_fn = _get_model_fn( - label_dimension=label_dimension, - feature_columns=feature_columns, - head=head, - model_config=model_config, - optimizer=optimizer, - dtype=dtype) - - super(CannedRegressor, self).__init__( - model_fn=model_fn, - model_dir=model_dir, - config=config, - warm_start_from=warm_start_from) - - -def _match_op(ops, regex): - """Returns ops that match given regex along with the matched sections.""" - matches = [] - prog = re.compile(regex) - for op in ops: - op_matches = prog.findall(op) - if op_matches: - matches.append((op, op_matches[0])) - return matches - - -def _create_feature_nodes(sess, ops, graph): - """Returns a map from feature name to InputFeatureNode.""" - # Extract list of features from the graph. - # {FEATURES_SCOPE}/{feature_name} - feature_nodes = {} - feature_op_re = r'^{}/(.*)'.format(re.escape(FEATURES_SCOPE)) - for (_, feature_name) in _match_op(ops, feature_op_re): - is_categorical = False - vocabulary_list = None - - # Check to see if there is a categorical mapping defined for this feature - # (i.e. lookup table for categorical feature column). Note that there can be - # at most one such mapping either under transform or transform_1 namespace. - # transform(_\d)?/{feature_name}_lookup/Const - category_table_re = r'transform(_\d)?/{}_lookup/Const'.format( - re.escape(feature_name)) - for (category_table_op, _) in _match_op(ops, category_table_re): - if is_categorical: - raise ValueError( - 'Model graph has multiple category mappings for feature {}'.format( - feature_name)) - is_categorical = True - vocabulary_list = sess.run( - graph.get_operation_by_name(category_table_op).outputs[0]) - # Replace byte types with their string values. - vocabulary_list = [ - str(x.decode()) if isinstance(x, bytes) else str(x) - for x in vocabulary_list - ] - - feature_node = model_info.InputFeatureNode( - name=feature_name, - is_categorical=is_categorical, - vocabulary_list=vocabulary_list) - feature_nodes[feature_name] = feature_node - return feature_nodes - - -def _create_categorical_calibration_nodes(sess, ops, graph, feature_nodes): - """Returns a map from feature_name to list of CategoricalCalibrationNode.""" - categorical_calibration_nodes = collections.defaultdict(list) - # Get calibrator output values. We need to call the read variable op. 
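The graph parsing above is driven entirely by regular expressions over operation names. A small self-contained sketch of the `_match_op` pattern, using made-up op names (the real scope prefixes come from constants such as `FEATURES_SCOPE`):

```python
import re

def match_op(ops, regex):
  """Returns (op_name, first_captured_group) pairs for ops matching `regex`."""
  prog = re.compile(regex)
  matches = []
  for op in ops:
    op_matches = prog.findall(op)
    if op_matches:
      matches.append((op, op_matches[0]))
  return matches

# Made-up op names in the style the extraction code expects.
ops = [
    'features/age',
    'features/thal',
    'calibration_age/pwl_calibration_kernel/Read/ReadVariableOp',
]
print(match_op(ops, r'^features/(.*)'))
# [('features/age', 'age'), ('features/thal', 'thal')]
```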
- # {CALIB_LAYER_NAME}_{feature_name}/ - # {CATEGORICAL_CALIBRATION_KERNEL_NAME}/Read/ReadVariableOp - kernel_op_re = '^{}_(.*)/{}/Read/ReadVariableOp$'.format( - premade_lib.CALIB_LAYER_NAME, - categorical_calibration_layer.CATEGORICAL_CALIBRATION_KERNEL_NAME, - ) - for kernel_op, feature_name in _match_op(ops, kernel_op_re): - output_values = sess.run(graph.get_operation_by_name(kernel_op).outputs[0]) - - # Get default input value if defined. - # {CALIB_LAYER_NAME}_{feature_name}/ - # {DEFAULT_INPUT_VALUE_NAME} - default_input_value_op = '^{}_{}/{}$'.format( - premade_lib.CALIB_LAYER_NAME, - feature_name, - categorical_calibration_layer.DEFAULT_INPUT_VALUE_NAME, - ) - if default_input_value_op in ops: - default_input = sess.run( - graph.get_operation_by_name(default_input_value_op).outputs[0]) - else: - default_input = None - - # Create one calibration node per output dimension of the calibrator. - for calibration_output_idx in range(output_values.shape[1]): - categorical_calibration_node = model_info.CategoricalCalibrationNode( - input_node=feature_nodes[feature_name], - output_values=output_values[:, calibration_output_idx], - default_input=default_input) - categorical_calibration_nodes[feature_name].append( - categorical_calibration_node) - return categorical_calibration_nodes - - -def _create_pwl_calibration_nodes(sess, ops, graph, feature_nodes): - """Returns a map from feature_name to list of PWLCalibrationNode.""" - pwl_calibration_nodes = collections.defaultdict(list) - # Calculate input keypoints. - # We extract lengh (deltas between keypoints) and kernel interpolation - # keypoints (which does not include the last keypoint), and then - # construct the full keypoints list using both. - - # Lengths (deltas between keypoints). - # {CALIB_LAYER_NAME}_{feature_name}/{LENGTHS_NAME} - lengths_op_re = '^{}_(.*)/{}$'.format( - premade_lib.CALIB_LAYER_NAME, - pwl_calibration_layer.LENGTHS_NAME, - ) - for lengths_op, feature_name in _match_op(ops, lengths_op_re): - # Interpolation keypoints does not inlcude the last input keypoint. - # {CALIB_LAYER_NAME}_{feature_name}/{INTERPOLATION_KEYPOINTS_NAME} - keypoints_op = '{}_{}/{}'.format( - premade_lib.CALIB_LAYER_NAME, - feature_name, - pwl_calibration_layer.INTERPOLATION_KEYPOINTS_NAME, - ) - - # Output keypoints. We need to call the varible read op. - # {CALIB_LAYER_NAME}_{feature_name}/{PWL_CALIBRATION_KERNEL_NAME} - kernel_op = '{}_{}/{}/Read/ReadVariableOp'.format( - premade_lib.CALIB_LAYER_NAME, - feature_name, - pwl_calibration_layer.PWL_CALIBRATION_KERNEL_NAME, - ) - - (lengths, keypoints, kernel) = sess.run( - (graph.get_operation_by_name(lengths_op).outputs[0], - graph.get_operation_by_name(keypoints_op).outputs[0], - graph.get_operation_by_name(kernel_op).outputs[0])) - output_keypoints = np.cumsum(kernel, axis=0) - - # For calibrators with fixed input keypoints, the shape of 'keypoints' - # and 'lengths' is (num_keypoints - 1,). For calibrators with keypoint_type - # set to learned_interior the shape is (num_units, num_keypoints - 1). - # Change the shape to (num_units, num_keypoints - 1). - # If needed, repeat the keypoints for each output unit. - if keypoints.ndim == 1: - keypoints = np.expand_dims(keypoints, axis=0) - lengths = np.expand_dims(lengths, axis=0) - if keypoints.shape[0] == 1: - keypoints = np.tile(keypoints, [output_keypoints.shape[1], 1]) - lengths = np.tile(lengths, [output_keypoints.shape[1], 1]) - - # Add the last keypoint to the keypoint list. The resulting shape is - # (num_units, num_keypoints). 
- # TODO: handle cyclic PWL layers. - input_keypoints = np.concatenate( - [keypoints, keypoints[:, -1:] + lengths[:, -1:]], axis=-1) - - # Get missing/default input value if present: - # {CALIB_LAYER_NAME}_{feature_name}/{MISSING_INPUT_VALUE_NAME} - default_input_value_op = '{}_{}/{}'.format( - premade_lib.CALIB_LAYER_NAME, - feature_name, - pwl_calibration_layer.MISSING_INPUT_VALUE_NAME, - ) - if default_input_value_op in ops: - default_input = sess.run( - graph.get_operation_by_name(default_input_value_op).outputs[0])[0] - else: - default_input = None - - # Find corresponding default/missing output if present. - # {CALIB_LAYER_NAME}_{feature_name}/{PWL_CALIBRATION_MISSING_OUTPUT_NAME} - default_output_op = '{}_{}/{}/Read/ReadVariableOp'.format( - premade_lib.CALIB_LAYER_NAME, - feature_name, - pwl_calibration_layer.PWL_CALIBRATION_MISSING_OUTPUT_NAME, - ) - if default_output_op in ops: - default_output = sess.run( - graph.get_operation_by_name(default_output_op).outputs[0]) - else: - default_output = None - - # Create one calibration node per output dimension of the calibrator. - for calibration_output_idx in range(output_keypoints.shape[1]): - pwl_calibration_node = model_info.PWLCalibrationNode( - input_node=feature_nodes[feature_name], - input_keypoints=input_keypoints[calibration_output_idx, :], - output_keypoints=output_keypoints[:, calibration_output_idx], - default_input=default_input, - default_output=(None if default_output is None else - default_output[:, calibration_output_idx])) - pwl_calibration_nodes[feature_name].append(pwl_calibration_node) - return pwl_calibration_nodes - - -def _create_submodel_input_map(ops, calibration_nodes_map): - """Returns a map from submodel_idx to a list of calibration nodes.""" - submodel_input_nodes = collections.defaultdict(list) - for feature_name, calibration_nodes in calibration_nodes_map.items(): - # Identity passthrough ops that pass this calibration to each submodel. - # {CALIB_PASSTHROUGH_NAME}_{feature_name}_ - # {calibration_output_idx}_{submodel_idx}_{submodel_input_idx} - shared_calib_passthrough_op_re = r'^{}_{}_(\d*)_(\d*)_(\d*)$'.format( - premade_lib.CALIB_PASSTHROUGH_NAME, feature_name) - for _, (calibration_output_idx, submodel_idx, - submodel_input_idx) in _match_op(ops, - shared_calib_passthrough_op_re): - submodel_input_nodes[submodel_idx].append( - (submodel_input_idx, calibration_nodes[int(calibration_output_idx)])) - return submodel_input_nodes - - -def _create_linear_nodes(sess, ops, graph, submodel_input_nodes): - """Returns a map from submodel_idx to LinearNode.""" - linear_nodes = {} - # Linear coefficients. - # {LINEAR_LAYER_NAME}_{submodel_idx}/{LINEAR_LAYER_KERNEL_NAME} - linear_kernel_op_re = '^{}_(.*)/{}/Read/ReadVariableOp$'.format( - premade_lib.LINEAR_LAYER_NAME, - linear_layer.LINEAR_LAYER_KERNEL_NAME, - ) - for linear_kernel_op, submodel_idx in _match_op(ops, linear_kernel_op_re): - coefficients = sess.run( - graph.get_operation_by_name(linear_kernel_op).outputs[0]).flatten() - - # Bias term. - # {LINEAR_LAYER_NAME}_{submodel_idx}/{LINEAR_LAYER_BIAS_NAME} - bias_op = '{}_{}/{}/Read/ReadVariableOp'.format( - premade_lib.LINEAR_LAYER_NAME, - submodel_idx, - linear_layer.LINEAR_LAYER_BIAS_NAME, - ) - if bias_op in ops: - bias = sess.run(graph.get_operation_by_name(bias_op).outputs[0]) - else: - bias = 0.0 - - # Sort input nodes by input index. 
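Numerically, the keypoint reconstruction in `_create_pwl_calibration_nodes` above is a cumulative sum over the kernel plus appending one final input keypoint. A NumPy sketch with made-up calibrator parameters for a single unit with four keypoints:

```python
import numpy as np

keypoints = np.array([[0.0, 1.0, 2.0]])          # interpolation keypoints (last one omitted)
lengths = np.array([[1.0, 1.0, 1.0]])            # deltas between adjacent input keypoints
kernel = np.array([[0.5], [0.2], [0.1], [0.3]])  # first row is the bias, the rest are deltas

# Output keypoints: cumulative sum of the kernel along the keypoint axis.
output_keypoints = np.cumsum(kernel, axis=0)     # [[0.5], [0.7], [0.8], [1.1]]

# Full input keypoints: append last interpolation keypoint + last length.
input_keypoints = np.concatenate(
    [keypoints, keypoints[:, -1:] + lengths[:, -1:]], axis=-1)  # [[0., 1., 2., 3.]]
```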
- input_nodes = [ - node for _, node in sorted(submodel_input_nodes[submodel_idx]) - ] - - linear_node = model_info.LinearNode( - input_nodes=input_nodes, coefficients=coefficients, bias=bias) - linear_nodes[submodel_idx] = linear_node - return linear_nodes - - -def _create_lattice_nodes(sess, ops, graph, submodel_input_nodes): - """Returns a map from submodel_idx to LatticeNode.""" - lattice_nodes = {} - # Lattice weights. - # {LATTICE_LAYER_NAME}_{submodel_idx}/{LATTICE_KERNEL_NAME} - lattice_kernel_op_re = '^{}_(.*)/{}/Read/ReadVariableOp$'.format( - premade_lib.LATTICE_LAYER_NAME, - lattice_layer.LATTICE_KERNEL_NAME, - ) - for lattice_kernel_op, submodel_idx in _match_op(ops, lattice_kernel_op_re): - lattice_kernel = sess.run( - graph.get_operation_by_name(lattice_kernel_op).outputs[0]).flatten() - - # Lattice sizes. - # {Lattice_LAYER_NAME}_{submodel_idx}/{LATTICE_SIZES_NAME} - lattice_sizes_op_name = '{}_{}/{}'.format(premade_lib.LATTICE_LAYER_NAME, - submodel_idx, - lattice_layer.LATTICE_SIZES_NAME) - lattice_sizes = sess.run( - graph.get_operation_by_name( - lattice_sizes_op_name).outputs[0]).flatten() - - # Shape the flat lattice parameters based on the calculated lattice sizes. - weights = np.reshape(lattice_kernel, lattice_sizes) - - # Sort input nodes by input index. - input_nodes = [ - node for _, node in sorted(submodel_input_nodes[submodel_idx]) - ] - - lattice_node = model_info.LatticeNode( - input_nodes=input_nodes, weights=weights) - lattice_nodes[submodel_idx] = lattice_node - return lattice_nodes - - -def _create_kronecker_factored_lattice_nodes(sess, ops, graph, - submodel_input_nodes): - """Returns a map from submodel_idx to KroneckerFactoredLatticeNode.""" - kfl_nodes = {} - # KroneckerFactoredLattice kernel weights. - # {KFL_LAYER_NAME}_{submodel_idx}/{KFL_KERNEL_NAME} - kfl_kernel_op_re = '^{}_(.*)/{}/Read/ReadVariableOp$'.format( - premade_lib.KFL_LAYER_NAME, - kfll.KFL_KERNEL_NAME, - ) - for kfl_kernel_op, submodel_idx in _match_op(ops, kfl_kernel_op_re): - kfl_kernel = sess.run( - graph.get_operation_by_name(kfl_kernel_op).outputs[0]).flatten() - - # KroneckerFactoredLattice scale. - # {KFL_LAYER_NAME}_{submodel_idx}/{KFL_SCALE_NAME} - kfl_scale_op_name = '{}_{}/{}/Read/ReadVariableOp'.format( - premade_lib.KFL_LAYER_NAME, submodel_idx, kfll.KFL_SCALE_NAME) - kfl_scale = sess.run( - graph.get_operation_by_name(kfl_scale_op_name).outputs[0]).flatten() - - # KroneckerFactoredLattice bias. - # {KFL_LAYER_NAME}_{submodel_idx}/{KFL_BIAS_NAME} - kfl_bias_op_name = '{}_{}/{}/Read/ReadVariableOp'.format( - premade_lib.KFL_LAYER_NAME, submodel_idx, kfll.KFL_BIAS_NAME) - kfl_bias = sess.run( - graph.get_operation_by_name(kfl_bias_op_name).outputs[0]).flatten() - - # Lattice sizes. - # {KFL_LAYER_NAME}_{submodel_idx}/{LATTICE_SIZES_NAME} - lattice_sizes_op_name = '{}_{}/{}'.format(premade_lib.KFL_LAYER_NAME, - submodel_idx, - kfll.LATTICE_SIZES_NAME) - lattice_sizes = sess.run( - graph.get_operation_by_name(lattice_sizes_op_name).outputs[0]) - - # Units. - # {KFL_LAYER_NAME}_{submodel_idx}/{UNITS_NAME} - units_op_name = '{}_{}/{}'.format(premade_lib.KFL_LAYER_NAME, submodel_idx, - kfll.UNITS_NAME) - units = sess.run(graph.get_operation_by_name(units_op_name).outputs[0]) - - # Dims. - # {KFL_LAYER_NAME}_{submodel_idx}/{DIMS_NAME} - dims_op_name = '{}_{}/{}'.format(premade_lib.KFL_LAYER_NAME, submodel_idx, - kfll.DIMS_NAME) - dims = sess.run(graph.get_operation_by_name(dims_op_name).outputs[0]) - - # Num terms. 
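The reshape in `_create_lattice_nodes` above recovers one weight per lattice vertex from the flat kernel. A NumPy sketch for a made-up lattice with sizes [2, 3]:

```python
import numpy as np

lattice_sizes = [2, 3]
lattice_kernel = np.array([0.0, 0.2, 0.5, 0.4, 0.7, 1.0])  # prod(lattice_sizes) values

weights = np.reshape(lattice_kernel, lattice_sizes)
# weights[i, j] is the lattice output at vertex (i, j); e.g. weights[1, 2] == 1.0
print(weights.shape)  # (2, 3)
```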
- # {KFL_LAYER_NAME}_{submodel_idx}/{NUM_TERMS_NAME} - num_terms_op_name = '{}_{}/{}'.format(premade_lib.KFL_LAYER_NAME, - submodel_idx, kfll.NUM_TERMS_NAME) - num_terms = sess.run( - graph.get_operation_by_name(num_terms_op_name).outputs[0]) - - # Shape the flat weights, scale, and bias parameters based on the calculated - # lattice_sizes, units, dims, and num_terms. - weights = np.reshape(kfl_kernel, - (1, lattice_sizes, units * dims, num_terms)) - scale = np.reshape(kfl_scale, (units, num_terms)) - bias = np.reshape(kfl_bias, (units)) - - # Sort input nodes by input index. - input_nodes = [ - node for _, node in sorted(submodel_input_nodes[submodel_idx]) - ] - - kfl_node = model_info.KroneckerFactoredLatticeNode( - input_nodes=input_nodes, weights=weights, scale=scale, bias=bias) - kfl_nodes[submodel_idx] = kfl_node - return kfl_nodes - - -def _create_rtl_submodel_kronecker_factored_lattice_nodes( - sess, ops, graph, flattened_calibration_nodes, submodel_idx, submodel_key): - """Returns next key and map from key+unit to KroneckerFactoredLatticeNode.""" - submodel_kfl_nodes = {} - # KroneckerFactoredLattice kernel weights - # {RTL_LAYER_NAME}_{submodel_idx}/ - # {RTL_KFL_NAME}_{monotonicities}/{KFL_KERNEL_NAME} - kfl_kernel_op_re = '^{}_{}/{}_(.*)/{}/Read/ReadVariableOp$'.format( - premade_lib.RTL_LAYER_NAME, - submodel_idx, - rtl_layer.RTL_KFL_NAME, - kfll.KFL_KERNEL_NAME, - ) - for kfl_kernel_op, monotonicities in _match_op(ops, kfl_kernel_op_re): - kfl_kernel = sess.run( - graph.get_operation_by_name(kfl_kernel_op).outputs[0]).flatten() - - # KroneckerFactoredLattice scale. - # {RTL_LAYER_NAME}_{submodel_idx}/ - # {RTL_KFL_NAME}_{monotonicities}/{KFL_SCALE_NAME} - kfl_scale_op_name = '{}_{}/{}_{}/{}/Read/ReadVariableOp'.format( - premade_lib.RTL_LAYER_NAME, - submodel_idx, - rtl_layer.RTL_KFL_NAME, - monotonicities, - kfll.KFL_SCALE_NAME, - ) - kfl_scale = sess.run( - graph.get_operation_by_name(kfl_scale_op_name).outputs[0]).flatten() - - # KroneckerFactoredLattice bias. - # {RTL_LAYER_NAME}_{submodel_idx}/ - # {RTL_KFL_NAME}_{monotonicities}/{KFL_BIAS_NAME} - kfl_bias_op_name = '{}_{}/{}_{}/{}/Read/ReadVariableOp'.format( - premade_lib.RTL_LAYER_NAME, - submodel_idx, - rtl_layer.RTL_KFL_NAME, - monotonicities, - kfll.KFL_BIAS_NAME, - ) - kfl_bias = sess.run( - graph.get_operation_by_name(kfl_bias_op_name).outputs[0]).flatten() - - # Lattice sizes. - # {RTL_LAYER_NAME}_{submodel_idx}/ - # {RTL_KFL_NAME}_{monotonicities}/{LATTICE_SIZES_NAME} - lattice_sizes_op_name = '{}_{}/{}_{}/{}'.format( - premade_lib.RTL_LAYER_NAME, - submodel_idx, - rtl_layer.RTL_KFL_NAME, - monotonicities, - kfll.LATTICE_SIZES_NAME, - ) - lattice_sizes = sess.run( - graph.get_operation_by_name(lattice_sizes_op_name).outputs[0]) - - # Dims. - # {RTL_LAYER_NAME}_{submodel_idx}/ - # {RTL_KFL_NAME}_{monotonicities}/{DIMS_NAME} - dims_op_name = '{}_{}/{}_{}/{}'.format( - premade_lib.RTL_LAYER_NAME, - submodel_idx, - rtl_layer.RTL_KFL_NAME, - monotonicities, - kfll.DIMS_NAME, - ) - dims = sess.run(graph.get_operation_by_name(dims_op_name).outputs[0]) - - # Num terms. 
- # {RTL_LAYER_NAME}_{submodel_idx}/ - # {RTL_KFL_NAME}_{monotonicities}/{NUM_TERMS_NAME} - num_terms_op_name = '{}_{}/{}_{}/{}'.format( - premade_lib.RTL_LAYER_NAME, - submodel_idx, - rtl_layer.RTL_KFL_NAME, - monotonicities, - kfll.NUM_TERMS_NAME, - ) - num_terms = sess.run( - graph.get_operation_by_name(num_terms_op_name).outputs[0]) - - # inputs_for_units - # {RTL_LAYER_NAME}_{submodel_index}/ - # {INPUTS_FOR_UNITS_PREFIX}_{monotonicities} - inputs_for_units_op_name = '{}_{}/{}_{}'.format( - premade_lib.RTL_LAYER_NAME, submodel_idx, - rtl_layer.INPUTS_FOR_UNITS_PREFIX, monotonicities) - inputs_for_units = sess.run( - graph.get_operation_by_name(inputs_for_units_op_name).outputs[0]) - - # Make a unique kfl for each unit. - units = inputs_for_units.shape[0] - for i in range(units): - # Shape the flat weights, scale, and bias parameters based on the - # calculated lattice_sizes, units, dims, and num_terms. - weights = np.reshape(kfl_kernel, - (1, lattice_sizes, units * dims, num_terms)) - scale = np.reshape(kfl_scale, (units, num_terms)) - bias = np.reshape(kfl_bias, (units)) - - # Gather input nodes for lattice node. - indices = inputs_for_units[i] - input_nodes = [flattened_calibration_nodes[index] for index in indices] - - kfl_node = model_info.KroneckerFactoredLatticeNode( - input_nodes=input_nodes, weights=weights, scale=scale, bias=bias) - submodel_kfl_nodes[submodel_key] = kfl_node - submodel_key += 1 - return submodel_key, submodel_kfl_nodes - - -def _create_rtl_submodel_lattice_nodes(sess, ops, graph, - flattened_calibration_nodes, - submodel_idx, submodel_key): - """Returns next key and map from key+unit to LatticeNode.""" - submodel_lattice_nodes = {} - # Lattice kernel weights. - # {RTL_LAYER_NAME}_{submodel_idx}/ - # {RTL_LATTICE_NAME}_{monotonicities}/{LATTICE_KERNEL_NAME} - lattice_kernel_op_re = '^{}_{}/{}_(.*)/{}/Read/ReadVariableOp$'.format( - premade_lib.RTL_LAYER_NAME, - submodel_idx, - rtl_layer.RTL_LATTICE_NAME, - lattice_layer.LATTICE_KERNEL_NAME, - ) - for lattice_kernel_op, monotonicities in _match_op(ops, lattice_kernel_op_re): - lattice_kernel = sess.run( - graph.get_operation_by_name(lattice_kernel_op).outputs[0]) - - # Lattice sizes. - # {RTL_LAYER_NAME}_{submodel_idx}/ - # {RTL_LATTICE_NAME}_{monotonicities}/{LATTICE_SIZES_NAME} - lattice_sizes_op_name = '{}_{}/{}_{}/{}'.format( - premade_lib.RTL_LAYER_NAME, submodel_idx, rtl_layer.RTL_LATTICE_NAME, - monotonicities, lattice_layer.LATTICE_SIZES_NAME) - lattice_sizes = sess.run( - graph.get_operation_by_name( - lattice_sizes_op_name).outputs[0]).flatten() - - # inputs_for_units - # {RTL_LAYER_NAME}_{submodel_index}/ - # {INPUTS_FOR_UNITS_PREFIX}_{monotonicities} - inputs_for_units_op_name = '{}_{}/{}_{}'.format( - premade_lib.RTL_LAYER_NAME, submodel_idx, - rtl_layer.INPUTS_FOR_UNITS_PREFIX, monotonicities) - inputs_for_units = sess.run( - graph.get_operation_by_name(inputs_for_units_op_name).outputs[0]) - - # Make a unique lattice for each unit. - units = inputs_for_units.shape[0] - for i in range(units): - # Shape the flat lattice parameters based on the calculated lattice - # sizes. - weights = np.reshape(lattice_kernel[:, i], lattice_sizes) - - # Gather input nodes for lattice node. 
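The `inputs_for_units` tensor read above is an integer lookup table: row `i` lists which flattened calibrator outputs feed lattice unit `i`. A sketch of the gathering step, with strings standing in for the calibration nodes and made-up indices:

```python
flattened_calibration_nodes = ['cal_age', 'cal_chol', 'cal_thal', 'cal_ca']

# Made-up inputs_for_units: two lattice units, each reading two calibrators.
inputs_for_units = [[0, 3], [2, 1]]

for unit, indices in enumerate(inputs_for_units):
  input_nodes = [flattened_calibration_nodes[index] for index in indices]
  print(unit, input_nodes)
# 0 ['cal_age', 'cal_ca']
# 1 ['cal_thal', 'cal_chol']
```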
- indices = inputs_for_units[i] - input_nodes = [flattened_calibration_nodes[index] for index in indices] - - lattice_node = model_info.LatticeNode( - input_nodes=input_nodes, weights=weights) - submodel_lattice_nodes[submodel_key] = lattice_node - submodel_key += 1 - return submodel_key, submodel_lattice_nodes - - -def _create_rtl_lattice_nodes(sess, ops, graph, calibration_nodes_map, - kronecker_factored): - """Returns a map from lattice_submodel_index to lattice type Node.""" - lattice_nodes = {} - lattice_submodel_index = 0 - # Feature name in concat op. - # {RTL_INPUT_NAME}_{feature_name}:0 - feature_name_prog = re.compile('^{}_(.*):0$'.format( - premade_lib.RTL_INPUT_NAME)) - # RTL Layer identified by single concat op per submodel. - # {RTL_LAYER_NAME}_{submodel_idx}/RTL_CONCAT_NAME - rtl_layer_concat_op_re = '^{}_(.*)/{}$'.format(premade_lib.RTL_LAYER_NAME, - rtl_layer.RTL_CONCAT_NAME) - for concat_op_name, submodel_idx in _match_op(ops, rtl_layer_concat_op_re): - # First we reconstruct the flattened calibration outputs for this submodel. - concat_op = graph.get_operation_by_name(concat_op_name) - input_names = [input_tensor.name for input_tensor in concat_op.inputs] - names_in_flattened_order = [] - for input_name in input_names: - match = feature_name_prog.match(input_name) - if match: - names_in_flattened_order.append(match.group(1)) - flattened_calibration_nodes = [] - for feature_name in names_in_flattened_order: - flattened_calibration_nodes.extend(calibration_nodes_map[feature_name]) - - if kronecker_factored: - node_fn = _create_rtl_submodel_kronecker_factored_lattice_nodes - else: - node_fn = _create_rtl_submodel_lattice_nodes - lattice_submodel_index, submodel_lattice_nodes = node_fn( - sess, ops, graph, flattened_calibration_nodes, submodel_idx, - lattice_submodel_index) - lattice_nodes.update(submodel_lattice_nodes) - return lattice_nodes - - -def _create_output_combination_node(sess, ops, graph, submodel_output_nodes): - """Returns None, a LinearNode, or a MeanNode.""" - output_combination_node = None - # Mean node is only added for ensemble models. - if len(submodel_output_nodes) > 1: - input_nodes = [ - submodel_output_nodes[idx] - for idx in sorted(submodel_output_nodes.keys(), key=int) - ] - - # Linear coefficients. - # {LINEAR_LAYER_COMBINATION_NAME}/{LINEAR_LAYER_KERNEL_NAME} - linear_combination_kernel_op = '{}/{}/Read/ReadVariableOp'.format( - premade_lib.OUTPUT_LINEAR_COMBINATION_LAYER_NAME, - linear_layer.LINEAR_LAYER_KERNEL_NAME, - ) - if linear_combination_kernel_op in ops: - coefficients = sess.run( - graph.get_operation_by_name( - linear_combination_kernel_op).outputs[0]).flatten() - - # Bias term. - # {OUTPUT_LINEAR_COMBINATION_LAYER_NAME}/{LINEAR_LAYER_BIAS_NAME} - bias_op = '{}/{}/Read/ReadVariableOp'.format( - premade_lib.OUTPUT_LINEAR_COMBINATION_LAYER_NAME, - linear_layer.LINEAR_LAYER_BIAS_NAME, - ) - if bias_op in ops: - bias = sess.run(graph.get_operation_by_name(bias_op).outputs[0]) - else: - bias = 0.0 - - linear_combination_node = model_info.LinearNode( - input_nodes=input_nodes, coefficients=coefficients, bias=bias) - output_combination_node = linear_combination_node - else: - average_node = model_info.MeanNode(input_nodes=input_nodes) - output_combination_node = average_node - return output_combination_node - - -def _create_output_calibration_node(sess, ops, graph, input_node): - """Returns a PWLCalibrationNode.""" - output_calibration_node = None - # Lengths (deltas between keypoints). 
- # {OUTPUT_CALIB_LAYER_NAME}/{LENGTHS_NAME} - lengths_op = '{}/{}'.format( - premade_lib.OUTPUT_CALIB_LAYER_NAME, - pwl_calibration_layer.LENGTHS_NAME, - ) - if lengths_op in ops: - # Interpolation keypoints does not inlcude the last input keypoint. - # {OUTPUT_CALIB_LAYER_NAME}/{INTERPOLATION_KEYPOINTS_NAME} - keypoints_op = '{}/{}'.format( - premade_lib.OUTPUT_CALIB_LAYER_NAME, - pwl_calibration_layer.INTERPOLATION_KEYPOINTS_NAME, - ) - - # Output keypoints. We need to call the varible read op. - # {OUTPUT_CALIB_LAYER_NAME}/{PWL_CALIBRATION_KERNEL_NAME} - kernel_op = '{}/{}/Read/ReadVariableOp'.format( - premade_lib.OUTPUT_CALIB_LAYER_NAME, - pwl_calibration_layer.PWL_CALIBRATION_KERNEL_NAME, - ) - - (lengths, keypoints, kernel) = sess.run( - (graph.get_operation_by_name(lengths_op).outputs[0], - graph.get_operation_by_name(keypoints_op).outputs[0], - graph.get_operation_by_name(kernel_op).outputs[0])) - output_keypoints = np.cumsum(kernel.flatten()) - - # Add the last keypoint to the keypoint list. - input_keypoints = np.append(keypoints, keypoints[-1] + lengths[-1]) - - output_calibration_node = model_info.PWLCalibrationNode( - input_node=input_node, - input_keypoints=input_keypoints, - output_keypoints=output_keypoints, - default_input=None, - default_output=None) - return output_calibration_node - - -# TODO: add support for KFL in RTL Layer -def get_model_graph(saved_model_path, tag='serve'): - """Returns all layers and parameters used in a saved model as a graph. - - The returned graph is not a TF graph, rather a graph of python object that - encodes the model structure and includes trained model parameters. The graph - can be used by the `tfl.visualization` module for plotting and other - visualization and analysis. - - Example: - - ```python - model_graph = estimators.get_model_graph(saved_model_path) - visualization.plot_feature_calibrator(model_graph, "feature_name") - visualization.plot_all_calibrators(model_graph) - visualization.draw_model_graph(model_graph) - ``` - - Args: - saved_model_path: Path to the saved model. - tag: Saved model tag for loading. - - Returns: - A `model_info.ModelGraph` object that includes the model graph. - """ - # List of all the nodes in the model. - nodes = [] - - # Dict from submodel index to the output node of the submodel. - submodel_output_nodes = {} - - tf.compat.v1.reset_default_graph() - with tf.compat.v1.Session() as sess: - tf.compat.v1.saved_model.loader.load(sess, [tag], saved_model_path) - graph = tf.compat.v1.get_default_graph() - ops = [op.name for op in graph.get_operations()] - - # Dict from feature name to corresponding InputFeatureNode object. - feature_nodes = _create_feature_nodes(sess, ops, graph) - nodes.extend(feature_nodes.values()) - - # Categorical Calibration Nodes. - categorical_calibration_nodes = _create_categorical_calibration_nodes( - sess, ops, graph, feature_nodes) - for calibration_nodes in categorical_calibration_nodes.values(): - nodes.extend(calibration_nodes) - - # PWL Calibration Nodes. - pwl_calibration_nodes = _create_pwl_calibration_nodes( - sess, ops, graph, feature_nodes) - for calibration_nodes in pwl_calibration_nodes.values(): - nodes.extend(calibration_nodes) - - # Dict from feature name to list of calibration nodes (Categorical and PWL). - calibration_nodes_map = {} - calibration_nodes_map.update(categorical_calibration_nodes) - calibration_nodes_map.update(pwl_calibration_nodes) - # Dict from submodel index to a list of calibrated inputs for the submodel. 
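The `ModelGraph` returned by `get_model_graph` is made of plain Python objects, so it can also be inspected without the `tfl.visualization` helpers (for code bases still on a release that includes this module, since this patch removes it). A sketch that prints each feature's PWL calibration keypoints, assuming the node attributes mirror the constructor arguments and using a placeholder SavedModel path:

```python
from tensorflow_lattice.python import estimators
from tensorflow_lattice.python import model_info

model_graph = estimators.get_model_graph('/tmp/exported_model')  # placeholder path

for node in model_graph.nodes:
  if (isinstance(node, model_info.PWLCalibrationNode) and
      isinstance(node.input_node, model_info.InputFeatureNode)):
    print(node.input_node.name, node.input_keypoints, node.output_keypoints)
```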
- submodel_input_nodes = _create_submodel_input_map(ops, - calibration_nodes_map) - - # Linear nodes - linear_nodes = _create_linear_nodes(sess, ops, graph, submodel_input_nodes) - submodel_output_nodes.update(linear_nodes) - nodes.extend(linear_nodes.values()) - - # Ensemble Lattice nodes. - lattice_nodes = _create_lattice_nodes(sess, ops, graph, - submodel_input_nodes) - submodel_output_nodes.update(lattice_nodes) - nodes.extend(lattice_nodes.values()) - - # Ensemble Kronecker Factored Lattice nodes. - kfl_nodes = _create_kronecker_factored_lattice_nodes( - sess, ops, graph, submodel_input_nodes) - submodel_output_nodes.update(kfl_nodes) - nodes.extend(kfl_nodes.values()) - - # RTL Lattice nodes. - rtl_lattice_nodes = _create_rtl_lattice_nodes( - sess, ops, graph, calibration_nodes_map, kronecker_factored=False) - submodel_output_nodes.update(rtl_lattice_nodes) - nodes.extend(rtl_lattice_nodes.values()) - - # RTL Kronecker Factored Lattice nodes. - kfl_rtl_nodes = _create_rtl_lattice_nodes( - sess, ops, graph, calibration_nodes_map, kronecker_factored=True) - submodel_output_nodes.update(kfl_rtl_nodes) - nodes.extend(kfl_rtl_nodes.values()) - - # Output combination node. - model_output_node = _create_output_combination_node(sess, ops, graph, - submodel_output_nodes) - if model_output_node: - nodes.append(model_output_node) - else: - model_output_node = list(submodel_output_nodes.values())[0] - - # Output calibration node. - output_calibration_node = _create_output_calibration_node( - sess, ops, graph, model_output_node) - if output_calibration_node: - nodes.append(output_calibration_node) - model_output_node = output_calibration_node - - return model_info.ModelGraph(nodes=nodes, output_node=model_output_node) diff --git a/tensorflow_lattice/python/estimators_test.py b/tensorflow_lattice/python/estimators_test.py deleted file mode 100644 index 9aabe58..0000000 --- a/tensorflow_lattice/python/estimators_test.py +++ /dev/null @@ -1,810 +0,0 @@ -# Copyright 2019 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Tests TFL canned estimators.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import copy - -from absl import logging -from absl.testing import parameterized -import numpy as np -import pandas as pd -from sklearn.datasets import load_boston -import tensorflow as tf -from tensorflow import estimator as tf_estimator -from tensorflow import feature_column as fc -from tensorflow.compat.v1 import estimator as tf_compat_v1_estimator -from tensorflow_lattice.python import configs -from tensorflow_lattice.python import estimators -from tensorflow_lattice.python import model_info -from tensorflow_estimator.python.estimator.head import regression_head - - -class CannedEstimatorsTest(parameterized.TestCase, tf.test.TestCase): - - def setUp(self): - super(CannedEstimatorsTest, self).setUp() - self.eps = 0.001 - tf.keras.utils.set_random_seed(42) - - # UCI Statlog (Heart) dataset. 
- heart_csv_file = tf.keras.utils.get_file( - 'heart.csv', - 'http://storage.googleapis.com/download.tensorflow.org/data/heart.csv') - heart_df = pd.read_csv(heart_csv_file) - heart_target = heart_df.pop('target') - heart_train_size = int(len(heart_df) * 0.8) - self.heart_train_x = heart_df[:heart_train_size] - self.heart_train_y = heart_target[:heart_train_size] - self.heart_test_x = heart_df[heart_train_size:] - self.heart_test_y = heart_target[heart_train_size:] - - # Feature columns. - # - age - # - sex - # - cp chest pain type (4 values) - # - trestbps resting blood pressure - # - chol serum cholestoral in mg/dl - # - fbs fasting blood sugar > 120 mg/dl - # - restecg resting electrocardiographic results (values 0,1,2) - # - thalach maximum heart rate achieved - # - exang exercise induced angina - # - oldpeak ST depression induced by exercise relative to rest - # - slope the slope of the peak exercise ST segment - # - ca number of major vessels (0-3) colored by flourosopy - # - thal 3 = normal; 6 = fixed defect; 7 = reversable defect - self.heart_feature_columns = [ - fc.numeric_column('age', default_value=-1), - fc.categorical_column_with_vocabulary_list('sex', [0, 1]), - fc.numeric_column('cp'), - fc.numeric_column('trestbps', default_value=-1), - fc.numeric_column('chol'), - fc.categorical_column_with_vocabulary_list('fbs', [0, 1]), - fc.categorical_column_with_vocabulary_list('restecg', [0, 1, 2]), - fc.numeric_column('thalach'), - fc.categorical_column_with_vocabulary_list('exang', [0, 1]), - fc.numeric_column('oldpeak'), - fc.categorical_column_with_vocabulary_list('slope', [0, 1, 2]), - fc.numeric_column('ca'), - fc.categorical_column_with_vocabulary_list( - 'thal', ['normal', 'fixed', 'reversible']), - ] - - # Feature configs. Each model can pick and choose which features to use. - self.heart_feature_configs = [ - configs.FeatureConfig( - name='age', - lattice_size=3, - pwl_calibration_num_keypoints=5, - monotonicity=1, - pwl_calibration_clip_max=100, - ), - configs.FeatureConfig( - name='cp', - pwl_calibration_num_keypoints=4, - pwl_calibration_input_keypoints='uniform', - monotonicity='increasing', - ), - configs.FeatureConfig( - name='chol', - pwl_calibration_input_keypoints=[126.0, 210.0, 247.0, 286.0, 564.0], - monotonicity=1, - pwl_calibration_clip_min=130, - pwl_calibration_clamp_min=True, - pwl_calibration_clamp_max=True, - regularizer_configs=[ - configs.RegularizerConfig(name='calib_hessian', l2=1e-4), - ], - ), - configs.FeatureConfig( - name='fbs', - monotonicity=[(0, 1)], - ), - configs.FeatureConfig( - name='trestbps', - pwl_calibration_num_keypoints=5, - monotonicity='decreasing', - ), - configs.FeatureConfig( - name='thalach', - pwl_calibration_num_keypoints=5, - monotonicity=-1, - ), - configs.FeatureConfig( - name='restecg', - monotonicity=[(0, 1), (0, 2)], - ), - configs.FeatureConfig( - name='exang', - monotonicity=[(0, 1)], - ), - configs.FeatureConfig( - name='oldpeak', - pwl_calibration_num_keypoints=5, - monotonicity=1, - ), - configs.FeatureConfig( - name='slope', - monotonicity=[(0, 1), (1, 2)], - ), - configs.FeatureConfig( - name='ca', - pwl_calibration_num_keypoints=4, - monotonicity='increasing', - ), - configs.FeatureConfig( - name='thal', - monotonicity=[('normal', 'fixed'), ('normal', 'reversible')], - ), - ] - - # UCI Boston dataset. 
- boston_dataset = load_boston() - boston_df = pd.DataFrame( - boston_dataset.data, columns=boston_dataset.feature_names) - boston_df['CHAS'] = boston_df['CHAS'].astype(np.int32) - boston_target = pd.Series(boston_dataset.target) - boston_train_size = int(len(boston_df) * 0.8) - self.boston_train_x = boston_df[:boston_train_size] - self.boston_train_y = boston_target[:boston_train_size] - self.boston_test_x = boston_df[boston_train_size:] - self.boston_test_y = boston_target[boston_train_size:] - - # Feature columns. - # - CRIM per capita crime rate by town - # - ZN proportion of residential land zoned for lots over 25,000 sq.ft - # - INDUS proportion of non-retail business acres per town - # - CHAS Charles River dummy variable (= 1 if tract bounds river) - # - NOX nitric oxides concentration (parts per 10 million) - # - RM average number of rooms per dwelling - # - AGE proportion of owner-occupied units built prior to 1940 - # - DIS weighted distances to five Boston employment centres - # - RAD index of accessibility to radial highways - # - TAX full-value property-tax rate per $10,000 - # - PTRATIO pupil-teacher ratio by town - # - B 1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town - # - LSTAT % lower status of the population - # - Target Median value of owner-occupied homes in $1000's - self.boston_feature_columns = [ - fc.numeric_column('CRIM'), - fc.numeric_column('ZN'), - fc.numeric_column('INDUS'), - fc.categorical_column_with_vocabulary_list('CHAS', [0, 1]), - fc.numeric_column('NOX'), - fc.numeric_column('RM'), - fc.numeric_column('AGE'), - fc.numeric_column('DIS'), - fc.numeric_column('RAD'), - fc.numeric_column('TAX'), - fc.numeric_column('PTRATIO'), - fc.numeric_column('B'), - fc.numeric_column('LSTAT'), - ] - - # Feature configs. Each model can pick and choose which features to use. 
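Note that `load_boston` was deprecated in scikit-learn 1.0 and removed in 1.2, so the setup above no longer runs against current releases. A sketch of the same train/test split using the California housing dataset instead (a substitution for illustration, not what the original test used):

```python
import pandas as pd
from sklearn.datasets import fetch_california_housing

housing = fetch_california_housing()
df = pd.DataFrame(housing.data, columns=housing.feature_names)
target = pd.Series(housing.target)

train_size = int(len(df) * 0.8)
train_x, test_x = df[:train_size], df[train_size:]
train_y, test_y = target[:train_size], target[train_size:]
```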
- self.boston_feature_configs = [ - configs.FeatureConfig( - name='CRIM', - lattice_size=3, - monotonicity=-1, - pwl_calibration_convexity=1, - ), - configs.FeatureConfig( - name='ZN', - pwl_calibration_input_keypoints=[0.0, 25.0, 50.0, 75.0, 100.0], - monotonicity=1, - reflects_trust_in=[ - configs.TrustConfig(feature_name='RM', trust_type='trapezoid'), - ], - ), - configs.FeatureConfig( - name='INDUS', - pwl_calibration_input_keypoints='uniform', - pwl_calibration_always_monotonic=False, - reflects_trust_in=[ - configs.TrustConfig( - feature_name='RM', - trust_type='edgeworth', - direction='negative'), - ], - regularizer_configs=[ - configs.RegularizerConfig(name='calib_wrinkle', l2=1e-4), - ], - ), - configs.FeatureConfig(name='CHAS',), - configs.FeatureConfig(name='NOX',), - configs.FeatureConfig( - name='RM', - monotonicity='increasing', - pwl_calibration_convexity='concave', - ), - configs.FeatureConfig( - name='AGE', - monotonicity=-1, - ), - configs.FeatureConfig( - name='DIS', - lattice_size=3, - unimodality=1, - ), - configs.FeatureConfig(name='RAD',), - configs.FeatureConfig(name='TAX',), - configs.FeatureConfig( - name='PTRATIO', - monotonicity='decreasing', - ), - configs.FeatureConfig(name='B',), - configs.FeatureConfig( - name='LSTAT', - monotonicity=-1, - dominates=[ - configs.DominanceConfig( - feature_name='AGE', dominance_type='monotonic'), - ], - ), - ] - - def _ResetAllBackends(self): - tf.keras.backend.clear_session() - tf.compat.v1.reset_default_graph() - - def _GetInputFn(self, x, y, num_epochs=1, batch_size=100, tfds=False): - if tfds: - - def _input_fn(): - return tf.data.Dataset.from_tensor_slices( - (x.to_dict('list'), y.values)).batch(batch_size).repeat(num_epochs) - - return _input_fn - else: - return tf_compat_v1_estimator.inputs.pandas_input_fn( - x=x, - y=y, - batch_size=batch_size, - shuffle=False, - num_epochs=num_epochs, - num_threads=1) - - def _GetHeartTrainInputFn(self, **kwargs): - return self._GetInputFn(self.heart_train_x, self.heart_train_y, **kwargs) - - def _GetHeartTestInputFn(self, **kwargs): - return self._GetInputFn( - self.heart_test_x, self.heart_test_y, num_epochs=1, **kwargs) - - def _GetBostonTrainInputFn(self, **kwargs): - return self._GetInputFn(self.boston_train_x, self.boston_train_y, **kwargs) - - def _GetBostonTestInputFn(self, **kwargs): - return self._GetInputFn( - self.boston_test_x, self.boston_test_y, num_epochs=1, **kwargs) - - @parameterized.parameters( - ([ - 'age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', - 'exang', 'oldpeak', 'slope', 'ca', 'thal' - ], [['sex', 'oldpeak'], ['fbs', 'thalach'], ['thalach', 'thal'], - ['cp', 'trestbps'], ['age', 'ca', 'chol'] - ], None, None, False, True, 0.8), - ([ - 'age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', - 'exang', 'oldpeak', 'slope', 'ca', 'thal' - ], 'random', 6, 5, True, False, 0.85), - ([ - 'age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', - 'exang', 'oldpeak', 'slope', 'ca', 'thal' - ], 'crystals', 6, 5, True, False, 0.85), - ([ - 'age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', - 'exang', 'oldpeak', 'slope', 'ca', 'thal' - ], 'rtl_layer', 6, 5, True, False, 0.85), - ) - def testCalibratedLatticeEnsembleClassifier(self, feature_names, lattices, - num_lattices, lattice_rank, - separate_calibrators, - output_calibration, auc): - self._ResetAllBackends() - feature_columns = [ - feature_column for feature_column in self.heart_feature_columns - if feature_column.name in feature_names - ] - 
feature_configs = [ - feature_config for feature_config in self.heart_feature_configs - if feature_config.name in feature_names - ] - if lattices == 'rtl_layer': - # RTL Layer only supports monotonicity and bound constraints. - feature_configs = copy.deepcopy(feature_configs) - for feature_config in feature_configs: - feature_config.lattice_size = 2 - feature_config.unimodality = 'none' - feature_config.reflects_trust_in = None - feature_config.dominates = None - feature_config.regularizer_configs = None - model_config = configs.CalibratedLatticeEnsembleConfig( - regularizer_configs=[ - configs.RegularizerConfig(name='torsion', l2=1e-4), - configs.RegularizerConfig(name='output_calib_hessian', l2=1e-4), - ], - feature_configs=feature_configs, - lattices=lattices, - num_lattices=num_lattices, - lattice_rank=lattice_rank, - separate_calibrators=separate_calibrators, - output_calibration=output_calibration, - ) - estimator = estimators.CannedClassifier( - feature_columns=feature_columns, - model_config=model_config, - feature_analysis_input_fn=self._GetHeartTrainInputFn(num_epochs=1), - prefitting_input_fn=self._GetHeartTrainInputFn(num_epochs=50), - optimizer=tf.keras.optimizers.legacy.Adam(0.01), - prefitting_optimizer=tf.keras.optimizers.legacy.Adam(0.01)) - estimator.train(input_fn=self._GetHeartTrainInputFn(num_epochs=200)) - results = estimator.evaluate(input_fn=self._GetHeartTestInputFn()) - logging.info('Calibrated lattice ensemble classifier results:') - logging.info(results) - self.assertGreater(results['auc'], auc) - - @parameterized.parameters( - (['age', 'sex', 'fbs', 'restecg', 'ca', 'thal'], False, 0.75), - (['age', 'cp', 'chol', 'slope', 'ca', 'thal'], False, 0.8), - (['trestbps', 'thalach', 'exang', 'oldpeak', 'thal'], True, 0.8), - ) - def testCalibratedLatticeClassifier(self, feature_names, output_calibration, - auc): - self._ResetAllBackends() - feature_columns = [ - feature_column for feature_column in self.heart_feature_columns - if feature_column.name in feature_names - ] - feature_configs = [ - feature_config for feature_config in self.heart_feature_configs - if feature_config.name in feature_names - ] - model_config = configs.CalibratedLatticeConfig( - regularizer_configs=[ - configs.RegularizerConfig(name='torsion', l2=1e-4), - configs.RegularizerConfig(name='output_calib_hessian', l2=1e-4), - ], - output_calibration=output_calibration, - feature_configs=feature_configs) - estimator = estimators.CannedClassifier( - feature_columns=feature_columns, - model_config=model_config, - feature_analysis_input_fn=self._GetHeartTrainInputFn(num_epochs=1), - optimizer=tf.keras.optimizers.legacy.Adam(0.01)) - estimator.train(input_fn=self._GetHeartTrainInputFn(num_epochs=200)) - results = estimator.evaluate(input_fn=self._GetHeartTestInputFn()) - logging.info('Calibrated lattice classifier results:') - logging.info(results) - self.assertGreater(results['auc'], auc) - - @parameterized.parameters( - (['age', 'sex', 'fbs', 'restecg', 'ca', 'thal' - ], False, False, None, None, 'mean', 0.7), - ([ - 'age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', - 'exang', 'oldpeak', 'slope', 'ca', 'thal' - ], True, True, None, None, 'mean', 0.8), - (['age', 'sex', 'fbs', 'restecg', 'ca', 'thal' - ], False, False, 'thalach', None, 'mean', 0.7), - (['age', 'sex', 'fbs', 'restecg', 'ca', 'thal' - ], False, False, 'thalach', 'thalach', 'mean', 0.7), - (['age', 'sex', 'fbs', 'restecg', 'ca', 'thal' - ], False, False, 'thalach', 'thalach', 'sum', 0.7), - ) - def 
testCalibratedLinearClassifier(self, feature_names, output_calibration, - use_bias, weight_column, - feature_analysis_weight_column, - feature_analysis_weight_reduction, auc): - self._ResetAllBackends() - feature_columns = [ - feature_column for feature_column in self.heart_feature_columns - if feature_column.name in feature_names - ] - feature_configs = [ - feature_config for feature_config in self.heart_feature_configs - if feature_config.name in feature_names - ] - model_config = configs.CalibratedLinearConfig( - use_bias=use_bias, - regularizer_configs=[ - configs.RegularizerConfig(name='output_calib_hessian', l2=1e-4), - ], - output_calibration=output_calibration, - feature_configs=feature_configs) - estimator = estimators.CannedClassifier( - feature_columns=feature_columns, - model_config=model_config, - feature_analysis_input_fn=self._GetHeartTrainInputFn(num_epochs=1), - weight_column=weight_column, - feature_analysis_weight_column=feature_analysis_weight_column, - feature_analysis_weight_reduction=feature_analysis_weight_reduction, - optimizer=tf.keras.optimizers.legacy.Adam(0.01)) - estimator.train(input_fn=self._GetHeartTrainInputFn(num_epochs=200)) - results = estimator.evaluate(input_fn=self._GetHeartTestInputFn()) - logging.info('Calibrated linear classifier results:') - logging.info(results) - self.assertGreater(results['auc'], auc) - - @parameterized.parameters( - ([ - 'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', - 'TAX', 'PTRATIO', 'B', 'LSTAT' - ], [['CRIM', 'ZN', 'RAD', 'DIS'], ['PTRATIO', 'LSTAT', 'ZN', 'RM'], - ['AGE', 'NOX', 'B'], ['INDUS', 'NOX', 'PTRATIO'], ['TAX', 'CHAS'], - ['CRIM', 'INDUS', 'AGE', 'RM', 'CHAS'] - ], None, None, False, True, 60.0), - ([ - 'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', - 'TAX', 'PTRATIO', 'B', 'LSTAT' - ], 'random', 6, 5, True, False, 50.0), - ([ - 'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', - 'TAX', 'PTRATIO', 'B', 'LSTAT' - ], 'crystals', 6, 5, True, False, 50.0), - ([ - 'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', - 'TAX', 'PTRATIO', 'B', 'LSTAT' - ], 'rtl_layer', 6, 5, True, False, 50.0), - ) - def testCalibratedLatticeEnsembleRegressor(self, feature_names, lattices, - num_lattices, lattice_rank, - separate_calibrators, - output_calibration, average_loss): - self._ResetAllBackends() - feature_columns = [ - feature_column for feature_column in self.boston_feature_columns - if feature_column.name in feature_names - ] - feature_configs = [ - feature_config for feature_config in self.boston_feature_configs - if feature_config.name in feature_names - ] - if lattices == 'rtl_layer': - # RTL Layer only supports monotonicity and bound constraints. 
- feature_configs = copy.deepcopy(feature_configs) - for feature_config in feature_configs: - feature_config.lattice_size = 2 - feature_config.unimodality = 'none' - feature_config.reflects_trust_in = None - feature_config.dominates = None - feature_config.regularizer_configs = None - model_config = configs.CalibratedLatticeEnsembleConfig( - regularizer_configs=[ - configs.RegularizerConfig(name='torsion', l2=1e-5), - configs.RegularizerConfig(name='output_calib_hessian', l2=1e-5), - ], - feature_configs=feature_configs, - lattices=lattices, - num_lattices=num_lattices, - lattice_rank=lattice_rank, - separate_calibrators=separate_calibrators, - output_calibration=output_calibration, - ) - estimator = estimators.CannedRegressor( - feature_columns=feature_columns, - model_config=model_config, - feature_analysis_input_fn=self._GetBostonTrainInputFn(num_epochs=1), - prefitting_input_fn=self._GetBostonTrainInputFn(num_epochs=50), - optimizer=tf.keras.optimizers.legacy.Adam(0.05), - prefitting_optimizer=tf.keras.optimizers.legacy.Adam(0.05)) - estimator.train(input_fn=self._GetBostonTrainInputFn(num_epochs=200)) - results = estimator.evaluate(input_fn=self._GetBostonTestInputFn()) - logging.info('Calibrated lattice ensemble regressor results:') - logging.info(results) - self.assertLess(results['average_loss'], average_loss) - - @parameterized.parameters( - (['CRIM', 'ZN', 'RM', 'DIS', 'PTRATIO', 'LSTAT'], False, 40.0), - (['CRIM', 'INDUS', 'CHAS', 'NOX', 'AGE', 'RAD', 'TAX', 'B'], True, 40.0), - (['CRIM', 'INDUS', 'LSTAT', 'NOX', 'AGE', 'RAD', 'TAX', 'B'], True, 40.0), - ) - def testCalibratedLatticeRegressor(self, feature_names, output_calibration, - average_loss): - self._ResetAllBackends() - feature_columns = [ - feature_column for feature_column in self.boston_feature_columns - if feature_column.name in feature_names - ] - feature_configs = [ - feature_config for feature_config in self.boston_feature_configs - if feature_config.name in feature_names - ] - model_config = configs.CalibratedLinearConfig( - regularizer_configs=[ - configs.RegularizerConfig(name='torsion', l2=1e-4), - configs.RegularizerConfig(name='output_calib_hessian', l2=1e-4), - ], - output_calibration=output_calibration, - feature_configs=feature_configs) - estimator = estimators.CannedRegressor( - feature_columns=feature_columns, - model_config=model_config, - feature_analysis_input_fn=self._GetBostonTrainInputFn(num_epochs=1), - optimizer=tf.keras.optimizers.legacy.Adam(0.01)) - estimator.train(input_fn=self._GetBostonTrainInputFn(num_epochs=200)) - results = estimator.evaluate(input_fn=self._GetBostonTestInputFn()) - logging.info('Calibrated lattice regressor results:') - logging.info(results) - self.assertLess(results['average_loss'], average_loss) - - @parameterized.parameters( - (['CRIM', 'ZN', 'RM', 'DIS', 'PTRATIO', 'LSTAT'], False, False, 40.0), - ([ - 'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', - 'TAX', 'PTRATIO', 'B', 'LSTAT' - ], True, True, 40.0), - ) - def testCalibratedLinearRegressor(self, feature_names, output_calibration, - use_bias, average_loss): - self._ResetAllBackends() - feature_columns = [ - feature_column for feature_column in self.boston_feature_columns - if feature_column.name in feature_names - ] - feature_configs = [ - feature_config for feature_config in self.boston_feature_configs - if feature_config.name in feature_names - ] - model_config = configs.CalibratedLinearConfig( - use_bias=use_bias, - regularizer_configs=[ - 
configs.RegularizerConfig(name='output_calib_hessian', l2=1e-4), - ], - output_calibration=output_calibration, - feature_configs=feature_configs) - estimator = estimators.CannedRegressor( - feature_columns=feature_columns, - model_config=model_config, - feature_analysis_input_fn=self._GetBostonTrainInputFn(num_epochs=1), - optimizer=tf.keras.optimizers.legacy.Adam(0.01)) - estimator.train(input_fn=self._GetBostonTrainInputFn(num_epochs=200)) - results = estimator.evaluate(input_fn=self._GetBostonTestInputFn()) - logging.info('Calibrated linear regressor results:') - logging.info(results) - self.assertLess(results['average_loss'], average_loss) - - @parameterized.parameters( - (['CRIM', 'ZN', 'RM', 'DIS', 'PTRATIO', 'LSTAT'], False, False, 40.0), - ([ - 'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', - 'TAX', 'PTRATIO', 'B', 'LSTAT' - ], True, True, 40.0), - ) - def testCalibratedLinearEstimator(self, feature_names, output_calibration, - use_bias, average_loss): - self._ResetAllBackends() - feature_columns = [ - feature_column for feature_column in self.boston_feature_columns - if feature_column.name in feature_names - ] - feature_configs = [ - feature_config for feature_config in self.boston_feature_configs - if feature_config.name in feature_names - ] - model_config = configs.CalibratedLinearConfig( - use_bias=use_bias, - regularizer_configs=[ - configs.RegularizerConfig(name='output_calib_hessian', l2=1e-4), - ], - output_calibration=output_calibration, - feature_configs=feature_configs) - estimator = estimators.CannedEstimator( - head=regression_head.RegressionHead(), - feature_columns=feature_columns, - model_config=model_config, - feature_analysis_input_fn=self._GetBostonTrainInputFn(num_epochs=1), - optimizer=tf.keras.optimizers.legacy.Adam(0.01)) - estimator.train(input_fn=self._GetBostonTrainInputFn(num_epochs=200)) - results = estimator.evaluate(input_fn=self._GetBostonTestInputFn()) - logging.info('Calibrated linear regressor results:') - logging.info(results) - self.assertLess(results['average_loss'], average_loss) - - @parameterized.parameters( - ('random', 5, 6, 'all_vertices', False, True), - ('random', 4, 5, 'kronecker_factored', True, False), - ('rtl_layer', 5, 6, 'kronecker_factored', False, True), - ('rtl_layer', 4, 5, 'all_vertices', True, False), - ) - def testCalibratedLatticeEnsembleModelInfo(self, lattices, num_lattices, - lattice_rank, parameterization, - separate_calibrators, - output_calibration): - self._ResetAllBackends() - feature_configs = copy.deepcopy(self.heart_feature_configs) - if lattices == 'rtl_layer' or parameterization == 'kronecker_factored': - # RTL Layer only supports monotonicity and bound constraints. 
- for feature_config in feature_configs: - feature_config.lattice_size = 2 - feature_config.unimodality = 'none' - feature_config.reflects_trust_in = None - feature_config.dominates = None - feature_config.regularizer_configs = None - model_config = configs.CalibratedLatticeEnsembleConfig( - feature_configs=feature_configs, - lattices=lattices, - num_lattices=num_lattices, - lattice_rank=lattice_rank, - parameterization=parameterization, - separate_calibrators=separate_calibrators, - output_calibration=output_calibration, - ) - estimator = estimators.CannedClassifier( - feature_columns=self.heart_feature_columns, - model_config=model_config, - feature_analysis_input_fn=self._GetHeartTrainInputFn(num_epochs=1), - prefitting_input_fn=self._GetHeartTrainInputFn(num_epochs=5), - optimizer=tf.keras.optimizers.legacy.Adam(0.01), - prefitting_optimizer=tf.keras.optimizers.legacy.Adam(0.01)) - estimator.train(input_fn=self._GetHeartTrainInputFn(num_epochs=20)) - - # Serving input fn is used to create saved models. - serving_input_fn = ( - tf_estimator.export.build_parsing_serving_input_receiver_fn( - feature_spec=fc.make_parse_example_spec(self.heart_feature_columns)) - ) - saved_model_path = estimator.export_saved_model(estimator.model_dir, - serving_input_fn) - logging.info('Model exported to %s', saved_model_path) - model = estimators.get_model_graph(saved_model_path) - - expected_num_nodes = ( - len(self.heart_feature_columns) + # Input features - num_lattices + # One lattice per submodel - 1 + # Averaging submodels - int(output_calibration)) # Output calibration - if separate_calibrators: - expected_num_nodes += num_lattices * lattice_rank - else: - expected_num_nodes += len(self.heart_feature_columns) - - self.assertLen(model.nodes, expected_num_nodes) - - @parameterized.parameters( - (['ZN', 'INDUS', 'RM'], 'random', 3, 1, [['ZN', 'RM'], ['RM'], ['INDUS'] - ]), - (['ZN', 'INDUS', 'RM'], 'crystals', 3, 1, [['RM'], ['INDUS'], - ['ZN', 'RM']]), - (['RM', 'LSTAT', 'AGE'], 'crystals', 3, 1, [['LSTAT'], ['LSTAT', 'AGE'], - ['RM']]), - ) - def testCalibratedLatticeEnsembleFix2dConstraintViolations( - self, feature_names, lattices, num_lattices, lattice_rank, - expected_lattices): - self._ResetAllBackends() - feature_columns = [ - feature_column for feature_column in self.boston_feature_columns - if feature_column.name in feature_names - ] - feature_configs = [ - feature_config for feature_config in self.boston_feature_configs - if feature_config.name in feature_names - ] - - model_config = configs.CalibratedLatticeEnsembleConfig( - feature_configs=feature_configs, - lattices=lattices, - num_lattices=num_lattices, - lattice_rank=lattice_rank, - ) - estimator = estimators.CannedRegressor( - feature_columns=feature_columns, - model_config=model_config, - feature_analysis_input_fn=self._GetBostonTrainInputFn(num_epochs=1), - prefitting_input_fn=self._GetBostonTrainInputFn(num_epochs=50), - optimizer=tf.keras.optimizers.legacy.Adam(0.05), - prefitting_optimizer=tf.keras.optimizers.legacy.Adam(0.05)) - estimator.train(input_fn=self._GetBostonTrainInputFn(num_epochs=200)) - - # Serving input fn is used to create saved models. 
- serving_input_fn = ( - tf_estimator.export.build_parsing_serving_input_receiver_fn( - feature_spec=fc.make_parse_example_spec(feature_columns))) - saved_model_path = estimator.export_saved_model(estimator.model_dir, - serving_input_fn) - logging.info('Model exported to %s', saved_model_path) - model = estimators.get_model_graph(saved_model_path) - lattices = [] - for node in model.nodes: - if isinstance(node, model_info.LatticeNode): - lattices.append( - [input_node.input_node.name for input_node in node.input_nodes]) - - self.assertLen(lattices, len(expected_lattices)) - for lattice, expected_lattice in zip(lattices, expected_lattices): - self.assertCountEqual(lattice, expected_lattice) - - @parameterized.parameters((True,), (False,)) - def testDatasetAPI(self, tfds): - self._ResetAllBackends() - feature_columns = self.heart_feature_columns - feature_configs = self.heart_feature_configs - model_config = configs.CalibratedLinearConfig( - feature_configs=feature_configs) - estimator = estimators.CannedClassifier( - feature_columns=feature_columns, - model_config=model_config, - feature_analysis_input_fn=self._GetHeartTrainInputFn( - num_epochs=1, tfds=tfds), - optimizer=tf.keras.optimizers.legacy.Adam(0.01)) - estimator.train( - input_fn=self._GetHeartTrainInputFn(num_epochs=200, tfds=tfds)) - results = estimator.evaluate(input_fn=self._GetHeartTestInputFn(tfds=tfds)) - logging.info('Calibrated linear classifier results:') - logging.info(results) - self.assertGreater(results['auc'], 0.7) - - @parameterized.parameters( - ('linear', None, True), - ('lattice', 'all_vertices', False), - ('lattice', 'kronecker_factored', False), - ) - def testCalibratedModelInfo(self, model_type, parameterization, - output_calibration): - self._ResetAllBackends() - if model_type == 'linear': - model_config = configs.CalibratedLinearConfig( - feature_configs=self.heart_feature_configs, - output_calibration=output_calibration, - ) - else: - feature_configs = copy.deepcopy(self.heart_feature_configs) - if parameterization == 'kronecker_factored': - # RTL Layer only supports monotonicity and bound constraints. - for feature_config in feature_configs: - feature_config.lattice_size = 2 - feature_config.unimodality = 'none' - feature_config.reflects_trust_in = None - feature_config.dominates = None - feature_config.regularizer_configs = None - model_config = configs.CalibratedLatticeConfig( - feature_configs=feature_configs, - parameterization=parameterization, - output_calibration=output_calibration, - ) - estimator = estimators.CannedClassifier( - feature_columns=self.heart_feature_columns, - model_config=model_config, - feature_analysis_input_fn=self._GetHeartTrainInputFn(num_epochs=1), - prefitting_input_fn=self._GetHeartTrainInputFn(num_epochs=5), - optimizer=tf.keras.optimizers.legacy.Adam(0.01), - prefitting_optimizer=tf.keras.optimizers.legacy.Adam(0.01)) - estimator.train(input_fn=self._GetHeartTrainInputFn(num_epochs=20)) - - # Serving input fn is used to create saved models. 
- serving_input_fn = ( - tf_estimator.export.build_parsing_serving_input_receiver_fn( - feature_spec=fc.make_parse_example_spec(self.heart_feature_columns)) - ) - saved_model_path = estimator.export_saved_model(estimator.model_dir, - serving_input_fn) - logging.info('Model exported to %s', saved_model_path) - model = estimators.get_model_graph(saved_model_path) - - expected_num_nodes = ( - 2 * len(self.heart_feature_columns) + # Input features and calibration - 1 + # Linear or lattice layer - int(output_calibration)) # Output calibration - - self.assertLen(model.nodes, expected_num_nodes) - - -if __name__ == '__main__': - tf.test.main() diff --git a/tensorflow_lattice/python/kronecker_factored_lattice_layer.py b/tensorflow_lattice/python/kronecker_factored_lattice_layer.py index 89ad6f1..1e3ff4b 100644 --- a/tensorflow_lattice/python/kronecker_factored_lattice_layer.py +++ b/tensorflow_lattice/python/kronecker_factored_lattice_layer.py @@ -26,10 +26,16 @@ import functools import inspect +import tensorflow as tf +# pylint: disable=g-import-not-at-top +# Use Keras 2. +version_fn = getattr(tf.keras, "version", None) +if version_fn and version_fn().startswith("3."): + import tf_keras as keras +else: + keras = tf.keras from . import kronecker_factored_lattice_lib as kfl_lib from . import utils -import tensorflow as tf -from tensorflow import keras DIMS_NAME = "dims" KFL_SCALE_NAME = "kronecker_factored_lattice_scale" @@ -152,7 +158,7 @@ def __init__(self, only output_max is set, scale is initialized to -1 for each term. Otherwise scale is initialized to alternate between 1 and -1 for each term. - **kwargs: Other args passed to `tf.keras.layers.Layer` initializer. + **kwargs: Other args passed to `keras.layers.Layer` initializer. Raises: ValueError: If layer hyperparameters are invalid. @@ -467,8 +473,8 @@ def __call__(self, shape, scale, dtype=None, **kwargs): shape: Must be: `(1, lattice_sizes, units * dims, num_terms)`. scale: Scale variable of shape: `(units, num_terms)`. dtype: Standard Keras initializer param. - **kwargs: Other args passed to `tf.keras.initializers.Initializer` - __call__ method. + **kwargs: Other args passed to `keras.initializers.Initializer` __call__ + method. """ return kfl_lib.kfl_random_monotonic_initializer( shape=shape, @@ -519,8 +525,8 @@ def __call__(self, shape, dtype=None, **kwargs): Args: shape: Must be: `(units, num_terms)`. dtype: Standard Keras initializer param. - **kwargs: Other args passed to `tf.keras.initializers.Initializer` - __call__ method. + **kwargs: Other args passed to `keras.initializers.Initializer` __call__ + method. """ units, num_terms = shape return kfl_lib.scale_initializer( @@ -565,8 +571,8 @@ def __call__(self, shape, dtype=None, **kwargs): Args: shape: Must be: `(units, num_terms)`. dtype: Standard Keras initializer param. - **kwargs: Other args passed to `tf.keras.initializers.Initializer` - __call__ method. + **kwargs: Other args passed to `keras.initializers.Initializer` __call__ + method. 
""" return kfl_lib.bias_initializer( units=shape[0], diff --git a/tensorflow_lattice/python/kronecker_factored_lattice_test.py b/tensorflow_lattice/python/kronecker_factored_lattice_test.py index ac3bd12..a102be1 100644 --- a/tensorflow_lattice/python/kronecker_factored_lattice_test.py +++ b/tensorflow_lattice/python/kronecker_factored_lattice_test.py @@ -23,10 +23,16 @@ from absl.testing import parameterized import numpy as np import tensorflow as tf -from tensorflow import keras from tensorflow_lattice.python import kronecker_factored_lattice_layer as kfll from tensorflow_lattice.python import kronecker_factored_lattice_lib as kfl_lib from tensorflow_lattice.python import test_utils +# pylint: disable=g-import-not-at-top +# Use Keras 2. +version_fn = getattr(tf.keras, "version", None) +if version_fn and version_fn().startswith("3."): + import tf_keras as keras +else: + keras = tf.keras class KroneckerFactoredLatticeTest(parameterized.TestCase, tf.test.TestCase): @@ -38,7 +44,7 @@ def setUp(self): self.loss_eps = 0.001 self.small_eps = 1e-6 self.seed = 42 - tf.keras.utils.set_random_seed(42) + keras.utils.set_random_seed(42) def _ResetAllBackends(self): keras.backend.clear_session() @@ -154,10 +160,8 @@ def _GetTrainingInputsAndLabels(self, config): config: Dictionary with config for this unit test. Returns: - Tuple `(training_inputs, training_labels, raw_training_inputs)` where - `training_inputs` and `training_labels` are data for training and - `raw_training_inputs` are representation of training_inputs for - visualisation. + Tuple `(training_inputs, training_labels)` where + `training_inputs` and `training_labels` are data for training. """ raw_training_inputs = config["x_generator"]( num_points=config["num_training_records"], @@ -172,7 +176,7 @@ def _GetTrainingInputsAndLabels(self, config): training_inputs = raw_training_inputs training_labels = [config["y_function"](x) for x in training_inputs] - return training_inputs, training_labels, raw_training_inputs + return training_inputs, training_labels def _SetDefaults(self, config): config.setdefault("units", 1) @@ -201,17 +205,17 @@ def _TestEnsemble(self, config): for units, lattice_index in [(1, 0), (3, 0), (3, 2)]: config["units"] = units config["lattice_index"] = lattice_index - tf.keras.utils.set_random_seed(42) + keras.utils.set_random_seed(42) losses.append(self._TrainModel(config)) self.assertAlmostEqual(min(losses), max(losses), delta=self.loss_eps) - def _TrainModel(self, config, plot_path=None): + def _TrainModel(self, config): logging.info("Testing config:") logging.info(config) config = self._SetDefaults(config) self._ResetAllBackends() - training_inputs, training_labels, raw_training_inputs = ( + training_inputs, training_labels = ( self._GetTrainingInputsAndLabels(config)) units = config["units"] @@ -253,12 +257,10 @@ def _TrainModel(self, config, plot_path=None): optimizer = config["optimizer"](learning_rate=config["learning_rate"]) model.compile(loss=keras.losses.mean_squared_error, optimizer=optimizer) - training_data = (training_inputs, training_labels, raw_training_inputs) + training_data = (training_inputs, training_labels) loss = test_utils.run_training_loop( - config=config, - training_data=training_data, - keras_model=model, - plot_path=plot_path) + config=config, training_data=training_data, keras_model=model + ) if tf.executing_eagerly(): tf.print("final weights: ", keras_layer.kernel) @@ -282,7 +284,7 @@ def testMonotonicityOneD(self): "input_dims": 1, "num_training_records": 128, "num_training_epoch": 50, - 
"optimizer": tf.keras.optimizers.legacy.Adagrad, + "optimizer": keras.optimizers.legacy.Adagrad, "learning_rate": 1.0, "x_generator": self._ScatterXUniformly, "y_function": self._SinPlusX, @@ -301,7 +303,7 @@ def testMonotonicityOneD(self): "input_dims": 1, "num_training_records": 100, "num_training_epoch": 50, - "optimizer": tf.keras.optimizers.legacy.Adagrad, + "optimizer": keras.optimizers.legacy.Adagrad, "learning_rate": 1.0, "x_generator": self._ScatterXUniformly, "y_function": lambda x: -self._SinPlusX(x), @@ -321,7 +323,7 @@ def testMonotonicityOneD(self): "num_terms": 1, "num_training_records": 100, "num_training_epoch": 200, - "optimizer": tf.keras.optimizers.legacy.Adagrad, + "optimizer": keras.optimizers.legacy.Adagrad, "learning_rate": 1.0, "x_generator": self._ScatterXUniformly, "y_function": self._SinPlusLargeX, @@ -344,7 +346,7 @@ def testMonotonicityTwoD(self): "input_dims": 2, "num_training_records": 900, "num_training_epoch": 100, - "optimizer": tf.keras.optimizers.legacy.Adagrad, + "optimizer": keras.optimizers.legacy.Adagrad, "learning_rate": 1.0, "x_generator": self._TwoDMeshGrid, "y_function": self._SinPlusXNd, @@ -363,7 +365,7 @@ def testMonotonicityTwoD(self): "input_dims": 2, "num_training_records": 900, "num_training_epoch": 100, - "optimizer": tf.keras.optimizers.legacy.Adagrad, + "optimizer": keras.optimizers.legacy.Adagrad, "learning_rate": 1.0, "x_generator": self._TwoDMeshGrid, "y_function": self._SinPlusXNd, @@ -382,7 +384,7 @@ def testMonotonicityTwoD(self): "input_dims": 2, "num_training_records": 900, "num_training_epoch": 100, - "optimizer": tf.keras.optimizers.legacy.Adagrad, + "optimizer": keras.optimizers.legacy.Adagrad, "learning_rate": 1.5, "x_generator": self._TwoDMeshGrid, "y_function": self._SinPlusXNd, @@ -401,7 +403,7 @@ def testMonotonicityTwoD(self): "input_dims": 2, "num_training_records": 100, "num_training_epoch": 20, - "optimizer": tf.keras.optimizers.legacy.Adagrad, + "optimizer": keras.optimizers.legacy.Adagrad, "learning_rate": 1.0, "x_generator": self._TwoDMeshGrid, "y_function": lambda x: -self._ScaledSum(x), @@ -420,7 +422,7 @@ def testMonotonicity5d(self): "input_dims": 5, "num_training_records": 100, "num_training_epoch": 200, - "optimizer": tf.keras.optimizers.legacy.Adagrad, + "optimizer": keras.optimizers.legacy.Adagrad, "learning_rate": 1.0, "x_generator": self._ScatterXUniformly, "y_function": self._ScaledSum, @@ -441,7 +443,7 @@ def testMonotonicity5d(self): "input_dims": 5, "num_training_records": 100, "num_training_epoch": 40, - "optimizer": tf.keras.optimizers.legacy.Adagrad, + "optimizer": keras.optimizers.legacy.Adagrad, "learning_rate": 1.0, "x_generator": self._ScatterXUniformly, "y_function": lambda x: -self._ScaledSum(x), @@ -460,7 +462,7 @@ def testMonotonicity5d(self): "input_dims": 4, "num_training_records": 100, "num_training_epoch": 100, - "optimizer": tf.keras.optimizers.legacy.Adagrad, + "optimizer": keras.optimizers.legacy.Adagrad, "learning_rate": 1.0, "x_generator": self._ScatterXUniformly, "y_function": self._SinOfSum, @@ -485,7 +487,7 @@ def testMonotonicityEquivalence(self, monotonicities): "monotonicities": monotonicities, "num_training_records": 100, "num_training_epoch": 50, - "optimizer": tf.keras.optimizers.legacy.Adagrad, + "optimizer": keras.optimizers.legacy.Adagrad, "learning_rate": 10.0, "x_generator": self._SameValueForAllDims, "y_function": self._SinOfSum, @@ -510,7 +512,7 @@ def testMonotonicity10dAlmostMonotone(self): "num_terms": 128, "num_training_records": 1000, "num_training_epoch": 
100, - "optimizer": tf.keras.optimizers.legacy.Adagrad, + "optimizer": keras.optimizers.legacy.Adagrad, "learning_rate": 1.0, "x_generator": self._ScatterXUniformly, "y_function": test_utils.get_hypercube_interpolation_fn(weights), @@ -536,7 +538,7 @@ def testMonotonicity10dSinOfSum(self): "input_dims": 10, "num_training_records": 1000, "num_training_epoch": 100, - "optimizer": tf.keras.optimizers.legacy.Adagrad, + "optimizer": keras.optimizers.legacy.Adagrad, "learning_rate": 1.0, "x_generator": self._ScatterXUniformly, "y_function": self._SinOfSum, @@ -573,7 +575,7 @@ def testInitializerType(self, initializer, expected_loss): "input_dims": 2, "num_training_records": 100, "num_training_epoch": 0, - "optimizer": tf.keras.optimizers.legacy.Adagrad, + "optimizer": keras.optimizers.legacy.Adagrad, "learning_rate": 1.0, "x_generator": self._TwoDMeshGrid, "y_function": self._Max, @@ -593,7 +595,7 @@ def testAssertMonotonicity(self): "input_dims": 2, "num_training_records": 100, "num_training_epoch": 0, - "optimizer": tf.keras.optimizers.legacy.Adagrad, + "optimizer": keras.optimizers.legacy.Adagrad, "learning_rate": 0.15, "x_generator": self._TwoDMeshGrid, "y_function": self._ScaledSum, @@ -615,24 +617,48 @@ def testAssertMonotonicity(self): self._TrainModel(config) @parameterized.parameters( - (-1, 1, - kfll.KFLRandomMonotonicInitializer( - monotonicities=None, init_min=-10, - init_max=10), "scale_initializer"), - (None, 1, - kfll.KFLRandomMonotonicInitializer( - monotonicities=None, init_min=-10, - init_max=10), "scale_initializer"), - (-1, None, - kfll.KFLRandomMonotonicInitializer( - monotonicities=None, init_min=-10, - init_max=10), "scale_initializer"), - (-1, 1, "kfl_random_monotonic_initializer", - tf.keras.initializers.Constant(value=-100)), - (None, 1, "kfl_random_monotonic_initializer", - tf.keras.initializers.Constant(value=100)), - (-1, None, "kfl_random_monotonic_initializer", - tf.keras.initializers.Constant(value=-100)), + ( + -1, + 1, + kfll.KFLRandomMonotonicInitializer( + monotonicities=None, init_min=-10, init_max=10 + ), + "scale_initializer", + ), + ( + None, + 1, + kfll.KFLRandomMonotonicInitializer( + monotonicities=None, init_min=-10, init_max=10 + ), + "scale_initializer", + ), + ( + -1, + None, + kfll.KFLRandomMonotonicInitializer( + monotonicities=None, init_min=-10, init_max=10 + ), + "scale_initializer", + ), + ( + -1, + 1, + "kfl_random_monotonic_initializer", + keras.initializers.Constant(value=-100), + ), + ( + None, + 1, + "kfl_random_monotonic_initializer", + keras.initializers.Constant(value=100), + ), + ( + -1, + None, + "kfl_random_monotonic_initializer", + keras.initializers.Constant(value=-100), + ), ) def testAssertBounds(self, output_min, output_max, kernel_initializer, scale_initializer): @@ -645,7 +671,7 @@ def testAssertBounds(self, output_min, output_max, kernel_initializer, "input_dims": 2, "num_training_records": 100, "num_training_epoch": 0, - "optimizer": tf.keras.optimizers.legacy.Adagrad, + "optimizer": keras.optimizers.legacy.Adagrad, "learning_rate": 0.15, "x_generator": self._TwoDMeshGrid, "y_function": self._ScaledSum, @@ -681,7 +707,7 @@ def testOutputBounds(self, units, input_dims, output_min, output_max, "input_dims": input_dims, "num_training_records": 900, "num_training_epoch": 100, - "optimizer": tf.keras.optimizers.legacy.Adagrad, + "optimizer": keras.optimizers.legacy.Adagrad, "learning_rate": 1.0, "x_generator": self._ScatterXUniformly, "y_function": self._SinPlusX, @@ -789,7 +815,7 @@ def testInputOutOfBounds(self): "input_dims": 
1, "num_training_records": 100, "num_training_epoch": 20, - "optimizer": tf.keras.optimizers.legacy.Adagrad, + "optimizer": keras.optimizers.legacy.Adagrad, "learning_rate": 1.0, "x_generator": self._ScatterXUniformlyExtendedRange, "y_function": self._Sin, @@ -806,7 +832,7 @@ def testInputOutOfBounds(self): "input_dims": 2, "num_training_records": 100, "num_training_epoch": 20, - "optimizer": tf.keras.optimizers.legacy.Adagrad, + "optimizer": keras.optimizers.legacy.Adagrad, "learning_rate": 1.0, "x_generator": self._TwoDMeshGridExtendedRange, "y_function": self._SinOfSum, @@ -831,7 +857,7 @@ def testHighDimensionsStressTest(self): "monotonicities": monotonicities, "num_training_records": 100, "num_training_epoch": 3, - "optimizer": tf.keras.optimizers.legacy.Adagrad, + "optimizer": keras.optimizers.legacy.Adagrad, "learning_rate": 1.0, "x_generator": self._ScatterXUniformly, "y_function": self._SinOfSum, @@ -868,8 +894,9 @@ def testGraphSize(self, lattice_sizes, input_dims, num_terms, self.assertLessEqual(graph_size, expected_graph_size) @parameterized.parameters( - ("random_uniform", tf.keras.initializers.RandomUniform), - ("kfl_random_monotonic_initializer", kfll.KFLRandomMonotonicInitializer)) + ("random_uniform", keras.initializers.RandomUniform), + ("kfl_random_monotonic_initializer", kfll.KFLRandomMonotonicInitializer), + ) def testCreateKernelInitializer(self, kernel_initializer_id, expected_type): self.assertEqual( expected_type, diff --git a/tensorflow_lattice/python/lattice_layer.py b/tensorflow_lattice/python/lattice_layer.py index 3bb914d..c9ba4dd 100644 --- a/tensorflow_lattice/python/lattice_layer.py +++ b/tensorflow_lattice/python/lattice_layer.py @@ -22,11 +22,17 @@ from __future__ import division from __future__ import print_function -from . import lattice_lib -from . import utils import six import tensorflow as tf -from tensorflow import keras +# pylint: disable=g-import-not-at-top +# Use Keras 2. +version_fn = getattr(tf.keras, "version", None) +if version_fn and version_fn().startswith("3."): + import tf_keras as keras +else: + keras = tf.keras +from . import lattice_lib +from . import utils LATTICE_KERNEL_NAME = "lattice_kernel" LATTICE_SIZES_NAME = "lattice_sizes" @@ -270,7 +276,7 @@ def __init__(self, either be single floats or lists of floats to specify different regularization amount for every dimension. - Any Keras regularizer object. - **kwargs: Other args passed to `tf.keras.layers.Layer` initializer. + **kwargs: Other args passed to `keras.layers.Layer` initializer. Raises: ValueError: If layer hyperparameters are invalid. diff --git a/tensorflow_lattice/python/lattice_test.py b/tensorflow_lattice/python/lattice_test.py index 387e98f..1e4a693 100644 --- a/tensorflow_lattice/python/lattice_test.py +++ b/tensorflow_lattice/python/lattice_test.py @@ -22,9 +22,15 @@ from absl.testing import parameterized import numpy as np import tensorflow as tf -from tensorflow import keras from tensorflow_lattice.python import lattice_layer as ll from tensorflow_lattice.python import test_utils +# pylint: disable=g-import-not-at-top +# Use Keras 2. 
+version_fn = getattr(tf.keras, "version", None) +if version_fn and version_fn().startswith("3."): + import tf_keras as keras +else: + keras = tf.keras class LatticeTest(parameterized.TestCase, tf.test.TestCase): @@ -35,7 +41,7 @@ def setUp(self): self.disable_ensembles = False self.loss_eps = 0.0001 self.small_eps = 1e-6 - tf.keras.utils.set_random_seed(42) + keras.utils.set_random_seed(42) def _ResetAllBackends(self): keras.backend.clear_session() @@ -174,10 +180,8 @@ def _GetTrainingInputsAndLabels(self, config): config: Dictionary with config for this unit test. Returns: - Tuple `(training_inputs, training_labels, raw_training_inputs)` where - `training_inputs` and `training_labels` are data for training and - `raw_training_inputs` are representation of training_inputs for - visualisation. + Tuple `(training_inputs, training_labels)` where + `training_inputs` and `training_labels` are data for training. """ raw_training_inputs = config["x_generator"]( num_points=config["num_training_records"], @@ -191,7 +195,7 @@ def _GetTrainingInputsAndLabels(self, config): training_inputs = raw_training_inputs training_labels = [config["y_function"](x) for x in training_inputs] - return training_inputs, training_labels, raw_training_inputs + return training_inputs, training_labels def _SetDefaults(self, config): config.setdefault("monotonicities", None) @@ -231,13 +235,13 @@ def _TestEnsemble(self, config): losses.append(self._TrainModel(config)) self.assertAlmostEqual(min(losses), max(losses), delta=self.loss_eps) - def _TrainModel(self, config, plot_path=None): + def _TrainModel(self, config): logging.info("Testing config:") logging.info(config) config = self._SetDefaults(config) self._ResetAllBackends() - training_inputs, training_labels, raw_training_inputs = ( + training_inputs, training_labels = ( self._GetTrainingInputsAndLabels(config)) units = config["units"] @@ -287,12 +291,10 @@ def _TrainModel(self, config, plot_path=None): optimizer = config["optimizer"](learning_rate=config["learning_rate"]) model.compile(loss=keras.losses.mean_squared_error, optimizer=optimizer) - training_data = (training_inputs, training_labels, raw_training_inputs) + training_data = (training_inputs, training_labels) loss = test_utils.run_training_loop( - config=config, - training_data=training_data, - keras_model=model, - plot_path=plot_path) + config=config, training_data=training_data, keras_model=model + ) if tf.executing_eagerly(): tf.print("final weights: ", keras_layer.kernel) @@ -310,7 +312,7 @@ def testMonotonicityOneD(self): "lattice_sizes": [20], "num_training_records": 128, "num_training_epoch": 50, - "optimizer": tf.keras.optimizers.legacy.Adagrad, + "optimizer": keras.optimizers.legacy.Adagrad, "learning_rate": 1.0, "x_generator": self._ScatterXUniformly, "y_function": self._SinPlusX, @@ -326,7 +328,7 @@ def testMonotonicityOneD(self): "lattice_sizes": [20], "num_training_records": 100, "num_training_epoch": 50, - "optimizer": tf.keras.optimizers.legacy.Adagrad, + "optimizer": keras.optimizers.legacy.Adagrad, "learning_rate": 1.0, "x_generator": self._ScatterXUniformly, "y_function": lambda x: -self._SinPlusX(x), @@ -342,7 +344,7 @@ def testMonotonicityOneD(self): "lattice_sizes": [5], "num_training_records": 100, "num_training_epoch": 50, - "optimizer": tf.keras.optimizers.legacy.Adagrad, + "optimizer": keras.optimizers.legacy.Adagrad, "learning_rate": 1.0, "x_generator": self._ScatterXUniformly, "y_function": self._SinPlusLargeX, @@ -362,7 +364,7 @@ def testMonotonicityTwoD(self): "lattice_sizes": 
[21, 6], "num_training_records": 900, "num_training_epoch": 100, - "optimizer": tf.keras.optimizers.legacy.Adagrad, + "optimizer": keras.optimizers.legacy.Adagrad, "learning_rate": 1.0, "x_generator": self._TwoDMeshGrid, "y_function": self._SinPlusXNd, @@ -378,7 +380,7 @@ def testMonotonicityTwoD(self): "lattice_sizes": [6, 21], "num_training_records": 900, "num_training_epoch": 100, - "optimizer": tf.keras.optimizers.legacy.Adagrad, + "optimizer": keras.optimizers.legacy.Adagrad, "learning_rate": 1.0, "x_generator": self._TwoDMeshGrid, "y_function": self._SinPlusXNd, @@ -394,7 +396,7 @@ def testMonotonicityTwoD(self): "lattice_sizes": [6, 21], "num_training_records": 900, "num_training_epoch": 100, - "optimizer": tf.keras.optimizers.legacy.Adagrad, + "optimizer": keras.optimizers.legacy.Adagrad, "learning_rate": 1.0, "x_generator": self._TwoDMeshGrid, "y_function": self._SinPlusXNd, @@ -410,7 +412,7 @@ def testMonotonicityTwoD(self): "lattice_sizes": [6, 21], "num_training_records": 900, "num_training_epoch": 100, - "optimizer": tf.keras.optimizers.legacy.Adagrad, + "optimizer": keras.optimizers.legacy.Adagrad, "learning_rate": 1.5, "x_generator": self._TwoDMeshGrid, "y_function": self._SinPlusXNd, @@ -426,7 +428,7 @@ def testMonotonicityTwoD(self): "lattice_sizes": [2, 2], "num_training_records": 100, "num_training_epoch": 20, - "optimizer": tf.keras.optimizers.legacy.Adagrad, + "optimizer": keras.optimizers.legacy.Adagrad, "learning_rate": 1.0, "x_generator": self._TwoDMeshGrid, "y_function": lambda x: -self._ScaledSum(x), @@ -443,7 +445,7 @@ def testMonotonicity5d(self): "lattice_sizes": [2, 2, 2, 2, 2], "num_training_records": 100, "num_training_epoch": 200, - "optimizer": tf.keras.optimizers.legacy.Adagrad, + "optimizer": keras.optimizers.legacy.Adagrad, "learning_rate": 1.0, "x_generator": self._ScatterXUniformly, "y_function": self._ScaledSum, @@ -460,7 +462,7 @@ def testMonotonicity5d(self): "lattice_sizes": [2, 2, 2, 2, 2], "num_training_records": 100, "num_training_epoch": 40, - "optimizer": tf.keras.optimizers.legacy.Adagrad, + "optimizer": keras.optimizers.legacy.Adagrad, "learning_rate": 1.0, "x_generator": self._ScatterXUniformly, "y_function": lambda x: -self._ScaledSum(x), @@ -474,7 +476,7 @@ def testMonotonicity5d(self): "lattice_sizes": [3, 3, 3, 3], "num_training_records": 100, "num_training_epoch": 100, - "optimizer": tf.keras.optimizers.legacy.Adagrad, + "optimizer": keras.optimizers.legacy.Adagrad, "learning_rate": 1.0, "x_generator": self._ScatterXUniformly, "y_function": self._SinOfSum, @@ -497,7 +499,7 @@ def testMonotonicityEquivalence(self, monotonicities): "monotonicities": monotonicities, "num_training_records": 100, "num_training_epoch": 50, - "optimizer": tf.keras.optimizers.legacy.Adagrad, + "optimizer": keras.optimizers.legacy.Adagrad, "learning_rate": 10.0, "x_generator": self._SameValueForAllDims, "y_function": self._SinOfSum, @@ -521,7 +523,7 @@ def testMonotonicity10dAlmostMonotone(self): "lattice_sizes": [2] * 10, "num_training_records": 1000, "num_training_epoch": 100, - "optimizer": tf.keras.optimizers.legacy.Adagrad, + "optimizer": keras.optimizers.legacy.Adagrad, "learning_rate": 100.0, "x_generator": self._ScatterXUniformly, "y_function": test_utils.get_hypercube_interpolation_fn(weights), @@ -543,7 +545,7 @@ def testMonotonicity10dSinOfSum(self): "lattice_sizes": [2] * 10, "num_training_records": 1000, "num_training_epoch": 100, - "optimizer": tf.keras.optimizers.legacy.Adagrad, + "optimizer": keras.optimizers.legacy.Adagrad, "learning_rate": 
100.0, "x_generator": self._ScatterXUniformly, "y_function": self._SinOfSum, @@ -576,7 +578,7 @@ def testSimpleTrustTwoD(self, edgeworth_trusts, trapezoid_trusts, "lattice_sizes": [2, 2], "num_training_records": 100, "num_training_epoch": 20, - "optimizer": tf.keras.optimizers.legacy.Adagrad, + "optimizer": keras.optimizers.legacy.Adagrad, "learning_rate": 1.0, "x_generator": self._TwoDMeshGrid, "y_function": self._Max, @@ -605,7 +607,7 @@ def testDenseTrustTwoD(self, edgeworth_trusts, trapezoid_trusts, "lattice_sizes": [4, 3], "num_training_records": 150, "num_training_epoch": 20, - "optimizer": tf.keras.optimizers.legacy.Adagrad, + "optimizer": keras.optimizers.legacy.Adagrad, "learning_rate": 10.0, "x_generator": self._TwoDMeshGrid, "y_function": self._PseudoLinear, @@ -635,7 +637,7 @@ def testSimpleTrust4D(self, edgeworth_trusts, trapezoid_trusts, "lattice_sizes": [2, 2, 2, 2], "num_training_records": 100, "num_training_epoch": 20, - "optimizer": tf.keras.optimizers.legacy.Adagrad, + "optimizer": keras.optimizers.legacy.Adagrad, "learning_rate": 1.0, "x_generator": self._ScatterXUniformly, "y_function": self._Max, @@ -664,7 +666,7 @@ def testMultiDenseTrust4D(self, edgeworth_trusts, trapezoid_trusts, "lattice_sizes": [3, 3, 3, 3], "num_training_records": 1000, "num_training_epoch": 20, - "optimizer": tf.keras.optimizers.legacy.Adagrad, + "optimizer": keras.optimizers.legacy.Adagrad, "learning_rate": 1.0, "x_generator": self._ScatterXUniformly, "y_function": self._SinOfSum, @@ -695,7 +697,7 @@ def testEdgeworthTrustEquivalence(self, edgeworth_trusts): "edgeworth_trusts": edgeworth_trusts, "num_training_records": 100, "num_training_epoch": 50, - "optimizer": tf.keras.optimizers.legacy.Adagrad, + "optimizer": keras.optimizers.legacy.Adagrad, "learning_rate": 10.0, "x_generator": self._SameValueForAllDims, "y_function": self._PseudoLinear, @@ -719,7 +721,7 @@ def testSimpleMonotonicDominance2D(self, monotonic_dominances, expected_loss): "lattice_sizes": [2, 2], "num_training_records": 100, "num_training_epoch": 20, - "optimizer": tf.keras.optimizers.legacy.Adagrad, + "optimizer": keras.optimizers.legacy.Adagrad, "learning_rate": 1.0, "x_generator": self._TwoDMeshGrid, "y_function": self._WeightedSum, @@ -747,7 +749,7 @@ def testDenseMonotonicDominance2D(self, monotonic_dominances, expected_loss): "num_training_records": 100, "num_training_epoch": 20, "num_projection_iterations": 20, - "optimizer": tf.keras.optimizers.legacy.Adagrad, + "optimizer": keras.optimizers.legacy.Adagrad, "learning_rate": 1.0, "x_generator": self._TwoDMeshGrid, "y_function": self._WeightedSum, @@ -774,7 +776,7 @@ def testDenseMonotonicDominance5D(self, monotonic_dominances, expected_loss): "num_training_records": 100, "num_training_epoch": 300, "num_projection_iterations": 20, - "optimizer": tf.keras.optimizers.legacy.Adagrad, + "optimizer": keras.optimizers.legacy.Adagrad, "learning_rate": 1.0, "x_generator": self._ScatterXUniformly, "y_function": self._WeightedSum, @@ -801,7 +803,7 @@ def testSimpleRangeDominance2D(self, range_dominances, expected_loss): "lattice_sizes": [2, 2], "num_training_records": 100, "num_training_epoch": 20, - "optimizer": tf.keras.optimizers.legacy.Adagrad, + "optimizer": keras.optimizers.legacy.Adagrad, "learning_rate": 0.1, "x_generator": self._TwoDMeshGrid, "y_function": self._WeightedSum, @@ -829,7 +831,7 @@ def testDenseRangeDominance2D(self, range_dominances, expected_loss, expid): "num_training_records": 100, "num_training_epoch": 40, "num_projection_iterations": 20, - 
"optimizer": tf.keras.optimizers.legacy.Adagrad, + "optimizer": keras.optimizers.legacy.Adagrad, "learning_rate": 0.1, "x_generator": self._TwoDMeshGrid, "y_function": self._WeightedSum, @@ -856,7 +858,7 @@ def testDenseRangeDominance5D(self, range_dominances, expected_loss): "num_training_records": 100, "num_training_epoch": 300, "num_projection_iterations": 40, - "optimizer": tf.keras.optimizers.legacy.Adagrad, + "optimizer": keras.optimizers.legacy.Adagrad, "learning_rate": 1.0, "x_generator": self._ScatterXUniformly, "y_function": self._WeightedSum, @@ -883,7 +885,7 @@ def testSimpleJointMonotonicity2D(self, joint_monotonicities, expected_loss): "lattice_sizes": [2, 2], "num_training_records": 100, "num_training_epoch": 20, - "optimizer": tf.keras.optimizers.legacy.Adagrad, + "optimizer": keras.optimizers.legacy.Adagrad, "learning_rate": 1.0, "x_generator": self._TwoDMeshGrid, "y_function": self._MixedSignWeightedSum, @@ -918,7 +920,7 @@ def _Sin(x): "lattice_sizes": [15], "num_training_records": 100, "num_training_epoch": 20, - "optimizer": tf.keras.optimizers.legacy.Adagrad, + "optimizer": keras.optimizers.legacy.Adagrad, "learning_rate": 1.0, "x_generator": self._ScatterXUniformly, "y_function": _Sin, @@ -940,7 +942,7 @@ def testJointUnimodality2DSinOfSum(self): "lattice_sizes": [3, 3], "num_training_records": 36*9, "num_training_epoch": 150, - "optimizer": tf.keras.optimizers.legacy.Adagrad, + "optimizer": keras.optimizers.legacy.Adagrad, "learning_rate": 0.1, "x_generator": self._TwoDMeshGrid, "y_function": lambda x: -math.sin(sum(x) * 2.0), @@ -978,7 +980,7 @@ def WShaped2dFunction(x): "lattice_sizes": [coordinate * 2 + 1 for coordinate in center], "num_training_records": 36 * 9, "num_training_epoch": 18, - "optimizer": tf.keras.optimizers.legacy.Adagrad, + "optimizer": keras.optimizers.legacy.Adagrad, "learning_rate": 10.0, "x_generator": self._TwoDMeshGrid, "y_function": WShaped2dFunction, @@ -1043,7 +1045,7 @@ def _DistributeXUniformly(num_points, lattice_sizes): "lattice_sizes": lattice_sizes, "num_training_records": 1, # Not used by x_generator for this test. 
"num_training_epoch": 10, - "optimizer": tf.keras.optimizers.legacy.Adagrad, + "optimizer": keras.optimizers.legacy.Adagrad, "learning_rate": 10.0, "x_generator": _DistributeXUniformly, "y_function": WShaped2dFunction, @@ -1064,7 +1066,7 @@ def testJointUnimodality3D(self): "lattice_sizes": [3, 3, 3, 3], "num_training_records": 100, "num_training_epoch": 30, - "optimizer": tf.keras.optimizers.legacy.Adagrad, + "optimizer": keras.optimizers.legacy.Adagrad, "learning_rate": 10.0, "x_generator": self._ScatterXUniformly, "y_function": self._SinOfSum, @@ -1091,7 +1093,7 @@ def testDenseJointMonotonicity2D(self, joint_monotonicities, expected_loss): "num_training_records": 100, "num_training_epoch": 40, "num_projection_iterations": 40, - "optimizer": tf.keras.optimizers.legacy.Adagrad, + "optimizer": keras.optimizers.legacy.Adagrad, "learning_rate": 1.0, "x_generator": self._TwoDMeshGrid, "y_function": self._MixedSignWeightedSum, @@ -1116,7 +1118,7 @@ def testDenseJointMonotonicity5D(self, joint_monotonicities, expected_loss): "num_training_records": 100, "num_training_epoch": 100, "num_projection_iterations": 40, - "optimizer": tf.keras.optimizers.legacy.Adagrad, + "optimizer": keras.optimizers.legacy.Adagrad, "learning_rate": 1.0, "x_generator": self._ScatterXUniformly, "y_function": self._MixedSignWeightedSum, @@ -1146,7 +1148,7 @@ def testInitializerType(self, initializer, expected_loss): "lattice_sizes": [2, 3], "num_training_records": 98, "num_training_epoch": 0, - "optimizer": tf.keras.optimizers.legacy.Adagrad, + "optimizer": keras.optimizers.legacy.Adagrad, "learning_rate": 1.0, "x_generator": self._TwoDMeshGrid, "y_function": self._Max, @@ -1171,7 +1173,7 @@ def testLinearMonotonicInitializer(self): config = { "num_training_records": 96, "num_training_epoch": 0, - "optimizer": tf.keras.optimizers.legacy.Adagrad, + "optimizer": keras.optimizers.legacy.Adagrad, "learning_rate": 1.0, "x_generator": self._TwoDMeshGrid, } # pyformat: disable @@ -1252,7 +1254,7 @@ def testUnimodalInitializer(self): "kernel_initializer": "linear_initializer", "num_training_records": 96, "num_training_epoch": 0, - "optimizer": tf.keras.optimizers.legacy.Adagrad, + "optimizer": keras.optimizers.legacy.Adagrad, "learning_rate": 1.0, "x_generator": self._TwoDMeshGrid, "y_function": self._Max, @@ -1330,7 +1332,7 @@ def testAssertMonotonicity(self): "lattice_sizes": [2, 2], "num_training_records": 100, "num_training_epoch": 0, - "optimizer": tf.keras.optimizers.legacy.Adagrad, + "optimizer": keras.optimizers.legacy.Adagrad, "learning_rate": 0.15, "x_generator": self._TwoDMeshGrid, "y_function": self._ScaledSum, @@ -1355,7 +1357,7 @@ def testBounds(self): "lattice_sizes": [20], "num_training_records": 100, "num_training_epoch": 40, - "optimizer": tf.keras.optimizers.legacy.Adagrad, + "optimizer": keras.optimizers.legacy.Adagrad, "learning_rate": 1.0, "x_generator": self._ScatterXUniformly, "y_function": self._Sin, @@ -1370,7 +1372,7 @@ def testBounds(self): "lattice_sizes": [11, 4], "num_training_records": 270, "num_training_epoch": 40, - "optimizer": tf.keras.optimizers.legacy.Adagrad, + "optimizer": keras.optimizers.legacy.Adagrad, "learning_rate": 1.0, "x_generator": self._TwoDMeshGrid, "y_function": self._SinPlusXNd, @@ -1386,7 +1388,7 @@ def testBounds(self): "lattice_sizes": [2] * 5, "num_training_records": 100, "num_training_epoch": 40, - "optimizer": tf.keras.optimizers.legacy.Adagrad, + "optimizer": keras.optimizers.legacy.Adagrad, "learning_rate": 1.0, "x_generator": self._ScatterXUniformly, "y_function": 
self._SinOfSum, @@ -1405,7 +1407,7 @@ def testInputOutOfBounds(self): "lattice_sizes": [6], "num_training_records": 100, "num_training_epoch": 20, - "optimizer": tf.keras.optimizers.legacy.Adagrad, + "optimizer": keras.optimizers.legacy.Adagrad, "learning_rate": 1.0, "x_generator": self._ScatterXUniformlyExtendedRange, "y_function": self._Sin, @@ -1419,7 +1421,7 @@ def testInputOutOfBounds(self): "lattice_sizes": [2, 2], "num_training_records": 100, "num_training_epoch": 20, - "optimizer": tf.keras.optimizers.legacy.Adagrad, + "optimizer": keras.optimizers.legacy.Adagrad, "learning_rate": 1.0, "x_generator": self._TwoDMeshGridExtendedRange, "y_function": self._SinOfSum, @@ -1450,7 +1452,7 @@ def testRegularizers2d(self, regularizer, pure_reg_loss, training_loss): "lattice_sizes": [2, 2], "num_training_records": 100, "num_training_epoch": 0, - "optimizer": tf.keras.optimizers.legacy.Adagrad, + "optimizer": keras.optimizers.legacy.Adagrad, "learning_rate": 1.0, "x_generator": self._TwoDMeshGrid, "y_function": test_utils.get_hypercube_interpolation_fn( @@ -1484,7 +1486,7 @@ def testRegularizersLargeLattice(self, regularizer, expected_loss): "lattice_sizes": [3, 4, 3, 4], "num_training_records": 100, "num_training_epoch": 100, - "optimizer": tf.keras.optimizers.legacy.Adagrad, + "optimizer": keras.optimizers.legacy.Adagrad, "learning_rate": 1.0, "x_generator": self._ScatterXUniformly, "y_function": self._SinOfSum, @@ -1511,7 +1513,7 @@ def testHighDimensionsStressTest(self): "output_max": 1.0, "num_training_records": 100, "num_training_epoch": 3, - "optimizer": tf.keras.optimizers.legacy.Adagrad, + "optimizer": keras.optimizers.legacy.Adagrad, "learning_rate": 1000.0, "x_generator": self._ScatterXUniformly, "y_function": self._SinOfSum, @@ -1551,7 +1553,7 @@ def WShaped1dFunction(x): "lattice_sizes": [11], "num_training_records": 128, "num_training_epoch": 200, - "optimizer": tf.keras.optimizers.legacy.Adagrad, + "optimizer": keras.optimizers.legacy.Adagrad, "learning_rate": 0.15, "x_generator": self._ScatterXUniformly, "y_function": WShaped1dFunction, @@ -1593,7 +1595,7 @@ def WShaped2dFunction(x): "lattice_sizes": [11, 11], "num_training_records": 900, "num_training_epoch": 50, - "optimizer": tf.keras.optimizers.legacy.Adagrad, + "optimizer": keras.optimizers.legacy.Adagrad, "learning_rate": 1.5, "x_generator": self._TwoDMeshGrid, "y_function": WShaped2dFunction, @@ -1614,7 +1616,7 @@ def testUnconstrained(self): "lattice_sizes": [20], "num_training_records": 100, "num_training_epoch": 200, - "optimizer": tf.keras.optimizers.legacy.Adagrad, + "optimizer": keras.optimizers.legacy.Adagrad, "learning_rate": 0.15, "x_generator": self._ScatterXUniformly, "y_function": self._Sin, @@ -1628,7 +1630,7 @@ def testUnconstrained(self): "lattice_sizes": [2], "num_training_records": 100, "num_training_epoch": 50, - "optimizer": tf.keras.optimizers.legacy.Adagrad, + "optimizer": keras.optimizers.legacy.Adagrad, "learning_rate": 0.15, "x_generator": self._ScatterXUniformly, "y_function": self._Square, @@ -1641,7 +1643,7 @@ def testUnconstrained(self): "lattice_sizes": [2, 2], "num_training_records": 100, "num_training_epoch": 200, - "optimizer": tf.keras.optimizers.legacy.Adagrad, + "optimizer": keras.optimizers.legacy.Adagrad, "learning_rate": 0.15, "x_generator": self._ScatterXUniformly, "y_function": test_utils.get_hypercube_interpolation_fn( @@ -1655,7 +1657,7 @@ def testUnconstrained(self): "lattice_sizes": [2] * 3, "num_training_records": 100, "num_training_epoch": 200, - "optimizer": 
tf.keras.optimizers.legacy.Adagrad, + "optimizer": keras.optimizers.legacy.Adagrad, "learning_rate": 0.15, "x_generator": self._ScatterXUniformly, "y_function": test_utils.get_hypercube_interpolation_fn( @@ -1669,7 +1671,7 @@ def testUnconstrained(self): "lattice_sizes": [2] * 5, "num_training_records": 100, "num_training_epoch": 100, - "optimizer": tf.keras.optimizers.legacy.Adagrad, + "optimizer": keras.optimizers.legacy.Adagrad, "learning_rate": 1.5, "x_generator": self._ScatterXUniformly, "y_function": test_utils.get_hypercube_interpolation_fn( @@ -1683,7 +1685,7 @@ def testUnconstrained(self): "lattice_sizes": [2, 2], "num_training_records": 100, "num_training_epoch": 100, - "optimizer": tf.keras.optimizers.legacy.Adagrad, + "optimizer": keras.optimizers.legacy.Adagrad, "learning_rate": 0.15, "x_generator": self._ScatterXUniformly, "y_function": self._Max, @@ -1696,7 +1698,7 @@ def testUnconstrained(self): "lattice_sizes": [2] * 6, "num_training_records": 100, "num_training_epoch": 300, - "optimizer": tf.keras.optimizers.legacy.Adagrad, + "optimizer": keras.optimizers.legacy.Adagrad, "learning_rate": 30.0, "x_generator": self._ScatterXUniformly, "y_function": self._PseudoLinear, @@ -1709,7 +1711,7 @@ def testUnconstrained(self): "lattice_sizes": [2, 3, 4], "num_training_records": 100, "num_training_epoch": 200, - "optimizer": tf.keras.optimizers.legacy.Adagrad, + "optimizer": keras.optimizers.legacy.Adagrad, "learning_rate": 10.0, "x_generator": self._ScatterXUniformly, "y_function": self._PseudoLinear, @@ -1722,7 +1724,7 @@ def testUnconstrained(self): "lattice_sizes": [4, 5], "num_training_records": 100, "num_training_epoch": 100, - "optimizer": tf.keras.optimizers.legacy.Adagrad, + "optimizer": keras.optimizers.legacy.Adagrad, "learning_rate": 10.0, "x_generator": self._ScatterXUniformly, "y_function": self._WeightedSum, @@ -1735,7 +1737,7 @@ def testUnconstrained(self): "lattice_sizes": [2, 3, 4, 5], "num_training_records": 100, "num_training_epoch": 200, - "optimizer": tf.keras.optimizers.legacy.Adagrad, + "optimizer": keras.optimizers.legacy.Adagrad, "learning_rate": 30.0, "x_generator": self._ScatterXUniformly, "y_function": self._Max, @@ -1748,7 +1750,7 @@ def testUnconstrained(self): "lattice_sizes": [2, 3, 4, 5], "num_training_records": 100, "num_training_epoch": 200, - "optimizer": tf.keras.optimizers.legacy.Adagrad, + "optimizer": keras.optimizers.legacy.Adagrad, "learning_rate": 30.0, "x_generator": self._ScatterXUniformly, "y_function": self._WeightedSum, @@ -1762,7 +1764,7 @@ def testUnconstrained(self): "interpolation": "simplex", "num_training_records": 100, "num_training_epoch": 200, - "optimizer": tf.keras.optimizers.legacy.Adagrad, + "optimizer": keras.optimizers.legacy.Adagrad, "learning_rate": 0.15, "x_generator": self._ScatterXUniformly, "y_function": self._Sin, @@ -1777,7 +1779,7 @@ def testUnconstrained(self): "interpolation": "simplex", "num_training_records": 100, "num_training_epoch": 50, - "optimizer": tf.keras.optimizers.legacy.Adagrad, + "optimizer": keras.optimizers.legacy.Adagrad, "learning_rate": 0.15, "x_generator": self._ScatterXUniformly, "y_function": self._Square, @@ -1791,7 +1793,7 @@ def testUnconstrained(self): "interpolation": "simplex", "num_training_records": 100, "num_training_epoch": 100, - "optimizer": tf.keras.optimizers.legacy.Adagrad, + "optimizer": keras.optimizers.legacy.Adagrad, "learning_rate": 0.15, "x_generator": self._ScatterXUniformly, "y_function": self._Max, @@ -1805,7 +1807,7 @@ def testUnconstrained(self): "interpolation": 
"simplex", "num_training_records": 100, "num_training_epoch": 300, - "optimizer": tf.keras.optimizers.legacy.Adagrad, + "optimizer": keras.optimizers.legacy.Adagrad, "learning_rate": 30.0, "x_generator": self._ScatterXUniformly, "y_function": self._PseudoLinear, @@ -1819,7 +1821,7 @@ def testUnconstrained(self): "interpolation": "simplex", "num_training_records": 100, "num_training_epoch": 200, - "optimizer": tf.keras.optimizers.legacy.Adagrad, + "optimizer": keras.optimizers.legacy.Adagrad, "learning_rate": 10.0, "x_generator": self._ScatterXUniformly, "y_function": self._PseudoLinear, @@ -1833,7 +1835,7 @@ def testUnconstrained(self): "interpolation": "simplex", "num_training_records": 100, "num_training_epoch": 100, - "optimizer": tf.keras.optimizers.legacy.Adagrad, + "optimizer": keras.optimizers.legacy.Adagrad, "learning_rate": 10.0, "x_generator": self._ScatterXUniformly, "y_function": self._WeightedSum, @@ -1847,7 +1849,7 @@ def testUnconstrained(self): "interpolation": "simplex", "num_training_records": 100, "num_training_epoch": 200, - "optimizer": tf.keras.optimizers.legacy.Adagrad, + "optimizer": keras.optimizers.legacy.Adagrad, "learning_rate": 30.0, "x_generator": self._ScatterXUniformly, "y_function": self._Max, @@ -1861,7 +1863,7 @@ def testUnconstrained(self): "interpolation": "simplex", "num_training_records": 100, "num_training_epoch": 200, - "optimizer": tf.keras.optimizers.legacy.Adagrad, + "optimizer": keras.optimizers.legacy.Adagrad, "learning_rate": 30.0, "x_generator": self._ScatterXUniformly, "y_function": self._WeightedSum, @@ -1887,7 +1889,7 @@ def testEqaulySizedDimsOptimization(self, lattice_sizes, expected_loss): "lattice_sizes": lattice_sizes, "num_training_records": 100, "num_training_epoch": 1, - "optimizer": tf.keras.optimizers.legacy.Adagrad, + "optimizer": keras.optimizers.legacy.Adagrad, "learning_rate": 10.0, "x_generator": self._ScatterXUniformly, "y_function": self._WeightedSum, @@ -1920,24 +1922,49 @@ def testGraphSize(self, lattice_sizes, expected_graph_size): self.assertLessEqual(graph_size, expected_graph_size) @parameterized.parameters( - ("random_uniform_or_linear_initializer", [3, 3, 3], [ - ([0, 1, 2], "peak") - ], tf.keras.initializers.RandomUniform), - ("random_uniform_or_linear_initializer", [3, 3, 3], [ - ([0, 1, 2], "valley") - ], tf.keras.initializers.RandomUniform), - ("random_uniform_or_linear_initializer", [3, 3, 3], [ - ([0, 1], "valley") - ], ll.LinearInitializer), - ("random_uniform_or_linear_initializer", [3, 3, 3], [ - ([0, 1], "valley"), ([2], "peak") - ], ll.LinearInitializer), ("random_uniform_or_linear_initializer", - [3, 3, 3], None, ll.LinearInitializer), - ("linear_initializer", [3, 3, 3], [ - ([0, 1], "valley") - ], ll.LinearInitializer), ("random_monotonic_initializer", [3, 3, 3], [ - ([0, 1], "valley") - ], ll.RandomMonotonicInitializer)) + ( + "random_uniform_or_linear_initializer", + [3, 3, 3], + [([0, 1, 2], "peak")], + keras.initializers.RandomUniform, + ), + ( + "random_uniform_or_linear_initializer", + [3, 3, 3], + [([0, 1, 2], "valley")], + keras.initializers.RandomUniform, + ), + ( + "random_uniform_or_linear_initializer", + [3, 3, 3], + [([0, 1], "valley")], + ll.LinearInitializer, + ), + ( + "random_uniform_or_linear_initializer", + [3, 3, 3], + [([0, 1], "valley"), ([2], "peak")], + ll.LinearInitializer, + ), + ( + "random_uniform_or_linear_initializer", + [3, 3, 3], + None, + ll.LinearInitializer, + ), + ( + "linear_initializer", + [3, 3, 3], + [([0, 1], "valley")], + ll.LinearInitializer, + ), + ( + 
"random_monotonic_initializer", + [3, 3, 3], + [([0, 1], "valley")], + ll.RandomMonotonicInitializer, + ), + ) def testCreateKernelInitializer(self, kernel_initializer_id, lattice_sizes, joint_unimodalities, expected_type): self.assertEqual( @@ -2038,12 +2065,12 @@ def testSimplexInterpolation(self, lattice_sizes, kernel, inputs, kernel = tf.constant(kernel, dtype=tf.float32) inputs = tf.constant(inputs, dtype=tf.float32) units = int(kernel.shape[1]) - model = tf.keras.models.Sequential([ + model = keras.models.Sequential([ ll.Lattice( lattice_sizes, units=units, interpolation="simplex", - kernel_initializer=tf.keras.initializers.Constant(kernel), + kernel_initializer=keras.initializers.Constant(kernel), ), ]) outputs = model.predict(inputs) @@ -2103,7 +2130,7 @@ def testFinalizeConstraints(self, lattice_sizes, kernel, edgeworth_trusts, trapezoid_trusts=trapezoid_trusts, output_min=output_min, output_max=output_max, - kernel_initializer=tf.keras.initializers.Constant(kernel), + kernel_initializer=keras.initializers.Constant(kernel), ) layer.build(input_shape=(None, units, len(lattice_sizes))) output = layer.finalize_constraints() diff --git a/tensorflow_lattice/python/linear_layer.py b/tensorflow_lattice/python/linear_layer.py index aaac196..0a0fcaa 100644 --- a/tensorflow_lattice/python/linear_layer.py +++ b/tensorflow_lattice/python/linear_layer.py @@ -22,11 +22,18 @@ from __future__ import division from __future__ import print_function -from . import linear_lib -from . import utils import numpy as np import tensorflow as tf -from tensorflow import keras +# pylint: disable=g-import-not-at-top +# Use Keras 2. +version_fn = getattr(tf.keras, "version", None) +if version_fn and version_fn().startswith("3."): + import tf_keras as keras +else: + keras = tf.keras + +from . import linear_lib +from . import utils LINEAR_LAYER_KERNEL_NAME = "linear_layer_kernel" LINEAR_LAYER_BIAS_NAME = "linear_layer_bias" @@ -107,11 +114,10 @@ def __init__(self, model output should be monotonic in corresponding feature, using 'increasing' or 1 to indicate increasing monotonicity, 'decreasing' or -1 to indicate decreasing monotonicity and 'none' or 0 to indicate no - monotonicity constraints.. - In case of decreasing monotonicity corresponding weight will be - constrained to be non positive, in case of increasing non-negative. - Instead of a list or tuple single value can be specified to indicate the - monotonicity constraint across all dimensions. + monotonicity constraints. In case of decreasing monotonicity + corresponding weight will be constrained to be non positive, in case of + increasing non-negative. Instead of a list or tuple single value can be + specified to indicate the monotonicity constraint across all dimensions. monotonic_dominances: None or list of two-element tuples. First element is the index of the dominant dimension. Second element is the index of the weak dimension. @@ -128,7 +134,7 @@ def __init__(self, use_bias: Whether linear function has bias. normalization_order: If specified learned weights will be adjusted to have norm 1. Norm will be computed by: `tf.norm(tensor, - ord=normalization_order)`. + ord=normalization_order)`. kernel_initializer: Any keras initializer to be applied to kernel. bias_initializer: Any keras initializer to be applied to bias. Only valid if `use_bias == True`. @@ -136,7 +142,7 @@ def __init__(self, regularizer objects. bias_regularizer: None or single element or list of any Keras regularizer objects. 
- **kwargs: Other args passed to `tf.keras.layers.Layer` initializer. + **kwargs: Other args passed to `keras.layers.Layer` initializer. Raises: ValueError: if monotonicity specified incorrectly. diff --git a/tensorflow_lattice/python/linear_test.py b/tensorflow_lattice/python/linear_test.py index 33a0429..5b0c9a1 100644 --- a/tensorflow_lattice/python/linear_test.py +++ b/tensorflow_lattice/python/linear_test.py @@ -24,10 +24,16 @@ from absl.testing import parameterized import numpy as np import tensorflow as tf -from tensorflow import keras from tensorflow_lattice.python import linear_layer as linl from tensorflow_lattice.python import test_utils from tensorflow_lattice.python import utils +# pylint: disable=g-import-not-at-top +# Use Keras 2. +version_fn = getattr(tf.keras, "version", None) +if version_fn and version_fn().startswith("3."): + import tf_keras as keras +else: + keras = tf.keras _DISABLE_ALL = False _LOSS_EPS = 0.0001 @@ -39,7 +45,7 @@ class LinearTest(parameterized.TestCase, tf.test.TestCase): def setUp(self): super(LinearTest, self).setUp() - tf.keras.utils.set_random_seed(42) + keras.utils.set_random_seed(42) def _ResetAllBackends(self): keras.backend.clear_session() @@ -115,9 +121,7 @@ def _GetTrainingInputsAndLabels(self, config): Returns: - Tuple `(training_inputs, training_labels, raw_training_inputs)` where - `training_inputs` and `training_labels` are data for training and - `raw_training_inputs` are representation of `training_inputs` for - visualisation. + Tuple `(training_inputs, training_labels)` where `training_inputs` and + `training_labels` are data for training. """ raw_training_inputs = config["x_generator"]( num_points=config["num_training_records"], @@ -133,16 +137,14 @@ def _GetTrainingInputsAndLabels(self, config): training_inputs = raw_training_inputs training_labels = [config["y_function"](x) for x in training_inputs] - return training_inputs, training_labels, raw_training_inputs + return training_inputs, training_labels - def _TrainModel(self, config, plot_path=None): + def _TrainModel(self, config): """Trains model and returns loss. Args: config: Layer config internal for this test which specifies params of linear layer to train. - plot_path: if specified - png file name to save visualisation. See - test_utils.run_training_loop() for more details. Returns: Training loss.
@@ -153,7 +155,7 @@ def _TrainModel(self, config, plot_path=None): self._ResetAllBackends() - training_inputs, training_labels, raw_training_inputs = ( + training_inputs, training_labels = ( self._GetTrainingInputsAndLabels(config)) units = config["units"] num_input_dims = config["num_input_dims"] @@ -197,13 +199,11 @@ def _TrainModel(self, config, plot_path=None): optimizer = config["optimizer"](learning_rate=config["learning_rate"]) model.compile(loss=keras.losses.mean_squared_error, optimizer=optimizer) - training_data = (training_inputs, training_labels, raw_training_inputs) + training_data = (training_inputs, training_labels) loss = test_utils.run_training_loop( - config=config, - training_data=training_data, - keras_model=model, - plot_path=plot_path) + config=config, training_data=training_data, keras_model=model + ) assetion_ops = linear_layer.assert_constraints( eps=config["allowed_constraints_violation"]) @@ -240,7 +240,7 @@ def testOneDUnconstrained(self, use_bias, expected_loss): "use_bias": use_bias, "num_training_records": 128, "num_training_epoch": 400, - "optimizer": tf.keras.optimizers.Adagrad, + "optimizer": keras.optimizers.Adagrad, "learning_rate": 1.5, "x_generator": self._ScaterXUniformly, "input_min": 5.0, @@ -261,7 +261,7 @@ def testTwoDUnconstrained(self, use_bias, expected_loss): "use_bias": use_bias, "num_training_records": 64, "num_training_epoch": 160, - "optimizer": tf.keras.optimizers.Adagrad, + "optimizer": keras.optimizers.Adagrad, "learning_rate": 1.5, "x_generator": self._TwoDMeshGrid, "input_min": 0.0, @@ -283,7 +283,7 @@ def testInitializers(self): "use_bias": True, "num_training_records": 64, "num_training_epoch": 0, - "optimizer": tf.keras.optimizers.Adagrad, + "optimizer": keras.optimizers.Adagrad, "learning_rate": 1.5, "x_generator": self._TwoDMeshGrid, "input_min": 0.0, @@ -305,7 +305,7 @@ def testAssertConstraints(self): "num_training_epoch": 0, "normalization_order": 1, "monotonicities": [1] * 4, - "optimizer": tf.keras.optimizers.Adagrad, + "optimizer": keras.optimizers.Adagrad, "learning_rate": 1.5, "x_generator": self._ScaterXUniformly, "input_min": 0.0, @@ -340,7 +340,7 @@ def testOneDMonotonicities_MonotonicInput(self, use_bias, expected_loss): "use_bias": use_bias, "num_training_records": 128, "num_training_epoch": 400, - "optimizer": tf.keras.optimizers.Adagrad, + "optimizer": keras.optimizers.Adagrad, "learning_rate": 1.5, "x_generator": self._ScaterXUniformly, "input_min": 5.0, @@ -363,7 +363,7 @@ def testOneDMonotonicities_AntiMonotonicInput(self, use_bias, expected_loss): "use_bias": use_bias, "num_training_records": 128, "num_training_epoch": 400, - "optimizer": tf.keras.optimizers.Adagrad, + "optimizer": keras.optimizers.Adagrad, "learning_rate": 1.5, "x_generator": self._ScaterXUniformly, "input_min": 5.0, @@ -387,7 +387,7 @@ def testOneDNormalizationOrder(self, norm_order, weight): "use_bias": True, "num_training_records": 128, "num_training_epoch": 20, - "optimizer": tf.keras.optimizers.Adagrad, + "optimizer": keras.optimizers.Adagrad, "learning_rate": 1.5, "x_generator": self._ScaterXUniformly, "input_min": 0.0, @@ -410,7 +410,7 @@ def testOneDNormalizationOrderZeroWeights(self): "use_bias": True, "num_training_records": 128, "num_training_epoch": 20, - "optimizer": tf.keras.optimizers.Adagrad, + "optimizer": keras.optimizers.Adagrad, "learning_rate": 1.5, "x_generator": self._ScaterXUniformly, "input_min": 0.0, @@ -439,7 +439,7 @@ def testTwoDMonotonicity(self, expected_loss, monotonicities): "use_bias": True, 
"num_training_records": 64, "num_training_epoch": 160, - "optimizer": tf.keras.optimizers.Adagrad, + "optimizer": keras.optimizers.Adagrad, "learning_rate": 1.5, "x_generator": self._TwoDMeshGrid, "input_min": 0.0, @@ -487,7 +487,7 @@ def testTwoDNormalizationOrder(self, norm_order, weights, monotonicities, "use_bias": True, "num_training_records": 64, "num_training_epoch": 160, - "optimizer": tf.keras.optimizers.Adagrad, + "optimizer": keras.optimizers.Adagrad, "learning_rate": 0.15, "x_generator": self._TwoDMeshGrid, "input_min": 0.0, @@ -514,7 +514,7 @@ def testFiveDAllConstraints(self, weights, monotonicities, expected_loss): "use_bias": True, "num_training_records": 640, "num_training_epoch": 160, - "optimizer": tf.keras.optimizers.Adagrad, + "optimizer": keras.optimizers.Adagrad, "learning_rate": 1.5, "x_generator": self._ScaterXUniformly, "input_min": 0.0, @@ -539,7 +539,7 @@ def testTwoDMonotonicDominance(self, expected_loss, dominances): "monotonic_dominances": dominances, "num_training_records": 64, "num_training_epoch": 160, - "optimizer": tf.keras.optimizers.Adagrad, + "optimizer": keras.optimizers.Adagrad, "learning_rate": 1.5, "x_generator": self._TwoDMeshGrid, "input_min": 0.0, @@ -567,7 +567,7 @@ def testTwoDRangeDominance(self, dominances, monotonicities, weights, "clip_max": (1.0, 4.0, "none"), "num_training_records": 64, "num_training_epoch": 160, - "optimizer": tf.keras.optimizers.Adagrad, + "optimizer": keras.optimizers.Adagrad, "learning_rate": 1.5, "x_generator": self._ScaterXUniformly, "input_min": 0.0, @@ -592,7 +592,7 @@ def testRegularizers(self, regularizer): "use_bias": True, "num_training_records": 64, "num_training_epoch": 0, - "optimizer": tf.keras.optimizers.Adagrad, + "optimizer": keras.optimizers.Adagrad, "learning_rate": 1.5, "x_generator": self._TwoDMeshGrid, "input_min": 0.0, diff --git a/tensorflow_lattice/python/parallel_combination_layer.py b/tensorflow_lattice/python/parallel_combination_layer.py index ae0c711..3e22930 100644 --- a/tensorflow_lattice/python/parallel_combination_layer.py +++ b/tensorflow_lattice/python/parallel_combination_layer.py @@ -22,11 +22,17 @@ from __future__ import print_function import tensorflow as tf -from tensorflow import keras from tensorflow_lattice.python import categorical_calibration_layer from tensorflow_lattice.python import lattice_layer from tensorflow_lattice.python import linear_layer from tensorflow_lattice.python import pwl_calibration_layer +# pylint: disable=g-import-not-at-top +# Use Keras 2. +version_fn = getattr(tf.keras, "version", None) +if version_fn and version_fn().startswith("3."): + import tf_keras as keras +else: + keras = tf.keras # TODO: Add support for calibrators with units > 1. @@ -80,7 +86,7 @@ def __init__(self, calibration_layers=None, single_output=True, **kwargs): single_output: if True returns output as single tensor of shape `(batch_size, k)`. Otherwise returns list of `k` tensors of shape `(batch_size, 1)`. - **kwargs: other args passed to `tf.keras.layers.Layer` initializer. + **kwargs: other args passed to `keras.layers.Layer` initializer. 
""" super(ParallelCombination, self).__init__(**kwargs) self.calibration_layers = [] diff --git a/tensorflow_lattice/python/parallel_combination_test.py b/tensorflow_lattice/python/parallel_combination_test.py index ee4e365..fbf4dca 100644 --- a/tensorflow_lattice/python/parallel_combination_test.py +++ b/tensorflow_lattice/python/parallel_combination_test.py @@ -20,9 +20,15 @@ from absl.testing import parameterized import numpy as np import tensorflow as tf -from tensorflow import keras from tensorflow_lattice.python import lattice_layer as ll from tensorflow_lattice.python import parallel_combination_layer as pcl +# pylint: disable=g-import-not-at-top +# Use Keras 2. +version_fn = getattr(tf.keras, "version", None) +if version_fn and version_fn().startswith("3."): + import tf_keras as keras +else: + keras = tf.keras class ParallelCombinationTest(parameterized.TestCase, tf.test.TestCase): @@ -30,7 +36,7 @@ class ParallelCombinationTest(parameterized.TestCase, tf.test.TestCase): def setUp(self): super(ParallelCombinationTest, self).setUp() self.disable_all = False - tf.keras.utils.set_random_seed(42) + keras.utils.set_random_seed(42) def testParallelCombinationSingleInput(self): if self.disable_all: @@ -138,12 +144,13 @@ def testParallelCombinationClone(self): with tempfile.NamedTemporaryFile(suffix=".h5") as f: model.save(f.name) - loaded_model = tf.keras.models.load_model( + loaded_model = keras.models.load_model( f.name, custom_objects={ "ParallelCombination": pcl.ParallelCombination, - "Lattice": ll.Lattice - }) + "Lattice": ll.Lattice, + }, + ) predictions = loaded_model.predict(test_inputs) self.assertTrue( np.allclose(predictions, np.asarray([[0.0], [1.4], [6.0]]))) diff --git a/tensorflow_lattice/python/premade.py b/tensorflow_lattice/python/premade.py index d671a67..6143a98 100644 --- a/tensorflow_lattice/python/premade.py +++ b/tensorflow_lattice/python/premade.py @@ -29,13 +29,22 @@ ``` Supported models are defined in `tfl.configs`. Each model architecture can be -used the same as any other `tf.keras.Model`. +used the same as any other `keras.Model`. """ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import tensorflow as tf +# pylint: disable=g-import-not-at-top +# Use Keras 2. +version_fn = getattr(tf.keras, "version", None) +if version_fn and version_fn().startswith("3."): + import tf_keras as keras +else: + keras = tf.keras + from . import aggregation_layer from . import categorical_calibration_layer from . import configs @@ -47,15 +56,12 @@ from . import pwl_calibration_layer from . import rtl_layer -from absl import logging -import tensorflow as tf - # TODO: add support for serialization and object scoping or annoations. -class CalibratedLatticeEnsemble(tf.keras.Model): +class CalibratedLatticeEnsemble(keras.Model): """Premade model for Tensorflow calibrated lattice ensemble models. - Creates a `tf.keras.Model` for the model architecture specified by the + Creates a `keras.Model` for the model architecture specified by the `model_config`, which should be a `tfl.configs.CalibratedLatticeEnsembleConfig`. No fields in the model config will be automatically filled in, so the config must be fully specified. Note @@ -84,7 +90,7 @@ def __init__(self, model_config=None, dtype=tf.float32, **kwargs): model_config: Model configuration object describing model architecutre. Should be one of the model configs in `tfl.configs`. dtype: dtype of layers used in the model. 
- **kwargs: Any additional `tf.keras.Model` arguments + **kwargs: Any additional `keras.Model` arguments """ # Set our model_config self.model_config = model_config @@ -138,14 +144,14 @@ def __init__(self, model_config=None, dtype=tf.float32, **kwargs): def get_config(self): """Returns a configuration dictionary.""" config = {'name': self.name, 'trainable': self.trainable} - config['model_config'] = tf.keras.utils.legacy.serialize_keras_object( + config['model_config'] = keras.utils.legacy.serialize_keras_object( self.model_config ) return config @classmethod def from_config(cls, config, custom_objects=None): - model_config = tf.keras.utils.legacy.deserialize_keras_object( + model_config = keras.utils.legacy.deserialize_keras_object( config.get('model_config'), custom_objects=custom_objects ) premade_lib.verify_config(model_config) @@ -154,10 +160,10 @@ def from_config(cls, config, custom_objects=None): trainable=config.get('trainable', True)) -class CalibratedLattice(tf.keras.Model): +class CalibratedLattice(keras.Model): """Premade model for Tensorflow calibrated lattice models. - Creates a `tf.keras.Model` for the model architecture specified by the + Creates a `keras.Model` for the model architecture specified by the `model_config`, which should be a `tfl.configs.CalibratedLatticeConfig`. No fields in the model config will be automatically filled in, so the config must be fully specified. Note that the inputs to the model should match the @@ -185,7 +191,7 @@ def __init__(self, model_config=None, dtype=tf.float32, **kwargs): model_config: Model configuration object describing model architecutre. Should be one of the model configs in `tfl.configs`. dtype: dtype of layers used in the model. - **kwargs: Any additional `tf.keras.Model` arguments. + **kwargs: Any additional `keras.Model` arguments. """ # Set our model_config self.model_config = model_config @@ -248,14 +254,14 @@ def __init__(self, model_config=None, dtype=tf.float32, **kwargs): def get_config(self): """Returns a configuration dictionary.""" config = {'name': self.name, 'trainable': self.trainable} - config['model_config'] = tf.keras.utils.legacy.serialize_keras_object( + config['model_config'] = keras.utils.legacy.serialize_keras_object( self.model_config ) return config @classmethod def from_config(cls, config, custom_objects=None): - model_config = tf.keras.utils.legacy.deserialize_keras_object( + model_config = keras.utils.legacy.deserialize_keras_object( config.get('model_config'), custom_objects=custom_objects ) premade_lib.verify_config(model_config) @@ -264,10 +270,10 @@ def from_config(cls, config, custom_objects=None): trainable=config.get('trainable', True)) -class CalibratedLinear(tf.keras.Model): +class CalibratedLinear(keras.Model): """Premade model for Tensorflow calibrated linear models. - Creates a `tf.keras.Model` for the model architecture specified by the + Creates a `keras.Model` for the model architecture specified by the `model_config`, which should be a `tfl.configs.CalibratedLinearConfig`. No fields in the model config will be automatically filled in, so the config must be fully specified. Note that the inputs to the model should match the @@ -295,7 +301,7 @@ def __init__(self, model_config=None, dtype=tf.float32, **kwargs): model_config: Model configuration object describing model architecutre. Should be one of the model configs in `tfl.configs`. dtype: dtype of layers used in the model. - **kwargs: Any additional `tf.keras.Model` arguments. + **kwargs: Any additional `keras.Model` arguments. 
""" # Set our model_config self.model_config = model_config @@ -361,14 +367,14 @@ def __init__(self, model_config=None, dtype=tf.float32, **kwargs): def get_config(self): """Returns a configuration dictionary.""" config = {'name': self.name, 'trainable': self.trainable} - config['model_config'] = tf.keras.utils.legacy.serialize_keras_object( + config['model_config'] = keras.utils.legacy.serialize_keras_object( self.model_config ) return config @classmethod def from_config(cls, config, custom_objects=None): - model_config = tf.keras.utils.legacy.deserialize_keras_object( + model_config = keras.utils.legacy.deserialize_keras_object( config.get('model_config'), custom_objects=custom_objects ) premade_lib.verify_config(model_config) @@ -379,10 +385,10 @@ def from_config(cls, config, custom_objects=None): # TODO: add support for tf.map_fn and inputs of shape (B, ?, input_dim) # as well as non-ragged inputs using padding/mask. -class AggregateFunction(tf.keras.Model): +class AggregateFunction(keras.Model): """Premade model for Tensorflow aggregate function learning models. - Creates a `tf.keras.Model` for the model architecture specified by the + Creates a `keras.Model` for the model architecture specified by the `model_config`, which should be a `tfl.configs.AggregateFunctionConfig`. No fields in the model config will be automatically filled in, so the config @@ -408,7 +414,7 @@ def __init__(self, model_config=None, dtype=tf.float32, **kwargs): model_config: Model configuration object describing model architecutre. Should be a `tfl.configs.AggregateFunctionConfig` instance. dtype: dtype of layers used in the model. - **kwargs: Any additional `tf.keras.Model` arguments. + **kwargs: Any additional `keras.Model` arguments. """ # Set our model_config self.model_config = model_config @@ -479,14 +485,14 @@ def __init__(self, model_config=None, dtype=tf.float32, **kwargs): def get_config(self): """Returns a configuration dictionary.""" config = {'name': self.name, 'trainable': self.trainable} - config['model_config'] = tf.keras.utils.legacy.serialize_keras_object( + config['model_config'] = keras.utils.legacy.serialize_keras_object( self.model_config ) return config @classmethod def from_config(cls, config, custom_objects=None): - model_config = tf.keras.utils.legacy.deserialize_keras_object( + model_config = keras.utils.legacy.deserialize_keras_object( config.get('model_config'), custom_objects=custom_objects ) premade_lib.verify_config(model_config) diff --git a/tensorflow_lattice/python/premade_lib.py b/tensorflow_lattice/python/premade_lib.py index 79d787a..7da1ef3 100644 --- a/tensorflow_lattice/python/premade_lib.py +++ b/tensorflow_lattice/python/premade_lib.py @@ -22,6 +22,19 @@ import enum import itertools +from absl import logging +import numpy as np +import six + +import tensorflow as tf +# pylint: disable=g-import-not-at-top +# Use Keras 2. +version_fn = getattr(tf.keras, 'version', None) +if version_fn and version_fn().startswith('3.'): + import tf_keras as keras +else: + keras = tf.keras + from . import aggregation_layer from . import categorical_calibration_layer from . import configs @@ -34,12 +47,6 @@ from . import rtl_layer from . import utils -from absl import logging -import numpy as np -import six - -import tensorflow as tf -from tensorflow import estimator as tf_estimator # Layer names used for layers in the premade models. 
AGGREGATION_LAYER_NAME = 'tfl_aggregation' @@ -167,7 +174,7 @@ def _output_range(layer_output_range, model_config, feature_config=None): def build_input_layer(feature_configs, dtype, ragged=False): - """Creates a mapping from feature name to `tf.keras.Input`. + """Creates a mapping from feature name to `keras.Input`. Args: feature_configs: A list of `tfl.configs.FeatureConfig` instances that @@ -176,7 +183,7 @@ def build_input_layer(feature_configs, dtype, ragged=False): ragged: If the inputs are ragged tensors. Returns: - Mapping from feature name to `tf.keras.Input` for the inputs specified by + Mapping from feature name to `keras.Input` for the inputs specified by `feature_configs`. """ input_layer = {} @@ -184,10 +191,10 @@ def build_input_layer(feature_configs, dtype, ragged=False): for feature_config in feature_configs: layer_name = '{}_{}'.format(INPUT_LAYER_NAME, feature_config.name) if feature_config.num_buckets: - input_layer[feature_config.name] = tf.keras.Input( + input_layer[feature_config.name] = keras.Input( shape=shape, ragged=ragged, dtype=tf.int32, name=layer_name) else: - input_layer[feature_config.name] = tf.keras.Input( + input_layer[feature_config.name] = keras.Input( shape=shape, ragged=ragged, dtype=dtype, name=layer_name) return input_layer @@ -199,7 +206,7 @@ def build_multi_unit_calibration_layers(calibration_input_layer, """Creates a mapping from feature names to calibration outputs. Args: - calibration_input_layer: A mapping from feature name to `tf.keras.Input`. + calibration_input_layer: A mapping from feature name to `keras.Input`. calibration_output_units: A mapping from feature name to units. model_config: Model configuration object describing model architecture. Should be one of the model configs in `tfl.configs`. @@ -225,7 +232,7 @@ def build_multi_unit_calibration_layers(calibration_input_layer, feature_config) if feature_config.num_buckets: - kernel_initializer = tf.keras.initializers.RandomUniform( + kernel_initializer = keras.initializers.RandomUniform( output_init_min, output_init_max) calibrated = ( categorical_calibration_layer.CategoricalCalibration( @@ -287,7 +294,7 @@ def build_calibration_layers(calibration_input_layer, model_config, """Creates a calibration layer for `submodels` as list of list of features. Args: - calibration_input_layer: A mapping from feature name to `tf.keras.Input`. + calibration_input_layer: A mapping from feature name to `keras.Input`. model_config: Model configuration object describing model architecture. Should be one of the model configs in `tfl.configs`. layer_output_range: A `tfl.premade_lib.LayerOutputRange` enum. @@ -357,7 +364,7 @@ def build_aggregation_layer(aggregation_input_layer, model_config, Args: aggregation_input_layer: A list or a mapping from feature name to - `tf.keras.Input`, in the order or format expected by + `keras.Input`, in the order or format expected by `calibrated_lattice_models`. model_config: Model configuration object describing model architecture. Should be one of the model configs in `tfl.configs`. 
@@ -385,7 +392,7 @@ def build_aggregation_layer(aggregation_input_layer, model_config, agg_output = aggregation_layer.Aggregation( calibrated_lattice_models[i], name=agg_layer_name)( aggregation_input_layer) - agg_output = tf.keras.layers.Reshape((1,))(agg_output) + agg_output = keras.layers.Reshape((1,))(agg_output) if model_config.middle_calibration: agg_output = pwl_calibration_layer.PWLCalibration( input_keypoints=np.linspace( @@ -403,7 +410,7 @@ def build_aggregation_layer(aggregation_input_layer, model_config, dtype=dtype, )( agg_output) - agg_output = tf.keras.layers.Reshape((1,))(agg_output) + agg_output = keras.layers.Reshape((1,))(agg_output) lattice_inputs.append(agg_output) # We use random monotonic initialization here to break the symmetry that we @@ -492,11 +499,11 @@ def build_linear_layer(linear_input, feature_configs, model_config, """ layer_name = '{}_{}'.format(LINEAR_LAYER_NAME, submodel_index) - linear_input = tf.keras.layers.Concatenate(axis=1)(linear_input) + linear_input = keras.layers.Concatenate(axis=1)(linear_input) num_input_dims = len(feature_configs) - kernel_initializer = tf.keras.initializers.Constant([1.0 / num_input_dims] * + kernel_initializer = keras.initializers.Constant([1.0 / num_input_dims] * num_input_dims) - bias_initializer = tf.keras.initializers.Constant(0) + bias_initializer = keras.initializers.Constant(0) if weighted_average: # Linear coefficients should be possitive and sum up to one. @@ -758,7 +765,7 @@ def build_calibrated_lattice_ensemble_layer(calibration_input_layer, """Creates a calibration layer followed by a lattice ensemble layer. Args: - calibration_input_layer: A mapping from feature name to `tf.keras.Input`. + calibration_input_layer: A mapping from feature name to `keras.Input`. model_config: Model configuration object describing model architecture. Should be one of the model configs in `tfl.configs`. average_outputs: Whether to average the outputs of this layer. 
@@ -811,7 +818,7 @@ def build_calibrated_lattice_ensemble_layer(calibration_input_layer, dtype=dtype) if average_outputs: - lattice_outputs = tf.keras.layers.Average()(lattice_outputs) + lattice_outputs = keras.layers.Average()(lattice_outputs) return lattice_outputs @@ -830,12 +837,12 @@ def build_linear_combination_layer(ensemble_outputs, model_config, dtype): """ if isinstance(ensemble_outputs, list): num_input_dims = len(ensemble_outputs) - linear_input = tf.keras.layers.Concatenate(axis=1)(ensemble_outputs) + linear_input = keras.layers.Concatenate(axis=1)(ensemble_outputs) else: num_input_dims = int(ensemble_outputs.shape[1]) linear_input = ensemble_outputs - kernel_initializer = tf.keras.initializers.Constant(1.0 / num_input_dims) - bias_initializer = tf.keras.initializers.Constant(0) + kernel_initializer = keras.initializers.Constant(1.0 / num_input_dims) + bias_initializer = keras.initializers.Constant(0) if (not model_config.output_calibration and model_config.output_min is None and model_config.output_max is None): @@ -878,7 +885,7 @@ def build_output_calibration_layer(output_calibration_input, model_config, model_config.output_initialization, to_begin=model_config.output_initialization[0]) input_keypoints = np.linspace(0.0, 1.0, num=len(kernel_init_values)) - kernel_initializer = tf.keras.initializers.Constant(kernel_init_values) + kernel_initializer = keras.initializers.Constant(kernel_init_values) kernel_regularizer = _output_calibration_regularizers(model_config) return pwl_calibration_layer.PWLCalibration( input_keypoints=input_keypoints, @@ -1070,15 +1077,15 @@ def construct_prefitting_model_config(model_config, feature_names=None): def _verify_prefitting_model(prefitting_model, feature_names): """Checks that prefitting_model has the proper input layer.""" - if isinstance(prefitting_model, tf.keras.Model): + if isinstance(prefitting_model, keras.Model): layer_names = [layer.name for layer in prefitting_model.layers] - elif isinstance(prefitting_model, tf_estimator.Estimator): + elif hasattr(prefitting_model, 'get_variable_names'): # estimator layer_names = prefitting_model.get_variable_names() else: raise ValueError('Invalid model type for prefitting_model: {}'.format( type(prefitting_model))) for feature_name in feature_names: - if isinstance(prefitting_model, tf.keras.Model): + if isinstance(prefitting_model, keras.Model): input_layer_name = '{}_{}'.format(INPUT_LAYER_NAME, feature_name) if input_layer_name not in layer_names: raise ValueError( @@ -1104,9 +1111,9 @@ def _verify_prefitting_model(prefitting_model, feature_names): def _get_lattice_weights(prefitting_model, lattice_index): """Gets the weights of the lattice at the specfied index.""" - if isinstance(prefitting_model, tf.keras.Model): + if isinstance(prefitting_model, keras.Model): lattice_layer_name = '{}_{}'.format(LATTICE_LAYER_NAME, lattice_index) - weights = tf.keras.backend.get_value( + weights = keras.backend.get_value( prefitting_model.get_layer(lattice_layer_name).weights[0]) else: # We have already checked the types by this point, so if prefitting_model @@ -1327,7 +1334,7 @@ def set_crystals_lattice_ensemble(model_config, raise ValueError('model_config.lattices must be set to \'crystals\'.') # Note that we cannot check the type of the prefitting model without importing # premade/estimators, which would cause a cyclic dependency. 
However, we can - # check that the model is a tf.keras.Model or tf.Estimator instance that has + # check that the model is a keras.Model or tf.Estimator instance that has # the proper input layers matching prefitting_model_config feature_configs. # Beyond that, a prefitting_model with proper input layer names that is not of # the proper type will have undefined behavior. diff --git a/tensorflow_lattice/python/premade_test.py b/tensorflow_lattice/python/premade_test.py index aeb87df..1501509 100644 --- a/tensorflow_lattice/python/premade_test.py +++ b/tensorflow_lattice/python/premade_test.py @@ -29,7 +29,13 @@ from tensorflow_lattice.python import configs from tensorflow_lattice.python import premade from tensorflow_lattice.python import premade_lib - +# pylint: disable=g-import-not-at-top +# Use Keras 2. +version_fn = getattr(tf.keras, 'version', None) +if version_fn and version_fn().startswith('3.'): + import tf_keras as keras +else: + keras = tf.keras fake_data = { 'train_xs': [np.array([1]), np.array([3]), np.array([0])], @@ -101,12 +107,13 @@ class PremadeTest(parameterized.TestCase, tf.test.TestCase): def setUp(self): super(PremadeTest, self).setUp() - tf.keras.utils.set_random_seed(42) + keras.utils.set_random_seed(42) # UCI Statlog (Heart) dataset. - heart_csv_file = tf.keras.utils.get_file( + heart_csv_file = keras.utils.get_file( 'heart.csv', - 'http://storage.googleapis.com/download.tensorflow.org/data/heart.csv') + 'http://storage.googleapis.com/download.tensorflow.org/data/heart.csv', + ) heart_df = pd.read_csv(heart_csv_file) thal_vocab_list = ['normal', 'fixed', 'reversible'] heart_df['thal'] = heart_df['thal'].map( @@ -257,7 +264,7 @@ def setUp(self): add_missing_feature_configs=False) def _ResetAllBackends(self): - tf.keras.backend.clear_session() + keras.backend.clear_session() tf.compat.v1.reset_default_graph() class Encoder(json.JSONEncoder): @@ -482,8 +489,9 @@ def testCalibratedLatticeEnsembleCrystals(self, interpolation, prefitting_model = premade.CalibratedLatticeEnsemble( prefitting_model_config) prefitting_model.compile( - loss=tf.keras.losses.BinaryCrossentropy(from_logits=True), - optimizer=tf.keras.optimizers.legacy.Adam(0.01)) + loss=keras.losses.BinaryCrossentropy(from_logits=True), + optimizer=keras.optimizers.legacy.Adam(0.01), + ) prefitting_model.fit( self.heart_train_x, self.heart_train_y, @@ -496,9 +504,10 @@ def testCalibratedLatticeEnsembleCrystals(self, interpolation, # Construct and train final model model = premade.CalibratedLatticeEnsemble(model_config) model.compile( - loss=tf.keras.losses.BinaryCrossentropy(from_logits=True), - metrics=tf.keras.metrics.AUC(from_logits=True), - optimizer=tf.keras.optimizers.legacy.Adam(0.01)) + loss=keras.losses.BinaryCrossentropy(from_logits=True), + metrics=keras.metrics.AUC(from_logits=True), + optimizer=keras.optimizers.legacy.Adam(0.01), + ) model.fit( self.heart_train_x, self.heart_train_y, @@ -550,9 +559,10 @@ def testCalibratedLatticeEnsembleRTL(self, interpolation, parameterization, # Construct and train final model model = premade.CalibratedLatticeEnsemble(model_config) model.compile( - loss=tf.keras.losses.BinaryCrossentropy(from_logits=True), - metrics=tf.keras.metrics.AUC(from_logits=True), - optimizer=tf.keras.optimizers.legacy.Adam(0.01)) + loss=keras.losses.BinaryCrossentropy(from_logits=True), + metrics=keras.metrics.AUC(from_logits=True), + optimizer=keras.optimizers.legacy.Adam(0.01), + ) model.fit( self.heart_train_x, self.heart_train_y, @@ -599,9 +609,10 @@ def testCalibratedLattice(self, 
interpolation, parameterization, num_terms, # Construct and train final model model = premade.CalibratedLattice(model_config) model.compile( - loss=tf.keras.losses.BinaryCrossentropy(from_logits=True), - metrics=tf.keras.metrics.AUC(from_logits=True), - optimizer=tf.keras.optimizers.legacy.Adam(0.01)) + loss=keras.losses.BinaryCrossentropy(from_logits=True), + metrics=keras.metrics.AUC(from_logits=True), + optimizer=keras.optimizers.legacy.Adam(0.01), + ) model.fit( self.heart_train_x[:5], self.heart_train_y, @@ -641,9 +652,10 @@ def testLearnedCalibrationInputKeypoints(self): # Construct and train final model model = premade.CalibratedLatticeEnsemble(model_config) model.compile( - loss=tf.keras.losses.BinaryCrossentropy(from_logits=True), - metrics=tf.keras.metrics.AUC(from_logits=True), - optimizer=tf.keras.optimizers.legacy.Adam(0.01)) + loss=keras.losses.BinaryCrossentropy(from_logits=True), + metrics=keras.metrics.AUC(from_logits=True), + optimizer=keras.optimizers.legacy.Adam(0.01), + ) model.fit( self.heart_train_x, self.heart_train_y, @@ -672,9 +684,10 @@ def testLearnedCalibrationInputKeypoints(self): # Construct and train final model model = premade.CalibratedLattice(model_config) model.compile( - loss=tf.keras.losses.BinaryCrossentropy(from_logits=True), - metrics=tf.keras.metrics.AUC(from_logits=True), - optimizer=tf.keras.optimizers.legacy.Adam(0.01)) + loss=keras.losses.BinaryCrossentropy(from_logits=True), + metrics=keras.metrics.AUC(from_logits=True), + optimizer=keras.optimizers.legacy.Adam(0.01), + ) model.fit( self.heart_train_x[:5], self.heart_train_y, @@ -720,13 +733,14 @@ def testLatticeEnsembleH5FormatSaveLoad(self, parameterization, num_terms): feature_config.regularizer_configs = None model = premade.CalibratedLatticeEnsemble(model_config) # Compile and fit model. - model.compile(loss='mse', optimizer=tf.keras.optimizers.legacy.Adam(0.1)) + model.compile(loss='mse', optimizer=keras.optimizers.legacy.Adam(0.1)) model.fit(fake_data['train_xs'], fake_data['train_ys']) # Save model using H5 format. with tempfile.NamedTemporaryFile(suffix='.h5') as f: - tf.keras.models.save_model(model, f.name) - loaded_model = tf.keras.models.load_model( - f.name, custom_objects=premade.get_custom_objects()) + keras.models.save_model(model, f.name) + loaded_model = keras.models.load_model( + f.name, custom_objects=premade.get_custom_objects() + ) self.assertAllClose( model.predict(fake_data['eval_xs']), loaded_model.predict(fake_data['eval_xs'])) @@ -764,13 +778,14 @@ def testLatticeEnsembleRTLH5FormatSaveLoad(self, parameterization, num_terms): model_config.regularizer_configs = None model = premade.CalibratedLatticeEnsemble(model_config) # Compile and fit model. - model.compile(loss='mse', optimizer=tf.keras.optimizers.legacy.Adam(0.1)) + model.compile(loss='mse', optimizer=keras.optimizers.legacy.Adam(0.1)) model.fit(fake_data['train_xs'], fake_data['train_ys']) # Save model using H5 format. 
with tempfile.NamedTemporaryFile(suffix='.h5') as f: - tf.keras.models.save_model(model, f.name) - loaded_model = tf.keras.models.load_model( - f.name, custom_objects=premade.get_custom_objects()) + keras.models.save_model(model, f.name) + loaded_model = keras.models.load_model( + f.name, custom_objects=premade.get_custom_objects() + ) self.assertAllClose( model.predict(fake_data['eval_xs']), loaded_model.predict(fake_data['eval_xs'])) @@ -803,13 +818,14 @@ def testLatticeH5FormatSaveLoad(self, parameterization, num_terms): feature_config.regularizer_configs = None model = premade.CalibratedLattice(model_config) # Compile and fit model. - model.compile(loss='mse', optimizer=tf.keras.optimizers.legacy.Adam(0.1)) + model.compile(loss='mse', optimizer=keras.optimizers.legacy.Adam(0.1)) model.fit(fake_data['train_xs'], fake_data['train_ys']) # Save model using H5 format. with tempfile.NamedTemporaryFile(suffix='.h5') as f: - tf.keras.models.save_model(model, f.name) - loaded_model = tf.keras.models.load_model( - f.name, custom_objects=premade.get_custom_objects()) + keras.models.save_model(model, f.name) + loaded_model = keras.models.load_model( + f.name, custom_objects=premade.get_custom_objects() + ) self.assertAllClose( model.predict(fake_data['eval_xs']), loaded_model.predict(fake_data['eval_xs'])) @@ -829,13 +845,14 @@ def testLinearH5FormatSaveLoad(self): output_initialization=[-2., -1., 0., 1., 2.]) model = premade.CalibratedLinear(model_config) # Compile and fit model. - model.compile(loss='mse', optimizer=tf.keras.optimizers.legacy.Adam(0.1)) + model.compile(loss='mse', optimizer=keras.optimizers.legacy.Adam(0.1)) model.fit(fake_data['train_xs'], fake_data['train_ys']) # Save model using H5 format. with tempfile.NamedTemporaryFile(suffix='.h5') as f: - tf.keras.models.save_model(model, f.name) - loaded_model = tf.keras.models.load_model( - f.name, custom_objects=premade.get_custom_objects()) + keras.models.save_model(model, f.name) + loaded_model = keras.models.load_model( + f.name, custom_objects=premade.get_custom_objects() + ) self.assertAllClose( model.predict(fake_data['eval_xs']), loaded_model.predict(fake_data['eval_xs'])) @@ -856,7 +873,7 @@ def testAggregateH5FormatSaveLoad(self): output_initialization=[-2., -1., 0., 1., 2.]) model = premade.AggregateFunction(model_config) # Compile and fit model. - model.compile(loss='mse', optimizer=tf.keras.optimizers.legacy.Adam(0.1)) + model.compile(loss='mse', optimizer=keras.optimizers.legacy.Adam(0.1)) model.fit(fake_data['train_xs'], fake_data['train_ys']) # Save model using H5 format. with tempfile.NamedTemporaryFile(suffix='.h5') as f: @@ -864,9 +881,10 @@ def testAggregateH5FormatSaveLoad(self): # when saving in HDF5. The keras team has informed us that we should not # push to support this since SavedModel format is the new default and no # new HDF5 functionality is desired. 
- tf.keras.models.save_model(model, f.name, include_optimizer=False) - loaded_model = tf.keras.models.load_model( - f.name, custom_objects=premade.get_custom_objects()) + keras.models.save_model(model, f.name, include_optimizer=False) + loaded_model = keras.models.load_model( + f.name, custom_objects=premade.get_custom_objects() + ) self.assertAllClose( model.predict(fake_data['eval_xs']), loaded_model.predict(fake_data['eval_xs'])) diff --git a/tensorflow_lattice/python/pwl_calibration_layer.py b/tensorflow_lattice/python/pwl_calibration_layer.py index 30e96e3..6d290cb 100644 --- a/tensorflow_lattice/python/pwl_calibration_layer.py +++ b/tensorflow_lattice/python/pwl_calibration_layer.py @@ -23,14 +23,20 @@ from __future__ import division from __future__ import print_function -from . import pwl_calibration_lib -from . import utils - from absl import logging import numpy as np import six import tensorflow as tf -from tensorflow import keras +# pylint: disable=g-import-not-at-top +# Use Keras 2. +version_fn = getattr(tf.keras, "version", None) +if version_fn and version_fn().startswith("3."): + import tf_keras as keras +else: + keras = tf.keras + +from . import pwl_calibration_lib +from . import utils INTERPOLATION_KEYPOINTS_NAME = "interpolation_keypoints" LENGTHS_NAME = "lengths" @@ -184,7 +190,7 @@ class input shape description for more details. `input_keypoints` but then allowed to vary during training, with the exception of the first and last keypoint location which are fixed. Convexity can only be imposed with "fixed". - **kwargs: Other args passed to `tf.keras.layers.Layer` initializer. + **kwargs: Other args passed to `keras.layers.Layer` initializer. Raises: ValueError: If layer hyperparameters are invalid. diff --git a/tensorflow_lattice/python/pwl_calibration_test.py b/tensorflow_lattice/python/pwl_calibration_test.py index f4c491d..c54f9bc 100644 --- a/tensorflow_lattice/python/pwl_calibration_test.py +++ b/tensorflow_lattice/python/pwl_calibration_test.py @@ -30,14 +30,20 @@ from absl.testing import parameterized import numpy as np import tensorflow as tf -from tensorflow import keras from tensorflow_lattice.python import parallel_combination_layer as parallel_combination from tensorflow_lattice.python import pwl_calibration_layer as keras_layer from tensorflow_lattice.python import test_utils from tensorflow_lattice.python import utils +# pylint: disable=g-import-not-at-top +# Use Keras 2. +version_fn = getattr(tf.keras, "version", None) +if version_fn and version_fn().startswith("3."): + import tf_keras as keras +else: + keras = tf.keras -class CalibrateWithSeparateMissing(tf.keras.layers.Layer): +class CalibrateWithSeparateMissing(keras.layers.Layer): """Create separate is_missing tensor. Splits input tensor into list: [input_tensor, is_missing_tensor] and passes @@ -62,7 +68,7 @@ def setUp(self): self._disable_all = False self._loss_eps = 0.0001 self._small_eps = 1e-6 - tf.keras.utils.set_random_seed(42) + keras.utils.set_random_seed(42) def _ResetAllBackends(self): keras.backend.clear_session() @@ -165,23 +171,18 @@ def _SetDefaults(self, config): num=config["num_keypoints"])) return config - def _TrainModel(self, config, plot_path=None): + def _TrainModel(self, config): """Trains model and returns loss. Args: config: Layer config internal for this test which specifies params of piecewise linear layer to train. - plot_path: if specified - png file name to save visualization. See - test_utils.run_training_loop() for more details. Returns: Training loss. 
""" logging.info("Testing config:") logging.info(config) - if plot_path is not None and config["units"] > 1: - raise ValueError("Test config error. " - "Can not plot multi unit calibrators.") config = self._SetDefaults(config) self._ResetAllBackends() @@ -209,7 +210,7 @@ def _TrainModel(self, config, plot_path=None): pwl_calibration_units = config["units"] model = keras.models.Sequential() - model.add(tf.keras.layers.Input(shape=[input_units], dtype=tf.float32)) + model.add(keras.layers.Input(shape=[input_units], dtype=tf.float32)) calibration_layers = [] for _ in range(num_calibration_layers): calibration_layers.append( @@ -251,13 +252,11 @@ def _TrainModel(self, config, plot_path=None): loss=keras.losses.mean_squared_error, optimizer=config["optimizer"](learning_rate=config["learning_rate"])) - training_data = (training_inputs, training_labels, training_inputs) + training_data = (training_inputs, training_labels) loss = test_utils.run_training_loop( - config=config, - training_data=training_data, - keras_model=model, - plot_path=plot_path) + config=config, training_data=training_data, keras_model=model + ) assetion_ops = [] for calibration_layer in calibration_layers: @@ -346,7 +345,7 @@ def testUnconstrainedNoMissingValue(self, units, one_d_input, expected_loss, "one_d_input": one_d_input, "num_training_records": 100, "num_training_epoch": 2000, - "optimizer": tf.keras.optimizers.Adagrad, + "optimizer": keras.optimizers.Adagrad, "learning_rate": 0.15, "x_generator": self._ScatterXUniformly, "y_function": self._SmallWaves, @@ -379,7 +378,7 @@ def testUnconstrainedWithMissingValue(self, units, missing_output_value, "units": units, "num_training_records": 100, "num_training_epoch": 2000, - "optimizer": tf.keras.optimizers.Adagrad, + "optimizer": keras.optimizers.Adagrad, "learning_rate": 0.15, "x_generator": self._ScatterXUniformly, "y_function": self._SmallWaves, @@ -405,17 +404,17 @@ def testUnconstrainedWithMissingValue(self, units, missing_output_value, self.assertAlmostEqual(loss, expected_loss, delta=self._loss_eps) @parameterized.parameters( - (1, -1.5, 1.5, tf.keras.optimizers.SGD, 2100, 0.002957), - (1, -1.5, 1.5, tf.keras.optimizers.Adagrad, 2100, 0.002798), + (1, -1.5, 1.5, keras.optimizers.SGD, 2100, 0.002957), + (1, -1.5, 1.5, keras.optimizers.Adagrad, 2100, 0.002798), # TODO: Something really weird is going on here with Adam # optimizer in case when num_training_epoch is exactly 2010. # Test verifies result with 2100 epochs which behaves as expected. 
- (1, -1.5, 1.5, tf.keras.optimizers.Adam, 2100, 0.000769), - (1, -0.5, 0.5, tf.keras.optimizers.SGD, 200, 0.011483), - (1, -0.5, 0.5, tf.keras.optimizers.Adagrad, 200, 0.011645), - (1, -0.5, 0.5, tf.keras.optimizers.Adam, 200, 0.011116), - (3, -1.5, 1.5, tf.keras.optimizers.Adagrad, 2100, 0.001759), - (3, -0.5, 0.5, tf.keras.optimizers.Adagrad, 200, 0.005986), + (1, -1.5, 1.5, keras.optimizers.Adam, 2100, 0.000769), + (1, -0.5, 0.5, keras.optimizers.SGD, 200, 0.011483), + (1, -0.5, 0.5, keras.optimizers.Adagrad, 200, 0.011645), + (1, -0.5, 0.5, keras.optimizers.Adam, 200, 0.011116), + (3, -1.5, 1.5, keras.optimizers.Adagrad, 2100, 0.001759), + (3, -0.5, 0.5, keras.optimizers.Adagrad, 200, 0.005986), ) def testNonMonotonicFunction(self, units, output_min, output_max, optimizer, num_training_epoch, expected_loss): @@ -425,7 +424,7 @@ def testNonMonotonicFunction(self, units, output_min, output_max, optimizer, "units": units, "num_training_records": 100, "num_training_epoch": 2100, - "optimizer": tf.keras.optimizers.SGD, + "optimizer": keras.optimizers.SGD, "learning_rate": 0.015, "x_generator": self._ScatterXUniformly, "y_function": self._SmallWaves, @@ -463,7 +462,7 @@ def testBoundsForMissing(self, units, missing_input_value, expected_loss): "units": units, "num_training_records": 100, "num_training_epoch": 200, - "optimizer": tf.keras.optimizers.Adagrad, + "optimizer": keras.optimizers.Adagrad, "learning_rate": 0.15, "x_generator": self._ScatterXUniformly, "y_function": self._SmallWaves, @@ -528,7 +527,7 @@ def testAllBoundsWithoutMonotonicityConstraints(self, units, output_min, "units": units, "num_training_records": 100, "num_training_epoch": 200, - "optimizer": tf.keras.optimizers.Adagrad, + "optimizer": keras.optimizers.Adagrad, "learning_rate": 0.15, "x_generator": self._ScatterXUniformly, "y_function": self._SmallWavesPlusOne, @@ -549,16 +548,16 @@ def testAllBoundsWithoutMonotonicityConstraints(self, units, output_min, self.assertAlmostEqual(loss, expected_loss, delta=self._loss_eps) @parameterized.parameters( - (1, False, tf.keras.optimizers.SGD, 0.004715), - (1, False, tf.keras.optimizers.Adagrad, 0.003820), - (1, False, tf.keras.optimizers.Adam, 0.002797), - (1, True, tf.keras.optimizers.SGD, 0.004427), - (1, True, tf.keras.optimizers.Adagrad, 0.004084), + (1, False, keras.optimizers.SGD, 0.004715), + (1, False, keras.optimizers.Adagrad, 0.003820), + (1, False, keras.optimizers.Adam, 0.002797), + (1, True, keras.optimizers.SGD, 0.004427), + (1, True, keras.optimizers.Adagrad, 0.004084), # Adam is doing terrible when required to stretch monotonic function # even if bounds are proper. 
- (1, True, tf.keras.optimizers.Adam, 0.065664), - (3, False, tf.keras.optimizers.Adagrad, 0.002371), - (3, True, tf.keras.optimizers.Adagrad, 0.002670), + (1, True, keras.optimizers.Adam, 0.065664), + (3, False, keras.optimizers.Adagrad, 0.002371), + (3, True, keras.optimizers.Adagrad, 0.002670), ) def testMonotonicProperBounds(self, units, is_clamped, optimizer, expected_loss): @@ -589,15 +588,15 @@ def testMonotonicProperBounds(self, units, is_clamped, optimizer, self.assertAlmostEqual(loss, expected_loss, delta=self._loss_eps) @parameterized.parameters( - (1, False, tf.keras.optimizers.SGD, 0.15, 0.009563), - (1, False, tf.keras.optimizers.Adagrad, 0.015, 0.011117), - (1, False, tf.keras.optimizers.Adam, 0.015, 0.015356), - (1, True, tf.keras.optimizers.SGD, 0.15, 0.009563), - (1, True, tf.keras.optimizers.Adagrad, 0.015, 0.011117), + (1, False, keras.optimizers.SGD, 0.15, 0.009563), + (1, False, keras.optimizers.Adagrad, 0.015, 0.011117), + (1, False, keras.optimizers.Adam, 0.015, 0.015356), + (1, True, keras.optimizers.SGD, 0.15, 0.009563), + (1, True, keras.optimizers.Adagrad, 0.015, 0.011117), # Adam squeezes monotonic function just slightly worse than adagrad. - (1, True, tf.keras.optimizers.Adam, 0.015, 0.015189), - (3, False, tf.keras.optimizers.Adagrad, 0.015, 0.006057), - (3, True, tf.keras.optimizers.Adagrad, 0.015, 0.006049), + (1, True, keras.optimizers.Adam, 0.015, 0.015189), + (3, False, keras.optimizers.Adagrad, 0.015, 0.006057), + (3, True, keras.optimizers.Adagrad, 0.015, 0.006049), ) def testMonotonicNarrowBounds(self, units, is_clamped, optimizer, learning_rate, expected_loss): @@ -628,15 +627,15 @@ def testMonotonicNarrowBounds(self, units, is_clamped, optimizer, self.assertAlmostEqual(loss, expected_loss, delta=self._loss_eps) @parameterized.parameters( - (1, False, tf.keras.optimizers.SGD, 0.005920), - (1, False, tf.keras.optimizers.Adagrad, 0.006080), - (1, False, tf.keras.optimizers.Adam, 0.002914), - (1, True, tf.keras.optimizers.SGD, 0.013836), - (1, True, tf.keras.optimizers.Adagrad, 0.066928), + (1, False, keras.optimizers.SGD, 0.005920), + (1, False, keras.optimizers.Adagrad, 0.006080), + (1, False, keras.optimizers.Adam, 0.002914), + (1, True, keras.optimizers.SGD, 0.013836), + (1, True, keras.optimizers.Adagrad, 0.066928), # Adam is doing terrible when required to stretch monotonic function. 
- (1, True, tf.keras.optimizers.Adam, 0.230402), - (3, False, tf.keras.optimizers.Adagrad, 0.004891), - (3, True, tf.keras.optimizers.Adagrad, 0.021490), + (1, True, keras.optimizers.Adam, 0.230402), + (3, False, keras.optimizers.Adagrad, 0.004891), + (3, True, keras.optimizers.Adagrad, 0.021490), ) def testMonotonicWideBounds(self, units, is_clamped, optimizer, expected_loss): @@ -805,7 +804,7 @@ def testAllBoundsAndMonotonicityDirection(self, units, output_min, output_max, "units": units, "num_training_records": 100, "num_training_epoch": 200, - "optimizer": tf.keras.optimizers.Adagrad, + "optimizer": keras.optimizers.Adagrad, "learning_rate": 0.15, "x_generator": self._ScatterXUniformly, "y_function": self._SmallWavesPlusOne, @@ -843,7 +842,7 @@ def testConvexitySimple(self, units, convexity, expected_loss): "units": units, "num_training_records": 100, "num_training_epoch": 120, - "optimizer": tf.keras.optimizers.Adagrad, + "optimizer": keras.optimizers.Adagrad, "learning_rate": 1.0, "x_generator": self._ScatterXUniformly, "y_function": self._SmallWaves, @@ -878,7 +877,7 @@ def testConvexityNonUniformKeypoints(self, units, convexity, expected_loss): "units": units, "num_training_records": 100, "num_training_epoch": 200, - "optimizer": tf.keras.optimizers.Adagrad, + "optimizer": keras.optimizers.Adagrad, "learning_rate": 1.0, "x_generator": self._ScatterXUniformly, "y_function": self._WavyParabola, @@ -919,7 +918,7 @@ def testConvexityDifferentNumKeypoints(self, units, num_keypoints, "units": units, "num_training_records": 100, "num_training_epoch": 120, - "optimizer": tf.keras.optimizers.Adagrad, + "optimizer": keras.optimizers.Adagrad, "learning_rate": 0.3, "x_generator": self._ScatterXUniformly, "y_function": self._WavyParabola, @@ -959,7 +958,7 @@ def testConvexityWithMonotonicityAndBounds(self, units, monotonicity, "units": units, "num_training_records": 100, "num_training_epoch": 120, - "optimizer": tf.keras.optimizers.Adagrad, + "optimizer": keras.optimizers.Adagrad, "learning_rate": 0.5, "x_generator": self._ScatterXUniformly, "y_function": self._WavyParabola, @@ -993,7 +992,7 @@ def testInputKeypoints(self, keypoints): config = { "num_training_records": 100, "num_training_epoch": 200, - "optimizer": tf.keras.optimizers.Adagrad, + "optimizer": keras.optimizers.Adagrad, "learning_rate": 0.15, "x_generator": self._ScatterXUniformly, "y_function": self._SmallWaves, @@ -1022,7 +1021,7 @@ def testIsCyclic(self, units, regularizer, num_training_epoch, expected_loss): "units": units, "num_training_records": 100, "num_training_epoch": num_training_epoch, - "optimizer": tf.keras.optimizers.Adagrad, + "optimizer": keras.optimizers.Adagrad, "learning_rate": 0.15, "x_generator": self._ScatterXUniformlyIncludeBounds, "y_function": self._SinCycle, @@ -1056,7 +1055,7 @@ def testInitializer(self, units, initializer, expected_loss): "num_training_records": 100, # 0 training epochs to see pure output of initializer. 
"num_training_epoch": 0, - "optimizer": tf.keras.optimizers.Adagrad, + "optimizer": keras.optimizers.Adagrad, "learning_rate": 0.15, "x_generator": self._ScatterXUniformly, "y_function": self._SmallWaves, @@ -1103,7 +1102,7 @@ def testRegularizers(self, units, regularizer, pure_reg_loss, training_loss): "units": units, "num_training_records": 100, "num_training_epoch": 0, - "optimizer": tf.keras.optimizers.Adagrad, + "optimizer": keras.optimizers.Adagrad, "learning_rate": 0.15, "x_generator": self._ScatterXUniformly, "input_keypoints": keypoints, @@ -1138,7 +1137,7 @@ def testAssertMonotonicity(self): config = { "num_training_records": 100, "num_training_epoch": 0, - "optimizer": tf.keras.optimizers.Adagrad, + "optimizer": keras.optimizers.Adagrad, "learning_rate": 0.15, "x_generator": self._ScatterXUniformly, "y_function": self._SmallWaves, @@ -1178,7 +1177,7 @@ def testOutputShape(self): units = 10 input_keypoints = [1, 2, 3, 4, 5] input_shape, output_shape = (units,), (None, units) - input_a = tf.keras.layers.Input(shape=input_shape) + input_a = keras.layers.Input(shape=input_shape) pwl_0 = keras_layer.PWLCalibration( input_keypoints=input_keypoints, units=units) output = pwl_0(input_a) @@ -1216,7 +1215,7 @@ def testKeypointsInputs(self, input_keypoints_type, input_dims, output_units): # Check after Keras model compile model = keras.models.Sequential() - model.add(tf.keras.layers.Input(shape=[input_dims], dtype=tf.float32)) + model.add(keras.layers.Input(shape=[input_dims], dtype=tf.float32)) model.add(pwl) model.compile(loss=keras.losses.mean_squared_error) self.assertAllEqual(expected_function_output, pwl.keypoints_inputs()) diff --git a/tensorflow_lattice/python/rtl_layer.py b/tensorflow_lattice/python/rtl_layer.py index 95cb772..b82e94d 100644 --- a/tensorflow_lattice/python/rtl_layer.py +++ b/tensorflow_lattice/python/rtl_layer.py @@ -28,15 +28,21 @@ import collections import itertools -from . import kronecker_factored_lattice_layer as kfll -from . import lattice_layer -from . import rtl_lib - from absl import logging import numpy as np import six import tensorflow as tf -from tensorflow import keras +# pylint: disable=g-import-not-at-top +# Use Keras 2. +version_fn = getattr(tf.keras, 'version', None) +if version_fn and version_fn().startswith('3.'): + import tf_keras as keras +else: + keras = tf.keras + +from . import kronecker_factored_lattice_layer as kfll +from . import lattice_layer +from . import rtl_lib _MAX_RTL_SWAPS = 10000 _RTLInput = collections.namedtuple('_RTLInput', @@ -89,10 +95,10 @@ class RTL(keras.layers.Layer): Example: ```python - a = tf.keras.Input(shape=(1,)) - b = tf.keras.Input(shape=(1,)) - c = tf.keras.Input(shape=(1,)) - d = tf.keras.Input(shape=(1,)) + a = keras.Input(shape=(1,)) + b = keras.Input(shape=(1,)) + c = keras.Input(shape=(1,)) + d = keras.Input(shape=(1,)) cal_a = tfl.layers.CategoricalCalibration( units=10, output_min=0, output_max=1, ...)(a) cal_b = tfl.layers.PWLCalibration( @@ -116,7 +122,7 @@ class RTL(keras.layers.Layer): num_input_dims=5, monotonicities=['increasing'] * 5, )(rtl_1) - model = tf.keras.Model(inputs=[a, b, c, d], outputs=outputs) + model = keras.Model(inputs=[a, b, c, d], outputs=outputs) ``` """ # pyformat: enable @@ -249,7 +255,7 @@ def __init__(self, is not currently supported. average_outputs: Whether to average the outputs of this layer. Ignored when separate_outputs is True. - **kwargs: Other args passed to `tf.keras.layers.Layer` initializer. + **kwargs: Other args passed to `keras.layers.Layer` initializer. 
Raises: ValueError: If layer hyperparameters are invalid. diff --git a/tensorflow_lattice/python/rtl_test.py b/tensorflow_lattice/python/rtl_test.py index 6a0aa51..1fde4c4 100644 --- a/tensorflow_lattice/python/rtl_test.py +++ b/tensorflow_lattice/python/rtl_test.py @@ -23,6 +23,13 @@ from tensorflow_lattice.python import linear_layer from tensorflow_lattice.python import pwl_calibration_layer from tensorflow_lattice.python import rtl_layer +# pylint: disable=g-import-not-at-top +# Use Keras 2. +version_fn = getattr(tf.keras, "version", None) +if version_fn and version_fn().startswith("3."): + import tf_keras as keras +else: + keras = tf.keras class RTLTest(parameterized.TestCase, tf.test.TestCase): @@ -30,7 +37,7 @@ class RTLTest(parameterized.TestCase, tf.test.TestCase): def setUp(self): super(RTLTest, self).setUp() self.disable_all = False - tf.keras.utils.set_random_seed(42) + keras.utils.set_random_seed(42) def testRTLInputShapes(self): if self.disable_all: @@ -43,13 +50,13 @@ def testRTLInputShapes(self): target_ab = ( np.max(a, axis=1, keepdims=True) + np.min(b, axis=1, keepdims=True)) - input_a = tf.keras.layers.Input(shape=(10,)) - input_b = tf.keras.layers.Input(shape=(20,)) + input_a = keras.layers.Input(shape=(10,)) + input_b = keras.layers.Input(shape=(20,)) rtl_0 = rtl_layer.RTL(num_lattices=6, lattice_rank=5) rtl_outputs = rtl_0({"unconstrained": input_a, "increasing": input_b}) - outputs = tf.keras.layers.Dense(1)(rtl_outputs) - model = tf.keras.Model(inputs=[input_a, input_b], outputs=outputs) + outputs = keras.layers.Dense(1)(rtl_outputs) + model = keras.Model(inputs=[input_a, input_b], outputs=outputs) model.compile(loss="mse") model.fit([a, b], target_ab) model.predict([a, b]) @@ -61,10 +68,10 @@ def testRTLInputShapes(self): f = np.random.random_sample(size=(data_size, 1)) target_cdef = np.sin(np.pi * c) * np.cos(np.pi * d) - e * f - input_c = tf.keras.layers.Input(shape=(1,)) - input_d = tf.keras.layers.Input(shape=(1,)) - input_e = tf.keras.layers.Input(shape=(1,)) - input_f = tf.keras.layers.Input(shape=(1,)) + input_c = keras.layers.Input(shape=(1,)) + input_d = keras.layers.Input(shape=(1,)) + input_e = keras.layers.Input(shape=(1,)) + input_f = keras.layers.Input(shape=(1,)) input_keypoints = np.linspace(0.0, 1.0, 10) calib_c = pwl_calibration_layer.PWLCalibration( @@ -102,8 +109,9 @@ def testRTLInputShapes(self): outputs = linear_layer.Linear( num_input_dims=10, monotonicities=[1] * 10)( rtl_0_outputs) - model = tf.keras.Model( - inputs=[input_c, input_d, input_e, input_f], outputs=outputs) + model = keras.Model( + inputs=[input_c, input_d, input_e, input_f], outputs=outputs + ) model.compile(loss="mse") model.fit([c, d, e, f], target_cdef) model.predict([c, d, e, f]) @@ -124,8 +132,9 @@ def testRTLInputShapes(self): outputs = linear_layer.Linear( num_input_dims=3, monotonicities=[1] * 3)( rtl_1_outputs) - model = tf.keras.Model( - inputs=[input_c, input_d, input_e, input_f], outputs=outputs) + model = keras.Model( + inputs=[input_c, input_d, input_e, input_f], outputs=outputs + ) model.compile(loss="mse") model.fit([c, d, e, f], target_cdef) model.predict([c, d, e, f]) @@ -136,7 +145,7 @@ def testRTLOutputShape(self): # Multiple Outputs Per Lattice input_shape, output_shape = (30,), (None, 6) - input_a = tf.keras.layers.Input(shape=input_shape) + input_a = keras.layers.Input(shape=input_shape) rtl_0 = rtl_layer.RTL(num_lattices=6, lattice_rank=5) output = rtl_0(input_a) self.assertAllEqual(output_shape, rtl_0.compute_output_shape(input_a.shape)) @@ -153,10 
+162,10 @@ def testRTLSaveLoad(self): if self.disable_all: return - input_c = tf.keras.layers.Input(shape=(1,)) - input_d = tf.keras.layers.Input(shape=(1,)) - input_e = tf.keras.layers.Input(shape=(1,)) - input_f = tf.keras.layers.Input(shape=(1,)) + input_c = keras.layers.Input(shape=(1,)) + input_d = keras.layers.Input(shape=(1,)) + input_e = keras.layers.Input(shape=(1,)) + input_f = keras.layers.Input(shape=(1,)) input_keypoints = np.linspace(0.0, 1.0, 10) calib_c = pwl_calibration_layer.PWLCalibration( @@ -201,20 +210,22 @@ def testRTLSaveLoad(self): outputs = linear_layer.Linear( num_input_dims=3, monotonicities=[1] * 3)( rtl_1_outputs) - model = tf.keras.Model( - inputs=[input_c, input_d, input_e, input_f], outputs=outputs) + model = keras.Model( + inputs=[input_c, input_d, input_e, input_f], outputs=outputs + ) model.compile(loss="mse") model.use_legacy_config = True with tempfile.NamedTemporaryFile(suffix=".h5") as f: model.save(f.name) - _ = tf.keras.models.load_model( + _ = keras.models.load_model( f.name, custom_objects={ "RTL": rtl_layer.RTL, "PWLCalibration": pwl_calibration_layer.PWLCalibration, "Linear": linear_layer.Linear, - }) + }, + ) if __name__ == "__main__": diff --git a/tensorflow_lattice/python/test_utils.py b/tensorflow_lattice/python/test_utils.py index 41182c1..e458226 100644 --- a/tensorflow_lattice/python/test_utils.py +++ b/tensorflow_lattice/python/test_utils.py @@ -23,8 +23,6 @@ from absl import logging import numpy as np -from . import visualization - class TimeTracker(object): """Tracks time. @@ -56,7 +54,6 @@ def __exit__(self, unuesd_type, unuesd_value, unuesd_traceback): def run_training_loop(config, training_data, keras_model, - plot_path=None, input_dtype=np.float32, label_dtype=np.float32): """Trains models and prints debug info. @@ -64,15 +61,10 @@ def run_training_loop(config, Args: config: dictionary of test case parameters. See tests for TensorFlow Lattice layers. - training_data: tripple: (training_inputs, labels, raw_training_inputs) where + training_data: tuple: (training_inputs, labels) where training_inputs and labels are proper data to train models passed via - other parameters and raw_training_inputs are representation of - training_inputs for visualization. + other parameters. keras_model: Keras model to train on training_data. - plot_path: if specified it should be a string which represents file - name where to save model output vs ground truth visualization as png. - Supported only for 1-d and 2-d inputs. For visualisation of 2-d inputs - to work - raw_training_data must be a mesh grid. input_dtype: dtype for input conversion. label_dtype: dtype for label conversion. @@ -80,7 +72,7 @@ def run_training_loop(config, Loss measured on training data and tf.session() if one was initialized explicitly during training. 
""" - (training_inputs, training_labels, raw_training_inputs) = training_data + (training_inputs, training_labels) = training_data np_training_inputs = np.asarray(training_inputs).astype(input_dtype) np_training_labels = np.asarray(training_labels).astype(label_dtype) @@ -115,15 +107,6 @@ def run_training_loop(config, logging.info("Median training step time: %f", np.median(training_step_times)) - if plot_path: - predictions = keras_model.predict(np_training_inputs) - plots = { - "Ground truth": training_labels, - "Model": predictions - } - visualization.plot_outputs(inputs=raw_training_inputs, - outputs_map=plots, - file_path=plot_path) return loss diff --git a/tensorflow_lattice/python/visualization.py b/tensorflow_lattice/python/visualization.py deleted file mode 100644 index dae7b5c..0000000 --- a/tensorflow_lattice/python/visualization.py +++ /dev/null @@ -1,609 +0,0 @@ -# Copyright 2019 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Tools to analyse and plot TFL models.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import base64 -import math -import os -import sys -import tempfile -import xml.etree.cElementTree as cElementTree - -from . import model_info -import numpy as np - - -def _inline_svg_images(image_path): - """Inline IMAGE tag refs in graphviz SVG generated files.""" - # Adaptation of: - # https://github.com/parrt/dtreeviz/blob/23518dc9a931eb6b1bef63f1c823db8e76ca94a6/dtreeviz/utils.py - with open(image_path, encoding='UTF-8') as f: - svg = f.read() - ns = {'svg': 'http://www.w3.org/2000/svg'} - root = cElementTree.fromstring(svg) - tree = cElementTree.ElementTree(root) - parent_map = {} - for p in tree.iter(): - for c in p: - parent_map[c] = p - - # Find all image tags in document (must use SVG namespace) - image_tags = tree.findall('.//svg:g/svg:image', ns) - for img in image_tags: - # Load ref'd image and get SVG root - svgfilename = img.attrib['{http://www.w3.org/1999/xlink}href'] - with open(svgfilename, encoding='UTF-8') as f: - imgsvg = f.read() - imgroot = cElementTree.fromstring(imgsvg) - # Copy IMAGE tag attributes (width, height, etc) to SVG from image file - for k, v in img.attrib.items(): - if k not in {'{http://www.w3.org/1999/xlink}href'}: - imgroot.attrib[k] = v - # Replace IMAGE with SVG tag - p = parent_map[img] - p.append(imgroot) - p.remove(img) - - cElementTree.register_namespace('', 'http://www.w3.org/2000/svg') - cElementTree.register_namespace('xlink', 'http://www.w3.org/1999/xlink') - xml_str = cElementTree.tostring(root).decode() - return xml_str - - -def _display(image_path, image_format): - """Displays the given image with the given format.""" - import IPython.display # pylint: disable=g-import-not-at-top - if image_format == 'svg': - # Inline embedded SVG data and wrap inside an HTML display object. 
- svg = _inline_svg_images(image_path) - svg_base64 = base64.b64encode(svg.encode('utf-8')).decode() - html = ''.format( - svg_base64) - IPython.display.display(IPython.display.HTML(html)) - else: - IPython.display.display(IPython.display.Image(image_path)) - - -def draw_model_graph(model_graph, - calibrator_dpi=30, - calibrator_figsize=None, - image_format='png'): - """Draws the model graph. - - This function requires IPython and graphviz packages. - - ``` - model_graph = estimators.get_model_graph(saved_model_path) - visualization.draw_model_graph(model_graph) - ``` - - Args: - model_graph: a `model_info.ModelInfo` objects to plot. - calibrator_dpi: The DPI for calibrator plots inside the graph nodes. - calibrator_figsize: The figsize parameter for calibrator plots. - image_format: Format of the image to produce. Using 'svg' format can help - with font rendering issues. - """ - import graphviz # pylint: disable=g-import-not-at-top - import matplotlib.pyplot as plt # pylint: disable=g-import-not-at-top - - dot = graphviz.Digraph(format=image_format, engine='dot') - dot.graph_attr['ranksep'] = '0.75' - - # Check if we need split nodes for shared calibration - model_has_shared_calibration = False - for node in model_graph.nodes: - model_has_shared_calibration |= ( - (isinstance(node, model_info.PWLCalibrationNode) or - isinstance(node, model_info.CategoricalCalibrationNode)) and - (len(_output_nodes(model_graph, node)) > 1)) - - split_nodes = {} - for node in model_graph.nodes: - node_id = _node_id(node) - if (isinstance(node, model_info.PWLCalibrationNode) or - isinstance(node, model_info.CategoricalCalibrationNode)): - # Add node for calibrator with calibrator plot inside. - fig = plot_calibrator_nodes([node], figsize=calibrator_figsize) - filename = os.path.join(tempfile.tempdir, - 'i{}.{}'.format(node_id, image_format)) - plt.savefig(filename, dpi=calibrator_dpi) - plt.close(fig) - dot.node(node_id, '', image=filename, imagescale='true', shape='box') - - # Add input feature node. - node_is_feature_calibration = isinstance(node.input_node, - model_info.InputFeatureNode) - if node_is_feature_calibration: - input_node_id = node_id + 'input' - dot.node(input_node_id, node.input_node.name) - dot.edge(input_node_id + ':s', node_id + ':n') - - # Add split node for shared calibration. 
- if model_has_shared_calibration: - split_node_id = node_id + 'calibrated' - split_node_name = 'calibrated {}'.format(node.input_node.name) - dot.node(split_node_id, split_node_name) - dot.edge(node_id + ':s', split_node_id + ':n') - split_nodes[node_id] = (split_node_id, split_node_name) - - elif not isinstance(node, model_info.InputFeatureNode): - dot.node(node_id, _node_name(node), shape='box', margin='0.3') - - if node is model_graph.output_node: - output_node_id = node_id + 'output' - dot.node(output_node_id, 'output') - dot.edge(node_id + ':s', output_node_id) - - for node in model_graph.nodes: - node_id = _node_id(node) - for input_node in _input_nodes(node): - if isinstance(input_node, model_info.InputFeatureNode): - continue - input_node_id = _node_id(input_node) - if input_node_id in split_nodes: - split_node_id, split_node_name = split_nodes[input_node_id] - input_node_id = split_node_id + node_id - dot.node(input_node_id, split_node_name) - - dot.edge(input_node_id + ':s', node_id) # + ':n') - - filename = os.path.join(tempfile.tempdir, 'dot') - try: - image_path = dot.render(filename) - _display(image_path=image_path, image_format=image_format) - except graphviz.backend.ExecutableNotFound as e: - if 'IPython.core.magics.namespace' in sys.modules: - # Similar to Keras visualization lib, we don't raise an exception here to - # avoid crashing notebooks during tests. - print( - 'dot binaries were not found or not in PATH. The system running the ' - 'colab binary might not have graphviz package installed: format({})' - .format(e)) - else: - raise e - - -def plot_calibrator_nodes(nodes, - plot_submodel_calibration=True, - font_size=12, - axis_label_font_size=14, - figsize=None): - """Plots feature calibrator(s) extracted from a TFL canned estimator. - - Args: - nodes: List of calibrator nodes to be plotted. - plot_submodel_calibration: If submodel calibrators should be included in the - output plot, when more than one calibration node is provided. These are - individual calibration layers for each lattice in a lattice ensemble - constructed from `configs.CalibratedLatticeEnsembleConfig`. - font_size: Font size for values and labels on the plot. - axis_label_font_size: Font size for axis labels. - figsize: The figsize parameter passed to `pyplot.figure()`. - - Returns: - Pyplot figure object containing the visualisation. - """ - import matplotlib.pyplot as plt # pylint: disable=g-import-not-at-top - - with plt.style.context('seaborn-v0_8-whitegrid'): - plt.rc('font', size=font_size) - plt.rc('axes', titlesize=font_size) - plt.rc('xtick', labelsize=font_size) - plt.rc('ytick', labelsize=font_size) - plt.rc('legend', fontsize=font_size) - plt.rc('axes', labelsize=axis_label_font_size) - fig = plt.figure(figsize=figsize) - axes = fig.add_subplot(1, 1, 1) - if isinstance(nodes[0], model_info.PWLCalibrationNode): - _plot_pwl_calibrator(nodes, axes, plot_submodel_calibration) - elif isinstance(nodes[0], model_info.CategoricalCalibrationNode): - _plot_categorical_calibrator(nodes, axes, plot_submodel_calibration) - else: - raise ValueError('Unknown calibrator type: {}'.format(nodes[0])) - plt.tight_layout() - - return fig - - -def plot_feature_calibrator(model_graph, - feature_name, - plot_submodel_calibration=True, - font_size=12, - axis_label_font_size=14, - figsize=None): - """Plots feature calibrator(s) extracted from a TFL canned estimator. 
- - ``` - model_graph = estimators.get_model_graph(saved_model_path) - visualization.plot_feature_calibrator(model_graph, "feature_name") - ``` - - Args: - model_graph: `model_info.ModelGraph` object that includes model nodes. - feature_name: Name of the feature to plot the calibrator for. - plot_submodel_calibration: If submodel calibrators should be included in the - output plot, when more than one calibration node is provided. These are - individual calibration layers for each lattice in a lattice ensemble - constructed from `configs.CalibratedLatticeEnsembleConfig`. - font_size: Font size for values and labels on the plot. - axis_label_font_size: Font size for axis labels. - figsize: The figsize parameter passed to `pyplot.figure()`. - - Returns: - Pyplot figure object containing the visualisation. - """ - - input_feature_node = [ - input_feature_node - for input_feature_node in _input_feature_nodes(model_graph) - if input_feature_node.name == feature_name - ] - if not input_feature_node: - raise ValueError( - 'Feature "{}" not found in the model_graph.'.format(feature_name)) - - input_feature_node = input_feature_node[0] - calibrator_nodes = _output_nodes(model_graph, input_feature_node) - return plot_calibrator_nodes(calibrator_nodes, plot_submodel_calibration, - font_size, axis_label_font_size, figsize) - - -def plot_all_calibrators(model_graph, num_cols=4, image_format='png', **kwargs): - """Plots all feature calibrator(s) extracted from a TFL canned estimator. - - The generated plots are arranged in a grid. - This function requires IPython and colabtools packages. - - ``` - model_graph = estimators.get_model_graph(saved_model_path) - visualization.plot_all_calibrators(model_graph) - ``` - - Args: - model_graph: a `model_info.ModelGraph` objects to plot. - num_cols: Number of columns in the grid view. - image_format: Format of the image to produce. - **kwargs: args passed to plot_feature_calibrator and plot_calibrator_nodes. - """ - import google.colab.widgets # pylint: disable=g-import-not-at-top - import matplotlib.pyplot as plt # pylint: disable=g-import-not-at-top - - feature_infos = _input_feature_nodes(model_graph) - feature_names = sorted([feature_info.name for feature_info in feature_infos]) - - output_calibrator_node = ( - model_graph.output_node if isinstance( - model_graph.output_node, model_info.PWLCalibrationNode) else None) - - num_feature_calibrators = len(feature_names) - num_output_calibrators = 1 if output_calibrator_node else 0 - - # Calibrator plots are organized in a grid. We first plot all the feature - # calibrators, followed by any existing output calibrator. 
- num_rows = int( - math.ceil( - float(num_feature_calibrators + num_output_calibrators) / num_cols)) - for index, _ in enumerate( - google.colab.widgets.Grid( - num_rows, num_cols, style='border-top: 0; border-bottom: 0;')): - if index >= num_feature_calibrators + num_output_calibrators: - continue # Empty cells - - if index < num_feature_calibrators: - feature_name = feature_names[index] - tb = google.colab.widgets.TabBar( - ['Calibrator for "{}"'.format(feature_name), 'Large Plot']) - else: - feature_name = 'output' - tb = google.colab.widgets.TabBar(['Output calibration', 'Large Plot']) - - with tb.output_to(0, select=True): - if index < len(feature_names): - plot_feature_calibrator(model_graph, feature_name, **kwargs) - else: - plot_calibrator_nodes([output_calibrator_node], **kwargs) - image_path = os.path.join(tempfile.tempdir, - '{}.{}'.format(feature_name, image_format)) - # Save a larger temporary copy to be shown in a second tab. - plt.savefig(image_path, dpi=200) - plt.show() - with tb.output_to(1, select=False): - _display(image_path=image_path, image_format=image_format) - - -def _input_feature_nodes(model_graph): - return [ - node for node in model_graph.nodes - if isinstance(node, model_info.InputFeatureNode) - ] - - -def _node_id(node): - return str(id(node)) - - -def _node_name(node): - if isinstance(node, model_info.LinearNode): - return 'Linear' - if isinstance(node, model_info.LatticeNode): - return 'Lattice' - if isinstance(node, model_info.KroneckerFactoredLatticeNode): - return 'KroneckerFactoredLattice' - if isinstance(node, model_info.MeanNode): - return 'Average' - return str(type(node)) - - -def _contains(nodes, node): - return any(other_node is node for other_node in nodes) - - -def _input_nodes(node): - if hasattr(node, 'input_nodes'): - return node.input_nodes - if hasattr(node, 'input_node'): - return [node.input_node] - return [] - - -def _output_nodes(model_graph, node): - return [ - other_node for other_node in model_graph.nodes - if _contains(_input_nodes(other_node), node) - ] - - -_MISSING_NAME = 'missing' -_CALIBRATOR_COLOR = 'tab:blue' -_MISSING_COLOR = 'tab:orange' - - -def _plot_categorical_calibrator(categorical_calibrator_nodes, axes, - plot_submodel_calibration): - """Plots a categorical calibrator. - - - Creates a categorical calibraiton plot combining the passed in calibration - nodes. You can select to also show individual calibrator nodes in the plot. - - Args: - categorical_calibrator_nodes: a list of - `model_info.CategoricalCalibrationNode` objects in a model graph. If more - that one node is provided, they must be for the same input feature. - axes: Pyplot axes object. - plot_submodel_calibration: If submodel calibrators should be included in the - output plot, when more than one calibration node is provided. These are - individual calibration layers for each lattice in a lattice ensemble - constructed from `configs.CalibratedLatticeEnsembleConfig`. - """ - import matplotlib.pyplot as plt # pylint: disable=g-import-not-at-top - - feature_info = categorical_calibrator_nodes[0].input_node - assert feature_info.is_categorical - - # Adding missing category to input values. - # Note that there might be more than one out-of-vocabulary value - # (i.e. (num_oov_buckets + (default_value is not none)) > 1), in which case - # we name all of them missing. 
- input_values = list(feature_info.vocabulary_list) - while len(input_values) < len(categorical_calibrator_nodes[0].output_values): - input_values.append(_MISSING_NAME) - - submodels_output_values = [ - node.output_values for node in categorical_calibrator_nodes - ] - mean_output_values = np.mean(submodels_output_values, axis=0) - - # Submodels categorical outputs are plotted in grouped form inside the - # average calibration bar. - bar_width = 0.8 - sub_width = bar_width / len(submodels_output_values) - - # Bar colors for each category. - color = [ - _MISSING_COLOR if v == _MISSING_NAME else _CALIBRATOR_COLOR - for v in input_values - ] - - # Plot submodel calibrations fitting inside the average calibration bar. - x = np.arange(len(input_values)) - if plot_submodel_calibration: - for sub_index, output_values in enumerate(submodels_output_values): - plt.bar( - x - bar_width / 2 + sub_width / 2 + sub_index * sub_width, - output_values, - width=sub_width, - alpha=0.1, - color=color, - linewidth=0.5) - - # Plot average category output. - plt.bar( - x, - mean_output_values, - color=color, - linewidth=2, - alpha=0.2, - width=bar_width) - plt.bar( - x, - mean_output_values, - fill=False, - edgecolor=color, - linewidth=3, - width=bar_width) - - # Set axes labels and tick values. - plt.xlabel(feature_info.name) - plt.ylabel('calibrated {}'.format(feature_info.name)) - axes.set_xticks(x) - axes.set_xticklabels(input_values) - axes.yaxis.grid(True, linewidth=0.25) - axes.xaxis.grid(False) - - -def _plot_pwl_calibrator(pwl_calibrator_nodes, axes, plot_submodel_calibration): - """Plots a PWL calibrator. - - Creates a pwl plot combining the passed in calibration nodes. You can select - to also show individual calibrator nodes in the plot. - - Args: - pwl_calibrator_nodes: a list of `model_info.PWLCalibrationNode` objects in a - model graph. If more that one node is provided, they must be for the same - input feature. - axes: Pyplot axes object. - plot_submodel_calibration: If submodel calibrators should be included in the - output plot, when more than one calibration node is provided. These are - individual calibration layers for each lattice in a lattice ensemble - constructed from `configs.CalibratedLatticeEnsembleConfig`. - """ - import matplotlib.pyplot as plt # pylint: disable=g-import-not-at-top - - if isinstance(pwl_calibrator_nodes[0].input_node, - model_info.InputFeatureNode): - assert not pwl_calibrator_nodes[0].input_node.is_categorical - input_name = pwl_calibrator_nodes[0].input_node.name - output_name = 'calibrated {}'.format(input_name) - else: - # Output PWL calibration. - input_name = 'input' - output_name = 'output' - - # Average output_keypoints and (any) default_output across all the nodes. 
- mean_output_keypoints = np.mean( - [ - pwl_calibrator_node.output_keypoints - for pwl_calibrator_node in pwl_calibrator_nodes - ], - axis=0, - ) - if pwl_calibrator_nodes[0].default_output: - mean_default_output = np.mean([ - pwl_calibrator_node.default_output - for pwl_calibrator_node in pwl_calibrator_nodes - ]) - else: - mean_default_output = None - - if plot_submodel_calibration: - for pwl_calibrator_node in pwl_calibrator_nodes: - plt.plot( - pwl_calibrator_node.input_keypoints, - pwl_calibrator_node.output_keypoints, - '--', - linewidth=0.5, - color=_CALIBRATOR_COLOR) - if pwl_calibrator_node.default_output is not None: - plt.plot( - pwl_calibrator_node.input_keypoints, - [pwl_calibrator_node.default_output] * - len(pwl_calibrator_node.input_keypoints), - '--', - color=_MISSING_COLOR, - linewidth=0.5) - - # Skip plotting average keypoint outputs if input keypoints are not aligned. - all_input_keypoints = np.stack( - [node.input_keypoints for node in pwl_calibrator_nodes]) - input_keypoints_match = (all_input_keypoints == all_input_keypoints[0]).all() - if input_keypoints_match: - plt.plot( - pwl_calibrator_nodes[0].input_keypoints, - mean_output_keypoints, - _CALIBRATOR_COLOR, - linewidth=3, - label='calibrated') - if mean_default_output is not None: - plt.plot( - pwl_calibrator_nodes[0].input_keypoints, - [mean_default_output] * len(pwl_calibrator_nodes[0].input_keypoints), - color=_MISSING_COLOR, - linewidth=3, - label=_MISSING_NAME) - - plt.xlabel(input_name) - plt.ylabel(output_name) - axes.yaxis.grid(True, linewidth=0.25) - axes.xaxis.grid(True, linewidth=0.25) - axes.legend() - - -def plot_outputs(inputs, outputs_map, file_path=None, figsize=(20, 20)): - """Visualises several outputs for same set of inputs. - - This is generic plotting helper not tied to any layer. - Can visualize either: - - 2-d graphs: 1-d input, 1-d output. - - 3-d surfaces: 2-d input, 1-d output. - - Args: - inputs: one of: - - ordered list of 1-d points - - tuple of exactly 2 elements which represent X and Y coordinates of 2-d - mesh grid for pyplot 3-d surface visualization. See - `test_utils.two_dim_mesh_grid` for more details. - outputs_map: dictionary {name: outputs} where "outputs" is a list of 1-d - points which correspond to "inputs". "name" is an arbitrary string used as - legend. - file_path: if set - visualisation will be saved as png at specified - location. - figsize: The figsize parameter passed to `pyplot.figure()`. - - Raises: - ValueError: if configured to visualise more than 4 3-d plots. - - Returns: - Pyplot object containing visualisation. - """ - # pylint: disable=g-import-not-at-top - import matplotlib.pyplot as plt - # Needed for pyplot 3d projections. - from mpl_toolkits.mplot3d import Axes3D as _ # pylint: disable=unused-import - # pylint: enable=g-import-not-at-top - - legend = [] - if isinstance(inputs, tuple): - figure = plt.figure(figsize=figsize) - axes = figure.add_subplot(projection='3d') - # 4 colors is enough because no one would ever think of drawing 5 or more - # 3-d surfaces on same graph due to them looking like fabulous mess anyway. 
- colors = ['dodgerblue', 'forestgreen', 'saddiebrown', 'lightsalmon'] - if len(outputs_map) > 4: - raise ValueError('Cannot visualize more than 4 3-d plots.') - - x_inputs, y_inputs = inputs - for i, (name, outputs) in enumerate(outputs_map.items()): - legend.append(name) - z_outputs = np.reshape( - np.asarray(outputs), newshape=(len(x_inputs), len(x_inputs[0]))) - - axes.plot_wireframe(x_inputs, y_inputs, z_outputs, color=colors[i]) - else: - for name, outputs in sorted(outputs_map.items()): - legend.append(name) - plt.plot(inputs, outputs) - - plt.ylabel('y') - plt.xlabel('x') - - plt.legend(legend) - if file_path: - plt.savefig(file_path) - return plt
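Editor's note on the recurring import change in this patch: every module and test touched above replaces direct `tf.keras.*` references with a guarded import that pins Keras 2, falling back to the `tf_keras` compatibility package when the installed TensorFlow bundles Keras 3 (whose `tf.keras.version()` starts with "3."). The sketch below shows that shim in isolation, followed by a small hypothetical model built through the resolved `keras` symbol to illustrate how the patched modules and tests consume it; the shim is copied from the hunks above, while the model code is illustrative only and not part of this change.

```python
import tensorflow as tf

# Use Keras 2. Newer TensorFlow releases bundle Keras 3, whose
# tf.keras.version() starts with "3."; in that case fall back to the
# tf_keras compatibility package so the layers keep their Keras 2 behavior.
# pylint: disable=g-import-not-at-top
version_fn = getattr(tf.keras, "version", None)
if version_fn and version_fn().startswith("3."):
  import tf_keras as keras
else:
  keras = tf.keras

# Hypothetical usage mirroring the patched tests: build a small functional
# model through the resolved `keras` symbol instead of tf.keras.
inputs = keras.layers.Input(shape=(10,))
outputs = keras.layers.Dense(1)(inputs)
model = keras.Model(inputs=inputs, outputs=outputs)
model.compile(
    optimizer=keras.optimizers.Adagrad(learning_rate=0.15), loss="mse")
```

Under this pattern, downstream code never imports `tf_keras` or `tf.keras` directly; it always goes through the locally resolved `keras` name, which is why the hunks above rewrite `tf.keras.layers.Input`, `tf.keras.Model`, and `tf.keras.optimizers.*` to their `keras.*` equivalents.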