diff --git a/Makefile b/Makefile index 9f471852123..9035094a1c6 100644 --- a/Makefile +++ b/Makefile @@ -1,13 +1,14 @@ .PHONY: build docs test BUILDDIR := $(PWD) -CHECKDIRS := examples notebooks scripts src tests utils setup.py -CHECKGLOBS := 'examples/**/*.py' 'scripts/**/*.py' 'src/**/*.py' 'tests/**/*.py' 'utils/**/*.py' setup.py +CHECKDIRS := examples integrations notebooks scripts src tests utils setup.py +CHECKGLOBS := 'examples/**/*.py' 'integrations/**/*.py' 'scripts/**/*.py' 'src/**/*.py' 'tests/**/*.py' 'utils/**/*.py' setup.py DOCDIR := docs -MDCHECKGLOBS := 'docs/**/*.md' 'docs/**/*.rst' 'examples/**/*.md' 'notebooks/**/*.md' 'scripts/**/*.md' +MDCHECKGLOBS := 'docs/**/*.md' 'docs/**/*.rst' 'examples/**/*.md' 'integrations/**/*.md' 'notebooks/**/*.md' 'scripts/**/*.md' MDCHECKFILES := CODE_OF_CONDUCT.md CONTRIBUTING.md DEVELOPING.md README.md -TARGETS := "" # targets for running pytests: keras,onnx,pytorch,pytorch_models,pytorch_datasets,tensorflow_v1,tensorflow_v1_datasets +BUILD_ARGS := # set nightly to build nightly release +TARGETS := "" # targets for running pytests: keras,onnx,pytorch,pytorch_models,pytorch_datasets,tensorflow_v1,tensorflow_v1_models,tensorflow_v1_datasets PYTEST_ARGS := "" ifneq ($(findstring keras,$(TARGETS)),keras) PYTEST_ARGS := $(PYTEST_ARGS) --ignore tests/sparseml/keras @@ -63,7 +64,7 @@ docs: # creates wheel file build: - python3 setup.py sdist bdist_wheel + python3 setup.py sdist bdist_wheel $(BUILD_ARGS) # clean package clean: diff --git a/README.md b/README.md index 1fa10bf2177..811ee053775 100644 --- a/README.md +++ b/README.md @@ -16,11 +16,11 @@ limitations under the License. # ![icon for SparseMl](https://github.com/raw/neuralmagic/sparseml/main/docs/source/icon-sparseml.png) SparseML -### Libraries for state-of-the-art deep neural network optimization algorithms, enabling simple pipelines integration with a few lines of code +### Libraries for applying sparsification recipes to neural networks with a few lines of code, enabling faster and smaller models

- GitHub + GitHub Documentation @@ -44,25 +44,37 @@ limitations under the License.
## Overview
-SparseML is a toolkit that includes APIs, CLIs, scripts and libraries that apply state-of-the-art optimization algorithms such as [pruning](https://neuralmagic.com/blog/pruning-overview/) and [quantization](https://arxiv.org/abs/1609.07061) to any neural network. General, recipe-driven approaches built around these optimizations enable the simplification of creating faster and smaller models for the ML performance community at large.
+SparseML is a toolkit that includes APIs, CLIs, scripts, and libraries that apply state-of-the-art sparsification algorithms such as pruning and quantization to any neural network.
+General, recipe-driven approaches built around these algorithms simplify the creation of faster and smaller models for the ML performance community at large.
-SparseML is integrated for easy model optimizations within the [PyTorch](https://pytorch.org/),
-[Keras](https://keras.io/), and [TensorFlow V1](http://tensorflow.org/) ecosystems currently.
+This repository contains integrations within the [PyTorch](https://pytorch.org/), [Keras](https://keras.io/), and [TensorFlow V1](http://tensorflow.org/) ecosystems, allowing for seamless model sparsification.
-### Related Products
+## Sparsification
-- [DeepSparse](https://github.com/neuralmagic/deepsparse): CPU inference engine that delivers unprecedented performance for sparse models
-- [SparseZoo](https://github.com/neuralmagic/sparsezoo): Neural network model repository for highly sparse models and optimization recipes
-- [Sparsify](https://github.com/neuralmagic/sparsify): Easy-to-use autoML interface to optimize deep neural networks for better inference performance and a smaller footprint
+Sparsification is the process of taking a trained deep learning model and removing redundant information from the over-precise and over-parameterized network, resulting in a faster and smaller model.
+Techniques for sparsification are all-encompassing, including everything from inducing sparsity using [pruning](https://neuralmagic.com/blog/pruning-overview/) and [quantization](https://arxiv.org/abs/1609.07061) to enabling naturally occurring sparsity using [activation sparsity](http://proceedings.mlr.press/v119/kurtz20a.html) or [winograd/FFT](https://arxiv.org/abs/1509.09308).
+When implemented correctly, these techniques result in significantly more performant and smaller models with little to no effect on the baseline metrics.
+For example, pruning plus quantization can give over [7x improvements in performance](https://neuralmagic.com/blog/benchmark-resnet50-with-deepsparse) while recovering to nearly the same baseline accuracy.
+
+The Deep Sparse product suite builds on top of sparsification, enabling you to easily apply these techniques to your datasets and models using recipe-driven approaches.
+Recipes encode the directions for how to sparsify a model into a simple, easily editable format; a minimal code sketch of this flow follows the list below.
+- Download a sparsification recipe and sparsified model from the [SparseZoo](https://github.com/neuralmagic/sparsezoo).
+- Alternatively, create a recipe for your model using [Sparsify](https://github.com/neuralmagic/sparsify).
+- Apply your recipe with only a few lines of code using [SparseML](https://github.com/neuralmagic/sparseml).
+- Finally, for GPU-level performance on CPUs, deploy your sparse-quantized model with the [DeepSparse Engine](https://github.com/neuralmagic/deepsparse).
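The flow above can be sketched in a few lines of PyTorch. This is a minimal illustration only, assuming the `Zoo.download_recipe_from_stub`, `ScheduledModifierManager`, and `ScheduledOptimizer` APIs used later in this diff; the SparseZoo stub is a placeholder, and `model`, `optimizer`, and `num_train_batches` come from your own training setup.

```python
# Minimal sketch of the recipe-driven flow above; the stub is illustrative only,
# and model/optimizer/num_train_batches come from your existing training code.
from sparsezoo import Zoo
from sparseml.pytorch.optim import ScheduledModifierManager, ScheduledOptimizer

# 1. download a sparsification recipe from the SparseZoo (placeholder stub)
recipe_path = Zoo.download_recipe_from_stub("zoo:path/to/a/sparsified/model")

# 2. wrap your existing optimizer so the recipe's modifiers drive training
manager = ScheduledModifierManager.from_yaml(recipe_path)
optimizer = ScheduledOptimizer(
    optimizer, model, manager, steps_per_epoch=num_train_batches
)

# 3. train as usual; each optimizer.step() applies the scheduled modifications
```

After training, the model can be exported to ONNX and deployed with the DeepSparse Engine for the CPU speedups described above.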
+ + +**Full Deep Sparse product flow:** + + ## Quick Tour -To enable flexibility, ease of use, and repeatability, optimizing a model is generally done using a recipe file. -The files encode the instructions needed for modifying the model and/or training process as a list of modifiers. +To enable flexibility, ease of use, and repeatability, sparsifying a model is generally done using a recipe. +The recipes encode the instructions needed for modifying the model and/or training process as a list of modifiers. Example modifiers can be anything from setting the learning rate for the optimizer to gradual magnitude pruning. The files are written in [YAML](https://yaml.org/) and stored in YAML or [markdown](https://www.markdownguide.org/) files using [YAML front matter](https://assemble.io/docs/YAML-front-matter.html). -The rest of the SparseML system is coded to parse the recipe files into a native format for the desired framework -and apply the modifications to the model and training pipeline. +The rest of the SparseML system is coded to parse the recipes into a native format for the desired framework and apply the modifications to the model and training pipeline. A sample recipe for pruning a model generally looks like the following: @@ -91,18 +103,21 @@ modifiers: params: ['sections.0.0.conv1.weight', 'sections.0.0.conv2.weight', 'sections.0.0.conv3.weight'] ``` -More information on the available recipes, formats, and arguments can be found [here](https://github.com/neuralmagic/sparseml/blob/main/docs/optimization-recipes.md). Additionally, all code implementations of the modifiers under the `optim` packages for the frameworks are documented with example YAML formats. +More information on the available recipes, formats, and arguments can be found [here](https://github.com/neuralmagic/sparseml/blob/main/docs/source/recipes.md). Additionally, all code implementations of the modifiers under the `optim` packages for the frameworks are documented with example YAML formats. Pre-configured recipes and the resulting models can be explored and downloaded from the [SparseZoo](https://github.com/neuralmagic/sparsezoo). Also, [Sparsify](https://github.com/neuralmagic/sparsify) enables autoML style creation of optimization recipes for use with SparseML. For a more in-depth read, check out [SparseML documentation](https://docs.neuralmagic.com/sparseml/). -### PyTorch Optimization +### PyTorch Sparsification -The PyTorch optimization libraries are located under the `sparseml.pytorch.optim` package. -Inside are APIs designed to make model optimization as easy as possible by integrating seamlessly into PyTorch training pipelines. +The PyTorch sparsification libraries are located under the `sparseml.pytorch.optim` package. +Inside are APIs designed to make model sparsification as easy as possible by integrating seamlessly into PyTorch training pipelines. -The integration is done using the `ScheduledOptimizer` class. It is intended to wrap your current optimizer and its step function. The step function then calls into the `ScheduledModifierManager` class which can be created from a recipe file. With this setup, the training process can then be modified as desired to optimize the model. +The integration is done using the `ScheduledOptimizer` class. +It is intended to wrap your current optimizer and its step function. +The step function then calls into the `ScheduledModifierManager` class which can be created from a recipe file. 
+With this setup, the training process can then be modified as desired to sparsify the model. To enable all of this, the integration code you'll need to write is only a handful of lines: @@ -121,11 +136,11 @@ optimizer = ScheduledOptimizer(optimizer, model, manager, steps_per_epoch=num_tr ### Keras Optimization -The Keras optimization libraries are located under the `sparseml.keras.optim` package. -Inside are APIs designed to make model optimization as easy as possible by integrating seamlessly into Keras training pipelines. +The Keras sparsification libraries are located under the `sparseml.keras.optim` package. +Inside are APIs designed to make model sparsification as easy as possible by integrating seamlessly into Keras training pipelines. The integration is done using the `ScheduledModifierManager` class which can be created from a recipe file. -This class handles modifying the Keras objects for the desired optimizations using the `modify` method. +This class handles modifying the Keras objects for the desired algorithms using the `modify` method. The edited model, optimizer, and any callbacks necessary to modify the training process are returned. The model and optimizer can be used normally and the callbacks must be passed into the `fit` or `fit_generator` function. If using `train_on_batch`, the callbacks must be invoked after each call. @@ -155,13 +170,14 @@ model.fit(..., callbacks=callbacks) save_model = manager.finalize(model) ``` -### TensorFlow V1 Optimization +### TensorFlow V1 Sparsification -The TensorFlow optimization libraries for TensorFlow version 1.X are located under the `sparseml.tensorflow_v1.optim` package. Inside are APIs designed to make model optimization as easy as possible by integrating seamlessly into TensorFlow V1 training pipelines. +The TensorFlow sparsification libraries for TensorFlow version 1.X are located under the `sparseml.tensorflow_v1.optim` package. +Inside are APIs designed to make model sparsification as easy as possible by integrating seamlessly into TensorFlow V1 training pipelines. The integration is done using the `ScheduledModifierManager` class which can be created from a recipe file. -This class handles modifying the TensorFlow graph for the desired optimizations. -With this setup, the training process can then be modified as desired to optimize the model. +This class handles modifying the TensorFlow graph for the desired algorithms. +With this setup, the training process can then be modified as desired to sparsify the model. #### Estimator-Based pipelines @@ -185,7 +201,7 @@ manager.modify_estimator(estimator, steps_per_epoch=num_train_batches) Session-based pipelines need a little bit more as compared to estimator-based pipelines; however, it is still designed to require only a few lines of code for integration. After graph creation, the manager's `create_ops` method must be called. -This will modify the graph as needed for the optimizations and return modifying ops and extras. +This will modify the graph as needed for the algorithms and return modifying ops and extras. After creating the session and training normally, call into `session.run` with the modifying ops after each step. Modifying extras contain objects such as tensorboard summaries of the modifiers to be used if desired. Finally, once completed, `complete_graph` must be called to remove the modifying ops for saving and export. 
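The session-based flow just described can be sketched as follows. This is a rough sketch built from the `create_ops` and `complete_graph` calls named above; exact signatures may vary by version, and the graph construction, recipe path, and training step are placeholders for your own code.

```python
# Rough sketch of a session-based TensorFlow V1 integration, assuming the
# create_ops / complete_graph APIs described above; all training code is yours.
from sparseml.tensorflow_v1.optim import ScheduledModifierManager
from sparseml.tensorflow_v1.utils import tf_compat

with tf_compat.Graph().as_default() as graph:
    # ... build your model and training ops as usual ...
    manager = ScheduledModifierManager.from_yaml("/PATH/TO/recipe.yaml")
    mod_ops, mod_extras = manager.create_ops(steps_per_epoch=num_train_batches)

    with tf_compat.Session() as sess:
        sess.run(tf_compat.global_variables_initializer())
        for step in range(num_steps):
            # ... run your normal training step here ...
            sess.run(mod_ops)  # apply the scheduled modifications after each step
        manager.complete_graph()  # strip modifying ops before saving/exporting
```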
@@ -289,7 +305,7 @@ Install with pip using: pip install sparseml ```
-Then if you would like to explore any of the [scripts](https://github.com/neuralmagic/sparseml/blob/main/scripts/), [notebooks](https://github.com/neuralmagic/sparseml/blob/main/notebooks/), or [examples](https://github.com/neuralmagic/sparseml/blob/main/examples/)
+Then if you would like to explore any of the [scripts](https://github.com/neuralmagic/sparseml/blob/main/scripts/), [notebooks](https://github.com/neuralmagic/sparseml/blob/main/notebooks/), or [integrations](https://github.com/neuralmagic/sparseml/blob/main/integrations/),
clone the repository and install any additional dependencies as required.
#### Supported Framework Versions @@ -343,7 +359,7 @@ Note, TensorFlow V1 is no longer being built for newer operating systems such as
## Contributing
-We appreciate contributions to the code, examples, and documentation as well as bug reports and feature requests! [Learn how here](https://github.com/neuralmagic/sparseml/blob/main/CONTRIBUTING.md).
+We appreciate contributions to the code, examples, integrations, and documentation as well as bug reports and feature requests! [Learn how here](https://github.com/neuralmagic/sparseml/blob/main/CONTRIBUTING.md).
## Join the Community
diff --git a/docs/source/conf.py b/docs/source/conf.py index 35571c521fc..8f788e42b74 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -86,6 +86,11 @@
html_theme = "sphinx_rtd_theme" html_logo = "icon-sparseml.png"
+html_theme_options = {
+    'analytics_id': 'UA-128364174-1',  # Provided by Google in your dashboard
+    'analytics_anonymize_ip': False,
+}
+
# Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css".
diff --git a/docs/source/index.rst b/docs/source/index.rst index eaf67c0815f..80e266e9277 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -17,8 +17,7 @@
SparseML |version| ===================
-Libraries for state-of-the-art deep neural network optimization algorithms,
-enabling simple pipelines integration with a few lines of code
+Libraries for applying sparsification recipes to neural networks with a few lines of code, enabling faster and smaller models
.. raw:: html @@ -49,45 +48,50 @@ enabling simple pipelines integration with a few lines of code
Overview ========
-SparseML is a toolkit that includes APIs, CLIs, scripts and libraries that apply state-of-the-art optimization
-algorithms such as `pruning `_ and
-`quantization `_ to any neural network.
-General, recipe-driven approaches built around these optimizations enable the simplification of creating faster
-and smaller models for the ML performance community at large.
+SparseML is a toolkit that includes APIs, CLIs, scripts, and libraries that apply state-of-the-art sparsification algorithms such as pruning and quantization to any neural network.
+General, recipe-driven approaches built around these algorithms simplify the creation of faster and smaller models for the ML performance community at large.
-SparseML is integrated for easy model optimizations within the `PyTorch `_,
-`Keras `_, and `TensorFlow V1 `_ ecosystems currently.
+`This repository `_ contains integrations within the `PyTorch `_, `Keras `_, and `TensorFlow V1 `_ ecosystems, allowing for seamless model sparsification.
-Related Products
-================
+Sparsification
+==============
-- `DeepSparse `_:
- CPU inference engine that delivers unprecedented performance for sparse models
-- `SparseZoo `_:
- Neural network model repository for highly sparse models and optimization recipes
-- `Sparsify `_:
- Easy-to-use autoML interface to optimize deep neural networks for
- better inference performance and a smaller footprint
+Sparsification is the process of taking a trained deep learning model and removing redundant information from the over-precise and over-parameterized network, resulting in a faster and smaller model.
+Techniques for sparsification are all-encompassing, including everything from inducing sparsity using `pruning `_ and `quantization `_ to enabling naturally occurring sparsity using `activation sparsity `_ or `winograd/FFT `_.
+When implemented correctly, these techniques result in significantly more performant and smaller models with little to no effect on the baseline metrics.
+For example, pruning plus quantization can give over `7x improvements in performance `_ while recovering to nearly the same baseline accuracy.
+
+The Deep Sparse product suite builds on top of sparsification, enabling you to easily apply these techniques to your datasets and models using recipe-driven approaches.
+Recipes encode the directions for how to sparsify a model into a simple, easily editable format.
+- Download a sparsification recipe and sparsified model from the `SparseZoo `_.
+- Alternatively, create a recipe for your model using `Sparsify `_.
+- Apply your recipe with only a few lines of code using `SparseML `_.
+- Finally, for GPU-level performance on CPUs, deploy your sparse-quantized model with the `DeepSparse Engine `_.
+
+
+**Full Deep Sparse product flow:**
+
+
Resources and Learning More ===========================
-- `SparseZoo Documentation `_
-- `Sparsify Documentation `_
-- `DeepSparse Documentation `_
-- `Neural Magic Blog `_,
- `Resources `_,
- `Website `_
+- `SparseZoo Documentation `_
+- `Sparsify Documentation `_
+- `DeepSparse Documentation `_
+- `Neural Magic Blog `_,
+ `Resources `_,
+ `Website `_
Release History ===============
Official builds are hosted on PyPi
-- stable: `sparseml `_
-- nightly (dev): `sparseml-nightly `_
+- stable: `sparseml `_
+- nightly (dev): `sparseml-nightly `_
Additionally, more information can be found via
-`GitHub Releases `_.
+`GitHub Releases `_.
.. toctree:: :maxdepth: 3 @@ -104,9 +108,9 @@ Additionally, more information can be found via
api/sparseml
.. toctree::
- :maxdepth: 2
- :caption: Help and Support
+ :maxdepth: 3
+ :caption: Help
- `Bugs, Feature Requests `_
- `Support, General Q&A `_
- \ No newline at end of file
+ Bugs, Feature Requests
+ Support, General Q&A
+ Neural Magic Docs
diff --git a/docs/source/installation.md b/docs/source/installation.md index 397046cf16e..5016202241b 100644 --- a/docs/source/installation.md +++ b/docs/source/installation.md @@ -25,4 +25,4 @@ Install with pip using: ``` pip install sparseml ```
-Then if you would like to explore any of the [scripts](https://github.com/neuralmagic/sparseml/tree/main/scripts), [notebooks](https://github.com/neuralmagic/sparseml/tree/main/notebooks), or [examples](https://github.com/neuralmagic/sparseml/tree/main/examples) clone the repository and install any additional dependencies as required.
+Then if you would like to explore any of the [scripts](https://github.com/neuralmagic/sparseml/tree/main/scripts), [notebooks](https://github.com/neuralmagic/sparseml/tree/main/notebooks), or [integrations](https://github.com/neuralmagic/sparseml/tree/main/integrations), clone the repository and install any additional dependencies as required.
diff --git a/docs/source/quicktour.md b/docs/source/quicktour.md index 380fc0cf803..eaf4bdfc9c0 100644 --- a/docs/source/quicktour.md +++ b/docs/source/quicktour.md @@ -16,12 +16,11 @@ limitations under the License.
## Quick Tour
-To enable flexibility, ease of use, and repeatability, optimizing a model is generally done using a recipe file.
-The files encode the instructions needed for modifying the model and/or training process as a list of modifiers.
+To enable flexibility, ease of use, and repeatability, sparsifying a model is generally done using a recipe.
+The recipes encode the instructions needed for modifying the model and/or training process as a list of modifiers.
Example modifiers can be anything from setting the learning rate for the optimizer to gradual magnitude pruning.
The files are written in [YAML](https://yaml.org/) and stored in YAML or [markdown](https://www.markdownguide.org/) files using [YAML front matter](https://assemble.io/docs/YAML-front-matter.html).
-The rest of the SparseML system is coded to parse the recipe files into a native format for the desired framework
-and apply the modifications to the model and training pipeline.
+The rest of the SparseML system is coded to parse the recipes into a native format for the desired framework and apply the modifications to the model and training pipeline.
A sample recipe for pruning a model generally looks like the following: @@ -56,12 +55,15 @@ Pre-configured recipes and the resulting models can be explored and downloaded f
For a more in-depth read, check out [SparseML documentation](https://docs.neuralmagic.com/sparseml/).
-### PyTorch Optimization
+### PyTorch Sparsification
-The PyTorch optimization libraries are located under the `sparseml.pytorch.optim` package.
-Inside are APIs designed to make model optimization as easy as possible by integrating seamlessly into PyTorch training pipelines.
+The PyTorch sparsification libraries are located under the `sparseml.pytorch.optim` package.
+Inside are APIs designed to make model sparsification as easy as possible by integrating seamlessly into PyTorch training pipelines.
-The integration is done using the `ScheduledOptimizer` class. It is intended to wrap your current optimizer and its step function. The step function then calls into the `ScheduledModifierManager` class which can be created from a recipe file. With this setup, the training process can then be modified as desired to optimize the model.
+The integration is done using the `ScheduledOptimizer` class.
+It is intended to wrap your current optimizer and its step function.
+The step function then calls into the `ScheduledModifierManager` class which can be created from a recipe file.
+With this setup, the training process can then be modified as desired to sparsify the model.
To enable all of this, the integration code you'll need to write is only a handful of lines: @@ -80,11 +82,11 @@ optimizer = ScheduledOptimizer(optimizer, model, manager, steps_per_epoch=num_tr
### Keras Optimization
-The Keras optimization libraries are located under the `sparseml.keras.optim` package.
-Inside are APIs designed to make model optimization as easy as possible by integrating seamlessly into Keras training pipelines. +The Keras sparsification libraries are located under the `sparseml.keras.optim` package. +Inside are APIs designed to make model sparsification as easy as possible by integrating seamlessly into Keras training pipelines. The integration is done using the `ScheduledModifierManager` class which can be created from a recipe file. -This class handles modifying the Keras objects for the desired optimizations using the `modify` method. +This class handles modifying the Keras objects for the desired algorithms using the `modify` method. The edited model, optimizer, and any callbacks necessary to modify the training process are returned. The model and optimizer can be used normally and the callbacks must be passed into the `fit` or `fit_generator` function. If using `train_on_batch`, the callbacks must be invoked after each call. @@ -114,15 +116,16 @@ model.fit(..., callbacks=callbacks) save_model = manager.finalize(model) ``` -### TensorFlow V1 Optimization +### TensorFlow V1 Sparsification -The TensorFlow optimization libraries for TensorFlow version 1.X are located under the `sparseml.tensorflow_v1.optim` package. Inside are APIs designed to make model optimization as easy as possible by integrating seamlessly into TensorFlow V1 training pipelines. +The TensorFlow sparsification libraries for TensorFlow version 1.X are located under the `sparseml.tensorflow_v1.optim` package. +Inside are APIs designed to make model sparsification as easy as possible by integrating seamlessly into TensorFlow V1 training pipelines. The integration is done using the `ScheduledModifierManager` class which can be created from a recipe file. -This class handles modifying the TensorFlow graph for the desired optimizations. -With this setup, the training process can then be modified as desired to optimize the model. +This class handles modifying the TensorFlow graph for the desired algorithms. +With this setup, the training process can then be modified as desired to sparsify the model. -#### Estimator-based pipelines +#### Estimator-Based pipelines Estimator-based pipelines are simpler to integrate with as compared to session-based pipelines. The `ScheduledModifierManager` can override the necessary callbacks in the estimator to modify the graph using the `modify_estimator` function. @@ -139,12 +142,12 @@ manager.modify_estimator(estimator, steps_per_epoch=num_train_batches) # Normal estimator training code... ``` -#### Session-based pipelines +#### Session-Based pipelines Session-based pipelines need a little bit more as compared to estimator-based pipelines; however, it is still designed to require only a few lines of code for integration. After graph creation, the manager's `create_ops` method must be called. -This will modify the graph as needed for the optimizations and return modifying ops and extras. +This will modify the graph as needed for the algorithms and return modifying ops and extras. After creating the session and training normally, call into `session.run` with the modifying ops after each step. Modifying extras contain objects such as tensorboard summaries of the modifiers to be used if desired. Finally, once completed, `complete_graph` must be called to remove the modifying ops for saving and export. 
@@ -235,3 +238,4 @@ with tf_compat.Graph().as_default() as graph: exporter.export_pb(outputs=[logits]) exporter.export_onnx(inputs=input_names, outputs=output_names) +``` diff --git a/docs/source/recipes.md b/docs/source/recipes.md index 251ef423ddc..b47e16fef11 100644 --- a/docs/source/recipes.md +++ b/docs/source/recipes.md @@ -14,9 +14,9 @@ See the License for the specific language governing permissions and limitations under the License. --> -# Optimization Recipes +# Sparsification Recipes -All optimization APIs are designed to work with recipe files. +All SparseML Sparsification APIs are designed to work with recipes. The files encode the instructions needed for modifying the model and/or training process as a list of modifiers. Example modifiers can be anything from setting the learning rate for the optimizer to gradual magnitude pruning. The files are written in [YAML](https://yaml.org/) and stored in YAML or @@ -25,12 +25,12 @@ The files are written in [YAML](https://yaml.org/) and stored in YAML or The rest of the SparseML system is coded to parse the recipe files into a native format for the desired framework and apply the modifications to the model and training pipeline. -The easiest ways to get or create optimization recipes are by either using -the pre-configured recipes in [SparseZoo](https://github.com/neuralmagic/sparsezoo) or -using [Sparsify's](https://github.com/neuralmagic/sparsify) autoML style creation. +In a recipe, modifiers must be written in a list that includes "modifiers" in its name. + +The easiest ways to get or create recipes are by either using the pre-configured recipes in [SparseZoo](https://github.com/neuralmagic/sparsezoo) or using [Sparsify's](https://github.com/neuralmagic/sparsify) automatic creation. However, power users may be inclined to create their own recipes by hand to enable more -fine grained control or to add in custom modifiers. +fine-grained control or to add in custom modifiers. A sample recipe for pruning a model generally looks like the following: ```yaml @@ -183,7 +183,7 @@ Notes: the script `scripts/pytorch/model_quantize_qat_export.py` or the function `neuralmagicML.pytorch.quantization.quantize_qat_export`. - If performing QAT on a sparse model, you must preserve sparsity during QAT by - applying a `ConstantKSModifier` or have already used a `GradualKSModifier` with + applying a `ConstantPruningModifier` or have already used a `GMPruningModifier` with `leave_enabled` set to True. Required Parameters: diff --git a/examples/README.md b/examples/README.md index c9d32b5a8f7..cabf0db25c2 100644 --- a/examples/README.md +++ b/examples/README.md @@ -16,7 +16,7 @@ limitations under the License. # Examples -This directory contains self-documented examples of SparseML workflows and integrations -with other libraries. Open a Pull Request to +This directory contains self-documented examples of end-to-end workflows using SparseML +and its companion libraries. Open a Pull Request to [contribute](https://github.com/neuralmagic/sparseml/blob/main/CONTRIBUTING.md) -your own. \ No newline at end of file +your own. diff --git a/examples/pytorch_sparse_quantized_transfer_learning/README.md b/examples/pytorch_sparse_quantized_transfer_learning/README.md new file mode 100644 index 00000000000..b426a19686d --- /dev/null +++ b/examples/pytorch_sparse_quantized_transfer_learning/README.md @@ -0,0 +1,115 @@ + + +# PyTorch Sparse-Quantized Transfer Learning with SparseML + +

+ +
+
## Overview
[Pruning](https://neuralmagic.com/blog/pruning-overview/) and
+[quantization](https://arxiv.org/abs/1609.07061) are well-established methods for accelerating
+neural networks. Individually, both methods yield significant speedups for CPU inference
+(a theoretical maximum of 4x for INT8 quantization) and can make CPU deployments an attractive
+option for real-time model inference.
+
+Sparse-quantized models leverage both techniques and
+[can achieve speedups upwards of 7x](https://neuralmagic.com/blog/benchmark-resnet50-with-deepsparse)
+when using the [DeepSparse Engine](https://github.com/neuralmagic/deepsparse) with
+[compatible hardware](https://docs.neuralmagic.com/deepsparse/hardware.html).
+
+Using powerful [SparseML](https://github.com/neuralmagic/sparseml) recipes, it is easy to create sparse-quantized models.
+Additionally, the SparseML team is actively creating pre-trained sparse-quantized models that maintain accuracy
+targets and achieve high CPU speedups. It is easy to leverage these models for speedups with your own datasets
+using sparse-quantized transfer learning.
+
+## Sparse-Quantized Transfer Learning
+
+[Transfer learning](https://en.wikipedia.org/wiki/Transfer_learning) is a technique that
+involves retraining a pre-trained model to learn a new task, with the benefit of starting
+from the pre-trained model's already learned behavior. Sparse-quantized transfer learning takes the
+additional step of reusing both the pre-trained weights and the pre-trained sparse model
+structure of an existing sparse-quantized model to train it on a new ML task.
+
+This technique allows engineers and researchers to create sparse-quantized optimizations
+for one model and then easily re-apply them to accelerate many tasks.
+
+Sparse-quantized transfer learning takes place in two phases:
+1. Sparse transfer learning \- fine-tuning the pre-trained model with the new dataset
+while maintaining the existing pre-optimized sparsity structure. This creates a model
+that learns to predict the new task while preserving the predetermined optimized structure
+from pruning.
+2. [Quantization-aware training](https://pytorch.org/blog/introduction-to-quantization-on-pytorch/#quantization-aware-training)
+\- emulating the effects of INT8 quantization while training the model to overcome the loss of precision.
+
+
+## ResNet-50 Imagenette Example
+
+The [SparseZoo](https://github.com/neuralmagic/sparsezoo) hosts a sparse-quantized ResNet-50 model trained
+on the ImageNet dataset. It maintains 99% of the baseline accuracy and can achieve over 6.5x
+speedup using the DeepSparse Engine. There are multiple paths to explore sparse-quantized
+transfer learning with this model.
+
+### Notebook
+`sparseml/examples/pytorch_sparse_quantized_transfer_learning/pytorch_sparse_quantized_transfer_learning.ipynb`
+is a Jupyter Notebook that provides a step-by-step walk-through for
+ - setting up sparse-quantized transfer learning
+ - integrating SparseML with any PyTorch training flow
+ - ONNX export
+ - benchmarking with the DeepSparse Engine
+
+Run `jupyter notebook` and navigate to this notebook file to run the example.
+
+### Script
+`sparseml/scripts/pytorch_vision.py` is a script for running tasks related to pruning and
+quantization with SparseML for image classification and object detection use cases.
+Using the following example command, you can run sparse-quantized transfer learning on a custom
+[ImageFolder](https://pytorch.org/vision/0.8/datasets.html#imagefolder)-based
+classification dataset.
+
+Note that for datasets other than Imagenette, you may need to edit
+the recipe to better fit the dataset, following the instructions in the downloaded recipe card.
+
+```
+python scripts/pytorch_vision.py train \
+    --recipe-path zoo:cv/classification/resnet_v1-50/pytorch/sparseml/imagenet/pruned_quant-moderate?recipe_type=transfer_learn \
+    --checkpoint-path zoo \
+    --arch-key resnet50 \
+    --model-kwargs '{"ignore_error_tensors": ["classifier.fc.weight", "classifier.fc.bias"]}' \
+    --dataset imagefolder \
+    --dataset-path /PATH/TO/IMAGEFOLDER/DATASET \
+    --train-batch-size 32 --test-batch-size 64 \
+    --loader-num-workers 8 \
+    --optim Adam \
+    --optim-args '{}' \
+    --model-tag resnet50-imagenette-pruned_quant-transfer_learned
+```
+
+
+### Further Reading
+To learn more about this sparse-quantized ResNet-50 model's benchmarks, check out
+[this blog post](https://neuralmagic.com/blog/benchmark-resnet50-with-deepsparse).
+
+For more information on creating sparse and quantized models, check out our
+[documentation](https://docs.neuralmagic.com/sparseml/).
+
+SparseML and its companion packages are open-source and constantly improving.
+Keep an eye out for new models, algorithms, and faster speeds.
+If you have any questions, extensions, or use cases, feel free to
+[contribute](https://github.com/neuralmagic/sparseml/blob/main/CONTRIBUTING.md),
+[open an issue](https://github.com/neuralmagic/sparseml/issues),
+or [contact us](https://neuralmagic.com/contact/). \ No newline at end of file
diff --git a/examples/pytorch_sparse_quantized_transfer_learning/pytorch_sparse_quantized_transfer_learning.ipynb b/examples/pytorch_sparse_quantized_transfer_learning/pytorch_sparse_quantized_transfer_learning.ipynb new file mode 100644 index 00000000000..d748e0936a7 --- /dev/null +++ b/examples/pytorch_sparse_quantized_transfer_learning/pytorch_sparse_quantized_transfer_learning.ipynb @@ -0,0 +1,462 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "© 2021-present Neuralmagic, Inc. // [Neural Magic Legal](https://neuralmagic.com/legal) \n",
+ "\n",
+ "# Sparse-Quantized Transfer Learning in PyTorch using SparseML\n",
+ "\n",
+ "This notebook provides a step-by-step walkthrough for creating a performant sparse-quantized model\n",
+ "by transfer learning the pruned structure from an already sparse-quantized model.\n",
+ "\n",
+ "Sparse-quantized models combine [pruning](https://neuralmagic.com/blog/pruning-overview/) and [quantization](https://arxiv.org/abs/1609.07061) to reduce both the number of parameters and the precision of the remaining parameters to significantly increase the performance of neural networks. Using these optimizations, your model will obtain significantly better (around 7x vs. unoptimized) performance at inference time using the [DeepSparse Engine](https://github.com/neuralmagic/deepsparse).\n",
+ "\n",
+ "Sparse-quantized transfer learning takes two steps.
[SparseML](https://github.com/neuralmagic/sparseml) recipes make it easy to perform these optimizations:\n",
+ "- First, fine-tune a pre-trained sparse model for the transfer dataset while maintaining the pre-trained sparsity structure.\n",
+ "- Second, perform [quantization-aware training (QAT)](https://pytorch.org/blog/introduction-to-quantization-on-pytorch/#quantization-aware-training) to quantize the now sparse model while still holding the same sparsity structure.\n",
+ "\n",
+ "In this notebook, you will:\n",
+ "- Set up the model and dataset\n",
+ "- Define a generic PyTorch training flow\n",
+ "- Integrate the PyTorch flow with SparseML for transfer learning\n",
+ "- Perform sparse transfer learning and quantization-aware training using the PyTorch and SparseML flow\n",
+ "- Export to [ONNX](https://onnx.ai/) and convert the model from a QAT graph to fully quantized operators\n",
+ "- Compare DeepSparse Engine benchmarks of the final sparse-quantized model to an unoptimized model\n",
+ "\n",
+ "Reading through this notebook is a reasonably quick way to gain an intuition for how to plug SparseML into your PyTorch training flow, both for transfer learning and in general. Rough time estimates for a full run with the default model are given. Note that training with the PyTorch CPU implementation will be much slower than on a GPU:\n",
+ "- 30 minutes on a GPU\n",
+ "- 90 minutes on a laptop CPU"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Step 1 - Requirements\n",
+ "\n",
+ "To run this notebook, you will need the following packages already installed:\n",
+ "* SparseML, SparseZoo\n",
+ "* PyTorch (>= 1.7.0) and torchvision\n",
+ "* DeepSparse (can be installed with `pip install deepsparse` if not already)\n",
+ "\n",
+ "You can install any package that is not already present via `pip`."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import deepsparse\n",
+ "import sparseml\n",
+ "import sparsezoo\n",
+ "import torch\n",
+ "import torchvision\n",
+ "\n",
+ "assert torch.__version__ >= \"1.7\""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Step 2 - Setting Up the Model and Dataset\n",
+ "\n",
+ "By default, you will transfer learn from a sparse-quantized [ResNet-50](https://arxiv.org/abs/1512.03385) model trained on the [ImageNet dataset](http://www.image-net.org/) to the much smaller [Imagenette dataset](https://github.com/fastai/imagenette). The transfer learning weights are downloaded from the [SparseZoo](https://github.com/neuralmagic/sparsezoo) model repository. The Imagenette dataset is downloaded from its repository via a helper class from SparseML.\n",
+ "\n",
+ "When loading weights for transfer learning classification models, it is standard to override the final classifier layer to fit the output shape for the new dataset. In the example below, this is done by specifying `ignore_error_tensors` as the weights that will be initialized for the new model. In other flows, this could be accomplished by setting `model.classifier.fc = torch.nn.Linear(...)`."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sparseml.pytorch.models import ModelRegistry\n", + "from sparseml.pytorch.datasets import ImagenetteDataset, ImagenetteSize\n", + "from sparsezoo import Zoo\n", + "\n", + "#######################################################\n", + "# Define your model below\n", + "#######################################################\n", + "print(\"loading model...\")\n", + "# SparseZoo stub to pre-trained sparse-quantized ResNet-50 for imagenet dataset\n", + "zoo_stub_path = (\n", + " \"zoo:cv/classification/resnet_v1-50/pytorch/sparseml/imagenet/pruned_quant-moderate\"\n", + " \"?recipe_type=transfer_learn\"\n", + ")\n", + "model = ModelRegistry.create(\n", + " key=\"resnet50\",\n", + " pretrained_path=zoo_stub_path,\n", + " pretrained_dataset=\"imagenette\",\n", + " num_classes=10,\n", + " ignore_error_tensors=[\"classifier.fc.weight\", \"classifier.fc.bias\"],\n", + ")\n", + "input_shape = ModelRegistry.input_shape(\"resnet50\")\n", + "input_size = input_shape[-1]\n", + "print(model)\n", + "#######################################################\n", + "# Define your train and validation datasets below\n", + "#######################################################\n", + "\n", + "print(\"\\nloading train dataset...\")\n", + "train_dataset = ImagenetteDataset(\n", + " train=True, dataset_size=ImagenetteSize.s320, image_size=input_size\n", + ")\n", + "print(train_dataset)\n", + "\n", + "print(\"\\nloading val dataset...\")\n", + "val_dataset = ImagenetteDataset(\n", + " train=False, dataset_size=ImagenetteSize.s320, image_size=input_size\n", + ")\n", + "print(val_dataset)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 3 - Creating a PyTorch Training Loop\n", + "SparseML can plug directly into your existing PyTorch training flow by overriding the Optimizer object. To demonstrate this, in the cell below, we define a simple PyTorch training loop adapted from [here](https://pytorch.org/tutorials/beginner/blitz/cifar10_tutorial.html). To prune and quantize your existing models using SparseML, you can use your own training flow." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from tqdm.auto import tqdm\n", + "import math\n", + "import torch\n", + "\n", + "\n", + "def run_model_one_epoch(model, data_loader, criterion, device, train=False, optimizer=None):\n", + " if train:\n", + " model.train()\n", + " else:\n", + " model.eval()\n", + "\n", + " running_loss = 0.0\n", + " total_correct = 0\n", + " total_predictions = 0\n", + "\n", + " for step, (inputs, labels) in tqdm(enumerate(data_loader), total=len(data_loader)):\n", + " inputs = inputs.to(device)\n", + " labels = labels.to(device)\n", + "\n", + " if train:\n", + " optimizer.zero_grad()\n", + "\n", + " outputs, _ = model(inputs) # model returns logits and softmax as a tuple\n", + " loss = criterion(outputs, labels)\n", + "\n", + " if train:\n", + " loss.backward()\n", + " optimizer.step()\n", + "\n", + " running_loss += loss.item()\n", + "\n", + " predictions = outputs.argmax(dim=1)\n", + " total_correct += torch.sum(predictions == labels).item()\n", + " total_predictions += inputs.size(0)\n", + "\n", + " loss = running_loss / (step + 1.0)\n", + " accuracy = total_correct / total_predictions\n", + " return loss, accuracy" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 4 - Building PyTorch Training Objects\n", + "In this step, you will select hyperparameters, a device to train your model with, set up DataLoader objects, a loss function, and optimizer. All of these variables and objects can be replaced to fit your training flow." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from torch.utils.data import DataLoader\n", + "from torch.nn import CrossEntropyLoss\n", + "from torch.optim import Adam\n", + "\n", + "# hyperparameters\n", + "BATCH_SIZE = 32\n", + "\n", + "# setup device\n", + "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n", + "model.to(device)\n", + "print(f\"Using device: {device}\")\n", + "\n", + "# setup data loaders\n", + "train_loader = DataLoader(\n", + " train_dataset, BATCH_SIZE, shuffle=True, pin_memory=True, num_workers=8\n", + ")\n", + "val_loader = DataLoader(\n", + " val_dataset, BATCH_SIZE, shuffle=False, pin_memory=True, num_workers=8\n", + ")\n", + "\n", + "# setup loss function and optimizer, LR will be overriden by sparseml\n", + "criterion = CrossEntropyLoss()\n", + "optimizer = Adam(model.parameters(), lr=8e-3)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 5 - Running Sparse-Quantized Transfer Learning with a SparseML Recipe\n", + "\n", + "To run sparse-quantized transfer learning with SparseML, you will download a transfer learning recipe from SparseZoo and use it to create a `ScheduledModifierManager` object. This manager will be used to wrap the optimizer object to maintain the pre-optimized model's sparsity structure while learning weights for the new dataset as well as performing quantization-aware training (QAT).\n", + "\n", + "You can create SparseML recipes to perform various model pruning schedules, QAT, sparse transfer learning, and more. 
If you are using a different model than the default, you will have to modify the recipe file to match the new target's parameters.\n",
+ "\n",
+ "Finally, using the wrapped optimizer object, you will call the training function to run sparse-quantized transfer learning on your model.\n",
+ "\n",
+ "If the kernel shuts down during training, this may be an out-of-memory error; to resolve this, try lowering the `batch_size` in the cell above."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Downloading a Recipe from SparseZoo\n",
+ "The [SparseZoo](https://github.com/neuralmagic/sparsezoo) API provides preconfigured recipes for its optimized models. In the cell below, you will download the sparse-quantized transfer learning recipe for ResNet-50 and record its saved path."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sparsezoo import Zoo\n",
+ "\n",
+ "recipe_path = Zoo.download_recipe_from_stub(zoo_stub_path)\n",
+ "print(f\"Recipe downloaded to: {recipe_path}\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sparseml.pytorch.optim import ScheduledModifierManager, ScheduledOptimizer\n",
+ "\n",
+ "# create ScheduledModifierManager and Optimizer wrapper\n",
+ "manager = ScheduledModifierManager.from_yaml(recipe_path)\n",
+ "optimizer = ScheduledOptimizer(\n",
+ " optimizer,\n",
+ " model,\n",
+ " manager,\n",
+ " steps_per_epoch=len(train_loader),\n",
+ " loggers=[],\n",
+ ")\n",
+ "\n",
+ "\n",
+ "# Run sparse-quantized transfer learning\n",
+ "for epoch in range(manager.max_epochs):\n",
+ " # run training loop\n",
+ " epoch_name = f\"{epoch + 1}/{manager.max_epochs}\"\n",
+ " print(f\"Running Training Epoch {epoch_name}\")\n",
+ " train_loss, train_acc = run_model_one_epoch(\n",
+ " model, train_loader, criterion, device, train=True, optimizer=optimizer\n",
+ " )\n",
+ " print(\n",
+ " f\"Training Epoch: {epoch_name}\\nTraining Loss: {train_loss}\\nTop 1 Acc: {train_acc}\\n\"\n",
+ " )\n",
+ "\n",
+ " # run validation loop\n",
+ " print(f\"Running Validation Epoch {epoch_name}\")\n",
+ " val_loss, val_acc = run_model_one_epoch(model, val_loader, criterion, device)\n",
+ " print(\n",
+ " f\"Validation Epoch: {epoch_name}\\nVal Loss: {val_loss}\\nTop 1 Acc: {val_acc}\\n\"\n",
+ " )"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Step 6 - Viewing Model Sparsity\n",
+ "To see the effects of sparse-quantized transfer learning, in this step, you will print out the sparsities of each Conv and FC layer in your model."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sparseml.pytorch.utils import get_prunable_layers, tensor_sparsity\n",
+ "\n",
+ "# print sparsities of each layer\n",
+ "for (name, layer) in get_prunable_layers(model):\n",
+ " print(f\"{name}.weight: {tensor_sparsity(layer.weight).item():.4f}\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Step 7 - Exporting to ONNX\n",
+ "\n",
+ "Now that the sparse-quantized transfer learning is complete, it should be prepped for inference. A common next step for inference is exporting the model to ONNX. This is also the format used by the [DeepSparse Engine](https://github.com/neuralmagic/deepsparse) to achieve the sparse-quantized speedups.\n",
+ "\n",
+ "For PyTorch, exporting to ONNX is natively supported.
In the cell below, a convenience class, ModuleExporter(), is used to handle exporting.\n",
+ "\n",
+ "Additionally, PyTorch exports a graph set up for quantization-aware training (QAT) to ONNX. To run a fully quantized graph, you will need to convert these QAT operations to fully quantized INT8 operations. SparseML provides the `quantize_torch_qat_export` helper function to perform this conversion.\n",
+ "\n",
+ "Once the model is saved as an ONNX file, it is ready to be used for inference with the DeepSparse Engine.\n",
+ "\n",
+ "If exporting the model for inference in PyTorch only, the graph can be converted to a fully quantized one using `torch.quantization.convert`; however, the resulting model will not be compatible with ONNX conversion."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "from sparseml.pytorch.utils import ModuleExporter\n",
+ "from sparseml.pytorch.optim.quantization import quantize_torch_qat_export\n",
+ "\n",
+ "save_dir = \"pytorch_sparse_quantized_transfer_learning\"\n",
+ "qat_onnx_graph_name = \"resnet50_imagenette_pruned_qat.onnx\"\n",
+ "quantized_onnx_path = os.path.join(save_dir, \"resnet50_imagenette_pruned_quant.onnx\")\n",
+ "\n",
+ "exporter = ModuleExporter(model, output_dir=save_dir)\n",
+ "exporter.export_pytorch(name=\"resnet50_imagenette_pruned_qat.pth\")\n",
+ "exporter.export_onnx(\n",
+ " torch.randn(1, 3, 224, 224), name=qat_onnx_graph_name\n",
+ ")\n",
+ "\n",
+ "\n",
+ "# convert QAT graph to fully quantized operators\n",
+ "quantize_torch_qat_export(os.path.join(save_dir, qat_onnx_graph_name), output_file_path=quantized_onnx_path)\n",
+ "\n",
+ "print(f\"Sparse-Quantized ONNX model saved to {quantized_onnx_path}\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Step 8 - Benchmarking\n",
+ "\n",
+ "Finally, to see the total effect of these optimizations, you will benchmark an unoptimized, dense ResNet-50 model from SparseZoo against your sparse-quantized model using the `deepsparse` API.\n",
+ "\n",
+ "Note: in order to see speedups from quantization, your CPU must support VNNI instructions. The benchmarking cell below contains a check for VNNI instructions and will log a warning if they are not detected. You can learn more about DeepSparse hardware compatibility [here](https://docs.neuralmagic.com/deepsparse/hardware.html)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import numpy\n",
+ "from deepsparse import benchmark_model\n",
+ "from deepsparse.cpu import cpu_architecture\n",
+ "\n",
+ "\n",
+ "# check VNNI\n",
+ "if cpu_architecture()[\"vnni\"]:\n",
+ " print(\"VNNI extensions detected, model will run with quantized speedups\\n\")\n",
+ "else:\n",
+ " print(\n",
+ " \"WARNING: No VNNI extensions detected.
Your model will not run with \"\n", + " \"quantized speedups which will affect benchmarking\\n\"\n", + " )\n", + "\n", + "\n", + "BATCH_SIZE = 64\n", + "NUM_CORES = None # maximum number of cores available\n", + "NUM_ITERATIONS = 100\n", + "NUM_WARMUP_ITERATIONS = 20\n", + "\n", + "\n", + "def benchmark_imagenette_model(model_name, model_path):\n", + " print(\n", + " f\"Benchmarking {model_name} for {NUM_ITERATIONS} iterations at batch \"\n", + " f\"size {BATCH_SIZE} with {NUM_CORES} CPU cores\"\n", + " )\n", + " sample_input = [\n", + " numpy.ascontiguousarray(\n", + " numpy.random.randn(BATCH_SIZE, 3, 224, 224).astype(numpy.float32)\n", + " )\n", + " ]\n", + "\n", + " results = benchmark_model(\n", + " model=model_path,\n", + " inp=sample_input,\n", + " batch_size=BATCH_SIZE,\n", + " num_cores=NUM_CORES,\n", + " num_iterations=NUM_ITERATIONS,\n", + " num_warmup_iterations=NUM_WARMUP_ITERATIONS,\n", + " show_progress=True,\n", + " )\n", + " print(f\"results:\\n{results}\")\n", + " return results\n", + "\n", + "\n", + "# base ResNet-50 Imagenette model downloaded from SparseZoo\n", + "base_results = benchmark_imagenette_model(\n", + " \"ResNet-50 Imagenette Base\",\n", + " \"zoo:cv/classification/resnet_v1-50/pytorch/sparseml/imagenette/base-none\"\n", + ")\n", + "\n", + "optimized_results = benchmark_imagenette_model(\n", + " \"ResNet-50 Imagenette pruned-quantized\", quantized_onnx_path\n", + ")\n", + "\n", + "speed_up = base_results.ms_per_batch / optimized_results.ms_per_batch\n", + "print(f\"Speed-up from sparse-quantized transfer learning: {speed_up}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Next Steps\n", + "\n", + "Congratulations, you have created a sparse-quantized model and exported it to ONNX for inference! Next steps you can pursue include:\n", + "* Transfer learning, pruning, or quantizing different models using SparseML\n", + "* Trying different pruning and optimization recipes\n", + "* Benchmarking other models on the [DeepSparse Engine](https://github.com/neuralmagic/deepsparse)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.9" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/integrations/README.md b/integrations/README.md new file mode 100644 index 00000000000..da94154199e --- /dev/null +++ b/integrations/README.md @@ -0,0 +1,22 @@ + + +# Integrations + +This directory contains self-documented examples of SparseML workflows and integrations +with other libraries. Open a Pull Request to +[contribute](https://github.com/neuralmagic/sparseml/blob/main/CONTRIBUTING.md) +your own. diff --git a/examples/keras/prune_resnet20.py b/integrations/keras/prune_resnet20.py similarity index 98% rename from examples/keras/prune_resnet20.py rename to integrations/keras/prune_resnet20.py index 6e02cae9dc4..b9157f0faa7 100644 --- a/examples/keras/prune_resnet20.py +++ b/integrations/keras/prune_resnet20.py @@ -15,11 +15,11 @@ """ Example of pruning a ResNet20-v1 model pretrained on the Cifar-10 dataset. 
The pretrained model and this pruning script were adapted from:
https://keras.io/zh/examples/cifar10_resnet/
Run the following command from the top repo directory:
-    python3 examples/keras/prune_resnet20.py
+    python3 integrations/keras/prune_resnet20.py
"""
@@ -41,7 +41,7 @@
# Root directory
-root_dir = "./examples/keras"
+root_dir = "./integrations/keras"
# Logging setup
log_dir = os.path.join(root_dir, "tensorboard", "resnet20_v1")
diff --git a/examples/pytorch-torchvision/README.md b/integrations/pytorch-torchvision/README.md similarity index 88% rename from examples/pytorch-torchvision/README.md rename to integrations/pytorch-torchvision/README.md index 5a77cb7dc77..34c89a2286a 100644 --- a/examples/pytorch-torchvision/README.md +++ b/integrations/pytorch-torchvision/README.md @@ -31,14 +31,14 @@ To begin, run `pip install sparseml[torchvision]`
## Notebook
For a quick, step-by-step walk-through of performing the integration and pruning a model run through the
-[pruning.ipynb](https://github.com/neuralmagic/sparseml/blob/main/examples/pytorch-torchvision/pruning.ipynb) notebook.
+[pruning.ipynb](https://github.com/neuralmagic/sparseml/blob/main/integrations/pytorch-torchvision/pruning.ipynb) notebook.
Run `jupyter notebook` in your terminal and navigate to the notebook in your browser to get started.
## Script
-`examples/pytorch-torchvision/main.py` is an ease-of-use script for applying a SparseML optimization recipe to a torchvision classification model.
+`integrations/pytorch-torchvision/main.py` is an ease-of-use script for applying a SparseML optimization recipe to a torchvision classification model.
The script file is fully documented with descriptions, a command help printout, and example commands.
-You can also run `python examples/pytorch-torchvision/main.py -h` for a help printout.
+You can also run `python integrations/pytorch-torchvision/main.py -h` for a help printout.
To run this script, you will need a SparseML recipe as well as an
[ImageFolder](https://pytorch.org/docs/stable/torchvision/datasets.html#imagefolder)-like classification dataset to train
@@ -59,7 +59,7 @@ in the script file documentation.
The optimization learning rate, and number of
example command:
```bash
-python examples/pytorch-torchvision/main.py \
+python integrations/pytorch-torchvision/main.py \
    --recipe-path ~/sparseml_recipes/pruning_resnet50.yaml \
    --model resnet50 \
    --imagefolder-path ~/datasets/my_imagefolder \
diff --git a/examples/pytorch-torchvision/main.py b/integrations/pytorch-torchvision/main.py similarity index 99% rename from examples/pytorch-torchvision/main.py rename to integrations/pytorch-torchvision/main.py index 4a90a8c602f..47a709d18d4 100644 --- a/examples/pytorch-torchvision/main.py +++ b/integrations/pytorch-torchvision/main.py @@ -76,7 +76,7 @@
##########
Example command for pruning resnet50 on an imagefolder dataset:
-python examples/pytorch-torchvision/main.py \
+python integrations/pytorch-torchvision/main.py \
    --recipe-path ~/sparseml_recipes/pruning_resnet50.yaml \
    --model resnet50 \
    --imagefolder-path ~/datasets/ILSVRC2012 \
diff --git a/examples/pytorch-torchvision/pruning.ipynb b/integrations/pytorch-torchvision/pruning.ipynb similarity index 100% rename from examples/pytorch-torchvision/pruning.ipynb rename to integrations/pytorch-torchvision/pruning.ipynb
diff --git a/integrations/timm/README.md b/integrations/timm/README.md new file mode 100644 index 00000000000..a1e96289e10 --- /dev/null +++ b/integrations/timm/README.md @@ -0,0 +1,123 @@
+
+
+# SparseML-rwightman/pytorch-image-models integration
+This directory provides a SparseML-integrated training script for the popular
+[rwightman/pytorch-image-models](https://github.com/rwightman/pytorch-image-models)
+repository, also known as [timm](https://pypi.org/project/timm/).
+
+Using this integration, you will be able to apply SparseML optimizations
+to the powerful training flows of the pytorch-image-models repository.
+
+Some of the tasks you can perform using this integration include, but are not limited to:
+* model pruning
+* quantization-aware training
+* sparse quantization-aware training
+* sparse transfer learning
+
+## Installation
+Both requirements, pytorch-image-models and SparseML, can be installed via `pip` or cloned
+and installed from their respective source repositories.
+
+```bash
+pip install git+https://github.com/rwightman/pytorch-image-models.git
+pip install sparseml[torchvision]
+```
+
+
+## Script
+`integrations/timm/train.py` modifies
+[`train.py`](https://github.com/rwightman/pytorch-image-models/blob/master/train.py)
+from pytorch-image-models to include a `--sparseml-recipe` argument
+to run SparseML optimizations with. This can be a file path to a local
+SparseML recipe or a SparseZoo model stub prefixed by `zoo:` such as
+`zoo:cv-classification/resnet_v1-50/pytorch-rwightman/imagenet-augmented/pruned_quant-aggressive`.
+
+Additionally, to run sparse transfer learning with a SparseZoo model that has
+a transfer learning recipe, add `?recipe_type=transfer_learn` as part of the model stub.
+i.e. `zoo:cv-classification/resnet_v1-50/pytorch-rwightman/imagenet-augmented/pruned_quant-aggressive?recipe_type=transfer_learn`.
+This will run a recipe that keeps the optimized sparsity structure fixed while allowing
+non-zero weights to be updated during training, so pre-learned optimizations can be applied
+to different datasets.
+
+To load the base weights for a SparseZoo recipe as the initial checkpoint, set
+`--initial-checkpoint` to `zoo`. To use the weights of a SparseZoo model as the
+initial checkpoint, pass that model's SparseZoo stub prefixed by `zoo:` to the
+`--initial-checkpoint` argument.
+
+Running the script will
+follow the normal pytorch-image-models training flow with the given
+SparseML optimizations enabled.
+
+Some considerations:
+
+* `--sparseml-recipe` is a required parameter
+* `--epochs` will now be overridden by the epochs set in the SparseML recipe
+* Modifiers will log their outputs to the console as well as to a TensorBoard file
+* After training is complete, the final model will be exported to ONNX using SparseML
+
+You can learn how to build or download a recipe using the
+[SparseML](https://github.com/neuralmagic/sparseml)
+or [SparseZoo](https://github.com/neuralmagic/sparsezoo)
+documentation, or export one with [Sparsify](https://github.com/neuralmagic/sparsify).
+
+Documentation on the original script can be found
+[here](https://rwightman.github.io/pytorch-image-models/scripts/).
+The latest commit hash that `train.py` is based on is included in the docstring.
+
+
+#### Example Command
+Training from a local recipe and checkpoint:
+```bash
+python integrations/timm/train.py \
+    /PATH/TO/DATASET/imagenet/ \
+    --sparseml-recipe /PATH/TO/RECIPE/recipe.yaml \
+    --initial-checkpoint PATH/TO/CHECKPOINT/model.pth \
+    --dataset imagenet \
+    --batch-size 64 \
+    --remode pixel --reprob 0.6 --smoothing 0.1 \
+    --output models/optimized \
+    --model resnet50 \
+    --workers 8
+```
+
+Training from a local recipe and SparseZoo checkpoint:
+```bash
+python integrations/timm/train.py \
+    /PATH/TO/DATASET/imagenet/ \
+    --sparseml-recipe /PATH/TO/RECIPE/recipe.yaml \
+    --initial-checkpoint zoo:model/stub/path \
+    --dataset imagenet \
+    --batch-size 64 \
+    --remode pixel --reprob 0.6 --smoothing 0.1 \
+    --output models/optimized \
+    --model resnet50 \
+    --workers 8
+```
+
+Training from a SparseZoo recipe and checkpoint with sparse transfer learning enabled:
+```bash
+python integrations/timm/train.py \
+    /PATH/TO/DATASET/imagenet/ \
+    --sparseml-recipe zoo:model/stub/path?recipe_type=transfer_learn \
+    --initial-checkpoint zoo \
+    --dataset imagenet \
+    --batch-size 64 \
+    --remode pixel --reprob 0.6 --smoothing 0.1 \
+    --output models/optimized \
+    --model resnet50 \
+    --workers 8
+```
diff --git a/integrations/timm/train.py b/integrations/timm/train.py
new file mode 100755
index 00000000000..669a6ce8536
--- /dev/null
+++ b/integrations/timm/train.py
@@ -0,0 +1,878 @@
+#!/usr/bin/env python
+
+# neuralmagic: no copyright
+# flake8: noqa
+# fmt: off
+# isort: skip_file
+
+"""
+Integration between https://github.com/rwightman/pytorch-image-models and SparseML
+
+This script is adapted from https://github.com/rwightman/pytorch-image-models/blob/master/train.py
+to apply a SparseML recipe from the required `--sparseml-recipe` argument.
+Integration lines are preceded by comment blocks. Run with `--help` for a help printout;
+more information can be found in the readme file.
+
+Latest pytorch-image-models commit this script is based on:
+https://github.com/rwightman/pytorch-image-models/tree/aaa715b1e94a8d10a2c0ff0f4abef7ddc97b2576
+(commit hash: aaa715b)
+
+Original doc-string:
+
+ImageNet Training Script
+
+This is intended to be a lean and easily modifiable ImageNet training script that reproduces ImageNet
+training results with some of the latest networks and training techniques. It favours canonical PyTorch
+and standard Python style over trying to be able to 'do it all.' That said, it offers quite a few speed
+and training result improvements over the usual PyTorch example scripts. Repurpose as you see fit.
+ +This script was started from an early version of the PyTorch ImageNet example +(https://github.com/pytorch/examples/tree/master/imagenet) + +NVIDIA CUDA specific speedups adopted from NVIDIA Apex examples +(https://github.com/NVIDIA/apex/tree/master/examples/imagenet) + +Hacked together by / Copyright 2020 Ross Wightman (https://github.com/rwightman) +""" +import argparse +import time +import yaml +import os +import logging +from collections import OrderedDict +from contextlib import suppress +from datetime import datetime + +import torch +import torch.nn as nn +import torchvision.utils +from torch.nn.parallel import DistributedDataParallel as NativeDDP + +from timm.data import create_dataset, create_loader, resolve_data_config, Mixup, FastCollateMixup, AugMixDataset +from timm.models import create_model, resume_checkpoint, load_checkpoint, convert_splitbn_model +from timm.utils import * +from timm.loss import LabelSmoothingCrossEntropy, SoftTargetCrossEntropy, JsdCrossEntropy +from timm.optim import create_optimizer +from timm.scheduler import create_scheduler +from timm.utils import ApexScaler, NativeScaler + +from sparseml.pytorch.optim import ScheduledModifierManager, ScheduledOptimizer +from sparseml.pytorch.utils import ModuleExporter, PythonLogger, TensorBoardLogger +from sparsezoo import Zoo +import warnings + +try: + from apex import amp + from apex.parallel import DistributedDataParallel as ApexDDP + from apex.parallel import convert_syncbn_model + has_apex = True +except ImportError: + has_apex = False + +has_native_amp = False +try: + if getattr(torch.cuda.amp, 'autocast') is not None: + has_native_amp = True +except AttributeError: + pass + +torch.backends.cudnn.benchmark = True +_logger = logging.getLogger('train') + +# The first arg parser parses out only the --config argument, this argument is used to +# load a yaml file containing key-values that override the defaults for the main parser below +config_parser = parser = argparse.ArgumentParser(description='Training Config', add_help=False) +parser.add_argument('-c', '--config', default='', type=str, metavar='FILE', + help='YAML config file specifying default arguments') + + +parser = argparse.ArgumentParser(description='PyTorch ImageNet Training') + + +#################################################################################### +# Start SparseML arguments +#################################################################################### +parser.add_argument( + "--sparseml-recipe", + required=True, + type=str, + help="path to a SparseML recipe file or a SparseZoo model stub for a recipe to load. " + "SparseZoo stubs should be preceded by 'zoo:'. i.e. 
'/path/to/local/recipe.yaml', "
+         "'zoo:zoo/model/stub'"
+)
+####################################################################################
+# End SparseML arguments
+####################################################################################
+
+# Dataset / Model parameters
+parser.add_argument('data_dir', metavar='DIR',
+                    help='path to dataset')
+parser.add_argument('--dataset', '-d', metavar='NAME', default='',
+                    help='dataset type (default: ImageFolder/ImageTar if empty)')
+parser.add_argument('--train-split', metavar='NAME', default='train',
+                    help='dataset train split (default: train)')
+parser.add_argument('--val-split', metavar='NAME', default='validation',
+                    help='dataset validation split (default: validation)')
+parser.add_argument('--model', default='resnet101', type=str, metavar='MODEL',
+                    help='Name of model to train (default: "resnet101")')
+parser.add_argument('--pretrained', action='store_true', default=False,
+                    help='Start with pretrained version of specified network (if avail)')
+parser.add_argument('--initial-checkpoint', default='', type=str, metavar='PATH',
+                    help='Initialize model from this checkpoint (default: none). '
+                         'Can pass in "zoo" if using a SparseZoo recipe to load that recipe\'s '
+                         'base weights, or pass in a SparseZoo model stub, prefixed with "zoo:", to '
+                         'load weights directly from SparseZoo')
+parser.add_argument('--resume', default='', type=str, metavar='PATH',
+                    help='Resume full model and optimizer state from checkpoint (default: none)')
+parser.add_argument('--no-resume-opt', action='store_true', default=False,
+                    help='prevent resume of optimizer state when resuming model')
+parser.add_argument('--num-classes', type=int, default=None, metavar='N',
+                    help='number of label classes (Model default if None)')
+parser.add_argument('--gp', default=None, type=str, metavar='POOL',
+                    help='Global pool type, one of (fast, avg, max, avgmax, avgmaxc). Model default if None.')
+parser.add_argument('--img-size', type=int, default=None, metavar='N',
+                    help='Image patch size (default: None => model default)')
+parser.add_argument('--input-size', default=None, nargs=3, type=int,
+                    metavar='N N N', help='Input all image dimensions (d h w, e.g. --input-size 3 224 224), uses model default if empty')
+parser.add_argument('--crop-pct', default=None, type=float,
+                    metavar='N', help='Input image center crop percent (for validation only)')
+parser.add_argument('--mean', type=float, nargs='+', default=None, metavar='MEAN',
+                    help='Override mean pixel value of dataset')
+parser.add_argument('--std', type=float, nargs='+', default=None, metavar='STD',
+                    help='Override std deviation of dataset')
+parser.add_argument('--interpolation', default='', type=str, metavar='NAME',
+                    help='Image resize interpolation type (overrides model)')
+parser.add_argument('-b', '--batch-size', type=int, default=32, metavar='N',
+                    help='input batch size for training (default: 32)')
+parser.add_argument('-vb', '--validation-batch-size-multiplier', type=int, default=1, metavar='N',
+                    help='ratio of validation batch size to training batch size (default: 1)')
+
+# Optimizer parameters
+parser.add_argument('--opt', default='sgd', type=str, metavar='OPTIMIZER',
+                    help='Optimizer (default: "sgd")')
+parser.add_argument('--opt-eps', default=None, type=float, metavar='EPSILON',
+                    help='Optimizer Epsilon (default: None, use opt default)')
+parser.add_argument('--opt-betas', default=None, type=float, nargs='+', metavar='BETA',
+                    help='Optimizer Betas (default: None, use opt default)')
+parser.add_argument('--momentum', type=float, default=0.9, metavar='M',
+                    help='Optimizer momentum (default: 0.9)')
+parser.add_argument('--weight-decay', type=float, default=0.0001,
+                    help='weight decay (default: 0.0001)')
+parser.add_argument('--clip-grad', type=float, default=None, metavar='NORM',
+                    help='Clip gradient norm (default: None, no clipping)')
+
+# Learning rate schedule parameters
+parser.add_argument('--sched', default='step', type=str, metavar='SCHEDULER',
+                    help='LR scheduler (default: "step")')
+parser.add_argument('--lr', type=float, default=0.01, metavar='LR',
+                    help='learning rate (default: 0.01)')
+parser.add_argument('--lr-noise', type=float, nargs='+', default=None, metavar='pct, pct',
+                    help='learning rate noise on/off epoch percentages')
+parser.add_argument('--lr-noise-pct', type=float, default=0.67, metavar='PERCENT',
+                    help='learning rate noise limit percent (default: 0.67)')
+parser.add_argument('--lr-noise-std', type=float, default=1.0, metavar='STDDEV',
+                    help='learning rate noise std-dev (default: 1.0)')
+parser.add_argument('--lr-cycle-mul', type=float, default=1.0, metavar='MULT',
+                    help='learning rate cycle len multiplier (default: 1.0)')
+parser.add_argument('--lr-cycle-limit', type=int, default=1, metavar='N',
+                    help='learning rate cycle limit')
+parser.add_argument('--warmup-lr', type=float, default=0.0001, metavar='LR',
+                    help='warmup learning rate (default: 0.0001)')
+parser.add_argument('--min-lr', type=float, default=1e-5, metavar='LR',
+                    help='lower lr bound for cyclic schedulers that hit 0 (1e-5)')
+parser.add_argument('--epochs', type=int, default=200, metavar='N',
+                    help='number of epochs to train (default: 200)')
+parser.add_argument('--start-epoch', default=None, type=int, metavar='N',
+                    help='manual epoch number (useful on restarts)')
+parser.add_argument('--decay-epochs', type=float, default=30, metavar='N',
+                    help='epoch interval to decay LR')
+parser.add_argument('--warmup-epochs', type=int, default=3, metavar='N',
+                    help='epochs to warmup LR, if scheduler supports')
+parser.add_argument('--cooldown-epochs', type=int, default=10, metavar='N',
+                    help='epochs to cooldown LR at min_lr, after cyclic schedule ends')
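+# NOTE: if the SparseML recipe passed to --sparseml-recipe includes learning-rate
+# or epoch modifiers, the recipe overrides --epochs and disables the timm LR
+# schedule configured here (see the SparseML integration block in main() below)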
+parser.add_argument('--patience-epochs', type=int, default=10, metavar='N',
+                    help='patience epochs for Plateau LR scheduler (default: 10)')
+parser.add_argument('--decay-rate', '--dr', type=float, default=0.1, metavar='RATE',
+                    help='LR decay rate (default: 0.1)')
+
+# Augmentation & regularization parameters
+parser.add_argument('--no-aug', action='store_true', default=False,
+                    help='Disable all training augmentation, override other train aug args')
+parser.add_argument('--scale', type=float, nargs='+', default=[0.08, 1.0], metavar='PCT',
+                    help='Random resize scale (default: 0.08 1.0)')
+parser.add_argument('--ratio', type=float, nargs='+', default=[3./4., 4./3.], metavar='RATIO',
+                    help='Random resize aspect ratio (default: 0.75 1.33)')
+parser.add_argument('--hflip', type=float, default=0.5,
+                    help='Horizontal flip training aug probability')
+parser.add_argument('--vflip', type=float, default=0.,
+                    help='Vertical flip training aug probability')
+parser.add_argument('--color-jitter', type=float, default=0.4, metavar='PCT',
+                    help='Color jitter factor (default: 0.4)')
+parser.add_argument('--aa', type=str, default=None, metavar='NAME',
+                    help='Use AutoAugment policy. "v0" or "original". (default: None)')
+parser.add_argument('--aug-splits', type=int, default=0,
+                    help='Number of augmentation splits (default: 0, valid: 0 or >=2)')
+parser.add_argument('--jsd', action='store_true', default=False,
+                    help='Enable Jensen-Shannon Divergence + CE loss. Use with `--aug-splits`.')
+parser.add_argument('--reprob', type=float, default=0., metavar='PCT',
+                    help='Random erase prob (default: 0.)')
+parser.add_argument('--remode', type=str, default='const',
+                    help='Random erase mode (default: "const")')
+parser.add_argument('--recount', type=int, default=1,
+                    help='Random erase count (default: 1)')
+parser.add_argument('--resplit', action='store_true', default=False,
+                    help='Do not random erase first (clean) augmentation split')
+parser.add_argument('--mixup', type=float, default=0.0,
+                    help='mixup alpha, mixup enabled if > 0. (default: 0.)')
+parser.add_argument('--cutmix', type=float, default=0.0,
+                    help='cutmix alpha, cutmix enabled if > 0. (default: 0.)')
+parser.add_argument('--cutmix-minmax', type=float, nargs='+', default=None,
+                    help='cutmix min/max ratio, overrides alpha and enables cutmix if set (default: None)')
+parser.add_argument('--mixup-prob', type=float, default=1.0,
+                    help='Probability of performing mixup or cutmix when either/both is enabled')
+parser.add_argument('--mixup-switch-prob', type=float, default=0.5,
+                    help='Probability of switching to cutmix when both mixup and cutmix enabled')
+parser.add_argument('--mixup-mode', type=str, default='batch',
+                    help='How to apply mixup/cutmix params. 
Per "batch", "pair", or "elem"') +parser.add_argument('--mixup-off-epoch', default=0, type=int, metavar='N', + help='Turn off mixup after this epoch, disabled if 0 (default: 0)') +parser.add_argument('--smoothing', type=float, default=0.1, + help='Label smoothing (default: 0.1)') +parser.add_argument('--train-interpolation', type=str, default='random', + help='Training interpolation (random, bilinear, bicubic default: "random")') +parser.add_argument('--drop', type=float, default=0.0, metavar='PCT', + help='Dropout rate (default: 0.)') +parser.add_argument('--drop-connect', type=float, default=None, metavar='PCT', + help='Drop connect rate, DEPRECATED, use drop-path (default: None)') +parser.add_argument('--drop-path', type=float, default=None, metavar='PCT', + help='Drop path rate (default: None)') +parser.add_argument('--drop-block', type=float, default=None, metavar='PCT', + help='Drop block rate (default: None)') + +# Batch norm parameters (only works with gen_efficientnet based models currently) +parser.add_argument('--bn-tf', action='store_true', default=False, + help='Use Tensorflow BatchNorm defaults for models that support it (default: False)') +parser.add_argument('--bn-momentum', type=float, default=None, + help='BatchNorm momentum override (if not None)') +parser.add_argument('--bn-eps', type=float, default=None, + help='BatchNorm epsilon override (if not None)') +parser.add_argument('--sync-bn', action='store_true', + help='Enable NVIDIA Apex or Torch synchronized BatchNorm.') +parser.add_argument('--dist-bn', type=str, default='', + help='Distribute BatchNorm stats between nodes after each epoch ("broadcast", "reduce", or "")') +parser.add_argument('--split-bn', action='store_true', + help='Enable separate BN layers per augmentation split.') + +# Model Exponential Moving Average +parser.add_argument('--model-ema', action='store_true', default=False, + help='Enable tracking moving average of model weights') +parser.add_argument('--model-ema-force-cpu', action='store_true', default=False, + help='Force ema to be tracked on CPU, rank=0 node only. 
Disables EMA validation.')
+parser.add_argument('--model-ema-decay', type=float, default=0.9998,
+                    help='decay factor for model weights moving average (default: 0.9998)')
+
+# Misc
+parser.add_argument('--seed', type=int, default=42, metavar='S',
+                    help='random seed (default: 42)')
+parser.add_argument('--log-interval', type=int, default=50, metavar='N',
+                    help='how many batches to wait before logging training status')
+parser.add_argument('--recovery-interval', type=int, default=0, metavar='N',
+                    help='how many batches to wait before writing recovery checkpoint')
+parser.add_argument('--checkpoint-hist', type=int, default=10, metavar='N',
+                    help='number of checkpoints to keep (default: 10)')
+parser.add_argument('-j', '--workers', type=int, default=4, metavar='N',
+                    help='how many training processes to use (default: 4)')
+parser.add_argument('--save-images', action='store_true', default=False,
+                    help='save images of input batches every log interval for debugging')
+parser.add_argument('--amp', action='store_true', default=False,
+                    help='use NVIDIA Apex AMP or Native AMP for mixed precision training')
+parser.add_argument('--apex-amp', action='store_true', default=False,
+                    help='Use NVIDIA Apex AMP mixed precision')
+parser.add_argument('--native-amp', action='store_true', default=False,
+                    help='Use Native Torch AMP mixed precision')
+parser.add_argument('--channels-last', action='store_true', default=False,
+                    help='Use channels_last memory layout')
+parser.add_argument('--pin-mem', action='store_true', default=False,
+                    help='Pin CPU memory in DataLoader for more efficient (sometimes) transfer to GPU.')
+parser.add_argument('--no-prefetcher', action='store_true', default=False,
+                    help='disable fast prefetcher')
+parser.add_argument('--output', default='', type=str, metavar='PATH',
+                    help='path to output folder (default: none, current dir)')
+parser.add_argument('--eval-metric', default='top1', type=str, metavar='EVAL_METRIC',
+                    help='Best metric (default: "top1")')
+parser.add_argument('--tta', type=int, default=0, metavar='N',
+                    help='Test/inference time augmentation (oversampling) factor. 0=None (default: 0)')
+parser.add_argument("--local_rank", default=0, type=int)
+parser.add_argument('--use-multi-epochs-loader', action='store_true', default=False,
+                    help='use the multi-epochs-loader to save time at the beginning of every epoch')
+parser.add_argument('--torchscript', dest='torchscript', action='store_true',
+                    help='convert model to torchscript for inference')
+
+
+def _parse_args():
+    # Do we have a config file to parse?
+    args_config, remaining = config_parser.parse_known_args()
+    if args_config.config:
+        with open(args_config.config, 'r') as f:
+            cfg = yaml.safe_load(f)
+            parser.set_defaults(**cfg)
+
+    # The main arg parser parses the rest of the args, the usual
+    # defaults will have been overridden if config file specified.
+    args = parser.parse_args(remaining)
+
+    # Cache the args as a text string to save them in the output dir later
+    args_text = yaml.safe_dump(args.__dict__, default_flow_style=False)
+    return args, args_text
+
+
+def main():
+    setup_default_logging()
+    args, args_text = _parse_args()
+
+    args.prefetcher = not args.no_prefetcher
+    args.distributed = False
+    if 'WORLD_SIZE' in os.environ:
+        args.distributed = int(os.environ['WORLD_SIZE']) > 1
+    args.device = 'cuda:0'
+    args.world_size = 1
+    args.rank = 0  # global rank
+    if args.distributed:
+        args.device = 'cuda:%d' % args.local_rank
+        torch.cuda.set_device(args.local_rank)
+        torch.distributed.init_process_group(backend='nccl', init_method='env://')
+        args.world_size = torch.distributed.get_world_size()
+        args.rank = torch.distributed.get_rank()
+        _logger.info('Training in distributed mode with multiple processes, 1 GPU per process. Process %d, total %d.'
+                     % (args.rank, args.world_size))
+    else:
+        _logger.info('Training with a single process on 1 GPU.')
+    assert args.rank >= 0
+
+    # resolve AMP arguments based on PyTorch / Apex availability
+    use_amp = None
+    if args.amp:
+        # for backwards compat, `--amp` arg tries apex before native amp
+        if has_apex:
+            args.apex_amp = True
+        elif has_native_amp:
+            args.native_amp = True
+    if args.apex_amp and has_apex:
+        use_amp = 'apex'
+    elif args.native_amp and has_native_amp:
+        use_amp = 'native'
+    elif args.apex_amp or args.native_amp:
+        _logger.warning("Neither APEX nor native Torch AMP is available, using float32. "
+                        "Install NVIDIA Apex or upgrade to PyTorch 1.6")
+
+    torch.manual_seed(args.seed + args.rank)
+
+    ####################################################################################
+    # Start - SparseML optional load weights from SparseZoo
+    ####################################################################################
+    if args.initial_checkpoint == "zoo":
+        # Load checkpoint from base weights associated with given SparseZoo recipe
+        if args.sparseml_recipe.startswith("zoo:"):
+            args.initial_checkpoint = Zoo.download_recipe_base_framework_files(
+                args.sparseml_recipe,
+                extensions=[".pth.tar", ".pth"]
+            )[0]
+        else:
+            raise ValueError(
+                "Attempting to load weights from SparseZoo recipe, but not given a "
+                "SparseZoo recipe stub. When initial-checkpoint is set to 'zoo', "
+                "sparseml-recipe must start with 'zoo:' and be a SparseZoo model "
+                f"stub. sparseml-recipe was set to {args.sparseml_recipe}"
+            )
+    elif args.initial_checkpoint.startswith("zoo:"):
+        # Load weights from a SparseZoo model stub; take the first downloaded file
+        # as the checkpoint path
+        zoo_model = Zoo.load_model_from_stub(args.initial_checkpoint)
+        args.initial_checkpoint = zoo_model.download_framework_files(extensions=[".pth"])[0]
+    ####################################################################################
+    # End - SparseML optional load weights from SparseZoo
+    ####################################################################################
+
+    model = create_model(
+        args.model,
+        pretrained=args.pretrained,
+        num_classes=args.num_classes,
+        drop_rate=args.drop,
+        drop_connect_rate=args.drop_connect,  # DEPRECATED, use drop_path
+        drop_path_rate=args.drop_path,
+        drop_block_rate=args.drop_block,
+        global_pool=args.gp,
+        bn_tf=args.bn_tf,
+        bn_momentum=args.bn_momentum,
+        bn_eps=args.bn_eps,
+        scriptable=args.torchscript,
+        checkpoint_path=args.initial_checkpoint)
+    if args.num_classes is None:
+        assert hasattr(model, 'num_classes'), 'Model must have `num_classes` attr if not set on cmd line/config.'
+ args.num_classes = model.num_classes # FIXME handle model default vs config num_classes more elegantly + + if args.local_rank == 0: + _logger.info('Model %s created, param count: %d' % + (args.model, sum([m.numel() for m in model.parameters()]))) + + data_config = resolve_data_config(vars(args), model=model, verbose=args.local_rank == 0) + + # setup augmentation batch splits for contrastive loss or split bn + num_aug_splits = 0 + if args.aug_splits > 0: + assert args.aug_splits > 1, 'A split of 1 makes no sense' + num_aug_splits = args.aug_splits + + # enable split bn (separate bn stats per batch-portion) + if args.split_bn: + assert num_aug_splits > 1 or args.resplit + model = convert_splitbn_model(model, max(num_aug_splits, 2)) + + # move model to GPU, enable channels last layout if set + model.cuda() + if args.channels_last: + model = model.to(memory_format=torch.channels_last) + + # setup synchronized BatchNorm for distributed training + if args.distributed and args.sync_bn: + assert not args.split_bn + if has_apex and use_amp != 'native': + # Apex SyncBN preferred unless native amp is activated + model = convert_syncbn_model(model) + else: + model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model) + if args.local_rank == 0: + _logger.info( + 'Converted model to use Synchronized BatchNorm. WARNING: You may have issues if using ' + 'zero initialized BN layers (enabled by default for ResNets) while sync-bn enabled.') + + if args.torchscript: + assert not use_amp == 'apex', 'Cannot use APEX AMP with torchscripted model' + assert not args.sync_bn, 'Cannot use SyncBatchNorm with torchscripted model' + model = torch.jit.script(model) + + optimizer = create_optimizer(args, model) + + # setup automatic mixed-precision (AMP) loss scaling and op casting + amp_autocast = suppress # do nothing + loss_scaler = None + if use_amp == 'apex': + model, optimizer = amp.initialize(model, optimizer, opt_level='O1') + loss_scaler = ApexScaler() + if args.local_rank == 0: + _logger.info('Using NVIDIA APEX AMP. Training in mixed precision.') + elif use_amp == 'native': + amp_autocast = torch.cuda.amp.autocast + loss_scaler = NativeScaler() + if args.local_rank == 0: + _logger.info('Using native Torch AMP. Training in mixed precision.') + else: + if args.local_rank == 0: + _logger.info('AMP not enabled. 
Training in float32.') + + # optionally resume from a checkpoint + resume_epoch = None + if args.resume: + resume_epoch = resume_checkpoint( + model, args.resume, + optimizer=None if args.no_resume_opt else optimizer, + loss_scaler=None if args.no_resume_opt else loss_scaler, + log_info=args.local_rank == 0) + + # setup exponential moving average of model weights, SWA could be used here too + model_ema = None + if args.model_ema: + # Important to create EMA model after cuda(), DP wrapper, and AMP but before SyncBN and DDP wrapper + model_ema = ModelEmaV2( + model, decay=args.model_ema_decay, device='cpu' if args.model_ema_force_cpu else None) + if args.resume: + load_checkpoint(model_ema.module, args.resume, use_ema=True) + + # setup distributed training + if args.distributed: + if has_apex and use_amp != 'native': + # Apex DDP preferred unless native amp is activated + if args.local_rank == 0: + _logger.info("Using NVIDIA APEX DistributedDataParallel.") + model = ApexDDP(model, delay_allreduce=True) + else: + if args.local_rank == 0: + _logger.info("Using native Torch DistributedDataParallel.") + model = NativeDDP(model, device_ids=[args.local_rank]) # can use device str in Torch >= 1.1 + # NOTE: EMA model does not need to be wrapped by DDP + + # setup learning rate schedule and starting epoch + lr_scheduler, num_epochs = create_scheduler(args, optimizer) + start_epoch = 0 + if args.start_epoch is not None: + # a specified start_epoch will always override the resume epoch + start_epoch = args.start_epoch + elif resume_epoch is not None: + start_epoch = resume_epoch + if lr_scheduler is not None and start_epoch > 0: + lr_scheduler.step(start_epoch) + + # create the train and eval datasets + dataset_train = create_dataset( + args.dataset, root=args.data_dir, split=args.train_split, is_training=True, batch_size=args.batch_size) + dataset_eval = create_dataset( + args.dataset, root=args.data_dir, split=args.val_split, is_training=False, batch_size=args.batch_size) + + # setup mixup / cutmix + collate_fn = None + mixup_fn = None + mixup_active = args.mixup > 0 or args.cutmix > 0. 
or args.cutmix_minmax is not None
+    if mixup_active:
+        mixup_args = dict(
+            mixup_alpha=args.mixup, cutmix_alpha=args.cutmix, cutmix_minmax=args.cutmix_minmax,
+            prob=args.mixup_prob, switch_prob=args.mixup_switch_prob, mode=args.mixup_mode,
+            label_smoothing=args.smoothing, num_classes=args.num_classes)
+        if args.prefetcher:
+            assert not num_aug_splits  # collate conflict (need to support deinterleaving in collate mixup)
+            collate_fn = FastCollateMixup(**mixup_args)
+        else:
+            mixup_fn = Mixup(**mixup_args)
+
+    # wrap dataset in AugMix helper
+    if num_aug_splits > 1:
+        dataset_train = AugMixDataset(dataset_train, num_splits=num_aug_splits)
+
+    # create data loaders w/ augmentation pipeline
+    train_interpolation = args.train_interpolation
+    if args.no_aug or not train_interpolation:
+        train_interpolation = data_config['interpolation']
+    loader_train = create_loader(
+        dataset_train,
+        input_size=data_config['input_size'],
+        batch_size=args.batch_size,
+        is_training=True,
+        use_prefetcher=args.prefetcher,
+        no_aug=args.no_aug,
+        re_prob=args.reprob,
+        re_mode=args.remode,
+        re_count=args.recount,
+        re_split=args.resplit,
+        scale=args.scale,
+        ratio=args.ratio,
+        hflip=args.hflip,
+        vflip=args.vflip,
+        color_jitter=args.color_jitter,
+        auto_augment=args.aa,
+        num_aug_splits=num_aug_splits,
+        interpolation=train_interpolation,
+        mean=data_config['mean'],
+        std=data_config['std'],
+        num_workers=args.workers,
+        distributed=args.distributed,
+        collate_fn=collate_fn,
+        pin_memory=args.pin_mem,
+        use_multi_epochs_loader=args.use_multi_epochs_loader
+    )
+
+    loader_eval = create_loader(
+        dataset_eval,
+        input_size=data_config['input_size'],
+        batch_size=args.validation_batch_size_multiplier * args.batch_size,
+        is_training=False,
+        use_prefetcher=args.prefetcher,
+        interpolation=data_config['interpolation'],
+        mean=data_config['mean'],
+        std=data_config['std'],
+        num_workers=args.workers,
+        distributed=args.distributed,
+        crop_pct=data_config['crop_pct'],
+        pin_memory=args.pin_mem,
+    )
+
+    # setup loss function
+    if args.jsd:
+        assert num_aug_splits > 1  # JSD only valid with aug splits set
+        train_loss_fn = JsdCrossEntropy(num_splits=num_aug_splits, smoothing=args.smoothing).cuda()
+    elif mixup_active:
+        # smoothing is handled with mixup target transform
+        train_loss_fn = SoftTargetCrossEntropy().cuda()
+    elif args.smoothing:
+        train_loss_fn = LabelSmoothingCrossEntropy(smoothing=args.smoothing).cuda()
+    else:
+        train_loss_fn = nn.CrossEntropyLoss().cuda()
+    validate_loss_fn = nn.CrossEntropyLoss().cuda()
+
+    # setup checkpoint saver and eval metric tracking
+    eval_metric = args.eval_metric
+    best_metric = None
+    best_epoch = None
+    saver = None
+    output_dir = ''
+    if args.local_rank == 0:
+        output_base = args.output if args.output else './output'
+        exp_name = '-'.join([
+            datetime.now().strftime("%Y%m%d-%H%M%S"),
+            args.model,
+            str(data_config['input_size'][-1])
+        ])
+        output_dir = get_outdir(output_base, 'train', exp_name)
+        decreasing = True if eval_metric == 'loss' else False
+        saver = CheckpointSaver(
+            model=model, optimizer=optimizer, args=args, model_ema=model_ema, amp_scaler=loss_scaler,
+            checkpoint_dir=output_dir, recovery_dir=output_dir, decreasing=decreasing, max_history=args.checkpoint_hist)
+        with open(os.path.join(output_dir, 'args.yaml'), 'w') as f:
+            f.write(args_text)
+
+    ####################################################################################
+    # Start SparseML Integration
+    ####################################################################################
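+    # Wrap the timm optimizer with SparseML's ScheduledOptimizer so that the
+    # recipe's modifiers are applied at each optimizer step; PythonLogger and
+    # TensorBoardLogger mirror modifier output to the console and TensorBoard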
sparseml_loggers = ( + [PythonLogger(), TensorBoardLogger(log_path=output_dir)] + if output_dir + else None + ) + manager = ScheduledModifierManager.from_yaml(args.sparseml_recipe) + optimizer = ScheduledOptimizer( + optimizer, + model, + manager, + steps_per_epoch=len(loader_train), + loggers=sparseml_loggers + ) + # override lr scheduler if recipe makes any LR updates + if any("LearningRate" in str(modifier) for modifier in manager.modifiers): + _logger.info("Disabling timm LR scheduler, managing LR using SparseML recipe") + lr_scheduler = None + if manager.max_epochs: + _logger.info( + f"Overriding max_epochs to {manager.max_epochs} from SparseML recipe" + ) + num_epochs = manager.max_epochs or num_epochs + #################################################################################### + # End SparseML Integration + #################################################################################### + + if args.local_rank == 0: + _logger.info('Scheduled epochs: {}'.format(num_epochs)) + + try: + for epoch in range(start_epoch, num_epochs): + if args.distributed and hasattr(loader_train.sampler, 'set_epoch'): + loader_train.sampler.set_epoch(epoch) + + train_metrics = train_one_epoch( + epoch, model, loader_train, optimizer, train_loss_fn, args, + lr_scheduler=lr_scheduler, saver=saver, output_dir=output_dir, + amp_autocast=amp_autocast, loss_scaler=loss_scaler, model_ema=model_ema, mixup_fn=mixup_fn) + + if args.distributed and args.dist_bn in ('broadcast', 'reduce'): + if args.local_rank == 0: + _logger.info("Distributing BatchNorm running means and vars") + distribute_bn(model, args.world_size, args.dist_bn == 'reduce') + + eval_metrics = validate(model, loader_eval, validate_loss_fn, args, amp_autocast=amp_autocast) + + if model_ema is not None and not args.model_ema_force_cpu: + if args.distributed and args.dist_bn in ('broadcast', 'reduce'): + distribute_bn(model_ema, args.world_size, args.dist_bn == 'reduce') + ema_eval_metrics = validate( + model_ema.module, loader_eval, validate_loss_fn, args, amp_autocast=amp_autocast, log_suffix=' (EMA)') + eval_metrics = ema_eval_metrics + + if lr_scheduler is not None: + # step LR for next epoch + lr_scheduler.step(epoch + 1, eval_metrics[eval_metric]) + + update_summary( + epoch, train_metrics, eval_metrics, os.path.join(output_dir, 'summary.csv'), + write_header=best_metric is None) + + if saver is not None: + # save proper checkpoint with eval metric + save_metric = eval_metrics[eval_metric] + best_metric, best_epoch = saver.save_checkpoint(epoch, metric=save_metric) + + ################################################################################# + # Start SparseML ONNX Export + ################################################################################# + if output_dir: + _logger.info( + f"training complete, exporting ONNX to {output_dir}/model.onnx" + ) + exporter = ModuleExporter(model, output_dir) + exporter.export_onnx(torch.randn((1, *data_config["input_size"]))) + ################################################################################# + # End SparseML ONNX Export + ################################################################################# + + except KeyboardInterrupt: + pass + if best_metric is not None: + _logger.info('*** Best metric: {0} (epoch {1})'.format(best_metric, best_epoch)) + + +def train_one_epoch( + epoch, model, loader, optimizer, loss_fn, args, + lr_scheduler=None, saver=None, output_dir='', amp_autocast=suppress, + loss_scaler=None, model_ema=None, mixup_fn=None): + + if 
args.mixup_off_epoch and epoch >= args.mixup_off_epoch: + if args.prefetcher and loader.mixup_enabled: + loader.mixup_enabled = False + elif mixup_fn is not None: + mixup_fn.mixup_enabled = False + + second_order = hasattr(optimizer, 'is_second_order') and optimizer.is_second_order + batch_time_m = AverageMeter() + data_time_m = AverageMeter() + losses_m = AverageMeter() + + model.train() + + end = time.time() + last_idx = len(loader) - 1 + num_updates = epoch * len(loader) + for batch_idx, (input, target) in enumerate(loader): + last_batch = batch_idx == last_idx + data_time_m.update(time.time() - end) + if not args.prefetcher: + input, target = input.cuda(), target.cuda() + if mixup_fn is not None: + input, target = mixup_fn(input, target) + if args.channels_last: + input = input.contiguous(memory_format=torch.channels_last) + + with amp_autocast(): + output = model(input) + loss = loss_fn(output, target) + + if not args.distributed: + losses_m.update(loss.item(), input.size(0)) + + optimizer.zero_grad() + if loss_scaler is not None: + loss_scaler( + loss, optimizer, clip_grad=args.clip_grad, parameters=model.parameters(), create_graph=second_order) + else: + loss.backward(create_graph=second_order) + if args.clip_grad is not None: + torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip_grad) + optimizer.step() + + if model_ema is not None: + model_ema.update(model) + + torch.cuda.synchronize() + num_updates += 1 + batch_time_m.update(time.time() - end) + if last_batch or batch_idx % args.log_interval == 0: + lrl = [param_group['lr'] for param_group in optimizer.param_groups] + lr = sum(lrl) / len(lrl) + + if args.distributed: + reduced_loss = reduce_tensor(loss.data, args.world_size) + losses_m.update(reduced_loss.item(), input.size(0)) + + if args.local_rank == 0: + _logger.info( + 'Train: {} [{:>4d}/{} ({:>3.0f}%)] ' + 'Loss: {loss.val:>9.6f} ({loss.avg:>6.4f}) ' + 'Time: {batch_time.val:.3f}s, {rate:>7.2f}/s ' + '({batch_time.avg:.3f}s, {rate_avg:>7.2f}/s) ' + 'LR: {lr:.3e} ' + 'Data: {data_time.val:.3f} ({data_time.avg:.3f})'.format( + epoch, + batch_idx, len(loader), + 100. 
* batch_idx / last_idx, + loss=losses_m, + batch_time=batch_time_m, + rate=input.size(0) * args.world_size / batch_time_m.val, + rate_avg=input.size(0) * args.world_size / batch_time_m.avg, + lr=lr, + data_time=data_time_m)) + + if args.save_images and output_dir: + torchvision.utils.save_image( + input, + os.path.join(output_dir, 'train-batch-%d.jpg' % batch_idx), + padding=0, + normalize=True) + + if saver is not None and args.recovery_interval and ( + last_batch or (batch_idx + 1) % args.recovery_interval == 0): + saver.save_recovery(epoch, batch_idx=batch_idx) + + if lr_scheduler is not None: + lr_scheduler.step_update(num_updates=num_updates, metric=losses_m.avg) + + end = time.time() + # end for + + if hasattr(optimizer, 'sync_lookahead'): + optimizer.sync_lookahead() + + return OrderedDict([('loss', losses_m.avg)]) + + +def validate(model, loader, loss_fn, args, amp_autocast=suppress, log_suffix=''): + batch_time_m = AverageMeter() + losses_m = AverageMeter() + top1_m = AverageMeter() + top5_m = AverageMeter() + + model.eval() + + end = time.time() + last_idx = len(loader) - 1 + with torch.no_grad(): + for batch_idx, (input, target) in enumerate(loader): + last_batch = batch_idx == last_idx + if not args.prefetcher: + input = input.cuda() + target = target.cuda() + if args.channels_last: + input = input.contiguous(memory_format=torch.channels_last) + + with amp_autocast(): + output = model(input) + if isinstance(output, (tuple, list)): + output = output[0] + + # augmentation reduction + reduce_factor = args.tta + if reduce_factor > 1: + output = output.unfold(0, reduce_factor, reduce_factor).mean(dim=2) + target = target[0:target.size(0):reduce_factor] + + loss = loss_fn(output, target) + acc1, acc5 = accuracy(output, target, topk=(1, 5)) + + if args.distributed: + reduced_loss = reduce_tensor(loss.data, args.world_size) + acc1 = reduce_tensor(acc1, args.world_size) + acc5 = reduce_tensor(acc5, args.world_size) + else: + reduced_loss = loss.data + + torch.cuda.synchronize() + + losses_m.update(reduced_loss.item(), input.size(0)) + top1_m.update(acc1.item(), output.size(0)) + top5_m.update(acc5.item(), output.size(0)) + + batch_time_m.update(time.time() - end) + end = time.time() + if args.local_rank == 0 and (last_batch or batch_idx % args.log_interval == 0): + log_name = 'Test' + log_suffix + _logger.info( + '{0}: [{1:>4d}/{2}] ' + 'Time: {batch_time.val:.3f} ({batch_time.avg:.3f}) ' + 'Loss: {loss.val:>7.4f} ({loss.avg:>6.4f}) ' + 'Acc@1: {top1.val:>7.4f} ({top1.avg:>7.4f}) ' + 'Acc@5: {top5.val:>7.4f} ({top5.avg:>7.4f})'.format( + log_name, batch_idx, last_idx, batch_time=batch_time_m, + loss=losses_m, top1=top1_m, top5=top5_m)) + + metrics = OrderedDict([('loss', losses_m.avg), ('top1', top1_m.avg), ('top5', top5_m.avg)]) + + return metrics + + +if __name__ == '__main__': + main() diff --git a/integrations/ultralytics/README.md b/integrations/ultralytics/README.md new file mode 100644 index 00000000000..13bcddc9cc0 --- /dev/null +++ b/integrations/ultralytics/README.md @@ -0,0 +1,99 @@ + + +# SparseML-ultralytics/yolov5 integration +This directory provides a SparseML integrated training script for the popular +[ultralytics/yolov5](https://github.com/ultralytics/yolov5) +repository. + +Using this integration, you will be able to apply SparseML optimizations +to the powerful training flows provided in the yolov5 repository. 
+ +Some of the tasks you can perform using this integration include, but are not limited to: +* model pruning +* quantization-aware training +* sparse quantization-aware training +* sparse transfer learning + +## Installation +To use the script, clone both repositories, install their dependencies, +and copy the integrated training script into the yolov5 directory to run from. + +```bash +# clone +git clone https://github.com/ultralytics/yolov5.git +git clone https://github.com/neuralmagic/sparseml.git + +# copy script +cp sparseml/integrations/ultralytics/train.py yolov5 +cd yolov5 + +# install dependencies +pip install -r requirements.txt +pip install sparseml +``` + + +## Script +`integrations/ultralytics/train.py` modifies +[`train.py`](https://github.com/ultralytics/yolov5/blob/master/train.py) +from yolov5 to include a `sparseml-recipe` argument +to run SparseML optimizations with. This can be a file path to a local +SparseML recipe or a SparseZoo model stub prefixed by `zoo:` such as +`zoo:cv/detection/yolo_v3-spp/pytorch/ultralytics/coco/pruned-aggressive`. + +To load the base weights for a SparseZoo recipe as the initial checkpoint, set +`--initial-checkpoint` to `zoo`. To use the weights of a SparseZoo model as the +initial checkpoint, pass that model's SparseZoo stub prefixed by `zoo:` to the +`--initial-checkpoint` argument. + +Running the script will +follow the normal yolov5 training flow with the given SparseML optimizations enabled. + +Some considerations: + +* `--sparseml-recipe` is a required parameter +* `--epochs` will now be overridden by the epochs set in the SparseML recipe +* if using learning rate schedulers both with the yolov5 script and your recipe, they +may conflict with each other causing unintended side effects, so choose +hyperparameters accordingly +* Modifiers will log their outputs to the console as well as to the TensorBoard file +* After training is complete, the final model will be exported to ONNX using SparseML +* By default, EMA is disabled when using the integrated `train.py`. This is to allow +for best compatibility with pruning and quantization. To enable, set the `--use-ema` +flag +* By default, Automatic Mixed Precision (AMP) is disabled when using the integrated +`train.py`. This is because mixed precision is not supported for PyTorch +quantization-aware training. To enable, set the `--use-amp` flag + +You can learn how to build or download a recipe using the +[SparseML](https://github.com/neuralmagic/sparseml) +or [SparseZoo](https://github.com/neuralmagic/sparsezoo) +documentation, or export one with [Sparsify](https://github.com/neuralmagic/sparsify). + +Documentation on the original script can be found +[here](https://github.com/ultralytics/yolov5). +The latest commit hash that `train.py` is based on is included in the docstring. + + +#### Example Command +Call the script from the `yolov5` directory, passing in the same arguments as +`train.py`, with the additional SparseML argument(s) included. 
+```bash +python train.py \ + --sparseml-recipe /PATH/TO/RECIPE/recipe.yaml \ + +``` diff --git a/integrations/ultralytics/train.py b/integrations/ultralytics/train.py new file mode 100644 index 00000000000..e7ddbfdf51a --- /dev/null +++ b/integrations/ultralytics/train.py @@ -0,0 +1,734 @@ +# neuralmagic: no copyright +# flake8: noqa +# fmt: off +# isort: skip_file + +""" +Integration between https://github.com/ultralytics/yolov5 and SparseML + +This script is adapted from https://github.com/ultralytics/yolov5/blob/master/train.py +to apply a SparseML recipe from the required `--sparseml-recipe` argument. +Integration lines are preceded by comment blocks. Run with `--help` for help printout, +more information can be found in the readme file. + +Latest yolov5 commit this script is based on: +https://github.com/ultralytics/yolov5/tree/c9bda112aebaa0be846864f9d224191d0e19d419 +commit hash: c9bda11 +""" +import argparse +import logging +import math +import os +import random +import time +from pathlib import Path +from threading import Thread + +import numpy as np +import torch.distributed as dist +import torch.nn as nn +import torch.nn.functional as F +import torch.optim as optim +import torch.optim.lr_scheduler as lr_scheduler +import torch.utils.data +import yaml +from torch.cuda import amp +from torch.nn.parallel import DistributedDataParallel as DDP +from torch.utils.tensorboard import SummaryWriter +from tqdm import tqdm + +import test # import test.py to get mAP after each epoch +from models.experimental import attempt_load +from models.yolo import Model +from utils.autoanchor import check_anchors +from utils.datasets import create_dataloader +from utils.general import labels_to_class_weights, increment_path, labels_to_image_weights, init_seeds, \ + fitness, strip_optimizer, get_latest_run, check_dataset, check_file, check_git_status, check_img_size, \ + check_requirements, print_mutation, set_logging, one_cycle, colorstr +from utils.google_utils import attempt_download +from utils.loss import ComputeLoss +from utils.plots import plot_images, plot_labels, plot_results, plot_evolution +from utils.torch_utils import ModelEMA, select_device, intersect_dicts, torch_distributed_zero_first + +from sparseml.pytorch.optim import ScheduledModifierManager, ScheduledOptimizer +from sparseml.pytorch.utils import ModuleExporter, PythonLogger, TensorBoardLogger +from sparsezoo import Zoo + +logger = logging.getLogger(__name__) + + +def train(hyp, opt, device, tb_writer=None, wandb=None): + logger.info(colorstr('hyperparameters: ') + ', '.join(f'{k}={v}' for k, v in hyp.items())) + save_dir, epochs, batch_size, total_batch_size, weights, rank = \ + Path(opt.save_dir), opt.epochs, opt.batch_size, opt.total_batch_size, opt.weights, opt.global_rank + + # Directories + wdir = save_dir / 'weights' + wdir.mkdir(parents=True, exist_ok=True) # make dir + last = wdir / 'last.pt' + best = wdir / 'best.pt' + results_file = save_dir / 'results.txt' + + # Save run settings + with open(save_dir / 'hyp.yaml', 'w') as f: + yaml.dump(hyp, f, sort_keys=False) + with open(save_dir / 'opt.yaml', 'w') as f: + yaml.dump(vars(opt), f, sort_keys=False) + + # Configure + plots = not opt.evolve # create plots + cuda = device.type != 'cpu' + init_seeds(2 + rank) + with open(opt.data) as f: + data_dict = yaml.load(f, Loader=yaml.SafeLoader) # data dict + with torch_distributed_zero_first(rank): + check_dataset(data_dict) # check + train_path = data_dict['train'] + test_path = data_dict['val'] + nc = 1 if 
opt.single_cls else int(data_dict['nc']) # number of classes + names = ['item'] if opt.single_cls and len(data_dict['names']) != 1 else data_dict['names'] # class names + assert len(names) == nc, '%g names found for nc=%g dataset in %s' % (len(names), nc, opt.data) # check + + # Model + pretrained = weights.endswith('.pt') + if pretrained: + with torch_distributed_zero_first(rank): + attempt_download(weights) # download if not found locally + ckpt = torch.load(weights, map_location=device) # load checkpoint + if hyp.get('anchors'): + ckpt['model'].yaml['anchors'] = round(hyp['anchors']) # force autoanchor + model = Model(opt.cfg or ckpt['model'].yaml, ch=3, nc=nc).to(device) # create + exclude = ['anchor'] if opt.cfg or hyp.get('anchors') else [] # exclude keys + state_dict = ckpt['model'].float().state_dict() # to FP32 + state_dict = intersect_dicts(state_dict, model.state_dict(), exclude=exclude) # intersect + model.load_state_dict(state_dict, strict=False) # load + logger.info('Transferred %g/%g items from %s' % (len(state_dict), len(model.state_dict()), weights)) # report + else: + model = Model(opt.cfg, ch=3, nc=nc).to(device) # create + + # Freeze + freeze = [] # parameter names to freeze (full or partial) + for k, v in model.named_parameters(): + v.requires_grad = True # train all layers + if any(x in k for x in freeze): + print('freezing %s' % k) + v.requires_grad = False + + # Optimizer + nbs = 64 # nominal batch size + accumulate = max(round(nbs / total_batch_size), 1) # accumulate loss before optimizing + hyp['weight_decay'] *= total_batch_size * accumulate / nbs # scale weight_decay + logger.info(f"Scaled weight_decay = {hyp['weight_decay']}") + + pg0, pg1, pg2 = [], [], [] # optimizer parameter groups + for k, v in model.named_modules(): + if hasattr(v, 'bias') and isinstance(v.bias, nn.Parameter): + pg2.append(v.bias) # biases + if isinstance(v, nn.BatchNorm2d): + pg0.append(v.weight) # no decay + elif hasattr(v, 'weight') and isinstance(v.weight, nn.Parameter): + pg1.append(v.weight) # apply decay + + if opt.adam: + optimizer = optim.Adam(pg0, lr=hyp['lr0'], betas=(hyp['momentum'], 0.999)) # adjust beta1 to momentum + else: + optimizer = optim.SGD(pg0, lr=hyp['lr0'], momentum=hyp['momentum'], nesterov=True) + + optimizer.add_param_group({'params': pg1, 'weight_decay': hyp['weight_decay']}) # add pg1 with weight_decay + optimizer.add_param_group({'params': pg2}) # add pg2 (biases) + logger.info('Optimizer groups: %g .bias, %g conv.weight, %g other' % (len(pg2), len(pg1), len(pg0))) + del pg0, pg1, pg2 + + # Scheduler https://arxiv.org/pdf/1812.01187.pdf + # https://pytorch.org/docs/stable/_modules/torch/optim/lr_scheduler.html#OneCycleLR + if opt.linear_lr: + lf = lambda x: (1 - x / (epochs - 1)) * (1.0 - hyp['lrf']) + hyp['lrf'] # linear + else: + lf = one_cycle(1, hyp['lrf'], epochs) # cosine 1->hyp['lrf'] + scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf) + # plot_lr_scheduler(optimizer, scheduler, epochs) + + # Logging + if rank in [-1, 0] and wandb and wandb.run is None: + opt.hyp = hyp # add hyperparameters + wandb_run = wandb.init(config=opt, resume="allow", + project='YOLOv5' if opt.project == 'runs/train' else Path(opt.project).stem, + name=save_dir.stem, + id=ckpt.get('wandb_id') if 'ckpt' in locals() else None) + loggers = {'wandb': wandb} # loggers dict + + # Resume + start_epoch, best_fitness = 0, 0.0 + if pretrained: + # Optimizer + if ckpt['optimizer'] is not None: + optimizer.load_state_dict(ckpt['optimizer']) + best_fitness = ckpt['best_fitness'] + 
+ # Results + if ckpt.get('training_results') is not None: + with open(results_file, 'w') as file: + file.write(ckpt['training_results']) # write results.txt + + # Epochs + start_epoch = ckpt['epoch'] + 1 + if opt.resume: + assert start_epoch > 0, '%s training to %g epochs is finished, nothing to resume.' % (weights, epochs) + if epochs < start_epoch: + logger.info('%s has been trained for %g epochs. Fine-tuning for %g additional epochs.' % + (weights, ckpt['epoch'], epochs)) + epochs += ckpt['epoch'] # finetune additional epochs + + del ckpt, state_dict + + # Image sizes + gs = int(model.stride.max()) # grid size (max stride) + nl = model.model[-1].nl # number of detection layers (used for scaling hyp['obj']) + imgsz, imgsz_test = [check_img_size(x, gs) for x in opt.img_size] # verify imgsz are gs-multiples + + # DP mode + if cuda and rank == -1 and torch.cuda.device_count() > 1: + model = torch.nn.DataParallel(model) + + # SyncBatchNorm + if opt.sync_bn and cuda and rank != -1: + model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model).to(device) + logger.info('Using SyncBatchNorm()') + + # EMA + #################################################################################### + # Start SparseML Integration - optional EMA + #################################################################################### + ema = ModelEMA(model) if rank in [-1, 0] and opt.use_ema else None + #################################################################################### + # End SparseML Integration - optional EMA + #################################################################################### + + # DDP mode + if cuda and rank != -1: + model = DDP(model, device_ids=[opt.local_rank], output_device=opt.local_rank) + + # Trainloader + dataloader, dataset = create_dataloader(train_path, imgsz, batch_size, gs, opt, + hyp=hyp, augment=True, cache=opt.cache_images, rect=opt.rect, rank=rank, + world_size=opt.world_size, workers=opt.workers, + image_weights=opt.image_weights, quad=opt.quad, prefix=colorstr('train: ')) + mlc = np.concatenate(dataset.labels, 0)[:, 0].max() # max label class + nb = len(dataloader) # number of batches + assert mlc < nc, 'Label class %g exceeds nc=%g in %s. Possible class labels are 0-%g' % (mlc, nc, opt.data, nc - 1) + + # Process 0 + if rank in [-1, 0]: + if ema: + ema.updates = start_epoch * nb // accumulate # set EMA updates + testloader = create_dataloader(test_path, imgsz_test, batch_size * 2, gs, opt, # testloader + hyp=hyp, cache=opt.cache_images and not opt.notest, rect=True, rank=-1, + world_size=opt.world_size, workers=opt.workers, + pad=0.5, prefix=colorstr('val: '))[0] + + if not opt.resume: + labels = np.concatenate(dataset.labels, 0) + c = torch.tensor(labels[:, 0]) # classes + # cf = torch.bincount(c.long(), minlength=nc) + 1. # frequency + # model._initialize_biases(cf.to(device)) + if plots: + plot_labels(labels, save_dir, loggers) + if tb_writer: + tb_writer.add_histogram('classes', c, 0) + + # Anchors + if not opt.noautoanchor: + check_anchors(dataset, model=model, thr=hyp['anchor_t'], imgsz=imgsz) + + # Model parameters + hyp['box'] *= 3. / nl # scale to layers + hyp['cls'] *= nc / 80. * 3. / nl # scale to classes and layers + hyp['obj'] *= (imgsz / 640) ** 2 * 3. 
/ nl # scale to image size and layers + model.nc = nc # attach number of classes to model + model.hyp = hyp # attach hyperparameters to model + model.gr = 1.0 # iou loss ratio (obj_loss = 1.0 or iou) + model.class_weights = labels_to_class_weights(dataset.labels, nc).to(device) * nc # attach class weights + model.names = names + + #################################################################################### + # Start SparseML Integration + #################################################################################### + manager = ScheduledModifierManager.from_yaml(opt.sparseml_recipe) + optimizer = ScheduledOptimizer( + optimizer, + model, + manager, + steps_per_epoch=len(dataloader), + loggers=[PythonLogger(), TensorBoardLogger(writer=tb_writer)] + ) + # override lr scheduler if recipe makes any LR updates + if any("LearningRate" in str(modifier) for modifier in manager.modifiers): + logger.info("Disabling yolo LR scheduler, managing LR using SparseML recipe") + scheduler = None + if manager.max_epochs: + epochs = manager.max_epochs or epochs # override num_epochs + logger.info( + f"overriding number of epochs from SparseML manager to {manager.max_epochs}" + ) + #################################################################################### + # End SparseML Integration + #################################################################################### + + # Start training + t0 = time.time() + nw = max(round(hyp['warmup_epochs'] * nb), 1000) # number of warmup iterations, max(3 epochs, 1k iterations) + # nw = min(nw, (epochs - start_epoch) / 2 * nb) # limit warmup to < 1/2 of training + maps = np.zeros(nc) # mAP per class + results = (0, 0, 0, 0, 0, 0, 0) # P, R, mAP@.5, mAP@.5-.95, val_loss(box, obj, cls) + if scheduler: + scheduler.last_epoch = start_epoch - 1 # do not move + scaler = amp.GradScaler(enabled=(cuda and opt.use_amp)) + compute_loss = ComputeLoss(model) # init loss class + logger.info(f'Image sizes {imgsz} train, {imgsz_test} test\n' + f'Using {dataloader.num_workers} dataloader workers\n' + f'Logging results to {save_dir}\n' + f'Starting training for {epochs} epochs...') + for epoch in range(start_epoch, epochs): # epoch ------------------------------------------------------------------ + model.train() + + # Update image weights (optional) + if opt.image_weights: + # Generate indices + if rank in [-1, 0]: + cw = model.class_weights.cpu().numpy() * (1 - maps) ** 2 / nc # class weights + iw = labels_to_image_weights(dataset.labels, nc=nc, class_weights=cw) # image weights + dataset.indices = random.choices(range(dataset.n), weights=iw, k=dataset.n) # rand weighted idx + # Broadcast if DDP + if rank != -1: + indices = (torch.tensor(dataset.indices) if rank == 0 else torch.zeros(dataset.n)).int() + dist.broadcast(indices, 0) + if rank != 0: + dataset.indices = indices.cpu().numpy() + + # Update mosaic border + # b = int(random.uniform(0.25 * imgsz, 0.75 * imgsz + gs) // gs * gs) + # dataset.mosaic_border = [b - imgsz, -b] # height, width borders + + mloss = torch.zeros(4, device=device) # mean losses + if rank != -1: + dataloader.sampler.set_epoch(epoch) + pbar = enumerate(dataloader) + logger.info(('\n' + '%10s' * 8) % ('Epoch', 'gpu_mem', 'box', 'obj', 'cls', 'total', 'targets', 'img_size')) + if rank in [-1, 0]: + pbar = tqdm(pbar, total=nb) # progress bar + optimizer.zero_grad() + for i, (imgs, targets, paths, _) in pbar: # batch ------------------------------------------------------------- + ni = i + nb * epoch # number integrated batches (since train 
start) + imgs = imgs.to(device, non_blocking=True).float() / 255.0 # uint8 to float32, 0-255 to 0.0-1.0 + + # Warmup + if ni <= nw: + xi = [0, nw] # x interp + # model.gr = np.interp(ni, xi, [0.0, 1.0]) # iou loss ratio (obj_loss = 1.0 or iou) + accumulate = max(1, np.interp(ni, xi, [1, nbs / total_batch_size]).round()) + for j, x in enumerate(optimizer.param_groups): + # bias lr falls from 0.1 to lr0, all other lrs rise from 0.0 to lr0 + x['lr'] = np.interp(ni, xi, [hyp['warmup_bias_lr'] if j == 2 else 0.0, x['initial_lr'] * lf(epoch)]) + if 'momentum' in x: + x['momentum'] = np.interp(ni, xi, [hyp['warmup_momentum'], hyp['momentum']]) + + # Multi-scale + if opt.multi_scale: + sz = random.randrange(imgsz * 0.5, imgsz * 1.5 + gs) // gs * gs # size + sf = sz / max(imgs.shape[2:]) # scale factor + if sf != 1: + ns = [math.ceil(x * sf / gs) * gs for x in imgs.shape[2:]] # new shape (stretched to gs-multiple) + imgs = F.interpolate(imgs, size=ns, mode='bilinear', align_corners=False) + + # Forward + with amp.autocast(enabled=(cuda and opt.use_amp)): + pred = model(imgs) # forward + loss, loss_items = compute_loss(pred, targets.to(device)) # loss scaled by batch_size + if rank != -1: + loss *= opt.world_size # gradient averaged between devices in DDP mode + if opt.quad: + loss *= 4. + + # Backward + scaler.scale(loss).backward() + + # Optimize + if ni % accumulate == 0: + scaler.step(optimizer) # optimizer.step + scaler.update() + optimizer.zero_grad() + if ema: + ema.update(model) + + # Print + if rank in [-1, 0]: + mloss = (mloss * i + loss_items) / (i + 1) # update mean losses + mem = '%.3gG' % (torch.cuda.memory_reserved() / 1E9 if torch.cuda.is_available() else 0) # (GB) + s = ('%10s' * 2 + '%10.4g' * 6) % ( + '%g/%g' % (epoch, epochs - 1), mem, *mloss, targets.shape[0], imgs.shape[-1]) + pbar.set_description(s) + + # Plot + if plots and ni < 3: + f = save_dir / f'train_batch{ni}.jpg' # filename + Thread(target=plot_images, args=(imgs, targets, paths, f), daemon=True).start() + # if tb_writer: + # tb_writer.add_image(f, result, dataformats='HWC', global_step=epoch) + # tb_writer.add_graph(model, imgs) # add model to tensorboard + elif plots and ni == 10 and wandb: + wandb.log({"Mosaics": [wandb.Image(str(x), caption=x.name) for x in save_dir.glob('train*.jpg') + if x.exists()]}, commit=False) + + # end batch ------------------------------------------------------------------------------------------------ + # end epoch ---------------------------------------------------------------------------------------------------- + + # Scheduler + lr = [x['lr'] for x in optimizer.param_groups] # for tensorboard + if scheduler: + scheduler.step() + + # DDP process 0 or single-GPU + if rank in [-1, 0]: + # mAP + if ema: + ema.update_attr(model, include=['yaml', 'nc', 'hyp', 'gr', 'names', 'stride', 'class_weights']) + + final_epoch = epoch + 1 == epochs + if not opt.notest or final_epoch: # Calculate mAP + results, maps, times = test.test(opt.data, + batch_size=batch_size * 2, + imgsz=imgsz_test, + model=ema.ema if ema else model, + single_cls=opt.single_cls, + dataloader=testloader, + save_dir=save_dir, + verbose=nc < 50 and final_epoch, + plots=plots and final_epoch, + log_imgs=opt.log_imgs if wandb else 0, + compute_loss=compute_loss) + + # Write + with open(results_file, 'a') as f: + f.write(s + '%10.4g' * 7 % results + '\n') # P, R, mAP@.5, mAP@.5-.95, val_loss(box, obj, cls) + if len(opt.name) and opt.bucket: + os.system('gsutil cp %s gs://%s/results/results%s.txt' % (results_file, opt.bucket, 
opt.name))
+
+            # Log
+            tags = ['train/box_loss', 'train/obj_loss', 'train/cls_loss',  # train loss
+                    'metrics/precision', 'metrics/recall', 'metrics/mAP_0.5', 'metrics/mAP_0.5:0.95',
+                    'val/box_loss', 'val/obj_loss', 'val/cls_loss',  # val loss
+                    'x/lr0', 'x/lr1', 'x/lr2']  # params
+            for x, tag in zip(list(mloss[:-1]) + list(results) + lr, tags):
+                if tb_writer:
+                    tb_writer.add_scalar(tag, x, epoch)  # tensorboard
+                if wandb:
+                    wandb.log({tag: x}, step=epoch, commit=tag == tags[-1])  # W&B
+
+            # Update best mAP
+            fi = fitness(np.array(results).reshape(1, -1))  # weighted combination of [P, R, mAP@.5, mAP@.5-.95]
+            if fi > best_fitness:
+                best_fitness = fi
+
+            # Save model
+            save = (not opt.nosave) or (final_epoch and not opt.evolve)
+            if save:
+                with open(results_file, 'r') as f:  # create checkpoint
+                    ckpt = {'epoch': epoch,
+                            'best_fitness': best_fitness,
+                            'training_results': f.read(),
+                            'model': ema.ema if ema else model,
+                            'optimizer': None if final_epoch else optimizer.state_dict(),
+                            'wandb_id': wandb_run.id if wandb else None}
+
+                # Save last, best and delete
+                torch.save(ckpt, last)
+                if best_fitness == fi:
+                    torch.save(ckpt, best)
+                del ckpt
+        # end epoch ----------------------------------------------------------------------------------------------------
+    # end training
+
+    if rank in [-1, 0]:
+        # Strip optimizers
+        final = best if best.exists() else last  # final model
+        for f in [last, best]:
+            if f.exists():
+                strip_optimizer(f)  # strip optimizers
+        if opt.bucket:
+            os.system(f'gsutil cp {final} gs://{opt.bucket}/weights')  # upload
+
+        # Plots
+        if plots:
+            plot_results(save_dir=save_dir)  # save as results.png
+            if wandb:
+                files = ['results.png', 'confusion_matrix.png', *[f'{x}_curve.png' for x in ('F1', 'PR', 'P', 'R')]]
+                wandb.log({"Results": [wandb.Image(str(save_dir / f), caption=f) for f in files
+                                       if (save_dir / f).exists()]})
+                if opt.log_artifacts:
+                    wandb.log_artifact(artifact_or_path=str(final), type='model', name=save_dir.stem)
+
+        # Test best.pt
+        logger.info('%g epochs completed in %.3f hours.\n' % (epoch - start_epoch + 1, (time.time() - t0) / 3600))
+        if opt.data.endswith('coco.yaml') and nc == 80:  # if COCO
+            for conf, iou, save_json in ([0.25, 0.45, False], [0.001, 0.65, True]):  # speed, mAP tests
+                results, _, _ = test.test(opt.data,
+                                          batch_size=batch_size * 2,
+                                          imgsz=imgsz_test,
+                                          conf_thres=conf,
+                                          iou_thres=iou,
+                                          model=attempt_load(final, device).half(),
+                                          single_cls=opt.single_cls,
+                                          dataloader=testloader,
+                                          save_dir=save_dir,
+                                          save_json=save_json,
+                                          plots=False)
+        #################################################################################
+        # Start SparseML ONNX Export
+        #################################################################################
+        logger.info(
+            f"training complete, exporting ONNX to {save_dir}/model.onnx"
+        )
+        exporter = ModuleExporter(model, save_dir)
+        exporter.export_onnx(torch.randn(1, 3, imgsz, imgsz))
+        #################################################################################
+        # End SparseML ONNX Export
+        #################################################################################
+
+    else:
+        dist.destroy_process_group()
+
+    wandb.run.finish() if wandb and wandb.run else None
+    torch.cuda.empty_cache()
+    return results
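
Stepping back from the full script, the integration above reduces to one compact recipe-driven pattern: load a manager from a recipe, wrap the optimizer, train, export. A minimal sketch follows; the model, data, and recipe path are hypothetical placeholders, and the import paths assume the SparseML PyTorch API used elsewhere in this diff.

```python
import torch
from sparseml.pytorch.optim import ScheduledModifierManager, ScheduledOptimizer
from sparseml.pytorch.utils import ModuleExporter, PythonLogger

# placeholder model and data standing in for YOLOv5 and its dataloader
model = torch.nn.Sequential(torch.nn.Flatten(), torch.nn.Linear(784, 10))
data = [(torch.randn(8, 1, 28, 28), torch.randint(0, 10, (8,)))] * 10
base_optim = torch.optim.SGD(model.parameters(), lr=0.1)

# "/path/to/local/recipe.yaml" is a placeholder; 'zoo:' stubs also resolve here
manager = ScheduledModifierManager.from_yaml("/path/to/local/recipe.yaml")
optimizer = ScheduledOptimizer(
    base_optim, model, manager, steps_per_epoch=len(data), loggers=[PythonLogger()]
)

for epoch in range(manager.max_epochs or 1):
    for inputs, labels in data:
        optimizer.zero_grad()
        loss = torch.nn.functional.cross_entropy(model(inputs), labels)
        loss.backward()
        optimizer.step()  # also advances the recipe's modifiers

# export the sparsified module the same way the integration above does
ModuleExporter(model, "exported").export_onnx(torch.randn(1, 1, 28, 28))
```
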
+ "--sparseml-recipe", + required=True, + type=str, + help="path to a SparseML recipe file or a SparseZoo model stub for a recipe to load. " + "SparseZoo stubs should be preceded by 'zoo:'. i.e. '/path/to/local/recipe.yaml', " + "'zoo:zoo/model/stub'" + ) + parser.add_argument( + "--weights", + type=str, + default="yolov5s.pt", + help="initial weights path. can be a local file path, can pass in 'zoo' if " + "using a SparseZoo recipe to load that recipes base weights, or pass in a " + "SparseZoo model stub, prefixed with 'zoo:' to load weights directly from " + "SparseZoo", + ) + parser.add_argument( + "--use-ema", + action="store_true", + help="set flag to enable EMA updates. disabled by default in SparseML integration" + ) + parser.add_argument( + "--use-amp", + action="store_true", + help="set flag to enable Automatic Mixed Precision (AMP). disabled by default " + "in SparseML integration" + ) + #################################################################################### + # End SparseML arguments + #################################################################################### + parser.add_argument('--cfg', type=str, default='', help='model.yaml path') + parser.add_argument('--data', type=str, default='data/coco128.yaml', help='data.yaml path') + parser.add_argument('--hyp', type=str, default='data/hyp.scratch.yaml', help='hyperparameters path') + parser.add_argument('--epochs', type=int, default=300) + parser.add_argument('--batch-size', type=int, default=16, help='total batch size for all GPUs') + parser.add_argument('--img-size', nargs='+', type=int, default=[640, 640], help='[train, test] image sizes') + parser.add_argument('--rect', action='store_true', help='rectangular training') + parser.add_argument('--resume', nargs='?', const=True, default=False, help='resume most recent training') + parser.add_argument('--nosave', action='store_true', help='only save final checkpoint') + parser.add_argument('--notest', action='store_true', help='only test final epoch') + parser.add_argument('--noautoanchor', action='store_true', help='disable autoanchor check') + parser.add_argument('--evolve', action='store_true', help='evolve hyperparameters') + parser.add_argument('--bucket', type=str, default='', help='gsutil bucket') + parser.add_argument('--cache-images', action='store_true', help='cache images for faster training') + parser.add_argument('--image-weights', action='store_true', help='use weighted image selection for training') + parser.add_argument('--device', default='', help='cuda device, i.e. 0 or 0,1,2,3 or cpu') + parser.add_argument('--multi-scale', action='store_true', help='vary img-size +/- 50%%') + parser.add_argument('--single-cls', action='store_true', help='train multi-class data as single-class') + parser.add_argument('--adam', action='store_true', help='use torch.optim.Adam() optimizer') + parser.add_argument('--sync-bn', action='store_true', help='use SyncBatchNorm, only available in DDP mode') + parser.add_argument('--local_rank', type=int, default=-1, help='DDP parameter, do not modify') + parser.add_argument('--log-imgs', type=int, default=16, help='number of images for W&B logging, max 100') + parser.add_argument('--log-artifacts', action='store_true', help='log artifacts, i.e. 
+    parser.add_argument('--workers', type=int, default=8, help='maximum number of dataloader workers')
+    parser.add_argument('--project', default='runs/train', help='save to project/name')
+    parser.add_argument('--name', default='exp', help='save to project/name')
+    parser.add_argument('--exist-ok', action='store_true', help='existing project/name ok, do not increment')
+    parser.add_argument('--quad', action='store_true', help='quad dataloader')
+    parser.add_argument('--linear-lr', action='store_true', help='linear LR')
+    opt = parser.parse_args()
+
+    # Set DDP variables
+    opt.world_size = int(os.environ['WORLD_SIZE']) if 'WORLD_SIZE' in os.environ else 1
+    opt.global_rank = int(os.environ['RANK']) if 'RANK' in os.environ else -1
+    set_logging(opt.global_rank)
+    if opt.global_rank in [-1, 0]:
+        check_git_status()
+        check_requirements()
+
+    ####################################################################################
+    # Start - SparseML optional load weights from SparseZoo
+    ####################################################################################
+    if opt.weights == "zoo":
+        # Load checkpoint from base weights associated with given SparseZoo recipe
+        if opt.sparseml_recipe.startswith("zoo:"):
+            opt.weights = Zoo.download_recipe_base_framework_files(
+                opt.sparseml_recipe,
+                extensions=[".pt", ".pth"]
+            )[0]
+        else:
+            raise ValueError(
+                "Attempting to load weights from SparseZoo recipe, but not given a "
+                "SparseZoo recipe stub. When --weights is set to 'zoo', "
+                "--sparseml-recipe must start with 'zoo:' and be a SparseZoo model "
+                f"stub. sparseml-recipe was set to {opt.sparseml_recipe}"
+            )
+    elif opt.weights.startswith("zoo:"):
+        # Load weights from a SparseZoo model stub
+        zoo_model = Zoo.load_model_from_stub(opt.weights)
+        opt.weights = zoo_model.download_framework_files(
+            extensions=[".pt", ".pth"]
+        )[0]
+    ####################################################################################
+    # End - SparseML optional load weights from SparseZoo
+    ####################################################################################
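
Both SparseZoo branches above can be exercised on their own; a minimal sketch using the stub conventions from this diff's docstrings ('zoo:model/stub/path' is a placeholder, not a real model):

```python
from sparsezoo import Zoo

stub = "zoo:model/stub/path"  # placeholder SparseZoo stub

# --weights "zoo": base (pre-sparsification) weights tied to the stub's recipe
base_weights = Zoo.download_recipe_base_framework_files(
    stub, extensions=[".pt", ".pth"]
)[0]

# --weights "zoo:...": framework weights of the sparsified model itself
model_weights = Zoo.load_model_from_stub(stub).download_framework_files(
    extensions=[".pt", ".pth"]
)[0]
```
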
+
+    # Resume
+    if opt.resume:  # resume an interrupted run
+        ckpt = opt.resume if isinstance(opt.resume, str) else get_latest_run()  # specified or most recent path
+        assert os.path.isfile(ckpt), 'ERROR: --resume checkpoint does not exist'
+        apriori = opt.global_rank, opt.local_rank
+        with open(Path(ckpt).parent.parent / 'opt.yaml') as f:
+            opt = argparse.Namespace(**yaml.load(f, Loader=yaml.SafeLoader))  # replace
+        opt.cfg, opt.weights, opt.resume, opt.batch_size, opt.global_rank, opt.local_rank = '', ckpt, True, opt.total_batch_size, *apriori  # reinstate
+        logger.info('Resuming training from %s' % ckpt)
+    else:
+        # opt.hyp = opt.hyp or ('hyp.finetune.yaml' if opt.weights else 'hyp.scratch.yaml')
+        opt.data, opt.cfg, opt.hyp = check_file(opt.data), check_file(opt.cfg), check_file(opt.hyp)  # check files
+        assert len(opt.cfg) or len(opt.weights), 'either --cfg or --weights must be specified'
+        opt.img_size.extend([opt.img_size[-1]] * (2 - len(opt.img_size)))  # extend to 2 sizes (train, test)
+        opt.name = 'evolve' if opt.evolve else opt.name
+        opt.save_dir = increment_path(Path(opt.project) / opt.name, exist_ok=opt.exist_ok | opt.evolve)  # increment run
+
+    # DDP mode
+    opt.total_batch_size = opt.batch_size
+    device = select_device(opt.device, batch_size=opt.batch_size)
+    if opt.local_rank != -1:
+        assert torch.cuda.device_count() > opt.local_rank
+        torch.cuda.set_device(opt.local_rank)
+        device = torch.device('cuda', opt.local_rank)
+        dist.init_process_group(backend='nccl', init_method='env://')  # distributed backend
+        assert opt.batch_size % opt.world_size == 0, '--batch-size must be multiple of CUDA device count'
+        opt.batch_size = opt.total_batch_size // opt.world_size
+
+    # Hyperparameters
+    with open(opt.hyp) as f:
+        hyp = yaml.load(f, Loader=yaml.SafeLoader)  # load hyps
+
+    # Train
+    logger.info(opt)
+    try:
+        import wandb
+    except ImportError:
+        wandb = None
+        prefix = colorstr('wandb: ')
+        logger.info(f"{prefix}Install Weights & Biases for YOLOv5 logging with 'pip install wandb' (recommended)")
+    if not opt.evolve:
+        tb_writer = None  # init loggers
+        if opt.global_rank in [-1, 0]:
+            logger.info(f'Start Tensorboard with "tensorboard --logdir {opt.project}", view at http://localhost:6006/')
+            tb_writer = SummaryWriter(opt.save_dir)  # Tensorboard
+        train(hyp, opt, device, tb_writer, wandb)
+
+    # Evolve hyperparameters (optional)
+    else:
+        # Hyperparameter evolution metadata (mutation scale 0-1, lower_limit, upper_limit)
+        meta = {'lr0': (1, 1e-5, 1e-1),  # initial learning rate (SGD=1E-2, Adam=1E-3)
+                'lrf': (1, 0.01, 1.0),  # final OneCycleLR learning rate (lr0 * lrf)
+                'momentum': (0.3, 0.6, 0.98),  # SGD momentum/Adam beta1
+                'weight_decay': (1, 0.0, 0.001),  # optimizer weight decay
+                'warmup_epochs': (1, 0.0, 5.0),  # warmup epochs (fractions ok)
+                'warmup_momentum': (1, 0.0, 0.95),  # warmup initial momentum
+                'warmup_bias_lr': (1, 0.0, 0.2),  # warmup initial bias lr
+                'box': (1, 0.02, 0.2),  # box loss gain
+                'cls': (1, 0.2, 4.0),  # cls loss gain
+                'cls_pw': (1, 0.5, 2.0),  # cls BCELoss positive_weight
+                'obj': (1, 0.2, 4.0),  # obj loss gain (scale with pixels)
+                'obj_pw': (1, 0.5, 2.0),  # obj BCELoss positive_weight
+                'iou_t': (0, 0.1, 0.7),  # IoU training threshold
+                'anchor_t': (1, 2.0, 8.0),  # anchor-multiple threshold
+                'anchors': (2, 2.0, 10.0),  # anchors per output grid (0 to ignore)
+                'fl_gamma': (0, 0.0, 2.0),  # focal loss gamma (efficientDet default gamma=1.5)
+                'hsv_h': (1, 0.0, 0.1),  # image HSV-Hue augmentation (fraction)
+                'hsv_s': (1, 0.0, 0.9),  # image HSV-Saturation augmentation (fraction)
+                'hsv_v': (1, 0.0, 0.9),  # image HSV-Value augmentation (fraction)
+                'degrees': (1, 0.0, 45.0),  # image rotation (+/- deg)
+                'translate': (1, 0.0, 0.9),  # image translation (+/- fraction)
+                'scale': (1, 0.0, 0.9),  # image scale (+/- gain)
+                'shear': (1, 0.0, 10.0),  # image shear (+/- deg)
+                'perspective': (0, 0.0, 0.001),  # image perspective (+/- fraction), range 0-0.001
+                'flipud': (1, 0.0, 1.0),  # image flip up-down (probability)
+                'fliplr': (0, 0.0, 1.0),  # image flip left-right (probability)
+                'mosaic': (1, 0.0, 1.0),  # image mosaic (probability)
+                'mixup': (1, 0.0, 1.0)}  # image mixup (probability)
+
+        assert opt.local_rank == -1, 'DDP mode not implemented for --evolve'
+        opt.notest, opt.nosave = True, True  # only test/save final epoch
+        # ei = [isinstance(x, (int, float)) for x in hyp.values()]  # evolvable indices
+        yaml_file = Path(opt.save_dir) / 'hyp_evolved.yaml'  # save best result here
+        if opt.bucket:
+            os.system('gsutil cp gs://%s/evolve.txt .'
% opt.bucket) # download evolve.txt if exists + + for _ in range(300): # generations to evolve + if Path('evolve.txt').exists(): # if evolve.txt exists: select best hyps and mutate + # Select parent(s) + parent = 'single' # parent selection method: 'single' or 'weighted' + x = np.loadtxt('evolve.txt', ndmin=2) + n = min(5, len(x)) # number of previous results to consider + x = x[np.argsort(-fitness(x))][:n] # top n mutations + w = fitness(x) - fitness(x).min() # weights + if parent == 'single' or len(x) == 1: + # x = x[random.randint(0, n - 1)] # random selection + x = x[random.choices(range(n), weights=w)[0]] # weighted selection + elif parent == 'weighted': + x = (x * w.reshape(n, 1)).sum(0) / w.sum() # weighted combination + + # Mutate + mp, s = 0.8, 0.2 # mutation probability, sigma + npr = np.random + npr.seed(int(time.time())) + g = np.array([x[0] for x in meta.values()]) # gains 0-1 + ng = len(meta) + v = np.ones(ng) + while all(v == 1): # mutate until a change occurs (prevent duplicates) + v = (g * (npr.random(ng) < mp) * npr.randn(ng) * npr.random() * s + 1).clip(0.3, 3.0) + for i, k in enumerate(hyp.keys()): # plt.hist(v.ravel(), 300) + hyp[k] = float(x[i + 7] * v[i]) # mutate + + # Constrain to limits + for k, v in meta.items(): + hyp[k] = max(hyp[k], v[1]) # lower limit + hyp[k] = min(hyp[k], v[2]) # upper limit + hyp[k] = round(hyp[k], 5) # significant digits + + # Train mutation + results = train(hyp.copy(), opt, device, wandb=wandb) + + # Write mutation results + print_mutation(hyp.copy(), results, yaml_file, opt.bucket) + + # Plot results + plot_evolution(yaml_file) + print(f'Hyperparameter evolution complete. Best results saved as: {yaml_file}\n' + f'Command to train a new model with these hyperparameters: $ python train.py --hyp {yaml_file}') diff --git a/notebooks/README.md b/notebooks/README.md index 5aceb5d5312..8170838d05f 100644 --- a/notebooks/README.md +++ b/notebooks/README.md @@ -46,7 +46,7 @@ ssh -N -f -L localhost:6006:localhost:6006 user@remote_ip_address You may need to enable the Jupyter extension to properly see the UIs with the following command: ```bash -jupyter nbextension enable --py widgetsnbextension. +jupyter nbextension enable --py widgetsnbextension ``` diff --git a/scripts/README.md b/scripts/README.md index 3d5095fbb30..9a7bfa662c7 100644 --- a/scripts/README.md +++ b/scripts/README.md @@ -23,7 +23,7 @@ model training, pruning, quantization, exporting, and sensitivity analysis. These scripts natively support models in the SparseML submodules, however can be adapted for use with your own models or used to inspire new workflows. To see examples of simple integrations with SparseML check out our [notebooks](https://github.com/neuralmagic/sparseml/tree/main/notebooks) -and [examples](https://github.com/neuralmagic/sparseml/tree/main/examples). +and [integrations](https://github.com/neuralmagic/sparseml/tree/main/integrations). To run one of the scripts, invoke it with a Python command from the command line along with the relevant arguments. @@ -39,4 +39,4 @@ run any script with `-h` or `--help` to see the help printout. 
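
The recipe arguments accepted by the scripts listed below resolve through the same library entry point changed later in this diff; a minimal sketch, with placeholder paths and stubs:

```python
from sparseml.pytorch.optim import ScheduledModifierManager

# a local recipe file and a SparseZoo stub load through the same call;
# both values below are placeholders
manager = ScheduledModifierManager.from_yaml("/path/to/local/recipe.yaml")
manager = ScheduledModifierManager.from_yaml(
    "zoo:model/stub/path?recipe_type=transfer"
)
```
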
 | Script | Description |
 |----------|-------------|
 | [PyTorch Vision](https://github.com/neuralmagic/sparseml/blob/main/scripts/pytorch_vision.py) | Script for training, optimization, export, pruning sensitivity analysis, or learning rate sensitivity analysis of PyTorch classification and Detection models |
-| [TensorFlow V1 Classification](https://github.com/neuralmagic/sparseml/blob/main/scripts/tensorflow_v1_classification.py) | Script for training, optimization, export, or pruning sensitivity analysis of TensorFlow V1 classification models |
\ No newline at end of file
+| [TensorFlow V1 Classification](https://github.com/neuralmagic/sparseml/blob/main/scripts/tensorflow_v1_classification.py) | Script for training, optimization, export, or pruning sensitivity analysis of TensorFlow V1 classification models |
diff --git a/scripts/pytorch_vision.py b/scripts/pytorch_vision.py
index d38fac15c66..73b2d6ff646 100644
--- a/scripts/pytorch_vision.py
+++ b/scripts/pytorch_vision.py
@@ -80,7 +80,9 @@
     --checkpoint-path CHECKPOINT_PATH
                         A path to a previous checkpoint to load the state from
                         and resume the state for. If provided, pretrained will
-                        be ignored
+                        be ignored. If using a SparseZoo recipe, can also
+                        provide 'zoo' to load the base weights associated with
+                        that recipe
     --model-kwargs MODEL_KWARGS
                         kew word arguments to be passed to model constructor,
                         should be given as a json object
@@ -112,9 +114,9 @@
                         in as a json object
     --recipe-path RECIPE_PATH
                         The path to the yaml file containing the modifiers and
-                        schedule to apply them with. If set to
-                        'transfer_learning', then will create a schedule to
-                        enable sparse transfer learning
+                        schedule to apply them with. Can also provide a
+                        SparseZoo stub prefixed with 'zoo:' with an optional
+                        '?recipe_type=' argument
     --sparse-transfer-learn
                         Enable sparse transfer learning modifiers to enforce
                         the sparsity for already sparse layers. The modifiers
@@ -462,6 +464,7 @@
     torch_distributed_zero_first,
 )
 from sparseml.utils import convert_to_bool, create_dirs
+from sparsezoo import Zoo
 
 
 LOGGER = get_main_logger()
@@ -540,12 +543,20 @@ def parse_args():
         "Default is None which will load the default dataset for the architecture."
         " Ex can be set to imagenet, cifar10, etc",
     )
+    checkpoint_path_help = (
+        "A path to a previous checkpoint to load the state from and "
+        "resume the state for. If provided, pretrained will be ignored"
+    )
+    if par == train_parser:
+        checkpoint_path_help += (
+            ". If using a SparseZoo recipe, can also provide 'zoo' to load "
+            "the base weights associated with that recipe"
+        )
     par.add_argument(
         "--checkpoint-path",
         type=str,
         default=None,
-        help="A path to a previous checkpoint to load the state from and "
-        "resume the state for. If provided, pretrained will be ignored",
+        help=checkpoint_path_help,
     )
     par.add_argument(
         "--model-kwargs",
@@ -664,8 +675,8 @@ def parse_args():
         type=str,
         default=None,
         help="The path to the yaml file containing the modifiers and "
-        "schedule to apply them with. If set to 'transfer_learning', "
-        "then will create a schedule to enable sparse transfer learning",
+        "schedule to apply them with. 
Can also provide a SparseZoo stub " + "prefixed with 'zoo:' with an optional '?recipe_type=' argument", ) par.add_argument( "--sparse-transfer-learn", @@ -1337,6 +1348,17 @@ def main(args): num_classes = dataset_attributes["num_classes"] with torch_distributed_zero_first(args.local_rank): # only download once locally + if args.checkpoint_path == "zoo": + if args.recipe_path and args.recipe_path.startswith("zoo:"): + args.checkpoint_path = Zoo.download_recipe_base_framework_files( + args.recipe_path, extensions=[".pth"] + )[0] + else: + raise ValueError( + "'zoo' provided as --checkpoint-path but a SparseZoo stub" + " prefixed by 'zoo:' not provided as --recipe-path" + ) + model = ModelRegistry.create( args.arch_key, args.pretrained, diff --git a/setup.py b/setup.py index 7de4287e05c..1b5b2d2db49 100644 --- a/setup.py +++ b/setup.py @@ -12,11 +12,25 @@ # See the License for the specific language governing permissions and # limitations under the License. +import sys +from datetime import date from typing import Dict, List, Tuple from setuptools import find_packages, setup +_PACKAGE_NAME = "sparseml" +_VERSION = "0.1.1" +_VERSION_MAJOR, _VERSION_MINOR, _VERSION_BUG = _VERSION.split(".") +_VERSION_MAJOR_MINOR = f"{_VERSION_MAJOR}.{_VERSION_MINOR}" +_NIGHTLY = "nightly" in sys.argv + +if _NIGHTLY: + _PACKAGE_NAME += "-nightly" + _VERSION += "." + date.today().strftime("%Y%m%d") + # remove nightly param so it does not break bdist_wheel + sys.argv.remove("nightly") + _deps = [ "jupyter>=1.0.0", "ipywidgets>=7.0.0", @@ -32,10 +46,15 @@ "requests>=2.0.0", "scikit-image>=0.15.0", "scipy>=1.0.0", - "sparsezoo>=0.1.0", "tqdm>=4.0.0", "toposort>=1.0", ] +_nm_deps = [ + f"{'sparsezoo-nightly' if _NIGHTLY else 'sparsezoo'}~={_VERSION_MAJOR_MINOR}" +] +_deepsparse_deps = [ + f"{'deepsparse-nightly' if _NIGHTLY else 'deepsparse'}~={_VERSION_MAJOR_MINOR}" +] _pytorch_deps = ["torch>=1.1.0", "tensorboard>=1.0", "tensorboardX>=1.0"] _pytorch_vision_deps = _pytorch_deps + ["torchvision>=0.3.0"] _tensorflow_v1_deps = ["tensorflow<2.0.0", "tensorboard<2.0.0", "tf2onnx>=1.0.0,<1.6"] @@ -57,6 +76,7 @@ "sphinx-markdown-tables>=0.0.15", "wheel>=0.36.2", "pytest>=6.0.0", + "flaky>=3.0.0", "sphinx-rtd-theme", ] @@ -72,12 +92,13 @@ def _setup_package_dir() -> Dict: def _setup_install_requires() -> List: - return _deps + return _nm_deps + _deps def _setup_extras() -> Dict: return { "dev": _dev_deps, + "deepsparse": _deepsparse_deps, "torch": _pytorch_deps, "torchvision": _pytorch_vision_deps, "tf_v1": _tensorflow_v1_deps, @@ -95,13 +116,13 @@ def _setup_long_description() -> Tuple[str, str]: setup( - name="sparseml", - version="0.1.0", + name=_PACKAGE_NAME, + version=_VERSION, author="Neuralmagic, Inc.", author_email="support@neuralmagic.com", description=( - "Libraries for state-of-the-art deep neural network optimization algorithms, " - "enabling simple pipelines integration with a few lines of code" + "Libraries for applying sparsification recipes to neural networks with a " + "few lines of code, enabling faster and smaller models" ), long_description=_setup_long_description()[0], long_description_content_type=_setup_long_description()[1], diff --git a/src/sparseml/keras/optim/manager.py b/src/sparseml/keras/optim/manager.py index ebb02013fb1..2b0a0998226 100644 --- a/src/sparseml/keras/optim/manager.py +++ b/src/sparseml/keras/optim/manager.py @@ -26,6 +26,7 @@ from sparseml.keras.utils.logger import KerasLogger from sparseml.optim import BaseManager from sparseml.utils import load_recipe_yaml_str +from 
sparsezoo.objects import OptimizationRecipe __all__ = ["ScheduledModifierManager"] @@ -37,15 +38,23 @@ class ScheduledModifierManager(BaseManager, Modifier): """ @staticmethod - def from_yaml(file_path: str, add_modifiers: List[Modifier] = None): + def from_yaml( + file_path: Union[str, OptimizationRecipe], + add_modifiers: List[Modifier] = None, + ): """ - Convenience function used to create the manager of multiple modifiers - from a yaml file. - - :param file_path: the path to the yaml file to load the modifier from + Convenience function used to create the manager of multiple modifiers from a + recipe file. + + :param file_path: the path to the recipe file to load the modifier from, or + a SparseZoo model stub to load a recipe for a model stored in SparseZoo. + SparseZoo stubs should be preceded by 'zoo:', and can contain an optional + '?recipe_type=' parameter. Can also be a SparseZoo OptimizationRecipe + object. i.e. '/path/to/local/recipe.yaml', 'zoo:model/stub/path', + 'zoo:model/stub/path?recipe_type=transfer' :param add_modifiers: additional modifiers that should be added to the - returned manager alongside the ones loaded from the yaml file - :return: ScheduledModifierManager() created from the yaml file + returned manager alongside the ones loaded from the recipe file + :return: ScheduledModifierManager() created from the recipe file """ yaml_str = load_recipe_yaml_str(file_path) modifiers = Modifier.load_list(yaml_str) diff --git a/src/sparseml/keras/optim/mask_pruning.py b/src/sparseml/keras/optim/mask_pruning.py index 07e45f1b346..95e6f07ba58 100644 --- a/src/sparseml/keras/optim/mask_pruning.py +++ b/src/sparseml/keras/optim/mask_pruning.py @@ -15,14 +15,21 @@ import abc import collections import inspect -from typing import List +from typing import List, Union import tensorflow as tf -from sparseml.keras.optim.mask_pruning_creator import PruningMaskCreator +from sparseml.keras.optim.mask_pruning_creator import ( + PruningMaskCreator, + load_mask_creator, +) -__all__ = ["MaskedLayer", "PruningScheduler", "remove_pruning_masks"] +__all__ = [ + "MaskedLayer", + "PruningScheduler", + "remove_pruning_masks", +] class PruningScheduler(abc.ABC): @@ -30,6 +37,12 @@ class PruningScheduler(abc.ABC): Abstract pruning scheduler """ + _REGISTRY = {} + + def __init_subclass__(cls): + super().__init_subclass__() + PruningScheduler._register_class(cls) + @abc.abstractmethod def should_prune(self, step: int) -> bool: """ @@ -51,6 +64,32 @@ def target_sparsity(self, step: int, **kwargs) -> float: """ raise NotImplementedError("Not implemented") + @abc.abstractmethod + def get_config(self): + raise NotImplementedError("Not implemented") + + @classmethod + def deserialize(cls, config): + """ + Deserialize a pruning scheduler from config returned by scheduler's + get_config method + + :param config: a pruning scheduler's config + :return: a pruning scheduler instance + """ + if "class_name" not in config: + raise ValueError("The 'class_name' not found in config: {}".format(config)) + class_name = config["class_name"] + return tf.keras.utils.deserialize_keras_object( + config, + module_objects=globals(), + custom_objects={class_name: PruningScheduler._REGISTRY[class_name]}, + ) + + @classmethod + def _register_class(cls, target_cls): + PruningScheduler._REGISTRY[target_cls.__name__] = target_cls + MaskedParamInfo = collections.namedtuple( "MaskedParamInfo", ["name", "param", "mask", "sparsity"] @@ -192,7 +231,7 @@ def __init__( self, layer: tf.keras.layers.Layer, pruning_scheduler: 
PruningScheduler, - mask_creator: PruningMaskCreator, + mask_type: Union[str, List[int]] = "unstructured", **kwargs, ): if not isinstance(layer, MaskedLayer) and not isinstance( @@ -205,7 +244,16 @@ def __init__( super(MaskedLayer, self).__init__(layer, **kwargs) self._layer = layer self._pruning_scheduler = pruning_scheduler - self._mask_creator = mask_creator + self._mask_type = mask_type + self._mask_creator = None + self._pruning_vars = [] + self._global_step = None + self._mask_updater = None + + def build(self, input_shape): + super(MaskedLayer, self).build(input_shape) + self._mask_creator = load_mask_creator(self._mask_type) + self._pruning_vars = self._reuse_or_create_pruning_vars() self._global_step = self.add_weight( "global_step", shape=[], @@ -213,7 +261,6 @@ def __init__( dtype=tf.int64, trainable=False, ) - self._pruning_vars = self._reuse_or_create_pruning_vars() self._mask_updater = MaskAndWeightUpdater( self._pruning_vars, self._pruning_scheduler, @@ -276,6 +323,44 @@ def _no_apply_masks_to_weights(): else: return self._layer.call(inputs) + def get_config(self): + """ + Get layer config + Serialization and deserialization should be done using + tf.keras.serialize/deserialize, which create and retrieve the "class_name" + field automatically. + The resulting config below therefore does not contain the field. + """ + config = super(MaskedLayer, self).get_config() + if "layer" not in config: + raise RuntimeError("Expected 'layer' field not found in config") + config.update( + { + "pruning_scheduler": self._pruning_scheduler.get_config(), + "mask_type": self._mask_type, + } + ) + return config + + @classmethod + def from_config(cls, config): + config = config.copy() + layer = tf.keras.layers.deserialize( + config.pop("layer"), custom_objects={"MaskedLayer": MaskedLayer} + ) + if not isinstance(layer, MaskedLayer) and not isinstance( + layer, tf.keras.layers.Layer + ): + raise RuntimeError("Unexpected layer created from config") + pruning_scheduler = PruningScheduler.deserialize( + config.pop("pruning_scheduler") + ) + if not isinstance(pruning_scheduler, PruningScheduler): + raise RuntimeError("Unexpected pruning scheduler type created from config") + mask_type = config.pop("mask_type") + masked_layer = MaskedLayer(layer, pruning_scheduler, mask_type, **config) + return masked_layer + def compute_output_shape(self, input_shape): return self._layer.compute_output_shape(input_shape) @@ -304,6 +389,10 @@ def pruned_layer(self): else: raise RuntimeError("Unrecognized layer") + @property + def masked_layer(self): + return self._layer + def remove_pruning_masks(model: tf.keras.Model): """ diff --git a/src/sparseml/keras/optim/modifier_pruning.py b/src/sparseml/keras/optim/modifier_pruning.py index ced625003ab..996d235fb62 100644 --- a/src/sparseml/keras/optim/modifier_pruning.py +++ b/src/sparseml/keras/optim/modifier_pruning.py @@ -26,10 +26,6 @@ PruningScheduler, remove_pruning_masks, ) -from sparseml.keras.optim.mask_pruning_creator import ( - PruningMaskCreator, - load_mask_creator, -) from sparseml.keras.optim.modifier import ( KerasModifierYAML, ModifierProp, @@ -73,6 +69,14 @@ def __init__( self._update_frequency_steps = update_frequency_steps self._inter_func = inter_func + @property + def init_sparsity(self): + return self._init_sparsity + + @property + def final_sparsity(self): + return self._final_sparsity + @property def start_step(self): return self._start_step @@ -85,6 +89,10 @@ def end_step(self): def update_frequency_steps(self): return self._update_frequency_steps + 
@property
+    def inter_func(self):
+        return self._inter_func
+
     @property
     def exponent(self) -> float:
         """
@@ -154,6 +162,20 @@ def target_sparsity(self, step: int, **kwargs):
             sparsity = self._final_sparsity
         return sparsity
 
+    def get_config(self):
+        config = {
+            "class_name": self.__class__.__name__,
+            "config": {
+                "init_sparsity": self.init_sparsity,
+                "final_sparsity": self.final_sparsity,
+                "start_step": self.start_step,
+                "end_step": self.end_step,
+                "update_frequency_steps": self.update_frequency_steps,
+                "inter_func": self.inter_func,
+            },
+        }
+        return config
+
 
 class SparsityFreezer(PruningScheduler):
     """
@@ -172,6 +194,14 @@ def __init__(
         self._start_step = start_step
         self._end_step = end_step
 
+    @property
+    def start_step(self):
+        return self._start_step
+
+    @property
+    def end_step(self):
+        return self._end_step
+
     def should_prune(self, step: int) -> bool:
         """
         Check if the given step is a right time for pruning
@@ -203,6 +233,14 @@ def target_sparsity(self, step: int, tensor=None) -> float:
             sparsity = None
         return sparsity
 
+    def get_config(self):
+        config = {
+            "class_name": self.__class__.__name__,
+            "start_step": self.start_step,
+            "end_step": self.end_step,
+        }
+        return config
+
 
 class PruningModifierCallback(tensorflow.keras.callbacks.Callback):
     """
@@ -345,7 +383,7 @@ def _log(self, logger: KerasLogger, log_data: Dict):
 
 
 @KerasModifierYAML()
-class ConstantPruningModifier(ScheduledModifier, PruningScheduler):
+class ConstantPruningModifier(ScheduledModifier):
     """
     Holds the sparsity level and shape for a given param constant while training.
     Useful for transfer learning use cases.
@@ -387,7 +425,7 @@ def __init__(
 
         self._masked_layers = []
         self._sparsity_scheduler = None
-        self._mask_creator = load_mask_creator("unstructured")
+        self._mask_type = "unstructured"
 
     @ModifierProp()
     def params(self) -> Union[str, List[str]]:
@@ -456,7 +494,7 @@ def _clone_layer(self, layer: tensorflow.keras.layers.Layer):
         cloned_layer = layer
         if layer.name in self.layer_names:  # TODO: handle regex params
             cloned_layer = MaskedLayer(
-                layer, self._sparsity_scheduler, self._mask_creator, name=layer.name
+                layer, self._sparsity_scheduler, self._mask_type, name=layer.name
             )
             self._masked_layers.append(cloned_layer)
         return cloned_layer
@@ -553,7 +591,7 @@ class GMPruningModifier(ScheduledUpdateModifier):
         default is __ALL__
     :param mask_type: String to define type of sparsity (options: ['unstructured',
         'channel', 'filter']), List to define block shape of a parameter's in and out
-        channels, or a PruningMaskCreator object. default is 'unstructured'
+        channels. default is 'unstructured'
     :param leave_enabled: True to continue masking the weights after end_epoch,
         False to stop masking.
Should be set to False if exporting the result immediately after or doing some other prune @@ -569,7 +607,7 @@ def __init__( update_frequency: float, inter_func: str = "cubic", log_types: Union[str, List[str]] = ALL_TOKEN, - mask_type: Union[str, List[int], PruningMaskCreator] = "unstructured", + mask_type: Union[str, List[int]] = "unstructured", leave_enabled: bool = True, ): super(GMPruningModifier, self).__init__( @@ -591,10 +629,7 @@ def __init__( self._leave_enabled = convert_to_bool(leave_enabled) self._inter_func = inter_func self._mask_type = mask_type - self._mask_creator = mask_type self._leave_enabled = convert_to_bool(leave_enabled) - if not isinstance(mask_type, PruningMaskCreator): - self._mask_creator = load_mask_creator(mask_type) self._prune_op_vars = None self._update_ready = None self._sparsity = None @@ -694,21 +729,18 @@ def inter_func(self, value: str): self.validate() @ModifierProp() - def mask_type(self) -> Union[str, List[int], PruningMaskCreator]: + def mask_type(self) -> Union[str, List[int]]: """ - :return: the PruningMaskCreator object used + :return: the mask type used """ return self._mask_type @mask_type.setter - def mask_type(self, value: Union[str, List[int], PruningMaskCreator]): + def mask_type(self, value: Union[str, List[int]]): """ - :param value: the PruningMaskCreator object to use + :param value: the mask type to use """ self._mask_type = value - self._mask_creator = value - if not isinstance(value, PruningMaskCreator): - self._mask_creator = load_mask_creator(value) @ModifierProp() def leave_enabled(self) -> bool: @@ -834,7 +866,7 @@ def _clone_layer(self, layer: tensorflow.keras.layers.Layer): layer.name in self.layer_names ): # TODO: handle regex params --- see create_ops in TF version cloned_layer = MaskedLayer( - layer, self._sparsity_scheduler, self._mask_creator, name=layer.name + layer, self._sparsity_scheduler, self._mask_type, name=layer.name ) self._masked_layers.append(cloned_layer) return cloned_layer diff --git a/src/sparseml/optim/modifier.py b/src/sparseml/optim/modifier.py index 803fb6da0bf..91b46da744d 100644 --- a/src/sparseml/optim/modifier.py +++ b/src/sparseml/optim/modifier.py @@ -294,8 +294,24 @@ def load_framework_list(yaml_str: str, framework: str): modifiers = [container] elif isinstance(container, List): modifiers = container - else: - modifiers = container["modifiers"] + else: # Dict + modifiers = [] + for name, item in container.items(): + if "modifiers" in name and isinstance(item, List): + modifiers.extend(item) + elif isinstance(item, BaseModifier): + modifiers.append(item) + elif isinstance(item, List) and any( + isinstance(element, BaseModifier) for element in item + ): + modifier_type = type( + [mod for mod in item if isinstance(mod, BaseModifier)][0] + ) + raise ValueError( + "Invalid modifier location. Grouped modifiers in recipes must " + "be listed in lists with 'modifiers' in its name. A modifier of " + f"type {modifier_type} was found in recipe list {name}" + ) return modifiers diff --git a/src/sparseml/pytorch/models/external/torchvision.py b/src/sparseml/pytorch/models/external/torchvision.py index 09e82748656..720430fa319 100644 --- a/src/sparseml/pytorch/models/external/torchvision.py +++ b/src/sparseml/pytorch/models/external/torchvision.py @@ -80,7 +80,10 @@ def wrapper( ): """ :param pretrained_path: A path to the pretrained weights to load, - if provided will override the pretrained param + if provided will override the pretrained param. 
May also be + a SparseZoo stub path preceded by 'zoo:' with the optional + `?recipe_type=` argument. If given a recipe type, the base + model weights for that recipe will be loaded :param pretrained: True to load the default pretrained weights, a string to load a specific pretrained weight (ex: base, pruned-moderate), diff --git a/src/sparseml/pytorch/models/registry.py b/src/sparseml/pytorch/models/registry.py index c3a0d34f255..f60d86af647 100644 --- a/src/sparseml/pytorch/models/registry.py +++ b/src/sparseml/pytorch/models/registry.py @@ -308,7 +308,10 @@ def wrapper( ): """ :param pretrained_path: A path to the pretrained weights to load, - if provided will override the pretrained param + if provided will override the pretrained param. May also be + a SparseZoo stub path preceded by 'zoo:' with the optional + `?recipe_type=` argument. If given a recipe type, the base + model weights for that recipe will be loaded :param pretrained: True to load the default pretrained weights, a string to load a specific pretrained weight (ex: base, optim, optim-perf), diff --git a/src/sparseml/pytorch/optim/manager.py b/src/sparseml/pytorch/optim/manager.py index 46a76fc383e..ac6929d6ec4 100644 --- a/src/sparseml/pytorch/optim/manager.py +++ b/src/sparseml/pytorch/optim/manager.py @@ -29,6 +29,7 @@ from sparseml.pytorch.optim.modifier import Modifier, ScheduledModifier from sparseml.pytorch.utils import PyTorchLogger from sparseml.utils import load_recipe_yaml_str +from sparsezoo.objects import OptimizationRecipe __all__ = ["ScheduledModifierManager", "load_manager"] @@ -56,15 +57,23 @@ class ScheduledModifierManager(BaseManager, Modifier): """ @staticmethod - def from_yaml(file_path: str, add_modifiers: List[Modifier] = None): + def from_yaml( + file_path: Union[str, OptimizationRecipe], + add_modifiers: List[Modifier] = None, + ): """ Convenience function used to create the manager of multiple modifiers from a - yaml file. - - :param file_path: the path to the yaml file to load the modifier from + recipe file. + + :param file_path: the path to the recipe file to load the modifier from, or + a SparseZoo model stub to load a recipe for a model stored in SparseZoo. + SparseZoo stubs should be preceded by 'zoo:', and can contain an optional + '?recipe_type=' parameter. Can also be a SparseZoo OptimizationRecipe + object. i.e. '/path/to/local/recipe.yaml', 'zoo:model/stub/path', + 'zoo:model/stub/path?recipe_type=transfer' :param add_modifiers: additional modifiers that should be added to the - returned manager alongside the ones loaded from the yaml file - :return: ScheduledModifierManager() created from the yaml file + returned manager alongside the ones loaded from the recipe file + :return: ScheduledModifierManager() created from the recipe file """ yaml_str = load_recipe_yaml_str(file_path) modifiers = Modifier.load_list(yaml_str) diff --git a/src/sparseml/pytorch/optim/modifier_pruning.py b/src/sparseml/pytorch/optim/modifier_pruning.py index 238318e8334..abad881c245 100644 --- a/src/sparseml/pytorch/optim/modifier_pruning.py +++ b/src/sparseml/pytorch/optim/modifier_pruning.py @@ -77,7 +77,7 @@ class ConstantPruningModifier(ScheduledModifier): Useful for transfer learning use cases. 
     | Sample yaml:
-    |   !ConstantKSModifier
+    |   !ConstantPruningModifier
     |       start_epoch: 0.0
     |       end_epoch: 10.0
     |       params: ['re:.*weight']
@@ -170,7 +170,7 @@ def load_state_dict(self, state_dict: Dict[str, Tensor]):
             if param_name not in module_masks:
                 raise RuntimeError(
                     f"Unexpected parameter name when loading state dict for "
-                    f"ConstantKSModifier Manager has parameters "
+                    f"ConstantPruningModifier. Manager has parameters "
                     f"{list(module_masks.keys())}, given {param_name}"
                 )
             mask_disabled = False
@@ -304,7 +304,7 @@ class GMPruningModifier(ScheduledUpdateModifier):
     Applies based on magnitude pruning unless otherwise specified by mask_type.
 
     | Sample yaml:
-    |   !GradualKSModifier
+    |   !GMPruningModifier
     |       init_sparsity: 0.05
     |       final_sparsity: 0.8
     |       start_epoch: 0.0
@@ -412,7 +412,7 @@ def load_state_dict(self, state_dict: Dict[str, Tensor]):
             if param_name not in module_masks:
                 raise RuntimeError(
                     f"Unexpected parameter name when loading state dict for "
-                    f"GradualKSModifier Manager has parameters "
+                    f"GMPruningModifier. Manager has parameters "
                     f"{list(module_masks.keys())}, given {param_name}"
                 )
             mask_disabled = False
diff --git a/src/sparseml/pytorch/optim/modifier_quantization.py b/src/sparseml/pytorch/optim/modifier_quantization.py
index 93867496ace..e1c980e9dd0 100644
--- a/src/sparseml/pytorch/optim/modifier_quantization.py
+++ b/src/sparseml/pytorch/optim/modifier_quantization.py
@@ -19,7 +19,7 @@
 """
 
-from typing import List, Union
+from typing import Any, Dict, List, Union
 
 from torch.nn import Module
 from torch.optim.optimizer import Optimizer
@@ -75,6 +75,8 @@ class QuantizationModifier(ScheduledModifier):
         None to not stop tracking batch norm stats during QAT. Default is None
     :param end_epoch: Disabled, setting to anything other than -1 will raise an
         exception. For compatibility with YAML serialization only.
+ :param model_fuse_fn_kwargs: dictionary of keyword argument values to be passed + to the model fusing function """ def __init__( @@ -85,6 +87,7 @@ def __init__( disable_quantization_observer_epoch: Union[float, None] = None, freeze_bn_stats_epoch: Union[float, None] = None, end_epoch: float = -1, + model_fuse_fn_kwargs: Dict[str, Any] = None, ): if torch_quantization is None or torch_intrinsic is None: raise RuntimeError( @@ -103,6 +106,7 @@ def __init__( self._start_epoch = start_epoch self._submodules = submodules self._model_fuse_fn_name = model_fuse_fn_name + self._model_fuse_fn_kwargs = model_fuse_fn_kwargs or {} self._disable_quantization_observer_epoch = disable_quantization_observer_epoch self._freeze_bn_stats_epoch = freeze_bn_stats_epoch @@ -254,9 +258,10 @@ def update( self._model_fuse_fn_name ) ) - module_fuse_fn() + module_fuse_fn(**self._model_fuse_fn_kwargs) elif self._model_fuse_fn_name is None: # default auto fn - fuse_module_conv_bn_relus(module, inplace=True) + self._model_fuse_fn_kwargs["inplace"] = True + fuse_module_conv_bn_relus(module, **self._model_fuse_fn_kwargs) # prepare each module / submodule for quantization qconfig = get_qat_qconfig() for quant_module in self._modules_to_quantize: diff --git a/src/sparseml/pytorch/optim/quantization/helpers.py b/src/sparseml/pytorch/optim/quantization/helpers.py index 935318cd767..0c8a0529a92 100644 --- a/src/sparseml/pytorch/optim/quantization/helpers.py +++ b/src/sparseml/pytorch/optim/quantization/helpers.py @@ -17,14 +17,17 @@ """ from copy import deepcopy +from typing import Union import torch from torch.nn import BatchNorm2d, Conv2d, Module, ReLU try: + import torch.nn.intrinsic as nni from torch import quantization as torch_quantization except Exception: + nni = None torch_quantization = None from sparseml.pytorch.nn import ReLU as ReLU_nm @@ -37,15 +40,40 @@ ] +_QUANTIZABLE_MODULE_TYPES = ( + { + # Conv based layers + torch.nn.Conv1d, + torch.nn.Conv2d, + torch.nn.Conv3d, + nni.ConvBn1d, + nni.ConvBn2d, + nni.ConvBn3d, + nni.ConvReLU1d, + nni.ConvReLU2d, + nni.ConvReLU3d, + nni.ConvBnReLU1d, + nni.ConvBnReLU2d, + nni.ConvBnReLU3d, + # Linear Layers + torch.nn.Linear, + nni.LinearReLU, + } + if nni # nni will always import if torch.quantization is available + else None +) + + def add_quant_dequant(module): """ Wraps all Conv and Linear submodule with a qconfig with a QuantWrapper :param module: the module to modify """ - module_type = str(type(module)).split(".")[-1].lower() - is_quantizable_module = "conv" in module_type or "linear" in module_type - - if is_quantizable_module and hasattr(module, "qconfig") and module.qconfig: + if ( + type(module) in _QUANTIZABLE_MODULE_TYPES + and hasattr(module, "qconfig") + and module.qconfig + ): return torch_quantization.QuantWrapper(module) for name, child in module.named_children(): @@ -76,7 +104,11 @@ def get_qat_qconfig() -> torch_quantization.QConfig: ) -def fuse_module_conv_bn_relus(module: Module, inplace: bool = True) -> Module: +def fuse_module_conv_bn_relus( + module: Module, + inplace: bool = True, + override_bn_subclasses_forward: Union[bool, str] = True, +) -> Module: """ Performs fusion of Conv2d, BatchNorm2d, and ReLU layers found in the given module. To be fused, these layers must appear sequentially in @@ -88,6 +120,12 @@ def fuse_module_conv_bn_relus(module: Module, inplace: bool = True) -> Module: :param module: the module to fuse :param inplace: set True to perform fusions in-place. 
default is True
+    :param override_bn_subclasses_forward: if True, modules that are subclasses of
+        BatchNorm2d will be modified to be BatchNorm2d but with the forward
+        pass and state variables copied from the subclass. This is so these
+        BN modules can pass PyTorch type checking when fusing. Can set to
+        "override-only" and only parameters will be overwritten, not the
+        forward pass. Default is True
     :return: the fused module
     """
     if torch_quantization is None:
@@ -112,7 +150,22 @@
                 and submodule_name == current_block_submodule_name
             ):
                 if isinstance(layer, ReLU_nm):
-                    _replace_nm_relu(module, name, layer)
+                    _set_submodule(module, name, ReLU(inplace=layer.inplace))
+                if isinstance(layer, BatchNorm2d) and type(layer) is not BatchNorm2d:
+                    if not override_bn_subclasses_forward:
+                        raise RuntimeError(
+                            "Detected a Conv-BN block that uses a subclass of BatchNorm2d. "
+                            "This will cause a type error when fusing with PyTorch, "
+                            "set override_bn_subclasses_forward to True or 'override-only' "
+                            "to modify this BN subclass to be a BatchNorm2d object"
+                        )
+                    # swap BN subclass with overwritten BN class that will pass torch
+                    # type checking
+                    overwritten_bn = _wrap_bn_sub_class(
+                        layer,
+                        override_forward=override_bn_subclasses_forward != "override-only",
+                    )
+                    _set_submodule(module, name, overwritten_bn)
                 current_block.append(name)
             else:
                 if current_block:
@@ -128,10 +181,18 @@
     return module
 
 
-def _replace_nm_relu(root_module, relu_path, nm_relu):
+def _set_submodule(root_module, sub_module_path, sub_module):
     current_module = root_module
-    relu_path = relu_path.split(".")
-    for sub_module in relu_path[:-1]:
-        current_module = getattr(current_module, sub_module)
-    new_relu = ReLU(inplace=nm_relu.inplace)
-    setattr(current_module, relu_path[-1], new_relu)
+    sub_module_path = sub_module_path.split(".")
+    for child_module in sub_module_path[:-1]:
+        current_module = getattr(current_module, child_module)
+    setattr(current_module, sub_module_path[-1], sub_module)
+
+
+def _wrap_bn_sub_class(bn_subclass, override_forward=True):
+    batch_norm = BatchNorm2d(bn_subclass.num_features)
+    batch_norm.__dict__ = bn_subclass.__dict__
+    if override_forward:
+        batch_norm.forward = bn_subclass.forward
+    del bn_subclass
+    return batch_norm
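
The fusing helpers above pair with the new `model_fuse_fn_kwargs` on `QuantizationModifier` from modifier_quantization.py earlier in this diff; a minimal sketch of both entry points, with a placeholder model and import paths assumed from this module layout:

```python
import torch
from sparseml.pytorch.optim import QuantizationModifier
from sparseml.pytorch.optim.quantization import fuse_module_conv_bn_relus

# placeholder Conv-BN-ReLU block; a real model would come from torchvision, etc.
model = torch.nn.Sequential(
    torch.nn.Conv2d(3, 8, 3), torch.nn.BatchNorm2d(8), torch.nn.ReLU()
)

# direct call: overwrite only the parameters of BatchNorm2d subclasses,
# leaving their forward passes in place
fuse_module_conv_bn_relus(
    model, inplace=True, override_bn_subclasses_forward="override-only"
)

# or route the same kwarg through the modifier's default fuse function
modifier = QuantizationModifier(
    start_epoch=0.0,
    model_fuse_fn_kwargs={"override_bn_subclasses_forward": "override-only"},
)
```

The quantize_qat_export.py changes that follow add a matching file-path convenience on the ONNX side.
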
diff --git a/src/sparseml/pytorch/optim/quantization/quantize_qat_export.py b/src/sparseml/pytorch/optim/quantization/quantize_qat_export.py
index 44e172385c3..35ef20f95e8 100644
--- a/src/sparseml/pytorch/optim/quantization/quantize_qat_export.py
+++ b/src/sparseml/pytorch/optim/quantization/quantize_qat_export.py
@@ -569,15 +569,23 @@ def _remove_duplicate_quantize__ops(model: ModelProto):
         remove_node_and_params_from_graph(model, remove_node)
 
 
-def quantize_torch_qat_export(model: ModelProto, inplace: bool = True) -> ModelProto:
+def quantize_torch_qat_export(
+    model: Union[ModelProto, str],
+    output_file_path: Union[str, None] = None,
+    inplace: bool = True,
+) -> ModelProto:
     """
-    :param model: The model to convert
+    :param model: The model to convert, or a file path to it
+    :param output_file_path: File path to save the converted model to
     :param inplace: If true, does conversion of model in place. Default is true
     :return: Converts a model exported from a torch QAT session from a QAT graph
         with fake quantize ops surrounding operations to a quantized graph with
         quantized operations. All quantized Convs and FC inputs and outputs must
         be surrounded by fake quantize ops
     """
+    if isinstance(model, str):
+        model = onnx.load(model)
+
     if not inplace:
         model = deepcopy(model)
 
@@ -589,4 +597,7 @@ def quantize_torch_qat_export(model: ModelProto, inplace: bool = True) -> ModelProto:
     quantize_resnet_identity_add_inputs(model)
     _remove_duplicate_quantize__ops(model)
 
+    if output_file_path:
+        onnx.save(model, output_file_path)
+
     return model
diff --git a/src/sparseml/pytorch/utils/model.py b/src/sparseml/pytorch/utils/model.py
index d3b87bcf396..09e654f710f 100644
--- a/src/sparseml/pytorch/utils/model.py
+++ b/src/sparseml/pytorch/utils/model.py
@@ -24,6 +24,7 @@
 from torch.optim.optimizer import Optimizer
 
 from sparseml.utils.helpers import create_parent_dirs
+from sparsezoo import Zoo
 
 
 try:
@@ -57,7 +58,10 @@ def load_model(
     """
     Load the state dict into a model from a given file.
 
-    :param path: the path to the pth file to load the state dict from
+    :param path: the path to the pth file to load the state dict from.
+        May also be a SparseZoo stub path preceded by 'zoo:' with the optional
+        `?recipe_type=` argument. If given a recipe type, the base model weights
+        for that recipe will be loaded.
     :param model: the model to load the state dict into
     :param strict: True to enforce that all tensors match between the model
         and the file; False otherwise
@@ -67,6 +71,15 @@ def load_model(
         look like they came from DataParallel type setup (start with module.).
         This removes "module." all keys
     """
+    if path.startswith("zoo:"):
+        if "recipe_type=" in path:
+            path = Zoo.download_recipe_base_framework_files(path, extensions=[".pth"])[
+                0
+            ]
+        else:
+            path = Zoo.load_model_from_stub(path).download_framework_files(
+                extensions=[".pth"]
+            )[0]
     model_dict = torch.load(path, map_location="cpu")
     current_dict = model.state_dict()
 
diff --git a/src/sparseml/tensorflow_v1/optim/manager.py b/src/sparseml/tensorflow_v1/optim/manager.py
index b1098e5637d..5fefcc17615 100644
--- a/src/sparseml/tensorflow_v1/optim/manager.py
+++ b/src/sparseml/tensorflow_v1/optim/manager.py
@@ -25,6 +25,7 @@
 from sparseml.tensorflow_v1.optim.modifier import NM_RECAL, Modifier, ScheduledModifier
 from sparseml.tensorflow_v1.utils import tf_compat
 from sparseml.utils import load_recipe_yaml_str
+from sparsezoo.objects import OptimizationRecipe
 
 
 __all__ = ["ScheduledModifierManager"]
@@ -74,15 +75,23 @@ class ScheduledModifierManager(BaseManager, Modifier):
     """
 
     @staticmethod
-    def from_yaml(file_path: str, add_modifiers: List[Modifier] = None):
+    def from_yaml(
+        file_path: Union[str, OptimizationRecipe],
+        add_modifiers: List[Modifier] = None,
+    ):
         """
-        Convenience function used to create the manager of multiple modifiers
-        from a yaml file.
-
-        :param file_path: the path to the yaml file to load the modifier from
+        Convenience function used to create the manager of multiple modifiers from a
+        recipe file.
+
+        :param file_path: the path to the recipe file to load the modifier from, or
+            a SparseZoo model stub to load a recipe for a model stored in SparseZoo.
+            SparseZoo stubs should be preceded by 'zoo:', and can contain an optional
+            '?recipe_type=' parameter. Can also be a SparseZoo OptimizationRecipe
+            object. i.e. 
'/path/to/local/recipe.yaml', 'zoo:model/stub/path', + 'zoo:model/stub/path?recipe_type=transfer' :param add_modifiers: additional modifiers that should be added to the - returned manager alongside the ones loaded from the yaml file - :return: ScheduledModifierManager() created from the yaml file + returned manager alongside the ones loaded from the recipe file + :return: ScheduledModifierManager() created from the recipe file """ yaml_str = load_recipe_yaml_str(file_path) modifiers = Modifier.load_list(yaml_str) diff --git a/src/sparseml/tensorflow_v1/optim/modifier_pruning.py b/src/sparseml/tensorflow_v1/optim/modifier_pruning.py index 6c6f5b50dfb..4762b3e4c7b 100644 --- a/src/sparseml/tensorflow_v1/optim/modifier_pruning.py +++ b/src/sparseml/tensorflow_v1/optim/modifier_pruning.py @@ -56,7 +56,7 @@ class ConstantPruningModifier(ScheduledModifier): Useful for transfer learning use cases. | Sample yaml: - | !ConstantKSModifier + | !ConstantPruningModifier | params: __ALL__ | start_epoch: 0.0 | end_epoch: 10.0 @@ -237,7 +237,7 @@ class GMPruningModifier(ScheduledUpdateModifier): Applies based on magnitude pruning without any structure to the pruning. | Sample yaml: - | !GradualKSModifier + | !GMPruningModifier | params: __ALL__ | init_sparsity: 0.05 | final_sparsity: 0.8 diff --git a/src/sparseml/tensorflow_v1/utils/variable.py b/src/sparseml/tensorflow_v1/utils/variable.py index d83dd1b12a9..06cca3013cc 100644 --- a/src/sparseml/tensorflow_v1/utils/variable.py +++ b/src/sparseml/tensorflow_v1/utils/variable.py @@ -21,6 +21,7 @@ try: import tensorflow.contrib.graph_editor as graph_editor from tensorflow.contrib.graph_editor.util import ListView + tf_contrib_err = None except Exception as err: graph_editor = None diff --git a/src/sparseml/utils/helpers.py b/src/sparseml/utils/helpers.py index eaa9bf1bc6b..8b41f30f422 100644 --- a/src/sparseml/utils/helpers.py +++ b/src/sparseml/utils/helpers.py @@ -29,6 +29,8 @@ import numpy +from sparsezoo import Zoo +from sparsezoo.objects import OptimizationRecipe from sparsezoo.utils import load_numpy_list @@ -763,16 +765,30 @@ def _tensors_export_batch( ) -def load_recipe_yaml_str(file_path: str) -> str: +def load_recipe_yaml_str(file_path: Union[str, OptimizationRecipe]) -> str: """ Loads a YAML recipe file to a string or extracts recipe from YAML front matter in a sparsezoo markdown recipe card. + Recipes can also be provided as SparseZoo model stubs or OptimizationRecipe + objects. YAML front matter: https://jekyllrb.com/docs/front-matter/ - :param file_path: file path to recipe YAML file or markdown recipe card + :param file_path: file path to recipe YAML file or markdown recipe card or + stub to a SparseZoo model whose recipe will be downloaded and loaded. + SparseZoo stubs should be preceded by 'zoo:', and can contain an optional + '?recipe_type=' parameter. Can also be a SparseZoo OptimizationRecipe + object. i.e. 
+        '/path/to/local/recipe.yaml', 'zoo:model/stub/path',
+        'zoo:model/stub/path?recipe_type=transfer'
     :return: the recipe YAML configuration loaded as a string
     """
+    if isinstance(file_path, OptimizationRecipe):
+        # download and unwrap OptimizationRecipe object
+        file_path = file_path.downloaded_path()
+    elif file_path.startswith("zoo:"):
+        # download from zoo stub
+        file_path = Zoo.download_recipe_from_stub(file_path)
+
     extension = file_path.lower().split(".")[-1]
 
     if extension not in ["md", "yaml"]:
         raise ValueError(
diff --git a/tests/sparseml/keras/optim/mock.py b/tests/sparseml/keras/optim/mock.py
index 36521b34360..cbfde246af7 100644
--- a/tests/sparseml/keras/optim/mock.py
+++ b/tests/sparseml/keras/optim/mock.py
@@ -26,11 +26,13 @@
     "SequentialModelCreator",
     "MockPruningScheduler",
     "model_01",
+    "mnist_model",
 ]
 
 
 class MockPruningScheduler(PruningScheduler):
     def __init__(self, step_and_sparsity_pairs: List[Tuple]):
+        self._org_pairs = step_and_sparsity_pairs
         self.step_and_sparsity_pairs = {
             step: sparsity for (step, sparsity) in step_and_sparsity_pairs
         }
@@ -43,6 +45,12 @@ class MockPruningScheduler(PruningScheduler):
         sparsity = self.step_and_sparsity_pairs[step] if update_ready else None
         return sparsity
 
+    def get_config(self):
+        return {
+            "class_name": self.__class__.__name__,
+            "step_and_sparsity_pairs": self._org_pairs,
+        }
+
 
 class DenseLayer(tf.keras.layers.Dense):
     def __init__(self, weight: np.ndarray):
@@ -107,3 +115,33 @@ def model_01():
     outputs = tf.keras.layers.Dense(10, name="dense_02")(x)
     model = Model(inputs=inputs, outputs=outputs)
     return model
+
+
+def mnist_model():
+    inputs = tf.keras.Input(shape=(28, 28, 1), name="inputs")
+
+    # Block 1
+    x = tf.keras.layers.Conv2D(16, 5, strides=1)(inputs)
+    x = tf.keras.layers.BatchNormalization()(x)
+    x = tf.keras.layers.ReLU()(x)
+
+    # Block 2
+    x = tf.keras.layers.Conv2D(32, 5, strides=2)(x)
+    x = tf.keras.layers.BatchNormalization()(x)
+    x = tf.keras.layers.ReLU()(x)
+
+    # Block 3
+    x = tf.keras.layers.Conv2D(64, 5, strides=1)(x)
+    x = tf.keras.layers.BatchNormalization()(x)
+    x = tf.keras.layers.ReLU()(x)
+
+    # Block 4
+    x = tf.keras.layers.Conv2D(128, 5, strides=2)(x)
+    x = tf.keras.layers.BatchNormalization()(x)
+    x = tf.keras.layers.ReLU()(x)
+
+    x = tf.keras.layers.AveragePooling2D(pool_size=1)(x)
+    x = tf.keras.layers.Flatten()(x)
+    outputs = tf.keras.layers.Dense(10, activation="softmax", name="outputs")(x)
+
+    return tf.keras.Model(inputs=inputs, outputs=outputs)
diff --git a/tests/sparseml/keras/optim/test_mask_pruning.py b/tests/sparseml/keras/optim/test_mask_pruning.py
index 9a3de232fc4..62f110a1af4 100644
--- a/tests/sparseml/keras/optim/test_mask_pruning.py
+++ b/tests/sparseml/keras/optim/test_mask_pruning.py
@@ -12,16 +12,26 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from typing import Dict, Union
+
 import numpy as np
 import pytest
 import tensorflow as tf
 
-from sparseml.keras.optim import MaskedLayer, UnstructuredPruningMaskCreator
-from tests.sparseml.keras.optim.mock import DenseLayerCreator, MockPruningScheduler
+from sparseml.keras.optim import (
+    GMPruningModifier,
+    MaskedLayer,
+    ScheduledModifierManager,
+)
+from tests.sparseml.keras.optim.mock import (
+    DenseLayerCreator,
+    MockPruningScheduler,
+    mnist_model,
+)
 
 
 @pytest.mark.parametrize(
-    "layer_lambda, pruning_scheduler, mask_creator, expected_mask",
+    "layer_lambda, pruning_scheduler, mask_type, expected_mask",
     [
         (
             # Weight of a dense layer of shape (3, 4)
@@ -32,7 +42,7 @@
                 ",
             ),
         ),
         MockPruningScheduler([(1, 0.25), (2, 0.5)]),
-        UnstructuredPruningMaskCreator(),
+        "unstructured",
         # List of expected mask, each corresponding to one of the
         # above update step in the MockPruningScheduler
         [
@@ -57,15 +67,128 @@
     ],
 )
 def test_mask_update_explicit(
-    layer_lambda, pruning_scheduler, mask_creator, expected_mask
+    layer_lambda, pruning_scheduler, mask_type, expected_mask
 ):
     if tf.__version__ < "2":
         pytest.skip("Test needs to be fixed to run with tensorflow_v1 1.x")
     layer = layer_lambda()
-    masked_layer = MaskedLayer(layer, pruning_scheduler, mask_creator)
+    masked_layer = MaskedLayer(layer, pruning_scheduler, mask_type)
+    masked_layer.build(input_shape=None)
     update_steps = list(pruning_scheduler.step_and_sparsity_pairs.keys())
     for idx, update_step in enumerate(update_steps):
         tf.keras.backend.batch_set_value([(masked_layer.global_step, update_step)])
         masked_layer.mask_updater.conditional_update(training=True)
         mask = tf.keras.backend.get_value(masked_layer.masks[0])
         assert np.allclose(mask, expected_mask[idx])
+
+
+@pytest.mark.parametrize(
+    "modifier_lambdas",
+    [
+        (
+            lambda: GMPruningModifier(
+                params=["conv2d/kernel:0"],
+                init_sparsity=0.25,
+                final_sparsity=0.75,
+                start_epoch=0.0,
+                end_epoch=2.0,
+                update_frequency=1.0,
+            ),
+            lambda: GMPruningModifier(
+                params=["conv2d_1/kernel:0"],
+                init_sparsity=0.25,
+                final_sparsity=0.75,
+                start_epoch=0.0,
+                end_epoch=2.0,
+                update_frequency=1.0,
+            ),
+        ),
+        (
+            lambda: GMPruningModifier(
+                params=["conv2d/kernel:0", "conv2d_2/kernel:0"],
+                init_sparsity=0.25,
+                final_sparsity=0.75,
+                start_epoch=0.0,
+                end_epoch=2.0,
+                update_frequency=1.0,
+            ),
+            lambda: GMPruningModifier(
+                params=["conv2d_1/kernel:0", "conv2d/kernel:0", "outputs/kernel:0"],
+                init_sparsity=0.25,
+                final_sparsity=0.75,
+                start_epoch=2.0,
+                end_epoch=3.0,
+                update_frequency=1.0,
+            ),
+            lambda: GMPruningModifier(
+                params=["conv2d_2/kernel:0", "outputs/kernel:0"],
+                init_sparsity=0.25,
+                final_sparsity=0.75,
+                start_epoch=3.0,
+                end_epoch=4.0,
+                update_frequency=1.0,
+            ),
+        ),
+    ],
+    scope="function",
+)
+@pytest.mark.parametrize("steps_per_epoch", [10], scope="function")
+def test_nested_layer_structure(modifier_lambdas, steps_per_epoch):
+    model = mnist_model()
+    modifiers = [mod() for mod in modifier_lambdas]
+    manager = ScheduledModifierManager(modifiers)
+    optimizer = tf.keras.optimizers.Adam()
+    model, optimizer, callbacks = manager.modify(model, optimizer, steps_per_epoch)
+
+    model.build(input_shape=(1, 28, 28, 1))
+
+    # Verify number of (outer-most) masked layers
+    modifier_masked_layer_names = [
+        layer_name for mod in modifiers for layer_name in mod.layer_names
+    ]
+    model_masked_layer_names = [
+        layer.name for layer in model.layers if isinstance(layer, MaskedLayer)
+    ]
+    assert len(model_masked_layer_names) == len(set(modifier_masked_layer_names))
+
+    # Verify that if
+    # a layer is modified by N modifiers, then the corresponding
+    # MaskedLayer will have N-1 MaskedLayers nested inside it
+    for layer in model.layers:
+        if isinstance(layer, MaskedLayer):
+            expected_layers = len(
+                [name for name in modifier_masked_layer_names if name == layer.name]
+            )
+            assert _count_nested_masked_layers(layer) == expected_layers
+
+    # Verify the returned config dict has the expected nested structures
+    model_config = model.get_config()
+    for layer_config in model_config["layers"]:
+        if layer_config["class_name"] == "MaskedLayer":
+            layer_name = layer_config["config"]["name"]
+            expected_layers = len(
+                [name for name in modifier_masked_layer_names if name == layer_name]
+            )
+            assert (
+                _count_nested_masked_layers_in_config(layer_config) == expected_layers
+            )
+
+    # Verify model serialization and deserialization work for (nested) masked layers
+    model_config = model.get_config()
+    new_model = model.__class__.from_config(
+        model_config, custom_objects={"MaskedLayer": MaskedLayer}
+    )
+    assert model_config == new_model.get_config()
+
+    tf.keras.backend.clear_session()
+
+
+def _count_nested_masked_layers_in_config(layer_config: Dict):
+    if layer_config["class_name"] != "MaskedLayer":
+        return 0
+    return 1 + _count_nested_masked_layers_in_config(layer_config["config"]["layer"])
+
+
+def _count_nested_masked_layers(layer: Union[MaskedLayer, tf.keras.layers.Layer]):
+    if not isinstance(layer, MaskedLayer):
+        return 0
+    return 1 + _count_nested_masked_layers(layer.masked_layer)
diff --git a/tests/sparseml/keras/optim/test_modifier_pruning.py b/tests/sparseml/keras/optim/test_modifier_pruning.py
index 47954f02e01..1f3b30604df 100644
--- a/tests/sparseml/keras/optim/test_modifier_pruning.py
+++ b/tests/sparseml/keras/optim/test_modifier_pruning.py
@@ -60,7 +60,7 @@
     scope="function",
 )
 @pytest.mark.parametrize("steps_per_epoch", [10], scope="function")
-class TestGradualKSModifier:
+class TestGMPruningModifier:
     def test_lifecycle(self, model_lambda, modifier_lambda, steps_per_epoch):
         model = model_lambda()
         modifier = modifier_lambda()
diff --git a/tests/sparseml/pytorch/optim/test_modifier.py b/tests/sparseml/pytorch/optim/test_modifier.py
index 479e8ffb690..948d590e6eb 100644
--- a/tests/sparseml/pytorch/optim/test_modifier.py
+++ b/tests/sparseml/pytorch/optim/test_modifier.py
@@ -705,3 +705,71 @@ def __init__(
 )
 class TestScheduledUpdateModifierImpl(ScheduledUpdateModifierTest):
     pass
+
+
+_SAMPLE_RECIPE = """
+modifiers:
+    - !EpochRangeModifier
+        start_epoch: 0.0
+        end_epoch: 1.0
+
+    - !SetLearningRateModifier
+        start_epoch: 0.0
+        learning_rate: 0.1
+"""
+
+_SAMPLE_GROUPED_RECIPE = """
+training_modifiers:
+    - !EpochRangeModifier
+        start_epoch: 0.0
+        end_epoch: 50.0
+
+    - !SetLearningRateModifier
+        start_epoch: 0.0
+        learning_rate: 0.1
+
+pruning_modifiers:
+    - !GMPruningModifier
+        start_epoch: 0
+        end_epoch: 40
+        init_sparsity: 0.05
+        final_sparsity: 0.85
+        params: __ALL__
+        update_frequency: 0.5
+"""
+
+
+@pytest.mark.parametrize(
+    "modifier_str,num_modifiers",
+    [
+        (_SAMPLE_RECIPE, 2),
+        (_SAMPLE_GROUPED_RECIPE, 3),
+    ],
+)
+def test_load_list(modifier_str, num_modifiers):
+    modifier_list = Modifier.load_list(modifier_str)
+    assert len(modifier_list) == num_modifiers
+
+
+_SAMPLE_BAD_RECIPE = """
+incorrect_modifier_list_name:
+    - !EpochRangeModifier
+        start_epoch: 0.0
+        end_epoch: 1.0
+
+    - !SetLearningRateModifier
+        start_epoch: 0.0
+        learning_rate: 0.1
+"""
+
+
+@pytest.mark.parametrize(
+    "modifier_str",
+    [
+        _SAMPLE_BAD_RECIPE,
+    ],
+)
+def test_load_list_fails(modifier_str):
+    # expect a ValueError mentioning 'modifiers'
+    with pytest.raises(ValueError, match=r".*'modifiers'.*"):
+        Modifier.load_list(modifier_str)
diff --git a/tests/sparseml/tensorflow_v1/optim/test_analyzer_module.py b/tests/sparseml/tensorflow_v1/optim/test_analyzer_module.py
index 8b51e347af7..ebebe0f05c5 100644
--- a/tests/sparseml/tensorflow_v1/optim/test_analyzer_module.py
+++ b/tests/sparseml/tensorflow_v1/optim/test_analyzer_module.py
@@ -85,6 +85,7 @@ def resnet_v2_50(init_weights):
     return tf_compat.get_default_graph()
 
 
+@pytest.mark.flaky
 @pytest.mark.skipif(
     os.getenv("NM_ML_SKIP_TENSORFLOW_TESTS", False),
     reason="Skipping tensorflow_v1 tests",
diff --git a/tests/sparseml/tensorflow_v1/optim/test_mask_creator_pruning.py b/tests/sparseml/tensorflow_v1/optim/test_mask_creator_pruning.py
index 92aa5a6128d..082c90f4578 100644
--- a/tests/sparseml/tensorflow_v1/optim/test_mask_creator_pruning.py
+++ b/tests/sparseml/tensorflow_v1/optim/test_mask_creator_pruning.py
@@ -26,6 +26,7 @@
 from sparseml.tensorflow_v1.utils import eval_tensor_sparsity, tf_compat
 
 
+@pytest.mark.flaky
 @pytest.mark.parametrize(
     ("tensor_shape,mask_creator"),
     [
diff --git a/tests/sparseml/tensorflow_v1/optim/test_mask_pruning.py b/tests/sparseml/tensorflow_v1/optim/test_mask_pruning.py
index 31a2019cb15..ecbda8a2bfc 100644
--- a/tests/sparseml/tensorflow_v1/optim/test_mask_pruning.py
+++ b/tests/sparseml/tensorflow_v1/optim/test_mask_pruning.py
@@ -202,6 +202,7 @@ def test_create_op_pruning_conv(sparsity_val: float, mask_creator: PruningMaskCr
         assert sess.run(mask_vals_are_grouped)
 
 
+@pytest.mark.flaky
 @pytest.mark.skipif(
     os.getenv("NM_ML_SKIP_TENSORFLOW_TESTS", False),
     reason="Skipping tensorflow_v1 tests",
@@ -391,6 +392,7 @@ def test_apply_op_vars_masks(
     assert abs(var_sparsity - sparsity_val) < 1e-2
 
 
+@pytest.mark.flaky
 @pytest.mark.skipif(
     os.getenv("NM_ML_SKIP_TENSORFLOW_TESTS", False),
     reason="Skipping tensorflow_v1 tests",
@@ -502,6 +504,7 @@ def _expected_sparsity(
 )
 
 
+@pytest.mark.flaky
 @pytest.mark.skipif(
     os.getenv("NM_ML_SKIP_TENSORFLOW_TESTS", False),
     reason="Skipping tensorflow_v1 tests",
diff --git a/tests/sparseml/utils/test_helpers.py b/tests/sparseml/utils/test_helpers.py
index 4d0d093d41b..e658a3c6345 100644
--- a/tests/sparseml/utils/test_helpers.py
+++ b/tests/sparseml/utils/test_helpers.py
@@ -19,6 +19,7 @@
     convert_to_bool,
     flatten_iterable,
     interpolate,
+    load_recipe_yaml_str,
     validate_str_iterable,
 )
 
@@ -98,3 +99,14 @@ def test_interpolate(x_cur, x0, x1, y0, y1, inter_func, out):
     interpolated = interpolate(x_cur, x0, x1, y0, y1, inter_func)
     assert abs(out - interpolated) < 0.01
+
+
+@pytest.mark.parametrize(
+    "zoo_path",
+    [
+        "zoo:cv/classification/resnet_v1-50/pytorch/sparseml/imagenette/pruned-conservative",
+        "zoo:cv/classification/resnet_v1-50/pytorch/sparseml/imagenette/pruned-conservative?recipe_type=original",
+    ],
+)
+def test_load_recipe_yaml_str_zoo(zoo_path):
+    assert load_recipe_yaml_str(zoo_path)
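
Usage sketches for the API changes above; these are illustrations, not part of the patch. First, the quantize_torch_qat_export changes: per the new branches, a str model argument is loaded with onnx.load, and output_file_path saves the result with onnx.save. The module path and the keyword placement below are assumptions inferred from this diff, and the file names are placeholders:

```python
# Hedged sketch of the updated export helper; module path and file names assumed.
import onnx

from sparseml.pytorch.optim.quantization import quantize_torch_qat_export

quantized = quantize_torch_qat_export(
    "model_qat.onnx",  # str inputs are now loaded via onnx.load
    output_file_path="model_quantized.onnx",  # saved via onnx.save when provided
)
assert isinstance(quantized, onnx.ModelProto)
```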
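Next, the 'zoo:' stub support added to load_model. A minimal sketch, assuming load_model is re-exported from sparseml.pytorch.utils and that the downloaded checkpoint's parameter names line up with the module being loaded; the stub is the one exercised in tests/sparseml/utils/test_helpers.py, and the torchvision model is only a stand-in for any torch.nn.Module:

```python
# Minimal sketch, assuming name compatibility between checkpoint and module.
from torchvision.models import resnet50

from sparseml.pytorch.utils import load_model

model = resnet50()
# Downloads the .pth framework file behind the stub, then loads its state dict.
load_model(
    "zoo:cv/classification/resnet_v1-50/pytorch/sparseml/imagenette/pruned-conservative",
    model,
    strict=True,  # require every tensor to match between the file and the model
)
```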
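Finally, recipe loading through the new stub handling in load_recipe_yaml_str and ScheduledModifierManager.from_yaml. A sketch assuming both names are re-exported at the package paths shown; from_yaml resolves the stub by calling load_recipe_yaml_str internally, as the manager.py hunk shows:

```python
# Sketch of recipe loading via SparseZoo stubs; package-level exports assumed.
from sparseml.tensorflow_v1.optim import ScheduledModifierManager
from sparseml.utils import load_recipe_yaml_str

stub = (
    "zoo:cv/classification/resnet_v1-50/pytorch/sparseml/imagenette/"
    "pruned-conservative?recipe_type=original"
)

# Download the recipe behind the stub and return its YAML body as a string.
yaml_str = load_recipe_yaml_str(stub)

# Or build the modifier manager directly from the same stub.
manager = ScheduledModifierManager.from_yaml(stub)
```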