From 57654958e3c6a51ebcbf322dfecd3c42a5d0638c Mon Sep 17 00:00:00 2001 From: Toby Roseman Date: Tue, 15 Aug 2023 09:35:41 -0700 Subject: [PATCH] 7.0b2 Release (#1945) * 7.0b2 Release * Fix flake8 errors * Delete comment copied from trees documentation * Skip unit test --- coremlpython/CoreMLPythonUtils.mm | 47 +- coremltools/converters/_converters_entry.py | 59 +- .../converters/mil/backend/mil/helper.py | 42 +- .../converters/mil/backend/mil/load.py | 7 +- .../passes/adjust_io_to_supported_types.py | 4 +- .../mil/backend/mil/passes/test_passes.py | 42 +- coremltools/converters/mil/frontend/_utils.py | 6 +- .../mil/frontend/milproto/helper.py | 8 +- .../mil/frontend/tensorflow/converter.py | 129 +- .../mil/frontend/tensorflow/dialect_ops.py | 6 +- .../mil/frontend/tensorflow/load.py | 2 +- .../converters/mil/frontend/tensorflow/ops.py | 250 +++- .../mil/frontend/tensorflow/test/test_load.py | 235 ++- .../mil/frontend/tensorflow/test/test_ops.py | 391 ++++- .../tensorflow/test/test_tf_conversion_api.py | 785 ++++++++++- .../mil/frontend/tensorflow2/load.py | 1 + .../test/test_tf2_conversion_api.py | 175 ++- .../frontend/tensorflow2/test/test_v2_load.py | 110 +- .../tensorflow2/test/test_v2_ops_tf_keras.py | 30 + .../mil/frontend/torch/converter.py | 66 +- .../converters/mil/frontend/torch/load.py | 18 +- .../converters/mil/frontend/torch/ops.py | 96 +- .../mil/frontend/torch/test/test_api.py | 16 +- .../mil/frontend/torch/test/test_examples.py | 15 +- .../torch/test/test_torch_conversion_api.py | 818 +++++++++-- .../mil/frontend/torch/test/test_torch_ops.py | 89 +- coremltools/converters/mil/input_types.py | 15 +- coremltools/converters/mil/mil/block.py | 2 + .../converters/mil/mil/ops/defs/_utils.py | 68 +- .../mil/mil/ops/defs/iOS15/control_flow.py | 27 +- .../mil/ops/defs/iOS15/elementwise_unary.py | 59 +- .../mil/ops/defs/iOS15/tensor_operation.py | 13 +- .../ops/defs/iOS15/tensor_transformation.py | 6 +- .../mil/mil/ops/defs/iOS16/constexpr_ops.py | 14 +- .../mil/mil/ops/defs/iOS16/image_resizing.py | 3 - .../mil/mil/ops/defs/iOS17/__init__.py | 19 +- .../converters/mil/mil/ops/defs/iOS17/conv.py | 69 + .../mil/ops/defs/iOS17/elementwise_unary.py | 147 +- .../mil/mil/ops/defs/iOS17/image_resizing.py | 159 ++- .../mil/mil/ops/defs/iOS17/linear.py | 119 ++ .../mil/mil/ops/defs/iOS17/normalization.py | 148 ++ .../mil/mil/ops/defs/iOS17/recurrent.py | 98 ++ .../mil/ops/defs/iOS17/tensor_operation.py | 62 +- .../ops/defs/iOS17/tensor_transformation.py | 558 +++++++- .../mil/mil/ops/tests/iOS14/__init__.py | 9 + .../ops/tests/{ => iOS14}/test_activation.py | 294 +--- .../tests/{ => iOS14}/test_control_flow.py | 159 ++- .../mil/ops/tests/{ => iOS14}/test_conv.py | 377 ++--- .../{ => iOS14}/test_elementwise_binary.py | 18 +- .../{ => iOS14}/test_elementwise_unary.py | 236 +--- .../ops/tests/iOS14/test_image_resizing.py | 548 +++++++ .../mil/ops/tests/{ => iOS14}/test_linear.py | 66 +- .../tests/{ => iOS14}/test_normalization.py | 372 +++-- .../mil/ops/tests/{ => iOS14}/test_pool.py | 57 +- .../mil/ops/tests/{ => iOS14}/test_random.py | 108 +- .../ops/tests/{ => iOS14}/test_recurrent.py | 164 ++- .../ops/tests/{ => iOS14}/test_reduction.py | 103 +- .../ops/tests/iOS14/test_scatter_gather.py | 675 +++++++++ .../{ => iOS14}/test_tensor_operation.py | 305 ++-- .../{ => iOS14}/test_tensor_transformation.py | 1009 +++++++------ .../mil/mil/ops/tests/iOS15/__init__.py | 9 + .../ops/tests/iOS15/test_image_resizing.py | 358 +++++ .../tests/iOS15/test_tensor_transformation.py | 101 ++ 
.../mil/mil/ops/tests/iOS16/__init__.py | 9 + .../tests/{ => iOS16}/test_constexpr_ops.py | 187 +-- .../ops/tests/iOS16/test_image_resizing.py | 195 +++ .../ops/tests/iOS16/test_scatter_gather.py | 185 +++ .../ops/tests/iOS16/test_tensor_operation.py | 85 ++ .../tests/iOS16/test_tensor_transformation.py | 164 +++ .../mil/mil/ops/tests/iOS17/__init__.py | 9 + .../mil/ops/tests/iOS17/test_activation.py | 182 +++ .../mil/mil/ops/tests/iOS17/test_conv.py | 171 +++ .../ops/tests/iOS17/test_elementwise_unary.py | 177 +++ .../ops/tests/iOS17/test_image_resizing.py | 407 ++++++ .../mil/mil/ops/tests/iOS17/test_linear.py | 158 +++ .../mil/ops/tests/iOS17/test_normalization.py | 176 +++ .../tests/{ => iOS17}/test_quantization.py | 91 +- .../mil/mil/ops/tests/iOS17/test_recurrent.py | 325 +++++ .../mil/mil/ops/tests/iOS17/test_reduction.py | 63 + .../ops/tests/iOS17/test_scatter_gather.py | 358 +++++ .../ops/tests/iOS17/test_tensor_operation.py | 141 ++ .../tests/iOS17/test_tensor_transformation.py | 381 +++++ .../mil/mil/ops/tests/test_const.py | 62 - .../mil/mil/ops/tests/test_image_resizing.py | 1049 -------------- .../mil/mil/ops/tests/test_scatter_gather.py | 1170 --------------- .../mil/mil/ops/tests/test_slice.py | 403 ------ .../mil/mil/ops/tests/test_utils.py | 13 - .../mil/mil/ops/tests/testing_utils.py | 51 +- .../passes/defs/cleanup/noop_elimination.py | 28 +- .../defs/cleanup/remove_redundant_ops.py | 7 +- .../defs/optimize_elementwise_binary.py | 170 ++- .../mil/passes/defs/optimize_quantization.py | 100 +- .../mil/passes/defs/optimize_repeat_ops.py | 390 ++--- .../mil/mil/passes/defs/preprocess.py | 17 +- .../mil/mil/passes/defs/quantization.py | 9 +- .../converters/mil/mil/passes/graph_pass.md | 6 +- .../mil/mil/passes/pass_pipeline.py | 26 +- .../mil/mil/passes/tests/test_passes.py | 1256 ++++++++++++++--- .../passes/tests/test_quantization_passes.py | 266 +++- .../converters/mil/mil/tests/test_programs.py | 7 +- .../converters/mil/mil/types/__init__.py | 40 +- .../converters/mil/mil/types/type_dict.py | 6 + .../converters/mil/mil/types/type_mapping.py | 100 +- coremltools/converters/mil/mil/var.py | 6 +- ...inputs.py => test_inputs_outputs_shape.py} | 110 +- coremltools/converters/mil/testing_reqs.py | 138 +- coremltools/converters/mil/testing_utils.py | 135 +- .../models/ml_program/compression_utils.py | 4 +- coremltools/models/model.py | 6 +- .../neural_network/quantization_utils.py | 32 +- coremltools/optimize/coreml/__init__.py | 16 +- coremltools/optimize/coreml/_config.py | 37 +- .../coreml/_post_training_quantization.py | 354 ++++- coremltools/test/api/test_api_examples.py | 42 +- coremltools/test/api/test_api_visibilities.py | 3 + .../neural_network/test_numpy_nn_layers.py | 14 + coremltools/test/optimize/api/__init__.py | 4 + .../test/optimize/api/test_optimize_api.py | 636 +++++++++ .../coreml/test_post_training_quantization.py | 197 ++- coremltools/test/optimize/torch/conftest.py | 2 +- coremltools/version.py | 2 +- ...mltools.converters.mil.mil.passes.defs.rst | 2 + ...mize.coreml.post_training_quantization.rst | 5 +- .../docs/Format/ItemSimilarityRecommender.rst | 24 +- .../format/ItemSimilarityRecommender.proto | 19 - mlmodel/src/ResultReason.hpp | 1 - reqs/test.pip | 6 +- setup.py | 2 +- 128 files changed, 14460 insertions(+), 5978 deletions(-) create mode 100644 coremltools/converters/mil/mil/ops/defs/iOS17/conv.py create mode 100644 coremltools/converters/mil/mil/ops/defs/iOS17/linear.py create mode 100644 
coremltools/converters/mil/mil/ops/defs/iOS17/normalization.py create mode 100644 coremltools/converters/mil/mil/ops/defs/iOS17/recurrent.py create mode 100644 coremltools/converters/mil/mil/ops/tests/iOS14/__init__.py rename coremltools/converters/mil/mil/ops/tests/{ => iOS14}/test_activation.py (79%) rename coremltools/converters/mil/mil/ops/tests/{ => iOS14}/test_control_flow.py (77%) rename coremltools/converters/mil/mil/ops/tests/{ => iOS14}/test_conv.py (72%) rename coremltools/converters/mil/mil/ops/tests/{ => iOS14}/test_elementwise_binary.py (97%) rename coremltools/converters/mil/mil/ops/tests/{ => iOS14}/test_elementwise_unary.py (74%) create mode 100644 coremltools/converters/mil/mil/ops/tests/iOS14/test_image_resizing.py rename coremltools/converters/mil/mil/ops/tests/{ => iOS14}/test_linear.py (86%) rename coremltools/converters/mil/mil/ops/tests/{ => iOS14}/test_normalization.py (67%) rename coremltools/converters/mil/mil/ops/tests/{ => iOS14}/test_pool.py (90%) rename coremltools/converters/mil/mil/ops/tests/{ => iOS14}/test_random.py (84%) rename coremltools/converters/mil/mil/ops/tests/{ => iOS14}/test_recurrent.py (83%) rename coremltools/converters/mil/mil/ops/tests/{ => iOS14}/test_reduction.py (81%) create mode 100644 coremltools/converters/mil/mil/ops/tests/iOS14/test_scatter_gather.py rename coremltools/converters/mil/mil/ops/tests/{ => iOS14}/test_tensor_operation.py (86%) rename coremltools/converters/mil/mil/ops/tests/{ => iOS14}/test_tensor_transformation.py (66%) create mode 100644 coremltools/converters/mil/mil/ops/tests/iOS15/__init__.py create mode 100644 coremltools/converters/mil/mil/ops/tests/iOS15/test_image_resizing.py create mode 100644 coremltools/converters/mil/mil/ops/tests/iOS15/test_tensor_transformation.py create mode 100644 coremltools/converters/mil/mil/ops/tests/iOS16/__init__.py rename coremltools/converters/mil/mil/ops/tests/{ => iOS16}/test_constexpr_ops.py (78%) create mode 100644 coremltools/converters/mil/mil/ops/tests/iOS16/test_image_resizing.py create mode 100644 coremltools/converters/mil/mil/ops/tests/iOS16/test_scatter_gather.py create mode 100644 coremltools/converters/mil/mil/ops/tests/iOS16/test_tensor_operation.py create mode 100644 coremltools/converters/mil/mil/ops/tests/iOS16/test_tensor_transformation.py create mode 100644 coremltools/converters/mil/mil/ops/tests/iOS17/__init__.py create mode 100644 coremltools/converters/mil/mil/ops/tests/iOS17/test_activation.py create mode 100644 coremltools/converters/mil/mil/ops/tests/iOS17/test_conv.py create mode 100644 coremltools/converters/mil/mil/ops/tests/iOS17/test_elementwise_unary.py create mode 100644 coremltools/converters/mil/mil/ops/tests/iOS17/test_image_resizing.py create mode 100644 coremltools/converters/mil/mil/ops/tests/iOS17/test_linear.py create mode 100644 coremltools/converters/mil/mil/ops/tests/iOS17/test_normalization.py rename coremltools/converters/mil/mil/ops/tests/{ => iOS17}/test_quantization.py (87%) create mode 100644 coremltools/converters/mil/mil/ops/tests/iOS17/test_recurrent.py create mode 100644 coremltools/converters/mil/mil/ops/tests/iOS17/test_reduction.py create mode 100644 coremltools/converters/mil/mil/ops/tests/iOS17/test_scatter_gather.py create mode 100644 coremltools/converters/mil/mil/ops/tests/iOS17/test_tensor_operation.py create mode 100644 coremltools/converters/mil/mil/ops/tests/iOS17/test_tensor_transformation.py delete mode 100644 coremltools/converters/mil/mil/ops/tests/test_const.py delete mode 100644 
coremltools/converters/mil/mil/ops/tests/test_image_resizing.py delete mode 100644 coremltools/converters/mil/mil/ops/tests/test_scatter_gather.py delete mode 100644 coremltools/converters/mil/mil/ops/tests/test_slice.py rename coremltools/converters/mil/{test_flexible_shape_inputs.py => test_inputs_outputs_shape.py} (87%) create mode 100644 coremltools/test/optimize/api/__init__.py create mode 100644 coremltools/test/optimize/api/test_optimize_api.py diff --git a/coremlpython/CoreMLPythonUtils.mm b/coremlpython/CoreMLPythonUtils.mm index 9bed610d0..edfc2de1e 100644 --- a/coremlpython/CoreMLPythonUtils.mm +++ b/coremlpython/CoreMLPythonUtils.mm @@ -444,6 +444,13 @@ static size_t sizeOfArrayElement(MLMultiArrayDataType type) { return py::none(); } MLMultiArrayDataType type = value.dataType; + if (type == MLMultiArrayDataTypeFloat16) { + // Cast to fp32 because py::array doesn't support fp16. + // TODO: rdar://92239209 : return np.float16 instead of np.float32 when multiarray type is Float16 + value = [MLMultiArray multiArrayByConcatenatingMultiArrays:@[value] alongAxis:0 dataType:MLMultiArrayDataTypeFloat32]; + type = value.dataType; + } + std::vector<size_t> shape = Utils::convertNSArrayToCpp(value.shape); std::vector<size_t> strides = Utils::convertNSArrayToCpp(value.strides); @@ -451,28 +458,26 @@ static size_t sizeOfArrayElement(MLMultiArrayDataType type) { for (size_t& stride : strides) { stride *= sizeOfArrayElement(type); } - - switch (type) { - case MLMultiArrayDataTypeInt32: - return py::array(shape, strides, static_cast<const int32_t *>(value.dataPointer)); - case MLMultiArrayDataTypeFloat32: - return py::array(shape, strides, static_cast<const float *>(value.dataPointer)); - case MLMultiArrayDataTypeFloat16: - { - // create a float32 array, cast float16 values and copy into it - // TODO: rdar://92239209 : return np.float16 instead of np.float32 when multiarray type is Float16 - std::vector<float> value_fp32(value.count, 0.0); - for (size_t i=0; i<value.count; i++) { ... } + __block py::object array; + [value getBytesWithHandler:^(const void *bytes, NSInteger size) { + switch (type) { + case MLMultiArrayDataTypeInt32: + array = py::array(shape, strides, reinterpret_cast<const int32_t *>(bytes)); + break; + case MLMultiArrayDataTypeFloat32: + array = py::array(shape, strides, reinterpret_cast<const float *>(bytes)); + break; + case MLMultiArrayDataTypeFloat64: + array = py::array(shape, strides, reinterpret_cast<const double *>(bytes)); + break; + default: + assert(false); + array = py::object(); } - case MLMultiArrayDataTypeDouble: - return py::array(shape, strides, static_cast<const double *>(value.dataPointer)); - default: - assert(false); - return py::object(); - } + }]; + + return array; } py::object Utils::convertDictionaryValueToPython(NSDictionary * dict) { diff --git a/coremltools/converters/_converters_entry.py b/coremltools/converters/_converters_entry.py index d0d914d3f..438cb128b 100644 --- a/coremltools/converters/_converters_entry.py +++ b/coremltools/converters/_converters_entry.py @@ -14,6 +14,7 @@ ) from coremltools import ComputeUnit as _ComputeUnit from coremltools import __version__ as _ct_version +from coremltools import _logger as logger from coremltools._deps import _HAS_TF_1, _HAS_TF_2, _HAS_TORCH from coremltools.converters._profile_utils import _profile from coremltools.converters.mil._deployment_compatibility import ( @@ -156,6 +157,8 @@ def convert( ``ct.utils.rename_feature`` API. - If ``dtype`` is not specified, it defaults to the ``dtype`` of the inputs in the TF model. + - For ``minimum_deployment_target >= ct.target.macOS13`` with ``compute_precision`` in float 16 precision: + when ``inputs`` is not provided or ``dtype`` is not specified, float 32 inputs default to float 16. * PyTorch: - The ``inputs`` parameter is required.
@@ -166,7 +169,10 @@ def convert( - If the ``name`` argument is specified with ``TensorType`` or ``ImageType``, the converted Core ML model will have inputs with the same name. - - If ``dtype`` is missing, it defaults to float 32. + - If ``dtype`` is missing: + * For ``minimum_deployment_target <= ct.target.macOS12``, it defaults to float 32. + * For ``minimum_deployment_target >= ct.target.macOS13`` with ``compute_precision`` in float 16 precision, + it defaults to float 16. outputs : list of ``TensorType`` or ``ImageType`` (optional) @@ -206,6 +212,9 @@ - If specified, the ``name`` with ``TensorType`` or ``ImageType`` must correspond to a node in the TF graph. In this case, the model will be converted up to that node. + - For ``minimum_deployment_target >= ct.target.macOS13`` with ``compute_precision`` in float 16 precision: + if ``dtype`` is not specified, outputs inferred as float 32 + default to float 16. * PyTorch: - If specified, the length of the list must match the number of outputs returned by the PyTorch model. - If ``name`` is specified, it is applied to the output names of the converted Core ML model. + - For ``minimum_deployment_target >= ct.target.macOS13`` with ``compute_precision`` in float 16 precision: + if ``dtype`` is not specified, outputs inferred as float 32 + default to float 16. + classifier_config : ClassifierConfig class (optional) The configuration if the MLModel is intended to be a classifier. @@ -221,7 +234,7 @@ A member of the ``coremltools.target`` enum. The value of this parameter determines the type of the model representation produced by the converter. To learn about the differences between ML programs and neural networks, see `ML Programs `_. - The converter produces a neural network (``neuralnetwork``) if: @@ -239,33 +252,34 @@ coremltools.target.tvOS15: - If neither the ``minimum_deployment_target`` nor the ``convert_to`` - parameter is specified, the converter produces the neural network + parameter is specified, the converter produces the ML program model type with as low a deployment target as possible. - If this parameter is specified and ``convert_to`` is also specified, they must be compatible. The following are examples of invalid values: :: - # Invalid: - convert_to="neuralnetwork", minimum_deployment_target=coremltools.target.iOS15 # Invalid: convert_to="mlprogram", minimum_deployment_target=coremltools.target.iOS14 + # Invalid: + convert_to="neuralnetwork", minimum_deployment_target=coremltools.target.iOS15 + convert_to : str (optional) - Must be one of [``'neuralnetwork'``, ``'mlprogram'``, ``'milinternal'``]. + Must be one of [``'mlprogram'``, ``'neuralnetwork'``, ``'milinternal'``]. The value of this parameter determines the type of the model representation produced by the converter. To learn about the - differences between neural networks and ML programs, see + differences between ML programs and neural networks, see `ML Programs `_. + - ``'mlprogram'`` : Returns an MLModel (``coremltools.models.MLModel``) + containing a MILSpec.Program proto, which is the Core ML program format. + The model saved from this returned object is executable on iOS15, + macOS12, watchOS8, and tvOS15. - ``'neuralnetwork'``: Returns an MLModel (``coremltools.models.MLModel``) containing a NeuralNetwork proto, which is the original Core ML format.
The model saved from this returned object is executable either on iOS13/macOS10.15/watchOS6/tvOS13 and newer, or on iOS14/macOS11/watchOS7/tvOS14 and newer, depending on the layers used in the model. - - ``'mlprogram'`` : Returns an MLModel (``coremltools.models.MLModel``) - containing a MILSpec.Program proto, which is the Core ML program format. - The model saved from this returned object is executable on iOS15, - macOS12, watchOS8, and tvOS15. - ``'milinternal'``: Returns an MIL program object (``coremltools.converters.mil.Program``). An MIL program is primarily used for debugging and inspection. It can be converted to an MLModel for @@ -275,7 +289,7 @@ ct.convert(mil_program, convert_to="mlprogram") - If neither the ``minimum_deployment_target`` nor the ``convert_to`` - parameter is specified, the converter produces the neural network + parameter is specified, the converter produces the ML program model type with as low a deployment target as possible. compute_precision : coremltools.precision enumeration or ct.transform.FP16ComputePrecision() (optional) @@ -504,10 +518,11 @@ def skip_real_div_ops(op): exact_target, minimum_deployment_target, ) + need_fp16_cast_pass = _need_fp16_cast_pass(compute_precision, exact_target) if pass_pipeline is None: pass_pipeline = PassPipeline() - if not _need_fp16_cast_pass(compute_precision, exact_target): + if not need_fp16_cast_pass: pass_pipeline.remove_passes({"common::add_fp16_cast"}) if isinstance(compute_precision, FP16ComputePrecision): # For backward compatibility with the `op_selector` param in FP16ComputePrecision. @@ -527,6 +542,12 @@ def skip_real_div_ops(op): if specification_version is None: specification_version = _set_default_specification_version(exact_target) + use_default_fp16_io = ( + specification_version is not None + and specification_version >= AvailableTarget.iOS16 + and need_fp16_cast_pass + ) + mlmodel = mil_convert( model, convert_from=exact_source, @@ -540,6 +561,7 @@ def skip_real_div_ops(op): debug=debug, specification_version=specification_version, main_pipeline=pass_pipeline, + use_default_fp16_io=use_default_fp16_io, ) if exact_target == "mlprogram" and mlmodel._input_has_infinite_upper_bound(): @@ -890,6 +912,15 @@ def _determine_target(convert_to, minimum_deployment_target): """ Infer the precise backend target, which could be one of ``milinternal``, ``neuralnetwork`` or ``mlprogram`` """ + if minimum_deployment_target is None and convert_to is None: + logger.warning( + "When both 'convert_to' and 'minimum_deployment_target' are not specified, " + "'convert_to' is set to \"mlprogram\" and 'minimum_deployment_target' is set to " + "ct.target.iOS15 (which is the same as ct.target.macOS12). " + "Note: the model will not run on systems older than iOS15/macOS12/watchOS8/tvOS15. " + "To make your model run on older systems, please set 'minimum_deployment_target' to iOS14/iOS13. 
" + "Details please see the link: https://coremltools.readme.io/docs/unified-conversion-api#target-conversion-formats" + ) if minimum_deployment_target is not None: if convert_to == "mlprogram" and minimum_deployment_target < AvailableTarget.iOS15: raise ValueError( @@ -908,7 +939,7 @@ def _determine_target(convert_to, minimum_deployment_target): return convert_to else: if minimum_deployment_target is None: - return "neuralnetwork" + return "mlprogram" elif minimum_deployment_target <= AvailableTarget.iOS14: return "neuralnetwork" else: diff --git a/coremltools/converters/mil/backend/mil/helper.py b/coremltools/converters/mil/backend/mil/helper.py index 078a662a1..880f4bda1 100644 --- a/coremltools/converters/mil/backend/mil/helper.py +++ b/coremltools/converters/mil/backend/mil/helper.py @@ -10,10 +10,12 @@ import coremltools.proto.FeatureTypes_pb2 as ft import coremltools.proto.MIL_pb2 as pm from coremltools.converters.mil.mil import types -from coremltools.converters.mil.mil.types import (builtin_to_proto_types, - builtin_to_string, - numpy_type_to_builtin_type, - type_to_builtin_type) +from coremltools.converters.mil.mil.types import ( + BUILTIN_TO_PROTO_TYPES, + builtin_to_string, + numpy_type_to_builtin_type, + type_to_builtin_type, +) from coremltools.converters.mil.mil.types.type_mapping import np_val_to_py_type from coremltools.models.utils import _WEIGHTS_DIR_NAME, _WEIGHTS_FILE_NAME @@ -91,20 +93,30 @@ def update_tensortype(t_type, shape, data_type): set_proto_dim(t_dim, s) def _tensor_field_by_type(tensor_val, builtin_type): + """ + Pick the field based on the builtin_type. + + The field is defined in TensorValue in ``mlmodel/format/MIL.proto``. + The picked field need to be consistent with how it will be read by MIL. + For example, int8 is serialized to ``bytes`` field while int16 is serialized to ``ints`` field. + """ if builtin_type == types.bool: return tensor_val.bools.values elif types.is_int(builtin_type): - if (builtin_type == types.int64 or builtin_type == types.uint64): + if builtin_type == types.int64 or builtin_type == types.uint64: return tensor_val.longInts.values if builtin_type in (types.int8, types.uint8, types.uint32): return tensor_val.bytes.values + if builtin_type == types.int16 or builtin_type == types.uint16: + # TODO (rdar://111797203): Serialize to byte after MIL changes to read from byte field. + return tensor_val.ints.values return tensor_val.ints.values elif types.is_float(builtin_type): - if (builtin_type == types.fp64): + if builtin_type == types.fp64: return tensor_val.doubles.values - elif (builtin_type == types.fp32): + elif builtin_type == types.fp32: return tensor_val.floats.values - elif (builtin_type == types.fp16): + elif builtin_type == types.fp16: return tensor_val.bytes.values else: raise TypeError( @@ -177,14 +189,8 @@ def create_scalar_value(py_scalar): # Set the tensor value t_field = _tensor_field_by_type(t_val, builtin_type) - if builtin_type in ( - types.fp16, - types.int8, - types.uint8, - types.int16, - types.uint16, - types.uint32, - ): + if builtin_type in (types.fp16, types.int8, types.uint8, types.uint32): + # Serialize to bytes because MIL read them from the "bytes" field in TensorValue. 
val.immediateValue.tensor.bytes.values = np_val_to_py_type(py_scalar) else: if builtin_type == types.str: @@ -243,7 +249,7 @@ def create_file_value_tensor(file_name, offset, dim, data_type): def types_to_proto_primitive(valuetype): - if valuetype not in builtin_to_proto_types: + if valuetype not in BUILTIN_TO_PROTO_TYPES: additional_error_msg = "" if valuetype in (types.complex64, types.complex128): additional_error_msg = ( @@ -253,7 +259,7 @@ def types_to_proto_primitive(valuetype): raise ValueError( f"Unknown map from SSA type {valuetype} to Proto type. {additional_error_msg}" ) - return builtin_to_proto_types[valuetype] + return BUILTIN_TO_PROTO_TYPES[valuetype] def types_to_proto(valuetype): diff --git a/coremltools/converters/mil/backend/mil/load.py b/coremltools/converters/mil/backend/mil/load.py index 246179e01..36691868b 100644 --- a/coremltools/converters/mil/backend/mil/load.py +++ b/coremltools/converters/mil/backend/mil/load.py @@ -429,7 +429,12 @@ def load(prog, weights_dir, resume_on_errors=False, specification_version=_SPECI # Classifier outputs are set up separately, so default to fp32 for now. dataType = ft.ArrayFeatureType.ArrayDataType.FLOAT32 - array_type = ft.ArrayFeatureType(shape=None, dataType=dataType) + output_shape = ( + None + if any_symbolic(var.shape) or types.is_primitive(var.sym_type) + else var.shape + ) + array_type = ft.ArrayFeatureType(shape=output_shape, dataType=dataType) output_feature_type.multiArrayType.CopyFrom(array_type) output_features.append(ml.FeatureDescription(name=var.name, type=output_feature_type)) elif (types.is_dict(var.sym_type)): diff --git a/coremltools/converters/mil/backend/mil/passes/adjust_io_to_supported_types.py b/coremltools/converters/mil/backend/mil/passes/adjust_io_to_supported_types.py index afacdf024..df8d9349b 100644 --- a/coremltools/converters/mil/backend/mil/passes/adjust_io_to_supported_types.py +++ b/coremltools/converters/mil/backend/mil/passes/adjust_io_to_supported_types.py @@ -79,7 +79,7 @@ def _adjust_var_dtype_helper(var, dtype): def _get_io_supported_types(opset_version: target) -> Set[type]: """Get Core ML I/O supported data types based on opset version.""" supported_types = {types.fp32, types.int32} - if opset_version >= target.iOS16: + if opset_version is not None and opset_version >= target.iOS16: supported_types.add(types.fp16) return supported_types @@ -88,7 +88,7 @@ def _get_runtime_supported_types(opset_version: target) -> Set[type]: """Get Core ML Runtime supported data types based on opset version.""" supported_types = {types.fp16, types.fp32, types.int32, types.str, types.bool} if opset_version >= target.iOS17: - supported_types.update({types.int16, types.uint16}) + supported_types.update({types.int8, types.uint8, types.int16, types.uint16}) return supported_types diff --git a/coremltools/converters/mil/backend/mil/passes/test_passes.py b/coremltools/converters/mil/backend/mil/passes/test_passes.py index 4168fb775..8523b0618 100644 --- a/coremltools/converters/mil/backend/mil/passes/test_passes.py +++ b/coremltools/converters/mil/backend/mil/passes/test_passes.py @@ -31,8 +31,11 @@ def prog(x, y, z): prog.functions['not_main'] = copy.deepcopy(prog.functions['main']) prev_prog, prev_block, block = apply_pass_and_basic_check( - prog, "mil_backend::adjust_io_to_supported_types" - ) + prog, + "mil_backend::adjust_io_to_supported_types", + skip_output_type_check=True, + skip_input_type_check=True, + ) # output dtype is modified """ Input graph: @@ -109,8 +112,11 @@ def prog(x): return x prev_prog, 
prev_block, block = apply_pass_and_basic_check( - prog, "mil_backend::adjust_io_to_supported_types" - ) + prog, + "mil_backend::adjust_io_to_supported_types", + skip_output_type_check=True, + skip_input_type_check=True, + ) # output dtype is modified prev_inputs = list(prev_prog.functions['main'].inputs.items()) inputs = list(prog.functions['main'].inputs.items()) @@ -134,8 +140,11 @@ def prog(x): return x prev_prog, prev_block, block = apply_pass_and_basic_check( - prog, "mil_backend::adjust_io_to_supported_types" - ) + prog, + "mil_backend::adjust_io_to_supported_types", + skip_output_type_check=True, + skip_input_type_check=True, + ) # output dtype is modified prev_inputs = list(prev_prog.functions['main'].inputs.items()) inputs = list(prog.functions['main'].inputs.items()) @@ -173,8 +182,12 @@ def test_float16_input_output(self, opset_version): def prog(x): return mb.relu(x=x) + skip_type_check = opset_version in [None, ct.target.iOS13] prev_prog, prev_block, block = apply_pass_and_basic_check( prog, "mil_backend::adjust_io_to_supported_types", + skip_output_type_check=skip_type_check, + skip_input_type_check=skip_type_check, ) prev_inputs = list(prev_block.inputs.items()) @@ -240,8 +253,11 @@ def prog(x): return x prev_prog, prev_block, block = apply_pass_and_basic_check( - prog, "mil_backend::adjust_io_to_supported_types" - ) + prog, + "mil_backend::adjust_io_to_supported_types", + skip_output_type_check=True, + skip_input_type_check=True, + ) # output dtype is modified prev_inputs = list(prev_prog.functions['main'].inputs.items()) inputs = list(prog.functions['main'].inputs.items()) @@ -280,9 +296,13 @@ def test_int16_input(self, opset_version): def prog(x): return x + skip_type_check = opset_version is None prev_prog, prev_block, block = apply_pass_and_basic_check( - prog, "mil_backend::adjust_io_to_supported_types" - ) + prog, + "mil_backend::adjust_io_to_supported_types", + skip_output_type_check=True, + skip_input_type_check=True, + ) # output dtype is modified prev_inputs = list(prev_block.inputs.items()) inputs = list(block.inputs.items()) diff --git a/coremltools/converters/mil/frontend/_utils.py b/coremltools/converters/mil/frontend/_utils.py index 0c98a9d51..747071e8a 100644 --- a/coremltools/converters/mil/frontend/_utils.py +++ b/coremltools/converters/mil/frontend/_utils.py @@ -73,7 +73,7 @@ def build_einsum_mil(vars: List[Var], equation: str, name: str) -> Var: parsed_vectors = parse_einsum_equation(equation) if len(vars) != 2: - return solve_generic_einsum(parsed_vectors, vars, name) + return solve_generic_einsum(list(parsed_vectors), vars, name) equation_rev = _reverse_input_einsum_eq(equation) parsed_vectors_rev = parse_einsum_equation(equation_rev) @@ -82,7 +82,7 @@ def _swap(a, b): return b, a a_var, b_var = vars - is_dynamic = any([is_symbolic(var) for var in vars]) + is_dynamic = any([any_symbolic(var.shape) for var in vars]) # list of equations supported for explicit mil translations vec_bnqd_bnkd_bnqk = ( [0, 1, 2, 3], @@ -173,7 +173,7 @@ def _swap(a, b): else: x = mb.einsum(values=(b_var, a_var), equation=equation_rev, name=name) else: - x = solve_generic_einsum(parsed_vectors, [a_var, b_var], name) + x = solve_generic_einsum(list(parsed_vectors), [a_var, b_var], name) return x diff --git a/coremltools/converters/mil/frontend/milproto/helper.py b/coremltools/converters/mil/frontend/milproto/helper.py index b1fe7e6a2..6d7bed661 100644 --- a/coremltools/converters/mil/frontend/milproto/helper.py +++ 
b/coremltools/converters/mil/frontend/milproto/helper.py @@ -23,7 +23,7 @@ def proto_to_types(valuetype): """ if valuetype.WhichOneof("type") == "tensorType": tensortype = valuetype.tensorType - dtype = types.proto_to_builtin_types[tensortype.dataType] + dtype = types.PROTO_TO_BUILTIN_TYPE[tensortype.dataType] if tensortype.rank < 0: raise ValueError("Negative or Dynamic ranks not supported") @@ -39,13 +39,13 @@ # For the zero rank tensor, we always convert it back to scalar in PyMIL first if tensortype.rank == 0: return dtype - + return types.tensor(dtype, shape) elif valuetype.WhichOneof("type") == "listType": listtype = valuetype.listType elem_type = proto_to_types(listtype.type) - + if listtype.length.unknown: init_length = None else: @@ -59,7 +59,7 @@ dicttype = valuetype.dictionaryType keytype = proto_to_types(dicttype.keyType) valuetype = proto_to_types(dicttype.valueType) - + return types.dict(keytype, valuetype) else: raise NotImplementedError("Types {} not yet implemented".format(valuetype.WhichOneof("type"))) diff --git a/coremltools/converters/mil/frontend/tensorflow/converter.py b/coremltools/converters/mil/frontend/tensorflow/converter.py index 98dd468c7..c0e5574ed 100644 --- a/coremltools/converters/mil/frontend/tensorflow/converter.py +++ b/coremltools/converters/mil/frontend/tensorflow/converter.py @@ -107,27 +107,69 @@ def __contains__(self, tf_name): class TFConverter: - def __init__(self, tfssa, inputs=None, outputs=None, opset_version=None): + def __init__( + self, tfssa, inputs=None, outputs=None, opset_version=None, use_default_fp16_io=False + ): """ tfssa: TensorFlow IR. inputs: list of TensorType or ImageType, optional, defaults to None. outputs: list[ct.InputType] or None list of either ct.TensorTypes or ct.ImageTypes (both of which are child classes of InputType) This is the value of the "outputs" argument, passed on by the user in "coremltools.convert" API. + opset_version: An int representing the Core ML opset version. + use_default_fp16_io (optional): bool. Defaults to False. + When minimum_deployment_target is set >= ct.target.iOS16 (the same as ct.target.macOS13) + and the compute precision is set to fp16, this flag is True. + When True, fp32 i/o defaults to fp16. """ self.tfssa = tfssa self.global_type = {} self.inputs = None self.main_output_types = outputs self.opset_version = _target(opset_version) if opset_version is not None else None + self.use_default_fp16_io = use_default_fp16_io output_names = get_output_names(outputs) main_func = tfssa.functions["main"] graph = main_func.graph - # Filter the inputs to only Placeholder names + # Get input dtypes and shapes defined in the tf graph tf_placeholder_names = [n for n in graph if graph[n].op == "Placeholder"] - placeholder_names = [] + tf_input_dtype = {} + tf_input_shape = {} + image_input_names = [] + inputs_with_defined_shape = [] + + if inputs is not None: + # Special case: if there's only 1 input and 1 placeholder, we match them. 
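            # Editorial illustration (not part of this diff): this matching lets single-input
            # graphs omit the input name, e.g. a hypothetical call
            #   ct.convert(tf_model, inputs=[ct.TensorType(shape=(1, 224, 224, 3))])
            # picks up the lone Placeholder's name automatically; with use_default_fp16_io=True,
            # that input's fp32 dtype is then defaulted to fp16 by the parsing below.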
+ if len(tf_placeholder_names) == 1 and len(inputs) == 1: + if inputs[0].name is None: + inputs[0].name = tf_placeholder_names[0] + for val in inputs: + if isinstance(val, ImageType): + image_input_names.append(val.name) + if val.shape is not None: + inputs_with_defined_shape.append(val.name) + + for inp in main_func.inputs: + node = graph[inp] + + # Parse dtype from the tf graph + dtype = node.attr["dtype"] + if use_default_fp16_io and dtype == types.fp32 and inp not in image_input_names: + dtype = types.fp16 + + tf_input_dtype[inp] = dtype + + # Parse shape from the tf graph + if inp not in inputs_with_defined_shape: + shape = self._get_placeholder_shape_from_tf_graph(tfgraph=graph, name=inp) + shape = [get_new_symbol() if s is None or s == -1 else s for s in shape] + shape = _get_shaping_class(shape) + tf_input_shape[inp] = shape + + # Filter the inputs to only Placeholder names + missing_placeholder_names = [] if inputs is not None: # Check inputs format if not isinstance(inputs, (list, tuple)): @@ -143,12 +185,6 @@ def __init__(self, tfssa, inputs=None, outputs=None, opset_version=None): ) ) - # Special case: if there's only 1 input and 1 placeholder, we match them. - if len(tf_placeholder_names) == 1 and len(inputs) == 1: - if inputs[0].name is None: - inputs[0].name = tf_placeholder_names[0] - - # We fill in shapes for user-specified input that doesn't have shape for inp in inputs: # Check inputs existence if inp.name is None: @@ -161,32 +197,27 @@ def __init__(self, tfssa, inputs=None, outputs=None, opset_version=None): inp.name, tf_placeholder_names ) ) + # We fill in shapes and dtypes for user-specified inputs that don't set them if inp.shape is None: - shape = self._get_placeholder_shape_from_tf_graph(tfgraph=graph, name=inp.name) - # _get_shaping_class does not accept -1 or None dimension. - shape = [get_new_symbol() if s is None or s == -1 else s \ for s in shape] - inp.shape = _get_shaping_class(shape) + inp.shape = tf_input_shape[inp.name] + if inp.dtype is None: + inp.dtype = tf_input_dtype[inp.name] # Extract placeholders that users didn't specify. user_input_names = [inp.name for inp in inputs] for name in tf_placeholder_names: if name not in user_input_names: - placeholder_names.append(name) + missing_placeholder_names.append(name) else: inputs = [] - placeholder_names = tf_placeholder_names + missing_placeholder_names = tf_placeholder_names # name -> (shape, mil_type) mapping. shape has type list[int] added_inputs = {} for inp in main_func.inputs: - if inp not in placeholder_names: + if inp not in missing_placeholder_names: continue - node = graph[inp] - dtype = node.attr['dtype'] - shape = self._get_placeholder_shape_from_tf_graph(tfgraph=graph, name=inp) - shape = [get_new_symbol() if s is None or s == -1 else s \ for s in shape] + shape, dtype = tf_input_shape[inp], tf_input_dtype[inp] inputs.append(TensorType(name=inp, shape=shape, dtype=dtype)) added_inputs[inp] = (shape, dtype) @@ -333,12 +364,12 @@ def _validate_and_update_main_output_types(self, prog): if out.name not in output_vars_names: msg = "output name, '{}', not found in Tensorflow Graph. 
Available output names are: {}" raise KeyError(msg.format(out.name, output_vars_names)) - name_to_input_type_map = {} + name_to_output_type_map = {} for out in self.main_output_types: - name_to_input_type_map[out.name] = out + name_to_output_type_map[out.name] = out main_output_types = [] for out_var in output_vars: - main_output_types.append(name_to_input_type_map[out_var.name]) + main_output_types.append(name_to_output_type_map[out_var.name]) self.main_output_types = main_output_types def check_placeholder_output(self, prog, outputs_name): @@ -367,23 +398,24 @@ def check_placeholder_output(self, prog, outputs_name): def convert_main_graph(self, prog, graph): func_inputs = {} for input_type in self.inputs: + dtype = input_type.dtype + # int64 and fp64 are not supported, so they are mapped to int32 / fp32 accordingly + if dtype == types.fp64: + dtype = types.fp32 + elif types.is_int(dtype): + dtype = types.int32 func_inputs[input_type.name] = mb.placeholder( - input_type.shape.symbolic_shape, dtype=input_type.dtype) + input_type.shape.symbolic_shape, dtype=dtype + ) prog.set_main_input_types(self.inputs) with Function(func_inputs, opset_version=self.opset_version) as ssa_func: # Get the input Var for name in func_inputs.keys(): input_var = ssa_func.inputs[name] - if (types.is_tensor(input_var.sym_type) or types.is_scalar(input_var.sym_type)) \ - and (input_var.dtype == types.fp16 or input_var.dtype == types.fp64): - # cast the input var to float32 - # We need to do this because the type inference is very buggy when started from - # float16/float64 typed inputs. Until that is fixed in the following radar - # we cast all inputs of type float16/float64 to float32 as the first step. - # These casts will later get removed, if compute_precision=Float16 is - # provided, which will cause the FP16ComputePrecision pass to run. 
- # TODO: remove this when this radar is fixed: rdar://93731970 + if ( + types.is_tensor(input_var.sym_type) or types.is_scalar(input_var.sym_type) + ) and input_var.dtype == types.fp16: input_var = mb.cast(x=input_var, dtype="fp32", name=name) self.context.add(name, input_var) outputs = convert_graph(self.context, graph, self.output_names) @@ -449,7 +481,32 @@ def convert_main_graph(self, prog, graph): # verify that if model output dtypes / names are provided by the user, they are valid if self.main_output_types is not None: self._validate_and_update_main_output_types(prog) - prog.set_main_output_types(self.main_output_types) + + if self.use_default_fp16_io: + # get a list of names of fp32 output vars + fp32_output_var_names = [ + var.name for var in prog["main"].outputs if var.dtype == types.fp32 + ] + + if self.main_output_types is not None: + # set the dtype default to fp16 if main_output_types is provided + for val in self.main_output_types: + if ( + val.name in fp32_output_var_names + and isinstance(val, TensorType) + and val.dtype is None + ): + val.dtype = types.fp16 + else: + # otherwise, we construct the main_output_types, to make every fp32 + # output var fp16 + main_output_types = [] + for val in prog["main"].outputs: + dtype = types.fp16 if val.name in fp32_output_var_names else None + main_output_types.append(TensorType(name=val.name, dtype=dtype)) + self.main_output_types = main_output_types + + prog.set_main_output_types(self.main_output_types) @_profile def convert(self): diff --git a/coremltools/converters/mil/frontend/tensorflow/dialect_ops.py b/coremltools/converters/mil/frontend/tensorflow/dialect_ops.py index 1bc96bb90..010408b70 100644 --- a/coremltools/converters/mil/frontend/tensorflow/dialect_ops.py +++ b/coremltools/converters/mil/frontend/tensorflow/dialect_ops.py @@ -47,8 +47,6 @@ def type_inference(self): dynamic_length=self.dynamic_length.val, ) builtin_dtype = types.string_to_builtin(self.dtype.val) - if builtin_dtype is None: - raise ValueError("Unsupported dtype {}".format(self.dtype.val)) elem_type = types.tensor(builtin_dtype, self.elem_shape.sym_val) return types.list( elem_type, init_length=init_length, dynamic_length=self.dynamic_length.val @@ -75,7 +73,7 @@ class TfLSTMBase(Operation): weight_peep_o=TensorInputType(const=True, optional=True, type_domain="T"), # [hidden_dim,] bias=TensorInputType(const=True, type_domain="T"), # [4*hidden_dim] (icfo layout) ) - + type_domains = { "T": (types.fp16, types.fp32), } @@ -105,7 +103,7 @@ class tf_lstm_block_cell(TfLSTMBase): xh = [x, h_prev] [i, ci, f, o] = xh * w + b f = f + forget_bias - + if not use_peephole: wci = wcf = wco = 0 i = sigmoid(cs_prev .* wci + i) diff --git a/coremltools/converters/mil/frontend/tensorflow/load.py b/coremltools/converters/mil/frontend/tensorflow/load.py index bd77337b4..40c33dabd 100644 --- a/coremltools/converters/mil/frontend/tensorflow/load.py +++ b/coremltools/converters/mil/frontend/tensorflow/load.py @@ -227,12 +227,12 @@ def _program_from_tf_ssa(self): graphviz.Source(dot_string).view( filename="/tmp/ssa_after_tf_passes", cleanup=True ) - converter = TFConverter( tfssa=self._tf_ssa, inputs=self.kwargs["inputs"], outputs=self.kwargs["outputs"], opset_version=self.kwargs["specification_version"], + use_default_fp16_io=self.kwargs["use_default_fp16_io"], ) return converter.convert() diff --git a/coremltools/converters/mil/frontend/tensorflow/ops.py b/coremltools/converters/mil/frontend/tensorflow/ops.py index 912e07303..b4924f41c 100644 --- 
a/coremltools/converters/mil/frontend/tensorflow/ops.py +++ b/coremltools/converters/mil/frontend/tensorflow/ops.py @@ -393,7 +393,7 @@ def BatchToSpaceND(context, node): x = context[node.inputs[0]] block_shape = context[node.inputs[1]].val - crops = context[node.inputs[2]].val + crops = context[node.inputs[2]] original_shape = mb.shape(x=x) input_rank = x.rank @@ -401,10 +401,14 @@ remaining_rank = x.rank - 1 - spatial_rank has_non_unity_remaining_dims = remaining_rank != 1 - if block_shape is None or crops is None: - raise NotImplementedError( - "Not support dynamic block_shape and crops for BatchToSpaceND!" - ) + if block_shape is None: + raise NotImplementedError("Dynamic block_shape is not supported for BatchToSpaceND!") + + if crops.val is not None: + is_static_crops = True + crops = crops.val + else: + is_static_crops = False if has_non_unity_remaining_dims: # Reshape the input tensor to shape [batch, spatial_shape, remaining_dim_1 * ... * remaining_dim_N] @@ -418,9 +422,50 @@ # [B, H, W, C] -> transpose -> [B, C, H, W] -> batch_to_space -> [B_new, C, H_new, W_new] -> # transpose -> [B_new, H_new, W_new, C] x = mb.transpose(x=x, perm=[0, 3, 1, 2]) - x = mb.batch_to_space(x=x, block_shape=block_shape, crops=_np.zeros((2, 2), _np.int32), name=node.name) - if tuple(crops[0]) != (0, 0) or tuple(crops[1]) != (0, 0): - x = mb.crop(x=x, crop_height=crops[0], crop_width=crops[1]) + x = mb.batch_to_space( + x=x, block_shape=block_shape, crops=_np.zeros((2, 2), _np.int32), name=node.name + ) + need_crop = not is_static_crops or (tuple(crops[0]) != (0, 0) or tuple(crops[1]) != (0, 0)) + if need_crop: + # crop_height, crop_width = crops[0, :], crops[1, :] + crop_height = mb.slice_by_index( + x=crops, + begin=[0, 0], + end=[0, 0], + begin_mask=[False, True], + end_mask=[False, True], + squeeze_mask=[True, False], + ) + crop_width = mb.slice_by_index( + x=crops, + begin=[1, 0], + end=[0, 0], + begin_mask=[False, True], + end_mask=[False, True], + squeeze_mask=[True, False], + ) + + if is_static_crops: + # If crops is known at compile time, we can directly use mb.crop + x = mb.crop(x=x, crop_height=crop_height, crop_width=crop_width) + else: + # Otherwise, we need to use slice_by_index to implement the crop + a, b = _value_at(crop_height, 0), _value_at(crop_height, 1) + c, d = _value_at(crop_width, 0), _value_at(crop_width, 1) + + shape = mb.shape(x=x) + height, width = _value_at(shape, 2), _value_at(shape, 3) + begin_idx_height, end_idx_height = a, mb.sub(x=height, y=b) + begin_idx_width, end_idx_width = c, mb.sub(x=width, y=d) + + begin = mb.concat(values=[0, 0, begin_idx_height, begin_idx_width], axis=0) + end = mb.concat(values=[0, 0, end_idx_height, end_idx_width], axis=0) + begin_mask = [True, True, False, False] + end_mask = [True, True, False, False] + x = mb.slice_by_index( + x=x, begin=begin, end=end, begin_mask=begin_mask, end_mask=end_mask + ) + x = mb.transpose(x=x, perm=[0, 2, 3, 1]) if spatial_rank == 1: @@ -447,8 +492,26 @@ reshape_shape = mb.concat(values=reshape_values, axis=0) x = mb.reshape(x=x, shape=reshape_shape) - # crop the tensor to [B/block_shape, H - crops[0][0] - crops[0][1], C] - x = mb.crop(x=x, crop_height=crops[0], crop_width=[0, 0]) + # crop the tensor to [B/block_shape, H*block_shape - crops[0][0] - crops[0][1], C] + if is_static_crops: + # If crops is known at compile time, we can directly call mb.crop + x = mb.crop(x=x, crop_height=crops[0], crop_width=[0, 0]) + 
else: + # For the dynamic crops, we implement it with slice_by_index + flatten_crops = mb.reshape(x=crops, shape=[-1]) + a, b = _value_at(flatten_crops, 0), _value_at(flatten_crops, 1) + + shape = mb.shape(x=x) + height = _value_at(shape, 1) + begin_idx, end_idx = a, mb.sub(x=height, y=b) + + begin = mb.concat(values=[0, begin_idx, 0], axis=0) + end = mb.concat(values=[0, end_idx, 0], axis=0) + begin_mask = [True, False, True] + end_mask = [True, False, True] + x = mb.slice_by_index( + x=x, begin=begin, end=end, begin_mask=begin_mask, end_mask=end_mask + ) if has_non_unity_remaining_dims: # Reshape the tensor from shape [batch_new, spatial_shape_new, remaining_dim_1 * ... * remaining_dim_N] back to @@ -547,8 +610,9 @@ def Cross(context, node): @register_tf_op def Einsum(context, node): equation = node.attr["equation"] - vars = context[node.inputs[1]] - x = build_einsum_mil(vars, equation, node.name) + a = context[node.inputs[0]] + b = context[node.inputs[1]] + x = build_einsum_mil([a, b], equation, node.name) context.add(node.name, x) @@ -2113,7 +2177,7 @@ def SpaceToBatchND(context, node): x = context[node.inputs[0]] block_shape = context[node.inputs[1]].val - paddings = context[node.inputs[2]].val + paddings = context[node.inputs[2]] original_shape = mb.shape(x=x) input_rank = x.rank @@ -2121,10 +2185,14 @@ remaining_rank = x.rank - 1 - spatial_rank has_non_unity_remaining_dims = remaining_rank != 1 - if block_shape is None or paddings is None: - raise NotImplementedError( - "Not support dynamic block_shape and paddings for SpaceToBatchND!" - ) + if block_shape is None: + raise NotImplementedError("Dynamic block_shape is not supported for SpaceToBatchND!") + + if paddings.val is not None: + is_static_paddings = True + paddings = paddings.val + else: + is_static_paddings = False if has_non_unity_remaining_dims: # Reshape the input tensor to shape [batch, spatial_shape, remaining_dim_1 * ... 
* remaining_dim_N] @@ -2138,8 +2206,20 @@ # [B, H, W, C] -> transpose -> [B, C, H, W] -> space_to_batch -> [B_new, C, H_new, W_new] -> # transpose -> [B_new, H_new, W_new, C] x = mb.transpose(x=x, perm=[0, 3, 1, 2]) - if tuple(paddings[0]) != (0, 0) or tuple(paddings[1]) != (0, 0): - x = mb.pad(x=x, pad=paddings.flatten(), mode="constant") + needs_paddings = not is_static_paddings or ( + tuple(paddings[0]) != (0, 0) or tuple(paddings[1]) != (0, 0) + ) + if needs_paddings: + flatten_paddings = mb.reshape( + x=paddings, + shape=[ + 4, + ], + ) + flatten_paddings = mb.cast(x=flatten_paddings, dtype="int32") + flatten_paddings = mb.concat(values=[[0, 0, 0, 0], flatten_paddings], axis=0) + x = mb.pad(x=x, pad=flatten_paddings, mode="constant") + x = mb.space_to_batch(x=x, block_shape=block_shape, paddings=_np.zeros((2, 2), _np.int32)) x = mb.transpose(x=x, perm=[0, 2, 3, 1]) @@ -2148,12 +2228,15 @@ # [B, H, C] -> decomposed ops -> [B_new, H_new, C] # expand padding to shape [3, 2] - new_paddings = _np.zeros(shape=(3, 2), dtype=_np.int32) - new_paddings[1] = paddings - paddings = new_paddings - needs_paddings = any(paddings.flatten()) + paddings = mb.cast(x=paddings, dtype="int32") + values = [[[0, 0]], paddings, [[0, 0]]] + paddings = mb.concat(values=values, axis=0) + needs_paddings = not is_static_paddings or any(paddings.val.flatten()) + if needs_paddings: - padded = mb.pad(x=x, pad=paddings.flatten(), mode="constant") + flatten_paddings = mb.reshape(x=paddings, shape=[-1]) + padded = mb.pad(x=x, pad=flatten_paddings, mode="constant") + x = padded else: padded = x @@ -2572,38 +2655,46 @@ def ResizeNearestNeighbor(context, node): # "ResizeNearestNeighbor" op in TF is always in the channel last mode # instead of upsample factor, it uses output size, which is the second input x = context[node.inputs[0]] - output_shape = context[node.inputs[1]] input_shape = x.shape # (N,Hin,Win,C) if len(input_shape) != 4: raise ValueError('"ResizeNearestNeighbor" op: input rank is not 4') - if len(output_shape.shape) != 1: + if len(context[node.inputs[1]].shape) != 1: raise ValueError('"ResizeNearestNeighbor" op: the second input must have rank 1') + + if context[node.inputs[1]].shape[0] != 2: raise ValueError( '"ResizeNearestNeighbor" op: the second input, which is the output size, must have 2 elements' ) - Hout, Wout = None, None - if output_shape.val is None: - # The only dynamic input shape case that can be converted to Core ML is when - # output_shape.op = mul(x=input_shape, y=scaling_factor) with const scaling factor - # because the resize-related Core ML ops either require const shape or const scaling factor - if output_shape.op.op_type != "mul": + scaling_factor_h, scaling_factor_w = None, None + target_shape = context[node.inputs[1]] + if target_shape.val is None: + if target_shape.op is not None and target_shape.op.op_type == "mul": + scaling_factor_h = target_shape.op.y.val[0] + scaling_factor_w = target_shape.op.y.val[1] + elif not is_current_opset_version_compatible_with(target.iOS17): + # For the dynamic input shape case before iOS17, + # context[node.inputs[1]] needs to be a mul(x=input_shape, y=scaling_factor) op. raise ValueError( - "A dynamic input shape image resizing can be converted to Core ML " - "only if the `output shape / input shape` ratio is const" + "Cannot determine the scale factor for the resize layer. 
" + "Please make sure the target size is known statically, or " + "use a mul op to get the target size. If the target size has to be dynamic, please " + "set minimum_deployment_target to iOS17 during conversion." ) else: Hin, Win = input_shape[1], input_shape[2] - Hout, Wout = output_shape.val + Hout, Wout = target_shape.val scaling_factor_h = Hout / Hin if Hout % Hin == 0 else (Hout + 1e-4) / Hin scaling_factor_w = Wout / Win if Wout % Win == 0 else (Wout + 1e-4) / Win - if scaling_factor_h < 1 and scaling_factor_w < 1: + if ( + scaling_factor_h is not None + and scaling_factor_w is not None + and scaling_factor_h < 1 + and scaling_factor_w < 1 + ): ResizeBilinear(context, node) return @@ -2633,6 +2724,14 @@ target_size_width=Wout, name=node.name + "_channel_first_resize", ) + elif is_current_opset_version_compatible_with(target.iOS17): + x = mb.resize( + x=x, + shape=target_shape, + resized_dims=np.uint32(2), + interpolation_mode="NEAREST_NEIGHBOR", + name=node.name + "_channel_first_resize", + ) else: logger.warning('Using upsample_nearest_neighbor to approximate resize_nearest_neighbor.') x = mb.upsample_nearest_neighbor( @@ -2685,7 +2784,7 @@ def ResizeBilinear(context, node): # if the output_shape comes from a pattern of input_shape * (h_scale, w_scale) if is_current_opset_version_compatible_with(target.iOS16) and context[node.inputs[1]].val is None: output_shape = context[node.inputs[1]] - if output_shape.op.op_type == "mul": + if output_shape.op is not None and output_shape.op.op_type == "mul": scale_factor_height = context[node.inputs[1]].op.y.val[0] scale_factor_width = context[node.inputs[1]].op.y.val[1] x = _transpose_NHWC_to_NCHW(x) @@ -2700,45 +2799,44 @@ context.add(node.name, x) return - if (align_corners and not half_pixel_centers) or \ (not align_corners and not half_pixel_centers): - # output shape needed to be known at compile time - if context[node.inputs[1]].val is None: - raise ValueError( - '"ResizeBilinear" op: the second input, which is the output size, must be known statically' - ) - - Hout, Wout = context[node.inputs[1]].val - - if not (isinstance(Hout, (_np.int32, _np.int64)) and isinstance(Wout, (_np.int32, _np.int64))): - raise ValueError( - '"ResizeBilinear" op: the second input, which is the output size, must have elements of type int32 or int64' - ) - # first transpose from channel-last to channel-first format for coreml x = _transpose_NHWC_to_NCHW(x) - # add either the resize_bilinear layer or the upsample layer - - # [align_corners = True, half_pixel_centers = False] - if align_corners and not half_pixel_centers: - x = mb.resize_bilinear( - x=x, - target_size_height=Hout, - target_size_width=Wout, - sampling_mode="STRICT_ALIGN_CORNERS", - name=node.name + "_channel_first_resize_bilinear", - ) - - # [align_corners = False, half_pixel_centers = False] - elif not align_corners and not half_pixel_centers: - x = mb.resize_bilinear( - x=x, - target_size_height=Hout, - target_size_width=Wout, - sampling_mode="DEFAULT", - name=node.name + "_channel_first_resize_bilinear", - ) + # [half_pixel_centers = False] + if not half_pixel_centers: + sampling_mode = "STRICT_ALIGN_CORNERS" if align_corners else "DEFAULT" + node_name = node.name + "_channel_first_resize_bilinear" + target_size = context[node.inputs[1]] + + if target_size.val is not None: + Hout, Wout = target_size.val + if not ( + 
isinstance(Hout, (_np.int32, _np.int64)) + and isinstance(Wout, (_np.int32, _np.int64)) + ): + raise ValueError( + '"ResizeBilinear" op: the second input, which is the output size, must have elements of type int32 or int64' + ) + x = mb.resize_bilinear( + x=x, + target_size_height=Hout, + target_size_width=Wout, + sampling_mode=sampling_mode, + name=node_name, + ) + elif is_current_opset_version_compatible_with(target.iOS17): + x = mb.resize( + x=x, + shape=target_size, + resized_dims=np.uint32(2), + sampling_mode=sampling_mode, + name=node_name, + ) + else: + raise ValueError( + '"ResizeBilinear" op: the second input, which is the output size, must be known ' + "statically. Consider setting minimum_deployment_target to iOS17 during conversion." + ) # [align_corners = False, half_pixel_centers = True] elif not align_corners and half_pixel_centers: diff --git a/coremltools/converters/mil/frontend/tensorflow/test/test_load.py b/coremltools/converters/mil/frontend/tensorflow/test/test_load.py index 175aaac08..cc4930e65 100644 --- a/coremltools/converters/mil/frontend/tensorflow/test/test_load.py +++ b/coremltools/converters/mil/frontend/tensorflow/test/test_load.py @@ -15,10 +15,10 @@ import coremltools.proto.FeatureTypes_pb2 as ft from coremltools import EnumeratedShapes, ImageType, RangeDim, TensorType from coremltools._deps import _HAS_TF_1, _IS_MACOS, MSG_TF1_NOT_FOUND -from coremltools.converters.mil.frontend.tensorflow.converter import \ - TFConverter +from coremltools.converters.mil.frontend.tensorflow.converter import TFConverter from coremltools.converters.mil.frontend.tensorflow.test.testing_utils import ( TensorFlowBaseTest, get_tf_keras_io_names, make_tf_graph) +from coremltools.converters.mil.testing_reqs import backends from coremltools.converters.mil.testing_utils import random_gen tf = pytest.importorskip("tensorflow") @@ -38,7 +38,11 @@ def teardown(self): if os.path.exists(self.saved_model_dir): shutil.rmtree(self.saved_model_dir) - def test_infer_inputs(self): + @pytest.mark.parametrize( + "backend", + backends, + ) + def test_infer_inputs(self, backend): x_shape = (3, 4, 5) @make_tf_graph([x_shape]) @@ -49,17 +53,19 @@ def build_model(x): if not isinstance(outputs, (tuple, list)): outputs = [outputs] - output_names = [ - j if isinstance(j, str) else j.op.name for j in outputs - ] - mlmodel = converter.convert(model, outputs=output_names) + output_names = [j if isinstance(j, str) else j.op.name for j in outputs] + mlmodel = converter.convert(model, outputs=output_names, convert_to=backend[0]) assert mlmodel is not None input_values = [random_gen(x_shape, -10.0, 10.0)] input_dict = dict(zip(inputs, input_values)) TensorFlowBaseTest.run_compare_tf(model, input_dict, outputs) - def test_infer_outputs(self): + @pytest.mark.parametrize( + "backend", + backends, + ) + def test_infer_outputs(self, backend): x_shape = (3, 4, 5) @make_tf_graph([x_shape]) @@ -67,17 +73,21 @@ def build_model(x): return tf.nn.relu(x) model, inputs, outputs = build_model - input_name = ( - inputs[0] if isinstance(inputs[0], str) else inputs[0].op.name + input_name = inputs[0] if isinstance(inputs[0], str) else inputs[0].op.name + mlmodel = converter.convert( + model, inputs=[TensorType(input_name, (3, 4, 5))], convert_to=backend[0] ) - mlmodel = converter.convert(model, inputs=[TensorType(input_name, (3, 4, 5))]) assert mlmodel is not None input_values = [random_gen(x_shape, -10.0, 10.0)] input_dict = dict(zip(inputs, input_values)) TensorFlowBaseTest.run_compare_tf(model, input_dict, outputs) - def 
test_infer_inputs_and_outputs(self): + @pytest.mark.parametrize( + "backend", + backends, + ) + def test_infer_inputs_and_outputs(self, backend): x_shape = (3, 4, 5) @make_tf_graph([x_shape]) @@ -85,14 +95,18 @@ def build_model(x): return tf.nn.relu(x) model, inputs, outputs = build_model - mlmodel = converter.convert(model) + mlmodel = converter.convert(model, convert_to=backend[0]) assert mlmodel is not None input_values = [random_gen(x_shape, -10.0, 10.0)] input_dict = dict(zip(inputs, input_values)) TensorFlowBaseTest.run_compare_tf(model, input_dict, outputs) - def test_extract_sub_model(self): + @pytest.mark.parametrize( + "backend", + backends, + ) + def test_extract_sub_model(self, backend): x_shape = (3, 4, 5) y_shape = (3, 4, 5) @@ -105,11 +119,15 @@ def build_model(x, y): first_output_name = outputs[0] else: first_output_name = outputs[0].name.split(":")[0] - mlmodel = converter.convert(model, outputs=[first_output_name]) + mlmodel = converter.convert(model, outputs=[first_output_name], convert_to=backend[0]) assert mlmodel is not None - def test_auto_image_nhwc_input_names(self): - x_shape = (4, 5, 3) + @pytest.mark.parametrize( + "backend", + backends, + ) + def test_auto_image_nhwc_input_names(self, backend): + x_shape = (4, 5, 3) if backend[0] == "neuralnetwork" else (1, 4, 5, 3) @make_tf_graph([x_shape]) def build_model(x): @@ -117,11 +135,15 @@ def build_model(x): model, inputs, outputs = build_model - mlmodel = converter.convert(model, inputs=[ImageType()]) + mlmodel = converter.convert(model, inputs=[ImageType()], convert_to=backend[0]) assert mlmodel is not None - def test_auto_image_nchw_input_names(self): - x_shape = (3, 4, 5) + @pytest.mark.parametrize( + "backend", + backends, + ) + def test_auto_image_nchw_input_names(self, backend): + x_shape = (3, 4, 5) if backend[0] == "neuralnetwork" else (1, 3, 4, 5) @make_tf_graph([x_shape]) def build_model(x): @@ -129,7 +151,9 @@ def build_model(x): model, inputs, outputs = build_model - mlmodel = converter.convert(model, inputs=[ImageType(channel_first=True)]) + mlmodel = converter.convert( + model, inputs=[ImageType(channel_first=True)], convert_to=backend[0] + ) assert mlmodel is not None @pytest.mark.parametrize( @@ -169,7 +193,11 @@ def build_model(x): # successful conversion converter.convert(model, minimum_deployment_target=target) - def test_invalid_output_names(self): + @pytest.mark.parametrize( + "backend", + backends, + ) + def test_invalid_output_names(self, backend): x_shape = (3, 4, 5) @make_tf_graph([x_shape]) @@ -178,10 +206,16 @@ def build_model(x): model, inputs, outputs = build_model with pytest.raises(AssertionError) as e: - converter.convert(model, source=frontend, outputs=["invalid_name"]) + converter.convert( + model, source=frontend, outputs=["invalid_name"], convert_to=backend[0] + ) e.match(r".* is not in graph") - def test_missing_placeholder_shape(self): + @pytest.mark.parametrize( + "backend", + backends, + ) + def test_missing_placeholder_shape(self, backend): x_shape = None # Missing Placeholder shape @make_tf_graph([x_shape]) @@ -190,15 +224,20 @@ def build_model(x): model, inputs, outputs = build_model with pytest.raises(ValueError) as e: - converter.convert(model, source=frontend) + converter.convert(model, source=frontend, convert_to=backend[0]) e.match(r"Unable to determine the shape of input .*") - mlmodel = converter.convert(model, source=frontend, - inputs=[ct.TensorType(shape=(1,))]) + mlmodel = converter.convert( + model, source=frontend, inputs=[ct.TensorType(shape=(1,))], 
convert_to=backend[0] + ) assert mlmodel is not None @pytest.mark.skip(reason="Rank-0 input is not supported") - def test_scalar_placeholder_shape(self): + @pytest.mark.parametrize( + "backend", + backends, + ) + def test_scalar_placeholder_shape(self, backend): x_shape = () # Scalar Placeholder Shape @make_tf_graph([x_shape]) @@ -206,14 +245,18 @@ def build_model(x): return tf.nn.relu(x) model, inputs, outputs = build_model - mlmodel = converter.convert(model, source=frontend) + mlmodel = converter.convert(model, source=frontend, convert_to=backend[0]) assert mlmodel is not None input_values = [random_gen(x_shape, -10.0, 10.0)] input_dict = dict(zip(inputs, input_values)) TensorFlowBaseTest.run_compare_tf(model, input_dict, outputs) - def test_shaping_utils(self): + @pytest.mark.parametrize( + "backend", + backends, + ) + def test_shaping_utils(self, backend): @make_tf_graph([(None, 4, 5)]) def build_flexible_model(x): return tf.nn.relu(x) @@ -223,13 +266,16 @@ def build_flexible_model(x): output_name = TFConverter._get_tensor_name(outputs[0]) # static-Flexible shape - mlmodel = converter.convert( - model, - inputs=[ + if backend[0] == "neuralnetwork": + inputs = [ # Use TF's input shapes (None, 4, 5) TensorType(name=input_name) - ], - outputs=[output_name] + ] + else: + inputs = [TensorType(name=input_name, shape=(RangeDim(upper_bound=3), 4, 5))] + + mlmodel = converter.convert( + model, inputs=inputs, outputs=[output_name], convert_to=backend[0] ) assert mlmodel is not None input_values = [random_gen((3, 4, 5), -10.0, 10.0)] @@ -239,10 +285,10 @@ def build_flexible_model(x): np.allclose(ret[output_name], np.maximum(input_values[0], 0.0)) # Enumerate shape - inputs_shape = [ - TensorType(input_name, EnumeratedShapes(shapes=[(3, 4, 5), (4, 4, 5)])) - ] - mlmodel = converter.convert(model, inputs=inputs_shape, outputs=[output_name]) + inputs_shape = [TensorType(input_name, EnumeratedShapes(shapes=[(3, 4, 5), (4, 4, 5)]))] + mlmodel = converter.convert( + model, inputs=inputs_shape, outputs=[output_name], convert_to=backend[0] + ) assert mlmodel is not None input_values = [random_gen((3, 4, 5), -10.0, 10.0)] input_dict = {input_name: input_values[0]} @@ -264,7 +310,9 @@ def build_flexible_model(x): # Ranged shape inputs_shape = [TensorType(input_name, [RangeDim(3, 5), 4, 5])] - mlmodel = converter.convert(model, inputs=inputs_shape, outputs=[output_name]) + mlmodel = converter.convert( + model, inputs=inputs_shape, outputs=[output_name], convert_to=backend[0] + ) assert mlmodel is not None input_values = [random_gen((3, 4, 5), -10.0, 10.0)] input_dict = {input_name: input_values[0]} @@ -284,13 +332,17 @@ def build_flexible_model(x): input_dict = {input_name: input_values[0]} ret = mlmodel.predict(input_dict) - def test_default_data_types(self): + @pytest.mark.parametrize( + "backend", + backends, + ) + def test_default_data_types(self, backend): @make_tf_graph([(2, 2)]) def build_model(x): return tf.nn.relu(x) model, inputs, outputs = build_model - mlmodel = converter.convert(model) + mlmodel = converter.convert(model, convert_to=backend[0]) assert mlmodel is not None spec = mlmodel.get_spec() @@ -316,16 +368,27 @@ def teardown(self): if os.path.exists(self.saved_model_dir): shutil.rmtree(self.saved_model_dir) - def test_graph_def(self): + @pytest.mark.parametrize( + "backend", + backends, + ) + def test_graph_def(self, backend): with tf.Graph().as_default() as graph: x = tf.placeholder(tf.float32, shape=(3, 4, 5)) out = tf.nn.relu(x) mlmodel = converter.convert( - graph, 
inputs=[TensorType(x.op.name, (3, 4, 5))], outputs=[out.op.name] + graph, + inputs=[TensorType(x.op.name, (3, 4, 5))], + outputs=[out.op.name], + convert_to=backend[0], ) assert mlmodel is not None - def test_graph_def_file(self): + @pytest.mark.parametrize( + "backend", + backends, + ) + def test_graph_def_file(self, backend): with tf.Graph().as_default() as graph: x = tf.placeholder(tf.float32, shape=(3, 4, 5)) out = tf.nn.relu(x) @@ -336,38 +399,46 @@ def test_graph_def_file(self): self.model_path_pb, inputs=[TensorType(x.op.name, (3, 4, 5))], outputs=[out.op.name], + convert_to=backend[0], ) assert mlmodel is not None - def test_saved_model_from_simple_save(self): + @pytest.mark.parametrize( + "backend", + backends, + ) + def test_saved_model_from_simple_save(self, backend): with tf.compat.v1.Session() as sess: x = tf.placeholder(shape=(1, 3, 5), dtype=tf.float32) y = tf.nn.relu(x) inputs = {"x": x} outputs = {"y": y} - tf.compat.v1.saved_model.simple_save( - sess, self.saved_model_dir, inputs, outputs - ) - mlmodel = converter.convert(self.saved_model_dir) + tf.compat.v1.saved_model.simple_save(sess, self.saved_model_dir, inputs, outputs) + mlmodel = converter.convert(self.saved_model_dir, convert_to=backend[0]) assert mlmodel is not None - def test_tf_keras(self): - keras_model = tf.keras.Sequential( - [tf.keras.layers.ReLU(input_shape=(4, 5), batch_size=3)] - ) + @pytest.mark.parametrize( + "backend", + backends, + ) + def test_tf_keras(self, backend): + keras_model = tf.keras.Sequential([tf.keras.layers.ReLU(input_shape=(4, 5), batch_size=3)]) input_names, output_names = get_tf_keras_io_names(keras_model) mlmodel = converter.convert( keras_model, inputs=[TensorType(input_names[0], (3, 4, 5))], outputs=["Identity"], source=frontend, + convert_to=backend[0], ) assert mlmodel is not None - def test_tf_keras_hdf5_file(self): - keras_model = tf.keras.Sequential( - [tf.keras.layers.ReLU(input_shape=(4, 5), batch_size=3)] - ) + @pytest.mark.parametrize( + "backend", + backends, + ) + def test_tf_keras_hdf5_file(self, backend): + keras_model = tf.keras.Sequential([tf.keras.layers.ReLU(input_shape=(4, 5), batch_size=3)]) keras_model.save(self.model_path_h5) input_names, output_names = get_tf_keras_io_names(keras_model) mlmodel = converter.convert( @@ -375,37 +446,55 @@ def test_tf_keras_hdf5_file(self): inputs=[TensorType(input_names[0], (3, 4, 5))], outputs=["Identity"], source=frontend, + convert_to=backend[0], ) assert mlmodel is not None - def test_model_metadata(self): + @pytest.mark.parametrize( + "backend", + backends, + ) + def test_model_metadata(self, backend): with tf.Graph().as_default() as graph: x = tf.placeholder(tf.float32, shape=(3, 4, 5)) out = tf.nn.relu(x) mlmodel = converter.convert( - graph, inputs=[TensorType(x.op.name, (3, 4, 5))], outputs=[out.op.name] + graph, + inputs=[TensorType(x.op.name, (3, 4, 5))], + outputs=[out.op.name], + convert_to=backend[0], ) metadata_keys = mlmodel.get_spec().description.metadata.userDefined assert "com.github.apple.coremltools.version" in metadata_keys assert "com.github.apple.coremltools.source" in metadata_keys assert "tensorflow==1." 
in metadata_keys["com.github.apple.coremltools.source"] - def test_invalid_format_none(self): + @pytest.mark.parametrize( + "backend", + backends, + ) + def test_invalid_format_none(self, backend): with pytest.raises(NotImplementedError) as e: - converter.convert(None, source="tensorflow") + converter.convert(None, source="tensorflow", convert_to=backend[0]) e.match(r"Expected model format: .* .pb") - def test_invalid_format_invalid_extension(self): - _, invalid_filename = tempfile.mkstemp( - suffix=".invalid", prefix=self.saved_model_dir - ) + @pytest.mark.parametrize( + "backend", + backends, + ) + def test_invalid_format_invalid_extension(self, backend): + _, invalid_filename = tempfile.mkstemp(suffix=".invalid", prefix=self.saved_model_dir) with pytest.raises(NotImplementedError) as e: - converter.convert(invalid_filename, source="tensorflow") + converter.convert(invalid_filename, source="tensorflow", convert_to=backend[0]) e.match(r"Expected model format: .* .pb") - def test_invalid_converter_source(self): + @pytest.mark.parametrize( + "backend", + backends, + ) + def test_invalid_converter_source(self, backend): with pytest.raises(ValueError) as e: - converter.convert(None, source="invalid") + converter.convert(None, source="invalid", convert_to=backend[0]) expected_msg = r'Unrecognized value of argument "source": .*' e.match(expected_msg) @@ -428,8 +517,12 @@ def test_invalid_converter_target(self): converter.convert(graph, convert_to="invalid", source="tensorflow") e.match(r"Backend converter .* not implemented") - def test_invalid_format_non_exist(self): + @pytest.mark.parametrize( + "backend", + backends, + ) + def test_invalid_format_non_exist(self, backend): non_exist_filename = self.model_path_pb.replace(".pb", "_non_exist.pb") with pytest.raises(ValueError) as e: - converter.convert(non_exist_filename, source="tensorflow") + converter.convert(non_exist_filename, source="tensorflow", convert_to=backend[0]) e.match(r"Input model .* does not exist") diff --git a/coremltools/converters/mil/frontend/tensorflow/test/test_ops.py b/coremltools/converters/mil/frontend/tensorflow/test/test_ops.py index 8cc3d9517..da22af453 100644 --- a/coremltools/converters/mil/frontend/tensorflow/test/test_ops.py +++ b/coremltools/converters/mil/frontend/tensorflow/test/test_ops.py @@ -21,6 +21,7 @@ from coremltools.converters.mil.frontend.tensorflow.test.testing_utils import ( TensorFlowBaseTest, freeze_g, + get_tf_node_names, layer_counts, load_tf_pb, make_tf_graph, @@ -666,6 +667,44 @@ def build_model(x): backend=backend, ) + @pytest.mark.parametrize( + "compute_unit, minimum_deployment_target", + itertools.product( + compute_units, + [None, ct.target.iOS17], + ), + ) + def test_ios17_mixed_precision(self, compute_unit, minimum_deployment_target): + input_shape = np.random.randint(low=1, high=4, size=2) + + @make_tf_graph([input_shape]) + def build_model(x): + return tf.math.log1p(x) + + model, inputs, outputs = build_model + input_values = [random_gen(input_shape, 0.0, 2.0)] + input_dict = dict(zip(inputs, input_values)) + results = TensorFlowBaseTest.run_compare_tf( + model, + input_dict, + outputs, + compute_unit=compute_unit, + backend=("mlprogram", "fp16"), + minimum_deployment_target=minimum_deployment_target, + ) + + prog: Program = results[1]._mil_program + log_op: Operation = prog.find_ops(op_type="log", exactly_one=True)[0] + assert log_op.x.dtype == types.fp16 + + # Before IOS17, the epsilon param is converted to fp16. 
+ # After IOS17, the epsilon param is kept as fp32 because it supports mixed precision. + if minimum_deployment_target is not None and minimum_deployment_target >= ct.target.iOS17: + expected_epsilon_dtype = "fp32" + else: + expected_epsilon_dtype = "fp16" + assert types.builtin_to_string(log_op.epsilon.dtype) == expected_epsilon_dtype + class TestSelect(TensorFlowBaseTest): @pytest.mark.parametrize( @@ -2520,7 +2559,7 @@ def build_model(x): [True, False], ), ) - def test_resize_bilinear_dynamic_shape( + def test_ios16_resize_bilinear_dynamic_shape_by_upsample_bilinear( self, compute_unit, backend, @@ -2529,6 +2568,10 @@ def test_resize_bilinear_dynamic_shape( align_corners, half_pixel_centers, ): + """ + Since iOS16, dynamic shape is supported only if the output_shape comes from a pattern of + ``input_shape * (h_scale, w_scale)``, which will be lowered to `upsample_bilinear` MIL op. + """ if backend[0] == "neuralnetwork" or ct.utils._macos_version() < (13, 0): pytest.skip("half_pixel_centers only support for iOS16 upsample_bilinear layer") @@ -2561,6 +2604,66 @@ def build_model(x): minimum_deployment_target=ct.target.iOS16, ) + @pytest.mark.parametrize( + "compute_unit, backend, input_shape, target_shape, align_corners", + itertools.product( + compute_units, + backends, + [(1, 10, 20, 1), (2, 5, 2, 3)], + [(20, 60)], + [True, False], + ), + ) + def test_ios17_resize_bilinear_dynamic_shape( + self, + compute_unit, + backend, + input_shape, + target_shape, + align_corners, + ): + """ + Since iOS17, dynamic shape is supported by lowering to `resize` MIL op. + """ + batch_dim, _, _, channel = input_shape + + @make_tf_graph([(batch_dim, None, None, channel, tf.float32), (2, tf.int32)]) + def build_model(x, size): + return tf.raw_ops.ResizeBilinear( + images=x, + size=size, + half_pixel_centers=False, + align_corners=align_corners, + ) + + model, inputs, outputs = build_model + input_values = [random_gen(input_shape, -1, 1), np.array(target_shape, dtype=np.int32)] + input_dict = dict(zip(inputs, input_values)) + + # Before iOS17, the dynamic shape will error out. + with pytest.raises( + ValueError, + match="the second input, which is the output size, must be known statically. " + "Consider setting minimum_deployment_target to iOS17 during conversion.", + ): + TensorFlowBaseTest.run_compare_tf( + model, + input_dict, + outputs, + compute_unit=compute_unit, + backend=backend, + ) + + # Since iOS17, the dynamic shape will be handled correctly. + TensorFlowBaseTest.run_compare_tf( + model, + input_dict, + outputs, + compute_unit=compute_unit, + backend=backend, + minimum_deployment_target=ct.target.iOS17, + ) + @pytest.mark.parametrize( "compute_unit, backend, input_shape, upsample_factor, data_format", itertools.product( @@ -2604,6 +2707,65 @@ def build_model(x): if layer.WhichOneof('layer') == "upsample": assert len(layer.upsample.fractionalScalingFactor) == 0 + @pytest.mark.parametrize( + "compute_unit, backend, input_shape, target_shape", + itertools.product( + compute_units, + backends, + [(1, 10, 20, 1), (2, 5, 2, 3)], + [(20, 60)], + ), + ) + def test_ios17_resize_nearest_neighbor_dynamic_shape( + self, + compute_unit, + backend, + input_shape, + target_shape, + ): + """ + Since iOS17, dynamic shape is supported by lowering to `resize` MIL op. 
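+
+        A rough user-level sketch of the conversion these tests exercise (the
+        model handle here is hypothetical):
+
+            mlmodel = ct.convert(
+                tf_model_with_dynamic_resize,
+                minimum_deployment_target=ct.target.iOS17,
+            )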
+ """ + batch_dim, _, _, channel = input_shape + + @make_tf_graph([(batch_dim, None, None, channel, tf.float32), (2, tf.int32)]) + def build_model(x, size): + return tf.raw_ops.ResizeNearestNeighbor( + images=x, + size=size, + half_pixel_centers=True, + align_corners=False, + ) + + model, inputs, outputs = build_model + input_values = [random_gen(input_shape, -1, 1), np.array(target_shape, dtype=np.int32)] + input_dict = dict(zip(inputs, input_values)) + + # Before iOS17, the dynamic shape will error out. + with pytest.raises( + ValueError, + match="Cannot determine the scale factor for the resize layer. " + "Please make sure the target size is known statically, or " + "use mul op to get the target size. If the target size has to be dynamic, please" + "set minimum_deployment_target to iOS17 during conversion.", + ): + TensorFlowBaseTest.run_compare_tf( + model, + input_dict, + outputs, + compute_unit=compute_unit, + backend=backend, + ) + + TensorFlowBaseTest.run_compare_tf( + model, + input_dict, + outputs, + compute_unit=compute_unit, + backend=backend, + minimum_deployment_target=ct.target.iOS17, + ) + @pytest.mark.parametrize( "compute_unit, backend, input_shape, num_of_crops, crop_size, method, dynamic, " "extrapolation_value, minimum_deployment_target", @@ -4216,6 +4378,11 @@ def test_slice_by_index_simple(self, compute_unit, backend, rank, masking_type): pytest.xfail( "rdar://109854221 ([Bug][Regression] slice_by_index is throwing expection through E5ML - Follow up radar)" ) + + if backend[0] == "neuralnetwork": + pytest.xfail( + "rdar://111134257 ([Bug][Regression] nnv1 slice_by_index unittests are failing)" + ) input_shape = np.random.randint(low=2, high=4, size=rank) begin_val = np.array( [ @@ -4926,7 +5093,7 @@ def test_non_max_suppression( "When score threshold is too high, TF will return empty result, while MIL " "will still keep the highest score box." ) - if num_boxes >= 1000 and backend == ("mlprogram", "fp16"): + if num_boxes >= 1000: pytest.xfail( "rdar://103891349 ([TensorFlow] [PyTorch] NMS discrepancy in Fp16 when " "number of boxes is large)" @@ -4937,6 +5104,11 @@ def test_non_max_suppression( # rdar://109871491 ([Bug][CI][Regression] Numerical regression on E5ML for nms layers) backend = ("mlprogram", "fp32") + if _HAS_TF_1 and score_threshold == -200 and backend[0] == "mlprogram": + pytest.xfail( + "rdar://111714405 ([Bug][Regression] Tensorflow nms layer unitests are failing)" + ) + boxes_val = random_gen(shape=(num_boxes, 4), rand_min=0, rand_max=32) # When the input score is too close, the returned index order is not guaranteed. # So instead of generating random scores by rand, use shuffle. @@ -6341,24 +6513,44 @@ def build_model(x): class TestSpaceToBatchND(TensorFlowBaseTest): # No direct mil smoke test since it's a TF op which is a composite of several ops. 
@pytest.mark.parametrize( - "compute_unit, backend, input_shape, block_shape, paddings", + "compute_unit, backend, input_shape, block_shape, paddings, dynamic_paddings", itertools.product( compute_units, backends, [(1, 4, 4, 1), (1, 4, 4, 3), (2, 4, 6, 1)], [[2, 2]], [[[0, 0], [0, 0]], [[1, 1], [0, 2]], [[4, 2], [4, 2]]], + [True, False], ), ) - def test_smoke(self, compute_unit, backend, input_shape, block_shape, paddings): - @make_tf_graph([input_shape]) - def build_model(x): - return tf.raw_ops.SpaceToBatchND( - input=x, block_shape=block_shape, paddings=paddings - ) + def test_smoke( + self, compute_unit, backend, input_shape, block_shape, paddings, dynamic_paddings + ): + paddings = np.array(paddings, dtype=np.int32) + + if dynamic_paddings: + + @make_tf_graph([input_shape, (2, 2, tf.int32)]) + def build_model(x, paddings): + return tf.raw_ops.SpaceToBatchND( + input=x, block_shape=block_shape, paddings=paddings + ) + + else: + + @make_tf_graph([input_shape]) + def build_model(x): + return tf.raw_ops.SpaceToBatchND( + input=x, block_shape=block_shape, paddings=paddings + ) model, inputs, outputs = build_model - input_values = [random_gen(input_shape)] + + if dynamic_paddings: + input_values = [random_gen(input_shape), paddings] + else: + input_values = [random_gen(input_shape)] + input_dict = dict(zip(inputs, input_values)) TensorFlowBaseTest.run_compare_tf( model, @@ -6369,7 +6561,7 @@ def build_model(x): ) @pytest.mark.parametrize( - "compute_unit, backend, shape_block_paddings, dynamic", + "compute_unit, backend, shape_block_paddings, dynamic_input, dynamic_paddings", itertools.product( compute_units, backends, @@ -6380,24 +6572,43 @@ def build_model(x): [(2, 4, 6, 1, 2), [2], [[0, 0]]], ], [True, False], + [True, False], ), ) - def test_smoke_new_op(self, compute_unit, backend, shape_block_paddings, dynamic): + def test_smoke_new_op( + self, compute_unit, backend, shape_block_paddings, dynamic_input, dynamic_paddings + ): input_shape, block_shape, paddings = shape_block_paddings + paddings = np.array(paddings, dtype=np.int32) # The neuralnetwork backend doesn't support these tests if backend[0] == "neuralnetwork": return - tf_input_shape = input_shape if not dynamic else [None] * len(input_shape) - @make_tf_graph([tf_input_shape]) - def build_model(x): - return tf.raw_ops.SpaceToBatchND( - input=x, block_shape=block_shape, paddings=paddings - ) + tf_input_shape = input_shape if not dynamic_input else [None] * len(input_shape) + if dynamic_paddings: + + @make_tf_graph([tf_input_shape, (*paddings.shape, tf.int32)]) + def build_model(x, paddings): + return tf.raw_ops.SpaceToBatchND( + input=x, block_shape=block_shape, paddings=paddings + ) + + else: + + @make_tf_graph([tf_input_shape]) + def build_model(x): + return tf.raw_ops.SpaceToBatchND( + input=x, block_shape=block_shape, paddings=paddings + ) model, inputs, outputs = build_model - input_values = [random_gen(input_shape)] + + if dynamic_paddings: + input_values = [random_gen(input_shape), paddings] + else: + input_values = [random_gen(input_shape)] + input_dict = dict(zip(inputs, input_values)) TensorFlowBaseTest.run_compare_tf( model, @@ -6408,16 +6619,17 @@ def build_model(x): ) @pytest.mark.parametrize( - "compute_unit, backend, input_block_rank, dynamic", + "compute_unit, backend, input_block_rank, dynamic_input, dynamic_paddings", itertools.product( compute_units, backends, [(3, 1), (3, 2), (4, 1)], [True, False], + [True, False], ), ) def test_programmatic( - self, compute_unit, backend, input_block_rank, dynamic + self, 
compute_unit, backend, input_block_rank, dynamic_input, dynamic_paddings ): input_rank, block_rank = input_block_rank @@ -6439,26 +6651,31 @@ def test_programmatic( if (np.sum(temp) + input_shape[i + 1]) % block_shape[i] == 0: paddings.append(temp) break - paddings = np.array(paddings) + paddings = np.array(paddings, dtype=np.int32) - if not dynamic: + tf_input_shape = input_shape if not dynamic_input else [None] * len(input_shape) + if dynamic_paddings: - @make_tf_graph([input_shape]) - def build_model(x): + @make_tf_graph([tf_input_shape, (*paddings.shape, tf.int32)]) + def build_model(x, paddings): return tf.raw_ops.SpaceToBatchND( input=x, block_shape=block_shape, paddings=paddings ) - else: - @make_tf_graph([[None] * input_rank]) + @make_tf_graph([tf_input_shape]) def build_model(x): return tf.raw_ops.SpaceToBatchND( input=x, block_shape=block_shape, paddings=paddings ) model, inputs, outputs = build_model - input_values = [random_gen(input_shape)] + + if dynamic_paddings: + input_values = [random_gen(input_shape), paddings] + else: + input_values = [random_gen(input_shape)] + input_dict = dict(zip(inputs, input_values)) TensorFlowBaseTest.run_compare_tf( model, @@ -6472,24 +6689,37 @@ def build_model(x): class TestBatchToSpaceND(TensorFlowBaseTest): # No direct mil smoke test since it's a TF op which is a composite of several ops. @pytest.mark.parametrize( - "compute_unit, backend, input_shape, block_size, crops", + "compute_unit, backend, input_shape, block_size, crops, dynamic_crops", itertools.product( compute_units, backends, [(4, 4, 4, 1), (4, 4, 4, 3), (4, 4, 6, 1)], [[2, 2]], [[[0, 0], [0, 0]], [[1, 1], [0, 2]], [[4, 2], [4, 2]]], + [True, False], ), ) - def test_smoke(self, compute_unit, backend, input_shape, block_size, crops): - @make_tf_graph([input_shape]) - def build_model(x): - return tf.raw_ops.BatchToSpaceND( - input=x, block_shape=block_size, crops=crops - ) + def test_smoke(self, compute_unit, backend, input_shape, block_size, crops, dynamic_crops): + + if dynamic_crops: + + @make_tf_graph([input_shape, (2, 2, tf.int32)]) + def build_model(x, y): + return tf.raw_ops.BatchToSpaceND(input=x, block_shape=block_size, crops=y) + + else: + + @make_tf_graph([input_shape]) + def build_model(x): + return tf.raw_ops.BatchToSpaceND(input=x, block_shape=block_size, crops=crops) model, inputs, outputs = build_model - input_values = [random_gen(input_shape)] + + if dynamic_crops: + input_values = [random_gen(input_shape), np.array(crops, np.int32)] + else: + input_values = [random_gen(input_shape)] + input_dict = dict(zip(inputs, input_values)) TensorFlowBaseTest.run_compare_tf( model, @@ -6500,16 +6730,18 @@ def build_model(x): ) @pytest.mark.parametrize( - "compute_unit, backend, input_block_rank, dynamic", + "compute_unit, backend, input_block_rank, dynamic_input, dynamic_crops", itertools.product( compute_units, backends, [(3, 1), (3, 2), (4, 1)], - [True, False] + [True, False], + [True, False], ), ) def test_programmatic( - self, compute_unit, backend, input_block_rank, dynamic): + self, compute_unit, backend, input_block_rank, dynamic_input, dynamic_crops + ): input_rank, block_rank = input_block_rank @@ -6531,40 +6763,53 @@ def test_programmatic( if np.sum(temp) < input_shape[i + 1] * block_shape[i]: crops.append(temp) break - crops = np.array(crops) + crops = np.array(crops, dtype=np.int32) - if not dynamic: + tf_input_shape = [None] * input_rank if dynamic_input else input_shape - @make_tf_graph([input_shape]) - def build_model(x): + if dynamic_crops: + + 
@make_tf_graph([tf_input_shape, (*crops.shape, tf.int32)])
+            def build_model(x, crops):
                return tf.raw_ops.BatchToSpaceND(
                    input=x, block_shape=block_shape, crops=crops
                )
-        else:
-            @make_tf_graph([[None] * input_rank])
+        else:
+            @make_tf_graph([tf_input_shape])
            def build_model(x):
                return tf.raw_ops.BatchToSpaceND(
                    input=x, block_shape=block_shape, crops=crops
                )

        model, inputs, outputs = build_model
-        input_values = [random_gen(input_shape)]
+
+        if dynamic_crops:
+            input_values = [random_gen(input_shape), crops]
+        else:
+            input_values = [random_gen(input_shape)]
        input_dict = dict(zip(inputs, input_values))

        # Before rdar://93071454 (batch_to_space errors out in Espresso for dynamic-input Core ML models) is fixed,
        # we need to specify the default shape for the dynamic model by setting inputs_for_conversion
-        if dynamic:
+        input_names = get_tf_node_names(inputs, mode="inputs")
+        if dynamic_input:
            shape = tuple(
                [
                    RangeDim(default=dim, upper_bound=dim if backend[0] == "mlprogram" else -1)
                    for dim in input_shape
                ]
            )
-            inputs_for_conversion = [TensorType(shape=shape, dtype=np.float32)]
+            inputs_for_conversion = [TensorType(shape=shape, name=input_names[0], dtype=np.float32)]
        else:
-            inputs_for_conversion = None
+            inputs_for_conversion = [
+                TensorType(shape=tuple(input_shape), name=input_names[0], dtype=np.float32)
+            ]
+
+        if dynamic_crops:
+            inputs_for_conversion += [
+                TensorType(shape=crops.shape, name=input_names[1], dtype=np.int32)
+            ]

        TensorFlowBaseTest.run_compare_tf(
            model,
            input_dict,
@@ -6576,7 +6821,7 @@ def build_model(x):
        )

    @pytest.mark.parametrize(
-        "compute_unit, backend, shape_block_crops, dynamic",
+        "compute_unit, backend, shape_block_crops, dynamic_input, dynamic_crops",
        itertools.product(
            compute_units,
            backends,
            [
@@ -6587,38 +6832,60 @@ def build_model(x):
                [(4, 4, 6, 1, 2), [2], [[0, 0]]],
            ],
            [True, False],
+            [True, False],
        ),
    )
-    def test_smoke_new_op(self, compute_unit, backend, shape_block_crops, dynamic):
+    def test_smoke_new_op(
+        self, compute_unit, backend, shape_block_crops, dynamic_input, dynamic_crops
+    ):
        input_shape, block_shape, crops = shape_block_crops
+        crops = np.array(crops, dtype=np.int32)

        # The neuralnetwork backend doesn't support these tests
        if backend[0] == "neuralnetwork":
            return

-        tf_input_shape = input_shape if not dynamic else [None] * len(input_shape)
-        @make_tf_graph([tf_input_shape])
-        def build_model(x):
-            return tf.raw_ops.BatchToSpaceND(
-                input=x, block_shape=block_shape, crops=crops
-            )
+        tf_input_shape = input_shape if not dynamic_input else [None] * len(input_shape)
+        if dynamic_crops:
+
+            @make_tf_graph([tf_input_shape, (*crops.shape, tf.int32)])
+            def build_model(x, crops):
+                return tf.raw_ops.BatchToSpaceND(input=x, block_shape=block_shape, crops=crops)
+
+        else:
+
+            @make_tf_graph([tf_input_shape])
+            def build_model(x):
+                return tf.raw_ops.BatchToSpaceND(input=x, block_shape=block_shape, crops=crops)
+
+        model, inputs, outputs = build_model

        # Before rdar://93071454 (batch_to_space errors out in Espresso for dynamic-input Core ML models) is fixed,
        # we need to specify the default shape for the dynamic model by setting inputs_for_conversion
-        if dynamic:
+        input_names = get_tf_node_names(inputs, mode="inputs")
+        if dynamic_input:
            shape = tuple(
                [
                    RangeDim(default=dim, upper_bound=dim if backend[0] == "mlprogram" else -1)
                    for dim in input_shape
                ]
            )
-            inputs_for_conversion = [TensorType(shape=shape, dtype=np.float32)]
+            inputs_for_conversion = [TensorType(shape=shape, name=input_names[0], dtype=np.float32)]
+        else:
+            inputs_for_conversion = [
+
TensorType(shape=tuple(input_shape), name=input_names[0], dtype=np.float32) + ] + + if dynamic_crops: + inputs_for_conversion += [ + TensorType(shape=crops.shape, name=input_names[1], dtype=np.int32) + ] + input_values = [random_gen(input_shape), crops] else: - inputs_for_conversion = None + input_values = [random_gen(input_shape)] - model, inputs, outputs = build_model - input_values = [random_gen(input_shape)] input_dict = dict(zip(inputs, input_values)) + TensorFlowBaseTest.run_compare_tf( model, input_dict, diff --git a/coremltools/converters/mil/frontend/tensorflow/test/test_tf_conversion_api.py b/coremltools/converters/mil/frontend/tensorflow/test/test_tf_conversion_api.py index 9f4866f71..60d28c259 100644 --- a/coremltools/converters/mil/frontend/tensorflow/test/test_tf_conversion_api.py +++ b/coremltools/converters/mil/frontend/tensorflow/test/test_tf_conversion_api.py @@ -41,12 +41,16 @@ @pytest.mark.skipif(ct.utils._macos_version() < (10, 15), reason='Model produces specification 4.') class TestTensorFlow1ConverterExamples: @staticmethod - def test_convert_from_frozen_graph(tmpdir): + @pytest.mark.parametrize( + "backend", + backends, + ) + def test_convert_from_frozen_graph(tmpdir, backend): with tf.Graph().as_default() as graph: x = tf.placeholder(tf.float32, shape=(1, 2, 3), name="input") y = tf.nn.relu(x, name="output") - mlmodel = ct.convert(graph, compute_units=ct.ComputeUnit.CPU_ONLY) + mlmodel = ct.convert(graph, convert_to=backend[0], compute_units=ct.ComputeUnit.CPU_ONLY) test_input = np.random.rand(1, 2, 3) - 0.5 with tf.compat.v1.Session(graph=graph) as sess: @@ -55,7 +59,11 @@ def test_convert_from_frozen_graph(tmpdir): np.testing.assert_allclose(results["output"], expected_val) @staticmethod - def test_convert_from_frozen_graph_file(tmpdir): + @pytest.mark.parametrize( + "backend", + backends, + ) + def test_convert_from_frozen_graph_file(tmpdir, backend): # create the model to convert # write a toy frozen graph @@ -85,6 +93,7 @@ def test_convert_from_frozen_graph_file(tmpdir): # We specify inputs with name matching the placeholder name. inputs=[ct.TensorType(name="input", shape=(1, 2, 3))], outputs=["output"], + convert_to=backend[0], ) # (2) Specify input TensorType without name (when there's only one @@ -94,16 +103,23 @@ def test_convert_from_frozen_graph_file(tmpdir): # TensorType name is optional when there's only one input. inputs=[ct.TensorType(shape=(1, 2, 3))], outputs=["output"], + convert_to=backend[0], ) # (3) Not specify inputs at all. `inputs` is optional for TF. When # inputs is not specified, convert() infers inputs from Placeholder # nodes. 
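        # (outputs is still passed by name below, so the converted model exposes
        # the intended output tensor for the prediction check.)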
- mlmodel = ct.convert(pb_path, outputs=["output"], compute_units=ct.ComputeUnit.CPU_ONLY) + mlmodel = ct.convert( + pb_path, + outputs=["output"], + convert_to=backend[0], + compute_units=ct.ComputeUnit.CPU_ONLY, + ) results = mlmodel.predict({"input": test_input}) np.testing.assert_allclose(results["output"], expected_val) - mlmodel_path = os.path.join(save_path, "model.mlmodel") + suffix = ".mlmodel" if backend[0] == "neuralnetwork" else ".mlpackage" + mlmodel_path = os.path.join(save_path, "model" + suffix) # Save the converted model mlmodel.save(mlmodel_path) @@ -111,7 +127,11 @@ def test_convert_from_frozen_graph_file(tmpdir): np.testing.assert_allclose(results["output"], expected_val, atol=1e-3) @staticmethod - def test_convert_from_saved_model_dir(tmpdir): + @pytest.mark.parametrize( + "backend", + backends, + ) + def test_convert_from_saved_model_dir(tmpdir, backend): # Sample input test_input = np.random.rand(1, 3, 5) - 0.5 @@ -130,7 +150,9 @@ def test_convert_from_saved_model_dir(tmpdir): # SavedModel directory generated by TensorFlow 1.x # when converting from SavedModel dir, inputs / outputs are optional - mlmodel = ct.convert(save_path, compute_units=ct.ComputeUnit.CPU_ONLY) + mlmodel = ct.convert( + save_path, convert_to=backend[0], compute_units=ct.ComputeUnit.CPU_ONLY + ) # Need input output names to call mlmodel # x.name == 'Placeholder:0'. Strip out ':0' @@ -142,7 +164,11 @@ def test_convert_from_saved_model_dir(tmpdir): @staticmethod - def test_freeze_and_convert_matmul_graph(): + @pytest.mark.parametrize( + "backend", + backends, + ) + def test_freeze_and_convert_matmul_graph(backend): # testing : https://coremltools.readme.io/docs/tensorflow-1#export-as-frozen-graph-and-convert graph = tf.Graph() with graph.as_default(): @@ -183,7 +209,7 @@ def test_freeze_and_convert_matmul_graph(): clear_devices=True, initializer_nodes="") print("Tensorflow frozen graph saved at {}".format(frozen_graph_file)) - ct.convert(frozen_graph_file) + ct.convert(frozen_graph_file, convert_to=backend[0]) @staticmethod def test_convert_tf1_frozen_graph_to_milinternal(tmpdir): @@ -223,7 +249,11 @@ def test_mil_op_names_consistency(tmpdir): @pytest.mark.skipif(not _HAS_TF_1, reason=MSG_TF1_NOT_FOUND) class TestTf1Inputs(_TestInputs): @staticmethod - def test_input_noname(): + @pytest.mark.parametrize( + "backend", + backends, + ) + def test_input_noname(backend): with tf.Graph().as_default() as graph: x = tf.placeholder(tf.float32, shape=(1, 2, 3), name="input") x1 = tf.placeholder(tf.float32, shape=(1, 2, 3), name="input_1") @@ -233,13 +263,18 @@ def test_input_noname(): with pytest.raises(ValueError) as e: model = ct.convert( graph, - inputs=[ct.TensorType(shape=(1, 2, 3))] + inputs=[ct.TensorType(shape=(1, 2, 3))], + convert_to=backend[0], ) expected_error = "Multiple inputs are found in graph, but no input name was provided" assert expected_error == str(e.value) @staticmethod - def test_input_wrongname(): + @pytest.mark.parametrize( + "backend", + backends, + ) + def test_input_wrongname(backend): with tf.Graph().as_default() as graph: x = tf.placeholder(tf.float32, shape=(1, 2, 3), name="input") x1 = tf.placeholder(tf.float32, shape=(1, 2, 3), name="input_1") @@ -249,7 +284,8 @@ def test_input_wrongname(): with pytest.raises(ValueError) as e: model = ct.convert( graph, - inputs=[ct.TensorType(shape=(1, 2, 3), name="wrong_input")] + inputs=[ct.TensorType(shape=(1, 2, 3), name="wrong_input")], + convert_to=backend[0], ) expected_error = "Multiple inputs are found in graph, but no input name 
was provided" expected_error = "Input ({}) provided is not found in given tensorflow graph. Placeholders in graph are: {}".format("wrong_input", ["input", "input_1"]) @@ -303,9 +339,33 @@ def test_input_dynamic_without_inputs_param(self, backend, compute_unit): ) @staticmethod + @pytest.mark.parametrize( + "backend", + backends, + ) @pytest.mark.skipif(not ct.utils._is_macos(), reason="test needs predictions") - def test_tf_predict_input(): - TestTf1Inputs._test_variant_input_type_prediction(tf.convert_to_tensor) + def test_tf_predict_input(backend): + TestTf1Inputs._test_variant_input_type_prediction(tf.convert_to_tensor, backend[0]) + +@pytest.fixture +def uint8_input_model(): + if not _HAS_TF_1: + pytest.skip(MSG_TF1_NOT_FOUND) + with tf.Graph().as_default() as graph: + x = tf.placeholder(tf.uint8, shape=[10, 20], name="input") + out = tf.add(x, tf.constant(5, dtype=tf.uint8), name="output") + return graph + + +@pytest.fixture +def int8_input_model(): + if not _HAS_TF_1: + pytest.skip(MSG_TF1_NOT_FOUND) + with tf.Graph().as_default() as graph: + x = tf.placeholder(tf.int8, shape=[10, 20], name="input") + out = tf.add(x, tf.constant(5, dtype=tf.int8), name="output") + return graph + @pytest.fixture def int32_input_model(): @@ -316,6 +376,54 @@ def int32_input_model(): out = tf.add(x, tf.constant(5, dtype=tf.int32), name="output") return graph + +@pytest.fixture +def int32_two_input_model(): + if not _HAS_TF_1: + pytest.skip(MSG_TF1_NOT_FOUND) + with tf.Graph().as_default() as graph: + x = tf.placeholder(tf.int32, shape=[10, 20], name="input1") + y = tf.placeholder(tf.int32, shape=[10, 20], name="input2") + out = tf.add(x, y, name="output") + return graph + + +@pytest.fixture +def int32_two_output_model(): + if not _HAS_TF_1: + pytest.skip(MSG_TF1_NOT_FOUND) + with tf.Graph().as_default() as graph: + x = tf.placeholder(tf.int32, shape=[10, 20], name="input1") + y = tf.placeholder(tf.int32, shape=[10, 20], name="input2") + out1 = tf.add(x, 1, name="output1") + out2 = tf.add(y, 1, name="output2") + return graph + + +@pytest.fixture +def int32_float32_two_output_model(): + if not _HAS_TF_1: + pytest.skip(MSG_TF1_NOT_FOUND) + with tf.Graph().as_default() as graph: + x = tf.placeholder(tf.float32, shape=[10, 20], name="input1") + y = tf.placeholder(tf.float32, shape=[10, 20], name="input2") + x_add = tf.add(x, 1.0, name="output1") + y_add = tf.add(y, 1.0) + y_cast = tf.cast(y_add, dtype=tf.int32, name="output2") + return graph + + +@pytest.fixture +def int32_float32_two_input_model(): + if not _HAS_TF_1: + pytest.skip(MSG_TF1_NOT_FOUND) + with tf.Graph().as_default() as graph: + x = tf.placeholder(tf.int32, shape=[10, 20], name="input1") + y = tf.placeholder(tf.float32, shape=[10, 20], name="input2") + x_cast = tf.cast(x, dtype=tf.float32) + out = tf.add(x_cast, y, name="output") + return graph + @pytest.fixture def float32_input_model_add_op(): if not _HAS_TF_1: @@ -365,6 +473,16 @@ def float32_two_output_model(): out1 = tf.nn.relu(y, name="output1") return graph +@pytest.fixture +def float64_input_model(): + if not _HAS_TF_1: + pytest.skip(MSG_TF1_NOT_FOUND) + with tf.Graph().as_default() as graph: + x = tf.placeholder(tf.float64, shape=[10, 20], name="input") + out = tf.add(x, tf.constant(5, dtype=tf.float64), name="output") + return graph + + @pytest.fixture def rank3_input_model(): if not _HAS_TF_1: @@ -430,12 +548,27 @@ class TestInputOutputConversionAPI: def test_input_dtype_inferred(self, int32_input_model): # test that the input dtype is picked up from TF correctly - mlmodel = 
ct.convert(int32_input_model, - minimum_deployment_target=ct.target.macOS12) + mlmodel = ct.convert(int32_input_model, minimum_deployment_target=ct.target.macOS12) + assert_input_dtype(mlmodel, expected_type_str="int32") + verify_prediction(mlmodel) + + def test_unsupported_input_dtype_in_tf_graph_uint8(self, uint8_input_model): + # test that no error is raised when no dtype is provided by the user, + # and the TF graph's input dtype is not supported. + # In this case, it will be mapped to the closest supported dtype + mlmodel = ct.convert(uint8_input_model, minimum_deployment_target=ct.target.macOS12) + assert_input_dtype(mlmodel, expected_type_str="int32") + verify_prediction(mlmodel) + + def test_unsupported_input_dtype_in_tf_graph_int8(self, int8_input_model): + # test that no error is raised when no dtype is provided by the user, + # and the TF graph's input dtype is not supported. + # In this case, it will be mapped to the closest supported dtype + mlmodel = ct.convert(int8_input_model, minimum_deployment_target=ct.target.macOS12) assert_input_dtype(mlmodel, expected_type_str="int32") verify_prediction(mlmodel) - def test_unsupported_input_dtype_in_tf_graph(self, int64_input_model): + def test_unsupported_input_dtype_in_tf_graph_int64(self, int64_input_model): # test that no error is raised when no dtype is provided by the user, # and the TF graph's input dtype is not supported. # In this case, it will be mapped to the closest supported dtype @@ -444,6 +577,14 @@ def test_unsupported_input_dtype_in_tf_graph(self, int64_input_model): assert_input_dtype(mlmodel, expected_type_str="int32") verify_prediction(mlmodel) + def test_unsupported_input_dtype_in_tf_graph_fp64(self, float64_input_model): + # test that no error is raised when no dtype is provided by the user, + # and the TF graph's input dtype is not supported. + # In this case, it will be mapped to the closest supported dtype + mlmodel = ct.convert(float64_input_model, minimum_deployment_target=ct.target.macOS12) + assert_input_dtype(mlmodel, expected_type_str="fp32") + verify_prediction(mlmodel) + def test_input_dtype_user_provided(self, int32_input_model): # test that provided dtype in the api overrides the input dtype in the TF model mlmodel = ct.convert(int32_input_model, @@ -473,29 +614,35 @@ def test_fp16_input_dtype(self, float32_input_model_add_op, float32_input_model_ """ Test that providing fp16 input dtype works with macOS13. """ - mlmodel = ct.convert(float32_input_model_add_op, - inputs=[ct.TensorType(dtype=np.float16)], - minimum_deployment_target=ct.target.macOS13 - ) + mlmodel = ct.convert( + float32_input_model_add_op, + inputs=[ct.TensorType(dtype=np.float16)], + outputs=[ct.TensorType(dtype=np.float32)], + minimum_deployment_target=ct.target.macOS13, + ) assert_ops_in_mil_program(mlmodel, expected_op_list=["add", "cast"]) assert_input_dtype(mlmodel, expected_type_str="fp16") assert_output_dtype(mlmodel, expected_type_str="fp32") verify_prediction(mlmodel) - mlmodel = ct.convert(float32_input_model_relu_ops, - inputs=[ct.TensorType(dtype=np.float16)], - minimum_deployment_target=ct.target.macOS13 - ) + mlmodel = ct.convert( + float32_input_model_relu_ops, + inputs=[ct.TensorType(dtype=np.float16)], + outputs=[ct.TensorType(dtype=np.float32)], + minimum_deployment_target=ct.target.macOS13, + ) # Two consecutive relus are merged in the `merge_consecutive_relus` pass. 
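        # (hence only a single "relu" op, plus the output "cast", is expected below)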
assert_ops_in_mil_program(mlmodel, expected_op_list=["relu", "cast"]) assert_input_dtype(mlmodel, expected_type_str="fp16") assert_output_dtype(mlmodel, expected_type_str="fp32") verify_prediction(mlmodel) - mlmodel = ct.convert(int32_input_model, - inputs=[ct.TensorType(dtype=np.float16)], - minimum_deployment_target=ct.target.macOS13, - ) + mlmodel = ct.convert( + int32_input_model, + inputs=[ct.TensorType(dtype=np.float16)], + outputs=[ct.TensorType(dtype=np.float32)], + minimum_deployment_target=ct.target.macOS13, + ) assert_ops_in_mil_program(mlmodel, expected_op_list=["add", "cast"]) assert_input_dtype(mlmodel, expected_type_str="fp16") assert_output_dtype(mlmodel, expected_type_str="fp32") @@ -506,30 +653,37 @@ def test_fp16_input_dtype_fp32_precision(self, float32_input_model_add_op, float """ Same test as test_fp16_input_dtype, but with Float32 precision """ - mlmodel = ct.convert(float32_input_model_add_op, - inputs=[ct.TensorType(dtype=np.float16)], - minimum_deployment_target=ct.target.macOS13, - compute_precision=ct.precision.FLOAT32, - ) + mlmodel = ct.convert( + float32_input_model_add_op, + inputs=[ct.TensorType(dtype=np.float16)], + outputs=[ct.TensorType(dtype=np.float32)], + minimum_deployment_target=ct.target.macOS13, + compute_precision=ct.precision.FLOAT32, + ) assert_ops_in_mil_program(mlmodel, expected_op_list=["cast", "add"]) assert_input_dtype(mlmodel, expected_type_str="fp16") assert_output_dtype(mlmodel, expected_type_str="fp32") verify_prediction(mlmodel) - mlmodel = ct.convert(float32_input_model_relu_ops, - inputs=[ct.TensorType(dtype=np.float16)], - minimum_deployment_target=ct.target.macOS13, - compute_precision=ct.precision.FLOAT32, - ) + mlmodel = ct.convert( + float32_input_model_relu_ops, + inputs=[ct.TensorType(dtype=np.float16)], + outputs=[ct.TensorType(dtype=np.float32)], + minimum_deployment_target=ct.target.macOS13, + compute_precision=ct.precision.FLOAT32, + ) assert_ops_in_mil_program(mlmodel, expected_op_list=["cast", "relu"]) assert_input_dtype(mlmodel, expected_type_str="fp16") assert_output_dtype(mlmodel, expected_type_str="fp32") def test_two_input_model(self, float32_two_input_model): # test forcing input type of "input1" to be int32 - mlmodel = ct.convert(float32_two_input_model, - inputs=[ct.TensorType(name="input1", dtype=np.int32)], - minimum_deployment_target=ct.target.macOS12) + mlmodel = ct.convert( + float32_two_input_model, + inputs=[ct.TensorType(name="input1", dtype=np.int32)], + outputs=[ct.TensorType(dtype=np.float32)], + minimum_deployment_target=ct.target.macOS12, + ) assert_input_dtype(mlmodel, expected_type_str="int32", expected_name="input1") assert_input_dtype(mlmodel, expected_type_str="fp32", expected_name="input2") assert_output_dtype(mlmodel, expected_type_str="fp32") @@ -553,11 +707,15 @@ def test_two_input_model(self, float32_two_input_model): minimum_deployment_target=ct.target.macOS12) # test forcing both inputs to be float16 - mlmodel = ct.convert(float32_two_input_model, - inputs=[ct.TensorType(name="input1", dtype=np.float16), - ct.TensorType(name="input2", dtype=np.float16), - ], - minimum_deployment_target=ct.target.macOS13) + mlmodel = ct.convert( + float32_two_input_model, + inputs=[ + ct.TensorType(name="input1", dtype=np.float16), + ct.TensorType(name="input2", dtype=np.float16), + ], + outputs=[ct.TensorType(dtype=np.float32)], + minimum_deployment_target=ct.target.macOS13, + ) assert_input_dtype(mlmodel, expected_type_str="fp16", expected_name="input1") assert_input_dtype(mlmodel, 
expected_type_str="fp16", expected_name="input2") assert_output_dtype(mlmodel, expected_type_str="fp32") @@ -607,11 +765,15 @@ def test_single_output_model(self, int32_input_model, float32_input_model_relu_o ) # test that output type float16 is applied correctly - mlmodel = ct.convert(float32_input_model_relu_ops, - outputs=[ct.TensorType(dtype=np.float16)], - minimum_deployment_target=ct.target.macOS13, - ) - assert_output_dtype(mlmodel, expected_type_str="fp16", expected_name="Identity" if _HAS_TF_2 else "output") + mlmodel = ct.convert( + float32_input_model_relu_ops, + inputs=[ct.TensorType(name="input", dtype=np.float32)], + outputs=[ct.TensorType(dtype=np.float16)], + minimum_deployment_target=ct.target.macOS13, + ) + assert_output_dtype( + mlmodel, expected_type_str="fp16", expected_name="Identity" if _HAS_TF_2 else "output" + ) assert_ops_in_mil_program(mlmodel, expected_op_list=["cast", "relu"]) # test that input and output types float16 are applied correctly @@ -667,10 +829,12 @@ def test_multi_output_model(self, float32_two_output_model): verify_prediction(mlmodel) def test_color_input(self, rank4_input_model, rank3_input_model): - mlmodel = ct.convert(rank4_input_model, - inputs=[ct.ImageType(color_layout=ct.colorlayout.RGB)], - minimum_deployment_target=ct.target.macOS13, - ) + mlmodel = ct.convert( + rank4_input_model, + inputs=[ct.ImageType(color_layout=ct.colorlayout.RGB)], + outputs=[ct.TensorType(dtype=np.float32)], + minimum_deployment_target=ct.target.macOS13, + ) assert_ops_in_mil_program(mlmodel, expected_op_list=["cast", "transpose", "add", "cast"]) assert_spec_input_image_type(mlmodel._spec, expected_feature_type=ft.ImageFeatureType.RGB) assert_prog_input_type(mlmodel._mil_program, expected_dtype_str="fp32") @@ -697,10 +861,12 @@ def test_grayscale_input(self, rank4_input_model, rank3_input_model, rank4_grays minimum_deployment_target=ct.target.macOS13, ) - mlmodel = ct.convert(rank4_grayscale_input_model, - inputs=[ct.ImageType(color_layout=ct.colorlayout.GRAYSCALE)], - minimum_deployment_target=ct.target.macOS13, - ) + mlmodel = ct.convert( + rank4_grayscale_input_model, + inputs=[ct.ImageType(color_layout=ct.colorlayout.GRAYSCALE)], + outputs=[ct.TensorType(dtype=np.float32)], + minimum_deployment_target=ct.target.macOS13, + ) assert_ops_in_mil_program(mlmodel, expected_op_list=["cast", "transpose", "add", "cast"]) assert_spec_input_image_type(mlmodel._spec, expected_feature_type=ft.ImageFeatureType.GRAYSCALE) assert_prog_input_type(mlmodel._mil_program, expected_dtype_str="fp32") @@ -752,10 +918,12 @@ def test_color_output(self, rank4_input_model, rank4_input_model_with_channel_fi verify_prediction(mlmodel) # check neural network conversion - mlmodel = ct.convert(rank4_input_model_with_channel_first_output, - inputs=[ct.ImageType(color_layout=ct.colorlayout.RGB)], - outputs=[ct.ImageType(color_layout=ct.colorlayout.BGR)], - ) + mlmodel = ct.convert( + rank4_input_model_with_channel_first_output, + inputs=[ct.ImageType(color_layout=ct.colorlayout.RGB)], + outputs=[ct.ImageType(color_layout=ct.colorlayout.BGR)], + convert_to="neuralnetwork", + ) assert_ops_in_mil_program(mlmodel, expected_op_list=["add"]) assert_spec_input_image_type(mlmodel._spec, expected_feature_type=ft.ImageFeatureType.RGB) assert_spec_output_image_type(mlmodel._spec, expected_feature_type=ft.ImageFeatureType.BGR) @@ -764,10 +932,12 @@ def test_color_output(self, rank4_input_model, rank4_input_model_with_channel_fi def test_grayscale_output(self, rank4_grayscale_input_model, 
rank4_grayscale_input_model_with_channel_first_output): # check that an error is raised if the output shape is not of form (1, 1, H, W) with pytest.raises(ValueError, match="Shape of the Grayscale image output,"): - mlmodel = ct.convert(rank4_grayscale_input_model, - inputs=[ct.ImageType(color_layout=ct.colorlayout.GRAYSCALE)], - outputs=[ct.ImageType(color_layout=ct.colorlayout.GRAYSCALE)], - ) + mlmodel = ct.convert( + rank4_grayscale_input_model, + inputs=[ct.ImageType(color_layout=ct.colorlayout.GRAYSCALE)], + outputs=[ct.ImageType(color_layout=ct.colorlayout.GRAYSCALE)], + convert_to="neuralnetwork", + ) with pytest.raises(TypeError, match="float16 dtype for outputs is only supported for deployment target >= iOS16/macOS13"): mlmodel = ct.convert(rank4_grayscale_input_model_with_channel_first_output, @@ -775,10 +945,12 @@ def test_grayscale_output(self, rank4_grayscale_input_model, rank4_grayscale_inp minimum_deployment_target=ct.target.macOS12, ) - mlmodel = ct.convert(rank4_grayscale_input_model_with_channel_first_output, - inputs=[ct.ImageType(color_layout=ct.colorlayout.GRAYSCALE)], - outputs=[ct.ImageType(color_layout=ct.colorlayout.GRAYSCALE)], - ) + mlmodel = ct.convert( + rank4_grayscale_input_model_with_channel_first_output, + inputs=[ct.ImageType(color_layout=ct.colorlayout.GRAYSCALE)], + outputs=[ct.ImageType(color_layout=ct.colorlayout.GRAYSCALE)], + convert_to="neuralnetwork", + ) assert_ops_in_mil_program(mlmodel, expected_op_list=["add"]) assert_spec_input_image_type(mlmodel._spec, expected_feature_type=ft.ImageFeatureType.GRAYSCALE) assert_spec_output_image_type(mlmodel._spec, expected_feature_type=ft.ImageFeatureType.GRAYSCALE) @@ -820,3 +992,470 @@ def test_linear_model(self, linear_model): assert_output_dtype(mlmodel, expected_type_str="fp16") assert_ops_in_mil_program(mlmodel, ["linear", "relu"]) verify_prediction(mlmodel) + + def test_default_input_dtype(self, int32_input_model, int32_two_input_model): + """ + If ``dtype`` is not specified, it defaults to the ``dtype`` of the + inputs in the TF model. 
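+
+        For example (a sketch of Case 1 below), an int32 TF placeholder converted
+        with ``inputs=[ct.TensorType(shape=(10, 20))]`` and no ``dtype`` comes
+        through as an int32 Core ML input.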
+ """ + # Case 1: Single input model with no dtype specified + mlmodel = ct.convert( + int32_input_model, + inputs=[ct.TensorType(shape=(10, 20))], + minimum_deployment_target=ct.target.macOS13, + ) + assert_input_dtype(mlmodel, expected_type_str="int32") + verify_prediction(mlmodel) + + # Case 2: two inputs model with dtype specified for the first input + mlmodel = ct.convert( + int32_two_input_model, + inputs=[ + ct.TensorType(name="input1", shape=(10, 20), dtype=np.float16), + ct.TensorType(name="input2", shape=(10, 20)), + ], + minimum_deployment_target=ct.target.macOS13, + ) + assert_input_dtype(mlmodel, expected_type_str="fp16", index=0) + assert_input_dtype(mlmodel, expected_type_str="int32", index=1) + verify_prediction(mlmodel) + + # Case 3: two inputs model with dtype specified for the second input + mlmodel = ct.convert( + int32_two_input_model, + inputs=[ + ct.TensorType(name="input1", shape=(10, 20)), + ct.TensorType(name="input2", shape=(10, 20), dtype=np.float16), + ], + minimum_deployment_target=ct.target.macOS13, + ) + assert_input_dtype(mlmodel, expected_type_str="int32", index=0) + assert_input_dtype(mlmodel, expected_type_str="fp16", index=1) + verify_prediction(mlmodel) + + # Case 4: two inputs model with no dtype specified for both inputs + mlmodel = ct.convert( + int32_two_input_model, + inputs=[ + ct.TensorType(name="input1", shape=(10, 20)), + ct.TensorType(name="input2", shape=(10, 20)), + ], + minimum_deployment_target=ct.target.macOS13, + ) + assert_input_dtype(mlmodel, expected_type_str="int32", index=0) + assert_input_dtype(mlmodel, expected_type_str="int32", index=1) + verify_prediction(mlmodel) + + +class TestiOS16DefaultIODtype: + def test_iO16_default_fp16_input( + self, + float32_input_model_add_op, + int32_input_model, + ): + """ + With minimum_deployment_target set >= iOS16, if the compute precision is + set to fp16. By default, a fp16 i/o model is produced for fp32 models. + However, if the users specify the dtype, the converter is going to respect that. + """ + # Case 1: fp32 single input model + mlmodel = ct.convert( + float32_input_model_add_op, + inputs=[ct.TensorType(shape=(10, 20))], + minimum_deployment_target=ct.target.iOS16, + ) + assert_input_dtype(mlmodel, expected_type_str="fp16") + verify_prediction(mlmodel) + + # Case 2: fp32 single input model + mlmodel = ct.convert( + float32_input_model_add_op, + minimum_deployment_target=ct.target.iOS16, + ) + assert_input_dtype(mlmodel, expected_type_str="fp16") + verify_prediction(mlmodel) + + # Case 3: int32 single input model. No change made. + mlmodel = ct.convert( + int32_input_model, + inputs=[ct.TensorType(shape=(10, 20))], + minimum_deployment_target=ct.target.iOS16, + ) + assert_input_dtype(mlmodel, expected_type_str="int32") + verify_prediction(mlmodel) + + def test_iO16_default_fp16_multiple_input( + self, + float32_two_input_model, + int32_two_input_model, + int32_float32_two_input_model, + ): + # Case 1: fp32 two inputs model. First input dtype missing + mlmodel = ct.convert( + float32_two_input_model, + inputs=[ + ct.TensorType(name="input1", shape=(10, 20)), + ct.TensorType(name="input2", shape=(10, 20), dtype=np.float32), + ], + minimum_deployment_target=ct.target.iOS16, + ) + assert_input_dtype(mlmodel, expected_type_str="fp16", index=0) + assert_input_dtype(mlmodel, expected_type_str="fp32", index=1) + verify_prediction(mlmodel) + + # Case 2: fp32 two inputs model. 
+        mlmodel = ct.convert(
+            float32_two_input_model,
+            inputs=[
+                ct.TensorType(name="input1", shape=(10, 20), dtype=np.float32),
+                ct.TensorType(name="input2", shape=(10, 20)),
+            ],
+            minimum_deployment_target=ct.target.iOS16,
+        )
+        assert_input_dtype(mlmodel, expected_type_str="fp32", index=0)
+        assert_input_dtype(mlmodel, expected_type_str="fp16", index=1)
+        verify_prediction(mlmodel)
+
+        # Case 3: fp32 two inputs model. Both dtype missing
+        mlmodel = ct.convert(
+            float32_two_input_model,
+            inputs=[
+                ct.TensorType(name="input1", shape=(10, 20)),
+                ct.TensorType(name="input2", shape=(10, 20)),
+            ],
+            minimum_deployment_target=ct.target.iOS16,
+        )
+        assert_input_dtype(mlmodel, expected_type_str="fp16", index=0)
+        assert_input_dtype(mlmodel, expected_type_str="fp16", index=1)
+        verify_prediction(mlmodel)
+
+        # Case 4: fp32 two inputs model. inputs not given
+        mlmodel = ct.convert(
+            float32_two_input_model,
+            minimum_deployment_target=ct.target.iOS16,
+        )
+        assert_input_dtype(mlmodel, expected_type_str="fp16", index=0)
+        assert_input_dtype(mlmodel, expected_type_str="fp16", index=1)
+        verify_prediction(mlmodel)
+
+        # Case 5: fp32 two inputs model. Both dtype given
+        mlmodel = ct.convert(
+            float32_two_input_model,
+            inputs=[
+                ct.TensorType(name="input1", shape=(10, 20), dtype=np.int32),
+                ct.TensorType(name="input2", shape=(10, 20), dtype=np.float32),
+            ],
+            minimum_deployment_target=ct.target.iOS16,
+        )
+        assert_input_dtype(mlmodel, expected_type_str="int32", index=0)
+        assert_input_dtype(mlmodel, expected_type_str="fp32", index=1)
+        verify_prediction(mlmodel)
+
+        # Case 6: int32 two inputs model. Both dtype missing. No change made.
+        mlmodel = ct.convert(
+            int32_two_input_model,
+            inputs=[
+                ct.TensorType(name="input1", shape=(10, 20)),
+                ct.TensorType(name="input2", shape=(10, 20)),
+            ],
+            minimum_deployment_target=ct.target.iOS16,
+        )
+        assert_input_dtype(mlmodel, expected_type_str="int32", index=0)
+        assert_input_dtype(mlmodel, expected_type_str="int32", index=1)
+        verify_prediction(mlmodel)
+
+        # Case 7: mixed dtype model with two inputs. Both dtype missing. The fp32 input is cast to fp16.
+        mlmodel = ct.convert(
+            int32_float32_two_input_model,
+            inputs=[
+                ct.TensorType(name="input1", shape=(10, 20)),
+                ct.TensorType(name="input2", shape=(10, 20)),
+            ],
+            minimum_deployment_target=ct.target.iOS16,
+        )
+        assert_input_dtype(mlmodel, expected_type_str="int32", index=0)
+        assert_input_dtype(mlmodel, expected_type_str="fp16", index=1)
+        verify_prediction(mlmodel)
+
+    def test_iO16_default_fp16_output(
+        self,
+        float32_input_model_add_op,
+        int32_input_model,
+    ):
+        """
+        With minimum_deployment_target >= iOS16 and the compute precision set to
+        fp16, an fp16 i/o model is produced by default for fp32 models. However,
+        if the user specifies a dtype, the converter respects it.
+        """
+        # Case 1: fp32 single output model
+        mlmodel = ct.convert(
+            float32_input_model_add_op,
+            minimum_deployment_target=ct.target.iOS16,
+        )
+        assert_output_dtype(mlmodel, expected_type_str="fp16")
+        verify_prediction(mlmodel)
+
+        # Case 2: int32 single output model. No change made.
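+        # (int32 is already a supported I/O dtype, so the iOS16 fp16 default does not apply)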
+ mlmodel = ct.convert( + int32_input_model, + minimum_deployment_target=ct.target.iOS16, + ) + assert_output_dtype(mlmodel, expected_type_str="int32") + verify_prediction(mlmodel) + + # Case 3: fp32 single output model, with dtype set by the user + mlmodel = ct.convert( + float32_input_model_add_op, + outputs=[ct.TensorType(dtype=np.float32)], + minimum_deployment_target=ct.target.iOS16, + ) + assert_output_dtype(mlmodel, expected_type_str="fp32") + verify_prediction(mlmodel) + + def test_iO16_default_fp16_multiple_output( + self, + float32_two_output_model, + int32_two_output_model, + int32_float32_two_output_model, + ): + output1_name = "Identity" if _HAS_TF_2 else "output1" + output2_name = "Identity_1" if _HAS_TF_2 else "output2" + + # Case 1: fp32 two outputs model. First output dtype missing + mlmodel = ct.convert( + float32_two_output_model, + outputs=[ + ct.TensorType(name=output1_name), + ct.TensorType(name=output2_name, dtype=np.float32), + ], + minimum_deployment_target=ct.target.iOS16, + ) + assert_output_dtype(mlmodel, expected_type_str="fp16", index=0) + assert_output_dtype(mlmodel, expected_type_str="fp32", index=1) + verify_prediction(mlmodel) + + # Case 2: fp32 two outputs model. Second output dtype missing + mlmodel = ct.convert( + float32_two_output_model, + outputs=[ + ct.TensorType(name=output1_name, dtype=np.int32), + ct.TensorType(name=output2_name), + ], + minimum_deployment_target=ct.target.iOS16, + ) + assert_output_dtype(mlmodel, expected_type_str="int32", index=0) + assert_output_dtype(mlmodel, expected_type_str="fp16", index=1) + verify_prediction(mlmodel) + + # Case 3: fp32 two outputs model. Both output dtype missing + mlmodel = ct.convert( + float32_two_output_model, + outputs=[ + ct.TensorType(name=output1_name), + ct.TensorType(name=output2_name), + ], + minimum_deployment_target=ct.target.iOS16, + ) + assert_output_dtype(mlmodel, expected_type_str="fp16", index=0) + assert_output_dtype(mlmodel, expected_type_str="fp16", index=1) + verify_prediction(mlmodel) + + # Case 4: fp32 two outputs model. outputs not set. + mlmodel = ct.convert( + float32_two_output_model, + minimum_deployment_target=ct.target.iOS16, + ) + assert_output_dtype(mlmodel, expected_type_str="fp16", index=0) + assert_output_dtype(mlmodel, expected_type_str="fp16", index=1) + verify_prediction(mlmodel) + + # Case 5: int32 two outputs model. outputs not set. No change happens. + mlmodel = ct.convert( + int32_two_output_model, + minimum_deployment_target=ct.target.iOS16, + ) + assert_output_dtype(mlmodel, expected_type_str="int32", index=0) + assert_output_dtype(mlmodel, expected_type_str="int32", index=1) + verify_prediction(mlmodel) + + # Case 6: int32 two outputs model. The first input is force set to fp32. + # In this case, the first output is inferred as fp32 as well, so it defaults + # to fp16. + mlmodel = ct.convert( + int32_two_output_model, + inputs=[ + ct.TensorType(name="input1", dtype=np.float32), + ct.TensorType(name="input2"), + ], + minimum_deployment_target=ct.target.iOS16, + ) + assert_output_dtype(mlmodel, expected_type_str="fp16", index=0) + assert_output_dtype(mlmodel, expected_type_str="int32", index=1) + verify_prediction(mlmodel) + + # Case 7: int32 two outputs model. The second input is force set to fp16. + # In this case, the second output is inferred as fp32 as well, so it defaults + # to fp16. 
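+        # Put differently: forcing a float dtype onto an int32 input makes the
+        # output it feeds float as well, and an unspecified float output then
+        # defaults to fp16; outputs fed only by untouched int32 inputs stay int32.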
+ mlmodel = ct.convert( + int32_two_output_model, + inputs=[ + ct.TensorType(name="input1"), + ct.TensorType(name="input2", dtype=np.float16), + ], + minimum_deployment_target=ct.target.iOS16, + ) + assert_output_dtype(mlmodel, expected_type_str="int32", index=0) + assert_output_dtype(mlmodel, expected_type_str="fp16", index=1) + verify_prediction(mlmodel) + + # Case 8: two outputs model with int32/fp32. + # In this case, the fp32 output defaults to fp16, while the int32 one remains unchanged. + mlmodel = ct.convert( + int32_float32_two_output_model, + minimum_deployment_target=ct.target.iOS16, + ) + assert_output_dtype(mlmodel, expected_type_str="fp16", index=0) + assert_output_dtype(mlmodel, expected_type_str="int32", index=1) + verify_prediction(mlmodel) + + def test_iO17_default_fp32_io( + self, + int32_float32_two_input_model, + int32_float32_two_output_model, + ): + """ + With minimum_deployment_target set >= iOS16, and if the compute precision is + set to fp32. By default, a fp32 i/o model is produced. + """ + # Example 1 + mlmodel = ct.convert( + int32_float32_two_input_model, + compute_precision=ct.precision.FLOAT32, + minimum_deployment_target=ct.target.iOS16, + ) + assert_input_dtype(mlmodel, expected_type_str="int32", index=0) + assert_input_dtype(mlmodel, expected_type_str="fp32", index=1) + assert_output_dtype(mlmodel, expected_type_str="fp32", index=0) + + # Example 2 + mlmodel = ct.convert( + int32_float32_two_output_model, + compute_precision=ct.precision.FLOAT32, + minimum_deployment_target=ct.target.iOS16, + ) + assert_input_dtype(mlmodel, expected_type_str="fp32", index=0) + assert_input_dtype(mlmodel, expected_type_str="fp32", index=1) + assert_output_dtype(mlmodel, expected_type_str="fp32", index=0) + assert_output_dtype(mlmodel, expected_type_str="int32", index=1) + + def test_iO16_default_image_dtype_input( + self, + rank4_input_model, + rank4_grayscale_input_model, + ): + """ + We keep the input dtype for the image input model to fp32, unless it is GRAYSCALE_FLOAT16 + """ + # Example 1 + mlmodel = ct.convert( + rank4_input_model, + inputs=[ct.ImageType(color_layout=ct.colorlayout.RGB)], + minimum_deployment_target=ct.target.iOS16, + ) + assert_spec_input_image_type(mlmodel._spec, expected_feature_type=ft.ImageFeatureType.RGB) + assert_prog_input_type(mlmodel._mil_program, expected_dtype_str="fp32") + assert_prog_output_type(mlmodel._mil_program, expected_dtype_str="fp16") + verify_prediction(mlmodel) + + # Example 2 + mlmodel = ct.convert( + rank4_input_model, + inputs=[ct.ImageType(color_layout=ct.colorlayout.BGR)], + minimum_deployment_target=ct.target.iOS16, + ) + assert_spec_input_image_type(mlmodel._spec, expected_feature_type=ft.ImageFeatureType.BGR) + assert_prog_input_type(mlmodel._mil_program, expected_dtype_str="fp32") + assert_prog_output_type(mlmodel._mil_program, expected_dtype_str="fp16") + verify_prediction(mlmodel) + + # Example 3 + mlmodel = ct.convert( + rank4_grayscale_input_model, + inputs=[ct.ImageType(color_layout=ct.colorlayout.GRAYSCALE)], + minimum_deployment_target=ct.target.iOS16, + ) + assert_spec_input_image_type( + mlmodel._spec, expected_feature_type=ft.ImageFeatureType.GRAYSCALE + ) + assert_prog_input_type(mlmodel._mil_program, expected_dtype_str="fp32") + assert_prog_output_type(mlmodel._mil_program, expected_dtype_str="fp16") + verify_prediction(mlmodel) + + # Example 4 + mlmodel = ct.convert( + rank4_grayscale_input_model, + inputs=[ct.ImageType(color_layout=ct.colorlayout.GRAYSCALE_FLOAT16)], + 
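+            # GRAYSCALE_FLOAT16 is the only image color layout whose
+            # program-level i/o dtype stays fp16; RGB, BGR, and GRAYSCALE
+            # image models keep fp32 i/o (see the docstring above).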
minimum_deployment_target=ct.target.iOS16, + ) + assert_spec_input_image_type( + mlmodel._spec, expected_feature_type=ft.ImageFeatureType.GRAYSCALE_FLOAT16 + ) + assert_prog_input_type(mlmodel._mil_program, expected_dtype_str="fp16") + assert_prog_output_type(mlmodel._mil_program, expected_dtype_str="fp16") + verify_prediction(mlmodel) + + def test_iO16_default_image_dtype_output( + self, + rank4_input_model_with_channel_first_output, + rank4_grayscale_input_model_with_channel_first_output, + ): + """ + We keep the output dtype for the image input model to fp32, unless it is GRAYSCALE_FLOAT16 + """ + # Example 1 + mlmodel = ct.convert( + rank4_input_model_with_channel_first_output, + outputs=[ct.ImageType(color_layout=ct.colorlayout.RGB)], + minimum_deployment_target=ct.target.iOS16, + ) + assert_prog_input_type(mlmodel._mil_program, expected_dtype_str="fp16") + assert_prog_output_type(mlmodel._mil_program, expected_dtype_str="fp32") + assert_spec_output_image_type(mlmodel._spec, expected_feature_type=ft.ImageFeatureType.RGB) + verify_prediction(mlmodel) + + # Example 2 + mlmodel = ct.convert( + rank4_input_model_with_channel_first_output, + outputs=[ct.ImageType(color_layout=ct.colorlayout.BGR)], + minimum_deployment_target=ct.target.iOS16, + ) + assert_prog_input_type(mlmodel._mil_program, expected_dtype_str="fp16") + assert_prog_output_type(mlmodel._mil_program, expected_dtype_str="fp32") + assert_spec_output_image_type(mlmodel._spec, expected_feature_type=ft.ImageFeatureType.BGR) + verify_prediction(mlmodel) + + # Example 3 + mlmodel = ct.convert( + rank4_grayscale_input_model_with_channel_first_output, + outputs=[ct.ImageType(color_layout=ct.colorlayout.GRAYSCALE)], + minimum_deployment_target=ct.target.iOS16, + ) + assert_prog_input_type(mlmodel._mil_program, expected_dtype_str="fp16") + assert_prog_output_type(mlmodel._mil_program, expected_dtype_str="fp32") + assert_spec_output_image_type( + mlmodel._spec, expected_feature_type=ft.ImageFeatureType.GRAYSCALE + ) + verify_prediction(mlmodel) + + # Example 4 + mlmodel = ct.convert( + rank4_grayscale_input_model_with_channel_first_output, + outputs=[ct.ImageType(color_layout=ct.colorlayout.GRAYSCALE_FLOAT16)], + minimum_deployment_target=ct.target.iOS16, + ) + assert_prog_input_type(mlmodel._mil_program, expected_dtype_str="fp16") + assert_prog_output_type(mlmodel._mil_program, expected_dtype_str="fp16") + assert_spec_output_image_type( + mlmodel._spec, expected_feature_type=ft.ImageFeatureType.GRAYSCALE_FLOAT16 + ) + verify_prediction(mlmodel) diff --git a/coremltools/converters/mil/frontend/tensorflow2/load.py b/coremltools/converters/mil/frontend/tensorflow2/load.py index 624475c0d..654b84dad 100644 --- a/coremltools/converters/mil/frontend/tensorflow2/load.py +++ b/coremltools/converters/mil/frontend/tensorflow2/load.py @@ -205,6 +205,7 @@ def _program_from_tf_ssa(self): inputs=self.kwargs["inputs"], outputs=self.kwargs["outputs"], opset_version=self.kwargs["specification_version"], + use_default_fp16_io=self.kwargs["use_default_fp16_io"], ) return converter.convert() diff --git a/coremltools/converters/mil/frontend/tensorflow2/test/test_tf2_conversion_api.py b/coremltools/converters/mil/frontend/tensorflow2/test/test_tf2_conversion_api.py index 71c9a3f89..a867bbe31 100644 --- a/coremltools/converters/mil/frontend/tensorflow2/test/test_tf2_conversion_api.py +++ b/coremltools/converters/mil/frontend/tensorflow2/test/test_tf2_conversion_api.py @@ -3,6 +3,7 @@ # Use of this source code is governed by a BSD-3-clause license that can be 
 # found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
 
+import itertools
 import os
 import platform
 from os import chdir, getcwd
@@ -13,7 +14,19 @@
 import pytest
 
 import coremltools as ct
+from coremltools.converters.mil.frontend.tensorflow.test.test_tf_conversion_api import (
+    TestInputOutputConversionAPI as TestTf2InputOutputConversionAPI,
+)
+from coremltools.converters.mil.frontend.tensorflow.test.test_tf_conversion_api import (
+    TestiOS16DefaultIODtype as TestTf2iOS16DefaultIODtype,
+)
 from coremltools.converters.mil.mil import types
+from coremltools.converters.mil.testing_reqs import backends
+
+# We need to keep these asserts; otherwise the pre-commit hook would remove the
+# TestInputOutputConversionAPI and TestiOS16DefaultIODtype imports as unused.
+assert TestTf2InputOutputConversionAPI is not None
+assert TestTf2iOS16DefaultIODtype is not None
+
 
 tf = pytest.importorskip("tensorflow", minversion="2.1.0")
 
@@ -22,12 +35,57 @@
 from tensorflow.keras import layers
 
 
+@pytest.fixture
+def uint8_input_model():
+    x = tf.keras.Input(batch_input_shape=(10, 20), name="input", dtype=tf.uint8)
+    out = tf.add(x, tf.constant(5, dtype=tf.uint8), name="output")
+    return tf.keras.Model(inputs=x, outputs=out)
+
+
+@pytest.fixture
+def int8_input_model():
+    x = tf.keras.Input(batch_input_shape=(10, 20), name="input", dtype=tf.int8)
+    out = tf.add(x, tf.constant(5, dtype=tf.int8), name="output")
+    return tf.keras.Model(inputs=x, outputs=out)
+
 @pytest.fixture
 def int32_input_model():
     x = tf.keras.Input(batch_input_shape=(10, 20), name="input", dtype=tf.int32)
     out = tf.add(x, tf.constant(5, dtype=tf.int32), name="output")
     return tf.keras.Model(inputs=x, outputs=out)
 
+@pytest.fixture
+def int32_two_input_model():
+    x = tf.keras.Input(batch_input_shape=(10, 20), name="input1", dtype=tf.int32)
+    y = tf.keras.Input(batch_input_shape=(10, 20), name="input2", dtype=tf.int32)
+    out = tf.add(x, y, name="output")
+    return tf.keras.Model(inputs=[x, y], outputs=out)
+
+@pytest.fixture
+def int32_two_output_model():
+    x = tf.keras.Input(batch_input_shape=(10, 20), name="input1", dtype=tf.int32)
+    y = tf.keras.Input(batch_input_shape=(10, 20), name="input2", dtype=tf.int32)
+    out1 = tf.add(x, 1, name="output1")
+    out2 = tf.add(y, 1, name="output2")
+    return tf.keras.Model(inputs=[x, y], outputs=[out1, out2])
+
+@pytest.fixture
+def int32_float32_two_output_model():
+    x = tf.keras.Input(batch_input_shape=(10, 20), name="input1", dtype=tf.float32)
+    y = tf.keras.Input(batch_input_shape=(10, 20), name="input2", dtype=tf.float32)
+    x_add = tf.add(x, 1.0, name="output1")
+    y_add = tf.add(y, 1.0)
+    y_cast = tf.cast(y_add, dtype=tf.int32, name="output2")
+    return tf.keras.Model(inputs=[x, y], outputs=[x_add, y_cast])
+
+@pytest.fixture
+def int32_float32_two_input_model():
+    x = tf.keras.Input(batch_input_shape=(10, 20), name="input1", dtype=tf.int32)
+    y = tf.keras.Input(batch_input_shape=(10, 20), name="input2", dtype=tf.float32)
+    x_cast = tf.cast(x, dtype=tf.float32)
+    out = tf.add(x_cast, y, name="output")
+    return tf.keras.Model(inputs=[x, y], outputs=out)
+
 @pytest.fixture
 def float32_input_model_add_op():
     x = tf.keras.Input(batch_input_shape=(10, 20), name="input", dtype=tf.float32)
@@ -62,6 +120,13 @@ def float32_two_output_model():
     out1 = tf.nn.relu(y, name="output1")
     return tf.keras.Model(inputs=x, outputs=[out1, out2])
 
+
+@pytest.fixture
+def float64_input_model():
+    x = tf.keras.Input(batch_input_shape=(10, 20), name="input", dtype=tf.float64)
+    out = tf.add(x, tf.constant(5, dtype=tf.float64), name="output")
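+    # Core ML has no fp64 tensor type; this fixture presumably exists to check
+    # that fp64 models fall back to fp32 i/o during conversion.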
return tf.keras.Model(inputs=x, outputs=out) + @pytest.fixture def rank3_input_model(): x = tf.keras.Input(batch_input_shape=(1, 10, 20), name="input", dtype=tf.float32) @@ -139,7 +204,11 @@ def teardown_class(self): rmtree(self._temp_dir) @staticmethod - def test_convert_tf_keras_h5_file(): + @pytest.mark.parametrize( + "backend", + backends, + ) + def test_convert_tf_keras_h5_file(backend): if platform.machine() == "arm64": pytest.xfail("rdar://101162740 ([CI] [TF] The tf_keras_h5_file API testing is failing on M1 with new OS)") @@ -151,25 +220,29 @@ def test_convert_tf_keras_h5_file(): save_dir = str(temp_dir) path = os.path.join(save_dir, "tf_keras_model." + file_extension) keras_model.save(path) - mlmodel = ct.convert(path) + mlmodel = ct.convert(path, convert_to=backend[0]) test_input = np.random.rand(2, 32) expected_val = keras_model(test_input) results = mlmodel.predict({"input": test_input}) - np.testing.assert_allclose(results["Identity"], expected_val, rtol=1e-4) + np.testing.assert_allclose(results["Identity"], expected_val, rtol=1e-2, atol=1e-2) @staticmethod - def test_convert_tf_keras_model(): + @pytest.mark.parametrize( + "backend", + backends, + ) + def test_convert_tf_keras_model(backend): x = tf.keras.Input(shape=(32,), name="input") y = tf.keras.layers.Dense(16, activation="softmax")(x) keras_model = tf.keras.Model(x, y) - mlmodel = ct.convert(keras_model) + mlmodel = ct.convert(keras_model, convert_to=backend[0]) test_input = np.random.rand(2, 32) expected_val = keras_model(test_input) results = mlmodel.predict({"input": test_input}) - np.testing.assert_allclose(results["Identity"], expected_val, rtol=1e-4) + np.testing.assert_allclose(results["Identity"], expected_val, rtol=0.005) @staticmethod @pytest.mark.parametrize( @@ -193,18 +266,23 @@ def test_convert_tf_keras_applications_model(dtype): mlmodel = ct.convert( tf_keras_model, inputs=[ct.TensorType(shape=(1, 224, 224, 3), dtype=dtype)], + convert_to="neuralnetwork", ) mlmodel.save("./mobilenet.mlmodel") @staticmethod def test_convert_from_saved_model_dir(): # SavedModel directory generated by TensorFlow 2.x - mlmodel = ct.convert("./saved_model") + mlmodel = ct.convert("./saved_model", convert_to="neuralnetwork") mlmodel.save("./model.mlmodel") @staticmethod - def test_keras_custom_layer_model(): + @pytest.mark.parametrize( + "backend", + backends, + ) + def test_keras_custom_layer_model(backend): # testing : https://coremltools.readme.io/docs/tensorflow-2#conversion-from-user-defined-models class CustomDense(layers.Layer): @@ -228,10 +306,14 @@ def call(self, inputs): inputs = keras.Input((4,)) outputs = CustomDense(10)(inputs) model = keras.Model(inputs, outputs) - ct.convert(model) + ct.convert(model, convert_to=backend[0]) @staticmethod - def test_concrete_function_conversion(): + @pytest.mark.parametrize( + "backend", + backends, + ) + def test_concrete_function_conversion(backend): # testing : https://coremltools.readme.io/docs/tensorflow-2#conversion-from-user-defined-models @tf.function(input_signature=[tf.TensorSpec(shape=(6,), dtype=tf.float32)]) @@ -241,7 +323,7 @@ def gelu_tanh_activation(x): return x * y conc_func = gelu_tanh_activation.get_concrete_function() - mlmodel = ct.convert([conc_func]) + mlmodel = ct.convert([conc_func], convert_to=backend[0]) @staticmethod def test_convert_tf2_keras(): @@ -255,8 +337,14 @@ def test_convert_tf2_keras(): class TestTF2FlexibleInput: # Test examples in https://coremltools.readme.io/docs/flexible-inputs @staticmethod - @pytest.mark.parametrize("use_symbol", 
[True, False]) - def test_tf2keras_shared_range_dim(use_symbol): + @pytest.mark.parametrize( + "use_symbol, backend", + itertools.product( + [True, False], + backends, + ), + ) + def test_tf2keras_shared_range_dim(use_symbol, backend): input_dim = 3 # None denotes seq_len dimension x1 = tf.keras.Input(shape=(None,input_dim), name="seq1") @@ -265,16 +353,15 @@ def test_tf2keras_shared_range_dim(use_symbol): keras_model = tf.keras.Model(inputs=[x1, x2], outputs=[y]) # One RangeDim shared by two inputs + upper_bound = -1 if backend[0] == "neuralnetwork" else 5 if use_symbol: - seq_len_dim = ct.RangeDim(symbol='seq_len') + seq_len_dim = ct.RangeDim(symbol="seq_len", upper_bound=upper_bound) else: # symbol is optional - seq_len_dim = ct.RangeDim() + seq_len_dim = ct.RangeDim(upper_bound=upper_bound) seq1_input = ct.TensorType(name="seq1", shape=(1, seq_len_dim, input_dim)) seq2_input = ct.TensorType(name="seq2", shape=(1, seq_len_dim, input_dim)) - mlmodel = ct.convert(keras_model, - inputs=[seq1_input, seq2_input]) - + mlmodel = ct.convert(keras_model, inputs=[seq1_input, seq2_input], convert_to=backend[0]) batch = 1 seq_len = 5 test_input_x1 = np.random.rand(batch, seq_len, input_dim).astype(np.float32) @@ -289,7 +376,11 @@ def test_tf2keras_shared_range_dim(use_symbol): @staticmethod - def test_tf2keras_incorrect_range_dim(): + @pytest.mark.parametrize( + "backend", + backends, + ) + def test_tf2keras_incorrect_range_dim(backend): input_dim = 3 # None denotes seq_len dimension x1 = tf.keras.Input(shape=(None,input_dim), name="seq1") @@ -301,11 +392,17 @@ def test_tf2keras_incorrect_range_dim(): with pytest.raises(ValueError, match=r"Can\'t convert to CoreML shaping"): seq1_input = ct.TensorType(name="seq1", shape=(1, -1, input_dim)) - mlmodel = ct.convert(keras_model, inputs=[seq1_input]) + mlmodel = ct.convert(keras_model, inputs=[seq1_input], convert_to=backend[0]) @staticmethod - @pytest.mark.parametrize("use_symbol", [True, False]) - def test_tf2keras_outofbound_range_dim(use_symbol): + @pytest.mark.parametrize( + "use_symbol, backend", + itertools.product( + [True, False], + backends, + ), + ) + def test_tf2keras_outofbound_range_dim(use_symbol, backend): input_dim = 3 # None denotes seq_len dimension x = tf.keras.Input(shape=(None,input_dim), name="seq") @@ -318,7 +415,7 @@ def test_tf2keras_outofbound_range_dim(use_symbol): else: seq_len_dim = ct.RangeDim(lower_bound=3, upper_bound=5) seq_input = ct.TensorType(name="seq", shape=(1, seq_len_dim, input_dim)) - mlmodel = ct.convert(keras_model, inputs=[seq_input]) + mlmodel = ct.convert(keras_model, inputs=[seq_input], convert_to=backend[0]) # seq_len is within bound batch = 1 @@ -346,7 +443,11 @@ def test_tf2keras_outofbound_range_dim(use_symbol): results = mlmodel.predict({"seq": test_input_x}) @staticmethod - def test_tf2_image_enumerated_shapes(): + @pytest.mark.parametrize( + "backend", + backends, + ) + def test_tf2_image_enumerated_shapes(backend): keras_model = tf.keras.applications.MobileNetV2( input_shape=(None, None, 3,), classes=1000, @@ -355,13 +456,17 @@ def test_tf2_image_enumerated_shapes(): input_shapes = ct.EnumeratedShapes(shapes=[(1, 192, 192, 3), (1, 224, 224, 3)]) image_input = ct.ImageType(shape=input_shapes, bias=[-1,-1,-1], scale=1/127) - model = ct.convert(keras_model, inputs=[image_input]) + model = ct.convert(keras_model, inputs=[image_input], convert_to=backend[0]) assert model is not None spec = model.get_spec() assert len(spec.description.input[0].type.imageType.enumeratedSizes.sizes) == 2 @staticmethod - 
def test_tf2keras_enumerated_shapes(): + @pytest.mark.parametrize( + "backend", + backends, + ) + def test_tf2keras_enumerated_shapes(backend): input_shape = (28, 28, 3) # None denotes seq_len dimension x = tf.keras.Input(shape=input_shape, name="input") @@ -375,7 +480,7 @@ def test_tf2keras_enumerated_shapes(): shapes = [(1, 28, 28, 3), (1, 56, 56, 3)] enumerated_shapes = ct.EnumeratedShapes(shapes=shapes) tensor_input = ct.TensorType(name="input", shape=enumerated_shapes) - mlmodel = ct.convert(keras_model, inputs=[tensor_input]) + mlmodel = ct.convert(keras_model, inputs=[tensor_input], convert_to=backend[0]) # Test (1, 28, 28, 3) shape test_input_x = np.random.rand(*shapes[0]).astype(np.float32) @@ -402,7 +507,11 @@ def test_tf2keras_enumerated_shapes(): "input": test_input_x}) @staticmethod - def test_tf2keras_optional_input(): + @pytest.mark.parametrize( + "backend", + backends, + ) + def test_tf2keras_optional_input(backend): input_dim = 3 # None denotes seq_len dimension x1 = tf.keras.Input(shape=(None,input_dim), name="optional_input") @@ -410,7 +519,8 @@ def test_tf2keras_optional_input(): y = x1 + x2 keras_model = tf.keras.Model(inputs=[x1, x2], outputs=[y]) - seq_len_dim = ct.RangeDim() + upper_bound = -1 if backend[0] == "neuralnetwork" else 2 + seq_len_dim = ct.RangeDim(upper_bound=upper_bound) default_value = np.ones((1, 2, input_dim)).astype(np.float32) optional_input = ct.TensorType( name="optional_input", @@ -420,9 +530,10 @@ def test_tf2keras_optional_input(): required_input = ct.TensorType( name="required_input", shape=(1, seq_len_dim, input_dim), - ) - mlmodel = ct.convert(keras_model, - inputs=[optional_input, required_input]) + ) + mlmodel = ct.convert( + keras_model, inputs=[optional_input, required_input], convert_to=backend[0] + ) batch = 1 seq_len = 2 diff --git a/coremltools/converters/mil/frontend/tensorflow2/test/test_v2_load.py b/coremltools/converters/mil/frontend/tensorflow2/test/test_v2_load.py index 14877a904..376abf2de 100644 --- a/coremltools/converters/mil/frontend/tensorflow2/test/test_v2_load.py +++ b/coremltools/converters/mil/frontend/tensorflow2/test/test_v2_load.py @@ -15,7 +15,7 @@ from coremltools.converters.mil.frontend.tensorflow.test.testing_utils import \ get_tf_keras_io_names from coremltools.converters.mil.input_types import TensorType - +from coremltools.converters.mil.testing_reqs import backends tf = pytest.importorskip("tensorflow", minversion="2.1.0") @@ -31,7 +31,11 @@ def teardown(self): if os.path.exists(self.saved_model_dir): shutil.rmtree(self.saved_model_dir) - def test_keras_model(self): + @pytest.mark.parametrize( + "backend", + backends, + ) + def test_keras_model(self, backend): keras_model = tf.keras.Sequential( [tf.keras.layers.ReLU(input_shape=(4, 5), batch_size=3)] ) @@ -41,10 +45,15 @@ def test_keras_model(self): inputs=[TensorType(input_names[0], (3, 4, 5))], outputs=["Identity"], source=frontend, + convert_to=backend[0], ) assert mlmodel is not None - def test_keras_saved_model_file(self): + @pytest.mark.parametrize( + "backend", + backends, + ) + def test_keras_saved_model_file(self, backend): keras_model = tf.keras.Sequential( [ tf.keras.layers.Flatten(input_shape=(28, 28), batch_size=1), @@ -53,11 +62,15 @@ def test_keras_saved_model_file(self): ) keras_model.save(self.saved_model_dir, save_format="tf") mlmodel = converter.convert( - self.saved_model_dir, outputs=["Identity"], source=frontend + self.saved_model_dir, outputs=["Identity"], source=frontend, convert_to=backend[0] ) assert mlmodel is not None - def 
test_keras_h5_file(self): + @pytest.mark.parametrize( + "backend", + backends, + ) + def test_keras_h5_file(self, backend): keras_model = tf.keras.Sequential( [tf.keras.layers.ReLU(input_shape=(4, 5), batch_size=3)] ) @@ -68,10 +81,15 @@ def test_keras_h5_file(self): inputs=[TensorType(input_names[0], (3, 4, 5))], outputs=["Identity"], source=frontend, + convert_to=backend[0], ) assert mlmodel is not None - def test_keras_hdf5_file(self): + @pytest.mark.parametrize( + "backend", + backends, + ) + def test_keras_hdf5_file(self, backend): keras_model = tf.keras.Sequential( [tf.keras.layers.ReLU(input_shape=(4, 5), batch_size=3)] ) @@ -82,10 +100,15 @@ def test_keras_hdf5_file(self): inputs=[TensorType(input_names[0], (3, 4, 5))], outputs=["Identity"], source=frontend, + convert_to=backend[0], ) assert mlmodel is not None - def test_concrete_function_list_from_tf_low_level_api(self): + @pytest.mark.parametrize( + "backend", + backends, + ) + def test_concrete_function_list_from_tf_low_level_api(self, backend): root = tf.train.Checkpoint() root.v1 = tf.Variable(3.0) root.v2 = tf.Variable(2.0) @@ -100,11 +123,15 @@ def test_concrete_function_list_from_tf_low_level_api(self): tf.saved_model.DEFAULT_SERVING_SIGNATURE_DEF_KEY ] mlmodel = converter.convert( - [concrete_func], outputs=["Identity"], source=frontend + [concrete_func], outputs=["Identity"], source=frontend, convert_to=backend[0] ) assert mlmodel is not None - def test_saved_model_list_from_tf_function(self): + @pytest.mark.parametrize( + "backend", + backends, + ) + def test_saved_model_list_from_tf_function(self, backend): class build_model(tf.Module): @tf.function( input_signature=[tf.TensorSpec(shape=[3, 4, 5], dtype=tf.float32)] @@ -115,11 +142,15 @@ def __call__(self, x): model = build_model() tf.saved_model.save(model, self.saved_model_dir) mlmodel = converter.convert( - self.saved_model_dir, outputs=["Identity"], source=frontend + self.saved_model_dir, outputs=["Identity"], source=frontend, convert_to=backend[0] ) assert mlmodel is not None - def test_concrete_function_list_from_tf_function(self): + @pytest.mark.parametrize( + "backend", + backends, + ) + def test_concrete_function_list_from_tf_function(self, backend): class build_model(tf.Module): @tf.function( input_signature=[tf.TensorSpec(shape=[3, 4, 5], dtype=tf.float32)] @@ -130,11 +161,15 @@ def __call__(self, x): model = build_model() concrete_func = model.__call__.get_concrete_function() mlmodel = converter.convert( - [concrete_func], outputs=["Identity"], source=frontend + [concrete_func], outputs=["Identity"], source=frontend, convert_to=backend[0] ) assert mlmodel is not None - def test_graphdef_from_tf_function(self): + @pytest.mark.parametrize( + "backend", + backends, + ) + def test_graphdef_from_tf_function(self, backend): class build_model(tf.Module): def __init__(self): self.dense = tf.keras.layers.Dense(256, activation="relu") @@ -157,10 +192,14 @@ def call(self, x): model.call.get_concrete_function()) frozen_graph_def = frozen_graph_func.graph.as_graph_def() - mlmodel = converter.convert(frozen_graph_def) + mlmodel = converter.convert(frozen_graph_def, convert_to=backend[0]) assert mlmodel is not None - def test_model_metadata(self): + @pytest.mark.parametrize( + "backend", + backends, + ) + def test_model_metadata(self, backend): keras_model = tf.keras.Sequential( [tf.keras.layers.ReLU(input_shape=(4, 5), batch_size=3)] ) @@ -170,25 +209,38 @@ def test_model_metadata(self): inputs=[TensorType(input_names[0], (3, 4, 5))], outputs=["Identity"], 
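+            # "Identity" is the output name TF2 auto-assigns to the single
+            # output of the converted function.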
source=frontend, + convert_to=backend[0], ) metadata_keys = mlmodel.get_spec().description.metadata.userDefined assert "com.github.apple.coremltools.version" in metadata_keys assert "com.github.apple.coremltools.source" in metadata_keys assert "tensorflow==2." in metadata_keys["com.github.apple.coremltools.source"] - def test_invalid_format_none(self): + @pytest.mark.parametrize( + "backend", + backends, + ) + def test_invalid_format_none(self, backend): with pytest.raises(NotImplementedError, match="Expected model format: .* .h5"): - converter.convert(None, source=frontend) + converter.convert(None, source=frontend, convert_to=backend[0]) - def test_invalid_format_invalid_extension(self): + @pytest.mark.parametrize( + "backend", + backends, + ) + def test_invalid_format_invalid_extension(self, backend): _, invalid_filename = tempfile.mkstemp(suffix=".invalid", prefix=self.saved_model_dir) with pytest.raises( ValueError, match="Input model path should be .h5/.hdf5 file or a directory, but got .*.invalid", ): - converter.convert(invalid_filename, source=frontend) + converter.convert(invalid_filename, source=frontend, convert_to=backend[0]) - def test_invalid_format_multiple_concrete_functions(self): + @pytest.mark.parametrize( + "backend", + backends, + ) + def test_invalid_format_multiple_concrete_functions(self, backend): class build_model(tf.Module): @tf.function( input_signature=[tf.TensorSpec(shape=[3, 4, 5], dtype=tf.float32)] @@ -201,14 +253,18 @@ def __call__(self, x): with pytest.raises( NotImplementedError, match="Only a single concrete function is supported" ): - converter.convert([cf, cf, cf], source=frontend) + converter.convert([cf, cf, cf], source=frontend, convert_to=backend[0]) - def test_invalid_converter_type(self): + @pytest.mark.parametrize( + "backend", + backends, + ) + def test_invalid_converter_type(self, backend): keras_model = tf.keras.Sequential( [tf.keras.layers.ReLU(input_shape=(4, 5), batch_size=3)] ) with pytest.raises(ValueError) as e: - converter.convert(keras_model, source="invalid") + converter.convert(keras_model, source="invalid", convert_to=backend[0]) expected_msg = r'Unrecognized value of argument "source": .*' e.match(expected_msg) @@ -217,8 +273,12 @@ def test_invalid_converter_type(self): converter.convert(keras_model, convert_to="invalid", source=frontend) e.match(r"Backend converter .* not implemented") - def test_invalid_format_non_exist(self): + @pytest.mark.parametrize( + "backend", + backends, + ) + def test_invalid_format_non_exist(self, backend): non_exist_filename = self.model_path_h5.replace(".h5", "_non_exist.h5") with pytest.raises(ValueError) as e: - converter.convert(non_exist_filename, source=frontend) + converter.convert(non_exist_filename, source=frontend, convert_to=backend[0]) e.match(r"Input model .* does not exist") diff --git a/coremltools/converters/mil/frontend/tensorflow2/test/test_v2_ops_tf_keras.py b/coremltools/converters/mil/frontend/tensorflow2/test/test_v2_ops_tf_keras.py index 23e74dd68..47c8247c5 100644 --- a/coremltools/converters/mil/frontend/tensorflow2/test/test_v2_ops_tf_keras.py +++ b/coremltools/converters/mil/frontend/tensorflow2/test/test_v2_ops_tf_keras.py @@ -867,6 +867,36 @@ def test(self, compute_unit, backend, rank, data_format): backend=backend, ) +class TestMasking(TensorFlowBaseTest): + @pytest.mark.parametrize( + "compute_unit, backend, rank, mask_value, is_masked", + itertools.product( + compute_units, + backends, + [2, 5], + [0, 0.4], + [False, True], + ), + ) + def test(self, compute_unit, 
backend, rank, mask_value, is_masked):
+        shape = np.random.randint(low=2, high=4, size=rank)
+        model = tf.keras.Sequential(
+            [
+                tf.keras.layers.Masking(
+                    batch_input_shape=shape,
+                    mask_value=mask_value,
+                )
+            ]
+        )
+        input_value = random_gen(shape, -10, 10)
+        if is_masked:
+            input_value[:, 1] = mask_value
+        TensorFlowBaseTest.run_compare_tf_keras(
+            model,
+            [input_value],
+            compute_unit=compute_unit,
+            backend=backend,
+        )
+
 
 class TestLambda(TensorFlowBaseTest):
     @pytest.mark.parametrize(
diff --git a/coremltools/converters/mil/frontend/torch/converter.py b/coremltools/converters/mil/frontend/torch/converter.py
index f2c97e75e..98c279167 100644
--- a/coremltools/converters/mil/frontend/torch/converter.py
+++ b/coremltools/converters/mil/frontend/torch/converter.py
@@ -11,7 +11,7 @@
 from coremltools import _logger as logger
 from coremltools._deps import version_lt
 from coremltools.converters.mil._deployment_compatibility import AvailableTarget as _target
-from coremltools.converters.mil.input_types import ImageType
+from coremltools.converters.mil.input_types import ImageType, TensorType
 from coremltools.converters.mil.mil import Builder as mb
 from coremltools.converters.mil.mil import Function, Program, types
 from coremltools.converters.mil.mil.types import is_float
@@ -292,6 +292,7 @@ def __init__(
         outputs=None,
         cut_at_symbols=None,
         opset_version=None,
+        use_default_fp16_io=False,
     ):
         """
         Arguments:
@@ -302,6 +303,10 @@
             terminate once these symbols have been generated. For debugging use
             only. See kwarg in load.py.
             opset_version: An int represents the Core ML opset version.
+            use_default_fp16_io (optional): bool. Defaults to False.
+                When minimum_deployment_target is set to >= ct.target.iOS16
+                (the same as ct.target.macOS13) and the compute precision is
+                set to fp16, this flag is True.
+                When True, fp32 i/o defaults to fp16.
         """
         assert isinstance(torchscript, torch.jit.ScriptModule)
@@ -312,6 +317,13 @@
         self.torchscript = torchscript
         self.outputs = outputs
+        self.use_default_fp16_io = use_default_fp16_io
+
+        if self.use_default_fp16_io:
+            # If use_default_fp16_io is True and the user has not specified an
+            # input type, make the default input dtype fp16.
+            self._adjust_default_input_to_fp16()
+
         self.output_names = get_output_names(self.outputs)
         self.opset_version = _target(opset_version) if opset_version is not None else None
         self.context = TranscriptionContext()
@@ -337,6 +349,37 @@
         self.inputs = list(self.graph.inputs.values())
         self._prog = Program()
 
+    def _adjust_default_input_to_fp16(self):
+        """
+        A utility function that sets the default input dtype to fp16.
+        """
+        assert isinstance(self.inputs, list), "inputs must be a list"
+        # Adjust inputs dtype to fp16
+        for val in self.inputs:
+            if isinstance(val, TensorType) and val.dtype is None:
+                val.dtype = types.fp16
+
+    def _adjust_default_output_to_fp16(self, graph_outputs):
+        """
+        A utility function that sets default outputs with inferred dtype fp32 to fp16.
+
+        - If the inferred output dtype is fp32 and the user does not provide a dtype, it defaults to fp16.
+        - If the inferred output dtype is not fp32, nothing changes.
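+
+        For example, outputs inferred as (fp32, int32) with no user-provided
+        dtypes become (fp16, int32).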
+ """ + if self.outputs is None: + self.outputs = [] + for val in graph_outputs: + dtype = types.fp16 if val.dtype == types.fp32 else val.dtype + self.outputs.append(TensorType(dtype=dtype)) + else: + for i, val in enumerate(self.outputs): + if ( + isinstance(val, TensorType) + and val.dtype is None + and graph_outputs[i].dtype == types.fp32 + ): + val.dtype = types.fp16 + @staticmethod def _check_ops(graph): """ @@ -367,6 +410,11 @@ def _create_placeholder(_input): """ shape = _input.shape.symbolic_shape dtype = _input.dtype + # int64 and fp64 are not supported, so they are mapped to int32 / fp32 accordingly + if dtype == types.int64: + dtype = types.int32 + elif dtype == types.fp64: + dtype = types.fp32 return mb.placeholder(shape, dtype=dtype) def check_ops(self): @@ -421,15 +469,9 @@ def convert(self): self.graph.inputs.keys(), ssa_func_inputs.keys() ): input_var = ssa_func.inputs[users_name] - if (types.is_tensor(input_var.sym_type) or types.is_scalar(input_var.sym_type)) \ - and (input_var.dtype == types.fp16 or input_var.dtype == types.fp64): - # cast the input var to float32 - # We need to do this because the type inference is very buggy when started from - # float16/float64 typed inputs. Until that is fixed in the following radar - # we cast all inputs of type float16/float64 to float32 as the first step. - # These casts will later get removed, if compute_precision=Float16 is - # provided, which will cause the FP16ComputePrecision pass to run. - # TODO: remove this when this radar is fixed: rdar://93731970 + if ( + types.is_tensor(input_var.sym_type) or types.is_scalar(input_var.sym_type) + ) and input_var.dtype == types.fp16: input_var = mb.cast(x=input_var, dtype="fp32") self.context.add(input_var, torch_name=internal_name) @@ -461,6 +503,10 @@ def convert(self): ssa_func.set_outputs(graph_outputs) prog.add_function("main", ssa_func) + if self.use_default_fp16_io: + # If the output type is not specified by the user and use_default_fp16_io + # is True. Make the default output type to fp16 + self._adjust_default_output_to_fp16(graph_outputs) if self.outputs is not None: prog.set_main_output_types(self.outputs) return prog diff --git a/coremltools/converters/mil/frontend/torch/load.py b/coremltools/converters/mil/frontend/torch/load.py index c95f27772..ca822776d 100644 --- a/coremltools/converters/mil/frontend/torch/load.py +++ b/coremltools/converters/mil/frontend/torch/load.py @@ -13,9 +13,16 @@ from .converter import TorchConverter, torch_to_mil_types -def load(model_spec, inputs, specification_version, - debug=False, outputs=None, cut_at_symbols=None, - **kwargs): +def load( + model_spec, + inputs, + specification_version, + debug=False, + outputs=None, + cut_at_symbols=None, + use_default_fp16_io=False, + **kwargs +): """ Convert PyTorch model to mil CoreML format. @@ -42,6 +49,10 @@ def load(model_spec, inputs, specification_version, cut_at_symbols (optional): List of internal symbol name strings. Graph conversion will terminate once these symbols have been generated. For debugging use only. + use_default_fp16_io (optional): bool. Defaults to False. + When minimum_deployment_target set >= ct.target.iOS16 (the same as ct.target.macOS13), + and the compute precision set to fp16, this flag is True. + When True, fp32 i/o defaults to fp16. 
""" torchscript = _torchscript_from_model(model_spec) @@ -59,6 +70,7 @@ def load(model_spec, inputs, specification_version, outputs, cut_at_symbols, specification_version, + use_default_fp16_io, ) return _perform_torch_convert(converter, debug) diff --git a/coremltools/converters/mil/frontend/torch/ops.py b/coremltools/converters/mil/frontend/torch/ops.py index 3a9dd363c..79938ba4d 100644 --- a/coremltools/converters/mil/frontend/torch/ops.py +++ b/coremltools/converters/mil/frontend/torch/ops.py @@ -1581,7 +1581,7 @@ def view(context, node): view = mb.complex(real_data=real, imag_data=imag, name=node.name) else: view = mb.reshape(x=x, shape=shape, name=node.name) - + context.add(view) @@ -3892,6 +3892,7 @@ def nll_loss(context, node): # compute the weights loss batch_size = x.shape[0] + class_num = x.shape[1] # only support weight and ignore_index both None if weight is not None: @@ -3901,9 +3902,12 @@ def nll_loss(context, node): x = mb.cast(x=x, dtype="fp32") x = mb.mul(x=x, y=-1.) - range_indices = mb.range_1d(end=batch_size, start=0, step=1) - total_indices = mb.stack(values=[range_indices, target], axis=1) - loss = mb.gather_nd(x=x, indices=total_indices) + + target = mb.cast(x=target, dtype="int32") + labels = mb.one_hot(indices=target, one_hot_vector_size=class_num) + labels = mb.cast(x=labels, dtype="fp32") + loss = mb.mul(x=x, y=labels) + loss = mb.reduce_sum(x=loss, axes=[1]) # reduction type if reduction == "none": @@ -4327,13 +4331,14 @@ def meshgrid(context, node): # Defines all the nodes that are noOps @register_torch_op( torch_alias=[ + "clone", + "contiguous", + "detach", + "device", "dropout", "dropout_", "feature_dropout", - "contiguous", - "device", - "detach", - "clone", + "lift_fresh", ] ) def noop(context, node): @@ -4581,6 +4586,69 @@ def repeat(context, node): context.add(mb.tile(x=x, reps=reps, name=node.name)) +@register_torch_op +def repeat_interleave(context, node): + """ + For now, we only support scalar repeats + None or 0 dim + """ + x, repeats, dim, _ = _get_inputs(context, node, expected=4) + + repeats_val = repeats.val + if isinstance(repeats_val, np.ndarray): + repeats_val0 = np.expand_dims(repeats_val, 0).reshape(-1)[0] + if np.any(repeats_val != repeats_val0): + raise NotImplementedError( + "Conversion for torch.repeat_interleave with Tensor repeats has not been implemented" + ) + repeats_val = repeats_val0 + + # This would operate on the flattened input tensor + if dim is None: + x = mb.reshape(x=x, shape=(-1,)) + else: + if dim.val != 0: + raise NotImplementedError( + "Conversion for torch.repeat_interleave with non-zero dim has not been implemented" + ) + + """ + on a high level: + x + | tile in dim 0 + v + [x, x, ...] + | reshape to split the repeats + v + [[x], + [x], + ...] + | transpose(1, 0) + V + [x^T, x^T, ...] 
+ | flatten + V + result + """ + + reps = [1] * x.rank + reps[0] = repeats_val + x_tiled = mb.tile(x=x, reps=reps) + + split_reps = [repeats_val] + list(x.shape) + x_reshaped = mb.reshape(x=x_tiled, shape=list(split_reps)) + + perm = [*range(x.rank + 1)] + perm[0] = 1 + perm[1] = 0 + x_transposed = mb.transpose(x=x_reshaped, perm=perm) + + result_shape = list(x.shape) + result_shape[0] = -1 + result = mb.reshape(x=x_transposed, shape=result_shape, name=node.name) + + context.add(result) + + @register_torch_op def acos(context, node): inputs = _get_inputs(context, node, expected=1) @@ -5834,12 +5902,12 @@ def stft(context, node): if types.is_complex(input_data.dtype): onesided = False # pytorch defaults onesided to False for complex inputs stft_res = mb.complex_stft( - input=input_data, - n_fft=n_fft, - hop_length=hop_length, - win_length=win_length, - window=window, - normalized=normalized, + input=input_data, + n_fft=n_fft, + hop_length=hop_length, + win_length=win_length, + window=window, + normalized=normalized, onesided=onesided) context.add(stft_res, node.name) diff --git a/coremltools/converters/mil/frontend/torch/test/test_api.py b/coremltools/converters/mil/frontend/torch/test/test_api.py index f52eb32fa..4a3e0cf51 100644 --- a/coremltools/converters/mil/frontend/torch/test/test_api.py +++ b/coremltools/converters/mil/frontend/torch/test/test_api.py @@ -11,6 +11,7 @@ import coremltools as ct from coremltools._deps import _HAS_TORCH, MSG_TORCH_NOT_FOUND +from coremltools.converters.mil.testing_reqs import backends if _HAS_TORCH: import torch @@ -20,7 +21,11 @@ @pytest.mark.skipif(not _HAS_TORCH, reason=MSG_TORCH_NOT_FOUND) class TestPyTorchConverter: @staticmethod - def test_no_inputs(): + @pytest.mark.parametrize( + "backend", + backends, + ) + def test_no_inputs(backend): model = torchvision.models.mobilenet_v2() model.eval() @@ -29,12 +34,16 @@ def test_no_inputs(): traced_model = torch.jit.trace(model, example_input) with pytest.raises(ValueError) as e: - ct.convert(traced_model) + ct.convert(traced_model, convert_to=backend[0]) e.match(r'Expected argument for pytorch "inputs" not provided') @staticmethod - def test_pth_extension(tmpdir): + @pytest.mark.parametrize( + "backend", + backends, + ) + def test_pth_extension(tmpdir, backend): # test for issue: https://github.com/apple/coremltools/issues/917 class TestModule(torch.nn.Module): def __init__(self): @@ -59,4 +68,5 @@ def forward(self, x): shape=example_input.shape, ) ], + convert_to=backend[0], ) diff --git a/coremltools/converters/mil/frontend/torch/test/test_examples.py b/coremltools/converters/mil/frontend/torch/test/test_examples.py index 10a99a5af..cddc4d97f 100644 --- a/coremltools/converters/mil/frontend/torch/test/test_examples.py +++ b/coremltools/converters/mil/frontend/torch/test/test_examples.py @@ -6,21 +6,23 @@ import pytest import coremltools -from coremltools._deps import ( - _HAS_TORCH, - MSG_TORCH_NOT_FOUND, -) +from coremltools._deps import _HAS_TORCH, MSG_TORCH_NOT_FOUND +from coremltools.converters.mil.testing_reqs import backends if _HAS_TORCH: import torch - from torch import nn import torch.nn.functional as F + from torch import nn @pytest.mark.skipif(not _HAS_TORCH, reason=MSG_TORCH_NOT_FOUND) class TestModelScripting: @staticmethod - def test(): + @pytest.mark.parametrize( + "backend", + backends, + ) + def test(backend): # Example code from https://coremltools.readme.io/docs/model-scripting class _LoopBody(nn.Module): @@ -61,4 +63,5 @@ def forward(self, x): mlmodel = 
coremltools.converters.convert( scripted_model, inputs=[coremltools.TensorType(shape=(1, 3, 64, 64))], + convert_to=backend[0], ) diff --git a/coremltools/converters/mil/frontend/torch/test/test_torch_conversion_api.py b/coremltools/converters/mil/frontend/torch/test/test_torch_conversion_api.py index 775f7c5b2..487db8256 100644 --- a/coremltools/converters/mil/frontend/torch/test/test_torch_conversion_api.py +++ b/coremltools/converters/mil/frontend/torch/test/test_torch_conversion_api.py @@ -13,6 +13,7 @@ import coremltools as ct from coremltools._deps import _HAS_TORCH, MSG_TORCH_NOT_FOUND from coremltools.converters.mil.frontend.torch.test.testing_utils import _copy_input_data +from coremltools.converters.mil.testing_reqs import backends from coremltools.converters.mil.testing_utils import ( assert_cast_ops_count, assert_input_dtype, @@ -43,7 +44,11 @@ @pytest.mark.skipif(not _HAS_TORCH, reason=MSG_TORCH_NOT_FOUND) class TestPyTorchConverterExamples: @staticmethod - def test_convert_torch_vision_mobilenet_v2(tmpdir): + @pytest.mark.parametrize( + "backend", + backends, + ) + def test_convert_torch_vision_mobilenet_v2(tmpdir, backend): """ In this example, we'll instantiate a PyTorch classification model and convert it to Core ML. @@ -92,12 +97,14 @@ def test_convert_torch_vision_mobilenet_v2(tmpdir): mlmodel = ct.convert( traced_model, inputs=[ct.TensorType(name="input", shape=example_input.shape)], + convert_to=backend[0], ) """ Now with a conversion complete, we can save the MLModel and run inference. """ - save_path = os.path.join(str(tmpdir), "mobilenet_v2.mlmodel") + suffix = ".mlmodel" if backend == "neuralnetwork" else ".mlpackage" + save_path = os.path.join(str(tmpdir), "mobilenet_v2" + suffix) mlmodel.save(save_path) """ @@ -188,8 +195,8 @@ def _test_classifier(traced_model, example_input, class_type, backend): _test_classifier(traced_model, example_input, class_type, "mlprogram") @staticmethod - @pytest.mark.parametrize("convert_to", ['neuralnetwork', 'mlprogram']) - def test_convert_to_argument_with_torch_model(tmpdir, convert_to): + @pytest.mark.parametrize("backend", backends) + def test_convert_to_argument_with_torch_model(tmpdir, backend): class Network(torch.nn.Module): def __init__(self): super(Network, self).__init__() @@ -207,11 +214,11 @@ def forward(self, x): model = ct.convert( traced_model, inputs=[ct.TensorType(name="input", shape=example_input.shape)], - convert_to=convert_to + convert_to=backend[0], ) assert isinstance(model, ct.models.MLModel) spec = model.get_spec() - if convert_to == "mlprogram": + if backend[0] == "mlprogram": assert spec.WhichOneof('Type') == 'mlProgram' else: assert spec.WhichOneof('Type') == 'neuralNetwork' @@ -302,13 +309,13 @@ def forward(self, x): @staticmethod @pytest.mark.skipif(ct.utils._macos_version() < (12, 0), reason='Model produces specification 6.') @pytest.mark.parametrize( - "convert_to, provide_prob_output_argument", + "backend, provide_prob_output_argument", itertools.product( - ["neuralnetwork", "mlprogram"], + backends, [False, True], ) ) - def test_classifier_from_torch_model(convert_to, provide_prob_output_argument): + def test_classifier_from_torch_model(backend, provide_prob_output_argument): torch_model = torch.nn.ReLU().eval() traced_model = torch.jit.trace(torch_model, torch.rand(3,)) variable_name = "var_2" @@ -323,7 +330,7 @@ def test_classifier_from_torch_model(convert_to, provide_prob_output_argument): traced_model, inputs=[ct.TensorType(shape=(3,))], classifier_config = classifier_config, - 
convert_to=convert_to, + convert_to=backend[0], ) spec = model.get_spec() input_name = spec.description.input[0].name @@ -331,7 +338,7 @@ def test_classifier_from_torch_model(convert_to, provide_prob_output_argument): assert class_label_name in out_dict assert out_dict[class_label_name] == 'c' - if convert_to == "neuralnetwork": + if backend[0] == "neuralnetwork": assert variable_name in out_dict assert isinstance(out_dict[variable_name], dict) else: @@ -348,11 +355,19 @@ def test_classifier_from_torch_model(convert_to, provide_prob_output_argument): class TestTorchInputs(_TestInputs): @staticmethod @pytest.mark.skipif(not ct.utils._is_macos(), reason="test needs predictions") - def test_torch_predict_input(): - TestTorchInputs._test_variant_input_type_prediction(torch.tensor) + @pytest.mark.parametrize( + "backend", + backends, + ) + def test_torch_predict_input(backend): + TestTorchInputs._test_variant_input_type_prediction(torch.tensor, backend[0]) @staticmethod - def test_int64_inputs(): + @pytest.mark.parametrize( + "backend", + backends, + ) + def test_int64_inputs(backend): num_tokens = 3 embedding_size = 5 @@ -380,6 +395,7 @@ def forward(self, x): dtype=example_input.numpy().dtype, ) ], + convert_to=backend[0], ) # running predict() is supported on macOS @@ -391,7 +407,11 @@ def forward(self, x): # Verify outputs expected = model(example_input) name = list(result.keys())[0] - np.testing.assert_allclose(result[name], expected.detach().numpy()) + rtol = 1e-03 if backend[0] == "mlprogram" else 1e-07 + atol = 1e-04 if backend[0] == "mlprogram" else 0 + np.testing.assert_allclose( + result[name], expected.detach().numpy(), rtol=rtol, atol=atol + ) # Duplicated inputs are invalid with pytest.raises(ValueError, match=r"Duplicated inputs"): @@ -409,6 +429,7 @@ def forward(self, x): dtype=example_input.numpy().dtype, ), ], + convert_to=backend[0], ) # Outputs must be of type ct.ImageType or ct.TensorType @@ -423,10 +444,15 @@ def forward(self, x): ), ], outputs=["output"], + convert_to=backend[0], ) @staticmethod - def test_fully_dynamic_inputs(): + @pytest.mark.parametrize( + "backend", + backends, + ) + def test_fully_dynamic_inputs(backend): """ All dims of the inputs are dynamic, and write to slice to one of the inputs. 
@@ -445,12 +471,15 @@ def forward(self, x, y): model = Model(torch.tensor(3)) scripted_model = torch.jit.script(model) + a, b = (-1, -1) if backend[0] == "neuralnetwork" else (6, 6) + mlmodel = ct.convert( scripted_model, inputs=[ - ct.TensorType("x", shape=(ct.RangeDim(), ct.RangeDim())), - ct.TensorType("y", shape=(ct.RangeDim(), ct.RangeDim())), + ct.TensorType("x", shape=(ct.RangeDim(upper_bound=a), ct.RangeDim(upper_bound=b))), + ct.TensorType("y", shape=(ct.RangeDim(upper_bound=a), ct.RangeDim(upper_bound=b))), ], + convert_to=backend[0], ) # running predict() is supported on macOS @@ -460,8 +489,11 @@ def forward(self, x, y): torch_res = model(*torch_input) results = mlmodel.predict({"x": x.cpu().detach().numpy(), "y": y.cpu().detach().numpy()}) + + rtol = 1e-03 if backend[0] == "mlprogram" else 1e-07 + atol = 1e-04 if backend[0] == "mlprogram" else 0 for i, name in enumerate(mlmodel.output_description): - np.testing.assert_allclose(torch_res[i], results[name]) + np.testing.assert_allclose(torch_res[i], results[name], rtol=rtol, atol=atol) x, y = torch.rand(1, 6), torch.rand(2, 3) torch_input = _copy_input_data([x, y]) @@ -469,10 +501,14 @@ def forward(self, x, y): results = mlmodel.predict({"x": x.cpu().detach().numpy(), "y": y.cpu().detach().numpy()}) for i, name in enumerate(mlmodel.output_description): - np.testing.assert_allclose(torch_res[i], results[name]) + np.testing.assert_allclose(torch_res[i], results[name], rtol=rtol, atol=atol) @staticmethod - def test_rank0_inputs_torch(): + @pytest.mark.parametrize( + "backend", + backends, + ) + def test_rank0_inputs_torch(backend): """Similar to TestPyTorchConverterExamples::test_int64_inputs but using rank-0 int input. """ @@ -504,11 +540,15 @@ def forward(self, x): dtype=example_input.numpy().dtype, ) ], + convert_to=backend[0], ) @staticmethod - @pytest.mark.parametrize("variable_length", [True, False]) - def test_torch_range_dim_lstm(variable_length): + @pytest.mark.parametrize( + "variable_length, backend", + itertools.product([True, False], backends), + ) + def test_torch_range_dim_lstm(variable_length, backend): """ This example shows how to run LSTM with previous hidden / cell states """ @@ -547,7 +587,8 @@ def forward(self, x, hidden_state, cell_state): # each inference example (aka "runtime-determined"). If the sequence # length is always the same (e.g., 2 step LSTM would have seq_len == 2) # Note that fixed-length models usually run slightly faster than variable length models. 
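+        # The mlprogram backend requires a finite upper bound for a RangeDim,
+        # while neuralnetwork accepts -1 (unbounded), hence the
+        # backend-dependent bound below.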
- ct_seq_len = ct.RangeDim() if variable_length else seq_len + upper_bound = -1 if backend[0] == "neuralnetwork" else 10 + ct_seq_len = ct.RangeDim(upper_bound=upper_bound) if variable_length else seq_len seq_input = ct.TensorType(shape=(ct_seq_len, batch, input_size), name="seq_input") h_input = ct.TensorType(shape=h_shape, name="h_input") @@ -556,6 +597,7 @@ def forward(self, x, hidden_state, cell_state): mlmodel = ct.convert( traced_model, inputs=[seq_input, h_input, c_input], + convert_to=backend[0], ) if ct.utils._is_macos(): @@ -570,12 +612,17 @@ def forward(self, x, hidden_state, cell_state): expected = model(rand_input, rand_h0, rand_c0) names = list(result.keys()) names.sort() - np.testing.assert_allclose(result[names[0]], - expected[0].detach().numpy(), atol=1e-4) - np.testing.assert_allclose(result[names[1]], - expected[1].detach().numpy(), atol=1e-4) - np.testing.assert_allclose(result[names[2]], - expected[2].detach().numpy(), atol=1e-4) + atol = 1e-03 if backend[0] == "mlprogram" else 1e-04 + rtol = 1e-03 if backend[0] == "mlprogram" else 1e-07 + np.testing.assert_allclose( + result[names[0]], expected[0].detach().numpy(), atol=atol, rtol=rtol + ) + np.testing.assert_allclose( + result[names[1]], expected[1].detach().numpy(), atol=atol, rtol=rtol + ) + np.testing.assert_allclose( + result[names[2]], expected[2].detach().numpy(), atol=atol, rtol=rtol + ) # Try example of different length if variable_length: @@ -592,16 +639,25 @@ def forward(self, x, hidden_state, cell_state): expected = model(rand_input, rand_h0, rand_c0) names = list(result.keys()) names.sort() - np.testing.assert_allclose(result[names[0]], - expected[0].detach().numpy(), atol=1e-4) - np.testing.assert_allclose(result[names[1]], - expected[1].detach().numpy(), atol=1e-4) - np.testing.assert_allclose(result[names[2]], - expected[2].detach().numpy(), atol=1e-4) + np.testing.assert_allclose( + result[names[0]], expected[0].detach().numpy(), atol=atol, rtol=rtol + ) + np.testing.assert_allclose( + result[names[1]], expected[1].detach().numpy(), atol=atol, rtol=rtol + ) + np.testing.assert_allclose( + result[names[2]], expected[2].detach().numpy(), atol=atol, rtol=rtol + ) @staticmethod - @pytest.mark.parametrize("use_symbol", [True, False]) - def test_torch_outofbound_range_dim(use_symbol): + @pytest.mark.parametrize( + "use_symbol, backend", + itertools.product( + [True, False], + backends, + ), + ) + def test_torch_outofbound_range_dim(use_symbol, backend): num_tokens = 3 embedding_size = 5 @@ -632,6 +688,7 @@ def forward(self, x): mlmodel = ct.convert( traced_model, inputs=[seq_input], + convert_to=backend[0], ) if ct.utils._is_macos(): @@ -640,9 +697,13 @@ def forward(self, x): ) # Verify outputs + rtol = 1e-03 if backend[0] == "mlprogram" else 1e-07 + atol = 1e-04 if backend[0] == "mlprogram" else 0 expected = model(example_input) name = list(result.keys())[0] - np.testing.assert_allclose(result[name], expected.detach().numpy()) + np.testing.assert_allclose( + result[name], expected.detach().numpy(), rtol=rtol, atol=atol + ) # seq_len below/above lower_bound/upper_bound with pytest.raises(RuntimeError, @@ -662,7 +723,11 @@ def forward(self, x): ) @staticmethod - def test_torch_enumerated_shapes(): + @pytest.mark.parametrize( + "backend", + backends, + ) + def test_torch_enumerated_shapes(backend): in_channels = 3 out_channels = 2 @@ -690,7 +755,8 @@ def forward(self, x): mlmodel = ct.convert( traced_model, inputs=[tensor_input], - compute_units=ct.ComputeUnit.CPU_ONLY + compute_units=ct.ComputeUnit.CPU_ONLY, 
+ convert_to=backend[0], ) if ct.utils._is_macos(): @@ -699,10 +765,13 @@ def forward(self, x): ) # Verify outputs + rtol = 1 if backend[0] == "mlprogram" else 1e-03 + atol = 1e-02 if backend[0] == "mlprogram" else 1e-04 expected = model(example_input) name = list(result.keys())[0] - np.testing.assert_allclose(result[name], expected.detach().numpy(), - rtol=1e-3, atol=1e-4) + np.testing.assert_allclose( + result[name], expected.detach().numpy(), rtol=rtol, atol=atol + ) # Test (1, 3, 56, 56) shape (can't verify numerical parity with Torch # which doesn't support enumerated shape) @@ -716,7 +785,11 @@ def forward(self, x): mlmodel.predict({"input": test_input_x}) @staticmethod - def test_torch_image_enumerated_shapes(): + @pytest.mark.parametrize( + "backend", + backends, + ) + def test_torch_image_enumerated_shapes(backend): import torchvision torch_model = torchvision.models.mobilenet_v2().features torch_model.eval() @@ -725,13 +798,17 @@ def test_torch_image_enumerated_shapes(): input_shapes = ct.EnumeratedShapes(shapes=[(1, 3, 256, 256), (1, 3, 224, 224)]) image_input = ct.ImageType(shape=input_shapes, bias=[-1, -1, -1], scale=1 / 127) - model = ct.convert(traced_model, inputs=[image_input]) + model = ct.convert(traced_model, inputs=[image_input], convert_to=backend[0]) assert model is not None spec = model.get_spec() assert len(spec.description.input[0].type.imageType.enumeratedSizes.sizes) == 2 @staticmethod - def test_torch_optional_input(): + @pytest.mark.parametrize( + "backend", + backends, + ) + def test_torch_optional_input(backend): num_tokens = 3 embedding_size = 5 @@ -753,8 +830,9 @@ def forward(self, x, y): ] traced_model = torch.jit.trace(model, example_input) + upper_bound = -1 if backend[0] == "neuralnetwork" else 2 required_input = ct.TensorType( - name="required_input", shape=(ct.RangeDim(),), dtype=np.int64 + name="required_input", shape=(ct.RangeDim(upper_bound=upper_bound),), dtype=np.int64 ) default_value = np.array([3]).astype(np.float32) optional_input = ct.TensorType(name="optional_input", shape=(1,), @@ -768,6 +846,7 @@ def forward(self, x, y): traced_model, inputs=[required_input, optional_input], compute_units=compute_units, + convert_to=backend[0], ) assert(mlmodel.compute_unit == compute_units) @@ -779,10 +858,14 @@ def forward(self, x, y): ) # Verify outputs + rtol = 1e-03 if backend[0] == "mlprogram" else 1e-07 + atol = 1e-03 if backend[0] == "mlprogram" else 0 torch_default_value = torch.tensor([3]) expected = model(example_input[0].detach(), torch_default_value) name = list(result.keys())[0] - np.testing.assert_allclose(result[name], expected.detach().numpy()) + np.testing.assert_allclose( + result[name], expected.detach().numpy(), rtol=rtol, atol=atol + ) @pytest.fixture @@ -793,7 +876,6 @@ def forward(self, x): example_input = torch.randint(0, 100, (10, 20), dtype=torch.int32) return torch.jit.trace(Model().eval(), example_input) -@pytest.fixture def int64_input_model(): class Model(torch.nn.Module): def forward(self, x): @@ -837,6 +919,28 @@ def forward(self, x): example_input = torch.randint(0, 100, (10, 20), dtype=torch.float32) return torch.jit.trace(Model().eval(), example_input) +@pytest.fixture +def int32_float32_two_output_model(): + class Model(torch.nn.Module): + def forward(self, x, y): + out1 = x + 1 + out2 = y + 1 + return out1, out2 + + input_1 = torch.randint(0, 100, (10, 20), dtype=torch.int32) + input_2 = torch.randint(0, 100, (10, 20), dtype=torch.float32) + return torch.jit.trace(Model().eval(), [input_1, input_2]) + + +def 
float64_input_model(): + class Model(torch.nn.Module): + def forward(self, x): + return x + 5.1 + + example_input = torch.randint(0, 100, (10, 20), dtype=torch.float64) + return torch.jit.trace(Model().eval(), example_input) + + @pytest.fixture def rank3_input_model(): class Model(torch.nn.Module): @@ -897,14 +1001,35 @@ def test_input_shape_missing_error(self, float32_input_model_add_op): inputs=[ct.TensorType(dtype=np.int32)], minimum_deployment_target=ct.target.macOS12) - def test_unsupported_input_dtype_in_torch_model(self, int64_input_model): - # test that no error is raised when no dtype is provided by the user, - # and the Torch model's input dtype is not supported. - # In this case, it will be mapped to the default dtype which is float32 - mlmodel = ct.convert(int64_input_model, - inputs=[ct.TensorType(shape=(10, 20))], - minimum_deployment_target=ct.target.macOS12) - assert_input_dtype(mlmodel, expected_type_str="fp32") + @pytest.mark.parametrize( + "default_input_dtype, model", + itertools.product( + [True, False], + [int64_input_model, float64_input_model], + ), + ) + def test_unsupported_input_dtype_torch_model(self, default_input_dtype, model): + # test that no error is raised when the Torch model's input dtype is not supported. + # If users don't provide the input type, it will be mapped to the default dtype which is float32. + # If the input type is provided, it will be mapped to the most compatible dtype: + # fp64 -> fp32, int64 -> int32 + if default_input_dtype: + dtype = None + expected_type_str = "fp32" + else: + if model == int64_input_model: + dtype = np.int64 + expected_type_str = "int32" + elif model == float64_input_model: + dtype = np.float64 + expected_type_str = "fp32" + + mlmodel = ct.convert( + model(), + inputs=[ct.TensorType(shape=(10, 20), dtype=dtype)], + minimum_deployment_target=ct.target.macOS12, + ) + assert_input_dtype(mlmodel, expected_type_str=expected_type_str) verify_prediction(mlmodel) def test_input_dtype_user_provided(self, float32_input_model_add_op): @@ -935,29 +1060,35 @@ def test_fp16_input_dtype(self, float32_input_model_add_op, float32_input_model_ """ Test that providing fp16 input dtype works with macOS13. """ - mlmodel = ct.convert(float32_input_model_add_op, - inputs=[ct.TensorType(shape=(10, 20), dtype=np.float16)], - minimum_deployment_target=ct.target.macOS13 - ) + mlmodel = ct.convert( + float32_input_model_add_op, + inputs=[ct.TensorType(shape=(10, 20), dtype=np.float16)], + outputs=[ct.TensorType(dtype=np.float32)], + minimum_deployment_target=ct.target.macOS13, + ) assert_ops_in_mil_program(mlmodel, expected_op_list=["add", "cast"]) assert_input_dtype(mlmodel, expected_type_str="fp16") assert_output_dtype(mlmodel, expected_type_str="fp32") verify_prediction(mlmodel) - mlmodel = ct.convert(float32_input_model_relu_ops, - inputs=[ct.TensorType(shape=(10, 20), dtype=np.float16)], - minimum_deployment_target=ct.target.macOS13 - ) + mlmodel = ct.convert( + float32_input_model_relu_ops, + inputs=[ct.TensorType(shape=(10, 20), dtype=np.float16)], + outputs=[ct.TensorType(dtype=np.float32)], + minimum_deployment_target=ct.target.macOS13, + ) # Two consecutive relus are merged in the `merge_consecutive_relus` pass. 
assert_ops_in_mil_program(mlmodel, expected_op_list=["relu", "cast"]) assert_input_dtype(mlmodel, expected_type_str="fp16") assert_output_dtype(mlmodel, expected_type_str="fp32") verify_prediction(mlmodel) - mlmodel = ct.convert(int32_input_model, - inputs=[ct.TensorType(shape=(10, 20), dtype=np.float16)], - minimum_deployment_target=ct.target.macOS13, - ) + mlmodel = ct.convert( + int32_input_model, + inputs=[ct.TensorType(shape=(10, 20), dtype=np.float16)], + outputs=[ct.TensorType(dtype=np.float32)], + minimum_deployment_target=ct.target.macOS13, + ) assert_ops_in_mil_program(mlmodel, expected_op_list=["add", "cast"]) assert_input_dtype(mlmodel, expected_type_str="fp16") assert_output_dtype(mlmodel, expected_type_str="fp32") @@ -1032,11 +1163,17 @@ def test_two_input_model(self, float32_two_input_model): assert_output_dtype(mlmodel, expected_type_str="int32") # test forcing both inputs to be float16 - mlmodel = ct.convert(float32_two_input_model, - inputs=[ct.TensorType(shape=(10, 20), dtype=np.float16), - ct.TensorType(shape=(10, 20), dtype=np.float16), - ], - minimum_deployment_target=ct.target.macOS13) + mlmodel = ct.convert( + float32_two_input_model, + inputs=[ + ct.TensorType(shape=(10, 20), dtype=np.float16), + ct.TensorType(shape=(10, 20), dtype=np.float16), + ], + outputs=[ + ct.TensorType(dtype=np.float32), + ], + minimum_deployment_target=ct.target.macOS13, + ) assert_ops_in_mil_program(mlmodel, expected_op_list=["add", "cast"]) assert_input_dtype(mlmodel, expected_type_str="fp16", index=0) assert_input_dtype(mlmodel, expected_type_str="fp16", index=1) @@ -1062,21 +1199,26 @@ def test_output_name_specified_by_user(self, float32_input_model_relu_ops, float def test_single_output_model(self, int32_input_model, float32_input_model_relu_ops): # test output type: if not provided, it should be the default which is float32 - mlmodel = ct.convert(int32_input_model, - inputs=[ct.TensorType(shape=(10, 20))], - minimum_deployment_target=ct.target.macOS12) + mlmodel = ct.convert( + int32_input_model, + inputs=[ct.TensorType(shape=(10, 20), dtype=np.float32)], + outputs=[ct.TensorType(dtype=np.float32)], + minimum_deployment_target=ct.target.macOS12, + ) assert_ops_in_mil_program(mlmodel, expected_op_list=["cast", "add", "cast"]) assert_input_dtype(mlmodel, expected_type_str="fp32") assert_output_dtype(mlmodel, expected_type_str="fp32") # test that the output dtype provided by the user is applied during conversion - mlmodel = ct.convert(float32_input_model_relu_ops, - inputs=[ct.TensorType(shape=(10, 20))], - outputs=[ct.TensorType(dtype=np.int32)], - minimum_deployment_target=ct.target.macOS12) + mlmodel = ct.convert( + float32_input_model_relu_ops, + inputs=[ct.TensorType(shape=(10, 20), dtype=np.float32)], + outputs=[ct.TensorType(dtype=np.int32)], + minimum_deployment_target=ct.target.macOS12, + ) assert_input_dtype(mlmodel, expected_type_str="fp32") assert_output_dtype(mlmodel, expected_type_str="int32") - assert_ops_in_mil_program(mlmodel, expected_op_list=["cast", "relu", "cast", "cast"]) + assert_ops_in_mil_program(mlmodel, expected_op_list=["cast", "relu", "cast"]) # test that an error is raised when shape is provided for the output with pytest.raises(ValueError): @@ -1096,11 +1238,12 @@ def test_single_output_model(self, int32_input_model, float32_input_model_relu_o ) # test that output type float16 is applied correctly - mlmodel = ct.convert(float32_input_model_relu_ops, - inputs=[ct.TensorType(shape=(10, 20))], - outputs=[ct.TensorType(dtype=np.float16)], - 
minimum_deployment_target=ct.target.macOS13, - ) + mlmodel = ct.convert( + float32_input_model_relu_ops, + inputs=[ct.TensorType(shape=(10, 20), dtype=np.float32)], + outputs=[ct.TensorType(dtype=np.float16)], + minimum_deployment_target=ct.target.macOS13, + ) assert_output_dtype(mlmodel, expected_type_str="fp16") assert_ops_in_mil_program(mlmodel, expected_op_list=["cast", "relu"]) @@ -1138,10 +1281,12 @@ def test_multi_output_model(self, float32_two_output_model): verify_prediction(mlmodel) def test_color_input(self, rank4_input_model, rank3_input_model): - mlmodel = ct.convert(rank4_input_model, - inputs=[ct.ImageType(shape=(1, 3, 10, 20), color_layout=ct.colorlayout.RGB)], - minimum_deployment_target=ct.target.macOS13, - ) + mlmodel = ct.convert( + rank4_input_model, + inputs=[ct.ImageType(shape=(1, 3, 10, 20), color_layout=ct.colorlayout.RGB)], + outputs=[ct.TensorType(dtype=np.float32)], + minimum_deployment_target=ct.target.macOS13, + ) assert_ops_in_mil_program(mlmodel, expected_op_list=["cast", "add", "cast"]) assert_spec_input_image_type(mlmodel._spec, expected_feature_type=ft.ImageFeatureType.RGB) assert_prog_input_type(mlmodel._mil_program, expected_dtype_str="fp32") @@ -1168,10 +1313,12 @@ def test_grayscale_input(self, rank4_input_model, rank3_input_model, rank4_grays minimum_deployment_target=ct.target.macOS13, ) - mlmodel = ct.convert(rank4_grayscale_input_model, - inputs=[ct.ImageType(shape=(1, 1, 10, 20), color_layout=ct.colorlayout.GRAYSCALE)], - minimum_deployment_target=ct.target.macOS13, - ) + mlmodel = ct.convert( + rank4_grayscale_input_model, + inputs=[ct.ImageType(shape=(1, 1, 10, 20), color_layout=ct.colorlayout.GRAYSCALE)], + outputs=[ct.TensorType(dtype=np.float32)], + minimum_deployment_target=ct.target.macOS13, + ) assert_ops_in_mil_program(mlmodel, expected_op_list=["cast", "add", "cast"]) assert_spec_input_image_type(mlmodel._spec, expected_feature_type=ft.ImageFeatureType.GRAYSCALE) assert_prog_input_type(mlmodel._mil_program, expected_dtype_str="fp32") @@ -1226,11 +1373,12 @@ def test_color_output(self, rank4_input_model, float32_input_model_add_op): verify_prediction(mlmodel) # check neural network conversion - mlmodel = ct.convert(rank4_input_model, - inputs=[ct.ImageType(shape=(1, 3, 10, 20), - color_layout=ct.colorlayout.RGB)], - outputs=[ct.ImageType(color_layout=ct.colorlayout.BGR)], - ) + mlmodel = ct.convert( + rank4_input_model, + inputs=[ct.ImageType(shape=(1, 3, 10, 20), color_layout=ct.colorlayout.RGB)], + outputs=[ct.ImageType(color_layout=ct.colorlayout.BGR)], + convert_to="neuralnetwork", + ) assert_ops_in_mil_program(mlmodel, expected_op_list=["add"]) assert_spec_input_image_type(mlmodel._spec, expected_feature_type=ft.ImageFeatureType.RGB) assert_spec_output_image_type(mlmodel._spec, expected_feature_type=ft.ImageFeatureType.BGR) @@ -1244,11 +1392,12 @@ def test_grayscale_output(self, rank4_grayscale_input_model): minimum_deployment_target=ct.target.macOS12, ) - mlmodel = ct.convert(rank4_grayscale_input_model, - inputs=[ct.ImageType(shape=(1, 1, 10, 20), - color_layout=ct.colorlayout.GRAYSCALE)], - outputs=[ct.ImageType(color_layout=ct.colorlayout.GRAYSCALE)], - ) + mlmodel = ct.convert( + rank4_grayscale_input_model, + inputs=[ct.ImageType(shape=(1, 1, 10, 20), color_layout=ct.colorlayout.GRAYSCALE)], + outputs=[ct.ImageType(color_layout=ct.colorlayout.GRAYSCALE)], + convert_to="neuralnetwork", + ) assert_ops_in_mil_program(mlmodel, expected_op_list=["add"]) assert_spec_input_image_type(mlmodel._spec, 
                                      expected_feature_type=ft.ImageFeatureType.GRAYSCALE)
         assert_spec_output_image_type(mlmodel._spec,
                                       expected_feature_type=ft.ImageFeatureType.GRAYSCALE)
@@ -1410,7 +1559,7 @@ def test_grayscale_fp16_output_image(self, rank4_grayscale_input_model):
 
 
 @pytest.mark.skipif(
-    ct.utils._macos_version() < (14, 0), reason="Tests are for deployment target ios17/macos14"
+    ct.utils._macos_version() < (14, 0), reason="Tests are for deployment target iOS17/macOS14"
 )
 class TestQuantizationConversionAPI:
     def test_dynamic_quantization(self):
@@ -1488,13 +1637,10 @@ def forward(self, x):
 
         ops = get_op_types_in_program(coreml_model._mil_program)
         # constexpr_affine_dequantize and cast -> quantize can have arbitrary order
-        assert ops[:3] == ["cast", "quantize", "constexpr_affine_dequantize"] or ops[:3] == [
-            "constexpr_affine_dequantize",
-            "cast",
-            "quantize",
-        ]
+        assert set(ops[:2]) == set(["quantize", "constexpr_affine_dequantize"])
+
+        # these ops have well-defined order
-        assert ops[3:] == [
+        assert ops[2:] == [
             # quantized ConvRelu op
             "dequantize",
             "conv",
@@ -1502,7 +1648,6 @@
             "quantize",
             # dequantize and output
             "dequantize",
-            "cast",
         ]
 
         output = traced_model(x)
@@ -1547,13 +1692,450 @@ def forward(self, x):
         ops = get_op_types_in_program(coreml_model._mil_program)
         # constexpr_affine_dequantize and cast can have arbitrary order
-        assert ops[:2] == ["cast", "constexpr_affine_dequantize"] or ops[:2] == [
+        assert ops == [
             "constexpr_affine_dequantize",
-            "cast",
             "conv",
         ]
-        # these ops have well-defined order
-        assert ops[2:] == ["conv", "cast"]
-
         output = traced_model(x)
         coreml_output = coreml_model.predict({"x": x})["y"]
         np.testing.assert_allclose(output, coreml_output, rtol=1e-2, atol=2e-2)
+
+
+class TestiOS16DefaultIODtype:
+    """
+    This class tests the default i/o dtype behavior for iOS16 (and above) models.
+    """
+
+    @staticmethod
+    def _verify_model_io(mlmodel, input_dtype, output_dtype, expected_op_list):
+        """
+        This utility function verifies the model's i/o dtypes and expected ops.
+        """
+        assert_input_dtype(mlmodel, expected_type_str=input_dtype)
+        assert_output_dtype(mlmodel, expected_type_str=output_dtype)
+        assert_ops_in_mil_program(mlmodel, expected_op_list=expected_op_list)
+        verify_prediction(mlmodel)
+
+    def test_iO16_default_fp16_input(self, float32_input_model_add_op):
+        """
+        With minimum_deployment_target >= iOS16 and the compute precision set to fp16,
+        an fp16 i/o model is produced by default.
+        However, if the user specifies a dtype, the converter respects it.
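+
+        A sketch of the default behavior (illustrative pseudo-usage; `torch_model` is
+        hypothetical and stands in for any traced fp32 model):
+
+            mlmodel = ct.convert(
+                torch_model,
+                inputs=[ct.TensorType(shape=(10, 20))],
+                minimum_deployment_target=ct.target.iOS16,
+            )
+            # both the input and the output of mlmodel default to fp16; no cast ops are added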
+ """ + # Case 1: Inputs given / outputs None + mlmodel = ct.convert( + float32_input_model_add_op, + inputs=[ct.TensorType(shape=(10, 20))], + minimum_deployment_target=ct.target.iOS16, + ) + self._verify_model_io( + mlmodel, + input_dtype="fp16", + output_dtype="fp16", + expected_op_list=["add"], + ) + + # Case 2: Inputs given / outputs given + mlmodel = ct.convert( + float32_input_model_add_op, + inputs=[ct.TensorType(shape=(10, 20), dtype=None)], + outputs=[ct.TensorType(dtype=None)], + minimum_deployment_target=ct.target.iOS16, + ) + self._verify_model_io( + mlmodel, + input_dtype="fp16", + output_dtype="fp16", + expected_op_list=["add"], + ) + + # Case 3: Inputs set fp32 / outputs None + mlmodel = ct.convert( + float32_input_model_add_op, + inputs=[ct.TensorType(shape=(10, 20), dtype=np.float32)], + minimum_deployment_target=ct.target.iOS16, + ) + self._verify_model_io( + mlmodel, + input_dtype="fp32", + output_dtype="fp16", + expected_op_list=["cast", "add"], + ) + + # Case 4: Inputs set fp32 / outputs given + mlmodel = ct.convert( + float32_input_model_add_op, + inputs=[ct.TensorType(shape=(10, 20), dtype=np.float32)], + outputs=[ct.TensorType(dtype=None)], + minimum_deployment_target=ct.target.iOS16, + ) + self._verify_model_io( + mlmodel, + input_dtype="fp32", + output_dtype="fp16", + expected_op_list=["cast", "add"], + ) + + # Case 5: Inputs given / outputs set to fp32 + mlmodel = ct.convert( + float32_input_model_add_op, + inputs=[ct.TensorType(shape=(10, 20))], + outputs=[ct.TensorType(dtype=np.float32)], + minimum_deployment_target=ct.target.iOS16, + ) + self._verify_model_io( + mlmodel, + input_dtype="fp16", + output_dtype="fp32", + expected_op_list=["add", "cast"], + ) + + # Case 6: Inputs / outputs both set to fp32 + mlmodel = ct.convert( + float32_input_model_add_op, + inputs=[ct.TensorType(shape=(10, 20), dtype=np.float32)], + outputs=[ct.TensorType(dtype=np.float32)], + minimum_deployment_target=ct.target.iOS16, + ) + self._verify_model_io( + mlmodel, + input_dtype="fp32", + output_dtype="fp32", + expected_op_list=["cast", "add", "cast"], + ) + + def test_iO16_default_fp16_io_with_multiple_inputs(self, float32_two_input_model): + """ + For the multiple inputs model, the converter only set the default dtype for + inputs with unspecified dtype. 
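+
+        A sketch of the rule (shapes are illustrative):
+
+            inputs = [
+                ct.TensorType(shape=(10, 20), dtype=np.float32),  # user-specified: stays fp32
+                ct.TensorType(shape=(10, 20)),                    # unspecified: defaults to fp16
+            ]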
+ """ + # Case 1: first input is set to fp32 + mlmodel = ct.convert( + float32_two_input_model, + inputs=[ct.TensorType(shape=(10, 20), dtype=np.float32), ct.TensorType(shape=(10, 20))], + minimum_deployment_target=ct.target.iOS16, + ) + assert_input_dtype(mlmodel, expected_type_str="fp32", index=0) + assert_input_dtype(mlmodel, expected_type_str="fp16", index=1) + assert_output_dtype(mlmodel, expected_type_str="fp16") + assert_ops_in_mil_program(mlmodel, expected_op_list=["cast", "add"]) + + # Case 2: second input is set to fp32 + mlmodel = ct.convert( + float32_two_input_model, + inputs=[ct.TensorType(shape=(10, 20)), ct.TensorType(shape=(10, 20), dtype=np.float32)], + minimum_deployment_target=ct.target.iOS16, + ) + assert_input_dtype(mlmodel, expected_type_str="fp16", index=0) + assert_input_dtype(mlmodel, expected_type_str="fp32", index=1) + assert_output_dtype(mlmodel, expected_type_str="fp16") + assert_ops_in_mil_program(mlmodel, expected_op_list=["cast", "add"]) + + # Case 3: both inputs are set to fp32 + mlmodel = ct.convert( + float32_two_input_model, + inputs=[ + ct.TensorType(shape=(10, 20), dtype=np.float32), + ct.TensorType(shape=(10, 20), dtype=np.float32), + ], + minimum_deployment_target=ct.target.iOS16, + ) + assert_input_dtype(mlmodel, expected_type_str="fp32", index=0) + assert_input_dtype(mlmodel, expected_type_str="fp32", index=1) + assert_output_dtype(mlmodel, expected_type_str="fp16") + assert_ops_in_mil_program(mlmodel, expected_op_list=["cast", "cast", "add"]) + + # Case 4: both inputs are not set + mlmodel = ct.convert( + float32_two_input_model, + inputs=[ct.TensorType(shape=(10, 20)), ct.TensorType(shape=(10, 20))], + minimum_deployment_target=ct.target.iOS16, + ) + assert_input_dtype(mlmodel, expected_type_str="fp16", index=0) + assert_input_dtype(mlmodel, expected_type_str="fp16", index=1) + assert_output_dtype(mlmodel, expected_type_str="fp16") + assert_ops_in_mil_program(mlmodel, expected_op_list=["add"]) + + def test_iO16_default_fp16_io_with_multiple_outputs( + self, float32_two_output_model, int32_float32_two_output_model + ): + """ + For the multiple outputs model, the converter only set the default dtype to fp16 for + outputs that satisfy + 1. dtype is None + 2. 
inferred dtype is fp32 + """ + # Case 1: first output is set to fp32 + mlmodel = ct.convert( + float32_two_output_model, + inputs=[ct.TensorType(shape=(10, 20))], + outputs=[ct.TensorType(dtype=np.float32), ct.TensorType(dtype=None)], + minimum_deployment_target=ct.target.iOS16, + ) + assert_input_dtype(mlmodel, expected_type_str="fp16") + assert_output_dtype(mlmodel, expected_type_str="fp32", index=0) + assert_output_dtype(mlmodel, expected_type_str="fp16", index=1) + assert_ops_in_mil_program(mlmodel, expected_op_list=["relu", "clip", "cast"]) + + # Case 2: second output is set to fp32 + mlmodel = ct.convert( + float32_two_output_model, + inputs=[ct.TensorType(shape=(10, 20))], + outputs=[ct.TensorType(dtype=None), ct.TensorType(dtype=np.float32)], + minimum_deployment_target=ct.target.iOS16, + ) + assert_input_dtype(mlmodel, expected_type_str="fp16") + assert_output_dtype(mlmodel, expected_type_str="fp16", index=0) + assert_output_dtype(mlmodel, expected_type_str="fp32", index=1) + assert_ops_in_mil_program(mlmodel, expected_op_list=["relu", "clip", "cast"]) + + # Case 3: both outputs are set to fp32 + mlmodel = ct.convert( + float32_two_output_model, + inputs=[ct.TensorType(shape=(10, 20))], + outputs=[ct.TensorType(dtype=np.float32), ct.TensorType(dtype=np.float32)], + minimum_deployment_target=ct.target.iOS16, + ) + assert_input_dtype(mlmodel, expected_type_str="fp16") + assert_output_dtype(mlmodel, expected_type_str="fp32", index=0) + assert_output_dtype(mlmodel, expected_type_str="fp32", index=1) + assert_ops_in_mil_program(mlmodel, expected_op_list=["relu", "clip", "cast", "cast"]) + + # Case 4: both outputs are not set + mlmodel = ct.convert( + float32_two_output_model, + inputs=[ct.TensorType(shape=(10, 20))], + outputs=[ct.TensorType(dtype=None), ct.TensorType(dtype=None)], + minimum_deployment_target=ct.target.iOS16, + ) + assert_input_dtype(mlmodel, expected_type_str="fp16") + assert_output_dtype(mlmodel, expected_type_str="fp16", index=0) + assert_output_dtype(mlmodel, expected_type_str="fp16", index=1) + assert_ops_in_mil_program(mlmodel, expected_op_list=["relu", "clip"]) + + # Case 5: outputs is not provided at all + mlmodel = ct.convert( + float32_two_output_model, + inputs=[ct.TensorType(shape=(10, 20))], + minimum_deployment_target=ct.target.iOS16, + ) + assert_input_dtype(mlmodel, expected_type_str="fp16") + assert_output_dtype(mlmodel, expected_type_str="fp16", index=0) + assert_output_dtype(mlmodel, expected_type_str="fp16", index=1) + assert_ops_in_mil_program(mlmodel, expected_op_list=["relu", "clip"]) + + # Case 6: int32 and fp32 output. The fp32 defaults to fp32 while the int32 one remains unchanged. + mlmodel = ct.convert( + int32_float32_two_output_model, + inputs=[ + ct.TensorType(shape=(10, 20), dtype=np.int32), + ct.TensorType(shape=(10, 20), dtype=np.float32), + ], + minimum_deployment_target=ct.target.iOS16, + ) + assert_input_dtype(mlmodel, expected_type_str="int32", index=0) + assert_input_dtype(mlmodel, expected_type_str="fp32", index=1) + assert_output_dtype(mlmodel, expected_type_str="int32", index=0) + assert_output_dtype(mlmodel, expected_type_str="fp16", index=1) + assert_ops_in_mil_program(mlmodel, expected_op_list=["add", "cast", "add"]) + + # Case 7: int32 and fp32 output. The fp32 defaults to fp32 while the int32 one remains unchanged. 
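+        # (Note: in Cases 6-8 the fp32 output actually becomes fp16 by default, as the
+        # assertions verify; only the int32 output keeps its dtype.)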
+ mlmodel = ct.convert( + int32_float32_two_output_model, + inputs=[ + ct.TensorType(shape=(10, 20), dtype=np.int32), + ct.TensorType(shape=(10, 20)), + ], + minimum_deployment_target=ct.target.iOS16, + ) + assert_input_dtype(mlmodel, expected_type_str="int32", index=0) + assert_input_dtype(mlmodel, expected_type_str="fp16", index=1) + assert_output_dtype(mlmodel, expected_type_str="int32", index=0) + assert_output_dtype(mlmodel, expected_type_str="fp16", index=1) + assert_ops_in_mil_program(mlmodel, expected_op_list=["add", "add"]) + + # Case 8: int32 and fp32 output. The fp32 defaults to fp32 while the int32 one remains unchanged. + mlmodel = ct.convert( + int32_float32_two_output_model, + inputs=[ + ct.TensorType(shape=(10, 20), dtype=np.int32), + ct.TensorType(shape=(10, 20)), + ], + outputs=[ + ct.TensorType(name="out1"), + ct.TensorType(name="out2"), + ], + minimum_deployment_target=ct.target.iOS16, + ) + assert_input_dtype(mlmodel, expected_type_str="int32", index=0) + assert_input_dtype(mlmodel, expected_type_str="fp16", index=1) + assert_output_dtype(mlmodel, expected_type_str="int32", index=0) + assert_output_dtype(mlmodel, expected_type_str="fp16", index=1) + assert_ops_in_mil_program(mlmodel, expected_op_list=["add", "add"]) + + # Case 9: two int32 outputs. Nothing changed. + mlmodel = ct.convert( + int32_float32_two_output_model, + inputs=[ + ct.TensorType(shape=(10, 20), dtype=np.int32), + ct.TensorType(shape=(10, 20), dtype=np.int32), + ], + minimum_deployment_target=ct.target.iOS16, + ) + assert_input_dtype(mlmodel, expected_type_str="int32", index=0) + assert_input_dtype(mlmodel, expected_type_str="int32", index=1) + assert_output_dtype(mlmodel, expected_type_str="int32", index=0) + assert_output_dtype(mlmodel, expected_type_str="int32", index=1) + assert_ops_in_mil_program(mlmodel, expected_op_list=["add", "add"]) + + def test_iO16_default_image_dtype_input( + self, + rank4_input_model, + rank4_grayscale_input_model, + ): + """ + We keep the input dtype for the image input model to fp32, unless it is GRAYSCALE_FLOAT16 + """ + # Example 1 + mlmodel = ct.convert( + rank4_input_model, + inputs=[ct.ImageType(shape=(1, 3, 10, 20), color_layout=ct.colorlayout.RGB)], + minimum_deployment_target=ct.target.iOS16, + ) + assert_spec_input_image_type(mlmodel._spec, expected_feature_type=ft.ImageFeatureType.RGB) + assert_prog_input_type(mlmodel._mil_program, expected_dtype_str="fp32") + assert_prog_output_type(mlmodel._mil_program, expected_dtype_str="fp16") + verify_prediction(mlmodel) + + # Example 2 + mlmodel = ct.convert( + rank4_input_model, + inputs=[ct.ImageType(shape=(1, 3, 10, 20), color_layout=ct.colorlayout.BGR)], + minimum_deployment_target=ct.target.iOS16, + ) + assert_spec_input_image_type(mlmodel._spec, expected_feature_type=ft.ImageFeatureType.BGR) + assert_prog_input_type(mlmodel._mil_program, expected_dtype_str="fp32") + assert_prog_output_type(mlmodel._mil_program, expected_dtype_str="fp16") + verify_prediction(mlmodel) + + # Example 3 + mlmodel = ct.convert( + rank4_grayscale_input_model, + inputs=[ct.ImageType(shape=(1, 1, 10, 20), color_layout=ct.colorlayout.GRAYSCALE)], + minimum_deployment_target=ct.target.iOS16, + ) + assert_spec_input_image_type( + mlmodel._spec, expected_feature_type=ft.ImageFeatureType.GRAYSCALE + ) + assert_prog_input_type(mlmodel._mil_program, expected_dtype_str="fp32") + assert_prog_output_type(mlmodel._mil_program, expected_dtype_str="fp16") + verify_prediction(mlmodel) + + # Example 4 + mlmodel = ct.convert( + 
rank4_grayscale_input_model, + inputs=[ + ct.ImageType(shape=(1, 1, 10, 20), color_layout=ct.colorlayout.GRAYSCALE_FLOAT16) + ], + minimum_deployment_target=ct.target.iOS16, + ) + assert_spec_input_image_type( + mlmodel._spec, expected_feature_type=ft.ImageFeatureType.GRAYSCALE_FLOAT16 + ) + assert_prog_input_type(mlmodel._mil_program, expected_dtype_str="fp16") + assert_prog_output_type(mlmodel._mil_program, expected_dtype_str="fp16") + verify_prediction(mlmodel) + + def test_iO16_default_image_dtype_output( + self, + rank4_input_model, + rank4_grayscale_input_model, + ): + """ + We keep the output dtype for the image input model to fp32, unless it is GRAYSCALE_FLOAT16 + """ + # Example 1 + mlmodel = ct.convert( + rank4_input_model, + inputs=[ct.TensorType(shape=(1, 3, 10, 20))], + outputs=[ct.ImageType(color_layout=ct.colorlayout.RGB)], + minimum_deployment_target=ct.target.iOS16, + ) + assert_prog_input_type(mlmodel._mil_program, expected_dtype_str="fp16") + assert_prog_output_type(mlmodel._mil_program, expected_dtype_str="fp32") + assert_spec_output_image_type(mlmodel._spec, expected_feature_type=ft.ImageFeatureType.RGB) + verify_prediction(mlmodel) + + # Example 2 + mlmodel = ct.convert( + rank4_input_model, + inputs=[ct.TensorType(shape=(1, 3, 10, 20))], + outputs=[ct.ImageType(color_layout=ct.colorlayout.BGR)], + minimum_deployment_target=ct.target.iOS16, + ) + assert_prog_input_type(mlmodel._mil_program, expected_dtype_str="fp16") + assert_prog_output_type(mlmodel._mil_program, expected_dtype_str="fp32") + assert_spec_output_image_type(mlmodel._spec, expected_feature_type=ft.ImageFeatureType.BGR) + verify_prediction(mlmodel) + + # Example 3 + mlmodel = ct.convert( + rank4_grayscale_input_model, + inputs=[ct.TensorType(shape=(1, 1, 10, 20))], + outputs=[ct.ImageType(color_layout=ct.colorlayout.GRAYSCALE)], + minimum_deployment_target=ct.target.iOS16, + ) + assert_prog_input_type(mlmodel._mil_program, expected_dtype_str="fp16") + assert_prog_output_type(mlmodel._mil_program, expected_dtype_str="fp32") + assert_spec_output_image_type( + mlmodel._spec, expected_feature_type=ft.ImageFeatureType.GRAYSCALE + ) + verify_prediction(mlmodel) + + # Example 4 + mlmodel = ct.convert( + rank4_grayscale_input_model, + inputs=[ct.TensorType(shape=(1, 1, 10, 20))], + outputs=[ct.ImageType(color_layout=ct.colorlayout.GRAYSCALE_FLOAT16)], + minimum_deployment_target=ct.target.iOS16, + ) + assert_prog_input_type(mlmodel._mil_program, expected_dtype_str="fp16") + assert_prog_output_type(mlmodel._mil_program, expected_dtype_str="fp16") + assert_spec_output_image_type( + mlmodel._spec, expected_feature_type=ft.ImageFeatureType.GRAYSCALE_FLOAT16 + ) + verify_prediction(mlmodel) + + def test_iO16_default_fp32_io(self, float32_input_model_add_op): + """ + With minimum_deployment_target set >= iOS16, and if the compute precision is + set to fp32. By default, a fp32 i/o model is produced. 
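+
+        A minimal sketch (illustrative; `torch_model` stands in for the fixture used below):
+
+            mlmodel = ct.convert(
+                torch_model,
+                inputs=[ct.TensorType(shape=(10, 20))],
+                compute_precision=ct.precision.FLOAT32,
+                minimum_deployment_target=ct.target.iOS16,
+            )
+            # i/o default to fp32, so no cast ops are inserted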
+ """ + # Case 1: Inputs given / outputs None + mlmodel = ct.convert( + float32_input_model_add_op, + inputs=[ct.TensorType(shape=(10, 20))], + compute_precision=ct.precision.FLOAT32, + minimum_deployment_target=ct.target.iOS16, + ) + self._verify_model_io( + mlmodel, + input_dtype="fp32", + output_dtype="fp32", + expected_op_list=["add"], + ) + + # Case 2: Inputs given / outputs given + mlmodel = ct.convert( + float32_input_model_add_op, + inputs=[ct.TensorType(shape=(10, 20), dtype=None)], + outputs=[ct.TensorType(dtype=None)], + compute_precision=ct.precision.FLOAT32, + minimum_deployment_target=ct.target.iOS16, + ) + self._verify_model_io( + mlmodel, + input_dtype="fp32", + output_dtype="fp32", + expected_op_list=["add"], + ) diff --git a/coremltools/converters/mil/frontend/torch/test/test_torch_ops.py b/coremltools/converters/mil/frontend/torch/test/test_torch_ops.py index 374bb34c6..241eb97f4 100644 --- a/coremltools/converters/mil/frontend/torch/test/test_torch_ops.py +++ b/coremltools/converters/mil/frontend/torch/test/test_torch_ops.py @@ -24,7 +24,12 @@ ) from coremltools.converters.mil.mil import Operation, Program, types from coremltools.converters.mil.mil.var import Var -from coremltools.converters.mil.testing_utils import einsum_equations, gen_input_shapes_einsum +from coremltools.converters.mil.testing_utils import ( + einsum_equations, + gen_input_shapes_einsum, + get_op_types_in_program, + hardcoded_einsum_equations, +) from coremltools.models.utils import _macos_version, _python_version from .testing_utils import ModuleWrapper, TorchBaseTest, contains_op, generate_input_data @@ -463,7 +468,7 @@ def forward(self, x, target): model = NLLLossModel() expected_results = model(*inputs) - self.run_compare_torch( + res = self.run_compare_torch( inputs, model, expected_results, @@ -472,6 +477,12 @@ def forward(self, x, target): compute_unit=compute_unit, ) + # verify that the translation function is using one_hot instead of gather + prog = res[1]._mil_program + ops = get_op_types_in_program(prog) + assert "gather" not in ops and "gather_nd" not in ops + assert "one_hot" in ops + class TestArgSort(TorchBaseTest): @pytest.mark.parametrize( @@ -3282,10 +3293,6 @@ def test_lstm( LSTM_batch_first, pad_value, ): - if backend[0] == "mlprogram": - pytest.xfail( - "rdar://109081548 ([Bug] TestLSTMWithPackedSequence is failing through E5ML)" - ) from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence input_size = 4 @@ -3600,7 +3607,7 @@ def _get_inputs(self, input_types): elif y_type == "bool": y = torch.tensor([0, 0, 1, 1], dtype=torch.bool) return (x, y) - + @pytest.mark.parametrize( "compute_unit, backend, input_types", itertools.product( @@ -4625,7 +4632,7 @@ def forward(self, x, y): converter_input_type.reverse() model = TestBinaryEinsum() - self.run_compare_torch( + res = self.run_compare_torch( input_shapes, model, backend=backend, @@ -4634,6 +4641,23 @@ def forward(self, x, y): converter_input_type=converter_input_type ) + # Verify the pattern of the hardcode einsum cases + traced_model = res[0] + mlprogram = ct.convert( + traced_model, + inputs=converter_input_type, + convert_to="milinternal", + pass_pipeline=ct.PassPipeline.EMPTY, + ) + ops_in_prog = get_op_types_in_program(mlprogram) + + if (equation in hardcoded_einsum_equations) and not ( + equation in ["abcd,cde->abe", "abc,cde->abde"] and dynamic + ): + assert "reduce_prod" not in ops_in_prog + assert "concat" not in ops_in_prog + assert "shape" not in ops_in_prog + @pytest.mark.parametrize( "compute_unit, 
backend, equation, dynamic", itertools.product( @@ -5934,6 +5958,48 @@ def forward(self, x, y): ) +class TestRepeatInterleave(TorchBaseTest): + @pytest.mark.parametrize( + "compute_unit, backend, rank, repeat, dim", + itertools.product( + compute_units, + backends, + (1, 3, 5), + (2, torch.tensor(3), torch.tensor([4])), + (None, 0), + ), + ) + def test_scalar_repeat_and_dim_None_or_0(self, compute_unit, backend, rank, repeat, dim): + input_shape = tuple(np.random.randint(low=1, high=6, size=rank)) + model = ModuleWrapper(function=lambda x: x.repeat_interleave(repeat, dim=dim)) + self.run_compare_torch(input_shape, model, backend=backend, compute_unit=compute_unit) + + def test_single_fill_tensor_repeat(self): + input_shape = (2, 3) + model = ModuleWrapper(function=lambda x: x.repeat_interleave(torch.tensor([2, 2]), dim=0)) + self.run_compare_torch(input_shape, model) + + def test_unsupported_tensor_repeat(self): + input_shape = (3, 1) + model = ModuleWrapper( + function=lambda x: x.repeat_interleave(torch.tensor([1, 2, 3]), dim=0) + ) + with pytest.raises( + NotImplementedError, + match=r"Conversion for torch.repeat_interleave with Tensor repeats has not been implemented", + ): + self.run_compare_torch(input_shape, model) + + def test_unsupported_dim1(self): + input_shape = (2, 1, 2, 1, 2) + model = ModuleWrapper(function=lambda x: x.repeat_interleave(2, dim=1)) + with pytest.raises( + NotImplementedError, + match=r"Conversion for torch.repeat_interleave with non-zero dim has not been implemented", + ): + self.run_compare_torch(input_shape, model) + + class TestStd(TorchBaseTest): @pytest.mark.parametrize( "compute_unit, backend, unbiased", @@ -9204,6 +9270,7 @@ def forward(self, x): x = torch.stack([torch.real(x), torch.imag(x)], dim=0) return x + np.random.seed(1024) TorchBaseTest.run_compare_torch( input_shape, SpectrogramModel(), @@ -9377,6 +9444,7 @@ def forward(self, boxes, scores): backend=backend, converter_input_type=converter_input_type, compute_unit=compute_unit, + minimum_deployment_target=minimum_deployment_target, ) # Change the last input box to make IOU slightly smaller than 0.2, the output of CoreML will match PyTorch. 
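+        # (The boundary case appears to be the culprit: when a pair's IoU sits exactly at
+        # the threshold, Core ML and PyTorch can disagree on suppression, so the test keeps
+        # the IoU strictly below it.)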
@@ -9390,6 +9458,7 @@ def forward(self, boxes, scores): backend=backend, converter_input_type=converter_input_type, compute_unit=compute_unit, + minimum_deployment_target=minimum_deployment_target, ) @@ -9665,6 +9734,10 @@ def test_is_causal_flag(self, compute_unit, backend, seq_lengths, include_heads) ), ) def test_attn_mask(self, compute_unit, backend, seq_lengths, bool_mask): + if bool_mask: + pytest.xfail( + "rdar://110499660 ([CI][Bug] test_attn_mask is occasionally failing when bool_mask = True)" + ) source_seq_len, target_seq_len = seq_lengths query_shape = (2, 3, target_seq_len, 7) key_shape = (2, 3, source_seq_len, 7) diff --git a/coremltools/converters/mil/input_types.py b/coremltools/converters/mil/input_types.py index 406767616..04fa9030a 100644 --- a/coremltools/converters/mil/input_types.py +++ b/coremltools/converters/mil/input_types.py @@ -210,8 +210,19 @@ def __init__(self, name=None, shape=None, dtype=None, default_value=None): if dtype is not None: if is_builtin(dtype): self.dtype = dtype - if dtype not in (types.fp16, types.fp32, types.fp64, types.int32, types.int64, types.bool): - raise TypeError("dtype={} is unsupported for inputs/outputs of the model".format(dtype)) + if dtype not in ( + types.int8, + types.uint8, + types.fp16, + types.fp32, + types.fp64, + types.int32, + types.int64, + types.bool, + ): + raise TypeError( + "dtype={} is unsupported for inputs/outputs of the model".format(dtype) + ) else: # Assume dtype is numpy type try: diff --git a/coremltools/converters/mil/mil/block.py b/coremltools/converters/mil/mil/block.py index 0002c5cf0..aef4e6f0c 100644 --- a/coremltools/converters/mil/mil/block.py +++ b/coremltools/converters/mil/mil/block.py @@ -445,6 +445,8 @@ def replace_block_output_var( old_var.consuming_blocks.remove(self) # Ensure output name is consistent if isinstance(self, Function): + if new_var in self.inputs.values() and new_var.name != old_var.name: + raise ValueError("It is not allowed to modify function inputs name.") new_var.name = old_var.name def try_replace_uses_of_var_after_op( diff --git a/coremltools/converters/mil/mil/ops/defs/_utils.py b/coremltools/converters/mil/mil/ops/defs/_utils.py index 42342989f..57fe572f9 100644 --- a/coremltools/converters/mil/mil/ops/defs/_utils.py +++ b/coremltools/converters/mil/mil/ops/defs/_utils.py @@ -276,10 +276,7 @@ def spatial_dimensions_out_shape( # * `effective_ks` (effective kernel size, determined from kernel size + dilations) cannot be symbolic # * strides cannot be symbolic if is_symbolic(input_shape[r]): - if not is_symbolic(pad[r]) and pad[r] - effective_ks[r] == -1 and strides[r] == 1: - out_shape.append(input_shape[r]) - else: - out_shape.append(get_new_symbol()) + out_shape.append(get_new_symbol()) else: out_dim = 0 if not ceil_mode: @@ -294,7 +291,7 @@ def spatial_dimensions_out_shape( return out_shape -def parse_einsum_equation(equation: str) -> List[List[str]]: +def parse_einsum_equation(equation: str) -> Tuple[List[str]]: """ Args equation : str @@ -341,7 +338,7 @@ def _update_vec(str, map_char_to_int, index): index, vec = _update_vec(inout_str, map_char_to_int, index) in_outs_vec.append(vec) - return in_outs_vec + return tuple(in_outs_vec) def compute_gather(params, indices, axis, batch_dims): """ @@ -459,6 +456,13 @@ def solve_slice_by_index_shape(x_shape, begin, end, stride, begin_mask, end_mask """ We first deal with those cases, where the output size is a deterministic number, even if the input dimension is unknown (i.e. 
symbolic) + - No begin_mask and no end_mask: + - begin == end. output shape = 0. + - begin == end - 1, stride > 0. output shape = 1 + - begin == end + 1, stride < 0. output shape = 1 + - begin_mask is false and end_mask is true: + - begin == -1, stride > 0. output shape = 1 + - begin == 0, stride < 0. output shape = 1 """ if ( not begin_mask[idx] @@ -466,9 +470,7 @@ def solve_slice_by_index_shape(x_shape, begin, end, stride, begin_mask, end_mask and begin[idx] is not None and end[idx] is not None ): - # in this case the slice is from "begin" to "end", where both these boundary points are known - # we can find the size of the slice in this case, unless one of them is positive and other is negative - # as in that case, we would need to know the size of the full input dimension + out_shape = None if begin[idx] >= 0 and end[idx] >= 0 and stride[idx] > 0: if end[idx] < begin[idx]: raise ValueError( @@ -477,12 +479,10 @@ def solve_slice_by_index_shape(x_shape, begin, end, stride, begin_mask, end_mask idx, begin[idx], end[idx], stride[idx] ) ) - ret_shape.append( - np.arange(end[idx] - begin[idx])[ - slice(0, end[idx] - begin[idx], stride[idx]) - ].size - ) - continue + out_shape = np.arange(end[idx] - begin[idx])[ + slice(0, end[idx] - begin[idx], stride[idx]) + ].size + if begin[idx] < 0 and end[idx] < 0 and stride[idx] < 0: if begin[idx] < end[idx]: raise ValueError( @@ -491,41 +491,17 @@ def solve_slice_by_index_shape(x_shape, begin, end, stride, begin_mask, end_mask idx, begin[idx], end[idx], stride[idx] ) ) - ret_shape.append( - np.arange(begin[idx] - end[idx])[ - slice(-1, end[idx] - begin[idx] - 1, stride[idx]) - ].size - ) - continue + out_shape = np.arange(begin[idx] - end[idx])[ + slice(-1, end[idx] - begin[idx] - 1, stride[idx]) + ].size - if begin_mask[idx] and not end_mask[idx] and end[idx] is not None: - # in this case we know that the slice is [0, end] or [-1, end], depending on the sign of stride, - # and the value of end is known - if end[idx] > 0 and stride[idx] > 0: - ret_shape.append( - np.arange(end[idx])[slice(None, end[idx], stride[idx])].size - ) - continue - if end[idx] < 0 and stride[idx] < 0: - ret_shape.append( - np.arange(abs(end[idx]))[slice(None, end[idx], stride[idx])].size - ) + if out_shape in (0, 1): + ret_shape.append(out_shape) continue if not begin_mask[idx] and end_mask[idx] and begin[idx] is not None: - # in this case we know the value of begin, and since end_mask is True, we know that the slice - # is till the right most edge - if begin[idx] > 0 and stride[idx] < 0: - ret_shape.append( - np.arange(begin[idx] + 1)[slice(begin[idx], None, stride[idx])].size - ) - continue - if begin[idx] < 0 and stride[idx] > 0: - ret_shape.append( - np.arange(abs(begin[idx]))[ - slice(begin[idx], None, stride[idx]) - ].size - ) + if (begin[idx] == 0 and stride[idx] < 0) or (begin[idx] == -1 and stride[idx] > 0): + ret_shape.append(1) continue # for symbolic case diff --git a/coremltools/converters/mil/mil/ops/defs/iOS15/control_flow.py b/coremltools/converters/mil/mil/ops/defs/iOS15/control_flow.py index 8b31ecaab..a12091433 100644 --- a/coremltools/converters/mil/mil/ops/defs/iOS15/control_flow.py +++ b/coremltools/converters/mil/mil/ops/defs/iOS15/control_flow.py @@ -172,7 +172,6 @@ def value_inference(self): return val def _get_type_val(self, value): - if isinstance(value, (float, np.float64)): value = np.float32(value) elif isinstance(value, bool): @@ -181,24 +180,17 @@ def _get_type_val(self, value): value = np.int32(value) elif isinstance(value, (tuple, list, 
np.ndarray)): value = np.array(value) if isinstance(value, (tuple, list)) else value - - # For the int type, we use int32 by default - if value.dtype in [np.uint16, np.int16, np.uint64, np.int64]: - if value.dtype in [np.uint64, np.int64]: - msg = "Downcast const op {} data".format(self.name) + builtin_to_string(numpy_type_to_builtin_type(value.dtype)) + " as int32" - logger.debug(msg) + if value.dtype in [np.uint64, np.int64]: + logger.debug( + f"Downcast const op {self.name} data {builtin_to_string(numpy_type_to_builtin_type(value.dtype))} as int32" + ) value = value.astype(np.int32) - - - # For the float type, we use float32 by default - elif value.dtype == np.float64: - msg = "Downcast const op {} data fp64 as fp32".format(self.name) - logger.debug(msg) + if value.dtype == np.float64: + logger.debug(f"Downcast const op {self.name} data fp64 as fp32") value = value.astype(np.float32) - elif isinstance(value, mil_list): - # if val that was passed in is of type mil_list, which is just a wrapper on top of python list - # then construct the list type + # If val that was passed in is of type mil_list, which is just a wrapper on top of + # python list, then construct the list type. list_value = value.ls if len(list_value) == 0: raise ValueError("'mil_list' points to an empty list") @@ -209,9 +201,8 @@ def _get_type_val(self, value): builtin_type = types_list(builtin_elem_type, init_length=len(list_value), dynamic_length=False) return builtin_type, value - if not isinstance(value, (np.generic, np.ndarray, str, bool, mil_list)): - raise ValueError("Unknown value for constant: {}".format(value)) + raise ValueError(f"Unknown value for constant: {value}") _, builtin_type = numpy_val_to_builtin_val(value) return builtin_type, value diff --git a/coremltools/converters/mil/mil/ops/defs/iOS15/elementwise_unary.py b/coremltools/converters/mil/mil/ops/defs/iOS15/elementwise_unary.py index 0c93a08c7..61e236b11 100644 --- a/coremltools/converters/mil/mil/ops/defs/iOS15/elementwise_unary.py +++ b/coremltools/converters/mil/mil/ops/defs/iOS15/elementwise_unary.py @@ -8,14 +8,16 @@ import numpy as np from coremltools.converters.mil.mil import types -from coremltools.converters.mil.mil.input_type import (DefaultInputs, - InputSpec, - TensorInputType) -from coremltools.converters.mil.mil.operation import (SYMBOL, VALUE, Operation, - precondition) +from coremltools.converters.mil.mil.input_type import DefaultInputs, InputSpec, TensorInputType +from coremltools.converters.mil.mil.operation import SYMBOL, VALUE, Operation, precondition from coremltools.converters.mil.mil.ops.defs._op_reqs import register_op from coremltools.converters.mil.mil.types import nptype_from_builtin from coremltools.converters.mil.mil.types.symbolic import is_symbolic +from coremltools.converters.mil.mil.types.type_mapping import ( + builtin_to_string, + string_to_builtin, + string_to_nptype, +) def _maintain_shape(x, y): @@ -819,14 +821,11 @@ class cast(Operation): """ Cast the input ``x`` to the new type ``dtype``. - Notice that the underlying Core MIL op doesn't support int64 and fp64. We support them in PyMIL - by mapping int64 to int32, and mapping fp64 to fp32. - Parameters ---------- x: tensor<[\*d], T> (Required) dtype: const str (Required) - * Can be one of the following types: ``int32``, ``int64``, ``fp32``, ``fp64``, ``bool``. + * Can be one of the following types: ``int32``, ``fp16``, ``fp32``, ``bool``. Returns ------- @@ -835,7 +834,7 @@ class cast(Operation): Attributes ---------- - T: i32, i64, fp16, fp32, fp64, bool. 
+ T: i32, fp16, fp32, bool. """ input_spec = InputSpec( @@ -844,39 +843,25 @@ class cast(Operation): ) type_domains = { - "T": (types.fp16, types.fp32, types.fp64, types.int32, types.int64, types.bool), - } - - str_to_types_map = { - "int32": types.int32, - "int64": types.int32, - "fp16": types.fp16, - "fp32": types.fp32, - "fp64": types.fp32, - "bool": types.bool, + "T": (types.fp16, types.fp32, types.int32, types.bool), } - str_to_numpy_type_map = { - "int32": np.int32, - "int64": np.int32, - "fp16": np.float16, - "fp32": np.float32, - "fp64": np.float32, - "bool": bool, - } + @classmethod + def supported_dtypes(cls): + return (builtin_to_string(v) for v in cls.type_domains["T"]) def type_inference(self): - if self.dtype.val not in self.str_to_types_map.keys(): + if self.dtype.val not in self.supported_dtypes(): raise NotImplementedError( "Parameter dtype of the cast operation can be one of the {}. " - "Provided {}".format(self.str_to_types_map.keys(), self.dtype.val) + "Provided {}".format(self.supported_dtypes(), self.dtype.val) ) if not types.is_tensor(self.x.sym_type): - return self.str_to_types_map[self.dtype.val] + return string_to_builtin(self.dtype.val) ret_shape = self.x.shape - return types.tensor(self.str_to_types_map[self.dtype.val], ret_shape) + return types.tensor(string_to_builtin(self.dtype.val), ret_shape) @precondition(allow=VALUE | SYMBOL) def value_inference(self): @@ -884,10 +869,10 @@ def value_inference(self): @classmethod def get_cast_value(cls, input_var, dtype_val): - if dtype_val not in cls.str_to_numpy_type_map.keys(): + if dtype_val not in cls.supported_dtypes(): raise NotImplementedError( "Parameter dtype of the cast operation can be one of the {}. " - "Provided {}".format(cls.str_to_numpy_type_map.keys(), dtype_val) + "Provided {}".format(cls.supported_dtypes(), dtype_val) ) if input_var.val is None: @@ -897,7 +882,7 @@ def get_cast_value(cls, input_var, dtype_val): and len(input_var.sym_val.shape) == 1 ): result = [ - np.array(val).astype(dtype=cls.str_to_numpy_type_map[dtype_val]).item() + np.array(val).astype(dtype=string_to_nptype(dtype_val)).item() if not is_symbolic(val) else val for val in input_var.sym_val @@ -906,6 +891,6 @@ def get_cast_value(cls, input_var, dtype_val): return None if not types.is_tensor(input_var.sym_type): - return input_var.val.astype(dtype=cls.str_to_numpy_type_map[dtype_val]) + return input_var.val.astype(dtype=string_to_nptype(dtype_val)) else: - return np.array(input_var.val).astype(dtype=cls.str_to_numpy_type_map[dtype_val]) + return np.array(input_var.val).astype(dtype=string_to_nptype(dtype_val)) diff --git a/coremltools/converters/mil/mil/ops/defs/iOS15/tensor_operation.py b/coremltools/converters/mil/mil/ops/defs/iOS15/tensor_operation.py index d80022226..3202c480b 100644 --- a/coremltools/converters/mil/mil/ops/defs/iOS15/tensor_operation.py +++ b/coremltools/converters/mil/mil/ops/defs/iOS15/tensor_operation.py @@ -93,7 +93,7 @@ def type_inference(self): @precondition(allow=VALUE) def value_inference(self): M, N = self.x.val.shape[-2:] - band = np.zeros((M, N), dtype=types.nptype_from_builtin(self.x.sym_type)) + band = np.zeros((M, N), dtype=types.nptype_from_builtin(self.x.dtype)) num_lower = self.lower.val num_upper = self.upper.val for m in range(M): @@ -462,6 +462,9 @@ class pad(Operation): * If mode is "replicate" then ``pad[2*i]`` and ``pad[2*i+1]`` can be at most ``D[i]``. + * If pad is not a constant, it must be a vector of length ``2 * rank(x)``, + that is, ``N == D_in``. 
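+        * For example (illustrative), a rank-3 ``x`` requires a non-constant ``pad``
+          of shape ``(6,)``.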
+ mode: const (Optional) * Defaults to ``constant``. * Must be one of the following values: @@ -506,12 +509,16 @@ def type_inference(self): raise ValueError("Pad should be a 1D tensor!") if self.mode and self.mode.val not in {"constant", "reflect", "replicate"}: raise ValueError("Pad mode should be one of {'constant', 'reflect', 'replicate'}") + if pad.val is None and pad.shape[0] != self.x.rank * 2: + raise ValueError( + f"Non-constant 'pad' must have shape ({2*self.x.rank},). Got {pad.shape}" + ) - if pad.val is None: + if pad.sym_val is None: for i in range(self.pad.shape[0] // 2): ret_shape[-self.pad.shape[0] // 2 + i] = get_new_symbol() else: - pad = pad.val + pad = pad.sym_val pad = pad.copy() if len(pad) % 2 != 0: diff --git a/coremltools/converters/mil/mil/ops/defs/iOS15/tensor_transformation.py b/coremltools/converters/mil/mil/ops/defs/iOS15/tensor_transformation.py index a7924d100..a409b36af 100644 --- a/coremltools/converters/mil/mil/ops/defs/iOS15/tensor_transformation.py +++ b/coremltools/converters/mil/mil/ops/defs/iOS15/tensor_transformation.py @@ -232,8 +232,10 @@ def _get_type_val(self): ret_shape = self._infer_shape_dynamic() ret_val = None - if self.x.val is not None and all(isscalar(a) and not is_symbolic(a) for a in ret_shape): - ret_val = reshape_with_symbol(self.x.val, ret_shape) + if self.x.sym_val is not None and all( + isscalar(a) and not is_symbolic(a) for a in ret_shape + ): + ret_val = reshape_with_symbol(self.x.sym_val, ret_shape) return types.tensor(self.x.dtype, tuple(ret_shape)), ret_val @staticmethod diff --git a/coremltools/converters/mil/mil/ops/defs/iOS16/constexpr_ops.py b/coremltools/converters/mil/mil/ops/defs/iOS16/constexpr_ops.py index e0fa58329..9b2953fc7 100644 --- a/coremltools/converters/mil/mil/ops/defs/iOS16/constexpr_ops.py +++ b/coremltools/converters/mil/mil/ops/defs/iOS16/constexpr_ops.py @@ -33,7 +33,7 @@ class constexpr_affine_dequantize(Operation): ---------- quantized_data: const tensor (Required) - zero_point: const tensor (Required) + zero_point: const tensor (Required) * ``zero_point`` can be either a scalar or a vector. * ``zero_point`` follows similar broadcasting rules and size constraints as ``scale``. 
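+        * For orientation (an informal restatement, not a new constraint): the op
+          materializes ``dequantized_data = scale * (quantized_data - zero_point)``,
+          with a vector ``scale``/``zero_point`` broadcast along ``axis``.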
@@ -57,6 +57,7 @@ class constexpr_affine_dequantize(Operation): Attributes ---------- SrcT: uint8, int8 + ZeroPointT: uint8, int8, fp32 DstT: fp16, fp32 """ @@ -66,11 +67,11 @@ class constexpr_affine_dequantize(Operation): scale=TensorInputType(const=True, type_domain="DstT"), axis=TensorInputType(const=True, type_domain=types.int32), ) - + type_domains = { - "DstT": (types.fp16, types.fp32), "SrcT": (types.uint8, types.int8), - "ZeroPointT": (types.uint8, types.int8), + "ZeroPointT": (types.uint8, types.int8, types.fp32), + "DstT": (types.fp16, types.fp32), } def type_inference(self): @@ -88,11 +89,6 @@ def assert_vector_size_same_as_axial_dimension(param, axis_dim_size, name): ) ) - if self.zero_point.dtype != self.quantized_data.dtype: - raise ValueError( - "Parameters quantized_data and zero_point needs to be of the same dtype" - ) - rank = self.quantized_data.rank if self.axis.val < -rank or self.axis.val >= rank: raise ValueError( diff --git a/coremltools/converters/mil/mil/ops/defs/iOS16/image_resizing.py b/coremltools/converters/mil/mil/ops/defs/iOS16/image_resizing.py index ac53dff64..d3fedf4b3 100644 --- a/coremltools/converters/mil/mil/ops/defs/iOS16/image_resizing.py +++ b/coremltools/converters/mil/mil/ops/defs/iOS16/image_resizing.py @@ -61,9 +61,6 @@ class upsample_bilinear(_upsample_bilinear_iOS15): half_pixel_centers=TensorInputType(const=True, optional=True, type_domain=types.bool), ) - def default_inputs(self): - return super().default_inputs() + DefaultInputs(half_pixel_centers=not self.align_corners.val) - @register_op(opset_version=_IOS16_TARGET) class crop_resize(_crop_resize_iOS15): """ diff --git a/coremltools/converters/mil/mil/ops/defs/iOS17/__init__.py b/coremltools/converters/mil/mil/ops/defs/iOS17/__init__.py index 123b06775..d6479aae0 100644 --- a/coremltools/converters/mil/mil/ops/defs/iOS17/__init__.py +++ b/coremltools/converters/mil/mil/ops/defs/iOS17/__init__.py @@ -18,9 +18,13 @@ softplus_parametric, thresholded_relu, ) -from .elementwise_unary import cast, clip -from .image_resizing import crop_resize +from .conv import conv, conv_transpose +from .elementwise_unary import cast, clip, inverse, log, rsqrt +from .image_resizing import crop_resize, resample, resize +from .linear import linear, matmul +from .normalization import batch_norm, instance_norm, l2_norm, layer_norm, local_response_norm from .quantization_ops import dequantize, quantize +from .recurrent import gru, lstm, rnn from .reduction import reduce_argmax, reduce_argmin from .scatter_gather import ( gather, @@ -31,4 +35,13 @@ scatter_nd, ) from .tensor_operation import non_maximum_suppression, topk -from .tensor_transformation import reshape +from .tensor_transformation import ( + expand_dims, + reshape, + reshape_like, + reverse, + reverse_sequence, + sliding_windows, + squeeze, + transpose, +) diff --git a/coremltools/converters/mil/mil/ops/defs/iOS17/conv.py b/coremltools/converters/mil/mil/ops/defs/iOS17/conv.py new file mode 100644 index 000000000..edbb7e0bf --- /dev/null +++ b/coremltools/converters/mil/mil/ops/defs/iOS17/conv.py @@ -0,0 +1,69 @@ +# Copyright (c) 2023, Apple Inc. All rights reserved. 
+# +# Use of this source code is governed by a BSD-3-clause license that can be +# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause + +from coremltools.converters.mil.mil import types +from coremltools.converters.mil.mil.input_type import InputSpec, TensorInputType +from coremltools.converters.mil.mil.ops.defs._op_reqs import register_op +from coremltools.converters.mil.mil.ops.defs.iOS15.conv import conv as _conv_iOS15 +from coremltools.converters.mil.mil.ops.defs.iOS15.conv import ( + conv_transpose as _conv_transpose_iOS15, +) +from coremltools.converters.mil.mil.ops.defs.iOS17 import _IOS17_TARGET + + +@register_op(opset_version=_IOS17_TARGET) +class conv(_conv_iOS15): + """ + Perform convolution over input. Supports 1-D, 2-D, and 3-D convolution. + + The difference between this version and the iOS 15 :py:class:`~.iOS15.conv.conv` is that the + ``weight`` and ``bias`` may have a different dtype than the input/output. + """ + + input_spec = InputSpec( + x=TensorInputType(type_domain="T"), + weight=TensorInputType(type_domain="U"), + bias=TensorInputType(optional=True, type_domain="U"), + strides=TensorInputType(const=True, optional=True, type_domain=types.int32), + pad_type=TensorInputType(const=True, optional=True, type_domain=types.str), + pad=TensorInputType(const=True, optional=True, type_domain=types.int32), + dilations=TensorInputType(const=True, optional=True, type_domain=types.int32), + groups=TensorInputType(const=True, optional=True, type_domain=types.int32), + ) + + type_domains = { + "T": (types.fp16, types.fp32), + "U": (types.fp16, types.fp32), + } + + +@register_op(opset_version=_IOS17_TARGET) +class conv_transpose(_conv_transpose_iOS15): + """ + Perform transposed convolution (also known as deconvolution and fractionally + stride convolution) over input. ``conv_transpose`` can also be used to compute + the gradient of conv. Supports 1-D, 2-D, and 3-D convolution. + + The differences between this version and the iOS 15 :py:class:`~.iOS15.conv.conv_transpose` are: + - ``weight`` and ``bias`` may have a different dtype than the input/output. + - ``weight`` doesn't have to be const. 
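+
+    A hypothetical sketch of what a non-const ``weight`` enables (names are illustrative):
+
+        # weight produced at runtime by an upstream op, e.g. a weight-generating branch
+        y = mb.conv_transpose(x=x, weight=runtime_weight, strides=[2, 2], pad_type="valid")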
+ """ + + input_spec = InputSpec( + x=TensorInputType(type_domain="T"), + weight=TensorInputType(type_domain="U"), + bias=TensorInputType(optional=True, type_domain="U"), + pad=TensorInputType(const=True, optional=True, type_domain=types.int32), + output_shape=TensorInputType(const=True, optional=True, type_domain=types.int32), + pad_type=TensorInputType(const=True, optional=True, type_domain=types.str), + strides=TensorInputType(const=True, optional=True, type_domain=types.int32), + dilations=TensorInputType(const=True, optional=True, type_domain=types.int32), + groups=TensorInputType(const=True, optional=True, type_domain=types.int32), + ) + + type_domains = { + "T": (types.fp16, types.fp32), + "U": (types.fp16, types.fp32), + } diff --git a/coremltools/converters/mil/mil/ops/defs/iOS17/elementwise_unary.py b/coremltools/converters/mil/mil/ops/defs/iOS17/elementwise_unary.py index f17c51631..28802592f 100644 --- a/coremltools/converters/mil/mil/ops/defs/iOS17/elementwise_unary.py +++ b/coremltools/converters/mil/mil/ops/defs/iOS17/elementwise_unary.py @@ -3,13 +3,17 @@ # Use of this source code is governed by a BSD-3-clause license that can be # found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause -import numpy as np from coremltools.converters.mil.mil import types from coremltools.converters.mil.mil.input_type import InputSpec, TensorInputType from coremltools.converters.mil.mil.ops.defs._op_reqs import register_op from coremltools.converters.mil.mil.ops.defs.iOS15.elementwise_unary import cast as _cast_iOS15 from coremltools.converters.mil.mil.ops.defs.iOS15.elementwise_unary import clip as _clip_iOS15 +from coremltools.converters.mil.mil.ops.defs.iOS15.elementwise_unary import ( + inverse as _inverse_iOS15, +) +from coremltools.converters.mil.mil.ops.defs.iOS15.elementwise_unary import log as _log_iOS15 +from coremltools.converters.mil.mil.ops.defs.iOS15.elementwise_unary import rsqrt as _rsqrt_iOS15 from coremltools.converters.mil.mil.ops.defs.iOS17 import _IOS17_TARGET @@ -18,14 +22,13 @@ class cast(_cast_iOS15): """ Cast the input ``x`` to the new type ``dtype``. The only difference between this version and the iOS 15 :py:class:`~.iOS15.elementwise_unary.cast` - is that it supports int16 and uint16. + is that it supports int8, uint8, int16, and uint16. Parameters ---------- x: tensor<[\*d], T> (Required) dtype: const str (Required) - * Can be one of the following types: ``int16``, ``uint16``, ``int32``, ``int64``, ``fp16``, - ``fp32``, ``fp64``, or ``bool``. + * Can be one of the following types: ``int8``, ``uint8``, ``int16``, ``uint16``, ``int32``, ``fp16``, ``fp32``, or ``bool``. Returns ------- @@ -34,7 +37,7 @@ class cast(_cast_iOS15): Attributes ---------- - T: i16, ui16, i32, i64, fp16, fp32, fp64, bool. + T: i8, ui8, i16, ui16, i32, fp16, fp32, bool. 
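+
+    For example (illustrative): ``cast(x=x, dtype="uint8")`` converts an fp32 tensor to
+    uint8, a conversion the iOS 15 version of this op cannot express.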
""" input_spec = InputSpec( @@ -45,38 +48,15 @@ class cast(_cast_iOS15): "T": ( types.fp16, types.fp32, - types.fp64, + types.int8, + types.uint8, types.int16, types.uint16, types.int32, - types.int64, types.bool, ), } - str_to_types_map = { - "int16": types.int16, - "uint16": types.uint16, - "int32": types.int32, - "int64": types.int32, - "fp16": types.fp16, - "fp32": types.fp32, - "fp64": types.fp32, - "bool": types.bool, - } - - str_to_numpy_type_map = { - "int16": np.int16, - "uint16": np.uint16, - "int32": np.int32, - "int64": np.int32, - "fp16": np.float16, - "fp32": np.float32, - "fp64": np.float32, - "bool": bool, - } - - @register_op(opset_version=_IOS17_TARGET) class clip(_clip_iOS15): """ @@ -110,3 +90,110 @@ def type_inference(self): f"({self.beta.val}) in `clip` op." ) return self.x.sym_type + + +@register_op(opset_version=_IOS17_TARGET) +class inverse(_inverse_iOS15): + """ + Return the reciprocal value of the input ``x``, element-wise. + The only difference from IOS15 is epsilon may have different dtypes than the inputs/outputs. + + Parameters + ---------- + x: tensor<[\*d], T> (Required) + epsilon: const U (Optional, default=1e-4) + * This is a small constant that is added to the input, before taking its + inverse, for stability. + * ``y = 1 / (x + epsilon)``. + + Returns + ------- + tensor<[\*d], T> + * A tensor of the same shape as ``x``. + + Attributes + ---------- + T: fp16, fp32 + U: fp16, fp32 + """ + + input_spec = InputSpec( + x=TensorInputType(type_domain="T"), + epsilon=TensorInputType(const=True, optional=True, type_domain="U"), + ) + + type_domains = { + "T": (types.fp16, types.fp32), + "U": (types.fp16, types.fp32), + } + + +@register_op(opset_version=_IOS17_TARGET) +class log(_log_iOS15): + """ + Return the natural logarithm value of the input ``x``, element-wise. + The only difference from IOS15 is epsilon may have different dtypes than the inputs/outputs. + + Parameters + ---------- + x: tensor<[\*d], T> (Required) + epsilon: const U (Optional, default=1e-45) + * This is a small constant that is added to the input, before taking log. + * ``y = log(x + epsilon)``. + + Returns + ------- + tensor<[\*d], T> + * A tensor of the same shape as ``x``. + + Attributes + ---------- + T: fp16, fp32 + U: fp16, fp32 + """ + + input_spec = InputSpec( + x=TensorInputType(type_domain="T"), + epsilon=TensorInputType(const=True, optional=True, type_domain="U"), + ) + + type_domains = { + "T": (types.fp16, types.fp32), + "U": (types.fp16, types.fp32), + } + + +@register_op(opset_version=_IOS17_TARGET) +class rsqrt(_rsqrt_iOS15): + """ + Return the reciprocal value of the square root of the input ``x``, element-wise. + The only difference from IOS15 is epsilon may have different dtypes than the inputs/outputs. + + Parameters + ---------- + x: tensor<[\*d], T> (Required) + epsilon: const U (Optional, default=1e-12) + * This is a small constant that is added to the input, before applying the + ``rsqrt`` function, for stability. + * ``y = 1 / sqrt(x + epsilon)``. + + Returns + ------- + tensor<[\*d], T> + * A tensor of the same shape as ``x``. 
+ + Attributes + ---------- + T: fp16, fp32 + U: fp16, fp32 + """ + + input_spec = InputSpec( + x=TensorInputType(type_domain="T"), + epsilon=TensorInputType(const=True, optional=True, type_domain="U"), + ) + + type_domains = { + "T": (types.fp16, types.fp32), + "U": (types.fp16, types.fp32), + } diff --git a/coremltools/converters/mil/mil/ops/defs/iOS17/image_resizing.py b/coremltools/converters/mil/mil/ops/defs/iOS17/image_resizing.py index efb91e37b..ef786d2e7 100644 --- a/coremltools/converters/mil/mil/ops/defs/iOS17/image_resizing.py +++ b/coremltools/converters/mil/mil/ops/defs/iOS17/image_resizing.py @@ -5,12 +5,13 @@ import numpy as np -from coremltools.converters.mil.mil import types +from coremltools.converters.mil.mil import Operation, get_new_symbol, types from coremltools.converters.mil.mil.input_type import DefaultInputs, InputSpec, TensorInputType from coremltools.converters.mil.mil.ops.defs._op_reqs import register_op from coremltools.converters.mil.mil.ops.defs.iOS16.image_resizing import ( crop_resize as _crop_resize_iOS16, ) +from coremltools.converters.mil.mil.ops.defs.iOS16.image_resizing import resample as _resample_iOS16 from coremltools.converters.mil.mil.ops.defs.iOS17 import _IOS17_TARGET @@ -183,3 +184,159 @@ def type_inference(self): self.target_width.val, ] return types.tensor(self.x.dtype, ret_shape) + + +@register_op(opset_version=_IOS17_TARGET) +class resample(_resample_iOS16): + """ + Resample the input image tensor ``x`` at the ``coordinates``. + + The major difference between this version and the iOS 16 :py:class:`~.iOS16.image_resizing.resample` + is that `coordinates` supports int8, uint8, int16, uint16. + """ + + input_spec = InputSpec( + x=TensorInputType(type_domain="T"), + coordinates=TensorInputType(type_domain="U"), + sampling_mode=TensorInputType(const=True, type_domain=types.str), + padding_mode=TensorInputType(const=True, type_domain=types.str), + padding_value=TensorInputType(const=True, type_domain="T"), + coordinates_mode=TensorInputType(const=True, type_domain=types.str), + align_corners=TensorInputType(const=True, type_domain=types.bool), + ) + + type_domains = { + "T": (types.fp16, types.fp32), + "U": ( + types.int8, + types.uint8, + types.int16, + types.uint16, + types.int32, + types.fp16, + types.fp32, + ), + } + + +@register_op(opset_version=_IOS17_TARGET) +class resize(Operation): + """ + Resizes the input tensor ``x`` by choosing the right-most ``resized_dims`` dimensions from + the input shape ``shape``, and by choosing the rest from ``x``'s shape. + + This iOS17 ``resize`` is a superset of ``resize_bilinear`` and ``resize_nearest_neighbor`` in + iOS15. The main benefit is that this resize op allows a use-case in dynamic tensor shapes where + a tensor needs to be resized to a dynamic shape as specified by another tensor. + + To illustrate how output shape is inferred, here are two examples. + - Example #1 + x.shape: [1, 2, 3, 4] + shape: [1, 6, 8] + resized_dims: 2 + The output's shape will be [1, 2, 6, 8] + - Example #2 + x.shape: [1, 2, 3, is0] + shape: [1, 0, 0] + resized_dims: 2 + The output's shape will be [1, 2, 3, is0] + + Parameters + ---------- + x: tensor<[...], T> (Required) + + shape: tensor<[K], U> (Required) + * Restriction: ``size(shape)`` <= ``rank(x)`` + * If shape[i]==0, the dimension in the output tensor will instead be inferred from the + corresponding element of x.shape(). Note this might not be x.shape()[i], as size(shape), + resized_dims, and size(x) may all be different sizes. 
+ + resized_dims: const tensor<[], uint32> (Required) + * Restriction: ``resized_dims`` <= ``size(shape)`` + + interpolation_mode: const (Optional, default="LINEAR") + * Available mode: ``LINEAR``, ``NEAREST_NEIGHBOR``. + + sampling_mode: const (Optional, default="DEFAULT") + * Available mode: ``DEFAULT``, ``STRICT_ALIGN_CORNERS``, ``ALIGN_CORNERS``, + ``OFFSET_CORNERS``, ``UNALIGN_CORNERS``. + * For details about different sampling modes, see iOS 15 :py:class:`~.iOS15.image_resizing.resize_bilinear`. + + Returns + ------- + tensor<[...], T> + + Attributes + ---------- + T: fp16, fp32, int32 + U: int32, int16, uint16, uint32 + """ + + input_spec = InputSpec( + x=TensorInputType(type_domain="T"), + shape=TensorInputType(type_domain="U"), + resized_dims=TensorInputType(const=True, type_domain=types.uint32), + interpolation_mode=TensorInputType(const=True, optional=True, type_domain=types.str), + sampling_mode=TensorInputType(const=True, optional=True, type_domain=types.str), + ) + + type_domains = { + "T": (types.fp16, types.fp32, types.int32), + "U": (types.int32, types.int16, types.uint16, types.uint32), + } + + _VALID_INTERPOLATION_MODES = {"LINEAR", "NEAREST_NEIGHBOR"} + _VALID_SAMPLING_MODE = { + "DEFAULT", + "STRICT_ALIGN_CORNERS", + "ALIGN_CORNERS", + "OFFSET_CORNERS", + "UNALIGN_CORNERS", + } + + def default_inputs(self): + return DefaultInputs( + interpolation_mode="LINEAR", + sampling_mode="DEFAULT", + ) + + def _validate_input(self): + if self.shape.val is not None: + shape_element_num = self.shape.val.size + if self.resized_dims.val > shape_element_num: + raise ValueError( + f"The resized_dims ({self.resized_dims.val}) must <= shape's size ({shape_element_num})" + ) + if shape_element_num > self.x.rank: + raise ValueError( + f"The shape's size ({shape_element_num}) must <= x's rank ({self.x.rank})" + ) + if self.shape.rank != 1: + raise ValueError(f"The shape's rank must be 1, but got {self.shape.rank}") + if self.interpolation_mode.val not in self._VALID_INTERPOLATION_MODES: + raise ValueError( + f"Invalid interpolation_mode {self.interpolation_mode.val}. Supported modes are: {self._VALID_INTERPOLATION_MODES}" + ) + if self.sampling_mode.val not in self._VALID_SAMPLING_MODE: + raise ValueError( + f"Invalid sampling_mode {self.sampling_mode.val}. Supported modes are: {self._VALID_SAMPLING_MODE}" + ) + + def type_inference(self): + self._validate_input() + + # The output tensor will have the same rank as the input tensor. The rightmost resized_dims + # dimensions of the output_shape will be taken from the input "shape". + ret_shape = list(self.x.shape) + + start_idx = self.shape.shape[0] - self.resized_dims.val + for i in range(self.resized_dims.val): + target_shape = ( + get_new_symbol() if self.shape.val is None else self.shape.val[start_idx + i] + ) + if target_shape == 0: + # The 0 in `shape` means inheriting from x's shape. + target_shape = self.x.shape[self.x.rank - self.resized_dims.val + i] + ret_shape[self.x.rank - self.resized_dims.val + i] = target_shape + + return types.tensor(self.x.dtype, ret_shape) diff --git a/coremltools/converters/mil/mil/ops/defs/iOS17/linear.py b/coremltools/converters/mil/mil/ops/defs/iOS17/linear.py new file mode 100644 index 000000000..a70d9b516 --- /dev/null +++ b/coremltools/converters/mil/mil/ops/defs/iOS17/linear.py @@ -0,0 +1,119 @@ +# Copyright (c) 2023, Apple Inc. All rights reserved. 
+#
+# Use of this source code is governed by a BSD-3-clause license that can be
+# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
+
+from coremltools.converters.mil.mil import types
+from coremltools.converters.mil.mil.input_type import InputSpec, TensorInputType
+from coremltools.converters.mil.mil.ops.defs._op_reqs import register_op
+from coremltools.converters.mil.mil.ops.defs.iOS15.linear import linear as _linear_iOS15
+from coremltools.converters.mil.mil.ops.defs.iOS15.linear import matmul as _matmul_iOS15
+from coremltools.converters.mil.mil.ops.defs.iOS17 import _IOS17_TARGET
+
+
+@register_op(opset_version=_IOS17_TARGET)
+class linear(_linear_iOS15):
+    """
+    A version of ``linear`` for iOS 17+. The only difference between this version and the
+    iOS 15 :py:class:`~.iOS15.linear.linear` is that the ``weight`` and ``bias`` may have a
+    different dtype than the input/output.
+
+    Parameters
+    ----------
+    x: tensor<[\*D, D_in], T> (Required)
+        * ``1 <= rank <= 3``.
+        * ``0 <= rank(*D) <= 2``.
+    weight: const tensor<[D_out, D_in], U> (Required)
+    bias: const tensor<[D_out], U> (Optional)
+        * Defaults to ``0``.
+
+    Returns
+    -------
+    tensor<[\*D, D_out], T>
+        * Same rank as the input ``x``.
+
+    Attributes
+    ----------
+    T: fp16, fp32, i32
+    U: fp16, fp32, i32
+    """
+
+    input_spec = InputSpec(
+        x=TensorInputType(type_domain="T"),
+        weight=TensorInputType(const=True, type_domain="U"),
+        bias=TensorInputType(const=True, optional=True, type_domain="U"),
+    )
+
+    type_domains = {
+        "T": (types.fp16, types.fp32, types.int32),
+        "U": (types.fp16, types.fp32, types.int32),
+    }
+
+
+@register_op(opset_version=_IOS17_TARGET)
+class matmul(_matmul_iOS15):
+    """
+    A version of ``matmul`` for iOS 17+. The only difference between this version and the
+    iOS 15 :py:class:`~.iOS15.linear.matmul` is that ``x`` and ``y`` can have different
+    dtypes when one of them is const.
+
+    Parameters
+    ----------
+    x: tensor<[\*, K1], T> (Required)
+        * ``x`` must be 1-D or higher.
+    y: tensor<[\*, K2], U> (Required)
+        * ``y`` must be 1-D or higher.
+    transpose_x: const bool (Optional)
+        * Defaults to ``False``.
+        * Use ``True`` to transpose the last two dimensions of ``x`` before multiplication.
+          It has no effect when ``x`` is 1-D.
+    transpose_y: const bool (Optional)
+        * Defaults to ``False``.
+        * Use ``True`` to transpose the last two dimensions of ``y`` before multiplication.
+          It has no effect when ``y`` is 1-D.
+
+    Returns
+    -------
+    tensor<\*, V>
+        * Scalar or tensor output.
+        * When ``x`` and ``y`` are both const or both non-const, this follows the iOS 15
+          behavior: ``x``, ``y``, and the output all have the same dtype.
+          When one of ``x`` and ``y`` is const, the output dtype is the same as the non-const one.
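+
+    A hypothetical usage sketch (variable names are illustrative, assuming the MIL
+    builder ``mb``)::
+
+        # y_const_fp32 is a const fp32 weight; x_fp16 is a non-const fp16 tensor.
+        # This mixed-dtype case is allowed in iOS 17, and the output dtype is fp16.
+        out = mb.matmul(x=x_fp16, y=y_const_fp32)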
+ + Attributes + ---------- + T: fp16, fp32, i32 + U: fp16, fp32, i32 + """ + + input_spec = InputSpec( + x=TensorInputType(type_domain="T"), + y=TensorInputType(type_domain="U"), + transpose_x=TensorInputType(const=True, optional=True, type_domain=types.bool), + transpose_y=TensorInputType(const=True, optional=True, type_domain=types.bool), + ) + + type_domains = { + "T": (types.fp16, types.fp32, types.int32), + "U": (types.fp16, types.fp32, types.int32), + } + + def type_inference(self): + x_is_const = self.x.op is not None and self.x.op.op_type == "const" + y_is_const = self.y.op is not None and self.y.op.op_type == "const" + + if x_is_const == y_is_const and self.x.dtype != self.y.dtype: + is_const_str = "const" if x_is_const else "non-const" + raise ValueError( + f'In op "matmul", when x and y are both {is_const_str}, their dtype ' + f"need to match, but got x as {types.builtin_to_string(self.x.dtype)} " + f"and y as {types.builtin_to_string(self.y.dtype)}" + ) + + inferred_type = super().type_inference() + if x_is_const != y_is_const: + # The output dtype should be the same as the non-const one. + output_dtype = self.x.dtype if y_is_const else self.y.dtype + inferred_type = types.tensor(output_dtype, inferred_type.get_shape()) + + return inferred_type diff --git a/coremltools/converters/mil/mil/ops/defs/iOS17/normalization.py b/coremltools/converters/mil/mil/ops/defs/iOS17/normalization.py new file mode 100644 index 000000000..dae4b0648 --- /dev/null +++ b/coremltools/converters/mil/mil/ops/defs/iOS17/normalization.py @@ -0,0 +1,148 @@ +# Copyright (c) 2023, Apple Inc. All rights reserved. +# +# Use of this source code is governed by a BSD-3-clause license that can be +# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause + +from coremltools.converters.mil.mil import types +from coremltools.converters.mil.mil.input_type import InputSpec, TensorInputType +from coremltools.converters.mil.mil.ops.defs._op_reqs import register_op +from coremltools.converters.mil.mil.ops.defs.iOS15.normalization import ( + batch_norm as _batch_norm_iOS15, +) +from coremltools.converters.mil.mil.ops.defs.iOS15.normalization import ( + instance_norm as _instance_norm_iOS15, +) +from coremltools.converters.mil.mil.ops.defs.iOS15.normalization import l2_norm as _l2_norm_iOS15 +from coremltools.converters.mil.mil.ops.defs.iOS15.normalization import ( + layer_norm as _layer_norm_iOS15, +) +from coremltools.converters.mil.mil.ops.defs.iOS15.normalization import ( + local_response_norm as _local_response_norm_iOS15, +) +from coremltools.converters.mil.mil.ops.defs.iOS17 import _IOS17_TARGET + + +@register_op(opset_version=_IOS17_TARGET) +class batch_norm(_batch_norm_iOS15): + """ + Normalize input tensor ``x`` by ``mean`` and ``variance``, and optionally apply a + scale ``gamma`` and an offset ``beta``: + + .. math:: + y_i = \\gamma_i \\dfrac{ (x_i - mean_i)}{\\sqrt{variance_i + epsilon}} + beta_i \\;,\\;i=1,....,C + + The difference between this version and the iOS 15 :py:class:`~.iOS15.normalization.batch_norm` + is that input/output can have different dtypes from other parameters. 
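+
+    A hypothetical usage sketch (variable names are illustrative, assuming the MIL
+    builder ``mb``)::
+
+        # x is fp16 while the per-channel statistics stay fp32 consts.
+        y = mb.batch_norm(x=x_fp16, mean=mean_fp32, variance=var_fp32)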
+ """ + + input_spec = InputSpec( + x=TensorInputType(type_domain="T"), + mean=TensorInputType(const=True, type_domain="U"), + variance=TensorInputType(const=True, type_domain="U"), + gamma=TensorInputType(const=True, optional=True, type_domain="U"), + beta=TensorInputType(const=True, optional=True, type_domain="U"), + epsilon=TensorInputType(const=True, optional=True, type_domain="U"), + ) + + type_domains = { + "T": (types.fp16, types.fp32), + "U": (types.fp16, types.fp32), + } + + +@register_op(opset_version=_IOS17_TARGET) +class instance_norm(_instance_norm_iOS15): + """ + Apply instance normalization to the n-dimensional input tensor. + + The difference between this version and the iOS 15 :py:class:`~.iOS15.normalization.instance_norm` + is that input/output can have different dtypes from other parameters. + """ + + input_spec = InputSpec( + x=TensorInputType(type_domain="T"), + gamma=TensorInputType(const=True, optional=True, type_domain="U"), + beta=TensorInputType(const=True, optional=True, type_domain="U"), + epsilon=TensorInputType(const=True, optional=True, type_domain="U"), + ) + + type_domains = { + "T": (types.fp16, types.fp32), + "U": (types.fp16, types.fp32), + } + + +@register_op(opset_version=_IOS17_TARGET) +class l2_norm(_l2_norm_iOS15): + """ + Apply L2 normalization to the n-dimensional input tensor. That is, divide the input + tensor by the square root of the sum of squares of all elements of the input. + + .. math:: + x_i \\leftarrow \\dfrac{x_i}{\\sqrt{\\sum{x_i^2} + \\epsilon}} + + The difference between this version and the iOS 15 :py:class:`~.iOS15.normalization.l2_norm` + is that input/output and ``epsilon`` can have different dtypes. + """ + + input_spec = InputSpec( + x=TensorInputType(type_domain="T"), + epsilon=TensorInputType(const=True, optional=True, type_domain="U"), + ) + + type_domains = { + "T": (types.fp16, types.fp32), + "U": (types.fp16, types.fp32), + } + + +@register_op(opset_version=_IOS17_TARGET) +class layer_norm(_layer_norm_iOS15): + """ + Apply layer normalization to the n-dimensional input tensor: + + .. math:: + out = gamma * (input - E[x]) / sqrt(Var[x] + epsilon) + beta + + The difference between this version and the iOS 15 :py:class:`~.iOS15.normalization.layer_norm` + is that input/output can have different dtypes from other parameters. + """ + + input_spec = InputSpec( + x=TensorInputType(type_domain="T"), + axes=TensorInputType(const=True, optional=True, type_domain=types.int32), + gamma=TensorInputType(const=True, optional=True, type_domain="U"), + beta=TensorInputType(const=True, optional=True, type_domain="U"), + epsilon=TensorInputType(const=True, optional=True, type_domain="U"), + ) + + type_domains = { + "T": (types.fp16, types.fp32), + "U": (types.fp16, types.fp32), + } + + +@register_op(opset_version=_IOS17_TARGET) +class local_response_norm(_local_response_norm_iOS15): + """ + Apply local response normalization to the n-dimensional input tensor: + + .. math:: + x_i \\leftarrow \\dfrac{x_i}{\\left ( k + \\dfrac{\\alpha}{\\text{size}} \\sum_j x_j^2 \\right )^\\beta} + + The difference between this version and the iOS 15 :py:class:`~.iOS15.normalization.local_response_norm` + is that input/output can have different dtypes from other parameters. 
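+
+    A hypothetical usage sketch (variable names are illustrative, assuming the MIL
+    builder ``mb``)::
+
+        # x is fp16 while alpha/beta/k may stay fp32 consts.
+        y = mb.local_response_norm(x=x_fp16, size=3, alpha=np.float32(1e-4))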
+ """ + + input_spec = InputSpec( + x=TensorInputType(type_domain="T"), + size=TensorInputType(const=True, type_domain=types.int32), + alpha=TensorInputType(const=True, optional=True, type_domain="U"), + beta=TensorInputType(const=True, optional=True, type_domain="U"), + k=TensorInputType(const=True, optional=True, type_domain="U"), + ) + + type_domains = { + "T": (types.fp16, types.fp32), + "U": (types.fp16, types.fp32), + } diff --git a/coremltools/converters/mil/mil/ops/defs/iOS17/recurrent.py b/coremltools/converters/mil/mil/ops/defs/iOS17/recurrent.py new file mode 100644 index 000000000..d652cedd3 --- /dev/null +++ b/coremltools/converters/mil/mil/ops/defs/iOS17/recurrent.py @@ -0,0 +1,98 @@ +# Copyright (c) 2023, Apple Inc. All rights reserved. +# +# Use of this source code is governed by a BSD-3-clause license that can be +# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause + + +from coremltools.converters.mil.mil import types +from coremltools.converters.mil.mil.input_type import InputSpec, TensorInputType +from coremltools.converters.mil.mil.ops.defs._op_reqs import register_op +from coremltools.converters.mil.mil.ops.defs.iOS15.recurrent import gru as _gru_iOS15 +from coremltools.converters.mil.mil.ops.defs.iOS15.recurrent import lstm as _lstm_iOS15 +from coremltools.converters.mil.mil.ops.defs.iOS15.recurrent import rnn as _rnn_iOS15 +from coremltools.converters.mil.mil.ops.defs.iOS17 import _IOS17_TARGET + + +@register_op(opset_version=_IOS17_TARGET) +class gru(_gru_iOS15): + """ + Gated Recurrent Unit (GRU) + + The only difference between this version and the iOS 15 :py:class:`~.iOS15.recurrent.gru` is + adding the support for fp16. + """ + + input_spec = InputSpec( + x=TensorInputType(type_domain="T"), + initial_h=TensorInputType(type_domain="T"), + weight_ih=TensorInputType(const=True, type_domain="T"), + weight_hh=TensorInputType(const=True, type_domain="T"), + bias=TensorInputType(const=True, optional=True, type_domain="T"), + direction=TensorInputType(const=True, optional=True, type_domain=types.str), + output_sequence=TensorInputType(const=True, optional=True, type_domain=types.bool), + recurrent_activation=TensorInputType(const=True, optional=True, type_domain=types.str), + activation=TensorInputType(const=True, optional=True, type_domain=types.str), + ) + + type_domains = { + "T": (types.fp16, types.fp32), + } + + +@register_op(opset_version=_IOS17_TARGET) +class lstm(_lstm_iOS15): + """ + Long Short-Term Memory (LSTM) + + The only difference between this version and the iOS 15 :py:class:`~.iOS15.recurrent.lstm` is + adding the support for fp16. 
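+
+    A hypothetical usage sketch (variable names are illustrative, assuming the MIL
+    builder ``mb``)::
+
+        # An all-fp16 LSTM becomes a valid configuration in iOS 17.
+        out_seq, h_last, c_last = mb.lstm(
+            x=x_fp16, initial_h=h0_fp16, initial_c=c0_fp16,
+            weight_ih=w_ih_fp16, weight_hh=w_hh_fp16,
+        )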
+ """ + + input_spec = InputSpec( + x=TensorInputType(type_domain="T"), + initial_h=TensorInputType(type_domain="T"), + initial_c=TensorInputType(type_domain="T"), + weight_ih=TensorInputType(const=True, type_domain="T"), # ifoz layout, + weight_hh=TensorInputType(const=True, type_domain="T"), # ifoz layout + bias=TensorInputType(const=True, optional=True, type_domain="T"), # ifoz layout + peephole=TensorInputType(const=True, optional=True, type_domain="T"), # ifo layout + weight_ih_back=TensorInputType(const=True, optional=True, type_domain="T"), # ifoz layout, + weight_hh_back=TensorInputType(const=True, optional=True, type_domain="T"), # ifoz layout + bias_back=TensorInputType(const=True, optional=True, type_domain="T"), # ifoz layout + peephole_back=TensorInputType(const=True, optional=True, type_domain="T"), # ifo layout + direction=TensorInputType(const=True, optional=True, type_domain=types.str), + output_sequence=TensorInputType(const=True, optional=True, type_domain=types.bool), + recurrent_activation=TensorInputType(const=True, optional=True, type_domain=types.str), + cell_activation=TensorInputType(const=True, optional=True, type_domain=types.str), + activation=TensorInputType(const=True, optional=True, type_domain=types.str), + clip=TensorInputType(const=True, optional=True, type_domain="T"), + ) + + type_domains = { + "T": (types.fp16, types.fp32), + } + + +@register_op(opset_version=_IOS17_TARGET) +class rnn(_rnn_iOS15): + """ + Recurrent Neural Network (RNN) + + The only difference between this version and the iOS 15 :py:class:`~.iOS15.recurrent.rnn` is + adding the support for fp16. + """ + + input_spec = InputSpec( + x=TensorInputType(type_domain="T"), + initial_h=TensorInputType(type_domain="T"), + weight_ih=TensorInputType(const=True, type_domain="T"), + weight_hh=TensorInputType(const=True, type_domain="T"), + bias=TensorInputType(const=True, optional=True, type_domain="T"), + direction=TensorInputType(const=True, optional=True, type_domain=types.str), + output_sequence=TensorInputType(const=True, optional=True, type_domain=types.bool), + activation=TensorInputType(const=True, optional=True, type_domain=types.str), + ) + + type_domains = { + "T": (types.fp16, types.fp32), + } diff --git a/coremltools/converters/mil/mil/ops/defs/iOS17/tensor_operation.py b/coremltools/converters/mil/mil/ops/defs/iOS17/tensor_operation.py index da74582ef..d3cad5eeb 100644 --- a/coremltools/converters/mil/mil/ops/defs/iOS17/tensor_operation.py +++ b/coremltools/converters/mil/mil/ops/defs/iOS17/tensor_operation.py @@ -4,7 +4,7 @@ # found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause from coremltools.converters.mil.mil import types -from coremltools.converters.mil.mil.input_type import InputSpec, TensorInputType +from coremltools.converters.mil.mil.input_type import DefaultInputs, InputSpec, TensorInputType from coremltools.converters.mil.mil.ops.defs._op_reqs import register_op from coremltools.converters.mil.mil.ops.defs.iOS15.tensor_operation import ( non_maximum_suppression as _nms_iOS15, @@ -20,7 +20,7 @@ class non_maximum_suppression(_nms_iOS15): NMS iteratively removes lower-scoring boxes which have an IoU greater than ``iou_threshold`` with another (higher-scoring) box. 
-
+
     The major differences between this version and the iOS 15
     :py:class:`~.iOS15.tensor_operation.non_maximum_suppression` are as follows:

@@ -84,11 +84,13 @@ def type_inference(self):
 @register_op(opset_version=_IOS17_TARGET)
 class topk(_topk_iOS16):
     """
-    A version of ``topk`` for iOS 17+. The only difference between this version and the
-    iOS 16 :py:class:`~.iOS16.tensor_operation.topk` is the data type support.
-    The newly added data type is:
-    - int16, unint16 for ``x`` and output.
-    - int16 for ``k``.
+    A version of ``topk`` for iOS 17+. The differences between this version and the
+    iOS 16 :py:class:`~.iOS16.tensor_operation.topk` are:
+    - New data type support. The newly added data types are:
+        - int8, uint8, int16, uint16 for ``x`` and output.
+        - int8, int16 for ``k``.
+    - The optional ``indices`` output is now restricted to uint16 or int32, and a new input
+      parameter ``output_indices_dtype`` is added to set the dtype of the output ``indices``.

     Parameters
     ----------
@@ -111,20 +113,24 @@ class topk(_topk_iOS16):
     return_indices: const (Optional)
         * Defaults to ``True``.
         * If ``True``, returns both values and indices. Otherwise, returns only the ``top-k`` values.
+    output_indices_dtype: const (Optional, default="int32")
+        * It can only be set when ``return_indices`` is ``True``.
+        * This parameter can take ``"int32"`` or ``"uint16"`` as values.

     Returns
     -------
     tensor<\*?, T>
         * Values of top/bottom ``k`` elements.

-    tensor<\*?, int32>
+    tensor<\*?, U>
         * Only returned when ``return_indices = True``
         * Indices of the top/bottom ``k`` elements along axis.
+        * ``U`` is int32 or uint16, as determined by ``output_indices_dtype`` (int32 by default).

     Attributes
     ----------
-    T: fp16, fp32, int16, int32, uint16
-    K: int16, int32
+    T: fp16, fp32, int8, int16, int32, uint8, uint16
+    K: int8, int16, int32
     """

     input_spec = InputSpec(
@@ -134,15 +140,49 @@ class topk(_topk_iOS16):
         ascending=TensorInputType(const=True, optional=True, type_domain=types.bool),
         sort=TensorInputType(const=True, optional=True, type_domain=types.bool),
         return_indices=TensorInputType(const=True, optional=True, type_domain=types.bool),
+        output_indices_dtype=TensorInputType(const=True, optional=True, type_domain=types.str),
     )

     type_domains = {
         "T": (
             types.fp16,
             types.fp32,
+            types.int8,
             types.int16,
             types.int32,
+            types.uint8,
             types.uint16,
         ),
-        "K": (types.int16, types.int32),
+        "K": (types.int8, types.int16, types.int32),
     }
+
+    _ALLOWED_OUTPUT_INDICES_DTYPES = {"int32", "uint16"}
+
+    def default_inputs(self):
+        parent_default_inputs = super().default_inputs()
+        # If return_indices is not set, it defaults to True.
+        # output_indices_dtype can only be set when return_indices = True.
+        if self.return_indices is None or self.return_indices.val:
+            return parent_default_inputs + DefaultInputs(output_indices_dtype="int32")
+        return parent_default_inputs
+
+    def type_inference(self):
+        if not self.return_indices.val and self.output_indices_dtype is not None:
+            raise ValueError(
+                'In iOS17 topk op, "output_indices_dtype" can only be set when "return_indices=True".'
+            )
+
+        if self.return_indices.val:
+            if self.output_indices_dtype.val not in self._ALLOWED_OUTPUT_INDICES_DTYPES:
+                raise ValueError(
+                    f'"topk" op invalid output_indices_dtype: "{self.output_indices_dtype.val}". 
' + f"Valid options are: {self._ALLOWED_OUTPUT_INDICES_DTYPES}" + ) + + value_type, indices_type = super().type_inference() + indices_type = types.tensor( + types.string_to_builtin(self.output_indices_dtype.val), indices_type.get_shape() + ) + return value_type, indices_type + else: + return super().type_inference() diff --git a/coremltools/converters/mil/mil/ops/defs/iOS17/tensor_transformation.py b/coremltools/converters/mil/mil/ops/defs/iOS17/tensor_transformation.py index c2052aa51..318968c9d 100644 --- a/coremltools/converters/mil/mil/ops/defs/iOS17/tensor_transformation.py +++ b/coremltools/converters/mil/mil/ops/defs/iOS17/tensor_transformation.py @@ -5,10 +5,39 @@ from typing import List +from coremltools.converters.mil.mil import types +from coremltools.converters.mil.mil.input_type import InputSpec, TensorInputType, TupleInputType from coremltools.converters.mil.mil.ops.defs._op_reqs import register_op +from coremltools.converters.mil.mil.ops.defs.iOS15.tensor_transformation import ( + expand_dims as _expand_dims_iOS15, +) from coremltools.converters.mil.mil.ops.defs.iOS15.tensor_transformation import ( reshape as _reshape_iOS15, ) +from coremltools.converters.mil.mil.ops.defs.iOS15.tensor_transformation import ( + reverse as _reverse_iOS15, +) +from coremltools.converters.mil.mil.ops.defs.iOS15.tensor_transformation import ( + reverse_sequence as _reverse_sequence_iOS15, +) +from coremltools.converters.mil.mil.ops.defs.iOS15.tensor_transformation import ( + slice_by_index as _slice_by_index_iOS15, +) +from coremltools.converters.mil.mil.ops.defs.iOS15.tensor_transformation import ( + slice_by_size as _slice_by_size_iOS15, +) +from coremltools.converters.mil.mil.ops.defs.iOS15.tensor_transformation import ( + sliding_windows as _sliding_windows_iOS15, +) +from coremltools.converters.mil.mil.ops.defs.iOS15.tensor_transformation import ( + squeeze as _squeeze_iOS15, +) +from coremltools.converters.mil.mil.ops.defs.iOS15.tensor_transformation import ( + transpose as _transpose_iOS15, +) +from coremltools.converters.mil.mil.ops.defs.iOS16.tensor_transformation import ( + reshape_like as _reshape_like_iOS16, +) from coremltools.converters.mil.mil.ops.defs.iOS17 import _IOS17_TARGET @@ -18,19 +47,22 @@ class reshape(_reshape_iOS15): Return a tensor that has the same values as ``x`` with shape ``shape``. ``shape`` must have the same volume (number of elements) as ``x``. - The major difference between this version and the iOS 15 :py:class:`~.iOS15.tensor_transformation.reshape` is as follows: + The major differences between this version and the iOS 15 :py:class:`~.iOS15.tensor_transformation.reshape` are as follows: + + - When the ``shape`` contains ``0``, + the restriction about ``K == rank(x)`` is no longer enforced. Each ``0`` in ``shape`` will match the + corresponding dimension in ``x.shape``, counting from the rightmost element. So ``shape[i]`` + matches ``input[j]`` if ``length(shape)-i == rank(input)-j``. If a ``0`` is out of range, assign ``1`` + (equivalent to ``expand_dims`` for ``x.shape``). - When the ``shape`` contains ``0``, - the restriction about ``K == rank(x)`` is no longer enforced. Each ``0`` in ``shape`` will match the - corresponding dimension in ``x.shape``, counting from the rightmost element. So ``shape[i]`` - matches ``input[j]`` if ``length(shape)-i == rank(input)-j``. If a ``0`` is out of range, assign ``1`` - (equivalent to ``expand_dims`` for ``x.shape``). 
+      More specifically, when ``x.shape`` is ``[2, 50]`` and ``shape`` is ``[1, 0, -1, 0]``, it will error out
+      in iOS 15 or iOS 16 because ``x`` has rank ``2`` while the ``len`` of ``shape`` is ``4``. In iOS 17, the result will
+      have ``shape`` ``[1, 1, 2, 50]``, because the rightmost ``0`` will be changed to the rightmost dim of
+      ``x.shape``, which is ``50``. There is no other ``0`` that has a corresponding dim in ``x.shape``, so it is set
+      as ``1``. Finally, the ``-1`` is computed from the remaining number of elements, which gives ``2``.

-      More specifically, when ``x.shape`` is ``[2, 50]`` and ``shape`` is ``[1, 0, -1, 0]``, it will error out
-      in iOS 15 or iOS 16 because ``x`` has rank ``2`` while the ``len`` of ``shape`` is ``4``. In iOS 17, the result will
-      have ``shape`` ``[1, 1, 2, 50]``, because the rightmost ``0`` will be changed to the rightmost dim of
-      ``x.shape``, which is ``50``. There is no other ``0`` that has a corresponding dim in ``x.shape``, so it is set
-      as ``1``. Finally, the ``-1`` is calculated based on knowing dimensions that produce ``2``.
+    - Support for more data types, including int8, uint8, int16, uint16 for ``x`` and int8, int16 for
+      ``shape``.

     Parameters
     ----------
@@ -41,7 +73,7 @@ class reshape(_reshape_iOS15):
       ``shape`` may contain elements that are not positive integers (see below).
       * If ``x`` has a variadic rank, ``shape`` can only contain positive integers.

-    shape: tensor<[K], i32> (Required)
+    shape: tensor<[K], U> (Required)

         A 1-D tensor, with elements from the following:

@@ -63,9 +95,29 @@

     Attributes
     ----------
-    T: fp16, fp32, i32, bool
+    T: fp16, fp32, int8, uint8, int16, uint16, int32, bool
+    U: int8, int16, int32
     """

+    input_spec = InputSpec(
+        x=TensorInputType(type_domain="T"),
+        shape=TensorInputType(type_domain="U"),
+    )
+
+    type_domains = {
+        "T": (
+            types.fp16,
+            types.fp32,
+            types.int8,
+            types.uint8,
+            types.int16,
+            types.uint16,
+            types.int32,
+            types.bool,
+        ),
+        "U": (types.int8, types.int16, types.int32),
+    }
+
     @staticmethod
     def replace_zeros_in_shape(from_shape: List[int], to_shape: List[int]) -> List[int]:
         """
@@ -85,3 +137,483 @@ def replace_zeros_in_shape(from_shape: List[int], to_shape: List[int]) -> List[i
         # Reverse the result back to make the right alignment.
         to_shape = to_shape_reversed[::-1]
         return to_shape
+
+
+@register_op(opset_version=_IOS17_TARGET)
+class reshape_like(_reshape_like_iOS16):
+    """
+    Reshape a tensor to an output shape specified by some or all dimensions of a tuple of reference tensors ``ref_tensors``.
+
+    The major difference between this version and the iOS 16 :py:class:`~.iOS16.tensor_transformation.reshape_like`
+    is that the inputs ``x`` and ``ref_tensors`` support more data types: int8, uint8, int16, uint16.
+
+    Parameters
+    ----------
+    x: tensor<\*?, T> (Required)
+        * The input tensor to be reshaped.
+
+    ref_tensors: Tuple[tensor<\*?, R>] (Required)
+        * A tuple of tensors that define the output shape.
+
+    begins: Tuple[const] (Required)
+        * A tuple of integers specifying the begin index into the shape vector of the corresponding ``ref_tensor``.
+
+    ends: Tuple[const] (Required)
+        * A tuple of integers specifying the end index into the shape vector of the corresponding ``ref_tensor``.
+
+    end_masks: Tuple[const] (Required)
+        * If ``True``, select all axes from the begin index until the end of the corresponding ``ref_tensor``, as in
+          ``ref_tensors[i].shape[begins[i]:]``.
+
+    Returns
+    -------
+    tensor<\*?, T>
+        * Same type as input tensor ``x``.
+ * Output shape is computed by ``ref_tensors``, ``begins``, ``ends``, and ``end_masks``. + + Attributes + ---------- + T: fp16, fp32, int8, int16, int32, uint8, uint16, bool + R: fp16, fp32, int8, int16, int32, uint8, uint16, bool + """ + + input_spec = InputSpec( + x=TensorInputType(type_domain="T"), + ref_tensors=TupleInputType(), + begins=TupleInputType(), + ends=TupleInputType(), + end_masks=TupleInputType(), + ) + + type_domains = { + "T": ( + types.fp16, + types.fp32, + types.int8, + types.int16, + types.int32, + types.uint8, + types.uint16, + types.bool, + ), + } + + +@register_op(opset_version=_IOS17_TARGET) +class expand_dims(_expand_dims_iOS15): + """ + Insert a single-dimension in a 1-D or higher tensor at each axis in axes. + + The major difference between this version and the iOS 15 :py:class:`~.iOS15.tensor_transformation.expand_dims` + is that input ``x`` supports more data types: int8, uint8, int16, uint16. + + Parameters + ---------- + x: tensor<\*?, T> (Required) + * Scalar or tensor. + axes: const tensor<[K], int32> Required + * ``K`` is the number of dimensions expanded. + * Insert single dimension at dimension index at each axes. + * Negative value to index from the end. ``-d-1 <= axis <= d`` + where ``d`` is the rank of ``x``. + + Returns + ------- + tensor<\*(rank(x)+K), T> + * Same type as the input ``x`` with rank ``rank(x)+K``. + + Attributes + ---------- + T: fp16, fp32, int8, int16, int32, uint8, uint16, bool + """ + + input_spec = InputSpec( + x=TensorInputType(type_domain="T"), + axes=TensorInputType(const=True, type_domain=types.int32), + ) + + type_domains = { + "T": ( + types.fp16, + types.fp32, + types.int8, + types.int16, + types.int32, + types.uint8, + types.uint16, + types.bool, + ), + } + + +@register_op(opset_version=_IOS17_TARGET) +class squeeze(_squeeze_iOS15): + """ + Remove single-dimension dimensions in a 1-D or higher tensor. + + The major difference between this version and the iOS 15 :py:class:`~.iOS15.tensor_transformation.squeeze` + is that input ``x`` supports more data types: int8, uint8, int16, uint16. + + Parameters + ---------- + x: tensor<\*?,T> (Required) + * Must be at least 1-D. + axes: const (Optional) + * Axes to squeeze out. + * Default to remove all single-dimensions. + + Returns + ------- + tensor<\*(rank(x)-K),T> + * Tensor with same type as input ``x`` and rank ``rank(x)-K``. + + Attributes + ---------- + T: fp16, fp32, int8, int16, int32, uint8, uint16, bool + """ + + input_spec = InputSpec( + x=TensorInputType(type_domain="T"), + axes=TensorInputType(const=True, optional=True, type_domain=types.int32), + ) + + type_domains = { + "T": ( + types.fp16, + types.fp32, + types.int8, + types.int16, + types.int32, + types.uint8, + types.uint16, + types.bool, + ), + } + + +@register_op(opset_version=_IOS17_TARGET) +class reverse(_reverse_iOS15): + """ + Reverse the order of the input tensor ``x`` along specified ``axes`` (dimensions). + + The major difference between this version and the iOS 15 :py:class:`~.iOS15.tensor_transformation.reverse` + is that input ``x`` supports more data types: int8, uint8, int16, uint16. + + Parameters + ---------- + x: tensor<\*?, T> (Required) + * Input tensor. + + axes: const (Optional) + * Dimension(s) to reverse. Each axis must be in the range ``[-rank(x), rank(x))``. + * Defaults to None (reverse on all dimensions). + + Returns + ------- + tensor<\*?, T> + * Same type and shape as the input tensor. 
+ + Attributes + ---------- + T: fp16, fp32, int8, int16, int32, uint8, uint16, bool + """ + + input_spec = InputSpec( + x=TensorInputType(type_domain="T"), + axes=TensorInputType(const=True, optional=True, type_domain=types.int32), + ) + + type_domains = { + "T": ( + types.fp16, + types.fp32, + types.int8, + types.int16, + types.int32, + types.uint8, + types.uint16, + types.bool, + ), + } + + +@register_op(opset_version=_IOS17_TARGET) +class reverse_sequence(_reverse_sequence_iOS15): + """ + Reverse variable length slices for specified axes / dimensions of the input + tensor. This op first slices input tensor along the ``batch_axis`` dimension, then + partially reverses the elements along the ``seq_axis`` for the first ``lengths[i]`` + elements. + + The major difference between this version and the iOS 15 :py:class:`~.iOS15.tensor_transformation.reverse_sequence` + is that input supports more data types: + - ``x`` additionally supports int8, uint8, int16, uint16 + - ``lengths`` additionally supports int8, int16 + + Parameters + ---------- + x: tensor<\*?, T> (Required) + * Input tensor. + lengths: tensor (Required) + * 1-dimensional tensor of length ``x.shape[batch_axis]`` specifying the length + of the sequence to reverse. + * Values must be in range ``[0, x.shape[seq_axis]]``. + seq_axis: const (Optional) + * The dimension to reverse. + * Defaults to ``0``. + batch_axis: const (Optional) + * Dimension for slicing. + * Defaults to ``0``. + + Returns + ------- + tensor<\*?, T> + * Same type and shape as the input tensor. + + Attributes + ---------- + T: fp16, fp32, int8, int16, int32, uint8, uint16, bool + U: int8, int16, int32 + """ + + input_spec = InputSpec( + x=TensorInputType(type_domain="T"), + lengths=TensorInputType(type_domain="U"), + seq_axis=TensorInputType(const=True, optional=True, type_domain=types.int32), + batch_axis=TensorInputType(const=True, optional=True, type_domain=types.int32), + ) + + type_domains = { + "T": ( + types.fp16, + types.fp32, + types.int8, + types.int16, + types.int32, + types.uint8, + types.uint16, + types.bool, + ), + "U": (types.int8, types.int16, types.int32), + } + + +@register_op(opset_version=_IOS17_TARGET) +class sliding_windows(_sliding_windows_iOS15): + """ + Return a tensor containing all windows of ``size``, separated by stride along the + given ``axis``. + + The major difference between this version and the iOS 15 :py:class:`~.iOS15.tensor_transformation.sliding_windows` + is that input ``x`` supports more data types: int8, uint8, int16, uint16. + + Parameters + ---------- + x: tensor<[\*d0, d_axis, *dn], T> + * Input tensor. + + axis: const + * Axis to perform the operation. + + size: const + * Number of elements in the sliding window. + + stride: const Optional + * Default to ``1``. + * The stride of the input elements in the sliding window. + + Returns + ------- + tensor<[\*d0, d_axis - size // stride + 1, size, \*dn], T> + * The output will be a tensor of rank ``N+1`` where ``N`` is the input tensor + rank. 
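+
+    For example (an illustrative sketch, assuming the MIL builder ``mb``): with
+    ``x.shape == [8]``, ``size=3``, and ``stride=1`` there are ``(8 - 3) // 1 + 1 = 6``
+    windows, so the output shape is ``[6, 3]``::
+
+        y = mb.sliding_windows(x=x, axis=0, size=3, stride=1)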
+ + Attributes + ---------- + T: fp16, fp32, int8, int16, int32, uint8, uint16, bool + """ + + input_spec = InputSpec( + x=TensorInputType(type_domain="T"), + axis=TensorInputType(const=True, type_domain=types.int32), + size=TensorInputType(const=True, type_domain=types.int32), + stride=TensorInputType(const=True, optional=True, type_domain=types.int32), + ) + + type_domains = { + "T": ( + types.fp16, + types.fp32, + types.int8, + types.int16, + types.int32, + types.uint8, + types.uint16, + types.bool, + ), + } + + +@register_op(opset_version=_IOS17_TARGET) +class transpose(_transpose_iOS15): + """ + Permute tensor ``x`` dimensions according to ``perm``. + + The major difference between this version and the iOS 15 :py:class:`~.iOS15.tensor_transformation.transpose` + is that input ``x`` supports more data types: int8, uint8, int16, uint16. + + Parameters + ---------- + x: tensor<\*?, T> (Required) + * Must be at least 1-D. ``x`` may have a symbolic shape. + perm: const<[rank(x)], i32> (Required) + * Permutation order. -rank(x) <= perm[I] < rank(x) for all perm entries. + + Returns + ------- + tensor<\*?,T> + * Tensor with same rank and type as ``x``. + + Attributes + ---------- + T: fp16, fp32, int8, int16, int32, uint8, uint16, bool + """ + + input_spec = InputSpec( + x=TensorInputType(type_domain="T"), + perm=TensorInputType(const=True, type_domain=types.int32), + ) + + type_domains = { + "T": ( + types.fp16, + types.fp32, + types.int8, + types.int16, + types.int32, + types.uint8, + types.uint16, + types.bool, + ), + } + + +@register_op(opset_version=_IOS17_TARGET) +class slice_by_index(_slice_by_index_iOS15): + """ + Method for numpy style indexing and slicing. + With a tensor ``x``, this method achieves the following: + + ``result = x[begin[0]: end[0]: stride[0], begin[1]: end[1]: stride[1], ...]`` + + The differences between this version and the iOS 15 :py:class:`~.iOS15.tensor_transformation.slice_by_index` + is that additional data types are supported for ``x``, ``begin``, ``end``, and ``stride``. + See Parameters and Attributes sections for details. + + Parameters + ---------- + x: tensor<*?, T> (Required) + * Input tensor + begin: tensor<[rank(x)], U> (Required) + * Starting index for the dimension of slicing. + end: tensor<[rank(x)], U> (Required) + * Ending index for the dimension of slicing. + stride: tensor<[rank(x)], U> (Optional) + * Default is all ``1``. + * Stride for the dimension of slicing. + begin_mask: tensor<[rank(x)], bool> (Optional) + * Default to all ``False``. + * If ``begin_mask[i]==True``, ignores ``begin[i]``, and set ``begin[i]`` to ``0``. + end_mask: tensor<[rank(x)], bool> (Optional) + * Default to all ``False``. + * If ``end_mask[i]==True``, ignores ``end[i]``, and set ``end[i]`` to ``x.shape[i]``. + squeeze_mask: tensor<[rank(x)], bool> (Optional) + * Default to all ``False``. + * If ``squeeze_mask[i]==True``, ignores ``end[i]``, and do the pure index at ``begin[i]``. + + Returns + ------- + tensor<\*?, T> + - Scalar or tensor. 
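+
+    For example (an illustrative sketch, assuming the MIL builder ``mb``), the
+    equivalent of the NumPy slice ``x[1:3, :2]`` on a rank-2 tensor::
+
+        y = mb.slice_by_index(x=x, begin=[1, 0], end=[3, 2])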
+ + Attributes + ---------- + T: bool, fp16, fp32, int8, int16, int32, uint8, uint16 + U: int8, int16, int32 + """ + + input_spec = InputSpec( + x=TensorInputType(type_domain="T"), + begin=TensorInputType(type_domain="U"), + end=TensorInputType(type_domain="U"), + stride=TensorInputType(const=True, optional=True, type_domain="U"), + begin_mask=TensorInputType(const=True, optional=True, type_domain=types.bool), + end_mask=TensorInputType(const=True, optional=True, type_domain=types.bool), + squeeze_mask=TensorInputType(const=True, optional=True, type_domain=types.bool), + ) + + type_domains = { + "T": ( + types.fp16, + types.fp32, + types.int8, + types.int16, + types.int32, + types.uint8, + types.uint16, + types.bool, + ), + "U": (types.int8, types.int16, types.int32), + } + + +@register_op(opset_version=_IOS17_TARGET) +class slice_by_size(_slice_by_size_iOS15): + """ + Slice input tensor starting from the given ``begin`` index and by + the amount specified by the ``size`` input, for each dimension. + + The differences between this version and the iOS 15 :py:class:`~.iOS15.tensor_transformation.slice_by_size` + is that additional data types are supported for ``x``, ``begin``, and ``size``. + See Parameters and Attributes sections for details. + + Parameters + ---------- + x: tensor<*?, T> (Required) + * Input tensor. + begin: tensor<[rank(x)], U> Required + * The begin index for slice. + size: tensor<[rank(x)], U> Required + * The size that is to be sliced. If ``size`` is ``-1``, + all the remaining elements starting with "begin" are sliced. + + Returns + ------- + tensor<\*?, T> + * Scalar or tensor. + + Attributes + ---------- + T: bool, fp16, fp32, int8, int16, int32, uint8, uint16 + U: int8, int16, int32 + """ + + input_spec = InputSpec( + x=TensorInputType(type_domain="T"), + begin=TensorInputType(type_domain="U"), + size=TensorInputType(type_domain="U"), + ) + + type_domains = { + "T": ( + types.fp16, + types.fp32, + types.int8, + types.int16, + types.int32, + types.uint8, + types.uint16, + types.bool, + ), + "U": (types.int8, types.int16, types.int32), + } diff --git a/coremltools/converters/mil/mil/ops/tests/iOS14/__init__.py b/coremltools/converters/mil/mil/ops/tests/iOS14/__init__.py new file mode 100644 index 000000000..1ce9e2a63 --- /dev/null +++ b/coremltools/converters/mil/mil/ops/tests/iOS14/__init__.py @@ -0,0 +1,9 @@ +# Copyright (c) 2023, Apple Inc. All rights reserved. 
+# +# Use of this source code is governed by a BSD-3-clause license that can be +# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause + +import coremltools as ct +from coremltools.converters.mil.testing_reqs import backends_internal, clean_up_backends + +backends = clean_up_backends(backends_internal, ct.target.iOS14, force_include_iOS15_test=True) diff --git a/coremltools/converters/mil/mil/ops/tests/test_activation.py b/coremltools/converters/mil/mil/ops/tests/iOS14/test_activation.py similarity index 79% rename from coremltools/converters/mil/mil/ops/tests/test_activation.py rename to coremltools/converters/mil/mil/ops/tests/iOS14/test_activation.py index 515208b28..a7ff49a81 100644 --- a/coremltools/converters/mil/mil/ops/tests/test_activation.py +++ b/coremltools/converters/mil/mil/ops/tests/iOS14/test_activation.py @@ -10,21 +10,19 @@ import scipy import coremltools as ct -from coremltools.converters.mil import testing_reqs from coremltools.converters.mil.mil import Builder as mb from coremltools.converters.mil.mil import types +from coremltools.converters.mil.mil.ops.tests.iOS14 import backends +from coremltools.converters.mil.mil.ops.tests.testing_utils import ( + mark_api_breaking, + run_compare_builder, +) +from coremltools.converters.mil.testing_reqs import compute_units from coremltools.converters.mil.testing_utils import ssa_fn -from .testing_utils import run_compare_builder - -backends = testing_reqs.backends -compute_units = testing_reqs.compute_units - class TestClampedReLU: - @pytest.mark.parametrize( - "compute_unit, backend", itertools.product(compute_units, backends) - ) + @pytest.mark.parametrize("compute_unit, backend", itertools.product(compute_units, backends)) def test_builder_to_backend_smoke(self, compute_unit, backend): t = np.array([[-1, 2, -3], [4, -5, 6]], dtype=np.float32) input_placeholders = { @@ -59,13 +57,7 @@ def test_builder_eval(self): @pytest.mark.parametrize( "compute_unit, backend, dim, alpha, beta", - itertools.product( - compute_units, - backends, - [2, 4, 8], - [2.0, 3.0], - [4.0, 5.0] - ), + itertools.product(compute_units, backends, [2, 4, 8], [2.0, 3.0], [4.0, 5.0]), ) def test_builder_to_backend_stress(self, compute_unit, backend, dim, alpha, beta): shape_x = np.array([dim, dim]) @@ -94,9 +86,7 @@ def build(x): class TestELU: - @pytest.mark.parametrize( - "compute_unit, backend", itertools.product(compute_units, backends) - ) + @pytest.mark.parametrize("compute_unit, backend", itertools.product(compute_units, backends)) def test_builder_to_backend_smoke(self, compute_unit, backend): t = np.array([[-1, 2, -3], [4, -5, 6]], dtype=np.float32) input_placeholders = { @@ -134,9 +124,7 @@ def test_builder_eval(self): class TestGeLU: - @pytest.mark.parametrize( - "compute_unit, backend", itertools.product(compute_units, backends) - ) + @pytest.mark.parametrize("compute_unit, backend", itertools.product(compute_units, backends)) def test_builder_to_backend_smoke(self, compute_unit, backend): t = np.array([[-1, 2, -3], [4, -5, 6]], dtype=np.float32) input_placeholders = { @@ -230,9 +218,7 @@ def build(x): class TestLeakyReLU: - @pytest.mark.parametrize( - "compute_unit, backend", itertools.product(compute_units, backends) - ) + @pytest.mark.parametrize("compute_unit, backend", itertools.product(compute_units, backends)) def test_builder_to_backend_smoke(self, compute_unit, backend): t = np.array([[-1, 2, -3], [4, -5, 6]], dtype=np.float32) input_placeholders = { @@ -267,9 +253,7 @@ def test_builder_eval(self): class 
TestLinearActivation: - @pytest.mark.parametrize( - "compute_unit, backend", itertools.product(compute_units, backends) - ) + @pytest.mark.parametrize("compute_unit, backend", itertools.product(compute_units, backends)) def test_builder_to_backend_smoke(self, compute_unit, backend): t = np.array([[-1, 2, -3], [4, -5, 6]], dtype=np.float32) input_placeholders = {"x": mb.placeholder(shape=t.shape)} @@ -330,16 +314,16 @@ def build(x): class TestPReLU: @pytest.mark.parametrize( - "rank, alpha_values, compute_unit, backend", + "compute_unit, backend, rank, alpha_values", itertools.product( - [3, 4, 5], - [[1.0, 2.0, 3.0], [4.0, 4.0, 4.0]], compute_units, backends, - ) + [3, 4, 5], + [[1.0, 2.0, 3.0], [4.0, 4.0, 4.0]], + ), ) - def test_builder_to_backend_smoke(self, rank, alpha_values, compute_unit, backend): - if (backend[0] == "mlprogram" and backend[1] == "fp16"): + def test_builder_to_backend_smoke(self, compute_unit, backend, rank, alpha_values): + if backend.backend == "mlprogram" and backend.precision == "fp16": pytest.xfail( "rdar://92175249 ([MIL] TestActivation::test_prelu[backend=(mlprogram, fp16)] CI failure)" ) @@ -421,12 +405,7 @@ def test_builder_eval3(self): @pytest.mark.parametrize( "compute_unit, backend, dim, chan", - itertools.product( - compute_units, - backends, - [1, 2, 4, 8], - [2, 3, 4] - ), + itertools.product(compute_units, backends, [1, 2, 4, 8], [2, 3, 4]), ) def test_builder_to_backend_stress(self, compute_unit, backend, dim, chan): shape = np.array([1, chan, dim, dim]) @@ -493,9 +472,7 @@ def test_builder_eval(self): class TestReLU6: - @pytest.mark.parametrize( - "compute_unit, backend", itertools.product(compute_units, backends) - ) + @pytest.mark.parametrize("compute_unit, backend", itertools.product(compute_units, backends)) def test_builder_to_backend_smoke(self, compute_unit, backend): t = np.array([[-1, 7, -3], [4, -5, 8]], dtype=np.float32) input_placeholders = {"x": mb.placeholder(shape=t.shape)} @@ -527,9 +504,7 @@ def test_builder_eval(self): class TestScaledTanh: - @pytest.mark.parametrize( - "compute_unit, backend", itertools.product(compute_units, backends) - ) + @pytest.mark.parametrize("compute_unit, backend", itertools.product(compute_units, backends)) def test_builder_to_backend_smoke(self, compute_unit, backend): t = np.array([[-1, 2, -3], [4, -5, 6]], dtype=np.float32) input_placeholders = {"x": mb.placeholder(shape=t.shape)} @@ -562,13 +537,7 @@ def test_builder_eval(self): @pytest.mark.parametrize( "compute_unit, backend, dim, alpha, beta", - itertools.product( - compute_units, - backends, - [2, 4, 8], - [2.0, 3.0], - [4.0, 5.0] - ), + itertools.product(compute_units, backends, [2, 4, 8], [2.0, 3.0], [4.0, 5.0]), ) def test_builder_to_backend_stress(self, compute_unit, backend, dim, alpha, beta): shape_x = np.array([dim, dim]) @@ -594,9 +563,7 @@ def build(x): class TestSigmoid: - @pytest.mark.parametrize( - "compute_unit, backend", itertools.product(compute_units, backends) - ) + @pytest.mark.parametrize("compute_unit, backend", itertools.product(compute_units, backends)) def test_builder_to_backend_smoke(self, compute_unit, backend): t = np.array([[-1, 2, -3], [4, -5, 6]], dtype=np.float32) input_placeholders = {"x": mb.placeholder(shape=t.shape)} @@ -632,9 +599,7 @@ def test_builder_eval(self): class TestSigmoidHard: - @pytest.mark.parametrize( - "compute_unit, backend", itertools.product(compute_units, backends) - ) + @pytest.mark.parametrize("compute_unit, backend", itertools.product(compute_units, backends)) def 
test_builder_to_backend_smoke(self, compute_unit, backend): t = np.array([[-1, 2, -3], [4, -5, 6]], dtype=np.float32) input_placeholders = {"x": mb.placeholder(shape=t.shape)} @@ -644,9 +609,7 @@ def build(x): return mb.sigmoid_hard(x=x, alpha=1.0, beta=2.0) expected_output_types = (2, 3, types.fp32) - expected_outputs = np.array( - [[1.0, 1.0, 0.0], [1.0, 0.0, 1.0]], dtype=np.float32 - ) + expected_outputs = np.array([[1.0, 1.0, 0.0], [1.0, 0.0, 1.0]], dtype=np.float32) run_compare_builder( build, @@ -673,13 +636,7 @@ def test_builder_eval(self): @pytest.mark.parametrize( "compute_unit, backend, dim, alpha, beta", - itertools.product( - compute_units, - backends, - [2, 4, 8], - [2.0, 3.0], - [4.0, 5.0] - ), + itertools.product(compute_units, backends, [2, 4, 8], [2.0, 3.0], [4.0, 5.0]), ) def test_builder_to_backend_stress(self, compute_unit, backend, dim, alpha, beta): shape_x = np.array([dim, dim]) @@ -705,9 +662,7 @@ def build(x): class TestSiLU: - @pytest.mark.parametrize( - "compute_unit, backend", itertools.product(compute_units, backends) - ) + @pytest.mark.parametrize("compute_unit, backend", itertools.product(compute_units, backends)) def test_builder_to_backend_smoke(self, compute_unit, backend): x_val = np.array([-1.1, 2.2, -3.3, 4.4], dtype=np.float32).reshape((1, 2, 1, 2)) @@ -720,9 +675,9 @@ def test_builder_to_backend_smoke(self, compute_unit, backend): def build(x): return mb.silu(x=x) - expected_output = np.array( - [-0.2747, 1.9805, -0.1174, 4.3466], dtype=np.float32 - ).reshape(expected_output_type[:-1]) + expected_output = np.array([-0.2747, 1.9805, -0.1174, 4.3466], dtype=np.float32).reshape( + expected_output_type[:-1] + ) run_compare_builder( build, @@ -736,9 +691,7 @@ def build(x): class TestSoftplus: - @pytest.mark.parametrize( - "compute_unit, backend", itertools.product(compute_units, backends) - ) + @pytest.mark.parametrize("compute_unit, backend", itertools.product(compute_units, backends)) def test_builder_to_backend_smoke(self, compute_unit, backend): t = np.array([[-1, 2, -3], [4, -5, 6]], dtype=np.float32) input_placeholders = {"x": mb.placeholder(shape=t.shape)} @@ -774,9 +727,7 @@ def test_builder_eval(self): # No torch test because there is no direct torch translation to this layer class TestSoftplusParametric: - @pytest.mark.parametrize( - "compute_unit, backend", itertools.product(compute_units, backends) - ) + @pytest.mark.parametrize("compute_unit, backend", itertools.product(compute_units, backends)) def test_builder_to_backend_smoke(self, compute_unit, backend): t = np.array([[[[-1, 3, 6]], [[-1, 2, -3]], [[4, -5, 6]]]], dtype=np.float32) input_placeholders = {"x": mb.placeholder(shape=t.shape)} @@ -791,11 +742,13 @@ def build(x): expected_output_types = (1, 3, 1, 3, types.fp32) expected_outputs = np.array( - [[ - [[1.8142700e-02, 1.2000000e01, 2.4000000e01]], - [[1.3427734e-02, 2.0000000e01, 7.1525574e-07]], - [[7.2000000e01, 0.0000000e00, 1.0800000e02]], - ]], + [ + [ + [[1.8142700e-02, 1.2000000e01, 2.4000000e01]], + [[1.3427734e-02, 2.0000000e01, 7.1525574e-07]], + [[7.2000000e01, 0.0000000e00, 1.0800000e02]], + ] + ], dtype=np.float32, ) @@ -879,12 +832,7 @@ def test_builder_eval6(self): @pytest.mark.parametrize( "compute_unit, backend, dim, chan", - itertools.product( - compute_units, - backends, - [1, 2, 4, 8], - [1, 2, 3] - ), + itertools.product(compute_units, backends, [1, 2, 4, 8], [1, 2, 3]), ) def test_builder_to_backend_stress(self, compute_unit, backend, dim, chan): shape = np.array([1, chan, dim, dim]) @@ -918,9 +866,7 @@ def 
build(x): class TestSoftmax: - @pytest.mark.parametrize( - "compute_unit, backend", itertools.product(compute_units, backends) - ) + @pytest.mark.parametrize("compute_unit, backend", itertools.product(compute_units, backends)) def test_buidler_to_backend_smoke(self, compute_unit, backend): t = np.array([[-1, 2, -3], [4, -5, 6]], dtype=np.float32) input_placeholders = {"x": mb.placeholder(shape=t.shape)} @@ -955,14 +901,13 @@ def test_builder_eval(self): scipy.special.softmax(x_val, axis=0), v.val, atol=1e-04, rtol=1e-05 ) - @pytest.mark.parametrize( - "input_size", [(1), (2), (1, 2), (2, 2), (2, 3, 4), (2, 3, 4, 10)] - ) + @pytest.mark.parametrize("input_size", [(1), (2), (1, 2), (2, 2), (2, 3, 4), (2, 3, 4, 10)]) def test_value_inference(self, input_size): rs = np.random.RandomState(1234) x = rs.random(input_size) for axis in range(-x.ndim, x.ndim - 1): + @mb.program(input_specs=[]) def prog(): return mb.softmax(x=x, axis=axis) @@ -978,9 +923,7 @@ def prog(): class TestSoftsign: - @pytest.mark.parametrize( - "compute_unit, backend", itertools.product(compute_units, backends) - ) + @pytest.mark.parametrize("compute_unit, backend", itertools.product(compute_units, backends)) def test_builder_to_backend_smoke(self, compute_unit, backend): t = np.array([[-1, 2, -3], [4, -5, 6]], dtype=np.float32) input_placeholders = {"x": mb.placeholder(shape=t.shape)} @@ -1013,9 +956,7 @@ def test_builder_eval(self): class TestThresholdedReLU: - @pytest.mark.parametrize( - "compute_unit, backend", itertools.product(compute_units, backends) - ) + @pytest.mark.parametrize("compute_unit, backend", itertools.product(compute_units, backends)) def test_builder_to_backend_smoke(self, compute_unit, backend): t = np.array([[-1, 2, -3], [4, -5, 6]], dtype=np.float32) input_placeholders = {"x": mb.placeholder(shape=t.shape)} @@ -1047,12 +988,7 @@ def test_builder_eval(self): @pytest.mark.parametrize( "compute_unit, backend, dim, alpha", - itertools.product( - compute_units, - backends, - [2, 4, 8], - [2.0, 3.0] - ), + itertools.product(compute_units, backends, [2, 4, 8], [2.0, 3.0]), ) def test_builder_to_backend_stress(self, compute_unit, backend, dim, alpha): shape_x = np.array([dim, dim]) @@ -1079,21 +1015,22 @@ def build(x): ) -class TestInputWeightDifferentDtypes: +class TestInputWeightDifferentDtypesErrorOut: """ Starting from IOS17 the alpha/beta can have different dtypes from the input/output, so this - test class is mainly to verify the behaviour of those alpha/beta related activations. + test class mainly verifies the pre-iOS17 behaviour: type inference should error out early. """ + @mark_api_breaking(breaking_opset_version=ct.target.iOS17) @pytest.mark.parametrize( - "opset_version, different_dtype, op_name", + "backend, different_dtype, op_name", itertools.product( - [None, ct.target.iOS17], + backends, [True, False], ["elu", "leaky_relu", "prelu", "thresholded_relu"], ), ) - def test_builder_eval_alpha(self, opset_version, different_dtype, op_name): + def test_builder_eval_alpha(self, backend, different_dtype, op_name): x = np.array([[[-1, 2, -3], [4, -5, 6]]], dtype=np.float32) alpha = np.float16(2.0) if different_dtype else np.float32(2.0) if op_name == "prelu": @@ -1102,17 +1039,18 @@ def test_builder_eval_alpha(self, opset_version, different_dtype, op_name): def prog(): return getattr(mb, op_name)(x=x, alpha=alpha) - if different_dtype and opset_version != ct.target.iOS17: + if different_dtype: # Before iOS17 it should raise error when alpha has different dtype than input/output.
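# From iOS17 onward mixed alpha/x dtypes are accepted (see the class docstring and the @mark_api_breaking decorator above), so this error path is exercised only on pre-iOS17 opsets.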
with pytest.raises(ValueError, match="must have the same data type"): - mb.program(input_specs=[], opset_version=opset_version)(prog) + mb.program(input_specs=[], opset_version=backend.opset_version)(prog) else: - mb.program(input_specs=[], opset_version=opset_version)(prog) + mb.program(input_specs=[], opset_version=backend.opset_version)(prog) + @mark_api_breaking(breaking_opset_version=ct.target.iOS17) @pytest.mark.parametrize( - "opset_version, different_dtype, op_name", + "backend, different_dtype, op_name", itertools.product( - [None, ct.target.iOS17], + backends, [True, False], [ "clamped_relu", @@ -1123,7 +1061,7 @@ def prog(): ], ), ) - def test_builder_eval_alpha_beta(self, opset_version, different_dtype, op_name): + def test_builder_eval_alpha_beta(self, backend, different_dtype, op_name): x = np.array([[[-1, 2, -3], [4, -5, 6]]], dtype=np.float32) alpha = np.float16(2.0) if different_dtype else np.float32(2.0) beta = np.float16(1.0) if different_dtype else np.float32(1.0) @@ -1134,116 +1072,8 @@ def test_builder_eval_alpha_beta(self, opset_version, different_dtype, op_name): def prog(): return getattr(mb, op_name)(x=x, alpha=alpha, beta=beta) - if different_dtype and opset_version != ct.target.iOS17: + if different_dtype: with pytest.raises(ValueError, match="must have the same data type"): - mb.program(input_specs=[], opset_version=opset_version)(prog) + mb.program(input_specs=[], opset_version=backend.opset_version)(prog) else: - mb.program(input_specs=[], opset_version=opset_version)(prog) - - @pytest.mark.parametrize( - "compute_unit, different_dtype, op_name", - itertools.product( - compute_units, [True, False], ["elu", "leaky_relu", "prelu", "thresholded_relu"] - ), - ) - def test_builder_to_backend_numerical_alpha(self, compute_unit, different_dtype, op_name): - x = np.array([[[-1, 2, -3], [4, -5, 6]]], dtype=np.float32) - alpha = np.float16(2.0) if different_dtype else np.float32(2.0) - if op_name == "prelu": - alpha = np.array([2.0, 2.0], dtype=alpha.dtype) - - def calculate_by_np(): - if op_name == "elu": - res = np.copy(x) - res[res < 0] = alpha * (np.exp(res[res < 0]) - 1) - return res - elif op_name == "leaky_relu": - res = np.copy(x) - res[res < 0] *= 2.0 - return res - elif op_name == "prelu": - alpha_br = np.copy(alpha) - for i in range(len(x.shape)): - if i != 1: - alpha_br = np.expand_dims(alpha_br, i) - res = np.maximum(x, 0) + np.minimum(x, 0) * alpha_br - return res - elif op_name == "thresholded_relu": - res = np.copy(x) - res[res < alpha] = 0.0 - return res - else: - raise ValueError(f"Invalid op_name: {op_name}") - - def build(x): - return getattr(mb, op_name)(x=x, alpha=alpha) - - run_compare_builder( - build, - input_placeholders={"x": mb.placeholder(shape=x.shape)}, - input_values={"x": x}, - expected_output_types=x.shape + (types.fp32,), - expected_outputs=calculate_by_np(), - compute_unit=compute_unit, - backend=("mlprogram", "fp16"), - minimum_deployment_target=ct.target.iOS17, - ) - - @pytest.mark.parametrize( - "compute_unit, different_dtype, op_name", - itertools.product( - compute_units, - [True, False], - [ - "clamped_relu", - "linear_activation", - "scaled_tanh", - "sigmoid_hard", - "softplus_parametric", - ], - ), - ) - def test_builder_to_backend_numerical_alpha_beta(self, compute_unit, different_dtype, op_name): - x = np.array([[[-1, 2, -3], [4, -5, 6]]], dtype=np.float32) - alpha = np.float16(2.0) if different_dtype else np.float32(2.0) - beta = np.float16(1.0) if different_dtype else np.float32(1.0) - if op_name == "softplus_parametric": 
- alpha = np.array([2.0, 2.0], dtype=alpha.dtype) - beta = np.array([1.0, 1.0], dtype=beta.dtype) - - def calculate_by_np(): - if op_name == "clamped_relu": - return np.minimum(np.maximum(x, 0), beta) + np.minimum( - np.minimum(x, 0) * alpha, beta - ) - elif op_name == "linear_activation": - return x * alpha + beta - elif op_name == "scaled_tanh": - return alpha * np.tanh(x * beta) - elif op_name == "sigmoid_hard": - return np.minimum(np.maximum((alpha * x) + beta, 0), 1) - elif op_name == "softplus_parametric": - alpha_br = alpha - beta_br = beta - for i in range(len(x.shape)): - if i != 1: - alpha_br = np.expand_dims(alpha_br, i) - beta_br = np.expand_dims(beta_br, i) - res = alpha_br * np.log(np.exp(x * beta_br) + 1) - return res - else: - raise ValueError(f"Invalid op_name: {op_name}") - - def build(x): - return getattr(mb, op_name)(x=x, alpha=alpha, beta=beta) - - run_compare_builder( - build, - input_placeholders={"x": mb.placeholder(shape=x.shape)}, - input_values={"x": x}, - expected_output_types=x.shape + (types.fp32,), - expected_outputs=calculate_by_np(), - compute_unit=compute_unit, - backend=("mlprogram", "fp16"), - minimum_deployment_target=ct.target.iOS17, - ) + mb.program(input_specs=[], opset_version=backend.opset_version)(prog) diff --git a/coremltools/converters/mil/mil/ops/tests/test_control_flow.py b/coremltools/converters/mil/mil/ops/tests/iOS14/test_control_flow.py similarity index 77% rename from coremltools/converters/mil/mil/ops/tests/test_control_flow.py rename to coremltools/converters/mil/mil/ops/tests/iOS14/test_control_flow.py index 11711058a..a8f0232a2 100644 --- a/coremltools/converters/mil/mil/ops/tests/test_control_flow.py +++ b/coremltools/converters/mil/mil/ops/tests/iOS14/test_control_flow.py @@ -10,16 +10,18 @@ from coremltools.converters.mil.mil import Builder as mb from coremltools.converters.mil.mil import get_new_symbol, types -from coremltools.converters.mil.testing_reqs import backends, compute_units +from coremltools.converters.mil.mil.ops.tests.iOS14 import backends +from coremltools.converters.mil.mil.ops.tests.testing_utils import ( + UNK_SYM, + construct_inputs_from_placeholders, + run_compare_builder, +) +from coremltools.converters.mil.testing_reqs import compute_units from coremltools.converters.mil.testing_utils import random_gen, ssa_fn -from .testing_utils import UNK_SYM, construct_inputs_from_placeholders, run_compare_builder - class TestSelect: - @pytest.mark.parametrize( - "compute_unit, backend", itertools.product(compute_units, backends) - ) + @pytest.mark.parametrize("compute_unit, backend", itertools.product(compute_units, backends)) def test_builder_to_backend_smoke(self, compute_unit, backend): cond_val = np.array([[3, 0, 0], [0, 4, 0], [5, 6, 0]], dtype=np.float32) a_val = np.array([[3, 1, 1], [1, 4, 1], [5, 6, 1]], dtype=np.float32) @@ -38,9 +40,7 @@ def build(cond, a, b): expected_output_types = [(3, 3, types.fp32)] expected_outputs = [ - np.array( - [[3.0, 2.0, 2.0], [2.0, 4.0, 2.0], [5.0, 6.0, 2.0]], dtype=np.float32 - ) + np.array([[3.0, 2.0, 2.0], [2.0, 4.0, 2.0], [5.0, 6.0, 2.0]], dtype=np.float32) ] run_compare_builder( @@ -138,13 +138,16 @@ def build(a): shape = tuple(np.random.randint(1, 5, size=len(SYMBOLIC_SHAPE))) a = np.random.rand(*shape) input_values = {"a": a} + expected_outputs = [ + VALUE * np.ones(shape), + ] run_compare_builder( build, input_placeholders, input_values, expected_output_types=[SYMBOLIC_SHAPE + (types.fp32,)], - expected_outputs=[VALUE], + expected_outputs=expected_outputs, 
inputs=construct_inputs_from_placeholders(input_placeholders, upper_bound=10), compute_unit=compute_unit, backend=backend, @@ -174,9 +177,14 @@ def test_builder_eval_scalar(self): assert isinstance(res.val, np.float32) np.testing.assert_allclose(np.float32(1), res.val) + class TestCond: @pytest.mark.parametrize( - "compute_unit, backend", itertools.product(compute_units, backends,) + "compute_unit, backend", + itertools.product( + compute_units, + backends, + ), ) def test_builder_to_backend_smoke(self, compute_unit, backend): input_placeholders = { @@ -186,10 +194,10 @@ def test_builder_to_backend_smoke(self, compute_unit, backend): def build(a, b): def true_fn(): - return mb.add(x=b, y=1.), mb.mul(x=b, y=2.) + return mb.add(x=b, y=1.0), mb.mul(x=b, y=2.0) def false_fn(): - return mb.add(x=b, y=-1.), mb.mul(x=b, y=-2.) + return mb.add(x=b, y=-1.0), mb.mul(x=b, y=-2.0) pred = mb.squeeze(x=a) return mb.cond(pred=pred, _true_fn=true_fn, _false_fn=false_fn) @@ -221,7 +229,11 @@ def false_fn(): class TestWhileLoop: @pytest.mark.parametrize( - "compute_unit, backend", itertools.product(compute_units, backends,) + "compute_unit, backend", + itertools.product( + compute_units, + backends, + ), ) def test_builder_to_backend_smoke(self, compute_unit, backend): def body(a, b): @@ -263,7 +275,11 @@ def build(a, b): ) @pytest.mark.parametrize( - "compute_unit, backend", itertools.product(compute_units, backends,) + "compute_unit, backend", + itertools.product( + compute_units, + backends, + ), ) def test_builder_to_backend_power(self, compute_unit, backend): @@ -280,8 +296,7 @@ def body(res, bx): def cond(res, bx): return mb.less(x=bx, y=b) - res, ignored = mb.while_loop(_cond=cond, _body=body, - loop_vars=([1.], [0.])) + res, ignored = mb.while_loop(_cond=cond, _body=body, loop_vars=([1.0], [0.0])) return res input_values = { @@ -307,11 +322,17 @@ def cond(res, bx): ) @pytest.mark.parametrize( - "compute_unit, backend", itertools.product(compute_units, backends,) + "compute_unit, backend", + itertools.product( + compute_units, + backends, + ), ) def test_builder_to_backend_nested(self, compute_unit, backend): - if backend[0] == 'neuralnetwork': - pytest.xfail("rdar://96862073 (test_control_folw::TestWhileLoop::test_builder_to_backend_nested failing on nnv1)") + if backend.backend == "neuralnetwork": + pytest.xfail( + "rdar://96862073 (test_control_folw::TestWhileLoop::test_builder_to_backend_nested failing on nnv1)" + ) input_placeholders = { "x": mb.placeholder(shape=(1,)), @@ -327,8 +348,8 @@ def build(x, y): # return i, j # Create const outside of while loop for testing purpose - two = mb.const(val=[2.], name='const_two') - one = mb.const(val=[1.], name='const_one') + two = mb.const(val=[2.0], name="const_two") + one = mb.const(val=[1.0], name="const_one") def cond2(i): return mb.less(x=mb.mul(x=two, y=i), y=mb.add(x=i, y=two)) @@ -340,12 +361,10 @@ def cond1(i, j): return mb.less(x=i, y=j) def body1(i, j): - new_i = mb.while_loop(_cond=cond2, _body=body2, - loop_vars=(i,)) + new_i = mb.while_loop(_cond=cond2, _body=body2, loop_vars=(i,)) return mb.add(x=new_i, y=two), j - return mb.while_loop(_cond=cond1, _body=body1, - loop_vars=(x, y)) + return mb.while_loop(_cond=cond1, _body=body1, loop_vars=(x, y)) input_values = { "x": np.array([0], dtype=np.float32), @@ -374,7 +393,11 @@ def body1(i, j): class TestList: @pytest.mark.parametrize( - "compute_unit, backend", itertools.product(compute_units, backends,) + "compute_unit, backend", + itertools.product( + compute_units, + backends, + ), ) def 
test_builder_to_backend_smoke(self, compute_unit, backend): elem_shape = (2,) @@ -428,7 +451,11 @@ def build(a, b): ) @pytest.mark.parametrize( - "compute_unit, backend", itertools.product(compute_units, backends,) + "compute_unit, backend", + itertools.product( + compute_units, + backends, + ), ) def test_builder_to_backend_while(self, compute_unit, backend): # The while_loop appends [1, 2]*i to `ls` for each iteration @@ -485,3 +512,79 @@ def build(num_iters, update): compute_unit=compute_unit, backend=backend, ) + + +class TestConst: + @pytest.mark.parametrize( + "compute_unit, backend, dtype", + itertools.product( + compute_units, + backends, + [ + np.int32, + np.int64, + np.float16, + np.float32, + np.float64, + ], + ), + ) + def test_builder_to_backend_smoke(self, compute_unit, backend, dtype): + t = np.random.randint(0, 5, (4, 2)).astype(np.float32) + constant = np.random.randint(0, 5, (4, 2)).astype(dtype) + input_placeholders = { + "x": mb.placeholder(shape=t.shape), + } + input_values = {"x": t} + + def build(x): + y = mb.const(val=constant) + y = mb.cast(x=y, dtype="fp32") + return mb.add(x=x, y=y) + + expected_output_types = (4, 2, types.fp32) + expected_outputs = t + constant.astype(np.float32) + + run_compare_builder( + build, + input_placeholders, + input_values, + expected_output_types, + expected_outputs, + compute_unit=compute_unit, + backend=backend, + ) + + @pytest.mark.parametrize( + "compute_unit, backend, dtype", + itertools.product( + compute_units, + backends, + ( + np.int8, + np.uint8, + np.int16, + np.uint16, + np.int32, + np.int64, + np.float16, + np.float32, + np.float64, + ), + ), + ) + def test_const_type(self, compute_unit, backend, dtype): + """Makes sure the ndarray in const has the correct type.""" + @mb.program(input_specs=[], opset_version=backend.opset_version) + def prog(): + return mb.const(val=np.random.randint(0, 5, (4, 2)).astype(dtype)) + + const_op = prog.functions["main"].find_ops(op_type="const")[0] + + if dtype == np.int64: + target_dtype = np.int32 + elif dtype == np.float64: + target_dtype = np.float32 + else: + target_dtype = dtype + assert const_op.outputs[0].dtype == types.numpy_type_to_builtin_type(target_dtype) diff --git a/coremltools/converters/mil/mil/ops/tests/test_conv.py b/coremltools/converters/mil/mil/ops/tests/iOS14/test_conv.py similarity index 72% rename from coremltools/converters/mil/mil/ops/tests/test_conv.py rename to coremltools/converters/mil/mil/ops/tests/iOS14/test_conv.py index d1404382e..0af19fa19 100644 --- a/coremltools/converters/mil/mil/ops/tests/test_conv.py +++ b/coremltools/converters/mil/mil/ops/tests/iOS14/test_conv.py @@ -9,18 +9,21 @@ import pytest import coremltools as ct -from coremltools.converters.mil import testing_reqs +from coremltools._deps import _HAS_TORCH, MSG_TORCH_NOT_FOUND from coremltools.converters.mil.mil import Builder as mb from coremltools.converters.mil.mil import get_new_symbol, types -from coremltools.converters.mil.testing_reqs import backends, compute_units -from coremltools.models.utils import _macos_version +from coremltools.converters.mil.mil.ops.tests.iOS14 import backends +from coremltools.converters.mil.mil.ops.tests.testing_utils import run_compare_builder +from coremltools.converters.mil.testing_reqs import compute_units +from coremltools.converters.mil.testing_utils import random_gen -from .testing_utils import run_compare_builder +if _HAS_TORCH: + import torch + import torch.nn as nn class TestConvTranspose: - - @pytest.mark.skipif(not testing_reqs._HAS_TORCH, 
reason="PyTorch not installed.") + @pytest.mark.skipif(not _HAS_TORCH, reason=MSG_TORCH_NOT_FOUND) @pytest.mark.parametrize( ",".join( [ @@ -28,53 +31,56 @@ class TestConvTranspose: "backend", "conv_dim", "config", + "x_weight_dtype", ] ), itertools.product( compute_units, backends, ["conv1d", "conv2d", "conv3d"], - [{ - "padding": (1, 2, 3), - "DHWKdKhKw": (10, 12, 14, 3, 2, 4), - "stride": (2, 1, 1), - "dilation": (1, 1, 1), - "has_bias": False, - "groups": 1, - "test_symbolic": False, - "test_output_shape": True, - }, - { - "padding": (2, 2, 2), - "DHWKdKhKw": (10, 12, 14, 3, 2, 4), - "stride": (2, 2, 2), - "dilation": (2, 1, 1), - "has_bias": False, - "groups": 2, - "test_symbolic": True, - "test_output_shape": False, - }, - { - "padding": (1, 2, 3), - "DHWKdKhKw": (7, 7, 7, 2, 2, 2), - "stride": (2, 2, 2), - "dilation": (2, 1, 1), - "has_bias": True, - "groups": 1, - "test_symbolic": True, - "test_output_shape": False, - }, - { - "padding": (2, 2, 2), - "DHWKdKhKw": (7, 7, 7, 2, 2, 2), - "stride": (2, 1, 1), - "dilation": (1, 1, 1), - "has_bias": True, - "groups": 2, - "test_symbolic": False, - "test_output_shape": False, - }, + [ + { + "padding": (1, 2, 3), + "DHWKdKhKw": (10, 12, 14, 3, 2, 4), + "stride": (2, 1, 1), + "dilation": (1, 1, 1), + "has_bias": False, + "groups": 1, + "test_symbolic": False, + "test_output_shape": True, + }, + { + "padding": (2, 2, 2), + "DHWKdKhKw": (10, 12, 14, 3, 2, 4), + "stride": (2, 2, 2), + "dilation": (2, 1, 1), + "has_bias": False, + "groups": 2, + "test_symbolic": True, + "test_output_shape": False, + }, + { + "padding": (1, 2, 3), + "DHWKdKhKw": (7, 7, 7, 2, 2, 2), + "stride": (2, 2, 2), + "dilation": (2, 1, 1), + "has_bias": True, + "groups": 1, + "test_symbolic": True, + "test_output_shape": False, + }, + { + "padding": (2, 2, 2), + "DHWKdKhKw": (7, 7, 7, 2, 2, 2), + "stride": (2, 1, 1), + "dilation": (1, 1, 1), + "has_bias": True, + "groups": 2, + "test_symbolic": False, + "test_output_shape": False, + }, ], + [(np.float32, np.float32), (np.float16, np.float16)], ), ) def test_builder_to_backend_stress( @@ -83,6 +89,7 @@ def test_builder_to_backend_stress( backend, conv_dim, config, + x_weight_dtype, ): padding = config["padding"] DHWKdKhKw = config["DHWKdKhKw"] @@ -96,8 +103,8 @@ def test_builder_to_backend_stress( D, H, W, Kd, Kh, Kw = DHWKdKhKw N, C_in, C_out = 1, 1 * groups, 2 * groups - import torch - import torch.nn as nn + x_dtype, weight_bias_dtype = x_weight_dtype + x_builtin_dtype = types.numpy_type_to_builtin_type(x_dtype) isDeconv1d = conv_dim == "conv1d" isDeconv2d = conv_dim == "conv2d" @@ -160,13 +167,13 @@ def test_builder_to_backend_stress( ] wts = m.state_dict() - weight = wts["weight"].detach().numpy() - bias = wts["bias"].detach().numpy() if has_bias else None + weight = wts["weight"].detach().numpy().astype(weight_bias_dtype) + bias = wts["bias"].detach().numpy().astype(weight_bias_dtype) if has_bias else None input = torch.randn(*input_shape) output = m(input) - output = output.detach().numpy() - input = input.detach().numpy() + output = output.detach().numpy().astype(x_dtype) + input = input.detach().numpy().astype(x_dtype) output_shape = list(output.shape) if test_symbolic: @@ -179,7 +186,7 @@ def test_builder_to_backend_stress( expected_output_types = tuple(output_shape[:]) + (types.fp32,) expected_outputs = [output] - input_placeholders = {"x": mb.placeholder(shape=input_shape)} + input_placeholders = {"x": mb.placeholder(shape=input_shape, dtype=x_builtin_dtype)} input_values = {"x": input} def build(x): @@ -206,12 
+213,12 @@ def build(x): expected_outputs, compute_unit=compute_unit, backend=backend, + atol=1e-3 if x_dtype == np.float16 and backend.backend == "neuralnetwork" else 1e-4, ) class TestConv: - - @pytest.mark.skipif(not testing_reqs._HAS_TORCH, reason="PyTorch not installed.") + @pytest.mark.skipif(not _HAS_TORCH, reason=MSG_TORCH_NOT_FOUND) @pytest.mark.parametrize( "compute_unit, backend, padding_mode, conv_dim", itertools.product( @@ -222,7 +229,6 @@ class TestConv: ), ) def test_padding_mode_stress(self, compute_unit, backend, padding_mode, conv_dim): - import torch def rotation_tensor(tensor): assert tensor.shape[0] == tensor.shape[1] == 1 tensor = tensor[0][0] @@ -231,18 +237,14 @@ def rotation_tensor(tensor): return np.expand_dims(new_tensor, axis=(0, 1)) if conv_dim == "conv3d" and padding_mode == "same_lower": - if backend[0] == "neuralnetwork": + if backend.backend == "neuralnetwork": pytest.skip("same_lower mode not supported for conv3d in neuralnetwork backend") - if padding_mode == "same_lower" and backend[0] == "mlprogram" and ct.utils._macos_version() < (13, 0): - pytest.skip("same_lower pad_type not supported in macOS12 or older.") - - minimum_deployment_target = ct.target.iOS16 if backend[0] == "mlprogram" else None - if _macos_version() < (13, 0) and minimum_deployment_target == ct.target.iOS16: - pytest.skip("iOS16 target not available on macOS 13") + if padding_mode == "same_lower" and backend.opset_version == ct.target.iOS15: + pytest.skip("same_lower pad_type not supported iOS15 opset") batch, in_channels, out_channels = 1, 1, 1 - input_shape = (batch, in_channels, 4, 5, 6) # batch, channel, height, width + input_shape = (batch, in_channels, 4, 5, 6) # batch, channel, height, width kernel_size = (2, 4, 3) torch_padding_mode = padding_mode if padding_mode != "same_lower" else "same" @@ -296,9 +298,13 @@ def rotation_tensor(tensor): # (1) Rotate the input value # (2) Rotate the kernel value # (3) Rotate the torch out - rotated_input = torch.tensor(rotation_tensor(input.detach().numpy()), dtype=torch.float32) - rotated_weight = torch.tensor(rotation_tensor(weight.detach().numpy()), dtype=torch.float32) - m.load_state_dict({'weight': rotated_weight}, strict=False) + rotated_input = torch.tensor( + rotation_tensor(input.detach().numpy()), dtype=torch.float32 + ) + rotated_weight = torch.tensor( + rotation_tensor(weight.detach().numpy()), dtype=torch.float32 + ) + m.load_state_dict({"weight": rotated_weight}, strict=False) output = m(rotated_input).detach().numpy() output = rotation_tensor(output) else: @@ -326,11 +332,9 @@ def build(x): expected_outputs, compute_unit=compute_unit, backend=backend, - minimum_deployment_target=minimum_deployment_target, ) - - @pytest.mark.skipif(not testing_reqs._HAS_TORCH, reason="PyTorch not installed.") + @pytest.mark.skipif(not _HAS_TORCH, reason=MSG_TORCH_NOT_FOUND) @pytest.mark.parametrize( ",".join( [ @@ -338,49 +342,52 @@ def build(x): "backend", "conv_dim", "config", + "x_weight_dtype", ] ), itertools.product( compute_units, backends, ["conv1d", "conv2d", "conv3d"], - [{ - "padding": (1, 1, 1), - "DHWKdKhKw": (10, 12, 14, 3, 2, 4), - "stride": (2, 1, 1), - "dilation": (1, 1, 1), - "has_bias": False, - "groups": 1, - "symbolic": False, - }, - { - "padding": (2, 2, 2), - "DHWKdKhKw": (10, 12, 14, 3, 2, 4), - "stride": (2, 2, 2), - "dilation": (2, 1, 1), - "has_bias": False, - "groups": 2, - "symbolic": True, - }, - { - "padding": (1, 1, 1), - "DHWKdKhKw": (5, 5, 5, 2, 2, 2), - "stride": (2, 2, 2), - "dilation": (2, 1, 1), - 
"has_bias": True, - "groups": 1, - "symbolic": True, - }, - { - "padding": (2, 2, 2), - "DHWKdKhKw": (5, 5, 5, 2, 2, 2), - "stride": (2, 1, 1), - "dilation": (1, 1, 1), - "has_bias": True, - "groups": 2, - "symbolic": False, - }, - ], + [ + { + "padding": (1, 1, 1), + "DHWKdKhKw": (10, 12, 14, 3, 2, 4), + "stride": (2, 1, 1), + "dilation": (1, 1, 1), + "has_bias": False, + "groups": 1, + "symbolic": False, + }, + { + "padding": (2, 2, 2), + "DHWKdKhKw": (10, 12, 14, 3, 2, 4), + "stride": (2, 2, 2), + "dilation": (2, 1, 1), + "has_bias": False, + "groups": 2, + "symbolic": True, + }, + { + "padding": (1, 1, 1), + "DHWKdKhKw": (5, 5, 5, 2, 2, 2), + "stride": (2, 2, 2), + "dilation": (2, 1, 1), + "has_bias": True, + "groups": 1, + "symbolic": True, + }, + { + "padding": (2, 2, 2), + "DHWKdKhKw": (5, 5, 5, 2, 2, 2), + "stride": (2, 1, 1), + "dilation": (1, 1, 1), + "has_bias": True, + "groups": 2, + "symbolic": False, + }, + ], + [(np.float32, np.float32), (np.float16, np.float16)], ), ) def test_builder_to_backend_stress( @@ -389,6 +396,7 @@ def test_builder_to_backend_stress( backend, conv_dim, config, + x_weight_dtype, ): padding = config["padding"] DHWKdKhKw = config["DHWKdKhKw"] @@ -401,8 +409,8 @@ def test_builder_to_backend_stress( D, H, W, Kd, Kh, Kw = DHWKdKhKw N, C_in, C_out = 1, 1 * groups, 2 * groups - import torch - import torch.nn as nn + x_dtype, weight_bias_dtype = x_weight_dtype + x_builtin_dtype = types.numpy_type_to_builtin_type(x_dtype) isConv1d = conv_dim == "conv1d" isConv2d = conv_dim == "conv2d" @@ -464,17 +472,18 @@ def test_builder_to_backend_stress( ] wts = m.state_dict() - weight = wts["weight"].detach().numpy() - bias = wts["bias"].detach().numpy() if has_bias else None + weight = wts["weight"].detach().numpy().astype(weight_bias_dtype) + bias = wts["bias"].detach().numpy().astype(weight_bias_dtype) if has_bias else None # PyTorch and CoreML weight format is same # PyTorch weight format: C_out, C_in, H, W # MIL weight format: C_out, C_in, H, W - input = torch.randn(*input_shape) + input = random_gen(input_shape) + input = torch.Tensor(input) output = m(input) - output = output.detach().numpy() - input = input.detach().numpy() + output = output.detach().numpy().astype(x_dtype) + input = input.detach().numpy().astype(x_dtype) output_shape = list(output.shape) if symbolic: @@ -484,10 +493,10 @@ def test_builder_to_backend_stress( input_shape[0] = symbolic_batch_size output_shape[0] = symbolic_batch_size - expected_output_types = tuple(output_shape[:]) + (types.fp32,) + expected_output_types = tuple(output_shape[:]) + (x_builtin_dtype,) expected_outputs = [output] - input_placeholders = {"x": mb.placeholder(shape=input_shape)} + input_placeholders = {"x": mb.placeholder(shape=input_shape, dtype=x_builtin_dtype)} input_values = {"x": input} def build(x): @@ -512,9 +521,10 @@ def build(x): expected_outputs, compute_unit=compute_unit, backend=backend, + atol=1e-3 if x_dtype == np.float16 and backend.backend == "neuralnetwork" else 1e-4, ) - @pytest.mark.skipif(not testing_reqs._HAS_TORCH, reason="PyTorch not installed.") + @pytest.mark.skipif(not _HAS_TORCH, reason=MSG_TORCH_NOT_FOUND) @pytest.mark.parametrize( ",".join( [ @@ -529,42 +539,42 @@ def build(x): backends, ["conv1d", "conv2d"], [ - { - "padding": (1, 1, 1), - "DHWKdKhKw": (10, 12, 14, 3, 2, 4), - "stride": (2, 1, 1), - "dilation": (1, 1, 1), - "has_bias": False, - "groups": 1, - "symbolic": False, - }, - { - "padding": (2, 2, 2), - "DHWKdKhKw": (10, 12, 14, 3, 2, 4), - "stride": (2, 2, 2), - "dilation": (2, 1, 
1), - "has_bias": False, - "groups": 2, - "symbolic": True, - }, - { - "padding": (1, 1, 1), - "DHWKdKhKw": (5, 5, 5, 2, 2, 2), - "stride": (2, 2, 2), - "dilation": (2, 1, 1), - "has_bias": True, - "groups": 1, - "symbolic": True, - }, - { - "padding": (2, 2, 2), - "DHWKdKhKw": (5, 5, 5, 2, 2, 2), - "stride": (2, 1, 1), - "dilation": (1, 1, 1), - "has_bias": True, - "groups": 2, - "symbolic": False, - }, + { + "padding": (1, 1, 1), + "DHWKdKhKw": (10, 12, 14, 3, 2, 4), + "stride": (2, 1, 1), + "dilation": (1, 1, 1), + "has_bias": False, + "groups": 1, + "symbolic": False, + }, + { + "padding": (2, 2, 2), + "DHWKdKhKw": (10, 12, 14, 3, 2, 4), + "stride": (2, 2, 2), + "dilation": (2, 1, 1), + "has_bias": False, + "groups": 2, + "symbolic": True, + }, + { + "padding": (1, 1, 1), + "DHWKdKhKw": (5, 5, 5, 2, 2, 2), + "stride": (2, 2, 2), + "dilation": (2, 1, 1), + "has_bias": True, + "groups": 1, + "symbolic": True, + }, + { + "padding": (2, 2, 2), + "DHWKdKhKw": (5, 5, 5, 2, 2, 2), + "stride": (2, 1, 1), + "dilation": (1, 1, 1), + "has_bias": True, + "groups": 2, + "symbolic": False, + }, ], ), ) @@ -582,18 +592,19 @@ def test_builder_to_backend_stress_weights_input( groups = config["groups"] symbolic = config["symbolic"] - if backend[0] == "neuralnetwork" and groups > 1: - pytest.skip("dynamic conv with groups > 1 is not supported on the neuralnetwork backend") + if backend.backend == "neuralnetwork" and groups > 1: + pytest.skip( + "dynamic conv with groups > 1 is not supported on the neuralnetwork backend" + ) - if backend[0] == "mlprogram" and compute_unit != ct.ComputeUnit.CPU_ONLY: - pytest.xfail("rdar://97398343 (test_builder_to_backend_stress_weights_input is failing on mlprogram + GPU)") + if backend.backend == "mlprogram" and compute_unit != ct.ComputeUnit.CPU_ONLY: + pytest.xfail( + "rdar://97398343 (test_builder_to_backend_stress_weights_input is failing on mlprogram + GPU)" + ) D, H, W, Kd, Kh, Kw = DHWKdKhKw N, C_in, C_out = 1, 1 * groups, 2 * groups - import torch - import torch.nn as nn - isConv1d = conv_dim == "conv1d" isConv2d = conv_dim == "conv2d" @@ -650,7 +661,10 @@ def test_builder_to_backend_stress_weights_input( expected_output_types = tuple(output_shape[:]) + (types.fp32,) expected_outputs = [output] - input_placeholders = {"x": mb.placeholder(shape=input_shape), "input_weight":mb.placeholder(shape=weight.shape)} + input_placeholders = { + "x": mb.placeholder(shape=input_shape), + "input_weight": mb.placeholder(shape=weight.shape), + } input_values = {"x": input, "input_weight": weight} def build(x, input_weight): @@ -676,9 +690,7 @@ def build(x, input_weight): backend=backend, ) - @pytest.mark.parametrize( - "compute_unit, backend", itertools.product(compute_units, backends) - ) + @pytest.mark.parametrize("compute_unit, backend", itertools.product(compute_units, backends)) def test_conv_bias_fusion(self, compute_unit, backend): """ Test conv bias fusion when const input. 
@@ -697,7 +709,7 @@ def test_conv_bias_fusion(self, compute_unit, backend): def build(x): x = mb.conv(x=x, weight=weight) - bias = mb.const(val=[10.]) + bias = mb.const(val=[10.0]) return mb.add(x=x, y=bias) input = np.array([1, 2, 3, 4], dtype=np.float32).reshape((1, 1, 2, 2)) @@ -719,7 +731,6 @@ def build(x): class TestInvalidConvConfig: - @pytest.mark.parametrize( "compute_unit, backend, conv_dim", itertools.product( @@ -740,14 +751,16 @@ def test_invalid_weight(self, compute_unit, backend, conv_dim): while C_in % groups != 0: groups = np.random.randint(low=1, high=C_in + 1) - weight = np.random.rand(C_out, C_in // groups + + np.random.randint(low=1, high=8), *K) * 2.0 - 1.0 + weight = ( + np.random.rand(C_out, C_in // groups + +np.random.randint(low=1, high=8), *K) * 2.0 + - 1.0 + ) def build(x): return mb.conv(x=x, weight=weight, groups=groups) with pytest.raises( - ValueError, - match=r"C_in / groups = [0-9]+/[0-9]+ != weight\[1\] \([0-9]+\)" + ValueError, match=r"C_in / groups = [0-9]+/[0-9]+ != weight\[1\] \([0-9]+\)" ): run_compare_builder( build, @@ -782,8 +795,7 @@ def build(x): return mb.conv(x=x, weight=weight, bias=bias) with pytest.raises( - ValueError, - match=r"# of bias values [0-9]+ not equal to # output channels [0-9]+" + ValueError, match=r"# of bias values [0-9]+ not equal to # output channels [0-9]+" ): run_compare_builder( build, @@ -815,8 +827,7 @@ def build(x): return mb.conv(x=x, weight=weight) with pytest.raises( - ValueError, - match=r"spatial dimension [0-9]+ has invalid output size -?[0-9]+" + ValueError, match=r"spatial dimension [0-9]+ has invalid output size -?[0-9]+" ): run_compare_builder( build, @@ -849,8 +860,7 @@ def build(x): return mb.conv(x=x, weight=weight, dilations=dilations) with pytest.raises( - ValueError, - match=r"spatial dimension [0-9]+ has invalid output size -?[0-9]+" + ValueError, match=r"spatial dimension [0-9]+ has invalid output size -?[0-9]+" ): run_compare_builder( build, @@ -886,8 +896,7 @@ def build(x): return mb.conv(x=x, weight=weight, groups=groups) with pytest.raises( - ValueError, - match=r"# of input channels [0-9]+ not divisible by groups [0-9]+" + ValueError, match=r"# of input channels [0-9]+ not divisible by groups [0-9]+" ): run_compare_builder( build, @@ -920,16 +929,18 @@ def test_invalid_rank(self, compute_unit, backend, conv_dim): pad = tuple(np.random.randint(low=1, high=4, size=2 * conv_dim + 3)) def build(x): - return mb.conv(x=x, weight=weight, strides=strides, dilations=dilations, pad_type="custom", pad=pad) + return mb.conv( + x=x, weight=weight, strides=strides, dilations=dilations, pad_type="custom", pad=pad + ) with pytest.raises( ValueError, match=r"input_shape \(length [0-9]+\), " - r"kernel_shape \(length [0-9]+\), " - r"strides \(length [0-9]+\), " - r"dilations \(length [0-9]+\), " - r"and custom_pad \(length [0-9]+\) divided by two " - r"must all be the same length", + r"kernel_shape \(length [0-9]+\), " + r"strides \(length [0-9]+\), " + r"dilations \(length [0-9]+\), " + r"and custom_pad \(length [0-9]+\) divided by two " + r"must all be the same length", ): run_compare_builder( build, diff --git a/coremltools/converters/mil/mil/ops/tests/test_elementwise_binary.py b/coremltools/converters/mil/mil/ops/tests/iOS14/test_elementwise_binary.py similarity index 97% rename from coremltools/converters/mil/mil/ops/tests/test_elementwise_binary.py rename to coremltools/converters/mil/mil/ops/tests/iOS14/test_elementwise_binary.py index fa2790ce7..4b5342a67 100644 --- 
a/coremltools/converters/mil/mil/ops/tests/test_elementwise_binary.py +++ b/coremltools/converters/mil/mil/ops/tests/iOS14/test_elementwise_binary.py @@ -9,11 +9,11 @@ from coremltools.converters.mil.mil import Builder as mb from coremltools.converters.mil.mil import get_new_symbol, types -from coremltools.converters.mil.testing_reqs import backends, compute_units +from coremltools.converters.mil.mil.ops.tests.iOS14 import backends +from coremltools.converters.mil.mil.ops.tests.testing_utils import run_compare_builder +from coremltools.converters.mil.testing_reqs import compute_units from coremltools.converters.mil.testing_utils import ssa_fn -from .testing_utils import run_compare_builder - class TestElementwiseBinary: # All in this test share the same backends @@ -75,9 +75,7 @@ def test_builder_to_backend_smoke(self, compute_unit, backend, mode): elif mode == "pow": x = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32) y = np.array([[-1, 2, -3], [4, -5, 6]], dtype=np.float32) - expected_outputs = np.array( - [[1, 4, 0.037], [256, 0.00032, 46656]], dtype=np.float32 - ) + expected_outputs = np.array([[1, 4, 0.037], [256, 0.00032, 46656]], dtype=np.float32) build = lambda x, y: mb.pow(x=x, y=y) elif mode == "real_div": @@ -185,9 +183,7 @@ def test_builder_mul(self): def test_builder_pow(self): x = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32) y = np.array([[-1, 2, -3], [4, -5, 6]], dtype=np.float32) - expected_outputs = np.array( - [[1, 4, 0.037], [256, 0.00032, 46656]], dtype=np.float32 - ) + expected_outputs = np.array([[1, 4, 0.037], [256, 0.00032, 46656]], dtype=np.float32) v = mb.pow(x=x, y=y) np.testing.assert_allclose(expected_outputs, v.val, atol=1e-04, rtol=1e-05) @@ -238,7 +234,7 @@ def test_real_div_int_builder_to_backend(self, compute_unit, backend): x = np.array([[10, 20, 30], [40, 50, 60]], dtype=np.float32) y = np.array([[11, 12, 13], [14, 15, 16]], dtype=np.float32) - if backend[0] == "neuralnetwork": + if backend.backend == "neuralnetwork": dtype = np.float32 else: dtype = np.int32 @@ -281,7 +277,7 @@ def test_builder_to_backend_smoke(self, compute_unit, backend): input_values = {"x": x, "y": y} def build(x, y): - return mb.equal(x=x, y=y), mb.equal(x=-3., y=y) + return mb.equal(x=x, y=y), mb.equal(x=-3.0, y=y) expected_output_types = [ (2, 3, types.bool), diff --git a/coremltools/converters/mil/mil/ops/tests/test_elementwise_unary.py b/coremltools/converters/mil/mil/ops/tests/iOS14/test_elementwise_unary.py similarity index 74% rename from coremltools/converters/mil/mil/ops/tests/test_elementwise_unary.py rename to coremltools/converters/mil/mil/ops/tests/iOS14/test_elementwise_unary.py index aa91f944c..8bdae53ac 100644 --- a/coremltools/converters/mil/mil/ops/tests/test_elementwise_unary.py +++ b/coremltools/converters/mil/mil/ops/tests/iOS14/test_elementwise_unary.py @@ -4,26 +4,19 @@ # found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause import itertools -from unittest.mock import patch import numpy as np import pytest import scipy -import coremltools as ct -from coremltools.converters.mil import testing_reqs from coremltools.converters.mil.mil import Builder as mb from coremltools.converters.mil.mil import Function, get_new_symbol, types -from coremltools.converters.mil.mil.passes.pass_pipeline import PassPipeline +from coremltools.converters.mil.mil.ops.tests.iOS14 import backends +from coremltools.converters.mil.mil.ops.tests.testing_utils import run_compare_builder from coremltools.converters.mil.mil.types.symbolic import 
is_compatible_symbolic_vector -from coremltools.converters.mil.mil.var import Var +from coremltools.converters.mil.testing_reqs import compute_units from coremltools.converters.mil.testing_utils import ssa_fn -from .testing_utils import run_compare_builder - -backends = testing_reqs.backends -compute_units = testing_reqs.compute_units - class TestElementwiseUnary: # All ops in this test share the same backends @@ -164,9 +157,7 @@ def test_builder_to_backend_smoke(self, compute_unit, backend, mode): build = lambda x: mb.exp(x=x) elif mode == "exp2": val = np.array([[-1, 2, -3], [4, -5, 6]], dtype=np.float32) - expected_outputs = np.array( - [[0.5, 4.0, 0.125], [16, 0.03125, 64]], dtype=np.float32 - ) + expected_outputs = np.array([[0.5, 4.0, 0.125], [16, 0.03125, 64]], dtype=np.float32) build = lambda x: mb.exp2(x=x) elif mode == "floor": @@ -236,7 +227,7 @@ def test_builder_to_backend_smoke(self, compute_unit, backend, mode): elif mode == "square": val = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32) expected_outputs = np.array( - [[1.0, 4.0, 9.0], [16.0, 25.0, 36.]], + [[1.0, 4.0, 9.0], [16.0, 25.0, 36.0]], dtype=np.float32, ) @@ -261,17 +252,13 @@ def test_builder_to_backend_smoke(self, compute_unit, backend, mode): build = lambda x: mb.tanh(x=x) elif mode == "threshold": val = np.array([[-1.2, 2, -3.4], [4.5, -5, 6.7]], dtype=np.float32) - expected_outputs = np.array( - [[1.0, 2, 1.0], [4.5, 1.0, 6.7]], dtype=np.float32 - ) + expected_outputs = np.array([[1.0, 2, 1.0], [4.5, 1.0, 6.7]], dtype=np.float32) build = lambda x: mb.threshold(x=x, alpha=1.0) input_placeholders = {"x": mb.placeholder(shape=val.shape)} input_values = {"x": val} - expected_output_types = ( - (2, 3, types.int32) if mode == "cast" else (2, 3, types.fp32) - ) + expected_output_types = (2, 3, types.int32) if mode == "cast" else (2, 3, types.fp32) run_compare_builder( build, @@ -408,9 +395,7 @@ def test_builder_exp_eval(self): def test_builder_exp2_eval(self): val = np.array([[-1, 2, -3], [4, -5, 6]], dtype=np.float32) v = mb.exp2(x=val) - expected_outputs = np.array( - [[0.5, 4.0, 0.125], [16, 0.03125, 64]], dtype=np.float32 - ) + expected_outputs = np.array([[0.5, 4.0, 0.125], [16, 0.03125, 64]], dtype=np.float32) np.testing.assert_allclose(expected_outputs, v.val, atol=1e-04, rtol=1e-05) @@ -543,97 +528,65 @@ def build(x): output_vars = build(**ssa_func.inputs) assert is_compatible_symbolic_vector(output_vars.sym_val, [get_new_symbol(), 1]) - @pytest.mark.parametrize( - "compute_unit, backend, epsilon", - itertools.product( - compute_units, - backends, - [1e-3, 1e-1, 1.0], - ), - ) - def test_builder_to_backend_stress_inverse( - self, compute_unit, backend, epsilon + @staticmethod + def _test_builder_to_backend_stress_with_epsilon( + compute_unit, + backend, + op_name, + epsilon_val, + x_eps_dtype, ): - x = np.array([[1, -2, 3], [4, -5, 6]], dtype=np.float32) - numpy_pred = 1 / (x + epsilon) + x_dtype, epsilon_dtype = x_eps_dtype - input_placeholder_dict = {"x": mb.placeholder(shape=x.shape)} - input_value_dict = {"x": x} + x = np.array([[1, 2, 3], [4, 5, 6]], dtype=x_dtype) + epsilon = epsilon_dtype(epsilon_val) - def build(x): - return mb.inverse(x=x, epsilon=epsilon) - - expected_output_type = x.shape + (types.fp32,) - run_compare_builder( - build, - input_placeholder_dict, - input_value_dict, - expected_output_type, - numpy_pred, - compute_unit=compute_unit, - backend=backend, - ) - - @pytest.mark.parametrize( - "compute_unit, backend, epsilon", - itertools.product( - compute_units, - backends, - [1e-3, 
1e-1, 1.0], - ), - ) - def test_builder_to_backend_stress_rsqrt( - self, compute_unit, backend, epsilon - ): - x = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32) - numpy_pred = 1.0 / np.sqrt(x + epsilon) - - input_placeholder_dict = {"x": mb.placeholder(shape=x.shape)} - input_value_dict = {"x": x} + def _calculate_by_np(): + if op_name == "inverse": + return 1 / (x + epsilon) + elif op_name == "log": + return np.log(x + epsilon) + elif op_name == "rsqrt": + return 1.0 / np.sqrt(x + epsilon) + else: + raise ValueError(f"Invalid op {op_name}") def build(x): - return mb.rsqrt(x=x, epsilon=epsilon) + return getattr(mb, op_name)(x=x, epsilon=epsilon) - expected_output_type = x.shape + (types.fp32,) + x_mb_dtype = types.numpy_type_to_builtin_type(x_dtype) run_compare_builder( build, - input_placeholder_dict, - input_value_dict, - expected_output_type, - numpy_pred, + input_placeholders={"x": mb.placeholder(shape=x.shape, dtype=x_mb_dtype)}, + input_values={"x": x}, + expected_output_types=x.shape + (x_mb_dtype,), + expected_outputs=_calculate_by_np(), compute_unit=compute_unit, backend=backend, + atol=1e-2 if x_dtype == np.float16 else 1e-4, + rtol=1e-3 if x_dtype == np.float16 else 1e-5, ) @pytest.mark.parametrize( - "compute_unit, backend, epsilon", + "compute_unit, backend, op_name, epsilon_val, x_eps_dtype", itertools.product( compute_units, backends, + ["inverse", "log", "rsqrt"], [1e-3, 1e-1, 1.0], + [(np.float32, np.float32), (np.float16, np.float16)], ), ) - def test_builder_to_backend_stress_log( - self, compute_unit, backend, epsilon + def test_builder_to_backend_stress_with_epsilon( + self, + compute_unit, + backend, + op_name, + epsilon_val, + x_eps_dtype, ): - x = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32) - numpy_pred = np.log(x + epsilon) - - input_placeholder_dict = {"x": mb.placeholder(shape=x.shape)} - input_value_dict = {"x": x} - - def build(x): - return mb.log(x=x, epsilon=epsilon) - - expected_output_type = x.shape + (types.fp32,) - run_compare_builder( - build, - input_placeholder_dict, - input_value_dict, - expected_output_type, - numpy_pred, - compute_unit=compute_unit, - backend=backend, + self._test_builder_to_backend_stress_with_epsilon( + compute_unit, backend, op_name, epsilon_val, x_eps_dtype ) @pytest.mark.parametrize( @@ -644,9 +597,7 @@ def build(x): [("fp16", "fp32"), ("fp32", "fp16")], ), ) - def test_builder_to_backend_stress_cast( - self, compute_unit, backend, src_dst - ): + def test_builder_to_backend_stress_cast(self, compute_unit, backend, src_dst): src_dtype, dst_dtype = src_dst x = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32) numpy_pred = x.astype(dtype=np.float16) @@ -673,89 +624,8 @@ def build(x): backend=backend, ) - @pytest.mark.parametrize( - "compute_unit, backend, src_dtype, dst_dtype", - itertools.product( - compute_units, - [("mlprogram", "fp16")], - [np.float16, np.float32, np.float64, np.int64, np.int32, np.int16, np.uint16], - [np.float16, np.float32, np.float64, np.int64, np.int32, np.int16, np.uint16], - ), - ) - def test_builder_eval_cast_ios17(self, compute_unit, backend, src_dtype, dst_dtype): - x = np.array([[1, 2, 3], [4, 5, 6]], dtype=src_dtype) - dst_dtype_str = types.builtin_to_string( - types.type_mapping.numpy_type_to_builtin_type(dst_dtype) - ) - expected_res = x.astype(dtype=np.float16) - - @mb.program(input_specs=[], opset_version=ct.target.iOS17) - def prog(): - return mb.cast(x=x, dtype=dst_dtype_str) - - main_func = prog.functions["main"] - cast_op = main_func.find_ops(op_type="cast")[0] - 
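# x here is a compile-time constant, so value inference folds the cast and the output's .val can be compared against numpy directly.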
np.testing.assert_allclose(expected_res, cast_op.outputs[0].val, atol=1e-04, rtol=1e-05) - - @pytest.mark.parametrize( - "compute_unit, backend, src_dtype, dst_dtype", - itertools.product( - compute_units, - [("mlprogram", "fp16")], - [np.float16, np.float32, np.int16, np.int32, np.uint16], - [np.float16, np.float32, np.int16, np.int32, np.uint16], - ), - ) - def test_builder_to_backend_cast_ios17(self, compute_unit, backend, src_dtype, dst_dtype): - _SUPPORTED_IO_DTYPES = {types.fp16, types.fp32, types.int32} - x = np.array([[1, 2, 3], [4, 5, 6]], dtype=src_dtype) - src_builtin_dtype = types.type_mapping.numpy_type_to_builtin_type(src_dtype) - dst_builtin_dtype = types.type_mapping.numpy_type_to_builtin_type(dst_dtype) - expected_res = x.astype(dtype=np.float16) - - expected_cast_num = 1 - if src_builtin_dtype not in _SUPPORTED_IO_DTYPES: - # A cast will be inserted for unsupported dtypes inputs. - expected_cast_num += 1 - - # As CoreML IO only allows fp16/32 and int32, the output will be further cast. - expected_res_builtin_dtype = dst_builtin_dtype - if dst_builtin_dtype not in _SUPPORTED_IO_DTYPES: - expected_res_builtin_dtype = ( - types.int32 if types.is_int(dst_builtin_dtype) else types.fp32 - ) - expected_cast_num += 1 - - def build(x): - return mb.cast(x=x, dtype=types.builtin_to_string(dst_builtin_dtype)) - - with patch.object(Var, "_is_nonreplaceable_var") as mocked_is_nonreplaceable_var: - # Mock that the cast is non-replaceable, to make sure it's kept in the graph. - mocked_is_nonreplaceable_var.side_effect = ( - lambda var: var.op and var.op.op_type == "cast" - ) - # Remove the cast optimization pass to make sure all cast are kept in the graph. - pass_pipeline: PassPipeline = PassPipeline.DEFAULT - pass_pipeline.remove_passes( - ["common::cast_optimization", "common::topological_reorder"] - ) - mlmodel = run_compare_builder( - build, - {"x": mb.placeholder(shape=x.shape, dtype=src_builtin_dtype)}, - input_values={"x": x}, - expected_output_types=x.shape + (expected_res_builtin_dtype,), - expected_outputs=expected_res, - compute_unit=compute_unit, - backend=backend, - minimum_deployment_target=ct.target.iOS17, - pass_pipeline=pass_pipeline, - ) - prog = mlmodel._mil_program - cast_ops = prog["main"].find_ops(op_type="cast") - assert len(cast_ops) == expected_cast_num - def test_erf_value_inference(self): - INPUT_SIZE=(2, 3, 4) + INPUT_SIZE = (2, 3, 4) rs = np.random.RandomState(1234) x = rs.random(INPUT_SIZE) @@ -765,7 +635,9 @@ def prog(): ops = list(prog.functions.values())[0].operations assert len(ops) == 2 - assert ops[0].op_type == 'const' + assert ops[0].op_type == "const" erf_op = ops[1] - assert erf_op.op_type == 'erf' - np.testing.assert_allclose(erf_op.value_inference(), scipy.special.erf(x), atol=1e-04, rtol=1e-05) + assert erf_op.op_type == "erf" + np.testing.assert_allclose( + erf_op.value_inference(), scipy.special.erf(x), atol=1e-04, rtol=1e-05 + ) diff --git a/coremltools/converters/mil/mil/ops/tests/iOS14/test_image_resizing.py b/coremltools/converters/mil/mil/ops/tests/iOS14/test_image_resizing.py new file mode 100644 index 000000000..6dfa97db7 --- /dev/null +++ b/coremltools/converters/mil/mil/ops/tests/iOS14/test_image_resizing.py @@ -0,0 +1,548 @@ +# Copyright (c) 2020, Apple Inc. All rights reserved. 
+# +# Use of this source code is governed by a BSD-3-clause license that can be +# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause + +import functools +import itertools + +import numpy as np +import pytest + +import coremltools as ct +from coremltools._deps import _HAS_TORCH, MSG_TORCH_NOT_FOUND +from coremltools.converters.mil.mil import Builder as mb +from coremltools.converters.mil.mil import get_new_symbol, types +from coremltools.converters.mil.mil.ops.tests.iOS14 import backends +from coremltools.converters.mil.mil.ops.tests.testing_utils import ( + mark_api_breaking, + run_compare_builder, +) +from coremltools.converters.mil.testing_reqs import compute_units +from coremltools.converters.mil.testing_utils import random_gen + +if _HAS_TORCH: + import torch + +class TestResizeNearestNeighbor: + @pytest.mark.parametrize("compute_unit, backend", itertools.product(compute_units, backends)) + def test_builder_to_backend_smoke(self, compute_unit, backend): + x_val = np.array([0.37, 6.17], dtype=np.float32).reshape([1, 1, 2, 1]) + input_placeholder_dict = {"x": mb.placeholder(shape=x_val.shape)} + input_value_dict = {"x": x_val} + + def build_model(x): + return [ + mb.resize_nearest_neighbor( + x=x, + target_size_height=2, + target_size_width=1, + ), + mb.resize_nearest_neighbor( + x=x, + target_size_height=2, + target_size_width=3, + ), + ] + + expected_output_types = [ + (1, 1, 2, 1, types.fp32), + (1, 1, 2, 3, types.fp32), + ] + expected_outputs = [ + x_val, + np.array([0.37, 0.37, 0.37, 6.17, 6.17, 6.17], dtype=np.float32).reshape([1, 1, 2, 3]), + ] + + run_compare_builder( + build_model, + input_placeholder_dict, + input_value_dict, + expected_output_types, + expected_outputs, + compute_unit=compute_unit, + backend=backend, + ) + +class TestResizeBilinear: + @pytest.mark.parametrize( + "compute_unit, backend", + itertools.product( + compute_units, + backends, + ), + ) + def test_builder_to_backend_smoke(self, compute_unit, backend): + if backend.backend == "mlprogram": + pytest.xfail( + "Seg fault: rdar://78343191 ((MIL GPU) Core ML Tools Unit Test failures [failure to load or Seg fault])" + ) + + if backend.backend == "neuralnetwork" and compute_unit == ct.ComputeUnit.CPU_ONLY: + pytest.xfail( + "rdar://85318710 (Coremltools Smoke test on ResizeBilinear failing on NNv1 backend.)" + ) + + x = np.array([0, 1], dtype=np.float32).reshape(1, 1, 2) + input_placeholder_dict = {"x": mb.placeholder(shape=x.shape)} + input_value_dict = {"x": x} + + def build_mode_0(x): + return mb.resize_bilinear( + x=x, + target_size_height=1, + target_size_width=5, + sampling_mode="STRICT_ALIGN_CORNERS", + ) + + expected_output_type = (1, 1, 5, types.fp32) + expected_output = np.array([0, 0.25, 0.5, 0.75, 1], dtype=np.float32).reshape(1, 1, 5) + + run_compare_builder( + build_mode_0, + input_placeholder_dict, + input_value_dict, + expected_output_type, + expected_output, + compute_unit=compute_unit, + backend=backend, + ) + + def build_mode_2(x): + return mb.resize_bilinear( + x=x, target_size_height=1, target_size_width=5, sampling_mode="DEFAULT" + ) + + expected_output = np.array([0, 0.4, 0.8, 1, 1], dtype=np.float32).reshape(1, 1, 5) + + run_compare_builder( + build_mode_2, + input_placeholder_dict, + input_value_dict, + expected_output_type, + expected_output, + compute_unit=compute_unit, + backend=backend, + ) + + def build_mode_3(x): + return mb.resize_bilinear( + x=x, + target_size_height=1, + target_size_width=5, + sampling_mode="OFFSET_CORNERS", + ) + + expected_output 
= np.array([0.1, 0.3, 0.5, 0.7, 0.9], dtype=np.float32).reshape(1, 1, 5) + + run_compare_builder( + build_mode_3, + input_placeholder_dict, + input_value_dict, + expected_output_type, + expected_output, + compute_unit=compute_unit, + backend=backend, + ) + +class TestUpsampleBilinear: + @pytest.mark.parametrize( + "compute_unit, backend", + itertools.product( + compute_units, + backends, + ), + ) + def test_builder_to_backend_smoke(self, compute_unit, backend): + x = np.array([0, 1], dtype=np.float32).reshape(1, 1, 2) + input_placeholder_dict = {"x": mb.placeholder(shape=x.shape)} + input_value_dict = {"x": x} + + def build_upsample_integer(x): + return mb.upsample_bilinear( + x=x, scale_factor_height=1, scale_factor_width=3, align_corners=True + ) + + expected_output_type = (1, 1, 6, types.fp32) + expected_output = np.array([0, 0.2, 0.4, 0.6, 0.8, 1], dtype=np.float32).reshape(1, 1, 6) + + run_compare_builder( + build_upsample_integer, + input_placeholder_dict, + input_value_dict, + expected_output_type, + expected_output, + compute_unit=compute_unit, + backend=backend, + ) + + def build_upsample_fractional(x): + return mb.upsample_bilinear( + x=x, scale_factor_height=1.0, scale_factor_width=2.6, align_corners=False + ) + + expected_output_type = (1, 1, 5, types.fp32) + expected_output = np.array([0, 0.1, 0.5, 0.9, 1], dtype=np.float32).reshape(1, 1, 5) + + run_compare_builder( + build_upsample_fractional, + input_placeholder_dict, + input_value_dict, + expected_output_type, + expected_output, + compute_unit=compute_unit, + backend=backend, + ) + + @pytest.mark.skipif(not _HAS_TORCH, reason=MSG_TORCH_NOT_FOUND) + @pytest.mark.parametrize( + "compute_unit, backend, input_shape, scale_factor, align_corners, recompute_scale_factor", + itertools.product( + compute_units, + backends, + [(2, 5, 10, 22)], + [(3, 4), (2.5, 2.0), (0.5, 0.75)], + [True, False], + [True, False], + ), + ) + def test_builder_to_backend_stress( + self, + compute_unit, + backend, + input_shape, + scale_factor, + align_corners, + recompute_scale_factor, + ): + scale_factor_height, scale_factor_width = scale_factor + _, _, height, width = input_shape + height = height * scale_factor_height + width = width * scale_factor_width + is_h_float = height - np.floor(height) > 0.001 + is_w_float = width - np.floor(width) > 0.001 + + # Currently, MIL does not support recompute_scale_factor=False + align_corners=False + # with fractional output size + if not recompute_scale_factor and not align_corners and (is_h_float or is_w_float): + pytest.xfail("rdar://81124053 (Support recompute_scale_factor)") + + def _get_torch_upsample_prediction( + x, scale_factor=(2, 2), align_corners=False, recompute_scale_factor=True + ): + x = torch.from_numpy(x) + out = torch.nn.functional.interpolate( + x, + scale_factor=scale_factor, + mode="bilinear", + align_corners=align_corners, + recompute_scale_factor=recompute_scale_factor, + ) + return out.numpy() + + x = random_gen(input_shape, rand_min=-100, rand_max=100) + torch_pred = _get_torch_upsample_prediction( + x, + scale_factor=scale_factor, + align_corners=align_corners, + recompute_scale_factor=recompute_scale_factor, + ) + + input_placeholder_dict = {"x": mb.placeholder(shape=x.shape)} + input_value_dict = {"x": x} + + def build_upsample(x): + return mb.upsample_bilinear( + x=x, + scale_factor_height=scale_factor[0], + scale_factor_width=scale_factor[1], + align_corners=align_corners, + ) + + expected_output_type = torch_pred.shape + (types.fp32,) + run_compare_builder( + build_upsample, +
input_placeholder_dict, + input_value_dict, + expected_output_type, + torch_pred, + compute_unit=compute_unit, + backend=backend, + rtol=0.5, + ) + + +class TestUpsampleNearestNeighbor: + @pytest.mark.parametrize( + "compute_unit, backend", + itertools.product( + compute_units, + backends, + ), + ) + def test_builder_to_backend_smoke(self, compute_unit, backend): + x = np.array([1.5, 2.5, 3.5], dtype=np.float32).reshape([1, 1, 1, 3]) + input_placeholder_dict = {"x": mb.placeholder(shape=x.shape)} + input_value_dict = {"x": x} + + def build(x): + return mb.upsample_nearest_neighbor(x=x, scale_factor_height=1, scale_factor_width=2) + + expected_output_type = (1, 1, 1, 6, types.fp32) + expected_output = np.array([1.5, 1.5, 2.5, 2.5, 3.5, 3.5], dtype=np.float32).reshape( + [1, 1, 1, 6] + ) + + run_compare_builder( + build, + input_placeholder_dict, + input_value_dict, + expected_output_type, + expected_output, + compute_unit=compute_unit, + backend=backend, + ) + + +class TestCrop: + @pytest.mark.parametrize( + "compute_unit, backend, is_symbolic", + itertools.product(compute_units, backends, [True, False]), + ) + def test_builder_to_backend_smoke(self, compute_unit, backend, is_symbolic): + x = np.array( + [[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12], [13, 14, 15, 16]], + dtype=np.float32, + ).reshape(1, 1, 4, 4) + + input_shape = list(x.shape) + placeholder_input_shape = input_shape + if is_symbolic: + # set batch and channel dimension symbolic + placeholder_input_shape[0] = get_new_symbol() + placeholder_input_shape[1] = get_new_symbol() + + input_placeholder_dict = {"x": mb.placeholder(shape=placeholder_input_shape)} + input_value_dict = {"x": x} + + def build(x): + return mb.crop(x=x, crop_height=[0, 1], crop_width=[1, 1]) + + expected_output_type = ( + placeholder_input_shape[0], + placeholder_input_shape[1], + 3, + 2, + types.fp32, + ) + expected_output = np.array([2, 3, 6, 7, 10, 11], dtype=np.float32).reshape(1, 1, 3, 2) + + run_compare_builder( + build, + input_placeholder_dict, + input_value_dict, + expected_output_type, + expected_output, + compute_unit=compute_unit, + backend=backend, + ) + + @pytest.mark.parametrize( + "compute_unit, backend, C, H, W", + itertools.product( + compute_units, + backends, + [x for x in range(2, 4)], + [x for x in range(5, 8)], + [x for x in range(8, 10)], + ), + ) + def test_builder_to_backend_stress(self, compute_unit, backend, C, H, W): + input_shape = (1, C, H, W) + x = np.random.random(input_shape) + + crop_h = [np.random.randint(H)] + crop_h.append(np.random.randint(H - crop_h[0])) + crop_w = [np.random.randint(W)] + crop_w.append(np.random.randint(W - crop_w[0])) + + input_placeholder_dict = {"x": mb.placeholder(shape=input_shape)} + input_value_dict = {"x": x} + + def build(x): + return mb.crop(x=x, crop_height=crop_h, crop_width=crop_w) + + expected_output_type = ( + 1, + C, + H - crop_h[0] - crop_h[1], + W - crop_w[0] - crop_w[1], + types.fp32, + ) + expected_output = x[:, :, crop_h[0] : H - crop_h[1], crop_w[0] : W - crop_w[1]] + + run_compare_builder( + build, + input_placeholder_dict, + input_value_dict, + expected_output_type, + expected_output, + compute_unit=compute_unit, + backend=backend, + ) + + +class TestCropResize: + @mark_api_breaking(breaking_opset_version=ct.target.iOS17) + @pytest.mark.parametrize( + "compute_unit, backend, is_symbolic", + itertools.product(compute_units, backends, [True, False]), + ) + def test_builder_to_backend_smoke(self, compute_unit, backend, is_symbolic): + if backend.backend == "mlprogram" and
compute_unit != ct.ComputeUnit.CPU_ONLY: + pytest.xfail("rdar://97398582 (TestCropResize failing on mlprogram + GPU)") + x = np.array( + [[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12], [13, 14, 15, 16]], + dtype=np.float32, + ).reshape(1, 1, 4, 4) + + input_shape = list(x.shape) + placeholder_input_shape = input_shape + if is_symbolic: + # set batch and channel dimension symbolic + placeholder_input_shape[0] = get_new_symbol() + placeholder_input_shape[1] = get_new_symbol() + + input_placeholder_dict = {"x": mb.placeholder(shape=placeholder_input_shape)} + input_value_dict = {"x": x} + N = 1 + roi = np.array([[1, 1, 2, 2]], dtype=np.float32).reshape(1, 1, 4, 1, 1) + roi_normalized = np.array([[0, 0.0, 0.0, 1.0 / 3, 1.0 / 3]], dtype=np.float32).reshape( + 1, 1, 5, 1, 1 + ) + roi_invert = np.array([[2, 2, 1, 1]], dtype=np.float32).reshape(1, 1, 4, 1, 1) + + def build(x, mode=0): + if mode == 0: + return mb.crop_resize( + x=x, + roi=roi, + target_width=2, + target_height=2, + normalized_coordinates=False, + box_coordinate_mode="CORNERS_HEIGHT_FIRST", + sampling_mode="ALIGN_CORNERS", + ) + + elif mode == 1: + return mb.crop_resize( + x=x, + roi=roi, + target_width=4, + target_height=4, + normalized_coordinates=False, + box_coordinate_mode="CORNERS_HEIGHT_FIRST", + sampling_mode="ALIGN_CORNERS", + ) + + elif mode == 2: + return mb.crop_resize( + x=x, + roi=roi, + target_width=1, + target_height=1, + normalized_coordinates=False, + box_coordinate_mode="CORNERS_HEIGHT_FIRST", + sampling_mode="ALIGN_CORNERS", + ) + + elif mode == 3: + return mb.crop_resize( + x=x, + roi=roi_normalized, + target_width=2, + target_height=2, + normalized_coordinates=True, + box_coordinate_mode="CORNERS_HEIGHT_FIRST", + sampling_mode="ALIGN_CORNERS", + ) + + elif mode == 4: + return mb.crop_resize( + x=x, + roi=roi_invert, + target_width=2, + target_height=2, + normalized_coordinates=False, + box_coordinate_mode="CORNERS_HEIGHT_FIRST", + sampling_mode="ALIGN_CORNERS", + ) + + expected_output_type = [ + ( + N, + placeholder_input_shape[0], + placeholder_input_shape[1], + 2, + 2, + types.fp32, + ), + ( + N, + placeholder_input_shape[0], + placeholder_input_shape[1], + 4, + 4, + types.fp32, + ), + ( + N, + placeholder_input_shape[0], + placeholder_input_shape[1], + 1, + 1, + types.fp32, + ), + ( + N, + placeholder_input_shape[0], + placeholder_input_shape[1], + 2, + 2, + types.fp32, + ), + ( + N, + placeholder_input_shape[0], + placeholder_input_shape[1], + 2, + 2, + types.fp32, + ), + ] + expected_output = [ + np.array([6, 7, 10, 11], dtype=np.float32).reshape(1, 1, 1, 2, 2), + np.array( + [ + [6, 6.333333, 6.66666, 7], + [7.333333, 7.666666, 8, 8.333333], + [8.666666, 9, 9.3333333, 9.666666], + [10, 10.333333, 10.666666, 11], + ], + dtype=np.float32, + ).reshape(1, 1, 1, 4, 4), + np.array([8.5], dtype=np.float32).reshape(1, 1, 1, 1, 1), + np.array([1, 2, 5, 6], dtype=np.float32).reshape(1, 1, 1, 2, 2), + np.array([11, 10, 7, 6], dtype=np.float32).reshape(1, 1, 1, 2, 2), + ] + + for mode in range(5): + run_compare_builder( + functools.partial(build, mode=mode), + input_placeholder_dict, + input_value_dict, + expected_output_type[mode], + expected_output[mode], + compute_unit=compute_unit, + backend=backend, + ) diff --git a/coremltools/converters/mil/mil/ops/tests/test_linear.py b/coremltools/converters/mil/mil/ops/tests/iOS14/test_linear.py similarity index 86% rename from coremltools/converters/mil/mil/ops/tests/test_linear.py rename to coremltools/converters/mil/mil/ops/tests/iOS14/test_linear.py index 
317632877..00ce6956c 100644 --- a/coremltools/converters/mil/mil/ops/tests/test_linear.py +++ b/coremltools/converters/mil/mil/ops/tests/iOS14/test_linear.py @@ -11,11 +11,11 @@ import coremltools as ct from coremltools.converters.mil.mil import Builder as mb from coremltools.converters.mil.mil import types -from coremltools.converters.mil.testing_reqs import backends, compute_units +from coremltools.converters.mil.mil.ops.tests.iOS14 import backends +from coremltools.converters.mil.mil.ops.tests.testing_utils import run_compare_builder +from coremltools.converters.mil.testing_reqs import compute_units from coremltools.converters.mil.testing_utils import random_gen, ssa_fn -from .testing_utils import run_compare_builder - class TestLinear: @pytest.mark.parametrize( @@ -34,9 +34,7 @@ def build(x): expected_output_types = [(2, 2, types.fp32)] expected_outputs = [ - np.array( - [[-5.9438195, -1.8854373], [-4.054486, -1.3484411]], dtype=np.float32 - ) + np.array([[-5.9438195, -1.8854373], [-4.054486, -1.3484411]], dtype=np.float32) ] run_compare_builder( @@ -55,19 +53,28 @@ def test_builder_eval(self): weight_val = random_gen(shape=(2, 2), rand_min=-91, rand_max=84) bias_val = random_gen(shape=(2,), rand_min=0.0, rand_max=9.0) v = mb.linear(x=x_val, weight=weight_val, bias=bias_val) - np.testing.assert_allclose(np.matmul(x_val, weight_val.T) + bias_val, v.val, atol=1e-04, rtol=1e-05) + np.testing.assert_allclose( + np.matmul(x_val, weight_val.T) + bias_val, v.val, atol=1e-04, rtol=1e-05 + ) @pytest.mark.parametrize( "compute_unit, backend, rank", itertools.product(compute_units, backends, [2, 3, 5]), ) def test_builder_to_backend_stress(self, compute_unit, backend, rank): - if backend[0] == "mlprogram" and compute_unit != ct.ComputeUnit.CPU_ONLY: + if backend.backend == "mlprogram" and compute_unit != ct.ComputeUnit.CPU_ONLY: pytest.xfail("rdar://97398733 (TestLinear failing on mlprogram + GPU)") - - if backend[0] == "neuralnetwork" and compute_unit != ct.ComputeUnit.CPU_ONLY and platform.machine() == "arm64" and rank == 5: - pytest.xfail("rdar://98015195 ([M1 native tests] Some MIL unittests are failing on M1 native)") - + + if ( + backend.backend == "neuralnetwork" + and compute_unit != ct.ComputeUnit.CPU_ONLY + and platform.machine() == "arm64" + and rank == 5 + ): + pytest.xfail( + "rdar://98015195 ([M1 native tests] Some MIL unittests are failing on M1 native)" + ) + x_shape = np.random.randint(low=1, high=3, size=(rank,)) x_val = np.random.rand(*x_shape) out_channels = 3 @@ -98,9 +105,7 @@ def build(x): class TestMatMul: - @pytest.mark.parametrize( - "compute_unit, backend", itertools.product(compute_units, backends) - ) + @pytest.mark.parametrize("compute_unit, backend", itertools.product(compute_units, backends)) def test_builder_to_backend_smoke(self, compute_unit, backend): x_val = np.array([[-4.0, 13.0], [-3.0, 9.0]], dtype=np.float32) y_val = np.array([[1.0, -7.0], [-1.0, -8.0]], dtype=np.float32) @@ -241,7 +246,11 @@ def build(x): class TestEinsum: @pytest.mark.parametrize( - "compute_unit, backend", itertools.product(compute_units, backends,) + "compute_unit, backend", + itertools.product( + compute_units, + backends, + ), ) def test_builder_to_backend_smoke(self, compute_unit, backend): equation = "abcd,adce->abce" @@ -279,7 +288,7 @@ def build(x, y): [3, 4], [True, False], backends, - ) + ), ) def test_builder_to_backend_stress(self, compute_unit, rank, broadcast, backend): equation = "abcd,adce->abce" if rank == 4 else "vnm,mno->vno" @@ -298,21 +307,28 @@ def 
test_builder_to_backend_stress(self, compute_unit, rank, broadcast, backend) } input_value_dict = {"x": x_val, "y": y_val} - out_shape = [shape_y[-4], shape_x[-3], shape_x[-2], shape_y[-1]] if rank == 4 else \ - [shape_x[-3], shape_x[-2], shape_y[-1]] + out_shape = ( + [shape_y[-4], shape_x[-3], shape_x[-2], shape_y[-1]] + if rank == 4 + else [shape_x[-3], shape_x[-2], shape_y[-1]] + ) expected_output_type = tuple(out_shape) + (types.fp32,) def build(x, y): return mb.einsum(values=(x, y), equation=equation) if rank == 3: - expected_output = np.einsum(equation, - np.broadcast_to(x_val, [shape_x[-3], shape_x[-2], shape_x[-1]]), - np.broadcast_to(y_val, [shape_y[-3], shape_x[-2], shape_y[-1]])) + expected_output = np.einsum( + equation, + np.broadcast_to(x_val, [shape_x[-3], shape_x[-2], shape_x[-1]]), + np.broadcast_to(y_val, [shape_y[-3], shape_x[-2], shape_y[-1]]), + ) else: - expected_output = np.einsum(equation, - np.broadcast_to(x_val, [shape_y[-4], shape_x[-3], shape_x[-2], shape_x[-1]]), - np.broadcast_to(y_val, [shape_y[-4], shape_y[-3], shape_x[-2], shape_y[-1]])) + expected_output = np.einsum( + equation, + np.broadcast_to(x_val, [shape_y[-4], shape_x[-3], shape_x[-2], shape_x[-1]]), + np.broadcast_to(y_val, [shape_y[-4], shape_y[-3], shape_x[-2], shape_y[-1]]), + ) run_compare_builder( build, diff --git a/coremltools/converters/mil/mil/ops/tests/test_normalization.py b/coremltools/converters/mil/mil/ops/tests/iOS14/test_normalization.py similarity index 67% rename from coremltools/converters/mil/mil/ops/tests/test_normalization.py rename to coremltools/converters/mil/mil/ops/tests/iOS14/test_normalization.py index cc3568df7..785ea270d 100644 --- a/coremltools/converters/mil/mil/ops/tests/test_normalization.py +++ b/coremltools/converters/mil/mil/ops/tests/iOS14/test_normalization.py @@ -13,11 +13,15 @@ from coremltools._deps import _HAS_TF_2, _HAS_TORCH, MSG_TF2_NOT_FOUND, MSG_TORCH_NOT_FOUND from coremltools.converters.mil.mil import Builder as mb from coremltools.converters.mil.mil import Function, get_new_symbol, types -from coremltools.converters.mil.testing_reqs import backends, compute_units +from coremltools.converters.mil.mil.ops.tests.iOS14 import backends +from coremltools.converters.mil.mil.ops.tests.testing_utils import ( + UNK_SYM, + construct_inputs_from_placeholders, + run_compare_builder, +) +from coremltools.converters.mil.testing_reqs import compute_units from coremltools.converters.mil.testing_utils import random_gen -from .testing_utils import UNK_SYM, construct_inputs_from_placeholders, run_compare_builder - if _HAS_TORCH: import torch @@ -27,9 +31,20 @@ class TestNormalizationBatchNorm: @pytest.mark.parametrize( - "compute_unit, backend", itertools.product(compute_units, backends,) + "compute_unit, backend, x_param_dtype", + itertools.product( + compute_units, + backends, + [(np.float16, np.float16), (np.float32, np.float32)], + ), ) - def test_builder_to_backend_smoke(self, compute_unit, backend): + def test_builder_to_backend_smoke(self, compute_unit, backend, x_param_dtype): + x_dtype, param_dtype = x_param_dtype + x_builtin_dtype = types.numpy_type_to_builtin_type(x_dtype) + + if x_dtype == np.float16 and backend.backend == "neuralnetwork": + pytest.skip("No need to test fp16 for neuralnetwork backend.") + x_val = np.array( [ [ @@ -38,13 +53,13 @@ def test_builder_to_backend_smoke(self, compute_unit, backend): [[-9.0, -4.0], [-6.0, 3.0]], ] ], - dtype=np.float32, + dtype=x_dtype, ) - mean_val = np.array([9.0, 6.0, 3.0], dtype=np.float32) - variance_val = 
np.array([6.0, 1.0, 7.0], dtype=np.float32) - gamma_val = np.array([1.0, 1.0, 1.0], dtype=np.float32) - beta_val = np.array([1.0, 3.0, 0.0], dtype=np.float32) - input_placeholders = {"x": mb.placeholder(shape=x_val.shape)} + mean_val = np.array([9.0, 6.0, 3.0], dtype=param_dtype) + variance_val = np.array([6.0, 1.0, 7.0], dtype=param_dtype) + gamma_val = np.array([1.0, 1.0, 1.0], dtype=param_dtype) + beta_val = np.array([1.0, 3.0, 0.0], dtype=param_dtype) + input_placeholders = {"x": mb.placeholder(shape=x_val.shape, dtype=x_builtin_dtype)} input_values = {"x": x_val} def build(x): @@ -56,13 +71,13 @@ def build(x): variance=variance_val, gamma=gamma_val, beta=beta_val, - epsilon=1e-4, + epsilon=param_dtype(1e-4), ), ] expected_output_types = [ - (1, 3, 2, 2, types.fp32), - (1, 3, 2, 2, types.fp32), + (1, 3, 2, 2, x_builtin_dtype), + (1, 3, 2, 2, x_builtin_dtype), ] expected_outputs = [ np.array( @@ -73,7 +88,7 @@ def build(x): [[-4.53557, -2.6457493], [-3.4016776, 0.0]], ] ], - dtype=np.float32, + dtype=x_dtype, ), np.array( [ @@ -83,7 +98,7 @@ def build(x): [[-4.535541, -2.6457324], [-3.4016557, 0.0]], ] ], - dtype=np.float32, + dtype=x_dtype, ), ] @@ -100,9 +115,20 @@ def build(x): class TestNormalizationInstanceNorm: @pytest.mark.parametrize( - "compute_unit, backend", itertools.product(compute_units, backends,) + "compute_unit, backend, x_param_dtype", + itertools.product( + compute_units, + backends, + [(np.float16, np.float16), (np.float32, np.float32)], + ), ) - def test_builder_to_backend_smoke(self, compute_unit, backend): + def test_builder_to_backend_smoke(self, compute_unit, backend, x_param_dtype): + x_dtype, param_dtype = x_param_dtype + x_builtin_dtype = types.numpy_type_to_builtin_type(x_dtype) + + if x_dtype == np.float16 and backend.backend == "neuralnetwork": + pytest.skip("No need to test fp16 for neuralnetwork backend.") + x_val = np.array( [ [ @@ -110,23 +136,21 @@ def test_builder_to_backend_smoke(self, compute_unit, backend): [[13.0, 15.0], [13.0, 9.0]], [[-9.0, 4.0], [-6.0, 3.0]], ], - [ [[-5.0, 1.0], [12.0, 3.0]], [[0.0, 9.0], [2.0, -8.0]], [[2.0, 5.0], [10.0, 0.0]], - - ] + ], ], - dtype=np.float32, + dtype=x_dtype, ) - input_placeholders = {"x": mb.placeholder(shape=x_val.shape)} + input_placeholders = {"x": mb.placeholder(shape=x_val.shape, dtype=x_builtin_dtype)} input_values = {"x": x_val} def build(x): - return mb.instance_norm(x=x, epsilon=1e-2) + return mb.instance_norm(x=x, epsilon=param_dtype(1e-2)) - expected_output_types = [(2, 3, 2, 2, types.fp32)] + expected_output_types = [(2, 3, 2, 2, x_builtin_dtype)] expected_outputs = [ np.array( [ @@ -135,12 +159,11 @@ def build(x): [[0.22917463, 1.14587319], [0.22917463, -1.60422242]], [[-1.2470212, 1.06887531], [-0.71258354, 0.89072943]], ], - [ [[-1.27070526, -0.28693344], [1.51664821, 0.04099049]], [[-0.12380638, 1.36187018], [0.20634397, -1.44440776]], [[-0.59714057, 0.19904686], [1.5260259, -1.12793219]], - ] + ], ], dtype=np.float32, ) @@ -157,9 +180,22 @@ def build(x): ) @pytest.mark.parametrize( - "compute_unit, backend", itertools.product(compute_units, backends,) + "compute_unit, backend, x_param_dtype", + itertools.product( + compute_units, + backends, + [(np.float16, np.float16), (np.float32, np.float32)], + ), ) - def test_builder_to_backend_smoke_with_gamma_and_beta(self, compute_unit, backend): + def test_builder_to_backend_smoke_with_gamma_and_beta( + self, compute_unit, backend, x_param_dtype + ): + x_dtype, param_dtype = x_param_dtype + x_builtin_dtype = 
types.numpy_type_to_builtin_type(x_dtype) + + if x_dtype == np.float16 and backend.backend == "neuralnetwork": + pytest.skip("No need to test fp16 for neuralnetwork backend.") + x_val = np.array( [ [ @@ -167,26 +203,24 @@ def test_builder_to_backend_smoke_with_gamma_and_beta(self, compute_unit, backen [[13.0, 15.0], [13.0, 9.0]], [[-9.0, 4.0], [-6.0, 3.0]], ], - [ [[-5.0, 1.0], [12.0, 3.0]], [[0.0, 9.0], [2.0, -8.0]], [[2.0, 5.0], [10.0, 0.0]], - - ] + ], ], - dtype=np.float32, + dtype=x_dtype, ) - gamma_val = np.array([-9.0, 3.2, 1.3], dtype=np.float32) - beta_val = np.array([-0.8, 3.4, 1.2], dtype=np.float32) + gamma_val = np.array([-9.0, 3.2, 1.3], dtype=param_dtype) + beta_val = np.array([-0.8, 3.4, 1.2], dtype=param_dtype) - input_placeholders = {"x": mb.placeholder(shape=x_val.shape)} + input_placeholders = {"x": mb.placeholder(shape=x_val.shape, dtype=x_builtin_dtype)} input_values = {"x": x_val} def build(x): - return mb.instance_norm(x=x, gamma=gamma_val, beta=beta_val, epsilon=1e-2) + return mb.instance_norm(x=x, gamma=gamma_val, beta=beta_val, epsilon=param_dtype(1e-2)) - expected_output_types = [(2, 3, 2, 2, types.fp32)] + expected_output_types = [(2, 3, 2, 2, x_builtin_dtype)] expected_outputs = [ np.array( [ @@ -195,12 +229,11 @@ def build(x): [[4.1333588, 7.06679399], [4.1333588, -1.73351158]], [[-0.42112757, 2.58953791], [0.27364139, 2.35794826]], ], - [ [[10.6363473, 1.782401], [-14.44983388, -1.16891443]], [[3.00381959, 7.75798456], [4.06030069, -1.22210484]], [[0.42371726, 1.45876091], [3.18383368, -0.26631185]], - ] + ], ], dtype=np.float32, ) @@ -218,27 +251,35 @@ def build(x): @pytest.mark.skipif(not _HAS_TORCH, reason=MSG_TORCH_NOT_FOUND) @pytest.mark.parametrize( - "rank, compute_unit, backend, epsilon", + "rank, compute_unit, backend, epsilon, x_param_dtype", itertools.product( [3, 4], compute_units, backends, - [1e-3, 1e-5, 1e-10] + [1e-3, 1e-5, 1e-10], + [(np.float16, np.float16), (np.float32, np.float32)], ), ) - def test_builder_to_backend_stress(self, rank, compute_unit, backend, epsilon): + def test_builder_to_backend_stress(self, rank, compute_unit, backend, epsilon, x_param_dtype): + x_dtype, param_dtype = x_param_dtype + x_builtin_dtype = types.numpy_type_to_builtin_type(x_dtype) + + if x_dtype == np.float16 and backend.backend == "neuralnetwork": + pytest.skip("No need to test fp16 for neuralnetwork backend.") + shape = np.random.randint(low=2, high=6, size=rank) - x_val = random_gen(shape=shape, rand_min=-100.0, rand_max=100.0) - input_placeholders = {"x": mb.placeholder(shape=x_val.shape)} + x_val = random_gen(shape=shape, rand_min=-100.0, rand_max=100.0).astype(x_dtype) + input_placeholders = {"x": mb.placeholder(shape=x_val.shape, dtype=x_builtin_dtype)} input_values = {"x": x_val} def build(x): - return mb.instance_norm(x=x, epsilon=epsilon) + return mb.instance_norm(x=x, epsilon=param_dtype(epsilon)) layer = torch.nn.InstanceNorm2d if rank == 4 else torch.nn.InstanceNorm1d torch_op = layer(num_features=shape[1], eps=epsilon) - expected_outputs = [torch_op(torch.as_tensor(x_val)).numpy()] - expected_output_types = [o.shape[:] + (types.fp32,) for o in expected_outputs] + # PyTorch's batch_norm op is not implemented for fp16, so need to cast to fp32 first. 
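+        # The fp32 torch result is then compared against the (possibly fp16) MIL output,
+        # with the atol/rtol passed to run_compare_builder below absorbing rounding error.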
+ expected_outputs = [torch_op(torch.as_tensor(x_val.astype(np.float32))).numpy()] + expected_output_types = [o.shape[:] + (x_builtin_dtype,) for o in expected_outputs] run_compare_builder( build, @@ -250,12 +291,11 @@ def build(x): backend=backend, atol=1e-3, rtol=1e-4, - also_compare_shapes=True + also_compare_shapes=True, ) class TestNormalizationL2Norm: - @staticmethod def _compute_l2_norm(val, eps): shape = val.shape @@ -263,17 +303,21 @@ def _compute_l2_norm(val, eps): batch_dims = rank - 3 if batch_dims == 0: square_sum = np.sum(val**2) - output = val/np.power(square_sum + eps, 0.5) + output = val / np.power(square_sum + eps, 0.5) else: batch_dim_prod = np.prod(shape[:batch_dims]) reshape_val = np.reshape(val, (batch_dim_prod, -1)) square_sum = np.sum(reshape_val * reshape_val, axis=1, keepdims=True) + eps - output = reshape_val/np.power(square_sum, 0.5) + output = reshape_val / np.power(square_sum, 0.5) output = np.reshape(output, shape) return output @pytest.mark.parametrize( - "compute_unit, backend", itertools.product(compute_units, backends,) + "compute_unit, backend", + itertools.product( + compute_units, + backends, + ), ) def test_builder_to_backend_smoke(self, compute_unit, backend): x_val = np.array([[[1.0, -7.0], [5.0, -6.0], [-3.0, -5.0]]], dtype=np.float32) @@ -308,28 +352,33 @@ def build(x): ) @pytest.mark.parametrize( - "compute_unit, backend, rank, epsilon", + "compute_unit, backend, rank, epsilon, x_param_dtype", itertools.product( compute_units, backends, [3, 4, 5], - [1e-4, 5.7] - ) + [1e-4, 5.7], + [(np.float16, np.float16), (np.float32, np.float32)], + ), ) - def test_builder_to_backend_stress(self, compute_unit, backend, rank, epsilon): + def test_builder_to_backend_stress(self, compute_unit, backend, rank, epsilon, x_param_dtype): + x_dtype, param_dtype = x_param_dtype + x_builtin_dtype = types.numpy_type_to_builtin_type(x_dtype) + + if x_dtype == np.float16 and backend.backend == "neuralnetwork": + pytest.skip("No need to test fp16 for neuralnetwork backend.") + shape = np.random.randint(low=2, high=6, size=rank) - x_val = random_gen(shape=shape, rand_min=-1.0, rand_max=1.0) - input_placeholders = {"x": mb.placeholder(shape=shape)} + x_val = random_gen(shape=shape, rand_min=-1.0, rand_max=1.0).astype(x_dtype) + input_placeholders = {"x": mb.placeholder(shape=shape, dtype=x_builtin_dtype)} input_values = {"x": x_val} def build(x): - return [mb.l2_norm(x=x, epsilon=epsilon)] + return [mb.l2_norm(x=x, epsilon=param_dtype(epsilon))] output = TestNormalizationL2Norm._compute_l2_norm(x_val, epsilon) - expected_output_types = [list(output.shape) + [types.fp32]] - expected_outputs = [ - output - ] + expected_output_types = [list(output.shape) + [x_builtin_dtype]] + expected_outputs = [output] run_compare_builder( build, @@ -341,7 +390,8 @@ def build(x): backend=backend, ) - @pytest.mark.parametrize("rank, epsilon", + @pytest.mark.parametrize( + "rank, epsilon", itertools.product( [3, 4, 5], [1e-4, 11.2], @@ -357,7 +407,6 @@ def test_builder_eval_stress(self, rank, epsilon): class TestNormalizationLayerNorm: - @staticmethod def _keras_layer_norm(x, axes, epsilon): layer = tf.keras.layers.LayerNormalization(axis=axes, epsilon=epsilon) @@ -370,18 +419,27 @@ def _np_layer_norm(x, axes, gamma=None, beta=None, epsilon=1e-5): rank = len(x.shape) axes = [axis + rank if axis < 0 else axis for axis in axes] normalized_shape = [x.shape[i] if i in axes else 1 for i in range(rank)] - gamma = np.ones(shape=normalized_shape) if gamma is None else np.reshape(gamma, normalized_shape) 
- beta = np.zeros(shape=normalized_shape) if beta is None else np.reshape(beta, normalized_shape) + gamma = ( + np.ones(shape=normalized_shape) + if gamma is None + else np.reshape(gamma, normalized_shape) + ) + beta = ( + np.zeros(shape=normalized_shape) if beta is None else np.reshape(beta, normalized_shape) + ) num = x - np.mean(x, axis=tuple(axes), keepdims=True) dem = np.sqrt( - np.sum(np.square(num), axis=tuple(axes), keepdims=True) - / np.prod(normalized_shape) + np.sum(np.square(num), axis=tuple(axes), keepdims=True) / np.prod(normalized_shape) + epsilon ) return num / dem * gamma + beta @pytest.mark.parametrize( - "compute_unit, backend", itertools.product(compute_units, backends,) + "compute_unit, backend", + itertools.product( + compute_units, + backends, + ), ) def test_builder_to_backend_smoke(self, compute_unit, backend): x_val = np.array([[[1.0, -7.0], [5.0, -6.0], [-3.0, -5.0]]], dtype=np.float32) @@ -400,12 +458,16 @@ def build(x): mb.layer_norm(x=x, axes=[2], epsilon=1e-4, gamma=gamma_val, beta=beta_val), ] - expected_output_types = [(1, 3, 2, types.fp32), (1, 3, 2, types.fp32), (1, 3, 2, types.fp32)] + expected_output_types = [ + (1, 3, 2, types.fp32), + (1, 3, 2, types.fp32), + (1, 3, 2, types.fp32), + ] expected_outputs = [ np.array( [ [ - [0.9999969, -0.9999969 ], + [0.9999969, -0.9999969], [0.99999833, -0.99999833], [0.99995005, -0.99995005], ] @@ -425,7 +487,7 @@ def build(x): np.array( [ [ - [1.9999969, -0.9999969 ], + [1.9999969, -0.9999969], [1.99999833, -0.99999833], [1.99995005, -0.99995005], ] @@ -445,7 +507,11 @@ def build(x): ) @pytest.mark.parametrize( - "compute_unit, backend", itertools.product(compute_units, backends,) + "compute_unit, backend", + itertools.product( + compute_units, + backends, + ), ) def test_builder_to_backend_smoke_rank_2(self, compute_unit, backend): x_val = np.array([[1.0, -7.0], [5.0, -6.0], [-3.0, -5.0]], dtype=np.float32) @@ -458,24 +524,24 @@ def build(x): return [ # V2->V1 lowering (op_mappings.py): if branch mb.layer_norm(x=x, axes=[1], epsilon=1e-4), - mb.layer_norm(x=x, axes=[1], epsilon=1e-4, gamma=gamma_val, beta=beta_val) + mb.layer_norm(x=x, axes=[1], epsilon=1e-4, gamma=gamma_val, beta=beta_val), ] expected_output_types = [(3, 2, types.fp32), (3, 2, types.fp32)] expected_outputs = [ np.array( [ - [ 0.9999969, -0.9999969 ], - [ 0.99999833, -0.99999833], - [ 0.99995005, -0.99995005], + [0.9999969, -0.9999969], + [0.99999833, -0.99999833], + [0.99995005, -0.99995005], ], dtype=np.float32, ), np.array( [ - [ 1.9999969, -0.9999969 ], - [ 1.99999833, -0.99999833], - [ 1.99995005, -0.99995005], + [1.9999969, -0.9999969], + [1.99999833, -0.99999833], + [1.99995005, -0.99995005], ], dtype=np.float32, ), @@ -492,7 +558,11 @@ def build(x): ) @pytest.mark.parametrize( - "compute_unit, backend", itertools.product(compute_units, backends,) + "compute_unit, backend", + itertools.product( + compute_units, + backends, + ), ) def test_builder_to_backend_smoke_with_dynamic_shape(self, compute_unit, backend): x_val = np.array([[[1.0, -7.0], [5.0, -6.0], [-3.0, -5.0]]], dtype=np.float32) @@ -510,9 +580,9 @@ def build(x): np.array( [ [ - [ 0.9999969, -0.9999969 ], - [ 0.99999833, -0.99999833], - [ 0.99995005, -0.99995005], + [0.9999969, -0.9999969], + [0.99999833, -0.99999833], + [0.99995005, -0.99995005], ] ], dtype=np.float32, @@ -526,7 +596,7 @@ def build(x): expected_output_types, expected_outputs, inputs=construct_inputs_from_placeholders(input_placeholders, 10) - if backend[0] == "mlprogram" + if backend.backend == "mlprogram" else 
None, compute_unit=compute_unit, backend=backend, @@ -537,24 +607,32 @@ def build(x): itertools.product( compute_units, backends, - [ - [3, [0, 2]], - [3, [-2]], - [4, [0, 1, 3]], - [5, [0, 4]], - [5, [-5, -4, -3, -2, -1]] - ], + [[3, [0, 2]], [3, [-2]], [4, [0, 1, 3]], [5, [0, 4]], [5, [-5, -4, -3, -2, -1]]], [0.0001, 0.01], - [True, False] + [True, False], ), - ) - def test_builder_to_backend_stress_numpy(self, compute_unit, backend, rank_and_axes, epsilon, provides_gamma_beta): + ) + def test_builder_to_backend_stress_numpy( + self, compute_unit, backend, rank_and_axes, epsilon, provides_gamma_beta + ): - if backend == ("mlprogram", "fp16") and compute_unit != ct.ComputeUnit.CPU_ONLY: - pytest.xfail("rdar://80662357 ([GPU failures] LayerNorm FP16 tests failing on GPU with numerical errors)") + if ( + backend.backend == "mlprogram" + and backend.precision == "fp16" + and compute_unit != ct.ComputeUnit.CPU_ONLY + ): + pytest.xfail( + "rdar://80662357 ([GPU failures] LayerNorm FP16 tests failing on GPU with numerical errors)" + ) - if backend[0] == "neuralnetwork" and compute_unit != ct.ComputeUnit.CPU_ONLY and platform.machine() == "arm64": - pytest.xfail("rdar://98015195 ([M1 native tests] Some MIL unittests are failing on M1 native)") + if ( + backend.backend == "neuralnetwork" + and compute_unit != ct.ComputeUnit.CPU_ONLY + and platform.machine() == "arm64" + ): + pytest.xfail( + "rdar://98015195 ([M1 native tests] Some MIL unittests are failing on M1 native)" + ) rank, axes = rank_and_axes shape = np.random.randint(low=2, high=6, size=rank) @@ -565,21 +643,19 @@ def test_builder_to_backend_stress_numpy(self, compute_unit, backend, rank_and_a gamma, beta = None, None if provides_gamma_beta: - positive_axes = [axis+rank if axis < 0 else axis for axis in axes] + positive_axes = [axis + rank if axis < 0 else axis for axis in axes] normalized_shape = [shape[i] for i in range(rank) if i in positive_axes] gamma = random_gen(shape=normalized_shape, rand_min=-100, rand_max=100) beta = random_gen(shape=normalized_shape, rand_min=-100, rand_max=100) def build(x): - return [ - mb.layer_norm(x=x, axes=axes, epsilon=epsilon, gamma=gamma, beta=beta) - ] + return [mb.layer_norm(x=x, axes=axes, epsilon=epsilon, gamma=gamma, beta=beta)] - output = TestNormalizationLayerNorm._np_layer_norm(x=x_val, axes=axes, epsilon=epsilon, gamma=gamma, beta=beta) + output = TestNormalizationLayerNorm._np_layer_norm( + x=x_val, axes=axes, epsilon=epsilon, gamma=gamma, beta=beta + ) expected_output_types = [tuple(output.shape) + (types.fp32,)] - expected_outputs = [ - output - ] + expected_outputs = [output] run_compare_builder( build, @@ -595,37 +671,36 @@ def build(x): @pytest.mark.skipif(not _HAS_TF_2, reason=MSG_TF2_NOT_FOUND) @pytest.mark.parametrize( - "compute_unit, backend, rank_and_axes, epsilon", + "compute_unit, backend, rank_and_axes, epsilon, x_param_dtype", itertools.product( compute_units, backends, - [ - [3, [0, 2]], - [3, [-2]], - [4, [0, 1, 3]], - [5, [0, 4]], - [5, [-5, -4, -3, -2, -1]] - ], - [0.0001, 0.01] + [[3, [0, 2]], [3, [-2]], [4, [0, 1, 3]], [5, [0, 4]], [5, [-5, -4, -3, -2, -1]]], + [0.0001, 0.01], + [(np.float16, np.float16), (np.float32, np.float32)], ), ) - def test_builder_to_backend_stress_keras(self, compute_unit, backend, rank_and_axes, epsilon): + def test_builder_to_backend_stress_keras( + self, compute_unit, backend, rank_and_axes, epsilon, x_param_dtype + ): + x_dtype, param_dtype = x_param_dtype + x_builtin_dtype = types.numpy_type_to_builtin_type(x_dtype) + + if x_dtype == 
np.float16 and backend.backend == "neuralnetwork": + pytest.skip("No need to test fp16 for neuralnetwork backend.") + rank, axes = rank_and_axes shape = np.random.randint(low=2, high=6, size=rank) - x_val = random_gen(shape=shape, rand_min=-100.0, rand_max=100.0) - input_placeholders = {"x": mb.placeholder(shape=x_val.shape)} + x_val = random_gen(shape=shape, rand_min=-100.0, rand_max=100.0).astype(x_dtype) + input_placeholders = {"x": mb.placeholder(shape=x_val.shape, dtype=x_builtin_dtype)} input_values = {"x": x_val} def build(x): - return [ - mb.layer_norm(x=x, axes=axes, epsilon=epsilon) - ] + return [mb.layer_norm(x=x, axes=axes, epsilon=param_dtype(epsilon))] output = TestNormalizationLayerNorm._keras_layer_norm(x=x_val, axes=axes, epsilon=epsilon) - expected_output_types = [tuple(output.shape) + (types.fp32,)] - expected_outputs = [ - output - ] + expected_output_types = [tuple(output.shape) + (x_builtin_dtype,)] + expected_outputs = [output] run_compare_builder( build, @@ -637,14 +712,15 @@ def build(x): backend=backend, ) - @pytest.mark.parametrize("rank_and_axes, epsilon", + @pytest.mark.parametrize( + "rank_and_axes, epsilon", itertools.product( [ [3, [0, 2]], [3, [-2, -1]], [4, [0, 1, 2, 3]], [5, [0, 2, -1]], - [5, [-5, -4, -3, -2, -1]] + [5, [-5, -4, -3, -2, -1]], ], [0.0001, 0.01], ), @@ -653,19 +729,25 @@ def test_builder_eval_stress(self, rank_and_axes, epsilon): rank, axes = rank_and_axes shape = np.random.randint(low=2, high=6, size=rank) x_val = random_gen(shape=shape, rand_min=-100.0, rand_max=100.0) - positive_axes = [axis+rank if axis < 0 else axis for axis in axes] + positive_axes = [axis + rank if axis < 0 else axis for axis in axes] normalized_shape = [shape[i] for i in range(rank) if i in positive_axes] gamma_val = random_gen(shape=normalized_shape, rand_min=-100, rand_max=100) beta_val = random_gen(shape=normalized_shape, rand_min=-100, rand_max=100) with Function({}): res = mb.layer_norm(x=x_val, axes=axes, epsilon=epsilon, gamma=gamma_val, beta=beta_val) - ref = TestNormalizationLayerNorm._np_layer_norm(x=x_val, axes=axes, epsilon=epsilon, gamma=gamma_val, beta=beta_val) + ref = TestNormalizationLayerNorm._np_layer_norm( + x=x_val, axes=axes, epsilon=epsilon, gamma=gamma_val, beta=beta_val + ) np.testing.assert_allclose(ref, res.val, atol=1e-04, rtol=1e-05) class TestNormalizationLocalResponseNorm: @pytest.mark.parametrize( - "compute_unit, backend", itertools.product(compute_units, backends,) + "compute_unit, backend", + itertools.product( + compute_units, + backends, + ), ) def test_builder_to_backend_smoke(self, compute_unit, backend): x_val = np.array([[[1.0, -7.0], [5.0, -6.0], [-3.0, -5.0]]], dtype=np.float32) @@ -714,7 +796,7 @@ def build(x): @pytest.mark.skipif(not _HAS_TORCH, reason=MSG_TORCH_NOT_FOUND) @pytest.mark.parametrize( - "compute_unit, backend, rank, size, alpha, beta, k", + "compute_unit, backend, rank, size, alpha, beta, k, x_param_dtype", itertools.product( compute_units, backends, @@ -723,22 +805,32 @@ def build(x): [0.0001, 0.01], [0.75, 1.0], [1.0, 2.0], + [(np.float16, np.float16), (np.float32, np.float32)], ), ) def test_builder_to_backend_stress( - self, compute_unit, backend, rank, size, alpha, beta, k + self, compute_unit, backend, rank, size, alpha, beta, k, x_param_dtype ): + x_dtype, param_dtype = x_param_dtype + x_builtin_dtype = types.numpy_type_to_builtin_type(x_dtype) + + if x_dtype == np.float16 and backend.backend == "neuralnetwork": + pytest.skip("No need to test fp16 for neuralnetwork backend.") + shape = 
np.random.randint(low=2, high=5, size=rank) - x_val = random_gen(shape=shape) - input_placeholders = {"x": mb.placeholder(shape=x_val.shape)} + x_val = random_gen(shape=shape).astype(x_dtype) + input_placeholders = {"x": mb.placeholder(shape=x_val.shape, dtype=x_builtin_dtype)} input_values = {"x": x_val} def build(x): - return mb.local_response_norm(x=x, size=size, alpha=alpha, beta=beta, k=k) + return mb.local_response_norm( + x=x, size=size, alpha=param_dtype(alpha), beta=param_dtype(beta), k=param_dtype(k) + ) torch_lrn = torch.nn.LocalResponseNorm(size=size, alpha=alpha, beta=beta, k=k) - expected_outputs = [torch_lrn(torch.as_tensor(x_val)).numpy()] - expected_output_types = [o.shape[:] + (types.fp32,) for o in expected_outputs] + # PyTorch doesn't support LocalResponseNorm with fp16, so need to cast to float32 first. + expected_outputs = [torch_lrn(torch.as_tensor(x_val.astype(np.float32))).numpy()] + expected_output_types = [o.shape[:] + (x_builtin_dtype,) for o in expected_outputs] run_compare_builder( build, diff --git a/coremltools/converters/mil/mil/ops/tests/test_pool.py b/coremltools/converters/mil/mil/ops/tests/iOS14/test_pool.py similarity index 90% rename from coremltools/converters/mil/mil/ops/tests/test_pool.py rename to coremltools/converters/mil/mil/ops/tests/iOS14/test_pool.py index a42f3fb39..63ffe3603 100644 --- a/coremltools/converters/mil/mil/ops/tests/test_pool.py +++ b/coremltools/converters/mil/mil/ops/tests/iOS14/test_pool.py @@ -11,9 +11,9 @@ import coremltools as ct from coremltools.converters.mil.mil import Builder as mb from coremltools.converters.mil.mil import types -from coremltools.converters.mil.testing_reqs import backends, compute_units - -from .testing_utils import run_compare_builder +from coremltools.converters.mil.mil.ops.tests.iOS14 import backends +from coremltools.converters.mil.mil.ops.tests.testing_utils import run_compare_builder +from coremltools.converters.mil.testing_reqs import compute_units class TestAvgPool: @@ -26,7 +26,7 @@ class TestAvgPool: [(1, 1, 2), (2,)], [(1, 1, 2, 2), (2, 2)], [(1, 1, 2, 2, 2), (2, 2, 2)], - ] + ], ), ) def test_avgpool_builder_to_backend_smoke_samelower_padtype( @@ -35,19 +35,17 @@ def test_avgpool_builder_to_backend_smoke_samelower_padtype( input_shape, kernel_shape = inputshape_kernelshape rank = len(input_shape) - 2 - if backend[0] == "neuralnetwork" and rank == 3: + if backend.backend == "neuralnetwork" and rank == 3: pytest.skip( "pad_type `same_lower` not supported for 3d pooling in neuralnetwork backend" ) - if backend[0] == "mlprogram" and rank == 1: + if backend.backend == "mlprogram" and rank == 1: pytest.xfail( "rdar://98852008 (MIL backend producing wrong result for 1d pooling with pad_type " "same_lower)" ) - if backend[0] == "mlprogram" and ct.utils._macos_version() < (13, 0): - pytest.skip("same_lower pad_type not supported in macOS12 or older.") - - minimum_deployment_target = ct.target.iOS16 if backend[0] == "mlprogram" else None + if backend.opset_version == ct.target.iOS15: + pytest.skip("same_lower pad_type not supported in iOS15 opset.") x_val = np.arange(1, np.prod(input_shape) + 1).reshape(*input_shape).astype(np.float32) @@ -78,16 +76,11 @@ def build(x): expected_outputs, compute_unit=compute_unit, backend=backend, - minimum_deployment_target=minimum_deployment_target, ) @pytest.mark.parametrize( "compute_unit, backend, num_dims", - itertools.product( - compute_units, - backends, - [1, 2, 3] - ), + itertools.product(compute_units, backends, [1, 2, 3]), ) def 
test_builder_to_backend_smoke(self, compute_unit, backend, num_dims): kernel_sizes = [1, 2, 3] @@ -201,7 +194,6 @@ def build(x): class TestMaxPool: - @pytest.mark.parametrize( "compute_unit, backend, inputshape_kernelshape", itertools.product( @@ -211,7 +203,7 @@ class TestMaxPool: [(1, 1, 2), (2,)], [(1, 1, 2, 2), (2, 2)], [(1, 1, 2, 2, 2), (2, 2, 2)], - ] + ], ), ) def test_maxpool_builder_to_backend_smoke_samelower_padtype( @@ -220,19 +212,17 @@ def test_maxpool_builder_to_backend_smoke_samelower_padtype( input_shape, kernel_shape = inputshape_kernelshape rank = len(input_shape) - 2 - if backend[0] == "neuralnetwork" and rank == 3: + if backend.backend == "neuralnetwork" and rank == 3: pytest.skip( "pad_type `same_lower` not supported for 3d pooling in neuralnetwork backend" ) - if backend[0] == "mlprogram" and rank == 1: + if backend.backend == "mlprogram" and rank == 1: pytest.xfail( "rdar://98852008 (MIL backend producing wrong result for 1d pooling with pad_type " "same_lower)" ) - if backend[0] == "mlprogram" and ct.utils._macos_version() < (13, 0): - pytest.skip("same_lower pad_type not supported in macOS12 or older.") - - minimum_deployment_target = ct.target.iOS16 if backend[0] == "mlprogram" else None + if backend.opset_version == ct.target.iOS15: + pytest.skip("same_lower pad_type not supported in iOS15 opset.") x_val = np.arange(1, np.prod(input_shape) + 1).reshape(*input_shape).astype(np.float32) @@ -263,16 +253,11 @@ def build(x): expected_outputs, compute_unit=compute_unit, backend=backend, - minimum_deployment_target=minimum_deployment_target, ) @pytest.mark.parametrize( "compute_unit, backend, num_dims", - itertools.product( - compute_units, - backends, - [1, 2, 3] - ), + itertools.product(compute_units, backends, [1, 2, 3]), ) def test_builder_to_backend_smoke(self, compute_unit, backend, num_dims): kernel_sizes = [1, 2, 3] @@ -375,7 +360,6 @@ def build(x): class TestL2Pool: - @pytest.mark.parametrize( "compute_unit, backend, inputshape_kernelshape", itertools.product( @@ -384,7 +368,7 @@ class TestL2Pool: [ [(1, 1, 2), (2,)], [(1, 1, 2, 2), (2, 2)], - ] + ], ), ) def test_l2pool_builder_to_backend_smoke_samelower_padtype( @@ -393,15 +377,13 @@ def test_l2pool_builder_to_backend_smoke_samelower_padtype( input_shape, kernel_shape = inputshape_kernelshape rank = len(input_shape) - 2 - if backend[0] == "mlprogram" and rank == 1: + if backend.backend == "mlprogram" and rank == 1: pytest.xfail( "rdar://98852008 (MIL backend producing wrong result for 1d pooling with pad_type " "same_lower)" ) - if backend[0] == "mlprogram" and ct.utils._macos_version() < (13, 0): - pytest.skip("same_lower pad_type not supported in macOS12 or older.") - - minimum_deployment_target = ct.target.iOS16 if backend[0] == "mlprogram" else None + if backend.opset_version == ct.target.iOS15: + pytest.skip("same_lower pad_type not supported in iOS15 opset.") x_val = np.arange(1, np.prod(input_shape) + 1).reshape(*input_shape).astype(np.float32) @@ -430,7 +412,6 @@ def build(x): expected_outputs, compute_unit=compute_unit, backend=backend, - minimum_deployment_target=minimum_deployment_target, ) @pytest.mark.parametrize( diff --git a/coremltools/converters/mil/mil/ops/tests/test_random.py b/coremltools/converters/mil/mil/ops/tests/iOS14/test_random.py similarity index 84% rename from coremltools/converters/mil/mil/ops/tests/test_random.py rename to coremltools/converters/mil/mil/ops/tests/iOS14/test_random.py index abfb9dd38..57c12214c 100644 --- a/coremltools/converters/mil/mil/ops/tests/test_random.py 
+++ b/coremltools/converters/mil/mil/ops/tests/iOS14/test_random.py @@ -10,16 +10,20 @@ from coremltools.converters.mil.mil import Builder as mb from coremltools.converters.mil.mil import types -from coremltools.converters.mil.testing_reqs import backends, compute_units +from coremltools.converters.mil.mil.ops.tests.iOS14 import backends +from coremltools.converters.mil.mil.ops.tests.testing_utils import UNK_SYM, run_compare_builder +from coremltools.converters.mil.testing_reqs import compute_units from coremltools.converters.mil.testing_utils import get_core_ml_prediction from coremltools.models.utils import _macos_version -from .testing_utils import UNK_SYM, run_compare_builder - class TestRandomBernoulli: @pytest.mark.parametrize( - "compute_unit, backend", itertools.product(compute_units, backends,) + "compute_unit, backend", + itertools.product( + compute_units, + backends, + ), ) def test_builder_to_backend_smoke(self, compute_unit, backend): @@ -62,9 +66,7 @@ def build(x): [True, False], ), ) - def test_builder_to_backend_stress( - self, compute_unit, backend, rank, prob, dynamic - ): + def test_builder_to_backend_stress(self, compute_unit, backend, rank, prob, dynamic): shape = np.random.randint(low=1, high=4, size=rank).astype(np.int32) x_val = np.array([0.0], dtype=np.float32) if dynamic: @@ -90,13 +92,10 @@ def build_dyn(x, dyn_shape): if dynamic: expected_output_types = [ - tuple([UNK_SYM for _ in o.shape]) + (types.fp32,) - for o in expected_outputs + tuple([UNK_SYM for _ in o.shape]) + (types.fp32,) for o in expected_outputs ] else: - expected_output_types = [ - o.shape[:] + (types.fp32,) for o in expected_outputs - ] + expected_output_types = [o.shape[:] + (types.fp32,) for o in expected_outputs] builder = build_dyn if dynamic else build @@ -117,7 +116,11 @@ def softmax(self, data): return e_data / e_data.sum() @pytest.mark.parametrize( - "compute_unit, backend", itertools.product(compute_units, backends,) + "compute_unit, backend", + itertools.product( + compute_units, + backends, + ), ) def test_builder_to_backend_smoke(self, compute_unit, backend): x_val = np.array([1], dtype=np.int32) @@ -147,15 +150,12 @@ def build(x): backend=backend, ) - @pytest.mark.skipif(_macos_version() < (12, 0), reason="Can only get predictions for ml program on macOS 12+") + @pytest.mark.skipif( + _macos_version() < (12, 0), reason="Can only get predictions for ml program on macOS 12+" + ) @pytest.mark.parametrize( "compute_unit, backend, n_sample, n_class", - itertools.product( - compute_units, - backends, - [50000], - [2, 10, 20] - ), + itertools.product(compute_units, backends, [50000], [2, 10, 20]), ) def test_builder_to_backend_stress(self, compute_unit, backend, n_sample, n_class): output_name = "random_categorical" @@ -167,14 +167,14 @@ def test_builder_to_backend_stress(self, compute_unit, backend, n_sample, n_clas input_values = {"x": logits} def build(x): - return [ - mb.random_categorical( - x=x, size=n_sample, mode="logits", name=output_name - ) - ] + return [mb.random_categorical(x=x, size=n_sample, mode="logits", name=output_name)] prediction = get_core_ml_prediction( - build, input_placeholders, input_values, backend=backend, compute_unit=compute_unit, + build, + input_placeholders, + input_values, + backend=backend, + compute_unit=compute_unit, ) ref0 = np.random.multinomial(n_sample, probs[0]) @@ -206,11 +206,7 @@ def build(x): input_values = {"x": np.array(probs)} def build(x): - return [ - mb.random_categorical( - x=x, size=n_sample, mode="probs", name=output_name - ) - ] + 
return [mb.random_categorical(x=x, size=n_sample, mode="probs", name=output_name)] prediction = get_core_ml_prediction( build, input_placeholders, input_values, backend=backend, compute_unit=compute_unit @@ -240,7 +236,11 @@ def build(x): class TestRandomNormal: @pytest.mark.parametrize( - "compute_unit, backend", itertools.product(compute_units, backends,) + "compute_unit, backend", + itertools.product( + compute_units, + backends, + ), ) def test_builder_to_backend_smoke(self, compute_unit, backend): x_val = np.array([0.0], dtype=np.float32) @@ -250,12 +250,8 @@ def test_builder_to_backend_smoke(self, compute_unit, backend): def build(x): return [ mb.add(x=x, y=x), - mb.random_normal( - shape=np.array([2, 1, 3], np.int32), mean=1.0, stddev=0.0 - ), - mb.random_normal( - shape=np.array([3, 1, 2], np.int32), mean=0.0, stddev=0.0 - ), + mb.random_normal(shape=np.array([2, 1, 3], np.int32), mean=1.0, stddev=0.0), + mb.random_normal(shape=np.array([3, 1, 2], np.int32), mean=0.0, stddev=0.0), ] expected_outputs = [ @@ -286,9 +282,7 @@ def build(x): [True, False], ), ) - def test_builder_to_backend_stress( - self, compute_unit, backend, rank, mean, dynamic - ): + def test_builder_to_backend_stress(self, compute_unit, backend, rank, mean, dynamic): shape = np.random.randint(low=1, high=4, size=rank).astype(np.int32) x_val = np.array([0.0], dtype=np.float32) if dynamic: @@ -320,13 +314,10 @@ def build_dyn(x, dyn_shape): if dynamic: expected_output_types = [ - tuple([UNK_SYM for _ in o.shape]) + (types.fp32,) - for o in expected_outputs + tuple([UNK_SYM for _ in o.shape]) + (types.fp32,) for o in expected_outputs ] else: - expected_output_types = [ - o.shape[:] + (types.fp32,) for o in expected_outputs - ] + expected_output_types = [o.shape[:] + (types.fp32,) for o in expected_outputs] builder = build_dyn if dynamic else build run_compare_builder( @@ -342,7 +333,11 @@ def build_dyn(x, dyn_shape): class TestRandomUniform: @pytest.mark.parametrize( - "compute_unit, backend", itertools.product(compute_units, backends,) + "compute_unit, backend", + itertools.product( + compute_units, + backends, + ), ) def test_builder_to_backend_smoke(self, compute_unit, backend): x_val = np.array([0.0], dtype=np.float32) @@ -352,12 +347,8 @@ def test_builder_to_backend_smoke(self, compute_unit, backend): def build(x): return [ mb.add(x=x, y=x), - mb.random_uniform( - shape=np.array([2, 1, 3], np.int32), low=0.0, high=0.0 - ), - mb.random_uniform( - shape=np.array([3, 1, 2], np.int32), low=1.0, high=1.0 - ), + mb.random_uniform(shape=np.array([2, 1, 3], np.int32), low=0.0, high=0.0), + mb.random_uniform(shape=np.array([3, 1, 2], np.int32), low=1.0, high=1.0), ] expected_outputs = [ @@ -389,9 +380,7 @@ def build(x): [True, False], ), ) - def test_builder_to_backend_stress( - self, compute_unit, backend, rank, low, high, dynamic - ): + def test_builder_to_backend_stress(self, compute_unit, backend, rank, low, high, dynamic): shape = np.random.randint(low=1, high=4, size=rank).astype(np.int32) x_val = np.array([0.0], dtype=np.float32) if dynamic: @@ -423,13 +412,10 @@ def build_dyn(x, dyn_shape): if dynamic: expected_output_types = [ - tuple([UNK_SYM for _ in o.shape]) + (types.fp32,) - for o in expected_outputs + tuple([UNK_SYM for _ in o.shape]) + (types.fp32,) for o in expected_outputs ] else: - expected_output_types = [ - o.shape[:] + (types.fp32,) for o in expected_outputs - ] + expected_output_types = [o.shape[:] + (types.fp32,) for o in expected_outputs] builder = build_dyn if dynamic else build 
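+        # For dynamic shapes the output dimensions are not known at compile time, so the
+        # expected output types above are built from UNK_SYM rather than concrete sizes.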
run_compare_builder( diff --git a/coremltools/converters/mil/mil/ops/tests/test_recurrent.py b/coremltools/converters/mil/mil/ops/tests/iOS14/test_recurrent.py similarity index 83% rename from coremltools/converters/mil/mil/ops/tests/test_recurrent.py rename to coremltools/converters/mil/mil/ops/tests/iOS14/test_recurrent.py index 6444ebf2b..14c813306 100644 --- a/coremltools/converters/mil/mil/ops/tests/test_recurrent.py +++ b/coremltools/converters/mil/mil/ops/tests/iOS14/test_recurrent.py @@ -11,15 +11,25 @@ import coremltools as ct from coremltools._deps import _HAS_TORCH, MSG_TORCH_NOT_FOUND from coremltools.converters.mil.mil import Builder as mb -from coremltools.converters.mil.mil import get_new_symbol, types -from coremltools.converters.mil.testing_reqs import backends, compute_units +from coremltools.converters.mil.mil import get_new_symbol +from coremltools.converters.mil.mil.ops.tests.iOS14 import backends +from coremltools.converters.mil.mil.ops.tests.testing_utils import ( + construct_inputs_from_placeholders, + run_compare_builder, +) +from coremltools.converters.mil.mil.types.type_mapping import numpy_type_to_builtin_type +from coremltools.converters.mil.testing_reqs import compute_units from coremltools.converters.mil.testing_utils import ssa_fn -from .testing_utils import construct_inputs_from_placeholders, run_compare_builder - if _HAS_TORCH: import torch +new_backends = [] +for v in backends: + if v.opset_version <= ct.target.iOS15: + new_backends.append(v) +backends = new_backends + class TestGRU: @pytest.mark.parametrize( @@ -35,6 +45,7 @@ class TestGRU: "direction", "activation_functions", "symbolic", + "dtype", ], argvalues=itertools.product( compute_units, @@ -52,6 +63,7 @@ class TestGRU: ["sigmoid", "tanh"], ], [True, False], + [np.float32], ), ) def test_builder_to_backend_smoke( @@ -67,6 +79,7 @@ def test_builder_to_backend_smoke( direction, activation_functions, symbolic, + dtype, ): torch.manual_seed(5) @@ -144,8 +157,8 @@ def get_numpy_prediction_gru_single_batch( return np_out_final - x = np.random.rand(batch_size, seq_len, input_size) - h = np.random.rand(batch_size, hidden_size) + x = np.random.rand(batch_size, seq_len, input_size).astype(dtype) + h = np.random.rand(batch_size, hidden_size).astype(dtype) activation, inner_activation = activation_functions output, state = get_numpy_prediction_gru( @@ -157,24 +170,25 @@ def get_numpy_prediction_gru_single_batch( batch_size = get_new_symbol() seq_len = get_new_symbol() - hh_wt = np.concatenate([R_r, R_o, R_z], axis=0) - ih_wt = np.concatenate([W_r, W_o, W_z], axis=0) - b = np.concatenate([b_r, b_o, b_z], axis=0) + hh_wt = np.concatenate([R_r, R_o, R_z], axis=0).astype(dtype) + ih_wt = np.concatenate([W_r, W_o, W_z], axis=0).astype(dtype) + b = np.concatenate([b_r, b_o, b_z], axis=0).astype(dtype) input_shape = [seq_len, batch_size, input_size] h_shape = [batch_size, hidden_size] + builtin_dtype = numpy_type_to_builtin_type(dtype) input_placeholders = { - "x": mb.placeholder(shape=input_shape), - "initial_h": mb.placeholder(shape=h_shape), + "x": mb.placeholder(shape=input_shape, dtype=builtin_dtype), + "initial_h": mb.placeholder(shape=h_shape, dtype=builtin_dtype), } coreml_x = np.transpose(x, (1, 0, 2)) input_values = {"x": coreml_x, "initial_h": h} expected_output_types = [ - (seq_len if output_sequence else 1, batch_size, hidden_size, types.fp32), - (batch_size, hidden_size, types.fp32), + (seq_len if output_sequence else 1, batch_size, hidden_size, builtin_dtype), + (batch_size, hidden_size, 
builtin_dtype), ] def build(x, initial_h): @@ -200,7 +214,7 @@ def build(x, initial_h): expected_output_types, expected_outputs, inputs=construct_inputs_from_placeholders(input_placeholders, upper_bound=10) - if symbolic and backend[0] == "mlprogram" + if symbolic and backend.backend == "mlprogram" else None, compute_unit=compute_unit, backend=backend, @@ -224,6 +238,7 @@ class TestLSTM: "has_peephole", "coupled_input_forget", "clip", + "dtype", ] ), itertools.product( @@ -240,6 +255,7 @@ class TestLSTM: [True, False], [False], # We have not exposed this option yet! [50.0, 0.2, 0.01], + [np.float32], # Only support fp32 before iOS17. ), ) def test_numpy_numerical( @@ -257,6 +273,7 @@ def test_numpy_numerical( has_peephole, coupled_input_forget, clip, + dtype, ): def _apply_act(x, option): # All activation functions use their standard default values. @@ -328,14 +345,12 @@ def _get_numpy_prediction_lstm_single_batch(Weights, X): np_out_final = np_out[-1:, :] return np_out_final - batch = input_dims[0] - seq_len = input_dims[1] - input_size = input_dims[2] + batch, seq_len, input_size = input_dims hidden_size = output_dim # define random weights - W_x = np.random.rand(4 * hidden_size, input_size) - W_h = np.random.rand(4 * hidden_size, hidden_size) + W_x = np.random.rand(4 * hidden_size, input_size).astype(dtype) + W_h = np.random.rand(4 * hidden_size, hidden_size).astype(dtype) if has_bias: b = np.random.rand(4 * hidden_size) - 0.5 @@ -343,31 +358,32 @@ def _get_numpy_prediction_lstm_single_batch(Weights, X): b = b + 1 else: b = np.zeros((4 * hidden_size)) + b = b.astype(dtype) if has_peephole: p = np.random.rand(3 * hidden_size) - 0.5 else: p = np.zeros((3 * hidden_size)) + p = p.astype(dtype) - Weights = {} - Weights["W_x"] = W_x - Weights["W_h"] = W_h - Weights["b"] = b - Weights["p"] = p + weights = {"W_x": W_x, "W_h": W_h, "b": b, "p": p} - input_data = np.random.rand(batch, seq_len, input_size) - numpy_preds = _get_numpy_prediction_lstm(Weights, input_data) + input_data = np.random.rand(batch, seq_len, input_size).astype(dtype) + numpy_preds = _get_numpy_prediction_lstm(weights, input_data) numpy_preds = np.transpose(numpy_preds, [1, 0, 2]) coreml_input_data = np.transpose(input_data, [1, 0, 2]) - input_placeholders = {"x": mb.placeholder(shape=coreml_input_data.shape)} + builtin_dtype = numpy_type_to_builtin_type(dtype) + input_placeholders = { + "x": mb.placeholder(shape=coreml_input_data.shape, dtype=builtin_dtype) + } input_values = {"x": coreml_input_data} def build(x): h_all, ht, ct = mb.lstm( x=x, - initial_h=np.zeros((batch, hidden_size)).astype(np.float32), - initial_c=np.zeros((batch, hidden_size)).astype(np.float32), + initial_h=np.zeros((batch, hidden_size)).astype(dtype), + initial_c=np.zeros((batch, hidden_size)).astype(dtype), weight_ih=W_x, weight_hh=W_h, peephole=p, @@ -377,7 +393,7 @@ def build(x): recurrent_activation=activation, cell_activation=inner_activation, activation=outer_activation, - clip=clip, + clip=dtype(clip), ) return h_all @@ -385,7 +401,7 @@ def build(x): seq_len if return_seq else 1, batch, hidden_size, - types.fp32, + builtin_dtype, ) expected_outputs = numpy_preds @@ -414,6 +430,7 @@ def build(x): "output_sequence", "direction", "symbolic", + "dtype", ], argvalues=itertools.product( compute_units, @@ -426,6 +443,7 @@ def build(x): [True, False], ["forward", "reverse"], [True, False], + [np.float32], # Only support fp32 before iOS17. 
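+            # iOS17+ dtype coverage lives in iOS17/test_recurrent.py.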
), ) def test_builder_to_backend_smoke_unilstm( @@ -440,6 +458,7 @@ def test_builder_to_backend_smoke_unilstm( output_sequence, direction, symbolic, + dtype, ): torch.manual_seed(50) @@ -452,7 +471,7 @@ def test_builder_to_backend_smoke_unilstm( # Make weight compatible to CoreML format def ifzo_to_ifoz(x): i, f, z, o = np.split(x, 4) - return np.concatenate([i, f, o, z], axis=0) + return np.concatenate([i, f, o, z], axis=0).astype(dtype) w_x = ifzo_to_ifoz(ih_wt) w_h = ifzo_to_ifoz(hh_wt) @@ -493,19 +512,24 @@ def ifzo_to_ifoz(x): h_shape = [batch_size, hidden_size] c_shape = [batch_size, hidden_size] + builtin_dtype = numpy_type_to_builtin_type(dtype) expected_output_types = [ - (seq_len if output_sequence else 1, batch_size, hidden_size, types.fp32), - (batch_size, hidden_size, types.fp32), - (batch_size, hidden_size, types.fp32), + (seq_len if output_sequence else 1, batch_size, hidden_size, builtin_dtype), + (batch_size, hidden_size, builtin_dtype), + (batch_size, hidden_size, builtin_dtype), ] expected_outputs = [output, hn, cn] input_placeholders = { - "x": mb.placeholder(shape=input_shape), - "initial_h": mb.placeholder(shape=h_shape), - "initial_c": mb.placeholder(shape=c_shape), + "x": mb.placeholder(shape=input_shape, dtype=builtin_dtype), + "initial_h": mb.placeholder(shape=h_shape, dtype=builtin_dtype), + "initial_c": mb.placeholder(shape=c_shape, dtype=builtin_dtype), + } + input_values = { + "x": t.astype(dtype), + "initial_h": h.astype(dtype), + "initial_c": c.astype(dtype), } - input_values = {"x": t, "initial_h": h, "initial_c": c} def build(x, initial_h, initial_c): arguments = { @@ -529,7 +553,7 @@ def build(x, initial_h, initial_c): expected_output_types, expected_outputs, inputs=construct_inputs_from_placeholders(input_placeholders, upper_bound=64) - if symbolic and backend[0] == "mlprogram" + if symbolic and backend.backend == "mlprogram" else None, compute_unit=compute_unit, backend=backend, @@ -547,6 +571,7 @@ def build(x, initial_h, initial_c): "has_bias", "output_sequence", "symbolic", + "dtype", ], argvalues=itertools.product( compute_units, @@ -558,6 +583,7 @@ def build(x, initial_h, initial_c): [True, False], [True, False], [True, False], + [np.float32], ), ) def test_builder_to_backend_smoke_bidirlstm( @@ -571,6 +597,7 @@ def test_builder_to_backend_smoke_bidirlstm( has_bias, output_sequence, symbolic, + dtype, ): def _pytorch_hidden_to_coreml(x): x = x.detach().numpy() @@ -583,9 +610,7 @@ def _pytorch_hidden_to_coreml(x): direction = "bidirectional" torch.manual_seed(20) - rnn = torch.nn.LSTM( - input_size, hidden_size, 1, bidirectional=True, bias=has_bias - ) + rnn = torch.nn.LSTM(input_size, hidden_size, 1, bidirectional=True, bias=has_bias) state_dict = rnn.state_dict() ih_wt = state_dict["weight_ih_l0"].detach().numpy() @@ -595,7 +620,7 @@ def _pytorch_hidden_to_coreml(x): def ifzo_to_ifoz(x): i, f, z, o = np.split(x, 4) - return np.concatenate([i, f, o, z], axis=0) + return np.concatenate([i, f, o, z], axis=0).astype(dtype) wx = ifzo_to_ifoz(ih_wt) wh = ifzo_to_ifoz(hh_wt) @@ -625,9 +650,9 @@ def ifzo_to_ifoz(x): output_r = output[0].unsqueeze(0)[:, :, hidden_size:] output = torch.cat([output_f, output_r], dim=2) - output = output.detach().numpy() - hn = _pytorch_hidden_to_coreml(hn) - cn = _pytorch_hidden_to_coreml(cn) + output = output.detach().numpy().astype(dtype) + hn = _pytorch_hidden_to_coreml(hn).astype(dtype) + cn = _pytorch_hidden_to_coreml(cn).astype(dtype) if symbolic: batch_size = get_new_symbol() @@ -637,15 +662,16 @@ def ifzo_to_ifoz(x): 
h_shape = [batch_size, 2 * hidden_size] c_shape = [batch_size, 2 * hidden_size] + builtin_dtype = numpy_type_to_builtin_type(dtype) expected_output_types = [ ( seq_len if output_sequence else 1, batch_size, 2 * hidden_size, - types.fp32, + builtin_dtype, ), - (batch_size, 2 * hidden_size, types.fp32), - (batch_size, 2 * hidden_size, types.fp32), + (batch_size, 2 * hidden_size, builtin_dtype), + (batch_size, 2 * hidden_size, builtin_dtype), ] expected_outputs = [output, hn, cn] @@ -654,11 +680,15 @@ def ifzo_to_ifoz(x): c = _pytorch_hidden_to_coreml(c0) input_placeholders = { - "x": mb.placeholder(shape=input_shape), - "initial_h": mb.placeholder(shape=h_shape), - "initial_c": mb.placeholder(shape=c_shape), + "x": mb.placeholder(shape=input_shape, dtype=builtin_dtype), + "initial_h": mb.placeholder(shape=h_shape, dtype=builtin_dtype), + "initial_c": mb.placeholder(shape=c_shape, dtype=builtin_dtype), + } + input_values = { + "x": t.astype(dtype), + "initial_h": h.astype(dtype), + "initial_c": c.astype(dtype), } - input_values = {"x": t, "initial_h": h, "initial_c": c} def build(x, initial_h, initial_c): arguments = { @@ -685,7 +715,7 @@ def build(x, initial_h, initial_c): expected_output_types, expected_outputs, inputs=construct_inputs_from_placeholders(input_placeholders, upper_bound=64) - if symbolic and backend[0] == "mlprogram" + if symbolic and backend.backend == "mlprogram" else None, compute_unit=compute_unit, backend=backend, @@ -750,6 +780,7 @@ class TestRNN: "output_sequence", "direction", "symbolic", + "dtype", ], argvalues=itertools.product( compute_units, @@ -762,6 +793,7 @@ class TestRNN: [True, False], ["forward", "reverse"], [True, False], + [np.float32], ), ) def test_builder_to_backend_smoke( @@ -776,18 +808,19 @@ def test_builder_to_backend_smoke( output_sequence, direction, symbolic, + dtype, ): torch.manual_seed(50) rnn = torch.nn.RNN(input_size, hidden_size, 1, bias=has_bias) state_dict = rnn.state_dict() - ih_wt = state_dict["weight_ih_l0"].detach().numpy() - hh_wt = state_dict["weight_hh_l0"].detach().numpy() + ih_wt = state_dict["weight_ih_l0"].detach().numpy().astype(dtype) + hh_wt = state_dict["weight_hh_l0"].detach().numpy().astype(dtype) b = None if has_bias: - ih_b = state_dict["bias_ih_l0"].detach().numpy() - hh_b = state_dict["bias_hh_l0"].detach().numpy() + ih_b = state_dict["bias_ih_l0"].detach().numpy().astype(dtype) + hh_b = state_dict["bias_hh_l0"].detach().numpy().astype(dtype) b = ih_b + hh_b t = torch.randn(seq_len, batch_size, input_size) @@ -814,17 +847,18 @@ def test_builder_to_backend_smoke( input_shape = [seq_len, batch_size, input_size] h_shape = [batch_size, hidden_size] + builtin_dtype = numpy_type_to_builtin_type(dtype) expected_output_types = [ - (seq_len if output_sequence else 1, batch_size, hidden_size, types.fp32), - (batch_size, hidden_size, types.fp32), + (seq_len if output_sequence else 1, batch_size, hidden_size, builtin_dtype), + (batch_size, hidden_size, builtin_dtype), ] expected_outputs = [output, hn] input_placeholders = { - "x": mb.placeholder(shape=input_shape), - "initial_h": mb.placeholder(shape=h_shape), + "x": mb.placeholder(shape=input_shape, dtype=builtin_dtype), + "initial_h": mb.placeholder(shape=h_shape, dtype=builtin_dtype), } - input_values = {"x": t, "initial_h": h} + input_values = {"x": t.astype(dtype), "initial_h": h.astype(dtype)} def build(x, initial_h): arguments = { @@ -847,7 +881,7 @@ def build(x, initial_h): expected_output_types, expected_outputs, 
inputs=construct_inputs_from_placeholders(input_placeholders, upper_bound=64) - if symbolic and backend[0] == "mlprogram" + if symbolic and backend.backend == "mlprogram" else None, compute_unit=compute_unit, backend=backend, diff --git a/coremltools/converters/mil/mil/ops/tests/test_reduction.py b/coremltools/converters/mil/mil/ops/tests/iOS14/test_reduction.py similarity index 81% rename from coremltools/converters/mil/mil/ops/tests/test_reduction.py rename to coremltools/converters/mil/mil/ops/tests/iOS14/test_reduction.py index 3555e8764..54bac467c 100644 --- a/coremltools/converters/mil/mil/ops/tests/test_reduction.py +++ b/coremltools/converters/mil/mil/ops/tests/iOS14/test_reduction.py @@ -9,16 +9,13 @@ import pytest import scipy -import coremltools as ct -from coremltools.converters.mil import testing_reqs from coremltools.converters.mil.mil import Builder as mb from coremltools.converters.mil.mil import get_new_symbol, types +from coremltools.converters.mil.mil.ops.tests.iOS14 import backends from coremltools.converters.mil.mil.ops.tests.testing_utils import run_compare_builder +from coremltools.converters.mil.testing_reqs import compute_units from coremltools.converters.mil.testing_utils import random_gen, ssa_fn -backends = testing_reqs.backends -compute_units = testing_reqs.compute_units - class TestReduction: # All ops in this test share the same backends @@ -104,11 +101,7 @@ def test_builder_to_backend_smoke(self, compute_unit, backend, mode): @pytest.mark.parametrize( "compute_unit, backend, mode", - itertools.product( - compute_units, - backends, - ["max", "mean"] - ), + itertools.product(compute_units, backends, ["max", "mean"]), ) def test_builder_to_backend_global_pool_2d(self, compute_unit, backend, mode): # test lowering to spatial reduction to global_pool path @@ -137,15 +130,14 @@ def test_builder_to_backend_global_pool_2d(self, compute_unit, backend, mode): @pytest.mark.parametrize( "compute_unit, backend, mode", - itertools.product( - compute_units, - backends, - ["max", "mean"] - ), + itertools.product(compute_units, backends, ["max", "mean"]), ) def test_builder_to_backend_global_pool_none(self, compute_unit, backend, mode): # test lowering to spatial reduction to global_pool path for axis = None - val = np.array([[[[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]], [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]]], dtype=np.float32) + val = np.array( + [[[[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]], [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]]], + dtype=np.float32, + ) input_placeholders = {"x": mb.placeholder(shape=val.shape)} input_values = {"x": val} @@ -170,11 +162,7 @@ def test_builder_to_backend_global_pool_none(self, compute_unit, backend, mode): @pytest.mark.parametrize( "compute_unit, backend, mode", - itertools.product( - compute_units, - backends, - ["max", "mean"] - ), + itertools.product(compute_units, backends, ["max", "mean"]), ) def test_builder_to_backend_global_pool_3d(self, compute_unit, backend, mode): # test lowering to spatial reduction to global_pool path @@ -201,14 +189,7 @@ def test_builder_to_backend_global_pool_3d(self, compute_unit, backend, mode): backend=backend, ) - - @pytest.mark.parametrize( - ["axis", "keep_dims"], - itertools.product( - [1, -3], - [True, False] - ) - ) + @pytest.mark.parametrize(["axis", "keep_dims"], itertools.product([1, -3], [True, False])) def test_builder_eval(self, axis, keep_dims): x_val = random_gen(shape=(1, 3, 4, 4), rand_min=-100.0, rand_max=100.0) @@ -303,7 +284,11 @@ def test_reduce_sum_square(): test_reduce_sum_square() 
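# Note on the @ssa_fn eval tests above: they rely on the builder's constant
# folding. When every input to a reduce op is a compile-time constant, the op
# is evaluated eagerly and the result is exposed on the output var's `.val`.
# A minimal illustrative sketch (assumes an @ssa_fn context like the test
# above; not one of the original checks):
#   v = mb.reduce_sum(x=np.arange(6, dtype=np.float32).reshape(2, 3), axes=[1])
#   np.testing.assert_allclose(v.val, [3.0, 12.0])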
@pytest.mark.parametrize( - "compute_unit, backend", itertools.product(compute_units, backends,) + "compute_unit, backend", + itertools.product( + compute_units, + backends, + ), ) def test_builder_to_backend_symbolic(self, compute_unit, backend): s0 = get_new_symbol() @@ -334,67 +319,19 @@ def build(x): backend=backend, ) - @pytest.mark.parametrize( - "input_size", [(1), (2), (1,2), (2,2), (2,3,4), (2,3,4,10)] - ) + @pytest.mark.parametrize("input_size", [(1), (2), (1, 2), (2, 2), (2, 3, 4), (2, 3, 4, 10)]) def test_reduce_log_sum_exp_value_inference(self, input_size): rs = np.random.RandomState(1234) x = rs.random(input_size) for axis in range(-x.ndim, x.ndim - 1): + @mb.program(input_specs=[]) def prog(): - return mb.reduce_log_sum_exp(x=x, axes=(axis,)) + return mb.reduce_log_sum_exp(x=x, axes=(axis,)) op = list(prog.functions.values())[0].operations[3] - assert op.op_type == 'reduce_log_sum_exp' + assert op.op_type == "reduce_log_sum_exp" np.testing.assert_allclose( - op.value_inference(), - scipy.special.logsumexp(x, axis=axis), - atol=1e-04, - rtol=1e-05 + op.value_inference(), scipy.special.logsumexp(x, axis=axis), atol=1e-04, rtol=1e-05 ) - - @pytest.mark.parametrize( - "compute_unit, backend, op_name, output_dtype", - itertools.product( - compute_units, backends, ["reduce_argmax", "reduce_argmin"], ["int32", "uint16", None] - ), - ) - def test_reduce_arg_ios17_output_dtype(self, compute_unit, backend, op_name, output_dtype): - def build(x): - return getattr(mb, op_name)(x=x, axis=1, keep_dims=False, output_dtype=output_dtype) - - val = np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]], dtype=np.float32) - input_placeholders = {"x": mb.placeholder(shape=val.shape)} - input_values = {"x": val} - output_np_type = np.uint16 if output_dtype == "uint16" else np.int32 - output_type = types.uint16 if output_dtype == "uint16" else types.int32 - expected_output_types = (2, output_type) - expected_outputs = np.array( - [2, 2] if op_name == "reduce_argmax" else [0, 0], dtype=output_np_type - ) - - run_compare_builder( - build, - input_placeholders, - input_values, - expected_output_types, - expected_outputs, - compute_unit=compute_unit, - backend=backend, - minimum_deployment_target=ct.target.iOS17, - ) - - @pytest.mark.parametrize( - "op_name", - ["reduce_argmax", "reduce_argmin"], - ) - def test_reduce_arg_ios17_output_dtype_invalid(self, op_name): - x = np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]], dtype=np.float32) - - def prog(): - return getattr(mb, op_name)(x=x, axis=1, keep_dims=False, output_dtype="dummy") - - with pytest.raises(ValueError, match='Invalid "output_dtype" dummy'): - mb.program(input_specs=[], opset_version=ct.target.iOS17)(prog) diff --git a/coremltools/converters/mil/mil/ops/tests/iOS14/test_scatter_gather.py b/coremltools/converters/mil/mil/ops/tests/iOS14/test_scatter_gather.py new file mode 100644 index 000000000..a10f516f4 --- /dev/null +++ b/coremltools/converters/mil/mil/ops/tests/iOS14/test_scatter_gather.py @@ -0,0 +1,675 @@ +# Copyright (c) 2020, Apple Inc. All rights reserved. 
+# +# Use of this source code is governed by a BSD-3-clause license that can be +# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause + +import itertools + +import numpy as np +import pytest + +import coremltools as ct +from coremltools._deps import _HAS_TF_2, MSG_TF2_NOT_FOUND +from coremltools.converters.mil.mil import Builder as mb +from coremltools.converters.mil.mil import types +from coremltools.converters.mil.mil.ops.tests.iOS14 import backends +from coremltools.converters.mil.mil.ops.tests.testing_utils import ( + mark_api_breaking, + run_compare_builder, +) +from coremltools.converters.mil.testing_reqs import compute_units + +if _HAS_TF_2: + import tensorflow as tf + + +class TestScatter: + @pytest.mark.parametrize( + "compute_unit, backend", + itertools.product(compute_units, backends), + ) + def test_builder_to_backend_smoke(self, compute_unit, backend): + data = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32) + indices = np.array([1, 0], dtype=np.int32) + updates = np.array([[5, 6, 7], [8, 9, 10]], dtype=np.float32) + input_placeholders = { + "data": mb.placeholder(shape=data.shape), + "indices": mb.placeholder(shape=indices.shape, dtype=types.int32), + "updates": mb.placeholder(shape=updates.shape), + } + + input_values = {"data": data, "indices": indices, "updates": updates} + + def build(data, indices, updates): + return (mb.scatter(data=data, indices=indices, updates=updates),) + + expected_output_types = (2, 3, types.fp32) + + expected_outputs = np.array([[9, 11, 13], [9, 11, 13]], dtype=np.float32) + + run_compare_builder( + build, + input_placeholders, + input_values, + expected_output_types, + expected_outputs, + compute_unit=compute_unit, + backend=backend, + ) + + @pytest.mark.skipif(not _HAS_TF_2, reason=MSG_TF2_NOT_FOUND) + @pytest.mark.parametrize( + "compute_unit, backend, rankData_rankIndices, accumulate_mode", + itertools.product( + compute_units, + backends, + [(1, 2), (2, 1), (3, 2), (2, 3), (1, 1), (3, 3), (1, 3)], + ["update", "add", "sub", "mul", "div", "max", "min"], + ), + ) + def test_builder_to_backend_programmatic( + self, + compute_unit, + backend, + rankData_rankIndices, + accumulate_mode, + ): + data_rank, indices_rank = rankData_rankIndices + data_shape = np.random.randint(low=2, high=5, size=data_rank) + indices_shape = np.random.randint(low=2, high=5, size=indices_rank) + updates_shape = list(indices_shape) + list(data_shape[1:]) + + data = np.random.rand(*data_shape).astype(np.float32) + updates = np.random.rand(*updates_shape).astype(np.float32) + indices = np.random.randint(0, data_shape[0], size=indices_shape).astype(np.int32) + + def build(data, indices, updates): + return mb.scatter(data=data, indices=indices, updates=updates, mode=accumulate_mode) + + tf_output = tf.Variable(data) + if accumulate_mode == "update": + tf.compat.v1.scatter_update(tf_output, indices, updates) + if accumulate_mode == "add": + tf.compat.v1.scatter_add(tf_output, indices, updates) + if accumulate_mode == "sub": + tf.compat.v1.scatter_sub(tf_output, indices, updates) + if accumulate_mode == "mul": + tf.compat.v1.scatter_mul(tf_output, indices, updates) + if accumulate_mode == "div": + tf.compat.v1.scatter_div(tf_output, indices, updates) + if accumulate_mode == "max": + tf.compat.v1.scatter_max(tf_output, indices, updates) + if accumulate_mode == "min": + tf.compat.v1.scatter_min(tf_output, indices, updates) + expected_output = tf_output.numpy() + + input_placeholders = { + "data": mb.placeholder(shape=data.shape), + "indices": 
mb.placeholder(shape=indices.shape, dtype=types.int32), + "updates": mb.placeholder(shape=updates.shape), + } + + input_values = {"data": data, "indices": indices, "updates": updates} + + expected_output_types = tuple(data_shape[:]) + (types.fp32,) + run_compare_builder( + build, + input_placeholders, + input_values, + expected_output_types, + expected_output, + compute_unit=compute_unit, + backend=backend, + ) + +class TestScatterAlongAxis: + @pytest.mark.parametrize( + "compute_unit, backend", + itertools.product(compute_units, backends), + ) + def test_builder_to_backend_smoke(self, compute_unit, backend): + data = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32) + indices = np.array([[1, 0, 1], [1, 1, 0]], dtype=np.int32) + updates = np.array([[5, 6, 7], [8, 9, 10]], dtype=np.float32) + input_placeholders = { + "data": mb.placeholder(shape=data.shape), + "indices": mb.placeholder(shape=indices.shape, dtype=types.int32), + "updates": mb.placeholder(shape=updates.shape), + } + + input_values = {"data": data, "indices": indices, "updates": updates} + + def build(data, indices, updates): + return mb.scatter_along_axis( + data=data, indices=indices, updates=updates, axis=0, mode="update" + ) + + expected_output_types = (2, 3, types.fp32) + + expected_outputs = np.array([[1, 6, 10], [8, 9, 7]], dtype=np.float32) + + run_compare_builder( + build, + input_placeholders, + input_values, + expected_output_types, + expected_outputs, + compute_unit=compute_unit, + backend=backend, + ) + + @pytest.mark.parametrize( + "backend", + backends, + ) + def test_builder_eval(self, backend): + @mb.program( + input_specs=[mb.TensorSpec(shape=(1,), dtype=types.fp32)], + opset_version=backend.opset_version, + ) + def prog(x): + params = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32) + indices = np.array([[1, 0, 1], [1, 1, 0]], dtype=np.int32) + updates = np.array([[5, 6, 7], [8, 9, 10]], dtype=np.float32) + res = mb.scatter_along_axis( + data=params, indices=indices, updates=updates, axis=0, mode="update" + ) + return res + + main_func = prog.functions["main"] + scatter_ops = main_func.find_ops(op_type="scatter_along_axis")[0] + + np.testing.assert_allclose( + np.array([[1, 6, 10], [8, 9, 7]], dtype=np.float32), + scatter_ops.outputs[0].val, + atol=1e-04, + rtol=1e-05, + ) + + @staticmethod + def _test_builder_to_backend_programmatic( + compute_unit, backend, rank_axis, force_non_negative_indices + ): + rank, axis = rank_axis + data_shape = np.random.randint(low=2, high=8, size=rank) + indices_shape = np.copy(data_shape) + indices_shape[axis] = np.random.randint(low=1, high=8) + updates_shape = indices_shape + + data = np.random.rand(*data_shape).astype(np.float32) + updates = np.random.rand(*updates_shape).astype(np.float32) + if force_non_negative_indices: + # IOS17 scatter_along_axis requires indices to be non-negative.
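+ # (Earlier opsets also accept negative indices with Python-style
+ # wrap-around; the else branch below exercises that behavior, and
+ # np.put_along_axis mirrors it when computing the reference output.)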
+ indices = np.random.randint(0, data_shape[axis], size=indices_shape).astype(np.int32) + else: + indices = np.random.randint( + -data_shape[axis], data_shape[axis], size=indices_shape + ).astype(np.int32) + + def build(data, indices, updates): + return mb.scatter_along_axis( + data=data, indices=indices, updates=updates, axis=axis, mode="update" + ) + + input_placeholders = { + "data": mb.placeholder(shape=data.shape), + "indices": mb.placeholder(shape=indices.shape, dtype=types.int32), + "updates": mb.placeholder(shape=updates.shape), + } + + input_values = {"data": data, "indices": indices, "updates": updates} + + expected_output_types = tuple(data_shape[:]) + (types.fp32,) + + np_output = np.copy(data) + np.put_along_axis(np_output, indices, updates, axis=axis) + + run_compare_builder( + build, + input_placeholders, + input_values, + expected_output_types, + np_output, + compute_unit=compute_unit, + backend=backend, + ) + + @mark_api_breaking(breaking_opset_version=ct.target.iOS17) + @pytest.mark.parametrize( + "compute_unit, backend, rank_axis", + itertools.product( + compute_units, + backends, + [(rank, axis) for rank in range(1, 5) for axis in range(-rank, rank)], + ), + ) + def test_builder_to_backend_programmatic( + self, + compute_unit, + backend, + rank_axis, + ): + self._test_builder_to_backend_programmatic( + compute_unit, backend, rank_axis, force_non_negative_indices=False + ) + + +class TestScatterNd: + @pytest.mark.parametrize( + "compute_unit, backend", + itertools.product(compute_units, backends), + ) + def test_builder_to_backend_smoke(self, compute_unit, backend): + data = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32) + indices = np.array([[1, 0], [0, 2]], dtype=np.int32) + updates = np.array([5, 10], dtype=np.float32) + input_placeholders = { + "data": mb.placeholder(shape=data.shape), + "indices": mb.placeholder(shape=indices.shape, dtype=types.int32), + "updates": mb.placeholder(shape=updates.shape), + } + + input_values = {"data": data, "indices": indices, "updates": updates} + + def build(data, indices, updates): + return (mb.scatter_nd(data=data, indices=indices, updates=updates),) + + run_compare_builder( + build, + input_placeholders, + input_values, + expected_output_types=(2, 3, types.fp32), + expected_outputs=np.array([[1, 2, 13], [9, 5, 6]], dtype=np.float32), + compute_unit=compute_unit, + backend=backend, + ) + + @pytest.mark.skipif(not _HAS_TF_2, reason=MSG_TF2_NOT_FOUND) + @pytest.mark.parametrize( + "compute_unit, backend, rankData_rankIndices, accumulate_mode", + itertools.product( + compute_units, + backends, + [(2, 2), (1, 4), (5, 2), (4, 3), (3, 4), (1, 5)], + ["update", "add", "sub"], + ), + ) + def test_builder_to_backend_programmatic( + self, + compute_unit, + backend, + rankData_rankIndices, + accumulate_mode, + ): + data_rank, indices_rank = rankData_rankIndices + data_shape = np.random.randint(low=2, high=5, size=data_rank) + indices_shape = np.random.randint(low=2, high=5, size=indices_rank) + indices_shape[-1] = np.random.randint(low=1, high=data_rank + 1) + updates_shape = list(indices_shape[:-1]) + list(data_shape[indices_shape[-1] :]) + + data = np.random.rand(*data_shape).astype(np.float32) + updates = np.random.rand(*updates_shape).astype(np.float32) + indices_list = [] + for i in range(indices_shape[-1]): + indices_list.append(np.random.randint(0, data_shape[i], size=indices_shape[:-1])) + + indices = np.stack(indices_list, axis=-1).astype(np.int32) + + def build(data, indices, updates): + return mb.scatter_nd(data=data, 
indices=indices, updates=updates, mode=accumulate_mode) + + tf_output = tf.Variable(data) + if accumulate_mode == "update": + tf.compat.v1.scatter_nd_update(tf_output, indices, updates) + if accumulate_mode == "add": + tf.compat.v1.scatter_nd_add(tf_output, indices, updates) + if accumulate_mode == "sub": + tf.compat.v1.scatter_nd_sub(tf_output, indices, updates) + expected_output = tf_output.numpy() + + input_placeholders = { + "data": mb.placeholder(shape=data.shape), + "indices": mb.placeholder(shape=indices.shape, dtype=types.int32), + "updates": mb.placeholder(shape=updates.shape), + } + + input_values = {"data": data, "indices": indices, "updates": updates} + + expected_output_types = tuple(data_shape[:]) + (types.fp32,) + run_compare_builder( + build, + input_placeholders, + input_values, + expected_output_types, + expected_output, + compute_unit=compute_unit, + backend=backend, + ) + +class TestGather: + @pytest.mark.parametrize( + "compute_unit, backend", + itertools.product(compute_units, backends), + ) + def test_builder_to_backend_smoke(self, compute_unit, backend): + x = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32) + indices = np.array([1, 0], dtype=np.int32) + input_placeholders = { + "x": mb.placeholder(shape=x.shape), + "indices": mb.placeholder(shape=indices.shape, dtype=types.int32), + } + + input_values = {"x": x, "indices": indices} + + def build(x, indices): + return [ + mb.gather(x=x, indices=indices, axis=0), + mb.gather(x=x, indices=indices, axis=1), + mb.gather(x=x, indices=indices, axis=-2), + mb.gather(x=x, indices=indices, axis=-1), + mb.gather(x=x, indices=indices), + # mb.gather(x=x, indices=1), #shape of scalar indices is incorrect. + # mb.gather(x=x, indices=1, axis=1), #Scalar index passes on axis=0 but fails on axis=1, + # Need to handle rank 0 correctly, rdar://73160449 + ] + + expected_output_types = [ + (2, 3, types.fp32), + (2, 2, types.fp32), + (2, 3, types.fp32), + (2, 2, types.fp32), + (2, 3, types.fp32), + # (3, types.fp32), + ] + + expected_outputs = [ + np.array([[4, 5, 6], [1, 2, 3]], dtype=np.float32), + np.array([[2, 1], [5, 4]], dtype=np.float32), + np.array([[4, 5, 6], [1, 2, 3]], dtype=np.float32), + np.array([[2, 1], [5, 4]], dtype=np.float32), + np.array([[4, 5, 6], [1, 2, 3]], dtype=np.float32), + # np.array([4, 5, 6], dtype=np.float32), + ] + + run_compare_builder( + build, + input_placeholders, + input_values, + expected_output_types, + expected_outputs, + compute_unit=compute_unit, + backend=backend, + ) + + @pytest.mark.parametrize( + "compute_unit, backend", + itertools.product(compute_units, backends), + ) + def test_embedding_builder_to_backend_smoke(self, compute_unit, backend): + x = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32) + indices = np.array([1, 0], dtype=np.int32) + input_placeholders = { + "indices": mb.placeholder(shape=indices.shape, dtype=types.int32), + } + + input_values = {"indices": indices} + + def build(indices): + return [ + mb.gather(x=x, indices=indices, axis=0), + mb.gather(x=x, indices=indices, axis=-2), + ] + + expected_output_types = [ + (2, 3, types.fp32), + (2, 3, types.fp32), + ] + + expected_outputs = [ + np.array([[4, 5, 6], [1, 2, 3]], dtype=np.float32), + np.array([[4, 5, 6], [1, 2, 3]], dtype=np.float32), + ] + + run_compare_builder( + build, + input_placeholders, + input_values, + expected_output_types, + expected_outputs, + compute_unit=compute_unit, + backend=backend, + ) + + @pytest.mark.parametrize( + "backend", + backends, + ) + def test_builder_eval(self, backend): + 
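+ # Value-inference check: every input to the mb.gather call below is a
+ # compile-time constant, so the op is folded during program construction
+ # and its output var carries a concrete .val, asserted further down
+ # against the numpy equivalent params[:, indices].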
@mb.program( + input_specs=[mb.TensorSpec(shape=(1,), dtype=types.fp32)], + opset_version=backend.opset_version, + ) + def prog(x): + params = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32) + indices = np.array([1, 0], dtype=np.int32) + res = mb.gather(x=params, indices=indices, axis=-1) + return res + + main_func = prog.functions["main"] + gather_ops = main_func.find_ops(op_type="gather")[0] + + np.testing.assert_allclose( + np.array([[2, 1], [5, 4]], dtype=np.float32), + gather_ops.outputs[0].val, + atol=1e-04, + rtol=1e-05, + ) + + @pytest.mark.parametrize( + "backend, indices_val, validate_indices", + itertools.product(backends, [[-1, 0], [0, 3]], [True, False]), + ) + def test_builder_invalid_indices(self, backend, indices_val, validate_indices): + def prog(x): + params = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32) + indices = np.array(indices_val, dtype=np.int32) + res = mb.gather(x=params, indices=indices, axis=-1) + return res + + if any([idx > 2 for idx in indices_val]): + with pytest.raises(IndexError, match="index 3 is out of bounds for axis 1 with size 3"): + mb.program( + input_specs=[mb.TensorSpec(shape=(1,), dtype=types.fp32)], + opset_version=backend.opset_version, + )(prog) + else: + mb.program( + input_specs=[mb.TensorSpec(shape=(1,), dtype=types.fp32)], + opset_version=backend.opset_version, + )(prog) + + +class TestGatherAlongAxis: + @pytest.mark.parametrize( + "compute_unit, backend", + itertools.product(compute_units, backends), + ) + def test_builder_to_backend_smoke(self, compute_unit, backend): + x = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32) + indices = np.array([[1, 0, 1], [1, 1, 0]], dtype=np.int32) + input_placeholders = { + "x": mb.placeholder(shape=x.shape), + "indices": mb.placeholder(shape=indices.shape, dtype=types.int32), + } + + input_values = {"x": x, "indices": indices} + + def build(x, indices): + return [ + mb.gather_along_axis(x=x, indices=indices, axis=0), + mb.gather_along_axis(x=x, indices=indices, axis=1), + mb.gather_along_axis(x=x, indices=indices, axis=-2), + mb.gather_along_axis(x=x, indices=indices, axis=-1), + mb.gather_along_axis(x=x, indices=indices), + ] + + expected_output_types = [ + (2, 3, types.fp32), + (2, 3, types.fp32), + (2, 3, types.fp32), + (2, 3, types.fp32), + (2, 3, types.fp32), + ] + + expected_outputs = [ + np.array([[4, 2, 6], [4, 5, 3]], dtype=np.float32), + np.array([[2, 1, 2], [5, 5, 4]], dtype=np.float32), + np.array([[4, 2, 6], [4, 5, 3]], dtype=np.float32), + np.array([[2, 1, 2], [5, 5, 4]], dtype=np.float32), + np.array([[4, 2, 6], [4, 5, 3]], dtype=np.float32), + ] + + run_compare_builder( + build, + input_placeholders, + input_values, + expected_output_types, + expected_outputs, + compute_unit=compute_unit, + backend=backend, + ) + + @pytest.mark.parametrize( + "backend", + backends, + ) + def test_builder_eval(self, backend): + @mb.program( + input_specs=[mb.TensorSpec(shape=(1,), dtype=types.fp32)], + opset_version=backend.opset_version, + ) + def prog(x): + params = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32) + indices = np.array([[1, 0, 1], [0, 0, 1]], dtype=np.int32) + res = mb.gather_along_axis(x=params, indices=indices, axis=0) + return res + + main_func = prog.functions["main"] + gather_ops = main_func.find_ops(op_type="gather_along_axis")[0] + + np.testing.assert_allclose( + np.array([[4, 2, 6], [1, 2, 6]], dtype=np.float32), + gather_ops.outputs[0].val, + atol=1e-04, + rtol=1e-05, + ) + + @staticmethod + def _test_builder_to_backend_programmatic( + compute_unit, backend, 
rank_axis, force_non_negative_indices + ): + rank, axis = rank_axis + x_shape = np.random.randint(low=2, high=8, size=rank) + indices_shape = np.copy(x_shape) + indices_shape[axis] = np.random.randint(low=1, high=8) + + x = np.random.rand(*x_shape).astype(np.float32) + + # IOS17 gather_along_axis requires non-negative indices. + lower_bound = 0 if force_non_negative_indices else -x_shape[axis] + indices = np.random.randint(lower_bound, x_shape[axis], size=indices_shape).astype(np.int32) + + def build(x, indices): + return mb.gather_along_axis(x=x, indices=indices, axis=axis) + + input_placeholders = { + "x": mb.placeholder(shape=x.shape), + "indices": mb.placeholder(shape=indices.shape, dtype=types.int32), + } + + input_values = {"x": x, "indices": indices} + + expected_output_types = tuple(indices_shape[:]) + (types.fp32,) + expected_output = np.take_along_axis(x, indices, axis=axis) + + run_compare_builder( + build, + input_placeholders, + input_values, + expected_output_types, + expected_output, + compute_unit=compute_unit, + backend=backend, + ) + + @mark_api_breaking(breaking_opset_version=ct.target.iOS17) + @pytest.mark.parametrize( + "compute_unit, backend, rank_axis", + itertools.product( + compute_units, + backends, + [(rank, axis) for rank in range(1, 5) for axis in range(-rank, rank)], + ), + ) + def test_builder_to_backend_programmatic(self, compute_unit, backend, rank_axis): + self._test_builder_to_backend_programmatic(compute_unit, backend, rank_axis, False) + + @pytest.mark.parametrize( + "backend, indices_val, validate_indices", + itertools.product( + backends, + [[[1, 0, -1], [0, 0, 1]], [[1, 0, 1], [0, 0, 2]]], + [True, False], + ), + ) + def test_builder_invalid_indices(self, backend, indices_val, validate_indices): + def prog(x): + params = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32) + indices = np.array(indices_val, dtype=np.int32) + res = mb.gather_along_axis(x=params, indices=indices, axis=0) + return res + + if any([idx > 1 for sub_indices in indices_val for idx in sub_indices]): + with pytest.raises(IndexError, match="index 2 is out of bounds for axis 0 with size 2"): + mb.program( + input_specs=[mb.TensorSpec(shape=(1,), dtype=types.fp32)], + opset_version=backend.opset_version, + )(prog) + else: + mb.program( + input_specs=[mb.TensorSpec(shape=(1,), dtype=types.fp32)], + opset_version=backend.opset_version, + )(prog) + + +class TestGatherNd: + @pytest.mark.parametrize( + "compute_unit, backend", + itertools.product(compute_units, backends), + ) + def test_builder_to_backend_smoke(self, compute_unit, backend): + x = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32) + indices = np.array([[1, 0], [0, 2]], dtype=np.int32) + input_placeholders = { + "x": mb.placeholder(shape=x.shape), + "indices": mb.placeholder(shape=indices.shape, dtype=types.int32), + } + + input_values = {"x": x, "indices": indices} + + def build(x, indices): + return (mb.gather_nd(x=x, indices=indices),) + + expected_output_types = (2, types.fp32) + expected_outputs = np.array([4, 3], dtype=np.float32) + + run_compare_builder( + build, + input_placeholders, + input_values, + expected_output_types, + expected_outputs, + compute_unit=compute_unit, + frontend_only=False, + backend=backend, + ) diff --git a/coremltools/converters/mil/mil/ops/tests/test_tensor_operation.py b/coremltools/converters/mil/mil/ops/tests/iOS14/test_tensor_operation.py similarity index 86% rename from coremltools/converters/mil/mil/ops/tests/test_tensor_operation.py rename to 
coremltools/converters/mil/mil/ops/tests/iOS14/test_tensor_operation.py index 9fd8ffda6..53b719c0b 100644 --- a/coremltools/converters/mil/mil/ops/tests/test_tensor_operation.py +++ b/coremltools/converters/mil/mil/ops/tests/iOS14/test_tensor_operation.py @@ -5,32 +5,30 @@ import itertools import platform -from unittest.mock import patch import numpy as np import pytest import coremltools as ct from coremltools._deps import _HAS_TF_2, MSG_TF2_NOT_FOUND -from coremltools.converters.mil import testing_reqs from coremltools.converters.mil.mil import Builder as mb from coremltools.converters.mil.mil import Function, get_new_symbol, types -from coremltools.converters.mil.mil.passes.pass_pipeline import PassPipeline -from coremltools.converters.mil.mil.var import Var -from coremltools.converters.mil.testing_utils import get_op_types_in_program, random_gen, ssa_fn - -from .testing_utils import ( +from coremltools.converters.mil.mil.ops.tests.iOS14 import backends +from coremltools.converters.mil.mil.ops.tests.testing_utils import ( UNK_SYM, UNK_VARIADIC, construct_inputs_from_placeholders, + mark_api_breaking, run_compare_builder, ) +from coremltools.converters.mil.mil.types.symbolic import is_symbolic +from coremltools.converters.mil.mil.types.type_mapping import nptype_from_builtin +from coremltools.converters.mil.testing_reqs import compute_units +from coremltools.converters.mil.testing_utils import get_op_types_in_program, random_gen, ssa_fn if _HAS_TF_2: import tensorflow as tf -backends = testing_reqs.backends -compute_units = testing_reqs.compute_units class TestBandPart: @pytest.mark.parametrize( @@ -118,9 +116,13 @@ def build(x): ) def get_output_from_mlmodel( - self, x_val: np.ndarray, num_lower: int, num_upper: int + self, + x_val: np.ndarray, + num_lower: int, + num_upper: int, + dtype: type, ) -> np.ndarray: - @mb.program(input_specs=[mb.TensorSpec(shape=(3, 4))]) + @mb.program(input_specs=[mb.TensorSpec(shape=(3, 4), dtype=dtype)]) def prog(x): return mb.band_part(x=x, lower=num_lower, upper=num_upper, name="out") @@ -134,9 +136,13 @@ def prog(x): return out def get_value_inference_output( - self, x_val: np.ndarray, num_lower: int, num_upper: int + self, + x_val: np.ndarray, + num_lower: int, + num_upper: int, + dtype: type, ) -> np.ndarray: - func_inputs = {"x": mb.placeholder(shape=[3, 4])} + func_inputs = {"x": mb.placeholder(shape=[3, 4], dtype=dtype)} with Function(func_inputs) as ssa_fun: x = ssa_fun.inputs["x"] v = mb.band_part(x=x_val, lower=num_lower, upper=num_upper) @@ -146,14 +152,23 @@ def get_value_inference_output( ct.utils._macos_version() < (10, 15), reason="needs mlprogram, skip on macos < 10.15" ) @pytest.mark.parametrize( - "lower_upper", - [(0, -1), (-1, 0), (0, 0), (1, 1), (1, 2), (2, 1)], + "lower_upper, dtype", + itertools.product( + [(0, -1), (-1, 0), (0, 0), (1, 1), (1, 2), (2, 1)], + [types.int32, types.fp32], + ), ) - def test_value_inference(self, lower_upper): + def test_value_inference(self, lower_upper, dtype): num_lower, num_upper = lower_upper - test_input = np.random.rand(3, 4).astype(np.float32) - out_value_inference = self.get_value_inference_output(test_input, num_lower, num_upper) - out_from_model_prediction = self.get_output_from_mlmodel(test_input, num_lower, num_upper) + np_type = nptype_from_builtin(dtype) + test_input = np.random.rand(3, 4).astype(np_type) + out_value_inference = self.get_value_inference_output( + test_input, num_lower, num_upper, dtype + ) + out_from_model_prediction = self.get_output_from_mlmodel( + test_input, 
num_lower, num_upper, dtype + ) + assert out_value_inference.dtype == test_input.dtype np.testing.assert_allclose( out_value_inference, out_from_model_prediction, atol=1e-3, rtol=1e-3 ) @@ -267,45 +282,8 @@ def test_invalid_input2(self): mb.cumsum(x=x_val) -class TestFillLike: - @pytest.mark.parametrize( - "compute_unit, backend", itertools.product(compute_units, backends) - ) - def test_builder_to_backend_smoke(self, compute_unit, backend): - if backend[0] == "neuralnetwork": - pytest.xfail("nn backend not supported") - - if ct.utils._macos_version() < (13, 0): - pytest.skip("fill_like not supported in macOS12 or older.") - - shape = (2, 1, 3) - x_val = np.zeros(shape=shape, dtype=np.float32) - input_placeholders = {"x": mb.placeholder(shape=x_val.shape, dtype=types.int32)} - - input_values = {"x": x_val} - - def build(x): - return mb.fill_like(ref_tensor=x, value=1.0) - - expected_output_types = [(2, 1, 3, types.fp32)] - expected_outputs = [np.full(shape=shape, fill_value=1.0)] - - mlmodel = run_compare_builder( - build, - input_placeholders, - input_values, - expected_output_types, - expected_outputs, - compute_unit=compute_unit, - backend=backend, - minimum_deployment_target=ct.target.iOS16, - ) - - class TestFill: - @pytest.mark.parametrize( - "compute_unit, backend", itertools.product(compute_units, backends) - ) + @pytest.mark.parametrize("compute_unit, backend", itertools.product(compute_units, backends)) def test_builder_to_backend_smoke(self, compute_unit, backend): shape = (2, 1, 3) x_val = np.zeros(shape=shape, dtype=np.float32) @@ -393,7 +371,7 @@ def build(shape): expected_output_types, expected_outputs, inputs=construct_inputs_from_placeholders(input_placeholders, 3) - if backend[0] == "mlprogram" + if backend.backend == "mlprogram" else None, compute_unit=compute_unit, backend=backend, @@ -402,6 +380,7 @@ def build(shape): @pytest.mark.skipif(not _HAS_TF_2, reason=MSG_TF2_NOT_FOUND) class TestNonMaximumSuppression: + @mark_api_breaking(breaking_opset_version=ct.target.iOS17) @pytest.mark.parametrize( "compute_unit, backend", itertools.product( @@ -502,9 +481,7 @@ def _ref_non_maximum_suppression( score_threshold = score_threshold.astype(np.float32) # convert box ids to TF style - center_w, center_h, width, height = np.split( - boxes, 4, axis=-1 - ) # (n_batch,n_box,1) + center_w, center_h, width, height = np.split(boxes, 4, axis=-1) # (n_batch,n_box,1) y1 = center_h - 0.5 * height y2 = center_h + 0.5 * height x1 = center_w - 0.5 * width @@ -575,6 +552,7 @@ def _ref_non_maximum_suppression( return out1, out2, out3, out4 + @mark_api_breaking(breaking_opset_version=ct.target.iOS17) @pytest.mark.parametrize( ",".join( [ @@ -610,18 +588,22 @@ def test_builder_to_backend_stress( n_score, per_class_suppression, ): - if backend[0] == "mlprogram" and iou_threshold_percentile == 0: + if backend.backend == "mlprogram" and iou_threshold_percentile == 0: pytest.xfail("rdar://78080118") - if backend[0] == "neuralnetwork" and n_boxes == (10, 7) and platform.machine() == "x86_64": + if ( + backend.backend == "neuralnetwork" + and n_boxes == (10, 7) + and platform.machine() == "x86_64" + ): pytest.xfail("rdar://78080118 (Investigate failing tests for NMS in coremltools)") - if backend == ("mlprogram", "fp16"): + if backend.backend == "mlprogram" and backend.precision == "fp16": pytest.xfail("CPU: rdar://80662705 and GPU: rdar://80661262") n_boxes_in, n_boxes_out = n_boxes boxes_val = random_gen((n_batch, n_boxes_in, 4), 0, 100) - scores_val = random_gen((n_batch, n_boxes_in, n_score), -100, 
100) + scores_val = random_gen((n_batch, n_boxes_in, n_score), -100, 100, allow_duplicate=False) iou_matrix = self._compute_iou_matrix(boxes_val[0, :, :]) iou_matrix = iou_matrix[~np.eye(iou_matrix.shape[0], dtype=bool)].reshape( @@ -633,9 +615,7 @@ def test_builder_to_backend_stress( elif score_threshold_percentile == 100: score_threshold = np.max(scores_val) + 1 else: - score_threshold = ( - np.percentile(scores_val, score_threshold_percentile) + 0.01 - ) + score_threshold = np.percentile(scores_val, score_threshold_percentile) + 0.01 if iou_threshold_percentile == 0: iou_threshold = np.maximum(np.min(iou_matrix) - 0.01, 0.0) @@ -643,12 +623,7 @@ def test_builder_to_backend_stress( iou_threshold = np.percentile(iou_matrix, iou_threshold_percentile) + 0.01 iou_threshold = np.maximum(iou_threshold, 1e-8) - ( - tf_boxes, - tf_scores, - tf_indices, - tf_num_boxes, - ) = self._ref_non_maximum_suppression( + (tf_boxes, tf_scores, tf_indices, tf_num_boxes,) = self._ref_non_maximum_suppression( boxes_val, scores_val, iou_threshold, @@ -692,9 +667,7 @@ def build(boxes, scores): class TestNonZero: - @pytest.mark.parametrize( - "compute_unit, backend", itertools.product(compute_units, backends) - ) + @pytest.mark.parametrize("compute_unit, backend", itertools.product(compute_units, backends)) def test_builder_to_backend_smoke(self, compute_unit, backend): x_val = np.array([[3, 0, 0], [0, 4, 0], [5, 6, 0]], dtype=np.float32) input_placeholders = {"x": mb.placeholder(shape=x_val.shape)} @@ -729,6 +702,7 @@ def test_shape_inference_for_deterministic_input(self): res = mb.non_zero(x=x_val) assert res.shape == (3, 2) + class TestOneHot: @pytest.mark.parametrize( "compute_unit, backend", @@ -752,12 +726,8 @@ def build(x, y): return [ mb.one_hot(indices=x, one_hot_vector_size=4), mb.one_hot(indices=x, one_hot_vector_size=4, axis=0), - mb.one_hot( - indices=x, one_hot_vector_size=4, on_value=1.0, off_value=0.1 - ), - mb.one_hot( - indices=x, one_hot_vector_size=mb.squeeze(x=y), on_value=1, off_value=9 - ), + mb.one_hot(indices=x, one_hot_vector_size=4, on_value=1.0, off_value=0.1), + mb.one_hot(indices=x, one_hot_vector_size=mb.squeeze(x=y), on_value=1, off_value=9), ] expected_output_types = [ @@ -921,9 +891,7 @@ def test_constant_general(): input_values = {"x": t} def build(x): - return mb.pad( - x=x, pad=pad.reshape(-1), mode="constant", constant_val=0.0 - ) + return mb.pad(x=x, pad=pad.reshape(-1), mode="constant", constant_val=0.0) expected_output_types = (4, 6, 5, types.fp32) expected_outputs = np.pad(t, pad, mode="constant") @@ -968,9 +936,7 @@ def test_constant_mode(): def test_reflect_mode(): x_val = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32) - v = mb.pad( - x=x_val, pad=np.array([1, 1, 2, 2], dtype=np.int32), mode="reflect" - ) + v = mb.pad(x=x_val, pad=np.array([1, 1, 2, 2], dtype=np.int32), mode="reflect") expected_outputs = np.array( [ [6.0, 5.0, 4.0, 5.0, 6.0, 5.0, 4.0], @@ -984,9 +950,7 @@ def test_reflect_mode(): def test_replicate_mode(): x_val = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32) - v = mb.pad( - x=x_val, pad=np.array([1, 1, 2, 2], dtype=np.int32), mode="replicate" - ) + v = mb.pad(x=x_val, pad=np.array([1, 1, 2, 2], dtype=np.int32), mode="replicate") expected_outputs = np.array( [ [1.0, 1.0, 1.0, 2.0, 3.0, 3.0, 3.0], @@ -1011,6 +975,37 @@ def test_constant_general(): test_replicate_mode() test_constant_general() + @staticmethod + def test_value_inference_with_symbolic_padding(): + @mb.program( + input_specs=[ + mb.TensorSpec(shape=(get_new_symbol(), 
get_new_symbol(), 1, 3), dtype=types.fp32) + ] + ) + def prog(x): + paddings = mb.shape(x=x) + res = mb.pad(x=np.random.rand(1, 1), pad=paddings) + shape = res.shape + assert is_symbolic(shape[0]) + assert shape[1] == 5 + return res + + @staticmethod + def test_error_out_with_dynamic_paddings_with_invalid_shape(): + with pytest.raises( + ValueError, match=r"Non-constant 'pad' must have shape \(8,\). Got \(4,\)" + ): + + @mb.program( + input_specs=[ + mb.TensorSpec(shape=(1, 1, 3, 4)), + mb.TensorSpec(shape=(2, 2), dtype=types.int32), + ] + ) + def prog(x, y): + pad = mb.reshape(x=y, shape=[-1]) + res = mb.pad(x=x, pad=pad) + class TestRange1d: @pytest.mark.parametrize( @@ -1089,9 +1084,7 @@ def test_large_array(self, compute_unit, backend): def build(x): return [mb.range_1d(start=0.0, end=2000000.0, step=1.0)] - expected_output_types = [ - (2000000, types.fp32) - ] + expected_output_types = [(2000000, types.fp32)] expected_outputs = [ np.arange(0.0, 2000000.0, 1.0), @@ -1260,102 +1253,6 @@ def build(x): backend=backend, ) - @pytest.mark.parametrize( - "compute_unit, backend, return_indices, sort", - itertools.product( - compute_units, - backends, - [True, False], - [True, False], - ) - ) - def test_builder_to_backend_smoke_iOS16(self, compute_unit, backend, return_indices, sort): - val = np.array([[-1.0, 2.0, -3.0], [4.0, -5.0, 6.0]], dtype=np.float32) - input_placeholders = {"x": mb.placeholder(shape=val.shape)} - input_values = {"x": val} - - def build(x): - return mb.topk(x=x, k=2, axis=1, return_indices=return_indices, sort=sort) - - expected_output_types = [ - (2, 2, types.fp32), - (2, 2, types.int32), - ] - expected_outputs = [ - np.array([[2.0, -1.0], [6.0, 4.0]], dtype=np.float32), - np.array([[1, 0], [2, 0]], dtype=np.float32), - ] - - if not return_indices: - expected_output_types = expected_output_types[:1] - expected_outputs = expected_outputs[:1] - - run_compare_builder( - build, - input_placeholders, - input_values, - expected_output_types, - expected_outputs, - compute_unit=compute_unit, - backend=backend, - minimum_deployment_target=ct.target.iOS16, - ) - - @pytest.mark.parametrize( - "compute_unit, backend, x_dtype, k_dtype", - itertools.product( - compute_units, - [("mlprogram", "fp16")], - [np.float32, np.float16, np.int32, np.int16, np.uint16], - [np.int32, np.int16], - ), - ) - def test_ios17_different_dtypes(self, compute_unit, backend, x_dtype, k_dtype): - def build(x): - return mb.topk(x=x, k=k_dtype(2), axis=1) - - if k_dtype == np.int16: - pytest.xfail("k with dtype int16 will trigger backend error.") - - val = np.array([[2, 3, 1], [5, 4, 6]], dtype=x_dtype) - x_mb_dtype = types.type_mapping.numpy_type_to_builtin_type(x_dtype) - input_placeholders = {"x": mb.placeholder(shape=val.shape, dtype=x_mb_dtype)} - input_values = {"x": val} - # As int16 is not in CoreML I/O supported dtypes, it will be cast to int32. - expected_output_types = [(2, 2, x_mb_dtype), (2, 2, types.int32)] - expected_outputs = [ - np.array([[3, 2], [6, 5]], dtype=x_dtype), - np.array([[1, 0], [2, 0]], dtype=np.int32), - ] - - with patch.object(Var, "_is_nonreplaceable_var") as mocked_is_nonreplaceable_var: - # Mock that the cast is non-replaceable, to make sure it's kept in the graph. - mocked_is_nonreplaceable_var.side_effect = ( - lambda var: var.op and var.op.op_type == "cast" - ) - # Remove the cast optimization pass to make sure all cast are kept in the graph. 
- pass_pipeline: PassPipeline = PassPipeline.DEFAULT - pass_pipeline.remove_passes( - ["common::cast_optimization", "common::topological_reorder"] - ) - mlmodel = run_compare_builder( - build, - input_placeholders, - input_values, - expected_output_types, - expected_outputs, - compute_unit=compute_unit, - backend=backend, - minimum_deployment_target=ct.target.iOS17, - pass_pipeline=pass_pipeline, - ) - prog = mlmodel._mil_program - topk_op = prog["main"].find_ops(op_type="topk")[0] - expected_x_dtype = x_mb_dtype - if backend[1] == "fp16" and types.is_float(x_mb_dtype): - expected_x_dtype = types.fp16 - assert types.builtin_to_string(topk_op.x.dtype) == types.builtin_to_string(expected_x_dtype) - @ssa_fn def test_builder_eval(self): def np_topk(x, k, axis, ascending=False): @@ -1424,9 +1321,7 @@ class TestFlatten2d: ), ) def test_builder_to_backend_smoke(self, compute_unit, backend): - t = np.array( - [[[1, 2, 3], [4, 5, 6]], [[-1, -2, -3], [-4, -5, -6]]], dtype=np.float32 - ) + t = np.array([[[1, 2, 3], [4, 5, 6]], [[-1, -2, -3], [-4, -5, -6]]], dtype=np.float32) input_placeholders = {"x": mb.placeholder(shape=t.shape)} input_values = {"x": t} @@ -1535,7 +1430,7 @@ def build(x): expected_output_types, expected_outputs, inputs=construct_inputs_from_placeholders(input_placeholders, 10) - if backend[0] == "mlprogram" + if backend.backend == "mlprogram" else None, compute_unit=compute_unit, backend=backend, @@ -1545,11 +1440,7 @@ def build(x): class TestShape: @pytest.mark.parametrize( "compute_unit, backend, input_type", - itertools.product( - compute_units, - backends, - ["int32", "float32"] - ) + itertools.product(compute_units, backends, ["int32", "float32"]), ) def test_builder_to_backend_smoke(self, compute_unit, backend, input_type): np_type = np.int32 if input_type == "int32" else np.float32 @@ -1586,11 +1477,7 @@ def test_builder_eval(self): @pytest.mark.parametrize( "compute_unit, backend, input_type", - itertools.product( - compute_units, - backends, - ["int32", "float32"] - ) + itertools.product(compute_units, backends, ["int32", "float32"]), ) def test_builder_to_backend_symbolic(self, compute_unit, backend, input_type): np_type = np.int32 if input_type == "int32" else np.float32 @@ -1620,7 +1507,7 @@ def build(x): expected_output_types, expected_outputs, inputs=construct_inputs_from_placeholders(input_placeholders, 10) - if backend[0] == "mlprogram" + if backend.backend == "mlprogram" else None, compute_unit=compute_unit, backend=backend, @@ -1630,11 +1517,7 @@ def build(x): class TestIdentity: @pytest.mark.parametrize( "compute_unit, backend, input_type", - itertools.product( - compute_units, - backends, - ["int32", "float32"] - ) + itertools.product(compute_units, backends, ["int32", "float32"]), ) def test_builder_to_backend_smoke(self, compute_unit, backend, input_type): np_type = np.int32 if input_type == "int32" else np.float32 diff --git a/coremltools/converters/mil/mil/ops/tests/test_tensor_transformation.py b/coremltools/converters/mil/mil/ops/tests/iOS14/test_tensor_transformation.py similarity index 66% rename from coremltools/converters/mil/mil/ops/tests/test_tensor_transformation.py rename to coremltools/converters/mil/mil/ops/tests/iOS14/test_tensor_transformation.py index ac5f715e1..93f21ac53 100644 --- a/coremltools/converters/mil/mil/ops/tests/test_tensor_transformation.py +++ b/coremltools/converters/mil/mil/ops/tests/iOS14/test_tensor_transformation.py @@ -13,16 +13,15 @@ from coremltools.converters.mil import testing_reqs from coremltools.converters.mil.mil 
import Builder as mb from coremltools.converters.mil.mil import get_new_symbol, types -from coremltools.converters.mil.mil.types import nptype_from_builtin -from coremltools.converters.mil.testing_reqs import backends, compute_units -from coremltools.converters.mil.testing_utils import ssa_fn - -from .testing_utils import ( +from coremltools.converters.mil.mil.ops.tests.iOS14 import backends +from coremltools.converters.mil.mil.ops.tests.testing_utils import ( UNK_SYM, UNK_VARIADIC, construct_inputs_from_placeholders, run_compare_builder, ) +from coremltools.converters.mil.testing_reqs import compute_units +from coremltools.converters.mil.testing_utils import ssa_fn if _HAS_TORCH: import torch @@ -30,7 +29,11 @@ class TestDepthToSpace: @pytest.mark.parametrize( - "compute_unit, backend", itertools.product(compute_units, backends,) + "compute_unit, backend", + itertools.product( + compute_units, + backends, + ), ) def test_builder_to_backend_smoke(self, compute_unit, backend): # original input type is (1, 4, 1, 1, fp32) @@ -57,14 +60,18 @@ def build(x): class TestSpaceToBatch: @pytest.mark.parametrize( - "compute_unit, backend", itertools.product(compute_units, backends,) + "compute_unit, backend", + itertools.product( + compute_units, + backends, + ), ) def test_builder_to_backend_smoke(self, compute_unit, backend): # original input type is (2, 1, 2, 4, fp32) - val = np.array([[[[ 1, 2, 3, 4], - [ 5, 6, 7, 8]]], - [[[ 9, 10, 11, 12], - [13, 14, 15, 16]]]], dtype=np.float32) + val = np.array( + [[[[1, 2, 3, 4], [5, 6, 7, 8]]], [[[9, 10, 11, 12], [13, 14, 15, 16]]]], + dtype=np.float32, + ) input_placeholders = {"x": mb.placeholder(shape=val.shape)} input_values = {"x": val} @@ -72,14 +79,19 @@ def build(x): return [mb.space_to_batch(x=x, block_shape=[2, 2], paddings=[[0, 0], [2, 0]])] expected_output_types = (8, 1, 1, 3, types.fp32) - expected_outputs = np.array([[[[ 0, 1, 3]]], - [[[ 0, 9, 11]]], - [[[ 0, 2, 4]]], - [[[ 0, 10, 12]]], - [[[ 0, 5, 7]]], - [[[ 0, 13, 15]]], - [[[ 0, 6, 8]]], - [[[ 0, 14, 16]]]], dtype=np.float32) + expected_outputs = np.array( + [ + [[[0, 1, 3]]], + [[[0, 9, 11]]], + [[[0, 2, 4]]], + [[[0, 10, 12]]], + [[[0, 5, 7]]], + [[[0, 13, 15]]], + [[[0, 6, 8]]], + [[[0, 14, 16]]], + ], + dtype=np.float32, + ) run_compare_builder( build, @@ -94,18 +106,27 @@ def build(x): class TestBatchToSpace: @pytest.mark.parametrize( - "compute_unit, backend", itertools.product(compute_units, backends,) + "compute_unit, backend", + itertools.product( + compute_units, + backends, + ), ) def test_builder_to_backend_smoke(self, compute_unit, backend): # original input type is (8, 1, 1, 3, fp32) - val = np.array([[[[ 0, 1, 3]]], - [[[ 0, 9, 11]]], - [[[ 0, 2, 4]]], - [[[ 0, 10, 12]]], - [[[ 0, 5, 7]]], - [[[ 0, 13, 15]]], - [[[ 0, 6, 8]]], - [[[ 0, 14, 16]]]], dtype=np.float32) + val = np.array( + [ + [[[0, 1, 3]]], + [[[0, 9, 11]]], + [[[0, 2, 4]]], + [[[0, 10, 12]]], + [[[0, 5, 7]]], + [[[0, 13, 15]]], + [[[0, 6, 8]]], + [[[0, 14, 16]]], + ], + dtype=np.float32, + ) input_placeholders = {"x": mb.placeholder(shape=val.shape)} input_values = {"x": val} @@ -113,10 +134,10 @@ def build(x): return [mb.batch_to_space(x=x, block_shape=[2, 2], crops=[[0, 0], [2, 0]])] expected_output_types = (2, 1, 2, 4, types.fp32) - expected_outputs = np.array([[[[ 1, 2, 3, 4], - [ 5, 6, 7, 8]]], - [[[ 9, 10, 11, 12], - [13, 14, 15, 16]]]], dtype=np.float32) + expected_outputs = np.array( + [[[[1, 2, 3, 4], [5, 6, 7, 8]]], [[[9, 10, 11, 12], [13, 14, 15, 16]]]], + dtype=np.float32, + ) run_compare_builder( 
build, @@ -131,7 +152,11 @@ def build(x): class TestExpandDims: @pytest.mark.parametrize( - "compute_unit, backend", itertools.product(compute_units, backends,) + "compute_unit, backend", + itertools.product( + compute_units, + backends, + ), ) def test_builder_to_backend_smoke(self, compute_unit, backend): t = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32) @@ -176,7 +201,11 @@ def build(x): ) @pytest.mark.parametrize( - "compute_unit, backend", itertools.product(compute_units, backends,) + "compute_unit, backend", + itertools.product( + compute_units, + backends, + ), ) def test_builder_to_backend_symbolic(self, compute_unit, backend): s0 = get_new_symbol() @@ -210,7 +239,7 @@ def build(x): expected_output_types, expected_outputs, inputs=construct_inputs_from_placeholders(input_placeholders, 10) - if backend[0] == "mlprogram" + if backend.backend == "mlprogram" else None, compute_unit=compute_unit, backend=backend, @@ -230,23 +259,19 @@ def test_builder_eval(self): np.testing.assert_allclose(ref, v3.val, atol=1e-04, rtol=1e-05) v4 = mb.expand_dims(x=x_val, axes=[0, -1, -2]) - np.testing.assert_allclose(np.reshape(x_val, (1, 1, 6, 1, 1)), v4.val, atol=1e-04, rtol=1e-05) + np.testing.assert_allclose( + np.reshape(x_val, (1, 1, 6, 1, 1)), v4.val, atol=1e-04, rtol=1e-05 + ) @pytest.mark.parametrize( "compute_unit, backend, rank_and_axis", itertools.product( compute_units, backends, - [ - (rank, axis) - for rank in range(1, 5) - for axis in range(-rank - 1, rank + 1) - ], + [(rank, axis) for rank in range(1, 5) for axis in range(-rank - 1, rank + 1)], ), ) - def test_builder_to_backend_programmatic_one_axis( - self, compute_unit, backend, rank_and_axis - ): + def test_builder_to_backend_programmatic_one_axis(self, compute_unit, backend, rank_and_axis): rank, axis = rank_and_axis x_shape = np.random.randint(low=2, high=6, size=rank) input_placeholders = {"x": mb.placeholder(shape=x_shape)} @@ -320,92 +345,13 @@ def build(x): ) -class TestReshapeLike: +class TestReshape: @pytest.mark.parametrize( - "compute_unit, backend, InputShape_RefShapes_Begins_Ends_EndMasks, InputType_RefType", + "compute_unit, backend", itertools.product( compute_units, backends, - [ - [(4, 3), ((2, 2, 3), (1, 3)), (0, 1), (2, 2), (False, False)], - [(32,), ((1, 2, 2, 2), (3, 2, 2)), (1, 1), (0, 0), (True, True)], - [(72, 1), ((1, 2, 3, 4, 1), (3,)), (1, 0), (0, 1), (True, False)], - ], - [(types.bool, types.fp32), (types.fp32, types.bool)], - ) - ) - def test_builder_to_backend_smoke( - self, - compute_unit, - backend, - InputShape_RefShapes_Begins_Ends_EndMasks, - InputType_RefType, - ): - if backend[0] == "neuralnetwork": - pytest.skip("reshape_like not supoprted in neuralnetwork backend.") - - if ct.utils._macos_version() < (13, 0): - pytest.skip("reshape_like not supported in macOS12 or older.") - - input_shape, ref_shapes, begins, ends, end_masks = InputShape_RefShapes_Begins_Ends_EndMasks - ref_shape_1, ref_shape_2 = ref_shapes - input_type, ref_type = InputType_RefType - - t = np.random.rand(*input_shape).astype(np.float32) - ref_tensor_1 = np.random.rand(*ref_shape_1).astype(np.float32) - ref_tensor_2 = np.random.rand(*ref_shape_2).astype(np.float32) - - input_placeholders = { - "x": mb.placeholder(shape=t.shape), - "ref_tensor_1": mb.placeholder(shape=ref_shape_1), - "ref_tensor_2": mb.placeholder(shape=ref_shape_2), - } - input_values = { - "x": t, - "ref_tensor_1": ref_tensor_1, - "ref_tensor_2": ref_tensor_2, - } - - def build(x, ref_tensor_1, ref_tensor_2): - if input_type == types.bool: - x = 
mb.cast(x=x, dtype="bool") - - if ref_type == types.bool: - ref_tensor_1 = mb.cast(x=ref_tensor_1, dtype="bool") - ref_tensor_2 = mb.cast(x=ref_tensor_2, dtype="bool") - - ref_tensors = (ref_tensor_1, ref_tensor_2) - return mb.reshape_like(x=x, ref_tensors=ref_tensors, begins=begins, ends=ends, end_masks=end_masks) - - output_shape = () - for ref_shape, begin, end, end_mask in zip((ref_shape_1, ref_shape_2), begins, ends, end_masks): - if end_mask: - output_shape += tuple(ref_shape[begin:]) - else: - output_shape += tuple(ref_shape[begin:end]) - - expected_output_types = [ - output_shape + (input_type,), - ] - expected_outputs = [ - np.reshape(t, output_shape).astype(nptype_from_builtin(input_type)), - ] - - run_compare_builder( - build, - input_placeholders, - input_values, - expected_output_types, - expected_outputs, - compute_unit=compute_unit, - backend=backend, - minimum_deployment_target=ct.target.iOS16 - ) - - -class TestReshape: - @pytest.mark.parametrize( - "compute_unit, backend", itertools.product(compute_units, backends,) + ), ) def test_builder_to_backend_smoke(self, compute_unit, backend): t = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32) @@ -451,7 +397,11 @@ def test_builder_eval(self): np.testing.assert_allclose(expected_r2, r2.val, atol=1e-04, rtol=1e-05) @pytest.mark.parametrize( - "compute_unit, backend", itertools.product(compute_units, backends,) + "compute_unit, backend", + itertools.product( + compute_units, + backends, + ), ) def test_builder_to_backend_symbolic(self, compute_unit, backend): s0 = get_new_symbol() @@ -500,7 +450,7 @@ def build(x, shape, shape2): expected_output_types, expected_outputs, inputs=construct_inputs_from_placeholders(input_placeholders, 10) - if backend[0] == "mlprogram" + if backend.backend == "mlprogram" else None, compute_unit=compute_unit, backend=backend, @@ -524,137 +474,24 @@ def test_invalid_target_shape_with_zero(self): with pytest.raises(ValueError, match="Invalid target shape in `reshape` op"): mb.reshape(x=x, shape=[0, 7]) - @pytest.mark.parametrize( - "compute_unit, backend", - itertools.product( - compute_units, - backends, - ), - ) - def test_reshape_with_zero(self, compute_unit, backend): - if backend[0] == "neuralnetwork": - pytest.skip("Reshape with 0 is not supported in neuralnetwork.") - - t = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32) - input_placeholders = {"x": mb.placeholder(shape=t.shape)} - input_values = {"x": t} - - def build(x): - return [ - mb.reshape(x=x, shape=[0, -1]), - mb.reshape(x=x, shape=[0, 3]), - mb.reshape(x=x, shape=[-1, 0]), - ] - - expected_output_types = [ - (2, 3, types.fp32), - (2, 3, types.fp32), - (2, 3, types.fp32), - ] - expected_outputs = [ - np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32), - np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32), - np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32), - ] - - run_compare_builder( - build, - input_placeholders, - input_values, - expected_output_types, - expected_outputs, - compute_unit=compute_unit, - backend=backend, - ) - - @pytest.mark.parametrize( - "compute_unit, backend", - itertools.product( - compute_units, - backends, - ), - ) - def test_reshape_with_zero_different_len(self, compute_unit, backend): - if backend[0] == "neuralnetwork": - pytest.skip("Reshape with 0 is not supported in neuralnetwork.") - - t = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32) - input_placeholders = {"x": mb.placeholder(shape=t.shape)} - input_values = {"x": t} - - def build(x): - return [ - mb.reshape(x=x, shape=[1, 0, -1, 0]), + 
@staticmethod + def test_value_inference_with_symbolic_values(): + @mb.program( + input_specs=[ + mb.TensorSpec(shape=(get_new_symbol(), get_new_symbol()), dtype=types.fp32) ] - - expected_output_types = [ - (1, 1, 2, 3, types.fp32), - ] - expected_outputs = [ - np.array([[[[1, 2, 3], [4, 5, 6]]]], dtype=np.float32), - ] - - with pytest.raises( - ValueError, - match="When there is 0 in shape, the rank of x .* must " - "equal to the target shape len", - ): - run_compare_builder( - build, - input_placeholders, - input_values, - expected_output_types, - expected_outputs, - compute_unit=compute_unit, - backend=backend, - ) - - @pytest.mark.parametrize( - "compute_unit, backend", - itertools.product( - compute_units, - backends, - ), - ) - def test_reshape_with_zero_different_len(self, compute_unit, backend): - if backend[0] == "neuralnetwork": - pytest.skip("Reshape with 0 is not supported in neuralnetwork.") - - t = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32) - input_placeholders = {"x": mb.placeholder(shape=t.shape)} - input_values = {"x": t} - - def build(x): - return [mb.reshape(x=x, shape=[1, 0, -1, 0])] - - # In IOS15/16 it will error out because rank of x needs to have same length as shape. - with pytest.raises( - ValueError, - match="When there is 0 in shape, the rank of x .* must " - "equal to the target shape len", - ): - run_compare_builder( - build, - input_placeholders, - input_values, - compute_unit=compute_unit, - backend=backend, - ) - - # In IOS17 it accepts different length. - expected_output_types = [(1, 1, 2, 3, types.fp32)] - expected_outputs = [np.array([[[[1, 2, 3], [4, 5, 6]]]], dtype=np.float32)] - run_compare_builder( - build, - input_placeholders, - input_values, - expected_output_types, - expected_outputs, - compute_unit=compute_unit, - backend=backend, - minimum_deployment_target=ct.target.iOS17, ) + def prog(x): + shape = mb.shape(x=x) + res = mb.reshape(x=shape, shape=(1, 2)) + res_sym_val = res.sym_val + assert res_sym_val is not None + assert res_sym_val.shape == (1, 2) + assert res_sym_val[0][0] == shape.sym_val[0] + assert res_sym_val[0][1] == shape.sym_val[1] + return res +class TestReverse: @pytest.mark.parametrize( "compute_unit, backend", itertools.product( @@ -662,33 +499,6 @@ def build(x): backends, ), ) - def test_reshape_invalid_with_zero(self, compute_unit, backend): - if backend[0] == "neuralnetwork": - pytest.skip("Reshape with 0 is not supported in neuralnetwork.") - - t = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32) - input_placeholders = {"x": mb.placeholder(shape=t.shape)} - input_values = {"x": t} - - def build(x): - return [mb.reshape(x=x, shape=[4, 0, -1, 0])] - - with pytest.raises(ValueError, match="Invalid target shape in `reshape` op"): - run_compare_builder( - build, - input_placeholders, - input_values, - compute_unit=compute_unit, - backend=backend, - minimum_deployment_target=ct.target.iOS17, - ) - - - -class TestReverse: - @pytest.mark.parametrize( - "compute_unit, backend", itertools.product(compute_units, backends,) - ) def test_builder_to_backend_smoke(self, compute_unit, backend): val = np.array([[-1.0, 2.0, -3.0], [4.0, -5.0, 6.0]], dtype=np.float32) input_placeholders = {"x": mb.placeholder(shape=val.shape)} @@ -720,7 +530,11 @@ def test_builder_eval(self): np.testing.assert_allclose(np.flip(val, axis=0), res.val, atol=1e-04, rtol=1e-05) @pytest.mark.parametrize( - "compute_unit, backend", itertools.product(compute_units, backends,) + "compute_unit, backend", + itertools.product( + compute_units, + backends, + ), 
) def test_builder_to_backend_symbolic(self, compute_unit, backend): s0 = get_new_symbol() @@ -757,7 +571,11 @@ def build(x): class TestReverseSequence: @pytest.mark.parametrize( - "compute_unit, backend", itertools.product(compute_units, backends,) + "compute_unit, backend", + itertools.product( + compute_units, + backends, + ), ) def test_builder_to_backend_smoke(self, compute_unit, backend): x_val = np.array( @@ -774,9 +592,7 @@ def test_builder_to_backend_smoke(self, compute_unit, backend): def build(x): return [ - mb.reverse_sequence( - x=x, lengths=[7, 2, 3, 5], seq_axis=1, batch_axis=0 - ), + mb.reverse_sequence(x=x, lengths=[7, 2, 3, 5], seq_axis=1, batch_axis=0), ] expected_output_types = [ @@ -805,7 +621,11 @@ def build(x): ) @pytest.mark.parametrize( - "compute_unit, backend", itertools.product(compute_units, backends,) + "compute_unit, backend", + itertools.product( + compute_units, + backends, + ), ) def test_builder_to_backend_symbolic(self, compute_unit, backend): s0 = get_new_symbol() @@ -824,9 +644,7 @@ def test_builder_to_backend_symbolic(self, compute_unit, backend): def build(x): return [ - mb.reverse_sequence( - x=x, lengths=[7, 2, 3, 5], seq_axis=1, batch_axis=0 - ), + mb.reverse_sequence(x=x, lengths=[7, 2, 3, 5], seq_axis=1, batch_axis=0), ] expected_output_types = [ @@ -851,40 +669,160 @@ def build(x): expected_output_types, expected_outputs, inputs=construct_inputs_from_placeholders(input_placeholders, 10) - if backend[0] == "mlprogram" + if backend.backend == "mlprogram" else None, compute_unit=compute_unit, backend=backend, ) -class TestSliceBySize: +class TestSliceByIndex: @pytest.mark.parametrize( - "compute_unit, backend", itertools.product(compute_units, backends,) + "compute_unit, backend, x_dtype, idx_dtype", + itertools.product( + compute_units, + backends, + (np.float16, np.float32, np.int32), + (np.int32,), + ), ) - def test_builder_to_backend_smoke(self, compute_unit, backend): - x_val = np.array(list(range(24))).reshape((2, 3, 4)).astype(np.float32) - begin_val = np.array([1, 1, 1], dtype=np.int32) + def test_builder_to_backend_smoke(self, compute_unit, backend, x_dtype, idx_dtype): + x_builtin_dtype = types.numpy_type_to_builtin_type(x_dtype) + idx_builtin_dtype = types.numpy_type_to_builtin_type(idx_dtype) + + x_val = np.array(list(range(24))).reshape((2, 3, 4)).astype(x_dtype) + begin_val = np.array([1, 1, 1], dtype=idx_dtype) + end_val = np.array([2, 3, 3], dtype=idx_dtype) input_placeholders = { - "x": mb.placeholder(shape=x_val.shape), - "begin": mb.placeholder(shape=begin_val.shape, dtype=types.int32), + "x": mb.placeholder(shape=x_val.shape, dtype=x_builtin_dtype), + "begin": mb.placeholder(shape=begin_val.shape, dtype=idx_builtin_dtype), + "end": mb.placeholder(shape=end_val.shape, dtype=idx_builtin_dtype), } - input_values = {"x": x_val, "begin": begin_val} + input_values = {"x": x_val, "begin": begin_val, "end": end_val} - def build_non_single(x, begin): + def build(x, begin, end): + begin_c = mb.const(val=begin_val) + end_c = mb.const(val=end_val) return [ - mb.slice_by_size(x=x, begin=begin, size=[1, 2, 3]), + mb.slice_by_index(x=x, begin=begin, end=end), + mb.slice_by_index(x=x, begin=begin_c, end=end_c), ] - def build_single(x, begin): + expected_output_types = [(UNK_SYM, UNK_SYM, UNK_SYM, x_builtin_dtype)] * 2 + expected_outputs = [np.array([[[17, 18], [21, 22]]], dtype=x_dtype)] * 2 + run_compare_builder( + build, + input_placeholders, + input_values, + expected_output_types, + expected_outputs, + compute_unit=compute_unit, + 
backend=backend, + ) + + def test_type_inference(self): + s0 = get_new_symbol() + s1 = get_new_symbol() + s2 = get_new_symbol() + + input_placeholders = { + "x": mb.placeholder(shape=(10, s0, s1, s2)), + } + + def build(x): return [ - mb.slice_by_size(x=x, begin=begin, size=[-1, 2, -1]), + mb.slice_by_index( + x=x, begin=[2, 5, 6, 12], end=[6, 9, 20, -9], stride=[2, 1, 2, 1] + ), + mb.slice_by_index( + x=x, + begin=[-2, -5, -3, 9], + end=[-6, -9, -6, -7], + stride=[-2, -1, -2, 1], + ), + mb.slice_by_index( + x=x, + begin=[0, 0, 0, 0], + end=[-6, -9, 3, -2], + stride=[-2, -3, 1, 2], + begin_mask=[True, True, True, True], + end_mask=[False, False, False, False], + ), + mb.slice_by_index( + x=x, + begin=[-2, 5, -1, -7], + end=[0, 0, 0, 0], + stride=[-2, -3, 1, -2], + begin_mask=[False, False, False, False], + end_mask=[True, True, True, True], + ), + mb.slice_by_index( + x=x, begin=[4, -1, 0, -5], end=[4, -1, 0, -5], stride=[1, -1, 2, -2] + ), + mb.slice_by_index( + x=x, + begin=[0, -1, 0, 2], + end=[2, 0, 0, 2], + begin_mask=[False, False, False, False], + end_mask=[False, True, True, False], + stride=[1, 2, -2, 1], + ), + mb.slice_by_index( + x=x, + begin=[0, 2, -3, 0], + end=[1, 3, -4, 4], + begin_mask=[False, False, False, False], + end_mask=[False, False, False, False], + stride=[1, 1, -1, 1], + ), ] - expected_output_types = [(1, 2, 3, types.fp32)] - expected_outputs = [np.array([[[17, 18, 19], [21, 22, 23]]], dtype=np.float32)] + expected_output_types = [ + (2, UNK_SYM, UNK_SYM, UNK_SYM, types.fp32), + (2, UNK_SYM, UNK_SYM, UNK_SYM, types.fp32), + (3, UNK_SYM, UNK_SYM, UNK_SYM, types.fp32), + (5, UNK_SYM, 1, UNK_SYM, types.fp32), + (0, 0, 0, 0, types.fp32), + (2, 1, 1, 0, types.fp32), + (1, 1, 1, UNK_SYM, types.fp32), + ] + + run_compare_builder( + build, + input_placeholders, + expected_output_types=expected_output_types, + frontend_only=True, + ) + + @pytest.mark.xfail(reason="rdar://99664032") + @pytest.mark.parametrize( + "compute_unit, backend", + itertools.product( + compute_units, + backends, + ), + ) + def test_single_element_edge_case(self, compute_unit, backend): + x_val = np.array(list(range(6))).reshape((1, 3, 2)).astype(np.float32) + input_placeholders = { + "x": mb.placeholder(shape=x_val.shape), + } + input_values = {"x": x_val} + + def build(x): + return mb.slice_by_index( + x=x, + begin=[-1, 0, 0], + end=[-2, 0, 0], + stride=[-1, 1, 1], + begin_mask=[False, True, True], + end_mask=[False, True, True], + ) + + expected_output_types = [(1, 3, 2, types.fp32)] + expected_outputs = [np.array([[[0, 1], [2, 3], [4, 5]]], dtype=np.float32)] run_compare_builder( - build_non_single, + build, input_placeholders, input_values, expected_output_types, @@ -893,9 +831,306 @@ def build_single(x, begin): backend=backend, ) - expected_output_types = [(UNK_SYM, 2, UNK_SYM, types.fp32)] + @ssa_fn + def test_builder_eval_scalar_output_corner_cases(self): + x1 = np.array([2.0]) + x2 = np.array([[[[1.0], [3.0]]]]) + v = [ + mb.slice_by_index( + x=x1, + begin=[ + 0, + ], + end=[0], + squeeze_mask=[True], + ), + mb.slice_by_index( + x=x2, + begin=[0, 0, 0, 0], + end=[0, 0, 0, 0], + squeeze_mask=[True, True, True, True], + ), + ] + assert v[0].val.shape == () + assert v[0].val == 2 + assert v[1].val.shape == () + assert v[1].val == 1 + + @ssa_fn + def test_builder_eval(self): + x_val = np.array(list(range(24))).reshape((2, 3, 4)) + v = [ + mb.slice_by_index(x=x_val, begin=[1, 1, 1], end=[2, 2, 2]), # x_val[1:2, 1:2, 1:2] + mb.slice_by_index( + x=x_val, begin=[1, 1, 1], end=[2, 3, 4], stride=[1, 
1, 2] + ), # x_val[1:2, 1:3, 1:4:2] + mb.slice_by_index( + x=x_val, begin=[-3, -3, -3], end=[-1, -1, -1] + ), # x_val[-3:-1, -3:-1, -3:-1] + mb.slice_by_index( + x=x_val, begin=[0, 0, -3], end=[-1, -2, -2] + ), # x_val[0:-1, 0:-2, -3:-2] + mb.slice_by_index( + x=x_val, begin=[-1, -1, -1], end=[0, 1, -3], stride=[-2, -1, -3] + ), # x_val[-1:0:-2, -1:1:-1, -1:-3:-3] + mb.slice_by_index( + x=x_val, + begin=[1, 1, 1], + end=[2, 3, 4], + stride=[1, 1, 2], + begin_mask=[True, False, True], + ), # x_val[:2, 1:3, :4:2] + mb.slice_by_index( + x=x_val, + begin=[1, 1, 1], + end=[2, 3, 4], + stride=[1, 1, 2], + begin_mask=[True, False, True], + end_mask=[True, True, False], + ), # x_val[:, 1:, :4:2] + mb.slice_by_index( + x=x_val, + begin=[1, 1, 1], + end=[2, 3, 4], + stride=[1, 1, 2], + begin_mask=[False, False, True], + end_mask=[True, False, False], + squeeze_mask=[False, True, False], + ), # x_val[1::1, 1, :3:2] + mb.slice_by_index( + x=x_val, + begin=[0, 0, 0], + end=[0, 0, 0], + stride=[1, 1, 1], + begin_mask=[True, True, True], + end_mask=[True, True, True], + ), # x_val[:, :, :] + mb.slice_by_index( + x=x_val, + begin=[1, 1, 1], + end=[2, 2, 0], + stride=[1, 1, 1], + squeeze_mask=[False, False, True], + ), # x_val[1:2, 1:2, 1] + mb.slice_by_index( + x=x_val, + begin=[1, 0, 0], + end=[2, 0, 0], + stride=[1, 1, 1], + begin_mask=[False, True, True], + end_mask=[False, True, True], + ), # x_val[1:2, ...] + mb.slice_by_index( + x=x_val, + begin=[0, 0, 0], + end=[0, 0, 0], + stride=[1, 1, 1], + begin_mask=[True, True, True], + end_mask=[True, True, True], + ), # x_val[...] + mb.slice_by_index( + x=x_val, + begin=[1, 0, 1], + end=[2, 0, 2], + stride=[1, 1, 1], + begin_mask=[False, True, False], + end_mask=[False, True, False], + ), # x_val[1:2, ..., 1:2] + mb.slice_by_index( + x=x_val, + begin=[0, 0, 1], + end=[0, 0, 0], + stride=[1, 1, 1], + begin_mask=[True, True, False], + end_mask=[True, True, False], + squeeze_mask=[False, False, True], + ), # x_val[..., 1] + mb.slice_by_index( + x=x_val, + begin=[0, 0, 0], + end=[0, 0, 0], + stride=[1, 1, 1], + begin_mask=[False, False, True], + end_mask=[False, False, True], + squeeze_mask=[True, True, False], + ), # x_val[0, 0, :] + mb.slice_by_index( + x=x_val, + begin=[1, 0, 0], + end=[2, 0, 0], + stride=[1, 1, 1], + begin_mask=[False, True, True], + end_mask=[False, True, True], + ), # x_val[1:2] + mb.slice_by_index( + x=x_val, + begin=[1, 1, 0], + end=[2, 2, 0], + stride=[1, 1, 1], + begin_mask=[False, False, True], + end_mask=[False, False, True], + ), # x_val[1:2, 1:2] + mb.slice_by_index( + x=x_val, + begin=[1, 0, 0], + end=[0, 0, 0], + stride=[1, 1, 1], + begin_mask=[False, True, True], + end_mask=[False, True, True], + squeeze_mask=[True, False, False], + ), # x_val[1] + mb.slice_by_index( + x=x_val, + begin=[0, 0, 0], + end=[0, 0, 0], + begin_mask=[True, True, True], + end_mask=[True, True, True], + ), # x_val[:] + mb.slice_by_index( + x=x_val, + begin=[0, 0, 0], + end=[0, 0, 0], + stride=[1, 1, -1], + begin_mask=[True, True, True], + end_mask=[True, True, True], + ), # x_val[..., ::-1] + ] + ans = [ + x_val[1:2, 1:2, 1:2], + x_val[1:2, 1:3, 1:4:2], + x_val[-3:-1, -3:-1, -3:-1], + x_val[0:-1, 0:-2, -3:-2], + x_val[-1:0:-2, -1:1:-1, -1:-3:-3], + x_val[:2, 1:3, :4:2], + x_val[:, 1:, :4:2], + x_val[1::1, 1, :3:2], + x_val[:, :, :], + x_val[1:2, 1:2, 1], + x_val[1:2, ...], + x_val[...], + x_val[1:2, ..., 1:2], + x_val[..., 1], + x_val[0, 0, :], + x_val[1:2], + x_val[1:2, 1:2], + x_val[1], + x_val[:], + x_val[..., ::-1], + ] + for idx in range(len(v)): + 
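# All inputs above are compile-time constants, so each slice_by_index is
+            # evaluated during value inference: v[idx].val should hold the same
+            # NumPy array (shape included) as Python basic indexing on x_val.
+            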
assert ans[idx].shape == v[idx].shape + np.testing.assert_allclose(ans[idx], v[idx].val, atol=1e-04, rtol=1e-05) + + @staticmethod + @pytest.mark.skipif(ct.utils._macos_version() < (14, 0), + reason="Bug fixed in macOS 14") + def test_slice_by_index(): + INPUT_SHAPE = (1, 2, 8, 16) + + @mb.program(input_specs=[mb.TensorSpec(shape=INPUT_SHAPE)]) + def prog(x): + x = mb.slice_by_index( + x=x, + begin=[0, 0, 0, 0], + end=[1, 2, 8, 12], + stride=[1, 1, 2, 2], + begin_mask=None, + end_mask=None, + squeeze_mask=None, + ) + return x + + x = np.random.rand(*INPUT_SHAPE) + + # slice by index is x[begin[0]: end[0]: stride[0], begin[1]: end[1]: stride[1], ...] + y_numpy = x[0:1:1, 0:2:1, 0:8:2, 0:12:2] + + model = ct.convert(prog, source="milinternal", convert_to="neuralnetwork") + y_neuralnetwork = list(model.predict({"x": x}).values())[0] + np.testing.assert_allclose(y_numpy, y_neuralnetwork) + + model = ct.convert( + prog, + source="milinternal", + convert_to="mlprogram", + compute_units=ct.ComputeUnit.CPU_ONLY, + ) + + y_mlprogram = list(model.predict({"x": x}).values())[0] + assert y_numpy.shape == y_mlprogram.shape + np.testing.assert_allclose(y_numpy, y_mlprogram) + + @staticmethod + @pytest.mark.skipif(ct.utils._macos_version() < (14, 0), + reason="Bug fixed in macOS 14") + def test_slice_by_index_slice_squeeze_separate(): + INPUT_SHAPE = (1, 2, 8, 16) + + @mb.program(input_specs=[mb.TensorSpec(shape=INPUT_SHAPE)]) + def prog(x): + x = mb.slice_by_index( + x=x, + begin=[0, 0, 0, 0], + end=[1, 2, 8, 12], + stride=[1, 1, 1, 2], + begin_mask=None, + end_mask=None, + squeeze_mask=[True, False, False, False], + ) + return x + + x = np.random.rand(*INPUT_SHAPE) + + # slice by index is x[begin[0]: end[0]: stride[0], begin[1]: end[1]: stride[1], ...] + # and squeeze dim 0 + y_numpy = x[0:1:1, 0:2:1, 0:8:1, 0:12:2] + y_numpy = np.squeeze(y_numpy, axis=0) + + model = ct.convert(prog, source="milinternal", convert_to="neuralnetwork") + y_neuralnetwork = list(model.predict({"x": x}).values())[0] + + assert y_numpy.shape == y_neuralnetwork.shape + np.testing.assert_allclose(y_numpy, y_neuralnetwork) + + model = ct.convert(prog, source="milinternal", convert_to="mlprogram") + y_mlprogram = list(model.predict({"x": x}).values())[0] + # TODO: rdar://103365766 MLProgram does not apply squeeze_mask. 
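+        # Until that radar is fixed, y_mlprogram presumably keeps the
+        # un-squeezed rank, so only the neuralnetwork path is verified above
+        # and the assertion below stays disabled: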
+ # np.testing.assert_allclose(y_numpy, y_mlprogram) + + +class TestSliceBySize: + @pytest.mark.parametrize( + "compute_unit, backend, size_val, x_dtype, idx_dtype", + itertools.product( + compute_units, + backends, + ([1, 2, 3], [-1, 2, -1]), + (np.float16, np.float32, np.int32), + (np.int32,), + ), + ) + def test_builder_to_backend_smoke(self, compute_unit, backend, size_val, x_dtype, idx_dtype): + def build(x, begin): + return mb.slice_by_size(x=x, begin=begin, size=np.array(size_val, dtype=idx_dtype)) + + x_builtin_dtype = types.numpy_type_to_builtin_type(x_dtype) + idx_builtin_dtype = types.numpy_type_to_builtin_type(idx_dtype) + + x_val = np.array(list(range(24))).reshape((2, 3, 4)).astype(x_dtype) + begin_val = np.array([1, 1, 1], dtype=idx_dtype) + input_placeholders = { + "x": mb.placeholder(shape=x_val.shape, dtype=x_builtin_dtype), + "begin": mb.placeholder(shape=begin_val.shape, dtype=idx_builtin_dtype), + } + input_values = {"x": x_val, "begin": begin_val} + + expected_outputs = np.array([[[17, 18, 19], [21, 22, 23]]], dtype=x_dtype) + expected_output_types = tuple([dim if dim != -1 else UNK_SYM for dim in size_val]) + ( + x_builtin_dtype, + ) + run_compare_builder( - build_single, + build, input_placeholders, input_values, expected_output_types, @@ -917,7 +1152,11 @@ def test_builder_eval(self): class TestSpaceToDepth: @pytest.mark.parametrize( - "compute_unit, backend", itertools.product(compute_units, backends,) + "compute_unit, backend", + itertools.product( + compute_units, + backends, + ), ) def test_builder_to_backend_smoke(self, compute_unit, backend): # original input type is (1, 1, 2, 2, fp32) @@ -929,9 +1168,7 @@ def build(x): return [mb.space_to_depth(x=x, block_size=2)] expected_output_types = (1, 4, 1, 1, types.fp32) - expected_outputs = np.array( - [[[[7.0]], [[9.0]], [[4.0]], [[6.0]]]], dtype=np.float32 - ) + expected_outputs = np.array([[[[7.0]], [[9.0]], [[4.0]], [[6.0]]]], dtype=np.float32) run_compare_builder( build, @@ -946,7 +1183,11 @@ def build(x): class TestSqueeze: @pytest.mark.parametrize( - "compute_unit, backend", itertools.product(compute_units, backends,) + "compute_unit, backend", + itertools.product( + compute_units, + backends, + ), ) def test_builder_to_backend_smoke(self, compute_unit, backend): x = np.array([[[[1], [2], [3]]]], dtype=np.float32) @@ -1004,7 +1245,11 @@ def test_builder_eval_rank_0(self): class TestTranspose: @pytest.mark.parametrize( "compute_unit, backend, is_symbolic", - itertools.product(compute_units, backends, [True, False],), + itertools.product( + compute_units, + backends, + [True, False], + ), ) def test_builder_to_backend_smoke(self, compute_unit, backend, is_symbolic): x = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32) @@ -1043,7 +1288,7 @@ def build(x): expected_output_types, expected_outputs, inputs=construct_inputs_from_placeholders(input_placeholders, 10) - if backend[0] == "mlprogram" + if backend.backend == "mlprogram" else None, compute_unit=compute_unit, backend=backend, @@ -1056,7 +1301,11 @@ def test_builder_eval(self): np.testing.assert_allclose(x.T, v.val, atol=1e-04, rtol=1e-05) @pytest.mark.parametrize( - "compute_unit, backend", itertools.product(compute_units, backends,) + "compute_unit, backend", + itertools.product( + compute_units, + backends, + ), ) def test_builder_to_backend_symbolic(self, compute_unit, backend): s0 = get_new_symbol() @@ -1088,7 +1337,7 @@ def build(x): expected_output_types, expected_outputs, inputs=construct_inputs_from_placeholders(input_placeholders, 10) - if 
backend[0] == "mlprogram" + if backend.backend == "mlprogram" else None, compute_unit=compute_unit, backend=backend, @@ -1097,7 +1346,11 @@ def build(x): class TestPixelShuffle: @pytest.mark.parametrize( - "compute_unit, backend", itertools.product(compute_units, backends,) + "compute_unit, backend", + itertools.product( + compute_units, + backends, + ), ) def test_builder_to_backend_smoke(self, compute_unit, backend): # original input type is (1, 4, 1, 1, fp32) @@ -1131,9 +1384,7 @@ def build(x): [2, 4], ), ) - def test_builder_to_backend_stress( - self, compute_unit, backend, shape, upscale_factor - ): + def test_builder_to_backend_stress(self, compute_unit, backend, shape, upscale_factor): val = np.random.rand(*shape) input_placeholders = {"x": mb.placeholder(shape=val.shape)} input_values = {"x": val} @@ -1155,78 +1406,14 @@ def build(x): ) -@pytest.mark.skipif(ct.utils._macos_version() < (13, 0), reason="New functionality in macOS13/iOS16") -class TestPixelUnshuffle: - @pytest.mark.parametrize( - "compute_unit, backend", itertools.product(compute_units, backends,) - ) - def test_builder_to_backend_smoke(self, compute_unit, backend): - if backend[0] == "neuralnetwork": - pytest.skip("nn backend not supported") - - val = np.array([[[[9.0, 5.0], [1.0, 3.0]]]], dtype=np.float32) - input_placeholders = {"x": mb.placeholder(shape=val.shape)} - input_values = {"x": val} - - def build(x): - return [mb.pixel_unshuffle(x=x, downscale_factor=np.uint32(2))] - - expected_output_types = (1, 4, 1, 1, types.fp32) - expected_outputs = np.array([[[[9.0]], [[5.0]], [[1.0]], [[3.0]]]], dtype=np.float32) - - run_compare_builder( - build, - input_placeholders, - input_values, - expected_output_types, - expected_outputs, - compute_unit=compute_unit, - backend=backend, - minimum_deployment_target=ct.target.iOS16, - ) - - @pytest.mark.skipif(not testing_reqs._HAS_TORCH, reason=MSG_TORCH_NOT_FOUND) +class TestSlidingWindows: @pytest.mark.parametrize( - "compute_unit, backend, shape, downscale_factor", + "compute_unit, backend", itertools.product( compute_units, backends, - [(1, 2, 4, 4), (2, 1, 8, 4)], - [2, 4], ), ) - def test_builder_to_backend_stress( - self, compute_unit, backend, shape, downscale_factor, - ): - if backend[0] == "neuralnetwork": - pytest.skip("nn backend not supported") - - val = np.random.rand(*shape) - input_placeholders = {"x": mb.placeholder(shape=val.shape)} - input_values = {"x": val} - - def build(x): - return [mb.pixel_unshuffle(x=x, downscale_factor=np.uint32(downscale_factor))] - - torch_pixel_unshuffle = torch.nn.PixelUnshuffle(downscale_factor) - expected_outputs = [torch_pixel_unshuffle(torch.Tensor(val)).numpy()] - expected_output_types = [o.shape[:] + (types.fp32,) for o in expected_outputs] - run_compare_builder( - build, - input_placeholders, - input_values, - expected_output_types, - expected_outputs, - compute_unit=compute_unit, - backend=backend, - minimum_deployment_target=ct.target.iOS16, - ) - - -class TestSlidingWindows: - @pytest.mark.parametrize( - "compute_unit, backend", itertools.product(compute_units, backends,) - ) def test_builder_to_backend_smoke(self, compute_unit, backend): # original input type is (1, 4, 1, 1, fp32) val = np.array([[[[9.0]], [[5.0]], [[1.0]], [[3.0]]]], dtype=np.float32) @@ -1262,9 +1449,7 @@ def build(x): [1, 2], ), ) - def test_builder_to_backend_stress( - self, compute_unit, backend, rank_and_axis, size, stride - ): + def test_builder_to_backend_stress(self, compute_unit, backend, rank_and_axis, size, stride): def 
np_sliding_windows(a, np_axis, np_size, np_stride): n = (a.shape[np_axis] - np_size) // np_stride + 1 x_shape = list(a.shape) @@ -1286,9 +1471,7 @@ def np_sliding_windows(a, np_axis, np_size, np_stride): def build(x): return [mb.sliding_windows(x=x, axis=axis, size=size, stride=stride)] - expected_outputs = [ - np_sliding_windows(val, np_axis=axis, np_size=size, np_stride=stride) - ] + expected_outputs = [np_sliding_windows(val, np_axis=axis, np_size=size, np_stride=stride)] expected_output_types = [o.shape[:] + (types.fp32,) for o in expected_outputs] run_compare_builder( build, @@ -1303,7 +1486,11 @@ def build(x): class TestConcat: @pytest.mark.parametrize( - "compute_unit, backend", itertools.product(compute_units, backends, ) + "compute_unit, backend", + itertools.product( + compute_units, + backends, + ), ) def test_builder_to_backend_smoke(self, compute_unit, backend): t1 = np.array([[1, 2], [4, 5]], dtype=np.float32) @@ -1343,11 +1530,11 @@ def build(x, y): [1, 2, 3, 4, 5], [2, 3], [False, True], - ) + ), ) - def test_builder_to_backend_stress_interleave(self, compute_unit, backend, - rank, n_inputs, negative_index): - + def test_builder_to_backend_stress_interleave( + self, compute_unit, backend, rank, n_inputs, negative_index + ): def np_concat_interleave(arrays, axis): step = len(arrays) in_shape = arrays[0].shape @@ -1453,7 +1640,11 @@ def test_builder_eval_failure(self): class TestSplit: @pytest.mark.parametrize( - "compute_unit, backend", itertools.product(compute_units, backends, ) + "compute_unit, backend", + itertools.product( + compute_units, + backends, + ), ) def test_builder_to_backend_smoke(self, compute_unit, backend): t = np.array([[1, 2], [3, 4], [5, 6]], dtype=np.float32) @@ -1464,9 +1655,7 @@ def test_builder_to_backend_smoke(self, compute_unit, backend): input_values = {"x": t} def build(x): - return mb.split(x=x, num_splits=2, axis=1) + mb.split( - x=x, split_sizes=[1, 2], axis=0 - ) + return mb.split(x=x, num_splits=2, axis=1) + mb.split(x=x, split_sizes=[1, 2], axis=0) expected_output_types = [ (3, 1, types.fp32), @@ -1502,7 +1691,11 @@ def test_builder_eval(self): class TestStack: @pytest.mark.parametrize( - "compute_unit, backend", itertools.product(compute_units, backends, ) + "compute_unit, backend", + itertools.product( + compute_units, + backends, + ), ) def test_builder_to_backend_smoke(self, compute_unit, backend): t1 = np.array([1, 2, 3], dtype=np.float32) @@ -1515,7 +1708,11 @@ def test_builder_to_backend_smoke(self, compute_unit, backend): input_values = {"x": t1, "y": t2} def build(x, y): - return [mb.stack(values=(x, y), axis=0), mb.stack(values=(x, y), axis=1), mb.stack(values=(x, y), axis=-1)] + return [ + mb.stack(values=(x, y), axis=0), + mb.stack(values=(x, y), axis=1), + mb.stack(values=(x, y), axis=-1), + ] expected_output_types = [ (2, 3, types.fp32), diff --git a/coremltools/converters/mil/mil/ops/tests/iOS15/__init__.py b/coremltools/converters/mil/mil/ops/tests/iOS15/__init__.py new file mode 100644 index 000000000..9991ff735 --- /dev/null +++ b/coremltools/converters/mil/mil/ops/tests/iOS15/__init__.py @@ -0,0 +1,9 @@ +# Copyright (c) 2023, Apple Inc. All rights reserved. 
+# +# Use of this source code is governed by a BSD-3-clause license that can be +# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause + +import coremltools as ct +from coremltools.converters.mil.testing_reqs import backends_internal, clean_up_backends + +backends = clean_up_backends(backends_internal, ct.target.iOS15) diff --git a/coremltools/converters/mil/mil/ops/tests/iOS15/test_image_resizing.py b/coremltools/converters/mil/mil/ops/tests/iOS15/test_image_resizing.py new file mode 100644 index 000000000..3d37ded06 --- /dev/null +++ b/coremltools/converters/mil/mil/ops/tests/iOS15/test_image_resizing.py @@ -0,0 +1,358 @@ +# Copyright (c) 2023, Apple Inc. All rights reserved. +# +# Use of this source code is governed by a BSD-3-clause license that can be +# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause + +import itertools + +import numpy as np +import pytest + +import coremltools as ct +from coremltools.converters.mil.mil import Builder as mb +from coremltools.converters.mil.mil import get_new_symbol, types +from coremltools.converters.mil.mil.ops.tests.iOS15 import backends +from coremltools.converters.mil.mil.ops.tests.testing_utils import ( + mark_api_breaking, + run_compare_builder, +) +from coremltools.converters.mil.testing_reqs import compute_units + + +class TestAffine: + @pytest.mark.parametrize("compute_unit, backend", itertools.product(compute_units, backends)) + def test_builder_to_backend_smoke(self, compute_unit, backend): + x_val = np.array([11.0, 22.0, 33.0, 44.0], dtype=np.float32).reshape([1, 1, 2, 2]) + transform_matrix_val = np.array( + [-1.0, -2.0, -3.7, -1.0, 3.5, 1.2], dtype=np.float32 + ).reshape([1, 6]) + + input_placeholder_dict = { + "x": mb.placeholder(shape=x_val.shape), + "transform_matrix": mb.placeholder(shape=transform_matrix_val.shape), + } + input_value_dict = {"x": x_val, "transform_matrix": transform_matrix_val} + + def build(x, transform_matrix): + return [ + mb.affine( + x=x, + transform_matrix=transform_matrix, + output_height=3, + output_width=3, + sampling_mode="bilinear", + padding_mode="constant", + padding_value=0.0, + coordinates_mode="normalized_minus_one_to_one", + align_corners=True, + ), + mb.affine( + x=x, + transform_matrix=transform_matrix, + output_height=2, + output_width=5, + sampling_mode="bilinear", + padding_mode="constant", + padding_value=0.0, + coordinates_mode="normalized_minus_one_to_one", + align_corners=True, + ), + ] + + expected_output_types = [ + (1, 1, 3, 3, types.fp32), + (1, 1, 2, 5, types.fp32), + ] + expected_outputs = [ + np.array( + [10.752501, 2.5025, 0.0, 1.9799997, 0.0, 0.0, 0.0, 0.0, 0.0], + dtype=np.float32, + ).reshape([1, 1, 3, 3]), + np.array( + [10.752501, 5.94, 2.5025, 0.44000006, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], + dtype=np.float32, + ).reshape([1, 1, 2, 5]), + ] + + run_compare_builder( + build, + input_placeholder_dict, + input_value_dict, + expected_output_types, + expected_outputs, + compute_unit=compute_unit, + backend=backend, + ) + + +class TestUpsampleNearestNeighborFractionalScales: + @pytest.mark.parametrize("compute_unit, backend", itertools.product(compute_units, backends)) + def test_builder_to_backend_smoke(self, compute_unit, backend): + if compute_unit != ct.ComputeUnit.CPU_ONLY: + pytest.xfail( + "rdar://97398448 (TestUpsampleNearestNeighborFractionalScales failing on GPU)" + ) + + x_val = np.array([1.5, -2.5, 3.5], dtype=np.float32).reshape([1, 1, 1, 3]) + input_placeholder_dict = {"x": mb.placeholder(shape=x_val.shape)} + 
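# The expected output types below suggest the output spatial size follows
+        # floor(input_dim * scale_factor): height 1 * 3.17 -> 3 and width
+        # 3 * 0.67 -> 2 (inferred from these cases rather than a stated spec).
+        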
input_value_dict = {"x": x_val} + + def build(x): + return [ + mb.upsample_nearest_neighbor( + x=x, + scale_factor_height=1.0, + scale_factor_width=1.0, + ), + mb.upsample_nearest_neighbor( + x=x, scale_factor_height=3.17, scale_factor_width=0.67 + ), + mb.upsample_nearest_neighbor( + x=x, + scale_factor_height=2.0, + scale_factor_width=1.12, + ), + ] + + expected_output_types = [ + (1, 1, 1, 3, types.fp32), + (1, 1, 3, 2, types.fp32), + (1, 1, 2, 3, types.fp32), + ] + expected_outputs = [ + x_val, + np.array([1.5, -2.5, 1.5, -2.5, 1.5, -2.5], dtype=np.float32).reshape([1, 1, 3, 2]), + np.array([1.5, -2.5, 3.5, 1.5, -2.5, 3.5], dtype=np.float32).reshape([1, 1, 2, 3]), + ] + + run_compare_builder( + build, + input_placeholder_dict, + input_value_dict, + expected_output_types, + expected_outputs, + compute_unit=compute_unit, + backend=backend, + ) + + +class TestResample: + @staticmethod + def _test_builder_to_backend_smoke(compute_unit, backend, coordinates_dtype, expected_cast_ops): + x_ = np.array([11.0, 22.0, 33.0, 44.0], dtype=np.float32).reshape([1, 1, 2, 2]) + coordinates_ = ( + np.array([-1.0, -2.0, -3.7, -1.0, 0.0, 0.0, 3.5, 1.2], dtype=np.float32) + .reshape([1, 2, 2, 2]) + .astype(coordinates_dtype) + ) + if np.issubdtype(coordinates_dtype, np.integer): + coordinates_ = ( + np.array([0, 0, 1, 1, 0, 0, 1, 1]).reshape([1, 2, 2, 2]).astype(coordinates_dtype) + ) + expected_output_type = (1, 1, 2, 2, types.fp32) + + def build_0(x, coordinates): + return mb.resample( + x=x, + coordinates=coordinates, + sampling_mode="bilinear", + padding_mode="constant", + padding_value=6.17, + coordinates_mode="normalized_minus_one_to_one", + align_corners=True, + ) + + expected_output_0 = np.array([8.585, 6.17, 27.5, 6.17], dtype=np.float32) + if np.issubdtype(coordinates_dtype, np.integer): + expected_output_0 = np.array([27.5, 44.0, 27.5, 44.0], dtype=np.float32) + expected_output_0 = expected_output_0.reshape(expected_output_type[:-1]) + + def build_1(x, coordinates): + return mb.resample( + x=x, + coordinates=coordinates, + sampling_mode="nearest", + padding_mode="border", + padding_value=-1.0, + coordinates_mode="unnormalized", + align_corners=False, + ) + + expected_output_1 = np.array([11.0, 11.0, 11.0, 44.0], dtype=np.float32) + if np.issubdtype(coordinates_dtype, np.integer): + expected_output_1 = np.array([11.0, 44.0, 11.0, 44.0], dtype=np.float32) + expected_output_1 = expected_output_1.reshape(expected_output_type[:-1]) + + def build_2(x, coordinates): + return mb.resample( + x=x, + coordinates=coordinates, + sampling_mode="bilinear", + padding_mode="reflection", + padding_value=-1.0, + coordinates_mode="normalized_zero_to_one", + align_corners=True, + ) + + expected_output_2 = np.array([22.0, 36.3, 11.0, 34.1], dtype=np.float32) + if np.issubdtype(coordinates_dtype, np.integer): + expected_output_2 = np.array([11.0, 44.0, 11.0, 44.0], dtype=np.float32) + expected_output_2 = expected_output_2.reshape(expected_output_type[:-1]) + + def build_3(x, coordinates): + return mb.resample( + x=x, + coordinates=coordinates, + sampling_mode="nearest", + padding_mode="symmetric", + padding_value=-1.0, + coordinates_mode="normalized_zero_to_one", + align_corners=False, + ) + + expected_output_3 = np.array([22.0, 33.0, 11.0, 33.0], dtype=np.float32) + if np.issubdtype(coordinates_dtype, np.integer): + expected_output_3 = np.array([11.0, 44.0, 11.0, 44.0], dtype=np.float32) + expected_output_3 = expected_output_3.reshape(expected_output_type[:-1]) + + for build, expected_output in zip( + [build_0, 
build_1, build_2, build_3],
+            [
+                expected_output_0,
+                expected_output_1,
+                expected_output_2,
+                expected_output_3,
+            ],
+        ):
+            # Need to create placeholders inside the for loop to avoid them interfering with each other.
+            input_placeholder_dict = {
+                "x": mb.placeholder(shape=x_.shape),
+                "coordinates": mb.placeholder(
+                    shape=coordinates_.shape,
+                    dtype=types.numpy_type_to_builtin_type(coordinates_dtype),
+                ),
+            }
+            input_value_dict = {"x": x_, "coordinates": coordinates_}
+
+            mlmodel = run_compare_builder(
+                build,
+                input_placeholder_dict,
+                input_value_dict,
+                expected_output_type,
+                expected_output,
+                compute_unit=compute_unit,
+                backend=backend,
+            )
+            prog = mlmodel._mil_program
+            number_of_cast = len(prog["main"].find_ops(op_type="cast"))
+            assert number_of_cast == expected_cast_ops
+
+    @mark_api_breaking(breaking_opset_version=ct.target.iOS16)
+    @pytest.mark.parametrize(
+        "compute_unit, backend, coordinates_dtype",
+        itertools.product(
+            compute_units,
+            backends,
+            (np.int32, np.float32),
+        ),
+    )
+    def test_builder_to_backend_smoke(self, compute_unit, backend, coordinates_dtype):
+        self._test_builder_to_backend_smoke(compute_unit, backend, coordinates_dtype, 2)
+
+
+class TestResizeBilinear:
+    @pytest.mark.parametrize(
+        "compute_unit, backend",
+        itertools.product(
+            compute_units,
+            backends,
+        ),
+    )
+    def test_builder_to_backend_smoke(self, compute_unit, backend):
+        x = np.array([0, 1], dtype=np.float32).reshape(1, 1, 2)
+        input_placeholder_dict = {"x": mb.placeholder(shape=x.shape)}
+        input_value_dict = {"x": x}
+
+        def build_mode_4(x):
+            return mb.resize_bilinear(
+                x=x,
+                target_size_height=1,
+                target_size_width=5,
+                sampling_mode="UNALIGN_CORNERS",
+            )
+
+        expected_output_type = (1, 1, 5, types.fp32)
+        expected_output = np.array([0.0, 0.1, 0.5, 0.9, 1.0], dtype=np.float32).reshape(1, 1, 5)
+
+        run_compare_builder(
+            build_mode_4,
+            input_placeholder_dict,
+            input_value_dict,
+            expected_output_type,
+            expected_output,
+            compute_unit=compute_unit,
+            backend=backend,
+        )
+
+
+class TestCropResize:
+    @mark_api_breaking(breaking_opset_version=ct.target.iOS17)
+    @pytest.mark.parametrize(
+        "compute_unit, backend, is_symbolic",
+        itertools.product(compute_units, backends, [True, False]),
+    )
+    def test_builder_to_backend_smoke(self, compute_unit, backend, is_symbolic):
+        if compute_unit != ct.ComputeUnit.CPU_ONLY:
+            pytest.xfail("rdar://97398582 (TestCropResize failing on mlprogram + GPU)")
+        x = np.array(
+            [[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12], [13, 14, 15, 16]],
+            dtype=np.float32,
+        ).reshape(1, 1, 4, 4)
+
+        input_shape = list(x.shape)
+        placeholder_input_shape = input_shape
+        if is_symbolic:
+            # set batch and channel dimension symbolic
+            placeholder_input_shape[0] = get_new_symbol()
+            placeholder_input_shape[1] = get_new_symbol()
+
+        input_placeholder_dict = {"x": mb.placeholder(shape=placeholder_input_shape)}
+        input_value_dict = {"x": x}
+        N = 1
+        roi = np.array([[1, 1, 2, 2]], dtype=np.float32).reshape(1, 1, 4, 1, 1)
+        roi_normalized = np.array([[0, 0.0, 0.0, 1.0 / 3, 1.0 / 3]], dtype=np.float32).reshape(
+            1, 1, 5, 1, 1
+        )
+        roi_invert = np.array([[2, 2, 1, 1]], dtype=np.float32).reshape(1, 1, 4, 1, 1)
+
+        def build(x):
+            return mb.crop_resize(
+                x=x,
+                roi=roi_invert,
+                target_width=2,
+                target_height=2,
+                normalized_coordinates=True,
+                box_coordinate_mode="CORNERS_HEIGHT_FIRST",
+                sampling_mode="UNALIGN_CORNERS",
+            )
+
+        expected_output_type = (
+            N,
+            placeholder_input_shape[0],
+            placeholder_input_shape[1],
+            2,
+            2,
+            types.fp32,
+        )
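+
+        # crop_resize returns a rank-5 tensor laid out as
+        # (num_roi, batch, channel, target_height, target_width); the values
+        # below are the bilinear samples of the 4x4 ramp for roi_invert, while
+        # roi and roi_normalized above show other box formats this smoke test
+        # does not exercise.
+        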
expected_output = np.array([3.5, 5.5, 11.5, 13.5], dtype=np.float32).reshape(1, 1, 1, 2, 2) + + run_compare_builder( + build, + input_placeholder_dict, + input_value_dict, + expected_output_type, + expected_output, + compute_unit=compute_unit, + backend=backend, + ) diff --git a/coremltools/converters/mil/mil/ops/tests/iOS15/test_tensor_transformation.py b/coremltools/converters/mil/mil/ops/tests/iOS15/test_tensor_transformation.py new file mode 100644 index 000000000..1271bc3b2 --- /dev/null +++ b/coremltools/converters/mil/mil/ops/tests/iOS15/test_tensor_transformation.py @@ -0,0 +1,101 @@ +# Copyright (c) 2023, Apple Inc. All rights reserved. +# +# Use of this source code is governed by a BSD-3-clause license that can be +# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause + +import itertools + +import numpy as np +import pytest + +import coremltools as ct +from coremltools.converters.mil.mil import Builder as mb +from coremltools.converters.mil.mil import types +from coremltools.converters.mil.mil.ops.tests.iOS15 import backends +from coremltools.converters.mil.mil.ops.tests.testing_utils import ( + mark_api_breaking, + run_compare_builder, +) +from coremltools.converters.mil.testing_reqs import compute_units + + +class TestReshape: + @pytest.mark.parametrize( + "compute_unit, backend", + itertools.product( + compute_units, + backends, + ), + ) + def test_reshape_with_zero(self, compute_unit, backend): + t = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32) + input_placeholders = {"x": mb.placeholder(shape=t.shape)} + input_values = {"x": t} + + def build(x): + return [ + mb.reshape(x=x, shape=[0, -1]), + mb.reshape(x=x, shape=[0, 3]), + mb.reshape(x=x, shape=[-1, 0]), + ] + + expected_output_types = [ + (2, 3, types.fp32), + (2, 3, types.fp32), + (2, 3, types.fp32), + ] + expected_outputs = [ + np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32), + np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32), + np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32), + ] + + run_compare_builder( + build, + input_placeholders, + input_values, + expected_output_types, + expected_outputs, + compute_unit=compute_unit, + backend=backend, + ) + + @mark_api_breaking(breaking_opset_version=ct.target.iOS17) + @pytest.mark.parametrize( + "compute_unit, backend", + itertools.product( + compute_units, + backends, + ), + ) + def test_reshape_with_zero_different_len(self, compute_unit, backend): + t = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32) + input_placeholders = {"x": mb.placeholder(shape=t.shape)} + input_values = {"x": t} + + def build(x): + return [ + mb.reshape(x=x, shape=[1, 0, -1, 0]), + ] + + expected_output_types = [ + (1, 1, 2, 3, types.fp32), + ] + expected_outputs = [ + np.array([[[[1, 2, 3], [4, 5, 6]]]], dtype=np.float32), + ] + + with pytest.raises( + ValueError, + match="When there is 0 in shape, the rank of x .* must " + "equal to the target shape len", + ): + run_compare_builder( + build, + input_placeholders, + input_values, + expected_output_types, + expected_outputs, + compute_unit=compute_unit, + backend=backend, + ) diff --git a/coremltools/converters/mil/mil/ops/tests/iOS16/__init__.py b/coremltools/converters/mil/mil/ops/tests/iOS16/__init__.py new file mode 100644 index 000000000..a021b33f0 --- /dev/null +++ b/coremltools/converters/mil/mil/ops/tests/iOS16/__init__.py @@ -0,0 +1,9 @@ +# Copyright (c) 2023, Apple Inc. All rights reserved. 
+# +# Use of this source code is governed by a BSD-3-clause license that can be +# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause + +import coremltools as ct +from coremltools.converters.mil.testing_reqs import backends_internal, clean_up_backends + +backends = clean_up_backends(backends_internal, ct.target.iOS16) diff --git a/coremltools/converters/mil/mil/ops/tests/test_constexpr_ops.py b/coremltools/converters/mil/mil/ops/tests/iOS16/test_constexpr_ops.py similarity index 78% rename from coremltools/converters/mil/mil/ops/tests/test_constexpr_ops.py rename to coremltools/converters/mil/mil/ops/tests/iOS16/test_constexpr_ops.py index 4e8aa47b3..452925e49 100644 --- a/coremltools/converters/mil/mil/ops/tests/test_constexpr_ops.py +++ b/coremltools/converters/mil/mil/ops/tests/iOS16/test_constexpr_ops.py @@ -8,34 +8,22 @@ import numpy as np import pytest -import coremltools as ct from coremltools.converters.mil import testing_reqs from coremltools.converters.mil.mil import Builder as mb from coremltools.converters.mil.mil import types from coremltools.converters.mil.mil.ops.defs.iOS16 import constexpr_ops -from coremltools.converters.mil.mil.ops.tests.testing_utils import \ - run_compare_builder -from coremltools.converters.mil.testing_utils import (get_op_types_in_program, - ssa_fn) +from coremltools.converters.mil.mil.ops.tests.iOS16 import backends +from coremltools.converters.mil.mil.ops.tests.testing_utils import run_compare_builder +from coremltools.converters.mil.testing_utils import get_op_types_in_program, ssa_fn -backends = [("mlprogram", "fp32"), ("mlprogram", "fp16")] compute_units = testing_reqs.compute_units - -@pytest.mark.skipif( - ct.utils._macos_version() < (13, 0), - reason="ConstExpr ops available from macOS13 onwards.", -) class TestConstexprAffineDequantize: - @pytest.mark.parametrize( - "compute_unit, backend", itertools.product(compute_units, backends) - ) + @pytest.mark.parametrize("compute_unit, backend", itertools.product(compute_units, backends)) def test_builder_to_backend_smoke(self, compute_unit, backend): t = np.array(range(4)).reshape(1, 1, 2, 2).astype(np.float32) - decompressed_constant = ( - np.array([1, 2, 3, 4]).reshape(1, 1, 2, 2).astype(np.float32) - ) + decompressed_constant = np.array([1, 2, 3, 4]).reshape(1, 1, 2, 2).astype(np.float32) input_placeholders = { "x": mb.placeholder(shape=t.shape), } @@ -65,7 +53,6 @@ def build(x): expected_outputs, compute_unit=compute_unit, backend=backend, - minimum_deployment_target=ct.target.iOS16, ) # validate that the constexpr op is not removed by any graph pass @@ -117,48 +104,44 @@ def affine_dequant_config_generator(): for quant_dtype in [np.int8, np.uint8]: low = 0 if quant_dtype == np.uint8 else -128 high = 255 if quant_dtype == np.uint8 else 127 - for rank in range(1, 6): - shape = np.random.randint(low=2, high=5, size=rank) - quantized_data = np.random.randint( - low=low, high=high, size=shape, dtype=quant_dtype - ) - axis = np.random.choice(range(-rank, rank)) - scalar_zp = np.random.choice([True, False]) - scalar_sc = np.random.choice([True, False]) - zero_point = ( - np.random.randint( - low=low, - high=high, - size=quantized_data.shape[axis], - dtype=quant_dtype, + for zp_dtype in [np.int8, np.uint8, np.float32]: + for rank in range(1, 6): + shape = np.random.randint(low=2, high=5, size=rank) + quantized_data = np.random.randint( + low=low, high=high, size=shape, dtype=quant_dtype ) - if not scalar_zp - else np.random.choice(range(low, high)).astype(quant_dtype) - ) 
- scale = ( - np.random.rand(quantized_data.shape[axis]).astype(np.float32) - if not scalar_sc - else np.float32(np.random.rand()) - ) # fp16 is already covered under backends parameterization + axis = np.random.choice(range(-rank, rank)) + scalar_zp = np.random.choice([True, False]) + scalar_sc = np.random.choice([True, False]) + zero_point = ( + np.random.randint( + low=low, + high=high, + size=quantized_data.shape[axis], + dtype=quant_dtype, + ).astype(zp_dtype) + if not scalar_zp + else np.random.choice(range(low, high)).astype(zp_dtype) + ) + scale = ( + np.random.rand(quantized_data.shape[axis]).astype(np.float32) + if not scalar_sc + else np.float32(np.random.rand()) + ) # fp16 is already covered under backends parameterization - params = { - "quantized_data": quantized_data, - "zp": zero_point, - "sc": scale, - "axis": axis, - } - yield params + params = { + "quantized_data": quantized_data, + "zp": zero_point, + "sc": scale, + "axis": axis, + } + yield params @pytest.mark.parametrize( "compute_unit, backend, config", - itertools.product( - compute_units, - backends, - affine_dequant_config_generator.__func__() - ), + itertools.product(compute_units, backends, affine_dequant_config_generator.__func__()), ) def test_builder_stress(self, compute_unit, backend, config): - quantized_data, zero_point, scale, axis = ( config["quantized_data"], config["zp"], @@ -198,7 +181,6 @@ def build(x): expected_outputs, compute_unit=compute_unit, backend=backend, - minimum_deployment_target=ct.target.iOS16, ) # validate that the constexpr op is not removed by any graph pass @@ -206,15 +188,8 @@ def build(x): if "constexpr_affine_dequantize" not in get_op_types_in_program(prog): raise AssertionError("Invalidated: Test Failed") - -@pytest.mark.skipif( - ct.utils._macos_version() < (13, 0), - reason="ConstExpr ops available from macOS13 onwards.", -) class TestConstexprCast: - @pytest.mark.parametrize( - "compute_unit, backend", itertools.product(compute_units, backends) - ) + @pytest.mark.parametrize("compute_unit, backend", itertools.product(compute_units, backends)) def test_builder_to_backend_smoke(self, compute_unit, backend): t = np.array(range(4)).reshape(4, 1).astype(np.float32) @@ -240,7 +215,6 @@ def build(x): expected_outputs, compute_unit=compute_unit, backend=backend, - minimum_deployment_target=ct.target.iOS16, ) # validate that the constexpr op is not removed by any graph pass @@ -268,11 +242,7 @@ def cast_config_generator(): @pytest.mark.parametrize( "compute_unit, backend, config", - itertools.product( - compute_units, - backends, - cast_config_generator.__func__() - ), + itertools.product(compute_units, backends, cast_config_generator.__func__()), ) def test_builder_stress(self, compute_unit, backend, config): @@ -293,9 +263,7 @@ def build(x): types.string_to_builtin(output_dtype), ) - output_np_type = types.nptype_from_builtin( - types.string_to_builtin(output_dtype) - ) + output_np_type = types.nptype_from_builtin(types.string_to_builtin(output_dtype)) t = np.random.rand(*source_val.shape).astype(output_np_type) decompressed_constant = source_val.astype(output_np_type) expected_outputs = t + decompressed_constant @@ -312,22 +280,14 @@ def build(x): expected_outputs, compute_unit=compute_unit, backend=backend, - minimum_deployment_target=ct.target.iOS16, ) # validate that the constexpr op is not removed by any graph pass prog = mlmodel._mil_program assert "constexpr_cast" in get_op_types_in_program(prog) - -@pytest.mark.skipif( - ct.utils._macos_version() < (13, 0), - 
reason="ConstExpr ops available from macOS13 onwards.", -) class TestConstexprLutToDense: - @pytest.mark.parametrize( - "compute_unit, backend", itertools.product(compute_units, backends) - ) + @pytest.mark.parametrize("compute_unit, backend", itertools.product(compute_units, backends)) def test_builder_to_backend_smoke(self, compute_unit, backend): t = np.array(range(4)).reshape(4, 1).astype(np.float32) @@ -374,7 +334,6 @@ def build(x): expected_outputs, compute_unit=compute_unit, backend=backend, - minimum_deployment_target=ct.target.iOS16, ) # validate that the constexpr op is not removed by any graph pass @@ -392,9 +351,7 @@ def test_builder_eval(self): ] ).astype(np.uint32), ) - np.testing.assert_allclose( - np.float32([3, 3, 1, 1, 1]).astype(np.float32), v.val - ) + np.testing.assert_allclose(np.float32([3, 3, 1, 1, 1]).astype(np.float32), v.val) @staticmethod def lut_config_generator(): @@ -407,9 +364,7 @@ def lut_config_generator(): if lut_dtype == np.uint8: lut = np.random.randint(low=255, size=lut_size, dtype=np.uint8) elif lut_dtype == np.int8: - lut = np.random.randint( - low=-128, high=127, size=lut_size, dtype=np.int8 - ) + lut = np.random.randint(low=-128, high=127, size=lut_size, dtype=np.int8) else: lut = np.random.rand(lut_size).astype(lut_dtype) for output_rank in range(1, 6): @@ -418,16 +373,12 @@ def lut_config_generator(): indices = np.random.randint( low=0, high=2**nbits, size=output_shape, dtype=np.uint8 ) - indices_bitarray = np.unpackbits( - indices, bitorder="little" - ).reshape(-1, 8) - packed_indices = np.packbits( - indices_bitarray[:, :nbits], bitorder="little" - ) + indices_bitarray = np.unpackbits(indices, bitorder="little").reshape(-1, 8) + packed_indices = np.packbits(indices_bitarray[:, :nbits], bitorder="little") - assert packed_indices.size == np.ceil( - nbits * np.prod(output_shape) / 8 - ).astype(np.int32) + assert packed_indices.size == np.ceil(nbits * np.prod(output_shape) / 8).astype( + np.int32 + ) params = { "indices": packed_indices, "shape": output_shape, @@ -437,11 +388,7 @@ def lut_config_generator(): @pytest.mark.parametrize( "compute_unit, backend, config", - itertools.product( - compute_units, - backends, - lut_config_generator.__func__() - ), + itertools.product(compute_units, backends, lut_config_generator.__func__()), ) def test_builder_stress(self, compute_unit, backend, config): @@ -465,9 +412,7 @@ def build(x): ) t = np.random.rand(*shape).astype(lut.dtype) - decompressed_constant = constexpr_ops.constexpr_lut_to_dense.decompress( - lut, indices, shape - ) + decompressed_constant = constexpr_ops.constexpr_lut_to_dense.decompress(lut, indices, shape) expected_outputs = t + decompressed_constant input_placeholders = { @@ -482,7 +427,6 @@ def build(x): expected_outputs, compute_unit=compute_unit, backend=backend, - minimum_deployment_target=ct.target.iOS16, ) # validate that the constexpr op is not removed by any graph pass @@ -490,15 +434,8 @@ def build(x): if "constexpr_lut_to_dense" not in get_op_types_in_program(prog): raise AssertionError("Invalidated: Test Failed") - -@pytest.mark.skipif( - ct.utils._macos_version() < (13, 0), - reason="ConstExpr ops available from macOS13 onwards.", -) class TestConstexprSparseToDense: - @pytest.mark.parametrize( - "compute_unit, backend", itertools.product(compute_units, backends) - ) + @pytest.mark.parametrize("compute_unit, backend", itertools.product(compute_units, backends)) def test_builder_to_backend_smoke(self, compute_unit, backend): t = np.array(range(4)).reshape(4, 
1).astype(np.float32) @@ -512,9 +449,7 @@ def build(x): nonzero_data = np.array([1, 2, 4]).astype(np.float32) mask = np.array([11]).astype(np.uint8) shape = np.array([4, 1]).astype(np.uint32) - y = mb.constexpr_sparse_to_dense( - nonzero_data=nonzero_data, mask=mask, shape=shape - ) + y = mb.constexpr_sparse_to_dense(nonzero_data=nonzero_data, mask=mask, shape=shape) return mb.add(x=x, y=y) expected_output_types = (4, 1, types.fp32) @@ -528,7 +463,6 @@ def build(x): expected_outputs, compute_unit=compute_unit, backend=backend, - minimum_deployment_target=ct.target.iOS16, ) # validate that the constexpr op is not removed by any graph pass @@ -566,22 +500,16 @@ def sparse_config_generator(): mask = np.random.randint(low=255, size=nBytes, dtype=np.uint8) bitarray = np.unpackbits(mask, bitorder="little") - nonzero_size = np.sum( - np.where(np.unpackbits(mask, bitorder="little") != 0, 1, 0) - ) + nonzero_size = np.sum(np.where(np.unpackbits(mask, bitorder="little") != 0, 1, 0)) if nonzero_data_dtype == np.uint8: - nonzero_data = np.random.randint( - low=255, size=nonzero_size, dtype=np.uint8 - ) + nonzero_data = np.random.randint(low=255, size=nonzero_size, dtype=np.uint8) elif nonzero_data_dtype == np.int8: nonzero_data = np.random.randint( low=-128, high=127, size=nonzero_size, dtype=np.int8 ) else: - nonzero_data = np.random.rand(nonzero_size).astype( - nonzero_data_dtype - ) + nonzero_data = np.random.rand(nonzero_size).astype(nonzero_data_dtype) params = { "nonzero_data": nonzero_data, @@ -592,11 +520,7 @@ def sparse_config_generator(): @pytest.mark.parametrize( "compute_unit, backend, config", - itertools.product( - compute_units, - backends, - sparse_config_generator.__func__() - ), + itertools.product(compute_units, backends, sparse_config_generator.__func__()), ) def test_builder_stress(self, compute_unit, backend, config): @@ -637,7 +561,6 @@ def build(x): expected_outputs, compute_unit=compute_unit, backend=backend, - minimum_deployment_target=ct.target.iOS16, ) # validate that the constexpr op is not removed by any graph pass diff --git a/coremltools/converters/mil/mil/ops/tests/iOS16/test_image_resizing.py b/coremltools/converters/mil/mil/ops/tests/iOS16/test_image_resizing.py new file mode 100644 index 000000000..e5b85c268 --- /dev/null +++ b/coremltools/converters/mil/mil/ops/tests/iOS16/test_image_resizing.py @@ -0,0 +1,195 @@ +# Copyright (c) 2023, Apple Inc. All rights reserved. 
+# +# Use of this source code is governed by a BSD-3-clause license that can be +# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause + +import itertools + +import numpy as np +import pytest + +import coremltools as ct +from coremltools.converters.mil.mil import Builder as mb +from coremltools.converters.mil.mil import types +from coremltools.converters.mil.mil.ops.tests.iOS15.test_image_resizing import ( + TestResample as _TestResample_iOS15, +) +from coremltools.converters.mil.mil.ops.tests.iOS16 import backends +from coremltools.converters.mil.mil.ops.tests.testing_utils import ( + mark_api_breaking, + run_compare_builder, +) +from coremltools.converters.mil.testing_reqs import compute_units + + +class TestUpsampleBilinear: + @pytest.mark.parametrize( + "compute_unit, backend, align_corners, half_pixel_centers", + itertools.product( + compute_units, + backends, + [True, False], + [True, False, None], + ), + ) + def test_builder_to_backend_smoke_iOS16( + self, compute_unit, backend, align_corners, half_pixel_centers + ): + if align_corners and half_pixel_centers: + pytest.skip("Invalid configuration of align_corners and half_pixel_centers") + + x = np.array([1, 2], dtype=np.float32).reshape(1, 1, 1, 2) + input_placeholder_dict = {"x": mb.placeholder(shape=x.shape)} + input_value_dict = {"x": x} + + def build_upsample_bilinear(x): + return mb.upsample_bilinear( + x=x, + scale_factor_height=2, + scale_factor_width=3, + align_corners=align_corners, + half_pixel_centers=half_pixel_centers, + ) + + expected_output_type = (1, 1, 2, 6, types.fp32) + + if half_pixel_centers is None: + half_pixel_centers = not align_corners + + if align_corners and not half_pixel_centers: + expected_output = [1.0, 1.2, 1.4, 1.6, 1.8, 2.0, 1.0, 1.2, 1.4, 1.6, 1.8, 2.0] + elif not align_corners and half_pixel_centers: + expected_output = [ + 1.0, + 1.0, + 1.33334, + 1.66667, + 2.0, + 2.0, + 1.0, + 1.0, + 1.33334, + 1.66667, + 2.0, + 2.0, + ] + elif not align_corners and not half_pixel_centers: + expected_output = [ + 1.0, + 1.33334, + 1.66667, + 2.0, + 2.0, + 2.0, + 1.0, + 1.33334, + 1.66667, + 2.0, + 2.0, + 2.0, + ] + else: + raise ValueError("align_corners and half_pixel_centers cannot be both True") + + expected_output = [np.array(expected_output, dtype=np.float32).reshape(1, 1, 2, 6)] + + run_compare_builder( + build_upsample_bilinear, + input_placeholder_dict, + input_value_dict, + expected_output_type, + expected_output, + compute_unit=compute_unit, + backend=backend, + ) + + +class TestCropResize: + @mark_api_breaking(breaking_opset_version=ct.target.iOS17) + @pytest.mark.parametrize( + "compute_unit, backend, pad_value", + itertools.product(compute_units, backends, [0.0, 1.0, 10.0]), + ) + def test_builder_to_backend_ios16(self, compute_unit, backend, pad_value): + """For iOS16+ the crop_resize op supports pad_value.""" + x = np.array( + [[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12], [13, 14, 15, 16]], + dtype=np.float32, + ).reshape(1, 1, 4, 4) + + roi = np.array( + [ + [0, 0.1, 0.3, 1.3, 1], + [0, 0.5, 1.8, 1.0, 0.3], + [0, 0.0, 0.4, 0.6, 0.7], + ], + dtype=np.float32, + ).reshape(3, 1, 5, 1, 1) + + def build(x): + return mb.crop_resize( + x=x, + roi=roi, + target_width=2, + target_height=2, + normalized_coordinates=True, + box_coordinate_mode="CORNERS_HEIGHT_FIRST", + sampling_mode="ALIGN_CORNERS", + pad_value=pad_value, + ) + + expected_output_type = [ + (3, 1, 1, 2, 2, types.fp32), + ] + expected_output = [ + np.array( + [ + 3.1, + 5.2, + pad_value, + pad_value, + 
pad_value, + 7.899, + pad_value, + 13.9, + 2.2, + 3.1, + 9.4, + 10.3, + ], + dtype=np.float32, + ).reshape(3, 1, 1, 2, 2), + ] + + input_placeholder_dict = {"x": mb.placeholder(shape=(1, 1, 4, 4))} + input_value_dict = {"x": x} + + run_compare_builder( + build, + input_placeholder_dict, + input_value_dict, + expected_output_type, + expected_output, + compute_unit=compute_unit, + backend=backend, + ) + + +class TestResample: + @pytest.mark.parametrize( + "compute_unit, backend, coordinates_dtype", + itertools.product( + compute_units, + backends, + (np.int32, np.float16, np.float32), + ), + ) + def test_builder_to_backend_smoke_iOS16(self, compute_unit, backend, coordinates_dtype): + # The fp16 precision will have two casts inserted for input/output + expected_cast_ops = 2 if backend.precision == "fp16" else 0 + if backend.precision == "fp16" and coordinates_dtype == np.float32: + # The coordinates also cast to fp16. + expected_cast_ops += 1 + _TestResample_iOS15._test_builder_to_backend_smoke( + compute_unit, backend, coordinates_dtype, expected_cast_ops + ) diff --git a/coremltools/converters/mil/mil/ops/tests/iOS16/test_scatter_gather.py b/coremltools/converters/mil/mil/ops/tests/iOS16/test_scatter_gather.py new file mode 100644 index 000000000..4e1918623 --- /dev/null +++ b/coremltools/converters/mil/mil/ops/tests/iOS16/test_scatter_gather.py @@ -0,0 +1,185 @@ +# Copyright (c) 2023, Apple Inc. All rights reserved. +# +# Use of this source code is governed by a BSD-3-clause license that can be +# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause + +import itertools + +import numpy as np +import pytest + +import coremltools as ct +from coremltools.converters.mil.mil import Builder as mb +from coremltools.converters.mil.mil import types +from coremltools.converters.mil.mil.ops.tests.iOS16 import backends +from coremltools.converters.mil.mil.ops.tests.testing_utils import ( + mark_api_breaking, + run_compare_builder, +) +from coremltools.converters.mil.testing_reqs import compute_units + + +class TestGather: + @pytest.mark.parametrize( + "compute_unit, backend", + itertools.product(compute_units, backends), + ) + def test_builder_to_backend_smoke_batch_dims(self, compute_unit, backend): + # TODO MAKE SURE RUN ON IOS17 + x = np.array([[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]], dtype=np.float32) + indices = np.array([[[1, 0], [0, 1]], [[1, 0], [0, 0]]], dtype=np.int32) + + input_placeholders = { + "x": mb.placeholder(shape=x.shape), + "indices": mb.placeholder(shape=indices.shape, dtype=types.int32), + } + + input_values = {"x": x, "indices": indices} + + def build(x, indices): + return [ + mb.gather(x=x, indices=indices, axis=1, batch_dims=0), + mb.gather(x=x, indices=indices, axis=1, batch_dims=1), + mb.gather(x=x, indices=indices, axis=2, batch_dims=0), + mb.gather(x=x, indices=indices, axis=2, batch_dims=1), + mb.gather(x=x, indices=indices, axis=2, batch_dims=2), + ] + + expected_output_types = [ + (2, 2, 2, 2, 3, types.fp32), + (2, 2, 2, 3, types.fp32), + (2, 2, 2, 2, 2, types.fp32), + (2, 2, 2, 2, types.fp32), + (2, 2, 2, types.fp32), + ] + + expected_outputs = [ + np.array( + [ + [ + [[[4, 5, 6], [1, 2, 3]], [[1, 2, 3], [4, 5, 6]]], + [[[4, 5, 6], [1, 2, 3]], [[1, 2, 3], [1, 2, 3]]], + ], + [ + [[[10, 11, 12], [7, 8, 9]], [[7, 8, 9], [10, 11, 12]]], + [[[10, 11, 12], [7, 8, 9]], [[7, 8, 9], [7, 8, 9]]], + ], + ], + dtype=np.float32, + ), + np.array( + [ + [[[4, 5, 6], [1, 2, 3]], [[1, 2, 3], [4, 5, 6]]], + [[[10, 11, 12], [7, 8, 9]], [[7, 8, 9], 
[7, 8, 9]]], + ], + dtype=np.float32, + ), + np.array( + [ + [[[[2, 1], [1, 2]], [[2, 1], [1, 1]]], [[[5, 4], [4, 5]], [[5, 4], [4, 4]]]], + [ + [[[8, 7], [7, 8]], [[8, 7], [7, 7]]], + [[[11, 10], [10, 11]], [[11, 10], [10, 10]]], + ], + ], + dtype=np.float32, + ), + np.array( + [[[[2, 1], [1, 2]], [[5, 4], [4, 5]]], [[[8, 7], [7, 7]], [[11, 10], [10, 10]]]], + dtype=np.float32, + ), + np.array([[[2, 1], [4, 5]], [[8, 7], [10, 10]]], dtype=np.float32), + ] + + run_compare_builder( + build, + input_placeholders, + input_values, + expected_output_types, + expected_outputs, + compute_unit=compute_unit, + backend=backend, + ) + + @pytest.mark.parametrize( + "backend", + backends, + ) + def test_builder_eval_batch_dims(self, backend): + @mb.program( + input_specs=[mb.TensorSpec(shape=(1,), dtype=types.fp32)], + opset_version=backend.opset_version, + ) + def prog(x): + params = np.array([[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]], dtype=np.float32) + indices = np.array([[[1, 0], [0, 1]], [[1, 0], [0, 0]]], dtype=np.int32) + res = mb.gather(x=params, indices=indices, axis=2, batch_dims=2) + return res + + main_func = prog.functions["main"] + gather_ops = main_func.find_ops(op_type="gather")[0] + + np.testing.assert_allclose( + np.array([[[2, 1], [4, 5]], [[8, 7], [10, 10]]], dtype=np.float32), + gather_ops.outputs[0].val, + atol=1e-04, + rtol=1e-05, + ) + + +class TestGatherNd: + @pytest.mark.parametrize( + "compute_unit, backend", + itertools.product(compute_units, backends), + ) + def test_builder_to_backend_smoke_batch_dims(self, compute_unit, backend): + # TODO MAKE SURE RUN ON IOS17 + x = np.array([[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]], dtype=np.float32) + indices = np.array([[[1, 0], [0, 1]], [[1, 0], [0, 0]]], dtype=np.int32) + + input_placeholders = { + "x": mb.placeholder(shape=x.shape), + "indices": mb.placeholder(shape=indices.shape, dtype=types.int32), + } + + input_values = {"x": x, "indices": indices} + + def build(x, indices): + return [ + mb.gather_nd(x=x, indices=indices, batch_dims=0), + mb.gather_nd(x=x, indices=indices, batch_dims=1), + ] + + expected_output_types = [(2, 2, 3, types.fp32), (2, 2, types.fp32)] + + expected_outputs = [ + np.array([[[7, 8, 9], [4, 5, 6]], [[7, 8, 9], [1, 2, 3]]], dtype=np.float32), + np.array([[4, 2], [10, 7]], dtype=np.float32), + ] + + run_compare_builder( + build, + input_placeholders, + input_values, + expected_output_types, + expected_outputs, + compute_unit=compute_unit, + backend=backend, + ) + + @mark_api_breaking(breaking_opset_version=ct.target.iOS17) + @pytest.mark.parametrize( + "backend, indices_val", + itertools.product(backends, [[[-1], [2]], [[1], [3]]]), + ) + def test_builder_invalid_indices(self, backend, indices_val): + def prog(x): + params = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32) + indices = np.array(indices_val, dtype=np.int32) + res = mb.gather_nd(x=params, indices=indices, batch_dims=1) + return res + + mb.program( + input_specs=[mb.TensorSpec(shape=(1,), dtype=types.fp32)], + opset_version=backend.opset_version, + )(prog) diff --git a/coremltools/converters/mil/mil/ops/tests/iOS16/test_tensor_operation.py b/coremltools/converters/mil/mil/ops/tests/iOS16/test_tensor_operation.py new file mode 100644 index 000000000..318fc457b --- /dev/null +++ b/coremltools/converters/mil/mil/ops/tests/iOS16/test_tensor_operation.py @@ -0,0 +1,85 @@ +# Copyright (c) 2022, Apple Inc. All rights reserved. 
+#
+# Use of this source code is governed by a BSD-3-clause license that can be
+# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
+
+import itertools
+
+import numpy as np
+import pytest
+
+from coremltools.converters.mil import testing_reqs
+from coremltools.converters.mil.mil import Builder as mb
+from coremltools.converters.mil.mil import types
+from coremltools.converters.mil.mil.ops.tests.iOS16 import backends
+from coremltools.converters.mil.mil.ops.tests.testing_utils import run_compare_builder
+
+compute_units = testing_reqs.compute_units
+
+
+class TestFillLike:
+    @pytest.mark.parametrize("compute_unit, backend", itertools.product(compute_units, backends))
+    def test_builder_to_backend_smoke(self, compute_unit, backend):
+        shape = (2, 1, 3)
+        x_val = np.zeros(shape=shape, dtype=np.float32)
+        input_placeholders = {"x": mb.placeholder(shape=x_val.shape, dtype=types.int32)}
+
+        input_values = {"x": x_val}
+
+        def build(x):
+            return mb.fill_like(ref_tensor=x, value=1.0)
+
+        expected_output_types = [(2, 1, 3, types.fp32)]
+        expected_outputs = [np.full(shape=shape, fill_value=1.0)]
+
+        run_compare_builder(
+            build,
+            input_placeholders,
+            input_values,
+            expected_output_types,
+            expected_outputs,
+            compute_unit=compute_unit,
+            backend=backend,
+        )
+
+
+class TestTopK:
+    @pytest.mark.parametrize(
+        "compute_unit, backend, return_indices, sort",
+        itertools.product(
+            compute_units,
+            backends,
+            [True, False],
+            [True, False],
+        ),
+    )
+    def test_builder_to_backend_smoke_iOS16(self, compute_unit, backend, return_indices, sort):
+        val = np.array([[-1.0, 2.0, -3.0], [4.0, -5.0, 6.0]], dtype=np.float32)
+        input_placeholders = {"x": mb.placeholder(shape=val.shape)}
+        input_values = {"x": val}
+
+        def build(x):
+            return mb.topk(x=x, k=2, axis=1, return_indices=return_indices, sort=sort)
+
+        expected_output_types = [
+            (2, 2, types.fp32),
+            (2, 2, types.int32),
+        ]
+        expected_outputs = [
+            np.array([[2.0, -1.0], [6.0, 4.0]], dtype=np.float32),
+            np.array([[1, 0], [2, 0]], dtype=np.int32),
+        ]
+
+        if not return_indices:
+            expected_output_types = expected_output_types[:1]
+            expected_outputs = expected_outputs[:1]
+
+        run_compare_builder(
+            build,
+            input_placeholders,
+            input_values,
+            expected_output_types,
+            expected_outputs,
+            compute_unit=compute_unit,
+            backend=backend,
+        )
diff --git a/coremltools/converters/mil/mil/ops/tests/iOS16/test_tensor_transformation.py b/coremltools/converters/mil/mil/ops/tests/iOS16/test_tensor_transformation.py
new file mode 100644
index 000000000..72d7be8bd
--- /dev/null
+++ b/coremltools/converters/mil/mil/ops/tests/iOS16/test_tensor_transformation.py
@@ -0,0 +1,164 @@
+# Copyright (c) 2023, Apple Inc. All rights reserved.
+# +# Use of this source code is governed by a BSD-3-clause license that can be +# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause + +import itertools + +import numpy as np +import pytest + +from coremltools._deps import _HAS_TORCH, MSG_TORCH_NOT_FOUND +from coremltools.converters.mil import testing_reqs +from coremltools.converters.mil.mil import Builder as mb +from coremltools.converters.mil.mil import types +from coremltools.converters.mil.mil.ops.tests.iOS16 import backends +from coremltools.converters.mil.mil.ops.tests.testing_utils import run_compare_builder +from coremltools.converters.mil.mil.types.type_mapping import numpy_type_to_builtin_type + +compute_units = testing_reqs.compute_units + + +if _HAS_TORCH: + import torch + +class TestPixelUnshuffle: + @pytest.mark.parametrize( + "compute_unit, backend", + itertools.product( + compute_units, + backends, + ), + ) + def test_builder_to_backend_smoke(self, compute_unit, backend): + val = np.array([[[[9.0, 5.0], [1.0, 3.0]]]], dtype=np.float32) + input_placeholders = {"x": mb.placeholder(shape=val.shape)} + input_values = {"x": val} + + def build(x): + return [mb.pixel_unshuffle(x=x, downscale_factor=np.uint32(2))] + + expected_output_types = (1, 4, 1, 1, types.fp32) + expected_outputs = np.array([[[[9.0]], [[5.0]], [[1.0]], [[3.0]]]], dtype=np.float32) + + run_compare_builder( + build, + input_placeholders, + input_values, + expected_output_types, + expected_outputs, + compute_unit=compute_unit, + backend=backend, + ) + + @pytest.mark.skipif(not testing_reqs._HAS_TORCH, reason=MSG_TORCH_NOT_FOUND) + @pytest.mark.parametrize( + "compute_unit, backend, shape, downscale_factor", + itertools.product( + compute_units, + backends, + [(1, 2, 4, 4), (2, 1, 8, 4)], + [2, 4], + ), + ) + def test_builder_to_backend_stress( + self, + compute_unit, + backend, + shape, + downscale_factor, + ): + val = np.random.rand(*shape) + input_placeholders = {"x": mb.placeholder(shape=val.shape)} + input_values = {"x": val} + + def build(x): + return [mb.pixel_unshuffle(x=x, downscale_factor=np.uint32(downscale_factor))] + + torch_pixel_unshuffle = torch.nn.PixelUnshuffle(downscale_factor) + expected_outputs = [torch_pixel_unshuffle(torch.Tensor(val)).numpy()] + expected_output_types = [o.shape[:] + (types.fp32,) for o in expected_outputs] + run_compare_builder( + build, + input_placeholders, + input_values, + expected_output_types, + expected_outputs, + compute_unit=compute_unit, + backend=backend, + ) + + +class TestReshapeLike: + @pytest.mark.parametrize( + "compute_unit, backend, InputShape_RefShapes_Begins_Ends_EndMasks, x_dtype, ref_dtype", + itertools.product( + compute_units, + backends, + [ + [(4, 3), ((2, 2, 3), (1, 3)), (0, 1), (2, 2), (False, False)], + [(32,), ((1, 2, 2, 2), (3, 2, 2)), (1, 1), (0, 0), (True, True)], + [(72, 1), ((1, 2, 3, 4, 1), (3,)), (1, 0), (0, 1), (True, False)], + ], + [np.float16, np.float32, np.int32, bool], + [np.float16, np.float32, np.int32, bool], + ), + ) + def test_builder_to_backend_smoke( + self, + compute_unit, + backend, + InputShape_RefShapes_Begins_Ends_EndMasks, + x_dtype, + ref_dtype, + ): + input_shape, ref_shapes, begins, ends, end_masks = InputShape_RefShapes_Begins_Ends_EndMasks + ref_shape_1, ref_shape_2 = ref_shapes + x_builtin_dtype = numpy_type_to_builtin_type(x_dtype) + ref_builtin_dtype = numpy_type_to_builtin_type(ref_dtype) + + x_val = np.random.randint(low=0, high=6, size=input_shape).astype(x_dtype) + ref_tensor_1 = np.random.randint(low=0, high=6, 
size=ref_shape_1).astype(ref_dtype)
+        ref_tensor_2 = np.random.randint(low=0, high=6, size=ref_shape_2).astype(ref_dtype)
+
+        input_placeholders = {
+            "x": mb.placeholder(shape=x_val.shape, dtype=x_builtin_dtype),
+            "ref_tensor_1": mb.placeholder(shape=ref_shape_1, dtype=ref_builtin_dtype),
+            "ref_tensor_2": mb.placeholder(shape=ref_shape_2, dtype=ref_builtin_dtype),
+        }
+        input_values = {
+            "x": x_val,
+            "ref_tensor_1": ref_tensor_1,
+            "ref_tensor_2": ref_tensor_2,
+        }
+
+        def build(x, ref_tensor_1, ref_tensor_2):
+            return mb.reshape_like(
+                x=x,
+                ref_tensors=(ref_tensor_1, ref_tensor_2),
+                begins=begins,
+                ends=ends,
+                end_masks=end_masks,
+            )
+
+        output_shape = ()
+        for ref_shape, begin, end, end_mask in zip(
+            (ref_shape_1, ref_shape_2), begins, ends, end_masks
+        ):
+            if end_mask:
+                output_shape += tuple(ref_shape[begin:])
+            else:
+                output_shape += tuple(ref_shape[begin:end])
+
+        expected_output_types = [output_shape + (x_builtin_dtype,)]
+        expected_outputs = [np.reshape(x_val, output_shape).astype(x_dtype)]
+
+        run_compare_builder(
+            build,
+            input_placeholders,
+            input_values,
+            expected_output_types,
+            expected_outputs,
+            compute_unit=compute_unit,
+            backend=backend,
+        )
diff --git a/coremltools/converters/mil/mil/ops/tests/iOS17/__init__.py b/coremltools/converters/mil/mil/ops/tests/iOS17/__init__.py
new file mode 100644
index 000000000..3f8f9fe78
--- /dev/null
+++ b/coremltools/converters/mil/mil/ops/tests/iOS17/__init__.py
@@ -0,0 +1,9 @@
+# Copyright (c) 2023, Apple Inc. All rights reserved.
+#
+# Use of this source code is governed by a BSD-3-clause license that can be
+# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
+
+import coremltools as ct
+from coremltools.converters.mil.testing_reqs import backends_internal, clean_up_backends
+
+backends = clean_up_backends(backends_internal, ct.target.iOS17)
diff --git a/coremltools/converters/mil/mil/ops/tests/iOS17/test_activation.py b/coremltools/converters/mil/mil/ops/tests/iOS17/test_activation.py
new file mode 100644
index 000000000..48123fbd5
--- /dev/null
+++ b/coremltools/converters/mil/mil/ops/tests/iOS17/test_activation.py
@@ -0,0 +1,182 @@
+# Copyright (c) 2023, Apple Inc. All rights reserved.
+#
+# Use of this source code is governed by a BSD-3-clause license that can be
+# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
+
+import itertools
+
+import numpy as np
+import pytest
+
+from coremltools.converters.mil.mil import Builder as mb
+from coremltools.converters.mil.mil import types
+from coremltools.converters.mil.mil.ops.tests.iOS17 import backends
+from coremltools.converters.mil.mil.ops.tests.testing_utils import run_compare_builder
+from coremltools.converters.mil.testing_reqs import compute_units
+
+
+class TestInputWeightDifferentDtypes:
+    """
+    Starting from iOS17, alpha/beta can have different dtypes from the input/output, so this
+    test class mainly verifies the behaviour of those alpha/beta-related activations.
+    """
+
+    @pytest.mark.parametrize(
+        "backend, different_dtype, op_name",
+        itertools.product(
+            backends,
+            [True, False],
+            ["elu", "leaky_relu", "prelu", "thresholded_relu"],
+        ),
+    )
+    def test_builder_eval_alpha(self, backend, different_dtype, op_name):
+        x = np.array([[[-1, 2, -3], [4, -5, 6]]], dtype=np.float32)
+        alpha = np.float16(2.0) if different_dtype else np.float32(2.0)
+        if op_name == "prelu":
+            alpha = np.array([2.0, 2.0], dtype=alpha.dtype)  # prelu requires alpha to be rank 1.
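+            # (Here x has two channels along axis 1, so alpha carries one value per
+            # channel; a scalar alpha would not satisfy prelu's rank-1 requirement.)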
+ + def prog(): + return getattr(mb, op_name)(x=x, alpha=alpha) + + mb.program(input_specs=[], opset_version=backend.opset_version)(prog) + + @pytest.mark.parametrize( + "backend, different_dtype, op_name", + itertools.product( + backends, + [True, False], + [ + "clamped_relu", + "linear_activation", + "scaled_tanh", + "sigmoid_hard", + "softplus_parametric", + ], + ), + ) + def test_builder_eval_alpha_beta(self, backend, different_dtype, op_name): + x = np.array([[[-1, 2, -3], [4, -5, 6]]], dtype=np.float32) + alpha = np.float16(2.0) if different_dtype else np.float32(2.0) + beta = np.float16(1.0) if different_dtype else np.float32(1.0) + if op_name == "softplus_parametric": + alpha = np.array([2.0, 2.0], dtype=alpha.dtype) + beta = np.array([1.0, 1.0], dtype=beta.dtype) + + def prog(): + return getattr(mb, op_name)(x=x, alpha=alpha, beta=beta) + + mb.program(input_specs=[], opset_version=backend.opset_version)(prog) + + @pytest.mark.parametrize( + "compute_unit, backend, different_dtype, op_name", + itertools.product( + compute_units, + backends, + [True, False], + ["elu", "leaky_relu", "prelu", "thresholded_relu"], + ), + ) + def test_builder_to_backend_numerical_alpha( + self, compute_unit, backend, different_dtype, op_name + ): + x = np.array([[[-1, 2, -3], [4, -5, 6]]], dtype=np.float32) + alpha = np.float16(2.0) if different_dtype else np.float32(2.0) + if op_name == "prelu": + alpha = np.array([2.0, 2.0], dtype=alpha.dtype) + + def calculate_by_np(): + if op_name == "elu": + res = np.copy(x) + res[res < 0] = alpha * (np.exp(res[res < 0]) - 1) + return res + elif op_name == "leaky_relu": + res = np.copy(x) + res[res < 0] *= 2.0 + return res + elif op_name == "prelu": + alpha_br = np.copy(alpha) + for i in range(len(x.shape)): + if i != 1: + alpha_br = np.expand_dims(alpha_br, i) + res = np.maximum(x, 0) + np.minimum(x, 0) * alpha_br + return res + elif op_name == "thresholded_relu": + res = np.copy(x) + res[res < alpha] = 0.0 + return res + else: + raise ValueError(f"Invalid op_name: {op_name}") + + def build(x): + return getattr(mb, op_name)(x=x, alpha=alpha) + + run_compare_builder( + build, + input_placeholders={"x": mb.placeholder(shape=x.shape)}, + input_values={"x": x}, + expected_output_types=x.shape + (types.fp32,), + expected_outputs=calculate_by_np(), + compute_unit=compute_unit, + backend=backend, + ) + + @pytest.mark.parametrize( + "compute_unit, backend, different_dtype, op_name", + itertools.product( + compute_units, + backends, + [True, False], + [ + "clamped_relu", + "linear_activation", + "scaled_tanh", + "sigmoid_hard", + "softplus_parametric", + ], + ), + ) + def test_builder_to_backend_numerical_alpha_beta( + self, compute_unit, backend, different_dtype, op_name + ): + x = np.array([[[-1, 2, -3], [4, -5, 6]]], dtype=np.float32) + alpha = np.float16(2.0) if different_dtype else np.float32(2.0) + beta = np.float16(1.0) if different_dtype else np.float32(1.0) + if op_name == "softplus_parametric": + alpha = np.array([2.0, 2.0], dtype=alpha.dtype) + beta = np.array([1.0, 1.0], dtype=beta.dtype) + + def calculate_by_np(): + if op_name == "clamped_relu": + return np.minimum(np.maximum(x, 0), beta) + np.minimum( + np.minimum(x, 0) * alpha, beta + ) + elif op_name == "linear_activation": + return x * alpha + beta + elif op_name == "scaled_tanh": + return alpha * np.tanh(x * beta) + elif op_name == "sigmoid_hard": + return np.minimum(np.maximum((alpha * x) + beta, 0), 1) + elif op_name == "softplus_parametric": + alpha_br = alpha + beta_br = beta + for i in 
range(len(x.shape)): + if i != 1: + alpha_br = np.expand_dims(alpha_br, i) + beta_br = np.expand_dims(beta_br, i) + res = alpha_br * np.log(np.exp(x * beta_br) + 1) + return res + else: + raise ValueError(f"Invalid op_name: {op_name}") + + def build(x): + return getattr(mb, op_name)(x=x, alpha=alpha, beta=beta) + + run_compare_builder( + build, + input_placeholders={"x": mb.placeholder(shape=x.shape)}, + input_values={"x": x}, + expected_output_types=x.shape + (types.fp32,), + expected_outputs=calculate_by_np(), + compute_unit=compute_unit, + backend=backend, + ) diff --git a/coremltools/converters/mil/mil/ops/tests/iOS17/test_conv.py b/coremltools/converters/mil/mil/ops/tests/iOS17/test_conv.py new file mode 100644 index 000000000..65f68b116 --- /dev/null +++ b/coremltools/converters/mil/mil/ops/tests/iOS17/test_conv.py @@ -0,0 +1,171 @@ +# Copyright (c) 2023, Apple Inc. All rights reserved. +# +# Use of this source code is governed by a BSD-3-clause license that can be +# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause + +import itertools + +import numpy as np +import pytest + +from coremltools._deps import _HAS_TORCH, MSG_TORCH_NOT_FOUND +from coremltools.converters.mil.mil.ops.tests.iOS14.test_conv import TestConv as _TestConvIos14 +from coremltools.converters.mil.mil.ops.tests.iOS14.test_conv import ( + TestConvTranspose as _TestTestConvTransposeIos14, +) +from coremltools.converters.mil.mil.ops.tests.iOS17 import backends +from coremltools.converters.mil.testing_reqs import compute_units + + +class TestConv(_TestConvIos14): + @pytest.mark.skipif(not _HAS_TORCH, reason=MSG_TORCH_NOT_FOUND) + @pytest.mark.parametrize( + ",".join( + [ + "compute_unit", + "backend", + "conv_dim", + "config", + "x_weight_dtype", + ] + ), + itertools.product( + compute_units, + backends, + ["conv1d", "conv2d", "conv3d"], + [ + { + "padding": (1, 1, 1), + "DHWKdKhKw": (10, 12, 14, 3, 2, 4), + "stride": (2, 1, 1), + "dilation": (1, 1, 1), + "has_bias": False, + "groups": 1, + "symbolic": False, + }, + { + "padding": (2, 2, 2), + "DHWKdKhKw": (10, 12, 14, 3, 2, 4), + "stride": (2, 2, 2), + "dilation": (2, 1, 1), + "has_bias": False, + "groups": 2, + "symbolic": True, + }, + { + "padding": (1, 1, 1), + "DHWKdKhKw": (5, 5, 5, 2, 2, 2), + "stride": (2, 2, 2), + "dilation": (2, 1, 1), + "has_bias": True, + "groups": 1, + "symbolic": True, + }, + { + "padding": (2, 2, 2), + "DHWKdKhKw": (5, 5, 5, 2, 2, 2), + "stride": (2, 1, 1), + "dilation": (1, 1, 1), + "has_bias": True, + "groups": 2, + "symbolic": False, + }, + ], + [ + (np.float32, np.float32), + (np.float16, np.float16), + (np.float16, np.float32), + (np.float32, np.float16), + ], + ), + ) + def test_builder_to_backend_stress( + self, + compute_unit, + backend, + conv_dim, + config, + x_weight_dtype, + ): + super().test_builder_to_backend_stress( + compute_unit, backend, conv_dim, config, x_weight_dtype + ) + + +class TestConvTranspose(_TestTestConvTransposeIos14): + @pytest.mark.skipif(not _HAS_TORCH, reason=MSG_TORCH_NOT_FOUND) + @pytest.mark.parametrize( + ",".join( + [ + "compute_unit", + "backend", + "conv_dim", + "config", + "x_weight_dtype", + ] + ), + itertools.product( + compute_units, + backends, + ["conv1d", "conv2d", "conv3d"], + [ + { + "padding": (1, 2, 3), + "DHWKdKhKw": (10, 12, 14, 3, 2, 4), + "stride": (2, 1, 1), + "dilation": (1, 1, 1), + "has_bias": False, + "groups": 1, + "test_symbolic": False, + "test_output_shape": True, + }, + { + "padding": (2, 2, 2), + "DHWKdKhKw": (10, 12, 14, 3, 2, 4), + "stride": 
(2, 2, 2), + "dilation": (2, 1, 1), + "has_bias": False, + "groups": 2, + "test_symbolic": True, + "test_output_shape": False, + }, + { + "padding": (1, 2, 3), + "DHWKdKhKw": (7, 7, 7, 2, 2, 2), + "stride": (2, 2, 2), + "dilation": (2, 1, 1), + "has_bias": True, + "groups": 1, + "test_symbolic": True, + "test_output_shape": False, + }, + { + "padding": (2, 2, 2), + "DHWKdKhKw": (7, 7, 7, 2, 2, 2), + "stride": (2, 1, 1), + "dilation": (1, 1, 1), + "has_bias": True, + "groups": 2, + "test_symbolic": False, + "test_output_shape": False, + }, + ], + [ + (np.float32, np.float32), + (np.float16, np.float16), + (np.float16, np.float32), + (np.float32, np.float16), + ], + ), + ) + def test_builder_to_backend_stress( + self, + compute_unit, + backend, + conv_dim, + config, + x_weight_dtype, + ): + super().test_builder_to_backend_stress( + compute_unit, backend, conv_dim, config, x_weight_dtype + ) diff --git a/coremltools/converters/mil/mil/ops/tests/iOS17/test_elementwise_unary.py b/coremltools/converters/mil/mil/ops/tests/iOS17/test_elementwise_unary.py new file mode 100644 index 000000000..a41ac776f --- /dev/null +++ b/coremltools/converters/mil/mil/ops/tests/iOS17/test_elementwise_unary.py @@ -0,0 +1,177 @@ +# Copyright (c) 2023, Apple Inc. All rights reserved. +# +# Use of this source code is governed by a BSD-3-clause license that can be +# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause + +import itertools +from unittest.mock import patch + +import numpy as np +import pytest + +import coremltools as ct +from coremltools.converters.mil.mil import Builder as mb +from coremltools.converters.mil.mil import types +from coremltools.converters.mil.mil.ops.tests.iOS14.test_elementwise_unary import ( + TestElementwiseUnary as _TestElementwiseUnary_iOS14, +) +from coremltools.converters.mil.mil.ops.tests.iOS17 import backends +from coremltools.converters.mil.mil.ops.tests.testing_utils import run_compare_builder +from coremltools.converters.mil.mil.passes.pass_pipeline import PassPipeline +from coremltools.converters.mil.mil.types.type_mapping import numpy_type_to_builtin_type +from coremltools.converters.mil.mil.var import Var +from coremltools.converters.mil.testing_reqs import compute_units +from coremltools.converters.mil.testing_utils import get_op_types_in_program + + +class TestElementwiseUnary: + @pytest.mark.parametrize( + "compute_unit, backend, src_dtype, dst_dtype", + itertools.product( + compute_units, + backends, + [ + np.float16, + np.float32, + np.int32, + np.int16, + np.uint16, + np.int8, + np.uint8, + ], + [ + np.float16, + np.float32, + np.int32, + np.int16, + np.uint16, + np.int8, + np.uint8, + ], + ), + ) + def test_builder_eval_cast_ios17(self, compute_unit, backend, src_dtype, dst_dtype): + x = np.array([[1, 2, 3], [4, 5, 6]], dtype=src_dtype) + dst_dtype_str = types.builtin_to_string(numpy_type_to_builtin_type(dst_dtype)) + expected_res = x.astype(dtype=np.float16) + + @mb.program(input_specs=[], opset_version=backend.opset_version) + def prog(): + return mb.cast(x=x, dtype=dst_dtype_str) + + main_func = prog.functions["main"] + cast_op = main_func.find_ops(op_type="cast")[0] + np.testing.assert_allclose(expected_res, cast_op.outputs[0].val, atol=1e-04, rtol=1e-05) + + @pytest.mark.parametrize( + "compute_unit, backend, src_dtype, dst_dtype", + itertools.product( + compute_units, + backends, + [np.float16, np.float32, np.int16, np.int32, np.uint16, np.int8, np.uint8], + [np.float16, np.float32, np.int16, np.int32, np.uint16, np.int8, np.uint8], 
+        ),
+    )
+    def test_builder_to_backend_cast_ios17(self, compute_unit, backend, src_dtype, dst_dtype):
+        _SUPPORTED_IO_DTYPES = {types.fp16, types.fp32, types.int32}
+        x = np.array([[1, 2, 3], [4, 5, 6]], dtype=src_dtype)
+        src_builtin_dtype = numpy_type_to_builtin_type(src_dtype)
+        dst_builtin_dtype = numpy_type_to_builtin_type(dst_dtype)
+        expected_res = x.astype(dtype=np.float16)
+
+        expected_cast_num = 1
+        if src_builtin_dtype not in _SUPPORTED_IO_DTYPES:
+            # A cast will be inserted for inputs with unsupported dtypes.
+            expected_cast_num += 1
+
+        # As CoreML IO only allows fp16/32 and int32, the output will be further cast.
+        expected_res_builtin_dtype = dst_builtin_dtype
+        if dst_builtin_dtype not in _SUPPORTED_IO_DTYPES:
+            expected_res_builtin_dtype = (
+                types.int32 if types.is_int(dst_builtin_dtype) else types.fp32
+            )
+            expected_cast_num += 1
+
+        def build(x):
+            return mb.cast(x=x, dtype=types.builtin_to_string(dst_builtin_dtype))
+
+        with patch.object(Var, "_is_nonreplaceable_var") as mocked_is_nonreplaceable_var:
+            # Mock that the cast is non-replaceable, to make sure it's kept in the graph.
+            mocked_is_nonreplaceable_var.side_effect = (
+                lambda var: var.op and var.op.op_type == "cast"
+            )
+            # Remove the cast optimization pass to make sure all casts are kept in the graph.
+            pass_pipeline: PassPipeline = PassPipeline.DEFAULT
+            pass_pipeline.remove_passes(
+                ["common::cast_optimization", "common::topological_reorder"]
+            )
+            mlmodel = run_compare_builder(
+                build,
+                {"x": mb.placeholder(shape=x.shape, dtype=src_builtin_dtype)},
+                input_values={"x": x},
+                expected_output_types=x.shape + (expected_res_builtin_dtype,),
+                expected_outputs=expected_res,
+                compute_unit=compute_unit,
+                backend=backend,
+                pass_pipeline=pass_pipeline,
+            )
+            prog = mlmodel._mil_program
+            cast_ops = prog["main"].find_ops(op_type="cast")
+            assert len(cast_ops) == expected_cast_num
+
+    @pytest.mark.parametrize(
+        "compute_unit, backend, op_name, epsilon_val, x_eps_dtype",
+        itertools.product(
+            compute_units,
+            backends,
+            ["inverse", "log", "rsqrt"],
+            [1e-3, 1e-1, 1.0],
+            [(np.float32, np.float16), (np.float16, np.float32)],
+        ),
+    )
+    def test_builder_to_backend_stress_with_epsilon(
+        self,
+        compute_unit,
+        backend,
+        op_name,
+        epsilon_val,
+        x_eps_dtype,
+    ):
+        # From iOS17, epsilon can have a different dtype than x.
+        _TestElementwiseUnary_iOS14._test_builder_to_backend_stress_with_epsilon(
+            compute_unit, backend, op_name, epsilon_val, x_eps_dtype
+        )
+
+    @pytest.mark.parametrize(
+        "compute_unit, backend",
+        itertools.product(
+            [ct.ComputeUnit.CPU_ONLY, ct.ComputeUnit.CPU_AND_GPU, ct.ComputeUnit.ALL],
+            backends,
+        ),
+    )
+    def test_cast_fp16_output_bug_smoke(self, compute_unit, backend):
+        """
+        Since an fp16 output bug in Core ML can only be reproduced by non-CPU backends,
+        for this test, we hardcode the compute_unit.
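+        Enumerating CPU_AND_GPU and ALL explicitly (rather than using the shared
+        compute_units fixture) makes sure the non-CPU path is actually exercised.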
+ """ + + def build(x): + return mb.cast(x=x, dtype="fp16") + + x = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.int32) + expected_res = x.astype(dtype=np.float16) + + mlmodel = run_compare_builder( + build, + {"x": mb.placeholder(shape=x.shape, dtype=types.int32)}, + input_values={"x": x}, + expected_output_types=x.shape + (types.fp16,), + expected_outputs=expected_res, + compute_unit=compute_unit, + backend=backend, + ) + + prog = mlmodel._mil_program + assert get_op_types_in_program(prog) == ["cast"] + cast_op = prog.find_ops(op_type="cast", exactly_one=True)[0] + assert cast_op.dtype.val == "fp16" diff --git a/coremltools/converters/mil/mil/ops/tests/iOS17/test_image_resizing.py b/coremltools/converters/mil/mil/ops/tests/iOS17/test_image_resizing.py new file mode 100644 index 000000000..f756c878c --- /dev/null +++ b/coremltools/converters/mil/mil/ops/tests/iOS17/test_image_resizing.py @@ -0,0 +1,407 @@ +# Copyright (c) 2023, Apple Inc. All rights reserved. +# +# Use of this source code is governed by a BSD-3-clause license that can be +# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause + +import itertools + +import numpy as np +import pytest + +from coremltools.converters.mil.mil import Builder as mb +from coremltools.converters.mil.mil import types +from coremltools.converters.mil.mil.ops.tests.iOS15.test_image_resizing import ( + TestResample as _TestResampleIos15, +) +from coremltools.converters.mil.mil.ops.tests.iOS17 import backends +from coremltools.converters.mil.mil.ops.tests.testing_utils import UNK_SYM, run_compare_builder +from coremltools.converters.mil.testing_reqs import compute_units + + +class TestCropResize: + @pytest.mark.parametrize( + "compute_unit, backend, N", + itertools.product(compute_units, backends, [1, 3]), + ) + def test_builder_to_backend_ios17(self, compute_unit, backend, N): + """For iOS17+ the `roi` input is replaced by `boxes` and `box_indices`.""" + x = np.arange(1, 17, dtype=np.float32).reshape(1, 1, 4, 4) + boxes = np.array([1, 1, 2, 2], dtype=np.float32).reshape(1, 4) + box_indices = None + normalized_coordinates = False + if N == 3: + boxes = np.array( + [ + [0.1, 0.3, 1.3, 1.0], + [0.5, 1.8, 1.0, 0.3], + [0.0, 0.4, 0.6, 0.7], + ], + dtype=np.float32, + ) + box_indices = np.array([0] * 3, dtype=np.int32) + normalized_coordinates = True + + def build(x): + return mb.crop_resize( + x=x, + boxes=boxes, + box_indices=box_indices, + target_width=2, + target_height=2, + normalized_coordinates=normalized_coordinates, + box_coordinate_mode="CORNERS_HEIGHT_FIRST", + sampling_mode="ALIGN_CORNERS", + pad_value=10.0, + ) + + expected_outputs = [np.array([6, 7, 10, 11], dtype=np.float32).reshape(1, 1, 2, 2)] + if N == 3: + expected_outputs = [ + np.array( + [3.1, 5.2, 10.0, 10.0, 10.0, 7.899, 10.0, 13.9, 2.2, 3.1, 9.4, 10.3], + dtype=np.float32, + ).reshape(3, 1, 2, 2) + ] + + run_compare_builder( + build, + input_placeholders={"x": mb.placeholder(shape=(1, 1, 4, 4))}, + input_values={"x": x}, + expected_output_types=[(N, 1, 2, 2, types.fp32)], + expected_outputs=expected_outputs, + compute_unit=compute_unit, + backend=backend, + ) + + @pytest.mark.parametrize( + "backend", + backends, + ) + def test_builder_eval_ios17_invalid(self, backend): + x = np.arange(1, 17, dtype=np.float32).reshape(1, 1, 4, 4) + three_boxes = np.array( + [ + [0.1, 0.3, 1.3, 1.0], + [0.5, 1.8, 1.0, 0.3], + [0.0, 0.4, 0.6, 0.7], + ], + dtype=np.float32, + ) + with pytest.raises( + ValueError, + match='N dimension of "boxes" \(3\) should not be greater ' + 
'than the B dimension of "x" \(1\)', + ): + + @mb.program(input_specs=[], opset_version=backend.opset_version) + def prog(): + return mb.crop_resize(x=x, boxes=three_boxes) + + one_box = np.array([1, 1, 2, 2], dtype=np.float32).reshape(1, 4) + indices_out_of_bound = np.array([10], dtype=np.int32) + with pytest.raises( + ValueError, + match='input "box_indices" should not have values >= B ' + "dimension of x \(1\), but got \[10\]", + ): + + @mb.program(input_specs=[], opset_version=backend.opset_version) + def prog(): + return mb.crop_resize(x=x, boxes=one_box, box_indices=indices_out_of_bound) + + indices_two_dim = np.array([[0]], dtype=np.int32) + with pytest.raises( + ValueError, match='input "box_indices" must has shape \[1\], but got \(1, 1\)' + ): + + @mb.program(input_specs=[], opset_version=backend.opset_version) + def prog(): + return mb.crop_resize(x=x, boxes=one_box, box_indices=indices_two_dim) + + x_rank5 = np.arange(1, 17, dtype=np.float32).reshape(1, 1, 4, 4, 1) + with pytest.raises( + ValueError, match='input to the "crop_resize" op must be of rank 4, but got 5' + ): + + @mb.program(input_specs=[], opset_version=backend.opset_version) + def prog(): + return mb.crop_resize(x=x_rank5, boxes=one_box) + + +class TestResample(_TestResampleIos15): + @pytest.mark.parametrize( + "compute_unit, backend, coordinates_dtype", + itertools.product( + compute_units, + backends, + (np.int8, np.uint8, np.int16, np.uint16, np.int32, np.float16, np.float32), + ), + ) + def test_builder_to_backend_smoke(self, compute_unit, backend, coordinates_dtype): + # The fp16 precision will have two casts inserted for input/output + expected_cast_ops = 2 if backend.precision == "fp16" else 0 + if backend.precision == "fp16" and coordinates_dtype == np.float32: + # The coordinates also cast to fp16. + expected_cast_ops += 1 + if coordinates_dtype not in (np.int32, np.float16, np.float32): + # For dtype not supported in CoreML I/O, a cast will be inserted. 
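+            # For example, int8/uint8/int16/uint16 coordinates are not valid Core ML I/O
+            # dtypes and enter the model as int32, so one extra cast op is expected here.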
+ expected_cast_ops += 1 + self._test_builder_to_backend_smoke( + compute_unit, backend, coordinates_dtype, expected_cast_ops + ) + + +class TestResize: + @pytest.mark.parametrize( + "compute_unit, backend", itertools.product(compute_units, backends) + ) + def test_resize_nearest_neighbor(self, compute_unit, backend): + def build_model(x): + return mb.resize( + x=x, + shape=[1, 1, 3, 2], + resized_dims=np.uint32(2), + interpolation_mode="NEAREST_NEIGHBOR", + sampling_mode="DEFAULT", + ) + + x_val = np.array([-6.174, 9.371], dtype=np.float32).reshape([1, 1, 1, 2, 1]) + input_placeholder_dict = {"x": mb.placeholder(shape=x_val.shape)} + input_value_dict = {"x": x_val} + expected_output_types = [(1, 1, 1, 3, 2, types.fp32)] + expected_outputs = [ + np.array([[-6.174, -6.174, 9.371, 9.371, 9.371, 9.371]], dtype=np.float32).reshape( + [1, 1, 1, 3, 2] + ) + ] + + run_compare_builder( + build_model, + input_placeholder_dict, + input_value_dict, + expected_output_types, + expected_outputs, + compute_unit=compute_unit, + backend=backend, + ) + + @pytest.mark.parametrize("compute_unit, backend", itertools.product(compute_units, backends)) + def test_resize_nearest_neighbor_dynamic_shape(self, compute_unit, backend): + def build_model(x, shape): + return mb.resize( + x=x, + shape=shape, + resized_dims=np.uint32(2), + interpolation_mode="NEAREST_NEIGHBOR", + sampling_mode="DEFAULT", + ) + + x_val = np.array([-6.174, 9.371], dtype=np.float32).reshape([1, 1, 2, 1]) + shape_val = np.array([1, 1, 3, 2], dtype=np.int32) + input_placeholder_dict = { + "x": mb.placeholder(shape=x_val.shape, dtype=types.fp32), + "shape": mb.placeholder(shape=shape_val.shape, dtype=types.int32), + } + input_value_dict = {"x": x_val, "shape": shape_val} + expected_output_types = [(1, 1, UNK_SYM, UNK_SYM, types.fp32)] + expected_outputs = [ + np.array([[-6.174, -6.174, 9.371, 9.371, 9.371, 9.371]], dtype=np.float32).reshape( + [1, 1, 3, 2] + ) + ] + + run_compare_builder( + build_model, + input_placeholder_dict, + input_value_dict, + expected_output_types, + expected_outputs, + compute_unit=compute_unit, + backend=backend, + ) + + @pytest.mark.parametrize("compute_unit, backend", itertools.product(compute_units, backends)) + def test_resize_linear(self, compute_unit, backend): + def build_model(x): + return mb.resize( + x=x, + shape=[1, 1, 5], + resized_dims=np.uint32(2), + interpolation_mode="LINEAR", + sampling_mode="DEFAULT", + ) + + x_val = np.array([0, 1], dtype=np.float32).reshape([1, 1, 2]) + input_placeholder_dict = {"x": mb.placeholder(shape=x_val.shape)} + input_value_dict = {"x": x_val} + expected_output_types = [(1, 1, 5, types.fp32)] + expected_outputs = [np.array([[0, 0.4, 0.8, 1, 1]], dtype=np.float32).reshape([1, 1, 5])] + + run_compare_builder( + build_model, + input_placeholder_dict, + input_value_dict, + expected_output_types, + expected_outputs, + compute_unit=compute_unit, + backend=backend, + ) + + @pytest.mark.parametrize("compute_unit, backend", itertools.product(compute_units, backends)) + def test_resize_linear_dynamic_shape(self, compute_unit, backend): + def build_model(x, shape): + return mb.resize( + x=x, + shape=shape, + resized_dims=np.uint32(2), + interpolation_mode="LINEAR", + sampling_mode="DEFAULT", + ) + + x_val = np.array([0, 1], dtype=np.float32).reshape([1, 1, 1, 2]) + shape_val = np.array([3, 1, 5], dtype=np.int32) + input_placeholder_dict = { + "x": mb.placeholder(shape=x_val.shape, dtype=types.fp32), + "shape": mb.placeholder(shape=shape_val.shape, dtype=types.int32), + } + 
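+        # The target shape is supplied at runtime, so the two resized dims cannot be
+        # inferred at compile time and appear as symbolic (UNK_SYM) below.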
input_value_dict = {"x": x_val, "shape": shape_val} + expected_output_types = [(1, 1, UNK_SYM, UNK_SYM, types.fp32)] + expected_outputs = [np.array([[0, 0.4, 0.8, 1, 1]], dtype=np.float32).reshape([1, 1, 1, 5])] + + run_compare_builder( + build_model, + input_placeholder_dict, + input_value_dict, + expected_output_types, + expected_outputs, + compute_unit=compute_unit, + backend=backend, + ) + + @pytest.mark.parametrize("compute_unit, backend", itertools.product(compute_units, backends)) + def test_resize_invalid_parameter(self, compute_unit, backend): + def build_invalid_interpolation_mode(x): + return mb.resize( + x=x, + shape=[1, 1, 5], + resized_dims=np.uint32(2), + interpolation_mode="DUMMY", + sampling_mode="DEFAULT", + ) + + def build_invalid_sampling_mode(x): + return mb.resize( + x=x, + shape=[1, 1, 5], + resized_dims=np.uint32(2), + interpolation_mode="LINEAR", + sampling_mode="DUMMY", + ) + + def build_invalid_target_shape(x): + return mb.resize( + x=x, + shape=[1, 1, 1, 5], + resized_dims=np.uint32(2), + interpolation_mode="LINEAR", + sampling_mode="DEFAULT", + ) + + x_val = np.array([0, 1], dtype=np.float32).reshape([1, 1, 2]) + input_placeholder_dict = {"x": mb.placeholder(shape=x_val.shape)} + input_value_dict = {"x": x_val} + + with pytest.raises(ValueError, match="Invalid interpolation_mode"): + run_compare_builder( + build_invalid_interpolation_mode, + input_placeholder_dict, + input_value_dict, + compute_unit=compute_unit, + backend=backend, + ) + + with pytest.raises(ValueError, match="Invalid sampling_mode"): + run_compare_builder( + build_invalid_sampling_mode, + input_placeholder_dict, + input_value_dict, + compute_unit=compute_unit, + backend=backend, + ) + + with pytest.raises(ValueError, match="The shape's size \(4\) must <= x's rank \(3\)"): + run_compare_builder( + build_invalid_target_shape, + input_placeholder_dict, + input_value_dict, + compute_unit=compute_unit, + backend=backend, + ) + + @pytest.mark.parametrize( + "compute_unit, backend, interpolation_mode", + itertools.product(compute_units, backends, ("LINEAR",)), + ) + def test_resize_inherit_shape(self, compute_unit, backend, interpolation_mode): + def build_model(x): + return mb.resize( + x=x, + shape=[1, 0, 0, 0], + resized_dims=np.uint32(3), + interpolation_mode=interpolation_mode, + sampling_mode="DEFAULT", + ) + + pytest.xfail("rdar://112418424 Backend failed when input shape has 0.") + + x_val = np.array([-6.174, 9.371], dtype=np.float32).reshape([1, 1, 1, 2, 1]) + input_placeholder_dict = {"x": mb.placeholder(shape=x_val.shape)} + input_value_dict = {"x": x_val} + expected_output_types = [(1, 1, 1, 2, 1, types.fp32)] + expected_outputs = [np.array([-6.174, 9.371], dtype=np.float32).reshape([1, 1, 1, 2, 1])] + + run_compare_builder( + build_model, + input_placeholder_dict, + input_value_dict, + expected_output_types, + expected_outputs, + compute_unit=compute_unit, + backend=backend, + ) + + @pytest.mark.parametrize( + "compute_unit, backend, interpolation_mode", + itertools.product(compute_units, backends, ("LINEAR", "NEAREST_NEIGHBOR")), + ) + def test_resize_inherit_shape_dynamic(self, compute_unit, backend, interpolation_mode): + def build_model(x, shape): + return mb.resize( + x=x, + shape=shape, + resized_dims=np.uint32(2), + interpolation_mode=interpolation_mode, + sampling_mode="DEFAULT", + ) + + pytest.xfail("rdar://112418424 Backend failed when input shape has 0.") + + x_val = np.array([0, 1], dtype=np.float32).reshape([1, 1, 1, 2]) + shape_val = np.array([1, 0, 0], dtype=np.int32) + 
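+        # A 0 in the target shape means "inherit that dim from x", so the expected
+        # output below keeps the input's original trailing dims.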
input_placeholder_dict = { + "x": mb.placeholder(shape=x_val.shape, dtype=types.fp32), + "shape": mb.placeholder(shape=shape_val.shape, dtype=types.int32), + } + input_value_dict = {"x": x_val, "shape": shape_val} + expected_output_types = [(1, 1, UNK_SYM, UNK_SYM, types.fp32)] + expected_outputs = [np.array([[0, 1]], dtype=np.float32).reshape([1, 1, 1, 2])] + + run_compare_builder( + build_model, + input_placeholder_dict, + input_value_dict, + expected_output_types, + expected_outputs, + compute_unit=compute_unit, + backend=backend, + ) diff --git a/coremltools/converters/mil/mil/ops/tests/iOS17/test_linear.py b/coremltools/converters/mil/mil/ops/tests/iOS17/test_linear.py new file mode 100644 index 000000000..a020003f6 --- /dev/null +++ b/coremltools/converters/mil/mil/ops/tests/iOS17/test_linear.py @@ -0,0 +1,158 @@ +# Copyright (c) 2023, Apple Inc. All rights reserved. +# +# Use of this source code is governed by a BSD-3-clause license that can be +# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause + +import itertools + +import numpy as np +import pytest + +import coremltools as ct +from coremltools.converters.mil.mil import Builder as mb +from coremltools.converters.mil.mil import types +from coremltools.converters.mil.mil.ops.tests.iOS17 import backends +from coremltools.converters.mil.mil.ops.tests.testing_utils import run_compare_builder +from coremltools.converters.mil.mil.types.type_mapping import numpy_type_to_builtin_type +from coremltools.converters.mil.testing_reqs import compute_units + + +class TestLinear: + @pytest.mark.parametrize( + "compute_unit, backend, x_dtype, weight_bias_dtype", + itertools.product( + compute_units, + backends, + [np.float16, np.float32, np.int32], + [np.float16, np.float32, np.int32], + ), + ) + def test_linear_ios17_mixed_precision(self, compute_unit, backend, x_dtype, weight_bias_dtype): + if x_dtype == np.int32: + pytest.xfail("Linear op doesn't work with int32 input (rdar://111421695)") + + out_channels = 3 + x_shape = np.random.randint(low=1, high=3, size=(3,)) + w_shape = np.array([out_channels, x_shape[-1]]) + b_shape = np.array([out_channels]) + + x_val = np.random.randint(low=0, high=10, size=x_shape).astype(x_dtype) + weight_val = np.random.randint(low=0, high=10, size=w_shape).astype(weight_bias_dtype) + bias_val = np.random.randint(low=0, high=10, size=b_shape).astype(weight_bias_dtype) + + x_builtin_dtype = numpy_type_to_builtin_type(x_dtype) + input_placeholders = { + "x": mb.placeholder(shape=x_val.shape, dtype=x_builtin_dtype), + } + + def build(x): + return mb.linear(x=x, weight=weight_val, bias=bias_val) + + expected_outputs = np.matmul(x_val, np.transpose(weight_val)) + bias_val + run_compare_builder( + build, + input_placeholders, + input_values={"x": x_val}, + expected_output_types=expected_outputs.shape + (x_builtin_dtype,), + expected_outputs=expected_outputs, + compute_unit=compute_unit, + backend=backend, + ) + + +class TestMatMul: + @pytest.mark.parametrize( + "compute_unit, backend, x_dtype, y_dtype", + itertools.product( + compute_units, + backends, + [np.float32, np.float16, np.int32], + [np.float32, np.float16, np.int32], + ), + ) + def test_ios17_mixed_precision(self, compute_unit, backend, x_dtype, y_dtype): + x_val = np.random.randint(low=0, high=10, size=(2, 5)).astype(x_dtype) + y_val = np.random.randint(low=0, high=10, size=(5, 10)).astype(y_dtype) + x_mb_dtype = numpy_type_to_builtin_type(x_dtype) + y_mb_dtype = numpy_type_to_builtin_type(y_dtype) + expected_outputs = 
np.matmul(x_val, y_val) + + def build_x_const(y): + return mb.matmul(x=x_val, y=y, transpose_x=False, transpose_y=False) + + def build_y_const(x): + return mb.matmul(x=x, y=y_val, transpose_x=False, transpose_y=False) + + mlmodel = run_compare_builder( + build_y_const, + input_placeholders={"x": mb.placeholder(shape=x_val.shape, dtype=x_mb_dtype)}, + input_values={"x": x_val}, + expected_output_types=expected_outputs.shape + (x_mb_dtype,), + expected_outputs=expected_outputs, + compute_unit=compute_unit, + backend=backend, + pass_pipeline=ct.PassPipeline.EMPTY, + ) + prog = mlmodel._mil_program + matmul_op = prog["main"].find_ops(op_type="matmul")[0] + # When x is non-const and y is const, the output should have the same dtype as x. + assert types.builtin_to_string(matmul_op.outputs[0].dtype) == types.builtin_to_string( + x_mb_dtype + ) + + mlmodel = run_compare_builder( + build_x_const, + input_placeholders={"y": mb.placeholder(shape=y_val.shape, dtype=y_mb_dtype)}, + input_values={"y": y_val}, + expected_output_types=expected_outputs.shape + (y_mb_dtype,), + expected_outputs=expected_outputs, + compute_unit=compute_unit, + backend=backend, + pass_pipeline=ct.PassPipeline.EMPTY, + ) + prog = mlmodel._mil_program + matmul_op = prog["main"].find_ops(op_type="matmul")[0] + # When x is const and y is non-const, the output should have the same dtype as y. + assert types.builtin_to_string(matmul_op.outputs[0].dtype) == types.builtin_to_string( + y_mb_dtype + ) + + @pytest.mark.parametrize( + "compute_unit, backend", + itertools.product(compute_units, backends), + ) + def test_ios17_invalid_mixed_precision(self, compute_unit, backend): + """When x and y are both const or both non-const, mixed precision is not allowed.""" + x_val = np.random.rand(2, 5).astype(np.float32) + y_val = np.random.randint(low=0, high=10, size=(5, 10)).astype(np.int32) + + def build_both_const(): + return mb.matmul(x=x_val, y=y_val, transpose_x=False, transpose_y=False) + + def build_both_not_const(x, y): + return mb.matmul(x=x, y=y, transpose_x=False, transpose_y=False) + + with pytest.raises( + ValueError, match="when x and y are both const, their dtype need to match" + ): + run_compare_builder( + build_both_const, + input_placeholders={}, + input_values={}, + compute_unit=compute_unit, + backend=backend, + ) + + with pytest.raises( + ValueError, match="when x and y are both non-const, their dtype need to match" + ): + run_compare_builder( + build_both_not_const, + input_placeholders={ + "x": mb.placeholder(shape=x_val.shape, dtype=types.fp32), + "y": mb.placeholder(shape=y_val.shape, dtype=types.int32), + }, + input_values={"x": x_val, "y": y_val}, + compute_unit=compute_unit, + backend=backend, + ) diff --git a/coremltools/converters/mil/mil/ops/tests/iOS17/test_normalization.py b/coremltools/converters/mil/mil/ops/tests/iOS17/test_normalization.py new file mode 100644 index 000000000..28c3c2d4c --- /dev/null +++ b/coremltools/converters/mil/mil/ops/tests/iOS17/test_normalization.py @@ -0,0 +1,176 @@ +# Copyright (c) 2023, Apple Inc. All rights reserved. 
+# +# Use of this source code is governed by a BSD-3-clause license that can be +# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause + +import itertools + +import numpy as np +import pytest + +from coremltools._deps import _HAS_TF_2, _HAS_TORCH, MSG_TF2_NOT_FOUND, MSG_TORCH_NOT_FOUND +from coremltools.converters.mil.mil.ops.tests.iOS14.test_normalization import ( + TestNormalizationBatchNorm as _TestNormalizationBatchNormIos14, +) +from coremltools.converters.mil.mil.ops.tests.iOS14.test_normalization import ( + TestNormalizationInstanceNorm as _TestNormalizationInstanceNormIos14, +) +from coremltools.converters.mil.mil.ops.tests.iOS14.test_normalization import ( + TestNormalizationL2Norm as _TestNormalizationL2NormIos14, +) +from coremltools.converters.mil.mil.ops.tests.iOS14.test_normalization import ( + TestNormalizationLayerNorm as _TestNormalizationLayerNormIos14, +) +from coremltools.converters.mil.mil.ops.tests.iOS14.test_normalization import ( + TestNormalizationLocalResponseNorm as _TestNormalizationLocalResponseNormIos14, +) +from coremltools.converters.mil.mil.ops.tests.iOS17 import backends +from coremltools.converters.mil.testing_reqs import compute_units + + +class TestNormalizationBatchNorm(_TestNormalizationBatchNormIos14): + @pytest.mark.parametrize( + "compute_unit, backend, x_param_dtype", + itertools.product( + compute_units, + backends, + [ + (np.float16, np.float16), + (np.float32, np.float32), + (np.float16, np.float32), + (np.float32, np.float16), + ], + ), + ) + def test_builder_to_backend_smoke(self, compute_unit, backend, x_param_dtype): + super().test_builder_to_backend_smoke(compute_unit, backend, x_param_dtype) + + +class TestNormalizationInstanceNorm(_TestNormalizationInstanceNormIos14): + @pytest.mark.parametrize( + "compute_unit, backend, x_param_dtype", + itertools.product( + compute_units, + backends, + [ + (np.float16, np.float16), + (np.float32, np.float32), + (np.float16, np.float32), + (np.float32, np.float16), + ], + ), + ) + def test_builder_to_backend_smoke(self, compute_unit, backend, x_param_dtype): + super().test_builder_to_backend_smoke(compute_unit, backend, x_param_dtype) + + @pytest.mark.parametrize( + "compute_unit, backend, x_param_dtype", + itertools.product( + compute_units, + backends, + [ + (np.float16, np.float16), + (np.float32, np.float32), + (np.float16, np.float32), + (np.float32, np.float16), + ], + ), + ) + def test_builder_to_backend_smoke_with_gamma_and_beta( + self, compute_unit, backend, x_param_dtype + ): + super().test_builder_to_backend_smoke_with_gamma_and_beta( + compute_unit, backend, x_param_dtype + ) + + @pytest.mark.skipif(not _HAS_TORCH, reason=MSG_TORCH_NOT_FOUND) + @pytest.mark.parametrize( + "rank, compute_unit, backend, epsilon, x_param_dtype", + itertools.product( + [3, 4], + compute_units, + backends, + [1e-3, 1e-5, 1e-10], + [ + (np.float16, np.float16), + (np.float32, np.float32), + (np.float16, np.float32), + (np.float32, np.float16), + ], + ), + ) + def test_builder_to_backend_stress(self, rank, compute_unit, backend, epsilon, x_param_dtype): + super().test_builder_to_backend_stress(rank, compute_unit, backend, epsilon, x_param_dtype) + + +class TestNormalizationL2Norm(_TestNormalizationL2NormIos14): + @pytest.mark.parametrize( + "compute_unit, backend, rank, epsilon, x_param_dtype", + itertools.product( + compute_units, + backends, + [3, 4, 5], + [1e-4, 5.7], + [ + (np.float16, np.float16), + (np.float32, np.float32), + (np.float16, np.float32), + (np.float32, np.float16), 
+ ], + ), + ) + def test_builder_to_backend_stress(self, compute_unit, backend, rank, epsilon, x_param_dtype): + super().test_builder_to_backend_stress(compute_unit, backend, rank, epsilon, x_param_dtype) + + +class TestNormalizationLayerNorm(_TestNormalizationLayerNormIos14): + @pytest.mark.skipif(not _HAS_TF_2, reason=MSG_TF2_NOT_FOUND) + @pytest.mark.parametrize( + "compute_unit, backend, rank_and_axes, epsilon, x_param_dtype", + itertools.product( + compute_units, + backends, + [[3, [0, 2]], [3, [-2]], [4, [0, 1, 3]], [5, [0, 4]], [5, [-5, -4, -3, -2, -1]]], + [0.0001, 0.01], + [ + (np.float16, np.float16), + (np.float32, np.float32), + (np.float16, np.float32), + (np.float32, np.float16), + ], + ), + ) + def test_builder_to_backend_stress_keras( + self, compute_unit, backend, rank_and_axes, epsilon, x_param_dtype + ): + super().test_builder_to_backend_stress_keras( + compute_unit, backend, rank_and_axes, epsilon, x_param_dtype + ) + + +class TestNormalizationLocalResponseNorm(_TestNormalizationLocalResponseNormIos14): + @pytest.mark.skipif(not _HAS_TORCH, reason=MSG_TORCH_NOT_FOUND) + @pytest.mark.parametrize( + "compute_unit, backend, rank, size, alpha, beta, k, x_param_dtype", + itertools.product( + compute_units, + backends, + [rank for rank in range(3, 6)], + [2, 3, 5], + [0.0001, 0.01], + [0.75, 1.0], + [1.0, 2.0], + [ + (np.float16, np.float16), + (np.float32, np.float32), + (np.float16, np.float32), + (np.float32, np.float16), + ], + ), + ) + def test_builder_to_backend_stress( + self, compute_unit, backend, rank, size, alpha, beta, k, x_param_dtype + ): + super().test_builder_to_backend_stress( + compute_unit, backend, rank, size, alpha, beta, k, x_param_dtype + ) diff --git a/coremltools/converters/mil/mil/ops/tests/test_quantization.py b/coremltools/converters/mil/mil/ops/tests/iOS17/test_quantization.py similarity index 87% rename from coremltools/converters/mil/mil/ops/tests/test_quantization.py rename to coremltools/converters/mil/mil/ops/tests/iOS17/test_quantization.py index 5b05b61ee..77b890b7e 100644 --- a/coremltools/converters/mil/mil/ops/tests/test_quantization.py +++ b/coremltools/converters/mil/mil/ops/tests/iOS17/test_quantization.py @@ -9,11 +9,12 @@ import numpy as np import pytest -import coremltools as ct from coremltools._deps import _HAS_TORCH, MSG_TORCH_NOT_FOUND from coremltools.converters.mil.mil import Builder as mb +from coremltools.converters.mil.mil.ops.tests.iOS17 import backends from coremltools.converters.mil.mil.ops.tests.testing_utils import run_compare_builder from coremltools.converters.mil.mil.types import builtin_to_string, numpy_type_to_builtin_type +from coremltools.converters.mil.testing_reqs import BackendConfig, compute_units from coremltools.converters.mil.testing_utils import ssa_fn if _HAS_TORCH: @@ -23,6 +24,13 @@ np.random.seed(1042) +def _set_backend_precision(backend, precision): + return BackendConfig( + backend=backend.backend, + precision=precision, + opset_version=backend.opset_version, + ) + class TestQuantizationBase: @staticmethod def get_random_quantization_params( @@ -163,7 +171,8 @@ def test_builder_eval_no_zero_point(self): ) np.testing.assert_allclose(np.array([[0, 1, 2], [0, 1, 2]]).astype(np.int8), v.val) - def test_smoke_builder_to_backend_quantize_per_tensor(self): + @pytest.mark.parametrize("compute_unit, backend", itertools.product(compute_units, backends)) + def test_smoke_builder_to_backend_quantize_per_tensor(self, compute_unit, backend): def build(x): x = mb.cast(x=x, dtype="fp16") quantized = mb.quantize( 
@@ -190,12 +199,12 @@ def build(x): input_values={"x": x}, expected_output_types=[expected_output_type], expected_outputs=[expected_output], - compute_unit=ct.ComputeUnit.CPU_ONLY, - backend=("mlprogram", "fp16"), - minimum_deployment_target=ct.target.iOS17, + compute_unit=compute_unit, + backend=backend, ) - def test_smoke_builder_to_backend_quantize_per_channel(self): + @pytest.mark.parametrize("compute_unit, backend", itertools.product(compute_units, backends)) + def test_smoke_builder_to_backend_quantize_per_channel(self, compute_unit, backend): def build(x): x = mb.cast(x=x, dtype="fp16") quantized = mb.quantize( @@ -223,15 +232,16 @@ def build(x): input_values={"x": x}, expected_output_types=[expected_output_type], expected_outputs=[expected_output], - compute_unit=ct.ComputeUnit.CPU_ONLY, - backend=("mlprogram", "fp16"), - minimum_deployment_target=ct.target.iOS17, + compute_unit=compute_unit, + backend=backend, ) @pytest.mark.skipif(not _HAS_TORCH, reason=MSG_TORCH_NOT_FOUND) @pytest.mark.parametrize( - "float_dtype, quant_dtype, compute_precision, input_rank, is_zp_present", + "compute_unit, backend, float_dtype, quant_dtype, compute_precision, input_rank, is_zp_present", itertools.product( + compute_units, + backends, (np.float32, np.float16), (np.int8, np.uint8), ("fp32", "fp16"), @@ -240,7 +250,14 @@ def build(x): ), ) def test_stress_builder_to_backend_quantize_all_possibilities( - self, float_dtype, quant_dtype, compute_precision, input_rank, is_zp_present + self, + compute_unit, + backend, + float_dtype, + quant_dtype, + compute_precision, + input_rank, + is_zp_present, ): def build(x): x = mb.cast(x=x, dtype=builtin_to_string(numpy_type_to_builtin_type(float_dtype))) @@ -256,11 +273,7 @@ def build(x): input=quantized, scale=float_dtype(1), ) - # TODO(rdar://98013530): some fp16-output models fail - if float_dtype == np.float16: - return mb.cast(x=dequantized, dtype="fp32") - else: - return dequantized + return dequantized for axis in [None] + [i for i in range(-input_rank, input_rank)]: x_fp, scale, zero_point = self.get_random_quantization_params( @@ -287,9 +300,8 @@ def build(x): input_values, expected_output_types, expected_outputs=expected_outputs, - compute_unit=ct.ComputeUnit.CPU_ONLY, - backend=("mlprogram", compute_precision), - minimum_deployment_target=ct.target.iOS17, + compute_unit=compute_unit, + backend=_set_backend_precision(backend, compute_precision), ) @@ -323,7 +335,8 @@ def test_builder_eval_no_zero_point(self): ) np.testing.assert_allclose(np.float32([[0, 2, 4], [0, 2, 4]]), v.val) - def test_smoke_builder_to_backend_dequantize_per_tensor(self): + @pytest.mark.parametrize("compute_unit, backend", itertools.product(compute_units, backends)) + def test_smoke_builder_to_backend_dequantize_per_tensor(self, compute_unit, backend): def build(x): x = mb.cast(x=x, dtype="fp32") # TODO(rdar://107430678): Replace scale=1 zero_point=0 quantize/dequantize with cast @@ -350,14 +363,14 @@ def build(x): input_values={"x": x}, expected_output_types=[expected_output_type], expected_outputs=[expected_output], - compute_unit=ct.ComputeUnit.CPU_ONLY, - backend=("mlprogram", "fp32"), - minimum_deployment_target=ct.target.iOS17, + compute_unit=compute_unit, + backend=_set_backend_precision(backend, "fp32"), atol=1e-3, rtol=1e-3, ) - def test_smoke_builder_to_backend_dequantize_per_channel(self): + @pytest.mark.parametrize("compute_unit, backend", itertools.product(compute_units, backends)) + def test_smoke_builder_to_backend_dequantize_per_channel(self, compute_unit, 
diff --git a/coremltools/converters/mil/mil/ops/tests/iOS17/test_recurrent.py b/coremltools/converters/mil/mil/ops/tests/iOS17/test_recurrent.py
new file mode 100644
index 000000000..bf77325db
--- /dev/null
+++ b/coremltools/converters/mil/mil/ops/tests/iOS17/test_recurrent.py
@@ -0,0 +1,325 @@
+# Copyright (c) 2023, Apple Inc. All rights reserved.
+#
+# Use of this source code is governed by a BSD-3-clause license that can be
+# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
+
+import itertools
+
+import numpy as np
+import pytest
+
+from coremltools._deps import _HAS_TORCH, MSG_TORCH_NOT_FOUND
+from coremltools.converters.mil.mil.ops.tests.iOS14.test_recurrent import TestGRU as _TestGRU_iOS14
+from coremltools.converters.mil.mil.ops.tests.iOS14.test_recurrent import (
+    TestLSTM as _TestLSTM_iOS14,
+)
+from coremltools.converters.mil.mil.ops.tests.iOS14.test_recurrent import TestRNN as _TestRNN_iOS14
+from coremltools.converters.mil.mil.ops.tests.iOS17 import backends
+from coremltools.converters.mil.testing_reqs import compute_units
+
+
+class TestGRU(_TestGRU_iOS14):
+    @pytest.mark.parametrize(
+        argnames=[
+            "compute_unit",
+            "backend",
+            "seq_len",
+            "batch_size",
+            "input_size",
+            "hidden_size",
+            "has_bias",
+            "output_sequence",
+            "direction",
+            "activation_functions",
+            "symbolic",
+            "dtype",
+        ],
+        argvalues=itertools.product(
+            compute_units,
+            backends,
+            [1, 3],
+            [1],  # [MIL] GRU produces incorrect output (always 0) from the second
+            # batch onwards, so only batch size 1 is tested.
+            [1, 2],
+            [1, 2],
+            [True, False],
+            [True, False],
+            ["forward", "reverse"],
+            [
+                ["tanh", "sigmoid"],
+                ["sigmoid", "tanh"],
+            ],
+            [True, False],
+            [np.float16, np.float32],
+        ),
+    )
+    def test_builder_to_backend_smoke(
+        self,
+        compute_unit,
+        backend,
+        seq_len,
+        batch_size,
+        input_size,
+        hidden_size,
+        has_bias,
+        output_sequence,
+        direction,
+        activation_functions,
+        symbolic,
+        dtype,
+    ):
+        super().test_builder_to_backend_smoke(
+            compute_unit,
+            backend,
+            seq_len,
+            batch_size,
+            input_size,
+            hidden_size,
+            has_bias,
+            output_sequence,
+            direction,
+            activation_functions,
+            symbolic,
+            dtype,
+        )
+
+
+class TestLSTM(_TestLSTM_iOS14):
+    @pytest.mark.parametrize(
+        ",".join(
+            [
+                "compute_unit",
+                "backend",
+                "input_dims",
+                "output_dim",
+                "activation",
+                "inner_activation",
+                "outer_activation",
+                "return_seq",
+                "has_bias",
+                "forget_bias",
+                "has_peephole",
+                "coupled_input_forget",
+                "clip",
+                "dtype",
+            ]
+        ),
+        itertools.product(
+            compute_units,
+            backends,
+            [[8, 32, 32]],
+            [4],
+            ["sigmoid"],
+            ["tanh"],
+            ["relu"],
+            [False, True],
+            [False, True],
+            [False, True],
+            [True, False],
+            [False],
+            [50.0, 0.01],
+            [np.float16, np.float32],
+        ),
+    )
+    def test_numpy_numerical(
+        self,
+        compute_unit,
+        backend,
+        input_dims,
+        output_dim,
+        activation,
+        inner_activation,
+        outer_activation,
+        return_seq,
+        has_bias,
+        forget_bias,
+        has_peephole,
+        coupled_input_forget,
+        clip,
+        dtype,
+    ):
+        super().test_numpy_numerical(
+            compute_unit,
+            backend,
+            input_dims,
+            output_dim,
+            activation,
+            inner_activation,
+            outer_activation,
+            return_seq,
+            has_bias,
+            forget_bias,
+            has_peephole,
+            coupled_input_forget,
+            clip,
+            dtype,
+        )
+
+    @pytest.mark.skipif(not _HAS_TORCH, reason=MSG_TORCH_NOT_FOUND)
+    @pytest.mark.parametrize(
+        argnames=[
+            "compute_unit",
+            "backend",
+            "seq_len",
+            "batch_size",
+            "input_size",
+            "hidden_size",
+            "has_bias",
+            "output_sequence",
+            "direction",
+            "symbolic",
+            "dtype",
+        ],
+        argvalues=itertools.product(
+            compute_units,
+            backends,
+            [1, 8],
+            [1, 32],
+            [1, 64],
+            [1, 16],
+            [True, False],
+            [True, False],
+            ["forward", "reverse"],
+            [True, False],
+            [np.float16, np.float32],
+        ),
+    )
+    def test_builder_to_backend_smoke_unilstm(
+        self,
+        compute_unit,
+        backend,
+        seq_len,
+        batch_size,
+        input_size,
+        hidden_size,
+        has_bias,
+        output_sequence,
+        direction,
+        symbolic,
+        dtype,
+    ):
+        super().test_builder_to_backend_smoke_unilstm(
+            compute_unit,
+            backend,
+            seq_len,
+            batch_size,
+            input_size,
+            hidden_size,
+            has_bias,
+            output_sequence,
+            direction,
+            symbolic,
+            dtype,
+        )
+
+    @pytest.mark.skipif(not _HAS_TORCH, reason=MSG_TORCH_NOT_FOUND)
+    @pytest.mark.parametrize(
+        argnames=[
+            "compute_unit",
+            "backend",
+            "seq_len",
+            "batch_size",
+            "input_size",
+            "hidden_size",
+            "has_bias",
+            "output_sequence",
+            "symbolic",
+            "dtype",
+        ],
+        argvalues=itertools.product(
+            compute_units,
+            backends,
+            [1, 8],
+            [1, 32],
+            [1, 64],
+            [2, 16],
+            [True, False],
+            [True, False],
+            [True, False],
+            [np.float16, np.float32],
+        ),
+    )
+    def test_builder_to_backend_smoke_bidirlstm(
+        self,
+        compute_unit,
+        backend,
+        seq_len,
+        batch_size,
+        input_size,
+        hidden_size,
+        has_bias,
+        output_sequence,
+        symbolic,
+        dtype,
+    ):
+        super().test_builder_to_backend_smoke_bidirlstm(
+            compute_unit,
+            backend,
+            seq_len,
+            batch_size,
+            input_size,
+            hidden_size,
+            has_bias,
+            output_sequence,
+            symbolic,
+            dtype,
+        )
+
+
+class TestRNN(_TestRNN_iOS14):
+    @pytest.mark.skipif(not _HAS_TORCH, reason=MSG_TORCH_NOT_FOUND)
+    @pytest.mark.parametrize(
+        argnames=[
+            "compute_unit",
+            "backend",
+            "seq_len",
+            "batch_size",
+            "input_size",
+            "hidden_size",
+            "has_bias",
+            "output_sequence",
+            "direction",
+            "symbolic",
+            "dtype",
+        ],
+        argvalues=itertools.product(
+            compute_units,
+            backends,
+            [2, 8],
+            [1, 32],
+            [1, 64],
+            [1, 16],
+            [True, False],
+            [True, False],
+            ["forward", "reverse"],
+            [True, False],
+            [np.float16, np.float32],
+        ),
+    )
+    def test_builder_to_backend_smoke(
+        self,
+        compute_unit,
+        backend,
+        seq_len,
+        batch_size,
+        input_size,
+        hidden_size,
+        has_bias,
+        output_sequence,
+        direction,
+        symbolic,
+        dtype,
+    ):
+        super().test_builder_to_backend_smoke(
+            compute_unit,
+            backend,
+            seq_len,
+            batch_size,
+            input_size,
+            hidden_size,
+            has_bias,
+            output_sequence,
+            direction,
+            symbolic,
+            dtype,
+        )
diff --git a/coremltools/converters/mil/mil/ops/tests/iOS17/test_reduction.py b/coremltools/converters/mil/mil/ops/tests/iOS17/test_reduction.py
new file mode 100644
index 000000000..19c1e5c4f
--- /dev/null
+++ b/coremltools/converters/mil/mil/ops/tests/iOS17/test_reduction.py
@@ -0,0 +1,63 @@
+# Copyright (c) 2023, Apple Inc. All rights reserved.
+#
+# Use of this source code is governed by a BSD-3-clause license that can be
+# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
+
+import itertools
+
+import numpy as np
+import pytest
+
+from coremltools.converters.mil.mil import Builder as mb
+from coremltools.converters.mil.mil import types
+from coremltools.converters.mil.mil.ops.tests.iOS17 import backends
+from coremltools.converters.mil.mil.ops.tests.testing_utils import run_compare_builder
+from coremltools.converters.mil.testing_reqs import compute_units
+
+
+class TestReduction:
+    @pytest.mark.parametrize(
+        "compute_unit, backend, op_name, output_dtype",
+        itertools.product(
+            compute_units, backends, ["reduce_argmax", "reduce_argmin"], ["int32", "uint16", None]
+        ),
+    )
+    def test_reduce_arg_ios17_output_dtype(self, compute_unit, backend, op_name, output_dtype):
+        def build(x):
+            return getattr(mb, op_name)(x=x, axis=1, keep_dims=False, output_dtype=output_dtype)
+
+        val = np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]], dtype=np.float32)
+        input_placeholders = {"x": mb.placeholder(shape=val.shape)}
+        input_values = {"x": val}
+        output_np_type = np.uint16 if output_dtype == "uint16" else np.int32
+        output_type = types.uint16 if output_dtype == "uint16" else types.int32
+        expected_output_types = (2, output_type)
+        expected_outputs = np.array(
+            [2, 2] if op_name == "reduce_argmax" else [0, 0], dtype=output_np_type
+        )
+
+        run_compare_builder(
+            build,
+            input_placeholders,
+            input_values,
+            expected_output_types,
+            expected_outputs,
+            compute_unit=compute_unit,
+            backend=backend,
+        )
+
+    @pytest.mark.parametrize(
+        "backend, op_name",
+        itertools.product(
+            backends,
+            ["reduce_argmax", "reduce_argmin"],
+        ),
+    )
+    def test_reduce_arg_ios17_output_dtype_invalid(self, backend, op_name):
+        x = np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]], dtype=np.float32)
+
+        def prog():
+            return getattr(mb, op_name)(x=x, axis=1, keep_dims=False, output_dtype="dummy")
+
+        with pytest.raises(ValueError, match='Invalid "output_dtype" dummy'):
+            mb.program(input_specs=[], opset_version=backend.opset_version)(prog)
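The output_dtype argument covered by TestReduction above can be exercised directly on a builder program. A minimal sketch (the input shape and axis are illustrative):

    import coremltools as ct
    from coremltools.converters.mil import Builder as mb

    @mb.program(input_specs=[mb.TensorSpec(shape=(2, 3))], opset_version=ct.target.iOS17)
    def prog(x):
        # iOS17 reduce_argmax/reduce_argmin accept output_dtype "int32" or "uint16";
        # leaving it unset defaults the returned indices to int32.
        return mb.reduce_argmax(x=x, axis=1, keep_dims=False, output_dtype="uint16")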
diff --git a/coremltools/converters/mil/mil/ops/tests/iOS17/test_scatter_gather.py b/coremltools/converters/mil/mil/ops/tests/iOS17/test_scatter_gather.py
new file mode 100644
index 000000000..7199c8a2c
--- /dev/null
+++ b/coremltools/converters/mil/mil/ops/tests/iOS17/test_scatter_gather.py
@@ -0,0 +1,358 @@
+# Copyright (c) 2023, Apple Inc. All rights reserved.
+#
+# Use of this source code is governed by a BSD-3-clause license that can be
+# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
+
+import itertools
+
+import numpy as np
+import pytest
+
+from coremltools.converters.mil.mil import Builder as mb
+from coremltools.converters.mil.mil import types
+from coremltools.converters.mil.mil.ops.tests.iOS14.test_scatter_gather import (
+    TestGatherAlongAxis as _TestGatherAlongAxis_iOS14,
+)
+from coremltools.converters.mil.mil.ops.tests.iOS14.test_scatter_gather import (
+    TestScatterAlongAxis as _TestScatterAlongAxis_iOS14,
+)
+from coremltools.converters.mil.mil.ops.tests.iOS17 import backends
+from coremltools.converters.mil.mil.ops.tests.testing_utils import run_compare_builder
+from coremltools.converters.mil.testing_reqs import compute_units
+
+
+class TestScatter:
+    @pytest.mark.parametrize(
+        "compute_unit, backend, indices_val, validate_indices, dynamic",
+        itertools.product(
+            compute_units,
+            backends,
+            [[-1, 0], [10, 0]],  # One with a negative index, one with an out-of-range index.
+            [True, False],
+            [True, False],
+        ),
+    )
+    def test_ios17_invalid_indices(
+        self, compute_unit, backend, indices_val, validate_indices, dynamic
+    ):
+        def build_static(data, updates):
+            return (
+                mb.scatter(
+                    data=data,
+                    indices=np.array(indices_val, dtype=np.int32),
+                    updates=updates,
+                    validate_indices=validate_indices,
+                ),
+            )
+
+        def build_dynamic(data, indices, updates):
+            return (
+                mb.scatter(
+                    data=data, indices=indices, updates=updates, validate_indices=validate_indices
+                ),
+            )
+
+        data = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32)
+        updates = np.array([[5, 6, 7], [8, 9, 10]], dtype=np.float32)
+        input_placeholders = {
+            "data": mb.placeholder(shape=data.shape),
+            "updates": mb.placeholder(shape=updates.shape),
+        }
+        input_values = {"data": data, "updates": updates}
+        if dynamic:
+            indices = np.array(indices_val, dtype=np.int32)
+            input_placeholders["indices"] = mb.placeholder(shape=indices.shape, dtype=types.int32)
+            input_values["indices"] = indices
+
+        if not validate_indices:
+            # When validate_indices is not set, behavior for negative or out-of-bound indices is undefined.
+            expected_error = AssertionError
+            expected_error_msg = "Not equal"
+        elif dynamic:
+            # In PyMIL's validation, `validate_indices` only validates indices whose values are
+            # known during op insertion, so it will not error out at the PyMIL layer; instead, it
+            # relies on the backend to do the validation after compilation.
+            expected_error = RuntimeError
+            expected_error_msg = (
+                "Error computing NN outputs",
+                "Unable to compute the prediction using a neural network model",
+            )
+        else:
+            # The negative or out-of-bound indices will error out when validate_indices is set.
+            expected_error = IndexError
+            expected_error_msg = "Indices is out of bounds"
+
+        with pytest.raises(expected_error) as excinfo:
+            run_compare_builder(
+                build_dynamic if dynamic else build_static,
+                input_placeholders,
+                input_values,
+                expected_output_types=(2, 3, types.fp32),
+                expected_outputs=np.array([[9, 11, 13], [9, 11, 13]], dtype=np.float32),
+                compute_unit=compute_unit,
+                backend=backend,
+            )
+        if not isinstance(expected_error_msg, tuple):
+            expected_error_msg = (expected_error_msg,)
+        assert any([err in str(excinfo.value) for err in expected_error_msg])
+
+
+class TestScatterAlongAxis:
+    @pytest.mark.parametrize(
+        "compute_unit, backend, rank_axis",
+        itertools.product(
+            compute_units,
+            backends,
+            [(rank, axis) for rank in range(1, 5) for axis in range(-rank, rank)],
+        ),
+    )
+    def test_builder_to_backend_programmatic(self, compute_unit, backend, rank_axis):
+        _TestScatterAlongAxis_iOS14._test_builder_to_backend_programmatic(
+            compute_unit, backend, rank_axis, force_non_negative_indices=True
+        )
+
+    @pytest.mark.parametrize(
+        "compute_unit, backend, indices_val, dynamic",
+        itertools.product(
+            compute_units,
+            backends,
+            [[[-1, 0, 1], [1, 1, 0]], [[1, 10, 1], [1, 1, 0]]],
+            [True, False],
+        ),
+    )
+    def test_ios17_invalid_indices(self, compute_unit, backend, indices_val, dynamic):
+        def build_static(data, updates):
+            return (
+                mb.scatter_along_axis(
+                    data=data,
+                    indices=np.array(indices_val, dtype=np.int32),
+                    updates=updates,
+                    validate_indices=True,
+                ),
+            )
+
+        def build_dynamic(data, indices, updates):
+            return mb.scatter_along_axis(
+                data=data,
+                indices=indices,
+                updates=updates,
+                axis=0,
+                mode="update",
+                validate_indices=True,
+            )
+
+        data = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32)
+        updates = np.array([[5, 6, 7], [8, 9, 10]], dtype=np.float32)
+        input_placeholders = {
+            "data": mb.placeholder(shape=data.shape),
+            "updates": mb.placeholder(shape=updates.shape),
+        }
+        input_values = {"data": data, "updates": updates}
+        if dynamic:
+            indices = np.array(indices_val, dtype=np.int32)
+            input_placeholders["indices"] = mb.placeholder(shape=indices.shape, dtype=types.int32)
+            input_values["indices"] = indices
+
+        if dynamic:
+            expected_error = RuntimeError
+            expected_error_msg = (
+                "Error computing NN outputs",
+                "Unable to compute the prediction using a neural network model",
+            )
+        else:
+            # The negative or out-of-bound indices will error out when validate_indices is set.
+            expected_error = IndexError
+            expected_error_msg = "Indices is out of bounds"
+
+        with pytest.raises(expected_error) as excinfo:
+            run_compare_builder(
+                build_dynamic if dynamic else build_static,
+                input_placeholders,
+                input_values,
+                expected_output_types=(2, 3, types.fp32),
+                expected_outputs=np.array([[1, 6, 10], [8, 9, 7]], dtype=np.float32),
+                compute_unit=compute_unit,
+                backend=backend,
+            )
+        if not isinstance(expected_error_msg, tuple):
+            expected_error_msg = (expected_error_msg,)
+        assert any([err in str(excinfo.value) for err in expected_error_msg])
+
+
+class TestScatterNd:
+    @pytest.mark.parametrize(
+        "compute_unit, backend, indices_val, dynamic",
+        itertools.product(
+            compute_units, backends, [[[1, 0], [0, -1]], [[1, 0], [0, 3]]], [True, False]
+        ),
+    )
+    def test_ios17_invalid_indices(self, compute_unit, backend, indices_val, dynamic):
+        def build_static(data, updates):
+            return (
+                mb.scatter_nd(
+                    data=data,
+                    indices=np.array(indices_val, dtype=np.int32),
+                    updates=updates,
+                    validate_indices=True,
+                ),
+            )
+
+        def build_dynamic(data, indices, updates):
+            return (
+                mb.scatter_nd(data=data, indices=indices, updates=updates, validate_indices=True),
+            )
+
+        data = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32)
+        updates = np.array([5, 10], dtype=np.float32)
+        input_placeholders = {
+            "data": mb.placeholder(shape=data.shape),
+            "updates": mb.placeholder(shape=updates.shape),
+        }
+        input_values = {"data": data, "updates": updates}
+        if dynamic:
+            indices = np.array(indices_val, dtype=np.int32)
+            input_placeholders["indices"] = mb.placeholder(shape=indices.shape, dtype=types.int32)
+            input_values["indices"] = indices
+
+        if dynamic:
+            expected_error = RuntimeError
+            expected_error_msg = (
+                "Error computing NN outputs",
+                "Unable to compute the prediction using a neural network model",
+            )
+        else:
+            # The negative or out-of-bound indices will error out when validate_indices is set.
+            expected_error = IndexError
+            expected_error_msg = "Indices is out of bounds"
+
+        with pytest.raises(expected_error) as excinfo:
+            run_compare_builder(
+                build_dynamic if dynamic else build_static,
+                input_placeholders,
+                input_values,
+                expected_output_types=(2, 3, types.fp32),
+                expected_outputs=np.array([[1, 2, 13], [9, 5, 6]], dtype=np.float32),
+                compute_unit=compute_unit,
+                backend=backend,
+            )
+        if not isinstance(expected_error_msg, tuple):
+            expected_error_msg = (expected_error_msg,)
+        assert any([err in str(excinfo.value) for err in expected_error_msg])
+
+
+class TestGather:
+    @pytest.mark.parametrize(
+        "backend, indices_val, validate_indices",
+        itertools.product(backends, [[-1, 0], [0, 3]], [True, False]),
+    )
+    def test_builder_invalid_indices_iOS17(self, backend, indices_val, validate_indices):
+        def prog(x):
+            params = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32)
+            indices = np.array(indices_val, dtype=np.int32)
+            res = mb.gather(x=params, indices=indices, axis=-1, validate_indices=validate_indices)
+            return res
+
+        if validate_indices:
+            with pytest.raises(IndexError, match="Indices is out of bounds for `gather` node"):
+                mb.program(
+                    input_specs=[mb.TensorSpec(shape=(1,), dtype=types.fp32)],
+                    opset_version=backend.opset_version,
+                )(prog)
+        elif any([idx > 2 for idx in indices_val]):
+            # If the indices are not validated during type inference for iOS17, the `gather` op's
+            # value inference will raise an error for an out-of-bound index.
+            with pytest.raises(IndexError, match="index 3 is out of bounds for axis 1 with size 3"):
+                mb.program(
+                    input_specs=[mb.TensorSpec(shape=(1,), dtype=types.fp32)],
+                    opset_version=backend.opset_version,
+                )(prog)
+        else:
+            mb.program(
+                input_specs=[mb.TensorSpec(shape=(1,), dtype=types.fp32)],
+                opset_version=backend.opset_version,
+            )(prog)
+
+
+class TestGatherAlongAxis:
+    @pytest.mark.parametrize(
+        "compute_unit, backend, rank_axis",
+        itertools.product(
+            compute_units,
+            backends,
+            [(rank, axis) for rank in range(1, 5) for axis in range(-rank, rank)],
+        ),
+    )
+    def test_builder_to_backend_programmatic(self, compute_unit, backend, rank_axis):
+        _TestGatherAlongAxis_iOS14._test_builder_to_backend_programmatic(
+            compute_unit, backend, rank_axis, True
+        )
+
+    @pytest.mark.parametrize(
+        "backend, indices_val, validate_indices",
+        itertools.product(
+            backends,
+            [[[1, 0, -1], [0, 0, 1]], [[1, 0, 1], [0, 0, 2]]],
+            [True, False],
+        ),
+    )
+    def test_builder_invalid_indices(self, backend, indices_val, validate_indices):
+        def prog(x):
+            params = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32)
+            indices = np.array(indices_val, dtype=np.int32)
+            res = mb.gather_along_axis(
+                x=params, indices=indices, axis=0, validate_indices=validate_indices
+            )
+            return res
+
+        if validate_indices:
+            with pytest.raises(
+                IndexError, match="Indices is out of bounds for `gather_along_axis` node"
+            ):
+                mb.program(
+                    input_specs=[mb.TensorSpec(shape=(1,), dtype=types.fp32)],
+                    opset_version=backend.opset_version,
+                )(prog)
+        elif any([idx > 1 for sub_indices in indices_val for idx in sub_indices]):
+            # If the indices are not validated during type inference for iOS17, the `gather` op's
+            # value inference will raise an error for an out-of-bound index.
+            with pytest.raises(IndexError, match="index 2 is out of bounds for axis 0 with size 2"):
+                mb.program(
+                    input_specs=[mb.TensorSpec(shape=(1,), dtype=types.fp32)],
+                    opset_version=backend.opset_version,
+                )(prog)
+        else:
+            mb.program(
+                input_specs=[mb.TensorSpec(shape=(1,), dtype=types.fp32)],
+                opset_version=backend.opset_version,
+            )(prog)
+
+
+class TestGatherNd:
+    @pytest.mark.parametrize(
+        "backend, indices_val, validate_indices",
+        itertools.product(
+            backends,
+            [[[-1], [2]], [[1], [3]]],
+            [True, False],
+        ),
+    )
+    def test_builder_invalid_indices(self, backend, indices_val, validate_indices):
+        def prog(x):
+            params = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32)
+            indices = np.array(indices_val, dtype=np.int32)
+            res = mb.gather_nd(
+                x=params, indices=indices, batch_dims=1, validate_indices=validate_indices
+            )
+            return res
+
+        if validate_indices:
+            with pytest.raises(IndexError, match="Indices is out of bounds for `gather_nd` node"):
+                mb.program(
+                    input_specs=[mb.TensorSpec(shape=(1,), dtype=types.fp32)],
+                    opset_version=backend.opset_version,
+                )(prog)
+        else:
+            mb.program(
+                input_specs=[mb.TensorSpec(shape=(1,), dtype=types.fp32)],
+                opset_version=backend.opset_version,
+            )(prog)
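The validate_indices behavior the scatter/gather tests above rely on can be seen in isolation. A minimal sketch mirroring the static (compile-time constant indices) case; the data and index values are illustrative:

    import numpy as np
    import coremltools as ct
    import pytest
    from coremltools.converters.mil import Builder as mb
    from coremltools.converters.mil.mil import types

    def prog(x):
        # Index 10 is out of bounds for axis 0 (size 2). Because the indices are
        # constant, validate_indices=True raises IndexError at op-insertion time.
        return mb.scatter(
            data=np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32),
            indices=np.array([10, 0], dtype=np.int32),
            updates=np.array([[5, 6, 7], [8, 9, 10]], dtype=np.float32),
            validate_indices=True,
        )

    with pytest.raises(IndexError, match="Indices is out of bounds"):
        mb.program(
            input_specs=[mb.TensorSpec(shape=(1,), dtype=types.fp32)],
            opset_version=ct.target.iOS17,
        )(prog)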
diff --git a/coremltools/converters/mil/mil/ops/tests/iOS17/test_tensor_operation.py b/coremltools/converters/mil/mil/ops/tests/iOS17/test_tensor_operation.py
new file mode 100644
index 000000000..d3db09ece
--- /dev/null
+++ b/coremltools/converters/mil/mil/ops/tests/iOS17/test_tensor_operation.py
@@ -0,0 +1,141 @@
+# Copyright (c) 2023, Apple Inc. All rights reserved.
+#
+# Use of this source code is governed by a BSD-3-clause license that can be
+# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
+
+import itertools
+
+import numpy as np
+import pytest
+
+from coremltools.converters.mil.mil import Builder as mb
+from coremltools.converters.mil.mil import types
+from coremltools.converters.mil.mil.ops.tests.iOS17 import backends
+from coremltools.converters.mil.mil.ops.tests.testing_utils import run_compare_builder
+from coremltools.converters.mil.testing_reqs import compute_units
+
+
+class TestTopK:
+    @pytest.mark.parametrize(
+        "compute_unit, backend, x_dtype, k_dtype",
+        itertools.product(
+            compute_units,
+            backends,
+            [np.float16, np.float32, np.int8, np.int16, np.int32, np.uint8, np.uint16],
+            [np.int8, np.int16, np.int32],
+        ),
+    )
+    def test_ios17_different_dtypes(self, compute_unit, backend, x_dtype, k_dtype):
+        def build(x):
+            return mb.topk(x=x, k=k_dtype(2), axis=1)
+
+        val = np.array([[2, 3, 1], [5, 4, 6]], dtype=x_dtype)
+        x_mb_dtype = types.numpy_type_to_builtin_type(x_dtype)
+        input_placeholders = {"x": mb.placeholder(shape=val.shape, dtype=x_mb_dtype)}
+        input_values = {"x": val}
+        # As int16 is not among the CoreML I/O supported dtypes, it will be cast to int32.
+        expected_output_types = [(2, 2, x_mb_dtype), (2, 2, types.int32)]
+        expected_outputs = [
+            np.array([[3, 2], [6, 5]], dtype=x_dtype),
+            np.array([[1, 0], [2, 0]], dtype=np.int32),
+        ]
+
+        mlmodel = run_compare_builder(
+            build,
+            input_placeholders,
+            input_values,
+            expected_output_types,
+            expected_outputs,
+            compute_unit=compute_unit,
+            backend=backend,
+        )
+        prog = mlmodel._mil_program
+        topk_op = prog["main"].find_ops(op_type="topk")[0]
+        expected_x_dtype = x_mb_dtype
+        if backend.precision == "fp16" and types.is_float(x_mb_dtype):
+            expected_x_dtype = types.fp16
+        assert types.builtin_to_string(topk_op.x.dtype) == types.builtin_to_string(expected_x_dtype)
+
+    @pytest.mark.parametrize(
+        "compute_unit, backend, output_indices_dtype",
+        itertools.product(
+            compute_units,
+            backends,
+            ["int32", "uint16", None],
+        ),
+    )
+    def test_ios17_output_indices_dtype(self, compute_unit, backend, output_indices_dtype):
+        def build(x):
+            return mb.topk(x=x, k=2, axis=1, output_indices_dtype=output_indices_dtype)
+
+        val = np.array([[2, 3, 1], [5, 4, 6]], dtype=np.int32)
+        input_placeholders = {"x": mb.placeholder(shape=val.shape, dtype=types.int32)}
+        input_values = {"x": val}
+        expected_output_types = [(2, 2, types.int32), (2, 2, types.int32)]
+        expected_outputs = [
+            np.array([[3, 2], [6, 5]], dtype=np.int32),
+            np.array([[1, 0], [2, 0]], dtype=np.int32),
+        ]
+
+        mlmodel = run_compare_builder(
+            build,
+            input_placeholders,
+            input_values,
+            expected_output_types,
+            expected_outputs,
+            compute_unit=compute_unit,
+            backend=backend,
+        )
+        prog = mlmodel._mil_program
+        topk_op = prog["main"].find_ops(op_type="topk")[0]
+
+        # If output_indices_dtype is not set, the output should be of type int32.
+        expected_output_indices_dtype = "int32"
+        if output_indices_dtype is not None:
+            expected_output_indices_dtype = output_indices_dtype
+
+        assert types.builtin_to_string(topk_op.outputs[1].dtype) == expected_output_indices_dtype
+
+    @pytest.mark.parametrize(
+        "compute_unit, backend",
+        itertools.product(
+            compute_units,
+            backends,
+        ),
+    )
+    def test_ios17_invalid_output_indices_dtype(self, compute_unit, backend):
+        def build(x):
+            return mb.topk(x=x, k=2, axis=1, output_indices_dtype="dummy")
+
+        val = np.array([[2, 3, 1], [5, 4, 6]], dtype=np.int32)
+        with pytest.raises(ValueError, match="invalid output_indices_dtype"):
+            run_compare_builder(
+                build,
+                input_placeholders={"x": mb.placeholder(shape=val.shape, dtype=types.int32)},
+                input_values={"x": val},
+                compute_unit=compute_unit,
+                backend=backend,
+            )
+
+    @pytest.mark.parametrize(
+        "compute_unit, backend",
+        itertools.product(
+            compute_units,
+            backends,
+        ),
+    )
+    def test_ios17_redundant_output_indices_dtype_early_error_out(self, compute_unit, backend):
+        def build(x):
+            return mb.topk(x=x, k=2, axis=1, return_indices=False, output_indices_dtype="int32")
+
+        val = np.array([[2, 3, 1], [5, 4, 6]], dtype=np.int32)
+        with pytest.raises(
+            ValueError, match='"output_indices_dtype" can only be set when "return_indices=True"'
+        ):
+            run_compare_builder(
+                build,
+                input_placeholders={"x": mb.placeholder(shape=val.shape, dtype=types.int32)},
+                input_values={"x": val},
+                compute_unit=compute_unit,
+                backend=backend,
+            )
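The two topk validations above (output_indices_dtype and its interaction with return_indices) reduce to a single builder call. A minimal sketch with illustrative values:

    import coremltools as ct
    from coremltools.converters.mil import Builder as mb

    @mb.program(input_specs=[mb.TensorSpec(shape=(2, 3))], opset_version=ct.target.iOS17)
    def prog(x):
        # output_indices_dtype ("int32" or "uint16") is only legal together with
        # return_indices=True; otherwise the op errors out early.
        values, indices = mb.topk(
            x=x, k=2, axis=1, return_indices=True, output_indices_dtype="uint16"
        )
        return values, indices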
diff --git a/coremltools/converters/mil/mil/ops/tests/iOS17/test_tensor_transformation.py b/coremltools/converters/mil/mil/ops/tests/iOS17/test_tensor_transformation.py
new file mode 100644
index 000000000..74f167bc8
--- /dev/null
+++ b/coremltools/converters/mil/mil/ops/tests/iOS17/test_tensor_transformation.py
@@ -0,0 +1,381 @@
+# Copyright (c) 2023, Apple Inc. All rights reserved.
+#
+# Use of this source code is governed by a BSD-3-clause license that can be
+# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
+
+import itertools
+
+import numpy as np
+import pytest
+
+from coremltools.converters.mil.mil import Builder as mb
+from coremltools.converters.mil.mil import types
+from coremltools.converters.mil.mil.ops.tests.iOS14.test_tensor_transformation import (
+    TestSliceByIndex as _TestSliceByIndexIos14,
+)
+from coremltools.converters.mil.mil.ops.tests.iOS14.test_tensor_transformation import (
+    TestSliceBySize as _TestSliceBySizeIos14,
+)
+from coremltools.converters.mil.mil.ops.tests.iOS16.test_tensor_transformation import (
+    TestReshapeLike as _TestReshapeLike_iOS16,
+)
+from coremltools.converters.mil.mil.ops.tests.iOS17 import backends
+from coremltools.converters.mil.mil.ops.tests.testing_utils import run_compare_builder
+from coremltools.converters.mil.mil.types.type_mapping import numpy_type_to_builtin_type
+from coremltools.converters.mil.testing_reqs import compute_units
+
+
+class TestReshape:
+    @pytest.mark.parametrize(
+        "compute_unit, backend",
+        itertools.product(
+            compute_units,
+            backends,
+        ),
+    )
+    def test_reshape_with_zero_different_len_iOS17(self, compute_unit, backend):
+        t = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32)
+        input_placeholders = {"x": mb.placeholder(shape=t.shape)}
+        input_values = {"x": t}
+
+        def build(x):
+            return [mb.reshape(x=x, shape=[1, 0, -1, 0])]
+
+        # In iOS17, `reshape` accepts a 0 in `shape` even when the length of `shape`
+        # differs from the rank of `x`.
+        expected_output_types = [(1, 1, 2, 3, types.fp32)]
+        expected_outputs = [np.array([[[[1, 2, 3], [4, 5, 6]]]], dtype=np.float32)]
+        run_compare_builder(
+            build,
+            input_placeholders,
+            input_values,
+            expected_output_types,
+            expected_outputs,
+            compute_unit=compute_unit,
+            backend=backend,
+        )
+
+    @pytest.mark.parametrize(
+        "compute_unit, backend",
+        itertools.product(
+            compute_units,
+            backends,
+        ),
+    )
+    def test_reshape_invalid_with_zero(self, compute_unit, backend):
+        t = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32)
+        input_placeholders = {"x": mb.placeholder(shape=t.shape)}
+        input_values = {"x": t}
+
+        def build(x):
+            return [mb.reshape(x=x, shape=[4, 0, -1, 0])]
+
+        with pytest.raises(ValueError, match="Invalid target shape in `reshape` op"):
+            run_compare_builder(
+                build,
+                input_placeholders,
+                input_values,
+                compute_unit=compute_unit,
+                backend=backend,
+            )
+
+    @pytest.mark.parametrize(
+        "compute_unit, backend, x_dtype, shape_dtype",
+        itertools.product(
+            compute_units,
+            backends,
+            [np.int8, np.uint8, np.int16, np.uint16, np.int32, np.float16, np.float32],
+            [np.int8, np.int16, np.int32],
+        ),
+    )
+    def test_reshape_ios17_different_data_types(self, compute_unit, backend, x_dtype, shape_dtype):
+        x_val = np.array([[1, 2, 3], [4, 5, 6]], dtype=x_dtype)
+        target_shape = np.array([1, 6], dtype=shape_dtype)
+        x_builtin_dtype = numpy_type_to_builtin_type(x_dtype)
+        input_placeholders = {"x": mb.placeholder(shape=x_val.shape, dtype=x_builtin_dtype)}
+        input_values = {"x": x_val}
+
+        def build(x):
+            return mb.reshape(x=x, shape=target_shape)
+
+        expected_output_types = (1, 6, x_builtin_dtype)
+        expected_outputs = np.array([[1, 2, 3, 4, 5, 6]], dtype=x_dtype)
+        run_compare_builder(
+            build,
+            input_placeholders,
+            input_values,
+            expected_output_types,
+            expected_outputs,
+            compute_unit=compute_unit,
+            backend=backend,
+        )
+
+
+class TestReshapeLike(_TestReshapeLike_iOS16):
+    @pytest.mark.parametrize(
+        "compute_unit, backend, InputShape_RefShapes_Begins_Ends_EndMasks, x_dtype, ref_dtype",
+        itertools.product(
+            compute_units,
+            backends,
+            [
+                [(4, 3), ((2, 2, 3), (1, 3)), (0, 1), (2, 2), (False, False)],
+                [(32,), ((1, 2, 2, 2), (3, 2, 2)), (1, 1), (0, 0), (True, True)],
+                [(72, 1), ((1, 2, 3, 4, 1), (3,)), (1, 0), (0, 1), (True, False)],
+            ],
+            [np.int8, np.uint8, np.int16, np.uint16, np.int32, np.float16, np.float32, bool],
+            [np.int8, np.uint8, np.int16, np.uint16, np.int32, np.float16, np.float32, bool],
+        ),
+    )
+    def test_builder_to_backend_smoke(
+        self,
+        compute_unit,
+        backend,
+        InputShape_RefShapes_Begins_Ends_EndMasks,
+        x_dtype,
+        ref_dtype,
+    ):
+        super().test_builder_to_backend_smoke(
+            compute_unit, backend, InputShape_RefShapes_Begins_Ends_EndMasks, x_dtype, ref_dtype
+        )
+
+
+class TestExpandDims:
+    @pytest.mark.parametrize(
+        "compute_unit, backend, x_dtype",
+        itertools.product(
+            compute_units,
+            backends,
+            [np.int8, np.uint8, np.int16, np.uint16, np.int32, np.float16, np.float32],
+        ),
+    )
+    def test_expand_dims_different_data_types(self, compute_unit, backend, x_dtype):
+        axis = 1
+        x_val = np.random.randint(low=2, high=6, size=(2, 3, 4)).astype(x_dtype)
+        x_builtin_dtype = numpy_type_to_builtin_type(x_dtype)
+        input_placeholders = {"x": mb.placeholder(shape=x_val.shape, dtype=x_builtin_dtype)}
+        input_values = {"x": x_val}
+
+        def build(x):
+            return mb.expand_dims(x=x, axes=[axis])
+
+        x_shape = list(x_val.shape)
+        out_shape = x_shape[:axis] + [1] + x_shape[axis:]
+        expected_output_types = tuple(out_shape[:]) + (x_builtin_dtype,)
+        expected_outputs = np.expand_dims(input_values["x"], axis)
+        run_compare_builder(
+            build,
+            input_placeholders,
+            input_values,
+            expected_output_types,
+            expected_outputs,
+            compute_unit=compute_unit,
+            backend=backend,
+        )
+
+
+class TestReverse:
+    @pytest.mark.parametrize(
+        "compute_unit, backend, x_dtype",
+        itertools.product(
+            compute_units,
+            backends,
+            [np.int8, np.uint8, np.int16, np.uint16, np.int32, np.float16, np.float32],
+        ),
+    )
+    def test_reverse_different_data_types(self, compute_unit, backend, x_dtype):
+        def build(x):
+            return [mb.reverse(x=x), mb.reverse(x=x, axes=[0])]
+
+        x_val = np.array([[1, 2, 3], [4, 5, 6]], dtype=x_dtype)
+        x_builtin_dtype = numpy_type_to_builtin_type(x_dtype)
+        input_placeholders = {"x": mb.placeholder(shape=x_val.shape, dtype=x_builtin_dtype)}
+        input_values = {"x": x_val}
+        expected_output_types = [(2, 3, x_builtin_dtype), (2, 3, x_builtin_dtype)]
+        expected_outputs = [
+            np.array([[6, 5, 4], [3, 2, 1]], dtype=x_dtype),
+            np.array([[4, 5, 6], [1, 2, 3]], dtype=x_dtype),
+        ]
+
+        run_compare_builder(
+            build,
+            input_placeholders,
+            input_values,
+            expected_output_types,
+            expected_outputs,
+            compute_unit=compute_unit,
+            backend=backend,
+        )
+
+
+class TestReverseSequence:
+    @pytest.mark.parametrize(
+        "compute_unit, backend, x_dtype, length_dtype",
+        itertools.product(
+            compute_units,
+            backends,
+            [np.int8, np.uint8, np.int16, np.uint16, np.int32, np.float16, np.float32],
+            [np.int8, np.int16, np.int32],
+        ),
+    )
+    def test_reverse_sequence_different_data_types(
+        self, compute_unit, backend, x_dtype, length_dtype
+    ):
+        def build(x, length):
+            return mb.reverse_sequence(x=x, lengths=length, seq_axis=1, batch_axis=0)
+
+        x_val = np.array(
+            [
+                [1, 2, 3, 4, 5, 0, 0, 0],
+                [1, 2, 0, 0, 0, 0, 0, 0],
+                [1, 2, 3, 4, 0, 0, 0, 0],
+                [1, 2, 3, 4, 5, 6, 7, 8],
+            ],
+            dtype=x_dtype,
+        )
+        length_val = np.array([7, 2, 3, 5], dtype=length_dtype)
+        x_builtin_dtype = numpy_type_to_builtin_type(x_dtype)
+        length_builtin_dtype = numpy_type_to_builtin_type(length_dtype)
+
+        input_placeholders = {
+            "x": mb.placeholder(shape=x_val.shape, dtype=x_builtin_dtype),
+            "length": mb.placeholder(shape=length_val.shape, dtype=length_builtin_dtype),
+        }
+        input_values = {"x": x_val, "length": length_val}
+        expected_output_types = (4, 8, x_builtin_dtype)
+        expected_outputs = np.array(
+            [
+                [0, 0, 5, 4, 3, 2, 1, 0],
+                [2, 1, 0, 0, 0, 0, 0, 0],
+                [3, 2, 1, 4, 0, 0, 0, 0],
+                [5, 4, 3, 2, 1, 6, 7, 8],
+            ],
+            dtype=x_dtype,
+        )
+
+        run_compare_builder(
+            build,
+            input_placeholders,
+            input_values,
+            expected_output_types,
+            expected_outputs,
+            compute_unit=compute_unit,
+            backend=backend,
+        )
+
+
+class TestSqueeze:
+    @pytest.mark.parametrize(
+        "compute_unit, backend, x_dtype",
+        itertools.product(
+            compute_units,
+            backends,
+            [np.int8, np.uint8, np.int16, np.uint16, np.int32, np.float16, np.float32],
+        ),
+    )
+    def test_squeeze_different_data_types(self, compute_unit, backend, x_dtype):
+        def build(x):
+            return mb.squeeze(x=x, axes=(-1,))
+
+        x_val = np.array([[[[1], [2], [3]]]], dtype=x_dtype)
+        x_builtin_dtype = numpy_type_to_builtin_type(x_dtype)
+        input_placeholders = {"x": mb.placeholder(shape=x_val.shape, dtype=x_builtin_dtype)}
+        input_values = {"x": x_val}
+        expected_outputs = np.squeeze(x_val, -1)
+        expected_output_types = tuple(expected_outputs.shape) + (x_builtin_dtype,)
+        run_compare_builder(
+            build,
+            input_placeholders,
+            input_values,
+            expected_output_types,
+            expected_outputs,
+            compute_unit=compute_unit,
+            backend=backend,
+        )
+
+
+class TestTranspose:
+    @pytest.mark.parametrize(
+        "compute_unit, backend, x_dtype",
+        itertools.product(
+            compute_units,
+            backends,
+            [np.int8, np.uint8, np.int16, np.uint16, np.int32, np.float16, np.float32],
+        ),
+    )
+    def test_transpose_different_data_types(self, compute_unit, backend, x_dtype):
+        def build(x):
+            return mb.transpose(x=x, perm=(-1, 0))
+
+        x_val = np.array([[1, 2, 3], [4, 5, 6]], dtype=x_dtype)
+        x_builtin_dtype = numpy_type_to_builtin_type(x_dtype)
+        run_compare_builder(
+            build,
+            input_placeholders={"x": mb.placeholder(shape=x_val.shape, dtype=x_builtin_dtype)},
+            input_values={"x": x_val},
+            expected_output_types=(3, 2, x_builtin_dtype),
+            expected_outputs=x_val.T,
+            compute_unit=compute_unit,
+            backend=backend,
+        )
+
+
+class TestSlidingWindows:
+    @pytest.mark.parametrize(
+        "compute_unit, backend, x_dtype",
+        itertools.product(
+            compute_units,
+            backends,
+            [np.int8, np.uint8, np.int16, np.uint16, np.int32, np.float16, np.float32],
+        ),
+    )
+    def test_ios17_different_data_types(self, compute_unit, backend, x_dtype):
+        def build(x):
+            return mb.sliding_windows(x=x, axis=1, size=2)
+
+        x_val = np.array([[[[9.0]], [[5.0]], [[1.0]], [[3.0]]]], dtype=x_dtype)
+        x_builtin_dtype = numpy_type_to_builtin_type(x_dtype)
+        input_placeholders = {"x": mb.placeholder(shape=x_val.shape, dtype=x_builtin_dtype)}
+        input_values = {"x": x_val}
+        expected_output_types = (1, 3, 2, 1, 1, x_builtin_dtype)
+        expected_outputs = np.array(
+            [[[[[9.0]], [[5.0]]], [[[5.0]], [[1.0]]], [[[1.0]], [[3.0]]]]],
+            dtype=x_dtype,
+        )
+
+        run_compare_builder(
+            build,
+            input_placeholders,
+            input_values,
+            expected_output_types,
+            expected_outputs,
+            compute_unit=compute_unit,
+            backend=backend,
+        )
+
+
+class TestSliceByIndex(_TestSliceByIndexIos14):
+    @pytest.mark.parametrize(
+        "compute_unit, backend, x_dtype, idx_dtype",
+        itertools.product(
+            compute_units,
+            backends,
+            (np.float16, np.float32, np.int8, np.int16, np.int32, np.uint8, np.uint16),
+            (np.int8, np.int16, np.int32),
+        ),
+    )
+    def test_builder_to_backend_smoke(self, compute_unit, backend, x_dtype, idx_dtype):
+        super().test_builder_to_backend_smoke(compute_unit, backend, x_dtype, idx_dtype)
+
+
+class TestSliceBySize(_TestSliceBySizeIos14):
+    @pytest.mark.parametrize(
+        "compute_unit, backend, size_val, x_dtype, idx_dtype",
+        itertools.product(
+            compute_units,
+            backends,
+            ([1, 2, 3], [-1, 2, -1]),
+            (np.float16, np.float32, np.int8, np.int16, np.int32, np.uint8, np.uint16),
+            (np.int8, np.int16, np.int32),
+        ),
+    )
+    def test_builder_to_backend_smoke(self, compute_unit, backend, size_val, x_dtype, idx_dtype):
+        super().test_builder_to_backend_smoke(compute_unit, backend, size_val, x_dtype, idx_dtype)
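For reference, the shape arithmetic asserted in TestSlidingWindows above: windows of a given size along an axis insert a new window dimension right after that axis. A minimal sketch (the input shape is illustrative):

    import coremltools as ct
    from coremltools.converters.mil import Builder as mb

    @mb.program(input_specs=[mb.TensorSpec(shape=(1, 4, 1, 1))], opset_version=ct.target.iOS17)
    def prog(x):
        # (1, 4, 1, 1) -> (1, 3, 2, 1, 1): three windows of length 2 along axis 1.
        return mb.sliding_windows(x=x, axis=1, size=2)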
diff --git a/coremltools/converters/mil/mil/ops/tests/test_const.py b/coremltools/converters/mil/mil/ops/tests/test_const.py
deleted file mode 100644
index b484e3571..000000000
--- a/coremltools/converters/mil/mil/ops/tests/test_const.py
+++ /dev/null
@@ -1,62 +0,0 @@
-# Copyright (c) 2020, Apple Inc. All rights reserved.
-#
-# Use of this source code is governed by a BSD-3-clause license that can be
-# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
-
-import itertools
-
-import numpy as np
-import pytest
-
-from coremltools.converters.mil import testing_reqs
-from coremltools.converters.mil.mil import Builder as mb
-from coremltools.converters.mil.mil import types
-
-from .testing_utils import run_compare_builder
-
-backends = testing_reqs.backends
-compute_units = testing_reqs.compute_units
-
-
-class TestConst:
-    @pytest.mark.parametrize(
-        "compute_unit, backend, dtype", itertools.product(
-            compute_units,
-            backends,
-            [
-                np.int32,
-                np.int64,
-                np.float16,
-                np.float32,
-                np.float64,
-            ]
-        )
-    )
-    def test_builder_to_backend_smoke(self, compute_unit, backend, dtype):
-        if backend[0] == "mlprogram" and dtype in [np.uint8, np.int8, np.uint32]:
-            pytest.skip("Data type not supported")
-
-        t = np.random.randint(0, 5, (4, 2)).astype(np.float32)
-        constant = np.random.randint(0, 5, (4, 2)).astype(dtype)
-        input_placeholders = {
-            "x": mb.placeholder(shape=t.shape),
-        }
-        input_values = {"x": t}
-
-        def build(x):
-            y = mb.const(val=constant)
-            y = mb.cast(x=y, dtype='fp32')
-            return mb.add(x=x, y=y)
-
-        expected_output_types = (4, 2, types.fp32)
-        expected_outputs = t + constant.astype(np.float32)
-
-        run_compare_builder(
-            build,
-            input_placeholders,
-            input_values,
-            expected_output_types,
-            expected_outputs,
-            compute_unit=compute_unit,
-            backend=backend,
-        )
diff --git a/coremltools/converters/mil/mil/ops/tests/test_image_resizing.py b/coremltools/converters/mil/mil/ops/tests/test_image_resizing.py
deleted file mode 100644
index 92439617d..000000000
--- a/coremltools/converters/mil/mil/ops/tests/test_image_resizing.py
+++ /dev/null
@@ -1,1049 +0,0 @@
-# Copyright (c) 2020, Apple Inc. All rights reserved.
-#
-# Use of this source code is governed by a BSD-3-clause license that can be
-# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
-
-import functools
-import itertools
-
-import numpy as np
-import pytest
-
-import coremltools as ct
-from coremltools._deps import _HAS_TORCH, MSG_TORCH_NOT_FOUND
-from coremltools.converters.mil.mil import Builder as mb
-from coremltools.converters.mil.mil import get_new_symbol, types
-from coremltools.converters.mil.testing_reqs import backends, compute_units
-from coremltools.converters.mil.testing_utils import random_gen
-from coremltools.models.utils import _macos_version
-
-from .testing_utils import run_compare_builder
-
-if _HAS_TORCH:
-    import torch
-
-
-class TestAffine:
-    @pytest.mark.parametrize(
-        "compute_unit, backend", itertools.product(compute_units, backends)
-    )
-    def test_builder_to_backend_smoke(self, compute_unit, backend):
-        if backend[0] == "neuralnetwork":
-            pytest.skip("nn backend not supported")
-
-        x_val = np.array([11.0, 22.0, 33.0, 44.0], dtype=np.float32).reshape(
-            [1, 1, 2, 2]
-        )
-        transform_matrix_val = np.array(
-            [-1.0, -2.0, -3.7, -1.0, 3.5, 1.2], dtype=np.float32
-        ).reshape([1, 6])
-
-        input_placeholder_dict = {
-            "x": mb.placeholder(shape=x_val.shape),
-            "transform_matrix": mb.placeholder(shape=transform_matrix_val.shape),
-        }
-        input_value_dict = {"x": x_val, "transform_matrix": transform_matrix_val}
-
-        def build(x, transform_matrix):
-            return [
-                mb.affine(
-                    x=x,
-                    transform_matrix=transform_matrix,
-                    output_height=3,
-                    output_width=3,
-                    sampling_mode="bilinear",
-                    padding_mode="constant",
-                    padding_value=0.0,
-                    coordinates_mode="normalized_minus_one_to_one",
-                    align_corners=True,
-                ),
-                mb.affine(
-                    x=x,
-                    transform_matrix=transform_matrix,
-                    output_height=2,
-                    output_width=5,
-                    sampling_mode="bilinear",
-                    padding_mode="constant",
-                    padding_value=0.0,
-                    coordinates_mode="normalized_minus_one_to_one",
-                    align_corners=True,
-                ),
-            ]
-
-        expected_output_types = [
-            (1, 1, 3, 3, types.fp32),
-            (1, 1, 2, 5, types.fp32),
-        ]
-        expected_outputs = [
-            np.array(
-                [10.752501, 2.5025, 0.0, 1.9799997, 0.0, 0.0, 0.0, 0.0, 0.0],
-                dtype=np.float32,
-            ).reshape([1, 1, 3, 3]),
-            np.array(
-                [10.752501, 5.94, 2.5025, 0.44000006, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
-                dtype=np.float32,
-            ).reshape([1, 1, 2, 5]),
-        ]
-
-        run_compare_builder(
-            build,
-            input_placeholder_dict,
-            input_value_dict,
-            expected_output_types,
-            expected_outputs,
-            compute_unit=compute_unit,
-            backend=backend,
-        )
-
-
-class TestResample:
-    @pytest.mark.parametrize(
-        "compute_unit, backend, minimum_deployment_target",
-        itertools.product(
-            compute_units,
-            backends,
-            [ct.target.iOS15, ct.target.iOS16],
-        )
-    )
-    def test_builder_to_backend_smoke(self, compute_unit, backend, minimum_deployment_target):
-        if backend[0] == "neuralnetwork":
-            pytest.skip("nn backend not supported")
-        if minimum_deployment_target == ct.target.iOS16 and _macos_version() < (13, 0):
-            pytest.skip("New functionality in macOS13/iOS16")
-
-        x_ = np.array([11.0, 22.0, 33.0, 44.0], dtype=np.float32).reshape([1, 1, 2, 2])
-        coordinates_ = np.array(
-            [-1.0, -2.0, -3.7, -1.0, 0.0, 0.0, 3.5, 1.2], dtype=np.float32
-        ).reshape([1, 2, 2, 2])
-
-        input_placeholder_dict = {
-            "x": mb.placeholder(shape=x_.shape),
-            "coordinates": mb.placeholder(shape=coordinates_.shape),
-        }
-        input_value_dict = {"x": x_, "coordinates": coordinates_}
-        expected_output_type = (1, 1, 2, 2, types.fp32)
-
-        def build_0(x, coordinates):
-            return mb.resample(
-                x=x,
-                coordinates=coordinates,
-                sampling_mode="bilinear",
-                padding_mode="constant",
-                padding_value=6.17,
-                coordinates_mode="normalized_minus_one_to_one",
-                align_corners=True,
-            )
-
-        expected_output_0 = np.array(
-            [8.585, 6.17, 27.5, 6.17], dtype=np.float32
-        ).reshape(expected_output_type[:-1])
-
-        def build_1(x, coordinates):
-            return mb.resample(
-                x=x,
-                coordinates=coordinates,
-                sampling_mode="nearest",
-                padding_mode="border",
-                padding_value=-1.0,
-                coordinates_mode="unnormalized",
-                align_corners=False,
-            )
-
-        expected_output_1 = np.array(
-            [11.0, 11.0, 11.0, 44.0], dtype=np.float32
-        ).reshape(expected_output_type[:-1])
-
-        def build_2(x, coordinates):
-            return mb.resample(
-                x=x,
-                coordinates=coordinates,
-                sampling_mode="bilinear",
-                padding_mode="reflection",
-                padding_value=-1.0,
-                coordinates_mode="normalized_zero_to_one",
-                align_corners=True,
-            )
-
-        expected_output_2 = np.array(
-            [22.0, 36.3, 11.0, 34.1], dtype=np.float32
-        ).reshape(expected_output_type[:-1])
-
-        def build_3(x, coordinates):
-            return mb.resample(
-                x=x,
-                coordinates=coordinates,
-                sampling_mode="nearest",
-                padding_mode="symmetric",
-                padding_value=-1.0,
-                coordinates_mode="normalized_zero_to_one",
-                align_corners=False,
-            )
-
-        expected_output_3 = np.array(
-            [22.0, 33.0, 11.0, 33.0], dtype=np.float32
-        ).reshape(expected_output_type[:-1])
-
-        for build, expected_output in zip(
-            [build_0, build_1, build_2, build_3],
-            [
-                expected_output_0,
-                expected_output_1,
-                expected_output_2,
-                expected_output_3,
-            ],
-        ):
-            mlmodel = run_compare_builder(
-                build,
-                input_placeholder_dict,
-                input_value_dict,
-                expected_output_type,
-                expected_output,
-                compute_unit=compute_unit,
-                backend=backend,
-                minimum_deployment_target=minimum_deployment_target,
-            )
-            prog = mlmodel._mil_program
-            number_of_cast = len(prog["main"].find_ops(op_type="cast"))
-            # for the new iOS16 resample op, the coordinates is cast to fp16
-            if minimum_deployment_target == ct.target.iOS15:
-                assert number_of_cast == 2
-            elif minimum_deployment_target == ct.target.iOS16:
-                assert number_of_cast == 3
-            else:
-                raise ValueError("Unrecognized target {}".format(minimum_deployment_target))
-
-
-class TestResizeNearestNeighbor:
-    @pytest.mark.parametrize(
-        "compute_unit, backend", itertools.product(compute_units, backends)
-    )
-    def test_builder_to_backend_smoke(self, compute_unit, backend):
-        x_val = np.array([0.37, 6.17], dtype=np.float32).reshape([1, 1, 2, 1])
-        input_placeholder_dict = {"x": mb.placeholder(shape=x_val.shape)}
-        input_value_dict = {"x": x_val}
-
-        def build_model(x):
-            return [
-                mb.resize_nearest_neighbor(
-                    x=x, target_size_height=2, target_size_width=1,
-                ),
-                mb.resize_nearest_neighbor(
-                    x=x, target_size_height=2, target_size_width=3,
-                ),
-            ]
-
-        expected_output_types = [
-            (1, 1, 2, 1, types.fp32),
-            (1, 1, 2, 3, types.fp32),
-        ]
-        expected_outputs = [
-            x_val,
-            np.array([0.37, 0.37, 0.37, 6.17, 6.17, 6.17], dtype=np.float32).reshape(
-                [1, 1, 2, 3]
-            ),
-        ]
-
-        run_compare_builder(
-            build_model,
-            input_placeholder_dict,
-            input_value_dict,
-            expected_output_types,
-            expected_outputs,
-            compute_unit=compute_unit,
-            backend=backend,
-        )
-
-
-class TestUpsampleNearestNeighborFractionalScales:
-    @pytest.mark.parametrize(
-        "compute_unit, backend", itertools.product(compute_units, backends)
-    )
-    def test_builder_to_backend_smoke(self, compute_unit, backend):
-        if backend[0] == "neuralnetwork":
-            pytest.skip("nn backend not supported")
-
-        if backend[0] == "mlprogram" and compute_unit != ct.ComputeUnit.CPU_ONLY:
-            pytest.xfail("rdar://97398448 (TestUpsampleNearestNeighborFractionalScales failing on GPU)")
-
-        x_val = np.array([1.5, -2.5, 3.5], dtype=np.float32).reshape([1, 1, 1, 3])
-        input_placeholder_dict = {"x": mb.placeholder(shape=x_val.shape)}
-        input_value_dict = {"x": x_val}
-
-        def build(x):
-            return [
-                mb.upsample_nearest_neighbor(
-                    x=x, scale_factor_height=1.0, scale_factor_width=1.0,
-                ),
-                mb.upsample_nearest_neighbor(
-                    x=x, scale_factor_height=3.17, scale_factor_width=0.67
-                ),
-                mb.upsample_nearest_neighbor(
-                    x=x, scale_factor_height=2.0, scale_factor_width=1.12,
-                ),
-            ]
-
-        expected_output_types = [
-            (1, 1, 1, 3, types.fp32),
-            (1, 1, 3, 2, types.fp32),
-            (1, 1, 2, 3, types.fp32),
-        ]
-        expected_outputs = [
-            x_val,
-            np.array([1.5, -2.5, 1.5, -2.5, 1.5, -2.5], dtype=np.float32).reshape(
-                [1, 1, 3, 2]
-            ),
-            np.array([1.5, -2.5, 3.5, 1.5, -2.5, 3.5], dtype=np.float32).reshape(
-                [1, 1, 2, 3]
-            ),
-        ]
-
-        run_compare_builder(
-            build,
-            input_placeholder_dict,
-            input_value_dict,
-            expected_output_types,
-            expected_outputs,
-            compute_unit=compute_unit,
-            backend=backend,
-        )
-
-
-class TestResizeBilinear:
-    @pytest.mark.parametrize(
-        "compute_unit, backend", itertools.product(compute_units, backends,)
-    )
-    def test_builder_to_backend_smoke(self, compute_unit, backend):
-        if backend[0] == "mlprogram":
-            pytest.xfail("Seg fault: rdar://78343191 ((MIL GPU) Core ML Tools Unit Test failures [failure to load or Seg fault])")
-
-        if backend[0] == "neuralnetwork" and compute_unit == ct.ComputeUnit.CPU_ONLY:
-            pytest.xfail("rdar://85318710 (Coremltools Smoke test on ResizeBilinear failing on NNv1 backend.)")
-
-        x = np.array([0, 1], dtype=np.float32).reshape(1, 1, 2)
-        input_placeholder_dict = {"x": mb.placeholder(shape=x.shape)}
-        input_value_dict = {"x": x}
-
-        def build_mode_0(x):
-            return mb.resize_bilinear(
-                x=x,
-                target_size_height=1,
-                target_size_width=5,
-                sampling_mode="STRICT_ALIGN_CORNERS",
-            )
-
-        expected_output_type = (1, 1, 5, types.fp32)
-        expected_output = np.array([0, 0.25, 0.5, 0.75, 1], dtype=np.float32).reshape(
-            1, 1, 5
-        )
-
-        run_compare_builder(
-            build_mode_0,
-            input_placeholder_dict,
-            input_value_dict,
-            expected_output_type,
-            expected_output,
-            compute_unit=compute_unit,
-            backend=backend,
-        )
-
-        def build_mode_2(x):
-            return mb.resize_bilinear(
-                x=x, target_size_height=1, target_size_width=5, sampling_mode="DEFAULT"
-            )
-
-        expected_output = np.array([0, 0.4, 0.8, 1, 1], dtype=np.float32).reshape(
-            1, 1, 5
-        )
-
-        run_compare_builder(
-            build_mode_2,
-            input_placeholder_dict,
-            input_value_dict,
-            expected_output_type,
-            expected_output,
-            compute_unit=compute_unit,
-            backend=backend,
-        )
-
-        def build_mode_3(x):
-            return mb.resize_bilinear(
-                x=x,
-                target_size_height=1,
-                target_size_width=5,
-                sampling_mode="OFFSET_CORNERS",
-            )
-
-        expected_output = np.array([0.1, 0.3, 0.5, 0.7, 0.9], dtype=np.float32).reshape(
-            1, 1, 5
-        )
-
-        run_compare_builder(
-            build_mode_3,
-            input_placeholder_dict,
-            input_value_dict,
-            expected_output_type,
-            expected_output,
-            compute_unit=compute_unit,
-            backend=backend,
-        )
-
-        if backend[0] != "neuralnetwork":
-            def build_mode_4(x):
-                return mb.resize_bilinear(
-                    x=x,
-                    target_size_height=1,
-                    target_size_width=5,
-                    sampling_mode="UNALIGN_CORNERS",
-                )
-
-            expected_output = np.array([0.0, 0.1, 0.5, 0.9, 1.0], dtype=np.float32).reshape(
-                1, 1, 5
-            )
-
-            run_compare_builder(
-                build_mode_4,
-                input_placeholder_dict,
-                input_value_dict,
-                expected_output_type,
-                expected_output,
-                compute_unit=compute_unit,
-                backend=backend,
-            )
-
-
-class TestUpsampleBilinear:
-    @pytest.mark.parametrize(
-        "compute_unit, backend", itertools.product(compute_units, backends,)
-    )
-    def test_builder_to_backend_smoke(self, compute_unit, backend):
-        x = np.array([0, 1], dtype=np.float32).reshape(1, 1, 2)
-        input_placeholder_dict = {"x": mb.placeholder(shape=x.shape)}
-        input_value_dict = {"x": x}
-
-        def build_upsample_integer(x):
-            return mb.upsample_bilinear(
-                x=x, scale_factor_height=1, scale_factor_width=3
-            )
-
-        expected_output_type = (1, 1, 6, types.fp32)
-        expected_output = np.array(
-            [0, 0.2, 0.4, 0.6, 0.8, 1], dtype=np.float32
-        ).reshape(1, 1, 6)
-
-        run_compare_builder(
-            build_upsample_integer,
-            input_placeholder_dict,
-            input_value_dict,
-            expected_output_type,
-            expected_output,
-            compute_unit=compute_unit,
-            backend=backend,
-        )
-
-        def build_upsample_fractional(x):
-            return mb.upsample_bilinear(
-                x=x, scale_factor_height=1.0, scale_factor_width=2.6, align_corners=False
-            )
-
-        expected_output_type = (1, 1, 5, types.fp32)
-        expected_output = np.array([0, 0.1, 0.5, 0.9, 1], dtype=np.float32).reshape(
-            1, 1, 5
-        )
-
-        run_compare_builder(
-            build_upsample_fractional,
-            input_placeholder_dict,
-            input_value_dict,
-            expected_output_type,
-            expected_output,
-            compute_unit=compute_unit,
-            backend=backend,
-        )
-
-    @pytest.mark.parametrize(
-        "compute_unit, backend, align_corners, half_pixel_centers",
-        itertools.product(
-            compute_units,
-            backends,
-            [True, False],
-            [True, False],
-        )
-    )
-    def test_builder_to_backend_smoke_iOS16(self, compute_unit, backend, align_corners, half_pixel_centers):
-        if backend[0] == "neuralnetwork" or ct.utils._macos_version() < (13, 0):
-            pytest.skip("The new half_pixel_centers argument only available in iOS16")
-
-        if align_corners and half_pixel_centers:
-            pytest.skip("Invalid configuration of align_corners and half_pixel_centers")
-
-        x = np.array([1, 2], dtype=np.float32).reshape(1, 1, 1, 2)
-        input_placeholder_dict = {"x": mb.placeholder(shape=x.shape)}
-        input_value_dict = {"x": x}
-
-        def build_upsample_bilinear(x):
-            return mb.upsample_bilinear(
-                x=x,
-                scale_factor_height=2,
-                scale_factor_width=3,
-                align_corners=align_corners,
-                half_pixel_centers=half_pixel_centers,
-            )
-
-        expected_output_type = (1, 1, 2, 6, types.fp32)
-
-        if align_corners and not half_pixel_centers:
-            expected_output = [1., 1.2, 1.4, 1.6, 1.8, 2., 1., 1.2, 1.4, 1.6, 1.8, 2.]
-        elif not align_corners and half_pixel_centers:
-            expected_output = [1., 1., 1.33334, 1.66667, 2., 2., 1., 1., 1.33334, 1.66667, 2., 2.]
-        elif not align_corners and not half_pixel_centers:
-            expected_output = [1., 1.33334, 1.66667, 2., 2., 2., 1., 1.33334, 1.66667, 2., 2., 2.]
-        else:
-            raise ValueError("align_corners and half_pixel_centers cannot be both True")
-
-        expected_output = [np.array(expected_output, dtype=np.float32).reshape(1, 1, 2, 6)]
-
-        run_compare_builder(
-            build_upsample_bilinear,
-            input_placeholder_dict,
-            input_value_dict,
-            expected_output_type,
-            expected_output,
-            compute_unit=compute_unit,
-            backend=backend,
-            minimum_deployment_target=ct.target.iOS16,
-        )
-
-    @pytest.mark.skipif(not _HAS_TORCH, reason=MSG_TORCH_NOT_FOUND)
-    @pytest.mark.parametrize(
-        "compute_unit, backend, input_shape, scale_factor, align_corners, recompute_scale_factor",
-        itertools.product(
-            compute_units,
-            backends,
-            [(2, 5, 10, 22)],
-            [(3, 4), (2.5, 2.0), (0.5, 0.75)],
-            [True, False],
-            [True, False],
-        ),
-    )
-    def test_builder_to_backend_stress(
-        self, compute_unit, backend, input_shape, scale_factor, align_corners, recompute_scale_factor
-    ):
-        scale_factor_height, scale_factor_width = scale_factor
-        _, _, height, width = input_shape
-        height = height * scale_factor_height
-        width = width * scale_factor_width
-        is_h_float = height - np.floor(height) > 0.001
-        is_w_float = width - np.floor(width) > 0.001
-
-        # Currently, MIL is not suporting recompute_scale_factor=False + align_corners=False
-        # with fractional output size
-        if not recompute_scale_factor and not align_corners and (is_h_float or is_w_float):
-            pytest.xfail("rdar://81124053 (Support recompute_scale_factor)")
-
-        def _get_torch_upsample_prediction(x, scale_factor=(2, 2), align_corners=False, recompute_scale_factor=True):
-            x = torch.from_numpy(x)
-            out = torch.nn.functional.interpolate(
-                x,
-                scale_factor=scale_factor,
-                mode="bilinear",
-                align_corners=align_corners,
-                recompute_scale_factor=recompute_scale_factor,
-            )
-            return out.numpy()
-
-        x = random_gen(input_shape, rand_min=-100, rand_max=100)
-        torch_pred = _get_torch_upsample_prediction(
-            x,
-            scale_factor=scale_factor,
-            align_corners=align_corners,
-            recompute_scale_factor=recompute_scale_factor,
-        )
-
-        input_placeholder_dict = {"x": mb.placeholder(shape=x.shape)}
-        input_value_dict = {"x": x}
-
-        def build_upsample(x):
-            return mb.upsample_bilinear(
-                x=x,
-                scale_factor_height=scale_factor[0],
-                scale_factor_width=scale_factor[1],
-                align_corners=align_corners,
-            )
-
-        expected_output_type = torch_pred.shape + (types.fp32,)
-        run_compare_builder(
-            build_upsample,
-            input_placeholder_dict,
-            input_value_dict,
-            expected_output_type,
-            torch_pred,
-            compute_unit=compute_unit,
-            backend=backend,
-            rtol=0.5,
-        )
-
-
-class TestUpsampleNearestNeighbor:
-    @pytest.mark.parametrize(
-        "compute_unit, backend", itertools.product(compute_units, backends,)
-    )
-    def test_builder_to_backend_smoke(self, compute_unit, backend):
-        x = np.array([1.5, 2.5, 3.5], dtype=np.float32).reshape([1, 1, 1, 3])
-        input_placeholder_dict = {"x": mb.placeholder(shape=x.shape)}
-        input_value_dict = {"x": x}
-
-        def build(x):
-            return mb.upsample_nearest_neighbor(
-                x=x, scale_factor_height=1, scale_factor_width=2
-            )
-
-        expected_output_type = (1, 1, 1, 6, types.fp32)
-        expected_output = np.array(
-            [1.5, 1.5, 2.5, 2.5, 3.5, 3.5], dtype=np.float32
-        ).reshape([1, 1, 1, 6])
-
-        run_compare_builder(
-            build,
-            input_placeholder_dict,
-            input_value_dict,
-            expected_output_type,
-            expected_output,
-            compute_unit=compute_unit,
-            backend=backend,
-        )
-
-
-class TestCrop:
-    @pytest.mark.parametrize(
-        "compute_unit, backend, is_symbolic",
-        itertools.product(compute_units, backends, compute_units),
-    )
-    def test_builder_to_backend_smoke(self, compute_unit, backend, is_symbolic):
-        x = np.array(
-            [[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12], [13, 14, 15, 16]],
-            dtype=np.float32,
-        ).reshape(1, 1, 4, 4)
-
-        input_shape = list(x.shape)
-        placeholder_input_shape = input_shape
-        if is_symbolic:
-            # set batch and channel dimension symbolic
-            placeholder_input_shape[0] = get_new_symbol()
-            placeholder_input_shape[1] = get_new_symbol()
-
-        input_placeholder_dict = {"x": mb.placeholder(shape=placeholder_input_shape)}
-        input_value_dict = {"x": x}
-
-        def build(x):
-            return mb.crop(x=x, crop_height=[0, 1], crop_width=[1, 1])
-
-        expected_output_type = (
-            placeholder_input_shape[0],
-            placeholder_input_shape[1],
-            3,
-            2,
-            types.fp32,
-        )
-        expected_output = np.array([2, 3, 6, 7, 10, 11], dtype=np.float32).reshape(1, 1, 3, 2)
-
-        run_compare_builder(
-            build,
-            input_placeholder_dict,
-            input_value_dict,
-            expected_output_type,
-            expected_output,
-            compute_unit=compute_unit,
-            backend=backend,
-        )
-
-    @pytest.mark.parametrize(
-        "compute_unit, backend, C, H, W",
-        itertools.product(
-            compute_units,
-            backends,
-            [x for x in range(2, 4)],
-            [x for x in range(5, 8)],
-            [x for x in range(8, 10)],
-        ),
-    )
-    def test_builder_to_backend_stress(self, compute_unit, backend, C, H, W):
-        input_shape = (1, C, H, W)
-        x = np.random.random(input_shape)
-
-        crop_h = [np.random.randint(H)]
-        crop_h.append(np.random.randint(H - crop_h[0]))
-        crop_w = [np.random.randint(W)]
-        crop_w.append(np.random.randint(W - crop_w[0]))
-
-        input_placeholder_dict = {"x": mb.placeholder(shape=input_shape)}
-        input_value_dict = {"x": x}
-
-        def build(x):
-            return mb.crop(x=x, crop_height=crop_h, crop_width=crop_w)
-
-        expected_output_type = (
-            1,
-            C,
-            H - crop_h[0] - crop_h[1],
-            W - crop_w[0] - crop_w[1],
-            types.fp32,
-        )
-        expected_output = x[:, :, crop_h[0] : H - crop_h[1], crop_w[0] : W - crop_w[1]]
-
-        run_compare_builder(
-            build,
-            input_placeholder_dict,
-            input_value_dict,
-            expected_output_type,
-            expected_output,
-            compute_unit=compute_unit,
-            backend=backend,
-        )
-
-
-class TestCropResize:
-    @pytest.mark.parametrize(
-        "compute_unit, backend, pad_value",
-        itertools.product(compute_units, backends, [0.0, 1.0, 10.0]),
-    )
-    def test_builder_to_backend_ios16(self, compute_unit, backend, pad_value):
-        """For iOS16+ the crop_resize op supports pad_value."""
-        x = np.array(
-            [[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12], [13, 14, 15, 16]],
-            dtype=np.float32,
-        ).reshape(1, 1, 4, 4)
-
-        roi = np.array([
-            [0, 0.1, 0.3, 1.3, 1],
-            [0, 0.5, 1.8, 1., 0.3],
-            [0, 0.0, 0.4, 0.6, 0.7],
-        ], dtype=np.float32).reshape(3, 1, 5, 1, 1)
-
-        def build(x):
-            return mb.crop_resize(
-                x=x,
-                roi=roi,
-                target_width=2,
-                target_height=2,
-                normalized_coordinates=True,
-                box_coordinate_mode="CORNERS_HEIGHT_FIRST",
-                sampling_mode="ALIGN_CORNERS",
-                pad_value=pad_value,
-            )
-
-        expected_output_type = [
-            (3, 1, 1, 2, 2, types.fp32),
-        ]
-        expected_output = [
-            np.array(
-                [
-                    3.1,
-                    5.2,
-                    pad_value,
-                    pad_value,
-                    pad_value,
-                    7.899,
-                    pad_value,
-                    13.9,
-                    2.2,
-                    3.1,
-                    9.4,
-                    10.3,
-                ],
-                dtype=np.float32,
-            ).reshape(3, 1, 1, 2, 2),
-        ]
-
-        input_placeholder_dict = {"x": mb.placeholder(shape=(1, 1, 4, 4))}
-        input_value_dict = {"x": x}
-
-        run_compare_builder(
-            build,
-            input_placeholder_dict,
-            input_value_dict,
-            expected_output_type,
-            expected_output,
-            compute_unit=compute_unit,
-            backend=backend,
-            minimum_deployment_target=ct.target.iOS16,
-        )
-
-    @pytest.mark.parametrize(
-        "compute_unit, backend, is_symbolic",
-        itertools.product(compute_units, backends, [True, False]),
-    )
-    def test_builder_to_backend_smoke(self, compute_unit, backend, is_symbolic):
-        if backend[0] == "mlprogram" and compute_unit != ct.ComputeUnit.CPU_ONLY:
-            pytest.xfail("rdar://97398582 (TestCropResize failing on mlprogram + GPU)")
-        x = np.array(
-            [[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12], [13, 14, 15, 16]],
-            dtype=np.float32,
-        ).reshape(1, 1, 4, 4)
-
-        input_shape = list(x.shape)
-        placeholder_input_shape = input_shape
-        if is_symbolic:
-            # set batch and channel dimension symbolic
-            placeholder_input_shape[0] = get_new_symbol()
-            placeholder_input_shape[1] = get_new_symbol()
-
-        input_placeholder_dict = {"x": mb.placeholder(shape=placeholder_input_shape)}
-        input_value_dict = {"x": x}
-        N = 1
-        roi = np.array([[1, 1, 2, 2]], dtype=np.float32).reshape(1, 1, 4, 1, 1)
-        roi_normalized = np.array(
-            [[0, 0.0, 0.0, 1.0 / 3, 1.0 / 3]], dtype=np.float32
-        ).reshape(1, 1, 5, 1, 1)
-        roi_invert = np.array([[2, 2, 1, 1]], dtype=np.float32).reshape(1, 1, 4, 1, 1)
-
-        def build(x, mode=0):
-            if mode == 0:
-                return mb.crop_resize(
-                    x=x,
-                    roi=roi,
-                    target_width=2,
-                    target_height=2,
-                    normalized_coordinates=False,
-                    box_coordinate_mode="CORNERS_HEIGHT_FIRST",
-                    sampling_mode="ALIGN_CORNERS",
-                )
-
-            elif mode == 1:
-                return mb.crop_resize(
-                    x=x,
-                    roi=roi,
-                    target_width=4,
-                    target_height=4,
-                    normalized_coordinates=False,
-                    box_coordinate_mode="CORNERS_HEIGHT_FIRST",
-                    sampling_mode="ALIGN_CORNERS",
-                )
-
-            elif mode == 2:
-                return mb.crop_resize(
-                    x=x,
-                    roi=roi,
-                    target_width=1,
-                    target_height=1,
-                    normalized_coordinates=False,
-                    box_coordinate_mode="CORNERS_HEIGHT_FIRST",
-                    sampling_mode="ALIGN_CORNERS",
-                )
-
-            elif mode == 3:
-                return mb.crop_resize(
-                    x=x,
-                    roi=roi_normalized,
-                    target_width=2,
-                    target_height=2,
-                    normalized_coordinates=True,
-                    box_coordinate_mode="CORNERS_HEIGHT_FIRST",
-                    sampling_mode="ALIGN_CORNERS",
-                )
-
-            elif mode == 4:
-                return mb.crop_resize(
-                    x=x,
-                    roi=roi_invert,
-                    target_width=2,
-                    target_height=2,
-                    normalized_coordinates=False,
-                    box_coordinate_mode="CORNERS_HEIGHT_FIRST",
-                    sampling_mode="ALIGN_CORNERS",
-                )
-
-            elif mode == 5:
-                return mb.crop_resize(
-                    x=x,
-                    roi=roi_invert,
-                    target_width=2,
-                    target_height=2,
-                    normalized_coordinates=True,
-                    box_coordinate_mode="CORNERS_HEIGHT_FIRST",
-                    sampling_mode="UNALIGN_CORNERS",
-                )
-
-        expected_output_type = [
-            (
-                N,
-                placeholder_input_shape[0],
-                placeholder_input_shape[1],
-                2,
-                2,
-                types.fp32,
-            ),
-            (
-                N,
-                placeholder_input_shape[0],
-                placeholder_input_shape[1],
-                4,
-                4,
-                types.fp32,
-            ),
-            (
-                N,
-                placeholder_input_shape[0],
-                placeholder_input_shape[1],
-                1,
-                1,
-                types.fp32,
-            ),
-            (
-                N,
-                placeholder_input_shape[0],
-                placeholder_input_shape[1],
-                2,
-                2,
-                types.fp32,
-            ),
-            (
-                N,
-                placeholder_input_shape[0],
-                placeholder_input_shape[1],
-                2,
-                2,
-                types.fp32,
-            ),
-            (
-                N,
-                placeholder_input_shape[0],
-                placeholder_input_shape[1],
-                2,
-                2,
-                types.fp32,
-            ),
-        ]
-        expected_output = [
-            np.array([6, 7, 10, 11], dtype=np.float32).reshape(1, 1, 1, 2, 2),
-            np.array(
-                [
-                    [6, 6.333333, 6.66666, 7],
-                    [7.333333, 7.666666, 8, 8.333333],
-                    [8.666666, 9, 9.3333333, 9.666666],
-                    [10, 10.333333, 10.666666, 11],
-                ],
-                dtype=np.float32,
-            ).reshape(1, 1, 1, 4, 4),
-            np.array([8.5], dtype=np.float32).reshape(1, 1, 1, 1, 1),
-            np.array([1, 2, 5, 6], dtype=np.float32).reshape(1, 1, 1, 2, 2),
-            np.array([11, 10, 7, 6], dtype=np.float32).reshape(1, 1, 1, 2, 2),
-            np.array([3.5, 5.5, 11.5, 13.5], dtype=np.float32).reshape(1, 1, 1, 2, 2),
-        ]
-
-        for mode in range(6):
-            if backend[0] ==
"neuralnetwork" and mode == 5: - pytest.skip("nn-proto does not support UNALIGN_CORNERS") - run_compare_builder( - functools.partial(build, mode=mode), - input_placeholder_dict, - input_value_dict, - expected_output_type[mode], - expected_output[mode], - compute_unit=compute_unit, - backend=backend, - ) - - @pytest.mark.parametrize( - "compute_unit, backend, N", - itertools.product(compute_units, backends, [1, 3]), - ) - def test_builder_to_backend_ios17(self, compute_unit, backend, N): - """For iOS17+ the `roi` input is replaced by `boxes` and `box_indices`.""" - x = np.arange(1, 17, dtype=np.float32).reshape(1, 1, 4, 4) - boxes = np.array([1, 1, 2, 2], dtype=np.float32).reshape(1, 4) - box_indices = None - normalized_coordinates = False - if N == 3: - boxes = np.array( - [ - [0.1, 0.3, 1.3, 1.0], - [0.5, 1.8, 1.0, 0.3], - [0.0, 0.4, 0.6, 0.7], - ], - dtype=np.float32, - ) - box_indices = np.array([0] * 3, dtype=np.int32) - normalized_coordinates = True - - def build(x): - return mb.crop_resize( - x=x, - boxes=boxes, - box_indices=box_indices, - target_width=2, - target_height=2, - normalized_coordinates=normalized_coordinates, - box_coordinate_mode="CORNERS_HEIGHT_FIRST", - sampling_mode="ALIGN_CORNERS", - pad_value=10.0, - ) - - expected_outputs = [np.array([6, 7, 10, 11], dtype=np.float32).reshape(1, 1, 2, 2)] - if N == 3: - expected_outputs = [ - np.array( - [3.1, 5.2, 10.0, 10.0, 10.0, 7.899, 10.0, 13.9, 2.2, 3.1, 9.4, 10.3], - dtype=np.float32, - ).reshape(3, 1, 2, 2) - ] - - run_compare_builder( - build, - input_placeholders={"x": mb.placeholder(shape=(1, 1, 4, 4))}, - input_values={"x": x}, - expected_output_types=[(N, 1, 2, 2, types.fp32)], - expected_outputs=expected_outputs, - compute_unit=compute_unit, - backend=backend, - minimum_deployment_target=ct.target.iOS17, - ) - - def test_builder_eval_ios17_invalid(self): - x = np.arange(1, 17, dtype=np.float32).reshape(1, 1, 4, 4) - three_boxes = np.array( - [ - [0.1, 0.3, 1.3, 1.0], - [0.5, 1.8, 1.0, 0.3], - [0.0, 0.4, 0.6, 0.7], - ], - dtype=np.float32, - ) - with pytest.raises( - ValueError, - match='N dimension of "boxes" \(3\) should not be greater ' - 'than the B dimension of "x" \(1\)', - ): - - @mb.program(input_specs=[], opset_version=ct.target.iOS17) - def prog(): - return mb.crop_resize(x=x, boxes=three_boxes) - - one_box = np.array([1, 1, 2, 2], dtype=np.float32).reshape(1, 4) - indices_out_of_bound = np.array([10], dtype=np.int32) - with pytest.raises( - ValueError, - match='input "box_indices" should not have values >= B ' - "dimension of x \(1\), but got \[10\]", - ): - - @mb.program(input_specs=[], opset_version=ct.target.iOS17) - def prog(): - return mb.crop_resize(x=x, boxes=one_box, box_indices=indices_out_of_bound) - - indices_two_dim = np.array([[0]], dtype=np.int32) - with pytest.raises( - ValueError, match='input "box_indices" must has shape \[1\], but got \(1, 1\)' - ): - - @mb.program(input_specs=[], opset_version=ct.target.iOS17) - def prog(): - return mb.crop_resize(x=x, boxes=one_box, box_indices=indices_two_dim) - - x_rank5 = np.arange(1, 17, dtype=np.float32).reshape(1, 1, 4, 4, 1) - with pytest.raises( - ValueError, match='input to the "crop_resize" op must be of rank 4, but got 5' - ): - - @mb.program(input_specs=[], opset_version=ct.target.iOS17) - def prog(): - return mb.crop_resize(x=x_rank5, boxes=one_box) diff --git a/coremltools/converters/mil/mil/ops/tests/test_scatter_gather.py b/coremltools/converters/mil/mil/ops/tests/test_scatter_gather.py deleted file mode 100644 index d2ef1b9c2..000000000 
--- a/coremltools/converters/mil/mil/ops/tests/test_scatter_gather.py +++ /dev/null @@ -1,1170 +0,0 @@ -# Copyright (c) 2020, Apple Inc. All rights reserved. -# -# Use of this source code is governed by a BSD-3-clause license that can be -# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause - -import itertools - -import numpy as np -import pytest - -import coremltools as ct -from coremltools._deps import _HAS_TF_2, MSG_TF2_NOT_FOUND -from coremltools.converters.mil.mil import Builder as mb -from coremltools.converters.mil.mil import types -from coremltools.converters.mil.testing_reqs import backends, compute_units -from coremltools.models.utils import _macos_version - -from .testing_utils import run_compare_builder - -if _HAS_TF_2: - import tensorflow as tf - - -class TestScatter: - @pytest.mark.parametrize( - "compute_unit, backend, minimum_deployment_target", itertools.product( - compute_units, backends, [None, ct.target.iOS17]) - ) - def test_builder_to_backend_smoke(self, compute_unit, backend, minimum_deployment_target): - if minimum_deployment_target == ct.target.iOS17: - if backend[0] != "mlprogram" or _macos_version() < (14, 0): - pytest.skip("IOS17 target available only on macOS 14+ with mlprogram.") - - data = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32) - indices = np.array([1, 0], dtype=np.int32) - updates = np.array([[5, 6, 7], [8, 9, 10]], dtype=np.float32) - input_placeholders = { - "data": mb.placeholder(shape=data.shape), - "indices": mb.placeholder(shape=indices.shape, dtype=types.int32), - "updates": mb.placeholder(shape=updates.shape), - } - - input_values = {"data": data, "indices": indices, "updates": updates} - - def build(data, indices, updates): - return (mb.scatter(data=data, indices=indices, updates=updates),) - - expected_output_types = (2, 3, types.fp32) - - expected_outputs = np.array([[9, 11, 13], [9, 11, 13]], dtype=np.float32) - - run_compare_builder( - build, - input_placeholders, - input_values, - expected_output_types, - expected_outputs, - compute_unit=compute_unit, - backend=backend, - minimum_deployment_target=minimum_deployment_target, - ) - - @pytest.mark.skipif(not _HAS_TF_2, reason=MSG_TF2_NOT_FOUND) - @pytest.mark.parametrize( - "compute_unit, backend, rankData_rankIndices, accumulate_mode, minimum_deployment_target", - itertools.product( - compute_units, - backends, - [(1, 2), (2, 1), (3, 2), (2, 3), (1, 1), (3, 3), (1, 3)], - ["update", "add", "sub", "mul", "div", "max", "min"], - [None, ct.target.iOS17] - ), - ) - def test_builder_to_backend_programmatic( - self, compute_unit, backend, rankData_rankIndices, accumulate_mode, minimum_deployment_target - ): - if minimum_deployment_target == ct.target.iOS17: - if backend[0] != "mlprogram" or _macos_version() < (14, 0): - pytest.skip("IOS17 target available only on macOS 14+ with mlprogram.") - - data_rank, indices_rank = rankData_rankIndices - data_shape = np.random.randint(low=2, high=5, size=data_rank) - indices_shape = np.random.randint(low=2, high=5, size=indices_rank) - updates_shape = list(indices_shape) + list(data_shape[1:]) - - data = np.random.rand(*data_shape).astype(np.float32) - updates = np.random.rand(*updates_shape).astype(np.float32) - indices = np.random.randint(0, data_shape[0], size=indices_shape).astype( - np.int32 - ) - - def build(data, indices, updates): - return mb.scatter( - data=data, indices=indices, updates=updates, mode=accumulate_mode - ) - - tf_output = tf.Variable(data) - if accumulate_mode == "update": - 
tf.compat.v1.scatter_update(tf_output, indices, updates)
-        if accumulate_mode == "add":
-            tf.compat.v1.scatter_add(tf_output, indices, updates)
-        if accumulate_mode == "sub":
-            tf.compat.v1.scatter_sub(tf_output, indices, updates)
-        if accumulate_mode == "mul":
-            tf.compat.v1.scatter_mul(tf_output, indices, updates)
-        if accumulate_mode == "div":
-            tf.compat.v1.scatter_div(tf_output, indices, updates)
-        if accumulate_mode == "max":
-            tf.compat.v1.scatter_max(tf_output, indices, updates)
-        if accumulate_mode == "min":
-            tf.compat.v1.scatter_min(tf_output, indices, updates)
-        expected_output = tf_output.numpy()
-
-        input_placeholders = {
-            "data": mb.placeholder(shape=data.shape),
-            "indices": mb.placeholder(shape=indices.shape, dtype=types.int32),
-            "updates": mb.placeholder(shape=updates.shape),
-        }
-
-        input_values = {"data": data, "indices": indices, "updates": updates}
-
-        expected_output_types = tuple(data_shape[:]) + (types.fp32,)
-        run_compare_builder(
-            build,
-            input_placeholders,
-            input_values,
-            expected_output_types,
-            expected_output,
-            compute_unit=compute_unit,
-            backend=backend,
-            minimum_deployment_target=minimum_deployment_target,
-        )
-
-    @pytest.mark.parametrize(
-        "compute_unit, backend, indices_val, validate_indices, dynamic",
-        itertools.product(
-            compute_units,
-            backends,
-            [[-1, 0], [10, 0]],  # One with a negative index, the other with an out-of-range index.
-            [True, False],
-            [True, False],
-        ),
-    )
-    def test_ios17_invalid_indices(self, compute_unit, backend, indices_val, validate_indices, dynamic):
-        if backend[0] != "mlprogram" or _macos_version() < (14, 0):
-            pytest.skip("IOS17 target available only on macOS 14+")
-
-        def build_static(data, updates):
-            return (
-                mb.scatter(
-                    data=data,
-                    indices=np.array(indices_val, dtype=np.int32),
-                    updates=updates,
-                    validate_indices=validate_indices,
-                ),
-            )
-
-        def build_dynamic(data, indices, updates):
-            return (mb.scatter(data=data, indices=indices, updates=updates, validate_indices=validate_indices), )
-
-        data = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32)
-        updates = np.array([[5, 6, 7], [8, 9, 10]], dtype=np.float32)
-        input_placeholders = {
-            "data": mb.placeholder(shape=data.shape),
-            "updates": mb.placeholder(shape=updates.shape),
-        }
-        input_values = {"data": data, "updates": updates}
-        if dynamic:
-            indices = np.array(indices_val, dtype=np.int32)
-            input_placeholders["indices"] = mb.placeholder(shape=indices.shape, dtype=types.int32)
-            input_values["indices"] = indices
-
-        if not validate_indices:
-            # When validate_indices is not set, the behavior for negative or out-of-bound indices is undefined.
-            expected_error = AssertionError
-            expected_error_msg = "Not equal"
-        elif dynamic:
-            # PyMIL's `validate_indices` only validates indices whose values are known
-            # when the op is inserted, so dynamic indices do not error out at the PyMIL
-            # layer; instead, validation is deferred to the backend after compilation.
-            expected_error = RuntimeError
-            expected_error_msg = (
-                "Error computing NN outputs",
-                "Unable to compute the prediction using a neural network model",
-            )
-        else:
-            # Negative or out-of-bound indices will error out when validate_indices is set.
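-            # Concretely, with the (2, 3) `data` defined above, valid axis-0 indices lie
-            # in [0, 2), so both parametrized vectors are rejected, e.g.:
-            #     >>> np.array([[1, 2, 3], [4, 5, 6]])[10]
-            #     IndexError: index 10 is out of bounds for axis 0 with size 2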
-            expected_error = IndexError
-            expected_error_msg = "Indices is out of bounds"
-
-        with pytest.raises(expected_error) as excinfo:
-            run_compare_builder(
-                build_dynamic if dynamic else build_static,
-                input_placeholders,
-                input_values,
-                expected_output_types=(2, 3, types.fp32),
-                expected_outputs=np.array([[9, 11, 13], [9, 11, 13]], dtype=np.float32),
-                compute_unit=compute_unit,
-                backend=backend,
-                minimum_deployment_target=ct.target.iOS17,
-            )
-        if not isinstance(expected_error_msg, tuple):
-            expected_error_msg = (expected_error_msg,)
-        assert any([err in str(excinfo.value) for err in expected_error_msg])
-
-
-class TestScatterAlongAxis:
-    @pytest.mark.parametrize(
-        "compute_unit, backend, minimum_deployment_target", itertools.product(
-            compute_units, backends, [None, ct.target.iOS17])
-    )
-    def test_builder_to_backend_smoke(self, compute_unit, backend, minimum_deployment_target):
-        if minimum_deployment_target == ct.target.iOS17:
-            if backend[0] != "mlprogram" or _macos_version() < (14, 0):
-                pytest.skip("IOS17 target available only on macOS 14+ with mlprogram.")
-
-        data = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32)
-        indices = np.array([[1, 0, 1], [1, 1, 0]], dtype=np.int32)
-        updates = np.array([[5, 6, 7], [8, 9, 10]], dtype=np.float32)
-        input_placeholders = {
-            "data": mb.placeholder(shape=data.shape),
-            "indices": mb.placeholder(shape=indices.shape, dtype=types.int32),
-            "updates": mb.placeholder(shape=updates.shape),
-        }
-
-        input_values = {"data": data, "indices": indices, "updates": updates}
-
-        def build(data, indices, updates):
-            return mb.scatter_along_axis(
-                data=data, indices=indices, updates=updates, axis=0, mode="update"
-            )
-
-        expected_output_types = (2, 3, types.fp32)
-
-        expected_outputs = np.array([[1, 6, 10], [8, 9, 7]], dtype=np.float32)
-
-        run_compare_builder(
-            build,
-            input_placeholders,
-            input_values,
-            expected_output_types,
-            expected_outputs,
-            compute_unit=compute_unit,
-            backend=backend,
-            minimum_deployment_target=minimum_deployment_target,
-        )
-
-    @pytest.mark.parametrize(
-        "opset_version",
-        [ct.target.iOS15, ct.target.iOS16, ct.target.iOS17],
-    )
-    def test_builder_eval(self, opset_version):
-        @mb.program(
-            input_specs=[mb.TensorSpec(shape=(1,), dtype=types.fp32)], opset_version=opset_version
-        )
-        def prog(x):
-            params = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32)
-            indices = np.array([[1, 0, 1], [1, 1, 0]], dtype=np.int32)
-            updates = np.array([[5, 6, 7], [8, 9, 10]], dtype=np.float32)
-            res = mb.scatter_along_axis(
-                data=params, indices=indices, updates=updates, axis=0, mode="update"
-            )
-            return res
-
-        main_func = prog.functions["main"]
-        scatter_ops = main_func.find_ops(op_type="scatter_along_axis")[0]
-
-        np.testing.assert_allclose(
-            np.array([[1, 6, 10], [8, 9, 7]], dtype=np.float32),
-            scatter_ops.outputs[0].val,
-            atol=1e-04,
-            rtol=1e-05,
-        )
-
-    @pytest.mark.parametrize(
-        "compute_unit, backend, rank_axis, minimum_deployment_target",
-        itertools.product(
-            compute_units,
-            backends,
-            [(rank, axis) for rank in range(1, 5) for axis in range(-rank, rank)],
-            [None, ct.target.iOS17]
-        ),
-    )
-    def test_builder_to_backend_programmatic(self, compute_unit, backend, rank_axis, minimum_deployment_target):
-        if minimum_deployment_target == ct.target.iOS17:
-            if backend[0] != "mlprogram" or _macos_version() < (14, 0):
-                pytest.skip("IOS17 target available only on macOS 14+ with mlprogram.")
-
-        rank, axis = rank_axis
-        data_shape = np.random.randint(low=2, high=8, size=rank)
-        indices_shape = np.copy(data_shape)
indices_shape[axis] = np.random.randint(low=1, high=8) - updates_shape = indices_shape - - data = np.random.rand(*data_shape).astype(np.float32) - updates = np.random.rand(*updates_shape).astype(np.float32) - if minimum_deployment_target == ct.target.iOS17: - # IOS17 scatter_along_axis requires indices to be non-negative. - indices = np.random.randint(0, data_shape[axis], size=indices_shape).astype(np.int32) - else: - indices = np.random.randint( - -data_shape[axis], data_shape[axis], size=indices_shape - ).astype(np.int32) - - def build(data, indices, updates): - return mb.scatter_along_axis( - data=data, indices=indices, updates=updates, axis=axis, mode="update" - ) - - input_placeholders = { - "data": mb.placeholder(shape=data.shape), - "indices": mb.placeholder(shape=indices.shape, dtype=types.int32), - "updates": mb.placeholder(shape=updates.shape), - } - - input_values = {"data": data, "indices": indices, "updates": updates} - - expected_output_types = tuple(data_shape[:]) + (types.fp32,) - - np_output = np.copy(data) - np.put_along_axis(np_output, indices, updates, axis=axis) - - run_compare_builder( - build, - input_placeholders, - input_values, - expected_output_types, - np_output, - compute_unit=compute_unit, - backend=backend, - minimum_deployment_target=minimum_deployment_target, - ) - - @pytest.mark.parametrize( - "compute_unit, backend, indices_val, dynamic", - itertools.product( - compute_units, - backends, - [[[-1, 0, 1], [1, 1, 0]], [[1, 10, 1], [1, 1, 0]]], - [True, False], - ), - ) - def test_ios17_invalid_indices(self, compute_unit, backend, indices_val, dynamic): - if backend[0] != "mlprogram" or _macos_version() < (14, 0): - pytest.skip("IOS17 target available only on macOS 14+") - - def build_static(data, updates): - return ( - mb.scatter_along_axis( - data=data, - indices=np.array(indices_val, dtype=np.int32), - updates=updates, - validate_indices=True, - ), - ) - - def build_dynamic(data, indices, updates): - return mb.scatter_along_axis( - data=data, indices=indices, updates=updates, axis=0, mode="update", - validate_indices=True, - ) - - data = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32) - updates = np.array([[5, 6, 7], [8, 9, 10]], dtype=np.float32) - input_placeholders = { - "data": mb.placeholder(shape=data.shape), - "updates": mb.placeholder(shape=updates.shape), - } - input_values = {"data": data, "updates": updates} - if dynamic: - indices = np.array(indices_val, dtype=np.int32) - input_placeholders["indices"] = mb.placeholder(shape=indices.shape, dtype=types.int32) - input_values["indices"] = indices - - if dynamic: - expected_error = RuntimeError - expected_error_msg = ( - "Error computing NN outputs", - "Unable to compute the prediction using a neural network model", - ) - else: - # The negative or out-of-bound indices will error out when validate_indices is set. - expected_error = IndexError - expected_error_msg = "Indices is out of bounds" - - # The negative or out-of-bound indices will error out when validate_indices is set. 
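-        # (As a concrete reading of this test's parameters: `data` has shape (2, 3) and
-        # axis=0, so valid indices lie in [0, 2); the -1 and 10 entries in the
-        # parametrized `indices_val` vectors are the ones that trigger the failure.)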
-        with pytest.raises(expected_error) as excinfo:
-            run_compare_builder(
-                build_dynamic if dynamic else build_static,
-                input_placeholders,
-                input_values,
-                expected_output_types=(2, 3, types.fp32),
-                expected_outputs=np.array([[1, 6, 10], [8, 9, 7]], dtype=np.float32),
-                compute_unit=compute_unit,
-                backend=backend,
-                minimum_deployment_target=ct.target.iOS17,
-            )
-        if not isinstance(expected_error_msg, tuple):
-            expected_error_msg = (expected_error_msg,)
-        assert any([err in str(excinfo.value) for err in expected_error_msg])
-
-
-class TestScatterNd:
-    @pytest.mark.parametrize(
-        "compute_unit, backend, minimum_deployment_target", itertools.product(
-            compute_units, backends, [None, ct.target.iOS17])
-    )
-    def test_builder_to_backend_smoke(self, compute_unit, backend, minimum_deployment_target):
-        if minimum_deployment_target == ct.target.iOS17:
-            if backend[0] != "mlprogram" or _macos_version() < (14, 0):
-                pytest.skip("IOS17 target available only on macOS 14+ with mlprogram.")
-
-        data = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32)
-        indices = np.array([[1, 0], [0, 2]], dtype=np.int32)
-        updates = np.array([5, 10], dtype=np.float32)
-        input_placeholders = {
-            "data": mb.placeholder(shape=data.shape),
-            "indices": mb.placeholder(shape=indices.shape, dtype=types.int32),
-            "updates": mb.placeholder(shape=updates.shape),
-        }
-
-        input_values = {"data": data, "indices": indices, "updates": updates}
-
-        def build(data, indices, updates):
-            return (mb.scatter_nd(data=data, indices=indices, updates=updates),)
-
-        run_compare_builder(
-            build,
-            input_placeholders,
-            input_values,
-            expected_output_types=(2, 3, types.fp32),
-            expected_outputs=np.array([[1, 2, 13], [9, 5, 6]], dtype=np.float32),
-            compute_unit=compute_unit,
-            backend=backend,
-            minimum_deployment_target=minimum_deployment_target,
-        )
-
-    @pytest.mark.skipif(not _HAS_TF_2, reason=MSG_TF2_NOT_FOUND)
-    @pytest.mark.parametrize(
-        "compute_unit, backend, rankData_rankIndices, accumulate_mode, minimum_deployment_target",
-        itertools.product(
-            compute_units,
-            backends,
-            [(2, 2), (1, 4), (5, 2), (4, 3), (3, 4), (1, 5)],
-            ["update", "add", "sub"],
-            [None, ct.target.iOS17],
-        ),
-    )
-    def test_builder_to_backend_programmatic(
-        self, compute_unit, backend, rankData_rankIndices, accumulate_mode, minimum_deployment_target
-    ):
-        if minimum_deployment_target == ct.target.iOS17:
-            if backend[0] != "mlprogram" or _macos_version() < (14, 0):
-                pytest.skip("IOS17 target available only on macOS 14+ with mlprogram.")
-
-        data_rank, indices_rank = rankData_rankIndices
-        data_shape = np.random.randint(low=2, high=5, size=data_rank)
-        indices_shape = np.random.randint(low=2, high=5, size=indices_rank)
-        indices_shape[-1] = np.random.randint(low=1, high=data_rank + 1)
-        updates_shape = list(indices_shape[:-1]) + list(data_shape[indices_shape[-1] :])
-
-        data = np.random.rand(*data_shape).astype(np.float32)
-        updates = np.random.rand(*updates_shape).astype(np.float32)
-        indices_list = []
-        for i in range(indices_shape[-1]):
-            indices_list.append(
-                np.random.randint(0, data_shape[i], size=indices_shape[:-1])
-            )
-
-        indices = np.stack(indices_list, axis=-1).astype(np.int32)
-
-        def build(data, indices, updates):
-            return mb.scatter_nd(
-                data=data, indices=indices, updates=updates, mode=accumulate_mode
-            )
-
-        tf_output = tf.Variable(data)
-        if accumulate_mode == "update":
-            tf.compat.v1.scatter_nd_update(tf_output, indices, updates)
-        if accumulate_mode == "add":
-            tf.compat.v1.scatter_nd_add(tf_output, indices, updates)
-        if accumulate_mode
== "sub": - tf.compat.v1.scatter_nd_sub(tf_output, indices, updates) - expected_output = tf_output.numpy() - - input_placeholders = { - "data": mb.placeholder(shape=data.shape), - "indices": mb.placeholder(shape=indices.shape, dtype=types.int32), - "updates": mb.placeholder(shape=updates.shape), - } - - input_values = {"data": data, "indices": indices, "updates": updates} - - expected_output_types = tuple(data_shape[:]) + (types.fp32,) - run_compare_builder( - build, - input_placeholders, - input_values, - expected_output_types, - expected_output, - compute_unit=compute_unit, - backend=backend, - minimum_deployment_target=minimum_deployment_target, - ) - - @pytest.mark.parametrize( - "compute_unit, backend, indices_val, dynamic", - itertools.product( - compute_units, backends, [[[1, 0], [0, -1]], [[1, 0], [0, 3]]], [True, False] - ), - ) - def test_ios17_invalid_indices(self, compute_unit, backend, indices_val, dynamic): - if backend[0] != "mlprogram" or _macos_version() < (14, 0): - pytest.skip("IOS17 target available only on macOS 14+") - - def build_static(data, updates): - return ( - mb.scatter_nd( - data=data, - indices=np.array(indices_val, dtype=np.int32), - updates=updates, - validate_indices=True, - ), - ) - - def build_dynamic(data, indices, updates): - return ( - mb.scatter_nd(data=data, indices=indices, updates=updates, validate_indices=True), - ) - - data = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32) - updates = np.array([5, 10], dtype=np.float32) - input_placeholders = { - "data": mb.placeholder(shape=data.shape), - "updates": mb.placeholder(shape=updates.shape), - } - input_values = {"data": data, "updates": updates} - if dynamic: - indices = np.array(indices_val, dtype=np.int32) - input_placeholders["indices"] = mb.placeholder(shape=indices.shape, dtype=types.int32) - input_values["indices"] = indices - - if dynamic: - expected_error = RuntimeError - expected_error_msg = ( - "Error computing NN outputs", - "Unable to compute the prediction using a neural network model", - ) - else: - # The negative or out-of-bound indices will error out when validate_indices is set. 
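-            # Concretely, each row of `indices` here addresses one element of the (2, 3)
-            # `data` tensor, so [0, -1] (negative) and [0, 3] are both invalid, e.g.:
-            #     >>> np.array([[1, 2, 3], [4, 5, 6]])[0, 3]
-            #     IndexError: index 3 is out of bounds for axis 1 with size 3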
-            expected_error = IndexError
-            expected_error_msg = "Indices is out of bounds"
-
-        with pytest.raises(expected_error) as excinfo:
-            run_compare_builder(
-                build_dynamic if dynamic else build_static,
-                input_placeholders,
-                input_values,
-                expected_output_types=(2, 3, types.fp32),
-                expected_outputs=np.array([[1, 2, 13], [9, 5, 6]], dtype=np.float32),
-                compute_unit=compute_unit,
-                backend=backend,
-                minimum_deployment_target=ct.target.iOS17,
-            )
-        if not isinstance(expected_error_msg, tuple):
-            expected_error_msg = (expected_error_msg,)
-        assert any([err in str(excinfo.value) for err in expected_error_msg])
-
-
-class TestGather:
-    @pytest.mark.parametrize(
-        "compute_unit, backend, minimum_deployment_target",
-        itertools.product(compute_units, backends, [None, ct.target.iOS17]),
-    )
-    def test_builder_to_backend_smoke(self, compute_unit, backend, minimum_deployment_target):
-        if minimum_deployment_target == ct.target.iOS17:
-            if backend[0] != "mlprogram" or _macos_version() < (14, 0):
-                pytest.skip("IOS17 target available only on macOS 14+ with mlprogram.")
-
-        x = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32)
-        indices = np.array([1, 0], dtype=np.int32)
-        input_placeholders = {
-            "x": mb.placeholder(shape=x.shape),
-            "indices": mb.placeholder(shape=indices.shape, dtype=types.int32),
-        }
-
-        input_values = {"x": x, "indices": indices}
-
-        def build(x, indices):
-            return [
-                mb.gather(x=x, indices=indices, axis=0),
-                mb.gather(x=x, indices=indices, axis=1),
-                mb.gather(x=x, indices=indices, axis=-2),
-                mb.gather(x=x, indices=indices, axis=-1),
-                mb.gather(x=x, indices=indices),
-                # mb.gather(x=x, indices=1),  # shape of scalar indices is incorrect.
-                # mb.gather(x=x, indices=1, axis=1),  # Scalar index passes on axis=0 but fails on axis=1;
-                # rank 0 needs to be handled correctly, rdar://73160449
-            ]
-
-        expected_output_types = [
-            (2, 3, types.fp32),
-            (2, 2, types.fp32),
-            (2, 3, types.fp32),
-            (2, 2, types.fp32),
-            (2, 3, types.fp32),
-            # (3, types.fp32),
-        ]
-
-        expected_outputs = [
-            np.array([[4, 5, 6], [1, 2, 3]], dtype=np.float32),
-            np.array([[2, 1], [5, 4]], dtype=np.float32),
-            np.array([[4, 5, 6], [1, 2, 3]], dtype=np.float32),
-            np.array([[2, 1], [5, 4]], dtype=np.float32),
-            np.array([[4, 5, 6], [1, 2, 3]], dtype=np.float32),
-            # np.array([4, 5, 6], dtype=np.float32),
-        ]
-
-        run_compare_builder(
-            build,
-            input_placeholders,
-            input_values,
-            expected_output_types,
-            expected_outputs,
-            compute_unit=compute_unit,
-            backend=backend,
-            minimum_deployment_target=minimum_deployment_target,
-        )
-
-    @pytest.mark.parametrize(
-        "compute_unit, backend, minimum_deployment_target",
-        itertools.product(compute_units, backends, [ct.target.iOS16, ct.target.iOS17]),
-    )
-    def test_builder_to_backend_smoke_batch_dims(
-        self, compute_unit, backend, minimum_deployment_target
-    ):
-        if backend[0] == "neuralnetwork":
-            pytest.skip("nn backend not supported")
-        if ct.utils._macos_version() < (13, 0):
-            pytest.skip("batch_dims not supported in macOS12 or older.")
-        if minimum_deployment_target == ct.target.iOS17:
-            if _macos_version() < (14, 0):
-                pytest.skip("IOS17 target available only on macOS 14+ with mlprogram.")
-
-        x = np.array([[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]], dtype=np.float32)
-        indices = np.array([[[1, 0], [0, 1]], [[1, 0], [0, 0]]], dtype=np.int32)
-
-        input_placeholders = {
-            "x": mb.placeholder(shape=x.shape),
-            "indices": mb.placeholder(shape=indices.shape, dtype=types.int32),
-        }
-
-        input_values = {"x": x, "indices": indices}
-
-        def build(x, indices):
-            return [
mb.gather(x=x, indices=indices, axis=1, batch_dims=0), - mb.gather(x=x, indices=indices, axis=1, batch_dims=1), - mb.gather(x=x, indices=indices, axis=2, batch_dims=0), - mb.gather(x=x, indices=indices, axis=2, batch_dims=1), - mb.gather(x=x, indices=indices, axis=2, batch_dims=2), - ] - - expected_output_types = [ - (2, 2, 2, 2, 3, types.fp32), - (2, 2, 2, 3, types.fp32), - (2, 2, 2, 2, 2, types.fp32), - (2, 2, 2, 2, types.fp32), - (2, 2, 2, types.fp32), - ] - - expected_outputs = [ - np.array([[[[[ 4, 5, 6], - [ 1, 2, 3]], - [[ 1, 2, 3], - [ 4, 5, 6]]], - [[[ 4, 5, 6], - [ 1, 2, 3]], - [[ 1, 2, 3], - [ 1, 2, 3]]]], - [[[[10, 11, 12], - [ 7, 8, 9]], - [[ 7, 8, 9], - [10, 11, 12]]], - [[[10, 11, 12], - [ 7, 8, 9]], - [[ 7, 8, 9], - [ 7, 8, 9]]]]], dtype=np.float32 - ), - np.array([[[[ 4, 5, 6], - [ 1, 2, 3]], - [[ 1, 2, 3], - [ 4, 5, 6]]], - [[[10, 11, 12], - [ 7, 8, 9]], - [[ 7, 8, 9], - [ 7, 8, 9]]]], dtype=np.float32 - ), - np.array([[[[[ 2, 1], - [ 1, 2]], - [[ 2, 1], - [ 1, 1]]], - [[[ 5, 4], - [ 4, 5]], - [[ 5, 4], - [ 4, 4]]]], - [[[[ 8, 7], - [ 7, 8]], - [[ 8, 7], - [ 7, 7]]], - [[[11, 10], - [10, 11]], - [[11, 10], - [10, 10]]]]], dtype=np.float32 - ), - np.array([[[[ 2, 1], - [ 1, 2]], - [[ 5, 4], - [ 4, 5]]], - [[[ 8, 7], - [ 7, 7]], - [[11, 10], - [10, 10]]]], dtype=np.float32 - ), - np.array([[[ 2, 1], - [ 4, 5]], - [[ 8, 7], - [10, 10]]], dtype=np.float32 - ), - ] - - run_compare_builder( - build, - input_placeholders, - input_values, - expected_output_types, - expected_outputs, - compute_unit=compute_unit, - backend=backend, - minimum_deployment_target=minimum_deployment_target, - ) - - @pytest.mark.parametrize( - "opset_version", - [ct.target.iOS16, ct.target.iOS17], - ) - def test_builder_eval_batch_dims(self, opset_version): - @mb.program( - input_specs=[mb.TensorSpec(shape=(1,), dtype=types.fp32)], opset_version=opset_version - ) - def prog(x): - params = np.array([[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]], dtype=np.float32) - indices = np.array([[[1, 0], [0, 1]], [[1, 0], [0, 0]]], dtype=np.int32) - res = mb.gather(x=params, indices=indices, axis=2, batch_dims=2) - return res - - main_func = prog.functions["main"] - gather_ops = main_func.find_ops(op_type="gather")[0] - - np.testing.assert_allclose( - np.array([[[2, 1], [4, 5]], [[8, 7], [10, 10]]], dtype=np.float32), - gather_ops.outputs[0].val, - atol=1e-04, - rtol=1e-05 - ) - - @pytest.mark.parametrize( - "compute_unit, backend, minimum_deployment_target", - itertools.product(compute_units, backends, [None, ct.target.iOS17]), - ) - def test_embedding_builder_to_backend_smoke( - self, compute_unit, backend, minimum_deployment_target - ): - if minimum_deployment_target == ct.target.iOS17: - if backend[0] != "mlprogram" or _macos_version() < (14, 0): - pytest.skip("IOS17 target available only on macOS 14+ with mlprogram.") - - x = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32) - indices = np.array([1, 0], dtype=np.int32) - input_placeholders = { - "indices": mb.placeholder(shape=indices.shape, dtype=types.int32), - } - - input_values = {"indices": indices} - - def build(indices): - return [ - mb.gather(x=x, indices=indices, axis=0), - mb.gather(x=x, indices=indices, axis=-2), - ] - - expected_output_types = [ - (2, 3, types.fp32), - (2, 3, types.fp32), - ] - - expected_outputs = [ - np.array([[4, 5, 6], [1, 2, 3]], dtype=np.float32), - np.array([[4, 5, 6], [1, 2, 3]], dtype=np.float32), - ] - - run_compare_builder( - build, - input_placeholders, - input_values, - expected_output_types, - expected_outputs, - 
compute_unit=compute_unit, - backend=backend, - minimum_deployment_target=minimum_deployment_target, - ) - - @pytest.mark.parametrize( - "opset_version", - [ct.target.iOS15, ct.target.iOS16, ct.target.iOS17], - ) - def test_builder_eval(self, opset_version): - @mb.program( - input_specs=[mb.TensorSpec(shape=(1,), dtype=types.fp32)], opset_version=opset_version - ) - def prog(x): - params = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32) - indices = np.array([1, 0], dtype=np.int32) - res = mb.gather(x=params, indices=indices, axis=-1) - return res - - main_func = prog.functions["main"] - gather_ops = main_func.find_ops(op_type="gather")[0] - - np.testing.assert_allclose( - np.array([[2, 1], [5, 4]], dtype=np.float32), - gather_ops.outputs[0].val, - atol=1e-04, - rtol=1e-05, - ) - - @pytest.mark.parametrize( - "indices_val, validate_indices, opset_version", - itertools.product([[-1, 0], [0, 3]], [True, False], [None, ct.target.iOS17]), - ) - def test_builder_invalid_indices(self, indices_val, validate_indices, opset_version): - def prog(x): - params = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32) - indices = np.array(indices_val, dtype=np.int32) - if opset_version == ct.target.iOS17: - res = mb.gather( - x=params, indices=indices, axis=-1, validate_indices=validate_indices - ) - else: - res = mb.gather(x=params, indices=indices, axis=-1) - return res - - if opset_version == ct.target.iOS17 and validate_indices: - with pytest.raises(IndexError, match="Indices is out of bounds for `gather` node"): - mb.program( - input_specs=[mb.TensorSpec(shape=(1,), dtype=types.fp32)], - opset_version=opset_version, - )(prog) - elif any([idx > 2 for idx in indices_val]): - # If the indices are not validated during type inference for IOS17, the `gather` op's - # value inference will raise error for out-of-bound index. 
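-            # (With `params` of shape (2, 3) and axis=-1, any index > 2 is out of
-            # bounds, which is exactly what the NumPy-style message below reports.)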
-            with pytest.raises(IndexError, match="index 3 is out of bounds for axis 1 with size 3"):
-                mb.program(
-                    input_specs=[mb.TensorSpec(shape=(1,), dtype=types.fp32)],
-                    opset_version=opset_version,
-                )(prog)
-        else:
-            mb.program(
-                input_specs=[mb.TensorSpec(shape=(1,), dtype=types.fp32)],
-                opset_version=opset_version,
-            )(prog)
-
-
-class TestGatherAlongAxis:
-    @pytest.mark.parametrize(
-        "compute_unit, backend, minimum_deployment_target",
-        itertools.product(compute_units, backends, [None, ct.target.iOS17]),
-    )
-    def test_builder_to_backend_smoke(self, compute_unit, backend, minimum_deployment_target):
-        if minimum_deployment_target == ct.target.iOS17:
-            if backend[0] != "mlprogram" or _macos_version() < (14, 0):
-                pytest.skip("IOS17 target available only on macOS 14+ with mlprogram.")
-
-        x = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32)
-        indices = np.array([[1, 0, 1], [1, 1, 0]], dtype=np.int32)
-        input_placeholders = {
-            "x": mb.placeholder(shape=x.shape),
-            "indices": mb.placeholder(shape=indices.shape, dtype=types.int32),
-        }
-
-        input_values = {"x": x, "indices": indices}
-
-        def build(x, indices):
-            return [
-                mb.gather_along_axis(x=x, indices=indices, axis=0),
-                mb.gather_along_axis(x=x, indices=indices, axis=1),
-                mb.gather_along_axis(x=x, indices=indices, axis=-2),
-                mb.gather_along_axis(x=x, indices=indices, axis=-1),
-                mb.gather_along_axis(x=x, indices=indices),
-            ]
-
-        expected_output_types = [
-            (2, 3, types.fp32),
-            (2, 3, types.fp32),
-            (2, 3, types.fp32),
-            (2, 3, types.fp32),
-            (2, 3, types.fp32),
-        ]
-
-        expected_outputs = [
-            np.array([[4, 2, 6], [4, 5, 3]], dtype=np.float32),
-            np.array([[2, 1, 2], [5, 5, 4]], dtype=np.float32),
-            np.array([[4, 2, 6], [4, 5, 3]], dtype=np.float32),
-            np.array([[2, 1, 2], [5, 5, 4]], dtype=np.float32),
-            np.array([[4, 2, 6], [4, 5, 3]], dtype=np.float32),
-        ]
-
-        run_compare_builder(
-            build,
-            input_placeholders,
-            input_values,
-            expected_output_types,
-            expected_outputs,
-            compute_unit=compute_unit,
-            backend=backend,
-            minimum_deployment_target=minimum_deployment_target,
-        )
-
-    @pytest.mark.parametrize(
-        "opset_version",
-        [ct.target.iOS15, ct.target.iOS16, ct.target.iOS17],
-    )
-    def test_builder_eval(self, opset_version):
-        @mb.program(
-            input_specs=[mb.TensorSpec(shape=(1,), dtype=types.fp32)], opset_version=opset_version
-        )
-        def prog(x):
-            params = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32)
-            indices = np.array([[1, 0, 1], [0, 0, 1]], dtype=np.int32)
-            res = mb.gather_along_axis(x=params, indices=indices, axis=0)
-            return res
-
-        main_func = prog.functions["main"]
-        gather_ops = main_func.find_ops(op_type="gather_along_axis")[0]
-
-        np.testing.assert_allclose(
-            np.array([[4, 2, 6], [1, 2, 6]], dtype=np.float32),
-            gather_ops.outputs[0].val,
-            atol=1e-04,
-            rtol=1e-05,
-        )
-
-    @pytest.mark.parametrize(
-        "compute_unit, backend, rank_axis, minimum_deployment_target",
-        itertools.product(
-            compute_units,
-            backends,
-            [(rank, axis) for rank in range(1, 5) for axis in range(-rank, rank)],
-            [None, ct.target.iOS17],
-        ),
-    )
-    def test_builder_to_backend_programmatic(
-        self, compute_unit, backend, rank_axis, minimum_deployment_target
-    ):
-        if minimum_deployment_target == ct.target.iOS17:
-            if backend[0] != "mlprogram" or _macos_version() < (14, 0):
-                pytest.skip("IOS17 target available only on macOS 14+ with mlprogram.")
-        if backend[0] == "mlprogram" and compute_unit != ct.ComputeUnit.CPU_ONLY:
-            pytest.xfail("rdar://97398875 (TestGatherAlongAxis failing on mlprogram + GPU)")
-        rank, axis = rank_axis
x_shape = np.random.randint(low=2, high=8, size=rank) - indices_shape = np.copy(x_shape) - indices_shape[axis] = np.random.randint(low=1, high=8) - - x = np.random.rand(*x_shape).astype(np.float32) - # IOS17 gather_along_axis requires non-negative indices. - lower_bound = 0 if minimum_deployment_target == ct.target.iOS17 else -x_shape[axis] - indices = np.random.randint(lower_bound, x_shape[axis], size=indices_shape).astype(np.int32) - - def build(x, indices): - return mb.gather_along_axis(x=x, indices=indices, axis=axis) - - input_placeholders = { - "x": mb.placeholder(shape=x.shape), - "indices": mb.placeholder(shape=indices.shape, dtype=types.int32), - } - - input_values = {"x": x, "indices": indices} - - expected_output_types = tuple(indices_shape[:]) + (types.fp32,) - expected_output = np.take_along_axis(x, indices, axis=axis) - - run_compare_builder( - build, - input_placeholders, - input_values, - expected_output_types, - expected_output, - compute_unit=compute_unit, - backend=backend, - minimum_deployment_target=minimum_deployment_target, - ) - - @pytest.mark.parametrize( - "indices_val, validate_indices, opset_version", - itertools.product( - [[[1, 0, -1], [0, 0, 1]], [[1, 0, 1], [0, 0, 2]]], - [True, False], - [None, ct.target.iOS17], - ), - ) - def test_builder_invalid_indices(self, indices_val, validate_indices, opset_version): - def prog(x): - params = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32) - indices = np.array(indices_val, dtype=np.int32) - if opset_version == ct.target.iOS17: - res = mb.gather_along_axis( - x=params, indices=indices, axis=0, validate_indices=validate_indices - ) - else: - res = mb.gather_along_axis(x=params, indices=indices, axis=0) - return res - - if opset_version == ct.target.iOS17 and validate_indices: - with pytest.raises( - IndexError, match="Indices is out of bounds for `gather_along_axis` node" - ): - mb.program( - input_specs=[mb.TensorSpec(shape=(1,), dtype=types.fp32)], - opset_version=opset_version, - )(prog) - elif any([idx > 1 for sub_indices in indices_val for idx in sub_indices]): - # If the indices are not validated during type inference for IOS17, the `gather` op's - # value inference will raise error for out-of-bound index. 
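-            # (Axis 0 of the (2, 3) `params` has size 2 here, so index 2 is the first
-            # out-of-bounds value, matching the message below.)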
- with pytest.raises(IndexError, match="index 2 is out of bounds for axis 0 with size 2"): - mb.program( - input_specs=[mb.TensorSpec(shape=(1,), dtype=types.fp32)], - opset_version=opset_version, - )(prog) - else: - mb.program( - input_specs=[mb.TensorSpec(shape=(1,), dtype=types.fp32)], - opset_version=opset_version, - )(prog) - - -class TestGatherNd: - @pytest.mark.parametrize( - "compute_unit, backend, minimum_deployment_target", - itertools.product(compute_units, backends, [None, ct.target.iOS17]), - ) - def test_builder_to_backend_smoke(self, compute_unit, backend, minimum_deployment_target): - if minimum_deployment_target == ct.target.iOS17: - if backend[0] != "mlprogram" or _macos_version() < (14, 0): - pytest.skip("IOS17 target available only on macOS 14+ with mlprogram.") - - x = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32) - indices = np.array([[1, 0], [0, 2]], dtype=np.int32) - input_placeholders = { - "x": mb.placeholder(shape=x.shape), - "indices": mb.placeholder(shape=indices.shape, dtype=types.int32), - } - - input_values = {"x": x, "indices": indices} - - def build(x, indices): - return (mb.gather_nd(x=x, indices=indices),) - - expected_output_types = (2, types.fp32) - expected_outputs = np.array([4, 3], dtype=np.float32) - - run_compare_builder( - build, - input_placeholders, - input_values, - expected_output_types, - expected_outputs, - compute_unit=compute_unit, - frontend_only=False, - backend=backend, - minimum_deployment_target=minimum_deployment_target, - ) - - @pytest.mark.parametrize( - "compute_unit, backend, minimum_deployment_target", - itertools.product(compute_units, backends, [ct.target.iOS16, ct.target.iOS17]), - ) - def test_builder_to_backend_smoke_batch_dims( - self, compute_unit, backend, minimum_deployment_target - ): - if backend[0] == "neuralnetwork": - pytest.skip("nn backend not supported") - if ct.utils._macos_version() < (13, 0): - pytest.skip("batch_dims not supported in macOS12 or older.") - if minimum_deployment_target == ct.target.iOS17: - if backend[0] != "mlprogram" or _macos_version() < (14, 0): - pytest.skip("IOS17 target available only on macOS 14+ with mlprogram.") - - x = np.array([[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]], dtype=np.float32) - indices = np.array([[[1, 0], [0, 1]], [[1, 0], [0, 0]]], dtype=np.int32) - - input_placeholders = { - "x": mb.placeholder(shape=x.shape), - "indices": mb.placeholder(shape=indices.shape, dtype=types.int32), - } - - input_values = {"x": x, "indices": indices} - - def build(x, indices): - return [ - mb.gather_nd(x=x, indices=indices, batch_dims=0), - mb.gather_nd(x=x, indices=indices, batch_dims=1), - ] - - expected_output_types = [ - (2, 2, 3, types.fp32), - (2, 2, types.fp32) - ] - - expected_outputs = [ - np.array([[[7, 8, 9], - [4, 5, 6]], - [[7, 8, 9], - [1, 2, 3]]], dtype=np.float32 - ), - np.array([[ 4, 2], - [10, 7]], dtype=np.float32 - ), - ] - - run_compare_builder( - build, - input_placeholders, - input_values, - expected_output_types, - expected_outputs, - compute_unit=compute_unit, - backend=backend, - minimum_deployment_target=minimum_deployment_target, - ) - - @pytest.mark.parametrize( - "indices_val, validate_indices, opset_version", - itertools.product( - [[[-1], [2]], [[1], [3]]], [True, False], [ct.target.iOS16, ct.target.iOS17] - ), - ) - def test_builder_invalid_indices(self, indices_val, validate_indices, opset_version): - def prog(x): - params = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32) - indices = np.array(indices_val, dtype=np.int32) - if 
opset_version == ct.target.iOS17: - res = mb.gather_nd( - x=params, indices=indices, batch_dims=1, validate_indices=validate_indices - ) - else: - res = mb.gather_nd(x=params, indices=indices, batch_dims=1) - return res - - if opset_version == ct.target.iOS17 and validate_indices: - with pytest.raises(IndexError, match="Indices is out of bounds for `gather_nd` node"): - mb.program( - input_specs=[mb.TensorSpec(shape=(1,), dtype=types.fp32)], - opset_version=opset_version, - )(prog) - else: - mb.program( - input_specs=[mb.TensorSpec(shape=(1,), dtype=types.fp32)], - opset_version=opset_version, - )(prog) diff --git a/coremltools/converters/mil/mil/ops/tests/test_slice.py b/coremltools/converters/mil/mil/ops/tests/test_slice.py deleted file mode 100644 index b5ab0a02b..000000000 --- a/coremltools/converters/mil/mil/ops/tests/test_slice.py +++ /dev/null @@ -1,403 +0,0 @@ -# Copyright (c) 2020, Apple Inc. All rights reserved. -# -# Use of this source code is governed by a BSD-3-clause license that can be -# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause - -import itertools - -import numpy as np -import pytest - -import coremltools as ct -from coremltools.converters.mil.mil import Builder as mb -from coremltools.converters.mil.mil import get_new_symbol, types -from coremltools.converters.mil.testing_reqs import backends, compute_units -from coremltools.converters.mil.testing_utils import ssa_fn - -from .testing_utils import UNK_SYM, run_compare_builder - - -class TestSliceByIndex: - @pytest.mark.parametrize( - "compute_unit, backend", itertools.product(compute_units, backends,) - ) - def test_builder_to_backend_smoke(self, compute_unit, backend): - x_val = np.array(list(range(24))).reshape((2, 3, 4)).astype(np.float32) - begin_val = np.array([1, 1, 1], dtype=np.int32) - end_val = np.array([2, 3, 3], dtype=np.int32) - input_placeholders = { - "x": mb.placeholder(shape=x_val.shape), - "begin": mb.placeholder(shape=begin_val.shape, dtype=types.int32), - "end": mb.placeholder(shape=end_val.shape, dtype=types.int32), - } - input_values = {"x": x_val, "begin": begin_val, "end": end_val} - - def build(x, begin, end): - begin_c = mb.const(val=begin_val) - end_c = mb.const(val=end_val) - return [ - mb.slice_by_index(x=x, begin=begin, end=end), - mb.slice_by_index(x=x, begin=begin_c, end=end_c) - ] - - expected_output_types = [(UNK_SYM, UNK_SYM, UNK_SYM, types.fp32)] * 2 - expected_outputs = [np.array([[[17, 18], [21, 22]]], dtype=np.float32)] * 2 - run_compare_builder( - build, - input_placeholders, - input_values, - expected_output_types, - expected_outputs, - compute_unit=compute_unit, - backend=backend, - ) - - def test_type_inference(self): - s0 = get_new_symbol() - s1 = get_new_symbol() - s2 = get_new_symbol() - - input_placeholders = { - "x": mb.placeholder(shape=(10, s0, s1, s2)), - } - - def build(x): - return [ - mb.slice_by_index( - x=x, begin=[2, 5, 6, 12], end=[6, 9, 20, -9], stride=[2, 1, 2, 1] - ), - mb.slice_by_index( - x=x, - begin=[-2, -5, -3, 9], - end=[-6, -9, -6, -7], - stride=[-2, -1, -2, 1], - ), - mb.slice_by_index( - x=x, - begin=[0, 0, 0, 0], - end=[-6, -9, 3, -2], - stride=[-2, -3, 1, 2], - begin_mask=[True, True, True, True], - end_mask=[False, False, False, False], - ), - mb.slice_by_index( - x=x, - begin=[-2, 5, -1, -7], - end=[0, 0, 0, 0], - stride=[-2, -3, 1, -2], - begin_mask=[False, False, False, False], - end_mask=[True, True, True, True], - ), - mb.slice_by_index( - x=x, begin=[4, -1, 0, -5], end=[4, -1, 0, -5], stride=[1, -1, 2, -2] - 
), - ] - - expected_output_types = [ - (2, 4, 7, UNK_SYM, types.fp32), - (2, 4, 2, UNK_SYM, types.fp32), - (3, 3, 3, UNK_SYM, types.fp32), - (5, 2, 1, UNK_SYM, types.fp32), - (0, 0, 0, 0, types.fp32), - ] - - run_compare_builder( - build, - input_placeholders, - expected_output_types=expected_output_types, - frontend_only=True, - ) - - - @pytest.mark.xfail(reason="rdar://99664032") - @pytest.mark.parametrize( - "compute_unit, backend", itertools.product(compute_units, backends,) - ) - def test_single_element_edge_case(self, compute_unit, backend): - x_val = np.array(list(range(6))).reshape((1, 3, 2)).astype(np.float32) - input_placeholders = { - "x": mb.placeholder(shape=x_val.shape), - } - input_values = {"x": x_val} - - def build(x): - return mb.slice_by_index( - x=x, - begin=[-1, 0, 0], - end=[-2, 0, 0], - stride=[-1, 1, 1], - begin_mask=[False, True, True], - end_mask=[False, True, True] - ) - - expected_output_types = [(1, 3, 2, types.fp32)] - expected_outputs = [np.array([[[0, 1], [2, 3], [4, 5]]], dtype=np.float32)] - run_compare_builder( - build, - input_placeholders, - input_values, - expected_output_types, - expected_outputs, - compute_unit=compute_unit, - backend=backend, - ) - - @ssa_fn - def test_builder_eval_scalar_output_corner_cases(self): - x1 = np.array([2.]) - x2 = np.array([[[[1.],[3.]]]]) - v = [ - mb.slice_by_index( - x=x1, begin=[0,], end=[0], squeeze_mask=[True], - ), - mb.slice_by_index( - x=x2, begin=[0, 0, 0, 0], end=[0, 0, 0, 0], squeeze_mask=[True, True, True, True], - ), - ] - assert v[0].val.shape == () - assert v[0].val == 2 - assert v[1].val.shape == () - assert v[1].val == 1 - - @ssa_fn - def test_builder_eval(self): - x_val = np.array(list(range(24))).reshape((2, 3, 4)) - v = [ - mb.slice_by_index( - x=x_val, begin=[1, 1, 1], end=[2, 2, 2] - ), # x_val[1:2, 1:2, 1:2] - mb.slice_by_index( - x=x_val, begin=[1, 1, 1], end=[2, 3, 4], stride=[1, 1, 2] - ), # x_val[1:2, 1:3, 1:4:2] - mb.slice_by_index( - x=x_val, begin=[-3, -3, -3], end=[-1, -1, -1] - ), # x_val[-3:-1, -3:-1, -3:-1] - mb.slice_by_index( - x=x_val, begin=[0, 0, -3], end=[-1, -2, -2] - ), # x_val[0:-1, 0:-2, -3:-2] - mb.slice_by_index( - x=x_val, begin=[-1, -1, -1], end=[0, 1, -3], stride=[-2, -1, -3] - ), # x_val[-1:0:-2, -1:1:-1, -1:-3:-3] - mb.slice_by_index( - x=x_val, - begin=[1, 1, 1], - end=[2, 3, 4], - stride=[1, 1, 2], - begin_mask=[True, False, True], - ), # x_val[:2, 1:3, :4:2] - mb.slice_by_index( - x=x_val, - begin=[1, 1, 1], - end=[2, 3, 4], - stride=[1, 1, 2], - begin_mask=[True, False, True], - end_mask=[True, True, False], - ), # x_val[:, 1:, :4:2] - mb.slice_by_index( - x=x_val, - begin=[1, 1, 1], - end=[2, 3, 4], - stride=[1, 1, 2], - begin_mask=[False, False, True], - end_mask=[True, False, False], - squeeze_mask=[False, True, False], - ), # x_val[1::1, 1, :3:2] - mb.slice_by_index( - x=x_val, - begin=[0, 0, 0], - end=[0, 0, 0], - stride=[1, 1, 1], - begin_mask=[True, True, True], - end_mask=[True, True, True], - ), # x_val[:, :, :] - mb.slice_by_index( - x=x_val, - begin=[1, 1, 1], - end=[2, 2, 0], - stride=[1, 1, 1], - squeeze_mask=[False, False, True], - ), # x_val[1:2, 1:2, 1] - mb.slice_by_index( - x=x_val, - begin=[1, 0, 0], - end=[2, 0, 0], - stride=[1, 1, 1], - begin_mask=[False, True, True], - end_mask=[False, True, True], - ), # x_val[1:2, ...] - mb.slice_by_index( - x=x_val, - begin=[0, 0, 0], - end=[0, 0, 0], - stride=[1, 1, 1], - begin_mask=[True, True, True], - end_mask=[True, True, True], - ), # x_val[...] 
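-            # (Reading the masks in these cases: begin_mask / end_mask replace the
-            # corresponding begin / end entries with the start / end of that axis, and
-            # squeeze_mask drops the axis entirely, yielding the NumPy slice noted
-            # next to each call.)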
- mb.slice_by_index( - x=x_val, - begin=[1, 0, 1], - end=[2, 0, 2], - stride=[1, 1, 1], - begin_mask=[False, True, False], - end_mask=[False, True, False], - ), # x_val[1:2, ..., 1:2] - mb.slice_by_index( - x=x_val, - begin=[0, 0, 1], - end=[0, 0, 0], - stride=[1, 1, 1], - begin_mask=[True, True, False], - end_mask=[True, True, False], - squeeze_mask=[False, False, True], - ), # x_val[..., 1] - mb.slice_by_index( - x=x_val, - begin=[0, 0, 0], - end=[0, 0, 0], - stride=[1, 1, 1], - begin_mask=[False, False, True], - end_mask=[False, False, True], - squeeze_mask=[True, True, False], - ), # x_val[0, 0, :] - mb.slice_by_index( - x=x_val, - begin=[1, 0, 0], - end=[2, 0, 0], - stride=[1, 1, 1], - begin_mask=[False, True, True], - end_mask=[False, True, True], - ), # x_val[1:2] - mb.slice_by_index( - x=x_val, - begin=[1, 1, 0], - end=[2, 2, 0], - stride=[1, 1, 1], - begin_mask=[False, False, True], - end_mask=[False, False, True], - ), # x_val[1:2, 1:2] - mb.slice_by_index( - x=x_val, - begin=[1, 0, 0], - end=[0, 0, 0], - stride=[1, 1, 1], - begin_mask=[False, True, True], - end_mask=[False, True, True], - squeeze_mask=[True, False, False], - ), # x_val[1] - mb.slice_by_index( - x=x_val, - begin=[0, 0, 0], - end=[0, 0, 0], - begin_mask=[True, True, True], - end_mask=[True, True, True], - ), # x_val[:] - mb.slice_by_index( - x=x_val, - begin=[0, 0, 0], - end=[0, 0, 0], - stride=[1, 1, -1], - begin_mask=[True, True, True], - end_mask=[True, True, True], - ), # x_val[..., ::-1] - ] - ans = [ - x_val[1:2, 1:2, 1:2], - x_val[1:2, 1:3, 1:4:2], - x_val[-3:-1, -3:-1, -3:-1], - x_val[0:-1, 0:-2, -3:-2], - x_val[-1:0:-2, -1:1:-1, -1:-3:-3], - x_val[:2, 1:3, :4:2], - x_val[:, 1:, :4:2], - x_val[1::1, 1, :3:2], - x_val[:, :, :], - x_val[1:2, 1:2, 1], - x_val[1:2, ...], - x_val[...], - x_val[1:2, ..., 1:2], - x_val[..., 1], - x_val[0, 0, :], - x_val[1:2], - x_val[1:2, 1:2], - x_val[1], - x_val[:], - x_val[..., ::-1], - ] - for idx in range(len(v)): - assert ans[idx].shape == v[idx].shape - np.testing.assert_allclose(ans[idx], v[idx].val, atol=1e-04, rtol=1e-05) - - - @staticmethod - def test_slice_by_index(): - INPUT_SHAPE = (1, 2, 8, 16) - - @mb.program(input_specs=[mb.TensorSpec(shape=INPUT_SHAPE)]) - def prog(x): - x = mb.slice_by_index( - x=x, - begin=[0, 0, 0, 0], - end=[1, 2, 8, 12], - stride=[1, 1, 2, 2], - begin_mask=None, - end_mask=None, - squeeze_mask=None, - ) - return x - - x = np.random.rand(*INPUT_SHAPE) - - # slice by index is x[begin[0]: end[0]: stride[0], begin[1]: end[1]: stride[1], ...] 
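-        # so for begin=[0, 0, 0, 0], end=[1, 2, 8, 12], stride=[1, 1, 2, 2] above, the
-        # NumPy equivalent is x[0:1:1, 0:2:1, 0:8:2, 0:12:2], computed next as the reference.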
-        y_numpy = x[0:1:1, 0:2:1, 0:8:2, 0:12:2]
-
-        model = ct.convert(prog, source="milinternal", convert_to="neuralnetwork")
-        y_neuralnetwork = list(model.predict({'x': x}).values())[0]
-        np.testing.assert_allclose(y_numpy, y_neuralnetwork)
-
-        model = ct.convert(
-            prog,
-            source="milinternal",
-            convert_to="mlprogram",
-            compute_units=ct.ComputeUnit.CPU_ONLY,
-        )
-
-        # rdar://109080828 ([Bug] slice_by_index is throwing exception through E5ML stack) needs to be fixed.
-        # The above radar fixed the CPU case;
-        # the non-CPU case is still failing, which is tracked in rdar://109854221 ([Bug][Regression] slice_by_index is throwing exception through E5ML - Follow up radar)
-        # y_mlprogram = list(model.predict({'x': x}).values())[0]
-        # rdar://102217935 needs to be fixed before mlprogram will pass
-        # np.testing.assert_allclose(y_numpy, y_mlprogram)
-
-    @staticmethod
-    def test_slice_by_index_slice_squeeze_separate():
-        INPUT_SHAPE = (1, 2, 8, 16)
-
-        @mb.program(input_specs=[mb.TensorSpec(shape=INPUT_SHAPE)])
-        def prog(x):
-            x = mb.slice_by_index(
-                x=x,
-                begin=[0, 0, 0, 0],
-                end=[1, 2, 8, 12],
-                stride=[1, 1, 1, 2],
-                begin_mask=None,
-                end_mask=None,
-                squeeze_mask=[True, False, False, False],
-            )
-            return x
-
-        x = np.random.rand(*INPUT_SHAPE)
-
-        # slice by index is x[begin[0]: end[0]: stride[0], begin[1]: end[1]: stride[1], ...]
-        # and squeeze dim 0
-        y_numpy = x[0:1:1, 0:2:1, 0:8:1, 0:12:2]
-        y_numpy = np.squeeze(y_numpy, axis=0)
-
-        model = ct.convert(prog, source="milinternal", convert_to="neuralnetwork")
-        y_neuralnetwork = list(model.predict({'x': x}).values())[0]
-
-        assert y_numpy.shape == y_neuralnetwork.shape
-        np.testing.assert_allclose(y_numpy, y_neuralnetwork)
-
-        model = ct.convert(prog, source="milinternal", convert_to="mlprogram")
-        y_mlprogram = list(model.predict({'x': x}).values())[0]
-        # TODO: rdar://103365766 MLProgram does not apply squeeze_mask.
-        # np.testing.assert_allclose(y_numpy, y_mlprogram)
diff --git a/coremltools/converters/mil/mil/ops/tests/test_utils.py b/coremltools/converters/mil/mil/ops/tests/test_utils.py
index 88d0aa1b6..82e22c743 100644
--- a/coremltools/converters/mil/mil/ops/tests/test_utils.py
+++ b/coremltools/converters/mil/mil/ops/tests/test_utils.py
@@ -4,7 +4,6 @@
 # found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
 
 import numpy as np
-from coremltools.converters.mil import get_new_symbol
 from coremltools.converters.mil.mil.ops.defs._utils import (
     aggregated_pad, effective_kernel, spatial_dimensions_out_shape)
 
@@ -261,15 +260,3 @@ def test_same_padding_shape_dilation_2(self):
         expected = [5, 5]
 
         np.testing.assert_equal(actual, expected)
-
-    def test_symbolic_custom_pad(self):
-        input_shape = (get_new_symbol(), get_new_symbol())
-        actual = spatial_dimensions_out_shape(
-            pad_type="custom",
-            input_shape=input_shape,
-            kernel_shape=(1, 1),
-            strides=(1, 1),
-            dilations=(1, 1),
-            custom_pad=(0, 0, 0, 0),
-        )
-        np.testing.assert_equal(actual, input_shape)
diff --git a/coremltools/converters/mil/mil/ops/tests/testing_utils.py b/coremltools/converters/mil/mil/ops/tests/testing_utils.py
index d898de057..4fa5b93ad 100644
--- a/coremltools/converters/mil/mil/ops/tests/testing_utils.py
+++ b/coremltools/converters/mil/mil/ops/tests/testing_utils.py
@@ -3,14 +3,18 @@
 # Use of this source code is governed by a BSD-3-clause license that can be
 # found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
 
+import functools
 from typing import Dict, List, Optional
 
+import pytest
+
 import coremltools as ct
 from coremltools import _logger as logger
 from coremltools.converters.mil.input_types import TensorType
 from coremltools.converters.mil.mil import Function, Placeholder, Program
 from coremltools.converters.mil.mil.passes.pass_pipeline import PassPipeline
 from coremltools.converters.mil.mil.types.symbolic import is_symbolic
+from coremltools.converters.mil.testing_reqs import BackendConfig
 from coremltools.converters.mil.testing_utils import (
     compare_backend,
     ct_convert,
@@ -21,6 +25,33 @@
 
 UNK_SYM = "s_unk"
 
+
+def mark_api_breaking(breaking_opset_version: ct.target):
+    """
+    This decorator marks an API-breaking point for MIL op unit tests.
+    For instance, if `test_op_1` is supposed to pass from iOS14 -> iOS16 and to break starting from iOS17,
+    we can use the following syntax:
+
+    @mark_api_breaking(breaking_opset_version=ct.target.iOS17)
+    def test_op_1(self, backend, ...):
+        pass
+
+    Note that the test function must take a `backend` argument of type `BackendConfig`.
+    """
+    def decorator(func):
+        @functools.wraps(func)
+        def wrapper(*args, **kwargs):
+            backend = kwargs.get("backend", None)
+            if backend is None:
+                raise ValueError(
+                    f'Function {func} decorated with mark_api_breaking must take "backend" as an input.'
+ ) + if backend.opset_version >= breaking_opset_version: + pytest.skip(f"The test is breaking at opset version {breaking_opset_version}.") + return func(*args, **kwargs) + return wrapper + + return decorator + def run_compare_builder( build, input_placeholders, @@ -29,13 +60,12 @@ def run_compare_builder( expected_outputs=None, compute_unit=ct.ComputeUnit.CPU_ONLY, frontend_only=False, - backend=("neuralnetwork", "fp32"), + backend: Optional[BackendConfig] = None, atol=1e-04, rtol=1e-05, inputs=None, - also_compare_shapes=False, + also_compare_shapes=True, converter=ct.convert, - minimum_deployment_target=None, pass_pipeline: Optional[PassPipeline] = None, ): """ @@ -67,14 +97,21 @@ def run_compare_builder( Reference to convert function to be used. Default: ct.convert - - minimum_deployment_target : coremltools.target enumeration (optional) - A member of the ``coremltools.target`` enum. + - backend: A BackendConfig that specifies the compute backend, precision and minimum_deployment_target Returns: The converted mlmodel """ - if minimum_deployment_target is not None: - validate_minimum_deployment_target(minimum_deployment_target, backend) + if backend is None: + backend = BackendConfig( + backend="neuralnetwork", + precision="fp32", + opset_version=ct.target.iOS14, + ) + minimum_deployment_target = backend.opset_version + backend = (backend.backend, backend.precision) + + validate_minimum_deployment_target(minimum_deployment_target, backend) if not isinstance(expected_output_types, list): expected_output_types = [expected_output_types] diff --git a/coremltools/converters/mil/mil/passes/defs/cleanup/noop_elimination.py b/coremltools/converters/mil/mil/passes/defs/cleanup/noop_elimination.py index 0e9aac551..8aee02bbf 100644 --- a/coremltools/converters/mil/mil/passes/defs/cleanup/noop_elimination.py +++ b/coremltools/converters/mil/mil/passes/defs/cleanup/noop_elimination.py @@ -32,6 +32,7 @@ class noop_elimination(AbstractGraphPass): """ _SUPPORTED_OPS = { + "identity", "add", "mul", "floor_div", @@ -57,7 +58,17 @@ def apply(self, prog): self._noop_elimination_block_wrapper(f) @staticmethod - def _match_pattern(op, block): + def _match_pattern(op): + def remove_identity(op): + if op.enclosing_block.try_replace_uses_of_var_after_op( + anchor_op=op, + old_var=op.outputs[0], + new_var=op.x, + ): + op.enclosing_block.remove_ops([op]) + return True + return False + def _remove_elementwise_binary(op, x, y): # We remove the ops that has op.x == x or op.y == y def has_all_elements_equal_to(var, value): @@ -97,7 +108,7 @@ def has_all_elements_equal_to(var, value): return True return False - def remove_elementwise(op, block): + def remove_elementwise(op): if op.op_type in {"add"}: return _remove_elementwise_binary(op, 0, 0) elif op.op_type in {"mul"}: @@ -109,7 +120,7 @@ def remove_elementwise(op, block): else: return False - def remove_slice_by_index(op, block): + def remove_slice_by_index(op): input_shape = op.x.sym_type output_shape = op.outputs[0].sym_type @@ -133,7 +144,7 @@ def remove_slice_by_index(op, block): return True return False - def remove_same_shape(op, block): + def remove_same_shape(op): input_shape = op.x.sym_type output_shape = op.outputs[0].sym_type @@ -152,7 +163,7 @@ def remove_same_shape(op, block): return True return False - def remove_linear(op, block): + def remove_linear(op): if op.alpha.val != 1 or op.beta.val != 0: return False @@ -168,7 +179,7 @@ def remove_linear(op, block): return True return False - def remove_transpose(op, block): + def remove_transpose(op): perm 
= np.array([p if p >= 0 else p + len(op.perm.val) for p in op.perm.val]) sorted_perm = np.sort(perm) if (perm != sorted_perm).any(): @@ -187,6 +198,7 @@ def remove_transpose(op, block): return False op_to_removal_fn = { + "identity": remove_identity, "add": remove_elementwise, "mul": remove_elementwise, "floor_div": remove_elementwise, @@ -230,9 +242,9 @@ def _noop_elimination_block(block): if len(op.blocks) > 0: continue - remove_fn = noop_elimination._match_pattern(op, block) + remove_fn = noop_elimination._match_pattern(op) if remove_fn is not None: - status = remove_fn(op, block) + status = remove_fn(op) # has to break as the downstream iterator is affected. if status: return status diff --git a/coremltools/converters/mil/mil/passes/defs/cleanup/remove_redundant_ops.py b/coremltools/converters/mil/mil/passes/defs/cleanup/remove_redundant_ops.py index 32e9950ac..19f9cf339 100644 --- a/coremltools/converters/mil/mil/passes/defs/cleanup/remove_redundant_ops.py +++ b/coremltools/converters/mil/mil/passes/defs/cleanup/remove_redundant_ops.py @@ -155,8 +155,11 @@ def _try_to_transform(parent_var): candidate_ops_lists = remove_redundant_ops._get_candidate_ops_lists_from_var(parent_var) block_changed = False for ops_list in candidate_ops_lists: - if remove_redundant_ops._try_to_remove_ops(ops_list): - block_changed = True + # Iterate through the child ops list, to make sure that we check all possible combinations. + for idx in range(len(ops_list)): + if remove_redundant_ops._try_to_remove_ops(ops_list[idx:]): + block_changed = True + break return block_changed @block_context_manager diff --git a/coremltools/converters/mil/mil/passes/defs/optimize_elementwise_binary.py b/coremltools/converters/mil/mil/passes/defs/optimize_elementwise_binary.py index 3cd9fe58a..f0be448e4 100644 --- a/coremltools/converters/mil/mil/passes/defs/optimize_elementwise_binary.py +++ b/coremltools/converters/mil/mil/passes/defs/optimize_elementwise_binary.py @@ -5,8 +5,11 @@ import numpy as np +from coremltools.converters.mil.mil import Block from coremltools.converters.mil.mil import Builder as mb +from coremltools.converters.mil.mil import Operation, Var from coremltools.converters.mil.mil import types as _types +from coremltools.converters.mil.mil.ops.defs._utils import broadcast_shapes from coremltools.converters.mil.mil.passes.graph_pass import AbstractGraphPass from coremltools.converters.mil.mil.passes.helper import block_context_manager from coremltools.converters.mil.mil.passes.pass_registry import register_pass @@ -48,10 +51,173 @@ def _divide_to_multiply_block(self, block): block.remove_ops([op]) +@register_pass(namespace="common") +class select_optimization(AbstractGraphPass): + """ + For ``select(cond, a, b)``, there are 2 cases where we can replace it with a single simpler op + + 1. If ``cond`` is a const scalar (or a const tensor but all elements are the same, which is + equivalent to a scalar), then we replace ``select(cond, a, b)`` with simply ``a`` or ``b`` + + .. code-block:: + + Input graph: + + const(scalar cond) -| + | + a ------------------|-> select -> output + | + b ------------------| + + Output graph: + + if cond: + a -> output + else: + b -> output + + 2. If ``cond`` is a more complicated const, and ``a`` is an inf const, + then we replace ``a`` with ``select(cond, a, 0)``, then return ``a + b`` + + .. 
code-block:: + + Input graph: + + const(cond) -| + | + const(±inf) -|-> select -> output + | + b -----------| + + Output graph: + + select(cond, ±inf, 0) -| + |-> add -> output + b ---------------------| + + Note that ``select(cond, ±inf, 0)`` will further get eliminated by + ``const_elimination``, so in the end the only op left in the graph is ``add`` + + This replacement relies on floating-point arithmetic: + + .. code-block:: + + inf + b = inf + -inf + b = -inf + 0 + b = b + + Note: if ``a`` is not an inf const but ``b`` is, then we swap ``a`` and ``b`` first + """ + + def apply(self, prog): + @block_context_manager + def apply_block(block: Block): + for op in list(block.operations): + for b in op.blocks: + apply_block(b) + + if op.op_type == "select": + self.try_to_transform_select(op) + + for f in prog.functions.values(): + apply_block(f) + + def try_to_transform_select(self, select_op: Operation) -> bool: + assert select_op.op_type == "select" + + cond_val = select_op.cond.val + # this pass only handles const cond + if cond_val is None: + return False + + a_val = select_op.a.val + b_val = select_op.b.val + # if everything is const, then let const_elimination do its job + if a_val is not None and b_val is not None: + return False + + # try case 1: const scalar cond + # (or const tensor cond but all elements are the same, which is equivalent to a scalar) + result_candidate = self.try_to_transform_const_scalar_cond(select_op, cond_val) + if result_candidate is not None and self.try_to_modify_block(select_op, result_candidate): + return True + + # try case 2: complicated const cond + inf const a or b + result_candidate = self.try_to_transform_inf_const_selection( + select_op, cond_val, a_val, b_val + ) + if result_candidate is not None and self.try_to_modify_block(select_op, result_candidate): + return True + + return False + + @staticmethod + def try_to_transform_const_scalar_cond(select_op: Operation, cond_val: np.ndarray) -> Var: + assert select_op.op_type == "select" + assert cond_val is not None + + a = select_op.a + b = select_op.b + + x: Var = None + if np.all(cond_val): + x = mb.identity(x=a, before_op=select_op) + elif np.all(np.logical_not(cond_val)): + x = mb.identity(x=b, before_op=select_op) + else: + return None + + result_shape = broadcast_shapes(a.shape, b.shape) + # cannot simply replace with a or b if broadcasting + if x.shape != result_shape: + return None + + return x + + @staticmethod + def try_to_transform_inf_const_selection( + select_op: Operation, cond_val: np.ndarray, a_val: np.ndarray, b_val: np.ndarray + ) -> Var: + assert select_op.op_type == "select" + assert cond_val is not None + + # check if a or b is inf const + # if a is not but b is, then swap a and b + a: np.ndarray = None + b: Var = None + if a_val is not None and np.all(np.abs(a_val) > 1e38): + a = a_val + b = select_op.b + elif b_val is not None and np.all(np.abs(b_val) > 1e38): + a = b_val + b = select_op.a + cond_val = np.logical_not(cond_val) + else: + return None + + # build add + cond_val, a = np.broadcast_arrays(cond_val, a) + a = a.copy() + a[np.where(np.logical_not(cond_val))] = 0.0 + return mb.add(x=a, y=b, before_op=select_op, name=select_op.outputs[0].name) + + @staticmethod + def try_to_modify_block(select_op: Operation, new_var: Var) -> bool: + block: Block = select_op.enclosing_block + if not block.try_replace_uses_of_var_after_op( + anchor_op=select_op, + old_var=select_op.outputs[0], + new_var=new_var, + ): + return False + block.remove_ops([select_op]) + return True + +
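The two rewrites described in the docstring above can be illustrated with a small self-contained numpy sketch. This is a sketch of the numerics only, assuming float inputs: the actual pass operates on MIL vars and tests magnitude ``> 1e38`` rather than ``np.isinf``.

.. code-block:: python

    import numpy as np

    def rewrite_select(cond: np.ndarray, a: np.ndarray, b: np.ndarray) -> np.ndarray:
        out_shape = np.broadcast_shapes(a.shape, b.shape)
        # Case 1: uniform const cond -- select collapses to a or b,
        # unless the collapse would drop a required broadcast.
        if np.all(cond) and a.shape == out_shape:
            return a
        if not np.any(cond) and b.shape == out_shape:
            return b
        # Case 2: a is an all-inf const -- zero out the unselected entries of a,
        # then one add reproduces the select: inf + b = inf, -inf + b = -inf, 0 + b = b.
        if np.all(np.isinf(a)):
            cond_b, a_b = np.broadcast_arrays(cond, a)
            masked = a_b.copy()
            masked[np.logical_not(cond_b)] = 0.0
            return masked + b
        # Otherwise, keep the select as-is.
        return np.where(cond, a, b)

    cond = np.array([True, False, True])
    a = np.full(3, np.inf, dtype=np.float32)
    b = np.array([1.0, 2.0, 3.0], dtype=np.float32)
    assert np.array_equal(rewrite_select(cond, a, b), np.where(cond, a, b))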
@register_pass(namespace="common") class fuse_elementwise_to_batchnorm(AbstractGraphPass): """ - Fold ``mul`` + ``add`` into a ``batchnorm`` + Fold ``mul`` + ``add`` into a ``batchnorm`` if the ``const`` feeding into the ``mul``/``add`` is of shape ``(1,C,1,1)`` or ``(C,1,1)`` and input to ``mul`` is of rank 4. @@ -193,7 +359,7 @@ class rank0_expand_dims_swap(AbstractGraphPass): should be added after both of the ``rank-0`` tensors, and the final ``expand_dims`` should be removed. If the output var of the binary elementwise op is consumed by more than one op, a ``squeeze`` op is inserted. - + `Input` .. code-block:: diff --git a/coremltools/converters/mil/mil/passes/defs/optimize_quantization.py b/coremltools/converters/mil/mil/passes/defs/optimize_quantization.py index 879618d5b..f26045a0d 100644 --- a/coremltools/converters/mil/mil/passes/defs/optimize_quantization.py +++ b/coremltools/converters/mil/mil/passes/defs/optimize_quantization.py @@ -8,6 +8,7 @@ import numpy as np import coremltools.converters.mil.mil.types as types +from coremltools.converters.mil._deployment_compatibility import AvailableTarget from coremltools.converters.mil.mil import Block from coremltools.converters.mil.mil import Builder as mb from coremltools.converters.mil.mil import Operation, Var @@ -20,6 +21,103 @@ from coremltools.converters.mil.mil.passes.pass_registry import register_pass +@register_pass(namespace="common") +class int_op_canonicalization(AbstractGraphPass): + """ + For general quantized operators, in Core ML, we represent them as + ``dequantize -> the floating-point version of this operator -> quantize``, + because mathematically it is the floating-point tensor rather than + its quantized integer representation that gets operated upon. + + For some quantized operators that do not involve floating-point arithmetic, + however, it is unnecessary to prepend ``dequantize`` and append ``quantize``. + Examples are: + + * reshape + """ + + INT_OP_TYPES_AND_OPSET_VERSIONS = {"reshape": {AvailableTarget.iOS17}} + + def apply(self, prog): + for f in prog.functions.values(): + self._canonicalize_int_ops_block(f) + + @block_context_manager + def _canonicalize_int_ops_block(self, block: Block): + def apply_block(block: Block) -> bool: + for op in list(block.operations): + for b in op.blocks: + self._canonicalize_int_ops_block(b) + + matched_ops = self.match_pattern(op) + if matched_ops is not None: + dequantize, quantize = matched_ops + # has to break as the downstream iterator is affected + if self.try_to_transform(dequantize, op, quantize): + return True + + return False + + need_transformation = True + while need_transformation: + need_transformation = apply_block(block) + + def match_pattern(self, op: Operation) -> Tuple[Operation, Operation]: + if ( + op.op_type not in self.INT_OP_TYPES_AND_OPSET_VERSIONS + or op.opset_version not in self.INT_OP_TYPES_AND_OPSET_VERSIONS[op.op_type] + ): + return None + + # make sure the input is quantized + dequantize = op.x.op + if dequantize is None or dequantize.op_type != "dequantize": + return None + + # make sure the output is quantized + if not _check_child_op_type(op, "quantize"): + return None + quantize = op.outputs[0].child_ops[0] + + # we do not have to check block output, because: + # * for dequantize, it is ok to connect to block output, since our + # transformation method `try_to_transform` is able to deal with that + # * for op, checking child op has made sure it has only 1 child + # and connects to quantize, i.e. 
it cannot connect to block output + + return dequantize, quantize + + def try_to_transform(self, dequantize: Operation, op: Operation, quantize: Operation) -> bool: + block: Block = op.enclosing_block + + if not block.try_replace_uses_of_var_after_op( + anchor_op=quantize, + old_var=quantize.outputs[0], + new_var=self.build_int_op(dequantize, op, quantize), + ): + return False + + # remove op and quantize here, but not dequantize, since: + # * all uses of op and quantize has been replaced with the canonicalized one + # * dequantize may feed to multiple ops, which are not replaced + # (if not, then pass dead_code_elimination will eliminate it) + block.remove_ops([op, quantize]) + + return True + + @staticmethod + def build_int_op(dequantize: Operation, op: Operation, quantize: Operation) -> Var: + if op.op_type == "reshape": + return mb.reshape( + x=dequantize.input, + shape=op.shape, + name=quantize.outputs[0].name, + before_op=op, + ) + + raise NotImplementedError(f"no build method implemented for int op {op.op_type}") + + # TODO (rdar://107718371): remove this pass after implementing QuantizedVar @register_pass(namespace="common") class nullify_redundant_quantization_zero_point(AbstractGraphPass): @@ -85,7 +183,7 @@ def apply(self, prog): self._nullify_redundant_quantization_zero_point_block(f) @block_context_manager - def _nullify_redundant_quantization_zero_point_block(self, block): + def _nullify_redundant_quantization_zero_point_block(self, block: Block): def apply_block(block: Block) -> bool: for op in list(block.operations): for b in op.blocks: diff --git a/coremltools/converters/mil/mil/passes/defs/optimize_repeat_ops.py b/coremltools/converters/mil/mil/passes/defs/optimize_repeat_ops.py index e7f7c95c7..f9aa82805 100644 --- a/coremltools/converters/mil/mil/passes/defs/optimize_repeat_ops.py +++ b/coremltools/converters/mil/mil/passes/defs/optimize_repeat_ops.py @@ -5,16 +5,24 @@ import copy from collections import defaultdict -from typing import List, Text +from typing import List, Text, Tuple import numpy as np from coremltools import _logger as logger +from coremltools.converters.mil.mil import Block from coremltools.converters.mil.mil import Builder as mb +from coremltools.converters.mil.mil import Function, Operation from coremltools.converters.mil.mil.passes.graph_pass import AbstractGraphPass from coremltools.converters.mil.mil.passes.helper import _check_child_op_type, block_context_manager from coremltools.converters.mil.mil.passes.pass_registry import register_pass from coremltools.converters.mil.mil.types.symbolic import any_symbolic +from coremltools.converters.mil.mil.types.type_mapping import ( + RangeTuple, + builtin_to_range, + builtin_to_resolution, + string_to_builtin, +) from coremltools.converters.mil.mil.var import Var @@ -22,7 +30,7 @@ class merge_consecutive_paddings(AbstractGraphPass): """ Identify two consecutive ``pad`` layers which could be merged into a single ``pad`` layer. - + This is possible only if one of the following conditions is satisfied: - The paddings are "constant" and have the same ``constant_val``. @@ -285,19 +293,18 @@ def help_merge_consecutive_reshapes_block(block): while block_changed: block_changed = help_merge_consecutive_reshapes_block(block) - class CastOptimizationNode: def __init__(self, op_type, match_criterion=None): """ Parameters ---------- - + param op_type : Type of an operation. param match_criterion : A callable function that matches a MIL op and returns a boolean. Examples -------- - + .. 
sourcecode:: python CastOptimizationNode("mul"), @@ -315,227 +322,254 @@ def __init__(self, op_type, match_criterion=None): self.match_criterion = match_criterion - @register_pass(namespace="common") class cast_optimization(AbstractGraphPass): """ This optimization pass performs the following: - Removes redundant ``cast`` op; that is, ``cast`` where source and destination tensors have same dtypes. - - Either cancels or fuses any two consecutive `cast` ops, repeatedly. + - Fuses two consecutive `cast` ops if applicable, repeatedly. - After this pass, there can't be any consecutive `cast` ops present in the program. - For examples, see ``TestCastOptimization``. This is a non-algebraic translation which assumes that the upcasting doesn't change the user's intent. - - For example: - .. code-block:: + (1) Example for redundant ``cast`` op removal: + .. code-block:: - Input graph: - input -----> cast(dtype="fp16") -----> cast(dtype="fp32") ----> square ---> out + Input graph: + input(fp16) -> cast(dtype="fp16") -> relu -> out - Output graph: - input -----> square -----> out + Output graph: + input -> relu -> out + + The input and output tensors for the ``cast`` op are both with type of ``fp16``. Hence, it can be removed. + + (2) Example for two ``cast`` ops fusion: + .. code-block:: + + Input graph: + input(int8) -> cast(dtype="fp16") -> cast(dtype="fp32") -> out + + Output graph: + input(int8) -> cast(dtype="fp32") -> out + + The data range and resolution of the above graph are limited by the int8 input, so the fusion is allowed. + + (3) Negative example for two ``cast`` ops fusion: + .. code-block:: + Input graph: + input(fp32) -> cast(dtype="bool") -> cast(dtype="fp16") -> out + + Output graph: + Same as input graph. + + The above two ``cast`` ops cannot be merged, since after the first cast, the resolution of the numerical output + is downcasted to binary (``0, 1``). If we fuse them, the output would be in the range and resolution of ``fp16`` instead. - The input graph has a maximum precision of fp16 while the output graph has fp32 precision. + (3) Another Negative example for two ``cast`` ops fusion: + .. code-block:: + Input graph: + input(int32) -> cast(dtype="int8") -> cast(dtype="uint8") -> out + + Output graph: + Same as input graph. + + The above two ``cast`` ops cannot be merged, since in the original graph, by going through two casts, + the output numerical range is capped to ``[0, 127]``. + + However, if two ``cast`` ops are reduced to 1 ``cast(dtype="uint8")``, the output numerical would in the range of ``[0, 255]``. + The fusion would cause numerical issue for the numbers between ``[128, 255]``, which is prohibited. + + In general, two ``cast`` ops can be merged if the output data range and resolution is not affected. + For more examples, please see the unittests start with prefix "TestCastOptimization" in test_passes.py """ def apply(self, prog): for f in prog.functions.values(): - self._fuse_or_cancel_consecutive_casts_block_wrapper(f, {}) + self._fuse_or_cancel_consecutive_casts_block_wrapper(f) - # main function's output_vars are treated differently, which are not handled by the method - # above, "_fuse_or_cancel_consecutive_casts_block". 
- # For that, we invoke another method - block_changed = True - while block_changed: - block_changed = self._cancel_consecutive_casts_connected_to_outputs( - prog.functions["main"] + def _propagate_range_resolution(self, in_dtype: type, dtype_chain: Tuple[type]): + """ + Given an input type ``in_dtype``, and a chain of casting, return the resulting output data range and resolution. + + For example, ``in_dtype = fp32`` and ``dtype_chain = [int8, int32]``. This means that input data of type ``fp32`` + is propagated through ``cast(dtype="int8")`` and ``cast(dtype="int32")`` in order. + + 1. The input fp32 data range is ``[-3.4e+38, 3.4e+38]`` with resolution ``1e-06``. + 2. After the first ``cast(dtype="int8")`` downcast, the range becomes ``[-128, 127]`` with resolution ``1``. + 3. Even though ``int32`` has a larger range, the resulting range is still capped to ``[-128, 127]``. + + For the above example, this function returns a range of ``[-128, 127]`` and resolution ``1``. + """ + assert isinstance(dtype_chain, tuple) + cur_range, cur_resolution = builtin_to_range(in_dtype), builtin_to_resolution(in_dtype) + for v in dtype_chain: + tmp_range, tmp_resolution = builtin_to_range(v), builtin_to_resolution(v) + cur_range = RangeTuple( + max(cur_range.low, tmp_range.low), min(cur_range.high, tmp_range.high) ) + cur_resolution = max(cur_resolution, tmp_resolution) + return cur_range, cur_resolution + + def _is_cast_ops_fusable(self, cast_1: Operation, cast_2: Operation): + """ + Check if two cast ops can be fused by verifying the consistency between the range and resolution before and after fusion. + + Take the same example shown in ``_propagate_range_resolution``: + + input(fp32) -> cast(dtype="int8") -> cast(dtype="int32") + + The original pattern has output range and resolution ``[-128, 127]``, ``1``. + + However, if the two ``cast`` ops are fused: + + input(fp32) -> cast(dtype="int32") + + The output range becomes the range of int32, which is not ``[-128, 127]``. + As a result, the fusion is prohibited. + """ + x_dtype, cast_1_dtype, cast_2_dtype = ( + cast_1.x.dtype, + string_to_builtin(cast_1.dtype.val), + string_to_builtin(cast_2.dtype.val), + ) + + ref_range, ref_resolution = self._propagate_range_resolution( + x_dtype, (cast_1_dtype, cast_2_dtype) + ) + out_range, out_resolution = self._propagate_range_resolution(x_dtype, (cast_2_dtype,)) - def _match_linear_pattern(self, root, pattern): + return out_range == ref_range and out_resolution == ref_resolution + + def _dup_if_affect_io(self, new_var: Var, old_var: Var, before_op: Operation): """ - Use Depth First Search to match the pattern + We cannot replace old_var with new_var if: + 1. old_var is a function output + 2. new_var is a function input + since the replacement would rename the var and invalidate the function's input / output interface. - :param root: operation - :param pattern: List[CastOptimizationNode] - :return: Return List[operation] if pattern matches entirely else [] + For this special corner case, we use an identity op to duplicate the new_var.
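+ + Example: if ``new_var`` is the function input ``x`` and ``old_var`` is the function output ``y``, replacing ``y`` with ``x`` directly would force renaming ``x`` to ``y``; inserting ``identity(x)`` keeps both interface names intact.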
""" - op = root - if not pattern or len(op.outputs) != 1: - return [] + block_1 = before_op.enclosing_block + is_new_var_function_input = ( + isinstance(block_1, Function) and new_var in block_1.inputs.values() + ) + block_2 = old_var.op.enclosing_block + is_old_var_function_output = isinstance(block_2, Function) and old_var in block_2.outputs - node = pattern[0] - if op.op_type != node.op_type: - return [] + if is_new_var_function_input and is_old_var_function_output: + return mb.identity(x=new_var, before_op=before_op) + return new_var - if not node.match_criterion(op): - return [] + def _fuse_cast_ops(self, cast_ops: List[Operation], reuse_input_var: bool = False): + """ + Fuse the pattern of: + input -> cast_1(dtype=dtype_1) -> cast_2(dtype=dtype_2) -> out - for child in op.outputs[0].child_ops: - op_list = [op] + self._match_linear_pattern(child, pattern[1:]) - if len(op_list) == len(pattern): - return op_list + If ``reuse_input_var = True``, the pattern is reduced to: + input -> out - return [] + otherwise, a new ``cast`` op with the same ``dtype`` as ``cast_2`` is created: + input -> cast_3(dtype=dtype_2) -> out + """ + if not isinstance(cast_ops[0], tuple): + cast_ops = tuple((cast_ops,)) - def _try_to_transform(self, root_op, cached_vars): + ops_to_remove = [] + + for cast_1, cast_2 in cast_ops: + if reuse_input_var: + new_output_var = self._dup_if_affect_io(cast_1.x, cast_2.outputs[0], cast_1) + else: + fused_output_var_name = cast_1.x.name + "_to_{}".format(cast_2.dtype.val) + new_output_var = mb.cast( + x=cast_1.x, + dtype=cast_2.dtype, + name=fused_output_var_name, + before_op=cast_2, + ) + # It's important to use `cast_2.enclosing_block` since `cast_2` might be present in a block nested under `cast_1.enclosing_block` + cast_2.enclosing_block.replace_uses_of_var_after_op( + anchor_op=cast_2, + old_var=cast_2.outputs[0], + new_var=new_output_var, + ) + # Remove just the last cast op and let dce eliminate the rest of the ops if needed, + # The reason is that first cast op could be feeding into other non-cast ops. + ops_to_remove.append(cast_2) + + ops_to_remove[0].enclosing_block.remove_ops(ops_to_remove) + + def _try_to_transform(self, root_op, cast_ops_across_blocks): block = root_op.enclosing_block # Scenario: Redundant cast when source and destination dtype are same. 
if root_op.op_type == "cast" and root_op.x.is_tensor_or_scalar_of(dtype=root_op.dtype.val): + new_var = root_op.x + old_var = root_op.outputs[0] + new_var = self._dup_if_affect_io(root_op.x, old_var, root_op) block.replace_uses_of_var_after_op( anchor_op=root_op, - old_var=root_op.outputs[0], - new_var=root_op.x, + old_var=old_var, + new_var=new_var, ) block.remove_ops([root_op]) return True # Scenario: Consecutive casts - list_of_ops_in_pattern = self._match_linear_pattern( - root_op, - [ - CastOptimizationNode("cast"), - CastOptimizationNode("cast"), - ], - ) - - if not list_of_ops_in_pattern: - return False - - cast_1, cast_2 = list_of_ops_in_pattern - - fused_output_var_name = cast_1.x.name + "_to_{}".format(cast_2.dtype.val) + candidate_child_ops = [] + for op in root_op.outputs[0].child_ops: + if op.op_type == "cast": + candidate_child_ops.append(op) + + fusion_happens = False + for child_op in candidate_child_ops: + if not self._is_cast_ops_fusable(root_op, child_op): + continue - if cast_1.x.is_tensor_or_scalar_of(dtype=cast_2.dtype.val): - # when consecutive casts cancel each other - # Please check out: test_linear_consecutive_cast_ops_cancellation in TestCastOptimization - new_output_var = cast_1.x - elif fused_output_var_name in cached_vars: - # When the output of 1 cast goes into multiple casts of same configuration - # Please check out: test_consecutive_fusable_casts_on_all_branches in TestCastOptimization - new_output_var = cached_vars[fused_output_var_name] - else: - new_output_var = mb.cast( - x=cast_1.x, - dtype=cast_2.dtype, - name=fused_output_var_name, - before_op=cast_2, - ) - cached_vars[fused_output_var_name] = new_output_var - - # It's important to use `cast_2.enclosing_block` over `block` since `cast_2` might be present in - # a block nested under `block` - cast_2.enclosing_block.replace_uses_of_var_after_op( - anchor_op=cast_2, - old_var=cast_2.outputs[0], - new_var=new_output_var, - ) + if root_op.x.is_tensor_or_scalar_of(dtype=child_op.dtype.val): + # when consecutive casts cancel each other + # Please check out: test_linear_consecutive_cast_ops_cancellation in TestCastOptimization + self._fuse_cast_ops((root_op, child_op), reuse_input_var=True) + fusion_happens = True + else: + if child_op.enclosing_block != block: + # If cast_2 is in an inner block, we handle it at once in a seperated function `_fuse_casts_ops_across_blocks` + cast_ops_across_blocks[child_op.enclosing_block].add((root_op, child_op)) + continue + self._fuse_cast_ops((root_op, child_op)) + fusion_happens = True + return fusion_happens - # Remove just the last cast op and let dce eliminate the rest of the ops if needed, - # The reason is that first cast op could be feeding into other non-cast ops. - cast_2.enclosing_block.remove_ops([cast_2]) - return True + @block_context_manager + def _fuse_casts_ops_across_blocks(self, block: Block, ops_to_fused: Tuple[Operation]): + self._fuse_cast_ops(ops_to_fused) @block_context_manager - def _fuse_or_cancel_consecutive_casts_block_wrapper(self, block, cached_vars): - def _fuse_or_cancel_consecutive_casts_block(block, cached_vars): - block_changed = False + def _fuse_or_cancel_consecutive_casts_block_wrapper(self, block): + def _fuse_or_cancel_consecutive_casts_block(block, cast_ops_across_blocks): + # We first make sure all the inner blocks are optimized + # It is important to do it seperately in the very beginning, to ensure the last step of optimization cast ops across the block boundary is correct. 
for i, op in enumerate(list(block.operations)): for b in op.blocks: - nested_block_changed = True - nested_block_cached_vars = {} - nested_block_cached_vars.update(cached_vars) - self._fuse_or_cancel_consecutive_casts_block_wrapper( - b, nested_block_cached_vars - ) - - if len(op.blocks) > 0: - continue + self._fuse_or_cancel_consecutive_casts_block_wrapper(b) + for i, op in enumerate(list(block.operations)): # start pattern match if cast op is encountered if op.op_type == "cast": - block_changed = self._try_to_transform(op, cached_vars) - # has to break as the downstream iterator is affected. - if block_changed: - return block_changed - return block_changed + if self._try_to_transform(op, cast_ops_across_blocks): + # has to break as the downstream iterator is affected. + return True + return False block_changed = True - """ - Cached vars are used when `all` of the following conditions are met: - - 1. The output of a ``cast`` is fed into multiple ``cast`` ops of same configuration. - 2. These 2 consecutive ``cast`` ops can be fused into a single ``cast``. - - When these conditions are satisfied, we create a `new` fused ``cast`` op `only` once, and - the output of all these consecutive ``cast`` ops are replaced with the ouptut of this fused ``cast``. - - .. code-block:: - - Input graph: - |---->cast(dtype="fp16")---->square--->out_1 - | - input---->cast(dtype="int32")---->cast(dtype="fp16")---->relu--->out_2 - | - |---->cast(dtype="fp16")---->log--->out_3 - - Output graph: - |---->square--->out_1 - | - input---->new_fused_cast(dtype="fp16")---->relu--->out_2 - | - |---->log--->out_3 - - """ + cast_ops_across_blocks = defaultdict(set) while block_changed: - block_changed = _fuse_or_cancel_consecutive_casts_block(block, cached_vars) - - @staticmethod - def _cancel_consecutive_casts_connected_to_outputs(block): - """ - Lets say the ops in the block have the following pattern - "some_op"---->{var1}---->"cast_op1"---->"cast_op2"--->{var2} - , where var2 is one of the outputs in block.outputs - - If cast_op1 and cast_op2 can be cancelled, this means, var1 and var2 are duplicates - of each other. 
The program can then be updated to - "some_op"---->{var1} - where var1 replaces var2 in block.outputs - This also requires replacing var1's name with var2's so that the model output names remain unchanged - """ - new_output_vars = [] - block_changed = False - for output_var in block.outputs: - cast_op2 = output_var.op - if cast_op2 is None: - continue - if cast_op2.op_type != "cast": - new_output_vars.append(output_var) - continue - cast_op1 = cast_op2.x.op - if cast_op1 is None: - new_output_vars.append(output_var) - continue - if cast_op1.op_type != "cast": - new_output_vars.append(output_var) - continue - var1 = cast_op1.x - if var1.op is None or var1.dtype != output_var.dtype: - new_output_vars.append(output_var) - continue - var1.set_name(output_var.name) - new_output_vars.append(var1) - block_changed = True - - if block_changed: - block.set_outputs(new_output_vars) - - return block_changed + block_changed = _fuse_or_cancel_consecutive_casts_block(block, cast_ops_across_blocks) + # fuse the cast ops across the inner / outer block boundary + for k, v in cast_ops_across_blocks.items(): + self._fuse_casts_ops_across_blocks(k, tuple(v)) class TransformAxisUpdateOps: """ @@ -1628,21 +1662,21 @@ class reduce_transposes(AbstractGraphPass): Output graph: input -----> identity -----> out - + # Example 2 Input graph: input---->transpose(axis=[0,3,1,2])---->relu---->transpose(axis=[0,2,3,1])--->out Output graph: input----->relu----->out - + # Example 3 Input graph: input(shape=10,2,3,5)--->transpose(axis=[0,2,3,1])----->relu---->pool----->out1 | | --->relu----->log---->transpose(axis=[0,3,1,2])---->out2 - + Output graph: input(shape=10,2,3,5)----->relu---->transpose(axis=[0,2,3,1])---->pool----->out1 | @@ -1707,7 +1741,7 @@ class reduce_transposes(AbstractGraphPass): To resolve this, we recognize that nodes consisting of sets ``(a)`` and ``(b)`` form a bipartitle graph, where, ``(a) ==`` starting ``transpose`` ops (originators of ``_LazyTransposeHypotheticalValue``) and ``(b) ==`` set of ``transpose`` ``cancel`` ops and ``materialize`` ops. - + - In this bipartite graph, we find all the connected components for each connected component. Either the entire set of ``transpose`` ops in it are removed/materialized, or none of them are touched. diff --git a/coremltools/converters/mil/mil/passes/defs/preprocess.py b/coremltools/converters/mil/mil/passes/defs/preprocess.py index 6add95b64..3f9ea7b1a 100644 --- a/coremltools/converters/mil/mil/passes/defs/preprocess.py +++ b/coremltools/converters/mil/mil/passes/defs/preprocess.py @@ -7,6 +7,7 @@ import warnings from collections import OrderedDict +from coremltools import _logger as logger from coremltools.converters.mil.input_types import EnumeratedShapes, ImageType, Shape from coremltools.converters.mil.mil import Builder as mb from coremltools.converters.mil.mil import Function, types @@ -117,6 +118,10 @@ def __init__(self, prefix=None): self.all_names = set() self.prefix = "_" if prefix is None else prefix + @staticmethod + def _replace_invalid_char_with_underscore(name): + return re.sub("[^a-zA-Z0-9_]", "_", name) + def sanitize_name(self, name): """ Sanitize the input string and return it back. 
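The helper factored out above is a plain regex substitution; a minimal standalone equivalent (the regex is the same one used by ``_replace_invalid_char_with_underscore``):

.. code-block:: python

    import re

    def replace_invalid_char_with_underscore(name: str) -> str:
        # Any character outside [a-zA-Z0-9_] becomes an underscore.
        return re.sub("[^a-zA-Z0-9_]", "_", name)

    assert replace_invalid_char_with_underscore("conv/features:0") == "conv_features_0"
    assert replace_invalid_char_with_underscore("already_valid_1") == "already_valid_1"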
@@ -136,7 +141,7 @@ def sanitize_name(self, name): """ # replace any character that is not [a-zA-Z0-9_] with an underscore - new_name = re.sub("[^a-zA-Z0-9_]", "_", name) + new_name = self._replace_invalid_char_with_underscore(name) # now check if the name starts with anything but [A-Za-z_] # if so, then add the prefix @@ -325,6 +330,7 @@ def apply(self, prog): user_provided_output_types = prog.main_output_types main_func = prog.functions["main"] output_vars = main_func.outputs + input_vars = list(main_func.inputs.values()) if user_provided_output_types is None or len(user_provided_output_types) == 0: return if len(output_vars) != len(user_provided_output_types): @@ -347,6 +353,15 @@ def apply(self, prog): ): # no need to update the output var's dtype in this case new_outputs.append(output_var) + elif output_var in input_vars: + # Handle the rare special case where a program input is also an output. + # In that case we leave the var untouched and only log a warning message. + new_outputs.append(output_var) + logger.warning( + f"Output var '{output_var.name}' is also an input var, hence the " + f"dtype cannot be changed: output var '{output_var.name}' remains " + f"dtype {types.builtin_to_string(output_var.dtype)}" + ) else: output_var_name = output_var.name output_var.set_name( diff --git a/coremltools/converters/mil/mil/passes/defs/quantization.py b/coremltools/converters/mil/mil/passes/defs/quantization.py index 6253f28b3..4ddea93a6 100644 --- a/coremltools/converters/mil/mil/passes/defs/quantization.py +++ b/coremltools/converters/mil/mil/passes/defs/quantization.py @@ -135,6 +135,7 @@ class FP16ComputePrecision(AbstractQuantizationPass): "sigmoid_hard", "softplus_parametric", } + _ELEMENTWISE_UNARY_EPSILON_OPS: Set[str] = {"inverse", "log", "rsqrt"} def __init__(self, op_selector=None): super(FP16ComputePrecision, self).__init__(op_selector=op_selector) @@ -183,10 +184,6 @@ def is_valid_op(self, op: Operation) -> bool: ]: return False - # TODO: Remove after supporting IOS17 FP16 RNN Ops (rdar://108143371) - if op.op_type in ["gru", "rnn", "lstm"]: - return False - if self.fp16_overflow(op): return False @@ -207,6 +204,10 @@ def should_cast_parameter(self, op: Operation, param_name: str) -> bool: if op.op_type in self._ACTIVATION_ALPHA_BETA_OPS and param_name in {"alpha", "beta"}: return False + # Element-wise unary ops with epsilon also support mixed precision. + if op.op_type in self._ELEMENTWISE_UNARY_EPSILON_OPS and param_name == "epsilon": + return False + return True def _check_underflow_to_zero(self, new_var, var): diff --git a/coremltools/converters/mil/mil/passes/graph_pass.md b/coremltools/converters/mil/mil/passes/graph_pass.md index 5bfe6b42d..886264cfe 100644 --- a/coremltools/converters/mil/mil/passes/graph_pass.md +++ b/coremltools/converters/mil/mil/passes/graph_pass.md @@ -30,7 +30,7 @@ In addition to the using default setting, you can: * Decide which passes and the order of passes to run (see [Specify Passes To Run](#specify-passes-to-run)). For example, - - Switching off certain fusions to correctly export Phoenix optimized models for palettization. + - Switching off certain fusions to correctly export optimized models from coremltools.optimize.torch for palettization. - Skipping all passes to keep the MIL Program untouched. * Set options for a specific pass to control the behaviour (see [Set Pass Option](#set-pass-option)).
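A sketch of those two knobs in practice, as described in the bullets above (``traced_model`` and ``inputs`` are assumed to be a source model and input descriptions already accepted by ``ct.convert``; the pass and option names here are illustrative):

.. code-block:: python

    import coremltools as ct

    # Start from the converter's default pipeline.
    pipeline = ct.PassPipeline.DEFAULT
    # Switch off a specific fusion pass.
    pipeline.remove_passes({"common::fuse_elementwise_to_batchnorm"})
    # Set an option on a specific pass.
    pipeline.set_options("common::const_elimination", {"skip_const_by_size": "1e6"})
    mlmodel = ct.convert(traced_model, inputs=inputs, pass_pipeline=pipeline)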
For example, @@ -128,7 +128,7 @@ ct.convert(model, pass_pipeline=pipeline) ## Define Custom Graph Pass -If the currently available +If the currently available [MIL Graph Passes in the coremltools API Reference](https://apple.github.io/coremltools/source/coremltools.converters.mil.mil.passes.defs.html#mil-graph-passes) do not meet your goal, you can define custom graph passes. To illustrate how to define a custom graph pass, the following example demonstrates merging consecutive `relu` ops using a PyTorch model with 2 `relu` layers. You can directly convert this model using the following script: @@ -306,5 +306,3 @@ Using the `block_context_manager` decorator is highly recommended, especially wh original function involves calling `with block` multiple times. However, you may want to avoid recursively calling the function decorated with `block_context_manager`, since it involves expensive `_propagate_nonreplaceable_vars()`. For details about how to use a `_noop_elimination_block_wrapper` to avoid that recursive calling, see [noop_elimination](https://apple.github.io/coremltools/source/coremltools.converters.mil.mil.passes.defs.html#coremltools.converters.mil.mil.passes.defs.cleanup.noop_elimination). - - diff --git a/coremltools/converters/mil/mil/passes/pass_pipeline.py b/coremltools/converters/mil/mil/passes/pass_pipeline.py index 687930038..11fe8187a 100644 --- a/coremltools/converters/mil/mil/passes/pass_pipeline.py +++ b/coremltools/converters/mil/mil/passes/pass_pipeline.py @@ -21,9 +21,10 @@ "common::update_output_dtypes", "common::cast_optimization", "common::noop_elimination", - # quantization pass 1: canonicalize zero point + # quantization pass 1: canonicalizations # always start quantization passes with canonicalizations - "common::nullify_redundant_quantization_zero_point", + "common::int_op_canonicalization", # ops that support int do not need dequantize -> op -> quantize sandwich + "common::nullify_redundant_quantization_zero_point", # canonicalize zero point # quantization pass 2: remove redundancy # remove redundancy after canonicalization but before anything else "common::dequantize_quantize_pair_elimination", @@ -36,6 +37,7 @@ "common::const_elimination", "common::sanitize_input_output_names", "common::divide_to_multiply", + "common::select_optimization", "common::add_conv_transpose_output_shape", "common::const_elimination", "common::const_deduplication", # after all consts have been settled @@ -355,18 +357,18 @@ def validate(self): f"pipeline: {self._pass_names}" ) - @staticmethod - def get_pipeline(pipeline_name: Text) -> PassPipeline: + @classmethod + def get_pipeline(cls, pipeline_name: Text) -> PassPipeline: """ Gets a pipeline based on the name. Raises an error if no pipeline is found. Available Pipelines are defined in _PIPELINE_NAME_TO_PASSES """ - if pipeline_name not in PassPipeline._PIPELINE_NAME_TO_PASSES: + if pipeline_name not in cls._PIPELINE_NAME_TO_PASSES: raise ValueError( f"There is no pipeline for `{pipeline_name}`. 
" - f"Available pipelines: {PassPipeline._PIPELINE_NAME_TO_PASSES.keys()}" + f"Available pipelines: {cls._PIPELINE_NAME_TO_PASSES.keys()}" ) - return PassPipeline(PassPipeline._PIPELINE_NAME_TO_PASSES[pipeline_name], pipeline_name) + return PassPipeline(cls._PIPELINE_NAME_TO_PASSES[pipeline_name], pipeline_name) """ ======================================= @@ -381,19 +383,19 @@ def EMPTY(cls) -> PassPipeline: @_classproperty def DEFAULT(cls) -> PassPipeline: """Creates a pipeline that the converter uses by default.""" - return PassPipeline.get_pipeline("default") + return cls.get_pipeline("default") @_classproperty def CLEANUP(cls) -> PassPipeline: """Create a pipeline that contains cleanup passes.""" - return PassPipeline.get_pipeline("cleanup") + return cls.get_pipeline("cleanup") @_classproperty def DEFAULT_PALETTIZATION(cls) -> PassPipeline: """Create a default palettization pipeline to convert a compressed source model""" # We use delayed import to avoid circular import from coremltools.optimize.coreml import OpPalettizerConfig, OptimizationConfig - pipeline = PassPipeline.get_pipeline("default_palettization") + pipeline = cls.get_pipeline("default_palettization") # set default palettization config = OptimizationConfig(global_config=OpPalettizerConfig(mode="unique")) @@ -405,12 +407,12 @@ def DEFAULT_PRUNING(cls) -> PassPipeline: """Create a default sparsification pipeline to convert a compressed source model""" # We use delayed import to avoid circular import from coremltools.optimize.coreml import OpThresholdPrunerConfig, OptimizationConfig - pipeline = PassPipeline.get_pipeline("default_sparsification") + pipeline = cls.get_pipeline("default_sparsification") # set default sparsification config = OptimizationConfig( global_config=OpThresholdPrunerConfig( - threshold=1e-3, + threshold=1e-12, ) ) pipeline.set_options("compression::prune_weights", {"config": config}) diff --git a/coremltools/converters/mil/mil/passes/tests/test_passes.py b/coremltools/converters/mil/mil/passes/tests/test_passes.py index 4630be19c..3debe4e30 100644 --- a/coremltools/converters/mil/mil/passes/tests/test_passes.py +++ b/coremltools/converters/mil/mil/passes/tests/test_passes.py @@ -12,16 +12,20 @@ from mock import patch import coremltools as ct +import coremltools.optimize as cto from coremltools._deps import _IS_MACOS from coremltools.converters.mil.experimental.passes.generic_pass_infrastructure import ( register_generic_pass, ) from coremltools.converters.mil.mil import Builder as mb from coremltools.converters.mil.mil import Function, Program, Symbol, get_new_symbol, types +from coremltools.converters.mil.mil.ops.defs.iOS15.elementwise_unary import cast as _cast_iOS14 +from coremltools.converters.mil.mil.ops.defs.iOS17.elementwise_unary import cast as _cast_iOS17 from coremltools.converters.mil.mil.passes.defs.cleanup import topological_reorder from coremltools.converters.mil.mil.passes.helper import _check_var_scalar_value from coremltools.converters.mil.mil.passes.pass_registry import PASS_REGISTRY from coremltools.converters.mil.mil.types import numpy_type_to_builtin_type +from coremltools.converters.mil.mil.types.type_mapping import builtin_to_string from coremltools.converters.mil.testing_reqs import backends from coremltools.converters.mil.testing_utils import ( apply_pass_and_basic_check, @@ -29,12 +33,11 @@ assert_op_count_match, assert_same_output_names, get_op_names_in_program, + get_op_types_in_block, get_op_types_in_program, ) from coremltools.models.utils import _macos_version -import 
coremltools.optimize as cto - np.random.seed(1984) _VALIDATE_MODEL = True @@ -386,7 +389,7 @@ def prog(x): x = mb.cast(x=x, dtype="fp16", name="castop") x = mb.cast(x=x, dtype="fp16", name="castop") x = mb.cast(x=x, dtype="int32", name="castop_2") - x = mb.cast(x=x, dtype="int64", name="castop") + x = mb.cast(x=x, dtype="fp16", name="castop") x = mb.cast(x=x, dtype="fp32", name="castop_2") x = mb.square(x=x, name="square") return x @@ -497,6 +500,44 @@ def prog(x): class TestNoopElimination: + @pytest.mark.parametrize("is_block_output", (True, False)) + def test_identity(self, is_block_output): + """ + Input graph: + + input -> identity -> (add 1.0 if not is_block_output) -> output + + Output graph: + + if is_block_output: + input -> identity -> output + else: + input -> add 1.0 -> output + """ + SHAPE = (2, 3) + + @mb.program(input_specs=[mb.TensorSpec(shape=SHAPE)]) + def prog(x): + y = mb.identity(x=x) + if not is_block_output: + y = mb.add(x=y, y=1.0) + return y + + prev_prog, _, block = apply_pass_and_basic_check(prog, "common::noop_elimination") + if is_block_output: + assert get_op_types_in_program(prev_prog) == ["identity"] + assert get_op_types_in_program(prog) == ["identity"] + else: + assert get_op_types_in_program(prev_prog) == ["identity", "add"] + assert get_op_types_in_program(prog) == ["add"] + + output_name = block.outputs[0].name + assert_model_is_valid( + prog, + {"x": SHAPE}, + expected_output_shapes={output_name: SHAPE}, + ) + @pytest.mark.parametrize( "op_type, pos, val", itertools.product( @@ -1141,7 +1182,6 @@ def test_redundant_ops_just_after_input_valid_pattern_1(self): | | |-------------------- """ - @mb.program(input_specs=[mb.TensorSpec(shape=(2, 3, 5))]) def prog(x): x1 = mb.transpose(x=x, perm=[0, 2, 1]) @@ -1210,6 +1250,47 @@ def prog(x): expected_output_shapes={block.outputs[0].name: (2, 3, 5)}, ) + def test_redundant_ops_just_after_input_valid_pattern_3(self): + """ + Input graph: + input----->leaky_relu(alpha=0.4)--->add---> add ---> out + | ^ ^ + | | | + |----->leaky_relu(alpha=0.3)--- | + | | + | | + |---->leaky_relu(alpha=0.3)------------ + + Output graph: + input----->leaky_relu(alpha=0.4)--->add---> add ---> out + | ^ ^ + | | | + |----->leaky_relu(alpha=0.3)---------- + """ + + @mb.program(input_specs=[mb.TensorSpec(shape=(2, 3, 5))]) + def prog(x): + x1 = mb.leaky_relu(x=x, alpha=0.4) + x2 = mb.leaky_relu(x=x, alpha=0.3) + x3 = mb.leaky_relu(x=x, alpha=0.3) + z = mb.add(x=x1, y=x2) + z = mb.add(x=z, y=x3) + return z + + prev_prog, _, block = apply_pass_and_basic_check(prog, "common::remove_redundant_ops") + assert get_op_types_in_program(prev_prog) == [ + "leaky_relu", + "leaky_relu", + "leaky_relu", + "add", + "add", + ] + assert get_op_types_in_program(prog) == ["leaky_relu", "leaky_relu", "add", "add"] + + leaky_relu_ops = block.find_ops(op_type="leaky_relu") + assert leaky_relu_ops[0].alpha.val == np.float32(0.4) + assert leaky_relu_ops[1].alpha.val == np.float32(0.3) + def test_redundant_ops_just_after_input_invalid_pattern_1(self): """ input----->transpose(perm=[0, 2, 1])---> reshape(shape=[-1]) -----> add ---> out @@ -3501,80 +3582,319 @@ def prog(x): backend=backend, ) +class TestCastOptimizationRedundantCastRemoval: + """ + Test single cast op removal.
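+ A cast op is redundant when its input dtype already equals the requested output dtype.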
+ """ + def test_remove_redundant_cast_smoke(self): + """ + Input graph: + input(fp32) -> cast(dtype=fp32) -> output -class TestCastOptimization: - """Test the cast optimization pass.""" + Output graph: + input -> output + """ - """ - Input graph: - input -----> cast(dtype="fp32") -----> square -----> cast(dtype="fp32") ---> out + @mb.program(input_specs=[mb.TensorSpec(shape=(1,), dtype=types.fp32)]) + def prog(x): + x = mb.cast(x=x, dtype="fp32") + return x - Output graph: - input -----> square -----> out + assert get_op_types_in_program(prog) == ["cast"] + + _, _, block = apply_pass_and_basic_check(prog, "common::cast_optimization") + + assert len(block.find_ops(op_type="cast")) == 0 + assert block.outputs[0].dtype == types.fp32 + + def test_remove_redundant_cast_negative_smoke(self): + """ + Input graph: + input(fp32) -> cast(dtype=fp16) -> output + + Output graph: + input -> cast -> output + """ + + @mb.program(input_specs=[mb.TensorSpec(shape=(1,), dtype=types.fp32)]) + def prog(x): + x = mb.cast(x=x, dtype="fp16") + return x + + assert get_op_types_in_program(prog) == ["cast"] + + _, _, block = apply_pass_and_basic_check(prog, "common::cast_optimization") + + assert len(block.find_ops(op_type="cast")) == 1 + assert block.outputs[0].dtype == types.fp16 + + @pytest.mark.parametrize( + "opset_version", + [ct.target.iOS14, ct.target.iOS17], + ) + def test_remove_redundant_cast_stress(self, opset_version): + """ + Test all possible dtype combination for each iOS version of cast. + + Input graph: + input(dtype=dtype_a) -> cast(dtype=dtype_b) -> out + + Output graph: + if dtype_a == dtype_b, the cast op can be eliminated + input -> out + + if dtype_a != dtype_b, the cast op should be preserved + input -> cast -> out + """ + + def _test_cast_op_cancellation(dtype_a, dtype_b): + @mb.program( + input_specs=[mb.TensorSpec(shape=(1,), dtype=dtype_a)], opset_version=opset_version + ) + def prog(x): + x = mb.cast(x=x, dtype=builtin_to_string(dtype_b)) + return x + + assert get_op_types_in_program(prog) == ["cast"] + + _, _, block = apply_pass_and_basic_check(prog, "common::cast_optimization") + cast_ops = block.find_ops(op_type="cast") + if dtype_a == dtype_b: + assert len(cast_ops) == 0 + else: + assert len(cast_ops) == 1 + assert block.outputs[0].dtype == dtype_b + + opset_version_to_cast_op = { + ct.target.iOS14: _cast_iOS14, + ct.target.iOS17: _cast_iOS17, + } + cast_op = opset_version_to_cast_op[opset_version] + for dtype_a in cast_op.type_domains["T"]: + for dtype_b in cast_op.type_domains["T"]: + _test_cast_op_cancellation(dtype_a, dtype_b) + + +class TestCastOptimizationCastFusion: + """ + Test consecutive cast ops funsion """ + def test_cast_ops_fusion_smoke(self): + """ + Input graph: + input(fp16) --> cast(dtype="fp32") --> cast(dtype="fp16") --> out - def test_remove_redundant_casts(self): - @mb.program(input_specs=[mb.TensorSpec(shape=(10, 20))]) + Output graph: + input --> identity --> out + + This pattern should be fused, since it doesn't affect the computation precision + """ + + @mb.program(input_specs=[mb.TensorSpec(shape=(1,), dtype=types.fp16)]) def prog(x): x = mb.cast(x=x, dtype="fp32") - x = mb.square(x=x) + x = mb.cast(x=x, dtype="fp16") + return x + + apply_pass_and_basic_check(prog, "common::cast_optimization") + _, _, block = apply_pass_and_basic_check(prog, "common::dead_code_elimination") + assert get_op_types_in_program(prog) == ["identity"] + assert block.outputs[0].dtype == types.fp16 + + def test_cast_ops_fusion_smoke_2(self): + """ + Input graph: + input(int8) --> 
cast(dtype="fp16") --> cast(dtype="fp32") --> out + + Output graph: + input --> cast(dtype="fp32") --> out + + This pattern should be fused, since it doesn't affect the computation precision, given that the precision is limited by the program int8 input. + """ + + @mb.program( + input_specs=[mb.TensorSpec(shape=(1,), dtype=types.int8)], opset_version=ct.target.iOS17 + ) + def prog(x): + x = mb.cast(x=x, dtype="fp16") x = mb.cast(x=x, dtype="fp32") return x - assert get_op_types_in_program(prog) == ["cast", "square", "cast"] + apply_pass_and_basic_check(prog, "common::cast_optimization") + _, _, block = apply_pass_and_basic_check(prog, "common::dead_code_elimination") + + assert get_op_types_in_program(prog) == ["cast"] + assert block.find_ops(op_type="cast")[0].outputs[0].dtype == types.fp32 + assert block.outputs[0].dtype == types.fp32 + + def test_cast_ops_fusion_smoke_3(self): + """ + Input graph: + input(fp32) --> cast(dtype="fp16") --> cast(dtype="fp16") --> out + + Output graph: + input --> cast(dtype="fp16") --> out + + Two identical cast ops can be fused into one. + """ + + @mb.program(input_specs=[mb.TensorSpec(shape=(1,), dtype=types.fp32)]) + def prog(x): + x = mb.cast(x=x, dtype="fp16") + x = mb.cast(x=x, dtype="fp16") + return x apply_pass_and_basic_check(prog, "common::cast_optimization") _, _, block = apply_pass_and_basic_check(prog, "common::dead_code_elimination") - assert get_op_types_in_program(prog) == ["square"] + assert get_op_types_in_program(prog) == ["cast"] + assert block.find_ops(op_type="cast")[0].outputs[0].dtype == types.fp16 + assert block.outputs[0].dtype == types.fp16 - assert_model_is_valid( - prog, - {"x": (10, 20)}, - expected_output_shapes={block.outputs[0].name: (10, 20)}, + def test_cast_ops_fusion_smoke_4(self): + """ + Input graph: + input(int8) --> cast(dtype="fp32") --> cast(dtype="int8") --> out + + Output graph: + input --> identity --> out + + There will be two staged of optimization: + 1. cast(dtype=fp32) + cast(dtype=int8) fused into a single cast(dtype=int8) + 2. cast(dtype=int8) is further removed + """ + + @mb.program( + input_specs=[mb.TensorSpec(shape=(1,), dtype=types.int8)], opset_version=ct.target.iOS17 ) + def prog(x): + x = mb.cast(x=x, dtype="fp32") + x = mb.cast(x=x, dtype="int8") + return x - """ - Input graph: - input -----> cast(dtype="fp16") -----> cast(dtype="fp32") ----> square ---> out + apply_pass_and_basic_check(prog, "common::cast_optimization") + _, _, block = apply_pass_and_basic_check(prog, "common::dead_code_elimination") - Output graph: - input -----> square -----> out - """ + assert get_op_types_in_program(prog) == ["identity"] + assert block.outputs[0].dtype == types.int8 - def test_linear_consecutive_cast_ops_cancellation(self): - @mb.program(input_specs=[mb.TensorSpec(shape=(10, 20))]) + def test_cast_ops_fusion_negative_smoke(self): + """ + Input graph: + input(fp32) --> cast(dtype="fp16") --> cast(dtype="fp32") --> out + + Output graph: + input --> cast --> cast --> out + + This pattern should not be fused, since the precision is lowered. 
+ """ + + @mb.program(input_specs=[mb.TensorSpec(shape=(1,), dtype=types.fp32)]) def prog(x): x = mb.cast(x=x, dtype="fp16") x = mb.cast(x=x, dtype="fp32") - x = mb.square(x=x) return x - assert get_op_types_in_program(prog) == ["cast", "cast", "square"] - apply_pass_and_basic_check(prog, "common::cast_optimization") _, _, block = apply_pass_and_basic_check(prog, "common::dead_code_elimination") - assert get_op_types_in_program(prog) == ["square"] + assert get_op_types_in_program(prog) == ["cast", "cast"] + cast_ops = block.find_ops(op_type="cast") + assert cast_ops[0].outputs[0].dtype == types.fp16 + assert cast_ops[1].outputs[0].dtype == types.fp32 + assert block.outputs[0].dtype == types.fp32 - assert_model_is_valid( - prog, - {"x": (10, 20)}, - expected_output_shapes={block.outputs[0].name: (10, 20)}, + def test_cast_ops_fusion_negative_smoke_2(self): + """ + Input graph: + input(int32) --> cast(dtype="uint8") --> cast(dtype="int8") --> out + + Output graph: + input --> cast --> cast --> out + + This pattern should not be fused, since the data range results from uint8 -> int8 + is [0, 127], while a single cast(int8) produces [-128, 127]. The data point between [-128, 0] will have wrong numerical result. + """ + + @mb.program( + input_specs=[mb.TensorSpec(shape=(1,), dtype=types.int32)], + opset_version=ct.target.iOS17, ) + def prog(x): + x = mb.cast(x=x, dtype="uint8") + x = mb.cast(x=x, dtype="int8") + return x - """ - Input graph: - input---->cast(dtype="int32")---->cast(dtype="fp16")--->square--->out + apply_pass_and_basic_check(prog, "common::cast_optimization") + _, _, block = apply_pass_and_basic_check(prog, "common::dead_code_elimination") - Output graph: - input----->cast(dtype="fp16")----->square--->out + assert get_op_types_in_program(prog) == ["cast", "cast"] + cast_ops = block.find_ops(op_type="cast") + assert cast_ops[0].outputs[0].dtype == types.uint8 + assert cast_ops[1].outputs[0].dtype == types.int8 + assert block.outputs[0].dtype == types.int8 + + @pytest.mark.parametrize( + "opset_version", + [ct.target.iOS14, ct.target.iOS17], + ) + def test_cast_ops_fusion_stress(self, opset_version): + """ + Test all possible dtype combination for each iOS version of cast. + + Input graph: + input(dtype=dtype_a) -> cast(dtype=dtype_b) -> cast(dtype=dtype_c) -> out + + Output graph: + The output graph can have cast ops with number from 0 to 2 + """ + + def _test_cast_op_fusion(dtype_a, dtype_b, dtype_c): + @mb.program( + input_specs=[mb.TensorSpec(shape=(1,), dtype=dtype_a)], opset_version=opset_version + ) + def prog(x): + x = mb.cast(x=x, dtype=builtin_to_string(dtype_b)) + x = mb.cast(x=x, dtype=builtin_to_string(dtype_c)) + return x + + _, _, block = apply_pass_and_basic_check(prog, "common::cast_optimization") + assert block.outputs[0].dtype == dtype_c + return + cast_ops = block.find_ops(op_type="cast") + if dtype_a == dtype_b: + assert len(cast_ops) == 0 + else: + assert len(cast_ops) == 1 + + opset_version_to_cast_op = { + ct.target.iOS14: _cast_iOS14, + ct.target.iOS17: _cast_iOS17, + } + cast_op = opset_version_to_cast_op[opset_version] + supported_dtypes = cast_op.type_domains["T"] + for dtype_a in supported_dtypes: + for dtype_b in supported_dtypes: + for dtype_c in supported_dtypes: + _test_cast_op_fusion(dtype_a, dtype_b, dtype_c) + +class TestCastOptimizationComplexPatterns: """ + Test cast ops fusion / romoval in some complex graph examples. 
+ """ + def test_linear_consecutive_cast_ops_cancellation(self): + """Test the cast optimization pass with more complicated patterns.""" - def test_linear_consecutive_cast_ops_fusion(self): - @mb.program(input_specs=[mb.TensorSpec(shape=(10, 20))]) + """ + Input graph: + input(fp16) -----> cast(dtype="fp32") -----> cast(dtype="fp16") ----> square ---> out + + Output graph: + input -----> square -----> out + """ + @mb.program(input_specs=[mb.TensorSpec(shape=(10, 20), dtype=types.fp16)]) def prog(x): - x = mb.cast(x=x, dtype="int32") + x = mb.cast(x=x, dtype="fp32") x = mb.cast(x=x, dtype="fp16") x = mb.square(x=x) return x @@ -3584,8 +3904,7 @@ def prog(x): apply_pass_and_basic_check(prog, "common::cast_optimization") _, _, block = apply_pass_and_basic_check(prog, "common::dead_code_elimination") - assert get_op_types_in_program(prog) == ["cast", "square"] - assert block.find_ops(op_type="cast")[0].dtype.val == "fp16" + assert get_op_types_in_program(prog) == ["square"] assert_model_is_valid( prog, @@ -3593,41 +3912,28 @@ def prog(x): expected_output_shapes={block.outputs[0].name: (10, 20)}, ) - """ - Input graph: - input-->cast(dtype="fp16")-->cast(dtype="fp16")-->cast(dtype="int32")-->cast(dtype="int64")-->cast(dtype="fp32")-->cast(dtype="fp16")-->square->out - - Output graph: - input---->cast(dtype="fp16")----->square--->out - """ + def test_linear_consecutive_cast_ops_fusion(self): + """ + Input graph: + input(fp32)---->cast(dtype="fp16")---->cast(dtype="bool")--->identity--->out - def test_linear_multiple_consecutive_cast_ops(self): + Output graph: + input(fp32)----->cast(dtype="bool")----->identity--->out + """ @mb.program(input_specs=[mb.TensorSpec(shape=(10, 20))]) def prog(x): x = mb.cast(x=x, dtype="fp16") - x = mb.cast(x=x, dtype="fp16") - x = mb.cast(x=x, dtype="int32") - x = mb.cast(x=x, dtype="int64") - x = mb.cast(x=x, dtype="fp32") - x = mb.cast(x=x, dtype="fp16") - x = mb.square(x=x) + x = mb.cast(x=x, dtype="bool") + x = mb.identity(x=x) return x - assert get_op_types_in_program(prog) == [ - "cast", - "cast", - "cast", - "cast", - "cast", - "cast", - "square", - ] + assert get_op_types_in_program(prog) == ["cast", "cast", "identity"] apply_pass_and_basic_check(prog, "common::cast_optimization") _, _, block = apply_pass_and_basic_check(prog, "common::dead_code_elimination") - assert get_op_types_in_program(prog) == ["cast", "square"] - assert block.find_ops(op_type="cast")[0].dtype.val == "fp16" + assert get_op_types_in_program(prog) == ["cast", "identity"] + assert block.find_ops(op_type="cast")[0].dtype.val == "bool" assert_model_is_valid( prog, @@ -3635,30 +3941,68 @@ def prog(x): expected_output_shapes={block.outputs[0].name: (10, 20)}, ) - """ - Input graph: - |---->cast(dtype="fp32")---->square--->out_1 - | - input---->cast(dtype="fp16")---->cast(dtype="fp32")---->relu--->out_2 - | - |---->cast(dtype="fp32")---->log--->out_3 + def test_linear_multiple_consecutive_cast_ops(self): + """ + Input graph: + input(fp16)-->cast(dtype="fp32")-->cast(dtype="fp32")-->cast(dtype="int32")-->cast(dtype="fp32")-->cast(dtype="fp16")-->square->out - Output graph: + Output graph: + input(fp16)-->cast(dtype="int32")-->cast(dtype="fp16")-->square--->out + """ + @mb.program(input_specs=[mb.TensorSpec(shape=(10, 20), dtype=types.fp16)]) + def prog(x): + x = mb.cast(x=x, dtype="fp32") + x = mb.cast(x=x, dtype="fp32") + x = mb.cast(x=x, dtype="int32") + x = mb.cast(x=x, dtype="fp32") + x = mb.cast(x=x, dtype="fp16") + x = mb.square(x=x) + return x - |---->square--->out_1 - | - 
input---->relu--->out_2
- |
- |---->log--->out_3
- """
+ assert get_op_types_in_program(prog) == [
+ "cast",
+ "cast",
+ "cast",
+ "cast",
+ "cast",
+ "square",
+ ]
+
+ apply_pass_and_basic_check(prog, "common::cast_optimization")
+ _, _, block = apply_pass_and_basic_check(prog, "common::dead_code_elimination")
+ assert get_op_types_in_program(prog) == ["cast", "cast", "square"]
+ assert block.find_ops(op_type="cast")[0].dtype.val == "int32"
+ assert block.find_ops(op_type="cast")[1].dtype.val == "fp16"
+
+ assert_model_is_valid(
+ prog,
+ {"x": (10, 20)},
+ expected_output_shapes={block.outputs[0].name: (10, 20)},
+ )
def test_same_consecutive_cancelling_casts_on_all_branches(self):
- @mb.program(input_specs=[mb.TensorSpec(shape=(10, 20))])
+ """
+ Input graph:
+ |---->cast(dtype="fp16")---->square--->out_1
+ |
+ input(fp16)---->cast(dtype="fp32")---->cast(dtype="fp16")---->relu--->out_2
+ |
+ |---->cast(dtype="fp16")---->log--->out_3
+
+ Output graph:
+
+ |---->square--->out_1
+ |
+ input---->relu--->out_2
+ |
+ |---->log--->out_3
+ """
+ @mb.program(input_specs=[mb.TensorSpec(shape=(10, 20), dtype=types.fp16)])
def prog(x):
- x = mb.cast(x=x, dtype="fp16")
- x1 = mb.cast(x=x, dtype="fp32")
- x2 = mb.cast(x=x, dtype="fp32")
- x3 = mb.cast(x=x, dtype="fp32")
+ x = mb.cast(x=x, dtype="fp32")
+ x1 = mb.cast(x=x, dtype="fp16")
+ x2 = mb.cast(x=x, dtype="fp16")
+ x3 = mb.cast(x=x, dtype="fp16")
x4 = mb.square(x=x1)
x5 = mb.relu(x=x2)
x6 = mb.log(x=x3)
@@ -3689,33 +4033,34 @@ def prog(x):
},
)
- """
- Input graph:
- |---->cast(dtype="fp16")---->square--->out_1
- |
- input---->cast(dtype="int32")---->cast(dtype="fp16")---->relu--->out_2
- |
- |---->cast(dtype="fp16")---->log--->out_3
+ def test_consecutive_fusable_casts_on_all_branches(self):
+ """
+ Input graph:
+ |---->cast(dtype="int32")---->square--->out_1
+ |
+ input(fp16)---->cast(dtype="fp32")---->cast(dtype="int32")---->abs--->out_2
+ |
+ |---->cast(dtype="int32")---->identity--->out_3
- Output graph:
+ Output graph:
- |---->square--->out_1
- |
- input---->cast(dtype="fp16")---->relu--->out_2
- |
- |---->log--->out_3
- """
+ |-->square-->out_1
+ |
+ input(fp16)---->cast(dtype="int32")-->abs-->out_2
+ |
+ |-->identity->out_3
- def test_consecutive_fusable_casts_on_all_branches(self):
- @mb.program(input_specs=[mb.TensorSpec(shape=(10, 20))])
+ Note that this result needs the assistance of another pass, remove_redundant_ops.
+ """
+ @mb.program(input_specs=[mb.TensorSpec(shape=(10, 20), dtype=types.fp16)])
def prog(x):
- x = mb.cast(x=x, dtype="int32")
- x1 = mb.cast(x=x, dtype="fp16")
- x2 = mb.cast(x=x, dtype="fp16")
- x3 = mb.cast(x=x, dtype="fp16")
+ x = mb.cast(x=x, dtype="fp32")
+ x1 = mb.cast(x=x, dtype="int32")
+ x2 = mb.cast(x=x, dtype="int32")
+ x3 = mb.cast(x=x, dtype="int32")
x4 = mb.square(x=x1)
- x5 = mb.relu(x=x2)
- x6 = mb.log(x=x3)
+ x5 = mb.abs(x=x2)
+ x6 = mb.identity(x=x3)
return x4, x5, x6
assert get_op_types_in_program(prog) == [
@@ -3724,15 +4069,32 @@ def prog(x):
"cast",
"cast",
"square",
- "relu",
- "log",
+ "abs",
+ "identity",
]
apply_pass_and_basic_check(prog, "common::cast_optimization")
_, _, block = apply_pass_and_basic_check(prog, "common::dead_code_elimination")
+ assert get_op_types_in_program(prog) == [
+ "cast",
+ "cast",
+ "cast",
+ "square",
+ "abs",
+ "identity",
+ ]
+ cast_ops = block.find_ops(op_type="cast")
+ assert all([v.dtype.val == "int32" for v in cast_ops])
- assert get_op_types_in_program(prog) == ["cast", "square", "relu", "log"]
- assert block.find_ops(op_type="cast")[0].dtype.val == "fp16"
+ apply_pass_and_basic_check(prog, "common::remove_redundant_ops")
+ _, _, block = apply_pass_and_basic_check(prog, "common::dead_code_elimination")
+ assert get_op_types_in_program(prog) == [
+ "cast",
+ "square",
+ "abs",
+ "identity",
+ ]
+ assert block.find_ops(op_type="cast")[0].dtype.val == "int32"
assert_model_is_valid(
prog,
@@ -3744,48 +4106,48 @@ def prog(x):
},
)
- """
- Input graph:
-
- |---->cast(dtype="fp32")---->square--->out_1
- |
- |---->cast(dtype="fp16")---->square--->out_2
- |
- input---->cast(dtype="int32")---->cast(dtype="fp16")---->relu--->out_3
- |
- |---->cast(dtype="fp16")---->log--->out_4
- |
- |---->cast(dtype="fp32")---->log--->out_5
+ def test_mixed_consecutive_casts_on_different_branches(self):
+ """
+ Input graph:

+ |---->cast(dtype="fp16")---->square--->out_1
+ |
+ |---->cast(dtype="int32")---->square--->out_2
+ |
+ input(fp16)---->cast(dtype="fp32")---->cast(dtype="int32")---->identity--->out_3
+ |
+ |---->cast(dtype="int32")---->abs--->out_4
+ |
+ |---->cast(dtype="fp16")---->abs--->out_5
- Output graph:
+ Output graph:
- |---->square--->out_1
- |
- | |---->square--->out_2
- | |
- input---->cast(dtype="fp16")---->relu--->out_3
- | |
- | |---->log--->out_4
- |
- |
- |---->log--->out_5
+ |---->square--->out_1
+ |
+ | |---->square--->out_2
+ | |
+ input(fp16)---->cast(dtype="int32")---->identity--->out_3
+ | |
+ | |---->abs--->out_4
+ |
+ |
+ |---->abs--->out_5
- """
+ Note that this result needs the assistance of another pass, remove_redundant_ops.
+ """
- def test_mixed_consecutive_casts_on_different_branches(self):
- @mb.program(input_specs=[mb.TensorSpec(shape=(10, 20))])
+ @mb.program(input_specs=[mb.TensorSpec(shape=(10, 20), dtype=types.fp16)])
def prog(x):
- x = mb.cast(x=x, dtype="int32")
- x1 = mb.cast(x=x, dtype="fp32")
- x2 = mb.cast(x=x, dtype="fp16")
- x3 = mb.cast(x=x, dtype="fp16")
- x4 = mb.cast(x=x, dtype="fp16")
- x5 = mb.cast(x=x, dtype="fp32")
+ x = mb.cast(x=x, dtype="fp32")
+ x1 = mb.cast(x=x, dtype="fp16")
+ x2 = mb.cast(x=x, dtype="int32")
+ x3 = mb.cast(x=x, dtype="int32")
+ x4 = mb.cast(x=x, dtype="int32")
+ x5 = mb.cast(x=x, dtype="fp16")
x6 = mb.square(x=x1)
x7 = mb.square(x=x2)
- x8 = mb.relu(x=x3)
- x9 = mb.log(x=x4)
- x10 = mb.log(x=x5)
+ x8 = mb.identity(x=x3)
+ x9 = mb.abs(x=x4)
+ x10 = mb.abs(x=x5)
return x6, x7, x8, x9, x10
assert get_op_types_in_program(prog) == [
@@ -3797,17 +4159,37 @@ def prog(x):
"cast",
"square",
"square",
- "relu",
- "log",
- "log",
+ "identity",
+ "abs",
+ "abs",
]
apply_pass_and_basic_check(prog, "common::cast_optimization")
_, _, block = apply_pass_and_basic_check(prog, "common::dead_code_elimination")
+ assert get_op_types_in_program(prog) == [
+ "cast",
+ "cast",
+ "cast",
+ "square",
+ "square",
+ "identity",
+ "abs",
+ "abs",
+ ]
+ cast_ops = block.find_ops(op_type="cast")
+ assert all([v.dtype.val == "int32" for v in cast_ops])
- assert get_op_types_in_program(prog) == ["cast", "square", "square", "relu", "log", "log"]
- assert block.find_ops(op_type="cast")[0].dtype.val == "fp16"
-
+ apply_pass_and_basic_check(prog, "common::remove_redundant_ops")
+ _, _, block = apply_pass_and_basic_check(prog, "common::dead_code_elimination")
+ assert get_op_types_in_program(prog) == [
+ "cast",
+ "square",
+ "square",
+ "identity",
+ "abs",
+ "abs",
+ ]
+ assert block.find_ops(op_type="cast")[0].dtype.val == "int32"
assert_model_is_valid(
prog,
{"x": (10, 20)},
@@ -3818,62 +4200,61 @@ def prog(x):
},
)
- """
- Input graph:
-
- |---->cast(dtype="fp32")---->square--->out_1
- |
- 
input---->cast(dtype="int32")---->cast(dtype="fp16")---->relu--->out_2 - | - |---->log--->out_3 + def test_different_consecutive_casts_config_on_different_branches(self): + """ + Input graph: + |---->cast(dtype="fp16")---->square--->out_1 + | + input(fp16)---->cast(dtype="fp32")---->cast(dtype="int32")---->exp2--->out_2 + | + |---->abs--->out_3 - Output graph: - |---->square--->out_1 - | - | - | - input---->cast(dtype="fp16")---->relu--->out_2 - | - | - | - | - |---->cast(dtype="int32")---->abs--->out_3 + Output graph: - """ + |---->square--->out_1 + | + | + | + input(fp16)---->cast(dtype="int32")---->exp2--->out_2 + | + | + | + | + |---->cast(dtype="fp32")---->abs--->out_3 - def test_different_consecutive_casts__config_on_different_branches(self): - @mb.program(input_specs=[mb.TensorSpec(shape=(10, 20))]) + """ + @mb.program(input_specs=[mb.TensorSpec(shape=(10, 20), dtype=types.fp16)]) def prog(x): - x = mb.cast(x=x, dtype="int32") - x1 = mb.cast(x=x, dtype="fp32") - x2 = mb.cast(x=x, dtype="fp16") + x = mb.cast(x=x, dtype="fp32") + x1 = mb.cast(x=x, dtype="fp16") + x2 = mb.cast(x=x, dtype="int32") x3 = mb.square(x=x1) - x4 = mb.relu(x=x2) + x4 = mb.exp2(x=x2) x5 = mb.abs(x=x) return x3, x4, x5 - assert get_op_types_in_program(prog) == ["cast", "cast", "cast", "square", "relu", "abs"] + assert get_op_types_in_program(prog) == ["cast", "cast", "cast", "square", "exp2", "abs"] apply_pass_and_basic_check(prog, "common::cast_optimization") _, _, block = apply_pass_and_basic_check(prog, "common::dead_code_elimination") - assert get_op_types_in_program(prog) == ["cast", "cast", "square", "relu", "abs"] + assert get_op_types_in_program(prog) == ["cast", "cast", "square", "exp2", "abs"] # Asserting first cast configuration cast_1 = block.find_ops(op_type="cast")[0] - assert cast_1.dtype.val == "int32" + assert cast_1.dtype.val == "fp32" assert len(cast_1.outputs) == 1 assert len(cast_1.outputs[0].child_ops) == 1 assert cast_1.outputs[0].child_ops[0].op_type == "abs" # Asserting second cast configuration cast_2 = block.find_ops(op_type="cast")[1] - assert cast_2.dtype.val == "fp16" + assert cast_2.dtype.val == "int32" assert len(cast_2.outputs) == 1 assert len(cast_2.outputs[0].child_ops) == 1 - assert cast_2.outputs[0].child_ops[0].op_type == "relu" + assert cast_2.outputs[0].child_ops[0].op_type == "exp2" assert_model_is_valid( prog, @@ -3885,25 +4266,24 @@ def prog(x): }, ) - """ - Input graph: - input(dtype="fp16")---->relu----->relu - | - --------| - | - V - cast(dtype="fp32")---->cast(dtype="fp16") - | - ----------------------| - | - V - cast(dtype="fp32")---->cast(dtype="fp16")---->output(dtype="fp16") - - Output graph: - input(dtype="fp16")---->relu----->relu---->output(dtype="fp16") - """ - def test_two_casts_at_the_end(self): + """ + Input graph: + input(dtype="fp16")---->relu----->relu + | + --------| + | + V + cast(dtype="fp32")---->cast(dtype="fp16") + | + ----------------------| + | + V + cast(dtype="fp32")---->cast(dtype="fp16")---->output(dtype="fp16") + + Output graph: + input(dtype="fp16")---->relu----->relu---->output(dtype="fp16") + """ @mb.program(input_specs=[mb.TensorSpec(shape=(10, 20), dtype=types.fp16)]) def prog(x): x = mb.relu(x=x) @@ -3922,6 +4302,242 @@ def prog(x): assert block.outputs[0].name == "original_output_name" assert block.outputs[0].dtype == types.fp16 + def test_mixed_consecutive_casts_on_different_branches_complex(self): + """ + Input graph: + + |->cast(dtype="fp16")->cast(dtype="fp16")->out_1 + | + 
input(fp16)---->cast(dtype="fp32")->cast(dtype="uint8")->cast(dtype="int8")->out_2
+ |
+ |->cast(dtype="int32")->out_3
+ |
+ |->cast(dtype="int32")->cast(dtype="float32")->out_4
+
+ Output graph:
+
+ |-->out_1
+ |
+ input(fp16)-->cast(dtype="uint8")-->cast(dtype="int8")-->out_2
+ |
+ .-->cast(dtype="int32")-->out_3
+ |
+ .-->cast(dtype="float32")-->out_4
+
+ Note that this result needs the assistance of another pass, remove_redundant_ops.
+ """
+
+ @mb.program(
+ input_specs=[mb.TensorSpec(shape=(1,), dtype=types.fp16)], opset_version=ct.target.iOS17
+ )
+ def prog(x):
+ x = mb.cast(x=x, dtype="fp32")
+ x1 = mb.cast(x=x, dtype="fp16")
+ x1 = mb.cast(x=x1, dtype="fp16")
+ x2 = mb.cast(x=x, dtype="uint8")
+ x2 = mb.cast(x=x2, dtype="int8")
+ x3 = mb.cast(x=x, dtype="int32")
+ x4 = mb.cast(x=x, dtype="int32")
+ x4 = mb.cast(x=x4, dtype="fp32")
+ return x2, x3, x4
+
+ assert get_op_types_in_program(prog) == ["cast"] * 8
+ apply_pass_and_basic_check(prog, "common::cast_optimization")
+ apply_pass_and_basic_check(prog, "common::remove_redundant_ops")
+ _, _, block = apply_pass_and_basic_check(prog, "common::dead_code_elimination")
+ assert get_op_types_in_program(prog) == ["cast"] * 4
+
+ expected_cast_dtype = ["uint8", "int8", "int32", "fp32"]
+ cast_ops = block.find_ops(op_type="cast")
+ assert [v.dtype.val for v in cast_ops] == expected_cast_dtype
+
+
+class TestCastOptimizationAcrossBlocks:
+ """
+ Test the cast optimization for cast ops at the boundary between inner and outer blocks.
+ """
+ def test_cast_ops_fuse_across_block_smoke_1(self):
+ """
+ Input graph:
+ main[CoreML3](%x: (1,int32)(Tensor)) {
+ block0() {
+ %cast_0: (1,fp32)(Tensor) = cast(x=%x, dtype="fp32", name="cast_0")
+ %cond_0: (1,fp32)(Tensor) = cond(pred=True, name="cond_0")
+ cond_0_true() {
+ %cast_1: (1,fp32)(Tensor) = cast(x=%cast_0, dtype="fp32", name="cast_1")
+ } -> (%cast_1)
+ cond_0_false() {
+ %cast_2: (1,fp32)(Tensor) = cast(x=%cast_0, dtype="fp32", name="cast_2")
+ } -> (%cast_2)
+ } -> (%cond_0)
+ }
+
+ Output graph:
+ main[CoreML3](%x: (1,int32)(Tensor)) {
+ block0() {
+ %cast_0: (1,fp32)(Tensor) = cast(x=%x, dtype="fp32", name="cast_0")
+ %cond_0: (1,fp32)(Tensor) = cond(pred=True, name="cond_0")
+ cond_0_true() {
+ } -> (%cast_0)
+ cond_0_false() {
+ } -> (%cast_0)
+ } -> (%cond_0)
+ }
+ """
+ @mb.program(input_specs=[mb.TensorSpec(shape=(1,), dtype=types.int32)])
+ def prog(x):
+ x = mb.cast(x=x, dtype="fp32")
+ def _true_fn():
+ return mb.cast(x=x, dtype="fp32")
+
+ def _false_fn():
+ return mb.cast(x=x, dtype="fp32")
+
+ return mb.cond(pred=True, _true_fn=_true_fn, _false_fn=_false_fn)
+
+ _, _, block = apply_pass_and_basic_check(prog, "common::cast_optimization")
+ assert get_op_types_in_program(prog) == ["cast", "cond"]
+
+ cast_op = block.find_ops(op_type="cast")[0]
+ assert cast_op.dtype.val == "fp32"
+
+ cond_op = block.find_ops(op_type="cond")[0]
+ true_block, false_block = cond_op.blocks
+ assert get_op_types_in_block(true_block) == []
+ assert get_op_types_in_block(false_block) == []
+ assert true_block.outputs[0] == cast_op.outputs[0]
+ assert false_block.outputs[0] == cast_op.outputs[0]
+
+ def test_cast_ops_fuse_across_block_smoke_2(self):
+ """
+ Input graph:
+ main[CoreML3](%x: (1,fp32)(Tensor)) {
+ block0() {
+ %cast_0: (1,fp32)(Tensor) = cast(x=%x, dtype="fp32", name="cast_0")
+ %cond_0: (1,fp32)(Tensor) = cond(pred=True, name="cond_0")
+ cond_0_true() {
+ %cast_1: (1,fp32)(Tensor) = cast(x=%cast_0, dtype="fp32", name="cast_1")
+ } -> (%cast_1)
+ 
cond_0_false() {
+ %cast_2: (1,fp32)(Tensor) = cast(x=%cast_0, dtype="fp32", name="cast_2")
+ } -> (%cast_2)
+ } -> (%cond_0)
+ }
+
+ Output graph:
+ main[CoreML3](%x: (1,fp32)(Tensor)) {
+ block0() {
+ %cond_0: (1,fp32)(Tensor) = cond(pred=True, name="cond_0")
+ cond_0_true() {
+ } -> (%x)
+ cond_0_false() {
+ } -> (%x)
+ } -> (%cond_0)
+ }
+ """
+
+ @mb.program(input_specs=[mb.TensorSpec(shape=(1,), dtype=types.fp32)])
+ def prog(x):
+ x = mb.cast(x=x, dtype="fp32")
+
+ def _true_fn():
+ return mb.cast(x=x, dtype="fp32")
+
+ def _false_fn():
+ return mb.cast(x=x, dtype="fp32")
+
+ return mb.cond(pred=True, _true_fn=_true_fn, _false_fn=_false_fn)
+
+ _, _, block = apply_pass_and_basic_check(prog, "common::cast_optimization")
+ assert get_op_types_in_program(prog) == ["cond"]
+
+ cond_op = block.find_ops(op_type="cond")[0]
+ true_block, false_block = cond_op.blocks
+ assert get_op_types_in_block(true_block) == []
+ assert get_op_types_in_block(false_block) == []
+ assert true_block.outputs[0] == block.inputs["x"]
+ assert false_block.outputs[0] == block.inputs["x"]
+
+ def test_cast_ops_fuse_across_block_smoke_3(self):
+ """
+ Input graph:
+ main[CoreML7](%x: (1,int32)(Tensor)) {
+ block0() {
+ %cast_0: (1,fp32)(Tensor) = cast(x=%x, dtype="fp32", name="cast_0")
+ %cond_0: (1,uint8)(Tensor) = cond(pred=True, name="cond_0")
+ cond_0_true() {
+ %cast_1: (1,int32)(Tensor) = cast(x=%cast_0, dtype="int32", name="cast_1")
+ %cast_2: (1,uint8)(Tensor) = cast(x=%cast_1, dtype="uint8", name="cast_2")
+ %cast_3: (1,fp32)(Tensor) = cast(x=%cast_2, dtype="fp32", name="cast_3")
+ %cast_4: (1,uint8)(Tensor) = cast(x=%cast_3, dtype="uint8", name="cast_4")
+ } -> (%cast_4)
+ cond_0_false() {
+ %cast_5: (1,int8)(Tensor) = cast(x=%cast_0, dtype="int8", name="cast_5")
+ %cast_6: (1,bool)(Tensor) = cast(x=%cast_5, dtype="bool", name="cast_6")
+ %cast_7: (1,uint8)(Tensor) = cast(x=%cast_6, dtype="uint8", name="cast_7")
+ } -> (%cast_7)
+ } -> (%cond_0)
+ }
+
+ Output graph:
+ main[CoreML7](%x: (1,int32)(Tensor)) {
+ block0() {
+ %cond_0: (1,uint8)(Tensor) = cond(pred=True, name="cond_0")
+ cond_0_true() {
+ %x_to_uint8: (1,uint8)(Tensor) = cast(x=%x, dtype="uint8", name="x_to_uint8")
+ } -> (%x_to_uint8)
+ cond_0_false() {
+ %x_to_bool: (1,bool)(Tensor) = cast(x=%x, dtype="bool", name="x_to_bool")
+ %cast_7: (1,uint8)(Tensor) = cast(x=%x_to_bool, dtype="uint8", name="cast_7")
+ } -> (%cast_7)
+ } -> (%cond_0)
+ }
+
+ This is a more complex example:
+ First, in the true branch, the 4 ``cast`` ops are optimized into a single ``cast(dtype="uint8")``, while in the false branch, the 3 ``cast`` ops are optimized into ``cast(dtype="bool")->cast(dtype="uint8")``.
+ Second, the first ``cast`` op in each inner block is fused with the outer ``cast_0`` op, resulting in the above output graph.
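+
+ For reference, ``apply_pass_and_basic_check`` used throughout these tests is a
+ testing helper that wraps the registered pass with extra validation; the pass
+ can also be run directly through the registry (a minimal sketch):
+
+ from coremltools.converters.mil.mil.passes.pass_registry import PASS_REGISTRY
+
+ PASS_REGISTRY["common::cast_optimization"](prog)  # applies the pass to prog in place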
+ """ + + @mb.program( + input_specs=[mb.TensorSpec(shape=(1,), dtype=types.int32)], + opset_version=ct.target.iOS17, + ) + def prog(x): + x = mb.cast(x=x, dtype="fp32") + + def _true_fn(): + x1 = mb.cast(x=x, dtype="int32") + x1 = mb.cast(x=x1, dtype="uint8") + x1 = mb.cast(x=x1, dtype="fp32") + return mb.cast(x=x1, dtype="uint8") + + def _false_fn(): + x2 = mb.cast(x=x, dtype="int8") + x2 = mb.cast(x=x2, dtype="bool") + return mb.cast(x=x2, dtype="uint8") + + return mb.cond(pred=True, _true_fn=_true_fn, _false_fn=_false_fn) + + _, _, block = apply_pass_and_basic_check(prog, "common::cast_optimization") + _, _, block = apply_pass_and_basic_check(prog, "common::dead_code_elimination") + + assert get_op_types_in_program(prog) == ["cond"] + + cond_op = block.find_ops(op_type="cond")[0] + true_block, false_block = cond_op.blocks + assert get_op_types_in_block(true_block) == ["cast"] + assert get_op_types_in_block(false_block) == ["cast"] * 2 + + expected_true_branch_types = ["uint8"] + expected_false_branch_types = ["bool", "uint8"] + + assert expected_true_branch_types == [ + v.dtype.val for v in true_block.find_ops(op_type="cast") + ] + assert expected_false_branch_types == [ + v.dtype.val for v in false_block.find_ops(op_type="cast") + ] class TestConv1dCompositionPasses: @pytest.mark.parametrize( @@ -6474,6 +7090,188 @@ def prog(x): assert_model_is_valid(prog, {"x": (2, 4)}) +class TestSelectOptimization: + @pytest.mark.parametrize( + "cond_val, is_cond_scalar, need_broadcast, is_block_output", + itertools.product( + (True, False), + (True, False), + (True, False), + (True, False), + ), + ) + def test_const_scalar_cond(self, cond_val, is_cond_scalar, need_broadcast, is_block_output): + """ + Input graph: + + const(cond) -| + | + a -----------|-> select -> (add 1.0 if not is_block_output) -> output + | + b -----------| + + If a and b need broadcast, then nothing is changed; else output graph becomes: + + if cond: + if is_block_output: + a -> identity -> output + else: + a -> add 1.0 -> output + else: + if is_block_output: + b -> identity -> output + else: + b -> add 1.0 -> output + """ + SHAPE = (5, 2, 3) + + if need_broadcast: + a_shape = (5, 2, 1) + b_shape = (5, 1, 3) + else: + a_shape = SHAPE + b_shape = SHAPE + + if is_cond_scalar: + cond = cond_val + else: + cond_shape = (5, 1, 1) + cond = np.full(cond_shape, cond_val) + + @mb.program( + input_specs=[ + mb.TensorSpec(shape=a_shape), + mb.TensorSpec(shape=b_shape), + ] + ) + def prog(a, b): + c = mb.select(cond=cond, a=a, b=b) + if not is_block_output: + c = mb.add(x=c, y=1.0) + return c + + prev_prog, _, _ = apply_pass_and_basic_check(prog, "common::select_optimization") + apply_pass_and_basic_check(prog, "common::noop_elimination") + _, _, block = apply_pass_and_basic_check(prog, "common::dead_code_elimination") + # check previous program + if is_block_output: + assert get_op_types_in_program(prev_prog) == ["select"] + else: + assert get_op_types_in_program(prev_prog) == ["select", "add"] + # check passed program + if is_block_output: + if need_broadcast: + assert get_op_types_in_program(prog) == ["select"] + else: + assert get_op_types_in_program(prog) == ["identity"] + else: + if need_broadcast: + assert get_op_types_in_program(prog) == ["select", "add"] + else: + assert get_op_types_in_program(prog) == ["add"] + + output_name = block.outputs[0].name + assert_model_is_valid( + prog, + {"a": a_shape, "b": b_shape}, + expected_output_shapes={output_name: SHAPE}, + ) + + prev_model = ct.convert( + prev_prog, + 
pass_pipeline=ct.PassPipeline.EMPTY, + convert_to="mlprogram", + compute_units=ct.ComputeUnit.CPU_ONLY, + ) + model = ct.convert( + prog, + pass_pipeline=ct.PassPipeline.EMPTY, + convert_to="mlprogram", + compute_units=ct.ComputeUnit.CPU_ONLY, + ) + + a = np.random.rand(*a_shape) + b = np.random.rand(*b_shape) + input_dict = {"a": a, "b": b} + prev_output = prev_model.predict(input_dict)[output_name] + output = model.predict(input_dict)[output_name] + np.testing.assert_allclose(prev_output, output, rtol=0.0, atol=0.0) + + @pytest.mark.parametrize( + "is_a_const, is_fill_scalar", + itertools.product((True, False), (True, False)), + ) + def test_inf_const_selection(self, is_a_const, is_fill_scalar): + """ + Input graph if is_a_const (else input and fill are swapped): + + const(cond) ------| + | + input ------------|-> select -> tanh -> output + | + const(±inf fill) -| + + Output graph: + + input -> add -> tanh -> output + """ + INPUT_SHAPE = (5, 2, 3) + + cond_shape = (2, 3) + + while True: + cond = np.random.randint(0, 2, size=cond_shape) == 0 + if not np.all(cond) and not np.all(np.logical_not(cond)): + break + + if is_fill_scalar: + fill = np.float16(-np.inf) + else: + fill_shape = (5, 2, 1) + fill = np.empty(fill_shape, dtype=np.float16) + neg_pos = np.random.randint(0, 2, size=fill_shape) + fill[np.where(neg_pos == 0)] = -np.inf + fill[np.where(neg_pos == 1)] = np.inf + + output_shape = INPUT_SHAPE + + @mb.program(input_specs=[mb.TensorSpec(shape=INPUT_SHAPE, dtype=types.fp16)]) + def prog(x): + if is_a_const: + y = mb.select(cond=cond, a=fill, b=x) + else: + y = mb.select(cond=cond, a=x, b=fill) + return mb.tanh(x=y) + + prev_prog, _, block = apply_pass_and_basic_check(prog, "common::select_optimization") + assert get_op_types_in_program(prev_prog) == ["select", "tanh"] + assert get_op_types_in_program(prog) == ["add", "tanh"] + + output_name = block.outputs[0].name + assert_model_is_valid( + prog, + {"x": INPUT_SHAPE}, + expected_output_shapes={output_name: output_shape}, + ) + + prev_model = ct.convert( + prev_prog, + pass_pipeline=ct.PassPipeline.EMPTY, + convert_to="mlprogram", + ) + model = ct.convert( + prog, + pass_pipeline=ct.PassPipeline.EMPTY, + convert_to="mlprogram", + ) + + a = 65500.0 * np.random.rand(*INPUT_SHAPE) + input_dict = {"x": a} + prev_output = prev_model.predict(input_dict)[output_name] + output = model.predict(input_dict)[output_name] + np.testing.assert_allclose(prev_output, output, rtol=0.0, atol=0.0) + + class TestFuseElementwiseToBatchNorm: """ Input graph: @@ -6685,7 +7483,10 @@ def test_nn_backend_style_sanitization(self): prog.add_function("main", ssa_fun) prev_prog, prev_block, block = apply_pass_and_basic_check( - prog, "common::sanitize_input_output_names", skip_output_name_check=True + prog, + "common::sanitize_input_output_names", + skip_output_name_check=True, + skip_input_name_check=True, ) relu_op = prog.find_ops(op_type="relu", exactly_one=True)[0] @@ -6694,7 +7495,7 @@ def test_nn_backend_style_sanitization(self): assert block.outputs[0].name == "out_1" # output name: sanitized # convert prev_prog to NN backend - mlmodel = ct.convert(prev_prog) + mlmodel = ct.convert(prev_prog, convert_to="neuralnetwork") spec = mlmodel._spec assert spec.description.input[0].name == "x_0" assert spec.description.output[0].name == "out_1" @@ -6734,7 +7535,7 @@ def prog(input): prog.set_main_output_types([ct.TensorType(dtype=np.float16)]) prev_prog, prev_block, block = apply_pass_and_basic_check( - prog, "common::update_output_dtypes" + prog, 
"common::update_output_dtypes", skip_output_type_check=True ) assert get_op_types_in_program(prev_prog) == ["abs", "square"] assert prev_block.outputs[0].dtype == types.int32 @@ -6770,11 +7571,46 @@ def prog(input): return x1, x2 prog.set_main_output_types([ct.TensorType(), ct.TensorType(dtype=np.float16)]) - _, _, block = apply_pass_and_basic_check(prog, "common::update_output_dtypes") + _, _, block = apply_pass_and_basic_check( + prog, "common::update_output_dtypes", skip_output_type_check=True + ) assert get_op_types_in_program(prog) == ["split", "cast"] assert block.outputs[1].dtype == types.fp16 assert block.outputs[1].name == "split_1" + def test_output_as_input(self, caplog): + """ + Given: + ----- + main(%input: (3, fp32)(Tensor)) { + block0() { + } -> (input) + } + prog.main_output_types = [ct.TensorType(dtype=np.float16)] + + Result: + Since the output var is also an input var, the dtype is not changed, and a warning message is thrown + ------ + main(%input: (3, fp32)(Tensor)) { + block0() { + } -> (input) + } + + """ + + @mb.program(input_specs=[mb.TensorSpec(shape=(3,), dtype=types.fp32)]) + def prog(input): + return input + + prog.set_main_output_types([ct.TensorType(dtype=np.float16)]) + _, _, block = apply_pass_and_basic_check( + prog, + "common::update_output_dtypes", + ) + warning_msg = "Output var 'input' is also an input var, hence the dtype cannot be changed: output var 'input' remains dtype fp32" + assert any([warning_msg in rec.message for rec in caplog.records]) + assert get_op_types_in_program(prog) == [] + assert block.outputs[0].dtype == types.fp32 class TestFuseLayerNormOrInstanceNorm: @pytest.mark.parametrize("axes_size", [1, 2, 3]) diff --git a/coremltools/converters/mil/mil/passes/tests/test_quantization_passes.py b/coremltools/converters/mil/mil/passes/tests/test_quantization_passes.py index a3da5d655..5d0c43333 100644 --- a/coremltools/converters/mil/mil/passes/tests/test_quantization_passes.py +++ b/coremltools/converters/mil/mil/passes/tests/test_quantization_passes.py @@ -5,7 +5,6 @@ import itertools from typing import Tuple -import unittest import numpy as np import parameterized @@ -100,6 +99,235 @@ def generate_random_quantize_input( return float_dtype(x_fp) +class TestIntOpCanonicalization: + @pytest.mark.parametrize("op_type", ["reshape"]) + def test_canonicalize_int_op(self, op_type): + """ + Input graph: + + input -> quantize -> dequantize -> int op -> quantize -> dequantize -> output + + Output graph: + + input -> quantize -> int op -> dequantize -> output + """ + input_shape = (5, 6) + output_shape = (5, 2, 3) + + @mb.program(input_specs=[mb.TensorSpec(shape=input_shape)], opset_version=ct.target.iOS17) + def prog(x): + quantize_0 = mb.quantize(input=x, scale=0.1, output_dtype="int8") + dequantize_1 = mb.dequantize(input=quantize_0, scale=0.1) + if op_type == "reshape": + reshape = mb.reshape(x=dequantize_1, shape=output_shape) + quantize_1 = mb.quantize(input=reshape, scale=0.1, output_dtype="int8") + dequantize_2 = mb.dequantize(input=quantize_1, scale=0.1) + return dequantize_2 + + prev_prog, _, _ = apply_pass_and_basic_check(prog, "common::int_op_canonicalization") + _, _, block = apply_pass_and_basic_check(prog, "common::dead_code_elimination") + assert get_op_types_in_program(prev_prog) == [ + "quantize", + "dequantize", "reshape", "quantize", + "dequantize", + ] + assert get_op_types_in_program(prog) == ["quantize", "reshape", "dequantize"] + + assert_model_is_valid( + prog, + {"x": input_shape}, + 
expected_output_shapes={block.outputs[0].name: output_shape}, + minimum_deployment_target=ct.target.iOS17, + backend=("mlprogram", "fp32"), + ) + + @pytest.mark.parametrize("all_are_int", (True, False)) + def test_canonicalize_versatile_inputs(self, all_are_int): + """ + Input graph: + + |-> int op 0 if all_are_int else add -> quantize -> dequantize -> output_0 + input -> quantize -> dequantize -| + |-> int op 1 -> quantize -> dequantize -> output_1 + + Output graph: + + if all_are_int: + + |-> int op 0 -> dequantize -> output_0 + input -> quantize -| + |-> int op 1 -> dequantize -> output_1 + + else: + + |-> dequantize -> add -> quantize -> dequantize -> output_0 + input -> quantize -| + |-> int op 1 -> dequantize -> output_1 + """ + input_shape = (5, 6) + output_shape = (5, 2, 3) + + @mb.program(input_specs=[mb.TensorSpec(shape=input_shape)], opset_version=ct.target.iOS17) + def prog(x): + quantize_0 = mb.quantize(input=x, scale=0.1, output_dtype="int8") + dequantize_1 = mb.dequantize(input=quantize_0, scale=0.1) + + # int op 0 (here reshape) path + if all_are_int: + reshape = mb.reshape(x=dequantize_1, shape=output_shape) + quantize_1_0 = mb.quantize(input=reshape, scale=0.1, output_dtype="int8") + dequantize_2_0 = mb.dequantize(input=quantize_1_0, scale=0.1) + # float op (here add) path + else: + add = mb.add(x=dequantize_1, y=1.0) + quantize_1_0 = mb.quantize(input=add, scale=0.1, output_dtype="int8") + dequantize_2_0 = mb.dequantize(input=quantize_1_0, scale=0.1) + + # int op 1 (here reshape) path + reshape = mb.reshape(x=dequantize_1, shape=output_shape) + quantize_1_1 = mb.quantize(input=reshape, scale=0.1, output_dtype="int8") + dequantize_2_1 = mb.dequantize(input=quantize_1_1, scale=0.1) + + return dequantize_2_0, dequantize_2_1, + + prev_prog, _, block = apply_pass_and_basic_check(prog, "common::int_op_canonicalization") + if all_are_int: + _, _, block = apply_pass_and_basic_check(prog, "common::dead_code_elimination") + assert get_op_types_in_program(prev_prog) == [ + "quantize", "dequantize", + "reshape", "quantize", "dequantize", + "reshape", "quantize", "dequantize", + ] + assert get_op_types_in_program(prog) == [ + "quantize", + "reshape", "dequantize", + "reshape", "dequantize", + ] + else: + assert get_op_types_in_program(prev_prog) == [ + "quantize", "dequantize", + "add", "quantize", "dequantize", + "reshape", "quantize", "dequantize", + ] + assert get_op_types_in_program(prog) == [ + "quantize", + "dequantize", "add", "quantize", "dequantize", + "reshape", "dequantize", + ] + + assert_model_is_valid( + prog, + {"x": input_shape}, + expected_output_shapes={ + block.outputs[0].name: output_shape if all_are_int else input_shape, + block.outputs[1].name: output_shape, + }, + minimum_deployment_target=ct.target.iOS17, + backend=("mlprogram", "fp32"), + ) + + def test_canonicalize_consecutive_int_ops(self): + """ + Input graph: + + input -> quantize -> dequantize -> int op 0 -> quantize -> dequantize -> int op 1 -> quantize -> dequantize -> output + + Output graph: + + input -> quantize -> int op 0 -> int op 1 -> dequantize -> output + """ + input_shape = (5, 6) + activation_shape = (10, 3) + output_shape = (5, 2, 3) + + @mb.program(input_specs=[mb.TensorSpec(shape=input_shape)], opset_version=ct.target.iOS17) + def prog(x): + quantize_0 = mb.quantize(input=x, scale=0.1, output_dtype="int8") + + dequantize_1 = mb.dequantize(input=quantize_0, scale=0.1) + reshape0 = mb.reshape(x=dequantize_1, shape=activation_shape) + quantize_1 = mb.quantize(input=reshape0, scale=0.1, 
output_dtype="int8") + + dequantize_2 = mb.dequantize(input=quantize_1, scale=0.1) + reshape1 = mb.reshape(x=dequantize_2, shape=output_shape) + quantize_2 = mb.quantize(input=reshape1, scale=0.1, output_dtype="int8") + + dequantize_3 = mb.dequantize(input=quantize_2, scale=0.1) + return dequantize_3 + + prev_prog, _, _ = apply_pass_and_basic_check(prog, "common::int_op_canonicalization") + _, _, block = apply_pass_and_basic_check(prog, "common::dead_code_elimination") + assert get_op_types_in_program(prev_prog) == [ + "quantize", + "dequantize", "reshape", "quantize", + "dequantize", "reshape", "quantize", + "dequantize", + ] + assert get_op_types_in_program(prog) == ["quantize", "reshape", "reshape", "dequantize"] + + assert_model_is_valid( + prog, + {"x": input_shape}, + expected_output_shapes={block.outputs[0].name: output_shape}, + minimum_deployment_target=ct.target.iOS17, + backend=("mlprogram", "fp32"), + ) + + def test_canonicalize_block_output_input(self): + """ + Input graph: + + |-> output_0 + input -> quantize -> dequantize -| + |-> int op -> quantize -> dequantize -> output_1 + + Output graph: + + |-> dequantize -> output_0 + input -> quantize -| + |-> int op -> dequantize -> output_1 + """ + input_shape = (5, 6) + output_shape = (5, 2, 3) + + @mb.program(input_specs=[mb.TensorSpec(shape=input_shape)], opset_version=ct.target.iOS17) + def prog(x): + quantize_0 = mb.quantize(input=x, scale=0.1, output_dtype="int8") + dequantize_1 = mb.dequantize(input=quantize_0, scale=0.1) + + reshape = mb.reshape(x=dequantize_1, shape=output_shape) + quantize_1 = mb.quantize(input=reshape, scale=0.1, output_dtype="int8") + dequantize_2 = mb.dequantize(input=quantize_1, scale=0.1) + + return dequantize_1, dequantize_2 + + prev_prog, _, block = apply_pass_and_basic_check(prog, "common::int_op_canonicalization") + assert get_op_types_in_program(prev_prog) == [ + "quantize", "dequantize", + "reshape", "quantize", "dequantize", + ] + assert get_op_types_in_program(prog) == [ + "quantize", + "dequantize", + "reshape", "dequantize", + ] + + assert_model_is_valid( + prog, + {"x": input_shape}, + expected_output_shapes={ + block.outputs[0].name: input_shape, + block.outputs[1].name: output_shape, + }, + minimum_deployment_target=ct.target.iOS17, + backend=("mlprogram", "fp32"), + ) + + # TODO (rdar://112297858): test the case where `int_op_canonicalization` + # refuses to transform because the "int op" is from an older iOS version + # that does not support int8 and uint8 + + class TestNullifyRedundantQuantizationZeroPoint: @staticmethod def np_dtype_to_str(np_dtype: np.dtype) -> str: @@ -1329,7 +1557,11 @@ def prog(x): assert get_op_types_in_program(prog) == ["dequantize"] -class TestFP16CastTransform(unittest.TestCase): +class TestFP16CastTransform: + def assertEqual(self, first, second): + """A convenience method to migrate from unittest (self.assertEqual) to pytest.""" + assert first == second + def test_single_input_to_single_operation(self): """ Input graph: @@ -1655,3 +1887,33 @@ def prog(x): expected_output_shapes={block.outputs[0].name: (1, 2), block.outputs[1].name: (1, 2)}, backend=("mlprogram", "fp16"), ) + + @pytest.mark.parametrize( + "opset_version, op_name", + itertools.product( + [None, ct.target.iOS17], + ["inverse", "log", "rsqrt"], + ), + ) + def test_epsilon_mixed_precision(self, opset_version, op_name): + """The IOS17+ elementwise unary ops with epsilon support mixed precision.""" + + @mb.program(input_specs=[mb.TensorSpec(shape=(2, 3))], opset_version=opset_version) + def 
prog(x): + return getattr(mb, op_name)(x=x, epsilon=0.1) + + _, _, block = apply_pass_and_basic_check(prog, "common::add_fp16_cast") + + expected_ops = ["cast", "cast", op_name, "cast"] + if opset_version is not None and opset_version >= ct.target.iOS17: + # Allow mixed precision, so the epsilon is not cast to fp16. + expected_ops = ["cast", op_name, "cast"] + assert get_op_types_in_program(prog) == expected_ops + + assert_model_is_valid( + prog, + {"x": (2, 3)}, + expected_output_shapes={block.outputs[0].name: (2, 3)}, + backend=("mlprogram", "fp16"), + minimum_deployment_target=opset_version, + ) diff --git a/coremltools/converters/mil/mil/tests/test_programs.py b/coremltools/converters/mil/mil/tests/test_programs.py index 80feeb1bd..fb0d2ea41 100644 --- a/coremltools/converters/mil/mil/tests/test_programs.py +++ b/coremltools/converters/mil/mil/tests/test_programs.py @@ -333,7 +333,12 @@ def prog(x): res = mb.reshape(x=x, shape=(1, 1, 1, 1, 1, 1), name="reshape_0") return res - ct.convert(prog, source="milinternal", compute_units=ct.ComputeUnit.CPU_ONLY) + ct.convert( + prog, + source="milinternal", + convert_to="neuralnetwork", + compute_units=ct.ComputeUnit.CPU_ONLY, + ) @staticmethod def test_rank5_list_early_error_out(): diff --git a/coremltools/converters/mil/mil/types/__init__.py b/coremltools/converters/mil/mil/types/__init__.py index 004e981ce..b49028e6e 100644 --- a/coremltools/converters/mil/mil/types/__init__.py +++ b/coremltools/converters/mil/mil/types/__init__.py @@ -11,20 +11,36 @@ from .type_dict import dict, empty_dict from .type_double import double, float, fp16, fp32, fp64, is_float from .type_globals_pseudo_type import globals_pseudo_type -from .type_int import (int8, int16, int32, int64, is_int, uint, uint8, - uint16, uint32, uint64) +from .type_int import int8, int16, int32, int64, is_int, uint, uint8, uint16, uint32, uint64 from .type_list import empty_list, is_list, list -from .type_mapping import (builtin_to_proto_types, builtin_to_string, - is_builtin, is_dict, is_primitive, is_scalar, - is_str, is_subtype, is_tensor, is_tuple, - np_dtype_to_py_type, nptype_from_builtin, - numpy_type_to_builtin_type, - numpy_val_to_builtin_val, promote_dtypes, - promote_types, proto_to_builtin_types, - string_to_builtin, type_to_builtin_type) +from .type_mapping import ( + BUILTIN_TO_PROTO_TYPES, + PROTO_TO_BUILTIN_TYPE, + builtin_to_string, + is_builtin, + is_dict, + is_primitive, + is_scalar, + is_str, + is_subtype, + is_tensor, + is_tuple, + np_dtype_to_py_type, + nptype_from_builtin, + numpy_type_to_builtin_type, + numpy_val_to_builtin_val, + promote_dtypes, + promote_types, + string_to_builtin, + type_to_builtin_type, +) from .type_str import str -from .type_tensor import (is_compatible_type, is_tensor_and_is_compatible, - tensor, tensor_has_complete_shape) +from .type_tensor import ( + is_compatible_type, + is_tensor_and_is_compatible, + tensor, + tensor_has_complete_shape, +) from .type_tuple import tuple from .type_unknown import unknown from .type_void import void diff --git a/coremltools/converters/mil/mil/types/type_dict.py b/coremltools/converters/mil/mil/types/type_dict.py index bf711211e..ff3ef9d98 100644 --- a/coremltools/converters/mil/mil/types/type_dict.py +++ b/coremltools/converters/mil/mil/types/type_dict.py @@ -60,3 +60,9 @@ def __contains__(self, key): dict.__template_name__ = "dict[" + keytype.__name__ + "," + valuetype.__name__ + "]" return dict + + +def is_dict(t): + if t is None: + return False + return get_type_info(t).name == "dict" diff --git 
a/coremltools/converters/mil/mil/types/type_mapping.py b/coremltools/converters/mil/mil/types/type_mapping.py index a6fbeab1f..7664a595b 100644 --- a/coremltools/converters/mil/mil/types/type_mapping.py +++ b/coremltools/converters/mil/mil/types/type_mapping.py @@ -2,6 +2,9 @@ # # Use of this source code is governed by a BSD-3-clause license that can be # found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause + +from collections import namedtuple + import numpy as _np import numpy as np import sympy as sm @@ -14,6 +17,7 @@ from .type_complex import complex64 as types_complex64 from .type_complex import complex128 as types_complex128 from .type_complex import is_complex +from .type_dict import is_dict from .type_double import fp16 as types_fp16 from .type_double import fp32 as types_fp32 from .type_double import fp64 as types_fp64 @@ -31,7 +35,7 @@ from .type_str import str as types_str from .type_unknown import unknown -_types_TO_NPTYPES = { +_TYPES_TO_NPTYPES = { types_bool: np.bool_, types_int8: np.int8, types_int16: np.int16, @@ -49,7 +53,25 @@ types_str: np.str_, } -_types_TO_STRINGS = { +_NPTYPES_TO_STRINGS = { + np.bool_: "bool", + np.int8: "int8", + np.int16: "int16", + np.int32: "int32", + np.int64: "int64", + np.uint8: "uint8", + np.uint16: "uint16", + np.uint32: "uint32", + np.uint64: "uint64", + np.float16: "fp16", + np.float32: "fp32", + np.float64: "fp64", + np.complex64: "complex64", + np.complex128: "complex128", + np.str_: "string", +} + +_TYPES_TO_STRINGS = { types_bool: "bool", types_int8: "int8", types_int16: "int16", @@ -67,7 +89,35 @@ types_str: "string", } -builtin_to_proto_types = { +_TYPES_TO_RESOLUTION = { + types_bool: 1, + types_int8: 1, + types_uint8: 1, + types_int16: 1, + types_uint16: 1, + types_int32: 1, + types_int64: 1, + types_fp16: np.finfo(np.float16).resolution, + types_fp32: np.finfo(np.float32).resolution, + types_fp64: np.finfo(np.float64).resolution, +} + +RangeTuple = namedtuple("RangeTuple", "low high") + +_TYPES_TO_RANGE = { + types_bool: RangeTuple(0, 1), + types_int8: RangeTuple(np.iinfo(np.int8).min, np.iinfo(np.int8).max), + types_uint8: RangeTuple(np.iinfo(np.uint8).min, np.iinfo(np.uint8).max), + types_int16: RangeTuple(np.iinfo(np.int16).min, np.iinfo(np.int16).max), + types_uint16: RangeTuple(np.iinfo(np.uint16).min, np.iinfo(np.uint16).max), + types_int32: RangeTuple(np.iinfo(np.int32).min, np.iinfo(np.int32).max), + types_int64: RangeTuple(np.iinfo(np.int64).min, np.iinfo(np.int64).max), + types_fp16: RangeTuple(np.finfo(np.float16).min, np.finfo(np.float16).max), + types_fp32: RangeTuple(np.finfo(np.float32).min, np.finfo(np.float32).max), + types_fp64: RangeTuple(np.finfo(np.float64).min, np.finfo(np.float64).max), +} + +BUILTIN_TO_PROTO_TYPES = { # bool: types_bool: _mil_pm.BOOL, @@ -93,9 +143,6 @@ types_str: _mil_pm.STRING, } -proto_to_builtin_types = {v: k for k, v in builtin_to_proto_types.items()} - - def np_dtype_to_py_type(np_dtype): # Can't use dict, as hash(np.int32) != hash(val.dtype) if np_dtype in [np.int32, np.int64]: @@ -108,30 +155,52 @@ def np_dtype_to_py_type(np_dtype): return complex raise NotImplementedError('{} is not supported'.format(np_dtype)) - -_STRINGS_TO_types = {v: k for k, v in _types_TO_STRINGS.items()} - +PROTO_TO_BUILTIN_TYPE = {v: k for k, v in BUILTIN_TO_PROTO_TYPES.items()} +_STRINGS_TO_TYPES = {v: k for k, v in _TYPES_TO_STRINGS.items()} +_STRINGS_TO_NPTYPES = {v: k for k, v in _NPTYPES_TO_STRINGS.items()} def string_to_builtin(s): """ Given a str, return its corresponding 
builtin type.
"""
- return _STRINGS_TO_types.get(s, None)
+ return _STRINGS_TO_TYPES[s]
def builtin_to_string(builtin_type):
"""
Given a builtin type, return its corresponding string representation.
"""
- return _types_TO_STRINGS.get(builtin_type, None)
+ if is_dict(builtin_type):
+ return "dict"
+ return _TYPES_TO_STRINGS[builtin_type]
+
+
+def string_to_nptype(s: str):
+ """
+ Given a str, return its corresponding numpy type.
+ """
+ return _STRINGS_TO_NPTYPES[s]
def nptype_from_builtin(btype):
"""
Given a builtin type, return its corresponding Numpy dtype.
"""
- return _types_TO_NPTYPES.get(btype, None)
+ return _TYPES_TO_NPTYPES[btype]
+
+def builtin_to_resolution(builtin_type: type):
+ """
+ Given a builtin type, return its corresponding resolution.
+ """
+ return _TYPES_TO_RESOLUTION[builtin_type]
+
+
+def builtin_to_range(builtin_type: type):
+ """
+ Given a builtin type, return its corresponding range.
+ """
+ return _TYPES_TO_RANGE[builtin_type]
def promote_types(dtype1, dtype2):
"""
@@ -416,10 +485,13 @@ def np_val_to_py_type(val):
if not isinstance(val, (_np.ndarray, _np.generic)):
return val
- if val.dtype in (_np.float16, _np.uint8, _np.int8, _np.uint16, _np.int16, _np.uint32):
+ if val.dtype in (_np.float16, _np.uint8, _np.int8, _np.uint32):
+ # Serialize to bytes because MIL reads them from the bytes field (see TensorValue in MIL.proto).
return val.tobytes()
else:
- # val is np.ndarray or np.generic
+ if val.dtype in (_np.uint16, _np.int16):
+ # TODO (rdar://111797203): Serialize to bytes after MIL changes to read from the bytes field.
+ val = val.astype(np.int32)
is_np_scalar = isinstance(val, _np.generic) or val.shape == ()
py_type = np_dtype_to_py_type(val.dtype)
return py_type(val) if is_np_scalar else tuple(py_type(v) for v in val.flatten())
diff --git a/coremltools/converters/mil/mil/var.py b/coremltools/converters/mil/mil/var.py
index 8af32badd..ac4be0d12 100644
--- a/coremltools/converters/mil/mil/var.py
+++ b/coremltools/converters/mil/mil/var.py
@@ -3,7 +3,7 @@
# Use of this source code is governed by a BSD-3-clause license that can be
# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
-from typing import Optional
+from typing import Optional, Union
from coremltools.converters.mil.mil import types
from coremltools.converters.mil.mil.types import builtin_to_string
@@ -253,7 +253,9 @@ def type_str(self):
def set_name(self, name):
self.name = name
- def is_tensor_or_scalar_of(self, dtype: str):
+ def is_tensor_or_scalar_of(self, dtype: Union[str, type]):
+ if isinstance(dtype, type):
+ dtype = builtin_to_string(dtype)
return (types.is_tensor(self.sym_type) or types.is_scalar(self.sym_type)) and builtin_to_string(self.dtype) == dtype
def __str__(self):
diff --git a/coremltools/converters/mil/test_flexible_shape_inputs.py b/coremltools/converters/mil/test_inputs_outputs_shape.py
similarity index 87%
rename from coremltools/converters/mil/test_flexible_shape_inputs.py
rename to coremltools/converters/mil/test_inputs_outputs_shape.py
index ad126e473..c93ac1ab6 100644
--- a/coremltools/converters/mil/test_flexible_shape_inputs.py
+++ b/coremltools/converters/mil/test_inputs_outputs_shape.py
@@ -13,21 +13,29 @@
import coremltools as ct
from coremltools._deps import _HAS_TF_2, _HAS_TORCH, MSG_TF2_NOT_FOUND, MSG_TORCH_NOT_FOUND
+from coremltools.converters.mil import Builder as mb
from coremltools.converters.mil.testing_reqs import backends, compute_units
if _HAS_TORCH:
import torch
+
torch.manual_seed(10)
class TestConvModule(torch.nn.Module):
def 
__init__(self, in_channels=3, out_channels=10, kernel_size=3): super(TestConvModule, self).__init__() - self.conv = torch.nn.Conv2d(in_channels, out_channels, - kernel_size) + self.conv = torch.nn.Conv2d(in_channels, out_channels, kernel_size) def forward(self, x): return self.conv(x) + class TestSimpleModule(torch.nn.Module): + def forward(self, x): + x = x + 1.0 + y = x - 9.0 + z = torch.sum(x) + return x, y, z + if _HAS_TF_2: import tensorflow as tf @@ -39,8 +47,8 @@ def _numpy_array_to_pil_image(x): """ assert len(x.shape) == 4 assert list(x.shape[:2]) == [1, 3] - x = x[0, :, :, :] # (3, H, W) - x = _np.transpose(x, [1, 2, 0]) # (H, W, 3) + x = x[0, :, :, :] # (3, H, W) + x = _np.transpose(x, [1, 2, 0]) # (H, W, 3) x = x.astype(_np.uint8) return PIL.Image.fromarray(x) @@ -49,15 +57,17 @@ def _compute_snr(arr1, arr2): arr1 = arr1.flatten() arr2 = arr2.flatten() noise = arr1 - arr2 - noise_var = _np.sum(noise ** 2) / len(noise) + 1e-7 - signal_energy = _np.sum(arr2 ** 2) / len(arr2) - max_signal_energy = _np.amax(arr2 ** 2) + noise_var = _np.sum(noise**2) / len(noise) + 1e-7 + signal_energy = _np.sum(arr2**2) / len(arr2) + max_signal_energy = _np.amax(arr2**2) snr = 10 * _np.log10(signal_energy / noise_var) psnr = 10 * _np.log10(max_signal_energy / noise_var) return snr, psnr -def _assert_torch_coreml_output_shapes(coreml_model, spec, torch_model, torch_example_input, is_image_input=False): +def _assert_torch_coreml_output_shapes( + coreml_model, spec, torch_model, torch_example_input, is_image_input=False +): torch_out = torch_model(torch_example_input) input_name = spec.description.input[0].name output_name = spec.description.output[0].name @@ -73,9 +83,66 @@ def _assert_torch_coreml_output_shapes(coreml_model, spec, torch_model, torch_ex _np.testing.assert_array_less(30, psnr) +class TestOutputShapes: + @staticmethod + @pytest.mark.parametrize( + "backend", + backends, + ) + def test_static_output_shapes(backend): + @mb.program( + input_specs=[ + mb.TensorSpec( + shape=(2, 3), + ) + ] + ) + def prog(x): + x = mb.add(x=x, y=1.0) + y = mb.sub(x=x, y=3.0) + z = mb.reduce_sum(x=x, axes=[0, 1], keep_dims=False) + return x, y, z + + model = ct.convert(prog, convert_to=backend[0]) + spec = model.get_spec() + expected_output_shape = [2, 3] if backend[0] == "mlprogram" else [] + assert spec.description.output[0].type.multiArrayType.shape == expected_output_shape + assert spec.description.output[1].type.multiArrayType.shape == expected_output_shape + # scalar outputs have shape () + assert spec.description.output[2].type.multiArrayType.shape == [] + + coreml_in = {"x": _np.random.rand(2, 3)} + model.predict(coreml_in) + + @staticmethod + @pytest.mark.parametrize( + "backend", + backends, + ) + def test_dynamic_output_shapes(backend): + + example_input = torch.rand(2, 3) + traced_model = torch.jit.trace(TestSimpleModule().eval(), example_input) + + input_shape = ct.Shape(shape=(2, ct.RangeDim(3, 5))) + model = ct.convert( + traced_model, inputs=[ct.TensorType(shape=input_shape)], convert_to=backend[0] + ) + + spec = model.get_spec() + # We don't put the shape information for dynamic output shapes, + # otherwise a runtime validation error would raise + assert spec.description.output[0].type.multiArrayType.shape == [] + assert spec.description.output[1].type.multiArrayType.shape == [] + # scalar outputs have shape () + assert spec.description.output[2].type.multiArrayType.shape == [] + + coreml_in = {"x_1": _np.random.rand(2, 3)} + model.predict(coreml_in) + + @pytest.mark.skipif(not _HAS_TORCH or 
not ct.utils._is_macos(), reason=MSG_TORCH_NOT_FOUND) class TestFlexibleInputShapesTorch: - @pytest.mark.parametrize( "backend, compute_unit", itertools.product( @@ -103,8 +170,12 @@ def test_multiarray_input_rangedim(self, backend, compute_unit): spec = model.get_spec() assert list(spec.description.input[0].type.multiArrayType.shape) == [1, 3, 45, 45] - assert spec.description.input[0].type.multiArrayType.shapeRange.sizeRanges[2].lowerBound == 25 - assert spec.description.input[0].type.multiArrayType.shapeRange.sizeRanges[2].upperBound == 100 + assert ( + spec.description.input[0].type.multiArrayType.shapeRange.sizeRanges[2].lowerBound == 25 + ) + assert ( + spec.description.input[0].type.multiArrayType.shapeRange.sizeRanges[2].upperBound == 100 + ) _assert_torch_coreml_output_shapes(model, spec, traced_model, example_input) @pytest.mark.parametrize( @@ -183,11 +254,16 @@ def test_multiarray_input_enumerated(self, backend, compute_unit): spec = model.get_spec() assert list(spec.description.input[0].type.multiArrayType.shape) == [1, 3, 67, 67] - assert list(spec.description.input[0].type.multiArrayType.enumeratedShapes.shapes[0].shape) == [1, 3, 67, 67] + assert list( + spec.description.input[0].type.multiArrayType.enumeratedShapes.shapes[0].shape + ) == [1, 3, 67, 67] assert len(spec.description.input[0].type.multiArrayType.enumeratedShapes.shapes) == 3 _assert_torch_coreml_output_shapes(model, spec, traced_model, example_input) - @pytest.mark.skipif(ct.utils._macos_version() < (12, 0), reason="Image input with RangeDim works correctly on macOS12+") + @pytest.mark.skipif( + ct.utils._macos_version() < (12, 0), + reason="Image input with RangeDim works correctly on macOS12+", + ) @pytest.mark.parametrize( "backend, compute_unit", itertools.product( @@ -215,7 +291,9 @@ def test_image_input_rangedim(self, backend, compute_unit): assert spec.description.input[0].type.imageType.height == 35 assert spec.description.input[0].type.imageType.imageSizeRange.widthRange.lowerBound == 25 assert spec.description.input[0].type.imageType.imageSizeRange.widthRange.upperBound == 100 - _assert_torch_coreml_output_shapes(model, spec, traced_model, example_input, is_image_input=True) + _assert_torch_coreml_output_shapes( + model, spec, traced_model, example_input, is_image_input=True + ) @pytest.mark.skipif( ct.utils._macos_version() < (12, 0), @@ -301,7 +379,9 @@ def test_image_input_enumerated(self, backend, compute_unit): assert len(spec.description.input[0].type.imageType.enumeratedSizes.sizes) == 3 assert spec.description.input[0].type.imageType.enumeratedSizes.sizes[0].width == 25 assert spec.description.input[0].type.imageType.enumeratedSizes.sizes[0].height == 25 - _assert_torch_coreml_output_shapes(model, spec, traced_model, example_input, is_image_input=True) + _assert_torch_coreml_output_shapes( + model, spec, traced_model, example_input, is_image_input=True + ) @pytest.mark.skipif(not _HAS_TF_2 or not ct.utils._is_macos(), reason=MSG_TF2_NOT_FOUND) diff --git a/coremltools/converters/mil/testing_reqs.py b/coremltools/converters/mil/testing_reqs.py index 4c6ba4bdd..224204b62 100644 --- a/coremltools/converters/mil/testing_reqs.py +++ b/coremltools/converters/mil/testing_reqs.py @@ -3,33 +3,145 @@ # Use of this source code is governed by a BSD-3-clause license that can be # found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause import os +from typing import List import numpy as np import pytest +from attrs import define, field, validators import coremltools as ct from 
coremltools._deps import _HAS_TF_1, _HAS_TF_2, _HAS_TORCH +from coremltools.converters.mil.testing_utils import macos_compatible_with_deployment_target + +# Setting up backend / precision / op version +_SUPPORTED_BACKENDS = ("neuralnetwork", "mlprogram") +_SUPPORTED_PRECISIONS = ("fp32", "fp16") +_SUPPORTED_OPSET_VERSIONS_NN = (ct.target.iOS14,) +_SUPPORTED_OPSET_VERSIONS_MLPROGRAM = (ct.target.iOS15, ct.target.iOS16, ct.target.iOS17) + +@define(frozen=True) +class BackendConfig: + """ + Parameters + ---------- + backend: str + "neuralnetwork" or "mlprogram" + precision: str + "fp16" or "fp32" + opset_version: ct.target + minimum_deployment_target for the ct.convert function + """ + backend: str = field(validator=validators.instance_of(str)) + precision: str = field(validator=validators.instance_of(str)) + opset_version: ct.target = field(validator=validators.instance_of(ct.target)) + + @backend.validator + def check_backend(self, attr, backend): + if backend not in _SUPPORTED_BACKENDS: + raise ValueError( + f"backend {backend} not supported. Please pass one of the following values: {_SUPPORTED_BACKENDS}" + ) + + @precision.validator + def check_precision(self, attr, precision): + if precision not in _SUPPORTED_PRECISIONS: + raise ValueError( + f"precision {precision} not supported. Please pass one of the following values: {_SUPPORTED_PRECISIONS}" + ) + if precision == "fp16" and self.backend == "neuralnetwork": + raise ValueError("fp16 precision is only supported in mlprogram backend.") + + @opset_version.validator + def check_opset_version(self, attr, opset_version): + if self.backend == "neuralnetwork" and opset_version not in _SUPPORTED_OPSET_VERSIONS_NN: + raise ValueError( + f"opset_version {opset_version} not supported in neuralnetwork backend. Supported opset versions are {_SUPPORTED_OPSET_VERSIONS_NN}" + ) + if self.backend == "mlprogram" and opset_version not in _SUPPORTED_OPSET_VERSIONS_MLPROGRAM: + raise ValueError( + f"opset_version {opset_version} not supported in mlprogram backend. 
Supported opset versions are {_SUPPORTED_OPSET_VERSIONS_MLPROGRAM}" + ) -# Setting up backend / precision -backends = [] if 'PYMIL_TEST_TARGETS' in os.environ: targets = os.environ['PYMIL_TEST_TARGETS'].split(',') for i in range(len(targets)): targets[i] = targets[i].strip() +else: + targets = ["mlprogram", "neuralnetwork"] - if 'mlprogram' in targets: - backends.append(('mlprogram', 'fp16')) +# new backends using the new infrastructure +backends_internal = [] +if "mlprogram" in targets: + for v in _SUPPORTED_OPSET_VERSIONS_MLPROGRAM: + precisions = ["fp16"] if os.getenv('INCLUDE_MIL_FP32_UNIT_TESTS') == '1': - backends.append(('mlprogram', 'fp32')) - if 'neuralnetwork' in targets: - backends.append(('neuralnetwork', 'fp32')) + precisions.append("fp32") + for p in precisions: + backends_internal.append( + BackendConfig(backend="mlprogram", precision=p, opset_version=v) + ) - if not backends: - raise ValueError("PYMIL_TEST_TARGETS can be set to one or more of: neuralnetwork, mlprogram") -else: - backends = [('mlprogram', "fp16"), ('neuralnetwork', "fp32")] - if os.getenv('INCLUDE_MIL_FP32_UNIT_TESTS') == '1': - backends.append(('mlprogram', 'fp32')) +if "neuralnetwork" in targets: + for v in _SUPPORTED_OPSET_VERSIONS_NN: + backends_internal.append( + BackendConfig( + backend="neuralnetwork", + precision="fp32", + opset_version=v, + ) + ) + +# old backends approach +backends = [] +if "mlprogram" in targets: + backends.append(("mlprogram", "fp16")) + if os.getenv("INCLUDE_MIL_FP32_UNIT_TESTS") == "1": + backends.append(("mlprogram", "fp32")) +if "neuralnetwork" in targets: + backends.append(("neuralnetwork", "fp32")) + +if not backends or not backends_internal: + raise ValueError("PYMIL_TEST_TARGETS can be set to one or more of: neuralnetwork, mlprogram") + + +def clean_up_backends( + backends: List[BackendConfig], + minimum_opset_version: ct.target, + force_include_iOS15_test: bool = False, +) -> List[BackendConfig]: + """ + Given a list of BackendConfig objects, this utility function filters out the invalid elements. + + For instance, given a list of configs with opset_versions ranging from iOS14 to iOS17, with minimum_opset_version set to iOS16 and the environment variable `RUN_BACKWARD_COMPATIBILITY=1`, the iOS14/iOS15 configs are removed, and the iOS16/iOS17 configs are preserved. + + To be more specific, a config is removed if one of the following conditions is met: + 1. opset_version is not compatible with the host macOS version. + 2. opset_version < minimum_opset_version + 3. For a non-backward-compatibility run, opset_version > minimum_opset_version + + Note the corner case: when `force_include_iOS15_test=True`, the iOS15 configs are always preserved. 
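+ + Illustrative sketch (not part of the test suite; it assumes a macOS new enough to run every target and `RUN_BACKWARD_COMPATIBILITY` unset): + + configs = [ + BackendConfig("mlprogram", "fp16", v) for v in _SUPPORTED_OPSET_VERSIONS_MLPROGRAM + ] + clean_up_backends(configs, minimum_opset_version=ct.target.iOS16) + # -> only the iOS16 config survives the filtering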
+ """ + test_all_opset_versions = os.getenv("RUN_BACKWARD_COMPATIBILITY") == "1" + res = [] + for config in backends: + # First check if the macOS are able to run the test + if not macos_compatible_with_deployment_target(config.opset_version): + continue + if force_include_iOS15_test and config.opset_version == ct.target.iOS15: + res.append(config) + continue + if config.opset_version < minimum_opset_version: + continue + if not test_all_opset_versions and config.opset_version > minimum_opset_version: + continue + res.append(config) + + if len(res) == 0: + pytest.skip( + f"Tests are not runnable under {minimum_opset_version.name}.", allow_module_level=True + ) + + return res # Setting up compute unit compute_units = [] diff --git a/coremltools/converters/mil/testing_utils.py b/coremltools/converters/mil/testing_utils.py index 083897917..18f2d8865 100644 --- a/coremltools/converters/mil/testing_utils.py +++ b/coremltools/converters/mil/testing_utils.py @@ -6,7 +6,6 @@ import copy import os -import re from functools import partial from pathlib import Path from typing import Dict, List, Tuple @@ -18,7 +17,8 @@ import coremltools as ct import coremltools.models.utils as coremltoolsutils from coremltools._deps import _IS_MACOS -from coremltools.converters.mil.mil import Function, Program +from coremltools.converters.mil.mil import Block, Function, Program +from coremltools.converters.mil.mil.passes.defs.preprocess import NameSanitizer as _NameSanitizer from coremltools.converters.mil.mil.passes.defs.quantization import AbstractQuantizationPass from coremltools.converters.mil.mil.passes.pass_registry import PASS_REGISTRY from coremltools.proto import FeatureTypes_pb2 as ft @@ -39,7 +39,7 @@ ct.target.iOS17: 14, } -einsum_equations: List[str] = [ +hardcoded_einsum_equations: List[str] = [ # hardcoded cases "abcd,adce->abce", "abc,cbd->abd", @@ -50,6 +50,9 @@ "bnft,btnh->bfnh", "abcd,cde->abe", "a b c d , a d c e -> a b c e", +] + +einsum_equations: List[str] = hardcoded_einsum_equations + [ # with-diagonal generic cases "jiii,ijjk->jk", "iji,ji->j", @@ -88,6 +91,14 @@ "iijj,j->j", ] + +def macos_compatible_with_deployment_target(minimum_deployment_target): + if coremltoolsutils._is_macos(): + macos_major_version = coremltoolsutils._macos_version()[0] + if macos_major_version < IOS_TO_MINIMUM_MACOS_VERSION[minimum_deployment_target]: + return False + return True + def _serialize_current_pytest(mlmodel): class_name = os.environ.get('PYTEST_CURRENT_TEST').split("::")[1].strip() test_name = "::".join(os.environ.get('PYTEST_CURRENT_TEST').split("::")[2:]).split("(call)")[0].strip() @@ -161,12 +172,35 @@ def assert_model_is_valid( assert out_shape == prediction[out_name].shape, \ "{} != {}".format(out_shape, prediction[out_name].shape) +def assert_same_input_names(prog1, prog2, func_name="main"): + # check the input keys + prog1_input_keys = list(prog1[func_name].inputs.keys()) + prog2_input_keys = list(prog2[func_name].inputs.keys()) + assert prog1_input_keys == prog2_input_keys + + # check the input var name + prog1_input_names = [x.name for x in list(prog1[func_name].inputs.values())] + prog2_input_names = [x.name for x in list(prog2[func_name].inputs.values())] + assert prog1_input_names == prog2_input_names + + +def assert_same_input_types(prog1, prog2, func_name="main"): + prog1_input_types = [x.dtype for x in list(prog1[func_name].inputs.values())] + prog2_input_types = [x.dtype for x in list(prog2[func_name].inputs.values())] + assert prog1_input_types == prog2_input_types def 
assert_same_output_names(prog1, prog2, func_name="main"): prog1_outputs = [o.name for o in prog1[func_name].outputs] prog2_outputs = [o.name for o in prog2[func_name].outputs] assert prog1_outputs == prog2_outputs +def assert_same_output_types(prog1: Program, prog2: Program, func_name: str = "main"): + """ + Check that ``prog1`` and ``prog2`` have the same output dtypes. + """ + prog1_output_types = [o.dtype for o in prog1[func_name].outputs] + prog2_output_types = [o.dtype for o in prog2[func_name].outputs] + assert prog1_output_types == prog2_output_types def assert_same_output_shapes(prog1, prog2, func_name="main"): prog1_output_shapes = [o.shape for o in prog1[func_name].outputs] @@ -186,20 +220,29 @@ def get_op_names_in_program(prog, func_name="main", skip_const_ops=True): op_names_in_program.append(op.name) return op_names_in_program -def get_op_types_in_program(prog, func_name="main", skip_const_ops=True): + +def get_op_types_in_block(block: Block, skip_const_ops: bool = True): """ - Return the operation types in prog[func_name], + Return the operation types in block, in the same order as they are stored (topological) """ - op_types_in_program = [] - for op in prog[func_name].operations: + op_types_in_block = [] + for op in block.operations: if skip_const_ops: if op.op_type == "const": continue - op_types_in_program.append(op.op_type) - return op_types_in_program + op_types_in_block.append(op.op_type) + return op_types_in_block +def get_op_types_in_program(prog: Program, func_name: str = "main", skip_const_ops: bool = True): + """ + Return the operation types in prog[func_name], + in the same order as they are stored (topological) + If ``skip_const_ops = True``, const ops are not returned. + """ + return get_op_types_in_block(prog[func_name], skip_const_ops) + def random_gen( shape, rand_min=0.0, @@ -216,13 +259,17 @@ def random_gen( Default data type is np.float32. """ elem = np.prod(shape).astype(np.int32) + + # Since this function is also used extensively for fp16-precision models, + # we make sure that the numerical values can be represented in fp16. 
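+ # (Generating directly in fp16 guarantees the values survive a later fp32 -> fp16 cast unchanged, so the fp32 and fp16 runs of a test see identical inputs.)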
+ gen_dtype = np.float16 if dtype == np.float32 else dtype ret = [] for _ in range(elem): while True: - r = dtype((rand_max - rand_min) * np.random.random() + rand_min) + r = gen_dtype((rand_max - rand_min) * np.random.random() + rand_min) if not allow_duplicate and r in ret: continue - if np.issubdtype(dtype, np.integer) or np.fabs(np.round(r) - r) > eps_from_int: + if np.issubdtype(gen_dtype, np.integer) or np.fabs(np.round(r) - r) > eps_from_int: ret.append(r) break ret = np.array(ret).reshape(shape) @@ -261,10 +308,18 @@ def run_core_ml_predict(mlmodel, input_key_values): def _get_coreml_out_from_dict(out_dict, out_name): if out_name in out_dict: return out_dict[out_name] - elif re.sub("[^a-zA-Z0-9_]", "_", out_name) in out_dict: - return out_dict[re.sub("[^a-zA-Z0-9_]", "_", out_name)] + sanitized_out_name = _NameSanitizer._replace_invalid_char_with_underscore(out_name) + if sanitized_out_name in out_dict: + return out_dict[sanitized_out_name] else: - raise KeyError("{} output not found in Core ML outputs".format(out_name)) + raise KeyError(f"{out_name} output not found in Core ML outputs") + +def _get_proto_output_shape(spec, out_name): + sanitized_out_name = _NameSanitizer._replace_invalid_char_with_underscore(out_name) + for coreml_o in spec.description.output: + if coreml_o.name == sanitized_out_name: + return coreml_o.type.multiArrayType.shape + raise KeyError(f"{out_name} output not found in Core ML outputs") def compare_backend( mlmodel, @@ -329,7 +384,6 @@ def compare_shapes(mlmodel, input_key_values, expected_outputs, pred=None): - pred: Prediction to use, if it has already been computed. """ - if _IS_MACOS: if not pred: pred = run_core_ml_predict(mlmodel, input_key_values) @@ -352,6 +406,19 @@ def compare_shapes(mlmodel, input_key_values, expected_outputs, pred=None): if expected.shape == () and coreml_out.shape == (1,): continue assert coreml_out.shape == expected.shape, msg + + # Validate the shape consistency across runtime returned values and + # the output information in the mlprogram proto. + spec = mlmodel.get_spec() + if spec.WhichOneof("Type") == "mlProgram": + # The proto output and the runtime outputs are different for classifiers + if spec.description.predictedFeatureName != "": + continue + proto_shape = _get_proto_output_shape(spec, o) + if proto_shape != []: + assert proto_shape == list( + coreml_out.shape + ), f"the output shape, for output named {o}, returned by the model is {coreml_out.shape}, which does not match the shape present in the proto spec, which is {proto_shape}" continue # output is other types (for classifier) @@ -409,13 +476,14 @@ def ct_convert( return mlmodel def get_core_ml_prediction( - build, input_placeholders, input_values, compute_unit=ct.ComputeUnit.CPU_ONLY, - backend=("neuralnetwork", "fp32")): + build, input_placeholders, input_values, backend, compute_unit=ct.ComputeUnit.CPU_ONLY +): """ Return predictions of the given model. 
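+ + ``backend`` is a ``BackendConfig`` (defined in ``testing_reqs.py``), for example + ``BackendConfig(backend="mlprogram", precision="fp16", opset_version=ct.target.iOS16)``.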
""" + minimum_deployment_target = backend.opset_version program = Program() - with Function(input_placeholders) as ssa_func: + with Function(input_placeholders, opset_version=minimum_deployment_target) as ssa_func: output_vars = build(**ssa_func.inputs) if isinstance(output_vars, tuple): output_vars = list(output_vars) @@ -427,13 +495,21 @@ def get_core_ml_prediction( mlmodel = ct_convert( program, source="milinternal", - convert_to=backend, - compute_units=compute_unit + convert_to=(backend.backend, backend.precision), + compute_units=compute_unit, + minimum_deployment_target=minimum_deployment_target, ) return mlmodel.predict(input_values) -def apply_pass_and_basic_check(prog, pass_name, skip_output_name_check=False): +def apply_pass_and_basic_check( + prog, + pass_name, + skip_output_name_check=False, + skip_output_type_check=False, + skip_input_name_check=False, + skip_input_type_check=False, +): """ Apply pass to the program """ @@ -444,7 +520,14 @@ def apply_pass_and_basic_check(prog, pass_name, skip_output_name_check=False): prev_block = prev_prog.functions["main"] if not skip_output_name_check: assert_same_output_names(prev_prog, prog) + if not skip_output_type_check: + assert_same_output_types(prev_prog, prog) assert_same_output_shapes(prev_prog, prog) + + if not skip_input_name_check: + assert_same_input_names(prev_prog, prog) + if not skip_input_type_check: + assert_same_input_types(prev_prog, prog) return prev_prog, prev_block, block @@ -578,9 +661,7 @@ def validate_minimum_deployment_target( """ if minimum_deployment_target >= ct.target.iOS15 and backend[0] != "mlprogram": pytest.skip("IOS15+ target only compatible with mlprogram.") - if coremltoolsutils._is_macos(): - macos_major_version = coremltoolsutils._macos_version()[0] - if macos_major_version < IOS_TO_MINIMUM_MACOS_VERSION[minimum_deployment_target]: - pytest.skip( - f"IOS{minimum_deployment_target} target requires macOS {macos_major_version}+." - ) + if not macos_compatible_with_deployment_target(minimum_deployment_target): + pytest.skip( + f"IOS{minimum_deployment_target} target is not runnable on this macOS {coremltoolsutils._macos_version()}" + ) diff --git a/coremltools/models/ml_program/compression_utils.py b/coremltools/models/ml_program/compression_utils.py index 942165e0f..28264eb2c 100644 --- a/coremltools/models/ml_program/compression_utils.py +++ b/coremltools/models/ml_program/compression_utils.py @@ -67,7 +67,9 @@ def palettize_weights(mlmodel, nbits=None, mode="kmeans", op_selector=None, lut_ version="7.0", obj_prefix="coremltools.compression_utils.", ) -def sparsify_weights(mlmodel, mode="threshold_based", threshold=1e-3, target_percentile=1.0, op_selector=None): +def sparsify_weights( + mlmodel, mode="threshold_based", threshold=1e-12, target_percentile=1.0, op_selector=None +): """ ``coremltools.compression_utils.sparsify_weights`` is deprecated and will be removed in the future. Please use ``coremltools.optimize.coreml.prune_weights``. diff --git a/coremltools/models/model.py b/coremltools/models/model.py index 74b765126..a30923cf2 100644 --- a/coremltools/models/model.py +++ b/coremltools/models/model.py @@ -492,7 +492,11 @@ def save(self, save_path: str): if not ext: save_path = "{}{}".format(save_path, _MLPACKAGE_EXTENSION) elif ext != _MLPACKAGE_EXTENSION: - raise Exception("For an ML Program, extension must be {} (not {})".format(_MLPACKAGE_EXTENSION, ext)) + raise Exception( + "For an ML Program, extension must be {} (not {}). 
Please see https://coremltools.readme.io/docs/unified-conversion-api#target-conversion-formats to see the difference between neuralnetwork and mlprogram model types.".format( + _MLPACKAGE_EXTENSION, ext + ) + ) _shutil.copytree(self.package_path, save_path) saved_spec_path = _os.path.join( diff --git a/coremltools/models/neural_network/quantization_utils.py b/coremltools/models/neural_network/quantization_utils.py index ddcbee825..2d4432121 100644 --- a/coremltools/models/neural_network/quantization_utils.py +++ b/coremltools/models/neural_network/quantization_utils.py @@ -56,12 +56,15 @@ def __init__(self): def do_quantize(self, layer, **kwargs): ret = super().do_quantize(layer) - if not ret or layer.name == 'dense_2': + if not ret or layer.name == "dense_2": return False return True + selector = MyLayerSelector() - quantized_model = quantize_weights(mlmodel, 8, quantization_mode='linear', selector=selector) + quantized_model = quantize_weights( + mlmodel, 8, quantization_mode="linear", selector=selector + ) """ @@ -90,7 +93,7 @@ def do_quantize(self, layer, **kwargs): class AdvancedQuantizedLayerSelector(QuantizedLayerSelector): - """ Quantized layer selector allowing the user to specify some types of + """Quantized layer selector allowing the user to specify some types of layers to skip during quantization process and the minimum size parameters in quantized convolution layers. @@ -99,11 +102,15 @@ class AdvancedQuantizedLayerSelector(QuantizedLayerSelector): .. highlight:: python .. code-block:: python - from coremltools.models.neural_network.quantization_utils import AdvancedQuantizedLayerSelector + from coremltools.models.neural_network.quantization_utils import ( + AdvancedQuantizedLayerSelector, + ) + selector = AdvancedQuantizedLayerSelector( - skip_layer_types=['batchnorm', 'bias', 'depthwiseConv'], - minimum_conv_kernel_channels=4, - minimum_conv_weight_count=4096) + skip_layer_types=["batchnorm", "bias", "depthwiseConv"], + minimum_conv_kernel_channels=4, + minimum_conv_weight_count=4096, + ) quantized_model = quantize_weights(model, 8, selector=selector) """ @@ -1169,7 +1176,7 @@ def _load_and_resize_image(image_path, size): from PIL import Image img = Image.open(image_path) - return img.resize(size, Image.ANTIALIAS) + return img.resize(size, Image.LANCZOS) class TopKMetrics: @@ -1641,10 +1648,11 @@ def quantize_weights( -------- .. 
sourcecode:: python - >>> import coremltools - >>> from coremltools.models.neural_network import quantization_utils - >>> model = coremltools.models.MLModel('my_model.mlmodel') - >>> quantized_model = quantization_utils.quantize_weights(model, 8, "linear") + import coremltools + from coremltools.models.neural_network import quantization_utils + + model = coremltools.models.MLModel("my_model.mlmodel") + quantized_model = quantization_utils.quantize_weights(model, 8, "linear") """ qmode_mapping = { diff --git a/coremltools/optimize/coreml/__init__.py b/coremltools/optimize/coreml/__init__.py index 9d40e6246..061ad56e8 100644 --- a/coremltools/optimize/coreml/__init__.py +++ b/coremltools/optimize/coreml/__init__.py @@ -4,16 +4,18 @@ # found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause from ._config import ( - OpLinearQuantizerConfig, + OpLinearQuantizerConfig, OpMagnitudePrunerConfig, - OpPalettizerConfig, + OpPalettizerConfig, OpThresholdPrunerConfig, - OptimizationConfig, + OptimizationConfig, ) - from ._post_training_quantization import ( + CoreMLOpMetaData, + CoreMLWeightMetaData, decompress_weights, - linear_quantize_weights, - palettize_weights, - prune_weights, + get_weights_metadata, + linear_quantize_weights, + palettize_weights, + prune_weights, ) diff --git a/coremltools/optimize/coreml/_config.py b/coremltools/optimize/coreml/_config.py index 8b9fec3fa..edf8e2272 100644 --- a/coremltools/optimize/coreml/_config.py +++ b/coremltools/optimize/coreml/_config.py @@ -5,6 +5,7 @@ import sys from abc import ABC, abstractmethod +from collections import OrderedDict from typing import IO, Any, Callable, Dict, Optional, Tuple, Union import cattrs @@ -143,7 +144,7 @@ class OpThresholdPrunerConfig(OpCompressorConfig): threshold: float All weight values above this threshold are set to ``0``. - * Default value is ``1e-3``. + * Default value is ``1e-12``. minimum_sparsity_percentile: float The sparsity level must be above this value for the weight representation to be stored in the sparse format rather than the dense format. @@ -164,7 +165,8 @@ class OpThresholdPrunerConfig(OpCompressorConfig): * If not provided, it will be set to ``2048``, in which weights bigger than ``2048`` elements are compressed. """ + threshold: float = field(default=1e-12, validator=validators.instance_of(float)) minimum_sparsity_percentile: float = field(default=0.5, validator=validators.instance_of(float)) weight_threshold: Optional[int] = field( default=2048, @@ -582,7 +584,7 @@ class OptimizationConfig: 1. ``global_config``: The default configuration applied to all ops / consts. 2. ``op_type_configs``: Configurations applied to specific op type. It overrides ``global_config``. - 3. ``op_name_configs``: Confgurations applied to specific op instance. It overrides ``global_config`` and ``op_type_configs``. + 3. ``op_name_configs``: Configurations applied to specific constant or op instance. It overrides ``global_config`` and ``op_type_configs``. The following is an example that constructs an optimization config for weight palettization. @@ -625,10 +627,11 @@ class OptimizationConfig: * An op type will not be compressed if the value is set to ``None``. op_name_configs: dict[str, OpCompressorConfig] - Op instance level configs applied to a specific op or constant. + Op instance level configs applied to a specific constant or op. 
- * The keys of the dictionary are the name of an op instance, and the values are the corresponding :py:class:`OpCompressorConfig`. + * The keys of the dictionary are the names of constants or op instances, and the values are the corresponding :py:class:`OpCompressorConfig`. * An op instance will not be compressed if the value is set to ``None``. + * You can use ``coremltools.optimize.coreml.get_weights_metadata`` to get the names of the constants / op instances in the model. """ global_config: Optional[OpCompressorConfig] = field(default=None) op_type_configs: Optional[OpCompressorConfig] = field(default=None) @@ -705,7 +708,7 @@ def set_op_name( op_config: OpCompressorConfig, ): """ - Sets the compression config at the level of op instance by name. + Sets the compression config at the level of constant / op instance by name. .. code-block:: python config = OptimizationConfig() op_config = OpPalettizerConfig(mode="kmeans", nbits=2) config.set_op_name("conv_1", op_config) + Note that, to get the name of a constant or an op instance, please refer to the ``coremltools.optimize.coreml.get_weights_metadata`` API. + Parameters ---------- op_name: str - The name of the op instance. + The name of a constant or an op instance. op_config: OpCompressorConfig - Op instance level config applied to a specific op or constant with name ``op_name``. + Op instance level config applied to a specific constant or op with name ``op_name``. """ if self._is_deprecated: raise ValueError("set_op_name is not exposed through the coremltools.compression_utils API.") @@ -969,3 +974,19 @@ def from_yaml(cls, yml: Union[IO, str]) -> "OptimizationConfig": else: config_dict = yaml.safe_load(yml) return cls.from_dict(config_dict) + +class _MetaDataDict(OrderedDict): + """ + A dictionary class with a nicely formatted string representation + """ + + def __init__(self, mapping=None, str_prefix=""): + super().__init__(mapping) + self._str_prefix = str_prefix + + def __str__(self): + res = "" + for k, v in self.items(): + res += f"{self._str_prefix}{k}\n" + res += f"{v}\n" + return res diff --git a/coremltools/optimize/coreml/_post_training_quantization.py b/coremltools/optimize/coreml/_post_training_quantization.py index 55db021ea..1a1726b2b 100644 --- a/coremltools/optimize/coreml/_post_training_quantization.py +++ b/coremltools/optimize/coreml/_post_training_quantization.py @@ -3,27 +3,34 @@ # Use of this source code is governed by a BSD-3-clause license that can be # found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause +from collections import OrderedDict +from typing import Dict, List, Optional + +import numpy as np +from attrs import define, field, validators +from tqdm import tqdm + from coremltools import _SPECIFICATION_VERSION_IOS_16 -from coremltools.converters.mil import Operation as _Operation from coremltools.converters.mil.converter import mil_convert as _mil_convert from coremltools.converters.mil.frontend.milproto.load import load as _milproto_to_pymil from coremltools.converters.mil.mil.passes.defs.quantization import ( AbstractQuantizationPass as _AbstractQuantizationPass, ) -from ._quantization_passes import ( - linear_quantize_weights as _linear_quantize_weights, - palettize_weights as _palettize_weights, - prune_weights as _prune_weights, - WeightDecompressor as _WeightDecompressor, -) from coremltools.models import MLModel as _MLModel from coremltools.optimize.coreml import OptimizationConfig as _OptimizationConfig +from coremltools.optimize.coreml._config import _MetaDataDict + 
+from ._quantization_passes import WeightDecompressor as _WeightDecompressor +from ._quantization_passes import linear_quantize_weights as _linear_quantize_weights +from ._quantization_passes import palettize_weights as _palettize_weights +from ._quantization_passes import prune_weights as _prune_weights _DEFAULT_SPECIFICATION_VERSION_FOR_COMPRESSION = _SPECIFICATION_VERSION_IOS_16 -def _apply_graph_pass(mlmodel, graph_pass): - # Utility function which compresses a coreml model - # convert the fully precision mlmodel into pymil program +def _convert_model_spec_to_pymil_prog(mlmodel: _MLModel, specification_version: int): + """ + A utility that converts an ML program model into a PyMIL program. + """ model_spec = mlmodel.get_spec() model_type = model_spec.WhichOneof("Type") if model_type in ("neuralNetwork", "neuralNetworkClassifier", "neuralNetworkRegressor", "pipeline", "PipelineClassifier", "PipelineRegressor"): @@ -36,15 +43,27 @@ def _apply_graph_pass(mlmodel, graph_pass): else: raise TypeError("weight compression not applicable for model type {}".format(model_type)) - assert isinstance(graph_pass, _AbstractQuantizationPass), "compression pass must be an AbstractQuantizationPass instance" - specification_version = max(model_spec.specificationVersion, _DEFAULT_SPECIFICATION_VERSION_FOR_COMPRESSION) prog = _milproto_to_pymil( model_spec=model_spec, specification_version=specification_version, file_weights_dir=mlmodel.weights_dir, ) + return prog + + +def _apply_graph_pass(mlmodel: _MLModel, graph_pass: _AbstractQuantizationPass): + # Utility function which compresses a Core ML model. It converts the + # full-precision mlmodel into a PyMIL program, applies the graph pass, and converts back. + model_spec = mlmodel.get_spec() + specification_version = max( + model_spec.specificationVersion, _DEFAULT_SPECIFICATION_VERSION_FOR_COMPRESSION + ) + prog = _convert_model_spec_to_pymil_prog(mlmodel, specification_version) # apply compression graph pass + assert isinstance( + graph_pass, _AbstractQuantizationPass + ), "compression pass must be an AbstractQuantizationPass instance" graph_pass.apply(prog) # convert the pymil program back to mlmodel @@ -58,6 +77,10 @@ def _apply_graph_pass(mlmodel, graph_pass): ) return compressed_mlmodel + +def _is_valid_const(val, weight_threshold): + return isinstance(val, np.ndarray) and val.size >= weight_threshold + def linear_quantize_weights(mlmodel: _MLModel, config: _OptimizationConfig): """ Utility function to convert a float precision MLModel of type ``mlprogram``, which uses float precision weights, into a compressed MLModel that uses 8-bit weights. This is achieved by converting the float weight values that are stored in the ``const`` op into the ``constexpr_affine_dequantize`` op. @@ -101,7 +124,7 @@ def linear_quantize_weights(mlmodel: _MLModel, config: _OptimizationConfig): w_q = cast\_to\_8\_bit\_integer(w_r / s + cast\_to\_float(z)) Note: :math:`cast\_to\_8\_bit\_integer` is the process of clipping the input to range ``[low, high]`` followed by rounding and casting to 8-bit integer. + In ``"linear"`` mode, ``s, z`` are computed by mapping the original float range ``[A, B]`` into the 8-bit integer range ``[-128, 127]`` or ``[0, 255]``. That is, you are solving the following linear equations: @@ -119,9 +142,9 @@ def linear_quantize_weights(mlmodel: _MLModel, config: _OptimizationConfig): case, scales are computed per `channel`, in which `channel` is the output dimension, which corresponds to the first dimension for ops such as ``conv`` and ``linear``, and the second dimension for the ``conv_transpose`` op. + For ``"linear"`` mode, :math:`A = min(w_r)`, :math:`B = max(w_r)`. 
- + **Linear symmetric interpolation** With linear symmetric interpolation (``"linear_symmetric"`` mode, the default), rather than @@ -129,9 +152,9 @@ def linear_quantize_weights(mlmodel: _MLModel, config: _OptimizationConfig): chooses the maximum absolute value between the min/max, which results in a floating-point range that is symmetric with respect to zero. This also makes the resulting zero point ``0`` for int8 weight and ``127`` for uint8 weight. - + For ``"linear_symmetric"`` mode: - + * :math:`A = -R` and :math:`B = R`, where :math:`R = max(abs(w_r))`. * This function maps to the range of ``[-127, 127]`` for int8 weight and ``[0, 254]`` for uint8 weight. * The result is ``s=(B-A)/254`` -> ``s=2R/254`` -> ``s=R/127``. @@ -143,7 +166,7 @@ def linear_quantize_weights(mlmodel: _MLModel, config: _OptimizationConfig): ---------- mlmodel: MLModel Model to be quantized. This MLModel should be of type ``mlprogram``. - + config: OptimizationConfig An :py:class:`OptimizationConfig` object that specifies the parameters for weight quantization. @@ -156,11 +179,11 @@ def linear_quantize_weights(mlmodel: _MLModel, config: _OptimizationConfig): Examples -------- .. sourcecode:: python + import coremltools as ct import coremltools.optimize as cto - - model = ct.coreml.models.MLModel('my_model.mlpackage') + + model = ct.models.MLModel("my_model.mlpackage") config = cto.coreml.OptimizationConfig( global_config=cto.coreml.OpLinearQuantizerConfig(mode="linear_symmetric") ) @@ -211,7 +234,7 @@ def palettize_weights(mlmodel: _MLModel, config: _OptimizationConfig): ---------- mlmodel: MLModel Model to be converted by a LUT. This MLModel should be of type ``mlprogram``. - + config: OptimizationConfig An :py:class:`OptimizationConfig` object that specifies the parameters for weight palettization. @@ -220,15 +243,14 @@ def palettize_weights(mlmodel: _MLModel, config: _OptimizationConfig): model: MLModel The palettized MLModel instance. - Example - ------- - + Examples + -------- .. sourcecode:: python import coremltools as ct import coremltools.optimize as cto - - model = ct.models.MLModel('my_model.mlpackage') + + model = ct.models.MLModel("my_model.mlpackage") config = cto.coreml.OptimizationConfig( global_config=cto.coreml.OpPalettizerConfig(mode="kmeans", nbits=4) ) @@ -277,16 +299,16 @@ def prune_weights(mlmodel: _MLModel, config: _OptimizationConfig): model: MLModel The sparse MLModel instance. - Example - ------- + Examples + -------- .. sourcecode:: python import coremltools as ct import coremltools.optimize as cto - - model = ct.models.MLModel('my_model.mlpackage') + + model = ct.models.MLModel("my_model.mlpackage") config = cto.coreml.OptimizationConfig( - global_config=cto.coreml.OpThresholdPrunerConfig(threshold=1e-3) + global_config=cto.coreml.OpThresholdPrunerConfig(threshold=1e-12) ) compressed_model = cto.coreml.prune_weights(model, config) @@ -314,8 +336,8 @@ def decompress_weights(mlmodel: _MLModel): model: MLModel The MLModel with no ``constexpr`` ops included. - Example - ------- + Examples + -------- .. sourcecode:: python import coremltools as ct @@ -327,3 +349,267 @@ ... weight_decompressor = _WeightDecompressor(op_selector=lambda op: True) return _apply_graph_pass(mlmodel, weight_decompressor) + + + +def get_weights_metadata(mlmodel: _MLModel, weight_threshold: int = 2048): + """ + Utility function to get the weights metadata as a dictionary, which maps each weight's name to its corresponding CoreMLWeightMetaData. 
+ + CoreMLWeightMetaData contains the following attributes: + + 1. val: The weight data. + 2. sparsity: the fraction of elements whose absolute value is ``<= 1e-12``. + 3. unique_values: the number of unique values in the weight. + 4. child_ops: metadata for the child ops that the weight feeds into. + + Parameters + ---------- + mlmodel: MLModel + The model from which the weight metadata is retrieved. + + weight_threshold: int + The size threshold, above which weights are returned. + That is, a weight tensor is included in the resulting dictionary only if its total number of elements is greater than ``weight_threshold``. + + For example, if ``weight_threshold = 1024`` and a weight tensor is of shape ``[10, 20, 1, 1]``, and hence has ``200`` + elements, it will not be returned by the ``get_weights_metadata`` API. + + * If not provided, it will be set to ``2048``, in which case only weights with more than ``2048`` elements are returned. + + Returns + ------- + dict[str, CoreMLWeightMetaData] + A dict that maps each weight's name to its metadata. + + Examples + -------- + In this example, there are two weights whose size is greater than ``2048``. + A weight named ``conv_1_weight`` is feeding into a ``conv`` op named ``conv_1``, + while another weight named ``linear_1_weight`` is feeding into a ``linear`` op named ``linear_1``. + You can access the metadata by ``weight_metadata_dict["conv_1_weight"]``, etc. + + .. sourcecode:: python + + import coremltools as ct + + mlmodel = ct.models.MLModel("my_model.mlpackage") + weight_metadata_dict = ct.optimize.coreml.get_weights_metadata( + mlmodel, weight_threshold=2048 + ) + + # get the weight names with size >= 25600 + large_weights = [] + for k, v in weight_metadata_dict.items(): + if v.val.size >= 25600: + large_weights.append(k) + + # get the weight names with sparsity >= 50% + sparse_weights = [] + for k, v in weight_metadata_dict.items(): + if v.sparsity >= 0.5: + sparse_weights.append(k) + + # get the weight names with unique elements <= 16 + palettized_weights = [] + for k, v in weight_metadata_dict.items(): + if v.unique_values <= 16: + palettized_weights.append(k) + + # print out the dictionary + print(weight_metadata_dict) + + The output from the above example would be: + + :: + + conv_1_weight + [ + val: np.ndarray(shape=(32, 64, 2, 2), dtype=float32) + sparsity: 0.5 + unique_values: 4097 + child_ops: [ + conv(name=conv_1, weight=conv_1_weight, ...) + ] + ] + linear_1_weight + [ + val: np.ndarray(shape=(128, 64), dtype=float32) + sparsity: 0.2501220703125 + unique_values: 4 + child_ops: [ + linear(name=linear_1, weight=linear_1_weight, ...) + ] + ] + """ + def _get_weight_metadata(op): + """ + Returns a CoreMLWeightMetaData object given a const operation. 
+ """ + assert op.op_type == "const", f"Expect op be type of 'const', got '{op.op_type}'" + child_ops = [] + visited = set() + for child_op in op.outputs[0].child_ops: + if child_op in visited: + continue + visited.add(child_op) + params_name_mapping = OrderedDict() + for k, v in child_op.inputs.items(): + if _is_valid_const(v.val, weight_threshold): + params_name_mapping[k] = v.op.name + child_ops.append( + CoreMLOpMetaData( + op_type=child_op.op_type, + name=child_op.name, + params_name_mapping=params_name_mapping, + ) + ) + return CoreMLWeightMetaData(op.val.val, child_ops=child_ops) + + prog = _convert_model_spec_to_pymil_prog(mlmodel, mlmodel.get_spec().specificationVersion) + res = _MetaDataDict({}) + + def get_weights_meta_block(block): + # get the candidates ops with the given op_type + candidate_ops = [] + for op in list(block.operations): + for b in op.blocks: + get_weights_meta_block(b) + + if op.op_type == "const" and _is_valid_const(op.val.val, weight_threshold): + candidate_ops.append(op) + + for op in tqdm( + candidate_ops, + desc="Getting Core ML weights meta data", + unit=" ops", + ): + res[op.name] = _get_weight_metadata(op) + + for f in prog.functions.values(): + get_weights_meta_block(f) + + return res + + +@define(frozen=True) +class CoreMLOpMetaData: + """ + A container class that stores op meta data. + + The class has the following attributes: + + Parameters + ---------- + op_type: str + The type of the op. For instance: ``conv``, ``linear``, etc. + + name: str + The name of the op. + + params_name_mapping: dict[str, str] + A dict that maps the op's constant parameters to its corresponding weight name. + For instance, given a ``conv`` op with ``params_name_mapping``, + + .. sourcecode:: python + + { + "weight": "conv_1_weight", + "bias": "conv_1_bias", + } + + means that the weight and bias of this op are named ``conv_1_weight``, ``conv_1_bias``, respectively. + + """ + + op_type: str = field(validator=validators.instance_of(str)) + name: str = field(validator=validators.instance_of(str)) + params_name_mapping: Dict[str, str] = field(validator=validators.instance_of(dict)) + + def __str__(self): + res = f"{self.op_type}(name={self.name}" + for k, v in self.params_name_mapping.items(): + res += f", {k}={v}" + res += ", ...)" + return res + + +@define(frozen=True) +class CoreMLWeightMetaData: + """ + A container class that stores weight meta data. + + The class has the following attributes: + + Parameters + ---------- + val: numpy.ndarray + The weight data. + + sparsity: float + the percentile of the element whose absolute value ``<= 1e-12`` + + unique_values: int + number of unique values in the weight + + child_ops: list[CoreMLOpMetaData] + A list of of ``CoreMLOpMetaData`` which contains information of child ops in which the weight is feeding into. + + The attributes can be accessed by: + ``child_ops[idx].op_type``: The operation type of the ``idx``th child op. + ``child_ops[idx].name``: The name of the ``idx``th child op. + + Other op-dependant attributes also can be accessed. For instance, if ``idx``th child op is a ``conv`` layer, + ``child_ops[idx].weight`` will return its weight name. + + For more details, please refer to the ``CoreMLOpMetaData`` doc string. + + Examples + -------- + .. 
sourcecode:: python + + import numpy as np + from coremltools.optimize.coreml import CoreMLWeightMetaData + + data = np.array([[1.0, 0.0], [0.0, 6.0]], dtype=np.float32) + meta_data = CoreMLWeightMetaData(data) + print(meta_data) + + Outputs: + :: + [ + val: np.ndarray(shape=(2, 2), dtype=float32) + sparsity: 0.5 + unique_values: 3 + ] + + """ + + val: np.ndarray = field(validator=validators.instance_of(np.ndarray)) + sparsity: Optional[float] = field(validator=validators.instance_of(float)) + unique_values: Optional[int] = field(validator=validators.instance_of(int)) + child_ops: Optional[List[CoreMLOpMetaData]] = field( + default=None, validator=validators.optional(validators.instance_of(list)) + ) + + @sparsity.default + def _get_sparsity(self): + num_of_zeros = np.sum(np.abs(self.val) <= 1e-12) + return num_of_zeros / self.val.size + + @unique_values.default + def _get_unique_values(self): + return len(np.unique(self.val)) + + def __str__(self): + res = "[ \n" + res += f" val: np.ndarray(shape={self.val.shape}, dtype={self.val.dtype})\n" + res += f" sparsity: {self.sparsity}\n" + res += f" unique_values: {self.unique_values}\n" + if self.child_ops is not None: + res += " child_ops: [\n" + for child_op in self.child_ops: + res += f" {child_op}\n" + res += " ]\n" + res += "]" + return res diff --git a/coremltools/test/api/test_api_examples.py b/coremltools/test/api/test_api_examples.py index 8400b59b1..758bd049e 100644 --- a/coremltools/test/api/test_api_examples.py +++ b/coremltools/test/api/test_api_examples.py @@ -49,7 +49,11 @@ def prog(x): class TestInputs: @staticmethod @pytest.mark.skipif(not ct.utils._is_macos(), reason="Platform is not Mac OS") - def test_unsanitized_input_name_during_prediction(): + @pytest.mark.parametrize( + "convert_to", + ["mlprogram", "neuralnetwork"], + ) + def test_unsanitized_input_name_during_prediction(convert_to): ''' input name : "x/0" becomes "x_0" due to name sanitization applied during conversion ''' @@ -63,7 +67,7 @@ def test_unsanitized_input_name_during_prediction(): ssa_fun.set_outputs([z]) prog.add_function("main", ssa_fun) - mlmodel = ct.convert(prog) + mlmodel = ct.convert(prog, convert_to=convert_to) with pytest.raises(KeyError) as error_info: mlmodel.predict( @@ -74,7 +78,7 @@ def test_unsanitized_input_name_during_prediction(): assert "does not match any of the model input" in error_str @staticmethod - def _test_variant_input_type_prediction(to_tensor): + def _test_variant_input_type_prediction(to_tensor, convert_to): prog = Program() func_inputs = {"x": mb.placeholder(shape=[2, 3]), "y": mb.placeholder(shape=[2, 3])} @@ -85,7 +89,7 @@ def _test_variant_input_type_prediction(to_tensor): ssa_fun.set_outputs([z]) prog.add_function("main", ssa_fun) - mlmodel = ct.convert(prog) + mlmodel = ct.convert(prog, convert_to=convert_to) x_numpy = np.random.rand(2, 3) y_numpy = np.random.rand(2, 3) out_by_numpy = mlmodel.predict( @@ -100,8 +104,12 @@ def _test_variant_input_type_prediction(to_tensor): @staticmethod @pytest.mark.skipif(not ct.utils._is_macos(), reason="test needs predictions") - def test_list_predict_input(): - TestInputs._test_variant_input_type_prediction(lambda x: x.tolist()) + @pytest.mark.parametrize( + "convert_to", + ["mlprogram", "neuralnetwork"], + ) + def test_list_predict_input(convert_to): + TestInputs._test_variant_input_type_prediction(lambda x: x.tolist(), convert_to) @staticmethod def test_rank0_inputs_mil(): @@ -133,7 +141,7 @@ def prog(x): # save neuralnetwork model without extension and check that it is saved with 
# mlmodel extension - mlmodel = ct.convert(prog) + mlmodel = ct.convert(prog, convert_to="neuralnetwork") mlmodel_path = os.path.join(save_path_dir, "model_nn") mlmodel.save(mlmodel_path) assert os.path.exists(mlmodel_path + ".mlmodel") @@ -152,10 +160,9 @@ def prog(x): # check error if mlprogram is saved with mlmodel extension mlmodel_path = os.path.join(save_path_dir, "model_mlprogram.mlmodel") - with pytest.raises(Exception) as e: + expected_pattern = "For an ML Program\, extension must be \.mlpackage \(not \.mlmodel\)\. Please see .* to see the difference between neuralnetwork and mlprogram model types\." + with pytest.raises(Exception, match=expected_pattern): mlmodel.save(mlmodel_path) - expected_error = "For an ML Program, extension must be .mlpackage (not .mlmodel)" - assert expected_error == str(e.value) @staticmethod @pytest.mark.skipif(not ct.utils._is_macos(), reason="Platform is not Mac OS") @@ -183,8 +190,7 @@ def prog(x): # converting to mlprogram, on macOS < 12 # should raise a runtime error when skip_model_load is False with pytest.warns(RuntimeWarning): - model = ct.convert(prog, convert_to='mlprogram', - skip_model_load=skip_model_load) + model = ct.convert(prog, convert_to="mlprogram", skip_model_load=skip_model_load) else: model = ct.convert(prog, convert_to="mlprogram", skip_model_load=skip_model_load) @@ -238,7 +244,9 @@ def prog(x): expected_pattern = "compute_precision .* supported .* mlprogram .* None .* target=='neuralnetwork'.*minimum_deployment_target.*" with pytest.raises(ValueError, match=expected_pattern) as e: - mlmodel = ct.convert(copy.deepcopy(prog), compute_precision='fp16') + mlmodel = ct.convert( + copy.deepcopy(prog), convert_to="neuralnetwork", compute_precision="fp16" + ) @staticmethod def test_invalid_argument_nn_backend(): @@ -254,9 +262,13 @@ def prog(x): expected_err_str = "compute_precision is only supported for mlprogram target and must be None if target.*" with pytest.raises(ValueError, match=expected_err_str): - mlmodel = ct.convert(prog, compute_precision=ct.precision.FLOAT16) + mlmodel = ct.convert( + prog, convert_to="neuralnetwork", compute_precision=ct.precision.FLOAT16 + ) with pytest.raises(ValueError, match=expected_err_str): - mlmodel = ct.convert(prog, compute_precision=ct.precision.FLOAT32) + mlmodel = ct.convert( + prog, convert_to="neuralnetwork", compute_precision=ct.precision.FLOAT32 + ) @pytest.mark.skipif(not _HAS_TORCH, reason="PyTorch not found") diff --git a/coremltools/test/api/test_api_visibilities.py b/coremltools/test/api/test_api_visibilities.py index d3c365484..7cfb12c2b 100644 --- a/coremltools/test/api/test_api_visibilities.py +++ b/coremltools/test/api/test_api_visibilities.py @@ -179,6 +179,9 @@ def test_optimize_coreml(self): "palettize_weights", "prune_weights", "decompress_weights", + "get_weights_metadata", + "CoreMLWeightMetaData", + "CoreMLOpMetaData", ] _check_visible_modules(_get_visible_items(ct.optimize.coreml), expected) diff --git a/coremltools/test/neural_network/test_numpy_nn_layers.py b/coremltools/test/neural_network/test_numpy_nn_layers.py index bca34e14f..404d60f97 100644 --- a/coremltools/test/neural_network/test_numpy_nn_layers.py +++ b/coremltools/test/neural_network/test_numpy_nn_layers.py @@ -2141,6 +2141,9 @@ def test_slice_static_gpu(self): self.test_slice_static_cpu(cpu_only=False) def test_slice_dynamic_cpu(self, cpu_only=True): + pytest.xfail( + "rdar://111134257 ([Bug][Regression] nnv1 slice_by_index unittests are failing)" + ) for rank in range(1, 6): input_shape = np.array([5 for 
_ in range(rank)]) objs, strides, begin_masks, end_ids, end_masks, begin_ids = ( @@ -2357,6 +2360,9 @@ def test_slice_dynamic_cpu(self, cpu_only=True): self.assertEqual(rank, builder._get_rank("output")) def test_slice_dynamic_gpu(self): + pytest.xfail( + "rdar://111134257 ([Bug][Regression] nnv1 slice_by_index unittests are failing)" + ) self.test_slice_dynamic_cpu(cpu_only=False) def test_tile_cpu(self, cpu_only=True): @@ -4246,6 +4252,10 @@ def test_reshape_like_cpu(self, cpu_only=True): self.assertEqual(target_rank, builder._get_rank("output")) def test_reshape_like_gpu(self): + if platform.machine() == "arm64": + pytest.xfail( + "rdar://111942798 ([Regression][Bug] Reshape model got stuck while loading in M1 machine for non-cpu compute unit)" + ) self.test_reshape_like_cpu(cpu_only=False) def test_reshape_static_cpu(self, cpu_only=True): @@ -4287,6 +4297,10 @@ def test_reshape_static_cpu(self, cpu_only=True): self.assertEqual(len(target_shape), builder._get_rank("output")) def test_reshape_static_gpu(self): + if platform.machine() == "arm64": + pytest.xfail( + "rdar://111942798 ([Regression][Bug] Reshape model got stuck while loading in M1 machine for non-cpu compute unit)" + ) self.test_reshape_static_cpu(cpu_only=False) def test_reshape_dynamic_cpu(self, cpu_only=True): diff --git a/coremltools/test/optimize/api/__init__.py b/coremltools/test/optimize/api/__init__.py new file mode 100644 index 000000000..25c7d28c5 --- /dev/null +++ b/coremltools/test/optimize/api/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) 2023, Apple Inc. All rights reserved. +# +# Use of this source code is governed by a BSD-3-clause license that can be +# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause diff --git a/coremltools/test/optimize/api/test_optimize_api.py b/coremltools/test/optimize/api/test_optimize_api.py new file mode 100644 index 000000000..a798a1120 --- /dev/null +++ b/coremltools/test/optimize/api/test_optimize_api.py @@ -0,0 +1,636 @@ +# Copyright (c) 2023, Apple Inc. All rights reserved. 
+# +# Use of this source code is governed by a BSD-3-clause license that can be +# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause + +import tempfile + +from coremltools.converters.mil.testing_utils import get_op_types_in_program +from coremltools.test.optimize.coreml.test_passes import ( + TestCompressionPasses as _TestCompressionPasses, +) +from coremltools.test.optimize.coreml.test_passes import ( + TestConfigurationFromDictFromYaml as _TestConfigurationFromDictFromYaml, +) + +get_test_program = _TestCompressionPasses._get_test_program_2 + + +def create_model_and_optimizer(): + import torch + + class Model(torch.nn.Module): + def __init__(self): + super(Model, self).__init__() + self.conv1 = torch.nn.Conv2d(3, 128, (1, 1)) + self.conv2 = torch.nn.Conv2d(128, 256, (10, 10)) + self.conv3 = torch.nn.Conv2d(256, 26, (10, 10)) + self.linear = torch.nn.Linear(206, 12) + + def forward(self, x): + x = self.conv1(x) + x = self.conv2(x) + x = self.conv3(x) + x = self.linear(x) + return x + + model = Model() + loss_fn = torch.nn.MSELoss() + optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9) + + return model, loss_fn, optimizer + + +def get_mlmodel(): + import coremltools as ct + prog = get_test_program() + mlmodel = ct.convert(prog, convert_to="mlprogram", compute_precision=ct.precision.FLOAT32) + return mlmodel + + +class TestOptimizeCoremlAPIOverview: + """ + This class is testing the api reference code in + https://coremltools.readme.io/v7.0/docs/optimizecoreml-api-overview + """ + + def test_6_bit_palettization_example(self): + import coremltools as ct + import coremltools.optimize.coreml as cto + + # load model + # (original) mlmodel = ct.models.MLModel(uncompressed_model_path) + mlmodel = get_mlmodel() + + # define op config + op_config = cto.OpPalettizerConfig(mode="kmeans", nbits=6) + + # define optimization config by applying the op config globally to all ops + config = cto.OptimizationConfig(global_config=op_config) + + # palettize weights + compressed_mlmodel = cto.palettize_weights(mlmodel, config) + + # Do some basic checks + assert compressed_mlmodel is not None + ops = get_op_types_in_program(compressed_mlmodel._mil_program) + assert ops.count("constexpr_lut_to_dense") == 6 + + def test_linear_quantization_config_from_yaml(self): + import coremltools.optimize.coreml as cto + + mlmodel = get_mlmodel() + + config_dict = { + "config_type": "OpLinearQuantizerConfig", + "global_config": { + "mode": "linear_symmetric", + "dtype": "int8", + }, + } + yaml_file = _TestConfigurationFromDictFromYaml.get_yaml(config_dict) + + # (original) config = cto.OptimizationConfig.from_yaml("linear_config.yaml") + config = cto.OptimizationConfig.from_yaml(yaml_file) + compressed_mlmodel = cto.linear_quantize_weights(mlmodel, config) + + # Do some basic checks + assert compressed_mlmodel is not None + ops = get_op_types_in_program(compressed_mlmodel._mil_program) + assert ops.count("constexpr_affine_dequantize") == 6 + + def test_customize_ops_to_compress(self): + import coremltools.optimize.coreml as cto + + mlmodel = get_mlmodel() + + global_config = cto.OpPalettizerConfig(nbits=6, mode="kmeans") + linear_config = cto.OpPalettizerConfig(nbits=8, mode="kmeans") + config = cto.OptimizationConfig( + global_config=global_config, + op_type_configs={"linear": linear_config}, + op_name_configs={"conv1": None, "conv3": None}, + ) + compressed_mlmodel = cto.palettize_weights(mlmodel, config) + + # Do some basic checks + assert compressed_mlmodel is not None + 
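+        # conv1 and conv3 are excluded via op_name_configs above, so only 4 of the 6 large weights in this test program are expected to be palettized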
ops = get_op_types_in_program(compressed_mlmodel._mil_program) + assert ops.count("constexpr_lut_to_dense") == 4 + + +class TestOptimizeTorchAPIOverview: + """ + This class is testing the api reference code in + https://coremltools.readme.io/v7.0/docs/optimizetorch-api-overview + """ + + def get_global_config(self): + config_dict = { + "global_config": { + "scheduler": {"update_steps": [100, 200, 300, 500]}, + "target_sparsity": 0.8, + } + } + return _TestConfigurationFromDictFromYaml.get_yaml(config_dict) + + def get_fine_grain_config(self): + config_dict = { + "module_type_configs": { + "Linear": { + "scheduler": { + "update_steps": [100, 200, 300, 500], + }, + "n_m_ratio": [3, 4], + }, + "Conv2d": { + "scheduler": { + "update_steps": [100, 200, 300, 500], + }, + "target_sparsity": 0.5, + "block_size": 2, + }, + }, + "module_name_configs": { + "module2.conv1": { + "scheduler": { + "update_steps": [100, 200, 300, 500], + }, + "target_sparsity": 0.75, + }, + "module2.linear": None, + }, + } + return _TestConfigurationFromDictFromYaml.get_yaml(config_dict) + + def test_load_from_yaml(self): + def _test_config(config_path): + import torch + + import coremltools as ct + from coremltools.optimize.torch.pruning import MagnitudePruner, MagnitudePrunerConfig + + # Toy example + x, label = torch.rand(1, 3, 224, 224), torch.rand(1, 26, 206, 12) + data = [(x, label)] + + model, loss_fn, optimizer = create_model_and_optimizer() + + # Initialize pruner and configure it + # (original) config = MagnitudePrunerConfig.from_yaml("config.yaml") + config = MagnitudePrunerConfig.from_yaml(config_path) + + pruner = MagnitudePruner(model, config) + + # Insert pruning layers in the model + model = pruner.prepare() + + for inputs, labels in data: + output = model(inputs) + loss = loss_fn(output, labels) + loss.backward() + optimizer.step() + pruner.step() + + # Commit pruning masks to model parameters + pruner.finalize(inplace=True) + + # Export + example_input = torch.rand(1, 3, 224, 224) + traced_model = torch.jit.trace(model, example_input) + + coreml_model = ct.convert( + traced_model, + inputs=[ct.TensorType(shape=example_input.shape)], + pass_pipeline=ct.PassPipeline.DEFAULT_PRUNING, + minimum_deployment_target=ct.target.iOS16, + ) + assert coreml_model is not None + output_file = tempfile.NamedTemporaryFile(suffix=".mlpackage").name + coreml_model.save(output_file) + + _test_config(self.get_global_config()) + _test_config(self.get_fine_grain_config()) + + def test_programmatic_example_1(self): + import torch + + import coremltools as ct + from coremltools.optimize.torch.palettization import ( + DKMPalettizer, + DKMPalettizerConfig, + ModuleDKMPalettizerConfig, + ) + + # Toy example + x, label = torch.rand(1, 3, 224, 224), torch.rand(1, 26, 206, 12) + data = [(x, label)] + + # code that defines the pytorch model, and optimizer + model, loss_fn, optimizer = create_model_and_optimizer() + + # Initialize the palettizer + config = DKMPalettizerConfig( + global_config=ModuleDKMPalettizerConfig(n_bits=4, cluster_dim=4) + ) + + palettizer = DKMPalettizer(model, config) + + # Prepare the model to insert FakePalettize layers for palettization + model = palettizer.prepare(inplace=True) + + # Use palettizer in the PyTorch training loop + for inputs, labels in data: + output = model(inputs) + loss = loss_fn(output, labels) + loss.backward() + optimizer.step() + palettizer.step() + + # Fold LUT and indices into weights + model = palettizer.finalize(inplace=True) + + # Export + example_input = torch.rand(1, 3, 224, 224) 
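+        # Tracing captures the finalized (LUT-folded) weights; the DEFAULT_PALETTIZATION pass pipeline below lets the converter store them as lookup tables (constexpr ops) in the mlprogram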
+ traced_model = torch.jit.trace(model, example_input) + + coreml_model = ct.convert( + traced_model, + inputs=[ct.TensorType(shape=example_input.shape)], + pass_pipeline=ct.PassPipeline.DEFAULT_PALETTIZATION, + minimum_deployment_target=ct.target.iOS16, + ) + assert coreml_model is not None + output_file = tempfile.NamedTemporaryFile(suffix=".mlpackage").name + coreml_model.save(output_file) + + def test_programmatic_example_2(self): + import torch + + import coremltools as ct + from coremltools.optimize.torch.quantization import ( + LinearQuantizer, + LinearQuantizerConfig, + ModuleLinearQuantizerConfig, + ObserverType, + QuantizationScheme, + ) + + # Toy example + x, label = torch.rand(1, 3, 224, 224), torch.rand(1, 26, 206, 12) + data = [(x, label)] + model, loss_fn, optimizer = create_model_and_optimizer() + + # Initialize the quantizer + global_config = ModuleLinearQuantizerConfig( + quantization_scheme=QuantizationScheme.symmetric + ) + + config = LinearQuantizerConfig().set_global(global_config) + + # We only want to quantize convolution layers that have a kernel size of 1, as well as all linear layers. + for name, m in model.named_modules(): + if isinstance(m, torch.nn.Conv2d): + if m.kernel_size == (1, 1): + config = config.set_module_name( + name, + ModuleLinearQuantizerConfig( + weight_observer=ObserverType.mix_max, weight_per_channel=True + ), + ) + else: + config = config.set_module_name(name, None) + + quantizer = LinearQuantizer(model, config) + + # Prepare the model to insert FakeQuantize layers for QAT + example_input = torch.rand(1, 3, 224, 224) + model = quantizer.prepare(example_inputs=example_input, inplace=True) + + # Use quantizer in your PyTorch training loop + for inputs, labels in data: + output = model(inputs) + loss = loss_fn(output, labels) + loss.backward() + optimizer.step() + quantizer.step() + + # Convert operations to their quantized counterparts using parameters learnt via QAT + model = quantizer.finalize(inplace=True) + + traced_model = torch.jit.trace(model, example_input) + + coreml_model = ct.convert( + traced_model, + inputs=[ct.TensorType(shape=example_input.shape)], + minimum_deployment_target=ct.target.iOS17, + ) + assert coreml_model is not None + output_file = tempfile.NamedTemporaryFile(suffix=".mlpackage").name + coreml_model.save(output_file) + + +class TestConvertingCompressedSourceModels: + """ + This class is testing examples in https://coremltools.readme.io/v7.0/docs/converting-compressed-source-models + """ + + def test_smoke_convert_compressed_source_model_pruning(self): + import coremltools as ct + + model_with_sparse_weights = ct.convert( + get_test_program(), + pass_pipeline=ct.PassPipeline.DEFAULT_PRUNING, + minimum_deployment_target=ct.target.iOS17, + ) + assert model_with_sparse_weights is not None + + def test_smoke_convert_compressed_source_model_palettization(self): + import coremltools as ct + + model_with_lut_weights = ct.convert( + get_test_program(), + pass_pipeline=ct.PassPipeline.DEFAULT_PALETTIZATION, + minimum_deployment_target=ct.target.macOS13, + ) + assert model_with_lut_weights is not None + + +class TestPostTrainingPruning: + """ + This class is testing examples in https://coremltools.readme.io/v7.0/docs/pruning-a-core-ml-model + """ + + def test_threshold_pruner(self): + from coremltools.optimize.coreml import ( + OpThresholdPrunerConfig, + OptimizationConfig, + prune_weights, + ) + + model = get_mlmodel() + op_config = OpThresholdPrunerConfig( + threshold=0.03, + minimum_sparsity_percentile=0.55, + 
+ weight_threshold=1024, + ) + config = OptimizationConfig(global_config=op_config) + model_compressed = prune_weights(model, config=config) + assert model_compressed is not None + + def test_magnitude_pruner(self): + from coremltools.optimize.coreml import ( + OpMagnitudePrunerConfig, + OptimizationConfig, + prune_weights, + ) + + model = get_mlmodel() + op_config = OpMagnitudePrunerConfig( + target_sparsity=0.6, + weight_threshold=1024, + ) + config = OptimizationConfig(global_config=op_config) + model_compressed = prune_weights(model, config=config) + assert model_compressed is not None + + +class TestTrainingTimePruning: + """ + This class tests the examples in https://coremltools.readme.io/v7.0/docs/data-dependent-pruning + """ + + def test_magnitude_pruner(self): + from collections import OrderedDict + + import torch + + import coremltools as ct + from coremltools.optimize.torch.pruning import MagnitudePruner, MagnitudePrunerConfig + + # Toy example + x, label = torch.rand(1, 3, 224, 224), torch.rand(1, 32, 224, 224) + data = [(x, label)] + model = torch.nn.Sequential( + OrderedDict( + [ + ("conv1", torch.nn.Conv2d(3, 32, 3, padding="same")), + ("conv2", torch.nn.Conv2d(32, 32, 3, padding="same")), + ] + ) + ) + loss_fn = torch.nn.MSELoss() + optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9) + + # initialize pruner and configure it + # we will configure the pruner for all conv2d layers + config = MagnitudePrunerConfig.from_dict( + { + "module_type_configs": { + "Conv2d": { + "scheduler": {"update_steps": [3, 5, 7]}, + "target_sparsity": 0.75, + "granularity": "per_scalar", + }, + } + } + ) + + pruner = MagnitudePruner(model, config) + + # insert pruning layers in the model + model = pruner.prepare() + + for inputs, labels in data: + output = model(inputs) + loss = loss_fn(output, labels) + loss.backward() + optimizer.step() + pruner.step() + + # commit pruning masks to model parameters + pruner.finalize(inplace=True) + + # trace and convert the model + example_input = torch.rand(1, 3, 224, 224) # shape of input for the model + traced_model = torch.jit.trace(model, example_input) + coreml_model = ct.convert( + traced_model, + convert_to="mlprogram", + inputs=[ct.TensorType(shape=example_input.shape)], + pass_pipeline=ct.PassPipeline.DEFAULT_PRUNING, + minimum_deployment_target=ct.target.iOS17, + ) + + assert coreml_model is not None + output_file = tempfile.NamedTemporaryFile(suffix=".mlpackage").name + coreml_model.save(output_file) + + +class TestPostTrainingPalettization: + """ + This class tests the examples in https://coremltools.readme.io/v7.0/docs/data-free-palettization + """ + + def test_palettizer(self): + from coremltools.optimize.coreml import ( + OpPalettizerConfig, + OptimizationConfig, + palettize_weights, + ) + + model = get_mlmodel() + op_config = OpPalettizerConfig(mode="kmeans", nbits=6, weight_threshold=512) + config = OptimizationConfig(global_config=op_config) + compressed_6_bit_model = palettize_weights(model, config=config) + + # Some basic checks + assert compressed_6_bit_model is not None + ops = get_op_types_in_program(compressed_6_bit_model._mil_program) + assert ops.count("constexpr_lut_to_dense") == 6 + + +class TestTrainingTimePalettization: + """ + This class tests the examples in https://coremltools.readme.io/v7.0/docs/data-dependent-palettization + """ + + def test_palettizer(self): + import torch + import torch.nn as nn + + import coremltools as ct + from coremltools.optimize.torch.palettization import DKMPalettizer, DKMPalettizerConfig + + # Toy example
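+ # A single random (input, label) pair stands in for a real dataset here; DKM learns the per-layer lookup tables during these training steps.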
+ x, label = torch.rand(1, 4), torch.rand(1, 4) + data = [(x, label)] + + model = nn.Sequential(nn.Linear(4, 500), nn.Sigmoid(), nn.Linear(500, 4), nn.Sigmoid()) + loss_fn = nn.MSELoss() + optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9) + + # Prepare model for palettization + module_config = {nn.Linear: {"n_bits": 2, "weight_threshold": 1000, "milestone": 2}} + config = DKMPalettizerConfig.from_dict({"module_type_configs": module_config}) + palettizer = DKMPalettizer(model, config) + + prepared_model = palettizer.prepare() + + # Fine-tune the prepared model for a few epochs after this. + for inputs, labels in data: + output = prepared_model(inputs) + loss = loss_fn(output, labels) + loss.backward() + optimizer.step() + palettizer.step() + + # prepare for conversion + finalized_model = palettizer.finalize() + + # trace and convert + example_input = torch.rand(1, 4) # shape of input for the model + traced_model = torch.jit.trace(finalized_model, example_input) + + coreml_model = ct.convert( + traced_model, + convert_to="mlprogram", + inputs=[ct.TensorType(shape=example_input.shape)], + pass_pipeline=ct.PassPipeline.DEFAULT_PALETTIZATION, + minimum_deployment_target=ct.target.iOS16, + ) + + assert coreml_model is not None + output_file = tempfile.NamedTemporaryFile(suffix=".mlpackage").name + coreml_model.save(output_file) + + +class TestPostTrainingQuantization: + """ + This class tests the examples in https://coremltools.readme.io/v7.0/docs/data-free-quantization + """ + + def test_quantization(self): + import coremltools.optimize.coreml as cto + + model = get_mlmodel() + op_config = cto.OpLinearQuantizerConfig(mode="linear_symmetric", weight_threshold=512) + config = cto.OptimizationConfig(global_config=op_config) + + compressed_8_bit_model = cto.linear_quantize_weights(model, config=config) + + # Some basic checks + assert compressed_8_bit_model is not None + ops = get_op_types_in_program(compressed_8_bit_model._mil_program) + assert ops.count("constexpr_affine_dequantize") == 6 + + +class TestTrainingTimeQuantization: + """ + This class tests the examples in https://coremltools.readme.io/v7.0/docs/data-dependent-quantization + """ + + def test_quantization(self): + from collections import OrderedDict + + import torch + import torch.nn as nn + + import coremltools as ct + from coremltools.optimize.torch.quantization import LinearQuantizer, LinearQuantizerConfig + + # Toy example + x, label = torch.rand(1, 1, 20, 20), torch.rand(1, 20, 16, 16) + data = [(x, label)] + + model = nn.Sequential( + OrderedDict( + { + "conv": nn.Conv2d(1, 20, (3, 3)), + "relu1": nn.ReLU(), + "conv2": nn.Conv2d(20, 20, (3, 3)), + "relu2": nn.ReLU(), + } + ) + ) + + loss_fn = nn.MSELoss() + optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9) + + # Initialize the quantizer + config = LinearQuantizerConfig.from_dict( + { + "global_config": { + "quantization_scheme": "symmetric", + "milestones": [0, 100, 400, 200], + } + } + ) + quantizer = LinearQuantizer(model, config) + + # Prepare the model to insert FakeQuantize layers for QAT + example_input = torch.rand(1, 1, 20, 20) + model = quantizer.prepare(example_inputs=example_input, inplace=True) + + # Use quantizer in your PyTorch training loop + for inputs, labels in data: + output = model(inputs) + loss = loss_fn(output, labels) + loss.backward() + optimizer.step() + quantizer.step() + + # Convert operations to their quantized counterparts using parameters learnt via QAT + model = quantizer.finalize(inplace=True) + + # Convert the
PyTorch model to Core ML format + traced_model = torch.jit.trace(model, example_input) + coreml_model = ct.convert( + traced_model, + convert_to="mlprogram", + inputs=[ct.TensorType(shape=example_input.shape)], + minimum_deployment_target=ct.target.iOS17, + ) + + assert coreml_model is not None + output_file = tempfile.NamedTemporaryFile(suffix=".mlpackage").name + coreml_model.save(output_file) diff --git a/coremltools/test/optimize/coreml/test_post_training_quantization.py b/coremltools/test/optimize/coreml/test_post_training_quantization.py index 425e8070f..259b78b92 100644 --- a/coremltools/test/optimize/coreml/test_post_training_quantization.py +++ b/coremltools/test/optimize/coreml/test_post_training_quantization.py @@ -10,12 +10,15 @@ import torch import coremltools as ct +import coremltools.optimize as cto from coremltools._deps import _HAS_SKLEARN +from coremltools.converters.mil.mil import Builder as mb from coremltools.converters.mil.mil import types from coremltools.converters.mil.testing_utils import get_op_types_in_program -import coremltools.optimize as cto +from coremltools.optimize.coreml._post_training_quantization import CoreMLWeightMetaData from coremltools.test.ml_program.test_compression import get_test_model_and_data + # Wrapper functions that create the optimization config and call ct.optimize.coreml APIs def linear_quantize_weights(mlmodel, mode="linear", dtype=np.int8): op_config = cto.coreml.OpLinearQuantizerConfig(mode=mode, dtype=dtype) @@ -106,7 +109,7 @@ def create_unique_weight(weight, nbits): def create_sparse_weight(weight, target_sparsity): shape = list(weight.shape) size = np.prod(shape) - weight = 3 * np.ones(size) + weight = 100 * np.random.rand(size) num_of_zeros = int(size * target_sparsity) weight[:num_of_zeros] = 0 return np.reshape(weight, shape).astype(np.float32) @@ -661,10 +664,10 @@ def test_convert_sparse_source_model_custom(self): pipeline = ct.PassPipeline.DEFAULT_PRUNING config = cto.coreml.OptimizationConfig( - global_config=cto.coreml.OpThresholdPrunerConfig(threshold=1e-3, minimum_sparsity_percentile=0.05), - op_type_configs={ - "conv": None - } + global_config=cto.coreml.OpThresholdPrunerConfig( + threshold=1e-12, minimum_sparsity_percentile=0.05 + ), + op_type_configs={"conv": None}, ) pipeline.set_options("compression::prune_weights", {"config": config}) mlmodel = ct.convert( @@ -757,7 +760,9 @@ def test_convert_sparse_and_palettized_source_model_custom(): model, inputs, torch_input_values, coreml_input_values = get_test_model_and_data_complex() weight_1_sparse = create_sparse_weight(model.conv_1.weight, 0.5) - weight_2_sparse = create_sparse_weight(model.conv_2.weight, 0.1) + weight_2_sparse = create_sparse_weight( + model.conv_2.weight, 0.1 + ) # a weight with only 0.1 sparsity is filtered out by minimum_sparsity_percentile linear_1_unique = create_unique_weight(model.linear_1.weight, nbits=4) with torch.no_grad(): @@ -788,7 +793,6 @@ def test_convert_sparse_and_palettized_source_model_custom(): expected_ops = [ "constexpr_sparse_to_dense", "constexpr_lut_to_dense", - "constexpr_lut_to_dense", "conv", "conv", "reshape", @@ -804,7 +808,7 @@ def test_convert_sparse_and_palettized_source_model_custom(): conv_ops = prog.find_ops(op_type="conv") assert conv_ops[0].weight.op.op_type == "constexpr_sparse_to_dense" - assert conv_ops[1].weight.op.op_type == "constexpr_lut_to_dense" + assert conv_ops[1].weight.op.op_type == "const" linear_ops = prog.find_ops(op_type="linear") assert linear_ops[0].weight.op.op_type == "constexpr_lut_to_dense" @@
-872,3 +876,178 @@ def test_error_handling(): expected_err_str = "A function object must be provided as \"lut_function\"" with pytest.raises(ValueError, match=expected_err_str): palettize_weights(mlmodel, mode="custom", lut_function=1) + + +class TestCoreMLWeightMetaData: + """ + This test includes unit tests for: + 1. CoreMLWeightMetaData + 2. coremltools.optimize.coreml.get_weights_metadata + """ + @staticmethod + def test_coreml_weight_metadata_api(): + """ + Test the example in the CoreMLWeightMetaData API docstring. + """ + data = np.array([[1.0, 0.0], [0.0, 6.0]], dtype=np.float32) + meta_data = CoreMLWeightMetaData(data) + assert meta_data.val is data + assert meta_data.sparsity == 0.5 + assert meta_data.unique_values == 3 + + @staticmethod + def test_get_weights_metadata(): + """ + Test the get_weights_metadata functionality when op_type is None. + """ + model, inputs, torch_input_values, coreml_input_values = get_test_model_and_data_complex() + + weight_1_sparse = create_sparse_weight(model.conv_1.weight, 0.5) + weight_2_sparse = create_sparse_weight(model.conv_2.weight, 0.8) + linear_1_palettized = create_unique_weight(model.linear_1.weight, 2) + linear_2_palettized = create_unique_weight(model.linear_2.weight, 4) + + with torch.no_grad(): + model.conv_1.weight = torch.nn.Parameter(torch.Tensor(weight_1_sparse)) + model.conv_2.weight = torch.nn.Parameter(torch.Tensor(weight_2_sparse)) + model.linear_1.weight = torch.nn.Parameter(torch.Tensor(linear_1_palettized)) + model.linear_2.weight = torch.nn.Parameter(torch.Tensor(linear_2_palettized)) + + torchmodel = torch.jit.trace(model, torch_input_values) + + mlmodel = ct.convert( + torchmodel, + inputs=inputs, + convert_to="mlprogram", + compute_precision=ct.precision.FLOAT32, + minimum_deployment_target=ct.target.iOS16, + ) + + # test that weight_threshold filters out weights with fewer elements than the threshold + weight_threshold = 10 + weight_metadata_dict = ct.optimize.coreml.get_weights_metadata( + mlmodel, weight_threshold=weight_threshold + ) + for v in weight_metadata_dict.values(): + assert v.val.size >= weight_threshold + + # test the functionality of using the returned meta data + weight_metadata_dict = ct.optimize.coreml.get_weights_metadata(mlmodel) + + # get the weight names with size >= 25600 + large_weights = [] + for k, v in weight_metadata_dict.items(): + if v.val.size >= 25600: + large_weights.append(k) + + # get the weight names with sparsity >= 50% + sparse_weights = [] + for k, v in weight_metadata_dict.items(): + if v.sparsity >= 0.5: + sparse_weights.append(k) + + # get the weight names with unique elements <= 16 + palettized_weights = [] + for k, v in weight_metadata_dict.items(): + if v.unique_values <= 16: + palettized_weights.append(k) + + meta_data_1 = weight_metadata_dict["conv_1_weight"] + + # compare against the expected values + expected_large_weights = [ + "linear_2_weight", + "concat_1", + "concat_2", + ] + assert large_weights == expected_large_weights + + expected_sparse_weights = [ + "conv_1_weight", + "conv_2_weight", + "op_59_lstm_h0_squeeze", + ] + assert sparse_weights == expected_sparse_weights + + expected_palettized_weights = [ + "linear_1_weight", + "linear_2_weight", + "op_59_lstm_h0_squeeze", + ] + assert palettized_weights == expected_palettized_weights + + @staticmethod + def test_get_weights_metadata_shared_weight(): + """ + Test the get_weights_metadata functionality for models with weight-sharing layers.
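+ A single shared const feeding two add ops should produce one metadata entry whose child_ops lists both consumers.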
+ """ + def _test_child_ops(child_ops): + assert len(child_ops) == 2 + + assert child_ops[0].name == "add_1" + assert child_ops[0].op_type == "add" + assert child_ops[0].params_name_mapping["y"] == "w_1" + + assert child_ops[1].name == "add_2" + assert child_ops[1].op_type == "add" + assert child_ops[1].params_name_mapping["y"] == "w_1" + + @mb.program( + input_specs=[ + mb.TensorSpec(shape=(1, 30, 10, 10)), + mb.TensorSpec(shape=(1, 30, 10, 10)), + ], + ) + def prog(x, y): + shared_weight = mb.const( + val=np.random.rand(1, 30, 10, 10).astype(np.float32), name="w_1" + ) + x = mb.add(x=x, y=shared_weight, name="add_1") + y = mb.add(x=y, y=shared_weight, name="add_2") + return x, y + + mlmodel = ct.convert( + prog, + convert_to="mlprogram", + compute_precision=ct.precision.FLOAT32, + ) + + ops_metadata_dict = ct.optimize.coreml.get_weights_metadata( + mlmodel, + weight_threshold=100, + ) + assert len(ops_metadata_dict) == 1 + child_ops = ops_metadata_dict["w_1"].child_ops + _test_child_ops(child_ops) + + @staticmethod + def test_get_weights_metadata_op_var_different_name(): + """ + For several rare corner cases, the const var and op have different names. + Test that the API is correctly using the op's name. + """ + @mb.program( + input_specs=[ + mb.TensorSpec(shape=(1, 30, 10, 10)), + ], + ) + def prog(x): + shared_weight = mb.const( + val=np.random.rand(1, 30, 10, 10).astype(np.float32), name="w_1" + ) + shared_weight.name = "w_1_new" + x = mb.add(x=x, y=shared_weight, name="add_1") + return x + + mlmodel = ct.convert( + prog, + convert_to="mlprogram", + compute_precision=ct.precision.FLOAT32, + ) + + ops_metadata_dict = ct.optimize.coreml.get_weights_metadata( + mlmodel, + weight_threshold=100, + ) + assert "w_1" in ops_metadata_dict + assert ops_metadata_dict["w_1"].child_ops[0].params_name_mapping["y"] == "w_1" diff --git a/coremltools/test/optimize/torch/conftest.py b/coremltools/test/optimize/torch/conftest.py index 6ab0dee21..182d47798 100644 --- a/coremltools/test/optimize/torch/conftest.py +++ b/coremltools/test/optimize/torch/conftest.py @@ -39,7 +39,7 @@ def _datadir(request): # is not happy with. Thus we should substitute these characters with a more universally accepted path character. safe_name = request.node.name.replace("[", "___").replace("]", "___") - dir = test_data_path() / safe_name # noqa: F821 + dir = test_data_path() / safe_name # noqa: F821 shutil.rmtree(str(dir), ignore_errors=True) os.makedirs(str(dir)) return dir diff --git a/coremltools/version.py b/coremltools/version.py index b17d2e113..c00da273a 100644 --- a/coremltools/version.py +++ b/coremltools/version.py @@ -4,4 +4,4 @@ # found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause -__version__ = "7.0b1" # VERSION_STRING +__version__ = "7.0b2" # VERSION_STRING diff --git a/docs/source/coremltools.converters.mil.mil.passes.defs.rst b/docs/source/coremltools.converters.mil.mil.passes.defs.rst index 2dcda1b45..a32d4e3ad 100644 --- a/docs/source/coremltools.converters.mil.mil.passes.defs.rst +++ b/docs/source/coremltools.converters.mil.mil.passes.defs.rst @@ -51,6 +51,7 @@ optimize_elementwise_binary .. automodule:: coremltools.converters.mil.mil.passes.defs.optimize_elementwise_binary .. autoclass:: divide_to_multiply + .. autoclass:: select_optimization .. autoclass:: fuse_elementwise_to_batchnorm .. autoclass:: rank0_expand_dims_swap @@ -77,6 +78,7 @@ optimize_quantization .. automodule:: coremltools.converters.mil.mil.passes.defs.optimize_quantization + .. 
autoclass:: int_op_canonicalization .. autoclass:: nullify_redundant_quantization_zero_point .. autoclass:: dequantize_quantize_pair_elimination .. autoclass:: distributive_quantized_binary_op_scale_normalization diff --git a/docs/source/coremltools.optimize.coreml.post_training_quantization.rst b/docs/source/coremltools.optimize.coreml.post_training_quantization.rst index b292cd784..b4e28aa3c 100644 --- a/docs/source/coremltools.optimize.coreml.post_training_quantization.rst +++ b/docs/source/coremltools.optimize.coreml.post_training_quantization.rst @@ -6,4 +6,7 @@ Post-Training Compression .. autofunction:: linear_quantize_weights .. autofunction:: prune_weights .. autofunction:: palettize_weights - .. autofunction:: decompress_weights \ No newline at end of file + .. autofunction:: decompress_weights + .. autofunction:: get_weights_metadata + .. autoclass:: CoreMLWeightMetaData + .. autoclass:: CoreMLOpMetaData diff --git a/mlmodel/docs/Format/ItemSimilarityRecommender.rst b/mlmodel/docs/Format/ItemSimilarityRecommender.rst index 4d91ad82d..d213bc4c7 100644 --- a/mlmodel/docs/Format/ItemSimilarityRecommender.rst +++ b/mlmodel/docs/Format/ItemSimilarityRecommender.rst @@ -1,28 +1,6 @@ ItemSimilarityRecommender ========================= -Each tree is a collection of nodes, -each of which is identified by a unique identifier. - -Each node is either a branch or a leaf node. -A branch node evaluates a value according to a behavior; -if true, the node identified by ``true_child_node_id`` is evaluated next, -if false, the node identified by ``false_child_node_id`` is evaluated next. -A leaf node adds the evaluation value to the base prediction value -to get the final prediction. - -A tree must have exactly one root node, -which has no parent node. -A tree must not terminate on a branch node. -All leaf nodes must be accessible -by evaluating one or more branch nodes in sequence, -starting from the root node. - - - -ItemSimilarityRecommender --------------------------------------------------------------------------------- - The Item Similarity recommender takes as input a list of items and scores, then uses that information and a table of item similarities to predict similarity scores for all items. By default, the items predicted are most similar to the given @@ -104,4 +82,4 @@ The formula for the score of a given model as given above, with shift_k uint64 itemId = 1; repeated ConnectedItem similarItemList = 2; double itemScoreAdjustment = 3; - } \ No newline at end of file + } diff --git a/mlmodel/format/ItemSimilarityRecommender.proto b/mlmodel/format/ItemSimilarityRecommender.proto index 7eea57e59..eb0292ac6 100644 --- a/mlmodel/format/ItemSimilarityRecommender.proto +++ b/mlmodel/format/ItemSimilarityRecommender.proto @@ -3,25 +3,6 @@ // Use of this source code is governed by a BSD-3-clause license that can be // found in LICENSE.txt or at https://opensource.org/licenses/BSD-3-Clause -/* - * Each tree is a collection of nodes, - * each of which is identified by a unique identifier. - * - * Each node is either a branch or a leaf node. - * A branch node evaluates a value according to a behavior; - * if true, the node identified by ``true_child_node_id`` is evaluated next, - * if false, the node identified by ``false_child_node_id`` is evaluated next. - * A leaf node adds the evaluation value to the base prediction value - * to get the final prediction. - * - * A tree must have exactly one root node, - * which has no parent node. - * A tree must not terminate on a branch node. 
- * All leaf nodes must be accessible - * by evaluating one or more branch nodes in sequence, - * starting from the root node. - */ - syntax = "proto3"; option optimize_for = LITE_RUNTIME; diff --git a/mlmodel/src/ResultReason.hpp b/mlmodel/src/ResultReason.hpp index 4d7234199..b2fc3b87f 100644 --- a/mlmodel/src/ResultReason.hpp +++ b/mlmodel/src/ResultReason.hpp @@ -33,7 +33,6 @@ enum class ResultReason { MODEL_MAIN_INPUT_RANK_MISMATCHED, MODEL_MAIN_INPUT_SHAPE_MISMATCHED, MODEL_MAIN_INPUT_TYPE_MISMATCHED, - MODEL_MAIN_INPUT_UNBOUNDED_UPPER_RANGE, MODEL_MAIN_OUTPUT_COUNT_MISMATCHED, MODEL_MAIN_OUTPUT_RANK_MISMATCHED, MODEL_MAIN_OUTPUT_SHAPE_MISMATCHED, diff --git a/reqs/test.pip b/reqs/test.pip index 0d4c69556..e90870439 100644 --- a/reqs/test.pip +++ b/reqs/test.pip @@ -24,9 +24,9 @@ scipy==1.9.2; python_version == '3.11' six sympy > 1.6 gast==0.4.0 -torch==2.0.0 -torchaudio==2.0.1 -torchvision==0.15.1 +torch==2.0.1 +torchaudio==2.0.2 +torchvision==0.15.2 xgboost==1.4.2; platform_machine != "arm64" mock wrapt diff --git a/setup.py b/setup.py index 4f5f6688b..459e6128a 100755 --- a/setup.py +++ b/setup.py @@ -81,7 +81,7 @@ "sympy", "tqdm", "packaging", - "attrs", + "attrs>=21.3.0", "cattrs", "pyaml", ],