From 031125fdbdc79da4cc7f57ca7f5b9420a3e6f055 Mon Sep 17 00:00:00 2001
From: Alexandre Marques <alexandre@neuralmagic.com>
Date: Fri, 23 Jun 2023 17:28:00 -0400
Subject: [PATCH 01/15] Add transformation to propagate dequantize op through
 split

---
 src/sparseml/exporters/onnx_to_deepsparse.py  |  1 +
 src/sparseml/exporters/transforms/__init__.py |  1 +
 .../propagate_dequant_through_split.py        | 75 +++++++++++++++++++
 3 files changed, 77 insertions(+)
 create mode 100644 src/sparseml/exporters/transforms/propagate_dequant_through_split.py

diff --git a/src/sparseml/exporters/onnx_to_deepsparse.py b/src/sparseml/exporters/onnx_to_deepsparse.py
index 78651e39c74..fadad423b5c 100644
--- a/src/sparseml/exporters/onnx_to_deepsparse.py
+++ b/src/sparseml/exporters/onnx_to_deepsparse.py
@@ -75,6 +75,7 @@ def __init__(
             sparseml_transforms.DeleteRepeatedQdq(),
             sparseml_transforms.QuantizeQATEmbedding(),
             sparseml_transforms.PropagateEmbeddingQuantization(),
+            sparseml_transforms.PropagateDequantThroughSplit(),
             sparseml_transforms.MatMulToQLinearMatMul(),
             sparseml_transforms.MatMulAddToMatMulIntegerAddCastMul(),
             sparseml_transforms.MatMulToMatMulIntegerCastMul(),
diff --git a/src/sparseml/exporters/transforms/__init__.py b/src/sparseml/exporters/transforms/__init__.py
index 459c86083e4..4daa38a4a6a 100644
--- a/src/sparseml/exporters/transforms/__init__.py
+++ b/src/sparseml/exporters/transforms/__init__.py
@@ -41,6 +41,7 @@
 from .matmul_add_to_matmulinteger_add_cast_mul import MatMulAddToMatMulIntegerAddCastMul
 from .matmul_to_matmulinteger_cast_mul import MatMulToMatMulIntegerCastMul
 from .propagate_embedding_quantization import PropagateEmbeddingQuantization
+from .propagate_dequant_through_split import PropagateDequantThroughSplit
 from .quantize_qat_embedding import QuantizeQATEmbedding
 from .quantize_residuals import QuantizeResiduals
 from .remove_duplicate_qconv_weights import RemoveDuplicateQConvWeights
diff --git a/src/sparseml/exporters/transforms/propagate_dequant_through_split.py b/src/sparseml/exporters/transforms/propagate_dequant_through_split.py
new file mode 100644
index 00000000000..231dca36dbb
--- /dev/null
+++ b/src/sparseml/exporters/transforms/propagate_dequant_through_split.py
@@ -0,0 +1,75 @@
+from onnx import ModelProto
+import onnx
+
+from sparseml.exporters.transforms import OnnxTransform
+from sparseml.exporters.transforms.utils import (
+    MatchResult,
+    get_structural_matches,
+)
+from sparseml.onnx.utils import ONNXGraph
+
+__all__ = ["PropagateDequantThroughSplit"]
+
+class PropagateDequantThroughSplit(OnnxTransform):
+    """
+     A pass for propagating DequantizeLinear nodes down through a split node
+     so if there are quantized operations after the split they can
+     be properly converted.
+     Starting with:
+     |         INPUT
+     |              |
+     |       DequantizeLinear
+     |             |
+     |           Split
+     |         |   |   |
+     Converts to:
+     |                     INPUT
+     |                         |
+     |                       Split
+     |                |         |           |
+     | DequantizeLinear  DequantizeLinear  DequantizeLinear
+     |         |                |                |
+     """
+    def transform(self, model: ModelProto) -> ModelProto:
+        graph = ONNXGraph(model)
+        matches = get_structural_matches(
+            graph,
+            parent_ops=[["DequantizeLinear"]],
+            op_type="Split",
+        )
+        for match in matches:
+            self.log_match(match)
+            self._transform_match( model, match)
+        return model
+
+    def _transform_match(self, model: ModelProto, match: MatchResult):
+
+        # Loop through the nodes that are children of the Split node
+        # For every child, create a DequantizeLinear node and insert between Split and child
+        for split_output_id in range(len(match.node.output)):
+            dequant_node_name = match.node.name + f"_dequant.{split_output_id}"
+            dequant_node_output = match.node.output[split_output_id]
+            dequant_node_input = dequant_node_name + "_input"
+
+            # Input to DequantizeLinear node is the output of the Split node;
+            # scale and zero point (optional in ONNX) are forwarded when present
+            model.graph.node.append(
+                onnx.helper.make_node(
+                    "DequantizeLinear",
+                    [
+                        dequant_node_input,  # input
+                        *match.parents[0][0].input[1:],  # scale[, zero point]
+                    ],
+                    [dequant_node_output],
+                    dequant_node_name,
+                )
+            )
+
+            # Replace the output of the Split node with the input of the new DequantizeLinear node
+            match.node.output[split_output_id] = dequant_node_input
+
+        # Set the input to the Split node to what was the input of the original DequantizeLinear node
+        match.node.input[0] = match.parents[0][0].input[0]
+
+        # Remove original DequantizeLinear node
+        self.delete_node_deferred(match.parents[0][0])

From c54c289a8bd72fc5663b0363ec4f55ab5b06bf3d Mon Sep 17 00:00:00 2001
From: Alexandre Marques <alexandre@neuralmagic.com>
Date: Fri, 23 Jun 2023 17:28:51 -0400
Subject: [PATCH 02/15] Remove requirement that QuantizeLinear must be next to
 DequantizeLinear for input branch of Conv node

---
 .../exporters/transforms/conv_to_convinteger_add_cast_mul.py  | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/sparseml/exporters/transforms/conv_to_convinteger_add_cast_mul.py b/src/sparseml/exporters/transforms/conv_to_convinteger_add_cast_mul.py
index c8e1e693c04..fb52bf3432c 100644
--- a/src/sparseml/exporters/transforms/conv_to_convinteger_add_cast_mul.py
+++ b/src/sparseml/exporters/transforms/conv_to_convinteger_add_cast_mul.py
@@ -69,7 +69,7 @@ def transform(self, model: ModelProto) -> ModelProto:
         matches = get_structural_matches(
             graph,
             parent_ops=[
-                ["QuantizeLinear", "DequantizeLinear"],
+                ["DequantizeLinear"],
                 [
                     # weight should be initializer
                     INITIALIZER_MATCH,
@@ -89,7 +89,7 @@ def transform(self, model: ModelProto) -> ModelProto:
         return model
 
     def _transform_match(self, graph: ONNXGraph, model: ModelProto, match: MatchResult):
-        input_quant, input_dequant = match.parents[0]
+        (input_dequant,) = match.parents[0]
         weight_init, weight_quantize_node, weight_dequantize_node = match.parents[1]
         (bias_init,) = match.parents[2]
 

From 20632ce3bb8ebce19b431d7fce5f4ce2dda73d76 Mon Sep 17 00:00:00 2001
From: Alexandre Marques <alexandre@neuralmagic.com>
Date: Fri, 23 Jun 2023 18:11:57 -0400
Subject: [PATCH 03/15] Fixed embedding quantization propagation

---
 .../propagate_embedding_quantization.py       | 38 ++++++++++++++++---
 1 file changed, 33 insertions(+), 5 deletions(-)

diff --git a/src/sparseml/exporters/transforms/propagate_embedding_quantization.py b/src/sparseml/exporters/transforms/propagate_embedding_quantization.py
index 0ca82d5e995..be3ed607c78 100644
--- a/src/sparseml/exporters/transforms/propagate_embedding_quantization.py
+++ b/src/sparseml/exporters/transforms/propagate_embedding_quantization.py
@@ -15,6 +15,7 @@
 import logging
 
 import numpy
+import onnx.numpy_helper
 from onnx import ModelProto, numpy_helper
 
 from sparseml.exporters.transforms.onnx_transform import OnnxTransform
@@ -79,16 +80,19 @@ def transform(self, model: ModelProto) -> ModelProto:
                 ["Concat"],
             ],
         )
+
+        initializer_dict = {i.name: i for i in model.graph.initializer}
+
         for match in matches:
             (gather,) = match.parents[0]
             dequant = match.node
-            slice1, _, concat1 = match.children[0]
-            slice2, _, concat2 = match.children[1]
+            slice1, pad1, concat1 = match.children[0]
+            slice2, pad2, concat2 = match.children[1]
             (concat,) = match.children[2]
 
             # check for uint8 initializer
             indices = graph.get_init_by_name(gather.input[0])
-            if indices is None or numpy_helper.to_array(indices).dtype != numpy.uint8:
+            if indices is None or numpy_helper.to_array(indices).dtype not in [numpy.uint8, numpy.int8]:
                 continue
 
             # check that all concats are the same
@@ -97,11 +101,35 @@ def transform(self, model: ModelProto) -> ModelProto:
 
             self.log_match(match)
 
-            assert concat.input[2] == dequant.output[0]
-            concat.input[2] = gather.output[0]
+            # locate which concat input is fed by the dequant node; index()
+            # raises ValueError instead of silently rewiring the wrong input
+            dequant_input_idx = list(concat.input).index(dequant.output[0])
+
+            concat.input[dequant_input_idx] = gather.output[0]
             slice1.input[0] = gather.output[0]
             slice2.input[0] = gather.output[0]
 
+            zero_point_initializer = initializer_dict[match.node.input[2]]
+            zero_point = onnx.numpy_helper.to_array(zero_point_initializer)
+
+            pad1_value_initializer = initializer_dict[pad1.input[2]]
+            pad1_value = onnx.numpy_helper.to_array(pad1_value_initializer)
+            pad1_value = pad1_value.astype(zero_point.dtype) + zero_point
+            new_pad1_value_initializer = numpy_helper.from_array(
+                pad1_value, name=pad1_value_initializer.name
+            )
+            model.graph.initializer.remove(pad1_value_initializer)
+            model.graph.initializer.append(new_pad1_value_initializer)
+
+            pad2_value_initializer = initializer_dict[pad2.input[2]]
+            pad2_value = onnx.numpy_helper.to_array(pad2_value_initializer)
+            pad2_value = pad2_value.astype(zero_point.dtype) + zero_point
+            new_pad2_value_initializer = numpy_helper.from_array(
+                pad2_value, name=pad2_value_initializer.name
+            )
+            model.graph.initializer.remove(pad2_value_initializer)
+            model.graph.initializer.append(new_pad2_value_initializer)
+
             tmp = concat.output[0]
             concat.output[0] = dequant.output[0]
             dequant.output[0] = tmp

From 4384a11d8e07b11cf9768f9f5d820f0b2ed049c8 Mon Sep 17 00:00:00 2001
From: Alexandre Marques <alexandre@neuralmagic.com>
Date: Fri, 23 Jun 2023 18:15:24 -0400
Subject: [PATCH 04/15] Quality fixes

---
 .../propagate_dequant_through_split.py        | 71 ++++++++++++-------
 .../propagate_embedding_quantization.py       |  5 +-
 2 files changed, 48 insertions(+), 28 deletions(-)

diff --git a/src/sparseml/exporters/transforms/propagate_dequant_through_split.py b/src/sparseml/exporters/transforms/propagate_dequant_through_split.py
index 231dca36dbb..7c65117f710 100644
--- a/src/sparseml/exporters/transforms/propagate_dequant_through_split.py
+++ b/src/sparseml/exporters/transforms/propagate_dequant_through_split.py
@@ -1,35 +1,49 @@
-from onnx import ModelProto
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import onnx
+from onnx import ModelProto
 
 from sparseml.exporters.transforms import OnnxTransform
-from sparseml.exporters.transforms.utils import (
-    MatchResult,
-    get_structural_matches,
-)
+from sparseml.exporters.transforms.utils import MatchResult, get_structural_matches
 from sparseml.onnx.utils import ONNXGraph
 
+
 __all__ = ["PropagateDequantThroughSplit"]
 
+
 class PropagateDequantThroughSplit(OnnxTransform):
     """
-     A pass for propagating DequantizeLinear nodes down through a split node
-     so if there are quantized operations after the split they can
-     be properly converted.
-     Starting with:
-     |         INPUT
-     |              |
-     |       DequantizeLinear
-     |             |
-     |           Split
-     |         |   |   |
-     Converts to:
-     |                     INPUT
-     |                         |
-     |                       Split
-     |                |         |           |
-     | DequantizeLinear  DequantizeLinear  DequantizeLinear
-     |         |                |                |
-     """
+    A pass for propagating DequantizeLinear nodes down through a split node
+    so if there are quantized operations after the split they can
+    be properly converted.
+    Starting with:
+    |         INPUT
+    |              |
+    |       DequantizeLinear
+    |             |
+    |           Split
+    |         |   |   |
+    Converts to:
+    |                     INPUT
+    |                         |
+    |                       Split
+    |                |         |           |
+    | DequantizeLinear  DequantizeLinear  DequantizeLinear
+    |         |                |                |
+    """
+
     def transform(self, model: ModelProto) -> ModelProto:
         graph = ONNXGraph(model)
         matches = get_structural_matches(
@@ -39,13 +53,14 @@ def transform(self, model: ModelProto) -> ModelProto:
         )
         for match in matches:
             self.log_match(match)
-            self._transform_match( model, match)
+            self._transform_match(model, match)
         return model
 
     def _transform_match(self, model: ModelProto, match: MatchResult):
 
         # Loop through the nodes that are children of the Split node
-        # For every child, create a DequantizeLinear node and insert between Split and child
+        # For every child, create a DequantizeLinear node and insert
+        # between Split and child
         for split_output_id in range(len(match.node.output)):
             dequant_node_name = match.node.name + f"_dequant.{split_output_id}"
             dequant_node_output = match.node.output[split_output_id]
@@ -65,10 +80,12 @@ def _transform_match(self, model: ModelProto, match: MatchResult):
                 )
             )
 
-            # Replace the output of the Split node with the input of the new DequantizeLinear node
+            # Replace the output of the Split node with the input of
+            # the new DequantizeLinear node
             match.node.output[split_output_id] = dequant_node_input
 
-        # Set the input to the Split node to what was the input of the original DequantizeLinear node
+        # Set the input to the Split node to what was the input of the
+        # original DequantizeLinear node
         match.node.input[0] = match.parents[0][0].input[0]
 
         # Remove original DequantizeLinear node
diff --git a/src/sparseml/exporters/transforms/propagate_embedding_quantization.py b/src/sparseml/exporters/transforms/propagate_embedding_quantization.py
index be3ed607c78..7e3dda87120 100644
--- a/src/sparseml/exporters/transforms/propagate_embedding_quantization.py
+++ b/src/sparseml/exporters/transforms/propagate_embedding_quantization.py
@@ -92,7 +92,10 @@ def transform(self, model: ModelProto) -> ModelProto:
 
             # check for uint8 initializer
             indices = graph.get_init_by_name(gather.input[0])
-            if indices is None or numpy_helper.to_array(indices).dtype not in [numpy.uint8, numpy.int8]:
+            if indices is None or numpy_helper.to_array(indices).dtype not in [
+                numpy.uint8,
+                numpy.int8,
+            ]:
                 continue
 
             # check that all concats are the same

From a98bd91e972ad1bdab8a8871fbb4dbd2989550e7 Mon Sep 17 00:00:00 2001
From: Alexandre Marques <alexandre@neuralmagic.com>
Date: Tue, 27 Jun 2023 00:33:31 -0400
Subject: [PATCH 05/15] Add zero point to dequant node

---
 .../transforms/test_propagate_embedding_quantization.py        | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tests/sparseml/exporters/transforms/test_propagate_embedding_quantization.py b/tests/sparseml/exporters/transforms/test_propagate_embedding_quantization.py
index 9111d75e42d..300744d5cc3 100644
--- a/tests/sparseml/exporters/transforms/test_propagate_embedding_quantization.py
+++ b/tests/sparseml/exporters/transforms/test_propagate_embedding_quantization.py
@@ -32,6 +32,7 @@ def onnx_model():
         "output", onnx.TensorProto.FLOAT, (1,)
     )
     scale = onnx.helper.make_tensor("scale", onnx.TensorProto.FLOAT, (1,), [1.0])
+    zero_point = onnx.helper.make_tensor("zero point", onnx.TensorProto.INT8, (1,), [0.0])
     starts = onnx.helper.make_tensor("starts", onnx.TensorProto.INT64, (1,), [0])
     ends = onnx.helper.make_tensor("ends", onnx.TensorProto.INT64, (1,), [1])
     pads = onnx.helper.make_tensor("pads", onnx.TensorProto.INT64, (1,), [1])
@@ -43,7 +44,7 @@ def onnx_model():
     )
     dequant = onnx.helper.make_node(
         "DequantizeLinear",
-        ["gather_output", "scale"],
+        ["gather_output", "scale", "zero_point"],
         ["dequant_output"],
         name="dequant",
     )

From 32db97be427e8b688a48ee9ae0424b68f6f62e71 Mon Sep 17 00:00:00 2001
From: Alexandre Marques <alexandre@neuralmagic.com>
Date: Tue, 27 Jun 2023 00:34:51 -0400
Subject: [PATCH 06/15] Add zero point to initializers

---
 .../transforms/test_propagate_embedding_quantization.py       | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/sparseml/exporters/transforms/test_propagate_embedding_quantization.py b/tests/sparseml/exporters/transforms/test_propagate_embedding_quantization.py
index 300744d5cc3..ca48489ad77 100644
--- a/tests/sparseml/exporters/transforms/test_propagate_embedding_quantization.py
+++ b/tests/sparseml/exporters/transforms/test_propagate_embedding_quantization.py
@@ -32,7 +32,7 @@ def onnx_model():
         "output", onnx.TensorProto.FLOAT, (1,)
     )
     scale = onnx.helper.make_tensor("scale", onnx.TensorProto.FLOAT, (1,), [1.0])
-    zero_point = onnx.helper.make_tensor("zero point", onnx.TensorProto.INT8, (1,), [0.0])
+    zero_point = onnx.helper.make_tensor("zero_point", onnx.TensorProto.INT8, (1,), [0.0])
     starts = onnx.helper.make_tensor("starts", onnx.TensorProto.INT64, (1,), [0])
     ends = onnx.helper.make_tensor("ends", onnx.TensorProto.INT64, (1,), [1])
     pads = onnx.helper.make_tensor("pads", onnx.TensorProto.INT64, (1,), [1])
@@ -74,7 +74,7 @@ def onnx_model():
         name="g",
         inputs=[model_input],
         outputs=[model_output],
-        initializer=[scale, starts, ends, embeddings, pads],
+        initializer=[scale, zero_point, starts, ends, embeddings, pads],
     )
 
     model = onnx.helper.make_model(graph)

From 8e18613303fe1468761950ba40236e29952ce028 Mon Sep 17 00:00:00 2001
From: Alexandre Marques <alexandre@neuralmagic.com>
Date: Tue, 27 Jun 2023 00:38:31 -0400
Subject: [PATCH 07/15] Style fixes

---
 .../transforms/test_propagate_embedding_quantization.py       | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tests/sparseml/exporters/transforms/test_propagate_embedding_quantization.py b/tests/sparseml/exporters/transforms/test_propagate_embedding_quantization.py
index ca48489ad77..ae66d72745e 100644
--- a/tests/sparseml/exporters/transforms/test_propagate_embedding_quantization.py
+++ b/tests/sparseml/exporters/transforms/test_propagate_embedding_quantization.py
@@ -32,7 +32,9 @@ def onnx_model():
         "output", onnx.TensorProto.FLOAT, (1,)
     )
     scale = onnx.helper.make_tensor("scale", onnx.TensorProto.FLOAT, (1,), [1.0])
-    zero_point = onnx.helper.make_tensor("zero_point", onnx.TensorProto.INT8, (1,), [0.0])
+    zero_point = onnx.helper.make_tensor(
+        "zero_point", onnx.TensorProto.INT8, (1,), [0.0]
+    )
     starts = onnx.helper.make_tensor("starts", onnx.TensorProto.INT64, (1,), [0])
     ends = onnx.helper.make_tensor("ends", onnx.TensorProto.INT64, (1,), [1])
     pads = onnx.helper.make_tensor("pads", onnx.TensorProto.INT64, (1,), [1])

From fdae854fa8c6701805ec2502587eed951ee81d9d Mon Sep 17 00:00:00 2001
From: Alexandre Marques <alexandre@neuralmagic.com>
Date: Tue, 27 Jun 2023 00:48:41 -0400
Subject: [PATCH 08/15] Fix data type

---
 .../transforms/test_propagate_embedding_quantization.py         | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/sparseml/exporters/transforms/test_propagate_embedding_quantization.py b/tests/sparseml/exporters/transforms/test_propagate_embedding_quantization.py
index ae66d72745e..2836e60bbaa 100644
--- a/tests/sparseml/exporters/transforms/test_propagate_embedding_quantization.py
+++ b/tests/sparseml/exporters/transforms/test_propagate_embedding_quantization.py
@@ -33,7 +33,7 @@ def onnx_model():
     )
     scale = onnx.helper.make_tensor("scale", onnx.TensorProto.FLOAT, (1,), [1.0])
     zero_point = onnx.helper.make_tensor(
-        "zero_point", onnx.TensorProto.INT8, (1,), [0.0]
+        "zero_point", onnx.TensorProto.INT8, (1,), [0]
     )
     starts = onnx.helper.make_tensor("starts", onnx.TensorProto.INT64, (1,), [0])
     ends = onnx.helper.make_tensor("ends", onnx.TensorProto.INT64, (1,), [1])

From 85c68289932364a4e110c29e1ea4486c053f6b3d Mon Sep 17 00:00:00 2001
From: Alexandre Marques <alexandre@neuralmagic.com>
Date: Tue, 27 Jun 2023 01:16:53 -0400
Subject: [PATCH 09/15] Allow MatMul weight to be on either input 0 or 1

---
 ...atmul_add_to_matmulinteger_add_cast_mul.py | 43 +++++++++++++++++--
 1 file changed, 40 insertions(+), 3 deletions(-)

diff --git a/src/sparseml/exporters/transforms/matmul_add_to_matmulinteger_add_cast_mul.py b/src/sparseml/exporters/transforms/matmul_add_to_matmulinteger_add_cast_mul.py
index 2cef33a6226..bc49a925ebc 100644
--- a/src/sparseml/exporters/transforms/matmul_add_to_matmulinteger_add_cast_mul.py
+++ b/src/sparseml/exporters/transforms/matmul_add_to_matmulinteger_add_cast_mul.py
@@ -67,6 +67,37 @@ class MatMulAddToMatMulIntegerAddCastMul(OnnxTransform):
 
     def transform(self, model: ModelProto) -> ModelProto:
         graph = ONNXGraph(model)
+
+        # Weight on input 0
+        matches = get_structural_matches(
+            graph,
+            op_type="MatMul",
+            parent_ops=[
+                [
+                    # weight should be initializer
+                    INITIALIZER_MATCH,
+                    "QuantizeLinear",
+                    "DequantizeLinear",
+                    optional_node("Transpose"),
+                ],
+                [any_of("QuantizeLinear", "DequantizeLinear")],
+            ],
+            children_ops=[[optional_node("Add")]],
+        )
+        for match in matches:
+            add_node = match.children[0][0]
+            bias_init = None
+            if add_node:
+                # NOTE: bias could be either input 0 or 1 of add node
+                # if add does not have a bias initializer,
+                # still do conversion, but do not fold the bias add to rescale
+                bias_init = graph.get_init_by_name(match.children[0][0].input[1])
+                if bias_init is None:
+                    bias_init = graph.get_init_by_name(match.children[0][0].input[0])
+            self.log_match(match)
+            self._transform_match(graph, model, match, bias_init, 0)
+
+        # Weight on input 1
         matches = get_structural_matches(
             graph,
             op_type="MatMul",
@@ -93,7 +124,8 @@ def transform(self, model: ModelProto) -> ModelProto:
                 if bias_init is None:
                     bias_init = graph.get_init_by_name(match.children[0][0].input[0])
             self.log_match(match)
-            self._transform_match(graph, model, match, bias_init)
+            self._transform_match(graph, model, match, bias_init, 1)
+
         return model
 
     def _transform_match(
@@ -102,10 +134,15 @@ def _transform_match(
         model: ModelProto,
         match: MatchResult,
         bias_init: TensorProto,
+        weight_parent: int,
     ):
         matmul = match.node
-        (input_quant,) = match.parents[0]
-        weight_init, weight_quant, weight_dequant, opt_transpose = match.parents[1]
+        if weight_parent == 0:
+            (input_quant,) = match.parents[1]
+            weight_init, weight_quant, weight_dequant, opt_transpose = match.parents[0]
+        else:
+            (input_quant,) = match.parents[0]
+            weight_init, weight_quant, weight_dequant, opt_transpose = match.parents[1]
         (add,) = match.children[0]
 
         input_quantize_params = get_quantization_params(

From f2bf1d78b15b0c95e9ddc28b0617d78ec6b0b926 Mon Sep 17 00:00:00 2001
From: Alexandre Marques <alexandre@neuralmagic.com>
Date: Tue, 27 Jun 2023 01:20:21 -0400
Subject: [PATCH 10/15] Style fixes

---
 .../transforms/test_propagate_embedding_quantization.py       | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/tests/sparseml/exporters/transforms/test_propagate_embedding_quantization.py b/tests/sparseml/exporters/transforms/test_propagate_embedding_quantization.py
index 2836e60bbaa..cb63f82fd3d 100644
--- a/tests/sparseml/exporters/transforms/test_propagate_embedding_quantization.py
+++ b/tests/sparseml/exporters/transforms/test_propagate_embedding_quantization.py
@@ -32,9 +32,7 @@ def onnx_model():
         "output", onnx.TensorProto.FLOAT, (1,)
     )
     scale = onnx.helper.make_tensor("scale", onnx.TensorProto.FLOAT, (1,), [1.0])
-    zero_point = onnx.helper.make_tensor(
-        "zero_point", onnx.TensorProto.INT8, (1,), [0]
-    )
+    zero_point = onnx.helper.make_tensor("zero_point", onnx.TensorProto.INT8, (1,), [0])
     starts = onnx.helper.make_tensor("starts", onnx.TensorProto.INT64, (1,), [0])
     ends = onnx.helper.make_tensor("ends", onnx.TensorProto.INT64, (1,), [1])
     pads = onnx.helper.make_tensor("pads", onnx.TensorProto.INT64, (1,), [1])

From f904e4932b2c963dd10a34fe6901fe24fc8ffa47 Mon Sep 17 00:00:00 2001
From: Alexandre Marques <alexandre@neuralmagic.com>
Date: Tue, 27 Jun 2023 01:29:12 -0400
Subject: [PATCH 11/15] Add padding value

---
 .../test_propagate_embedding_quantization.py        | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/tests/sparseml/exporters/transforms/test_propagate_embedding_quantization.py b/tests/sparseml/exporters/transforms/test_propagate_embedding_quantization.py
index cb63f82fd3d..a6711b4a619 100644
--- a/tests/sparseml/exporters/transforms/test_propagate_embedding_quantization.py
+++ b/tests/sparseml/exporters/transforms/test_propagate_embedding_quantization.py
@@ -32,10 +32,15 @@ def onnx_model():
         "output", onnx.TensorProto.FLOAT, (1,)
     )
     scale = onnx.helper.make_tensor("scale", onnx.TensorProto.FLOAT, (1,), [1.0])
-    zero_point = onnx.helper.make_tensor("zero_point", onnx.TensorProto.INT8, (1,), [0])
+    zero_point = onnx.helper.make_tensor(
+        "zero_point", onnx.TensorProto.UINT8, (1,), [128]
+    )
     starts = onnx.helper.make_tensor("starts", onnx.TensorProto.INT64, (1,), [0])
     ends = onnx.helper.make_tensor("ends", onnx.TensorProto.INT64, (1,), [1])
     pads = onnx.helper.make_tensor("pads", onnx.TensorProto.INT64, (1,), [1])
+    padding_value = onnx.helper.make_tensor(
+        "padding_value", onnx.TensorProto.FLOAT, (1,), [0.0]
+    )
     embeddings = onnx.helper.make_tensor(
         "embeddings", onnx.TensorProto.UINT8, (1,), [0]
     )
@@ -53,13 +58,13 @@ def onnx_model():
         "Slice", ["dequant_output", "starts", "ends"], ["slice1_output"], name="slice1"
     )
     pad1 = onnx.helper.make_node(
-        "Pad", ["slice1_output", "pads"], ["pad1_output"], name="pad1"
+        "Pad", ["slice1_output", "pads", "padding_value"], ["pad1_output"], name="pad1"
     )
     slice2 = onnx.helper.make_node(
         "Slice", ["dequant_output", "starts", "ends"], ["slice2_output"], name="slice2"
     )
     pad2 = onnx.helper.make_node(
-        "Pad", ["slice2_output", "pads"], ["pad2_output"], name="pad2"
+        "Pad", ["slice2_output", "pads", "padding_value"], ["pad2_output"], name="pad2"
     )
     concat = onnx.helper.make_node(
         "Concat",
@@ -74,7 +79,7 @@ def onnx_model():
         name="g",
         inputs=[model_input],
         outputs=[model_output],
-        initializer=[scale, zero_point, starts, ends, embeddings, pads],
+        initializer=[scale, zero_point, starts, ends, embeddings, pads, padding_value],
     )
 
     model = onnx.helper.make_model(graph)

From 8e15c59e8c9ba97ef2f2105f6ea2c6bffa6b5764 Mon Sep 17 00:00:00 2001
From: Alexandre Marques <alexandre@neuralmagic.com>
Date: Tue, 27 Jun 2023 01:34:59 -0400
Subject: [PATCH 12/15] Make initializers distinct

---
 .../test_propagate_embedding_quantization.py        | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/tests/sparseml/exporters/transforms/test_propagate_embedding_quantization.py b/tests/sparseml/exporters/transforms/test_propagate_embedding_quantization.py
index a6711b4a619..699c4e834e0 100644
--- a/tests/sparseml/exporters/transforms/test_propagate_embedding_quantization.py
+++ b/tests/sparseml/exporters/transforms/test_propagate_embedding_quantization.py
@@ -38,8 +38,11 @@ def onnx_model():
     starts = onnx.helper.make_tensor("starts", onnx.TensorProto.INT64, (1,), [0])
     ends = onnx.helper.make_tensor("ends", onnx.TensorProto.INT64, (1,), [1])
     pads = onnx.helper.make_tensor("pads", onnx.TensorProto.INT64, (1,), [1])
-    padding_value = onnx.helper.make_tensor(
-        "padding_value", onnx.TensorProto.FLOAT, (1,), [0.0]
+    padding1_value = onnx.helper.make_tensor(
+        "padding1_value", onnx.TensorProto.FLOAT, (1,), [0.0]
+    )
+    padding2_value = onnx.helper.make_tensor(
+        "padding2_value", onnx.TensorProto.FLOAT, (1,), [0.0]
     )
     embeddings = onnx.helper.make_tensor(
         "embeddings", onnx.TensorProto.UINT8, (1,), [0]
@@ -58,13 +61,13 @@ def onnx_model():
         "Slice", ["dequant_output", "starts", "ends"], ["slice1_output"], name="slice1"
     )
     pad1 = onnx.helper.make_node(
-        "Pad", ["slice1_output", "pads", "padding_value"], ["pad1_output"], name="pad1"
+        "Pad", ["slice1_output", "pads", "padding1_value"], ["pad1_output"], name="pad1"
     )
     slice2 = onnx.helper.make_node(
         "Slice", ["dequant_output", "starts", "ends"], ["slice2_output"], name="slice2"
     )
     pad2 = onnx.helper.make_node(
-        "Pad", ["slice2_output", "pads", "padding_value"], ["pad2_output"], name="pad2"
+        "Pad", ["slice2_output", "pads", "padding2_value"], ["pad2_output"], name="pad2"
     )
     concat = onnx.helper.make_node(
         "Concat",
@@ -79,7 +82,7 @@ def onnx_model():
         name="g",
         inputs=[model_input],
         outputs=[model_output],
-        initializer=[scale, zero_point, starts, ends, embeddings, pads, padding_value],
+        initializer=[scale, zero_point, starts, ends, embeddings, pads, padding1_value, padding2_value],
     )
 
     model = onnx.helper.make_model(graph)

From 1cdbacb9e46650d85405b7a79c746057a8db8918 Mon Sep 17 00:00:00 2001
From: Alexandre Marques <alexandre@neuralmagic.com>
Date: Tue, 27 Jun 2023 01:41:19 -0400
Subject: [PATCH 13/15] Style and quality fixes

---
 .../test_propagate_embedding_quantization.py          | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/tests/sparseml/exporters/transforms/test_propagate_embedding_quantization.py b/tests/sparseml/exporters/transforms/test_propagate_embedding_quantization.py
index 699c4e834e0..e2c8c308c1d 100644
--- a/tests/sparseml/exporters/transforms/test_propagate_embedding_quantization.py
+++ b/tests/sparseml/exporters/transforms/test_propagate_embedding_quantization.py
@@ -82,7 +82,16 @@ def onnx_model():
         name="g",
         inputs=[model_input],
         outputs=[model_output],
-        initializer=[scale, zero_point, starts, ends, embeddings, pads, padding1_value, padding2_value],
+        initializer=[
+            scale,
+            zero_point,
+            starts,
+            ends,
+            embeddings,
+            pads,
+            padding1_value,
+            padding2_value,
+        ],
     )
 
     model = onnx.helper.make_model(graph)

From 7629f5ef3a0c4ba4313b4ac8779d7e56428e9fea Mon Sep 17 00:00:00 2001
From: Alexandre Marques <alexandre@neuralmagic.com>
Date: Wed, 5 Jul 2023 13:59:52 -0400
Subject: [PATCH 14/15] Make bias optional for Conv QAT conversion

---
 .../conv_to_convinteger_add_cast_mul.py       | 41 +++++++++++++++++--
 1 file changed, 37 insertions(+), 4 deletions(-)

diff --git a/src/sparseml/exporters/transforms/conv_to_convinteger_add_cast_mul.py b/src/sparseml/exporters/transforms/conv_to_convinteger_add_cast_mul.py
index fb52bf3432c..85c4e0362cf 100644
--- a/src/sparseml/exporters/transforms/conv_to_convinteger_add_cast_mul.py
+++ b/src/sparseml/exporters/transforms/conv_to_convinteger_add_cast_mul.py
@@ -21,6 +21,7 @@
     add_quantized_conv_matmul_add_ops,
     get_quantization_params,
     get_structural_matches,
+    optional_node,
 )
 from sparseml.onnx.utils import ONNXGraph
 
@@ -66,7 +67,9 @@ class ConvToConvIntegerAddCastMul(OnnxTransform):
 
     def transform(self, model: ModelProto) -> ModelProto:
         graph = ONNXGraph(model)
-        matches = get_structural_matches(
+
+        # Conv nodes whose third input (bias) is an initializer
+        matches_bias = get_structural_matches(
             graph,
             parent_ops=[
                 ["DequantizeLinear"],
@@ -78,20 +81,50 @@ def transform(self, model: ModelProto) -> ModelProto:
                 ],
                 [
                     # bias should be initializer
-                    INITIALIZER_MATCH
+                    INITIALIZER_MATCH,
                 ],
             ],
             op_type="Conv",
         )
+
+        # Conv nodes without a bias (matched on input and weight only)
+        matches_no_bias = get_structural_matches(
+            graph,
+            parent_ops=[
+                ["DequantizeLinear"],
+                [
+                    # weight should be initializer
+                    INITIALIZER_MATCH,
+                    "QuantizeLinear",
+                    "DequantizeLinear",
+                ],
+            ],
+            op_type="Conv",
+        )
+
+        matches = matches_bias
+        matches_names = [m.node.name for m in matches]
+        for match in matches_no_bias:
+            if match.node.name not in matches_names:
+                matches.append(match)
+
         for match in matches:
             self.log_match(match)
             self._transform_match(graph, model, match)
         return model
 
-    def _transform_match(self, graph: ONNXGraph, model: ModelProto, match: MatchResult):
+    def _transform_match(
+        self,
+        graph: ONNXGraph,
+        model: ModelProto,
+        match: MatchResult,
+    ):
         (input_dequant,) = match.parents[0]
         weight_init, weight_quantize_node, weight_dequantize_node = match.parents[1]
-        (bias_init,) = match.parents[2]
+        if len(match.parents) == 3:
+            (bias_init,) = match.parents[2]
+        else:
+            bias_init = None
 
         model = add_quantized_conv_matmul_add_ops(
             model=model,

From d189627d52924c6c00b4706bbb09ce44fb821d42 Mon Sep 17 00:00:00 2001
From: Alexandre Marques <alexandre@neuralmagic.com>
Date: Wed, 5 Jul 2023 16:39:26 -0400
Subject: [PATCH 15/15] Quality fix

---
 .../exporters/transforms/conv_to_convinteger_add_cast_mul.py     | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/sparseml/exporters/transforms/conv_to_convinteger_add_cast_mul.py b/src/sparseml/exporters/transforms/conv_to_convinteger_add_cast_mul.py
index 85c4e0362cf..f19e1dd8642 100644
--- a/src/sparseml/exporters/transforms/conv_to_convinteger_add_cast_mul.py
+++ b/src/sparseml/exporters/transforms/conv_to_convinteger_add_cast_mul.py
@@ -21,7 +21,6 @@
     add_quantized_conv_matmul_add_ops,
     get_quantization_params,
     get_structural_matches,
-    optional_node,
 )
 from sparseml.onnx.utils import ONNXGraph