From 031125fdbdc79da4cc7f57ca7f5b9420a3e6f055 Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Fri, 23 Jun 2023 17:28:00 -0400 Subject: [PATCH 01/15] Add transformation to propagate dequantize op through split --- src/sparseml/exporters/onnx_to_deepsparse.py | 1 + src/sparseml/exporters/transforms/__init__.py | 1 + .../propagate_dequant_through_split.py | 75 +++++++++++++++++++ 3 files changed, 77 insertions(+) create mode 100644 src/sparseml/exporters/transforms/propagate_dequant_through_split.py diff --git a/src/sparseml/exporters/onnx_to_deepsparse.py b/src/sparseml/exporters/onnx_to_deepsparse.py index 78651e39c74..fadad423b5c 100644 --- a/src/sparseml/exporters/onnx_to_deepsparse.py +++ b/src/sparseml/exporters/onnx_to_deepsparse.py @@ -75,6 +75,7 @@ def __init__( sparseml_transforms.DeleteRepeatedQdq(), sparseml_transforms.QuantizeQATEmbedding(), sparseml_transforms.PropagateEmbeddingQuantization(), + sparseml_transforms.PropagateDequantThroughSplit(), sparseml_transforms.MatMulToQLinearMatMul(), sparseml_transforms.MatMulAddToMatMulIntegerAddCastMul(), sparseml_transforms.MatMulToMatMulIntegerCastMul(), diff --git a/src/sparseml/exporters/transforms/__init__.py b/src/sparseml/exporters/transforms/__init__.py index 459c86083e4..4daa38a4a6a 100644 --- a/src/sparseml/exporters/transforms/__init__.py +++ b/src/sparseml/exporters/transforms/__init__.py @@ -41,6 +41,7 @@ from .matmul_add_to_matmulinteger_add_cast_mul import MatMulAddToMatMulIntegerAddCastMul from .matmul_to_matmulinteger_cast_mul import MatMulToMatMulIntegerCastMul from .propagate_embedding_quantization import PropagateEmbeddingQuantization +from .propagate_dequant_through_split import PropagateDequantThroughSplit from .quantize_qat_embedding import QuantizeQATEmbedding from .quantize_residuals import QuantizeResiduals from .remove_duplicate_qconv_weights import RemoveDuplicateQConvWeights diff --git a/src/sparseml/exporters/transforms/propagate_dequant_through_split.py b/src/sparseml/exporters/transforms/propagate_dequant_through_split.py new file mode 100644 index 00000000000..231dca36dbb --- /dev/null +++ b/src/sparseml/exporters/transforms/propagate_dequant_through_split.py @@ -0,0 +1,75 @@ +from onnx import ModelProto +import onnx + +from sparseml.exporters.transforms import OnnxTransform +from sparseml.exporters.transforms.utils import ( + MatchResult, + get_structural_matches, +) +from sparseml.onnx.utils import ONNXGraph + +__all__ = ["PropagateDequantThroughSplit"] + +class PropagateDequantThroughSplit(OnnxTransform): + """ + A pass for propagating DequantizeLinear nodes down through a split node + so if there are quantized operations after the split they can + be properly converted. + Starting with: + | INPUT + | | + | DequantizeLinear + | | + | Split + | | | | + Converts to: + | INPUT + | | + | Split + | | | | + | DequantizeLinear DequantizeLinear DequantizeLinear + | | | | + """ + def transform(self, model: ModelProto) -> ModelProto: + graph = ONNXGraph(model) + matches = get_structural_matches( + graph, + parent_ops=[["DequantizeLinear"]], + op_type="Split", + ) + for match in matches: + self.log_match(match) + self._transform_match( model, match) + return model + + def _transform_match(self, model: ModelProto, match: MatchResult): + + # Loop through the nodes that are children of the Split node + # For every child, create a DequantizeLinear node and insert between Split and child + for split_output_id in range(len(match.node.output)): + dequant_node_name = match.node.name + f"_dequant.{split_output_id}" + dequant_node_output = match.node.output[split_output_id] + dequant_node_input = dequant_node_name + "_input" + + # Input to DequantizeLinear node is the output of the Split node + model.graph.node.append( + onnx.helper.make_node( + "DequantizeLinear", + [ + dequant_node_input, # input + match.parents[0][0].input[1], # scale + match.parents[0][0].input[2], # zero point + ], + [dequant_node_output], + dequant_node_name, + ) + ) + + # Replace the output of the Split node with the input of the new DequantizeLinear node + match.node.output[split_output_id] = dequant_node_input + + # Set the input to the Split node to what was the input of the original DequantizeLinear node + match.node.input[0] = match.parents[0][0].input[0] + + # Remove original DequantizeLinear node + self.delete_node_deferred(match.parents[0][0]) From c54c289a8bd72fc5663b0363ec4f55ab5b06bf3d Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Fri, 23 Jun 2023 17:28:51 -0400 Subject: [PATCH 02/15] Remove requirement that QuantizeLinear must be next to DequantizeLinear for input branch of Conv node --- .../exporters/transforms/conv_to_convinteger_add_cast_mul.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/sparseml/exporters/transforms/conv_to_convinteger_add_cast_mul.py b/src/sparseml/exporters/transforms/conv_to_convinteger_add_cast_mul.py index c8e1e693c04..fb52bf3432c 100644 --- a/src/sparseml/exporters/transforms/conv_to_convinteger_add_cast_mul.py +++ b/src/sparseml/exporters/transforms/conv_to_convinteger_add_cast_mul.py @@ -69,7 +69,7 @@ def transform(self, model: ModelProto) -> ModelProto: matches = get_structural_matches( graph, parent_ops=[ - ["QuantizeLinear", "DequantizeLinear"], + ["DequantizeLinear"], [ # weight should be initializer INITIALIZER_MATCH, @@ -89,7 +89,7 @@ def transform(self, model: ModelProto) -> ModelProto: return model def _transform_match(self, graph: ONNXGraph, model: ModelProto, match: MatchResult): - input_quant, input_dequant = match.parents[0] + (input_dequant,) = match.parents[0] weight_init, weight_quantize_node, weight_dequantize_node = match.parents[1] (bias_init,) = match.parents[2] From 20632ce3bb8ebce19b431d7fce5f4ce2dda73d76 Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Fri, 23 Jun 2023 18:11:57 -0400 Subject: [PATCH 03/15] Fixed embedding quantization propagation --- .../propagate_embedding_quantization.py | 38 ++++++++++++++++--- 1 file changed, 33 insertions(+), 5 deletions(-) diff --git a/src/sparseml/exporters/transforms/propagate_embedding_quantization.py b/src/sparseml/exporters/transforms/propagate_embedding_quantization.py index 0ca82d5e995..be3ed607c78 100644 --- a/src/sparseml/exporters/transforms/propagate_embedding_quantization.py +++ b/src/sparseml/exporters/transforms/propagate_embedding_quantization.py @@ -15,6 +15,7 @@ import logging import numpy +import onnx.numpy_helper from onnx import ModelProto, numpy_helper from sparseml.exporters.transforms.onnx_transform import OnnxTransform @@ -79,16 +80,19 @@ def transform(self, model: ModelProto) -> ModelProto: ["Concat"], ], ) + + initializer_dict = {i.name: i for i in model.graph.initializer} + for match in matches: (gather,) = match.parents[0] dequant = match.node - slice1, _, concat1 = match.children[0] - slice2, _, concat2 = match.children[1] + slice1, pad1, concat1 = match.children[0] + slice2, pad2, concat2 = match.children[1] (concat,) = match.children[2] # check for uint8 initializer indices = graph.get_init_by_name(gather.input[0]) - if indices is None or numpy_helper.to_array(indices).dtype != numpy.uint8: + if indices is None or numpy_helper.to_array(indices).dtype not in [numpy.uint8, numpy.int8]: continue # check that all concats are the same @@ -97,11 +101,35 @@ def transform(self, model: ModelProto) -> ModelProto: self.log_match(match) - assert concat.input[2] == dequant.output[0] - concat.input[2] = gather.output[0] + for id, input_name in enumerate(concat.input): + if input_name == dequant.output[0]: + break + + concat.input[id] = gather.output[0] slice1.input[0] = gather.output[0] slice2.input[0] = gather.output[0] + zero_point_initializer = initializer_dict[match.node.input[2]] + zero_point = onnx.numpy_helper.to_array(zero_point_initializer) + + pad1_value_initializer = initializer_dict[pad1.input[2]] + pad1_value = onnx.numpy_helper.to_array(pad1_value_initializer) + pad1_value = pad1_value.astype(zero_point.dtype) + zero_point + new_pad1_value_initializer = numpy_helper.from_array( + pad1_value, name=pad1_value_initializer.name + ) + model.graph.initializer.remove(pad1_value_initializer) + model.graph.initializer.append(new_pad1_value_initializer) + + pad2_value_initializer = initializer_dict[pad2.input[2]] + pad2_value = onnx.numpy_helper.to_array(pad2_value_initializer) + pad2_value = pad2_value.astype(zero_point.dtype) + zero_point + new_pad2_value_initializer = numpy_helper.from_array( + pad2_value, name=pad2_value_initializer.name + ) + model.graph.initializer.remove(pad2_value_initializer) + model.graph.initializer.append(new_pad2_value_initializer) + tmp = concat.output[0] concat.output[0] = dequant.output[0] dequant.output[0] = tmp From 4384a11d8e07b11cf9768f9f5d820f0b2ed049c8 Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Fri, 23 Jun 2023 18:15:24 -0400 Subject: [PATCH 04/15] Quality fixes --- .../propagate_dequant_through_split.py | 71 ++++++++++++------- .../propagate_embedding_quantization.py | 5 +- 2 files changed, 48 insertions(+), 28 deletions(-) diff --git a/src/sparseml/exporters/transforms/propagate_dequant_through_split.py b/src/sparseml/exporters/transforms/propagate_dequant_through_split.py index 231dca36dbb..7c65117f710 100644 --- a/src/sparseml/exporters/transforms/propagate_dequant_through_split.py +++ b/src/sparseml/exporters/transforms/propagate_dequant_through_split.py @@ -1,35 +1,49 @@ -from onnx import ModelProto +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import onnx +from onnx import ModelProto from sparseml.exporters.transforms import OnnxTransform -from sparseml.exporters.transforms.utils import ( - MatchResult, - get_structural_matches, -) +from sparseml.exporters.transforms.utils import MatchResult, get_structural_matches from sparseml.onnx.utils import ONNXGraph + __all__ = ["PropagateDequantThroughSplit"] + class PropagateDequantThroughSplit(OnnxTransform): """ - A pass for propagating DequantizeLinear nodes down through a split node - so if there are quantized operations after the split they can - be properly converted. - Starting with: - | INPUT - | | - | DequantizeLinear - | | - | Split - | | | | - Converts to: - | INPUT - | | - | Split - | | | | - | DequantizeLinear DequantizeLinear DequantizeLinear - | | | | - """ + A pass for propagating DequantizeLinear nodes down through a split node + so if there are quantized operations after the split they can + be properly converted. + Starting with: + | INPUT + | | + | DequantizeLinear + | | + | Split + | | | | + Converts to: + | INPUT + | | + | Split + | | | | + | DequantizeLinear DequantizeLinear DequantizeLinear + | | | | + """ + def transform(self, model: ModelProto) -> ModelProto: graph = ONNXGraph(model) matches = get_structural_matches( @@ -39,13 +53,14 @@ def transform(self, model: ModelProto) -> ModelProto: ) for match in matches: self.log_match(match) - self._transform_match( model, match) + self._transform_match(model, match) return model def _transform_match(self, model: ModelProto, match: MatchResult): # Loop through the nodes that are children of the Split node - # For every child, create a DequantizeLinear node and insert between Split and child + # For every child, create a DequantizeLinear node and insert + # between Split and child for split_output_id in range(len(match.node.output)): dequant_node_name = match.node.name + f"_dequant.{split_output_id}" dequant_node_output = match.node.output[split_output_id] @@ -65,10 +80,12 @@ def _transform_match(self, model: ModelProto, match: MatchResult): ) ) - # Replace the output of the Split node with the input of the new DequantizeLinear node + # Replace the output of the Split node with the input of + # the new DequantizeLinear node match.node.output[split_output_id] = dequant_node_input - # Set the input to the Split node to what was the input of the original DequantizeLinear node + # Set the input to the Split node to what was the input of the + # original DequantizeLinear node match.node.input[0] = match.parents[0][0].input[0] # Remove original DequantizeLinear node diff --git a/src/sparseml/exporters/transforms/propagate_embedding_quantization.py b/src/sparseml/exporters/transforms/propagate_embedding_quantization.py index be3ed607c78..7e3dda87120 100644 --- a/src/sparseml/exporters/transforms/propagate_embedding_quantization.py +++ b/src/sparseml/exporters/transforms/propagate_embedding_quantization.py @@ -92,7 +92,10 @@ def transform(self, model: ModelProto) -> ModelProto: # check for uint8 initializer indices = graph.get_init_by_name(gather.input[0]) - if indices is None or numpy_helper.to_array(indices).dtype not in [numpy.uint8, numpy.int8]: + if indices is None or numpy_helper.to_array(indices).dtype not in [ + numpy.uint8, + numpy.int8, + ]: continue # check that all concats are the same From a98bd91e972ad1bdab8a8871fbb4dbd2989550e7 Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Tue, 27 Jun 2023 00:33:31 -0400 Subject: [PATCH 05/15] Add zero point to dequant node --- .../transforms/test_propagate_embedding_quantization.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/sparseml/exporters/transforms/test_propagate_embedding_quantization.py b/tests/sparseml/exporters/transforms/test_propagate_embedding_quantization.py index 9111d75e42d..300744d5cc3 100644 --- a/tests/sparseml/exporters/transforms/test_propagate_embedding_quantization.py +++ b/tests/sparseml/exporters/transforms/test_propagate_embedding_quantization.py @@ -32,6 +32,7 @@ def onnx_model(): "output", onnx.TensorProto.FLOAT, (1,) ) scale = onnx.helper.make_tensor("scale", onnx.TensorProto.FLOAT, (1,), [1.0]) + zero_point = onnx.helper.make_tensor("zero point", onnx.TensorProto.INT8, (1,), [0.0]) starts = onnx.helper.make_tensor("starts", onnx.TensorProto.INT64, (1,), [0]) ends = onnx.helper.make_tensor("ends", onnx.TensorProto.INT64, (1,), [1]) pads = onnx.helper.make_tensor("pads", onnx.TensorProto.INT64, (1,), [1]) @@ -43,7 +44,7 @@ def onnx_model(): ) dequant = onnx.helper.make_node( "DequantizeLinear", - ["gather_output", "scale"], + ["gather_output", "scale", "zero_point"], ["dequant_output"], name="dequant", ) From 32db97be427e8b688a48ee9ae0424b68f6f62e71 Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Tue, 27 Jun 2023 00:34:51 -0400 Subject: [PATCH 06/15] Add zero point to initializers --- .../transforms/test_propagate_embedding_quantization.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/sparseml/exporters/transforms/test_propagate_embedding_quantization.py b/tests/sparseml/exporters/transforms/test_propagate_embedding_quantization.py index 300744d5cc3..ca48489ad77 100644 --- a/tests/sparseml/exporters/transforms/test_propagate_embedding_quantization.py +++ b/tests/sparseml/exporters/transforms/test_propagate_embedding_quantization.py @@ -32,7 +32,7 @@ def onnx_model(): "output", onnx.TensorProto.FLOAT, (1,) ) scale = onnx.helper.make_tensor("scale", onnx.TensorProto.FLOAT, (1,), [1.0]) - zero_point = onnx.helper.make_tensor("zero point", onnx.TensorProto.INT8, (1,), [0.0]) + zero_point = onnx.helper.make_tensor("zero_point", onnx.TensorProto.INT8, (1,), [0.0]) starts = onnx.helper.make_tensor("starts", onnx.TensorProto.INT64, (1,), [0]) ends = onnx.helper.make_tensor("ends", onnx.TensorProto.INT64, (1,), [1]) pads = onnx.helper.make_tensor("pads", onnx.TensorProto.INT64, (1,), [1]) @@ -74,7 +74,7 @@ def onnx_model(): name="g", inputs=[model_input], outputs=[model_output], - initializer=[scale, starts, ends, embeddings, pads], + initializer=[scale, zero_point, starts, ends, embeddings, pads], ) model = onnx.helper.make_model(graph) From 8e18613303fe1468761950ba40236e29952ce028 Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Tue, 27 Jun 2023 00:38:31 -0400 Subject: [PATCH 07/15] Style fixes --- .../transforms/test_propagate_embedding_quantization.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/sparseml/exporters/transforms/test_propagate_embedding_quantization.py b/tests/sparseml/exporters/transforms/test_propagate_embedding_quantization.py index ca48489ad77..ae66d72745e 100644 --- a/tests/sparseml/exporters/transforms/test_propagate_embedding_quantization.py +++ b/tests/sparseml/exporters/transforms/test_propagate_embedding_quantization.py @@ -32,7 +32,9 @@ def onnx_model(): "output", onnx.TensorProto.FLOAT, (1,) ) scale = onnx.helper.make_tensor("scale", onnx.TensorProto.FLOAT, (1,), [1.0]) - zero_point = onnx.helper.make_tensor("zero_point", onnx.TensorProto.INT8, (1,), [0.0]) + zero_point = onnx.helper.make_tensor( + "zero_point", onnx.TensorProto.INT8, (1,), [0.0] + ) starts = onnx.helper.make_tensor("starts", onnx.TensorProto.INT64, (1,), [0]) ends = onnx.helper.make_tensor("ends", onnx.TensorProto.INT64, (1,), [1]) pads = onnx.helper.make_tensor("pads", onnx.TensorProto.INT64, (1,), [1]) From fdae854fa8c6701805ec2502587eed951ee81d9d Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Tue, 27 Jun 2023 00:48:41 -0400 Subject: [PATCH 08/15] Fix data type --- .../transforms/test_propagate_embedding_quantization.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/sparseml/exporters/transforms/test_propagate_embedding_quantization.py b/tests/sparseml/exporters/transforms/test_propagate_embedding_quantization.py index ae66d72745e..2836e60bbaa 100644 --- a/tests/sparseml/exporters/transforms/test_propagate_embedding_quantization.py +++ b/tests/sparseml/exporters/transforms/test_propagate_embedding_quantization.py @@ -33,7 +33,7 @@ def onnx_model(): ) scale = onnx.helper.make_tensor("scale", onnx.TensorProto.FLOAT, (1,), [1.0]) zero_point = onnx.helper.make_tensor( - "zero_point", onnx.TensorProto.INT8, (1,), [0.0] + "zero_point", onnx.TensorProto.INT8, (1,), [0] ) starts = onnx.helper.make_tensor("starts", onnx.TensorProto.INT64, (1,), [0]) ends = onnx.helper.make_tensor("ends", onnx.TensorProto.INT64, (1,), [1]) From 85c68289932364a4e110c29e1ea4486c053f6b3d Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Tue, 27 Jun 2023 01:16:53 -0400 Subject: [PATCH 09/15] Allow MatMul weight to be on either input 0 or 1 --- ...atmul_add_to_matmulinteger_add_cast_mul.py | 43 +++++++++++++++++-- 1 file changed, 40 insertions(+), 3 deletions(-) diff --git a/src/sparseml/exporters/transforms/matmul_add_to_matmulinteger_add_cast_mul.py b/src/sparseml/exporters/transforms/matmul_add_to_matmulinteger_add_cast_mul.py index 2cef33a6226..bc49a925ebc 100644 --- a/src/sparseml/exporters/transforms/matmul_add_to_matmulinteger_add_cast_mul.py +++ b/src/sparseml/exporters/transforms/matmul_add_to_matmulinteger_add_cast_mul.py @@ -67,6 +67,37 @@ class MatMulAddToMatMulIntegerAddCastMul(OnnxTransform): def transform(self, model: ModelProto) -> ModelProto: graph = ONNXGraph(model) + + # Weight on input 0 + matches = get_structural_matches( + graph, + op_type="MatMul", + parent_ops=[ + [ + # weight should be initializer + INITIALIZER_MATCH, + "QuantizeLinear", + "DequantizeLinear", + optional_node("Transpose"), + ], + [any_of("QuantizeLinear", "DequantizeLinear")], + ], + children_ops=[[optional_node("Add")]], + ) + for match in matches: + add_node = match.children[0][0] + bias_init = None + if add_node: + # NOTE: bias could be either input 0 or 1 of add node + # if add does not have a bias initializer, + # still do conversion, but do not fold the bias add to rescale + bias_init = graph.get_init_by_name(match.children[0][0].input[1]) + if bias_init is None: + bias_init = graph.get_init_by_name(match.children[0][0].input[0]) + self.log_match(match) + self._transform_match(graph, model, match, bias_init, 0) + + # Weight on input 1 matches = get_structural_matches( graph, op_type="MatMul", @@ -93,7 +124,8 @@ def transform(self, model: ModelProto) -> ModelProto: if bias_init is None: bias_init = graph.get_init_by_name(match.children[0][0].input[0]) self.log_match(match) - self._transform_match(graph, model, match, bias_init) + self._transform_match(graph, model, match, bias_init, 1) + return model def _transform_match( @@ -102,10 +134,15 @@ def _transform_match( model: ModelProto, match: MatchResult, bias_init: TensorProto, + weight_parent: int, ): matmul = match.node - (input_quant,) = match.parents[0] - weight_init, weight_quant, weight_dequant, opt_transpose = match.parents[1] + if weight_parent == 0: + (input_quant,) = match.parents[1] + weight_init, weight_quant, weight_dequant, opt_transpose = match.parents[0] + else: + (input_quant,) = match.parents[0] + weight_init, weight_quant, weight_dequant, opt_transpose = match.parents[1] (add,) = match.children[0] input_quantize_params = get_quantization_params( From f2bf1d78b15b0c95e9ddc28b0617d78ec6b0b926 Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Tue, 27 Jun 2023 01:20:21 -0400 Subject: [PATCH 10/15] Style fixes --- .../transforms/test_propagate_embedding_quantization.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/sparseml/exporters/transforms/test_propagate_embedding_quantization.py b/tests/sparseml/exporters/transforms/test_propagate_embedding_quantization.py index 2836e60bbaa..cb63f82fd3d 100644 --- a/tests/sparseml/exporters/transforms/test_propagate_embedding_quantization.py +++ b/tests/sparseml/exporters/transforms/test_propagate_embedding_quantization.py @@ -32,9 +32,7 @@ def onnx_model(): "output", onnx.TensorProto.FLOAT, (1,) ) scale = onnx.helper.make_tensor("scale", onnx.TensorProto.FLOAT, (1,), [1.0]) - zero_point = onnx.helper.make_tensor( - "zero_point", onnx.TensorProto.INT8, (1,), [0] - ) + zero_point = onnx.helper.make_tensor("zero_point", onnx.TensorProto.INT8, (1,), [0]) starts = onnx.helper.make_tensor("starts", onnx.TensorProto.INT64, (1,), [0]) ends = onnx.helper.make_tensor("ends", onnx.TensorProto.INT64, (1,), [1]) pads = onnx.helper.make_tensor("pads", onnx.TensorProto.INT64, (1,), [1]) From f904e4932b2c963dd10a34fe6901fe24fc8ffa47 Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Tue, 27 Jun 2023 01:29:12 -0400 Subject: [PATCH 11/15] Add padding value --- .../test_propagate_embedding_quantization.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/tests/sparseml/exporters/transforms/test_propagate_embedding_quantization.py b/tests/sparseml/exporters/transforms/test_propagate_embedding_quantization.py index cb63f82fd3d..a6711b4a619 100644 --- a/tests/sparseml/exporters/transforms/test_propagate_embedding_quantization.py +++ b/tests/sparseml/exporters/transforms/test_propagate_embedding_quantization.py @@ -32,10 +32,15 @@ def onnx_model(): "output", onnx.TensorProto.FLOAT, (1,) ) scale = onnx.helper.make_tensor("scale", onnx.TensorProto.FLOAT, (1,), [1.0]) - zero_point = onnx.helper.make_tensor("zero_point", onnx.TensorProto.INT8, (1,), [0]) + zero_point = onnx.helper.make_tensor( + "zero_point", onnx.TensorProto.UINT8, (1,), [128] + ) starts = onnx.helper.make_tensor("starts", onnx.TensorProto.INT64, (1,), [0]) ends = onnx.helper.make_tensor("ends", onnx.TensorProto.INT64, (1,), [1]) pads = onnx.helper.make_tensor("pads", onnx.TensorProto.INT64, (1,), [1]) + padding_value = onnx.helper.make_tensor( + "padding_value", onnx.TensorProto.FLOAT, (1,), [0.0] + ) embeddings = onnx.helper.make_tensor( "embeddings", onnx.TensorProto.UINT8, (1,), [0] ) @@ -53,13 +58,13 @@ def onnx_model(): "Slice", ["dequant_output", "starts", "ends"], ["slice1_output"], name="slice1" ) pad1 = onnx.helper.make_node( - "Pad", ["slice1_output", "pads"], ["pad1_output"], name="pad1" + "Pad", ["slice1_output", "pads", "padding_value"], ["pad1_output"], name="pad1" ) slice2 = onnx.helper.make_node( "Slice", ["dequant_output", "starts", "ends"], ["slice2_output"], name="slice2" ) pad2 = onnx.helper.make_node( - "Pad", ["slice2_output", "pads"], ["pad2_output"], name="pad2" + "Pad", ["slice2_output", "pads", "padding_value"], ["pad2_output"], name="pad2" ) concat = onnx.helper.make_node( "Concat", @@ -74,7 +79,7 @@ def onnx_model(): name="g", inputs=[model_input], outputs=[model_output], - initializer=[scale, zero_point, starts, ends, embeddings, pads], + initializer=[scale, zero_point, starts, ends, embeddings, pads, padding_value], ) model = onnx.helper.make_model(graph) From 8e15c59e8c9ba97ef2f2105f6ea2c6bffa6b5764 Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Tue, 27 Jun 2023 01:34:59 -0400 Subject: [PATCH 12/15] Make initializers distinct --- .../test_propagate_embedding_quantization.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/tests/sparseml/exporters/transforms/test_propagate_embedding_quantization.py b/tests/sparseml/exporters/transforms/test_propagate_embedding_quantization.py index a6711b4a619..699c4e834e0 100644 --- a/tests/sparseml/exporters/transforms/test_propagate_embedding_quantization.py +++ b/tests/sparseml/exporters/transforms/test_propagate_embedding_quantization.py @@ -38,8 +38,11 @@ def onnx_model(): starts = onnx.helper.make_tensor("starts", onnx.TensorProto.INT64, (1,), [0]) ends = onnx.helper.make_tensor("ends", onnx.TensorProto.INT64, (1,), [1]) pads = onnx.helper.make_tensor("pads", onnx.TensorProto.INT64, (1,), [1]) - padding_value = onnx.helper.make_tensor( - "padding_value", onnx.TensorProto.FLOAT, (1,), [0.0] + padding1_value = onnx.helper.make_tensor( + "padding1_value", onnx.TensorProto.FLOAT, (1,), [0.0] + ) + padding2_value = onnx.helper.make_tensor( + "padding2_value", onnx.TensorProto.FLOAT, (1,), [0.0] ) embeddings = onnx.helper.make_tensor( "embeddings", onnx.TensorProto.UINT8, (1,), [0] @@ -58,13 +61,13 @@ def onnx_model(): "Slice", ["dequant_output", "starts", "ends"], ["slice1_output"], name="slice1" ) pad1 = onnx.helper.make_node( - "Pad", ["slice1_output", "pads", "padding_value"], ["pad1_output"], name="pad1" + "Pad", ["slice1_output", "pads", "padding1_value"], ["pad1_output"], name="pad1" ) slice2 = onnx.helper.make_node( "Slice", ["dequant_output", "starts", "ends"], ["slice2_output"], name="slice2" ) pad2 = onnx.helper.make_node( - "Pad", ["slice2_output", "pads", "padding_value"], ["pad2_output"], name="pad2" + "Pad", ["slice2_output", "pads", "padding2_value"], ["pad2_output"], name="pad2" ) concat = onnx.helper.make_node( "Concat", @@ -79,7 +82,7 @@ def onnx_model(): name="g", inputs=[model_input], outputs=[model_output], - initializer=[scale, zero_point, starts, ends, embeddings, pads, padding_value], + initializer=[scale, zero_point, starts, ends, embeddings, pads, padding1_value, padding2_value], ) model = onnx.helper.make_model(graph) From 1cdbacb9e46650d85405b7a79c746057a8db8918 Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Tue, 27 Jun 2023 01:41:19 -0400 Subject: [PATCH 13/15] Style and quality fixes --- .../test_propagate_embedding_quantization.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/tests/sparseml/exporters/transforms/test_propagate_embedding_quantization.py b/tests/sparseml/exporters/transforms/test_propagate_embedding_quantization.py index 699c4e834e0..e2c8c308c1d 100644 --- a/tests/sparseml/exporters/transforms/test_propagate_embedding_quantization.py +++ b/tests/sparseml/exporters/transforms/test_propagate_embedding_quantization.py @@ -82,7 +82,16 @@ def onnx_model(): name="g", inputs=[model_input], outputs=[model_output], - initializer=[scale, zero_point, starts, ends, embeddings, pads, padding1_value, padding2_value], + initializer=[ + scale, + zero_point, + starts, + ends, + embeddings, + pads, + padding1_value, + padding2_value, + ], ) model = onnx.helper.make_model(graph) From 7629f5ef3a0c4ba4313b4ac8779d7e56428e9fea Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Wed, 5 Jul 2023 13:59:52 -0400 Subject: [PATCH 14/15] Make bias optional for Conv QAT conversion --- .../conv_to_convinteger_add_cast_mul.py | 41 +++++++++++++++++-- 1 file changed, 37 insertions(+), 4 deletions(-) diff --git a/src/sparseml/exporters/transforms/conv_to_convinteger_add_cast_mul.py b/src/sparseml/exporters/transforms/conv_to_convinteger_add_cast_mul.py index fb52bf3432c..85c4e0362cf 100644 --- a/src/sparseml/exporters/transforms/conv_to_convinteger_add_cast_mul.py +++ b/src/sparseml/exporters/transforms/conv_to_convinteger_add_cast_mul.py @@ -21,6 +21,7 @@ add_quantized_conv_matmul_add_ops, get_quantization_params, get_structural_matches, + optional_node, ) from sparseml.onnx.utils import ONNXGraph @@ -66,7 +67,9 @@ class ConvToConvIntegerAddCastMul(OnnxTransform): def transform(self, model: ModelProto) -> ModelProto: graph = ONNXGraph(model) - matches = get_structural_matches( + + # Nodes with bias + matches_bias = get_structural_matches( graph, parent_ops=[ ["DequantizeLinear"], @@ -78,20 +81,50 @@ def transform(self, model: ModelProto) -> ModelProto: ], [ # bias should be initializer - INITIALIZER_MATCH + INITIALIZER_MATCH, ], ], op_type="Conv", ) + + # Nodes without bias + matches_no_bias = get_structural_matches( + graph, + parent_ops=[ + ["DequantizeLinear"], + [ + # weight should be initializer + INITIALIZER_MATCH, + "QuantizeLinear", + "DequantizeLinear", + ], + ], + op_type="Conv", + ) + + matches = matches_bias + matches_names = [m.node.name for m in matches] + for match in matches_no_bias: + if match.node.name not in matches_names: + matches.append(match) + for match in matches: self.log_match(match) self._transform_match(graph, model, match) return model - def _transform_match(self, graph: ONNXGraph, model: ModelProto, match: MatchResult): + def _transform_match( + self, + graph: ONNXGraph, + model: ModelProto, + match: MatchResult, + ): (input_dequant,) = match.parents[0] weight_init, weight_quantize_node, weight_dequantize_node = match.parents[1] - (bias_init,) = match.parents[2] + if len(match.parents) == 3: + (bias_init,) = match.parents[2] + else: + bias_init = None model = add_quantized_conv_matmul_add_ops( model=model, From d189627d52924c6c00b4706bbb09ce44fb821d42 Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Wed, 5 Jul 2023 16:39:26 -0400 Subject: [PATCH 15/15] Quality fix --- .../exporters/transforms/conv_to_convinteger_add_cast_mul.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/sparseml/exporters/transforms/conv_to_convinteger_add_cast_mul.py b/src/sparseml/exporters/transforms/conv_to_convinteger_add_cast_mul.py index 85c4e0362cf..f19e1dd8642 100644 --- a/src/sparseml/exporters/transforms/conv_to_convinteger_add_cast_mul.py +++ b/src/sparseml/exporters/transforms/conv_to_convinteger_add_cast_mul.py @@ -21,7 +21,6 @@ add_quantized_conv_matmul_add_ops, get_quantization_params, get_structural_matches, - optional_node, ) from sparseml.onnx.utils import ONNXGraph