Enable TF quantsim per channel range learning
Signed-off-by: Kevin Hsieh <quic_klhsieh@quicinc.com>
quic-klhsieh authored Apr 5, 2022
1 parent c39b22d commit 974eb17
Showing 4 changed files with 289 additions and 25 deletions.
@@ -1174,7 +1174,8 @@ def create_quantize_op():
if self._quant_scheme in [QuantScheme.training_range_learning_with_tf_init,
QuantScheme.training_range_learning_with_tf_enhanced_init]:
with self.session.graph.gradient_override_map(
{"QcQuantize": "QcQuantizeRangeLearningCustomGradient"}):
{"QcQuantize": "QcQuantizeRangeLearningCustomGradient",
"QcQuantizePerChannel": "QcQuantizePerChannelRangeLearningCustomGradient"}):
q_op_out = create_quantize_op()
else:
q_op_out = create_quantize_op()
@@ -1184,7 +1185,8 @@ def create_quantize_op():
if self._quant_scheme in [QuantScheme.training_range_learning_with_tf_init,
QuantScheme.training_range_learning_with_tf_enhanced_init]:
with self.session.graph.gradient_override_map(
{"QcQuantize": "QcQuantizeRangeLearningCustomGradient"}):
{"QcQuantize": "QcQuantizeRangeLearningCustomGradient",
"QcQuantizePerChannel": "QcQuantizePerChannelRangeLearningCustomGradient"}):
q_op_out = create_quantize_op()
else:
q_op_out = create_quantize_op()
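
The two hunks above route the new per-channel op type through TF1's graph-level gradient override mechanism. A minimal sketch of that mechanism, assuming TF 1.x graph mode (the op and gradient names here are illustrative, not AIMET's):

import tensorflow as tf
from tensorflow.python.framework import ops as tf_ops

@tf_ops.RegisterGradient("MyCustomGradient")
def _my_custom_grad(op, grad):
    # A registered gradient function returns one tensor (or None) per op input.
    return grad

graph = tf.Graph()
with graph.as_default():
    # Every "Identity" op created under this scope has its gradient computed by
    # "MyCustomGradient" instead of the default Identity gradient.
    with graph.gradient_override_map({"Identity": "MyCustomGradient"}):
        x = tf.compat.v1.placeholder(tf.float32, shape=(None,))
        y = tf.identity(x)
    dy_dx = tf.gradients(y, x)[0]

This is why both branches register the mapping before create_quantize_op() runs: gradient_override_map only affects ops created inside the with block.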
@@ -164,38 +164,43 @@ def _compute_dloss_by_dmax(x, grad, scaling, offset, bitwidth, use_symmetric_enc
# to single value before returning gradient
# this uses chain rule, multiply by loss and sum it to get scalar.
dq_by_dmax = tf.where(tf.less_equal(n, r_x_by_s_plus_round_o), inner_cond, false_expr)
dloss_by_dmax = tf.reduce_sum(dq_by_dmax * grad)

# If per channel is active, the scaling tensor will be rank 1 (a 1-D array instead of a scalar).
# In the per channel case, we reduce over all but the last dimension; otherwise, we reduce over all dimensions.
dloss_by_dmax = tf.cond(tf.equal(tf.rank(scaling), 0), lambda: tf.reduce_sum(dq_by_dmax * grad),
lambda: tf.reduce_sum(dq_by_dmax * grad, axis=tf.range(0, tf.rank(x) - 1)))
return dloss_by_dmax
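
This tf.cond is the core of the backward-pass change: when scaling is rank 1 (one scale per output channel), dloss_by_dmax must come out as one value per channel, so the reduction keeps the last (channel) axis. A small shape sketch, with illustrative tensor sizes:

import tensorflow as tf

x = tf.random.uniform((3, 3, 4, 8))           # e.g. an HWIO conv kernel with 8 output channels
grad = tf.ones_like(x)                        # upstream gradient, same shape as x
dq_by_dmax = tf.random.uniform((3, 3, 4, 8))  # stand-in for the partial derivative computed above

# Per-tensor (scaling is a scalar): collapse everything to a single value.
per_tensor = tf.reduce_sum(dq_by_dmax * grad)                   # shape ()

# Per-channel (scaling is rank 1): reduce all axes except the last,
# leaving one gradient entry per output channel.
per_channel = tf.reduce_sum(dq_by_dmax * grad,
                            axis=tf.range(0, tf.rank(x) - 1))   # shape (8,)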


@tf_ops.RegisterGradient("QcQuantizeRangeLearningCustomGradient")
def quantsim_custom_grad_learned_grid(op, grad):
# pylint: disable=too-many-locals
def _compute_dloss_by_dmin_dmax_and_dx(inputs: tf.Tensor, bitwidth: tf.Tensor, op_mode: tf.Tensor,
encoding_min: tf.Tensor, encoding_max: tf.Tensor, is_symmetric: tf.Tensor,
grad: tf.Tensor):
"""
Performs custom gradient calculations for trained Quantize op
:param op: Tf operation for which gradients are to be computed
:param grad: Gradient flowing through
Return tensors for dloss_by_dmin, dloss_by_dmax, and dloss_by_dx.
:param inputs: Inputs to op
:param bitwidth: Bitwidth used to quantize
:param op_mode: Op mode (if passthrough, gradient is returned as is)
:param encoding_min: Encoding min value(s); more than one if per channel is active
:param encoding_max: Encoding max value(s); more than one if per channel is active
:param is_symmetric: True if symmetric encodings are used, False otherwise
:param grad: Gradient from child layer
:return: Tensors for dloss_by_dmin, dloss_by_dmax, and dloss_by_dx
"""
# pylint: disable=R0914

# read bitwidth, use_symmetric_encoding_flag,
# encoding_min and encoding_max from the op inputs
x = tf.cast(op.inputs[0], tf.float32)
bitwidth = tf.cast(op.inputs[int(QuantizeOpIndices.bit_width)], tf.float32)
op_mode = tf.cast(op.inputs[int(QuantizeOpIndices.op_mode)], tf.int8)

encoding_min = tf.cast(op.inputs[int(QuantizeOpIndices.encoding_min)], tf.float32)
encoding_max_read = tf.cast(op.inputs[int(QuantizeOpIndices.encoding_max)], tf.float32)

x = tf.cast(inputs, tf.float32)
bitwidth = tf.cast(bitwidth, tf.float32)
op_mode = tf.cast(op_mode, tf.int8)
encoding_min = tf.cast(encoding_min, tf.float32)
encoding_max = tf.cast(encoding_max, tf.float32)
# handle min == max to avoid divide by zero
epsilon = tf.constant(1e-5, dtype=tf.float32)
encoding_max = tf.math.maximum(encoding_max_read, tf.add(encoding_min, epsilon))
encoding_max = tf.math.maximum(encoding_max, tf.add(encoding_min, epsilon))

# compute n, p, scaling and offset params
# choose n based on symmetric or asymmetric flag
# symmetric : -two_pow_bw + 1
# asymmetric : 0
n, p = _get_n_and_p(bitwidth, op.inputs[int(QuantizeOpIndices.use_symmetric_encoding)])
n, p = _get_n_and_p(bitwidth, is_symmetric)
steps = tf.cast(tf.pow(tf.cast(tf.constant(2), tf.float32), bitwidth) - 1, tf.float32)
scaling = tf.cast(((encoding_max - encoding_min) / steps), tf.float32)
rounded_offset = tf.round(-encoding_min / scaling) # pylint: disable=invalid-unary-operand-type
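
A worked instance of the grid parameters computed above (numbers illustrative): bitwidth 8 with encoding_min = -1.0 and encoding_max = 1.0. Note that the epsilon clamp a few lines earlier guarantees encoding_max > encoding_min, so scaling can never be zero.

bitwidth = 8.0
encoding_min, encoding_max = -1.0, 1.0

steps = 2 ** bitwidth - 1                        # 255 quantization steps
scaling = (encoding_max - encoding_min) / steps  # 2 / 255 ~= 0.00784
rounded_offset = round(-encoding_min / scaling)  # round(127.5) = 128 (round half to even)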
@@ -211,12 +216,46 @@ def quantsim_custom_grad_learned_grid(op, grad):
inner_cond, # execute if true
tf.zeros_like(r_x_by_s_plus_round_o))) * grad

dloss_by_dmax = tf.cast(_compute_dloss_by_dmax(x, grad, scaling, rounded_offset, bitwidth,
op.inputs[int(QuantizeOpIndices.use_symmetric_encoding)]),
dloss_by_dmax = tf.cast(_compute_dloss_by_dmax(x, grad, scaling, rounded_offset, bitwidth, is_symmetric),
tf.float64)
dloss_by_dmin = tf.cast(_compute_dloss_by_dmin_using_dmax(dloss_by_dmax), tf.float64)

# Pass through gradient for skipped ops
dloss_by_dx = tf.cond(tf.equal(op_mode, 3), lambda: grad, lambda: dloss_by_dx)
return dloss_by_dmin, dloss_by_dmax, dloss_by_dx


@tf_ops.RegisterGradient("QcQuantizeRangeLearningCustomGradient")
def quantsim_custom_grad_learned_grid(op, grad):
"""
Performs custom gradient calculations for trained Quantize op
:param op: Tf operation for which gradients are to be computed
:param grad: Gradient flowing through
"""
dloss_by_dmin, dloss_by_dmax, dloss_by_dx = \
_compute_dloss_by_dmin_dmax_and_dx(op.inputs[0],
op.inputs[int(QuantizeOpIndices.bit_width)],
op.inputs[int(QuantizeOpIndices.op_mode)],
op.inputs[int(QuantizeOpIndices.encoding_min)],
op.inputs[int(QuantizeOpIndices.encoding_max)],
op.inputs[int(QuantizeOpIndices.use_symmetric_encoding)],
grad)
return dloss_by_dx, None, None, dloss_by_dmin, dloss_by_dmax, None, None, None


@tf_ops.RegisterGradient("QcQuantizePerChannelRangeLearningCustomGradient")
def quantsim_per_channel_custom_grad_learned_grid(op, grad):
"""
Performs custom gradient calculations for trained QcQuantizePerChannel op
:param op: Tf operation for which gradients are to be computed
:param grad: Gradient flowing through
"""
dloss_by_dmin, dloss_by_dmax, dloss_by_dx = \
_compute_dloss_by_dmin_dmax_and_dx(op.inputs[0],
op.inputs[int(QuantizeOpIndices.bit_width)],
op.inputs[int(QuantizeOpIndices.op_mode)],
op.inputs[int(QuantizeOpIndices.encoding_min)],
op.inputs[int(QuantizeOpIndices.encoding_max)],
op.inputs[int(QuantizeOpIndices.use_symmetric_encoding)],
grad)
return dloss_by_dx, None, None, dloss_by_dmin, dloss_by_dmax, None, None, None
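
Both registrations share one contract: TF passes in the forward op plus the incoming gradient, and the function must return one gradient (or None) per op input, in input order. Only the data tensor and the encoding min/max inputs receive gradients; the min/max positions follow the return tuple above, while the labels on the remaining slots are an assumption:

# return dloss_by_dx, None, None, dloss_by_dmin, dloss_by_dmax, None, None, None
#   inputs[0]  data tensor      -> dloss_by_dx
#   inputs[1]  op mode          -> None (not differentiable; assumed slot)
#   inputs[2]  quantizer handle -> None (assumed slot)
#   inputs[3]  encoding_min     -> dloss_by_dmin (trainable)
#   inputs[4]  encoding_max     -> dloss_by_dmax (trainable)
#   inputs[5:] bitwidth, symmetric flag, ... -> None (assumed slots)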
@@ -47,7 +47,9 @@
from aimet_tensorflow.common.graph_eval import initialize_uninitialized_vars
from aimet_tensorflow.quantsim import QuantizationSimModel
from aimet_tensorflow.examples.test_models import depthwise_conv2d_model
from aimet_tensorflow.utils.constants import QuantizeOpIndices
from aimet_tensorflow.utils.op.conv import WeightTensorUtils
from aimet_common.defs import QuantScheme
from aimet_common.quantsim import calculate_delta_offset

tf.compat.v1.disable_eager_execution()
@@ -482,7 +484,6 @@ def dummy_forward_pass(sess, args):
assert np.allclose(encoding_numpy, encodings, rtol=0.01)



@pytest.mark.cuda
def test_to_compare_time_per_channel_and_per_tensor_quantization(self):
save_config_file_for_per_channel_quantization()
@@ -552,6 +553,153 @@ def dummy_forward_pass(sess, args):
encoding = quantizer_info.get_encoding()
assert isinstance(encoding, list)

# Mark the test below as cuda until per channel on CPU is supported.
@pytest.mark.cuda
def test_per_channel_range_learning(self):
"""
Test to validate per channel range learning
"""
tf.compat.v1.reset_default_graph()
tf.compat.v1.set_random_seed(0)
np.random.seed(0)
with tf.device('/cpu:0'):
inputs = tf.keras.Input(shape=(32, 32, 4,))
conv_op = tf.keras.layers.Conv2D(2, (3, 3),
kernel_initializer=tf.random_uniform_initializer(-1, 2),
bias_initializer='random_uniform',
padding='SAME')(inputs)
relu_op = tf.nn.relu(conv_op)
reshape = tf.keras.layers.Flatten()(relu_op)
_ = tf.keras.layers.Dense(10, bias_initializer='random_uniform')(reshape)

sess = tf.compat.v1.Session(graph=tf.compat.v1.get_default_graph())
initialize_uninitialized_vars(sess)

save_config_file_bias_quantized_for_per_channel_quantization()

# create quantsim model with the per channel quantization config file
sim = QuantizationSimModel(sess, ['input_1'], ['dense/BiasAdd'], use_cuda=True,
quant_scheme=QuantScheme.training_range_learning_with_tf_init,
config_file='./quantsim_config.json')

def dummy_forward_pass(sess, _):
model_output = sess.graph.get_tensor_by_name('dense/BiasAdd_quantized:0')
model_input = sess.graph.get_tensor_by_name('input_1:0')
shape = model_input.shape
dummy_input = np.random.randn(1, shape[1], shape[2], shape[3])
sess.run(model_output, feed_dict={model_input: dummy_input})

conv2d_weight_quant_op = sim.session.graph.get_operation_by_name('conv2d/Conv2D/ReadVariableOp_quantized')
conv2d_output_quant_op = sim.session.graph.get_operation_by_name('conv2d/BiasAdd_quantized')
dense_bias_quant_op = sim.session.graph.get_operation_by_name('dense/BiasAdd/ReadVariableOp_quantized')

# compute initial encodings
sim.compute_encodings(dummy_forward_pass, None)

inp_tensor = sim.session.graph.get_tensor_by_name('input_1:0')
w_shape = inp_tensor.shape
batches = 32
inp_data = np.random.rand(batches, w_shape[1], w_shape[2], w_shape[3])
logits = sim.session.graph.get_tensor_by_name('dense/BiasAdd_quantized:0')

labels = np.random.randint(10, size=batches)
one_hot_labels = np.eye(10)[labels]

with sim.session.graph.as_default():
var_list = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.TRAINABLE_VARIABLES)
labels_placeholder = tf.compat.v1.placeholder(tf.float32, [None, 10], name='labels')
loss = tf.compat.v1.losses.softmax_cross_entropy(onehot_labels=labels_placeholder, logits=logits)

update_ops = []
global_step = tf.compat.v1.train.create_global_step()
initialize_uninitialized_vars(sim.session)

optimizer = tf.compat.v1.train.GradientDescentOptimizer(learning_rate=1e-3)
gradients = optimizer.compute_gradients(loss, var_list)

grad_updates = optimizer.apply_gradients(gradients, global_step=global_step)
update_ops.append(grad_updates)
update_op = tf.group(*update_ops)

conv_inp_tensor = conv2d_weight_quant_op.inputs[0]
grads = tf.gradients(loss, [conv_inp_tensor,
conv2d_weight_quant_op.inputs[QuantizeOpIndices.encoding_min],
conv2d_weight_quant_op.inputs[QuantizeOpIndices.encoding_max],
dense_bias_quant_op.inputs[QuantizeOpIndices.encoding_min],
dense_bias_quant_op.inputs[QuantizeOpIndices.encoding_max]])
_, conv_dqbydmin, conv_dqbydmax, dense_dqbydmin, dense_dqbydmax = grads
conv2d_weight_min_gradient = sim.session.run(conv_dqbydmin,
feed_dict={inp_tensor: inp_data,
labels_placeholder: one_hot_labels})
conv2d_weight_max_gradient = sim.session.run(conv_dqbydmax,
feed_dict={inp_tensor: inp_data,
labels_placeholder: one_hot_labels})
dense_bias_min_gradient = sim.session.run(dense_dqbydmin,
feed_dict={inp_tensor: inp_data,
labels_placeholder: one_hot_labels})
dense_bias_max_gradient = sim.session.run(dense_dqbydmax,
feed_dict={inp_tensor: inp_data,
labels_placeholder: one_hot_labels})

assert len(conv2d_weight_min_gradient) == 2
assert len(conv2d_weight_max_gradient) == 2
assert len(dense_bias_min_gradient) == 10
assert len(dense_bias_max_gradient) == 10

weights_before_train = sim.session.run(conv2d_weight_quant_op.inputs[0])
encoding_min_before_train = sim.session.run(conv2d_weight_quant_op.inputs[QuantizeOpIndices.encoding_min])
encoding_max_before_train = sim.session.run(conv2d_weight_quant_op.inputs[QuantizeOpIndices.encoding_max])
conv2d_output_encoding_min_before_train = sim.session.run(conv2d_output_quant_op.inputs[
QuantizeOpIndices.encoding_min])
conv2d_output_encoding_max_before_train = sim.session.run(conv2d_output_quant_op.inputs[
QuantizeOpIndices.encoding_max])
dense_bias_encoding_min_before_train = \
sim.session.run(dense_bias_quant_op.inputs[QuantizeOpIndices.encoding_min])
dense_bias_encoding_max_before_train = \
sim.session.run(dense_bias_quant_op.inputs[QuantizeOpIndices.encoding_max])
with tf.control_dependencies([update_op]):
train_op = tf.identity(loss, name='train_op')

for quant_op_name in sim._param_quantizers.keys():
print(quant_op_name + '_min_before_train = ' + str(sim.session.run(
sim.session.graph.get_operation_by_name(quant_op_name).inputs[QuantizeOpIndices.encoding_min])))
print(quant_op_name + '_max_before_train = ' + str(sim.session.run(
sim.session.graph.get_operation_by_name(quant_op_name).inputs[QuantizeOpIndices.encoding_max])))

# start training
_ = sim.session.run(train_op, feed_dict={inp_tensor: inp_data, labels_placeholder: one_hot_labels})

for quant_op_name in sim._param_quantizers.keys():
print(quant_op_name + '_min = ' + str(sim.session.run(sim.session.graph.get_operation_by_name
(quant_op_name).inputs[
QuantizeOpIndices.encoding_min])))
print(quant_op_name + '_max = ' + str(sim.session.run(sim.session.graph.get_operation_by_name
(quant_op_name).inputs[
QuantizeOpIndices.encoding_max])))

weights_after_train = sim.session.run(conv2d_weight_quant_op.inputs[0])
conv2d_output_encoding_min_after_train = sim.session.run(conv2d_output_quant_op.inputs[
QuantizeOpIndices.encoding_min])
conv2d_output_encoding_max_after_train = sim.session.run(conv2d_output_quant_op.inputs[
QuantizeOpIndices.encoding_max])
encoding_min_after_train = sim.session.run(conv2d_weight_quant_op.inputs[QuantizeOpIndices.encoding_min])
encoding_max_after_train = sim.session.run(conv2d_weight_quant_op.inputs[QuantizeOpIndices.encoding_max])
dense_bias_encoding_min_after_train = \
sim.session.run(dense_bias_quant_op.inputs[QuantizeOpIndices.encoding_min])
dense_bias_encoding_max_after_train = \
sim.session.run(dense_bias_quant_op.inputs[QuantizeOpIndices.encoding_max])

assert not np.allclose(weights_before_train, weights_after_train, atol=1e-6)
assert not np.array_equal(encoding_min_before_train, encoding_min_after_train)
assert not np.array_equal(encoding_max_before_train, encoding_max_after_train)
assert not np.array_equal(conv2d_output_encoding_min_before_train, conv2d_output_encoding_min_after_train)
assert not np.array_equal(conv2d_output_encoding_max_before_train, conv2d_output_encoding_max_after_train)
assert not np.array_equal(dense_bias_encoding_min_before_train, dense_bias_encoding_min_after_train)
assert not np.array_equal(dense_bias_encoding_max_before_train, dense_bias_encoding_max_after_train)

sess.close()
sim.session.close()


def save_config_file_for_per_channel_quantization():
quantsim_config = {
Expand All @@ -578,6 +726,31 @@ def save_config_file_for_per_channel_quantization():
with open('./quantsim_config.json', 'w') as f:
json.dump(quantsim_config, f)


def save_config_file_bias_quantized_for_per_channel_quantization():
quantsim_config = {
"defaults": {
"ops": {
"is_output_quantized": "True",
"is_symmetric": "False"
},
"params": {
"is_quantized": "True",
"is_symmetric": "False"
},
"per_channel_quantization": "True",
},
"params": {},
"op_type": {},
"supergroups": [],
"model_input": {},
"model_output": {}
}

with open('./quantsim_config.json', 'w') as f:
json.dump(quantsim_config, f)
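
A usage sketch, mirroring the test above: the JSON written here is consumed when the sim is constructed, and its per_channel_quantization flag is what switches the QcQuantizePerChannel ops (and hence the per-channel gradient path) on.

save_config_file_bias_quantized_for_per_channel_quantization()
sim = QuantizationSimModel(sess, ['input_1'], ['dense/BiasAdd'], use_cuda=True,
                           quant_scheme=QuantScheme.training_range_learning_with_tf_init,
                           config_file='./quantsim_config.json')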


def compute_tf_encodings(sess, op, axis):

data = WeightTensorUtils.get_tensor_as_numpy_data(sess, op)