Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improvements in the quantizer and dequantization kernel #1061

Merged
Merged 2 commits on May 2, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 8 additions & 18 deletions mlx/backend/metal/kernels/quantized.metal
Original file line number Diff line number Diff line change
Expand Up @@ -205,13 +205,10 @@ qouter(const thread uint8_t* w, U x, U scale, U bias, thread U* result) {
}

else if (bits == 4) {
const thread uint16_t* ws = (const thread uint16_t*)w;
U s[4] = {scale, scale / 16.0f, scale / 256.0f, scale / 4096.0f};
for (int i = 0; i < (values_per_thread / 4); i++) {
result[4 * i] += x * (s[0] * (ws[i] & 0x000f) + bias);
result[4 * i + 1] += x * (s[1] * (ws[i] & 0x00f0) + bias);
result[4 * i + 2] += x * (s[2] * (ws[i] & 0x0f00) + bias);
result[4 * i + 3] += x * (s[3] * (ws[i] & 0xf000) + bias);
U s[2] = {scale, scale / 16.0f};
for (int i = 0; i < (values_per_thread / 2); i++) {
result[2 * i] += x * (s[0] * (w[i] & 0x0f) + bias);
result[2 * i + 1] += x * (s[1] * (w[i] & 0xf0) + bias);
}
}

Expand Down Expand Up @@ -244,17 +241,10 @@ dequantize(const device uint8_t* w, U scale, U bias, threadgroup U* w_local) {
}

else if (bits == 4) {
const device uint16_t* ws = (const device uint16_t*)w;
U s[4] = {
scale,
scale / static_cast<U>(16.0f),
scale / static_cast<U>(256.0f),
scale / static_cast<U>(4096.0f)};
for (int i = 0; i < (N / 4); i++) {
w_local[4 * i] = s[0] * (ws[i] & 0x000f) + bias;
w_local[4 * i + 1] = s[1] * (ws[i] & 0x00f0) + bias;
w_local[4 * i + 2] = s[2] * (ws[i] & 0x0f00) + bias;
w_local[4 * i + 3] = s[3] * (ws[i] & 0xf000) + bias;
U s[2] = {scale, scale / static_cast<U>(16.0f)};
for (int i = 0; i < (N / 2); i++) {
w_local[2 * i] = s[0] * (w[i] & 0x0f) + bias;
w_local[2 * i + 1] = s[1] * (w[i] & 0xf0) + bias;
}
}

Expand Down
24 changes: 16 additions & 8 deletions mlx/ops.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3275,7 +3275,9 @@ std::tuple<array, array, array> quantize(
}

// Compute some constants used for the quantization
int n_bins = (1 << bits) - 1; // 2**bits - 1
array n_bins((1 << bits) - 1, w.dtype()); // 2**bits - 1
array eps(1e-7, w.dtype());
array zero(0, w.dtype());
int el_per_int = 32 / bits;
array shifts = power(array(2, uint32), arange(0, 32, bits, uint32, s), s);
shifts = reshape(shifts, {1, 1, -1}, s);
Expand All @@ -3299,16 +3301,22 @@ std::tuple<array, array, array> quantize(
reshape(w, {w.shape(0), w.shape(1) / group_size, group_size}, s);
array w_max = max(packed_w, /* axis= */ -1, /* keepdims= */ true, s);
array w_min = min(packed_w, /* axis= */ -1, /* keepdims= */ true, s);
array scales = maximum(
divide(subtract(w_max, w_min, s), array(n_bins, w.dtype()), s),
array(1e-7, w.dtype()),
s);
// making sure that 0 is represented exactly in the resulting quantization
array biases = multiply(round(divide(w_min, scales, s), s), scales, s);

array mask = greater(abs(w_min, s), abs(w_max, s), s);
array scales = maximum(divide(subtract(w_max, w_min, s), n_bins, s), eps, s);
scales = where(mask, scales, negative(scales), s);
array edge = where(mask, w_min, w_max, s);
array q0 = round(divide(edge, scales, s), s);
scales = where(not_equal(q0, zero, s), divide(edge, q0, s), scales);
array biases = where(equal(q0, zero, s), zero, edge);

// Quantize and pack w
packed_w = astype(
round(divide(subtract(packed_w, biases, s), scales, s), s), uint32);
clip(
round(divide(subtract(packed_w, biases, s), scales, s), s),
zero,
n_bins),
uint32);
packed_w = reshape(packed_w, {w.shape(0), -1, el_per_int}, s);
packed_w = sum(
multiply(packed_w, shifts, s), /* axis= */ 2, /* keepdims= */ false, s);
Expand Down
2 changes: 1 addition & 1 deletion python/tests/test_quantized.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ def test_quantize_dequantize(self):
w_hat = mx.dequantize(w_q, scales, biases, gs, b)
errors = (w - w_hat).abs().reshape(*scales.shape, -1)
eps = 1e-6
self.assertTrue((2 * errors <= (scales[..., None] + eps)).all())
self.assertTrue((errors <= (scales[..., None] + eps).abs()).all())

# test quantize/dequantize 0s
a = mx.zeros((256, 512))
Expand Down