Add HQQ quantization support #29637

Merged: 78 commits, May 2, 2024
Changes from 47 commits
Commits (78)
bbc68fe
update HQQ transformers integration
mobicham Apr 24, 2024
2a1f224
Merge branch 'huggingface:main' into stable
mobicham Apr 24, 2024
e1e5df6
push import_utils.py
mobicham Apr 24, 2024
0192b03
add force_hooks check in modeling_utils.py
mobicham Apr 24, 2024
823de37
fix | with Optional
mobicham Apr 24, 2024
08d7b8e
force bias as param
mobicham Apr 24, 2024
e1fa6c9
check bias is Tensor
mobicham Apr 24, 2024
6e854ca
force forward for multi-gpu
mobicham Apr 24, 2024
2b9f271
review fixes pass
mobicham Apr 25, 2024
5bb9ca2
remove torch grad()
mobicham Apr 25, 2024
392e7c5
if any key in linear_tags fix
mobicham Apr 25, 2024
20f9ad5
add cpu/disk check
mobicham Apr 25, 2024
3a5679a
isinstance return
mobicham Apr 25, 2024
7a1bbca
add multigpu test + refactor tests
mobicham Apr 25, 2024
65b2887
clean hqq_utils imports in hqq.py
mobicham Apr 25, 2024
bba74cd
clean hqq_utils imports in quantizer_hqq.py
mobicham Apr 25, 2024
de88c2a
delete hqq_utils.py
mobicham Apr 25, 2024
651a586
Delete src/transformers/utils/hqq_utils.py
mobicham Apr 25, 2024
d07ea85
ruff init
mobicham Apr 25, 2024
dedf69e
remove torch.float16 from __init__ in test
mobicham Apr 25, 2024
0edf8a4
refactor test
mobicham Apr 25, 2024
c7ec123
isinstance -> type in quantizer_hqq.py
mobicham Apr 26, 2024
5283ac2
cpu/disk device_map check in quantizer_hqq.py
mobicham Apr 29, 2024
15daeb4
remove type(module) nn.linear check in quantizer_hqq.py
mobicham Apr 29, 2024
bc4bc73
add BaseQuantizeConfig import inside HqqConfig init
mobicham Apr 29, 2024
b54e87b
remove hqq import in hqq.py
mobicham Apr 29, 2024
0f9698a
remove accelerate import from test_hqq.py
mobicham Apr 29, 2024
d31837f
quant config.py doc update
mobicham Apr 29, 2024
b8f792c
add hqqconfig to main_classes doc
mobicham Apr 29, 2024
8b84cb1
Merge branch 'huggingface:main' into stable
mobicham Apr 29, 2024
9a061e5
make style
mobicham Apr 29, 2024
8612282
__init__ fix
mobicham Apr 29, 2024
b786793
ruff __init__
mobicham Apr 29, 2024
e7ba717
skip_modules list
mobicham Apr 29, 2024
3a38f21
hqqconfig format fix
mobicham Apr 29, 2024
9eee213
hqqconfig doc fix
mobicham Apr 29, 2024
03cc8e6
hqqconfig doc fix
mobicham Apr 29, 2024
96bd141
hqqconfig doc fix
mobicham Apr 29, 2024
713d226
hqqconfig doc fix
mobicham Apr 29, 2024
dad9a60
hqqconfig doc fix
mobicham Apr 29, 2024
67c0985
hqqconfig doc fix
mobicham Apr 29, 2024
94c393a
hqqconfig doc fix
mobicham Apr 29, 2024
35fc9f5
hqqconfig doc fix
mobicham Apr 29, 2024
06f6497
hqqconfig doc fix
mobicham Apr 29, 2024
25fde9c
test_hqq.py remove mistral comment
mobicham Apr 30, 2024
ee50516
remove self.using_multi_gpu is False
mobicham Apr 30, 2024
01d798a
torch_dtype default val set and logger.info
mobicham Apr 30, 2024
a909ca8
hqq.py isinstance fix
mobicham May 2, 2024
c466c89
remove torch=None
mobicham May 2, 2024
d522fed
torch_device test_hqq
mobicham May 2, 2024
a09e90f
rename test_hqq
mobicham May 2, 2024
5bdf40f
MODEL_ID in test_hqq
mobicham May 2, 2024
e693d47
quantizer_hqq setattr fix
mobicham May 2, 2024
f5cabe5
quantizer_hqq typo fix
mobicham May 2, 2024
5ede086
imports quantizer_hqq.py
mobicham May 2, 2024
c86000b
isinstance quantizer_hqq
mobicham May 2, 2024
7d3e083
hqq_layer.bias reformat quantizer_hqq
mobicham May 2, 2024
082dfea
Step 2 as comment in quantizer_hqq
mobicham May 2, 2024
667f1ad
prepare_for_hqq_linear() comment
mobicham May 2, 2024
e0cd784
keep_in_fp32_modules fix
mobicham May 2, 2024
5d3b504
HqqHfQuantizer reformat
mobicham May 2, 2024
cc1961c
quantization.md hqqconfig
mobicham May 2, 2024
9aa9e15
quantization.md model example reformat
mobicham May 2, 2024
9273e21
quantization.md # space
mobicham May 2, 2024
f29e7a4
quantization.md space })
mobicham May 2, 2024
5168852
quantization.md space })
mobicham May 2, 2024
0dfe080
quantization_config fix doc
mobicham May 2, 2024
2934052
axis value check in quantization_config
mobicham May 2, 2024
bc7cf4e
format
mobicham May 2, 2024
d33f944
dynamic config explanation
mobicham May 2, 2024
3522f0a
quant config method in quantization.md
mobicham May 2, 2024
cc14c21
remove shard-level progress
mobicham May 2, 2024
1e81036
.cuda fix modeling_utils
mobicham May 2, 2024
ca07f5a
test_hqq fixes
mobicham May 2, 2024
4cc776e
Merge branch 'huggingface:main' into stable
mobicham May 2, 2024
3d777ed
make fix-copies
mobicham May 2, 2024
b808858
Merge branch 'huggingface:main' into stable
mobicham May 2, 2024
5e71139
Merge branch 'huggingface:main' into stable
mobicham May 2, 2024
3 changes: 3 additions & 0 deletions docker/transformers-quantization-latest-gpu/Dockerfile
100644 → 100755
@@ -45,6 +45,9 @@ RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/opt
# Add aqlm for quantization testing
RUN python3 -m pip install --no-cache-dir aqlm[gpu]==1.0.2

# Add hqq for quantization testing
RUN python3 -m pip install --no-cache-dir hqq

# Add autoawq for quantization testing
# >=v0.2.3 needed for compatibility with torch 2.2.1
RUN python3 -m pip install --no-cache-dir https://github.com/casper-hansen/AutoAWQ/releases/download/v0.2.3/autoawq-0.2.3+cu118-cp38-cp38-linux_x86_64.whl
4 changes: 4 additions & 0 deletions docs/source/en/main_classes/quantization.md
100644 → 100755
@@ -52,3 +52,7 @@ Learn how to quantize models in the [Quantization](../quantization) guide.
## HfQuantizer

[[autodoc]] quantizers.base.HfQuantizer

## HqqConfig

[[autodoc]] HqqConfig
40 changes: 40 additions & 0 deletions docs/source/en/quantization.md
100644 → 100755
@@ -745,3 +745,43 @@ The speed and throughput of fused and unfused modules were also tested with the
<figcaption class="mt-2 text-center text-sm text-gray-500">generate throughput/batch size</figcaption>
</div>
</div>

## HQQ
Half-Quadratic Quantization (HQQ) implements on-the-fly quantization via fast robust optimization. It doesn't require calibration data and can be used to quantize any model.
Please refer to the <a href="https://github.com/mobiusml/hqq/">official package</a> for more details.

For installation, we recommend you use the following approach to get the latest version and build its corresponding CUDA kernels:
```bash
pip install hqq
```

To quantize a model, you need to create an `HqqConfig` as follows:
```python
from transformers import AutoModelForCausalLM, AutoTokenizer, HqqConfig

# Linear layers will use the same quantization config
quant_config = HqqConfig(nbits=8, group_size=64, quant_zero=False, quant_scale=False, axis=0)  # axis=0 is used by default

# Each type of linear layer (referred to as a linear tag) will use different quantization parameters
q4_config = {'nbits':4, 'group_size':64, 'quant_zero':False, 'quant_scale':False}
q3_config = {'nbits':3, 'group_size':32, 'quant_zero':False, 'quant_scale':False}
quant_config = HqqConfig(dynamic_config={
    'self_attn.q_proj':q4_config,
    'self_attn.k_proj':q4_config,
    'self_attn.v_proj':q4_config,
    'self_attn.o_proj':q4_config,

    'mlp.gate_proj':q3_config,
    'mlp.up_proj' :q3_config,
    'mlp.down_proj':q3_config,
})
```

Then you simply quantize the model as follows:
```python
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16, device_map="cuda", quantization_config=quant_config)
```
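
The quantized model can then be used for generation like any other Transformers model. A minimal sketch (the prompt and generation settings below are only illustrative):
```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_id)
inputs = tokenizer("Hello, my name is", return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=32)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```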
### Optimized Runtime
HQQ supports various backends, including pure PyTorch and custom dequantization CUDA kernels. These backends are suitable for older GPUs and PEFT/QLoRA training.
For faster inference, HQQ supports 4-bit fused kernels (TorchAO and Marlin), reaching up to 200 tokens/sec on a single 4090.
For more details on how to use the backends, please refer to https://github.com/mobiusml/hqq/?tab=readme-ov-file#backend
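
As a rough sketch of how a backend is selected (the exact API may differ between `hqq` versions, so check the repository linked above):
```python
from hqq.core.quantize import HQQBackend, HQQLinear

# Select the pure-PyTorch backend (compatible with older GPUs and PEFT/QLoRA training)
HQQLinear.set_backend(HQQBackend.PYTORCH)
```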
Empty file modified docs/source/en/quicktour.md
100644 → 100755
Empty file.
2 changes: 2 additions & 0 deletions src/transformers/__init__.py
100644 → 100755
@@ -1133,6 +1133,7 @@
"BitsAndBytesConfig",
"EetqConfig",
"GPTQConfig",
"HqqConfig",
"QuantoConfig",
],
}
@@ -6097,6 +6098,7 @@
BitsAndBytesConfig,
EetqConfig,
GPTQConfig,
HqqConfig,
QuantoConfig,
)

2 changes: 2 additions & 0 deletions src/transformers/integrations/__init__.py
100644 → 100755
@@ -43,6 +43,7 @@
"unset_hf_deepspeed_config",
],
"eetq": ["replace_with_eetq_linear"],
"hqq": ["prepare_for_hqq_linear"],
"integration_utils": [
"INTEGRATION_TO_CALLBACK",
"AzureMLCallback",
@@ -113,6 +114,7 @@
unset_hf_deepspeed_config,
)
from .eetq import replace_with_eetq_linear
from .hqq import prepare_for_hqq_linear
from .integration_utils import (
INTEGRATION_TO_CALLBACK,
AzureMLCallback,
123 changes: 123 additions & 0 deletions src/transformers/integrations/hqq.py
@@ -0,0 +1,123 @@
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"HQQ (Half-Quadratic Quantization) integration file"

from ..utils import is_hqq_available, is_torch_available, logging


if is_torch_available():
import torch
else:
torch = None

logger = logging.get_logger(__name__)


# Name all modules inside the model
def autoname_modules(model):
for name, module in model.named_modules():
module.name = name


# Get the linear_tag from a module name. For example: model.layers.31.self_attn.k_proj -> self_attn.k_proj
def name_to_linear_tag(name):
return ".".join([n for n in name.split(".") if ((n not in ["model", "layers"]) and (not n.isnumeric()))])


# Get all linear tags available
def get_linear_tags(model):
if is_hqq_available():
from hqq.core.quantize import HQQLinear

linear_tags = set()
for name, module in model.named_modules():
if type(module) in [torch.nn.Linear, HQQLinear]:
linear_tags.add(name_to_linear_tag(name))
return list(linear_tags)


def _prepare_for_hqq_linear(model, patch_params, has_been_replaced, current_key_name=None):
for name, module in model.named_children():
if current_key_name is None:
current_key_name = []
current_key_name.append(name)

if isinstance(module, torch.nn.Linear):
# Get linear tag
linear_tag = name_to_linear_tag(module.name)

# We put the module quant_config into the nn.Linear layer so we can access it later in quantizer_hqq.create_quantized_param()
if linear_tag in patch_params:
if patch_params[linear_tag] is not None:
model._modules[name].quant_config = patch_params[linear_tag]
# Store the module class in case we need to transpose the weight later
model._modules[name].source_cls = type(module)
# Force requires grad to False to avoid unexpected errors
model._modules[name].requires_grad_(False)

has_been_replaced = True

if len(list(module.children())) > 0:
_, has_been_replaced = _prepare_for_hqq_linear(
module,
patch_params=patch_params,
has_been_replaced=has_been_replaced,
)
# Remove the last key for recursion
current_key_name.pop(-1)

return model, has_been_replaced


def prepare_for_hqq_linear(model, quantization_config=None, modules_to_not_convert=None, has_been_replaced=False):
"""
Prepares nn.Linear layers for HQQ quantization.
Since each layer type can have separate quantization parameters, we need to do the following:
1- tag each module with its name via autoname_modules()
2- Extract linear_tags (e.g. ['self_attn.q_proj', ...])
3- Map quantization parameters as a dictionary linear_tag -> quant_params as HQQLinear expects it, this is referred to as patch_params
"""

modules_to_not_convert = [] if modules_to_not_convert is None else modules_to_not_convert

# Add name to module
autoname_modules(model)

# Get linear tags. This allows us to use different quant params to different layer types
linear_tags = get_linear_tags(model)

# Convert quantization_config to layer-wise config
skip_modules = quantization_config.skip_modules
quant_config = quantization_config.to_dict()
linear_tags = list(set(linear_tags) - set(skip_modules) - set(modules_to_not_convert))

if any(key in linear_tags for key in quant_config.keys()):
# If the user doesn't specify a key from get_linear_tags, the layer is not quantized via (key, None)
patch_params = {key: None for key in linear_tags}
patch_params.update(quant_config)
else:
# Same quant_config for all layers
patch_params = {k: quant_config for k in linear_tags}

model, has_been_replaced = _prepare_for_hqq_linear(
model, patch_params=patch_params, has_been_replaced=has_been_replaced
)

# We store quantization config as linear_tag -> hqq quant config
model.config.quantization_config = patch_params

if not has_been_replaced:
logger.warning("No linear modules were found in your model for quantization.")

return model
Empty file modified src/transformers/integrations/integration_utils.py
100644 → 100755
Empty file.
20 changes: 19 additions & 1 deletion src/transformers/modeling_utils.py
100644 → 100755
@@ -38,6 +38,7 @@
from torch import Tensor, nn
from torch.nn import CrossEntropyLoss, Identity
from torch.utils.checkpoint import checkpoint
from tqdm import tqdm as tqdm_lib

from .activations import get_activation
from .configuration_utils import PretrainedConfig
@@ -808,7 +809,13 @@ def _load_state_dict_into_meta_model(
for old_key, new_key in zip(old_keys, new_keys):
state_dict[new_key] = state_dict.pop(old_key)

for param_name, param in state_dict.items():
# Show shard-level progress. Useful to monitor quantization progress
quant_show_progress = False
if hf_quantizer is not None:
if hasattr(hf_quantizer, "show_progress"):
quant_show_progress = hf_quantizer.show_progress

for param_name, param in tqdm_lib(state_dict.items(), disable=not quant_show_progress):
# First part of the test is always true as load_state_dict_keys always contains state_dict keys.
if param_name not in loaded_state_dict_keys or param_name not in expected_keys:
continue
@@ -2656,6 +2663,8 @@ def get_memory_footprint(self, return_buffers=True):

@wraps(torch.nn.Module.cuda)
def cuda(self, *args, **kwargs):
if getattr(self, "quantization_method", None) == QuantizationMethod.HQQ:
raise ValueError("`.to` is not supported for HQQ-quantized models.")
# Checks if the model has been loaded in 8-bit
if getattr(self, "quantization_method", None) == QuantizationMethod.BITS_AND_BYTES:
raise ValueError(
@@ -2667,6 +2676,8 @@ def cuda(self, *args, **kwargs):

@wraps(torch.nn.Module.to)
def to(self, *args, **kwargs):
if getattr(self, "quantization_method", None) == QuantizationMethod.HQQ:
raise ValueError("`.to` is not supported for HQQ-quantized models.")
# Checks if the model has been loaded in 8-bit
if getattr(self, "quantization_method", None) == QuantizationMethod.BITS_AND_BYTES:
raise ValueError(
@@ -3736,6 +3747,13 @@ def from_pretrained(
}
if "skip_keys" in inspect.signature(dispatch_model).parameters:
device_map_kwargs["skip_keys"] = model._skip_keys_device_placement
# For HQQ method we force-set the hooks for single GPU envs
if (
"force_hooks" in inspect.signature(dispatch_model).parameters
and hf_quantizer is not None
and hf_quantizer.quantization_config.quant_method == QuantizationMethod.HQQ
):
device_map_kwargs["force_hooks"] = True
if not is_fsdp_enabled() and not is_deepspeed_zero3_enabled():
dispatch_model(model, **device_map_kwargs)

Empty file modified src/transformers/quantizers/__init__.py
100644 → 100755
Empty file.
4 changes: 4 additions & 0 deletions src/transformers/quantizers/auto.py
100644 → 100755
@@ -21,6 +21,7 @@
BitsAndBytesConfig,
EetqConfig,
GPTQConfig,
HqqConfig,
QuantizationConfigMixin,
QuantizationMethod,
QuantoConfig,
@@ -31,6 +32,7 @@
from .quantizer_bnb_8bit import Bnb8BitHfQuantizer
from .quantizer_eetq import EetqHfQuantizer
from .quantizer_gptq import GptqHfQuantizer
from .quantizer_hqq import HQQHfQuantizer
from .quantizer_quanto import QuantoHfQuantizer


@@ -42,6 +44,7 @@
"aqlm": AqlmHfQuantizer,
"quanto": QuantoHfQuantizer,
"eetq": EetqHfQuantizer,
"hqq": HQQHfQuantizer,
}

AUTO_QUANTIZATION_CONFIG_MAPPING = {
@@ -52,6 +55,7 @@
"gptq": GPTQConfig,
"aqlm": AqlmConfig,
"quanto": QuantoConfig,
"hqq": HqqConfig,
}

