
Improve WOQ AutoRound algorithm (#1330)
Signed-off-by: changwangss <chang1.wang@intel.com>
changwangss authored Mar 1, 2024
1 parent e07e39f commit a6c05b9
Showing 9 changed files with 50 additions and 24 deletions.
@@ -12,7 +12,7 @@ neural-compressor
 intel_extension_for_pytorch==2.2.0
 optimum-intel
 git+https://github.com/bigcode-project/bigcode-evaluation-harness@00967d12093ef614de7bdad0772aed8e4118f1fd
-git+https://github.com/intel/auto-round.git@a868c805de4be271cfe7403309a64d9bf03a0ecf
+git+https://github.com/intel/auto-round.git@b65830f3f6cb32d92a5c8ba5f80ace12d517357b



@@ -136,7 +136,7 @@
 # ============AUTOROUND configs==============
 parser.add_argument(
     "--autoround_nsamples",
-    type=int, default=128,
+    type=int, default=512,
     help="Number of calibration data samples.",
 )
 parser.add_argument(
@@ -301,6 +301,7 @@
         "iters": args.calib_iters,
         "scale_dtype": "fp32",
         "device": "cpu",
+        "export_args": {"format": "itrex", "inplace": False}
     }
     quantization_config = WeightOnlyQuantConfig(
         compute_dtype=args.woq_compute_dtype,
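Note: the new "export_args" entry rides in the same algorithm_args dictionary that already carries "iters", "scale_dtype", and "device", asking AutoRound to export in ITREX format without modifying the model in place. Below is a minimal sketch of an AutoRound weight-only config built this way; the import path, the algorithm keyword, the model id, and every value not visible in this diff are assumptions rather than verified details of the example script.

from intel_extension_for_transformers.transformers import AutoModelForCausalLM, WeightOnlyQuantConfig

# Illustrative values; the run script fills these from argparse flags such as
# --autoround_nsamples (default raised to 512 in this commit) and --calib_iters.
algorithm_args = {
    "iters": 200,
    "scale_dtype": "fp32",
    "device": "cpu",
    "export_args": {"format": "itrex", "inplace": False},  # new in this commit
}

quantization_config = WeightOnlyQuantConfig(
    compute_dtype="fp32",          # the script passes args.woq_compute_dtype here
    weight_dtype="int4_clip",      # value borrowed from the test case later in this diff
    algorithm="AUTOROUND",         # assumed keyword; utils.py checks config.algorithm == "AUTOROUND"
    algorithm_args=algorithm_args,
)

model = AutoModelForCausalLM.from_pretrained(
    "facebook/opt-125m",           # placeholder model id for illustration
    quantization_config=quantization_config,
)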
@@ -14,4 +14,4 @@ tiktoken #qwen
 einops #qwen
 git+https://github.com/intel/neural-compressor.git
 git+https://github.com/EleutherAI/lm-evaluation-harness.git@cc9778fbe4fa1a709be2abed9deb6180fd40e7e2
-git+https://github.com/intel/auto-round.git@a868c805de4be271cfe7403309a64d9bf03a0ecf
+git+https://github.com/intel/auto-round.git@b65830f3f6cb32d92a5c8ba5f80ace12d517357b
@@ -162,7 +162,7 @@
 # ============AUTOROUND configs==============
 parser.add_argument(
     "--autoround_nsamples",
-    type=int, default=128,
+    type=int, default=512,
     help="Number of calibration data samples.",
 )
 parser.add_argument(
@@ -312,6 +312,7 @@
         "iters": args.calib_iters,
         "scale_dtype": "fp32",
         "device": "cpu",
+        "export_args": {"format": "itrex", "inplace": False}
     }
     quantization_config = WeightOnlyQuantConfig(
         compute_dtype=args.woq_compute_dtype,
intel_extension_for_transformers/llm/quantization/utils.py (35 additions, 11 deletions)
@@ -24,6 +24,7 @@
 from datasets import load_dataset
 from neural_compressor import quantization
 from neural_compressor.adaptor.torch_utils.model_wrapper import WeightOnlyLinear
+from auto_round.export.export_to_itrex.model_wrapper import WeightOnlyLinear as auto_round_woqlinear
 from neural_compressor.utils.utility import LazyImport
 from neural_compressor.config import PostTrainingQuantConfig
 from ...utils.utils import is_ipex_available
@@ -105,8 +106,9 @@ def _replace_linear(
         is_removed = False

         if (isinstance(module, torch.nn.Linear) or isinstance(module, WeightOnlyLinear)
-                or (is_ipex_available() and isinstance(module, ipex.nn.utils._weight_prepack._IPEXLinear))) \
-                and (name not in modules_to_not_convert):
+                or isinstance(module, auto_round_woqlinear) or (is_ipex_available()
+                and isinstance(module, ipex.nn.utils._weight_prepack._IPEXLinear))) \
+                and (name not in modules_to_not_convert):
             # Check if the current key is not in the `modules_to_not_convert`
             if not any(
                 key in ".".join(current_key_name) for key in modules_to_not_convert
@@ -186,7 +188,7 @@ def _replace_linear(
                     int_weight,
                     gptq_scales,
                     gptq_zeros,
-                    module.g_idx,
+                    module.g_idx if hasattr(module, "g_idx") else None,
                     quantization_config,
                     bias=None if module.bias is None else module.bias.data,
                 )
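The hasattr guard matters because the auto_round WeightOnlyLinear wrapper that this file now recognizes may not expose a g_idx attribute the way neural-compressor's wrapper does, so None becomes the fallback. A toy illustration of the pattern; the two stand-in classes are hypothetical, not types from either library.

import torch

class IncStyleLinear:            # stand-in for a wrapper that carries g_idx
    g_idx = torch.tensor([0, 0, 1, 1])

class AutoRoundStyleLinear:      # stand-in for a wrapper without g_idx
    pass

for module in (IncStyleLinear(), AutoRoundStyleLinear()):
    g_idx = module.g_idx if hasattr(module, "g_idx") else None
    print(type(module).__name__, g_idx)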
@@ -279,12 +281,35 @@ def collate_batch(batch):
                 input_ids_padded.append(input_ids)
             return torch.vstack(input_ids_padded)

-        calib_dataloader = DataLoader(
-            tokenized_dataset,
-            batch_size=1,
-            shuffle=False,
-            collate_fn=collate_batch,
-        )
+        def collate_batch_for_autoround(batch):
+            input_ids_padded = []
+            for text in batch:
+                input_ids = text["input_ids"]
+                if input_ids.shape[0] < config.algorithm_args["seq_len"]:
+                    continue
+                input_ids = input_ids[:config.algorithm_args["seq_len"]]
+                input_ids_list = input_ids.tolist()
+                if input_ids_list.count(input_ids_list[-1]) > config.algorithm_args["seq_len"] // 2:
+                    continue
+                input_ids_padded.append(input_ids)
+            if len(input_ids_padded) == 0:
+                return None
+
+            return torch.vstack(input_ids_padded)
+        if config.algorithm == "AUTOROUND":
+            calib_dataloader = DataLoader(
+                tokenized_dataset,
+                batch_size=1,
+                shuffle=False,
+                collate_fn=collate_batch_for_autoround,
+            )
+        else:
+            calib_dataloader = DataLoader(
+                tokenized_dataset,
+                batch_size=1,
+                shuffle=False,
+                collate_fn=collate_batch,
+            )
     if calib_func is None and config.algorithm in ["AWQ"]:

         def default_calib_func(model):
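Unlike collate_batch, the new collate_batch_for_autoround does not pad: it drops sequences shorter than the configured seq_len, truncates the rest, skips samples whose final token id accounts for more than half the sequence (a heuristic against pad- or repetition-dominated text), and returns None when nothing survives. A small self-contained illustration of that filter, assuming seq_len = 8 in place of config.algorithm_args["seq_len"]:

import torch

seq_len = 8  # stands in for config.algorithm_args["seq_len"]

def keep(input_ids: torch.Tensor) -> bool:
    # Mirrors the per-sample checks in collate_batch_for_autoround above.
    if input_ids.shape[0] < seq_len:
        return False                               # too short for calibration
    ids = input_ids[:seq_len].tolist()
    return ids.count(ids[-1]) <= seq_len // 2      # reject samples dominated by one trailing id

print(keep(torch.tensor([5, 6, 7, 8, 9, 10, 11, 12])))  # True
print(keep(torch.tensor([5, 6, 7, 0, 0, 0, 0, 0])))     # False: trailing id appears 5 times > 4
print(keep(torch.tensor([5, 6, 7])))                     # False: shorter than seq_len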
@@ -390,7 +415,6 @@ def default_calib_func(model):
         setattr(config, "gptq_quantize_config", quantize_config)
         q_model = replace_linear(inc_model, None, None, config, device=device)
     elif config.algorithm == "AUTOROUND":
-        inc_model = inc_model.export_compressed_model(use_optimum_format=True)
         inc_model.eval()
         quantize_config = {
             "bits": bits,
@@ -403,7 +427,7 @@ def default_calib_func(model):
         }

         setattr(config, "gptq_quantize_config", quantize_config)
-        q_model = replace_linear(inc_model, None, None, config, device=device)
+        q_model = replace_linear(inc_model._model, None, None, config, device=device)
     else:
         q_model = replace_linear(inc_model.model, None, None, config, device=device)
     if orig_dtype != torch.float32:
@@ -408,7 +408,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
                 import intel_extension_for_pytorch as ipex
             except ImportError:
                 logger.warning("Please install Intel Extension for PyTorch to accelerate the model inference.")
-            assert (ipex.__version__ >= "2.1.0+cpu"), "Please use Intel Extension for PyTorch >=2.1.0+cpu."
+            assert (ipex.__version__ >= "2.2.0+cpu"), "Please use Intel Extension for PyTorch >=2.2.0+cpu."
             model = cls.ORIG_MODEL.from_pretrained(
                 pretrained_model_name_or_path,
                 low_cpu_mem_usage=True,
@@ -424,8 +424,6 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
                 model = model.float()
             model.eval()
             model_type = model.config.model_type.replace("_", "-")
-            if "falcon" in model_type:
-                logger.warning("Please use transformers 4.33.3 if you would like to apply smoothquant to Falcon.")
             if "llama" in model_type and transformers.__version__ >= "4.36.0":
                 quantization_config.ipex_opt_llm = False
             logger.info("Applying SmoothQuant.")
@@ -434,7 +432,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
                 if model_type in IPEX_OPT_LLM_SUPPORTED:
                     quantization_config.ipex_opt_llm = True
                     logger.info("quantization_config.ipex_opt_llm set to True and ipex.optimize_transformers is used.")
-                    logger.warning("The suggested transformers version is 4.31.0.")
+                    logger.warning("The suggested transformers version is 4.35.2.")
                 else:
                     quantization_config.ipex_opt_llm = False
             if quantization_config.ipex_opt_llm:
@@ -487,12 +485,12 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
                 calib_dataset = calib_dataset.shuffle(seed=42)

                 def tokenize_function(examples):
-                    if "prompt" in examples:
+                    if "code" in examples:
+                        example = tokenizer(examples["code"])
+                    elif "prompt" in examples:
                         example = tokenizer(examples["prompt"])
                     elif "text" in examples:
                         example = tokenizer(examples["text"])
-                    elif "code" in examples:
-                        example = tokenizer(examples["code"])
                     else:
                         logger.error("Please check dataset prompt identifier," +
                                      " NeelNanda/pile-10k is default used calibration dataset.")
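Moving the "code" branch ahead of "prompt" means a calibration dataset that provides both columns is now tokenized from its code field, which suits the code-generation example touched earlier in this commit. A toy sketch of the new dispatch order (the tokenizer call itself is omitted; only the key selection is shown):

def pick_field(examples: dict) -> str:
    # Mirrors the reordered branches in tokenize_function above.
    for key in ("code", "prompt", "text"):
        if key in examples:
            return key
    raise KeyError("Please check dataset prompt identifier; NeelNanda/pile-10k is the default calibration dataset.")

print(pick_field({"prompt": "Explain this", "code": "def f(): ..."}))  # 'code' now wins
print(pick_field({"text": "plain calibration text"}))                  # 'text'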
requirements.txt (1 addition, 0 deletions)
@@ -2,6 +2,7 @@
 accelerate
 cmake>=3.16
 gguf
+git+https://github.com/intel/auto-round.git@b65830f3f6cb32d92a5c8ba5f80ace12d517357b
 ninja
 optimum-intel
 py-cpuinfo
tests/CI/test_quantization.py (2 additions, 1 deletion)
@@ -465,6 +465,7 @@ def test_quantization_for_llm(self):
             "iters": 5,
             "scale_dtype": "fp32",
             "device": "cpu",
+            "export_args": {"format": "itrex", "inplace": False}
         }
         woq_config = WeightOnlyQuantConfig(weight_dtype="int4_clip",
                                            algorithm_args=algorithm_args,
@@ -476,7 +477,7 @@ def test_quantization_for_llm(self):
                                            )
         woq_model.eval()
         output = woq_model(dummy_input)
-        self.assertTrue(isclose(float(output[0][0][0][0]), 0.18015708029270172, rel_tol=1e-04))
+        self.assertTrue(isclose(float(output[0][0][0][0]), 0.173023983836174, rel_tol=1e-04))

     def test_export(self):
         # test model with model_id
tests/requirements.txt (1 addition, 1 deletion)
@@ -5,7 +5,7 @@ datasets==2.14.7
 einops
 evaluate
 gguf
-git+https://github.com/intel/auto-round.git@a868c805de4be271cfe7403309a64d9bf03a0ecf
+git+https://github.com/intel/auto-round.git@b65830f3f6cb32d92a5c8ba5f80ace12d517357b
 git+https://github.com/intel/neural-compressor.git
 intel-extension-for-pytorch==2.2.0
 intel-tensorflow==2.14.0
