GPTQ example support (#98)
* add gptq examples

Signed-off-by: YIYANGCAI <yiyang.cai@intel.com>

---------

Signed-off-by: YIYANGCAI <yiyang.cai@intel.com>
Co-authored-by: xinhe <xin3.he@intel.com>
YIYANGCAI and xin3he authored Aug 31, 2023
1 parent c569fd5 commit b4b2fcc
Showing 2 changed files with 60 additions and 4 deletions.
@@ -50,7 +50,20 @@ python run_clm_no_trainer.py \
--approach weight_only \
    --output_dir "saved_results"
```
**Notes**: Weight-only quantization based on fake quantization is supported as a preview and covers the RTN/AWQ[1] algorithms. You can try it with `--approach weight_only`. `--awq` will trigger the AWQ algorithm.
**Notes**: Weight-only quantization based on fake quantization is supported as a preview and covers the RTN/AWQ[1]/GPTQ[2] algorithms. You can try it with `--approach weight_only`. `--awq` will trigger the AWQ algorithm, and `--weight_only_algo GPTQ` will trigger the GPTQ algorithm; note that `--gptq_pad_max_length` should align with the dataset builder's `--pad_max_length`. For example, to run a GPTQ example, try the following command.
```bash
python run_clm_no_trainer.py \
--model EleutherAI/gpt-j-6B \
--weight_only_algo GPTQ \
--weight_only_bits 4 \
--quantize \
--pad_max_length 2048 \
--gptq_pad_max_length 2048 \
--gptq_use_max_length \
--approach weight_only \
    --output_dir "test_models"
```
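For reference, here is a minimal sketch of how the flags above map onto `neural_compressor`'s `PostTrainingQuantConfig`, following the changes to `run_clm_no_trainer.py` in this commit (`user_model` and `calib_dataloader` are assumed to be built as in that script):
```python
from neural_compressor import PostTrainingQuantConfig, quantization

# GPTQ-specific knobs, mirroring the --gptq_* CLI flags above.
recipes = {
    "gptq_args": {
        "percdamp": 0.01,        # --gptq_percdamp
        "act_order": False,      # --gptq_actorder
        "block_size": 128,       # --gptq_block_size
        "nsamples": 128,         # --gptq_nsamples
        "use_max_length": True,  # --gptq_use_max_length
    }
}

conf = PostTrainingQuantConfig(
    approach="weight_only",
    recipes=recipes,
    # Keep the output projection in fp32 (matched by op name regex).
    op_name_dict={".*lm_head": {"weight": {"dtype": "fp32"}}},
)
q_model = quantization.fit(user_model, conf, calib_dataloader=calib_dataloader)
```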


#### Accuracy with lm_eval
```bash
@@ -230,3 +243,4 @@ python run_mlm.py \
```

[1]. Lin, Ji, et al. "AWQ: Activation-aware Weight Quantization for LLM Compression and Acceleration." arXiv preprint arXiv:2306.00978 (2023).
[2]. Frantar, Elias, et al. "GPTQ: Accurate Post-Training Quantization for Generative Pre-trained Transformers." arXiv preprint arXiv:2210.17323 (2022).
@@ -1,14 +1,16 @@
import argparse
import os
import sys
sys.path.append('./')
import time
import json
import re
import torch
from datasets import load_dataset
import datasets
from torch.nn.functional import pad
from torch.utils.data import DataLoader


parser = argparse.ArgumentParser()
parser.add_argument(
"--model", nargs="?", default="EleutherAI/gpt-j-6b"
@@ -32,7 +34,15 @@
parser.add_argument("--sq", action="store_true")
parser.add_argument("--alpha", default="auto",
help="Smooth quant parameter.")
parser.add_argument("--weight_only_algo", default="RTN", choices=['RTN', 'AWQ', 'TEQ'],
# ============gptq configs===============
parser.add_argument("--gptq_actorder", action="store_true", help="Whether to apply the activation order GPTQ heuristic.")
parser.add_argument('--gptq_percdamp', type=float, default=.01, help='Percent of the average Hessian diagonal to use for dampening.')
parser.add_argument('--gptq_block_size', type=int, default=128, help='Block size. sub weight matrix size to run GPTQ.')
parser.add_argument('--gptq_nsamples', type=int, default=128, help='Number of calibration data samples.')
parser.add_argument('--gptq_use_max_length', action="store_true", help='Set all sequence length to be same length of args.gptq_pad_max_length')
parser.add_argument('--gptq_pad_max_length', type=int, default=2048, help='Calibration dataset sequence max length, this should align with your model config, and your dataset builder args: args.pad_max_length')
# =======================================
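# Background on the knobs above: GPTQ quantizes the weights block by block,
# using second-order (Hessian) statistics estimated from the calibration
# samples; percdamp stabilizes the Hessian inverse, act_order processes
# columns in activation order, and block_size sets the sub-matrix
# granularity (see [2] in the README).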
parser.add_argument("--weight_only_algo", default="RTN", choices=['RTN', 'AWQ', 'TEQ', 'GPTQ'],
help="Weight-only parameter.")
parser.add_argument("--int8", action="store_true")
parser.add_argument("--ipex", action="store_true", help="Use intel extension for pytorch.")
@@ -135,6 +145,7 @@ def evaluate(self, model):
print("Latency: ", latency)
return acc


def get_user_model():
from transformers import AutoModelForCausalLM, AutoModel, AutoTokenizer
torchscript = False
@@ -186,6 +197,10 @@ def get_user_model():
)
tokenizer = AutoTokenizer.from_pretrained(args.model)

# Set the model's seqlen when GPTQ calibration is enabled.
if args.weight_only_algo == 'GPTQ':
user_model.seqlen = args.gptq_pad_max_length
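# GPTQ's calibration loop slices the calibration data into sequences of
# model.seqlen tokens, so it is pinned to --gptq_pad_max_length here.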

if args.peft_model_id is not None:
from peft import PeftModel
user_model = PeftModel.from_pretrained(user_model, args.peft_model_id)
@@ -195,11 +210,11 @@
user_model.eval()
return user_model, tokenizer


if args.quantize:
# dataset
user_model, tokenizer = get_user_model()
calib_dataset = load_dataset(args.dataset, split="train")
# calib_dataset = datasets.load_from_disk('/your/local/dataset/pile-10k/')  # use this if you have trouble connecting to the HF Hub
calib_dataset = calib_dataset.shuffle(seed=42)
calib_evaluator = Evaluator(calib_dataset, tokenizer, args.batch_size, pad_max=args.pad_max_length, is_calib=True)
calib_dataloader = DataLoader(
@@ -252,6 +267,30 @@ def calib_func(prepared_model):
op_type_dict=op_type_dict,
recipes=recipes,
)
elif args.weight_only_algo == "GPTQ":
recipes = {
'gptq_args': {
'percdamp': args.gptq_percdamp,
'act_order': args.gptq_actorder,
'block_size': args.gptq_block_size,
'nsamples': args.gptq_nsamples,
'use_max_length': args.gptq_use_max_length
}
}
conf = PostTrainingQuantConfig(
backend="ipex" if args.ipex else "default",
approach=args.approach,
excluded_precisions=excluded_precisions,
op_type_dict=op_type_dict,
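# Keep lm_head in fp32: the regex below excludes the output projection
# from weight-only quantization, which typically helps preserve
# generation quality.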
op_name_dict={
'.*lm_head':{ # re.match
"weight": {
'dtype': 'fp32'
},
},
},
recipes=recipes,
)
else:
conf = PostTrainingQuantConfig(
backend="ipex" if args.ipex else "default",
@@ -261,6 +300,9 @@ def calib_func(prepared_model):
recipes=recipes,
)

# When GPTQ is enabled, set calib_func=None; the GPTQ algorithm draws its calibration samples directly from calib_dataloader.
if args.weight_only_algo == "GPTQ":
calib_func = None
if args.weight_only_algo == 'TEQ':
# set calib_func=None, use default training func as calib_func
calib_func = None
