GPTQ example support (#98)
* add gptq examples

Signed-off-by: YIYANGCAI <yiyang.cai@intel.com>

---------

Signed-off-by: YIYANGCAI <yiyang.cai@intel.com>
Co-authored-by: xinhe <xin3.he@intel.com>
YIYANGCAI and xin3he authored Aug 31, 2023
1 parent c569fd5 commit b4b2fcc
Showing 2 changed files with 60 additions and 4 deletions.
@@ -50,7 +50,20 @@ python run_clm_no_trainer.py \
--approach weight_only \
    --output_dir "saved_results"
```
**Notes**: Weight-only quantization based on fake quantization is supported as a preview and covers the RTN/AWQ[1] algorithms. You can try it with `--approach weight_only`. `--awq` will trigger the AWQ algorithm.
**Notes**: Weight-only quantization based on fake quantization is supported as a preview and covers the RTN/AWQ[1]/GPTQ[2] algorithms. You can try it with `--approach weight_only`. `--awq` will trigger the AWQ algorithm, and `--weight_only_algo GPTQ` will trigger the GPTQ algorithm; note that `--gptq_pad_max_length` should align with the dataset builder's `--pad_max_length`. For example, to run a GPTQ example, try the following command.
```bash
python run_clm_no_trainer.py \
--model EleutherAI/gpt-j-6B \
--weight_only_algo GPTQ \
--weight_only_bits 4 \
--quantize \
--pad_max_length 2048 \
--gptq_pad_max_length 2048 \
--gptq_use_max_length \
--approach weight_only \
    --output_dir "test_models"
```
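For reference, here is a minimal sketch of how the flags above map onto `neural_compressor`'s `PostTrainingQuantConfig`, following the changes to `run_clm_no_trainer.py` in this commit (`user_model` and `calib_dataloader` are assumed to be built as in that script):
```python
from neural_compressor import PostTrainingQuantConfig, quantization

# GPTQ-specific knobs, mirroring the --gptq_* CLI flags above.
recipes = {
    "gptq_args": {
        "percdamp": 0.01,        # --gptq_percdamp
        "act_order": False,      # --gptq_actorder
        "block_size": 128,       # --gptq_block_size
        "nsamples": 128,         # --gptq_nsamples
        "use_max_length": True,  # --gptq_use_max_length
    }
}

conf = PostTrainingQuantConfig(
    approach="weight_only",
    recipes=recipes,
    # Keep the output projection in fp32 (matched by op name regex).
    op_name_dict={".*lm_head": {"weight": {"dtype": "fp32"}}},
)
q_model = quantization.fit(user_model, conf, calib_dataloader=calib_dataloader)
```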


#### Accuracy with lm_eval
```bash
@@ -230,3 +243,4 @@ python run_mlm.py \
```

[1]. Lin, Ji, et al. "AWQ: Activation-aware Weight Quantization for LLM Compression and Acceleration." arXiv preprint arXiv:2306.00978 (2023).
[2]. Frantar, Elias, et al. "GPTQ: Accurate Post-Training Quantization for Generative Pre-trained Transformers." arXiv preprint arXiv:2210.17323 (2022).
@@ -1,14 +1,16 @@
import argparse
import os
import sys
sys.path.append('./')
import time
import json
import re
import torch
from datasets import load_dataset
import datasets
from torch.nn.functional import pad
from torch.utils.data import DataLoader


parser = argparse.ArgumentParser()
parser.add_argument(
"--model", nargs="?", default="EleutherAI/gpt-j-6b"
@@ -32,7 +34,15 @@
parser.add_argument("--sq", action="store_true")
parser.add_argument("--alpha", default="auto",
help="Smooth quant parameter.")
parser.add_argument("--weight_only_algo", default="RTN", choices=['RTN', 'AWQ', 'TEQ'],
# ============gptq configs===============
parser.add_argument("--gptq_actorder", action="store_true", help="Whether to apply the activation order GPTQ heuristic.")
parser.add_argument('--gptq_percdamp', type=float, default=.01, help='Percent of the average Hessian diagonal to use for dampening.')
parser.add_argument('--gptq_block_size', type=int, default=128, help='Block size. sub weight matrix size to run GPTQ.')
parser.add_argument('--gptq_nsamples', type=int, default=128, help='Number of calibration data samples.')
parser.add_argument('--gptq_use_max_length', action="store_true", help='Set all sequence length to be same length of args.gptq_pad_max_length')
parser.add_argument('--gptq_pad_max_length', type=int, default=2048, help='Calibration dataset sequence max length, this should align with your model config, and your dataset builder args: args.pad_max_length')
# =======================================
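# Background on the knobs above: GPTQ quantizes the weights block by block,
# using second-order (Hessian) statistics estimated from the calibration
# samples; percdamp stabilizes the Hessian inverse, act_order processes
# columns in activation order, and block_size sets the sub-matrix
# granularity (see [2] in the README).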
parser.add_argument("--weight_only_algo", default="RTN", choices=['RTN', 'AWQ', 'TEQ', 'GPTQ'],
help="Weight-only parameter.")
parser.add_argument("--int8", action="store_true")
parser.add_argument("--ipex", action="store_true", help="Use intel extension for pytorch.")
@@ -135,6 +145,7 @@ def evaluate(self, model):
print("Latency: ", latency)
return acc


def get_user_model():
from transformers import AutoModelForCausalLM, AutoModel, AutoTokenizer
torchscript = False
@@ -186,6 +197,10 @@ def get_user_model():
)
tokenizer = AutoTokenizer.from_pretrained(args.model)

# Set the model's seqlen when GPTQ calibration is enabled.
if args.weight_only_algo == 'GPTQ':
user_model.seqlen = args.gptq_pad_max_length
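# GPTQ's calibration loop slices the calibration data into sequences of
# model.seqlen tokens, so it is pinned to --gptq_pad_max_length here.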

if args.peft_model_id is not None:
from peft import PeftModel
user_model = PeftModel.from_pretrained(user_model, args.peft_model_id)
@@ -195,11 +210,11 @@
user_model.eval()
return user_model, tokenizer


if args.quantize:
# dataset
user_model, tokenizer = get_user_model()
calib_dataset = load_dataset(args.dataset, split="train")
# calib_dataset = datasets.load_from_disk('/your/local/dataset/pile-10k/')  # use this if you have trouble connecting to the HF Hub
calib_dataset = calib_dataset.shuffle(seed=42)
calib_evaluator = Evaluator(calib_dataset, tokenizer, args.batch_size, pad_max=args.pad_max_length, is_calib=True)
calib_dataloader = DataLoader(
@@ -252,6 +267,30 @@ def calib_func(prepared_model):
op_type_dict=op_type_dict,
recipes=recipes,
)
elif args.weight_only_algo == "GPTQ":
recipes = {
'gptq_args': {
'percdamp': args.gptq_percdamp,
'act_order': args.gptq_actorder,
'block_size': args.gptq_block_size,
'nsamples': args.gptq_nsamples,
'use_max_length': args.gptq_use_max_length
}
}
conf = PostTrainingQuantConfig(
backend="ipex" if args.ipex else "default",
approach=args.approach,
excluded_precisions=excluded_precisions,
op_type_dict=op_type_dict,
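# Keep lm_head in fp32: the regex below excludes the output projection
# from weight-only quantization, which typically helps preserve
# generation quality.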
op_name_dict={
'.*lm_head':{ # re.match
"weight": {
'dtype': 'fp32'
},
},
},
recipes=recipes,
)
else:
conf = PostTrainingQuantConfig(
backend="ipex" if args.ipex else "default",
@@ -261,6 +300,9 @@ def calib_func(prepared_model):
recipes=recipes,
)

# When GPTQ is enabled, set calib_func=None; the GPTQ algorithm draws its calibration samples directly from calib_dataloader.
if args.weight_only_algo == "GPTQ":
calib_func = None
if args.weight_only_algo == 'TEQ':
# set calib_func=None, use default training func as calib_func
calib_func = None
