[LLM Runtime] Update neural speed example (#1241)
intellinjun committed Feb 2, 2024
1 parent bfa06e2 commit 3385c42
Showing 2 changed files with 9 additions and 7 deletions.
7 changes: 4 additions & 3 deletions examples/huggingface/neural_speed/README.md
@@ -36,13 +36,14 @@ pip install -r requirements.txt
# Run


> Note: Please prepare LLMs and save locally before running inference.
> Note: Please prepare LLMs and save them locally before running inference. The currently supported models are listed in [Supported models](https://github.com/intel/neural-speed/blob/main/docs/supported_models.md); you can replace Llama2 in the example with any model from that list.

## 1. Performance

``` bash
# int4 with group-size=32
# int4 with group-size=128
# pass --not_quant to run fp32 (unquantized) inference
OMP_NUM_THREADS=<physical cores num> numactl -m <node N> -C <cpu list> python runtime_example.py \
--model_path ./Llama2 \
--prompt "Once upon a time, there existed a little girl," \
@@ -69,4 +70,4 @@ python runtime_acc.py \
> model_args=f'pretrained="{args.model_name}", dtype=float32',
> tasks=[f"{args.tasks}"]
> )
> ```
> ```
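The `--not_quant` flag added in this commit uses argparse's `store_false` action, which can be counter-intuitive: the value is `True` by default (quantization on) and flips to `False` only when the flag is passed. A minimal sketch of those semantics, using stdlib `argparse` only (the help text here is illustrative, not from the repo):

```python
import argparse

# Sketch of the store_false semantics behind --not_quant.
parser = argparse.ArgumentParser()
parser.add_argument("--not_quant", action="store_false",
                    help="pass this flag to disable quantization (fp32 inference)")

# Default run: flag absent, so args.not_quant is True -> quantization enabled.
quant_on = parser.parse_args([]).not_quant
print(quant_on)   # True

# Flag present: args.not_quant is False -> fp32 inference.
quant_off = parser.parse_args(["--not_quant"]).not_quant
print(quant_off)  # False
```

This is why the diff can pass `use_quant=args.not_quant` directly: running without the flag keeps quantization on.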
9 changes: 5 additions & 4 deletions examples/huggingface/neural_speed/runtime_example.py
@@ -21,17 +21,18 @@ def main(args_in: Optional[List[str]] = None) -> None:
parser = argparse.ArgumentParser(description="Convert a PyTorch model to a NE compatible file")
parser.add_argument("--model_path",type=Path,
help="model path for local or from hf", default="meta-llama/Llama-2-7b-hf")
parser.add_argument("--prompt",type=str,help="model path for local or from hf",default="Once upon a time, there existed a little girl,")
parser.add_argument("--prompt", type=str, help="input prompt for generation", default="Once upon a time, there existed a little girl,")
parser.add_argument("--not_quant", action="store_false", help="pass this flag to disable low-bit quantization and run fp32 inference")
parser.add_argument("--weight_dtype",type=str,
help="output weight type, default: int4, we support int4, int8, nf4 and others ", default="int4")
parser.add_argument("--compute_dtype", type=str, help="compute type", default="int8")
parser.add_argument("--group_size", type=int, help="group size", default=128)
parser.add_argument("--n_ctx", type=int, help="n_ctx", default=512)
parser.add_argument("--max_new_tokens", type=int, help="max_new_tokens", default=300)
args = parser.parse_args(args_in)
model_name = args.model_path
woq_config = WeightOnlyQuantConfig(load_in_4bit=True,
weight_dtype= args.weight_dtype, compute_dtype=args.compute_dtype, group_size= args.group_size)
model_name = args.model_path
woq_config = WeightOnlyQuantConfig(load_in_4bit=True, use_quant=args.not_quant,
weight_dtype=args.weight_dtype, compute_dtype=args.compute_dtype, group_size=args.group_size)
prompt = args.prompt
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
streamer = TextStreamer(tokenizer)
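The reordered lines in the diff above wire the parsed `--not_quant` value into the quantization config as `use_quant`. A minimal sketch of that wiring, with a stand-in dataclass in place of the real `WeightOnlyQuantConfig` (the stand-in class and the helper function are assumptions for illustration; only the field names shown in the diff are from the source):

```python
from dataclasses import dataclass

# Stand-in for WeightOnlyQuantConfig, purely to show the argument wiring;
# the real class is provided by the neural-speed / ITREX stack.
@dataclass
class FakeQuantConfig:
    load_in_4bit: bool
    use_quant: bool
    weight_dtype: str
    compute_dtype: str
    group_size: int

def build_config(not_quant: bool, weight_dtype: str = "int4",
                 compute_dtype: str = "int8", group_size: int = 128) -> FakeQuantConfig:
    # args.not_quant is True unless --not_quant was passed (store_false),
    # so quantization is on by default and the flag switches to fp32.
    return FakeQuantConfig(load_in_4bit=True, use_quant=not_quant,
                           weight_dtype=weight_dtype, compute_dtype=compute_dtype,
                           group_size=group_size)

cfg = build_config(not_quant=True)
print(cfg.use_quant, cfg.group_size)  # True 128
```

With `not_quant=False` (the flag passed on the command line), `use_quant` becomes `False` and the runtime would fall back to fp32 inference.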
