Slowed Down After Quantizing Fine-Tuned gemma-2b-it Model #572

Open
Yimjaehyun93 opened this issue Aug 5, 2024 · 0 comments
Yimjaehyun93 commented Aug 5, 2024

After fine-tuning the gemma-2b-it model and then quantizing it with AWQ, the average inference latency increased from 120 ms to 160 ms. Could there be an issue with the quantization? I'm attaching the relevant parts of the fine-tuning and quantization code.

import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    TrainingArguments,
    DataCollatorForLanguageModeling,
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from trl import SFTTrainer

# LoRA adapter applied to the attention and MLP projections.
lora_config = LoraConfig(
    r=lora_rank,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    target_modules=[
        "q_proj",
        "o_proj",
        "k_proj",
        "v_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
    task_type="CAUSAL_LM",
)

# 4-bit NF4 quantization config used to load the base model for QLoRA training.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

BASE_MODEL = "google/gemma-2b-it"
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL, device_map="auto", quantization_config=bnb_config
)
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
tokenizer.padding_side = "right"

model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)
torch.cuda.empty_cache()

trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    max_seq_length=max_seq_length,
    args=TrainingArguments(
        output_dir=output_dir,
        max_steps=max_steps,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        gradient_accumulation_steps=2,
        optim="paged_adamw_8bit",
        lr_scheduler_type="cosine",
        warmup_ratio=0.03,
        learning_rate=2e-4,
        bf16=True,
        do_eval=True,
        eval_strategy="steps",
        save_steps=save_steps,
        eval_steps=eval_steps,
        logging_steps=5,
        push_to_hub=False,
        report_to="tensorboard",
        gradient_checkpointing=True,
        gradient_checkpointing_kwargs={"use_reentrant": True},
    ),
    peft_config=lora_config,
    tokenizer=tokenizer,
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
    formatting_func=generate_prompts,
)
trainer.accelerator.print(f"{trainer.model}")
trainer.model.print_trainable_parameters()
model.config.use_cache = False  # KV cache is incompatible with gradient checkpointing during training
trainer.train()
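
Not shown in the snippets: before AWQ can quantize the fine-tuned weights, the LoRA adapter presumably has to be merged back into the base model, since the quantization script below loads args.adapter_name as a plain checkpoint. A minimal sketch of that merge step, assuming placeholder paths (adapter_dir and merged_dir are hypothetical names, not from the original script):

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

BASE_MODEL = "google/gemma-2b-it"
adapter_dir = "path/to/lora/checkpoint"  # hypothetical: a checkpoint from the SFTTrainer output_dir
merged_dir = "path/to/merged/model"      # hypothetical: what the AWQ step would load

# Reload the base model in full (bf16) precision so the LoRA deltas can be
# folded into the dense weights, then save a plain checkpoint for AWQ.
base = AutoModelForCausalLM.from_pretrained(BASE_MODEL, torch_dtype=torch.bfloat16)
merged = PeftModel.from_pretrained(base, adapter_dir).merge_and_unload()
merged.save_pretrained(merged_dir)
AutoTokenizer.from_pretrained(BASE_MODEL).save_pretrained(merged_dir)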
 
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
from huggingface_hub import login
from awq import AutoAWQForCausalLM

# Load the merged fine-tuned model in full precision for AWQ calibration.
finetune_model = AutoAWQForCausalLM.from_pretrained(
    args.adapter_name, low_cpu_mem_usage=True, use_cache=False
)
tokenizer = AutoTokenizer.from_pretrained(args.model_name)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"
tokenizer.save_pretrained(args.adapter_name)

# 4-bit AWQ with group size 128 and the GEMM kernel version.
quant_config = {
    "zero_point": True,
    "q_group_size": 128,
    "w_bit": 4,
    "version": "GEMM",
}

finetune_model.quantize(tokenizer, quant_config=quant_config)
quant_path = args.output_path
finetune_model.save_quantized(quant_path)
tokenizer.save_pretrained(quant_path)
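
For reference, a rough sketch of how the 120 ms vs. 160 ms numbers might be reproduced; this is not from the original post, quant_path and the prompt are placeholders, and it assumes the quantized checkpoint produced above is loaded with autoawq's from_quantized loader with fuse_layers enabled (layer fusion noticeably affects decoding speed):

import time
import torch
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer

quant_path = "path/to/quantized/model"  # hypothetical: args.output_path from the script above

model = AutoAWQForCausalLM.from_quantized(quant_path, fuse_layers=True)
tokenizer = AutoTokenizer.from_pretrained(quant_path)

input_ids = tokenizer("Write a short greeting.", return_tensors="pt").input_ids.cuda()

# Warm-up pass, then time one fixed-length generation.
model.generate(input_ids, max_new_tokens=32)
torch.cuda.synchronize()
start = time.perf_counter()
model.generate(input_ids, max_new_tokens=32)
torch.cuda.synchronize()
print(f"latency: {(time.perf_counter() - start) * 1000:.1f} ms")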
Yimjaehyun93 changed the title from "Performance Degradation After Quantizing Fine-Tuned gemma-2b-it Model" to "Slowed Down After Quantizing Fine-Tuned gemma-2b-it Model" on Aug 5, 2024