Merge branch 'main' into wanda
rahul-tuli authored Dec 18, 2023
2 parents 854fd69 + 27ae625 commit 1d1aaca
Showing 9 changed files with 82 additions and 22 deletions.
14 changes: 14 additions & 0 deletions README.md
@@ -59,6 +59,20 @@ SparseML is an open-source model optimization toolkit that enables you to create
<img alt="SparseML Flow" src="docs/images/sparseml-workflow.png" width="60%" />
</p>

## ✨NEW✨ SparseML One-Shot LLM Compression

Neural Magic is excited to preview one-shot LLM compression workflows using the new `SparseGPTModifier`!

Pruning and quantizing a TinyLlama Chat model takes just a few steps: install the dependencies, download a recipe, and apply it to the model:
```
git clone https://github.com/neuralmagic/sparseml
pip install -e "sparseml[transformers]"
wget https://huggingface.co/neuralmagic/TinyLlama-1.1B-Chat-v0.4-pruned50-quant-ds/raw/main/recipe.yaml
python sparseml/src/sparseml/transformers/sparsification/obcq/obcq.py TinyLlama/TinyLlama-1.1B-Chat-v0.4 open_platypus --recipe recipe.yaml --save True
```

The README at [`src/sparseml/transformers/sparsification/obcq`](https://github.com/neuralmagic/sparseml/tree/main/src/sparseml/transformers/sparsification/obcq) has a detailed walkthrough.
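
For reference, such a recipe is a small YAML file. A minimal sketch of what one might contain; the `SparseGPTModifier` parameters and values shown here are illustrative, not the exact contents of the downloaded recipe:
```
test_stage:
  obcq_modifiers:
    SparseGPTModifier:
      sparsity: 0.5     # prune half the weights in each targeted layer
      block_size: 128   # column block size used during one-shot solving
      quantize: true    # also apply one-shot quantization
```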

## Workflows

SparseML enables you to create a sparse model trained on your dataset in two ways:
10 changes: 10 additions & 0 deletions src/sparseml/core/recipe/recipe.py
@@ -13,6 +13,7 @@
# limitations under the License.

import json
import logging
import os
from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Union
@@ -30,6 +31,8 @@

__all__ = ["Recipe", "RecipeTuple"]

_LOGGER = logging.getLogger(__name__)


class Recipe(RecipeBase):
"""
@@ -72,8 +75,15 @@ def create_instance(path: str) -> "Recipe":
raise NotImplementedError("Using SparseZoo stubs is not yet supported")
else:
# assume it's a string
_LOGGER.warning(
"Could not process input as a file path or zoo stub, "
"attempting to process it as a string."
)
_LOGGER.warning(f"Input string: {path}")
obj = _load_json_or_yaml_string(path)
return Recipe.parse_obj(obj)
else:
_LOGGER.info(f"Loading recipe from file {path}")

with open(path, "r") as file:
content = file.read().strip()
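
With the new logging in place, each input path of `Recipe.create_instance` is announced before parsing. A minimal usage sketch, assuming `Recipe` is re-exported from `sparseml.core.recipe` and that the inline string below is a valid serialized recipe (its contents are illustrative):
```
from sparseml.core.recipe import Recipe

# a path to an existing file logs "Loading recipe from file ..." at INFO
recipe = Recipe.create_instance("recipe.yaml")

# anything that is neither a file path nor a zoo stub triggers the two
# new warnings, then is parsed as a raw YAML/JSON recipe string
inline = """
test_stage:
  obcq_modifiers: {}
"""
recipe = Recipe.create_instance(inline)
```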
44 changes: 33 additions & 11 deletions src/sparseml/modifiers/obcq/utils/helpers.py
@@ -13,18 +13,17 @@
# limitations under the License.

import logging
import operator
from collections import defaultdict
from math import ceil
from typing import List, Optional

import torch
from torch.nn.modules.sparse import Embedding


_LOGGER = logging.getLogger(__name__)
_DEFAULT_TARGET_IDS = [
"attention_mask",
"position_ids",
]
_DEFAULT_TARGET_IDS = ["attention_mask", "position_ids", "position_bias"]


class Catcher(torch.nn.Module):
@@ -127,24 +126,36 @@ def execute_offloaded_module(
def cache_attention_inputs(
model, dataloader, device, nsamples, target_ids, layer_prefix
):
if layer_prefix:
embed_tokens = getattr(model.model, layer_prefix).embed_tokens
first_layer = getattr(model.model, layer_prefix).layers[0]
if layer_prefix: # get model-specific path to layers list
split_prefix = layer_prefix.split(".")
layers_name = split_prefix[-1]
model_root_name = ".".join(split_prefix[:-1])
model_root = operator.attrgetter(model_root_name)(model)
first_layer = getattr(model_root, layers_name)[0]
else:
embed_tokens = model.model.embed_tokens
first_layer = model.model.layers[0]
embed_tokens.to(device)
model_root = model.model
layers_name = "layers"
first_layer = model_root.layers[0]

# send everything up to the first compressible layer to device
pre_layers_modules = _get_pre_layer_modules(model_root, layers_name)
for pre_layer in pre_layers_modules:
pre_layer.to(device)
first_layer.to(device)

cached_inputs = catch(
model=model,
attention_layer=first_layer,
target_keys=target_ids,
data_loader=dataloader,
nsamples=nsamples,
)
embed_tokens.cpu()

for pre_layer in pre_layers_modules:
pre_layer.cpu()
first_layer.cpu()
torch.cuda.empty_cache()

return cached_inputs


@@ -191,3 +202,14 @@ def ppl_eval_general(
_LOGGER.info(f"Perplexity: {ppl.item():.3f}")

return ppl.item()


def _get_pre_layer_modules(model_root, layers_name):
pre_layers_modules = []
for name, layer in model_root.named_modules():
if name.startswith(layers_name):
break
if isinstance(layer, Embedding):
pre_layers_modules.append(layer)

return pre_layers_modules
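
The refactor above replaces a hard-coded `embed_tokens` lookup with a dotted `layer_prefix` resolved via `operator.attrgetter`, which follows nested attribute paths. A self-contained sketch with stand-in classes (class names here are illustrative; the real objects are Hugging Face modules):
```
import operator

class Decoder:
    def __init__(self):
        self.layers = ["layer0", "layer1"]

class Inner:
    def __init__(self):
        self.decoder = Decoder()

class Model:
    def __init__(self):
        self.model = Inner()

model = Model()
layer_prefix = "model.decoder.layers"  # as in the updated example.yaml

# split off the final attribute, walk the rest with attrgetter
root_name, _, layers_name = layer_prefix.rpartition(".")
model_root = operator.attrgetter(root_name)(model)  # walks model.model.decoder
print(getattr(model_root, layers_name)[0])          # -> layer0
```
The helper uses `split(".")` and `join` rather than `rpartition`, but the effect is the same: any model-specific nesting can now be expressed in the recipe's `layer_prefix` string.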
4 changes: 3 additions & 1 deletion src/sparseml/transformers/data/base_llm.py
@@ -38,7 +38,9 @@ def __init__(
shuffle: bool = True,
**kwargs,
):
self.tokenizer = AutoTokenizer.from_pretrained(model, use_fast=True)
self.tokenizer = AutoTokenizer.from_pretrained(
model, use_fast=True, trust_remote_code=True
)
self._nsamples = nsamples
self._seqlen = seqlen
self._use_max_tokens = use_max_tokens
2 changes: 1 addition & 1 deletion src/sparseml/transformers/sparsification/obcq/example.yaml
@@ -1,6 +1,6 @@
metadata:
target_model:
layer_prefix: "decoder"
layer_prefix: "model.decoder.layers"
architecture: "opt"

test_stage:
14 changes: 10 additions & 4 deletions src/sparseml/transformers/sparsification/obcq/export.py
@@ -146,6 +146,9 @@ def load_task_model(
)

if task == "text-generation":
# Export decoder model without kv cache support
config.use_cache = False

return SparseAutoModel.text_generation_from_pretrained(
model_name_or_path=model_path,
config=config,
@@ -291,10 +294,13 @@ def export_transformer_to_onnx(

# creates a SparseSession and apply structure from the model's recipe
recipe_path = os.path.join(model_path, RECIPE_FILE_NAME)
session_manager.create_session()
apply_recipe_structure_to_model(
model, recipe_path=recipe_path, model_path=model_path
)
if os.path.exists(recipe_path):
session_manager.create_session()
apply_recipe_structure_to_model(
model=model, recipe_path=recipe_path, model_path=model_path
)
else:
_LOGGER.warning(f"No input recipe {RECIPE_FILE_NAME} found in {model_path}.")

# create fake model input
inputs = tokenizer(
@@ -43,7 +43,7 @@ def opt_forward(model: Module, data_loader: List, device: str, nsamples: int = None
device=device,
nsamples=nsamples,
target_ids=["attention_mask"],
layer_prefix="decoder",
layer_prefix="model.decoder.layers",
)
buffer = [b[0] for b in cached_inputs.pop("inputs")]
for layer in model.model.decoder.layers:
12 changes: 9 additions & 3 deletions src/sparseml/transformers/utils/model.py
@@ -260,6 +260,9 @@ def text_generation_from_pretrained(
model_name_or_path
)

kwargs["config"].is_decoder = True
kwargs["config"].use_past = False

model = AutoModelForCausalLM.from_pretrained(
model_name_or_path,
**kwargs,
@@ -502,9 +505,12 @@ def auto_model_from_pretrained(
model_path, torch_dtype=torch_dtype
)
model.eval()
model.seqlen = (
sequence_length if sequence_length else model.config.max_position_embeddings
)
max_seq_len = None
if hasattr(model.config, "max_position_embeddings"):
max_seq_len = model.config.max_position_embeddings
elif hasattr(model.config, "max_seq_len"):
max_seq_len = model.config.max_seq_len
model.seqlen = sequence_length if sequence_length else max_seq_len
return model
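
The new lookup accommodates config classes that name their context length differently; Llama- and OPT-style configs expose `max_position_embeddings`, while some others expose `max_seq_len`. The same fallback as a standalone helper (a sketch; the stand-in configs below are not real model configs):
```
from types import SimpleNamespace

def resolve_max_seq_len(config):
    # try the known attribute names in order; None if neither exists
    for attr in ("max_position_embeddings", "max_seq_len"):
        if hasattr(config, attr):
            return getattr(config, attr)
    return None

print(resolve_max_seq_len(SimpleNamespace(max_position_embeddings=2048)))  # 2048
print(resolve_max_seq_len(SimpleNamespace(max_seq_len=4096)))              # 4096
print(resolve_max_seq_len(SimpleNamespace()))                              # None
```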


2 changes: 1 addition & 1 deletion tests/sparseml/core/lifecycle/test_session.py
@@ -27,7 +27,7 @@


def recipe_with_layer_prefix():
layer_prefix = "decoder"
layer_prefix = "model.decoder.layers"
recipe = f"""
metadata:
target_model:
