Merge branch 'main' into wanda
rahul-tuli authored Dec 18, 2023
2 parents 854fd69 + 27ae625 commit 1d1aaca
Showing 9 changed files with 82 additions and 22 deletions.
14 changes: 14 additions & 0 deletions README.md
@@ -59,6 +59,20 @@ SparseML is an open-source model optimization toolkit that enables you to create
<img alt="SparseML Flow" src="docs/images/sparseml-workflow.png" width="60%" />
</p>

## ✨NEW✨ SparseML One-Shot LLM Compression

Neural Magic is excited to preview one-shot LLM compression workflows using the new `SparseGPTModifier`!

Pruning and quantizing a TinyLlama Chat model takes just a few steps: install the dependencies, download a recipe, and apply it to the model:
```
git clone https://github.com/neuralmagic/sparseml
pip install -e "sparseml[transformers]"
wget https://huggingface.co/neuralmagic/TinyLlama-1.1B-Chat-v0.4-pruned50-quant-ds/raw/main/recipe.yaml
python sparseml/src/sparseml/transformers/sparsification/obcq/obcq.py TinyLlama/TinyLlama-1.1B-Chat-v0.4 open_platypus --recipe recipe.yaml --save True
```

The README at [`src/sparseml/transformers/sparsification/obcq`](https://github.com/neuralmagic/sparseml/tree/main/src/sparseml/transformers/sparsification/obcq) has a detailed walkthrough.
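
For reference, such a recipe is a small YAML file. A minimal sketch of what one might contain; the `SparseGPTModifier` parameters and values shown here are illustrative, not the exact contents of the downloaded recipe:
```
test_stage:
  obcq_modifiers:
    SparseGPTModifier:
      sparsity: 0.5     # prune half the weights in each targeted layer
      block_size: 128   # column block size used during one-shot solving
      quantize: true    # also apply one-shot quantization
```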

## Workflows

SparseML enables you to create a sparse model trained on your dataset in two ways:
10 changes: 10 additions & 0 deletions src/sparseml/core/recipe/recipe.py
@@ -13,6 +13,7 @@
# limitations under the License.

import json
import logging
import os
from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Union
@@ -30,6 +31,8 @@

__all__ = ["Recipe", "RecipeTuple"]

_LOGGER = logging.getLogger(__name__)


class Recipe(RecipeBase):
"""
@@ -72,8 +75,15 @@ def create_instance(path: str) -> "Recipe":
raise NotImplementedError("Using SparseZoo stubs is not yet supported")
else:
# assume it's a string
_LOGGER.warning(
"Could not process input as a file path or zoo stub, "
"attempting to process it as a string."
)
_LOGGER.warning(f"Input string: {path}")
obj = _load_json_or_yaml_string(path)
return Recipe.parse_obj(obj)
else:
_LOGGER.info(f"Loading recipe from file {path}")

with open(path, "r") as file:
content = file.read().strip()
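
With the new logging in place, each input path of `Recipe.create_instance` is announced before parsing. A minimal usage sketch, assuming `Recipe` is re-exported from `sparseml.core.recipe` and that the inline string below is a valid serialized recipe (its contents are illustrative):
```
from sparseml.core.recipe import Recipe

# a path to an existing file logs "Loading recipe from file ..." at INFO
recipe = Recipe.create_instance("recipe.yaml")

# anything that is neither a file path nor a zoo stub triggers the two
# new warnings, then is parsed as a raw YAML/JSON recipe string
inline = """
test_stage:
  obcq_modifiers: {}
"""
recipe = Recipe.create_instance(inline)
```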
44 changes: 33 additions & 11 deletions src/sparseml/modifiers/obcq/utils/helpers.py
@@ -13,18 +13,17 @@
# limitations under the License.

import logging
import operator
from collections import defaultdict
from math import ceil
from typing import List, Optional

import torch
from torch.nn.modules.sparse import Embedding


_LOGGER = logging.getLogger(__name__)
_DEFAULT_TARGET_IDS = [
"attention_mask",
"position_ids",
]
_DEFAULT_TARGET_IDS = ["attention_mask", "position_ids", "position_bias"]


class Catcher(torch.nn.Module):
@@ -127,24 +126,36 @@ def execute_offloaded_module(
def cache_attention_inputs(
model, dataloader, device, nsamples, target_ids, layer_prefix
):
if layer_prefix:
embed_tokens = getattr(model.model, layer_prefix).embed_tokens
first_layer = getattr(model.model, layer_prefix).layers[0]
if layer_prefix: # get model-specific path to layers list
split_prefix = layer_prefix.split(".")
layers_name = split_prefix[-1]
model_root_name = ".".join(split_prefix[:-1])
model_root = operator.attrgetter(model_root_name)(model)
first_layer = getattr(model_root, layers_name)[0]
else:
embed_tokens = model.model.embed_tokens
first_layer = model.model.layers[0]
embed_tokens.to(device)
model_root = model.model
layers_name = "layers"
first_layer = model_root.layers[0]

# send everything up to the first compressible layer to device
pre_layers_modules = _get_pre_layer_modules(model_root, layers_name)
for pre_layer in pre_layers_modules:
pre_layer.to(device)
first_layer.to(device)

cached_inputs = catch(
model=model,
attention_layer=first_layer,
target_keys=target_ids,
data_loader=dataloader,
nsamples=nsamples,
)
embed_tokens.cpu()

for pre_layer in pre_layers_modules:
pre_layer.cpu()
first_layer.cpu()
torch.cuda.empty_cache()

return cached_inputs


@@ -191,3 +202,14 @@ def ppl_eval_general(
_LOGGER.info(f"Perplexity: {ppl.item():.3f}")

return ppl.item()


def _get_pre_layer_modules(model_root, layers_name):
pre_layers_modules = []
for name, layer in model_root.named_modules():
if name.startswith(layers_name):
break
if isinstance(layer, Embedding):
pre_layers_modules.append(layer)

return pre_layers_modules
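
The refactor above replaces a hard-coded `embed_tokens` lookup with a dotted `layer_prefix` resolved via `operator.attrgetter`, which follows nested attribute paths. A self-contained sketch with stand-in classes (class names here are illustrative; the real objects are Hugging Face modules):
```
import operator

class Decoder:
    def __init__(self):
        self.layers = ["layer0", "layer1"]

class Inner:
    def __init__(self):
        self.decoder = Decoder()

class Model:
    def __init__(self):
        self.model = Inner()

model = Model()
layer_prefix = "model.decoder.layers"  # as in the updated example.yaml

# split off the final attribute, walk the rest with attrgetter
root_name, _, layers_name = layer_prefix.rpartition(".")
model_root = operator.attrgetter(root_name)(model)  # walks model.model.decoder
print(getattr(model_root, layers_name)[0])          # -> layer0
```
The helper uses `split(".")` and `join` rather than `rpartition`, but the effect is the same: any model-specific nesting can now be expressed in the recipe's `layer_prefix` string.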
4 changes: 3 additions & 1 deletion src/sparseml/transformers/data/base_llm.py
@@ -38,7 +38,9 @@ def __init__(
shuffle: bool = True,
**kwargs,
):
self.tokenizer = AutoTokenizer.from_pretrained(model, use_fast=True)
self.tokenizer = AutoTokenizer.from_pretrained(
model, use_fast=True, trust_remote_code=True
)
self._nsamples = nsamples
self._seqlen = seqlen
self._use_max_tokens = use_max_tokens
2 changes: 1 addition & 1 deletion src/sparseml/transformers/sparsification/obcq/example.yaml
@@ -1,6 +1,6 @@
metadata:
target_model:
layer_prefix: "decoder"
layer_prefix: "model.decoder.layers"
architecture: "opt"

test_stage:
14 changes: 10 additions & 4 deletions src/sparseml/transformers/sparsification/obcq/export.py
@@ -146,6 +146,9 @@ def load_task_model(
)

if task == "text-generation":
# Export decoder model without kv cache support
config.use_cache = False

return SparseAutoModel.text_generation_from_pretrained(
model_name_or_path=model_path,
config=config,
@@ -291,10 +294,13 @@ def export_transformer_to_onnx(

# creates a SparseSession and apply structure from the model's recipe
recipe_path = os.path.join(model_path, RECIPE_FILE_NAME)
session_manager.create_session()
apply_recipe_structure_to_model(
model, recipe_path=recipe_path, model_path=model_path
)
if os.path.exists(recipe_path):
session_manager.create_session()
apply_recipe_structure_to_model(
model=model, recipe_path=recipe_path, model_path=model_path
)
else:
_LOGGER.warning(f"No input recipe {RECIPE_FILE_NAME} found in {model_path}.")

# create fake model input
inputs = tokenizer(
@@ -43,7 +43,7 @@ def opt_forward(model: Module, data_loader: List, device: str, nsamples: int = None
device=device,
nsamples=nsamples,
target_ids=["attention_mask"],
layer_prefix="decoder",
layer_prefix="model.decoder.layers",
)
buffer = [b[0] for b in cached_inputs.pop("inputs")]
for layer in model.model.decoder.layers:
12 changes: 9 additions & 3 deletions src/sparseml/transformers/utils/model.py
@@ -260,6 +260,9 @@ def text_generation_from_pretrained(
model_name_or_path
)

kwargs["config"].is_decoder = True
kwargs["config"].use_past = False

model = AutoModelForCausalLM.from_pretrained(
model_name_or_path,
**kwargs,
@@ -502,9 +505,12 @@ def auto_model_from_pretrained(
model_path, torch_dtype=torch_dtype
)
model.eval()
model.seqlen = (
sequence_length if sequence_length else model.config.max_position_embeddings
)
max_seq_len = None
if hasattr(model.config, "max_position_embeddings"):
max_seq_len = model.config.max_position_embeddings
elif hasattr(model.config, "max_seq_len"):
max_seq_len = model.config.max_seq_len
model.seqlen = sequence_length if sequence_length else max_seq_len
return model
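
The new lookup accommodates config classes that name their context length differently; Llama- and OPT-style configs expose `max_position_embeddings`, while some others expose `max_seq_len`. The same fallback as a standalone helper (a sketch; the stand-in configs below are not real model configs):
```
from types import SimpleNamespace

def resolve_max_seq_len(config):
    # try the known attribute names in order; None if neither exists
    for attr in ("max_position_embeddings", "max_seq_len"):
        if hasattr(config, attr):
            return getattr(config, attr)
    return None

print(resolve_max_seq_len(SimpleNamespace(max_position_embeddings=2048)))  # 2048
print(resolve_max_seq_len(SimpleNamespace(max_seq_len=4096)))              # 4096
print(resolve_max_seq_len(SimpleNamespace()))                              # None
```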


2 changes: 1 addition & 1 deletion tests/sparseml/core/lifecycle/test_session.py
@@ -27,7 +27,7 @@


def recipe_with_layer_prefix():
layer_prefix = "decoder"
layer_prefix = "model.decoder.layers"
recipe = f"""
metadata:
target_model:
