Merge branch 'feature/damian/export_samples' of github.com:neuralmagic/sparseml into feature/damian/export_samples
dbogunowicz committed Dec 8, 2023
2 parents ff52598 + ec71402 commit 442485b
Showing 11 changed files with 394 additions and 11 deletions.
3 changes: 2 additions & 1 deletion src/sparseml/transformers/question_answering.py
@@ -775,7 +775,8 @@ def prepare_validation_features(examples):
examples[context_column_name if pad_on_right else question_column_name],
truncation="only_second" if pad_on_right else "only_first",
max_length=max_seq_length,
stride=data_args.doc_stride,
# TODO: Has this become deprecated?
#stride=data_args.doc_stride,
return_overflowing_tokens=True,
return_offsets_mapping=True,
padding="max_length" if data_args.pad_to_max_length else False,
13 changes: 13 additions & 0 deletions src/sparseml/transformers/refactor_utils/__init__.py
@@ -0,0 +1,13 @@
# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
24 changes: 24 additions & 0 deletions src/sparseml/transformers/refactor_utils/export_samples.py
@@ -0,0 +1,24 @@
import logging

from transformers import AutoTokenizer

from sparseml.transformers.sparsification.trainer import Trainer


__all__ = ["export_samples"]

_LOGGER = logging.getLogger(__name__)


def export_samples(
    trainer: Trainer,
    tokenizer: AutoTokenizer,
    num_samples: int,
    real_samples: bool = False,
):
    """
    Export `num_samples` sample inputs/outputs through the given trainer.
    If `real_samples` is True, samples come from the trainer's evaluation
    dataset; otherwise fake samples are generated.
    """
    _LOGGER.info(f"Exporting {num_samples} sample inputs/outputs")
    if real_samples:
        try:
            trainer.get_eval_dataloader()
        except Exception as exception:
            raise ValueError(
                "The trainer does not contain an evaluation dataloader. "
                "Either set `real_samples = False` to generate fake samples "
                "or initialize the trainer with the `eval_dataset` argument."
            ) from exception

    trainer.save_sample_inputs_outputs(
        num_samples_to_export=num_samples,
        tokenizer=tokenizer,
    )
    _LOGGER.info(f"{num_samples} sample inputs/outputs exported")
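
For orientation, a minimal usage sketch of the helper above (not part of the commit); the `trainer` and `tokenizer` objects are assumed to come from an existing SparseML training setup.

# Hypothetical sketch, not part of this diff: `trainer` and `tokenizer` are
# assumed to already exist from a SparseML question-answering setup.
from sparseml.transformers.refactor_utils.export_samples import export_samples

# Export 20 generated (fake) samples
export_samples(trainer=trainer, tokenizer=tokenizer, num_samples=20)

# Export real samples; requires the trainer to have been built with an
# eval_dataset, otherwise a ValueError is raised
export_samples(
    trainer=trainer, tokenizer=tokenizer, num_samples=20, real_samples=True
)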


131 changes: 131 additions & 0 deletions src/sparseml/transformers/refactor_utils/initialize_model.py
@@ -0,0 +1,131 @@
# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Functionality for initializing a transformer model from a given path
"""
# TODO: Add docstrings

import logging
import math
import os
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Union, Optional

from transformers import AutoConfig, AutoTokenizer, TrainingArguments

from sparseml.transformers.sparsification import Trainer
from sparseml.transformers.utils.model import TransformerModelsRegistry


__all__ = ["initialize_transformer_model"]

_LOGGER = logging.getLogger(__name__)


@dataclass
class ForceCPUTrainingArguments(TrainingArguments):
    @property
    def place_model_on_device(self):
        # TODO: Observe how this setting influences memory consumption
        # The property governs whether or not to automatically place
        # the model on the device. Setting to False ensures that the
        # model remains on CPU during ONNX export
        return False


def initialize_transformer_model(
    model_path: Union[str, Path],
    sequence_length: int,
    task: str,
    trust_remote_code: bool = False,
    **config_args,
):
    config = initialize_config(model_path, trust_remote_code, **config_args)
    tokenizer = initialize_tokenizer(model_path, task, sequence_length)
    model = TransformerModelsRegistry.load_from_registry(task)(
        **dict(
            model_name_or_path=model_path,
            model_type="model",
            config=config,
            trust_remote_code=trust_remote_code,
        )
    )
    model.train()
    trainer = initialize_trainer(model, model_path)
    model.eval()

    _LOGGER.info(f"Loaded model, trainer config, and tokenizer from {model_path}")
    return model, trainer, config, tokenizer


def initialize_trainer(model: Any, model_path: Union[str, Path]) -> Trainer:
    training_args = TrainingArguments(output_dir=os.path.dirname(model_path))
    trainer = Trainer(
        model=model,
        args=training_args,
        model_state_path=model_path,
        # TODO: Do we need eval_dataset?
        # eval_dataset=eval_dataset,
        recipe=None,
        recipe_args=None,
        teacher=None,
    )
    applied = trainer.apply_manager(epoch=math.inf, checkpoint=None)

    if not applied:
        _LOGGER.warning(
            f"No recipes were applied for {model_path}, "
            "check to make sure recipe(s) are stored in the model_path"
        )
    else:
        trainer.finalize_manager()
        num_stages = 0
        if trainer.manager:
            num_stages += trainer.manager.num_stages()
        if trainer.arch_manager:
            num_stages += trainer.arch_manager.num_stages()

        msg = (
            "an unstaged recipe"
            if num_stages == 1
            else f"a staged recipe with {num_stages} stages"
        )
        _LOGGER.info(f"Applied {msg} to the model at {model_path}")

    return trainer


def initialize_config(
    model_path: Union[str, Path], trust_remote_code: bool = False, **config_args
) -> AutoConfig:
    config = AutoConfig.from_pretrained(
        model_path,
        trust_remote_code=trust_remote_code,
        **config_args,
    )
    return config


def initialize_tokenizer(
    model_path: Union[str, Path], task: str, sequence_length: Optional[int] = None
) -> AutoTokenizer:
    tokenizer = AutoTokenizer.from_pretrained(
        model_path, model_max_length=sequence_length
    )
    if task == "text-generation":
        tokenizer.pad_token = tokenizer.eos_token
    return tokenizer
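
A hedged end-to-end sketch combining the two new modules above (not part of the commit); the checkpoint path and sequence length are placeholders.

# Hypothetical sketch, not part of this diff: the checkpoint path is a placeholder.
from sparseml.transformers.refactor_utils.export_samples import export_samples
from sparseml.transformers.refactor_utils.initialize_model import (
    initialize_transformer_model,
)

model, trainer, config, tokenizer = initialize_transformer_model(
    model_path="/path/to/sparse_qa_checkpoint",
    sequence_length=384,
    task="question-answering",
)

# The trainer built by initialize_trainer has no eval_dataset, so only
# fake samples can be exported here (real_samples defaults to False)
export_samples(trainer=trainer, tokenizer=tokenizer, num_samples=20)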
@@ -0,0 +1,52 @@
from typing import Any, Callable, Dict, Optional

from transformers import PreTrainedTokenizerBase

from sparseml.transformers.masked_language_modeling import DataTrainingArguments
from sparsezoo.utils.registry import RegistryMixin


TEXT_CLASSIFICATION_TASKS = [
    "sequence-classification",
    "glue",
    "sentiment-analysis",
    "text-classification",
]


class TaskDatasetRegistry(RegistryMixin):
    @classmethod
    def load_from_registry(cls, name: str) -> Callable[..., Any]:
        return cls.get_value_from_registry(name=name)


@TaskDatasetRegistry.register(name=["masked-language-modeling", "mlm"])
def dataset_function():
    from sparseml.transformers.masked_language_modeling import (
        get_tokenized_mlm_dataset,
    )

    return get_tokenized_mlm_dataset


@TaskDatasetRegistry.register(name=["question-answering", "qa"])
def dataset_function():  # noqa: F811
    from sparseml.transformers.question_answering import get_tokenized_qa_dataset

    return get_tokenized_qa_dataset


@TaskDatasetRegistry.register(name=["token-classification", "ner"])
def dataset_function():  # noqa: F811
    from sparseml.transformers.token_classification import (
        get_tokenized_token_classification_dataset,
    )

    return get_tokenized_token_classification_dataset


@TaskDatasetRegistry.register(name=TEXT_CLASSIFICATION_TASKS)
def dataset_function():  # noqa: F811
    from sparseml.transformers.text_classification import (
        get_tokenized_text_classification_dataset,
    )

    return get_tokenized_text_classification_dataset


def initialize_task_dataset(
    task: str,
    tokenizer: PreTrainedTokenizerBase,
    model: Optional[Any] = None,
    config: Optional[Any] = None,
    data_args: Optional[Dict[str, Any]] = None,
):
    # Avoid a mutable default argument; fall back to an empty config
    data_args = DataTrainingArguments(**(data_args or {}))
    tokenized_task_dataset = TaskDatasetRegistry.load_from_registry(task)()
    if task in TEXT_CLASSIFICATION_TASKS:
        return tokenized_task_dataset(
            tokenizer=tokenizer, model=model, config=config, data_args=data_args
        )
    return tokenized_task_dataset(tokenizer=tokenizer, data_args=data_args)
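
A usage sketch for the registry-backed dataset initializer above (not part of the commit); the dataset name and the DataTrainingArguments field names are assumptions rather than values taken from this diff.

# Hypothetical sketch, not part of this diff: the dataset name and data_args
# keys are assumed fields of the MLM DataTrainingArguments.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
tokenized_dataset = initialize_task_dataset(
    task="mlm",
    tokenizer=tokenizer,
    data_args={"dataset_name": "wikitext", "dataset_config_name": "wikitext-2-raw-v1"},
)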



17 changes: 8 additions & 9 deletions src/sparseml/transformers/text_classification.py
@@ -639,17 -639,16 @@ def compute_metrics(p: EvalPrediction):
kwargs["dataset"] = f"GLUE {data_args.task_name.upper()}"

# Exporting Samples

if data_args.num_export_samples > 0:
trainer.save_sample_inputs_outputs(
num_samples_to_export=data_args.num_export_samples
)


def _get_label_info(data_args, raw_datasets):
label_column = data_args.label_column_name
if data_args.task_name is not None:
is_regression = data_args.task_name == "stsb"
label_column = 'label'
if data_args.dataset_name is not None:
is_regression = data_args.dataset_name == "stsb"
is_multi_label_classification = False
if not is_regression:
label_list = raw_datasets["train"].features[label_column].names
@@ -706,7 +705,7 @@ def _get_tokenized_and_preprocessed_raw_datasets(
main_process_func = lambda desc: nullcontext(desc) # noqa: E731

# Preprocessing the datasets
if data_args.input_column_names is not None:
if False:
if "," in data_args.input_column_names:
# two input columns
columns = data_args.input_column_names.split(",")
@@ -720,8 +719,8 @@
# one input column
sentence1_key = data_args.input_column_names
sentence2_key = None
elif data_args.task_name is not None:
sentence1_key, sentence2_key = _TASK_TO_KEYS[data_args.task_name]
elif data_args.dataset_name is not None:
sentence1_key, sentence2_key = _TASK_TO_KEYS[data_args.dataset_name]
else:
# Again, we try to have some nice defaults but don't hesitate to tweak to your
# use case
@@ -883,9 +882,9 @@ def _get_raw_dataset(
#
# In distributed training, the load_dataset function guarantee that only one local
# process can concurrently download the dataset.
if data_args.task_name is not None:
if data_args.dataset_name is not None:
# Downloading and loading a dataset from the hub.
raw_datasets = load_dataset("glue", data_args.task_name, cache_dir=cache_dir)
raw_datasets = load_dataset("glue", data_args.dataset_name, cache_dir=cache_dir)
elif data_args.dataset_name is not None:
# Downloading and loading a dataset from the hub.
raw_datasets = load_dataset(
21 changes: 20 additions & 1 deletion src/sparseml/transformers/utils/model.py
@@ -15,7 +15,7 @@
import inspect
import logging
import os
from typing import Any, Dict, Optional, Tuple, Union
from typing import Any, Callable, Dict, Optional, Tuple, Union

import torch
from torch.nn import Module
@@ -30,6 +30,7 @@
from transformers.file_utils import WEIGHTS_NAME

from sparseml.pytorch.utils import ModuleSparsificationInfo
from sparsezoo.utils.registry import RegistryMixin


__all__ = ["SparseAutoModel", "get_shared_tokenizer_src"]
@@ -38,12 +39,19 @@
_LOGGER = logging.getLogger(__name__)


class TransformerModelsRegistry(RegistryMixin):
    @classmethod
    def load_from_registry(cls, name: str) -> Callable[..., Any]:
        return cls.get_value_from_registry(name=name)


class SparseAutoModel:
"""
Factory class for creating sparse models using transformers AutoModel classes
"""

@staticmethod
@TransformerModelsRegistry.register(name=["masked-language-modeling", "mlm"])
def masked_language_modeling_from_pretrained(
model_name_or_path: str,
model_type: str,
@@ -114,6 +122,7 @@ def masked_language_modeling_from_pretrained_distil(
return model, teacher

@staticmethod
@TransformerModelsRegistry.register(name=["question-answering", "qa"])
def question_answering_from_pretrained(
model_name_or_path: str,
model_type: str,
@@ -176,6 +185,14 @@ def question_answering_from_pretrained_distil(
return model, teacher

@staticmethod
@TransformerModelsRegistry.register(
name=[
"sequence-classification",
"glue",
"sentiment-analysis",
"text-classification",
]
)
def text_classification_from_pretrained(
model_name_or_path: str,
model_type: str,
@@ -238,6 +255,7 @@ def text_classification_from_pretrained_distil(
return model, teacher

@staticmethod
@TransformerModelsRegistry.register(name="text-generation")
def text_generation_from_pretrained(
model_name_or_path: str,
model_type: str,
@@ -273,6 +291,7 @@ def text_generation_from_pretrained(
return model

@staticmethod
@TransformerModelsRegistry.register(name=["token-classification", "ner"])
def token_classification_from_pretrained(
model_name_or_path: str,
model_type: str,
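
A brief sketch of how the new registry decorators above are meant to be consumed (not part of the commit); the checkpoint path is a placeholder.

# Hypothetical sketch, not part of this diff: resolve a task alias to the
# matching SparseAutoModel constructor registered above.
from sparseml.transformers.utils.model import TransformerModelsRegistry

constructor = TransformerModelsRegistry.load_from_registry("question-answering")
model = constructor(
    model_name_or_path="/path/to/sparse_qa_checkpoint",  # placeholder path
    model_type="model",
)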
13 changes: 13 additions & 0 deletions tests/sparseml/transformers/refactor_utils/__init__.py
@@ -0,0 +1,13 @@
# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.