Merge branch 'feature/damian/export_samples' of github.com:neuralmagic/sparseml into feature/damian/export_samples
dbogunowicz committed Dec 8, 2023
2 parents ff52598 + ec71402 commit 442485b
Showing 11 changed files with 394 additions and 11 deletions.
3 changes: 2 additions & 1 deletion src/sparseml/transformers/question_answering.py
@@ -775,7 +775,8 @@ def prepare_validation_features(examples):
examples[context_column_name if pad_on_right else question_column_name],
truncation="only_second" if pad_on_right else "only_first",
max_length=max_seq_length,
stride=data_args.doc_stride,
# TODO: Has this become deprecated?
#stride=data_args.doc_stride,
return_overflowing_tokens=True,
return_offsets_mapping=True,
padding="max_length" if data_args.pad_to_max_length else False,
13 changes: 13 additions & 0 deletions src/sparseml/transformers/refactor_utils/__init__.py
@@ -0,0 +1,13 @@
# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
24 changes: 24 additions & 0 deletions src/sparseml/transformers/refactor_utils/export_samples.py
@@ -0,0 +1,24 @@
import logging

from transformers import AutoTokenizer

from sparseml.transformers.sparsification.trainer import Trainer


__all__ = ["export_samples"]

_LOGGER = logging.getLogger(__name__)


def export_samples(
    trainer: Trainer,
    tokenizer: AutoTokenizer,
    num_samples: int,
    real_samples: bool = False,
):
    """
    Export `num_samples` sample inputs/outputs through the given trainer.
    If `real_samples` is True, samples come from the trainer's evaluation
    dataset; otherwise fake samples are generated.
    """
    _LOGGER.info(f"Exporting {num_samples} sample inputs/outputs")
    if real_samples:
        try:
            trainer.get_eval_dataloader()
        except Exception as exception:
            raise ValueError(
                "The trainer does not contain an evaluation dataloader. "
                "Either set `real_samples = False` to generate fake samples "
                "or initialize the trainer with the `eval_dataset` argument."
            ) from exception

    trainer.save_sample_inputs_outputs(
        num_samples_to_export=num_samples,
        tokenizer=tokenizer,
    )
    _LOGGER.info(f"{num_samples} sample inputs/outputs exported")
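
For orientation, a minimal usage sketch of the helper above (not part of the commit); the `trainer` and `tokenizer` objects are assumed to come from an existing SparseML training setup.

# Hypothetical sketch, not part of this diff: `trainer` and `tokenizer` are
# assumed to already exist from a SparseML question-answering setup.
from sparseml.transformers.refactor_utils.export_samples import export_samples

# Export 20 generated (fake) samples
export_samples(trainer=trainer, tokenizer=tokenizer, num_samples=20)

# Export real samples; requires the trainer to have been built with an
# eval_dataset, otherwise a ValueError is raised
export_samples(
    trainer=trainer, tokenizer=tokenizer, num_samples=20, real_samples=True
)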


131 changes: 131 additions & 0 deletions src/sparseml/transformers/refactor_utils/initialize_model.py
@@ -0,0 +1,131 @@
# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Functionality for initializing a transformer model from a given path
"""
# TODO: Add docstrings

import logging
import math
import os
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Union, Optional

from transformers import AutoConfig, AutoTokenizer, TrainingArguments

from sparseml.transformers.sparsification import Trainer
from sparseml.transformers.utils.model import TransformerModelsRegistry


__all__ = ["initialize_transformer_model"]

_LOGGER = logging.getLogger(__name__)


@dataclass
class ForceCPUTrainingArguments(TrainingArguments):
    @property
    def place_model_on_device(self):
        # TODO: Observe how this setting influences memory consumption
        # The property governs whether or not to automatically place
        # the model on the device. Setting to False ensures that the
        # model remains on CPU during ONNX export
        return False


def initialize_transformer_model(
    model_path: Union[str, Path],
    sequence_length: int,
    task: str,
    trust_remote_code: bool = False,
    **config_args,
):
    config = initialize_config(model_path, trust_remote_code, **config_args)
    tokenizer = initialize_tokenizer(model_path, task, sequence_length)
    model = TransformerModelsRegistry.load_from_registry(task)(
        **dict(
            model_name_or_path=model_path,
            model_type="model",
            config=config,
            trust_remote_code=trust_remote_code,
        )
    )
    model.train()
    trainer = initialize_trainer(model, model_path)
    model.eval()

    _LOGGER.info(f"Loaded model, trainer config, and tokenizer from {model_path}")
    return model, trainer, config, tokenizer


def initialize_trainer(model: Any, model_path: Union[str, Path]) -> Trainer:
    training_args = TrainingArguments(output_dir=os.path.dirname(model_path))
    trainer = Trainer(
        model=model,
        args=training_args,
        model_state_path=model_path,
        # TODO: Do we need eval_dataset?
        # eval_dataset=eval_dataset,
        recipe=None,
        recipe_args=None,
        teacher=None,
    )
    applied = trainer.apply_manager(epoch=math.inf, checkpoint=None)

    if not applied:
        _LOGGER.warning(
            f"No recipes were applied for {model_path}, "
            "check to make sure recipe(s) are stored in the model_path"
        )
    else:
        trainer.finalize_manager()
        num_stages = 0
        if trainer.manager:
            num_stages += trainer.manager.num_stages()
        if trainer.arch_manager:
            num_stages += trainer.arch_manager.num_stages()

        msg = (
            "an unstaged recipe"
            if num_stages == 1
            else f"a staged recipe with {num_stages} stages"
        )
        _LOGGER.info(f"Applied {msg} to the model at {model_path}")

    return trainer


def initialize_config(
    model_path: Union[str, Path], trust_remote_code: bool = False, **config_args
) -> AutoConfig:
    config = AutoConfig.from_pretrained(
        model_path,
        trust_remote_code=trust_remote_code,
        **config_args,
    )
    return config


def initialize_tokenizer(
    model_path: Union[str, Path], task: str, sequence_length: Optional[int] = None
) -> AutoTokenizer:
    tokenizer = AutoTokenizer.from_pretrained(
        model_path, model_max_length=sequence_length
    )
    if task == "text-generation":
        tokenizer.pad_token = tokenizer.eos_token
    return tokenizer
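
A hedged end-to-end sketch combining the two new modules above (not part of the commit); the checkpoint path and sequence length are placeholders.

# Hypothetical sketch, not part of this diff: the checkpoint path is a placeholder.
from sparseml.transformers.refactor_utils.export_samples import export_samples
from sparseml.transformers.refactor_utils.initialize_model import (
    initialize_transformer_model,
)

model, trainer, config, tokenizer = initialize_transformer_model(
    model_path="/path/to/sparse_qa_checkpoint",
    sequence_length=384,
    task="question-answering",
)

# The trainer built by initialize_trainer has no eval_dataset, so only
# fake samples can be exported here (real_samples defaults to False)
export_samples(trainer=trainer, tokenizer=tokenizer, num_samples=20)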
@@ -0,0 +1,52 @@
from typing import Any, Callable, Dict, Optional

from transformers import PreTrainedTokenizerBase

from sparseml.transformers.masked_language_modeling import DataTrainingArguments
from sparsezoo.utils.registry import RegistryMixin


TEXT_CLASSIFICATION_TASKS = [
    "sequence-classification",
    "glue",
    "sentiment-analysis",
    "text-classification",
]


class TaskDatasetRegistry(RegistryMixin):
    @classmethod
    def load_from_registry(cls, name: str) -> Callable[..., Any]:
        return cls.get_value_from_registry(name=name)


@TaskDatasetRegistry.register(name=["masked-language-modeling", "mlm"])
def dataset_function():
    from sparseml.transformers.masked_language_modeling import (
        get_tokenized_mlm_dataset,
    )

    return get_tokenized_mlm_dataset


@TaskDatasetRegistry.register(name=["question-answering", "qa"])
def dataset_function():  # noqa: F811
    from sparseml.transformers.question_answering import get_tokenized_qa_dataset

    return get_tokenized_qa_dataset


@TaskDatasetRegistry.register(name=["token-classification", "ner"])
def dataset_function():  # noqa: F811
    from sparseml.transformers.token_classification import (
        get_tokenized_token_classification_dataset,
    )

    return get_tokenized_token_classification_dataset


@TaskDatasetRegistry.register(name=TEXT_CLASSIFICATION_TASKS)
def dataset_function():  # noqa: F811
    from sparseml.transformers.text_classification import (
        get_tokenized_text_classification_dataset,
    )

    return get_tokenized_text_classification_dataset


def initialize_task_dataset(
    task: str,
    tokenizer: PreTrainedTokenizerBase,
    model: Optional[Any] = None,
    config: Optional[Any] = None,
    data_args: Optional[Dict[str, Any]] = None,
):
    # Avoid a mutable default argument; fall back to an empty config
    data_args = DataTrainingArguments(**(data_args or {}))
    tokenized_task_dataset = TaskDatasetRegistry.load_from_registry(task)()
    if task in TEXT_CLASSIFICATION_TASKS:
        return tokenized_task_dataset(
            tokenizer=tokenizer, model=model, config=config, data_args=data_args
        )
    return tokenized_task_dataset(tokenizer=tokenizer, data_args=data_args)
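
A usage sketch for the registry-backed dataset initializer above (not part of the commit); the dataset name and the DataTrainingArguments field names are assumptions rather than values taken from this diff.

# Hypothetical sketch, not part of this diff: the dataset name and data_args
# keys are assumed fields of the MLM DataTrainingArguments.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
tokenized_dataset = initialize_task_dataset(
    task="mlm",
    tokenizer=tokenizer,
    data_args={"dataset_name": "wikitext", "dataset_config_name": "wikitext-2-raw-v1"},
)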



17 changes: 8 additions & 9 deletions src/sparseml/transformers/text_classification.py
@@ -639,17 -639,16 @@ def compute_metrics(p: EvalPrediction):
kwargs["dataset"] = f"GLUE {data_args.task_name.upper()}"

# Exporting Samples

if data_args.num_export_samples > 0:
trainer.save_sample_inputs_outputs(
num_samples_to_export=data_args.num_export_samples
)


def _get_label_info(data_args, raw_datasets):
label_column = data_args.label_column_name
if data_args.task_name is not None:
is_regression = data_args.task_name == "stsb"
label_column = 'label'
if data_args.dataset_name is not None:
is_regression = data_args.dataset_name == "stsb"
is_multi_label_classification = False
if not is_regression:
label_list = raw_datasets["train"].features[label_column].names
@@ -706,7 +705,7 @@ def _get_tokenized_and_preprocessed_raw_datasets(
main_process_func = lambda desc: nullcontext(desc) # noqa: E731

# Preprocessing the datasets
if data_args.input_column_names is not None:
if False:
if "," in data_args.input_column_names:
# two input columns
columns = data_args.input_column_names.split(",")
@@ -720,8 +719,8 @@
# one input column
sentence1_key = data_args.input_column_names
sentence2_key = None
elif data_args.task_name is not None:
sentence1_key, sentence2_key = _TASK_TO_KEYS[data_args.task_name]
elif data_args.dataset_name is not None:
sentence1_key, sentence2_key = _TASK_TO_KEYS[data_args.dataset_name]
else:
# Again, we try to have some nice defaults but don't hesitate to tweak to your
# use case
@@ -883,9 +882,9 @@ def _get_raw_dataset(
#
# In distributed training, the load_dataset function guarantee that only one local
# process can concurrently download the dataset.
if data_args.task_name is not None:
if data_args.dataset_name is not None:
# Downloading and loading a dataset from the hub.
raw_datasets = load_dataset("glue", data_args.task_name, cache_dir=cache_dir)
raw_datasets = load_dataset("glue", data_args.dataset_name, cache_dir=cache_dir)
elif data_args.dataset_name is not None:
# Downloading and loading a dataset from the hub.
raw_datasets = load_dataset(
21 changes: 20 additions & 1 deletion src/sparseml/transformers/utils/model.py
@@ -15,7 +15,7 @@
import inspect
import logging
import os
from typing import Any, Dict, Optional, Tuple, Union
from typing import Any, Callable, Dict, Optional, Tuple, Union

import torch
from torch.nn import Module
@@ -30,6 +30,7 @@
from transformers.file_utils import WEIGHTS_NAME

from sparseml.pytorch.utils import ModuleSparsificationInfo
from sparsezoo.utils.registry import RegistryMixin


__all__ = ["SparseAutoModel", "get_shared_tokenizer_src"]
@@ -38,12 +39,19 @@
_LOGGER = logging.getLogger(__name__)


class TransformerModelsRegistry(RegistryMixin):
    @classmethod
    def load_from_registry(cls, name: str) -> Callable[..., Any]:
        return cls.get_value_from_registry(name=name)


class SparseAutoModel:
"""
Factory class for creating sparse models using transformers AutoModel classes
"""

@staticmethod
@TransformerModelsRegistry.register(name=["masked-language-modeling", "mlm"])
def masked_language_modeling_from_pretrained(
model_name_or_path: str,
model_type: str,
@@ -114,6 +122,7 @@ def masked_language_modeling_from_pretrained_distil(
return model, teacher

@staticmethod
@TransformerModelsRegistry.register(name=["question-answering", "qa"])
def question_answering_from_pretrained(
model_name_or_path: str,
model_type: str,
@@ -176,6 +185,14 @@ def question_answering_from_pretrained_distil(
return model, teacher

@staticmethod
@TransformerModelsRegistry.register(
name=[
"sequence-classification",
"glue",
"sentiment-analysis",
"text-classification",
]
)
def text_classification_from_pretrained(
model_name_or_path: str,
model_type: str,
@@ -238,6 +255,7 @@ def text_classification_from_pretrained_distil(
return model, teacher

@staticmethod
@TransformerModelsRegistry.register(name="text-generation")
def text_generation_from_pretrained(
model_name_or_path: str,
model_type: str,
@@ -273,6 +291,7 @@ def text_generation_from_pretrained(
return model

@staticmethod
@TransformerModelsRegistry.register(name=["token-classification", "ner"])
def token_classification_from_pretrained(
model_name_or_path: str,
model_type: str,
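
A brief sketch of how the new registry decorators above are meant to be consumed (not part of the commit); the checkpoint path is a placeholder.

# Hypothetical sketch, not part of this diff: resolve a task alias to the
# matching SparseAutoModel constructor registered above.
from sparseml.transformers.utils.model import TransformerModelsRegistry

constructor = TransformerModelsRegistry.load_from_registry("question-answering")
model = constructor(
    model_name_or_path="/path/to/sparse_qa_checkpoint",  # placeholder path
    model_type="model",
)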
13 changes: 13 additions & 0 deletions tests/sparseml/transformers/refactor_utils/__init__.py
@@ -0,0 +1,13 @@
# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.