diff --git a/optimum/exporters/openvino/convert.py b/optimum/exporters/openvino/convert.py
index ffb2789da..858910527 100644
--- a/optimum/exporters/openvino/convert.py
+++ b/optimum/exporters/openvino/convert.py
@@ -20,6 +20,7 @@
 from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union

 import onnx
+from transformers.generation import GenerationMixin
 from transformers.utils import is_tf_available, is_torch_available

 from openvino.runtime import Model, save_model
@@ -40,6 +41,7 @@
     _torch_version,
     _transformers_version,
     compare_versions,
+    is_transformers_version,
 )
 from optimum.utils import DEFAULT_DUMMY_SHAPES, is_diffusers_available
 from optimum.utils.save_utils import maybe_save_preprocessors
@@ -379,7 +381,7 @@ def ts_patched_forward(*args, **kwargs):
         if stateful:
             # cannot raise because stateful is enabled by default and it would break backward compatibility for models that couldn't convert to OV directly
             # TODO: Implement stateful for ONNX path as well, not doing it right now because of lack of validation
-            logger.warn(
+            logger.warning(
                 "[ WARNING ] Making stateful models is not supported when exporting to ONNX as an intermediate step. "
                 "A stateless model will be exported instead. It may result in sub-optimal inference performance."
                 "Provide a model that can be converted to OpenVINO without fallback to ONNX conversion path."
@@ -622,6 +624,18 @@ def export_from_model(
         files_subpaths = ["openvino_" + model_name + ".xml" for model_name in models_and_export_configs.keys()]

     elif library_name != "diffusers":
+        if is_transformers_version(">=", "4.44.99"):
+            misplaced_generation_parameters = model.config._get_non_default_generation_parameters()
+            if isinstance(model, GenerationMixin) and len(misplaced_generation_parameters) > 0:
+                logger.warning(
+                    "Moving the following attributes in the config to the generation config: "
+                    f"{misplaced_generation_parameters}. You are seeing this warning because you've set "
+                    "generation parameters in the model config, as opposed to in the generation config.",
+                )
+                for param_name, param_value in misplaced_generation_parameters.items():
+                    setattr(model.generation_config, param_name, param_value)
+                    setattr(model.config, param_name, None)
+
         # Saving the model config and preprocessor as this is needed sometimes.
         model.config.save_pretrained(output)
         generation_config = getattr(model, "generation_config", None)
diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py
index 8d9a96ff5..c71edf83c 100644
--- a/optimum/exporters/openvino/model_patcher.py
+++ b/optimum/exporters/openvino/model_patcher.py
@@ -45,7 +45,7 @@
     from transformers.modeling_tf_utils import TFPreTrainedModel


-BETTERTRANSFORMER_IGNORE = ("codegen",)
+BETTERTRANSFORMER_IGNORE = ("codegen", "gpt_neo")


 def patch_model_with_bettertransformer(model):
@@ -57,7 +57,7 @@ def patch_model_with_bettertransformer(model):
         return model

     if is_transformers_version("<", "4.36") or is_torch_version("<", "2.1.1"):
-        log.warn(
+        log.warning(
             COLOR_RED
             + "[WARNING] For good performance with stateful models, transformers>=4.36.2 and PyTorch>=2.1.1 are required. "
             f"This Python environment has Transformers {_transformers_version} and PyTorch {_torch_version}. "
@@ -75,7 +75,7 @@ def patch_model_with_bettertransformer(model):
         display_version = (
             _openvino_version.split("-")[0] if is_openvino_version("<=", "2024.0.0-14509") else _openvino_version
         )
-        log.warn(
+        log.warning(
             COLOR_RED
             + f"[WARNING] Stateful models are not supported for Llama, Gemma and GPTBigCode with Transformers "
             f"{_transformers_version} and OpenVINO {display_version}. For good performance, consider using a nightly OpenVINO build: "
@@ -93,7 +93,7 @@ def patch_model_with_bettertransformer(model):
     try:
         model = model.to_bettertransformer()
     except Exception as e:
-        log.warn(
+        log.warning(
             f"Cannot apply model.to_bettertransformer because of the exception:\n{e}."
             " Usage model with stateful=True may be non-effective if model does not contain torch.functional.scaled_dot_product_attention"
         )
@@ -168,7 +168,8 @@ def __enter__(self):
             layer.block_sparse_moe.forward = types.MethodType(
                 _mixtral_sparse_moe_block_forward, layer.block_sparse_moe
             )
-            _reinitialize_cos_sin_cached_fp32(layer.self_attn.rotary_emb)
+            if is_transformers_version("<", "4.44.99"):
+                _reinitialize_cos_sin_cached_fp32(layer.self_attn.rotary_emb)

     def __exit__(self, exc_type, exc_value, traceback):
         super().__exit__(exc_type, exc_value, traceback)
@@ -2151,6 +2152,7 @@ def _persimmon_self_attn_sdpa_forward(
     output_attentions: bool = False,
     use_cache: bool = False,
     cache_position: Optional[torch.LongTensor] = None,
+    position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
 ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
     from transformers.models.persimmon.modeling_persimmon import apply_rotary_pos_emb

@@ -2176,25 +2178,42 @@ def _persimmon_self_attn_sdpa_forward(
     value_states = value_states.transpose(1, 2)
     key_states = key_states.transpose(1, 2)

-    kv_seq_len = key_states.shape[-2]
-    if past_key_value is not None:
-        if self.layer_idx is None:
-            raise ValueError(
-                f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
-                "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
-                "with a layer index."
+    if is_transformers_version("<", "4.44.99"):
+        kv_seq_len = key_states.shape[-2]
+        if past_key_value is not None:
+            if self.layer_idx is None:
+                raise ValueError(
+                    f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
+                    "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
+                    "with a layer index."
+                )
+            kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
+        cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
+    else:
+        if position_embeddings is None:
+            log.warning(
+                "The attention layers in this model are transitioning from computing the RoPE embeddings internally "
+                "through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed "
+                "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.46 `position_ids` will be "
+                "removed and `position_embeddings` will be mandatory."
             )
-        kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
-    cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
+            cos, sin = self.rotary_emb(value_states, position_ids)
+        else:
+            cos, sin = position_embeddings
+
+    if is_transformers_version("<", "4.44.99"):
+        rotary_ndims = self.rotary_emb.dim
+    else:
+        rotary_ndims = self.rotary_ndims

     # Partial rotary embedding
     query_rot, query_pass = (
-        query_states[..., : self.rotary_emb.dim],
-        query_states[..., self.rotary_emb.dim :],
+        query_states[..., :rotary_ndims],
+        query_states[..., rotary_ndims:],
     )
     key_rot, key_pass = (
-        key_states[..., : self.rotary_emb.dim],
-        key_states[..., self.rotary_emb.dim :],
+        key_states[..., :rotary_ndims],
+        key_states[..., rotary_ndims:],
     )
     # [batch_size, seq_length, num_heads, head_dim // config.partial_rotary_factor]
     query_rot, key_rot = apply_rotary_pos_emb(query_rot, key_rot, cos, sin, position_ids)
@@ -2208,7 +2227,7 @@ def _persimmon_self_attn_sdpa_forward(
         cache_kwargs = {
             "sin": sin,
             "cos": cos,
-            "partial_rotation_size": self.rotary_emb.dim,
+            "partial_rotation_size": rotary_ndims,
             "cache_position": cache_position,
         }
         key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
@@ -2244,7 +2263,8 @@ def __enter__(self):
             orig_self_attn_fwd = layer.self_attn.forward
             layer.self_attn.forward = types.MethodType(_persimmon_self_attn_sdpa_forward, layer.self_attn)
             layer.self_attn._orig_forward = orig_self_attn_fwd
-            _reinitialize_cos_sin_cached_fp32(layer.self_attn.rotary_emb)
+            if is_transformers_version("<", "4.44.99"):
+                _reinitialize_cos_sin_cached_fp32(layer.self_attn.rotary_emb)

     def __exit__(self, exc_type, exc_value, traceback):
         super().__exit__(exc_type, exc_value, traceback)
@@ -2393,29 +2413,33 @@ def __exit__(self, exc_type, exc_value, traceback):
 class RotaryEmbPatcher(DecoderModelPatcher):
     def __enter__(self):
         super().__enter__()
-        for layer in self._model.model.layers:
-            _reinitialize_cos_sin_cached_fp32(layer.self_attn.rotary_emb)
+        if is_transformers_version("<", "4.44.99"):
+            for layer in self._model.model.layers:
+                _reinitialize_cos_sin_cached_fp32(layer.self_attn.rotary_emb)


 class FalconModelPatcher(DecoderModelPatcher):
     def __enter__(self):
         super().__enter__()
-        for layer in self._model.transformer.h:
-            _reinitialize_cos_sin_cached_fp32(layer.self_attention.rotary_emb)
+        if is_transformers_version("<", "4.44.99"):
+            for layer in self._model.transformer.h:
+                _reinitialize_cos_sin_cached_fp32(layer.self_attention.rotary_emb)


 class GptNeoxModelPatcher(DecoderModelPatcher):
     def __enter__(self):
         super().__enter__()
-        for layer in self._model.gpt_neox.layers:
-            _reinitialize_cos_sin_cached_fp32(layer.attention.rotary_emb)
+        if is_transformers_version("<", "4.44.99"):
+            for layer in self._model.gpt_neox.layers:
+                _reinitialize_cos_sin_cached_fp32(layer.attention.rotary_emb)


 class GptNeoxJapaneseModelPatcher(DecoderModelPatcher):
     def __enter__(self):
         super().__enter__()
-        for layer in self._model.gpt_neox_japanese.layers:
-            _reinitialize_cos_sin_cached_fp32(layer.attention.rotary_emb)
+        if is_transformers_version("<", "4.44.99"):
+            for layer in self._model.gpt_neox_japanese.layers:
+                _reinitialize_cos_sin_cached_fp32(layer.attention.rotary_emb)


 class Gemma2ModelPatcher(LlamaModelPatcher):
diff --git a/optimum/exporters/openvino/stateful.py b/optimum/exporters/openvino/stateful.py
index f05cc23cd..fa5a2a898 100644
--- a/optimum/exporters/openvino/stateful.py
+++ b/optimum/exporters/openvino/stateful.py
@@ -151,7 +151,7 @@ def make_stateful(
                 shape[0] = num_beams_and_batch
                 input.get_node().set_partial_shape(shape)
             else:
-                log.warn(f"Rank of {input.get_any_name()} input of the model is not 2, batch size is not set")
+                log.warning(f"Rank of {input.get_any_name()} input of the model is not 2, batch size is not set")

     for kv_name_pair in zip(key_value_input_names, key_value_output_names):
         input_output_map[kv_name_pair[0]] = kv_name_pair[1]
@@ -176,7 +176,7 @@ def ensure_stateful_is_available(warn=True):
     """
     if is_openvino_version("<", "2023.3"):
         if warn:
-            log.warn(
+            log.warning(
                 f"Could not create or use stateful model when using old version of openvino=={_openvino_version}. It may result in sub-optimal inference performance."
                 "Install openvino>=2023.3.0."
             )
diff --git a/optimum/intel/neural_compressor/modeling_base.py b/optimum/intel/neural_compressor/modeling_base.py
index 392d84b47..6f6c700d8 100644
--- a/optimum/intel/neural_compressor/modeling_base.py
+++ b/optimum/intel/neural_compressor/modeling_base.py
@@ -46,10 +46,8 @@
 from transformers.utils import SAFE_WEIGHTS_NAME, WEIGHTS_NAME
 from transformers.utils.generic import ContextManagers

-from optimum.intel.generation import BaseModelForCausalLM
-
 from ...modeling_base import OptimizedModel
-from ..utils.import_utils import _torch_version, is_torch_version
+from ..utils.import_utils import _torch_version, is_torch_version, is_transformers_version
 from .configuration import INCConfig
 from .quantization import _weight_only_quantization
 from .utils import QUANTIZATION_CONFIG_NAME
@@ -85,6 +83,8 @@ def __init__(
         inc_config: Dict = None,
         **kwargs,
     ):
+        generation_config = kwargs.pop("generation_config", None)
+
         super().__init__(model=model, config=config, **kwargs)
         self.inc_config = inc_config
         self._q_config = q_config
@@ -92,7 +92,23 @@ def __init__(
         self._device = getattr(self.model, "device", None) or torch.device(
             "cuda:0" if torch.cuda.is_available() else "cpu"
         )
-        self.generation_config = GenerationConfig.from_model_config(config)
+        if self.can_generate():
+            self.generation_config = generation_config or GenerationConfig.from_model_config(config)
+
+            if is_transformers_version(">=", "4.44.99"):
+                misplaced_generation_parameters = self.config._get_non_default_generation_parameters()
+                if len(misplaced_generation_parameters) > 0:
+                    logger.warning(
+                        "Moving the following attributes in the config to the generation config: "
+                        f"{misplaced_generation_parameters}. You are seeing this warning because you've set "
+                        "generation parameters in the model config, as opposed to in the generation config.",
+                    )
+                    for param_name, param_value in misplaced_generation_parameters.items():
+                        setattr(self.generation_config, param_name, param_value)
+                        setattr(self.config, param_name, None)
+
+        else:
+            self.generation_config = None

         # Registers the INCModelForXXX classes into the transformers AutoModel classes to avoid warnings when creating
         # a pipeline https://github.com/huggingface/transformers/blob/cad61b68396a1a387287a8e2e2fef78a25b79383/src/transformers/pipelines/base.py#L863
@@ -126,9 +142,29 @@ def _from_pretrained(
             token = use_auth_token

         quantization_config = kwargs.pop("quantization_config", None)
+        generation_config = kwargs.pop("generation_config", None)
+
         model_path = Path(model_id)
         is_local = model_path.is_dir()

+        if generation_config is None and "text-generation" in cls.export_feature:
+            try:
+                generation_config = GenerationConfig.from_pretrained(
+                    model_id,
+                    cache_dir=cache_dir,
+                    force_download=force_download,
+                    local_files_only=local_files_only,
+                    token=token,
+                    revision=revision,
+                    subfolder=subfolder,
+                )
+                if getattr(generation_config, "cache_implementation", None) is not None:
+                    generation_config.cache_implementation = None
+            except OSError:
+                logger.info(
+                    "Generation config file not found, using a generation config created from the model config."
+                )
+
         # ITREX compatibility
         quantization_config_path = None
         if is_local:
@@ -202,7 +238,7 @@ def _from_pretrained(
                 **kwargs,
             )

-            return cls(model, config=config, model_save_dir=None, **kwargs).model
+            return cls(model, config=config, model_save_dir=None, generation_config=generation_config, **kwargs).model

         model_cache_path = None
         inc_config = None
@@ -261,7 +297,14 @@ def _from_pretrained(
             )
             model = torch.jit.load(model_cache_path)
             model = torch.jit.freeze(model.eval())
-            return cls(model, config=config, model_save_dir=model_save_dir, inc_config=inc_config, **kwargs)
+            return cls(
+                model,
+                config=config,
+                model_save_dir=model_save_dir,
+                inc_config=inc_config,
+                generation_config=generation_config,
+                **kwargs,
+            )

         model_class = _get_model_class(config, cls.auto_model_class._model_mapping)
         # Load the state dictionary of the model to verify whether the model to get the quantization config
@@ -283,7 +326,13 @@ def _from_pretrained(
             raise

         return cls(
-            model, config=config, model_save_dir=model_save_dir, q_config=q_config, inc_config=inc_config, **kwargs
+            model,
+            config=config,
+            model_save_dir=model_save_dir,
+            q_config=q_config,
+            inc_config=inc_config,
+            generation_config=generation_config,
+            **kwargs,
         )

     def _save_pretrained(self, save_directory: Union[str, Path]):
@@ -304,6 +353,14 @@ def _save_pretrained(self, save_directory: Union[str, Path]):
         if self.inc_config:
             self.inc_config.save_pretrained(save_directory)

+        if self.generation_config is not None:
+            try:
+                self.generation_config.save_pretrained(save_directory)
+            except Exception as exception:
+                logger.warning(
+                    f"The generation config will not be saved, saving failed with following error:\n{exception}"
+                )
+
     def forward(self, *args, **kwargs):
         return self.model(*args, **kwargs)

@@ -366,29 +423,6 @@ class INCModelForVision2Seq(INCModel):
     export_feature = "image-to-text"


-class INCModelForCausalLM(INCModel, BaseModelForCausalLM):
+class INCModelForCausalLM(INCModel):
     auto_model_class = AutoModelForCausalLM
     export_feature = "text-generation"
-    forward = BaseModelForCausalLM.forward
-    generate = BaseModelForCausalLM.generate
-    can_generate = BaseModelForCausalLM.can_generate
-
-    def __init__(
-        self,
-        model,
-        config: PretrainedConfig = None,
-        model_save_dir: Optional[Union[str, Path, TemporaryDirectory]] = None,
-        q_config: Dict = None,
-        inc_config: Dict = None,
-        use_cache: bool = True,
-        **kwargs,
-    ):
-        super(INCModelForCausalLM, self).__init__(
-            model=model,
-            config=config,
-            model_save_dir=model_save_dir,
-            q_config=q_config,
-            inc_config=inc_config,
-            use_cache=use_cache,
-            **kwargs,
-        )
diff --git a/optimum/intel/neural_compressor/trainer.py b/optimum/intel/neural_compressor/trainer.py
index 5e64ca862..c0fe0cf6d 100644
--- a/optimum/intel/neural_compressor/trainer.py
+++ b/optimum/intel/neural_compressor/trainer.py
@@ -271,7 +271,18 @@ def _inner_training_loop(
         if not delay_optimizer_creation:
             self.create_optimizer_and_scheduler(num_training_steps=max_steps)

-        self.state = TrainerState()
+        if is_transformers_version(">=", "4.44.99"):
+            from transformers.trainer_callback import ExportableState
+
+            self.state = TrainerState(
+                stateful_callbacks=[
+                    cb for cb in self.callback_handler.callbacks + [self.control] if isinstance(cb, ExportableState)
+                ]
+            )
+
+        else:
+            self.state = TrainerState()
+
         self.state.is_hyper_param_search = trial is not None
         self.state.train_batch_size = self._train_batch_size
@@ -692,6 +703,21 @@ def _save(self, output_dir=None, state_dict=None):
         output_model_file = os.path.join(output_dir, WEIGHTS_NAME)

         # Save the config
+        if self.model.can_generate():
+            if is_transformers_version(">=", "4.44.99"):
+                misplaced_generation_parameters = self.model.config._get_non_default_generation_parameters()
+                if len(misplaced_generation_parameters) > 0:
+                    logger.warning(
+                        "Moving the following attributes in the config to the generation config: "
+                        f"{misplaced_generation_parameters}. You are seeing this warning because you've set "
+                        "generation parameters in the model config, as opposed to in the generation config.",
+                    )
+                    for param_name, param_value in misplaced_generation_parameters.items():
+                        setattr(self.model.generation_config, param_name, param_value)
+                        setattr(self.model.config, param_name, None)
+
+            self.model.generation_config.save_pretrained(output_dir)
+
         if self.model.config is not None:
             self.model.config.save_pretrained(output_dir)
diff --git a/optimum/intel/neural_compressor/trainer_seq2seq.py b/optimum/intel/neural_compressor/trainer_seq2seq.py
index 27540cfb1..ee8f21da5 100644
--- a/optimum/intel/neural_compressor/trainer_seq2seq.py
+++ b/optimum/intel/neural_compressor/trainer_seq2seq.py
@@ -124,6 +124,7 @@ def prediction_step(
         inputs: Dict[str, Union[torch.Tensor, Any]],
         prediction_loss_only: bool,
         ignore_keys: Optional[List[str]] = None,
+        **gen_kwargs,
     ) -> Tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]:
         """
         Perform an evaluation step on `model` using `inputs`.
@@ -155,17 +156,17 @@ def prediction_step(
         has_labels = "labels" in inputs
         inputs = self._prepare_inputs(inputs)

-        # XXX: adapt synced_gpus for fairscale as well
-        gen_kwargs = {
-            "max_length": self._max_length if self._max_length is not None else self.model.config.max_length,
-            "num_beams": self._num_beams if self._num_beams is not None else self.model.config.num_beams,
-            "synced_gpus": True if is_deepspeed_zero3_enabled() else False,
-        }
+        # Priority (handled in generate):
+        # non-`None` gen_kwargs > model.generation_config > default GenerationConfig()
+        if len(gen_kwargs) == 0 and hasattr(self, "_gen_kwargs"):
+            gen_kwargs = self._gen_kwargs.copy()
+        if "num_beams" in gen_kwargs and gen_kwargs["num_beams"] is None:
+            gen_kwargs.pop("num_beams")
+        if "max_length" in gen_kwargs and gen_kwargs["max_length"] is None:
+            gen_kwargs.pop("max_length")

-        if "attention_mask" in inputs:
-            gen_kwargs["attention_mask"] = inputs.get("attention_mask", None)
-        if "global_attention_mask" in inputs:
-            gen_kwargs["global_attention_mask"] = inputs.get("global_attention_mask", None)
+        if "synced_gpus" not in gen_kwargs:
+            gen_kwargs["synced_gpus"] = is_deepspeed_zero3_enabled()

         # prepare generation inputs
         # some encoder-decoder models can have varying encoder's and thus
@@ -176,14 +177,25 @@ def prediction_step(
             generation_inputs = inputs[self.model.main_input_name]

         generated_tokens = self.model.generate(generation_inputs, **gen_kwargs)
+
+        # Temporary hack to ensure the generation config is not initialized for each iteration of the evaluation loop
+        # TODO: remove this hack when the legacy code that initializes generation_config from a model config is
+        # removed in https://github.com/huggingface/transformers/blob/98d88b23f54e5a23e741833f1e973fdf600cc2c5/src/transformers/generation/utils.py#L1183
+        if self.model.generation_config._from_model_config:
+            self.model.generation_config._from_model_config = False
+
+        # Retrieves GenerationConfig from model.generation_config
+        gen_config = self.model.generation_config
         # in case the batch is shorter than max length, the output should be padded
-        if generated_tokens.shape[-1] < gen_kwargs["max_length"]:
-            generated_tokens = self._pad_tensors_to_max_len(generated_tokens, gen_kwargs["max_length"])
+        if generated_tokens.shape[-1] < gen_config.max_length:
+            generated_tokens = self._pad_tensors_to_max_len(generated_tokens, gen_config.max_length)
+        elif gen_config.max_new_tokens is not None and generated_tokens.shape[-1] < gen_config.max_new_tokens + 1:
+            generated_tokens = self._pad_tensors_to_max_len(generated_tokens, gen_config.max_new_tokens + 1)

         with torch.no_grad():
-            with self.autocast_smart_context_manager():
-                outputs = model(**inputs)
             if has_labels:
+                with self.compute_loss_context_manager():
+                    outputs = model(**inputs)
                 if self.label_smoother is not None:
                     loss = self.label_smoother(outputs, inputs["labels"]).mean().detach()
                 else:
@@ -192,16 +204,18 @@ def prediction_step(
                 loss = None

         if self.args.prediction_loss_only:
-            return (loss, None, None)
+            return loss, None, None

         if has_labels:
             labels = inputs["labels"]
-            if labels.shape[-1] < gen_kwargs["max_length"]:
-                labels = self._pad_tensors_to_max_len(labels, gen_kwargs["max_length"])
+            if labels.shape[-1] < gen_config.max_length:
+                labels = self._pad_tensors_to_max_len(labels, gen_config.max_length)
+            elif gen_config.max_new_tokens is not None and labels.shape[-1] < gen_config.max_new_tokens + 1:
+                labels = self._pad_tensors_to_max_len(labels, gen_config.max_new_tokens + 1)
         else:
             labels = None

-        return (loss, generated_tokens, labels)
+        return loss, generated_tokens, labels

     def _pad_tensors_to_max_len(self, tensor, max_length):
         if self.tokenizer is not None and hasattr(self.tokenizer, "pad_token_id"):
diff --git a/optimum/intel/openvino/modeling_base.py b/optimum/intel/openvino/modeling_base.py
index ac1ace6d1..6669853a9 100644
--- a/optimum/intel/openvino/modeling_base.py
+++ b/optimum/intel/openvino/modeling_base.py
@@ -34,7 +34,7 @@
 from optimum.modeling_base import FROM_PRETRAINED_START_DOCSTRING, OptimizedModel

 from ...exporters.openvino import export, main_export
-from ..utils.import_utils import is_nncf_available
+from ..utils.import_utils import is_nncf_available, is_transformers_version
 from ..utils.modeling_utils import _find_files_matching_pattern
 from .configuration import OVConfig, OVDynamicQuantizationConfig, OVWeightQuantizationConfig
 from .utils import (
@@ -127,11 +127,25 @@ def __init__(
         self.output_names = output_names
         self.output_dtypes = output_dtypes
-
         self.model = model
         self.request = None if not self._compile_only else self.model
+
+        generation_config = kwargs.get("generation_config", None)
         if self.can_generate():
-            self.generation_config = kwargs.get("generation_config", GenerationConfig.from_model_config(config))
+            self.generation_config = generation_config or GenerationConfig.from_model_config(config)
+
+            if is_transformers_version(">=", "4.44.99"):
+                misplaced_generation_parameters = self.config._get_non_default_generation_parameters()
+                if len(misplaced_generation_parameters) > 0:
+                    logger.warning(
+                        "Moving the following attributes in the config to the generation config: "
+                        f"{misplaced_generation_parameters}. You are seeing this warning because you've set "
+                        "generation parameters in the model config, as opposed to in the generation config.",
+                    )
+                    for param_name, param_value in misplaced_generation_parameters.items():
+                        setattr(self.generation_config, param_name, param_value)
+                        setattr(self.config, param_name, None)
+
         else:
             self.generation_config = None
@@ -352,19 +366,6 @@ def _from_pretrained(
                 model_save_dir=model_cache_path.parent,
             )

-        try:
-            generation_config = GenerationConfig.from_pretrained(
-                model_id,
-                token=token,
-                revision=revision,
-                subfolder=subfolder,
-                force_download=force_download,
-                cache_dir=cache_dir,
-            )
-            kwargs["generation_config"] = generation_config
-        except Exception:
-            pass
-
         return cls(
             model,
             config=config,
@@ -583,7 +584,6 @@ def _from_transformers(
             library_name=cls._library_name,
         )

-        config.save_pretrained(save_dir_path)
         return cls._from_pretrained(
             model_id=save_dir_path,
             config=config,
@@ -712,9 +712,7 @@ def can_generate(self) -> bool:
         """
         Returns whether this model can generate sequences with `.generate()`.
         """
-        if isinstance(self, GenerationMixin):
-            return True
-        return False
+        return isinstance(self, GenerationMixin)

     def _inference(self, inputs):
         try:
diff --git a/optimum/intel/openvino/modeling_base_seq2seq.py b/optimum/intel/openvino/modeling_base_seq2seq.py
index 7de9d5cf5..763dd2b50 100644
--- a/optimum/intel/openvino/modeling_base_seq2seq.py
+++ b/optimum/intel/openvino/modeling_base_seq2seq.py
@@ -26,6 +26,7 @@
 from transformers.file_utils import add_start_docstrings

 from ...exporters.openvino import main_export
+from ..utils.import_utils import is_transformers_version
 from .configuration import OVConfig, OVWeightQuantizationConfig
 from .modeling_base import OVBaseModel
 from .utils import (
@@ -78,10 +79,22 @@ def __init__(
         self.encoder_model = encoder
         self.decoder_model = decoder
         self.decoder_with_past_model = decoder_with_past
-        if self.can_generate():
-            self.generation_config = kwargs.get("generation_config", GenerationConfig.from_model_config(config))
-        else:
-            self.generation_config = None
+
+        generation_config = kwargs.get("generation_config", None)
+        self.generation_config = generation_config or GenerationConfig.from_model_config(config)
+
+        if is_transformers_version(">=", "4.44.99"):
+            misplaced_generation_parameters = self.config._get_non_default_generation_parameters()
+            if len(misplaced_generation_parameters) > 0:
+                logger.warning(
+                    "Moving the following attributes in the config to the generation config: "
+                    f"{misplaced_generation_parameters}. You are seeing this warning because you've set "
+                    "generation parameters in the model config, as opposed to in the generation config.",
+                )
+                for param_name, param_value in misplaced_generation_parameters.items():
+                    setattr(self.generation_config, param_name, param_value)
+                    setattr(self.config, param_name, None)
+
         self._openvino_config = None
         if quantization_config:
             self._openvino_config = OVConfig(quantization_config=quantization_config)
@@ -166,6 +179,9 @@ def _from_pretrained(
             local_files_only(`bool`, *optional*, defaults to `False`):
                 Whether or not to only look at local files (i.e., do not try to download the model).
         """
+        generation_config = kwargs.pop("generation_config", None)
+        subfolder = kwargs.pop("subfolder", "")
+
         default_encoder_file_name = ONNX_ENCODER_NAME if from_onnx else OV_ENCODER_NAME
         default_decoder_file_name = ONNX_DECODER_NAME if from_onnx else OV_DECODER_NAME
         default_decoder_with_past_file_name = ONNX_DECODER_WITH_PAST_NAME if from_onnx else OV_DECODER_WITH_PAST_NAME
@@ -229,6 +245,7 @@ def _from_pretrained(
                     cache_dir=cache_dir,
                     force_download=force_download,
                     local_files_only=local_files_only,
+                    subfolder=subfolder,
                 )
                 file_names[name] = model_cache_path

@@ -252,18 +269,24 @@ def _from_pretrained(
                 kwargs.get("ov_config"),
                 model_save_dir,
             )
-        try:
-            generation_config = GenerationConfig.from_pretrained(
-                model_id,
-                token=token,
-                revision=revision,
-                cache_dir=cache_dir,
-                force_download=force_download,
-                local_files_only=local_files_only,
-            )
-            kwargs["generation_config"] = generation_config
-        except Exception:
-            pass
+
+        if generation_config is None:
+            try:
+                generation_config = GenerationConfig.from_pretrained(
+                    model_id,
+                    cache_dir=cache_dir,
+                    force_download=force_download,
+                    local_files_only=local_files_only,
+                    token=token,
+                    revision=revision,
+                    subfolder=subfolder,
+                )
+                if getattr(generation_config, "cache_implementation", None) is not None:
+                    generation_config.cache_implementation = None
+            except OSError:
+                logger.info(
+                    "Generation config file not found, using a generation config created from the model config."
+                )

         return cls(
             encoder=encoder,
@@ -272,6 +295,7 @@ def _from_pretrained(
             config=config,
             model_save_dir=model_save_dir,
             quantization_config=quantization_config,
+            generation_config=generation_config,
             **kwargs,
         )

@@ -352,7 +376,6 @@ def _from_transformers(
             ov_config=ov_config,
         )

-        config.save_pretrained(save_dir_path)
         return cls._from_pretrained(
             model_id=save_dir_path,
             config=config,
diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py
index 04ccc928a..733f5a411 100644
--- a/optimum/intel/openvino/modeling_decoder.py
+++ b/optimum/intel/openvino/modeling_decoder.py
@@ -121,7 +121,7 @@ def __init__(
                 "`compile_only` mode does not support disabling compilation."
                 "Please provide `compile=True` if you want to use `compile_only=True` or set `compile_only=False`"
             )
-
+        config.is_encoder_decoder = False
         super().__init__(
             model,
             config,
@@ -142,9 +142,8 @@ def __init__(
         self.num_pkv = 2
         self.key_value_input_names = [key for key in self.input_names if "key_values" in key]
         self.key_value_output_names = [key for key in self.output_names if "present" in key]
-        self._original_model = (
-            self.model.clone() if not compile_only else None
-        )  # keep original model for serialization
+        # Keeping the original model for serialization
+        self._original_model = self.model.clone() if not compile_only else None
         self._pkv_precision = Type.f32
         self.next_beam_idx = None
         self._past_length = 0
@@ -328,13 +327,11 @@ def _from_transformers(
             library_name=cls._library_name,
         )

-        config.is_decoder = True
-        config.is_encoder_decoder = False
         if config.model_type == "phi3" and config.max_position_embeddings != getattr(
             config, "original_max_position_embeddings", config.max_position_embeddings
         ):
             config.max_position_embeddings = config.original_max_position_embeddings
-        config.save_pretrained(save_dir_path)
+
         return cls._from_pretrained(
             model_id=save_dir_path,
             config=config,
@@ -767,10 +764,6 @@ def _reorder_cache(
             )
             return tuple(np.take(past_state, beam_idx, 0) for past_state in past_key_values)

-    def can_generate(self):
-        """Returns True to validate the check that the model using `GenerationMixin.generate()` can indeed generate."""
-        return True
-
     @classmethod
     def _from_pretrained(
         cls,
@@ -789,6 +782,7 @@ def _from_pretrained(
         quantization_config: Optional[Union[OVWeightQuantizationConfig, Dict]] = None,
         **kwargs,
     ):
+        generation_config = kwargs.pop("generation_config", None)
         model_path = Path(model_id)
         default_file_name = ONNX_WEIGHTS_NAME if from_onnx else OV_XML_FILE_NAME
         file_name = file_name or default_file_name
@@ -829,20 +823,23 @@ def _from_pretrained(

         enable_compilation = kwargs.pop("compile", True) and not quantization_config

-        try:
-            generation_config = GenerationConfig.from_pretrained(
-                model_id,
-                token=token,
-                revision=revision,
-                cache_dir=cache_dir,
-                force_download=force_download,
-                local_files_only=local_files_only,
-            )
-            if getattr(generation_config, "cache_implementation", None) is not None:
-                generation_config.cache_implementation = None
-            kwargs["generation_config"] = generation_config
-        except Exception:
-            pass
+        if generation_config is None:
+            try:
+                generation_config = GenerationConfig.from_pretrained(
+                    model_id,
+                    cache_dir=cache_dir,
+                    force_download=force_download,
+                    local_files_only=local_files_only,
+                    token=token,
+                    revision=revision,
+                    subfolder=subfolder,
+                )
+                if getattr(generation_config, "cache_implementation", None) is not None:
+                    generation_config.cache_implementation = None
+            except OSError:
+                logger.info(
+                    "Generation config file not found, using a generation config created from the model config."
+                )

         causal_model = init_cls(
             model=model,
@@ -851,6 +848,7 @@ def _from_pretrained(
             compile=enable_compilation,
             compile_only=compile_only,
             quantization_config=quantization_config,
+            generation_config=generation_config,
             **kwargs,
         )

diff --git a/optimum/intel/openvino/trainer.py b/optimum/intel/openvino/trainer.py
index b050286a1..a2f08b647 100644
--- a/optimum/intel/openvino/trainer.py
+++ b/optimum/intel/openvino/trainer.py
@@ -87,7 +87,7 @@
 from optimum.exporters.onnx import OnnxConfig

 from ..utils.constant import _TASK_ALIASES
-from ..utils.import_utils import is_transformers_version
+from ..utils.import_utils import _transformers_version, is_transformers_version
 from .configuration import OVConfig
 from .quantization import OVDataLoader
 from .training_args import OVTrainingArguments
@@ -215,6 +215,11 @@ def __init__(
     ):
         logger.warning("OVTrainer is deprecated and will be removed in optimum-intel v1.22.0.")

+        if is_transformers_version(">=", "4.45.0"):
+            logger.warning(
+                f"The transformers version found is {_transformers_version} which is not officially supported by the OVTrainer, use at your own risk"
+            )
+
         self.neftune_noise_alpha = None

         super().__init__(
@@ -380,7 +385,18 @@ def _inner_training_loop(
         if not delay_optimizer_creation:
             self.create_optimizer_and_scheduler(num_training_steps=max_steps)

-        self.state = TrainerState()
+        if is_transformers_version(">=", "4.44.99"):
+            from transformers.trainer_callback import ExportableState
+
+            self.state = TrainerState(
+                stateful_callbacks=[
+                    cb for cb in self.callback_handler.callbacks + [self.control] if isinstance(cb, ExportableState)
+                ]
+            )
+
+        else:
+            self.state = TrainerState()
+
         self.state.is_hyper_param_search = trial is not None
         self.state.train_batch_size = self._train_batch_size
diff --git a/setup.py b/setup.py
index 59344315a..d0d8e5215 100644
--- a/setup.py
+++ b/setup.py
@@ -28,7 +28,7 @@
 INSTALL_REQUIRE = [
     "torch>=1.11",
-    "transformers>=4.36,<4.45",
+    "transformers>=4.36,<4.46",
     "optimum~=1.22",
     "datasets>=1.4.0",
     "sentencepiece",
@@ -60,7 +60,7 @@
 QUALITY_REQUIRE = ["black~=23.1", "ruff==0.4.4"]

 EXTRAS_REQUIRE = {
-    "neural-compressor": ["neural-compressor[pt]>3.0", "accelerate", "transformers<=4.43.2"],
+    "neural-compressor": ["neural-compressor[pt]>3.0", "accelerate"],
     "openvino": ["openvino>=2023.3,<2024.4", "nncf>=2.11.0", "openvino-tokenizers[transformers]<2024.4"],
     "nncf": ["nncf>=2.11.0"],
     "ipex": ["intel-extension-for-pytorch", "transformers>=4.39,<4.45"],
diff --git a/tests/neural_compressor/test_modeling.py b/tests/neural_compressor/test_modeling.py
index 81e6d03dc..e9e117518 100644
--- a/tests/neural_compressor/test_modeling.py
+++ b/tests/neural_compressor/test_modeling.py
@@ -125,27 +125,6 @@ def test_pipeline(self, model_id, task):
         pipe(*inputs)

-    def test_compare_with_and_without_past_key_values(self):
-        model_id = "echarlaix/tiny-random-gpt2-torchscript"
-        tokenizer = AutoTokenizer.from_pretrained(model_id)
-        tokens = tokenizer("This is a sample input", return_tensors="pt")
-
-        model_with_pkv = INCModelForCausalLM.from_pretrained(model_id, use_cache=True, subfolder="model_with_pkv")
-
-        outputs_with_pkv = model_with_pkv.generate(
-            **tokens, min_length=self.GENERATION_LENGTH, max_length=self.GENERATION_LENGTH, num_beams=1
-        )
-        model_without_pkv = INCModelForCausalLM.from_pretrained(
-            model_id, use_cache=False, subfolder="model_without_pkv"
-        )
-
-        outputs_without_pkv = model_without_pkv.generate(
-            **tokens, min_length=self.GENERATION_LENGTH, max_length=self.GENERATION_LENGTH, num_beams=1
-        )
-
-        self.assertEqual(outputs_with_pkv.shape[1], self.GENERATION_LENGTH)
-        self.assertEqual(outputs_without_pkv.shape[1], self.GENERATION_LENGTH)
-        self.assertTrue(torch.equal(outputs_with_pkv, outputs_without_pkv))
-
     def test_saving_loading_inc_woq_model(self):
         model_name = "TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ"
         model = INCModelForCausalLM.from_pretrained(model_name, revision="main")
diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py
index 73cb8f961..f8a24c573 100644
--- a/tests/openvino/test_quantization.py
+++ b/tests/openvino/test_quantization.py
@@ -750,7 +750,7 @@ class OVTrainerTest(unittest.TestCase):
     @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_EXPECTED_QUANTIZED_MATMULS)
     def test_aware_training_quantization(self, model_name, expected_fake_quantize, expected_int8):
         model_id = MODEL_NAMES[model_name]
-        model = AutoModelForSequenceClassification.from_pretrained(model_id)
+        model = AutoModelForSequenceClassification.from_pretrained(model_id, attn_implementation="eager")
         tokenizer = AutoTokenizer.from_pretrained(model_id)
         ov_config = OVConfig()
         dataset = load_dataset("glue", "sst2")