Add transformers v4.45 support (#902)
* Add transformers v4.45 support

* update setup

* fix generation config

* fix persimmon

* style

* fix trainer

* fix generation config for inc models

* format

* fix seq2seq trainer

* style

* fix trfs version for ipex

* update transformers version in workflows

* update setup
echarlaix authored Sep 26, 2024
1 parent 7d18351 commit d3c8ac6
Showing 13 changed files with 295 additions and 169 deletions.
16 changes: 15 additions & 1 deletion optimum/exporters/openvino/convert.py
@@ -20,6 +20,7 @@
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union

import onnx
from transformers.generation import GenerationMixin
from transformers.utils import is_tf_available, is_torch_available

from openvino.runtime import Model, save_model
@@ -40,6 +41,7 @@
_torch_version,
_transformers_version,
compare_versions,
is_transformers_version,
)
from optimum.utils import DEFAULT_DUMMY_SHAPES, is_diffusers_available
from optimum.utils.save_utils import maybe_save_preprocessors
@@ -379,7 +381,7 @@ def ts_patched_forward(*args, **kwargs):
if stateful:
# cannot raise because stateful is enabled by default and it would break backward compatibility for models that couldn't convert to OV directly
# TODO: Implement stateful for ONNX path as well, not doing it right now because of lack of validation
logger.warn(
logger.warning(
"[ WARNING ] Making stateful models is not supported when exporting to ONNX as an intermediate step. "
"A stateless model will be exported instead. It may result in sub-optimal inference performance."
"Provide a model that can be converted to OpenVINO without fallback to ONNX conversion path."
@@ -622,6 +624,18 @@ def export_from_model(

files_subpaths = ["openvino_" + model_name + ".xml" for model_name in models_and_export_configs.keys()]
elif library_name != "diffusers":
if is_transformers_version(">=", "4.44.99"):
misplaced_generation_parameters = model.config._get_non_default_generation_parameters()
if isinstance(model, GenerationMixin) and len(misplaced_generation_parameters) > 0:
logger.warning(
"Moving the following attributes in the config to the generation config: "
f"{misplaced_generation_parameters}. You are seeing this warning because you've set "
"generation parameters in the model config, as opposed to in the generation config.",
)
for param_name, param_value in misplaced_generation_parameters.items():
setattr(model.generation_config, param_name, param_value)
setattr(model.config, param_name, None)

# Saving the model config and preprocessor as this is needed sometimes.
model.config.save_pretrained(output)
generation_config = getattr(model, "generation_config", None)
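The convert.py hunk above relocates generation parameters that were set on the model config into the generation config before export. A minimal standalone sketch of the same pattern, assuming a transformers release (v4.45 or later) that exposes PretrainedConfig._get_non_default_generation_parameters; the checkpoint name is only an example:

from transformers import AutoModelForCausalLM
from transformers.generation import GenerationMixin

model = AutoModelForCausalLM.from_pretrained("gpt2")  # example checkpoint

# Generation parameters (e.g. temperature) stored on the model config instead
# of the generation config trigger warnings in transformers v4.45.
misplaced = model.config._get_non_default_generation_parameters()

if isinstance(model, GenerationMixin) and len(misplaced) > 0:
    for param_name, param_value in misplaced.items():
        # Move each parameter to the generation config and clear it from the
        # model config so it is defined in a single place.
        setattr(model.generation_config, param_name, param_value)
        setattr(model.config, param_name, None)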
80 changes: 52 additions & 28 deletions optimum/exporters/openvino/model_patcher.py
@@ -45,7 +45,7 @@
from transformers.modeling_tf_utils import TFPreTrainedModel


BETTERTRANSFORMER_IGNORE = ("codegen",)
BETTERTRANSFORMER_IGNORE = ("codegen", "gpt_neo")


def patch_model_with_bettertransformer(model):
@@ -57,7 +57,7 @@ def patch_model_with_bettertransformer(model):
return model

if is_transformers_version("<", "4.36") or is_torch_version("<", "2.1.1"):
log.warn(
log.warning(
COLOR_RED
+ "[WARNING] For good performance with stateful models, transformers>=4.36.2 and PyTorch>=2.1.1 are required. "
f"This Python environment has Transformers {_transformers_version} and PyTorch {_torch_version}. "
@@ -75,7 +75,7 @@ def patch_model_with_bettertransformer(model):
display_version = (
_openvino_version.split("-")[0] if is_openvino_version("<=", "2024.0.0-14509") else _openvino_version
)
log.warn(
log.warning(
COLOR_RED
+ f"[WARNING] Stateful models are not supported for Llama, Gemma and GPTBigCode with Transformers "
f"{_transformers_version} and OpenVINO {display_version}. For good performance, consider using a nightly OpenVINO build: "
@@ -93,7 +93,7 @@ def patch_model_with_bettertransformer(model):
try:
model = model.to_bettertransformer()
except Exception as e:
log.warn(
log.warning(
f"Cannot apply model.to_bettertransformer because of the exception:\n{e}."
" Usage model with stateful=True may be non-effective if model does not contain torch.functional.scaled_dot_product_attention"
)
@@ -168,7 +168,8 @@ def __enter__(self):
layer.block_sparse_moe.forward = types.MethodType(
_mixtral_sparse_moe_block_forward, layer.block_sparse_moe
)
_reinitialize_cos_sin_cached_fp32(layer.self_attn.rotary_emb)
if is_transformers_version("<", "4.44.99"):
_reinitialize_cos_sin_cached_fp32(layer.self_attn.rotary_emb)

def __exit__(self, exc_type, exc_value, traceback):
super().__exit__(exc_type, exc_value, traceback)
@@ -2151,6 +2152,7 @@ def _persimmon_self_attn_sdpa_forward(
output_attentions: bool = False,
use_cache: bool = False,
cache_position: Optional[torch.LongTensor] = None,
position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
from transformers.models.persimmon.modeling_persimmon import apply_rotary_pos_emb

@@ -2176,25 +2178,42 @@ def _persimmon_self_attn_sdpa_forward(
value_states = value_states.transpose(1, 2)
key_states = key_states.transpose(1, 2)

kv_seq_len = key_states.shape[-2]
if past_key_value is not None:
if self.layer_idx is None:
raise ValueError(
f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
"for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
"with a layer index."
if is_transformers_version("<", "4.44.99"):
kv_seq_len = key_states.shape[-2]
if past_key_value is not None:
if self.layer_idx is None:
raise ValueError(
f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
"for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
"with a layer index."
)
kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
else:
if position_embeddings is None:
log.warning(
"The attention layers in this model are transitioning from computing the RoPE embeddings internally "
"through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed "
"`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.46 `position_ids` will be "
"removed and `position_embeddings` will be mandatory."
)
kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
cos, sin = self.rotary_emb(value_states, position_ids)
else:
cos, sin = position_embeddings

if is_transformers_version("<", "4.44.99"):
rotary_ndims = self.rotary_emb.dim
else:
rotary_ndims = self.rotary_ndims

# Partial rotary embedding
query_rot, query_pass = (
query_states[..., : self.rotary_emb.dim],
query_states[..., self.rotary_emb.dim :],
query_states[..., :rotary_ndims],
query_states[..., rotary_ndims:],
)
key_rot, key_pass = (
key_states[..., : self.rotary_emb.dim],
key_states[..., self.rotary_emb.dim :],
key_states[..., :rotary_ndims],
key_states[..., rotary_ndims:],
)
# [batch_size, seq_length, num_heads, head_dim // config.partial_rotary_factor]
query_rot, key_rot = apply_rotary_pos_emb(query_rot, key_rot, cos, sin, position_ids)
@@ -2208,7 +2227,7 @@ def _persimmon_self_attn_sdpa_forward(
cache_kwargs = {
"sin": sin,
"cos": cos,
"partial_rotation_size": self.rotary_emb.dim,
"partial_rotation_size": rotary_ndims,
"cache_position": cache_position,
}
key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
@@ -2244,7 +2263,8 @@ def __enter__(self):
orig_self_attn_fwd = layer.self_attn.forward
layer.self_attn.forward = types.MethodType(_persimmon_self_attn_sdpa_forward, layer.self_attn)
layer.self_attn._orig_forward = orig_self_attn_fwd
_reinitialize_cos_sin_cached_fp32(layer.self_attn.rotary_emb)
if is_transformers_version("<", "4.44.99"):
_reinitialize_cos_sin_cached_fp32(layer.self_attn.rotary_emb)

def __exit__(self, exc_type, exc_value, traceback):
super().__exit__(exc_type, exc_value, traceback)
@@ -2393,29 +2413,33 @@ def __exit__(self, exc_type, exc_value, traceback):
class RotaryEmbPatcher(DecoderModelPatcher):
def __enter__(self):
super().__enter__()
for layer in self._model.model.layers:
_reinitialize_cos_sin_cached_fp32(layer.self_attn.rotary_emb)
if is_transformers_version("<", "4.44.99"):
for layer in self._model.model.layers:
_reinitialize_cos_sin_cached_fp32(layer.self_attn.rotary_emb)


class FalconModelPatcher(DecoderModelPatcher):
def __enter__(self):
super().__enter__()
for layer in self._model.transformer.h:
_reinitialize_cos_sin_cached_fp32(layer.self_attention.rotary_emb)
if is_transformers_version("<", "4.44.99"):
for layer in self._model.transformer.h:
_reinitialize_cos_sin_cached_fp32(layer.self_attention.rotary_emb)


class GptNeoxModelPatcher(DecoderModelPatcher):
def __enter__(self):
super().__enter__()
for layer in self._model.gpt_neox.layers:
_reinitialize_cos_sin_cached_fp32(layer.attention.rotary_emb)
if is_transformers_version("<", "4.44.99"):
for layer in self._model.gpt_neox.layers:
_reinitialize_cos_sin_cached_fp32(layer.attention.rotary_emb)


class GptNeoxJapaneseModelPatcher(DecoderModelPatcher):
def __enter__(self):
super().__enter__()
for layer in self._model.gpt_neox_japanese.layers:
_reinitialize_cos_sin_cached_fp32(layer.attention.rotary_emb)
if is_transformers_version("<", "4.44.99"):
for layer in self._model.gpt_neox_japanese.layers:
_reinitialize_cos_sin_cached_fp32(layer.attention.rotary_emb)


class Gemma2ModelPatcher(LlamaModelPatcher):
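Several model_patcher.py hunks gate the legacy rotary-embedding handling on the installed transformers version, using "4.44.99" as a threshold that sits just below 4.45.0 so that 4.45 pre-releases are also caught. A small sketch of such a version gate, written against packaging directly rather than the repository's is_transformers_version helper (an assumed equivalent, not the helper's actual implementation):

from packaging import version
import transformers

def transformers_older_than(reference: str) -> bool:
    # Compare the installed transformers release against a reference version.
    return version.parse(transformers.__version__) < version.parse(reference)

# Before v4.45, per-layer rotary embeddings keep cos/sin caches that the
# patchers re-initialize in fp32; from v4.45 the position embeddings are
# computed outside the attention layers, so the legacy patching is skipped.
if transformers_older_than("4.44.99"):
    print("legacy path: re-initialize rotary embedding caches per layer")
else:
    print("v4.45+ path: rely on externally computed position embeddings")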
4 changes: 2 additions & 2 deletions optimum/exporters/openvino/stateful.py
@@ -151,7 +151,7 @@ def make_stateful(
shape[0] = num_beams_and_batch
input.get_node().set_partial_shape(shape)
else:
log.warn(f"Rank of {input.get_any_name()} input of the model is not 2, batch size is not set")
log.warning(f"Rank of {input.get_any_name()} input of the model is not 2, batch size is not set")

for kv_name_pair in zip(key_value_input_names, key_value_output_names):
input_output_map[kv_name_pair[0]] = kv_name_pair[1]
@@ -176,7 +176,7 @@ def ensure_stateful_is_available(warn=True):
"""
if is_openvino_version("<", "2023.3"):
if warn:
log.warn(
log.warning(
f"Could not create or use stateful model when using old version of openvino=={_openvino_version}. It may result in sub-optimal inference performance."
"Install openvino>=2023.3.0."
)
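The stateful.py hunks only swap the deprecated Logger.warn alias for Logger.warning, the same cleanup applied throughout the commit. A one-line illustration (the message text is illustrative):

import logging

logger = logging.getLogger(__name__)
# Logger.warn is a deprecated alias in the standard library; Logger.warning is the supported spelling.
logger.warning("Could not create a stateful model, install openvino>=2023.3.0.")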
96 changes: 65 additions & 31 deletions optimum/intel/neural_compressor/modeling_base.py
@@ -46,10 +46,8 @@
from transformers.utils import SAFE_WEIGHTS_NAME, WEIGHTS_NAME
from transformers.utils.generic import ContextManagers

from optimum.intel.generation import BaseModelForCausalLM

from ...modeling_base import OptimizedModel
from ..utils.import_utils import _torch_version, is_torch_version
from ..utils.import_utils import _torch_version, is_torch_version, is_transformers_version
from .configuration import INCConfig
from .quantization import _weight_only_quantization
from .utils import QUANTIZATION_CONFIG_NAME
@@ -85,14 +83,32 @@ def __init__(
inc_config: Dict = None,
**kwargs,
):
generation_config = kwargs.pop("generation_config", None)

super().__init__(model=model, config=config, **kwargs)
self.inc_config = inc_config
self._q_config = q_config
self.model_save_dir = model_save_dir
self._device = getattr(self.model, "device", None) or torch.device(
"cuda:0" if torch.cuda.is_available() else "cpu"
)
self.generation_config = GenerationConfig.from_model_config(config)
if self.can_generate():
self.generation_config = generation_config or GenerationConfig.from_model_config(config)

if is_transformers_version(">=", "4.44.99"):
misplaced_generation_parameters = self.config._get_non_default_generation_parameters()
if len(misplaced_generation_parameters) > 0:
logger.warning(
"Moving the following attributes in the config to the generation config: "
f"{misplaced_generation_parameters}. You are seeing this warning because you've set "
"generation parameters in the model config, as opposed to in the generation config.",
)
for param_name, param_value in misplaced_generation_parameters.items():
setattr(self.generation_config, param_name, param_value)
setattr(self.config, param_name, None)

else:
self.generation_config = None

# Registers the INCModelForXXX classes into the transformers AutoModel classes to avoid warnings when creating
# a pipeline https://github.com/huggingface/transformers/blob/cad61b68396a1a387287a8e2e2fef78a25b79383/src/transformers/pipelines/base.py#L863
@@ -126,9 +142,29 @@ def _from_pretrained(
token = use_auth_token

quantization_config = kwargs.pop("quantization_config", None)
generation_config = kwargs.pop("generation_config", None)

model_path = Path(model_id)
is_local = model_path.is_dir()

if generation_config is None and "text-generation" in cls.export_feature:
try:
generation_config = GenerationConfig.from_pretrained(
model_id,
cache_dir=cache_dir,
force_download=force_download,
local_files_only=local_files_only,
token=token,
revision=revision,
subfolder=subfolder,
)
if getattr(generation_config, "cache_implementation", None) is not None:
generation_config.cache_implementation = None
except OSError:
logger.info(
"Generation config file not found, using a generation config created from the model config."
)

# ITREX compatibility
quantization_config_path = None
if is_local:
@@ -202,7 +238,7 @@ def _from_pretrained(
**kwargs,
)

return cls(model, config=config, model_save_dir=None, **kwargs).model
return cls(model, config=config, model_save_dir=None, generation_config=generation_config, **kwargs).model

model_cache_path = None
inc_config = None
@@ -261,7 +297,14 @@ def _from_pretrained(
)
model = torch.jit.load(model_cache_path)
model = torch.jit.freeze(model.eval())
return cls(model, config=config, model_save_dir=model_save_dir, inc_config=inc_config, **kwargs)
return cls(
model,
config=config,
model_save_dir=model_save_dir,
inc_config=inc_config,
generation_config=generation_config,
**kwargs,
)

model_class = _get_model_class(config, cls.auto_model_class._model_mapping)
# Load the state dictionary of the model to verify whether the model to get the quantization config
@@ -283,7 +326,13 @@ def _from_pretrained(
raise

return cls(
model, config=config, model_save_dir=model_save_dir, q_config=q_config, inc_config=inc_config, **kwargs
model,
config=config,
model_save_dir=model_save_dir,
q_config=q_config,
inc_config=inc_config,
generation_config=generation_config,
**kwargs,
)

def _save_pretrained(self, save_directory: Union[str, Path]):
Expand All @@ -304,6 +353,14 @@ def _save_pretrained(self, save_directory: Union[str, Path]):
if self.inc_config:
self.inc_config.save_pretrained(save_directory)

if self.generation_config is not None:
try:
self.generation_config.save_pretrained(save_directory)
except Exception as exception:
logger.warning(
f"The generation config will not be saved, saving failed with following error:\n{exception}"
)

def forward(self, *args, **kwargs):
return self.model(*args, **kwargs)

@@ -366,29 +423,6 @@ class INCModelForVision2Seq(INCModel):
export_feature = "image-to-text"


class INCModelForCausalLM(INCModel, BaseModelForCausalLM):
class INCModelForCausalLM(INCModel):
auto_model_class = AutoModelForCausalLM
export_feature = "text-generation"
forward = BaseModelForCausalLM.forward
generate = BaseModelForCausalLM.generate
can_generate = BaseModelForCausalLM.can_generate

def __init__(
self,
model,
config: PretrainedConfig = None,
model_save_dir: Optional[Union[str, Path, TemporaryDirectory]] = None,
q_config: Dict = None,
inc_config: Dict = None,
use_cache: bool = True,
**kwargs,
):
super(INCModelForCausalLM, self).__init__(
model=model,
config=config,
model_save_dir=model_save_dir,
q_config=q_config,
inc_config=inc_config,
use_cache=use_cache,
**kwargs,
)