Add transformers v4.45 support #902

Merged · 25 commits · Sep 26, 2024
16 changes: 15 additions & 1 deletion optimum/exporters/openvino/convert.py
@@ -20,6 +20,7 @@
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union

import onnx
from transformers.generation import GenerationMixin
from transformers.utils import is_tf_available, is_torch_available

from openvino.runtime import Model, save_model
@@ -40,6 +41,7 @@
_torch_version,
_transformers_version,
compare_versions,
is_transformers_version,
)
from optimum.utils import DEFAULT_DUMMY_SHAPES, is_diffusers_available
from optimum.utils.save_utils import maybe_save_preprocessors
@@ -379,7 +381,7 @@ def ts_patched_forward(*args, **kwargs):
if stateful:
# cannot raise because stateful is enabled by default and it would break backward compatibility for models that couldn't convert to OV directly
# TODO: Implement stateful for ONNX path as well, not doing it right now because of lack of validation
- logger.warn(
+ logger.warning(
"[ WARNING ] Making stateful models is not supported when exporting to ONNX as an intermediate step. "
"A stateless model will be exported instead. It may result in sub-optimal inference performance."
"Provide a model that can be converted to OpenVINO without fallback to ONNX conversion path."
@@ -622,6 +624,18 @@ def export_from_model(

files_subpaths = ["openvino_" + model_name + ".xml" for model_name in models_and_export_configs.keys()]
elif library_name != "diffusers":
if is_transformers_version(">=", "4.44.99"):
misplaced_generation_parameters = model.config._get_non_default_generation_parameters()
if isinstance(model, GenerationMixin) and len(misplaced_generation_parameters) > 0:
logger.warning(
"Moving the following attributes in the config to the generation config: "
f"{misplaced_generation_parameters}. You are seeing this warning because you've set "
"generation parameters in the model config, as opposed to in the generation config.",
)
for param_name, param_value in misplaced_generation_parameters.items():
setattr(model.generation_config, param_name, param_value)
setattr(model.config, param_name, None)

# Saving the model config and preprocessor as this is needed sometimes.
model.config.save_pretrained(output)
generation_config = getattr(model, "generation_config", None)
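The convert.py hunk above aligns the export path with transformers v4.45, which warns when generation parameters (e.g. `temperature`, `do_sample`) are stored in the model config instead of the generation config: any misplaced values are moved onto `model.generation_config` before the configs are saved. A minimal sketch of the same migration, assuming a transformers>=4.45 environment; `_get_non_default_generation_parameters` is a private transformers helper and "gpt2" is only an example checkpoint:

```python
# Minimal sketch of the migration performed above (hedged; not the exact optimum code path).
# Assumes transformers >= 4.45; "gpt2" is only an example checkpoint.
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("gpt2")
model.config.temperature = 0.7  # a generation parameter mistakenly stored in the model config

# Private transformers helper used by the diff above: returns generation parameters
# found on the model config that differ from their defaults.
misplaced = model.config._get_non_default_generation_parameters()

for name, value in misplaced.items():
    setattr(model.generation_config, name, value)  # move the value to the generation config
    setattr(model.config, name, None)              # and clear it from the model config

print(misplaced)  # e.g. {'temperature': 0.7}
```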
80 changes: 52 additions & 28 deletions optimum/exporters/openvino/model_patcher.py
@@ -45,7 +45,7 @@
from transformers.modeling_tf_utils import TFPreTrainedModel


BETTERTRANSFORMER_IGNORE = ("codegen",)
BETTERTRANSFORMER_IGNORE = ("codegen", "gpt_neo")


def patch_model_with_bettertransformer(model):
@@ -57,7 +57,7 @@ def patch_model_with_bettertransformer(model):
return model

if is_transformers_version("<", "4.36") or is_torch_version("<", "2.1.1"):
- log.warn(
+ log.warning(
COLOR_RED
+ "[WARNING] For good performance with stateful models, transformers>=4.36.2 and PyTorch>=2.1.1 are required. "
f"This Python environment has Transformers {_transformers_version} and PyTorch {_torch_version}. "
@@ -75,7 +75,7 @@ def patch_model_with_bettertransformer(model):
display_version = (
_openvino_version.split("-")[0] if is_openvino_version("<=", "2024.0.0-14509") else _openvino_version
)
- log.warn(
+ log.warning(
COLOR_RED
+ f"[WARNING] Stateful models are not supported for Llama, Gemma and GPTBigCode with Transformers "
f"{_transformers_version} and OpenVINO {display_version}. For good performance, consider using a nightly OpenVINO build: "
@@ -93,7 +93,7 @@ def patch_model_with_bettertransformer(model):
try:
model = model.to_bettertransformer()
except Exception as e:
- log.warn(
+ log.warning(
f"Cannot apply model.to_bettertransformer because of the exception:\n{e}."
" Usage model with stateful=True may be non-effective if model does not contain torch.functional.scaled_dot_product_attention"
)
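The `patch_model_with_bettertransformer` changes above extend the skip list (`gpt_neo` now joins `codegen`) and keep the existing warn-and-continue behaviour when conversion fails. A condensed, hedged sketch of that decision flow; the real function also checks transformers/torch/OpenVINO versions and uses coloured log output:

```python
# Condensed sketch of the skip-and-fallback flow; the function name and exact checks are illustrative.
import logging

log = logging.getLogger(__name__)
BETTERTRANSFORMER_IGNORE = ("codegen", "gpt_neo")  # architectures known to break under BetterTransformer


def maybe_apply_bettertransformer(model):
    if model.config.model_type in BETTERTRANSFORMER_IGNORE:
        return model  # leave these architectures untouched
    try:
        model = model.to_bettertransformer()
    except Exception as e:
        log.warning(
            f"Cannot apply model.to_bettertransformer because of the exception:\n{e}. "
            "A stateful export may be non-effective without scaled_dot_product_attention."
        )
    return model
```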
@@ -168,7 +168,8 @@ def __enter__(self):
layer.block_sparse_moe.forward = types.MethodType(
_mixtral_sparse_moe_block_forward, layer.block_sparse_moe
)
- _reinitialize_cos_sin_cached_fp32(layer.self_attn.rotary_emb)
+ if is_transformers_version("<", "4.44.99"):
+     _reinitialize_cos_sin_cached_fp32(layer.self_attn.rotary_emb)

def __exit__(self, exc_type, exc_value, traceback):
super().__exit__(exc_type, exc_value, traceback)
@@ -2151,6 +2152,7 @@ def _persimmon_self_attn_sdpa_forward(
output_attentions: bool = False,
use_cache: bool = False,
cache_position: Optional[torch.LongTensor] = None,
position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
from transformers.models.persimmon.modeling_persimmon import apply_rotary_pos_emb

@@ -2176,25 +2178,42 @@ def _persimmon_self_attn_sdpa_forward(
value_states = value_states.transpose(1, 2)
key_states = key_states.transpose(1, 2)

-    kv_seq_len = key_states.shape[-2]
-    if past_key_value is not None:
-        if self.layer_idx is None:
-            raise ValueError(
-                f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
-                "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
-                "with a layer index."
-            )
-        kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
-    cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
+    if is_transformers_version("<", "4.44.99"):
+        kv_seq_len = key_states.shape[-2]
+        if past_key_value is not None:
+            if self.layer_idx is None:
+                raise ValueError(
+                    f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
+                    "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
+                    "with a layer index."
+                )
+            kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
+        cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
+    else:
+        if position_embeddings is None:
+            log.warning(
+                "The attention layers in this model are transitioning from computing the RoPE embeddings internally "
+                "through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed "
+                "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.46 `position_ids` will be "
+                "removed and `position_embeddings` will be mandatory."
+            )
+            cos, sin = self.rotary_emb(value_states, position_ids)
+        else:
+            cos, sin = position_embeddings

if is_transformers_version("<", "4.44.99"):
rotary_ndims = self.rotary_emb.dim
else:
rotary_ndims = self.rotary_ndims

# Partial rotary embedding
query_rot, query_pass = (
- query_states[..., : self.rotary_emb.dim],
- query_states[..., self.rotary_emb.dim :],
+ query_states[..., :rotary_ndims],
+ query_states[..., rotary_ndims:],
)
key_rot, key_pass = (
- key_states[..., : self.rotary_emb.dim],
- key_states[..., self.rotary_emb.dim :],
+ key_states[..., :rotary_ndims],
+ key_states[..., rotary_ndims:],
)
# [batch_size, seq_length, num_heads, head_dim // config.partial_rotary_factor]
query_rot, key_rot = apply_rotary_pos_emb(query_rot, key_rot, cos, sin, position_ids)
@@ -2208,7 +2227,7 @@ def _persimmon_self_attn_sdpa_forward(
cache_kwargs = {
"sin": sin,
"cos": cos,
"partial_rotation_size": self.rotary_emb.dim,
"partial_rotation_size": rotary_ndims,
"cache_position": cache_position,
}
key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
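Persimmon uses partial rotary embeddings: only the first `rotary_ndims` channels of every head are rotated, while the remainder passes through untouched, which is why `rotary_ndims` (or `rotary_emb.dim` on older transformers) appears both in the query/key split above and in the cache's `partial_rotation_size`. A small, self-contained illustration of that split with made-up shapes:

```python
# Illustration of the partial rotary split; shapes are arbitrary examples, not Persimmon's real config.
import torch

batch, heads, seq_len, head_dim = 1, 4, 8, 64
rotary_ndims = 32  # e.g. int(head_dim * partial_rotary_factor)

query_states = torch.randn(batch, heads, seq_len, head_dim)

# Split into the slice that receives RoPE and the slice that passes through unchanged.
query_rot, query_pass = query_states[..., :rotary_ndims], query_states[..., rotary_ndims:]

# apply_rotary_pos_emb(query_rot, key_rot, cos, sin, ...) would rotate only query_rot here.
query_states = torch.cat((query_rot, query_pass), dim=-1)  # recombined after rotation

print(query_rot.shape, query_pass.shape, query_states.shape)
# torch.Size([1, 4, 8, 32]) torch.Size([1, 4, 8, 32]) torch.Size([1, 4, 8, 64])
```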
@@ -2244,7 +2263,8 @@ def __enter__(self):
orig_self_attn_fwd = layer.self_attn.forward
layer.self_attn.forward = types.MethodType(_persimmon_self_attn_sdpa_forward, layer.self_attn)
layer.self_attn._orig_forward = orig_self_attn_fwd
- _reinitialize_cos_sin_cached_fp32(layer.self_attn.rotary_emb)
+ if is_transformers_version("<", "4.44.99"):
+     _reinitialize_cos_sin_cached_fp32(layer.self_attn.rotary_emb)

def __exit__(self, exc_type, exc_value, traceback):
super().__exit__(exc_type, exc_value, traceback)
@@ -2393,29 +2413,33 @@ def __exit__(self, exc_type, exc_value, traceback):
class RotaryEmbPatcher(DecoderModelPatcher):
def __enter__(self):
super().__enter__()
- for layer in self._model.model.layers:
-     _reinitialize_cos_sin_cached_fp32(layer.self_attn.rotary_emb)
+ if is_transformers_version("<", "4.44.99"):
+     for layer in self._model.model.layers:
+         _reinitialize_cos_sin_cached_fp32(layer.self_attn.rotary_emb)


class FalconModelPatcher(DecoderModelPatcher):
def __enter__(self):
super().__enter__()
- for layer in self._model.transformer.h:
-     _reinitialize_cos_sin_cached_fp32(layer.self_attention.rotary_emb)
+ if is_transformers_version("<", "4.44.99"):
+     for layer in self._model.transformer.h:
+         _reinitialize_cos_sin_cached_fp32(layer.self_attention.rotary_emb)


class GptNeoxModelPatcher(DecoderModelPatcher):
def __enter__(self):
super().__enter__()
- for layer in self._model.gpt_neox.layers:
-     _reinitialize_cos_sin_cached_fp32(layer.attention.rotary_emb)
+ if is_transformers_version("<", "4.44.99"):
+     for layer in self._model.gpt_neox.layers:
+         _reinitialize_cos_sin_cached_fp32(layer.attention.rotary_emb)


class GptNeoxJapaneseModelPatcher(DecoderModelPatcher):
def __enter__(self):
super().__enter__()
- for layer in self._model.gpt_neox_japanese.layers:
-     _reinitialize_cos_sin_cached_fp32(layer.attention.rotary_emb)
+ if is_transformers_version("<", "4.44.99"):
+     for layer in self._model.gpt_neox_japanese.layers:
+         _reinitialize_cos_sin_cached_fp32(layer.attention.rotary_emb)


class Gemma2ModelPatcher(LlamaModelPatcher):
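All of the rotary-embedding patchers above now apply the same gate: the fp32 re-initialization of the cached cos/sin tables is only needed on transformers < 4.45, where each attention layer still owns a `rotary_emb` with cached buffers. A condensed sketch of the shared pattern, with import paths assumed from the modules shown in this diff and an illustrative class name:

```python
# Condensed sketch of the version-gated patcher pattern; import paths are assumptions
# based on the modules touched in this PR, and the class name is illustrative.
from optimum.exporters.openvino.model_patcher import (
    DecoderModelPatcher,
    _reinitialize_cos_sin_cached_fp32,
)
from optimum.intel.utils.import_utils import is_transformers_version


class ExampleRotaryEmbPatcher(DecoderModelPatcher):
    def __enter__(self):
        super().__enter__()
        # Newer transformers releases no longer keep per-layer fp32 cos/sin caches,
        # so the re-initialization only applies to older versions.
        if is_transformers_version("<", "4.44.99"):
            for layer in self._model.model.layers:
                _reinitialize_cos_sin_cached_fp32(layer.self_attn.rotary_emb)
```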
4 changes: 2 additions & 2 deletions optimum/exporters/openvino/stateful.py
@@ -151,7 +151,7 @@ def make_stateful(
shape[0] = num_beams_and_batch
input.get_node().set_partial_shape(shape)
else:
log.warn(f"Rank of {input.get_any_name()} input of the model is not 2, batch size is not set")
log.warning(f"Rank of {input.get_any_name()} input of the model is not 2, batch size is not set")

for kv_name_pair in zip(key_value_input_names, key_value_output_names):
input_output_map[kv_name_pair[0]] = kv_name_pair[1]
@@ -176,7 +176,7 @@ def ensure_stateful_is_available(warn=True):
"""
if is_openvino_version("<", "2023.3"):
if warn:
- log.warn(
+ log.warning(
f"Could not create or use stateful model when using old version of openvino=={_openvino_version}. It may result in sub-optimal inference performance."
"Install openvino>=2023.3.0."
)
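stateful.py keeps its behaviour and only switches the deprecated `Logger.warn` calls to `Logger.warning`. For context, a hedged sketch of the version gate that `ensure_stateful_is_available` implements; the helper name below is illustrative and the imports are the ones used elsewhere in this PR:

```python
# Hedged sketch of the OpenVINO version gate; the helper name is illustrative,
# the imports come from optimum.intel.utils.import_utils as used elsewhere in this PR.
import logging

from optimum.intel.utils.import_utils import _openvino_version, is_openvino_version

log = logging.getLogger(__name__)


def stateful_supported(warn: bool = True) -> bool:
    if is_openvino_version("<", "2023.3"):
        if warn:
            log.warning(
                f"Could not create or use stateful model when using old version of openvino=={_openvino_version}. "
                "It may result in sub-optimal inference performance. Install openvino>=2023.3.0."
            )
        return False
    return True
```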
96 changes: 65 additions & 31 deletions optimum/intel/neural_compressor/modeling_base.py
@@ -46,10 +46,8 @@
from transformers.utils import SAFE_WEIGHTS_NAME, WEIGHTS_NAME
from transformers.utils.generic import ContextManagers

- from optimum.intel.generation import BaseModelForCausalLM
-
from ...modeling_base import OptimizedModel
- from ..utils.import_utils import _torch_version, is_torch_version
+ from ..utils.import_utils import _torch_version, is_torch_version, is_transformers_version
from .configuration import INCConfig
from .quantization import _weight_only_quantization
from .utils import QUANTIZATION_CONFIG_NAME
@@ -85,14 +83,32 @@ def __init__(
inc_config: Dict = None,
**kwargs,
):
generation_config = kwargs.pop("generation_config", None)

super().__init__(model=model, config=config, **kwargs)
self.inc_config = inc_config
self._q_config = q_config
self.model_save_dir = model_save_dir
self._device = getattr(self.model, "device", None) or torch.device(
"cuda:0" if torch.cuda.is_available() else "cpu"
)
-        self.generation_config = GenerationConfig.from_model_config(config)
+        if self.can_generate():
+            self.generation_config = generation_config or GenerationConfig.from_model_config(config)
+
+            if is_transformers_version(">=", "4.44.99"):
+                misplaced_generation_parameters = self.config._get_non_default_generation_parameters()
+                if len(misplaced_generation_parameters) > 0:
+                    logger.warning(
+                        "Moving the following attributes in the config to the generation config: "
+                        f"{misplaced_generation_parameters}. You are seeing this warning because you've set "
+                        "generation parameters in the model config, as opposed to in the generation config.",
+                    )
+                    for param_name, param_value in misplaced_generation_parameters.items():
+                        setattr(self.generation_config, param_name, param_value)
+                        setattr(self.config, param_name, None)
+        else:
+            self.generation_config = None

# Registers the INCModelForXXX classes into the transformers AutoModel classes to avoid warnings when creating
# a pipeline https://github.com/huggingface/transformers/blob/cad61b68396a1a387287a8e2e2fef78a25b79383/src/transformers/pipelines/base.py#L863
@@ -126,9 +142,29 @@ def _from_pretrained(
token = use_auth_token

quantization_config = kwargs.pop("quantization_config", None)
generation_config = kwargs.pop("generation_config", None)

model_path = Path(model_id)
is_local = model_path.is_dir()

if generation_config is None and "text-generation" in cls.export_feature:
try:
generation_config = GenerationConfig.from_pretrained(
model_id,
cache_dir=cache_dir,
force_download=force_download,
local_files_only=local_files_only,
token=token,
revision=revision,
subfolder=subfolder,
)
if getattr(generation_config, "cache_implementation", None) is not None:
generation_config.cache_implementation = None
except OSError:
logger.info(
"Generation config file not found, using a generation config created from the model config."
)

# ITREX compatibility
quantization_config_path = None
if is_local:
@@ -202,7 +238,7 @@ def _from_pretrained(
**kwargs,
)

- return cls(model, config=config, model_save_dir=None, **kwargs).model
+ return cls(model, config=config, model_save_dir=None, generation_config=generation_config, **kwargs).model

model_cache_path = None
inc_config = None
@@ -261,7 +297,14 @@ def _from_pretrained(
)
model = torch.jit.load(model_cache_path)
model = torch.jit.freeze(model.eval())
- return cls(model, config=config, model_save_dir=model_save_dir, inc_config=inc_config, **kwargs)
+ return cls(
+     model,
+     config=config,
+     model_save_dir=model_save_dir,
+     inc_config=inc_config,
+     generation_config=generation_config,
+     **kwargs,
+ )

model_class = _get_model_class(config, cls.auto_model_class._model_mapping)
# Load the state dictionary of the model to verify whether the model to get the quantization config
@@ -283,7 +326,13 @@ def _from_pretrained(
raise

return cls(
- model, config=config, model_save_dir=model_save_dir, q_config=q_config, inc_config=inc_config, **kwargs
+ model,
+ config=config,
+ model_save_dir=model_save_dir,
+ q_config=q_config,
+ inc_config=inc_config,
+ generation_config=generation_config,
+ **kwargs,
)

def _save_pretrained(self, save_directory: Union[str, Path]):
Expand All @@ -304,6 +353,14 @@ def _save_pretrained(self, save_directory: Union[str, Path]):
if self.inc_config:
self.inc_config.save_pretrained(save_directory)

if self.generation_config is not None:
try:
self.generation_config.save_pretrained(save_directory)
except Exception as exception:
logger.warning(
f"The generation config will not be saved, saving failed with following error:\n{exception}"
)

def forward(self, *args, **kwargs):
return self.model(*args, **kwargs)

@@ -366,29 +423,6 @@ class INCModelForVision2Seq(INCModel):
export_feature = "image-to-text"


- class INCModelForCausalLM(INCModel, BaseModelForCausalLM):
+ class INCModelForCausalLM(INCModel):
auto_model_class = AutoModelForCausalLM
export_feature = "text-generation"
- forward = BaseModelForCausalLM.forward
- generate = BaseModelForCausalLM.generate
- can_generate = BaseModelForCausalLM.can_generate

- def __init__(
-     self,
-     model,
-     config: PretrainedConfig = None,
-     model_save_dir: Optional[Union[str, Path, TemporaryDirectory]] = None,
-     q_config: Dict = None,
-     inc_config: Dict = None,
-     use_cache: bool = True,
-     **kwargs,
- ):
-     super(INCModelForCausalLM, self).__init__(
-         model=model,
-         config=config,
-         model_save_dir=model_save_dir,
-         q_config=q_config,
-         inc_config=inc_config,
-         use_cache=use_cache,
-         **kwargs,
-     )
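With the `BaseModelForCausalLM` mixin removed, `INCModelForCausalLM` now inherits `forward`/`generate` from the shared `INCModel` base, and the generation config handled in `__init__`, `_from_pretrained` and `_save_pretrained` above travels with the model. A hedged usage sketch; the checkpoint name is hypothetical and only stands in for any quantized causal-LM saved with optimum-intel:

```python
# Hedged usage sketch; "my-org/quantized-gpt2-inc" is a hypothetical checkpoint name.
from transformers import AutoTokenizer
from optimum.intel import INCModelForCausalLM

model_id = "my-org/quantized-gpt2-inc"
model = INCModelForCausalLM.from_pretrained(model_id)   # loads generation_config.json when present
tokenizer = AutoTokenizer.from_pretrained(model_id)

inputs = tokenizer("Transformers v4.45 support", return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=8)    # generate() now comes from the base class
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))

model.save_pretrained("./quantized-gpt2-inc")            # also saves the generation config
```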