From cae68e5e4eb3436cbcb640d61a540660242e671a Mon Sep 17 00:00:00 2001
From: Ella Charlaix
Date: Thu, 12 Sep 2024 16:49:28 +0200
Subject: [PATCH 01/22] transformers v4.45 support

---
 .github/workflows/test_onnxruntime.yml | 1 +
 setup.py                               | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/test_onnxruntime.yml b/.github/workflows/test_onnxruntime.yml
index 291a3b0833..c4157797e7 100644
--- a/.github/workflows/test_onnxruntime.yml
+++ b/.github/workflows/test_onnxruntime.yml
@@ -42,6 +42,7 @@ jobs:
         run: |
           pip install --upgrade pip
           pip install --no-cache-dir torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
+          pip install git+https://github.com/huggingface/transformers
           pip install .[tests,onnxruntime]

       - name: Test with pytest (in series)
diff --git a/setup.py b/setup.py
index ac5db71a74..231dc9110e 100644
--- a/setup.py
+++ b/setup.py
@@ -15,7 +15,7 @@
 REQUIRED_PKGS = [
     "coloredlogs",
     "sympy",
-    "transformers[sentencepiece]>=4.29,<4.45.0",
+    "transformers[sentencepiece]>=4.29,<4.46.0",
     "torch>=1.11",
     "packaging",
     "numpy<2.0",  # transformers requires numpy<2.0 https://github.com/huggingface/transformers/pull/31569

From 69a1d2e48ecb5a8d5042cd95ac77e9933dd2998d Mon Sep 17 00:00:00 2001
From: Ella Charlaix
Date: Fri, 13 Sep 2024 14:11:59 +0200
Subject: [PATCH 02/22] fix transformers v4.45 compatibility

---
 optimum/exporters/onnx/convert.py       | 13 +++++++++++++
 optimum/exporters/onnx/model_configs.py |  8 ++++----
 2 files changed, 17 insertions(+), 4 deletions(-)

diff --git a/optimum/exporters/onnx/convert.py b/optimum/exporters/onnx/convert.py
index 63a9067b90..e5a912b18e 100644
--- a/optimum/exporters/onnx/convert.py
+++ b/optimum/exporters/onnx/convert.py
@@ -38,6 +38,7 @@
     is_torch_onnx_support_available,
     logging,
     require_numpy_strictly_lower,
+    check_if_transformers_greater,
 )
 from ...utils.modeling_utils import MODEL_TO_PATCH_FOR_PAST
 from ...utils.save_utils import maybe_save_preprocessors
@@ -1120,6 +1121,18 @@ def onnx_export_from_model(
     if isinstance(atol, dict):
         atol = atol[task.replace("-with-past", "")]

+    if check_if_transformers_greater("4.44.99"):
+        misplaced_generation_parameters = model.config._get_non_default_generation_parameters()
+        if model.can_generate() and len(misplaced_generation_parameters) > 0:
+            logger.warning(
+                "Moving the following attributes in the config to the generation config: "
+                f"{misplaced_generation_parameters}. You are seeing this warning because you've set "
+                "generation parameters in the model config, as opposed to in the generation config.",
+            )
+            for param_name, param_value in misplaced_generation_parameters.items():
+                setattr(model.generation_config, param_name, param_value)
+                setattr(model.config, param_name, None)
+
     # Saving the model config and preprocessor as this is needed sometimes.
     model.config.save_pretrained(output)
     generation_config = getattr(model, "generation_config", None)
diff --git a/optimum/exporters/onnx/model_configs.py b/optimum/exporters/onnx/model_configs.py
index d4b15b2968..e927d5dfe9 100644
--- a/optimum/exporters/onnx/model_configs.py
+++ b/optimum/exporters/onnx/model_configs.py
@@ -119,7 +119,7 @@ def inputs(self) -> Dict[str, Dict[int, str]]:

 class AlbertOnnxConfig(BertOnnxConfig):
-    DEFAULT_ONNX_OPSET = 11
+    DEFAULT_ONNX_OPSET = 14  # now uses F.scaled_dot_product_attention by default for torch>=2.1.1.
 class ConvBertOnnxConfig(BertOnnxConfig):
@@ -171,11 +171,11 @@ class MPNetOnnxConfig(DistilBertOnnxConfig):

 class RobertaOnnxConfig(DistilBertOnnxConfig):
-    pass
+    DEFAULT_ONNX_OPSET = 14  # now uses F.scaled_dot_product_attention by default for torch>=2.1.1.

 class CamembertOnnxConfig(DistilBertOnnxConfig):
-    pass
+    DEFAULT_ONNX_OPSET = 14  # now uses F.scaled_dot_product_attention by default for torch>=2.1.1.

 class FlaubertOnnxConfig(BertOnnxConfig):
@@ -187,7 +187,7 @@ class IBertOnnxConfig(DistilBertOnnxConfig):

 class XLMRobertaOnnxConfig(DistilBertOnnxConfig):
-    pass
+    DEFAULT_ONNX_OPSET = 14  # now uses F.scaled_dot_product_attention by default for torch>=2.1.1.

 class DebertaOnnxConfig(BertOnnxConfig):

From 450a5a45dd0a2d31807b3f1f0d4139a5e4e515d5 Mon Sep 17 00:00:00 2001
From: Ella Charlaix
Date: Fri, 13 Sep 2024 15:58:19 +0200
Subject: [PATCH 03/22] update opset

---
 optimum/exporters/onnx/model_configs.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/optimum/exporters/onnx/model_configs.py b/optimum/exporters/onnx/model_configs.py
index e927d5dfe9..f24a984ca0 100644
--- a/optimum/exporters/onnx/model_configs.py
+++ b/optimum/exporters/onnx/model_configs.py
@@ -257,7 +257,7 @@ class ImageGPTOnnxConfig(GPT2OnnxConfig):

 class GPTNeoOnnxConfig(TextDecoderWithPositionIdsOnnxConfig):
-    DEFAULT_ONNX_OPSET = 13
+    DEFAULT_ONNX_OPSET = 14
     NORMALIZED_CONFIG_CLASS = NormalizedTextConfig.with_args(num_attention_heads="num_heads")

From e29f4666551f23fe37b06eaf1b2aa1ba6e681f5f Mon Sep 17 00:00:00 2001
From: Ella Charlaix
Date: Fri, 13 Sep 2024 16:35:50 +0200
Subject: [PATCH 04/22] fix config

---
 optimum/modeling_base.py                |  3 ++
 optimum/onnxruntime/modeling_decoder.py | 53 ++++++++++++---------
 optimum/onnxruntime/modeling_ort.py     |  3 --
 optimum/onnxruntime/modeling_seq2seq.py | 63 +++++++++++--------------
 4 files changed, 62 insertions(+), 60 deletions(-)

diff --git a/optimum/modeling_base.py b/optimum/modeling_base.py
index 5bab0622de..cfef0bac8e 100644
--- a/optimum/modeling_base.py
+++ b/optimum/modeling_base.py
@@ -372,6 +372,9 @@ def from_pretrained(
             export = from_transformers

         if len(model_id.split("@")) == 2:
+            logger.warning(
+                f"Specifying the `revision` as @{model_id.split('@')[1]} is deprecated and will be removed in v1.23, please use the `revision` argument instead."
+            )
             if revision is not None:
                 logger.warning(
                     f"The argument `revision` was set to {revision} but will be ignored for {model_id.split('@')[1]}"
diff --git a/optimum/onnxruntime/modeling_decoder.py b/optimum/onnxruntime/modeling_decoder.py
index f6d4b7e20a..35417ba60f 100644
--- a/optimum/onnxruntime/modeling_decoder.py
+++ b/optimum/onnxruntime/modeling_decoder.py
@@ -149,6 +149,19 @@ def __init__(
             generation_config = GenerationConfig.from_model_config(config)

         self.generation_config = generation_config
+
+        if check_if_transformers_greater("4.44.99"):
+            misplaced_generation_parameters = self.config._get_non_default_generation_parameters()
+            if len(misplaced_generation_parameters) > 0:
+                logger.warning(
+                    "Moving the following attributes in the config to the generation config: "
+                    f"{misplaced_generation_parameters}. You are seeing this warning because you've set "
+                    "generation parameters in the model config, as opposed to in the generation config.",
+                )
+                for param_name, param_value in misplaced_generation_parameters.items():
+                    setattr(self.generation_config, param_name, param_value)
+                    setattr(self.config, param_name, None)
+
         self.onnx_paths = [self.model_path]
         self.use_merged = "use_cache_branch" in self.input_names
         self.model_type = self.config.model_type
@@ -393,7 +406,6 @@ def _from_pretrained(
         cls,
         model_id: Union[str, Path],
         config: "PretrainedConfig",
-        use_auth_token: Optional[Union[bool, str]] = None,
         token: Optional[Union[bool, str]] = None,
         revision: Optional[str] = None,
         force_download: bool = False,
@@ -410,15 +422,7 @@ def _from_pretrained(
         model_save_dir: Optional[Union[str, Path, TemporaryDirectory]] = None,
         **kwargs,
     ) -> "ORTModelForCausalLM":
-        if use_auth_token is not None:
-            warnings.warn(
-                "The `use_auth_token` argument is deprecated and will be removed soon. Please use the `token` argument instead.",
-                FutureWarning,
-            )
-            if token is not None:
-                raise ValueError("You cannot use both `use_auth_token` and `token` arguments at the same time.")
-            token = use_auth_token
-
+        generation_config = kwargs.pop("generation_config", None)
         model_path = Path(model_id)

         # We do not implement the logic for use_cache=False, use_merged=True
@@ -586,6 +590,22 @@ def _from_pretrained(
         else:
             init_cls = ORTModelForCausalLM

+        if generation_config is None:
+            try:
+                generation_config = GenerationConfig.from_pretrained(
+                    model_id,
+                    cache_dir=cache_dir,
+                    force_download=force_download,
+                    local_files_only=local_files_only,
+                    token=token,
+                    revision=revision,
+                    subfolder=subfolder,
+                )
+            except OSError:
+                logger.info(
+                    "Generation config file not found, using a generation config created from the model config."
+                )
+
         return init_cls(
             model=model,
             config=config,
@@ -593,6 +613,7 @@ def _from_pretrained(
             model_save_dir=model_save_dir,
             preprocessors=preprocessors,
             use_cache=use_cache,
+            generation_config=generation_config,
         )

     @classmethod
@@ -600,7 +621,6 @@ def _from_transformers(
         cls,
         model_id: str,
         config: "PretrainedConfig",
-        use_auth_token: Optional[Union[bool, str]] = None,
         token: Optional[Union[bool, str]] = None,
         revision: str = "main",
         force_download: bool = True,
@@ -616,15 +636,6 @@ def _from_transformers(
         use_io_binding: Optional[bool] = None,
         task: Optional[str] = None,
     ) -> "ORTModelForCausalLM":
-        if use_auth_token is not None:
-            warnings.warn(
-                "The `use_auth_token` argument is deprecated and will be removed soon. Please use the `token` argument instead.",
-                FutureWarning,
-            )
-            if token is not None:
-                raise ValueError("You cannot use both `use_auth_token` and `token` arguments at the same time.")
-            token = use_auth_token
-
         file_name = ONNX_WEIGHTS_NAME

         if use_merged:
@@ -655,8 +666,6 @@ def _from_transformers(
             force_download=force_download,
             trust_remote_code=trust_remote_code,
         )
-
-        config.save_pretrained(save_dir_path)
         maybe_save_preprocessors(model_id, save_dir_path, src_subfolder=subfolder)

         return cls._from_pretrained(
diff --git a/optimum/onnxruntime/modeling_ort.py b/optimum/onnxruntime/modeling_ort.py
index 254b771e33..0485b9fb27 100644
--- a/optimum/onnxruntime/modeling_ort.py
+++ b/optimum/onnxruntime/modeling_ort.py
@@ -661,8 +661,6 @@ def _export(
             force_download=force_download,
             trust_remote_code=trust_remote_code,
         )
-
-        config.save_pretrained(save_dir_path)
         maybe_save_preprocessors(model_id, save_dir_path, src_subfolder=subfolder)

         return cls._from_pretrained(
@@ -1169,7 +1167,6 @@ def _export(
             library_name="transformers",
         )

-        config.save_pretrained(save_dir_path)
         maybe_save_preprocessors(model_id, save_dir_path, src_subfolder=subfolder)

         return cls._from_pretrained(
diff --git a/optimum/onnxruntime/modeling_seq2seq.py b/optimum/onnxruntime/modeling_seq2seq.py
index 4ce3e4707e..46a6609b81 100644
--- a/optimum/onnxruntime/modeling_seq2seq.py
+++ b/optimum/onnxruntime/modeling_seq2seq.py
@@ -717,6 +717,18 @@ def show_deprecated_argument(arg_name):
             generation_config = GenerationConfig.from_model_config(config)
         self.generation_config = generation_config

+        if check_if_transformers_greater("4.44.99"):
+            misplaced_generation_parameters = self.config._get_non_default_generation_parameters()
+            if len(misplaced_generation_parameters) > 0:
+                logger.warning(
+                    "Moving the following attributes in the config to the generation config: "
+                    f"{misplaced_generation_parameters}. You are seeing this warning because you've set "
+                    "generation parameters in the model config, as opposed to in the generation config.",
+                )
+                for param_name, param_value in misplaced_generation_parameters.items():
+                    setattr(self.generation_config, param_name, param_value)
+                    setattr(self.config, param_name, None)
+
     @abstractmethod
     def _initialize_encoder(self, session: ort.InferenceSession) -> ORTEncoder:
         pass
@@ -791,7 +803,6 @@ def _from_pretrained(
         cls,
         model_id: Union[str, Path],
         config: "PretrainedConfig",
-        use_auth_token: Optional[Union[bool, str]] = None,
         token: Optional[Union[bool, str]] = None,
         revision: Optional[str] = None,
         force_download: bool = False,
@@ -810,15 +821,7 @@ def _from_pretrained(
         model_save_dir: Optional[Union[str, Path, TemporaryDirectory]] = None,
         **kwargs,
     ):
-        if use_auth_token is not None:
-            warnings.warn(
-                "The `use_auth_token` argument is deprecated and will be removed soon. Please use the `token` argument instead.",
-                FutureWarning,
-            )
-            if token is not None:
-                raise ValueError("You cannot use both `use_auth_token` and `token` arguments at the same time.")
-            token = use_auth_token
-
+        generation_config = kwargs.pop("generation_config", None)
         model_path = Path(model_id)

         # We do not implement the logic for use_cache=False, use_merged=True
@@ -1007,19 +1010,21 @@ def _from_pretrained(
         if model_save_dir is None:
             model_save_dir = new_model_save_dir

-        generation_config = None
-        try:
-            generation_config = GenerationConfig.from_pretrained(
-                model_id,
-                cache_dir=cache_dir,
-                force_download=force_download,
-                local_files_only=local_files_only,
-                token=token,
-                revision=revision,
-                subfolder=subfolder,
-            )
-        except OSError:
-            logger.info("Generation config file not found, using a generation config created from the model config.")
+        if generation_config is None:
+            try:
+                generation_config = GenerationConfig.from_pretrained(
+                    model_id,
+                    cache_dir=cache_dir,
+                    force_download=force_download,
+                    local_files_only=local_files_only,
+                    token=token,
+                    revision=revision,
+                    subfolder=subfolder,
+                )
+            except OSError:
+                logger.info(
+                    "Generation config file not found, using a generation config created from the model config."
+                )

         onnx_paths = [encoder_path]
         if use_merged is False:
@@ -1046,7 +1051,6 @@ def _from_transformers(
         cls,
         model_id: str,
         config: "PretrainedConfig",
-        use_auth_token: Optional[Union[bool, str]] = None,
         token: Optional[Union[bool, str]] = None,
         revision: str = "main",
         force_download: bool = True,
@@ -1062,15 +1066,6 @@ def _from_transformers(
         use_io_binding: Optional[bool] = None,
         task: Optional[str] = None,
     ) -> "ORTModelForConditionalGeneration":
-        if use_auth_token is not None:
-            warnings.warn(
-                "The `use_auth_token` argument is deprecated and will be removed soon. Please use the `token` argument instead.",
-                FutureWarning,
-            )
-            if token is not None:
-                raise ValueError("You cannot use both `use_auth_token` and `token` arguments at the same time.")
-            token = use_auth_token
-
         if use_cache is False and use_merged is True:
             raise ValueError(
                 "The incompatible arguments use_cache=False, use_merged=True were passed to"
@@ -1102,8 +1097,6 @@ def _from_transformers(
             force_download=force_download,
             trust_remote_code=trust_remote_code,
         )
-
-        config.save_pretrained(save_dir_path)
         maybe_save_preprocessors(model_id, save_dir_path, src_subfolder=subfolder)

         return cls._from_pretrained(
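A note on the pattern introduced in PATCH 02 and PATCH 04 above: when running against transformers >= 4.45, generation defaults that were stored on the model config are moved onto the generation config before the config is used or saved. The standalone sketch below is not part of the patch series; GPT2Config and the max_length value are arbitrary placeholders, and the exact set of parameters reported by _get_non_default_generation_parameters depends on the installed transformers version.

    from transformers import GenerationConfig, GPT2Config

    config = GPT2Config()
    config.max_length = 64  # a generation parameter placed on the model config

    generation_config = GenerationConfig.from_model_config(config)
    # same migration as in the hunks above: move the misplaced values, then clear them
    for name, value in config._get_non_default_generation_parameters().items():
        setattr(generation_config, name, value)
        setattr(config, name, None)

    print(generation_config.max_length)  # 64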
From e69658c21b705e27a099304f2884df5fc00f4eb7 Mon Sep 17 00:00:00 2001
From: Ella Charlaix
Date: Fri, 13 Sep 2024 16:46:09 +0200
Subject: [PATCH 05/22] fix

---
 Makefile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Makefile b/Makefile
index e2c2126303..824ef3d0cf 100644
--- a/Makefile
+++ b/Makefile
@@ -23,11 +23,11 @@ REAL_CLONE_URL = $(if $(CLONE_URL),$(CLONE_URL),$(DEFAULT_CLONE_URL))
 # Run code quality checks
 style_check:
 	black --check .
-	ruff .
+	ruff check .

 style:
 	black .
-	ruff . --fix
+	ruff check . --fix

 # Run tests for the library
 test:

From 0cc167d7eacd615a9a60b35bed9cd202d75f17eb Mon Sep 17 00:00:00 2001
From: Ella Charlaix
Date: Fri, 13 Sep 2024 16:46:18 +0200
Subject: [PATCH 06/22] style

---
 optimum/exporters/onnx/convert.py       | 2 +-
 optimum/onnxruntime/modeling_decoder.py | 1 -
 optimum/onnxruntime/modeling_seq2seq.py | 1 -
 3 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/optimum/exporters/onnx/convert.py b/optimum/exporters/onnx/convert.py
index e5a912b18e..7142d5be4a 100644
--- a/optimum/exporters/onnx/convert.py
+++ b/optimum/exporters/onnx/convert.py
@@ -34,11 +34,11 @@
     DEFAULT_DUMMY_SHAPES,
     ONNX_WEIGHTS_NAME,
     TORCH_MINIMUM_VERSION,
+    check_if_transformers_greater,
     is_diffusers_available,
     is_torch_onnx_support_available,
     logging,
     require_numpy_strictly_lower,
-    check_if_transformers_greater,
 )
 from ...utils.modeling_utils import MODEL_TO_PATCH_FOR_PAST
 from ...utils.save_utils import maybe_save_preprocessors
diff --git a/optimum/onnxruntime/modeling_decoder.py b/optimum/onnxruntime/modeling_decoder.py
index 35417ba60f..239eaee5f3 100644
--- a/optimum/onnxruntime/modeling_decoder.py
+++ b/optimum/onnxruntime/modeling_decoder.py
@@ -14,7 +14,6 @@
 """Classes handling causal-lm related architectures in ONNX Runtime."""

 import logging
-import warnings
 from pathlib import Path
 from tempfile import TemporaryDirectory
 from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
diff --git a/optimum/onnxruntime/modeling_seq2seq.py b/optimum/onnxruntime/modeling_seq2seq.py
index 46a6609b81..3c0a7f4b06 100644
--- a/optimum/onnxruntime/modeling_seq2seq.py
+++ b/optimum/onnxruntime/modeling_seq2seq.py
@@ -18,7 +18,6 @@

 import logging
 import shutil
-import warnings
 from abc import ABC, abstractmethod
 from pathlib import Path
 from tempfile import TemporaryDirectory

From c98d5d6fb48bc31f0425fa732909f53c4c026f31 Mon Sep 17 00:00:00 2001
From: Ella Charlaix
Date: Mon, 16 Sep 2024 11:15:44 +0200
Subject: [PATCH 07/22] fix

---
 optimum/onnxruntime/optimization.py          | 12 ++++++++----
 tests/onnxruntime/test_modeling.py           |  2 +-
 tests/onnxruntime/utils_onnxruntime_tests.py |  2 +-
 3 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/optimum/onnxruntime/optimization.py b/optimum/onnxruntime/optimization.py
index 9e62a3f324..fd6958bba7 100644
--- a/optimum/onnxruntime/optimization.py
+++ b/optimum/onnxruntime/optimization.py
@@ -20,6 +20,7 @@

 import onnx
 from onnx import load_model
+from transformers import GenerationConfig
 from transformers.models.auto.configuration_auto import AutoConfig

 from onnxruntime.transformers.onnx_model_bert import BertOnnxModel
@@ -152,10 +153,6 @@ def optimize(
         save_dir = Path(save_dir)
         save_dir.mkdir(parents=True, exist_ok=True)
         ORTConfigManager.check_optimization_supported_model(self.model_type, optimization_config)
-
-        self.config.save_pretrained(save_dir)
-        maybe_save_preprocessors(self.onnx_model_path[0].parent, save_dir)
-
         model_type = ORTConfigManager.get_model_ort_type(self.config.model_type)
         optimization_options = optimization_config.create_fusion_options(model_type)
@@ -236,6 +233,13 @@ def optimize(
         # Save the model configuration
         self.config.save_pretrained(save_dir)
         ort_config.save_pretrained(save_dir)
+        maybe_save_preprocessors(self.onnx_model_path[0].parent, save_dir)
+
+        try:
+            generation_config = GenerationConfig.from_pretrained(self.onnx_model_path[0].parent)
+            generation_config.save_pretrained(save_dir)
+        except Exception:
+            pass

         logger.info(
             f"Optimized model saved at: {save_dir} (external data format: "
diff --git a/tests/onnxruntime/test_modeling.py b/tests/onnxruntime/test_modeling.py
index 4b44acb38a..a19dbcd1a8 100644
--- a/tests/onnxruntime/test_modeling.py
+++ b/tests/onnxruntime/test_modeling.py
@@ -3147,7 +3147,7 @@ class ORTModelForAudioClassificationIntegrationTest(ORTModelTestMixin):
         "wavlm",
         "wav2vec2",
         "wav2vec2-conformer",
-        "whisper",
+        # "whisper",  saving of the model's config is broken in transformers v4.45.0
     ]

     FULL_GRID = {"model_arch": SUPPORTED_ARCHITECTURES}
diff --git a/tests/onnxruntime/utils_onnxruntime_tests.py b/tests/onnxruntime/utils_onnxruntime_tests.py
index bb6935461d..8d25d88df5 100644
--- a/tests/onnxruntime/utils_onnxruntime_tests.py
+++ b/tests/onnxruntime/utils_onnxruntime_tests.py
@@ -114,7 +114,7 @@
     "longt5": "hf-internal-testing/tiny-random-LongT5Model",
     "llama": "fxmarty/tiny-llama-fast-tokenizer",
     "m2m_100": "hf-internal-testing/tiny-random-m2m_100",
-    "marian": "sshleifer/tiny-marian-en-de",  # hf-internal-testing ones are broken
+    "marian": "fxmarty/tiny-marian",  # hf-internal-testing ones are broken
     "mbart": "hf-internal-testing/tiny-random-mbart",
     "mistral": "echarlaix/tiny-random-mistral",
     "mobilebert": "hf-internal-testing/tiny-random-MobileBertModel",

From 9a6f601536a4623a7cf1ec58dc7b32e220e309c2 Mon Sep 17 00:00:00 2001
From: Ella Charlaix
Date: Mon, 16 Sep 2024 15:28:07 +0200
Subject: [PATCH 08/22] update model

---
 tests/onnxruntime/utils_onnxruntime_tests.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/onnxruntime/utils_onnxruntime_tests.py b/tests/onnxruntime/utils_onnxruntime_tests.py
index 8d25d88df5..804aa00f04 100644
--- a/tests/onnxruntime/utils_onnxruntime_tests.py
+++ b/tests/onnxruntime/utils_onnxruntime_tests.py
@@ -114,7 +114,7 @@
     "longt5": "hf-internal-testing/tiny-random-LongT5Model",
     "llama": "fxmarty/tiny-llama-fast-tokenizer",
     "m2m_100": "hf-internal-testing/tiny-random-m2m_100",
-    "marian": "fxmarty/tiny-marian",  # hf-internal-testing ones are broken
+    "marian": "echarlaix/tiny-random-marian",
     "mbart": "hf-internal-testing/tiny-random-mbart",
     "mistral": "echarlaix/tiny-random-mistral",
     "mobilebert": "hf-internal-testing/tiny-random-MobileBertModel",

From fadadc9a441734696f8d26dc974eb80c0e33b247 Mon Sep 17 00:00:00 2001
From: Ella Charlaix
Date: Mon, 16 Sep 2024 18:11:00 +0200
Subject: [PATCH 09/22] Add generation config saving

---
 optimum/onnxruntime/modeling_decoder.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/optimum/onnxruntime/modeling_decoder.py b/optimum/onnxruntime/modeling_decoder.py
index 239eaee5f3..378fdbb6db 100644
--- a/optimum/onnxruntime/modeling_decoder.py
+++ b/optimum/onnxruntime/modeling_decoder.py
@@ -720,6 +720,9 @@ def _reorder_cache(past: Tuple[Tuple[torch.Tensor]], beam_idx: torch.Tensor) ->
             for layer_past in past
         )

+    def _save_pretrained(self, save_directory: Union[str, Path]):
+        super()._save_pretrained(save_directory)
+        self.generation_config.save_pretrained(save_directory)

 class ORTGPTBigCodeForCausalLM(ORTModelForCausalLM):
     # Adapted from transformers.models.gpt_bigcode.modeling_gpt_bigcode.GPTBigCodeForCausalLM.prepare_inputs_for_generation

From 9fa9e9fc3ebd4ca719ac88aa06942cf6ae1da2d8 Mon Sep 17 00:00:00 2001
From: Ella Charlaix
Date: Wed, 25 Sep 2024 19:01:54 +0200
Subject: [PATCH 10/22] fix codegen

---
 optimum/bettertransformer/models/attention.py | 20 ++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/optimum/bettertransformer/models/attention.py b/optimum/bettertransformer/models/attention.py
index 9dfa57844d..53e6a676e6 100644
--- a/optimum/bettertransformer/models/attention.py
+++ b/optimum/bettertransformer/models/attention.py
@@ -195,7 +195,7 @@ def codegen_wrapped_scaled_dot_product(
                 query, key, value, attn_mask=None, dropout_p=dropout_p, is_causal=True
             )
         else:
-            # in this case, which is the later decoding steps, the `causal_mask`` in
+            # in this case, which is the later decoding steps, the `causal_mask` in
             # https://github.com/huggingface/transformers/blob/ae54e3c3b18bac0832ad62ea9b896dfd52a09850/src/transformers/models/gpt2/modeling_gpt2.py#L195
             # is [True, ..., True] so actually not causal
             sdpa_result = torch.nn.functional.scaled_dot_product_attention(
@@ -207,15 +207,20 @@ def codegen_wrapped_scaled_dot_product(
         # causal_mask is always [True, ..., True] otherwise, so executing this
         # is unnecessary
         if query_length > 1:
-            causal_mask = self.causal_mask[:, :, key_length - query_length : key_length, :key_length].to(torch.bool)
-            causal_mask = torch.where(causal_mask, 0, mask_value)
+            if not check_if_transformers_greater("4.44.99"):
+                causal_mask = self.causal_mask[:, :, key_length - query_length : key_length, :key_length].to(torch.bool)

-            # torch.Tensor.expand does no memory copy
-            causal_mask = causal_mask.expand(batch_size, -1, -1, -1)
+                causal_mask = torch.where(causal_mask, 0, mask_value)

-            # we use torch.min to avoid having tensor(-inf)
-            attention_mask = torch.min(causal_mask, attention_mask)
+                # torch.Tensor.expand does no memory copy
+                causal_mask = causal_mask.expand(batch_size, -1, -1, -1)
+
+                # we use torch.min to avoid having tensor(-inf)
+                attention_mask = torch.min(causal_mask, attention_mask)
+            else:
+
+                attention_mask = attention_mask[:, :, :, : key.shape[-2]]

         sdpa_result = torch.nn.functional.scaled_dot_product_attention(
             query, key, value, attn_mask=attention_mask, dropout_p=dropout_p, is_causal=False
@@ -224,6 +229,7 @@ def codegen_wrapped_scaled_dot_product(

     return sdpa_result, None

+
 # Adapted from transformers.models.opt.modeling_opt.OPTAttention.forward
 def opt_forward(
     self,
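The `else` branch added in PATCH 10 above hands masking over to torch's scaled_dot_product_attention: instead of rebuilding a boolean causal mask, the 4D additive mask prepared by transformers (which can extend past the current key length) is sliced to the cached key length. A self-contained sketch of that call with toy shapes follows; none of these tensor names or sizes come from the patch itself.

    import torch

    batch, heads, kv_len, dim = 2, 4, 10, 8
    query = torch.randn(batch, heads, 1, dim)        # a single decoding step
    key = torch.randn(batch, heads, kv_len, dim)     # cached keys
    value = torch.randn(batch, heads, kv_len, dim)   # cached values
    attention_mask = torch.zeros(batch, 1, 1, kv_len + 2)  # additive mask, longer than the cache

    out = torch.nn.functional.scaled_dot_product_attention(
        query,
        key,
        value,
        attn_mask=attention_mask[:, :, :, : key.shape[-2]],  # slice to the key length
        dropout_p=0.0,
        is_causal=False,  # one query token attends to every cached position
    )
    print(out.shape)  # torch.Size([2, 4, 1, 8])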
From bf913c285b3ef149070572a8e509d0ac7b874ceb Mon Sep 17 00:00:00 2001
From: Ella Charlaix
Date: Wed, 25 Sep 2024 19:04:53 +0200
Subject: [PATCH 11/22] udpate setup

---
 .github/workflows/test_onnxruntime.yml        | 1 -
 optimum/bettertransformer/models/attention.py | 7 +++----
 optimum/onnxruntime/modeling_decoder.py       | 1 +
 setup.py                                      | 3 ++-
 4 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/test_onnxruntime.yml b/.github/workflows/test_onnxruntime.yml
index c4157797e7..291a3b0833 100644
--- a/.github/workflows/test_onnxruntime.yml
+++ b/.github/workflows/test_onnxruntime.yml
@@ -42,7 +42,6 @@ jobs:
         run: |
           pip install --upgrade pip
           pip install --no-cache-dir torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
-          pip install git+https://github.com/huggingface/transformers
           pip install .[tests,onnxruntime]

       - name: Test with pytest (in series)
diff --git a/optimum/bettertransformer/models/attention.py b/optimum/bettertransformer/models/attention.py
index 53e6a676e6..63f0275464 100644
--- a/optimum/bettertransformer/models/attention.py
+++ b/optimum/bettertransformer/models/attention.py
@@ -207,9 +207,10 @@ def codegen_wrapped_scaled_dot_product(
         # causal_mask is always [True, ..., True] otherwise, so executing this
         # is unnecessary
         if query_length > 1:
-            if not check_if_transformers_greater("4.44.99"):
-                causal_mask = self.causal_mask[:, :, key_length - query_length : key_length, :key_length].to(torch.bool)
+            if not check_if_transformers_greater("4.44.99"):
+                causal_mask = self.causal_mask[:, :, key_length - query_length : key_length, :key_length].to(
+                    torch.bool
+                )

                 causal_mask = torch.where(causal_mask, 0, mask_value)
@@ -219,7 +220,6 @@ def codegen_wrapped_scaled_dot_product(
                 # we use torch.min to avoid having tensor(-inf)
                 attention_mask = torch.min(causal_mask, attention_mask)
             else:
-
                 attention_mask = attention_mask[:, :, :, : key.shape[-2]]

         sdpa_result = torch.nn.functional.scaled_dot_product_attention(
@@ -229,7 +229,6 @@ def codegen_wrapped_scaled_dot_product(

     return sdpa_result, None

-
 # Adapted from transformers.models.opt.modeling_opt.OPTAttention.forward
 def opt_forward(
     self,
diff --git a/optimum/onnxruntime/modeling_decoder.py b/optimum/onnxruntime/modeling_decoder.py
index 378fdbb6db..bda3ec98d9 100644
--- a/optimum/onnxruntime/modeling_decoder.py
+++ b/optimum/onnxruntime/modeling_decoder.py
@@ -724,6 +724,7 @@ def _save_pretrained(self, save_directory: Union[str, Path]):
         super()._save_pretrained(save_directory)
         self.generation_config.save_pretrained(save_directory)

+
 class ORTGPTBigCodeForCausalLM(ORTModelForCausalLM):
     # Adapted from transformers.models.gpt_bigcode.modeling_gpt_bigcode.GPTBigCodeForCausalLM.prepare_inputs_for_generation
     def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs):
diff --git a/setup.py b/setup.py
index 231dc9110e..c961cf973c 100644
--- a/setup.py
+++ b/setup.py
@@ -15,7 +15,8 @@
 REQUIRED_PKGS = [
     "coloredlogs",
     "sympy",
-    "transformers[sentencepiece]>=4.29,<4.46.0",
+    "transformers @ git+https://github.com/huggingface/transformers.git",
+    # "transformers[sentencepiece]>=4.29,<4.46.0",
     "torch>=1.11",
     "packaging",
     "numpy<2.0",  # transformers requires numpy<2.0 https://github.com/huggingface/transformers/pull/31569

From 94dee276bbb1c8b1227ca2b131b1566dd6d81b79 Mon Sep 17 00:00:00 2001
From: Ella Charlaix
Date: Thu, 26 Sep 2024 10:52:41 +0200
Subject: [PATCH 12/22] update setup

---
 setup.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/setup.py b/setup.py
index c961cf973c..231dc9110e 100644
--- a/setup.py
+++ b/setup.py
@@ -15,8 +15,7 @@
 REQUIRED_PKGS = [
     "coloredlogs",
     "sympy",
-    "transformers @ git+https://github.com/huggingface/transformers.git",
-    # "transformers[sentencepiece]>=4.29,<4.46.0",
+    "transformers[sentencepiece]>=4.29,<4.46.0",
     "torch>=1.11",
     "packaging",
     "numpy<2.0",  # transformers requires numpy<2.0 https://github.com/huggingface/transformers/pull/31569

From 3bfa30e9c98b59399eea0c5b0b49954a63a9c6d1 Mon Sep 17 00:00:00 2001
From: Ella Charlaix
Date: Thu, 26 Sep 2024 11:01:47 +0200
Subject: [PATCH 13/22] bump default opset m2m100

---
 optimum/exporters/onnx/model_configs.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/optimum/exporters/onnx/model_configs.py b/optimum/exporters/onnx/model_configs.py
index f24a984ca0..36963a986d 100644
--- a/optimum/exporters/onnx/model_configs.py
+++ b/optimum/exporters/onnx/model_configs.py
@@ -564,6 +564,7 @@ def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int

 class M2M100OnnxConfig(TextSeq2SeqOnnxConfig):
+    DEFAULT_ONNX_OPSET = 14  # now uses F.scaled_dot_product_attention by default for torch>=2.1.1.
     NORMALIZED_CONFIG_CLASS = NormalizedSeq2SeqConfig.with_args(
         encoder_num_layers="encoder_layers",
         decoder_num_layers="decoder_layers",

From 7bf1d305ea7dd1110c849dbbb25593186817651b Mon Sep 17 00:00:00 2001
From: Ella Charlaix
Date: Thu, 26 Sep 2024 18:36:45 +0200
Subject: [PATCH 14/22] fix codegen

---
 optimum/bettertransformer/models/decoder_models.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/optimum/bettertransformer/models/decoder_models.py b/optimum/bettertransformer/models/decoder_models.py
index b64b7f5a1e..95ebcdd25f 100644
--- a/optimum/bettertransformer/models/decoder_models.py
+++ b/optimum/bettertransformer/models/decoder_models.py
@@ -238,12 +238,16 @@ def __init__(self, layer: "nn.Module", config: "PretrainedConfig"):
         super(BetterTransformerBaseLayer, self).__init__(config)

         self.module_mapping = None
-        submodules = ["attn_dropout", "resid_dropout", "qkv_proj", "out_proj", "causal_mask", "scale_attn"]
+        submodules = ["attn_dropout", "resid_dropout", "qkv_proj", "out_proj", "scale_attn"]

         # Attribute only for transformers>=4.28
         if hasattr(layer, "embed_positions"):
             submodules.append("embed_positions")

+        # Attribute only for transformers<4.45
+        if hasattr(layer, "causal_mask"):
+            submodules.append("causal_mask")
+
         for attr in submodules:
             setattr(self, attr, getattr(layer, attr))

From f01cccf799d754736d54f381be4b01eec966aa89 Mon Sep 17 00:00:00 2001
From: Ella Charlaix
Date: Thu, 26 Sep 2024 18:45:22 +0200
Subject: [PATCH 15/22] fix

---
 optimum/bettertransformer/models/decoder_models.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/optimum/bettertransformer/models/decoder_models.py b/optimum/bettertransformer/models/decoder_models.py
index 95ebcdd25f..8393c92b8b 100644
--- a/optimum/bettertransformer/models/decoder_models.py
+++ b/optimum/bettertransformer/models/decoder_models.py
@@ -96,14 +96,19 @@ def __init__(self, layer: "nn.Module", config: "PretrainedConfig"):
             "out_proj",
             "attn_dropout",
             "resid_dropout",
-            "bias",
             "scale_attn",
-            "masked_bias",
         ]

         # Attribute only for transformers>=4.28
         if hasattr(layer, "embed_positions"):
             submodules.append("embed_positions")

+        # Attribute only for transformers<4.45
+        if hasattr(layer, "bias"):
+            submodules.append("bias")
+        if hasattr(layer, "masked_bias"):
+            submodules.append("masked_bias")
+
         for attr in submodules:
             setattr(self, attr, getattr(layer, attr))

From e206d44e7d9a09699c735f8aa48c5daa250d81f9 Mon Sep 17 00:00:00 2001
From: Ella Charlaix
Date: Fri, 27 Sep 2024 14:54:36 +0200
Subject: [PATCH 16/22] fix bettertransformers

---
 optimum/bettertransformer/models/attention.py      | 66 +++++++++++++++++++
 optimum/bettertransformer/models/decoder_models.py | 20 +++++-
 2 files changed, 85 insertions(+), 1 deletion(-)

diff --git a/optimum/bettertransformer/models/attention.py b/optimum/bettertransformer/models/attention.py
index 63f0275464..053f283f18 100644
--- a/optimum/bettertransformer/models/attention.py
+++ b/optimum/bettertransformer/models/attention.py
@@ -91,6 +91,72 @@ def gpt2_wrapped_scaled_dot_product(
     return sdpa_result, None

+# Adapted from transformers.models.gptj.modeling_gptj.GPTJAttention._attn
+def gptj_wrapped_scaled_dot_product(
+    self,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    attention_mask: Optional[torch.Tensor] = None,
+    head_mask: Optional[torch.Tensor] = None,
+):
+    raise_on_head_mask(head_mask)
+    batch_size = query.shape[0]
+
+    mask_value = torch.finfo(value.dtype).min
+    mask_value = torch.full([], mask_value, dtype=value.dtype)
+
+    # in gpt-neo-x and gpt-j the query and keys are always in fp32
+    # thus we need to cast them to the value dtype
+    if self.downcast_qk:
+        query = query.to(value.dtype)
+        key = key.to(value.dtype)
+
+    if batch_size == 1 and attention_mask is not None and attention_mask[0, 0, -1, -1] < -1:
+        raise ValueError("BetterTransformer does not support padding='max_length' with a batch size of 1.")
+
+    dropout_p = self.dropout_prob_attn if self.training else 0.0
+    if batch_size == 1 or self.training:
+        if query.shape[2] > 1:
+            sdpa_result = torch.nn.functional.scaled_dot_product_attention(
+                query, key, value, attn_mask=None, dropout_p=dropout_p, is_causal=True
+            )
+        else:
+            sdpa_result = torch.nn.functional.scaled_dot_product_attention(
+                query, key, value, attn_mask=None, dropout_p=dropout_p, is_causal=False
+            )
+    else:
+        query_length, key_length = query.size(-2), key.size(-2)
+
+        # causal_mask is always [True, ..., True] otherwise, so executing this
+        # is unnecessary
+        if query_length > 1:
+
+            if not check_if_transformers_greater("4.44.99"):
+
+                causal_mask = self.bias[:, :, key_length - query_length : key_length, :key_length].to(torch.bool)
+
+                causal_mask = torch.where(causal_mask, 0, mask_value)
+
+                # torch.Tensor.expand does no memory copy
+                causal_mask = causal_mask.expand(batch_size, -1, -1, -1)
+                if attention_mask is not None:
+                    attention_mask = causal_mask + attention_mask
+
+            else:
+                attention_mask = attention_mask[:, :, :, : key.shape[-2]]
+
+        sdpa_result = torch.nn.functional.scaled_dot_product_attention(
+            query, key, value, attn_mask=attention_mask, dropout_p=dropout_p, is_causal=False
+        )
+
+        # in gpt-neo-x and gpt-j the query and keys are always in fp32
+        # thus we need to cast them to the value dtype
+        if self.downcast_qk:
+            sdpa_result = sdpa_result.to(value.dtype)
+
+    return sdpa_result, None
+

 # Adapted from transformers.models.bark.modeling_bark.BarkSelfAttention._attn
 def bark_wrapped_scaled_dot_product(
diff --git a/optimum/bettertransformer/models/decoder_models.py b/optimum/bettertransformer/models/decoder_models.py
index 8393c92b8b..11616162f3 100644
--- a/optimum/bettertransformer/models/decoder_models.py
+++ b/optimum/bettertransformer/models/decoder_models.py
@@ -43,6 +43,7 @@
     bloom_forward,
     codegen_wrapped_scaled_dot_product,
     gpt2_wrapped_scaled_dot_product,
+    gptj_wrapped_scaled_dot_product,
     gpt_neo_wrapped_scaled_dot_product,
     opt_forward,
     t5_forward,
 )
@@ -82,7 +83,7 @@ def forward(self, *args, **kwargs):

 class GPTJAttentionLayerBetterTransformer(BetterTransformerBaseLayer, GPTJAttention, nn.Module):
-    _attn = gpt2_wrapped_scaled_dot_product
+    _attn = gptj_wrapped_scaled_dot_product

     def __init__(self, layer: "nn.Module", config: "PretrainedConfig"):
         super().__init__(config)
@@ -108,6 +109,9 @@ def __init__(self, layer: "nn.Module", config: "PretrainedConfig"):
         if hasattr(layer, "masked_bias"):
             submodules.append("masked_bias")

+        # Attribute only for transformers>=4.45
+        if hasattr(layer, "layer_idx"):
+            submodules.append("layer_idx")

         for attr in submodules:
             setattr(self, attr, getattr(layer, attr))
@@ -132,6 +136,11 @@ def __init__(self, layer: "nn.Module", config: "PretrainedConfig"):
         self.module_mapping = None
         submodules = ["rotary_emb", "query_key_value", "dense", "bias", "masked_bias", "norm_factor"]
+
+        # Attribute only for transformers>=4.45
+        if hasattr(layer, "layer_idx"):
+            submodules.append("layer_idx")
+
         for attr in submodules:
             setattr(self, attr, getattr(layer, attr))
@@ -160,6 +169,11 @@ def __init__(self, layer: "nn.Module", config: "PretrainedConfig"):
         self.module_mapping = None
         submodules = ["attn_dropout", "resid_dropout", "k_proj", "v_proj", "q_proj", "out_proj", "bias", "masked_bias"]
+
+        # Attribute only for transformers>=4.45
+        if hasattr(layer, "layer_idx"):
+            submodules.append("layer_idx")
+
         for attr in submodules:
             setattr(self, attr, getattr(layer, attr))
@@ -253,6 +267,10 @@ def __init__(self, layer: "nn.Module", config: "PretrainedConfig"):
         if hasattr(layer, "causal_mask"):
             submodules.append("causal_mask")

+        # Attribute only for transformers>=4.45
+        if hasattr(layer, "layer_idx"):
+            submodules.append("layer_idx")
+
         for attr in submodules:
             setattr(self, attr, getattr(layer, attr))

From 3572a0b4d6f96019b82101f23ba2f80b19791500 Mon Sep 17 00:00:00 2001
From: Ella Charlaix
Date: Fri, 27 Sep 2024 14:59:39 +0200
Subject: [PATCH 17/22] format

---
 optimum/bettertransformer/models/attention.py      | 3 +--
 optimum/bettertransformer/models/decoder_models.py | 6 +++---
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/optimum/bettertransformer/models/attention.py b/optimum/bettertransformer/models/attention.py
index 053f283f18..22b8faf1c2 100644
--- a/optimum/bettertransformer/models/attention.py
+++ b/optimum/bettertransformer/models/attention.py
@@ -91,6 +91,7 @@ def gpt2_wrapped_scaled_dot_product(
     return sdpa_result, None

+
 # Adapted from transformers.models.gptj.modeling_gptj.GPTJAttention._attn
 def gptj_wrapped_scaled_dot_product(
     self,
@@ -131,9 +132,7 @@ def gptj_wrapped_scaled_dot_product(
         # causal_mask is always [True, ..., True] otherwise, so executing this
         # is unnecessary
         if query_length > 1:
-
             if not check_if_transformers_greater("4.44.99"):
-
                 causal_mask = self.bias[:, :, key_length - query_length : key_length, :key_length].to(torch.bool)

                 causal_mask = torch.where(causal_mask, 0, mask_value)
diff --git a/optimum/bettertransformer/models/decoder_models.py b/optimum/bettertransformer/models/decoder_models.py
index 11616162f3..1fb7fe6fbf 100644
--- a/optimum/bettertransformer/models/decoder_models.py
+++ b/optimum/bettertransformer/models/decoder_models.py
@@ -43,8 +43,8 @@
     bloom_forward,
     codegen_wrapped_scaled_dot_product,
     gpt2_wrapped_scaled_dot_product,
-    gptj_wrapped_scaled_dot_product,
     gpt_neo_wrapped_scaled_dot_product,
+    gptj_wrapped_scaled_dot_product,
     opt_forward,
     t5_forward,
 )
@@ -169,7 +169,7 @@ def __init__(self, layer: "nn.Module", config: "PretrainedConfig"):
         self.module_mapping = None
         submodules = ["attn_dropout", "resid_dropout", "k_proj", "v_proj", "q_proj", "out_proj", "bias", "masked_bias"]
-
+
         # Attribute only for transformers>=4.45
         if hasattr(layer, "layer_idx"):
             submodules.append("layer_idx")
@@ -270,7 +270,7 @@ def __init__(self, layer: "nn.Module", config: "PretrainedConfig"):
         # Attribute only for transformers>=4.45
         if hasattr(layer, "layer_idx"):
             submodules.append("layer_idx")
-
+
         for attr in submodules:
             setattr(self, attr, getattr(layer, attr))

From d6e97cf224e2f65ea4c7d02c85934c381ee7b51c Mon Sep 17 00:00:00 2001
From: Ella Charlaix
Date: Fri, 27 Sep 2024 15:02:58 +0200
Subject: [PATCH 18/22] add warnign deprecation bettertransformer

---
 optimum/bettertransformer/transformation.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/optimum/bettertransformer/transformation.py b/optimum/bettertransformer/transformation.py
index 2105e19987..a101757b6f 100644
--- a/optimum/bettertransformer/transformation.py
+++ b/optimum/bettertransformer/transformation.py
@@ -206,6 +206,10 @@ def transform(
                 The converted model if the conversion has been successful.
         """

+        logger.warning(
+            "The class `optimum.bettertransformers.transformation.BetterTransformer` is deprecated and will be removed in a future release."
+        )
+
         hf_config = model.config
         if hf_config.model_type in ["falcon", "gpt_bigcode", "llama", "whisper"]:
             raise ValueError(

From 0e2ed872a457bbb6d32af28e5636f3b35a6d8044 Mon Sep 17 00:00:00 2001
From: Ella Charlaix
Date: Fri, 27 Sep 2024 15:49:36 +0200
Subject: [PATCH 19/22] bettertransformers fixes

---
 optimum/bettertransformer/models/decoder_models.py | 4 ++--
 tests/bettertransformer/testing_utils.py           | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/optimum/bettertransformer/models/decoder_models.py b/optimum/bettertransformer/models/decoder_models.py
index 1fb7fe6fbf..52d28d076d 100644
--- a/optimum/bettertransformer/models/decoder_models.py
+++ b/optimum/bettertransformer/models/decoder_models.py
@@ -171,8 +171,8 @@ def __init__(self, layer: "nn.Module", config: "PretrainedConfig"):
         submodules = ["attn_dropout", "resid_dropout", "k_proj", "v_proj", "q_proj", "out_proj", "bias", "masked_bias"]

         # Attribute only for transformers>=4.45
-        if hasattr(layer, "layer_idx"):
-            submodules.append("layer_idx")
+        if hasattr(layer, "layer_id"):
+            submodules.append("layer_id")

         for attr in submodules:
             setattr(self, attr, getattr(layer, attr))
diff --git a/tests/bettertransformer/testing_utils.py b/tests/bettertransformer/testing_utils.py
index e9e2edd979..098882180a 100644
--- a/tests/bettertransformer/testing_utils.py
+++ b/tests/bettertransformer/testing_utils.py
@@ -59,12 +59,12 @@
     # "llama": "fxmarty/tiny-llama-fast-tokenizer",
     # "llama-gqa": "noamwies/llama-test-gqa-with-better-transformer",
     "m2m_100": "hf-internal-testing/tiny-random-nllb",
-    "marian": "fxmarty/tiny-marian",  # the other tiny ones have a too small max_position_embeddings
+    "marian": "optimum-internal-testing/tiny-random-marian",  # the other tiny ones have a too small max_position_embeddings
     "markuplm": "hf-internal-testing/tiny-random-MarkupLMModel",
     "mbart": "hf-internal-testing/tiny-random-mbart",
     "opt": "hf-internal-testing/tiny-random-OPTModel",
     "pegasus": "hf-internal-testing/tiny-random-PegasusModel",
-    "prophetnet": "hirotasoshu/tiny-random-prophetnet",  # the other tiny ones have a too small max_position_embeddings
+    "prophetnet": "optimum-internal-testing/tiny-random-prophetnet",  # the other tiny ones have a too small max_position_embeddings
     "rembert": "hf-internal-testing/tiny-random-RemBertModel",
     "roberta": "hf-internal-testing/tiny-random-RobertaModel",
     "rocbert": "hf-internal-testing/tiny-random-RoCBertModel",

From bc28f035b78a1fd2aa8f4e25022def6a81347d7b Mon Sep 17 00:00:00 2001
From: Ella Charlaix
Date: Mon, 30 Sep 2024 14:19:11 +0200
Subject: [PATCH 20/22] disable transformers 4.45 for onnx export

---
 optimum/exporters/onnx/convert.py | 5 +++++
 setup.py                          | 7 +++++--
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/optimum/exporters/onnx/convert.py b/optimum/exporters/onnx/convert.py
index 7142d5be4a..f2bf95f3e3 100644
--- a/optimum/exporters/onnx/convert.py
+++ b/optimum/exporters/onnx/convert.py
@@ -26,6 +26,7 @@

 import numpy as np
 import onnx
+import transformers
 from transformers.modeling_utils import get_parameter_dtype
 from transformers.utils import is_tf_available, is_torch_available

@@ -1000,6 +1001,10 @@ def onnx_export_from_model(
     >>> onnx_export_from_model(model, output="gpt2_onnx/")
     ```
     """
+    if check_if_transformers_greater("4.44.99"):
+        raise ImportError(
+            f"ONNX conversion disabled for now for transformers version greater than v4.45, found {transformers.__version__}"
+        )

     TasksManager.standardize_model_attributes(model)
diff --git a/setup.py b/setup.py
index 231dc9110e..24c1ae1cd4 100644
--- a/setup.py
+++ b/setup.py
@@ -54,6 +54,7 @@
         "datasets>=1.2.1",
         "evaluate",
         "protobuf>=3.20.1",
+        "transformers<4.45.0",
     ],
     "onnxruntime-gpu": [
         "onnx",
@@ -62,9 +63,10 @@
         "evaluate",
         "protobuf>=3.20.1",
         "accelerate",  # ORTTrainer requires it.
+        "transformers<4.45.0",
     ],
-    "exporters": ["onnx", "onnxruntime", "timm"],
-    "exporters-gpu": ["onnx", "onnxruntime-gpu", "timm"],
+    "exporters": ["onnx", "onnxruntime", "timm", "transformers<4.45.0"],
+    "exporters-gpu": ["onnx", "onnxruntime-gpu", "timm", "transformers<4.45.0"],
     "exporters-tf": [
         "tensorflow>=2.4,<=2.12.1",
         "tf2onnx",
@@ -75,6 +77,7 @@
         "numpy<1.24.0",
         "datasets<=2.16",
         "transformers[sentencepiece]>=4.26,<4.38",
+        "transformers<4.45.0",
     ],
     "diffusers": ["diffusers"],
     "intel": "optimum-intel>=1.18.0",

From e7d3ba42cdc1b97a33dc0843417f9224442c6b45 Mon Sep 17 00:00:00 2001
From: Ella Charlaix
Date: Mon, 30 Sep 2024 15:28:18 +0200
Subject: [PATCH 21/22] update model ID

---
 tests/onnxruntime/utils_onnxruntime_tests.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/onnxruntime/utils_onnxruntime_tests.py b/tests/onnxruntime/utils_onnxruntime_tests.py
index 804aa00f04..3a6af30df1 100644
--- a/tests/onnxruntime/utils_onnxruntime_tests.py
+++ b/tests/onnxruntime/utils_onnxruntime_tests.py
@@ -112,7 +112,7 @@
     "layoutlm": "hf-internal-testing/tiny-random-LayoutLMModel",
     "layoutlmv3": "hf-internal-testing/tiny-random-LayoutLMv3Model",
     "longt5": "hf-internal-testing/tiny-random-LongT5Model",
-    "llama": "fxmarty/tiny-llama-fast-tokenizer",
+    "llama": "optimum-internal-testing/tiny-random-llama",
     "m2m_100": "hf-internal-testing/tiny-random-m2m_100",
     "marian": "echarlaix/tiny-random-marian",
     "mbart": "hf-internal-testing/tiny-random-mbart",

From e146328029bbdc7320a7dbccf95c2a93ddf5dd89 Mon Sep 17 00:00:00 2001
From: Ella Charlaix
Date: Mon, 30 Sep 2024 15:57:05 +0200
Subject: [PATCH 22/22] udpate model id

---
 tests/onnxruntime/test_modeling.py           | 2 +-
 tests/onnxruntime/utils_onnxruntime_tests.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/onnxruntime/test_modeling.py b/tests/onnxruntime/test_modeling.py
index a19dbcd1a8..4b44acb38a 100644
--- a/tests/onnxruntime/test_modeling.py
+++ b/tests/onnxruntime/test_modeling.py
@@ -3147,7 +3147,7 @@ class ORTModelForAudioClassificationIntegrationTest(ORTModelTestMixin):
         "wavlm",
         "wav2vec2",
         "wav2vec2-conformer",
-        # "whisper",  saving of the model's config is broken in transformers v4.45.0
+        "whisper",
     ]

     FULL_GRID = {"model_arch": SUPPORTED_ARCHITECTURES}
diff --git a/tests/onnxruntime/utils_onnxruntime_tests.py b/tests/onnxruntime/utils_onnxruntime_tests.py
index 3a6af30df1..a51249781c 100644
--- a/tests/onnxruntime/utils_onnxruntime_tests.py
+++ b/tests/onnxruntime/utils_onnxruntime_tests.py
@@ -151,7 +151,7 @@
     "unispeech_sat": "hf-internal-testing/tiny-random-UnispeechSatModel",
     "vision-encoder-decoder": "hf-internal-testing/tiny-random-VisionEncoderDecoderModel-vit-gpt2",
     "vit": "hf-internal-testing/tiny-random-vit",
-    "whisper": "openai/whisper-tiny.en",  # hf-internal-testing ones are broken
+    "whisper": "optimum-internal-testing/tiny-random-whisper",
     "wav2vec2": "hf-internal-testing/tiny-random-Wav2Vec2Model",
     "wav2vec2-conformer": "hf-internal-testing/tiny-random-wav2vec2-conformer",
     "wavlm": "hf-internal-testing/tiny-random-WavlmModel",
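A recurring pattern across this series is gating on the installed transformers version, either explicitly through check_if_transformers_greater or implicitly by probing for attributes that only some versions define (causal_mask, bias, masked_bias, layer_idx). The sketch below illustrates both forms; the layer object and the submodule names are placeholders, not optimum classes.

    from optimum.utils import check_if_transformers_greater

    def collect_submodules(layer):
        submodules = ["q_proj", "k_proj", "v_proj", "out_proj"]  # assumed to always exist here
        # attributes that only exist on some transformers versions are probed with hasattr
        for optional in ("causal_mask", "bias", "masked_bias", "layer_idx"):
            if hasattr(layer, optional):
                submodules.append(optional)
        return submodules

    if check_if_transformers_greater("4.44.99"):
        ...  # transformers >= 4.45 code path
    else:
        ...  # legacy code path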