From 454bc14d90ddf8eca69c6721a2dbd0af834804d0 Mon Sep 17 00:00:00 2001 From: Sai-Suraj-27 Date: Mon, 15 Jul 2024 14:35:08 +0530 Subject: [PATCH 1/7] fix: Removed a wrong key-word argument in `sigmoid_focal_loss()` function call (#31951) Removed a wrong key-word argument in sigmoid_focal_loss() function call. --- src/transformers/models/rt_detr/modeling_rt_detr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/rt_detr/modeling_rt_detr.py b/src/transformers/models/rt_detr/modeling_rt_detr.py index 850b8dc2f627b3..e61521d8880077 100644 --- a/src/transformers/models/rt_detr/modeling_rt_detr.py +++ b/src/transformers/models/rt_detr/modeling_rt_detr.py @@ -2163,7 +2163,7 @@ def loss_labels_focal(self, outputs, targets, indices, num_boxes, log=True): target_classes[idx] = target_classes_original target = F.one_hot(target_classes, num_classes=self.num_classes + 1)[..., :-1] - loss = sigmoid_focal_loss(src_logits, target, self.alpha, self.gamma, reduction="none") + loss = sigmoid_focal_loss(src_logits, target, self.alpha, self.gamma) loss = loss.mean(1).sum() * src_logits.shape[1] / num_boxes return {"loss_focal": loss} From 907500423d240cd660944960dd32a6d56d4ca693 Mon Sep 17 00:00:00 2001 From: Joao Gante Date: Mon, 15 Jul 2024 11:07:53 +0100 Subject: [PATCH 2/7] Generate: handle `logits_warper` update in models with custom generate fn (#31957) handle logits_warper update in models with custom generate fn --- src/transformers/generation/utils.py | 6 +- .../models/musicgen/modeling_musicgen.py | 80 ++++--------------- .../modeling_musicgen_melody.py | 80 ++++--------------- src/transformers/models/rag/modeling_rag.py | 2 + 4 files changed, 39 insertions(+), 129 deletions(-) diff --git a/src/transformers/generation/utils.py b/src/transformers/generation/utils.py index 970475c98879a8..9ce16f7a395e0b 100644 --- a/src/transformers/generation/utils.py +++ b/src/transformers/generation/utils.py @@ -2219,7 +2219,7 @@ def _dola_decoding( generation_config: GenerationConfig, synced_gpus: bool, streamer: "BaseStreamer", - logits_warper: LogitsProcessorList, + logits_warper: Optional[LogitsProcessorList], **model_kwargs, ) -> Union[GenerateNonBeamOutput, torch.LongTensor]: r""" @@ -2826,7 +2826,7 @@ def _sample( generation_config: GenerationConfig, synced_gpus: bool, streamer: Optional["BaseStreamer"], - logits_warper: LogitsProcessorList, + logits_warper: Optional[LogitsProcessorList], **model_kwargs, ) -> Union[GenerateNonBeamOutput, torch.LongTensor]: r""" @@ -3033,7 +3033,7 @@ def _beam_search( stopping_criteria: StoppingCriteriaList, generation_config: GenerationConfig, synced_gpus: bool, - logits_warper: LogitsProcessorList, + logits_warper: Optional[LogitsProcessorList], **model_kwargs, ) -> Union[GenerateBeamOutput, torch.LongTensor]: r""" diff --git a/src/transformers/models/musicgen/modeling_musicgen.py b/src/transformers/models/musicgen/modeling_musicgen.py index 5101fef3df4e4e..0102d1c267c7ad 100644 --- a/src/transformers/models/musicgen/modeling_musicgen.py +++ b/src/transformers/models/musicgen/modeling_musicgen.py @@ -26,7 +26,7 @@ from torch.nn import CrossEntropyLoss from ...activations import ACT2FN -from ...generation.configuration_utils import GenerationConfig +from ...generation.configuration_utils import GenerationConfig, GenerationMode from ...generation.logits_process import ClassifierFreeGuidanceLogitsProcessor, LogitsProcessorList from ...generation.stopping_criteria import StoppingCriteriaList from ...modeling_attn_mask_utils import ( @@ 
-1618,16 +1618,7 @@ def generate( model_kwargs["delay_pattern_mask"] = delay_pattern_mask # 7. determine generation mode - is_greedy_gen_mode = ( - (generation_config.num_beams == 1) - and (generation_config.num_beam_groups == 1) - and generation_config.do_sample is False - ) - is_sample_gen_mode = ( - (generation_config.num_beams == 1) - and (generation_config.num_beam_groups == 1) - and generation_config.do_sample is True - ) + generation_mode = generation_config.get_generation_mode() # 8. prepare batched CFG externally (to enable coexistance with the unbatched CFG) if generation_config.guidance_scale is not None and generation_config.guidance_scale > 1: @@ -1649,27 +1640,13 @@ def generate( generation_config=generation_config, stopping_criteria=stopping_criteria ) - if is_greedy_gen_mode: - if generation_config.num_return_sequences > 1: - raise ValueError( - "num_return_sequences has to be 1 when doing greedy search, " - f"but is {generation_config.num_return_sequences}." - ) - - # 11. run greedy search - outputs = self._sample( - input_ids, - logits_processor=logits_processor, - stopping_criteria=stopping_criteria, - generation_config=generation_config, - synced_gpus=synced_gpus, - streamer=streamer, - **model_kwargs, - ) - - elif is_sample_gen_mode: + if generation_mode in (GenerationMode.SAMPLE, GenerationMode.GREEDY_SEARCH): # 11. prepare logits warper - logits_warper = self._get_logits_warper(generation_config, device=input_ids.device) + prepared_logits_warper = ( + self._get_logits_warper(generation_config, device=input_ids.device) + if generation_config.do_sample + else None + ) # expand input_ids with `num_return_sequences` additional sequences per batch input_ids, model_kwargs = self._expand_inputs_for_generation( @@ -1682,7 +1659,7 @@ def generate( outputs = self._sample( input_ids, logits_processor=logits_processor, - logits_warper=logits_warper, + logits_warper=prepared_logits_warper, stopping_criteria=stopping_criteria, generation_config=generation_config, synced_gpus=synced_gpus, @@ -2714,16 +2691,7 @@ def generate( streamer.put(input_ids.cpu()) # 7. determine generation mode - is_greedy_gen_mode = ( - (generation_config.num_beams == 1) - and (generation_config.num_beam_groups == 1) - and generation_config.do_sample is False - ) - is_sample_gen_mode = ( - (generation_config.num_beams == 1) - and (generation_config.num_beam_groups == 1) - and generation_config.do_sample is True - ) + generation_mode = generation_config.get_generation_mode() # 8. prepare batched CFG externally (to enable coexistance with the unbatched CFG) if generation_config.guidance_scale is not None and generation_config.guidance_scale > 1: @@ -2745,27 +2713,13 @@ def generate( generation_config=generation_config, stopping_criteria=stopping_criteria ) - if is_greedy_gen_mode: - if generation_config.num_return_sequences > 1: - raise ValueError( - "num_return_sequences has to be 1 when doing greedy search, " - f"but is {generation_config.num_return_sequences}." - ) - - # 11. run greedy search - outputs = self._sample( - input_ids, - logits_processor=logits_processor, - stopping_criteria=stopping_criteria, - generation_config=generation_config, - synced_gpus=synced_gpus, - streamer=streamer, - **model_kwargs, - ) - - elif is_sample_gen_mode: + if generation_mode in (GenerationMode.SAMPLE, GenerationMode.GREEDY_SEARCH): # 11. 
prepare logits warper - logits_warper = self._get_logits_warper(generation_config, device=input_ids.device) + prepared_logits_warper = ( + self._get_logits_warper(generation_config, device=input_ids.device) + if generation_config.do_sample + else None + ) # expand input_ids with `num_return_sequences` additional sequences per batch input_ids, model_kwargs = self._expand_inputs_for_generation( @@ -2779,7 +2733,7 @@ def generate( outputs = self._sample( input_ids, logits_processor=logits_processor, - logits_warper=logits_warper, + logits_warper=prepared_logits_warper, stopping_criteria=stopping_criteria, generation_config=generation_config, synced_gpus=synced_gpus, diff --git a/src/transformers/models/musicgen_melody/modeling_musicgen_melody.py b/src/transformers/models/musicgen_melody/modeling_musicgen_melody.py index 9a120dc3294f4c..3140b9f286448f 100644 --- a/src/transformers/models/musicgen_melody/modeling_musicgen_melody.py +++ b/src/transformers/models/musicgen_melody/modeling_musicgen_melody.py @@ -26,7 +26,7 @@ from torch.nn import CrossEntropyLoss from ...activations import ACT2FN -from ...generation.configuration_utils import GenerationConfig +from ...generation.configuration_utils import GenerationConfig, GenerationMode from ...generation.logits_process import ClassifierFreeGuidanceLogitsProcessor, LogitsProcessorList from ...generation.stopping_criteria import StoppingCriteriaList from ...modeling_attn_mask_utils import _prepare_4d_causal_attention_mask, _prepare_4d_causal_attention_mask_for_sdpa @@ -1539,16 +1539,7 @@ def generate( model_kwargs["delay_pattern_mask"] = delay_pattern_mask # 7. determine generation mode - is_greedy_gen_mode = ( - (generation_config.num_beams == 1) - and (generation_config.num_beam_groups == 1) - and generation_config.do_sample is False - ) - is_sample_gen_mode = ( - (generation_config.num_beams == 1) - and (generation_config.num_beam_groups == 1) - and generation_config.do_sample is True - ) + generation_mode = generation_config.get_generation_mode() # 8. prepare batched CFG externally (to enable coexistance with the unbatched CFG) if generation_config.guidance_scale is not None and generation_config.guidance_scale > 1: @@ -1570,27 +1561,13 @@ def generate( generation_config=generation_config, stopping_criteria=stopping_criteria ) - if is_greedy_gen_mode: - if generation_config.num_return_sequences > 1: - raise ValueError( - "num_return_sequences has to be 1 when doing greedy search, " - f"but is {generation_config.num_return_sequences}." - ) - - # 11. run greedy search - outputs = self._sample( - input_ids, - logits_processor=logits_processor, - stopping_criteria=stopping_criteria, - generation_config=generation_config, - synced_gpus=synced_gpus, - streamer=streamer, - **model_kwargs, - ) - - elif is_sample_gen_mode: + if generation_mode in (GenerationMode.SAMPLE, GenerationMode.GREEDY_SEARCH): # 11. 
prepare logits warper - logits_warper = self._get_logits_warper(generation_config, device=input_ids.device) + prepared_logits_warper = ( + self._get_logits_warper(generation_config, device=input_ids.device) + if generation_config.do_sample + else None + ) # expand input_ids with `num_return_sequences` additional sequences per batch input_ids, model_kwargs = self._expand_inputs_for_generation( @@ -1603,7 +1580,7 @@ def generate( outputs = self._sample( input_ids, logits_processor=logits_processor, - logits_warper=logits_warper, + logits_warper=prepared_logits_warper, stopping_criteria=stopping_criteria, generation_config=generation_config, synced_gpus=synced_gpus, @@ -2557,16 +2534,7 @@ def generate( streamer.put(input_ids.cpu()) # 7. determine generation mode - is_greedy_gen_mode = ( - (generation_config.num_beams == 1) - and (generation_config.num_beam_groups == 1) - and generation_config.do_sample is False - ) - is_sample_gen_mode = ( - (generation_config.num_beams == 1) - and (generation_config.num_beam_groups == 1) - and generation_config.do_sample is True - ) + generation_mode = generation_config.get_generation_mode() # 8. prepare batched CFG externally (to enable coexistance with the unbatched CFG) if generation_config.guidance_scale is not None and generation_config.guidance_scale > 1: @@ -2588,27 +2556,13 @@ def generate( generation_config=generation_config, stopping_criteria=stopping_criteria ) - if is_greedy_gen_mode: - if generation_config.num_return_sequences > 1: - raise ValueError( - "num_return_sequences has to be 1 when doing greedy search, " - f"but is {generation_config.num_return_sequences}." - ) - - # 11. run greedy search - outputs = self._sample( - input_ids, - logits_processor=logits_processor, - stopping_criteria=stopping_criteria, - generation_config=generation_config, - synced_gpus=synced_gpus, - streamer=streamer, - **model_kwargs, - ) - - elif is_sample_gen_mode: + if generation_mode in (GenerationMode.SAMPLE, GenerationMode.GREEDY_SEARCH): # 11. 
prepare logits warper - logits_warper = self._get_logits_warper(generation_config, device=input_ids.device) + prepared_logits_warper = ( + self._get_logits_warper(generation_config, device=input_ids.device) + if generation_config.do_sample + else None + ) # expand input_ids with `num_return_sequences` additional sequences per batch input_ids, model_kwargs = self._expand_inputs_for_generation( @@ -2622,7 +2576,7 @@ def generate( outputs = self._sample( input_ids, logits_processor=logits_processor, - logits_warper=logits_warper, + logits_warper=prepared_logits_warper, stopping_criteria=stopping_criteria, generation_config=generation_config, synced_gpus=synced_gpus, diff --git a/src/transformers/models/rag/modeling_rag.py b/src/transformers/models/rag/modeling_rag.py index 4f6c8dc384266c..5b170bde8a3343 100644 --- a/src/transformers/models/rag/modeling_rag.py +++ b/src/transformers/models/rag/modeling_rag.py @@ -1558,6 +1558,7 @@ def extend_enc_output(tensor, num_beams=None): generation_config=generation_config, synced_gpus=False, streamer=None, + logits_warper=None, **model_kwargs, ) elif generation_config.num_beams > 1: @@ -1579,6 +1580,7 @@ def extend_enc_output(tensor, num_beams=None): stopping_criteria=prepared_stopping_criteria, generation_config=generation_config, synced_gpus=False, + logits_warper=None, **model_kwargs, ) else: From 556a4205f00594a852bdda237211a24cb09715a6 Mon Sep 17 00:00:00 2001 From: Sai-Suraj-27 Date: Mon, 15 Jul 2024 20:26:17 +0530 Subject: [PATCH 3/7] fix: Fixed the arguments in `create_repo()` function call (#31947) * Fixed the arguments in create_repo() function call. * Formatted the code properly using ruff. * Formatted the code more clearly. --- src/transformers/commands/user.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/commands/user.py b/src/transformers/commands/user.py index 938f4c8ea8b616..bf4072ce04689b 100644 --- a/src/transformers/commands/user.py +++ b/src/transformers/commands/user.py @@ -185,7 +185,7 @@ def run(self): print("Abort") exit() try: - url = create_repo(token, name=self.args.name, organization=self.args.organization) + url = create_repo(repo_id=full_name, token=token) except HTTPError as e: print(e) print(ANSI.red(e.response.text)) From 11efb4fc093683b3a057a213463ccc7beb544b56 Mon Sep 17 00:00:00 2001 From: Yih-Dar <2521628+ydshieh@users.noreply.github.com> Date: Mon, 15 Jul 2024 17:16:36 +0200 Subject: [PATCH 4/7] Notify new docker images built for circleci (#31701) * hello * hello * hello * hello * hello * hello * hello * notify * trigger * use new channel --------- Co-authored-by: ydshieh --- .github/workflows/build-ci-docker-images.yml | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/.github/workflows/build-ci-docker-images.yml b/.github/workflows/build-ci-docker-images.yml index 6f29df82769d82..a07b99af65d0b6 100644 --- a/.github/workflows/build-ci-docker-images.yml +++ b/.github/workflows/build-ci-docker-images.yml @@ -27,10 +27,10 @@ jobs: strategy: matrix: file: ["quality", "consistency", "custom-tokenizers", "torch-light", "tf-light", "exotic-models", "torch-tf-light", "torch-jax-light", "jax-light", "examples-torch", "examples-tf"] - continue-on-error: true + continue-on-error: true steps: - - + - name: Set tag run: | if ${{contains(github.event.head_commit.message, '[build-ci-image]')}}; then @@ -61,4 +61,17 @@ jobs: REF=${{ github.sha }} file: "./docker/${{ matrix.file }}.dockerfile" push: ${{ contains(github.event.head_commit.message, 'ci-image]') 
|| github.event_name == 'schedule' }} - tags: ${{ env.TAG }} \ No newline at end of file + tags: ${{ env.TAG }} + + notify: + runs-on: ubuntu-22.04 + if: ${{ contains(github.event.head_commit.message, '[build-ci-image]') || contains(github.event.head_commit.message, '[push-ci-image]') && '!cancelled()' || github.event_name == 'schedule' }} + steps: + - name: Post to Slack + if: ${{ contains(github.event.head_commit.message, '[push-ci-image]') && github.event_name != 'schedule' }} + uses: huggingface/hf-workflows/.github/actions/post-slack@main + with: + slack_channel: "#transformers-ci-circleci-images" + title: 🤗 New docker images for CircleCI are pushed. + status: ${{ job.status }} + slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }} \ No newline at end of file From a1a34657d41627b21dddf2bf9cc55941329a60b6 Mon Sep 17 00:00:00 2001 From: Yih-Dar <2521628+ydshieh@users.noreply.github.com> Date: Mon, 15 Jul 2024 17:56:24 +0200 Subject: [PATCH 5/7] Avoid race condition (#31973) * [test_all] hub * remove delete * remove delete * remove delete * remove delete * remove delete * remove delete * [test_all] * [test_all] * [test_all] * [test_all] * [test_all] * [test_all] --------- Co-authored-by: ydshieh --- tests/generation/test_configuration_utils.py | 14 ++++++++++---- tests/utils/test_configuration_utils.py | 14 ++++++++++---- tests/utils/test_feature_extraction_utils.py | 14 ++++++++++---- tests/utils/test_image_processing_utils.py | 14 ++++++++++---- tests/utils/test_modeling_flax_utils.py | 14 ++++++++++---- tests/utils/test_modeling_tf_utils.py | 14 ++++++++++---- tests/utils/test_modeling_utils.py | 14 ++++++++++---- tests/utils/test_tokenization_utils.py | 14 ++++++++++---- 8 files changed, 80 insertions(+), 32 deletions(-) diff --git a/tests/generation/test_configuration_utils.py b/tests/generation/test_configuration_utils.py index ece3f33a06070c..26b8d092fdcd2f 100644 --- a/tests/generation/test_configuration_utils.py +++ b/tests/generation/test_configuration_utils.py @@ -253,8 +253,11 @@ def test_push_to_hub(self): if k != "transformers_version": self.assertEqual(v, getattr(new_config, k)) - # Reset repo - delete_repo(token=self._token, repo_id="test-generation-config") + try: + # Reset repo + delete_repo(token=self._token, repo_id="test-generation-config") + except: # noqa E722 + pass # Push to hub via save_pretrained with tempfile.TemporaryDirectory() as tmp_dir: @@ -278,8 +281,11 @@ def test_push_to_hub_in_organization(self): if k != "transformers_version": self.assertEqual(v, getattr(new_config, k)) - # Reset repo - delete_repo(token=self._token, repo_id="valid_org/test-generation-config-org") + try: + # Reset repo + delete_repo(token=self._token, repo_id="valid_org/test-generation-config-org") + except: # noqa E722 + pass # Push to hub via save_pretrained with tempfile.TemporaryDirectory() as tmp_dir: diff --git a/tests/utils/test_configuration_utils.py b/tests/utils/test_configuration_utils.py index 6809b3a2ce5f0c..15adb213079e50 100644 --- a/tests/utils/test_configuration_utils.py +++ b/tests/utils/test_configuration_utils.py @@ -126,8 +126,11 @@ def test_push_to_hub(self): if k != "transformers_version": self.assertEqual(v, getattr(new_config, k)) - # Reset repo - delete_repo(token=self._token, repo_id="test-config") + try: + # Reset repo + delete_repo(token=self._token, repo_id="test-config") + except: # noqa E722 + pass # Push to hub via save_pretrained with tempfile.TemporaryDirectory() as tmp_dir: @@ -149,8 +152,11 @@ def test_push_to_hub_in_organization(self): if k != 
"transformers_version": self.assertEqual(v, getattr(new_config, k)) - # Reset repo - delete_repo(token=self._token, repo_id="valid_org/test-config-org") + try: + # Reset repo + delete_repo(token=self._token, repo_id="valid_org/test-config-org") + except: # noqa E722 + pass # Push to hub via save_pretrained with tempfile.TemporaryDirectory() as tmp_dir: diff --git a/tests/utils/test_feature_extraction_utils.py b/tests/utils/test_feature_extraction_utils.py index d88fcb276056d7..0e68addb2adceb 100644 --- a/tests/utils/test_feature_extraction_utils.py +++ b/tests/utils/test_feature_extraction_utils.py @@ -85,8 +85,11 @@ def test_push_to_hub(self): for k, v in feature_extractor.__dict__.items(): self.assertEqual(v, getattr(new_feature_extractor, k)) - # Reset repo - delete_repo(token=self._token, repo_id="test-feature-extractor") + try: + # Reset repo + delete_repo(token=self._token, repo_id="test-feature-extractor") + except: # noqa E722 + pass # Push to hub via save_pretrained with tempfile.TemporaryDirectory() as tmp_dir: @@ -106,8 +109,11 @@ def test_push_to_hub_in_organization(self): for k, v in feature_extractor.__dict__.items(): self.assertEqual(v, getattr(new_feature_extractor, k)) - # Reset repo - delete_repo(token=self._token, repo_id="valid_org/test-feature-extractor") + try: + # Reset repo + delete_repo(token=self._token, repo_id="valid_org/test-feature-extractor") + except: # noqa E722 + pass # Push to hub via save_pretrained with tempfile.TemporaryDirectory() as tmp_dir: diff --git a/tests/utils/test_image_processing_utils.py b/tests/utils/test_image_processing_utils.py index 4b2586a634d8a2..3681d1d1e1365b 100644 --- a/tests/utils/test_image_processing_utils.py +++ b/tests/utils/test_image_processing_utils.py @@ -96,8 +96,11 @@ def test_push_to_hub(self): for k, v in image_processor.__dict__.items(): self.assertEqual(v, getattr(new_image_processor, k)) - # Reset repo - delete_repo(token=self._token, repo_id="test-image-processor") + try: + # Reset repo + delete_repo(token=self._token, repo_id="test-image-processor") + except: # noqa E722 + pass # Push to hub via save_pretrained with tempfile.TemporaryDirectory() as tmp_dir: @@ -117,8 +120,11 @@ def test_push_to_hub_in_organization(self): for k, v in image_processor.__dict__.items(): self.assertEqual(v, getattr(new_image_processor, k)) - # Reset repo - delete_repo(token=self._token, repo_id="valid_org/test-image-processor") + try: + # Reset repo + delete_repo(token=self._token, repo_id="valid_org/test-image-processor") + except: # noqa E722 + pass # Push to hub via save_pretrained with tempfile.TemporaryDirectory() as tmp_dir: diff --git a/tests/utils/test_modeling_flax_utils.py b/tests/utils/test_modeling_flax_utils.py index 0309a3bd8f8ce0..5011c240cc9282 100644 --- a/tests/utils/test_modeling_flax_utils.py +++ b/tests/utils/test_modeling_flax_utils.py @@ -83,8 +83,11 @@ def test_push_to_hub(self): max_diff = (base_params[key] - new_params[key]).sum().item() self.assertLessEqual(max_diff, 1e-3, msg=f"{key} not identical") - # Reset repo - delete_repo(token=self._token, repo_id="test-model-flax") + try: + # Reset repo + delete_repo(token=self._token, repo_id="test-model-flax") + except: # noqa E722 + pass # Push to hub via save_pretrained with tempfile.TemporaryDirectory() as tmp_dir: @@ -115,8 +118,11 @@ def test_push_to_hub_in_organization(self): max_diff = (base_params[key] - new_params[key]).sum().item() self.assertLessEqual(max_diff, 1e-3, msg=f"{key} not identical") - # Reset repo - delete_repo(token=self._token, 
repo_id="valid_org/test-model-flax-org") + try: + # Reset repo + delete_repo(token=self._token, repo_id="valid_org/test-model-flax-org") + except: # noqa E722 + pass # Push to hub via save_pretrained with tempfile.TemporaryDirectory() as tmp_dir: diff --git a/tests/utils/test_modeling_tf_utils.py b/tests/utils/test_modeling_tf_utils.py index 8a281761333dc3..6332df014d57a1 100644 --- a/tests/utils/test_modeling_tf_utils.py +++ b/tests/utils/test_modeling_tf_utils.py @@ -723,8 +723,11 @@ def test_push_to_hub(self): break self.assertTrue(models_equal) - # Reset repo - delete_repo(token=self._token, repo_id="test-model-tf") + try: + # Reset repo + delete_repo(token=self._token, repo_id="test-model-tf") + except: # noqa E722 + pass # Push to hub via save_pretrained with tempfile.TemporaryDirectory() as tmp_dir: @@ -786,8 +789,11 @@ def test_push_to_hub_in_organization(self): break self.assertTrue(models_equal) - # Reset repo - delete_repo(token=self._token, repo_id="valid_org/test-model-tf-org") + try: + # Reset repo + delete_repo(token=self._token, repo_id="valid_org/test-model-tf-org") + except: # noqa E722 + pass # Push to hub via save_pretrained with tempfile.TemporaryDirectory() as tmp_dir: diff --git a/tests/utils/test_modeling_utils.py b/tests/utils/test_modeling_utils.py index 83c8ec8499bfe4..ed540fd5e59b84 100644 --- a/tests/utils/test_modeling_utils.py +++ b/tests/utils/test_modeling_utils.py @@ -1847,8 +1847,11 @@ def test_push_to_hub(self): for p1, p2 in zip(model.parameters(), new_model.parameters()): self.assertTrue(torch.equal(p1, p2)) - # Reset repo - delete_repo(token=self._token, repo_id="test-model") + try: + # Reset repo + delete_repo(token=self._token, repo_id="test-model") + except: # noqa E722 + pass # Push to hub via save_pretrained with tempfile.TemporaryDirectory() as tmp_dir: @@ -1887,8 +1890,11 @@ def test_push_to_hub_in_organization(self): for p1, p2 in zip(model.parameters(), new_model.parameters()): self.assertTrue(torch.equal(p1, p2)) - # Reset repo - delete_repo(token=self._token, repo_id="valid_org/test-model-org") + try: + # Reset repo + delete_repo(token=self._token, repo_id="valid_org/test-model-org") + except: # noqa E722 + pass # Push to hub via save_pretrained with tempfile.TemporaryDirectory() as tmp_dir: diff --git a/tests/utils/test_tokenization_utils.py b/tests/utils/test_tokenization_utils.py index 5b2f2021565812..0df86dc3cc658d 100644 --- a/tests/utils/test_tokenization_utils.py +++ b/tests/utils/test_tokenization_utils.py @@ -146,8 +146,11 @@ def test_push_to_hub(self): new_tokenizer = BertTokenizer.from_pretrained(f"{USER}/test-tokenizer") self.assertDictEqual(new_tokenizer.vocab, tokenizer.vocab) - # Reset repo - delete_repo(token=self._token, repo_id="test-tokenizer") + try: + # Reset repo + delete_repo(token=self._token, repo_id="test-tokenizer") + except: # noqa E722 + pass # Push to hub via save_pretrained with tempfile.TemporaryDirectory() as tmp_dir: @@ -167,8 +170,11 @@ def test_push_to_hub_in_organization(self): new_tokenizer = BertTokenizer.from_pretrained("valid_org/test-tokenizer-org") self.assertDictEqual(new_tokenizer.vocab, tokenizer.vocab) - # Reset repo - delete_repo(token=self._token, repo_id="valid_org/test-tokenizer-org") + try: + # Reset repo + delete_repo(token=self._token, repo_id="valid_org/test-tokenizer-org") + except: # noqa E722 + pass # Push to hub via save_pretrained with tempfile.TemporaryDirectory() as tmp_dir: From e4682de6358f9b9cefb73683588e588e4d9154f7 Mon Sep 17 00:00:00 2001 From: Joao Gante Date: Mon, 15 Jul 
2024 18:49:37 +0100 Subject: [PATCH 6/7] Masking: remove flakiness from test (#31939) --- tests/models/whisper/test_modeling_whisper.py | 3 --- tests/test_modeling_common.py | 3 --- 2 files changed, 6 deletions(-) diff --git a/tests/models/whisper/test_modeling_whisper.py b/tests/models/whisper/test_modeling_whisper.py index dcb495d95a6e4d..5fc66f9a20551d 100644 --- a/tests/models/whisper/test_modeling_whisper.py +++ b/tests/models/whisper/test_modeling_whisper.py @@ -1571,9 +1571,6 @@ def test_custom_4d_attention_mask(self): out_last_tokens = logits[:, -1, :] # last tokens in each batch line out_shared_prefix_last_tokens = logits_shared_prefix[0, -3:, :] # last three tokens - # comparing greedily-chosen tokens: - assert torch.equal(out_last_tokens.max(axis=1).indices, out_shared_prefix_last_tokens.max(axis=1).indices) - # comparing softmax-normalized logits: normalized_0 = torch.nn.functional.softmax(out_last_tokens) normalized_1 = torch.nn.functional.softmax(out_shared_prefix_last_tokens) diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py index 0ed3cee3c57a53..a73417e4164821 100755 --- a/tests/test_modeling_common.py +++ b/tests/test_modeling_common.py @@ -4486,9 +4486,6 @@ def test_custom_4d_attention_mask(self): out_last_tokens = logits[:, -1, :] # last tokens in each batch line out_shared_prefix_last_tokens = logits_shared_prefix[0, -3:, :] # last three tokens - # comparing greedily-chosen tokens: - assert torch.equal(out_last_tokens.max(axis=1).indices, out_shared_prefix_last_tokens.max(axis=1).indices) - # comparing softmax-normalized logits: normalized_0 = F.softmax(out_last_tokens) normalized_1 = F.softmax(out_shared_prefix_last_tokens) From 6fbea6d237cbdfc3c229cdadfa3c968cfb2d5142 Mon Sep 17 00:00:00 2001 From: Joao Gante Date: Mon, 15 Jul 2024 19:59:20 +0100 Subject: [PATCH 7/7] Generate: doc nits (#31982) nits --- .../generation/configuration_utils.py | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/src/transformers/generation/configuration_utils.py b/src/transformers/generation/configuration_utils.py index dcdccad23a54c1..c7e626f1a7c284 100644 --- a/src/transformers/generation/configuration_utils.py +++ b/src/transformers/generation/configuration_utils.py @@ -113,10 +113,10 @@ class GenerationConfig(PushToHubMixin): heuristic is applied and the generation stops when is it very unlikely to find better candidates; `"never"`, where the beam search procedure only stops when there cannot be better candidates (canonical beam search algorithm). - max_time(`float`, *optional*): + max_time (`float`, *optional*): The maximum amount of time you allow the computation to run for in seconds. generation will still finish the current pass after allocated time has been passed. - stop_strings(`str or List[str]`, *optional*): + stop_strings (`str or List[str]`, *optional*): A string or a list of strings that should terminate generation if the model outputs them. > Parameters that control the generation strategy used @@ -181,10 +181,10 @@ class GenerationConfig(PushToHubMixin): `length_penalty` < 0.0 encourages shorter sequences. no_repeat_ngram_size (`int`, *optional*, defaults to 0): If set to int > 0, all ngrams of that size can only occur once. - bad_words_ids(`List[List[int]]`, *optional*): + bad_words_ids (`List[List[int]]`, *optional*): List of list of token ids that are not allowed to be generated. Check [`~generation.NoBadWordsLogitsProcessor`] for further documentation and examples. 
- force_words_ids(`List[List[int]]` or `List[List[List[int]]]`, *optional*): + force_words_ids (`List[List[int]]` or `List[List[List[int]]]`, *optional*): List of token ids that must be generated. If given a `List[List[int]]`, this is treated as a simple list of words that must be included, the opposite to `bad_words_ids`. If given `List[List[List[int]]]`, this triggers a [disjunctive constraint](https://github.com/huggingface/transformers/issues/14081), where one @@ -200,7 +200,7 @@ class GenerationConfig(PushToHubMixin): The id of the token to force as the first generated token after the `decoder_start_token_id`. Useful for multilingual models like [mBART](../model_doc/mbart) where the first generated token needs to be the target language token. - forced_eos_token_id (`Union[int, List[int]]`, *optional*, defaults to `model.config.forced_eos_token_id`): + forced_eos_token_id (`int` or List[int]`, *optional*, defaults to `model.config.forced_eos_token_id`): The id of the token to force as the last generated token when `max_length` is reached. Optionally, use a list to set multiple *end-of-sequence* tokens. remove_invalid_values (`bool`, *optional*, defaults to `model.config.remove_invalid_values`): @@ -210,7 +210,7 @@ class GenerationConfig(PushToHubMixin): This Tuple adds an exponentially increasing length penalty, after a certain amount of tokens have been generated. The tuple shall consist of: `(start_index, decay_factor)` where `start_index` indicates where penalty starts and `decay_factor` represents the factor of exponential decay - suppress_tokens (`List[int]`, *optional*): + suppress_tokens (`List[int]`, *optional*): A list of tokens that will be suppressed at generation. The `SupressTokens` logit processor will set their log probs to `-inf` so that they are not sampled. begin_suppress_tokens (`List[int]`, *optional*): @@ -234,7 +234,7 @@ class GenerationConfig(PushToHubMixin): low_memory (`bool`, *optional*): Switch to sequential beam search and sequential topk for contrastive search to reduce peak memory. Used with beam search and contrastive search. - watermarking_config (Union[`WatermarkingConfig`, `dict`], *optional*): + watermarking_config (`WatermarkingConfig` or `dict`, *optional*): Arguments used to watermark the model outputs by adding a small bias to randomly selected set of "green" tokens. If passed as `Dict`, it will be converted to a `WatermarkingConfig` internally. See [this paper](https://arxiv.org/abs/2306.04634) for more details. Accepts the following keys: @@ -249,12 +249,12 @@ class GenerationConfig(PushToHubMixin): - "lefthash" (default): "green" tokens selection depend on the last token (Algorithm 2 from the paper) - "selfhash": "green" tokens selection depends on the current token itself (Algorithm 3 from the paper) The downside of this scheme is that it considers all possible next tokens and can be slower than "lefthash". - - context_width(`int`): + - context_width (`int`): The context length of previous tokens to use in seeding. Higher context length makes watermarking more robust. > Parameters that define the output variables of generate - num_return_sequences(`int`, *optional*, defaults to 1): + num_return_sequences (`int`, *optional*, defaults to 1): The number of independently computed returned sequences for each element in the batch. output_attentions (`bool`, *optional*, defaults to `False`): Whether or not to return the attentions tensors of all attention layers. 
See `attentions` under returned @@ -284,7 +284,7 @@ class GenerationConfig(PushToHubMixin): encoder_no_repeat_ngram_size (`int`, *optional*, defaults to 0): If set to int > 0, all ngrams of that size that occur in the `encoder_input_ids` cannot occur in the `decoder_input_ids`. - decoder_start_token_id (`Union[int, List[int]]`, *optional*): + decoder_start_token_id (`int` or `List[int]`, *optional*): If an encoder-decoder model starts decoding with a different token than *bos*, the id of that token or a list of length `batch_size`. Indicating a list enables different start ids for each element in the batch (e.g. multilingual models with different target languages in one batch) @@ -323,7 +323,7 @@ class GenerationConfig(PushToHubMixin): cache_implementation (`str`, *optional*, default to `None`): Cache class that should be used when generating. - cache_config (`Union[CacheConfig, dict]`, *optional*, default to `None`): + cache_config (`CacheConfig` or `dict`, *optional*, default to `None`): Arguments used in the key-value cache class can be passed in `cache_config`. Can be passed as a `Dict` and it will be converted to its repsective `CacheConfig` internally. Otherwise can be passed as a `CacheConfig` class matching the indicated `cache_implementation`.
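
Note on the generation-mode refactor in PATCH 2/7: the hand-rolled `is_greedy_gen_mode` / `is_sample_gen_mode` checks are replaced by `generation_config.get_generation_mode()`, and a logits warper is prepared only when `generation_config.do_sample` is true, so greedy decoding now reaches `_sample()` with `logits_warper=None` (the RAG change in the same patch passes `logits_warper=None` explicitly for the same reason). A minimal sketch of that behaviour is below; the config values are illustrative assumptions and do not come from the patches:

    # Sketch only: illustrates GenerationConfig.get_generation_mode() as used in PATCH 2/7.
    from transformers import GenerationConfig
    from transformers.generation.configuration_utils import GenerationMode

    sampling_config = GenerationConfig(do_sample=True, num_beams=1, temperature=0.7)
    greedy_config = GenerationConfig(do_sample=False, num_beams=1)

    assert sampling_config.get_generation_mode() == GenerationMode.SAMPLE
    assert greedy_config.get_generation_mode() == GenerationMode.GREEDY_SEARCH

    # Mirrors the musicgen/musicgen_melody change: a warper is needed only for sampling;
    # greedy search now calls _sample() with logits_warper=None.
    needs_logits_warper = sampling_config.do_sample  # True here, False for greedy_config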