[mistral] Fix FA2 attention reshape for Mistral Nemo (#32065)
* [mistral] Fix FA2 attention reshape

* [run-slow] mistral
xenova authored and itazap committed Jul 25, 2024
1 parent 46e0353 commit 7e1468c
Showing 1 changed file with 1 addition and 1 deletion.
src/transformers/models/mistral/modeling_mistral.py: 1 addition & 1 deletion

@@ -387,7 +387,7 @@ def forward(
is_causal=self.is_causal,
)

-        attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous()
+        attn_output = attn_output.reshape(bsz, q_len, self.num_heads * self.head_dim).contiguous()
attn_output = self.o_proj(attn_output)

if not output_attentions:
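A minimal sketch of why the fix matters, using illustrative Mistral Nemo-like dimensions (assumed here for demonstration, not taken from the diff): the model's `hidden_size` does not equal `num_heads * head_dim`, so reshaping the Flash Attention 2 output to `hidden_size` raises an error, while reshaping to `num_heads * head_dim` matches the tensor's true element count.

```python
import torch

# Illustrative dimensions (assumption): Mistral Nemo uses a head_dim such that
# num_heads * head_dim != hidden_size, which is what broke the old reshape.
bsz, q_len = 2, 4
num_heads, head_dim = 32, 128   # num_heads * head_dim = 4096
hidden_size = 5120              # differs from 4096

# Flash Attention 2 returns shape (bsz, q_len, num_heads, head_dim)
attn_output = torch.randn(bsz, q_len, num_heads, head_dim)

# Old code: reshape to hidden_size fails when the two quantities differ
try:
    attn_output.reshape(bsz, q_len, hidden_size)
    old_reshape_ok = True
except RuntimeError:
    old_reshape_ok = False

# Fixed code: reshape to the attention output's actual flattened width
fixed = attn_output.reshape(bsz, q_len, num_heads * head_dim).contiguous()

print("old reshape ok:", old_reshape_ok)   # False
print("fixed shape:", tuple(fixed.shape))  # (2, 4, 4096)
```

For models where `hidden_size == num_heads * head_dim`, both reshapes are equivalent; the fix simply stops assuming that equality.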
