BART & FSMT: fix decoder not returning hidden states from the last layer #8597

Merged (8 commits) on Nov 27, 2020
src/transformers/models/bart/modeling_bart.py (6 additions, 0 deletions)
@@ -610,6 +610,12 @@ def forward(
                 all_self_attns += (layer_self_attn,)
                 all_cross_attentions += (layer_cross_attn,)
 
+        # add hidden states from the last decoder layer
+        if output_hidden_states:
+            x = x.transpose(0, 1)
+            all_hidden_states += (x,)
+            x = x.transpose(0, 1)
+
         if self.layer_norm:  # if config.add_final_layer_norm (mBART)
             x = self.layer_norm(x)
 
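For context on the new block: the per-layer loop above records the hidden state going *into* each layer, so without an extra append after the loop the last layer's output never reaches `all_hidden_states` (that is exactly what the PR title fixes). The transposes convert from the decoder's internal time-first layout, `(seq_len, batch, hidden)`, to the batch-first layout reported to the caller. A toy sketch of the pattern (a stand-in module, not the actual `BartDecoder`):

```python
# Toy illustration: collecting hidden states only inside the per-layer loop stores
# each layer's input, so the final layer's output is missing unless it is appended
# once more after the loop.
import torch
import torch.nn as nn

class ToyDecoder(nn.Module):
    def __init__(self, num_layers=3, dim=8):
        super().__init__()
        self.layers = nn.ModuleList(nn.Linear(dim, dim) for _ in range(num_layers))

    def forward(self, x, output_hidden_states=True):
        # x is (seq_len, batch, dim), mirroring the time-first layout used internally
        all_hidden_states = ()
        for layer in self.layers:
            if output_hidden_states:
                all_hidden_states += (x.transpose(0, 1),)  # report batch-first
            x = layer(x)
        # the fix: also record the output of the last layer
        if output_hidden_states:
            all_hidden_states += (x.transpose(0, 1),)
        return x, all_hidden_states

decoder = ToyDecoder()
_, hs = decoder(torch.randn(5, 2, 8))
assert len(hs) == len(decoder.layers) + 1  # initial input + one entry per layer
```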
src/transformers/models/fsmt/modeling_fsmt.py (6 additions, 0 deletions)
@@ -692,6 +692,12 @@ def forward(
                 all_self_attns += (layer_self_attn,)
                 all_cross_attns += (layer_cross_attn,)
 
+        # add hidden states from the last decoder layer
+        if output_hidden_states:
+            x = x.transpose(0, 1)
+            all_hidden_states += (x,)
+            x = x.transpose(0, 1)
+
         # Convert to standard output format: (seq_len, BS, model_dim) -> (BS, seq_len, model_dim)
         x = x.transpose(0, 1)
         encoder_hidden_states = encoder_hidden_states.transpose(0, 1)
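The FSMT change is the same fix. One rough way to check it from user code is to request hidden states from a seq2seq checkpoint and compare the tuple length with the configured number of decoder layers. The snippet below is only a sketch, assuming the `facebook/bart-base` checkpoint is available for download and using only the public `from_pretrained` / forward APIs:

```python
import torch
from transformers import BartForConditionalGeneration, BartTokenizer

tok = BartTokenizer.from_pretrained("facebook/bart-base")
model = BartForConditionalGeneration.from_pretrained("facebook/bart-base")
model.eval()

inputs = tok("Hello world", return_tensors="pt")
with torch.no_grad():
    out = model(
        **inputs,
        decoder_input_ids=inputs["input_ids"],
        output_hidden_states=True,
        return_dict=True,
    )

# embeddings + one entry per decoder layer, now including the last layer's output
assert len(out.decoder_hidden_states) == model.config.decoder_layers + 1
```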
tests/test_modeling_common.py (18 additions, 1 deletion)
@@ -659,12 +659,14 @@ def check_hidden_states_output(inputs_dict, config, model_class):

             with torch.no_grad():
                 outputs = model(**self._prepare_for_class(inputs_dict, model_class))
-            hidden_states = outputs["hidden_states"] if "hidden_states" in outputs else outputs[-1]
+
+            hidden_states = outputs.encoder_hidden_states if config.is_encoder_decoder else outputs.hidden_states
+
             expected_num_layers = getattr(
                 self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1
             )
             self.assertEqual(len(hidden_states), expected_num_layers)
 
             if hasattr(self.model_tester, "encoder_seq_length"):
                 seq_length = self.model_tester.encoder_seq_length
                 if hasattr(self.model_tester, "chunk_length") and self.model_tester.chunk_length > 1:
@@ -677,9 +679,24 @@ def check_hidden_states_output(inputs_dict, config, model_class):
                     [seq_length, self.model_tester.hidden_size],
                 )
 
+            if config.is_encoder_decoder:
+                hidden_states = outputs.decoder_hidden_states
+
+                self.assertIsInstance(hidden_states, (list, tuple))
+                self.assertEqual(len(hidden_states), expected_num_layers)
+                seq_len = getattr(self.model_tester, "seq_length", None)
+                decoder_seq_length = getattr(self.model_tester, "decoder_seq_length", seq_len)
+
+                self.assertListEqual(
+                    list(hidden_states[0].shape[-2:]),
+                    [decoder_seq_length, self.model_tester.hidden_size],
+                )
 
         config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
 
         for model_class in self.all_model_classes:
+            config.return_dict = True
+
             inputs_dict["output_hidden_states"] = True
             check_hidden_states_output(inputs_dict, config, model_class)
 
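The new branch in the common test spells out the contract for encoder-decoder models: `decoder_hidden_states` must be a tuple with `num_hidden_layers + 1` entries (embeddings plus every decoder layer, now including the last), each ending in `(decoder_seq_length, hidden_size)`. Restated standalone below with placeholder tensors rather than real model outputs:

```python
import torch

# placeholder sizes, just to make the assertions concrete
num_hidden_layers, batch, decoder_seq_length, hidden_size = 2, 3, 7, 16

# what a fixed decoder is expected to return for output_hidden_states=True
decoder_hidden_states = tuple(
    torch.zeros(batch, decoder_seq_length, hidden_size) for _ in range(num_hidden_layers + 1)
)

assert isinstance(decoder_hidden_states, (list, tuple))
assert len(decoder_hidden_states) == num_hidden_layers + 1
assert list(decoder_hidden_states[0].shape[-2:]) == [decoder_seq_length, hidden_size]
```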