Skip to content

Commit

Permalink
Merge pull request #846 from OptimalScale/yizhenjia-tokenizer-mp
Browse files Browse the repository at this point in the history
[Bug fix] Fix missing arguments in blocking() function call
  • Loading branch information
research4pan authored May 31, 2024
2 parents 365b5ef + 6b63319 commit 69b0e99
Show file tree
Hide file tree
Showing 2 changed files with 7 additions and 2 deletions.
2 changes: 1 addition & 1 deletion src/lmflow/models/hf_decoder_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -491,7 +491,7 @@ def tokenize(self, dataset, add_special_tokens=True, *args, **kwargs):
(
raw_datasets.get_fingerprint()
+ str(self.tokenizer)
+ str(conversation_template) if "conversation" in dataset_type else ""
+ ('###conversation_template=' + str(conversation_template) if "conversation" in dataset_type else "")
+ f'###disable_group_texts={data_args.disable_group_texts}'
+ f'###block_size={data_args.block_size}'
).encode("utf-8")
Expand Down
7 changes: 6 additions & 1 deletion src/lmflow/tokenization/hf_decoder_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,7 +170,12 @@ def conversation_tokenize_function(
token_dict["labels"][i].extend(labels)

if data_args.disable_group_texts:
token_dict = blocking(token_dict)
token_dict = blocking(
token_dict=token_dict,
block_size=data_args.block_size,
model_max_length=tokenizer.model_max_length,
pad_token_id=tokenizer.pad_token_id,
)

# clm input could be much much longer than block_size
if "Token indices sequence length is longer than the" in cl.out:
Expand Down

0 comments on commit 69b0e99

Please sign in to comment.