Skip to content

Commit

Permalink
Merge pull request #846 from OptimalScale/yizhenjia-tokenizer-mp
Browse files Browse the repository at this point in the history
[Bug fix] Fix missing arguments in blocking() function call
  • Loading branch information
research4pan authored May 31, 2024
2 parents 365b5ef + 6b63319 commit 69b0e99
Show file tree
Hide file tree
Showing 2 changed files with 7 additions and 2 deletions.
2 changes: 1 addition & 1 deletion src/lmflow/models/hf_decoder_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -491,7 +491,7 @@ def tokenize(self, dataset, add_special_tokens=True, *args, **kwargs):
(
raw_datasets.get_fingerprint()
+ str(self.tokenizer)
+ str(conversation_template) if "conversation" in dataset_type else ""
+ ('###conversation_template=' + str(conversation_template) if "conversation" in dataset_type else "")
+ f'###disable_group_texts={data_args.disable_group_texts}'
+ f'###block_size={data_args.block_size}'
).encode("utf-8")
Expand Down
7 changes: 6 additions & 1 deletion src/lmflow/tokenization/hf_decoder_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,7 +170,12 @@ def conversation_tokenize_function(
token_dict["labels"][i].extend(labels)

if data_args.disable_group_texts:
token_dict = blocking(token_dict)
token_dict = blocking(
token_dict=token_dict,
block_size=data_args.block_size,
model_max_length=tokenizer.model_max_length,
pad_token_id=tokenizer.pad_token_id,
)

# clm input could be much much longer than block_size
if "Token indices sequence length is longer than the" in cl.out:
Expand Down

0 comments on commit 69b0e99

Please sign in to comment.