Skip to content

Commit

Permalink
Merge pull request #244 from instructlab/fix-pretrain-max
Browse files Browse the repository at this point in the history
Fix pretrain token handling for masking: use the token id (int) rather than the single-element list returned by the tokenizer
  • Loading branch information
mergify[bot] authored Oct 2, 2024
2 parents 63f128c + 5c368ee commit ed833b9
Showing 1 changed file with 3 additions and 3 deletions.
6 changes: 3 additions & 3 deletions src/instructlab/training/data_process.py
Original file line number Diff line number Diff line change
Expand Up @@ -310,7 +310,7 @@ def main(args: DataProcessArgs):
print("\033[92mCategorizing training data type...\033[0m")
data_with_input_ids = data_with_input_ids.map(
lambda x: {
"is_pretrain": get_sp_token(tokenizer, "<|pretrain|>") in x["input_ids"]
"is_pretrain": get_sp_token(tokenizer, "<|pretrain|>")[0] in x["input_ids"]
},
num_proc=NUM_PROC,
)
Expand All @@ -320,8 +320,8 @@ def main(args: DataProcessArgs):
user_tokens=user_tk,
assist_tokens=assistant_tk,
system_tokens=system_tk,
pretrain_token=get_sp_token(tokenizer, "<|pretrain|>"),
pretrain_end_token=get_sp_token(tokenizer, "<|/pretrain|>"),
pretrain_token=get_sp_token(tokenizer, "<|pretrain|>")[0],
pretrain_end_token=get_sp_token(tokenizer, "<|/pretrain|>")[0],
)
print("\033[92munmasking the appropriate message content...\033[0m")
data_with_labels = data_with_input_ids.map(
Expand Down

0 comments on commit ed833b9

Please sign in to comment.