Skip to content

Commit

Permalink
fix(tokenizer): handle fast tokenizer properly for bos/eos (axolotl-a…
Browse files · Browse the repository at this point in the history
  • Loading branch information
NanoCode012 committed Dec 8, 2023
1 parent d7797cf commit bfe1425
Showing 1 changed file with 18 additions and 0 deletions.
18 changes: 18 additions & 0 deletions src/axolotl/utils/models.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -92,6 +92,7 @@ def load_tokenizer(cfg):
"LlamaTokenizer",
"LlamaTokenizerFast",
"CodeLlamaTokenizer",
"CodeLlamaTokenizerFast",
]
and hasattr(tokenizer, "pad_token")
and not tokenizer.pad_token
Expand Down Expand Up @@ -124,6 +125,23 @@ def load_tokenizer(cfg):
tokenizer.add_special_tokens(
{k: AddedToken(val, rstrip=False, lstrip=False, normalized=False)}
)

# If we add bos_token and eos_token, we need to update the post processor to
# handle them correctly.
# https://github.com/huggingface/transformers/pull/24132
bos_or_eos_in_special_tokens = (
"bos_token" in cfg.special_tokens and "eos_token" in cfg.special_tokens
)
if (
tokenizer.__class__.__name__
in (
"LlamaTokenizerFast",
"CodeLlamaTokenizerFast",
)
and bos_or_eos_in_special_tokens
):
tokenizer.update_post_processor()

if cfg.tokens:
tokenizer.add_tokens(
[
Expand Down

0 comments on commit bfe1425

Please sign in to comment.