Commit message:
* enable MPT PEFT LoRA fine-tuning on Gaudi1
* update README
* change the MPT model because DL1 lacks support for `torch.roll` (see the workaround sketch below)
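The commit message does not show the exact `torch.roll` replacement. As an illustration only, one common way to emulate `torch.roll` on backends that lack it is with `narrow` and `torch.cat`; the function name and single-axis signature below are assumptions, not the patch's actual code:

```python
import torch

def roll_fallback(x: torch.Tensor, shift: int, dim: int = 0) -> torch.Tensor:
    """Emulate torch.roll(x, shifts=shift, dims=dim) with slicing + cat,
    for accelerators (e.g., DL1/Gaudi) that do not support torch.roll."""
    size = x.size(dim)
    shift = shift % size  # normalize negative/oversized shifts into [0, size)
    if shift == 0:
        return x
    tail = x.narrow(dim, size - shift, shift)  # the last `shift` elements
    head = x.narrow(dim, 0, size - shift)      # everything before them
    return torch.cat([tail, head], dim=dim)

# Sanity check against the reference semantics:
# roll_fallback(torch.tensor([1, 2, 3, 4, 5]), 2) -> tensor([4, 5, 1, 2, 3])
```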
Showing 14 changed files with 2,205 additions and 28 deletions.
workflows/chatbot/fine_tuning/instruction_tuning_pipeline/models/mpt/adapt_tokenizer.py (41 additions, 0 deletions)
```python
from typing import Union

from transformers import AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast

Tokenizer = Union[PreTrainedTokenizer, PreTrainedTokenizerFast]
NUM_SENTINEL_TOKENS: int = 100


def adapt_tokenizer_for_denoising(tokenizer: Tokenizer):
    """Adds sentinel tokens and padding token (if missing).

    Expands the tokenizer vocabulary to include sentinel tokens
    used in mixture-of-denoiser tasks as well as a padding token.

    All added tokens are added as special tokens. No tokens are
    added if sentinel tokens and padding token already exist.
    """
    sentinels_to_add = [f'<extra_id_{i}>' for i in range(NUM_SENTINEL_TOKENS)]
    tokenizer.add_tokens(sentinels_to_add, special_tokens=True)
    if tokenizer.pad_token is None:
        tokenizer.add_tokens('<pad>', special_tokens=True)
        tokenizer.pad_token = '<pad>'
        assert tokenizer.pad_token_id is not None
    sentinels = ''.join([f'<extra_id_{i}>' for i in range(NUM_SENTINEL_TOKENS)])
    _sentinel_token_ids = tokenizer(sentinels, add_special_tokens=False).input_ids
    tokenizer.sentinel_token_ids = _sentinel_token_ids


class AutoTokenizerForMOD(AutoTokenizer):
    """AutoTokenizer + Adaptation for MOD.

    A simple wrapper around AutoTokenizer to make instantiating
    an MOD-adapted tokenizer a bit easier.

    MOD-adapted tokenizers have sentinel tokens (e.g., <extra_id_0>),
    a padding token, and a property to get the token ids of the
    sentinel tokens.
    """

    @classmethod
    def from_pretrained(cls, *args, **kwargs):
        """See `AutoTokenizer.from_pretrained` docstring."""
        tokenizer = super().from_pretrained(*args, **kwargs)
        adapt_tokenizer_for_denoising(tokenizer)
        return tokenizer
```
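A short usage sketch of the two entry points above; the checkpoint name is an illustrative assumption (MPT commonly pairs with the GPT-NeoX tokenizer), not something this diff specifies:

```python
from transformers import AutoTokenizer
# AutoTokenizerForMOD and adapt_tokenizer_for_denoising come from
# the adapt_tokenizer.py module shown above.

# One-step construction via the wrapper class.
tokenizer = AutoTokenizerForMOD.from_pretrained('EleutherAI/gpt-neox-20b')
print(tokenizer.pad_token)                # '<pad>' if the base tokenizer had none
print(len(tokenizer.sentinel_token_ids))  # 100, one id per <extra_id_i> sentinel

# Equivalently, adapt an already-instantiated tokenizer in place.
base = AutoTokenizer.from_pretrained('EleutherAI/gpt-neox-20b')
adapt_tokenizer_for_denoising(base)
```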