Skip to content

Commit

Permalink
Check if the given token is a string (#745)
Browse files Browse the repository at this point in the history
  • Loading branch information
silverriver committed Mar 14, 2024
1 parent c8566e8 commit aa0a35e
Showing 1 changed file with 6 additions and 2 deletions.
8 changes: 6 additions & 2 deletions outlines/integrations/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,11 +52,15 @@ def adapt_tokenizer(tokenizer: PreTrainedTokenizerBase) -> PreTrainedTokenizerBa
     tokenizer.vocabulary = tokenizer.get_vocab()
     tokenizer.special_tokens = set(tokenizer.all_special_tokens)

-    def convert_token_to_string(token: str) -> str:
+    def convert_token_to_string(token: Union[str, bytes]) -> str:
         string = tokenizer.convert_tokens_to_string([token])

         # A hack to handle missing spaces to HF's Llama tokenizers
-        if token.startswith(SPIECE_UNDERLINE) or token == "<0x20>":
+        if (
+            type(token) is str
+            and token.startswith(SPIECE_UNDERLINE)
+            or token == "<0x20>"
+        ):
             return " " + string

         return string
Expand Down

0 comments on commit aa0a35e

Please sign in to comment.