Fix convert_tokens_to_string when decoder is None
dszeto committed Nov 1, 2024
1 parent 33868a0 · commit 1dd4100
Showing 1 changed file with 32 additions and 9 deletions.
src/transformers/tokenization_utils_fast.py (41 changes: 32 additions & 9 deletions)
@@ -27,7 +27,12 @@
 from tokenizers import Encoding as EncodingFast
 from tokenizers import Tokenizer as TokenizerFast
 from tokenizers.decoders import Decoder as DecoderFast
-from tokenizers.trainers import BpeTrainer, UnigramTrainer, WordLevelTrainer, WordPieceTrainer
+from tokenizers.trainers import (
+    BpeTrainer,
+    UnigramTrainer,
+    WordLevelTrainer,
+    WordPieceTrainer,
+)
 
 from .convert_slow_tokenizer import convert_slow_tokenizer
 from .integrations.ggml import convert_gguf_tokenizer
@@ -75,7 +80,10 @@
     "WordPiece": WordPieceTrainer,
 }
 
-VOCAB_FILES_NAMES = {"tokenizer_file": TOKENIZER_FILE, "vocab_file": TIKTOKEN_VOCAB_FILE}
+VOCAB_FILES_NAMES = {
+    "tokenizer_file": TOKENIZER_FILE,
+    "vocab_file": TIKTOKEN_VOCAB_FILE,
+}
 
 
 @add_end_docstrings(INIT_TOKENIZER_DOCSTRING)
@@ -404,7 +412,13 @@ def convert_ids_to_tokens(
             tokens.append(self._tokenizer.id_to_token(index))
         return tokens
 
-    def tokenize(self, text: str, pair: Optional[str] = None, add_special_tokens: bool = False, **kwargs) -> List[str]:
+    def tokenize(
+        self,
+        text: str,
+        pair: Optional[str] = None,
+        add_special_tokens: bool = False,
+        **kwargs,
+    ) -> List[str]:
         return self.encode_plus(text=text, text_pair=pair, add_special_tokens=add_special_tokens, **kwargs).tokens()
 
     def set_truncation_and_padding(
@@ -473,7 +487,7 @@ def set_truncation_and_padding(
             length = max_length if padding_strategy == PaddingStrategy.MAX_LENGTH else None
             _target = {
                 "length": length,
-                "direction": padding_side if padding_side is not None else self.padding_side,
+                "direction": (padding_side if padding_side is not None else self.padding_side),
                 "pad_id": self.pad_token_id,
                 "pad_token": self.pad_token,
                 "pad_type_id": self.pad_token_type_id,
@@ -485,7 +499,10 @@
     def _batch_encode_plus(
         self,
         batch_text_or_text_pairs: Union[
-            List[TextInput], List[TextInputPair], List[PreTokenizedInput], List[PreTokenizedInputPair]
+            List[TextInput],
+            List[TextInputPair],
+            List[PreTokenizedInput],
+            List[PreTokenizedInputPair],
         ],
         add_special_tokens: bool = True,
         padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
@@ -624,7 +641,7 @@ def _encode_plus(
         if return_tensors is None and not return_overflowing_tokens:
             batched_output = BatchEncoding(
                 {
-                    key: value[0] if len(value) > 0 and isinstance(value[0], list) else value
+                    key: (value[0] if len(value) > 0 and isinstance(value[0], list) else value)
                     for key, value in batched_output.items()
                 },
                 batched_output.encodings,
@@ -635,7 +652,11 @@
         return batched_output
 
     def convert_tokens_to_string(self, tokens: List[str]) -> str:
-        return self.backend_tokenizer.decoder.decode(tokens)
+        return (
+            self.backend_tokenizer.decoder.decode(tokens)
+            if self.backend_tokenizer.decoder is not None
+            else " ".join(tokens)
+        )
 
     def _decode(
         self,
@@ -689,7 +710,8 @@ def _save_pretrained(
 
         if save_slow:
             added_tokens_file = os.path.join(
-                save_directory, (filename_prefix + "-" if filename_prefix else "") + ADDED_TOKENS_FILE
+                save_directory,
+                (filename_prefix + "-" if filename_prefix else "") + ADDED_TOKENS_FILE,
             )
             # make sure to be forward compatible
             added_vocab = {tok: index for tok, index in self.added_tokens_encoder.items() if index >= self.vocab_size}
@@ -703,7 +725,8 @@
 
         if save_fast:
             tokenizer_file = os.path.join(
-                save_directory, (filename_prefix + "-" if filename_prefix else "") + TOKENIZER_FILE
+                save_directory,
+                (filename_prefix + "-" if filename_prefix else "") + TOKENIZER_FILE,
            )
             self.backend_tokenizer.save(tokenizer_file)
             file_names = file_names + (tokenizer_file,)
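For context, a minimal sketch of the behavior this commit changes. The toy WordLevel tokenizer below is a hypothetical stand-in, chosen only because a freshly built backend Tokenizer has no decoder configured; any PreTrainedTokenizerFast whose backend decoder is None takes the same code path.

    from tokenizers import Tokenizer
    from tokenizers.models import WordLevel
    from tokenizers.pre_tokenizers import Whitespace

    from transformers import PreTrainedTokenizerFast

    # Build a backend tokenizer without setting a decoder (toy vocab for illustration).
    backend = Tokenizer(WordLevel({"hello": 0, "world": 1, "[UNK]": 2}, unk_token="[UNK]"))
    backend.pre_tokenizer = Whitespace()
    assert backend.decoder is None  # no decoder configured on the backend

    tok = PreTrainedTokenizerFast(tokenizer_object=backend, unk_token="[UNK]")

    # Before this commit: AttributeError ('NoneType' object has no attribute 'decode').
    # After: falls back to " ".join(tokens).
    print(tok.convert_tokens_to_string(["hello", "world"]))  # -> "hello world"

Note that the " ".join(tokens) fallback is only an approximation for subword vocabularies: pieces that are not whitespace-delimited words will not round-trip back to the original string.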
