forked from huggingface/transformers
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Updated documentation and added conversion utility (huggingface#34319)
* Updated documentation and added conversion utility * Update docs/source/en/tiktoken.md Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> * Update docs/source/en/tiktoken.md Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> * Moved util function to integration folder + allow for str * Update formatting Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> * Updated formatting * style changes --------- Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com>
- Loading branch information
1 parent
d786bd4
commit 5e0ee45
Showing
2 changed files
with
67 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,45 @@ | ||
from pathlib import Path | ||
from typing import Any | ||
|
||
from transformers.convert_slow_tokenizer import TikTokenConverter | ||
from transformers.tokenization_utils_fast import TIKTOKEN_VOCAB_FILE, TOKENIZER_FILE | ||
|
||
|
||
def convert_tiktoken_to_fast(encoding: Any, output_dir: str):
    """
    Converts given `tiktoken` encoding to `PreTrainedTokenizerFast` and saves the configuration of converted tokenizer
    on disk.

    Args:
        encoding (`str` or `tiktoken.Encoding`):
            Tokenizer from `tiktoken` library. If `encoding` is `str`, the tokenizer will be loaded with
            `tiktoken.get_encoding(encoding)`.
        output_dir (`str`):
            Save path for converted tokenizer configuration file.

    Raises:
        ValueError: If the `tiktoken` library is not installed.
    """
    output_dir = Path(output_dir)
    # `parents=True` so a nested output path (e.g. "out/run1") works too.
    output_dir.mkdir(parents=True, exist_ok=True)

    save_file = output_dir / "tiktoken" / TIKTOKEN_VOCAB_FILE
    tokenizer_file = output_dir / TOKENIZER_FILE

    # The vocab is written into an intermediate "tiktoken" folder that is not
    # created by `output_dir.mkdir` above; ensure it exists before dumping,
    # since `dump_tiktoken_bpe` does not create missing parent directories.
    save_file.parent.mkdir(parents=True, exist_ok=True)

    save_file_absolute = str(save_file.absolute())
    output_file_absolute = str(tokenizer_file.absolute())

    try:
        from tiktoken import get_encoding
        from tiktoken.load import dump_tiktoken_bpe

        # Accept an encoding name (e.g. "cl100k_base") as well as an
        # already-instantiated `tiktoken.Encoding` object.
        if isinstance(encoding, str):
            encoding = get_encoding(encoding)

        dump_tiktoken_bpe(encoding._mergeable_ranks, save_file_absolute)
    except ImportError as e:
        # Keep raising `ValueError` so existing callers' except-clauses still
        # match, but chain the original ImportError for easier debugging.
        raise ValueError(
            "`tiktoken` is required to save a `tiktoken` file. Install it with `pip install tiktoken`."
        ) from e

    # Rebuild a fast tokenizer from the dumped BPE ranks, the pre-tokenization
    # regex pattern, and the special tokens of the original encoding.
    tokenizer = TikTokenConverter(
        vocab_file=save_file_absolute, pattern=encoding._pat_str, additional_special_tokens=encoding._special_tokens
    ).tokenizer()
    tokenizer.save(output_file_absolute)