diff --git a/docs/source/api_ref_modules.rst b/docs/source/api_ref_modules.rst index 5eb8fff358..979e57347f 100644 --- a/docs/source/api_ref_modules.rst +++ b/docs/source/api_ref_modules.rst @@ -48,10 +48,10 @@ model specific tokenizers. :toctree: generated/ :nosignatures: - tokenizers.SentencePieceBaseTokenizer - tokenizers.TikTokenBaseTokenizer - tokenizers.ModelTokenizer - tokenizers.BaseTokenizer + transforms.tokenizers.SentencePieceBaseTokenizer + transforms.tokenizers.TikTokenBaseTokenizer + transforms.tokenizers.ModelTokenizer + transforms.tokenizers.BaseTokenizer Tokenizer Utilities ------------------- @@ -61,8 +61,8 @@ These are helper methods that can be used by any tokenizer. :toctree: generated/ :nosignatures: - tokenizers.tokenize_messages_no_special_tokens - tokenizers.parse_hf_tokenizer_json + transforms.tokenizers.tokenize_messages_no_special_tokens + transforms.tokenizers.parse_hf_tokenizer_json PEFT Components diff --git a/docs/source/basics/custom_components.rst b/docs/source/basics/custom_components.rst index f252cb197e..0f742644dc 100644 --- a/docs/source/basics/custom_components.rst +++ b/docs/source/basics/custom_components.rst @@ -117,7 +117,7 @@ our models in torchtune - see :func:`~torchtune.models.llama3_2_vision.llama3_2_ # from torchtune.datasets import SFTDataset, PackedDataset from torchtune.data import InputOutputToMessages - from torchtune.modules.tokenizers import ModelTokenizer + from torchtune.modules.transforms.tokenizers import ModelTokenizer # Example builder function for a custom code instruct dataset not in torchtune, but using # different dataset building blocks from torchtune diff --git a/docs/source/basics/model_transforms.rst b/docs/source/basics/model_transforms.rst index c10cb1abd8..71e7e08bd5 100644 --- a/docs/source/basics/model_transforms.rst +++ b/docs/source/basics/model_transforms.rst @@ -101,7 +101,7 @@ The following methods are required on the model transform: .. code-block:: python - from torchtune.modules.tokenizers import ModelTokenizer + from torchtune.modules.transforms.tokenizers import ModelTokenizer from torchtune.modules.transforms import Transform class MyMultimodalTransform(ModelTokenizer, Transform): diff --git a/docs/source/basics/tokenizers.rst b/docs/source/basics/tokenizers.rst index d637961c54..47be88fe0c 100644 --- a/docs/source/basics/tokenizers.rst +++ b/docs/source/basics/tokenizers.rst @@ -168,7 +168,7 @@ For example, here we change the ``"<|begin_of_text|>"`` and ``"<|end_of_text|>"` Base tokenizers --------------- -:class:`~torchtune.modules.tokenizers.BaseTokenizer` are the underlying byte-pair encoding modules that perform the actual raw string to token ID conversion and back. +Implementations of :class:`~torchtune.modules.transforms.tokenizers.BaseTokenizer` are the underlying byte-pair encoding modules that perform the actual raw string to token ID conversion and back. In torchtune, they are required to implement ``encode`` and ``decode`` methods, which are called by the :ref:`model_tokenizers` to convert between raw text and token IDs. @@ -202,13 +202,13 @@ between raw text and token IDs. """ pass -If you load any :ref:`model_tokenizers`, you can see that it calls its underlying :class:`~torchtune.modules.tokenizers.BaseTokenizer` +If you load any :ref:`model_tokenizers`, you can see that it calls its underlying :class:`~torchtune.modules.transforms.tokenizers.BaseTokenizer` to do the actual encoding and decoding. .. 
code-block:: python from torchtune.models.mistral import mistral_tokenizer - from torchtune.modules.tokenizers import SentencePieceBaseTokenizer + from torchtune.modules.transforms.tokenizers import SentencePieceBaseTokenizer m_tokenizer = mistral_tokenizer("/tmp/Mistral-7B-v0.1/tokenizer.model") # Mistral uses SentencePiece for its underlying BPE @@ -227,7 +227,7 @@ Model tokenizers ---------------- -:class:`~torchtune.modules.tokenizers.ModelTokenizer` are specific to a particular model. They are required to implement the ``tokenize_messages`` method, +Implementations of :class:`~torchtune.modules.transforms.tokenizers.ModelTokenizer` are specific to a particular model. They are required to implement the ``tokenize_messages`` method, which converts a list of Messages into a list of token IDs. .. code-block:: python @@ -259,7 +259,7 @@ is because they add all the necessary special tokens or prompt templates require .. code-block:: python from torchtune.models.mistral import mistral_tokenizer - from torchtune.modules.tokenizers import SentencePieceBaseTokenizer + from torchtune.modules.transforms.tokenizers import SentencePieceBaseTokenizer from torchtune.data import Message m_tokenizer = mistral_tokenizer("/tmp/Mistral-7B-v0.1/tokenizer.model") diff --git a/recipes/eleuther_eval.py b/recipes/eleuther_eval.py index 97744fb9b8..fd1ac8f6e1 100644 --- a/recipes/eleuther_eval.py +++ b/recipes/eleuther_eval.py @@ -31,8 +31,8 @@ from torchtune.modules import TransformerDecoder from torchtune.modules.common_utils import local_kv_cache from torchtune.modules.model_fusion import DeepFusionModel -from torchtune.modules.tokenizers import ModelTokenizer from torchtune.modules.transforms import Transform +from torchtune.modules.transforms.tokenizers import ModelTokenizer from torchtune.recipe_interfaces import EvalRecipeInterface from torchtune.training import FullModelTorchTuneCheckpointer diff --git a/tests/test_utils.py b/tests/test_utils.py index 6497539869..ca28029710 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -20,8 +20,8 @@ import torch from torch import nn from torchtune.data import Message, PromptTemplate, truncate -from torchtune.modules.tokenizers import ModelTokenizer from torchtune.modules.transforms import Transform +from torchtune.modules.transforms.tokenizers import ModelTokenizer skip_if_cuda_not_available = unittest.skipIf( not torch.cuda.is_available(), "CUDA is not available" diff --git a/tests/torchtune/modules/tokenizers/test_sentencepiece.py b/tests/torchtune/modules/transforms/tokenizers/test_sentencepiece.py similarity index 97% rename from tests/torchtune/modules/tokenizers/test_sentencepiece.py rename to tests/torchtune/modules/transforms/tokenizers/test_sentencepiece.py index d11c1b9c52..217f0bf2d8 100644 --- a/tests/torchtune/modules/tokenizers/test_sentencepiece.py +++ b/tests/torchtune/modules/transforms/tokenizers/test_sentencepiece.py @@ -7,7 +7,7 @@ import pytest from tests.common import ASSETS -from torchtune.modules.tokenizers import SentencePieceBaseTokenizer +from torchtune.modules.transforms.tokenizers import SentencePieceBaseTokenizer class TestSentencePieceBaseTokenizer: diff --git a/tests/torchtune/modules/tokenizers/test_tiktoken.py b/tests/torchtune/modules/transforms/tokenizers/test_tiktoken.py similarity index 98% rename from tests/torchtune/modules/tokenizers/test_tiktoken.py rename to tests/torchtune/modules/transforms/tokenizers/test_tiktoken.py index e7e69f62d3..5d3608d4bd 100644 --- 
a/tests/torchtune/modules/tokenizers/test_tiktoken.py +++ b/tests/torchtune/modules/transforms/tokenizers/test_tiktoken.py @@ -8,7 +8,7 @@ from tests.common import ASSETS from torchtune.models.llama3._tokenizer import CL100K_PATTERN -from torchtune.modules.tokenizers import TikTokenBaseTokenizer +from torchtune.modules.transforms.tokenizers import TikTokenBaseTokenizer class TestTikTokenBaseTokenizer: diff --git a/tests/torchtune/modules/tokenizers/test_utils.py b/tests/torchtune/modules/transforms/tokenizers/test_utils.py similarity index 94% rename from tests/torchtune/modules/tokenizers/test_utils.py rename to tests/torchtune/modules/transforms/tokenizers/test_utils.py index 2c49d82a5a..e3a11e6f36 100644 --- a/tests/torchtune/modules/tokenizers/test_utils.py +++ b/tests/torchtune/modules/transforms/tokenizers/test_utils.py @@ -9,7 +9,7 @@ from tests.test_utils import DummyTokenizer from torchtune.data import Message -from torchtune.modules.tokenizers import tokenize_messages_no_special_tokens +from torchtune.modules.transforms.tokenizers import tokenize_messages_no_special_tokens class TestTokenizerUtils: diff --git a/torchtune/data/_messages.py b/torchtune/data/_messages.py index a4e00834c2..170970e5c5 100644 --- a/torchtune/data/_messages.py +++ b/torchtune/data/_messages.py @@ -22,9 +22,10 @@ class Message: """ This class represents individual messages in a fine-tuning dataset. It supports - text-only content, text with interleaved images, and tool calls. The :class:`~torchtune.modules.tokenizers.ModelTokenizer` - will tokenize the content of the message using ``tokenize_messages`` and attach - the appropriate special tokens based on the flags set in this class. + text-only content, text with interleaved images, and tool calls. The + :class:`~torchtune.modules.transforms.tokenizers.ModelTokenizer` will tokenize + the content of the message using ``tokenize_messages`` and attach the appropriate + special tokens based on the flags set in this class. Args: role (Role): role of the message writer. 
Can be "system" for system prompts, diff --git a/torchtune/datasets/_alpaca.py b/torchtune/datasets/_alpaca.py index a881c149b0..c7795c8f28 100644 --- a/torchtune/datasets/_alpaca.py +++ b/torchtune/datasets/_alpaca.py @@ -12,7 +12,7 @@ from torchtune.datasets._packed import PackedDataset from torchtune.datasets._sft import SFTDataset -from torchtune.modules.tokenizers import ModelTokenizer +from torchtune.modules.transforms.tokenizers import ModelTokenizer def alpaca_dataset( diff --git a/torchtune/datasets/_chat.py b/torchtune/datasets/_chat.py index f126fb3979..1e3962e14b 100644 --- a/torchtune/datasets/_chat.py +++ b/torchtune/datasets/_chat.py @@ -9,7 +9,7 @@ from torchtune.data._messages import OpenAIToMessages, ShareGPTToMessages from torchtune.datasets._packed import PackedDataset from torchtune.datasets._sft import SFTDataset -from torchtune.modules.tokenizers import ModelTokenizer +from torchtune.modules.transforms.tokenizers import ModelTokenizer def chat_dataset( diff --git a/torchtune/datasets/_cnn_dailymail.py b/torchtune/datasets/_cnn_dailymail.py index d3c3af1f93..3995d46b22 100644 --- a/torchtune/datasets/_cnn_dailymail.py +++ b/torchtune/datasets/_cnn_dailymail.py @@ -8,7 +8,7 @@ from torchtune.datasets._text_completion import TextCompletionDataset -from torchtune.modules.tokenizers import ModelTokenizer +from torchtune.modules.transforms.tokenizers import ModelTokenizer def cnn_dailymail_articles_dataset( diff --git a/torchtune/datasets/_grammar.py b/torchtune/datasets/_grammar.py index 9e9d700ea6..02970cedef 100644 --- a/torchtune/datasets/_grammar.py +++ b/torchtune/datasets/_grammar.py @@ -10,7 +10,7 @@ from torchtune.data import InputOutputToMessages from torchtune.datasets._packed import PackedDataset from torchtune.datasets._sft import SFTDataset -from torchtune.modules.tokenizers import ModelTokenizer +from torchtune.modules.transforms.tokenizers import ModelTokenizer def grammar_dataset( diff --git a/torchtune/datasets/_hh_rlhf_helpful.py b/torchtune/datasets/_hh_rlhf_helpful.py index e466a8a4fd..8eea7e1a46 100644 --- a/torchtune/datasets/_hh_rlhf_helpful.py +++ b/torchtune/datasets/_hh_rlhf_helpful.py @@ -8,7 +8,7 @@ from torchtune.data import ChosenRejectedToMessages from torchtune.datasets._preference import PreferenceDataset -from torchtune.modules.tokenizers import ModelTokenizer +from torchtune.modules.transforms.tokenizers import ModelTokenizer def hh_rlhf_helpful_dataset( diff --git a/torchtune/datasets/_instruct.py b/torchtune/datasets/_instruct.py index 0dfa46146d..20168aac1d 100644 --- a/torchtune/datasets/_instruct.py +++ b/torchtune/datasets/_instruct.py @@ -9,7 +9,7 @@ from torchtune.data import InputOutputToMessages from torchtune.datasets._packed import PackedDataset from torchtune.datasets._sft import SFTDataset -from torchtune.modules.tokenizers import ModelTokenizer +from torchtune.modules.transforms.tokenizers import ModelTokenizer def instruct_dataset( diff --git a/torchtune/datasets/_preference.py b/torchtune/datasets/_preference.py index dea4eec852..c9615fe93c 100644 --- a/torchtune/datasets/_preference.py +++ b/torchtune/datasets/_preference.py @@ -11,10 +11,10 @@ from torch.utils.data import Dataset from torchtune.data import ChosenRejectedToMessages, CROSS_ENTROPY_IGNORE_IDX - -from torchtune.modules.tokenizers import ModelTokenizer from torchtune.modules.transforms import Transform +from torchtune.modules.transforms.tokenizers import ModelTokenizer + class PreferenceDataset(Dataset): """ @@ -84,7 +84,7 @@ class requires the dataset to 
have "chosen" and "rejected" model responses. Thes of messages are stored in the ``"chosen"`` and ``"rejected"`` keys. tokenizer (ModelTokenizer): Tokenizer used by the model that implements the ``tokenize_messages`` method. Since PreferenceDataset only supports text data, it requires a - :class:`~torchtune.modules.tokenizers.ModelTokenizer` instead of the ``model_transform`` in + :class:`~torchtune.modules.transforms.tokenizers.ModelTokenizer` instead of the ``model_transform`` in :class:`~torchtune.datasets.SFTDataset`. filter_fn (Optional[Callable]): callable used to filter the dataset prior to any pre-processing. See the Hugging Face `docs `_ for more diff --git a/torchtune/datasets/_samsum.py b/torchtune/datasets/_samsum.py index 905911d736..bd7f7dd8eb 100644 --- a/torchtune/datasets/_samsum.py +++ b/torchtune/datasets/_samsum.py @@ -10,7 +10,7 @@ from torchtune.data import InputOutputToMessages from torchtune.datasets._packed import PackedDataset from torchtune.datasets._sft import SFTDataset -from torchtune.modules.tokenizers import ModelTokenizer +from torchtune.modules.transforms.tokenizers import ModelTokenizer def samsum_dataset( diff --git a/torchtune/datasets/_sft.py b/torchtune/datasets/_sft.py index 9ee11244b6..0d1461dd0d 100644 --- a/torchtune/datasets/_sft.py +++ b/torchtune/datasets/_sft.py @@ -69,11 +69,13 @@ class SFTDataset(Dataset): multimodal datasets requires processing the images in a way specific to the vision encoder being used by the model and is agnostic to the specific dataset. - Tokenization is handled by the ``model_transform``. All :class:`~torchtune.modules.tokenizers.ModelTokenizer` - can be treated as a ``model_transform`` since it uses the model-specific tokenizer to - transform the list of messages outputted from the ``message_transform`` into tokens - used by the model for training. Text-only datasets will simply pass the :class:`~torchtune.modules.tokenizers.ModelTokenizer` - into ``model_transform``. Tokenizers handle prompt templating, if configured. + Tokenization is handled by the ``model_transform``. All + :class:`~torchtune.modules.transforms.tokenizers.ModelTokenizer` can be treated as + a ``model_transform`` since it uses the model-specific tokenizer to transform the + list of messages outputted from the ``message_transform`` into tokens used by the + model for training. Text-only datasets will simply pass the + :class:`~torchtune.modules.transforms.tokenizers.ModelTokenizer` into ``model_transform``. + Tokenizers handle prompt templating, if configured. Args: source (str): path to dataset repository on Hugging Face. 
For local datasets, diff --git a/torchtune/datasets/_slimorca.py b/torchtune/datasets/_slimorca.py index 126b6b92e4..2701b2d717 100644 --- a/torchtune/datasets/_slimorca.py +++ b/torchtune/datasets/_slimorca.py @@ -10,7 +10,7 @@ from torchtune.datasets._packed import PackedDataset from torchtune.datasets._sft import SFTDataset -from torchtune.modules.tokenizers import ModelTokenizer +from torchtune.modules.transforms.tokenizers import ModelTokenizer def slimorca_dataset( diff --git a/torchtune/datasets/_stack_exchange_paired.py b/torchtune/datasets/_stack_exchange_paired.py index 09eda929fe..a111d415d2 100644 --- a/torchtune/datasets/_stack_exchange_paired.py +++ b/torchtune/datasets/_stack_exchange_paired.py @@ -8,8 +8,8 @@ from torchtune.data import Message from torchtune.datasets._preference import PreferenceDataset -from torchtune.modules.tokenizers import ModelTokenizer from torchtune.modules.transforms import Transform +from torchtune.modules.transforms.tokenizers import ModelTokenizer class StackExchangePairedToMessages(Transform): diff --git a/torchtune/datasets/_text_completion.py b/torchtune/datasets/_text_completion.py index 5b5cc94299..342c6aa816 100644 --- a/torchtune/datasets/_text_completion.py +++ b/torchtune/datasets/_text_completion.py @@ -10,7 +10,7 @@ from torch.utils.data import Dataset from torchtune.data._utils import truncate from torchtune.datasets._packed import PackedDataset -from torchtune.modules.tokenizers import ModelTokenizer +from torchtune.modules.transforms.tokenizers import ModelTokenizer class TextCompletionDataset(Dataset): diff --git a/torchtune/datasets/_wikitext.py b/torchtune/datasets/_wikitext.py index 01111a25c6..4f9ada6741 100644 --- a/torchtune/datasets/_wikitext.py +++ b/torchtune/datasets/_wikitext.py @@ -13,7 +13,7 @@ TextCompletionDataset, ) -from torchtune.modules.tokenizers import ModelTokenizer +from torchtune.modules.transforms.tokenizers import ModelTokenizer def wikitext_dataset( diff --git a/torchtune/models/clip/_tokenizer.py b/torchtune/models/clip/_tokenizer.py index 69fed32c72..cdab2c9c05 100644 --- a/torchtune/models/clip/_tokenizer.py +++ b/torchtune/models/clip/_tokenizer.py @@ -7,7 +7,7 @@ import regex as re -from torchtune.modules.tokenizers._utils import BaseTokenizer +from torchtune.modules.transforms.tokenizers._utils import BaseTokenizer WORD_BOUNDARY = "" diff --git a/torchtune/models/gemma/_tokenizer.py b/torchtune/models/gemma/_tokenizer.py index e5eb89e230..dc5d2eadf8 100644 --- a/torchtune/models/gemma/_tokenizer.py +++ b/torchtune/models/gemma/_tokenizer.py @@ -7,12 +7,12 @@ from typing import Any, List, Mapping, Optional, Tuple from torchtune.data import Message, PromptTemplate -from torchtune.modules.tokenizers import ( +from torchtune.modules.transforms import Transform +from torchtune.modules.transforms.tokenizers import ( ModelTokenizer, SentencePieceBaseTokenizer, tokenize_messages_no_special_tokens, ) -from torchtune.modules.transforms import Transform WHITESPACE_CHARS = [" ", "\n", "\t", "\r", "\v"] diff --git a/torchtune/models/llama2/_tokenizer.py b/torchtune/models/llama2/_tokenizer.py index 078494c531..4e2ab6a40c 100644 --- a/torchtune/models/llama2/_tokenizer.py +++ b/torchtune/models/llama2/_tokenizer.py @@ -8,12 +8,12 @@ from torchtune.data import Message, PromptTemplate from torchtune.models.llama2._prompt_template import Llama2ChatTemplate -from torchtune.modules.tokenizers import ( +from torchtune.modules.transforms import Transform +from torchtune.modules.transforms.tokenizers import ( 
ModelTokenizer, SentencePieceBaseTokenizer, tokenize_messages_no_special_tokens, ) -from torchtune.modules.transforms import Transform WHITESPACE_CHARS = [" ", "\n", "\t", "\r", "\v"] diff --git a/torchtune/models/llama3/_model_builders.py b/torchtune/models/llama3/_model_builders.py index 0ddca90189..6c13e37cff 100644 --- a/torchtune/models/llama3/_model_builders.py +++ b/torchtune/models/llama3/_model_builders.py @@ -13,7 +13,7 @@ from torchtune.modules import TransformerDecoder from torchtune.modules.peft import LORA_ATTN_MODULES -from torchtune.modules.tokenizers import parse_hf_tokenizer_json +from torchtune.modules.transforms.tokenizers import parse_hf_tokenizer_json """ diff --git a/torchtune/models/llama3/_tokenizer.py b/torchtune/models/llama3/_tokenizer.py index 50ea0a7581..012aa9f584 100644 --- a/torchtune/models/llama3/_tokenizer.py +++ b/torchtune/models/llama3/_tokenizer.py @@ -8,8 +8,11 @@ from typing import Any, Dict, List, Mapping, Optional, Tuple from torchtune.data import Message, PromptTemplate, truncate -from torchtune.modules.tokenizers import ModelTokenizer, TikTokenBaseTokenizer from torchtune.modules.transforms import Transform +from torchtune.modules.transforms.tokenizers import ( + ModelTokenizer, + TikTokenBaseTokenizer, +) CL100K_PATTERN = r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+""" # noqa diff --git a/torchtune/models/llama3_2_vision/_transform.py b/torchtune/models/llama3_2_vision/_transform.py index eaf627d027..534ed4ab1c 100644 --- a/torchtune/models/llama3_2_vision/_transform.py +++ b/torchtune/models/llama3_2_vision/_transform.py @@ -10,8 +10,8 @@ from torchtune.models.clip import CLIPImageTransform from torchtune.models.llama3 import llama3_tokenizer -from torchtune.modules.tokenizers import ModelTokenizer from torchtune.modules.transforms import Transform, VisionCrossAttentionMask +from torchtune.modules.transforms.tokenizers import ModelTokenizer class Llama3VisionTransform(ModelTokenizer, Transform): diff --git a/torchtune/models/mistral/_tokenizer.py b/torchtune/models/mistral/_tokenizer.py index c3bbc8a4a7..49617220c3 100644 --- a/torchtune/models/mistral/_tokenizer.py +++ b/torchtune/models/mistral/_tokenizer.py @@ -8,12 +8,12 @@ from torchtune.data import Message, PromptTemplate from torchtune.models.mistral._prompt_template import MistralChatTemplate -from torchtune.modules.tokenizers import ( +from torchtune.modules.transforms import Transform +from torchtune.modules.transforms.tokenizers import ( ModelTokenizer, SentencePieceBaseTokenizer, tokenize_messages_no_special_tokens, ) -from torchtune.modules.transforms import Transform WHITESPACE_CHARS = [" ", "\n", "\t", "\r", "\v"] diff --git a/torchtune/models/phi3/_model_builders.py b/torchtune/models/phi3/_model_builders.py index 91d42623d7..e1275df783 100644 --- a/torchtune/models/phi3/_model_builders.py +++ b/torchtune/models/phi3/_model_builders.py @@ -6,7 +6,7 @@ from torchtune.modules import TransformerDecoder from torchtune.modules.peft import LORA_ATTN_MODULES from functools import partial -from torchtune.modules.tokenizers import parse_hf_tokenizer_json +from torchtune.modules.transforms.tokenizers import parse_hf_tokenizer_json from torchtune.data._prompt_templates import _TemplateType from torchtune.data._prompt_templates import _get_prompt_template diff --git a/torchtune/models/phi3/_tokenizer.py b/torchtune/models/phi3/_tokenizer.py index 38707bf26e..44f66b5934 100644 --- 
a/torchtune/models/phi3/_tokenizer.py +++ b/torchtune/models/phi3/_tokenizer.py @@ -9,8 +9,11 @@ from torchtune.data._messages import Message from torchtune.data._prompt_templates import PromptTemplate from torchtune.data._utils import truncate -from torchtune.modules.tokenizers import ModelTokenizer, SentencePieceBaseTokenizer from torchtune.modules.transforms import Transform +from torchtune.modules.transforms.tokenizers import ( + ModelTokenizer, + SentencePieceBaseTokenizer, +) PHI3_SPECIAL_TOKENS = { "<|endoftext|>": 32000, diff --git a/torchtune/models/qwen2/_model_builders.py b/torchtune/models/qwen2/_model_builders.py index 2a0ee06f83..f1ca5b8506 100644 --- a/torchtune/models/qwen2/_model_builders.py +++ b/torchtune/models/qwen2/_model_builders.py @@ -11,7 +11,7 @@ from torchtune.models.qwen2._tokenizer import QWEN2_SPECIAL_TOKENS, Qwen2Tokenizer from torchtune.modules import TransformerDecoder from torchtune.modules.peft import LORA_ATTN_MODULES -from torchtune.modules.tokenizers import parse_hf_tokenizer_json +from torchtune.modules.transforms.tokenizers import parse_hf_tokenizer_json """ Model builders build specific instantiations using component builders. For example diff --git a/torchtune/models/qwen2/_tokenizer.py b/torchtune/models/qwen2/_tokenizer.py index 0e4ee6bd35..dd6d038003 100644 --- a/torchtune/models/qwen2/_tokenizer.py +++ b/torchtune/models/qwen2/_tokenizer.py @@ -11,7 +11,7 @@ import regex as re from torchtune.data import ChatMLTemplate, Message, PromptTemplate, truncate -from torchtune.modules.tokenizers import ModelTokenizer +from torchtune.modules.transforms.tokenizers import ModelTokenizer PRETOKENIZE_REGEX = ( r"(?i:'s|'t|'re|'ve|'m|'ll|'d)|" diff --git a/torchtune/models/qwen2_5/_model_builders.py b/torchtune/models/qwen2_5/_model_builders.py index 7d39802375..716ae48329 100644 --- a/torchtune/models/qwen2_5/_model_builders.py +++ b/torchtune/models/qwen2_5/_model_builders.py @@ -11,7 +11,7 @@ from torchtune.models.qwen2_5._tokenizer import QWEN2_5_SPECIAL_TOKENS, Qwen2_5Tokenizer from torchtune.modules import TransformerDecoder from torchtune.modules.peft import LORA_ATTN_MODULES -from torchtune.modules.tokenizers import parse_hf_tokenizer_json +from torchtune.modules.transforms.tokenizers import parse_hf_tokenizer_json """ Model builders build specific instantiations using component builders. For example diff --git a/torchtune/models/t5/_tokenizer.py b/torchtune/models/t5/_tokenizer.py index f89dff00f6..e4fa9c539e 100644 --- a/torchtune/models/t5/_tokenizer.py +++ b/torchtune/models/t5/_tokenizer.py @@ -5,7 +5,9 @@ # LICENSE file in the root directory of this source tree. from typing import Any, Dict, List -from torchtune.modules.tokenizers._sentencepiece import SentencePieceBaseTokenizer +from torchtune.modules.transforms.tokenizers._sentencepiece import ( + SentencePieceBaseTokenizer, +) class T5Tokenizer(SentencePieceBaseTokenizer): diff --git a/torchtune/modules/tokenizers/__init__.py b/torchtune/modules/tokenizers/__init__.py index 2fecc279ee..f10a9b3dd6 100644 --- a/torchtune/modules/tokenizers/__init__.py +++ b/torchtune/modules/tokenizers/__init__.py @@ -4,20 +4,28 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -from ._sentencepiece import SentencePieceBaseTokenizer -from ._tiktoken import TikTokenBaseTokenizer -from ._utils import ( +# flake8: noqa: F401 + +# NOTE: This file is maintained for backward compatibility purposes. 
+# The imports below point to the new location in `torchtune.modules.transforms.tokenizers`. +# The import paths will be removed in v0.7. Please update your code to use the new path +# (torchtune.modules.transforms.tokenizers) to avoid breaking changes in future releases. + + +import warnings + +from torchtune.modules.transforms.tokenizers import ( BaseTokenizer, ModelTokenizer, parse_hf_tokenizer_json, + SentencePieceBaseTokenizer, + TikTokenBaseTokenizer, tokenize_messages_no_special_tokens, ) -__all__ = [ - "SentencePieceBaseTokenizer", - "TikTokenBaseTokenizer", - "ModelTokenizer", - "BaseTokenizer", - "tokenize_messages_no_special_tokens", - "parse_hf_tokenizer_json", -] +warnings.warn( + "The import path 'torchtune.modules.tokenizers' is deprecated and will be removed in v0.7. " + "Please update your imports to 'torchtune.modules.transforms.tokenizers'.", + DeprecationWarning, + stacklevel=2, +) diff --git a/torchtune/modules/transforms/tokenizers/__init__.py b/torchtune/modules/transforms/tokenizers/__init__.py new file mode 100644 index 0000000000..2fecc279ee --- /dev/null +++ b/torchtune/modules/transforms/tokenizers/__init__.py @@ -0,0 +1,23 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +from ._sentencepiece import SentencePieceBaseTokenizer +from ._tiktoken import TikTokenBaseTokenizer +from ._utils import ( + BaseTokenizer, + ModelTokenizer, + parse_hf_tokenizer_json, + tokenize_messages_no_special_tokens, +) + +__all__ = [ + "SentencePieceBaseTokenizer", + "TikTokenBaseTokenizer", + "ModelTokenizer", + "BaseTokenizer", + "tokenize_messages_no_special_tokens", + "parse_hf_tokenizer_json", +] diff --git a/torchtune/modules/tokenizers/_sentencepiece.py b/torchtune/modules/transforms/tokenizers/_sentencepiece.py similarity index 98% rename from torchtune/modules/tokenizers/_sentencepiece.py rename to torchtune/modules/transforms/tokenizers/_sentencepiece.py index 0b22b63ee3..8d98617378 100644 --- a/torchtune/modules/tokenizers/_sentencepiece.py +++ b/torchtune/modules/transforms/tokenizers/_sentencepiece.py @@ -7,8 +7,7 @@ from typing import List, Optional from sentencepiece import SentencePieceProcessor - -from torchtune.modules.tokenizers._utils import BaseTokenizer +from torchtune.modules.transforms.tokenizers._utils import BaseTokenizer WHITESPACE_CHARS = [" ", "\n", "\t", "\r", "\v"] diff --git a/torchtune/modules/tokenizers/_tiktoken.py b/torchtune/modules/transforms/tokenizers/_tiktoken.py similarity index 98% rename from torchtune/modules/tokenizers/_tiktoken.py rename to torchtune/modules/transforms/tokenizers/_tiktoken.py index 077b22b0cd..64733b4634 100644 --- a/torchtune/modules/tokenizers/_tiktoken.py +++ b/torchtune/modules/transforms/tokenizers/_tiktoken.py @@ -8,7 +8,7 @@ from tiktoken import Encoding from tiktoken.load import load_tiktoken_bpe -from torchtune.modules.tokenizers._utils import BaseTokenizer +from torchtune.modules.transforms.tokenizers._utils import BaseTokenizer # Constants controlling encode logic MAX_ENCODE_CHARS = 400_000 diff --git a/torchtune/modules/tokenizers/_utils.py b/torchtune/modules/transforms/tokenizers/_utils.py similarity index 97% rename from torchtune/modules/tokenizers/_utils.py rename to torchtune/modules/transforms/tokenizers/_utils.py index b580eda1c0..ff374738c7 100644 --- a/torchtune/modules/tokenizers/_utils.py +++ 
b/torchtune/modules/transforms/tokenizers/_utils.py @@ -14,8 +14,8 @@ class BaseTokenizer(Protocol): """ Abstract token encoding model that implements ``encode`` and ``decode`` methods. - See :class:`~torchtune.modules.tokenizers.SentencePieceBaseTokenizer` and - :class:`~torchtune.modules.tokenizers.TikTokenBaseTokenizer` for example implementations of this protocol. + See :class:`~torchtune.modules.transforms.tokenizers.SentencePieceBaseTokenizer` and + :class:`~torchtune.modules.transforms.tokenizers.TikTokenBaseTokenizer` for example implementations of this protocol. """ def encode(self, text: str, **kwargs: Dict[str, Any]) -> List[int]:
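
Reviewer note on the shim in torchtune/modules/tokenizers/__init__.py: below is a minimal, hypothetical sanity check (not part of this diff) that the legacy import path still resolves and warns. It assumes torchtune is installed with this change applied and that nothing in the process has imported the legacy module yet, since the module-level DeprecationWarning fires only on first import.

# Hypothetical sketch, not included in this PR: verify the deprecated path
# re-exports the same objects and emits a DeprecationWarning on first import.
import sys
import warnings

# Module-level warnings.warn only runs the first time the module is imported,
# so make sure the legacy module is not already cached.
assert "torchtune.modules.tokenizers" not in sys.modules

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    from torchtune.modules.tokenizers import ModelTokenizer as LegacyModelTokenizer

from torchtune.modules.transforms.tokenizers import ModelTokenizer

# The shim re-exports from the new location, so both paths name the same class.
assert LegacyModelTokenizer is ModelTokenizer
assert any(issubclass(w.category, DeprecationWarning) for w in caught)
print("legacy path works and warns; new path imports cleanly")

Because Python caches modules, rerunning the import in the same process records no warning; the check is only meaningful in a fresh interpreter.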