utils

BaseTokenizer

Bases: BaseModel, ABC

Base tokenizer class providing unified tokenization interface.

This abstract base class defines the interface for different tokenization strategies including tiktoken and jieba tokenizers.

Source code in rm_gallery/core/utils/tokenizer.py
class BaseTokenizer(BaseModel, ABC):
    """
    Base tokenizer class providing unified tokenization interface.

    This abstract base class defines the interface for different tokenization
    strategies including tiktoken and jieba tokenizers.
    """

    name: str = Field(..., description="Name of the tokenizer")

    @abstractmethod
    def tokenize(self, text: str) -> List[str]:
        """
        Tokenize input text into a list of tokens.

        Args:
            text: Input text to tokenize

        Returns:
            List[str]: List of token strings
        """
        pass

    def preprocess_text(self, text: str, to_lower: bool = False) -> str:
        """
        Preprocess text before tokenization.

        Args:
            text: Input text
            to_lower: Whether to convert to lowercase

        Returns:
            str: Preprocessed text
        """
        text = text.strip()
        if to_lower:
            text = text.lower()
        return text

preprocess_text(text, to_lower=False)

Preprocess text before tokenization.

Parameters:

    Name      Type  Description                      Default
    text      str   Input text                       required
    to_lower  bool  Whether to convert to lowercase  False

Returns:

    Name  Type  Description
    str   str   Preprocessed text

Source code in rm_gallery/core/utils/tokenizer.py
def preprocess_text(self, text: str, to_lower: bool = False) -> str:
    """
    Preprocess text before tokenization.

    Args:
        text: Input text
        to_lower: Whether to convert to lowercase

    Returns:
        str: Preprocessed text
    """
    text = text.strip()
    if to_lower:
        text = text.lower()
    return text
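
For example, via any concrete subclass (SimpleTokenizer, documented below, is used here purely for illustration):

tok = SimpleTokenizer()
tok.preprocess_text("  Hello World  ", to_lower=True)  # -> "hello world"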

tokenize(text) abstractmethod

Tokenize input text into a list of tokens.

Parameters:

    Name  Type  Description             Default
    text  str   Input text to tokenize  required

Returns:

    Type       Description
    List[str]  List of token strings

Source code in rm_gallery/core/utils/tokenizer.py
@abstractmethod
def tokenize(self, text: str) -> List[str]:
    """
    Tokenize input text into a list of tokens.

    Args:
        text: Input text to tokenize

    Returns:
        List[str]: List of token strings
    """
    pass
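
A minimal sketch of a concrete subclass (illustrative only; RegexTokenizer is not part of the library, and the import path assumes the source location shown above):

import re
from typing import List

from rm_gallery.core.utils.tokenizer import BaseTokenizer


class RegexTokenizer(BaseTokenizer):
    """Example subclass that implements the required tokenize method."""

    name: str = "regex"

    def tokenize(self, text: str) -> List[str]:
        # Reuse the shared preprocessing hook, then split on word characters
        text = self.preprocess_text(text, to_lower=True)
        return re.findall(r"\w+", text)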

JiebaTokenizer

Bases: BaseTokenizer

Jieba-based tokenizer for Chinese text processing.

Provides Chinese word segmentation using the jieba library, with optional Chinese character filtering and preprocessing capabilities.

Source code in rm_gallery/core/utils/tokenizer.py
class JiebaTokenizer(BaseTokenizer):
    """
    Jieba-based tokenizer for Chinese text processing.

    Provides Chinese word segmentation using jieba library with optional
    Chinese character filtering and preprocessing capabilities.
    """

    name: str = Field(default="jieba", description="Jieba Chinese tokenizer")
    chinese_only: bool = Field(
        default=False, description="Whether to keep only Chinese characters"
    )

    def _preserve_chinese(self, text: str) -> str:
        """
        Preserve only Chinese characters.

        Args:
            text: Input text

        Returns:
            str: Text with only Chinese characters
        """
        chinese_chars = re.findall(r"[\u4e00-\u9fff]", text)
        return "".join(chinese_chars)

    def tokenize(self, text: str) -> List[str]:
        """
        Tokenize Chinese text using jieba.

        Args:
            text: Input text to tokenize

        Returns:
            List[str]: List of token strings

        Raises:
            ImportError: If jieba library is not installed
        """
        try:
            import jieba

            if self.chinese_only:
                text = self._preserve_chinese(text)
            return list(jieba.cut(text))
        except ImportError:
            raise ImportError(
                "jieba library required for Chinese tokenization: pip install jieba"
            )

tokenize(text)

Tokenize Chinese text using jieba.

Parameters:

    Name  Type  Description             Default
    text  str   Input text to tokenize  required

Returns:

    Type       Description
    List[str]  List of token strings

Raises:

    Type         Description
    ImportError  If jieba library is not installed

Source code in rm_gallery/core/utils/tokenizer.py
def tokenize(self, text: str) -> List[str]:
    """
    Tokenize Chinese text using jieba.

    Args:
        text: Input text to tokenize

    Returns:
        List[str]: List of token strings

    Raises:
        ImportError: If jieba library is not installed
    """
    try:
        import jieba

        if self.chinese_only:
            text = self._preserve_chinese(text)
        return list(jieba.cut(text))
    except ImportError:
        raise ImportError(
            "jieba library required for Chinese tokenization: pip install jieba"
        )
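
Usage sketch (assumes jieba is installed; the sample sentence and the exact segmentation are illustrative):

tokenizer = JiebaTokenizer(chinese_only=True)
tokens = tokenizer.tokenize("我喜欢自然语言处理, and some English too")
# chinese_only=True strips the non-Chinese characters first, so the result
# contains only Chinese words, e.g. ["我", "喜欢", "自然语言", "处理"]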

SimpleTokenizer

Bases: BaseTokenizer

Simple whitespace-based tokenizer.

Basic tokenizer that splits text on whitespace. Used as a fallback when other tokenizers are not available or fail.

Source code in rm_gallery/core/utils/tokenizer.py
class SimpleTokenizer(BaseTokenizer):
    """
    Simple whitespace-based tokenizer.

    Basic tokenizer that splits text on whitespace. Used as fallback
    when other tokenizers are not available or fail.
    """

    name: str = Field(default="simple", description="Simple whitespace tokenizer")

    def tokenize(self, text: str) -> List[str]:
        """
        Tokenize text by splitting on whitespace.

        Args:
            text: Input text to tokenize

        Returns:
            List[str]: List of token strings
        """
        return text.split()

tokenize(text)

Tokenize text by splitting on whitespace.

Parameters:

    Name  Type  Description             Default
    text  str   Input text to tokenize  required

Returns:

    Type       Description
    List[str]  List of token strings

Source code in rm_gallery/core/utils/tokenizer.py
def tokenize(self, text: str) -> List[str]:
    """
    Tokenize text by splitting on whitespace.

    Args:
        text: Input text to tokenize

    Returns:
        List[str]: List of token strings
    """
    return text.split()
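
Usage sketch:

tokenizer = SimpleTokenizer()
tokenizer.tokenize("  hello   world ")  # -> ["hello", "world"]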

TiktokenTokenizer

Bases: BaseTokenizer

Tiktoken-based tokenizer supporting multilingual content.

Uses tiktoken encoding for robust tokenization of Chinese, English and other languages. Falls back to simple splitting if tiktoken fails.

Source code in rm_gallery/core/utils/tokenizer.py
class TiktokenTokenizer(BaseTokenizer):
    """
    Tiktoken-based tokenizer supporting multilingual content.

    Uses tiktoken encoding for robust tokenization of Chinese, English
    and other languages. Falls back to simple splitting if tiktoken fails.
    """

    name: str = Field(default="tiktoken", description="Tiktoken tokenizer")
    encoding_name: str = Field(
        default="cl100k_base", description="Tiktoken encoding name"
    )

    def tokenize(self, text: str) -> List[str]:
        """
        Tokenize text using tiktoken encoder.

        Args:
            text: Input text to tokenize

        Returns:
            List[str]: List of token strings
        """
        try:
            import tiktoken

            encoding = tiktoken.get_encoding(self.encoding_name)
            tokens = encoding.encode(text)
            # Convert token ids back to strings for comparison
            token_strings = [encoding.decode([token]) for token in tokens]
            return token_strings
        except Exception:
            # Fallback to simple splitting if tiktoken fails
            return text.split()

tokenize(text)

Tokenize text using tiktoken encoder.

Parameters:

    Name  Type  Description             Default
    text  str   Input text to tokenize  required

Returns:

    Type       Description
    List[str]  List of token strings

Source code in rm_gallery/core/utils/tokenizer.py
def tokenize(self, text: str) -> List[str]:
    """
    Tokenize text using tiktoken encoder.

    Args:
        text: Input text to tokenize

    Returns:
        List[str]: List of token strings
    """
    try:
        import tiktoken

        encoding = tiktoken.get_encoding(self.encoding_name)
        tokens = encoding.encode(text)
        # Convert token ids back to strings for comparison
        token_strings = [encoding.decode([token]) for token in tokens]
        return token_strings
    except Exception:
        # Fallback to simple splitting if tiktoken fails
        return text.split()
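
Usage sketch (assumes tiktoken is installed; the exact token boundaries depend on the encoding and are shown only as an illustration):

tokenizer = TiktokenTokenizer(encoding_name="cl100k_base")
tokens = tokenizer.tokenize("Hello, world!")
# Each element is a decoded token string, e.g. ["Hello", ",", " world", "!"]
# If tiktoken is unavailable or raises, the call falls back to text.split()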

get_tokenizer(tokenizer_type='tiktoken', encoding_name='cl100k_base', chinese_only=False, **kwargs)

Factory function to create tokenizer instances.

Parameters:

    Name            Type  Description                                                     Default
    tokenizer_type  str   Type of tokenizer ("tiktoken", "jieba", "simple")               'tiktoken'
    encoding_name   str   Tiktoken encoding name (for tiktoken tokenizer)                 'cl100k_base'
    chinese_only    bool  Whether to keep only Chinese characters (for jieba tokenizer)   False
    **kwargs              Additional arguments for tokenizer initialization               {}

Returns:

    Name           Type           Description
    BaseTokenizer  BaseTokenizer  Tokenizer instance

Raises:

    Type        Description
    ValueError  If tokenizer_type is not supported

Source code in rm_gallery/core/utils/tokenizer.py
def get_tokenizer(
    tokenizer_type: str = "tiktoken",
    encoding_name: str = "cl100k_base",
    chinese_only: bool = False,
    **kwargs,
) -> BaseTokenizer:
    """
    Factory function to create tokenizer instances.

    Args:
        tokenizer_type: Type of tokenizer ("tiktoken", "jieba", "simple")
        encoding_name: Tiktoken encoding name (for tiktoken tokenizer)
        chinese_only: Whether to keep only Chinese characters (for jieba tokenizer)
        **kwargs: Additional arguments for tokenizer initialization

    Returns:
        BaseTokenizer: Tokenizer instance

    Raises:
        ValueError: If tokenizer_type is not supported
    """
    if tokenizer_type == "tiktoken":
        return TiktokenTokenizer(encoding_name=encoding_name, **kwargs)
    elif tokenizer_type == "jieba":
        return JiebaTokenizer(chinese_only=chinese_only, **kwargs)
    elif tokenizer_type == "simple":
        return SimpleTokenizer(**kwargs)
    else:
        raise ValueError(
            f"Unsupported tokenizer type: {tokenizer_type}. "
            f"Supported types: tiktoken, jieba, simple"
        )
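
Usage sketch of the factory (the keyword arguments shown are the documented ones; the import path assumes the source location shown above):

from rm_gallery.core.utils.tokenizer import get_tokenizer

default_tok = get_tokenizer()                            # tiktoken with cl100k_base
chinese_tok = get_tokenizer("jieba", chinese_only=True)  # jieba, Chinese characters only
fallback_tok = get_tokenizer("simple")                   # whitespace splitting

get_tokenizer("unknown")  # raises ValueError: Unsupported tokenizer type ...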