// index/fulltext_index/tokenizer.rs

// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
14
use std::sync::LazyLock;

use crate::Bytes;
use crate::fulltext_index::error::Result;
17
18lazy_static::lazy_static! {
19    static ref JIEBA: jieba_rs::Jieba = jieba_rs::Jieba::new();
20}
21
/// Lookup table over all 256 byte values: `true` exactly for bytes that may
/// appear inside an ASCII token — `A-Z`, `a-z`, `0-9`, and `_`.
///
/// Built at compile time from the predicate itself instead of a hand-written
/// 256-entry literal, so the table cannot drift out of sync with the intended
/// character class (the sibling unit test checks every byte against the same
/// predicate).
const VALID_ASCII_TOKEN: [bool; 256] = {
    let mut table = [false; 256];
    let mut i = 0usize;
    // `while` (not `for`) because iterators are unavailable in const context.
    while i < 256 {
        let b = i as u8;
        table[i] = b.is_ascii_alphanumeric() || b == b'_';
        i += 1;
    }
    table
};
44
/// `Tokenizer` tokenizes a text into a list of tokens.
///
/// Tokens borrow from the input (`&'a str` sub-slices), so tokenization is
/// zero-copy; callers that need owned tokens copy them afterwards.
pub trait Tokenizer: Send {
    /// Splits `text` into tokens. Each implementation defines its own notion
    /// of a token boundary.
    fn tokenize<'a>(&self, text: &'a str) -> Vec<&'a str>;
}
49
/// `EnglishTokenizer` tokenizes an English text.
///
/// It splits the text by characters that are not alphanumeric and not `_`
/// (so digits and underscores stay inside tokens); see the `Tokenizer` impl
/// for the ASCII fast path.
#[derive(Debug, Default)]
pub struct EnglishTokenizer;
55
56impl Tokenizer for EnglishTokenizer {
57    fn tokenize<'a>(&self, text: &'a str) -> Vec<&'a str> {
58        if text.is_ascii() {
59            let mut tokens = Vec::new();
60            let mut start = 0;
61            for (i, &byte) in text.as_bytes().iter().enumerate() {
62                if !VALID_ASCII_TOKEN[byte as usize] {
63                    if start < i {
64                        tokens.push(&text[start..i]);
65                    }
66                    start = i + 1;
67                }
68            }
69
70            if start < text.len() {
71                tokens.push(&text[start..]);
72            }
73
74            tokens
75        } else {
76            text.split(|c: char| !c.is_alphanumeric() && c != '_')
77                .filter(|s| !s.is_empty())
78                .collect()
79        }
80    }
81}
82
/// `ChineseTokenizer` tokenizes a Chinese text.
///
/// It uses Jieba search-mode tokenization to improve recall for Chinese fulltext search.
/// Enabling HMM also helps merge some unknown fragments into larger tokens, which can reduce
/// token cardinality versus a fully fragmented output.
///
/// Pure-ASCII input is delegated to `EnglishTokenizer` (see the `Tokenizer` impl).
#[derive(Debug, Default)]
pub struct ChineseTokenizer;
90
91impl Tokenizer for ChineseTokenizer {
92    fn tokenize<'a>(&self, text: &'a str) -> Vec<&'a str> {
93        if text.is_ascii() {
94            EnglishTokenizer {}.tokenize(text)
95        } else {
96            // Search-mode tokenization emits finer-grained searchable terms, while HMM helps
97            // merge some unknown fragments and avoid excessive token fragmentation.
98            JIEBA
99                .cut_for_search(text, true)
100                .into_iter()
101                .filter(|s| s.chars().any(|c| c.is_alphanumeric() || c == '_'))
102                .collect()
103        }
104    }
105}
106
/// `Analyzer` analyzes a text into a list of tokens.
///
/// It uses a `Tokenizer` to tokenize the text and optionally lowercases the
/// tokens, emitting each token as raw bytes.
pub struct Analyzer {
    /// Tokenizer used to split the input text.
    tokenizer: Box<dyn Tokenizer>,
    /// When `false`, tokens are lowercased before being emitted.
    case_sensitive: bool,
}
114
115impl Analyzer {
116    /// Creates a new `Analyzer` with the given `Tokenizer` and case sensitivity.
117    pub fn new(tokenizer: Box<dyn Tokenizer>, case_sensitive: bool) -> Self {
118        Self {
119            tokenizer,
120            case_sensitive,
121        }
122    }
123
124    /// Analyzes the given text into a list of tokens.
125    pub fn analyze_text(&self, text: &str) -> Result<Vec<Bytes>> {
126        let res = self
127            .tokenizer
128            .tokenize(text)
129            .iter()
130            .map(|s| {
131                if self.case_sensitive {
132                    s.as_bytes().to_vec()
133                } else {
134                    s.to_lowercase().as_bytes().to_vec()
135                }
136            })
137            .collect();
138        Ok(res)
139    }
140}
141
#[cfg(test)]
mod tests {
    use super::*;

    // Repeated delimiters must collapse: no empty tokens are emitted.
    #[test]
    fn test_english_tokenizer() {
        let tokenizer = EnglishTokenizer;
        let text = "Hello, world!!! This is a----++   test012_345+67890";
        let tokens = tokenizer.tokenize(text);
        assert_eq!(
            tokens,
            vec!["Hello", "world", "This", "is", "a", "test012_345", "67890"]
        );
    }

    #[test]
    fn test_english_tokenizer_with_utf8() {
        let tokenizer = EnglishTokenizer;
        let text = "💸unfold the 纸巾😣and gently 清洁表😭面";
        let tokens = tokenizer.tokenize(text);
        assert_eq!(
            tokens,
            // Don't care what happens to non-ASCII characters.
            // It's kind of a misconfiguration to use EnglishTokenizer on non-ASCII text.
            vec!["unfold", "the", "纸巾", "and", "gently", "清洁表", "面"]
        );
    }

    #[test]
    fn test_chinese_tokenizer() {
        let tokenizer = ChineseTokenizer;
        let text = "我喜欢苹果";
        let tokens = tokenizer.tokenize(text);
        assert_eq!(tokens, vec!["我", "喜欢", "苹果"]);
    }

    // Mixed Chinese/ASCII sample: long digit runs and ASCII keys must survive
    // as single tokens alongside search-mode overlapping Chinese terms.
    #[test]
    fn test_chinese_tokenizer_issue_7943_sample() {
        let tokenizer = ChineseTokenizer;
        let text = "登录手机号18888888888的动态key:829889AC8";
        let tokens = tokenizer.tokenize(text);
        assert_eq!(
            tokens,
            [
                "登录",
                "手机",
                "手机号",
                "18888888888",
                "的",
                "动态",
                "key",
                "829889AC8"
            ]
        );
    }

    // NOTE(review): the expectations below pin the segmentation output of the
    // currently-pinned jieba_rs version across its four cut modes; a jieba_rs
    // upgrade may legitimately change them.
    #[test]
    fn test_chinese_tokenizer_aggressive_tokenization_probe() {
        let tokenizer = ChineseTokenizer;
        let text = "哈基米哦南北绿豆,噢马自立曼波。登录手机号。中国农业银行。装电视台,中国中央广播电视台。压不缩,笑不活。";

        let default_tokens = tokenizer.tokenize(text);
        let cut_hmm_false = JIEBA.cut(text, false);
        let cut_hmm_true = JIEBA.cut(text, true);
        let cut_for_search_hmm_false = JIEBA.cut_for_search(text, false);
        let cut_for_search_hmm_true = JIEBA.cut_for_search(text, true);

        // ChineseTokenizer output = cut_for_search(hmm=true) minus punctuation.
        assert_eq!(
            default_tokens,
            [
                "哈基米",
                "哦",
                "南北",
                "绿豆",
                "噢",
                "马",
                "自立",
                "曼波",
                "登录",
                "手机",
                "手机号",
                "中国",
                "农业",
                "银行",
                "中国农业银行",
                "装",
                "电视",
                "电视台",
                "中国",
                "中央",
                "广播",
                "电视",
                "电视台",
                "不缩",
                "压不缩",
                "笑",
                "不活",
            ]
        );
        assert_eq!(
            cut_hmm_false,
            [
                "哈",
                "基",
                "米",
                "哦",
                "南北",
                "绿豆",
                ",",
                "噢",
                "马",
                "自立",
                "曼",
                "波",
                "。",
                "登录",
                "手机号",
                "。",
                "中国农业银行",
                "。",
                "装",
                "电视台",
                ",",
                "中国",
                "中央",
                "广播",
                "电视台",
                "。",
                "压",
                "不",
                "缩",
                ",",
                "笑",
                "不",
                "活",
                "。"
            ]
        );
        assert_eq!(
            cut_hmm_true,
            [
                "哈基米",
                "哦",
                "南北",
                "绿豆",
                ",",
                "噢",
                "马",
                "自立",
                "曼波",
                "。",
                "登录",
                "手机号",
                "。",
                "中国农业银行",
                "。",
                "装",
                "电视台",
                ",",
                "中国",
                "中央",
                "广播",
                "电视台",
                "。",
                "压不缩",
                ",",
                "笑",
                "不活",
                "。"
            ]
        );
        assert_eq!(
            cut_for_search_hmm_false,
            [
                "哈",
                "基",
                "米",
                "哦",
                "南北",
                "绿豆",
                ",",
                "噢",
                "马",
                "自立",
                "曼",
                "波",
                "。",
                "登录",
                "手机",
                "手机号",
                "。",
                "中国",
                "农业",
                "银行",
                "中国农业银行",
                "。",
                "装",
                "电视",
                "电视台",
                ",",
                "中国",
                "中央",
                "广播",
                "电视",
                "电视台",
                "。",
                "压",
                "不",
                "缩",
                ",",
                "笑",
                "不",
                "活",
                "。"
            ]
        );

        assert_eq!(
            cut_for_search_hmm_true,
            [
                "哈基米",
                "哦",
                "南北",
                "绿豆",
                ",",
                "噢",
                "马",
                "自立",
                "曼波",
                "。",
                "登录",
                "手机",
                "手机号",
                "。",
                "中国",
                "农业",
                "银行",
                "中国农业银行",
                "。",
                "装",
                "电视",
                "电视台",
                ",",
                "中国",
                "中央",
                "广播",
                "电视",
                "电视台",
                "。",
                "不缩",
                "压不缩",
                ",",
                "笑",
                "不活",
                "。"
            ]
        );
    }

    // Exhaustively verifies the lookup table against the intended predicate:
    // a byte is a token byte iff it is ASCII alphanumeric or '_'.
    #[test]
    fn test_valid_ascii_token_lookup_table() {
        // Test all ASCII values in a single loop
        for c in 0u8..=255u8 {
            let is_valid = VALID_ASCII_TOKEN[c as usize];
            let should_be_valid = (c as char).is_ascii_alphanumeric() || c == b'_';

            assert_eq!(
                is_valid,
                should_be_valid,
                "Character '{}' (byte {}) validity mismatch: expected {}, got {}",
                if c.is_ascii() && !c.is_ascii_control() {
                    c as char
                } else {
                    '?'
                },
                c,
                should_be_valid,
                is_valid
            );
        }
    }

    // Case-insensitive analyzer must lowercase every token before emitting bytes.
    #[test]
    fn test_analyzer() {
        let tokenizer = EnglishTokenizer;
        let analyzer = Analyzer::new(Box::new(tokenizer), false);
        let text = "Hello, world! This is a test.";
        let tokens = analyzer.analyze_text(text).unwrap();
        assert_eq!(
            tokens,
            vec![
                b"hello".to_vec(),
                b"world".to_vec(),
                b"this".to_vec(),
                b"is".to_vec(),
                b"a".to_vec(),
                b"test".to_vec()
            ]
        );
    }
}