index/fulltext_index/tokenizer.rs

// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

use crate::Bytes;
use crate::fulltext_index::error::Result;

lazy_static::lazy_static! {
    static ref JIEBA: jieba_rs::Jieba = jieba_rs::Jieba::new();
}

/// Byte-indexed lookup table: `A`-`Z`, `a`-`z`, `0`-`9`, and `_` map to `true`.
const VALID_ASCII_TOKEN: [bool; 256] = [
    false, false, false, false, false, false, false, false, false, false, false, false, false,
    false, false, false, false, false, false, false, false, false, false, false, false, false,
    false, false, false, false, false, false, false, false, false, false, false, false, false,
    false, false, false, false, false, false, false, false, false, true, true, true, true, true,
    true, true, true, true, true, false, false, false, false, false, false, false, true, true,
    true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true,
    true, true, true, true, true, true, true, true, false, false, false, false, true, false, true,
    true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true,
    true, true, true, true, true, true, true, true, true, false, false, false, false, false, false,
    false, false, false, false, false, false, false, false, false, false, false, false, false,
    false, false, false, false, false, false, false, false, false, false, false, false, false,
    false, false, false, false, false, false, false, false, false, false, false, false, false,
    false, false, false, false, false, false, false, false, false, false, false, false, false,
    false, false, false, false, false, false, false, false, false, false, false, false, false,
    false, false, false, false, false, false, false, false, false, false, false, false, false,
    false, false, false, false, false, false, false, false, false, false, false, false, false,
    false, false, false, false, false, false, false, false, false, false, false, false, false,
    false, false, false, false, false, false, false, false, false, false, false, false, false,
    false, false, false, false, false, false, false, false, false, false,
];
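// A hedged sketch: an equivalent table could also be built at compile time with
// a `const fn` (illustration only; this crate uses the hand-written literal above):
//
//     const fn build_valid_ascii_token() -> [bool; 256] {
//         let mut table = [false; 256];
//         let mut b = 0;
//         while b < 256 {
//             table[b] = (b as u8).is_ascii_alphanumeric() || b as u8 == b'_';
//             b += 1;
//         }
//         table
//     }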

/// `Tokenizer` tokenizes a text into a list of tokens.
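///
/// # Examples
///
/// A hypothetical implementor, shown as a sketch only (`WhitespaceTokenizer`
/// is not part of this crate):
///
/// ```ignore
/// struct WhitespaceTokenizer;
///
/// impl Tokenizer for WhitespaceTokenizer {
///     fn tokenize<'a>(&self, text: &'a str) -> Vec<&'a str> {
///         text.split_whitespace().collect()
///     }
/// }
/// ```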
pub trait Tokenizer: Send {
    fn tokenize<'a>(&self, text: &'a str) -> Vec<&'a str>;
}

/// `EnglishTokenizer` tokenizes English text.
///
/// It splits the text on any character that is neither alphanumeric nor an underscore.
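///
/// # Examples
///
/// A minimal sketch mirroring `test_english_tokenizer` below (marked `ignore`
/// because the doctest import path depends on this crate's layout):
///
/// ```ignore
/// let tokens = EnglishTokenizer.tokenize("Hello, world!!! test012_345");
/// assert_eq!(tokens, vec!["Hello", "world", "test012_345"]);
/// ```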
#[derive(Debug, Default)]
pub struct EnglishTokenizer;

impl Tokenizer for EnglishTokenizer {
    fn tokenize<'a>(&self, text: &'a str) -> Vec<&'a str> {
        if text.is_ascii() {
            // Fast path: scan raw bytes against the lookup table.
            let mut tokens = Vec::new();
            let mut start = 0;
            for (i, &byte) in text.as_bytes().iter().enumerate() {
                if !VALID_ASCII_TOKEN[byte as usize] {
                    if start < i {
                        tokens.push(&text[start..i]);
                    }
                    start = i + 1;
                }
            }

            // Flush the trailing token, if any.
            if start < text.len() {
                tokens.push(&text[start..]);
            }

            tokens
        } else {
            // Slow path for non-ASCII text: split on any character that is
            // neither alphanumeric nor an underscore.
            text.split(|c: char| !c.is_alphanumeric() && c != '_')
                .filter(|s| !s.is_empty())
                .collect()
        }
    }
}

/// `ChineseTokenizer` tokenizes Chinese text.
///
/// It uses Jieba search-mode tokenization to improve recall for Chinese fulltext search.
/// Enabling HMM also helps merge some unknown fragments into larger tokens, which can reduce
/// token cardinality versus a fully fragmented output.
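///
/// # Examples
///
/// A minimal sketch mirroring `test_chinese_tokenizer` below (marked `ignore`
/// because the doctest import path depends on this crate's layout):
///
/// ```ignore
/// let tokens = ChineseTokenizer.tokenize("我喜欢苹果");
/// assert_eq!(tokens, vec!["我", "喜欢", "苹果"]);
/// ```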
#[derive(Debug, Default)]
pub struct ChineseTokenizer;

impl Tokenizer for ChineseTokenizer {
    fn tokenize<'a>(&self, text: &'a str) -> Vec<&'a str> {
        if text.is_ascii() {
            EnglishTokenizer {}.tokenize(text)
        } else {
            // Search-mode tokenization emits finer-grained searchable terms, while HMM helps
            // merge some unknown fragments and avoid excessive token fragmentation.
            let mut tokens = JIEBA
                .cut_for_search(text, true)
                .into_iter()
                .filter(|s| is_indexable_token(s))
                .collect::<Vec<_>>();

            // Second pass: re-add pure-ASCII underscore compounds (e.g. `trace_id`)
            // that Jieba splits apart, so whole identifiers remain searchable.
            let english = EnglishTokenizer {};
            tokens.extend(
                english
                    .tokenize(text)
                    .into_iter()
                    .filter(|token| is_ascii_underscore_token(token)),
            );

            tokens
        }
    }
}

/// Returns `true` if the token contains at least one alphanumeric character or
/// underscore, i.e. anything worth indexing; pure-punctuation tokens are dropped.
fn is_indexable_token(token: &str) -> bool {
    token.chars().any(|c| c.is_alphanumeric() || c == '_')
}

/// Returns `true` for pure-ASCII tokens that contain an underscore, e.g. `trace_id`.
fn is_ascii_underscore_token(token: &str) -> bool {
    token.is_ascii() && token.contains('_')
}

/// `Analyzer` analyzes text into a list of tokens.
///
/// It uses a `Tokenizer` to tokenize the text and optionally lowercases the tokens.
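///
/// # Examples
///
/// A minimal sketch mirroring `test_analyzer` below (marked `ignore` because
/// the doctest import path depends on this crate's layout):
///
/// ```ignore
/// let analyzer = Analyzer::new(Box::new(EnglishTokenizer), false);
/// let tokens = analyzer.analyze_text("Hello, world!").unwrap();
/// assert_eq!(tokens, vec![b"hello".to_vec(), b"world".to_vec()]);
/// ```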
pub struct Analyzer {
    tokenizer: Box<dyn Tokenizer>,
    case_sensitive: bool,
}

impl Analyzer {
    /// Creates a new `Analyzer` with the given `Tokenizer` and case sensitivity.
    pub fn new(tokenizer: Box<dyn Tokenizer>, case_sensitive: bool) -> Self {
        Self {
            tokenizer,
            case_sensitive,
        }
    }

    /// Analyzes the given text into a list of tokens.
    pub fn analyze_text(&self, text: &str) -> Result<Vec<Bytes>> {
        let res = self
            .tokenizer
            .tokenize(text)
            .iter()
            .map(|s| {
                if self.case_sensitive {
                    s.as_bytes().to_vec()
                } else {
                    // Reuse the lowercased String's buffer instead of copying it.
                    s.to_lowercase().into_bytes()
                }
            })
            .collect();
        Ok(res)
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_english_tokenizer() {
        let tokenizer = EnglishTokenizer;
        let text = "Hello, world!!! This is a----++   test012_345+67890 ship_ship ship__ship _ __ __IDENTIFIER__ _ship ship_";
        let tokens = tokenizer.tokenize(text);
        assert_eq!(
            tokens,
            vec![
                "Hello",
                "world",
                "This",
                "is",
                "a",
                "test012_345",
                "67890",
                "ship_ship",
                "ship__ship",
                "_",
                "__",
                "__IDENTIFIER__",
                "_ship",
                "ship_"
            ]
        );
    }

    #[test]
    fn test_english_tokenizer_with_utf8() {
        let tokenizer = EnglishTokenizer;
        let text = "💸unfold the 纸巾😣and gently 清洁表😭面";
        let tokens = tokenizer.tokenize(text);
        assert_eq!(
            tokens,
            // The treatment of non-ASCII runs is unspecified here; applying
            // EnglishTokenizer to non-ASCII text is effectively a misconfiguration.
            vec!["unfold", "the", "纸巾", "and", "gently", "清洁表", "面"]
        );
    }

    #[test]
    fn test_chinese_tokenizer() {
        let tokenizer = ChineseTokenizer;
        let text = "我喜欢苹果";
        let tokens = tokenizer.tokenize(text);
        assert_eq!(tokens, vec!["我", "喜欢", "苹果"]);
    }

    #[test]
    fn test_chinese_tokenizer_issue_7943_sample() {
        let tokenizer = ChineseTokenizer;
        let text = "[2026/04/09/ 13:56:11.031]2026-04-09 13:56:11.031 - [ trace_id=340a6a44b0bd8e37bb7697ss7da61ff0 span_id=085ff5ttf1e0a23b trace_flags=01] - [http-nio-8081-exec-16] INFO c.h.p.xx.web.service.impl.CCCXForwardKKKServiceImpl.pushout(188) - 登录手机号18888888888的动态key:829889AC8 ship_ship ship__ship _ __ __IDENTIFIER__ _ship ship_ EOF";
        let tokens = tokenizer.tokenize(text);

        assert_eq!(
            tokens,
            vec![
                "2026",
                "04",
                "09",
                "13",
                "56",
                "11.031",
                "2026-04",
                "09",
                "13",
                "56",
                "11.031",
                "trace",
                "_",
                "id",
                "340a6a44b0bd8e37bb7697ss7da61ff0",
                "span",
                "_",
                "id",
                "085ff5ttf1e0a23b",
                "trace",
                "_",
                "flags",
                "01",
                "http",
                "nio-8081",
                "exec-16",
                "INFO",
                "c",
                "h",
                "p",
                "xx",
                "web",
                "service",
                "impl",
                "CCCXForwardKKKServiceImpl",
                "pushout",
                "188",
                "登录",
                "手机",
                "手机号",
                "18888888888",
                "的",
                "动态",
                "key",
                "829889AC8",
                "ship",
                "_",
                "ship",
                "ship",
                "__",
                "ship",
                "_",
                "__",
                "__",
                "IDENTIFIER",
                "__",
                "_",
                "ship",
                "ship",
                "_",
                "EOF",
                "trace_id",
                "span_id",
                "trace_flags",
                "ship_ship",
                "ship__ship",
                "_",
                "__",
                "__IDENTIFIER__",
                "_ship",
                "ship_"
            ]
        );
    }

    #[test]
    fn test_chinese_tokenizer_keeps_ascii_underscore_compounds() {
        let tokenizer = ChineseTokenizer;
        let text = "trace_id=abc 登录手机号 dynamic_key=xyz";

        let tokens = tokenizer.tokenize(text);

        assert!(tokens.contains(&"trace_id"));
        assert!(tokens.contains(&"dynamic_key"));
        assert!(tokens.contains(&"登录"));
        assert!(tokens.contains(&"手机号"));
    }

    #[test]
    fn test_chinese_tokenizer_skips_non_ascii_underscore_tokens() {
        let tokenizer = ChineseTokenizer;
        let text = "登录_id trace_id 手机号_trace";

        let tokens = tokenizer.tokenize(text);

        assert_eq!(
            tokens,
            [
                "登录",
                "_",
                "id",
                "trace",
                "_",
                "id",
                "手机",
                "手机号",
                "_",
                "trace",
                "trace_id"
            ]
        );
    }

    #[test]
    fn test_chinese_tokenizer_aggressive_tokenization_probe() {
        let tokenizer = ChineseTokenizer;
        let text = "哈基米哦南北绿豆,噢马自立曼波。登录手机号。中国农业银行。装电视台,中国中央广播电视台。压不缩,笑不活。";

        let default_tokens = tokenizer.tokenize(text);
        let cut_hmm_false = JIEBA.cut(text, false);
        let cut_hmm_true = JIEBA.cut(text, true);
        let cut_for_search_hmm_false = JIEBA.cut_for_search(text, false);
        let cut_for_search_hmm_true = JIEBA.cut_for_search(text, true);

        assert_eq!(
            default_tokens,
            [
                "哈基米",
                "哦",
                "南北",
                "绿豆",
                "噢",
                "马",
                "自立",
                "曼波",
                "登录",
                "手机",
                "手机号",
                "中国",
                "农业",
                "银行",
                "中国农业银行",
                "装",
                "电视",
                "电视台",
                "中国",
                "中央",
                "广播",
                "电视",
                "电视台",
                "不缩",
                "压不缩",
                "笑",
                "不活",
            ]
        );
        assert_eq!(
            cut_hmm_false,
            [
                "哈",
                "基",
                "米",
                "哦",
                "南北",
                "绿豆",
                ",",
                "噢",
                "马",
                "自立",
                "曼",
                "波",
                "。",
                "登录",
                "手机号",
                "。",
                "中国农业银行",
                "。",
                "装",
                "电视台",
                ",",
                "中国",
                "中央",
                "广播",
                "电视台",
                "。",
                "压",
                "不",
                "缩",
                ",",
                "笑",
                "不",
                "活",
                "。"
            ]
        );
        assert_eq!(
            cut_hmm_true,
            [
                "哈基米",
                "哦",
                "南北",
                "绿豆",
                ",",
                "噢",
                "马",
                "自立",
                "曼波",
                "。",
                "登录",
                "手机号",
                "。",
                "中国农业银行",
                "。",
                "装",
                "电视台",
                ",",
                "中国",
                "中央",
                "广播",
                "电视台",
                "。",
                "压不缩",
                ",",
                "笑",
                "不活",
                "。"
            ]
        );
        assert_eq!(
            cut_for_search_hmm_false,
            [
                "哈",
                "基",
                "米",
                "哦",
                "南北",
                "绿豆",
                ",",
                "噢",
                "马",
                "自立",
                "曼",
                "波",
                "。",
                "登录",
                "手机",
                "手机号",
                "。",
                "中国",
                "农业",
                "银行",
                "中国农业银行",
                "。",
                "装",
                "电视",
                "电视台",
                ",",
                "中国",
                "中央",
                "广播",
                "电视",
                "电视台",
                "。",
                "压",
                "不",
                "缩",
                ",",
                "笑",
                "不",
                "活",
                "。"
            ]
        );

        assert_eq!(
            cut_for_search_hmm_true,
            [
                "哈基米",
                "哦",
                "南北",
                "绿豆",
                ",",
                "噢",
                "马",
                "自立",
                "曼波",
                "。",
                "登录",
                "手机",
                "手机号",
                "。",
                "中国",
                "农业",
                "银行",
                "中国农业银行",
                "。",
                "装",
                "电视",
                "电视台",
                ",",
                "中国",
                "中央",
                "广播",
                "电视",
                "电视台",
                "。",
                "不缩",
                "压不缩",
                ",",
                "笑",
                "不活",
                "。"
            ]
        );
    }

    #[test]
    fn test_valid_ascii_token_lookup_table() {
        // Check every byte value (not just ASCII) against the reference predicate.
        for c in 0u8..=255u8 {
            let is_valid = VALID_ASCII_TOKEN[c as usize];
            let should_be_valid = (c as char).is_ascii_alphanumeric() || c == b'_';

            assert_eq!(
                is_valid,
                should_be_valid,
                "Character '{}' (byte {}) validity mismatch: expected {}, got {}",
                if c.is_ascii() && !c.is_ascii_control() {
                    c as char
                } else {
                    '?'
                },
                c,
                should_be_valid,
                is_valid
            );
        }
    }

    #[test]
    fn test_analyzer() {
        let tokenizer = EnglishTokenizer;
        let analyzer = Analyzer::new(Box::new(tokenizer), false);
        let text = "Hello, world! This is a test.";
        let tokens = analyzer.analyze_text(text).unwrap();
        assert_eq!(
            tokens,
            vec![
                b"hello".to_vec(),
                b"world".to_vec(),
                b"this".to_vec(),
                b"is".to_vec(),
                b"a".to_vec(),
                b"test".to_vec()
            ]
        );
    }
}