Skip to main content

index/fulltext_index/
tokenizer.rs

1// Copyright 2023 Greptime Team
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15use crate::Bytes;
16use crate::fulltext_index::error::Result;
17
// Shared Jieba segmenter used by `ChineseTokenizer` for non-ASCII text.
// `lazy_static!` runs `jieba_rs::Jieba::new()` on first access and reuses that instance afterwards,
// so the segmenter is constructed at most once instead of on every `tokenize` call.
lazy_static::lazy_static! {
    static ref JIEBA: jieba_rs::Jieba = jieba_rs::Jieba::new();
}
21
/// Byte-indexed lookup table: an entry is `true` exactly for `A-Z`, `a-z`, `0-9` and `_`.
///
/// The table is filled in at compile time, so a lookup at runtime is a plain array index.
const VALID_ASCII_TOKEN: [bool; 256] = {
    let mut table = [false; 256];
    let mut slot = 0usize;
    // `for` loops are not available in const context, hence the manual `while`.
    while slot < 256 {
        // `slot` is always below 256 here, so the cast to `u8` is lossless.
        table[slot] = (slot as u8).is_ascii_alphanumeric() || slot == b'_' as usize;
        slot += 1;
    }
    table
};
44
/// `Tokenizer` tokenizes a text into a list of tokens.
///
/// Implementors must be `Send`. Returned tokens share the `'a` lifetime of the input, i.e. they
/// are string slices tied to `text` rather than newly allocated strings.
pub trait Tokenizer: Send {
    /// Splits `text` into tokens and returns them as string slices with the lifetime of `text`.
    fn tokenize<'a>(&self, text: &'a str) -> Vec<&'a str>;
}
49
50/// `EnglishTokenizer` tokenizes an English text.
51///
52/// It splits the text by non-alphabetic characters.
53#[derive(Debug, Default)]
54pub struct EnglishTokenizer;
55
56impl Tokenizer for EnglishTokenizer {
57    fn tokenize<'a>(&self, text: &'a str) -> Vec<&'a str> {
58        if text.is_ascii() {
59            let mut tokens = Vec::new();
60            let mut start = 0;
61            for (i, &byte) in text.as_bytes().iter().enumerate() {
62                if !VALID_ASCII_TOKEN[byte as usize] {
63                    if start < i {
64                        tokens.push(&text[start..i]);
65                    }
66                    start = i + 1;
67                }
68            }
69
70            if start < text.len() {
71                tokens.push(&text[start..]);
72            }
73
74            tokens
75        } else {
76            text.split(|c: char| !c.is_alphanumeric() && c != '_')
77                .filter(|s| !s.is_empty())
78                .collect()
79        }
80    }
81}
82
83/// `ChineseTokenizer` tokenizes a Chinese text.
84///
85/// It uses Jieba search-mode tokenization to improve recall for Chinese fulltext search.
86/// Enabling HMM also helps merge some unknown fragments into larger tokens, which can reduce
87/// token cardinality versus a fully fragmented output.
88#[derive(Debug, Default)]
89pub struct ChineseTokenizer;
90
91impl Tokenizer for ChineseTokenizer {
92    fn tokenize<'a>(&self, text: &'a str) -> Vec<&'a str> {
93        if text.is_ascii() {
94            EnglishTokenizer {}.tokenize(text)
95        } else {
96            // Search-mode tokenization emits finer-grained searchable terms, while HMM helps
97            // merge some unknown fragments and avoid excessive token fragmentation.
98            let mut tokens = JIEBA
99                .cut_for_search(text, true)
100                .into_iter()
101                .map(|token| token.word)
102                .filter(|token| is_indexable_token(token))
103                .collect::<Vec<_>>();
104
105            let english = EnglishTokenizer {};
106            tokens.extend(
107                english
108                    .tokenize(text)
109                    .into_iter()
110                    .filter(|token| is_ascii_underscore_token(token)),
111            );
112
113            tokens
114        }
115    }
116}
117
/// Returns `true` when `token` contains at least one alphanumeric character or `_`.
///
/// Tokens made up solely of punctuation or whitespace, as well as the empty string, yield
/// `false`.
fn is_indexable_token(token: &str) -> bool {
    token.contains(|ch: char| ch == '_' || ch.is_alphanumeric())
}
121
/// Returns `true` when `token` consists only of ASCII characters and contains at least one `_`.
fn is_ascii_underscore_token(token: &str) -> bool {
    token.is_ascii() && token.contains('_')
}
125
126/// `Analyzer` analyzes a text into a list of tokens.
127///
128/// It uses a `Tokenizer` to tokenize the text and optionally lowercases the tokens.
129pub struct Analyzer {
130    tokenizer: Box<dyn Tokenizer>,
131    case_sensitive: bool,
132}
133
134impl Analyzer {
135    /// Creates a new `Analyzer` with the given `Tokenizer` and case sensitivity.
136    pub fn new(tokenizer: Box<dyn Tokenizer>, case_sensitive: bool) -> Self {
137        Self {
138            tokenizer,
139            case_sensitive,
140        }
141    }
142
143    /// Analyzes the given text into a list of tokens.
144    pub fn analyze_text(&self, text: &str) -> Result<Vec<Bytes>> {
145        let res = self
146            .tokenizer
147            .tokenize(text)
148            .iter()
149            .map(|s| {
150                if self.case_sensitive {
151                    s.as_bytes().to_vec()
152                } else {
153                    s.to_lowercase().as_bytes().to_vec()
154                }
155            })
156            .collect();
157        Ok(res)
158    }
159}
160
161#[cfg(test)]
162mod tests {
163    use super::*;
164
165    #[test]
166    fn test_english_tokenizer() {
167        let tokenizer = EnglishTokenizer;
168        let text = "Hello, world!!! This is a----++   test012_345+67890 ship_ship ship__ship _ __ __IDENTIFIER__ _ship ship_";
169        let tokens = tokenizer.tokenize(text);
170        assert_eq!(
171            tokens,
172            vec![
173                "Hello",
174                "world",
175                "This",
176                "is",
177                "a",
178                "test012_345",
179                "67890",
180                "ship_ship",
181                "ship__ship",
182                "_",
183                "__",
184                "__IDENTIFIER__",
185                "_ship",
186                "ship_"
187            ]
188        );
189    }
190
191    #[test]
192    fn test_english_tokenizer_with_utf8() {
193        let tokenizer = EnglishTokenizer;
194        let text = "💸unfold the 纸巾😣and gently 清洁表😭面";
195        let tokens = tokenizer.tokenize(text);
196        assert_eq!(
197            tokens,
198            // Don't care what happens to non-ASCII characters.
199            // It's kind of a misconfiguration to use EnglishTokenizer on non-ASCII text.
200            vec!["unfold", "the", "纸巾", "and", "gently", "清洁表", "面"]
201        );
202    }
203
204    #[test]
205    fn test_chinese_tokenizer() {
206        let tokenizer = ChineseTokenizer;
207        let text = "我喜欢苹果";
208        let tokens = tokenizer.tokenize(text);
209        assert_eq!(tokens, vec!["我", "喜欢", "苹果"]);
210    }
211
212    #[test]
213    fn test_chinese_tokenizer_issue_7943_sample() {
214        let tokenizer = ChineseTokenizer;
215        let text = "[2026/04/09/ 13:56:11.031]2026-04-09 13:56:11.031 - [ trace_id=340a6a44b0bd8e37bb7697ss7da61ff0 span_id=085ff5ttf1e0a23b trace_flags=01] - [http-nio-8081-exec-16] INFO c.h.p.xx.web.service.impl.CCCXForwardKKKServiceImpl.pushout(188) - 登录手机号18888888888的动态key:829889AC8 ship_ship ship__ship _ __ __IDENTIFIER__ _ship ship_ EOF";
216        let tokens = tokenizer.tokenize(text);
217
218        assert_eq!(
219            tokens,
220            vec![
221                "2026",
222                "04",
223                "09",
224                "13",
225                "56",
226                "11.031",
227                "2026-04",
228                "09",
229                "13",
230                "56",
231                "11.031",
232                "trace",
233                "_",
234                "id",
235                "340a6a44b0bd8e37bb7697ss7da61ff0",
236                "span",
237                "_",
238                "id",
239                "085ff5ttf1e0a23b",
240                "trace",
241                "_",
242                "flags",
243                "01",
244                "http",
245                "nio-8081",
246                "exec-16",
247                "INFO",
248                "c",
249                "h",
250                "p",
251                "xx",
252                "web",
253                "service",
254                "impl",
255                "CCCXForwardKKKServiceImpl",
256                "pushout",
257                "188",
258                "登录",
259                "手机",
260                "手机号",
261                "18888888888",
262                "的",
263                "动态",
264                "key",
265                "829889AC8",
266                "ship",
267                "_",
268                "ship",
269                "ship",
270                "__",
271                "ship",
272                "_",
273                "__",
274                "__",
275                "IDENTIFIER",
276                "__",
277                "_",
278                "ship",
279                "ship",
280                "_",
281                "EOF",
282                "trace_id",
283                "span_id",
284                "trace_flags",
285                "ship_ship",
286                "ship__ship",
287                "_",
288                "__",
289                "__IDENTIFIER__",
290                "_ship",
291                "ship_"
292            ]
293        );
294    }
295
296    #[test]
297    fn test_chinese_tokenizer_keeps_ascii_underscore_compounds() {
298        let tokenizer = ChineseTokenizer;
299        let text = "trace_id=abc 登录手机号 dynamic_key=xyz";
300
301        let tokens = tokenizer.tokenize(text);
302
303        assert!(tokens.contains(&"trace_id"));
304        assert!(tokens.contains(&"dynamic_key"));
305        assert!(tokens.contains(&"登录"));
306        assert!(tokens.contains(&"手机号"));
307    }
308
309    #[test]
310    fn test_chinese_tokenizer_skips_non_ascii_underscore_tokens() {
311        let tokenizer = ChineseTokenizer;
312        let text = "登录_id trace_id 手机号_trace";
313
314        let tokens = tokenizer.tokenize(text);
315
316        assert_eq!(
317            tokens,
318            [
319                "登录",
320                "_",
321                "id",
322                "trace",
323                "_",
324                "id",
325                "手机",
326                "手机号",
327                "_",
328                "trace",
329                "trace_id"
330            ]
331        );
332    }
333
334    #[test]
335    fn test_chinese_tokenizer_aggressive_tokenization_probe() {
336        let tokenizer = ChineseTokenizer;
337        let text = "哈基米哦南北绿豆,噢马自立曼波。登录手机号。中国农业银行。装电视台,中国中央广播电视台。压不缩,笑不活。";
338
339        let default_tokens = tokenizer.tokenize(text);
340        let cut_hmm_false = JIEBA
341            .cut(text, false)
342            .into_iter()
343            .map(|token| token.word)
344            .collect::<Vec<_>>();
345        let cut_hmm_true = JIEBA
346            .cut(text, true)
347            .into_iter()
348            .map(|token| token.word)
349            .collect::<Vec<_>>();
350        let cut_for_search_hmm_false = JIEBA
351            .cut_for_search(text, false)
352            .into_iter()
353            .map(|token| token.word)
354            .collect::<Vec<_>>();
355        let cut_for_search_hmm_true = JIEBA
356            .cut_for_search(text, true)
357            .into_iter()
358            .map(|token| token.word)
359            .collect::<Vec<_>>();
360
361        assert_eq!(
362            default_tokens,
363            [
364                "哈基米",
365                "哦",
366                "南北",
367                "绿豆",
368                "噢",
369                "马",
370                "自立",
371                "曼波",
372                "登录",
373                "手机",
374                "手机号",
375                "中国",
376                "农业",
377                "银行",
378                "中国农业银行",
379                "装",
380                "电视",
381                "电视台",
382                "中国",
383                "中央",
384                "广播",
385                "电视",
386                "电视台",
387                "不缩",
388                "压不缩",
389                "笑",
390                "不活",
391            ]
392        );
393        assert_eq!(
394            cut_hmm_false,
395            [
396                "哈",
397                "基",
398                "米",
399                "哦",
400                "南北",
401                "绿豆",
402                ",",
403                "噢",
404                "马",
405                "自立",
406                "曼",
407                "波",
408                "。",
409                "登录",
410                "手机号",
411                "。",
412                "中国农业银行",
413                "。",
414                "装",
415                "电视台",
416                ",",
417                "中国",
418                "中央",
419                "广播",
420                "电视台",
421                "。",
422                "压",
423                "不",
424                "缩",
425                ",",
426                "笑",
427                "不",
428                "活",
429                "。"
430            ]
431        );
432        assert_eq!(
433            cut_hmm_true,
434            [
435                "哈基米",
436                "哦",
437                "南北",
438                "绿豆",
439                ",",
440                "噢",
441                "马",
442                "自立",
443                "曼波",
444                "。",
445                "登录",
446                "手机号",
447                "。",
448                "中国农业银行",
449                "。",
450                "装",
451                "电视台",
452                ",",
453                "中国",
454                "中央",
455                "广播",
456                "电视台",
457                "。",
458                "压不缩",
459                ",",
460                "笑",
461                "不活",
462                "。"
463            ]
464        );
465        assert_eq!(
466            cut_for_search_hmm_false,
467            [
468                "哈",
469                "基",
470                "米",
471                "哦",
472                "南北",
473                "绿豆",
474                ",",
475                "噢",
476                "马",
477                "自立",
478                "曼",
479                "波",
480                "。",
481                "登录",
482                "手机",
483                "手机号",
484                "。",
485                "中国",
486                "农业",
487                "银行",
488                "中国农业银行",
489                "。",
490                "装",
491                "电视",
492                "电视台",
493                ",",
494                "中国",
495                "中央",
496                "广播",
497                "电视",
498                "电视台",
499                "。",
500                "压",
501                "不",
502                "缩",
503                ",",
504                "笑",
505                "不",
506                "活",
507                "。"
508            ]
509        );
510
511        assert_eq!(
512            cut_for_search_hmm_true,
513            [
514                "哈基米",
515                "哦",
516                "南北",
517                "绿豆",
518                ",",
519                "噢",
520                "马",
521                "自立",
522                "曼波",
523                "。",
524                "登录",
525                "手机",
526                "手机号",
527                "。",
528                "中国",
529                "农业",
530                "银行",
531                "中国农业银行",
532                "。",
533                "装",
534                "电视",
535                "电视台",
536                ",",
537                "中国",
538                "中央",
539                "广播",
540                "电视",
541                "电视台",
542                "。",
543                "不缩",
544                "压不缩",
545                ",",
546                "笑",
547                "不活",
548                "。"
549            ]
550        );
551    }
552
553    #[test]
554    fn test_valid_ascii_token_lookup_table() {
555        // Test all ASCII values in a single loop
556        for c in 0u8..=255u8 {
557            let is_valid = VALID_ASCII_TOKEN[c as usize];
558            let should_be_valid = (c as char).is_ascii_alphanumeric() || c == b'_';
559
560            assert_eq!(
561                is_valid,
562                should_be_valid,
563                "Character '{}' (byte {}) validity mismatch: expected {}, got {}",
564                if c.is_ascii() && !c.is_ascii_control() {
565                    c as char
566                } else {
567                    '?'
568                },
569                c,
570                should_be_valid,
571                is_valid
572            );
573        }
574    }
575
576    #[test]
577    fn test_analyzer() {
578        let tokenizer = EnglishTokenizer;
579        let analyzer = Analyzer::new(Box::new(tokenizer), false);
580        let text = "Hello, world! This is a test.";
581        let tokens = analyzer.analyze_text(text).unwrap();
582        assert_eq!(
583            tokens,
584            vec![
585                b"hello".to_vec(),
586                b"world".to_vec(),
587                b"this".to_vec(),
588                b"is".to_vec(),
589                b"a".to_vec(),
590                b"test".to_vec()
591            ]
592        );
593    }
594}