index/fulltext_index/tokenizer.rs

// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

use crate::fulltext_index::error::Result;
use crate::Bytes;

lazy_static::lazy_static! {
    // Shared Jieba segmenter with the default dictionary, initialized on first use.
    static ref JIEBA: jieba_rs::Jieba = jieba_rs::Jieba::new();
}

/// Lookup table indexed by byte value: `A`-`Z`, `a`-`z`, `0`-`9`, and `_` map to `true`.
const VALID_ASCII_TOKEN: [bool; 256] = [
    false, false, false, false, false, false, false, false, false, false, false, false, false,
    false, false, false, false, false, false, false, false, false, false, false, false, false,
    false, false, false, false, false, false, false, false, false, false, false, false, false,
    false, false, false, false, false, false, false, false, false, true, true, true, true, true,
    true, true, true, true, true, false, false, false, false, false, false, false, true, true,
    true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true,
    true, true, true, true, true, true, true, true, false, false, false, false, true, false, true,
    true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true,
    true, true, true, true, true, true, true, true, true, false, false, false, false, false, false,
    false, false, false, false, false, false, false, false, false, false, false, false, false,
    false, false, false, false, false, false, false, false, false, false, false, false, false,
    false, false, false, false, false, false, false, false, false, false, false, false, false,
    false, false, false, false, false, false, false, false, false, false, false, false, false,
    false, false, false, false, false, false, false, false, false, false, false, false, false,
    false, false, false, false, false, false, false, false, false, false, false, false, false,
    false, false, false, false, false, false, false, false, false, false, false, false, false,
    false, false, false, false, false, false, false, false, false, false, false, false, false,
    false, false, false, false, false, false, false, false, false, false, false, false, false,
    false, false, false, false, false, false, false, false, false, false,
];

/// `Tokenizer` tokenizes a text into a list of tokens.
pub trait Tokenizer: Send {
    fn tokenize<'a>(&self, text: &'a str) -> Vec<&'a str>;
}

/// `EnglishTokenizer` tokenizes an English text.
///
/// It splits the text on characters that are not alphanumeric or underscores.
#[derive(Debug, Default)]
pub struct EnglishTokenizer;

impl Tokenizer for EnglishTokenizer {
    fn tokenize<'a>(&self, text: &'a str) -> Vec<&'a str> {
        if text.is_ascii() {
            // Fast path: scan the bytes and slice out runs of valid token bytes
            // using the `VALID_ASCII_TOKEN` lookup table.
            let mut tokens = Vec::new();
            let mut start = 0;
            for (i, &byte) in text.as_bytes().iter().enumerate() {
                if !VALID_ASCII_TOKEN[byte as usize] {
                    if start < i {
                        tokens.push(&text[start..i]);
                    }
                    start = i + 1;
                }
            }

            // Push the trailing token, if any.
            if start < text.len() {
                tokens.push(&text[start..]);
            }

            tokens
        } else {
            // General path: split on any character that is neither alphanumeric
            // nor an underscore, dropping empty segments.
            text.split(|c: char| !c.is_alphanumeric() && c != '_')
                .filter(|s| !s.is_empty())
                .collect()
        }
    }
}

/// `ChineseTokenizer` tokenizes a Chinese text.
///
/// It uses the Jieba tokenizer to split the text into Chinese words.
/// Pure-ASCII text is delegated to `EnglishTokenizer`.
#[derive(Debug, Default)]
pub struct ChineseTokenizer;

impl Tokenizer for ChineseTokenizer {
    fn tokenize<'a>(&self, text: &'a str) -> Vec<&'a str> {
        if text.is_ascii() {
            EnglishTokenizer {}.tokenize(text)
        } else {
            // Segment with Jieba; the second argument disables HMM-based new-word detection.
            JIEBA.cut(text, false)
        }
    }
}

/// `Analyzer` analyzes a text into a list of tokens.
///
/// It uses a `Tokenizer` to tokenize the text and optionally lowercases the tokens.
pub struct Analyzer {
    tokenizer: Box<dyn Tokenizer>,
    case_sensitive: bool,
}

impl Analyzer {
    /// Creates a new `Analyzer` with the given `Tokenizer` and case sensitivity.
    pub fn new(tokenizer: Box<dyn Tokenizer>, case_sensitive: bool) -> Self {
        Self {
            tokenizer,
            case_sensitive,
        }
    }

    /// Analyzes the given text into a list of tokens.
    ///
    /// Tokens are lowercased unless the analyzer is case sensitive.
    pub fn analyze_text(&self, text: &str) -> Result<Vec<Bytes>> {
        let res = self
            .tokenizer
            .tokenize(text)
            .iter()
            .map(|s| {
                if self.case_sensitive {
                    s.as_bytes().to_vec()
                } else {
                    s.to_lowercase().as_bytes().to_vec()
                }
            })
            .collect();
        Ok(res)
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_english_tokenizer() {
        let tokenizer = EnglishTokenizer;
        let text = "Hello, world!!! This is a----++   test012_345+67890";
        let tokens = tokenizer.tokenize(text);
        assert_eq!(
            tokens,
            vec!["Hello", "world", "This", "is", "a", "test012_345", "67890"]
        );
    }
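
    // Illustrative edge-case check for the ASCII fast path: empty and
    // delimiter-only input should yield no tokens.
    #[test]
    fn test_english_tokenizer_empty_and_delimiters_only() {
        let tokenizer = EnglishTokenizer;
        assert!(tokenizer.tokenize("").is_empty());
        assert!(tokenizer.tokenize("++--  !!,,..").is_empty());
    }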

    #[test]
    fn test_english_tokenizer_with_utf8() {
        let tokenizer = EnglishTokenizer;
        let text = "💸unfold the 纸巾😣and gently 清洁表😭面";
        let tokens = tokenizer.tokenize(text);
        assert_eq!(
            tokens,
            // We don't particularly care how non-ASCII characters are handled here:
            // using `EnglishTokenizer` on non-ASCII text is effectively a misconfiguration.
            vec!["unfold", "the", "纸巾", "and", "gently", "清洁表", "面"]
        );
    }

    #[test]
    fn test_chinese_tokenizer() {
        let tokenizer = ChineseTokenizer;
        let text = "我喜欢苹果";
        let tokens = tokenizer.tokenize(text);
        assert_eq!(tokens, vec!["我", "喜欢", "苹果"]);
    }
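
    // Illustrative check of the ASCII fallback: pure-ASCII input goes through
    // `EnglishTokenizer` rather than Jieba.
    #[test]
    fn test_chinese_tokenizer_with_ascii() {
        let tokenizer = ChineseTokenizer;
        let tokens = tokenizer.tokenize("hello rust_world 42");
        assert_eq!(tokens, vec!["hello", "rust_world", "42"]);
    }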

    #[test]
    fn test_valid_ascii_token_lookup_table() {
        // Check every byte value (0-255) against the expected predicate:
        // a byte is a valid token byte iff it is ASCII alphanumeric or '_'.
        for c in 0u8..=255u8 {
            let is_valid = VALID_ASCII_TOKEN[c as usize];
            let should_be_valid = (c as char).is_ascii_alphanumeric() || c == b'_';

            assert_eq!(
                is_valid,
                should_be_valid,
                "Character '{}' (byte {}) validity mismatch: expected {}, got {}",
                if c.is_ascii() && !c.is_ascii_control() {
                    c as char
                } else {
                    '?'
                },
                c,
                should_be_valid,
                is_valid
            );
        }
    }

    #[test]
    fn test_analyzer() {
        let tokenizer = EnglishTokenizer;
        let analyzer = Analyzer::new(Box::new(tokenizer), false);
        let text = "Hello, world! This is a test.";
        let tokens = analyzer.analyze_text(text).unwrap();
        assert_eq!(
            tokens,
            vec![
                b"hello".to_vec(),
                b"world".to_vec(),
                b"this".to_vec(),
                b"is".to_vec(),
                b"a".to_vec(),
                b"test".to_vec()
            ]
        );
    }
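
    // Illustrative counterpart to `test_analyzer`: with case sensitivity enabled,
    // tokens should keep their original casing.
    #[test]
    fn test_analyzer_case_sensitive() {
        let analyzer = Analyzer::new(Box::new(EnglishTokenizer), true);
        let tokens = analyzer.analyze_text("Hello World").unwrap();
        assert_eq!(tokens, vec![b"Hello".to_vec(), b"World".to_vec()]);
    }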
}