index/fulltext_index/tokenizer.rs

// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

use crate::fulltext_index::error::Result;
use crate::Bytes;

lazy_static::lazy_static! {
    static ref JIEBA: jieba_rs::Jieba = jieba_rs::Jieba::new();
}

/// `Tokenizer` tokenizes a text into a list of tokens.
pub trait Tokenizer: Send {
    fn tokenize<'a>(&self, text: &'a str) -> Vec<&'a str>;
}

/// `EnglishTokenizer` tokenizes an English text.
///
/// It splits the text by non-alphanumeric characters.
#[derive(Debug, Default)]
pub struct EnglishTokenizer;

impl Tokenizer for EnglishTokenizer {
    fn tokenize<'a>(&self, text: &'a str) -> Vec<&'a str> {
        text.split(|c: char| !c.is_alphanumeric())
            .filter(|s| !s.is_empty())
            .collect()
    }
}

/// `ChineseTokenizer` tokenizes a Chinese text.
///
/// It uses the Jieba tokenizer to split the text into Chinese words.
/// Text that is entirely ASCII falls back to `EnglishTokenizer`.
#[derive(Debug, Default)]
pub struct ChineseTokenizer;

impl Tokenizer for ChineseTokenizer {
    fn tokenize<'a>(&self, text: &'a str) -> Vec<&'a str> {
        if text.is_ascii() {
            EnglishTokenizer {}.tokenize(text)
        } else {
            JIEBA.cut(text, false)
        }
    }
}

/// `Analyzer` analyzes a text into a list of tokens.
///
/// It uses a `Tokenizer` to tokenize the text and optionally lowercases the tokens.
pub struct Analyzer {
    tokenizer: Box<dyn Tokenizer>,
    case_sensitive: bool,
}

impl Analyzer {
    /// Creates a new `Analyzer` with the given `Tokenizer` and case sensitivity.
    pub fn new(tokenizer: Box<dyn Tokenizer>, case_sensitive: bool) -> Self {
        Self {
            tokenizer,
            case_sensitive,
        }
    }

    /// Analyzes the given text into a list of tokens.
    pub fn analyze_text(&self, text: &str) -> Result<Vec<Bytes>> {
        let res = self
            .tokenizer
            .tokenize(text)
            .iter()
            .map(|s| {
                if self.case_sensitive {
                    s.as_bytes().to_vec()
                } else {
                    s.to_lowercase().as_bytes().to_vec()
                }
            })
            .collect();
        Ok(res)
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_english_tokenizer() {
        let tokenizer = EnglishTokenizer;
        let text = "Hello, world! This is a test0.";
        let tokens = tokenizer.tokenize(text);
        assert_eq!(tokens, vec!["Hello", "world", "This", "is", "a", "test0"]);
    }

    #[test]
    fn test_chinese_tokenizer() {
        let tokenizer = ChineseTokenizer;
        let text = "我喜欢苹果";
        let tokens = tokenizer.tokenize(text);
        assert_eq!(tokens, vec!["我", "喜欢", "苹果"]);
    }
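
    // A minimal extra check, not part of the original suite (name is hypothetical):
    // pure-ASCII input should take the `is_ascii` branch of `ChineseTokenizer` and
    // be split the same way `EnglishTokenizer` splits it.
    #[test]
    fn test_chinese_tokenizer_ascii_fallback() {
        let tokenizer = ChineseTokenizer;
        let text = "rust is great";
        let tokens = tokenizer.tokenize(text);
        assert_eq!(tokens, vec!["rust", "is", "great"]);
    }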

    #[test]
    fn test_analyzer() {
        let tokenizer = EnglishTokenizer;
        let analyzer = Analyzer::new(Box::new(tokenizer), false);
        let text = "Hello, world! This is a test.";
        let tokens = analyzer.analyze_text(text).unwrap();
        assert_eq!(
            tokens,
            vec![
                b"hello".to_vec(),
                b"world".to_vec(),
                b"this".to_vec(),
                b"is".to_vec(),
                b"a".to_vec(),
                b"test".to_vec()
            ]
        );
    }
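
    // A sketch of a complementary test, not part of the original suite (name is
    // hypothetical): with `case_sensitive` set to true, `Analyzer` should keep the
    // tokens' original casing instead of lowercasing them.
    #[test]
    fn test_analyzer_case_sensitive() {
        let analyzer = Analyzer::new(Box::new(EnglishTokenizer), true);
        let tokens = analyzer.analyze_text("Hello World").unwrap();
        assert_eq!(tokens, vec![b"Hello".to_vec(), b"World".to_vec()]);
    }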
}