use crate::fulltext_index::error::Result;
use crate::Bytes;

lazy_static::lazy_static! {
    static ref JIEBA: jieba_rs::Jieba = jieba_rs::Jieba::new();
}

/// Lookup table marking the ASCII bytes that may appear inside a token:
/// digits `0-9`, letters `A-Z`/`a-z`, and `_`. Every other byte ends a token.
#[rustfmt::skip]
const VALID_ASCII_TOKEN: [bool; 256] = [
    // 0x00-0x2F: control characters, whitespace and punctuation
    false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false,
    false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false,
    false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false,
    // 0x30-0x3F: '0'..='9' are valid
    true, true, true, true, true, true, true, true, true, true, false, false, false, false, false, false,
    // 0x40-0x5F: 'A'..='Z' and '_' are valid
    false, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true,
    true, true, true, true, true, true, true, true, true, true, true, false, false, false, false, true,
    // 0x60-0x7F: 'a'..='z' are valid
    false, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true,
    true, true, true, true, true, true, true, true, true, true, true, false, false, false, false, false,
    // 0x80-0xFF: non-ASCII bytes never belong to a token
    false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false,
    false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false,
    false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false,
    false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false,
    false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false,
    false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false,
    false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false,
    false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false,
];

/// Tokenizes text into a list of tokens.
pub trait Tokenizer: Send {
    /// Splits `text` into tokens, borrowing slices from the input.
    fn tokenize<'a>(&self, text: &'a str) -> Vec<&'a str>;
}

/// [`Tokenizer`] for English-like text.
///
/// Splits on any character that is not alphanumeric or `_`, with a
/// byte-level fast path for pure-ASCII input.
#[derive(Debug, Default)]
pub struct EnglishTokenizer;

impl Tokenizer for EnglishTokenizer {
    fn tokenize<'a>(&self, text: &'a str) -> Vec<&'a str> {
        if text.is_ascii() {
            // Fast path: scan bytes and cut at every byte that cannot be part of a token.
            let mut tokens = Vec::new();
            let mut start = 0;
            for (i, &byte) in text.as_bytes().iter().enumerate() {
                if !VALID_ASCII_TOKEN[byte as usize] {
                    if start < i {
                        tokens.push(&text[start..i]);
                    }
                    start = i + 1;
                }
            }

            if start < text.len() {
                tokens.push(&text[start..]);
            }

            tokens
        } else {
            // General path: split on Unicode-aware boundaries.
            text.split(|c: char| !c.is_alphanumeric() && c != '_')
                .filter(|s| !s.is_empty())
                .collect()
        }
    }
}

/// [`Tokenizer`] for Chinese text, backed by jieba.
///
/// Pure-ASCII input is delegated to [`EnglishTokenizer`].
#[derive(Debug, Default)]
pub struct ChineseTokenizer;

impl Tokenizer for ChineseTokenizer {
    fn tokenize<'a>(&self, text: &'a str) -> Vec<&'a str> {
        if text.is_ascii() {
            EnglishTokenizer {}.tokenize(text)
        } else {
            JIEBA.cut(text, false)
        }
    }
}

/// Converts text into index tokens using a [`Tokenizer`] and a
/// case-sensitivity policy.
pub struct Analyzer {
    tokenizer: Box<dyn Tokenizer>,
    case_sensitive: bool,
}

impl Analyzer {
    pub fn new(tokenizer: Box<dyn Tokenizer>, case_sensitive: bool) -> Self {
        Self {
            tokenizer,
            case_sensitive,
        }
    }

    /// Tokenizes `text` and returns each token as bytes, lowercased unless
    /// the analyzer is case sensitive.
    pub fn analyze_text(&self, text: &str) -> Result<Vec<Bytes>> {
        let res = self
            .tokenizer
            .tokenize(text)
            .iter()
            .map(|s| {
                if self.case_sensitive {
                    s.as_bytes().to_vec()
                } else {
                    s.to_lowercase().into_bytes()
                }
            })
            .collect();
        Ok(res)
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_english_tokenizer() {
        let tokenizer = EnglishTokenizer;
        let text = "Hello, world!!! This is a----++ test012_345+67890";
        let tokens = tokenizer.tokenize(text);
        assert_eq!(
            tokens,
            vec!["Hello", "world", "This", "is", "a", "test012_345", "67890"]
        );
    }
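
    #[test]
    fn test_english_tokenizer_empty_input() {
        // Illustrative edge-case check: the ASCII fast path produces no tokens
        // when the input is empty or contains only delimiter bytes.
        let tokenizer = EnglishTokenizer;
        assert!(tokenizer.tokenize("").is_empty());
        assert!(tokenizer.tokenize("   ,,, !!!").is_empty());
    }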

    #[test]
    fn test_english_tokenizer_with_utf8() {
        let tokenizer = EnglishTokenizer;
        let text = "💸unfold the 纸巾😣and gently 清洁表😭面";
        let tokens = tokenizer.tokenize(text);
        assert_eq!(
            tokens,
            vec!["unfold", "the", "纸巾", "and", "gently", "清洁表", "面"]
        );
    }

    #[test]
    fn test_chinese_tokenizer() {
        let tokenizer = ChineseTokenizer;
        let text = "我喜欢苹果";
        let tokens = tokenizer.tokenize(text);
        assert_eq!(tokens, vec!["我", "喜欢", "苹果"]);
    }
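
    #[test]
    fn test_chinese_tokenizer_ascii_fallback() {
        // Illustrative check: pure-ASCII input is delegated to the
        // EnglishTokenizer rather than being segmented by jieba.
        let tokenizer = ChineseTokenizer;
        assert_eq!(tokenizer.tokenize("hello world"), vec!["hello", "world"]);
    }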

    #[test]
    fn test_valid_ascii_token_lookup_table() {
        for c in 0u8..=255u8 {
            let is_valid = VALID_ASCII_TOKEN[c as usize];
            let should_be_valid = (c as char).is_ascii_alphanumeric() || c == b'_';

            assert_eq!(
                is_valid,
                should_be_valid,
                "Character '{}' (byte {}) validity mismatch: expected {}, got {}",
                if c.is_ascii() && !c.is_ascii_control() {
                    c as char
                } else {
                    '?'
                },
                c,
                should_be_valid,
                is_valid
            );
        }
    }

    #[test]
    fn test_analyzer() {
        let tokenizer = EnglishTokenizer;
        let analyzer = Analyzer::new(Box::new(tokenizer), false);
        let text = "Hello, world! This is a test.";
        let tokens = analyzer.analyze_text(text).unwrap();
        assert_eq!(
            tokens,
            vec![
                b"hello".to_vec(),
                b"world".to_vec(),
                b"this".to_vec(),
                b"is".to_vec(),
                b"a".to_vec(),
                b"test".to_vec()
            ]
        );
    }
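
    #[test]
    fn test_analyzer_case_sensitive() {
        // Illustrative check: with `case_sensitive` enabled the analyzer keeps
        // the original casing of each token's bytes.
        let analyzer = Analyzer::new(Box::new(EnglishTokenizer), true);
        let tokens = analyzer.analyze_text("Hello World").unwrap();
        assert_eq!(tokens, vec![b"Hello".to_vec(), b"World".to_vec()]);
    }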
}