use crate::Bytes;
use crate::fulltext_index::error::Result;

lazy_static::lazy_static! {
    static ref JIEBA: jieba_rs::Jieba = jieba_rs::Jieba::new();
}

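/// Byte-indexed lookup table for the ASCII fast path: `true` exactly for
/// `[0-9A-Za-z_]`, `false` for every other byte. The exhaustive
/// `test_valid_ascii_token_lookup_table` below checks the table against
/// `is_ascii_alphanumeric`.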
const VALID_ASCII_TOKEN: [bool; 256] = [
    false, false, false, false, false, false, false, false, false, false, false, false, false,
    false, false, false, false, false, false, false, false, false, false, false, false, false,
    false, false, false, false, false, false, false, false, false, false, false, false, false,
    false, false, false, false, false, false, false, false, false, true, true, true, true, true,
    true, true, true, true, true, false, false, false, false, false, false, false, true, true,
    true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true,
    true, true, true, true, true, true, true, true, false, false, false, false, true, false, true,
    true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true,
    true, true, true, true, true, true, true, true, true, false, false, false, false, false, false,
    false, false, false, false, false, false, false, false, false, false, false, false, false,
    false, false, false, false, false, false, false, false, false, false, false, false, false,
    false, false, false, false, false, false, false, false, false, false, false, false, false,
    false, false, false, false, false, false, false, false, false, false, false, false, false,
    false, false, false, false, false, false, false, false, false, false, false, false, false,
    false, false, false, false, false, false, false, false, false, false, false, false, false,
    false, false, false, false, false, false, false, false, false, false, false, false, false,
    false, false, false, false, false, false, false, false, false, false, false, false, false,
    false, false, false, false, false, false, false, false, false, false,
];

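/// Splits text into tokens for fulltext indexing.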
pub trait Tokenizer: Send {
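    /// Tokenizes `text`; each returned token borrows from the input string.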
    fn tokenize<'a>(&self, text: &'a str) -> Vec<&'a str>;
}

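/// Tokenizer for English-like text: splits on any character that is not
/// alphanumeric or `_`, with a byte-level fast path for pure-ASCII input.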
#[derive(Debug, Default)]
pub struct EnglishTokenizer;

impl Tokenizer for EnglishTokenizer {
    fn tokenize<'a>(&self, text: &'a str) -> Vec<&'a str> {
        if text.is_ascii() {
            // Fast path: scan bytes and split on anything outside `[0-9A-Za-z_]`.
            let mut tokens = Vec::new();
            let mut start = 0;
            for (i, &byte) in text.as_bytes().iter().enumerate() {
                if !VALID_ASCII_TOKEN[byte as usize] {
                    if start < i {
                        tokens.push(&text[start..i]);
                    }
                    start = i + 1;
                }
            }

            // Flush the trailing token, if any.
            if start < text.len() {
                tokens.push(&text[start..]);
            }

            tokens
        } else {
            // General path: split on any character that is neither
            // Unicode-alphanumeric nor `_`.
            text.split(|c: char| !c.is_alphanumeric() && c != '_')
                .filter(|s| !s.is_empty())
                .collect()
        }
    }
}

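/// Tokenizer for Chinese text, backed by the shared `jieba_rs` instance.
/// Pure-ASCII input is delegated to `EnglishTokenizer`; everything else is
/// segmented with `cut_for_search` (HMM enabled), which also emits the
/// sub-words of longer segments (e.g. both "手机" and "手机号") so that
/// search-time lookups can match them.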
#[derive(Debug, Default)]
pub struct ChineseTokenizer;

impl Tokenizer for ChineseTokenizer {
    fn tokenize<'a>(&self, text: &'a str) -> Vec<&'a str> {
        if text.is_ascii() {
            EnglishTokenizer {}.tokenize(text)
        } else {
            JIEBA
                .cut_for_search(text, true)
                .into_iter()
                // Drop pure-punctuation segments such as "," and "。".
                .filter(|s| s.chars().any(|c| c.is_alphanumeric() || c == '_'))
                .collect()
        }
    }
}

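/// Applies a [`Tokenizer`] and normalizes the resulting tokens into the byte
/// terms stored in the index (lowercasing them unless `case_sensitive`).
///
/// A minimal usage sketch, mirroring `test_analyzer` below:
/// ```ignore
/// let analyzer = Analyzer::new(Box::new(EnglishTokenizer), false);
/// let terms = analyzer.analyze_text("Hello, world!").unwrap();
/// assert_eq!(terms, vec![b"hello".to_vec(), b"world".to_vec()]);
/// ```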
pub struct Analyzer {
    tokenizer: Box<dyn Tokenizer>,
    case_sensitive: bool,
}

impl Analyzer {
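    /// Builds an analyzer from a tokenizer and a case-sensitivity flag.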
    pub fn new(tokenizer: Box<dyn Tokenizer>, case_sensitive: bool) -> Self {
        Self {
            tokenizer,
            case_sensitive,
        }
    }

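    /// Tokenizes `text` and returns each token in its indexed byte form.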
    pub fn analyze_text(&self, text: &str) -> Result<Vec<Bytes>> {
        let res = self
            .tokenizer
            .tokenize(text)
            .iter()
            .map(|s| {
                if self.case_sensitive {
                    s.as_bytes().to_vec()
                } else {
                    s.to_lowercase().into_bytes()
                }
            })
            .collect();
        Ok(res)
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_english_tokenizer() {
        let tokenizer = EnglishTokenizer;
        let text = "Hello, world!!! This is a----++ test012_345+67890";
        let tokens = tokenizer.tokenize(text);
        assert_eq!(
            tokens,
            vec!["Hello", "world", "This", "is", "a", "test012_345", "67890"]
        );
    }

    #[test]
    fn test_english_tokenizer_with_utf8() {
        let tokenizer = EnglishTokenizer;
        let text = "💸unfold the 纸巾😣and gently 清洁表😭面";
        let tokens = tokenizer.tokenize(text);
        assert_eq!(
            tokens,
            vec!["unfold", "the", "纸巾", "and", "gently", "清洁表", "面"]
        );
    }

    #[test]
    fn test_chinese_tokenizer() {
        let tokenizer = ChineseTokenizer;
        let text = "我喜欢苹果";
        let tokens = tokenizer.tokenize(text);
        assert_eq!(tokens, vec!["我", "喜欢", "苹果"]);
    }

    #[test]
    fn test_chinese_tokenizer_issue_7943_sample() {
        let tokenizer = ChineseTokenizer;
        let text = "登录手机号18888888888的动态key:829889AC8";
        let tokens = tokenizer.tokenize(text);
        assert_eq!(
            tokens,
            [
                "登录",
                "手机",
                "手机号",
                "18888888888",
                "的",
                "动态",
                "key",
                "829889AC8"
            ]
        );
    }

    #[test]
    fn test_chinese_tokenizer_aggressive_tokenization_probe() {
        let tokenizer = ChineseTokenizer;
        let text = "哈基米哦南北绿豆,噢马自立曼波。登录手机号。中国农业银行。装电视台,中国中央广播电视台。压不缩,笑不活。";

        let default_tokens = tokenizer.tokenize(text);
        let cut_hmm_false = JIEBA.cut(text, false);
        let cut_hmm_true = JIEBA.cut(text, true);
        let cut_for_search_hmm_false = JIEBA.cut_for_search(text, false);
        let cut_for_search_hmm_true = JIEBA.cut_for_search(text, true);

        assert_eq!(
            default_tokens,
            [
                "哈基米",
                "哦",
                "南北",
                "绿豆",
                "噢",
                "马",
                "自立",
                "曼波",
                "登录",
                "手机",
                "手机号",
                "中国",
                "农业",
                "银行",
                "中国农业银行",
                "装",
                "电视",
                "电视台",
                "中国",
                "中央",
                "广播",
                "电视",
                "电视台",
                "不缩",
                "压不缩",
                "笑",
                "不活",
            ]
        );
        assert_eq!(
            cut_hmm_false,
            [
                "哈",
                "基",
                "米",
                "哦",
                "南北",
                "绿豆",
                ",",
                "噢",
                "马",
                "自立",
                "曼",
                "波",
                "。",
                "登录",
                "手机号",
                "。",
                "中国农业银行",
                "。",
                "装",
                "电视台",
                ",",
                "中国",
                "中央",
                "广播",
                "电视台",
                "。",
                "压",
                "不",
                "缩",
                ",",
                "笑",
                "不",
                "活",
                "。"
            ]
        );
        assert_eq!(
            cut_hmm_true,
            [
                "哈基米",
                "哦",
                "南北",
                "绿豆",
                ",",
                "噢",
                "马",
                "自立",
                "曼波",
                "。",
                "登录",
                "手机号",
                "。",
                "中国农业银行",
                "。",
                "装",
                "电视台",
                ",",
                "中国",
                "中央",
                "广播",
                "电视台",
                "。",
                "压不缩",
                ",",
                "笑",
                "不活",
                "。"
            ]
        );
        assert_eq!(
            cut_for_search_hmm_false,
            [
                "哈",
                "基",
                "米",
                "哦",
                "南北",
                "绿豆",
                ",",
                "噢",
                "马",
                "自立",
                "曼",
                "波",
                "。",
                "登录",
                "手机",
                "手机号",
                "。",
                "中国",
                "农业",
                "银行",
                "中国农业银行",
                "。",
                "装",
                "电视",
                "电视台",
                ",",
                "中国",
                "中央",
                "广播",
                "电视",
                "电视台",
                "。",
                "压",
                "不",
                "缩",
                ",",
                "笑",
                "不",
                "活",
                "。"
            ]
        );

        assert_eq!(
            cut_for_search_hmm_true,
            [
                "哈基米",
                "哦",
                "南北",
                "绿豆",
                ",",
                "噢",
                "马",
                "自立",
                "曼波",
                "。",
                "登录",
                "手机",
                "手机号",
                "。",
                "中国",
                "农业",
                "银行",
                "中国农业银行",
                "。",
                "装",
                "电视",
                "电视台",
                ",",
                "中国",
                "中央",
                "广播",
                "电视",
                "电视台",
                "。",
                "不缩",
                "压不缩",
                ",",
                "笑",
                "不活",
                "。"
            ]
        );
    }

    #[test]
    fn test_valid_ascii_token_lookup_table() {
        for c in 0u8..=255u8 {
            let is_valid = VALID_ASCII_TOKEN[c as usize];
            let should_be_valid = (c as char).is_ascii_alphanumeric() || c == b'_';

            assert_eq!(
                is_valid,
                should_be_valid,
                "Character '{}' (byte {}) validity mismatch: expected {}, got {}",
                if c.is_ascii() && !c.is_ascii_control() {
                    c as char
                } else {
                    '?'
                },
                c,
                should_be_valid,
                is_valid
            );
        }
    }

    #[test]
    fn test_analyzer() {
        let tokenizer = EnglishTokenizer;
        let analyzer = Analyzer::new(Box::new(tokenizer), false);
        let text = "Hello, world! This is a test.";
        let tokens = analyzer.analyze_text(text).unwrap();
        assert_eq!(
            tokens,
            vec![
                b"hello".to_vec(),
                b"world".to_vec(),
                b"this".to_vec(),
                b"is".to_vec(),
                b"a".to_vec(),
                b"test".to_vec()
            ]
        );
    }
}