1use crate::Bytes;
16use crate::fulltext_index::error::Result;
17
lazy_static::lazy_static! {
    /// Shared jieba segmenter, built lazily on first access and reused by every caller in this
    /// module instead of being constructed per tokenization call.
    static ref JIEBA: jieba_rs::Jieba = jieba_rs::Jieba::new();
}
21
/// Lookup table indexed by byte value: `true` for the bytes that may appear inside an ASCII
/// token (`0-9`, `A-Z`, `a-z` and `_`), `false` for every other byte.
///
/// The table is computed at compile time so it cannot drift from the predicate it encodes.
const VALID_ASCII_TOKEN: [bool; 256] = {
    let mut table = [false; 256];
    let mut index = 0usize;
    while index < 256 {
        // `index` is always below 256 here, so the narrowing cast is lossless.
        let byte = index as u8;
        table[index] = byte.is_ascii_alphanumeric() || byte == b'_';
        index += 1;
    }
    table
};
44
/// Splits a piece of text into tokens.
///
/// Implementors must be [`Send`].
pub trait Tokenizer: Send {
    /// Returns the tokens found in `text`, in the order the implementation produces them.
    ///
    /// The returned slices share the lifetime of `text`, so no token data is copied.
    fn tokenize<'a>(&self, text: &'a str) -> Vec<&'a str>;
}
49
50#[derive(Debug, Default)]
54pub struct EnglishTokenizer;
55
56impl Tokenizer for EnglishTokenizer {
57 fn tokenize<'a>(&self, text: &'a str) -> Vec<&'a str> {
58 if text.is_ascii() {
59 let mut tokens = Vec::new();
60 let mut start = 0;
61 for (i, &byte) in text.as_bytes().iter().enumerate() {
62 if !VALID_ASCII_TOKEN[byte as usize] {
63 if start < i {
64 tokens.push(&text[start..i]);
65 }
66 start = i + 1;
67 }
68 }
69
70 if start < text.len() {
71 tokens.push(&text[start..]);
72 }
73
74 tokens
75 } else {
76 text.split(|c: char| !c.is_alphanumeric() && c != '_')
77 .filter(|s| !s.is_empty())
78 .collect()
79 }
80 }
81}
82
83#[derive(Debug, Default)]
89pub struct ChineseTokenizer;
90
91impl Tokenizer for ChineseTokenizer {
92 fn tokenize<'a>(&self, text: &'a str) -> Vec<&'a str> {
93 if text.is_ascii() {
94 EnglishTokenizer {}.tokenize(text)
95 } else {
96 let mut tokens = JIEBA
99 .cut_for_search(text, true)
100 .into_iter()
101 .map(|token| token.word)
102 .filter(|token| is_indexable_token(token))
103 .collect::<Vec<_>>();
104
105 let english = EnglishTokenizer {};
106 tokens.extend(
107 english
108 .tokenize(text)
109 .into_iter()
110 .filter(|token| is_ascii_underscore_token(token)),
111 );
112
113 tokens
114 }
115 }
116}
117
/// Returns `true` when `token` contains at least one alphanumeric character or underscore.
///
/// Tokens made up solely of punctuation, whitespace or symbols (and the empty string) yield
/// `false`.
fn is_indexable_token(token: &str) -> bool {
    token.contains(|ch: char| ch == '_' || ch.is_alphanumeric())
}
121
/// Returns `true` when `token` consists only of ASCII bytes and contains at least one `_`.
fn is_ascii_underscore_token(token: &str) -> bool {
    let mut saw_underscore = false;
    for byte in token.bytes() {
        // A single non-ASCII byte disqualifies the whole token.
        if !byte.is_ascii() {
            return false;
        }
        saw_underscore |= byte == b'_';
    }
    saw_underscore
}
125
126pub struct Analyzer {
130 tokenizer: Box<dyn Tokenizer>,
131 case_sensitive: bool,
132}
133
134impl Analyzer {
135 pub fn new(tokenizer: Box<dyn Tokenizer>, case_sensitive: bool) -> Self {
137 Self {
138 tokenizer,
139 case_sensitive,
140 }
141 }
142
143 pub fn analyze_text(&self, text: &str) -> Result<Vec<Bytes>> {
145 let res = self
146 .tokenizer
147 .tokenize(text)
148 .iter()
149 .map(|s| {
150 if self.case_sensitive {
151 s.as_bytes().to_vec()
152 } else {
153 s.to_lowercase().as_bytes().to_vec()
154 }
155 })
156 .collect();
157 Ok(res)
158 }
159}
160
161#[cfg(test)]
162mod tests {
163 use super::*;
164
165 #[test]
166 fn test_english_tokenizer() {
167 let tokenizer = EnglishTokenizer;
168 let text = "Hello, world!!! This is a----++ test012_345+67890 ship_ship ship__ship _ __ __IDENTIFIER__ _ship ship_";
169 let tokens = tokenizer.tokenize(text);
170 assert_eq!(
171 tokens,
172 vec![
173 "Hello",
174 "world",
175 "This",
176 "is",
177 "a",
178 "test012_345",
179 "67890",
180 "ship_ship",
181 "ship__ship",
182 "_",
183 "__",
184 "__IDENTIFIER__",
185 "_ship",
186 "ship_"
187 ]
188 );
189 }
190
191 #[test]
192 fn test_english_tokenizer_with_utf8() {
193 let tokenizer = EnglishTokenizer;
194 let text = "💸unfold the 纸巾😣and gently 清洁表😭面";
195 let tokens = tokenizer.tokenize(text);
196 assert_eq!(
197 tokens,
198 vec!["unfold", "the", "纸巾", "and", "gently", "清洁表", "面"]
201 );
202 }
203
204 #[test]
205 fn test_chinese_tokenizer() {
206 let tokenizer = ChineseTokenizer;
207 let text = "我喜欢苹果";
208 let tokens = tokenizer.tokenize(text);
209 assert_eq!(tokens, vec!["我", "喜欢", "苹果"]);
210 }
211
212 #[test]
213 fn test_chinese_tokenizer_issue_7943_sample() {
214 let tokenizer = ChineseTokenizer;
215 let text = "[2026/04/09/ 13:56:11.031]2026-04-09 13:56:11.031 - [ trace_id=340a6a44b0bd8e37bb7697ss7da61ff0 span_id=085ff5ttf1e0a23b trace_flags=01] - [http-nio-8081-exec-16] INFO c.h.p.xx.web.service.impl.CCCXForwardKKKServiceImpl.pushout(188) - 登录手机号18888888888的动态key:829889AC8 ship_ship ship__ship _ __ __IDENTIFIER__ _ship ship_ EOF";
216 let tokens = tokenizer.tokenize(text);
217
218 assert_eq!(
219 tokens,
220 vec![
221 "2026",
222 "04",
223 "09",
224 "13",
225 "56",
226 "11.031",
227 "2026-04",
228 "09",
229 "13",
230 "56",
231 "11.031",
232 "trace",
233 "_",
234 "id",
235 "340a6a44b0bd8e37bb7697ss7da61ff0",
236 "span",
237 "_",
238 "id",
239 "085ff5ttf1e0a23b",
240 "trace",
241 "_",
242 "flags",
243 "01",
244 "http",
245 "nio-8081",
246 "exec-16",
247 "INFO",
248 "c",
249 "h",
250 "p",
251 "xx",
252 "web",
253 "service",
254 "impl",
255 "CCCXForwardKKKServiceImpl",
256 "pushout",
257 "188",
258 "登录",
259 "手机",
260 "手机号",
261 "18888888888",
262 "的",
263 "动态",
264 "key",
265 "829889AC8",
266 "ship",
267 "_",
268 "ship",
269 "ship",
270 "__",
271 "ship",
272 "_",
273 "__",
274 "__",
275 "IDENTIFIER",
276 "__",
277 "_",
278 "ship",
279 "ship",
280 "_",
281 "EOF",
282 "trace_id",
283 "span_id",
284 "trace_flags",
285 "ship_ship",
286 "ship__ship",
287 "_",
288 "__",
289 "__IDENTIFIER__",
290 "_ship",
291 "ship_"
292 ]
293 );
294 }
295
296 #[test]
297 fn test_chinese_tokenizer_keeps_ascii_underscore_compounds() {
298 let tokenizer = ChineseTokenizer;
299 let text = "trace_id=abc 登录手机号 dynamic_key=xyz";
300
301 let tokens = tokenizer.tokenize(text);
302
303 assert!(tokens.contains(&"trace_id"));
304 assert!(tokens.contains(&"dynamic_key"));
305 assert!(tokens.contains(&"登录"));
306 assert!(tokens.contains(&"手机号"));
307 }
308
309 #[test]
310 fn test_chinese_tokenizer_skips_non_ascii_underscore_tokens() {
311 let tokenizer = ChineseTokenizer;
312 let text = "登录_id trace_id 手机号_trace";
313
314 let tokens = tokenizer.tokenize(text);
315
316 assert_eq!(
317 tokens,
318 [
319 "登录",
320 "_",
321 "id",
322 "trace",
323 "_",
324 "id",
325 "手机",
326 "手机号",
327 "_",
328 "trace",
329 "trace_id"
330 ]
331 );
332 }
333
334 #[test]
335 fn test_chinese_tokenizer_aggressive_tokenization_probe() {
336 let tokenizer = ChineseTokenizer;
337 let text = "哈基米哦南北绿豆,噢马自立曼波。登录手机号。中国农业银行。装电视台,中国中央广播电视台。压不缩,笑不活。";
338
339 let default_tokens = tokenizer.tokenize(text);
340 let cut_hmm_false = JIEBA
341 .cut(text, false)
342 .into_iter()
343 .map(|token| token.word)
344 .collect::<Vec<_>>();
345 let cut_hmm_true = JIEBA
346 .cut(text, true)
347 .into_iter()
348 .map(|token| token.word)
349 .collect::<Vec<_>>();
350 let cut_for_search_hmm_false = JIEBA
351 .cut_for_search(text, false)
352 .into_iter()
353 .map(|token| token.word)
354 .collect::<Vec<_>>();
355 let cut_for_search_hmm_true = JIEBA
356 .cut_for_search(text, true)
357 .into_iter()
358 .map(|token| token.word)
359 .collect::<Vec<_>>();
360
361 assert_eq!(
362 default_tokens,
363 [
364 "哈基米",
365 "哦",
366 "南北",
367 "绿豆",
368 "噢",
369 "马",
370 "自立",
371 "曼波",
372 "登录",
373 "手机",
374 "手机号",
375 "中国",
376 "农业",
377 "银行",
378 "中国农业银行",
379 "装",
380 "电视",
381 "电视台",
382 "中国",
383 "中央",
384 "广播",
385 "电视",
386 "电视台",
387 "不缩",
388 "压不缩",
389 "笑",
390 "不活",
391 ]
392 );
393 assert_eq!(
394 cut_hmm_false,
395 [
396 "哈",
397 "基",
398 "米",
399 "哦",
400 "南北",
401 "绿豆",
402 ",",
403 "噢",
404 "马",
405 "自立",
406 "曼",
407 "波",
408 "。",
409 "登录",
410 "手机号",
411 "。",
412 "中国农业银行",
413 "。",
414 "装",
415 "电视台",
416 ",",
417 "中国",
418 "中央",
419 "广播",
420 "电视台",
421 "。",
422 "压",
423 "不",
424 "缩",
425 ",",
426 "笑",
427 "不",
428 "活",
429 "。"
430 ]
431 );
432 assert_eq!(
433 cut_hmm_true,
434 [
435 "哈基米",
436 "哦",
437 "南北",
438 "绿豆",
439 ",",
440 "噢",
441 "马",
442 "自立",
443 "曼波",
444 "。",
445 "登录",
446 "手机号",
447 "。",
448 "中国农业银行",
449 "。",
450 "装",
451 "电视台",
452 ",",
453 "中国",
454 "中央",
455 "广播",
456 "电视台",
457 "。",
458 "压不缩",
459 ",",
460 "笑",
461 "不活",
462 "。"
463 ]
464 );
465 assert_eq!(
466 cut_for_search_hmm_false,
467 [
468 "哈",
469 "基",
470 "米",
471 "哦",
472 "南北",
473 "绿豆",
474 ",",
475 "噢",
476 "马",
477 "自立",
478 "曼",
479 "波",
480 "。",
481 "登录",
482 "手机",
483 "手机号",
484 "。",
485 "中国",
486 "农业",
487 "银行",
488 "中国农业银行",
489 "。",
490 "装",
491 "电视",
492 "电视台",
493 ",",
494 "中国",
495 "中央",
496 "广播",
497 "电视",
498 "电视台",
499 "。",
500 "压",
501 "不",
502 "缩",
503 ",",
504 "笑",
505 "不",
506 "活",
507 "。"
508 ]
509 );
510
511 assert_eq!(
512 cut_for_search_hmm_true,
513 [
514 "哈基米",
515 "哦",
516 "南北",
517 "绿豆",
518 ",",
519 "噢",
520 "马",
521 "自立",
522 "曼波",
523 "。",
524 "登录",
525 "手机",
526 "手机号",
527 "。",
528 "中国",
529 "农业",
530 "银行",
531 "中国农业银行",
532 "。",
533 "装",
534 "电视",
535 "电视台",
536 ",",
537 "中国",
538 "中央",
539 "广播",
540 "电视",
541 "电视台",
542 "。",
543 "不缩",
544 "压不缩",
545 ",",
546 "笑",
547 "不活",
548 "。"
549 ]
550 );
551 }
552
553 #[test]
554 fn test_valid_ascii_token_lookup_table() {
555 for c in 0u8..=255u8 {
557 let is_valid = VALID_ASCII_TOKEN[c as usize];
558 let should_be_valid = (c as char).is_ascii_alphanumeric() || c == b'_';
559
560 assert_eq!(
561 is_valid,
562 should_be_valid,
563 "Character '{}' (byte {}) validity mismatch: expected {}, got {}",
564 if c.is_ascii() && !c.is_ascii_control() {
565 c as char
566 } else {
567 '?'
568 },
569 c,
570 should_be_valid,
571 is_valid
572 );
573 }
574 }
575
576 #[test]
577 fn test_analyzer() {
578 let tokenizer = EnglishTokenizer;
579 let analyzer = Analyzer::new(Box::new(tokenizer), false);
580 let text = "Hello, world! This is a test.";
581 let tokens = analyzer.analyze_text(text).unwrap();
582 assert_eq!(
583 tokens,
584 vec![
585 b"hello".to_vec(),
586 b"world".to_vec(),
587 b"this".to_vec(),
588 b"is".to_vec(),
589 b"a".to_vec(),
590 b"test".to_vec()
591 ]
592 );
593 }
594}