use crate::Bytes;
use crate::fulltext_index::error::Result;

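// A process-wide jieba instance: `Jieba::new()` loads the bundled default
// dictionary, which is comparatively expensive, so it is built once and shared.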
lazy_static::lazy_static! {
    static ref JIEBA: jieba_rs::Jieba = jieba_rs::Jieba::new();
}

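/// Byte-indexed lookup table: `true` exactly for the 63 bytes that may appear
/// in a token (ASCII alphanumerics and `'_'`); see
/// `test_valid_ascii_token_lookup_table` below for the invariant.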
const VALID_ASCII_TOKEN: [bool; 256] = [
    // 0x00..=0x2F: control characters, whitespace, and punctuation
    false, false, false, false, false, false, false, false, false, false, false, false, false,
    false, false, false, false, false, false, false, false, false, false, false, false, false,
    false, false, false, false, false, false, false, false, false, false, false, false, false,
    false, false, false, false, false, false, false, false, false,
    // 0x30..=0x39: digits '0'-'9'
    true, true, true, true, true, true, true, true, true, true,
    // 0x3A..=0x40: ':' through '@'
    false, false, false, false, false, false, false,
    // 0x41..=0x5A: uppercase 'A'-'Z'
    true, true, true, true, true, true, true, true, true, true, true, true, true,
    true, true, true, true, true, true, true, true, true, true, true, true, true,
    // 0x5B..=0x5E: '[' '\' ']' '^'
    false, false, false, false,
    // 0x5F: '_'
    true,
    // 0x60: '`'
    false,
    // 0x61..=0x7A: lowercase 'a'-'z'
    true, true, true, true, true, true, true, true, true, true, true, true, true,
    true, true, true, true, true, true, true, true, true, true, true, true, true,
    // 0x7B..=0xFF: remaining punctuation and all non-ASCII bytes
    false, false, false, false, false, false, false, false, false, false, false, false, false,
    false, false, false, false, false, false, false, false, false, false, false, false, false,
    false, false, false, false, false, false, false, false, false, false, false, false, false,
    false, false, false, false, false, false, false, false, false, false, false, false, false,
    false, false, false, false, false, false, false, false, false, false, false, false, false,
    false, false, false, false, false, false, false, false, false, false, false, false, false,
    false, false, false, false, false, false, false, false, false, false, false, false, false,
    false, false, false, false, false, false, false, false, false, false, false, false, false,
    false, false, false, false, false, false, false, false, false, false, false, false, false,
    false, false, false, false, false, false, false, false, false, false, false, false, false,
    false, false, false,
];

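/// A tokenizer splits a text into a list of tokens borrowed from the input.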
pub trait Tokenizer: Send {
    fn tokenize<'a>(&self, text: &'a str) -> Vec<&'a str>;
}

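/// `EnglishTokenizer` splits text on every character that is not alphanumeric
/// and not `'_'`, so identifiers like `trace_id` survive as single tokens.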
#[derive(Debug, Default)]
pub struct EnglishTokenizer;

impl Tokenizer for EnglishTokenizer {
    fn tokenize<'a>(&self, text: &'a str) -> Vec<&'a str> {
        if text.is_ascii() {
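            // Fast path: scan the raw bytes against the lookup table and slice
            // out each maximal run of token bytes.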
            let mut tokens = Vec::new();
            let mut start = 0;
            for (i, &byte) in text.as_bytes().iter().enumerate() {
                if !VALID_ASCII_TOKEN[byte as usize] {
                    if start < i {
                        tokens.push(&text[start..i]);
                    }
                    start = i + 1;
                }
            }

            if start < text.len() {
                tokens.push(&text[start..]);
            }

            tokens
        } else {
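            // Non-ASCII text: fall back to a char-based split on anything that
            // is neither alphanumeric (Unicode-aware) nor '_'.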
            text.split(|c: char| !c.is_alphanumeric() && c != '_')
                .filter(|s| !s.is_empty())
                .collect()
        }
    }
}

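/// `ChineseTokenizer` tokenizes mixed Chinese/ASCII text.
///
/// Pure-ASCII input is delegated to [`EnglishTokenizer`]; anything else is
/// segmented with jieba's `cut_for_search`. ASCII underscore compounds such as
/// `trace_id` are additionally re-extracted with the English tokenizer so they
/// stay searchable as whole tokens (jieba splits them around `'_'`).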
#[derive(Debug, Default)]
pub struct ChineseTokenizer;

impl Tokenizer for ChineseTokenizer {
    fn tokenize<'a>(&self, text: &'a str) -> Vec<&'a str> {
        if text.is_ascii() {
            EnglishTokenizer.tokenize(text)
        } else {
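            // `cut_for_search` additionally emits overlapping sub-words for
            // better recall; `true` enables the HMM for out-of-vocabulary words.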
            let mut tokens = JIEBA
                .cut_for_search(text, true)
                .into_iter()
                .filter(|s| is_indexable_token(s))
                .collect::<Vec<_>>();

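            // jieba splits identifiers such as `trace_id` around '_', so run the
            // English tokenizer as well and keep its ASCII underscore compounds.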
            let english = EnglishTokenizer;
            tokens.extend(
                english
                    .tokenize(text)
                    .into_iter()
                    .filter(|token| is_ascii_underscore_token(token)),
            );

            tokens
        }
    }
}

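/// A token is worth indexing if it contains at least one alphanumeric
/// character or underscore; this filters out pure punctuation emitted by jieba.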
fn is_indexable_token(token: &str) -> bool {
    token.chars().any(|c| c.is_alphanumeric() || c == '_')
}

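/// A pure-ASCII token containing `'_'`, i.e. an underscore compound that the
/// English tokenizer keeps intact.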
fn is_ascii_underscore_token(token: &str) -> bool {
    token.is_ascii() && token.contains('_')
}

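/// `Analyzer` runs a [`Tokenizer`] over text and yields each token as owned
/// bytes, lowercased unless `case_sensitive` is set.
///
/// A usage sketch (mirrors `test_analyzer` below):
///
/// ```ignore
/// let analyzer = Analyzer::new(Box::new(EnglishTokenizer), false);
/// let tokens = analyzer.analyze_text("Hello, world!").unwrap();
/// assert_eq!(tokens, vec![b"hello".to_vec(), b"world".to_vec()]);
/// ```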
pub struct Analyzer {
    tokenizer: Box<dyn Tokenizer>,
    case_sensitive: bool,
}

impl Analyzer {
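    /// Creates an `Analyzer` from a tokenizer and a case-sensitivity flag.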
    pub fn new(tokenizer: Box<dyn Tokenizer>, case_sensitive: bool) -> Self {
        Self {
            tokenizer,
            case_sensitive,
        }
    }

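    /// Tokenizes `text` and returns the tokens as byte strings, lowercasing
    /// them first unless the analyzer is case-sensitive.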
    pub fn analyze_text(&self, text: &str) -> Result<Vec<Bytes>> {
        let res = self
            .tokenizer
            .tokenize(text)
            .iter()
            .map(|s| {
                if self.case_sensitive {
                    s.as_bytes().to_vec()
                } else {
                    s.to_lowercase().into_bytes()
                }
            })
            .collect();
        Ok(res)
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_english_tokenizer() {
        let tokenizer = EnglishTokenizer;
        let text = "Hello, world!!! This is a----++ test012_345+67890 ship_ship ship__ship _ __ __IDENTIFIER__ _ship ship_";
        let tokens = tokenizer.tokenize(text);
        assert_eq!(
            tokens,
            vec![
                "Hello",
                "world",
                "This",
                "is",
                "a",
                "test012_345",
                "67890",
                "ship_ship",
                "ship__ship",
                "_",
                "__",
                "__IDENTIFIER__",
                "_ship",
                "ship_"
            ]
        );
    }

    #[test]
    fn test_english_tokenizer_with_utf8() {
        let tokenizer = EnglishTokenizer;
        let text = "💸unfold the 纸巾😣and gently 清洁表😭面";
        let tokens = tokenizer.tokenize(text);
        assert_eq!(
            tokens,
            vec!["unfold", "the", "纸巾", "and", "gently", "清洁表", "面"]
        );
    }

    #[test]
    fn test_chinese_tokenizer() {
        let tokenizer = ChineseTokenizer;
        let text = "我喜欢苹果";
        let tokens = tokenizer.tokenize(text);
        assert_eq!(tokens, vec!["我", "喜欢", "苹果"]);
    }

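    // A realistic mixed Chinese/ASCII log line: timestamps, hex identifiers,
    // Java-style class paths, and underscore compounds in a single input.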
    #[test]
    fn test_chinese_tokenizer_issue_7943_sample() {
        let tokenizer = ChineseTokenizer;
        let text = "[2026/04/09/ 13:56:11.031]2026-04-09 13:56:11.031 - [ trace_id=340a6a44b0bd8e37bb7697ss7da61ff0 span_id=085ff5ttf1e0a23b trace_flags=01] - [http-nio-8081-exec-16] INFO c.h.p.xx.web.service.impl.CCCXForwardKKKServiceImpl.pushout(188) - 登录手机号18888888888的动态key:829889AC8 ship_ship ship__ship _ __ __IDENTIFIER__ _ship ship_ EOF";
        let tokens = tokenizer.tokenize(text);

        assert_eq!(
            tokens,
            vec![
                "2026",
                "04",
                "09",
                "13",
                "56",
                "11.031",
                "2026-04",
                "09",
                "13",
                "56",
                "11.031",
                "trace",
                "_",
                "id",
                "340a6a44b0bd8e37bb7697ss7da61ff0",
                "span",
                "_",
                "id",
                "085ff5ttf1e0a23b",
                "trace",
                "_",
                "flags",
                "01",
                "http",
                "nio-8081",
                "exec-16",
                "INFO",
                "c",
                "h",
                "p",
                "xx",
                "web",
                "service",
                "impl",
                "CCCXForwardKKKServiceImpl",
                "pushout",
                "188",
                "登录",
                "手机",
                "手机号",
                "18888888888",
                "的",
                "动态",
                "key",
                "829889AC8",
                "ship",
                "_",
                "ship",
                "ship",
                "__",
                "ship",
                "_",
                "__",
                "__",
                "IDENTIFIER",
                "__",
                "_",
                "ship",
                "ship",
                "_",
                "EOF",
                "trace_id",
                "span_id",
                "trace_flags",
                "ship_ship",
                "ship__ship",
                "_",
                "__",
                "__IDENTIFIER__",
                "_ship",
                "ship_"
            ]
        );
    }

    #[test]
    fn test_chinese_tokenizer_keeps_ascii_underscore_compounds() {
        let tokenizer = ChineseTokenizer;
        let text = "trace_id=abc 登录手机号 dynamic_key=xyz";

        let tokens = tokenizer.tokenize(text);

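        // Both ASCII compounds survive intact alongside the jieba segments.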
        assert!(tokens.contains(&"trace_id"));
        assert!(tokens.contains(&"dynamic_key"));
        assert!(tokens.contains(&"登录"));
        assert!(tokens.contains(&"手机号"));
    }

    #[test]
    fn test_chinese_tokenizer_skips_non_ascii_underscore_tokens() {
        let tokenizer = ChineseTokenizer;
        let text = "登录_id trace_id 手机号_trace";

        let tokens = tokenizer.tokenize(text);

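        // jieba splits "登录_id" and "手机号_trace" around '_'; only the
        // pure-ASCII compound "trace_id" is re-appended as a whole token.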
        assert_eq!(
            tokens,
            [
                "登录",
                "_",
                "id",
                "trace",
                "_",
                "id",
                "手机",
                "手机号",
                "_",
                "trace",
                "trace_id"
            ]
        );
    }

    #[test]
    fn test_chinese_tokenizer_aggressive_tokenization_probe() {
        let tokenizer = ChineseTokenizer;
        let text = "哈基米哦南北绿豆,噢马自立曼波。登录手机号。中国农业银行。装电视台,中国中央广播电视台。压不缩,笑不活。";

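        // Compare the tokenizer's output against all four jieba cut modes to
        // make the effect of `cut_for_search` and the HMM visible side by side.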
        let default_tokens = tokenizer.tokenize(text);
        let cut_hmm_false = JIEBA.cut(text, false);
        let cut_hmm_true = JIEBA.cut(text, true);
        let cut_for_search_hmm_false = JIEBA.cut_for_search(text, false);
        let cut_for_search_hmm_true = JIEBA.cut_for_search(text, true);

        assert_eq!(
            default_tokens,
            [
                "哈基米",
                "哦",
                "南北",
                "绿豆",
                "噢",
                "马",
                "自立",
                "曼波",
                "登录",
                "手机",
                "手机号",
                "中国",
                "农业",
                "银行",
                "中国农业银行",
                "装",
                "电视",
                "电视台",
                "中国",
                "中央",
                "广播",
                "电视",
                "电视台",
                "不缩",
                "压不缩",
                "笑",
                "不活",
            ]
        );
        assert_eq!(
            cut_hmm_false,
            [
                "哈",
                "基",
                "米",
                "哦",
                "南北",
                "绿豆",
                ",",
                "噢",
                "马",
                "自立",
                "曼",
                "波",
                "。",
                "登录",
                "手机号",
                "。",
                "中国农业银行",
                "。",
                "装",
                "电视台",
                ",",
                "中国",
                "中央",
                "广播",
                "电视台",
                "。",
                "压",
                "不",
                "缩",
                ",",
                "笑",
                "不",
                "活",
                "。"
            ]
        );
        assert_eq!(
            cut_hmm_true,
            [
                "哈基米",
                "哦",
                "南北",
                "绿豆",
                ",",
                "噢",
                "马",
                "自立",
                "曼波",
                "。",
                "登录",
                "手机号",
                "。",
                "中国农业银行",
                "。",
                "装",
                "电视台",
                ",",
                "中国",
                "中央",
                "广播",
                "电视台",
                "。",
                "压不缩",
                ",",
                "笑",
                "不活",
                "。"
            ]
        );
        assert_eq!(
            cut_for_search_hmm_false,
            [
                "哈",
                "基",
                "米",
                "哦",
                "南北",
                "绿豆",
                ",",
                "噢",
                "马",
                "自立",
                "曼",
                "波",
                "。",
                "登录",
                "手机",
                "手机号",
                "。",
                "中国",
                "农业",
                "银行",
                "中国农业银行",
                "。",
                "装",
                "电视",
                "电视台",
                ",",
                "中国",
                "中央",
                "广播",
                "电视",
                "电视台",
                "。",
                "压",
                "不",
                "缩",
                ",",
                "笑",
                "不",
                "活",
                "。"
            ]
        );

        assert_eq!(
            cut_for_search_hmm_true,
            [
                "哈基米",
                "哦",
                "南北",
                "绿豆",
                ",",
                "噢",
                "马",
                "自立",
                "曼波",
                "。",
                "登录",
                "手机",
                "手机号",
                "。",
                "中国",
                "农业",
                "银行",
                "中国农业银行",
                "。",
                "装",
                "电视",
                "电视台",
                ",",
                "中国",
                "中央",
                "广播",
                "电视",
                "电视台",
                "。",
                "不缩",
                "压不缩",
                ",",
                "笑",
                "不活",
                "。"
            ]
        );
    }

    #[test]
    fn test_valid_ascii_token_lookup_table() {
        for c in 0u8..=255u8 {
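            // A byte is a valid token byte iff it is ASCII alphanumeric or '_'.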
            let is_valid = VALID_ASCII_TOKEN[c as usize];
            let should_be_valid = (c as char).is_ascii_alphanumeric() || c == b'_';

            assert_eq!(
                is_valid,
                should_be_valid,
                "Character '{}' (byte {}) validity mismatch: expected {}, got {}",
                if c.is_ascii() && !c.is_ascii_control() {
                    c as char
                } else {
                    '?'
                },
                c,
                should_be_valid,
                is_valid
            );
        }
    }

    #[test]
    fn test_analyzer() {
        let tokenizer = EnglishTokenizer;
        let analyzer = Analyzer::new(Box::new(tokenizer), false);
        let text = "Hello, world! This is a test.";
        let tokens = analyzer.analyze_text(text).unwrap();
        assert_eq!(
            tokens,
            vec![
                b"hello".to_vec(),
                b"world".to_vec(),
                b"this".to_vec(),
                b"is".to_vec(),
                b"a".to_vec(),
                b"test".to_vec()
            ]
        );
    }
}