// index/fulltext_index/tokenizer.rs
use crate::fulltext_index::error::Result;
use crate::Bytes;

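// Shared jieba-rs tokenizer instance, constructed lazily on first use and
// reused across calls.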
lazy_static::lazy_static! {
    static ref JIEBA: jieba_rs::Jieba = jieba_rs::Jieba::new();
}

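/// Splits text into a sequence of tokens borrowed from the input string.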
pub trait Tokenizer: Send {
    fn tokenize<'a>(&self, text: &'a str) -> Vec<&'a str>;
}

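/// Tokenizer that splits on any non-alphanumeric character, suitable for
/// English and other punctuation/whitespace-delimited text.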
#[derive(Debug, Default)]
pub struct EnglishTokenizer;

impl Tokenizer for EnglishTokenizer {
    fn tokenize<'a>(&self, text: &'a str) -> Vec<&'a str> {
        text.split(|c: char| !c.is_alphanumeric())
            .filter(|s| !s.is_empty())
            .collect()
    }
}

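/// Tokenizer for Chinese text, backed by jieba-rs. Pure-ASCII input is
/// short-circuited to [`EnglishTokenizer`], skipping the jieba cut.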
#[derive(Debug, Default)]
pub struct ChineseTokenizer;

impl Tokenizer for ChineseTokenizer {
    fn tokenize<'a>(&self, text: &'a str) -> Vec<&'a str> {
        if text.is_ascii() {
            EnglishTokenizer.tokenize(text)
        } else {
            JIEBA.cut(text, false)
        }
    }
}

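/// Combines a [`Tokenizer`] with a case-folding policy to produce the byte
/// tokens consumed by the fulltext index.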
pub struct Analyzer {
    tokenizer: Box<dyn Tokenizer>,
    case_sensitive: bool,
}

impl Analyzer {
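    /// Creates an analyzer from a tokenizer and a case-sensitivity flag.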
    pub fn new(tokenizer: Box<dyn Tokenizer>, case_sensitive: bool) -> Self {
        Self {
            tokenizer,
            case_sensitive,
        }
    }

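    /// Tokenizes `text` and returns each token as bytes, lowercasing the
    /// token first when the analyzer is case-insensitive.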
    pub fn analyze_text(&self, text: &str) -> Result<Vec<Bytes>> {
        let res = self
            .tokenizer
            .tokenize(text)
            .into_iter()
            .map(|s| {
                if self.case_sensitive {
                    s.as_bytes().to_vec()
                } else {
                    s.to_lowercase().into_bytes()
                }
            })
            .collect();
        Ok(res)
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_english_tokenizer() {
        let tokenizer = EnglishTokenizer;
        let text = "Hello, world! This is a test0.";
        let tokens = tokenizer.tokenize(text);
        assert_eq!(tokens, vec!["Hello", "world", "This", "is", "a", "test0"]);
    }

    #[test]
    fn test_chinese_tokenizer() {
        let tokenizer = ChineseTokenizer;
        let text = "我喜欢苹果";
        let tokens = tokenizer.tokenize(text);
        assert_eq!(tokens, vec!["我", "喜欢", "苹果"]);
    }

    #[test]
    fn test_analyzer() {
        let tokenizer = EnglishTokenizer;
        let analyzer = Analyzer::new(Box::new(tokenizer), false);
        let text = "Hello, world! This is a test.";
        let tokens = analyzer.analyze_text(text).unwrap();
        assert_eq!(
            tokens,
            vec![
                b"hello".to_vec(),
                b"world".to_vec(),
                b"this".to_vec(),
                b"is".to_vec(),
                b"a".to_vec(),
                b"test".to_vec()
            ]
        );
    }
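
    // Sketches of extra coverage, with expected values derived from the
    // implementations above: `ChineseTokenizer` delegates pure-ASCII input
    // to `EnglishTokenizer`, and a case-sensitive `Analyzer` keeps token
    // case intact.
    #[test]
    fn test_chinese_tokenizer_ascii_fallback() {
        let tokenizer = ChineseTokenizer;
        let tokens = tokenizer.tokenize("Hello, world!");
        assert_eq!(tokens, vec!["Hello", "world"]);
    }

    #[test]
    fn test_analyzer_case_sensitive() {
        let analyzer = Analyzer::new(Box::new(EnglishTokenizer), true);
        let tokens = analyzer.analyze_text("Hello World").unwrap();
        assert_eq!(tokens, vec![b"Hello".to_vec(), b"World".to_vec()]);
    }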
}