common_function/scalars/
matches_term.rs1use std::fmt;
16use std::iter::repeat_n;
17use std::sync::Arc;
18
19use common_query::error::{InvalidFuncArgsSnafu, Result};
20use common_query::prelude::Volatility;
21use datatypes::prelude::ConcreteDataType;
22use datatypes::scalars::ScalarVectorBuilder;
23use datatypes::vectors::{BooleanVector, BooleanVectorBuilder, MutableVector, VectorRef};
24use memchr::memmem;
25use snafu::ensure;
26
27use crate::function::{Function, FunctionContext};
28use crate::function_registry::FunctionRegistry;
29
/// Scalar function `matches_term(text, term)`.
///
/// For every row it reports whether `text` contains `term` as a whole term,
/// i.e. an occurrence that is not directly adjacent to an alphanumeric
/// character (see [`MatchesTermFinder`] for the exact boundary rules). A NULL
/// `text` or a NULL `term` yields NULL for that row.
///
/// The type carries no state; the evaluation logic lives in its [`Function`]
/// implementation.
#[derive(Clone, Debug, Default)]
pub struct MatchesTermFunction;
77
78impl MatchesTermFunction {
79 pub fn register(registry: &FunctionRegistry) {
80 registry.register(Arc::new(MatchesTermFunction));
81 }
82}
83
84impl fmt::Display for MatchesTermFunction {
85 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
86 write!(f, "MATCHES_TERM")
87 }
88}
89
90impl Function for MatchesTermFunction {
91 fn name(&self) -> &str {
92 "matches_term"
93 }
94
95 fn return_type(&self, _input_types: &[ConcreteDataType]) -> Result<ConcreteDataType> {
96 Ok(ConcreteDataType::boolean_datatype())
97 }
98
99 fn signature(&self) -> common_query::prelude::Signature {
100 common_query::prelude::Signature::exact(
101 vec![
102 ConcreteDataType::string_datatype(),
103 ConcreteDataType::string_datatype(),
104 ],
105 Volatility::Immutable,
106 )
107 }
108
109 fn eval(&self, _func_ctx: &FunctionContext, columns: &[VectorRef]) -> Result<VectorRef> {
110 ensure!(
111 columns.len() == 2,
112 InvalidFuncArgsSnafu {
113 err_msg: format!(
114 "The length of the args is not correct, expect exactly 2, have: {}",
115 columns.len()
116 ),
117 }
118 );
119
120 let text_column = &columns[0];
121 if text_column.is_empty() {
122 return Ok(Arc::new(BooleanVector::from(Vec::<bool>::with_capacity(0))));
123 }
124
125 let term_column = &columns[1];
126 let compiled_finder = if term_column.is_const() {
127 let term = term_column.get_ref(0).as_string().unwrap();
128 match term {
129 None => {
130 return Ok(Arc::new(BooleanVector::from_iter(repeat_n(
131 None,
132 text_column.len(),
133 ))));
134 }
135 Some(term) => Some(MatchesTermFinder::new(term)),
136 }
137 } else {
138 None
139 };
140
141 let len = text_column.len();
142 let mut result = BooleanVectorBuilder::with_capacity(len);
143 for i in 0..len {
144 let text = text_column.get_ref(i).as_string().unwrap();
145 let Some(text) = text else {
146 result.push_null();
147 continue;
148 };
149
150 let contains = match &compiled_finder {
151 Some(finder) => finder.find(text),
152 None => {
153 let term = match term_column.get_ref(i).as_string().unwrap() {
154 None => {
155 result.push_null();
156 continue;
157 }
158 Some(term) => term,
159 };
160 MatchesTermFinder::new(term).find(text)
161 }
162 };
163 result.push(Some(contains));
164 }
165
166 Ok(result.to_vector())
167 }
168}
169
170#[derive(Clone, Debug)]
190pub struct MatchesTermFinder {
191 finder: memmem::Finder<'static>,
192 term: String,
193 starts_with_non_alnum: bool,
194 ends_with_non_alnum: bool,
195}
196
197impl MatchesTermFinder {
198 pub fn new(term: &str) -> Self {
200 let starts_with_non_alnum = term.chars().next().is_some_and(|c| !c.is_alphanumeric());
201 let ends_with_non_alnum = term.chars().last().is_some_and(|c| !c.is_alphanumeric());
202
203 Self {
204 finder: memmem::Finder::new(term).into_owned(),
205 term: term.to_string(),
206 starts_with_non_alnum,
207 ends_with_non_alnum,
208 }
209 }
210
211 pub fn find(&self, text: &str) -> bool {
213 if self.term.is_empty() {
214 return text.is_empty();
215 }
216
217 if text.len() < self.term.len() {
218 return false;
219 }
220
221 let mut pos = 0;
222 while let Some(found_pos) = self.finder.find(&text.as_bytes()[pos..]) {
223 let actual_pos = pos + found_pos;
224
225 let prev_ok = self.starts_with_non_alnum
226 || text[..actual_pos]
227 .chars()
228 .last()
229 .map(|c| !c.is_alphanumeric())
230 .unwrap_or(true);
231
232 if prev_ok {
233 let next_pos = actual_pos + self.finder.needle().len();
234 let next_ok = self.ends_with_non_alnum
235 || text[next_pos..]
236 .chars()
237 .next()
238 .map(|c| !c.is_alphanumeric())
239 .unwrap_or(true);
240
241 if next_ok {
242 return true;
243 }
244 }
245
246 if let Some(next_char) = text[actual_pos..].chars().next() {
247 pos = actual_pos + next_char.len_utf8();
248 } else {
249 break;
250 }
251 }
252
253 false
254 }
255}
256
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a fresh finder for `term` and runs it against `text`.
    fn matches(term: &str, text: &str) -> bool {
        MatchesTermFinder::new(term).find(text)
    }

    /// One finder is reused across several texts.
    #[test]
    fn matches_term_example() {
        let finder = MatchesTermFinder::new("hello world");
        assert!(finder.find("warning:hello world!"));
        assert!(!finder.find("hello-world"));
        assert!(!finder.find("hello world2023"));

        let finder = MatchesTermFinder::new("critical error");
        assert!(finder.find("ERROR:critical error!"));
        assert!(!finder.find("critical_errors"));

        let finder = MatchesTermFinder::new("");
        assert!(finder.find(""));
        assert!(!finder.find("any"));

        let finder = MatchesTermFinder::new("Cat");
        assert!(finder.find("Cat"));
        assert!(!finder.find("cat"));
    }

    #[test]
    fn matches_term_with_punctuation() {
        assert!(matches("cat", "cat!"));
        assert!(matches("dog", "!dog"));
    }

    #[test]
    fn matches_phrase_with_boundaries() {
        assert!(matches("hello-world", "hello-world"));
        assert!(matches("'foo bar'", "test: 'foo bar'"));
    }

    #[test]
    fn matches_at_text_boundaries() {
        assert!(matches("start", "start..."));
        assert!(matches("end", "...end"));
    }

    #[test]
    fn rejects_partial_matches() {
        assert!(!matches("cat", "category"));
        assert!(!matches("boot", "rebooted"));
    }

    #[test]
    fn rejects_missing_term() {
        assert!(!matches("foo", "hello world"));
    }

    #[test]
    fn handles_empty_inputs() {
        assert!(!matches("test", ""));
        assert!(!matches("", "text"));
    }

    #[test]
    fn different_unicode_boundaries() {
        assert!(matches("café", "café>"));
        assert!(!matches("café", "口café>"));
        assert!(!matches("café", "café口"));
        assert!(!matches("café", "cafémore"));
        assert!(matches("русский", "русский!"));
        // A Cyrillic letter directly after the term is alphanumeric, so the
        // occurrence is only a prefix of a longer word.
        assert!(!matches("русский", "русскийязык"));
    }

    #[test]
    fn case_sensitive_matching() {
        assert!(!matches("cat", "Cat"));
        assert!(matches("CaT", "CaT"));
    }

    #[test]
    fn numbers_in_term() {
        assert!(matches("v1.0", "v1.0!"));
        assert!(!matches("v1.0", "v1.0a"));
    }

    #[test]
    fn adjacent_alphanumeric_fails() {
        assert!(!matches("cat", "cat5"));
        assert!(!matches("dog", "dogcat"));
    }

    #[test]
    fn empty_term_text() {
        assert!(!matches("", "text"));
        assert!(matches("", ""));
        assert!(!matches("text", ""));
    }

    /// A term that starts or ends with a non-alphanumeric character brings its
    /// own boundary on that side.
    #[test]
    fn leading_non_alphanumeric() {
        assert!(matches("/cat", "dog/cat"));
        assert!(matches("dog/", "dog/cat"));
        assert!(matches("dog/cat", "dog/cat"));
    }

    /// A rejected occurrence must not stop the search for a later valid one.
    #[test]
    fn continues_searching_after_boundary_mismatch() {
        assert!(!matches("log", "bloglog!"));
        assert!(matches("log", "bloglog log"));
        assert!(matches("log", "alogblog_log!"));

        assert!(matches("error", "errorlog_error_case"));
        assert!(matches("test", "atestbtestc_test_end"));
        assert!(matches("data", "database_data_store"));
        assert!(!matches("data", "database_datastore"));
        assert!(matches("log.txt", "catalog.txt_log.txt!"));
        assert!(!matches("log.txt", "catalog.txtlog.txt!"));
        assert!(matches("data-set", "bigdata-set_data-set!"));

        assert!(matches("中文", "这是中文测试,中文!"));
        assert!(matches("error", "错误errorerror日志_error!"));
    }
}