common_function/scalars/
matches_term.rs

1// Copyright 2023 Greptime Team
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15use std::fmt;
16use std::iter::repeat_n;
17use std::sync::Arc;
18
19use common_query::error::{InvalidFuncArgsSnafu, Result};
20use common_query::prelude::Volatility;
21use datatypes::prelude::ConcreteDataType;
22use datatypes::scalars::ScalarVectorBuilder;
23use datatypes::vectors::{BooleanVector, BooleanVectorBuilder, MutableVector, VectorRef};
24use memchr::memmem;
25use snafu::ensure;
26
27use crate::function::{Function, FunctionContext};
28use crate::function_registry::FunctionRegistry;
29
/// Exact term/phrase matching function for text columns.
///
/// Checks whether a text value contains a term (a word or a whole phrase) as
/// an exact, case-sensitive substring that is delimited by boundaries.
/// Designed for:
/// - Whole-word matching (e.g. "cat" in "cat!" but not in "category")
/// - Phrase matching (e.g. "hello world" in "note:hello world!")
///
/// # Signature
/// `matches_term(text: String, term: String) -> Boolean`
///
/// # Arguments
/// * `text` - String column to search
/// * `term` - Search term/phrase
///
/// # Returns
/// A boolean vector with one element per row of `text`. A row is `true` when
/// the text contains the term and each edge of that occurrence satisfies at
/// least one of:
/// 1. The edge is the start/end of the text.
/// 2. The neighbouring character in the text is non-alphanumeric (spaces,
///    hyphens, punctuation, etc.).
/// 3. The term itself starts (left edge) or ends (right edge) with a
///    non-alphanumeric character, e.g. `/cat` matches in `dog/cat`.
///
/// A row is NULL when either the text or the term of that row is NULL.
/// An empty term matches only an empty text.
///
/// # Examples
/// ```sql
/// -- Match phrase with space --
/// SELECT matches_term(column, 'hello world') FROM table;
/// -- Text: "warning:hello world!" => true
/// -- Text: "hello-world"          => false (hyphen instead of space)
/// -- Text: "hello world2023"      => false (ending with numbers)
///
/// -- Match multiple words with boundaries --
/// SELECT matches_term(column, 'critical error') FROM logs;
/// -- Match in: "ERROR:critical error!"
/// -- No match: "critical_errors"
///
/// -- Empty string handling --
/// SELECT matches_term(column, '') FROM table;
/// -- Text: "" => true
/// -- Text: "any" => false
///
/// -- Case sensitivity --
/// SELECT matches_term(column, 'Cat') FROM table;
/// -- Text: "Cat" => true
/// -- Text: "cat" => false
/// ```
#[derive(Clone, Debug, Default)]
pub struct MatchesTermFunction;
77
78impl MatchesTermFunction {
79    pub fn register(registry: &FunctionRegistry) {
80        registry.register(Arc::new(MatchesTermFunction));
81    }
82}
83
84impl fmt::Display for MatchesTermFunction {
85    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
86        write!(f, "MATCHES_TERM")
87    }
88}
89
90impl Function for MatchesTermFunction {
91    fn name(&self) -> &str {
92        "matches_term"
93    }
94
95    fn return_type(&self, _input_types: &[ConcreteDataType]) -> Result<ConcreteDataType> {
96        Ok(ConcreteDataType::boolean_datatype())
97    }
98
99    fn signature(&self) -> common_query::prelude::Signature {
100        common_query::prelude::Signature::exact(
101            vec![
102                ConcreteDataType::string_datatype(),
103                ConcreteDataType::string_datatype(),
104            ],
105            Volatility::Immutable,
106        )
107    }
108
109    fn eval(&self, _func_ctx: &FunctionContext, columns: &[VectorRef]) -> Result<VectorRef> {
110        ensure!(
111            columns.len() == 2,
112            InvalidFuncArgsSnafu {
113                err_msg: format!(
114                    "The length of the args is not correct, expect exactly 2, have: {}",
115                    columns.len()
116                ),
117            }
118        );
119
120        let text_column = &columns[0];
121        if text_column.is_empty() {
122            return Ok(Arc::new(BooleanVector::from(Vec::<bool>::with_capacity(0))));
123        }
124
125        let term_column = &columns[1];
126        let compiled_finder = if term_column.is_const() {
127            let term = term_column.get_ref(0).as_string().unwrap();
128            match term {
129                None => {
130                    return Ok(Arc::new(BooleanVector::from_iter(repeat_n(
131                        None,
132                        text_column.len(),
133                    ))));
134                }
135                Some(term) => Some(MatchesTermFinder::new(term)),
136            }
137        } else {
138            None
139        };
140
141        let len = text_column.len();
142        let mut result = BooleanVectorBuilder::with_capacity(len);
143        for i in 0..len {
144            let text = text_column.get_ref(i).as_string().unwrap();
145            let Some(text) = text else {
146                result.push_null();
147                continue;
148            };
149
150            let contains = match &compiled_finder {
151                Some(finder) => finder.find(text),
152                None => {
153                    let term = match term_column.get_ref(i).as_string().unwrap() {
154                        None => {
155                            result.push_null();
156                            continue;
157                        }
158                        Some(term) => term,
159                    };
160                    MatchesTermFinder::new(term).find(text)
161                }
162            };
163            result.push(Some(contains));
164        }
165
166        Ok(result.to_vector())
167    }
168}
169
170/// A compiled finder for `matches_term` function that holds the compiled term
171/// and its metadata for efficient matching.
172///
173/// A term is considered matched when:
174/// 1. The exact sequence appears in the text
175/// 2. It is either:
176///    - At the start/end of text with adjacent non-alphanumeric character
177///    - Surrounded by non-alphanumeric characters
178///
179/// # Examples
180/// ```
181/// let finder = MatchesTermFinder::new("cat");
182/// assert!(finder.find("cat!"));      // Term at end with punctuation
183/// assert!(finder.find("dog,cat"));   // Term preceded by comma
184/// assert!(!finder.find("category")); // Partial match rejected
185///
186/// let finder = MatchesTermFinder::new("world");
187/// assert!(finder.find("hello-world")); // Hyphen boundary
188/// ```
189#[derive(Clone, Debug)]
190pub struct MatchesTermFinder {
191    finder: memmem::Finder<'static>,
192    term: String,
193    starts_with_non_alnum: bool,
194    ends_with_non_alnum: bool,
195}
196
197impl MatchesTermFinder {
198    /// Create a new `MatchesTermFinder` for the given term.
199    pub fn new(term: &str) -> Self {
200        let starts_with_non_alnum = term.chars().next().is_some_and(|c| !c.is_alphanumeric());
201        let ends_with_non_alnum = term.chars().last().is_some_and(|c| !c.is_alphanumeric());
202
203        Self {
204            finder: memmem::Finder::new(term).into_owned(),
205            term: term.to_string(),
206            starts_with_non_alnum,
207            ends_with_non_alnum,
208        }
209    }
210
211    /// Find the term in the text.
212    pub fn find(&self, text: &str) -> bool {
213        if self.term.is_empty() {
214            return text.is_empty();
215        }
216
217        if text.len() < self.term.len() {
218            return false;
219        }
220
221        let mut pos = 0;
222        while let Some(found_pos) = self.finder.find(&text.as_bytes()[pos..]) {
223            let actual_pos = pos + found_pos;
224
225            let prev_ok = self.starts_with_non_alnum
226                || text[..actual_pos]
227                    .chars()
228                    .last()
229                    .map(|c| !c.is_alphanumeric())
230                    .unwrap_or(true);
231
232            if prev_ok {
233                let next_pos = actual_pos + self.finder.needle().len();
234                let next_ok = self.ends_with_non_alnum
235                    || text[next_pos..]
236                        .chars()
237                        .next()
238                        .map(|c| !c.is_alphanumeric())
239                        .unwrap_or(true);
240
241                if next_ok {
242                    return true;
243                }
244            }
245
246            if let Some(next_char) = text[actual_pos..].chars().next() {
247                pos = actual_pos + next_char.len_utf8();
248            } else {
249                break;
250            }
251        }
252
253        false
254    }
255}
256
#[cfg(test)]
mod tests {
    use super::*;

    /// Mirrors the SQL examples from the `MatchesTermFunction` documentation.
    #[test]
    fn matches_term_example() {
        let finder = MatchesTermFinder::new("hello world");
        assert!(finder.find("warning:hello world!"));
        assert!(!finder.find("hello-world"));
        assert!(!finder.find("hello world2023"));

        let finder = MatchesTermFinder::new("critical error");
        assert!(finder.find("ERROR:critical error!"));
        assert!(!finder.find("critical_errors"));

        let finder = MatchesTermFinder::new("");
        assert!(finder.find(""));
        assert!(!finder.find("any"));

        let finder = MatchesTermFinder::new("Cat");
        assert!(finder.find("Cat"));
        assert!(!finder.find("cat"));
    }

    #[test]
    fn matches_term_with_punctuation() {
        assert!(MatchesTermFinder::new("cat").find("cat!"));
        assert!(MatchesTermFinder::new("dog").find("!dog"));
    }

    #[test]
    fn matches_phrase_with_boundaries() {
        assert!(MatchesTermFinder::new("hello-world").find("hello-world"));
        assert!(MatchesTermFinder::new("'foo bar'").find("test: 'foo bar'"));
    }

    #[test]
    fn matches_at_text_boundaries() {
        assert!(MatchesTermFinder::new("start").find("start..."));
        assert!(MatchesTermFinder::new("end").find("...end"));
    }

    // Negative cases
    #[test]
    fn rejects_partial_matches() {
        assert!(!MatchesTermFinder::new("cat").find("category"));
        assert!(!MatchesTermFinder::new("boot").find("rebooted"));
    }

    #[test]
    fn rejects_missing_term() {
        assert!(!MatchesTermFinder::new("foo").find("hello world"));
    }

    // Edge cases
    #[test]
    fn handles_empty_inputs() {
        assert!(!MatchesTermFinder::new("test").find(""));
        assert!(!MatchesTermFinder::new("").find("text"));
    }

    #[test]
    fn different_unicode_boundaries() {
        assert!(MatchesTermFinder::new("café").find("café>"));
        // Non-ASCII letters adjacent to the term count as alphanumeric.
        assert!(!MatchesTermFinder::new("café").find("口café>"));
        assert!(!MatchesTermFinder::new("café").find("café口"));
        assert!(!MatchesTermFinder::new("café").find("cafémore"));
        assert!(MatchesTermFinder::new("русский").find("русский!"));
        // Full-width exclamation mark (U+FF01): non-ASCII punctuation is a
        // boundary too. Written as an escape so it cannot be silently
        // normalised to the ASCII `!` checked above.
        assert!(MatchesTermFinder::new("русский").find("русский\u{FF01}"));
    }

    #[test]
    fn case_sensitive_matching() {
        assert!(!MatchesTermFinder::new("cat").find("Cat"));
        assert!(MatchesTermFinder::new("CaT").find("CaT"));
    }

    #[test]
    fn numbers_in_term() {
        assert!(MatchesTermFinder::new("v1.0").find("v1.0!"));
        assert!(!MatchesTermFinder::new("v1.0").find("v1.0a"));
    }

    #[test]
    fn adjacent_alphanumeric_fails() {
        assert!(!MatchesTermFinder::new("cat").find("cat5"));
        assert!(!MatchesTermFinder::new("dog").find("dogcat"));
    }

    #[test]
    fn empty_term_text() {
        assert!(!MatchesTermFinder::new("").find("text"));
        assert!(MatchesTermFinder::new("").find(""));
        assert!(!MatchesTermFinder::new("text").find(""));
    }

    /// A term that itself starts or ends with punctuation carries its own
    /// boundary on that side.
    #[test]
    fn leading_non_alphanumeric() {
        assert!(MatchesTermFinder::new("/cat").find("dog/cat"));
        assert!(MatchesTermFinder::new("dog/").find("dog/cat"));
        assert!(MatchesTermFinder::new("dog/cat").find("dog/cat"));
    }

    /// An occurrence with bad boundaries must not stop the search: a later
    /// occurrence in the same text may still qualify.
    #[test]
    fn continues_searching_after_boundary_mismatch() {
        assert!(!MatchesTermFinder::new("log").find("bloglog!"));
        assert!(MatchesTermFinder::new("log").find("bloglog log"));
        assert!(MatchesTermFinder::new("log").find("alogblog_log!"));

        assert!(MatchesTermFinder::new("error").find("errorlog_error_case"));
        assert!(MatchesTermFinder::new("test").find("atestbtestc_test_end"));
        assert!(MatchesTermFinder::new("data").find("database_data_store"));
        assert!(!MatchesTermFinder::new("data").find("database_datastore"));
        assert!(MatchesTermFinder::new("log.txt").find("catalog.txt_log.txt!"));
        assert!(!MatchesTermFinder::new("log.txt").find("catalog.txtlog.txt!"));
        assert!(MatchesTermFinder::new("data-set").find("bigdata-set_data-set!"));

        assert!(MatchesTermFinder::new("中文").find("这是中文测试,中文!"));
        assert!(MatchesTermFinder::new("error").find("错误errorerror日志_error!"));
    }
}