index/
fulltext_index.rs

1// Copyright 2023 Greptime Team
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15use puffin::blob_metadata::BlobMetadata;
16use serde::{Deserialize, Serialize};
17use snafu::ResultExt;
18use tantivy::tokenizer::{LowerCaser, SimpleTokenizer, TextAnalyzer, TokenizerManager};
19use tantivy_jieba::JiebaTokenizer;
20pub mod create;
21pub mod error;
22pub mod search;
23pub mod tokenizer;
24
25pub const KEY_FULLTEXT_CONFIG: &str = "fulltext_config";
26
27use crate::fulltext_index::error::{DeserializeFromJsonSnafu, Result};
28
29#[cfg(test)]
30mod tests;
31
32/// Configuration for fulltext index.
33#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Default)]
34pub struct Config {
35    /// Analyzer to use for tokenization.
36    pub analyzer: Analyzer,
37
38    /// Whether the index should be case-sensitive.
39    pub case_sensitive: bool,
40}
41
42/// Analyzer to use for tokenization.
43#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Default)]
44pub enum Analyzer {
45    #[default]
46    English,
47
48    Chinese,
49}
50
51impl Config {
52    fn build_tantivy_tokenizer(&self) -> TokenizerManager {
53        let mut builder = match self.analyzer {
54            Analyzer::English => TextAnalyzer::builder(SimpleTokenizer::default()).dynamic(),
55            Analyzer::Chinese => TextAnalyzer::builder(JiebaTokenizer {}).dynamic(),
56        };
57
58        if !self.case_sensitive {
59            builder = builder.filter_dynamic(LowerCaser);
60        }
61
62        let tokenizer = builder.build();
63        let tokenizer_manager = TokenizerManager::new();
64        tokenizer_manager.register("default", tokenizer);
65        tokenizer_manager
66    }
67
68    /// Extracts the fulltext index configuration from the blob metadata.
69    pub fn from_blob_metadata(metadata: &BlobMetadata) -> Result<Self> {
70        if let Some(config) = metadata.properties.get(KEY_FULLTEXT_CONFIG) {
71            let config = serde_json::from_str(config).context(DeserializeFromJsonSnafu)?;
72            return Ok(config);
73        }
74
75        Ok(Self::default())
76    }
77}