index/fulltext_index/search/
tantivy.rs

1// Copyright 2023 Greptime Team
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15use std::collections::{BTreeSet, HashMap};
16use std::path::Path;
17use std::time::Instant;
18
19use async_trait::async_trait;
20use common_telemetry::debug;
21use snafu::{OptionExt, ResultExt};
22use tantivy::collector::DocSetCollector;
23use tantivy::query::QueryParser;
24use tantivy::schema::{Field, Value};
25use tantivy::{Index, IndexReader, ReloadPolicy, TantivyDocument};
26
27use crate::fulltext_index::create::{ROWID_FIELD_NAME, TEXT_FIELD_NAME};
28use crate::fulltext_index::error::{
29    Result, TantivyDocNotFoundSnafu, TantivyParserSnafu, TantivySnafu,
30};
31use crate::fulltext_index::search::{FulltextIndexSearcher, RowId};
32use crate::fulltext_index::Config;
33
/// `TantivyFulltextIndexSearcher` is a searcher using Tantivy.
pub struct TantivyFulltextIndexSearcher {
    /// Tantivy index; used to build the query parser and to list the searchable segments.
    index: Index,
    /// Tantivy index reader; a searcher is obtained from it for every query.
    reader: IndexReader,
    /// The default field used to build `QueryParser`.
    default_field: Field,
}
43
44impl TantivyFulltextIndexSearcher {
45    /// Creates a new `TantivyFulltextIndexSearcher`.
46    pub fn new(path: impl AsRef<Path>, config: Config) -> Result<Self> {
47        let now = Instant::now();
48
49        let mut index = Index::open_in_dir(path.as_ref()).context(TantivySnafu)?;
50        index.set_tokenizers(config.build_tantivy_tokenizer());
51        let reader = index
52            .reader_builder()
53            .reload_policy(ReloadPolicy::Manual)
54            .num_warming_threads(0)
55            .try_into()
56            .context(TantivySnafu)?;
57        let default_field = index
58            .schema()
59            .get_field(TEXT_FIELD_NAME)
60            .context(TantivySnafu)?;
61
62        debug!(
63            "Opened tantivy index on {:?} in {:?}",
64            path.as_ref(),
65            now.elapsed()
66        );
67
68        Ok(Self {
69            index,
70            reader,
71            default_field,
72        })
73    }
74}
75
76#[async_trait]
77impl FulltextIndexSearcher for TantivyFulltextIndexSearcher {
78    async fn search(&self, query: &str) -> Result<BTreeSet<RowId>> {
79        let searcher = self.reader.searcher();
80        let query_parser = QueryParser::for_index(&self.index, vec![self.default_field]);
81        let query = query_parser
82            .parse_query(query)
83            .context(TantivyParserSnafu)?;
84        let doc_addrs = searcher
85            .search(&query, &DocSetCollector)
86            .context(TantivySnafu)?;
87
88        let seg_metas = self
89            .index
90            .searchable_segment_metas()
91            .context(TantivySnafu)?;
92
93        // FAST PATH: only one segment, the doc id is the same as the row id.
94        //            Also for compatibility with the old version.
95        if seg_metas.len() == 1 {
96            return Ok(doc_addrs.into_iter().map(|d| d.doc_id).collect());
97        }
98
99        // SLOW PATH: multiple segments, need to calculate the row id.
100        let rowid_field = searcher
101            .schema()
102            .get_field(ROWID_FIELD_NAME)
103            .context(TantivySnafu)?;
104        let mut seg_offsets = HashMap::with_capacity(seg_metas.len());
105        let mut res = BTreeSet::new();
106        for doc_addr in doc_addrs {
107            let offset = if let Some(offset) = seg_offsets.get(&doc_addr.segment_ord) {
108                *offset
109            } else {
110                // Calculate the offset at the first time meeting the segment and cache it since
111                // the offset is the same for all rows in the same segment.
112                let doc: TantivyDocument = searcher.doc(doc_addr).context(TantivySnafu)?;
113                let rowid = doc
114                    .get_first(rowid_field)
115                    .and_then(|v| v.as_u64())
116                    .context(TantivyDocNotFoundSnafu { doc_addr })?;
117
118                let offset = rowid as u32 - doc_addr.doc_id;
119                seg_offsets.insert(doc_addr.segment_ord, offset);
120                offset
121            };
122
123            res.insert(doc_addr.doc_id + offset);
124        }
125
126        Ok(res)
127    }
128}