1use std::collections::HashMap;
16use std::sync::Arc;
17use std::sync::atomic::AtomicUsize;
18
19use api::v1::SemanticType;
20use common_telemetry::warn;
21use datatypes::arrow::array::{Array, LargeStringArray, StringArray};
22use datatypes::arrow::datatypes::DataType;
23use datatypes::arrow::record_batch::RecordBatch;
24use datatypes::schema::{FulltextAnalyzer, FulltextBackend};
25use index::fulltext_index::create::{
26 BloomFilterFulltextIndexCreator, FulltextIndexCreator, TantivyFulltextIndexCreator,
27};
28use index::fulltext_index::{Analyzer, Config};
29use index::target::IndexTarget;
30use puffin::blob_metadata::CompressionCodec;
31use puffin::puffin_manager::PutOptions;
32use snafu::{ResultExt, ensure};
33use store_api::metadata::RegionMetadataRef;
34use store_api::storage::{ColumnId, ConcreteDataType, FileId, RegionId};
35
36use crate::error::{
37 CastVectorSnafu, ComputeArrowSnafu, CreateFulltextCreatorSnafu, DataTypeMismatchSnafu,
38 FulltextFinishSnafu, FulltextPushTextSnafu, IndexOptionsSnafu, OperateAbortedIndexSnafu,
39 Result,
40};
41use crate::read::Batch;
42use crate::sst::index::TYPE_FULLTEXT_INDEX;
43use crate::sst::index::fulltext_index::{INDEX_BLOB_TYPE_BLOOM, INDEX_BLOB_TYPE_TANTIVY};
44use crate::sst::index::intermediate::{
45 IntermediateLocation, IntermediateManager, TempFileProvider,
46};
47use crate::sst::index::puffin_manager::SstPuffinWriter;
48use crate::sst::index::statistics::{ByteCount, RowCount, Statistics};
49
50pub struct FulltextIndexer {
52 creators: HashMap<ColumnId, SingleCreator>,
54 aborted: bool,
56 stats: Statistics,
58}
59
60impl FulltextIndexer {
61 pub async fn new(
63 region_id: &RegionId,
64 sst_file_id: &FileId,
65 intermediate_manager: &IntermediateManager,
66 metadata: &RegionMetadataRef,
67 compress: bool,
68 mem_limit: usize,
69 ) -> Result<Option<Self>> {
70 let mut creators = HashMap::new();
71
72 for column in &metadata.column_metadatas {
73 if column.semantic_type == SemanticType::Tag {
77 common_telemetry::debug!(
78 "Skip creating fulltext index for tag column {}",
79 column.column_schema.name
80 );
81 continue;
82 }
83
84 let options = column
85 .column_schema
86 .fulltext_options()
87 .context(IndexOptionsSnafu {
88 column_name: &column.column_schema.name,
89 })?;
90
91 let options = match options {
94 Some(options) if options.enable => options,
95 _ => continue,
96 };
97
98 let column_id = column.column_id;
99 let intm_path = intermediate_manager.fulltext_path(region_id, sst_file_id, column_id);
100
101 let config = Config {
102 analyzer: match options.analyzer {
103 FulltextAnalyzer::English => Analyzer::English,
104 FulltextAnalyzer::Chinese => Analyzer::Chinese,
105 },
106 case_sensitive: options.case_sensitive,
107 };
108
109 let inner = match options.backend {
110 FulltextBackend::Tantivy => {
111 let creator = TantivyFulltextIndexCreator::new(&intm_path, config, mem_limit)
112 .await
113 .context(CreateFulltextCreatorSnafu)?;
114 AltFulltextCreator::Tantivy(creator)
115 }
116 FulltextBackend::Bloom => {
117 let temp_file_provider = Arc::new(TempFileProvider::new(
118 IntermediateLocation::new(&metadata.region_id, sst_file_id),
119 intermediate_manager.clone(),
120 ));
121 let global_memory_usage = Arc::new(AtomicUsize::new(0));
122 let creator = BloomFilterFulltextIndexCreator::new(
123 config,
124 options.granularity as _,
125 options.false_positive_rate(),
126 temp_file_provider,
127 global_memory_usage,
128 Some(mem_limit),
129 );
130 AltFulltextCreator::Bloom(creator)
131 }
132 };
133
134 creators.insert(
135 column_id,
136 SingleCreator {
137 column_id,
138 column_name: column.column_schema.name.clone(),
139 inner,
140 compress,
141 },
142 );
143 }
144
145 Ok((!creators.is_empty()).then(move || Self {
146 creators,
147 aborted: false,
148 stats: Statistics::new(TYPE_FULLTEXT_INDEX),
149 }))
150 }
151
152 pub async fn update(&mut self, batch: &mut Batch) -> Result<()> {
154 ensure!(!self.aborted, OperateAbortedIndexSnafu);
155
156 if let Err(update_err) = self.do_update(batch).await {
157 if let Err(err) = self.do_abort().await {
158 if cfg!(any(test, feature = "test")) {
159 panic!("Failed to abort index creator, err: {err}");
160 } else {
161 warn!(err; "Failed to abort index creator");
162 }
163 }
164 return Err(update_err);
165 }
166
167 Ok(())
168 }
169
170 pub async fn update_flat(&mut self, batch: &RecordBatch) -> Result<()> {
172 ensure!(!self.aborted, OperateAbortedIndexSnafu);
173
174 if batch.num_rows() == 0 {
175 return Ok(());
176 }
177
178 if let Err(update_err) = self.do_update_flat(batch).await {
179 if let Err(err) = self.do_abort().await {
180 if cfg!(any(test, feature = "test")) {
181 panic!("Failed to abort index creator, err: {err}");
182 } else {
183 warn!(err; "Failed to abort index creator");
184 }
185 }
186 return Err(update_err);
187 }
188
189 Ok(())
190 }
191
192 pub async fn finish(
194 &mut self,
195 puffin_writer: &mut SstPuffinWriter,
196 ) -> Result<(RowCount, ByteCount)> {
197 ensure!(!self.aborted, OperateAbortedIndexSnafu);
198
199 match self.do_finish(puffin_writer).await {
200 Ok(()) => Ok((self.stats.row_count(), self.stats.byte_count())),
201 Err(finish_err) => {
202 if let Err(err) = self.do_abort().await {
203 if cfg!(any(test, feature = "test")) {
204 panic!("Failed to abort index creator, err: {err}");
205 } else {
206 warn!(err; "Failed to abort index creator");
207 }
208 }
209 Err(finish_err)
210 }
211 }
212 }
213
214 pub async fn abort(&mut self) -> Result<()> {
216 if self.aborted {
217 return Ok(());
218 }
219
220 self.do_abort().await
221 }
222
223 pub fn memory_usage(&self) -> usize {
225 self.creators.values().map(|c| c.inner.memory_usage()).sum()
226 }
227
228 pub fn column_ids(&self) -> impl Iterator<Item = ColumnId> + '_ {
230 self.creators.keys().copied()
231 }
232}
233
234impl FulltextIndexer {
235 async fn do_update(&mut self, batch: &mut Batch) -> Result<()> {
236 let mut guard = self.stats.record_update();
237 guard.inc_row_count(batch.num_rows());
238
239 for creator in self.creators.values_mut() {
240 creator.update(batch).await?;
241 }
242
243 Ok(())
244 }
245
246 async fn do_update_flat(&mut self, batch: &RecordBatch) -> Result<()> {
247 let mut guard = self.stats.record_update();
248 guard.inc_row_count(batch.num_rows());
249
250 for creator in self.creators.values_mut() {
251 creator.update_flat(batch).await?;
252 }
253
254 Ok(())
255 }
256
257 async fn do_finish(&mut self, puffin_writer: &mut SstPuffinWriter) -> Result<()> {
258 let mut guard = self.stats.record_finish();
259
260 let mut written_bytes = 0;
261 for creator in self.creators.values_mut() {
262 written_bytes += creator.finish(puffin_writer).await?;
263 }
264
265 guard.inc_byte_count(written_bytes);
266 Ok(())
267 }
268
269 async fn do_abort(&mut self) -> Result<()> {
270 let _guard = self.stats.record_cleanup();
271
272 self.aborted = true;
273
274 for (_, mut creator) in self.creators.drain() {
275 creator.abort().await?;
276 }
277
278 Ok(())
279 }
280}
281
282struct SingleCreator {
284 column_id: ColumnId,
286 column_name: String,
288 inner: AltFulltextCreator,
290 compress: bool,
292}
293
294impl SingleCreator {
295 async fn update(&mut self, batch: &mut Batch) -> Result<()> {
296 let text_column = batch
297 .fields()
298 .iter()
299 .find(|c| c.column_id == self.column_id);
300 match text_column {
301 Some(column) => {
302 let data = column
303 .data
304 .cast(&ConcreteDataType::string_datatype())
305 .context(CastVectorSnafu {
306 from: column.data.data_type(),
307 to: ConcreteDataType::string_datatype(),
308 })?;
309
310 for i in 0..batch.num_rows() {
311 let data = data.get_ref(i);
312 let text = data
313 .try_into_string()
314 .context(DataTypeMismatchSnafu)?
315 .unwrap_or_default();
316 self.inner.push_text(text).await?;
317 }
318 }
319 _ => {
320 for _ in 0..batch.num_rows() {
324 self.inner.push_text("").await?;
325 }
326 }
327 }
328
329 Ok(())
330 }
331
332 async fn update_flat(&mut self, batch: &RecordBatch) -> Result<()> {
333 if let Some(column_array) = batch.column_by_name(&self.column_name) {
335 match column_array.data_type() {
338 DataType::Utf8 => {
339 let string_array = column_array.as_any().downcast_ref::<StringArray>().unwrap();
340 for text_opt in string_array.iter() {
341 let text = text_opt.unwrap_or_default();
342 self.inner.push_text(text).await?;
343 }
344 }
345 DataType::LargeUtf8 => {
346 let large_string_array = column_array
347 .as_any()
348 .downcast_ref::<LargeStringArray>()
349 .unwrap();
350 for text_opt in large_string_array.iter() {
351 let text = text_opt.unwrap_or_default();
352 self.inner.push_text(text).await?;
353 }
354 }
355 _ => {
356 let array = datatypes::arrow::compute::cast(column_array, &DataType::Utf8)
358 .context(ComputeArrowSnafu)?;
359 let string_array = array.as_any().downcast_ref::<StringArray>().unwrap();
360 for text_opt in string_array.iter() {
361 let text = text_opt.unwrap_or_default();
362 self.inner.push_text(text).await?;
363 }
364 }
365 }
366 } else {
367 for _ in 0..batch.num_rows() {
371 self.inner.push_text("").await?;
372 }
373 }
374
375 Ok(())
376 }
377
378 async fn finish(&mut self, puffin_writer: &mut SstPuffinWriter) -> Result<ByteCount> {
379 let options = PutOptions {
380 compression: self.compress.then_some(CompressionCodec::Zstd),
381 };
382 self.inner
383 .finish(puffin_writer, &self.column_id, options)
384 .await
385 }
386
387 async fn abort(&mut self) -> Result<()> {
388 self.inner.abort(&self.column_id).await;
389 Ok(())
390 }
391}
392
393#[allow(dead_code, clippy::large_enum_variant)]
394enum AltFulltextCreator {
396 Tantivy(TantivyFulltextIndexCreator),
397 Bloom(BloomFilterFulltextIndexCreator),
398}
399
400impl AltFulltextCreator {
401 async fn push_text(&mut self, text: &str) -> Result<()> {
402 match self {
403 Self::Tantivy(creator) => creator.push_text(text).await.context(FulltextPushTextSnafu),
404 Self::Bloom(creator) => creator.push_text(text).await.context(FulltextPushTextSnafu),
405 }
406 }
407
408 fn memory_usage(&self) -> usize {
409 match self {
410 Self::Tantivy(creator) => creator.memory_usage(),
411 Self::Bloom(creator) => creator.memory_usage(),
412 }
413 }
414
415 async fn finish(
416 &mut self,
417 puffin_writer: &mut SstPuffinWriter,
418 column_id: &ColumnId,
419 put_options: PutOptions,
420 ) -> Result<ByteCount> {
421 match self {
422 Self::Tantivy(creator) => {
423 let blob_key = format!(
424 "{INDEX_BLOB_TYPE_TANTIVY}-{}",
425 IndexTarget::ColumnId(*column_id)
426 );
427 creator
428 .finish(puffin_writer, &blob_key, put_options)
429 .await
430 .context(FulltextFinishSnafu)
431 }
432 Self::Bloom(creator) => {
433 let blob_key = format!(
434 "{INDEX_BLOB_TYPE_BLOOM}-{}",
435 IndexTarget::ColumnId(*column_id)
436 );
437 creator
438 .finish(puffin_writer, &blob_key, put_options)
439 .await
440 .context(FulltextFinishSnafu)
441 }
442 }
443 }
444
445 async fn abort(&mut self, column_id: &ColumnId) {
446 match self {
447 Self::Tantivy(creator) => {
448 if let Err(err) = creator.abort().await {
449 warn!(err; "Failed to abort the fulltext index creator in the Tantivy flavor, col_id: {:?}", column_id);
450 }
451 }
452 Self::Bloom(creator) => {
453 if let Err(err) = creator.abort().await {
454 warn!(err; "Failed to abort the fulltext index creator in the Bloom Filter flavor, col_id: {:?}", column_id);
455 }
456 }
457 }
458 }
459}
460
461#[cfg(test)]
462mod tests {
463 use std::collections::{BTreeMap, BTreeSet};
464 use std::sync::Arc;
465
466 use api::v1::SemanticType;
467 use common_base::BitVec;
468 use datatypes::data_type::DataType;
469 use datatypes::schema::{ColumnSchema, FulltextAnalyzer, FulltextOptions};
470 use datatypes::vectors::{UInt8Vector, UInt64Vector};
471 use futures::FutureExt;
472 use futures::future::BoxFuture;
473 use index::fulltext_index::search::RowId;
474 use object_store::ObjectStore;
475 use object_store::services::Memory;
476 use puffin::puffin_manager::{PuffinManager, PuffinWriter};
477 use store_api::metadata::{ColumnMetadata, RegionMetadataBuilder, RegionMetadataRef};
478 use store_api::region_request::PathType;
479 use store_api::storage::{ConcreteDataType, FileId, RegionId};
480
481 use super::*;
482 use crate::access_layer::RegionFilePathFactory;
483 use crate::read::{Batch, BatchColumn};
484 use crate::sst::file::RegionFileId;
485 use crate::sst::index::fulltext_index::applier::FulltextIndexApplier;
486 use crate::sst::index::fulltext_index::applier::builder::{
487 FulltextQuery, FulltextRequest, FulltextTerm,
488 };
489 use crate::sst::index::puffin_manager::PuffinManagerFactory;
490
491 fn mock_object_store() -> ObjectStore {
492 ObjectStore::new(Memory::default()).unwrap().finish()
493 }
494
495 async fn new_intm_mgr(path: impl AsRef<str>) -> IntermediateManager {
496 IntermediateManager::init_fs(path).await.unwrap()
497 }
498
499 fn mock_region_metadata(backend: FulltextBackend) -> RegionMetadataRef {
500 let mut builder = RegionMetadataBuilder::new(RegionId::new(1, 2));
501 builder
502 .push_column_metadata(ColumnMetadata {
503 column_schema: ColumnSchema::new(
504 "text_english_case_sensitive",
505 ConcreteDataType::string_datatype(),
506 true,
507 )
508 .with_fulltext_options(FulltextOptions::new_unchecked(
509 true,
510 FulltextAnalyzer::English,
511 true,
512 backend.clone(),
513 1,
514 0.01,
515 ))
516 .unwrap(),
517 semantic_type: SemanticType::Field,
518 column_id: 1,
519 })
520 .push_column_metadata(ColumnMetadata {
521 column_schema: ColumnSchema::new(
522 "text_english_case_insensitive",
523 ConcreteDataType::string_datatype(),
524 true,
525 )
526 .with_fulltext_options(FulltextOptions::new_unchecked(
527 true,
528 FulltextAnalyzer::English,
529 false,
530 backend.clone(),
531 1,
532 0.01,
533 ))
534 .unwrap(),
535 semantic_type: SemanticType::Field,
536 column_id: 2,
537 })
538 .push_column_metadata(ColumnMetadata {
539 column_schema: ColumnSchema::new(
540 "text_chinese",
541 ConcreteDataType::string_datatype(),
542 true,
543 )
544 .with_fulltext_options(FulltextOptions::new_unchecked(
545 true,
546 FulltextAnalyzer::Chinese,
547 false,
548 backend.clone(),
549 1,
550 0.01,
551 ))
552 .unwrap(),
553 semantic_type: SemanticType::Field,
554 column_id: 3,
555 })
556 .push_column_metadata(ColumnMetadata {
557 column_schema: ColumnSchema::new(
558 "ts",
559 ConcreteDataType::timestamp_millisecond_datatype(),
560 false,
561 ),
562 semantic_type: SemanticType::Timestamp,
563 column_id: 4,
564 });
565
566 Arc::new(builder.build().unwrap())
567 }
568
569 fn new_batch(
570 rows: &[(
571 Option<&str>, Option<&str>, Option<&str>, )],
575 ) -> Batch {
576 let mut vec_english_sensitive =
577 ConcreteDataType::string_datatype().create_mutable_vector(0);
578 let mut vec_english_insensitive =
579 ConcreteDataType::string_datatype().create_mutable_vector(0);
580 let mut vec_chinese = ConcreteDataType::string_datatype().create_mutable_vector(0);
581
582 for (text_english_case_sensitive, text_english_case_insensitive, text_chinese) in rows {
583 match text_english_case_sensitive {
584 Some(s) => vec_english_sensitive.push_value_ref(&(*s).into()),
585 None => vec_english_sensitive.push_null(),
586 }
587 match text_english_case_insensitive {
588 Some(s) => vec_english_insensitive.push_value_ref(&(*s).into()),
589 None => vec_english_insensitive.push_null(),
590 }
591 match text_chinese {
592 Some(s) => vec_chinese.push_value_ref(&(*s).into()),
593 None => vec_chinese.push_null(),
594 }
595 }
596
597 let num_rows = vec_english_sensitive.len();
598 Batch::new(
599 vec![],
600 Arc::new(UInt64Vector::from_iter_values(
601 (0..num_rows).map(|n| n as u64),
602 )),
603 Arc::new(UInt64Vector::from_iter_values(std::iter::repeat_n(
604 0, num_rows,
605 ))),
606 Arc::new(UInt8Vector::from_iter_values(std::iter::repeat_n(
607 1, num_rows,
608 ))),
609 vec![
610 BatchColumn {
611 column_id: 1,
612 data: vec_english_sensitive.to_vector(),
613 },
614 BatchColumn {
615 column_id: 2,
616 data: vec_english_insensitive.to_vector(),
617 },
618 BatchColumn {
619 column_id: 3,
620 data: vec_chinese.to_vector(),
621 },
622 ],
623 )
624 .unwrap()
625 }
626
627 async fn build_fulltext_applier_factory(
636 prefix: &str,
637 backend: FulltextBackend,
638 rows: &[(
639 Option<&str>, Option<&str>, Option<&str>, )],
643 ) -> impl Fn(
644 Vec<(ColumnId, &str)>,
645 Vec<(ColumnId, Vec<(bool, &str)>)>,
646 Option<BitVec>,
647 ) -> BoxFuture<'static, Option<BTreeSet<RowId>>> {
648 let (d, factory) = PuffinManagerFactory::new_for_test_async(prefix).await;
649 let table_dir = "table0".to_string();
650 let sst_file_id = FileId::random();
651 let object_store = mock_object_store();
652 let region_metadata = mock_region_metadata(backend.clone());
653 let intm_mgr = new_intm_mgr(d.path().to_string_lossy()).await;
654
655 let mut indexer = FulltextIndexer::new(
656 ®ion_metadata.region_id,
657 &sst_file_id,
658 &intm_mgr,
659 ®ion_metadata,
660 true,
661 1024,
662 )
663 .await
664 .unwrap()
665 .unwrap();
666
667 let mut batch = new_batch(rows);
668 indexer.update(&mut batch).await.unwrap();
669
670 let puffin_manager = factory.build(
671 object_store.clone(),
672 RegionFilePathFactory::new(table_dir.clone(), PathType::Bare),
673 );
674 let region_file_id = RegionFileId::new(region_metadata.region_id, sst_file_id);
675 let mut writer = puffin_manager.writer(®ion_file_id).await.unwrap();
676 let _ = indexer.finish(&mut writer).await.unwrap();
677 writer.finish().await.unwrap();
678
679 move |queries: Vec<(ColumnId, &str)>,
680 terms_requests: Vec<(ColumnId, Vec<(bool, &str)>)>,
681 coarse_mask: Option<BitVec>| {
682 let _d = &d;
683 let table_dir = table_dir.clone();
684 let object_store = object_store.clone();
685 let factory = factory.clone();
686
687 let mut requests: BTreeMap<ColumnId, FulltextRequest> = BTreeMap::new();
688
689 for (column_id, query) in queries {
691 requests
692 .entry(column_id)
693 .or_default()
694 .queries
695 .push(FulltextQuery(query.to_string()));
696 }
697
698 for (column_id, terms) in terms_requests {
700 let fulltext_terms = terms
701 .into_iter()
702 .map(|(col_lowered, term)| FulltextTerm {
703 col_lowered,
704 term: term.to_string(),
705 })
706 .collect::<Vec<_>>();
707
708 requests
709 .entry(column_id)
710 .or_default()
711 .terms
712 .extend(fulltext_terms);
713 }
714
715 let applier = FulltextIndexApplier::new(
716 table_dir,
717 PathType::Bare,
718 object_store,
719 requests,
720 factory,
721 );
722
723 let backend = backend.clone();
724 async move {
725 match backend {
726 FulltextBackend::Tantivy => {
727 applier.apply_fine(region_file_id, None).await.unwrap()
728 }
729 FulltextBackend::Bloom => {
730 let coarse_mask = coarse_mask.unwrap_or_default();
731 let row_groups = (0..coarse_mask.len()).map(|i| (1, coarse_mask[i]));
732 let resp = applier
734 .apply_coarse(region_file_id, None, row_groups)
735 .await
736 .unwrap();
737 resp.map(|r| {
738 r.into_iter()
739 .filter(|(_, ranges)| !ranges.is_empty())
740 .map(|(row_group_id, _)| row_group_id as RowId)
741 .collect()
742 })
743 }
744 }
745 }
746 .boxed()
747 }
748 }
749
750 fn rows(row_ids: impl IntoIterator<Item = RowId>) -> BTreeSet<RowId> {
751 row_ids.into_iter().collect()
752 }
753
754 #[tokio::test]
755 async fn test_fulltext_index_basic_case_sensitive_tantivy() {
756 let applier_factory = build_fulltext_applier_factory(
757 "test_fulltext_index_basic_case_sensitive_tantivy_",
758 FulltextBackend::Tantivy,
759 &[
760 (Some("hello"), None, None),
761 (Some("world"), None, None),
762 (None, None, None),
763 (Some("Hello, World"), None, None),
764 ],
765 )
766 .await;
767
768 let row_ids = applier_factory(vec![(1, "hello")], vec![], None).await;
769 assert_eq!(row_ids, Some(rows([0])));
770
771 let row_ids = applier_factory(vec![(1, "world")], vec![], None).await;
772 assert_eq!(row_ids, Some(rows([1])));
773
774 let row_ids = applier_factory(vec![(1, "Hello")], vec![], None).await;
775 assert_eq!(row_ids, Some(rows([3])));
776
777 let row_ids = applier_factory(vec![(1, "World")], vec![], None).await;
778 assert_eq!(row_ids, Some(rows([3])));
779
780 let row_ids = applier_factory(vec![], vec![(1, vec![(false, "hello")])], None).await;
781 assert_eq!(row_ids, Some(rows([0])));
782
783 let row_ids = applier_factory(vec![], vec![(1, vec![(true, "hello")])], None).await;
784 assert_eq!(row_ids, None);
785
786 let row_ids = applier_factory(vec![], vec![(1, vec![(false, "world")])], None).await;
787 assert_eq!(row_ids, Some(rows([1])));
788
789 let row_ids = applier_factory(vec![], vec![(1, vec![(true, "world")])], None).await;
790 assert_eq!(row_ids, None);
791
792 let row_ids = applier_factory(vec![], vec![(1, vec![(false, "Hello")])], None).await;
793 assert_eq!(row_ids, Some(rows([3])));
794
795 let row_ids = applier_factory(vec![], vec![(1, vec![(true, "Hello")])], None).await;
796 assert_eq!(row_ids, None);
797
798 let row_ids = applier_factory(vec![], vec![(1, vec![(false, "Hello, World")])], None).await;
799 assert_eq!(row_ids, Some(rows([3])));
800
801 let row_ids = applier_factory(vec![], vec![(1, vec![(true, "Hello, World")])], None).await;
802 assert_eq!(row_ids, None);
803 }
804
805 #[tokio::test]
806 async fn test_fulltext_index_basic_case_sensitive_bloom() {
807 let applier_factory = build_fulltext_applier_factory(
808 "test_fulltext_index_basic_case_sensitive_bloom_",
809 FulltextBackend::Bloom,
810 &[
811 (Some("hello"), None, None),
812 (Some("world"), None, None),
813 (None, None, None),
814 (Some("Hello, World"), None, None),
815 ],
816 )
817 .await;
818
819 let row_ids = applier_factory(
820 vec![],
821 vec![(1, vec![(false, "hello")])],
822 Some(BitVec::from_slice(&[0b1111])),
823 )
824 .await;
825 assert_eq!(row_ids, Some(rows([0])));
826
827 let row_ids = applier_factory(
828 vec![],
829 vec![(1, vec![(false, "hello")])],
830 Some(BitVec::from_slice(&[0b1110])), )
832 .await;
833 assert_eq!(row_ids, Some(rows([])));
834
835 let row_ids = applier_factory(
836 vec![],
837 vec![(1, vec![(true, "hello")])],
838 Some(BitVec::from_slice(&[0b1111])),
839 )
840 .await;
841 assert_eq!(row_ids, None);
842
843 let row_ids = applier_factory(
844 vec![],
845 vec![(1, vec![(false, "world")])],
846 Some(BitVec::from_slice(&[0b1111])),
847 )
848 .await;
849 assert_eq!(row_ids, Some(rows([1])));
850
851 let row_ids = applier_factory(
852 vec![],
853 vec![(1, vec![(false, "world")])],
854 Some(BitVec::from_slice(&[0b1101])), )
856 .await;
857 assert_eq!(row_ids, Some(rows([])));
858
859 let row_ids = applier_factory(
860 vec![],
861 vec![(1, vec![(true, "world")])],
862 Some(BitVec::from_slice(&[0b1111])),
863 )
864 .await;
865 assert_eq!(row_ids, None);
866
867 let row_ids = applier_factory(
868 vec![],
869 vec![(1, vec![(false, "Hello")])],
870 Some(BitVec::from_slice(&[0b1111])),
871 )
872 .await;
873 assert_eq!(row_ids, Some(rows([3])));
874
875 let row_ids = applier_factory(
876 vec![],
877 vec![(1, vec![(false, "Hello")])],
878 Some(BitVec::from_slice(&[0b0111])), )
880 .await;
881 assert_eq!(row_ids, Some(rows([])));
882
883 let row_ids = applier_factory(
884 vec![],
885 vec![(1, vec![(true, "Hello")])],
886 Some(BitVec::from_slice(&[0b1111])),
887 )
888 .await;
889 assert_eq!(row_ids, None);
890
891 let row_ids = applier_factory(
892 vec![],
893 vec![(1, vec![(false, "Hello, World")])],
894 Some(BitVec::from_slice(&[0b1111])),
895 )
896 .await;
897 assert_eq!(row_ids, Some(rows([3])));
898
899 let row_ids = applier_factory(
900 vec![],
901 vec![(1, vec![(false, "Hello, World")])],
902 Some(BitVec::from_slice(&[0b0111])), )
904 .await;
905 assert_eq!(row_ids, Some(rows([])));
906
907 let row_ids = applier_factory(
908 vec![],
909 vec![(1, vec![(true, "Hello, World")])],
910 Some(BitVec::from_slice(&[0b1111])),
911 )
912 .await;
913 assert_eq!(row_ids, None);
914 }
915
916 #[tokio::test]
917 async fn test_fulltext_index_basic_case_insensitive_tantivy() {
918 let applier_factory = build_fulltext_applier_factory(
919 "test_fulltext_index_basic_case_insensitive_tantivy_",
920 FulltextBackend::Tantivy,
921 &[
922 (None, Some("hello"), None),
923 (None, None, None),
924 (None, Some("world"), None),
925 (None, Some("Hello, World"), None),
926 ],
927 )
928 .await;
929
930 let row_ids = applier_factory(vec![(2, "hello")], vec![], None).await;
931 assert_eq!(row_ids, Some(rows([0, 3])));
932
933 let row_ids = applier_factory(vec![(2, "world")], vec![], None).await;
934 assert_eq!(row_ids, Some(rows([2, 3])));
935
936 let row_ids = applier_factory(vec![(2, "Hello")], vec![], None).await;
937 assert_eq!(row_ids, Some(rows([0, 3])));
938
939 let row_ids = applier_factory(vec![(2, "World")], vec![], None).await;
940 assert_eq!(row_ids, Some(rows([2, 3])));
941
942 let row_ids = applier_factory(vec![], vec![(2, vec![(false, "hello")])], None).await;
943 assert_eq!(row_ids, Some(rows([0, 3])));
944
945 let row_ids = applier_factory(vec![], vec![(2, vec![(true, "hello")])], None).await;
946 assert_eq!(row_ids, Some(rows([0, 3])));
947
948 let row_ids = applier_factory(vec![], vec![(2, vec![(false, "world")])], None).await;
949 assert_eq!(row_ids, Some(rows([2, 3])));
950
951 let row_ids = applier_factory(vec![], vec![(2, vec![(true, "world")])], None).await;
952 assert_eq!(row_ids, Some(rows([2, 3])));
953
954 let row_ids = applier_factory(vec![], vec![(2, vec![(false, "Hello")])], None).await;
955 assert_eq!(row_ids, Some(rows([0, 3])));
956
957 let row_ids = applier_factory(vec![], vec![(2, vec![(true, "Hello")])], None).await;
958 assert_eq!(row_ids, Some(rows([0, 3])));
959
960 let row_ids = applier_factory(vec![], vec![(2, vec![(false, "World")])], None).await;
961 assert_eq!(row_ids, Some(rows([2, 3])));
962
963 let row_ids = applier_factory(vec![], vec![(2, vec![(true, "World")])], None).await;
964 assert_eq!(row_ids, Some(rows([2, 3])));
965 }
966
967 #[tokio::test]
968 async fn test_fulltext_index_basic_case_insensitive_bloom() {
969 let applier_factory = build_fulltext_applier_factory(
970 "test_fulltext_index_basic_case_insensitive_bloom_",
971 FulltextBackend::Bloom,
972 &[
973 (None, Some("hello"), None),
974 (None, None, None),
975 (None, Some("world"), None),
976 (None, Some("Hello, World"), None),
977 ],
978 )
979 .await;
980
981 let row_ids = applier_factory(
982 vec![],
983 vec![(2, vec![(false, "hello")])],
984 Some(BitVec::from_slice(&[0b1111])),
985 )
986 .await;
987 assert_eq!(row_ids, Some(rows([0, 3])));
988
989 let row_ids = applier_factory(
990 vec![],
991 vec![(2, vec![(false, "hello")])],
992 Some(BitVec::from_slice(&[0b1110])), )
994 .await;
995 assert_eq!(row_ids, Some(rows([3])));
996
997 let row_ids = applier_factory(
998 vec![],
999 vec![(2, vec![(true, "hello")])],
1000 Some(BitVec::from_slice(&[0b1111])),
1001 )
1002 .await;
1003 assert_eq!(row_ids, Some(rows([0, 3])));
1004
1005 let row_ids = applier_factory(
1006 vec![],
1007 vec![(2, vec![(true, "hello")])],
1008 Some(BitVec::from_slice(&[0b1110])), )
1010 .await;
1011 assert_eq!(row_ids, Some(rows([3])));
1012
1013 let row_ids = applier_factory(
1014 vec![],
1015 vec![(2, vec![(false, "world")])],
1016 Some(BitVec::from_slice(&[0b1111])),
1017 )
1018 .await;
1019 assert_eq!(row_ids, Some(rows([2, 3])));
1020
1021 let row_ids = applier_factory(
1022 vec![],
1023 vec![(2, vec![(false, "world")])],
1024 Some(BitVec::from_slice(&[0b1011])), )
1026 .await;
1027 assert_eq!(row_ids, Some(rows([3])));
1028
1029 let row_ids = applier_factory(
1030 vec![],
1031 vec![(2, vec![(true, "world")])],
1032 Some(BitVec::from_slice(&[0b1111])),
1033 )
1034 .await;
1035 assert_eq!(row_ids, Some(rows([2, 3])));
1036
1037 let row_ids = applier_factory(
1038 vec![],
1039 vec![(2, vec![(true, "world")])],
1040 Some(BitVec::from_slice(&[0b1011])), )
1042 .await;
1043 assert_eq!(row_ids, Some(rows([3])));
1044
1045 let row_ids = applier_factory(
1046 vec![],
1047 vec![(2, vec![(false, "Hello")])],
1048 Some(BitVec::from_slice(&[0b1111])),
1049 )
1050 .await;
1051 assert_eq!(row_ids, Some(rows([0, 3])));
1052
1053 let row_ids = applier_factory(
1054 vec![],
1055 vec![(2, vec![(false, "Hello")])],
1056 Some(BitVec::from_slice(&[0b0111])), )
1058 .await;
1059 assert_eq!(row_ids, Some(rows([0])));
1060
1061 let row_ids = applier_factory(
1062 vec![],
1063 vec![(2, vec![(true, "Hello")])],
1064 Some(BitVec::from_slice(&[0b1111])),
1065 )
1066 .await;
1067 assert_eq!(row_ids, Some(rows([0, 3])));
1068
1069 let row_ids = applier_factory(
1070 vec![],
1071 vec![(2, vec![(true, "Hello")])],
1072 Some(BitVec::from_slice(&[0b1110])), )
1074 .await;
1075 assert_eq!(row_ids, Some(rows([3])));
1076
1077 let row_ids = applier_factory(
1078 vec![],
1079 vec![(2, vec![(false, "World")])],
1080 Some(BitVec::from_slice(&[0b1111])),
1081 )
1082 .await;
1083 assert_eq!(row_ids, Some(rows([2, 3])));
1084
1085 let row_ids = applier_factory(
1086 vec![],
1087 vec![(2, vec![(false, "World")])],
1088 Some(BitVec::from_slice(&[0b0111])), )
1090 .await;
1091 assert_eq!(row_ids, Some(rows([2])));
1092
1093 let row_ids = applier_factory(
1094 vec![],
1095 vec![(2, vec![(true, "World")])],
1096 Some(BitVec::from_slice(&[0b1111])),
1097 )
1098 .await;
1099 assert_eq!(row_ids, Some(rows([2, 3])));
1100
1101 let row_ids = applier_factory(
1102 vec![],
1103 vec![(2, vec![(true, "World")])],
1104 Some(BitVec::from_slice(&[0b1011])), )
1106 .await;
1107 assert_eq!(row_ids, Some(rows([3])));
1108 }
1109
1110 #[tokio::test]
1111 async fn test_fulltext_index_basic_chinese_tantivy() {
1112 let applier_factory = build_fulltext_applier_factory(
1113 "test_fulltext_index_basic_chinese_tantivy_",
1114 FulltextBackend::Tantivy,
1115 &[
1116 (None, None, Some("你好")),
1117 (None, None, None),
1118 (None, None, Some("世界")),
1119 (None, None, Some("你好,世界")),
1120 ],
1121 )
1122 .await;
1123
1124 let row_ids = applier_factory(vec![(3, "你好")], vec![], None).await;
1125 assert_eq!(row_ids, Some(rows([0, 3])));
1126
1127 let row_ids = applier_factory(vec![(3, "世界")], vec![], None).await;
1128 assert_eq!(row_ids, Some(rows([2, 3])));
1129
1130 let row_ids = applier_factory(vec![], vec![(3, vec![(false, "你好")])], None).await;
1131 assert_eq!(row_ids, Some(rows([0, 3])));
1132
1133 let row_ids = applier_factory(vec![], vec![(3, vec![(false, "世界")])], None).await;
1134 assert_eq!(row_ids, Some(rows([2, 3])));
1135 }
1136
1137 #[tokio::test]
1138 async fn test_fulltext_index_basic_chinese_bloom() {
1139 let applier_factory = build_fulltext_applier_factory(
1140 "test_fulltext_index_basic_chinese_bloom_",
1141 FulltextBackend::Bloom,
1142 &[
1143 (None, None, Some("你好")),
1144 (None, None, None),
1145 (None, None, Some("世界")),
1146 (None, None, Some("你好,世界")),
1147 ],
1148 )
1149 .await;
1150
1151 let row_ids = applier_factory(
1152 vec![],
1153 vec![(3, vec![(false, "你好")])],
1154 Some(BitVec::from_slice(&[0b1111])),
1155 )
1156 .await;
1157 assert_eq!(row_ids, Some(rows([0, 3])));
1158
1159 let row_ids = applier_factory(
1160 vec![],
1161 vec![(3, vec![(false, "你好")])],
1162 Some(BitVec::from_slice(&[0b1110])), )
1164 .await;
1165 assert_eq!(row_ids, Some(rows([3])));
1166
1167 let row_ids = applier_factory(
1168 vec![],
1169 vec![(3, vec![(false, "世界")])],
1170 Some(BitVec::from_slice(&[0b1111])),
1171 )
1172 .await;
1173 assert_eq!(row_ids, Some(rows([2, 3])));
1174
1175 let row_ids = applier_factory(
1176 vec![],
1177 vec![(3, vec![(false, "世界")])],
1178 Some(BitVec::from_slice(&[0b1011])), )
1180 .await;
1181 assert_eq!(row_ids, Some(rows([3])));
1182 }
1183
1184 #[tokio::test]
1185 async fn test_fulltext_index_multi_terms_case_sensitive_tantivy() {
1186 let applier_factory = build_fulltext_applier_factory(
1187 "test_fulltext_index_multi_terms_case_sensitive_tantivy_",
1188 FulltextBackend::Tantivy,
1189 &[
1190 (Some("Hello"), None, None),
1191 (Some("World"), None, None),
1192 (None, None, None),
1193 (Some("Hello, World"), None, None),
1194 ],
1195 )
1196 .await;
1197
1198 let row_ids = applier_factory(
1199 vec![],
1200 vec![(1, vec![(false, "hello"), (false, "world")])],
1201 None,
1202 )
1203 .await;
1204 assert_eq!(row_ids, Some(rows([])));
1205
1206 let row_ids = applier_factory(
1207 vec![],
1208 vec![(1, vec![(false, "Hello"), (false, "World")])],
1209 None,
1210 )
1211 .await;
1212 assert_eq!(row_ids, Some(rows([3])));
1213
1214 let row_ids = applier_factory(
1215 vec![],
1216 vec![(1, vec![(true, "Hello"), (false, "World")])],
1217 None,
1218 )
1219 .await;
1220 assert_eq!(row_ids, Some(rows([1, 3])));
1221
1222 let row_ids = applier_factory(
1223 vec![],
1224 vec![(1, vec![(false, "Hello"), (true, "World")])],
1225 None,
1226 )
1227 .await;
1228 assert_eq!(row_ids, Some(rows([0, 3])));
1229
1230 let row_ids = applier_factory(
1231 vec![],
1232 vec![(1, vec![(true, "Hello"), (true, "World")])],
1233 None,
1234 )
1235 .await;
1236 assert_eq!(row_ids, None);
1237 }
1238
1239 #[tokio::test]
1240 async fn test_fulltext_index_multi_terms_case_sensitive_bloom() {
1241 let applier_factory = build_fulltext_applier_factory(
1242 "test_fulltext_index_multi_terms_case_sensitive_bloom_",
1243 FulltextBackend::Bloom,
1244 &[
1245 (Some("Hello"), None, None),
1246 (Some("World"), None, None),
1247 (None, None, None),
1248 (Some("Hello, World"), None, None),
1249 ],
1250 )
1251 .await;
1252
1253 let row_ids = applier_factory(
1254 vec![],
1255 vec![(1, vec![(false, "hello"), (false, "world")])],
1256 Some(BitVec::from_slice(&[0b1111])),
1257 )
1258 .await;
1259 assert_eq!(row_ids, Some(rows([])));
1260
1261 let row_ids = applier_factory(
1262 vec![],
1263 vec![(1, vec![(false, "Hello"), (false, "World")])],
1264 Some(BitVec::from_slice(&[0b1111])),
1265 )
1266 .await;
1267 assert_eq!(row_ids, Some(rows([3])));
1268
1269 let row_ids = applier_factory(
1270 vec![],
1271 vec![(1, vec![(true, "Hello"), (false, "World")])],
1272 Some(BitVec::from_slice(&[0b1111])),
1273 )
1274 .await;
1275 assert_eq!(row_ids, Some(rows([1, 3])));
1276
1277 let row_ids = applier_factory(
1278 vec![],
1279 vec![(1, vec![(false, "Hello"), (true, "World")])],
1280 Some(BitVec::from_slice(&[0b1111])),
1281 )
1282 .await;
1283 assert_eq!(row_ids, Some(rows([0, 3])));
1284
1285 let row_ids = applier_factory(
1286 vec![],
1287 vec![(1, vec![(true, "Hello"), (true, "World")])],
1288 Some(BitVec::from_slice(&[0b1111])),
1289 )
1290 .await;
1291 assert_eq!(row_ids, None);
1292 }
1293
1294 #[tokio::test]
1295 async fn test_fulltext_index_multi_terms_case_insensitive_tantivy() {
1296 let applier_factory = build_fulltext_applier_factory(
1297 "test_fulltext_index_multi_terms_case_insensitive_tantivy_",
1298 FulltextBackend::Tantivy,
1299 &[
1300 (None, Some("hello"), None),
1301 (None, None, None),
1302 (None, Some("world"), None),
1303 (None, Some("Hello, World"), None),
1304 ],
1305 )
1306 .await;
1307
1308 let row_ids = applier_factory(
1309 vec![],
1310 vec![(2, vec![(false, "hello"), (false, "world")])],
1311 None,
1312 )
1313 .await;
1314 assert_eq!(row_ids, Some(rows([3])));
1315
1316 let row_ids = applier_factory(
1317 vec![],
1318 vec![(2, vec![(true, "hello"), (false, "world")])],
1319 None,
1320 )
1321 .await;
1322 assert_eq!(row_ids, Some(rows([3])));
1323
1324 let row_ids = applier_factory(
1325 vec![],
1326 vec![(2, vec![(false, "hello"), (true, "world")])],
1327 None,
1328 )
1329 .await;
1330 assert_eq!(row_ids, Some(rows([3])));
1331
1332 let row_ids = applier_factory(
1333 vec![],
1334 vec![(2, vec![(true, "hello"), (true, "world")])],
1335 None,
1336 )
1337 .await;
1338 assert_eq!(row_ids, Some(rows([3])));
1339 }
1340
1341 #[tokio::test]
1342 async fn test_fulltext_index_multi_terms_case_insensitive_bloom() {
1343 let applier_factory = build_fulltext_applier_factory(
1344 "test_fulltext_index_multi_terms_case_insensitive_bloom_",
1345 FulltextBackend::Bloom,
1346 &[
1347 (None, Some("hello"), None),
1348 (None, None, None),
1349 (None, Some("world"), None),
1350 (None, Some("Hello, World"), None),
1351 ],
1352 )
1353 .await;
1354
1355 let row_ids = applier_factory(
1356 vec![],
1357 vec![(2, vec![(false, "hello"), (false, "world")])],
1358 Some(BitVec::from_slice(&[0b1111])),
1359 )
1360 .await;
1361 assert_eq!(row_ids, Some(rows([3])));
1362
1363 let row_ids = applier_factory(
1364 vec![],
1365 vec![(2, vec![(true, "hello"), (false, "world")])],
1366 Some(BitVec::from_slice(&[0b1111])),
1367 )
1368 .await;
1369 assert_eq!(row_ids, Some(rows([3])));
1370
1371 let row_ids = applier_factory(
1372 vec![],
1373 vec![(2, vec![(false, "hello"), (true, "world")])],
1374 Some(BitVec::from_slice(&[0b1111])),
1375 )
1376 .await;
1377 assert_eq!(row_ids, Some(rows([3])));
1378
1379 let row_ids = applier_factory(
1380 vec![],
1381 vec![(2, vec![(true, "hello"), (true, "world")])],
1382 Some(BitVec::from_slice(&[0b1111])),
1383 )
1384 .await;
1385 assert_eq!(row_ids, Some(rows([3])));
1386 }
1387
1388 #[tokio::test]
1389 async fn test_fulltext_index_multi_columns_tantivy() {
1390 let applier_factory = build_fulltext_applier_factory(
1391 "test_fulltext_index_multi_columns_tantivy_",
1392 FulltextBackend::Tantivy,
1393 &[
1394 (Some("Hello"), None, Some("你好")),
1395 (Some("World"), Some("world"), None),
1396 (None, Some("World"), Some("世界")),
1397 (
1398 Some("Hello, World"),
1399 Some("Hello, World"),
1400 Some("你好,世界"),
1401 ),
1402 ],
1403 )
1404 .await;
1405
1406 let row_ids = applier_factory(
1407 vec![(1, "Hello"), (3, "你好")],
1408 vec![(2, vec![(false, "world")])],
1409 None,
1410 )
1411 .await;
1412 assert_eq!(row_ids, Some(rows([3])));
1413
1414 let row_ids =
1415 applier_factory(vec![(2, "World")], vec![(1, vec![(false, "World")])], None).await;
1416 assert_eq!(row_ids, Some(rows([1, 3])));
1417 }
1418
1419 #[tokio::test]
1420 async fn test_fulltext_index_multi_columns_bloom() {
1421 let applier_factory = build_fulltext_applier_factory(
1422 "test_fulltext_index_multi_columns_bloom_",
1423 FulltextBackend::Bloom,
1424 &[
1425 (Some("Hello"), None, Some("你好")),
1426 (Some("World"), Some("world"), None),
1427 (None, Some("World"), Some("世界")),
1428 (
1429 Some("Hello, World"),
1430 Some("Hello, World"),
1431 Some("你好,世界"),
1432 ),
1433 ],
1434 )
1435 .await;
1436
1437 let row_ids = applier_factory(
1438 vec![],
1439 vec![
1440 (1, vec![(false, "Hello")]),
1441 (2, vec![(false, "world")]),
1442 (3, vec![(false, "你好")]),
1443 ],
1444 Some(BitVec::from_slice(&[0b1111])),
1445 )
1446 .await;
1447 assert_eq!(row_ids, Some(rows([3])));
1448
1449 let row_ids = applier_factory(
1450 vec![],
1451 vec![(1, vec![(false, "World")]), (2, vec![(false, "World")])],
1452 Some(BitVec::from_slice(&[0b1111])),
1453 )
1454 .await;
1455 assert_eq!(row_ids, Some(rows([1, 3])));
1456 }
1457}