// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

//! Memtables are write buffers for regions.

use std::collections::BTreeMap;
use std::fmt;
use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
use std::sync::{Arc, Mutex};
use std::time::Duration;

pub use bulk::part::EncodedBulkPart;
use bytes::Bytes;
use common_time::Timestamp;
use datatypes::arrow::record_batch::RecordBatch;
use mito_codec::key_values::KeyValue;
pub use mito_codec::key_values::KeyValues;
use serde::{Deserialize, Serialize};
use store_api::metadata::RegionMetadataRef;
use store_api::storage::{ColumnId, SequenceNumber, SequenceRange};

use crate::config::MitoConfig;
use crate::error::{Result, UnsupportedOperationSnafu};
use crate::flush::WriteBufferManagerRef;
use crate::memtable::bulk::{BulkMemtableBuilder, CompactDispatcher};
use crate::memtable::partition_tree::{PartitionTreeConfig, PartitionTreeMemtableBuilder};
use crate::memtable::time_series::TimeSeriesMemtableBuilder;
use crate::metrics::WRITE_BUFFER_BYTES;
use crate::read::Batch;
use crate::read::prune::PruneTimeIterator;
use crate::read::scan_region::PredicateGroup;
use crate::region::options::{MemtableOptions, MergeMode, RegionOptions};
use crate::sst::FormatType;
use crate::sst::file::FileTimeRange;
use crate::sst::parquet::SstInfo;
use crate::sst::parquet::file_range::PreFilterMode;

mod builder;
pub mod bulk;
pub mod partition_tree;
pub mod simple_bulk_memtable;
mod stats;
pub mod time_partition;
pub mod time_series;
pub(crate) mod version;

pub use bulk::part::{
    BulkPart, BulkPartEncoder, BulkPartMeta, UnorderedPart, record_batch_estimated_size,
    sort_primary_key_record_batch,
};
#[cfg(any(test, feature = "test"))]
pub use time_partition::filter_record_batch;

/// Id for memtables.
///
/// Should be unique under the same region.
pub type MemtableId = u32;

/// Config for memtables.
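///
/// The variant is selected by the `type` field in snake case. A minimal TOML
/// sketch; see the test at the bottom of this file for a full
/// `partition_tree` config:
///
/// ```toml
/// type = "time_series"
/// ```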
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Default)]
#[serde(tag = "type", rename_all = "snake_case")]
pub enum MemtableConfig {
    PartitionTree(PartitionTreeConfig),
    #[default]
    TimeSeries,
}

/// Options for querying ranges from a memtable.
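///
/// A usage sketch built from the setters below (the chosen values are
/// illustrative, not recommendations):
///
/// ```ignore
/// let options = RangesOptions::for_flush()
///     .with_pre_filter_mode(PreFilterMode::All)
///     .with_sequence(None);
/// ```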
#[derive(Clone)]
pub struct RangesOptions {
    /// Whether the ranges are being queried for flush.
    pub for_flush: bool,
    /// Mode to pre-filter columns in ranges.
    pub pre_filter_mode: PreFilterMode,
    /// Predicate to filter the data.
    pub predicate: PredicateGroup,
    /// Sequence range to filter the data.
    pub sequence: Option<SequenceRange>,
}

impl Default for RangesOptions {
    fn default() -> Self {
        Self {
            for_flush: false,
            pre_filter_mode: PreFilterMode::All,
            predicate: PredicateGroup::default(),
            sequence: None,
        }
    }
}

impl RangesOptions {
    /// Creates a new [RangesOptions] for flushing.
    pub fn for_flush() -> Self {
        Self {
            for_flush: true,
            pre_filter_mode: PreFilterMode::All,
            predicate: PredicateGroup::default(),
            sequence: None,
        }
    }

    /// Sets the pre-filter mode.
    #[must_use]
    pub fn with_pre_filter_mode(mut self, pre_filter_mode: PreFilterMode) -> Self {
        self.pre_filter_mode = pre_filter_mode;
        self
    }

    /// Sets the predicate.
    #[must_use]
    pub fn with_predicate(mut self, predicate: PredicateGroup) -> Self {
        self.predicate = predicate;
        self
    }

    /// Sets the sequence range.
    #[must_use]
    pub fn with_sequence(mut self, sequence: Option<SequenceRange>) -> Self {
        self.sequence = sequence;
        self
    }
}

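/// Statistics of a memtable.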
#[derive(Debug, Default, Clone)]
pub struct MemtableStats {
    /// The estimated bytes allocated by this memtable from heap.
    estimated_bytes: usize,
    /// The inclusive time range that this memtable contains. It is None if
    /// and only if the memtable is empty.
    time_range: Option<(Timestamp, Timestamp)>,
    /// Total number of rows in the memtable.
    pub num_rows: usize,
    /// Total number of ranges in the memtable.
    pub num_ranges: usize,
    /// The maximum sequence number in the memtable.
    max_sequence: SequenceNumber,
    /// Estimated number of time series in the memtable.
    series_count: usize,
}

impl MemtableStats {
    /// Attaches the time range to the stats.
    #[cfg(any(test, feature = "test"))]
    pub fn with_time_range(mut self, time_range: Option<(Timestamp, Timestamp)>) -> Self {
        self.time_range = time_range;
        self
    }

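    /// Attaches the max sequence number to the stats.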
    #[cfg(feature = "test")]
    pub fn with_max_sequence(mut self, max_sequence: SequenceNumber) -> Self {
        self.max_sequence = max_sequence;
        self
    }

    /// Returns the estimated bytes allocated by this memtable.
    pub fn bytes_allocated(&self) -> usize {
        self.estimated_bytes
    }

    /// Returns the time range of the memtable.
    pub fn time_range(&self) -> Option<(Timestamp, Timestamp)> {
        self.time_range
    }

    /// Returns the total number of rows in the memtable.
    pub fn num_rows(&self) -> usize {
        self.num_rows
    }

    /// Returns the number of ranges in the memtable.
    pub fn num_ranges(&self) -> usize {
        self.num_ranges
    }

    /// Returns the maximum sequence number in the memtable.
    pub fn max_sequence(&self) -> SequenceNumber {
        self.max_sequence
    }

    /// Returns the estimated series count in the memtable.
    pub fn series_count(&self) -> usize {
        self.series_count
    }
}

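/// Boxed iterator over [Batch]es.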
pub type BoxedBatchIterator = Box<dyn Iterator<Item = Result<Batch>> + Send>;

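/// Boxed iterator over [RecordBatch]es.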
pub type BoxedRecordBatchIterator = Box<dyn Iterator<Item = Result<RecordBatch>> + Send>;

/// Ranges in a memtable.
#[derive(Default)]
pub struct MemtableRanges {
    /// Range IDs and ranges.
    pub ranges: BTreeMap<usize, MemtableRange>,
    /// Statistics of the memtable at query time.
    pub stats: MemtableStats,
}

impl IterBuilder for MemtableRanges {
    fn build(&self, _metrics: Option<MemScanMetrics>) -> Result<BoxedBatchIterator> {
        UnsupportedOperationSnafu {
            err_msg: "MemtableRanges does not support build iterator",
        }
        .fail()
    }

    fn is_record_batch(&self) -> bool {
        self.ranges.values().all(|range| range.is_record_batch())
    }
}

/// In-memory write buffer.
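///
/// A lifecycle sketch (identifiers illustrative): a mutable memtable receives
/// writes, is frozen before flush, and an immutable memtable can be forked
/// into a fresh mutable one:
///
/// ```ignore
/// memtable.write(&key_values)?;
/// memtable.freeze()?;
/// let next = memtable.fork(next_id, &region_metadata);
/// ```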
pub trait Memtable: Send + Sync + fmt::Debug {
    /// Returns the id of this memtable.
    fn id(&self) -> MemtableId;

    /// Writes key values into the memtable.
    fn write(&self, kvs: &KeyValues) -> Result<()>;

    /// Writes one key value pair into the memtable.
    fn write_one(&self, key_value: KeyValue) -> Result<()>;

    /// Writes an encoded bulk part into the memtable.
    fn write_bulk(&self, part: crate::memtable::bulk::part::BulkPart) -> Result<()>;

    /// Scans the memtable.
    /// `projection` selects columns to read; `None` means reading all columns.
    /// `predicate` is the predicate to be pushed down to the memtable.
    ///
    /// # Note
    /// This method should only be used for tests.
    #[cfg(any(test, feature = "test"))]
    fn iter(
        &self,
        projection: Option<&[ColumnId]>,
        predicate: Option<table::predicate::Predicate>,
        sequence: Option<SequenceRange>,
    ) -> Result<BoxedBatchIterator>;

    /// Returns the ranges in the memtable.
    ///
    /// The returned map contains the range id and the range after applying the predicate.
    fn ranges(
        &self,
        projection: Option<&[ColumnId]>,
        options: RangesOptions,
    ) -> Result<MemtableRanges>;

    /// Returns true if the memtable is empty.
    fn is_empty(&self) -> bool;

    /// Turns a mutable memtable into an immutable memtable.
    fn freeze(&self) -> Result<()>;

    /// Returns the [MemtableStats] of this memtable.
    fn stats(&self) -> MemtableStats;

    /// Forks this (immutable) memtable and returns a new mutable memtable with the given memtable `id`.
    ///
    /// A region must freeze the memtable before invoking this method.
    fn fork(&self, id: MemtableId, metadata: &RegionMetadataRef) -> MemtableRef;

    /// Compacts the memtable.
    ///
    /// `for_flush` is true when the flush job calls this method.
    fn compact(&self, for_flush: bool) -> Result<()> {
        let _ = for_flush;
        Ok(())
    }
}

pub type MemtableRef = Arc<dyn Memtable>;

/// Builder to build a new [Memtable].
pub trait MemtableBuilder: Send + Sync + fmt::Debug {
    /// Builds a new memtable instance.
    fn build(&self, id: MemtableId, metadata: &RegionMetadataRef) -> MemtableRef;

    /// Returns true if the memtable supports bulk insert and benefits from it.
    fn use_bulk_insert(&self, metadata: &RegionMetadataRef) -> bool {
        let _metadata = metadata;
        false
    }
}

pub type MemtableBuilderRef = Arc<dyn MemtableBuilder>;

/// Memtable memory allocation tracker.
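///
/// A usage sketch mirroring the tests at the bottom of this file; tracked
/// bytes are released when the tracker is dropped:
///
/// ```ignore
/// let tracker = AllocTracker::new(None);
/// tracker.on_allocation(100);
/// assert_eq!(100, tracker.bytes_allocated());
/// tracker.done_allocating();
/// ```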
#[derive(Default)]
pub struct AllocTracker {
    write_buffer_manager: Option<WriteBufferManagerRef>,
    /// Bytes allocated by the tracker.
    bytes_allocated: AtomicUsize,
    /// Whether allocating is done.
    is_done_allocating: AtomicBool,
}

impl fmt::Debug for AllocTracker {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        f.debug_struct("AllocTracker")
            .field("bytes_allocated", &self.bytes_allocated)
            .field("is_done_allocating", &self.is_done_allocating)
            .finish()
    }
}

impl AllocTracker {
    /// Returns a new [AllocTracker].
    pub fn new(write_buffer_manager: Option<WriteBufferManagerRef>) -> AllocTracker {
        AllocTracker {
            write_buffer_manager,
            bytes_allocated: AtomicUsize::new(0),
            is_done_allocating: AtomicBool::new(false),
        }
    }

    /// Tracks that `bytes` bytes of memory have been allocated.
    pub(crate) fn on_allocation(&self, bytes: usize) {
        self.bytes_allocated.fetch_add(bytes, Ordering::Relaxed);
        WRITE_BUFFER_BYTES.add(bytes as i64);
        if let Some(write_buffer_manager) = &self.write_buffer_manager {
            write_buffer_manager.reserve_mem(bytes);
        }
    }

    /// Marks that we have finished allocating memory so it can be freed from
    /// the write buffer's limit.
    ///
    /// The region MUST ensure that it calls this method inside the region writer's write lock.
    pub(crate) fn done_allocating(&self) {
        if let Some(write_buffer_manager) = &self.write_buffer_manager
            && self
                .is_done_allocating
                .compare_exchange(false, true, Ordering::Relaxed, Ordering::Relaxed)
                .is_ok()
        {
            write_buffer_manager.schedule_free_mem(self.bytes_allocated.load(Ordering::Relaxed));
        }
    }

    /// Returns bytes allocated.
    pub(crate) fn bytes_allocated(&self) -> usize {
        self.bytes_allocated.load(Ordering::Relaxed)
    }

    /// Returns the write buffer manager.
    pub(crate) fn write_buffer_manager(&self) -> Option<WriteBufferManagerRef> {
        self.write_buffer_manager.clone()
    }
}

impl Drop for AllocTracker {
    fn drop(&mut self) {
        if !self.is_done_allocating.load(Ordering::Relaxed) {
            self.done_allocating();
        }

        let bytes_allocated = self.bytes_allocated.load(Ordering::Relaxed);
        WRITE_BUFFER_BYTES.sub(bytes_allocated as i64);

        // Memory tracked by this tracker is freed.
        if let Some(write_buffer_manager) = &self.write_buffer_manager {
            write_buffer_manager.free_mem(bytes_allocated);
        }
    }
}

/// Provider of memtable builders for regions.
#[derive(Clone)]
pub(crate) struct MemtableBuilderProvider {
    write_buffer_manager: Option<WriteBufferManagerRef>,
    config: Arc<MitoConfig>,
    compact_dispatcher: Arc<CompactDispatcher>,
}

impl MemtableBuilderProvider {
    pub(crate) fn new(
        write_buffer_manager: Option<WriteBufferManagerRef>,
        config: Arc<MitoConfig>,
    ) -> Self {
        let compact_dispatcher =
            Arc::new(CompactDispatcher::new(config.max_background_compactions));

        Self {
            write_buffer_manager,
            config,
            compact_dispatcher,
        }
    }

    pub(crate) fn builder_for_options(&self, options: &RegionOptions) -> MemtableBuilderRef {
        let dedup = options.need_dedup();
        let merge_mode = options.merge_mode();
        let flat_format = options
            .sst_format
            .map(|format| format == FormatType::Flat)
            .unwrap_or(self.config.default_experimental_flat_format);
        if flat_format {
            if options.memtable.is_some() {
                common_telemetry::info!(
                    "Overriding memtable config, use BulkMemtable under flat format"
                );
            }

            return Arc::new(
                BulkMemtableBuilder::new(
                    self.write_buffer_manager.clone(),
                    !dedup, // append_mode: true if not dedup, false if dedup
                    merge_mode,
                )
                .with_compact_dispatcher(self.compact_dispatcher.clone()),
            );
        }

        // The format is not flat.
        match &options.memtable {
            Some(MemtableOptions::TimeSeries) => Arc::new(TimeSeriesMemtableBuilder::new(
                self.write_buffer_manager.clone(),
                dedup,
                merge_mode,
            )),
            Some(MemtableOptions::PartitionTree(opts)) => {
                Arc::new(PartitionTreeMemtableBuilder::new(
                    PartitionTreeConfig {
                        index_max_keys_per_shard: opts.index_max_keys_per_shard,
                        data_freeze_threshold: opts.data_freeze_threshold,
                        fork_dictionary_bytes: opts.fork_dictionary_bytes,
                        dedup,
                        merge_mode,
                    },
                    self.write_buffer_manager.clone(),
                ))
            }
            None => self.default_primary_key_memtable_builder(dedup, merge_mode),
        }
    }

    fn default_primary_key_memtable_builder(
        &self,
        dedup: bool,
        merge_mode: MergeMode,
    ) -> MemtableBuilderRef {
        match &self.config.memtable {
            MemtableConfig::PartitionTree(config) => {
                let mut config = config.clone();
                config.dedup = dedup;
                Arc::new(PartitionTreeMemtableBuilder::new(
                    config,
                    self.write_buffer_manager.clone(),
                ))
            }
            MemtableConfig::TimeSeries => Arc::new(TimeSeriesMemtableBuilder::new(
                self.write_buffer_manager.clone(),
                dedup,
                merge_mode,
            )),
        }
    }
}

/// Metrics for scanning a memtable.
#[derive(Clone, Default)]
pub struct MemScanMetrics(Arc<Mutex<MemScanMetricsData>>);

impl MemScanMetrics {
    /// Merges the metrics.
    pub(crate) fn merge_inner(&self, inner: &MemScanMetricsData) {
        let mut metrics = self.0.lock().unwrap();
        metrics.total_series += inner.total_series;
        metrics.num_rows += inner.num_rows;
        metrics.num_batches += inner.num_batches;
        metrics.scan_cost += inner.scan_cost;
    }

    /// Gets the metrics data.
    pub(crate) fn data(&self) -> MemScanMetricsData {
        self.0.lock().unwrap().clone()
    }
}

#[derive(Clone, Default)]
pub(crate) struct MemScanMetricsData {
    /// Total series in the memtable.
    pub(crate) total_series: usize,
    /// Number of rows read.
    pub(crate) num_rows: usize,
    /// Number of batches read.
    pub(crate) num_batches: usize,
    /// Duration to scan the memtable.
    pub(crate) scan_cost: Duration,
}

/// Encoded range in the memtable.
pub struct EncodedRange {
    /// Encoded file data.
    pub data: Bytes,
    /// Metadata of the encoded range.
    pub sst_info: SstInfo,
}

/// Builder that creates an iterator to read a range.
/// The builder should know the projection and the predicate to build the iterator.
pub trait IterBuilder: Send + Sync {
    /// Returns the iterator to read the range.
    fn build(&self, metrics: Option<MemScanMetrics>) -> Result<BoxedBatchIterator>;

    /// Returns whether the iterator is a record batch iterator.
    fn is_record_batch(&self) -> bool {
        false
    }

    /// Returns the record batch iterator to read the range.
    fn build_record_batch(
        &self,
        metrics: Option<MemScanMetrics>,
    ) -> Result<BoxedRecordBatchIterator> {
        let _metrics = metrics;
        UnsupportedOperationSnafu {
            err_msg: "Record batch iterator is not supported by this memtable",
        }
        .fail()
    }

    /// Returns the [EncodedRange] if the range is already encoded into SST.
    fn encoded_range(&self) -> Option<EncodedRange> {
        None
    }
}

pub type BoxedIterBuilder = Box<dyn IterBuilder>;

/// Context shared by ranges of the same memtable.
pub struct MemtableRangeContext {
    /// Id of the memtable.
    id: MemtableId,
    /// Iterator builder.
    builder: BoxedIterBuilder,
    /// All filters.
    predicate: PredicateGroup,
}

pub type MemtableRangeContextRef = Arc<MemtableRangeContext>;

impl MemtableRangeContext {
    /// Creates a new [MemtableRangeContext].
    pub fn new(id: MemtableId, builder: BoxedIterBuilder, predicate: PredicateGroup) -> Self {
        Self {
            id,
            builder,
            predicate,
        }
    }
}

/// A range in the memtable.
#[derive(Clone)]
pub struct MemtableRange {
    /// Shared context.
    context: MemtableRangeContextRef,
    /// Number of rows in current memtable range.
    // todo(hl): use [MemtableRangeStats] instead.
    num_rows: usize,
}

impl MemtableRange {
    /// Creates a new range from context.
    pub fn new(context: MemtableRangeContextRef, num_rows: usize) -> Self {
        Self { context, num_rows }
    }

    /// Returns the id of the memtable to read.
    pub fn id(&self) -> MemtableId {
        self.context.id
    }

    /// Builds an iterator to read the range.
    /// Filters the result by the given time range; this ensures the memtable won't return
    /// rows outside the time range when new rows are inserted.
    pub fn build_prune_iter(
        &self,
        time_range: FileTimeRange,
        metrics: Option<MemScanMetrics>,
    ) -> Result<BoxedBatchIterator> {
        let iter = self.context.builder.build(metrics)?;
        let time_filters = self.context.predicate.time_filters();
        Ok(Box::new(PruneTimeIterator::new(
            iter,
            time_range,
            time_filters,
        )))
    }

    /// Builds an iterator to read all rows in range.
    pub fn build_iter(&self) -> Result<BoxedBatchIterator> {
        self.context.builder.build(None)
    }

    /// Builds a record batch iterator to read all rows in range.
    ///
    /// This method doesn't take an optional time range because a bulk part is immutable,
    /// so we don't need to filter out rows outside the time range.
    pub fn build_record_batch_iter(
        &self,
        metrics: Option<MemScanMetrics>,
    ) -> Result<BoxedRecordBatchIterator> {
        self.context.builder.build_record_batch(metrics)
    }

    /// Returns whether the iterator is a record batch iterator.
    pub fn is_record_batch(&self) -> bool {
        self.context.builder.is_record_batch()
    }

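    /// Returns the number of rows in this range.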
    pub fn num_rows(&self) -> usize {
        self.num_rows
    }

    /// Returns the encoded range if available.
    pub fn encoded(&self) -> Option<EncodedRange> {
        self.context.builder.encoded_range()
    }
}

#[cfg(test)]
mod tests {
    use common_base::readable_size::ReadableSize;

    use super::*;
    use crate::flush::{WriteBufferManager, WriteBufferManagerImpl};

    #[test]
    fn test_deserialize_memtable_config() {
        let s = r#"
type = "partition_tree"
index_max_keys_per_shard = 8192
data_freeze_threshold = 1024
dedup = true
fork_dictionary_bytes = "512MiB"
"#;
        let config: MemtableConfig = toml::from_str(s).unwrap();
        let MemtableConfig::PartitionTree(memtable_config) = config else {
            unreachable!()
        };
        assert!(memtable_config.dedup);
        assert_eq!(8192, memtable_config.index_max_keys_per_shard);
        assert_eq!(1024, memtable_config.data_freeze_threshold);
        assert_eq!(ReadableSize::mb(512), memtable_config.fork_dictionary_bytes);
    }

    #[test]
    fn test_alloc_tracker_without_manager() {
        let tracker = AllocTracker::new(None);
        assert_eq!(0, tracker.bytes_allocated());
        tracker.on_allocation(100);
        assert_eq!(100, tracker.bytes_allocated());
        tracker.on_allocation(200);
        assert_eq!(300, tracker.bytes_allocated());

        tracker.done_allocating();
        assert_eq!(300, tracker.bytes_allocated());
    }

    #[test]
    fn test_alloc_tracker_with_manager() {
        let manager = Arc::new(WriteBufferManagerImpl::new(1000));
        {
            let tracker = AllocTracker::new(Some(manager.clone() as WriteBufferManagerRef));

            tracker.on_allocation(100);
            assert_eq!(100, tracker.bytes_allocated());
            assert_eq!(100, manager.memory_usage());
            assert_eq!(100, manager.mutable_usage());

            for _ in 0..2 {
                // Done allocating won't free the same memory multiple times.
                tracker.done_allocating();
                assert_eq!(100, manager.memory_usage());
                assert_eq!(0, manager.mutable_usage());
            }
        }

        assert_eq!(0, manager.memory_usage());
        assert_eq!(0, manager.mutable_usage());
    }

    #[test]
    fn test_alloc_tracker_without_done_allocating() {
        let manager = Arc::new(WriteBufferManagerImpl::new(1000));
        {
            let tracker = AllocTracker::new(Some(manager.clone() as WriteBufferManagerRef));

            tracker.on_allocation(100);
            assert_eq!(100, tracker.bytes_allocated());
            assert_eq!(100, manager.memory_usage());
            assert_eq!(100, manager.mutable_usage());
        }

        assert_eq!(0, manager.memory_usage());
        assert_eq!(0, manager.mutable_usage());
    }
}
713}