meta_srv/gc/
options.rs

1// Copyright 2023 Greptime Team
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15use std::time::Duration;
16
17use serde::{Deserialize, Serialize};
18use snafu::ensure;
19
20use crate::error::{self, Result};
21
/// Period of the GC ticker: five minutes.
// NOTE(review): `unused` is presumably silenced because nothing references this
// constant in some builds — confirm, and drop the attribute once it is used.
#[allow(unused)]
pub(crate) const TICKER_INTERVAL: Duration = Duration::from_secs(5 * 60);
25
26/// Configuration for GC operations.
27///
28/// TODO(discord9): not expose most config to users for now, until GC scheduler is fully stable.
29#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
30#[serde(default)]
31pub struct GcSchedulerOptions {
32    /// Whether GC is enabled. Default to false.
33    /// If set to false, no GC will be performed, and potentially some
34    /// files from datanodes will never be deleted.
35    pub enable: bool,
36    /// Maximum number of tables to process concurrently.
37    pub max_concurrent_tables: usize,
38    /// Maximum number of retries per region when GC fails.
39    pub max_retries_per_region: usize,
40    /// Concurrency for region GC within a table.
41    pub region_gc_concurrency: usize,
42    /// Backoff duration between retries.
43    #[serde(with = "humantime_serde")]
44    pub retry_backoff_duration: Duration,
45    /// Minimum region size threshold for GC (in bytes).
46    pub min_region_size_threshold: u64,
47    /// Weight for SST file count in GC scoring.
48    pub sst_count_weight: f64,
49    /// Weight for file removal rate in GC scoring.
50    pub file_removed_count_weight: f64,
51    /// Cooldown period between GC operations on the same region.
52    #[serde(with = "humantime_serde")]
53    pub gc_cooldown_period: Duration,
54    /// Maximum number of regions to select for GC per table.
55    pub regions_per_table_threshold: usize,
56    /// Timeout duration for mailbox communication with datanodes.
57    #[serde(with = "humantime_serde")]
58    pub mailbox_timeout: Duration,
59    /// Interval for performing full file listing during GC to find orphan files.
60    /// Full file listing is expensive but necessary to clean up orphan files.
61    /// Set to a larger value (e.g., 24 hours) to balance performance and cleanup.
62    /// Every Nth GC cycle will use full file listing, where N = full_file_listing_interval / TICKER_INTERVAL.
63    #[serde(with = "humantime_serde")]
64    pub full_file_listing_interval: Duration,
65    /// Interval for cleaning up stale region entries from the GC tracker.
66    /// This removes entries for regions that no longer exist (e.g., after table drops).
67    /// Set to a larger value (e.g., 6 hours) since this is just for memory cleanup.
68    #[serde(with = "humantime_serde")]
69    pub tracker_cleanup_interval: Duration,
70}
71
72impl Default for GcSchedulerOptions {
73    fn default() -> Self {
74        Self {
75            enable: false,
76            max_concurrent_tables: 10,
77            max_retries_per_region: 3,
78            retry_backoff_duration: Duration::from_secs(5),
79            region_gc_concurrency: 16,
80            min_region_size_threshold: 100 * 1024 * 1024, // 100MB
81            sst_count_weight: 0.5, // more sst means could potentially remove more files, moderate priority
82            file_removed_count_weight: 1.0, // more file to be deleted, higher priority
83            gc_cooldown_period: Duration::from_secs(60 * 5), // 5 minutes
84            regions_per_table_threshold: 20, // Select top 20 regions per table
85            mailbox_timeout: Duration::from_secs(60), // 60 seconds
86            // Perform full file listing every 24 hours to find orphan files
87            full_file_listing_interval: Duration::from_secs(60 * 60 * 24),
88            // Clean up stale tracker entries every 6 hours
89            tracker_cleanup_interval: Duration::from_secs(60 * 60 * 6),
90        }
91    }
92}
93
94impl GcSchedulerOptions {
95    /// Validates the configuration options.
96    pub fn validate(&self) -> Result<()> {
97        ensure!(
98            self.max_concurrent_tables > 0,
99            error::InvalidArgumentsSnafu {
100                err_msg: "max_concurrent_tables must be greater than 0",
101            }
102        );
103
104        ensure!(
105            self.max_retries_per_region > 0,
106            error::InvalidArgumentsSnafu {
107                err_msg: "max_retries_per_region must be greater than 0",
108            }
109        );
110
111        ensure!(
112            self.region_gc_concurrency > 0,
113            error::InvalidArgumentsSnafu {
114                err_msg: "region_gc_concurrency must be greater than 0",
115            }
116        );
117
118        ensure!(
119            !self.retry_backoff_duration.is_zero(),
120            error::InvalidArgumentsSnafu {
121                err_msg: "retry_backoff_duration must be greater than 0",
122            }
123        );
124
125        ensure!(
126            self.sst_count_weight >= 0.0,
127            error::InvalidArgumentsSnafu {
128                err_msg: "sst_count_weight must be non-negative",
129            }
130        );
131
132        ensure!(
133            self.file_removed_count_weight >= 0.0,
134            error::InvalidArgumentsSnafu {
135                err_msg: "file_removal_rate_weight must be non-negative",
136            }
137        );
138
139        ensure!(
140            !self.gc_cooldown_period.is_zero(),
141            error::InvalidArgumentsSnafu {
142                err_msg: "gc_cooldown_period must be greater than 0",
143            }
144        );
145
146        ensure!(
147            self.regions_per_table_threshold > 0,
148            error::InvalidArgumentsSnafu {
149                err_msg: "regions_per_table_threshold must be greater than 0",
150            }
151        );
152
153        ensure!(
154            !self.mailbox_timeout.is_zero(),
155            error::InvalidArgumentsSnafu {
156                err_msg: "mailbox_timeout must be greater than 0",
157            }
158        );
159
160        ensure!(
161            !self.full_file_listing_interval.is_zero(),
162            error::InvalidArgumentsSnafu {
163                err_msg: "full_file_listing_interval must be greater than 0",
164            }
165        );
166
167        ensure!(
168            !self.tracker_cleanup_interval.is_zero(),
169            error::InvalidArgumentsSnafu {
170                err_msg: "tracker_cleanup_interval must be greater than 0",
171            }
172        );
173
174        Ok(())
175    }
176}