meta_srv/
metasrv.rs

1// Copyright 2023 Greptime Team
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15pub mod builder;
16
17use std::fmt::{self, Display};
18use std::sync::atomic::{AtomicBool, Ordering};
19use std::sync::{Arc, Mutex, RwLock};
20use std::time::Duration;
21
22use clap::ValueEnum;
23use common_base::Plugins;
24use common_base::readable_size::ReadableSize;
25use common_config::{Configurable, DEFAULT_DATA_HOME};
26use common_event_recorder::EventRecorderOptions;
27use common_greptimedb_telemetry::GreptimeDBTelemetryTask;
28use common_meta::cache_invalidator::CacheInvalidatorRef;
29use common_meta::ddl_manager::DdlManagerRef;
30use common_meta::distributed_time_constants;
31use common_meta::key::TableMetadataManagerRef;
32use common_meta::key::runtime_switch::RuntimeSwitchManagerRef;
33use common_meta::kv_backend::{KvBackendRef, ResettableKvBackend, ResettableKvBackendRef};
34use common_meta::leadership_notifier::{
35    LeadershipChangeNotifier, LeadershipChangeNotifierCustomizerRef,
36};
37use common_meta::node_expiry_listener::NodeExpiryListener;
38use common_meta::peer::{Peer, PeerDiscoveryRef};
39use common_meta::reconciliation::manager::ReconciliationManagerRef;
40use common_meta::region_keeper::MemoryRegionKeeperRef;
41use common_meta::region_registry::LeaderRegionRegistryRef;
42use common_meta::sequence::SequenceRef;
43use common_meta::stats::topic::TopicStatsRegistryRef;
44use common_meta::wal_options_allocator::WalOptionsAllocatorRef;
45use common_options::datanode::DatanodeClientOptions;
46use common_options::memory::MemoryOptions;
47use common_procedure::ProcedureManagerRef;
48use common_procedure::options::ProcedureConfig;
49use common_stat::ResourceStatRef;
50use common_telemetry::logging::{LoggingOptions, TracingOptions};
51use common_telemetry::{error, info, warn};
52use common_wal::config::MetasrvWalConfig;
53use serde::{Deserialize, Serialize};
54use servers::export_metrics::ExportMetricsOption;
55use servers::grpc::GrpcOptions;
56use servers::http::HttpOptions;
57use servers::tls::TlsOption;
58use snafu::{OptionExt, ResultExt};
59use store_api::storage::RegionId;
60use tokio::sync::broadcast::error::RecvError;
61
62use crate::cluster::MetaPeerClientRef;
63use crate::discovery;
64use crate::election::{Election, LeaderChangeMessage};
65use crate::error::{
66    self, InitMetadataSnafu, KvBackendSnafu, Result, StartProcedureManagerSnafu,
67    StartTelemetryTaskSnafu, StopProcedureManagerSnafu,
68};
69use crate::failure_detector::PhiAccrualFailureDetectorOptions;
70use crate::handler::{HeartbeatHandlerGroupBuilder, HeartbeatHandlerGroupRef};
71use crate::procedure::ProcedureManagerListenerAdapter;
72use crate::procedure::region_migration::manager::RegionMigrationManagerRef;
73use crate::procedure::wal_prune::manager::WalPruneTickerRef;
74use crate::pubsub::{PublisherRef, SubscriptionManagerRef};
75use crate::region::flush_trigger::RegionFlushTickerRef;
76use crate::region::supervisor::RegionSupervisorTickerRef;
77use crate::selector::{RegionStatAwareSelector, Selector, SelectorType};
78use crate::service::mailbox::MailboxRef;
79use crate::service::store::cached_kv::LeaderCachedKvBackend;
80use crate::state::{StateRef, become_follower, become_leader};
81
/// The identifier `"table_id"`.
/// NOTE(review): presumably the name of the sequence that allocates table ids — confirm at the use site.
pub const TABLE_ID_SEQ: &str = "table_id";
/// The identifier `"flow_id"`.
/// NOTE(review): presumably the name of the sequence that allocates flow ids — confirm at the use site.
pub const FLOW_ID_SEQ: &str = "flow_id";
/// The directory name `"metasrv"`.
/// NOTE(review): presumably the metasrv sub-directory under the data home — confirm at the use site.
pub const METASRV_DATA_DIR: &str = "metasrv";
85
// The datastores that implement the metadata kv backend.
//
// Values are (de)serialized in snake_case, and `EtcdStore` is the default.
//
// NOTE(review): the comments here are plain `//` comments. Turning them into
// `///` doc comments would make clap's `ValueEnum` derive expose them as help
// text — confirm that is desired before converting.
#[derive(Clone, Debug, PartialEq, Serialize, Default, Deserialize, ValueEnum)]
#[serde(rename_all = "snake_case")]
pub enum BackendImpl {
    // Etcd as metadata storage.
    #[default]
    EtcdStore,
    // In memory metadata storage - mostly used for testing.
    MemoryStore,
    // Postgres as metadata storage (only with the `pg_kvbackend` feature).
    #[cfg(feature = "pg_kvbackend")]
    PostgresStore,
    // MySQL as metadata storage (only with the `mysql_kvbackend` feature).
    #[cfg(feature = "mysql_kvbackend")]
    MysqlStore,
}
102
/// Configuration options for the stats persistence.
///
/// Both durations are (de)serialized through `humantime_serde`.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct StatsPersistenceOptions {
    /// TTL for the stats table that will be used to store the stats.
    #[serde(with = "humantime_serde")]
    pub ttl: Duration,
    /// The interval to persist the stats.
    #[serde(with = "humantime_serde")]
    pub interval: Duration,
}
113
114impl Default for StatsPersistenceOptions {
115    fn default() -> Self {
116        Self {
117            ttl: Duration::ZERO,
118            interval: Duration::from_mins(10),
119        }
120    }
121}
122
/// The options of a metasrv instance.
///
/// Because of `#[serde(default)]`, any field missing from the input falls
/// back to the value provided by the `Default` implementation.
#[derive(Clone, PartialEq, Serialize, Deserialize)]
#[serde(default)]
pub struct MetasrvOptions {
    /// The address the server listens on.
    #[deprecated(note = "Use grpc.bind_addr instead")]
    pub bind_addr: String,
    /// The address the server advertises to the clients.
    #[deprecated(note = "Use grpc.server_addr instead")]
    pub server_addr: String,
    /// The address of the store, e.g., etcd.
    pub store_addrs: Vec<String>,
    /// TLS configuration for kv store backend (PostgreSQL/MySQL)
    /// Only applicable when using PostgreSQL or MySQL as the metadata store
    #[serde(default)]
    pub backend_tls: Option<TlsOption>,
    /// The type of selector.
    pub selector: SelectorType,
    /// Whether to use the memory store.
    pub use_memory_store: bool,
    /// Whether to enable region failover.
    pub enable_region_failover: bool,
    /// The delay before starting region failure detection.
    /// This delay helps prevent Metasrv from triggering unnecessary region failovers before all Datanodes are fully started.
    /// Especially useful when the cluster is not deployed with GreptimeDB Operator and maintenance mode is not enabled.
    #[serde(with = "humantime_serde")]
    pub region_failure_detector_initialization_delay: Duration,
    /// Whether to allow region failover on local WAL.
    ///
    /// If it's true, the region failover will be allowed even if the local WAL is used.
    /// Note that this option is not recommended to be set to true, because it may lead to data loss during failover.
    pub allow_region_failover_on_local_wal: bool,
    /// The gRPC server options.
    pub grpc: GrpcOptions,
    /// The HTTP server options.
    pub http: HttpOptions,
    /// The logging options.
    pub logging: LoggingOptions,
    /// The procedure options.
    pub procedure: ProcedureConfig,
    /// The failure detector options.
    pub failure_detector: PhiAccrualFailureDetectorOptions,
    /// The datanode options.
    pub datanode: DatanodeClientOptions,
    /// Whether to enable telemetry.
    pub enable_telemetry: bool,
    /// The data home directory.
    pub data_home: String,
    /// The WAL options.
    pub wal: MetasrvWalConfig,
    /// The metrics export options.
    pub export_metrics: ExportMetricsOption,
    /// The store key prefix. If it is not empty, all keys in the store will be prefixed with it.
    /// This is useful when multiple metasrv clusters share the same store.
    pub store_key_prefix: String,
    /// The max operations per txn
    ///
    /// This value is usually limited by which store is used for the `KvBackend`.
    /// For example, if using etcd, this value should ensure that it is less than
    /// or equal to the `--max-txn-ops` option value of etcd.
    ///
    /// TODO(jeremy): Currently, this option only affects the etcd store, but it may
    /// also affect other stores in the future. In other words, each store needs to
    /// limit the number of operations in a txn because an infinitely large txn could
    /// potentially block other operations.
    pub max_txn_ops: usize,
    /// The factor that determines how often statistics should be flushed,
    /// based on the number of received heartbeats. When the number of heartbeats
    /// reaches this factor, a flush operation is triggered.
    pub flush_stats_factor: usize,
    /// The tracing options.
    pub tracing: TracingOptions,
    /// The memory options.
    pub memory: MemoryOptions,
    /// The datastore for kv metadata.
    pub backend: BackendImpl,
    #[cfg(any(feature = "pg_kvbackend", feature = "mysql_kvbackend"))]
    /// Table name of rds kv backend.
    pub meta_table_name: String,
    #[cfg(feature = "pg_kvbackend")]
    /// Lock id for meta kv election. Only takes effect when using pg_kvbackend.
    pub meta_election_lock_id: u64,
    #[cfg(feature = "pg_kvbackend")]
    /// Optional PostgreSQL schema for metadata table (defaults to current search_path if empty).
    pub meta_schema_name: Option<String>,
    /// The max idle time of a node; it is handed to the `NodeExpiryListener`
    /// when the metasrv starts with an election service.
    /// NOTE(review): presumably nodes idle for longer than this are expired — confirm in `NodeExpiryListener`.
    #[serde(with = "humantime_serde")]
    pub node_max_idle_time: Duration,
    /// The event recorder options.
    pub event_recorder: EventRecorderOptions,
    /// The stats persistence options.
    pub stats_persistence: StatsPersistenceOptions,
}
213
214impl fmt::Debug for MetasrvOptions {
215    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
216        let mut debug_struct = f.debug_struct("MetasrvOptions");
217        debug_struct
218            .field("store_addrs", &self.sanitize_store_addrs())
219            .field("backend_tls", &self.backend_tls)
220            .field("selector", &self.selector)
221            .field("use_memory_store", &self.use_memory_store)
222            .field("enable_region_failover", &self.enable_region_failover)
223            .field(
224                "allow_region_failover_on_local_wal",
225                &self.allow_region_failover_on_local_wal,
226            )
227            .field("grpc", &self.grpc)
228            .field("http", &self.http)
229            .field("logging", &self.logging)
230            .field("procedure", &self.procedure)
231            .field("failure_detector", &self.failure_detector)
232            .field("datanode", &self.datanode)
233            .field("enable_telemetry", &self.enable_telemetry)
234            .field("data_home", &self.data_home)
235            .field("wal", &self.wal)
236            .field("export_metrics", &self.export_metrics)
237            .field("store_key_prefix", &self.store_key_prefix)
238            .field("max_txn_ops", &self.max_txn_ops)
239            .field("flush_stats_factor", &self.flush_stats_factor)
240            .field("tracing", &self.tracing)
241            .field("backend", &self.backend)
242            .field("event_recorder", &self.event_recorder)
243            .field("stats_persistence", &self.stats_persistence);
244
245        #[cfg(any(feature = "pg_kvbackend", feature = "mysql_kvbackend"))]
246        debug_struct.field("meta_table_name", &self.meta_table_name);
247
248        #[cfg(feature = "pg_kvbackend")]
249        debug_struct.field("meta_election_lock_id", &self.meta_election_lock_id);
250        #[cfg(feature = "pg_kvbackend")]
251        debug_struct.field("meta_schema_name", &self.meta_schema_name);
252
253        debug_struct
254            .field("node_max_idle_time", &self.node_max_idle_time)
255            .finish()
256    }
257}
258
/// The port used to build the default gRPC bind address of the metasrv.
const DEFAULT_METASRV_ADDR_PORT: &str = "3002";
260
261impl Default for MetasrvOptions {
262    fn default() -> Self {
263        Self {
264            #[allow(deprecated)]
265            bind_addr: String::new(),
266            #[allow(deprecated)]
267            server_addr: String::new(),
268            store_addrs: vec!["127.0.0.1:2379".to_string()],
269            backend_tls: None,
270            selector: SelectorType::default(),
271            use_memory_store: false,
272            enable_region_failover: false,
273            region_failure_detector_initialization_delay: Duration::from_secs(10 * 60),
274            allow_region_failover_on_local_wal: false,
275            grpc: GrpcOptions {
276                bind_addr: format!("127.0.0.1:{}", DEFAULT_METASRV_ADDR_PORT),
277                ..Default::default()
278            },
279            http: HttpOptions::default(),
280            logging: LoggingOptions::default(),
281            procedure: ProcedureConfig {
282                max_retry_times: 12,
283                retry_delay: Duration::from_millis(500),
284                // The etcd the maximum size of any request is 1.5 MiB
285                // 1500KiB = 1536KiB (1.5MiB) - 36KiB (reserved size of key)
286                max_metadata_value_size: Some(ReadableSize::kb(1500)),
287                max_running_procedures: 128,
288            },
289            failure_detector: PhiAccrualFailureDetectorOptions::default(),
290            datanode: DatanodeClientOptions::default(),
291            enable_telemetry: true,
292            data_home: DEFAULT_DATA_HOME.to_string(),
293            wal: MetasrvWalConfig::default(),
294            export_metrics: ExportMetricsOption::default(),
295            store_key_prefix: String::new(),
296            max_txn_ops: 128,
297            flush_stats_factor: 3,
298            tracing: TracingOptions::default(),
299            memory: MemoryOptions::default(),
300            backend: BackendImpl::EtcdStore,
301            #[cfg(any(feature = "pg_kvbackend", feature = "mysql_kvbackend"))]
302            meta_table_name: common_meta::kv_backend::DEFAULT_META_TABLE_NAME.to_string(),
303            #[cfg(feature = "pg_kvbackend")]
304            meta_election_lock_id: common_meta::kv_backend::DEFAULT_META_ELECTION_LOCK_ID,
305            #[cfg(feature = "pg_kvbackend")]
306            meta_schema_name: None,
307            node_max_idle_time: Duration::from_secs(24 * 60 * 60),
308            event_recorder: EventRecorderOptions::default(),
309            stats_persistence: StatsPersistenceOptions::default(),
310        }
311    }
312}
313
314impl Configurable for MetasrvOptions {
315    fn env_list_keys() -> Option<&'static [&'static str]> {
316        Some(&["wal.broker_endpoints", "store_addrs"])
317    }
318}
319
320impl MetasrvOptions {
321    fn sanitize_store_addrs(&self) -> Vec<String> {
322        self.store_addrs
323            .iter()
324            .map(|addr| common_meta::kv_backend::util::sanitize_connection_string(addr))
325            .collect()
326    }
327}
328
/// Basic information about a metasrv instance.
pub struct MetasrvInfo {
    /// The server address of the metasrv.
    pub server_addr: String,
}
/// A cloneable bundle of metasrv handles, created by `Metasrv::new_ctx`.
#[derive(Clone)]
pub struct Context {
    /// The gRPC server address taken from the metasrv options.
    pub server_addr: String,
    /// The resettable in-memory kv backend.
    pub in_memory: ResettableKvBackendRef,
    /// The metadata kv backend.
    pub kv_backend: KvBackendRef,
    /// The leader cached kv backend.
    pub leader_cached_kv_backend: ResettableKvBackendRef,
    pub meta_peer_client: MetaPeerClientRef,
    pub mailbox: MailboxRef,
    /// The election service, if one is configured.
    pub election: Option<ElectionRef>,
    // `Metasrv::new_ctx` always initializes this to `false`.
    // NOTE(review): the meaning of "infancy" is not visible in this file — confirm with the handlers that set it.
    pub is_infancy: bool,
    pub table_metadata_manager: TableMetadataManagerRef,
    pub cache_invalidator: CacheInvalidatorRef,
    pub leader_region_registry: LeaderRegionRegistryRef,
    pub topic_stats_registry: TopicStatsRegistryRef,
}
347
348impl Context {
349    pub fn reset_in_memory(&self) {
350        self.in_memory.reset();
351        self.leader_region_registry.reset();
352    }
353}
354
/// The value of the leader. It is used to store the leader's address.
pub struct LeaderValue(pub String);

impl<T: AsRef<[u8]>> From<T> for LeaderValue {
    /// Builds a [`LeaderValue`] from anything viewable as bytes. The bytes are
    /// decoded as UTF-8 lossily, so invalid sequences become U+FFFD instead of
    /// causing an error.
    fn from(value: T) -> Self {
        let bytes = value.as_ref();
        Self(String::from_utf8_lossy(bytes).into_owned())
    }
}
364
365#[derive(Debug, Clone, Serialize, Deserialize)]
366pub struct MetasrvNodeInfo {
367    // The metasrv's address
368    pub addr: String,
369    // The node build version
370    pub version: String,
371    // The node build git commit hash
372    pub git_commit: String,
373    // The node start timestamp in milliseconds
374    pub start_time_ms: u64,
375    // The node total cpu millicores
376    #[serde(default)]
377    pub total_cpu_millicores: i64,
378    #[serde(default)]
379    // The node total memory bytes
380    pub total_memory_bytes: i64,
381    /// The node build cpu usage millicores
382    pub cpu_usage_millicores: i64,
383    /// The node build memory usage bytes
384    pub memory_usage_bytes: i64,
385    // The node hostname
386    #[serde(default)]
387    pub hostname: String,
388}
389
390// TODO(zyy17): Allow deprecated fields for backward compatibility. Remove this when the deprecated top-level fields are removed from the proto.
391#[allow(deprecated)]
392impl From<MetasrvNodeInfo> for api::v1::meta::MetasrvNodeInfo {
393    fn from(node_info: MetasrvNodeInfo) -> Self {
394        Self {
395            peer: Some(api::v1::meta::Peer {
396                addr: node_info.addr,
397                ..Default::default()
398            }),
399            // TODO(zyy17): The following top-level fields are deprecated. They are kept for backward compatibility and will be removed in a future version.
400            // New code should use the fields in `info.NodeInfo` instead.
401            version: node_info.version.clone(),
402            git_commit: node_info.git_commit.clone(),
403            start_time_ms: node_info.start_time_ms,
404            cpus: node_info.total_cpu_millicores as u32,
405            memory_bytes: node_info.total_memory_bytes as u64,
406            // The canonical location for node information.
407            info: Some(api::v1::meta::NodeInfo {
408                version: node_info.version,
409                git_commit: node_info.git_commit,
410                start_time_ms: node_info.start_time_ms,
411                total_cpu_millicores: node_info.total_cpu_millicores,
412                total_memory_bytes: node_info.total_memory_bytes,
413                cpu_usage_millicores: node_info.cpu_usage_millicores,
414                memory_usage_bytes: node_info.memory_usage_bytes,
415                cpus: node_info.total_cpu_millicores as u32,
416                memory_bytes: node_info.total_memory_bytes as u64,
417                hostname: node_info.hostname,
418            }),
419        }
420    }
421}
422
/// The kind of node a selection targets.
#[derive(Clone, Copy)]
pub enum SelectTarget {
    Datanode,
    Flownode,
}

impl Display for SelectTarget {
    /// Writes the lowercase name of the target: `datanode` or `flownode`.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let name = match self {
            Self::Datanode => "datanode",
            Self::Flownode => "flownode",
        };
        f.write_str(name)
    }
}
437
/// The context type that `SelectorRef` and `RegionStatAwareSelectorRef`
/// selectors are parameterized with.
#[derive(Clone)]
pub struct SelectorContext {
    /// The peer discovery handle available to the selector.
    pub peer_discovery: PeerDiscoveryRef,
}
442
/// A shared selector that takes a [`SelectorContext`] and yields a list of peers.
pub type SelectorRef = Arc<dyn Selector<Context = SelectorContext, Output = Vec<Peer>>>;
/// A shared region-stat-aware selector that takes a [`SelectorContext`] and
/// yields `(RegionId, Peer)` pairs.
pub type RegionStatAwareSelectorRef =
    Arc<dyn RegionStatAwareSelector<Context = SelectorContext, Output = Vec<(RegionId, Peer)>>>;
/// A shared election service whose leader value is a [`LeaderValue`].
pub type ElectionRef = Arc<dyn Election<Leader = LeaderValue>>;
447
/// Reacts to leadership changes: it switches the metasrv state, loads the
/// leader cache, notifies the leadership listeners and toggles telemetry
/// reporting.
pub struct MetaStateHandler {
    /// When present, all subscriptions are dropped when leadership stops.
    subscribe_manager: Option<SubscriptionManagerRef>,
    /// Reporting is enabled on leader start and suspended on leader stop.
    greptimedb_telemetry_task: Arc<GreptimeDBTelemetryTask>,
    /// Loaded when this node becomes the leader.
    leader_cached_kv_backend: Arc<LeaderCachedKvBackend>,
    /// Fans leader start/stop events out to the registered listeners.
    leadership_change_notifier: LeadershipChangeNotifier,
    /// The shared metasrv state that is moved between leader and follower.
    state: StateRef,
}
455
456impl MetaStateHandler {
457    pub async fn on_leader_start(&self) {
458        self.state.write().unwrap().next_state(become_leader(false));
459
460        if let Err(e) = self.leader_cached_kv_backend.load().await {
461            error!(e; "Failed to load kv into leader cache kv store");
462        } else {
463            self.state.write().unwrap().next_state(become_leader(true));
464        }
465
466        self.leadership_change_notifier
467            .notify_on_leader_start()
468            .await;
469
470        self.greptimedb_telemetry_task.should_report(true);
471    }
472
473    pub async fn on_leader_stop(&self) {
474        self.state.write().unwrap().next_state(become_follower());
475
476        self.leadership_change_notifier
477            .notify_on_leader_stop()
478            .await;
479
480        // Suspends reporting.
481        self.greptimedb_telemetry_task.should_report(false);
482
483        if let Some(sub_manager) = self.subscribe_manager.clone() {
484            info!("Leader changed, un_subscribe all");
485            if let Err(e) = sub_manager.unsubscribe_all() {
486                error!(e; "Failed to un_subscribe all");
487            }
488        }
489    }
490}
491
/// The metasrv instance: it holds the options plus the handles to every
/// component that `try_start` wires together.
pub struct Metasrv {
    // The leader/follower state shared with the `MetaStateHandler`.
    state: StateRef,
    // Flipped to `true` by `try_start` and back to `false` by `shutdown`; the
    // background election tasks keep looping while it is `true`.
    started: Arc<AtomicBool>,
    start_time_ms: u64,
    options: MetasrvOptions,
    // It is only valid at the leader node and is used to temporarily
    // store some data that will not be persisted.
    in_memory: ResettableKvBackendRef,
    kv_backend: KvBackendRef,
    leader_cached_kv_backend: Arc<LeaderCachedKvBackend>,
    meta_peer_client: MetaPeerClientRef,
    // The selector is used to select a target datanode.
    selector: SelectorRef,
    selector_ctx: SelectorContext,
    // The flow selector is used to select a target flownode.
    flow_selector: SelectorRef,
    // Filled in by `try_start` from `handler_group_builder`.
    handler_group: RwLock<Option<HeartbeatHandlerGroupRef>>,
    // Taken (and therefore emptied) by `try_start`.
    handler_group_builder: Mutex<Option<HeartbeatHandlerGroupBuilder>>,
    // `None` means there is no election service.
    election: Option<ElectionRef>,
    procedure_manager: ProcedureManagerRef,
    mailbox: MailboxRef,
    ddl_manager: DdlManagerRef,
    wal_options_allocator: WalOptionsAllocatorRef,
    table_metadata_manager: TableMetadataManagerRef,
    runtime_switch_manager: RuntimeSwitchManagerRef,
    memory_region_keeper: MemoryRegionKeeperRef,
    greptimedb_telemetry_task: Arc<GreptimeDBTelemetryTask>,
    region_migration_manager: RegionMigrationManagerRef,
    // The optional tickers below are registered as leadership listeners by
    // `try_start` when they are present.
    region_supervisor_ticker: Option<RegionSupervisorTickerRef>,
    cache_invalidator: CacheInvalidatorRef,
    leader_region_registry: LeaderRegionRegistryRef,
    topic_stats_registry: TopicStatsRegistryRef,
    wal_prune_ticker: Option<WalPruneTickerRef>,
    region_flush_ticker: Option<RegionFlushTickerRef>,
    table_id_sequence: SequenceRef,
    reconciliation_manager: ReconciliationManagerRef,
    resource_stat: ResourceStatRef,

    // Source of the optional publisher, subscription manager and
    // leadership-notifier customizer.
    plugins: Plugins,
}
532
533impl Metasrv {
534    pub async fn try_start(&self) -> Result<()> {
535        if self
536            .started
537            .compare_exchange(false, true, Ordering::AcqRel, Ordering::Acquire)
538            .is_err()
539        {
540            warn!("Metasrv already started");
541            return Ok(());
542        }
543
544        let handler_group_builder =
545            self.handler_group_builder
546                .lock()
547                .unwrap()
548                .take()
549                .context(error::UnexpectedSnafu {
550                    violated: "expected heartbeat handler group builder",
551                })?;
552        *self.handler_group.write().unwrap() = Some(Arc::new(handler_group_builder.build()?));
553
554        // Creates default schema if not exists
555        self.table_metadata_manager
556            .init()
557            .await
558            .context(InitMetadataSnafu)?;
559
560        if let Some(election) = self.election() {
561            let procedure_manager = self.procedure_manager.clone();
562            let in_memory = self.in_memory.clone();
563            let leader_cached_kv_backend = self.leader_cached_kv_backend.clone();
564            let subscribe_manager = self.subscription_manager();
565            let mut rx = election.subscribe_leader_change();
566            let greptimedb_telemetry_task = self.greptimedb_telemetry_task.clone();
567            greptimedb_telemetry_task
568                .start()
569                .context(StartTelemetryTaskSnafu)?;
570
571            // Builds leadership change notifier.
572            let mut leadership_change_notifier = LeadershipChangeNotifier::default();
573            leadership_change_notifier.add_listener(self.wal_options_allocator.clone());
574            leadership_change_notifier
575                .add_listener(Arc::new(ProcedureManagerListenerAdapter(procedure_manager)));
576            leadership_change_notifier.add_listener(Arc::new(NodeExpiryListener::new(
577                self.options.node_max_idle_time,
578                self.in_memory.clone(),
579            )));
580            if let Some(region_supervisor_ticker) = &self.region_supervisor_ticker {
581                leadership_change_notifier.add_listener(region_supervisor_ticker.clone() as _);
582            }
583            if let Some(wal_prune_ticker) = &self.wal_prune_ticker {
584                leadership_change_notifier.add_listener(wal_prune_ticker.clone() as _);
585            }
586            if let Some(region_flush_trigger) = &self.region_flush_ticker {
587                leadership_change_notifier.add_listener(region_flush_trigger.clone() as _);
588            }
589            if let Some(customizer) = self.plugins.get::<LeadershipChangeNotifierCustomizerRef>() {
590                customizer.customize(&mut leadership_change_notifier);
591            }
592
593            let state_handler = MetaStateHandler {
594                greptimedb_telemetry_task,
595                subscribe_manager,
596                state: self.state.clone(),
597                leader_cached_kv_backend: leader_cached_kv_backend.clone(),
598                leadership_change_notifier,
599            };
600            let _handle = common_runtime::spawn_global(async move {
601                loop {
602                    match rx.recv().await {
603                        Ok(msg) => {
604                            in_memory.reset();
605                            leader_cached_kv_backend.reset();
606                            info!("Leader's cache has bean cleared on leader change: {msg}");
607                            match msg {
608                                LeaderChangeMessage::Elected(_) => {
609                                    state_handler.on_leader_start().await;
610                                }
611                                LeaderChangeMessage::StepDown(leader) => {
612                                    error!("Leader :{:?} step down", leader);
613
614                                    state_handler.on_leader_stop().await;
615                                }
616                            }
617                        }
618                        Err(RecvError::Closed) => {
619                            error!("Not expected, is leader election loop still running?");
620                            break;
621                        }
622                        Err(RecvError::Lagged(_)) => {
623                            break;
624                        }
625                    }
626                }
627
628                state_handler.on_leader_stop().await;
629            });
630
631            // Register candidate and keep lease in background.
632            {
633                let election = election.clone();
634                let started = self.started.clone();
635                let node_info = self.node_info();
636                let _handle = common_runtime::spawn_global(async move {
637                    while started.load(Ordering::Acquire) {
638                        let res = election.register_candidate(&node_info).await;
639                        if let Err(e) = res {
640                            warn!(e; "Metasrv register candidate error");
641                        }
642                    }
643                });
644            }
645
646            // Campaign
647            {
648                let election = election.clone();
649                let started = self.started.clone();
650                let _handle = common_runtime::spawn_global(async move {
651                    while started.load(Ordering::Acquire) {
652                        let res = election.campaign().await;
653                        if let Err(e) = res {
654                            warn!(e; "Metasrv election error");
655                        }
656                        election.reset_campaign().await;
657                        info!("Metasrv re-initiate election");
658                    }
659                    info!("Metasrv stopped");
660                });
661            }
662        } else {
663            warn!(
664                "Ensure only one instance of Metasrv is running, as there is no election service."
665            );
666
667            if let Err(e) = self.wal_options_allocator.start().await {
668                error!(e; "Failed to start wal options allocator");
669            }
670            // Always load kv into cached kv store.
671            self.leader_cached_kv_backend
672                .load()
673                .await
674                .context(KvBackendSnafu)?;
675            self.procedure_manager
676                .start()
677                .await
678                .context(StartProcedureManagerSnafu)?;
679        }
680
681        info!("Metasrv started");
682
683        Ok(())
684    }
685
686    pub async fn shutdown(&self) -> Result<()> {
687        if self
688            .started
689            .compare_exchange(true, false, Ordering::AcqRel, Ordering::Acquire)
690            .is_err()
691        {
692            warn!("Metasrv already stopped");
693            return Ok(());
694        }
695
696        self.procedure_manager
697            .stop()
698            .await
699            .context(StopProcedureManagerSnafu)?;
700
701        info!("Metasrv stopped");
702
703        Ok(())
704    }
705
    /// Returns the start timestamp of this metasrv in milliseconds.
    pub fn start_time_ms(&self) -> u64 {
        self.start_time_ms
    }

    /// Returns the resource statistics handle.
    pub fn resource_stat(&self) -> &ResourceStatRef {
        &self.resource_stat
    }
713
714    pub fn node_info(&self) -> MetasrvNodeInfo {
715        let build_info = common_version::build_info();
716        MetasrvNodeInfo {
717            addr: self.options().grpc.server_addr.clone(),
718            version: build_info.version.to_string(),
719            git_commit: build_info.commit_short.to_string(),
720            start_time_ms: self.start_time_ms(),
721            total_cpu_millicores: self.resource_stat.get_total_cpu_millicores(),
722            total_memory_bytes: self.resource_stat.get_total_memory_bytes(),
723            cpu_usage_millicores: self.resource_stat.get_cpu_usage_millicores(),
724            memory_usage_bytes: self.resource_stat.get_memory_usage_bytes(),
725            hostname: hostname::get()
726                .unwrap_or_default()
727                .to_string_lossy()
728                .to_string(),
729        }
730    }
731
732    /// Looks up a datanode peer by peer_id, returning it only when it's alive.
733    /// A datanode is considered alive when it's still within the lease period.
734    pub(crate) async fn lookup_datanode_peer(&self, peer_id: u64) -> Result<Option<Peer>> {
735        discovery::utils::alive_datanode(
736            self.meta_peer_client.as_ref(),
737            peer_id,
738            Duration::from_secs(distributed_time_constants::DATANODE_LEASE_SECS),
739        )
740        .await
741    }
742
    /// Returns the options this metasrv was built with.
    pub fn options(&self) -> &MetasrvOptions {
        &self.options
    }

    /// Returns the resettable in-memory kv backend.
    pub fn in_memory(&self) -> &ResettableKvBackendRef {
        &self.in_memory
    }

    /// Returns the metadata kv backend.
    pub fn kv_backend(&self) -> &KvBackendRef {
        &self.kv_backend
    }

    /// Returns the meta peer client.
    pub fn meta_peer_client(&self) -> &MetaPeerClientRef {
        &self.meta_peer_client
    }

    /// Returns the selector used to select a target datanode.
    pub fn selector(&self) -> &SelectorRef {
        &self.selector
    }

    /// Returns the context handed to the selectors.
    pub fn selector_ctx(&self) -> &SelectorContext {
        &self.selector_ctx
    }

    /// Returns the selector used to select a target flownode.
    pub fn flow_selector(&self) -> &SelectorRef {
        &self.flow_selector
    }

    /// Returns a clone of the heartbeat handler group, which `try_start`
    /// installs.
    ///
    /// Panics if the lock guarding the handler group is poisoned.
    pub fn handler_group(&self) -> Option<HeartbeatHandlerGroupRef> {
        self.handler_group.read().unwrap().clone()
    }

    /// Returns the election service, or `None` when there is none.
    pub fn election(&self) -> Option<&ElectionRef> {
        self.election.as_ref()
    }

    /// Returns the mailbox.
    pub fn mailbox(&self) -> &MailboxRef {
        &self.mailbox
    }

    /// Returns the DDL manager.
    pub fn ddl_manager(&self) -> &DdlManagerRef {
        &self.ddl_manager
    }

    /// Returns the procedure manager.
    pub fn procedure_manager(&self) -> &ProcedureManagerRef {
        &self.procedure_manager
    }

    /// Returns the table metadata manager.
    pub fn table_metadata_manager(&self) -> &TableMetadataManagerRef {
        &self.table_metadata_manager
    }

    /// Returns the runtime switch manager.
    pub fn runtime_switch_manager(&self) -> &RuntimeSwitchManagerRef {
        &self.runtime_switch_manager
    }

    /// Returns the memory region keeper.
    pub fn memory_region_keeper(&self) -> &MemoryRegionKeeperRef {
        &self.memory_region_keeper
    }

    /// Returns the region migration manager.
    pub fn region_migration_manager(&self) -> &RegionMigrationManagerRef {
        &self.region_migration_manager
    }

    /// Returns the publisher registered in the plugins, if any.
    pub fn publish(&self) -> Option<PublisherRef> {
        self.plugins.get::<PublisherRef>()
    }

    /// Returns the subscription manager registered in the plugins, if any.
    pub fn subscription_manager(&self) -> Option<SubscriptionManagerRef> {
        self.plugins.get::<SubscriptionManagerRef>()
    }

    /// Returns the table id sequence.
    pub fn table_id_sequence(&self) -> &SequenceRef {
        &self.table_id_sequence
    }

    /// Returns the reconciliation manager.
    pub fn reconciliation_manager(&self) -> &ReconciliationManagerRef {
        &self.reconciliation_manager
    }

    /// Returns the plugins.
    pub fn plugins(&self) -> &Plugins {
        &self.plugins
    }

    /// Returns a shared handle to the `started` flag, which `try_start` sets
    /// and `shutdown` clears.
    pub fn started(&self) -> Arc<AtomicBool> {
        self.started.clone()
    }
830
831    #[inline]
832    pub fn new_ctx(&self) -> Context {
833        let server_addr = self.options().grpc.server_addr.clone();
834        let in_memory = self.in_memory.clone();
835        let kv_backend = self.kv_backend.clone();
836        let leader_cached_kv_backend = self.leader_cached_kv_backend.clone();
837        let meta_peer_client = self.meta_peer_client.clone();
838        let mailbox = self.mailbox.clone();
839        let election = self.election.clone();
840        let table_metadata_manager = self.table_metadata_manager.clone();
841        let cache_invalidator = self.cache_invalidator.clone();
842        let leader_region_registry = self.leader_region_registry.clone();
843        let topic_stats_registry = self.topic_stats_registry.clone();
844
845        Context {
846            server_addr,
847            in_memory,
848            kv_backend,
849            leader_cached_kv_backend,
850            meta_peer_client,
851            mailbox,
852            election,
853            is_infancy: false,
854            table_metadata_manager,
855            cache_invalidator,
856            leader_region_registry,
857            topic_stats_registry,
858        }
859    }
860}