meta_srv/
metasrv.rs

1// Copyright 2023 Greptime Team
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15pub mod builder;
16
17use std::fmt::{self, Display};
18use std::sync::atomic::{AtomicBool, Ordering};
19use std::sync::{Arc, Mutex, RwLock};
20use std::time::Duration;
21
22use clap::ValueEnum;
23use common_base::readable_size::ReadableSize;
24use common_base::Plugins;
25use common_config::{Configurable, DEFAULT_DATA_HOME};
26use common_event_recorder::EventRecorderOptions;
27use common_greptimedb_telemetry::GreptimeDBTelemetryTask;
28use common_meta::cache_invalidator::CacheInvalidatorRef;
29use common_meta::ddl_manager::DdlManagerRef;
30use common_meta::distributed_time_constants;
31use common_meta::key::runtime_switch::RuntimeSwitchManagerRef;
32use common_meta::key::TableMetadataManagerRef;
33use common_meta::kv_backend::{KvBackendRef, ResettableKvBackend, ResettableKvBackendRef};
34use common_meta::leadership_notifier::{
35    LeadershipChangeNotifier, LeadershipChangeNotifierCustomizerRef,
36};
37use common_meta::node_expiry_listener::NodeExpiryListener;
38use common_meta::peer::Peer;
39use common_meta::reconciliation::manager::ReconciliationManagerRef;
40use common_meta::region_keeper::MemoryRegionKeeperRef;
41use common_meta::region_registry::LeaderRegionRegistryRef;
42use common_meta::sequence::SequenceRef;
43use common_meta::stats::topic::TopicStatsRegistryRef;
44use common_meta::wal_options_allocator::WalOptionsAllocatorRef;
45use common_options::datanode::DatanodeClientOptions;
46use common_options::memory::MemoryOptions;
47use common_procedure::options::ProcedureConfig;
48use common_procedure::ProcedureManagerRef;
49use common_telemetry::logging::{LoggingOptions, TracingOptions};
50use common_telemetry::{error, info, warn};
51use common_wal::config::MetasrvWalConfig;
52use serde::{Deserialize, Serialize};
53use servers::export_metrics::ExportMetricsOption;
54use servers::grpc::GrpcOptions;
55use servers::http::HttpOptions;
56use servers::tls::TlsOption;
57use snafu::{OptionExt, ResultExt};
58use store_api::storage::RegionId;
59use table::metadata::TableId;
60use tokio::sync::broadcast::error::RecvError;
61
62use crate::cluster::MetaPeerClientRef;
63use crate::election::{Election, LeaderChangeMessage};
64use crate::error::{
65    self, InitMetadataSnafu, KvBackendSnafu, Result, StartProcedureManagerSnafu,
66    StartTelemetryTaskSnafu, StopProcedureManagerSnafu,
67};
68use crate::failure_detector::PhiAccrualFailureDetectorOptions;
69use crate::handler::{HeartbeatHandlerGroupBuilder, HeartbeatHandlerGroupRef};
70use crate::lease::lookup_datanode_peer;
71use crate::procedure::region_migration::manager::RegionMigrationManagerRef;
72use crate::procedure::wal_prune::manager::WalPruneTickerRef;
73use crate::procedure::ProcedureManagerListenerAdapter;
74use crate::pubsub::{PublisherRef, SubscriptionManagerRef};
75use crate::region::flush_trigger::RegionFlushTickerRef;
76use crate::region::supervisor::RegionSupervisorTickerRef;
77use crate::selector::{RegionStatAwareSelector, Selector, SelectorType};
78use crate::service::mailbox::MailboxRef;
79use crate::service::store::cached_kv::LeaderCachedKvBackend;
80use crate::state::{become_follower, become_leader, StateRef};
81
/// The string `"table_id"`. Judging by the identifier, this names the sequence used for
/// table ids; its consumers live outside this file, so confirm the usage there.
pub const TABLE_ID_SEQ: &str = "table_id";
/// The string `"flow_id"`. Presumably the flow-id counterpart of the table id sequence name.
pub const FLOW_ID_SEQ: &str = "flow_id";
/// The string `"metasrv"`. Presumably the metasrv sub-directory name under the data home.
pub const METASRV_DATA_DIR: &str = "metasrv";
85
86// The datastores that implements metadata kvbackend.
87#[derive(Clone, Debug, PartialEq, Serialize, Default, Deserialize, ValueEnum)]
88#[serde(rename_all = "snake_case")]
89pub enum BackendImpl {
90    // Etcd as metadata storage.
91    #[default]
92    EtcdStore,
93    // In memory metadata storage - mostly used for testing.
94    MemoryStore,
95    #[cfg(feature = "pg_kvbackend")]
96    // Postgres as metadata storage.
97    PostgresStore,
98    #[cfg(feature = "mysql_kvbackend")]
99    // MySql as metadata storage.
100    MysqlStore,
101}
102
103/// Configuration options for the stats persistence.
104#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
105pub struct StatsPersistenceOptions {
106    /// TTL for the stats table that will be used to store the stats.
107    #[serde(with = "humantime_serde")]
108    pub ttl: Duration,
109    /// The interval to persist the stats.
110    #[serde(with = "humantime_serde")]
111    pub interval: Duration,
112}
113
114impl Default for StatsPersistenceOptions {
115    fn default() -> Self {
116        Self {
117            ttl: Duration::ZERO,
118            interval: Duration::from_mins(10),
119        }
120    }
121}
122
123#[derive(Clone, PartialEq, Serialize, Deserialize)]
124#[serde(default)]
125pub struct MetasrvOptions {
126    /// The address the server listens on.
127    #[deprecated(note = "Use grpc.bind_addr instead")]
128    pub bind_addr: String,
129    /// The address the server advertises to the clients.
130    #[deprecated(note = "Use grpc.server_addr instead")]
131    pub server_addr: String,
132    /// The address of the store, e.g., etcd.
133    pub store_addrs: Vec<String>,
134    /// TLS configuration for kv store backend (PostgreSQL/MySQL)
135    /// Only applicable when using PostgreSQL or MySQL as the metadata store
136    #[serde(default)]
137    pub backend_tls: Option<TlsOption>,
138    /// The type of selector.
139    pub selector: SelectorType,
140    /// Whether to use the memory store.
141    pub use_memory_store: bool,
142    /// Whether to enable region failover.
143    pub enable_region_failover: bool,
144    /// The delay before starting region failure detection.
145    /// This delay helps prevent Metasrv from triggering unnecessary region failovers before all Datanodes are fully started.
146    /// Especially useful when the cluster is not deployed with GreptimeDB Operator and maintenance mode is not enabled.
147    #[serde(with = "humantime_serde")]
148    pub region_failure_detector_initialization_delay: Duration,
149    /// Whether to allow region failover on local WAL.
150    ///
151    /// If it's true, the region failover will be allowed even if the local WAL is used.
152    /// Note that this option is not recommended to be set to true, because it may lead to data loss during failover.
153    pub allow_region_failover_on_local_wal: bool,
154    pub grpc: GrpcOptions,
155    /// The HTTP server options.
156    pub http: HttpOptions,
157    /// The logging options.
158    pub logging: LoggingOptions,
159    /// The procedure options.
160    pub procedure: ProcedureConfig,
161    /// The failure detector options.
162    pub failure_detector: PhiAccrualFailureDetectorOptions,
163    /// The datanode options.
164    pub datanode: DatanodeClientOptions,
165    /// Whether to enable telemetry.
166    pub enable_telemetry: bool,
167    /// The data home directory.
168    pub data_home: String,
169    /// The WAL options.
170    pub wal: MetasrvWalConfig,
171    /// The metrics export options.
172    pub export_metrics: ExportMetricsOption,
173    /// The store key prefix. If it is not empty, all keys in the store will be prefixed with it.
174    /// This is useful when multiple metasrv clusters share the same store.
175    pub store_key_prefix: String,
176    /// The max operations per txn
177    ///
178    /// This value is usually limited by which store is used for the `KvBackend`.
179    /// For example, if using etcd, this value should ensure that it is less than
180    /// or equal to the `--max-txn-ops` option value of etcd.
181    ///
182    /// TODO(jeremy): Currently, this option only affects the etcd store, but it may
183    /// also affect other stores in the future. In other words, each store needs to
184    /// limit the number of operations in a txn because an infinitely large txn could
185    /// potentially block other operations.
186    pub max_txn_ops: usize,
187    /// The factor that determines how often statistics should be flushed,
188    /// based on the number of received heartbeats. When the number of heartbeats
189    /// reaches this factor, a flush operation is triggered.
190    pub flush_stats_factor: usize,
191    /// The tracing options.
192    pub tracing: TracingOptions,
193    /// The memory options.
194    pub memory: MemoryOptions,
195    /// The datastore for kv metadata.
196    pub backend: BackendImpl,
197    #[cfg(any(feature = "pg_kvbackend", feature = "mysql_kvbackend"))]
198    /// Table name of rds kv backend.
199    pub meta_table_name: String,
200    #[cfg(feature = "pg_kvbackend")]
201    /// Lock id for meta kv election. Only effect when using pg_kvbackend.
202    pub meta_election_lock_id: u64,
203    #[cfg(feature = "pg_kvbackend")]
204    /// Optional PostgreSQL schema for metadata table (defaults to current search_path if empty).
205    pub meta_schema_name: Option<String>,
206    #[serde(with = "humantime_serde")]
207    pub node_max_idle_time: Duration,
208    /// The event recorder options.
209    pub event_recorder: EventRecorderOptions,
210    /// The stats persistence options.
211    pub stats_persistence: StatsPersistenceOptions,
212}
213
214impl fmt::Debug for MetasrvOptions {
215    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
216        let mut debug_struct = f.debug_struct("MetasrvOptions");
217        debug_struct
218            .field("store_addrs", &self.sanitize_store_addrs())
219            .field("backend_tls", &self.backend_tls)
220            .field("selector", &self.selector)
221            .field("use_memory_store", &self.use_memory_store)
222            .field("enable_region_failover", &self.enable_region_failover)
223            .field(
224                "allow_region_failover_on_local_wal",
225                &self.allow_region_failover_on_local_wal,
226            )
227            .field("grpc", &self.grpc)
228            .field("http", &self.http)
229            .field("logging", &self.logging)
230            .field("procedure", &self.procedure)
231            .field("failure_detector", &self.failure_detector)
232            .field("datanode", &self.datanode)
233            .field("enable_telemetry", &self.enable_telemetry)
234            .field("data_home", &self.data_home)
235            .field("wal", &self.wal)
236            .field("export_metrics", &self.export_metrics)
237            .field("store_key_prefix", &self.store_key_prefix)
238            .field("max_txn_ops", &self.max_txn_ops)
239            .field("flush_stats_factor", &self.flush_stats_factor)
240            .field("tracing", &self.tracing)
241            .field("backend", &self.backend)
242            .field("event_recorder", &self.event_recorder)
243            .field("stats_persistence", &self.stats_persistence);
244
245        #[cfg(any(feature = "pg_kvbackend", feature = "mysql_kvbackend"))]
246        debug_struct.field("meta_table_name", &self.meta_table_name);
247
248        #[cfg(feature = "pg_kvbackend")]
249        debug_struct.field("meta_election_lock_id", &self.meta_election_lock_id);
250        #[cfg(feature = "pg_kvbackend")]
251        debug_struct.field("meta_schema_name", &self.meta_schema_name);
252
253        debug_struct
254            .field("node_max_idle_time", &self.node_max_idle_time)
255            .finish()
256    }
257}
258
259const DEFAULT_METASRV_ADDR_PORT: &str = "3002";
260
261impl Default for MetasrvOptions {
262    fn default() -> Self {
263        Self {
264            #[allow(deprecated)]
265            bind_addr: String::new(),
266            #[allow(deprecated)]
267            server_addr: String::new(),
268            store_addrs: vec!["127.0.0.1:2379".to_string()],
269            backend_tls: None,
270            selector: SelectorType::default(),
271            use_memory_store: false,
272            enable_region_failover: false,
273            region_failure_detector_initialization_delay: Duration::from_secs(10 * 60),
274            allow_region_failover_on_local_wal: false,
275            grpc: GrpcOptions {
276                bind_addr: format!("127.0.0.1:{}", DEFAULT_METASRV_ADDR_PORT),
277                ..Default::default()
278            },
279            http: HttpOptions::default(),
280            logging: LoggingOptions::default(),
281            procedure: ProcedureConfig {
282                max_retry_times: 12,
283                retry_delay: Duration::from_millis(500),
284                // The etcd the maximum size of any request is 1.5 MiB
285                // 1500KiB = 1536KiB (1.5MiB) - 36KiB (reserved size of key)
286                max_metadata_value_size: Some(ReadableSize::kb(1500)),
287                max_running_procedures: 128,
288            },
289            failure_detector: PhiAccrualFailureDetectorOptions::default(),
290            datanode: DatanodeClientOptions::default(),
291            enable_telemetry: true,
292            data_home: DEFAULT_DATA_HOME.to_string(),
293            wal: MetasrvWalConfig::default(),
294            export_metrics: ExportMetricsOption::default(),
295            store_key_prefix: String::new(),
296            max_txn_ops: 128,
297            flush_stats_factor: 3,
298            tracing: TracingOptions::default(),
299            memory: MemoryOptions::default(),
300            backend: BackendImpl::EtcdStore,
301            #[cfg(any(feature = "pg_kvbackend", feature = "mysql_kvbackend"))]
302            meta_table_name: common_meta::kv_backend::DEFAULT_META_TABLE_NAME.to_string(),
303            #[cfg(feature = "pg_kvbackend")]
304            meta_election_lock_id: common_meta::kv_backend::DEFAULT_META_ELECTION_LOCK_ID,
305            #[cfg(feature = "pg_kvbackend")]
306            meta_schema_name: None,
307            node_max_idle_time: Duration::from_secs(24 * 60 * 60),
308            event_recorder: EventRecorderOptions::default(),
309            stats_persistence: StatsPersistenceOptions::default(),
310        }
311    }
312}
313
314impl Configurable for MetasrvOptions {
315    fn env_list_keys() -> Option<&'static [&'static str]> {
316        Some(&["wal.broker_endpoints", "store_addrs"])
317    }
318}
319
320impl MetasrvOptions {
321    fn sanitize_store_addrs(&self) -> Vec<String> {
322        self.store_addrs
323            .iter()
324            .map(|addr| common_meta::kv_backend::util::sanitize_connection_string(addr))
325            .collect()
326    }
327}
328
/// Minimal description of a metasrv: only its server address.
pub struct MetasrvInfo {
    /// The address of the metasrv.
    pub server_addr: String,
}
332#[derive(Clone)]
333pub struct Context {
334    pub server_addr: String,
335    pub in_memory: ResettableKvBackendRef,
336    pub kv_backend: KvBackendRef,
337    pub leader_cached_kv_backend: ResettableKvBackendRef,
338    pub meta_peer_client: MetaPeerClientRef,
339    pub mailbox: MailboxRef,
340    pub election: Option<ElectionRef>,
341    pub is_infancy: bool,
342    pub table_metadata_manager: TableMetadataManagerRef,
343    pub cache_invalidator: CacheInvalidatorRef,
344    pub leader_region_registry: LeaderRegionRegistryRef,
345    pub topic_stats_registry: TopicStatsRegistryRef,
346}
347
348impl Context {
349    pub fn reset_in_memory(&self) {
350        self.in_memory.reset();
351        self.leader_region_registry.reset();
352    }
353}
354
/// The value of the leader. It is used to store the leader's address.
pub struct LeaderValue(pub String);

impl<T: AsRef<[u8]>> From<T> for LeaderValue {
    /// Builds a [`LeaderValue`] from raw bytes.
    ///
    /// The bytes are decoded as UTF-8 lossily: invalid sequences are replaced with U+FFFD
    /// instead of producing an error.
    fn from(value: T) -> Self {
        // `into_owned` reuses the buffer when the lossy conversion already had to allocate,
        // and copies only in the borrowed (valid UTF-8) case.
        Self(String::from_utf8_lossy(value.as_ref()).into_owned())
    }
}
364
365#[derive(Debug, Clone, Serialize, Deserialize)]
366pub struct MetasrvNodeInfo {
367    // The metasrv's address
368    pub addr: String,
369    // The node build version
370    pub version: String,
371    // The node build git commit hash
372    pub git_commit: String,
373    // The node start timestamp in milliseconds
374    pub start_time_ms: u64,
375}
376
377impl From<MetasrvNodeInfo> for api::v1::meta::MetasrvNodeInfo {
378    fn from(node_info: MetasrvNodeInfo) -> Self {
379        Self {
380            peer: Some(api::v1::meta::Peer {
381                addr: node_info.addr,
382                ..Default::default()
383            }),
384            version: node_info.version,
385            git_commit: node_info.git_commit,
386            start_time_ms: node_info.start_time_ms,
387        }
388    }
389}
390
/// The kind of node a selection targets.
#[derive(Clone, Copy)]
pub enum SelectTarget {
    /// A datanode.
    Datanode,
    /// A flownode.
    Flownode,
}

impl Display for SelectTarget {
    /// Writes the lowercase name of the target: `datanode` or `flownode`.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let name = match self {
            Self::Datanode => "datanode",
            Self::Flownode => "flownode",
        };
        f.write_str(name)
    }
}
405
406#[derive(Clone)]
407pub struct SelectorContext {
408    pub server_addr: String,
409    pub datanode_lease_secs: u64,
410    pub flownode_lease_secs: u64,
411    pub kv_backend: KvBackendRef,
412    pub meta_peer_client: MetaPeerClientRef,
413    pub table_id: Option<TableId>,
414}
415
416pub type SelectorRef = Arc<dyn Selector<Context = SelectorContext, Output = Vec<Peer>>>;
417pub type RegionStatAwareSelectorRef =
418    Arc<dyn RegionStatAwareSelector<Context = SelectorContext, Output = Vec<(RegionId, Peer)>>>;
419pub type ElectionRef = Arc<dyn Election<Leader = LeaderValue>>;
420
421pub struct MetaStateHandler {
422    subscribe_manager: Option<SubscriptionManagerRef>,
423    greptimedb_telemetry_task: Arc<GreptimeDBTelemetryTask>,
424    leader_cached_kv_backend: Arc<LeaderCachedKvBackend>,
425    leadership_change_notifier: LeadershipChangeNotifier,
426    state: StateRef,
427}
428
429impl MetaStateHandler {
430    pub async fn on_leader_start(&self) {
431        self.state.write().unwrap().next_state(become_leader(false));
432
433        if let Err(e) = self.leader_cached_kv_backend.load().await {
434            error!(e; "Failed to load kv into leader cache kv store");
435        } else {
436            self.state.write().unwrap().next_state(become_leader(true));
437        }
438
439        self.leadership_change_notifier
440            .notify_on_leader_start()
441            .await;
442
443        self.greptimedb_telemetry_task.should_report(true);
444    }
445
446    pub async fn on_leader_stop(&self) {
447        self.state.write().unwrap().next_state(become_follower());
448
449        self.leadership_change_notifier
450            .notify_on_leader_stop()
451            .await;
452
453        // Suspends reporting.
454        self.greptimedb_telemetry_task.should_report(false);
455
456        if let Some(sub_manager) = self.subscribe_manager.clone() {
457            info!("Leader changed, un_subscribe all");
458            if let Err(e) = sub_manager.unsubscribe_all() {
459                error!(e; "Failed to un_subscribe all");
460            }
461        }
462    }
463}
464
465pub struct Metasrv {
466    state: StateRef,
467    started: Arc<AtomicBool>,
468    start_time_ms: u64,
469    options: MetasrvOptions,
470    // It is only valid at the leader node and is used to temporarily
471    // store some data that will not be persisted.
472    in_memory: ResettableKvBackendRef,
473    kv_backend: KvBackendRef,
474    leader_cached_kv_backend: Arc<LeaderCachedKvBackend>,
475    meta_peer_client: MetaPeerClientRef,
476    // The selector is used to select a target datanode.
477    selector: SelectorRef,
478    selector_ctx: SelectorContext,
479    // The flow selector is used to select a target flownode.
480    flow_selector: SelectorRef,
481    handler_group: RwLock<Option<HeartbeatHandlerGroupRef>>,
482    handler_group_builder: Mutex<Option<HeartbeatHandlerGroupBuilder>>,
483    election: Option<ElectionRef>,
484    procedure_manager: ProcedureManagerRef,
485    mailbox: MailboxRef,
486    ddl_manager: DdlManagerRef,
487    wal_options_allocator: WalOptionsAllocatorRef,
488    table_metadata_manager: TableMetadataManagerRef,
489    runtime_switch_manager: RuntimeSwitchManagerRef,
490    memory_region_keeper: MemoryRegionKeeperRef,
491    greptimedb_telemetry_task: Arc<GreptimeDBTelemetryTask>,
492    region_migration_manager: RegionMigrationManagerRef,
493    region_supervisor_ticker: Option<RegionSupervisorTickerRef>,
494    cache_invalidator: CacheInvalidatorRef,
495    leader_region_registry: LeaderRegionRegistryRef,
496    topic_stats_registry: TopicStatsRegistryRef,
497    wal_prune_ticker: Option<WalPruneTickerRef>,
498    region_flush_ticker: Option<RegionFlushTickerRef>,
499    table_id_sequence: SequenceRef,
500    reconciliation_manager: ReconciliationManagerRef,
501
502    plugins: Plugins,
503}
504
505impl Metasrv {
506    pub async fn try_start(&self) -> Result<()> {
507        if self
508            .started
509            .compare_exchange(false, true, Ordering::AcqRel, Ordering::Acquire)
510            .is_err()
511        {
512            warn!("Metasrv already started");
513            return Ok(());
514        }
515
516        let handler_group_builder =
517            self.handler_group_builder
518                .lock()
519                .unwrap()
520                .take()
521                .context(error::UnexpectedSnafu {
522                    violated: "expected heartbeat handler group builder",
523                })?;
524        *self.handler_group.write().unwrap() = Some(Arc::new(handler_group_builder.build()?));
525
526        // Creates default schema if not exists
527        self.table_metadata_manager
528            .init()
529            .await
530            .context(InitMetadataSnafu)?;
531
532        if let Some(election) = self.election() {
533            let procedure_manager = self.procedure_manager.clone();
534            let in_memory = self.in_memory.clone();
535            let leader_cached_kv_backend = self.leader_cached_kv_backend.clone();
536            let subscribe_manager = self.subscription_manager();
537            let mut rx = election.subscribe_leader_change();
538            let greptimedb_telemetry_task = self.greptimedb_telemetry_task.clone();
539            greptimedb_telemetry_task
540                .start()
541                .context(StartTelemetryTaskSnafu)?;
542
543            // Builds leadership change notifier.
544            let mut leadership_change_notifier = LeadershipChangeNotifier::default();
545            leadership_change_notifier.add_listener(self.wal_options_allocator.clone());
546            leadership_change_notifier
547                .add_listener(Arc::new(ProcedureManagerListenerAdapter(procedure_manager)));
548            leadership_change_notifier.add_listener(Arc::new(NodeExpiryListener::new(
549                self.options.node_max_idle_time,
550                self.in_memory.clone(),
551            )));
552            if let Some(region_supervisor_ticker) = &self.region_supervisor_ticker {
553                leadership_change_notifier.add_listener(region_supervisor_ticker.clone() as _);
554            }
555            if let Some(wal_prune_ticker) = &self.wal_prune_ticker {
556                leadership_change_notifier.add_listener(wal_prune_ticker.clone() as _);
557            }
558            if let Some(region_flush_trigger) = &self.region_flush_ticker {
559                leadership_change_notifier.add_listener(region_flush_trigger.clone() as _);
560            }
561            if let Some(customizer) = self.plugins.get::<LeadershipChangeNotifierCustomizerRef>() {
562                customizer.customize(&mut leadership_change_notifier);
563            }
564
565            let state_handler = MetaStateHandler {
566                greptimedb_telemetry_task,
567                subscribe_manager,
568                state: self.state.clone(),
569                leader_cached_kv_backend: leader_cached_kv_backend.clone(),
570                leadership_change_notifier,
571            };
572            let _handle = common_runtime::spawn_global(async move {
573                loop {
574                    match rx.recv().await {
575                        Ok(msg) => {
576                            in_memory.reset();
577                            leader_cached_kv_backend.reset();
578                            info!("Leader's cache has bean cleared on leader change: {msg}");
579                            match msg {
580                                LeaderChangeMessage::Elected(_) => {
581                                    state_handler.on_leader_start().await;
582                                }
583                                LeaderChangeMessage::StepDown(leader) => {
584                                    error!("Leader :{:?} step down", leader);
585
586                                    state_handler.on_leader_stop().await;
587                                }
588                            }
589                        }
590                        Err(RecvError::Closed) => {
591                            error!("Not expected, is leader election loop still running?");
592                            break;
593                        }
594                        Err(RecvError::Lagged(_)) => {
595                            break;
596                        }
597                    }
598                }
599
600                state_handler.on_leader_stop().await;
601            });
602
603            // Register candidate and keep lease in background.
604            {
605                let election = election.clone();
606                let started = self.started.clone();
607                let node_info = self.node_info();
608                let _handle = common_runtime::spawn_global(async move {
609                    while started.load(Ordering::Acquire) {
610                        let res = election.register_candidate(&node_info).await;
611                        if let Err(e) = res {
612                            warn!(e; "Metasrv register candidate error");
613                        }
614                    }
615                });
616            }
617
618            // Campaign
619            {
620                let election = election.clone();
621                let started = self.started.clone();
622                let _handle = common_runtime::spawn_global(async move {
623                    while started.load(Ordering::Acquire) {
624                        let res = election.campaign().await;
625                        if let Err(e) = res {
626                            warn!(e; "Metasrv election error");
627                        }
628                        election.reset_campaign().await;
629                        info!("Metasrv re-initiate election");
630                    }
631                    info!("Metasrv stopped");
632                });
633            }
634        } else {
635            warn!(
636                "Ensure only one instance of Metasrv is running, as there is no election service."
637            );
638
639            if let Err(e) = self.wal_options_allocator.start().await {
640                error!(e; "Failed to start wal options allocator");
641            }
642            // Always load kv into cached kv store.
643            self.leader_cached_kv_backend
644                .load()
645                .await
646                .context(KvBackendSnafu)?;
647            self.procedure_manager
648                .start()
649                .await
650                .context(StartProcedureManagerSnafu)?;
651        }
652
653        info!("Metasrv started");
654
655        Ok(())
656    }
657
658    pub async fn shutdown(&self) -> Result<()> {
659        if self
660            .started
661            .compare_exchange(true, false, Ordering::AcqRel, Ordering::Acquire)
662            .is_err()
663        {
664            warn!("Metasrv already stopped");
665            return Ok(());
666        }
667
668        self.procedure_manager
669            .stop()
670            .await
671            .context(StopProcedureManagerSnafu)?;
672
673        info!("Metasrv stopped");
674
675        Ok(())
676    }
677
678    pub fn start_time_ms(&self) -> u64 {
679        self.start_time_ms
680    }
681
682    pub fn node_info(&self) -> MetasrvNodeInfo {
683        let build_info = common_version::build_info();
684        MetasrvNodeInfo {
685            addr: self.options().grpc.server_addr.clone(),
686            version: build_info.version.to_string(),
687            git_commit: build_info.commit_short.to_string(),
688            start_time_ms: self.start_time_ms(),
689        }
690    }
691
692    /// Looks up a datanode peer by peer_id, returning it only when it's alive.
693    /// A datanode is considered alive when it's still within the lease period.
694    pub(crate) async fn lookup_datanode_peer(&self, peer_id: u64) -> Result<Option<Peer>> {
695        lookup_datanode_peer(
696            peer_id,
697            &self.meta_peer_client,
698            Duration::from_secs(distributed_time_constants::DATANODE_LEASE_SECS),
699        )
700        .await
701    }
702
703    pub fn options(&self) -> &MetasrvOptions {
704        &self.options
705    }
706
707    pub fn in_memory(&self) -> &ResettableKvBackendRef {
708        &self.in_memory
709    }
710
711    pub fn kv_backend(&self) -> &KvBackendRef {
712        &self.kv_backend
713    }
714
715    pub fn meta_peer_client(&self) -> &MetaPeerClientRef {
716        &self.meta_peer_client
717    }
718
719    pub fn selector(&self) -> &SelectorRef {
720        &self.selector
721    }
722
723    pub fn selector_ctx(&self) -> &SelectorContext {
724        &self.selector_ctx
725    }
726
727    pub fn flow_selector(&self) -> &SelectorRef {
728        &self.flow_selector
729    }
730
731    pub fn handler_group(&self) -> Option<HeartbeatHandlerGroupRef> {
732        self.handler_group.read().unwrap().clone()
733    }
734
735    pub fn election(&self) -> Option<&ElectionRef> {
736        self.election.as_ref()
737    }
738
739    pub fn mailbox(&self) -> &MailboxRef {
740        &self.mailbox
741    }
742
743    pub fn ddl_manager(&self) -> &DdlManagerRef {
744        &self.ddl_manager
745    }
746
747    pub fn procedure_manager(&self) -> &ProcedureManagerRef {
748        &self.procedure_manager
749    }
750
751    pub fn table_metadata_manager(&self) -> &TableMetadataManagerRef {
752        &self.table_metadata_manager
753    }
754
755    pub fn runtime_switch_manager(&self) -> &RuntimeSwitchManagerRef {
756        &self.runtime_switch_manager
757    }
758
759    pub fn memory_region_keeper(&self) -> &MemoryRegionKeeperRef {
760        &self.memory_region_keeper
761    }
762
763    pub fn region_migration_manager(&self) -> &RegionMigrationManagerRef {
764        &self.region_migration_manager
765    }
766
767    pub fn publish(&self) -> Option<PublisherRef> {
768        self.plugins.get::<PublisherRef>()
769    }
770
771    pub fn subscription_manager(&self) -> Option<SubscriptionManagerRef> {
772        self.plugins.get::<SubscriptionManagerRef>()
773    }
774
775    pub fn table_id_sequence(&self) -> &SequenceRef {
776        &self.table_id_sequence
777    }
778
779    pub fn reconciliation_manager(&self) -> &ReconciliationManagerRef {
780        &self.reconciliation_manager
781    }
782
783    pub fn plugins(&self) -> &Plugins {
784        &self.plugins
785    }
786
787    #[inline]
788    pub fn new_ctx(&self) -> Context {
789        let server_addr = self.options().grpc.server_addr.clone();
790        let in_memory = self.in_memory.clone();
791        let kv_backend = self.kv_backend.clone();
792        let leader_cached_kv_backend = self.leader_cached_kv_backend.clone();
793        let meta_peer_client = self.meta_peer_client.clone();
794        let mailbox = self.mailbox.clone();
795        let election = self.election.clone();
796        let table_metadata_manager = self.table_metadata_manager.clone();
797        let cache_invalidator = self.cache_invalidator.clone();
798        let leader_region_registry = self.leader_region_registry.clone();
799        let topic_stats_registry = self.topic_stats_registry.clone();
800
801        Context {
802            server_addr,
803            in_memory,
804            kv_backend,
805            leader_cached_kv_backend,
806            meta_peer_client,
807            mailbox,
808            election,
809            is_infancy: false,
810            table_metadata_manager,
811            cache_invalidator,
812            leader_region_registry,
813            topic_stats_registry,
814        }
815    }
816}