1use std::path::Path;
16use std::sync::atomic::AtomicBool;
17use std::sync::{Arc, Mutex, RwLock};
18use std::time::Duration;
19
20use client::client_manager::NodeClients;
21use client::inserter::InsertOptions;
22use common_base::Plugins;
23use common_catalog::consts::{MIN_USER_FLOW_ID, MIN_USER_TABLE_ID};
24use common_event_recorder::{DEFAULT_COMPACTION_TIME_WINDOW, EventRecorderImpl, EventRecorderRef};
25use common_grpc::channel_manager::ChannelConfig;
26use common_meta::ddl::flow_meta::FlowMetadataAllocator;
27use common_meta::ddl::table_meta::{TableMetadataAllocator, TableMetadataAllocatorRef};
28use common_meta::ddl::{
29 DdlContext, NoopRegionFailureDetectorControl, RegionFailureDetectorControllerRef,
30};
31use common_meta::ddl_manager::{DdlManager, DdlManagerConfiguratorRef};
32use common_meta::distributed_time_constants::default_distributed_time_constants;
33use common_meta::key::TableMetadataManager;
34use common_meta::key::flow::FlowMetadataManager;
35use common_meta::key::flow::flow_state::FlowStateManager;
36use common_meta::key::runtime_switch::{RuntimeSwitchManager, RuntimeSwitchManagerRef};
37use common_meta::kv_backend::memory::MemoryKvBackend;
38use common_meta::kv_backend::{KvBackendRef, ResettableKvBackendRef};
39use common_meta::node_manager::NodeManagerRef;
40use common_meta::reconciliation::manager::ReconciliationManager;
41use common_meta::region_keeper::MemoryRegionKeeper;
42use common_meta::region_registry::LeaderRegionRegistry;
43use common_meta::sequence::SequenceBuilder;
44use common_meta::state_store::KvStateStore;
45use common_meta::stats::topic::TopicStatsRegistry;
46use common_meta::wal_provider::{build_kafka_client, build_wal_provider};
47use common_procedure::ProcedureManagerRef;
48use common_procedure::local::{LocalManager, ManagerConfig};
49use common_stat::ResourceStatImpl;
50use common_telemetry::{info, warn};
51use snafu::{ResultExt, ensure};
52use store_api::storage::MAX_REGION_SEQ;
53
54use crate::bootstrap::build_default_meta_peer_client;
55use crate::cache_invalidator::MetasrvCacheInvalidator;
56use crate::cluster::MetaPeerClientRef;
57use crate::error::{self, BuildWalProviderSnafu, OtherSnafu, Result};
58use crate::events::EventHandlerImpl;
59use crate::gc::GcScheduler;
60use crate::greptimedb_telemetry::get_greptimedb_telemetry_task;
61use crate::handler::failure_handler::RegionFailureHandler;
62use crate::handler::flow_state_handler::FlowStateHandler;
63use crate::handler::persist_stats_handler::PersistStatsHandler;
64use crate::handler::region_lease_handler::{CustomizedRegionLeaseRenewerRef, RegionLeaseHandler};
65use crate::handler::{HeartbeatHandlerGroupBuilder, HeartbeatMailbox, Pushers};
66use crate::metasrv::{
67 ElectionRef, FLOW_ID_SEQ, METASRV_DATA_DIR, Metasrv, MetasrvInfo, MetasrvOptions,
68 RegionStatAwareSelectorRef, SelectTarget, SelectorContext, SelectorRef, TABLE_ID_SEQ,
69};
70use crate::peer::MetasrvPeerAllocator;
71use crate::procedure::region_migration::DefaultContextFactory;
72use crate::procedure::region_migration::manager::RegionMigrationManager;
73use crate::procedure::repartition::DefaultRepartitionProcedureFactory;
74use crate::procedure::wal_prune::Context as WalPruneContext;
75use crate::procedure::wal_prune::manager::{WalPruneManager, WalPruneTicker};
76use crate::region::flush_trigger::RegionFlushTrigger;
77use crate::region::supervisor::{
78 DEFAULT_INITIALIZATION_RETRY_PERIOD, DEFAULT_TICK_INTERVAL, HeartbeatAcceptor,
79 RegionFailureDetectorControl, RegionSupervisor, RegionSupervisorSelector,
80 RegionSupervisorTicker,
81};
82use crate::selector::lease_based::LeaseBasedSelector;
83use crate::selector::round_robin::RoundRobinSelector;
84use crate::service::mailbox::MailboxRef;
85use crate::service::store::cached_kv::LeaderCachedKvBackend;
86use crate::state::State;
87use crate::utils::database::DatabaseOperator;
88use crate::utils::insert_forwarder::InsertForwarder;
89
/// TWCS compaction time window (one day) passed in the `InsertOptions` of the
/// inserter that persists region stats.
///
/// Written with `Duration::from_secs` so the constant does not depend on the
/// unstable `Duration::from_days` constructor; the value is unchanged.
const REGION_STATS_TABLE_TWCS_COMPACTION_TIME_WINDOW: Duration = Duration::from_secs(24 * 60 * 60);
92
/// Builder that gathers the optional components of a [`Metasrv`].
///
/// Every field starts as `None`; [`MetasrvBuilder::build`] substitutes a default
/// for each component that was never provided.
pub struct MetasrvBuilder {
    /// Metasrv configuration; `build` uses `MetasrvOptions::default()` when unset.
    options: Option<MetasrvOptions>,
    /// Key-value backend; `build` creates a `MemoryKvBackend` when unset.
    kv_backend: Option<KvBackendRef>,
    /// Resettable in-memory backend; `build` creates a `MemoryKvBackend` when unset.
    in_memory: Option<ResettableKvBackendRef>,
    /// Peer selector; `build` uses `LeaseBasedSelector` when unset.
    selector: Option<SelectorRef>,
    /// Heartbeat handler group builder; when unset, `build` assembles one itself.
    handler_group_builder: Option<HeartbeatHandlerGroupBuilder>,
    /// Election handle. `build` creates the initial state as leader when this is
    /// `None` and as follower otherwise.
    election: Option<ElectionRef>,
    /// Meta peer client; `build` calls `build_default_meta_peer_client` when unset.
    meta_peer_client: Option<MetaPeerClientRef>,
    /// Node manager; `build` creates `NodeClients` from the datanode client options
    /// when unset.
    node_manager: Option<NodeManagerRef>,
    /// Plugins queried by `build` for optional extensions; `Plugins::default()` is
    /// stored in the resulting `Metasrv` when unset.
    plugins: Option<Plugins>,
    /// Table metadata allocator; `build` constructs one from a `TABLE_ID_SEQ`
    /// sequence, the WAL provider and a peer allocator when unset.
    table_metadata_allocator: Option<TableMetadataAllocatorRef>,
}
106
107impl MetasrvBuilder {
108 pub fn new() -> Self {
109 Self {
110 kv_backend: None,
111 in_memory: None,
112 selector: None,
113 handler_group_builder: None,
114 meta_peer_client: None,
115 election: None,
116 options: None,
117 node_manager: None,
118 plugins: None,
119 table_metadata_allocator: None,
120 }
121 }
122
123 pub fn options(mut self, options: MetasrvOptions) -> Self {
124 self.options = Some(options);
125 self
126 }
127
128 pub fn kv_backend(mut self, kv_backend: KvBackendRef) -> Self {
129 self.kv_backend = Some(kv_backend);
130 self
131 }
132
133 pub fn in_memory(mut self, in_memory: ResettableKvBackendRef) -> Self {
134 self.in_memory = Some(in_memory);
135 self
136 }
137
138 pub fn selector(mut self, selector: SelectorRef) -> Self {
139 self.selector = Some(selector);
140 self
141 }
142
143 pub fn heartbeat_handler(
144 mut self,
145 handler_group_builder: HeartbeatHandlerGroupBuilder,
146 ) -> Self {
147 self.handler_group_builder = Some(handler_group_builder);
148 self
149 }
150
151 pub fn meta_peer_client(mut self, meta_peer_client: MetaPeerClientRef) -> Self {
152 self.meta_peer_client = Some(meta_peer_client);
153 self
154 }
155
156 pub fn election(mut self, election: Option<ElectionRef>) -> Self {
157 self.election = election;
158 self
159 }
160
161 pub fn node_manager(mut self, node_manager: NodeManagerRef) -> Self {
162 self.node_manager = Some(node_manager);
163 self
164 }
165
166 pub fn plugins(mut self, plugins: Plugins) -> Self {
167 self.plugins = Some(plugins);
168 self
169 }
170
171 pub fn table_metadata_allocator(
172 mut self,
173 table_metadata_allocator: TableMetadataAllocatorRef,
174 ) -> Self {
175 self.table_metadata_allocator = Some(table_metadata_allocator);
176 self
177 }
178
179 pub async fn build(self) -> Result<Metasrv> {
180 let MetasrvBuilder {
181 election,
182 meta_peer_client,
183 options,
184 kv_backend,
185 in_memory,
186 selector,
187 handler_group_builder,
188 node_manager,
189 plugins,
190 table_metadata_allocator,
191 } = self;
192
193 let options = options.unwrap_or_default();
194
195 let kv_backend = kv_backend.unwrap_or_else(|| Arc::new(MemoryKvBackend::new()));
196 let in_memory = in_memory.unwrap_or_else(|| Arc::new(MemoryKvBackend::new()));
197
198 let state = Arc::new(RwLock::new(match election {
199 None => State::leader(options.grpc.server_addr.clone(), true),
200 Some(_) => State::follower(options.grpc.server_addr.clone()),
201 }));
202
203 let leader_cached_kv_backend = Arc::new(LeaderCachedKvBackend::new(
204 state.clone(),
205 kv_backend.clone(),
206 ));
207
208 let meta_peer_client = meta_peer_client
209 .unwrap_or_else(|| build_default_meta_peer_client(&election, &in_memory));
210 let database_operator = Arc::new(DatabaseOperator::new(meta_peer_client.clone()));
211
212 let event_inserter = Box::new(InsertForwarder::new(
213 database_operator.clone(),
214 Some(InsertOptions {
215 ttl: options.event_recorder.ttl,
216 append_mode: true,
217 twcs_compaction_time_window: Some(DEFAULT_COMPACTION_TIME_WINDOW),
218 }),
219 ));
220 let event_recorder = Arc::new(EventRecorderImpl::new(Box::new(EventHandlerImpl::new(
222 event_inserter,
223 ))));
224
225 let selector = selector.unwrap_or_else(|| Arc::new(LeaseBasedSelector));
226 let pushers = Pushers::default();
227 let mailbox = build_mailbox(&kv_backend, &pushers);
228 let runtime_switch_manager = Arc::new(RuntimeSwitchManager::new(kv_backend.clone()));
229 let procedure_manager = build_procedure_manager(
230 &options,
231 &kv_backend,
232 &runtime_switch_manager,
233 event_recorder,
234 );
235
236 let table_metadata_manager = Arc::new(TableMetadataManager::new(
237 leader_cached_kv_backend.clone() as _,
238 ));
239 let flow_metadata_manager = Arc::new(FlowMetadataManager::new(
240 leader_cached_kv_backend.clone() as _,
241 ));
242
243 let selector_ctx = SelectorContext {
244 peer_discovery: meta_peer_client.clone(),
245 };
246
247 let wal_provider = build_wal_provider(&options.wal, kv_backend.clone())
248 .await
249 .context(BuildWalProviderSnafu)?;
250 let wal_provider = Arc::new(wal_provider);
251 let is_remote_wal = wal_provider.is_remote_wal();
252 let table_metadata_allocator = table_metadata_allocator.unwrap_or_else(|| {
253 let sequence = Arc::new(
254 SequenceBuilder::new(TABLE_ID_SEQ, kv_backend.clone())
255 .initial(MIN_USER_TABLE_ID as u64)
256 .step(10)
257 .build(),
258 );
259 let peer_allocator = Arc::new(
260 MetasrvPeerAllocator::new(selector_ctx.clone(), selector.clone())
261 .with_max_items(MAX_REGION_SEQ),
262 );
263 Arc::new(TableMetadataAllocator::with_peer_allocator(
264 sequence,
265 wal_provider.clone(),
266 peer_allocator,
267 ))
268 });
269 let table_id_allocator = table_metadata_allocator.table_id_allocator();
270
271 let flow_selector =
272 Arc::new(RoundRobinSelector::new(SelectTarget::Flownode)) as SelectorRef;
273
274 let flow_metadata_allocator = {
275 let flow_selector_ctx = selector_ctx.clone();
277 let peer_allocator = Arc::new(MetasrvPeerAllocator::new(
278 flow_selector_ctx,
279 flow_selector.clone(),
280 ));
281 let seq = Arc::new(
282 SequenceBuilder::new(FLOW_ID_SEQ, kv_backend.clone())
283 .initial(MIN_USER_FLOW_ID as u64)
284 .step(10)
285 .build(),
286 );
287
288 Arc::new(FlowMetadataAllocator::with_peer_allocator(
289 seq,
290 peer_allocator,
291 ))
292 };
293 let flow_state_handler =
294 FlowStateHandler::new(FlowStateManager::new(in_memory.clone().as_kv_backend_ref()));
295
296 let memory_region_keeper = Arc::new(MemoryRegionKeeper::default());
297 let node_manager = node_manager.unwrap_or_else(|| {
298 let datanode_client_channel_config = ChannelConfig::new()
299 .timeout(Some(options.datanode.client.timeout))
300 .connect_timeout(options.datanode.client.connect_timeout)
301 .tcp_nodelay(options.datanode.client.tcp_nodelay);
302 Arc::new(NodeClients::new(datanode_client_channel_config))
303 });
304 let cache_invalidator = Arc::new(MetasrvCacheInvalidator::new(
305 mailbox.clone(),
306 MetasrvInfo {
307 server_addr: options.grpc.server_addr.clone(),
308 },
309 ));
310
311 if !is_remote_wal && options.enable_region_failover {
312 ensure!(
313 options.allow_region_failover_on_local_wal,
314 error::UnexpectedSnafu {
315 violated: "Region failover is not supported in the local WAL implementation!
316 If you want to enable region failover for local WAL, please set `allow_region_failover_on_local_wal` to true.",
317 }
318 );
319 if options.allow_region_failover_on_local_wal {
320 warn!(
321 "Region failover is force enabled in the local WAL implementation! This may lead to data loss during failover!"
322 );
323 }
324 }
325
326 let (tx, rx) = RegionSupervisor::channel();
327 let (region_failure_detector_controller, region_supervisor_ticker): (
328 RegionFailureDetectorControllerRef,
329 Option<std::sync::Arc<RegionSupervisorTicker>>,
330 ) = if options.enable_region_failover {
331 (
332 Arc::new(RegionFailureDetectorControl::new(tx.clone())) as _,
333 Some(Arc::new(RegionSupervisorTicker::new(
334 DEFAULT_TICK_INTERVAL,
335 options.region_failure_detector_initialization_delay,
336 DEFAULT_INITIALIZATION_RETRY_PERIOD,
337 tx.clone(),
338 ))),
339 )
340 } else {
341 (Arc::new(NoopRegionFailureDetectorControl) as _, None as _)
342 };
343
344 let region_migration_manager = Arc::new(RegionMigrationManager::new(
346 procedure_manager.clone(),
347 DefaultContextFactory::new(
348 in_memory.clone(),
349 table_metadata_manager.clone(),
350 memory_region_keeper.clone(),
351 region_failure_detector_controller.clone(),
352 mailbox.clone(),
353 options.grpc.server_addr.clone(),
354 cache_invalidator.clone(),
355 ),
356 ));
357 region_migration_manager.try_start()?;
358 let region_supervisor_selector = plugins
359 .as_ref()
360 .and_then(|plugins| plugins.get::<RegionStatAwareSelectorRef>());
361
362 let supervisor_selector = match region_supervisor_selector {
363 Some(selector) => {
364 info!("Using region stat aware selector");
365 RegionSupervisorSelector::RegionStatAwareSelector(selector)
366 }
367 None => RegionSupervisorSelector::NaiveSelector(selector.clone()),
368 };
369
370 let region_failover_handler = if options.enable_region_failover {
371 let region_supervisor = RegionSupervisor::new(
372 rx,
373 options.failure_detector,
374 selector_ctx.clone(),
375 supervisor_selector,
376 region_migration_manager.clone(),
377 runtime_switch_manager.clone(),
378 meta_peer_client.clone(),
379 leader_cached_kv_backend.clone(),
380 )
381 .with_state(state.clone());
382
383 Some(RegionFailureHandler::new(
384 region_supervisor,
385 HeartbeatAcceptor::new(tx),
386 ))
387 } else {
388 None
389 };
390
391 let leader_region_registry = Arc::new(LeaderRegionRegistry::default());
392 let topic_stats_registry = Arc::new(TopicStatsRegistry::default());
393
394 let ddl_context = DdlContext {
395 node_manager: node_manager.clone(),
396 cache_invalidator: cache_invalidator.clone(),
397 memory_region_keeper: memory_region_keeper.clone(),
398 leader_region_registry: leader_region_registry.clone(),
399 table_metadata_manager: table_metadata_manager.clone(),
400 table_metadata_allocator: table_metadata_allocator.clone(),
401 flow_metadata_manager: flow_metadata_manager.clone(),
402 flow_metadata_allocator: flow_metadata_allocator.clone(),
403 region_failure_detector_controller,
404 };
405 let procedure_manager_c = procedure_manager.clone();
406 let repartition_procedure_factory = Arc::new(DefaultRepartitionProcedureFactory::new(
407 mailbox.clone(),
408 options.grpc.server_addr.clone(),
409 ));
410 let ddl_manager = DdlManager::try_new(
411 ddl_context,
412 procedure_manager_c,
413 repartition_procedure_factory,
414 true,
415 )
416 .context(error::InitDdlManagerSnafu)?;
417
418 let ddl_manager = if let Some(configurator) = plugins
419 .as_ref()
420 .and_then(|p| p.get::<DdlManagerConfiguratorRef<DdlManagerConfigureContext>>())
421 {
422 let ctx = DdlManagerConfigureContext {
423 kv_backend: kv_backend.clone(),
424 meta_peer_client: meta_peer_client.clone(),
425 };
426 configurator
427 .configure(ddl_manager, ctx)
428 .await
429 .context(OtherSnafu)?
430 } else {
431 ddl_manager
432 };
433
434 let ddl_manager = Arc::new(ddl_manager);
435
436 let region_flush_ticker = if is_remote_wal {
437 let remote_wal_options = options.wal.remote_wal_options().unwrap();
438 let (region_flush_trigger, region_flush_ticker) = RegionFlushTrigger::new(
439 table_metadata_manager.clone(),
440 leader_region_registry.clone(),
441 topic_stats_registry.clone(),
442 mailbox.clone(),
443 options.grpc.server_addr.clone(),
444 remote_wal_options.flush_trigger_size,
445 remote_wal_options.checkpoint_trigger_size,
446 );
447 region_flush_trigger.try_start()?;
448
449 Some(Arc::new(region_flush_ticker))
450 } else {
451 None
452 };
453
454 let wal_prune_ticker = if is_remote_wal && options.wal.enable_active_wal_pruning() {
456 let (tx, rx) = WalPruneManager::channel();
457 let remote_wal_options = options.wal.remote_wal_options().unwrap();
459 let kafka_client = build_kafka_client(&remote_wal_options.connection)
460 .await
461 .context(error::BuildKafkaClientSnafu)?;
462 let wal_prune_context = WalPruneContext {
463 client: Arc::new(kafka_client),
464 table_metadata_manager: table_metadata_manager.clone(),
465 leader_region_registry: leader_region_registry.clone(),
466 };
467 let wal_prune_manager = WalPruneManager::new(
468 remote_wal_options.auto_prune_parallelism,
469 rx,
470 procedure_manager.clone(),
471 wal_prune_context,
472 );
473 wal_prune_manager.try_start().await?;
475 let wal_prune_ticker = Arc::new(WalPruneTicker::new(
476 remote_wal_options.auto_prune_interval,
477 tx.clone(),
478 ));
479 Some(wal_prune_ticker)
480 } else {
481 None
482 };
483
484 let gc_ticker = if options.gc.enable {
485 let (gc_scheduler, gc_ticker) = GcScheduler::new_with_config(
486 table_metadata_manager.clone(),
487 procedure_manager.clone(),
488 meta_peer_client.clone(),
489 mailbox.clone(),
490 options.grpc.server_addr.clone(),
491 options.gc.clone(),
492 )?;
493 gc_scheduler.try_start()?;
494
495 Some(Arc::new(gc_ticker))
496 } else {
497 None
498 };
499
500 let customized_region_lease_renewer = plugins
501 .as_ref()
502 .and_then(|plugins| plugins.get::<CustomizedRegionLeaseRenewerRef>());
503
504 let persist_region_stats_handler = if !options.stats_persistence.ttl.is_zero() {
505 let inserter = Box::new(InsertForwarder::new(
506 database_operator.clone(),
507 Some(InsertOptions {
508 ttl: options.stats_persistence.ttl,
509 append_mode: true,
510 twcs_compaction_time_window: Some(
511 REGION_STATS_TABLE_TWCS_COMPACTION_TIME_WINDOW,
512 ),
513 }),
514 ));
515
516 Some(PersistStatsHandler::new(
517 inserter,
518 options.stats_persistence.interval,
519 ))
520 } else {
521 None
522 };
523
524 let handler_group_builder = match handler_group_builder {
525 Some(handler_group_builder) => handler_group_builder,
526 None => {
527 let region_lease_handler = RegionLeaseHandler::new(
528 default_distributed_time_constants().region_lease.as_secs(),
529 table_metadata_manager.clone(),
530 memory_region_keeper.clone(),
531 customized_region_lease_renewer,
532 );
533
534 HeartbeatHandlerGroupBuilder::new(pushers)
535 .with_plugins(plugins.clone())
536 .with_region_failure_handler(region_failover_handler)
537 .with_region_lease_handler(Some(region_lease_handler))
538 .with_flush_stats_factor(Some(options.flush_stats_factor))
539 .with_flow_state_handler(Some(flow_state_handler))
540 .with_persist_stats_handler(persist_region_stats_handler)
541 .add_default_handlers()
542 }
543 };
544
545 let enable_telemetry = options.enable_telemetry;
546 let metasrv_home = Path::new(&options.data_home)
547 .join(METASRV_DATA_DIR)
548 .to_string_lossy()
549 .to_string();
550
551 let reconciliation_manager = Arc::new(ReconciliationManager::new(
552 node_manager.clone(),
553 table_metadata_manager.clone(),
554 cache_invalidator.clone(),
555 procedure_manager.clone(),
556 ));
557 reconciliation_manager
558 .try_start()
559 .context(error::InitReconciliationManagerSnafu)?;
560
561 let mut resource_stat = ResourceStatImpl::default();
562 resource_stat.start_collect_cpu_usage();
563
564 Ok(Metasrv {
565 state,
566 started: Arc::new(AtomicBool::new(false)),
567 start_time_ms: common_time::util::current_time_millis() as u64,
568 options,
569 in_memory,
570 kv_backend,
571 leader_cached_kv_backend,
572 meta_peer_client: meta_peer_client.clone(),
573 selector,
574 selector_ctx,
575 flow_selector,
577 handler_group: RwLock::new(None),
578 handler_group_builder: Mutex::new(Some(handler_group_builder)),
579 election,
580 procedure_manager,
581 mailbox,
582 ddl_manager,
583 wal_provider,
584 table_metadata_manager,
585 runtime_switch_manager,
586 greptimedb_telemetry_task: get_greptimedb_telemetry_task(
587 Some(metasrv_home),
588 meta_peer_client,
589 enable_telemetry,
590 )
591 .await,
592 plugins: plugins.unwrap_or_else(Plugins::default),
593 memory_region_keeper,
594 region_migration_manager,
595 region_supervisor_ticker,
596 cache_invalidator,
597 leader_region_registry,
598 wal_prune_ticker,
599 region_flush_ticker,
600 table_id_allocator,
601 reconciliation_manager,
602 topic_stats_registry,
603 resource_stat: Arc::new(resource_stat),
604 gc_ticker,
605 database_operator,
606 })
607 }
608}
609
610fn build_mailbox(kv_backend: &KvBackendRef, pushers: &Pushers) -> MailboxRef {
611 let mailbox_sequence = SequenceBuilder::new("heartbeat_mailbox", kv_backend.clone())
612 .initial(1)
613 .step(100)
614 .build();
615
616 HeartbeatMailbox::create(pushers.clone(), mailbox_sequence)
617}
618
619fn build_procedure_manager(
620 options: &MetasrvOptions,
621 kv_backend: &KvBackendRef,
622 runtime_switch_manager: &RuntimeSwitchManagerRef,
623 event_recorder: EventRecorderRef,
624) -> ProcedureManagerRef {
625 let manager_config = ManagerConfig {
626 max_retry_times: options.procedure.max_retry_times,
627 retry_delay: options.procedure.retry_delay,
628 max_running_procedures: options.procedure.max_running_procedures,
629 ..Default::default()
630 };
631 let kv_state_store = Arc::new(
632 KvStateStore::new(kv_backend.clone()).with_max_value_size(
633 options
634 .procedure
635 .max_metadata_value_size
636 .map(|v| v.as_bytes() as usize),
637 ),
638 );
639
640 Arc::new(LocalManager::new(
641 manager_config,
642 kv_state_store.clone(),
643 kv_state_store,
644 Some(runtime_switch_manager.clone()),
645 Some(event_recorder),
646 ))
647}
648
649impl Default for MetasrvBuilder {
650 fn default() -> Self {
651 Self::new()
652 }
653}
654
/// Context passed to a plugin-provided DDL manager configurator.
///
/// `MetasrvBuilder::build` fills it with clones of its own key-value backend and
/// meta peer client before calling `configure`.
pub struct DdlManagerConfigureContext {
    /// The key-value backend used by the metasrv being built.
    pub kv_backend: KvBackendRef,
    /// The meta peer client used by the metasrv being built.
    pub meta_peer_client: MetaPeerClientRef,
}