1use std::any::Any;
18use std::collections::HashMap;
19use std::fmt::{Debug, Display};
20use std::sync::{Arc, Mutex};
21
22use api::greptime_proto::v1::meta::{GrantedRegion as PbGrantedRegion, RegionRole as PbRegionRole};
23use api::region::RegionResponse;
24use async_trait::async_trait;
25use common_error::ext::BoxedError;
26use common_recordbatch::{EmptyRecordBatchStream, MemoryPermit, SendableRecordBatchStream};
27use common_time::Timestamp;
28use datafusion_physical_plan::metrics::ExecutionPlanMetricsSet;
29use datafusion_physical_plan::{DisplayAs, DisplayFormatType};
30use datatypes::schema::SchemaRef;
31use futures::future::join_all;
32use serde::{Deserialize, Serialize};
33use tokio::sync::Semaphore;
34
35use crate::logstore::entry;
36use crate::metadata::RegionMetadataRef;
37use crate::region_request::{
38 BatchRegionDdlRequest, RegionCatchupRequest, RegionOpenRequest, RegionRequest,
39};
40use crate::storage::{RegionId, ScanRequest, SequenceNumber};
41
/// A region role state that can be requested explicitly.
///
/// Every state converts into a [`RegionRole`] through `From`; note that both
/// `Leader` and `StagingLeader` convert into [`RegionRole::Leader`].
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
pub enum SettableRegionRoleState {
    /// Displayed as `Follower`.
    Follower,
    /// Displayed as `Leader(Downgrading)`.
    DowngradingLeader,
    /// Displayed as `Leader`.
    Leader,
    /// Displayed as `Leader(Staging)`.
    StagingLeader,
}

impl Display for SettableRegionRoleState {
    /// Writes the human-readable label of the state.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let label = match self {
            Self::Follower => "Follower",
            Self::DowngradingLeader => "Leader(Downgrading)",
            Self::Leader => "Leader",
            Self::StagingLeader => "Leader(Staging)",
        };
        f.write_str(label)
    }
}
63
64impl From<SettableRegionRoleState> for RegionRole {
65 fn from(value: SettableRegionRoleState) -> Self {
66 match value {
67 SettableRegionRoleState::Follower => RegionRole::Follower,
68 SettableRegionRoleState::DowngradingLeader => RegionRole::DowngradingLeader,
69 SettableRegionRoleState::Leader => RegionRole::Leader,
70 SettableRegionRoleState::StagingLeader => RegionRole::Leader, }
72 }
73}
74
/// A request pairing a region id with the role state to set for it.
///
/// NOTE(review): both fields are private and nothing in this part of the file
/// constructs or reads them — confirm where this type is used.
#[derive(Debug, PartialEq, Eq)]
pub struct SetRegionRoleStateRequest {
    /// Id of the region the request refers to.
    region_id: RegionId,
    /// Role state requested for the region.
    region_role_state: SettableRegionRoleState,
}
81
82#[derive(Debug, PartialEq, Eq)]
84pub enum SetRegionRoleStateSuccess {
85 File,
86 Mito {
87 last_entry_id: entry::Id,
88 },
89 Metric {
90 last_entry_id: entry::Id,
91 metadata_last_entry_id: entry::Id,
92 },
93}
94
95impl SetRegionRoleStateSuccess {
96 pub fn file() -> Self {
98 Self::File
99 }
100
101 pub fn mito(last_entry_id: entry::Id) -> Self {
103 SetRegionRoleStateSuccess::Mito { last_entry_id }
104 }
105
106 pub fn metric(last_entry_id: entry::Id, metadata_last_entry_id: entry::Id) -> Self {
108 SetRegionRoleStateSuccess::Metric {
109 last_entry_id,
110 metadata_last_entry_id,
111 }
112 }
113}
114
115impl SetRegionRoleStateSuccess {
116 pub fn last_entry_id(&self) -> Option<entry::Id> {
118 match self {
119 SetRegionRoleStateSuccess::File => None,
120 SetRegionRoleStateSuccess::Mito { last_entry_id } => Some(*last_entry_id),
121 SetRegionRoleStateSuccess::Metric { last_entry_id, .. } => Some(*last_entry_id),
122 }
123 }
124
125 pub fn metadata_last_entry_id(&self) -> Option<entry::Id> {
127 match self {
128 SetRegionRoleStateSuccess::File => None,
129 SetRegionRoleStateSuccess::Mito { .. } => None,
130 SetRegionRoleStateSuccess::Metric {
131 metadata_last_entry_id,
132 ..
133 } => Some(*metadata_last_entry_id),
134 }
135 }
136}
137
138#[derive(Debug)]
140pub enum SetRegionRoleStateResponse {
141 Success(SetRegionRoleStateSuccess),
142 NotFound,
143 InvalidTransition(BoxedError),
144}
145
146impl SetRegionRoleStateResponse {
147 pub fn success(success: SetRegionRoleStateSuccess) -> Self {
149 Self::Success(success)
150 }
151
152 pub fn invalid_transition(error: BoxedError) -> Self {
154 Self::InvalidTransition(error)
155 }
156
157 pub fn is_not_found(&self) -> bool {
159 matches!(self, SetRegionRoleStateResponse::NotFound)
160 }
161
162 pub fn is_invalid_transition(&self) -> bool {
164 matches!(self, SetRegionRoleStateResponse::InvalidTransition(_))
165 }
166}
167
168#[derive(Debug, Clone, PartialEq, Eq)]
169pub struct GrantedRegion {
170 pub region_id: RegionId,
171 pub region_role: RegionRole,
172 pub extensions: HashMap<String, Vec<u8>>,
173}
174
175impl GrantedRegion {
176 pub fn new(region_id: RegionId, region_role: RegionRole) -> Self {
177 Self {
178 region_id,
179 region_role,
180 extensions: HashMap::new(),
181 }
182 }
183}
184
185impl From<GrantedRegion> for PbGrantedRegion {
186 fn from(value: GrantedRegion) -> Self {
187 PbGrantedRegion {
188 region_id: value.region_id.as_u64(),
189 role: PbRegionRole::from(value.region_role).into(),
190 extensions: value.extensions,
191 }
192 }
193}
194
195impl From<PbGrantedRegion> for GrantedRegion {
196 fn from(value: PbGrantedRegion) -> Self {
197 GrantedRegion {
198 region_id: RegionId::from_u64(value.region_id),
199 region_role: value.role().into(),
200 extensions: value.extensions,
201 }
202 }
203}
204
/// The role of a region.
///
/// Only [`RegionRole::Leader`] is reported as writable by
/// [`RegionRole::writable`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum RegionRole {
    /// Displayed as `Follower`; not writable.
    Follower,
    /// Displayed as `Leader`; the only writable role.
    Leader,
    /// Displayed as `Leader(Downgrading)`; not writable.
    DowngradingLeader,
}
218
219impl Display for RegionRole {
220 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
221 match self {
222 RegionRole::Follower => write!(f, "Follower"),
223 RegionRole::Leader => write!(f, "Leader"),
224 RegionRole::DowngradingLeader => write!(f, "Leader(Downgrading)"),
225 }
226 }
227}
228
229impl RegionRole {
230 pub fn writable(&self) -> bool {
231 matches!(self, RegionRole::Leader)
232 }
233}
234
235impl From<RegionRole> for PbRegionRole {
236 fn from(value: RegionRole) -> Self {
237 match value {
238 RegionRole::Follower => PbRegionRole::Follower,
239 RegionRole::Leader => PbRegionRole::Leader,
240 RegionRole::DowngradingLeader => PbRegionRole::DowngradingLeader,
241 }
242 }
243}
244
245impl From<PbRegionRole> for RegionRole {
246 fn from(value: PbRegionRole) -> Self {
247 match value {
248 PbRegionRole::Leader => RegionRole::Leader,
249 PbRegionRole::Follower => RegionRole::Follower,
250 PbRegionRole::DowngradingLeader => RegionRole::DowngradingLeader,
251 }
252 }
253}
254
/// Output partitioning of a scanner.
#[derive(Debug)]
pub enum ScannerPartitioning {
    /// Partitioning scheme is unknown; only the number of partitions is carried.
    Unknown(usize),
}

impl ScannerPartitioning {
    /// Returns the number of partitions carried by the variant.
    pub fn num_partitions(&self) -> usize {
        // There is a single variant, so the pattern is irrefutable.
        let Self::Unknown(count) = self;
        *count
    }
}
270
/// A range of data, delimited by two timestamps, that a partition scans.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct PartitionRange {
    /// Start of the range.
    /// NOTE(review): inclusive/exclusive bounds are not visible here — confirm.
    pub start: Timestamp,
    /// End of the range.
    /// NOTE(review): inclusive/exclusive bounds are not visible here — confirm.
    pub end: Timestamp,
    /// Row count associated with the range (presumably an estimate — confirm).
    pub num_rows: usize,
    /// Identifier of the range (presumably unique within one scanner — confirm).
    pub identifier: usize,
}
283
284#[derive(Debug, Default)]
286pub struct ScannerProperties {
287 pub partitions: Vec<Vec<PartitionRange>>,
292
293 append_mode: bool,
295
296 total_rows: usize,
299
300 pub distinguish_partition_range: bool,
302
303 target_partitions: usize,
305
306 logical_region: bool,
308}
309
310impl ScannerProperties {
311 pub fn with_append_mode(mut self, append_mode: bool) -> Self {
313 self.append_mode = append_mode;
314 self
315 }
316
317 pub fn with_total_rows(mut self, total_rows: usize) -> Self {
319 self.total_rows = total_rows;
320 self
321 }
322
323 pub fn new(partitions: Vec<Vec<PartitionRange>>, append_mode: bool, total_rows: usize) -> Self {
325 Self {
326 partitions,
327 append_mode,
328 total_rows,
329 distinguish_partition_range: false,
330 target_partitions: 0,
331 logical_region: false,
332 }
333 }
334
335 pub fn prepare(&mut self, request: PrepareRequest) {
337 if let Some(ranges) = request.ranges {
338 self.partitions = ranges;
339 }
340 if let Some(distinguish_partition_range) = request.distinguish_partition_range {
341 self.distinguish_partition_range = distinguish_partition_range;
342 }
343 if let Some(target_partitions) = request.target_partitions {
344 self.target_partitions = target_partitions;
345 }
346 }
347
348 pub fn num_partitions(&self) -> usize {
350 self.partitions.len()
351 }
352
353 pub fn append_mode(&self) -> bool {
354 self.append_mode
355 }
356
357 pub fn total_rows(&self) -> usize {
358 self.total_rows
359 }
360
361 pub fn is_logical_region(&self) -> bool {
363 self.logical_region
364 }
365
366 pub fn target_partitions(&self) -> usize {
368 if self.target_partitions == 0 {
369 self.num_partitions()
370 } else {
371 self.target_partitions
372 }
373 }
374
375 pub fn set_logical_region(&mut self, logical_region: bool) {
377 self.logical_region = logical_region;
378 }
379}
380
381#[derive(Default)]
383pub struct PrepareRequest {
384 pub ranges: Option<Vec<Vec<PartitionRange>>>,
386 pub distinguish_partition_range: Option<bool>,
388 pub target_partitions: Option<usize>,
390}
391
392impl PrepareRequest {
393 pub fn with_ranges(mut self, ranges: Vec<Vec<PartitionRange>>) -> Self {
395 self.ranges = Some(ranges);
396 self
397 }
398
399 pub fn with_distinguish_partition_range(mut self, distinguish_partition_range: bool) -> Self {
401 self.distinguish_partition_range = Some(distinguish_partition_range);
402 self
403 }
404
405 pub fn with_target_partitions(mut self, target_partitions: usize) -> Self {
407 self.target_partitions = Some(target_partitions);
408 self
409 }
410}
411
/// Per-query context handed to [`RegionScanner::scan_partition`].
#[derive(Clone, Default)]
pub struct QueryScanContext {
    /// NOTE(review): presumably set when the query runs with verbose explain
    /// output enabled — confirm against the callers that build this context.
    pub explain_verbose: bool,
}
418
/// A scanner over a region whose data is read partition by partition.
///
/// Implementations must be `Debug`, `DisplayAs` and `Send`.
pub trait RegionScanner: Debug + DisplayAs + Send {
    /// Returns the name of the scanner.
    fn name(&self) -> &str;

    /// Returns the properties of the scanner.
    fn properties(&self) -> &ScannerProperties;

    /// Returns the schema of the record batches produced by the scanner.
    fn schema(&self) -> SchemaRef;

    /// Returns the metadata of the region being scanned.
    fn metadata(&self) -> RegionMetadataRef;

    /// Applies the overrides carried by `request` to the scanner.
    ///
    /// NOTE(review): presumably this must be called before
    /// [`RegionScanner::scan_partition`] — confirm against the callers.
    fn prepare(&mut self, request: PrepareRequest) -> Result<(), BoxedError>;

    /// Returns a record batch stream for the partition with index `partition`.
    ///
    /// NOTE(review): behavior for an out-of-range `partition` is
    /// implementation-defined as far as this trait shows.
    fn scan_partition(
        &self,
        ctx: &QueryScanContext,
        metrics_set: &ExecutionPlanMetricsSet,
        partition: usize,
    ) -> Result<SendableRecordBatchStream, BoxedError>;

    /// NOTE(review): presumably reports whether the scanner holds a predicate
    /// other than region-related ones — exact semantics are not visible here.
    fn has_predicate_without_region(&self) -> bool;

    /// Marks whether the scanner is scanning a logical region.
    fn set_logical_region(&mut self, logical_region: bool);
}
457
/// An owned, boxed [`RegionScanner`] trait object.
pub type RegionScannerRef = Box<dyn RegionScanner>;

/// Results of a batch of region requests: one `(region id, result)` pair per request.
pub type BatchResponses = Vec<(RegionId, Result<RegionResponse, BoxedError>)>;
461
/// Statistics of a region, (de)serializable with serde.
///
/// Fields marked `#[serde(default)]` fall back to their `Default` value when
/// they are missing from the serialized input.
#[derive(Debug, Deserialize, Serialize, Default)]
pub struct RegionStatistic {
    /// Number of rows in the region.
    #[serde(default)]
    pub num_rows: u64,
    /// Size of the memtables (presumably in bytes — confirm).
    pub memtable_size: u64,
    /// Size of the WAL; part of [`RegionStatistic::estimated_disk_size`].
    pub wal_size: u64,
    /// Size of the manifest; part of [`RegionStatistic::estimated_disk_size`].
    pub manifest_size: u64,
    /// Size of the SST files; part of [`RegionStatistic::estimated_disk_size`].
    pub sst_size: u64,
    /// Number of SST files.
    pub sst_num: u64,
    /// Size of the index files; part of [`RegionStatistic::estimated_disk_size`].
    #[serde(default)]
    pub index_size: u64,
    /// Manifest information of the region.
    #[serde(default)]
    pub manifest: RegionManifestInfo,
    /// NOTE(review): presumably the number of bytes written to the region —
    /// the accounting window is not visible here.
    #[serde(default)]
    pub written_bytes: u64,
    /// Latest entry id of the data topic.
    /// NOTE(review): the meaning of "topic" (presumably a remote WAL topic)
    /// is not visible in this file — confirm.
    #[serde(default)]
    pub data_topic_latest_entry_id: u64,
    /// Latest entry id of the metadata topic (see the note on the data topic).
    #[serde(default)]
    pub metadata_topic_latest_entry_id: u64,
}
495
/// Manifest information of a region.
///
/// The variants are named `Mito` and `Metric` — presumably after the engine
/// owning the region (confirm). `Default` yields a `Mito` value with all
/// counters at zero.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub enum RegionManifestInfo {
    /// Manifest information with a single manifest.
    Mito {
        /// Version of the manifest.
        manifest_version: u64,
        /// Flushed entry id.
        flushed_entry_id: u64,
        /// NOTE(review): presumably a count of removed files; the constructor
        /// `RegionManifestInfo::mito` calls the matching argument
        /// `file_removal_rate` — confirm which meaning is intended.
        file_removed_cnt: u64,
    },
    /// Manifest information split into a data part and a metadata part.
    Metric {
        /// Manifest version of the data part.
        data_manifest_version: u64,
        /// Flushed entry id of the data part.
        data_flushed_entry_id: u64,
        /// Manifest version of the metadata part.
        metadata_manifest_version: u64,
        /// Flushed entry id of the metadata part.
        metadata_flushed_entry_id: u64,
    },
}
512
513impl RegionManifestInfo {
514 pub fn mito(manifest_version: u64, flushed_entry_id: u64, file_removal_rate: u64) -> Self {
516 Self::Mito {
517 manifest_version,
518 flushed_entry_id,
519 file_removed_cnt: file_removal_rate,
520 }
521 }
522
523 pub fn metric(
525 data_manifest_version: u64,
526 data_flushed_entry_id: u64,
527 metadata_manifest_version: u64,
528 metadata_flushed_entry_id: u64,
529 ) -> Self {
530 Self::Metric {
531 data_manifest_version,
532 data_flushed_entry_id,
533 metadata_manifest_version,
534 metadata_flushed_entry_id,
535 }
536 }
537
538 pub fn is_mito(&self) -> bool {
540 matches!(self, RegionManifestInfo::Mito { .. })
541 }
542
543 pub fn is_metric(&self) -> bool {
545 matches!(self, RegionManifestInfo::Metric { .. })
546 }
547
548 pub fn data_flushed_entry_id(&self) -> u64 {
550 match self {
551 RegionManifestInfo::Mito {
552 flushed_entry_id, ..
553 } => *flushed_entry_id,
554 RegionManifestInfo::Metric {
555 data_flushed_entry_id,
556 ..
557 } => *data_flushed_entry_id,
558 }
559 }
560
561 pub fn data_manifest_version(&self) -> u64 {
563 match self {
564 RegionManifestInfo::Mito {
565 manifest_version, ..
566 } => *manifest_version,
567 RegionManifestInfo::Metric {
568 data_manifest_version,
569 ..
570 } => *data_manifest_version,
571 }
572 }
573
574 pub fn metadata_manifest_version(&self) -> Option<u64> {
576 match self {
577 RegionManifestInfo::Mito { .. } => None,
578 RegionManifestInfo::Metric {
579 metadata_manifest_version,
580 ..
581 } => Some(*metadata_manifest_version),
582 }
583 }
584
585 pub fn metadata_flushed_entry_id(&self) -> Option<u64> {
587 match self {
588 RegionManifestInfo::Mito { .. } => None,
589 RegionManifestInfo::Metric {
590 metadata_flushed_entry_id,
591 ..
592 } => Some(*metadata_flushed_entry_id),
593 }
594 }
595
596 pub fn encode_list(manifest_infos: &[(RegionId, Self)]) -> serde_json::Result<Vec<u8>> {
598 serde_json::to_vec(manifest_infos)
599 }
600
601 pub fn decode_list(value: &[u8]) -> serde_json::Result<Vec<(RegionId, Self)>> {
603 serde_json::from_slice(value)
604 }
605}
606
607impl Default for RegionManifestInfo {
608 fn default() -> Self {
609 Self::Mito {
610 manifest_version: 0,
611 flushed_entry_id: 0,
612 file_removed_cnt: 0,
613 }
614 }
615}
616
617impl RegionStatistic {
618 pub fn deserialize_from_slice(value: &[u8]) -> Option<RegionStatistic> {
622 serde_json::from_slice(value).ok()
623 }
624
625 pub fn serialize_to_vec(&self) -> Option<Vec<u8>> {
629 serde_json::to_vec(self).ok()
630 }
631}
632
633impl RegionStatistic {
634 pub fn estimated_disk_size(&self) -> u64 {
636 self.wal_size + self.sst_size + self.manifest_size + self.index_size
637 }
638}
639
640#[derive(Debug)]
642pub enum SyncManifestResponse {
643 NotSupported,
644 Mito {
645 synced: bool,
647 },
648 Metric {
649 metadata_synced: bool,
651 data_synced: bool,
653 new_opened_logical_region_ids: Vec<RegionId>,
656 },
657}
658
659impl SyncManifestResponse {
660 pub fn is_data_synced(&self) -> bool {
662 match self {
663 SyncManifestResponse::NotSupported => false,
664 SyncManifestResponse::Mito { synced } => *synced,
665 SyncManifestResponse::Metric { data_synced, .. } => *data_synced,
666 }
667 }
668
669 pub fn is_supported(&self) -> bool {
671 matches!(self, SyncManifestResponse::NotSupported)
672 }
673
674 pub fn is_mito(&self) -> bool {
676 matches!(self, SyncManifestResponse::Mito { .. })
677 }
678
679 pub fn is_metric(&self) -> bool {
681 matches!(self, SyncManifestResponse::Metric { .. })
682 }
683
684 pub fn new_opened_logical_region_ids(self) -> Option<Vec<RegionId>> {
686 match self {
687 SyncManifestResponse::Metric {
688 new_opened_logical_region_ids,
689 ..
690 } => Some(new_opened_logical_region_ids),
691 _ => None,
692 }
693 }
694}
695
/// Request passed to [`RegionEngine::remap_manifests`].
///
/// NOTE(review): the field descriptions below are inferred from the field
/// names and types only — confirm against the engine implementing the remap.
#[derive(Debug, Clone)]
pub struct RemapManifestsRequest {
    /// Id of the region the request is addressed to.
    pub region_id: RegionId,
    /// Regions whose manifests serve as input.
    pub input_regions: Vec<RegionId>,
    /// Mapping from a region id to a list of region ids (presumably source to
    /// targets — confirm the direction).
    pub region_mapping: HashMap<RegionId, Vec<RegionId>>,
    /// New partition expression per region, in string form (the encoding is
    /// not visible here).
    pub new_partition_exprs: HashMap<RegionId, String>,
}
708
/// Response returned by [`RegionEngine::remap_manifests`].
#[derive(Debug, Clone)]
pub struct RemapManifestsResponse {
    /// New manifest per region, in string form.
    /// NOTE(review): the encoding of the manifest string is not visible here.
    pub new_manifests: HashMap<RegionId, String>,
}
715
716#[async_trait]
717pub trait RegionEngine: Send + Sync {
718 fn name(&self) -> &str;
720
721 async fn handle_batch_open_requests(
723 &self,
724 parallelism: usize,
725 requests: Vec<(RegionId, RegionOpenRequest)>,
726 ) -> Result<BatchResponses, BoxedError> {
727 let semaphore = Arc::new(Semaphore::new(parallelism));
728 let mut tasks = Vec::with_capacity(requests.len());
729
730 for (region_id, request) in requests {
731 let semaphore_moved = semaphore.clone();
732
733 tasks.push(async move {
734 let _permit = semaphore_moved.acquire().await.unwrap();
736 let result = self
737 .handle_request(region_id, RegionRequest::Open(request))
738 .await;
739 (region_id, result)
740 });
741 }
742
743 Ok(join_all(tasks).await)
744 }
745
746 async fn handle_batch_catchup_requests(
747 &self,
748 parallelism: usize,
749 requests: Vec<(RegionId, RegionCatchupRequest)>,
750 ) -> Result<BatchResponses, BoxedError> {
751 let semaphore = Arc::new(Semaphore::new(parallelism));
752 let mut tasks = Vec::with_capacity(requests.len());
753
754 for (region_id, request) in requests {
755 let semaphore_moved = semaphore.clone();
756
757 tasks.push(async move {
758 let _permit = semaphore_moved.acquire().await.unwrap();
760 let result = self
761 .handle_request(region_id, RegionRequest::Catchup(request))
762 .await;
763 (region_id, result)
764 });
765 }
766
767 Ok(join_all(tasks).await)
768 }
769
770 async fn handle_batch_ddl_requests(
771 &self,
772 request: BatchRegionDdlRequest,
773 ) -> Result<RegionResponse, BoxedError> {
774 let requests = request.into_region_requests();
775
776 let mut affected_rows = 0;
777 let mut extensions = HashMap::new();
778
779 for (region_id, request) in requests {
780 let result = self.handle_request(region_id, request).await?;
781 affected_rows += result.affected_rows;
782 extensions.extend(result.extensions);
783 }
784
785 Ok(RegionResponse {
786 affected_rows,
787 extensions,
788 metadata: Vec::new(),
789 })
790 }
791
792 async fn handle_request(
794 &self,
795 region_id: RegionId,
796 request: RegionRequest,
797 ) -> Result<RegionResponse, BoxedError>;
798
799 async fn get_committed_sequence(
801 &self,
802 region_id: RegionId,
803 ) -> Result<SequenceNumber, BoxedError>;
804
805 async fn handle_query(
807 &self,
808 region_id: RegionId,
809 request: ScanRequest,
810 ) -> Result<RegionScannerRef, BoxedError>;
811
812 fn register_query_memory_permit(&self) -> Option<Arc<MemoryPermit>> {
814 None
815 }
816
817 async fn get_metadata(&self, region_id: RegionId) -> Result<RegionMetadataRef, BoxedError>;
819
820 fn region_statistic(&self, region_id: RegionId) -> Option<RegionStatistic>;
822
823 async fn stop(&self) -> Result<(), BoxedError>;
825
826 fn set_region_role(&self, region_id: RegionId, role: RegionRole) -> Result<(), BoxedError>;
832
833 async fn sync_region(
835 &self,
836 region_id: RegionId,
837 manifest_info: RegionManifestInfo,
838 ) -> Result<SyncManifestResponse, BoxedError>;
839
840 async fn remap_manifests(
842 &self,
843 request: RemapManifestsRequest,
844 ) -> Result<RemapManifestsResponse, BoxedError>;
845
846 async fn set_region_role_state_gracefully(
850 &self,
851 region_id: RegionId,
852 region_role_state: SettableRegionRoleState,
853 ) -> Result<SetRegionRoleStateResponse, BoxedError>;
854
855 fn role(&self, region_id: RegionId) -> Option<RegionRole>;
859
860 fn as_any(&self) -> &dyn Any;
861}
862
/// A shared, reference-counted [`RegionEngine`] trait object.
pub type RegionEngineRef = Arc<dyn RegionEngine>;
864
865pub struct SinglePartitionScanner {
867 stream: Mutex<Option<SendableRecordBatchStream>>,
868 schema: SchemaRef,
869 properties: ScannerProperties,
870 metadata: RegionMetadataRef,
871}
872
873impl SinglePartitionScanner {
874 pub fn new(
876 stream: SendableRecordBatchStream,
877 append_mode: bool,
878 metadata: RegionMetadataRef,
879 ) -> Self {
880 let schema = stream.schema();
881 Self {
882 stream: Mutex::new(Some(stream)),
883 schema,
884 properties: ScannerProperties::default().with_append_mode(append_mode),
885 metadata,
886 }
887 }
888}
889
890impl Debug for SinglePartitionScanner {
891 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
892 write!(f, "SinglePartitionScanner: <SendableRecordBatchStream>")
893 }
894}
895
896impl RegionScanner for SinglePartitionScanner {
897 fn name(&self) -> &str {
898 "SinglePartition"
899 }
900
901 fn properties(&self) -> &ScannerProperties {
902 &self.properties
903 }
904
905 fn schema(&self) -> SchemaRef {
906 self.schema.clone()
907 }
908
909 fn prepare(&mut self, request: PrepareRequest) -> Result<(), BoxedError> {
910 self.properties.prepare(request);
911 Ok(())
912 }
913
914 fn scan_partition(
915 &self,
916 _ctx: &QueryScanContext,
917 _metrics_set: &ExecutionPlanMetricsSet,
918 _partition: usize,
919 ) -> Result<SendableRecordBatchStream, BoxedError> {
920 let mut stream = self.stream.lock().unwrap();
921 let result = stream
922 .take()
923 .or_else(|| Some(Box::pin(EmptyRecordBatchStream::new(self.schema.clone()))));
924 Ok(result.unwrap())
925 }
926
927 fn has_predicate_without_region(&self) -> bool {
928 false
929 }
930
931 fn metadata(&self) -> RegionMetadataRef {
932 self.metadata.clone()
933 }
934
935 fn set_logical_region(&mut self, logical_region: bool) {
936 self.properties.set_logical_region(logical_region);
937 }
938}
939
940impl DisplayAs for SinglePartitionScanner {
941 fn fmt_as(&self, _t: DisplayFormatType, f: &mut std::fmt::Formatter) -> std::fmt::Result {
942 write!(f, "{:?}", self)
943 }
944}