datatypes/
data_type.rs

1// Copyright 2023 Greptime Team
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15use std::fmt;
16use std::sync::Arc;
17
18use arrow::compute::cast as arrow_array_cast;
19use arrow::datatypes::{
20    DataType as ArrowDataType, IntervalUnit as ArrowIntervalUnit, TimeUnit as ArrowTimeUnit,
21};
22use arrow_schema::DECIMAL_DEFAULT_SCALE;
23use common_decimal::decimal128::DECIMAL128_MAX_PRECISION;
24use common_time::interval::IntervalUnit;
25use common_time::timestamp::TimeUnit;
26use enum_dispatch::enum_dispatch;
27use paste::paste;
28use serde::{Deserialize, Serialize};
29
30use crate::error::{self, Error, Result};
31use crate::type_id::LogicalTypeId;
32use crate::types::{
33    BinaryType, BooleanType, DateType, Decimal128Type, DictionaryType, DurationMicrosecondType,
34    DurationMillisecondType, DurationNanosecondType, DurationSecondType, DurationType, Float32Type,
35    Float64Type, Int8Type, Int16Type, Int32Type, Int64Type, IntervalDayTimeType,
36    IntervalMonthDayNanoType, IntervalType, IntervalYearMonthType, JsonFormat, JsonType, ListType,
37    NullType, StringType, StructType, TimeMillisecondType, TimeType, TimestampMicrosecondType,
38    TimestampMillisecondType, TimestampNanosecondType, TimestampSecondType, TimestampType,
39    UInt8Type, UInt16Type, UInt32Type, UInt64Type, VectorType,
40};
41use crate::value::Value;
42use crate::vectors::MutableVector;
43
44#[derive(Clone, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, Serialize, Deserialize)]
45#[enum_dispatch(DataType)]
46pub enum ConcreteDataType {
47    Null(NullType),
48    Boolean(BooleanType),
49
50    // Numeric types:
51    Int8(Int8Type),
52    Int16(Int16Type),
53    Int32(Int32Type),
54    Int64(Int64Type),
55    UInt8(UInt8Type),
56    UInt16(UInt16Type),
57    UInt32(UInt32Type),
58    UInt64(UInt64Type),
59    Float32(Float32Type),
60    Float64(Float64Type),
61
62    // Decimal128 type:
63    Decimal128(Decimal128Type),
64
65    // String types:
66    Binary(BinaryType),
67    String(StringType),
68
69    // Date and time types:
70    Date(DateType),
71    Timestamp(TimestampType),
72    Time(TimeType),
73
74    // Duration type:
75    Duration(DurationType),
76
77    // Interval type:
78    Interval(IntervalType),
79
80    // Compound types:
81    List(ListType),
82    Dictionary(DictionaryType),
83    Struct(StructType),
84
85    // JSON type:
86    Json(JsonType),
87
88    // Vector type:
89    Vector(VectorType),
90}
91
92impl fmt::Display for ConcreteDataType {
93    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
94        match self {
95            ConcreteDataType::Null(v) => write!(f, "{}", v.name()),
96            ConcreteDataType::Boolean(v) => write!(f, "{}", v.name()),
97            ConcreteDataType::Int8(v) => write!(f, "{}", v.name()),
98            ConcreteDataType::Int16(v) => write!(f, "{}", v.name()),
99            ConcreteDataType::Int32(v) => write!(f, "{}", v.name()),
100            ConcreteDataType::Int64(v) => write!(f, "{}", v.name()),
101            ConcreteDataType::UInt8(v) => write!(f, "{}", v.name()),
102            ConcreteDataType::UInt16(v) => write!(f, "{}", v.name()),
103            ConcreteDataType::UInt32(v) => write!(f, "{}", v.name()),
104            ConcreteDataType::UInt64(v) => write!(f, "{}", v.name()),
105            ConcreteDataType::Float32(v) => write!(f, "{}", v.name()),
106            ConcreteDataType::Float64(v) => write!(f, "{}", v.name()),
107            ConcreteDataType::Binary(v) => write!(f, "{}", v.name()),
108            ConcreteDataType::String(v) => write!(f, "{}", v.name()),
109            ConcreteDataType::Date(v) => write!(f, "{}", v.name()),
110            ConcreteDataType::Timestamp(t) => match t {
111                TimestampType::Second(v) => write!(f, "{}", v.name()),
112                TimestampType::Millisecond(v) => write!(f, "{}", v.name()),
113                TimestampType::Microsecond(v) => write!(f, "{}", v.name()),
114                TimestampType::Nanosecond(v) => write!(f, "{}", v.name()),
115            },
116            ConcreteDataType::Time(t) => match t {
117                TimeType::Second(v) => write!(f, "{}", v.name()),
118                TimeType::Millisecond(v) => write!(f, "{}", v.name()),
119                TimeType::Microsecond(v) => write!(f, "{}", v.name()),
120                TimeType::Nanosecond(v) => write!(f, "{}", v.name()),
121            },
122            ConcreteDataType::Interval(i) => match i {
123                IntervalType::YearMonth(v) => write!(f, "{}", v.name()),
124                IntervalType::DayTime(v) => write!(f, "{}", v.name()),
125                IntervalType::MonthDayNano(v) => write!(f, "{}", v.name()),
126            },
127            ConcreteDataType::Duration(d) => match d {
128                DurationType::Second(v) => write!(f, "{}", v.name()),
129                DurationType::Millisecond(v) => write!(f, "{}", v.name()),
130                DurationType::Microsecond(v) => write!(f, "{}", v.name()),
131                DurationType::Nanosecond(v) => write!(f, "{}", v.name()),
132            },
133            ConcreteDataType::Decimal128(v) => write!(f, "{}", v.name()),
134            ConcreteDataType::List(v) => write!(f, "{}", v.name()),
135            ConcreteDataType::Struct(v) => write!(f, "{}", v.name()),
136            ConcreteDataType::Dictionary(v) => write!(f, "{}", v.name()),
137            ConcreteDataType::Json(v) => write!(f, "{}", v.name()),
138            ConcreteDataType::Vector(v) => write!(f, "{}", v.name()),
139        }
140    }
141}
142
143// TODO(yingwen): Refactor these `is_xxx()` methods, such as adding a `properties()` method
144// returning all these properties to the `DataType` trait
145impl ConcreteDataType {
146    pub fn is_float(&self) -> bool {
147        matches!(
148            self,
149            ConcreteDataType::Float64(_) | ConcreteDataType::Float32(_)
150        )
151    }
152
153    pub fn is_boolean(&self) -> bool {
154        matches!(self, ConcreteDataType::Boolean(_))
155    }
156
157    pub fn is_string(&self) -> bool {
158        matches!(self, ConcreteDataType::String(_))
159    }
160
161    pub fn is_stringifiable(&self) -> bool {
162        matches!(
163            self,
164            ConcreteDataType::String(_)
165                | ConcreteDataType::Date(_)
166                | ConcreteDataType::Timestamp(_)
167                | ConcreteDataType::Time(_)
168                | ConcreteDataType::Interval(_)
169                | ConcreteDataType::Duration(_)
170                | ConcreteDataType::Decimal128(_)
171                | ConcreteDataType::Binary(_)
172                | ConcreteDataType::Json(_)
173                | ConcreteDataType::Vector(_)
174        )
175    }
176
177    pub fn is_signed(&self) -> bool {
178        matches!(
179            self,
180            ConcreteDataType::Int8(_)
181                | ConcreteDataType::Int16(_)
182                | ConcreteDataType::Int32(_)
183                | ConcreteDataType::Int64(_)
184                | ConcreteDataType::Date(_)
185                | ConcreteDataType::Timestamp(_)
186                | ConcreteDataType::Time(_)
187                | ConcreteDataType::Interval(_)
188                | ConcreteDataType::Duration(_)
189                | ConcreteDataType::Decimal128(_)
190        )
191    }
192
193    pub fn is_unsigned(&self) -> bool {
194        matches!(
195            self,
196            ConcreteDataType::UInt8(_)
197                | ConcreteDataType::UInt16(_)
198                | ConcreteDataType::UInt32(_)
199                | ConcreteDataType::UInt64(_)
200        )
201    }
202
203    pub fn is_numeric(&self) -> bool {
204        matches!(
205            self,
206            ConcreteDataType::Int8(_)
207                | ConcreteDataType::Int16(_)
208                | ConcreteDataType::Int32(_)
209                | ConcreteDataType::Int64(_)
210                | ConcreteDataType::UInt8(_)
211                | ConcreteDataType::UInt16(_)
212                | ConcreteDataType::UInt32(_)
213                | ConcreteDataType::UInt64(_)
214                | ConcreteDataType::Float32(_)
215                | ConcreteDataType::Float64(_)
216        )
217    }
218
219    pub fn is_timestamp(&self) -> bool {
220        matches!(self, ConcreteDataType::Timestamp(_))
221    }
222
223    pub fn is_decimal(&self) -> bool {
224        matches!(self, ConcreteDataType::Decimal128(_))
225    }
226
227    pub fn is_json(&self) -> bool {
228        matches!(self, ConcreteDataType::Json(_))
229    }
230
231    pub fn is_vector(&self) -> bool {
232        matches!(self, ConcreteDataType::Vector(_))
233    }
234
235    pub fn numerics() -> Vec<ConcreteDataType> {
236        vec![
237            ConcreteDataType::int8_datatype(),
238            ConcreteDataType::int16_datatype(),
239            ConcreteDataType::int32_datatype(),
240            ConcreteDataType::int64_datatype(),
241            ConcreteDataType::uint8_datatype(),
242            ConcreteDataType::uint16_datatype(),
243            ConcreteDataType::uint32_datatype(),
244            ConcreteDataType::uint64_datatype(),
245            ConcreteDataType::float32_datatype(),
246            ConcreteDataType::float64_datatype(),
247        ]
248    }
249
250    pub fn unsigned_integers() -> Vec<ConcreteDataType> {
251        vec![
252            ConcreteDataType::uint8_datatype(),
253            ConcreteDataType::uint16_datatype(),
254            ConcreteDataType::uint32_datatype(),
255            ConcreteDataType::uint64_datatype(),
256        ]
257    }
258
259    pub fn timestamps() -> Vec<ConcreteDataType> {
260        vec![
261            ConcreteDataType::timestamp_second_datatype(),
262            ConcreteDataType::timestamp_millisecond_datatype(),
263            ConcreteDataType::timestamp_microsecond_datatype(),
264            ConcreteDataType::timestamp_nanosecond_datatype(),
265        ]
266    }
267
268    /// Convert arrow data type to [ConcreteDataType].
269    ///
270    /// # Panics
271    /// Panic if given arrow data type is not supported.
272    pub fn from_arrow_type(dt: &ArrowDataType) -> Self {
273        ConcreteDataType::try_from(dt).expect("Unimplemented type")
274    }
275
276    pub fn is_null(&self) -> bool {
277        matches!(self, ConcreteDataType::Null(NullType))
278    }
279
280    /// Try to cast the type as a [`ListType`].
281    pub fn as_list(&self) -> Option<&ListType> {
282        match self {
283            ConcreteDataType::List(t) => Some(t),
284            _ => None,
285        }
286    }
287
288    pub fn as_struct(&self) -> Option<&StructType> {
289        match self {
290            ConcreteDataType::Struct(s) => Some(s),
291            _ => None,
292        }
293    }
294
295    /// Try to cast data type as a [`TimestampType`].
296    pub fn as_timestamp(&self) -> Option<TimestampType> {
297        match self {
298            ConcreteDataType::Timestamp(t) => Some(*t),
299            _ => None,
300        }
301    }
302
303    /// Try to get numeric precision, returns `None` if it's not numeric type
304    pub fn numeric_precision(&self) -> Option<u8> {
305        match self {
306            ConcreteDataType::Int8(_) | ConcreteDataType::UInt8(_) => Some(3),
307            ConcreteDataType::Int16(_) | ConcreteDataType::UInt16(_) => Some(5),
308            ConcreteDataType::Int32(_) | ConcreteDataType::UInt32(_) => Some(10),
309            ConcreteDataType::Int64(_) => Some(19),
310            ConcreteDataType::UInt64(_) => Some(20),
311            ConcreteDataType::Float32(_) => Some(12),
312            ConcreteDataType::Float64(_) => Some(22),
313            ConcreteDataType::Decimal128(decimal_type) => Some(decimal_type.precision()),
314            _ => None,
315        }
316    }
317
318    /// Try to get numeric scale, returns `None` if it's float or not numeric type
319    pub fn numeric_scale(&self) -> Option<i8> {
320        match self {
321            ConcreteDataType::Int8(_)
322            | ConcreteDataType::UInt8(_)
323            | ConcreteDataType::Int16(_)
324            | ConcreteDataType::UInt16(_)
325            | ConcreteDataType::Int32(_)
326            | ConcreteDataType::UInt32(_)
327            | ConcreteDataType::Int64(_)
328            | ConcreteDataType::UInt64(_) => Some(0),
329            ConcreteDataType::Float32(_) | ConcreteDataType::Float64(_) => None,
330            ConcreteDataType::Decimal128(decimal_type) => Some(decimal_type.scale()),
331            _ => None,
332        }
333    }
334
335    /// Try to cast data type as a [`TimeType`].
336    pub fn as_time(&self) -> Option<TimeType> {
337        match self {
338            ConcreteDataType::Int64(_) => Some(TimeType::Millisecond(TimeMillisecondType)),
339            ConcreteDataType::Time(t) => Some(*t),
340            _ => None,
341        }
342    }
343
344    pub fn as_decimal128(&self) -> Option<Decimal128Type> {
345        match self {
346            ConcreteDataType::Decimal128(d) => Some(*d),
347            _ => None,
348        }
349    }
350
351    pub fn as_json(&self) -> Option<JsonType> {
352        match self {
353            ConcreteDataType::Json(j) => Some(j.clone()),
354            _ => None,
355        }
356    }
357
358    pub fn as_vector(&self) -> Option<VectorType> {
359        match self {
360            ConcreteDataType::Vector(v) => Some(*v),
361            _ => None,
362        }
363    }
364
365    /// Checks if the data type can cast to another data type.
366    pub fn can_arrow_type_cast_to(&self, to_type: &ConcreteDataType) -> bool {
367        let array = arrow_array::new_empty_array(&self.as_arrow_type());
368        arrow_array_cast(array.as_ref(), &to_type.as_arrow_type()).is_ok()
369    }
370
371    /// Try to cast data type as a [`DurationType`].
372    pub fn as_duration(&self) -> Option<DurationType> {
373        match self {
374            ConcreteDataType::Duration(d) => Some(*d),
375            _ => None,
376        }
377    }
378
379    /// Return the datatype name in postgres type system
380    pub fn postgres_datatype_name(&self) -> &'static str {
381        match self {
382            &ConcreteDataType::Null(_) => "UNKNOWN",
383            &ConcreteDataType::Boolean(_) => "BOOL",
384            &ConcreteDataType::Int8(_) | &ConcreteDataType::UInt8(_) => "CHAR",
385            &ConcreteDataType::Int16(_) | &ConcreteDataType::UInt16(_) => "INT2",
386            &ConcreteDataType::Int32(_) | &ConcreteDataType::UInt32(_) => "INT4",
387            &ConcreteDataType::Int64(_) | &ConcreteDataType::UInt64(_) => "INT8",
388            &ConcreteDataType::Float32(_) => "FLOAT4",
389            &ConcreteDataType::Float64(_) => "FLOAT8",
390            &ConcreteDataType::Binary(_) | &ConcreteDataType::Vector(_) => "BYTEA",
391            &ConcreteDataType::String(_) => "VARCHAR",
392            &ConcreteDataType::Date(_) => "DATE",
393            &ConcreteDataType::Timestamp(_) => "TIMESTAMP",
394            &ConcreteDataType::Time(_) => "TIME",
395            &ConcreteDataType::Interval(_) => "INTERVAL",
396            &ConcreteDataType::Decimal128(_) => "NUMERIC",
397            &ConcreteDataType::Json(_) => "JSON",
398            ConcreteDataType::List(list) => match list.item_type() {
399                &ConcreteDataType::Null(_) => "UNKNOWN",
400                &ConcreteDataType::Boolean(_) => "_BOOL",
401                &ConcreteDataType::Int8(_) | &ConcreteDataType::UInt8(_) => "_CHAR",
402                &ConcreteDataType::Int16(_) | &ConcreteDataType::UInt16(_) => "_INT2",
403                &ConcreteDataType::Int32(_) | &ConcreteDataType::UInt32(_) => "_INT4",
404                &ConcreteDataType::Int64(_) | &ConcreteDataType::UInt64(_) => "_INT8",
405                &ConcreteDataType::Float32(_) => "_FLOAT4",
406                &ConcreteDataType::Float64(_) => "_FLOAT8",
407                &ConcreteDataType::Binary(_) => "_BYTEA",
408                &ConcreteDataType::String(_) => "_VARCHAR",
409                &ConcreteDataType::Date(_) => "_DATE",
410                &ConcreteDataType::Timestamp(_) => "_TIMESTAMP",
411                &ConcreteDataType::Time(_) => "_TIME",
412                &ConcreteDataType::Interval(_) => "_INTERVAL",
413                &ConcreteDataType::Decimal128(_) => "_NUMERIC",
414                &ConcreteDataType::Json(_) => "_JSON",
415                &ConcreteDataType::Duration(_)
416                | &ConcreteDataType::Dictionary(_)
417                | &ConcreteDataType::Vector(_)
418                | &ConcreteDataType::List(_)
419                | &ConcreteDataType::Struct(_) => "UNKNOWN",
420            },
421            &ConcreteDataType::Duration(_)
422            | &ConcreteDataType::Dictionary(_)
423            | &ConcreteDataType::Struct(_) => "UNKNOWN",
424        }
425    }
426}
427
428impl From<&ConcreteDataType> for ConcreteDataType {
429    fn from(t: &ConcreteDataType) -> Self {
430        t.clone()
431    }
432}
433
434impl TryFrom<&ArrowDataType> for ConcreteDataType {
435    type Error = Error;
436
437    fn try_from(dt: &ArrowDataType) -> Result<ConcreteDataType> {
438        let concrete_type = match dt {
439            ArrowDataType::Null => Self::null_datatype(),
440            ArrowDataType::Boolean => Self::boolean_datatype(),
441            ArrowDataType::UInt8 => Self::uint8_datatype(),
442            ArrowDataType::UInt16 => Self::uint16_datatype(),
443            ArrowDataType::UInt32 => Self::uint32_datatype(),
444            ArrowDataType::UInt64 => Self::uint64_datatype(),
445            ArrowDataType::Int8 => Self::int8_datatype(),
446            ArrowDataType::Int16 => Self::int16_datatype(),
447            ArrowDataType::Int32 => Self::int32_datatype(),
448            ArrowDataType::Int64 => Self::int64_datatype(),
449            ArrowDataType::Float32 => Self::float32_datatype(),
450            ArrowDataType::Float64 => Self::float64_datatype(),
451            ArrowDataType::Date32 => Self::date_datatype(),
452            ArrowDataType::Timestamp(u, _) => ConcreteDataType::from_arrow_time_unit(u),
453            ArrowDataType::Interval(u) => ConcreteDataType::from_arrow_interval_unit(u),
454            ArrowDataType::Binary | ArrowDataType::LargeBinary | ArrowDataType::BinaryView => {
455                Self::binary_datatype()
456            }
457            ArrowDataType::Utf8 | ArrowDataType::Utf8View => Self::string_datatype(),
458            ArrowDataType::LargeUtf8 => Self::large_string_datatype(),
459            ArrowDataType::List(field) => Self::List(ListType::new(Arc::new(
460                ConcreteDataType::from_arrow_type(field.data_type()),
461            ))),
462            ArrowDataType::Dictionary(key_type, value_type) => {
463                let key_type = ConcreteDataType::from_arrow_type(key_type);
464                let value_type = ConcreteDataType::from_arrow_type(value_type);
465                Self::Dictionary(DictionaryType::new(key_type, value_type))
466            }
467            ArrowDataType::Time32(u) => ConcreteDataType::Time(TimeType::from_unit(u.into())),
468            ArrowDataType::Time64(u) => ConcreteDataType::Time(TimeType::from_unit(u.into())),
469            ArrowDataType::Duration(u) => {
470                ConcreteDataType::Duration(DurationType::from_unit(u.into()))
471            }
472            ArrowDataType::Decimal128(precision, scale) => {
473                ConcreteDataType::decimal128_datatype(*precision, *scale)
474            }
475            ArrowDataType::Struct(fields) => ConcreteDataType::Struct(fields.try_into()?),
476            ArrowDataType::Float16
477            | ArrowDataType::Date64
478            | ArrowDataType::FixedSizeBinary(_)
479            | ArrowDataType::ListView(_)
480            | ArrowDataType::FixedSizeList(_, _)
481            | ArrowDataType::LargeList(_)
482            | ArrowDataType::LargeListView(_)
483            | ArrowDataType::Union(_, _)
484            | ArrowDataType::Decimal256(_, _)
485            | ArrowDataType::Map(_, _)
486            | ArrowDataType::RunEndEncoded(_, _)
487            | ArrowDataType::Decimal32(_, _)
488            | ArrowDataType::Decimal64(_, _) => {
489                return error::UnsupportedArrowTypeSnafu {
490                    arrow_type: dt.clone(),
491                }
492                .fail();
493            }
494        };
495
496        Ok(concrete_type)
497    }
498}
499
500macro_rules! impl_new_concrete_type_functions {
501    ($($Type: ident), +) => {
502        paste! {
503            impl ConcreteDataType {
504                $(
505                    pub fn [<$Type:lower _datatype>]() -> ConcreteDataType {
506                        ConcreteDataType::$Type([<$Type Type>]::default())
507                    }
508                )+
509            }
510        }
511    }
512}
513
514impl_new_concrete_type_functions!(
515    Null, Boolean, UInt8, UInt16, UInt32, UInt64, Int8, Int16, Int32, Int64, Float32, Float64,
516    Binary, Date, String, Json
517);
518
519impl ConcreteDataType {
520    pub fn large_string_datatype() -> Self {
521        ConcreteDataType::String(StringType::large_utf8())
522    }
523
524    pub fn timestamp_second_datatype() -> Self {
525        ConcreteDataType::Timestamp(TimestampType::Second(TimestampSecondType))
526    }
527
528    pub fn timestamp_millisecond_datatype() -> Self {
529        ConcreteDataType::Timestamp(TimestampType::Millisecond(TimestampMillisecondType))
530    }
531
532    pub fn timestamp_microsecond_datatype() -> Self {
533        ConcreteDataType::Timestamp(TimestampType::Microsecond(TimestampMicrosecondType))
534    }
535
536    pub fn timestamp_nanosecond_datatype() -> Self {
537        ConcreteDataType::Timestamp(TimestampType::Nanosecond(TimestampNanosecondType))
538    }
539
540    /// Returns the time data type with `TimeUnit`.
541    pub fn time_datatype(unit: TimeUnit) -> Self {
542        ConcreteDataType::Time(TimeType::from_unit(unit))
543    }
544
545    /// Creates a [Time(TimeSecondType)] datatype.
546    pub fn time_second_datatype() -> Self {
547        Self::time_datatype(TimeUnit::Second)
548    }
549
550    /// Creates a [Time(TimeMillisecondType)] datatype.
551    pub fn time_millisecond_datatype() -> Self {
552        Self::time_datatype(TimeUnit::Millisecond)
553    }
554
555    /// Creates a [Time(TimeMicrosecond)] datatype.
556    pub fn time_microsecond_datatype() -> Self {
557        Self::time_datatype(TimeUnit::Microsecond)
558    }
559
560    /// Creates a [Time(TimeNanosecond)] datatype.
561    pub fn time_nanosecond_datatype() -> Self {
562        Self::time_datatype(TimeUnit::Nanosecond)
563    }
564
565    /// Creates a [Duration(DurationSecondType)] datatype.
566    pub fn duration_second_datatype() -> Self {
567        ConcreteDataType::Duration(DurationType::Second(DurationSecondType))
568    }
569
570    /// Creates a [Duration(DurationMillisecondType)] datatype.
571    pub fn duration_millisecond_datatype() -> Self {
572        ConcreteDataType::Duration(DurationType::Millisecond(DurationMillisecondType))
573    }
574
575    /// Creates a [Duration(DurationMicrosecondType)] datatype.
576    pub fn duration_microsecond_datatype() -> Self {
577        ConcreteDataType::Duration(DurationType::Microsecond(DurationMicrosecondType))
578    }
579
580    /// Creates a [Duration(DurationNanosecondType)] datatype.
581    pub fn duration_nanosecond_datatype() -> Self {
582        ConcreteDataType::Duration(DurationType::Nanosecond(DurationNanosecondType))
583    }
584
585    /// Creates a [Interval(IntervalMonthDayNanoType)] datatype.
586    pub fn interval_month_day_nano_datatype() -> Self {
587        ConcreteDataType::Interval(IntervalType::MonthDayNano(IntervalMonthDayNanoType))
588    }
589
590    /// Creates a [Interval(IntervalYearMonthType)] datatype.
591    pub fn interval_year_month_datatype() -> Self {
592        ConcreteDataType::Interval(IntervalType::YearMonth(IntervalYearMonthType))
593    }
594
595    /// Creates a [Interval(IntervalDayTimeType)] datatype.
596    pub fn interval_day_time_datatype() -> Self {
597        ConcreteDataType::Interval(IntervalType::DayTime(IntervalDayTimeType))
598    }
599
600    pub fn timestamp_datatype(unit: TimeUnit) -> Self {
601        match unit {
602            TimeUnit::Second => Self::timestamp_second_datatype(),
603            TimeUnit::Millisecond => Self::timestamp_millisecond_datatype(),
604            TimeUnit::Microsecond => Self::timestamp_microsecond_datatype(),
605            TimeUnit::Nanosecond => Self::timestamp_nanosecond_datatype(),
606        }
607    }
608
609    /// Converts from arrow timestamp unit to
610    pub fn from_arrow_time_unit(t: &ArrowTimeUnit) -> Self {
611        match t {
612            ArrowTimeUnit::Second => Self::timestamp_second_datatype(),
613            ArrowTimeUnit::Millisecond => Self::timestamp_millisecond_datatype(),
614            ArrowTimeUnit::Microsecond => Self::timestamp_microsecond_datatype(),
615            ArrowTimeUnit::Nanosecond => Self::timestamp_nanosecond_datatype(),
616        }
617    }
618
619    pub fn duration_datatype(unit: TimeUnit) -> Self {
620        match unit {
621            TimeUnit::Second => Self::duration_second_datatype(),
622            TimeUnit::Millisecond => Self::duration_millisecond_datatype(),
623            TimeUnit::Microsecond => Self::duration_microsecond_datatype(),
624            TimeUnit::Nanosecond => Self::duration_nanosecond_datatype(),
625        }
626    }
627
628    pub fn interval_datatype(unit: IntervalUnit) -> Self {
629        match unit {
630            IntervalUnit::YearMonth => Self::interval_year_month_datatype(),
631            IntervalUnit::DayTime => Self::interval_day_time_datatype(),
632            IntervalUnit::MonthDayNano => Self::interval_month_day_nano_datatype(),
633        }
634    }
635
636    pub fn from_arrow_interval_unit(u: &ArrowIntervalUnit) -> Self {
637        match u {
638            ArrowIntervalUnit::YearMonth => Self::interval_year_month_datatype(),
639            ArrowIntervalUnit::DayTime => Self::interval_day_time_datatype(),
640            ArrowIntervalUnit::MonthDayNano => Self::interval_month_day_nano_datatype(),
641        }
642    }
643
644    pub fn list_datatype(item_type: Arc<ConcreteDataType>) -> ConcreteDataType {
645        ConcreteDataType::List(ListType::new(item_type))
646    }
647
648    pub fn struct_datatype(fields: StructType) -> ConcreteDataType {
649        ConcreteDataType::Struct(fields)
650    }
651
652    pub fn dictionary_datatype(
653        key_type: ConcreteDataType,
654        value_type: ConcreteDataType,
655    ) -> ConcreteDataType {
656        ConcreteDataType::Dictionary(DictionaryType::new(key_type, value_type))
657    }
658
659    pub fn decimal128_datatype(precision: u8, scale: i8) -> ConcreteDataType {
660        ConcreteDataType::Decimal128(Decimal128Type::new(precision, scale))
661    }
662
663    pub fn decimal128_default_datatype() -> ConcreteDataType {
664        Self::decimal128_datatype(DECIMAL128_MAX_PRECISION, DECIMAL_DEFAULT_SCALE)
665    }
666
667    pub fn vector_datatype(dim: u32) -> ConcreteDataType {
668        ConcreteDataType::Vector(VectorType::new(dim))
669    }
670
671    pub fn vector_default_datatype() -> ConcreteDataType {
672        Self::vector_datatype(0)
673    }
674
675    pub fn json_native_datatype(inner_type: ConcreteDataType) -> ConcreteDataType {
676        ConcreteDataType::Json(JsonType::new(JsonFormat::Native(Box::new(inner_type))))
677    }
678}
679
680/// Data type abstraction.
681#[enum_dispatch::enum_dispatch]
682pub trait DataType: std::fmt::Debug + Send + Sync {
683    /// Name of this data type.
684    fn name(&self) -> String;
685
686    /// Returns id of the Logical data type.
687    fn logical_type_id(&self) -> LogicalTypeId;
688
689    /// Returns the default value of this type.
690    fn default_value(&self) -> Value;
691
692    /// Convert this type as [arrow::datatypes::DataType].
693    fn as_arrow_type(&self) -> ArrowDataType;
694
695    /// Creates a mutable vector with given `capacity` of this type.
696    fn create_mutable_vector(&self, capacity: usize) -> Box<dyn MutableVector>;
697
698    /// Casts the value to specific DataType.
699    /// Return None if cast failed.
700    fn try_cast(&self, from: Value) -> Option<Value>;
701}
702
703pub type DataTypeRef = Arc<dyn DataType>;
704
705#[cfg(test)]
706mod tests {
707    use arrow::datatypes::Field;
708
709    use super::*;
710
711    #[test]
712    fn test_concrete_type_as_datatype_trait() {
713        let concrete_type = ConcreteDataType::boolean_datatype();
714
715        assert_eq!("Boolean", concrete_type.to_string());
716        assert_eq!(Value::Boolean(false), concrete_type.default_value());
717        assert_eq!(LogicalTypeId::Boolean, concrete_type.logical_type_id());
718        assert_eq!(ArrowDataType::Boolean, concrete_type.as_arrow_type());
719    }
720
721    #[test]
722    fn test_from_arrow_type() {
723        assert!(matches!(
724            ConcreteDataType::from_arrow_type(&ArrowDataType::Null),
725            ConcreteDataType::Null(_)
726        ));
727        assert!(matches!(
728            ConcreteDataType::from_arrow_type(&ArrowDataType::Boolean),
729            ConcreteDataType::Boolean(_)
730        ));
731        assert!(matches!(
732            ConcreteDataType::from_arrow_type(&ArrowDataType::Binary),
733            ConcreteDataType::Binary(_)
734        ));
735        assert!(matches!(
736            ConcreteDataType::from_arrow_type(&ArrowDataType::LargeBinary),
737            ConcreteDataType::Binary(_)
738        ));
739        assert!(matches!(
740            ConcreteDataType::from_arrow_type(&ArrowDataType::Int8),
741            ConcreteDataType::Int8(_)
742        ));
743        assert!(matches!(
744            ConcreteDataType::from_arrow_type(&ArrowDataType::Int16),
745            ConcreteDataType::Int16(_)
746        ));
747        assert!(matches!(
748            ConcreteDataType::from_arrow_type(&ArrowDataType::Int32),
749            ConcreteDataType::Int32(_)
750        ));
751        assert!(matches!(
752            ConcreteDataType::from_arrow_type(&ArrowDataType::Int64),
753            ConcreteDataType::Int64(_)
754        ));
755        assert!(matches!(
756            ConcreteDataType::from_arrow_type(&ArrowDataType::UInt8),
757            ConcreteDataType::UInt8(_)
758        ));
759        assert!(matches!(
760            ConcreteDataType::from_arrow_type(&ArrowDataType::UInt16),
761            ConcreteDataType::UInt16(_)
762        ));
763        assert!(matches!(
764            ConcreteDataType::from_arrow_type(&ArrowDataType::UInt32),
765            ConcreteDataType::UInt32(_)
766        ));
767        assert!(matches!(
768            ConcreteDataType::from_arrow_type(&ArrowDataType::UInt64),
769            ConcreteDataType::UInt64(_)
770        ));
771        assert!(matches!(
772            ConcreteDataType::from_arrow_type(&ArrowDataType::Float32),
773            ConcreteDataType::Float32(_)
774        ));
775        assert!(matches!(
776            ConcreteDataType::from_arrow_type(&ArrowDataType::Float64),
777            ConcreteDataType::Float64(_)
778        ));
779        assert!(matches!(
780            ConcreteDataType::from_arrow_type(&ArrowDataType::Utf8),
781            ConcreteDataType::String(_)
782        ));
783        // Test LargeUtf8 mapping to large String type
784        let large_string_type = ConcreteDataType::from_arrow_type(&ArrowDataType::LargeUtf8);
785        assert!(matches!(large_string_type, ConcreteDataType::String(_)));
786        if let ConcreteDataType::String(string_type) = &large_string_type {
787            assert!(string_type.is_large());
788        } else {
789            panic!("Expected a String type");
790        }
791        assert_eq!(
792            ConcreteDataType::from_arrow_type(&ArrowDataType::List(Arc::new(Field::new(
793                "item",
794                ArrowDataType::Int32,
795                true,
796            )))),
797            ConcreteDataType::List(ListType::new(Arc::new(ConcreteDataType::int32_datatype())))
798        );
799        assert!(matches!(
800            ConcreteDataType::from_arrow_type(&ArrowDataType::Date32),
801            ConcreteDataType::Date(_)
802        ));
803    }
804
805    #[test]
806    fn test_large_utf8_round_trip() {
807        // Test round-trip conversion for LargeUtf8
808        let large_utf8_arrow = ArrowDataType::LargeUtf8;
809        let concrete_type = ConcreteDataType::from_arrow_type(&large_utf8_arrow);
810        let back_to_arrow = concrete_type.as_arrow_type();
811
812        assert!(matches!(concrete_type, ConcreteDataType::String(_)));
813        // Round-trip should preserve the LargeUtf8 type
814        assert_eq!(large_utf8_arrow, back_to_arrow);
815
816        // Test that Utf8 and LargeUtf8 map to different string variants
817        let utf8_concrete = ConcreteDataType::from_arrow_type(&ArrowDataType::Utf8);
818        let large_utf8_concrete = ConcreteDataType::from_arrow_type(&ArrowDataType::LargeUtf8);
819
820        assert!(matches!(utf8_concrete, ConcreteDataType::String(_)));
821        assert!(matches!(large_utf8_concrete, ConcreteDataType::String(_)));
822
823        // They should have different size types
824        if let (ConcreteDataType::String(utf8_type), ConcreteDataType::String(large_type)) =
825            (&utf8_concrete, &large_utf8_concrete)
826        {
827            assert!(!utf8_type.is_large());
828            assert!(large_type.is_large());
829        } else {
830            panic!("Expected both to be String types");
831        }
832
833        // They should be different types
834        assert_ne!(utf8_concrete, large_utf8_concrete);
835    }
836
837    #[test]
838    fn test_from_arrow_timestamp() {
839        assert_eq!(
840            ConcreteDataType::timestamp_millisecond_datatype(),
841            ConcreteDataType::from_arrow_time_unit(&ArrowTimeUnit::Millisecond)
842        );
843        assert_eq!(
844            ConcreteDataType::timestamp_microsecond_datatype(),
845            ConcreteDataType::from_arrow_time_unit(&ArrowTimeUnit::Microsecond)
846        );
847        assert_eq!(
848            ConcreteDataType::timestamp_nanosecond_datatype(),
849            ConcreteDataType::from_arrow_time_unit(&ArrowTimeUnit::Nanosecond)
850        );
851        assert_eq!(
852            ConcreteDataType::timestamp_second_datatype(),
853            ConcreteDataType::from_arrow_time_unit(&ArrowTimeUnit::Second)
854        );
855    }
856
857    #[test]
858    fn test_is_null() {
859        assert!(ConcreteDataType::null_datatype().is_null());
860        assert!(!ConcreteDataType::int32_datatype().is_null());
861    }
862
863    #[test]
864    fn test_is_float() {
865        assert!(!ConcreteDataType::int32_datatype().is_float());
866        assert!(ConcreteDataType::float32_datatype().is_float());
867        assert!(ConcreteDataType::float64_datatype().is_float());
868    }
869
870    #[test]
871    fn test_is_boolean() {
872        assert!(!ConcreteDataType::int32_datatype().is_boolean());
873        assert!(!ConcreteDataType::float32_datatype().is_boolean());
874        assert!(ConcreteDataType::boolean_datatype().is_boolean());
875    }
876
877    #[test]
878    fn test_is_decimal() {
879        assert!(!ConcreteDataType::int32_datatype().is_decimal());
880        assert!(!ConcreteDataType::float32_datatype().is_decimal());
881        assert!(ConcreteDataType::decimal128_datatype(10, 2).is_decimal());
882        assert!(ConcreteDataType::decimal128_datatype(18, 6).is_decimal());
883    }
884
885    #[test]
886    fn test_is_stringifiable() {
887        assert!(!ConcreteDataType::int32_datatype().is_stringifiable());
888        assert!(!ConcreteDataType::float32_datatype().is_stringifiable());
889        assert!(ConcreteDataType::string_datatype().is_stringifiable());
890        assert!(ConcreteDataType::binary_datatype().is_stringifiable());
891        assert!(ConcreteDataType::date_datatype().is_stringifiable());
892        assert!(ConcreteDataType::timestamp_second_datatype().is_stringifiable());
893        assert!(ConcreteDataType::timestamp_millisecond_datatype().is_stringifiable());
894        assert!(ConcreteDataType::timestamp_microsecond_datatype().is_stringifiable());
895        assert!(ConcreteDataType::timestamp_nanosecond_datatype().is_stringifiable());
896        assert!(ConcreteDataType::time_second_datatype().is_stringifiable());
897        assert!(ConcreteDataType::time_millisecond_datatype().is_stringifiable());
898        assert!(ConcreteDataType::time_microsecond_datatype().is_stringifiable());
899        assert!(ConcreteDataType::time_nanosecond_datatype().is_stringifiable());
900
901        assert!(ConcreteDataType::interval_year_month_datatype().is_stringifiable());
902        assert!(ConcreteDataType::interval_day_time_datatype().is_stringifiable());
903        assert!(ConcreteDataType::interval_month_day_nano_datatype().is_stringifiable());
904
905        assert!(ConcreteDataType::duration_second_datatype().is_stringifiable());
906        assert!(ConcreteDataType::duration_millisecond_datatype().is_stringifiable());
907        assert!(ConcreteDataType::duration_microsecond_datatype().is_stringifiable());
908        assert!(ConcreteDataType::duration_nanosecond_datatype().is_stringifiable());
909        assert!(ConcreteDataType::decimal128_datatype(10, 2).is_stringifiable());
910        assert!(ConcreteDataType::vector_default_datatype().is_stringifiable());
911    }
912
913    #[test]
914    fn test_is_signed() {
915        assert!(ConcreteDataType::int8_datatype().is_signed());
916        assert!(ConcreteDataType::int16_datatype().is_signed());
917        assert!(ConcreteDataType::int32_datatype().is_signed());
918        assert!(ConcreteDataType::int64_datatype().is_signed());
919        assert!(ConcreteDataType::date_datatype().is_signed());
920        assert!(ConcreteDataType::timestamp_second_datatype().is_signed());
921        assert!(ConcreteDataType::timestamp_millisecond_datatype().is_signed());
922        assert!(ConcreteDataType::timestamp_microsecond_datatype().is_signed());
923        assert!(ConcreteDataType::timestamp_nanosecond_datatype().is_signed());
924        assert!(ConcreteDataType::time_second_datatype().is_signed());
925        assert!(ConcreteDataType::time_millisecond_datatype().is_signed());
926        assert!(ConcreteDataType::time_microsecond_datatype().is_signed());
927        assert!(ConcreteDataType::time_nanosecond_datatype().is_signed());
928        assert!(ConcreteDataType::interval_year_month_datatype().is_signed());
929        assert!(ConcreteDataType::interval_day_time_datatype().is_signed());
930        assert!(ConcreteDataType::interval_month_day_nano_datatype().is_signed());
931        assert!(ConcreteDataType::duration_second_datatype().is_signed());
932        assert!(ConcreteDataType::duration_millisecond_datatype().is_signed());
933        assert!(ConcreteDataType::duration_microsecond_datatype().is_signed());
934        assert!(ConcreteDataType::duration_nanosecond_datatype().is_signed());
935
936        assert!(!ConcreteDataType::uint8_datatype().is_signed());
937        assert!(!ConcreteDataType::uint16_datatype().is_signed());
938        assert!(!ConcreteDataType::uint32_datatype().is_signed());
939        assert!(!ConcreteDataType::uint64_datatype().is_signed());
940
941        assert!(!ConcreteDataType::float32_datatype().is_signed());
942        assert!(!ConcreteDataType::float64_datatype().is_signed());
943
944        assert!(ConcreteDataType::decimal128_datatype(10, 2).is_signed());
945    }
946
947    #[test]
948    fn test_is_unsigned() {
949        assert!(!ConcreteDataType::int8_datatype().is_unsigned());
950        assert!(!ConcreteDataType::int16_datatype().is_unsigned());
951        assert!(!ConcreteDataType::int32_datatype().is_unsigned());
952        assert!(!ConcreteDataType::int64_datatype().is_unsigned());
953        assert!(!ConcreteDataType::date_datatype().is_unsigned());
954        assert!(!ConcreteDataType::timestamp_second_datatype().is_unsigned());
955        assert!(!ConcreteDataType::timestamp_millisecond_datatype().is_unsigned());
956        assert!(!ConcreteDataType::timestamp_microsecond_datatype().is_unsigned());
957        assert!(!ConcreteDataType::timestamp_nanosecond_datatype().is_unsigned());
958        assert!(!ConcreteDataType::time_second_datatype().is_unsigned());
959        assert!(!ConcreteDataType::time_millisecond_datatype().is_unsigned());
960        assert!(!ConcreteDataType::time_microsecond_datatype().is_unsigned());
961        assert!(!ConcreteDataType::time_nanosecond_datatype().is_unsigned());
962        assert!(!ConcreteDataType::interval_year_month_datatype().is_unsigned());
963        assert!(!ConcreteDataType::interval_day_time_datatype().is_unsigned());
964        assert!(!ConcreteDataType::interval_month_day_nano_datatype().is_unsigned());
965        assert!(!ConcreteDataType::duration_second_datatype().is_unsigned());
966        assert!(!ConcreteDataType::duration_millisecond_datatype().is_unsigned());
967        assert!(!ConcreteDataType::duration_microsecond_datatype().is_unsigned());
968        assert!(!ConcreteDataType::duration_nanosecond_datatype().is_unsigned());
969        assert!(!ConcreteDataType::decimal128_datatype(10, 2).is_unsigned());
970
971        assert!(ConcreteDataType::uint8_datatype().is_unsigned());
972        assert!(ConcreteDataType::uint16_datatype().is_unsigned());
973        assert!(ConcreteDataType::uint32_datatype().is_unsigned());
974        assert!(ConcreteDataType::uint64_datatype().is_unsigned());
975
976        assert!(!ConcreteDataType::float32_datatype().is_unsigned());
977        assert!(!ConcreteDataType::float64_datatype().is_unsigned());
978    }
979
980    #[test]
981    fn test_numerics() {
982        let nums = ConcreteDataType::numerics();
983        assert_eq!(10, nums.len());
984    }
985
986    #[test]
987    fn test_as_list() {
988        let list_type =
989            ConcreteDataType::list_datatype(Arc::new(ConcreteDataType::int32_datatype()));
990        assert_eq!(
991            ListType::new(Arc::new(ConcreteDataType::int32_datatype())),
992            *list_type.as_list().unwrap()
993        );
994        assert!(ConcreteDataType::int32_datatype().as_list().is_none());
995    }
996
997    #[test]
998    fn test_display_concrete_data_type() {
999        assert_eq!(ConcreteDataType::null_datatype().to_string(), "Null");
1000        assert_eq!(ConcreteDataType::boolean_datatype().to_string(), "Boolean");
1001        assert_eq!(ConcreteDataType::binary_datatype().to_string(), "Binary");
1002        assert_eq!(ConcreteDataType::int8_datatype().to_string(), "Int8");
1003        assert_eq!(ConcreteDataType::int16_datatype().to_string(), "Int16");
1004        assert_eq!(ConcreteDataType::int32_datatype().to_string(), "Int32");
1005        assert_eq!(ConcreteDataType::int64_datatype().to_string(), "Int64");
1006        assert_eq!(ConcreteDataType::uint8_datatype().to_string(), "UInt8");
1007        assert_eq!(ConcreteDataType::uint16_datatype().to_string(), "UInt16");
1008        assert_eq!(ConcreteDataType::uint32_datatype().to_string(), "UInt32");
1009        assert_eq!(ConcreteDataType::uint64_datatype().to_string(), "UInt64");
1010        assert_eq!(ConcreteDataType::float32_datatype().to_string(), "Float32");
1011        assert_eq!(ConcreteDataType::float64_datatype().to_string(), "Float64");
1012        assert_eq!(ConcreteDataType::string_datatype().to_string(), "String");
1013        assert_eq!(ConcreteDataType::date_datatype().to_string(), "Date");
1014        assert_eq!(
1015            ConcreteDataType::timestamp_millisecond_datatype().to_string(),
1016            "TimestampMillisecond"
1017        );
1018        assert_eq!(
1019            ConcreteDataType::time_millisecond_datatype().to_string(),
1020            "TimeMillisecond"
1021        );
1022        assert_eq!(
1023            ConcreteDataType::interval_month_day_nano_datatype().to_string(),
1024            "IntervalMonthDayNano"
1025        );
1026        assert_eq!(
1027            ConcreteDataType::duration_second_datatype().to_string(),
1028            "DurationSecond"
1029        );
1030        assert_eq!(
1031            ConcreteDataType::decimal128_datatype(10, 2).to_string(),
1032            "Decimal(10, 2)"
1033        );
1034        // Nested types
1035        assert_eq!(
1036            ConcreteDataType::list_datatype(Arc::new(ConcreteDataType::int32_datatype()))
1037                .to_string(),
1038            "List<Int32>"
1039        );
1040        assert_eq!(
1041            ConcreteDataType::list_datatype(Arc::new(ConcreteDataType::Dictionary(
1042                DictionaryType::new(
1043                    ConcreteDataType::int32_datatype(),
1044                    ConcreteDataType::string_datatype()
1045                )
1046            )))
1047            .to_string(),
1048            "List<Dictionary<Int32, String>>"
1049        );
1050        assert_eq!(
1051            ConcreteDataType::list_datatype(Arc::new(ConcreteDataType::list_datatype(Arc::new(
1052                ConcreteDataType::list_datatype(Arc::new(ConcreteDataType::int32_datatype()))
1053            ))))
1054            .to_string(),
1055            "List<List<List<Int32>>>"
1056        );
1057        assert_eq!(
1058            ConcreteDataType::dictionary_datatype(
1059                ConcreteDataType::int32_datatype(),
1060                ConcreteDataType::string_datatype()
1061            )
1062            .to_string(),
1063            "Dictionary<Int32, String>"
1064        );
1065        assert_eq!(
1066            ConcreteDataType::vector_datatype(3).to_string(),
1067            "Vector(3)"
1068        );
1069    }
1070}