datatypes/types/
string_type.rs

1// Copyright 2023 Greptime Team
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15use std::sync::Arc;
16
17use arrow::datatypes::DataType as ArrowDataType;
18use common_base::bytes::StringBytes;
19use serde::{Deserialize, Serialize};
20
21use crate::data_type::{DataType, DataTypeRef};
22use crate::type_id::LogicalTypeId;
23use crate::value::Value;
24use crate::vectors::{MutableVector, StringVectorBuilder};
25
26/// String size variant to distinguish between UTF8 and LargeUTF8
27#[derive(
28    Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord, Serialize, Deserialize, Default,
29)]
30pub enum StringSizeType {
31    /// Regular UTF8 strings (up to 2GB)
32    #[default]
33    Utf8,
34    /// Large UTF8 strings (up to 2^63 bytes)
35    LargeUtf8,
36}
37
38#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord, Serialize)]
39pub struct StringType {
40    #[serde(default)]
41    size_type: StringSizeType,
42}
43
44/// Custom deserialization to support both old and new formats.
45impl<'de> serde::Deserialize<'de> for StringType {
46    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
47    where
48        D: serde::Deserializer<'de>,
49    {
50        #[derive(serde::Deserialize)]
51        struct Helper {
52            #[serde(default)]
53            size_type: StringSizeType,
54        }
55
56        let opt = Option::<Helper>::deserialize(deserializer)?;
57        Ok(match opt {
58            Some(helper) => Self {
59                size_type: helper.size_type,
60            },
61            None => Self::default(),
62        })
63    }
64}
65
66impl Default for StringType {
67    fn default() -> Self {
68        Self {
69            size_type: StringSizeType::Utf8,
70        }
71    }
72}
73
74impl StringType {
75    /// Create a new StringType with default (Utf8) size
76    pub fn new() -> Self {
77        Self {
78            size_type: StringSizeType::Utf8,
79        }
80    }
81
82    /// Create a new StringType with specified size
83    pub fn with_size(size_type: StringSizeType) -> Self {
84        Self { size_type }
85    }
86
87    /// Create a StringType for regular UTF8 strings
88    pub fn utf8() -> Self {
89        Self::with_size(StringSizeType::Utf8)
90    }
91
92    /// Create a StringType for large UTF8 strings
93    pub fn large_utf8() -> Self {
94        Self::with_size(StringSizeType::LargeUtf8)
95    }
96
97    /// Get the size type
98    pub fn size_type(&self) -> StringSizeType {
99        self.size_type
100    }
101
102    /// Check if this is a large UTF8 string type
103    pub fn is_large(&self) -> bool {
104        matches!(self.size_type, StringSizeType::LargeUtf8)
105    }
106
107    pub fn arc() -> DataTypeRef {
108        Arc::new(Self::new())
109    }
110
111    pub fn large_arc() -> DataTypeRef {
112        Arc::new(Self::large_utf8())
113    }
114}
115
116impl DataType for StringType {
117    fn name(&self) -> String {
118        "String".to_string()
119    }
120
121    fn logical_type_id(&self) -> LogicalTypeId {
122        LogicalTypeId::String
123    }
124
125    fn default_value(&self) -> Value {
126        StringBytes::default().into()
127    }
128
129    fn as_arrow_type(&self) -> ArrowDataType {
130        match self.size_type {
131            StringSizeType::Utf8 => ArrowDataType::Utf8,
132            StringSizeType::LargeUtf8 => ArrowDataType::LargeUtf8,
133        }
134    }
135
136    fn create_mutable_vector(&self, capacity: usize) -> Box<dyn MutableVector> {
137        match self.size_type {
138            StringSizeType::Utf8 => Box::new(StringVectorBuilder::with_string_capacity(capacity)),
139            StringSizeType::LargeUtf8 => {
140                Box::new(StringVectorBuilder::with_large_capacity(capacity))
141            }
142        }
143    }
144
145    fn try_cast(&self, from: Value) -> Option<Value> {
146        if from.logical_type_id() == self.logical_type_id() {
147            return Some(from);
148        }
149
150        match from {
151            Value::Null => Some(Value::String(StringBytes::from("null".to_string()))),
152
153            Value::Boolean(v) => Some(Value::String(StringBytes::from(v.to_string()))),
154            Value::UInt8(v) => Some(Value::String(StringBytes::from(v.to_string()))),
155            Value::UInt16(v) => Some(Value::String(StringBytes::from(v.to_string()))),
156            Value::UInt32(v) => Some(Value::String(StringBytes::from(v.to_string()))),
157            Value::UInt64(v) => Some(Value::String(StringBytes::from(v.to_string()))),
158            Value::Int8(v) => Some(Value::String(StringBytes::from(v.to_string()))),
159            Value::Int16(v) => Some(Value::String(StringBytes::from(v.to_string()))),
160            Value::Int32(v) => Some(Value::String(StringBytes::from(v.to_string()))),
161            Value::Int64(v) => Some(Value::String(StringBytes::from(v.to_string()))),
162            Value::Float32(v) => Some(Value::String(StringBytes::from(v.to_string()))),
163            Value::Float64(v) => Some(Value::String(StringBytes::from(v.to_string()))),
164            Value::String(v) => Some(Value::String(v)),
165            Value::Date(v) => Some(Value::String(StringBytes::from(v.to_string()))),
166            Value::Timestamp(v) => Some(Value::String(StringBytes::from(v.to_iso8601_string()))),
167            Value::Time(v) => Some(Value::String(StringBytes::from(v.to_iso8601_string()))),
168            Value::IntervalYearMonth(v) => {
169                Some(Value::String(StringBytes::from(v.to_iso8601_string())))
170            }
171            Value::IntervalDayTime(v) => {
172                Some(Value::String(StringBytes::from(v.to_iso8601_string())))
173            }
174            Value::IntervalMonthDayNano(v) => {
175                Some(Value::String(StringBytes::from(v.to_iso8601_string())))
176            }
177            Value::Duration(v) => Some(Value::String(StringBytes::from(v.to_string()))),
178            Value::Decimal128(v) => Some(Value::String(StringBytes::from(v.to_string()))),
179
180            Value::Json(v) => self.try_cast(*v),
181
182            // StringBytes is only support for utf-8, Value::Binary and collections are not allowed.
183            Value::Binary(_) | Value::List(_) | Value::Struct(_) => None,
184        }
185    }
186}