datatypes/types/
string_type.rs

1// Copyright 2023 Greptime Team
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15use std::sync::Arc;
16
17use arrow::datatypes::DataType as ArrowDataType;
18use common_base::bytes::StringBytes;
19use serde::{Deserialize, Serialize};
20
21use crate::data_type::{DataType, DataTypeRef};
22use crate::type_id::LogicalTypeId;
23use crate::value::Value;
24use crate::vectors::{MutableVector, StringVectorBuilder};
25
26/// String size variant to distinguish between UTF8 and LargeUTF8
27#[derive(
28    Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord, Serialize, Deserialize, Default,
29)]
30pub enum StringSizeType {
31    /// Regular UTF8 strings (up to 2GB)
32    #[default]
33    Utf8,
34    /// Large UTF8 strings (up to 2^63 bytes)
35    LargeUtf8,
36    /// A view into string data (Arrow `Utf8View`)
37    Utf8View,
38}
39
40#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord, Serialize)]
41pub struct StringType {
42    #[serde(default)]
43    size_type: StringSizeType,
44}
45
46/// Custom deserialization to support both old and new formats.
47impl<'de> serde::Deserialize<'de> for StringType {
48    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
49    where
50        D: serde::Deserializer<'de>,
51    {
52        #[derive(serde::Deserialize)]
53        struct Helper {
54            #[serde(default)]
55            size_type: StringSizeType,
56        }
57
58        let opt = Option::<Helper>::deserialize(deserializer)?;
59        Ok(match opt {
60            Some(helper) => Self {
61                size_type: helper.size_type,
62            },
63            None => Self::default(),
64        })
65    }
66}
67
68impl Default for StringType {
69    fn default() -> Self {
70        Self {
71            size_type: StringSizeType::Utf8,
72        }
73    }
74}
75
76impl StringType {
77    /// Create a new StringType with default (Utf8) size
78    pub fn new() -> Self {
79        Self {
80            size_type: StringSizeType::Utf8,
81        }
82    }
83
84    /// Create a new StringType with specified size
85    pub fn with_size(size_type: StringSizeType) -> Self {
86        Self { size_type }
87    }
88
89    /// Create a StringType for regular UTF8 strings
90    pub fn utf8() -> Self {
91        Self::with_size(StringSizeType::Utf8)
92    }
93
94    /// Create a StringType for large UTF8 strings
95    pub fn large_utf8() -> Self {
96        Self::with_size(StringSizeType::LargeUtf8)
97    }
98
99    /// Create a StringType for view strings
100    pub fn utf8_view() -> Self {
101        Self::with_size(StringSizeType::Utf8View)
102    }
103
104    /// Get the size type
105    pub fn size_type(&self) -> StringSizeType {
106        self.size_type
107    }
108
109    /// Check if this is a large UTF8 string type
110    pub fn is_large(&self) -> bool {
111        matches!(self.size_type, StringSizeType::LargeUtf8)
112    }
113
114    pub fn is_view(&self) -> bool {
115        matches!(self.size_type, StringSizeType::Utf8View)
116    }
117
118    pub fn arc() -> DataTypeRef {
119        Arc::new(Self::new())
120    }
121
122    pub fn large_arc() -> DataTypeRef {
123        Arc::new(Self::large_utf8())
124    }
125
126    pub fn view_arc() -> DataTypeRef {
127        Arc::new(Self::utf8_view())
128    }
129}
130
131impl DataType for StringType {
132    fn name(&self) -> String {
133        "String".to_string()
134    }
135
136    fn logical_type_id(&self) -> LogicalTypeId {
137        LogicalTypeId::String
138    }
139
140    fn default_value(&self) -> Value {
141        StringBytes::default().into()
142    }
143
144    fn as_arrow_type(&self) -> ArrowDataType {
145        match self.size_type {
146            StringSizeType::Utf8 => ArrowDataType::Utf8,
147            StringSizeType::LargeUtf8 => ArrowDataType::LargeUtf8,
148            StringSizeType::Utf8View => ArrowDataType::Utf8View,
149        }
150    }
151
152    fn create_mutable_vector(&self, capacity: usize) -> Box<dyn MutableVector> {
153        match self.size_type {
154            StringSizeType::Utf8 => Box::new(StringVectorBuilder::with_string_capacity(capacity)),
155            StringSizeType::LargeUtf8 => {
156                Box::new(StringVectorBuilder::with_large_capacity(capacity))
157            }
158            StringSizeType::Utf8View => Box::new(StringVectorBuilder::with_view_capacity(capacity)),
159        }
160    }
161
162    fn try_cast(&self, from: Value) -> Option<Value> {
163        if from.logical_type_id() == self.logical_type_id() {
164            return Some(from);
165        }
166
167        match from {
168            Value::Null => Some(Value::String(StringBytes::from("null".to_string()))),
169
170            Value::Boolean(v) => Some(Value::String(StringBytes::from(v.to_string()))),
171            Value::UInt8(v) => Some(Value::String(StringBytes::from(v.to_string()))),
172            Value::UInt16(v) => Some(Value::String(StringBytes::from(v.to_string()))),
173            Value::UInt32(v) => Some(Value::String(StringBytes::from(v.to_string()))),
174            Value::UInt64(v) => Some(Value::String(StringBytes::from(v.to_string()))),
175            Value::Int8(v) => Some(Value::String(StringBytes::from(v.to_string()))),
176            Value::Int16(v) => Some(Value::String(StringBytes::from(v.to_string()))),
177            Value::Int32(v) => Some(Value::String(StringBytes::from(v.to_string()))),
178            Value::Int64(v) => Some(Value::String(StringBytes::from(v.to_string()))),
179            Value::Float32(v) => Some(Value::String(StringBytes::from(v.to_string()))),
180            Value::Float64(v) => Some(Value::String(StringBytes::from(v.to_string()))),
181            Value::String(v) => Some(Value::String(v)),
182            Value::Date(v) => Some(Value::String(StringBytes::from(v.to_string()))),
183            Value::Timestamp(v) => Some(Value::String(StringBytes::from(v.to_iso8601_string()))),
184            Value::Time(v) => Some(Value::String(StringBytes::from(v.to_iso8601_string()))),
185            Value::IntervalYearMonth(v) => {
186                Some(Value::String(StringBytes::from(v.to_iso8601_string())))
187            }
188            Value::IntervalDayTime(v) => {
189                Some(Value::String(StringBytes::from(v.to_iso8601_string())))
190            }
191            Value::IntervalMonthDayNano(v) => {
192                Some(Value::String(StringBytes::from(v.to_iso8601_string())))
193            }
194            Value::Duration(v) => Some(Value::String(StringBytes::from(v.to_string()))),
195            Value::Decimal128(v) => Some(Value::String(StringBytes::from(v.to_string()))),
196
197            Value::Json(v) => serde_json::to_string(v.as_ref()).ok().map(|s| s.into()),
198
199            // StringBytes is only support for utf-8, Value::Binary and collections are not allowed.
200            Value::Binary(_) | Value::List(_) | Value::Struct(_) => None,
201        }
202    }
203}