datatypes/
vectors.rs

1// Copyright 2023 Greptime Team
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15use std::any::Any;
16use std::fmt::Debug;
17use std::sync::Arc;
18
19use arrow::array::{Array, ArrayRef};
20use snafu::ensure;
21
22use crate::data_type::ConcreteDataType;
23use crate::error::{self, Result};
24use crate::serialize::Serializable;
25use crate::value::{Value, ValueRef};
26use crate::vectors::operations::VectorOp;
27
28mod binary;
29mod boolean;
30mod constant;
31mod date;
32mod decimal;
33mod dictionary;
34mod duration;
35mod eq;
36mod helper;
37mod interval;
38mod list;
39mod null;
40pub(crate) mod operations;
41mod primitive;
42mod string;
43mod time;
44mod timestamp;
45mod validity;
46
47pub use binary::{BinaryVector, BinaryVectorBuilder};
48pub use boolean::{BooleanVector, BooleanVectorBuilder};
49pub use constant::ConstantVector;
50pub use date::{DateVector, DateVectorBuilder};
51pub use decimal::{Decimal128Vector, Decimal128VectorBuilder};
52pub use dictionary::{DictionaryIter, DictionaryVector};
53pub use duration::{
54    DurationMicrosecondVector, DurationMicrosecondVectorBuilder, DurationMillisecondVector,
55    DurationMillisecondVectorBuilder, DurationNanosecondVector, DurationNanosecondVectorBuilder,
56    DurationSecondVector, DurationSecondVectorBuilder,
57};
58pub use helper::Helper;
59pub use interval::{
60    IntervalDayTimeVector, IntervalDayTimeVectorBuilder, IntervalMonthDayNanoVector,
61    IntervalMonthDayNanoVectorBuilder, IntervalYearMonthVector, IntervalYearMonthVectorBuilder,
62};
63pub use list::{ListIter, ListVector, ListVectorBuilder};
64pub use null::{NullVector, NullVectorBuilder};
65pub use primitive::{
66    Float32Vector, Float32VectorBuilder, Float64Vector, Float64VectorBuilder, Int16Vector,
67    Int16VectorBuilder, Int32Vector, Int32VectorBuilder, Int64Vector, Int64VectorBuilder,
68    Int8Vector, Int8VectorBuilder, PrimitiveIter, PrimitiveVector, PrimitiveVectorBuilder,
69    UInt16Vector, UInt16VectorBuilder, UInt32Vector, UInt32VectorBuilder, UInt64Vector,
70    UInt64VectorBuilder, UInt8Vector, UInt8VectorBuilder,
71};
72pub use string::{StringVector, StringVectorBuilder};
73pub use time::{
74    TimeMicrosecondVector, TimeMicrosecondVectorBuilder, TimeMillisecondVector,
75    TimeMillisecondVectorBuilder, TimeNanosecondVector, TimeNanosecondVectorBuilder,
76    TimeSecondVector, TimeSecondVectorBuilder,
77};
78pub use timestamp::{
79    TimestampMicrosecondVector, TimestampMicrosecondVectorBuilder, TimestampMillisecondVector,
80    TimestampMillisecondVectorBuilder, TimestampNanosecondVector, TimestampNanosecondVectorBuilder,
81    TimestampSecondVector, TimestampSecondVectorBuilder,
82};
83pub use validity::Validity;
84
// TODO(yingwen): arrow 28.0 implements `Clone` for all arrays; we could upgrade to it and
// simplify some code in methods such as `to_arrow_array()` and `to_boxed_arrow_array()`.
87/// Vector of data values.
88pub trait Vector: Send + Sync + Serializable + Debug + VectorOp {
89    /// Returns the data type of the vector.
90    ///
91    /// This may require heap allocation.
92    fn data_type(&self) -> ConcreteDataType;
93
94    fn vector_type_name(&self) -> String;
95
96    /// Returns the vector as [Any](std::any::Any) so that it can be
97    /// downcast to a specific implementation.
98    fn as_any(&self) -> &dyn Any;
99
100    /// Returns number of elements in the vector.
101    fn len(&self) -> usize;
102
103    /// Returns whether the vector is empty.
104    fn is_empty(&self) -> bool {
105        self.len() == 0
106    }
107
108    /// Convert this vector to a new arrow [ArrayRef].
109    fn to_arrow_array(&self) -> ArrayRef;
110
111    /// Convert this vector to a new boxed arrow [Array].
112    fn to_boxed_arrow_array(&self) -> Box<dyn Array>;
113
114    /// Returns the validity of the Array.
115    fn validity(&self) -> Validity;
116
117    /// Returns the memory size of vector.
118    fn memory_size(&self) -> usize;
119
120    /// The number of null slots on this [`Vector`].
121    /// # Implementation
122    /// This is `O(1)`.
123    fn null_count(&self) -> usize;
124
125    /// Returns true when it's a ConstantColumn
126    fn is_const(&self) -> bool {
127        false
128    }
129
130    /// Returns whether row is null.
131    fn is_null(&self, row: usize) -> bool;
132
133    /// If the vector only contains NULL.
134    fn only_null(&self) -> bool {
135        self.null_count() == self.len()
136    }
137
138    /// Slices the `Vector`, returning a new `VectorRef`.
139    ///
140    /// # Panics
141    /// This function panics if `offset + length > self.len()`.
142    fn slice(&self, offset: usize, length: usize) -> VectorRef;
143
144    /// Returns the clone of value at `index`.
145    ///
146    /// # Panics
147    /// Panic if `index` is out of bound.
148    fn get(&self, index: usize) -> Value;
149
150    /// Returns the clone of value at `index` or error if `index`
151    /// is out of bound.
152    fn try_get(&self, index: usize) -> Result<Value> {
153        ensure!(
154            index < self.len(),
155            error::BadArrayAccessSnafu {
156                index,
157                size: self.len()
158            }
159        );
160        Ok(self.get(index))
161    }
162
163    /// Returns the reference of value at `index`.
164    ///
165    /// # Panics
166    /// Panic if `index` is out of bound.
167    fn get_ref(&self, index: usize) -> ValueRef;
168}
169
170pub type VectorRef = Arc<dyn Vector>;
171
172/// Mutable vector that could be used to build an immutable vector.
173pub trait MutableVector: Send + Sync {
174    /// Returns the data type of the vector.
175    fn data_type(&self) -> ConcreteDataType;
176
177    /// Returns the length of the vector.
178    fn len(&self) -> usize;
179
180    /// Returns whether the vector is empty.
181    fn is_empty(&self) -> bool {
182        self.len() == 0
183    }
184
185    /// Convert to Any, to enable dynamic casting.
186    fn as_any(&self) -> &dyn Any;
187
188    /// Convert to mutable Any, to enable dynamic casting.
189    fn as_mut_any(&mut self) -> &mut dyn Any;
190
191    /// Convert `self` to an (immutable) [VectorRef] and reset `self`.
192    fn to_vector(&mut self) -> VectorRef;
193
194    /// Convert `self` to an (immutable) [VectorRef] and without resetting `self`.
195    fn to_vector_cloned(&self) -> VectorRef;
196
197    /// Try to push value ref to this mutable vector.
198    fn try_push_value_ref(&mut self, value: ValueRef) -> Result<()>;
199
200    /// Push value ref to this mutable vector.
201    ///
202    /// # Panics
203    /// Panics if error if data types mismatch.
204    fn push_value_ref(&mut self, value: ValueRef) {
205        self.try_push_value_ref(value).unwrap_or_else(|_| {
206            panic!(
207                "expecting pushing value of datatype {:?}, actual {:?}",
208                self.data_type(),
209                value
210            );
211        });
212    }
213
214    /// Push null to this mutable vector.
215    fn push_null(&mut self);
216
217    /// Push nulls to this mutable vector.
218    fn push_nulls(&mut self, num_nulls: usize) {
219        for _ in 0..num_nulls {
220            self.push_null();
221        }
222    }
223
224    /// Extend this mutable vector by slice of `vector`.
225    ///
226    /// Returns error if data types mismatch.
227    ///
228    /// # Panics
229    /// Panics if `offset + length > vector.len()`.
230    fn extend_slice_of(&mut self, vector: &dyn Vector, offset: usize, length: usize) -> Result<()>;
231}
232
/// Generates `$Vector::try_from_arrow_array(array)`, a fallible conversion from any
/// arrow array handle into `$Vector`.
///
/// The generated function downcasts the input to `$Array`, clones the concrete array and
/// wraps it via `$Vector::from`. When the input is not a `$Array`, it returns a
/// `Conversion` error whose `from` field holds the `Debug` rendering of the input's
/// arrow data type.
macro_rules! impl_try_from_arrow_array_for_vector {
    ($Array: ident, $Vector: ident) => {
        impl $Vector {
            pub fn try_from_arrow_array(
                array: impl AsRef<dyn arrow::array::Array>,
            ) -> crate::error::Result<$Vector> {
                use snafu::OptionExt;

                // Borrow the type-erased array once and reuse it for both the downcast
                // and the error message.
                let source = array.as_ref();
                let concrete = source
                    .as_any()
                    .downcast_ref::<$Array>()
                    .with_context(|| crate::error::ConversionSnafu {
                        from: std::format!("{:?}", source.data_type()),
                    })?;

                Ok($Vector::from(concrete.clone()))
            }
        }
    };
}
256
/// Builds a `Validity` from the array data of `$arr`.
///
/// Expands to an expression; `Validity` must be in scope at the call site.
macro_rules! impl_validity_for_vector {
    ($arr: expr) => {{
        let array_data = $arr.to_data();
        Validity::from_array_data(array_data)
    }};
}
262
/// Expands to the body of a `get(index) -> Value` implementation backed by `$array`.
///
/// Yields `Value::Null` when the slot is null, otherwise the slot's value converted with
/// `Into`. `Value` must be in scope at the call site.
///
/// # Panics
/// Panics if `$index` is not smaller than `$array.len()`.
macro_rules! impl_get_for_vector {
    ($array: expr, $index: ident) => {{
        // `is_valid()` alone is not a bounds check: arrow answers it without looking at
        // the index when the array has no null buffer, so an out-of-range index would
        // otherwise reach `value_unchecked()` and be undefined behaviour.
        assert!(
            $index < $array.len(),
            "index out of bounds: the len is {} but the index is {}",
            $array.len(),
            $index
        );
        if $array.is_valid($index) {
            // SAFETY: `$index < $array.len()` was asserted above.
            unsafe { $array.value_unchecked($index).into() }
        } else {
            Value::Null
        }
    }};
}
273
/// Expands to the body of a `get_ref(index) -> ValueRef` implementation backed by `$array`.
///
/// Yields `ValueRef::Null` when the slot is null, otherwise the slot's value converted
/// with `Into`. `ValueRef` must be in scope at the call site.
///
/// # Panics
/// Panics if `$index` is not smaller than `$array.len()`.
macro_rules! impl_get_ref_for_vector {
    ($array: expr, $index: ident) => {{
        // `is_valid()` alone is not a bounds check: arrow answers it without looking at
        // the index when the array has no null buffer, so an out-of-range index would
        // otherwise reach `value_unchecked()` and be undefined behaviour.
        assert!(
            $index < $array.len(),
            "index out of bounds: the len is {} but the index is {}",
            $array.len(),
            $index
        );
        if $array.is_valid($index) {
            // SAFETY: `$index < $array.len()` was asserted above.
            unsafe { $array.value_unchecked($index).into() }
        } else {
            ValueRef::Null
        }
    }};
}
284
/// Expands to the body of an `extend_slice_of` implementation.
///
/// Slices `$source` to `[$offset, $offset + $length)`, downcasts the slice to `$Concrete`
/// and pushes every item produced by its `iter_data()` into `$builder`. Evaluates to
/// `Ok(())` on success and returns a `CastType` error from the enclosing function when the
/// slice is not a `$Concrete`.
macro_rules! impl_extend_for_builder {
    ($builder: expr, $source: ident, $Concrete: ident, $offset: ident, $length: ident) => {{
        use snafu::OptionExt;

        let window = $source.slice($offset, $length);
        let typed = window
            .as_any()
            .downcast_ref::<$Concrete>()
            .with_context(|| {
                let msg = format!(
                    "Failed to cast vector from {} to {}",
                    $source.vector_type_name(),
                    stringify!($Concrete)
                );
                crate::error::CastTypeSnafu { msg }
            })?;

        typed.iter_data().for_each(|item| {
            $builder.push(item);
        });

        Ok(())
    }};
}
306
307pub(crate) use {
308    impl_extend_for_builder, impl_get_for_vector, impl_get_ref_for_vector,
309    impl_try_from_arrow_array_for_vector, impl_validity_for_vector,
310};
311
312#[cfg(test)]
313pub mod tests {
314    use arrow::array::{Array, Int32Array, UInt8Array};
315    use paste::paste;
316    use serde_json;
317
318    use super::*;
319    use crate::data_type::DataType;
320    use crate::prelude::ScalarVectorBuilder;
321    use crate::types::{Int32Type, LogicalPrimitiveType};
322    use crate::vectors::helper::Helper;
323
324    #[test]
325    fn test_df_columns_to_vector() {
326        let df_column: Arc<dyn Array> = Arc::new(Int32Array::from(vec![1, 2, 3]));
327        let vector = Helper::try_into_vector(df_column).unwrap();
328        assert_eq!(
329            Int32Type::build_data_type().as_arrow_type(),
330            vector.data_type().as_arrow_type()
331        );
332    }
333
334    #[test]
335    fn test_serialize_i32_vector() {
336        let df_column: Arc<dyn Array> = Arc::new(Int32Array::from(vec![1, 2, 3]));
337        let json_value = Helper::try_into_vector(df_column)
338            .unwrap()
339            .serialize_to_json()
340            .unwrap();
341        assert_eq!("[1,2,3]", serde_json::to_string(&json_value).unwrap());
342    }
343
344    #[test]
345    fn test_serialize_i8_vector() {
346        let df_column: Arc<dyn Array> = Arc::new(UInt8Array::from(vec![1, 2, 3]));
347        let json_value = Helper::try_into_vector(df_column)
348            .unwrap()
349            .serialize_to_json()
350            .unwrap();
351        assert_eq!("[1,2,3]", serde_json::to_string(&json_value).unwrap());
352    }
353
354    #[test]
355    fn test_mutable_vector_data_type() {
356        macro_rules! mutable_primitive_data_type_eq_with_lower {
357            ($($type: ident),*) => {
358                $(
359                    paste! {
360                        let mutable_vector = [<$type VectorBuilder>]::with_capacity(1024);
361                        assert_eq!(mutable_vector.data_type(), ConcreteDataType::[<$type:lower _datatype>]());
362                    }
363                )*
364            };
365        }
366
367        macro_rules! mutable_time_data_type_eq_with_snake {
368            ($($type: ident),*) => {
369                $(
370                    paste! {
371                        let mutable_vector = [<$type VectorBuilder>]::with_capacity(1024);
372                        assert_eq!(mutable_vector.data_type(), ConcreteDataType::[<$type:snake _datatype>]());
373                    }
374                )*
375            };
376        }
377        // Test Primitive types
378        mutable_primitive_data_type_eq_with_lower!(
379            Boolean, Int8, Int16, Int32, Int64, UInt8, UInt16, UInt32, UInt64, Float32, Float64,
380            Date, Binary, String
381        );
382
383        // Test types about time
384        mutable_time_data_type_eq_with_snake!(
385            TimeSecond,
386            TimeMillisecond,
387            TimeMicrosecond,
388            TimeNanosecond,
389            TimestampSecond,
390            TimestampMillisecond,
391            TimestampMicrosecond,
392            TimestampNanosecond,
393            DurationSecond,
394            DurationMillisecond,
395            DurationMicrosecond,
396            DurationNanosecond,
397            IntervalYearMonth,
398            IntervalDayTime,
399            IntervalMonthDayNano
400        );
401
402        // Null type
403        let builder = NullVectorBuilder::default();
404        assert_eq!(builder.data_type(), ConcreteDataType::null_datatype());
405
406        // Decimal128 type
407        let builder = Decimal128VectorBuilder::with_capacity(1024);
408        assert_eq!(
409            builder.data_type(),
410            ConcreteDataType::decimal128_datatype(38, 10)
411        );
412
413        let builder = Decimal128VectorBuilder::with_capacity(1024)
414            .with_precision_and_scale(3, 2)
415            .unwrap();
416        assert_eq!(
417            builder.data_type(),
418            ConcreteDataType::decimal128_datatype(3, 2)
419        );
420    }
421
422    #[test]
423    #[should_panic(expected = "Must use ListVectorBuilder::with_type_capacity()")]
424    fn test_mutable_vector_list_data_type() {
425        // List type
426        let builder =
427            ListVectorBuilder::with_type_capacity(ConcreteDataType::int32_datatype(), 1024);
428        assert_eq!(
429            builder.data_type(),
430            ConcreteDataType::list_datatype(ConcreteDataType::int32_datatype())
431        );
432
433        // Panic with_capacity
434        let _ = ListVectorBuilder::with_capacity(1024);
435    }
436
437    #[test]
438    fn test_mutable_vector_to_vector_cloned() {
439        // create a string vector builder
440        let mut builder = ConcreteDataType::string_datatype().create_mutable_vector(1024);
441        builder.push_value_ref(ValueRef::String("hello"));
442        builder.push_value_ref(ValueRef::String("world"));
443        builder.push_value_ref(ValueRef::String("!"));
444
445        // use MutableVector trait to_vector_cloned won't reset builder
446        let vector = builder.to_vector_cloned();
447        assert_eq!(vector.len(), 3);
448        assert_eq!(builder.len(), 3);
449    }
450}