datatypes/
vectors.rs

1// Copyright 2023 Greptime Team
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15use std::any::Any;
16use std::fmt::Debug;
17use std::sync::Arc;
18
19use arrow::array::{Array, ArrayRef};
20use snafu::ensure;
21
22use crate::data_type::ConcreteDataType;
23use crate::error::{self, Result};
24use crate::serialize::Serializable;
25use crate::value::{Value, ValueRef};
26use crate::vectors::operations::VectorOp;
27
28mod binary;
29mod boolean;
30mod constant;
31mod date;
32mod decimal;
33mod dictionary;
34mod duration;
35mod eq;
36mod helper;
37mod interval;
38mod list;
39mod null;
40pub(crate) mod operations;
41mod primitive;
42mod string;
43mod struct_vector;
44mod time;
45mod timestamp;
46mod validity;
47
48pub use binary::{BinaryVector, BinaryVectorBuilder};
49pub use boolean::{BooleanVector, BooleanVectorBuilder};
50pub use constant::ConstantVector;
51pub use date::{DateVector, DateVectorBuilder};
52pub use decimal::{Decimal128Vector, Decimal128VectorBuilder};
53pub use dictionary::{DictionaryIter, DictionaryVector};
54pub use duration::{
55    DurationMicrosecondVector, DurationMicrosecondVectorBuilder, DurationMillisecondVector,
56    DurationMillisecondVectorBuilder, DurationNanosecondVector, DurationNanosecondVectorBuilder,
57    DurationSecondVector, DurationSecondVectorBuilder,
58};
59pub use helper::Helper;
60pub use interval::{
61    IntervalDayTimeVector, IntervalDayTimeVectorBuilder, IntervalMonthDayNanoVector,
62    IntervalMonthDayNanoVectorBuilder, IntervalYearMonthVector, IntervalYearMonthVectorBuilder,
63};
64pub use list::{ListIter, ListVector, ListVectorBuilder};
65pub use null::{NullVector, NullVectorBuilder};
66pub use primitive::{
67    Float32Vector, Float32VectorBuilder, Float64Vector, Float64VectorBuilder, Int16Vector,
68    Int16VectorBuilder, Int32Vector, Int32VectorBuilder, Int64Vector, Int64VectorBuilder,
69    Int8Vector, Int8VectorBuilder, PrimitiveIter, PrimitiveVector, PrimitiveVectorBuilder,
70    UInt16Vector, UInt16VectorBuilder, UInt32Vector, UInt32VectorBuilder, UInt64Vector,
71    UInt64VectorBuilder, UInt8Vector, UInt8VectorBuilder,
72};
73pub use string::{StringVector, StringVectorBuilder};
74pub use time::{
75    TimeMicrosecondVector, TimeMicrosecondVectorBuilder, TimeMillisecondVector,
76    TimeMillisecondVectorBuilder, TimeNanosecondVector, TimeNanosecondVectorBuilder,
77    TimeSecondVector, TimeSecondVectorBuilder,
78};
79pub use timestamp::{
80    TimestampMicrosecondVector, TimestampMicrosecondVectorBuilder, TimestampMillisecondVector,
81    TimestampMillisecondVectorBuilder, TimestampNanosecondVector, TimestampNanosecondVectorBuilder,
82    TimestampSecondVector, TimestampSecondVectorBuilder,
83};
84pub use validity::Validity;
85
// TODO(yingwen): arrow 28.0 implements `Clone` for all arrays; we could upgrade to it and
// simplify the code in methods such as `to_arrow_array()` and `to_boxed_arrow_array()`.
88/// Vector of data values.
89pub trait Vector: Send + Sync + Serializable + Debug + VectorOp {
90    /// Returns the data type of the vector.
91    ///
92    /// This may require heap allocation.
93    fn data_type(&self) -> ConcreteDataType;
94
95    fn vector_type_name(&self) -> String;
96
97    /// Returns the vector as [Any](std::any::Any) so that it can be
98    /// downcast to a specific implementation.
99    fn as_any(&self) -> &dyn Any;
100
101    /// Returns number of elements in the vector.
102    fn len(&self) -> usize;
103
104    /// Returns whether the vector is empty.
105    fn is_empty(&self) -> bool {
106        self.len() == 0
107    }
108
109    /// Convert this vector to a new arrow [ArrayRef].
110    fn to_arrow_array(&self) -> ArrayRef;
111
112    /// Convert this vector to a new boxed arrow [Array].
113    fn to_boxed_arrow_array(&self) -> Box<dyn Array>;
114
115    /// Returns the validity of the Array.
116    fn validity(&self) -> Validity;
117
118    /// Returns the memory size of vector.
119    fn memory_size(&self) -> usize;
120
121    /// The number of null slots on this [`Vector`].
122    /// # Implementation
123    /// This is `O(1)`.
124    fn null_count(&self) -> usize;
125
126    /// Returns true when it's a ConstantColumn
127    fn is_const(&self) -> bool {
128        false
129    }
130
131    /// Returns whether row is null.
132    fn is_null(&self, row: usize) -> bool;
133
134    /// If the vector only contains NULL.
135    fn only_null(&self) -> bool {
136        self.null_count() == self.len()
137    }
138
139    /// Slices the `Vector`, returning a new `VectorRef`.
140    ///
141    /// # Panics
142    /// This function panics if `offset + length > self.len()`.
143    fn slice(&self, offset: usize, length: usize) -> VectorRef;
144
145    /// Returns the clone of value at `index`.
146    ///
147    /// # Panics
148    /// Panic if `index` is out of bound.
149    fn get(&self, index: usize) -> Value;
150
151    /// Returns the clone of value at `index` or error if `index`
152    /// is out of bound.
153    fn try_get(&self, index: usize) -> Result<Value> {
154        ensure!(
155            index < self.len(),
156            error::BadArrayAccessSnafu {
157                index,
158                size: self.len()
159            }
160        );
161        Ok(self.get(index))
162    }
163
164    /// Returns the reference of value at `index`.
165    ///
166    /// # Panics
167    /// Panic if `index` is out of bound.
168    fn get_ref(&self, index: usize) -> ValueRef;
169}
170
/// Shared handle to a type-erased [Vector]: an [Arc] around the trait object.
pub type VectorRef = Arc<dyn Vector>;
172
173/// Mutable vector that could be used to build an immutable vector.
174pub trait MutableVector: Send + Sync {
175    /// Returns the data type of the vector.
176    fn data_type(&self) -> ConcreteDataType;
177
178    /// Returns the length of the vector.
179    fn len(&self) -> usize;
180
181    /// Returns whether the vector is empty.
182    fn is_empty(&self) -> bool {
183        self.len() == 0
184    }
185
186    /// Convert to Any, to enable dynamic casting.
187    fn as_any(&self) -> &dyn Any;
188
189    /// Convert to mutable Any, to enable dynamic casting.
190    fn as_mut_any(&mut self) -> &mut dyn Any;
191
192    /// Convert `self` to an (immutable) [VectorRef] and reset `self`.
193    fn to_vector(&mut self) -> VectorRef;
194
195    /// Convert `self` to an (immutable) [VectorRef] and without resetting `self`.
196    fn to_vector_cloned(&self) -> VectorRef;
197
198    /// Try to push value ref to this mutable vector.
199    fn try_push_value_ref(&mut self, value: ValueRef) -> Result<()>;
200
201    /// Push value ref to this mutable vector.
202    ///
203    /// # Panics
204    /// Panics if error if data types mismatch.
205    fn push_value_ref(&mut self, value: ValueRef) {
206        self.try_push_value_ref(value).unwrap_or_else(|_| {
207            panic!(
208                "expecting pushing value of datatype {:?}, actual {:?}",
209                self.data_type(),
210                value
211            );
212        });
213    }
214
215    /// Push null to this mutable vector.
216    fn push_null(&mut self);
217
218    /// Push nulls to this mutable vector.
219    fn push_nulls(&mut self, num_nulls: usize) {
220        for _ in 0..num_nulls {
221            self.push_null();
222        }
223    }
224
225    /// Extend this mutable vector by slice of `vector`.
226    ///
227    /// Returns error if data types mismatch.
228    ///
229    /// # Panics
230    /// Panics if `offset + length > vector.len()`.
231    fn extend_slice_of(&mut self, vector: &dyn Vector, offset: usize, length: usize) -> Result<()>;
232}
233
/// Generates an inherent `try_from_arrow_array()` constructor on `$Vector`.
///
/// The generated function takes anything that is `AsRef<dyn arrow::array::Array>`,
/// downcasts it to the concrete `$Array` type, clones that array and builds the
/// vector through `$Vector::from`. When the downcast fails it returns a
/// `ConversionSnafu` error whose `from` field is the `Debug` rendering of the
/// array's data type.
macro_rules! impl_try_from_arrow_array_for_vector {
    ($Array: ident, $Vector: ident) => {
        impl $Vector {
            pub fn try_from_arrow_array(
                array: impl AsRef<dyn arrow::array::Array>,
            ) -> crate::error::Result<$Vector> {
                use snafu::OptionExt;

                // Borrow the dynamic array once; it is needed both for the
                // downcast and for describing the type on failure.
                let source = array.as_ref();
                let typed = source
                    .as_any()
                    .downcast_ref::<$Array>()
                    .with_context(|| crate::error::ConversionSnafu {
                        from: std::format!("{:?}", source.data_type()),
                    })?;

                Ok($Vector::from(typed.clone()))
            }
        }
    };
}
257
/// Expands to an expression that builds a `Validity` for `$array` by passing
/// the result of `$array.to_data()` to `Validity::from_array_data`.
macro_rules! impl_validity_for_vector {
    ($array: expr) => {{
        let array_data = $array.to_data();
        Validity::from_array_data(array_data)
    }};
}
263
/// Expands to an expression yielding the owned `Value` stored at `$index` of `$array`.
///
/// When `$array.is_valid($index)` is false the expression is `Value::Null`;
/// otherwise the element returned by `$array.value($index)` is converted with
/// `Into`.
///
/// # Panics
/// Panics when `$index` is out of bounds, as raised by the array's checked
/// `value()` accessor.
macro_rules! impl_get_for_vector {
    ($array: expr, $index: ident) => {
        if $array.is_valid($index) {
            // `is_valid()` is a null check, not a bounds check: for an arrow array
            // without a null buffer it reports `true` for every index. Use the
            // checked accessor so an out-of-bounds index panics instead of
            // reading past the end of the buffer.
            $array.value($index).into()
        } else {
            Value::Null
        }
    };
}
274
/// Expands to an expression yielding the `ValueRef` for slot `$index` of `$array`.
///
/// When `$array.is_valid($index)` is false the expression is `ValueRef::Null`;
/// otherwise the element returned by `$array.value($index)` is converted with
/// `Into`.
///
/// # Panics
/// Panics when `$index` is out of bounds, as raised by the array's checked
/// `value()` accessor.
macro_rules! impl_get_ref_for_vector {
    ($array: expr, $index: ident) => {
        if $array.is_valid($index) {
            // `is_valid()` is a null check, not a bounds check: for an arrow array
            // without a null buffer it reports `true` for every index. Use the
            // checked accessor so an out-of-bounds index panics instead of
            // reading past the end of the buffer.
            $array.value($index).into()
        } else {
            ValueRef::Null
        }
    };
}
285
/// Expands to the body of an `extend_slice_of`-style method.
///
/// It slices `$vector` with `$offset` and `$length`, downcasts the slice to
/// `$VectorType` and pushes every item produced by `iter_data()` into
/// `$mutable_vector`. A failed downcast is propagated with `?` as a
/// `CastTypeSnafu` error, so the expansion must sit in a function returning the
/// crate's `Result`; on success the expression evaluates to `Ok(())`.
macro_rules! impl_extend_for_builder {
    ($mutable_vector: expr, $vector: ident, $VectorType: ident, $offset: ident, $length: ident) => {{
        use snafu::OptionExt;

        // Slice first, then recover the concrete type from the sliced vector.
        let window = $vector.slice($offset, $length);
        let describe_failure = || crate::error::CastTypeSnafu {
            msg: format!(
                "Failed to cast vector from {} to {}",
                $vector.vector_type_name(),
                stringify!($VectorType)
            ),
        };
        let typed = window
            .as_any()
            .downcast_ref::<$VectorType>()
            .with_context(describe_failure)?;

        for item in typed.iter_data() {
            $mutable_vector.push(item);
        }
        Ok(())
    }};
}
307
308pub(crate) use {
309    impl_extend_for_builder, impl_get_for_vector, impl_get_ref_for_vector,
310    impl_try_from_arrow_array_for_vector, impl_validity_for_vector,
311};
312
/// Unit tests for the vector traits, the arrow conversion helper and the
/// mutable vector builders.
#[cfg(test)]
pub mod tests {
    use arrow::array::{Array, Int32Array, UInt8Array};
    use paste::paste;
    use serde_json;

    use super::*;
    use crate::data_type::DataType;
    use crate::prelude::ScalarVectorBuilder;
    use crate::types::{Int32Type, LogicalPrimitiveType};
    use crate::vectors::helper::Helper;

    /// An arrow `Int32Array` converts into a vector with the matching arrow type.
    #[test]
    fn test_df_columns_to_vector() {
        let df_column: Arc<dyn Array> = Arc::new(Int32Array::from(vec![1, 2, 3]));
        let vector = Helper::try_into_vector(df_column).unwrap();
        assert_eq!(
            Int32Type::build_data_type().as_arrow_type(),
            vector.data_type().as_arrow_type()
        );
    }

    /// A vector built from an `Int32Array` serializes to a JSON array of numbers.
    #[test]
    fn test_serialize_i32_vector() {
        let df_column: Arc<dyn Array> = Arc::new(Int32Array::from(vec![1, 2, 3]));
        let json_value = Helper::try_into_vector(df_column)
            .unwrap()
            .serialize_to_json()
            .unwrap();
        assert_eq!("[1,2,3]", serde_json::to_string(&json_value).unwrap());
    }

    /// A vector built from a `UInt8Array` serializes to a JSON array of numbers.
    ///
    /// Named `u8` because the input is a `UInt8Array`, not an `Int8Array`.
    #[test]
    fn test_serialize_u8_vector() {
        let df_column: Arc<dyn Array> = Arc::new(UInt8Array::from(vec![1, 2, 3]));
        let json_value = Helper::try_into_vector(df_column)
            .unwrap()
            .serialize_to_json()
            .unwrap();
        assert_eq!("[1,2,3]", serde_json::to_string(&json_value).unwrap());
    }

    /// Every builder reports the `ConcreteDataType` that corresponds to its name.
    #[test]
    fn test_mutable_vector_data_type() {
        // Maps `Foo` to `FooVectorBuilder` and `ConcreteDataType::foo_datatype()`
        // (plain lower-casing of the type name).
        macro_rules! mutable_primitive_data_type_eq_with_lower {
            ($($type: ident),*) => {
                $(
                    paste! {
                        let mutable_vector = [<$type VectorBuilder>]::with_capacity(1024);
                        assert_eq!(mutable_vector.data_type(), ConcreteDataType::[<$type:lower _datatype>]());
                    }
                )*
            };
        }

        // Maps `FooBar` to `FooBarVectorBuilder` and
        // `ConcreteDataType::foo_bar_datatype()` (snake-casing of the type name).
        macro_rules! mutable_time_data_type_eq_with_snake {
            ($($type: ident),*) => {
                $(
                    paste! {
                        let mutable_vector = [<$type VectorBuilder>]::with_capacity(1024);
                        assert_eq!(mutable_vector.data_type(), ConcreteDataType::[<$type:snake _datatype>]());
                    }
                )*
            };
        }
        // Test Primitive types
        mutable_primitive_data_type_eq_with_lower!(
            Boolean, Int8, Int16, Int32, Int64, UInt8, UInt16, UInt32, UInt64, Float32, Float64,
            Date, Binary, String
        );

        // Test types about time
        mutable_time_data_type_eq_with_snake!(
            TimeSecond,
            TimeMillisecond,
            TimeMicrosecond,
            TimeNanosecond,
            TimestampSecond,
            TimestampMillisecond,
            TimestampMicrosecond,
            TimestampNanosecond,
            DurationSecond,
            DurationMillisecond,
            DurationMicrosecond,
            DurationNanosecond,
            IntervalYearMonth,
            IntervalDayTime,
            IntervalMonthDayNano
        );

        // Null type
        let builder = NullVectorBuilder::default();
        assert_eq!(builder.data_type(), ConcreteDataType::null_datatype());

        // Decimal128 type
        let builder = Decimal128VectorBuilder::with_capacity(1024);
        assert_eq!(
            builder.data_type(),
            ConcreteDataType::decimal128_datatype(38, 10)
        );

        let builder = Decimal128VectorBuilder::with_capacity(1024)
            .with_precision_and_scale(3, 2)
            .unwrap();
        assert_eq!(
            builder.data_type(),
            ConcreteDataType::decimal128_datatype(3, 2)
        );
    }

    /// `ListVectorBuilder::with_type_capacity()` reports a list data type, while
    /// `ListVectorBuilder::with_capacity()` is expected to panic.
    #[test]
    #[should_panic(expected = "Must use ListVectorBuilder::with_type_capacity()")]
    fn test_mutable_vector_list_data_type() {
        // List type
        let builder =
            ListVectorBuilder::with_type_capacity(ConcreteDataType::int32_datatype(), 1024);
        assert_eq!(
            builder.data_type(),
            ConcreteDataType::list_datatype(ConcreteDataType::int32_datatype())
        );

        // Panic with_capacity
        let _ = ListVectorBuilder::with_capacity(1024);
    }

    /// `to_vector_cloned()` produces a vector without resetting the builder.
    #[test]
    fn test_mutable_vector_to_vector_cloned() {
        // create a string vector builder
        let mut builder = ConcreteDataType::string_datatype().create_mutable_vector(1024);
        builder.push_value_ref(ValueRef::String("hello"));
        builder.push_value_ref(ValueRef::String("world"));
        builder.push_value_ref(ValueRef::String("!"));

        // use MutableVector trait to_vector_cloned won't reset builder
        let vector = builder.to_vector_cloned();
        assert_eq!(vector.len(), 3);
        assert_eq!(builder.len(), 3);
    }
}