datatypes/
vectors.rs

1// Copyright 2023 Greptime Team
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15use std::any::Any;
16use std::fmt::Debug;
17use std::sync::Arc;
18
19use arrow::array::{Array, ArrayRef};
20use snafu::ensure;
21
22use crate::data_type::ConcreteDataType;
23use crate::error::{self, Result};
24use crate::serialize::Serializable;
25use crate::value::{Value, ValueRef};
26use crate::vectors::operations::VectorOp;
27
28mod binary;
29mod boolean;
30mod constant;
31mod date;
32mod decimal;
33mod dictionary;
34mod duration;
35mod eq;
36mod helper;
37mod interval;
38pub(crate) mod json;
39mod list;
40mod null;
41pub(crate) mod operations;
42mod primitive;
43mod string;
44mod struct_vector;
45mod time;
46mod timestamp;
47mod validity;
48
49pub use binary::{BinaryVector, BinaryVectorBuilder};
50pub use boolean::{BooleanVector, BooleanVectorBuilder};
51pub use constant::ConstantVector;
52pub use date::{DateVector, DateVectorBuilder};
53pub use decimal::{Decimal128Vector, Decimal128VectorBuilder};
54pub use dictionary::{DictionaryIter, DictionaryVector};
55pub use duration::{
56    DurationMicrosecondVector, DurationMicrosecondVectorBuilder, DurationMillisecondVector,
57    DurationMillisecondVectorBuilder, DurationNanosecondVector, DurationNanosecondVectorBuilder,
58    DurationSecondVector, DurationSecondVectorBuilder,
59};
60pub use helper::Helper;
61pub use interval::{
62    IntervalDayTimeVector, IntervalDayTimeVectorBuilder, IntervalMonthDayNanoVector,
63    IntervalMonthDayNanoVectorBuilder, IntervalYearMonthVector, IntervalYearMonthVectorBuilder,
64};
65pub use list::{ListIter, ListVector, ListVectorBuilder};
66pub use null::{NullVector, NullVectorBuilder};
67pub use primitive::{
68    Float32Vector, Float32VectorBuilder, Float64Vector, Float64VectorBuilder, Int8Vector,
69    Int8VectorBuilder, Int16Vector, Int16VectorBuilder, Int32Vector, Int32VectorBuilder,
70    Int64Vector, Int64VectorBuilder, PrimitiveIter, PrimitiveVector, PrimitiveVectorBuilder,
71    UInt8Vector, UInt8VectorBuilder, UInt16Vector, UInt16VectorBuilder, UInt32Vector,
72    UInt32VectorBuilder, UInt64Vector, UInt64VectorBuilder,
73};
74pub use string::{StringVector, StringVectorBuilder};
75pub use struct_vector::{StructVector, StructVectorBuilder};
76pub use time::{
77    TimeMicrosecondVector, TimeMicrosecondVectorBuilder, TimeMillisecondVector,
78    TimeMillisecondVectorBuilder, TimeNanosecondVector, TimeNanosecondVectorBuilder,
79    TimeSecondVector, TimeSecondVectorBuilder,
80};
81pub use timestamp::{
82    TimestampMicrosecondVector, TimestampMicrosecondVectorBuilder, TimestampMillisecondVector,
83    TimestampMillisecondVectorBuilder, TimestampNanosecondVector, TimestampNanosecondVectorBuilder,
84    TimestampSecondVector, TimestampSecondVectorBuilder,
85};
86pub use validity::Validity;
87
// TODO(yingwen): arrow 28.0 implements `Clone` for all arrays; we could upgrade to it and
// simplify some code in methods such as `to_arrow_array()` and `to_boxed_arrow_array()`.
90/// Vector of data values.
91pub trait Vector: Send + Sync + Serializable + Debug + VectorOp {
92    /// Returns the data type of the vector.
93    ///
94    /// This may require heap allocation.
95    fn data_type(&self) -> ConcreteDataType;
96
97    fn vector_type_name(&self) -> String;
98
99    /// Returns the vector as [Any](std::any::Any) so that it can be
100    /// downcast to a specific implementation.
101    fn as_any(&self) -> &dyn Any;
102
103    /// Returns number of elements in the vector.
104    fn len(&self) -> usize;
105
106    /// Returns whether the vector is empty.
107    fn is_empty(&self) -> bool {
108        self.len() == 0
109    }
110
111    /// Convert this vector to a new arrow [ArrayRef].
112    fn to_arrow_array(&self) -> ArrayRef;
113
114    /// Convert this vector to a new boxed arrow [Array].
115    fn to_boxed_arrow_array(&self) -> Box<dyn Array>;
116
117    /// Returns the validity of the Array.
118    fn validity(&self) -> Validity;
119
120    /// Returns the memory size of vector.
121    fn memory_size(&self) -> usize;
122
123    /// The number of null slots on this [`Vector`].
124    /// # Implementation
125    /// This is `O(1)`.
126    fn null_count(&self) -> usize;
127
128    /// Returns true when it's a ConstantColumn
129    fn is_const(&self) -> bool {
130        false
131    }
132
133    /// Returns whether row is null.
134    fn is_null(&self, row: usize) -> bool;
135
136    /// If the vector only contains NULL.
137    fn only_null(&self) -> bool {
138        self.null_count() == self.len()
139    }
140
141    /// Slices the `Vector`, returning a new `VectorRef`.
142    ///
143    /// # Panics
144    /// This function panics if `offset + length > self.len()`.
145    fn slice(&self, offset: usize, length: usize) -> VectorRef;
146
147    /// Returns the clone of value at `index`.
148    ///
149    /// # Panics
150    /// Panic if `index` is out of bound.
151    fn get(&self, index: usize) -> Value;
152
153    /// Returns the clone of value at `index` or error if `index`
154    /// is out of bound.
155    fn try_get(&self, index: usize) -> Result<Value> {
156        ensure!(
157            index < self.len(),
158            error::BadArrayAccessSnafu {
159                index,
160                size: self.len()
161            }
162        );
163        Ok(self.get(index))
164    }
165
166    /// Returns the reference of value at `index`.
167    ///
168    /// # Panics
169    /// Panic if `index` is out of bound.
170    fn get_ref(&self, index: usize) -> ValueRef<'_>;
171}
172
173pub type VectorRef = Arc<dyn Vector>;
174
175/// Mutable vector that could be used to build an immutable vector.
176pub trait MutableVector: Send + Sync {
177    /// Returns the data type of the vector.
178    fn data_type(&self) -> ConcreteDataType;
179
180    /// Returns the length of the vector.
181    fn len(&self) -> usize;
182
183    /// Returns whether the vector is empty.
184    fn is_empty(&self) -> bool {
185        self.len() == 0
186    }
187
188    /// Convert to Any, to enable dynamic casting.
189    fn as_any(&self) -> &dyn Any;
190
191    /// Convert to mutable Any, to enable dynamic casting.
192    fn as_mut_any(&mut self) -> &mut dyn Any;
193
194    /// Convert `self` to an (immutable) [VectorRef] and reset `self`.
195    fn to_vector(&mut self) -> VectorRef;
196
197    /// Convert `self` to an (immutable) [VectorRef] and without resetting `self`.
198    fn to_vector_cloned(&self) -> VectorRef;
199
200    /// Try to push value ref to this mutable vector.
201    fn try_push_value_ref(&mut self, value: &ValueRef) -> Result<()>;
202
203    /// Push value ref to this mutable vector.
204    ///
205    /// # Panics
206    /// Panics if error if data types mismatch.
207    fn push_value_ref(&mut self, value: &ValueRef) {
208        self.try_push_value_ref(value).unwrap_or_else(|_| {
209            panic!(
210                "expecting pushing value of datatype {:?}, actual {:?}",
211                self.data_type(),
212                value
213            );
214        });
215    }
216
217    /// Push null to this mutable vector.
218    fn push_null(&mut self);
219
220    /// Push nulls to this mutable vector.
221    fn push_nulls(&mut self, num_nulls: usize) {
222        for _ in 0..num_nulls {
223            self.push_null();
224        }
225    }
226
227    /// Extend this mutable vector by slice of `vector`.
228    ///
229    /// Returns error if data types mismatch.
230    ///
231    /// # Panics
232    /// Panics if `offset + length > vector.len()`.
233    fn extend_slice_of(&mut self, vector: &dyn Vector, offset: usize, length: usize) -> Result<()>;
234}
235
/// Generates an inherent `try_from_arrow_array()` constructor on `$Vector`.
///
/// The generated function takes anything that can be viewed as a dynamic arrow
/// array, downcasts it to `$Array`, and converts a clone of that array with
/// `$Vector::from`. When the downcast fails, a conversion error carrying the
/// debug representation of the array's arrow data type is returned.
macro_rules! impl_try_from_arrow_array_for_vector {
    ($Array: ident, $Vector: ident) => {
        impl $Vector {
            pub fn try_from_arrow_array(
                array: impl AsRef<dyn arrow::array::Array>,
            ) -> crate::error::Result<$Vector> {
                use snafu::OptionExt;

                let source = array.as_ref();
                let typed = source
                    .as_any()
                    .downcast_ref::<$Array>()
                    .with_context(|| crate::error::ConversionSnafu {
                        from: std::format!("{:?}", source.data_type()),
                    })?;

                Ok($Vector::from(typed.clone()))
            }
        }
    };
}
259
/// Helper to compute a vector's `Validity` from an arrow array expression.
///
/// Expands to an expression that calls `to_data()` on `$array` and feeds the
/// result to `Validity::from_array_data()`.
macro_rules! impl_validity_for_vector {
    ($array: expr) => {{
        let array_data = $array.to_data();
        Validity::from_array_data(array_data)
    }};
}
265
/// Helper to implement `Vector::get()` on top of an arrow array.
///
/// Expands to an expression yielding `Value::Null` for a null slot, or the
/// slot's value converted with `.into()` otherwise.
///
/// # Panics
/// The expansion panics if `$index` is not smaller than `$array.len()`.
macro_rules! impl_get_for_vector {
    ($array: expr, $index: ident) => {{
        // `is_valid()` only consults the null buffer; for an array without
        // nulls it performs no bounds check at all. Check the index explicitly
        // so an out-of-bound access panics instead of reaching
        // `value_unchecked()`.
        assert!(
            $index < $array.len(),
            "index out of bounds: the len is {} but the index is {}",
            $array.len(),
            $index
        );
        if $array.is_valid($index) {
            // SAFETY: `$index < $array.len()` has been asserted above.
            unsafe { $array.value_unchecked($index).into() }
        } else {
            Value::Null
        }
    }};
}
276
/// Helper to implement `Vector::get_ref()` on top of an arrow array.
///
/// Expands to an expression yielding `ValueRef::Null` for a null slot, or the
/// slot's value converted with `.into()` otherwise.
///
/// # Panics
/// The expansion panics if `$index` is not smaller than `$array.len()`.
macro_rules! impl_get_ref_for_vector {
    ($array: expr, $index: ident) => {{
        // `is_valid()` only consults the null buffer; for an array without
        // nulls it performs no bounds check at all. Check the index explicitly
        // so an out-of-bound access panics instead of reaching
        // `value_unchecked()`.
        assert!(
            $index < $array.len(),
            "index out of bounds: the len is {} but the index is {}",
            $array.len(),
            $index
        );
        if $array.is_valid($index) {
            // SAFETY: `$index < $array.len()` has been asserted above.
            unsafe { $array.value_unchecked($index).into() }
        } else {
            ValueRef::Null
        }
    }};
}
287
/// Helper to implement `MutableVector::extend_slice_of()` for a builder.
///
/// The expansion slices `$vector` with `$offset`/`$length`, downcasts the
/// slice to `$VectorType` (failing with a cast-type error that names both
/// vector types), then pushes every item yielded by `iter_data()` into
/// `$mutable_vector` and evaluates to `Ok(())`.
///
/// Because it uses `?`, the macro must be expanded inside a function that
/// returns a compatible `Result`.
macro_rules! impl_extend_for_builder {
    ($mutable_vector: expr, $vector: ident, $VectorType: ident, $offset: ident, $length: ident) => {{
        use snafu::OptionExt;

        // The slice is taken before the downcast, exactly in that order.
        let window = $vector.slice($offset, $length);
        let typed = window
            .as_any()
            .downcast_ref::<$VectorType>()
            .with_context(|| {
                let msg = format!(
                    "Failed to cast vector from {} to {}",
                    $vector.vector_type_name(),
                    stringify!($VectorType)
                );
                crate::error::CastTypeSnafu { msg }
            })?;
        for item in typed.iter_data() {
            $mutable_vector.push(item);
        }
        Ok(())
    }};
}
309
310pub(crate) use {
311    impl_extend_for_builder, impl_get_for_vector, impl_get_ref_for_vector,
312    impl_try_from_arrow_array_for_vector, impl_validity_for_vector,
313};
314
/// Unit tests for arrow-array conversion, JSON serialization and the data
/// types reported by the mutable vector builders.
#[cfg(test)]
pub mod tests {
    use arrow::array::{Array, Int8Array, Int32Array, UInt8Array};
    use paste::paste;
    use serde_json;

    use super::*;
    use crate::data_type::DataType;
    use crate::prelude::ScalarVectorBuilder;
    use crate::types::{Int32Type, LogicalPrimitiveType};
    use crate::vectors::helper::Helper;

    /// An arrow `Int32Array` converts into a vector reporting the int32 arrow type.
    #[test]
    fn test_df_columns_to_vector() {
        let df_column: Arc<dyn Array> = Arc::new(Int32Array::from(vec![1, 2, 3]));
        let vector = Helper::try_into_vector(df_column).unwrap();
        assert_eq!(
            Int32Type::build_data_type().as_arrow_type(),
            vector.data_type().as_arrow_type()
        );
    }

    /// An i32 vector serializes to a JSON array of numbers.
    #[test]
    fn test_serialize_i32_vector() {
        let df_column: Arc<dyn Array> = Arc::new(Int32Array::from(vec![1, 2, 3]));
        let json_value = Helper::try_into_vector(df_column)
            .unwrap()
            .serialize_to_json()
            .unwrap();
        assert_eq!("[1,2,3]", serde_json::to_string(&json_value).unwrap());
    }

    /// An i8 vector serializes to a JSON array of numbers.
    ///
    /// This test used to build a `UInt8Array` despite its name; it now really
    /// exercises the signed 8-bit path, including a negative value.
    #[test]
    fn test_serialize_i8_vector() {
        let df_column: Arc<dyn Array> = Arc::new(Int8Array::from(vec![-1_i8, 0, 1]));
        let json_value = Helper::try_into_vector(df_column)
            .unwrap()
            .serialize_to_json()
            .unwrap();
        assert_eq!("[-1,0,1]", serde_json::to_string(&json_value).unwrap());
    }

    /// A u8 vector serializes to a JSON array of numbers.
    #[test]
    fn test_serialize_u8_vector() {
        let df_column: Arc<dyn Array> = Arc::new(UInt8Array::from(vec![1, 2, 3]));
        let json_value = Helper::try_into_vector(df_column)
            .unwrap()
            .serialize_to_json()
            .unwrap();
        assert_eq!("[1,2,3]", serde_json::to_string(&json_value).unwrap());
    }

    /// Every builder reports the `ConcreteDataType` matching its name.
    #[test]
    fn test_mutable_vector_data_type() {
        // `<Type>VectorBuilder` <-> `ConcreteDataType::<type lowercased>_datatype()`.
        macro_rules! mutable_primitive_data_type_eq_with_lower {
            ($($type: ident),*) => {
                $(
                    paste! {
                        let mutable_vector = [<$type VectorBuilder>]::with_capacity(1024);
                        assert_eq!(mutable_vector.data_type(), ConcreteDataType::[<$type:lower _datatype>]());
                    }
                )*
            };
        }

        // `<Type>VectorBuilder` <-> `ConcreteDataType::<type in snake_case>_datatype()`.
        macro_rules! mutable_time_data_type_eq_with_snake {
            ($($type: ident),*) => {
                $(
                    paste! {
                        let mutable_vector = [<$type VectorBuilder>]::with_capacity(1024);
                        assert_eq!(mutable_vector.data_type(), ConcreteDataType::[<$type:snake _datatype>]());
                    }
                )*
            };
        }
        // Test Primitive types
        mutable_primitive_data_type_eq_with_lower!(
            Boolean, Int8, Int16, Int32, Int64, UInt8, UInt16, UInt32, UInt64, Float32, Float64,
            Date, Binary, String
        );

        // Test types about time
        mutable_time_data_type_eq_with_snake!(
            TimeSecond,
            TimeMillisecond,
            TimeMicrosecond,
            TimeNanosecond,
            TimestampSecond,
            TimestampMillisecond,
            TimestampMicrosecond,
            TimestampNanosecond,
            DurationSecond,
            DurationMillisecond,
            DurationMicrosecond,
            DurationNanosecond,
            IntervalYearMonth,
            IntervalDayTime,
            IntervalMonthDayNano
        );

        // Null type
        let builder = NullVectorBuilder::default();
        assert_eq!(builder.data_type(), ConcreteDataType::null_datatype());

        // Decimal128 type
        let builder = Decimal128VectorBuilder::with_capacity(1024);
        assert_eq!(
            builder.data_type(),
            ConcreteDataType::decimal128_datatype(38, 10)
        );

        let builder = Decimal128VectorBuilder::with_capacity(1024)
            .with_precision_and_scale(3, 2)
            .unwrap();
        assert_eq!(
            builder.data_type(),
            ConcreteDataType::decimal128_datatype(3, 2)
        );
    }

    /// A list builder created with an item type reports the list data type;
    /// the final `with_capacity()` call is the one expected to panic.
    #[test]
    #[should_panic(expected = "Must use ListVectorBuilder::with_type_capacity()")]
    fn test_mutable_vector_list_data_type() {
        let item_type = Arc::new(ConcreteDataType::int32_datatype());
        // List type
        let builder = ListVectorBuilder::with_type_capacity(item_type.clone(), 1024);
        assert_eq!(
            builder.data_type(),
            ConcreteDataType::list_datatype(item_type)
        );

        // Panic with_capacity
        let _ = ListVectorBuilder::with_capacity(1024);
    }

    /// `to_vector_cloned()` must leave the builder's content in place.
    #[test]
    fn test_mutable_vector_to_vector_cloned() {
        // create a string vector builder
        let mut builder = ConcreteDataType::string_datatype().create_mutable_vector(1024);
        builder.push_value_ref(&ValueRef::String("hello"));
        builder.push_value_ref(&ValueRef::String("world"));
        builder.push_value_ref(&ValueRef::String("!"));

        // use MutableVector trait to_vector_cloned won't reset builder
        let vector = builder.to_vector_cloned();
        assert_eq!(vector.len(), 3);
        assert_eq!(builder.len(), 3);
    }
}