datatypes/
vectors.rs

1// Copyright 2023 Greptime Team
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15use std::any::Any;
16use std::fmt::Debug;
17use std::sync::Arc;
18
19use arrow::array::{Array, ArrayRef};
20use snafu::ensure;
21
22use crate::data_type::ConcreteDataType;
23use crate::error::{self, Result};
24use crate::serialize::Serializable;
25use crate::value::{Value, ValueRef};
26use crate::vectors::operations::VectorOp;
27
28mod binary;
29mod boolean;
30mod constant;
31mod date;
32mod decimal;
33mod dictionary;
34mod duration;
35mod eq;
36mod helper;
37mod interval;
38mod list;
39mod null;
40pub(crate) mod operations;
41mod primitive;
42mod string;
43mod struct_vector;
44mod time;
45mod timestamp;
46mod validity;
47
48pub use binary::{BinaryVector, BinaryVectorBuilder};
49pub use boolean::{BooleanVector, BooleanVectorBuilder};
50pub use constant::ConstantVector;
51pub use date::{DateVector, DateVectorBuilder};
52pub use decimal::{Decimal128Vector, Decimal128VectorBuilder};
53pub use dictionary::{DictionaryIter, DictionaryVector};
54pub use duration::{
55    DurationMicrosecondVector, DurationMicrosecondVectorBuilder, DurationMillisecondVector,
56    DurationMillisecondVectorBuilder, DurationNanosecondVector, DurationNanosecondVectorBuilder,
57    DurationSecondVector, DurationSecondVectorBuilder,
58};
59pub use helper::Helper;
60pub use interval::{
61    IntervalDayTimeVector, IntervalDayTimeVectorBuilder, IntervalMonthDayNanoVector,
62    IntervalMonthDayNanoVectorBuilder, IntervalYearMonthVector, IntervalYearMonthVectorBuilder,
63};
64pub use list::{ListIter, ListVector, ListVectorBuilder};
65pub use null::{NullVector, NullVectorBuilder};
66pub use primitive::{
67    Float32Vector, Float32VectorBuilder, Float64Vector, Float64VectorBuilder, Int8Vector,
68    Int8VectorBuilder, Int16Vector, Int16VectorBuilder, Int32Vector, Int32VectorBuilder,
69    Int64Vector, Int64VectorBuilder, PrimitiveIter, PrimitiveVector, PrimitiveVectorBuilder,
70    UInt8Vector, UInt8VectorBuilder, UInt16Vector, UInt16VectorBuilder, UInt32Vector,
71    UInt32VectorBuilder, UInt64Vector, UInt64VectorBuilder,
72};
73pub use string::{StringVector, StringVectorBuilder};
74pub use struct_vector::{StructVector, StructVectorBuilder};
75pub use time::{
76    TimeMicrosecondVector, TimeMicrosecondVectorBuilder, TimeMillisecondVector,
77    TimeMillisecondVectorBuilder, TimeNanosecondVector, TimeNanosecondVectorBuilder,
78    TimeSecondVector, TimeSecondVectorBuilder,
79};
80pub use timestamp::{
81    TimestampMicrosecondVector, TimestampMicrosecondVectorBuilder, TimestampMillisecondVector,
82    TimestampMillisecondVectorBuilder, TimestampNanosecondVector, TimestampNanosecondVectorBuilder,
83    TimestampSecondVector, TimestampSecondVectorBuilder,
84};
85pub use validity::Validity;
86
// TODO(yingwen): arrow 28.0 implements `Clone` for all arrays; we could upgrade to it and
// simplify some code in methods such as `to_arrow_array()` and `to_boxed_arrow_array()`.
89/// Vector of data values.
90pub trait Vector: Send + Sync + Serializable + Debug + VectorOp {
91    /// Returns the data type of the vector.
92    ///
93    /// This may require heap allocation.
94    fn data_type(&self) -> ConcreteDataType;
95
96    fn vector_type_name(&self) -> String;
97
98    /// Returns the vector as [Any](std::any::Any) so that it can be
99    /// downcast to a specific implementation.
100    fn as_any(&self) -> &dyn Any;
101
102    /// Returns number of elements in the vector.
103    fn len(&self) -> usize;
104
105    /// Returns whether the vector is empty.
106    fn is_empty(&self) -> bool {
107        self.len() == 0
108    }
109
110    /// Convert this vector to a new arrow [ArrayRef].
111    fn to_arrow_array(&self) -> ArrayRef;
112
113    /// Convert this vector to a new boxed arrow [Array].
114    fn to_boxed_arrow_array(&self) -> Box<dyn Array>;
115
116    /// Returns the validity of the Array.
117    fn validity(&self) -> Validity;
118
119    /// Returns the memory size of vector.
120    fn memory_size(&self) -> usize;
121
122    /// The number of null slots on this [`Vector`].
123    /// # Implementation
124    /// This is `O(1)`.
125    fn null_count(&self) -> usize;
126
127    /// Returns true when it's a ConstantColumn
128    fn is_const(&self) -> bool {
129        false
130    }
131
132    /// Returns whether row is null.
133    fn is_null(&self, row: usize) -> bool;
134
135    /// If the vector only contains NULL.
136    fn only_null(&self) -> bool {
137        self.null_count() == self.len()
138    }
139
140    /// Slices the `Vector`, returning a new `VectorRef`.
141    ///
142    /// # Panics
143    /// This function panics if `offset + length > self.len()`.
144    fn slice(&self, offset: usize, length: usize) -> VectorRef;
145
146    /// Returns the clone of value at `index`.
147    ///
148    /// # Panics
149    /// Panic if `index` is out of bound.
150    fn get(&self, index: usize) -> Value;
151
152    /// Returns the clone of value at `index` or error if `index`
153    /// is out of bound.
154    fn try_get(&self, index: usize) -> Result<Value> {
155        ensure!(
156            index < self.len(),
157            error::BadArrayAccessSnafu {
158                index,
159                size: self.len()
160            }
161        );
162        Ok(self.get(index))
163    }
164
165    /// Returns the reference of value at `index`.
166    ///
167    /// # Panics
168    /// Panic if `index` is out of bound.
169    fn get_ref(&self, index: usize) -> ValueRef<'_>;
170}
171
/// Shared, reference-counted handle to a type-erased [Vector].
pub type VectorRef = Arc<dyn Vector>;
173
174/// Mutable vector that could be used to build an immutable vector.
175pub trait MutableVector: Send + Sync {
176    /// Returns the data type of the vector.
177    fn data_type(&self) -> ConcreteDataType;
178
179    /// Returns the length of the vector.
180    fn len(&self) -> usize;
181
182    /// Returns whether the vector is empty.
183    fn is_empty(&self) -> bool {
184        self.len() == 0
185    }
186
187    /// Convert to Any, to enable dynamic casting.
188    fn as_any(&self) -> &dyn Any;
189
190    /// Convert to mutable Any, to enable dynamic casting.
191    fn as_mut_any(&mut self) -> &mut dyn Any;
192
193    /// Convert `self` to an (immutable) [VectorRef] and reset `self`.
194    fn to_vector(&mut self) -> VectorRef;
195
196    /// Convert `self` to an (immutable) [VectorRef] and without resetting `self`.
197    fn to_vector_cloned(&self) -> VectorRef;
198
199    /// Try to push value ref to this mutable vector.
200    fn try_push_value_ref(&mut self, value: &ValueRef) -> Result<()>;
201
202    /// Push value ref to this mutable vector.
203    ///
204    /// # Panics
205    /// Panics if error if data types mismatch.
206    fn push_value_ref(&mut self, value: &ValueRef) {
207        self.try_push_value_ref(value).unwrap_or_else(|_| {
208            panic!(
209                "expecting pushing value of datatype {:?}, actual {:?}",
210                self.data_type(),
211                value
212            );
213        });
214    }
215
216    /// Push null to this mutable vector.
217    fn push_null(&mut self);
218
219    /// Push nulls to this mutable vector.
220    fn push_nulls(&mut self, num_nulls: usize) {
221        for _ in 0..num_nulls {
222            self.push_null();
223        }
224    }
225
226    /// Extend this mutable vector by slice of `vector`.
227    ///
228    /// Returns error if data types mismatch.
229    ///
230    /// # Panics
231    /// Panics if `offset + length > vector.len()`.
232    fn extend_slice_of(&mut self, vector: &dyn Vector, offset: usize, length: usize) -> Result<()>;
233}
234
/// Generates an inherent `try_from_arrow_array(array)` constructor on
/// `$Vector` that accepts anything viewable as a `dyn arrow::array::Array`.
///
/// The generated function downcasts the input to `$Array`; when the input is
/// a different array type it returns a conversion error that carries the
/// debug form of the input's arrow data type.
macro_rules! impl_try_from_arrow_array_for_vector {
    ($Array: ident, $Vector: ident) => {
        impl $Vector {
            pub fn try_from_arrow_array(
                array: impl AsRef<dyn arrow::array::Array>,
            ) -> crate::error::Result<$Vector> {
                use snafu::OptionExt;

                let dyn_array = array.as_ref();
                let typed_array = dyn_array
                    .as_any()
                    .downcast_ref::<$Array>()
                    .with_context(|| crate::error::ConversionSnafu {
                        from: std::format!("{:?}", dyn_array.data_type()),
                    })?;

                // Clone the downcast array so the vector owns its own handle.
                Ok($Vector::from(typed_array.clone()))
            }
        }
    };
}
258
/// Expands to the `Validity` built from the array data of `$array`
/// (obtained through `$array.to_data()`).
macro_rules! impl_validity_for_vector {
    ($array: expr) => {{
        let array_data = $array.to_data();
        Validity::from_array_data(array_data)
    }};
}
264
/// Expands to the owned `Value` stored at `$index` of `$array`, or
/// `Value::Null` when that slot is not valid.
///
/// # Panics
/// The expansion panics if `$index` is not smaller than `$array.len()`.
macro_rules! impl_get_for_vector {
    ($array: expr, $index: ident) => {{
        // `is_valid()` only consults the null bitmap and is not guaranteed to
        // bounds-check (an array without a null bitmap may report every index
        // as valid), so the bound is enforced explicitly before the unchecked
        // read below.
        assert!(
            $index < $array.len(),
            "index out of bounds: the len is {} but the index is {}",
            $array.len(),
            $index
        );
        if $array.is_valid($index) {
            // SAFETY: `$index < $array.len()` has been asserted above.
            unsafe { $array.value_unchecked($index).into() }
        } else {
            Value::Null
        }
    }};
}
275
/// Expands to the `ValueRef` for the slot at `$index` of `$array`, or
/// `ValueRef::Null` when that slot is not valid.
///
/// # Panics
/// The expansion panics if `$index` is not smaller than `$array.len()`.
macro_rules! impl_get_ref_for_vector {
    ($array: expr, $index: ident) => {{
        // `is_valid()` only consults the null bitmap and is not guaranteed to
        // bounds-check (an array without a null bitmap may report every index
        // as valid), so the bound is enforced explicitly before the unchecked
        // read below.
        assert!(
            $index < $array.len(),
            "index out of bounds: the len is {} but the index is {}",
            $array.len(),
            $index
        );
        if $array.is_valid($index) {
            // SAFETY: `$index < $array.len()` has been asserted above.
            unsafe { $array.value_unchecked($index).into() }
        } else {
            ValueRef::Null
        }
    }};
}
286
/// Expands to the body of an `extend_slice_of`-style method: slices
/// `$vector` by `$offset`/`$length`, downcasts the slice to `$VectorType`
/// and pushes every item yielded by its `iter_data()` into `$mutable_vector`.
///
/// The expansion evaluates to `Ok(())`; if the downcast fails it returns
/// early (via `?`) with a cast-type error naming both vector types.
macro_rules! impl_extend_for_builder {
    ($mutable_vector: expr, $vector: ident, $VectorType: ident, $offset: ident, $length: ident) => {{
        use snafu::OptionExt;

        let slice = $vector.slice($offset, $length);
        let downcast = slice.as_any().downcast_ref::<$VectorType>();
        let typed_slice = downcast.with_context(|| crate::error::CastTypeSnafu {
            msg: format!(
                "Failed to cast vector from {} to {}",
                $vector.vector_type_name(),
                stringify!($VectorType)
            ),
        })?;
        typed_slice.iter_data().for_each(|item| {
            $mutable_vector.push(item);
        });
        Ok(())
    }};
}
308
309pub(crate) use {
310    impl_extend_for_builder, impl_get_for_vector, impl_get_ref_for_vector,
311    impl_try_from_arrow_array_for_vector, impl_validity_for_vector,
312};
313
#[cfg(test)]
pub mod tests {
    use arrow::array::{Array, Int32Array, UInt8Array};
    use paste::paste;
    use serde_json;

    use super::*;
    use crate::data_type::DataType;
    use crate::prelude::ScalarVectorBuilder;
    use crate::types::{Int32Type, LogicalPrimitiveType};
    use crate::vectors::helper::Helper;

    // Converting an arrow Int32 array must yield a vector whose arrow type
    // equals the one built from `Int32Type`.
    #[test]
    fn test_df_columns_to_vector() {
        let arrow_array: Arc<dyn Array> = Arc::new(Int32Array::from(vec![1, 2, 3]));
        let converted = Helper::try_into_vector(arrow_array).unwrap();

        let expected_type = Int32Type::build_data_type().as_arrow_type();
        let actual_type = converted.data_type().as_arrow_type();
        assert_eq!(expected_type, actual_type);
    }

    // An Int32 vector serializes to a plain JSON array of numbers.
    #[test]
    fn test_serialize_i32_vector() {
        let arrow_array: Arc<dyn Array> = Arc::new(Int32Array::from(vec![1, 2, 3]));
        let vector = Helper::try_into_vector(arrow_array).unwrap();
        let json_value = vector.serialize_to_json().unwrap();

        assert_eq!("[1,2,3]", serde_json::to_string(&json_value).unwrap());
    }

    // NOTE: despite the `i8` in its name, this test exercises a `UInt8Array`.
    #[test]
    fn test_serialize_i8_vector() {
        let arrow_array: Arc<dyn Array> = Arc::new(UInt8Array::from(vec![1, 2, 3]));
        let vector = Helper::try_into_vector(arrow_array).unwrap();
        let json_value = vector.serialize_to_json().unwrap();

        assert_eq!("[1,2,3]", serde_json::to_string(&json_value).unwrap());
    }

    // Every builder must report the data type matching its vector kind.
    #[test]
    fn test_mutable_vector_data_type() {
        // Checks `<Type>VectorBuilder` against `ConcreteDataType::<type>_datatype()`,
        // where the constructor name is the lower-cased type name.
        macro_rules! assert_builder_type_lower {
            ($($type: ident),*) => {
                $(
                    paste! {
                        let builder = [<$type VectorBuilder>]::with_capacity(1024);
                        assert_eq!(builder.data_type(), ConcreteDataType::[<$type:lower _datatype>]());
                    }
                )*
            };
        }

        // Same check, but the constructor name is the snake_case type name.
        macro_rules! assert_builder_type_snake {
            ($($type: ident),*) => {
                $(
                    paste! {
                        let builder = [<$type VectorBuilder>]::with_capacity(1024);
                        assert_eq!(builder.data_type(), ConcreteDataType::[<$type:snake _datatype>]());
                    }
                )*
            };
        }

        // Primitive-like types.
        assert_builder_type_lower!(
            Boolean, Int8, Int16, Int32, Int64, UInt8, UInt16, UInt32, UInt64, Float32, Float64,
            Date, Binary, String
        );

        // Time-related types.
        assert_builder_type_snake!(
            TimeSecond,
            TimeMillisecond,
            TimeMicrosecond,
            TimeNanosecond,
            TimestampSecond,
            TimestampMillisecond,
            TimestampMicrosecond,
            TimestampNanosecond,
            DurationSecond,
            DurationMillisecond,
            DurationMicrosecond,
            DurationNanosecond,
            IntervalYearMonth,
            IntervalDayTime,
            IntervalMonthDayNano
        );

        // Null builder.
        let null_builder = NullVectorBuilder::default();
        assert_eq!(null_builder.data_type(), ConcreteDataType::null_datatype());

        // Decimal128 builder created with `with_capacity` alone.
        let decimal_builder = Decimal128VectorBuilder::with_capacity(1024);
        assert_eq!(
            decimal_builder.data_type(),
            ConcreteDataType::decimal128_datatype(38, 10)
        );

        // Decimal128 builder with explicit precision and scale.
        let decimal_builder = Decimal128VectorBuilder::with_capacity(1024)
            .with_precision_and_scale(3, 2)
            .unwrap();
        assert_eq!(
            decimal_builder.data_type(),
            ConcreteDataType::decimal128_datatype(3, 2)
        );
    }

    // The typed constructor reports a list data type, while the plain
    // `with_capacity` constructor is expected to panic.
    #[test]
    #[should_panic(expected = "Must use ListVectorBuilder::with_type_capacity()")]
    fn test_mutable_vector_list_data_type() {
        let list_builder =
            ListVectorBuilder::with_type_capacity(ConcreteDataType::int32_datatype(), 1024);
        let expected_type = ConcreteDataType::list_datatype(ConcreteDataType::int32_datatype());
        assert_eq!(list_builder.data_type(), expected_type);

        // This call is the one that must panic.
        let _ = ListVectorBuilder::with_capacity(1024);
    }

    // `to_vector_cloned()` through the `MutableVector` trait must not reset
    // the builder.
    #[test]
    fn test_mutable_vector_to_vector_cloned() {
        let mut builder = ConcreteDataType::string_datatype().create_mutable_vector(1024);
        for text in ["hello", "world", "!"] {
            builder.push_value_ref(&ValueRef::String(text));
        }

        let snapshot = builder.to_vector_cloned();
        assert_eq!(snapshot.len(), 3);
        assert_eq!(builder.len(), 3);
    }
}