Skip to main content

datatypes/
vectors.rs

1// Copyright 2023 Greptime Team
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15use std::any::Any;
16use std::fmt::Debug;
17use std::sync::Arc;
18
19use arrow::array::{Array, ArrayRef};
20use snafu::ensure;
21
22use crate::data_type::ConcreteDataType;
23use crate::error::{self, Result};
24use crate::serialize::Serializable;
25use crate::value::{Value, ValueRef};
26use crate::vectors::operations::VectorOp;
27
28mod binary;
29mod boolean;
30mod constant;
31mod date;
32mod decimal;
33mod dictionary;
34mod duration;
35mod eq;
36mod helper;
37mod interval;
38pub(crate) mod json;
39mod list;
40mod null;
41pub(crate) mod operations;
42mod primitive;
43mod string;
44mod struct_vector;
45mod time;
46mod timestamp;
47mod validity;
48
49pub use binary::{BinaryVector, BinaryVectorBuilder};
50pub use boolean::{BooleanVector, BooleanVectorBuilder};
51pub use constant::ConstantVector;
52pub use date::{DateVector, DateVectorBuilder};
53pub use decimal::{Decimal128Vector, Decimal128VectorBuilder};
54pub use dictionary::{DictionaryIter, DictionaryVector};
55pub use duration::{
56    DurationMicrosecondVector, DurationMicrosecondVectorBuilder, DurationMillisecondVector,
57    DurationMillisecondVectorBuilder, DurationNanosecondVector, DurationNanosecondVectorBuilder,
58    DurationSecondVector, DurationSecondVectorBuilder,
59};
60pub use helper::Helper;
61pub use interval::{
62    IntervalDayTimeVector, IntervalDayTimeVectorBuilder, IntervalMonthDayNanoVector,
63    IntervalMonthDayNanoVectorBuilder, IntervalYearMonthVector, IntervalYearMonthVectorBuilder,
64};
65pub use list::{ListIter, ListVector, ListVectorBuilder};
66pub use null::{NullVector, NullVectorBuilder};
67pub use primitive::{
68    Float32Vector, Float32VectorBuilder, Float64Vector, Float64VectorBuilder, Int8Vector,
69    Int8VectorBuilder, Int16Vector, Int16VectorBuilder, Int32Vector, Int32VectorBuilder,
70    Int64Vector, Int64VectorBuilder, PrimitiveIter, PrimitiveVector, PrimitiveVectorBuilder,
71    UInt8Vector, UInt8VectorBuilder, UInt16Vector, UInt16VectorBuilder, UInt32Vector,
72    UInt32VectorBuilder, UInt64Vector, UInt64VectorBuilder,
73};
74pub use string::{StringVector, StringVectorBuilder};
75pub use struct_vector::{StructVector, StructVectorBuilder};
76pub use time::{
77    TimeMicrosecondVector, TimeMicrosecondVectorBuilder, TimeMillisecondVector,
78    TimeMillisecondVectorBuilder, TimeNanosecondVector, TimeNanosecondVectorBuilder,
79    TimeSecondVector, TimeSecondVectorBuilder,
80};
81pub use timestamp::{
82    TimestampMicrosecondVector, TimestampMicrosecondVectorBuilder, TimestampMillisecondVector,
83    TimestampMillisecondVectorBuilder, TimestampNanosecondVector, TimestampNanosecondVectorBuilder,
84    TimestampSecondVector, TimestampSecondVectorBuilder,
85};
86pub use validity::Validity;
87
88// TODO(yingwen): arrow 28.0 implements Clone for all arrays, we could upgrade to it and simplify
89// some codes in methods such as `to_arrow_array()` and `to_boxed_arrow_array()`.
90/// Vector of data values.
91pub trait Vector: Send + Sync + Serializable + Debug + VectorOp {
92    /// Returns the data type of the vector.
93    ///
94    /// This may require heap allocation.
95    fn data_type(&self) -> ConcreteDataType;
96
97    fn vector_type_name(&self) -> String;
98
99    /// Returns the vector as [Any](std::any::Any) so that it can be
100    /// downcast to a specific implementation.
101    fn as_any(&self) -> &dyn Any;
102
103    /// Returns number of elements in the vector.
104    fn len(&self) -> usize;
105
106    /// Returns whether the vector is empty.
107    fn is_empty(&self) -> bool {
108        self.len() == 0
109    }
110
111    /// Convert this vector to a new arrow [ArrayRef].
112    fn to_arrow_array(&self) -> ArrayRef;
113
114    /// Convert this vector to a new boxed arrow [Array].
115    fn to_boxed_arrow_array(&self) -> Box<dyn Array>;
116
117    /// Returns the validity of the Array.
118    fn validity(&self) -> Validity;
119
120    /// Returns the memory size of vector.
121    fn memory_size(&self) -> usize;
122
123    /// The number of null slots on this [`Vector`].
124    /// # Implementation
125    /// This is `O(1)`.
126    fn null_count(&self) -> usize;
127
128    /// Returns true when it's a ConstantColumn
129    fn is_const(&self) -> bool {
130        false
131    }
132
133    /// Returns whether row is null.
134    fn is_null(&self, row: usize) -> bool;
135
136    /// If the vector only contains NULL.
137    fn only_null(&self) -> bool {
138        self.null_count() == self.len()
139    }
140
141    /// Slices the `Vector`, returning a new `VectorRef`.
142    ///
143    /// # Panics
144    /// This function panics if `offset + length > self.len()`.
145    fn slice(&self, offset: usize, length: usize) -> VectorRef;
146
147    /// Returns the clone of value at `index`.
148    ///
149    /// # Panics
150    /// Panic if `index` is out of bound.
151    fn get(&self, index: usize) -> Value;
152
153    /// Returns the clone of value at `index` or error if `index`
154    /// is out of bound.
155    fn try_get(&self, index: usize) -> Result<Value> {
156        ensure!(
157            index < self.len(),
158            error::BadArrayAccessSnafu {
159                index,
160                size: self.len()
161            }
162        );
163        Ok(self.get(index))
164    }
165
166    /// Returns the reference of value at `index`.
167    ///
168    /// # Panics
169    /// Panic if `index` is out of bound.
170    fn get_ref(&self, index: usize) -> ValueRef<'_>;
171}
172
/// Shared, reference-counted handle to a type-erased [`Vector`].
pub type VectorRef = Arc<dyn Vector>;
174
175/// Mutable vector that could be used to build an immutable vector.
176pub trait MutableVector: Send + Sync {
177    /// Returns the data type of the vector.
178    fn data_type(&self) -> ConcreteDataType;
179
180    /// Returns the length of the vector.
181    fn len(&self) -> usize;
182
183    /// Returns whether the vector is empty.
184    fn is_empty(&self) -> bool {
185        self.len() == 0
186    }
187
188    /// Convert to Any, to enable dynamic casting.
189    fn as_any(&self) -> &dyn Any;
190
191    /// Convert to mutable Any, to enable dynamic casting.
192    fn as_mut_any(&mut self) -> &mut dyn Any;
193
194    /// Convert `self` to an (immutable) [VectorRef] and reset `self`.
195    fn to_vector(&mut self) -> VectorRef;
196
197    /// Convert `self` to an (immutable) [VectorRef] and without resetting `self`.
198    fn to_vector_cloned(&self) -> VectorRef;
199
200    /// Try to push value ref to this mutable vector.
201    fn try_push_value_ref(&mut self, value: &ValueRef) -> Result<()>;
202
203    /// Push value ref to this mutable vector.
204    ///
205    /// # Panics
206    /// Panics if error if data types mismatch.
207    fn push_value_ref(&mut self, value: &ValueRef) {
208        self.try_push_value_ref(value).unwrap_or_else(|_| {
209            panic!(
210                "expecting pushing value of datatype {:?}, actual {:?}",
211                self.data_type(),
212                value
213            );
214        });
215    }
216
217    /// Push null to this mutable vector.
218    fn push_null(&mut self);
219
220    /// Push nulls to this mutable vector.
221    fn push_nulls(&mut self, num_nulls: usize) {
222        for _ in 0..num_nulls {
223            self.push_null();
224        }
225    }
226
227    /// Extend this mutable vector by slice of `vector`.
228    ///
229    /// Returns error if data types mismatch.
230    ///
231    /// # Panics
232    /// Panics if `offset + length > vector.len()`.
233    fn extend_slice_of(&mut self, vector: &dyn Vector, offset: usize, length: usize) -> Result<()>;
234}
235
/// Generates an inherent `try_from_arrow_array(array)` constructor on `$Vector`.
///
/// The generated function downcasts the given arrow array to `$Array`, clones it
/// and converts the clone with `$Vector::from`. When the array is not a `$Array`
/// it returns a conversion error that carries the array's arrow data type.
macro_rules! impl_try_from_arrow_array_for_vector {
    ($Array: ident, $Vector: ident) => {
        impl $Vector {
            pub fn try_from_arrow_array(
                array: impl AsRef<dyn arrow::array::Array>,
            ) -> crate::error::Result<$Vector> {
                use snafu::OptionExt;

                // Borrow the dynamically typed array once; both the downcast and the
                // error context read from the same reference.
                let source = array.as_ref();
                let concrete = source
                    .as_any()
                    .downcast_ref::<$Array>()
                    .with_context(|| crate::error::ConversionSnafu {
                        from: std::format!("{:?}", source.data_type()),
                    })?;

                Ok($Vector::from(concrete.clone()))
            }
        }
    };
}
259
/// Expands to an expression that builds a `Validity` from the array data of `$array`.
macro_rules! impl_validity_for_vector {
    ($array: expr) => {{
        let array_data = $array.to_data();
        Validity::from_array_data(array_data)
    }};
}
265
/// Expands to an expression that reads slot `$index` of `$array` as an owned `Value`.
///
/// A null slot yields `Value::Null`; any other slot is read with
/// `value_unchecked()` and converted with `Into`.
///
/// # Panics
/// The expansion panics if `$index >= $array.len()`.
macro_rules! impl_get_for_vector {
    ($array: expr, $index: ident) => {{
        // `is_valid()` is not a bounds check: an arrow array without a null buffer
        // reports every index as valid. Check the bound explicitly so that an
        // out-of-range index panics instead of reaching `value_unchecked()`.
        assert!(
            $index < $array.len(),
            "index {} out of bounds for array of length {}",
            $index,
            $array.len()
        );
        if $array.is_valid($index) {
            // SAFETY: `$index < $array.len()` has been asserted above.
            unsafe { $array.value_unchecked($index).into() }
        } else {
            Value::Null
        }
    }};
}
276
/// Expands to an expression that reads slot `$index` of `$array` as a `ValueRef`.
///
/// A null slot yields `ValueRef::Null`; any other slot is read with
/// `value_unchecked()` and converted with `Into`.
///
/// # Panics
/// The expansion panics if `$index >= $array.len()`.
macro_rules! impl_get_ref_for_vector {
    ($array: expr, $index: ident) => {{
        // `is_valid()` is not a bounds check: an arrow array without a null buffer
        // reports every index as valid. Check the bound explicitly so that an
        // out-of-range index panics instead of reaching `value_unchecked()`.
        assert!(
            $index < $array.len(),
            "index {} out of bounds for array of length {}",
            $index,
            $array.len()
        );
        if $array.is_valid($index) {
            // SAFETY: `$index < $array.len()` has been asserted above.
            unsafe { $array.value_unchecked($index).into() }
        } else {
            ValueRef::Null
        }
    }};
}
287
/// Expands to the body of an `extend_slice_of`-style function.
///
/// Slices `$vector` with `$offset` and `$length`, downcasts the slice to
/// `$VectorType` and pushes every item of its `iter_data()` into
/// `$mutable_vector`. Evaluates to `Ok(())` on success and returns early with a
/// cast-type error when the downcast fails.
macro_rules! impl_extend_for_builder {
    ($mutable_vector: expr, $vector: ident, $VectorType: ident, $offset: ident, $length: ident) => {{
        use snafu::OptionExt;

        let window = $vector.slice($offset, $length);
        let downcast = window.as_any().downcast_ref::<$VectorType>();
        // The error message is only built when the downcast actually failed.
        let typed = downcast.with_context(|| {
            let msg = format!(
                "Failed to cast vector from {} to {}",
                $vector.vector_type_name(),
                stringify!($VectorType)
            );
            crate::error::CastTypeSnafu { msg }
        })?;
        for item in typed.iter_data() {
            $mutable_vector.push(item);
        }
        Ok(())
    }};
}
309
310pub(crate) use impl_extend_for_builder;
311pub(crate) use impl_get_for_vector;
312pub(crate) use impl_get_ref_for_vector;
313pub(crate) use impl_try_from_arrow_array_for_vector;
314pub(crate) use impl_validity_for_vector;
315
/// Unit tests for the vector traits, builders and arrow conversions.
#[cfg(test)]
pub mod tests {
    use arrow::array::{Array, Int32Array, UInt8Array};
    use paste::paste;
    use serde_json;

    use super::*;
    use crate::data_type::DataType;
    use crate::prelude::ScalarVectorBuilder;
    use crate::types::{Int32Type, LogicalPrimitiveType};
    use crate::vectors::helper::Helper;

    /// An arrow `Int32Array` converts into a vector whose arrow type is Int32.
    #[test]
    fn test_df_columns_to_vector() {
        let df_column: Arc<dyn Array> = Arc::new(Int32Array::from(vec![1, 2, 3]));
        let vector = Helper::try_into_vector(df_column).unwrap();
        assert_eq!(
            Int32Type::build_data_type().as_arrow_type(),
            vector.data_type().as_arrow_type()
        );
    }

    /// A vector built from an `Int32Array` serializes to a JSON array of numbers.
    #[test]
    fn test_serialize_i32_vector() {
        let df_column: Arc<dyn Array> = Arc::new(Int32Array::from(vec![1, 2, 3]));
        let json_value = Helper::try_into_vector(df_column)
            .unwrap()
            .serialize_to_json()
            .unwrap();
        assert_eq!("[1,2,3]", serde_json::to_string(&json_value).unwrap());
    }

    /// A vector built from a `UInt8Array` serializes to a JSON array of numbers.
    ///
    /// The test is named after the unsigned 8-bit array it actually exercises.
    #[test]
    fn test_serialize_u8_vector() {
        let df_column: Arc<dyn Array> = Arc::new(UInt8Array::from(vec![1, 2, 3]));
        let json_value = Helper::try_into_vector(df_column)
            .unwrap()
            .serialize_to_json()
            .unwrap();
        assert_eq!("[1,2,3]", serde_json::to_string(&json_value).unwrap());
    }

    /// Every builder reports the `ConcreteDataType` matching its vector type.
    #[test]
    fn test_mutable_vector_data_type() {
        // Checks `<Type>VectorBuilder` against `ConcreteDataType::<type>_datatype()`,
        // where the constructor name is the lower-cased type name.
        macro_rules! mutable_primitive_data_type_eq_with_lower {
            ($($type: ident),*) => {
                $(
                    paste! {
                        let mutable_vector = [<$type VectorBuilder>]::with_capacity(1024);
                        assert_eq!(mutable_vector.data_type(), ConcreteDataType::[<$type:lower _datatype>]());
                    }
                )*
            };
        }

        // Same check, but the constructor name is the snake_case form of the
        // multi-word type name.
        macro_rules! mutable_time_data_type_eq_with_snake {
            ($($type: ident),*) => {
                $(
                    paste! {
                        let mutable_vector = [<$type VectorBuilder>]::with_capacity(1024);
                        assert_eq!(mutable_vector.data_type(), ConcreteDataType::[<$type:snake _datatype>]());
                    }
                )*
            };
        }
        // Test Primitive types
        mutable_primitive_data_type_eq_with_lower!(
            Boolean, Int8, Int16, Int32, Int64, UInt8, UInt16, UInt32, UInt64, Float32, Float64,
            Date, Binary, String
        );

        // Test types about time
        mutable_time_data_type_eq_with_snake!(
            TimeSecond,
            TimeMillisecond,
            TimeMicrosecond,
            TimeNanosecond,
            TimestampSecond,
            TimestampMillisecond,
            TimestampMicrosecond,
            TimestampNanosecond,
            DurationSecond,
            DurationMillisecond,
            DurationMicrosecond,
            DurationNanosecond,
            IntervalYearMonth,
            IntervalDayTime,
            IntervalMonthDayNano
        );

        // Null type
        let builder = NullVectorBuilder::default();
        assert_eq!(builder.data_type(), ConcreteDataType::null_datatype());

        // Decimal128 type: a builder created without explicit precision and scale
        // is expected to report (38, 10).
        let builder = Decimal128VectorBuilder::with_capacity(1024);
        assert_eq!(
            builder.data_type(),
            ConcreteDataType::decimal128_datatype(38, 10)
        );

        // Decimal128 type with explicit precision and scale.
        let builder = Decimal128VectorBuilder::with_capacity(1024)
            .with_precision_and_scale(3, 2)
            .unwrap();
        assert_eq!(
            builder.data_type(),
            ConcreteDataType::decimal128_datatype(3, 2)
        );
    }

    /// A list builder made with `with_type_capacity()` reports the list data
    /// type, while `with_capacity()` is expected to panic.
    #[test]
    #[should_panic(expected = "Must use ListVectorBuilder::with_type_capacity()")]
    fn test_mutable_vector_list_data_type() {
        let item_type = Arc::new(ConcreteDataType::int32_datatype());
        // List type
        let builder = ListVectorBuilder::with_type_capacity(item_type.clone(), 1024);
        assert_eq!(
            builder.data_type(),
            ConcreteDataType::list_datatype(item_type)
        );

        // Panic with_capacity
        let _ = ListVectorBuilder::with_capacity(1024);
    }

    /// `to_vector_cloned()` snapshots the builder without resetting it.
    #[test]
    fn test_mutable_vector_to_vector_cloned() {
        // create a string vector builder
        let mut builder = ConcreteDataType::string_datatype().create_mutable_vector(1024);
        builder.push_value_ref(&ValueRef::String("hello"));
        builder.push_value_ref(&ValueRef::String("world"));
        builder.push_value_ref(&ValueRef::String("!"));

        // use MutableVector trait to_vector_cloned won't reset builder
        let vector = builder.to_vector_cloned();
        assert_eq!(vector.len(), 3);
        assert_eq!(builder.len(), 3);
    }
}