datatypes/vectors/
dictionary.rs

1// Copyright 2023 Greptime Team
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15use std::any::Any;
16use std::fmt;
17use std::sync::Arc;
18
19use arrow::array::{Array, ArrayRef, DictionaryArray, PrimitiveArray, PrimitiveBuilder};
20use arrow::datatypes::{ArrowDictionaryKeyType, ArrowNativeType};
21use serde_json::Value as JsonValue;
22use snafu::ResultExt;
23
24use crate::data_type::ConcreteDataType;
25use crate::error::{self, Result};
26use crate::serialize::Serializable;
27use crate::types::DictionaryType;
28use crate::value::{Value, ValueRef};
29use crate::vectors::operations::VectorOp;
30use crate::vectors::{self, Helper, Validity, Vector, VectorRef};
31
32/// Vector of dictionaries, basically backed by Arrow's `DictionaryArray`.
33pub struct DictionaryVector<K: ArrowDictionaryKeyType> {
34    array: DictionaryArray<K>,
35    /// The datatype of the keys in the dictionary.
36    key_type: ConcreteDataType,
37    /// The datatype of the items in the dictionary.
38    item_type: ConcreteDataType,
39    /// The vector of items in the dictionary.
40    item_vector: VectorRef,
41}
42
43impl<K: ArrowDictionaryKeyType> fmt::Debug for DictionaryVector<K> {
44    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
45        f.debug_struct("DictionaryVector")
46            .field("array", &self.array)
47            .field("key_type", &self.key_type)
48            .field("item_type", &self.item_type)
49            .finish()
50    }
51}
52
53impl<K: ArrowDictionaryKeyType> PartialEq for DictionaryVector<K> {
54    fn eq(&self, other: &DictionaryVector<K>) -> bool {
55        self.array == other.array
56            && self.key_type == other.key_type
57            && self.item_type == other.item_type
58    }
59}
60
61impl<K: ArrowDictionaryKeyType> DictionaryVector<K> {
62    /// Create a new instance of `DictionaryVector` from a dictionary array and item type
63    pub fn new(array: DictionaryArray<K>, item_type: ConcreteDataType) -> Result<Self> {
64        let key_type = ConcreteDataType::try_from(&K::DATA_TYPE)?;
65        let item_vector = Helper::try_into_vector(array.values())?;
66
67        Ok(Self {
68            array,
69            key_type,
70            item_type,
71            item_vector,
72        })
73    }
74
75    /// Returns the underlying Arrow dictionary array
76    pub fn array(&self) -> &DictionaryArray<K> {
77        &self.array
78    }
79
80    /// Returns the keys array of this dictionary
81    pub fn keys(&self) -> &arrow_array::PrimitiveArray<K> {
82        self.array.keys()
83    }
84
85    /// Returns the values array of this dictionary
86    pub fn values(&self) -> &ArrayRef {
87        self.array.values()
88    }
89
90    pub fn as_arrow(&self) -> &dyn Array {
91        &self.array
92    }
93}
94
95impl<K: ArrowDictionaryKeyType> Vector for DictionaryVector<K> {
96    fn data_type(&self) -> ConcreteDataType {
97        ConcreteDataType::Dictionary(DictionaryType::new(
98            self.key_type.clone(),
99            self.item_type.clone(),
100        ))
101    }
102
103    fn vector_type_name(&self) -> String {
104        "DictionaryVector".to_string()
105    }
106
107    fn as_any(&self) -> &dyn Any {
108        self
109    }
110
111    fn len(&self) -> usize {
112        self.array.len()
113    }
114
115    fn to_arrow_array(&self) -> ArrayRef {
116        Arc::new(self.array.clone())
117    }
118
119    fn to_boxed_arrow_array(&self) -> Box<dyn Array> {
120        Box::new(self.array.clone())
121    }
122
123    fn validity(&self) -> Validity {
124        vectors::impl_validity_for_vector!(self.array)
125    }
126
127    fn memory_size(&self) -> usize {
128        self.array.get_buffer_memory_size()
129    }
130
131    fn null_count(&self) -> usize {
132        self.array.null_count()
133    }
134
135    fn is_null(&self, row: usize) -> bool {
136        self.array.is_null(row)
137    }
138
139    fn slice(&self, offset: usize, length: usize) -> VectorRef {
140        Arc::new(Self {
141            array: self.array.slice(offset, length),
142            key_type: self.key_type.clone(),
143            item_type: self.item_type.clone(),
144            item_vector: self.item_vector.clone(),
145        })
146    }
147
148    fn get(&self, index: usize) -> Value {
149        if !self.array.is_valid(index) {
150            return Value::Null;
151        }
152
153        let key = self.array.keys().value(index);
154        self.item_vector.get(key.as_usize())
155    }
156
157    fn get_ref(&self, index: usize) -> ValueRef<'_> {
158        if !self.array.is_valid(index) {
159            return ValueRef::Null;
160        }
161
162        let key = self.array.keys().value(index);
163        self.item_vector.get_ref(key.as_usize())
164    }
165}
166
167impl<K: ArrowDictionaryKeyType> Serializable for DictionaryVector<K> {
168    fn serialize_to_json(&self) -> Result<Vec<JsonValue>> {
169        // Convert the dictionary array to JSON, where each element is either null or
170        // the value it refers to in the dictionary
171        let mut result = Vec::with_capacity(self.len());
172
173        let keys = self.array.keys();
174        let key_values = &keys.values()[..self.len()];
175        for (i, &key) in key_values.iter().enumerate() {
176            if self.is_null(i) {
177                result.push(JsonValue::Null);
178            } else {
179                let value = self.item_vector.get(key.as_usize());
180                let json_value = serde_json::to_value(value).context(error::SerializeSnafu)?;
181                result.push(json_value);
182            }
183        }
184
185        Ok(result)
186    }
187}
188
189impl<K: ArrowDictionaryKeyType> TryFrom<DictionaryArray<K>> for DictionaryVector<K> {
190    type Error = crate::error::Error;
191
192    fn try_from(array: DictionaryArray<K>) -> Result<Self> {
193        let key_type = ConcreteDataType::try_from(array.keys().data_type())?;
194        let item_type = ConcreteDataType::try_from(array.values().data_type())?;
195        let item_vector = Helper::try_into_vector(array.values())?;
196
197        Ok(Self {
198            array,
199            key_type,
200            item_type,
201            item_vector,
202        })
203    }
204}
205
206pub struct DictionaryIter<'a, K: ArrowDictionaryKeyType> {
207    vector: &'a DictionaryVector<K>,
208    idx: usize,
209}
210
211impl<'a, K: ArrowDictionaryKeyType> DictionaryIter<'a, K> {
212    pub fn new(vector: &'a DictionaryVector<K>) -> DictionaryIter<'a, K> {
213        DictionaryIter { vector, idx: 0 }
214    }
215}
216
217impl<'a, K: ArrowDictionaryKeyType> Iterator for DictionaryIter<'a, K> {
218    type Item = Option<ValueRef<'a>>;
219
220    #[inline]
221    fn next(&mut self) -> Option<Self::Item> {
222        if self.idx >= self.vector.len() {
223            return None;
224        }
225
226        let idx = self.idx;
227        self.idx += 1;
228
229        if self.vector.is_null(idx) {
230            return Some(None);
231        }
232
233        Some(Some(self.vector.get_ref(idx)))
234    }
235
236    #[inline]
237    fn size_hint(&self) -> (usize, Option<usize>) {
238        (
239            self.vector.len() - self.idx,
240            Some(self.vector.len() - self.idx),
241        )
242    }
243}
244
245impl<K: ArrowDictionaryKeyType> VectorOp for DictionaryVector<K> {
246    fn replicate(&self, offsets: &[usize]) -> VectorRef {
247        let keys = self.array.keys();
248        let mut replicated_keys = PrimitiveBuilder::new();
249
250        let mut previous_offset = 0;
251        let mut key_iter = keys.iter().chain(std::iter::repeat(None));
252        for &offset in offsets {
253            let key = key_iter.next().unwrap();
254
255            // repeat this key (offset - previous_offset) times
256            let repeat_count = offset - previous_offset;
257            for _ in 0..repeat_count {
258                replicated_keys.append_option(key);
259            }
260
261            previous_offset = offset;
262        }
263
264        let new_keys = replicated_keys.finish();
265        let new_array = DictionaryArray::try_new(new_keys, self.values().clone())
266            .expect("Failed to create replicated dictionary array");
267
268        Arc::new(Self {
269            array: new_array,
270            key_type: self.key_type.clone(),
271            item_type: self.item_type.clone(),
272            item_vector: self.item_vector.clone(),
273        })
274    }
275
276    fn filter(&self, filter: &vectors::BooleanVector) -> Result<VectorRef> {
277        let key_array: ArrayRef = Arc::new(self.array.keys().clone());
278        let key_vector = Helper::try_into_vector(&key_array)?;
279        let filtered_key_vector = key_vector.filter(filter)?;
280        let filtered_key_array = filtered_key_vector.to_arrow_array();
281        let filtered_key_array = filtered_key_array
282            .as_any()
283            .downcast_ref::<PrimitiveArray<K>>()
284            .unwrap();
285
286        let new_array = DictionaryArray::try_new(filtered_key_array.clone(), self.values().clone())
287            .expect("Failed to create filtered dictionary array");
288
289        Ok(Arc::new(Self {
290            array: new_array,
291            key_type: self.key_type.clone(),
292            item_type: self.item_type.clone(),
293            item_vector: self.item_vector.clone(),
294        }))
295    }
296
297    fn cast(&self, to_type: &ConcreteDataType) -> Result<VectorRef> {
298        let new_items = self.item_vector.cast(to_type)?;
299        let new_array =
300            DictionaryArray::try_new(self.array.keys().clone(), new_items.to_arrow_array())
301                .expect("Failed to create casted dictionary array");
302        Ok(Arc::new(Self {
303            array: new_array,
304            key_type: self.key_type.clone(),
305            item_type: to_type.clone(),
306            item_vector: self.item_vector.clone(),
307        }))
308    }
309
310    fn take(&self, indices: &vectors::UInt32Vector) -> Result<VectorRef> {
311        let key_array: ArrayRef = Arc::new(self.array.keys().clone());
312        let key_vector = Helper::try_into_vector(&key_array)?;
313        let new_key_vector = key_vector.take(indices)?;
314        let new_key_array = new_key_vector.to_arrow_array();
315        let new_key_array = new_key_array
316            .as_any()
317            .downcast_ref::<PrimitiveArray<K>>()
318            .unwrap();
319
320        let new_array = DictionaryArray::try_new(new_key_array.clone(), self.values().clone())
321            .expect("Failed to create filtered dictionary array");
322
323        Ok(Arc::new(Self {
324            array: new_array,
325            key_type: self.key_type.clone(),
326            item_type: self.item_type.clone(),
327            item_vector: self.item_vector.clone(),
328        }))
329    }
330}
331
332#[cfg(test)]
333mod tests {
334    use std::sync::Arc;
335
336    use arrow::array::{Int64Array, StringArray, UInt32Array};
337    use arrow::datatypes::{Int64Type, UInt32Type};
338
339    use super::*;
340
341    // Helper function to create a test dictionary vector with string values
342    fn create_test_dictionary() -> DictionaryVector<Int64Type> {
343        // Dictionary values: ["a", "b", "c", "d"]
344        // Keys: [0, 1, 2, null, 1, 3]
345        // Resulting in: ["a", "b", "c", null, "b", "d"]
346        let values = StringArray::from(vec!["a", "b", "c", "d"]);
347        let keys = Int64Array::from(vec![Some(0), Some(1), Some(2), None, Some(1), Some(3)]);
348        let dict_array = DictionaryArray::new(keys, Arc::new(values));
349        DictionaryVector::<Int64Type>::try_from(dict_array).unwrap()
350    }
351
352    #[test]
353    fn test_dictionary_vector_basics() {
354        let dict_vec = create_test_dictionary();
355
356        // Test length and null count
357        assert_eq!(dict_vec.len(), 6);
358        assert_eq!(dict_vec.null_count(), 1);
359
360        // Test data type
361        let data_type = dict_vec.data_type();
362        if let ConcreteDataType::Dictionary(dict_type) = data_type {
363            assert_eq!(*dict_type.value_type(), ConcreteDataType::string_datatype());
364        } else {
365            panic!("Expected Dictionary data type");
366        }
367
368        // Test is_null
369        assert!(!dict_vec.is_null(0));
370        assert!(dict_vec.is_null(3));
371
372        // Test get values
373        assert_eq!(dict_vec.get(0), Value::String("a".to_string().into()));
374        assert_eq!(dict_vec.get(1), Value::String("b".to_string().into()));
375        assert_eq!(dict_vec.get(3), Value::Null);
376        assert_eq!(dict_vec.get(4), Value::String("b".to_string().into()));
377    }
378
379    #[test]
380    fn test_slice() {
381        let dict_vec = create_test_dictionary();
382        let sliced = dict_vec.slice(1, 3);
383
384        assert_eq!(sliced.len(), 3);
385        assert_eq!(sliced.get(0), Value::String("b".to_string().into()));
386        assert_eq!(sliced.get(1), Value::String("c".to_string().into()));
387        assert_eq!(sliced.get(2), Value::Null);
388    }
389
390    #[test]
391    fn test_replicate() {
392        let dict_vec = create_test_dictionary();
393
394        // Replicate with offsets [0, 2, 5] - should get values at these indices
395        let offsets = vec![0, 2, 5];
396        let replicated = dict_vec.replicate(&offsets);
397        assert_eq!(replicated.len(), 5);
398        assert_eq!(replicated.get(0), Value::String("b".to_string().into()));
399        assert_eq!(replicated.get(1), Value::String("b".to_string().into()));
400        assert_eq!(replicated.get(2), Value::String("c".to_string().into()));
401        assert_eq!(replicated.get(3), Value::String("c".to_string().into()));
402        assert_eq!(replicated.get(4), Value::String("c".to_string().into()));
403    }
404
405    #[test]
406    fn test_filter() {
407        let dict_vec = create_test_dictionary();
408
409        // Keep only indices 0, 2, 4
410        let filter_values = vec![true, false, true, false, true, false];
411        let filter = vectors::BooleanVector::from(filter_values);
412
413        let filtered = dict_vec.filter(&filter).unwrap();
414        assert_eq!(filtered.len(), 3);
415
416        // Check the values
417        assert_eq!(filtered.get(0), Value::String("a".to_string().into()));
418        assert_eq!(filtered.get(1), Value::String("c".to_string().into()));
419        assert_eq!(filtered.get(2), Value::String("b".to_string().into()));
420    }
421
422    #[test]
423    fn test_cast() {
424        let dict_vec = create_test_dictionary();
425
426        // Cast to the same type should return an equivalent vector
427        let casted = dict_vec.cast(&ConcreteDataType::string_datatype()).unwrap();
428
429        // The returned vector should have string values
430        assert_eq!(
431            casted.data_type(),
432            ConcreteDataType::Dictionary(DictionaryType::new(
433                ConcreteDataType::int64_datatype(),
434                ConcreteDataType::string_datatype(),
435            ))
436        );
437        assert_eq!(casted.len(), dict_vec.len());
438
439        // Values should match the original dictionary lookups
440        assert_eq!(casted.get(0), Value::String("a".to_string().into()));
441        assert_eq!(casted.get(1), Value::String("b".to_string().into()));
442        assert_eq!(casted.get(2), Value::String("c".to_string().into()));
443        assert_eq!(casted.get(3), Value::Null);
444        assert_eq!(casted.get(4), Value::String("b".to_string().into()));
445        assert_eq!(casted.get(5), Value::String("d".to_string().into()));
446    }
447
448    #[test]
449    fn test_take() {
450        let dict_vec = create_test_dictionary();
451
452        // Take indices 2, 0, 4
453        let indices_vec = vec![Some(2u32), Some(0), Some(4)];
454        let indices = vectors::UInt32Vector::from(indices_vec);
455
456        let taken = dict_vec.take(&indices).unwrap();
457        assert_eq!(taken.len(), 3);
458
459        // Check the values
460        assert_eq!(taken.get(0), Value::String("c".to_string().into()));
461        assert_eq!(taken.get(1), Value::String("a".to_string().into()));
462        assert_eq!(taken.get(2), Value::String("b".to_string().into()));
463    }
464
465    #[test]
466    fn test_other_type() {
467        let values = StringArray::from(vec!["a", "b", "c", "d"]);
468        let keys = UInt32Array::from(vec![Some(0), Some(1), Some(2), None, Some(1), Some(3)]);
469        let dict_array = DictionaryArray::new(keys, Arc::new(values));
470        let dict_vec = DictionaryVector::<UInt32Type>::try_from(dict_array).unwrap();
471        assert_eq!(
472            ConcreteDataType::dictionary_datatype(
473                ConcreteDataType::uint32_datatype(),
474                ConcreteDataType::string_datatype()
475            ),
476            dict_vec.data_type()
477        );
478    }
479}