datatypes/vectors/
dictionary.rs

1// Copyright 2023 Greptime Team
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15use std::any::Any;
16use std::fmt;
17use std::sync::Arc;
18
19use arrow::array::{Array, ArrayRef, DictionaryArray, PrimitiveArray, PrimitiveBuilder};
20use arrow::datatypes::{ArrowDictionaryKeyType, ArrowNativeType};
21use serde_json::Value as JsonValue;
22use snafu::ResultExt;
23
24use crate::data_type::ConcreteDataType;
25use crate::error::{self, Result};
26use crate::serialize::Serializable;
27use crate::types::DictionaryType;
28use crate::value::{Value, ValueRef};
29use crate::vectors::operations::VectorOp;
30use crate::vectors::{self, Helper, Validity, Vector, VectorRef};
31
32/// Vector of dictionaries, basically backed by Arrow's `DictionaryArray`.
33pub struct DictionaryVector<K: ArrowDictionaryKeyType> {
34    array: DictionaryArray<K>,
35    /// The datatype of the keys in the dictionary.
36    key_type: ConcreteDataType,
37    /// The datatype of the items in the dictionary.
38    item_type: ConcreteDataType,
39    /// The vector of items in the dictionary.
40    item_vector: VectorRef,
41}
42
43impl<K: ArrowDictionaryKeyType> fmt::Debug for DictionaryVector<K> {
44    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
45        f.debug_struct("DictionaryVector")
46            .field("array", &self.array)
47            .field("key_type", &self.key_type)
48            .field("item_type", &self.item_type)
49            .finish()
50    }
51}
52
53impl<K: ArrowDictionaryKeyType> PartialEq for DictionaryVector<K> {
54    fn eq(&self, other: &DictionaryVector<K>) -> bool {
55        self.array == other.array
56            && self.key_type == other.key_type
57            && self.item_type == other.item_type
58    }
59}
60
61impl<K: ArrowDictionaryKeyType> DictionaryVector<K> {
62    /// Create a new instance of `DictionaryVector` from a dictionary array and item type
63    pub fn new(array: DictionaryArray<K>, item_type: ConcreteDataType) -> Result<Self> {
64        let key_type = ConcreteDataType::try_from(&K::DATA_TYPE)?;
65        let item_vector = Helper::try_into_vector(array.values())?;
66
67        Ok(Self {
68            array,
69            key_type,
70            item_type,
71            item_vector,
72        })
73    }
74
75    /// Returns the underlying Arrow dictionary array
76    pub fn array(&self) -> &DictionaryArray<K> {
77        &self.array
78    }
79
80    /// Returns the keys array of this dictionary
81    pub fn keys(&self) -> &arrow_array::PrimitiveArray<K> {
82        self.array.keys()
83    }
84
85    /// Returns the values array of this dictionary
86    pub fn values(&self) -> &ArrayRef {
87        self.array.values()
88    }
89
90    pub fn as_arrow(&self) -> &dyn Array {
91        &self.array
92    }
93}
94
95impl<K: ArrowDictionaryKeyType> Vector for DictionaryVector<K> {
96    fn data_type(&self) -> ConcreteDataType {
97        ConcreteDataType::Dictionary(DictionaryType::new(
98            self.key_type.clone(),
99            self.item_type.clone(),
100        ))
101    }
102
103    fn vector_type_name(&self) -> String {
104        "DictionaryVector".to_string()
105    }
106
107    fn as_any(&self) -> &dyn Any {
108        self
109    }
110
111    fn len(&self) -> usize {
112        self.array.len()
113    }
114
115    fn to_arrow_array(&self) -> ArrayRef {
116        Arc::new(self.array.clone())
117    }
118
119    fn to_boxed_arrow_array(&self) -> Box<dyn Array> {
120        Box::new(self.array.clone())
121    }
122
123    fn validity(&self) -> Validity {
124        vectors::impl_validity_for_vector!(self.array)
125    }
126
127    fn memory_size(&self) -> usize {
128        self.array.get_buffer_memory_size()
129    }
130
131    fn null_count(&self) -> usize {
132        self.array.null_count()
133    }
134
135    fn is_null(&self, row: usize) -> bool {
136        self.array.is_null(row)
137    }
138
139    fn slice(&self, offset: usize, length: usize) -> VectorRef {
140        Arc::new(Self {
141            array: self.array.slice(offset, length),
142            key_type: self.key_type.clone(),
143            item_type: self.item_type.clone(),
144            item_vector: self.item_vector.clone(),
145        })
146    }
147
148    fn get(&self, index: usize) -> Value {
149        if !self.array.is_valid(index) {
150            return Value::Null;
151        }
152
153        let key = self.array.keys().value(index);
154        self.item_vector.get(key.as_usize())
155    }
156
157    fn get_ref(&self, index: usize) -> ValueRef {
158        if !self.array.is_valid(index) {
159            return ValueRef::Null;
160        }
161
162        let key = self.array.keys().value(index);
163        self.item_vector.get_ref(key.as_usize())
164    }
165}
166
167impl<K: ArrowDictionaryKeyType> Serializable for DictionaryVector<K> {
168    fn serialize_to_json(&self) -> Result<Vec<JsonValue>> {
169        // Convert the dictionary array to JSON, where each element is either null or
170        // the value it refers to in the dictionary
171        let mut result = Vec::with_capacity(self.len());
172
173        for i in 0..self.len() {
174            if self.is_null(i) {
175                result.push(JsonValue::Null);
176            } else {
177                let key = self.array.keys().value(i);
178                let value = self.item_vector.get(key.as_usize());
179                let json_value = serde_json::to_value(value).context(error::SerializeSnafu)?;
180                result.push(json_value);
181            }
182        }
183
184        Ok(result)
185    }
186}
187
188impl<K: ArrowDictionaryKeyType> TryFrom<DictionaryArray<K>> for DictionaryVector<K> {
189    type Error = crate::error::Error;
190
191    fn try_from(array: DictionaryArray<K>) -> Result<Self> {
192        let key_type = ConcreteDataType::try_from(array.keys().data_type())?;
193        let item_type = ConcreteDataType::try_from(array.values().data_type())?;
194        let item_vector = Helper::try_into_vector(array.values())?;
195
196        Ok(Self {
197            array,
198            key_type,
199            item_type,
200            item_vector,
201        })
202    }
203}
204
205pub struct DictionaryIter<'a, K: ArrowDictionaryKeyType> {
206    vector: &'a DictionaryVector<K>,
207    idx: usize,
208}
209
210impl<'a, K: ArrowDictionaryKeyType> DictionaryIter<'a, K> {
211    pub fn new(vector: &'a DictionaryVector<K>) -> DictionaryIter<'a, K> {
212        DictionaryIter { vector, idx: 0 }
213    }
214}
215
216impl<'a, K: ArrowDictionaryKeyType> Iterator for DictionaryIter<'a, K> {
217    type Item = Option<ValueRef<'a>>;
218
219    #[inline]
220    fn next(&mut self) -> Option<Self::Item> {
221        if self.idx >= self.vector.len() {
222            return None;
223        }
224
225        let idx = self.idx;
226        self.idx += 1;
227
228        if self.vector.is_null(idx) {
229            return Some(None);
230        }
231
232        Some(Some(self.vector.get_ref(idx)))
233    }
234
235    #[inline]
236    fn size_hint(&self) -> (usize, Option<usize>) {
237        (
238            self.vector.len() - self.idx,
239            Some(self.vector.len() - self.idx),
240        )
241    }
242}
243
244impl<K: ArrowDictionaryKeyType> VectorOp for DictionaryVector<K> {
245    fn replicate(&self, offsets: &[usize]) -> VectorRef {
246        let keys = self.array.keys();
247        let mut replicated_keys = PrimitiveBuilder::new();
248
249        let mut previous_offset = 0;
250        for (i, &offset) in offsets.iter().enumerate() {
251            let key = if i < self.len() {
252                if keys.is_valid(i) {
253                    Some(keys.value(i))
254                } else {
255                    None
256                }
257            } else {
258                None
259            };
260
261            // repeat this key (offset - previous_offset) times
262            let repeat_count = offset - previous_offset;
263            for _ in 0..repeat_count {
264                replicated_keys.append_option(key);
265            }
266
267            previous_offset = offset;
268        }
269
270        let new_keys = replicated_keys.finish();
271        let new_array = DictionaryArray::try_new(new_keys, self.values().clone())
272            .expect("Failed to create replicated dictionary array");
273
274        Arc::new(Self {
275            array: new_array,
276            key_type: self.key_type.clone(),
277            item_type: self.item_type.clone(),
278            item_vector: self.item_vector.clone(),
279        })
280    }
281
282    fn filter(&self, filter: &vectors::BooleanVector) -> Result<VectorRef> {
283        let key_array: ArrayRef = Arc::new(self.array.keys().clone());
284        let key_vector = Helper::try_into_vector(&key_array)?;
285        let filtered_key_vector = key_vector.filter(filter)?;
286        let filtered_key_array = filtered_key_vector.to_arrow_array();
287        let filtered_key_array = filtered_key_array
288            .as_any()
289            .downcast_ref::<PrimitiveArray<K>>()
290            .unwrap();
291
292        let new_array = DictionaryArray::try_new(filtered_key_array.clone(), self.values().clone())
293            .expect("Failed to create filtered dictionary array");
294
295        Ok(Arc::new(Self {
296            array: new_array,
297            key_type: self.key_type.clone(),
298            item_type: self.item_type.clone(),
299            item_vector: self.item_vector.clone(),
300        }))
301    }
302
303    fn cast(&self, to_type: &ConcreteDataType) -> Result<VectorRef> {
304        let new_items = self.item_vector.cast(to_type)?;
305        let new_array =
306            DictionaryArray::try_new(self.array.keys().clone(), new_items.to_arrow_array())
307                .expect("Failed to create casted dictionary array");
308        Ok(Arc::new(Self {
309            array: new_array,
310            key_type: self.key_type.clone(),
311            item_type: to_type.clone(),
312            item_vector: self.item_vector.clone(),
313        }))
314    }
315
316    fn take(&self, indices: &vectors::UInt32Vector) -> Result<VectorRef> {
317        let key_array: ArrayRef = Arc::new(self.array.keys().clone());
318        let key_vector = Helper::try_into_vector(&key_array)?;
319        let new_key_vector = key_vector.take(indices)?;
320        let new_key_array = new_key_vector.to_arrow_array();
321        let new_key_array = new_key_array
322            .as_any()
323            .downcast_ref::<PrimitiveArray<K>>()
324            .unwrap();
325
326        let new_array = DictionaryArray::try_new(new_key_array.clone(), self.values().clone())
327            .expect("Failed to create filtered dictionary array");
328
329        Ok(Arc::new(Self {
330            array: new_array,
331            key_type: self.key_type.clone(),
332            item_type: self.item_type.clone(),
333            item_vector: self.item_vector.clone(),
334        }))
335    }
336}
337
338#[cfg(test)]
339mod tests {
340    use std::sync::Arc;
341
342    use arrow::array::{Int64Array, StringArray, UInt32Array};
343    use arrow::datatypes::{Int64Type, UInt32Type};
344
345    use super::*;
346
347    // Helper function to create a test dictionary vector with string values
348    fn create_test_dictionary() -> DictionaryVector<Int64Type> {
349        // Dictionary values: ["a", "b", "c", "d"]
350        // Keys: [0, 1, 2, null, 1, 3]
351        // Resulting in: ["a", "b", "c", null, "b", "d"]
352        let values = StringArray::from(vec!["a", "b", "c", "d"]);
353        let keys = Int64Array::from(vec![Some(0), Some(1), Some(2), None, Some(1), Some(3)]);
354        let dict_array = DictionaryArray::new(keys, Arc::new(values));
355        DictionaryVector::<Int64Type>::try_from(dict_array).unwrap()
356    }
357
358    #[test]
359    fn test_dictionary_vector_basics() {
360        let dict_vec = create_test_dictionary();
361
362        // Test length and null count
363        assert_eq!(dict_vec.len(), 6);
364        assert_eq!(dict_vec.null_count(), 1);
365
366        // Test data type
367        let data_type = dict_vec.data_type();
368        if let ConcreteDataType::Dictionary(dict_type) = data_type {
369            assert_eq!(*dict_type.value_type(), ConcreteDataType::string_datatype());
370        } else {
371            panic!("Expected Dictionary data type");
372        }
373
374        // Test is_null
375        assert!(!dict_vec.is_null(0));
376        assert!(dict_vec.is_null(3));
377
378        // Test get values
379        assert_eq!(dict_vec.get(0), Value::String("a".to_string().into()));
380        assert_eq!(dict_vec.get(1), Value::String("b".to_string().into()));
381        assert_eq!(dict_vec.get(3), Value::Null);
382        assert_eq!(dict_vec.get(4), Value::String("b".to_string().into()));
383    }
384
385    #[test]
386    fn test_slice() {
387        let dict_vec = create_test_dictionary();
388        let sliced = dict_vec.slice(1, 3);
389
390        assert_eq!(sliced.len(), 3);
391        assert_eq!(sliced.get(0), Value::String("b".to_string().into()));
392        assert_eq!(sliced.get(1), Value::String("c".to_string().into()));
393        assert_eq!(sliced.get(2), Value::Null);
394    }
395
396    #[test]
397    fn test_replicate() {
398        let dict_vec = create_test_dictionary();
399
400        // Replicate with offsets [0, 2, 5] - should get values at these indices
401        let offsets = vec![0, 2, 5];
402        let replicated = dict_vec.replicate(&offsets);
403        assert_eq!(replicated.len(), 5);
404        assert_eq!(replicated.get(0), Value::String("b".to_string().into()));
405        assert_eq!(replicated.get(1), Value::String("b".to_string().into()));
406        assert_eq!(replicated.get(2), Value::String("c".to_string().into()));
407        assert_eq!(replicated.get(3), Value::String("c".to_string().into()));
408        assert_eq!(replicated.get(4), Value::String("c".to_string().into()));
409    }
410
411    #[test]
412    fn test_filter() {
413        let dict_vec = create_test_dictionary();
414
415        // Keep only indices 0, 2, 4
416        let filter_values = vec![true, false, true, false, true, false];
417        let filter = vectors::BooleanVector::from(filter_values);
418
419        let filtered = dict_vec.filter(&filter).unwrap();
420        assert_eq!(filtered.len(), 3);
421
422        // Check the values
423        assert_eq!(filtered.get(0), Value::String("a".to_string().into()));
424        assert_eq!(filtered.get(1), Value::String("c".to_string().into()));
425        assert_eq!(filtered.get(2), Value::String("b".to_string().into()));
426    }
427
428    #[test]
429    fn test_cast() {
430        let dict_vec = create_test_dictionary();
431
432        // Cast to the same type should return an equivalent vector
433        let casted = dict_vec.cast(&ConcreteDataType::string_datatype()).unwrap();
434
435        // The returned vector should have string values
436        assert_eq!(
437            casted.data_type(),
438            ConcreteDataType::Dictionary(DictionaryType::new(
439                ConcreteDataType::int64_datatype(),
440                ConcreteDataType::string_datatype(),
441            ))
442        );
443        assert_eq!(casted.len(), dict_vec.len());
444
445        // Values should match the original dictionary lookups
446        assert_eq!(casted.get(0), Value::String("a".to_string().into()));
447        assert_eq!(casted.get(1), Value::String("b".to_string().into()));
448        assert_eq!(casted.get(2), Value::String("c".to_string().into()));
449        assert_eq!(casted.get(3), Value::Null);
450        assert_eq!(casted.get(4), Value::String("b".to_string().into()));
451        assert_eq!(casted.get(5), Value::String("d".to_string().into()));
452    }
453
454    #[test]
455    fn test_take() {
456        let dict_vec = create_test_dictionary();
457
458        // Take indices 2, 0, 4
459        let indices_vec = vec![Some(2u32), Some(0), Some(4)];
460        let indices = vectors::UInt32Vector::from(indices_vec);
461
462        let taken = dict_vec.take(&indices).unwrap();
463        assert_eq!(taken.len(), 3);
464
465        // Check the values
466        assert_eq!(taken.get(0), Value::String("c".to_string().into()));
467        assert_eq!(taken.get(1), Value::String("a".to_string().into()));
468        assert_eq!(taken.get(2), Value::String("b".to_string().into()));
469    }
470
471    #[test]
472    fn test_other_type() {
473        let values = StringArray::from(vec!["a", "b", "c", "d"]);
474        let keys = UInt32Array::from(vec![Some(0), Some(1), Some(2), None, Some(1), Some(3)]);
475        let dict_array = DictionaryArray::new(keys, Arc::new(values));
476        let dict_vec = DictionaryVector::<UInt32Type>::try_from(dict_array).unwrap();
477        assert_eq!(
478            ConcreteDataType::dictionary_datatype(
479                ConcreteDataType::uint32_datatype(),
480                ConcreteDataType::string_datatype()
481            ),
482            dict_vec.data_type()
483        );
484    }
485}