1use std::any::Any;
16use std::fmt;
17use std::sync::Arc;
18
19use arrow::array::{Array, ArrayRef, DictionaryArray, PrimitiveArray, PrimitiveBuilder};
20use arrow::datatypes::{ArrowDictionaryKeyType, ArrowNativeType};
21use serde_json::Value as JsonValue;
22use snafu::ResultExt;
23
24use crate::data_type::ConcreteDataType;
25use crate::error::{self, Result};
26use crate::serialize::Serializable;
27use crate::types::DictionaryType;
28use crate::value::{Value, ValueRef};
29use crate::vectors::operations::VectorOp;
30use crate::vectors::{self, Helper, Validity, Vector, VectorRef};
31
32pub struct DictionaryVector<K: ArrowDictionaryKeyType> {
34 array: DictionaryArray<K>,
35 key_type: ConcreteDataType,
37 item_type: ConcreteDataType,
39 item_vector: VectorRef,
41}
42
43impl<K: ArrowDictionaryKeyType> fmt::Debug for DictionaryVector<K> {
44 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
45 f.debug_struct("DictionaryVector")
46 .field("array", &self.array)
47 .field("key_type", &self.key_type)
48 .field("item_type", &self.item_type)
49 .finish()
50 }
51}
52
53impl<K: ArrowDictionaryKeyType> PartialEq for DictionaryVector<K> {
54 fn eq(&self, other: &DictionaryVector<K>) -> bool {
55 self.array == other.array
56 && self.key_type == other.key_type
57 && self.item_type == other.item_type
58 }
59}
60
61impl<K: ArrowDictionaryKeyType> DictionaryVector<K> {
62 pub fn new(array: DictionaryArray<K>, item_type: ConcreteDataType) -> Result<Self> {
64 let key_type = ConcreteDataType::try_from(&K::DATA_TYPE)?;
65 let item_vector = Helper::try_into_vector(array.values())?;
66
67 Ok(Self {
68 array,
69 key_type,
70 item_type,
71 item_vector,
72 })
73 }
74
75 pub fn array(&self) -> &DictionaryArray<K> {
77 &self.array
78 }
79
80 pub fn keys(&self) -> &arrow_array::PrimitiveArray<K> {
82 self.array.keys()
83 }
84
85 pub fn values(&self) -> &ArrayRef {
87 self.array.values()
88 }
89
90 pub fn as_arrow(&self) -> &dyn Array {
91 &self.array
92 }
93}
94
95impl<K: ArrowDictionaryKeyType> Vector for DictionaryVector<K> {
96 fn data_type(&self) -> ConcreteDataType {
97 ConcreteDataType::Dictionary(DictionaryType::new(
98 self.key_type.clone(),
99 self.item_type.clone(),
100 ))
101 }
102
103 fn vector_type_name(&self) -> String {
104 "DictionaryVector".to_string()
105 }
106
107 fn as_any(&self) -> &dyn Any {
108 self
109 }
110
111 fn len(&self) -> usize {
112 self.array.len()
113 }
114
115 fn to_arrow_array(&self) -> ArrayRef {
116 Arc::new(self.array.clone())
117 }
118
119 fn to_boxed_arrow_array(&self) -> Box<dyn Array> {
120 Box::new(self.array.clone())
121 }
122
123 fn validity(&self) -> Validity {
124 vectors::impl_validity_for_vector!(self.array)
125 }
126
127 fn memory_size(&self) -> usize {
128 self.array.get_buffer_memory_size()
129 }
130
131 fn null_count(&self) -> usize {
132 self.array.null_count()
133 }
134
135 fn is_null(&self, row: usize) -> bool {
136 self.array.is_null(row)
137 }
138
139 fn slice(&self, offset: usize, length: usize) -> VectorRef {
140 Arc::new(Self {
141 array: self.array.slice(offset, length),
142 key_type: self.key_type.clone(),
143 item_type: self.item_type.clone(),
144 item_vector: self.item_vector.clone(),
145 })
146 }
147
148 fn get(&self, index: usize) -> Value {
149 if !self.array.is_valid(index) {
150 return Value::Null;
151 }
152
153 let key = self.array.keys().value(index);
154 self.item_vector.get(key.as_usize())
155 }
156
157 fn get_ref(&self, index: usize) -> ValueRef<'_> {
158 if !self.array.is_valid(index) {
159 return ValueRef::Null;
160 }
161
162 let key = self.array.keys().value(index);
163 self.item_vector.get_ref(key.as_usize())
164 }
165}
166
167impl<K: ArrowDictionaryKeyType> Serializable for DictionaryVector<K> {
168 fn serialize_to_json(&self) -> Result<Vec<JsonValue>> {
169 let mut result = Vec::with_capacity(self.len());
172
173 let keys = self.array.keys();
174 let key_values = &keys.values()[..self.len()];
175 for (i, &key) in key_values.iter().enumerate() {
176 if self.is_null(i) {
177 result.push(JsonValue::Null);
178 } else {
179 let value = self.item_vector.get(key.as_usize());
180 let json_value = serde_json::to_value(value).context(error::SerializeSnafu)?;
181 result.push(json_value);
182 }
183 }
184
185 Ok(result)
186 }
187}
188
189impl<K: ArrowDictionaryKeyType> TryFrom<DictionaryArray<K>> for DictionaryVector<K> {
190 type Error = crate::error::Error;
191
192 fn try_from(array: DictionaryArray<K>) -> Result<Self> {
193 let key_type = ConcreteDataType::try_from(array.keys().data_type())?;
194 let item_type = ConcreteDataType::try_from(array.values().data_type())?;
195 let item_vector = Helper::try_into_vector(array.values())?;
196
197 Ok(Self {
198 array,
199 key_type,
200 item_type,
201 item_vector,
202 })
203 }
204}
205
206pub struct DictionaryIter<'a, K: ArrowDictionaryKeyType> {
207 vector: &'a DictionaryVector<K>,
208 idx: usize,
209}
210
211impl<'a, K: ArrowDictionaryKeyType> DictionaryIter<'a, K> {
212 pub fn new(vector: &'a DictionaryVector<K>) -> DictionaryIter<'a, K> {
213 DictionaryIter { vector, idx: 0 }
214 }
215}
216
217impl<'a, K: ArrowDictionaryKeyType> Iterator for DictionaryIter<'a, K> {
218 type Item = Option<ValueRef<'a>>;
219
220 #[inline]
221 fn next(&mut self) -> Option<Self::Item> {
222 if self.idx >= self.vector.len() {
223 return None;
224 }
225
226 let idx = self.idx;
227 self.idx += 1;
228
229 if self.vector.is_null(idx) {
230 return Some(None);
231 }
232
233 Some(Some(self.vector.get_ref(idx)))
234 }
235
236 #[inline]
237 fn size_hint(&self) -> (usize, Option<usize>) {
238 (
239 self.vector.len() - self.idx,
240 Some(self.vector.len() - self.idx),
241 )
242 }
243}
244
245impl<K: ArrowDictionaryKeyType> VectorOp for DictionaryVector<K> {
246 fn replicate(&self, offsets: &[usize]) -> VectorRef {
247 let keys = self.array.keys();
248 let mut replicated_keys = PrimitiveBuilder::new();
249
250 let mut previous_offset = 0;
251 let mut key_iter = keys.iter().chain(std::iter::repeat(None));
252 for &offset in offsets {
253 let key = key_iter.next().unwrap();
254
255 let repeat_count = offset - previous_offset;
257 for _ in 0..repeat_count {
258 replicated_keys.append_option(key);
259 }
260
261 previous_offset = offset;
262 }
263
264 let new_keys = replicated_keys.finish();
265 let new_array = DictionaryArray::try_new(new_keys, self.values().clone())
266 .expect("Failed to create replicated dictionary array");
267
268 Arc::new(Self {
269 array: new_array,
270 key_type: self.key_type.clone(),
271 item_type: self.item_type.clone(),
272 item_vector: self.item_vector.clone(),
273 })
274 }
275
276 fn filter(&self, filter: &vectors::BooleanVector) -> Result<VectorRef> {
277 let key_array: ArrayRef = Arc::new(self.array.keys().clone());
278 let key_vector = Helper::try_into_vector(&key_array)?;
279 let filtered_key_vector = key_vector.filter(filter)?;
280 let filtered_key_array = filtered_key_vector.to_arrow_array();
281 let filtered_key_array = filtered_key_array
282 .as_any()
283 .downcast_ref::<PrimitiveArray<K>>()
284 .unwrap();
285
286 let new_array = DictionaryArray::try_new(filtered_key_array.clone(), self.values().clone())
287 .expect("Failed to create filtered dictionary array");
288
289 Ok(Arc::new(Self {
290 array: new_array,
291 key_type: self.key_type.clone(),
292 item_type: self.item_type.clone(),
293 item_vector: self.item_vector.clone(),
294 }))
295 }
296
297 fn cast(&self, to_type: &ConcreteDataType) -> Result<VectorRef> {
298 let new_items = self.item_vector.cast(to_type)?;
299 let new_array =
300 DictionaryArray::try_new(self.array.keys().clone(), new_items.to_arrow_array())
301 .expect("Failed to create casted dictionary array");
302 Ok(Arc::new(Self {
303 array: new_array,
304 key_type: self.key_type.clone(),
305 item_type: to_type.clone(),
306 item_vector: self.item_vector.clone(),
307 }))
308 }
309
310 fn take(&self, indices: &vectors::UInt32Vector) -> Result<VectorRef> {
311 let key_array: ArrayRef = Arc::new(self.array.keys().clone());
312 let key_vector = Helper::try_into_vector(&key_array)?;
313 let new_key_vector = key_vector.take(indices)?;
314 let new_key_array = new_key_vector.to_arrow_array();
315 let new_key_array = new_key_array
316 .as_any()
317 .downcast_ref::<PrimitiveArray<K>>()
318 .unwrap();
319
320 let new_array = DictionaryArray::try_new(new_key_array.clone(), self.values().clone())
321 .expect("Failed to create filtered dictionary array");
322
323 Ok(Arc::new(Self {
324 array: new_array,
325 key_type: self.key_type.clone(),
326 item_type: self.item_type.clone(),
327 item_vector: self.item_vector.clone(),
328 }))
329 }
330}
331
332#[cfg(test)]
333mod tests {
334 use std::sync::Arc;
335
336 use arrow::array::{Int64Array, StringArray, UInt32Array};
337 use arrow::datatypes::{Int64Type, UInt32Type};
338
339 use super::*;
340
341 fn create_test_dictionary() -> DictionaryVector<Int64Type> {
343 let values = StringArray::from(vec!["a", "b", "c", "d"]);
347 let keys = Int64Array::from(vec![Some(0), Some(1), Some(2), None, Some(1), Some(3)]);
348 let dict_array = DictionaryArray::new(keys, Arc::new(values));
349 DictionaryVector::<Int64Type>::try_from(dict_array).unwrap()
350 }
351
352 #[test]
353 fn test_dictionary_vector_basics() {
354 let dict_vec = create_test_dictionary();
355
356 assert_eq!(dict_vec.len(), 6);
358 assert_eq!(dict_vec.null_count(), 1);
359
360 let data_type = dict_vec.data_type();
362 if let ConcreteDataType::Dictionary(dict_type) = data_type {
363 assert_eq!(*dict_type.value_type(), ConcreteDataType::string_datatype());
364 } else {
365 panic!("Expected Dictionary data type");
366 }
367
368 assert!(!dict_vec.is_null(0));
370 assert!(dict_vec.is_null(3));
371
372 assert_eq!(dict_vec.get(0), Value::String("a".to_string().into()));
374 assert_eq!(dict_vec.get(1), Value::String("b".to_string().into()));
375 assert_eq!(dict_vec.get(3), Value::Null);
376 assert_eq!(dict_vec.get(4), Value::String("b".to_string().into()));
377 }
378
379 #[test]
380 fn test_slice() {
381 let dict_vec = create_test_dictionary();
382 let sliced = dict_vec.slice(1, 3);
383
384 assert_eq!(sliced.len(), 3);
385 assert_eq!(sliced.get(0), Value::String("b".to_string().into()));
386 assert_eq!(sliced.get(1), Value::String("c".to_string().into()));
387 assert_eq!(sliced.get(2), Value::Null);
388 }
389
390 #[test]
391 fn test_replicate() {
392 let dict_vec = create_test_dictionary();
393
394 let offsets = vec![0, 2, 5];
396 let replicated = dict_vec.replicate(&offsets);
397 assert_eq!(replicated.len(), 5);
398 assert_eq!(replicated.get(0), Value::String("b".to_string().into()));
399 assert_eq!(replicated.get(1), Value::String("b".to_string().into()));
400 assert_eq!(replicated.get(2), Value::String("c".to_string().into()));
401 assert_eq!(replicated.get(3), Value::String("c".to_string().into()));
402 assert_eq!(replicated.get(4), Value::String("c".to_string().into()));
403 }
404
405 #[test]
406 fn test_filter() {
407 let dict_vec = create_test_dictionary();
408
409 let filter_values = vec![true, false, true, false, true, false];
411 let filter = vectors::BooleanVector::from(filter_values);
412
413 let filtered = dict_vec.filter(&filter).unwrap();
414 assert_eq!(filtered.len(), 3);
415
416 assert_eq!(filtered.get(0), Value::String("a".to_string().into()));
418 assert_eq!(filtered.get(1), Value::String("c".to_string().into()));
419 assert_eq!(filtered.get(2), Value::String("b".to_string().into()));
420 }
421
422 #[test]
423 fn test_cast() {
424 let dict_vec = create_test_dictionary();
425
426 let casted = dict_vec.cast(&ConcreteDataType::string_datatype()).unwrap();
428
429 assert_eq!(
431 casted.data_type(),
432 ConcreteDataType::Dictionary(DictionaryType::new(
433 ConcreteDataType::int64_datatype(),
434 ConcreteDataType::string_datatype(),
435 ))
436 );
437 assert_eq!(casted.len(), dict_vec.len());
438
439 assert_eq!(casted.get(0), Value::String("a".to_string().into()));
441 assert_eq!(casted.get(1), Value::String("b".to_string().into()));
442 assert_eq!(casted.get(2), Value::String("c".to_string().into()));
443 assert_eq!(casted.get(3), Value::Null);
444 assert_eq!(casted.get(4), Value::String("b".to_string().into()));
445 assert_eq!(casted.get(5), Value::String("d".to_string().into()));
446 }
447
448 #[test]
449 fn test_take() {
450 let dict_vec = create_test_dictionary();
451
452 let indices_vec = vec![Some(2u32), Some(0), Some(4)];
454 let indices = vectors::UInt32Vector::from(indices_vec);
455
456 let taken = dict_vec.take(&indices).unwrap();
457 assert_eq!(taken.len(), 3);
458
459 assert_eq!(taken.get(0), Value::String("c".to_string().into()));
461 assert_eq!(taken.get(1), Value::String("a".to_string().into()));
462 assert_eq!(taken.get(2), Value::String("b".to_string().into()));
463 }
464
465 #[test]
466 fn test_other_type() {
467 let values = StringArray::from(vec!["a", "b", "c", "d"]);
468 let keys = UInt32Array::from(vec![Some(0), Some(1), Some(2), None, Some(1), Some(3)]);
469 let dict_array = DictionaryArray::new(keys, Arc::new(values));
470 let dict_vec = DictionaryVector::<UInt32Type>::try_from(dict_array).unwrap();
471 assert_eq!(
472 ConcreteDataType::dictionary_datatype(
473 ConcreteDataType::uint32_datatype(),
474 ConcreteDataType::string_datatype()
475 ),
476 dict_vec.data_type()
477 );
478 }
479}