1use std::any::Any;
16use std::fmt;
17use std::sync::Arc;
18
19use arrow::array::{Array, ArrayRef, DictionaryArray, PrimitiveArray, PrimitiveBuilder};
20use arrow::datatypes::{ArrowDictionaryKeyType, ArrowNativeType};
21use serde_json::Value as JsonValue;
22use snafu::ResultExt;
23
24use crate::data_type::ConcreteDataType;
25use crate::error::{self, Result};
26use crate::serialize::Serializable;
27use crate::types::DictionaryType;
28use crate::value::{Value, ValueRef};
29use crate::vectors::operations::VectorOp;
30use crate::vectors::{self, Helper, Validity, Vector, VectorRef};
31
32pub struct DictionaryVector<K: ArrowDictionaryKeyType> {
34 array: DictionaryArray<K>,
35 key_type: ConcreteDataType,
37 item_type: ConcreteDataType,
39 item_vector: VectorRef,
41}
42
43impl<K: ArrowDictionaryKeyType> fmt::Debug for DictionaryVector<K> {
44 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
45 f.debug_struct("DictionaryVector")
46 .field("array", &self.array)
47 .field("key_type", &self.key_type)
48 .field("item_type", &self.item_type)
49 .finish()
50 }
51}
52
53impl<K: ArrowDictionaryKeyType> PartialEq for DictionaryVector<K> {
54 fn eq(&self, other: &DictionaryVector<K>) -> bool {
55 self.array == other.array
56 && self.key_type == other.key_type
57 && self.item_type == other.item_type
58 }
59}
60
61impl<K: ArrowDictionaryKeyType> DictionaryVector<K> {
62 pub fn new(array: DictionaryArray<K>, item_type: ConcreteDataType) -> Result<Self> {
64 let key_type = ConcreteDataType::try_from(&K::DATA_TYPE)?;
65 let item_vector = Helper::try_into_vector(array.values())?;
66
67 Ok(Self {
68 array,
69 key_type,
70 item_type,
71 item_vector,
72 })
73 }
74
75 pub fn array(&self) -> &DictionaryArray<K> {
77 &self.array
78 }
79
80 pub fn keys(&self) -> &arrow_array::PrimitiveArray<K> {
82 self.array.keys()
83 }
84
85 pub fn values(&self) -> &ArrayRef {
87 self.array.values()
88 }
89
90 pub fn as_arrow(&self) -> &dyn Array {
91 &self.array
92 }
93}
94
95impl<K: ArrowDictionaryKeyType> Vector for DictionaryVector<K> {
96 fn data_type(&self) -> ConcreteDataType {
97 ConcreteDataType::Dictionary(DictionaryType::new(
98 self.key_type.clone(),
99 self.item_type.clone(),
100 ))
101 }
102
103 fn vector_type_name(&self) -> String {
104 "DictionaryVector".to_string()
105 }
106
107 fn as_any(&self) -> &dyn Any {
108 self
109 }
110
111 fn len(&self) -> usize {
112 self.array.len()
113 }
114
115 fn to_arrow_array(&self) -> ArrayRef {
116 Arc::new(self.array.clone())
117 }
118
119 fn to_boxed_arrow_array(&self) -> Box<dyn Array> {
120 Box::new(self.array.clone())
121 }
122
123 fn validity(&self) -> Validity {
124 vectors::impl_validity_for_vector!(self.array)
125 }
126
127 fn memory_size(&self) -> usize {
128 self.array.get_buffer_memory_size()
129 }
130
131 fn null_count(&self) -> usize {
132 self.array.null_count()
133 }
134
135 fn is_null(&self, row: usize) -> bool {
136 self.array.is_null(row)
137 }
138
139 fn slice(&self, offset: usize, length: usize) -> VectorRef {
140 Arc::new(Self {
141 array: self.array.slice(offset, length),
142 key_type: self.key_type.clone(),
143 item_type: self.item_type.clone(),
144 item_vector: self.item_vector.clone(),
145 })
146 }
147
148 fn get(&self, index: usize) -> Value {
149 if !self.array.is_valid(index) {
150 return Value::Null;
151 }
152
153 let key = self.array.keys().value(index);
154 self.item_vector.get(key.as_usize())
155 }
156
157 fn get_ref(&self, index: usize) -> ValueRef {
158 if !self.array.is_valid(index) {
159 return ValueRef::Null;
160 }
161
162 let key = self.array.keys().value(index);
163 self.item_vector.get_ref(key.as_usize())
164 }
165}
166
167impl<K: ArrowDictionaryKeyType> Serializable for DictionaryVector<K> {
168 fn serialize_to_json(&self) -> Result<Vec<JsonValue>> {
169 let mut result = Vec::with_capacity(self.len());
172
173 for i in 0..self.len() {
174 if self.is_null(i) {
175 result.push(JsonValue::Null);
176 } else {
177 let key = self.array.keys().value(i);
178 let value = self.item_vector.get(key.as_usize());
179 let json_value = serde_json::to_value(value).context(error::SerializeSnafu)?;
180 result.push(json_value);
181 }
182 }
183
184 Ok(result)
185 }
186}
187
188impl<K: ArrowDictionaryKeyType> TryFrom<DictionaryArray<K>> for DictionaryVector<K> {
189 type Error = crate::error::Error;
190
191 fn try_from(array: DictionaryArray<K>) -> Result<Self> {
192 let key_type = ConcreteDataType::try_from(array.keys().data_type())?;
193 let item_type = ConcreteDataType::try_from(array.values().data_type())?;
194 let item_vector = Helper::try_into_vector(array.values())?;
195
196 Ok(Self {
197 array,
198 key_type,
199 item_type,
200 item_vector,
201 })
202 }
203}
204
205pub struct DictionaryIter<'a, K: ArrowDictionaryKeyType> {
206 vector: &'a DictionaryVector<K>,
207 idx: usize,
208}
209
210impl<'a, K: ArrowDictionaryKeyType> DictionaryIter<'a, K> {
211 pub fn new(vector: &'a DictionaryVector<K>) -> DictionaryIter<'a, K> {
212 DictionaryIter { vector, idx: 0 }
213 }
214}
215
216impl<'a, K: ArrowDictionaryKeyType> Iterator for DictionaryIter<'a, K> {
217 type Item = Option<ValueRef<'a>>;
218
219 #[inline]
220 fn next(&mut self) -> Option<Self::Item> {
221 if self.idx >= self.vector.len() {
222 return None;
223 }
224
225 let idx = self.idx;
226 self.idx += 1;
227
228 if self.vector.is_null(idx) {
229 return Some(None);
230 }
231
232 Some(Some(self.vector.get_ref(idx)))
233 }
234
235 #[inline]
236 fn size_hint(&self) -> (usize, Option<usize>) {
237 (
238 self.vector.len() - self.idx,
239 Some(self.vector.len() - self.idx),
240 )
241 }
242}
243
244impl<K: ArrowDictionaryKeyType> VectorOp for DictionaryVector<K> {
245 fn replicate(&self, offsets: &[usize]) -> VectorRef {
246 let keys = self.array.keys();
247 let mut replicated_keys = PrimitiveBuilder::new();
248
249 let mut previous_offset = 0;
250 for (i, &offset) in offsets.iter().enumerate() {
251 let key = if i < self.len() {
252 if keys.is_valid(i) {
253 Some(keys.value(i))
254 } else {
255 None
256 }
257 } else {
258 None
259 };
260
261 let repeat_count = offset - previous_offset;
263 for _ in 0..repeat_count {
264 replicated_keys.append_option(key);
265 }
266
267 previous_offset = offset;
268 }
269
270 let new_keys = replicated_keys.finish();
271 let new_array = DictionaryArray::try_new(new_keys, self.values().clone())
272 .expect("Failed to create replicated dictionary array");
273
274 Arc::new(Self {
275 array: new_array,
276 key_type: self.key_type.clone(),
277 item_type: self.item_type.clone(),
278 item_vector: self.item_vector.clone(),
279 })
280 }
281
282 fn filter(&self, filter: &vectors::BooleanVector) -> Result<VectorRef> {
283 let key_array: ArrayRef = Arc::new(self.array.keys().clone());
284 let key_vector = Helper::try_into_vector(&key_array)?;
285 let filtered_key_vector = key_vector.filter(filter)?;
286 let filtered_key_array = filtered_key_vector.to_arrow_array();
287 let filtered_key_array = filtered_key_array
288 .as_any()
289 .downcast_ref::<PrimitiveArray<K>>()
290 .unwrap();
291
292 let new_array = DictionaryArray::try_new(filtered_key_array.clone(), self.values().clone())
293 .expect("Failed to create filtered dictionary array");
294
295 Ok(Arc::new(Self {
296 array: new_array,
297 key_type: self.key_type.clone(),
298 item_type: self.item_type.clone(),
299 item_vector: self.item_vector.clone(),
300 }))
301 }
302
303 fn cast(&self, to_type: &ConcreteDataType) -> Result<VectorRef> {
304 let new_items = self.item_vector.cast(to_type)?;
305 let new_array =
306 DictionaryArray::try_new(self.array.keys().clone(), new_items.to_arrow_array())
307 .expect("Failed to create casted dictionary array");
308 Ok(Arc::new(Self {
309 array: new_array,
310 key_type: self.key_type.clone(),
311 item_type: to_type.clone(),
312 item_vector: self.item_vector.clone(),
313 }))
314 }
315
316 fn take(&self, indices: &vectors::UInt32Vector) -> Result<VectorRef> {
317 let key_array: ArrayRef = Arc::new(self.array.keys().clone());
318 let key_vector = Helper::try_into_vector(&key_array)?;
319 let new_key_vector = key_vector.take(indices)?;
320 let new_key_array = new_key_vector.to_arrow_array();
321 let new_key_array = new_key_array
322 .as_any()
323 .downcast_ref::<PrimitiveArray<K>>()
324 .unwrap();
325
326 let new_array = DictionaryArray::try_new(new_key_array.clone(), self.values().clone())
327 .expect("Failed to create filtered dictionary array");
328
329 Ok(Arc::new(Self {
330 array: new_array,
331 key_type: self.key_type.clone(),
332 item_type: self.item_type.clone(),
333 item_vector: self.item_vector.clone(),
334 }))
335 }
336}
337
338#[cfg(test)]
339mod tests {
340 use std::sync::Arc;
341
342 use arrow::array::{Int64Array, StringArray, UInt32Array};
343 use arrow::datatypes::{Int64Type, UInt32Type};
344
345 use super::*;
346
347 fn create_test_dictionary() -> DictionaryVector<Int64Type> {
349 let values = StringArray::from(vec!["a", "b", "c", "d"]);
353 let keys = Int64Array::from(vec![Some(0), Some(1), Some(2), None, Some(1), Some(3)]);
354 let dict_array = DictionaryArray::new(keys, Arc::new(values));
355 DictionaryVector::<Int64Type>::try_from(dict_array).unwrap()
356 }
357
358 #[test]
359 fn test_dictionary_vector_basics() {
360 let dict_vec = create_test_dictionary();
361
362 assert_eq!(dict_vec.len(), 6);
364 assert_eq!(dict_vec.null_count(), 1);
365
366 let data_type = dict_vec.data_type();
368 if let ConcreteDataType::Dictionary(dict_type) = data_type {
369 assert_eq!(*dict_type.value_type(), ConcreteDataType::string_datatype());
370 } else {
371 panic!("Expected Dictionary data type");
372 }
373
374 assert!(!dict_vec.is_null(0));
376 assert!(dict_vec.is_null(3));
377
378 assert_eq!(dict_vec.get(0), Value::String("a".to_string().into()));
380 assert_eq!(dict_vec.get(1), Value::String("b".to_string().into()));
381 assert_eq!(dict_vec.get(3), Value::Null);
382 assert_eq!(dict_vec.get(4), Value::String("b".to_string().into()));
383 }
384
385 #[test]
386 fn test_slice() {
387 let dict_vec = create_test_dictionary();
388 let sliced = dict_vec.slice(1, 3);
389
390 assert_eq!(sliced.len(), 3);
391 assert_eq!(sliced.get(0), Value::String("b".to_string().into()));
392 assert_eq!(sliced.get(1), Value::String("c".to_string().into()));
393 assert_eq!(sliced.get(2), Value::Null);
394 }
395
396 #[test]
397 fn test_replicate() {
398 let dict_vec = create_test_dictionary();
399
400 let offsets = vec![0, 2, 5];
402 let replicated = dict_vec.replicate(&offsets);
403 assert_eq!(replicated.len(), 5);
404 assert_eq!(replicated.get(0), Value::String("b".to_string().into()));
405 assert_eq!(replicated.get(1), Value::String("b".to_string().into()));
406 assert_eq!(replicated.get(2), Value::String("c".to_string().into()));
407 assert_eq!(replicated.get(3), Value::String("c".to_string().into()));
408 assert_eq!(replicated.get(4), Value::String("c".to_string().into()));
409 }
410
411 #[test]
412 fn test_filter() {
413 let dict_vec = create_test_dictionary();
414
415 let filter_values = vec![true, false, true, false, true, false];
417 let filter = vectors::BooleanVector::from(filter_values);
418
419 let filtered = dict_vec.filter(&filter).unwrap();
420 assert_eq!(filtered.len(), 3);
421
422 assert_eq!(filtered.get(0), Value::String("a".to_string().into()));
424 assert_eq!(filtered.get(1), Value::String("c".to_string().into()));
425 assert_eq!(filtered.get(2), Value::String("b".to_string().into()));
426 }
427
428 #[test]
429 fn test_cast() {
430 let dict_vec = create_test_dictionary();
431
432 let casted = dict_vec.cast(&ConcreteDataType::string_datatype()).unwrap();
434
435 assert_eq!(
437 casted.data_type(),
438 ConcreteDataType::Dictionary(DictionaryType::new(
439 ConcreteDataType::int64_datatype(),
440 ConcreteDataType::string_datatype(),
441 ))
442 );
443 assert_eq!(casted.len(), dict_vec.len());
444
445 assert_eq!(casted.get(0), Value::String("a".to_string().into()));
447 assert_eq!(casted.get(1), Value::String("b".to_string().into()));
448 assert_eq!(casted.get(2), Value::String("c".to_string().into()));
449 assert_eq!(casted.get(3), Value::Null);
450 assert_eq!(casted.get(4), Value::String("b".to_string().into()));
451 assert_eq!(casted.get(5), Value::String("d".to_string().into()));
452 }
453
454 #[test]
455 fn test_take() {
456 let dict_vec = create_test_dictionary();
457
458 let indices_vec = vec![Some(2u32), Some(0), Some(4)];
460 let indices = vectors::UInt32Vector::from(indices_vec);
461
462 let taken = dict_vec.take(&indices).unwrap();
463 assert_eq!(taken.len(), 3);
464
465 assert_eq!(taken.get(0), Value::String("c".to_string().into()));
467 assert_eq!(taken.get(1), Value::String("a".to_string().into()));
468 assert_eq!(taken.get(2), Value::String("b".to_string().into()));
469 }
470
471 #[test]
472 fn test_other_type() {
473 let values = StringArray::from(vec!["a", "b", "c", "d"]);
474 let keys = UInt32Array::from(vec![Some(0), Some(1), Some(2), None, Some(1), Some(3)]);
475 let dict_array = DictionaryArray::new(keys, Arc::new(values));
476 let dict_vec = DictionaryVector::<UInt32Type>::try_from(dict_array).unwrap();
477 assert_eq!(
478 ConcreteDataType::dictionary_datatype(
479 ConcreteDataType::uint32_datatype(),
480 ConcreteDataType::string_datatype()
481 ),
482 dict_vec.data_type()
483 );
484 }
485}