1use std::any::Any;
16use std::sync::Arc;
17
18use arrow::array::{Array, ArrayBuilder, ArrayIter, ArrayRef};
19use snafu::ResultExt;
20
21use crate::arrow_array::{
22 LargeStringArray, MutableLargeStringArray, MutableStringArray, MutableStringViewArray,
23 StringArray, StringViewArray,
24};
25use crate::data_type::ConcreteDataType;
26use crate::error::{self, Result};
27use crate::scalars::{ScalarVector, ScalarVectorBuilder};
28use crate::serialize::Serializable;
29use crate::value::{Value, ValueRef};
30use crate::vectors::{self, MutableVector, Validity, Vector, VectorRef};
31
32#[derive(Debug, PartialEq)]
34enum StringArrayData {
35 String(StringArray),
36 LargeString(LargeStringArray),
37 StringView(StringViewArray),
38}
39
40#[derive(Debug, PartialEq)]
42pub struct StringVector {
43 array: StringArrayData,
44}
45
46impl StringVector {
47 pub(crate) fn as_arrow(&self) -> &dyn Array {
48 match &self.array {
49 StringArrayData::String(array) => array,
50 StringArrayData::LargeString(array) => array,
51 StringArrayData::StringView(array) => array,
52 }
53 }
54
55 pub fn from_string_array(array: StringArray) -> Self {
57 Self {
58 array: StringArrayData::String(array),
59 }
60 }
61
62 pub fn from_large_string_array(array: LargeStringArray) -> Self {
64 Self {
65 array: StringArrayData::LargeString(array),
66 }
67 }
68
69 pub fn from_string_view_array(array: StringViewArray) -> Self {
71 Self {
72 array: StringArrayData::StringView(array),
73 }
74 }
75
76 pub fn from_slice<T: AsRef<str>>(slice: &[T]) -> Self {
77 Self::from_string_array(StringArray::from_iter(
78 slice.iter().map(|s| Some(s.as_ref())),
79 ))
80 }
81}
82
83impl From<StringArray> for StringVector {
84 fn from(array: StringArray) -> Self {
85 Self::from_string_array(array)
86 }
87}
88
89impl From<LargeStringArray> for StringVector {
90 fn from(array: LargeStringArray) -> Self {
91 Self::from_large_string_array(array)
92 }
93}
94
95impl From<StringViewArray> for StringVector {
96 fn from(array: StringViewArray) -> Self {
97 Self::from_string_view_array(array)
98 }
99}
100
101impl From<Vec<Option<String>>> for StringVector {
102 fn from(data: Vec<Option<String>>) -> Self {
103 Self::from_string_array(StringArray::from_iter(data))
104 }
105}
106
107impl From<Vec<Option<&str>>> for StringVector {
108 fn from(data: Vec<Option<&str>>) -> Self {
109 Self::from_string_array(StringArray::from_iter(data))
110 }
111}
112
113impl From<&[Option<String>]> for StringVector {
114 fn from(data: &[Option<String>]) -> Self {
115 Self::from_string_array(StringArray::from_iter(data))
116 }
117}
118
119impl From<&[Option<&str>]> for StringVector {
120 fn from(data: &[Option<&str>]) -> Self {
121 Self::from_string_array(StringArray::from_iter(data))
122 }
123}
124
125impl From<Vec<String>> for StringVector {
126 fn from(data: Vec<String>) -> Self {
127 Self::from_string_array(StringArray::from_iter(data.into_iter().map(Some)))
128 }
129}
130
131impl From<Vec<&str>> for StringVector {
132 fn from(data: Vec<&str>) -> Self {
133 Self::from_string_array(StringArray::from_iter(data.into_iter().map(Some)))
134 }
135}
136
137impl Vector for StringVector {
138 fn data_type(&self) -> ConcreteDataType {
139 match &self.array {
140 StringArrayData::String(_) => ConcreteDataType::string_datatype(),
141 StringArrayData::LargeString(_) => ConcreteDataType::large_string_datatype(),
142 StringArrayData::StringView(_) => ConcreteDataType::utf8_view_datatype(),
143 }
144 }
145
146 fn vector_type_name(&self) -> String {
147 "StringVector".to_string()
148 }
149
150 fn as_any(&self) -> &dyn Any {
151 self
152 }
153
154 fn len(&self) -> usize {
155 match &self.array {
156 StringArrayData::String(array) => array.len(),
157 StringArrayData::LargeString(array) => array.len(),
158 StringArrayData::StringView(array) => array.len(),
159 }
160 }
161
162 fn to_arrow_array(&self) -> ArrayRef {
163 match &self.array {
164 StringArrayData::String(array) => Arc::new(array.clone()),
165 StringArrayData::LargeString(array) => Arc::new(array.clone()),
166 StringArrayData::StringView(array) => Arc::new(array.clone()),
167 }
168 }
169
170 fn to_boxed_arrow_array(&self) -> Box<dyn Array> {
171 match &self.array {
172 StringArrayData::String(array) => Box::new(array.clone()),
173 StringArrayData::LargeString(array) => Box::new(array.clone()),
174 StringArrayData::StringView(array) => Box::new(array.clone()),
175 }
176 }
177
178 fn validity(&self) -> Validity {
179 match &self.array {
180 StringArrayData::String(array) => vectors::impl_validity_for_vector!(array),
181 StringArrayData::LargeString(array) => vectors::impl_validity_for_vector!(array),
182 StringArrayData::StringView(array) => vectors::impl_validity_for_vector!(array),
183 }
184 }
185
186 fn memory_size(&self) -> usize {
187 match &self.array {
188 StringArrayData::String(array) => array.get_buffer_memory_size(),
189 StringArrayData::LargeString(array) => array.get_buffer_memory_size(),
190 StringArrayData::StringView(array) => array.get_buffer_memory_size(),
191 }
192 }
193
194 fn null_count(&self) -> usize {
195 match &self.array {
196 StringArrayData::String(array) => array.null_count(),
197 StringArrayData::LargeString(array) => array.null_count(),
198 StringArrayData::StringView(array) => array.null_count(),
199 }
200 }
201
202 fn is_null(&self, row: usize) -> bool {
203 match &self.array {
204 StringArrayData::String(array) => array.is_null(row),
205 StringArrayData::LargeString(array) => array.is_null(row),
206 StringArrayData::StringView(array) => array.is_null(row),
207 }
208 }
209
210 fn slice(&self, offset: usize, length: usize) -> VectorRef {
211 match &self.array {
212 StringArrayData::String(array) => {
213 Arc::new(Self::from_string_array(array.slice(offset, length)))
214 }
215 StringArrayData::LargeString(array) => {
216 Arc::new(Self::from_large_string_array(array.slice(offset, length)))
217 }
218 StringArrayData::StringView(array) => {
219 Arc::new(Self::from_string_view_array(array.slice(offset, length)))
220 }
221 }
222 }
223
224 fn get(&self, index: usize) -> Value {
225 match &self.array {
226 StringArrayData::String(array) => vectors::impl_get_for_vector!(array, index),
227 StringArrayData::LargeString(array) => vectors::impl_get_for_vector!(array, index),
228 StringArrayData::StringView(array) => vectors::impl_get_for_vector!(array, index),
229 }
230 }
231
232 fn get_ref(&self, index: usize) -> ValueRef<'_> {
233 match &self.array {
234 StringArrayData::String(array) => vectors::impl_get_ref_for_vector!(array, index),
235 StringArrayData::LargeString(array) => vectors::impl_get_ref_for_vector!(array, index),
236 StringArrayData::StringView(array) => vectors::impl_get_ref_for_vector!(array, index),
237 }
238 }
239}
240
241pub enum StringIter<'a> {
242 String(ArrayIter<&'a StringArray>),
243 LargeString(ArrayIter<&'a LargeStringArray>),
244 StringView(ArrayIter<&'a StringViewArray>),
245}
246
247impl<'a> Iterator for StringIter<'a> {
248 type Item = Option<&'a str>;
249
250 fn next(&mut self) -> Option<Self::Item> {
251 match self {
252 StringIter::String(iter) => iter.next(),
253 StringIter::LargeString(iter) => iter.next(),
254 StringIter::StringView(iter) => iter.next(),
255 }
256 }
257}
258
259impl ScalarVector for StringVector {
260 type OwnedItem = String;
261 type RefItem<'a> = &'a str;
262 type Iter<'a> = StringIter<'a>;
263 type Builder = StringVectorBuilder;
264
265 fn get_data(&self, idx: usize) -> Option<Self::RefItem<'_>> {
266 match &self.array {
267 StringArrayData::String(array) => {
268 if array.is_valid(idx) {
269 Some(array.value(idx))
270 } else {
271 None
272 }
273 }
274 StringArrayData::LargeString(array) => {
275 if array.is_valid(idx) {
276 Some(array.value(idx))
277 } else {
278 None
279 }
280 }
281 StringArrayData::StringView(array) => {
282 if array.is_valid(idx) {
283 Some(array.value(idx))
284 } else {
285 None
286 }
287 }
288 }
289 }
290
291 fn iter_data(&self) -> Self::Iter<'_> {
292 match &self.array {
293 StringArrayData::String(array) => StringIter::String(array.iter()),
294 StringArrayData::LargeString(array) => StringIter::LargeString(array.iter()),
295 StringArrayData::StringView(array) => StringIter::StringView(array.iter()),
296 }
297 }
298}
299
300enum MutableStringArrayData {
302 String(MutableStringArray),
303 LargeString(MutableLargeStringArray),
304 StringView(MutableStringViewArray),
305}
306
307pub struct StringVectorBuilder {
308 mutable_array: MutableStringArrayData,
309}
310
311impl Default for StringVectorBuilder {
312 fn default() -> Self {
313 Self::new()
314 }
315}
316
317impl StringVectorBuilder {
318 pub fn new() -> Self {
320 Self {
321 mutable_array: MutableStringArrayData::String(MutableStringArray::new()),
322 }
323 }
324
325 pub fn new_large() -> Self {
327 Self {
328 mutable_array: MutableStringArrayData::LargeString(MutableLargeStringArray::new()),
329 }
330 }
331
332 pub fn new_view() -> Self {
334 Self {
335 mutable_array: MutableStringArrayData::StringView(MutableStringViewArray::new()),
336 }
337 }
338
339 pub fn with_string_capacity(capacity: usize) -> Self {
341 Self {
342 mutable_array: MutableStringArrayData::String(MutableStringArray::with_capacity(
343 capacity, 0,
344 )),
345 }
346 }
347
348 pub fn with_large_capacity(capacity: usize) -> Self {
350 Self {
351 mutable_array: MutableStringArrayData::LargeString(
352 MutableLargeStringArray::with_capacity(capacity, 0),
353 ),
354 }
355 }
356
357 pub fn with_view_capacity(capacity: usize) -> Self {
359 Self {
360 mutable_array: MutableStringArrayData::StringView(
361 MutableStringViewArray::with_capacity(capacity),
362 ),
363 }
364 }
365}
366
367impl MutableVector for StringVectorBuilder {
368 fn data_type(&self) -> ConcreteDataType {
369 match &self.mutable_array {
370 MutableStringArrayData::String(_) => ConcreteDataType::string_datatype(),
371 MutableStringArrayData::LargeString(_) => ConcreteDataType::large_string_datatype(),
372 MutableStringArrayData::StringView(_) => ConcreteDataType::utf8_view_datatype(),
373 }
374 }
375
376 fn len(&self) -> usize {
377 match &self.mutable_array {
378 MutableStringArrayData::String(array) => array.len(),
379 MutableStringArrayData::LargeString(array) => array.len(),
380 MutableStringArrayData::StringView(array) => array.len(),
381 }
382 }
383
384 fn as_any(&self) -> &dyn Any {
385 self
386 }
387
388 fn as_mut_any(&mut self) -> &mut dyn Any {
389 self
390 }
391
392 fn to_vector(&mut self) -> VectorRef {
393 Arc::new(self.finish())
394 }
395
396 fn to_vector_cloned(&self) -> VectorRef {
397 Arc::new(self.finish_cloned())
398 }
399 fn try_push_value_ref(&mut self, value: &ValueRef) -> Result<()> {
400 match value.try_into_string()? {
401 Some(v) => match &mut self.mutable_array {
402 MutableStringArrayData::String(array) => array.append_value(v),
403 MutableStringArrayData::LargeString(array) => array.append_value(v),
404 MutableStringArrayData::StringView(array) => array.append_value(v),
405 },
406 None => match &mut self.mutable_array {
407 MutableStringArrayData::String(array) => array.append_null(),
408 MutableStringArrayData::LargeString(array) => array.append_null(),
409 MutableStringArrayData::StringView(array) => array.append_null(),
410 },
411 }
412 Ok(())
413 }
414
415 fn extend_slice_of(&mut self, vector: &dyn Vector, offset: usize, length: usize) -> Result<()> {
416 vectors::impl_extend_for_builder!(self, vector, StringVector, offset, length)
417 }
418
419 fn push_null(&mut self) {
420 match &mut self.mutable_array {
421 MutableStringArrayData::String(array) => array.append_null(),
422 MutableStringArrayData::LargeString(array) => array.append_null(),
423 MutableStringArrayData::StringView(array) => array.append_null(),
424 }
425 }
426}
427
428impl ScalarVectorBuilder for StringVectorBuilder {
429 type VectorType = StringVector;
430
431 fn with_capacity(capacity: usize) -> Self {
432 Self {
433 mutable_array: MutableStringArrayData::String(MutableStringArray::with_capacity(
434 capacity, 0,
435 )),
436 }
437 }
438
439 fn push(&mut self, value: Option<<Self::VectorType as ScalarVector>::RefItem<'_>>) {
440 match value {
441 Some(v) => match &mut self.mutable_array {
442 MutableStringArrayData::String(array) => array.append_value(v),
443 MutableStringArrayData::LargeString(array) => array.append_value(v),
444 MutableStringArrayData::StringView(array) => array.append_value(v),
445 },
446 None => match &mut self.mutable_array {
447 MutableStringArrayData::String(array) => array.append_null(),
448 MutableStringArrayData::LargeString(array) => array.append_null(),
449 MutableStringArrayData::StringView(array) => array.append_null(),
450 },
451 }
452 }
453
454 fn finish(&mut self) -> Self::VectorType {
455 match &mut self.mutable_array {
456 MutableStringArrayData::String(array) => {
457 StringVector::from_string_array(array.finish())
458 }
459 MutableStringArrayData::LargeString(array) => {
460 StringVector::from_large_string_array(array.finish())
461 }
462 MutableStringArrayData::StringView(array) => {
463 StringVector::from_string_view_array(array.finish())
464 }
465 }
466 }
467
468 fn finish_cloned(&self) -> Self::VectorType {
469 match &self.mutable_array {
470 MutableStringArrayData::String(array) => {
471 StringVector::from_string_array(array.finish_cloned())
472 }
473 MutableStringArrayData::LargeString(array) => {
474 StringVector::from_large_string_array(array.finish_cloned())
475 }
476 MutableStringArrayData::StringView(array) => {
477 StringVector::from_string_view_array(array.finish_cloned())
478 }
479 }
480 }
481}
482
483impl Serializable for StringVector {
484 fn serialize_to_json(&self) -> Result<Vec<serde_json::Value>> {
485 self.iter_data()
486 .map(serde_json::to_value)
487 .collect::<serde_json::Result<_>>()
488 .context(error::SerializeSnafu)
489 }
490}
491
492impl StringVector {
493 pub fn try_from_arrow_array(
494 array: impl AsRef<dyn Array>,
495 ) -> crate::error::Result<StringVector> {
496 let array = array.as_ref();
497
498 if let Some(string_array) = array.as_any().downcast_ref::<StringArray>() {
499 Ok(StringVector::from_string_array(string_array.clone()))
500 } else if let Some(large_string_array) = array.as_any().downcast_ref::<LargeStringArray>() {
501 Ok(StringVector::from_large_string_array(
502 large_string_array.clone(),
503 ))
504 } else if let Some(string_view_array) = array.as_any().downcast_ref::<StringViewArray>() {
505 Ok(StringVector::from_string_view_array(
506 string_view_array.clone(),
507 ))
508 } else {
509 Err(crate::error::UnsupportedArrowTypeSnafu {
510 arrow_type: array.data_type().clone(),
511 }
512 .build())
513 }
514 }
515}
516
#[cfg(test)]
mod tests {

    use std::vec;

    use arrow::datatypes::DataType;

    use super::*;

    /// Asserts that `vector` holds exactly `["hello", null, "world"]` through every accessor.
    fn assert_hello_null_world(vector: &StringVector) {
        assert_eq!(Some("hello"), vector.get_data(0));
        assert_eq!(None, vector.get_data(1));
        assert_eq!(Some("world"), vector.get_data(2));

        // Index 3 is one past the last element.
        assert!(vector.try_get(3).is_err());

        assert_eq!(Value::String("hello".into()), vector.get(0));
        assert_eq!(Value::Null, vector.get(1));
        assert_eq!(Value::String("world".into()), vector.get(2));

        let mut elements = vector.iter_data();
        assert_eq!("hello", elements.next().unwrap().unwrap());
        assert_eq!(None, elements.next().unwrap());
        assert_eq!("world", elements.next().unwrap().unwrap());
        assert_eq!(None, elements.next());
    }

    /// Pushes `["hello", null, "world"]` into `builder`.
    fn push_hello_null_world(builder: &mut StringVectorBuilder) {
        builder.push(Some("hello"));
        builder.push(None);
        builder.push(Some("world"));
    }

    /// A `StringArray`-backed vector returns what was pushed into its builder.
    #[test]
    fn test_string_vector_build_get() {
        let mut builder = StringVectorBuilder::with_capacity(4);
        push_hello_null_world(&mut builder);
        let vector = builder.finish();

        assert_hello_null_world(&vector);
    }

    /// A `StringViewArray`-backed vector behaves the same and keeps its view data type.
    #[test]
    fn test_string_view_vector_build_get() {
        let mut builder = StringVectorBuilder::with_view_capacity(4);
        push_hello_null_world(&mut builder);
        let vector = builder.finish();

        assert_eq!(ConcreteDataType::utf8_view_datatype(), vector.data_type());
        assert_hello_null_world(&vector);

        let arrow_array = vector.to_arrow_array();
        assert_eq!(&DataType::Utf8View, arrow_array.data_type());
    }

    /// Value refs and slices of other vectors can be appended; mismatched types are rejected.
    #[test]
    fn test_string_vector_builder() {
        let mut builder = StringVectorBuilder::with_capacity(3);
        builder.push_value_ref(&ValueRef::String("hello"));
        assert!(builder.try_push_value_ref(&ValueRef::Int32(123)).is_err());

        let source = StringVector::from_slice(&["world", "one", "two"]);
        builder.extend_slice_of(&source, 1, 2).unwrap();

        let wrong_type = crate::vectors::Int32Vector::from_slice([13]);
        assert!(builder.extend_slice_of(&wrong_type, 0, 1).is_err());

        let actual = builder.to_vector();
        let expected: VectorRef = Arc::new(StringVector::from_slice(&["hello", "one", "two"]));
        assert_eq!(expected, actual);
    }

    /// Exercises the metadata accessors and the per-element getters.
    #[test]
    fn test_string_vector_misc() {
        let strs = vec!["hello", "greptime", "rust"];
        let v = StringVector::from(strs.clone());
        assert_eq!(3, v.len());
        assert_eq!("StringVector", v.vector_type_name());
        assert!(!v.is_const());
        assert!(v.validity().is_all_valid());
        assert!(!v.only_null());
        assert_eq!(1040, v.memory_size());

        for (i, &expected) in strs.iter().enumerate() {
            assert_eq!(Value::from(expected), v.get(i));
            assert_eq!(ValueRef::from(expected), v.get_ref(i));
            assert_eq!(Value::from(expected), v.try_get(i).unwrap());
        }

        let arrow_array = v.to_arrow_array();
        assert_eq!(3, arrow_array.len());
        assert_eq!(&DataType::Utf8, arrow_array.data_type());
    }

    /// Nulls serialize as JSON `null`, strings as JSON strings.
    #[test]
    fn test_serialize_string_vector() {
        let mut builder = StringVectorBuilder::with_capacity(3);
        push_hello_null_world(&mut builder);
        let string_vector = builder.finish();

        let json = serde_json::to_string(&string_vector.serialize_to_json().unwrap()).unwrap();
        assert_eq!(r#"["hello",null,"world"]"#, json);
    }

    /// A vector can be created directly from an Arrow `StringArray`.
    #[test]
    fn test_from_arrow_array() {
        let mut arrow_builder = MutableStringArray::new();
        arrow_builder.append_option(Some("A"));
        arrow_builder.append_option(Some("B"));
        arrow_builder.append_null();
        arrow_builder.append_option(Some("D"));
        let string_array: StringArray = arrow_builder.finish();

        let vector = StringVector::from(string_array);
        let json = serde_json::to_string(&vector.serialize_to_json().unwrap()).unwrap();
        assert_eq!(r#"["A","B",null,"D"]"#, json);
    }

    /// Non-optional inputs, including multi-byte characters and a NUL character, round-trip.
    #[test]
    fn test_from_non_option_string() {
        let nul = String::from_utf8(vec![0]).unwrap();
        let borrowed = vec!["😅😅😅", "😍😍😍😍", "🥵🥵", nul.as_str()];
        let vector = StringVector::from(borrowed);
        let json = serde_json::to_string(&vector.serialize_to_json().unwrap()).unwrap();
        assert_eq!(r#"["😅😅😅","😍😍😍😍","🥵🥵","\u0000"]"#, json);

        let owned = vec![
            "🀀🀀🀀".to_string(),
            "🀁🀁🀁".to_string(),
            "🀂🀂🀂".to_string(),
            "🀃🀃🀃".to_string(),
            "🀆🀆".to_string(),
        ];
        let vector = StringVector::from(owned);
        let json = serde_json::to_string(&vector.serialize_to_json().unwrap()).unwrap();
        assert_eq!(r#"["🀀🀀🀀","🀁🀁🀁","🀂🀂🀂","🀃🀃🀃","🀆🀆"]"#, json);
    }

    /// `finish_cloned` yields the pushed data and leaves the builder's contents in place.
    #[test]
    fn test_string_vector_builder_finish_cloned() {
        let mut builder = StringVectorBuilder::with_capacity(1024);
        for digit in ["1", "2", "3"] {
            builder.push(Some(digit));
        }

        let vector = builder.finish_cloned();
        assert_eq!(vector.len(), 3);
        assert_eq!(
            r#"["1","2","3"]"#,
            serde_json::to_string(&vector.serialize_to_json().unwrap()).unwrap(),
        );
        assert_eq!(builder.len(), 3);
    }
}