datatypes/vectors/
string.rs

1// Copyright 2023 Greptime Team
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15use std::any::Any;
16use std::sync::Arc;
17
18use arrow::array::{Array, ArrayBuilder, ArrayIter, ArrayRef};
19use snafu::ResultExt;
20
21use crate::arrow_array::{
22    LargeStringArray, MutableLargeStringArray, MutableStringArray, MutableStringViewArray,
23    StringArray, StringViewArray,
24};
25use crate::data_type::ConcreteDataType;
26use crate::error::{self, Result};
27use crate::scalars::{ScalarVector, ScalarVectorBuilder};
28use crate::serialize::Serializable;
29use crate::value::{Value, ValueRef};
30use crate::vectors::{self, MutableVector, Validity, Vector, VectorRef};
31
32/// Internal representation for string arrays
33#[derive(Debug, PartialEq)]
34enum StringArrayData {
35    String(StringArray),
36    LargeString(LargeStringArray),
37    StringView(StringViewArray),
38}
39
40/// Vector of strings.
41#[derive(Debug, PartialEq)]
42pub struct StringVector {
43    array: StringArrayData,
44}
45
46impl StringVector {
47    pub(crate) fn as_arrow(&self) -> &dyn Array {
48        match &self.array {
49            StringArrayData::String(array) => array,
50            StringArrayData::LargeString(array) => array,
51            StringArrayData::StringView(array) => array,
52        }
53    }
54
55    /// Create a StringVector from a regular StringArray
56    pub fn from_string_array(array: StringArray) -> Self {
57        Self {
58            array: StringArrayData::String(array),
59        }
60    }
61
62    /// Create a StringVector from a LargeStringArray
63    pub fn from_large_string_array(array: LargeStringArray) -> Self {
64        Self {
65            array: StringArrayData::LargeString(array),
66        }
67    }
68
69    /// Create a StringVector from a StringViewArray
70    pub fn from_string_view_array(array: StringViewArray) -> Self {
71        Self {
72            array: StringArrayData::StringView(array),
73        }
74    }
75
76    pub fn from_slice<T: AsRef<str>>(slice: &[T]) -> Self {
77        Self::from_string_array(StringArray::from_iter(
78            slice.iter().map(|s| Some(s.as_ref())),
79        ))
80    }
81}
82
83impl From<StringArray> for StringVector {
84    fn from(array: StringArray) -> Self {
85        Self::from_string_array(array)
86    }
87}
88
89impl From<LargeStringArray> for StringVector {
90    fn from(array: LargeStringArray) -> Self {
91        Self::from_large_string_array(array)
92    }
93}
94
95impl From<StringViewArray> for StringVector {
96    fn from(array: StringViewArray) -> Self {
97        Self::from_string_view_array(array)
98    }
99}
100
101impl From<Vec<Option<String>>> for StringVector {
102    fn from(data: Vec<Option<String>>) -> Self {
103        Self::from_string_array(StringArray::from_iter(data))
104    }
105}
106
107impl From<Vec<Option<&str>>> for StringVector {
108    fn from(data: Vec<Option<&str>>) -> Self {
109        Self::from_string_array(StringArray::from_iter(data))
110    }
111}
112
113impl From<&[Option<String>]> for StringVector {
114    fn from(data: &[Option<String>]) -> Self {
115        Self::from_string_array(StringArray::from_iter(data))
116    }
117}
118
119impl From<&[Option<&str>]> for StringVector {
120    fn from(data: &[Option<&str>]) -> Self {
121        Self::from_string_array(StringArray::from_iter(data))
122    }
123}
124
125impl From<Vec<String>> for StringVector {
126    fn from(data: Vec<String>) -> Self {
127        Self::from_string_array(StringArray::from_iter(data.into_iter().map(Some)))
128    }
129}
130
131impl From<Vec<&str>> for StringVector {
132    fn from(data: Vec<&str>) -> Self {
133        Self::from_string_array(StringArray::from_iter(data.into_iter().map(Some)))
134    }
135}
136
137impl Vector for StringVector {
138    fn data_type(&self) -> ConcreteDataType {
139        match &self.array {
140            StringArrayData::String(_) => ConcreteDataType::string_datatype(),
141            StringArrayData::LargeString(_) => ConcreteDataType::large_string_datatype(),
142            StringArrayData::StringView(_) => ConcreteDataType::utf8_view_datatype(),
143        }
144    }
145
146    fn vector_type_name(&self) -> String {
147        "StringVector".to_string()
148    }
149
150    fn as_any(&self) -> &dyn Any {
151        self
152    }
153
154    fn len(&self) -> usize {
155        match &self.array {
156            StringArrayData::String(array) => array.len(),
157            StringArrayData::LargeString(array) => array.len(),
158            StringArrayData::StringView(array) => array.len(),
159        }
160    }
161
162    fn to_arrow_array(&self) -> ArrayRef {
163        match &self.array {
164            StringArrayData::String(array) => Arc::new(array.clone()),
165            StringArrayData::LargeString(array) => Arc::new(array.clone()),
166            StringArrayData::StringView(array) => Arc::new(array.clone()),
167        }
168    }
169
170    fn to_boxed_arrow_array(&self) -> Box<dyn Array> {
171        match &self.array {
172            StringArrayData::String(array) => Box::new(array.clone()),
173            StringArrayData::LargeString(array) => Box::new(array.clone()),
174            StringArrayData::StringView(array) => Box::new(array.clone()),
175        }
176    }
177
178    fn validity(&self) -> Validity {
179        match &self.array {
180            StringArrayData::String(array) => vectors::impl_validity_for_vector!(array),
181            StringArrayData::LargeString(array) => vectors::impl_validity_for_vector!(array),
182            StringArrayData::StringView(array) => vectors::impl_validity_for_vector!(array),
183        }
184    }
185
186    fn memory_size(&self) -> usize {
187        match &self.array {
188            StringArrayData::String(array) => array.get_buffer_memory_size(),
189            StringArrayData::LargeString(array) => array.get_buffer_memory_size(),
190            StringArrayData::StringView(array) => array.get_buffer_memory_size(),
191        }
192    }
193
194    fn null_count(&self) -> usize {
195        match &self.array {
196            StringArrayData::String(array) => array.null_count(),
197            StringArrayData::LargeString(array) => array.null_count(),
198            StringArrayData::StringView(array) => array.null_count(),
199        }
200    }
201
202    fn is_null(&self, row: usize) -> bool {
203        match &self.array {
204            StringArrayData::String(array) => array.is_null(row),
205            StringArrayData::LargeString(array) => array.is_null(row),
206            StringArrayData::StringView(array) => array.is_null(row),
207        }
208    }
209
210    fn slice(&self, offset: usize, length: usize) -> VectorRef {
211        match &self.array {
212            StringArrayData::String(array) => {
213                Arc::new(Self::from_string_array(array.slice(offset, length)))
214            }
215            StringArrayData::LargeString(array) => {
216                Arc::new(Self::from_large_string_array(array.slice(offset, length)))
217            }
218            StringArrayData::StringView(array) => {
219                Arc::new(Self::from_string_view_array(array.slice(offset, length)))
220            }
221        }
222    }
223
224    fn get(&self, index: usize) -> Value {
225        match &self.array {
226            StringArrayData::String(array) => vectors::impl_get_for_vector!(array, index),
227            StringArrayData::LargeString(array) => vectors::impl_get_for_vector!(array, index),
228            StringArrayData::StringView(array) => vectors::impl_get_for_vector!(array, index),
229        }
230    }
231
232    fn get_ref(&self, index: usize) -> ValueRef<'_> {
233        match &self.array {
234            StringArrayData::String(array) => vectors::impl_get_ref_for_vector!(array, index),
235            StringArrayData::LargeString(array) => vectors::impl_get_ref_for_vector!(array, index),
236            StringArrayData::StringView(array) => vectors::impl_get_ref_for_vector!(array, index),
237        }
238    }
239}
240
241pub enum StringIter<'a> {
242    String(ArrayIter<&'a StringArray>),
243    LargeString(ArrayIter<&'a LargeStringArray>),
244    StringView(ArrayIter<&'a StringViewArray>),
245}
246
247impl<'a> Iterator for StringIter<'a> {
248    type Item = Option<&'a str>;
249
250    fn next(&mut self) -> Option<Self::Item> {
251        match self {
252            StringIter::String(iter) => iter.next(),
253            StringIter::LargeString(iter) => iter.next(),
254            StringIter::StringView(iter) => iter.next(),
255        }
256    }
257}
258
259impl ScalarVector for StringVector {
260    type OwnedItem = String;
261    type RefItem<'a> = &'a str;
262    type Iter<'a> = StringIter<'a>;
263    type Builder = StringVectorBuilder;
264
265    fn get_data(&self, idx: usize) -> Option<Self::RefItem<'_>> {
266        match &self.array {
267            StringArrayData::String(array) => {
268                if array.is_valid(idx) {
269                    Some(array.value(idx))
270                } else {
271                    None
272                }
273            }
274            StringArrayData::LargeString(array) => {
275                if array.is_valid(idx) {
276                    Some(array.value(idx))
277                } else {
278                    None
279                }
280            }
281            StringArrayData::StringView(array) => {
282                if array.is_valid(idx) {
283                    Some(array.value(idx))
284                } else {
285                    None
286                }
287            }
288        }
289    }
290
291    fn iter_data(&self) -> Self::Iter<'_> {
292        match &self.array {
293            StringArrayData::String(array) => StringIter::String(array.iter()),
294            StringArrayData::LargeString(array) => StringIter::LargeString(array.iter()),
295            StringArrayData::StringView(array) => StringIter::StringView(array.iter()),
296        }
297    }
298}
299
300/// Internal representation for mutable string arrays
301enum MutableStringArrayData {
302    String(MutableStringArray),
303    LargeString(MutableLargeStringArray),
304    StringView(MutableStringViewArray),
305}
306
307pub struct StringVectorBuilder {
308    mutable_array: MutableStringArrayData,
309}
310
311impl Default for StringVectorBuilder {
312    fn default() -> Self {
313        Self::new()
314    }
315}
316
317impl StringVectorBuilder {
318    /// Create a builder for regular strings
319    pub fn new() -> Self {
320        Self {
321            mutable_array: MutableStringArrayData::String(MutableStringArray::new()),
322        }
323    }
324
325    /// Create a builder for large strings
326    pub fn new_large() -> Self {
327        Self {
328            mutable_array: MutableStringArrayData::LargeString(MutableLargeStringArray::new()),
329        }
330    }
331
332    /// Create a builder for view strings
333    pub fn new_view() -> Self {
334        Self {
335            mutable_array: MutableStringArrayData::StringView(MutableStringViewArray::new()),
336        }
337    }
338
339    /// Create a builder for regular strings with capacity
340    pub fn with_string_capacity(capacity: usize) -> Self {
341        Self {
342            mutable_array: MutableStringArrayData::String(MutableStringArray::with_capacity(
343                capacity, 0,
344            )),
345        }
346    }
347
348    /// Create a builder for large strings with capacity
349    pub fn with_large_capacity(capacity: usize) -> Self {
350        Self {
351            mutable_array: MutableStringArrayData::LargeString(
352                MutableLargeStringArray::with_capacity(capacity, 0),
353            ),
354        }
355    }
356
357    /// Create a builder for view strings with capacity
358    pub fn with_view_capacity(capacity: usize) -> Self {
359        Self {
360            mutable_array: MutableStringArrayData::StringView(
361                MutableStringViewArray::with_capacity(capacity),
362            ),
363        }
364    }
365}
366
367impl MutableVector for StringVectorBuilder {
368    fn data_type(&self) -> ConcreteDataType {
369        match &self.mutable_array {
370            MutableStringArrayData::String(_) => ConcreteDataType::string_datatype(),
371            MutableStringArrayData::LargeString(_) => ConcreteDataType::large_string_datatype(),
372            MutableStringArrayData::StringView(_) => ConcreteDataType::utf8_view_datatype(),
373        }
374    }
375
376    fn len(&self) -> usize {
377        match &self.mutable_array {
378            MutableStringArrayData::String(array) => array.len(),
379            MutableStringArrayData::LargeString(array) => array.len(),
380            MutableStringArrayData::StringView(array) => array.len(),
381        }
382    }
383
384    fn as_any(&self) -> &dyn Any {
385        self
386    }
387
388    fn as_mut_any(&mut self) -> &mut dyn Any {
389        self
390    }
391
392    fn to_vector(&mut self) -> VectorRef {
393        Arc::new(self.finish())
394    }
395
396    fn to_vector_cloned(&self) -> VectorRef {
397        Arc::new(self.finish_cloned())
398    }
399    fn try_push_value_ref(&mut self, value: &ValueRef) -> Result<()> {
400        match value.try_into_string()? {
401            Some(v) => match &mut self.mutable_array {
402                MutableStringArrayData::String(array) => array.append_value(v),
403                MutableStringArrayData::LargeString(array) => array.append_value(v),
404                MutableStringArrayData::StringView(array) => array.append_value(v),
405            },
406            None => match &mut self.mutable_array {
407                MutableStringArrayData::String(array) => array.append_null(),
408                MutableStringArrayData::LargeString(array) => array.append_null(),
409                MutableStringArrayData::StringView(array) => array.append_null(),
410            },
411        }
412        Ok(())
413    }
414
415    fn extend_slice_of(&mut self, vector: &dyn Vector, offset: usize, length: usize) -> Result<()> {
416        vectors::impl_extend_for_builder!(self, vector, StringVector, offset, length)
417    }
418
419    fn push_null(&mut self) {
420        match &mut self.mutable_array {
421            MutableStringArrayData::String(array) => array.append_null(),
422            MutableStringArrayData::LargeString(array) => array.append_null(),
423            MutableStringArrayData::StringView(array) => array.append_null(),
424        }
425    }
426}
427
428impl ScalarVectorBuilder for StringVectorBuilder {
429    type VectorType = StringVector;
430
431    fn with_capacity(capacity: usize) -> Self {
432        Self {
433            mutable_array: MutableStringArrayData::String(MutableStringArray::with_capacity(
434                capacity, 0,
435            )),
436        }
437    }
438
439    fn push(&mut self, value: Option<<Self::VectorType as ScalarVector>::RefItem<'_>>) {
440        match value {
441            Some(v) => match &mut self.mutable_array {
442                MutableStringArrayData::String(array) => array.append_value(v),
443                MutableStringArrayData::LargeString(array) => array.append_value(v),
444                MutableStringArrayData::StringView(array) => array.append_value(v),
445            },
446            None => match &mut self.mutable_array {
447                MutableStringArrayData::String(array) => array.append_null(),
448                MutableStringArrayData::LargeString(array) => array.append_null(),
449                MutableStringArrayData::StringView(array) => array.append_null(),
450            },
451        }
452    }
453
454    fn finish(&mut self) -> Self::VectorType {
455        match &mut self.mutable_array {
456            MutableStringArrayData::String(array) => {
457                StringVector::from_string_array(array.finish())
458            }
459            MutableStringArrayData::LargeString(array) => {
460                StringVector::from_large_string_array(array.finish())
461            }
462            MutableStringArrayData::StringView(array) => {
463                StringVector::from_string_view_array(array.finish())
464            }
465        }
466    }
467
468    fn finish_cloned(&self) -> Self::VectorType {
469        match &self.mutable_array {
470            MutableStringArrayData::String(array) => {
471                StringVector::from_string_array(array.finish_cloned())
472            }
473            MutableStringArrayData::LargeString(array) => {
474                StringVector::from_large_string_array(array.finish_cloned())
475            }
476            MutableStringArrayData::StringView(array) => {
477                StringVector::from_string_view_array(array.finish_cloned())
478            }
479        }
480    }
481}
482
483impl Serializable for StringVector {
484    fn serialize_to_json(&self) -> Result<Vec<serde_json::Value>> {
485        self.iter_data()
486            .map(serde_json::to_value)
487            .collect::<serde_json::Result<_>>()
488            .context(error::SerializeSnafu)
489    }
490}
491
492impl StringVector {
493    pub fn try_from_arrow_array(
494        array: impl AsRef<dyn Array>,
495    ) -> crate::error::Result<StringVector> {
496        let array = array.as_ref();
497
498        if let Some(string_array) = array.as_any().downcast_ref::<StringArray>() {
499            Ok(StringVector::from_string_array(string_array.clone()))
500        } else if let Some(large_string_array) = array.as_any().downcast_ref::<LargeStringArray>() {
501            Ok(StringVector::from_large_string_array(
502                large_string_array.clone(),
503            ))
504        } else if let Some(string_view_array) = array.as_any().downcast_ref::<StringViewArray>() {
505            Ok(StringVector::from_string_view_array(
506                string_view_array.clone(),
507            ))
508        } else {
509            Err(crate::error::UnsupportedArrowTypeSnafu {
510                arrow_type: array.data_type().clone(),
511            }
512            .build())
513        }
514    }
515}
516
#[cfg(test)]
mod tests {

    use std::vec;

    use arrow::datatypes::DataType;

    use super::*;

    /// Serializes `vector` with `serialize_to_json` and renders the result as
    /// a compact JSON string.
    fn to_json(vector: &StringVector) -> String {
        serde_json::to_string(&vector.serialize_to_json().unwrap()).unwrap()
    }

    /// Pushes `"hello"`, null, `"world"` into `builder` and finishes it.
    fn build_hello_null_world(mut builder: StringVectorBuilder) -> StringVector {
        builder.push(Some("hello"));
        builder.push(None);
        builder.push(Some("world"));
        builder.finish()
    }

    /// Checks every read path of a vector holding `"hello"`, null, `"world"`.
    fn assert_hello_null_world(vector: &StringVector) {
        assert_eq!(Some("hello"), vector.get_data(0));
        assert_eq!(None, vector.get_data(1));
        assert_eq!(Some("world"), vector.get_data(2));

        // Index 3 is past the end.
        assert!(vector.try_get(3).is_err());

        assert_eq!(Value::String("hello".into()), vector.get(0));
        assert_eq!(Value::Null, vector.get(1));
        assert_eq!(Value::String("world".into()), vector.get(2));

        let mut entries = vector.iter_data();
        assert_eq!("hello", entries.next().unwrap().unwrap());
        assert_eq!(None, entries.next().unwrap());
        assert_eq!("world", entries.next().unwrap().unwrap());
        assert_eq!(None, entries.next());
    }

    #[test]
    fn test_string_vector_build_get() {
        let vector = build_hello_null_world(StringVectorBuilder::with_capacity(4));
        assert_hello_null_world(&vector);
    }

    #[test]
    fn test_string_view_vector_build_get() {
        let vector = build_hello_null_world(StringVectorBuilder::with_view_capacity(4));

        assert_eq!(ConcreteDataType::utf8_view_datatype(), vector.data_type());
        assert_hello_null_world(&vector);

        let arrow_array = vector.to_arrow_array();
        assert_eq!(&DataType::Utf8View, arrow_array.data_type());
    }

    #[test]
    fn test_string_vector_builder() {
        let mut builder = StringVectorBuilder::with_capacity(3);
        builder.push_value_ref(&ValueRef::String("hello"));
        // A non-string value must be rejected.
        assert!(builder.try_push_value_ref(&ValueRef::Int32(123)).is_err());

        let source = StringVector::from_slice(&["world", "one", "two"]);
        builder.extend_slice_of(&source, 1, 2).unwrap();
        // Extending from a vector of another type must be rejected as well.
        let wrong_type = crate::vectors::Int32Vector::from_slice([13]);
        assert!(builder.extend_slice_of(&wrong_type, 0, 1).is_err());
        let actual = builder.to_vector();

        let expected: VectorRef = Arc::new(StringVector::from_slice(&["hello", "one", "two"]));
        assert_eq!(expected, actual);
    }

    #[test]
    fn test_string_vector_misc() {
        let words = vec!["hello", "greptime", "rust"];
        let vector = StringVector::from(words.clone());
        assert_eq!(3, vector.len());
        assert_eq!("StringVector", vector.vector_type_name());
        assert!(!vector.is_const());
        assert!(vector.validity().is_all_valid());
        assert!(!vector.only_null());
        assert_eq!(1040, vector.memory_size());

        for (row, word) in words.iter().copied().enumerate() {
            assert_eq!(Value::from(word), vector.get(row));
            assert_eq!(ValueRef::from(word), vector.get_ref(row));
            assert_eq!(Value::from(word), vector.try_get(row).unwrap());
        }

        let arrow_array = vector.to_arrow_array();
        assert_eq!(3, arrow_array.len());
        assert_eq!(&DataType::Utf8, arrow_array.data_type());
    }

    #[test]
    fn test_serialize_string_vector() {
        let vector = build_hello_null_world(StringVectorBuilder::with_capacity(3));
        assert_eq!(r#"["hello",null,"world"]"#, to_json(&vector));
    }

    #[test]
    fn test_from_arrow_array() {
        let mut arrow_builder = MutableStringArray::new();
        arrow_builder.append_option(Some("A"));
        arrow_builder.append_option(Some("B"));
        arrow_builder.append_null();
        arrow_builder.append_option(Some("D"));
        let arrow_array: StringArray = arrow_builder.finish();

        let vector = StringVector::from(arrow_array);
        assert_eq!(r#"["A","B",null,"D"]"#, to_json(&vector));
    }

    #[test]
    fn test_from_non_option_string() {
        // Borrowed strings, including one that is a single NUL character.
        let nul = String::from_utf8(vec![0]).unwrap();
        let borrowed = vec!["😅😅😅", "😍😍😍😍", "🥵🥵", nul.as_str()];
        let vector = StringVector::from(borrowed);
        assert_eq!(r#"["😅😅😅","😍😍😍😍","🥵🥵","\u0000"]"#, to_json(&vector));

        // Owned strings.
        let owned: Vec<String> = ["🀀🀀🀀", "🀁🀁🀁", "🀂🀂🀂", "🀃🀃🀃", "🀆🀆"]
            .iter()
            .map(|text| text.to_string())
            .collect();
        let vector = StringVector::from(owned);
        assert_eq!(r#"["🀀🀀🀀","🀁🀁🀁","🀂🀂🀂","🀃🀃🀃","🀆🀆"]"#, to_json(&vector));
    }

    #[test]
    fn test_string_vector_builder_finish_cloned() {
        let mut builder = StringVectorBuilder::with_capacity(1024);
        for digit in ["1", "2", "3"] {
            builder.push(Some(digit));
        }

        let snapshot = builder.finish_cloned();
        assert_eq!(snapshot.len(), 3);
        assert_eq!(r#"["1","2","3"]"#, to_json(&snapshot));
        // The builder must still hold its entries after `finish_cloned`.
        assert_eq!(builder.len(), 3);
    }
}