datatypes/vectors/
string.rs

1// Copyright 2023 Greptime Team
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15use std::any::Any;
16use std::sync::Arc;
17
18use arrow::array::{Array, ArrayBuilder, ArrayIter, ArrayRef};
19use snafu::ResultExt;
20
21use crate::arrow_array::{
22    LargeStringArray, MutableLargeStringArray, MutableStringArray, StringArray,
23};
24use crate::data_type::ConcreteDataType;
25use crate::error::{self, Result};
26use crate::scalars::{ScalarVector, ScalarVectorBuilder};
27use crate::serialize::Serializable;
28use crate::value::{Value, ValueRef};
29use crate::vectors::{self, MutableVector, Validity, Vector, VectorRef};
30
/// Internal representation for string arrays.
///
/// Holds whichever of the two supported Arrow string array types the owning
/// [`StringVector`] was constructed from.
#[derive(Debug, PartialEq)]
enum StringArrayData {
    /// Backed by a regular `StringArray`.
    String(StringArray),
    /// Backed by a `LargeStringArray`.
    LargeString(LargeStringArray),
}
37
/// Vector of strings.
///
/// The underlying storage is either a `StringArray` or a `LargeStringArray`,
/// depending on which constructor or `From` conversion produced the vector.
#[derive(Debug, PartialEq)]
pub struct StringVector {
    /// The wrapped Arrow array (regular or large variant).
    array: StringArrayData,
}
43
44impl StringVector {
45    pub(crate) fn as_arrow(&self) -> &dyn Array {
46        match &self.array {
47            StringArrayData::String(array) => array,
48            StringArrayData::LargeString(array) => array,
49        }
50    }
51
52    /// Create a StringVector from a regular StringArray
53    pub fn from_string_array(array: StringArray) -> Self {
54        Self {
55            array: StringArrayData::String(array),
56        }
57    }
58
59    /// Create a StringVector from a LargeStringArray
60    pub fn from_large_string_array(array: LargeStringArray) -> Self {
61        Self {
62            array: StringArrayData::LargeString(array),
63        }
64    }
65
66    pub fn from_slice<T: AsRef<str>>(slice: &[T]) -> Self {
67        Self::from_string_array(StringArray::from_iter(
68            slice.iter().map(|s| Some(s.as_ref())),
69        ))
70    }
71}
72
73impl From<StringArray> for StringVector {
74    fn from(array: StringArray) -> Self {
75        Self::from_string_array(array)
76    }
77}
78
79impl From<LargeStringArray> for StringVector {
80    fn from(array: LargeStringArray) -> Self {
81        Self::from_large_string_array(array)
82    }
83}
84
85impl From<Vec<Option<String>>> for StringVector {
86    fn from(data: Vec<Option<String>>) -> Self {
87        Self::from_string_array(StringArray::from_iter(data))
88    }
89}
90
91impl From<Vec<Option<&str>>> for StringVector {
92    fn from(data: Vec<Option<&str>>) -> Self {
93        Self::from_string_array(StringArray::from_iter(data))
94    }
95}
96
97impl From<&[Option<String>]> for StringVector {
98    fn from(data: &[Option<String>]) -> Self {
99        Self::from_string_array(StringArray::from_iter(data))
100    }
101}
102
103impl From<&[Option<&str>]> for StringVector {
104    fn from(data: &[Option<&str>]) -> Self {
105        Self::from_string_array(StringArray::from_iter(data))
106    }
107}
108
109impl From<Vec<String>> for StringVector {
110    fn from(data: Vec<String>) -> Self {
111        Self::from_string_array(StringArray::from_iter(data.into_iter().map(Some)))
112    }
113}
114
115impl From<Vec<&str>> for StringVector {
116    fn from(data: Vec<&str>) -> Self {
117        Self::from_string_array(StringArray::from_iter(data.into_iter().map(Some)))
118    }
119}
120
121impl Vector for StringVector {
122    fn data_type(&self) -> ConcreteDataType {
123        ConcreteDataType::string_datatype()
124    }
125
126    fn vector_type_name(&self) -> String {
127        "StringVector".to_string()
128    }
129
130    fn as_any(&self) -> &dyn Any {
131        self
132    }
133
134    fn len(&self) -> usize {
135        match &self.array {
136            StringArrayData::String(array) => array.len(),
137            StringArrayData::LargeString(array) => array.len(),
138        }
139    }
140
141    fn to_arrow_array(&self) -> ArrayRef {
142        match &self.array {
143            StringArrayData::String(array) => Arc::new(array.clone()),
144            StringArrayData::LargeString(array) => Arc::new(array.clone()),
145        }
146    }
147
148    fn to_boxed_arrow_array(&self) -> Box<dyn Array> {
149        match &self.array {
150            StringArrayData::String(array) => Box::new(array.clone()),
151            StringArrayData::LargeString(array) => Box::new(array.clone()),
152        }
153    }
154
155    fn validity(&self) -> Validity {
156        match &self.array {
157            StringArrayData::String(array) => vectors::impl_validity_for_vector!(array),
158            StringArrayData::LargeString(array) => vectors::impl_validity_for_vector!(array),
159        }
160    }
161
162    fn memory_size(&self) -> usize {
163        match &self.array {
164            StringArrayData::String(array) => array.get_buffer_memory_size(),
165            StringArrayData::LargeString(array) => array.get_buffer_memory_size(),
166        }
167    }
168
169    fn null_count(&self) -> usize {
170        match &self.array {
171            StringArrayData::String(array) => array.null_count(),
172            StringArrayData::LargeString(array) => array.null_count(),
173        }
174    }
175
176    fn is_null(&self, row: usize) -> bool {
177        match &self.array {
178            StringArrayData::String(array) => array.is_null(row),
179            StringArrayData::LargeString(array) => array.is_null(row),
180        }
181    }
182
183    fn slice(&self, offset: usize, length: usize) -> VectorRef {
184        match &self.array {
185            StringArrayData::String(array) => {
186                Arc::new(Self::from_string_array(array.slice(offset, length)))
187            }
188            StringArrayData::LargeString(array) => {
189                Arc::new(Self::from_large_string_array(array.slice(offset, length)))
190            }
191        }
192    }
193
194    fn get(&self, index: usize) -> Value {
195        match &self.array {
196            StringArrayData::String(array) => vectors::impl_get_for_vector!(array, index),
197            StringArrayData::LargeString(array) => vectors::impl_get_for_vector!(array, index),
198        }
199    }
200
201    fn get_ref(&self, index: usize) -> ValueRef<'_> {
202        match &self.array {
203            StringArrayData::String(array) => vectors::impl_get_ref_for_vector!(array, index),
204            StringArrayData::LargeString(array) => vectors::impl_get_ref_for_vector!(array, index),
205        }
206    }
207}
208
/// Iterator over the entries of a [`StringVector`].
///
/// Wraps the Arrow `ArrayIter` of whichever array variant backs the vector.
pub enum StringIter<'a> {
    /// Iterates a regular `StringArray`.
    String(ArrayIter<&'a StringArray>),
    /// Iterates a `LargeStringArray`.
    LargeString(ArrayIter<&'a LargeStringArray>),
}
213
214impl<'a> Iterator for StringIter<'a> {
215    type Item = Option<&'a str>;
216
217    fn next(&mut self) -> Option<Self::Item> {
218        match self {
219            StringIter::String(iter) => iter.next(),
220            StringIter::LargeString(iter) => iter.next(),
221        }
222    }
223}
224
225impl ScalarVector for StringVector {
226    type OwnedItem = String;
227    type RefItem<'a> = &'a str;
228    type Iter<'a> = StringIter<'a>;
229    type Builder = StringVectorBuilder;
230
231    fn get_data(&self, idx: usize) -> Option<Self::RefItem<'_>> {
232        match &self.array {
233            StringArrayData::String(array) => {
234                if array.is_valid(idx) {
235                    Some(array.value(idx))
236                } else {
237                    None
238                }
239            }
240            StringArrayData::LargeString(array) => {
241                if array.is_valid(idx) {
242                    Some(array.value(idx))
243                } else {
244                    None
245                }
246            }
247        }
248    }
249
250    fn iter_data(&self) -> Self::Iter<'_> {
251        match &self.array {
252            StringArrayData::String(array) => StringIter::String(array.iter()),
253            StringArrayData::LargeString(array) => StringIter::LargeString(array.iter()),
254        }
255    }
256}
257
/// Internal representation for mutable string arrays.
///
/// Holds whichever of the two Arrow string builders the owning
/// [`StringVectorBuilder`] was created with.
enum MutableStringArrayData {
    /// Builder producing a regular `StringArray`.
    String(MutableStringArray),
    /// Builder producing a `LargeStringArray`.
    LargeString(MutableLargeStringArray),
}
263
/// Builder for [`StringVector`].
///
/// Wraps either a regular or a large Arrow string builder; the variant is fixed
/// by the constructor used to create the builder.
pub struct StringVectorBuilder {
    /// The wrapped Arrow builder (regular or large variant).
    mutable_array: MutableStringArrayData,
}
267
268impl Default for StringVectorBuilder {
269    fn default() -> Self {
270        Self::new()
271    }
272}
273
274impl StringVectorBuilder {
275    /// Create a builder for regular strings
276    pub fn new() -> Self {
277        Self {
278            mutable_array: MutableStringArrayData::String(MutableStringArray::new()),
279        }
280    }
281
282    /// Create a builder for large strings
283    pub fn new_large() -> Self {
284        Self {
285            mutable_array: MutableStringArrayData::LargeString(MutableLargeStringArray::new()),
286        }
287    }
288
289    /// Create a builder for regular strings with capacity
290    pub fn with_string_capacity(capacity: usize) -> Self {
291        Self {
292            mutable_array: MutableStringArrayData::String(MutableStringArray::with_capacity(
293                capacity, 0,
294            )),
295        }
296    }
297
298    /// Create a builder for large strings with capacity
299    pub fn with_large_capacity(capacity: usize) -> Self {
300        Self {
301            mutable_array: MutableStringArrayData::LargeString(
302                MutableLargeStringArray::with_capacity(capacity, 0),
303            ),
304        }
305    }
306}
307
308impl MutableVector for StringVectorBuilder {
309    fn data_type(&self) -> ConcreteDataType {
310        ConcreteDataType::string_datatype()
311    }
312
313    fn len(&self) -> usize {
314        match &self.mutable_array {
315            MutableStringArrayData::String(array) => array.len(),
316            MutableStringArrayData::LargeString(array) => array.len(),
317        }
318    }
319
320    fn as_any(&self) -> &dyn Any {
321        self
322    }
323
324    fn as_mut_any(&mut self) -> &mut dyn Any {
325        self
326    }
327
328    fn to_vector(&mut self) -> VectorRef {
329        Arc::new(self.finish())
330    }
331
332    fn to_vector_cloned(&self) -> VectorRef {
333        Arc::new(self.finish_cloned())
334    }
335    fn try_push_value_ref(&mut self, value: &ValueRef) -> Result<()> {
336        match value.try_into_string()? {
337            Some(v) => match &mut self.mutable_array {
338                MutableStringArrayData::String(array) => array.append_value(v),
339                MutableStringArrayData::LargeString(array) => array.append_value(v),
340            },
341            None => match &mut self.mutable_array {
342                MutableStringArrayData::String(array) => array.append_null(),
343                MutableStringArrayData::LargeString(array) => array.append_null(),
344            },
345        }
346        Ok(())
347    }
348
349    fn extend_slice_of(&mut self, vector: &dyn Vector, offset: usize, length: usize) -> Result<()> {
350        vectors::impl_extend_for_builder!(self, vector, StringVector, offset, length)
351    }
352
353    fn push_null(&mut self) {
354        match &mut self.mutable_array {
355            MutableStringArrayData::String(array) => array.append_null(),
356            MutableStringArrayData::LargeString(array) => array.append_null(),
357        }
358    }
359}
360
361impl ScalarVectorBuilder for StringVectorBuilder {
362    type VectorType = StringVector;
363
364    fn with_capacity(capacity: usize) -> Self {
365        Self {
366            mutable_array: MutableStringArrayData::String(MutableStringArray::with_capacity(
367                capacity, 0,
368            )),
369        }
370    }
371
372    fn push(&mut self, value: Option<<Self::VectorType as ScalarVector>::RefItem<'_>>) {
373        match value {
374            Some(v) => match &mut self.mutable_array {
375                MutableStringArrayData::String(array) => array.append_value(v),
376                MutableStringArrayData::LargeString(array) => array.append_value(v),
377            },
378            None => match &mut self.mutable_array {
379                MutableStringArrayData::String(array) => array.append_null(),
380                MutableStringArrayData::LargeString(array) => array.append_null(),
381            },
382        }
383    }
384
385    fn finish(&mut self) -> Self::VectorType {
386        match &mut self.mutable_array {
387            MutableStringArrayData::String(array) => {
388                StringVector::from_string_array(array.finish())
389            }
390            MutableStringArrayData::LargeString(array) => {
391                StringVector::from_large_string_array(array.finish())
392            }
393        }
394    }
395
396    fn finish_cloned(&self) -> Self::VectorType {
397        match &self.mutable_array {
398            MutableStringArrayData::String(array) => {
399                StringVector::from_string_array(array.finish_cloned())
400            }
401            MutableStringArrayData::LargeString(array) => {
402                StringVector::from_large_string_array(array.finish_cloned())
403            }
404        }
405    }
406}
407
408impl Serializable for StringVector {
409    fn serialize_to_json(&self) -> Result<Vec<serde_json::Value>> {
410        self.iter_data()
411            .map(serde_json::to_value)
412            .collect::<serde_json::Result<_>>()
413            .context(error::SerializeSnafu)
414    }
415}
416
417impl StringVector {
418    pub fn try_from_arrow_array(
419        array: impl AsRef<dyn Array>,
420    ) -> crate::error::Result<StringVector> {
421        let array = array.as_ref();
422
423        if let Some(string_array) = array.as_any().downcast_ref::<StringArray>() {
424            Ok(StringVector::from_string_array(string_array.clone()))
425        } else if let Some(large_string_array) = array.as_any().downcast_ref::<LargeStringArray>() {
426            Ok(StringVector::from_large_string_array(
427                large_string_array.clone(),
428            ))
429        } else {
430            Err(crate::error::UnsupportedArrowTypeSnafu {
431                arrow_type: array.data_type().clone(),
432            }
433            .build())
434        }
435    }
436}
437
// Unit tests for `StringVector` and `StringVectorBuilder`. Every builder here is
// created with `with_capacity`, i.e. the regular-string variant.
#[cfg(test)]
mod tests {

    use std::vec;

    use arrow::datatypes::DataType;

    use super::*;

    /// Builds a vector containing a null entry and reads it back through
    /// `get_data`, `try_get`, `get`, and `iter_data`.
    #[test]
    fn test_string_vector_build_get() {
        let mut builder = StringVectorBuilder::with_capacity(4);
        builder.push(Some("hello"));
        builder.push(None);
        builder.push(Some("world"));
        let vector = builder.finish();

        assert_eq!(Some("hello"), vector.get_data(0));
        assert_eq!(None, vector.get_data(1));
        assert_eq!(Some("world"), vector.get_data(2));

        // Get out of bound
        assert!(vector.try_get(3).is_err());

        assert_eq!(Value::String("hello".into()), vector.get(0));
        assert_eq!(Value::Null, vector.get(1));
        assert_eq!(Value::String("world".into()), vector.get(2));

        // The iterator yields `Some(None)` for the null entry and `None` once exhausted.
        let mut iter = vector.iter_data();
        assert_eq!("hello", iter.next().unwrap().unwrap());
        assert_eq!(None, iter.next().unwrap());
        assert_eq!("world", iter.next().unwrap().unwrap());
        assert_eq!(None, iter.next());
    }

    /// Exercises the `MutableVector` interface: pushing value refs, rejecting
    /// non-string values, and extending from a slice of another vector.
    #[test]
    fn test_string_vector_builder() {
        let mut builder = StringVectorBuilder::with_capacity(3);
        builder.push_value_ref(&ValueRef::String("hello"));
        // A non-string value must be rejected.
        assert!(builder.try_push_value_ref(&ValueRef::Int32(123)).is_err());

        let input = StringVector::from_slice(&["world", "one", "two"]);
        // Copies "one" and "two" (offset 1, length 2).
        builder.extend_slice_of(&input, 1, 2).unwrap();
        // Extending from a vector of a different type must fail.
        assert!(
            builder
                .extend_slice_of(&crate::vectors::Int32Vector::from_slice([13]), 0, 1)
                .is_err()
        );
        let vector = builder.to_vector();

        let expect: VectorRef = Arc::new(StringVector::from_slice(&["hello", "one", "two"]));
        assert_eq!(expect, vector);
    }

    /// Checks the miscellaneous `Vector` accessors on a vector built from `Vec<&str>`.
    #[test]
    fn test_string_vector_misc() {
        let strs = vec!["hello", "greptime", "rust"];
        let v = StringVector::from(strs.clone());
        assert_eq!(3, v.len());
        assert_eq!("StringVector", v.vector_type_name());
        assert!(!v.is_const());
        assert!(v.validity().is_all_valid());
        assert!(!v.only_null());
        // NOTE(review): this exact byte count presumably depends on the Arrow
        // builder's allocation strategy — may need updating when arrow changes.
        assert_eq!(1040, v.memory_size());

        for (i, s) in strs.iter().enumerate() {
            assert_eq!(Value::from(*s), v.get(i));
            assert_eq!(ValueRef::from(*s), v.get_ref(i));
            assert_eq!(Value::from(*s), v.try_get(i).unwrap());
        }

        // A vector built from `Vec<&str>` converts to an Arrow array of type `Utf8`.
        let arrow_arr = v.to_arrow_array();
        assert_eq!(3, arrow_arr.len());
        assert_eq!(&DataType::Utf8, arrow_arr.data_type());
    }

    /// Null entries serialize to JSON `null`.
    #[test]
    fn test_serialize_string_vector() {
        let mut builder = StringVectorBuilder::with_capacity(3);
        builder.push(Some("hello"));
        builder.push(None);
        builder.push(Some("world"));
        let string_vector = builder.finish();
        let serialized =
            serde_json::to_string(&string_vector.serialize_to_json().unwrap()).unwrap();
        assert_eq!(r#"["hello",null,"world"]"#, serialized);
    }

    /// Converts a `StringArray` built directly with the Arrow builder into a vector.
    #[test]
    fn test_from_arrow_array() {
        let mut builder = MutableStringArray::new();
        builder.append_option(Some("A"));
        builder.append_option(Some("B"));
        builder.append_null();
        builder.append_option(Some("D"));
        let string_array: StringArray = builder.finish();
        let vector = StringVector::from(string_array);
        assert_eq!(
            r#"["A","B",null,"D"]"#,
            serde_json::to_string(&vector.serialize_to_json().unwrap()).unwrap(),
        );
    }

    /// Covers `From<Vec<&str>>` and `From<Vec<String>>` with multi-byte characters
    /// and a string consisting of a single NUL byte.
    #[test]
    fn test_from_non_option_string() {
        let nul = String::from_utf8(vec![0]).unwrap();
        let corpus = vec!["😅😅😅", "😍😍😍😍", "🥵🥵", nul.as_str()];
        let vector = StringVector::from(corpus);
        let serialized = serde_json::to_string(&vector.serialize_to_json().unwrap()).unwrap();
        assert_eq!(r#"["😅😅😅","😍😍😍😍","🥵🥵","\u0000"]"#, serialized);

        let corpus = vec![
            "🀀🀀🀀".to_string(),
            "🀁🀁🀁".to_string(),
            "🀂🀂🀂".to_string(),
            "🀃🀃🀃".to_string(),
            "🀆🀆".to_string(),
        ];
        let vector = StringVector::from(corpus);
        let serialized = serde_json::to_string(&vector.serialize_to_json().unwrap()).unwrap();
        assert_eq!(r#"["🀀🀀🀀","🀁🀁🀁","🀂🀂🀂","🀃🀃🀃","🀆🀆"]"#, serialized);
    }

    /// `finish_cloned` must produce a vector while leaving the builder's contents intact.
    #[test]
    fn test_string_vector_builder_finish_cloned() {
        let mut builder = StringVectorBuilder::with_capacity(1024);
        builder.push(Some("1"));
        builder.push(Some("2"));
        builder.push(Some("3"));
        let vector = builder.finish_cloned();
        assert_eq!(vector.len(), 3);
        assert_eq!(
            r#"["1","2","3"]"#,
            serde_json::to_string(&vector.serialize_to_json().unwrap()).unwrap(),
        );
        // The builder still holds its three entries after `finish_cloned`.
        assert_eq!(builder.len(), 3);
    }
}