datatypes/vectors/
string.rs

1// Copyright 2023 Greptime Team
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15use std::any::Any;
16use std::sync::Arc;
17
18use arrow::array::{Array, ArrayBuilder, ArrayIter, ArrayRef};
19use snafu::ResultExt;
20
21use crate::arrow_array::{MutableStringArray, StringArray};
22use crate::data_type::ConcreteDataType;
23use crate::error::{self, Result};
24use crate::scalars::{ScalarVector, ScalarVectorBuilder};
25use crate::serialize::Serializable;
26use crate::value::{Value, ValueRef};
27use crate::vectors::{self, MutableVector, Validity, Vector, VectorRef};
28
29/// Vector of strings.
30#[derive(Debug, PartialEq)]
31pub struct StringVector {
32    array: StringArray,
33}
34
35impl StringVector {
36    pub(crate) fn as_arrow(&self) -> &dyn Array {
37        &self.array
38    }
39}
40
41impl From<StringArray> for StringVector {
42    fn from(array: StringArray) -> Self {
43        Self { array }
44    }
45}
46
47impl From<Vec<Option<String>>> for StringVector {
48    fn from(data: Vec<Option<String>>) -> Self {
49        Self {
50            array: StringArray::from_iter(data),
51        }
52    }
53}
54
55impl From<Vec<Option<&str>>> for StringVector {
56    fn from(data: Vec<Option<&str>>) -> Self {
57        Self {
58            array: StringArray::from_iter(data),
59        }
60    }
61}
62
63impl From<&[Option<String>]> for StringVector {
64    fn from(data: &[Option<String>]) -> Self {
65        Self {
66            array: StringArray::from_iter(data),
67        }
68    }
69}
70
71impl From<&[Option<&str>]> for StringVector {
72    fn from(data: &[Option<&str>]) -> Self {
73        Self {
74            array: StringArray::from_iter(data),
75        }
76    }
77}
78
79impl From<Vec<String>> for StringVector {
80    fn from(data: Vec<String>) -> Self {
81        Self {
82            array: StringArray::from_iter(data.into_iter().map(Some)),
83        }
84    }
85}
86
87impl From<Vec<&str>> for StringVector {
88    fn from(data: Vec<&str>) -> Self {
89        Self {
90            array: StringArray::from_iter(data.into_iter().map(Some)),
91        }
92    }
93}
94
95impl Vector for StringVector {
96    fn data_type(&self) -> ConcreteDataType {
97        ConcreteDataType::string_datatype()
98    }
99
100    fn vector_type_name(&self) -> String {
101        "StringVector".to_string()
102    }
103
104    fn as_any(&self) -> &dyn Any {
105        self
106    }
107
108    fn len(&self) -> usize {
109        self.array.len()
110    }
111
112    fn to_arrow_array(&self) -> ArrayRef {
113        Arc::new(self.array.clone())
114    }
115
116    fn to_boxed_arrow_array(&self) -> Box<dyn Array> {
117        Box::new(self.array.clone())
118    }
119
120    fn validity(&self) -> Validity {
121        vectors::impl_validity_for_vector!(self.array)
122    }
123
124    fn memory_size(&self) -> usize {
125        self.array.get_buffer_memory_size()
126    }
127
128    fn null_count(&self) -> usize {
129        self.array.null_count()
130    }
131
132    fn is_null(&self, row: usize) -> bool {
133        self.array.is_null(row)
134    }
135
136    fn slice(&self, offset: usize, length: usize) -> VectorRef {
137        Arc::new(Self::from(self.array.slice(offset, length)))
138    }
139
140    fn get(&self, index: usize) -> Value {
141        vectors::impl_get_for_vector!(self.array, index)
142    }
143
144    fn get_ref(&self, index: usize) -> ValueRef {
145        vectors::impl_get_ref_for_vector!(self.array, index)
146    }
147}
148
149impl ScalarVector for StringVector {
150    type OwnedItem = String;
151    type RefItem<'a> = &'a str;
152    type Iter<'a> = ArrayIter<&'a StringArray>;
153    type Builder = StringVectorBuilder;
154
155    fn get_data(&self, idx: usize) -> Option<Self::RefItem<'_>> {
156        if self.array.is_valid(idx) {
157            Some(self.array.value(idx))
158        } else {
159            None
160        }
161    }
162
163    fn iter_data(&self) -> Self::Iter<'_> {
164        self.array.iter()
165    }
166}
167
168pub struct StringVectorBuilder {
169    pub mutable_array: MutableStringArray,
170}
171
172impl MutableVector for StringVectorBuilder {
173    fn data_type(&self) -> ConcreteDataType {
174        ConcreteDataType::string_datatype()
175    }
176
177    fn len(&self) -> usize {
178        self.mutable_array.len()
179    }
180
181    fn as_any(&self) -> &dyn Any {
182        self
183    }
184
185    fn as_mut_any(&mut self) -> &mut dyn Any {
186        self
187    }
188
189    fn to_vector(&mut self) -> VectorRef {
190        Arc::new(self.finish())
191    }
192
193    fn to_vector_cloned(&self) -> VectorRef {
194        Arc::new(self.finish_cloned())
195    }
196
197    fn try_push_value_ref(&mut self, value: ValueRef) -> Result<()> {
198        match value.as_string()? {
199            Some(v) => self.mutable_array.append_value(v),
200            None => self.mutable_array.append_null(),
201        }
202        Ok(())
203    }
204
205    fn extend_slice_of(&mut self, vector: &dyn Vector, offset: usize, length: usize) -> Result<()> {
206        vectors::impl_extend_for_builder!(self, vector, StringVector, offset, length)
207    }
208
209    fn push_null(&mut self) {
210        self.mutable_array.append_null()
211    }
212}
213
214impl ScalarVectorBuilder for StringVectorBuilder {
215    type VectorType = StringVector;
216
217    fn with_capacity(capacity: usize) -> Self {
218        Self {
219            mutable_array: MutableStringArray::with_capacity(capacity, 0),
220        }
221    }
222
223    fn push(&mut self, value: Option<<Self::VectorType as ScalarVector>::RefItem<'_>>) {
224        match value {
225            Some(v) => self.mutable_array.append_value(v),
226            None => self.mutable_array.append_null(),
227        }
228    }
229
230    fn finish(&mut self) -> Self::VectorType {
231        StringVector {
232            array: self.mutable_array.finish(),
233        }
234    }
235
236    fn finish_cloned(&self) -> Self::VectorType {
237        StringVector {
238            array: self.mutable_array.finish_cloned(),
239        }
240    }
241}
242
243impl Serializable for StringVector {
244    fn serialize_to_json(&self) -> Result<Vec<serde_json::Value>> {
245        self.iter_data()
246            .map(serde_json::to_value)
247            .collect::<serde_json::Result<_>>()
248            .context(error::SerializeSnafu)
249    }
250}
251
252vectors::impl_try_from_arrow_array_for_vector!(StringArray, StringVector);
253
#[cfg(test)]
mod tests {

    use std::vec;

    use arrow::datatypes::DataType;

    use super::*;

    /// Values pushed through the builder can be read back via `get_data`,
    /// `get` and `iter_data`.
    #[test]
    fn test_string_vector_build_get() {
        let mut builder = StringVectorBuilder::with_capacity(4);
        for item in [Some("hello"), None, Some("world")] {
            builder.push(item);
        }
        let vector = builder.finish();

        assert_eq!(Some("hello"), vector.get_data(0));
        assert_eq!(None, vector.get_data(1));
        assert_eq!(Some("world"), vector.get_data(2));

        // Index 3 is one past the last row, so the fallible getter must fail.
        assert!(vector.try_get(3).is_err());

        assert_eq!(Value::String("hello".into()), vector.get(0));
        assert_eq!(Value::Null, vector.get(1));
        assert_eq!(Value::String("world".into()), vector.get(2));

        let mut rows = vector.iter_data();
        assert_eq!("hello", rows.next().unwrap().unwrap());
        assert_eq!(None, rows.next().unwrap());
        assert_eq!("world", rows.next().unwrap().unwrap());
        assert_eq!(None, rows.next());
    }

    /// The type-erased builder API rejects non-string input and can extend
    /// itself from a slice of another string vector.
    #[test]
    fn test_string_vector_builder() {
        let mut builder = StringVectorBuilder::with_capacity(3);
        builder.push_value_ref(ValueRef::String("hello"));
        // A non-string value must be rejected.
        assert!(builder.try_push_value_ref(ValueRef::Int32(123)).is_err());

        let source = StringVector::from_slice(&["world", "one", "two"]);
        builder.extend_slice_of(&source, 1, 2).unwrap();
        // Extending from a vector of another type must be rejected as well.
        assert!(builder
            .extend_slice_of(&crate::vectors::Int32Vector::from_slice([13]), 0, 1)
            .is_err());
        let actual = builder.to_vector();

        let expected: VectorRef = Arc::new(StringVector::from_slice(&["hello", "one", "two"]));
        assert_eq!(expected, actual);
    }

    /// Miscellaneous `Vector` accessors on a vector without nulls.
    #[test]
    fn test_string_vector_misc() {
        let strs = vec!["hello", "greptime", "rust"];
        let v = StringVector::from(strs.clone());
        assert_eq!(3, v.len());
        assert_eq!("StringVector", v.vector_type_name());
        assert!(!v.is_const());
        assert!(v.validity().is_all_valid());
        assert!(!v.only_null());
        assert_eq!(1088, v.memory_size());

        for (i, &s) in strs.iter().enumerate() {
            assert_eq!(Value::from(s), v.get(i));
            assert_eq!(ValueRef::from(s), v.get_ref(i));
            assert_eq!(Value::from(s), v.try_get(i).unwrap());
        }

        let arrow_arr = v.to_arrow_array();
        assert_eq!(3, arrow_arr.len());
        assert_eq!(&DataType::Utf8, arrow_arr.data_type());
    }

    /// Strings serialize to JSON strings and nulls to JSON `null`.
    #[test]
    fn test_serialize_string_vector() {
        let mut builder = StringVectorBuilder::with_capacity(3);
        for item in [Some("hello"), None, Some("world")] {
            builder.push(item);
        }
        let string_vector = builder.finish();
        let json_values = string_vector.serialize_to_json().unwrap();
        let serialized = serde_json::to_string(&json_values).unwrap();
        assert_eq!(r#"["hello",null,"world"]"#, serialized);
    }

    /// A vector created from an arrow array keeps the array's values and nulls.
    #[test]
    fn test_from_arrow_array() {
        let mut arrow_builder = MutableStringArray::new();
        arrow_builder.append_option(Some("A"));
        arrow_builder.append_option(Some("B"));
        arrow_builder.append_null();
        arrow_builder.append_option(Some("D"));
        let string_array: StringArray = arrow_builder.finish();
        let vector = StringVector::from(string_array);
        let json_values = vector.serialize_to_json().unwrap();
        assert_eq!(
            r#"["A","B",null,"D"]"#,
            serde_json::to_string(&json_values).unwrap(),
        );
    }

    /// Non-optional inputs (`Vec<&str>` and `Vec<String>`), including multi-byte
    /// characters and a NUL character, round-trip through JSON serialization.
    #[test]
    fn test_from_non_option_string() {
        let nul = String::from_utf8(vec![0]).unwrap();
        let corpus = vec!["😅😅😅", "😍😍😍😍", "🥵🥵", nul.as_str()];
        let vector = StringVector::from(corpus);
        let serialized = serde_json::to_string(&vector.serialize_to_json().unwrap()).unwrap();
        assert_eq!(r#"["😅😅😅","😍😍😍😍","🥵🥵","\u0000"]"#, serialized);

        let corpus: Vec<String> = ["🀀🀀🀀", "🀁🀁🀁", "🀂🀂🀂", "🀃🀃🀃", "🀆🀆"]
            .iter()
            .map(|tiles| tiles.to_string())
            .collect();
        let vector = StringVector::from(corpus);
        let serialized = serde_json::to_string(&vector.serialize_to_json().unwrap()).unwrap();
        assert_eq!(r#"["🀀🀀🀀","🀁🀁🀁","🀂🀂🀂","🀃🀃🀃","🀆🀆"]"#, serialized);
    }

    /// `finish_cloned` yields the current contents and leaves the builder untouched.
    #[test]
    fn test_string_vector_builder_finish_cloned() {
        let mut builder = StringVectorBuilder::with_capacity(1024);
        for digit in ["1", "2", "3"] {
            builder.push(Some(digit));
        }
        let vector = builder.finish_cloned();
        assert_eq!(vector.len(), 3);
        assert_eq!(
            r#"["1","2","3"]"#,
            serde_json::to_string(&vector.serialize_to_json().unwrap()).unwrap(),
        );
        // The builder still holds all three elements after the cloned finish.
        assert_eq!(builder.len(), 3);
    }
}