1use std::any::Any;
16use std::sync::Arc;
17
18use arrow::array::{Array, ArrayBuilder, ArrayIter, ArrayRef};
19use snafu::ResultExt;
20
21use crate::arrow_array::{
22 LargeStringArray, MutableLargeStringArray, MutableStringArray, StringArray,
23};
24use crate::data_type::ConcreteDataType;
25use crate::error::{self, Result};
26use crate::scalars::{ScalarVector, ScalarVectorBuilder};
27use crate::serialize::Serializable;
28use crate::value::{Value, ValueRef};
29use crate::vectors::{self, MutableVector, Validity, Vector, VectorRef};
30
/// Internal storage for a [`StringVector`]: the data is held in exactly one of
/// the two arrow string array flavours.
#[derive(Debug, PartialEq)]
enum StringArrayData {
    /// Data held in a `StringArray`.
    String(StringArray),
    /// Data held in a `LargeStringArray`.
    LargeString(LargeStringArray),
}
37
/// Vector of nullable strings, backed by either a `StringArray` or a
/// `LargeStringArray`.
#[derive(Debug, PartialEq)]
pub struct StringVector {
    /// The wrapped arrow array, tagged by which flavour it is.
    array: StringArrayData,
}
43
44impl StringVector {
45 pub(crate) fn as_arrow(&self) -> &dyn Array {
46 match &self.array {
47 StringArrayData::String(array) => array,
48 StringArrayData::LargeString(array) => array,
49 }
50 }
51
52 pub fn from_string_array(array: StringArray) -> Self {
54 Self {
55 array: StringArrayData::String(array),
56 }
57 }
58
59 pub fn from_large_string_array(array: LargeStringArray) -> Self {
61 Self {
62 array: StringArrayData::LargeString(array),
63 }
64 }
65
66 pub fn from_slice<T: AsRef<str>>(slice: &[T]) -> Self {
67 Self::from_string_array(StringArray::from_iter(
68 slice.iter().map(|s| Some(s.as_ref())),
69 ))
70 }
71}
72
73impl From<StringArray> for StringVector {
74 fn from(array: StringArray) -> Self {
75 Self::from_string_array(array)
76 }
77}
78
79impl From<LargeStringArray> for StringVector {
80 fn from(array: LargeStringArray) -> Self {
81 Self::from_large_string_array(array)
82 }
83}
84
85impl From<Vec<Option<String>>> for StringVector {
86 fn from(data: Vec<Option<String>>) -> Self {
87 Self::from_string_array(StringArray::from_iter(data))
88 }
89}
90
91impl From<Vec<Option<&str>>> for StringVector {
92 fn from(data: Vec<Option<&str>>) -> Self {
93 Self::from_string_array(StringArray::from_iter(data))
94 }
95}
96
97impl From<&[Option<String>]> for StringVector {
98 fn from(data: &[Option<String>]) -> Self {
99 Self::from_string_array(StringArray::from_iter(data))
100 }
101}
102
103impl From<&[Option<&str>]> for StringVector {
104 fn from(data: &[Option<&str>]) -> Self {
105 Self::from_string_array(StringArray::from_iter(data))
106 }
107}
108
109impl From<Vec<String>> for StringVector {
110 fn from(data: Vec<String>) -> Self {
111 Self::from_string_array(StringArray::from_iter(data.into_iter().map(Some)))
112 }
113}
114
115impl From<Vec<&str>> for StringVector {
116 fn from(data: Vec<&str>) -> Self {
117 Self::from_string_array(StringArray::from_iter(data.into_iter().map(Some)))
118 }
119}
120
121impl Vector for StringVector {
122 fn data_type(&self) -> ConcreteDataType {
123 ConcreteDataType::string_datatype()
124 }
125
126 fn vector_type_name(&self) -> String {
127 "StringVector".to_string()
128 }
129
130 fn as_any(&self) -> &dyn Any {
131 self
132 }
133
134 fn len(&self) -> usize {
135 match &self.array {
136 StringArrayData::String(array) => array.len(),
137 StringArrayData::LargeString(array) => array.len(),
138 }
139 }
140
141 fn to_arrow_array(&self) -> ArrayRef {
142 match &self.array {
143 StringArrayData::String(array) => Arc::new(array.clone()),
144 StringArrayData::LargeString(array) => Arc::new(array.clone()),
145 }
146 }
147
148 fn to_boxed_arrow_array(&self) -> Box<dyn Array> {
149 match &self.array {
150 StringArrayData::String(array) => Box::new(array.clone()),
151 StringArrayData::LargeString(array) => Box::new(array.clone()),
152 }
153 }
154
155 fn validity(&self) -> Validity {
156 match &self.array {
157 StringArrayData::String(array) => vectors::impl_validity_for_vector!(array),
158 StringArrayData::LargeString(array) => vectors::impl_validity_for_vector!(array),
159 }
160 }
161
162 fn memory_size(&self) -> usize {
163 match &self.array {
164 StringArrayData::String(array) => array.get_buffer_memory_size(),
165 StringArrayData::LargeString(array) => array.get_buffer_memory_size(),
166 }
167 }
168
169 fn null_count(&self) -> usize {
170 match &self.array {
171 StringArrayData::String(array) => array.null_count(),
172 StringArrayData::LargeString(array) => array.null_count(),
173 }
174 }
175
176 fn is_null(&self, row: usize) -> bool {
177 match &self.array {
178 StringArrayData::String(array) => array.is_null(row),
179 StringArrayData::LargeString(array) => array.is_null(row),
180 }
181 }
182
183 fn slice(&self, offset: usize, length: usize) -> VectorRef {
184 match &self.array {
185 StringArrayData::String(array) => {
186 Arc::new(Self::from_string_array(array.slice(offset, length)))
187 }
188 StringArrayData::LargeString(array) => {
189 Arc::new(Self::from_large_string_array(array.slice(offset, length)))
190 }
191 }
192 }
193
194 fn get(&self, index: usize) -> Value {
195 match &self.array {
196 StringArrayData::String(array) => vectors::impl_get_for_vector!(array, index),
197 StringArrayData::LargeString(array) => vectors::impl_get_for_vector!(array, index),
198 }
199 }
200
201 fn get_ref(&self, index: usize) -> ValueRef<'_> {
202 match &self.array {
203 StringArrayData::String(array) => vectors::impl_get_ref_for_vector!(array, index),
204 StringArrayData::LargeString(array) => vectors::impl_get_ref_for_vector!(array, index),
205 }
206 }
207}
208
/// Iterator over the elements of a [`StringVector`], wrapping the arrow
/// iterator that matches the vector's storage flavour.
pub enum StringIter<'a> {
    /// Iterates a `StringArray`.
    String(ArrayIter<&'a StringArray>),
    /// Iterates a `LargeStringArray`.
    LargeString(ArrayIter<&'a LargeStringArray>),
}
213
214impl<'a> Iterator for StringIter<'a> {
215 type Item = Option<&'a str>;
216
217 fn next(&mut self) -> Option<Self::Item> {
218 match self {
219 StringIter::String(iter) => iter.next(),
220 StringIter::LargeString(iter) => iter.next(),
221 }
222 }
223}
224
225impl ScalarVector for StringVector {
226 type OwnedItem = String;
227 type RefItem<'a> = &'a str;
228 type Iter<'a> = StringIter<'a>;
229 type Builder = StringVectorBuilder;
230
231 fn get_data(&self, idx: usize) -> Option<Self::RefItem<'_>> {
232 match &self.array {
233 StringArrayData::String(array) => {
234 if array.is_valid(idx) {
235 Some(array.value(idx))
236 } else {
237 None
238 }
239 }
240 StringArrayData::LargeString(array) => {
241 if array.is_valid(idx) {
242 Some(array.value(idx))
243 } else {
244 None
245 }
246 }
247 }
248 }
249
250 fn iter_data(&self) -> Self::Iter<'_> {
251 match &self.array {
252 StringArrayData::String(array) => StringIter::String(array.iter()),
253 StringArrayData::LargeString(array) => StringIter::LargeString(array.iter()),
254 }
255 }
256}
257
/// Internal storage for a [`StringVectorBuilder`]: the in-progress data lives
/// in exactly one of the two mutable arrow string array flavours.
enum MutableStringArrayData {
    /// Builder that finishes into a `StringArray`.
    String(MutableStringArray),
    /// Builder that finishes into a `LargeStringArray`.
    LargeString(MutableLargeStringArray),
}
263
/// Builder that accumulates optional strings and produces a [`StringVector`].
pub struct StringVectorBuilder {
    /// The wrapped mutable arrow array, tagged by which flavour it is.
    mutable_array: MutableStringArrayData,
}
267
268impl Default for StringVectorBuilder {
269 fn default() -> Self {
270 Self::new()
271 }
272}
273
274impl StringVectorBuilder {
275 pub fn new() -> Self {
277 Self {
278 mutable_array: MutableStringArrayData::String(MutableStringArray::new()),
279 }
280 }
281
282 pub fn new_large() -> Self {
284 Self {
285 mutable_array: MutableStringArrayData::LargeString(MutableLargeStringArray::new()),
286 }
287 }
288
289 pub fn with_string_capacity(capacity: usize) -> Self {
291 Self {
292 mutable_array: MutableStringArrayData::String(MutableStringArray::with_capacity(
293 capacity, 0,
294 )),
295 }
296 }
297
298 pub fn with_large_capacity(capacity: usize) -> Self {
300 Self {
301 mutable_array: MutableStringArrayData::LargeString(
302 MutableLargeStringArray::with_capacity(capacity, 0),
303 ),
304 }
305 }
306}
307
308impl MutableVector for StringVectorBuilder {
309 fn data_type(&self) -> ConcreteDataType {
310 ConcreteDataType::string_datatype()
311 }
312
313 fn len(&self) -> usize {
314 match &self.mutable_array {
315 MutableStringArrayData::String(array) => array.len(),
316 MutableStringArrayData::LargeString(array) => array.len(),
317 }
318 }
319
320 fn as_any(&self) -> &dyn Any {
321 self
322 }
323
324 fn as_mut_any(&mut self) -> &mut dyn Any {
325 self
326 }
327
328 fn to_vector(&mut self) -> VectorRef {
329 Arc::new(self.finish())
330 }
331
332 fn to_vector_cloned(&self) -> VectorRef {
333 Arc::new(self.finish_cloned())
334 }
335 fn try_push_value_ref(&mut self, value: &ValueRef) -> Result<()> {
336 match value.try_into_string()? {
337 Some(v) => match &mut self.mutable_array {
338 MutableStringArrayData::String(array) => array.append_value(v),
339 MutableStringArrayData::LargeString(array) => array.append_value(v),
340 },
341 None => match &mut self.mutable_array {
342 MutableStringArrayData::String(array) => array.append_null(),
343 MutableStringArrayData::LargeString(array) => array.append_null(),
344 },
345 }
346 Ok(())
347 }
348
349 fn extend_slice_of(&mut self, vector: &dyn Vector, offset: usize, length: usize) -> Result<()> {
350 vectors::impl_extend_for_builder!(self, vector, StringVector, offset, length)
351 }
352
353 fn push_null(&mut self) {
354 match &mut self.mutable_array {
355 MutableStringArrayData::String(array) => array.append_null(),
356 MutableStringArrayData::LargeString(array) => array.append_null(),
357 }
358 }
359}
360
361impl ScalarVectorBuilder for StringVectorBuilder {
362 type VectorType = StringVector;
363
364 fn with_capacity(capacity: usize) -> Self {
365 Self {
366 mutable_array: MutableStringArrayData::String(MutableStringArray::with_capacity(
367 capacity, 0,
368 )),
369 }
370 }
371
372 fn push(&mut self, value: Option<<Self::VectorType as ScalarVector>::RefItem<'_>>) {
373 match value {
374 Some(v) => match &mut self.mutable_array {
375 MutableStringArrayData::String(array) => array.append_value(v),
376 MutableStringArrayData::LargeString(array) => array.append_value(v),
377 },
378 None => match &mut self.mutable_array {
379 MutableStringArrayData::String(array) => array.append_null(),
380 MutableStringArrayData::LargeString(array) => array.append_null(),
381 },
382 }
383 }
384
385 fn finish(&mut self) -> Self::VectorType {
386 match &mut self.mutable_array {
387 MutableStringArrayData::String(array) => {
388 StringVector::from_string_array(array.finish())
389 }
390 MutableStringArrayData::LargeString(array) => {
391 StringVector::from_large_string_array(array.finish())
392 }
393 }
394 }
395
396 fn finish_cloned(&self) -> Self::VectorType {
397 match &self.mutable_array {
398 MutableStringArrayData::String(array) => {
399 StringVector::from_string_array(array.finish_cloned())
400 }
401 MutableStringArrayData::LargeString(array) => {
402 StringVector::from_large_string_array(array.finish_cloned())
403 }
404 }
405 }
406}
407
408impl Serializable for StringVector {
409 fn serialize_to_json(&self) -> Result<Vec<serde_json::Value>> {
410 self.iter_data()
411 .map(serde_json::to_value)
412 .collect::<serde_json::Result<_>>()
413 .context(error::SerializeSnafu)
414 }
415}
416
417impl StringVector {
418 pub fn try_from_arrow_array(
419 array: impl AsRef<dyn Array>,
420 ) -> crate::error::Result<StringVector> {
421 let array = array.as_ref();
422
423 if let Some(string_array) = array.as_any().downcast_ref::<StringArray>() {
424 Ok(StringVector::from_string_array(string_array.clone()))
425 } else if let Some(large_string_array) = array.as_any().downcast_ref::<LargeStringArray>() {
426 Ok(StringVector::from_large_string_array(
427 large_string_array.clone(),
428 ))
429 } else {
430 Err(crate::error::UnsupportedArrowTypeSnafu {
431 arrow_type: array.data_type().clone(),
432 }
433 .build())
434 }
435 }
436}
437
// Unit tests for `StringVector` and `StringVectorBuilder`.
#[cfg(test)]
mod tests {

    use std::vec;

    use arrow::datatypes::DataType;

    use super::*;

    // Builds a three-element vector with a null in the middle and reads it
    // back through `get_data`, `try_get`, `get` and `iter_data`.
    #[test]
    fn test_string_vector_build_get() {
        let mut builder = StringVectorBuilder::with_capacity(4);
        builder.push(Some("hello"));
        builder.push(None);
        builder.push(Some("world"));
        let vector = builder.finish();

        assert_eq!(Some("hello"), vector.get_data(0));
        assert_eq!(None, vector.get_data(1));
        assert_eq!(Some("world"), vector.get_data(2));

        // Only three elements were pushed, so index 3 must be rejected.
        assert!(vector.try_get(3).is_err());

        assert_eq!(Value::String("hello".into()), vector.get(0));
        assert_eq!(Value::Null, vector.get(1));
        assert_eq!(Value::String("world".into()), vector.get(2));

        let mut iter = vector.iter_data();
        assert_eq!("hello", iter.next().unwrap().unwrap());
        assert_eq!(None, iter.next().unwrap());
        assert_eq!("world", iter.next().unwrap().unwrap());
        // The iterator is exhausted after the third element.
        assert_eq!(None, iter.next());
    }

    // Exercises the `MutableVector` entry points: pushing value refs,
    // rejecting a non-string value, and extending from another vector.
    #[test]
    fn test_string_vector_builder() {
        let mut builder = StringVectorBuilder::with_capacity(3);
        builder.push_value_ref(&ValueRef::String("hello"));
        // An integer value ref is not accepted by a string builder.
        assert!(builder.try_push_value_ref(&ValueRef::Int32(123)).is_err());

        let input = StringVector::from_slice(&["world", "one", "two"]);
        // Copies the two elements starting at offset 1 ("one", "two").
        builder.extend_slice_of(&input, 1, 2).unwrap();
        // Extending from a vector of a different type must fail.
        assert!(
            builder
                .extend_slice_of(&crate::vectors::Int32Vector::from_slice([13]), 0, 1)
                .is_err()
        );
        let vector = builder.to_vector();

        // The failed operations above must not have added any elements.
        let expect: VectorRef = Arc::new(StringVector::from_slice(&["hello", "one", "two"]));
        assert_eq!(expect, vector);
    }

    // Checks metadata accessors and the conversion to an arrow array.
    #[test]
    fn test_string_vector_misc() {
        let strs = vec!["hello", "greptime", "rust"];
        let v = StringVector::from(strs.clone());
        assert_eq!(3, v.len());
        assert_eq!("StringVector", v.vector_type_name());
        assert!(!v.is_const());
        assert!(v.validity().is_all_valid());
        assert!(!v.only_null());
        // NOTE(review): 1040 presumably reflects arrow's buffer allocation
        // sizes for this input; confirm if the arrow version changes.
        assert_eq!(1040, v.memory_size());

        for (i, s) in strs.iter().enumerate() {
            assert_eq!(Value::from(*s), v.get(i));
            assert_eq!(ValueRef::from(*s), v.get_ref(i));
            assert_eq!(Value::from(*s), v.try_get(i).unwrap());
        }

        // A vector built from `Vec<&str>` is exposed as an arrow `Utf8` array.
        let arrow_arr = v.to_arrow_array();
        assert_eq!(3, arrow_arr.len());
        assert_eq!(&DataType::Utf8, arrow_arr.data_type());
    }

    // Nulls must serialize as JSON `null`.
    #[test]
    fn test_serialize_string_vector() {
        let mut builder = StringVectorBuilder::with_capacity(3);
        builder.push(Some("hello"));
        builder.push(None);
        builder.push(Some("world"));
        let string_vector = builder.finish();
        let serialized =
            serde_json::to_string(&string_vector.serialize_to_json().unwrap()).unwrap();
        assert_eq!(r#"["hello",null,"world"]"#, serialized);
    }

    // Converts an arrow `StringArray` (including a null) via `From`.
    #[test]
    fn test_from_arrow_array() {
        let mut builder = MutableStringArray::new();
        builder.append_option(Some("A"));
        builder.append_option(Some("B"));
        builder.append_null();
        builder.append_option(Some("D"));
        let string_array: StringArray = builder.finish();
        let vector = StringVector::from(string_array);
        assert_eq!(
            r#"["A","B",null,"D"]"#,
            serde_json::to_string(&vector.serialize_to_json().unwrap()).unwrap(),
        );
    }

    // Builds vectors from non-optional `&str` and `String` inputs, covering
    // multi-byte characters and a string holding a single NUL byte.
    #[test]
    fn test_from_non_option_string() {
        let nul = String::from_utf8(vec![0]).unwrap();
        let corpus = vec!["😅😅😅", "😍😍😍😍", "🥵🥵", nul.as_str()];
        let vector = StringVector::from(corpus);
        let serialized = serde_json::to_string(&vector.serialize_to_json().unwrap()).unwrap();
        assert_eq!(r#"["😅😅😅","😍😍😍😍","🥵🥵","\u0000"]"#, serialized);

        let corpus = vec![
            "🀀🀀🀀".to_string(),
            "🀁🀁🀁".to_string(),
            "🀂🀂🀂".to_string(),
            "🀃🀃🀃".to_string(),
            "🀆🀆".to_string(),
        ];
        let vector = StringVector::from(corpus);
        let serialized = serde_json::to_string(&vector.serialize_to_json().unwrap()).unwrap();
        assert_eq!(r#"["🀀🀀🀀","🀁🀁🀁","🀂🀂🀂","🀃🀃🀃","🀆🀆"]"#, serialized);
    }

    // `finish_cloned` must return the current contents and leave the builder intact.
    #[test]
    fn test_string_vector_builder_finish_cloned() {
        let mut builder = StringVectorBuilder::with_capacity(1024);
        builder.push(Some("1"));
        builder.push(Some("2"));
        builder.push(Some("3"));
        let vector = builder.finish_cloned();
        assert_eq!(vector.len(), 3);
        assert_eq!(
            r#"["1","2","3"]"#,
            serde_json::to_string(&vector.serialize_to_json().unwrap()).unwrap(),
        );
        // The builder still holds its three elements after the snapshot.
        assert_eq!(builder.len(), 3);
    }
}