Thanks to visit codestin.com
Credit goes to docs.rs

arrow2/array/binary/
mod.rs

1use crate::{
2    bitmap::{
3        utils::{BitmapIter, ZipValidity},
4        Bitmap,
5    },
6    buffer::Buffer,
7    datatypes::DataType,
8    error::Error,
9    offset::{Offset, Offsets, OffsetsBuffer},
10    trusted_len::TrustedLen,
11};
12
13use either::Either;
14
15use super::{specification::try_check_offsets_bounds, Array, GenericBinaryArray};
16
17mod ffi;
18pub(super) mod fmt;
19mod iterator;
20pub use iterator::*;
21mod from;
22mod mutable_values;
23pub use mutable_values::*;
24mod mutable;
25pub use mutable::*;
26
27#[cfg(feature = "arrow")]
28mod data;
29
30/// A [`BinaryArray`] is Arrow's semantically equivalent of an immutable `Vec<Option<Vec<u8>>>`.
31/// It implements [`Array`].
32///
33/// The size of this struct is `O(1)`, as all data is stored behind an [`std::sync::Arc`].
34/// # Example
35/// ```
36/// use arrow2::array::BinaryArray;
37/// use arrow2::bitmap::Bitmap;
38/// use arrow2::buffer::Buffer;
39///
40/// let array = BinaryArray::<i32>::from([Some([1, 2].as_ref()), None, Some([3].as_ref())]);
41/// assert_eq!(array.value(0), &[1, 2]);
42/// assert_eq!(array.iter().collect::<Vec<_>>(), vec![Some([1, 2].as_ref()), None, Some([3].as_ref())]);
43/// assert_eq!(array.values_iter().collect::<Vec<_>>(), vec![[1, 2].as_ref(), &[], &[3]]);
44/// // the underlying representation:
45/// assert_eq!(array.values(), &Buffer::from(vec![1, 2, 3]));
46/// assert_eq!(array.offsets().buffer(), &Buffer::from(vec![0, 2, 2, 3]));
47/// assert_eq!(array.validity(), Some(&Bitmap::from([true, false, true])));
48/// ```
49///
50/// # Generic parameter
51/// The generic parameter [`Offset`] can only be `i32` or `i64` and tradeoffs maximum array length with
52/// memory usage:
53/// * the sum of lengths of all elements cannot exceed `Offset::MAX`
54/// * the total size of the underlying data is `array.len() * size_of::<Offset>() + sum of lengths of all elements`
55///
56/// # Safety
57/// The following invariants hold:
58/// * Two consecutives `offsets` casted (`as`) to `usize` are valid slices of `values`.
59/// * `len` is equal to `validity.len()`, when defined.
60#[derive(Clone)]
61pub struct BinaryArray<O: Offset> {
62    data_type: DataType,
63    offsets: OffsetsBuffer<O>,
64    values: Buffer<u8>,
65    validity: Option<Bitmap>,
66}
67
68impl<O: Offset> BinaryArray<O> {
69    /// Returns a [`BinaryArray`] created from its internal representation.
70    ///
71    /// # Errors
72    /// This function returns an error iff:
73    /// * The last offset is not equal to the values' length.
74    /// * the validity's length is not equal to `offsets.len()`.
75    /// * The `data_type`'s [`crate::datatypes::PhysicalType`] is not equal to either `Binary` or `LargeBinary`.
76    /// # Implementation
77    /// This function is `O(1)`
78    pub fn try_new(
79        data_type: DataType,
80        offsets: OffsetsBuffer<O>,
81        values: Buffer<u8>,
82        validity: Option<Bitmap>,
83    ) -> Result<Self, Error> {
84        try_check_offsets_bounds(&offsets, values.len())?;
85
86        if validity
87            .as_ref()
88            .map_or(false, |validity| validity.len() != offsets.len_proxy())
89        {
90            return Err(Error::oos(
91                "validity mask length must match the number of values",
92            ));
93        }
94
95        if data_type.to_physical_type() != Self::default_data_type().to_physical_type() {
96            return Err(Error::oos(
97                "BinaryArray can only be initialized with DataType::Binary or DataType::LargeBinary",
98            ));
99        }
100
101        Ok(Self {
102            data_type,
103            offsets,
104            values,
105            validity,
106        })
107    }
108
109    /// Creates a new [`BinaryArray`] from slices of `&[u8]`.
110    pub fn from_slice<T: AsRef<[u8]>, P: AsRef<[T]>>(slice: P) -> Self {
111        Self::from_trusted_len_values_iter(slice.as_ref().iter())
112    }
113
114    /// Creates a new [`BinaryArray`] from a slice of optional `&[u8]`.
115    // Note: this can't be `impl From` because Rust does not allow double `AsRef` on it.
116    pub fn from<T: AsRef<[u8]>, P: AsRef<[Option<T>]>>(slice: P) -> Self {
117        MutableBinaryArray::<O>::from(slice).into()
118    }
119
120    /// Returns an iterator of `Option<&[u8]>` over every element of this array.
121    pub fn iter(&self) -> ZipValidity<&[u8], BinaryValueIter<O>, BitmapIter> {
122        ZipValidity::new_with_validity(self.values_iter(), self.validity.as_ref())
123    }
124
125    /// Returns an iterator of `&[u8]` over every element of this array, ignoring the validity
126    pub fn values_iter(&self) -> BinaryValueIter<O> {
127        BinaryValueIter::new(self)
128    }
129
130    /// Returns the length of this array
131    #[inline]
132    pub fn len(&self) -> usize {
133        self.offsets.len_proxy()
134    }
135
136    /// Returns the element at index `i`
137    /// # Panics
138    /// iff `i >= self.len()`
139    #[inline]
140    pub fn value(&self, i: usize) -> &[u8] {
141        assert!(i < self.len());
142        unsafe { self.value_unchecked(i) }
143    }
144
145    /// Returns the element at index `i`
146    /// # Safety
147    /// Assumes that the `i < self.len`.
148    #[inline]
149    pub unsafe fn value_unchecked(&self, i: usize) -> &[u8] {
150        // soundness: the invariant of the function
151        let (start, end) = self.offsets.start_end_unchecked(i);
152
153        // soundness: the invariant of the struct
154        self.values.get_unchecked(start..end)
155    }
156
157    /// Returns the element at index `i` or `None` if it is null
158    /// # Panics
159    /// iff `i >= self.len()`
160    #[inline]
161    pub fn get(&self, i: usize) -> Option<&[u8]> {
162        if !self.is_null(i) {
163            // soundness: Array::is_null panics if i >= self.len
164            unsafe { Some(self.value_unchecked(i)) }
165        } else {
166            None
167        }
168    }
169
170    /// Returns the [`DataType`] of this array.
171    #[inline]
172    pub fn data_type(&self) -> &DataType {
173        &self.data_type
174    }
175
176    /// Returns the values of this [`BinaryArray`].
177    #[inline]
178    pub fn values(&self) -> &Buffer<u8> {
179        &self.values
180    }
181
182    /// Returns the offsets of this [`BinaryArray`].
183    #[inline]
184    pub fn offsets(&self) -> &OffsetsBuffer<O> {
185        &self.offsets
186    }
187
188    /// The optional validity.
189    #[inline]
190    pub fn validity(&self) -> Option<&Bitmap> {
191        self.validity.as_ref()
192    }
193
194    /// Slices this [`BinaryArray`].
195    /// # Implementation
196    /// This function is `O(1)`.
197    /// # Panics
198    /// iff `offset + length > self.len()`.
199    pub fn slice(&mut self, offset: usize, length: usize) {
200        assert!(
201            offset + length <= self.len(),
202            "the offset of the new Buffer cannot exceed the existing length"
203        );
204        unsafe { self.slice_unchecked(offset, length) }
205    }
206
207    /// Slices this [`BinaryArray`].
208    /// # Implementation
209    /// This function is `O(1)`.
210    /// # Safety
211    /// The caller must ensure that `offset + length <= self.len()`.
212    pub unsafe fn slice_unchecked(&mut self, offset: usize, length: usize) {
213        self.validity.as_mut().and_then(|bitmap| {
214            bitmap.slice_unchecked(offset, length);
215            (bitmap.unset_bits() > 0).then(|| bitmap)
216        });
217        self.offsets.slice_unchecked(offset, length + 1);
218    }
219
220    impl_sliced!();
221    impl_mut_validity!();
222    impl_into_array!();
223
224    /// Returns its internal representation
225    #[must_use]
226    pub fn into_inner(self) -> (DataType, OffsetsBuffer<O>, Buffer<u8>, Option<Bitmap>) {
227        let Self {
228            data_type,
229            offsets,
230            values,
231            validity,
232        } = self;
233        (data_type, offsets, values, validity)
234    }
235
236    /// Try to convert this `BinaryArray` to a `MutableBinaryArray`
237    #[must_use]
238    pub fn into_mut(self) -> Either<Self, MutableBinaryArray<O>> {
239        use Either::*;
240        if let Some(bitmap) = self.validity {
241            match bitmap.into_mut() {
242                // Safety: invariants are preserved
243                Left(bitmap) => Left(BinaryArray::new(
244                    self.data_type,
245                    self.offsets,
246                    self.values,
247                    Some(bitmap),
248                )),
249                Right(mutable_bitmap) => match (self.values.into_mut(), self.offsets.into_mut()) {
250                    (Left(values), Left(offsets)) => Left(BinaryArray::new(
251                        self.data_type,
252                        offsets,
253                        values,
254                        Some(mutable_bitmap.into()),
255                    )),
256                    (Left(values), Right(offsets)) => Left(BinaryArray::new(
257                        self.data_type,
258                        offsets.into(),
259                        values,
260                        Some(mutable_bitmap.into()),
261                    )),
262                    (Right(values), Left(offsets)) => Left(BinaryArray::new(
263                        self.data_type,
264                        offsets,
265                        values.into(),
266                        Some(mutable_bitmap.into()),
267                    )),
268                    (Right(values), Right(offsets)) => Right(
269                        MutableBinaryArray::try_new(
270                            self.data_type,
271                            offsets,
272                            values,
273                            Some(mutable_bitmap),
274                        )
275                        .unwrap(),
276                    ),
277                },
278            }
279        } else {
280            match (self.values.into_mut(), self.offsets.into_mut()) {
281                (Left(values), Left(offsets)) => {
282                    Left(BinaryArray::new(self.data_type, offsets, values, None))
283                }
284                (Left(values), Right(offsets)) => Left(BinaryArray::new(
285                    self.data_type,
286                    offsets.into(),
287                    values,
288                    None,
289                )),
290                (Right(values), Left(offsets)) => Left(BinaryArray::new(
291                    self.data_type,
292                    offsets,
293                    values.into(),
294                    None,
295                )),
296                (Right(values), Right(offsets)) => Right(
297                    MutableBinaryArray::try_new(self.data_type, offsets, values, None).unwrap(),
298                ),
299            }
300        }
301    }
302
303    /// Creates an empty [`BinaryArray`], i.e. whose `.len` is zero.
304    pub fn new_empty(data_type: DataType) -> Self {
305        Self::new(data_type, OffsetsBuffer::new(), Buffer::new(), None)
306    }
307
308    /// Creates an null [`BinaryArray`], i.e. whose `.null_count() == .len()`.
309    #[inline]
310    pub fn new_null(data_type: DataType, length: usize) -> Self {
311        Self::new(
312            data_type,
313            Offsets::new_zeroed(length).into(),
314            Buffer::new(),
315            Some(Bitmap::new_zeroed(length)),
316        )
317    }
318
319    /// Returns the default [`DataType`], `DataType::Binary` or `DataType::LargeBinary`
320    pub fn default_data_type() -> DataType {
321        if O::IS_LARGE {
322            DataType::LargeBinary
323        } else {
324            DataType::Binary
325        }
326    }
327
328    /// Alias for unwrapping [`Self::try_new`]
329    pub fn new(
330        data_type: DataType,
331        offsets: OffsetsBuffer<O>,
332        values: Buffer<u8>,
333        validity: Option<Bitmap>,
334    ) -> Self {
335        Self::try_new(data_type, offsets, values, validity).unwrap()
336    }
337
338    /// Returns a [`BinaryArray`] from an iterator of trusted length.
339    ///
340    /// The [`BinaryArray`] is guaranteed to not have a validity
341    #[inline]
342    pub fn from_trusted_len_values_iter<T: AsRef<[u8]>, I: TrustedLen<Item = T>>(
343        iterator: I,
344    ) -> Self {
345        MutableBinaryArray::<O>::from_trusted_len_values_iter(iterator).into()
346    }
347
348    /// Returns a new [`BinaryArray`] from a [`Iterator`] of `&[u8]`.
349    ///
350    /// The [`BinaryArray`] is guaranteed to not have a validity
351    pub fn from_iter_values<T: AsRef<[u8]>, I: Iterator<Item = T>>(iterator: I) -> Self {
352        MutableBinaryArray::<O>::from_iter_values(iterator).into()
353    }
354
355    /// Creates a [`BinaryArray`] from an iterator of trusted length.
356    /// # Safety
357    /// The iterator must be [`TrustedLen`](https://doc.rust-lang.org/std/iter/trait.TrustedLen.html).
358    /// I.e. that `size_hint().1` correctly reports its length.
359    #[inline]
360    pub unsafe fn from_trusted_len_iter_unchecked<I, P>(iterator: I) -> Self
361    where
362        P: AsRef<[u8]>,
363        I: Iterator<Item = Option<P>>,
364    {
365        MutableBinaryArray::<O>::from_trusted_len_iter_unchecked(iterator).into()
366    }
367
368    /// Creates a [`BinaryArray`] from a [`TrustedLen`]
369    #[inline]
370    pub fn from_trusted_len_iter<I, P>(iterator: I) -> Self
371    where
372        P: AsRef<[u8]>,
373        I: TrustedLen<Item = Option<P>>,
374    {
375        // soundness: I is `TrustedLen`
376        unsafe { Self::from_trusted_len_iter_unchecked(iterator) }
377    }
378
379    /// Creates a [`BinaryArray`] from an falible iterator of trusted length.
380    /// # Safety
381    /// The iterator must be [`TrustedLen`](https://doc.rust-lang.org/std/iter/trait.TrustedLen.html).
382    /// I.e. that `size_hint().1` correctly reports its length.
383    #[inline]
384    pub unsafe fn try_from_trusted_len_iter_unchecked<E, I, P>(iterator: I) -> Result<Self, E>
385    where
386        P: AsRef<[u8]>,
387        I: IntoIterator<Item = Result<Option<P>, E>>,
388    {
389        MutableBinaryArray::<O>::try_from_trusted_len_iter_unchecked(iterator).map(|x| x.into())
390    }
391
392    /// Creates a [`BinaryArray`] from an fallible iterator of trusted length.
393    #[inline]
394    pub fn try_from_trusted_len_iter<E, I, P>(iter: I) -> Result<Self, E>
395    where
396        P: AsRef<[u8]>,
397        I: TrustedLen<Item = Result<Option<P>, E>>,
398    {
399        // soundness: I: TrustedLen
400        unsafe { Self::try_from_trusted_len_iter_unchecked(iter) }
401    }
402}
403
404impl<O: Offset> Array for BinaryArray<O> {
405    impl_common_array!();
406
407    fn validity(&self) -> Option<&Bitmap> {
408        self.validity.as_ref()
409    }
410
411    #[inline]
412    fn with_validity(&self, validity: Option<Bitmap>) -> Box<dyn Array> {
413        Box::new(self.clone().with_validity(validity))
414    }
415}
416
417unsafe impl<O: Offset> GenericBinaryArray<O> for BinaryArray<O> {
418    #[inline]
419    fn values(&self) -> &[u8] {
420        self.values()
421    }
422
423    #[inline]
424    fn offsets(&self) -> &[O] {
425        self.offsets().buffer()
426    }
427}