Thanks to visit codestin.com
Credit goes to github.com

Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion encodings/fastlanes/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ mod delta;
mod r#for;
mod rle;

pub(crate) const FL_CHUNK_SIZE: usize = 1024;
/// Size of one FastLanes chunk.
///
/// Public so that code outside this crate (e.g. the DuckDB RLE exporter) can map a
/// position to the index of the chunk that contains it.
pub const FL_CHUNK_SIZE: usize = 1024;

use bitpacking::compute::is_constant::BitPackedIsConstantKernel;
use r#for::compute::is_constant::FoRIsConstantKernel;
Expand Down
15 changes: 15 additions & 0 deletions vortex-duckdb/src/e2e_test/vortex_scan_test.rs
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ use vortex::array::validity::Validity;
use vortex::buffer::buffer;
use vortex::dtype::Nullability;
use vortex::dtype::PType;
use vortex::encodings::fastlanes::RLEData;
use vortex::file::WriteOptionsSessionExt;
use vortex::io::runtime::BlockingRuntime;
use vortex::scalar::PValue;
Expand Down Expand Up @@ -956,6 +957,20 @@ fn test_vortex_encodings_roundtrip() {
not(duckdb_release),
ignore = "spatial extension requires a release DuckDB build"
)]
// NOTE(review): the lines immediately above this test close a
// `#[cfg_attr(not(duckdb_release), ignore = "spatial extension ...")]` attribute. As placed,
// that attribute appears to attach to this test rather than to `test_geometry` below, which
// would ignore this test on non-release DuckDB builds and leave `test_geometry` unguarded.
// Confirm, and if so move this test above the attribute.
#[test]
fn test_fastlanes_rle_roundtrip() {
    // 2048 rows made of runs of 256 equal values.
    let input: Vec<i32> = (0i32..2048).map(|i| i / 256).collect();

    // Encode as FastLanes RLE and write it out as a single-column Vortex file.
    let file = RUNTIME.block_on(async {
        let mut ctx = SESSION.create_execution_ctx();
        let source = PrimitiveArray::from_iter(input.clone());
        let encoded = RLEData::encode(source.as_view(), &mut ctx).unwrap();
        write_single_column_vortex_file("rle_col", encoded.into_array()).await
    });

    // Scanning the file back must yield the original values.
    let scanned: Vec<i32> = scan_vortex_file::<i32, _>(file, "SELECT rle_col FROM ?", 0).unwrap();
    assert_eq!(scanned, input);
}

#[test]
fn test_geometry() {
let file = RUNTIME.block_on(async {
Expand Down
7 changes: 7 additions & 0 deletions vortex-duckdb/src/exporter/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ mod geo;
mod list;
mod list_view;
mod primitive;
mod rle;
mod run_end;
mod sequence;
mod struct_;
Expand All @@ -34,6 +35,7 @@ use vortex::array::arrays::List;
use vortex::array::arrays::StructArray;
use vortex::array::arrays::struct_::StructArrayExt;
use vortex::buffer::BitChunks;
use vortex::encodings::fastlanes::RLE;
use vortex::encodings::runend::RunEnd;
use vortex::encodings::sequence::Sequence;
use vortex::error::VortexExpect;
Expand Down Expand Up @@ -244,6 +246,11 @@ fn new_array_exporter_with_flatten(
Err(array) => array,
};

let array = match array.try_downcast::<RLE>() {
Ok(array) => return rle::new_exporter_with_flatten(array, cache, ctx, flatten),
Err(array) => array,
};

let array = match array.try_downcast::<Dict>() {
Ok(array) => return dict::new_exporter_with_flatten(&array, cache, ctx, flatten),
Err(array) => array,
Expand Down
292 changes: 292 additions & 0 deletions vortex-duckdb/src/exporter/rle.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,292 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright the Vortex contributors

use std::marker::PhantomData;

use num_traits::AsPrimitive;
use vortex::array::ExecutionCtx;
use vortex::array::IntoArray;
use vortex::array::arrays::PrimitiveArray;
use vortex::array::match_each_unsigned_integer_ptype;
use vortex::array::validity::Validity;
use vortex::dtype::IntegerPType;
use vortex::encodings::fastlanes::FL_CHUNK_SIZE;
use vortex::encodings::fastlanes::RLEArray;
use vortex::encodings::fastlanes::RLEArrayExt;
use vortex::error::VortexResult;

use crate::duckdb::ReusableDict;
use crate::duckdb::SelectionVector;
use crate::duckdb::VectorRef;
use crate::exporter::ColumnExporter;
use crate::exporter::all_invalid;
use crate::exporter::cache::ConversionCache;
use crate::exporter::cached_values_dict;
use crate::exporter::canonical;

/// Column exporter that emits an RLE array as a DuckDB dictionary vector.
///
/// `I` is the element type used to read `indices` and `O` the element type used to read
/// `values_idx_offsets`; both are carried purely as type markers.
struct RLEExporter<I: IntegerPType, O: IntegerPType> {
    /// Dictionary handed to `reuse_dictionary` on every export call.
    values: ReusableDict,
    /// Per-row indices; each one is added to its chunk's base to form a selection entry.
    indices: PrimitiveArray,
    /// One entry per chunk; the first entry is subtracted from the others when exporting.
    values_idx_offsets: PrimitiveArray,
    /// Offset relative to the first chunk.
    offset: usize,
    /// Marker for the element type of `indices`.
    indices_type: PhantomData<I>,
    /// Marker for the element type of `values_idx_offsets`.
    values_idx_offsets_type: PhantomData<O>,
}

/// Builds a [`ColumnExporter`] for an RLE-encoded array.
///
/// The canonical exporter is used when `flatten` is requested, when the array is empty, or
/// when the indices carry an array-backed validity. Indices that are entirely invalid get
/// the all-invalid exporter. Otherwise the array is exported as a dictionary: the RLE values
/// become the (cached) dictionary, and the indices and per-chunk offsets are materialised as
/// primitive arrays.
///
/// # Errors
///
/// Propagates any error from reading the validity, executing the child arrays, or building
/// the cached dictionary.
pub(crate) fn new_exporter_with_flatten(
    array: RLEArray,
    cache: &ConversionCache,
    ctx: &mut ExecutionCtx,
    flatten: bool,
) -> VortexResult<Box<dyn ColumnExporter>> {
    if flatten || array.is_empty() {
        return canonical::new_exporter(array.into_array(), cache, ctx);
    }

    // The codes of a DuckDB dictionary cannot carry validity. Rather than executing the
    // validity mask, fall back to canonicalising whenever a NULL is possible.
    match array.indices().validity()? {
        Validity::Array(_) => return canonical::new_exporter(array.into_array(), cache, ctx),
        Validity::AllInvalid => return Ok(all_invalid::new_exporter()),
        _ => {}
    }

    let codes = array.indices().clone().execute::<PrimitiveArray>(ctx)?;
    let chunk_offsets = array
        .values_idx_offsets()
        .clone()
        .execute::<PrimitiveArray>(ctx)?;
    let dict = cached_values_dict(array.values().clone(), cache, ctx)?;
    let first_chunk_offset = array.offset();

    // Dispatch on both physical integer types so the exporter reads typed slices directly.
    match_each_unsigned_integer_ptype!(codes.ptype(), |I| {
        match_each_unsigned_integer_ptype!(chunk_offsets.ptype(), |O| {
            Ok(Box::new(RLEExporter {
                values: dict,
                indices: codes,
                values_idx_offsets: chunk_offsets,
                offset: first_chunk_offset,
                indices_type: PhantomData::<I>,
                values_idx_offsets_type: PhantomData::<O>,
            }))
        })
    })
}

impl<I, O> ColumnExporter for RLEExporter<I, O>
where
    I: IntegerPType + AsPrimitive<u32>,
    O: IntegerPType + AsPrimitive<u32>,
{
    /// Exports `len` rows starting at `offset` into `vector` as a dictionary vector.
    ///
    /// A selection vector is filled with one dictionary position per row: the row's index
    /// plus the base of the chunk it belongs to, where the base is that chunk's entry in
    /// `values_idx_offsets` relative to the first entry. The rows are processed one
    /// `FL_CHUNK_SIZE`-aligned chunk at a time because the base changes per chunk.
    fn export(
        &self,
        offset: usize,
        len: usize,
        vector: &mut VectorRef,
        _ctx: &mut ExecutionCtx,
    ) -> VortexResult<()> {
        let mut selection_vec = SelectionVector::with_capacity(len);
        // SAFETY (assumed, confirm against `SelectionVector`): the vector was created with
        // capacity `len` just above, and the loop below writes all `len` entries before the
        // selection vector is handed to DuckDB.
        let selection = unsafe { selection_vec.as_slice_mut(len) };

        let codes = self.indices.as_slice::<I>();
        let chunk_starts = self.values_idx_offsets.as_slice::<O>();

        // Absolute positions in `codes` of the requested window.
        let start = self.offset + offset;
        let stop = start + len;
        let origin = chunk_starts[0];

        let mut cursor = start;
        let mut written = 0;
        while cursor < stop {
            let chunk = cursor / FL_CHUNK_SIZE;
            let base: u32 = (chunk_starts[chunk] - origin).as_();
            // Stop at the end of the current chunk or of the window, whichever comes first.
            let chunk_stop = stop.min((chunk + 1) * FL_CHUNK_SIZE);
            let run = chunk_stop - cursor;

            let out = &mut selection[written..written + run];
            let src = &codes[cursor..chunk_stop];
            for (slot, code) in out.iter_mut().zip(src) {
                let code: u32 = code.as_();
                *slot = base + code;
            }

            written += run;
            cursor = chunk_stop;
        }

        vector.reuse_dictionary(&self.values, &selection_vec);
        Ok(())
    }
}

#[cfg(test)]
mod tests {
use vortex::array::ArrayRef;
use vortex::array::IntoArray;
use vortex::array::VortexSessionExecute;
use vortex::array::arrays::PrimitiveArray;
use vortex::encodings::fastlanes::RLEArray;
use vortex::encodings::fastlanes::RLEData;
use vortex::error::VortexResult;

use crate::SESSION;
use crate::cpp::duckdb_type::DUCKDB_TYPE_INTEGER;
use crate::duckdb::DataChunk;
use crate::duckdb::LogicalType;
use crate::exporter::ConversionCache;
use crate::exporter::new_array_exporter;

/// Encodes `values` as an RLE array using a fresh execution context.
fn encode_rle(values: Vec<i32>) -> VortexResult<RLEArray> {
    let source = PrimitiveArray::from_iter(values);
    let mut ctx = SESSION.create_execution_ctx();
    RLEData::encode(source.as_view(), &mut ctx)
}

/// Exports the first `len` rows of `array` into a single-column INTEGER chunk, flattens the
/// resulting vector and returns its contents.
fn export_flat(array: ArrayRef, len: usize) -> VortexResult<Vec<i32>> {
    let mut ctx = SESSION.create_execution_ctx();
    let mut chunk = DataChunk::new([LogicalType::new(DUCKDB_TYPE_INTEGER)]);
    let cache = ConversionCache::default();

    let exporter = new_array_exporter(array, &cache, &mut ctx)?;
    exporter.export(0, len, chunk.get_vector_mut(0), &mut ctx)?;
    chunk.set_len(len);

    // Flatten so the values can be read back as a plain slice.
    let flat = chunk.get_vector(0);
    flat.flatten(len as u64);
    Ok(flat.as_slice_with_len::<i32>(len).to_vec())
}

/// Round-trips 2048 rows (runs of 100 equal values) through the exporter.
#[test]
fn test_roundtrip_two_chunks() -> VortexResult<()> {
    let input: Vec<i32> = (0i32..2048).map(|i| i / 100).collect();

    let encoded = encode_rle(input.clone())?;
    let exported = export_flat(encoded.into_array(), input.len())?;

    assert_eq!(exported, input);
    Ok(())
}

/// Round-trips the slice 500..1700 of a 2048-row array; the slice straddles position 1024.
#[test]
fn test_roundtrip_boundary() -> VortexResult<()> {
    let (start, end) = (500usize, 1700usize);
    let source: Vec<i32> = (0i32..2048).map(|i| i / 100).collect();

    let sliced = encode_rle(source.clone())?
        .into_array()
        .slice(start..end)?;
    let exported = export_flat(sliced, end - start)?;

    assert_eq!(exported, source[start..end]);
    Ok(())
}

/// Round-trips the slice 1200..2000 of a 3072-row array.
#[test]
fn test_roundtrip_slice() -> VortexResult<()> {
    let (start, end) = (1200usize, 2000usize);
    let source: Vec<i32> = (0i32..3072).map(|i| i / 100).collect();

    let sliced = encode_rle(source.clone())?
        .into_array()
        .slice(start..end)?;
    let exported = export_flat(sliced, end - start)?;

    assert_eq!(exported, source[start..end]);
    Ok(())
}

/// Exports `len` rows of `array` starting at `offset` into a single-column INTEGER chunk and
/// returns the chunk's string rendering.
fn chunk_string(array: ArrayRef, offset: usize, len: usize) -> VortexResult<String> {
    let mut ctx = SESSION.create_execution_ctx();
    let mut chunk = DataChunk::new([LogicalType::new(DUCKDB_TYPE_INTEGER)]);
    let cache = ConversionCache::default();

    let exporter = new_array_exporter(array, &cache, &mut ctx)?;
    exporter.export(offset, len, chunk.get_vector_mut(0), &mut ctx)?;
    chunk.set_len(len);

    String::try_from(&*chunk)
}

fn two_chunk_rle() -> VortexResult<RLEArray> {
let mut ctx = SESSION.create_execution_ctx();
let source: Vec<i32> = std::iter::repeat_n(10i32, 1024)
.chain(std::iter::repeat_n(20, 1024))
.collect();
RLEData::encode(PrimitiveArray::from_iter(source).as_view(), &mut ctx)
}

/// The first five rows are expected to be exported as a dictionary vector.
#[test]
fn test_one_chunk() -> VortexResult<()> {
    let rendered = chunk_string(two_chunk_rle()?.into_array(), 0, 5)?;
    let expected = r#"Chunk - [1 Columns]
- DICTIONARY INTEGER: 5 = [ 10, 10, 10, 10, 10]
"#;
    assert_eq!(rendered, expected);
    Ok(())
}

/// An RLE array containing NULLs is expected to be exported as a flat vector.
#[test]
fn test_one_chunk_nulls() -> VortexResult<()> {
    let mut ctx = SESSION.create_execution_ctx();
    let nullable = PrimitiveArray::from_option_iter(vec![Some(0u32), Some(1), None, Some(3), None]);
    let encoded = RLEData::encode(nullable.as_view(), &mut ctx)?;

    let rendered = chunk_string(encoded.into_array(), 0, 5)?;
    let expected = r#"Chunk - [1 Columns]
- FLAT INTEGER: 5 = [ 0, 1, NULL, 3, NULL]
"#;
    assert_eq!(rendered, expected);
    Ok(())
}

/// Exports ten rows starting at 1020, i.e. a window crossing from the 10s into the 20s.
#[test]
fn test_chunk_boundary() -> VortexResult<()> {
    let rendered = chunk_string(two_chunk_rle()?.into_array(), 1020, 10)?;
    let expected = r#"Chunk - [1 Columns]
- DICTIONARY INTEGER: 10 = [ 10, 10, 10, 10, 20, 20, 20, 20, 20, 20]
"#;
    assert_eq!(rendered, expected);
    Ok(())
}

/// A slice (1500..1510) of the RLE array is expected to be exported as a flat vector.
#[test]
fn test_chunk_slice() -> VortexResult<()> {
    let sliced = two_chunk_rle()?.into_array().slice(1500..1510)?;

    let rendered = chunk_string(sliced, 0, 10)?;
    let expected = r#"Chunk - [1 Columns]
- FLAT INTEGER: 10 = [ 20, 20, 20, 20, 20, 20, 20, 20, 20, 20]
"#;
    assert_eq!(rendered, expected);
    Ok(())
}

/// Round-trips 1024 rows where every seventh row is NULL, checking both nullness and values.
#[test]
fn test_roundtrip_with_nulls() -> VortexResult<()> {
    const ROWS: usize = 1024;

    let source: Vec<Option<i32>> = (0i32..1024)
        .map(|i| (i % 7 != 0).then_some(i / 50))
        .collect();

    let mut ctx = SESSION.create_execution_ctx();
    let nullable = PrimitiveArray::from_option_iter(source.clone());
    let encoded = RLEData::encode(nullable.as_view(), &mut ctx)?;

    let mut chunk = DataChunk::new([LogicalType::new(DUCKDB_TYPE_INTEGER)]);
    let cache = ConversionCache::default();
    let exporter = new_array_exporter(encoded.into_array(), &cache, &mut ctx)?;
    exporter.export(0, ROWS, chunk.get_vector_mut(0), &mut ctx)?;
    chunk.set_len(ROWS);

    // Flatten so values can be read back as a plain slice alongside the null flags.
    let vector = chunk.get_vector(0);
    vector.flatten(ROWS as u64);
    let decoded = vector.as_slice_with_len::<i32>(ROWS);

    for (i, expected) in source.iter().enumerate() {
        match expected {
            Some(v) => {
                assert!(!vector.row_is_null(i as u64), "row {i} is null");
                assert_eq!(decoded[i], *v);
            }
            None => assert!(vector.row_is_null(i as u64), "row {i} not null"),
        }
    }
    Ok(())
}
}
27 changes: 27 additions & 0 deletions vortex-duckdb/src/exporter/run_end.rs
Original file line number Diff line number Diff line change
Expand Up @@ -135,10 +135,37 @@ mod tests {
use vortex::error::VortexResult;

use crate::SESSION;
use crate::cpp::duckdb_type::DUCKDB_TYPE_INTEGER;
use crate::duckdb::DataChunk;
use crate::duckdb::LogicalType;
use crate::exporter::ArrayExporter;
use crate::exporter::ConversionCache;
use crate::exporter::new_array_exporter;

/// A run-end encoded array containing NULLs is expected to be exported as a dictionary
/// vector that preserves the NULL rows.
#[test]
fn test_one_chunk_null() -> VortexResult<()> {
    let mut ctx = SESSION.create_execution_ctx();
    let nullable = PrimitiveArray::from_option_iter(vec![Some(0u32), Some(1), None, Some(3), None]);
    let encoded = RunEnd::encode(nullable.into_array(), &mut ctx)?;

    let mut chunk = DataChunk::new([LogicalType::new(DUCKDB_TYPE_INTEGER)]);
    let cache = ConversionCache::default();
    let exporter = new_array_exporter(encoded.into_array(), &cache, &mut ctx)?;
    exporter.export(0, 5, chunk.get_vector_mut(0), &mut ctx)?;
    chunk.set_len(5);

    let rendered = String::try_from(&*chunk)?;
    let expected = r#"Chunk - [1 Columns]
- DICTIONARY INTEGER: 5 = [ 0, 1, NULL, 3, NULL]
"#;
    assert_eq!(rendered, expected);
    Ok(())
}

#[test]
fn run_end_with_chunked_values_exports_across_value_chunks() -> VortexResult<()> {
Expand Down
Loading