Thanks for visiting codestin.com
Credit goes to github.com

Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 14 additions & 1 deletion vortex-duckdb/src/exporter/constant.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ use crate::duckdb::Value;
use crate::duckdb::VectorRef;
use crate::exporter::ColumnExporter;
use crate::exporter::ConversionCache;
use crate::exporter::canonical;
use crate::exporter::new_array_exporter;
use crate::exporter::validity;

Expand Down Expand Up @@ -46,7 +47,19 @@ pub fn new_exporter_with_mask(
new_exporter(array)
}

pub(crate) fn new_exporter(array: ConstantArray) -> VortexResult<Box<dyn ColumnExporter>> {
/// Build a column exporter for a constant array, optionally forcing a flat layout.
///
/// When `flatten` is `true` the array is converted with `into_array()` and handed to
/// `canonical::new_exporter` together with `cache` and `ctx`. When it is `false` the
/// array goes to the constant-specific `new_exporter`, which needs neither `cache`
/// nor `ctx`.
///
/// # Errors
///
/// Propagates whatever error the selected exporter constructor returns.
pub(crate) fn new_exporter_with_flatten(
    array: ConstantArray,
    cache: &ConversionCache,
    ctx: &mut ExecutionCtx,
    flatten: bool,
) -> VortexResult<Box<dyn ColumnExporter>> {
    if flatten {
        // Caller asked for materialized values: delegate to the canonical exporter.
        canonical::new_exporter(array.into_array(), cache, ctx)
    } else {
        // Otherwise keep the compact constant representation.
        new_exporter(array)
    }
}

fn new_exporter(array: ConstantArray) -> VortexResult<Box<dyn ColumnExporter>> {
let value = if array.scalar().is_null() {
// If the scalar is null and _not_ of type Null, then we cannot assign a null DuckDB value
// to a constant vector since DuckDB will complain about a type-mismatch. In these cases,
Expand Down
32 changes: 4 additions & 28 deletions vortex-duckdb/src/exporter/dict.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,9 @@ use crate::duckdb::VectorRef;
use crate::exporter::ColumnExporter;
use crate::exporter::all_invalid;
use crate::exporter::cache::ConversionCache;
use crate::exporter::cached_values_dict;
use crate::exporter::constant;
use crate::exporter::new_array_exporter;
use crate::exporter::new_array_exporter_with_flatten;

struct DictExporter<I: IntegerPType> {
// Store the dictionary values once and export the same dictionary with each codes chunk.
Expand Down Expand Up @@ -70,7 +70,7 @@ pub(crate) fn new_exporter_with_flatten(
let values_key = values.addr();
let codes = array.codes().clone().execute::<PrimitiveArray>(ctx)?;

let reusable_dict = if flatten {
if flatten {
let canonical = cache
.canonical_cache
.get(&values_key)
Expand All @@ -93,33 +93,9 @@ pub(crate) fn new_exporter_with_flatten(
cache,
ctx,
);
} else {
// Check if we have a cached vector and extract it if we do.
let reusable_dict = cache
.dict_cache
.get(&values_key)
.map(|entry| entry.value().1.clone());

match reusable_dict {
Some(reusable_dict) => reusable_dict,
None => {
// Create a new reusable dictionary for the values.
let mut reusable_dict = ReusableDict::new(values.dtype().try_into()?, values.len());
new_array_exporter_with_flatten(values.clone(), cache, ctx, true)?.export(
0,
values.len(),
reusable_dict.vector(),
ctx,
)?;

cache
.dict_cache
.insert(values_key, (values.clone(), reusable_dict.clone()));
}

reusable_dict
}
}
};
let reusable_dict = cached_values_dict(values.clone(), cache, ctx)?;

match_each_integer_ptype!(codes.ptype(), |I| {
Ok(Box::new(DictExporter {
Expand Down
31 changes: 29 additions & 2 deletions vortex-duckdb/src/exporter/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ use vortex::error::vortex_bail;
use vortex::error::vortex_ensure;

use crate::duckdb::DataChunkRef;
use crate::duckdb::ReusableDict;
use crate::duckdb::VectorRef;
use crate::duckdb::duckdb_vector_size;

Expand Down Expand Up @@ -195,6 +196,32 @@ fn new_array_exporter(
new_array_exporter_with_flatten(array, cache, ctx, false)
}

/// Export "values" into a ReusableDict, saving the dictionary in "cache".
fn cached_values_dict(
values: ArrayRef,
cache: &ConversionCache,
ctx: &mut ExecutionCtx,
) -> VortexResult<ReusableDict> {
let key = values.addr();
if let Some(entry) = cache.dict_cache.get(&key) {
return Ok(entry.value().1.clone());
}
let mut dict = ReusableDict::new(values.dtype().try_into()?, values.len());
// ReusableDict's values must be flattened. When we call
Comment thread
myrrc marked this conversation as resolved.
// vector.reuse_dictionary() with dict returned from this function, if
// data is not flat, duckdb's functions like TupleDataScatter read inner
// storage directly as T. If data inside was SEQUENCE or CONSTANT vectors
// which don't have a T buffer, we read garbage data.
new_array_exporter_with_flatten(values.clone(), cache, ctx, true)?.export(
0,
values.len(),
dict.vector(),
ctx,
)?;
cache.dict_cache.insert(key, (values, dict.clone()));
Ok(dict)
}

/// Create a DuckDB exporter for the given Vortex array.
fn new_array_exporter_with_flatten(
array: ArrayRef,
Expand All @@ -203,12 +230,12 @@ fn new_array_exporter_with_flatten(
flatten: bool,
) -> VortexResult<Box<dyn ColumnExporter>> {
let array = match array.try_downcast::<Constant>() {
Ok(array) => return constant::new_exporter(array),
Ok(array) => return constant::new_exporter_with_flatten(array, cache, ctx, flatten),
Err(array) => array,
};

let array = match array.try_downcast::<Sequence>() {
Ok(array) => return sequence::new_exporter(&array),
Ok(array) => return sequence::new_exporter_with_flatten(&array, cache, ctx, flatten),
Err(array) => array,
};

Expand Down
43 changes: 13 additions & 30 deletions vortex-duckdb/src/exporter/run_end.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,20 +17,21 @@ use vortex::error::VortexExpect;
use vortex::error::VortexResult;

use crate::convert::ToDuckDBScalar;
use crate::duckdb::ReusableDict;
use crate::duckdb::SelectionVector;
use crate::duckdb::VectorRef;
use crate::exporter::ColumnExporter;
use crate::exporter::cache::ConversionCache;
use crate::exporter::cached_values_dict;
use crate::exporter::canonical;
use crate::exporter::new_array_exporter_with_flatten;

/// We export run-end arrays to a DuckDB dictionary vector, using a selection vector to
/// repeat the values in the run-end array.
/// We export run-end arrays to a DuckDB dictionary vector. Values are exported
/// into a ReusableDict with SelectionVector applied in export().
struct RunEndExporter<E: IntegerPType> {
ends: PrimitiveArray,
ends_type: PhantomData<E>,
values: ArrayRef,
values_exporter: Box<dyn ColumnExporter>,
values_dict: ReusableDict,
run_end_offset: usize,
}

Expand All @@ -50,16 +51,14 @@ pub(crate) fn new_exporter_with_flatten(
let ends = array.ends().clone();
let values = array.values().clone();
let ends = ends.execute::<PrimitiveArray>(ctx)?;
// REE exports values in run-index space, not outer row space. Materialize the dictionary
// payload so chunked physical boundaries in the values child cannot constrain row batches.
let values_exporter = new_array_exporter_with_flatten(values.clone(), cache, ctx, true)?;
let values_dict = cached_values_dict(values.clone(), cache, ctx)?;

match_each_integer_ptype!(ends.ptype(), |E| {
Ok(Box::new(RunEndExporter {
ends,
ends_type: PhantomData::<E>,
values,
values_exporter,
values_dict,
run_end_offset: offset,
}))
})
Expand Down Expand Up @@ -88,10 +87,7 @@ impl<E: IntegerPType> ColumnExporter for RunEndExporter<E> {

// Find the final run in case we can short-circuit and return a constant vector.
let end_run_idx = ends_slice
.search_sorted(
&offset.add(E::from_usize(len).vortex_expect("len out of bounds")),
SearchSortedSide::Right,
)?
.search_sorted(&end_offset, SearchSortedSide::Right)?
.to_ends_index(ends_slice.len());

if start_run_idx == end_run_idx {
Expand All @@ -113,29 +109,16 @@ impl<E: IntegerPType> ColumnExporter for RunEndExporter<E> {
.to_usize()
.vortex_expect("run_len is usize");

// Push the runs into the selection vector.
sel_vec_slice[..run_len].fill(u32::try_from(run_idx).vortex_expect("sel_idx is u32"));
let global_run_idx =
u32::try_from(start_run_idx + run_idx).vortex_expect("run index exceeds u32");
sel_vec_slice[..run_len].fill(global_run_idx);
sel_vec_slice = &mut sel_vec_slice[run_len..];

offset = next_end;
}
assert!(
sel_vec_slice.is_empty(),
"Selection vector not completely filled"
);

// The values in the selection vector are the run indices, so we can find the number of
// values we referenced by looking at the last index of the selection vector.
let values_len = *unsafe { sel_vec.as_slice_mut(len) }
.last()
.vortex_expect("non-empty")
+ 1;

// Export the run-end values into the vector, and then turn it into a dictionary vector.
self.values_exporter
.export(start_run_idx, values_len as usize, vector, ctx)?;
vector.dictionary(vector, values_len as usize, &sel_vec, len as _);
debug_assert!(sel_vec_slice.is_empty());

vector.reuse_dictionary(&self.values_dict, &sel_vec);
Ok(())
}
}
Expand Down
16 changes: 14 additions & 2 deletions vortex-duckdb/src/exporter/sequence.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,19 +3,30 @@

use bitvec::macros::internal::funty::Fundamental;
use vortex::array::ExecutionCtx;
use vortex::array::IntoArray;
use vortex::encodings::sequence::SequenceArray;
use vortex::error::VortexExpect;
use vortex::error::VortexResult;

use crate::duckdb::VectorRef;
use crate::exporter::ColumnExporter;
use crate::exporter::ConversionCache;
use crate::exporter::canonical;

/// Column exporter state for a sequence array, held as two `i64` parameters
/// rather than as materialized values.
struct SequenceExporter {
/// Base value of the sequence; the constructor fills it from `array.base()`
/// converted to `i64`, which must not be null.
start: i64,
/// NOTE(review): presumably the difference between consecutive elements of the
/// sequence — confirm against the constructor.
step: i64,
}

pub(crate) fn new_exporter(array: &SequenceArray) -> VortexResult<Box<dyn ColumnExporter>> {
pub(crate) fn new_exporter_with_flatten(
Comment thread
myrrc marked this conversation as resolved.
array: &SequenceArray,
cache: &ConversionCache,
ctx: &mut ExecutionCtx,
flatten: bool,
) -> VortexResult<Box<dyn ColumnExporter>> {
if flatten {
return canonical::new_exporter(array.clone().into_array(), cache, ctx);
}
Ok(Box::new(SequenceExporter {
start: array.base().as_i64().vortex_expect("cannot have null base"),
step: array
Expand Down Expand Up @@ -58,8 +69,9 @@ mod tests {
fn test_sequence() {
let arr = Sequence::try_new_typed(2, 5, Nullability::NonNullable, 100).unwrap();
let mut chunk = DataChunk::new([LogicalType::new(cpp::duckdb_type::DUCKDB_TYPE_INTEGER)]);
let mut ctx = SESSION.create_execution_ctx();

new_exporter(&arr)
new_exporter_with_flatten(&arr, &ConversionCache::default(), &mut ctx, false)
.unwrap()
.export(
0,
Expand Down
Loading