Thanks to visit codestin.com
Credit goes to github.com

Skip to content

Commit 039e504

Browse files
authored
Add test that the right batch metadata is added/removed/retained (#10381)
### Related * Part of #10273 * Cherry-picked from #10350 ### What Ensure that all the Rerun-specific metadata that is added to a record batch actually makes it into the record batch * [x] full-check
1 parent 97ed174 commit 039e504

File tree

2 files changed

+140
-76
lines changed

2 files changed

+140
-76
lines changed

crates/store/re_sorbet/src/migrations/v0_0_1__to__v0_0_2.rs

Lines changed: 20 additions & 66 deletions
Original file line numberDiff line numberDiff line change
@@ -274,56 +274,13 @@ fn migrate_record_batch(batch: &ArrowRecordBatch) -> ArrowRecordBatch {
274274
fn reorder_columns(batch: &ArrowRecordBatch) -> ArrowRecordBatch {
275275
re_tracing::profile_function!();
276276

277-
let needs_reordering = 'check: {
278-
let mut row_ids = false;
279-
let mut indices = false;
280-
let mut components = false;
281-
282-
let has_indices = batch.schema_ref().fields().iter().any(|field| {
283-
let column_kind = ColumnKind::try_from(field.as_ref()).unwrap_or(ColumnKind::Component);
284-
column_kind == ColumnKind::Index
285-
});
286-
287-
for field in batch.schema_ref().fields() {
288-
let column_kind = ColumnKind::try_from(field.as_ref()).unwrap_or(ColumnKind::Component);
289-
match column_kind {
290-
ColumnKind::RowId => {
291-
row_ids = true;
292-
if (has_indices && indices) || components {
293-
break 'check true;
294-
}
295-
}
296-
297-
ColumnKind::Index => {
298-
indices = true;
299-
if !row_ids || components {
300-
break 'check true;
301-
}
302-
}
303-
304-
ColumnKind::Component => {
305-
components = true;
306-
if !row_ids || (has_indices && !indices) {
307-
break 'check true;
308-
}
309-
}
310-
}
311-
}
312-
313-
false
314-
};
315-
316-
if !needs_reordering {
317-
return batch.clone();
318-
}
319-
320277
let mut row_ids = vec![];
321278
let mut indices = vec![];
322279
let mut components = vec![];
323280

324281
for (field, array) in itertools::izip!(batch.schema().fields(), batch.columns()) {
325-
let field = field.clone();
326-
let array = array.clone();
282+
let field = Arc::clone(field);
283+
let array = Arc::clone(array);
327284
let column_kind = ColumnKind::try_from(field.as_ref()).unwrap_or(ColumnKind::Component);
328285
match column_kind {
329286
ColumnKind::RowId => row_ids.push((field, array)),
@@ -335,34 +292,31 @@ fn reorder_columns(batch: &ArrowRecordBatch) -> ArrowRecordBatch {
335292
let (fields, arrays): (Vec<ArrowFieldRef>, Vec<ArrowArrayRef>) =
336293
itertools::chain!(row_ids, indices, components).unzip();
337294

338-
let schema = Arc::new(ArrowSchema::new_with_metadata(
339-
fields,
340-
batch.schema().metadata.clone(),
341-
));
342-
343-
if schema.fields() != batch.schema().fields() {
295+
if fields == batch.schema().fields().as_ref() {
296+
batch.clone() // Early-out - no reordering needed
297+
} else {
344298
re_log::debug!(
345-
"Reordered columns. Before: {:?}, after: {:?}",
299+
"Reordering columns. Before: {:?}, after: {:?}",
346300
batch
347301
.schema()
348302
.fields()
349303
.iter()
350304
.map(|f| f.name())
351305
.collect_vec(),
352-
schema.fields().iter().map(|f| f.name()).collect_vec()
306+
fields.iter().map(|f| f.name()).collect_vec()
353307
);
354-
} else {
355-
debug_assert!(
356-
false,
357-
"reordered something that didn't need to be reordered"
358-
);
359-
}
360308

361-
ArrowRecordBatch::try_new_with_options(
362-
schema.clone(),
363-
arrays,
364-
&ArrowRecordBatchOptions::default().with_row_count(Some(batch.num_rows())),
365-
)
366-
.ok_or_log_error()
367-
.unwrap_or_else(|| ArrowRecordBatch::new_empty(schema))
309+
let schema = Arc::new(ArrowSchema::new_with_metadata(
310+
fields,
311+
batch.schema().metadata.clone(),
312+
));
313+
314+
ArrowRecordBatch::try_new_with_options(
315+
schema.clone(),
316+
arrays,
317+
&ArrowRecordBatchOptions::default().with_row_count(Some(batch.num_rows())),
318+
)
319+
.ok_or_log_error()
320+
.unwrap_or_else(|| ArrowRecordBatch::new_empty(schema))
321+
}
368322
}

crates/store/re_sorbet/src/sorbet_batch.rs

Lines changed: 120 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ use arrow::{
88
error::ArrowError,
99
};
1010

11+
use itertools::Itertools as _;
1112
use re_log::ResultExt as _;
1213

1314
use crate::{
@@ -21,6 +22,11 @@ use crate::{
2122
#[derive(Debug, Clone)]
2223
pub struct SorbetBatch {
2324
schema: SorbetSchema,
25+
26+
/// This record batch contains all the metadata
27+
/// required by a [`SorbetBatch`].
28+
///
29+
/// It also keeps all non-Rerun metadata intact from wherever it was created.
2430
batch: ArrowRecordBatch,
2531
}
2632

@@ -160,6 +166,9 @@ impl SorbetBatch {
160166
/// Will perform some transformations:
161167
/// * Will automatically wrap data columns in `ListArrays` if they are not already
162168
/// * Will migrate legacy data to more modern form
169+
///
170+
/// Non-Rerun metadata will be preserved (both at batch-level and column-level).
171+
/// Rerun metadata will be updated and added to the batch if needed.
163172
#[tracing::instrument(level = "trace", skip_all)]
164173
pub fn try_from_record_batch(
165174
batch: &ArrowRecordBatch,
@@ -175,20 +184,27 @@ impl SorbetBatch {
175184

176185
let _span = tracing::trace_span!("extend_metadata").entered();
177186

178-
for (field, column) in itertools::izip!(
187+
let new_fields = itertools::izip!(
188+
batch.schema_ref().fields(),
179189
sorbet_schema.columns.arrow_fields(batch_type),
180190
batch.columns()
181-
) {
182-
debug_assert_eq!(field.data_type(), column.data_type());
183-
}
191+
)
192+
.map(|(old_field, mut new_field, column)| {
193+
debug_assert_eq!(new_field.data_type(), column.data_type());
194+
195+
let mut metadata = old_field.metadata().clone();
196+
metadata.extend(new_field.metadata().clone()); // overwrite old with new
197+
new_field.set_metadata(metadata);
198+
199+
Arc::new(new_field)
200+
})
201+
.collect_vec();
202+
203+
let mut batch_metadata = batch.schema_ref().metadata.clone();
204+
batch_metadata.extend(sorbet_schema.arrow_batch_metadata()); // overwrite old with new
184205

185-
// Extend with any metadata that might have been missing:
186-
let mut arrow_schema = ArrowSchema::clone(batch.schema_ref().as_ref());
187-
arrow_schema
188-
.metadata
189-
.extend(sorbet_schema.arrow_batch_metadata());
206+
let arrow_schema = Arc::new(ArrowSchema::new_with_metadata(new_fields, batch_metadata));
190207

191-
let arrow_schema = Arc::new(arrow_schema);
192208
let batch = ArrowRecordBatch::try_new_with_options(
193209
arrow_schema.clone(),
194210
batch.columns().to_vec(),
@@ -203,3 +219,97 @@ impl SorbetBatch {
203219
})
204220
}
205221
}
222+
223+
#[cfg(test)]
224+
mod tests {
225+
226+
use crate::{RowIdColumnDescriptor, sorbet_batch};
227+
228+
use super::*;
229+
230+
/// Test that user-provided metadata is preserved when converting to and from a [`SorbetBatch`].
231+
///
232+
/// Also test that we add the proper Rerun metadata, and remove old Rerun metadata that is not relevant anymore.
233+
#[test]
234+
fn test_sorbet_batch_metadata() {
235+
let original: ArrowRecordBatch = {
236+
let mut row_id_field = RowIdColumnDescriptor::from_sorted(false).to_arrow_field();
237+
row_id_field
238+
.metadata_mut()
239+
.remove("ARROW:extension:metadata");
240+
row_id_field.metadata_mut().insert(
241+
"custom_column_key".to_owned(),
242+
"custom_column_value".to_owned(),
243+
);
244+
let fields = vec![Arc::new(row_id_field)];
245+
let arrow_schema = ArrowSchema::new_with_metadata(
246+
fields,
247+
[
248+
(
249+
"rerun.id".to_owned(),
250+
re_types_core::ChunkId::new().to_string(),
251+
),
252+
(
253+
"custom_batch_key".to_owned(),
254+
"custom_batch_value".to_owned(),
255+
),
256+
]
257+
.into_iter()
258+
.collect(),
259+
);
260+
ArrowRecordBatch::new_empty(arrow_schema.into())
261+
};
262+
263+
{
264+
// Check original has what we expect:
265+
assert!(original.schema().metadata().contains_key("rerun.id"));
266+
assert!(
267+
original
268+
.schema()
269+
.metadata()
270+
.contains_key("custom_batch_key")
271+
);
272+
let row_id = original.schema_ref().field(0);
273+
assert!(
274+
!row_id.metadata().contains_key("ARROW:extension:metadata"),
275+
"We intentionally omitted this from the original"
276+
);
277+
}
278+
279+
let sorbet_batch = sorbet_batch::SorbetBatch::try_from_record_batch(
280+
&original,
281+
crate::BatchType::Dataframe,
282+
)
283+
.unwrap();
284+
285+
let ret = ArrowRecordBatch::from(sorbet_batch);
286+
287+
assert!(
288+
!ret.schema().metadata().contains_key("rerun.id"),
289+
"This should have been removed/renamed"
290+
);
291+
assert!(
292+
ret.schema().metadata().contains_key("rerun:id"),
293+
"This should have been added/renamed"
294+
);
295+
assert!(
296+
ret.schema().metadata().contains_key("custom_batch_key"),
297+
"This should remain"
298+
);
299+
assert!(
300+
ret.schema().metadata().contains_key("sorbet:version"),
301+
"This should have been added"
302+
);
303+
304+
// Check field:
305+
let row_id = ret.schema_ref().field(0);
306+
assert!(
307+
row_id.metadata().contains_key("custom_column_key"),
308+
"This should remain"
309+
);
310+
assert!(
311+
row_id.metadata().contains_key("ARROW:extension:metadata"),
312+
"This should have been added"
313+
);
314+
}
315+
}

0 commit comments

Comments
 (0)