@@ -8,6 +8,7 @@ use arrow::{
8
8
error:: ArrowError ,
9
9
} ;
10
10
11
+ use itertools:: Itertools as _;
11
12
use re_log:: ResultExt as _;
12
13
13
14
use crate :: {
@@ -21,6 +22,11 @@ use crate::{
21
22
#[ derive( Debug , Clone ) ]
22
23
pub struct SorbetBatch {
23
24
schema : SorbetSchema ,
25
+
26
+ /// This record batch has all the metadata
27
+ /// required by a [`SorbetBatch`].
28
+ ///
29
+ /// It also keeps all non-Rerun metadata intact from wherever it was created.
24
30
batch : ArrowRecordBatch ,
25
31
}
26
32
@@ -160,6 +166,9 @@ impl SorbetBatch {
160
166
/// Will perform some transformations:
161
167
/// * Will automatically wrap data columns in `ListArrays` if they are not already
162
168
/// * Will migrate legacy data to more modern form
169
+ ///
170
+ /// Non-Rerun metadata will be preserved (both at batch-level and column-level).
171
+ /// Rerun metadata will be updated and added to the batch if needed.
163
172
#[ tracing:: instrument( level = "trace" , skip_all) ]
164
173
pub fn try_from_record_batch (
165
174
batch : & ArrowRecordBatch ,
@@ -175,20 +184,27 @@ impl SorbetBatch {
175
184
176
185
let _span = tracing:: trace_span!( "extend_metadata" ) . entered ( ) ;
177
186
178
- for ( field, column) in itertools:: izip!(
187
+ let new_fields = itertools:: izip!(
188
+ batch. schema_ref( ) . fields( ) ,
179
189
sorbet_schema. columns. arrow_fields( batch_type) ,
180
190
batch. columns( )
181
- ) {
182
- debug_assert_eq ! ( field. data_type( ) , column. data_type( ) ) ;
183
- }
191
+ )
192
+ . map ( |( old_field, mut new_field, column) | {
193
+ debug_assert_eq ! ( new_field. data_type( ) , column. data_type( ) ) ;
194
+
195
+ let mut metadata = old_field. metadata ( ) . clone ( ) ;
196
+ metadata. extend ( new_field. metadata ( ) . clone ( ) ) ; // overwrite old with new
197
+ new_field. set_metadata ( metadata) ;
198
+
199
+ Arc :: new ( new_field)
200
+ } )
201
+ . collect_vec ( ) ;
202
+
203
+ let mut batch_metadata = batch. schema_ref ( ) . metadata . clone ( ) ;
204
+ batch_metadata. extend ( sorbet_schema. arrow_batch_metadata ( ) ) ; // overwrite old with new
184
205
185
- // Extend with any metadata that might have been missing:
186
- let mut arrow_schema = ArrowSchema :: clone ( batch. schema_ref ( ) . as_ref ( ) ) ;
187
- arrow_schema
188
- . metadata
189
- . extend ( sorbet_schema. arrow_batch_metadata ( ) ) ;
206
+ let arrow_schema = Arc :: new ( ArrowSchema :: new_with_metadata ( new_fields, batch_metadata) ) ;
190
207
191
- let arrow_schema = Arc :: new ( arrow_schema) ;
192
208
let batch = ArrowRecordBatch :: try_new_with_options (
193
209
arrow_schema. clone ( ) ,
194
210
batch. columns ( ) . to_vec ( ) ,
@@ -203,3 +219,97 @@ impl SorbetBatch {
203
219
} )
204
220
}
205
221
}
222
+
223
+ #[ cfg( test) ]
224
+ mod tests {
225
+
226
+ use crate :: { RowIdColumnDescriptor , sorbet_batch} ;
227
+
228
+ use super :: * ;
229
+
230
+ /// Test that user-provided metadata is preserved when converting to and from a [`SorbetBatch`].
231
+ ///
232
+ /// Also test that the proper Rerun metadata is added (`rerun:id`, `sorbet:version`, `ARROW:extension:metadata`), and that the old `rerun.id` key is gone from the result.
233
+ #[ test]
234
+ fn test_sorbet_batch_metadata ( ) {
235
+ let original: ArrowRecordBatch = {
236
+ let mut row_id_field = RowIdColumnDescriptor :: from_sorted ( false ) . to_arrow_field ( ) ;
237
+ row_id_field
238
+ . metadata_mut ( )
239
+ . remove ( "ARROW:extension:metadata" ) ;
240
+ row_id_field. metadata_mut ( ) . insert (
241
+ "custom_column_key" . to_owned ( ) ,
242
+ "custom_column_value" . to_owned ( ) ,
243
+ ) ;
244
+ let fields = vec ! [ Arc :: new( row_id_field) ] ;
245
+ let arrow_schema = ArrowSchema :: new_with_metadata (
246
+ fields,
247
+ [
248
+ (
249
+ "rerun.id" . to_owned ( ) ,
250
+ re_types_core:: ChunkId :: new ( ) . to_string ( ) ,
251
+ ) ,
252
+ (
253
+ "custom_batch_key" . to_owned ( ) ,
254
+ "custom_batch_value" . to_owned ( ) ,
255
+ ) ,
256
+ ]
257
+ . into_iter ( )
258
+ . collect ( ) ,
259
+ ) ;
260
+ ArrowRecordBatch :: new_empty ( arrow_schema. into ( ) )
261
+ } ;
262
+
263
+ {
264
+ // Sanity-check the input: it has the `rerun.id` and custom batch keys, but no `ARROW:extension:metadata` on the row-id field:
265
+ assert ! ( original. schema( ) . metadata( ) . contains_key( "rerun.id" ) ) ;
266
+ assert ! (
267
+ original
268
+ . schema( )
269
+ . metadata( )
270
+ . contains_key( "custom_batch_key" )
271
+ ) ;
272
+ let row_id = original. schema_ref ( ) . field ( 0 ) ;
273
+ assert ! (
274
+ !row_id. metadata( ) . contains_key( "ARROW:extension:metadata" ) ,
275
+ "We intentionally omitted this from the original"
276
+ ) ;
277
+ }
278
+
279
+ let sorbet_batch = sorbet_batch:: SorbetBatch :: try_from_record_batch (
280
+ & original,
281
+ crate :: BatchType :: Dataframe ,
282
+ )
283
+ . unwrap ( ) ;
284
+
285
+ let ret = ArrowRecordBatch :: from ( sorbet_batch) ;
286
+
287
+ assert ! (
288
+ !ret. schema( ) . metadata( ) . contains_key( "rerun.id" ) ,
289
+ "This should have been removed/renamed"
290
+ ) ;
291
+ assert ! (
292
+ ret. schema( ) . metadata( ) . contains_key( "rerun:id" ) ,
293
+ "This should have been added/renamed"
294
+ ) ;
295
+ assert ! (
296
+ ret. schema( ) . metadata( ) . contains_key( "custom_batch_key" ) ,
297
+ "This should remain"
298
+ ) ;
299
+ assert ! (
300
+ ret. schema( ) . metadata( ) . contains_key( "sorbet:version" ) ,
301
+ "This should have been added"
302
+ ) ;
303
+
304
+ // Check column-level (field) metadata of the first column, i.e. the row-id field:
305
+ let row_id = ret. schema_ref ( ) . field ( 0 ) ;
306
+ assert ! (
307
+ row_id. metadata( ) . contains_key( "custom_column_key" ) ,
308
+ "This should remain"
309
+ ) ;
310
+ assert ! (
311
+ row_id. metadata( ) . contains_key( "ARROW:extension:metadata" ) ,
312
+ "This should have been added"
313
+ ) ;
314
+ }
315
+ }
0 commit comments