Commit 908c14f

Spark, Arrow: Support for Row lineage when doing Vectorized Parquet reads
1 parent e600345 commit 908c14f
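
This change teaches the Arrow-based vectorized Parquet path to populate the two row-lineage metadata columns, _row_id and _last_updated_sequence_number, instead of forcing row-lineage scans onto the non-vectorized reader. A minimal sketch of the inheritance rule the new readers implement, assuming Iceberg v3 row-lineage semantics (helper names here are illustrative, not from this commit):

    // _row_id: a value materialized in the data file wins; otherwise the ID is
    // inherited as the file's first_row_id plus the row's position in the file.
    static long inheritRowId(Long materializedRowId, long firstRowId, long position) {
      return materializedRowId != null ? materializedRowId : firstRowId + position;
    }

    // _last_updated_sequence_number: falls back to the file's data sequence
    // number when no value was materialized for the row.
    static long inheritSeqNumber(Long materializedSeq, long fileSeqNumber) {
      return materializedSeq != null ? materializedSeq : fileSeqNumber;
    }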

File tree

7 files changed: +304 -49 lines changed

arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorHolder.java

Lines changed: 6 additions & 1 deletion

@@ -73,7 +73,7 @@ private VectorHolder(Types.NestedField field) {
     icebergField = field;
   }
 
-  private VectorHolder(FieldVector vec, Types.NestedField field, NullabilityHolder nulls) {
+  VectorHolder(FieldVector vec, Types.NestedField field, NullabilityHolder nulls) {
     columnDescriptor = null;
     vector = vec;
     isDictionaryEncoded = false;

@@ -167,6 +167,11 @@ public PositionVectorHolder(
     }
   }
 
+  public static VectorHolder vectorHolder(
+      FieldVector vector, Types.NestedField icebergField, NullabilityHolder nulls) {
+    return new VectorHolder(vector, icebergField, nulls);
+  }
+
   public static class DeletedVectorHolder extends VectorHolder {
     private final int numRows;
 
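
The VectorHolder constructor is relaxed to package-private and paired with a public vectorHolder() factory so the new lineage readers can wrap a freshly computed Arrow vector. A hedged usage sketch (rowIdVector and nullability are assumed to already exist in the caller):

    // Wrap a computed BigIntVector in a VectorHolder tagged as the _row_id column.
    VectorHolder holder =
        VectorHolder.vectorHolder(rowIdVector, MetadataColumns.ROW_ID, nullability);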

arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorizedArrowReader.java

Lines changed: 174 additions & 6 deletions

@@ -18,6 +18,8 @@
  */
 package org.apache.iceberg.arrow.vectorized;
 
+import static org.apache.iceberg.arrow.vectorized.ArrowVectorAccessors.getVectorAccessor;
+
 import java.util.Map;
 import org.apache.arrow.memory.ArrowBuf;
 import org.apache.arrow.memory.BufferAllocator;

@@ -461,6 +463,49 @@ public static VectorizedArrowReader positionsWithSetArrowValidityVector() {
     return new PositionVectorReader(true);
   }
 
+  public static VectorizedArrowReader rowIds(Long baseRowId, VectorizedArrowReader idReader) {
+    if (baseRowId != null) {
+      return new RowIdVectorReader(baseRowId, idReader);
+    } else {
+      return nulls();
+    }
+  }
+
+  public static VectorizedArrowReader lastUpdated(
+      Long baseRowId, Long fileLastUpdated, VectorizedArrowReader seqReader) {
+    if (fileLastUpdated != null && baseRowId != null) {
+      return new LastUpdatedSeqVectorReader(fileLastUpdated, seqReader);
+    } else {
+      return nulls();
+    }
+  }
+
+  public static VectorizedReader<?> replaceWithMetadataReader(
+      Types.NestedField icebergField,
+      VectorizedReader<?> reader,
+      Map<Integer, ?> idToConstant,
+      boolean setArrowValidityVector) {
+    int id = icebergField.fieldId();
+    if (id == MetadataColumns.ROW_ID.fieldId()) {
+      Long baseRowId = (Long) idToConstant.get(id);
+      return rowIds(baseRowId, (VectorizedArrowReader) reader);
+    } else if (id == MetadataColumns.LAST_UPDATED_SEQUENCE_NUMBER.fieldId()) {
+      // the base row ID comes from the ROW_ID constant; the file's sequence
+      // number is keyed by this field's own ID
+      Long baseRowId = (Long) idToConstant.get(MetadataColumns.ROW_ID.fieldId());
+      Long fileSeqNumber = (Long) idToConstant.get(id);
+      return VectorizedArrowReader.lastUpdated(
+          baseRowId, fileSeqNumber, (VectorizedArrowReader) reader);
+    } else if (idToConstant.containsKey(id)) {
+      // containsKey is used because the constant may be null
+      return new ConstantVectorReader<>(icebergField, idToConstant.get(id));
+    } else if (id == MetadataColumns.ROW_POSITION.fieldId()) {
+      if (setArrowValidityVector) {
+        return positionsWithSetArrowValidityVector();
+      } else {
+        return VectorizedArrowReader.positions();
+      }
+    } else if (id == MetadataColumns.IS_DELETED.fieldId()) {
+      return new DeletedVectorReader();
+    }
+
+    return reader;
+  }
+
   private static final class NullVectorReader extends VectorizedArrowReader {
     private static final NullVectorReader INSTANCE = new NullVectorReader();

@@ -530,12 +575,6 @@ private static BigIntVector newVector(int valueCount) {
       return vector;
     }
 
-    private static NullabilityHolder newNullabilityHolder(int size) {
-      NullabilityHolder nullabilityHolder = new NullabilityHolder(size);
-      nullabilityHolder.setNotNulls(0, size);
-      return nullabilityHolder;
-    }
-
     @Override
     public void setRowGroupInfo(
         PageReadStore source, Map<ColumnPath, ColumnChunkMetaData> metadata) {

@@ -567,6 +606,135 @@ public void close() {
     }
   }
 
+  private static final class RowIdVectorReader extends VectorizedArrowReader {
+    private static final Field ROW_ID_ARROW_FIELD =
+        ArrowSchemaUtil.convert(MetadataColumns.ROW_ID);
+
+    private final long firstRowId;
+    private final VectorizedReader<VectorHolder> idReader;
+    private final VectorizedReader<VectorHolder> posReader;
+    private NullabilityHolder nulls;
+
+    private RowIdVectorReader(long firstRowId, VectorizedArrowReader idReader) {
+      this.firstRowId = firstRowId;
+      this.idReader = idReader != null ? idReader : nulls();
+      this.posReader = new PositionVectorReader(true);
+    }
+
+    @Override
+    public VectorHolder read(VectorHolder reuse, int numValsToRead) {
+      FieldVector positions = posReader.read(null, numValsToRead).vector();
+      VectorHolder ids = idReader.read(null, numValsToRead);
+      BigIntVector vec = allocateBigIntVector(ROW_ID_ARROW_FIELD, numValsToRead);
+      ArrowBuf dataBuffer = vec.getDataBuffer();
+      boolean isNullReader = ids.vector() == null;
+      ArrowVectorAccessor<?, String, ?, ?> idsAccessor =
+          isNullReader ? null : getVectorAccessor(ids);
+      for (int i = 0; i < numValsToRead; i += 1) {
+        long bufferOffset = (long) i * Long.BYTES;
+        if (isNullReader || ids.nullabilityHolder().isNullAt(i) == 1) {
+          long rowId = firstRowId + (Long) positions.getObject(i);
+          dataBuffer.setLong(bufferOffset, rowId);
+        } else {
+          long materializedRowId = idsAccessor.getLong(i);
+          dataBuffer.setLong(bufferOffset, materializedRowId);
+        }
+      }
+
+      vec.setValueCount(numValsToRead);
+      return VectorHolder.vectorHolder(vec, MetadataColumns.ROW_ID, nulls);
+    }
+
+    @Override
+    public void setRowGroupInfo(
+        PageReadStore source, Map<ColumnPath, ColumnChunkMetaData> metadata) {
+      idReader.setRowGroupInfo(source, metadata);
+      posReader.setRowGroupInfo(source, metadata);
+    }
+
+    @Override
+    public void setBatchSize(int batchSize) {
+      if (nulls == null || nulls.size() < batchSize) {
+        this.nulls = newNullabilityHolder(batchSize);
+      }
+
+      idReader.setBatchSize(batchSize);
+      posReader.setBatchSize(batchSize);
+    }
+
+    @Override
+    public void close() {
+      // don't close vectors as they are not owned by readers
+    }
+  }
+
+  private static final class LastUpdatedSeqVectorReader extends VectorizedArrowReader {
+    private static final Field LAST_UPDATED_SEQ =
+        ArrowSchemaUtil.convert(MetadataColumns.LAST_UPDATED_SEQUENCE_NUMBER);
+
+    private final long lastUpdatedSeq;
+    private final VectorizedReader<VectorHolder> seqReader;
+    private NullabilityHolder nulls;
+
+    private LastUpdatedSeqVectorReader(
+        long lastUpdatedSeq, VectorizedReader<VectorHolder> seqReader) {
+      this.lastUpdatedSeq = lastUpdatedSeq;
+      this.seqReader = seqReader == null ? nulls() : seqReader;
+    }
+
+    @Override
+    public VectorHolder read(VectorHolder reuse, int numValsToRead) {
+      BigIntVector vec = allocateBigIntVector(LAST_UPDATED_SEQ, numValsToRead);
+      ArrowBuf dataBuffer = vec.getDataBuffer();
+      VectorHolder seqNumbers = seqReader.read(null, numValsToRead);
+      ArrowVectorAccessor<?, String, ?, ?> accessor =
+          seqNumbers.vector() == null ? null : getVectorAccessor(seqNumbers);
+      for (int i = 0; i < numValsToRead; i += 1) {
+        long bufferOffset = (long) i * Long.BYTES;
+        if (seqNumbers.vector() == null || seqNumbers.nullabilityHolder().isNullAt(i) == 1) {
+          dataBuffer.setLong(bufferOffset, lastUpdatedSeq);
+        } else {
+          long materializedSeqNumber = accessor.getLong(i);
+          dataBuffer.setLong(bufferOffset, materializedSeqNumber);
+        }
+      }
+
+      vec.setValueCount(numValsToRead);
+      return VectorHolder.vectorHolder(vec, MetadataColumns.LAST_UPDATED_SEQUENCE_NUMBER, nulls);
+    }
+
+    @Override
+    public void setRowGroupInfo(
+        PageReadStore source, Map<ColumnPath, ColumnChunkMetaData> metadata) {
+      seqReader.setRowGroupInfo(source, metadata);
+    }
+
+    @Override
+    public void setBatchSize(int batchSize) {
+      if (nulls == null || nulls.size() < batchSize) {
+        this.nulls = newNullabilityHolder(batchSize);
+      }
+
+      seqReader.setBatchSize(batchSize);
+    }
+
+    @Override
+    public void close() {
+      // don't close vectors as they are not owned by readers
+    }
+  }
+
+  private static BigIntVector allocateBigIntVector(Field field, int valueCount) {
+    BigIntVector vector = (BigIntVector) field.createVector(ArrowAllocation.rootAllocator());
+    vector.allocateNew(valueCount);
+    return vector;
+  }
+
+  private static NullabilityHolder newNullabilityHolder(int size) {
+    NullabilityHolder nullabilityHolder = new NullabilityHolder(size);
+    nullabilityHolder.setNotNulls(0, size);
+    return nullabilityHolder;
+  }
+
   /**
    * A Dummy Vector Reader which doesn't actually read files, instead it returns a dummy
    * VectorHolder which indicates the constant value which should be used for this column.
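
replaceWithMetadataReader() is the single dispatch point that swaps a file column reader for a metadata-aware one. A hedged sketch of how a builder might invoke it for _row_id, assuming a v3 scan task whose data file was assigned first_row_id = 1000 and whose writer materialized no _row_id column (the setup values are illustrative):

    // The scan exposes the file's first_row_id as a constant keyed by ROW_ID's
    // field ID. ImmutableMap is Guava's; any Map<Integer, Object> works.
    Map<Integer, Object> idToConstant =
        ImmutableMap.of(MetadataColumns.ROW_ID.fieldId(), 1000L);

    // No file reader exists for the column, so null is passed; the result is a
    // RowIdVectorReader whose read(null, n) fills a BigIntVector with
    // 1000 + row position for all n rows.
    VectorizedReader<?> rowIdReader =
        VectorizedArrowReader.replaceWithMetadataReader(
            MetadataColumns.ROW_ID, null, idToConstant, false /* setArrowValidityVector */);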

arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorizedReaderBuilder.java

Lines changed: 16 additions & 25 deletions

@@ -24,11 +24,9 @@
 import java.util.function.Function;
 import java.util.stream.IntStream;
 import org.apache.arrow.memory.BufferAllocator;
-import org.apache.iceberg.MetadataColumns;
 import org.apache.iceberg.Schema;
 import org.apache.iceberg.arrow.ArrowAllocation;
 import org.apache.iceberg.arrow.vectorized.VectorizedArrowReader.ConstantVectorReader;
-import org.apache.iceberg.arrow.vectorized.VectorizedArrowReader.DeletedVectorReader;
 import org.apache.iceberg.parquet.TypeWithSchemaVisitor;
 import org.apache.iceberg.parquet.VectorizedReader;
 import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList;

@@ -101,33 +99,26 @@ public VectorizedReader<?> message(
         Lists.newArrayListWithExpectedSize(icebergFields.size());
 
     for (Types.NestedField field : icebergFields) {
-      int id = field.fieldId();
-      VectorizedReader<?> reader = readersById.get(id);
-      if (idToConstant.containsKey(id)) {
-        reorderedFields.add(constantReader(field, idToConstant.get(id)));
-      } else if (id == MetadataColumns.ROW_POSITION.fieldId()) {
-        if (setArrowValidityVector) {
-          reorderedFields.add(VectorizedArrowReader.positionsWithSetArrowValidityVector());
-        } else {
-          reorderedFields.add(VectorizedArrowReader.positions());
-        }
-      } else if (id == MetadataColumns.IS_DELETED.fieldId()) {
-        reorderedFields.add(new DeletedVectorReader());
-      } else if (reader != null) {
-        reorderedFields.add(reader);
-      } else if (field.initialDefault() != null) {
-        reorderedFields.add(
-            constantReader(field, convert.apply(field.type(), field.initialDefault())));
-      } else if (field.isOptional()) {
-        reorderedFields.add(VectorizedArrowReader.nulls());
-      } else {
-        throw new IllegalArgumentException(
-            String.format("Missing required field: %s", field.name()));
-      }
+      VectorizedReader<?> reader =
+          VectorizedArrowReader.replaceWithMetadataReader(
+              field, readersById.get(field.fieldId()), idToConstant, setArrowValidityVector);
+      reorderedFields.add(defaultReader(field, reader));
     }
     return vectorizedReader(reorderedFields);
   }
 
+  private VectorizedReader<?> defaultReader(Types.NestedField field, VectorizedReader<?> reader) {
+    if (reader != null) {
+      return reader;
+    } else if (field.initialDefault() != null) {
+      return constantReader(field, convert.apply(field.type(), field.initialDefault()));
+    } else if (field.isOptional()) {
+      return VectorizedArrowReader.nulls();
+    }
+
+    throw new IllegalArgumentException(String.format("Missing required field: %s", field.name()));
+  }
+
   private <T> ConstantVectorReader<T> constantReader(Types.NestedField field, T constant) {
     return new ConstantVectorReader<>(field, constant);
   }
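
The builder's field loop now resolves every projected column in two steps: metadata substitution first, then a shared default chain. Summarized as comments (an editorial reading of the diff, not code from the commit):

    // Resolution order for each projected field, preserved from the old inline
    // logic, with the two lineage columns added:
    //   1. replaceWithMetadataReader: scan constants, _row_id,
    //      _last_updated_sequence_number, _pos (row position), _deleted
    //   2. the reader built from the Parquet file, when the column exists in the file
    //   3. constantReader(field, initialDefault) for columns added by schema evolution
    //   4. VectorizedArrowReader.nulls() for optional columns missing from the file
    //   5. IllegalArgumentException for a required column that cannot be resolved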

spark/v3.5/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestRowLevelOperationsWithLineage.java

Lines changed: 0 additions & 1 deletion

@@ -91,7 +91,6 @@ public void beforeEach() {
     assumeThat(formatVersion).isGreaterThanOrEqualTo(3);
     // ToDo: Remove these as row lineage inheritance gets implemented in the other readers
     assumeThat(fileFormat).isEqualTo(FileFormat.PARQUET);
-    assumeThat(vectorized).isFalse();
   }
 
   @AfterEach

spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/data/GenericsHelpers.java

Lines changed: 8 additions & 4 deletions

@@ -73,11 +73,15 @@ public static void assertEqualsSafe(Types.StructType struct, Record expected, Ro
   }
 
   public static void assertEqualsBatch(
-      Types.StructType struct, Iterator<Record> expectedRecords, ColumnarBatch batch) {
-    for (int rowId = 0; rowId < batch.numRows(); rowId++) {
-      InternalRow row = batch.getRow(rowId);
+      Types.StructType struct,
+      Iterator<Record> expectedRecords,
+      ColumnarBatch batch,
+      Map<Integer, Object> idToConstant,
+      Integer numRowsAlreadyRead) {
+    for (int rowPos = 0; rowPos < batch.numRows(); rowPos++) {
+      InternalRow row = batch.getRow(rowPos);
       Record expectedRecord = expectedRecords.next();
-      assertEqualsUnsafe(struct, expectedRecord, row);
+      assertEqualsUnsafe(struct, expectedRecord, row, idToConstant, numRowsAlreadyRead + rowPos);
     }
   }

spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/data/TestHelpers.java

Lines changed: 24 additions & 7 deletions

@@ -50,6 +50,7 @@
 import org.apache.iceberg.FileContent;
 import org.apache.iceberg.FileScanTask;
 import org.apache.iceberg.ManifestFile;
+import org.apache.iceberg.MetadataColumns;
 import org.apache.iceberg.Schema;
 import org.apache.iceberg.Snapshot;
 import org.apache.iceberg.Table;

@@ -115,22 +116,38 @@ public static void assertEqualsSafe(Types.StructType struct, Record rec, Row row
 
   public static void assertEqualsBatch(
       Types.StructType struct, Iterator<Record> expected, ColumnarBatch batch) {
-    for (int rowId = 0; rowId < batch.numRows(); rowId++) {
-      InternalRow row = batch.getRow(rowId);
-      Record rec = expected.next();
+    assertEqualsBatch(struct, expected, batch, null, null);
+  }
+
+  public static void assertEqualsBatch(
+      Types.StructType struct,
+      Iterator<Record> expected,
+      ColumnarBatch batch,
+      Integer numRowsRead,
+      Map<Integer, Object> idToConstant) {
+    for (int rowPos = 0; rowPos < batch.numRows(); rowPos++) {
+      InternalRow row = batch.getRow(rowPos);
+      Record expectedRecord = expected.next();
 
       List<Types.NestedField> fields = struct.fields();
       for (int readPos = 0; readPos < fields.size(); readPos += 1) {
         Types.NestedField field = fields.get(readPos);
-        Field writeField = rec.getSchema().getField(field.name());
+        Field expectedField = expectedRecord.getSchema().getField(field.name());
 
         Type fieldType = field.type();
         Object actualValue = row.isNullAt(readPos) ? null : row.get(readPos, convert(fieldType));
 
         Object expectedValue;
-        if (writeField != null) {
-          int writePos = writeField.pos();
-          expectedValue = rec.get(writePos);
+        if (expectedField != null) {
+          if (field.fieldId() == MetadataColumns.ROW_ID.fieldId() && idToConstant != null) {
+            expectedValue = expectedRecord.get(expectedField.pos());
+            if (expectedValue == null) {
+              expectedValue = (Long) idToConstant.get(field.fieldId()) + numRowsRead + rowPos;
+            }
+          } else {
+            int writePos = expectedField.pos();
+            expectedValue = expectedRecord.get(writePos);
+          }
        } else {
          expectedValue = field.initialDefault();
        }
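
A hedged sketch of how a vectorized read test might drive the new overload, accumulating the rows already read so inherited row IDs line up across batches (batches, expectedRecords, schema, and idToConstant are assumed test fixtures):

    int numRowsRead = 0;
    Iterator<Record> expected = expectedRecords.iterator();
    for (ColumnarBatch batch : batches) {
      // idToConstant carries the file's first_row_id under ROW_ID's field ID, so
      // the helper expects first_row_id + numRowsRead + rowPos for unmaterialized IDs.
      TestHelpers.assertEqualsBatch(schema.asStruct(), expected, batch, numRowsRead, idToConstant);
      numRowsRead += batch.numRows();
    }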
