Thanks to visit codestin.com
Credit goes to github.com

Skip to content

Commit 5982dc7

Browse files
Row lineage vectorized parquet reader support
1 parent 53e1db6 commit 5982dc7

File tree

6 files changed

+254
-68
lines changed

6 files changed

+254
-68
lines changed

arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorHolder.java

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -174,6 +174,13 @@ public RowIdVectorHolder(
174174
}
175175
}
176176

177+
public static class LastUpdatedSeqVectorHolder extends VectorHolder {
178+
public LastUpdatedSeqVectorHolder(
179+
FieldVector vector, Types.NestedField icebergField, NullabilityHolder nulls) {
180+
super(vector, icebergField, nulls);
181+
}
182+
}
183+
177184
public static class DeletedVectorHolder extends VectorHolder {
178185
private final int numRows;
179186

arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorizedArrowReader.java

Lines changed: 114 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -461,8 +461,43 @@ public static VectorizedArrowReader positionsWithSetArrowValidityVector() {
461461
return new PositionVectorReader(true);
462462
}
463463

464-
public static VectorizedArrowReader rowIds(long firstRowId, VectorizedArrowReader idReader) {
465-
return new RowIdVectorReader(firstRowId, idReader);
464+
public static VectorizedArrowReader rowIds(long baseRowId, VectorizedArrowReader idReader) {
465+
return new RowIdVectorReader(baseRowId, idReader);
466+
}
467+
468+
public static VectorizedArrowReader lastUpdated(
469+
Long baseRowId, Long fileLastUpdated, VectorizedArrowReader seqReader) {
470+
if (fileLastUpdated != null && baseRowId != null) {
471+
return new LastUpdatedSeqVectorReader(fileLastUpdated, seqReader);
472+
} else {
473+
return nulls();
474+
}
475+
}
476+
477+
public static VectorizedReader<?> replaceWithMetadataReader(
478+
int id,
479+
VectorizedReader<?> reader,
480+
Map<Integer, ?> idToConstant,
481+
boolean setArrowValidityVector) {
482+
if (id == MetadataColumns.ROW_ID.fieldId()) {
483+
Long baseRowId = (Long) idToConstant.get(id);
484+
return rowIds(baseRowId, (VectorizedArrowReader) reader);
485+
} else if (id == MetadataColumns.LAST_UPDATED_SEQUENCE_NUMBER.fieldId()) {
486+
Long baseRowId = (Long) idToConstant.get(MetadataColumns.ROW_ID.fieldId());
487+
Long fileSeqNumber = (Long) idToConstant.get(id);
488+
return VectorizedArrowReader.lastUpdated(
489+
baseRowId, fileSeqNumber, (VectorizedArrowReader) reader);
490+
} else if (id == MetadataColumns.ROW_POSITION.fieldId()) {
491+
if (setArrowValidityVector) {
492+
return positionsWithSetArrowValidityVector();
493+
} else {
494+
return VectorizedArrowReader.positions();
495+
}
496+
} else if (id == MetadataColumns.IS_DELETED.fieldId()) {
497+
return new DeletedVectorReader();
498+
}
499+
500+
return reader;
466501
}
467502

468503
private static final class NullVectorReader extends VectorizedArrowReader {
@@ -575,8 +610,8 @@ private static final class RowIdVectorReader extends VectorizedArrowReader {
575610
private static final Field ROW_ID_ARROW_FIELD = ArrowSchemaUtil.convert(MetadataColumns.ROW_ID);
576611

577612
private final long firstRowId;
578-
private final VectorizedArrowReader idReader;
579-
private final VectorizedArrowReader posReader;
613+
private final VectorizedReader<VectorHolder> idReader;
614+
private final VectorizedReader<VectorHolder> posReader;
580615
private NullabilityHolder nulls;
581616

582617
private RowIdVectorReader(long firstRowId, VectorizedArrowReader idReader) {
@@ -588,21 +623,23 @@ private RowIdVectorReader(long firstRowId, VectorizedArrowReader idReader) {
588623
@Override
589624
public VectorHolder read(VectorHolder reuse, int numValsToRead) {
590625
FieldVector positions = posReader.read(null, numValsToRead).vector();
591-
FieldVector ids = idReader.read(null, numValsToRead).vector();
626+
VectorHolder ids = idReader.read(null, numValsToRead);
592627
BigIntVector vec = newVector(numValsToRead);
593628
ArrowBuf dataBuffer = vec.getDataBuffer();
594629
for (int i = 0; i < numValsToRead; i += 1) {
595-
if (ids.isNull(i)) {
630+
long bufferOffset = (long) i * Long.BYTES;
631+
if (ids.nullabilityHolder().isNullAt(i) == 1) {
596632
long rowId = firstRowId + (Long) positions.getObject(i);
597-
dataBuffer.setLong((long) i * Long.BYTES, rowId);
633+
dataBuffer.setLong(bufferOffset, rowId);
598634
} else {
599-
dataBuffer.setLong((long) i * Long.BYTES, (Long) ids.getObject(i));
635+
long materializedRowId = ids.vector().getDataBuffer().getLong(bufferOffset);
636+
dataBuffer.setLong(bufferOffset, materializedRowId);
600637
}
601638
}
602639

603640
vec.setValueCount(numValsToRead);
604641

605-
return new VectorHolder.RowIdVectorHolder(vec, MetadataColumns.ROW_POSITION, nulls);
642+
return new VectorHolder.RowIdVectorHolder(vec, MetadataColumns.ROW_ID, nulls);
606643
}
607644

608645
@Override
@@ -641,6 +678,74 @@ private static NullabilityHolder newNullabilityHolder(int size) {
641678
}
642679
}
643680

681+
private static final class LastUpdatedSeqVectorReader extends VectorizedArrowReader {
682+
private static final Field LAST_UPDATED_SEQ =
683+
ArrowSchemaUtil.convert(MetadataColumns.LAST_UPDATED_SEQUENCE_NUMBER);
684+
685+
private final long lastUpdatedSeq;
686+
private final VectorizedReader<VectorHolder> seqReader;
687+
private NullabilityHolder nulls;
688+
689+
private LastUpdatedSeqVectorReader(
690+
long lastUpdatedSeq, VectorizedReader<VectorHolder> seqReader) {
691+
this.lastUpdatedSeq = lastUpdatedSeq;
692+
this.seqReader = seqReader == null ? nulls() : seqReader;
693+
}
694+
695+
@Override
696+
public VectorHolder read(VectorHolder reuse, int numValsToRead) {
697+
VectorHolder seqNumbers = seqReader.read(null, numValsToRead);
698+
BigIntVector vec = newVector(numValsToRead);
699+
ArrowBuf dataBuffer = vec.getDataBuffer();
700+
for (int i = 0; i < numValsToRead; i += 1) {
701+
long bufferOffset = (long) i * Long.BYTES;
702+
if (seqNumbers.nullabilityHolder().isNullAt(i) == 1) {
703+
dataBuffer.setLong(bufferOffset, lastUpdatedSeq);
704+
} else {
705+
long materializedSeqNumber = seqNumbers.vector().getDataBuffer().getLong(bufferOffset);
706+
dataBuffer.setLong(bufferOffset, materializedSeqNumber);
707+
}
708+
}
709+
710+
vec.setValueCount(numValsToRead);
711+
712+
return new VectorHolder.LastUpdatedSeqVectorHolder(
    vec, MetadataColumns.LAST_UPDATED_SEQUENCE_NUMBER, nulls);
713+
}
714+
715+
@Override
716+
public void setRowGroupInfo(
717+
PageReadStore source, Map<ColumnPath, ColumnChunkMetaData> metadata) {
718+
seqReader.setRowGroupInfo(source, metadata);
719+
}
720+
721+
@Override
722+
public void setBatchSize(int batchSize) {
723+
if (nulls == null || nulls.size() < batchSize) {
724+
this.nulls = newNullabilityHolder(batchSize);
725+
}
726+
727+
seqReader.setBatchSize(batchSize);
728+
}
729+
730+
@Override
731+
public void close() {
732+
// don't close vectors as they are not owned by readers
733+
}
734+
735+
private static BigIntVector newVector(int valueCount) {
736+
BigIntVector vector =
737+
(BigIntVector) LAST_UPDATED_SEQ.createVector(ArrowAllocation.rootAllocator());
738+
vector.allocateNew(valueCount);
739+
return vector;
740+
}
741+
742+
private static NullabilityHolder newNullabilityHolder(int size) {
743+
NullabilityHolder nullabilityHolder = new NullabilityHolder(size);
744+
nullabilityHolder.setNotNulls(0, size);
745+
return nullabilityHolder;
746+
}
747+
}
748+
644749
/**
645750
* A Dummy Vector Reader which doesn't actually read files, instead it returns a dummy
646751
* VectorHolder which indicates the constant value which should be used for this column.

arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorizedReaderBuilder.java

Lines changed: 16 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -24,11 +24,9 @@
2424
import java.util.function.Function;
2525
import java.util.stream.IntStream;
2626
import org.apache.arrow.memory.BufferAllocator;
27-
import org.apache.iceberg.MetadataColumns;
2827
import org.apache.iceberg.Schema;
2928
import org.apache.iceberg.arrow.ArrowAllocation;
3029
import org.apache.iceberg.arrow.vectorized.VectorizedArrowReader.ConstantVectorReader;
31-
import org.apache.iceberg.arrow.vectorized.VectorizedArrowReader.DeletedVectorReader;
3230
import org.apache.iceberg.parquet.TypeWithSchemaVisitor;
3331
import org.apache.iceberg.parquet.VectorizedReader;
3432
import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList;
@@ -102,32 +100,26 @@ public VectorizedReader<?> message(
102100

103101
for (Types.NestedField field : icebergFields) {
104102
int id = field.fieldId();
105-
VectorizedReader<?> reader = readersById.get(id);
106-
if (idToConstant.containsKey(id)) {
107-
reorderedFields.add(constantReader(field, idToConstant.get(id)));
108-
} else if (id == MetadataColumns.ROW_POSITION.fieldId()) {
109-
if (setArrowValidityVector) {
110-
reorderedFields.add(VectorizedArrowReader.positionsWithSetArrowValidityVector());
111-
} else {
112-
reorderedFields.add(VectorizedArrowReader.positions());
113-
}
114-
} else if (id == MetadataColumns.IS_DELETED.fieldId()) {
115-
reorderedFields.add(new DeletedVectorReader());
116-
} else if (reader != null) {
117-
reorderedFields.add(reader);
118-
} else if (field.initialDefault() != null) {
119-
reorderedFields.add(
120-
constantReader(field, convert.apply(field.type(), field.initialDefault())));
121-
} else if (field.isOptional()) {
122-
reorderedFields.add(VectorizedArrowReader.nulls());
123-
} else {
124-
throw new IllegalArgumentException(
125-
String.format("Missing required field: %s", field.name()));
126-
}
103+
VectorizedReader<?> reader =
104+
VectorizedArrowReader.replaceWithMetadataReader(
105+
id, readersById.get(id), idToConstant, setArrowValidityVector);
106+
reorderedFields.add(defaultReader(field, reader));
127107
}
128108
return vectorizedReader(reorderedFields);
129109
}
130110

111+
private VectorizedReader<?> defaultReader(Types.NestedField field, VectorizedReader<?> reader) {
112+
if (reader != null) {
113+
return reader;
114+
} else if (field.initialDefault() != null) {
115+
return constantReader(field, convert.apply(field.type(), field.initialDefault()));
116+
} else if (field.isOptional()) {
117+
return VectorizedArrowReader.nulls();
118+
}
119+
120+
throw new IllegalArgumentException(String.format("Missing required field: %s", field.name()));
121+
}
122+
131123
private <T> ConstantVectorReader<T> constantReader(Types.NestedField field, T constant) {
132124
return new ConstantVectorReader<>(field, constant);
133125
}

spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/data/GenericsHelpers.java

Lines changed: 8 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -73,28 +73,15 @@ public static void assertEqualsSafe(Types.StructType struct, Record expected, Ro
7373
}
7474

7575
public static void assertEqualsBatch(
76-
Types.StructType struct, Iterator<Record> expectedRecords, ColumnarBatch batch) {
77-
for (int rowId = 0; rowId < batch.numRows(); rowId++) {
78-
InternalRow row = batch.getRow(rowId);
76+
Types.StructType struct,
77+
Iterator<Record> expectedRecords,
78+
ColumnarBatch batch,
79+
Map<Integer, Object> idToConstant,
80+
Integer numRowsAlreadyRead) {
81+
for (int rowPos = 0; rowPos < batch.numRows(); rowPos++) {
82+
InternalRow row = batch.getRow(rowPos);
7983
Record expectedRecord = expectedRecords.next();
80-
Types.StructType expectedRecordType = expectedRecord.struct();
81-
List<Types.NestedField> fields = struct.fields();
82-
83-
for (int readPos = 0; readPos < fields.size(); readPos += 1) {
84-
Types.NestedField field = fields.get(readPos);
85-
Types.NestedField expectedField = expectedRecordType.field(field.fieldId());
86-
Object expectedValue;
87-
Object actualValue = row.isNullAt(readPos) ? null : row.get(readPos, convert(field.type()));
88-
if (expectedField != null) {
89-
expectedValue = expectedRecord.getField(expectedField.name());
90-
assertEqualsUnsafe(field.type(), expectedValue, actualValue);
91-
} else {
92-
assertEqualsUnsafe(
93-
field.type(),
94-
GenericDataUtil.internalToGeneric(field.type(), field.initialDefault()),
95-
actualValue);
96-
}
97-
}
84+
assertEqualsUnsafe(struct, expectedRecord, row, idToConstant, numRowsAlreadyRead + rowPos);
9885
}
9986
}
10087

spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/data/TestHelpers.java

Lines changed: 24 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@
5050
import org.apache.iceberg.FileContent;
5151
import org.apache.iceberg.FileScanTask;
5252
import org.apache.iceberg.ManifestFile;
53+
import org.apache.iceberg.MetadataColumns;
5354
import org.apache.iceberg.Schema;
5455
import org.apache.iceberg.Snapshot;
5556
import org.apache.iceberg.Table;
@@ -115,22 +116,38 @@ public static void assertEqualsSafe(Types.StructType struct, Record rec, Row row
115116

116117
public static void assertEqualsBatch(
117118
Types.StructType struct, Iterator<Record> expected, ColumnarBatch batch) {
118-
for (int rowId = 0; rowId < batch.numRows(); rowId++) {
119-
InternalRow row = batch.getRow(rowId);
120-
Record rec = expected.next();
119+
assertEqualsBatch(struct, expected, batch, null, null);
120+
}
121+
122+
public static void assertEqualsBatch(
123+
Types.StructType struct,
124+
Iterator<Record> expected,
125+
ColumnarBatch batch,
126+
Integer numRowsRead,
127+
Map<Integer, Object> idToConstant) {
128+
for (int rowPos = 0; rowPos < batch.numRows(); rowPos++) {
129+
InternalRow row = batch.getRow(rowPos);
130+
Record expectedRecord = expected.next();
121131

122132
List<Types.NestedField> fields = struct.fields();
123133
for (int readPos = 0; readPos < fields.size(); readPos += 1) {
124134
Types.NestedField field = fields.get(readPos);
125-
Field writeField = rec.getSchema().getField(field.name());
135+
Field expectedField = expectedRecord.getSchema().getField(field.name());
126136

127137
Type fieldType = field.type();
128138
Object actualValue = row.isNullAt(readPos) ? null : row.get(readPos, convert(fieldType));
129139

130140
Object expectedValue;
131-
if (writeField != null) {
132-
int writePos = writeField.pos();
133-
expectedValue = rec.get(writePos);
141+
if (expectedField != null) {
142+
if (field.fieldId() == MetadataColumns.ROW_ID.fieldId() && idToConstant != null) {
143+
expectedValue = expectedRecord.get(expectedField.pos());
144+
if (expectedValue == null) {
145+
expectedValue = (Long) idToConstant.get(field.fieldId()) + numRowsRead + rowPos;
146+
}
147+
} else {
148+
int writePos = expectedField.pos();
149+
expectedValue = expectedRecord.get(writePos);
150+
}
134151
} else {
135152
expectedValue = field.initialDefault();
136153
}

0 commit comments

Comments
 (0)