Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit 73b179c

Browse files
Spark 3.5, Arrow: Support for Row lineage when using the Parquet Vectorized reader (#12928)
1 parent d3ebea5 commit 73b179c

File tree

7 files changed

+388
-68
lines changed

7 files changed

+388
-68
lines changed

arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorHolder.java

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,7 @@ private VectorHolder(Types.NestedField field) {
7373
icebergField = field;
7474
}
7575

76-
private VectorHolder(FieldVector vec, Types.NestedField field, NullabilityHolder nulls) {
76+
VectorHolder(FieldVector vec, Types.NestedField field, NullabilityHolder nulls) {
7777
columnDescriptor = null;
7878
vector = vec;
7979
isDictionaryEncoded = false;
@@ -167,6 +167,11 @@ public PositionVectorHolder(
167167
}
168168
}
169169

170+
public static VectorHolder vectorHolder(
171+
FieldVector vector, Types.NestedField icebergField, NullabilityHolder nulls) {
172+
return new VectorHolder(vector, icebergField, nulls);
173+
}
174+
170175
public static class DeletedVectorHolder extends VectorHolder {
171176
private final int numRows;
172177

arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorizedArrowReader.java

Lines changed: 205 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -461,6 +461,53 @@ public static VectorizedArrowReader positionsWithSetArrowValidityVector() {
461461
return new PositionVectorReader(true);
462462
}
463463

464+
public static VectorizedArrowReader rowIds(Long baseRowId, VectorizedArrowReader idReader) {
465+
if (baseRowId != null) {
466+
return new RowIdVectorReader(baseRowId, idReader);
467+
} else {
468+
return nulls();
469+
}
470+
}
471+
472+
public static VectorizedArrowReader lastUpdated(
473+
Long baseRowId, Long fileLastUpdated, VectorizedArrowReader seqReader) {
474+
if (fileLastUpdated != null && baseRowId != null) {
475+
return new LastUpdatedSeqVectorReader(fileLastUpdated, seqReader);
476+
} else {
477+
return nulls();
478+
}
479+
}
480+
481+
public static VectorizedReader<?> replaceWithMetadataReader(
482+
Types.NestedField icebergField,
483+
VectorizedReader<?> reader,
484+
Map<Integer, ?> idToConstant,
485+
boolean setArrowValidityVector) {
486+
int id = icebergField.fieldId();
487+
if (id == MetadataColumns.ROW_ID.fieldId()) {
488+
Long baseRowId = (Long) idToConstant.get(id);
489+
return rowIds(baseRowId, (VectorizedArrowReader) reader);
490+
} else if (id == MetadataColumns.LAST_UPDATED_SEQUENCE_NUMBER.fieldId()) {
491+
Long baseRowId = (Long) idToConstant.get(MetadataColumns.ROW_ID.fieldId());
492+
Long fileSeqNumber = (Long) idToConstant.get(id);
493+
return VectorizedArrowReader.lastUpdated(
494+
baseRowId, fileSeqNumber, (VectorizedArrowReader) reader);
495+
} else if (idToConstant.containsKey(id)) {
496+
// containsKey is used because the constant may be null
497+
return new ConstantVectorReader<>(icebergField, idToConstant.get(id));
498+
} else if (id == MetadataColumns.ROW_POSITION.fieldId()) {
499+
if (setArrowValidityVector) {
500+
return positionsWithSetArrowValidityVector();
501+
} else {
502+
return VectorizedArrowReader.positions();
503+
}
504+
} else if (id == MetadataColumns.IS_DELETED.fieldId()) {
505+
return new DeletedVectorReader();
506+
}
507+
508+
return reader;
509+
}
510+
464511
private static final class NullVectorReader extends VectorizedArrowReader {
465512
private static final NullVectorReader INSTANCE = new NullVectorReader();
466513

@@ -530,12 +577,6 @@ private static BigIntVector newVector(int valueCount) {
530577
return vector;
531578
}
532579

533-
private static NullabilityHolder newNullabilityHolder(int size) {
534-
NullabilityHolder nullabilityHolder = new NullabilityHolder(size);
535-
nullabilityHolder.setNotNulls(0, size);
536-
return nullabilityHolder;
537-
}
538-
539580
@Override
540581
public void setRowGroupInfo(
541582
PageReadStore source, Map<ColumnPath, ColumnChunkMetaData> metadata) {
@@ -567,6 +608,164 @@ public void close() {
567608
}
568609
}
569610

611+
private static final class RowIdVectorReader extends VectorizedArrowReader {
612+
private static final Field ROW_ID_ARROW_FIELD = ArrowSchemaUtil.convert(MetadataColumns.ROW_ID);
613+
614+
private final long firstRowId;
615+
private final VectorizedReader<VectorHolder> idReader;
616+
private final VectorizedReader<VectorHolder> posReader;
617+
private NullabilityHolder nulls;
618+
619+
private RowIdVectorReader(long firstRowId, VectorizedArrowReader idReader) {
620+
this.firstRowId = firstRowId;
621+
this.idReader = idReader != null ? idReader : nulls();
622+
this.posReader = new PositionVectorReader(true);
623+
}
624+
625+
@Override
626+
public VectorHolder read(VectorHolder reuse, int numValsToRead) {
627+
FieldVector positions = null;
628+
FieldVector ids = null;
629+
630+
try {
631+
positions = posReader.read(null, numValsToRead).vector();
632+
VectorHolder idsHolder = idReader.read(null, numValsToRead);
633+
ids = idsHolder.vector();
634+
ArrowVectorAccessor<?, String, ?, ?> idsAccessor =
635+
ids == null ? null : ArrowVectorAccessors.getVectorAccessor(idsHolder);
636+
637+
BigIntVector rowIds = allocateBigIntVector(ROW_ID_ARROW_FIELD, numValsToRead);
638+
ArrowBuf dataBuffer = rowIds.getDataBuffer();
639+
for (int i = 0; i < numValsToRead; i += 1) {
640+
long bufferOffset = (long) i * Long.BYTES;
641+
if (idsAccessor == null || isNull(idsHolder, i)) {
642+
long rowId = firstRowId + (Long) positions.getObject(i);
643+
dataBuffer.setLong(bufferOffset, rowId);
644+
} else {
645+
long materializedRowId = idsAccessor.getLong(i);
646+
dataBuffer.setLong(bufferOffset, materializedRowId);
647+
}
648+
}
649+
650+
rowIds.setValueCount(numValsToRead);
651+
return VectorHolder.vectorHolder(rowIds, MetadataColumns.ROW_ID, nulls);
652+
} finally {
653+
if (positions != null) {
654+
positions.close();
655+
}
656+
657+
if (ids != null) {
658+
ids.close();
659+
}
660+
}
661+
}
662+
663+
@Override
664+
public void setRowGroupInfo(
665+
PageReadStore source, Map<ColumnPath, ColumnChunkMetaData> metadata) {
666+
idReader.setRowGroupInfo(source, metadata);
667+
posReader.setRowGroupInfo(source, metadata);
668+
}
669+
670+
@Override
671+
public void setBatchSize(int batchSize) {
672+
if (nulls == null || nulls.size() < batchSize) {
673+
this.nulls = newNullabilityHolder(batchSize);
674+
}
675+
676+
idReader.setBatchSize(batchSize);
677+
posReader.setBatchSize(batchSize);
678+
}
679+
680+
@Override
681+
public void close() {
682+
// don't close result vectors as they are not owned by readers
683+
}
684+
}
685+
686+
private static final class LastUpdatedSeqVectorReader extends VectorizedArrowReader {
687+
private static final Field LAST_UPDATED_SEQ =
688+
ArrowSchemaUtil.convert(MetadataColumns.LAST_UPDATED_SEQUENCE_NUMBER);
689+
690+
private final long lastUpdatedSeq;
691+
private final VectorizedReader<VectorHolder> seqReader;
692+
private NullabilityHolder nulls;
693+
694+
private LastUpdatedSeqVectorReader(
695+
long lastUpdatedSeq, VectorizedReader<VectorHolder> seqReader) {
696+
this.lastUpdatedSeq = lastUpdatedSeq;
697+
this.seqReader = seqReader == null ? nulls() : seqReader;
698+
}
699+
700+
@Override
701+
public VectorHolder read(VectorHolder reuse, int numValsToRead) {
702+
FieldVector seqNumbers = null;
703+
try {
704+
VectorHolder seqNumbersHolder = seqReader.read(null, numValsToRead);
705+
seqNumbers = seqNumbersHolder.vector();
706+
ArrowVectorAccessor<?, String, ?, ?> seqAccessor =
707+
seqNumbers == null ? null : ArrowVectorAccessors.getVectorAccessor(seqNumbersHolder);
708+
709+
BigIntVector lastUpdatedSequenceNumbers =
710+
allocateBigIntVector(LAST_UPDATED_SEQ, numValsToRead);
711+
ArrowBuf dataBuffer = lastUpdatedSequenceNumbers.getDataBuffer();
712+
for (int i = 0; i < numValsToRead; i += 1) {
713+
long bufferOffset = (long) i * Long.BYTES;
714+
if (seqAccessor == null || isNull(seqNumbersHolder, i)) {
715+
dataBuffer.setLong(bufferOffset, lastUpdatedSeq);
716+
} else {
717+
long materializedSeqNumber = seqAccessor.getLong(i);
718+
dataBuffer.setLong(bufferOffset, materializedSeqNumber);
719+
}
720+
}
721+
722+
lastUpdatedSequenceNumbers.setValueCount(numValsToRead);
723+
return VectorHolder.vectorHolder(
724+
lastUpdatedSequenceNumbers, MetadataColumns.LAST_UPDATED_SEQUENCE_NUMBER, nulls);
725+
} finally {
726+
if (seqNumbers != null) {
727+
seqNumbers.close();
728+
}
729+
}
730+
}
731+
732+
@Override
733+
public void setRowGroupInfo(
734+
PageReadStore source, Map<ColumnPath, ColumnChunkMetaData> metadata) {
735+
seqReader.setRowGroupInfo(source, metadata);
736+
}
737+
738+
@Override
739+
public void setBatchSize(int batchSize) {
740+
if (nulls == null || nulls.size() < batchSize) {
741+
this.nulls = newNullabilityHolder(batchSize);
742+
}
743+
744+
seqReader.setBatchSize(batchSize);
745+
}
746+
747+
@Override
748+
public void close() {
749+
// don't close result vectors as they are not owned by readers
750+
}
751+
}
752+
753+
private static boolean isNull(VectorHolder holder, int index) {
754+
return holder.nullabilityHolder().isNullAt(index) == 1;
755+
}
756+
757+
private static BigIntVector allocateBigIntVector(Field field, int valueCount) {
758+
BigIntVector vector = (BigIntVector) field.createVector(ArrowAllocation.rootAllocator());
759+
vector.allocateNew(valueCount);
760+
return vector;
761+
}
762+
763+
private static NullabilityHolder newNullabilityHolder(int size) {
764+
NullabilityHolder nullabilityHolder = new NullabilityHolder(size);
765+
nullabilityHolder.setNotNulls(0, size);
766+
return nullabilityHolder;
767+
}
768+
570769
/**
571770
* A Dummy Vector Reader which doesn't actually read files, instead it returns a dummy
572771
* VectorHolder which indicates the constant value which should be used for this column.

arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorizedReaderBuilder.java

Lines changed: 16 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -24,11 +24,9 @@
2424
import java.util.function.Function;
2525
import java.util.stream.IntStream;
2626
import org.apache.arrow.memory.BufferAllocator;
27-
import org.apache.iceberg.MetadataColumns;
2827
import org.apache.iceberg.Schema;
2928
import org.apache.iceberg.arrow.ArrowAllocation;
3029
import org.apache.iceberg.arrow.vectorized.VectorizedArrowReader.ConstantVectorReader;
31-
import org.apache.iceberg.arrow.vectorized.VectorizedArrowReader.DeletedVectorReader;
3230
import org.apache.iceberg.parquet.TypeWithSchemaVisitor;
3331
import org.apache.iceberg.parquet.VectorizedReader;
3432
import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList;
@@ -101,33 +99,26 @@ public VectorizedReader<?> message(
10199
Lists.newArrayListWithExpectedSize(icebergFields.size());
102100

103101
for (Types.NestedField field : icebergFields) {
104-
int id = field.fieldId();
105-
VectorizedReader<?> reader = readersById.get(id);
106-
if (idToConstant.containsKey(id)) {
107-
reorderedFields.add(constantReader(field, idToConstant.get(id)));
108-
} else if (id == MetadataColumns.ROW_POSITION.fieldId()) {
109-
if (setArrowValidityVector) {
110-
reorderedFields.add(VectorizedArrowReader.positionsWithSetArrowValidityVector());
111-
} else {
112-
reorderedFields.add(VectorizedArrowReader.positions());
113-
}
114-
} else if (id == MetadataColumns.IS_DELETED.fieldId()) {
115-
reorderedFields.add(new DeletedVectorReader());
116-
} else if (reader != null) {
117-
reorderedFields.add(reader);
118-
} else if (field.initialDefault() != null) {
119-
reorderedFields.add(
120-
constantReader(field, convert.apply(field.type(), field.initialDefault())));
121-
} else if (field.isOptional()) {
122-
reorderedFields.add(VectorizedArrowReader.nulls());
123-
} else {
124-
throw new IllegalArgumentException(
125-
String.format("Missing required field: %s", field.name()));
126-
}
102+
VectorizedReader<?> reader =
103+
VectorizedArrowReader.replaceWithMetadataReader(
104+
field, readersById.get(field.fieldId()), idToConstant, setArrowValidityVector);
105+
reorderedFields.add(defaultReader(field, reader));
127106
}
128107
return vectorizedReader(reorderedFields);
129108
}
130109

110+
private VectorizedReader<?> defaultReader(Types.NestedField field, VectorizedReader<?> reader) {
111+
if (reader != null) {
112+
return reader;
113+
} else if (field.initialDefault() != null) {
114+
return constantReader(field, convert.apply(field.type(), field.initialDefault()));
115+
} else if (field.isOptional()) {
116+
return VectorizedArrowReader.nulls();
117+
}
118+
119+
throw new IllegalArgumentException(String.format("Missing required field: %s", field.name()));
120+
}
121+
131122
private <T> ConstantVectorReader<T> constantReader(Types.NestedField field, T constant) {
132123
return new ConstantVectorReader<>(field, constant);
133124
}

0 commit comments

Comments
 (0)