Thanks for visiting codestin.com
Credit goes to github.com

Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ private VectorHolder(Types.NestedField field) {
icebergField = field;
}

private VectorHolder(FieldVector vec, Types.NestedField field, NullabilityHolder nulls) {
VectorHolder(FieldVector vec, Types.NestedField field, NullabilityHolder nulls) {
columnDescriptor = null;
vector = vec;
isDictionaryEncoded = false;
Expand Down Expand Up @@ -167,6 +167,11 @@ public PositionVectorHolder(
}
}

/**
 * Creates a plain {@link VectorHolder} wrapping an already-populated Arrow vector.
 *
 * @param vector the Arrow vector containing the column values
 * @param icebergField the Iceberg field the vector represents
 * @param nulls nullability information for the vector's values
 * @return a holder for the given vector
 */
public static VectorHolder vectorHolder(
    FieldVector vector, Types.NestedField icebergField, NullabilityHolder nulls) {
  VectorHolder holder = new VectorHolder(vector, icebergField, nulls);
  return holder;
}

public static class DeletedVectorHolder extends VectorHolder {
private final int numRows;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -461,6 +461,53 @@ public static VectorizedArrowReader positionsWithSetArrowValidityVector() {
return new PositionVectorReader(true);
}

/**
 * Returns a reader for the {@code _row_id} metadata column.
 *
 * <p>When {@code baseRowId} is null, row ids cannot be derived and a null-producing reader is
 * returned instead.
 *
 * @param baseRowId the first row id assigned to the file, or null if unavailable
 * @param idReader reader for materialized row ids, may be null
 * @return a reader producing row id values, or a nulls reader
 */
public static VectorizedArrowReader rowIds(Long baseRowId, VectorizedArrowReader idReader) {
  if (baseRowId == null) {
    return nulls();
  }

  return new RowIdVectorReader(baseRowId, idReader);
}

/**
 * Returns a reader for the {@code _last_updated_sequence_number} metadata column.
 *
 * <p>Both {@code fileLastUpdated} and {@code baseRowId} must be present to materialize sequence
 * numbers; otherwise a null-producing reader is returned.
 *
 * @param baseRowId the first row id assigned to the file, or null if unavailable
 * @param fileLastUpdated the file's data sequence number, or null if unavailable
 * @param seqReader reader for materialized sequence numbers, may be null
 * @return a reader producing sequence numbers, or a nulls reader
 */
public static VectorizedArrowReader lastUpdated(
    Long baseRowId, Long fileLastUpdated, VectorizedArrowReader seqReader) {
  boolean canMaterialize = fileLastUpdated != null && baseRowId != null;
  return canMaterialize ? new LastUpdatedSeqVectorReader(fileLastUpdated, seqReader) : nulls();
}

/**
 * Substitutes the given reader with a metadata-column reader when the field is a metadata column.
 *
 * <p>Branch order matters: {@code _row_id} and {@code _last_updated_sequence_number} are checked
 * before the constant lookup because their ids may also be present in {@code idToConstant}.
 *
 * @param icebergField the field being read
 * @param reader the reader built for the field, may be null
 * @param idToConstant constant values by field id (values may be null)
 * @param setArrowValidityVector whether position readers should set the Arrow validity vector
 * @return a metadata reader for the field, or the original reader if it is not a metadata column
 */
public static VectorizedReader<?> replaceWithMetadataReader(
    Types.NestedField icebergField,
    VectorizedReader<?> reader,
    Map<Integer, ?> idToConstant,
    boolean setArrowValidityVector) {
  int fieldId = icebergField.fieldId();

  if (fieldId == MetadataColumns.ROW_ID.fieldId()) {
    return rowIds((Long) idToConstant.get(fieldId), (VectorizedArrowReader) reader);
  }

  if (fieldId == MetadataColumns.LAST_UPDATED_SEQUENCE_NUMBER.fieldId()) {
    Long firstRowId = (Long) idToConstant.get(MetadataColumns.ROW_ID.fieldId());
    Long fileSeqNumber = (Long) idToConstant.get(fieldId);
    return lastUpdated(firstRowId, fileSeqNumber, (VectorizedArrowReader) reader);
  }

  if (idToConstant.containsKey(fieldId)) {
    // containsKey rather than a null check: the constant itself may be null
    return new ConstantVectorReader<>(icebergField, idToConstant.get(fieldId));
  }

  if (fieldId == MetadataColumns.ROW_POSITION.fieldId()) {
    return setArrowValidityVector
        ? positionsWithSetArrowValidityVector()
        : VectorizedArrowReader.positions();
  }

  if (fieldId == MetadataColumns.IS_DELETED.fieldId()) {
    return new DeletedVectorReader();
  }

  return reader;
}

private static final class NullVectorReader extends VectorizedArrowReader {
private static final NullVectorReader INSTANCE = new NullVectorReader();

Expand Down Expand Up @@ -530,12 +577,6 @@ private static BigIntVector newVector(int valueCount) {
return vector;
}

private static NullabilityHolder newNullabilityHolder(int size) {
NullabilityHolder nullabilityHolder = new NullabilityHolder(size);
nullabilityHolder.setNotNulls(0, size);
return nullabilityHolder;
}

@Override
public void setRowGroupInfo(
PageReadStore source, Map<ColumnPath, ColumnChunkMetaData> metadata) {
Expand Down Expand Up @@ -567,6 +608,164 @@ public void close() {
}
}

private static final class RowIdVectorReader extends VectorizedArrowReader {
private static final Field ROW_ID_ARROW_FIELD = ArrowSchemaUtil.convert(MetadataColumns.ROW_ID);

private final long firstRowId;
private final VectorizedReader<VectorHolder> idReader;
private final VectorizedReader<VectorHolder> posReader;
private NullabilityHolder nulls;

private RowIdVectorReader(long firstRowId, VectorizedArrowReader idReader) {
this.firstRowId = firstRowId;
this.idReader = idReader != null ? idReader : nulls();
this.posReader = new PositionVectorReader(true);
}

@Override
public VectorHolder read(VectorHolder reuse, int numValsToRead) {
FieldVector positions = null;
FieldVector ids = null;

try {
positions = posReader.read(null, numValsToRead).vector();
VectorHolder idsHolder = idReader.read(null, numValsToRead);
ids = idsHolder.vector();
ArrowVectorAccessor<?, String, ?, ?> idsAccessor =
ids == null ? null : ArrowVectorAccessors.getVectorAccessor(idsHolder);

BigIntVector rowIds = allocateBigIntVector(ROW_ID_ARROW_FIELD, numValsToRead);
ArrowBuf dataBuffer = rowIds.getDataBuffer();
for (int i = 0; i < numValsToRead; i += 1) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
for (int i = 0; i < numValsToRead; i += 1) {
for (int i = 0; i < numValsToRead; i++) {

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Happy to change it if you feel strongly about it, but I mostly just followed the increment pattern of i += 1 already in this class (and this package it looks like). If we do change it, I'd change it for the other instances in this class just to keep things consistent.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I actually didn't realize that we have so many places that do i += 1 in for loops. It's not a big deal and I don't feel strong about it but it would be great to fix this throughout the codebase in a separate PR

long bufferOffset = (long) i * Long.BYTES;
if (idsAccessor == null || isNull(idsHolder, i)) {
long rowId = firstRowId + (Long) positions.getObject(i);
dataBuffer.setLong(bufferOffset, rowId);
} else {
long materializedRowId = idsAccessor.getLong(i);
dataBuffer.setLong(bufferOffset, materializedRowId);
}
}

rowIds.setValueCount(numValsToRead);
return VectorHolder.vectorHolder(rowIds, MetadataColumns.ROW_ID, nulls);
} finally {
if (positions != null) {
positions.close();
}

if (ids != null) {
ids.close();
}
}
}

@Override
public void setRowGroupInfo(
PageReadStore source, Map<ColumnPath, ColumnChunkMetaData> metadata) {
idReader.setRowGroupInfo(source, metadata);
posReader.setRowGroupInfo(source, metadata);
}

@Override
public void setBatchSize(int batchSize) {
if (nulls == null || nulls.size() < batchSize) {
this.nulls = newNullabilityHolder(batchSize);
}

idReader.setBatchSize(batchSize);
posReader.setBatchSize(batchSize);
}

@Override
public void close() {
// don't close result vectors as they are not owned by readers
}
}

/**
 * Reads the {@code _last_updated_sequence_number} metadata column.
 *
 * <p>For each row, uses the materialized sequence number when one is present; otherwise falls back
 * to the file-level sequence number supplied at construction.
 */
private static final class LastUpdatedSeqVectorReader extends VectorizedArrowReader {
  private static final Field LAST_UPDATED_SEQ =
      ArrowSchemaUtil.convert(MetadataColumns.LAST_UPDATED_SEQUENCE_NUMBER);

  private final long lastUpdatedSeq;
  private final VectorizedReader<VectorHolder> seqReader;
  private NullabilityHolder nulls;

  private LastUpdatedSeqVectorReader(
      long lastUpdatedSeq, VectorizedReader<VectorHolder> seqReader) {
    this.lastUpdatedSeq = lastUpdatedSeq;
    // fall back to a null-producing reader when no materialized sequence numbers exist
    this.seqReader = seqReader != null ? seqReader : nulls();
  }

  @Override
  public VectorHolder read(VectorHolder reuse, int numValsToRead) {
    FieldVector seqNumbers = null;
    try {
      VectorHolder seqNumbersHolder = seqReader.read(null, numValsToRead);
      seqNumbers = seqNumbersHolder.vector();
      // a null vector means nothing is materialized; every row uses the file-level value
      ArrowVectorAccessor<?, String, ?, ?> accessor =
          seqNumbers == null ? null : ArrowVectorAccessors.getVectorAccessor(seqNumbersHolder);

      BigIntVector result = allocateBigIntVector(LAST_UPDATED_SEQ, numValsToRead);
      ArrowBuf buffer = result.getDataBuffer();
      for (int rowIndex = 0; rowIndex < numValsToRead; rowIndex += 1) {
        long offset = (long) rowIndex * Long.BYTES;
        boolean useFileSeq = accessor == null || isNull(seqNumbersHolder, rowIndex);
        long value = useFileSeq ? lastUpdatedSeq : accessor.getLong(rowIndex);
        buffer.setLong(offset, value);
      }

      result.setValueCount(numValsToRead);
      return VectorHolder.vectorHolder(
          result, MetadataColumns.LAST_UPDATED_SEQUENCE_NUMBER, nulls);
    } finally {
      // the intermediate sequence-number vector is owned here; release it
      if (seqNumbers != null) {
        seqNumbers.close();
      }
    }
  }

  @Override
  public void setRowGroupInfo(
      PageReadStore source, Map<ColumnPath, ColumnChunkMetaData> metadata) {
    seqReader.setRowGroupInfo(source, metadata);
  }

  @Override
  public void setBatchSize(int batchSize) {
    // grow the all-not-null holder lazily; values produced here are never null
    if (nulls == null || nulls.size() < batchSize) {
      this.nulls = newNullabilityHolder(batchSize);
    }

    seqReader.setBatchSize(batchSize);
  }

  @Override
  public void close() {
    // don't close result vectors as they are not owned by readers
  }
}

/** Returns whether the value at {@code index} is null; the holder encodes null as 1. */
private static boolean isNull(VectorHolder holder, int index) {
  int nullFlag = holder.nullabilityHolder().isNullAt(index);
  return nullFlag == 1;
}

/**
 * Allocates a {@link BigIntVector} for the given Arrow field with capacity for {@code valueCount}
 * values, backed by the root allocator.
 */
private static BigIntVector allocateBigIntVector(Field field, int valueCount) {
  FieldVector created = field.createVector(ArrowAllocation.rootAllocator());
  BigIntVector bigIntVector = (BigIntVector) created;
  bigIntVector.allocateNew(valueCount);
  return bigIntVector;
}

/** Creates a {@link NullabilityHolder} of the given size with every position marked not-null. */
private static NullabilityHolder newNullabilityHolder(int size) {
  NullabilityHolder holder = new NullabilityHolder(size);
  holder.setNotNulls(0, size);
  return holder;
}

/**
* A Dummy Vector Reader which doesn't actually read files, instead it returns a dummy
* VectorHolder which indicates the constant value which should be used for this column.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,11 +24,9 @@
import java.util.function.Function;
import java.util.stream.IntStream;
import org.apache.arrow.memory.BufferAllocator;
import org.apache.iceberg.MetadataColumns;
import org.apache.iceberg.Schema;
import org.apache.iceberg.arrow.ArrowAllocation;
import org.apache.iceberg.arrow.vectorized.VectorizedArrowReader.ConstantVectorReader;
import org.apache.iceberg.arrow.vectorized.VectorizedArrowReader.DeletedVectorReader;
import org.apache.iceberg.parquet.TypeWithSchemaVisitor;
import org.apache.iceberg.parquet.VectorizedReader;
import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList;
Expand Down Expand Up @@ -101,33 +99,26 @@ public VectorizedReader<?> message(
Lists.newArrayListWithExpectedSize(icebergFields.size());

for (Types.NestedField field : icebergFields) {
int id = field.fieldId();
VectorizedReader<?> reader = readersById.get(id);
if (idToConstant.containsKey(id)) {
reorderedFields.add(constantReader(field, idToConstant.get(id)));
} else if (id == MetadataColumns.ROW_POSITION.fieldId()) {
if (setArrowValidityVector) {
reorderedFields.add(VectorizedArrowReader.positionsWithSetArrowValidityVector());
} else {
reorderedFields.add(VectorizedArrowReader.positions());
}
} else if (id == MetadataColumns.IS_DELETED.fieldId()) {
reorderedFields.add(new DeletedVectorReader());
} else if (reader != null) {
reorderedFields.add(reader);
} else if (field.initialDefault() != null) {
reorderedFields.add(
constantReader(field, convert.apply(field.type(), field.initialDefault())));
} else if (field.isOptional()) {
reorderedFields.add(VectorizedArrowReader.nulls());
} else {
throw new IllegalArgumentException(
String.format("Missing required field: %s", field.name()));
}
VectorizedReader<?> reader =
VectorizedArrowReader.replaceWithMetadataReader(
field, readersById.get(field.fieldId()), idToConstant, setArrowValidityVector);
reorderedFields.add(defaultReader(field, reader));
Comment on lines +102 to +105
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same as the refactoring done in https://github.com/apache/iceberg/pull/12836/files

}
return vectorizedReader(reorderedFields);
}

/**
 * Resolves the reader for a field when no metadata/constant reader applies.
 *
 * <p>Prefers the file-backed reader, then the field's initial default as a constant, then nulls
 * for optional fields.
 *
 * @param field the field being read
 * @param reader the file-backed reader, may be null when the column is absent
 * @return a reader for the field
 * @throws IllegalArgumentException if the field is required and has no reader or default
 */
private VectorizedReader<?> defaultReader(Types.NestedField field, VectorizedReader<?> reader) {
  if (reader != null) {
    return reader;
  }

  if (field.initialDefault() != null) {
    return constantReader(field, convert.apply(field.type(), field.initialDefault()));
  }

  if (field.isOptional()) {
    return VectorizedArrowReader.nulls();
  }

  throw new IllegalArgumentException(String.format("Missing required field: %s", field.name()));
}

/** Builds a reader that produces the given constant value for every row of {@code field}. */
private <T> ConstantVectorReader<T> constantReader(Types.NestedField field, T constant) {
  ConstantVectorReader<T> reader = new ConstantVectorReader<>(field, constant);
  return reader;
}
Expand Down
Loading