From a69ec52a7f08470212a615311fd63e7229ea1f38 Mon Sep 17 00:00:00 2001 From: Eric Maynard Date: Tue, 10 Jun 2025 10:26:36 -0700 Subject: [PATCH 01/31] rebase --- .../parquet/VectorizedPageIterator.java | 33 ++++---- ...ectorizedParquetDefinitionLevelReader.java | 50 ++++++------ .../parquet/VectorizedPlainValuesReader.java | 77 +++++++++++++++++++ .../parquet/VectorizedValuesReader.java | 55 +++++++++++++ 4 files changed, 174 insertions(+), 41 deletions(-) create mode 100644 arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedPlainValuesReader.java create mode 100644 arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedValuesReader.java diff --git a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedPageIterator.java b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedPageIterator.java index b97eb1545550..4f01216f35b3 100644 --- a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedPageIterator.java +++ b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedPageIterator.java @@ -45,7 +45,7 @@ public VectorizedPageIterator( this.setArrowValidityVector = setValidityVector; } - private ValuesAsBytesReader plainValuesReader = null; + private VectorizedValuesReader valuesReader = null; private VectorizedDictionaryEncodedParquetValuesReader dictionaryEncodedValuesReader = null; private boolean allPagesDictEncoded; private VectorizedParquetDefinitionLevelReader vectorizedDefinitionLevelReader; @@ -65,13 +65,13 @@ public void setAllPagesDictEncoded(boolean allDictEncoded) { @Override protected void reset() { super.reset(); - this.plainValuesReader = null; + this.valuesReader = null; this.vectorizedDefinitionLevelReader = null; } @Override protected void initDataReader(Encoding dataEncoding, ByteBufferInputStream in, int valueCount) { - ValuesReader previousReader = plainValuesReader; + ValuesReader previousReader = (ValuesReader) valuesReader; if (dataEncoding.usesDictionary()) { if (dictionary == null) { throw new ParquetDecodingException( @@ -94,7 +94,9 @@ protected void initDataReader(Encoding dataEncoding, ByteBufferInputStream in, i throw new ParquetDecodingException("could not read page in col " + desc, e); } } else { - if (dataEncoding != Encoding.PLAIN) { + if (dataEncoding == Encoding.PLAIN) { + valuesReader = new VectorizedPlainValuesReader(); + } else { throw new UnsupportedOperationException( "Cannot support vectorized reads for column " + desc @@ -103,14 +105,13 @@ protected void initDataReader(Encoding dataEncoding, ByteBufferInputStream in, i + dataEncoding + ". 
Disable vectorized reads to read this table/file"); } - plainValuesReader = new ValuesAsBytesReader(); - plainValuesReader.initFromPage(valueCount, in); + valuesReader.initFromPage(valueCount, in); dictionaryDecodeMode = DictionaryDecodeMode.NONE; } if (CorruptDeltaByteArrays.requiresSequentialReads(writerVersion, dataEncoding) && previousReader instanceof RequiresPreviousReader) { // previous reader can only be set if reading sequentially - ((RequiresPreviousReader) plainValuesReader).setPreviousReader(previousReader); + ((RequiresPreviousReader) valuesReader).setPreviousReader(previousReader); } } @@ -204,7 +205,7 @@ protected void nextVal( FieldVector vector, int batchSize, int numVals, int typeWidth, NullabilityHolder holder) { vectorizedDefinitionLevelReader .integerReader() - .nextBatch(vector, numVals, typeWidth, batchSize, holder, plainValuesReader); + .nextBatch(vector, numVals, typeWidth, batchSize, holder, valuesReader); } @Override @@ -231,7 +232,7 @@ protected void nextVal( FieldVector vector, int batchSize, int numVals, int typeWidth, NullabilityHolder holder) { vectorizedDefinitionLevelReader .longReader() - .nextBatch(vector, numVals, typeWidth, batchSize, holder, plainValuesReader); + .nextBatch(vector, numVals, typeWidth, batchSize, holder, valuesReader); } @Override @@ -262,7 +263,7 @@ protected void nextVal( FieldVector vector, int batchSize, int numVals, int typeWidth, NullabilityHolder holder) { vectorizedDefinitionLevelReader .timestampMillisReader() - .nextBatch(vector, numVals, typeWidth, batchSize, holder, plainValuesReader); + .nextBatch(vector, numVals, typeWidth, batchSize, holder, valuesReader); } @Override @@ -288,7 +289,7 @@ protected void nextVal( FieldVector vector, int batchSize, int numVals, int typeWidth, NullabilityHolder holder) { vectorizedDefinitionLevelReader .timestampInt96Reader() - .nextBatch(vector, numVals, typeWidth, batchSize, holder, plainValuesReader); + .nextBatch(vector, numVals, typeWidth, batchSize, holder, valuesReader); } @Override @@ -315,7 +316,7 @@ protected void nextVal( FieldVector vector, int batchSize, int numVals, int typeWidth, NullabilityHolder holder) { vectorizedDefinitionLevelReader .floatReader() - .nextBatch(vector, numVals, typeWidth, batchSize, holder, plainValuesReader); + .nextBatch(vector, numVals, typeWidth, batchSize, holder, valuesReader); } @Override @@ -342,7 +343,7 @@ protected void nextVal( FieldVector vector, int batchSize, int numVals, int typeWidth, NullabilityHolder holder) { vectorizedDefinitionLevelReader .doubleReader() - .nextBatch(vector, numVals, typeWidth, batchSize, holder, plainValuesReader); + .nextBatch(vector, numVals, typeWidth, batchSize, holder, valuesReader); } @Override @@ -371,7 +372,7 @@ protected void nextVal( FieldVector vector, int batchSize, int numVals, int typeWidth, NullabilityHolder holder) { vectorizedDefinitionLevelReader .fixedSizeBinaryReader() - .nextBatch(vector, numVals, typeWidth, batchSize, holder, plainValuesReader); + .nextBatch(vector, numVals, typeWidth, batchSize, holder, valuesReader); } @Override @@ -397,7 +398,7 @@ protected void nextVal( FieldVector vector, int batchSize, int numVals, int typeWidth, NullabilityHolder holder) { vectorizedDefinitionLevelReader .varWidthReader() - .nextBatch(vector, numVals, typeWidth, batchSize, holder, plainValuesReader); + .nextBatch(vector, numVals, typeWidth, batchSize, holder, valuesReader); } @Override @@ -423,7 +424,7 @@ protected void nextVal( FieldVector vector, int batchSize, int numVals, int typeWidth, 
NullabilityHolder holder) { vectorizedDefinitionLevelReader .booleanReader() - .nextBatch(vector, numVals, typeWidth, batchSize, holder, plainValuesReader); + .nextBatch(vector, numVals, typeWidth, batchSize, holder, valuesReader); } @Override diff --git a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedParquetDefinitionLevelReader.java b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedParquetDefinitionLevelReader.java index 0d7bbc6e4977..b85e350fd063 100644 --- a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedParquetDefinitionLevelReader.java +++ b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedParquetDefinitionLevelReader.java @@ -84,7 +84,7 @@ public void nextBatch( final int typeWidth, final int numValsToRead, NullabilityHolder nullabilityHolder, - ValuesAsBytesReader valuesReader) { + VectorizedValuesReader valuesReader) { nextBatch( vector, startOffset, @@ -147,7 +147,7 @@ protected abstract void nextRleBatch( FieldVector vector, int typeWidth, NullabilityHolder nullabilityHolder, - ValuesAsBytesReader valuesReader, + VectorizedValuesReader valuesReader, int idx, int numValues, byte[] byteArray); @@ -156,7 +156,7 @@ protected abstract void nextPackedBatch( FieldVector vector, int typeWidth, NullabilityHolder nullabilityHolder, - ValuesAsBytesReader valuesReader, + VectorizedValuesReader valuesReader, int idx, int numValues, byte[] byteArray); @@ -220,7 +220,7 @@ protected void nextRleBatch( final FieldVector vector, final int typeWidth, NullabilityHolder nullabilityHolder, - ValuesAsBytesReader valuesReader, + VectorizedValuesReader valuesReader, int idx, int numValues, byte[] byteArray) { @@ -232,7 +232,7 @@ protected void nextPackedBatch( final FieldVector vector, final int typeWidth, NullabilityHolder nullabilityHolder, - ValuesAsBytesReader valuesReader, + VectorizedValuesReader valuesReader, int idx, int numValues, byte[] byteArray) { @@ -252,13 +252,13 @@ protected void nextPackedBatch( } protected abstract void nextVal( - FieldVector vector, int idx, ValuesAsBytesReader valuesReader, Mode mode); + FieldVector vector, int idx, VectorizedValuesReader valuesReader, Mode mode); } class LongReader extends NumericBaseReader { @Override protected void nextVal( - FieldVector vector, int idx, ValuesAsBytesReader valuesReader, Mode mode) { + FieldVector vector, int idx, VectorizedValuesReader valuesReader, Mode mode) { vector.getDataBuffer().setLong(idx, valuesReader.readLong()); } @@ -285,7 +285,7 @@ protected void nextDictEncodedVal( class DoubleReader extends NumericBaseReader { @Override protected void nextVal( - FieldVector vector, int idx, ValuesAsBytesReader valuesReader, Mode mode) { + FieldVector vector, int idx, VectorizedValuesReader valuesReader, Mode mode) { vector.getDataBuffer().setDouble(idx, valuesReader.readDouble()); } @@ -312,7 +312,7 @@ protected void nextDictEncodedVal( class FloatReader extends NumericBaseReader { @Override protected void nextVal( - FieldVector vector, int idx, ValuesAsBytesReader valuesReader, Mode mode) { + FieldVector vector, int idx, VectorizedValuesReader valuesReader, Mode mode) { vector.getDataBuffer().setFloat(idx, valuesReader.readFloat()); } @@ -339,7 +339,7 @@ protected void nextDictEncodedVal( class IntegerReader extends NumericBaseReader { @Override protected void nextVal( - FieldVector vector, int idx, ValuesAsBytesReader valuesReader, Mode mode) { + FieldVector vector, int idx, VectorizedValuesReader valuesReader, Mode mode) { 
vector.getDataBuffer().setInt(idx, valuesReader.readInteger()); } @@ -371,7 +371,7 @@ protected void nextRleBatch( FieldVector vector, int typeWidth, NullabilityHolder nullabilityHolder, - ValuesAsBytesReader valuesReader, + VectorizedValuesReader valuesReader, int idx, int numValues, byte[] byteArray) { @@ -392,7 +392,7 @@ protected void nextPackedBatch( FieldVector vector, int typeWidth, NullabilityHolder nullabilityHolder, - ValuesAsBytesReader valuesReader, + VectorizedValuesReader valuesReader, int idx, int numValues, byte[] byteArray) { @@ -411,7 +411,7 @@ protected void nextPackedBatch( protected abstract void nextVal( FieldVector vector, int idx, - ValuesAsBytesReader valuesReader, + VectorizedValuesReader valuesReader, int typeWidth, byte[] byteArray); } @@ -422,7 +422,7 @@ class TimestampMillisReader extends BaseReader { protected void nextVal( FieldVector vector, int idx, - ValuesAsBytesReader valuesReader, + VectorizedValuesReader valuesReader, int typeWidth, byte[] byteArray) { vector.getDataBuffer().setLong((long) idx * typeWidth, valuesReader.readLong() * 1000); @@ -455,11 +455,11 @@ class TimestampInt96Reader extends BaseReader { protected void nextVal( FieldVector vector, int idx, - ValuesAsBytesReader valuesReader, + VectorizedValuesReader valuesReader, int typeWidth, byte[] byteArray) { // 8 bytes (time of day nanos) + 4 bytes(julianDay) = 12 bytes - ByteBuffer buffer = valuesReader.getBuffer(12).order(ByteOrder.LITTLE_ENDIAN); + ByteBuffer buffer = valuesReader.readBinary(12).toByteBuffer().order(ByteOrder.LITTLE_ENDIAN); long timestampInt96 = ParquetUtil.extractTimestampInt96(buffer); vector.getDataBuffer().setLong((long) idx * typeWidth, timestampInt96); } @@ -500,10 +500,10 @@ class FixedSizeBinaryReader extends BaseReader { protected void nextVal( FieldVector vector, int idx, - ValuesAsBytesReader valuesReader, + VectorizedValuesReader valuesReader, int typeWidth, byte[] byteArray) { - valuesReader.getBuffer(typeWidth).get(byteArray, 0, typeWidth); + valuesReader.readBinary(typeWidth).toByteBuffer().get(byteArray, 0, typeWidth); ((FixedSizeBinaryVector) vector).set(idx, byteArray); } @@ -535,11 +535,11 @@ class VarWidthReader extends BaseReader { protected void nextVal( FieldVector vector, int idx, - ValuesAsBytesReader valuesReader, + VectorizedValuesReader valuesReader, int typeWidth, byte[] byteArray) { int len = valuesReader.readInteger(); - ByteBuffer buffer = valuesReader.getBuffer(len); + ByteBuffer buffer = valuesReader.readBinary(len).toByteBuffer(); // Calling setValueLengthSafe takes care of allocating a larger buffer if // running out of space. ((BaseVariableWidthVector) vector).setValueLengthSafe(idx, len); @@ -580,10 +580,10 @@ class BooleanReader extends BaseReader { protected void nextVal( FieldVector vector, int idx, - ValuesAsBytesReader valuesReader, + VectorizedValuesReader valuesReader, int typeWidth, byte[] byteArray) { - ((BitVector) vector).setSafe(idx, valuesReader.readBooleanAsInt()); + ((BitVector) vector).setSafe(idx, valuesReader.readBoolean() ? 
1 : 0); } @Override @@ -606,7 +606,7 @@ class DictionaryIdReader extends BaseReader { protected void nextVal( FieldVector vector, int idx, - ValuesAsBytesReader valuesReader, + VectorizedValuesReader valuesReader, int typeWidth, byte[] byteArray) { throw new UnsupportedOperationException(); @@ -651,13 +651,13 @@ private void setNulls( private void setNextNValuesInVector( int typeWidth, NullabilityHolder nullabilityHolder, - ValuesAsBytesReader valuesReader, + VectorizedValuesReader valuesReader, int bufferIdx, FieldVector vector, int numValues) { ArrowBuf validityBuffer = vector.getValidityBuffer(); if (currentValue == maxDefLevel) { - ByteBuffer buffer = valuesReader.getBuffer(numValues * typeWidth); + ByteBuffer buffer = valuesReader.readBinary(numValues * typeWidth).toByteBuffer(); vector.getDataBuffer().setBytes((long) bufferIdx * typeWidth, buffer); nullabilityHolder.setNotNulls(bufferIdx, numValues); if (setArrowValidityVector) { diff --git a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedPlainValuesReader.java b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedPlainValuesReader.java new file mode 100644 index 000000000000..9bee621f7760 --- /dev/null +++ b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedPlainValuesReader.java @@ -0,0 +1,77 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.arrow.vectorized.parquet; + +import java.nio.ByteBuffer; +import org.apache.arrow.vector.FieldVector; +import org.apache.iceberg.parquet.ValuesAsBytesReader; +import org.apache.parquet.io.api.Binary; + +class VectorizedPlainValuesReader extends ValuesAsBytesReader implements VectorizedValuesReader { + + VectorizedPlainValuesReader() {} + + @Override + public byte readByte() { + return (byte) readInteger(); + } + + @Override + public short readShort() { + return (short) readInteger(); + } + + @Override + public Binary readBinary(int len) { + ByteBuffer buffer = getBuffer(len); + if (buffer.hasArray()) { + return Binary.fromConstantByteArray( + buffer.array(), buffer.arrayOffset() + buffer.position(), len); + } else { + byte[] bytes = new byte[len]; + buffer.get(bytes); + return Binary.fromConstantByteArray(bytes); + } + } + + private void readValues(int total, FieldVector vec, int rowId, int typeWidth) { + ByteBuffer buffer = getBuffer(total * typeWidth); + vec.getDataBuffer().setBytes((long) rowId * typeWidth, buffer); + } + + @Override + public void readIntegers(int total, FieldVector vec, int rowId) { + readValues(total, vec, rowId, 4); + } + + @Override + public void readLongs(int total, FieldVector vec, int rowId) { + readValues(total, vec, rowId, 8); + } + + @Override + public void readFloats(int total, FieldVector vec, int rowId) { + readValues(total, vec, rowId, 4); + } + + @Override + public void readDoubles(int total, FieldVector vec, int rowId) { + readValues(total, vec, rowId, 8); + } +} \ No newline at end of file diff --git a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedValuesReader.java b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedValuesReader.java new file mode 100644 index 000000000000..e1d65e5e8a3a --- /dev/null +++ b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedValuesReader.java @@ -0,0 +1,55 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.arrow.vectorized.parquet; + +import org.apache.arrow.vector.FieldVector; +import org.apache.parquet.bytes.ByteBufferInputStream; +import org.apache.parquet.io.api.Binary; + +/** Interface for value decoding that supports vectorized (aka batched) decoding. 
*/ +interface VectorizedValuesReader { + boolean readBoolean(); + + byte readByte(); + + short readShort(); + + int readInteger(); + + long readLong(); + + float readFloat(); + + double readDouble(); + + Binary readBinary(int len); + + /* + * Reads `total` values into `vec` start at `vec[rowId]` + */ + void readIntegers(int total, FieldVector vec, int rowId); + + void readLongs(int total, FieldVector vec, int rowId); + + void readFloats(int total, FieldVector vec, int rowId); + + void readDoubles(int total, FieldVector vec, int rowId); + + void initFromPage(int valueCount, ByteBufferInputStream in); +} \ No newline at end of file From 0bba5eff56b46035d3a3755afd88417f486c0f2e Mon Sep 17 00:00:00 2001 From: Eric Maynard Date: Tue, 10 Jun 2025 10:33:29 -0700 Subject: [PATCH 02/31] lint --- .../parquet/VectorizedPageIterator.java | 1 - ...ectorizedParquetDefinitionLevelReader.java | 1 - .../parquet/VectorizedPlainValuesReader.java | 82 +++++++++---------- .../parquet/VectorizedValuesReader.java | 34 ++++---- 4 files changed, 58 insertions(+), 60 deletions(-) diff --git a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedPageIterator.java b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedPageIterator.java index 4f01216f35b3..7551776853b8 100644 --- a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedPageIterator.java +++ b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedPageIterator.java @@ -24,7 +24,6 @@ import org.apache.iceberg.arrow.vectorized.NullabilityHolder; import org.apache.iceberg.parquet.BasePageIterator; import org.apache.iceberg.parquet.ParquetUtil; -import org.apache.iceberg.parquet.ValuesAsBytesReader; import org.apache.parquet.CorruptDeltaByteArrays; import org.apache.parquet.bytes.ByteBufferInputStream; import org.apache.parquet.bytes.BytesUtils; diff --git a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedParquetDefinitionLevelReader.java b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedParquetDefinitionLevelReader.java index b85e350fd063..0f85101a5b79 100644 --- a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedParquetDefinitionLevelReader.java +++ b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedParquetDefinitionLevelReader.java @@ -29,7 +29,6 @@ import org.apache.arrow.vector.IntVector; import org.apache.iceberg.arrow.vectorized.NullabilityHolder; import org.apache.iceberg.parquet.ParquetUtil; -import org.apache.iceberg.parquet.ValuesAsBytesReader; import org.apache.parquet.column.Dictionary; public final class VectorizedParquetDefinitionLevelReader diff --git a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedPlainValuesReader.java b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedPlainValuesReader.java index 9bee621f7760..2c5467c9c57c 100644 --- a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedPlainValuesReader.java +++ b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedPlainValuesReader.java @@ -25,53 +25,53 @@ class VectorizedPlainValuesReader extends ValuesAsBytesReader implements VectorizedValuesReader { - VectorizedPlainValuesReader() {} + VectorizedPlainValuesReader() {} - @Override - public byte readByte() { - return (byte) readInteger(); - } + @Override + public byte readByte() { + return (byte) readInteger(); + } - @Override - public short readShort() { - 
return (short) readInteger(); - } + @Override + public short readShort() { + return (short) readInteger(); + } - @Override - public Binary readBinary(int len) { - ByteBuffer buffer = getBuffer(len); - if (buffer.hasArray()) { - return Binary.fromConstantByteArray( - buffer.array(), buffer.arrayOffset() + buffer.position(), len); - } else { - byte[] bytes = new byte[len]; - buffer.get(bytes); - return Binary.fromConstantByteArray(bytes); - } + @Override + public Binary readBinary(int len) { + ByteBuffer buffer = getBuffer(len); + if (buffer.hasArray()) { + return Binary.fromConstantByteArray( + buffer.array(), buffer.arrayOffset() + buffer.position(), len); + } else { + byte[] bytes = new byte[len]; + buffer.get(bytes); + return Binary.fromConstantByteArray(bytes); } + } - private void readValues(int total, FieldVector vec, int rowId, int typeWidth) { - ByteBuffer buffer = getBuffer(total * typeWidth); - vec.getDataBuffer().setBytes((long) rowId * typeWidth, buffer); - } + private void readValues(int total, FieldVector vec, int rowId, int typeWidth) { + ByteBuffer buffer = getBuffer(total * typeWidth); + vec.getDataBuffer().setBytes((long) rowId * typeWidth, buffer); + } - @Override - public void readIntegers(int total, FieldVector vec, int rowId) { - readValues(total, vec, rowId, 4); - } + @Override + public void readIntegers(int total, FieldVector vec, int rowId) { + readValues(total, vec, rowId, 4); + } - @Override - public void readLongs(int total, FieldVector vec, int rowId) { - readValues(total, vec, rowId, 8); - } + @Override + public void readLongs(int total, FieldVector vec, int rowId) { + readValues(total, vec, rowId, 8); + } - @Override - public void readFloats(int total, FieldVector vec, int rowId) { - readValues(total, vec, rowId, 4); - } + @Override + public void readFloats(int total, FieldVector vec, int rowId) { + readValues(total, vec, rowId, 4); + } - @Override - public void readDoubles(int total, FieldVector vec, int rowId) { - readValues(total, vec, rowId, 8); - } -} \ No newline at end of file + @Override + public void readDoubles(int total, FieldVector vec, int rowId) { + readValues(total, vec, rowId, 8); + } +} diff --git a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedValuesReader.java b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedValuesReader.java index e1d65e5e8a3a..13d87b40b795 100644 --- a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedValuesReader.java +++ b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedValuesReader.java @@ -24,32 +24,32 @@ /** Interface for value decoding that supports vectorized (aka batched) decoding. 
*/ interface VectorizedValuesReader { - boolean readBoolean(); + boolean readBoolean(); - byte readByte(); + byte readByte(); - short readShort(); + short readShort(); - int readInteger(); + int readInteger(); - long readLong(); + long readLong(); - float readFloat(); + float readFloat(); - double readDouble(); + double readDouble(); - Binary readBinary(int len); + Binary readBinary(int len); - /* - * Reads `total` values into `vec` start at `vec[rowId]` - */ - void readIntegers(int total, FieldVector vec, int rowId); + /* + * Reads `total` values into `vec` start at `vec[rowId]` + */ + void readIntegers(int total, FieldVector vec, int rowId); - void readLongs(int total, FieldVector vec, int rowId); + void readLongs(int total, FieldVector vec, int rowId); - void readFloats(int total, FieldVector vec, int rowId); + void readFloats(int total, FieldVector vec, int rowId); - void readDoubles(int total, FieldVector vec, int rowId); + void readDoubles(int total, FieldVector vec, int rowId); - void initFromPage(int valueCount, ByteBufferInputStream in); -} \ No newline at end of file + void initFromPage(int valueCount, ByteBufferInputStream in); +} From 9ecc2be0e052adefd681cadc67aea406e20a5143 Mon Sep 17 00:00:00 2001 From: Eric Maynard Date: Wed, 18 Jun 2025 09:20:32 -0700 Subject: [PATCH 03/31] some changes per comments --- .../parquet/VectorizedPlainValuesReader.java | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedPlainValuesReader.java b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedPlainValuesReader.java index 2c5467c9c57c..ac98da17f2ab 100644 --- a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedPlainValuesReader.java +++ b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedPlainValuesReader.java @@ -25,6 +25,11 @@ class VectorizedPlainValuesReader extends ValuesAsBytesReader implements VectorizedValuesReader { + public static final int INT_SIZE = 4; + public static final int LONG_SIZE = 8; + public static final int FLOAT_SIZE = 4; + public static final int DOUBLE_SIZE = 8; + VectorizedPlainValuesReader() {} @Override @@ -57,21 +62,21 @@ private void readValues(int total, FieldVector vec, int rowId, int typeWidth) { @Override public void readIntegers(int total, FieldVector vec, int rowId) { - readValues(total, vec, rowId, 4); + readValues(total, vec, rowId, INT_SIZE); } @Override public void readLongs(int total, FieldVector vec, int rowId) { - readValues(total, vec, rowId, 8); + readValues(total, vec, rowId, LONG_SIZE); } @Override public void readFloats(int total, FieldVector vec, int rowId) { - readValues(total, vec, rowId, 4); + readValues(total, vec, rowId, FLOAT_SIZE); } @Override public void readDoubles(int total, FieldVector vec, int rowId) { - readValues(total, vec, rowId, 8); + readValues(total, vec, rowId, DOUBLE_SIZE); } } From 8d186fe9f089c9d59524fc090415321bad63f768 Mon Sep 17 00:00:00 2001 From: Eric Maynard Date: Mon, 23 Jun 2025 10:14:47 -0700 Subject: [PATCH 04/31] javadoc --- .../parquet/VectorizedValuesReader.java | 49 +++++++++++++++++-- 1 file changed, 46 insertions(+), 3 deletions(-) diff --git a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedValuesReader.java b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedValuesReader.java index 13d87b40b795..d24bafd2942f 100644 --- 
a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedValuesReader.java +++ b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedValuesReader.java @@ -20,36 +20,79 @@ import org.apache.arrow.vector.FieldVector; import org.apache.parquet.bytes.ByteBufferInputStream; +import org.apache.parquet.column.values.ValuesReader; import org.apache.parquet.io.api.Binary; -/** Interface for value decoding that supports vectorized (aka batched) decoding. */ +/** + * Interface for value decoding that supports vectorized (aka batched) decoding. + * Implementations are expected to be {@link ValuesReader} instances, and this interface + * "extends" that abstract class by overriding the salient methods. + */ interface VectorizedValuesReader { + + /** + * Read a single boolean + */ boolean readBoolean(); + /** + * Read a single byte + */ byte readByte(); + /** + * Read a single short + */ short readShort(); + /** + * Read a single integer + */ int readInteger(); + /** + * Read a single long + */ long readLong(); + /** + * Read a single float + */ float readFloat(); + /** + * Read a single double + */ double readDouble(); + /** + * Read binary data of some length + * @param len The number of bytes to read + */ Binary readBinary(int len); - /* - * Reads `total` values into `vec` start at `vec[rowId]` + /** + * Read `total` integers into `vec` starting at `vec[rowId]` */ void readIntegers(int total, FieldVector vec, int rowId); + /** + * Read `total` longs into `vec` starting at `vec[rowId]` + */ void readLongs(int total, FieldVector vec, int rowId); + /** + * Read `total` floats into `vec` starting at `vec[rowId]` + */ void readFloats(int total, FieldVector vec, int rowId); + /** + * Read `total` doubles into `vec` starting at `vec[rowId]` + */ void readDoubles(int total, FieldVector vec, int rowId); + /** + * Initialize the reader from a page. See {@link ValuesReader#initFromPage(int, ByteBufferInputStream)}. + */ void initFromPage(int valueCount, ByteBufferInputStream in); } From 5ce8913013000bfd6fd800f7f401e7d2d4e8ba02 Mon Sep 17 00:00:00 2001 From: Eric Maynard Date: Mon, 23 Jun 2025 10:18:19 -0700 Subject: [PATCH 05/31] lint --- .../parquet/VectorizedValuesReader.java | 54 ++++++------------- 1 file changed, 17 insertions(+), 37 deletions(-) diff --git a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedValuesReader.java b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedValuesReader.java index d24bafd2942f..7f02752bd5c9 100644 --- a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedValuesReader.java +++ b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedValuesReader.java @@ -24,75 +24,55 @@ import org.apache.parquet.io.api.Binary; /** - * Interface for value decoding that supports vectorized (aka batched) decoding. - * Implementations are expected to be {@link ValuesReader} instances, and this interface - * "extends" that abstract class by overriding the salient methods. + * Interface for value decoding that supports vectorized (aka batched) decoding. Implementations are + * expected to be {@link ValuesReader} instances, and this interface "extends" that abstract class + * by overriding the salient methods. 
*/ interface VectorizedValuesReader { - /** - * Read a single boolean - */ + /** Read a single boolean */ boolean readBoolean(); - /** - * Read a single byte - */ + /** Read a single byte */ byte readByte(); - /** - * Read a single short - */ + /** Read a single short */ short readShort(); - /** - * Read a single integer - */ + /** Read a single integer */ int readInteger(); - /** - * Read a single long - */ + /** Read a single long */ long readLong(); - /** - * Read a single float - */ + /** Read a single float */ float readFloat(); - /** - * Read a single double - */ + /** Read a single double */ double readDouble(); /** * Read binary data of some length + * * @param len The number of bytes to read */ Binary readBinary(int len); - /** - * Read `total` integers into `vec` starting at `vec[rowId]` - */ + /** Read `total` integers into `vec` starting at `vec[rowId]` */ void readIntegers(int total, FieldVector vec, int rowId); - /** - * Read `total` longs into `vec` starting at `vec[rowId]` - */ + /** Read `total` longs into `vec` starting at `vec[rowId]` */ void readLongs(int total, FieldVector vec, int rowId); - /** - * Read `total` floats into `vec` starting at `vec[rowId]` - */ + /** Read `total` floats into `vec` starting at `vec[rowId]` */ void readFloats(int total, FieldVector vec, int rowId); - /** - * Read `total` doubles into `vec` starting at `vec[rowId]` - */ + /** Read `total` doubles into `vec` starting at `vec[rowId]` */ void readDoubles(int total, FieldVector vec, int rowId); /** - * Initialize the reader from a page. See {@link ValuesReader#initFromPage(int, ByteBufferInputStream)}. + * Initialize the reader from a page. See {@link ValuesReader#initFromPage(int, + * ByteBufferInputStream)}. */ void initFromPage(int valueCount, ByteBufferInputStream in); } From 9fe0bba8e1f58d277bf83b1bedb4f33906c34256 Mon Sep 17 00:00:00 2001 From: Eric Maynard Date: Mon, 23 Jun 2025 10:48:08 -0700 Subject: [PATCH 06/31] create class --- .../VectorizedDeltaEncodedValuesReader.java | 69 +++++++++++++++++++ 1 file changed, 69 insertions(+) create mode 100644 arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedDeltaEncodedValuesReader.java diff --git a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedDeltaEncodedValuesReader.java b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedDeltaEncodedValuesReader.java new file mode 100644 index 000000000000..62b44aef3d1d --- /dev/null +++ b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedDeltaEncodedValuesReader.java @@ -0,0 +1,69 @@ +/* + * + * * Licensed to the Apache Software Foundation (ASF) under one + * * or more contributor license agreements. See the NOTICE file + * * distributed with this work for additional information + * * regarding copyright ownership. The ASF licenses this file + * * to you under the Apache License, Version 2.0 (the + * * "License"); you may not use this file except in compliance + * * with the License. You may obtain a copy of the License at + * * + * * http://www.apache.org/licenses/LICENSE-2.0 + * * + * * Unless required by applicable law or agreed to in writing, + * * software distributed under the License is distributed on an + * * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * * KIND, either express or implied. See the License for the + * * specific language governing permissions and limitations + * * under the License. 
+ * + */ + +package org.apache.iceberg.arrow.vectorized.parquet; + +import org.apache.arrow.vector.FieldVector; +import org.apache.parquet.column.values.ValuesReader; +import org.apache.parquet.io.api.Binary; + +public class VectorizedDeltaEncodedValuesReader extends ValuesReader implements VectorizedValuesReader { + + @Override + public void skip() { + + } + + @Override + public byte readByte() { + return 0; + } + + @Override + public short readShort() { + return 0; + } + + @Override + public Binary readBinary(int len) { + return null; + } + + @Override + public void readIntegers(int total, FieldVector vec, int rowId) { + + } + + @Override + public void readLongs(int total, FieldVector vec, int rowId) { + + } + + @Override + public void readFloats(int total, FieldVector vec, int rowId) { + + } + + @Override + public void readDoubles(int total, FieldVector vec, int rowId) { + + } +} From 6cecf96db382b9f37123d26e3545457ddb19d728 Mon Sep 17 00:00:00 2001 From: Eric Maynard Date: Mon, 23 Jun 2025 10:49:55 -0700 Subject: [PATCH 07/31] remove clash --- .../arrow/vectorized/parquet/VectorizedPageIterator.java | 7 ++++++- .../arrow/vectorized/parquet/VectorizedValuesReader.java | 3 ++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedPageIterator.java b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedPageIterator.java index 7551776853b8..99cf2dc45864 100644 --- a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedPageIterator.java +++ b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedPageIterator.java @@ -104,7 +104,12 @@ protected void initDataReader(Encoding dataEncoding, ByteBufferInputStream in, i + dataEncoding + ". Disable vectorized reads to read this table/file"); } - valuesReader.initFromPage(valueCount, in); + try { + valuesReader.initFromPage(valueCount, in); + } catch (IOException e) { + throw new ParquetDecodingException( + "could not read page " + valueCount + " in col " + desc, e); + } dictionaryDecodeMode = DictionaryDecodeMode.NONE; } if (CorruptDeltaByteArrays.requiresSequentialReads(writerVersion, dataEncoding) diff --git a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedValuesReader.java b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedValuesReader.java index 7f02752bd5c9..f8b7c92f5999 100644 --- a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedValuesReader.java +++ b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedValuesReader.java @@ -18,6 +18,7 @@ */ package org.apache.iceberg.arrow.vectorized.parquet; +import java.io.IOException; import org.apache.arrow.vector.FieldVector; import org.apache.parquet.bytes.ByteBufferInputStream; import org.apache.parquet.column.values.ValuesReader; @@ -74,5 +75,5 @@ interface VectorizedValuesReader { * Initialize the reader from a page. See {@link ValuesReader#initFromPage(int, * ByteBufferInputStream)}. 
*/ - void initFromPage(int valueCount, ByteBufferInputStream in); + void initFromPage(int valueCount, ByteBufferInputStream in) throws IOException; } From 3aed168ac53a1241c33bec4ff0c76343bc0402be Mon Sep 17 00:00:00 2001 From: Eric Maynard Date: Mon, 23 Jun 2025 14:57:41 -0700 Subject: [PATCH 08/31] refactoring --- .../VectorizedDeltaEncodedValuesReader.java | 243 +++++++++++++++++- .../parquet/VectorizedPlainValuesReader.java | 5 - .../parquet/VectorizedValuesReader.java | 5 + .../iceberg/parquet/ValuesAsBytesReader.java | 4 +- 4 files changed, 243 insertions(+), 14 deletions(-) diff --git a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedDeltaEncodedValuesReader.java b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedDeltaEncodedValuesReader.java index 62b44aef3d1d..dc10b251f4aa 100644 --- a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedDeltaEncodedValuesReader.java +++ b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedDeltaEncodedValuesReader.java @@ -21,15 +21,80 @@ package org.apache.iceberg.arrow.vectorized.parquet; +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.util.Arrays; import org.apache.arrow.vector.FieldVector; +import org.apache.parquet.Preconditions; +import org.apache.parquet.bytes.ByteBufferInputStream; +import org.apache.parquet.bytes.BytesUtils; import org.apache.parquet.column.values.ValuesReader; +import org.apache.parquet.column.values.bitpacking.BytePackerForLong; +import org.apache.parquet.column.values.bitpacking.Packer; +import org.apache.parquet.column.values.plain.PlainValuesReader; +import org.apache.parquet.io.ParquetDecodingException; import org.apache.parquet.io.api.Binary; -public class VectorizedDeltaEncodedValuesReader extends ValuesReader implements VectorizedValuesReader { +/** + * A {@link VectorizedValuesReader} implementation for the encoding type DELTA_BINARY_PACKED. + * This is adapted from Spark's VectorizedDeltaBinaryPackedReader. + * + * @see + * Parquet format encodings: DELTA_BINARY_PACKED + */ +public class VectorizedDeltaEncodedValuesReader + extends ValuesReader implements VectorizedValuesReader { - @Override - public void skip() { + // header data + private int blockSizeInValues; + private int miniBlockNumInABlock; + private int totalValueCount; + private long firstValue; + + private int miniBlockSizeInValues; + + // values read by the caller + private int valuesRead = 0; + // variables to keep state of the current block and miniblock + private long lastValueRead; // needed to compute the next value + private long minDeltaInCurrentBlock; // needed to compute the next value + // currentMiniBlock keeps track of the mini block within the current block that + // we read and decoded most recently. 
Only used as an index into + // bitWidths array + private int currentMiniBlock = 0; + private int[] bitWidths; // bit widths for each miniBlock in the current block + private int remainingInBlock = 0; // values in current block still to be read + private int remainingInMiniBlock = 0; // values in current mini block still to be read + private long[] unpackedValuesBuffer; + + private ByteBufferInputStream in; + + // temporary buffers used by readByte, readShort, readInteger, and readLong + private byte byteVal; + private short shortVal; + private int intVal; + private long longVal; + + @Override + public void initFromPage(int valueCount, ByteBufferInputStream in) throws IOException { + Preconditions.checkArgument(valueCount >= 1, + "Page must have at least one value, but it has " + valueCount); + this.in = in; + // Read the header + this.blockSizeInValues = BytesUtils.readUnsignedVarInt(in); + this.miniBlockNumInABlock = BytesUtils.readUnsignedVarInt(in); + double miniSize = (double) blockSizeInValues / miniBlockNumInABlock; + Preconditions.checkArgument(miniSize % 8 == 0, + "miniBlockSize must be multiple of 8, but it's " + miniSize); + this.miniBlockSizeInValues = (int) miniSize; + // True value count. May be less than valueCount because of nulls + this.totalValueCount = BytesUtils.readUnsignedVarInt(in); + this.bitWidths = new int[miniBlockNumInABlock]; + this.unpackedValuesBuffer = new long[miniBlockSizeInValues]; + // read the first value + firstValue = BytesUtils.readZigZagVarLong(in); } @Override @@ -42,28 +107,192 @@ public short readShort() { return 0; } + @Override + public int readInteger() { + return -1; + } + + @Override + public long readLong() { + return -1; + } + + @Override + public void skip() { + throw new UnsupportedOperationException("skip is not supported"); + } + @Override public Binary readBinary(int len) { - return null; + throw new UnsupportedOperationException("readBinary is not supported"); } @Override public void readIntegers(int total, FieldVector vec, int rowId) { - + readValues( + total, + vec, + rowId, + INT_SIZE, + (b, v) -> b.putInt((int) v)); } @Override public void readLongs(int total, FieldVector vec, int rowId) { - + readValues( + total, + vec, + rowId, + LONG_SIZE, + ByteBuffer::putLong); } @Override public void readFloats(int total, FieldVector vec, int rowId) { - + throw new UnsupportedOperationException("readFloats is not supported"); } @Override public void readDoubles(int total, FieldVector vec, int rowId) { + throw new UnsupportedOperationException("readDoubles is not supported"); + } + + private void readValues( + int total, FieldVector vec, int rowId, int typeWidth, IntegerOutputWriter outputWriter) { + if (valuesRead + total > totalValueCount) { + throw new ParquetDecodingException( + "No more values to read. 
Total values read: " + valuesRead + ", total count: " + + totalValueCount + ", trying to read " + total + " more."); + } + int remaining = total; + // First value + if (valuesRead == 0) { + ByteBuffer firstValueBuffer = getBuffer(typeWidth); + outputWriter.write(firstValueBuffer, firstValue); + vec.getDataBuffer().setBytes((long) rowId * typeWidth, firstValueBuffer); + lastValueRead = firstValue; + rowId++; + remaining--; + } + while (remaining > 0) { + int n; + try { + n = loadMiniBlockToOutput(remaining, vec, rowId, typeWidth, outputWriter); + } catch (IOException e) { + throw new ParquetDecodingException("Error reading mini block.", e); + } + rowId += n; + remaining -= n; + } + valuesRead = total - remaining; + } + + /** + * Read from a mini block. Read at most 'remaining' values into output. + * + * @return the number of values read into output + */ + private int loadMiniBlockToOutput( + int remaining, + FieldVector vec, + int rowId, + int typeWidth, + IntegerOutputWriter outputWriter) throws IOException { + + // new block; read the block header + if (remainingInBlock == 0) { + readBlockHeader(); + } + + // new miniblock, unpack the miniblock + if (remainingInMiniBlock == 0) { + unpackMiniBlock(); + } + + // read values from miniblock + ByteBuffer buffer = getBuffer(remainingInMiniBlock * typeWidth); + int valuesRead = 0; + for (int i = miniBlockSizeInValues - remainingInMiniBlock; + i < miniBlockSizeInValues && valuesRead < remaining; i++) { + // calculate values from deltas unpacked for current block + long outValue = lastValueRead + minDeltaInCurrentBlock + unpackedValuesBuffer[i]; + lastValueRead = outValue; + outputWriter.write(buffer, outValue); + remainingInBlock--; + remainingInMiniBlock--; + valuesRead++; + } + vec.getDataBuffer().setBytes((long) rowId * typeWidth, buffer); + + return valuesRead; + } + + private void readBlockHeader() { + try { + minDeltaInCurrentBlock = BytesUtils.readZigZagVarLong(in); + } catch (IOException e) { + throw new ParquetDecodingException("Can not read min delta in current block", e); + } + readBitWidthsForMiniBlocks(); + remainingInBlock = blockSizeInValues; + currentMiniBlock = 0; + remainingInMiniBlock = 0; + } + + private ByteBuffer getBuffer(int length) { + try { + return this.in.slice(length).order(ByteOrder.LITTLE_ENDIAN); + } catch (IOException e) { + throw new ParquetDecodingException("Failed to read " + length + " bytes", e); + } + } + + /** + * mini block has a size of 8*n, unpack 32 value each time + * + * see org.apache.parquet.column.values.delta.DeltaBinaryPackingValuesReader#unpackMiniBlock + */ + private void unpackMiniBlock() throws IOException { + Arrays.fill(this.unpackedValuesBuffer, 0); + BytePackerForLong packer = Packer.LITTLE_ENDIAN.newBytePackerForLong( + bitWidths[currentMiniBlock]); + for (int j = 0; j < miniBlockSizeInValues; j += 8) { + ByteBuffer buffer = in.slice(packer.getBitWidth()); + if (buffer.hasArray()) { + packer.unpack8Values(buffer.array(), + buffer.arrayOffset() + buffer.position(), unpackedValuesBuffer, j); + } else { + packer.unpack8Values(buffer, buffer.position(), unpackedValuesBuffer, j); + } + } + remainingInMiniBlock = miniBlockSizeInValues; + currentMiniBlock++; + } + + // From org.apache.parquet.column.values.delta.DeltaBinaryPackingValuesReader + private void readBitWidthsForMiniBlocks() { + for (int i = 0; i < miniBlockNumInABlock; i++) { + try { + bitWidths[i] = BytesUtils.readIntLittleEndianOnOneByte(in); + } catch (IOException e) { + throw new ParquetDecodingException("Can not decode 
bitwidth in block header", e); + } + } + } + + /** + * A functional interface to write long values to into a ByteBuffer + */ + @FunctionalInterface + interface IntegerOutputWriter { + /** + * A functional interface that writes a long value to a specified row in a ByteBuffer, + * which will be written into a FieldVector + * + * @param buffer a ByteBuffer to write the value into + * @param val value to write + */ + void write(ByteBuffer buffer, long val); } } diff --git a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedPlainValuesReader.java b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedPlainValuesReader.java index ac98da17f2ab..764b2fc353e3 100644 --- a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedPlainValuesReader.java +++ b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedPlainValuesReader.java @@ -25,11 +25,6 @@ class VectorizedPlainValuesReader extends ValuesAsBytesReader implements VectorizedValuesReader { - public static final int INT_SIZE = 4; - public static final int LONG_SIZE = 8; - public static final int FLOAT_SIZE = 4; - public static final int DOUBLE_SIZE = 8; - VectorizedPlainValuesReader() {} @Override diff --git a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedValuesReader.java b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedValuesReader.java index f8b7c92f5999..8eb6431a2c85 100644 --- a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedValuesReader.java +++ b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedValuesReader.java @@ -31,6 +31,11 @@ */ interface VectorizedValuesReader { + public static final int INT_SIZE = 4; + public static final int LONG_SIZE = 8; + public static final int FLOAT_SIZE = 4; + public static final int DOUBLE_SIZE = 8; + /** Read a single boolean */ boolean readBoolean(); diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/ValuesAsBytesReader.java b/parquet/src/main/java/org/apache/iceberg/parquet/ValuesAsBytesReader.java index 71e10247af37..ca876d16df29 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/ValuesAsBytesReader.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/ValuesAsBytesReader.java @@ -56,12 +56,12 @@ public ByteBuffer getBuffer(int length) { } @Override - public final int readInteger() { + public int readInteger() { return getBuffer(4).getInt(); } @Override - public final long readLong() { + public long readLong() { return getBuffer(8).getLong(); } From 98d1c5c5720bf160021706d074320d03d21951b6 Mon Sep 17 00:00:00 2001 From: Eric Maynard Date: Mon, 23 Jun 2025 14:59:02 -0700 Subject: [PATCH 09/31] clean up --- .../VectorizedDeltaEncodedValuesReader.java | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedDeltaEncodedValuesReader.java b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedDeltaEncodedValuesReader.java index dc10b251f4aa..3b41a1e6188b 100644 --- a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedDeltaEncodedValuesReader.java +++ b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedDeltaEncodedValuesReader.java @@ -71,9 +71,7 @@ public class VectorizedDeltaEncodedValuesReader private ByteBufferInputStream in; - // temporary buffers used by readByte, readShort, readInteger, and readLong - private byte byteVal; 
-  private short shortVal;
+  // temporary buffers used by readInteger and readLong
   private int intVal;
   private long longVal;
 
@@ -99,22 +97,24 @@ public void initFromPage(int valueCount, ByteBufferInputStream in) throws IOExce
 
   @Override
   public byte readByte() {
-    return 0;
+    throw new UnsupportedOperationException("readByte is not supported");
   }
 
   @Override
   public short readShort() {
-    return 0;
+    throw new UnsupportedOperationException("readShort is not supported");
   }
 
   @Override
   public int readInteger() {
-    return -1;
+    readValues(1, null, 0, INT_SIZE, (b, v) -> intVal = (int) v);
+    return intVal;
   }
 
   @Override
   public long readLong() {
-    return -1;
+    readValues(1, null, 0, LONG_SIZE, (b, v) -> longVal = v);
+    return longVal;
   }
 
From b72e3386607598f8eab0908f69543a961490d7bb Mon Sep 17 00:00:00 2001
From: Eric Maynard
Date: Mon, 23 Jun 2025 15:28:06 -0700
Subject: [PATCH 10/31] wire up

---
 .../arrow/vectorized/parquet/VectorizedPageIterator.java | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedPageIterator.java b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedPageIterator.java
index 99cf2dc45864..ca39f4011513 100644
--- a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedPageIterator.java
+++ b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedPageIterator.java
@@ -95,6 +95,8 @@ protected void initDataReader(Encoding dataEncoding, ByteBufferInputStream in, i
     } else {
       if (dataEncoding == Encoding.PLAIN) {
         valuesReader = new VectorizedPlainValuesReader();
+      } else if (dataEncoding == Encoding.DELTA_BINARY_PACKED) {
+        valuesReader = new VectorizedDeltaEncodedValuesReader();
       } else {
         throw new UnsupportedOperationException(
             "Cannot support vectorized reads for column"
 
From b76cc47e832c4085b5defb9db6757b3a134ed0ac Mon Sep 17 00:00:00 2001
From: Eric Maynard
Date: Wed, 25 Jun 2025 15:22:19 -0700
Subject: [PATCH 11/31] tweak header

---
 .../VectorizedDeltaEncodedValuesReader.java   | 31 +++++++++----------
 1 file changed, 14 insertions(+), 17 deletions(-)

diff --git a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedDeltaEncodedValuesReader.java b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedDeltaEncodedValuesReader.java
index 3b41a1e6188b..b94b2fc797c4 100644
--- a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedDeltaEncodedValuesReader.java
+++ b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedDeltaEncodedValuesReader.java
@@ -1,24 +1,21 @@
 /*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
  *
- * * Licensed to the Apache Software Foundation (ASF) under one
- * * or more contributor license agreements. See the NOTICE file
- * * distributed with this work for additional information
- * * regarding copyright ownership. The ASF licenses this file
- * * to you under the Apache License, Version 2.0 (the
- * * "License"); you may not use this file except in compliance
- * * with the License.
You may obtain a copy of the License at - * * - * * http://www.apache.org/licenses/LICENSE-2.0 - * * - * * Unless required by applicable law or agreed to in writing, - * * software distributed under the License is distributed on an - * * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * * KIND, either express or implied. See the License for the - * * specific language governing permissions and limitations - * * under the License. + * http://www.apache.org/licenses/LICENSE-2.0 * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. */ - package org.apache.iceberg.arrow.vectorized.parquet; import java.io.IOException; From ec077750579ebeed715ae9e528ff2b7118e45309 Mon Sep 17 00:00:00 2001 From: Eric Maynard Date: Wed, 25 Jun 2025 16:59:04 -0700 Subject: [PATCH 12/31] check in --- .../VectorizedDeltaEncodedValuesReader.java | 435 +++++++++--------- .../parquet/VectorizedPageIterator.java | 27 +- 2 files changed, 228 insertions(+), 234 deletions(-) diff --git a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedDeltaEncodedValuesReader.java b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedDeltaEncodedValuesReader.java index b94b2fc797c4..4232dc8796d9 100644 --- a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedDeltaEncodedValuesReader.java +++ b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedDeltaEncodedValuesReader.java @@ -29,267 +29,258 @@ import org.apache.parquet.column.values.ValuesReader; import org.apache.parquet.column.values.bitpacking.BytePackerForLong; import org.apache.parquet.column.values.bitpacking.Packer; -import org.apache.parquet.column.values.plain.PlainValuesReader; import org.apache.parquet.io.ParquetDecodingException; import org.apache.parquet.io.api.Binary; /** - * A {@link VectorizedValuesReader} implementation for the encoding type DELTA_BINARY_PACKED. - * This is adapted from Spark's VectorizedDeltaBinaryPackedReader. + * A {@link VectorizedValuesReader} implementation for the encoding type DELTA_BINARY_PACKED. This + * is adapted from Spark's VectorizedDeltaBinaryPackedReader. 
* - * @see - * Parquet format encodings: DELTA_BINARY_PACKED + * @see + * Parquet format encodings: DELTA_BINARY_PACKED */ -public class VectorizedDeltaEncodedValuesReader - extends ValuesReader implements VectorizedValuesReader { +public class VectorizedDeltaEncodedValuesReader extends ValuesReader + implements VectorizedValuesReader { - // header data - private int blockSizeInValues; - private int miniBlockNumInABlock; - private int totalValueCount; - private long firstValue; + // header data + private int blockSizeInValues; + private int miniBlockNumInABlock; + private int totalValueCount; + private long firstValue; - private int miniBlockSizeInValues; + private int miniBlockSizeInValues; - // values read by the caller - private int valuesRead = 0; + // values read by the caller + private int valuesRead = 0; - // variables to keep state of the current block and miniblock - private long lastValueRead; // needed to compute the next value - private long minDeltaInCurrentBlock; // needed to compute the next value - // currentMiniBlock keeps track of the mini block within the current block that - // we read and decoded most recently. Only used as an index into - // bitWidths array - private int currentMiniBlock = 0; - private int[] bitWidths; // bit widths for each miniBlock in the current block - private int remainingInBlock = 0; // values in current block still to be read - private int remainingInMiniBlock = 0; // values in current mini block still to be read - private long[] unpackedValuesBuffer; + // variables to keep state of the current block and miniblock + private long lastValueRead; // needed to compute the next value + private long minDeltaInCurrentBlock; // needed to compute the next value + // currentMiniBlock keeps track of the mini block within the current block that + // we read and decoded most recently. Only used as an index into + // bitWidths array + private int currentMiniBlock = 0; + private int[] bitWidths; // bit widths for each miniBlock in the current block + private int remainingInBlock = 0; // values in current block still to be read + private int remainingInMiniBlock = 0; // values in current mini block still to be read + private long[] unpackedValuesBuffer; - private ByteBufferInputStream in; + private ByteBufferInputStream in; - // temporary buffers used by readInteger and readLong - private int intVal; - private long longVal; + // temporary buffers used by readInteger and readLong + private int intVal; + private long longVal; - @Override - public void initFromPage(int valueCount, ByteBufferInputStream in) throws IOException { - Preconditions.checkArgument(valueCount >= 1, - "Page must have at least one value, but it has " + valueCount); - this.in = in; - // Read the header - this.blockSizeInValues = BytesUtils.readUnsignedVarInt(in); - this.miniBlockNumInABlock = BytesUtils.readUnsignedVarInt(in); - double miniSize = (double) blockSizeInValues / miniBlockNumInABlock; - Preconditions.checkArgument(miniSize % 8 == 0, - "miniBlockSize must be multiple of 8, but it's " + miniSize); - this.miniBlockSizeInValues = (int) miniSize; - // True value count. 
May be less than valueCount because of nulls - this.totalValueCount = BytesUtils.readUnsignedVarInt(in); - this.bitWidths = new int[miniBlockNumInABlock]; - this.unpackedValuesBuffer = new long[miniBlockSizeInValues]; - // read the first value - firstValue = BytesUtils.readZigZagVarLong(in); - } + @Override + public void initFromPage(int valueCount, ByteBufferInputStream in) throws IOException { + Preconditions.checkArgument( + valueCount >= 1, "Page must have at least one value, but it has " + valueCount); + this.in = in; + // Read the header + this.blockSizeInValues = BytesUtils.readUnsignedVarInt(in); + this.miniBlockNumInABlock = BytesUtils.readUnsignedVarInt(in); + double miniSize = (double) blockSizeInValues / miniBlockNumInABlock; + Preconditions.checkArgument( + miniSize % 8 == 0, "miniBlockSize must be multiple of 8, but it's " + miniSize); + this.miniBlockSizeInValues = (int) miniSize; + // True value count. May be less than valueCount because of nulls + this.totalValueCount = BytesUtils.readUnsignedVarInt(in); + this.bitWidths = new int[miniBlockNumInABlock]; + this.unpackedValuesBuffer = new long[miniBlockSizeInValues]; + // read the first value + firstValue = BytesUtils.readZigZagVarLong(in); + } - @Override - public byte readByte() { - throw new UnsupportedOperationException("readByte is not supported"); - } + @Override + public byte readByte() { + throw new UnsupportedOperationException("readByte is not supported"); + } - @Override - public short readShort() { - throw new UnsupportedOperationException("readShort is not supported"); - } + @Override + public short readShort() { + throw new UnsupportedOperationException("readShort is not supported"); + } - @Override - public int readInteger() { - readValues(1, null, 0, INT_SIZE, (b, v) -> intVal = (int) v); - return intVal; - } + @Override + public int readInteger() { + readValues(1, null, 0, INT_SIZE, (b, v) -> intVal = (int) v); + return intVal; + } - @Override - public long readLong() { - readValues(1, null, 0, LONG_SIZE, (b, v) -> longVal = (int) v); - return longVal; - } + @Override + public long readLong() { + readValues(1, null, 0, LONG_SIZE, (b, v) -> longVal = (int) v); + return longVal; + } - @Override - public void skip() { - throw new UnsupportedOperationException("skip is not supported"); - } + @Override + public void skip() { + throw new UnsupportedOperationException("skip is not supported"); + } - @Override - public Binary readBinary(int len) { - throw new UnsupportedOperationException("readBinary is not supported"); - } + @Override + public Binary readBinary(int len) { + throw new UnsupportedOperationException("readBinary is not supported"); + } - @Override - public void readIntegers(int total, FieldVector vec, int rowId) { - readValues( - total, - vec, - rowId, - INT_SIZE, - (b, v) -> b.putInt((int) v)); - } + @Override + public void readIntegers(int total, FieldVector vec, int rowId) { + readValues(total, vec, rowId, INT_SIZE, (b, v) -> b.putInt((int) v)); + } - @Override - public void readLongs(int total, FieldVector vec, int rowId) { - readValues( - total, - vec, - rowId, - LONG_SIZE, - ByteBuffer::putLong); - } + @Override + public void readLongs(int total, FieldVector vec, int rowId) { + readValues(total, vec, rowId, LONG_SIZE, ByteBuffer::putLong); + } - @Override - public void readFloats(int total, FieldVector vec, int rowId) { - throw new UnsupportedOperationException("readFloats is not supported"); - } + @Override + public void readFloats(int total, FieldVector vec, int rowId) { + throw new 
UnsupportedOperationException("readFloats is not supported"); + } - @Override - public void readDoubles(int total, FieldVector vec, int rowId) { - throw new UnsupportedOperationException("readDoubles is not supported"); - } + @Override + public void readDoubles(int total, FieldVector vec, int rowId) { + throw new UnsupportedOperationException("readDoubles is not supported"); + } - private void readValues( - int total, FieldVector vec, int rowId, int typeWidth, IntegerOutputWriter outputWriter) { - if (valuesRead + total > totalValueCount) { - throw new ParquetDecodingException( - "No more values to read. Total values read: " + valuesRead + ", total count: " - + totalValueCount + ", trying to read " + total + " more."); - } - int remaining = total; - // First value - if (valuesRead == 0) { - ByteBuffer firstValueBuffer = getBuffer(typeWidth); - outputWriter.write(firstValueBuffer, firstValue); - vec.getDataBuffer().setBytes((long) rowId * typeWidth, firstValueBuffer); - lastValueRead = firstValue; - rowId++; - remaining--; - } - while (remaining > 0) { - int n; - try { - n = loadMiniBlockToOutput(remaining, vec, rowId, typeWidth, outputWriter); - } catch (IOException e) { - throw new ParquetDecodingException("Error reading mini block.", e); - } - rowId += n; - remaining -= n; - } - valuesRead = total - remaining; + private void readValues( + int total, FieldVector vec, int rowId, int typeWidth, IntegerOutputWriter outputWriter) { + if (valuesRead + total > totalValueCount) { + throw new ParquetDecodingException( + "No more values to read. Total values read: " + + valuesRead + + ", total count: " + + totalValueCount + + ", trying to read " + + total + + " more."); } + int remaining = total; + // First value + if (valuesRead == 0) { + ByteBuffer firstValueBuffer = getBuffer(typeWidth); + outputWriter.write(firstValueBuffer, firstValue); + vec.getDataBuffer().setBytes((long) rowId * typeWidth, firstValueBuffer); + lastValueRead = firstValue; + rowId++; + remaining--; + } + while (remaining > 0) { + int n; + try { + n = loadMiniBlockToOutput(remaining, vec, rowId, typeWidth, outputWriter); + } catch (IOException e) { + throw new ParquetDecodingException("Error reading mini block.", e); + } + rowId += n; + remaining -= n; + } + valuesRead = total - remaining; + } - /** - * Read from a mini block. Read at most 'remaining' values into output. - * - * @return the number of values read into output - */ - private int loadMiniBlockToOutput( - int remaining, - FieldVector vec, - int rowId, - int typeWidth, - IntegerOutputWriter outputWriter) throws IOException { - - // new block; read the block header - if (remainingInBlock == 0) { - readBlockHeader(); - } + /** + * Read from a mini block. Read at most 'remaining' values into output. 
+ * + * @return the number of values read into output + */ + private int loadMiniBlockToOutput( + int remaining, FieldVector vec, int rowId, int typeWidth, IntegerOutputWriter outputWriter) + throws IOException { - // new miniblock, unpack the miniblock - if (remainingInMiniBlock == 0) { - unpackMiniBlock(); - } + // new block; read the block header + if (remainingInBlock == 0) { + readBlockHeader(); + } - // read values from miniblock - ByteBuffer buffer = getBuffer(remainingInMiniBlock * typeWidth); - int valuesRead = 0; - for (int i = miniBlockSizeInValues - remainingInMiniBlock; - i < miniBlockSizeInValues && valuesRead < remaining; i++) { - // calculate values from deltas unpacked for current block - long outValue = lastValueRead + minDeltaInCurrentBlock + unpackedValuesBuffer[i]; - lastValueRead = outValue; - outputWriter.write(buffer, outValue); - remainingInBlock--; - remainingInMiniBlock--; - valuesRead++; - } - vec.getDataBuffer().setBytes((long) rowId * typeWidth, buffer); + // new miniblock, unpack the miniblock + if (remainingInMiniBlock == 0) { + unpackMiniBlock(); + } - return valuesRead; + // read values from miniblock + ByteBuffer buffer = getBuffer(remainingInMiniBlock * typeWidth); + int valuesRead = 0; + for (int i = miniBlockSizeInValues - remainingInMiniBlock; + i < miniBlockSizeInValues && valuesRead < remaining; + i++) { + // calculate values from deltas unpacked for current block + long outValue = lastValueRead + minDeltaInCurrentBlock + unpackedValuesBuffer[i]; + lastValueRead = outValue; + outputWriter.write(buffer, outValue); + remainingInBlock--; + remainingInMiniBlock--; + valuesRead++; } + vec.getDataBuffer().setBytes((long) rowId * typeWidth, buffer); + + return valuesRead; + } - private void readBlockHeader() { - try { - minDeltaInCurrentBlock = BytesUtils.readZigZagVarLong(in); - } catch (IOException e) { - throw new ParquetDecodingException("Can not read min delta in current block", e); - } - readBitWidthsForMiniBlocks(); - remainingInBlock = blockSizeInValues; - currentMiniBlock = 0; - remainingInMiniBlock = 0; + private void readBlockHeader() { + try { + minDeltaInCurrentBlock = BytesUtils.readZigZagVarLong(in); + } catch (IOException e) { + throw new ParquetDecodingException("Can not read min delta in current block", e); } + readBitWidthsForMiniBlocks(); + remainingInBlock = blockSizeInValues; + currentMiniBlock = 0; + remainingInMiniBlock = 0; + } - private ByteBuffer getBuffer(int length) { - try { - return this.in.slice(length).order(ByteOrder.LITTLE_ENDIAN); - } catch (IOException e) { - throw new ParquetDecodingException("Failed to read " + length + " bytes", e); - } + private ByteBuffer getBuffer(int length) { + try { + return this.in.slice(length).order(ByteOrder.LITTLE_ENDIAN); + } catch (IOException e) { + throw new ParquetDecodingException("Failed to read " + length + " bytes", e); } + } - /** - * mini block has a size of 8*n, unpack 32 value each time - * - * see org.apache.parquet.column.values.delta.DeltaBinaryPackingValuesReader#unpackMiniBlock - */ - private void unpackMiniBlock() throws IOException { - Arrays.fill(this.unpackedValuesBuffer, 0); - BytePackerForLong packer = Packer.LITTLE_ENDIAN.newBytePackerForLong( - bitWidths[currentMiniBlock]); - for (int j = 0; j < miniBlockSizeInValues; j += 8) { - ByteBuffer buffer = in.slice(packer.getBitWidth()); - if (buffer.hasArray()) { - packer.unpack8Values(buffer.array(), - buffer.arrayOffset() + buffer.position(), unpackedValuesBuffer, j); - } else { - packer.unpack8Values(buffer, 
buffer.position(), unpackedValuesBuffer, j); - } - } - remainingInMiniBlock = miniBlockSizeInValues; - currentMiniBlock++; + /** + * mini block has a size of 8*n, unpack 32 value each time + * + *
see org.apache.parquet.column.values.delta.DeltaBinaryPackingValuesReader#unpackMiniBlock + */ + private void unpackMiniBlock() throws IOException { + Arrays.fill(this.unpackedValuesBuffer, 0); + BytePackerForLong packer = + Packer.LITTLE_ENDIAN.newBytePackerForLong(bitWidths[currentMiniBlock]); + for (int j = 0; j < miniBlockSizeInValues; j += 8) { + ByteBuffer buffer = in.slice(packer.getBitWidth()); + if (buffer.hasArray()) { + packer.unpack8Values( + buffer.array(), buffer.arrayOffset() + buffer.position(), unpackedValuesBuffer, j); + } else { + packer.unpack8Values(buffer, buffer.position(), unpackedValuesBuffer, j); + } } + remainingInMiniBlock = miniBlockSizeInValues; + currentMiniBlock++; + } - // From org.apache.parquet.column.values.delta.DeltaBinaryPackingValuesReader - private void readBitWidthsForMiniBlocks() { - for (int i = 0; i < miniBlockNumInABlock; i++) { - try { - bitWidths[i] = BytesUtils.readIntLittleEndianOnOneByte(in); - } catch (IOException e) { - throw new ParquetDecodingException("Can not decode bitwidth in block header", e); - } - } + // From org.apache.parquet.column.values.delta.DeltaBinaryPackingValuesReader + private void readBitWidthsForMiniBlocks() { + for (int i = 0; i < miniBlockNumInABlock; i++) { + try { + bitWidths[i] = BytesUtils.readIntLittleEndianOnOneByte(in); + } catch (IOException e) { + throw new ParquetDecodingException("Can not decode bitwidth in block header", e); + } } + } + + /** A functional interface to write long values to into a ByteBuffer */ + @FunctionalInterface + interface IntegerOutputWriter { /** - * A functional interface to write long values to into a ByteBuffer + * A functional interface that writes a long value to a specified row in a ByteBuffer, which + * will be written into a FieldVector + * + * @param buffer a ByteBuffer to write the value into + * @param val value to write */ - @FunctionalInterface - interface IntegerOutputWriter { - - /** - * A functional interface that writes a long value to a specified row in a ByteBuffer, - * which will be written into a FieldVector - * - * @param buffer a ByteBuffer to write the value into - * @param val value to write - */ - void write(ByteBuffer buffer, long val); - } + void write(ByteBuffer buffer, long val); + } } diff --git a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedPageIterator.java b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedPageIterator.java index ca39f4011513..be1a3324ae43 100644 --- a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedPageIterator.java +++ b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedPageIterator.java @@ -93,18 +93,21 @@ protected void initDataReader(Encoding dataEncoding, ByteBufferInputStream in, i throw new ParquetDecodingException("could not read page in col " + desc, e); } } else { - if (dataEncoding == Encoding.PLAIN) { - valuesReader = new VectorizedPlainValuesReader(); - } else if (dataEncoding == Encoding.DELTA_BINARY_PACKED) { - valuesReader = new VectorizedDeltaEncodedValuesReader(); - } else { - throw new UnsupportedOperationException( - "Cannot support vectorized reads for column " - + desc - + " with " - + "encoding " - + dataEncoding - + ". 
Disable vectorized reads to read this table/file"); } try { valuesReader.initFromPage(valueCount, in); From 1969466d696f963eb18a6513a515244879f0dff9 Mon Sep 17 00:00:00 2001 From: Eric Maynard Date: Thu, 26 Jun 2025 22:25:27 -0700 Subject: [PATCH 13/31] debugging --- .../vectorized/parquet/TestParquetVectorizedReads.java | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/data/vectorized/parquet/TestParquetVectorizedReads.java b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/data/vectorized/parquet/TestParquetVectorizedReads.java index ff9d624ae68f..094c9d6a9053 100644 --- a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/data/vectorized/parquet/TestParquetVectorizedReads.java +++ b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/data/vectorized/parquet/TestParquetVectorizedReads.java @@ -25,6 +25,7 @@ import static org.assertj.core.api.Assumptions.assumeThat; import java.io.File; +import java.io.FileInputStream; import java.io.IOException; import java.util.Iterator; import org.apache.iceberg.Files; @@ -50,6 +51,7 @@ import org.apache.parquet.schema.GroupType; import org.apache.parquet.schema.MessageType; import org.apache.parquet.schema.Type; +import org.apache.spark.sql.catalyst.plans.logical.Except; import org.apache.spark.sql.vectorized.ColumnarBatch; import org.junit.jupiter.api.Test; @@ -298,9 +300,11 @@ public void testSupportedReadsForParquetV2() throws Exception { // (i.e.
decimals > 8 bytes) Schema schema = new Schema( - optional(102, "float_data", Types.FloatType.get()), - optional(103, "double_data", Types.DoubleType.get()), - optional(104, "decimal_data", Types.DecimalType.of(25, 5))); +// optional(102, "float_data", Types.FloatType.get()), +// optional(103, "double_data", Types.DoubleType.get()), +// optional(104, "decimal_data", Types.DecimalType.of(25, 5)), + optional(105, "int_data", Types.IntegerType.get()), + optional(106, "long_data", Types.LongType.get())); File dataFile = File.createTempFile("junit", null, temp.toFile()); assertThat(dataFile.delete()).as("Delete should succeed").isTrue(); From d2b173b468687302aa21817ac830da5f88892a54 Mon Sep 17 00:00:00 2001 From: Eric Maynard Date: Thu, 26 Jun 2025 22:25:41 -0700 Subject: [PATCH 14/31] debugging --- .../VectorizedDeltaEncodedValuesReader.java | 8 +++- ...ectorizedParquetDefinitionLevelReader.java | 37 +++++++++++-------- .../parquet/VectorizedValuesReader.java | 8 ++-- .../apache/iceberg/parquet/PageIterator.java | 7 ---- 4 files changed, 31 insertions(+), 29 deletions(-) diff --git a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedDeltaEncodedValuesReader.java b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedDeltaEncodedValuesReader.java index 4232dc8796d9..5022b8f82147 100644 --- a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedDeltaEncodedValuesReader.java +++ b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedDeltaEncodedValuesReader.java @@ -161,7 +161,9 @@ private void readValues( if (valuesRead == 0) { ByteBuffer firstValueBuffer = getBuffer(typeWidth); outputWriter.write(firstValueBuffer, firstValue); - vec.getDataBuffer().setBytes((long) rowId * typeWidth, firstValueBuffer); + if (vec != null) { + vec.getDataBuffer().setBytes((long) rowId * typeWidth, firstValueBuffer); + } lastValueRead = firstValue; rowId++; remaining--; @@ -212,7 +214,9 @@ private int loadMiniBlockToOutput( remainingInMiniBlock--; valuesRead++; } - vec.getDataBuffer().setBytes((long) rowId * typeWidth, buffer); + if (vec != null) { + vec.getDataBuffer().setBytes((long) rowId * typeWidth, buffer); + } return valuesRead; } diff --git a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedParquetDefinitionLevelReader.java b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedParquetDefinitionLevelReader.java index 0f85101a5b79..3a8875c58b07 100644 --- a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedParquetDefinitionLevelReader.java +++ b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedParquetDefinitionLevelReader.java @@ -84,22 +84,27 @@ public void nextBatch( final int numValsToRead, NullabilityHolder nullabilityHolder, VectorizedValuesReader valuesReader) { - nextBatch( - vector, - startOffset, - typeWidth, - numValsToRead, - (mode, idx, numValues, byteArray, validityBuffer) -> { - switch (mode) { - case RLE: - nextRleBatch( - vector, typeWidth, nullabilityHolder, valuesReader, idx, numValues, byteArray); - break; - case PACKED: - nextPackedBatch( - vector, typeWidth, nullabilityHolder, valuesReader, idx, numValues, byteArray); - } - }); + if (valuesReader instanceof VectorizedPlainValuesReader) { + nextBatch( + vector, + startOffset, + typeWidth, + numValsToRead, + (mode, idx, numValues, byteArray, validityBuffer) -> { + switch (mode) { + case RLE: + nextRleBatch( + vector, typeWidth, nullabilityHolder, valuesReader, 
idx, numValues, byteArray); + break; + case PACKED: + nextPackedBatch( + vector, typeWidth, nullabilityHolder, valuesReader, idx, numValues, byteArray); + } + }); + } else { + // TODO actually call the appropriate methods + valuesReader.readIntegers(numValsToRead, vector, startOffset); + } } public void nextDictEncodedBatch( diff --git a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedValuesReader.java b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedValuesReader.java index 8eb6431a2c85..7c23149b18ab 100644 --- a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedValuesReader.java +++ b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedValuesReader.java @@ -31,10 +31,10 @@ */ interface VectorizedValuesReader { - public static final int INT_SIZE = 4; - public static final int LONG_SIZE = 8; - public static final int FLOAT_SIZE = 4; - public static final int DOUBLE_SIZE = 8; + int INT_SIZE = 4; + int LONG_SIZE = 8; + int FLOAT_SIZE = 4; + int DOUBLE_SIZE = 8; /** Read a single boolean */ boolean readBoolean(); diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/PageIterator.java b/parquet/src/main/java/org/apache/iceberg/parquet/PageIterator.java index bff13603002f..a68d2f9b82e7 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/PageIterator.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/PageIterator.java @@ -257,13 +257,6 @@ protected void initDataReader(Encoding dataEncoding, ByteBufferInputStream in, i } else { this.values = dataEncoding.getValuesReader(desc, ValuesType.VALUES); } - - // if (dataEncoding.usesDictionary() && converter.hasDictionarySupport()) { - // bindToDictionary(dictionary); - // } else { - // bind(path.getType()); - // } - try { values.initFromPage(valueCount, in); } catch (IOException e) { From 1f219e589a608a17024a77abdb8ee7c231b95775 Mon Sep 17 00:00:00 2001 From: Eric Maynard Date: Tue, 1 Jul 2025 13:02:36 -0700 Subject: [PATCH 15/31] debugging commit --- .../VectorizedDeltaEncodedValuesReader.java | 48 ++++------- ...ectorizedParquetDefinitionLevelReader.java | 85 ++++++++++++++----- .../iceberg/data/RandomGenericData.java | 5 ++ .../parquet/TestParquetVectorizedReads.java | 12 ++- 4 files changed, 93 insertions(+), 57 deletions(-) diff --git a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedDeltaEncodedValuesReader.java b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedDeltaEncodedValuesReader.java index 5022b8f82147..4ff29141313a 100644 --- a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedDeltaEncodedValuesReader.java +++ b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedDeltaEncodedValuesReader.java @@ -20,7 +20,6 @@ import java.io.IOException; import java.nio.ByteBuffer; -import java.nio.ByteOrder; import java.util.Arrays; import org.apache.arrow.vector.FieldVector; import org.apache.parquet.Preconditions; @@ -78,18 +77,18 @@ public void initFromPage(int valueCount, ByteBufferInputStream in) throws IOExce valueCount >= 1, "Page must have at least one value, but it has " + valueCount); this.in = in; // Read the header - this.blockSizeInValues = BytesUtils.readUnsignedVarInt(in); - this.miniBlockNumInABlock = BytesUtils.readUnsignedVarInt(in); + this.blockSizeInValues = BytesUtils.readUnsignedVarInt(this.in); + this.miniBlockNumInABlock = BytesUtils.readUnsignedVarInt(this.in); double miniSize = (double) blockSizeInValues / 
miniBlockNumInABlock; Preconditions.checkArgument( miniSize % 8 == 0, "miniBlockSize must be multiple of 8, but it's " + miniSize); this.miniBlockSizeInValues = (int) miniSize; // True value count. May be less than valueCount because of nulls - this.totalValueCount = BytesUtils.readUnsignedVarInt(in); + this.totalValueCount = BytesUtils.readUnsignedVarInt(this.in); this.bitWidths = new int[miniBlockNumInABlock]; this.unpackedValuesBuffer = new long[miniBlockSizeInValues]; // read the first value - firstValue = BytesUtils.readZigZagVarLong(in); + firstValue = BytesUtils.readZigZagVarLong(this.in); } @Override @@ -104,13 +103,13 @@ public short readShort() { @Override public int readInteger() { - readValues(1, null, 0, INT_SIZE, (b, v) -> intVal = (int) v); + readValues(1, null, 0, INT_SIZE, (f, i, v) -> intVal = (int) v); return intVal; } @Override public long readLong() { - readValues(1, null, 0, LONG_SIZE, (b, v) -> longVal = (int) v); + readValues(1, null, 0, LONG_SIZE, (f, i, v) -> longVal = v); return longVal; } @@ -126,12 +125,12 @@ public Binary readBinary(int len) { @Override public void readIntegers(int total, FieldVector vec, int rowId) { - readValues(total, vec, rowId, INT_SIZE, (b, v) -> b.putInt((int) v)); + readValues(total, vec, rowId, INT_SIZE, (f, i, v) -> f.getDataBuffer().setLong(i, v)); } @Override public void readLongs(int total, FieldVector vec, int rowId) { - readValues(total, vec, rowId, LONG_SIZE, ByteBuffer::putLong); + readValues(total, vec, rowId, LONG_SIZE, (f, i, v) -> f.getDataBuffer().setLong(i, v)); } @Override @@ -159,11 +158,7 @@ private void readValues( int remaining = total; // First value if (valuesRead == 0) { - ByteBuffer firstValueBuffer = getBuffer(typeWidth); - outputWriter.write(firstValueBuffer, firstValue); - if (vec != null) { - vec.getDataBuffer().setBytes((long) rowId * typeWidth, firstValueBuffer); - } + outputWriter.write(vec, (long) rowId * typeWidth, firstValue); lastValueRead = firstValue; rowId++; remaining--; @@ -201,7 +196,6 @@ private int loadMiniBlockToOutput( } // read values from miniblock - ByteBuffer buffer = getBuffer(remainingInMiniBlock * typeWidth); int valuesRead = 0; for (int i = miniBlockSizeInValues - remainingInMiniBlock; i < miniBlockSizeInValues && valuesRead < remaining; @@ -209,14 +203,11 @@ private int loadMiniBlockToOutput( // calculate values from deltas unpacked for current block long outValue = lastValueRead + minDeltaInCurrentBlock + unpackedValuesBuffer[i]; lastValueRead = outValue; - outputWriter.write(buffer, outValue); + outputWriter.write(vec, ((long) (rowId + valuesRead) * typeWidth), outValue); remainingInBlock--; remainingInMiniBlock--; valuesRead++; } - if (vec != null) { - vec.getDataBuffer().setBytes((long) rowId * typeWidth, buffer); - } return valuesRead; } @@ -233,14 +224,6 @@ private void readBlockHeader() { remainingInMiniBlock = 0; } - private ByteBuffer getBuffer(int length) { - try { - return this.in.slice(length).order(ByteOrder.LITTLE_ENDIAN); - } catch (IOException e) { - throw new ParquetDecodingException("Failed to read " + length + " bytes", e); - } - } - /** * mini block has a size of 8*n, unpack 32 value each time * @@ -274,17 +257,18 @@ private void readBitWidthsForMiniBlocks() { } } - /** A functional interface to write long values to into a ByteBuffer */ + /** A functional interface to write long values to into a FieldVector */ @FunctionalInterface interface IntegerOutputWriter { /** - * A functional interface that writes a long value to a specified row in a ByteBuffer, which - * 
will be written into a FieldVector + * A functional interface that can be used to write a long value to a specified row in a + * FieldVector * - * @param buffer a ByteBuffer to write the value into + * @param vec a FieldVector to write the value into + * @param index The offset to write to * @param val value to write */ - void write(ByteBuffer buffer, long val); + void write(FieldVector vec, long index, long val); } } diff --git a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedParquetDefinitionLevelReader.java b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedParquetDefinitionLevelReader.java index 3a8875c58b07..c04e7e0cb1b5 100644 --- a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedParquetDefinitionLevelReader.java +++ b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedParquetDefinitionLevelReader.java @@ -84,27 +84,39 @@ public void nextBatch( final int numValsToRead, NullabilityHolder nullabilityHolder, VectorizedValuesReader valuesReader) { - if (valuesReader instanceof VectorizedPlainValuesReader) { - nextBatch( - vector, - startOffset, - typeWidth, - numValsToRead, - (mode, idx, numValues, byteArray, validityBuffer) -> { - switch (mode) { - case RLE: - nextRleBatch( - vector, typeWidth, nullabilityHolder, valuesReader, idx, numValues, byteArray); - break; - case PACKED: - nextPackedBatch( - vector, typeWidth, nullabilityHolder, valuesReader, idx, numValues, byteArray); - } - }); - } else { - // TODO actually call the appropriate methods - valuesReader.readIntegers(numValsToRead, vector, startOffset); - } + nextBatch( + vector, + startOffset, + typeWidth, + numValsToRead, + (mode, idx, numValues, byteArray, validityBuffer) -> { + if (valuesReader instanceof VectorizedPlainValuesReader) { + switch (mode) { + case RLE: + nextRleBatch( + vector, + typeWidth, + nullabilityHolder, + valuesReader, + idx, + numValues, + byteArray); + break; + case PACKED: + nextPackedBatch( + vector, + typeWidth, + nullabilityHolder, + valuesReader, + idx, + numValues, + byteArray); + } + } else { + nextVectorizedBatch( + vector, typeWidth, nullabilityHolder, valuesReader, idx, numValues); + } + }); } public void nextDictEncodedBatch( @@ -165,6 +177,19 @@ protected abstract void nextPackedBatch( int numValues, byte[] byteArray); + protected void nextVectorizedBatch( + FieldVector vector, + int typeWidth, + NullabilityHolder nullabilityHolder, + VectorizedValuesReader valuesReader, + int idx, + int numValues) { + throw new UnsupportedOperationException( + this.getClass().getName() + + " does not support reader " + + valuesReader.getClass().getName()); + } + protected void nextRleDictEncodedBatch( FieldVector vector, int typeWidth, @@ -284,6 +309,24 @@ protected void nextDictEncodedVal( .setLong((long) idx * typeWidth, dict.decodeToLong(reader.readInteger())); } } + + @Override + protected void nextVectorizedBatch( + FieldVector vector, + int typeWidth, + NullabilityHolder nullabilityHolder, + VectorizedValuesReader valuesReader, + int idx, + int numValues) { + if (currentValue == maxDefLevel) { + valuesReader.readLongs(numValues, vector, idx); + for (int i = 0; i < numValues; i++) { + nullabilityHolder.setNotNull(idx + i); + } + } else { + setNulls(nullabilityHolder, idx + numValues, numValues, vector.getValidityBuffer()); + } + } } class DoubleReader extends NumericBaseReader { diff --git a/data/src/test/java/org/apache/iceberg/data/RandomGenericData.java 
b/data/src/test/java/org/apache/iceberg/data/RandomGenericData.java index 4963052e0877..eb3795fb8b79 100644 --- a/data/src/test/java/org/apache/iceberg/data/RandomGenericData.java +++ b/data/src/test/java/org/apache/iceberg/data/RandomGenericData.java @@ -175,6 +175,7 @@ public abstract static class RandomDataGenerator private final Random random; private final float nullPercentage; + private int currentInt = 1; protected RandomDataGenerator(long seed) { this(seed, DEFAULT_NULL_PERCENTAGE); @@ -289,6 +290,10 @@ public Object primitive(Type.PrimitiveType primitive) { } else { return EPOCH.plus((long) result, NANOS).toLocalDateTime(); } + case INTEGER: + return currentInt++; + case LONG: + return (long)currentInt++; default: return result; } diff --git a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/data/vectorized/parquet/TestParquetVectorizedReads.java b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/data/vectorized/parquet/TestParquetVectorizedReads.java index 094c9d6a9053..74e0bcecd5c5 100644 --- a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/data/vectorized/parquet/TestParquetVectorizedReads.java +++ b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/data/vectorized/parquet/TestParquetVectorizedReads.java @@ -185,7 +185,11 @@ void assertRecordsMatch( while (batches.hasNext()) { ColumnarBatch batch = batches.next(); numRowsRead += batch.numRows(); - GenericsHelpers.assertEqualsBatch(schema.asStruct(), expectedIter, batch); + if (numRowsRead != batch.numRows()) { + // todo skip the first batch for debugging + GenericsHelpers.assertEqualsBatch(schema.asStruct(), expectedIter, batch); + + } } assertThat(numRowsRead).isEqualTo(expectedSize); } @@ -303,13 +307,13 @@ public void testSupportedReadsForParquetV2() throws Exception { // optional(102, "float_data", Types.FloatType.get()), // optional(103, "double_data", Types.DoubleType.get()), // optional(104, "decimal_data", Types.DecimalType.of(25, 5)), - optional(105, "int_data", Types.IntegerType.get()), +// optional(105, "int_data", Types.IntegerType.get()), optional(106, "long_data", Types.LongType.get())); File dataFile = File.createTempFile("junit", null, temp.toFile()); assertThat(dataFile.delete()).as("Delete should succeed").isTrue(); Iterable data = - generateData(schema, 30000, 0L, RandomData.DEFAULT_NULL_PERCENTAGE, IDENTITY); + generateData(schema, 30000, 0L, 0, IDENTITY); try (FileAppender writer = getParquetV2Writer(schema, dataFile)) { writer.addAll(data); } @@ -318,7 +322,7 @@ public void testSupportedReadsForParquetV2() throws Exception { @Test public void testUnsupportedReadsForParquetV2() throws Exception { - // Longs, ints, string types etc use delta encoding and which are not supported for vectorized + // Longs, ints, string types etc. 
use delta encoding and which are not supported for vectorized // reads Schema schema = new Schema(SUPPORTED_PRIMITIVES.fields()); File dataFile = File.createTempFile("junit", null, temp.toFile()); From 21c11d84b6d070f0616a6f467871eb68e9f4164e Mon Sep 17 00:00:00 2001 From: Eric Maynard Date: Tue, 1 Jul 2025 13:23:13 -0700 Subject: [PATCH 16/31] move code --- ...ectorizedParquetDefinitionLevelReader.java | 70 +++++-------------- 1 file changed, 18 insertions(+), 52 deletions(-) diff --git a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedParquetDefinitionLevelReader.java b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedParquetDefinitionLevelReader.java index c04e7e0cb1b5..c9e1647d6904 100644 --- a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedParquetDefinitionLevelReader.java +++ b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedParquetDefinitionLevelReader.java @@ -30,6 +30,7 @@ import org.apache.iceberg.arrow.vectorized.NullabilityHolder; import org.apache.iceberg.parquet.ParquetUtil; import org.apache.parquet.column.Dictionary; +import org.apache.parquet.column.values.plain.PlainValuesReader; public final class VectorizedParquetDefinitionLevelReader extends BaseVectorizedParquetValuesReader { @@ -90,32 +91,27 @@ public void nextBatch( typeWidth, numValsToRead, (mode, idx, numValues, byteArray, validityBuffer) -> { - if (valuesReader instanceof VectorizedPlainValuesReader) { switch (mode) { case RLE: nextRleBatch( - vector, - typeWidth, - nullabilityHolder, - valuesReader, - idx, - numValues, - byteArray); + vector, + typeWidth, + nullabilityHolder, + valuesReader, + idx, + numValues, + byteArray); break; case PACKED: nextPackedBatch( - vector, - typeWidth, - nullabilityHolder, - valuesReader, - idx, - numValues, - byteArray); + vector, + typeWidth, + nullabilityHolder, + valuesReader, + idx, + numValues, + byteArray); } - } else { - nextVectorizedBatch( - vector, typeWidth, nullabilityHolder, valuesReader, idx, numValues); - } }); } @@ -177,19 +173,6 @@ protected abstract void nextPackedBatch( int numValues, byte[] byteArray); - protected void nextVectorizedBatch( - FieldVector vector, - int typeWidth, - NullabilityHolder nullabilityHolder, - VectorizedValuesReader valuesReader, - int idx, - int numValues) { - throw new UnsupportedOperationException( - this.getClass().getName() - + " does not support reader " - + valuesReader.getClass().getName()); - } - protected void nextRleDictEncodedBatch( FieldVector vector, int typeWidth, @@ -309,24 +292,6 @@ protected void nextDictEncodedVal( .setLong((long) idx * typeWidth, dict.decodeToLong(reader.readInteger())); } } - - @Override - protected void nextVectorizedBatch( - FieldVector vector, - int typeWidth, - NullabilityHolder nullabilityHolder, - VectorizedValuesReader valuesReader, - int idx, - int numValues) { - if (currentValue == maxDefLevel) { - valuesReader.readLongs(numValues, vector, idx); - for (int i = 0; i < numValues; i++) { - nullabilityHolder.setNotNull(idx + i); - } - } else { - setNulls(nullabilityHolder, idx + numValues, numValues, vector.getValidityBuffer()); - } - } } class DoubleReader extends NumericBaseReader { @@ -695,6 +660,7 @@ private void setNulls( } } + @SuppressWarnings({"all"}) private void setNextNValuesInVector( int typeWidth, NullabilityHolder nullabilityHolder, @@ -704,8 +670,8 @@ private void setNextNValuesInVector( int numValues) { ArrowBuf validityBuffer = vector.getValidityBuffer(); if (currentValue == 
maxDefLevel) { - ByteBuffer buffer = valuesReader.readBinary(numValues * typeWidth).toByteBuffer(); - vector.getDataBuffer().setBytes((long) bufferIdx * typeWidth, buffer); + // TODO read the correct type not just hard-coded longs here + valuesReader.readLongs(numValues, vector, bufferIdx); nullabilityHolder.setNotNulls(bufferIdx, numValues); if (setArrowValidityVector) { for (int i = 0; i < numValues; i++) { From e4bc23fb2035def5d35dab1669e8bd1180f1d589 Mon Sep 17 00:00:00 2001 From: Eric Maynard Date: Tue, 1 Jul 2025 13:37:57 -0700 Subject: [PATCH 17/31] switch back to floats --- .../parquet/VectorizedDeltaEncodedValuesReader.java | 5 ++++- .../parquet/VectorizedParquetDefinitionLevelReader.java | 2 +- .../data/vectorized/parquet/TestParquetVectorizedReads.java | 5 +++-- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedDeltaEncodedValuesReader.java b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedDeltaEncodedValuesReader.java index 4ff29141313a..66db9e9319c7 100644 --- a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedDeltaEncodedValuesReader.java +++ b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedDeltaEncodedValuesReader.java @@ -158,7 +158,8 @@ private void readValues( int remaining = total; // First value if (valuesRead == 0) { - outputWriter.write(vec, (long) rowId * typeWidth, firstValue); + System.out.println("#### (F) Wrote value " + firstValue + " to " + ((long) (rowId + valuesRead) * typeWidth)); + outputWriter.write(vec, ((long) (rowId + valuesRead) * typeWidth), firstValue); lastValueRead = firstValue; rowId++; remaining--; @@ -203,6 +204,8 @@ private int loadMiniBlockToOutput( // calculate values from deltas unpacked for current block long outValue = lastValueRead + minDeltaInCurrentBlock + unpackedValuesBuffer[i]; lastValueRead = outValue; + System.out.println("#### (O) Wrote value " + outValue + " to " + ((long) (rowId + valuesRead) * typeWidth) + + " vec IS null == " + (vec == null)); outputWriter.write(vec, ((long) (rowId + valuesRead) * typeWidth), outValue); remainingInBlock--; remainingInMiniBlock--; diff --git a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedParquetDefinitionLevelReader.java b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedParquetDefinitionLevelReader.java index c9e1647d6904..9edc91ace6be 100644 --- a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedParquetDefinitionLevelReader.java +++ b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedParquetDefinitionLevelReader.java @@ -671,7 +671,7 @@ private void setNextNValuesInVector( ArrowBuf validityBuffer = vector.getValidityBuffer(); if (currentValue == maxDefLevel) { // TODO read the correct type not just hard-coded longs here - valuesReader.readLongs(numValues, vector, bufferIdx); + valuesReader.readFloats(numValues, vector, bufferIdx); nullabilityHolder.setNotNulls(bufferIdx, numValues); if (setArrowValidityVector) { for (int i = 0; i < numValues; i++) { diff --git a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/data/vectorized/parquet/TestParquetVectorizedReads.java b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/data/vectorized/parquet/TestParquetVectorizedReads.java index 74e0bcecd5c5..0889118e1fd9 100644 --- 
a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/data/vectorized/parquet/TestParquetVectorizedReads.java +++ b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/data/vectorized/parquet/TestParquetVectorizedReads.java @@ -304,11 +304,12 @@ public void testSupportedReadsForParquetV2() throws Exception { // (i.e. decimals > 8 bytes) Schema schema = new Schema( -// optional(102, "float_data", Types.FloatType.get()), + optional(102, "float_data", Types.FloatType.get()) // optional(103, "double_data", Types.DoubleType.get()), // optional(104, "decimal_data", Types.DecimalType.of(25, 5)), // optional(105, "int_data", Types.IntegerType.get()), - optional(106, "long_data", Types.LongType.get())); +// optional(106, "long_data", Types.LongType.get()) + ); File dataFile = File.createTempFile("junit", null, temp.toFile()); assertThat(dataFile.delete()).as("Delete should succeed").isTrue(); From a88af2e8e7500b693816a0aba16ebfc4520979b0 Mon Sep 17 00:00:00 2001 From: Eric Maynard Date: Tue, 1 Jul 2025 13:41:09 -0700 Subject: [PATCH 18/31] clean a bit --- .../org/apache/iceberg/parquet/ValuesAsBytesReader.java | 4 ++-- .../data/vectorized/parquet/TestParquetVectorizedReads.java | 6 +----- 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/ValuesAsBytesReader.java b/parquet/src/main/java/org/apache/iceberg/parquet/ValuesAsBytesReader.java index ca876d16df29..71e10247af37 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/ValuesAsBytesReader.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/ValuesAsBytesReader.java @@ -56,12 +56,12 @@ public ByteBuffer getBuffer(int length) { } @Override - public int readInteger() { + public final int readInteger() { return getBuffer(4).getInt(); } @Override - public long readLong() { + public final long readLong() { return getBuffer(8).getLong(); } diff --git a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/data/vectorized/parquet/TestParquetVectorizedReads.java b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/data/vectorized/parquet/TestParquetVectorizedReads.java index 0889118e1fd9..7f55843d706a 100644 --- a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/data/vectorized/parquet/TestParquetVectorizedReads.java +++ b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/data/vectorized/parquet/TestParquetVectorizedReads.java @@ -185,11 +185,7 @@ void assertRecordsMatch( while (batches.hasNext()) { ColumnarBatch batch = batches.next(); numRowsRead += batch.numRows(); - if (numRowsRead != batch.numRows()) { - // todo skip the first batch for debugging - GenericsHelpers.assertEqualsBatch(schema.asStruct(), expectedIter, batch); - - } + GenericsHelpers.assertEqualsBatch(schema.asStruct(), expectedIter, batch); } assertThat(numRowsRead).isEqualTo(expectedSize); } From c375e99d0e35b1255d179f51e4449c60a478791c Mon Sep 17 00:00:00 2001 From: Eric Maynard Date: Tue, 1 Jul 2025 13:54:18 -0700 Subject: [PATCH 19/31] semistable --- ...ectorizedParquetDefinitionLevelReader.java | 33 ++++++++++++++++--- .../iceberg/data/RandomGenericData.java | 5 --- .../parquet/TestParquetVectorizedReads.java | 13 ++++---- 3 files changed, 34 insertions(+), 17 deletions(-) diff --git a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedParquetDefinitionLevelReader.java b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedParquetDefinitionLevelReader.java index 9edc91ace6be..ef6f9c4ba2a0 100644 --- 
a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedParquetDefinitionLevelReader.java +++ b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedParquetDefinitionLevelReader.java @@ -30,6 +30,7 @@ import org.apache.iceberg.arrow.vectorized.NullabilityHolder; import org.apache.iceberg.parquet.ParquetUtil; import org.apache.parquet.column.Dictionary; +import org.apache.parquet.column.values.ValuesReader; import org.apache.parquet.column.values.plain.PlainValuesReader; public final class VectorizedParquetDefinitionLevelReader @@ -236,7 +237,7 @@ protected void nextRleBatch( int idx, int numValues, byte[] byteArray) { - setNextNValuesInVector(typeWidth, nullabilityHolder, valuesReader, idx, vector, numValues); + setNextNValuesInVector(nullabilityHolder, valuesReader, idx, vector, numValues, this); } @Override @@ -265,6 +266,9 @@ protected void nextPackedBatch( protected abstract void nextVal( FieldVector vector, int idx, VectorizedValuesReader valuesReader, Mode mode); + + public abstract void nextVals( + FieldVector vector, int rowId, VectorizedValuesReader valuesReader, int total); } class LongReader extends NumericBaseReader { @@ -292,6 +296,11 @@ protected void nextDictEncodedVal( .setLong((long) idx * typeWidth, dict.decodeToLong(reader.readInteger())); } } + + @Override + public void nextVals(FieldVector vector, int rowId, VectorizedValuesReader valuesReader, int total) { + valuesReader.readLongs(total, vector, rowId); + } } class DoubleReader extends NumericBaseReader { @@ -319,6 +328,11 @@ protected void nextDictEncodedVal( .setDouble((long) idx * typeWidth, dict.decodeToDouble(reader.readInteger())); } } + + @Override + public void nextVals(FieldVector vector, int rowId, VectorizedValuesReader valuesReader, int total) { + valuesReader.readDoubles(total, vector, rowId); + } } class FloatReader extends NumericBaseReader { @@ -346,6 +360,11 @@ protected void nextDictEncodedVal( .setFloat((long) idx * typeWidth, dict.decodeToFloat(reader.readInteger())); } } + + @Override + public void nextVals(FieldVector vector, int rowId, VectorizedValuesReader valuesReader, int total) { + valuesReader.readFloats(total, vector, rowId); + } } class IntegerReader extends NumericBaseReader { @@ -375,6 +394,11 @@ protected void nextDictEncodedVal( .setInt((long) idx * typeWidth, dict.decodeToInt(reader.readInteger())); } } + + @Override + public void nextVals(FieldVector vector, int rowId, VectorizedValuesReader valuesReader, int total) { + valuesReader.readIntegers(total, vector, rowId); + } } abstract class BaseReader extends CommonReader { @@ -662,16 +686,15 @@ private void setNulls( @SuppressWarnings({"all"}) private void setNextNValuesInVector( - int typeWidth, NullabilityHolder nullabilityHolder, VectorizedValuesReader valuesReader, int bufferIdx, FieldVector vector, - int numValues) { + int numValues, + NumericBaseReader reader) { ArrowBuf validityBuffer = vector.getValidityBuffer(); if (currentValue == maxDefLevel) { - // TODO read the correct type not just hard-coded longs here - valuesReader.readFloats(numValues, vector, bufferIdx); + reader.nextVals(vector, bufferIdx, valuesReader, numValues); nullabilityHolder.setNotNulls(bufferIdx, numValues); if (setArrowValidityVector) { for (int i = 0; i < numValues; i++) { diff --git a/data/src/test/java/org/apache/iceberg/data/RandomGenericData.java b/data/src/test/java/org/apache/iceberg/data/RandomGenericData.java index eb3795fb8b79..4963052e0877 100644 --- 
a/data/src/test/java/org/apache/iceberg/data/RandomGenericData.java +++ b/data/src/test/java/org/apache/iceberg/data/RandomGenericData.java @@ -175,7 +175,6 @@ public abstract static class RandomDataGenerator private final Random random; private final float nullPercentage; - private int currentInt = 1; protected RandomDataGenerator(long seed) { this(seed, DEFAULT_NULL_PERCENTAGE); @@ -290,10 +289,6 @@ public Object primitive(Type.PrimitiveType primitive) { } else { return EPOCH.plus((long) result, NANOS).toLocalDateTime(); } - case INTEGER: - return currentInt++; - case LONG: - return (long)currentInt++; default: return result; } diff --git a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/data/vectorized/parquet/TestParquetVectorizedReads.java b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/data/vectorized/parquet/TestParquetVectorizedReads.java index 7f55843d706a..054d5c0c6b4a 100644 --- a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/data/vectorized/parquet/TestParquetVectorizedReads.java +++ b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/data/vectorized/parquet/TestParquetVectorizedReads.java @@ -300,11 +300,11 @@ public void testSupportedReadsForParquetV2() throws Exception { // (i.e. decimals > 8 bytes) Schema schema = new Schema( - optional(102, "float_data", Types.FloatType.get()) -// optional(103, "double_data", Types.DoubleType.get()), -// optional(104, "decimal_data", Types.DecimalType.of(25, 5)), -// optional(105, "int_data", Types.IntegerType.get()), -// optional(106, "long_data", Types.LongType.get()) + optional(102, "float_data", Types.FloatType.get()), + optional(103, "double_data", Types.DoubleType.get()), + optional(104, "decimal_data", Types.DecimalType.of(25, 5)), + optional(105, "int_data", Types.IntegerType.get()), + optional(106, "long_data", Types.LongType.get()) ); File dataFile = File.createTempFile("junit", null, temp.toFile()); @@ -319,8 +319,7 @@ public void testSupportedReadsForParquetV2() throws Exception { @Test public void testUnsupportedReadsForParquetV2() throws Exception { - // Longs, ints, string types etc. 
use delta encoding and which are not supported for vectorized - // reads + // Some types use delta encoding and are not supported for vectorized reads Schema schema = new Schema(SUPPORTED_PRIMITIVES.fields()); File dataFile = File.createTempFile("junit", null, temp.toFile()); assertThat(dataFile.delete()).as("Delete should succeed").isTrue(); From f8cfbb28f0a037c22628c10fc9dc16ec13976e4e Mon Sep 17 00:00:00 2001 From: Eric Maynard Date: Tue, 1 Jul 2025 13:56:04 -0700 Subject: [PATCH 20/31] polish --- ...ectorizedParquetDefinitionLevelReader.java | 31 ++++++------------- 1 file changed, 9 insertions(+), 22 deletions(-) diff --git a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedParquetDefinitionLevelReader.java b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedParquetDefinitionLevelReader.java index ef6f9c4ba2a0..e64a3b7e5a2d 100644 --- a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedParquetDefinitionLevelReader.java +++ b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedParquetDefinitionLevelReader.java @@ -92,27 +92,15 @@ public void nextBatch( typeWidth, numValsToRead, (mode, idx, numValues, byteArray, validityBuffer) -> { - switch (mode) { - case RLE: - nextRleBatch( - vector, - typeWidth, - nullabilityHolder, - valuesReader, - idx, - numValues, - byteArray); - break; - case PACKED: - nextPackedBatch( - vector, - typeWidth, - nullabilityHolder, - valuesReader, - idx, - numValues, - byteArray); - } + switch (mode) { + case RLE: + nextRleBatch( + vector, typeWidth, nullabilityHolder, valuesReader, idx, numValues, byteArray); + break; + case PACKED: + nextPackedBatch( + vector, typeWidth, nullabilityHolder, valuesReader, idx, numValues, byteArray); + } }); } @@ -684,7 +672,6 @@ private void setNulls( } } - @SuppressWarnings({"all"}) private void setNextNValuesInVector( NullabilityHolder nullabilityHolder, VectorizedValuesReader valuesReader, From 9d27297cc7736e9877b3907065c3cbda27e5eae8 Mon Sep 17 00:00:00 2001 From: Eric Maynard Date: Tue, 1 Jul 2025 13:57:35 -0700 Subject: [PATCH 21/31] stable: --- .../vectorized/parquet/VectorizedDeltaEncodedValuesReader.java | 3 --- .../data/vectorized/parquet/TestParquetVectorizedReads.java | 2 +- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedDeltaEncodedValuesReader.java b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedDeltaEncodedValuesReader.java index 66db9e9319c7..efe4b72ccb65 100644 --- a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedDeltaEncodedValuesReader.java +++ b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedDeltaEncodedValuesReader.java @@ -158,7 +158,6 @@ private void readValues( int remaining = total; // First value if (valuesRead == 0) { - System.out.println("#### (F) Wrote value " + firstValue + " to " + ((long) (rowId + valuesRead) * typeWidth)); outputWriter.write(vec, ((long) (rowId + valuesRead) * typeWidth), firstValue); lastValueRead = firstValue; rowId++; @@ -203,8 +203,6 @@ private int loadMiniBlockToOutput( // calculate values from deltas unpacked for current block long outValue = lastValueRead + minDeltaInCurrentBlock + unpackedValuesBuffer[i]; lastValueRead = outValue; - System.out.println("#### (O) Wrote value " + outValue + " to " + ((long) (rowId + valuesRead) * typeWidth) + - " vec IS null == " + (vec == null)); outputWriter.write(vec,
((long) (rowId + valuesRead) * typeWidth), outValue); remainingInBlock--; remainingInMiniBlock--; diff --git a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/data/vectorized/parquet/TestParquetVectorizedReads.java b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/data/vectorized/parquet/TestParquetVectorizedReads.java index 054d5c0c6b4a..f53a1ba1fc1f 100644 --- a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/data/vectorized/parquet/TestParquetVectorizedReads.java +++ b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/data/vectorized/parquet/TestParquetVectorizedReads.java @@ -310,7 +310,7 @@ public void testSupportedReadsForParquetV2() throws Exception { File dataFile = File.createTempFile("junit", null, temp.toFile()); assertThat(dataFile.delete()).as("Delete should succeed").isTrue(); Iterable data = - generateData(schema, 30000, 0L, 0, IDENTITY); + generateData(schema, 30000, 0L, RandomData.DEFAULT_NULL_PERCENTAGE, IDENTITY); try (FileAppender writer = getParquetV2Writer(schema, dataFile)) { writer.addAll(data); } From d75f85e1a532fd8bbada723ad4aa0e39ef415b63 Mon Sep 17 00:00:00 2001 From: Eric Maynard Date: Tue, 1 Jul 2025 13:59:46 -0700 Subject: [PATCH 22/31] spotless; polish --- ...ectorizedParquetDefinitionLevelReader.java | 20 ++++++++++--------- .../parquet/TestParquetVectorizedReads.java | 3 +-- 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedParquetDefinitionLevelReader.java b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedParquetDefinitionLevelReader.java index e64a3b7e5a2d..26872c686ec3 100644 --- a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedParquetDefinitionLevelReader.java +++ b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedParquetDefinitionLevelReader.java @@ -30,8 +30,6 @@ import org.apache.iceberg.arrow.vectorized.NullabilityHolder; import org.apache.iceberg.parquet.ParquetUtil; import org.apache.parquet.column.Dictionary; -import org.apache.parquet.column.values.ValuesReader; -import org.apache.parquet.column.values.plain.PlainValuesReader; public final class VectorizedParquetDefinitionLevelReader extends BaseVectorizedParquetValuesReader { @@ -95,11 +93,11 @@ public void nextBatch( switch (mode) { case RLE: nextRleBatch( - vector, typeWidth, nullabilityHolder, valuesReader, idx, numValues, byteArray); + vector, typeWidth, nullabilityHolder, valuesReader, idx, numValues, byteArray); break; case PACKED: nextPackedBatch( - vector, typeWidth, nullabilityHolder, valuesReader, idx, numValues, byteArray); + vector, typeWidth, nullabilityHolder, valuesReader, idx, numValues, byteArray); } }); } @@ -256,7 +254,7 @@ protected abstract void nextVal( FieldVector vector, int idx, VectorizedValuesReader valuesReader, Mode mode); public abstract void nextVals( - FieldVector vector, int rowId, VectorizedValuesReader valuesReader, int total); + FieldVector vector, int rowId, VectorizedValuesReader valuesReader, int total); } class LongReader extends NumericBaseReader { @@ -286,7 +284,8 @@ protected void nextDictEncodedVal( } @Override - public void nextVals(FieldVector vector, int rowId, VectorizedValuesReader valuesReader, int total) { + public void nextVals( + FieldVector vector, int rowId, VectorizedValuesReader valuesReader, int total) { valuesReader.readLongs(total, vector, rowId); } } @@ -318,7 +317,8 @@ protected void nextDictEncodedVal( } @Override - public void 
nextVals(FieldVector vector, int rowId, VectorizedValuesReader valuesReader, int total) { + public void nextVals( + FieldVector vector, int rowId, VectorizedValuesReader valuesReader, int total) { valuesReader.readDoubles(total, vector, rowId); } } @@ -350,7 +350,8 @@ protected void nextDictEncodedVal( } @Override - public void nextVals(FieldVector vector, int rowId, VectorizedValuesReader valuesReader, int total) { + public void nextVals( + FieldVector vector, int rowId, VectorizedValuesReader valuesReader, int total) { valuesReader.readFloats(total, vector, rowId); } } @@ -384,7 +385,8 @@ protected void nextDictEncodedVal( } @Override - public void nextVals(FieldVector vector, int rowId, VectorizedValuesReader valuesReader, int total) { + public void nextVals( + FieldVector vector, int rowId, VectorizedValuesReader valuesReader, int total) { valuesReader.readIntegers(total, vector, rowId); } } diff --git a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/data/vectorized/parquet/TestParquetVectorizedReads.java b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/data/vectorized/parquet/TestParquetVectorizedReads.java index f53a1ba1fc1f..b9fba3813123 100644 --- a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/data/vectorized/parquet/TestParquetVectorizedReads.java +++ b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/data/vectorized/parquet/TestParquetVectorizedReads.java @@ -25,7 +25,6 @@ import static org.assertj.core.api.Assumptions.assumeThat; import java.io.File; -import java.io.FileInputStream; import java.io.IOException; import java.util.Iterator; import org.apache.iceberg.Files; @@ -297,7 +296,7 @@ public void testReadsForTypePromotedColumns() throws Exception { public void testSupportedReadsForParquetV2() throws Exception { // Float and double column types are written using plain encoding with Parquet V2, // also Parquet V2 will dictionary encode decimals that use fixed length binary - // (i.e. decimals > 8 bytes) + // (i.e. decimals > 8 bytes). Int and long types use DELTA_BINARY_PACKED. 
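One way to confirm which encodings the V2 writer actually produced is to inspect the file footer through parquet-mr's metadata classes (ParquetFileReader and HadoopInputFile from parquet-hadoop, BlockMetaData and ColumnChunkMetaData from org.apache.parquet.hadoop.metadata); a sketch that assumes a Hadoop path and conf are in scope, not part of this patch:

try (ParquetFileReader reader =
    ParquetFileReader.open(HadoopInputFile.fromPath(path, conf))) {
  for (BlockMetaData block : reader.getFooter().getBlocks()) {
    for (ColumnChunkMetaData column : block.getColumns()) {
      // Expect DELTA_BINARY_PACKED among the int/long column encodings.
      System.out.println(column.getPath() + " -> " + column.getEncodings());
    }
  }
}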
Schema schema = new Schema( optional(102, "float_data", Types.FloatType.get()), From 03f63953850b46318a26e6c100380e49bd3eda56 Mon Sep 17 00:00:00 2001 From: Eric Maynard Date: Tue, 1 Jul 2025 16:34:55 -0700 Subject: [PATCH 23/31] spotless --- .../data/vectorized/parquet/TestParquetVectorizedReads.java | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/data/vectorized/parquet/TestParquetVectorizedReads.java b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/data/vectorized/parquet/TestParquetVectorizedReads.java index b9fba3813123..d7f7f9c68d21 100644 --- a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/data/vectorized/parquet/TestParquetVectorizedReads.java +++ b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/data/vectorized/parquet/TestParquetVectorizedReads.java @@ -50,7 +50,6 @@ import org.apache.parquet.schema.GroupType; import org.apache.parquet.schema.MessageType; import org.apache.parquet.schema.Type; -import org.apache.spark.sql.catalyst.plans.logical.Except; import org.apache.spark.sql.vectorized.ColumnarBatch; import org.junit.jupiter.api.Test; @@ -303,8 +302,7 @@ public void testSupportedReadsForParquetV2() throws Exception { optional(103, "double_data", Types.DoubleType.get()), optional(104, "decimal_data", Types.DecimalType.of(25, 5)), optional(105, "int_data", Types.IntegerType.get()), - optional(106, "long_data", Types.LongType.get()) - ); + optional(106, "long_data", Types.LongType.get())); File dataFile = File.createTempFile("junit", null, temp.toFile()); assertThat(dataFile.delete()).as("Delete should succeed").isTrue(); From c39570d6e7c44d721005bf4c2132e8c5ce28a65d Mon Sep 17 00:00:00 2001 From: Eric Maynard Date: Tue, 1 Jul 2025 17:12:27 -0700 Subject: [PATCH 24/31] fix lints --- .../VectorizedDeltaEncodedValuesReader.java | 43 ++++++++++--------- 1 file changed, 22 insertions(+), 21 deletions(-) diff --git a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedDeltaEncodedValuesReader.java b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedDeltaEncodedValuesReader.java index efe4b72ccb65..8f7b6f20057e 100644 --- a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedDeltaEncodedValuesReader.java +++ b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedDeltaEncodedValuesReader.java @@ -22,7 +22,7 @@ import java.nio.ByteBuffer; import java.util.Arrays; import org.apache.arrow.vector.FieldVector; -import org.apache.parquet.Preconditions; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; import org.apache.parquet.bytes.ByteBufferInputStream; import org.apache.parquet.bytes.BytesUtils; import org.apache.parquet.column.values.ValuesReader; @@ -65,7 +65,7 @@ public class VectorizedDeltaEncodedValuesReader extends ValuesReader private int remainingInMiniBlock = 0; // values in current mini block still to be read private long[] unpackedValuesBuffer; - private ByteBufferInputStream in; + private ByteBufferInputStream inputStream; // temporary buffers used by readInteger and readLong private int intVal; @@ -75,20 +75,20 @@ public class VectorizedDeltaEncodedValuesReader extends ValuesReader public void initFromPage(int valueCount, ByteBufferInputStream in) throws IOException { Preconditions.checkArgument( valueCount >= 1, "Page must have at least one value, but it has " + valueCount); - this.in = in; + this.inputStream = in; // Read the header - this.blockSizeInValues = 
BytesUtils.readUnsignedVarInt(this.in); - this.miniBlockNumInABlock = BytesUtils.readUnsignedVarInt(this.in); + this.blockSizeInValues = BytesUtils.readUnsignedVarInt(this.inputStream); + this.miniBlockNumInABlock = BytesUtils.readUnsignedVarInt(this.inputStream); double miniSize = (double) blockSizeInValues / miniBlockNumInABlock; Preconditions.checkArgument( miniSize % 8 == 0, "miniBlockSize must be multiple of 8, but it's " + miniSize); this.miniBlockSizeInValues = (int) miniSize; // True value count. May be less than valueCount because of nulls - this.totalValueCount = BytesUtils.readUnsignedVarInt(this.in); + this.totalValueCount = BytesUtils.readUnsignedVarInt(this.inputStream); this.bitWidths = new int[miniBlockNumInABlock]; this.unpackedValuesBuffer = new long[miniBlockSizeInValues]; // read the first value - firstValue = BytesUtils.readZigZagVarLong(this.in); + firstValue = BytesUtils.readZigZagVarLong(this.inputStream); } @Override @@ -156,22 +156,23 @@ private void readValues( + " more."); } int remaining = total; + int currentRowId = rowId; // First value if (valuesRead == 0) { - outputWriter.write(vec, ((long) (rowId + valuesRead) * typeWidth), firstValue); + outputWriter.write(vec, ((long) (currentRowId + valuesRead) * typeWidth), firstValue); lastValueRead = firstValue; - rowId++; + currentRowId++; remaining--; } while (remaining > 0) { - int n; + int loadedRows; try { - n = loadMiniBlockToOutput(remaining, vec, rowId, typeWidth, outputWriter); + loadedRows = loadMiniBlockToOutput(remaining, vec, currentRowId, typeWidth, outputWriter); } catch (IOException e) { throw new ParquetDecodingException("Error reading mini block.", e); } - rowId += n; - remaining -= n; + currentRowId += loadedRows; + remaining -= loadedRows; } valuesRead = total - remaining; } @@ -196,25 +197,25 @@ private int loadMiniBlockToOutput( } // read values from miniblock - int valuesRead = 0; + int valuesReadInMiniBlock = 0; for (int i = miniBlockSizeInValues - remainingInMiniBlock; - i < miniBlockSizeInValues && valuesRead < remaining; + i < miniBlockSizeInValues && valuesReadInMiniBlock < remaining; i++) { // calculate values from deltas unpacked for current block long outValue = lastValueRead + minDeltaInCurrentBlock + unpackedValuesBuffer[i]; lastValueRead = outValue; - outputWriter.write(vec, ((long) (rowId + valuesRead) * typeWidth), outValue); + outputWriter.write(vec, ((long) (rowId + valuesReadInMiniBlock) * typeWidth), outValue); remainingInBlock--; remainingInMiniBlock--; - valuesRead++; + valuesReadInMiniBlock++; } - return valuesRead; + return valuesReadInMiniBlock; } private void readBlockHeader() { try { - minDeltaInCurrentBlock = BytesUtils.readZigZagVarLong(in); + minDeltaInCurrentBlock = BytesUtils.readZigZagVarLong(inputStream); } catch (IOException e) { throw new ParquetDecodingException("Can not read min delta in current block", e); } @@ -234,7 +235,7 @@ private void unpackMiniBlock() throws IOException { BytePackerForLong packer = Packer.LITTLE_ENDIAN.newBytePackerForLong(bitWidths[currentMiniBlock]); for (int j = 0; j < miniBlockSizeInValues; j += 8) { - ByteBuffer buffer = in.slice(packer.getBitWidth()); + ByteBuffer buffer = inputStream.slice(packer.getBitWidth()); if (buffer.hasArray()) { packer.unpack8Values( buffer.array(), buffer.arrayOffset() + buffer.position(), unpackedValuesBuffer, j); @@ -250,7 +251,7 @@ private void unpackMiniBlock() throws IOException { private void readBitWidthsForMiniBlocks() { for (int i = 0; i < miniBlockNumInABlock; i++) { try { - bitWidths[i] = 
BytesUtils.readIntLittleEndianOnOneByte(in); + bitWidths[i] = BytesUtils.readIntLittleEndianOnOneByte(inputStream); } catch (IOException e) { throw new ParquetDecodingException("Can not decode bitwidth in block header", e); } From 3a73ecc2dcb0864109485454951b9991504d0826 Mon Sep 17 00:00:00 2001 From: Eric Maynard Date: Tue, 15 Jul 2025 07:05:19 -0700 Subject: [PATCH 25/31] review comments --- LICENSE | 1 + .../parquet/VectorizedDeltaEncodedValuesReader.java | 11 ++++++----- .../java/org/apache/iceberg/parquet/PageIterator.java | 7 +++++++ 3 files changed, 14 insertions(+), 5 deletions(-) diff --git a/LICENSE b/LICENSE index 34511297cd8a..80cfd3652e69 100644 --- a/LICENSE +++ b/LICENSE @@ -288,6 +288,7 @@ This product includes code from Apache Spark. * casting logic in AssignmentAlignmentSupport * implementation of SetAccumulator. * Connector expressions. +* implementation of VectorizedDeltaEncodedValuesReader Copyright: 2011-2018 The Apache Software Foundation Home page: https://spark.apache.org/ diff --git a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedDeltaEncodedValuesReader.java b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedDeltaEncodedValuesReader.java index 8f7b6f20057e..d3f5c5cce809 100644 --- a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedDeltaEncodedValuesReader.java +++ b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedDeltaEncodedValuesReader.java @@ -44,7 +44,7 @@ public class VectorizedDeltaEncodedValuesReader extends ValuesReader // header data private int blockSizeInValues; - private int miniBlockNumInABlock; + private int miniBlocksPerBlock; private int totalValueCount; private long firstValue; @@ -78,14 +78,14 @@ public void initFromPage(int valueCount, ByteBufferInputStream in) throws IOExce this.inputStream = in; // Read the header this.blockSizeInValues = BytesUtils.readUnsignedVarInt(this.inputStream); - this.miniBlockNumInABlock = BytesUtils.readUnsignedVarInt(this.inputStream); - double miniSize = (double) blockSizeInValues / miniBlockNumInABlock; + this.miniBlocksPerBlock = BytesUtils.readUnsignedVarInt(this.inputStream); + double miniSize = (double) blockSizeInValues / miniBlocksPerBlock; Preconditions.checkArgument( miniSize % 8 == 0, "miniBlockSize must be multiple of 8, but it's " + miniSize); this.miniBlockSizeInValues = (int) miniSize; // True value count. 
May be less than valueCount because of nulls this.totalValueCount = BytesUtils.readUnsignedVarInt(this.inputStream); - this.bitWidths = new int[miniBlockNumInABlock]; + this.bitWidths = new int[miniBlocksPerBlock]; this.unpackedValuesBuffer = new long[miniBlockSizeInValues]; // read the first value firstValue = BytesUtils.readZigZagVarLong(this.inputStream); @@ -164,6 +164,7 @@ private void readValues( currentRowId++; remaining--; } + while (remaining > 0) { int loadedRows; try { @@ -249,7 +250,7 @@ private void unpackMiniBlock() throws IOException { // From org.apache.parquet.column.values.delta.DeltaBinaryPackingValuesReader private void readBitWidthsForMiniBlocks() { - for (int i = 0; i < miniBlockNumInABlock; i++) { + for (int i = 0; i < miniBlocksPerBlock; i++) { try { bitWidths[i] = BytesUtils.readIntLittleEndianOnOneByte(inputStream); } catch (IOException e) { diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/PageIterator.java b/parquet/src/main/java/org/apache/iceberg/parquet/PageIterator.java index a68d2f9b82e7..bff13603002f 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/PageIterator.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/PageIterator.java @@ -257,6 +257,13 @@ protected void initDataReader(Encoding dataEncoding, ByteBufferInputStream in, i } else { this.values = dataEncoding.getValuesReader(desc, ValuesType.VALUES); } + + // if (dataEncoding.usesDictionary() && converter.hasDictionarySupport()) { + // bindToDictionary(dictionary); + // } else { + // bind(path.getType()); + // } + try { values.initFromPage(valueCount, in); } catch (IOException e) { From 44a81ac5c0c12598d8c63d9934f9158b418a5c9b Mon Sep 17 00:00:00 2001 From: Eric Maynard Date: Mon, 21 Jul 2025 16:35:47 -0700 Subject: [PATCH 26/31] amogh comments --- .../VectorizedDeltaEncodedValuesReader.java | 1 + ...ectorizedParquetDefinitionLevelReader.java | 43 +++++++++---------- 2 files changed, 22 insertions(+), 22 deletions(-) diff --git a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedDeltaEncodedValuesReader.java b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedDeltaEncodedValuesReader.java index d3f5c5cce809..60901d134944 100644 --- a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedDeltaEncodedValuesReader.java +++ b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedDeltaEncodedValuesReader.java @@ -155,6 +155,7 @@ private void readValues( + total + " more."); } + int remaining = total; int currentRowId = rowId; // First value diff --git a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedParquetDefinitionLevelReader.java b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedParquetDefinitionLevelReader.java index 26872c686ec3..38f74b096323 100644 --- a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedParquetDefinitionLevelReader.java +++ b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedParquetDefinitionLevelReader.java @@ -223,7 +223,7 @@ protected void nextRleBatch( int idx, int numValues, byte[] byteArray) { - setNextNValuesInVector(nullabilityHolder, valuesReader, idx, vector, numValues, this); + setNextNValuesInVector(nullabilityHolder, valuesReader, idx, vector, numValues); } @Override @@ -255,6 +255,26 @@ protected abstract void nextVal( public abstract void nextVals( FieldVector vector, int rowId, VectorizedValuesReader valuesReader, int total); + + private void 
setNextNValuesInVector( + NullabilityHolder nullabilityHolder, + VectorizedValuesReader valuesReader, + int bufferIdx, + FieldVector vector, + int numValues) { + ArrowBuf validityBuffer = vector.getValidityBuffer(); + if (currentValue == maxDefLevel) { + nextVals(vector, bufferIdx, valuesReader, numValues); + nullabilityHolder.setNotNulls(bufferIdx, numValues); + if (setArrowValidityVector) { + for (int i = 0; i < numValues; i++) { + BitVectorHelper.setBit(validityBuffer, bufferIdx + i); + } + } + } else { + setNulls(nullabilityHolder, bufferIdx, numValues, validityBuffer); + } + } } class LongReader extends NumericBaseReader { @@ -674,27 +694,6 @@ private void setNulls( } } - private void setNextNValuesInVector( - NullabilityHolder nullabilityHolder, - VectorizedValuesReader valuesReader, - int bufferIdx, - FieldVector vector, - int numValues, - NumericBaseReader reader) { - ArrowBuf validityBuffer = vector.getValidityBuffer(); - if (currentValue == maxDefLevel) { - reader.nextVals(vector, bufferIdx, valuesReader, numValues); - nullabilityHolder.setNotNulls(bufferIdx, numValues); - if (setArrowValidityVector) { - for (int i = 0; i < numValues; i++) { - BitVectorHelper.setBit(validityBuffer, bufferIdx + i); - } - } - } else { - setNulls(nullabilityHolder, bufferIdx, numValues, validityBuffer); - } - } - LongReader longReader() { return new LongReader(); } From d5847532752d15569510ffe020afd8d2ebd9de7f Mon Sep 17 00:00:00 2001 From: Eric Maynard Date: Mon, 21 Jul 2025 16:37:53 -0700 Subject: [PATCH 27/31] russell comments --- .../parquet/VectorizedDeltaEncodedValuesReader.java | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedDeltaEncodedValuesReader.java b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedDeltaEncodedValuesReader.java index 60901d134944..0d2c183706ad 100644 --- a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedDeltaEncodedValuesReader.java +++ b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedDeltaEncodedValuesReader.java @@ -91,11 +91,13 @@ public void initFromPage(int valueCount, ByteBufferInputStream in) throws IOExce firstValue = BytesUtils.readZigZagVarLong(this.inputStream); } + /** DELTA_BINARY_PACKED only supports INT32 & INT64 */ @Override public byte readByte() { throw new UnsupportedOperationException("readByte is not supported"); } + /** DELTA_BINARY_PACKED only supports INT32 & INT64 */ @Override public short readShort() { throw new UnsupportedOperationException("readShort is not supported"); @@ -113,11 +115,13 @@ public long readLong() { return longVal; } + /** The Iceberg reader currently does not do skipping */ @Override public void skip() { throw new UnsupportedOperationException("skip is not supported"); } + /** DELTA_BINARY_PACKED only supports INT32 & INT64 */ @Override public Binary readBinary(int len) { throw new UnsupportedOperationException("readBinary is not supported"); @@ -133,11 +137,13 @@ public void readLongs(int total, FieldVector vec, int rowId) { readValues(total, vec, rowId, LONG_SIZE, (f, i, v) -> f.getDataBuffer().setLong(i, v)); } + /** DELTA_BINARY_PACKED only supports INT32 & INT64 */ @Override public void readFloats(int total, FieldVector vec, int rowId) { throw new UnsupportedOperationException("readFloats is not supported"); } + /** DELTA_BINARY_PACKED only supports INT32 & INT64 */ @Override public void readDoubles(int total, FieldVector vec, int rowId) { throw new 
UnsupportedOperationException("readDoubles is not supported"); From d253f1be16761b6a6400599309f93120fcd6b1bd Mon Sep 17 00:00:00 2001 From: Eric Maynard Date: Mon, 21 Jul 2025 16:42:41 -0700 Subject: [PATCH 28/31] spotless --- .../VectorizedParquetDefinitionLevelReader.java | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedParquetDefinitionLevelReader.java b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedParquetDefinitionLevelReader.java index 38f74b096323..c7dbe8de7b92 100644 --- a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedParquetDefinitionLevelReader.java +++ b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedParquetDefinitionLevelReader.java @@ -257,11 +257,11 @@ public abstract void nextVals( FieldVector vector, int rowId, VectorizedValuesReader valuesReader, int total); private void setNextNValuesInVector( - NullabilityHolder nullabilityHolder, - VectorizedValuesReader valuesReader, - int bufferIdx, - FieldVector vector, - int numValues) { + NullabilityHolder nullabilityHolder, + VectorizedValuesReader valuesReader, + int bufferIdx, + FieldVector vector, + int numValues) { ArrowBuf validityBuffer = vector.getValidityBuffer(); if (currentValue == maxDefLevel) { nextVals(vector, bufferIdx, valuesReader, numValues); From e0b505b705f6e33c5c9c96d8e490063abca8bae1 Mon Sep 17 00:00:00 2001 From: Eric Maynard Date: Tue, 22 Jul 2025 10:33:23 -0700 Subject: [PATCH 29/31] retry docs From 97a315ee5a11e175f5719fbf8bb713e1f9acdf8f Mon Sep 17 00:00:00 2001 From: Eric Maynard Date: Tue, 22 Jul 2025 10:34:12 -0700 Subject: [PATCH 30/31] javadoc fix --- .../parquet/VectorizedDeltaEncodedValuesReader.java | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedDeltaEncodedValuesReader.java b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedDeltaEncodedValuesReader.java index 0d2c183706ad..c114f4b85a1b 100644 --- a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedDeltaEncodedValuesReader.java +++ b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedDeltaEncodedValuesReader.java @@ -91,13 +91,13 @@ public void initFromPage(int valueCount, ByteBufferInputStream in) throws IOExce firstValue = BytesUtils.readZigZagVarLong(this.inputStream); } - /** DELTA_BINARY_PACKED only supports INT32 & INT64 */ + /** DELTA_BINARY_PACKED only supports INT32 and INT64 */ @Override public byte readByte() { throw new UnsupportedOperationException("readByte is not supported"); } - /** DELTA_BINARY_PACKED only supports INT32 & INT64 */ + /** DELTA_BINARY_PACKED only supports INT32 and INT64 */ @Override public short readShort() { throw new UnsupportedOperationException("readShort is not supported"); @@ -121,7 +121,7 @@ public void skip() { throw new UnsupportedOperationException("skip is not supported"); } - /** DELTA_BINARY_PACKED only supports INT32 & INT64 */ + /** DELTA_BINARY_PACKED only supports INT32 and INT64 */ @Override public Binary readBinary(int len) { throw new UnsupportedOperationException("readBinary is not supported"); @@ -137,13 +137,13 @@ public void readLongs(int total, FieldVector vec, int rowId) { readValues(total, vec, rowId, LONG_SIZE, (f, i, v) -> f.getDataBuffer().setLong(i, v)); } - /** DELTA_BINARY_PACKED only supports INT32 & INT64 */ + /** 
DELTA_BINARY_PACKED only supports INT32 and INT64 */ @Override public void readFloats(int total, FieldVector vec, int rowId) { throw new UnsupportedOperationException("readFloats is not supported"); } - /** DELTA_BINARY_PACKED only supports INT32 & INT64 */ + /** DELTA_BINARY_PACKED only supports INT32 and INT64 */ @Override public void readDoubles(int total, FieldVector vec, int rowId) { throw new UnsupportedOperationException("readDoubles is not supported"); From dfb7b77599689fd0f7d2d1a2a363b1f025de053a Mon Sep 17 00:00:00 2001 From: Eric Maynard Date: Wed, 30 Jul 2025 00:42:07 +0900 Subject: [PATCH 31/31] putInt --- .../vectorized/parquet/VectorizedDeltaEncodedValuesReader.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedDeltaEncodedValuesReader.java b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedDeltaEncodedValuesReader.java index c114f4b85a1b..115518e1fb50 100644 --- a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedDeltaEncodedValuesReader.java +++ b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedDeltaEncodedValuesReader.java @@ -129,7 +129,7 @@ public Binary readBinary(int len) { @Override public void readIntegers(int total, FieldVector vec, int rowId) { - readValues(total, vec, rowId, INT_SIZE, (f, i, v) -> f.getDataBuffer().setLong(i, v)); + readValues(total, vec, rowId, INT_SIZE, (f, i, v) -> f.getDataBuffer().setInt(i, (int) v)); } @Override
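
A note on the DELTA_BINARY_PACKED layout that the reader in these patches decodes. A page
starts with a header of four values: <block size in values> <mini-blocks per block>
<total value count> <first value>; the three counts are unsigned varints and the first
value is a zigzag varlong. Each block that follows carries a zigzag varlong minimum
delta, one bit-width byte per mini-block, and then the mini-blocks as little-endian
bit-packed deltas. Every decoded value is previous + minDelta + packedDelta, which is the
expression loadMiniBlockToOutput evaluates above.

The following is a minimal, self-contained Java sketch of that layout, not the patch's
VectorizedDeltaEncodedValuesReader: the class name and the long[] API are invented for
this note, it decodes into an array instead of writing through an Arrow FieldVector, and
it keeps no cross-call state, so it shows only the header/block/mini-block mechanics.

    import java.nio.ByteBuffer;
    import java.util.Arrays;

    // Illustrative sketch only; the class name and long[] API are invented for
    // this note and do not exist in the patch.
    public class DeltaBinaryPackedSketch {

      static int readUnsignedVarInt(ByteBuffer in) {
        int value = 0;
        int shift = 0;
        int current;
        do {
          current = in.get() & 0xff;
          value |= (current & 0x7f) << shift;
          shift += 7;
        } while ((current & 0x80) != 0);
        return value;
      }

      static long readZigZagVarLong(ByteBuffer in) {
        long value = 0;
        int shift = 0;
        int current;
        do {
          current = in.get() & 0xff;
          value |= (long) (current & 0x7f) << shift;
          shift += 7;
        } while ((current & 0x80) != 0);
        return (value >>> 1) ^ -(value & 1); // zigzag decode
      }

      // LSB-first unpack of one value, matching Parquet's little-endian bit packing
      static long readPacked(byte[] bytes, int startBit, int width) {
        long result = 0;
        for (int i = 0; i < width; i++) {
          int bit = startBit + i;
          if ((bytes[bit >>> 3] & (1 << (bit & 7))) != 0) {
            result |= 1L << i;
          }
        }
        return result;
      }

      static long[] decode(byte[] page) {
        ByteBuffer in = ByteBuffer.wrap(page);
        // Page header: <block size> <mini-blocks per block> <total count> <first value>
        int blockSize = readUnsignedVarInt(in);
        int miniBlocksPerBlock = readUnsignedVarInt(in);
        int totalCount = readUnsignedVarInt(in);
        long previous = readZigZagVarLong(in);
        int miniBlockSize = blockSize / miniBlocksPerBlock; // always a multiple of 8

        long[] out = new long[totalCount];
        out[0] = previous;
        int read = 1;
        while (read < totalCount) {
          // Block header: <zigzag varlong min delta> <one bit-width byte per mini-block>
          long minDelta = readZigZagVarLong(in);
          int[] bitWidths = new int[miniBlocksPerBlock];
          for (int i = 0; i < miniBlocksPerBlock; i++) {
            bitWidths[i] = in.get() & 0xff;
          }
          // Mini-blocks are stored full-size; trailing ones with no values are omitted
          for (int m = 0; m < miniBlocksPerBlock && read < totalCount; m++) {
            int width = bitWidths[m];
            byte[] packed = new byte[miniBlockSize * width / 8];
            in.get(packed);
            for (int i = 0; i < miniBlockSize && read < totalCount; i++) {
              previous += minDelta + readPacked(packed, i * width, width);
              out[read++] = previous;
            }
          }
        }
        return out;
      }

      public static void main(String[] args) {
        // Values 1..10: nine deltas of 1, so minDelta = 1 and every packed delta
        // is zero, giving bit width 0 and no mini-block payload bytes at all.
        byte[] page = {
          (byte) 0x80, 0x01,     // block size = 128 values
          0x04,                  // 4 mini-blocks per block (mini-block size = 32)
          0x0a,                  // total value count = 10
          0x02,                  // first value = zigzag(1)
          0x02,                  // min delta = zigzag(1)
          0x00, 0x00, 0x00, 0x00 // bit widths of the 4 mini-blocks
        };
        System.out.println(Arrays.toString(decode(page))); // [1, 2, ..., 10]
      }
    }

Running main prints 1 through 10: the whole page costs ten bytes because a bit width of
zero needs no payload. This is also why the reader in the patch tracks remainingInBlock
and remainingInMiniBlock rather than deriving positions from the value count alone.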