Spark: Support Parquet dictionary encoded UUIDs
While fixing some issues on the PyIceberg end to fully support UUIDs:
apache/iceberg-python#2007

I noticed this issue and was surprised, since UUIDs used to work with
Spark. It turns out that reading dictionary-encoded UUIDs was not
implemented yet.

For PyIceberg we only generate small amounts of data, so this wasn't
caught previously.
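
For context, here is a minimal sketch of the decode path this commit wires up. The class and method names (`DictionaryUuidDecodeSketch`, `decode`) are hypothetical; the real logic lives in `DictionaryBinaryAccessor` and the Spark `StringFactory` in the diff below. The key point is that the Arrow vector stores dictionary ids, and each id must be resolved through the Parquet `Dictionary` to the 16-byte UUID value before rendering it as a string.

import org.apache.arrow.vector.IntVector;
import org.apache.iceberg.util.UUIDUtil;
import org.apache.parquet.column.Dictionary;
import org.apache.spark.unsafe.types.UTF8String;

class DictionaryUuidDecodeSketch {
  // Resolve the dictionary id stored at rowId, then render the 16-byte
  // fixed-length UUID value as Spark's UTF8String.
  static UTF8String decode(IntVector offsetVector, Dictionary dictionary, int rowId) {
    int dictionaryId = offsetVector.get(rowId); // the vector holds ids, not values
    byte[] bytes = dictionary.decodeToBinary(dictionaryId).getBytes(); // 16 raw bytes
    return UTF8String.fromString(UUIDUtil.convert(bytes).toString());
  }
}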
Fokko committed Jun 16, 2025
commit f033e4d32ccbc79399f0aa966a0cc2754005bfd6
@@ -156,7 +156,8 @@ public ArrowVectorAccessor<DecimalT, Utf8StringT, ArrayT, ChildVectorT> getVectorAccessor(
      switch (primitive.getPrimitiveTypeName()) {
        case FIXED_LEN_BYTE_ARRAY:
        case BINARY:
-         return new DictionaryBinaryAccessor<>((IntVector) vector, dictionary);
+         return new DictionaryBinaryAccessor<>(
+             (IntVector) vector, dictionary, stringFactorySupplier.get());
        case FLOAT:
          return new DictionaryFloatAccessor<>((IntVector) vector, dictionary);
        case INT64:
@@ -452,17 +453,27 @@ private static class DictionaryBinaryAccessor<
      extends ArrowVectorAccessor<DecimalT, Utf8StringT, ArrayT, ChildVectorT> {
    private final IntVector offsetVector;
    private final Dictionary dictionary;
+   private final StringFactory<Utf8StringT> stringFactory;

-   DictionaryBinaryAccessor(IntVector vector, Dictionary dictionary) {
+   DictionaryBinaryAccessor(
+       IntVector vector, Dictionary dictionary, StringFactory<Utf8StringT> stringFactory) {
      super(vector);
      this.offsetVector = vector;
      this.dictionary = dictionary;
+     this.stringFactory = stringFactory;
    }

    @Override
    public final byte[] getBinary(int rowId) {
      return dictionary.decodeToBinary(offsetVector.get(rowId)).getBytes();
    }
+
+   @Override
+   public Utf8StringT getUTF8String(int rowId) {
+     return null == stringFactory
+         ? super.getUTF8String(rowId)
+         : stringFactory.ofRow(offsetVector, dictionary, rowId);
+   }
  }

  private static class DictionaryTimestampInt96Accessor<
@@ -815,6 +826,13 @@ default Utf8StringT ofRow(FixedSizeBinaryVector vector, int rowId) {
              getGenericClass().getSimpleName()));
    }

+   /** Create a UTF8 String from the row value in the Dictionary. */
+   default Utf8StringT ofRow(IntVector offsetVector, Dictionary dictionary, int rowId) {
+     throw new UnsupportedOperationException(
+         String.format(
+             "Creating %s from a Dictionary is not supported", getGenericClass().getSimpleName()));
+   }
+
    /** Create a UTF8 String from the byte array. */
    Utf8StringT ofBytes(byte[] bytes);

@@ -22,11 +22,13 @@
  import java.nio.ByteBuffer;
  import org.apache.arrow.memory.ArrowBuf;
  import org.apache.arrow.vector.FixedSizeBinaryVector;
+ import org.apache.arrow.vector.IntVector;
  import org.apache.arrow.vector.ValueVector;
  import org.apache.arrow.vector.VarCharVector;
  import org.apache.arrow.vector.complex.ListVector;
  import org.apache.iceberg.arrow.vectorized.GenericArrowVectorAccessorFactory;
  import org.apache.iceberg.util.UUIDUtil;
+ import org.apache.parquet.column.Dictionary;
  import org.apache.spark.sql.types.Decimal;
  import org.apache.spark.sql.vectorized.ArrowColumnVector;
  import org.apache.spark.sql.vectorized.ColumnarArray;
@@ -81,6 +83,12 @@ public UTF8String ofRow(FixedSizeBinaryVector vector, int rowId) {
      return UTF8String.fromString(UUIDUtil.convert(vector.get(rowId)).toString());
    }

+   @Override
+   public UTF8String ofRow(IntVector offsetVector, Dictionary dictionary, int rowId) {
+     byte[] bytes = dictionary.decodeToBinary(offsetVector.get(rowId)).getBytes();
+     return UTF8String.fromString(UUIDUtil.convert(bytes).toString());
+   }
+
    @Override
    public UTF8String ofBytes(byte[] bytes) {
      return UTF8String.fromBytes(bytes);