diff --git a/api/src/main/java/org/apache/iceberg/variants/Variant.java b/api/src/main/java/org/apache/iceberg/variants/Variant.java index c9e19b2ec033..75466dc04055 100644 --- a/api/src/main/java/org/apache/iceberg/variants/Variant.java +++ b/api/src/main/java/org/apache/iceberg/variants/Variant.java @@ -22,6 +22,10 @@ /** A variant metadata and value pair. */ public interface Variant { + + /** The current version of the Variant spec */ + byte VARIANT_SPEC_VERSION = (byte) 1; + /** Returns the metadata for all values in the variant. */ VariantMetadata metadata(); diff --git a/gradle/libs.versions.toml b/gradle/libs.versions.toml index 865344549064..eeabe54f5f05 100644 --- a/gradle/libs.versions.toml +++ b/gradle/libs.versions.toml @@ -77,7 +77,7 @@ nessie = "0.104.5" netty-buffer = "4.2.4.Final" object-client-bundle = "3.3.2" orc = "1.9.7" -parquet = "1.15.2" +parquet = "1.16.0" roaringbitmap = "1.3.0" scala-collection-compat = "2.13.0" slf4j = "2.0.17" diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/TypeToMessageType.java b/parquet/src/main/java/org/apache/iceberg/parquet/TypeToMessageType.java index a1cb10ca8c57..d648cbf0694b 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/TypeToMessageType.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/TypeToMessageType.java @@ -42,6 +42,7 @@ import org.apache.iceberg.types.Types.StructType; import org.apache.iceberg.types.Types.TimestampNanoType; import org.apache.iceberg.types.Types.TimestampType; +import org.apache.iceberg.variants.Variant; import org.apache.parquet.schema.GroupType; import org.apache.parquet.schema.LogicalTypeAnnotation; import org.apache.parquet.schema.LogicalTypeAnnotation.TimeUnit; @@ -182,6 +183,7 @@ public Type variant(Type.Repetition repetition, int id, String originalName) { shreddedType.getRepetition()); return Types.buildGroup(repetition) + .as(LogicalTypeAnnotation.variantType(Variant.VARIANT_SPEC_VERSION)) .id(id) .required(BINARY) .named(METADATA) @@ -192,6 +194,7 @@ public Type variant(Type.Repetition repetition, int id, String originalName) { } else { return Types.buildGroup(repetition) + .as(LogicalTypeAnnotation.variantType(Variant.VARIANT_SPEC_VERSION)) .id(id) .required(BINARY) .named(METADATA) diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/TypeWithSchemaVisitor.java b/parquet/src/main/java/org/apache/iceberg/parquet/TypeWithSchemaVisitor.java index fa27831a476a..4ab454829765 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/TypeWithSchemaVisitor.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/TypeWithSchemaVisitor.java @@ -61,7 +61,8 @@ public static T visit( return visitList(iType, group, visitor); } else if (annotation instanceof MapLogicalTypeAnnotation) { return visitMap(iType, group, visitor); - } else if (iType != null && iType.isVariantType()) { + } else if (annotation instanceof LogicalTypeAnnotation.VariantLogicalTypeAnnotation + || (iType != null && iType.isVariantType())) { // when Parquet has a VARIANT logical type, use it here return visitVariant(iType.asVariantType(), group, visitor); } diff --git a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteDataFilesAction.java b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteDataFilesAction.java index f90cf3b693d0..50e42dcbb5cb 100644 --- a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteDataFilesAction.java +++ b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteDataFilesAction.java @@ -305,7 +305,7 @@ public void testBinPackAfterPartitionChange() { .option( RewriteDataFiles.TARGET_FILE_SIZE_BYTES, // Increase max file size for V3 to account for additional row lineage fields - Integer.toString(averageFileSize(table) + (formatVersion >= 3 ? 11000 : 1001))) + Integer.toString(averageFileSize(table) + (formatVersion >= 3 ? 12000 : 1100))) .execute(); assertThat(result.rewriteResults()) diff --git a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteDataFilesAction.java b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteDataFilesAction.java index f90cf3b693d0..50e42dcbb5cb 100644 --- a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteDataFilesAction.java +++ b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteDataFilesAction.java @@ -305,7 +305,7 @@ public void testBinPackAfterPartitionChange() { .option( RewriteDataFiles.TARGET_FILE_SIZE_BYTES, // Increase max file size for V3 to account for additional row lineage fields - Integer.toString(averageFileSize(table) + (formatVersion >= 3 ? 11000 : 1001))) + Integer.toString(averageFileSize(table) + (formatVersion >= 3 ? 12000 : 1100))) .execute(); assertThat(result.rewriteResults()) diff --git a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/data/ParquetWithSparkSchemaVisitor.java b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/data/ParquetWithSparkSchemaVisitor.java index 274934355084..54a658bfad8e 100644 --- a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/data/ParquetWithSparkSchemaVisitor.java +++ b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/data/ParquetWithSparkSchemaVisitor.java @@ -23,6 +23,7 @@ import org.apache.iceberg.avro.AvroSchemaUtil; import org.apache.iceberg.relocated.com.google.common.base.Preconditions; import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.variants.Variant; import org.apache.parquet.schema.GroupType; import org.apache.parquet.schema.LogicalTypeAnnotation; import org.apache.parquet.schema.MessageType; @@ -154,11 +155,15 @@ public static T visit(DataType sType, Type type, ParquetWithSparkSchemaVisit } finally { visitor.fieldNames.pop(); } - } else if (sType instanceof VariantType) { - // TODO: Use LogicalTypeAnnotation.variantType().equals(annotation) when VARIANT type is - // added to Parquet - // Preconditions.checkArgument( - // sType instanceof VariantType, "Invalid variant: %s is not a VariantType", sType); + } else if (LogicalTypeAnnotation.variantType(Variant.VARIANT_SPEC_VERSION).equals(annotation) + || sType instanceof VariantType) { + // For the Variant we both check the Parquet LogicalTypeAnnotation, and we rely on the + // Iceberg schema, since there are engines like Spark that produce VariantTypes without the + // annotation. + Preconditions.checkArgument( + sType instanceof VariantType, + "Invalid variant: Spark type %s is not a variant type", + sType); VariantType variant = (VariantType) sType; return visitor.variant(variant, group); diff --git a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteDataFilesAction.java b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteDataFilesAction.java index 8bdab42e9170..7e9462424d49 100644 --- a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteDataFilesAction.java +++ b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteDataFilesAction.java @@ -50,6 +50,7 @@ import java.util.stream.IntStream; import java.util.stream.LongStream; import java.util.stream.Stream; +import java.util.stream.StreamSupport; import org.apache.hadoop.conf.Configuration; import org.apache.iceberg.BaseTable; import org.apache.iceberg.ContentFile; @@ -305,7 +306,7 @@ public void testBinPackAfterPartitionChange() { .option( RewriteDataFiles.TARGET_FILE_SIZE_BYTES, // Increase max file size for V3 to account for additional row lineage fields - Integer.toString(averageFileSize(table) + (formatVersion >= 3 ? 11000 : 1001))) + Integer.toString(averageFileSize(table) + (formatVersion >= 3 ? 12000 : 1100))) .execute(); assertThat(result.rewriteResults()) @@ -2167,8 +2168,10 @@ protected void shouldHaveMultipleFiles(Table table) { protected void shouldHaveFiles(Table table, int numExpected) { table.refresh(); - int numFiles = Iterables.size(table.newScan().planFiles()); - assertThat(numFiles).as("Did not have the expected number of files").isEqualTo(numExpected); + List files = + StreamSupport.stream(table.newScan().planFiles().spliterator(), false) + .collect(Collectors.toList()); + assertThat(files.size()).as("Did not have the expected number of files").isEqualTo(numExpected); } protected long shouldHaveMinSequenceNumberInPartition(