Thanks to visit codestin.com
Credit goes to github.com

Skip to content

Commit d0cf7f5

Browse files
Spark: Update Spark Parquet vectorized read tests to use Iceberg Record instead of Avro GenericRecord (#12925)
1 parent 5ac1942 commit d0cf7f5

File tree

8 files changed

+163
-71
lines changed

8 files changed

+163
-71
lines changed

data/src/test/java/org/apache/iceberg/data/RandomGenericData.java

Lines changed: 34 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@
3737
import java.util.function.Supplier;
3838
import org.apache.iceberg.RandomVariants;
3939
import org.apache.iceberg.Schema;
40+
import org.apache.iceberg.relocated.com.google.common.base.Preconditions;
4041
import org.apache.iceberg.relocated.com.google.common.collect.Lists;
4142
import org.apache.iceberg.relocated.com.google.common.collect.Maps;
4243
import org.apache.iceberg.relocated.com.google.common.collect.Sets;
@@ -64,6 +65,12 @@ public static Iterable<Record> generateDictionaryEncodableRecords(
6465
return generateIcebergGenerics(schema, numRecords, () -> new DictionaryEncodedGenerator(seed));
6566
}
6667

68+
public static Iterable<Record> generateDictionaryEncodableRecords(
69+
Schema schema, int numRecords, long seed, float nullPercentage) {
70+
return generateIcebergGenerics(
71+
schema, numRecords, () -> new DictionaryEncodedGenerator(seed, nullPercentage));
72+
}
73+
6774
private static Iterable<Record> generateIcebergGenerics(
6875
Schema schema, int numRecords, Supplier<RandomDataGenerator<Record>> supplier) {
6976
return () ->
@@ -92,6 +99,10 @@ private RandomRecordGenerator(long seed) {
9299
super(seed);
93100
}
94101

102+
private RandomRecordGenerator(long seed, float nullPercentage) {
103+
super(seed, nullPercentage);
104+
}
105+
95106
@Override
96107
public Record schema(Schema schema, Supplier<Object> structResult) {
97108
return (Record) structResult.get();
@@ -115,6 +126,10 @@ private static class DictionaryEncodedGenerator extends RandomRecordGenerator {
115126
super(seed);
116127
}
117128

129+
DictionaryEncodedGenerator(long seed, float nullPercentage) {
130+
super(seed, nullPercentage);
131+
}
132+
118133
@Override
119134
protected int getMaxEntries() {
120135
// Here we limited the max entries in LIST or MAP to be 3, because we have the mechanism to
@@ -155,11 +170,22 @@ protected Object randomValue(Type.PrimitiveType primitive, Random rand) {
155170

156171
public abstract static class RandomDataGenerator<T>
157172
extends TypeUtil.CustomOrderSchemaVisitor<Object> {
158-
private final Random random;
159173
private static final int MAX_ENTRIES = 20;
174+
private static final float DEFAULT_NULL_PERCENTAGE = 0.05f;
175+
176+
private final Random random;
177+
private final float nullPercentage;
160178

161179
protected RandomDataGenerator(long seed) {
180+
this(seed, DEFAULT_NULL_PERCENTAGE);
181+
}
182+
183+
protected RandomDataGenerator(long seed, float nullPercentage) {
184+
Preconditions.checkArgument(
185+
0.0f <= nullPercentage && nullPercentage <= 1.0f,
186+
"Percentage needs to be in the range [0.0, 1.0]");
162187
this.random = new Random(seed);
188+
this.nullPercentage = nullPercentage;
163189
}
164190

165191
protected int getMaxEntries() {
@@ -174,21 +200,23 @@ protected int getMaxEntries() {
174200

175201
@Override
176202
public Object field(Types.NestedField field, Supplier<Object> fieldResult) {
177-
// return null 5% of the time when the value is optional
178-
if (field.isOptional() && random.nextInt(20) == 1) {
203+
if (field.isOptional() && isNull()) {
179204
return null;
180205
}
181206
return fieldResult.get();
182207
}
183208

209+
private boolean isNull() {
210+
return random.nextFloat() < nullPercentage;
211+
}
212+
184213
@Override
185214
public Object list(Types.ListType list, Supplier<Object> elementResult) {
186215
int numElements = random.nextInt(getMaxEntries());
187216

188217
List<Object> result = Lists.newArrayListWithExpectedSize(numElements);
189218
for (int i = 0; i < numElements; i += 1) {
190-
// return null 5% of the time when the value is optional
191-
if (list.isElementOptional() && random.nextInt(20) == 1) {
219+
if (list.isElementOptional() && isNull()) {
192220
result.add(null);
193221
} else {
194222
result.add(elementResult.get());
@@ -220,8 +248,7 @@ public Object map(Types.MapType map, Supplier<Object> keyResult, Supplier<Object
220248

221249
keySet.add(key);
222250

223-
// return null 5% of the time when the value is optional
224-
if (map.isValueOptional() && random.nextInt(20) == 1) {
251+
if (map.isValueOptional() && isNull()) {
225252
result.put(key, null);
226253
} else {
227254
result.put(key, valueResult.get());

data/src/test/java/org/apache/iceberg/data/TestLocalScan.java

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@
3636
import java.nio.ByteOrder;
3737
import java.nio.file.Files;
3838
import java.util.Arrays;
39+
import java.util.Comparator;
3940
import java.util.Iterator;
4041
import java.util.List;
4142
import java.util.Set;
@@ -263,9 +264,16 @@ public void testRandomData() throws IOException {
263264

264265
append.commit();
265266

266-
Set<Record> records = Sets.newHashSet(IcebergGenerics.read(table).build());
267+
Comparator<Record> recordComparator =
268+
Comparator.comparing((Record r) -> r.get(0, Long.class))
269+
.thenComparing(
270+
(Record r) -> r.get(1, String.class), Comparator.nullsFirst(String::compareTo));
271+
List<Record> records = Lists.newArrayList(IcebergGenerics.read(table).build());
272+
273+
expected.sort(recordComparator);
274+
records.sort(recordComparator);
267275
assertThat(records).as("Should produce correct number of records").hasSameSizeAs(expected);
268-
assertThat(records).as("Random record set should match").isEqualTo(Sets.newHashSet(expected));
276+
assertThat(records).as("Random record set should match").isEqualTo(expected);
269277
}
270278

271279
@TestTemplate

spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/data/RandomData.java

Lines changed: 16 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -241,9 +241,18 @@ protected Object randomValue(Type.PrimitiveType primitive, Random rand) {
241241

242242
private static class SparkRandomDataGenerator extends TypeUtil.CustomOrderSchemaVisitor<Object> {
243243
private final Random random;
244+
private final float nullPercentage;
244245

245246
private SparkRandomDataGenerator(long seed) {
247+
this(seed, DEFAULT_NULL_PERCENTAGE);
248+
}
249+
250+
private SparkRandomDataGenerator(long seed, float nullPercentage) {
251+
Preconditions.checkArgument(
252+
0.0f <= nullPercentage && nullPercentage <= 1.0f,
253+
"Percentage needs to be in the range [0.0, 1.0]");
246254
this.random = new Random(seed);
255+
this.nullPercentage = nullPercentage;
247256
}
248257

249258
@Override
@@ -264,22 +273,24 @@ public InternalRow struct(Types.StructType struct, Iterable<Object> fieldResults
264273

265274
@Override
266275
public Object field(Types.NestedField field, Supplier<Object> fieldResult) {
267-
// return null 5% of the time when the value is optional
268-
if (field.isOptional() && random.nextInt(20) == 1) {
276+
if (field.isOptional() && isNull()) {
269277
return null;
270278
}
271279
return fieldResult.get();
272280
}
273281

282+
private boolean isNull() {
283+
return random.nextFloat() < nullPercentage;
284+
}
285+
274286
@Override
275287
public GenericArrayData list(Types.ListType list, Supplier<Object> elementResult) {
276288
int numElements = random.nextInt(20);
277289
Object[] arr = new Object[numElements];
278290
GenericArrayData result = new GenericArrayData(arr);
279291

280292
for (int i = 0; i < numElements; i += 1) {
281-
// return null 5% of the time when the value is optional
282-
if (list.isElementOptional() && random.nextInt(20) == 1) {
293+
if (list.isElementOptional() && isNull()) {
283294
arr[i] = null;
284295
} else {
285296
arr[i] = elementResult.get();
@@ -310,8 +321,7 @@ public Object map(Types.MapType map, Supplier<Object> keyResult, Supplier<Object
310321
keySet.add(key);
311322

312323
keysArr[i] = key;
313-
// return null 5% of the time when the value is optional
314-
if (map.isValueOptional() && random.nextInt(20) == 1) {
324+
if (map.isValueOptional() && isNull()) {
315325
valuesArr[i] = null;
316326
} else {
317327
valuesArr[i] = valueResult.get();

spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/data/GenericsHelpers.java

Lines changed: 29 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@
3535
import java.time.temporal.ChronoUnit;
3636
import java.util.Collection;
3737
import java.util.Date;
38+
import java.util.Iterator;
3839
import java.util.List;
3940
import java.util.Map;
4041
import java.util.UUID;
@@ -49,6 +50,7 @@
4950
import org.apache.spark.sql.catalyst.util.ArrayData;
5051
import org.apache.spark.sql.catalyst.util.MapData;
5152
import org.apache.spark.sql.types.Decimal;
53+
import org.apache.spark.sql.vectorized.ColumnarBatch;
5254
import org.apache.spark.unsafe.types.UTF8String;
5355
import scala.collection.Seq;
5456

@@ -70,6 +72,15 @@ public static void assertEqualsSafe(Types.StructType struct, Record expected, Ro
7072
}
7173
}
7274

75+
public static void assertEqualsBatch(
76+
Types.StructType struct, Iterator<Record> expectedRecords, ColumnarBatch batch) {
77+
for (int rowId = 0; rowId < batch.numRows(); rowId++) {
78+
InternalRow row = batch.getRow(rowId);
79+
Record expectedRecord = expectedRecords.next();
80+
assertEqualsUnsafe(struct, expectedRecord, row);
81+
}
82+
}
83+
7384
private static void assertEqualsSafe(
7485
Types.ListType list, Collection<?> expected, List<?> actual) {
7586
Type elementType = list.elementType();
@@ -289,11 +300,27 @@ private static void assertEqualsUnsafe(Type type, Object expected, Object actual
289300
}
290301

291302
switch (type.typeId()) {
303+
case LONG:
304+
assertThat(actual).as("Should be a long").isInstanceOf(Long.class);
305+
if (expected instanceof Integer) {
306+
assertThat(actual).as("Values didn't match").isEqualTo(((Number) expected).longValue());
307+
} else {
308+
assertThat(actual).as("Primitive value should be equal to expected").isEqualTo(expected);
309+
}
310+
break;
311+
case DOUBLE:
312+
assertThat(actual).as("Should be a double").isInstanceOf(Double.class);
313+
if (expected instanceof Float) {
314+
assertThat(Double.doubleToLongBits((double) actual))
315+
.as("Values didn't match")
316+
.isEqualTo(Double.doubleToLongBits(((Number) expected).doubleValue()));
317+
} else {
318+
assertThat(actual).as("Primitive value should be equal to expected").isEqualTo(expected);
319+
}
320+
break;
292321
case BOOLEAN:
293322
case INTEGER:
294-
case LONG:
295323
case FLOAT:
296-
case DOUBLE:
297324
assertThat(actual).as("Primitive value should be equal to expected").isEqualTo(expected);
298325
break;
299326
case DATE:

spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/data/RandomData.java

Lines changed: 16 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -241,9 +241,18 @@ protected Object randomValue(Type.PrimitiveType primitive, Random rand) {
241241

242242
private static class SparkRandomDataGenerator extends TypeUtil.CustomOrderSchemaVisitor<Object> {
243243
private final Random random;
244+
private final float nullPercentage;
244245

245246
private SparkRandomDataGenerator(long seed) {
247+
this(seed, DEFAULT_NULL_PERCENTAGE);
248+
}
249+
250+
private SparkRandomDataGenerator(long seed, float nullPercentage) {
251+
Preconditions.checkArgument(
252+
0.0f <= nullPercentage && nullPercentage <= 1.0f,
253+
"Percentage needs to be in the range [0.0, 1.0]");
246254
this.random = new Random(seed);
255+
this.nullPercentage = nullPercentage;
247256
}
248257

249258
@Override
@@ -265,21 +274,24 @@ public InternalRow struct(Types.StructType struct, Iterable<Object> fieldResults
265274
@Override
266275
public Object field(Types.NestedField field, Supplier<Object> fieldResult) {
267276
// return null 5% of the time when the value is optional
268-
if (field.isOptional() && random.nextInt(20) == 1) {
277+
if (field.isOptional() && isNull()) {
269278
return null;
270279
}
271280
return fieldResult.get();
272281
}
273282

283+
private boolean isNull() {
284+
return random.nextFloat() < nullPercentage;
285+
}
286+
274287
@Override
275288
public GenericArrayData list(Types.ListType list, Supplier<Object> elementResult) {
276289
int numElements = random.nextInt(20);
277290
Object[] arr = new Object[numElements];
278291
GenericArrayData result = new GenericArrayData(arr);
279292

280293
for (int i = 0; i < numElements; i += 1) {
281-
// return null 5% of the time when the value is optional
282-
if (list.isElementOptional() && random.nextInt(20) == 1) {
294+
if (list.isElementOptional() && isNull()) {
283295
arr[i] = null;
284296
} else {
285297
arr[i] = elementResult.get();
@@ -310,8 +322,7 @@ public Object map(Types.MapType map, Supplier<Object> keyResult, Supplier<Object
310322
keySet.add(key);
311323

312324
keysArr[i] = key;
313-
// return null 5% of the time when the value is optional
314-
if (map.isValueOptional() && random.nextInt(20) == 1) {
325+
if (map.isValueOptional() && isNull()) {
315326
valuesArr[i] = null;
316327
} else {
317328
valuesArr[i] = valueResult.get();

spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/data/parquet/vectorized/TestParquetDictionaryEncodedVectorizedReads.java

Lines changed: 15 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -29,9 +29,11 @@
2929
import java.nio.file.Paths;
3030
import java.util.Iterator;
3131
import java.util.List;
32-
import org.apache.avro.generic.GenericData;
3332
import org.apache.iceberg.Files;
3433
import org.apache.iceberg.Schema;
34+
import org.apache.iceberg.data.RandomGenericData;
35+
import org.apache.iceberg.data.Record;
36+
import org.apache.iceberg.data.parquet.GenericParquetWriter;
3537
import org.apache.iceberg.io.CloseableIterable;
3638
import org.apache.iceberg.io.FileAppender;
3739
import org.apache.iceberg.parquet.Parquet;
@@ -40,7 +42,6 @@
4042
import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList;
4143
import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap;
4244
import org.apache.iceberg.relocated.com.google.common.collect.Iterables;
43-
import org.apache.iceberg.spark.data.RandomData;
4445
import org.apache.iceberg.spark.data.TestHelpers;
4546
import org.apache.iceberg.spark.data.vectorized.VectorizedSparkParquetReaders;
4647
import org.apache.iceberg.types.Types;
@@ -71,14 +72,15 @@ public static void stopSpark() {
7172
}
7273

7374
@Override
74-
Iterable<GenericData.Record> generateData(
75+
Iterable<Record> generateData(
7576
Schema schema,
7677
int numRecords,
7778
long seed,
7879
float nullPercentage,
79-
Function<GenericData.Record, GenericData.Record> transform) {
80+
Function<Record, Record> transform) {
8081
Iterable data =
81-
RandomData.generateDictionaryEncodableData(schema, numRecords, seed, nullPercentage);
82+
RandomGenericData.generateDictionaryEncodableRecords(
83+
schema, numRecords, seed, nullPercentage);
8284
return transform == IDENTITY ? data : Iterables.transform(data, transform);
8385
}
8486

@@ -92,19 +94,16 @@ public void testMixedDictionaryNonDictionaryReads() throws IOException {
9294
Schema schema = new Schema(SUPPORTED_PRIMITIVES.fields());
9395
File dictionaryEncodedFile = File.createTempFile("junit", null, temp.toFile());
9496
assertThat(dictionaryEncodedFile.delete()).as("Delete should succeed").isTrue();
95-
Iterable<GenericData.Record> dictionaryEncodableData =
96-
RandomData.generateDictionaryEncodableData(
97-
schema, 10000, 0L, RandomData.DEFAULT_NULL_PERCENTAGE);
98-
try (FileAppender<GenericData.Record> writer =
99-
getParquetWriter(schema, dictionaryEncodedFile)) {
97+
Iterable<Record> dictionaryEncodableData =
98+
RandomGenericData.generateDictionaryEncodableRecords(schema, 10000, 0L);
99+
try (FileAppender<Record> writer = getParquetWriter(schema, dictionaryEncodedFile)) {
100100
writer.addAll(dictionaryEncodableData);
101101
}
102102

103103
File plainEncodingFile = File.createTempFile("junit", null, temp.toFile());
104104
assertThat(plainEncodingFile.delete()).as("Delete should succeed").isTrue();
105-
Iterable<GenericData.Record> nonDictionaryData =
106-
RandomData.generate(schema, 10000, 0L, RandomData.DEFAULT_NULL_PERCENTAGE);
107-
try (FileAppender<GenericData.Record> writer = getParquetWriter(schema, plainEncodingFile)) {
105+
Iterable<Record> nonDictionaryData = RandomGenericData.generate(schema, 10000, 0L);
106+
try (FileAppender<Record> writer = getParquetWriter(schema, plainEncodingFile)) {
108107
writer.addAll(nonDictionaryData);
109108
}
110109

@@ -132,12 +131,13 @@ public void testBinaryNotAllPagesDictionaryEncoded() throws IOException {
132131
File parquetFile = File.createTempFile("junit", null, temp.toFile());
133132
assertThat(parquetFile.delete()).as("Delete should succeed").isTrue();
134133

135-
Iterable<GenericData.Record> records = RandomData.generateFallbackData(schema, 500, 0L, 100);
136-
try (FileAppender<GenericData.Record> writer =
134+
Iterable<Record> records = RandomGenericData.generateFallbackRecords(schema, 500, 0L, 100);
135+
try (FileAppender<Record> writer =
137136
Parquet.write(Files.localOutput(parquetFile))
138137
.schema(schema)
139138
.set(PARQUET_DICT_SIZE_BYTES, "4096")
140139
.set(PARQUET_PAGE_ROW_LIMIT, "100")
140+
.createWriterFunc(GenericParquetWriter::create)
141141
.build()) {
142142
writer.addAll(records);
143143
}

0 commit comments

Comments
 (0)