Thanks to visit codestin.com
Credit goes to github.com

Skip to content

Commit d0cf7f5

Browse files
Spark: Update Spark Parquet vectorized read tests to use Iceberg Record instead of Avro GenericRecord (#12925)
1 parent 5ac1942 commit d0cf7f5

File tree

8 files changed

+163
-71
lines changed

8 files changed

+163
-71
lines changed

data/src/test/java/org/apache/iceberg/data/RandomGenericData.java

Lines changed: 34 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@
3737
import java.util.function.Supplier;
3838
import org.apache.iceberg.RandomVariants;
3939
import org.apache.iceberg.Schema;
40+
import org.apache.iceberg.relocated.com.google.common.base.Preconditions;
4041
import org.apache.iceberg.relocated.com.google.common.collect.Lists;
4142
import org.apache.iceberg.relocated.com.google.common.collect.Maps;
4243
import org.apache.iceberg.relocated.com.google.common.collect.Sets;
@@ -64,6 +65,12 @@ public static Iterable<Record> generateDictionaryEncodableRecords(
6465
return generateIcebergGenerics(schema, numRecords, () -> new DictionaryEncodedGenerator(seed));
6566
}
6667

68+
public static Iterable<Record> generateDictionaryEncodableRecords(
69+
Schema schema, int numRecords, long seed, float nullPercentage) {
70+
return generateIcebergGenerics(
71+
schema, numRecords, () -> new DictionaryEncodedGenerator(seed, nullPercentage));
72+
}
73+
6774
private static Iterable<Record> generateIcebergGenerics(
6875
Schema schema, int numRecords, Supplier<RandomDataGenerator<Record>> supplier) {
6976
return () ->
@@ -92,6 +99,10 @@ private RandomRecordGenerator(long seed) {
9299
super(seed);
93100
}
94101

102+
private RandomRecordGenerator(long seed, float nullPercentage) {
103+
super(seed, nullPercentage);
104+
}
105+
95106
@Override
96107
public Record schema(Schema schema, Supplier<Object> structResult) {
97108
return (Record) structResult.get();
@@ -115,6 +126,10 @@ private static class DictionaryEncodedGenerator extends RandomRecordGenerator {
115126
super(seed);
116127
}
117128

129+
DictionaryEncodedGenerator(long seed, float nullPercentage) {
130+
super(seed, nullPercentage);
131+
}
132+
118133
@Override
119134
protected int getMaxEntries() {
120135
// Here we limited the max entries in LIST or MAP to be 3, because we have the mechanism to
@@ -155,11 +170,22 @@ protected Object randomValue(Type.PrimitiveType primitive, Random rand) {
155170

156171
public abstract static class RandomDataGenerator<T>
157172
extends TypeUtil.CustomOrderSchemaVisitor<Object> {
158-
private final Random random;
159173
private static final int MAX_ENTRIES = 20;
174+
private static final float DEFAULT_NULL_PERCENTAGE = 0.05f;
175+
176+
private final Random random;
177+
private final float nullPercentage;
160178

161179
protected RandomDataGenerator(long seed) {
180+
this(seed, DEFAULT_NULL_PERCENTAGE);
181+
}
182+
183+
protected RandomDataGenerator(long seed, float nullPercentage) {
184+
Preconditions.checkArgument(
185+
0.0f <= nullPercentage && nullPercentage <= 1.0f,
186+
"Percentage needs to be in the range [0.0, 1.0]");
162187
this.random = new Random(seed);
188+
this.nullPercentage = nullPercentage;
163189
}
164190

165191
protected int getMaxEntries() {
@@ -174,21 +200,23 @@ protected int getMaxEntries() {
174200

175201
@Override
176202
public Object field(Types.NestedField field, Supplier<Object> fieldResult) {
177-
// return null 5% of the time when the value is optional
178-
if (field.isOptional() && random.nextInt(20) == 1) {
203+
if (field.isOptional() && isNull()) {
179204
return null;
180205
}
181206
return fieldResult.get();
182207
}
183208

209+
private boolean isNull() {
210+
return random.nextFloat() < nullPercentage;
211+
}
212+
184213
@Override
185214
public Object list(Types.ListType list, Supplier<Object> elementResult) {
186215
int numElements = random.nextInt(getMaxEntries());
187216

188217
List<Object> result = Lists.newArrayListWithExpectedSize(numElements);
189218
for (int i = 0; i < numElements; i += 1) {
190-
// return null 5% of the time when the value is optional
191-
if (list.isElementOptional() && random.nextInt(20) == 1) {
219+
if (list.isElementOptional() && isNull()) {
192220
result.add(null);
193221
} else {
194222
result.add(elementResult.get());
@@ -220,8 +248,7 @@ public Object map(Types.MapType map, Supplier<Object> keyResult, Supplier<Object
220248

221249
keySet.add(key);
222250

223-
// return null 5% of the time when the value is optional
224-
if (map.isValueOptional() && random.nextInt(20) == 1) {
251+
if (map.isValueOptional() && isNull()) {
225252
result.put(key, null);
226253
} else {
227254
result.put(key, valueResult.get());

data/src/test/java/org/apache/iceberg/data/TestLocalScan.java

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@
3636
import java.nio.ByteOrder;
3737
import java.nio.file.Files;
3838
import java.util.Arrays;
39+
import java.util.Comparator;
3940
import java.util.Iterator;
4041
import java.util.List;
4142
import java.util.Set;
@@ -263,9 +264,16 @@ public void testRandomData() throws IOException {
263264

264265
append.commit();
265266

266-
Set<Record> records = Sets.newHashSet(IcebergGenerics.read(table).build());
267+
Comparator<Record> recordComparator =
268+
Comparator.comparing((Record r) -> r.get(0, Long.class))
269+
.thenComparing(
270+
(Record r) -> r.get(1, String.class), Comparator.nullsFirst(String::compareTo));
271+
List<Record> records = Lists.newArrayList(IcebergGenerics.read(table).build());
272+
273+
expected.sort(recordComparator);
274+
records.sort(recordComparator);
267275
assertThat(records).as("Should produce correct number of records").hasSameSizeAs(expected);
268-
assertThat(records).as("Random record set should match").isEqualTo(Sets.newHashSet(expected));
276+
assertThat(records).as("Random record set should match").isEqualTo(expected);
269277
}
270278

271279
@TestTemplate

spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/data/RandomData.java

Lines changed: 16 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -241,9 +241,18 @@ protected Object randomValue(Type.PrimitiveType primitive, Random rand) {
241241

242242
private static class SparkRandomDataGenerator extends TypeUtil.CustomOrderSchemaVisitor<Object> {
243243
private final Random random;
244+
private final float nullPercentage;
244245

245246
private SparkRandomDataGenerator(long seed) {
247+
this(seed, DEFAULT_NULL_PERCENTAGE);
248+
}
249+
250+
private SparkRandomDataGenerator(long seed, float nullPercentage) {
251+
Preconditions.checkArgument(
252+
0.0f <= nullPercentage && nullPercentage <= 1.0f,
253+
"Percentage needs to be in the range [0.0, 1.0]");
246254
this.random = new Random(seed);
255+
this.nullPercentage = nullPercentage;
247256
}
248257

249258
@Override
@@ -264,22 +273,24 @@ public InternalRow struct(Types.StructType struct, Iterable<Object> fieldResults
264273

265274
@Override
266275
public Object field(Types.NestedField field, Supplier<Object> fieldResult) {
267-
// return null 5% of the time when the value is optional
268-
if (field.isOptional() && random.nextInt(20) == 1) {
276+
if (field.isOptional() && isNull()) {
269277
return null;
270278
}
271279
return fieldResult.get();
272280
}
273281

282+
private boolean isNull() {
283+
return random.nextFloat() < nullPercentage;
284+
}
285+
274286
@Override
275287
public GenericArrayData list(Types.ListType list, Supplier<Object> elementResult) {
276288
int numElements = random.nextInt(20);
277289
Object[] arr = new Object[numElements];
278290
GenericArrayData result = new GenericArrayData(arr);
279291

280292
for (int i = 0; i < numElements; i += 1) {
281-
// return null 5% of the time when the value is optional
282-
if (list.isElementOptional() && random.nextInt(20) == 1) {
293+
if (list.isElementOptional() && isNull()) {
283294
arr[i] = null;
284295
} else {
285296
arr[i] = elementResult.get();
@@ -310,8 +321,7 @@ public Object map(Types.MapType map, Supplier<Object> keyResult, Supplier<Object
310321
keySet.add(key);
311322

312323
keysArr[i] = key;
313-
// return null 5% of the time when the value is optional
314-
if (map.isValueOptional() && random.nextInt(20) == 1) {
324+
if (map.isValueOptional() && isNull()) {
315325
valuesArr[i] = null;
316326
} else {
317327
valuesArr[i] = valueResult.get();

spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/data/GenericsHelpers.java

Lines changed: 29 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@
3535
import java.time.temporal.ChronoUnit;
3636
import java.util.Collection;
3737
import java.util.Date;
38+
import java.util.Iterator;
3839
import java.util.List;
3940
import java.util.Map;
4041
import java.util.UUID;
@@ -49,6 +50,7 @@
4950
import org.apache.spark.sql.catalyst.util.ArrayData;
5051
import org.apache.spark.sql.catalyst.util.MapData;
5152
import org.apache.spark.sql.types.Decimal;
53+
import org.apache.spark.sql.vectorized.ColumnarBatch;
5254
import org.apache.spark.unsafe.types.UTF8String;
5355
import scala.collection.Seq;
5456

@@ -70,6 +72,15 @@ public static void assertEqualsSafe(Types.StructType struct, Record expected, Ro
7072
}
7173
}
7274

75+
public static void assertEqualsBatch(
76+
Types.StructType struct, Iterator<Record> expectedRecords, ColumnarBatch batch) {
77+
for (int rowId = 0; rowId < batch.numRows(); rowId++) {
78+
InternalRow row = batch.getRow(rowId);
79+
Record expectedRecord = expectedRecords.next();
80+
assertEqualsUnsafe(struct, expectedRecord, row);
81+
}
82+
}
83+
7384
private static void assertEqualsSafe(
7485
Types.ListType list, Collection<?> expected, List<?> actual) {
7586
Type elementType = list.elementType();
@@ -289,11 +300,27 @@ private static void assertEqualsUnsafe(Type type, Object expected, Object actual
289300
}
290301

291302
switch (type.typeId()) {
303+
case LONG:
304+
assertThat(actual).as("Should be a long").isInstanceOf(Long.class);
305+
if (expected instanceof Integer) {
306+
assertThat(actual).as("Values didn't match").isEqualTo(((Number) expected).longValue());
307+
} else {
308+
assertThat(actual).as("Primitive value should be equal to expected").isEqualTo(expected);
309+
}
310+
break;
311+
case DOUBLE:
312+
assertThat(actual).as("Should be a double").isInstanceOf(Double.class);
313+
if (expected instanceof Float) {
314+
assertThat(Double.doubleToLongBits((double) actual))
315+
.as("Values didn't match")
316+
.isEqualTo(Double.doubleToLongBits(((Number) expected).doubleValue()));
317+
} else {
318+
assertThat(actual).as("Primitive value should be equal to expected").isEqualTo(expected);
319+
}
320+
break;
292321
case BOOLEAN:
293322
case INTEGER:
294-
case LONG:
295323
case FLOAT:
296-
case DOUBLE:
297324
assertThat(actual).as("Primitive value should be equal to expected").isEqualTo(expected);
298325
break;
299326
case DATE:

spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/data/RandomData.java

Lines changed: 16 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -241,9 +241,18 @@ protected Object randomValue(Type.PrimitiveType primitive, Random rand) {
241241

242242
private static class SparkRandomDataGenerator extends TypeUtil.CustomOrderSchemaVisitor<Object> {
243243
private final Random random;
244+
private final float nullPercentage;
244245

245246
private SparkRandomDataGenerator(long seed) {
247+
this(seed, DEFAULT_NULL_PERCENTAGE);
248+
}
249+
250+
private SparkRandomDataGenerator(long seed, float nullPercentage) {
251+
Preconditions.checkArgument(
252+
0.0f <= nullPercentage && nullPercentage <= 1.0f,
253+
"Percentage needs to be in the range [0.0, 1.0]");
246254
this.random = new Random(seed);
255+
this.nullPercentage = nullPercentage;
247256
}
248257

249258
@Override
@@ -265,21 +274,24 @@ public InternalRow struct(Types.StructType struct, Iterable<Object> fieldResults
265274
@Override
266275
public Object field(Types.NestedField field, Supplier<Object> fieldResult) {
267276
// return null 5% of the time when the value is optional
268-
if (field.isOptional() && random.nextInt(20) == 1) {
277+
if (field.isOptional() && isNull()) {
269278
return null;
270279
}
271280
return fieldResult.get();
272281
}
273282

283+
private boolean isNull() {
284+
return random.nextFloat() < nullPercentage;
285+
}
286+
274287
@Override
275288
public GenericArrayData list(Types.ListType list, Supplier<Object> elementResult) {
276289
int numElements = random.nextInt(20);
277290
Object[] arr = new Object[numElements];
278291
GenericArrayData result = new GenericArrayData(arr);
279292

280293
for (int i = 0; i < numElements; i += 1) {
281-
// return null 5% of the time when the value is optional
282-
if (list.isElementOptional() && random.nextInt(20) == 1) {
294+
if (list.isElementOptional() && isNull()) {
283295
arr[i] = null;
284296
} else {
285297
arr[i] = elementResult.get();
@@ -310,8 +322,7 @@ public Object map(Types.MapType map, Supplier<Object> keyResult, Supplier<Object
310322
keySet.add(key);
311323

312324
keysArr[i] = key;
313-
// return null 5% of the time when the value is optional
314-
if (map.isValueOptional() && random.nextInt(20) == 1) {
325+
if (map.isValueOptional() && isNull()) {
315326
valuesArr[i] = null;
316327
} else {
317328
valuesArr[i] = valueResult.get();

spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/data/parquet/vectorized/TestParquetDictionaryEncodedVectorizedReads.java

Lines changed: 15 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -29,9 +29,11 @@
2929
import java.nio.file.Paths;
3030
import java.util.Iterator;
3131
import java.util.List;
32-
import org.apache.avro.generic.GenericData;
3332
import org.apache.iceberg.Files;
3433
import org.apache.iceberg.Schema;
34+
import org.apache.iceberg.data.RandomGenericData;
35+
import org.apache.iceberg.data.Record;
36+
import org.apache.iceberg.data.parquet.GenericParquetWriter;
3537
import org.apache.iceberg.io.CloseableIterable;
3638
import org.apache.iceberg.io.FileAppender;
3739
import org.apache.iceberg.parquet.Parquet;
@@ -40,7 +42,6 @@
4042
import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList;
4143
import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap;
4244
import org.apache.iceberg.relocated.com.google.common.collect.Iterables;
43-
import org.apache.iceberg.spark.data.RandomData;
4445
import org.apache.iceberg.spark.data.TestHelpers;
4546
import org.apache.iceberg.spark.data.vectorized.VectorizedSparkParquetReaders;
4647
import org.apache.iceberg.types.Types;
@@ -71,14 +72,15 @@ public static void stopSpark() {
7172
}
7273

7374
@Override
74-
Iterable<GenericData.Record> generateData(
75+
Iterable<Record> generateData(
7576
Schema schema,
7677
int numRecords,
7778
long seed,
7879
float nullPercentage,
79-
Function<GenericData.Record, GenericData.Record> transform) {
80+
Function<Record, Record> transform) {
8081
Iterable data =
81-
RandomData.generateDictionaryEncodableData(schema, numRecords, seed, nullPercentage);
82+
RandomGenericData.generateDictionaryEncodableRecords(
83+
schema, numRecords, seed, nullPercentage);
8284
return transform == IDENTITY ? data : Iterables.transform(data, transform);
8385
}
8486

@@ -92,19 +94,16 @@ public void testMixedDictionaryNonDictionaryReads() throws IOException {
9294
Schema schema = new Schema(SUPPORTED_PRIMITIVES.fields());
9395
File dictionaryEncodedFile = File.createTempFile("junit", null, temp.toFile());
9496
assertThat(dictionaryEncodedFile.delete()).as("Delete should succeed").isTrue();
95-
Iterable<GenericData.Record> dictionaryEncodableData =
96-
RandomData.generateDictionaryEncodableData(
97-
schema, 10000, 0L, RandomData.DEFAULT_NULL_PERCENTAGE);
98-
try (FileAppender<GenericData.Record> writer =
99-
getParquetWriter(schema, dictionaryEncodedFile)) {
97+
Iterable<Record> dictionaryEncodableData =
98+
RandomGenericData.generateDictionaryEncodableRecords(schema, 10000, 0L);
99+
try (FileAppender<Record> writer = getParquetWriter(schema, dictionaryEncodedFile)) {
100100
writer.addAll(dictionaryEncodableData);
101101
}
102102

103103
File plainEncodingFile = File.createTempFile("junit", null, temp.toFile());
104104
assertThat(plainEncodingFile.delete()).as("Delete should succeed").isTrue();
105-
Iterable<GenericData.Record> nonDictionaryData =
106-
RandomData.generate(schema, 10000, 0L, RandomData.DEFAULT_NULL_PERCENTAGE);
107-
try (FileAppender<GenericData.Record> writer = getParquetWriter(schema, plainEncodingFile)) {
105+
Iterable<Record> nonDictionaryData = RandomGenericData.generate(schema, 10000, 0L);
106+
try (FileAppender<Record> writer = getParquetWriter(schema, plainEncodingFile)) {
108107
writer.addAll(nonDictionaryData);
109108
}
110109

@@ -132,12 +131,13 @@ public void testBinaryNotAllPagesDictionaryEncoded() throws IOException {
132131
File parquetFile = File.createTempFile("junit", null, temp.toFile());
133132
assertThat(parquetFile.delete()).as("Delete should succeed").isTrue();
134133

135-
Iterable<GenericData.Record> records = RandomData.generateFallbackData(schema, 500, 0L, 100);
136-
try (FileAppender<GenericData.Record> writer =
134+
Iterable<Record> records = RandomGenericData.generateFallbackRecords(schema, 500, 0L, 100);
135+
try (FileAppender<Record> writer =
137136
Parquet.write(Files.localOutput(parquetFile))
138137
.schema(schema)
139138
.set(PARQUET_DICT_SIZE_BYTES, "4096")
140139
.set(PARQUET_PAGE_ROW_LIMIT, "100")
140+
.createWriterFunc(GenericParquetWriter::create)
141141
.build()) {
142142
writer.addAll(records);
143143
}

0 commit comments

Comments
 (0)