diff --git a/core/src/main/java/org/apache/iceberg/ManifestReader.java b/core/src/main/java/org/apache/iceberg/ManifestReader.java index d7917eabb10c..f2ef5cb0deff 100644 --- a/core/src/main/java/org/apache/iceberg/ManifestReader.java +++ b/core/src/main/java/org/apache/iceberg/ManifestReader.java @@ -278,6 +278,9 @@ private CloseableIterable> open(Schema projection) { if (projection.findField(DataFile.RECORD_COUNT.fieldId()) == null) { fields.add(DataFile.RECORD_COUNT); } + if (projection.findField(DataFile.FIRST_ROW_ID.fieldId()) == null) { + fields.add(DataFile.FIRST_ROW_ID); + } fields.add(MetadataColumns.ROW_POSITION); CloseableIterable> reader = diff --git a/core/src/main/java/org/apache/iceberg/util/PartitionUtil.java b/core/src/main/java/org/apache/iceberg/util/PartitionUtil.java index 411d401075d6..ad6ef605420a 100644 --- a/core/src/main/java/org/apache/iceberg/util/PartitionUtil.java +++ b/core/src/main/java/org/apache/iceberg/util/PartitionUtil.java @@ -53,6 +53,17 @@ private PartitionUtil() {} // use java.util.HashMap because partition data may contain null values Map idToConstant = Maps.newHashMap(); + // add first_row_id as _row_id + if (task.file().firstRowId() != null) { + idToConstant.put( + MetadataColumns.ROW_ID.fieldId(), + convertConstant.apply(Types.LongType.get(), task.file().firstRowId())); + } + + idToConstant.put( + MetadataColumns.LAST_UPDATED_SEQUENCE_NUMBER.fieldId(), + convertConstant.apply(Types.LongType.get(), task.file().fileSequenceNumber())); + // add _file idToConstant.put( MetadataColumns.FILE_PATH.fieldId(), diff --git a/core/src/test/java/org/apache/iceberg/TestRowLineageAssignment.java b/core/src/test/java/org/apache/iceberg/TestRowLineageAssignment.java index 404e083f48d3..38bea41d7ddc 100644 --- a/core/src/test/java/org/apache/iceberg/TestRowLineageAssignment.java +++ b/core/src/test/java/org/apache/iceberg/TestRowLineageAssignment.java @@ -685,6 +685,9 @@ private static void checkDataFileAssignment( try (ManifestReader reader = ManifestFiles.read(manifest, table.io(), table.specs())) { + // test that the first_row_id column is always scanned, even if not requested + reader.select(BaseScan.SCAN_COLUMNS); + for (DataFile file : reader) { assertThat(file.content()).isEqualTo(FileContent.DATA); if (index < firstRowIds.length) { diff --git a/core/src/test/java/org/apache/iceberg/data/DataTest.java b/core/src/test/java/org/apache/iceberg/data/DataTest.java index cc788e2ec078..f28022cc792e 100644 --- a/core/src/test/java/org/apache/iceberg/data/DataTest.java +++ b/core/src/test/java/org/apache/iceberg/data/DataTest.java @@ -28,9 +28,11 @@ import java.nio.ByteBuffer; import java.nio.file.Path; import java.util.List; +import java.util.Map; import java.util.UUID; import java.util.concurrent.atomic.AtomicInteger; import java.util.stream.Stream; +import org.apache.iceberg.MetadataColumns; import org.apache.iceberg.Schema; import org.apache.iceberg.expressions.Literal; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; @@ -53,6 +55,14 @@ public abstract class DataTest { + private static final long FIRST_ROW_ID = 2_000L; + protected static final Map ID_TO_CONSTANT = + Map.of( + MetadataColumns.ROW_ID.fieldId(), + FIRST_ROW_ID, + MetadataColumns.LAST_UPDATED_SEQUENCE_NUMBER.fieldId(), + 34L); + protected abstract void writeAndValidate(Schema schema) throws IOException; protected void writeAndValidate(Schema schema, List data) throws IOException { @@ -139,6 +149,10 @@ protected boolean supportsGeospatial() { return false; } + protected 
boolean supportsRowLineage() { + return false; + } + @ParameterizedTest @FieldSource("SIMPLE_TYPES") public void testTypeSchema(Type type) throws IOException { @@ -599,4 +613,38 @@ public void testWriteNullValueForRequiredType() throws Exception { () -> writeAndValidate(schema, ImmutableList.of(genericRecord))); } } + + @Test + public void testRowLineage() throws Exception { + Assumptions.assumeThat(supportsRowLineage()) + .as("Row Lineage support is not implemented") + .isTrue(); + + Schema schema = + new Schema( + required(1, "id", LongType.get()), + required(2, "data", Types.StringType.get()), + MetadataColumns.ROW_ID, + MetadataColumns.LAST_UPDATED_SEQUENCE_NUMBER); + + GenericRecord record = GenericRecord.create(schema); + + writeAndValidate( + schema, + List.of( + record.copy(Map.of("id", 1L, "data", "a")), + record.copy(Map.of("id", 2L, "data", "b")), + record.copy( + Map.of( + "id", + 3L, + "data", + "c", + "_row_id", + 1_000L, + "_last_updated_sequence_number", + 33L)), + record.copy(Map.of("id", 4L, "data", "d", "_row_id", 1_001L)), + record.copy(Map.of("id", 5L, "data", "e")))); + } } diff --git a/data/src/test/java/org/apache/iceberg/data/DataTestHelpers.java b/data/src/test/java/org/apache/iceberg/data/DataTestHelpers.java index e05afb998828..fc8d47680b0f 100644 --- a/data/src/test/java/org/apache/iceberg/data/DataTestHelpers.java +++ b/data/src/test/java/org/apache/iceberg/data/DataTestHelpers.java @@ -22,6 +22,7 @@ import java.util.List; import java.util.Map; +import org.apache.iceberg.MetadataColumns; import org.apache.iceberg.types.Type; import org.apache.iceberg.types.Types; import org.apache.iceberg.variants.Variant; @@ -31,12 +32,39 @@ public class DataTestHelpers { private DataTestHelpers() {} public static void assertEquals(Types.StructType struct, Record expected, Record actual) { + assertEquals(struct, expected, actual, null, -1); + } + + public static void assertEquals( + Types.StructType struct, + Record expected, + Record actual, + Map idToConstant, + int pos) { Types.StructType expectedType = expected.struct(); for (Types.NestedField field : struct.fields()) { Types.NestedField expectedField = expectedType.field(field.fieldId()); + Object expectedValue; if (expectedField != null) { - assertEquals( - field.type(), expected.getField(expectedField.name()), actual.getField(field.name())); + int id = expectedField.fieldId(); + if (id == MetadataColumns.ROW_ID.fieldId()) { + expectedValue = expected.getField(expectedField.name()); + if (expectedValue == null && idToConstant != null) { + expectedValue = (Long) idToConstant.get(id) + pos; + } + + } else if (id == MetadataColumns.LAST_UPDATED_SEQUENCE_NUMBER.fieldId()) { + expectedValue = expected.getField(expectedField.name()); + if (expectedValue == null && idToConstant != null) { + expectedValue = idToConstant.get(id); + } + + } else { + expectedValue = expected.getField(expectedField.name()); + } + + assertEquals(field.type(), expectedValue, actual.getField(field.name())); + } else { assertEquals( field.type(), diff --git a/data/src/test/java/org/apache/iceberg/data/parquet/TestGenericData.java b/data/src/test/java/org/apache/iceberg/data/parquet/TestGenericData.java index c663ad228c5c..1b5917e97296 100644 --- a/data/src/test/java/org/apache/iceberg/data/parquet/TestGenericData.java +++ b/data/src/test/java/org/apache/iceberg/data/parquet/TestGenericData.java @@ -36,8 +36,10 @@ import org.apache.iceberg.data.DataTestHelpers; import org.apache.iceberg.data.RandomGenericData; import org.apache.iceberg.data.Record; 
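The expectation encoded by the new DataTest#testRowLineage and the idToConstant-aware DataTestHelpers#assertEquals above is a simple inheritance rule: a lineage value materialized in the data file wins; otherwise _row_id is inherited as first_row_id plus the row position, and _last_updated_sequence_number falls back to the file's sequence number. A minimal sketch of that rule follows; the helper class and method names are hypothetical, while the MetadataColumns fields are the ones used in the patch.

```java
import java.util.Map;
import org.apache.iceberg.MetadataColumns;

// Hypothetical helper, not part of the patch: the lineage inheritance rule the tests expect.
class RowLineageExpectations {

  /** _row_id a reader should produce for the record at position pos. */
  static Long expectedRowId(Long writtenRowId, Map<Integer, Long> idToConstant, long pos) {
    if (writtenRowId != null) {
      return writtenRowId; // a value materialized in the file is preserved
    }

    Long firstRowId = idToConstant.get(MetadataColumns.ROW_ID.fieldId());
    return firstRowId == null ? null : firstRowId + pos; // otherwise inherit first_row_id + pos
  }

  /** _last_updated_sequence_number a reader should produce. */
  static Long expectedLastUpdatedSeq(Long writtenSeq, Map<Integer, Long> idToConstant) {
    if (writtenSeq != null) {
      return writtenSeq; // the row was updated after the file was written
    }

    return idToConstant.get(MetadataColumns.LAST_UPDATED_SEQUENCE_NUMBER.fieldId()); // file default
  }
}
```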
+import org.apache.iceberg.inmemory.InMemoryOutputFile; import org.apache.iceberg.io.CloseableIterable; import org.apache.iceberg.io.FileAppender; +import org.apache.iceberg.io.OutputFile; import org.apache.iceberg.parquet.Parquet; import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.apache.iceberg.types.Types; @@ -61,6 +63,11 @@ protected boolean supportsTimestampNanos() { return true; } + @Override + protected boolean supportsRowLineage() { + return true; + } + @Override protected void writeAndValidate(Schema schema) throws IOException { writeAndValidate(schema, schema); @@ -80,11 +87,10 @@ protected void writeAndValidate(Schema writeSchema, Schema expectedSchema) throw private void writeAndValidate(Schema writeSchema, Schema expectedSchema, List expected) throws IOException { - File testFile = File.createTempFile("junit", null, temp.toFile()); - assertThat(testFile.delete()).isTrue(); + OutputFile output = new InMemoryOutputFile(); try (FileAppender appender = - Parquet.write(Files.localOutput(testFile)) + Parquet.write(output) .schema(writeSchema) .createWriterFunc(GenericParquetWriter::create) .build()) { @@ -93,30 +99,34 @@ private void writeAndValidate(Schema writeSchema, Schema expectedSchema, List rows; try (CloseableIterable reader = - Parquet.read(Files.localInput(testFile)) + Parquet.read(output.toInputFile()) .project(expectedSchema) .createReaderFunc( - fileSchema -> GenericParquetReaders.buildReader(expectedSchema, fileSchema)) + fileSchema -> + GenericParquetReaders.buildReader(expectedSchema, fileSchema, ID_TO_CONSTANT)) .build()) { rows = Lists.newArrayList(reader); } - for (int i = 0; i < expected.size(); i += 1) { - DataTestHelpers.assertEquals(expectedSchema.asStruct(), expected.get(i), rows.get(i)); + for (int pos = 0; pos < expected.size(); pos += 1) { + DataTestHelpers.assertEquals( + expectedSchema.asStruct(), expected.get(pos), rows.get(pos), ID_TO_CONSTANT, pos); } // test reuseContainers try (CloseableIterable reader = - Parquet.read(Files.localInput(testFile)) + Parquet.read(output.toInputFile()) .project(expectedSchema) .reuseContainers() .createReaderFunc( - fileSchema -> GenericParquetReaders.buildReader(expectedSchema, fileSchema)) + fileSchema -> + GenericParquetReaders.buildReader(expectedSchema, fileSchema, ID_TO_CONSTANT)) .build()) { - int index = 0; + int pos = 0; for (Record actualRecord : reader) { - DataTestHelpers.assertEquals(expectedSchema.asStruct(), expected.get(index), actualRecord); - index += 1; + DataTestHelpers.assertEquals( + expectedSchema.asStruct(), expected.get(pos), actualRecord, ID_TO_CONSTANT, pos); + pos += 1; } } } diff --git a/parquet/src/main/java/org/apache/iceberg/data/parquet/BaseParquetReaders.java b/parquet/src/main/java/org/apache/iceberg/data/parquet/BaseParquetReaders.java index 4fa2d37a6235..8f2957e1c60d 100644 --- a/parquet/src/main/java/org/apache/iceberg/data/parquet/BaseParquetReaders.java +++ b/parquet/src/main/java/org/apache/iceberg/data/parquet/BaseParquetReaders.java @@ -22,7 +22,6 @@ import java.util.List; import java.util.Map; import java.util.Optional; -import org.apache.iceberg.MetadataColumns; import org.apache.iceberg.Schema; import org.apache.iceberg.parquet.ParquetSchemaUtil; import org.apache.iceberg.parquet.ParquetValueReader; @@ -77,7 +76,7 @@ protected ParquetValueReader createReader( } protected abstract ParquetValueReader createStructReader( - List types, List> fieldReaders, Types.StructType structType); + List> fieldReaders, Types.StructType structType); protected 
abstract ParquetValueReader fixedReader(ColumnDescriptor desc); @@ -110,7 +109,6 @@ public ParquetValueReader struct( // the expected struct is ignored because nested fields are never found when the List> newFields = Lists.newArrayListWithExpectedSize(fieldReaders.size()); - List types = Lists.newArrayListWithExpectedSize(fieldReaders.size()); List fields = struct.getFields(); for (int i = 0; i < fields.size(); i += 1) { ParquetValueReader fieldReader = fieldReaders.get(i); @@ -118,11 +116,10 @@ public ParquetValueReader struct( Type fieldType = fields.get(i); int fieldD = type().getMaxDefinitionLevel(path(fieldType.getName())) - 1; newFields.add(ParquetValueReaders.option(fieldType, fieldD, fieldReader)); - types.add(fieldType); } } - return createStructReader(types, newFields, expected); + return createStructReader(newFields, expected); } } @@ -225,10 +222,12 @@ public ParquetValueReader message( @Override public ParquetValueReader struct( Types.StructType expected, GroupType struct, List> fieldReaders) { + if (null == expected) { + return createStructReader(ImmutableList.of(), null); + } + // match the expected struct's order Map> readersById = Maps.newHashMap(); - Map typesById = Maps.newHashMap(); - Map maxDefinitionLevelsById = Maps.newHashMap(); List fields = struct.getFields(); for (int i = 0; i < fields.size(); i += 1) { ParquetValueReader fieldReader = fieldReaders.get(i); @@ -237,55 +236,37 @@ public ParquetValueReader struct( int fieldD = type.getMaxDefinitionLevel(path(fieldType.getName())) - 1; int id = fieldType.getId().intValue(); readersById.put(id, ParquetValueReaders.option(fieldType, fieldD, fieldReader)); - typesById.put(id, fieldType); - if (idToConstant.containsKey(id)) { - maxDefinitionLevelsById.put(id, fieldD); - } } } - List expectedFields = - expected != null ? 
expected.fields() : ImmutableList.of(); + int constantDefinitionLevel = type.getMaxDefinitionLevel(currentPath()); + List expectedFields = expected.fields(); List> reorderedFields = Lists.newArrayListWithExpectedSize(expectedFields.size()); - List types = Lists.newArrayListWithExpectedSize(expectedFields.size()); - // Defaulting to parent max definition level - int defaultMaxDefinitionLevel = type.getMaxDefinitionLevel(currentPath()); + for (Types.NestedField field : expectedFields) { int id = field.fieldId(); - ParquetValueReader reader = readersById.get(id); - if (idToConstant.containsKey(id)) { - // containsKey is used because the constant may be null - int fieldMaxDefinitionLevel = - maxDefinitionLevelsById.getOrDefault(id, defaultMaxDefinitionLevel); - reorderedFields.add( - ParquetValueReaders.constant(idToConstant.get(id), fieldMaxDefinitionLevel)); - types.add(null); - } else if (id == MetadataColumns.ROW_POSITION.fieldId()) { - reorderedFields.add(ParquetValueReaders.position()); - types.add(null); - } else if (id == MetadataColumns.IS_DELETED.fieldId()) { - reorderedFields.add(ParquetValueReaders.constant(false)); - types.add(null); - } else if (reader != null) { - reorderedFields.add(reader); - types.add(typesById.get(id)); - } else if (field.initialDefault() != null) { - reorderedFields.add( - ParquetValueReaders.constant( - convertConstant(field.type(), field.initialDefault()), - maxDefinitionLevelsById.getOrDefault(id, defaultMaxDefinitionLevel))); - types.add(typesById.get(id)); - } else if (field.isOptional()) { - reorderedFields.add(ParquetValueReaders.nulls()); - types.add(null); - } else { - throw new IllegalArgumentException( - String.format("Missing required field: %s", field.name())); - } + ParquetValueReader reader = + ParquetValueReaders.replaceWithMetadataReader( + id, readersById.get(id), idToConstant, constantDefinitionLevel); + reorderedFields.add(defaultReader(field, reader, constantDefinitionLevel)); + } + + return createStructReader(reorderedFields, expected); + } + + private ParquetValueReader defaultReader( + Types.NestedField field, ParquetValueReader reader, int constantDL) { + if (reader != null) { + return reader; + } else if (field.initialDefault() != null) { + return ParquetValueReaders.constant( + convertConstant(field.type(), field.initialDefault()), constantDL); + } else if (field.isOptional()) { + return ParquetValueReaders.nulls(); } - return createStructReader(types, reorderedFields, expected); + throw new IllegalArgumentException(String.format("Missing required field: %s", field.name())); } @Override diff --git a/parquet/src/main/java/org/apache/iceberg/data/parquet/GenericParquetReaders.java b/parquet/src/main/java/org/apache/iceberg/data/parquet/GenericParquetReaders.java index e12f379b36bb..182412cfb54c 100644 --- a/parquet/src/main/java/org/apache/iceberg/data/parquet/GenericParquetReaders.java +++ b/parquet/src/main/java/org/apache/iceberg/data/parquet/GenericParquetReaders.java @@ -59,12 +59,24 @@ public static ParquetValueReader buildReader( return INSTANCE.createReader(expectedSchema, fileSchema, idToConstant); } - @Override + /** + * Create a struct reader. + * + * @deprecated will be removed in 1.10.0; use {@link #createStructReader(List, StructType)} + * instead. 
+ */ + @Deprecated protected ParquetValueReader createStructReader( List types, List> fieldReaders, StructType structType) { return ParquetValueReaders.recordReader(fieldReaders, structType); } + @Override + protected ParquetValueReader createStructReader( + List> fieldReaders, StructType structType) { + return ParquetValueReaders.recordReader(fieldReaders, structType); + } + @Override protected ParquetValueReader fixedReader(ColumnDescriptor desc) { return new GenericParquetReaders.FixedReader(desc); diff --git a/parquet/src/main/java/org/apache/iceberg/data/parquet/InternalReader.java b/parquet/src/main/java/org/apache/iceberg/data/parquet/InternalReader.java index 05613eb1de16..692a9857cf77 100644 --- a/parquet/src/main/java/org/apache/iceberg/data/parquet/InternalReader.java +++ b/parquet/src/main/java/org/apache/iceberg/data/parquet/InternalReader.java @@ -47,13 +47,26 @@ public static ParquetValueReader create( return (ParquetValueReader) INSTANCE.createReader(expectedSchema, fileSchema, idToConstant); } - @Override + /** + * Create a struct reader. + * + * @deprecated will be removed in 1.10.0; use {@link #createStructReader(List, StructType)} + * instead. + */ + @Deprecated @SuppressWarnings("unchecked") protected ParquetValueReader createStructReader( List types, List> fieldReaders, StructType structType) { return (ParquetValueReader) ParquetValueReaders.recordReader(fieldReaders, structType); } + @Override + @SuppressWarnings("unchecked") + protected ParquetValueReader createStructReader( + List> fieldReaders, StructType structType) { + return (ParquetValueReader) ParquetValueReaders.recordReader(fieldReaders, structType); + } + @Override protected ParquetValueReader fixedReader(ColumnDescriptor desc) { return new ParquetValueReaders.BytesReader(desc); diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetValueReaders.java b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetValueReaders.java index 63aac8006e2d..e91db8282e60 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetValueReaders.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetValueReaders.java @@ -29,6 +29,7 @@ import java.util.List; import java.util.Map; import java.util.UUID; +import org.apache.iceberg.MetadataColumns; import org.apache.iceberg.data.GenericRecord; import org.apache.iceberg.data.Record; import org.apache.iceberg.relocated.com.google.common.base.Preconditions; @@ -161,6 +162,25 @@ public static ParquetValueReader position() { return new PositionReader(); } + @SuppressWarnings("unchecked") + public static ParquetValueReader rowIds(Long baseRowId, ParquetValueReader idReader) { + if (baseRowId != null) { + return new RowIdReader(baseRowId, (ParquetValueReader) idReader); + } else { + return ParquetValueReaders.nulls(); + } + } + + @SuppressWarnings("unchecked") + public static ParquetValueReader lastUpdated( + Long baseRowId, Long fileLastUpdated, ParquetValueReader seqReader) { + if (fileLastUpdated != null && baseRowId != null) { + return new LastUpdatedSeqReader(fileLastUpdated, (ParquetValueReader) seqReader); + } else { + return ParquetValueReaders.nulls(); + } + } + public static ParquetValueReader uuids(ColumnDescriptor desc) { return new UUIDReader(desc); } @@ -174,6 +194,27 @@ public static ParquetValueReader recordReader( return new RecordReader(readers, struct); } + public static ParquetValueReader replaceWithMetadataReader( + int id, ParquetValueReader reader, Map idToConstant, int constantDL) { + if (id == 
MetadataColumns.ROW_ID.fieldId()) { + Long baseRowId = (Long) idToConstant.get(id); + return ParquetValueReaders.rowIds(baseRowId, reader); + } else if (id == MetadataColumns.LAST_UPDATED_SEQUENCE_NUMBER.fieldId()) { + // gate on the assigned first_row_id so both lineage fields are inherited together + Long baseRowId = (Long) idToConstant.get(MetadataColumns.ROW_ID.fieldId()); + Long fileSeqNumber = (Long) idToConstant.get(id); + return ParquetValueReaders.lastUpdated(baseRowId, fileSeqNumber, reader); + } else if (idToConstant.containsKey(id)) { + // containsKey is used because the constant may be null + return ParquetValueReaders.constant(idToConstant.get(id), constantDL); + } else if (id == MetadataColumns.ROW_POSITION.fieldId()) { + return ParquetValueReaders.position(); + } else if (id == MetadataColumns.IS_DELETED.fieldId()) { + return ParquetValueReaders.constant(false, constantDL); + } + + return reader; + } + private static class NullReader implements ParquetValueReader { private static final NullReader INSTANCE = new NullReader<>(); private static final ImmutableList> COLUMNS = ImmutableList.of(); @@ -237,36 +278,9 @@ private static class ConstantReader implements ParquetValueReader { this.children = NullReader.COLUMNS; } - ConstantReader(C constantValue, int definitionLevel) { + ConstantReader(C constantValue, int parentDl) { this.constantValue = constantValue; - this.column = - new TripleIterator() { - @Override - public int currentDefinitionLevel() { - return definitionLevel; - } - - @Override - public int currentRepetitionLevel() { - return 0; - } - - @Override - public N nextNull() { - return null; - } - - @Override - public boolean hasNext() { - return false; - } - - @Override - public Object next() { - return null; - } - }; - + this.column = new ConstantDLColumn<>(parentDl); this.children = ImmutableList.of(column); } @@ -287,6 +301,39 @@ public List> columns() { @Override public void setPageSource(PageReadStore pageStore) {} + + private static class ConstantDLColumn implements TripleIterator { + private final int definitionLevel; + + private ConstantDLColumn(int definitionLevel) { + this.definitionLevel = definitionLevel; + } + + @Override + public int currentDefinitionLevel() { + return definitionLevel; + } + + @Override + public int currentRepetitionLevel() { + return 0; + } + + @Override + public N nextNull() { + return null; + } + + @Override + public boolean hasNext() { + return false; + } + + @Override + public T next() { + return null; + } + } } private static class PositionReader implements ParquetValueReader { @@ -322,6 +369,81 @@ public void setPageSource(PageReadStore pageStore) { } } + private static class RowIdReader implements ParquetValueReader { + private final long firstRowId; + private final ParquetValueReader idReader; + private final ParquetValueReader posReader; + + private RowIdReader(long firstRowId, ParquetValueReader idReader) { + this.firstRowId = firstRowId; + this.idReader = idReader != null ?
idReader : nulls(); + this.posReader = position(); + } + + @Override + public Long read(Long reuse) { + // always call the position reader to keep the position accurate + long pos = posReader.read(null); + Long idFromFile = idReader.read(null); + if (idFromFile != null) { + return idFromFile; + } + + return firstRowId + pos; + } + + @Override + public TripleIterator column() { + return idReader.column(); + } + + @Override + public List> columns() { + return idReader.columns(); + } + + @Override + public void setPageSource(PageReadStore pageStore) { + idReader.setPageSource(pageStore); + posReader.setPageSource(pageStore); + } + } + + private static class LastUpdatedSeqReader implements ParquetValueReader { + private final long fileLastUpdated; + private final ParquetValueReader seqReader; + + private LastUpdatedSeqReader(long fileLastUpdated, ParquetValueReader seqReader) { + this.fileLastUpdated = fileLastUpdated; + this.seqReader = seqReader != null ? seqReader : nulls(); + } + + @Override + public Long read(Long reuse) { + Long rowLastUpdated = seqReader.read(null); + if (rowLastUpdated != null) { + return rowLastUpdated; + } + + return fileLastUpdated; + } + + @Override + public TripleIterator column() { + return seqReader.column(); + } + + @Override + public List> columns() { + return seqReader.columns(); + } + + @Override + public void setPageSource(PageReadStore pageStore) { + seqReader.setPageSource(pageStore); + } + } + public abstract static class PrimitiveReader implements ParquetValueReader { private final ColumnDescriptor desc; diff --git a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/SparkParquetReaders.java b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/SparkParquetReaders.java index 5a7fd50067b9..87c97cc7a663 100644 --- a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/SparkParquetReaders.java +++ b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/SparkParquetReaders.java @@ -24,7 +24,6 @@ import java.util.Arrays; import java.util.List; import java.util.Map; -import org.apache.iceberg.MetadataColumns; import org.apache.iceberg.Schema; import org.apache.iceberg.parquet.ParquetSchemaUtil; import org.apache.iceberg.parquet.ParquetValueReader; @@ -135,10 +134,12 @@ public ParquetValueReader message( @Override public ParquetValueReader struct( Types.StructType expected, GroupType struct, List> fieldReaders) { + if (null == expected) { + return new InternalRowReader(ImmutableList.of()); + } + // match the expected struct's order Map> readersById = Maps.newHashMap(); - Map typesById = Maps.newHashMap(); - Map maxDefinitionLevelsById = Maps.newHashMap(); List fields = struct.getFields(); for (int i = 0; i < fields.size(); i += 1) { Type fieldType = fields.get(i); @@ -146,50 +147,39 @@ public ParquetValueReader struct( if (fieldType.getId() != null) { int id = fieldType.getId().intValue(); readersById.put(id, ParquetValueReaders.option(fieldType, fieldD, fieldReaders.get(i))); - typesById.put(id, fieldType); - if (idToConstant.containsKey(id)) { - maxDefinitionLevelsById.put(id, fieldD); - } } } - List expectedFields = - expected != null ? 
expected.fields() : ImmutableList.of(); + int constantDefinitionLevel = type.getMaxDefinitionLevel(currentPath()); + List expectedFields = expected.fields(); List> reorderedFields = Lists.newArrayListWithExpectedSize(expectedFields.size()); - // Defaulting to parent max definition level - int defaultMaxDefinitionLevel = type.getMaxDefinitionLevel(currentPath()); + for (Types.NestedField field : expectedFields) { int id = field.fieldId(); - ParquetValueReader reader = readersById.get(id); - if (idToConstant.containsKey(id)) { - // containsKey is used because the constant may be null - int fieldMaxDefinitionLevel = - maxDefinitionLevelsById.getOrDefault(id, defaultMaxDefinitionLevel); - reorderedFields.add( - ParquetValueReaders.constant(idToConstant.get(id), fieldMaxDefinitionLevel)); - } else if (id == MetadataColumns.ROW_POSITION.fieldId()) { - reorderedFields.add(ParquetValueReaders.position()); - } else if (id == MetadataColumns.IS_DELETED.fieldId()) { - reorderedFields.add(ParquetValueReaders.constant(false)); - } else if (reader != null) { - reorderedFields.add(reader); - } else if (field.initialDefault() != null) { - reorderedFields.add( - ParquetValueReaders.constant( - SparkUtil.internalToSpark(field.type(), field.initialDefault()), - maxDefinitionLevelsById.getOrDefault(id, defaultMaxDefinitionLevel))); - } else if (field.isOptional()) { - reorderedFields.add(ParquetValueReaders.nulls()); - } else { - throw new IllegalArgumentException( - String.format("Missing required field: %s", field.name())); - } + ParquetValueReader reader = + ParquetValueReaders.replaceWithMetadataReader( + id, readersById.get(id), idToConstant, constantDefinitionLevel); + reorderedFields.add(defaultReader(field, reader, constantDefinitionLevel)); } return new InternalRowReader(reorderedFields); } + private ParquetValueReader defaultReader( + Types.NestedField field, ParquetValueReader reader, int constantDL) { + if (reader != null) { + return reader; + } else if (field.initialDefault() != null) { + return ParquetValueReaders.constant( + SparkUtil.internalToSpark(field.type(), field.initialDefault()), constantDL); + } else if (field.isOptional()) { + return ParquetValueReaders.nulls(); + } + + throw new IllegalArgumentException(String.format("Missing required field: %s", field.name())); + } + @Override public ParquetValueReader list( Types.ListType expectedList, GroupType array, ParquetValueReader elementReader) { diff --git a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/source/SparkBatch.java b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/source/SparkBatch.java index 11f054b11710..45bd48aea2ec 100644 --- a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/source/SparkBatch.java +++ b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/source/SparkBatch.java @@ -180,7 +180,10 @@ private boolean useCometBatchReads() { } private boolean supportsCometBatchReads(Types.NestedField field) { - return field.type().isPrimitiveType() && !field.type().typeId().equals(Type.TypeID.UUID); + return field.type().isPrimitiveType() + && !field.type().typeId().equals(Type.TypeID.UUID) + && field.fieldId() != MetadataColumns.ROW_ID.fieldId() + && field.fieldId() != MetadataColumns.LAST_UPDATED_SEQUENCE_NUMBER.fieldId(); } // conditions for using ORC batch reads: diff --git a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/SparkTestHelperBase.java b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/SparkTestHelperBase.java index 0b3d0244a087..e1b75ca55e34 100644 --- 
a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/SparkTestHelperBase.java +++ b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/SparkTestHelperBase.java @@ -63,10 +63,7 @@ protected void assertEquals( Object[] expected = expectedRows.get(row); Object[] actual = actualRows.get(row); assertThat(actual).as("Number of columns should match").hasSameSizeAs(expected); - for (int col = 0; col < actualRows.get(row).length; col += 1) { - String newContext = String.format("%s: row %d col %d", context, row + 1, col + 1); - assertEquals(newContext, expected, actual); - } + assertEquals(context + ": row " + (row + 1), expected, actual); } } @@ -83,7 +80,9 @@ protected void assertEquals(String context, Object[] expectedRow, Object[] actua assertEquals(newContext, (Object[]) expectedValue, (Object[]) actualValue); } } else if (expectedValue != ANY) { - assertThat(actualValue).as(context + " contents should match").isEqualTo(expectedValue); + assertThat(actualValue) + .as(context + " col " + (col + 1) + " contents should match") + .isEqualTo(expectedValue); } } } diff --git a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/data/AvroDataTest.java b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/data/AvroDataTest.java index 4e4000794ec4..a31138ae01a2 100644 --- a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/data/AvroDataTest.java +++ b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/data/AvroDataTest.java @@ -27,10 +27,15 @@ import java.math.BigDecimal; import java.nio.ByteBuffer; import java.nio.file.Path; +import java.util.List; +import java.util.Map; import java.util.UUID; import java.util.concurrent.atomic.AtomicInteger; import java.util.stream.Stream; +import org.apache.iceberg.MetadataColumns; import org.apache.iceberg.Schema; +import org.apache.iceberg.data.GenericRecord; +import org.apache.iceberg.data.Record; import org.apache.iceberg.expressions.Literal; import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.apache.iceberg.types.Type; @@ -51,6 +56,14 @@ public abstract class AvroDataTest { + private static final long FIRST_ROW_ID = 2_000L; + protected static final Map ID_TO_CONSTANT = + Map.of( + MetadataColumns.ROW_ID.fieldId(), + FIRST_ROW_ID, + MetadataColumns.LAST_UPDATED_SEQUENCE_NUMBER.fieldId(), + 34L); + protected abstract void writeAndValidate(Schema schema) throws IOException; protected void writeAndValidate(Schema writeSchema, Schema expectedSchema) throws IOException { @@ -58,6 +71,12 @@ protected void writeAndValidate(Schema writeSchema, Schema expectedSchema) throw "Cannot run test, writeAndValidate(Schema, Schema) is not implemented"); } + protected void writeAndValidate(Schema writeSchema, Schema expectedSchema, List records) + throws IOException { + throw new UnsupportedEncodingException( + "Cannot run test, writeAndValidate(Schema, Schema, List) is not implemented"); + } + protected boolean supportsDefaultValues() { return false; } @@ -66,6 +85,10 @@ protected boolean supportsNestedTypes() { return true; } + protected boolean supportsRowLineage() { + return false; + } + protected static final StructType SUPPORTED_PRIMITIVES = StructType.of( required(100, "id", LongType.get()), @@ -547,4 +570,39 @@ public void testPrimitiveTypeDefaultValues(Type.PrimitiveType type, Literal d writeAndValidate(writeSchema, readSchema); } + + @Test + public void testRowLineage() throws Exception { + Assumptions.assumeThat(supportsRowLineage()) + .as("Row lineage support is not implemented") + .isTrue(); + + Schema schema = + 
new Schema( + required(1, "id", LongType.get()), + required(2, "data", Types.StringType.get()), + MetadataColumns.ROW_ID, + MetadataColumns.LAST_UPDATED_SEQUENCE_NUMBER); + + GenericRecord record = GenericRecord.create(schema); + + writeAndValidate( + schema, + schema, + List.of( + record.copy(Map.of("id", 1L, "data", "a")), + record.copy(Map.of("id", 2L, "data", "b")), + record.copy( + Map.of( + "id", + 3L, + "data", + "c", + "_row_id", + 1_000L, + "_last_updated_sequence_number", + 33L)), + record.copy(Map.of("id", 4L, "data", "d", "_row_id", 1_001L)), + record.copy(Map.of("id", 5L, "data", "e")))); + } } diff --git a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/data/GenericsHelpers.java b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/data/GenericsHelpers.java index 501b46878bd2..43cecd0473fd 100644 --- a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/data/GenericsHelpers.java +++ b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/data/GenericsHelpers.java @@ -38,6 +38,8 @@ import java.util.List; import java.util.Map; import java.util.UUID; +import org.apache.iceberg.MetadataColumns; +import org.apache.iceberg.data.GenericDataUtil; import org.apache.iceberg.data.Record; import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.apache.iceberg.types.Type; @@ -205,12 +207,47 @@ private static void assertEqualsSafe(Type type, Object expected, Object actual) public static void assertEqualsUnsafe( Types.StructType struct, Record expected, InternalRow actual) { + assertEqualsUnsafe(struct, expected, actual, null, -1); + } + + public static void assertEqualsUnsafe( + Types.StructType struct, + Record expected, + InternalRow actual, + Map idToConstant, + int pos) { + Types.StructType expectedType = expected.struct(); List fields = struct.fields(); - for (int i = 0; i < fields.size(); i += 1) { - Type fieldType = fields.get(i).type(); + for (int readPos = 0; readPos < fields.size(); readPos += 1) { + Types.NestedField field = fields.get(readPos); + Types.NestedField expectedField = expectedType.field(field.fieldId()); - Object expectedValue = expected.get(i); - Object actualValue = actual.get(i, convert(fieldType)); + Type fieldType = field.type(); + Object actualValue = + actual.isNullAt(readPos) ? 
null : actual.get(readPos, convert(fieldType)); + + Object expectedValue; + if (expectedField != null) { + int id = expectedField.fieldId(); + if (id == MetadataColumns.ROW_ID.fieldId()) { + expectedValue = expected.getField(expectedField.name()); + if (expectedValue == null && idToConstant != null) { + expectedValue = (Long) idToConstant.get(id) + pos; + } + + } else if (id == MetadataColumns.LAST_UPDATED_SEQUENCE_NUMBER.fieldId()) { + expectedValue = expected.getField(expectedField.name()); + if (expectedValue == null && idToConstant != null) { + expectedValue = idToConstant.get(id); + } + + } else { + expectedValue = expected.getField(expectedField.name()); + } + } else { + // comparison expects Iceberg's generic representation + expectedValue = GenericDataUtil.internalToGeneric(field.type(), field.initialDefault()); + } assertEqualsUnsafe(fieldType, expectedValue, actualValue); } diff --git a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetReader.java b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetReader.java index 1cd4fccfdd3f..210e901bf6c0 100644 --- a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetReader.java +++ b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetReader.java @@ -18,18 +18,15 @@ */ package org.apache.iceberg.spark.data; -import static org.apache.iceberg.spark.data.TestHelpers.assertEqualsUnsafe; import static org.apache.iceberg.types.Types.NestedField.required; import static org.assertj.core.api.Assertions.assertThat; import static org.assertj.core.api.Assertions.assertThatThrownBy; import static org.assertj.core.api.Assumptions.assumeThat; -import java.io.File; import java.io.IOException; import java.util.Iterator; import java.util.List; import java.util.Map; -import org.apache.avro.generic.GenericData; import org.apache.hadoop.conf.Configuration; import org.apache.iceberg.DataFiles; import org.apache.iceberg.FileFormat; @@ -39,11 +36,15 @@ import org.apache.iceberg.Schema; import org.apache.iceberg.Table; import org.apache.iceberg.data.IcebergGenerics; +import org.apache.iceberg.data.RandomGenericData; import org.apache.iceberg.data.Record; +import org.apache.iceberg.data.parquet.GenericParquetWriter; import org.apache.iceberg.hadoop.HadoopTables; +import org.apache.iceberg.inmemory.InMemoryOutputFile; import org.apache.iceberg.io.CloseableIterable; import org.apache.iceberg.io.FileAppender; import org.apache.iceberg.io.InputFile; +import org.apache.iceberg.io.OutputFile; import org.apache.iceberg.parquet.Parquet; import org.apache.iceberg.parquet.ParquetUtil; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; @@ -69,6 +70,13 @@ protected void writeAndValidate(Schema schema) throws IOException { @Override protected void writeAndValidate(Schema writeSchema, Schema expectedSchema) throws IOException { + List expected = RandomGenericData.generate(writeSchema, 100, 0L); + writeAndValidate(writeSchema, expectedSchema, expected); + } + + @Override + protected void writeAndValidate(Schema writeSchema, Schema expectedSchema, List expected) + throws IOException { assumeThat( TypeUtil.find( writeSchema, @@ -76,30 +84,39 @@ protected void writeAndValidate(Schema writeSchema, Schema expectedSchema) throw .as("Parquet Avro cannot write non-string map keys") .isNull(); - List expected = RandomData.generateList(writeSchema, 100, 0L); - - File testFile = File.createTempFile("junit", null, temp.toFile()); - 
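For concreteness, this is what the updated helpers expect for the five rows written by testRowLineage when ID_TO_CONSTANT carries first_row_id 2_000 and file sequence number 34. A worked example using the hypothetical RowLineageExpectations sketch from earlier; the assertions only restate the rule and are not part of the patch.

```java
import static org.assertj.core.api.Assertions.assertThat;

import java.util.Map;
import org.apache.iceberg.MetadataColumns;

class RowLineageWorkedExample {
  void check() {
    Map<Integer, Long> constants =
        Map.of(
            MetadataColumns.ROW_ID.fieldId(), 2_000L,
            MetadataColumns.LAST_UPDATED_SEQUENCE_NUMBER.fieldId(), 34L);

    // rows at positions 0, 1 and 4 carry no lineage values: both fields are inherited
    assertThat(RowLineageExpectations.expectedRowId(null, constants, 0)).isEqualTo(2_000L);
    assertThat(RowLineageExpectations.expectedRowId(null, constants, 4)).isEqualTo(2_004L);
    assertThat(RowLineageExpectations.expectedLastUpdatedSeq(null, constants)).isEqualTo(34L);

    // the row at position 2 materializes both fields: the written values win
    assertThat(RowLineageExpectations.expectedRowId(1_000L, constants, 2)).isEqualTo(1_000L);
    assertThat(RowLineageExpectations.expectedLastUpdatedSeq(33L, constants)).isEqualTo(33L);

    // the row at position 3 materializes only _row_id: the sequence number is still inherited
    assertThat(RowLineageExpectations.expectedRowId(1_001L, constants, 3)).isEqualTo(1_001L);
    assertThat(RowLineageExpectations.expectedLastUpdatedSeq(null, constants)).isEqualTo(34L);
  }
}
```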
assertThat(testFile.delete()).as("Delete should succeed").isTrue(); - - try (FileAppender writer = - Parquet.write(Files.localOutput(testFile)).schema(writeSchema).named("test").build()) { + OutputFile output = new InMemoryOutputFile(); + try (FileAppender writer = + Parquet.write(output) + .schema(writeSchema) + .createWriterFunc(GenericParquetWriter::create) + .named("test") + .build()) { writer.addAll(expected); } try (CloseableIterable reader = - Parquet.read(Files.localInput(testFile)) + Parquet.read(output.toInputFile()) .project(expectedSchema) - .createReaderFunc(type -> SparkParquetReaders.buildReader(expectedSchema, type)) + .createReaderFunc( + type -> SparkParquetReaders.buildReader(expectedSchema, type, ID_TO_CONSTANT)) .build()) { Iterator rows = reader.iterator(); - for (GenericData.Record record : expected) { + int pos = 0; + for (Record record : expected) { assertThat(rows).as("Should have expected number of rows").hasNext(); - assertEqualsUnsafe(expectedSchema.asStruct(), record, rows.next()); + GenericsHelpers.assertEqualsUnsafe( + expectedSchema.asStruct(), record, rows.next(), ID_TO_CONSTANT, pos); + pos += 1; } assertThat(rows).as("Should not have extra rows").isExhausted(); } } + @Override + protected boolean supportsRowLineage() { + return true; + } + @Override protected boolean supportsDefaultValues() { return true;
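Taken together, the read path is wired as follows: PartitionUtil.constantsMap now seeds idToConstant with first_row_id and the file sequence number, and replaceWithMetadataReader turns those constants into RowIdReader and LastUpdatedSeqReader, which fall back to first_row_id + position and the file sequence number when the columns are not materialized. A sketch of that wiring, assuming the caller holds a FileScanTask and the Parquet file schema; the class name and the identity convert function are placeholders, and the exact constantsMap overload used by each engine differs.

```java
import java.util.Map;
import org.apache.iceberg.FileScanTask;
import org.apache.iceberg.Schema;
import org.apache.iceberg.data.Record;
import org.apache.iceberg.data.parquet.GenericParquetReaders;
import org.apache.iceberg.parquet.ParquetValueReader;
import org.apache.iceberg.util.PartitionUtil;
import org.apache.parquet.schema.MessageType;

class RowLineageReadWiring {
  ParquetValueReader<Record> readerFor(
      FileScanTask task, Schema expectedSchema, MessageType fileSchema) {
    // constantsMap now includes:
    //   ROW_ID -> task.file().firstRowId() (only when the file has one assigned)
    //   LAST_UPDATED_SEQUENCE_NUMBER -> task.file().fileSequenceNumber()
    Map<Integer, ?> idToConstant = PartitionUtil.constantsMap(task, (type, value) -> value);

    // replaceWithMetadataReader swaps the projected lineage columns for RowIdReader and
    // LastUpdatedSeqReader built from these constants
    return GenericParquetReaders.buildReader(expectedSchema, fileSchema, idToConstant);
  }
}
```

On the Spark side, SparkBatch now excludes _row_id and _last_updated_sequence_number from Comet batch reads, presumably because the vectorized Comet path does not yet synthesize these metadata columns, so they are served by the row-based readers above.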