Address comments, rename rowlineage extract variable, use lists consistently for surfacing metadata columns, and include test refactorings that were done in 3.4/3.5
amogh-jahagirdar committed Jul 10, 2025
commit 5d20c37f71039acf8f4096650e81c761b40e74b6
@@ -177,19 +177,7 @@ public static Object[][] parameters() {
SparkCatalog.class.getName(),
ImmutableMap.of("type", "hadoop"),
FileFormat.PARQUET,
true,
WRITE_DISTRIBUTION_MODE_HASH,
true,
null,
LOCAL,
3
},
{
"testhadoop",
SparkCatalog.class.getName(),
ImmutableMap.of("type", "hadoop"),
FileFormat.PARQUET,
false,
RANDOM.nextBoolean(),
WRITE_DISTRIBUTION_MODE_HASH,
true,
null,
@@ -19,6 +19,10 @@
package org.apache.iceberg.spark.extensions;

import static org.apache.iceberg.MetadataColumns.schemaWithRowLineage;
import static org.apache.iceberg.PlanningMode.DISTRIBUTED;
import static org.apache.iceberg.PlanningMode.LOCAL;
import static org.apache.iceberg.TableProperties.WRITE_DISTRIBUTION_MODE_HASH;
import static org.apache.iceberg.TableProperties.WRITE_DISTRIBUTION_MODE_RANGE;
import static org.apache.iceberg.spark.Spark3Util.loadIcebergTable;
import static org.assertj.core.api.Assertions.assertThat;
import static org.assertj.core.api.Assumptions.assumeThat;
@@ -33,6 +37,7 @@
import org.apache.iceberg.FileFormat;
import org.apache.iceberg.Files;
import org.apache.iceberg.MetadataColumns;
import org.apache.iceberg.Parameters;
import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.Schema;
import org.apache.iceberg.Snapshot;
@@ -47,8 +52,11 @@
import org.apache.iceberg.io.DataWriter;
import org.apache.iceberg.io.OutputFile;
import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList;
import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap;
import org.apache.iceberg.relocated.com.google.common.collect.Iterables;
import org.apache.iceberg.relocated.com.google.common.collect.Lists;
import org.apache.iceberg.spark.SparkCatalog;
import org.apache.iceberg.spark.SparkSessionCatalog;
import org.apache.iceberg.spark.functions.BucketFunction;
import org.apache.iceberg.types.Types;
import org.apache.iceberg.util.Pair;
@@ -57,7 +65,6 @@
import org.apache.spark.sql.catalyst.parser.ParseException;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.TestTemplate;

public abstract class TestRowLevelOperationsWithLineage extends SparkRowLevelOperationsTestBase {
@@ -81,18 +88,68 @@ record ->
createRecord(SCHEMA, 103, "d", 3L, 1L),
createRecord(SCHEMA, 104, "e", 4L, 1L));

@Parameters(
name =
"catalogName = {0}, implementation = {1}, config = {2},"
+ " format = {3}, vectorized = {4}, distributionMode = {5},"
+ " fanout = {6}, branch = {7}, planningMode = {8}, formatVersion = {9}")
public static Object[][] parameters() {
return new Object[][] {
{
"testhadoop",
SparkCatalog.class.getName(),
ImmutableMap.of("type", "hadoop"),
FileFormat.PARQUET,
true,
WRITE_DISTRIBUTION_MODE_HASH,
true,
null,
LOCAL,
3
},
{
"testhadoop",
SparkCatalog.class.getName(),
ImmutableMap.of("type", "hadoop"),
FileFormat.PARQUET,
false,
WRITE_DISTRIBUTION_MODE_RANGE,
true,
null,
DISTRIBUTED,
3
},
{
"spark_catalog",
SparkSessionCatalog.class.getName(),
ImmutableMap.of(
"type",
"hive",
"default-namespace",
"default",
"clients",
"1",
"parquet-enabled",
"false",
"cache-enabled",
"false" // Spark will delete tables using v1, leaving the cache out of sync
),
FileFormat.AVRO,
false,
WRITE_DISTRIBUTION_MODE_RANGE,
false,
null,
DISTRIBUTED,
3
},
};
}

@BeforeAll
public static void setupSparkConf() {
spark.conf().set("spark.sql.shuffle.partitions", "4");
}

@BeforeEach
public void beforeEach() {
assumeThat(formatVersion).isGreaterThanOrEqualTo(3);
// ToDo: Remove these as row lineage inheritance gets implemented in the other readers
assumeThat(fileFormat).isEqualTo(FileFormat.PARQUET);
}

@AfterEach
public void removeTables() {
sql("DROP TABLE IF EXISTS %s", tableName);
@@ -437,7 +494,6 @@ protected void appendUnpartitionedRecords(Table table, List<Record> records) thr
appendRecords(table, partitionRecords(records, table.spec(), record -> null));
}

// Append unpartitioned records?
protected void appendRecords(Table table, PartitionMap<List<Record>> partitionedRecords)
throws IOException {
AppendFiles append = table.newAppend();
Expand Down Expand Up @@ -471,6 +527,7 @@ protected static Record createRecord(
}

private Snapshot latestSnapshot(Table table) {
table.refresh();
return branch != null ? table.snapshot(branch) : table.currentSnapshot();
}

@@ -36,6 +36,7 @@ public class ManifestFileBean implements ManifestFile, Serializable {
private Long addedSnapshotId = null;
private Integer content = null;
private Long sequenceNumber = null;
private Long firstRowId = null;

public static ManifestFileBean fromManifest(ManifestFile manifest) {
ManifestFileBean bean = new ManifestFileBean();
@@ -46,6 +47,7 @@ public static ManifestFileBean fromManifest(ManifestFile manifest) {
bean.setAddedSnapshotId(manifest.snapshotId());
bean.setContent(manifest.content().id());
bean.setSequenceNumber(manifest.sequenceNumber());
bean.setFirstRowId(manifest.firstRowId());

return bean;
}
@@ -98,6 +100,10 @@ public void setSequenceNumber(Long sequenceNumber) {
this.sequenceNumber = sequenceNumber;
}

public void setFirstRowId(Long firstRowId) {
this.firstRowId = firstRowId;
}

@Override
public String path() {
return path;
@@ -173,6 +179,11 @@ public ByteBuffer keyMetadata() {
return null;
}

@Override
public Long firstRowId() {
return firstRowId;
}

@Override
public ManifestFile copy() {
throw new UnsupportedOperationException("Cannot copy");
@@ -110,11 +110,31 @@ private SparkSession spark() {
public MetadataColumn[] metadataColumns() {
DataType sparkPartitionType = SparkSchemaUtil.convert(Partitioning.partitionType(icebergTable));
return new MetadataColumn[] {
new SparkMetadataColumn(MetadataColumns.SPEC_ID.name(), DataTypes.IntegerType, true),
new SparkMetadataColumn(MetadataColumns.PARTITION_COLUMN_NAME, sparkPartitionType, true),
new SparkMetadataColumn(MetadataColumns.FILE_PATH.name(), DataTypes.StringType, false),
new SparkMetadataColumn(MetadataColumns.ROW_POSITION.name(), DataTypes.LongType, false),
new SparkMetadataColumn(MetadataColumns.IS_DELETED.name(), DataTypes.BooleanType, false)
SparkMetadataColumn.builder()
.name(MetadataColumns.SPEC_ID.name())
.dataType(DataTypes.IntegerType)
.withNullability(true)
.build(),
SparkMetadataColumn.builder()
.name(MetadataColumns.PARTITION_COLUMN_NAME)
.dataType(sparkPartitionType)
.withNullability(true)
.build(),
SparkMetadataColumn.builder()
.name(MetadataColumns.FILE_PATH.name())
.dataType(DataTypes.StringType)
.withNullability(false)
.build(),
SparkMetadataColumn.builder()
.name(MetadataColumns.ROW_POSITION.name())
.dataType(DataTypes.LongType)
.withNullability(false)
.build(),
SparkMetadataColumn.builder()
.name(MetadataColumns.IS_DELETED.name())
.dataType(DataTypes.BooleanType)
.withNullability(false)
.build(),
};
}
}
@@ -18,6 +18,8 @@
*/
package org.apache.iceberg.spark.source;

import org.apache.iceberg.relocated.com.google.common.base.Preconditions;
import org.apache.iceberg.relocated.com.google.common.base.Strings;
import org.apache.spark.sql.connector.catalog.MetadataColumn;
import org.apache.spark.sql.types.DataType;
import org.apache.spark.sql.types.MetadataBuilder;
@@ -31,17 +33,62 @@ public class SparkMetadataColumn implements MetadataColumn {
private final boolean preserveOnUpdate;
private final boolean preserveOnDelete;

public SparkMetadataColumn(String name, DataType dataType, boolean isNullable) {
this(
name,
dataType,
isNullable,
MetadataColumn.PRESERVE_ON_REINSERT_DEFAULT,
MetadataColumn.PRESERVE_ON_UPDATE_DEFAULT,
MetadataColumn.PRESERVE_ON_DELETE_DEFAULT);
public static class Builder {
private String name;
private DataType dataType;
private boolean isNullable;
private boolean preserveOnReinsert = MetadataColumn.PRESERVE_ON_REINSERT_DEFAULT;
private boolean preserveOnUpdate = MetadataColumn.PRESERVE_ON_UPDATE_DEFAULT;
private boolean preserveOnDelete = MetadataColumn.PRESERVE_ON_DELETE_DEFAULT;

public Builder name(String fieldName) {
Preconditions.checkArgument(
!Strings.isNullOrEmpty(fieldName), "Cannot have a null or empty name");
this.name = fieldName;
return this;
}

public Builder dataType(DataType type) {
Preconditions.checkArgument(type != null, "Cannot have a null datatype");
this.dataType = type;
return this;
}

public Builder withNullability(boolean nullable) {
this.isNullable = nullable;
return this;
}

public Builder preserveOnReinsert(boolean shouldPreserveOnReinsert) {
this.preserveOnReinsert = shouldPreserveOnReinsert;
return this;
}

public Builder preserveOnUpdate(boolean shouldPreserveOnUpdate) {
this.preserveOnUpdate = shouldPreserveOnUpdate;
return this;
}

public Builder preserveOnDelete(boolean shouldPreserveOnDelete) {
this.preserveOnDelete = shouldPreserveOnDelete;
return this;
}

public SparkMetadataColumn build() {
Preconditions.checkArgument(
name != null, "Cannot build a SparkMetadataColumn with a null name");
Preconditions.checkArgument(
dataType != null, "Cannot build a SparkMetadataColumn with a null data type");
return new SparkMetadataColumn(
name, dataType, isNullable, preserveOnReinsert, preserveOnUpdate, preserveOnDelete);
}
}

public static Builder builder() {
return new Builder();
}

public SparkMetadataColumn(
private SparkMetadataColumn(
String name,
DataType dataType,
boolean isNullable,
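
As a usage note, here is a minimal, hypothetical sketch of constructing columns with the new builder. Only builder(), name(), dataType(), withNullability(), the preserve* setters, and build() come from this diff; the nullability and preserve values below are illustrative, not taken from the PR:

// Illustrative only: a nullable _spec_id column, and a _row_id column that
// is kept across row updates (values chosen for the example, not from this PR).
SparkMetadataColumn specId =
    SparkMetadataColumn.builder()
        .name(MetadataColumns.SPEC_ID.name())
        .dataType(DataTypes.IntegerType)
        .withNullability(true)
        .build();

SparkMetadataColumn rowId =
    SparkMetadataColumn.builder()
        .name(MetadataColumns.ROW_ID.name())
        .dataType(DataTypes.LongType)
        .withNullability(true)
        .preserveOnUpdate(true)
        .build();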
@@ -18,10 +18,12 @@
*/
package org.apache.iceberg.spark.source;

import java.util.List;
import org.apache.iceberg.IsolationLevel;
import org.apache.iceberg.MetadataColumns;
import org.apache.iceberg.Table;
import org.apache.iceberg.TableUtil;
import org.apache.iceberg.relocated.com.google.common.collect.Lists;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.connector.expressions.Expressions;
import org.apache.spark.sql.connector.expressions.NamedReference;
@@ -97,16 +99,16 @@ public DeltaWriteBuilder newWriteBuilder(LogicalWriteInfo info) {

@Override
public NamedReference[] requiredMetadataAttributes() {
NamedReference specId = Expressions.column(MetadataColumns.SPEC_ID.name());
NamedReference partition = Expressions.column(MetadataColumns.PARTITION_COLUMN_NAME);
List<NamedReference> metadataAttributes = Lists.newArrayList();
metadataAttributes.add(Expressions.column(MetadataColumns.SPEC_ID.name()));
metadataAttributes.add(Expressions.column(MetadataColumns.PARTITION_COLUMN_NAME));
if (TableUtil.supportsRowLineage(table)) {
Review comment (Contributor):

nit: I'm fine either way, but I think it would be good to align how this is done here (stores named references in an array) with SparkCopyOnWriteOperation (which stores named references in a list).

(A minimal sketch of the aligned, list-based version appears after this file's diff.)

NamedReference rowId = Expressions.column(MetadataColumns.ROW_ID.name());
NamedReference lastUpdatedSequenceNumber =
Expressions.column(MetadataColumns.LAST_UPDATED_SEQUENCE_NUMBER.name());
return new NamedReference[] {specId, partition, rowId, lastUpdatedSequenceNumber};
metadataAttributes.add(Expressions.column(MetadataColumns.ROW_ID.name()));
metadataAttributes.add(
Expressions.column(MetadataColumns.LAST_UPDATED_SEQUENCE_NUMBER.name()));
}

return new NamedReference[] {specId, partition};
return metadataAttributes.toArray(new NamedReference[0]);
}

@Override
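
To make the reviewer's suggestion above concrete, here is the aligned, list-based requiredMetadataAttributes() reconstructed from the added lines of the hunk above (a readability sketch only, since the rendered diff interleaves old and new lines):

@Override
public NamedReference[] requiredMetadataAttributes() {
  // Always surface the spec ID and partition metadata columns.
  List<NamedReference> metadataAttributes = Lists.newArrayList();
  metadataAttributes.add(Expressions.column(MetadataColumns.SPEC_ID.name()));
  metadataAttributes.add(Expressions.column(MetadataColumns.PARTITION_COLUMN_NAME));

  // Row lineage columns are only required when the table supports them.
  if (TableUtil.supportsRowLineage(table)) {
    metadataAttributes.add(Expressions.column(MetadataColumns.ROW_ID.name()));
    metadataAttributes.add(
        Expressions.column(MetadataColumns.LAST_UPDATED_SEQUENCE_NUMBER.name()));
  }

  return metadataAttributes.toArray(new NamedReference[0]);
}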
@@ -428,7 +428,7 @@ public DeltaWriter<InternalRow> createWriter(int partitionId, long taskId) {
.writeProperties(writeProperties)
.build();

Function<InternalRow, InternalRow> extractRowLineage =
Function<InternalRow, InternalRow> rowLineageExtractor =
context.dataSchema() != null
&& context.dataSchema().findField(MetadataColumns.ROW_ID.fieldId()) != null
? new ExtractRowLineageFromMetadata()
@@ -445,7 +445,7 @@ public DeltaWriter<InternalRow> createWriter(int partitionId, long taskId) {
writerFactory,
dataFileFactory,
deleteFileFactory,
extractRowLineage,
rowLineageExtractor,
context);

} else {
@@ -455,7 +455,7 @@
writerFactory,
dataFileFactory,
deleteFileFactory,
extractRowLineage,
rowLineageExtractor,
context);
}
}