Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit 5a5f402

Browse files
authored
Implement pluggable Lineage in Java SDK (#36781)
1 parent 4cdcf04 commit 5a5f402

11 files changed

Lines changed: 673 additions & 21 deletions

File tree

CHANGES.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,7 @@
7474
compatible. Both coders can decode encoded bytes from the other coder
7575
([#38139](https://github.com/apache/beam/issues/38139)).
7676
* (Python) Added type alias for with_exception_handling to be used for typehints. ([#38173](https://github.com/apache/beam/issues/38173)).
77+
* Added plugin mechanism to support different Lineage implementations (Java) ([#36790](https://github.com/apache/beam/issues/36790)).
7778

7879
## Breaking Changes
7980

sdks/java/core/src/main/java/org/apache/beam/sdk/io/FileSystems.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -578,6 +578,7 @@ public static void setDefaultPipelineOptions(PipelineOptions options) {
578578

579579
// entry to set other PipelineOption determined flags
580580
Metrics.setDefaultPipelineOptions(options);
581+
Lineage.setDefaultPipelineOptions(options);
581582

582583
while (true) {
583584
KV<Long, Integer> revision = FILESYSTEM_REVISION.get();
Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing, software
13+
* distributed under the License is distributed on an "AS IS" BASIS,
14+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
* See the License for the specific language governing permissions and
16+
* limitations under the License.
17+
*/
18+
package org.apache.beam.sdk.lineage;
19+
20+
import org.apache.beam.sdk.annotations.Internal;
21+
22+
/**
23+
* Plugin interface for lineage implementations.
24+
*
25+
* <p>This is the core contract that lineage plugins must implement. Custom implementations are
26+
* selected via the {@code --lineageType} pipeline option (see {@link LineageOptions}).
27+
*
28+
* <p>Implementations must provide a public constructor accepting ({@link
29+
* org.apache.beam.sdk.options.PipelineOptions}, {@link
30+
* org.apache.beam.sdk.metrics.Lineage.LineageDirection}).
31+
*
32+
* <p>End users should use the {@link org.apache.beam.sdk.metrics.Lineage} facade class instead of
33+
* implementing this interface directly.
34+
*/
35+
@Internal
36+
public interface LineageBase {
37+
/**
38+
* Adds the given FQN as lineage.
39+
*
40+
* @param rollupSegments should be an iterable of strings whose concatenation is a valid <a
41+
* href="https://cloud.google.com/data-catalog/docs/fully-qualified-names">Dataplex FQN </a>
42+
* which is already escaped.
43+
* <p>In particular, this means they will often have trailing delimiters.
44+
*/
45+
void add(Iterable<String> rollupSegments);
46+
}
Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing, software
13+
* distributed under the License is distributed on an "AS IS" BASIS,
14+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
* See the License for the specific language governing permissions and
16+
* limitations under the License.
17+
*/
18+
package org.apache.beam.sdk.lineage;
19+
20+
import org.apache.beam.sdk.options.Description;
21+
import org.apache.beam.sdk.options.PipelineOptions;
22+
import org.checkerframework.checker.nullness.qual.Nullable;
23+
24+
/**
25+
* Pipeline options for selecting a custom {@link LineageBase} implementation.
26+
*
27+
* <p>When not set, the default Metrics-based lineage is used. Can be set from the command line:
28+
* {@code --lineageType=com.example.MyLineage}
29+
*/
30+
public interface LineageOptions extends PipelineOptions {
31+
32+
@Description(
33+
"The fully qualified class name of the LineageBase implementation to use for recording "
34+
+ "lineage. The class must implement LineageBase and have a public constructor accepting "
35+
+ "(PipelineOptions, Lineage.LineageDirection). "
36+
+ "If not specified, the default Metrics-based lineage is used.")
37+
@Nullable
38+
Class<? extends LineageBase> getLineageType();
39+
40+
void setLineageType(@Nullable Class<? extends LineageBase> lineageClass);
41+
}
Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing, software
13+
* distributed under the License is distributed on an "AS IS" BASIS,
14+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
* See the License for the specific language governing permissions and
16+
* limitations under the License.
17+
*/
18+
/**
19+
* Lineage tracking support for Apache Beam pipelines.
20+
*
21+
* <p>This package provides a plugin mechanism to support different lineage implementations through
22+
* the {@link org.apache.beam.sdk.lineage.LineageBase} interface. Lineage implementations can be
23+
* selected via the {@code --lineageType} pipeline option to track data lineage information during
24+
* pipeline execution.
25+
*
26+
* <p>For lineage capabilities, see {@link org.apache.beam.sdk.metrics.Lineage}.
27+
*/
28+
package org.apache.beam.sdk.lineage;
Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing, software
13+
* distributed under the License is distributed on an "AS IS" BASIS,
14+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
* See the License for the specific language governing permissions and
16+
* limitations under the License.
17+
*/
18+
package org.apache.beam.sdk.metrics;
19+
20+
import org.apache.beam.sdk.lineage.LineageBase;
21+
import org.apache.beam.sdk.options.PipelineOptions;
22+
import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableList;
23+
24+
/**
25+
* Lineage implementation that stores lineage information in {@link BoundedTrie} metrics.
26+
*
27+
* <p>Used when {@link Metrics.MetricsFlag#lineageRollupEnabled()} is true.
28+
*/
29+
class BoundedTrieMetricsLineage implements LineageBase {
30+
31+
private final BoundedTrie metric;
32+
33+
@SuppressWarnings("unused")
34+
public BoundedTrieMetricsLineage(PipelineOptions options, Lineage.LineageDirection direction) {
35+
Lineage.Type type =
36+
(direction == Lineage.LineageDirection.SOURCE)
37+
? Lineage.Type.SOURCEV2
38+
: Lineage.Type.SINKV2;
39+
this.metric = Metrics.boundedTrie(Lineage.LINEAGE_NAMESPACE, type.toString());
40+
}
41+
42+
@Override
43+
public void add(Iterable<String> rollupSegments) {
44+
metric.add(ImmutableList.copyOf(rollupSegments));
45+
}
46+
}

sdks/java/core/src/main/java/org/apache/beam/sdk/metrics/Lineage.java

Lines changed: 90 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -17,51 +17,121 @@
1717
*/
1818
package org.apache.beam.sdk.metrics;
1919

20+
import static org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions.checkNotNull;
21+
2022
import java.util.ArrayList;
2123
import java.util.HashSet;
2224
import java.util.Iterator;
2325
import java.util.List;
2426
import java.util.Set;
2527
import java.util.regex.Pattern;
2628
import org.apache.beam.sdk.annotations.Internal;
29+
import org.apache.beam.sdk.lineage.LineageBase;
30+
import org.apache.beam.sdk.lineage.LineageOptions;
2731
import org.apache.beam.sdk.metrics.Metrics.MetricsFlag;
32+
import org.apache.beam.sdk.options.PipelineOptions;
2833
import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.annotations.VisibleForTesting;
2934
import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Splitter;
30-
import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableList;
3135
import org.checkerframework.checker.nullness.qual.Nullable;
36+
import org.slf4j.Logger;
37+
import org.slf4j.LoggerFactory;
3238

3339
/**
3440
* Standard collection of metrics used to record source and sinks information for lineage tracking.
3541
*/
3642
public class Lineage {
37-
3843
public static final String LINEAGE_NAMESPACE = "lineage";
39-
private static final Lineage SOURCES = new Lineage(Type.SOURCE);
40-
private static final Lineage SINKS = new Lineage(Type.SINK);
44+
private static final Logger LOG = LoggerFactory.getLogger(Lineage.class);
45+
46+
private static volatile @Nullable Lineage sources;
47+
private static volatile @Nullable Lineage sinks;
48+
private static volatile @Nullable Class<? extends LineageBase> currentLineageType;
49+
50+
private static final Object INIT_LOCK = new Object();
51+
4152
// Reserved characters are backtick, colon, whitespace (space, \t, \n) and dot.
4253
private static final Pattern RESERVED_CHARS = Pattern.compile("[:\\s.`]");
4354

44-
private final Metric metric;
55+
private final LineageBase delegate;
4556

46-
private Lineage(Type type) {
47-
if (MetricsFlag.lineageRollupEnabled()) {
48-
this.metric =
49-
Metrics.boundedTrie(
50-
LINEAGE_NAMESPACE,
51-
type == Type.SOURCE ? Type.SOURCEV2.toString() : Type.SINKV2.toString());
52-
} else {
53-
this.metric = Metrics.stringSet(LINEAGE_NAMESPACE, type.toString());
57+
/** Direction of a lineage record, handed to {@link LineageBase} implementations on creation. */
public enum LineageDirection {
  /** Lineage for data the pipeline reads. */
  SOURCE,
  /** Lineage for data the pipeline writes. */
  SINK
}
61+
62+
/**
 * Wraps the implementation that actually records lineage.
 *
 * @param delegate the backing implementation; must not be null
 */
private Lineage(LineageBase delegate) {
  checkNotNull(delegate, "delegate cannot be null");
  this.delegate = delegate;
}
65+
66+
@Internal
67+
public static void setDefaultPipelineOptions(PipelineOptions options) {
68+
checkNotNull(options, "options cannot be null");
69+
Class<? extends LineageBase> requestedType = options.as(LineageOptions.class).getLineageType();
70+
71+
if (canSkipInit(requestedType)) {
72+
return;
73+
}
74+
synchronized (INIT_LOCK) {
75+
if (canSkipInit(requestedType)) {
76+
return;
77+
}
78+
sources = createLineage(options, LineageDirection.SOURCE);
79+
sinks = createLineage(options, LineageDirection.SINK);
80+
currentLineageType = requestedType;
81+
LOG.debug("Lineage initialized with type {}", requestedType);
82+
}
83+
}
84+
85+
private static boolean canSkipInit(@Nullable Class<? extends LineageBase> requestedType) {
86+
if (sources == null) {
87+
return false;
88+
}
89+
// When no type is requested, preserve whatever is already initialized.
90+
// When a type is requested, only re-init if it differs from the active type.
91+
return requestedType == null || requestedType.equals(currentLineageType);
92+
}
93+
94+
private static Lineage createLineage(PipelineOptions options, LineageDirection direction) {
95+
Class<? extends LineageBase> lineageClass = options.as(LineageOptions.class).getLineageType();
96+
97+
if (lineageClass != null) {
98+
try {
99+
LineageBase lineage =
100+
lineageClass
101+
.getDeclaredConstructor(PipelineOptions.class, LineageDirection.class)
102+
.newInstance(options, direction);
103+
LOG.info("Using {} for lineage direction {}", lineageClass.getName(), direction);
104+
return new Lineage(lineage);
105+
} catch (ReflectiveOperationException e) {
106+
throw new IllegalArgumentException(
107+
"Failed to instantiate lineage implementation: "
108+
+ lineageClass.getName()
109+
+ ". The class must have a public constructor accepting "
110+
+ "(PipelineOptions, Lineage.LineageDirection).",
111+
e);
112+
}
54113
}
114+
115+
LOG.debug("Using default Metrics-based lineage for direction {}", direction);
116+
LineageBase defaultLineage =
117+
MetricsFlag.lineageRollupEnabled()
118+
? new BoundedTrieMetricsLineage(options, direction)
119+
: new StringSetMetricsLineage(options, direction);
120+
return new Lineage(defaultLineage);
55121
}
56122

57123
/** {@link Lineage} representing sources and optionally side inputs. */
58124
public static Lineage getSources() {
59-
return SOURCES;
125+
return checkNotNull(
126+
sources,
127+
"Lineage not initialized. FileSystems.setDefaultPipelineOptions must be called first.");
60128
}
61129

62130
/** {@link Lineage} representing sinks. */
63131
public static Lineage getSinks() {
64-
return SINKS;
132+
return checkNotNull(
133+
sinks,
134+
"Lineage not initialized. FileSystems.setDefaultPipelineOptions must be called first.");
65135
}
66136

67137
@VisibleForTesting
@@ -140,12 +210,7 @@ public void add(String system, Iterable<String> segments) {
140210
* <p>In particular, this means they will often have trailing delimiters.
141211
*/
142212
public void add(Iterable<String> rollupSegments) {
143-
ImmutableList<String> segments = ImmutableList.copyOf(rollupSegments);
144-
if (MetricsFlag.lineageRollupEnabled()) {
145-
((BoundedTrie) this.metric).add(segments);
146-
} else {
147-
((StringSet) this.metric).add(String.join("", segments));
148-
}
213+
delegate.add(rollupSegments);
149214
}
150215

151216
/**
@@ -156,6 +221,8 @@ public void add(Iterable<String> rollupSegments) {
156221
* @param truncatedMarker the marker to use to represent truncated FQNs.
157222
* @return A flat representation of all FQNs. If the FQN was truncated then it has a trailing
158223
* truncatedMarker.
224+
* <p>NOTE: When using a custom Lineage plugin, this method will return empty results since
225+
* lineage is not stored in Metrics.
159226
*/
160227
public static Set<String> query(MetricResults results, Type type, String truncatedMarker) {
161228
MetricQueryResults lineageQueryResults = getLineageQueryResults(results, type);
@@ -184,6 +251,8 @@ public static Set<String> query(MetricResults results, Type type, String truncat
184251
* @param results FQNs from the result
185252
* @param type sources or sinks
186253
* @return A flat representation of all FQNs. If the FQN was truncated then it has a trailing '*'.
254+
* <p>NOTE: When using a custom Lineage plugin, this method will return empty results since
255+
* lineage is not stored in Metrics.
187256
*/
188257
public static Set<String> query(MetricResults results, Type type) {
189258
if (MetricsFlag.lineageRollupEnabled()) {
Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing, software
13+
* distributed under the License is distributed on an "AS IS" BASIS,
14+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
* See the License for the specific language governing permissions and
16+
* limitations under the License.
17+
*/
18+
package org.apache.beam.sdk.metrics;
19+
20+
import org.apache.beam.sdk.lineage.LineageBase;
21+
import org.apache.beam.sdk.options.PipelineOptions;
22+
23+
/**
24+
* Lineage implementation that stores lineage information in {@link StringSet} metrics.
25+
*
26+
* <p>Used when {@link Metrics.MetricsFlag#lineageRollupEnabled()} is false.
27+
*/
28+
class StringSetMetricsLineage implements LineageBase {
29+
30+
private final StringSet metric;
31+
32+
@SuppressWarnings("unused")
33+
public StringSetMetricsLineage(PipelineOptions options, Lineage.LineageDirection direction) {
34+
Lineage.Type type =
35+
(direction == Lineage.LineageDirection.SOURCE) ? Lineage.Type.SOURCE : Lineage.Type.SINK;
36+
this.metric = Metrics.stringSet(Lineage.LINEAGE_NAMESPACE, type.toString());
37+
}
38+
39+
@Override
40+
public void add(Iterable<String> rollupSegments) {
41+
metric.add(String.join("", rollupSegments));
42+
}
43+
}

0 commit comments

Comments
 (0)