Openlineage 1.31.0 upgrade - Tamas' changes (#13658)

Co-authored-by: treff7es <treff7es@gmail.com>

parent 2cee79fd4d
commit cfc05747aa
@@ -61,7 +61,7 @@ buildscript {
 ext.hazelcastVersion = '5.3.6'
 ext.ebeanVersion = '15.5.2'
 ext.googleJavaFormatVersion = '1.18.1'
-ext.openLineageVersion = '1.25.0'
+ext.openLineageVersion = '1.31.0'
 ext.logbackClassicJava8 = '1.2.12'
 ext.awsSdk2Version = '2.30.33'
 
@@ -24,7 +24,7 @@ When running jobs using spark-submit, the agent needs to be configured in the co
 
 ```text
 #Configuring DataHub spark agent jar
-spark.jars.packages io.acryl:acryl-spark-lineage:0.2.17
+spark.jars.packages io.acryl:acryl-spark-lineage:0.2.18
 spark.extraListeners datahub.spark.DatahubSparkListener
 spark.datahub.rest.server http://localhost:8080
 ```
@@ -32,7 +32,7 @@ spark.datahub.rest.server http://localhost:8080
 ## spark-submit command line
 
 ```sh
-spark-submit --packages io.acryl:acryl-spark-lineage:0.2.17 --conf "spark.extraListeners=datahub.spark.DatahubSparkListener" my_spark_job_to_run.py
+spark-submit --packages io.acryl:acryl-spark-lineage:0.2.18 --conf "spark.extraListeners=datahub.spark.DatahubSparkListener" my_spark_job_to_run.py
 ```
 
 ### Configuration Instructions: Amazon EMR
@@ -41,7 +41,7 @@ Set the following spark-defaults configuration properties as it
 stated [here](https://docs.aws.amazon.com/emr/latest/ReleaseGuide/emr-spark-configure.html)
 
 ```text
-spark.jars.packages io.acryl:acryl-spark-lineage:0.2.17
+spark.jars.packages io.acryl:acryl-spark-lineage:0.2.18
 spark.extraListeners datahub.spark.DatahubSparkListener
 spark.datahub.rest.server https://your_datahub_host/gms
 #If you have authentication set up then you also need to specify the Datahub access token
@@ -56,7 +56,7 @@ When running interactive jobs from a notebook, the listener can be configured wh
 spark = SparkSession.builder
 .master("spark://spark-master:7077")
 .appName("test-application")
-.config("spark.jars.packages", "io.acryl:acryl-spark-lineage:0.2.17")
+.config("spark.jars.packages", "io.acryl:acryl-spark-lineage:0.2.18")
 .config("spark.extraListeners", "datahub.spark.DatahubSparkListener")
 .config("spark.datahub.rest.server", "http://localhost:8080")
 .enableHiveSupport()
@@ -79,7 +79,7 @@ appName("test-application")
 config("spark.master","spark://spark-master:7077")
 .
 
-config("spark.jars.packages","io.acryl:acryl-spark-lineage:0.2.17")
+config("spark.jars.packages","io.acryl:acryl-spark-lineage:0.2.18")
 .
 
 config("spark.extraListeners","datahub.spark.DatahubSparkListener")
@@ -199,6 +199,7 @@ information like tokens.
 | spark.datahub.s3.filename | | | The name of the file where metadata will be written if it is not set random filename will be used on s3 if s3 emitter is set |
 | spark.datahub.log.mcps | | true | Set this to true to log MCPS to the log. By default, it is enabled. |
 | spark.datahub.legacyLineageCleanup.enabled | | false | Set this to true to remove legacy lineages from older Spark Plugin runs. This will remove those lineages from the Datasets which it adds to DataJob. By default, it is disabled. |
+| spark.datahub.capture_spark_plan | | false | Set this to true to capture the Spark plan. By default, it is disabled. |
 | spark.datahub.metadata.dataset.enableEnhancedMergeIntoExtraction | | false | Set this to true to enable enhanced table name extraction for Delta Lake MERGE INTO commands. This improves lineage tracking by including the target table name in the job name. By default, it is disabled. |
 
 ## What to Expect: The Metadata Model
@@ -386,6 +387,12 @@ Use Java 8 to build the project. The project uses Gradle as the build tool. To b
 
 ## Changelog
 
+### Version 0.2.18
+
+- _Changes_:
+  - OpenLineage 1.31.0 upgrade
+  - Add `spark.datahub.capture_spark_plan` option to capture the Spark plan. By default, it is disabled.
+
 ### Version 0.2.17
 
 - _Major changes_:
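For reference, the documented properties and the new flag combine as in this minimal spark-defaults sketch (values are illustrative and mirror the examples in the README hunks above):

```text
spark.jars.packages io.acryl:acryl-spark-lineage:0.2.18
spark.extraListeners datahub.spark.DatahubSparkListener
spark.datahub.rest.server http://localhost:8080
spark.datahub.capture_spark_plan true
```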
@@ -36,6 +36,7 @@ import java.net.URISyntaxException;
 import java.time.Instant;
 import java.util.HashMap;
 import java.util.Map;
+import java.util.Objects;
 import java.util.Optional;
 import java.util.Properties;
 import org.apache.spark.SparkConf;
@@ -67,6 +68,7 @@ public class DatahubSparkListener extends SparkListener {
 private static ContextFactory contextFactory;
 private static CircuitBreaker circuitBreaker = new NoOpCircuitBreaker();
 private static final String sparkVersion = package$.MODULE$.SPARK_VERSION();
+private final SparkConf conf;
 
 private final Function0<Option<SparkContext>> activeSparkContext =
 ScalaConversionUtils.toScalaFn(SparkContext$.MODULE$::getActive);
@@ -74,8 +76,10 @@ public class DatahubSparkListener extends SparkListener {
 private static MeterRegistry meterRegistry;
 private boolean isDisabled;
 
-public DatahubSparkListener() throws URISyntaxException {
-listener = new OpenLineageSparkListener();
+public DatahubSparkListener(SparkConf conf) throws URISyntaxException {
+this.conf = ((SparkConf) Objects.requireNonNull(conf)).clone();
+
+listener = new OpenLineageSparkListener(conf);
 }
 
 private static SparkAppContext getSparkAppContext(
@@ -255,7 +259,10 @@ public class DatahubSparkListener extends SparkListener {
 SparkEnv sparkEnv = SparkEnv$.MODULE$.get();
 if (sparkEnv != null) {
 log.info("sparkEnv: {}", sparkEnv.conf().toDebugString());
-sparkEnv.conf().set("spark.openlineage.facets.disabled", "[spark_unknown;spark.logicalPlan]");
+if (datahubConf.hasPath("capture_spark_plan")
+&& datahubConf.getBoolean("capture_spark_plan")) {
+sparkEnv.conf().set("spark.openlineage.facets.spark.logicalPlan.disabled", "false");
+}
 }
 
 if (properties != null) {
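The listener hunks above replace the no-argument constructor with one that takes the SparkConf (Spark can hand the active conf to listeners registered via `spark.extraListeners` when such a constructor exists). A minimal sketch of constructing it by hand, say from a test; the property values are illustrative:

```java
import datahub.spark.DatahubSparkListener;
import org.apache.spark.SparkConf;

public class ListenerWiringSketch {
  public static void main(String[] args) throws Exception {
    // Illustrative values; in a real job Spark builds this conf from
    // spark-defaults / --conf flags and supplies it to the listener itself.
    SparkConf conf =
        new SparkConf()
            .setAppName("test-application")
            .set("spark.extraListeners", "datahub.spark.DatahubSparkListener")
            .set("spark.datahub.rest.server", "http://localhost:8080")
            .set("spark.datahub.capture_spark_plan", "true");

    // The no-argument constructor is gone; the SparkConf must be supplied up front.
    DatahubSparkListener listener = new DatahubSparkListener(conf);
    System.out.println("listener ready: " + listener);
  }
}
```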
@@ -1,5 +1,5 @@
 /*
-/* Copyright 2018-2024 contributors to the OpenLineage project
+/* Copyright 2018-2025 contributors to the OpenLineage project
 /* SPDX-License-Identifier: Apache-2.0
 */
 
@@ -173,20 +173,34 @@ public class PlanUtils {
 * and namespace.
 *
 * @param parentRunId
-* @param parentJob
+* @param parentJobName
 * @param parentJobNamespace
 * @return
 */
 public static OpenLineage.ParentRunFacet parentRunFacet(
-UUID parentRunId, String parentJob, String parentJobNamespace) {
+UUID parentRunId,
+String parentJobName,
+String parentJobNamespace,
+UUID rootParentRunId,
+String rootParentJobName,
+String rootParentJobNamespace) {
 return new OpenLineage(Versions.OPEN_LINEAGE_PRODUCER_URI)
 .newParentRunFacetBuilder()
 .run(new OpenLineage.ParentRunFacetRunBuilder().runId(parentRunId).build())
 .job(
 new OpenLineage.ParentRunFacetJobBuilder()
-.name(NameNormalizer.normalize(parentJob))
+.name(NameNormalizer.normalize(parentJobName))
 .namespace(parentJobNamespace)
 .build())
+.root(
+new OpenLineage.ParentRunFacetRootBuilder()
+.run(new OpenLineage.RootRunBuilder().runId(rootParentRunId).build())
+.job(
+new OpenLineage.RootJobBuilder()
+.namespace(rootParentJobNamespace)
+.name(rootParentJobName)
+.build())
+.build())
+.build())
 .build();
 }
 
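The PlanUtils hunk above widens `parentRunFacet` so the facet also carries the root parent run and job. A hedged usage sketch; the run IDs and job names are invented, and the imports assume the classes keep their upstream OpenLineage packages:

```java
import io.openlineage.client.OpenLineage;
import io.openlineage.spark.agent.util.PlanUtils;
import java.util.UUID;

public class ParentRunFacetSketch {
  public static void main(String[] args) {
    UUID parentRunId = UUID.randomUUID(); // direct parent run (e.g. a task run)
    UUID rootParentRunId = UUID.randomUUID(); // root of the run hierarchy (e.g. the DAG run)

    // Both the immediate parent job and the root job now end up on the facet.
    OpenLineage.ParentRunFacet facet =
        PlanUtils.parentRunFacet(
            parentRunId, "my_task", "my_namespace",
            rootParentRunId, "my_dag", "my_namespace");

    System.out.println(facet);
  }
}
```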
@@ -1,5 +1,5 @@
 /*
-/* Copyright 2018-2024 contributors to the OpenLineage project
+/* Copyright 2018-2025 contributors to the OpenLineage project
 /* SPDX-License-Identifier: Apache-2.0
 */
 
@@ -23,6 +23,7 @@ import org.apache.spark.sql.execution.datasources.FilePartition;
 import org.apache.spark.sql.execution.datasources.FileScanRDD;
 import scala.Tuple2;
 import scala.collection.immutable.Seq;
+import scala.collection.mutable.ArrayBuffer;
 
 /** Utility class to extract paths from RDD nodes. */
 @Slf4j
@@ -65,6 +66,11 @@ public class RddPathUtils {
 public Stream<Path> extract(HadoopRDD rdd) {
 org.apache.hadoop.fs.Path[] inputPaths = FileInputFormat.getInputPaths(rdd.getJobConf());
 Configuration hadoopConf = rdd.getConf();
+if (log.isDebugEnabled()) {
+log.debug("Hadoop RDD class {}", rdd.getClass());
+log.debug("Hadoop RDD input paths {}", Arrays.toString(inputPaths));
+log.debug("Hadoop RDD job conf {}", rdd.getJobConf());
+}
 return Arrays.stream(inputPaths).map(p -> PlanUtils.getDirectoryPath(p, hadoopConf));
 }
 }
@@ -78,6 +84,9 @@ public class RddPathUtils {
 
 @Override
 public Stream<Path> extract(MapPartitionsRDD rdd) {
+if (log.isDebugEnabled()) {
+log.debug("Parent RDD: {}", rdd.prev());
+}
 return findRDDPaths(rdd.prev());
 }
 }
@@ -122,7 +131,9 @@ public class RddPathUtils {
 try {
 Object data = FieldUtils.readField(rdd, "data", true);
 log.debug("ParallelCollectionRDD data: {}", data);
-if ((data instanceof Seq) && ((Seq) data).head() instanceof Tuple2) {
+if ((data instanceof Seq)
+&& (!((Seq<?>) data).isEmpty())
+&& ((Seq) data).head() instanceof Tuple2) {
 // exit if the first element is invalid
 Seq data_slice = (Seq) ((Seq) data).slice(0, SEQ_LIMIT);
 return ScalaConversionUtils.fromSeq(data_slice).stream()
@@ -140,6 +151,11 @@ public class RddPathUtils {
 return path;
 })
 .filter(Objects::nonNull);
+} else if ((data instanceof ArrayBuffer) && !((ArrayBuffer<?>) data).isEmpty()) {
+ArrayBuffer<?> dataBuffer = (ArrayBuffer<?>) data;
+return ScalaConversionUtils.fromSeq(dataBuffer.toSeq()).stream()
+.map(o -> parentOf(o.toString()))
+.filter(Objects::nonNull);
 } else {
 // Changed to debug to silence error
 log.debug("Cannot extract path from ParallelCollectionRDD {}", data);
@@ -156,6 +172,9 @@ public class RddPathUtils {
 try {
 return new Path(path).getParent();
 } catch (Exception e) {
+if (log.isDebugEnabled()) {
+log.debug("Cannot get parent of path {}", path, e);
+}
 return null;
 }
 }
@@ -1,5 +1,5 @@
 /*
-/* Copyright 2018-2024 contributors to the OpenLineage project
+/* Copyright 2018-2025 contributors to the OpenLineage project
 /* SPDX-License-Identifier: Apache-2.0
 */
 
@@ -1,5 +1,5 @@
 /*
-/* Copyright 2018-2024 contributors to the OpenLineage project
+/* Copyright 2018-2025 contributors to the OpenLineage project
 /* SPDX-License-Identifier: Apache-2.0
 */
 
@@ -21,6 +21,8 @@ public interface Vendors {
 Arrays.asList(
 // Add vendor classes here
 "io.openlineage.spark.agent.vendor.snowflake.SnowflakeVendor",
+"io.openlineage.spark.agent.vendor.iceberg.IcebergVendor",
+"io.openlineage.spark.agent.vendor.gcp.GcpVendor",
 // This is the only chance we have to add the RedshiftVendor to the list of vendors
 "io.openlineage.spark.agent.vendor.redshift.RedshiftVendor");
 
@@ -56,7 +58,7 @@ public interface Vendors {
 // and the app
 // https://github.com/OpenLineage/OpenLineage/issues/1860
 // ServiceLoader<Vendor> serviceLoader = ServiceLoader.load(Vendor.class);
-return new VendorsImpl(vendors);
+return new VendorsImpl(vendors, new VendorsContext());
 }
 
 static Vendors empty() {
@@ -71,10 +73,17 @@ public interface Vendors {
 public Collection<OpenLineageEventHandlerFactory> getEventHandlerFactories() {
 return Collections.emptyList();
 }
+
+@Override
+public VendorsContext getVendorsContext() {
+return new VendorsContext();
+}
 };
 }
 
 Collection<VisitorFactory> getVisitorFactories();
 
 Collection<OpenLineageEventHandlerFactory> getEventHandlerFactories();
+
+VendorsContext getVendorsContext();
 }
@@ -0,0 +1,28 @@
+/*
+/* Copyright 2018-2025 contributors to the OpenLineage project
+/* SPDX-License-Identifier: Apache-2.0
+*/
+
+package io.openlineage.spark.api;
+
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Optional;
+
+/** Class to store all the vendors related context information. */
+public class VendorsContext {
+private final Map<String, Object> contextMap = new HashMap<>();
+
+public void register(String key, Object value) {
+contextMap.put(key, value);
+}
+
+public Optional<Object> fromVendorsContext(String key) {
+return Optional.ofNullable(contextMap.get(key));
+}
+
+public boolean contains(String key) {
+return contextMap.containsKey(key);
+}
+;
+}
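The new `VendorsContext` above is a plain key-value holder that vendors can use to share state with the rest of the agent. A small illustrative sketch of its API; the key and value are made up:

```java
import io.openlineage.spark.api.VendorsContext;
import java.util.Optional;

public class VendorsContextSketch {
  public static void main(String[] args) {
    VendorsContext context = new VendorsContext();

    // A vendor stores something under a key of its choosing (both made up here).
    context.register("iceberg.catalog", "my_catalog");

    // Other components can look it up later without depending on that vendor directly.
    if (context.contains("iceberg.catalog")) {
      Optional<Object> value = context.fromVendorsContext("iceberg.catalog");
      value.ifPresent(v -> System.out.println("catalog = " + v));
    }
  }
}
```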
@@ -1,29 +1,27 @@
 /*
-/* Copyright 2018-2024 contributors to the OpenLineage project
+/* Copyright 2018-2025 contributors to the OpenLineage project
 /* SPDX-License-Identifier: Apache-2.0
 */
 
 package io.openlineage.spark.api;
 
 import io.openlineage.spark.agent.lifecycle.VisitorFactory;
-import io.openlineage.spark.agent.vendor.redshift.RedshiftVendor;
 import java.util.Collection;
 import java.util.List;
 import java.util.Optional;
 import java.util.stream.Collectors;
-import lombok.extern.slf4j.Slf4j;
 
-@Slf4j
 public class VendorsImpl implements Vendors {
 private final List<Vendor> vendors;
+private final VendorsContext vendorsContext;
 
-public VendorsImpl(List<Vendor> vendors) {
+public VendorsImpl(List<Vendor> vendors, VendorsContext vendorsContext) {
 this.vendors = vendors;
+this.vendorsContext = vendorsContext;
 }
 
 @Override
 public Collection<VisitorFactory> getVisitorFactories() {
-vendors.add(new RedshiftVendor());
 return vendors.stream()
 .map(Vendor::getVisitorFactory)
 .filter(Optional::isPresent)
@@ -39,4 +37,9 @@ public class VendorsImpl implements Vendors {
 .map(Optional::get)
 .collect(Collectors.toList());
 }
+
+@Override
+public VendorsContext getVendorsContext() {
+return vendorsContext;
+}
 }
@@ -67,6 +67,7 @@ import java.net.URI;
 import java.net.URISyntaxException;
 import java.time.ZonedDateTime;
 import java.util.Arrays;
+import java.util.Collections;
 import java.util.Comparator;
 import java.util.LinkedList;
 import java.util.List;
@@ -80,6 +81,7 @@ import java.util.stream.Stream;
 import lombok.extern.slf4j.Slf4j;
 import org.json.JSONArray;
 import org.json.JSONException;
+import org.json.JSONObject;
 
 @Slf4j
 public class OpenLineageToDataHub {
@@ -605,12 +607,23 @@ public class OpenLineageToDataHub {
 forEachValue(airflowProperties, customProperties);
 }
 break;
+case "spark.logicalPlan":
+{
+if (flowProperties) {
+JSONObject jsonObject = new JSONObject(entry.getValue().getAdditionalProperties());
+customProperties.put("spark.logicalPlan", jsonObject.toString());
+}
+}
+break;
 case "unknownSourceAttribute":
 {
 if (!flowProperties) {
 List<Map<String, Object>> unknownItems =
 (List<Map<String, Object>>)
-entry.getValue().getAdditionalProperties().get("unknownItems");
+entry
+.getValue()
+.getAdditionalProperties()
+.getOrDefault("unknownItems", Collections.emptyList());
 for (Map<String, Object> item : unknownItems) {
 forEachValue(item, customProperties);
 }