Mirror of https://github.com/datahub-project/datahub.git
Start adding java ETL examples, starting with kafka etl. (#1805)
We've had a few requests to start providing Java examples rather than Python, due to type safety. I've also started adding these to metadata-ingestion-examples to make it clearer that these are *examples*. They can be used directly or as a basis for other things. As we port the examples to Java, we'll move the existing ones to contrib.
parent 91486a2ffd
commit 6ece2d6469

@@ -46,7 +46,7 @@ Please follow the [DataHub Quickstart Guide](docs/quickstart.md) to get a copy o
 * [Frontend](datahub-frontend)
 * [Web App](datahub-web)
 * [Generalized Metadata Service](gms)
-* [Metadata Ingestion](metadata-ingestion)
+* [Metadata Ingestion](metadata-ingestion-examples)
 * [Metadata Processing Jobs](metadata-jobs)

 ## Releases

@@ -44,7 +44,8 @@ project.ext.externalDependency = [
   'httpClient': 'org.apache.httpcomponents:httpclient:4.5.9',
   'jacksonCore': 'com.fasterxml.jackson.core:jackson-core:2.9.7',
   'jacksonDataBind': 'com.fasterxml.jackson.core:jackson-databind:2.9.7',
-  "javatuples": "org.javatuples:javatuples:1.2",
+  'javatuples': 'org.javatuples:javatuples:1.2',
+  'javaxInject' : 'javax.inject:javax.inject:1',
   'jerseyCore': 'org.glassfish.jersey.core:jersey-client:2.25.1',
   'jerseyGuava': 'org.glassfish.jersey.bundles.repackaged:jersey-guava:2.25.1',
   'jsonSimple': 'com.googlecode.json-simple:json-simple:1.1.1',
@@ -57,8 +58,8 @@ project.ext.externalDependency = [
   'mariadbConnector': 'org.mariadb.jdbc:mariadb-java-client:2.6.0',
   'mockito': 'org.mockito:mockito-core:3.0.0',
   'mysqlConnector': 'mysql:mysql-connector-java:5.1.47',
-  "neo4jHarness": "org.neo4j.test:neo4j-harness:3.4.11",
-  "neo4jJavaDriver": "org.neo4j.driver:neo4j-java-driver:4.0.0",
+  'neo4jHarness': 'org.neo4j.test:neo4j-harness:3.4.11',
+  'neo4jJavaDriver': 'org.neo4j.driver:neo4j-java-driver:4.0.0',
   'parseqTest': 'com.linkedin.parseq:parseq:3.0.7:test',
   'playDocs': 'com.typesafe.play:play-docs_2.11:2.6.18',
   'playGuice': 'com.typesafe.play:play-guice_2.11:2.6.18',
@@ -66,7 +67,7 @@ project.ext.externalDependency = [
   'playTest': 'com.typesafe.play:play-test_2.11:2.6.18',
   'postgresql': 'org.postgresql:postgresql:42.2.14',
   'reflections': 'org.reflections:reflections:0.9.11',
-  "rythmEngine": "org.rythmengine:rythm-engine:1.3.0",
+  'rythmEngine': 'org.rythmengine:rythm-engine:1.3.0',
   'servletApi': 'javax.servlet:javax.servlet-api:3.1.0',
   'springBeans': 'org.springframework:spring-beans:5.2.3.RELEASE',
   'springContext': 'org.springframework:spring-context:5.2.3.RELEASE',

contrib/metadata-ingestion/python/README.md (new file, 23 lines)
@@ -0,0 +1,23 @@
# Python ETL examples

ETL scripts written in Python.

## Prerequisites

1. Before running any Python metadata ingestion job, you should make sure that the DataHub backend services are all running.
   The easiest way to do that is through [Docker images](../../docker).
2. You also need to build the `mxe-schemas` module, as below.
   ```
   ./gradlew :metadata-events:mxe-schemas:build
   ```
   This is needed to generate `MetadataChangeEvent.avsc`, which is the schema for the `MetadataChangeEvent` Kafka topic.
3. All the scripts are written using Python 3 and most likely won't work with Python 2.x interpreters.
   You can verify the version of your Python using the following command.
   ```
   python --version
   ```
   We recommend using [pyenv](https://github.com/pyenv/pyenv) to install and manage your Python environment.
4. Before launching each ETL ingestion pipeline, you can install/verify the library versions as below.
   ```
   pip install --user -r requirements.txt
   ```

contrib/metadata-ingestion/python/kafka-etl/README.md (new file, 17 lines)
@@ -0,0 +1,17 @@
# Kafka ETL

## Ingest metadata from Kafka to DataHub
The kafka_etl script provides an ETL channel to communicate with your Kafka cluster.
```
➜ Configure your ZooKeeper environment variable in the file.
ZOOKEEPER # Your ZooKeeper host.

➜ Configure your Kafka environment variables in the file.
AVROLOADPATH # Your model event in Avro format.
KAFKATOPIC # Your event topic.
BOOTSTRAP # Kafka bootstrap server.
SCHEMAREGISTRY # Kafka schema registry host.

➜ python kafka_etl.py
```
This will bootstrap DataHub with your Kafka metadata as dataset entities.

@@ -12,19 +12,24 @@ dependencies {
  avsc project(':metadata-events:mxe-schemas')
}

def genDir = file("src/generated/java")

task avroCodeGen(type: com.commercehub.gradle.plugin.avro.GenerateAvroJavaTask, dependsOn: configurations.avsc) {
  source("$rootDir/metadata-events/mxe-schemas/src/renamed/avro")
  outputDir = file("src/generated/java")
  outputDir = genDir
  dependsOn(':metadata-events:mxe-schemas:renameNamespace')
}

compileJava.source(avroCodeGen.outputs)
build.dependsOn avroCodeGen

clean {
  project.delete('src/generated')
idea {
  module {
    sourceDirs += genDir
    generatedSourceDirs += genDir
  }
}

avroCodeGen.dependsOn(':metadata-events:mxe-schemas:renameNamespace')
project.rootProject.tasks.idea.dependsOn(avroCodeGen)

// Exclude classes from avro-schemas
jar {

@@ -53,7 +53,7 @@ public class EventUtils {

   @Nonnull
   private static Schema getAvroSchemaFromResource(@Nonnull String resourcePath) {
-    URL url = Resources.getResource(resourcePath);
+    URL url = EventUtils.class.getClassLoader().getResource(resourcePath);
     try {
       return Schema.parse(Resources.toString(url, Charsets.UTF_8));
     } catch (IOException e) {

metadata-ingestion-examples/README.md (new file, 47 lines)
@@ -0,0 +1,47 @@
# Metadata Ingestion

This directory contains example apps for ingesting data into DataHub.

You are more than welcome to use these examples directly, or to use them as a reference for your own jobs.

See the README of each example for more information on each.

### Common themes

All these examples ingest by firing MetadataChangeEvent Kafka events. They do not ingest directly into DataHub, though
this is possible. Instead, the mce-consumer-job should be running, listening for these events, and performing the
ingestion for us.

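To make that concrete, the rough shape of firing such an event, using the `KafkaMetadataEventProducer` and model classes added elsewhere in this commit, is sketched below. The `producer` and `schemaMetadata` arguments are assumed to be wired up the same way the kafka-etl example does it (see `KafkaConfig` and `KafkaEtl` further down); this is an illustration, not part of the committed code.

```java
import com.linkedin.common.FabricType;
import com.linkedin.common.urn.DataPlatformUrn;
import com.linkedin.common.urn.DatasetUrn;
import com.linkedin.metadata.aspect.DatasetAspect;
import com.linkedin.metadata.dao.producer.KafkaMetadataEventProducer;
import com.linkedin.metadata.snapshot.DatasetSnapshot;
import com.linkedin.schema.SchemaMetadata;
import org.apache.avro.generic.IndexedRecord;
import org.apache.kafka.clients.producer.Producer;

public final class McePublishSketch {
  private McePublishSketch() {
  }

  // Sketch only: mirrors the pattern used in KafkaEtl.java later in this commit.
  // Fires a MetadataChangeEvent for a Kafka topic modeled as a dataset; mce-consumer-job performs the actual ingestion.
  public static void publish(Producer<String, IndexedRecord> producer, SchemaMetadata schemaMetadata) {
    final KafkaMetadataEventProducer<DatasetSnapshot, DatasetAspect, DatasetUrn> eventProducer =
        new KafkaMetadataEventProducer<>(DatasetSnapshot.class, DatasetAspect.class, producer);
    eventProducer.produceSnapshotBasedMetadataChangeEvent(
        new DatasetUrn(new DataPlatformUrn("kafka"), schemaMetadata.getSchemaName(), FabricType.PROD),
        schemaMetadata);
    producer.flush();
  }
}
```
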
### A note on languages

We initially wrote these examples in Python (they still exist in `metadata-ingestion`; TODO to delete them once they're
all ported). The idea was that these were very small example scripts that should've been easy to use. However, upon
reflection, not all developers are familiar with Python, and the lack of types can hinder development. So the decision
was made to port the examples to Java.

You're more than welcome to extrapolate these examples into whatever languages you like. At LinkedIn, we primarily use
Java.

### Ingestion at LinkedIn

It is worth noting that we do not use any of these examples directly (in Java, Python, or anything else) at LinkedIn. We
have several different pipelines for ingesting data; it all depends on the source.

- Some pipelines are based on other Kafka events, where we'll transform some existing Kafka event into a metadata event.
  - For example, we get Kafka events for Hive changes. We make MCEs out of those Hive events to ingest Hive data.
- For others, we've directly instrumented existing pipelines / apps / jobs to also emit metadata events.
  - For example, TODO? Gobblin?
- For others still, we've created a series of offline jobs to ingest data.
  - For example, we have an Azkaban job to process our HDFS datasets.

For some sources of data, one of these example scripts may work fine. For others, it may make more sense to have some
custom logic, as in the list above. Notably, all these examples today are one-off (they run, fire events, and then stop);
you may wish to build continuous ingestion pipelines instead.

### "Real" Ingestion Applications

We appreciate any contributions of apps you may wish to make to ingest data from other sources.

TODO this section feels a little weird. Are our ingestion apps not really real apps? :p LDAP is real, as is Kafka.
Granted, these are just one-off apps to ingest. Maybe we should provide a library for these, then expose the one-off
apps as examples?

metadata-ingestion-examples/common/build.gradle (new file, 20 lines)
@@ -0,0 +1,20 @@
plugins {
  id 'java'
}

dependencies {
  compile project(':metadata-dao-impl:kafka-producer')

  compile externalDependency.javaxInject
  compile externalDependency.kafkaAvroSerde
  compile externalDependency.kafkaSerializers
  compile externalDependency.lombok
  compile externalDependency.springBeans
  compile externalDependency.springBootAutoconfigure
  compile externalDependency.springCore
  compile externalDependency.springKafka

  annotationProcessor externalDependency.lombok

  runtime externalDependency.logbackClassic
}

KafkaConfig.java (new file, 42 lines)
@@ -0,0 +1,42 @@
package com.linkedin.metadata.examples.configs;

import io.confluent.kafka.serializers.AbstractKafkaAvroSerDeConfig;
import io.confluent.kafka.serializers.KafkaAvroSerializer;
import java.util.Arrays;
import java.util.Map;
import org.apache.avro.generic.IndexedRecord;
import org.apache.kafka.clients.producer.KafkaProducer;
import org.apache.kafka.clients.producer.Producer;
import org.apache.kafka.common.serialization.StringSerializer;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.boot.autoconfigure.kafka.KafkaProperties;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;


@Configuration
public class KafkaConfig {
  @Value("${KAFKA_BOOTSTRAP_SERVER:localhost:29092}")
  private String kafkaBootstrapServers;

  @Value("${KAFKA_SCHEMAREGISTRY_URL:http://localhost:8081}")
  private String kafkaSchemaRegistryUrl;

  @Bean(name = "kafkaEventProducer")
  public Producer<String, IndexedRecord> kafkaListenerContainerFactory(KafkaProperties properties) {
    KafkaProperties.Producer producerProps = properties.getProducer();

    producerProps.setKeySerializer(StringSerializer.class);
    producerProps.setValueSerializer(KafkaAvroSerializer.class);

    // KAFKA_BOOTSTRAP_SERVER has precedence over SPRING_KAFKA_BOOTSTRAP_SERVERS
    if (kafkaBootstrapServers != null && kafkaBootstrapServers.length() > 0) {
      producerProps.setBootstrapServers(Arrays.asList(kafkaBootstrapServers.split(",")));
    } // else we rely on KafkaProperties which defaults to localhost:9092

    Map<String, Object> props = properties.buildProducerProperties();
    props.put(AbstractKafkaAvroSerDeConfig.SCHEMA_REGISTRY_URL_CONFIG, kafkaSchemaRegistryUrl);

    return new KafkaProducer<>(props);
  }
}

SchemaRegistryConfig.java (new file, 19 lines)
@@ -0,0 +1,19 @@
package com.linkedin.metadata.examples.configs;

import io.confluent.kafka.schemaregistry.client.CachedSchemaRegistryClient;
import io.confluent.kafka.schemaregistry.client.SchemaRegistryClient;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;


@Configuration
public class SchemaRegistryConfig {
  @Value("${SCHEMAREGISTRY_URL:http://localhost:8081}")
  private String schemaRegistryUrl;

  @Bean(name = "schemaRegistryClient")
  public SchemaRegistryClient schemaRegistryFactory() {
    return new CachedSchemaRegistryClient(schemaRegistryUrl, 512);
  }
}

ZooKeeperConfig.java (new file, 26 lines)
@@ -0,0 +1,26 @@
package com.linkedin.metadata.examples.configs;

import java.io.IOException;
import org.apache.zookeeper.Watcher;
import org.apache.zookeeper.ZooKeeper;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;


@Configuration
public class ZooKeeperConfig {
  @Value("${ZOOKEEPER:localhost:2181}")
  private String zookeeper;

  @Value("${ZOOKEEPER_TIMEOUT_MILLIS:3000}")
  private int timeoutMillis;

  @Bean(name = "zooKeeper")
  public ZooKeeper zooKeeperFactory() throws IOException {
    Watcher noopWatcher = event -> {
    };

    return new ZooKeeper(zookeeper, timeoutMillis, noopWatcher);
  }
}

metadata-ingestion-examples/kafka-etl/README.md (new file, 40 lines)
@@ -0,0 +1,40 @@
# Kafka ETL

A small application which reads existing Kafka topics from ZooKeeper, retrieves their schemas from the schema registry,
and then fires an MCE for each schema.

## Running the Application

First, ensure that the services this application depends on (schema registry, ZooKeeper, mce-consumer-job, GMS, etc.) are
all running.

This application can be run via Gradle:

```
./gradlew :metadata-ingestion-examples:kafka-etl:bootRun
```

Or by building and running the jar:

```
./gradlew :metadata-ingestion-examples:kafka-etl:build

java -jar metadata-ingestion-examples/kafka-etl/build/libs/kafka-etl.jar
```

### Environment Variables

See the files under `src/main/java/com/linkedin/metadata/examples/kafka/config` for a list of customizable Spring
environment variables.

### Common pitfalls

For events to be fired correctly, schemas must exist in the schema registry. If a topic was newly created but no schema
has been registered for it yet, this application will fail to retrieve the schema for that topic. Check the output of
the application to see if this happens. If you see a message like

```
io.confluent.kafka.schemaregistry.client.rest.exceptions.RestClientException: Subject not found.; error code: 40401
```

then the odds are good that you need to register the schema for this topic.
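
If you need to register a schema by hand, one option (not part of this commit) is to reuse the same `CachedSchemaRegistryClient` the job uses for lookups. The sketch below assumes the `<topic>-value` subject naming the job expects, a registry at `http://localhost:8081`, and a pre-5.5 Confluent client whose `register` method accepts an Avro `Schema`; the class name and its arguments are hypothetical.

```java
import io.confluent.kafka.schemaregistry.client.CachedSchemaRegistryClient;
import io.confluent.kafka.schemaregistry.client.SchemaRegistryClient;
import java.io.File;
import org.apache.avro.Schema;

public final class RegisterTopicSchema {
  public static void main(String[] args) throws Exception {
    // args[0] = topic name, args[1] = path to an Avro schema file (both hypothetical inputs for this sketch).
    String subject = args[0] + "-value"; // same subject naming the ETL job uses when looking up schemas
    SchemaRegistryClient client = new CachedSchemaRegistryClient("http://localhost:8081", 512);
    Schema schema = new Schema.Parser().parse(new File(args[1]));
    int id = client.register(subject, schema); // assumes the older client API that accepts an Avro Schema
    System.out.println("Registered schema id " + id + " for subject " + subject);
  }
}
```
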

metadata-ingestion-examples/kafka-etl/build.gradle (new file, 29 lines)
@@ -0,0 +1,29 @@
plugins {
  id 'org.springframework.boot'
  id 'java'
}

dependencies {
  compile project(':metadata-utils')
  compile project(':metadata-builders')
  compile project(':metadata-dao-impl:kafka-producer')
  compile project(':metadata-events:mxe-schemas')
  compile project(':metadata-ingestion-examples:common')

  compile externalDependency.javaxInject
  compile externalDependency.kafkaAvroSerde
  compile externalDependency.kafkaSerializers
  compile externalDependency.lombok
  compile externalDependency.springBeans
  compile externalDependency.springBootAutoconfigure
  compile externalDependency.springCore
  compile externalDependency.springKafka

  annotationProcessor externalDependency.lombok

  runtime externalDependency.logbackClassic
}

bootJar {
  mainClassName = 'com.linkedin.metadata.examples.kafka.KafkaEtlApplication'
}

KafkaEtl.java (new file, 115 lines)
@@ -0,0 +1,115 @@
package com.linkedin.metadata.examples.kafka;

import com.linkedin.common.AuditStamp;
import com.linkedin.common.FabricType;
import com.linkedin.common.urn.CorpuserUrn;
import com.linkedin.common.urn.DataPlatformUrn;
import com.linkedin.common.urn.DatasetUrn;
import com.linkedin.metadata.aspect.DatasetAspect;
import com.linkedin.metadata.dao.producer.KafkaMetadataEventProducer;
import com.linkedin.metadata.snapshot.DatasetSnapshot;
import com.linkedin.mxe.MetadataChangeEvent;
import com.linkedin.schema.KafkaSchema;
import com.linkedin.schema.SchemaField;
import com.linkedin.schema.SchemaFieldArray;
import com.linkedin.schema.SchemaFieldDataType;
import com.linkedin.schema.SchemaMetadata;
import com.linkedin.schema.StringType;
import io.confluent.kafka.schemaregistry.client.SchemaRegistryClient;
import java.util.List;
import javax.inject.Inject;
import javax.inject.Named;
import lombok.extern.slf4j.Slf4j;
import org.apache.avro.generic.IndexedRecord;
import org.apache.kafka.clients.producer.Producer;
import org.apache.zookeeper.ZooKeeper;
import org.springframework.boot.CommandLineRunner;
import org.springframework.stereotype.Component;


/**
 * Gathers Kafka topics from the local zookeeper instance and schemas from the schema registry, and then fires
 * MetadataChangeEvents for their schemas.
 *
 * <p>This should cause DataHub to be populated with this information, assuming it and the mce-consumer-job are running
 * locally.
 *
 * <p>Can be run with {@code ./gradlew :metadata-ingestion-examples:kafka-etl:bootRun}.
 */
@Slf4j
@Component
public final class KafkaEtl implements CommandLineRunner {
  private static final DataPlatformUrn KAFKA_URN = new DataPlatformUrn("kafka");

  @Inject
  @Named("kafkaEventProducer")
  private Producer<String, IndexedRecord> _producer;

  @Inject
  @Named("zooKeeper")
  private ZooKeeper _zooKeeper;

  @Inject
  @Named("schemaRegistryClient")
  private SchemaRegistryClient _schemaRegistryClient;

  private SchemaMetadata buildDatasetSchema(String datasetName, String schema, int schemaVersion) {
    final AuditStamp auditStamp = new AuditStamp();
    auditStamp.setTime(System.currentTimeMillis());
    auditStamp.setActor(new CorpuserUrn(System.getenv("USER")));
    final SchemaMetadata.PlatformSchema platformSchema = new SchemaMetadata.PlatformSchema();
    platformSchema.setKafkaSchema(new KafkaSchema().setDocumentSchema(schema));
    return new SchemaMetadata().setSchemaName(datasetName)
        .setPlatform(KAFKA_URN)
        .setCreated(auditStamp)
        .setLastModified(auditStamp)
        .setVersion(schemaVersion)
        .setHash("")
        .setPlatformSchema(platformSchema)
        .setFields(new SchemaFieldArray(new SchemaField().setFieldPath("")
            .setDescription("")
            .setNativeDataType("string")
            .setType(new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new StringType())))));
  }

  private void produceKafkaDatasetMce(SchemaMetadata schemaMetadata) {
    MetadataChangeEvent.class.getClassLoader().getResource("avro/com/linkedin/mxe/MetadataChangeEvent.avsc");

    // Kafka topics are considered datasets in the current DataHub metadata ecosystem.
    final KafkaMetadataEventProducer<DatasetSnapshot, DatasetAspect, DatasetUrn> eventProducer =
        new KafkaMetadataEventProducer<>(DatasetSnapshot.class, DatasetAspect.class, _producer);
    eventProducer.produceSnapshotBasedMetadataChangeEvent(
        new DatasetUrn(KAFKA_URN, schemaMetadata.getSchemaName(), FabricType.PROD), schemaMetadata);
    _producer.flush();
  }

  @Override
  public void run(String... args) throws Exception {
    log.info("Starting up");

    final List<String> topics = _zooKeeper.getChildren("/brokers/topics", false);
    for (String datasetName : topics) {
      if (datasetName.startsWith("_")) {
        continue;
      }

      final String topic = datasetName + "-value";
      io.confluent.kafka.schemaregistry.client.SchemaMetadata schemaMetadata;
      try {
        schemaMetadata = _schemaRegistryClient.getLatestSchemaMetadata(topic);
      } catch (Throwable t) {
        log.error("Failed to get schema for topic " + datasetName, t);
        log.error("Common failure: does this event schema exist in the schema registry?");
        continue;
      }

      if (schemaMetadata == null) {
        log.warn(String.format("Skipping topic without schema: %s", topic));
        continue;
      }
      log.trace(topic);

      produceKafkaDatasetMce(buildDatasetSchema(datasetName, schemaMetadata.getSchema(), schemaMetadata.getVersion()));
      log.info("Successfully fired MCE for " + datasetName);
    }
  }
}

KafkaEtlApplication.java (new file, 16 lines)
@@ -0,0 +1,16 @@
package com.linkedin.metadata.examples.kafka;

import org.springframework.boot.WebApplicationType;
import org.springframework.boot.autoconfigure.SpringBootApplication;
import org.springframework.boot.autoconfigure.elasticsearch.rest.RestClientAutoConfiguration;
import org.springframework.boot.builder.SpringApplicationBuilder;


@SuppressWarnings("checkstyle:HideUtilityClassConstructor")
@SpringBootApplication(exclude = {RestClientAutoConfiguration.class}, scanBasePackages = {
    "com.linkedin.metadata.examples.configs", "com.linkedin.metadata.examples.kafka"})
public class KafkaEtlApplication {
  public static void main(String[] args) {
    new SpringApplicationBuilder(KafkaEtlApplication.class).web(WebApplicationType.NONE).run(args);
  }
}

Logback configuration (new file, 40 lines)
@@ -0,0 +1,40 @@
<configuration>
  <property name="LOG_DIR" value="${LOG_DIR:- /tmp/datahub/logs}"/>

  <appender name="STDOUT" class="ch.qos.logback.core.ConsoleAppender">
    <encoder>
      <pattern>%d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n</pattern>
    </encoder>
  </appender>

  <appender name="FILE" class="ch.qos.logback.core.rolling.RollingFileAppender">
    <file>${LOG_DIR}/kafka-etl-java.log</file>
    <append>true</append>
    <encoder>
      <pattern>%d{HH:mm:ss} [%thread] %-5level %logger{36} - %msg%n</pattern>
    </encoder>
    <rollingPolicy class="ch.qos.logback.core.rolling.FixedWindowRollingPolicy">
      <FileNamePattern>${LOG_DIR}/kafka-etl.%i.log</FileNamePattern>
      <minIndex>1</minIndex>
      <maxIndex>3</maxIndex>
    </rollingPolicy>
    <triggeringPolicy class="ch.qos.logback.core.rolling.SizeBasedTriggeringPolicy">
      <MaxFileSize>100MB</MaxFileSize>
    </triggeringPolicy>
  </appender>

  <logger name="org.apache.kafka.clients" level="warn" additivity="false">
    <appender-ref ref="STDOUT" />
    <appender-ref ref="FILE"/>
  </logger>

  <logger name="com.linkedin.metadata.examples.kafka" level="info" additivity="false">
    <appender-ref ref="STDOUT" />
    <appender-ref ref="FILE"/>
  </logger>

  <root level="warn">
    <appender-ref ref="STDOUT" />
    <appender-ref ref="FILE"/>
  </root>
</configuration>

@@ -90,22 +90,6 @@ The ldap_etl provides you ETL channel to communicate with your LDAP server.
 ```
 This will bootstrap DataHub with your metadata in the LDAP server as an user entity.

-## Ingest metadata from Kafka to DataHub
-The kafka_etl provides you ETL channel to communicate with your kafka.
-```
-➜ Config your kafka environmental variable in the file.
-ZOOKEEPER # Your zookeeper host.
-
-➜ Config your Kafka broker environmental variable in the file.
-AVROLOADPATH # Your model event in avro format.
-KAFKATOPIC # Your event topic.
-BOOTSTRAP # Kafka bootstrap server.
-SCHEMAREGISTRY # Kafka schema registry host.
-
-➜ python kafka_etl.py
-```
-This will bootstrap DataHub with your metadata in the kafka as a dataset entity.
-
 ## Ingest metadata from MySQL to DataHub
 The mysql_etl provides you ETL channel to communicate with your MySQL.
 ```

@@ -18,6 +18,8 @@ include 'metadata-events:mxe-avro-1.7'
 include 'metadata-events:mxe-registration'
 include 'metadata-events:mxe-schemas'
 include 'metadata-events:mxe-utils-avro-1.7'
+include 'metadata-ingestion-examples:common'
+include 'metadata-ingestion-examples:kafka-etl'
 include 'metadata-jobs:mae-consumer-job'
 include 'metadata-jobs:mce-consumer-job'
 include 'metadata-models'