Mirror of https://github.com/datahub-project/datahub.git (synced 2025-11-25 09:15:57 +00:00)
feat(ingest/kafka-connect): add config to lowercase urns, do not emit… (#7393)
Co-authored-by: John Joyce <john@acryl.io>
commit d436ab9f9b
parent 0cdf817499
@@ -54,9 +54,10 @@ class KafkaConnectSourceConfig(DatasetLineageProviderConfigBase):
     cluster_name: Optional[str] = Field(
         default="connect-cluster", description="Cluster to ingest from."
     )
-    construct_lineage_workunits: bool = Field(
-        default=True,
-        description="Whether to create the input and output Dataset entities",
+    # convert lineage dataset's urns to lowercase
+    convert_lineage_urns_to_lowercase: bool = Field(
+        default=False,
+        description="Whether to convert the urns of ingested lineage dataset to lowercase",
     )
     connector_patterns: AllowDenyPattern = Field(
         default=AllowDenyPattern.allow_all(),
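For context, a minimal sketch of how the new flag can be switched on when building the source config (illustrative only; the import path and the connect_uri field are assumptions based on the ingestion source this diff touches, not part of the commit):

# Hedged sketch, not part of the commit.
from datahub.ingestion.source.kafka_connect import KafkaConnectSourceConfig

config = KafkaConnectSourceConfig.parse_obj(
    {
        "connect_uri": "http://localhost:8083",
        # New flag from this change; it defaults to False, so existing recipes
        # keep case-preserving lineage urns unless they opt in.
        "convert_lineage_urns_to_lowercase": True,
    }
)
print(config.convert_lineage_urns_to_lowercase)  # True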
@@ -492,7 +493,7 @@ class ConfluentJDBCSourceConnector:

                 matcher = transform_regex.matcher(topic)
                 if matcher.matches():
-                    topic = matcher.replaceFirst(transform_replacement)
+                    topic = str(matcher.replaceFirst(transform_replacement))

                 # Additional check to confirm that the topic present
                 # in connector topics
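The str() wrapper above matters because the RegexRouter transform is evaluated with Java's java.util.regex classes (loaded through JPype in this source), and Matcher.replaceFirst returns a java.lang.String proxy rather than a Python str under JPype's default string handling; coercing it keeps later topic and urn handling on plain Python strings. A rough, hedged sketch of that behaviour (assumes JPype1 is installed, a JVM is available, and the pattern/topic values are made up):

# Hedged sketch, not part of the commit; requires `pip install JPype1` and a JVM.
import jpype
import jpype.imports

if not jpype.isJVMStarted():
    jpype.startJVM()

from java.util.regex import Pattern  # resolvable once the JVM is running

transform_regex = Pattern.compile("(.*)cdc\\.(.*)")
matcher = transform_regex.matcher("mysql.cdc.librarydb.member")
if matcher.matches():
    replaced = matcher.replaceFirst("$1$2")
    print(type(replaced))  # a java.lang.String proxy, not a Python str
    topic = str(replaced)  # plain Python str, safe for urn construction
    print(topic)           # mysql.librarydb.member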
@@ -1077,18 +1078,8 @@ class KafkaConnectSource(Source):
             for lineage in lineages:
                 source_dataset = lineage.source_dataset
                 source_platform = lineage.source_platform
-                source_platform_instance = (
-                    self.config.platform_instance_map.get(source_platform)
-                    if self.config.platform_instance_map
-                    else None
-                )
                 target_dataset = lineage.target_dataset
                 target_platform = lineage.target_platform
-                target_platform_instance = (
-                    self.config.platform_instance_map.get(target_platform)
-                    if self.config.platform_instance_map
-                    else None
-                )
                 job_property_bag = lineage.job_property_bag

                 job_id = (
@@ -1100,22 +1091,18 @@ class KafkaConnectSource(Source):

                 inlets = (
                     [
-                        builder.make_dataset_urn_with_platform_instance(
+                        self.make_lineage_dataset_urn(
                             source_platform,
                             source_dataset,
-                            platform_instance=source_platform_instance,
-                            env=self.config.env,
                         )
                     ]
                     if source_dataset
                     else []
                 )
                 outlets = [
-                    builder.make_dataset_urn_with_platform_instance(
+                    self.make_lineage_dataset_urn(
                         target_platform,
                         target_dataset,
-                        platform_instance=target_platform_instance,
-                        env=self.config.env,
                     )
                 ]

@@ -1152,69 +1139,6 @@ class KafkaConnectSource(Source):
             self.report.report_workunit(wu)
             yield wu

-    def construct_lineage_workunits(
-        self, connector: ConnectorManifest
-    ) -> Iterable[MetadataWorkUnit]:
-        lineages = connector.lineages
-        if lineages:
-            for lineage in lineages:
-                source_dataset = lineage.source_dataset
-                source_platform = lineage.source_platform
-                source_platform_instance = (
-                    self.config.platform_instance_map.get(source_platform)
-                    if self.config.platform_instance_map
-                    else None
-                )
-                target_dataset = lineage.target_dataset
-                target_platform = lineage.target_platform
-                target_platform_instance = (
-                    self.config.platform_instance_map.get(target_platform)
-                    if self.config.platform_instance_map
-                    else None
-                )
-
-                mcp = MetadataChangeProposalWrapper(
-                    entityUrn=builder.make_dataset_urn_with_platform_instance(
-                        target_platform,
-                        target_dataset,
-                        platform_instance=target_platform_instance,
-                        env=self.config.env,
-                    ),
-                    aspect=models.DataPlatformInstanceClass(
-                        platform=builder.make_data_platform_urn(target_platform),
-                        instance=builder.make_dataplatform_instance_urn(
-                            target_platform, target_platform_instance
-                        )
-                        if target_platform_instance
-                        else None,
-                    ),
-                )
-
-                wu = MetadataWorkUnit(id=target_dataset, mcp=mcp)
-                self.report.report_workunit(wu)
-                yield wu
-                if source_dataset:
-                    mcp = MetadataChangeProposalWrapper(
-                        entityUrn=builder.make_dataset_urn_with_platform_instance(
-                            source_platform,
-                            source_dataset,
-                            platform_instance=source_platform_instance,
-                            env=self.config.env,
-                        ),
-                        aspect=models.DataPlatformInstanceClass(
-                            platform=builder.make_data_platform_urn(source_platform),
-                            instance=builder.make_dataplatform_instance_urn(
-                                source_platform, source_platform_instance
-                            )
-                            if source_platform_instance
-                            else None,
-                        ),
-                    )
-
-                    wu = MetadataWorkUnit(id=source_dataset, mcp=mcp)
-                    self.report.report_workunit(wu)
-                    yield wu
-
     def get_workunits(self) -> Iterable[MetadataWorkUnit]:
         connectors_manifest = self.get_connectors_manifest()
         for connector in connectors_manifest:
@@ -1222,9 +1146,6 @@ class KafkaConnectSource(Source):
             if self.config.connector_patterns.allowed(name):
                 yield from self.construct_flow_workunit(connector)
                 yield from self.construct_job_workunits(connector)
-                if self.config.construct_lineage_workunits:
-                    yield from self.construct_lineage_workunits(connector)
-
                 self.report.report_connector_scanned(name)

             else:
@@ -1233,6 +1154,20 @@ class KafkaConnectSource(Source):
     def get_report(self) -> KafkaConnectSourceReport:
         return self.report

+    def make_lineage_dataset_urn(self, platform: str, name: str) -> str:
+        if self.config.convert_lineage_urns_to_lowercase:
+            name = name.lower()
+
+        platform_instance = (
+            self.config.platform_instance_map.get(platform)
+            if self.config.platform_instance_map
+            else None
+        )
+
+        return builder.make_dataset_urn_with_platform_instance(
+            platform, name, platform_instance, self.config.env
+        )
+

 # TODO: Find a more automated way to discover new platforms with 3 level naming hierarchy.
 def has_three_level_hierarchy(platform: str) -> bool:
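To illustrate what the new helper changes, here is a hedged, standalone rework of make_lineage_dataset_urn (not the committed code) showing the urns it would produce for a mixed-case table, assuming platform_instance_map = {"mysql": "mysql1"} and env = "PROD"; the printed urns are expectations, not captured output:

# Hedged illustration, not part of the commit.
from typing import Optional

import datahub.emitter.mce_builder as builder

PLATFORM_INSTANCE_MAP = {"mysql": "mysql1"}  # assumed recipe setting
ENV = "PROD"                                 # assumed recipe setting


def make_lineage_dataset_urn(platform: str, name: str, lowercase: bool) -> str:
    # Mirrors the helper above, with config values inlined for illustration.
    if lowercase:
        name = name.lower()
    platform_instance: Optional[str] = PLATFORM_INSTANCE_MAP.get(platform)
    return builder.make_dataset_urn_with_platform_instance(
        platform, name, platform_instance, ENV
    )


print(make_lineage_dataset_urn("mysql", "librarydb.MixedCaseTable", False))
# urn:li:dataset:(urn:li:dataPlatform:mysql,mysql1.librarydb.MixedCaseTable,PROD)
print(make_lineage_dataset_urn("mysql", "librarydb.MixedCaseTable", True))
# urn:li:dataset:(urn:li:dataPlatform:mysql,mysql1.librarydb.mixedcasetable,PROD)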
File diff suppressed because it is too large
@@ -1,72 +1,58 @@
 [
 {
     "entityType": "dataFlow",
     "entityUrn": "urn:li:dataFlow:(kafka-connect,source_mongodb_connector,PROD)",
     "changeType": "UPSERT",
     "aspectName": "dataFlowInfo",
     "aspect": {
-        "value": "{\"customProperties\": {}, \"name\": \"source_mongodb_connector\", \"description\": \"Source connector using `com.mongodb.kafka.connect.MongoSourceConnector` plugin.\"}",
-        "contentType": "application/json"
+        "json": {
+            "customProperties": {},
+            "name": "source_mongodb_connector",
+            "description": "Source connector using `com.mongodb.kafka.connect.MongoSourceConnector` plugin."
+        }
     },
     "systemMetadata": {
         "lastObserved": 1635166800000,
         "runId": "kafka-connect-run"
     }
 },
 {
     "entityType": "dataJob",
     "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(kafka-connect,source_mongodb_connector,PROD),test_db.purchases)",
     "changeType": "UPSERT",
     "aspectName": "dataJobInfo",
     "aspect": {
-        "value": "{\"customProperties\": {}, \"name\": \"source_mongodb_connector:test_db.purchases\", \"type\": {\"string\": \"COMMAND\"}}",
-        "contentType": "application/json"
+        "json": {
+            "customProperties": {},
+            "name": "source_mongodb_connector:test_db.purchases",
+            "type": {
+                "string": "COMMAND"
+            }
+        }
     },
     "systemMetadata": {
         "lastObserved": 1635166800000,
         "runId": "kafka-connect-run"
     }
 },
 {
     "entityType": "dataJob",
     "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(kafka-connect,source_mongodb_connector,PROD),test_db.purchases)",
     "changeType": "UPSERT",
     "aspectName": "dataJobInputOutput",
     "aspect": {
-        "value": "{\"inputDatasets\": [\"urn:li:dataset:(urn:li:dataPlatform:mongodb,test_db.purchases,PROD)\"], \"outputDatasets\": [\"urn:li:dataset:(urn:li:dataPlatform:kafka,mongodb.test_db.purchases,PROD)\"]}",
-        "contentType": "application/json"
+        "json": {
+            "inputDatasets": [
+                "urn:li:dataset:(urn:li:dataPlatform:mongodb,test_db.purchases,PROD)"
+            ],
+            "outputDatasets": [
+                "urn:li:dataset:(urn:li:dataPlatform:kafka,mongodb.test_db.purchases,PROD)"
+            ]
+        }
     },
     "systemMetadata": {
         "lastObserved": 1635166800000,
         "runId": "kafka-connect-run"
     }
-},
-{
-    "entityType": "dataset",
-    "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:kafka,mongodb.test_db.purchases,PROD)",
-    "changeType": "UPSERT",
-    "aspectName": "dataPlatformInstance",
-    "aspect": {
-        "value": "{\"platform\": \"urn:li:dataPlatform:kafka\"}",
-        "contentType": "application/json"
-    },
-    "systemMetadata": {
-        "lastObserved": 1635166800000,
-        "runId": "kafka-connect-run"
-    }
-},
-{
-    "entityType": "dataset",
-    "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mongodb,test_db.purchases,PROD)",
-    "changeType": "UPSERT",
-    "aspectName": "dataPlatformInstance",
-    "aspect": {
-        "value": "{\"platform\": \"urn:li:dataPlatform:mongodb\"}",
-        "contentType": "application/json"
-    },
-    "systemMetadata": {
-        "lastObserved": 1635166800000,
-        "runId": "kafka-connect-run"
-    }
-}
-]
+}
+]
@@ -13,7 +13,7 @@ source:
       - provider: env
         path_key: POSTGRES_CONNECTION_URL
         value: jdbc:postgresql://test_postgres:5432/postgres
-    construct_lineage_workunits: true
+    convert_lineage_urns_to_lowercase: true
     platform_instance_map: # optional
       mysql: mysql1 # optional
     connect_to_platform_map: # optional
@@ -14,6 +14,12 @@ CREATE TABLE member (
     PRIMARY KEY (id)
 );

+CREATE TABLE MixedCaseTable (
+    id INTEGER NOT NULL,
+    name VARCHAR ( 50 ) NOT NULL,
+    PRIMARY KEY (id)
+);
+

 INSERT INTO book (id, name, author) VALUES (1, 'Book1', 'ABC');
 INSERT INTO book (id, name, author) VALUES (2, 'Book2', 'PQR');
@@ -21,3 +27,5 @@ INSERT INTO book (id, name, author) VALUES (3, 'Book3', 'XYZ');

 INSERT INTO member(id, name) VALUES (1, 'Member1');
 INSERT INTO member(id, name) VALUES (2, 'Member2');
+
+INSERT INTO MixedCaseTable(id, name) VALUES (2, 'Member2');