mirror of
https://github.com/datahub-project/datahub.git
synced 2025-12-28 10:28:22 +00:00
fix(ingest): encode reserved characters when creating dataset urn (#5977)
Co-authored-by: Harshal Sheth <hsheth2@gmail.com>
This commit is contained in:
parent
46494bc0ad
commit
b195b6c123
19
metadata-ingestion/src/datahub/utilities/urn_encoder.py
Normal file
19
metadata-ingestion/src/datahub/utilities/urn_encoder.py
Normal file
@ -0,0 +1,19 @@
|
||||
import urllib.parse
|
||||
from typing import List
|
||||
|
||||
RESERVED_CHARS = [",", "(", ")"]

# Translation table derived once from RESERVED_CHARS so that whole strings can
# be encoded in a single pass instead of one Python-level call per character.
_RESERVED_CHAR_TABLE = str.maketrans(
    {char: urllib.parse.quote(char) for char in RESERVED_CHARS}
)


class UrnEncoder:
    """Percent-encode the characters listed in ``RESERVED_CHARS``.

    Only ``,``, ``(`` and ``)`` are rewritten (to ``%2C``, ``%28`` and
    ``%29``); every other character, including ``%`` itself, is passed
    through unchanged. Because ``%`` is not escaped, an input that already
    contains a literal ``%2C`` encodes to the same text as one containing
    ``,``.
    """

    @staticmethod
    def encode_string_array(arr: List[str]) -> List[str]:
        """Return a new list with every element of ``arr`` encoded.

        Args:
            arr: Strings to encode; the list itself is not modified.

        Returns:
            A list of the same length holding ``encode_string`` of each item.
        """
        return [UrnEncoder.encode_string(s) for s in arr]

    @staticmethod
    def encode_string(s: str) -> str:
        """Return ``s`` with each reserved character percent-encoded.

        Args:
            s: The string to encode. An empty string yields an empty string.

        Returns:
            The encoded string; characters outside ``RESERVED_CHARS`` are
            left exactly as they were.
        """
        return s.translate(_RESERVED_CHAR_TABLE)

    @staticmethod
    def encode_char(c: str) -> str:
        """Return the percent-encoded form of ``c`` if it is reserved.

        Args:
            c: A string of exactly one character.

        Returns:
            ``urllib.parse.quote(c)`` when ``c`` is in ``RESERVED_CHARS``,
            otherwise ``c`` unchanged.

        Raises:
            AssertionError: If ``c`` is not exactly one character long.
        """
        # Raised explicitly instead of using an `assert` statement: asserts
        # are removed under `python -O`, which would silently let multi-char
        # or empty input through. The exception type and message are kept so
        # existing callers observe the same failure.
        if len(c) != 1:
            raise AssertionError("Invalid input, Expected single character")
        return urllib.parse.quote(c) if c in RESERVED_CHARS else c
|
||||
@ -1,6 +1,7 @@
|
||||
from typing import List, Optional
|
||||
|
||||
from datahub.configuration.source_common import ALL_ENV_TYPES
|
||||
from datahub.utilities.urn_encoder import UrnEncoder
|
||||
from datahub.utilities.urns.data_platform_urn import DataPlatformUrn
|
||||
from datahub.utilities.urns.error import InvalidUrnError
|
||||
from datahub.utilities.urns.urn import Urn
|
||||
@ -15,7 +16,7 @@ class DatasetUrn(Urn):
|
||||
ENTITY_TYPE: str = "dataset"
|
||||
|
||||
def __init__(self, entity_type: str, entity_id: List[str], domain: str = "li"):
|
||||
super().__init__(entity_type, entity_id, domain)
|
||||
super().__init__(entity_type, UrnEncoder.encode_string_array(entity_id), domain)
|
||||
|
||||
@classmethod
|
||||
def create_from_string(cls, urn_str: str) -> "DatasetUrn":
|
||||
|
||||
@ -258,7 +258,7 @@
|
||||
"upstreamDatabases": [
|
||||
{
|
||||
"id": "1ade1d51-bbc3-ed8d-25d2-c51f44b8b31b",
|
||||
"name": "Sample - Superstore.xls",
|
||||
"name": "Sample - Superstore, (new).xls",
|
||||
"connectionType": "excel-direct",
|
||||
"isEmbedded": true
|
||||
}
|
||||
@ -267,7 +267,7 @@
|
||||
{
|
||||
"id": "15714253-8e46-a209-63cc-700705b66de9",
|
||||
"database": {
|
||||
"name": "Sample - Superstore.xls"
|
||||
"name": "Sample - Superstore, (new).xls"
|
||||
},
|
||||
"name": "People",
|
||||
"schema": "",
|
||||
@ -288,7 +288,7 @@
|
||||
{
|
||||
"id": "19be3c28-8e4d-ebac-b44d-8f0851d9f206",
|
||||
"database": {
|
||||
"name": "Sample - Superstore.xls"
|
||||
"name": "Sample - Superstore, (new).xls"
|
||||
},
|
||||
"name": "Returns",
|
||||
"schema": "",
|
||||
@ -309,7 +309,7 @@
|
||||
{
|
||||
"id": "b0e0c3eb-6e53-e0f5-ded1-478d5d9f7281",
|
||||
"database": {
|
||||
"name": "Sample - Superstore.xls"
|
||||
"name": "Sample - Superstore, (new).xls"
|
||||
},
|
||||
"name": "Orders",
|
||||
"schema": "",
|
||||
|
||||
@ -22897,7 +22897,7 @@
|
||||
{
|
||||
"proposedSnapshot": {
|
||||
"com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": {
|
||||
"urn": "urn:li:dataset:(urn:li:dataPlatform:external,sample - superstore.xls.people,PROD)",
|
||||
"urn": "urn:li:dataset:(urn:li:dataPlatform:external,sample - superstore%2C %28new%29.xls.people,PROD)",
|
||||
"aspects": [
|
||||
{
|
||||
"com.linkedin.pegasus2avro.common.BrowsePaths": {
|
||||
@ -22966,7 +22966,7 @@
|
||||
{
|
||||
"proposedSnapshot": {
|
||||
"com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": {
|
||||
"urn": "urn:li:dataset:(urn:li:dataPlatform:external,sample - superstore.xls.returns,PROD)",
|
||||
"urn": "urn:li:dataset:(urn:li:dataPlatform:external,sample - superstore%2C %28new%29.xls.returns,PROD)",
|
||||
"aspects": [
|
||||
{
|
||||
"com.linkedin.pegasus2avro.common.BrowsePaths": {
|
||||
@ -23035,7 +23035,7 @@
|
||||
{
|
||||
"proposedSnapshot": {
|
||||
"com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": {
|
||||
"urn": "urn:li:dataset:(urn:li:dataPlatform:external,sample - superstore.xls.orders,PROD)",
|
||||
"urn": "urn:li:dataset:(urn:li:dataPlatform:external,sample - superstore%2C %28new%29.xls.orders,PROD)",
|
||||
"aspects": [
|
||||
{
|
||||
"com.linkedin.pegasus2avro.common.BrowsePaths": {
|
||||
@ -30907,7 +30907,7 @@
|
||||
"changeType": "UPSERT",
|
||||
"aspectName": "upstreamLineage",
|
||||
"aspect": {
|
||||
"value": "{\"upstreams\": [{\"auditStamp\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}, \"dataset\": \"urn:li:dataset:(urn:li:dataPlatform:external,sample - superstore.xls.people,PROD)\", \"type\": \"TRANSFORMED\"}, {\"auditStamp\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}, \"dataset\": \"urn:li:dataset:(urn:li:dataPlatform:external,sample - superstore.xls.returns,PROD)\", \"type\": \"TRANSFORMED\"}, {\"auditStamp\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}, \"dataset\": \"urn:li:dataset:(urn:li:dataPlatform:external,sample - superstore.xls.orders,PROD)\", \"type\": \"TRANSFORMED\"}]}",
|
||||
"value": "{\"upstreams\": [{\"auditStamp\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}, \"dataset\": \"urn:li:dataset:(urn:li:dataPlatform:external,sample - superstore%2C %28new%29.xls.people,PROD)\", \"type\": \"TRANSFORMED\"}, {\"auditStamp\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}, \"dataset\": \"urn:li:dataset:(urn:li:dataPlatform:external,sample - superstore%2C %28new%29.xls.returns,PROD)\", \"type\": \"TRANSFORMED\"}, {\"auditStamp\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}, \"dataset\": \"urn:li:dataset:(urn:li:dataPlatform:external,sample - superstore%2C %28new%29.xls.orders,PROD)\", \"type\": \"TRANSFORMED\"}]}",
|
||||
"contentType": "application/json"
|
||||
},
|
||||
"systemMetadata": {
|
||||
|
||||
@ -21,3 +21,15 @@ def test_can_add_aspect():
|
||||
assert builder.can_add_aspect(dataset_mce, DatasetPropertiesClass)
|
||||
assert builder.can_add_aspect(dataset_mce, OwnershipClass)
|
||||
assert not builder.can_add_aspect(dataset_mce, DataFlowInfoClass)
|
||||
|
||||
|
||||
def test_create_dataset_urn_with_reserved_chars() -> None:
    """Reserved characters in the URN parts come out percent-encoded.

    The platform, table name and platform instance passed in each contain
    a comma or parenthesis; the resulting dataset URN must carry them as
    ``%2C``, ``%28`` and ``%29`` respectively.
    """
    actual_urn = builder.make_dataset_urn_with_platform_instance(
        "platform)",
        "table_(name)",
        "platform,instance",
        builder.DEFAULT_ENV,
    )
    expected_urn = "urn:li:dataset:(urn:li:dataPlatform:platform%29,platform%2Cinstance.table_%28name%29,PROD)"

    assert actual_urn == expected_urn
|
||||
|
||||
29
metadata-ingestion/tests/unit/utilities/test_urn_encoder.py
Normal file
29
metadata-ingestion/tests/unit/utilities/test_urn_encoder.py
Normal file
@ -0,0 +1,29 @@
|
||||
import pytest
|
||||
|
||||
from datahub.utilities.urn_encoder import UrnEncoder
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "name",
    [
        "test-database.test-schema.test-table",
        "test_database.test$schema.test+table",
        "test&database.%testschema.test*table",
    ],
)
def test_encode_string_without_reserved_chars_no_change(name):
    """Strings without ``,``, ``(`` or ``)`` pass through the encoder as-is."""
    encoded = UrnEncoder.encode_string(name)

    assert encoded == name
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "name",
    [
        "test-database,test-schema,test-table",
        "test_database,(test$schema),test+table",
        "test&database.test(schema.test*table",
    ],
)
def test_encode_string_with_reserved_chars(name):
    """Each ``,``, ``(`` and ``)`` is replaced by its percent-encoded form."""
    encoded = UrnEncoder.encode_string(name)

    # Build the expectation by substituting each reserved character in turn.
    expected = name
    for raw, escaped in ((",", "%2C"), ("(", "%28"), (")", "%29")):
        expected = expected.replace(raw, escaped)

    assert encoded == expected
|
||||
Loading…
x
Reference in New Issue
Block a user