fix(ingest): encode reserved characters when creating dataset urn (#5977)

Co-authored-by: Harshal Sheth <hsheth2@gmail.com>
This commit is contained in:
Mayuri Nehate 2022-09-21 05:29:02 +05:30 committed by GitHub
parent 46494bc0ad
commit b195b6c123
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 70 additions and 9 deletions

View File

@ -0,0 +1,19 @@
import urllib.parse
from typing import List
# Characters that are percent-encoded when they appear inside a URN part.
RESERVED_CHARS = [",", "(", ")"]


class UrnEncoder:
    """Percent-encode the reserved characters of strings used in URNs.

    Only the characters listed in ``RESERVED_CHARS`` are rewritten; every
    other character (including ``%``) is passed through unchanged.
    """

    @staticmethod
    def encode_string_array(arr: List[str]) -> List[str]:
        """Return a new list holding the encoded form of each string in *arr*."""
        return [UrnEncoder.encode_string(s) for s in arr]

    @staticmethod
    def encode_string(s: str) -> str:
        """Return *s* with every reserved character percent-encoded."""
        return "".join(UrnEncoder.encode_char(c) for c in s)

    @staticmethod
    def encode_char(c: str) -> str:
        """Return the percent-encoded form of *c* if it is reserved, else *c*.

        Raises:
            AssertionError: if *c* is not exactly one character long.
        """
        if len(c) != 1:
            # Raised explicitly instead of via an ``assert`` statement so the
            # check is not stripped when Python runs with -O; the exception
            # type and message are the same as before.
            raise AssertionError("Invalid input, Expected single character")
        return urllib.parse.quote(c) if c in RESERVED_CHARS else c

View File

@ -1,6 +1,7 @@
from typing import List, Optional
from datahub.configuration.source_common import ALL_ENV_TYPES
from datahub.utilities.urn_encoder import UrnEncoder
from datahub.utilities.urns.data_platform_urn import DataPlatformUrn
from datahub.utilities.urns.error import InvalidUrnError
from datahub.utilities.urns.urn import Urn
@ -15,7 +16,7 @@ class DatasetUrn(Urn):
ENTITY_TYPE: str = "dataset"
def __init__(self, entity_type: str, entity_id: List[str], domain: str = "li"):
# NOTE(review): the two consecutive super().__init__ calls below look like the
# before/after lines of a one-line diff change (markers stripped) - confirm
# that only the second, encoding call is meant to remain.
super().__init__(entity_type, entity_id, domain)
# Passes each entity_id part through UrnEncoder.encode_string_array before
# handing it to the parent constructor.
super().__init__(entity_type, UrnEncoder.encode_string_array(entity_id), domain)
@classmethod
def create_from_string(cls, urn_str: str) -> "DatasetUrn":

View File

@ -258,7 +258,7 @@
"upstreamDatabases": [
{
"id": "1ade1d51-bbc3-ed8d-25d2-c51f44b8b31b",
"name": "Sample - Superstore.xls",
"name": "Sample - Superstore, (new).xls",
"connectionType": "excel-direct",
"isEmbedded": true
}
@ -267,7 +267,7 @@
{
"id": "15714253-8e46-a209-63cc-700705b66de9",
"database": {
"name": "Sample - Superstore.xls"
"name": "Sample - Superstore, (new).xls"
},
"name": "People",
"schema": "",
@ -288,7 +288,7 @@
{
"id": "19be3c28-8e4d-ebac-b44d-8f0851d9f206",
"database": {
"name": "Sample - Superstore.xls"
"name": "Sample - Superstore, (new).xls"
},
"name": "Returns",
"schema": "",
@ -309,7 +309,7 @@
{
"id": "b0e0c3eb-6e53-e0f5-ded1-478d5d9f7281",
"database": {
"name": "Sample - Superstore.xls"
"name": "Sample - Superstore, (new).xls"
},
"name": "Orders",
"schema": "",

View File

@ -22897,7 +22897,7 @@
{
"proposedSnapshot": {
"com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": {
"urn": "urn:li:dataset:(urn:li:dataPlatform:external,sample - superstore.xls.people,PROD)",
"urn": "urn:li:dataset:(urn:li:dataPlatform:external,sample - superstore%2C %28new%29.xls.people,PROD)",
"aspects": [
{
"com.linkedin.pegasus2avro.common.BrowsePaths": {
@ -22966,7 +22966,7 @@
{
"proposedSnapshot": {
"com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": {
"urn": "urn:li:dataset:(urn:li:dataPlatform:external,sample - superstore.xls.returns,PROD)",
"urn": "urn:li:dataset:(urn:li:dataPlatform:external,sample - superstore%2C %28new%29.xls.returns,PROD)",
"aspects": [
{
"com.linkedin.pegasus2avro.common.BrowsePaths": {
@ -23035,7 +23035,7 @@
{
"proposedSnapshot": {
"com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": {
"urn": "urn:li:dataset:(urn:li:dataPlatform:external,sample - superstore.xls.orders,PROD)",
"urn": "urn:li:dataset:(urn:li:dataPlatform:external,sample - superstore%2C %28new%29.xls.orders,PROD)",
"aspects": [
{
"com.linkedin.pegasus2avro.common.BrowsePaths": {
@ -30907,7 +30907,7 @@
"changeType": "UPSERT",
"aspectName": "upstreamLineage",
"aspect": {
"value": "{\"upstreams\": [{\"auditStamp\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}, \"dataset\": \"urn:li:dataset:(urn:li:dataPlatform:external,sample - superstore.xls.people,PROD)\", \"type\": \"TRANSFORMED\"}, {\"auditStamp\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}, \"dataset\": \"urn:li:dataset:(urn:li:dataPlatform:external,sample - superstore.xls.returns,PROD)\", \"type\": \"TRANSFORMED\"}, {\"auditStamp\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}, \"dataset\": \"urn:li:dataset:(urn:li:dataPlatform:external,sample - superstore.xls.orders,PROD)\", \"type\": \"TRANSFORMED\"}]}",
"value": "{\"upstreams\": [{\"auditStamp\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}, \"dataset\": \"urn:li:dataset:(urn:li:dataPlatform:external,sample - superstore%2C %28new%29.xls.people,PROD)\", \"type\": \"TRANSFORMED\"}, {\"auditStamp\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}, \"dataset\": \"urn:li:dataset:(urn:li:dataPlatform:external,sample - superstore%2C %28new%29.xls.returns,PROD)\", \"type\": \"TRANSFORMED\"}, {\"auditStamp\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}, \"dataset\": \"urn:li:dataset:(urn:li:dataPlatform:external,sample - superstore%2C %28new%29.xls.orders,PROD)\", \"type\": \"TRANSFORMED\"}]}",
"contentType": "application/json"
},
"systemMetadata": {

View File

@ -21,3 +21,15 @@ def test_can_add_aspect():
assert builder.can_add_aspect(dataset_mce, DatasetPropertiesClass)
assert builder.can_add_aspect(dataset_mce, OwnershipClass)
assert not builder.can_add_aspect(dataset_mce, DataFlowInfoClass)
def test_create_dataset_urn_with_reserved_chars() -> None:
    """Check that ',', '(' and ')' in the arguments come out percent-encoded."""
    expected_urn = "urn:li:dataset:(urn:li:dataPlatform:platform%29,platform%2Cinstance.table_%28name%29,PROD)"

    actual_urn = builder.make_dataset_urn_with_platform_instance(
        "platform)",
        "table_(name)",
        "platform,instance",
        builder.DEFAULT_ENV,
    )

    assert actual_urn == expected_urn

View File

@ -0,0 +1,29 @@
import pytest
from datahub.utilities.urn_encoder import UrnEncoder
@pytest.mark.parametrize(
    "name",
    [
        "test-database.test-schema.test-table",
        "test_database.test$schema.test+table",
        "test&database.%testschema.test*table",
    ],
)
def test_encode_string_without_reserved_chars_no_change(name):
    """Strings without ',', '(' or ')' must come back from the encoder as-is."""
    encoded = UrnEncoder.encode_string(name)
    assert encoded == name
@pytest.mark.parametrize(
    "name",
    [
        "test-database,test-schema,test-table",
        "test_database,(test$schema),test+table",
        "test&database.test(schema.test*table",
    ],
)
def test_encode_string_with_reserved_chars(name):
    """Each ',', '(' and ')' must be replaced by its percent-encoded form."""
    # Build the expected value by applying the three substitutions in turn.
    expected = name
    for raw, escaped in ((",", "%2C"), ("(", "%28"), (")", "%29")):
        expected = expected.replace(raw, escaped)

    assert UrnEncoder.encode_string(name) == expected