mirror of
https://github.com/datahub-project/datahub.git
synced 2025-08-23 00:28:03 +00:00
feat(ingest): lib - add better support for working with urns (#4172)
Co-authored-by: Xu Wang <xu.wang@grandrounds.com>
This commit is contained in:
parent
ede31c4177
commit
aa3363bcc2
@ -0,0 +1,31 @@
|
||||
from typing import List
|
||||
|
||||
from datahub.utilities.urns.error import InvalidUrnError
|
||||
from datahub.utilities.urns.urn import Urn
|
||||
|
||||
|
||||
class DataPlatformUrn(Urn):
|
||||
"""
|
||||
expected dataset urn format: urn:li:dataPlatform:<platform_name>. example: "urn:li:dataPlatform:hive"
|
||||
"""
|
||||
|
||||
ENTITY_TYPE: str = "dataPlatform"
|
||||
|
||||
def __init__(self, entity_type: str, entity_id: List[str], domain: str = "li"):
|
||||
super().__init__(entity_type, entity_id, domain)
|
||||
|
||||
@classmethod
|
||||
def create_from_string(cls, urn_str: str) -> "DataPlatformUrn":
|
||||
urn: Urn = super().create_from_string(urn_str)
|
||||
return cls(urn.get_type(), urn.get_entity_id(), urn.get_domain())
|
||||
|
||||
@classmethod
|
||||
def create_from_id(cls, platform_id: str) -> "DataPlatformUrn":
|
||||
return cls(DataPlatformUrn.ENTITY_TYPE, [platform_id])
|
||||
|
||||
@staticmethod
|
||||
def _validate_entity_type(entity_type: str) -> None:
|
||||
if entity_type != DataPlatformUrn.ENTITY_TYPE:
|
||||
raise InvalidUrnError(
|
||||
f"Entity type should be {DataPlatformUrn.ENTITY_TYPE} but found {entity_type}"
|
||||
)
|
89
metadata-ingestion/src/datahub/utilities/urns/dataset_urn.py
Normal file
89
metadata-ingestion/src/datahub/utilities/urns/dataset_urn.py
Normal file
@ -0,0 +1,89 @@
|
||||
from typing import List, Set
|
||||
|
||||
from datahub.metadata.schema_classes import FabricTypeClass
|
||||
from datahub.utilities.urns.data_platform_urn import DataPlatformUrn
|
||||
from datahub.utilities.urns.error import InvalidUrnError
|
||||
from datahub.utilities.urns.urn import Urn
|
||||
|
||||
|
||||
class DatasetUrn(Urn):
|
||||
"""
|
||||
expected dataset urn format: urn:li:dataset:(<platform_urn_str>,<table_name>,env). example:
|
||||
urn:li:dataset:(urn:li:dataPlatform:hive,member,prod)
|
||||
"""
|
||||
|
||||
ENTITY_TYPE: str = "dataset"
|
||||
VALID_FABRIC_SET: Set[str] = set(
|
||||
[
|
||||
str(getattr(FabricTypeClass, attr)).upper()
|
||||
for attr in dir(FabricTypeClass)
|
||||
if not callable(getattr(FabricTypeClass, attr)) and not attr.startswith("_")
|
||||
]
|
||||
)
|
||||
|
||||
def __init__(self, entity_type: str, entity_id: List[str], domain: str = "li"):
|
||||
super().__init__(entity_type, entity_id, domain)
|
||||
|
||||
@classmethod
|
||||
def create_from_string(cls, urn_str: str) -> "DatasetUrn":
|
||||
"""
|
||||
Create a DatasetUrn from the its string representation
|
||||
:param urn_str: the string representation of the DatasetUrn
|
||||
:return: DatasetUrn of the given string representation
|
||||
:raises InvalidUrnError is the string representation is in invalid format
|
||||
"""
|
||||
urn: Urn = super().create_from_string(urn_str)
|
||||
return cls(urn.get_type(), urn.get_entity_id(), urn.get_domain())
|
||||
|
||||
def get_data_platform_urn(self) -> DataPlatformUrn:
|
||||
"""
|
||||
:return: the DataPlatformUrn of where the Dataset is created
|
||||
"""
|
||||
return DataPlatformUrn.create_from_string(self.get_entity_id()[0])
|
||||
|
||||
def get_dataset_name(self) -> str:
|
||||
"""
|
||||
:return: the dataset name from this DatasetUrn
|
||||
"""
|
||||
return self.get_entity_id()[1]
|
||||
|
||||
def get_env(self) -> str:
|
||||
"""
|
||||
:return: the environment where the Dataset is created
|
||||
"""
|
||||
return self.get_entity_id()[2]
|
||||
|
||||
@classmethod
|
||||
def create_from_ids(
|
||||
cls, platform_id: str, table_name: str, env: str
|
||||
) -> "DatasetUrn":
|
||||
entity_id: List[str] = [
|
||||
str(DataPlatformUrn.create_from_id(platform_id)),
|
||||
table_name,
|
||||
env,
|
||||
]
|
||||
return cls(DatasetUrn.ENTITY_TYPE, entity_id)
|
||||
|
||||
@staticmethod
|
||||
def _validate_entity_type(entity_type: str) -> None:
|
||||
if entity_type != DatasetUrn.ENTITY_TYPE:
|
||||
raise InvalidUrnError(
|
||||
f"Entity type should be {DatasetUrn.ENTITY_TYPE} but found {entity_type}"
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def _validate_entity_id(entity_id: List[str]) -> None:
|
||||
# expected entity id format (<platform_urn>,<table_name>,<env>)
|
||||
if len(entity_id) != 3:
|
||||
raise InvalidUrnError(
|
||||
f"Expect 3 parts in the entity id but found {entity_id}"
|
||||
)
|
||||
|
||||
platform_urn_str = entity_id[0]
|
||||
|
||||
DataPlatformUrn.validate(platform_urn_str)
|
||||
env = entity_id[2].upper()
|
||||
if env not in DatasetUrn.VALID_FABRIC_SET:
|
||||
raise InvalidUrnError(
|
||||
f"Invalid env:{env}. Allowed evn are {DatasetUrn.VALID_FABRIC_SET}"
|
||||
)
|
3
metadata-ingestion/src/datahub/utilities/urns/error.py
Normal file
3
metadata-ingestion/src/datahub/utilities/urns/error.py
Normal file
@ -0,0 +1,3 @@
|
||||
class InvalidUrnError(Exception):
|
||||
def __init__(self, msg: str):
|
||||
super().__init__(msg)
|
129
metadata-ingestion/src/datahub/utilities/urns/urn.py
Normal file
129
metadata-ingestion/src/datahub/utilities/urns/urn.py
Normal file
@ -0,0 +1,129 @@
|
||||
from typing import List
|
||||
|
||||
from datahub.utilities.urns.error import InvalidUrnError
|
||||
|
||||
|
||||
class Urn:
|
||||
"""
|
||||
URNs are Globally Unique Identifiers (GUID) used to represent an entity.
|
||||
It will be in format of urn:<domain>:<type>:<id>
|
||||
"""
|
||||
|
||||
URN_PREFIX: str = "urn"
|
||||
# all the Datahub urn use li domain for now.
|
||||
LI_DOMAIN: str = "li"
|
||||
|
||||
_entity_type: str
|
||||
_domain: str
|
||||
_entity_id: List[str]
|
||||
|
||||
def __init__(
|
||||
self, entity_type: str, entity_id: List[str], urn_domain: str = LI_DOMAIN
|
||||
):
|
||||
if len(entity_id) == 0:
|
||||
raise InvalidUrnError("Empty entity id.")
|
||||
self._validate_entity_type(entity_type)
|
||||
self._validate_entity_id(entity_id)
|
||||
self._entity_type = entity_type
|
||||
self._domain = urn_domain
|
||||
self._entity_id = entity_id
|
||||
|
||||
@classmethod
|
||||
def create_from_string(cls, urn_str: str) -> "Urn":
|
||||
"""
|
||||
Create a Urn from the its string representation
|
||||
:param urn_str: the string representation of the Urn
|
||||
:return: Urn of the given string representation
|
||||
:raises InvalidUrnError if the string representation is in invalid format
|
||||
"""
|
||||
|
||||
# expect urn string in format of urn:<domain>:<type>:<id>
|
||||
cls.validate(urn_str)
|
||||
parts: List[str] = urn_str.split(":", 3)
|
||||
|
||||
return cls(parts[2], cls._get_entity_id_from_str(parts[3]), parts[1])
|
||||
|
||||
@classmethod
|
||||
def validate(cls, urn_str: str) -> None:
|
||||
"""
|
||||
Validate if a string is in valid Urn format
|
||||
:param urn_str: to be validated urn string
|
||||
:raises InvalidUrnError if the string representation is in invalid format
|
||||
"""
|
||||
parts: List[str] = urn_str.split(":", 3)
|
||||
if len(parts) != 4:
|
||||
raise InvalidUrnError(
|
||||
f"Invalid urn string: {urn_str}. Expect 4 parts from urn string but found {len(parts)}"
|
||||
)
|
||||
|
||||
if "" in parts:
|
||||
raise InvalidUrnError(
|
||||
f"Invalid urn string: {urn_str}. There should not be empty parts in urn string."
|
||||
)
|
||||
|
||||
if parts[0] != Urn.URN_PREFIX:
|
||||
raise InvalidUrnError(
|
||||
f'Invalid urn string: {urn_str}. Expect urn starting with "urn" but found {parts[0]}'
|
||||
)
|
||||
|
||||
if "" in cls._get_entity_id_from_str(parts[3]):
|
||||
raise InvalidUrnError(
|
||||
f"Invalid entity id in urn string: {urn_str}. There should not be empty parts in entity id."
|
||||
)
|
||||
|
||||
cls._validate_entity_type(parts[2])
|
||||
cls._validate_entity_id(cls._get_entity_id_from_str(parts[3]))
|
||||
|
||||
def get_type(self) -> str:
|
||||
return self._entity_type
|
||||
|
||||
def get_entity_id(self) -> List[str]:
|
||||
return self._entity_id
|
||||
|
||||
def get_entity_id_as_string(self) -> str:
|
||||
"""
|
||||
:return: string representation of the entity ids. If there are more than one part in the entity id part, it will
|
||||
return in this format (<part1>,<part2>,...)
|
||||
"""
|
||||
return self._entity_id_to_string()
|
||||
|
||||
def get_domain(self) -> str:
|
||||
return self._domain
|
||||
|
||||
@staticmethod
|
||||
def _get_entity_id_from_str(entity_id: str) -> List[str]:
|
||||
if not (entity_id.startswith("(") and entity_id.endswith(")")):
|
||||
return [entity_id]
|
||||
return [sub_id.strip() for sub_id in entity_id[1:-1].split(",")]
|
||||
|
||||
@staticmethod
|
||||
def _validate_entity_type(entity_type: str) -> None:
|
||||
pass
|
||||
|
||||
@staticmethod
|
||||
def _validate_entity_id(entity_id: List[str]) -> None:
|
||||
pass
|
||||
|
||||
def __str__(self) -> str:
|
||||
return f"{self.URN_PREFIX}:{self._domain}:{self._entity_type}:{self._entity_id_to_string()}"
|
||||
|
||||
def _entity_id_to_string(self) -> str:
|
||||
if len(self._entity_id) == 1:
|
||||
return self._entity_id[0]
|
||||
result = ""
|
||||
for part in self._entity_id:
|
||||
result = result + part + ","
|
||||
return f"({result[:-1]})"
|
||||
|
||||
def __hash__(self) -> int:
|
||||
return hash((self._domain, self._entity_type) + tuple(self._entity_id))
|
||||
|
||||
def __eq__(self, other: object) -> bool:
|
||||
if not isinstance(other, Urn):
|
||||
return False
|
||||
|
||||
return (
|
||||
self._entity_id == other._entity_id
|
||||
and self._domain == other._domain
|
||||
and self._entity_type == other._entity_type
|
||||
)
|
42
metadata-ingestion/tests/unit/test_dataset_urn.py
Normal file
42
metadata-ingestion/tests/unit/test_dataset_urn.py
Normal file
@ -0,0 +1,42 @@
|
||||
import unittest
|
||||
|
||||
from datahub.utilities.urns.data_platform_urn import DataPlatformUrn
|
||||
from datahub.utilities.urns.dataset_urn import DatasetUrn
|
||||
from datahub.utilities.urns.error import InvalidUrnError
|
||||
|
||||
|
||||
class TestDatasetUrn(unittest.TestCase):
|
||||
def test_parse_urn(self) -> None:
|
||||
dataset_urn_str = "urn:li:dataset:(urn:li:dataPlatform:abc,def,prod)"
|
||||
dataset_urn = DatasetUrn.create_from_string(dataset_urn_str)
|
||||
assert (
|
||||
dataset_urn.get_data_platform_urn()
|
||||
== DataPlatformUrn.create_from_string("urn:li:dataPlatform:abc")
|
||||
)
|
||||
assert dataset_urn.get_dataset_name() == "def"
|
||||
assert dataset_urn.get_env() == "prod"
|
||||
assert (
|
||||
dataset_urn.__str__() == "urn:li:dataset:(urn:li:dataPlatform:abc,def,prod)"
|
||||
)
|
||||
assert dataset_urn == DatasetUrn(
|
||||
"dataset", ["urn:li:dataPlatform:abc", "def", "prod"]
|
||||
)
|
||||
|
||||
def test_invalid_urn(self) -> None:
|
||||
with self.assertRaises(InvalidUrnError):
|
||||
DatasetUrn.create_from_string(
|
||||
"urn:li:abc:(urn:li:dataPlatform:abc,def,prod)"
|
||||
)
|
||||
|
||||
with self.assertRaises(InvalidUrnError):
|
||||
DatasetUrn.create_from_string(
|
||||
"urn:li:dataset:(urn:li:user:abc,dataset,prod)"
|
||||
)
|
||||
|
||||
with self.assertRaises(InvalidUrnError):
|
||||
DatasetUrn.create_from_string("urn:li:dataset:(urn:li:user:abc,dataset)")
|
||||
|
||||
with self.assertRaises(InvalidUrnError):
|
||||
DatasetUrn.create_from_string(
|
||||
"urn:li:dataset:(urn:li:user:abc,dataset,invalidEnv)"
|
||||
)
|
36
metadata-ingestion/tests/unit/test_urn.py
Normal file
36
metadata-ingestion/tests/unit/test_urn.py
Normal file
@ -0,0 +1,36 @@
|
||||
import unittest
|
||||
|
||||
from datahub.utilities.urns.error import InvalidUrnError
|
||||
from datahub.utilities.urns.urn import Urn
|
||||
|
||||
|
||||
class TestUrn(unittest.TestCase):
|
||||
def test_parse_urn(self) -> None:
|
||||
simple_urn_str = "urn:li:dataPlatform:abc"
|
||||
urn = Urn.create_from_string(simple_urn_str)
|
||||
assert urn.get_entity_id_as_string() == "abc"
|
||||
assert urn.get_entity_id() == ["abc"]
|
||||
assert urn.get_type() == "dataPlatform"
|
||||
assert urn.get_domain() == "li"
|
||||
assert urn.__str__() == simple_urn_str
|
||||
assert urn == Urn("dataPlatform", ["abc"])
|
||||
|
||||
complex_urn_str = "urn:li:dataset:(urn:li:dataPlatform:abc, def, prod)"
|
||||
urn = Urn.create_from_string(complex_urn_str)
|
||||
assert urn.get_entity_id_as_string() == "(urn:li:dataPlatform:abc,def,prod)"
|
||||
assert urn.get_entity_id() == ["urn:li:dataPlatform:abc", "def", "prod"]
|
||||
assert urn.get_type() == "dataset"
|
||||
assert urn.__str__() == "urn:li:dataset:(urn:li:dataPlatform:abc,def,prod)"
|
||||
|
||||
def test_invalid_urn(self) -> None:
|
||||
with self.assertRaises(InvalidUrnError):
|
||||
Urn.create_from_string("urn:li:abc")
|
||||
|
||||
with self.assertRaises(InvalidUrnError):
|
||||
Urn.create_from_string("urn:li:abc:")
|
||||
|
||||
with self.assertRaises(InvalidUrnError):
|
||||
Urn.create_from_string("urn:li:abc:()")
|
||||
|
||||
with self.assertRaises(InvalidUrnError):
|
||||
Urn.create_from_string("urn:li:abc:(abc,)")
|
Loading…
x
Reference in New Issue
Block a user