From fbd4c1e012034dadd279ef0a8fa87d99fbc14c7c Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Fri, 21 Mar 2025 11:39:45 -0700 Subject: [PATCH] docs(sdk): add docstrings for some sdk classes (#12940) --- metadata-ingestion/src/datahub/sdk/dataset.py | 122 ++++++++++++++++++ metadata-ingestion/src/datahub/sdk/entity.py | 98 +++++++++++++- .../src/datahub/sdk/entity_client.py | 24 ++++ .../src/datahub/sdk/main_client.py | 22 ++++ .../src/datahub/sdk/search_filters.py | 8 +- 5 files changed, 269 insertions(+), 5 deletions(-) diff --git a/metadata-ingestion/src/datahub/sdk/dataset.py b/metadata-ingestion/src/datahub/sdk/dataset.py index ad37d5d78d..1a4762961e 100644 --- a/metadata-ingestion/src/datahub/sdk/dataset.py +++ b/metadata-ingestion/src/datahub/sdk/dataset.py @@ -430,10 +430,22 @@ class Dataset( HasDomain, Entity, ): + """Represents a dataset in DataHub. + + A dataset represents a collection of data, such as a table, view, or file. + This class provides methods for managing dataset metadata including schema, + lineage, and various aspects like ownership, tags, and terms. + """ + __slots__ = () @classmethod def get_urn_type(cls) -> Type[DatasetUrn]: + """Get the URN type for datasets. + + Returns: + The DatasetUrn class. + """ return DatasetUrn def __init__( @@ -466,6 +478,31 @@ class Dataset( schema: Optional[SchemaFieldsInputType] = None, upstreams: Optional[models.UpstreamLineageClass] = None, ): + """Initialize a new Dataset instance. + + Args: + platform: The platform this dataset belongs to (e.g. "mysql", "snowflake"). + name: The name of the dataset. + platform_instance: Optional platform instance identifier. + env: The environment this dataset belongs to (default: DEFAULT_ENV). + description: Optional description of the dataset. + display_name: Optional display name for the dataset. + qualified_name: Optional qualified name for the dataset. + external_url: Optional URL to external documentation or source. 
+ custom_properties: Optional dictionary of custom properties. + created: Optional creation timestamp. + last_modified: Optional last modification timestamp. + parent_container: Optional parent container for this dataset. + subtype: Optional subtype of the dataset. + owners: Optional list of owners. + links: Optional list of links. + tags: Optional list of tags. + terms: Optional list of glossary terms. + domain: Optional domain this dataset belongs to. + extra_aspects: Optional list of additional aspects. + schema: Optional schema definition for the dataset. + upstreams: Optional upstream lineage information. + """ urn = DatasetUrn.create_from_ids( platform_id=platform, table_name=name, @@ -539,6 +576,11 @@ class Dataset( @property def description(self) -> Optional[str]: + """Get the description of the dataset. + + Returns: + The description if set, None otherwise. + """ editable_props = self._get_editable_props() return first_non_null( [ @@ -548,6 +590,15 @@ class Dataset( ) def set_description(self, description: str) -> None: + """Set the description of the dataset. + + Args: + description: The description to set. + + Note: + If called during ingestion, this will warn if overwriting + a non-ingestion description. + """ if is_ingestion_attribution(): editable_props = self._get_editable_props() if editable_props is not None and editable_props.description is not None: @@ -565,41 +616,96 @@ class Dataset( @property def display_name(self) -> Optional[str]: + """Get the display name of the dataset. + + Returns: + The display name if set, None otherwise. + """ return self._ensure_dataset_props().name def set_display_name(self, display_name: str) -> None: + """Set the display name of the dataset. + + Args: + display_name: The display name to set. + """ self._ensure_dataset_props().name = display_name @property def qualified_name(self) -> Optional[str]: + """Get the qualified name of the dataset. + + Returns: + The qualified name if set, None otherwise. 
+ """ return self._ensure_dataset_props().qualifiedName def set_qualified_name(self, qualified_name: str) -> None: + """Set the qualified name of the dataset. + + Args: + qualified_name: The qualified name to set. + """ self._ensure_dataset_props().qualifiedName = qualified_name @property def external_url(self) -> Optional[str]: + """Get the external URL of the dataset. + + Returns: + The external URL if set, None otherwise. + """ return self._ensure_dataset_props().externalUrl def set_external_url(self, external_url: str) -> None: + """Set the external URL of the dataset. + + Args: + external_url: The external URL to set. + """ self._ensure_dataset_props().externalUrl = external_url @property def custom_properties(self) -> Dict[str, str]: + """Get the custom properties of the dataset. + + Returns: + Dictionary of custom properties. + """ return self._ensure_dataset_props().customProperties def set_custom_properties(self, custom_properties: Dict[str, str]) -> None: + """Set the custom properties of the dataset. + + Args: + custom_properties: Dictionary of custom properties to set. + """ self._ensure_dataset_props().customProperties = custom_properties @property def created(self) -> Optional[datetime]: + """Get the creation timestamp of the dataset. + + Returns: + The creation timestamp if set, None otherwise. + """ return parse_time_stamp(self._ensure_dataset_props().created) def set_created(self, created: datetime) -> None: + """Set the creation timestamp of the dataset. + + Args: + created: The creation timestamp to set. + """ self._ensure_dataset_props().created = make_time_stamp(created) @property def last_modified(self) -> Optional[datetime]: + """Get the last modification timestamp of the dataset. + + Returns: + The last modification timestamp if set, None otherwise. 
+ """ return parse_time_stamp(self._ensure_dataset_props().lastModified) def set_last_modified(self, last_modified: datetime) -> None: @@ -614,6 +720,11 @@ class Dataset( @property def schema(self) -> List[SchemaField]: # TODO: Add some caching here to avoid iterating over the schema every time. + """Get the schema fields of the dataset. + + Returns: + List of SchemaField objects representing the dataset's schema. + """ schema_dict = self._schema_dict() return [SchemaField(self, field_path) for field_path in schema_dict] @@ -669,6 +780,17 @@ class Dataset( def __getitem__(self, field_path: str) -> SchemaField: # TODO: Automatically deal with field path v2? + """Get a schema field by its path. + + Args: + field_path: The path of the field to retrieve. + + Returns: + A SchemaField instance. + + Raises: + SchemaFieldKeyError: If the field is not found. + """ schema_dict = self._schema_dict() if field_path not in schema_dict: raise SchemaFieldKeyError(f"Field {field_path} not found in schema") diff --git a/metadata-ingestion/src/datahub/sdk/entity.py b/metadata-ingestion/src/datahub/sdk/entity.py index f50c86f4b2..f20ec376e1 100644 --- a/metadata-ingestion/src/datahub/sdk/entity.py +++ b/metadata-ingestion/src/datahub/sdk/entity.py @@ -20,9 +20,24 @@ ExtraAspectsType = Union[None, List[AspectTypeVar]] class Entity: + """Base class for all DataHub entities. + + This class provides the core functionality for working with DataHub entities, + including aspect management and URN handling. It should not be instantiated directly; + instead, use one of its subclasses like Dataset or Container. + """ + __slots__ = ("_urn", "_prev_aspects", "_aspects") def __init__(self, /, urn: Urn): + """Initialize a new Entity instance. + + Args: + urn: The URN that uniquely identifies this entity. + + Raises: + SdkUsageError: If this base class is instantiated directly. + """ # This method is not meant for direct usage. 
if type(self) is Entity: raise SdkUsageError(f"{Entity.__name__} cannot be instantiated directly.") @@ -36,6 +51,15 @@ class Entity: @classmethod def _new_from_graph(cls, urn: Urn, current_aspects: models.AspectBag) -> Self: + """Create a new entity instance from graph data. + + Args: + urn: The URN of the entity. + current_aspects: The current aspects of the entity from the graph. + + Returns: + A new entity instance initialized with the graph data. + """ # If an init method from a subclass adds required fields, it also needs to override this method. # An alternative approach would call cls.__new__() to bypass the init method, but it's a bit # too hacky for my taste. @@ -43,6 +67,14 @@ class Entity: return entity._init_from_graph(current_aspects) def _init_from_graph(self, current_aspects: models.AspectBag) -> Self: + """Initialize the entity with aspects from the graph. + + Args: + current_aspects: The current aspects of the entity from the graph. + + Returns: + The entity instance with initialized aspects. + """ self._prev_aspects = current_aspects self._aspects = {} @@ -54,14 +86,30 @@ class Entity: @classmethod @abc.abstractmethod - def get_urn_type(cls) -> Type[_SpecificUrn]: ... + def get_urn_type(cls) -> Type[_SpecificUrn]: + """Get the URN type for this entity class. + + Returns: + The URN type class that corresponds to this entity type. + """ + ... @classmethod def entity_type_name(cls) -> str: + """Get the entity type name. + + Returns: + The string name of this entity type. + """ return cls.get_urn_type().ENTITY_TYPE @property def urn(self) -> _SpecificUrn: + """Get the entity's URN. + + Returns: + The URN that uniquely identifies this entity. + """ return self._urn def _get_aspect( @@ -69,12 +117,33 @@ class Entity: aspect_type: Type[AspectTypeVar], /, ) -> Optional[AspectTypeVar]: + """Get an aspect of the entity by its type. + + Args: + aspect_type: The type of aspect to retrieve. + + Returns: + The aspect if it exists, None otherwise. 
+ """ return self._aspects.get(aspect_type.ASPECT_NAME) # type: ignore def _set_aspect(self, value: AspectTypeVar, /) -> None: + """Set an aspect of the entity. + + Args: + value: The aspect to set. + """ self._aspects[value.ASPECT_NAME] = value # type: ignore def _setdefault_aspect(self, default_aspect: AspectTypeVar, /) -> AspectTypeVar: + """Set a default aspect if it doesn't exist. + + Args: + default_aspect: The default aspect to set if none exists. + + Returns: + The existing aspect if one exists, otherwise the default aspect. + """ # Similar semantics to dict.setdefault. if existing_aspect := self._get_aspect(type(default_aspect)): return existing_aspect @@ -85,6 +154,14 @@ class Entity: self, change_type: Union[str, models.ChangeTypeClass] = models.ChangeTypeClass.UPSERT, ) -> List[MetadataChangeProposalWrapper]: + """Convert the entity's aspects to MetadataChangeProposals. + + Args: + change_type: The type of change to apply (default: UPSERT). + + Returns: + A list of MetadataChangeProposalWrapper objects. + """ urn_str = str(self.urn) mcps = [] @@ -100,13 +177,32 @@ class Entity: return mcps def as_workunits(self) -> List[MetadataWorkUnit]: + """Convert the entity's aspects to MetadataWorkUnits. + + Returns: + A list of MetadataWorkUnit objects. + """ return [mcp.as_workunit() for mcp in self._as_mcps()] def _set_extra_aspects(self, extra_aspects: ExtraAspectsType) -> None: + """Set additional aspects on the entity. + + Args: + extra_aspects: List of additional aspects to set. + + Note: + This method does not validate for conflicts between extra aspects + and standard aspects. + """ # TODO: Add validation to ensure that an "extra aspect" does not conflict # with / get overridden by a standard aspect. for aspect in extra_aspects or []: self._set_aspect(aspect) def __repr__(self) -> str: + """Get a string representation of the entity. + + Returns: + A string in the format "EntityClass('urn')". 
+ """ return f"{self.__class__.__name__}('{self.urn}')" diff --git a/metadata-ingestion/src/datahub/sdk/entity_client.py b/metadata-ingestion/src/datahub/sdk/entity_client.py index bcb9f2798f..8b26badb9b 100644 --- a/metadata-ingestion/src/datahub/sdk/entity_client.py +++ b/metadata-ingestion/src/datahub/sdk/entity_client.py @@ -24,7 +24,18 @@ if TYPE_CHECKING: class EntityClient: + """Client for managing DataHub entities. + + This class provides methods for retrieving and managing DataHub entities + such as datasets, containers, and other metadata objects. + """ + def __init__(self, client: DataHubClient): + """Private constructor - use :py:attr:`DataHubClient.entities` instead. + + Args: + client: The parent DataHubClient instance. + """ self._client = client # TODO: Make all of these methods sync by default. @@ -40,6 +51,19 @@ class EntityClient: @overload def get(self, urn: Union[Urn, str]) -> Entity: ... def get(self, urn: UrnOrStr) -> Entity: + """Retrieve an entity by its urn. + + Args: + urn: The urn of the entity to retrieve. Can be a string or :py:class:`Urn` object. + + Returns: + The retrieved entity instance. + + Raises: + ItemNotFoundError: If the entity does not exist. + SdkUsageError: If the entity type is not yet supported. + InvalidUrnError: If the URN is invalid. + """ if not isinstance(urn, Urn): urn = Urn.from_string(urn) diff --git a/metadata-ingestion/src/datahub/sdk/main_client.py b/metadata-ingestion/src/datahub/sdk/main_client.py index ef58007a46..8884231df8 100644 --- a/metadata-ingestion/src/datahub/sdk/main_client.py +++ b/metadata-ingestion/src/datahub/sdk/main_client.py @@ -11,6 +11,17 @@ from datahub.sdk.search_client import SearchClient class DataHubClient: + """Main client for interacting with DataHub. + + This class provides the primary interface for interacting with DataHub, + including entity management, search, and resolution capabilities. + + The client can be initialized in three ways: + 1. 
With a server URL and optional token + 2. With a DatahubClientConfig object + 3. With an existing (legacy) :py:class:`DataHubGraph` instance + """ + @overload def __init__(self, *, server: str, token: Optional[str] = None): ... @overload @@ -25,6 +36,17 @@ class DataHubClient: graph: Optional[DataHubGraph] = None, config: Optional[DatahubClientConfig] = None, ): + """Initialize a new DataHubClient instance. + + Args: + server: The URL of the DataHub server (e.g. "http://localhost:8080"). + token: Optional authentication token. + graph: An existing DataHubGraph instance to use. + config: A DatahubClientConfig object with connection details. + + Raises: + SdkUsageError: If invalid combinations of arguments are provided. + """ if server is not None: if config is not None: raise SdkUsageError("Cannot specify both server and config") diff --git a/metadata-ingestion/src/datahub/sdk/search_filters.py b/metadata-ingestion/src/datahub/sdk/search_filters.py index 5c5116b181..0556702377 100644 --- a/metadata-ingestion/src/datahub/sdk/search_filters.py +++ b/metadata-ingestion/src/datahub/sdk/search_filters.py @@ -157,7 +157,7 @@ class _EnvFilter(_BaseFilter): class _CustomCondition(_BaseFilter): - """Represents a single field condition""" + """Represents a single field condition.""" field: str condition: str @@ -173,7 +173,7 @@ class _CustomCondition(_BaseFilter): class _And(_BaseFilter): - """Represents an AND conjunction of filters""" + """Represents an AND conjunction of filters.""" and_: Sequence["Filter"] = pydantic.Field(alias="and") # TODO: Add validator to ensure that the "and" field is not empty @@ -221,7 +221,7 @@ class _And(_BaseFilter): class _Or(_BaseFilter): - """Represents an OR conjunction of filters""" + """Represents an OR disjunction of filters.""" or_: Sequence["Filter"] = pydantic.Field(alias="or") # TODO: Add validator to ensure that the "or" field is not empty @@ -234,7 +234,7 @@ class _Or(_BaseFilter): class _Not(_BaseFilter): - """Represents a NOT 
filter""" + """Represents a NOT filter.""" not_: "Filter" = pydantic.Field(alias="not")