mirror of
https://github.com/datahub-project/datahub.git
synced 2025-11-01 19:25:56 +00:00
Refining docs (#4001)
This commit is contained in:
parent
5330a68a47
commit
3c2a4fe39d
@ -45,18 +45,18 @@ DataHub's "core" Entity types model the Data Assets that comprise the Modern Dat
|
||||
|
||||
## The Entity Registry
|
||||
|
||||
Where are Entities and their aspects defined in DataHub? Where doe the Metadata Model "live"? The Metadata Model is "stitched together" by means
|
||||
Where are Entities and their aspects defined in DataHub? Where does the Metadata Model "live"? The Metadata Model is stitched together by means
|
||||
of an **Entity Registry**, a catalog of Entities that comprise the Metadata Graph along with the aspects associated with each. Put
|
||||
simply, this is where the "schema" of the model is represented.
|
||||
simply, this is where the "schema" of the model is defined.
|
||||
|
||||
Traditionally, the Entity Registry was constructed using [Snapshot](https://github.com/linkedin/datahub/tree/master/metadata-models/src/main/pegasus/com/linkedin/metadata/snapshot) models, which Data Models (schemas) that explicitly tied
|
||||
an Entity to the Aspects associated with it. An example is [DatasetSnapshot](https://github.com/linkedin/datahub/blob/master/metadata-models/src/main/pegasus/com/linkedin/metadata/snapshot/DatasetSnapshot.pdl), which declares the core `Dataset` Entity.
|
||||
The relationship particular aspects of a Dataset was captured via a "union" where the list of possible aspects was modeled. An example is
|
||||
Traditionally, the Entity Registry was constructed using [Snapshot](https://github.com/linkedin/datahub/tree/master/metadata-models/src/main/pegasus/com/linkedin/metadata/snapshot) models, which are schemas that explicitly tie
|
||||
an Entity to the Aspects associated with it. An example is [DatasetSnapshot](https://github.com/linkedin/datahub/blob/master/metadata-models/src/main/pegasus/com/linkedin/metadata/snapshot/DatasetSnapshot.pdl), which defines the core `Dataset` Entity.
|
||||
The Aspects of the Dataset entity are captured via a union field inside a special "Aspect" schema. An example is
|
||||
[DatasetAspect](https://github.com/linkedin/datahub/blob/master/metadata-models/src/main/pegasus/com/linkedin/metadata/aspect/DatasetAspect.pdl).
|
||||
This file associates dataset-specific aspects, such as [DatasetProperties](https://github.com/linkedin/datahub/blob/master/metadata-models/src/main/pegasus/com/linkedin/dataset/DatasetProperties.pdl), as well as common aspects like [Ownership](https://github.com/linkedin/datahub/blob/master/metadata-models/src/main/pegasus/com/linkedin/common/Ownership.pdl),
|
||||
This file associates dataset-specific aspects (like [DatasetProperties](https://github.com/linkedin/datahub/blob/master/metadata-models/src/main/pegasus/com/linkedin/dataset/DatasetProperties.pdl)) and common aspects (like [Ownership](https://github.com/linkedin/datahub/blob/master/metadata-models/src/main/pegasus/com/linkedin/common/Ownership.pdl),
|
||||
[InstitutionalMemory](https://github.com/linkedin/datahub/blob/master/metadata-models/src/main/pegasus/com/linkedin/common/InstitutionalMemory.pdl),
|
||||
and [Status](https://github.com/linkedin/datahub/blob/master/metadata-models/src/main/pegasus/com/linkedin/common/Status.pdl),
|
||||
to the Dataset Entity. Similar Snapshots
|
||||
and [Status](https://github.com/linkedin/datahub/blob/master/metadata-models/src/main/pegasus/com/linkedin/common/Status.pdl))
|
||||
to the Dataset Entity. This approach to defining Entities is being phased out in favor of a new approach.
|
||||
|
||||
As of January 2022, DataHub has deprecated support for Snapshot models as a means of adding new entities. Instead,
|
||||
the Entity Registry is defined inside a YAML configuration file called [entity-registry.yml](https://github.com/linkedin/datahub/blob/master/metadata-models/src/main/resources/entity-registry.yml),
|
||||
@ -67,7 +67,6 @@ each aspect name provided by configuration (via the [@Aspect](https://github.com
|
||||
By moving to this format, evolving the Metadata Model becomes much easier. Adding Entities & Aspects becomes a matter of adding an entry
|
||||
to the YAML configuration, instead of creating new Snapshot / Aspect files.
|
||||
|
||||
> New to [PDL](https://linkedin.github.io/rest.li/pdl_schema) files? Don't fret. They are just a way to define a JSON document "schema" for Aspects in DataHub. All Data ingested to DataHub's Metadata Service is validated against a PDL schema, with each @Aspect corresponding to a single schema. Structurally, PDL is quite similar to [Protobuf](https://developers.google.com/protocol-buffers) and conveniently maps to JSON.
|
||||
|
||||
## Exploring DataHub's Metadata Model
|
||||
|
||||
@ -104,6 +103,8 @@ DataHub’s modeling language allows you to optimize metadata persistence to ali
|
||||
|
||||
There are three supported ways to query the metadata graph: by primary key lookup, by search query, and by relationship traversal.
|
||||
|
||||
> New to [PDL](https://linkedin.github.io/rest.li/pdl_schema) files? Don't fret. They are just a way to define a JSON document "schema" for Aspects in DataHub. All Data ingested to DataHub's Metadata Service is validated against a PDL schema, with each @Aspect corresponding to a single schema. Structurally, PDL is quite similar to [Protobuf](https://developers.google.com/protocol-buffers) and conveniently maps to JSON.
|
||||
|
||||
### Querying an Entity
|
||||
|
||||
#### Fetching Latest Entity Aspects (Snapshot)
|
||||
|
||||
@ -194,7 +194,7 @@ class AzureADSource(Source):
|
||||
datahub_corp_user_snapshots, datahub_corp_user_urn_to_group_membership
|
||||
)
|
||||
|
||||
# Create MetadatWorkUnits for CorpUsers
|
||||
# Create MetadataWorkUnits for CorpUsers
|
||||
if self.config.ingest_users:
|
||||
# 3) the users
|
||||
for azure_ad_users in self._get_azure_ad_users():
|
||||
|
||||
@ -5,6 +5,18 @@ from tests.utils import GMS_ENDPOINT
|
||||
from tests.utils import ingest_file_via_rest
|
||||
from tests.utils import delete_urns_from_file
|
||||
|
||||
from typing import List
|
||||
|
||||
import datahub.emitter.mce_builder as builder
|
||||
from datahub.emitter.mcp import MetadataChangeProposalWrapper
|
||||
from datahub.emitter.rest_emitter import DatahubRestEmitter
|
||||
from datahub.metadata.com.linkedin.pegasus2avro.dataset import (
|
||||
DatasetLineageTypeClass,
|
||||
UpstreamClass,
|
||||
UpstreamLineage,
|
||||
)
|
||||
from datahub.metadata.schema_classes import ChangeTypeClass
|
||||
|
||||
@pytest.fixture(scope="module", autouse=False)
|
||||
def ingest_cleanup_data(request):
|
||||
print("ingesting domains test data")
|
||||
@ -21,6 +33,39 @@ def test_healthchecks(wait_for_healthchecks):
|
||||
@pytest.mark.dependency(depends=["test_healthchecks"])
|
||||
def test_create_list_get_domain(frontend_session):
|
||||
|
||||
# Construct upstream tables.
|
||||
upstream_tables: List[UpstreamClass] = []
|
||||
upstream_table_1 = UpstreamClass(
|
||||
dataset=builder.make_dataset_urn("bigquery", "upstream_table_1", "PROD"),
|
||||
type=DatasetLineageTypeClass.TRANSFORMED,
|
||||
)
|
||||
upstream_tables.append(upstream_table_1)
|
||||
upstream_table_2 = UpstreamClass(
|
||||
dataset=builder.make_dataset_urn("bigquery", "upstream_table_2", "PROD"),
|
||||
type=DatasetLineageTypeClass.TRANSFORMED,
|
||||
)
|
||||
upstream_tables.append(upstream_table_2)
|
||||
|
||||
# Construct a lineage object.
|
||||
upstream_lineage = UpstreamLineage(upstreams=upstream_tables)
|
||||
|
||||
# Construct a MetadataChangeProposalWrapper object.
|
||||
lineage_mcp = MetadataChangeProposalWrapper(
|
||||
entityType="dataset",
|
||||
changeType=ChangeTypeClass.UPSERT,
|
||||
entityUrn=builder.make_dataset_urn("bigquery", "downstream"),
|
||||
aspectName="upstreamLineage",
|
||||
aspect=upstream_lineage,
|
||||
systemMetadata=
|
||||
)
|
||||
|
||||
# Create an emitter to the GMS REST API.
|
||||
emitter = DatahubRestEmitter("http://localhost:8080")
|
||||
|
||||
# Emit metadata!
|
||||
emitter.emit_mcp(lineage_mcp)
|
||||
|
||||
|
||||
# Get count of existing secrets
|
||||
list_domains_json = {
|
||||
"query": """query listDomains($input: ListDomainsInput!) {\n
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user