From 3c2a4fe39df9e76e9a4dd45de4ed7c35afaa5c71 Mon Sep 17 00:00:00 2001 From: John Joyce Date: Fri, 28 Jan 2022 15:00:35 -0800 Subject: [PATCH] Refining docs (#4001) --- docs/modeling/metadata-model.md | 19 ++++---- .../ingestion/source/identity/azure_ad.py | 2 +- smoke-test/tests/domains/domains_test.py | 45 +++++++++++++++++++ 3 files changed, 56 insertions(+), 10 deletions(-) diff --git a/docs/modeling/metadata-model.md b/docs/modeling/metadata-model.md index 7d1d285db4..cb053149ba 100644 --- a/docs/modeling/metadata-model.md +++ b/docs/modeling/metadata-model.md @@ -45,18 +45,18 @@ DataHub's "core" Entity types model the Data Assets that comprise the Modern Dat ## The Entity Registry -Where are Entities and their aspects defined in DataHub? Where doe the Metadata Model "live"? The Metadata Model is "stitched together" by means +Where are Entities and their aspects defined in DataHub? Where does the Metadata Model "live"? The Metadata Model is stitched together by means of an **Entity Registry**, a catalog of Entities that comprise the Metadata Graph along with the aspects associated with each. Put -simply, this is where the "schema" of the model is represented. +simply, this is where the "schema" of the model is defined. -Traditionally, the Entity Registry was constructed using [Snapshot](https://github.com/linkedin/datahub/tree/master/metadata-models/src/main/pegasus/com/linkedin/metadata/snapshot) models, which Data Models (schemas) that explicitly tied -an Entity to the Aspects associated with it. An example is [DatasetSnapshot](https://github.com/linkedin/datahub/blob/master/metadata-models/src/main/pegasus/com/linkedin/metadata/snapshot/DatasetSnapshot.pdl), which declares the core `Dataset` Entity. -The relationship particular aspects of a Dataset was captured via a "union" where the list of possible aspects was modeled. 
An example is +Traditionally, the Entity Registry was constructed using [Snapshot](https://github.com/linkedin/datahub/tree/master/metadata-models/src/main/pegasus/com/linkedin/metadata/snapshot) models, which are schemas that explicitly tie +an Entity to the Aspects associated with it. An example is [DatasetSnapshot](https://github.com/linkedin/datahub/blob/master/metadata-models/src/main/pegasus/com/linkedin/metadata/snapshot/DatasetSnapshot.pdl), which defines the core `Dataset` Entity. +The Aspects of the Dataset entity are captured via a union field inside a special "Aspect" schema. An example is [DatasetAspect](https://github.com/linkedin/datahub/blob/master/metadata-models/src/main/pegasus/com/linkedin/metadata/aspect/DatasetAspect.pdl). -This file associates dataset-specific aspects, such as [DatasetProperties](https://github.com/linkedin/datahub/blob/master/metadata-models/src/main/pegasus/com/linkedin/dataset/DatasetProperties.pdl), as well as common aspects like [Ownership](https://github.com/linkedin/datahub/blob/master/metadata-models/src/main/pegasus/com/linkedin/common/Ownership.pdl), +This file associates dataset-specific aspects (like [DatasetProperties](https://github.com/linkedin/datahub/blob/master/metadata-models/src/main/pegasus/com/linkedin/dataset/DatasetProperties.pdl)) and common aspects (like [Ownership](https://github.com/linkedin/datahub/blob/master/metadata-models/src/main/pegasus/com/linkedin/common/Ownership.pdl), [InstitutionalMemory](https://github.com/linkedin/datahub/blob/master/metadata-models/src/main/pegasus/com/linkedin/common/InstitutionalMemory.pdl), -and [Status](https://github.com/linkedin/datahub/blob/master/metadata-models/src/main/pegasus/com/linkedin/common/Status.pdl), -to the Dataset Entity. Similar Snapshots +and [Status](https://github.com/linkedin/datahub/blob/master/metadata-models/src/main/pegasus/com/linkedin/common/Status.pdl)) +to the Dataset Entity. 
This approach to defining Entities will soon be deprecated in favor of a new approach. As of January 2022, DataHub has deprecated support for Snapshot models as a means of adding new entities. Instead, the Entity Registry is defined inside a YAML configuration file called [entity-registry.yml](https://github.com/linkedin/datahub/blob/master/metadata-models/src/main/resources/entity-registry.yml), @@ -67,7 +67,6 @@ each aspect name provided by configuration (via the [@Aspect](https://github.com By moving to this format, evolving the Metadata Model becomes much easier. Adding Entities & Aspects becomes a matter of adding a to the YAML configuration, instead of creating new Snapshot / Aspect files. -> New to [PDL](https://linkedin.github.io/rest.li/pdl_schema) files? Don't fret. They are just a way to define a JSON document "schema" for Aspects in DataHub. All Data ingested to DataHub's Metadata Service is validated against a PDL schema, with each @Aspect corresponding to a single schema. Structurally, PDL is quite similar to [Protobuf](https://developers.google.com/protocol-buffers) and conveniently maps to JSON. ## Exploring DataHub's Metadata Model @@ -104,6 +103,8 @@ DataHub’s modeling language allows you to optimize metadata persistence to ali There are three supported ways to query the metadata graph: by primary key lookup, a search query, and via relationship traversal. +> New to [PDL](https://linkedin.github.io/rest.li/pdl_schema) files? Don't fret. They are just a way to define a JSON document "schema" for Aspects in DataHub. All Data ingested to DataHub's Metadata Service is validated against a PDL schema, with each @Aspect corresponding to a single schema. Structurally, PDL is quite similar to [Protobuf](https://developers.google.com/protocol-buffers) and conveniently maps to JSON. 
+ ### Querying an Entity #### Fetching Latest Entity Aspects (Snapshot) diff --git a/metadata-ingestion/src/datahub/ingestion/source/identity/azure_ad.py b/metadata-ingestion/src/datahub/ingestion/source/identity/azure_ad.py index 9c5b2887d1..56f41ec75b 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/identity/azure_ad.py +++ b/metadata-ingestion/src/datahub/ingestion/source/identity/azure_ad.py @@ -194,7 +194,7 @@ class AzureADSource(Source): datahub_corp_user_snapshots, datahub_corp_user_urn_to_group_membership ) - # Create MetadatWorkUnits for CorpUsers + # Create MetadataWorkUnits for CorpUsers if self.config.ingest_users: # 3) the users for azure_ad_users in self._get_azure_ad_users(): diff --git a/smoke-test/tests/domains/domains_test.py b/smoke-test/tests/domains/domains_test.py index 11b41cb424..b1eb68ac30 100644 --- a/smoke-test/tests/domains/domains_test.py +++ b/smoke-test/tests/domains/domains_test.py @@ -5,6 +5,18 @@ from tests.utils import GMS_ENDPOINT from tests.utils import ingest_file_via_rest from tests.utils import delete_urns_from_file +from typing import List + +import datahub.emitter.mce_builder as builder +from datahub.emitter.mcp import MetadataChangeProposalWrapper +from datahub.emitter.rest_emitter import DatahubRestEmitter +from datahub.metadata.com.linkedin.pegasus2avro.dataset import ( + DatasetLineageTypeClass, + UpstreamClass, + UpstreamLineage, +) +from datahub.metadata.schema_classes import ChangeTypeClass + @pytest.fixture(scope="module", autouse=False) def ingest_cleanup_data(request): print("ingesting domains test data") @@ -21,6 +33,39 @@ def test_healthchecks(wait_for_healthchecks): @pytest.mark.dependency(depends=["test_healthchecks"]) def test_create_list_get_domain(frontend_session): + # Construct upstream tables. 
+ upstream_tables: List[UpstreamClass] = []
+ upstream_table_1 = UpstreamClass(
+ dataset=builder.make_dataset_urn("bigquery", "upstream_table_1", "PROD"),
+ type=DatasetLineageTypeClass.TRANSFORMED,
+ )
+ upstream_tables.append(upstream_table_1)
+ upstream_table_2 = UpstreamClass(
+ dataset=builder.make_dataset_urn("bigquery", "upstream_table_2", "PROD"),
+ type=DatasetLineageTypeClass.TRANSFORMED,
+ )
+ upstream_tables.append(upstream_table_2)
+
+ # Construct a lineage object.
+ upstream_lineage = UpstreamLineage(upstreams=upstream_tables)
+
+ # Construct a MetadataChangeProposalWrapper object.
+ lineage_mcp = MetadataChangeProposalWrapper(
+ entityType="dataset",
+ changeType=ChangeTypeClass.UPSERT,
+ entityUrn=builder.make_dataset_urn("bigquery", "downstream"),
+ aspectName="upstreamLineage",
+ aspect=upstream_lineage,
+ systemMetadata=None,
+ )
+
+ # Create an emitter to the GMS REST API.
+ emitter = DatahubRestEmitter("http://localhost:8080")
+
+ # Emit metadata!
+ emitter.emit_mcp(lineage_mcp)
+
+ # Get count of existing domains
 list_domains_json = {
 "query": """query listDomains($input: ListDomainsInput!) {\n