feat(model): fully populate the entity registry (#7818)

This commit is contained in:
Harshal Sheth 2023-04-16 02:03:05 +05:30 committed by GitHub
parent 97ac8d93f8
commit af566e1184
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 214 additions and 3 deletions

View File

@ -1,3 +1,4 @@
import collections
import json
import re
from pathlib import Path
@ -360,6 +361,15 @@ def generate(
f'Entity key {entity.keyAspect} is used by {aspect["Aspect"]["keyForEntity"]} and {entity.name}'
)
# Also require that the aspect list is deduplicated.
duplicate_aspects = collections.Counter(entity.aspects) - collections.Counter(
set(entity.aspects)
)
if duplicate_aspects:
raise ValueError(
f"Entity {entity.name} has duplicate aspects: {duplicate_aspects}"
)
aspect["Aspect"]["keyForEntity"] = entity.name
aspect["Aspect"]["entityCategory"] = entity.category
aspect["Aspect"]["entityAspects"] = entity.aspects

View File

@ -1,15 +1,27 @@
import os
import pathlib
import typing
from typing import List, Type
import pytest
import typing_inspect
from datahub.metadata.schema_classes import (
ASPECT_CLASSES,
KEY_ASPECTS,
FineGrainedLineageClass,
MetadataChangeEventClass,
OwnershipClass,
TelemetryKeyClass,
UpstreamClass,
_Aspect,
)
# Opt-in switch read once at import time: true only when the UPDATE_ENTITY_REGISTRY
# environment variable equals "true" (case-insensitive); unset defaults to "false".
# When enabled, test_entity_registry_completeness rewrites the registry file
# instead of reporting missing aspects as errors.
_UPDATE_ENTITY_REGISTRY = os.getenv("UPDATE_ENTITY_REGISTRY", "false").lower() == "true"
# Location of the entity registry YAML that the update mode edits in place.
# NOTE(review): this is a relative path, so it resolves against the current
# working directory - presumably pytest is expected to run from a directory
# that is a sibling of metadata-models; confirm.
ENTITY_REGISTRY_PATH = pathlib.Path(
"../metadata-models/src/main/resources/entity-registry.yml"
)
def test_class_filter() -> None:
# The codegen should only generate classes for aspects and a few extra classes.
@ -67,3 +79,74 @@ def test_urn_annotation():
assert FineGrainedLineageClass.RECORD_SCHEMA.fields_dict["upstreams"].get_prop(
"urn_is_array"
)
def _add_to_registry(entity: str, aspect: str) -> None:
    """Add ``aspect`` to the aspect list of ``entity`` in the entity registry file.

    The YAML document at ``ENTITY_REGISTRY_PATH`` is loaded, the entry whose
    ``name`` equals ``entity`` is located, and the file is written back in place
    with the aspect appended.

    The operation is idempotent: if the aspect is already listed for the entity,
    nothing is appended and the file is not rewritten. This matters because the
    caller may invoke this helper based on information that is older than the
    file on disk (e.g. running the update twice in a row), and a repeated entry
    in an entity's aspect list is not a valid registry state.

    Args:
        entity: Name of the registry entry to modify (matched against ``name``).
        aspect: Aspect name to add to that entry's ``aspects`` list.

    Raises:
        ValueError: If no entry named ``entity`` exists in the registry.
    """
    # Local import: ruamel.yaml is only needed when this helper actually runs.
    from ruamel.yaml import YAML

    yaml = YAML()
    doc = yaml.load(ENTITY_REGISTRY_PATH)

    entry = next(
        (candidate for candidate in doc["entities"] if candidate["name"] == entity),
        None,
    )
    if entry is None:
        raise ValueError(
            f'could not find entity "{entity}" in entity registry at {ENTITY_REGISTRY_PATH}'
        )

    if aspect in entry["aspects"]:
        # Already registered: avoid creating a duplicate and skip the rewrite.
        return
    entry["aspects"].append(aspect)

    # Prevent line wrapping + preserve indentation.
    yaml.width = 2**20  # type: ignore[assignment]
    yaml.indent(mapping=2, sequence=4, offset=2)

    yaml.dump(doc, ENTITY_REGISTRY_PATH)
def test_entity_registry_completeness():
    """Check that every aspect a snapshot class accepts is listed in the entity registry.

    The snapshot classes can carry aspects that the entity registry does not
    know about; this test makes sure there are no such cases. For each snapshot
    type accepted by ``MetadataChangeEventClass``, the aspect types from its
    ``aspects`` type hint are compared against the ``entityAspects`` reported by
    the entity's key aspect. The key aspect itself is ignored.

    When ``_UPDATE_ENTITY_REGISTRY`` is set, missing aspects are added to the
    registry file via ``_add_to_registry`` instead of being reported; otherwise
    each one is printed and the test fails at the end.
    """
    mce_hints = typing.get_type_hints(MetadataChangeEventClass.__init__)
    snapshot_types: List[Type] = typing_inspect.get_args(mce_hints["proposedSnapshot"])

    # Snapshot class names only give us the entity name modulo casing, so look
    # the canonical name up through a lowercase index of the known entities.
    entity_name_by_lowercase = {key.lower(): key for key in KEY_ASPECTS}

    problems: List[str] = []
    for snapshot_type in snapshot_types:
        stripped_name: str = snapshot_type.__name__.replace("SnapshotClass", "")
        entity_type = entity_name_by_lowercase[stripped_name.lower()]
        key_aspect = KEY_ASPECTS[entity_type]
        registered = set(key_aspect.get_aspect_info()["entityAspects"])

        # Unwrap the "aspects" hint twice: first to its element hint, then to
        # the individual aspect classes that element may be.
        aspects_hint = typing.get_type_hints(snapshot_type.__init__)["aspects"]
        element_hint = typing_inspect.get_args(aspects_hint)[0]
        candidate_types: List[Type[_Aspect]] = typing_inspect.get_args(element_hint)

        for aspect_type in candidate_types:
            if aspect_type == key_aspect:
                continue

            aspect_name = aspect_type.ASPECT_NAME
            if aspect_name in registered:
                continue

            if _UPDATE_ENTITY_REGISTRY:
                _add_to_registry(entity_type, aspect_name)
                continue

            error = f"entity {entity_type}: aspect {aspect_name} is missing from the entity registry"
            print(error)
            problems.append(error)

    assert (
        not problems
    ), f'To fix these errors, run "UPDATE_ENTITY_REGISTRY=true pytest {__file__}"'

View File

@ -22,6 +22,18 @@ entities:
- testResults
- siblings
- embed
- datasetProperties
- editableDatasetProperties
- datasetDeprecation
- datasetUpstreamLineage
- upstreamLineage
- institutionalMemory
- ownership
- editableSchemaMetadata
- globalTags
- glossaryTerms
- browsePaths
- dataPlatformInstance
- name: dataHubPolicy
doc: DataHub Policies represent access policies granted to users or groups on metadata operations like edit, view etc.
category: internal
@ -36,6 +48,16 @@ entities:
- domains
- deprecation
- versionInfo
- dataJobInfo
- dataJobInputOutput
- editableDataJobProperties
- ownership
- status
- globalTags
- browsePaths
- glossaryTerms
- institutionalMemory
- dataPlatformInstance
- name: dataFlow
category: core
keyAspect: dataFlowKey
@ -43,6 +65,21 @@ entities:
- domains
- deprecation
- versionInfo
- dataFlowInfo
- editableDataFlowProperties
- ownership
- status
- globalTags
- browsePaths
- glossaryTerms
- institutionalMemory
- dataPlatformInstance
- name: dataProcess
keyAspect: dataProcessKey
aspects:
- dataProcessInfo
- ownership
- status
- name: dataProcessInstance
doc: DataProcessInstance represents an instance of a datajob/jobflow run
keyAspect: dataProcessInstanceKey
@ -55,12 +92,22 @@ entities:
- name: chart
keyAspect: chartKey
aspects:
- domains
- container
- deprecation
- chartInfo
- editableChartProperties
- chartQuery
- inputFields
- chartUsageStatistics
- embed
- browsePaths
- domains
- container
- deprecation
- ownership
- status
- institutionalMemory
- dataPlatformInstance
- globalTags
- glossaryTerms
- name: dashboard
keyAspect: dashboardKey
aspects:
@ -71,6 +118,15 @@ entities:
- inputFields
- subTypes
- embed
- dashboardInfo
- editableDashboardProperties
- ownership
- status
- globalTags
- browsePaths
- glossaryTerms
- institutionalMemory
- dataPlatformInstance
- name: notebook
doc: Notebook represents a combination of query, text, chart and etc. This is in BETA version
keyAspect: notebookKey
@ -141,15 +197,19 @@ entities:
- tagProperties
- ownership
- deprecation
- status
- name: glossaryTerm
keyAspect: glossaryTermKey
aspects:
- glossaryTermInfo
- glossaryRelatedTerms
- institutionalMemory
- schemaMetadata
- ownership
- deprecation
- domains
- status
- browsePaths
- name: glossaryNode
keyAspect: glossaryNodeKey
aspects:
@ -205,6 +265,24 @@ entities:
- glossaryTerms
- editableMlModelProperties
- domains
- ownership
- mlModelProperties
- intendedUse
- mlModelFactorPrompts
- mlModelMetrics
- mlModelEvaluationData
- mlModelTrainingData
- mlModelQuantitativeAnalyses
- mlModelEthicalConsiderations
- mlModelCaveatsAndRecommendations
- institutionalMemory
- sourceCode
- status
- cost
- deprecation
- browsePaths
- globalTags
- dataPlatformInstance
- name: mlModelGroup
category: core
keyAspect: mlModelGroupKey
@ -212,6 +290,23 @@ entities:
- glossaryTerms
- editableMlModelGroupProperties
- domains
- mlModelGroupProperties
- ownership
- status
- deprecation
- browsePaths
- globalTags
- dataPlatformInstance
- name: mlModelDeployment
category: core
keyAspect: mlModelDeploymentKey
aspects:
- mlModelDeploymentProperties
- ownership
- status
- deprecation
- globalTags
- dataPlatformInstance
- name: mlFeatureTable
category: core
keyAspect: mlFeatureTableKey
@ -219,6 +314,14 @@ entities:
- glossaryTerms
- editableMlFeatureTableProperties
- domains
- mlFeatureTableProperties
- ownership
- institutionalMemory
- status
- deprecation
- browsePaths
- globalTags
- dataPlatformInstance
- name: mlFeature
category: core
keyAspect: mlFeatureKey
@ -226,6 +329,14 @@ entities:
- glossaryTerms
- editableMlFeatureProperties
- domains
- mlFeatureProperties
- ownership
- institutionalMemory
- status
- deprecation
- browsePaths
- globalTags
- dataPlatformInstance
- name: mlPrimaryKey
category: core
keyAspect: mlPrimaryKeyKey
@ -233,6 +344,13 @@ entities:
- glossaryTerms
- editableMlPrimaryKeyProperties
- domains
- mlPrimaryKeyProperties
- ownership
- institutionalMemory
- status
- deprecation
- globalTags
- dataPlatformInstance
- name: telemetry
category: internal
keyAspect: telemetryKey