mirror of
https://github.com/datahub-project/datahub.git
synced 2025-08-18 14:16:48 +00:00
feat(model): fully populate the entity registry (#7818)
This commit is contained in:
parent
97ac8d93f8
commit
af566e1184
@ -1,3 +1,4 @@
|
||||
import collections
|
||||
import json
|
||||
import re
|
||||
from pathlib import Path
|
||||
@ -360,6 +361,15 @@ def generate(
|
||||
f'Entity key {entity.keyAspect} is used by {aspect["Aspect"]["keyForEntity"]} and {entity.name}'
|
||||
)
|
||||
|
||||
# Also require that the aspect list is deduplicated.
|
||||
duplicate_aspects = collections.Counter(entity.aspects) - collections.Counter(
|
||||
set(entity.aspects)
|
||||
)
|
||||
if duplicate_aspects:
|
||||
raise ValueError(
|
||||
f"Entity {entity.name} has duplicate aspects: {duplicate_aspects}"
|
||||
)
|
||||
|
||||
aspect["Aspect"]["keyForEntity"] = entity.name
|
||||
aspect["Aspect"]["entityCategory"] = entity.category
|
||||
aspect["Aspect"]["entityAspects"] = entity.aspects
|
||||
|
@ -1,15 +1,27 @@
|
||||
import os
|
||||
import pathlib
|
||||
import typing
|
||||
from typing import List, Type
|
||||
|
||||
import pytest
|
||||
import typing_inspect
|
||||
|
||||
from datahub.metadata.schema_classes import (
|
||||
ASPECT_CLASSES,
|
||||
KEY_ASPECTS,
|
||||
FineGrainedLineageClass,
|
||||
MetadataChangeEventClass,
|
||||
OwnershipClass,
|
||||
TelemetryKeyClass,
|
||||
UpstreamClass,
|
||||
_Aspect,
|
||||
)
|
||||
|
||||
_UPDATE_ENTITY_REGISTRY = os.getenv("UPDATE_ENTITY_REGISTRY", "false").lower() == "true"
|
||||
ENTITY_REGISTRY_PATH = pathlib.Path(
|
||||
"../metadata-models/src/main/resources/entity-registry.yml"
|
||||
)
|
||||
|
||||
|
||||
def test_class_filter() -> None:
|
||||
# The codegen should only generate classes for aspects and a few extra classes.
|
||||
@ -67,3 +79,74 @@ def test_urn_annotation():
|
||||
assert FineGrainedLineageClass.RECORD_SCHEMA.fields_dict["upstreams"].get_prop(
|
||||
"urn_is_array"
|
||||
)
|
||||
|
||||
|
||||
def _add_to_registry(entity: str, aspect: str) -> None:
    """Add ``aspect`` to the aspect list of ``entity`` in the entity registry file.

    The YAML document at ``ENTITY_REGISTRY_PATH`` is loaded with ruamel.yaml,
    the entry under ``entities`` whose ``name`` equals ``entity`` is located,
    and ``aspect`` is appended to that entry's ``aspects`` list before the
    document is written back to the same path.

    The operation is idempotent: if ``aspect`` is already listed, the file is
    left untouched. This matters because the model codegen rejects entities
    whose aspect list contains duplicates, and this helper may be invoked again
    before the generated classes have been rebuilt from the updated registry.

    Args:
        entity: Name of the entity entry to update.
        aspect: Aspect name to register for that entity.

    Raises:
        ValueError: If no entry named ``entity`` exists in the registry.
    """
    # Imported lazily so the dependency is only needed when the registry is
    # actually being rewritten.
    from ruamel.yaml import YAML

    yaml = YAML()

    doc = yaml.load(ENTITY_REGISTRY_PATH)

    for entry in doc["entities"]:
        if entry["name"] == entity:
            break
    else:
        raise ValueError(
            f'could not find entity "{entity}" in entity registry at {ENTITY_REGISTRY_PATH}'
        )

    aspects = entry["aspects"]
    if aspect in aspects:
        # Already registered; appending again would create a duplicate entry.
        return
    aspects.append(aspect)

    # Prevent line wrapping + preserve indentation.
    yaml.width = 2**20  # type: ignore[assignment]
    yaml.indent(mapping=2, sequence=4, offset=2)
    yaml.dump(doc, ENTITY_REGISTRY_PATH)
|
||||
|
||||
|
||||
def test_entity_registry_completeness():
    """Verify that every snapshot aspect is declared in the entity registry.

    Snapshot classes can carry aspects that the entity registry does not know
    about. For each snapshot class accepted by ``MetadataChangeEventClass``,
    the aspect types from its ``aspects`` type hint are compared against the
    ``entityAspects`` reported by the entity's key aspect.

    When ``_UPDATE_ENTITY_REGISTRY`` is set, missing aspects are written into
    the registry via ``_add_to_registry`` instead of being reported; otherwise
    each missing aspect is printed and the test fails at the end.
    """
    snapshot_union = typing.get_type_hints(MetadataChangeEventClass.__init__)[
        "proposedSnapshot"
    ]
    snapshot_classes: List[Type] = typing_inspect.get_args(snapshot_union)

    # Snapshot class names only give us the entity name case-insensitively,
    # so resolve the canonical name through a lowercase lookup table.
    lowercase_entity_type_map = {name.lower(): name for name in KEY_ASPECTS}

    errors: List[str] = []

    for snapshot_class in snapshot_classes:
        class_stem: str = snapshot_class.__name__.replace("SnapshotClass", "")
        entity_type = lowercase_entity_type_map[class_stem.lower()]

        key_aspect = KEY_ASPECTS[entity_type]
        registered_aspects = set(key_aspect.get_aspect_info()["entityAspects"])

        # The first type argument of the `aspects` hint is itself a
        # parameterized type whose arguments are the individual aspect classes.
        aspects_hint = typing.get_type_hints(snapshot_class.__init__)["aspects"]
        element_hint = typing_inspect.get_args(aspects_hint)[0]
        snapshot_aspect_types: List[Type[_Aspect]] = typing_inspect.get_args(
            element_hint
        )

        for aspect_type in snapshot_aspect_types:
            # The key aspect is not expected in the registry's aspect list.
            if aspect_type == key_aspect:
                continue

            aspect_name = aspect_type.ASPECT_NAME
            if aspect_name in registered_aspects:
                continue

            if _UPDATE_ENTITY_REGISTRY:
                _add_to_registry(entity_type, aspect_name)
                continue

            error = f"entity {entity_type}: aspect {aspect_name} is missing from the entity registry"
            print(error)
            errors.append(error)

    assert (
        not errors
    ), f'To fix these errors, run "UPDATE_ENTITY_REGISTRY=true pytest {__file__}"'
|
||||
|
@ -22,6 +22,18 @@ entities:
|
||||
- testResults
|
||||
- siblings
|
||||
- embed
|
||||
- datasetProperties
|
||||
- editableDatasetProperties
|
||||
- datasetDeprecation
|
||||
- datasetUpstreamLineage
|
||||
- upstreamLineage
|
||||
- institutionalMemory
|
||||
- ownership
|
||||
- editableSchemaMetadata
|
||||
- globalTags
|
||||
- glossaryTerms
|
||||
- browsePaths
|
||||
- dataPlatformInstance
|
||||
- name: dataHubPolicy
|
||||
doc: DataHub Policies represent access policies granted to users or groups on metadata operations like edit, view etc.
|
||||
category: internal
|
||||
@ -36,6 +48,16 @@ entities:
|
||||
- domains
|
||||
- deprecation
|
||||
- versionInfo
|
||||
- dataJobInfo
|
||||
- dataJobInputOutput
|
||||
- editableDataJobProperties
|
||||
- ownership
|
||||
- status
|
||||
- globalTags
|
||||
- browsePaths
|
||||
- glossaryTerms
|
||||
- institutionalMemory
|
||||
- dataPlatformInstance
|
||||
- name: dataFlow
|
||||
category: core
|
||||
keyAspect: dataFlowKey
|
||||
@ -43,6 +65,21 @@ entities:
|
||||
- domains
|
||||
- deprecation
|
||||
- versionInfo
|
||||
- dataFlowInfo
|
||||
- editableDataFlowProperties
|
||||
- ownership
|
||||
- status
|
||||
- globalTags
|
||||
- browsePaths
|
||||
- glossaryTerms
|
||||
- institutionalMemory
|
||||
- dataPlatformInstance
|
||||
- name: dataProcess
|
||||
keyAspect: dataProcessKey
|
||||
aspects:
|
||||
- dataProcessInfo
|
||||
- ownership
|
||||
- status
|
||||
- name: dataProcessInstance
|
||||
doc: DataProcessInstance represents an instance of a datajob/jobflow run
|
||||
keyAspect: dataProcessInstanceKey
|
||||
@ -55,12 +92,22 @@ entities:
|
||||
- name: chart
|
||||
keyAspect: chartKey
|
||||
aspects:
|
||||
- domains
|
||||
- container
|
||||
- deprecation
|
||||
- chartInfo
|
||||
- editableChartProperties
|
||||
- chartQuery
|
||||
- inputFields
|
||||
- chartUsageStatistics
|
||||
- embed
|
||||
- browsePaths
|
||||
- domains
|
||||
- container
|
||||
- deprecation
|
||||
- ownership
|
||||
- status
|
||||
- institutionalMemory
|
||||
- dataPlatformInstance
|
||||
- globalTags
|
||||
- glossaryTerms
|
||||
- name: dashboard
|
||||
keyAspect: dashboardKey
|
||||
aspects:
|
||||
@ -71,6 +118,15 @@ entities:
|
||||
- inputFields
|
||||
- subTypes
|
||||
- embed
|
||||
- dashboardInfo
|
||||
- editableDashboardProperties
|
||||
- ownership
|
||||
- status
|
||||
- globalTags
|
||||
- browsePaths
|
||||
- glossaryTerms
|
||||
- institutionalMemory
|
||||
- dataPlatformInstance
|
||||
- name: notebook
|
||||
doc: Notebook represents a combination of query, text, chart and etc. This is in BETA version
|
||||
keyAspect: notebookKey
|
||||
@ -141,15 +197,19 @@ entities:
|
||||
- tagProperties
|
||||
- ownership
|
||||
- deprecation
|
||||
- status
|
||||
- name: glossaryTerm
|
||||
keyAspect: glossaryTermKey
|
||||
aspects:
|
||||
- glossaryTermInfo
|
||||
- glossaryRelatedTerms
|
||||
- institutionalMemory
|
||||
- schemaMetadata
|
||||
- ownership
|
||||
- deprecation
|
||||
- domains
|
||||
- status
|
||||
- browsePaths
|
||||
- name: glossaryNode
|
||||
keyAspect: glossaryNodeKey
|
||||
aspects:
|
||||
@ -205,6 +265,24 @@ entities:
|
||||
- glossaryTerms
|
||||
- editableMlModelProperties
|
||||
- domains
|
||||
- ownership
|
||||
- mlModelProperties
|
||||
- intendedUse
|
||||
- mlModelFactorPrompts
|
||||
- mlModelMetrics
|
||||
- mlModelEvaluationData
|
||||
- mlModelTrainingData
|
||||
- mlModelQuantitativeAnalyses
|
||||
- mlModelEthicalConsiderations
|
||||
- mlModelCaveatsAndRecommendations
|
||||
- institutionalMemory
|
||||
- sourceCode
|
||||
- status
|
||||
- cost
|
||||
- deprecation
|
||||
- browsePaths
|
||||
- globalTags
|
||||
- dataPlatformInstance
|
||||
- name: mlModelGroup
|
||||
category: core
|
||||
keyAspect: mlModelGroupKey
|
||||
@ -212,6 +290,23 @@ entities:
|
||||
- glossaryTerms
|
||||
- editableMlModelGroupProperties
|
||||
- domains
|
||||
- mlModelGroupProperties
|
||||
- ownership
|
||||
- status
|
||||
- deprecation
|
||||
- browsePaths
|
||||
- globalTags
|
||||
- dataPlatformInstance
|
||||
- name: mlModelDeployment
|
||||
category: core
|
||||
keyAspect: mlModelDeploymentKey
|
||||
aspects:
|
||||
- mlModelDeploymentProperties
|
||||
- ownership
|
||||
- status
|
||||
- deprecation
|
||||
- globalTags
|
||||
- dataPlatformInstance
|
||||
- name: mlFeatureTable
|
||||
category: core
|
||||
keyAspect: mlFeatureTableKey
|
||||
@ -219,6 +314,14 @@ entities:
|
||||
- glossaryTerms
|
||||
- editableMlFeatureTableProperties
|
||||
- domains
|
||||
- mlFeatureTableProperties
|
||||
- ownership
|
||||
- institutionalMemory
|
||||
- status
|
||||
- deprecation
|
||||
- browsePaths
|
||||
- globalTags
|
||||
- dataPlatformInstance
|
||||
- name: mlFeature
|
||||
category: core
|
||||
keyAspect: mlFeatureKey
|
||||
@ -226,6 +329,14 @@ entities:
|
||||
- glossaryTerms
|
||||
- editableMlFeatureProperties
|
||||
- domains
|
||||
- mlFeatureProperties
|
||||
- ownership
|
||||
- institutionalMemory
|
||||
- status
|
||||
- deprecation
|
||||
- browsePaths
|
||||
- globalTags
|
||||
- dataPlatformInstance
|
||||
- name: mlPrimaryKey
|
||||
category: core
|
||||
keyAspect: mlPrimaryKeyKey
|
||||
@ -233,6 +344,13 @@ entities:
|
||||
- glossaryTerms
|
||||
- editableMlPrimaryKeyProperties
|
||||
- domains
|
||||
- mlPrimaryKeyProperties
|
||||
- ownership
|
||||
- institutionalMemory
|
||||
- status
|
||||
- deprecation
|
||||
- globalTags
|
||||
- dataPlatformInstance
|
||||
- name: telemetry
|
||||
category: internal
|
||||
keyAspect: telemetryKey
|
||||
|
Loading…
x
Reference in New Issue
Block a user