diff --git a/metadata-ingestion/scripts/avro_codegen.py b/metadata-ingestion/scripts/avro_codegen.py index 88a328ae7d..fbd8204b45 100644 --- a/metadata-ingestion/scripts/avro_codegen.py +++ b/metadata-ingestion/scripts/avro_codegen.py @@ -1,3 +1,4 @@ +import collections import json import re from pathlib import Path @@ -360,6 +361,15 @@ def generate( f'Entity key {entity.keyAspect} is used by {aspect["Aspect"]["keyForEntity"]} and {entity.name}' ) + # Also require that the aspect list is deduplicated. + duplicate_aspects = collections.Counter(entity.aspects) - collections.Counter( + set(entity.aspects) + ) + if duplicate_aspects: + raise ValueError( + f"Entity {entity.name} has duplicate aspects: {duplicate_aspects}" + ) + aspect["Aspect"]["keyForEntity"] = entity.name aspect["Aspect"]["entityCategory"] = entity.category aspect["Aspect"]["entityAspects"] = entity.aspects diff --git a/metadata-ingestion/tests/unit/test_codegen.py b/metadata-ingestion/tests/unit/test_codegen.py index 3dec1e11b6..b0dc97746c 100644 --- a/metadata-ingestion/tests/unit/test_codegen.py +++ b/metadata-ingestion/tests/unit/test_codegen.py @@ -1,15 +1,27 @@ +import os +import pathlib +import typing +from typing import List, Type + import pytest +import typing_inspect from datahub.metadata.schema_classes import ( ASPECT_CLASSES, KEY_ASPECTS, FineGrainedLineageClass, + MetadataChangeEventClass, OwnershipClass, TelemetryKeyClass, UpstreamClass, _Aspect, ) +_UPDATE_ENTITY_REGISTRY = os.getenv("UPDATE_ENTITY_REGISTRY", "false").lower() == "true" +ENTITY_REGISTRY_PATH = pathlib.Path( + "../metadata-models/src/main/resources/entity-registry.yml" +) + def test_class_filter() -> None: # The codegen should only generate classes for aspects and a few extra classes. 
def _add_to_registry(entity: str, aspect: str) -> None:
    """Add ``aspect`` to the aspect list of ``entity`` in the entity registry file.

    The YAML document at ``ENTITY_REGISTRY_PATH`` is loaded, the entry whose
    ``name`` equals ``entity`` is located, and the file is rewritten in place
    with ``aspect`` appended to that entry's ``aspects`` list.

    The operation is idempotent: if ``aspect`` is already listed, the file is
    left untouched. This matters because the caller decides what is "missing"
    from the generated classes rather than from the YAML file, so running the
    update twice without regenerating would otherwise write duplicate entries
    that the codegen rejects.

    Raises:
        ValueError: if no entry named ``entity`` exists in the registry.
    """
    # Imported lazily so the dependency is only needed when the registry is
    # actually being updated.
    from ruamel.yaml import YAML

    yaml = YAML()

    doc = yaml.load(ENTITY_REGISTRY_PATH)

    for entry in doc["entities"]:
        if entry["name"] == entity:
            break
    else:
        raise ValueError(
            f'could not find entity "{entity}" in entity registry at {ENTITY_REGISTRY_PATH}'
        )

    if aspect in entry["aspects"]:
        # Already registered; avoid creating a duplicate and skip the rewrite.
        return
    entry["aspects"].append(aspect)

    # Prevent line wrapping + preserve indentation.
    yaml.width = 2**20  # type: ignore[assignment]
    yaml.indent(mapping=2, sequence=4, offset=2)
    yaml.dump(doc, ENTITY_REGISTRY_PATH)


def test_entity_registry_completeness() -> None:
    """Check that every aspect used by a snapshot class is in the entity registry.

    The snapshot classes can have aspects that the entity registry doesn't
    know about. This ensures that we don't have any of those cases. When
    ``_UPDATE_ENTITY_REGISTRY`` is set, missing aspects are written to the
    registry file instead of being reported as failures.
    """
    errors: List[str] = []

    # The members of the type hint on MetadataChangeEventClass's
    # `proposedSnapshot` argument are the snapshot classes to inspect.
    snapshot_classes: List[Type] = typing_inspect.get_args(
        typing.get_type_hints(MetadataChangeEventClass.__init__)["proposedSnapshot"]
    )

    # Snapshot class names are matched to entity types case-insensitively.
    lowercase_entity_type_map = {name.lower(): name for name in KEY_ASPECTS}

    for snapshot_class in snapshot_classes:
        lowercase_entity_type: str = snapshot_class.__name__.replace(
            "SnapshotClass", ""
        ).lower()
        entity_type = lowercase_entity_type_map[lowercase_entity_type]

        key_aspect = KEY_ASPECTS[entity_type]
        supported_aspect_names = set(key_aspect.get_aspect_info()["entityAspects"])

        # Presumably the `aspects` hint is a list of a union of aspect classes:
        # the first get_args unwraps the container, the second the union.
        snapshot_aspect_types: List[Type[_Aspect]] = typing_inspect.get_args(
            typing_inspect.get_args(
                typing.get_type_hints(snapshot_class.__init__)["aspects"]
            )[0]
        )

        for aspect_type in snapshot_aspect_types:
            # The key aspect is skipped; only the remaining aspects are
            # compared against the registry's `entityAspects` list.
            if aspect_type == key_aspect:
                continue

            aspect_name = aspect_type.ASPECT_NAME
            if aspect_name in supported_aspect_names:
                continue

            if _UPDATE_ENTITY_REGISTRY:
                _add_to_registry(entity_type, aspect_name)
            else:
                error = f"entity {entity_type}: aspect {aspect_name} is missing from the entity registry"
                print(error)
                errors.append(error)

    assert (
        not errors
    ), f'To fix these errors, run "UPDATE_ENTITY_REGISTRY=true pytest {__file__}"'
category: internal @@ -36,6 +48,16 @@ entities: - domains - deprecation - versionInfo + - dataJobInfo + - dataJobInputOutput + - editableDataJobProperties + - ownership + - status + - globalTags + - browsePaths + - glossaryTerms + - institutionalMemory + - dataPlatformInstance - name: dataFlow category: core keyAspect: dataFlowKey @@ -43,6 +65,21 @@ entities: - domains - deprecation - versionInfo + - dataFlowInfo + - editableDataFlowProperties + - ownership + - status + - globalTags + - browsePaths + - glossaryTerms + - institutionalMemory + - dataPlatformInstance + - name: dataProcess + keyAspect: dataProcessKey + aspects: + - dataProcessInfo + - ownership + - status - name: dataProcessInstance doc: DataProcessInstance represents an instance of a datajob/jobflow run keyAspect: dataProcessInstanceKey @@ -55,12 +92,22 @@ entities: - name: chart keyAspect: chartKey aspects: - - domains - - container - - deprecation + - chartInfo + - editableChartProperties + - chartQuery - inputFields - chartUsageStatistics - embed + - browsePaths + - domains + - container + - deprecation + - ownership + - status + - institutionalMemory + - dataPlatformInstance + - globalTags + - glossaryTerms - name: dashboard keyAspect: dashboardKey aspects: @@ -71,6 +118,15 @@ entities: - inputFields - subTypes - embed + - dashboardInfo + - editableDashboardProperties + - ownership + - status + - globalTags + - browsePaths + - glossaryTerms + - institutionalMemory + - dataPlatformInstance - name: notebook doc: Notebook represents a combination of query, text, chart and etc. 
This is in BETA version keyAspect: notebookKey @@ -141,15 +197,19 @@ entities: - tagProperties - ownership - deprecation + - status - name: glossaryTerm keyAspect: glossaryTermKey aspects: - glossaryTermInfo + - glossaryRelatedTerms - institutionalMemory - schemaMetadata - ownership - deprecation - domains + - status + - browsePaths - name: glossaryNode keyAspect: glossaryNodeKey aspects: @@ -205,6 +265,24 @@ entities: - glossaryTerms - editableMlModelProperties - domains + - ownership + - mlModelProperties + - intendedUse + - mlModelFactorPrompts + - mlModelMetrics + - mlModelEvaluationData + - mlModelTrainingData + - mlModelQuantitativeAnalyses + - mlModelEthicalConsiderations + - mlModelCaveatsAndRecommendations + - institutionalMemory + - sourceCode + - status + - cost + - deprecation + - browsePaths + - globalTags + - dataPlatformInstance - name: mlModelGroup category: core keyAspect: mlModelGroupKey @@ -212,6 +290,23 @@ entities: - glossaryTerms - editableMlModelGroupProperties - domains + - mlModelGroupProperties + - ownership + - status + - deprecation + - browsePaths + - globalTags + - dataPlatformInstance + - name: mlModelDeployment + category: core + keyAspect: mlModelDeploymentKey + aspects: + - mlModelDeploymentProperties + - ownership + - status + - deprecation + - globalTags + - dataPlatformInstance - name: mlFeatureTable category: core keyAspect: mlFeatureTableKey @@ -219,6 +314,14 @@ entities: - glossaryTerms - editableMlFeatureTableProperties - domains + - mlFeatureTableProperties + - ownership + - institutionalMemory + - status + - deprecation + - browsePaths + - globalTags + - dataPlatformInstance - name: mlFeature category: core keyAspect: mlFeatureKey @@ -226,6 +329,14 @@ entities: - glossaryTerms - editableMlFeatureProperties - domains + - mlFeatureProperties + - ownership + - institutionalMemory + - status + - deprecation + - browsePaths + - globalTags + - dataPlatformInstance - name: mlPrimaryKey category: core keyAspect: mlPrimaryKeyKey @@ 
-233,6 +344,13 @@ entities: - glossaryTerms - editableMlPrimaryKeyProperties - domains + - mlPrimaryKeyProperties + - ownership + - institutionalMemory + - status + - deprecation + - globalTags + - dataPlatformInstance - name: telemetry category: internal keyAspect: telemetryKey