diff --git a/docs/api/tutorials/structured-properties.md b/docs/api/tutorials/structured-properties.md index 2caa015e20..ed270811b8 100644 --- a/docs/api/tutorials/structured-properties.md +++ b/docs/api/tutorials/structured-properties.md @@ -6,7 +6,7 @@ import TabItem from '@theme/TabItem'; ## Why Would You Use Structured Properties? Structured properties are a structured, named set of properties that can be attached to logical entities like Datasets, DataJobs, etc. -Structured properties have values that are types. Conceptually, they are like “field definitions”. +Structured properties have values that are typed and support constraints. Learn more about structured properties in the [Structured Properties Feature Guide](../../../docs/features/feature-guides/properties/overview.md). @@ -15,6 +15,7 @@ Learn more about structured properties in the [Structured Properties Feature Gui This guide will show you how to execute the following actions with structured properties. - Create structured properties +- List structured properties - Read structured properties - Delete structured properties - Add structured properties to a dataset @@ -32,7 +33,8 @@ Additionally, you need to have the following tools installed according to the me -Install the relevant CLI version. Forms are available as of CLI version `0.13.1`. The corresponding DataHub Cloud release version is `v0.2.16.5` +Install the relevant CLI version. +Structured Properties were introduced in version `0.13.1`, but we continuously improve and add new functionality, so you should always [upgrade](https://datahubproject.io/docs/cli/#installation) to the latest CLI for best results. Connect to your instance via [init](https://datahubproject.io/docs/cli/#init): - Run `datahub init` to update the instance you want to load into. @@ -56,33 +58,8 @@ Requirements for OpenAPI are: The following code will create a structured property `io.acryl.privacy.retentionTime`. 
- -```graphql -mutation createStructuredProperty { - createStructuredProperty( - input: { - id: "retentionTime", - qualifiedName:"retentionTime", - displayName: "Retention Time", - description: "Retention Time is used to figure out how long to retain records in a dataset", - valueType: "urn:li:dataType:datahub.number", - allowedValues: [ - {numberValue: 30, description: "30 days, usually reserved for datasets that are ephemeral and contain pii"}, - {numberValue: 90, description:"description: Use this for datasets that drive monthly reporting but contain pii"}, - {numberValue: 365, description:"Use this for non-sensitive data that can be retained for longer"} - ], - cardinality: SINGLE, - entityTypes: ["urn:li:entityType:datahub.dataset", "urn:li:entityType:datahub.dataFlow"], - } - ) { - urn - } -} -``` - - - + Create a yaml file representing the properties you’d like to load. For example, below file represents a property `io.acryl.privacy.retentionTime`. You can see the full example [here](https://github.com/datahub-project/datahub/blob/example-yaml-sp/metadata-ingestion/examples/structured_properties/struct_props.yaml). @@ -108,13 +85,41 @@ For example, below file represents a property `io.acryl.privacy.retentionTime`. 
``` Use the CLI to create your properties: -```commandline +```shell datahub properties upsert -f {properties_yaml} ``` If successful, you should see `Created structured property urn:li:structuredProperty:...` + + + +```graphql +mutation createStructuredProperty { + createStructuredProperty( + input: { + id: "retentionTime", + qualifiedName:"retentionTime", + displayName: "Retention Time", + description: "Retention Time is used to figure out how long to retain records in a dataset", + valueType: "urn:li:dataType:datahub.number", + allowedValues: [ + {numberValue: 30, description: "30 days, usually reserved for datasets that are ephemeral and contain pii"}, + {numberValue: 90, description: "Use this for datasets that drive monthly reporting but contain pii"}, + {numberValue: 365, description:"Use this for non-sensitive data that can be retained for longer"} + ], + cardinality: SINGLE, + entityTypes: ["urn:li:entityType:datahub.dataset", "urn:li:entityType:datahub.dataFlow"], + } + ) { + urn + } +} +``` + + + ```shell @@ -236,9 +241,182 @@ Example Response: -## Read Structured Properties +## List Structured Properties -You can see the properties you created by running the following command: +You can list all structured properties in your DataHub instance using the following methods: + + + + +```shell +datahub properties list +``` + +This will show all properties with their full details. 
+ +Example Response: +```json +{ + "urn": "urn:li:structuredProperty:clusterName", + "qualified_name": "clusterName", + "type": "urn:li:dataType:datahub.string", + "description": "Test Cluster Name Property", + "display_name": "Cluster's name", + "entity_types": [ + "urn:li:entityType:datahub.dataset" + ], + "cardinality": "SINGLE" +} +{ + "urn": "urn:li:structuredProperty:projectNames", + "qualified_name": "projectNames", + "type": "urn:li:dataType:datahub.string", + "description": "Test property for project name", + "display_name": "Project Name", + "entity_types": [ + "urn:li:entityType:datahub.dataset", + "urn:li:entityType:datahub.dataFlow" + ], + "cardinality": "MULTIPLE", + "allowed_values": [ + { + "value": "Tracking", + "description": "test value 1 for project" + }, + { + "value": "DataHub", + "description": "test value 2 for project" + } + ] +} +``` + + +If you only want to see the URNs, you can use: + +```shell +datahub properties list --no-details +``` + +Example Response: +``` +[2025-01-08 22:23:00,625] INFO {datahub.cli.specific.structuredproperties_cli:134} - Listing structured property urns only, use --details for more information +urn:li:structuredProperty:clusterName +urn:li:structuredProperty:clusterType +urn:li:structuredProperty:io.acryl.dataManagement.deprecationDate +urn:li:structuredProperty:projectNames +``` + +To download all the structured property definitions into a single file that you can use with the `upsert` command as described in the [create section](#create-structured-properties), you can run the list command with the `--to-file` option. 
+ +```shell +datahub properties list --to-file structured_properties.yaml +``` + +Example Response: +```yaml + - urn: urn:li:structuredProperty:clusterName + qualified_name: clusterName + type: urn:li:dataType:datahub.string + description: Test Cluster Name Property + display_name: Cluster's name + entity_types: + - urn:li:entityType:datahub.dataset + cardinality: SINGLE + - urn: urn:li:structuredProperty:clusterType + qualified_name: clusterType + type: urn:li:dataType:datahub.string + description: Test Cluster Type Property + display_name: Cluster's type + entity_types: + - urn:li:entityType:datahub.dataset + cardinality: SINGLE + - urn: urn:li:structuredProperty:io.acryl.dataManagement.deprecationDate + qualified_name: io.acryl.dataManagement.deprecationDate + type: urn:li:dataType:datahub.date + display_name: Deprecation Date + entity_types: + - urn:li:entityType:datahub.dataset + - urn:li:entityType:datahub.dataFlow + - urn:li:entityType:datahub.dataJob + - urn:li:entityType:datahub.schemaField + cardinality: SINGLE + - urn: urn:li:structuredProperty:io.acryl.privacy.enumProperty5712 + qualified_name: io.acryl.privacy.enumProperty5712 + type: urn:li:dataType:datahub.string + description: The retention policy for the dataset + entity_types: + - urn:li:entityType:datahub.dataset + cardinality: MULTIPLE + allowed_values: + - value: foo + - value: bar +... etc. 
+``` + + + + + +Example Request: +```bash +curl -X 'GET' \ + 'http://localhost:9002/openapi/v3/entity/structuredproperty?systemMetadata=false&includeSoftDelete=false&skipCache=false&aspects=structuredPropertySettings&aspects=propertyDefinition&aspects=institutionalMemory&aspects=structuredPropertyKey&aspects=status&count=10&sortCriteria=urn&sortOrder=ASCENDING&query=*' \ + -H 'accept: application/json' +``` + +Example Response: +```json +{ + "scrollId": "...", + "entities": [ + { + "urn": "urn:li:structuredProperty:clusterName", + "propertyDefinition": { + "value": { + "immutable": false, + "qualifiedName": "clusterName", + "displayName": "Cluster's name", + "valueType": "urn:li:dataType:datahub.string", + "description": "Test Cluster Name Property", + "entityTypes": [ + "urn:li:entityType:datahub.dataset" + ], + "cardinality": "SINGLE" + } + }, + "structuredPropertyKey": { + "value": { + "id": "clusterName" + } + } + } + ] +} +``` + +Key Query Parameters: +- `count`: Number of results to return per page (default: 10) +- `sortCriteria`: Field to sort by (default: urn) +- `sortOrder`: Sort order (ASCENDING or DESCENDING) +- `query`: Search query to filter properties (* for all) + + + + +The list endpoint returns all structured properties in your DataHub instance. Each property includes: +- URN: Unique identifier for the property +- Qualified Name: The property's qualified name +- Type: The data type of the property (string, number, date, etc.) 
+- Description: A description of the property's purpose +- Display Name: Human-readable name for the property +- Entity Types: The types of entities this property can be applied to +- Cardinality: Whether the property accepts single (SINGLE) or multiple (MULTIPLE) values +- Allowed Values: If specified, the list of allowed values for this property + +## Read a single Structured Property + +You can read an individual property you created by running the following command: @@ -279,6 +457,91 @@ If successful, you should see metadata about your properties returned. } ``` + + + +Example Request: +```graphql +query { + structuredProperty(urn: "urn:li:structuredProperty:projectNames") { + urn + type + definition { + qualifiedName + displayName + description + cardinality + allowedValues { + value { + ... on StringValue { + stringValue + } + ... on NumberValue { + numberValue + } + } + description + } + entityTypes { + urn + info { + type + qualifiedName + } + } + } + } +} +``` + +Example Response: +```json +{ + "data": { + "structuredProperty": { + "urn": "urn:li:structuredProperty:projectNames", + "type": "STRUCTURED_PROPERTY", + "definition": { + "qualifiedName": "projectNames", + "displayName": "Project Name", + "description": "Test property for project name", + "cardinality": "MULTIPLE", + "allowedValues": [ + { + "value": { + "stringValue": "Tracking" + }, + "description": "test value 1 for project" + }, + { + "value": { + "stringValue": "DataHub" + }, + "description": "test value 2 for project" + } + ], + "entityTypes": [ + { + "urn": "urn:li:entityType:datahub.dataset", + "info": { + "type": "DATASET", + "qualifiedName": "datahub.dataset" + } + }, + { + "urn": "urn:li:entityType:datahub.dataFlow", + "info": { + "type": "DATA_FLOW", + "qualifiedName": "datahub.dataFlow" + } + } + ] + } + } + }, + "extensions": {} +} +``` @@ -389,7 +652,7 @@ Example Response: This action will set/replace all structured properties on the entity. 
See PATCH operations to add/remove a single property. - + ```graphql mutation upsertStructuredProperties { @@ -537,7 +800,7 @@ datahub dataset get --urn {urn} For reading all structured properties from a dataset: - + ```graphql query getDataset { diff --git a/metadata-ingestion/examples/structured_properties/list_structured_properties.py b/metadata-ingestion/examples/structured_properties/list_structured_properties.py new file mode 100644 index 0000000000..66ac90c122 --- /dev/null +++ b/metadata-ingestion/examples/structured_properties/list_structured_properties.py @@ -0,0 +1,12 @@ +# Usage: python3 list_structured_properties.py +# Expected Output: List of structured properties +# This script lists all structured properties in DataHub +from datahub.api.entities.structuredproperties.structuredproperties import ( + StructuredProperties, +) +from datahub.ingestion.graph.client import get_default_graph + +with get_default_graph() as graph: + structuredproperties = StructuredProperties.list(graph) + for structuredproperty in structuredproperties: + print(structuredproperty.dict()) diff --git a/metadata-ingestion/src/datahub/api/entities/structuredproperties/structuredproperties.py b/metadata-ingestion/src/datahub/api/entities/structuredproperties/structuredproperties.py index 619f69b016..179dbdb231 100644 --- a/metadata-ingestion/src/datahub/api/entities/structuredproperties/structuredproperties.py +++ b/metadata-ingestion/src/datahub/api/entities/structuredproperties/structuredproperties.py @@ -1,7 +1,7 @@ import logging from enum import Enum from pathlib import Path -from typing import List, Optional +from typing import Iterable, List, Optional import yaml from pydantic import validator @@ -226,3 +226,14 @@ class StructuredProperties(ConfigModel): yaml.indent(mapping=2, sequence=4, offset=2) yaml.default_flow_style = False yaml.dump(self.dict(), fp) + + @staticmethod + def list_urns(graph: DataHubGraph) -> Iterable[str]: + return graph.get_urns_by_filter( + 
entity_types=["structuredProperty"], + ) + + @staticmethod + def list(graph: DataHubGraph) -> Iterable["StructuredProperties"]: + for urn in StructuredProperties.list_urns(graph): + yield StructuredProperties.from_datahub(graph, urn) diff --git a/metadata-ingestion/src/datahub/cli/specific/structuredproperties_cli.py b/metadata-ingestion/src/datahub/cli/specific/structuredproperties_cli.py index 42285cf13a..5cd28516a0 100644 --- a/metadata-ingestion/src/datahub/cli/specific/structuredproperties_cli.py +++ b/metadata-ingestion/src/datahub/cli/specific/structuredproperties_cli.py @@ -1,9 +1,11 @@ import json import logging from pathlib import Path +from typing import Iterable import click from click_default_group import DefaultGroup +from ruamel.yaml import YAML from datahub.api.entities.structuredproperties.structuredproperties import ( StructuredProperties, @@ -61,3 +63,85 @@ def get(urn: str, to_file: str) -> None: ) else: click.secho(f"Structured property {urn} does not exist") + + +@properties.command( + name="list", +) +@click.option("--details/--no-details", is_flag=True, default=True) +@click.option("--to-file", required=False, type=str) +@telemetry.with_telemetry() +def list(details: bool, to_file: str) -> None: + """List structured properties in DataHub""" + + def to_yaml_list( + objects: Iterable[StructuredProperties], # iterable of objects to dump + file: Path, + ) -> None: + """Write objects to file as a YAML list, merging with what the file already holds. + + Entries already in the file keep their position and are replaced when the same urn + was listed; listed properties that are new to the file are appended at the end. + """ + yaml = YAML(typ="rt") # default, if not specified, is 'rt' (round-trip) + yaml.indent(mapping=2, sequence=4, offset=2) + yaml.default_flow_style = False + # materialize once: the iterable may be a generator and is scanned twice below + objects = [obj for obj in objects] + if file.exists(): + with open(file, "r") as fp: + # an empty file loads as None, treat it like an empty list + existing_objects = [ + StructuredProperties.parse_obj(obj) for obj in yaml.load(fp) or [] + ] + listed_by_urn = {obj.urn: obj for obj in objects} + existing_urns = {obj.urn for obj in existing_objects} + 
# every entry is turned back into a plain dict, also the ones that were not listed + # again, so that yaml.dump is never handed a StructuredProperties instance + serialized_objects = [ + listed_by_urn.get(obj.urn, obj).dict( + exclude_unset=True, exclude_none=True + ) + for obj in existing_objects + ] + serialized_objects += [ + obj.dict(exclude_unset=True, exclude_none=True) + for obj in objects + if obj.urn not in existing_urns + ] + else: + serialized_objects = [ + obj.dict(exclude_unset=True, exclude_none=True) for obj in objects + ] + + with open(file, "w") as fp: + yaml.dump(serialized_objects, fp) + + with get_default_graph() as graph: + if details: + logger.info( + "Listing structured properties with details. Use --no-details for urns only" + ) + structuredproperties = StructuredProperties.list(graph) + if to_file: + to_yaml_list(structuredproperties, Path(to_file)) + else: + for structuredproperty in structuredproperties: + click.secho( + f"{json.dumps(structuredproperty.dict(exclude_unset=True, exclude_none=True), indent=2)}" + ) + else: + logger.info( + "Listing structured property urns only, use --details for more information" + ) + structured_property_urns = StructuredProperties.list_urns(graph) + if to_file: + with open(to_file, "w") as f: + for urn in structured_property_urns: + f.write(f"{urn}\n") + click.secho( + f"Structured property urns written to {to_file}", fg="green" + ) + else: + for urn in structured_property_urns: + click.secho(f"{urn}") diff --git a/metadata-ingestion/tests/unit/structured_properties/test_structured_properties.py b/metadata-ingestion/tests/unit/structured_properties/test_structured_properties.py new file mode 100644 index 0000000000..d03b08b77d --- /dev/null +++ 
b/metadata-ingestion/tests/unit/structured_properties/test_structured_properties.py @@ -0,0 +1,213 @@ +from unittest.mock import Mock + +import pytest +import yaml + +from datahub.api.entities.structuredproperties.structuredproperties import ( + AllowedValue, + StructuredProperties, + TypeQualifierAllowedTypes, +) +from datahub.ingestion.graph.client import DataHubGraph +from datahub.metadata.schema_classes import ( + PropertyValueClass, + StructuredPropertyDefinitionClass, +) + + +@pytest.fixture +def sample_yaml_content(): + return """ +- id: test_property + type: string + description: Test description + display_name: Test Property + entity_types: + - dataset + cardinality: SINGLE + allowed_values: + - value: test_value + description: Test value description +""" + + +@pytest.fixture +def sample_yaml_file(tmp_path, sample_yaml_content): + yaml_file = tmp_path / "test_properties.yaml" + yaml_file.write_text(sample_yaml_content) + return str(yaml_file) + + +@pytest.fixture +def mock_graph(): + return Mock(spec=DataHubGraph) + + +def test_structured_properties_basic_creation(): + props = StructuredProperties( + id="test_prop", type="string", description="Test description" + ) + assert props.id == "test_prop" + assert props.type == "urn:li:dataType:datahub.string" + assert props.description == "Test description" + assert props.urn == "urn:li:structuredProperty:test_prop" + + +def test_structured_properties_validate_type(): + # Test valid types + props = StructuredProperties(id="test", type="string") + assert props.type == "urn:li:dataType:datahub.string" + + # Test invalid type + with pytest.raises(ValueError, match="Type .* is not allowed"): + StructuredProperties(id="test", type="invalid_type") + + +def test_structured_properties_validate_entity_types(): + # Test valid entity type + props = StructuredProperties(id="test", type="string", entity_types=["dataset"]) + assert props.entity_types + assert "urn:li:entityType:datahub.dataset" in props.entity_types + + # Test 
invalid entity type + with pytest.raises(ValueError, match="not a valid entity type"): + StructuredProperties(id="test", type="string", entity_types=["invalid_entity"]) + + +def test_structured_properties_from_yaml(sample_yaml_file): + props = StructuredProperties.from_yaml(sample_yaml_file) + assert len(props) == 1 + assert props[0].id == "test_property" + assert props[0].type == "urn:li:dataType:datahub.string" + assert props[0].description == "Test description" + assert props[0].display_name + assert props[0].display_name == "Test Property" + assert props[0].allowed_values + assert len(props[0].allowed_values) == 1 + assert props[0].allowed_values[0].value == "test_value" + + +def test_structured_properties_generate_mcps(): + props = StructuredProperties( + id="test_prop", + type="string", + description="Test description", + display_name="Test Property", + entity_types=["dataset"], + allowed_values=[ + AllowedValue(value="test_value", description="Test value description") + ], + ) + + mcps = props.generate_mcps() + assert len(mcps) == 1 + mcp = mcps[0] + + assert mcp.entityUrn == "urn:li:structuredProperty:test_prop" + assert isinstance(mcp.aspect, StructuredPropertyDefinitionClass) + assert mcp.aspect.valueType == "urn:li:dataType:datahub.string" + assert mcp.aspect.description == "Test description" + assert mcp.aspect.allowedValues + assert len(mcp.aspect.allowedValues) == 1 + assert mcp.aspect.allowedValues[0].value == "test_value" + + +def test_structured_properties_from_datahub(mock_graph): + mock_aspect = StructuredPropertyDefinitionClass( + qualifiedName="test_prop", + valueType="urn:li:dataType:datahub.string", + displayName="Test Property", + description="Test description", + entityTypes=["urn:li:entityType:datahub.dataset"], + cardinality="SINGLE", + allowedValues=[ + PropertyValueClass(value="test_value", description="Test description") + ], + ) + + mock_graph.get_aspect.return_value = mock_aspect + + props = StructuredProperties.from_datahub( + 
mock_graph, "urn:li:structuredProperty:test_prop" + ) + + assert props.qualified_name == "test_prop" + assert props.type == "urn:li:dataType:datahub.string" + assert props.display_name == "Test Property" + assert props.allowed_values + assert len(props.allowed_values) == 1 + assert props.allowed_values[0].value == "test_value" + + +def test_structured_properties_to_yaml(tmp_path): + props = StructuredProperties( + id="test_prop", + type="string", + description="Test description", + allowed_values=[ + AllowedValue(value="test_value", description="Test value description") + ], + ) + + yaml_file = tmp_path / "output.yaml" + props.to_yaml(yaml_file) + + # Verify the yaml file was created and contains expected content + assert yaml_file.exists() + with open(yaml_file) as f: + content = yaml.safe_load(f) + assert content["id"] == "test_prop" + assert content["type"] == "urn:li:dataType:datahub.string" + assert content["description"] == "Test description" + + +@pytest.mark.parametrize( + "input_type,expected_type", + [ + ("string", "urn:li:dataType:datahub.string"), + ("STRING", "urn:li:dataType:datahub.string"), + ("number", "urn:li:dataType:datahub.number"), + ("date", "urn:li:dataType:datahub.date"), + ], +) +def test_structured_properties_type_normalization(input_type, expected_type): + props = StructuredProperties(id="test_prop", type=input_type) + assert props.type == expected_type + + +def test_structured_properties_type_qualifier(): + props = StructuredProperties( + id="test_prop", + type="urn", + type_qualifier=TypeQualifierAllowedTypes(allowed_types=["dataset"]), + ) + + mcps = props.generate_mcps() + assert mcps[0].aspect + assert mcps[0].aspect.typeQualifier["allowedTypes"] == [ # type: ignore + "urn:li:entityType:datahub.dataset" + ] + + +def test_structured_properties_list(mock_graph): + mock_graph.get_urns_by_filter.return_value = [ + "urn:li:structuredProperty:prop1", + "urn:li:structuredProperty:prop2", + ] + + mock_aspect = 
StructuredPropertyDefinitionClass( + qualifiedName="test_prop", + valueType="urn:li:dataType:string", + entityTypes=["urn:li:entityType:datahub.dataset"], + ) + mock_graph.get_aspect.return_value = mock_aspect + + props = list(StructuredProperties.list(mock_graph)) + + # Verify get_urns_by_filter was called with correct arguments + mock_graph.get_urns_by_filter.assert_called_once_with( + entity_types=["structuredProperty"] + ) + + assert len(props) == 2 + assert all(isinstance(prop, StructuredProperties) for prop in props) diff --git a/smoke-test/tests/structured_properties/test_structured_properties.py b/smoke-test/tests/structured_properties/test_structured_properties.py index 533a03a557..e3c33aa406 100644 --- a/smoke-test/tests/structured_properties/test_structured_properties.py +++ b/smoke-test/tests/structured_properties/test_structured_properties.py @@ -839,3 +839,49 @@ def test_dataset_structured_property_delete(ingest_cleanup_data, graph_client, c # Validate search works for property #1 & #2 validate_search(property1.qualified_name, expected=[]) validate_search(property2.qualified_name, expected=[dataset_urns[0]]) + + +def test_structured_properties_list(ingest_cleanup_data, graph_client, caplog): + # Create property, assign value to target dataset urn + def create_property(): + property_name = f"listTest{randint(10, 10000)}Property" + value_type = "string" + property_urn = f"urn:li:structuredProperty:{default_namespace}.{property_name}" + + create_property_definition( + property_name=property_name, + graph=graph_client, + value_type=value_type, + cardinality="SINGLE", + ) + + test_property = StructuredProperties.from_datahub( + graph=graph_client, urn=property_urn + ) + assert test_property is not None + + return test_property + + # create 2 structured properties + property1 = create_property() + property2 = create_property() + wait_for_writes_to_sync() + + # validate that urns are in the list + structured_properties_urns = [ + u for u in 
StructuredProperties.list_urns(graph_client) + ] + assert property1.urn in structured_properties_urns + assert property2.urn in structured_properties_urns + + # list structured properties (full) + structured_properties = StructuredProperties.list(graph_client) + matched_properties = [ + p for p in structured_properties if p.urn in [property1.urn, property2.urn] + ] + assert len(matched_properties) == 2 + retrieved_property1 = next(p for p in matched_properties if p.urn == property1.urn) + retrieved_property2 = next(p for p in matched_properties if p.urn == property2.urn) + + assert property1.dict() == retrieved_property1.dict() + assert property2.dict() == retrieved_property2.dict()