From 3abeadde4ed057d97ff17d84ccdf07adb0b893f1 Mon Sep 17 00:00:00 2001 From: Gabe Lyons Date: Fri, 1 Aug 2025 12:26:45 -0700 Subject: [PATCH] feat(meta-mappping): support meta mappings of lists (#14306) --- metadata-ingestion/docs/sources/dbt/dbt.md | 24 +++++++ .../src/datahub/utilities/mapping.py | 31 +++++++++- metadata-ingestion/tests/unit/test_mapping.py | 62 +++++++++++++++++++ 3 files changed, 115 insertions(+), 2 deletions(-) diff --git a/metadata-ingestion/docs/sources/dbt/dbt.md b/metadata-ingestion/docs/sources/dbt/dbt.md index 80ed11df53..978f930f1b 100644 --- a/metadata-ingestion/docs/sources/dbt/dbt.md +++ b/metadata-ingestion/docs/sources/dbt/dbt.md @@ -78,6 +78,7 @@ Note: 1. The dbt `meta_mapping` config works at the model level, while the `column_meta_mapping` config works at the column level. The `add_owner` operation is not supported at the column level. 2. For string meta properties we support regex matching. +3. **List support**: YAML lists are now supported in meta properties. Each item in the list that matches the regex pattern will be processed. With regex matching, you can also use the matched value to customize how you populate the tag, term or owner fields. Here are a few advanced examples: @@ -175,6 +176,29 @@ meta_mapping: In the examples above, we show two ways of writing the matching regexes. In the first one, `^@(.*)` the first matching group (a.k.a. match.group(1)) is automatically inferred. In the second example, `^@(?P<owner>(.*))`, we use a named matching group (called owner, since we are matching an owner) to capture the string we want to provide to the ownership urn. +#### Working with Lists + +YAML lists are fully supported in dbt meta properties. Each item in the list is evaluated against the match pattern, and only matching items are processed. 
+ +```yaml +meta: + owners: + - alice@company.com + - bob@company.com + - contractor@external.com +``` + +```yaml +meta_mapping: + owners: + match: ".*@company.com" + operation: "add_owner" + config: + owner_type: user +``` + +This will add `alice@company.com` and `bob@company.com` as owners (matching `.*@company.com`) but skip `contractor@external.com` (doesn't match the pattern). + ### dbt query_tag automated mappings This works similarly as the dbt meta mapping but for the query tags diff --git a/metadata-ingestion/src/datahub/utilities/mapping.py b/metadata-ingestion/src/datahub/utilities/mapping.py index 4772730c90..7245c2e7ca 100644 --- a/metadata-ingestion/src/datahub/utilities/mapping.py +++ b/metadata-ingestion/src/datahub/utilities/mapping.py @@ -83,7 +83,7 @@ class Constants: MATCH = "match" USER_OWNER = "user" GROUP_OWNER = "group" - OPERAND_DATATYPE_SUPPORTED = [int, bool, str, float] + OPERAND_DATATYPE_SUPPORTED = [int, bool, str, float, list] TAG_PARTITION_KEY = "PARTITION_KEY" TAG_DIST_KEY = "DIST_KEY" TAG_SORT_KEY = "SORT_KEY" @@ -455,7 +455,34 @@ class OperationProcessor: # function to check if a match clause is satisfied to a value. 
if not any( isinstance(raw_props_value, t) for t in Constants.OPERAND_DATATYPE_SUPPORTED - ) or not isinstance(raw_props_value, type(match_clause)): + ): + return None + + # Handle list values by checking if any item in the list matches + if isinstance(raw_props_value, list): + # For lists, we need to find at least one matching item + # Return a match with the concatenated values of all matching items + matching_items = [] + for item in raw_props_value: + if isinstance(item, str): + match = re.match(match_clause, item) + if match: + matching_items.append(item) + elif isinstance(match_clause, type(item)): + match = re.match(str(match_clause), str(item)) + if match: + matching_items.append(str(item)) + + if matching_items: + # Create a synthetic match object with all matching items joined + combined_value = ",".join(matching_items) + return re.match( + ".*", combined_value + ) # Always matches, returns combined value + return None + + # Handle scalar values (existing logic) + elif not isinstance(raw_props_value, type(match_clause)): return None elif isinstance(raw_props_value, str): return re.match(match_clause, raw_props_value) diff --git a/metadata-ingestion/tests/unit/test_mapping.py b/metadata-ingestion/tests/unit/test_mapping.py index fff9ba81fb..0a934bed0a 100644 --- a/metadata-ingestion/tests/unit/test_mapping.py +++ b/metadata-ingestion/tests/unit/test_mapping.py @@ -452,3 +452,65 @@ def test_validate_ownership_type_non_urn_invalid(): # Non-urn input that is not valid should raise ValueError. 
with pytest.raises(ValueError): validate_ownership_type("invalid_type") + + +def test_operation_processor_list_values(): + """Test that list values are properly handled in operation definitions.""" + raw_props = { + "owners_list": [ + "owner1@company.com", + "owner2@company.com", + "owner3@company.com", + ], + "tags_list": ["tag1", "tag2", "tag3"], + "mixed_list": ["match1", "nomatch", "match2"], + } + + processor = OperationProcessor( + operation_defs={ + "owners_list": { + "match": ".*@company.com", + "operation": "add_owner", + "config": {"owner_type": "user"}, + }, + "tags_list": { + "match": "tag.*", + "operation": "add_tag", + "config": {"tag": "list_{{ $match }}"}, + }, + "mixed_list": { + "match": "match.*", + "operation": "add_term", + "config": {"term": "{{ $match }}"}, + }, + }, + strip_owner_email_id=True, + ) + + aspect_map = processor.process(raw_props) + + # Test owners from list + assert "add_owner" in aspect_map + ownership_aspect: OwnershipClass = aspect_map["add_owner"] + assert len(ownership_aspect.owners) == 3 + owner_urns = {owner.owner for owner in ownership_aspect.owners} + expected_owners = { + "urn:li:corpuser:owner1", + "urn:li:corpuser:owner2", + "urn:li:corpuser:owner3", + } + assert owner_urns == expected_owners + + # Test tags from list - note: tags use the match replacement but join with comma + assert "add_tag" in aspect_map + tag_aspect: GlobalTagsClass = aspect_map["add_tag"] + assert len(tag_aspect.tags) == 1 + # The matched values get joined with comma, and commas get URL-encoded in URNs + assert tag_aspect.tags[0].tag == "urn:li:tag:list_tag1%2Ctag2%2Ctag3" + + # Test terms from list - only matching items + assert "add_term" in aspect_map + term_aspect: GlossaryTermsClass = aspect_map["add_term"] + assert len(term_aspect.terms) == 1 + # The matched values get joined with comma - terms don't get URL-encoded + assert term_aspect.terms[0].urn == "urn:li:glossaryTerm:match1,match2"