feat(meta-mapping): support meta mappings of lists (#14306)

This commit is contained in:
Gabe Lyons 2025-08-01 12:26:45 -07:00 committed by GitHub
parent 8708ae0dd7
commit 3abeadde4e
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 115 additions and 2 deletions

View File

@ -78,6 +78,7 @@ Note:
1. The dbt `meta_mapping` config works at the model level, while the `column_meta_mapping` config works at the column level. The `add_owner` operation is not supported at the column level.
2. For string meta properties we support regex matching.
3. **List support**: YAML lists are now supported in meta properties. Each item in the list that matches the regex pattern will be processed.
With regex matching, you can also use the matched value to customize how you populate the tag, term or owner fields. Here are a few advanced examples:
@ -175,6 +176,29 @@ meta_mapping:
In the examples above, we show two ways of writing the matching regexes. In the first one, `^@(.*)`, the first matching group (a.k.a. `match.group(1)`) is automatically inferred. In the second example, `^@(?P<owner>(.*))`, we use a named matching group (called owner, since we are matching an owner) to capture the string we want to provide to the ownership urn.
#### Working with Lists
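YAML lists are fully supported in dbt meta properties. Each item in the list is evaluated against the match pattern, and items that do not match are skipped. The matching items are joined with commas into a single matched value: `add_owner` produces one owner per matching item, whereas a `{{ $match }}` substitution in `add_tag` or `add_term` receives the combined string (e.g. `tag1,tag2,tag3`) and produces a single tag or term.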
```yaml
meta:
owners:
- alice@company.com
- bob@company.com
- contractor@external.com
```
```yaml
meta_mapping:
owners:
match: ".*@company.com"
operation: "add_owner"
config:
owner_type: user
```
This will add `alice@company.com` and `bob@company.com` as owners (matching `.*@company.com`) but skip `contractor@external.com` (doesn't match the pattern).
### dbt query_tag automated mappings
This works similarly to the dbt meta mapping, but for query tags.

View File

@ -83,7 +83,7 @@ class Constants:
MATCH = "match"
USER_OWNER = "user"
GROUP_OWNER = "group"
OPERAND_DATATYPE_SUPPORTED = [int, bool, str, float]
OPERAND_DATATYPE_SUPPORTED = [int, bool, str, float, list]
TAG_PARTITION_KEY = "PARTITION_KEY"
TAG_DIST_KEY = "DIST_KEY"
TAG_SORT_KEY = "SORT_KEY"
@ -455,7 +455,34 @@ class OperationProcessor:
# function to check if a match clause is satisfied to a value.
if not any(
isinstance(raw_props_value, t) for t in Constants.OPERAND_DATATYPE_SUPPORTED
) or not isinstance(raw_props_value, type(match_clause)):
):
return None
# Handle list values by checking if any item in the list matches
if isinstance(raw_props_value, list):
# For lists, we need to find at least one matching item
# Return a match with the concatenated values of all matching items
matching_items = []
for item in raw_props_value:
if isinstance(item, str):
match = re.match(match_clause, item)
if match:
matching_items.append(item)
elif isinstance(match_clause, type(item)):
match = re.match(str(match_clause), str(item))
if match:
matching_items.append(str(item))
if matching_items:
# Create a synthetic match object with all matching items joined
combined_value = ",".join(matching_items)
return re.match(
".*", combined_value
) # Always matches, returns combined value
return None
# Handle scalar values (existing logic)
elif not isinstance(raw_props_value, type(match_clause)):
return None
elif isinstance(raw_props_value, str):
return re.match(match_clause, raw_props_value)

View File

@ -452,3 +452,65 @@ def test_validate_ownership_type_non_urn_invalid():
# Non-urn input that is not valid should raise ValueError.
with pytest.raises(ValueError):
validate_ownership_type("invalid_type")
def test_operation_processor_list_values():
    """Verify how OperationProcessor handles list-valued meta properties.

    Three list properties are mapped to three different operations:

    * ``owners_list`` -> ``add_owner``: expects one owner per matching item.
    * ``tags_list``   -> ``add_tag``: expects a single tag built from the
      comma-joined matching items.
    * ``mixed_list``  -> ``add_term``: expects a single term built only from
      the items that match the pattern.
    """
    owner_rule = {
        "match": ".*@company.com",
        "operation": "add_owner",
        "config": {"owner_type": "user"},
    }
    tag_rule = {
        "match": "tag.*",
        "operation": "add_tag",
        "config": {"tag": "list_{{ $match }}"},
    }
    term_rule = {
        "match": "match.*",
        "operation": "add_term",
        "config": {"term": "{{ $match }}"},
    }

    raw_props = {
        "owners_list": [
            "owner1@company.com",
            "owner2@company.com",
            "owner3@company.com",
        ],
        "tags_list": ["tag1", "tag2", "tag3"],
        # "nomatch" does not start with "match", so it must be filtered out.
        "mixed_list": ["match1", "nomatch", "match2"],
    }

    processor = OperationProcessor(
        operation_defs={
            "owners_list": owner_rule,
            "tags_list": tag_rule,
            "mixed_list": term_rule,
        },
        strip_owner_email_id=True,
    )
    aspect_map = processor.process(raw_props)

    # Owners: every list item matches, so each one becomes its own owner
    # (the e-mail domain is stripped because strip_owner_email_id=True).
    assert "add_owner" in aspect_map
    ownership_aspect: OwnershipClass = aspect_map["add_owner"]
    assert len(ownership_aspect.owners) == 3
    actual_owner_urns = {entry.owner for entry in ownership_aspect.owners}
    assert actual_owner_urns == {
        "urn:li:corpuser:owner1",
        "urn:li:corpuser:owner2",
        "urn:li:corpuser:owner3",
    }

    # Tags: the matched items are joined with commas into one value, and the
    # commas end up URL-encoded (%2C) in the resulting tag URN.
    assert "add_tag" in aspect_map
    tag_aspect: GlobalTagsClass = aspect_map["add_tag"]
    assert len(tag_aspect.tags) == 1
    assert tag_aspect.tags[0].tag == "urn:li:tag:list_tag1%2Ctag2%2Ctag3"

    # Terms: only the matching items are joined; the comma is kept verbatim
    # in the glossary term URN (no URL-encoding).
    assert "add_term" in aspect_map
    term_aspect: GlossaryTermsClass = aspect_map["add_term"]
    assert len(term_aspect.terms) == 1
    assert term_aspect.terms[0].urn == "urn:li:glossaryTerm:match1,match2"