mirror of
https://github.com/datahub-project/datahub.git
synced 2025-08-17 13:45:54 +00:00
feat(meta-mappping): support meta mappings of lists (#14306)
This commit is contained in:
parent
8708ae0dd7
commit
3abeadde4e
@ -78,6 +78,7 @@ Note:
|
||||
|
||||
1. The dbt `meta_mapping` config works at the model level, while the `column_meta_mapping` config works at the column level. The `add_owner` operation is not supported at the column level.
|
||||
2. For string meta properties we support regex matching.
|
||||
3. **List support**: YAML lists are now supported in meta properties. Each item in the list that matches the regex pattern will be processed.
|
||||
|
||||
With regex matching, you can also use the matched value to customize how you populate the tag, term or owner fields. Here are a few advanced examples:
|
||||
|
||||
@ -175,6 +176,29 @@ meta_mapping:
|
||||
|
||||
In the examples above, we show two ways of writing the matching regexes. In the first one, `^@(.*)`, the first matching group (a.k.a. `match.group(1)`) is automatically inferred. In the second example, `^@(?P<owner>(.*))`, we use a named matching group (called `owner`, since we are matching an owner) to capture the string we want to provide to the ownership urn.
|
||||
|
||||
#### Working with Lists
|
||||
|
||||
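YAML lists are supported in dbt meta properties. Each item in the list is evaluated against the match pattern, and only matching items are processed. When the operation config references `{{ $match }}`, the matching items are joined with commas into a single value before substitution.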
|
||||
|
||||
```yaml
|
||||
meta:
|
||||
owners:
|
||||
- alice@company.com
|
||||
- bob@company.com
|
||||
- contractor@external.com
|
||||
```
|
||||
|
||||
```yaml
|
||||
meta_mapping:
|
||||
owners:
|
||||
match: ".*@company.com"
|
||||
operation: "add_owner"
|
||||
config:
|
||||
owner_type: user
|
||||
```
|
||||
|
||||
This will add `alice@company.com` and `bob@company.com` as owners (matching `.*@company.com`) but skip `contractor@external.com` (doesn't match the pattern).
|
||||
|
||||
### dbt query_tag automated mappings
|
||||
|
||||
This works similarly to the dbt meta mapping, but for query tags.
|
||||
|
@ -83,7 +83,7 @@ class Constants:
|
||||
MATCH = "match"
|
||||
USER_OWNER = "user"
|
||||
GROUP_OWNER = "group"
|
||||
OPERAND_DATATYPE_SUPPORTED = [int, bool, str, float]
|
||||
OPERAND_DATATYPE_SUPPORTED = [int, bool, str, float, list]
|
||||
TAG_PARTITION_KEY = "PARTITION_KEY"
|
||||
TAG_DIST_KEY = "DIST_KEY"
|
||||
TAG_SORT_KEY = "SORT_KEY"
|
||||
@ -455,7 +455,34 @@ class OperationProcessor:
|
||||
# function to check if a match clause is satisfied to a value.
|
||||
if not any(
|
||||
isinstance(raw_props_value, t) for t in Constants.OPERAND_DATATYPE_SUPPORTED
|
||||
) or not isinstance(raw_props_value, type(match_clause)):
|
||||
):
|
||||
return None
|
||||
|
||||
# Handle list values by checking if any item in the list matches
|
||||
if isinstance(raw_props_value, list):
|
||||
# For lists, we need to find at least one matching item
|
||||
# Return a match with the concatenated values of all matching items
|
||||
matching_items = []
|
||||
for item in raw_props_value:
|
||||
if isinstance(item, str):
|
||||
match = re.match(match_clause, item)
|
||||
if match:
|
||||
matching_items.append(item)
|
||||
elif isinstance(match_clause, type(item)):
|
||||
match = re.match(str(match_clause), str(item))
|
||||
if match:
|
||||
matching_items.append(str(item))
|
||||
|
||||
if matching_items:
|
||||
# Create a synthetic match object with all matching items joined
|
||||
combined_value = ",".join(matching_items)
|
||||
return re.match(
|
||||
".*", combined_value
|
||||
) # Always matches, returns combined value
|
||||
return None
|
||||
|
||||
# Handle scalar values (existing logic)
|
||||
elif not isinstance(raw_props_value, type(match_clause)):
|
||||
return None
|
||||
elif isinstance(raw_props_value, str):
|
||||
return re.match(match_clause, raw_props_value)
|
||||
|
@ -452,3 +452,65 @@ def test_validate_ownership_type_non_urn_invalid():
|
||||
# Non-urn input that is not valid should raise ValueError.
|
||||
with pytest.raises(ValueError):
|
||||
validate_ownership_type("invalid_type")
|
||||
|
||||
|
||||
def test_operation_processor_list_values():
    """Exercise ``OperationProcessor`` with list-valued properties.

    Three shapes of input are covered: a list in which every item matches
    (owners), a list whose matches feed a ``{{ $match }}`` template (tags),
    and a list in which only some items match (terms).
    """
    operation_defs = {
        "owners_list": {
            "match": ".*@company.com",
            "operation": "add_owner",
            "config": {"owner_type": "user"},
        },
        "tags_list": {
            "match": "tag.*",
            "operation": "add_tag",
            "config": {"tag": "list_{{ $match }}"},
        },
        "mixed_list": {
            "match": "match.*",
            "operation": "add_term",
            "config": {"term": "{{ $match }}"},
        },
    }
    processor = OperationProcessor(
        operation_defs=operation_defs,
        strip_owner_email_id=True,
    )

    aspect_map = processor.process(
        {
            "owners_list": [
                "owner1@company.com",
                "owner2@company.com",
                "owner3@company.com",
            ],
            "tags_list": ["tag1", "tag2", "tag3"],
            "mixed_list": ["match1", "nomatch", "match2"],
        }
    )

    # Owners: each list entry becomes its own owner, with the email domain
    # stripped (strip_owner_email_id=True).
    assert "add_owner" in aspect_map
    ownership: OwnershipClass = aspect_map["add_owner"]
    assert len(ownership.owners) == 3
    assert {entry.owner for entry in ownership.owners} == {
        "urn:li:corpuser:owner1",
        "urn:li:corpuser:owner2",
        "urn:li:corpuser:owner3",
    }

    # Tags: the matched items are comma-joined into one value, and the commas
    # show up URL-encoded (%2C) in the resulting tag URN.
    assert "add_tag" in aspect_map
    global_tags: GlobalTagsClass = aspect_map["add_tag"]
    assert len(global_tags.tags) == 1
    assert global_tags.tags[0].tag == "urn:li:tag:list_tag1%2Ctag2%2Ctag3"

    # Terms: only the matching items are kept; they are comma-joined and the
    # commas are not URL-encoded in the term URN.
    assert "add_term" in aspect_map
    glossary_terms: GlossaryTermsClass = aspect_map["add_term"]
    assert len(glossary_terms.terms) == 1
    assert glossary_terms.terms[0].urn == "urn:li:glossaryTerm:match1,match2"
|
||||
|
Loading…
x
Reference in New Issue
Block a user