mirror of
https://github.com/datahub-project/datahub.git
synced 2025-08-17 13:45:54 +00:00
feat(meta-mapping): support meta mappings of lists (#14306)
This commit is contained in:
parent
8708ae0dd7
commit
3abeadde4e
@ -78,6 +78,7 @@ Note:
|
|||||||
|
|
||||||
1. The dbt `meta_mapping` config works at the model level, while the `column_meta_mapping` config works at the column level. The `add_owner` operation is not supported at the column level.
|
1. The dbt `meta_mapping` config works at the model level, while the `column_meta_mapping` config works at the column level. The `add_owner` operation is not supported at the column level.
|
||||||
2. For string meta properties we support regex matching.
|
2. For string meta properties we support regex matching.
|
||||||
|
3. **List support**: YAML lists are now supported in meta properties. Each item in the list that matches the regex pattern will be processed.
|
||||||
|
|
||||||
With regex matching, you can also use the matched value to customize how you populate the tag, term or owner fields. Here are a few advanced examples:
|
With regex matching, you can also use the matched value to customize how you populate the tag, term or owner fields. Here are a few advanced examples:
|
||||||
|
|
||||||
@ -175,6 +176,29 @@ meta_mapping:
|
|||||||
|
|
||||||
In the examples above, we show two ways of writing the matching regexes. In the first one, `^@(.*)`, the first matching group (a.k.a. match.group(1)) is automatically inferred. In the second example, `^@(?P<owner>(.*))`, we use a named matching group (called owner, since we are matching an owner) to capture the string we want to provide to the ownership urn.
|
In the examples above, we show two ways of writing the matching regexes. In the first one, `^@(.*)`, the first matching group (a.k.a. match.group(1)) is automatically inferred. In the second example, `^@(?P<owner>(.*))`, we use a named matching group (called owner, since we are matching an owner) to capture the string we want to provide to the ownership urn.
|
||||||
|
|
||||||
|
#### Working with Lists
|
||||||
|
|
||||||
|
YAML lists are fully supported in dbt meta properties. Each item in the list is evaluated against the match pattern, and only matching items are processed.
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
meta:
|
||||||
|
owners:
|
||||||
|
- alice@company.com
|
||||||
|
- bob@company.com
|
||||||
|
- contractor@external.com
|
||||||
|
```
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
meta_mapping:
|
||||||
|
owners:
|
||||||
|
match: ".*@company.com"
|
||||||
|
operation: "add_owner"
|
||||||
|
config:
|
||||||
|
owner_type: user
|
||||||
|
```
|
||||||
|
|
||||||
|
This will add `alice@company.com` and `bob@company.com` as owners (matching `.*@company.com`) but skip `contractor@external.com` (doesn't match the pattern).
|
||||||
|
|
||||||
### dbt query_tag automated mappings
|
### dbt query_tag automated mappings
|
||||||
|
|
||||||
This works similarly to the dbt meta mapping, but for query tags.
|
This works similarly to the dbt meta mapping, but for query tags.
|
||||||
|
@ -83,7 +83,7 @@ class Constants:
|
|||||||
MATCH = "match"
|
MATCH = "match"
|
||||||
USER_OWNER = "user"
|
USER_OWNER = "user"
|
||||||
GROUP_OWNER = "group"
|
GROUP_OWNER = "group"
|
||||||
OPERAND_DATATYPE_SUPPORTED = [int, bool, str, float]
|
OPERAND_DATATYPE_SUPPORTED = [int, bool, str, float, list]
|
||||||
TAG_PARTITION_KEY = "PARTITION_KEY"
|
TAG_PARTITION_KEY = "PARTITION_KEY"
|
||||||
TAG_DIST_KEY = "DIST_KEY"
|
TAG_DIST_KEY = "DIST_KEY"
|
||||||
TAG_SORT_KEY = "SORT_KEY"
|
TAG_SORT_KEY = "SORT_KEY"
|
||||||
@ -455,7 +455,34 @@ class OperationProcessor:
|
|||||||
# function to check if a match clause is satisfied to a value.
|
# function to check if a match clause is satisfied to a value.
|
||||||
if not any(
|
if not any(
|
||||||
isinstance(raw_props_value, t) for t in Constants.OPERAND_DATATYPE_SUPPORTED
|
isinstance(raw_props_value, t) for t in Constants.OPERAND_DATATYPE_SUPPORTED
|
||||||
) or not isinstance(raw_props_value, type(match_clause)):
|
):
|
||||||
|
return None
|
||||||
|
|
||||||
|
# Handle list values by checking if any item in the list matches
|
||||||
|
if isinstance(raw_props_value, list):
|
||||||
|
# For lists, we need to find at least one matching item
|
||||||
|
# Return a match with the concatenated values of all matching items
|
||||||
|
matching_items = []
|
||||||
|
for item in raw_props_value:
|
||||||
|
if isinstance(item, str):
|
||||||
|
match = re.match(match_clause, item)
|
||||||
|
if match:
|
||||||
|
matching_items.append(item)
|
||||||
|
elif isinstance(match_clause, type(item)):
|
||||||
|
match = re.match(str(match_clause), str(item))
|
||||||
|
if match:
|
||||||
|
matching_items.append(str(item))
|
||||||
|
|
||||||
|
if matching_items:
|
||||||
|
# Create a synthetic match object with all matching items joined
|
||||||
|
combined_value = ",".join(matching_items)
|
||||||
|
return re.match(
|
||||||
|
".*", combined_value
|
||||||
|
) # Always matches, returns combined value
|
||||||
|
return None
|
||||||
|
|
||||||
|
# Handle scalar values (existing logic)
|
||||||
|
elif not isinstance(raw_props_value, type(match_clause)):
|
||||||
return None
|
return None
|
||||||
elif isinstance(raw_props_value, str):
|
elif isinstance(raw_props_value, str):
|
||||||
return re.match(match_clause, raw_props_value)
|
return re.match(match_clause, raw_props_value)
|
||||||
|
@ -452,3 +452,65 @@ def test_validate_ownership_type_non_urn_invalid():
|
|||||||
# Non-urn input that is not valid should raise ValueError.
|
# Non-urn input that is not valid should raise ValueError.
|
||||||
with pytest.raises(ValueError):
|
with pytest.raises(ValueError):
|
||||||
validate_ownership_type("invalid_type")
|
validate_ownership_type("invalid_type")
|
||||||
|
|
||||||
|
|
||||||
|
def test_operation_processor_list_values():
    """Check how list-valued properties flow through OperationProcessor.

    Three list properties are mapped to an owner, a tag and a term operation
    respectively; the assertions pin the aspect each operation produces.
    """
    list_props = {
        "owners_list": [
            "owner1@company.com",
            "owner2@company.com",
            "owner3@company.com",
        ],
        "tags_list": ["tag1", "tag2", "tag3"],
        "mixed_list": ["match1", "nomatch", "match2"],
    }

    # One rule per property, declared separately so each can be read on its own.
    owner_rule = {
        "match": ".*@company.com",
        "operation": "add_owner",
        "config": {"owner_type": "user"},
    }
    tag_rule = {
        "match": "tag.*",
        "operation": "add_tag",
        "config": {"tag": "list_{{ $match }}"},
    }
    term_rule = {
        "match": "match.*",
        "operation": "add_term",
        "config": {"term": "{{ $match }}"},
    }

    aspects = OperationProcessor(
        operation_defs={
            "owners_list": owner_rule,
            "tags_list": tag_rule,
            "mixed_list": term_rule,
        },
        strip_owner_email_id=True,
    ).process(list_props)

    # Owners: all three list entries match and each one yields its own owner.
    assert "add_owner" in aspects
    ownership: OwnershipClass = aspects["add_owner"]
    assert len(ownership.owners) == 3
    assert {entry.owner for entry in ownership.owners} == {
        "urn:li:corpuser:owner1",
        "urn:li:corpuser:owner2",
        "urn:li:corpuser:owner3",
    }

    # Tags: the matching items are joined with commas into a single tag, and
    # the commas show up URL-encoded (%2C) in the tag URN.
    assert "add_tag" in aspects
    global_tags: GlobalTagsClass = aspects["add_tag"]
    assert len(global_tags.tags) == 1
    assert global_tags.tags[0].tag == "urn:li:tag:list_tag1%2Ctag2%2Ctag3"

    # Terms: only the matching items ("nomatch" is dropped) are joined with
    # commas; the term URN keeps the comma unencoded.
    assert "add_term" in aspects
    glossary_terms: GlossaryTermsClass = aspects["add_term"]
    assert len(glossary_terms.terms) == 1
    assert glossary_terms.terms[0].urn == "urn:li:glossaryTerm:match1,match2"
|
||||||
|
Loading…
x
Reference in New Issue
Block a user