mirror of
https://github.com/datahub-project/datahub.git
synced 2025-09-25 09:00:50 +00:00
feat(ingest) - update identity sources to add flags for masking sensitive work units (#4711)
This commit is contained in:
parent
c91d70f1ba
commit
15e90f6dd0
@ -112,27 +112,29 @@ For general pointers on writing and running a recipe, see our [main recipe guide
|
||||
|
||||
Note that a `.` is used to denote nested fields in the YAML configuration block.
|
||||
|
||||
| Field | Type | Required | Default | Description |
|
||||
|------------------------------------|--------|----------|-------------|-----------------------------------------------------------------------------------------------------------------|
|
||||
| `client_id` | string | ✅ | | Application ID. Found in your app registration on Azure AD Portal |
|
||||
| `tenant_id` | string | ✅ | | Directory ID. Found in your app registration on Azure AD Portal |
|
||||
| `client_secret` | string | ✅ | | Client secret. Found in your app registration on Azure AD Portal |
|
||||
| `redirect` | string | ✅ | | Redirect URI. Found in your app registration on Azure AD Portal |
|
||||
| `authority` | string | ✅ | | The [authority](https://docs.microsoft.com/en-us/azure/active-directory/develop/msal-client-application-configuration) is a URL that indicates a directory that MSAL can request tokens from. |
|
||||
| `token_url` | string | ✅ | | The token URL that acquires a token from Azure AD for authorizing requests. This source will only work with v1.0 endpoint. |
|
||||
| `graph_url` | string | ✅ | | [Microsoft Graph API endpoint](https://docs.microsoft.com/en-us/graph/use-the-api) |
|
||||
| `ingest_users` | bool | | `True` | Whether users should be ingested into DataHub. |
|
||||
| `ingest_groups` | bool | | `True` | Whether groups should be ingested into DataHub. |
|
||||
| `ingest_group_membership` | bool | | `True` | Whether group membership should be ingested into DataHub. ingest_groups must be True if this is True. |
|
||||
| `azure_ad_response_to_username_attr` | string | | `"userPrincipalName"` | Which Azure AD User Response attribute to use as input to DataHub username mapping. |
|
||||
| `azure_ad_response_to_username_regex` | string | | `"(.*)"` | A regex used to parse the DataHub username from the attribute specified in `azure_ad_response_to_username_attr`. |
|
||||
| `users_pattern.allow` | list of strings | | | List of regex patterns for users to include in ingestion. The regex is matched against the DataHub user name, i.e. the one resulting from applying `azure_ad_response_to_username_attr` and `azure_ad_response_to_username_regex`. |
|
||||
| `users_pattern.deny` | list of strings | | | As above, but for excluding users from ingestion. |
|
||||
| `azure_ad_response_to_groupname_attr` | string | | `"name"` | Which Azure AD Group Response attribute to use as input to DataHub group name mapping. |
|
||||
| `azure_ad_response_to_groupname_regex` | string | | `"(.*)"` | A regex used to parse the DataHub group name from the attribute specified in `azure_ad_response_to_groupname_attr`. |
|
||||
| `groups_pattern.allow` | list of strings | | | List of regex patterns for groups to include in ingestion. The regex is matched against the DataHub group name, i.e. the one resulting from applying `azure_ad_response_to_groupname_attr` and `azure_ad_response_to_groupname_regex`. |
|
||||
| `groups_pattern.deny` | list of strings | | | As above, but for excluding groups from ingestion. |
|
||||
| `ingest_groups_users` | bool | | `True` | This option is useful only when `ingest_users` is set to False and `ingest_group_membership` to True. As a result, only the users who belong to the selected groups will be ingested. |
|
||||
| Field | Type | Required | Default | Description |
|
||||
|----------------------------------------|-----------------|----------|-----------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
|
||||
| `client_id` | string | ✅ | | Application ID. Found in your app registration on Azure AD Portal |
|
||||
| `tenant_id` | string | ✅ | | Directory ID. Found in your app registration on Azure AD Portal |
|
||||
| `client_secret` | string | ✅ | | Client secret. Found in your app registration on Azure AD Portal |
|
||||
| `redirect` | string | ✅ | | Redirect URI. Found in your app registration on Azure AD Portal |
|
||||
| `authority` | string | ✅ | | The [authority](https://docs.microsoft.com/en-us/azure/active-directory/develop/msal-client-application-configuration) is a URL that indicates a directory that MSAL can request tokens from. |
|
||||
| `token_url` | string | ✅ | | The token URL that acquires a token from Azure AD for authorizing requests. This source will only work with v1.0 endpoint. |
|
||||
| `graph_url` | string | ✅ | | [Microsoft Graph API endpoint](https://docs.microsoft.com/en-us/graph/use-the-api) |
|
||||
| `ingest_users` | bool | | `True` | Whether users should be ingested into DataHub. |
|
||||
| `ingest_groups` | bool | | `True` | Whether groups should be ingested into DataHub. |
|
||||
| `ingest_group_membership` | bool | | `True` | Whether group membership should be ingested into DataHub. ingest_groups must be True if this is True. |
|
||||
| `azure_ad_response_to_username_attr` | string | | `"userPrincipalName"` | Which Azure AD User Response attribute to use as input to DataHub username mapping. |
|
||||
| `azure_ad_response_to_username_regex` | string | | `"(.*)"` | A regex used to parse the DataHub username from the attribute specified in `azure_ad_response_to_username_attr`. |
|
||||
| `users_pattern.allow` | list of strings | | | List of regex patterns for users to include in ingestion. The regex is matched against the DataHub user name, i.e. the one resulting from applying `azure_ad_response_to_username_attr` and `azure_ad_response_to_username_regex`. |
|
||||
| `users_pattern.deny` | list of strings | | | As above, but for excluding users from ingestion. |
|
||||
| `azure_ad_response_to_groupname_attr` | string | | `"name"` | Which Azure AD Group Response attribute to use as input to DataHub group name mapping. |
|
||||
| `azure_ad_response_to_groupname_regex` | string | | `"(.*)"` | A regex used to parse the DataHub group name from the attribute specified in `azure_ad_response_to_groupname_attr`. |
|
||||
| `groups_pattern.allow` | list of strings | | | List of regex patterns for groups to include in ingestion. The regex is matched against the DataHub group name, i.e. the one resulting from applying `azure_ad_response_to_groupname_attr` and `azure_ad_response_to_groupname_regex`. |
|
||||
| `groups_pattern.deny` | list of strings | | | As above, but for excluding groups from ingestion. |
|
||||
| `ingest_groups_users` | bool | | `True` | This option is useful only when `ingest_users` is set to False and `ingest_group_membership` to True. As a result, only the users who belong to the selected groups will be ingested. |
|
||||
| `mask_group_id` | bool | | `True` | Whether workunit IDs for groups should be masked. |
|
||||
| `mask_user_id` | bool | | `True` | Whether workunit IDs for users should be masked. |
|
||||
|
||||
## Questions
|
||||
|
||||
|
@ -106,8 +106,8 @@ Note that a `.` is used to denote nested fields in the YAML configuration block.
|
||||
|
||||
| Field | Type | Required | Default | Description |
|
||||
|------------------------------------|--------|----------|-------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
|
||||
| `okta_domain` | string | ✅ | | The location of your Okta Domain, without a protocol. Can be found in Okta Developer console. |
|
||||
| `okta_api_token` | string | ✅ | | An API token generated for the DataHub application inside your Okta Developer Console. |
|
||||
| `okta_domain` | string | ✅ | | The location of your Okta Domain, without a protocol. Can be found in Okta Developer console. |
|
||||
| `okta_api_token` | string | ✅ | | An API token generated for the DataHub application inside your Okta Developer Console. |
|
||||
| `ingest_users` | bool | | `True` | Whether users should be ingested into DataHub. |
|
||||
| `ingest_groups` | bool | | `True` | Whether groups should be ingested into DataHub. |
|
||||
| `ingest_group_membership` | bool | | `True` | Whether group membership should be ingested into DataHub. ingest_groups must be True if this is True. |
|
||||
@ -119,10 +119,12 @@ Note that a `.` is used to denote nested fields in the YAML configuration block.
|
||||
| `include_suspended_users` | bool | | `False` | Whether to ingest users in the SUSPENDED state from Okta. |
|
||||
| `page_size` | number | | `100` | The number of entities requested from Okta's REST APIs in one request. |
|
||||
| `delay_seconds` | number | | `0.01` | Number of seconds to wait between calls to Okta's REST APIs. (Okta rate limits). Defaults to 10ms. |
|
||||
| `okta_users_filter` | string | | `None` | Okta filter expression (not regex) for ingesting users. Only one of `okta_users_filter` and `okta_users_search` can be set. See the [Okta API docs](https://developer.okta.com/docs/reference/api/users/#list-users-with-a-filter) for more info. |
|
||||
| `okta_users_search` | string | | `None` | Okta search expression (not regex) for ingesting users. Only one of `okta_users_filter` and `okta_users_search` can be set. See the [Okta API docs](https://developer.okta.com/docs/reference/api/users/#list-users-with-search) for more info. |
|
||||
| `okta_groups_filter` | string | | `None` | Okta filter expression (not regex) for ingesting groups. Only one of `okta_groups_filter` and `okta_groups_search` can be set. See the [Okta API docs](https://developer.okta.com/docs/reference/api/groups/#filters) for more info. |
|
||||
| `okta_groups_search` | string | | `None` | Okta search expression (not regex) for ingesting groups. Only one of `okta_groups_filter` and `okta_groups_search` can be set. See the [Okta API docs](https://developer.okta.com/docs/reference/api/groups/#list-groups-with-search) for more info. |
|
||||
| `okta_users_filter` | string | | `None` | Okta filter expression (not regex) for ingesting users. Only one of `okta_users_filter` and `okta_users_search` can be set. See the [Okta API docs](https://developer.okta.com/docs/reference/api/users/#list-users-with-a-filter) for more info. |
|
||||
| `okta_users_search` | string | | `None` | Okta search expression (not regex) for ingesting users. Only one of `okta_users_filter` and `okta_users_search` can be set. See the [Okta API docs](https://developer.okta.com/docs/reference/api/users/#list-users-with-search) for more info. |
|
||||
| `okta_groups_filter` | string | | `None` | Okta filter expression (not regex) for ingesting groups. Only one of `okta_groups_filter` and `okta_groups_search` can be set. See the [Okta API docs](https://developer.okta.com/docs/reference/api/groups/#filters) for more info. |
|
||||
| `okta_groups_search` | string | | `None` | Okta search expression (not regex) for ingesting groups. Only one of `okta_groups_filter` and `okta_groups_search` can be set. See the [Okta API docs](https://developer.okta.com/docs/reference/api/groups/#list-groups-with-search) for more info. |
|
||||
| `mask_group_id` | bool | | `True` | Whether workunit IDs for groups should be masked. |
|
||||
| `mask_user_id` | bool | | `True` | Whether workunit IDs for users should be masked. |
|
||||
|
||||
## Compatibility
|
||||
|
||||
|
@ -63,6 +63,10 @@ class AzureADConfig(ConfigModel):
|
||||
# If enabled, report will contain names of filtered users and groups.
|
||||
filtered_tracking: bool = True
|
||||
|
||||
# Optional: Whether to mask sensitive information from workunit IDs. On by default.
|
||||
mask_group_id: bool = True
|
||||
mask_user_id: bool = True
|
||||
|
||||
|
||||
@dataclass
|
||||
class AzureADSourceReport(SourceReport):
|
||||
@ -134,11 +138,18 @@ class AzureADSource(Source):
|
||||
datahub_corp_group_snapshots = self._map_azure_ad_groups(
|
||||
azure_ad_groups
|
||||
)
|
||||
for datahub_corp_group_snapshot in datahub_corp_group_snapshots:
|
||||
for group_count, datahub_corp_group_snapshot in enumerate(
|
||||
datahub_corp_group_snapshots
|
||||
):
|
||||
mce = MetadataChangeEvent(
|
||||
proposedSnapshot=datahub_corp_group_snapshot
|
||||
)
|
||||
wu = MetadataWorkUnit(id=datahub_corp_group_snapshot.urn, mce=mce)
|
||||
wu_id = (
|
||||
f"group-{group_count + 1}"
|
||||
if self.config.mask_group_id
|
||||
else datahub_corp_group_snapshot.urn
|
||||
)
|
||||
wu = MetadataWorkUnit(id=wu_id, mce=mce)
|
||||
self.report.report_workunit(wu)
|
||||
yield wu
|
||||
|
||||
@ -241,7 +252,9 @@ class AzureADSource(Source):
|
||||
datahub_corp_user_snapshots: Generator[CorpUserSnapshot, Any, None],
|
||||
datahub_corp_user_urn_to_group_membership: dict,
|
||||
) -> Generator[MetadataWorkUnit, Any, None]:
|
||||
for datahub_corp_user_snapshot in datahub_corp_user_snapshots:
|
||||
for user_count, datahub_corp_user_snapshot in enumerate(
|
||||
datahub_corp_user_snapshots
|
||||
):
|
||||
# Add GroupMembership if applicable
|
||||
if (
|
||||
datahub_corp_user_snapshot.urn
|
||||
@ -255,7 +268,12 @@ class AzureADSource(Source):
|
||||
assert datahub_group_membership
|
||||
datahub_corp_user_snapshot.aspects.append(datahub_group_membership)
|
||||
mce = MetadataChangeEvent(proposedSnapshot=datahub_corp_user_snapshot)
|
||||
wu = MetadataWorkUnit(id=datahub_corp_user_snapshot.urn, mce=mce)
|
||||
wu_id = (
|
||||
f"user-{user_count + 1}"
|
||||
if self.config.mask_user_id
|
||||
else datahub_corp_user_snapshot.urn
|
||||
)
|
||||
wu = MetadataWorkUnit(id=wu_id, mce=mce)
|
||||
self.report.report_workunit(wu)
|
||||
yield wu
|
||||
|
||||
|
@ -70,6 +70,10 @@ class OktaConfig(ConfigModel):
|
||||
okta_groups_filter: Optional[str] = None
|
||||
okta_groups_search: Optional[str] = None
|
||||
|
||||
# Optional: Whether to mask sensitive information from workunit IDs. On by default.
|
||||
mask_group_id: bool = True
|
||||
mask_user_id: bool = True
|
||||
|
||||
@validator("okta_users_search")
|
||||
def okta_users_one_of_filter_or_search(cls, v, values):
|
||||
if v and values["okta_users_filter"]:
|
||||
@ -130,9 +134,16 @@ class OktaSource(Source):
|
||||
if self.config.ingest_groups:
|
||||
okta_groups = list(self._get_okta_groups(event_loop))
|
||||
datahub_corp_group_snapshots = self._map_okta_groups(okta_groups)
|
||||
for datahub_corp_group_snapshot in datahub_corp_group_snapshots:
|
||||
for group_count, datahub_corp_group_snapshot in enumerate(
|
||||
datahub_corp_group_snapshots
|
||||
):
|
||||
mce = MetadataChangeEvent(proposedSnapshot=datahub_corp_group_snapshot)
|
||||
wu = MetadataWorkUnit(id=datahub_corp_group_snapshot.urn, mce=mce)
|
||||
wu_id = (
|
||||
f"group-{group_count + 1}"
|
||||
if self.config.mask_group_id
|
||||
else datahub_corp_group_snapshot.urn
|
||||
)
|
||||
wu = MetadataWorkUnit(id=wu_id, mce=mce)
|
||||
self.report.report_workunit(wu)
|
||||
yield wu
|
||||
|
||||
@ -183,7 +194,9 @@ class OktaSource(Source):
|
||||
okta_users = self._get_okta_users(event_loop)
|
||||
filtered_okta_users = filter(self._filter_okta_user, okta_users)
|
||||
datahub_corp_user_snapshots = self._map_okta_users(filtered_okta_users)
|
||||
for datahub_corp_user_snapshot in datahub_corp_user_snapshots:
|
||||
for user_count, datahub_corp_user_snapshot in enumerate(
|
||||
datahub_corp_user_snapshots
|
||||
):
|
||||
|
||||
# Add GroupMembership aspect populated in Step 2 if applicable.
|
||||
if (
|
||||
@ -198,7 +211,12 @@ class OktaSource(Source):
|
||||
assert datahub_group_membership is not None
|
||||
datahub_corp_user_snapshot.aspects.append(datahub_group_membership)
|
||||
mce = MetadataChangeEvent(proposedSnapshot=datahub_corp_user_snapshot)
|
||||
wu = MetadataWorkUnit(id=datahub_corp_user_snapshot.urn, mce=mce)
|
||||
wu_id = (
|
||||
f"user-{user_count + 1}"
|
||||
if self.config.mask_user_id
|
||||
else datahub_corp_user_snapshot.urn
|
||||
)
|
||||
wu = MetadataWorkUnit(id=wu_id, mce=mce)
|
||||
self.report.report_workunit(wu)
|
||||
yield wu
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user