diff --git a/metadata-ingestion/source_docs/azure-ad.md b/metadata-ingestion/source_docs/azure-ad.md index 49df160dd7..5a8287ce8d 100644 --- a/metadata-ingestion/source_docs/azure-ad.md +++ b/metadata-ingestion/source_docs/azure-ad.md @@ -112,27 +112,29 @@ For general pointers on writing and running a recipe, see our [main recipe guide Note that a `.` is used to denote nested fields in the YAML configuration block. -| Field | Type | Required | Default | Description | -|------------------------------------|--------|----------|-------------|-----------------------------------------------------------------------------------------------------------------| -| `client_id` | string | ✅ | | Application ID. Found in your app registration on Azure AD Portal | -| `tenant_id` | string | ✅ | | Directory ID. Found in your app registration on Azure AD Portal | -| `client_secret` | string | ✅ | | Client secret. Found in your app registration on Azure AD Portal | -| `redirect` | string | ✅ | | Redirect URI. Found in your app registration on Azure AD Portal | -| `authority` | string | ✅ | | The [authority](https://docs.microsoft.com/en-us/azure/active-directory/develop/msal-client-application-configuration) is a URL that indicates a directory that MSAL can request tokens from. | -| `token_url` | string | ✅ | | The token URL that acquires a token from Azure AD for authorizing requests. This source will only work with v1.0 endpoint. | -| `graph_url` | string | ✅ | | [Microsoft Graph API endpoint](https://docs.microsoft.com/en-us/graph/use-the-api) -| `ingest_users` | bool | | `True` | Whether users should be ingested into DataHub. | -| `ingest_groups` | bool | | `True` | Whether groups should be ingested into DataHub. | -| `ingest_group_membership` | bool | | `True` | Whether group membership should be ingested into DataHub. ingest_groups must be True if this is True. 
| -| `azure_ad_response_to_username_attr` | string | | `"userPrincipalName"` | Which Azure AD User Response attribute to use as input to DataHub username mapping. | -| `azure_ad_response_to_username_regex` | string | | `"(.*)"` | A regex used to parse the DataHub username from the attribute specified in `azure_ad_response_to_username_attr`. | -| `users_pattern.allow` | list of strings | | | List of regex patterns for users to include in ingestion. The name against which compare the regexp is the DataHub user name, i.e. the one resulting from the action of `azure_ad_response_to_username_attr` and `azure_ad_response_to_username_regex` | -| `users_pattern.deny` | list of strings | | | As above, but for excluding users from ingestion. | -| `azure_ad_response_to_groupname_attr` | string | | `"name"` | Which Azure AD Group Response attribute to use as input to DataHub group name mapping. | -| `azure_ad_response_to_groupname_regex` | string | | `"(.*)"` | A regex used to parse the DataHub group name from the attribute specified in `azure_ad_response_to_groupname_attr`. | -| `groups_pattern.allow` | list of strings | | | List of regex patterns for groups to include in ingestion. The name against which compare the regexp is the DataHub group name, i.e. the one resulting from the action of `azure_ad_response_to_groupname_attr` and `azure_ad_response_to_groupname_regex` | -| `groups_pattern.deny` | list of strings | | | As above, but for exculing groups from ingestion. | -| `ingest_groups_users` | bool | | `True` | This option is useful only when `ingest_users` is set to False and `ingest_group_membership` to True. As effect, only the users which belongs to the selected groups will be ingested. 
| +| Field | Type | Required | Default | Description | +|----------------------------------------|-----------------|----------|-----------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `client_id` | string | ✅ | | Application ID. Found in your app registration on Azure AD Portal | +| `tenant_id` | string | ✅ | | Directory ID. Found in your app registration on Azure AD Portal | +| `client_secret` | string | ✅ | | Client secret. Found in your app registration on Azure AD Portal | +| `redirect` | string | ✅ | | Redirect URI. Found in your app registration on Azure AD Portal | +| `authority` | string | ✅ | | The [authority](https://docs.microsoft.com/en-us/azure/active-directory/develop/msal-client-application-configuration) is a URL that indicates a directory that MSAL can request tokens from. | +| `token_url` | string | ✅ | | The token URL that acquires a token from Azure AD for authorizing requests. This source will only work with v1.0 endpoint. | +| `graph_url` | string | ✅ | | [Microsoft Graph API endpoint](https://docs.microsoft.com/en-us/graph/use-the-api) | +| `ingest_users` | bool | | `True` | Whether users should be ingested into DataHub. | +| `ingest_groups` | bool | | `True` | Whether groups should be ingested into DataHub. | +| `ingest_group_membership` | bool | | `True` | Whether group membership should be ingested into DataHub. ingest_groups must be True if this is True. | +| `azure_ad_response_to_username_attr` | string | | `"userPrincipalName"` | Which Azure AD User Response attribute to use as input to DataHub username mapping. | +| `azure_ad_response_to_username_regex` | string | | `"(.*)"` | A regex used to parse the DataHub username from the attribute specified in `azure_ad_response_to_username_attr`. 
| +| `users_pattern.allow` | list of strings | | | List of regex patterns for users to include in ingestion. The name the regex is compared against is the DataHub user name, i.e. the one resulting from the action of `azure_ad_response_to_username_attr` and `azure_ad_response_to_username_regex` | +| `users_pattern.deny` | list of strings | | | As above, but for excluding users from ingestion. | +| `azure_ad_response_to_groupname_attr` | string | | `"name"` | Which Azure AD Group Response attribute to use as input to DataHub group name mapping. | +| `azure_ad_response_to_groupname_regex` | string | | `"(.*)"` | A regex used to parse the DataHub group name from the attribute specified in `azure_ad_response_to_groupname_attr`. | +| `groups_pattern.allow` | list of strings | | | List of regex patterns for groups to include in ingestion. The name the regex is compared against is the DataHub group name, i.e. the one resulting from the action of `azure_ad_response_to_groupname_attr` and `azure_ad_response_to_groupname_regex` | +| `groups_pattern.deny` | list of strings | | | As above, but for excluding groups from ingestion. | +| `ingest_groups_users` | bool | | `True` | This option is useful only when `ingest_users` is set to False and `ingest_group_membership` to True. As a result, only the users who belong to the selected groups will be ingested. | +| `mask_group_id` | bool | | `True` | Whether workunit ID's for groups should be masked. | +| `mask_user_id` | bool | | `True` | Whether workunit ID's for users should be masked. | ## Questions diff --git a/metadata-ingestion/source_docs/okta.md b/metadata-ingestion/source_docs/okta.md index 58b6d8409b..8cd5037fc8 100644 --- a/metadata-ingestion/source_docs/okta.md +++ b/metadata-ingestion/source_docs/okta.md @@ -106,8 +106,8 @@ Note that a `.` is used to denote nested fields in the YAML configuration block. 
| Field | Type | Required | Default | Description | |------------------------------------|--------|----------|-------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| `okta_domain` | string | ✅ | | The location of your Okta Domain, without a protocol. Can be found in Okta Developer console. | -| `okta_api_token` | string | ✅ | | An API token generated for the DataHub application inside your Okta Developer Console. | +| `okta_domain` | string | ✅ | | The location of your Okta Domain, without a protocol. Can be found in Okta Developer console. | +| `okta_api_token` | string | ✅ | | An API token generated for the DataHub application inside your Okta Developer Console. | | `ingest_users` | bool | | `True` | Whether users should be ingested into DataHub. | | `ingest_groups` | bool | | `True` | Whether groups should be ingested into DataHub. | | `ingest_group_membership` | bool | | `True` | Whether group membership should be ingested into DataHub. ingest_groups must be True if this is True. | @@ -119,10 +119,12 @@ Note that a `.` is used to denote nested fields in the YAML configuration block. | `include_suspended_users` | bool | | `False` | Whether to ingest users in the SUSPENDED state from Okta. | | `page_size` | number | | `100` | The number of entities requested from Okta's REST APIs in one request. | | `delay_seconds` | number | | `0.01` | Number of seconds to wait between calls to Okta's REST APIs. (Okta rate limits). Defaults to 10ms. | -| `okta_users_filter` | string | | `None` | Okta filter expression (not regex) for ingesting users. Only one of `okta_users_filter` and `okta_users_search` can be set. See the [Okta API docs](https://developer.okta.com/docs/reference/api/users/#list-users-with-a-filter) for more info. 
| -| `okta_users_search` | string | | `None` | Okta search expression (not regex) for ingesting users. Only one of `okta_users_filter` and `okta_users_search` can be set. See the [Okta API docs](https://developer.okta.com/docs/reference/api/users/#list-users-with-search) for more info. | -| `okta_groups_filter` | string | | `None` | Okta filter expression (not regex) for ingesting groups. Only one of `okta_groups_filter` and `okta_groups_search` can be set. See the [Okta API docs](https://developer.okta.com/docs/reference/api/groups/#filters) for more info. | -| `okta_users_search` | string | | `None` | Okta search expression (not regex) for ingesting groups. Only one of `okta_groups_filter` and `okta_groups_search` can be set. See the [Okta API docs](https://developer.okta.com/docs/reference/api/groups/#list-groups-with-search) for more info. | +| `okta_users_filter` | string | | `None` | Okta filter expression (not regex) for ingesting users. Only one of `okta_users_filter` and `okta_users_search` can be set. See the [Okta API docs](https://developer.okta.com/docs/reference/api/users/#list-users-with-a-filter) for more info. | +| `okta_users_search` | string | | `None` | Okta search expression (not regex) for ingesting users. Only one of `okta_users_filter` and `okta_users_search` can be set. See the [Okta API docs](https://developer.okta.com/docs/reference/api/users/#list-users-with-search) for more info. | +| `okta_groups_filter` | string | | `None` | Okta filter expression (not regex) for ingesting groups. Only one of `okta_groups_filter` and `okta_groups_search` can be set. See the [Okta API docs](https://developer.okta.com/docs/reference/api/groups/#filters) for more info. | +| `okta_groups_search` | string | | `None` | Okta search expression (not regex) for ingesting groups. Only one of `okta_groups_filter` and `okta_groups_search` can be set. 
See the [Okta API docs](https://developer.okta.com/docs/reference/api/groups/#list-groups-with-search) for more info. | +| `mask_group_id` | bool | | `True` | Whether workunit ID's for groups should be masked. | +| `mask_user_id` | bool | | `True` | Whether workunit ID's for users should be masked. | ## Compatibility diff --git a/metadata-ingestion/src/datahub/ingestion/source/identity/azure_ad.py b/metadata-ingestion/src/datahub/ingestion/source/identity/azure_ad.py index 6413ed3497..77e4bce6ae 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/identity/azure_ad.py +++ b/metadata-ingestion/src/datahub/ingestion/source/identity/azure_ad.py @@ -63,6 +63,10 @@ class AzureADConfig(ConfigModel): # If enabled, report will contain names of filtered users and groups. filtered_tracking: bool = True + # Optional: Whether to mask sensitive information from workunit ID's. On by default. + mask_group_id: bool = True + mask_user_id: bool = True + @dataclass class AzureADSourceReport(SourceReport): @@ -134,11 +138,18 @@ class AzureADSource(Source): datahub_corp_group_snapshots = self._map_azure_ad_groups( azure_ad_groups ) - for datahub_corp_group_snapshot in datahub_corp_group_snapshots: + for group_count, datahub_corp_group_snapshot in enumerate( + datahub_corp_group_snapshots + ): mce = MetadataChangeEvent( proposedSnapshot=datahub_corp_group_snapshot ) - wu = MetadataWorkUnit(id=datahub_corp_group_snapshot.urn, mce=mce) + wu_id = ( + f"group-{group_count + 1}" + if self.config.mask_group_id + else datahub_corp_group_snapshot.urn + ) + wu = MetadataWorkUnit(id=wu_id, mce=mce) self.report.report_workunit(wu) yield wu @@ -241,7 +252,9 @@ class AzureADSource(Source): datahub_corp_user_snapshots: Generator[CorpUserSnapshot, Any, None], datahub_corp_user_urn_to_group_membership: dict, ) -> Generator[MetadataWorkUnit, Any, None]: - for datahub_corp_user_snapshot in datahub_corp_user_snapshots: + for user_count, datahub_corp_user_snapshot in enumerate( + 
datahub_corp_user_snapshots + ): # Add GroupMembership if applicable if ( datahub_corp_user_snapshot.urn @@ -255,7 +268,12 @@ class AzureADSource(Source): assert datahub_group_membership datahub_corp_user_snapshot.aspects.append(datahub_group_membership) mce = MetadataChangeEvent(proposedSnapshot=datahub_corp_user_snapshot) - wu = MetadataWorkUnit(id=datahub_corp_user_snapshot.urn, mce=mce) + wu_id = ( + f"user-{user_count + 1}" + if self.config.mask_user_id + else datahub_corp_user_snapshot.urn + ) + wu = MetadataWorkUnit(id=wu_id, mce=mce) self.report.report_workunit(wu) yield wu diff --git a/metadata-ingestion/src/datahub/ingestion/source/identity/okta.py b/metadata-ingestion/src/datahub/ingestion/source/identity/okta.py index 4453194d0e..6c3b28b2ec 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/identity/okta.py +++ b/metadata-ingestion/src/datahub/ingestion/source/identity/okta.py @@ -70,6 +70,10 @@ class OktaConfig(ConfigModel): okta_groups_filter: Optional[str] = None okta_groups_search: Optional[str] = None + # Optional: Whether to mask sensitive information from workunit ID's. On by default. 
+ mask_group_id: bool = True + mask_user_id: bool = True + @validator("okta_users_search") def okta_users_one_of_filter_or_search(cls, v, values): if v and values["okta_users_filter"]: @@ -130,9 +134,16 @@ class OktaSource(Source): if self.config.ingest_groups: okta_groups = list(self._get_okta_groups(event_loop)) datahub_corp_group_snapshots = self._map_okta_groups(okta_groups) - for datahub_corp_group_snapshot in datahub_corp_group_snapshots: + for group_count, datahub_corp_group_snapshot in enumerate( + datahub_corp_group_snapshots + ): mce = MetadataChangeEvent(proposedSnapshot=datahub_corp_group_snapshot) - wu = MetadataWorkUnit(id=datahub_corp_group_snapshot.urn, mce=mce) + wu_id = ( + f"group-{group_count + 1}" + if self.config.mask_group_id + else datahub_corp_group_snapshot.urn + ) + wu = MetadataWorkUnit(id=wu_id, mce=mce) self.report.report_workunit(wu) yield wu @@ -183,7 +194,9 @@ class OktaSource(Source): okta_users = self._get_okta_users(event_loop) filtered_okta_users = filter(self._filter_okta_user, okta_users) datahub_corp_user_snapshots = self._map_okta_users(filtered_okta_users) - for datahub_corp_user_snapshot in datahub_corp_user_snapshots: + for user_count, datahub_corp_user_snapshot in enumerate( + datahub_corp_user_snapshots + ): # Add GroupMembership aspect populated in Step 2 if applicable. if ( @@ -198,7 +211,12 @@ class OktaSource(Source): assert datahub_group_membership is not None datahub_corp_user_snapshot.aspects.append(datahub_group_membership) mce = MetadataChangeEvent(proposedSnapshot=datahub_corp_user_snapshot) - wu = MetadataWorkUnit(id=datahub_corp_user_snapshot.urn, mce=mce) + wu_id = ( + f"user-{user_count + 1}" + if self.config.mask_user_id + else datahub_corp_user_snapshot.urn + ) + wu = MetadataWorkUnit(id=wu_id, mce=mce) self.report.report_workunit(wu) yield wu