feat(ingest) - update identity sources to add flags for masking sensitive work units (#4711)

This commit is contained in:
Aditya Radhakrishnan 2022-04-20 14:21:08 -07:00 committed by GitHub
parent c91d70f1ba
commit 15e90f6dd0
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 75 additions and 35 deletions

View File

@ -112,27 +112,29 @@ For general pointers on writing and running a recipe, see our [main recipe guide
Note that a `.` is used to denote nested fields in the YAML configuration block. Note that a `.` is used to denote nested fields in the YAML configuration block.
| Field | Type | Required | Default | Description | | Field | Type | Required | Default | Description |
|------------------------------------|--------|----------|-------------|-----------------------------------------------------------------------------------------------------------------| |----------------------------------------|-----------------|----------|-----------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| `client_id` | string | ✅ | | Application ID. Found in your app registration on Azure AD Portal | | `client_id` | string | ✅ | | Application ID. Found in your app registration on Azure AD Portal |
| `tenant_id` | string | ✅ | | Directory ID. Found in your app registration on Azure AD Portal | | `tenant_id` | string | ✅ | | Directory ID. Found in your app registration on Azure AD Portal |
| `client_secret` | string | ✅ | | Client secret. Found in your app registration on Azure AD Portal | | `client_secret` | string | ✅ | | Client secret. Found in your app registration on Azure AD Portal |
| `redirect` | string | ✅ | | Redirect URI. Found in your app registration on Azure AD Portal | | `redirect` | string | ✅ | | Redirect URI. Found in your app registration on Azure AD Portal |
| `authority` | string | ✅ | | The [authority](https://docs.microsoft.com/en-us/azure/active-directory/develop/msal-client-application-configuration) is a URL that indicates a directory that MSAL can request tokens from. | | `authority` | string | ✅ | | The [authority](https://docs.microsoft.com/en-us/azure/active-directory/develop/msal-client-application-configuration) is a URL that indicates a directory that MSAL can request tokens from. |
| `token_url` | string | ✅ | | The token URL that acquires a token from Azure AD for authorizing requests. This source will only work with v1.0 endpoint. | | `token_url` | string | ✅ | | The token URL that acquires a token from Azure AD for authorizing requests. This source will only work with v1.0 endpoint. |
| `graph_url` | string | ✅ | | [Microsoft Graph API endpoint](https://docs.microsoft.com/en-us/graph/use-the-api) | `graph_url` | string | ✅ | | [Microsoft Graph API endpoint](https://docs.microsoft.com/en-us/graph/use-the-api) |
| `ingest_users` | bool | | `True` | Whether users should be ingested into DataHub. | | `ingest_users` | bool | | `True` | Whether users should be ingested into DataHub. |
| `ingest_groups` | bool | | `True` | Whether groups should be ingested into DataHub. | | `ingest_groups` | bool | | `True` | Whether groups should be ingested into DataHub. |
| `ingest_group_membership` | bool | | `True` | Whether group membership should be ingested into DataHub. ingest_groups must be True if this is True. | | `ingest_group_membership` | bool | | `True` | Whether group membership should be ingested into DataHub. ingest_groups must be True if this is True. |
| `azure_ad_response_to_username_attr` | string | | `"userPrincipalName"` | Which Azure AD User Response attribute to use as input to DataHub username mapping. | | `azure_ad_response_to_username_attr` | string | | `"userPrincipalName"` | Which Azure AD User Response attribute to use as input to DataHub username mapping. |
| `azure_ad_response_to_username_regex` | string | | `"(.*)"` | A regex used to parse the DataHub username from the attribute specified in `azure_ad_response_to_username_attr`. | | `azure_ad_response_to_username_regex` | string | | `"(.*)"` | A regex used to parse the DataHub username from the attribute specified in `azure_ad_response_to_username_attr`. |
| `users_pattern.allow` | list of strings | | | List of regex patterns for users to include in ingestion. The name against which the regexp is compared is the DataHub user name, i.e. the one resulting from the action of `azure_ad_response_to_username_attr` and `azure_ad_response_to_username_regex` | | `users_pattern.allow` | list of strings | | | List of regex patterns for users to include in ingestion. The name against which the regexp is compared is the DataHub user name, i.e. the one resulting from the action of `azure_ad_response_to_username_attr` and `azure_ad_response_to_username_regex` |
| `users_pattern.deny` | list of strings | | | As above, but for excluding users from ingestion. | | `users_pattern.deny` | list of strings | | | As above, but for excluding users from ingestion. |
| `azure_ad_response_to_groupname_attr` | string | | `"name"` | Which Azure AD Group Response attribute to use as input to DataHub group name mapping. | | `azure_ad_response_to_groupname_attr` | string | | `"name"` | Which Azure AD Group Response attribute to use as input to DataHub group name mapping. |
| `azure_ad_response_to_groupname_regex` | string | | `"(.*)"` | A regex used to parse the DataHub group name from the attribute specified in `azure_ad_response_to_groupname_attr`. | | `azure_ad_response_to_groupname_regex` | string | | `"(.*)"` | A regex used to parse the DataHub group name from the attribute specified in `azure_ad_response_to_groupname_attr`. |
| `groups_pattern.allow` | list of strings | | | List of regex patterns for groups to include in ingestion. The name against which the regexp is compared is the DataHub group name, i.e. the one resulting from the action of `azure_ad_response_to_groupname_attr` and `azure_ad_response_to_groupname_regex` | | `groups_pattern.allow` | list of strings | | | List of regex patterns for groups to include in ingestion. The name against which the regexp is compared is the DataHub group name, i.e. the one resulting from the action of `azure_ad_response_to_groupname_attr` and `azure_ad_response_to_groupname_regex` |
| `groups_pattern.deny` | list of strings | | | As above, but for excluding groups from ingestion. | | `groups_pattern.deny` | list of strings | | | As above, but for excluding groups from ingestion. |
| `ingest_groups_users` | bool | | `True` | This option is useful only when `ingest_users` is set to False and `ingest_group_membership` to True. As a result, only the users who belong to the selected groups will be ingested. | | `ingest_groups_users` | bool | | `True` | This option is useful only when `ingest_users` is set to False and `ingest_group_membership` to True. As a result, only the users who belong to the selected groups will be ingested. |
| `mask_group_id` | bool | | `True` | Whether workunit IDs for groups should be masked. When `True`, each group workunit ID is a sequential `group-<n>` label instead of the group URN. |
| `mask_user_id` | bool | | `True` | Whether workunit IDs for users should be masked. When `True`, each user workunit ID is a sequential `user-<n>` label instead of the user URN. |
## Questions ## Questions

View File

@ -106,8 +106,8 @@ Note that a `.` is used to denote nested fields in the YAML configuration block.
| Field | Type | Required | Default | Description | | Field | Type | Required | Default | Description |
|------------------------------------|--------|----------|-------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| |------------------------------------|--------|----------|-------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| `okta_domain` | string | ✅ | | The location of your Okta Domain, without a protocol. Can be found in Okta Developer console. | | `okta_domain` | string | ✅ | | The location of your Okta Domain, without a protocol. Can be found in Okta Developer console. |
| `okta_api_token` | string | ✅ | | An API token generated for the DataHub application inside your Okta Developer Console. | | `okta_api_token` | string | ✅ | | An API token generated for the DataHub application inside your Okta Developer Console. |
| `ingest_users` | bool | | `True` | Whether users should be ingested into DataHub. | | `ingest_users` | bool | | `True` | Whether users should be ingested into DataHub. |
| `ingest_groups` | bool | | `True` | Whether groups should be ingested into DataHub. | | `ingest_groups` | bool | | `True` | Whether groups should be ingested into DataHub. |
| `ingest_group_membership` | bool | | `True` | Whether group membership should be ingested into DataHub. ingest_groups must be True if this is True. | | `ingest_group_membership` | bool | | `True` | Whether group membership should be ingested into DataHub. ingest_groups must be True if this is True. |
@ -119,10 +119,12 @@ Note that a `.` is used to denote nested fields in the YAML configuration block.
| `include_suspended_users` | bool | | `False` | Whether to ingest users in the SUSPENDED state from Okta. | | `include_suspended_users` | bool | | `False` | Whether to ingest users in the SUSPENDED state from Okta. |
| `page_size` | number | | `100` | The number of entities requested from Okta's REST APIs in one request. | | `page_size` | number | | `100` | The number of entities requested from Okta's REST APIs in one request. |
| `delay_seconds` | number | | `0.01` | Number of seconds to wait between calls to Okta's REST APIs. (Okta rate limits). Defaults to 10ms. | | `delay_seconds` | number | | `0.01` | Number of seconds to wait between calls to Okta's REST APIs. (Okta rate limits). Defaults to 10ms. |
| `okta_users_filter` | string | | `None` | Okta filter expression (not regex) for ingesting users. Only one of `okta_users_filter` and `okta_users_search` can be set. See the [Okta API docs](https://developer.okta.com/docs/reference/api/users/#list-users-with-a-filter) for more info. | | `okta_users_filter` | string | | `None` | Okta filter expression (not regex) for ingesting users. Only one of `okta_users_filter` and `okta_users_search` can be set. See the [Okta API docs](https://developer.okta.com/docs/reference/api/users/#list-users-with-a-filter) for more info. |
| `okta_users_search` | string | | `None` | Okta search expression (not regex) for ingesting users. Only one of `okta_users_filter` and `okta_users_search` can be set. See the [Okta API docs](https://developer.okta.com/docs/reference/api/users/#list-users-with-search) for more info. | | `okta_users_search` | string | | `None` | Okta search expression (not regex) for ingesting users. Only one of `okta_users_filter` and `okta_users_search` can be set. See the [Okta API docs](https://developer.okta.com/docs/reference/api/users/#list-users-with-search) for more info. |
| `okta_groups_filter` | string | | `None` | Okta filter expression (not regex) for ingesting groups. Only one of `okta_groups_filter` and `okta_groups_search` can be set. See the [Okta API docs](https://developer.okta.com/docs/reference/api/groups/#filters) for more info. | | `okta_groups_filter` | string | | `None` | Okta filter expression (not regex) for ingesting groups. Only one of `okta_groups_filter` and `okta_groups_search` can be set. See the [Okta API docs](https://developer.okta.com/docs/reference/api/groups/#filters) for more info. |
| `okta_groups_search` | string | | `None` | Okta search expression (not regex) for ingesting groups. Only one of `okta_groups_filter` and `okta_groups_search` can be set. See the [Okta API docs](https://developer.okta.com/docs/reference/api/groups/#list-groups-with-search) for more info. | | `okta_groups_search` | string | | `None` | Okta search expression (not regex) for ingesting groups. Only one of `okta_groups_filter` and `okta_groups_search` can be set. See the [Okta API docs](https://developer.okta.com/docs/reference/api/groups/#list-groups-with-search) for more info. |
| `mask_group_id` | bool | | `True` | Whether workunit IDs for groups should be masked. When `True`, each group workunit ID is a sequential `group-<n>` label instead of the group URN. |
| `mask_user_id` | bool | | `True` | Whether workunit IDs for users should be masked. When `True`, each user workunit ID is a sequential `user-<n>` label instead of the user URN. |
## Compatibility ## Compatibility

View File

@ -63,6 +63,10 @@ class AzureADConfig(ConfigModel):
# If enabled, report will contain names of filtered users and groups. # If enabled, report will contain names of filtered users and groups.
filtered_tracking: bool = True filtered_tracking: bool = True
# Optional: Whether to mask sensitive information from workunit IDs. On by default.
mask_group_id: bool = True
mask_user_id: bool = True
@dataclass @dataclass
class AzureADSourceReport(SourceReport): class AzureADSourceReport(SourceReport):
@ -134,11 +138,18 @@ class AzureADSource(Source):
datahub_corp_group_snapshots = self._map_azure_ad_groups( datahub_corp_group_snapshots = self._map_azure_ad_groups(
azure_ad_groups azure_ad_groups
) )
for datahub_corp_group_snapshot in datahub_corp_group_snapshots: for group_count, datahub_corp_group_snapshot in enumerate(
datahub_corp_group_snapshots
):
mce = MetadataChangeEvent( mce = MetadataChangeEvent(
proposedSnapshot=datahub_corp_group_snapshot proposedSnapshot=datahub_corp_group_snapshot
) )
wu = MetadataWorkUnit(id=datahub_corp_group_snapshot.urn, mce=mce) wu_id = (
f"group-{group_count + 1}"
if self.config.mask_group_id
else datahub_corp_group_snapshot.urn
)
wu = MetadataWorkUnit(id=wu_id, mce=mce)
self.report.report_workunit(wu) self.report.report_workunit(wu)
yield wu yield wu
@ -241,7 +252,9 @@ class AzureADSource(Source):
datahub_corp_user_snapshots: Generator[CorpUserSnapshot, Any, None], datahub_corp_user_snapshots: Generator[CorpUserSnapshot, Any, None],
datahub_corp_user_urn_to_group_membership: dict, datahub_corp_user_urn_to_group_membership: dict,
) -> Generator[MetadataWorkUnit, Any, None]: ) -> Generator[MetadataWorkUnit, Any, None]:
for datahub_corp_user_snapshot in datahub_corp_user_snapshots: for user_count, datahub_corp_user_snapshot in enumerate(
datahub_corp_user_snapshots
):
# Add GroupMembership if applicable # Add GroupMembership if applicable
if ( if (
datahub_corp_user_snapshot.urn datahub_corp_user_snapshot.urn
@ -255,7 +268,12 @@ class AzureADSource(Source):
assert datahub_group_membership assert datahub_group_membership
datahub_corp_user_snapshot.aspects.append(datahub_group_membership) datahub_corp_user_snapshot.aspects.append(datahub_group_membership)
mce = MetadataChangeEvent(proposedSnapshot=datahub_corp_user_snapshot) mce = MetadataChangeEvent(proposedSnapshot=datahub_corp_user_snapshot)
wu = MetadataWorkUnit(id=datahub_corp_user_snapshot.urn, mce=mce) wu_id = (
f"user-{user_count + 1}"
if self.config.mask_user_id
else datahub_corp_user_snapshot.urn
)
wu = MetadataWorkUnit(id=wu_id, mce=mce)
self.report.report_workunit(wu) self.report.report_workunit(wu)
yield wu yield wu

View File

@ -70,6 +70,10 @@ class OktaConfig(ConfigModel):
okta_groups_filter: Optional[str] = None okta_groups_filter: Optional[str] = None
okta_groups_search: Optional[str] = None okta_groups_search: Optional[str] = None
# Optional: Whether to mask sensitive information from workunit IDs. On by default.
mask_group_id: bool = True
mask_user_id: bool = True
@validator("okta_users_search") @validator("okta_users_search")
def okta_users_one_of_filter_or_search(cls, v, values): def okta_users_one_of_filter_or_search(cls, v, values):
if v and values["okta_users_filter"]: if v and values["okta_users_filter"]:
@ -130,9 +134,16 @@ class OktaSource(Source):
if self.config.ingest_groups: if self.config.ingest_groups:
okta_groups = list(self._get_okta_groups(event_loop)) okta_groups = list(self._get_okta_groups(event_loop))
datahub_corp_group_snapshots = self._map_okta_groups(okta_groups) datahub_corp_group_snapshots = self._map_okta_groups(okta_groups)
for datahub_corp_group_snapshot in datahub_corp_group_snapshots: for group_count, datahub_corp_group_snapshot in enumerate(
datahub_corp_group_snapshots
):
mce = MetadataChangeEvent(proposedSnapshot=datahub_corp_group_snapshot) mce = MetadataChangeEvent(proposedSnapshot=datahub_corp_group_snapshot)
wu = MetadataWorkUnit(id=datahub_corp_group_snapshot.urn, mce=mce) wu_id = (
f"group-{group_count + 1}"
if self.config.mask_group_id
else datahub_corp_group_snapshot.urn
)
wu = MetadataWorkUnit(id=wu_id, mce=mce)
self.report.report_workunit(wu) self.report.report_workunit(wu)
yield wu yield wu
@ -183,7 +194,9 @@ class OktaSource(Source):
okta_users = self._get_okta_users(event_loop) okta_users = self._get_okta_users(event_loop)
filtered_okta_users = filter(self._filter_okta_user, okta_users) filtered_okta_users = filter(self._filter_okta_user, okta_users)
datahub_corp_user_snapshots = self._map_okta_users(filtered_okta_users) datahub_corp_user_snapshots = self._map_okta_users(filtered_okta_users)
for datahub_corp_user_snapshot in datahub_corp_user_snapshots: for user_count, datahub_corp_user_snapshot in enumerate(
datahub_corp_user_snapshots
):
# Add GroupMembership aspect populated in Step 2 if applicable. # Add GroupMembership aspect populated in Step 2 if applicable.
if ( if (
@ -198,7 +211,12 @@ class OktaSource(Source):
assert datahub_group_membership is not None assert datahub_group_membership is not None
datahub_corp_user_snapshot.aspects.append(datahub_group_membership) datahub_corp_user_snapshot.aspects.append(datahub_group_membership)
mce = MetadataChangeEvent(proposedSnapshot=datahub_corp_user_snapshot) mce = MetadataChangeEvent(proposedSnapshot=datahub_corp_user_snapshot)
wu = MetadataWorkUnit(id=datahub_corp_user_snapshot.urn, mce=mce) wu_id = (
f"user-{user_count + 1}"
if self.config.mask_user_id
else datahub_corp_user_snapshot.urn
)
wu = MetadataWorkUnit(id=wu_id, mce=mce)
self.report.report_workunit(wu) self.report.report_workunit(wu)
yield wu yield wu