feat(okta) - add support for filtering/searching when ingesting Okta groups and users (#4586)

This commit is contained in:
Aditya Radhakrishnan 2022-04-05 16:15:34 -07:00 committed by GitHub
parent 32349bf405
commit aeafa7e63f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 66 additions and 19 deletions

View File

@ -38,7 +38,7 @@ and mapped to the DataHub `CorpUserInfo` aspect:
- email
- title
- department
- country code
- country code
### Extracting DataHub Groups
@ -69,6 +69,12 @@ this should not matter.
This is a known limitation in our data model that is being tracked by [this ticket](https://github.com/datahub-project/datahub/issues/3065).
### Filtering and Searching
You can also choose to ingest a subset of users or groups to DataHub by adding flags for filtering or searching. For
users, set either the `okta_users_filter` or `okta_users_search` flag (only one can be set at a time). For groups, set
either the `okta_groups_filter` or `okta_groups_search` flag. Note that these are not regular expressions. See [below](#config-details) for full configuration
options.
## Quickstart recipe
@ -98,21 +104,25 @@ For general pointers on writing and running a recipe, see our [main recipe guide
Note that a `.` is used to denote nested fields in the YAML configuration block.
| Field | Type | Required | Default | Description |
|------------------------------------|--------|----------|-------------|-----------------------------------------------------------------------------------------------------------------|
| `okta_domain` | string | ✅ | | The location of your Okta Domain, without a protocol. Can be found in Okta Developer console. |
| `okta_api_token` | string | ✅ | | An API token generated for the DataHub application inside your Okta Developer Console. |
| `ingest_users` | bool | | `True` | Whether users should be ingested into DataHub. |
| `ingest_groups` | bool | | `True` | Whether groups should be ingested into DataHub. |
| `ingest_group_membership` | bool | | `True` | Whether group membership should be ingested into DataHub. ingest_groups must be True if this is True. |
| `okta_profile_to_username_attr` | string | | `"login"` | Which Okta User Profile attribute to use as input to DataHub username mapping. |
| `okta_profile_to_username_regex` | string | | `"([^@]+)"` | A regex used to parse the DataHub username from the attribute specified in `okta_profile_to_username_attr`. |
| `okta_profile_to_group_name_attr` | string | | `"name"` | Which Okta Group Profile attribute to use as input to DataHub group name mapping. |
| `okta_profile_to_group_name_regex` | string | | `"(.*)"` | A regex used to parse the DataHub group name from the attribute specified in `okta_profile_to_group_name_attr`. |
| `include_deprovisioned_users` | bool | | `False` | Whether to ingest users in the DEPROVISIONED state from Okta. |
| `include_suspended_users` | bool | | `False` | Whether to ingest users in the SUSPENDED state from Okta. |
| `page_size` | number | | `100` | The number of entities requested from Okta's REST APIs in one request. |
| `delay_seconds` | number | | `0.01` | Number of seconds to wait between calls to Okta's REST APIs. (Okta rate limits). Defaults to 10ms. |
| Field | Type | Required | Default | Description |
|------------------------------------|--------|----------|-------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| `okta_domain` | string | ✅ | | The location of your Okta Domain, without a protocol. Can be found in Okta Developer console. |
| `okta_api_token` | string | ✅ | | An API token generated for the DataHub application inside your Okta Developer Console. |
| `ingest_users` | bool | | `True` | Whether users should be ingested into DataHub. |
| `ingest_groups` | bool | | `True` | Whether groups should be ingested into DataHub. |
| `ingest_group_membership` | bool | | `True` | Whether group membership should be ingested into DataHub. ingest_groups must be True if this is True. |
| `okta_profile_to_username_attr` | string | | `"login"` | Which Okta User Profile attribute to use as input to DataHub username mapping. |
| `okta_profile_to_username_regex` | string | | `"([^@]+)"` | A regex used to parse the DataHub username from the attribute specified in `okta_profile_to_username_attr`. |
| `okta_profile_to_group_name_attr` | string | | `"name"` | Which Okta Group Profile attribute to use as input to DataHub group name mapping. |
| `okta_profile_to_group_name_regex` | string | | `"(.*)"` | A regex used to parse the DataHub group name from the attribute specified in `okta_profile_to_group_name_attr`. |
| `include_deprovisioned_users` | bool | | `False` | Whether to ingest users in the DEPROVISIONED state from Okta. |
| `include_suspended_users` | bool | | `False` | Whether to ingest users in the SUSPENDED state from Okta. |
| `page_size` | number | | `100` | The number of entities requested from Okta's REST APIs in one request. |
| `delay_seconds` | number | | `0.01` | Number of seconds to wait between calls to Okta's REST APIs. (Okta rate limits). Defaults to 10ms. |
| `okta_users_filter` | string | | `None` | Okta filter expression (not regex) for ingesting users. Only one of `okta_users_filter` and `okta_users_search` can be set. See the [Okta API docs](https://developer.okta.com/docs/reference/api/users/#list-users-with-a-filter) for more info. |
| `okta_users_search` | string | | `None` | Okta search expression (not regex) for ingesting users. Only one of `okta_users_filter` and `okta_users_search` can be set. See the [Okta API docs](https://developer.okta.com/docs/reference/api/users/#list-users-with-search) for more info. |
| `okta_groups_filter` | string | | `None` | Okta filter expression (not regex) for ingesting groups. Only one of `okta_groups_filter` and `okta_groups_search` can be set. See the [Okta API docs](https://developer.okta.com/docs/reference/api/groups/#filters) for more info. |
| `okta_groups_search` | string | | `None` | Okta search expression (not regex) for ingesting groups. Only one of `okta_groups_filter` and `okta_groups_search` can be set. See the [Okta API docs](https://developer.okta.com/docs/reference/api/groups/#list-groups-with-search) for more info. |
## Compatibility

View File

@ -4,13 +4,15 @@ import re
import urllib
from dataclasses import dataclass, field
from time import sleep
from typing import Dict, Iterable, List, Union
from typing import Dict, Iterable, List, Optional, Union
from okta.client import Client as OktaClient
from okta.exceptions import OktaAPIException
from okta.models import Group, GroupProfile, User, UserProfile, UserStatus
from pydantic import validator
from datahub.configuration import ConfigModel
from datahub.configuration.common import ConfigurationError
from datahub.ingestion.api.common import PipelineContext
from datahub.ingestion.api.source import Source, SourceReport
from datahub.ingestion.api.workunit import MetadataWorkUnit
@ -60,6 +62,30 @@ class OktaConfig(ConfigModel):
# Optional: Set the delay for fetching batches of entities from Okta. Okta has rate limiting in place.
delay_seconds = 0.01
# Optional: Filter and search expression for ingesting a subset of users. Only one can be specified at a time.
okta_users_filter: Optional[str] = None
okta_users_search: Optional[str] = None
# Optional: Filter and search expression for ingesting a subset of groups. Only one can be specified at a time.
okta_groups_filter: Optional[str] = None
okta_groups_search: Optional[str] = None
@validator("okta_users_search")
def okta_users_one_of_filter_or_search(cls, v, values):
    """Ensure `okta_users_filter` and `okta_users_search` are not both set.

    Registered on `okta_users_search`, which is declared after
    `okta_users_filter`, so the filter value (if it validated) is
    available in `values` when this runs.

    Args:
        v: The candidate value for `okta_users_search`.
        values: Previously validated fields of the config.

    Returns:
        `v` unchanged when the combination is allowed.

    Raises:
        ConfigurationError: If both the users filter and the users
            search expressions are set.
    """
    # pydantic omits a field from `values` when that field failed its own
    # validation; use .get() so a bad `okta_users_filter` surfaces as its
    # own validation error rather than a KeyError raised from here.
    if v and values.get("okta_users_filter"):
        raise ConfigurationError(
            "Only one of okta_users_filter or okta_users_search can be set"
        )
    return v
@validator("okta_groups_search")
def okta_groups_one_of_filter_or_search(cls, v, values):
    """Ensure `okta_groups_filter` and `okta_groups_search` are not both set.

    Registered on `okta_groups_search`, which is declared after
    `okta_groups_filter`, so the filter value (if it validated) is
    available in `values` when this runs.

    Args:
        v: The candidate value for `okta_groups_search`.
        values: Previously validated fields of the config.

    Returns:
        `v` unchanged when the combination is allowed.

    Raises:
        ConfigurationError: If both the groups filter and the groups
            search expressions are set.
    """
    # pydantic omits a field from `values` when that field failed its own
    # validation; use .get() so a bad `okta_groups_filter` surfaces as its
    # own validation error rather than a KeyError raised from here.
    if v and values.get("okta_groups_filter"):
        raise ConfigurationError(
            "Only one of okta_groups_filter or okta_groups_search can be set"
        )
    return v
@dataclass
class OktaSourceReport(SourceReport):
@ -201,7 +227,12 @@ class OktaSource(Source):
logger.debug("Extracting all Okta groups")
# Note that this is not taking full advantage of Python AsyncIO, as we are blocking on calls.
query_parameters = {"limit": self.config.page_size}
query_parameters: Dict[str, Union[str, int]] = {"limit": self.config.page_size}
if self.config.okta_groups_filter:
query_parameters.update({"filter": self.config.okta_groups_filter})
if self.config.okta_groups_search:
query_parameters.update({"search": self.config.okta_groups_search})
groups = resp = err = None
try:
groups, resp, err = event_loop.run_until_complete(
self.okta_client.list_groups(query_parameters)
@ -238,6 +269,7 @@ class OktaSource(Source):
# Note that this is not taking full advantage of Python AsyncIO; we are blocking on calls.
query_parameters = {"limit": self.config.page_size}
users = resp = err = None
try:
users, resp, err = event_loop.run_until_complete(
self.okta_client.list_group_users(group.id, query_parameters)
@ -272,7 +304,12 @@ class OktaSource(Source):
def _get_okta_users(self, event_loop: asyncio.AbstractEventLoop) -> Iterable[User]:
logger.debug("Extracting all Okta users")
query_parameters = {"limit": self.config.page_size}
query_parameters: Dict[str, Union[str, int]] = {"limit": self.config.page_size}
if self.config.okta_users_filter:
query_parameters.update({"filter": self.config.okta_users_filter})
if self.config.okta_users_search:
query_parameters.update({"search": self.config.okta_users_search})
users = resp = err = None
try:
users, resp, err = event_loop.run_until_complete(
self.okta_client.list_users(query_parameters)