diff --git a/metadata-ingestion/source_docs/okta.md b/metadata-ingestion/source_docs/okta.md index bf0de9a487..58b6d8409b 100644 --- a/metadata-ingestion/source_docs/okta.md +++ b/metadata-ingestion/source_docs/okta.md @@ -38,7 +38,7 @@ and mapped to the DataHub `CorpUserInfo` aspect: - email - title - department -- country code +- country code ### Extracting DataHub Groups @@ -69,6 +69,12 @@ this should not matter. This is a known limitation in our data model that is being tracked by [this ticket](https://github.com/datahub-project/datahub/issues/3065). +### Filtering and Searching +You can also choose to ingest a subset of users or groups to DataHub by adding flags for filtering or searching. For +users, set either the `okta_users_filter` or `okta_users_search` flag (only one can be set at a time). For groups, set +either the `okta_groups_filter` or `okta_groups_search` flag. Note that these are not regular expressions. See [below](#config-details) for full configuration +options. + ## Quickstart recipe @@ -98,21 +104,25 @@ For general pointers on writing and running a recipe, see our [main recipe guide Note that a `.` is used to denote nested fields in the YAML configuration block. -| Field | Type | Required | Default | Description | -|------------------------------------|--------|----------|-------------|-----------------------------------------------------------------------------------------------------------------| -| `okta_domain` | string | ✅ | | The location of your Okta Domain, without a protocol. Can be found in Okta Developer console. | -| `okta_api_token` | string | ✅ | | An API token generated for the DataHub application inside your Okta Developer Console. | -| `ingest_users` | bool | | `True` | Whether users should be ingested into DataHub. | -| `ingest_groups` | bool | | `True` | Whether groups should be ingested into DataHub. | -| `ingest_group_membership` | bool | | `True` | Whether group membership should be ingested into DataHub. 
ingest_groups must be True if this is True. | -| `okta_profile_to_username_attr` | string | | `"login"` | Which Okta User Profile attribute to use as input to DataHub username mapping. | -| `okta_profile_to_username_regex` | string | | `"([^@]+)"` | A regex used to parse the DataHub username from the attribute specified in `okta_profile_to_username_attr`. | -| `okta_profile_to_group_name_attr` | string | | `"name"` | Which Okta Group Profile attribute to use as input to DataHub group name mapping. | -| `okta_profile_to_group_name_regex` | string | | `"(.*)"` | A regex used to parse the DataHub group name from the attribute specified in `okta_profile_to_group_name_attr`. | -| `include_deprovisioned_users` | bool | | `False` | Whether to ingest users in the DEPROVISIONED state from Okta. | -| `include_suspended_users` | bool | | `False` | Whether to ingest users in the SUSPENDED state from Okta. | -| `page_size` | number | | `100` | The number of entities requested from Okta's REST APIs in one request. | -| `delay_seconds` | number | | `0.01` | Number of seconds to wait between calls to Okta's REST APIs. (Okta rate limits). Defaults to 10ms. | +| Field | Type | Required | Default | Description | +|------------------------------------|--------|----------|-------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `okta_domain` | string | ✅ | | The location of your Okta Domain, without a protocol. Can be found in Okta Developer console. | +| `okta_api_token` | string | ✅ | | An API token generated for the DataHub application inside your Okta Developer Console. | +| `ingest_users` | bool | | `True` | Whether users should be ingested into DataHub. | +| `ingest_groups` | bool | | `True` | Whether groups should be ingested into DataHub. 
| +| `ingest_group_membership` | bool | | `True` | Whether group membership should be ingested into DataHub. ingest_groups must be True if this is True. | +| `okta_profile_to_username_attr` | string | | `"login"` | Which Okta User Profile attribute to use as input to DataHub username mapping. | +| `okta_profile_to_username_regex` | string | | `"([^@]+)"` | A regex used to parse the DataHub username from the attribute specified in `okta_profile_to_username_attr`. | +| `okta_profile_to_group_name_attr` | string | | `"name"` | Which Okta Group Profile attribute to use as input to DataHub group name mapping. | +| `okta_profile_to_group_name_regex` | string | | `"(.*)"` | A regex used to parse the DataHub group name from the attribute specified in `okta_profile_to_group_name_attr`. | +| `include_deprovisioned_users` | bool | | `False` | Whether to ingest users in the DEPROVISIONED state from Okta. | +| `include_suspended_users` | bool | | `False` | Whether to ingest users in the SUSPENDED state from Okta. | +| `page_size` | number | | `100` | The number of entities requested from Okta's REST APIs in one request. | +| `delay_seconds` | number | | `0.01` | Number of seconds to wait between calls to Okta's REST APIs. (Okta rate limits). Defaults to 10ms. | +| `okta_users_filter` | string | | `None` | Okta filter expression (not regex) for ingesting users. Only one of `okta_users_filter` and `okta_users_search` can be set. See the [Okta API docs](https://developer.okta.com/docs/reference/api/users/#list-users-with-a-filter) for more info. | +| `okta_users_search` | string | | `None` | Okta search expression (not regex) for ingesting users. Only one of `okta_users_filter` and `okta_users_search` can be set. See the [Okta API docs](https://developer.okta.com/docs/reference/api/users/#list-users-with-search) for more info. | +| `okta_groups_filter` | string | | `None` | Okta filter expression (not regex) for ingesting groups. 
Only one of `okta_groups_filter` and `okta_groups_search` can be set. See the [Okta API docs](https://developer.okta.com/docs/reference/api/groups/#filters) for more info. | +| `okta_groups_search` | string | | `None` | Okta search expression (not regex) for ingesting groups. Only one of `okta_groups_filter` and `okta_groups_search` can be set. See the [Okta API docs](https://developer.okta.com/docs/reference/api/groups/#list-groups-with-search) for more info. | ## Compatibility diff --git a/metadata-ingestion/src/datahub/ingestion/source/identity/okta.py b/metadata-ingestion/src/datahub/ingestion/source/identity/okta.py index ced3c307e6..4453194d0e 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/identity/okta.py +++ b/metadata-ingestion/src/datahub/ingestion/source/identity/okta.py @@ -4,13 +4,15 @@ import re import urllib from dataclasses import dataclass, field from time import sleep -from typing import Dict, Iterable, List, Union +from typing import Dict, Iterable, List, Optional, Union from okta.client import Client as OktaClient from okta.exceptions import OktaAPIException from okta.models import Group, GroupProfile, User, UserProfile, UserStatus +from pydantic import validator from datahub.configuration import ConfigModel +from datahub.configuration.common import ConfigurationError from datahub.ingestion.api.common import PipelineContext from datahub.ingestion.api.source import Source, SourceReport from datahub.ingestion.api.workunit import MetadataWorkUnit @@ -60,6 +62,30 @@ class OktaConfig(ConfigModel): # Optional: Set the delay for fetching batches of entities from Okta. Okta has rate limiting in place. delay_seconds = 0.01 + # Optional: Filter and search expression for ingesting a subset of users. Only one can be specified at a time. + okta_users_filter: Optional[str] = None + okta_users_search: Optional[str] = None + + # Optional: Filter and search expression for ingesting a subset of groups. Only one can be specified at a time. 
+ okta_groups_filter: Optional[str] = None + okta_groups_search: Optional[str] = None + + @validator("okta_users_search") + def okta_users_one_of_filter_or_search(cls, v, values): + if v and values["okta_users_filter"]: + raise ConfigurationError( + "Only one of okta_users_filter or okta_users_search can be set" + ) + return v + + @validator("okta_groups_search") + def okta_groups_one_of_filter_or_search(cls, v, values): + if v and values["okta_groups_filter"]: + raise ConfigurationError( + "Only one of okta_groups_filter or okta_groups_search can be set" + ) + return v + @dataclass class OktaSourceReport(SourceReport): @@ -201,7 +227,12 @@ class OktaSource(Source): logger.debug("Extracting all Okta groups") # Note that this is not taking full advantage of Python AsyncIO, as we are blocking on calls. - query_parameters = {"limit": self.config.page_size} + query_parameters: Dict[str, Union[str, int]] = {"limit": self.config.page_size} + if self.config.okta_groups_filter: + query_parameters.update({"filter": self.config.okta_groups_filter}) + if self.config.okta_groups_search: + query_parameters.update({"search": self.config.okta_groups_search}) + groups = resp = err = None try: groups, resp, err = event_loop.run_until_complete( self.okta_client.list_groups(query_parameters) @@ -238,6 +269,7 @@ class OktaSource(Source): # Note that this is not taking full advantage of Python AsyncIO; we are blocking on calls. 
query_parameters = {"limit": self.config.page_size} + users = resp = err = None try: users, resp, err = event_loop.run_until_complete( self.okta_client.list_group_users(group.id, query_parameters) @@ -272,7 +304,12 @@ class OktaSource(Source): def _get_okta_users(self, event_loop: asyncio.AbstractEventLoop) -> Iterable[User]: logger.debug("Extracting all Okta users") - query_parameters = {"limit": self.config.page_size} + query_parameters: Dict[str, Union[str, int]] = {"limit": self.config.page_size} + if self.config.okta_users_filter: + query_parameters.update({"filter": self.config.okta_users_filter}) + if self.config.okta_users_search: + query_parameters.update({"search": self.config.okta_users_search}) + users = resp = err = None try: users, resp, err = event_loop.run_until_complete( self.okta_client.list_users(query_parameters)