mirror of
				https://github.com/datahub-project/datahub.git
				synced 2025-11-03 20:27:50 +00:00 
			
		
		
		
	feat(okta) - add support for filtering/searching when ingesting Okta groups and users (#4586)
This commit is contained in:
		
							parent
							
								
									32349bf405
								
							
						
					
					
						commit
						aeafa7e63f
					
				@ -38,7 +38,7 @@ and mapped to the DataHub `CorpUserInfo` aspect:
 | 
			
		||||
- email
 | 
			
		||||
- title
 | 
			
		||||
- department
 | 
			
		||||
- country code 
 | 
			
		||||
- country code
 | 
			
		||||
 | 
			
		||||
### Extracting DataHub Groups
 | 
			
		||||
 | 
			
		||||
@ -69,6 +69,12 @@ this should not matter.
 | 
			
		||||
 | 
			
		||||
This is a known limitation in our data model that is being tracked by [this ticket](https://github.com/datahub-project/datahub/issues/3065).
 | 
			
		||||
 | 
			
		||||
### Filtering and Searching
 | 
			
		||||
You can also choose to ingest a subset of users or groups into DataHub by adding flags for filtering or searching. For
 | 
			
		||||
users, set either the `okta_users_filter` or `okta_users_search` flag (only one can be set at a time). For groups, set
 | 
			
		||||
either the `okta_groups_filter` or `okta_groups_search` flag. Note that these are not regular expressions. See [below](#config-details) for full configuration
 | 
			
		||||
options.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
## Quickstart recipe
 | 
			
		||||
 | 
			
		||||
@ -98,21 +104,25 @@ For general pointers on writing and running a recipe, see our [main recipe guide
 | 
			
		||||
 | 
			
		||||
Note that a `.` is used to denote nested fields in the YAML configuration block.
 | 
			
		||||
 | 
			
		||||
| Field                              | Type   | Required | Default     | Description                                                                                                     |
 | 
			
		||||
|------------------------------------|--------|----------|-------------|-----------------------------------------------------------------------------------------------------------------|
 | 
			
		||||
| `okta_domain`                      | string | ✅       |             | The location of your Okta Domain, without a protocol. Can be found in Okta Developer console.                   |
 | 
			
		||||
| `okta_api_token`                   | string | ✅       |             | An API token generated for the DataHub application inside your Okta Developer Console.                          |
 | 
			
		||||
| `ingest_users`                     | bool   |          | `True`      | Whether users should be ingested into DataHub.                                                                  |
 | 
			
		||||
| `ingest_groups`                    | bool   |          | `True`      | Whether groups should be ingested into DataHub.                                                                 |
 | 
			
		||||
| `ingest_group_membership`          | bool   |          | `True`      | Whether group membership should be ingested into DataHub. ingest_groups must be True if this is True.           |
 | 
			
		||||
| `okta_profile_to_username_attr`    | string |          | `"login"`   | Which Okta User Profile attribute to use as input to DataHub username mapping.                                  |
 | 
			
		||||
| `okta_profile_to_username_regex`   | string |          | `"([^@]+)"` | A regex used to parse the DataHub username from the attribute specified in `okta_profile_to_username_attr`.     |
 | 
			
		||||
| `okta_profile_to_group_name_attr`  | string |          | `"name"`    | Which Okta Group Profile attribute to use as input to DataHub group name mapping.                               |
 | 
			
		||||
| `okta_profile_to_group_name_regex` | string |          | `"(.*)"`    | A regex used to parse the DataHub group name from the attribute specified in `okta_profile_to_group_name_attr`. |
 | 
			
		||||
| `include_deprovisioned_users`      | bool   |          | `False`     | Whether to ingest users in the DEPROVISIONED state from Okta.                                                   |
 | 
			
		||||
| `include_suspended_users`          | bool   |          | `False`     | Whether to ingest users in the SUSPENDED state from Okta.                                                       |
 | 
			
		||||
| `page_size`                        | number |          | `100`       | The number of entities requested from Okta's REST APIs in one request.                                          |
 | 
			
		||||
| `delay_seconds`                    | number |          | `0.01`      | Number of seconds to wait between calls to Okta's REST APIs. (Okta rate limits). Defaults to 10ms.              |
 | 
			
		||||
| Field                              | Type   | Required | Default     | Description                                                                                                                                                                                                                                          |
 | 
			
		||||
|------------------------------------|--------|----------|-------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
 | 
			
		||||
| `okta_domain`                      | string | ✅       |             | The location of your Okta Domain, without a protocol. Can be found in Okta Developer console.                                                                                                                                                        |
 | 
			
		||||
| `okta_api_token`                   | string | ✅       |             | An API token generated for the DataHub application inside your Okta Developer Console.                                                                                                                                                               |
 | 
			
		||||
| `ingest_users`                     | bool   |          | `True`      | Whether users should be ingested into DataHub.                                                                                                                                                                                                       |
 | 
			
		||||
| `ingest_groups`                    | bool   |          | `True`      | Whether groups should be ingested into DataHub.                                                                                                                                                                                                      |
 | 
			
		||||
| `ingest_group_membership`          | bool   |          | `True`      | Whether group membership should be ingested into DataHub. ingest_groups must be True if this is True.                                                                                                                                                |
 | 
			
		||||
| `okta_profile_to_username_attr`    | string |          | `"login"`   | Which Okta User Profile attribute to use as input to DataHub username mapping.                                                                                                                                                                       |
 | 
			
		||||
| `okta_profile_to_username_regex`   | string |          | `"([^@]+)"` | A regex used to parse the DataHub username from the attribute specified in `okta_profile_to_username_attr`.                                                                                                                                          |
 | 
			
		||||
| `okta_profile_to_group_name_attr`  | string |          | `"name"`    | Which Okta Group Profile attribute to use as input to DataHub group name mapping.                                                                                                                                                                    |
 | 
			
		||||
| `okta_profile_to_group_name_regex` | string |          | `"(.*)"`    | A regex used to parse the DataHub group name from the attribute specified in `okta_profile_to_group_name_attr`.                                                                                                                                      |
 | 
			
		||||
| `include_deprovisioned_users`      | bool   |          | `False`     | Whether to ingest users in the DEPROVISIONED state from Okta.                                                                                                                                                                                        |
 | 
			
		||||
| `include_suspended_users`          | bool   |          | `False`     | Whether to ingest users in the SUSPENDED state from Okta.                                                                                                                                                                                            |
 | 
			
		||||
| `page_size`                        | number |          | `100`       | The number of entities requested from Okta's REST APIs in one request.                                                                                                                                                                               |
 | 
			
		||||
| `delay_seconds`                    | number |          | `0.01`      | Number of seconds to wait between calls to Okta's REST APIs. (Okta rate limits). Defaults to 10ms.                                                                                                                                                   |
 | 
			
		||||
| `okta_users_filter`                | string |          | `None`        | Okta filter expression (not regex) for ingesting users. Only one of `okta_users_filter` and `okta_users_search` can be set. See the [Okta API docs](https://developer.okta.com/docs/reference/api/users/#list-users-with-a-filter) for more info.    |
 | 
			
		||||
| `okta_users_search`                | string |          | `None`        | Okta search expression (not regex) for ingesting users. Only one of `okta_users_filter` and `okta_users_search` can be set. See the [Okta API docs](https://developer.okta.com/docs/reference/api/users/#list-users-with-search) for more info.      |
 | 
			
		||||
| `okta_groups_filter`               | string |          | `None`        | Okta filter expression (not regex) for ingesting groups. Only one of `okta_groups_filter` and `okta_groups_search` can be set. See the [Okta API docs](https://developer.okta.com/docs/reference/api/groups/#filters) for more info.                 |
 | 
			
		||||
| `okta_groups_search`               | string |          | `None`        | Okta search expression (not regex) for ingesting groups. Only one of `okta_groups_filter` and `okta_groups_search` can be set. See the [Okta API docs](https://developer.okta.com/docs/reference/api/groups/#list-groups-with-search) for more info. |
 | 
			
		||||
 | 
			
		||||
## Compatibility
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@ -4,13 +4,15 @@ import re
 | 
			
		||||
import urllib
 | 
			
		||||
from dataclasses import dataclass, field
 | 
			
		||||
from time import sleep
 | 
			
		||||
from typing import Dict, Iterable, List, Union
 | 
			
		||||
from typing import Dict, Iterable, List, Optional, Union
 | 
			
		||||
 | 
			
		||||
from okta.client import Client as OktaClient
 | 
			
		||||
from okta.exceptions import OktaAPIException
 | 
			
		||||
from okta.models import Group, GroupProfile, User, UserProfile, UserStatus
 | 
			
		||||
from pydantic import validator
 | 
			
		||||
 | 
			
		||||
from datahub.configuration import ConfigModel
 | 
			
		||||
from datahub.configuration.common import ConfigurationError
 | 
			
		||||
from datahub.ingestion.api.common import PipelineContext
 | 
			
		||||
from datahub.ingestion.api.source import Source, SourceReport
 | 
			
		||||
from datahub.ingestion.api.workunit import MetadataWorkUnit
 | 
			
		||||
@ -60,6 +62,30 @@ class OktaConfig(ConfigModel):
 | 
			
		||||
    # Optional: Set the delay for fetching batches of entities from Okta. Okta has rate limiting in place.
 | 
			
		||||
    delay_seconds = 0.01
 | 
			
		||||
 | 
			
		||||
    # Optional: Filter and search expression for ingesting a subset of users. Only one can be specified at a time.
 | 
			
		||||
    okta_users_filter: Optional[str] = None
 | 
			
		||||
    okta_users_search: Optional[str] = None
 | 
			
		||||
 | 
			
		||||
    # Optional: Filter and search expression for ingesting a subset of groups. Only one can be specified at a time.
 | 
			
		||||
    okta_groups_filter: Optional[str] = None
 | 
			
		||||
    okta_groups_search: Optional[str] = None
 | 
			
		||||
 | 
			
		||||
    @validator("okta_users_search")
 | 
			
		||||
    def okta_users_one_of_filter_or_search(cls, v, values):
 | 
			
		||||
        if v and values["okta_users_filter"]:
 | 
			
		||||
            raise ConfigurationError(
 | 
			
		||||
                "Only one of okta_users_filter or okta_users_search can be set"
 | 
			
		||||
            )
 | 
			
		||||
        return v
 | 
			
		||||
 | 
			
		||||
    @validator("okta_groups_search")
 | 
			
		||||
    def okta_groups_one_of_filter_or_search(cls, v, values):
 | 
			
		||||
        if v and values["okta_groups_filter"]:
 | 
			
		||||
            raise ConfigurationError(
 | 
			
		||||
                "Only one of okta_groups_filter or okta_groups_search can be set"
 | 
			
		||||
            )
 | 
			
		||||
        return v
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@dataclass
 | 
			
		||||
class OktaSourceReport(SourceReport):
 | 
			
		||||
@ -201,7 +227,12 @@ class OktaSource(Source):
 | 
			
		||||
        logger.debug("Extracting all Okta groups")
 | 
			
		||||
 | 
			
		||||
        # Note that this is not taking full advantage of Python AsyncIO, as we are blocking on calls.
 | 
			
		||||
        query_parameters = {"limit": self.config.page_size}
 | 
			
		||||
        query_parameters: Dict[str, Union[str, int]] = {"limit": self.config.page_size}
 | 
			
		||||
        if self.config.okta_groups_filter:
 | 
			
		||||
            query_parameters.update({"filter": self.config.okta_groups_filter})
 | 
			
		||||
        if self.config.okta_groups_search:
 | 
			
		||||
            query_parameters.update({"search": self.config.okta_groups_search})
 | 
			
		||||
        groups = resp = err = None
 | 
			
		||||
        try:
 | 
			
		||||
            groups, resp, err = event_loop.run_until_complete(
 | 
			
		||||
                self.okta_client.list_groups(query_parameters)
 | 
			
		||||
@ -238,6 +269,7 @@ class OktaSource(Source):
 | 
			
		||||
 | 
			
		||||
        # Note that this is not taking full advantage of Python AsyncIO; we are blocking on calls.
 | 
			
		||||
        query_parameters = {"limit": self.config.page_size}
 | 
			
		||||
        users = resp = err = None
 | 
			
		||||
        try:
 | 
			
		||||
            users, resp, err = event_loop.run_until_complete(
 | 
			
		||||
                self.okta_client.list_group_users(group.id, query_parameters)
 | 
			
		||||
@ -272,7 +304,12 @@ class OktaSource(Source):
 | 
			
		||||
    def _get_okta_users(self, event_loop: asyncio.AbstractEventLoop) -> Iterable[User]:
 | 
			
		||||
        logger.debug("Extracting all Okta users")
 | 
			
		||||
 | 
			
		||||
        query_parameters = {"limit": self.config.page_size}
 | 
			
		||||
        query_parameters: Dict[str, Union[str, int]] = {"limit": self.config.page_size}
 | 
			
		||||
        if self.config.okta_users_filter:
 | 
			
		||||
            query_parameters.update({"filter": self.config.okta_users_filter})
 | 
			
		||||
        if self.config.okta_users_search:
 | 
			
		||||
            query_parameters.update({"search": self.config.okta_users_search})
 | 
			
		||||
        users = resp = err = None
 | 
			
		||||
        try:
 | 
			
		||||
            users, resp, err = event_loop.run_until_complete(
 | 
			
		||||
                self.okta_client.list_users(query_parameters)
 | 
			
		||||
 | 
			
		||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user