Mirror of https://github.com/datahub-project/datahub.git (synced 2025-10-31 02:37:05 +00:00)
	feat(ingestion): add Pulsar source (#4721)
This commit is contained in:
parent f099aeb550
commit 74d6d35881
BIN  datahub-web-react/src/images/pulsarlogo.png (new binary file, 27 KiB; not shown)
| @ -206,6 +206,7 @@ plugins: Dict[str, Set[str]] = { | ||||
|     "postgres": sql_common | {"psycopg2-binary", "GeoAlchemy2"}, | ||||
|     "presto-on-hive": sql_common | ||||
|     | {"psycopg2-binary", "acryl-pyhive[hive]>=0.6.12", "pymysql>=1.0.2"}, | ||||
|     "pulsar": {"requests"}, | ||||
|     "redash": {"redash-toolbelt", "sql-metadata", "sqllineage==1.3.4"}, | ||||
|     "redshift": sql_common | ||||
|     | {"sqlalchemy-redshift", "psycopg2-binary", "GeoAlchemy2", "sqllineage==1.3.4"}, | ||||
| @ -452,6 +453,7 @@ entry_points = { | ||||
|         "nifi = datahub.ingestion.source.nifi:NifiSource", | ||||
|         "powerbi = datahub.ingestion.source.powerbi:PowerBiDashboardSource", | ||||
|         "presto-on-hive = datahub.ingestion.source.sql.presto_on_hive:PrestoOnHiveSource", | ||||
|         "pulsar = datahub.ingestion.source.pulsar:PulsarSource", | ||||
|     ], | ||||
|     "datahub.ingestion.sink.plugins": [ | ||||
|         "file = datahub.ingestion.sink.file:FileSink", | ||||
|  | ||||
							
								
								
									
176  metadata-ingestion/source_docs/pulsar.md (new file)
							| @ -0,0 +1,176 @@ | ||||
| # Pulsar | ||||
| 
 | ||||
| <!-- Set Support Status --> | ||||
| <!--  | ||||
| --> | ||||
|  | ||||
| 
 | ||||
| ## Integration Details | ||||
| 
 | ||||
| <!-- Plain-language description of what this integration is meant to do.  --> | ||||
| <!-- Include details about where metadata is extracted from (ie. logs, source API, manifest, etc.)   --> | ||||
| 
 | ||||
| The DataHub Pulsar source plugin extracts `topic` and `schema` metadata from an Apache Pulsar instance and ingests the information into DataHub. The plugin uses the [Pulsar Admin REST API interface](https://pulsar.apache.org/admin-rest-api/#) to interact with the Pulsar instance (a minimal request sketch follows the list below). The following APIs are used to: | ||||
| - [Get the list of existing tenants](https://pulsar.apache.org/admin-rest-api/#tag/tenants) | ||||
| - [Get the list of namespaces associated with each tenant](https://pulsar.apache.org/admin-rest-api/#tag/namespaces) | ||||
| - [Get the list of topics associated with each namespace](https://pulsar.apache.org/admin-rest-api/#tag/persistent-topic) | ||||
|     - persistent topics | ||||
|     - persistent partitioned topics | ||||
|     - non-persistent topics | ||||
|     - non-persistent partitioned topics | ||||
| - [Get the latest schema associated with each topic](https://pulsar.apache.org/admin-rest-api/#tag/schemas) | ||||
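|  | ||||
| As an illustration, the sketch below walks these endpoints with `requests`. It is a hypothetical standalone script assuming an unauthenticated Pulsar Admin REST endpoint at `http://localhost:8080`; the plugin itself additionally handles authentication, SSL verification and filter patterns. | ||||
|  | ||||
| ```python | ||||
| import requests | ||||
|  | ||||
| base_url = "http://localhost:8080/admin/v2"  # assumption: local broker without authentication | ||||
|  | ||||
| for tenant in requests.get(f"{base_url}/tenants").json(): | ||||
|     # Namespaces are returned as "tenant/namespace" strings | ||||
|     for namespace in requests.get(f"{base_url}/namespaces/{tenant}").json(): | ||||
|         # Only persistent, non-partitioned topics are shown here; the plugin also queries | ||||
|         # /persistent/{ns}/partitioned, /non-persistent/{ns} and /non-persistent/{ns}/partitioned | ||||
|         for topic in requests.get(f"{base_url}/persistent/{namespace}").json(): | ||||
|             print(topic)  # e.g. persistent://tenant/namespace/topic | ||||
| ``` | ||||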
| 
 | ||||
| Data is extracted on a per-`tenant` and per-`namespace` basis; topics, together with their corresponding schema (if available), are ingested as a [Dataset](docs/generated/metamodel/entities/dataset.md) into DataHub. Additional values such as the `schema description`, `schema_version`, `schema_type` and `partitioned` flag are included as `DatasetProperties`. | ||||
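|  | ||||
| For example, the custom properties attached to a topic's `DatasetProperties` look roughly like the following (a sketch based on how the source builds the aspect; the values shown are illustrative): | ||||
|  | ||||
| ```python | ||||
| # Illustrative DatasetProperties custom properties for a topic with an AVRO schema | ||||
| custom_properties = { | ||||
|     "schema_version": "1", | ||||
|     "schema_type": "AVRO", | ||||
|     "partitioned": "false", | ||||
|     # plus any properties stored alongside the schema in the Pulsar schema registry | ||||
| } | ||||
| # The Avro schema's "doc" field is used as the Dataset description. | ||||
| ``` | ||||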
| 
 | ||||
| 
 | ||||
| ### Concept Mapping | ||||
| 
 | ||||
| <!-- This should be a manual mapping of concepts from the source to the DataHub Metadata Model --> | ||||
| <!-- Authors should provide as much context as possible about how this mapping was generated, including assumptions made, known shortcuts, & any other caveats --> | ||||
| 
 | ||||
| This ingestion source maps the following Source System Concepts to DataHub Concepts: | ||||
| 
 | ||||
| <!-- Remove all unnecessary/irrelevant DataHub Concepts --> | ||||
| 
 | ||||
| 
 | ||||
| | Source Concept | DataHub Concept                                                    | Notes                                                                     | | ||||
| |----------------|--------------------------------------------------------------------|---------------------------------------------------------------------------| | ||||
| | `pulsar`       | [Data Platform](docs/generated/metamodel/entities/dataPlatform.md) |                                                                           | | ||||
| | Pulsar Topic   | [Dataset](docs/generated/metamodel/entities/dataset.md)            | _subType_: `topic`                                                        | | ||||
| | Pulsar Schema  | [SchemaField](docs/generated/metamodel/entities/schemaField.md)    | Maps to the fields defined within the `Avro` or `JSON` schema definition. |  | ||||
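|  | ||||
| As a sketch of this mapping, the full Pulsar topic name is used as the Dataset name when constructing the DataHub URN (assuming no platform instance is configured and a hypothetical topic name): | ||||
|  | ||||
| ```python | ||||
| from datahub.emitter.mce_builder import make_dataset_urn_with_platform_instance | ||||
|  | ||||
| urn = make_dataset_urn_with_platform_instance( | ||||
|     platform="pulsar", | ||||
|     name="persistent://public/default/my-topic",  # hypothetical topic | ||||
|     platform_instance=None, | ||||
|     env="PROD", | ||||
| ) | ||||
| # -> urn:li:dataset:(urn:li:dataPlatform:pulsar,persistent://public/default/my-topic,PROD) | ||||
| ``` | ||||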
| 
 | ||||
| 
 | ||||
| ### Supported Capabilities | ||||
| 
 | ||||
| <!-- This should be an auto-generated table of supported DataHub features/functionality --> | ||||
| <!-- Each capability should link out to a feature guide --> | ||||
| 
 | ||||
| | Capability                                            | Status | Notes                                                                                                                                                                                                                                        | | ||||
| |-------------------------------------------------------|:------:|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | ||||
| | Data Container                                        |   ❌    |                                                                                                                                                                                                                                              | | ||||
| [Stateful Ingestion](./stateful_ingestion.md)         |   ✅    | Requires recipe configuration; Stateful Ingestion is available only when a Platform Instance is assigned to this source.                                                                                                                     | | ||||
| Partition Support                                     |   ✅    | Requires recipe configuration; each individual partition topic can be ingested. Behind the scenes, a partitioned topic is implemented as N internal topics, where N is the number of partitions. This feature is disabled by default.        | | ||||
| | [Platform Instance](../../docs/platform-instances.md) |   ✅    | Requires recipe configuration and is mandatory for Stateful Ingestion. A Pulsar instance consists of one or more Pulsar clusters.                                                                                                            | | ||||
| | [Data Domain](../../docs/domains.md)                  |   ✅    | Requires recipe configuration                                                                                                                                                                                                                | | ||||
| | Dataset Profiling                                     |   ❌    |                                                                                                                                                                                                                                              | | ||||
| | Dataset Usage                                         |   ❌    |                                                                                                                                                                                                                                              | | ||||
| | Extract Descriptions                                  |   ❌    |                                                                                                                                                                                                                                              | | ||||
| | Extract Lineage                                       |   ❌    |                                                                                                                                                                                                                                              | | ||||
| | Extract Ownership                                     |   ❌    |                                                                                                                                                                                                                                              | | ||||
| | Extract Tags                                          |   ❌    |                                                                                                                                                                                                                                              | | ||||
| | ...                                                   |        | | ||||
| 
 | ||||
| ## Metadata Ingestion Quickstart | ||||
| 
 | ||||
| For context on getting started with ingestion, check out our [metadata ingestion guide](../README.md). | ||||
| 
 | ||||
| ### Prerequisites | ||||
| 
 | ||||
| In order to ingest metadata from Apache Pulsar, you will need: | ||||
| 
 | ||||
| * Access to a Pulsar instance and, if authentication is enabled, a valid access token. | ||||
| * Pulsar version >= 2.7.0 | ||||
| * ... | ||||
| 
 | ||||
| > **_NOTE:_**  A _superUser_ role is required for listing all existing tenants within a Pulsar instance. | ||||
| > | ||||
| 
 | ||||
| ### Install the Plugin(s) | ||||
| 
 | ||||
| Run the following commands to install the relevant plugin(s): | ||||
| 
 | ||||
| `pip install 'acryl-datahub[pulsar]'` | ||||
| 
 | ||||
| ### Configure the Ingestion Recipe(s) | ||||
| 
 | ||||
| Use the following recipe(s) to get started with ingestion. See [below](#config-details) for full configuration options. | ||||
| 
 | ||||
| _For general pointers on writing and running a recipe, see our [main recipe guide](../README.md#recipes)._ | ||||
| 
 | ||||
| #### Quickstart recipe | ||||
| A minimal recipe to get you started: | ||||
| ```yml | ||||
| source: | ||||
|   type: pulsar | ||||
|   config: | ||||
|     # Required fields | ||||
|     web_service_url: "http://localhost:8080" | ||||
| 
 | ||||
| sink: | ||||
|   # sink configs | ||||
| ``` | ||||
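|  | ||||
| Once saved to a file, the recipe can be run with the DataHub CLI, e.g. `datahub ingest -c recipe.yml`. | ||||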
| 
 | ||||
| 
 | ||||
| #### Example recipe with authentication | ||||
| An example recipe for ingesting from a Pulsar instance with OAuth authentication and SSL enabled. | ||||
| 
 | ||||
| 
 | ||||
| ```yml | ||||
| source:  | ||||
|   type: "pulsar"  | ||||
|   config: | ||||
|     env: "TEST"  | ||||
|     platform_instance: "local" | ||||
|     ## Pulsar client connection config ##  | ||||
|     web_service_url: "https://localhost:8443" | ||||
|     verify_ssl: "/opt/certs/ca.cert.pem" | ||||
|     # Issuer url for auth document, for example "http://localhost:8083/realms/pulsar" | ||||
|     issuer_url: <issuer_url> | ||||
|     client_id: ${CLIENT_ID} | ||||
|     client_secret: ${CLIENT_SECRET} | ||||
|     # Tenant list to scrape  | ||||
|     tenants: | ||||
|       - tenant_1 | ||||
|       - tenant_2 | ||||
|     # Topic filter pattern  | ||||
|     topic_patterns: | ||||
|       allow: | ||||
|         - ".*sales.*" | ||||
| 
 | ||||
| sink: | ||||
|   # sink configs | ||||
| ``` | ||||
| 
 | ||||
| > **_NOTE:_**  Always use TLS encryption in a production environment and use variable substitution for sensitive information (e.g. ${CLIENT_ID} and ${CLIENT_SECRET}). | ||||
| > | ||||
| 
 | ||||
| ## Config details | ||||
| <details> | ||||
|   <summary>View All Recipe Configuration Options</summary> | ||||
| 
 | ||||
| Note that a `.` is used to denote nested fields in the YAML recipe. | ||||
| 
 | ||||
| | Field                           | Required | Default                 | Description                                                                                                                                                     | | ||||
| |---------------------------------|:--------:|-------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------| | ||||
| | `env`                           |    ❌     | `PROD`                  | The data fabric, defaults to PROD                                                                                                                               | | ||||
| | `platform_instance`             |    ❌     |                         | The Platform instance to use while constructing URNs. Mandatory for Stateful Ingestion                                                                          | | ||||
| | `web_service_url`               |    ✅     | `http://localhost:8080` | The web URL for the cluster.                                                                                                                                    | | ||||
| | `timeout`                       |    ❌     | `5`                     | Timeout in seconds: how long to wait for the Pulsar REST API to send data before giving up.                                                                     | | ||||
| | `verify_ssl`                    |    ❌     | `True`                  | Either a boolean, in which case it controls whether we verify the server's TLS certificate, or a string, in which case it must be a path to a CA bundle to use. | | ||||
| | `issuer_url`                    |    ❌     |                         | The complete URL for a Custom Authorization Server. Mandatory for OAuth based authentication.                                                                   | | ||||
| | `client_id`                     |    ❌     |                         | The application's client ID                                                                                                                                     | | ||||
| | `client_secret`                 |    ❌     |                         | The application's client secret                                                                                                                                 | | ||||
| | `token`                         |    ❌     |                         | The access token for the application. Mandatory for token based authentication.                                                                                 | | ||||
| | `tenant_patterns.allow`         |    ❌     | `.*`                    | List of regex patterns for tenants to include in ingestion. By default all tenants are allowed.                                                                 | | ||||
| | `tenant_patterns.deny`          |    ❌     | `pulsar`                | List of regex patterns for tenants to exclude from ingestion. By default the Pulsar system tenant is denied.                                                    | | ||||
| | `tenant_patterns.ignoreCase`    |    ❌     | `True`                  | Whether to ignore case sensitivity during tenant pattern matching.                                                                                              | | ||||
| | `namespace_patterns.allow`      |    ❌     | `.*`                    | List of regex patterns for namespaces to include in ingestion. By default all namespaces are allowed.                                                           | | ||||
| | `namespace_patterns.deny`       |    ❌     | `public/functions`      | List of regex patterns for namespaces to exclude from ingestion. By default the functions namespace is denied.                                                  | | ||||
| | `namespace_patterns.ignoreCase` |    ❌     | `True`                  | Whether to ignore case sensitivity during namespace pattern matching.                                                                                           | | ||||
| | `topic_patterns.allow`          |    ❌     | `.*`                    | List of regex patterns for topics to include in ingestion. By default all topics are allowed.                                                                   | | ||||
| | `topic_patterns.deny`           |    ❌     | `/__.*$`                | List of regex patterns for topics to exclude from ingestion. By default the Pulsar system topics are denied.                                                    | | ||||
| | `topic_patterns.ignoreCase`     |    ❌     | `True`                  | Whether to ignore case sensitivity during topic pattern matching.                                                                                               | | ||||
| | `tenants`                       |    ❌     |                         | Listing all tenants requires the superUser role; alternatively, you can set a list of tenants you want to scrape using the tenant admin role.                  | | ||||
| | `exclude_individual_partitions` |    ❌     | `True`                  | Whether to exclude the individual partition topics of a partitioned topic. When turned off, a topic with 100 partitions will result in 100 `Datasets`.         | | ||||
| | `domain.domain_urn.allow`       |    ❌     |                         | List of regex patterns for topics to assign to the domain with this `domain_urn`. Multiple domain keys can be specified.                                       | | ||||
| | `domain.domain_urn.deny`        |    ❌     |                         | List of regex patterns for topics that should not be assigned to this `domain_urn`. Multiple domain keys can be specified.                                     | | ||||
| | `domain.domain_urn.ignoreCase`  |    ❌     | `True`                  | Whether to ignore case sensitivity during pattern matching. Multiple domain keys can be specified.                                                             | | ||||
| | `stateful_ingestion`            |    ❌     |                         | See [Stateful Ingestion](./stateful_ingestion.md).                                                                                                             | | ||||
| </details> | ||||
| 
 | ||||
| 
 | ||||
| ## Troubleshooting | ||||
| 
 | ||||
| ### [Common Issue] | ||||
| 
 | ||||
| [Provide description of common issues with this integration and steps to resolve] | ||||
							
								
								
									
642  metadata-ingestion/src/datahub/ingestion/source/pulsar.py (new file)
							| @ -0,0 +1,642 @@ | ||||
| import json | ||||
| import logging | ||||
| import re | ||||
| from dataclasses import dataclass | ||||
| from hashlib import md5 | ||||
| from typing import Iterable, List, Optional, Tuple, cast | ||||
| 
 | ||||
| import requests | ||||
| 
 | ||||
| from datahub.configuration.common import ConfigurationError | ||||
| from datahub.emitter.mce_builder import ( | ||||
|     make_data_platform_urn, | ||||
|     make_dataplatform_instance_urn, | ||||
|     make_dataset_urn_with_platform_instance, | ||||
|     make_domain_urn, | ||||
| ) | ||||
| from datahub.emitter.mcp import MetadataChangeProposalWrapper | ||||
| from datahub.emitter.mcp_builder import add_domain_to_entity_wu | ||||
| from datahub.ingestion.api.common import PipelineContext | ||||
| from datahub.ingestion.api.workunit import MetadataWorkUnit | ||||
| from datahub.ingestion.extractor import schema_util | ||||
| from datahub.ingestion.source.state.checkpoint import Checkpoint | ||||
| from datahub.ingestion.source.state.kafka_state import KafkaCheckpointState | ||||
| from datahub.ingestion.source.state.stateful_ingestion_base import ( | ||||
|     JobId, | ||||
|     StatefulIngestionSourceBase, | ||||
| ) | ||||
| from datahub.ingestion.source_config.pulsar import PulsarSourceConfig | ||||
| from datahub.ingestion.source_report.pulsar import PulsarSourceReport | ||||
| from datahub.metadata.com.linkedin.pegasus2avro.common import StatusClass | ||||
| from datahub.metadata.com.linkedin.pegasus2avro.schema import ( | ||||
|     KafkaSchema, | ||||
|     SchemaField, | ||||
|     SchemaMetadata, | ||||
| ) | ||||
| from datahub.metadata.schema_classes import ( | ||||
|     BrowsePathsClass, | ||||
|     ChangeTypeClass, | ||||
|     DataPlatformInstanceClass, | ||||
|     DatasetPropertiesClass, | ||||
|     JobStatusClass, | ||||
|     SubTypesClass, | ||||
| ) | ||||
| 
 | ||||
| logger = logging.getLogger(__name__) | ||||
| 
 | ||||
| 
 | ||||
| class PulsarTopic(object): | ||||
|     __slots__ = ["topic_parts", "fullname", "type", "tenant", "namespace", "topic"] | ||||
| 
 | ||||
|     def __init__(self, topic): | ||||
|         topic_parts = re.split("[: /]", topic) | ||||
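|         # e.g. "persistent://tenant/namespace/topic" -> ["persistent", "", "", "tenant", "namespace", "topic"] | ||||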
|         self.fullname = topic | ||||
|         self.type = topic_parts[0] | ||||
|         self.tenant = topic_parts[3] | ||||
|         self.namespace = topic_parts[4] | ||||
|         self.topic = topic_parts[5] | ||||
| 
 | ||||
| 
 | ||||
| class PulsarSchema(object): | ||||
|     __slots__ = [ | ||||
|         "schema_version", | ||||
|         "schema_name", | ||||
|         "schema_description", | ||||
|         "schema_type", | ||||
|         "schema_str", | ||||
|         "properties", | ||||
|     ] | ||||
| 
 | ||||
|     def __init__(self, schema): | ||||
|         self.schema_version = schema.get("version") | ||||
| 
 | ||||
|         avro_schema = json.loads(schema.get("data")) | ||||
|         self.schema_name = avro_schema.get("namespace") + "." + avro_schema.get("name") | ||||
|         self.schema_description = avro_schema.get("doc") | ||||
|         self.schema_type = schema.get("type") | ||||
|         self.schema_str = schema.get("data") | ||||
|         self.properties = schema.get("properties") | ||||
| 
 | ||||
| 
 | ||||
| @dataclass | ||||
| class PulsarSource(StatefulIngestionSourceBase): | ||||
|     def __init__(self, config: PulsarSourceConfig, ctx: PipelineContext): | ||||
|         super().__init__(config, ctx) | ||||
|         self.platform: str = "pulsar" | ||||
|         self.config: PulsarSourceConfig = config | ||||
|         self.report: PulsarSourceReport = PulsarSourceReport() | ||||
|         self.base_url: str = self.config.web_service_url + "/admin/v2" | ||||
|         self.tenants: List[str] = config.tenants | ||||
| 
 | ||||
|         if ( | ||||
|             self.is_stateful_ingestion_configured() | ||||
|             and not self.config.platform_instance | ||||
|         ): | ||||
|             raise ConfigurationError( | ||||
|                 "Enabling Pulsar stateful ingestion requires specifying a platform instance." | ||||
|             ) | ||||
| 
 | ||||
|         self.session = requests.Session() | ||||
|         self.session.verify = self.config.verify_ssl | ||||
|         self.session.headers.update( | ||||
|             { | ||||
|                 "Content-Type": "application/json", | ||||
|             } | ||||
|         ) | ||||
| 
 | ||||
|         if self._is_oauth_authentication_configured(): | ||||
|             # Get OpenId configuration from issuer, e.g. token_endpoint | ||||
|             oid_config_url = ( | ||||
|                 "%s/.well-known/openid-configuration" % self.config.issuer_url | ||||
|             ) | ||||
|             oid_config_response = requests.get( | ||||
|                 oid_config_url, verify=False, allow_redirects=False | ||||
|             ) | ||||
| 
 | ||||
|             if oid_config_response: | ||||
|                 self.config.oid_config.update(oid_config_response.json()) | ||||
|             else: | ||||
|                 logger.error( | ||||
|                     "Unexpected response while getting discovery document using %s : %s" | ||||
|                     % (oid_config_url, oid_config_response) | ||||
|                 ) | ||||
| 
 | ||||
|             if "token_endpoint" not in self.config.oid_config: | ||||
|                 raise Exception( | ||||
|                     "The token_endpoint is not set, please verify the configured issuer_url or" | ||||
|                     " set oid_config.token_endpoint manually in the configuration file." | ||||
|                 ) | ||||
| 
 | ||||
|         # Authentication configured | ||||
|         if ( | ||||
|             self._is_token_authentication_configured() | ||||
|             or self._is_oauth_authentication_configured() | ||||
|         ): | ||||
|             # Update session header with Bearer token | ||||
|             self.session.headers.update( | ||||
|                 {"Authorization": f"Bearer {self.get_access_token()}"} | ||||
|             ) | ||||
| 
 | ||||
|     def get_access_token(self) -> str: | ||||
|         """ | ||||
|         Returns an access token used for authentication; the token comes from the config or from a third-party | ||||
|         provider when issuer_url is provided. | ||||
|         """ | ||||
|         # JWT, get access token (jwt) from config | ||||
|         if self._is_token_authentication_configured(): | ||||
|             return str(self.config.token) | ||||
| 
 | ||||
|         # OAuth, connect to issuer and return access token | ||||
|         if self._is_oauth_authentication_configured(): | ||||
|             assert self.config.client_id | ||||
|             assert self.config.client_secret | ||||
|             data = {"grant_type": "client_credentials"} | ||||
|             try: | ||||
|                 # Get a token from the issuer | ||||
|                 token_endpoint = self.config.oid_config["token_endpoint"] | ||||
|                 logger.info(f"Request access token from {token_endpoint}") | ||||
|                 token_response = requests.post( | ||||
|                     url=token_endpoint, | ||||
|                     data=data, | ||||
|                     verify=False, | ||||
|                     allow_redirects=False, | ||||
|                     auth=( | ||||
|                         self.config.client_id, | ||||
|                         self.config.client_secret, | ||||
|                     ), | ||||
|                 ) | ||||
|                 token_response.raise_for_status() | ||||
| 
 | ||||
|                 return token_response.json()["access_token"] | ||||
| 
 | ||||
|             except requests.exceptions.RequestException as e: | ||||
|                 logger.error(f"An error occurred while handling your request: {e}") | ||||
|         # Failed to get an access token, | ||||
|         raise ConfigurationError( | ||||
|             f"Failed to get the Pulsar access token from token_endpoint {self.config.oid_config.get('token_endpoint')}." | ||||
|             f" Please check your input configuration." | ||||
|         ) | ||||
| 
 | ||||
|     def _get_pulsar_metadata(self, url): | ||||
|         """ | ||||
|         Interacts with the Pulsar Admin Api and returns Pulsar metadata. Invocations with insufficient privileges | ||||
|         are logged. | ||||
|         """ | ||||
|         try: | ||||
|             # Request the Pulsar metadata | ||||
|             response = self.session.get(url, timeout=self.config.timeout) | ||||
|             response.raise_for_status() | ||||
|             # Return the response for status_code 200 | ||||
|             return response.json() | ||||
| 
 | ||||
|         except requests.exceptions.HTTPError as http_error: | ||||
|             # Topics can exist without a schema, log the warning and move on | ||||
|             if http_error.response.status_code == 404 and "/schemas/" in url: | ||||
|                 message = ( | ||||
|                     f"Failed to get schema from schema registry. The topic is either schema-less or" | ||||
|                     f" no messages have been written to the topic yet." | ||||
|                     f" {http_error}" | ||||
|                 ) | ||||
|                 self.report.report_warning("NoSchemaFound", message) | ||||
|             else: | ||||
|                 # Authorization error | ||||
|                 message = f"An HTTP error occurred: {http_error}" | ||||
|                 self.report.report_warning("HTTPError", message) | ||||
|         except requests.exceptions.RequestException as e: | ||||
|             raise Exception( | ||||
|                 f"An ambiguous exception occurred while handling the request: {e}" | ||||
|             ) | ||||
| 
 | ||||
|     def is_checkpointing_enabled(self, job_id: JobId) -> bool: | ||||
|         return bool( | ||||
|             job_id == self.get_default_ingestion_job_id() | ||||
|             and self.is_stateful_ingestion_configured() | ||||
|             and self.config.stateful_ingestion | ||||
|             and self.config.stateful_ingestion.remove_stale_metadata | ||||
|         ) | ||||
| 
 | ||||
|     def get_default_ingestion_job_id(self) -> JobId: | ||||
|         """ | ||||
|         Default ingestion job name for the Pulsar source. | ||||
|         """ | ||||
|         return JobId("ingest_from_pulsar_source") | ||||
| 
 | ||||
|     def create_checkpoint(self, job_id: JobId) -> Optional[Checkpoint]: | ||||
|         """ | ||||
|         Create a custom checkpoint with empty state for the job. | ||||
|         """ | ||||
|         assert self.ctx.pipeline_name is not None | ||||
|         if job_id == self.get_default_ingestion_job_id(): | ||||
|             return Checkpoint( | ||||
|                 job_name=job_id, | ||||
|                 pipeline_name=self.ctx.pipeline_name, | ||||
|                 platform_instance_id=self.get_platform_instance_id(), | ||||
|                 run_id=self.ctx.run_id, | ||||
|                 config=self.config, | ||||
|                 # TODO Create a PulsarCheckpointState ? | ||||
|                 state=KafkaCheckpointState(), | ||||
|             ) | ||||
|         return None | ||||
| 
 | ||||
|     def get_platform_instance_id(self) -> str: | ||||
|         assert self.config.platform_instance is not None | ||||
|         return self.config.platform_instance | ||||
| 
 | ||||
|     @classmethod | ||||
|     def create(cls, config_dict, ctx): | ||||
|         config = PulsarSourceConfig.parse_obj(config_dict) | ||||
| 
 | ||||
|         # Do not include each individual partition for partitioned topics, | ||||
|         if config.exclude_individual_partitions: | ||||
|             config.topic_patterns.deny.append(r".*-partition-[0-9]+") | ||||
| 
 | ||||
|         return cls(config, ctx) | ||||
| 
 | ||||
|     def soft_delete_dataset(self, urn: str, type: str) -> Iterable[MetadataWorkUnit]: | ||||
|         logger.debug(f"Soft-deleting stale entity of type {type} - {urn}.") | ||||
|         mcp = MetadataChangeProposalWrapper( | ||||
|             entityType="dataset", | ||||
|             entityUrn=urn, | ||||
|             changeType=ChangeTypeClass.UPSERT, | ||||
|             aspectName="status", | ||||
|             aspect=StatusClass(removed=True), | ||||
|         ) | ||||
|         wu = MetadataWorkUnit(id=f"soft-delete-{type}-{urn}", mcp=mcp) | ||||
|         self.report.report_workunit(wu) | ||||
|         self.report.report_stale_entity_soft_deleted(urn) | ||||
|         yield wu | ||||
| 
 | ||||
|     def gen_removed_entity_workunits(self) -> Iterable[MetadataWorkUnit]: | ||||
|         last_checkpoint = self.get_last_checkpoint( | ||||
|             self.get_default_ingestion_job_id(), KafkaCheckpointState | ||||
|         ) | ||||
|         cur_checkpoint = self.get_current_checkpoint( | ||||
|             self.get_default_ingestion_job_id() | ||||
|         ) | ||||
|         if ( | ||||
|             self.config.stateful_ingestion | ||||
|             and self.config.stateful_ingestion.remove_stale_metadata | ||||
|             and last_checkpoint is not None | ||||
|             and last_checkpoint.state is not None | ||||
|             and cur_checkpoint is not None | ||||
|             and cur_checkpoint.state is not None | ||||
|         ): | ||||
|             logger.debug("Checking for stale entity removal.") | ||||
| 
 | ||||
|             last_checkpoint_state = cast(KafkaCheckpointState, last_checkpoint.state) | ||||
|             cur_checkpoint_state = cast(KafkaCheckpointState, cur_checkpoint.state) | ||||
| 
 | ||||
|             for topic_urn in last_checkpoint_state.get_topic_urns_not_in( | ||||
|                 cur_checkpoint_state | ||||
|             ): | ||||
|                 yield from self.soft_delete_dataset(topic_urn, "topic") | ||||
| 
 | ||||
|     def get_workunits(self) -> Iterable[MetadataWorkUnit]: | ||||
|         """ | ||||
|         Interacts with the Pulsar Admin Api and loops over tenants, namespaces and topics. For every topic | ||||
|         the schema information is retrieved if available. | ||||
| 
 | ||||
|         Pulsar web service admin rest api urls for retrieving topic information | ||||
|             - [web_service_url]/admin/v2/persistent/{tenant}/{namespace} | ||||
|             - [web_service_url]/admin/v2/persistent/{tenant}/{namespace}/partitioned | ||||
|             - [web_service_url]/admin/v2/non-persistent/{tenant}/{namespace} | ||||
|             - [web_service_url]/admin/v2/non-persistent/{tenant}/{namespace}/partitioned | ||||
|         """ | ||||
|         topic_urls = [ | ||||
|             self.base_url + "/persistent/{}", | ||||
|             self.base_url + "/persistent/{}/partitioned", | ||||
|             self.base_url + "/non-persistent/{}", | ||||
|             self.base_url + "/non-persistent/{}/partitioned", | ||||
|         ] | ||||
| 
 | ||||
|         # Report the Pulsar broker version we are communicating with | ||||
|         self.report.report_pulsar_version( | ||||
|             self.session.get( | ||||
|                 "%s/brokers/version" % self.base_url, | ||||
|                 timeout=self.config.timeout, | ||||
|             ).text | ||||
|         ) | ||||
| 
 | ||||
|         # If no tenants are provided, request all tenants from cluster using /admin/v2/tenants endpoint. | ||||
|         # Requesting cluster tenant information requires superuser privileges | ||||
|         if not self.tenants: | ||||
|             self.tenants = self._get_pulsar_metadata(self.base_url + "/tenants") or [] | ||||
| 
 | ||||
|         # Initialize counters | ||||
|         self.report.tenants_scanned = 0 | ||||
|         self.report.namespaces_scanned = 0 | ||||
|         self.report.topics_scanned = 0 | ||||
| 
 | ||||
|         for tenant in self.tenants: | ||||
|             self.report.tenants_scanned += 1 | ||||
|             if self.config.tenant_patterns.allowed(tenant): | ||||
|                 # Get namespaces belonging to a tenant, /admin/v2/%s/namespaces | ||||
|                 # A tenant admin role has sufficient privileges to perform this action | ||||
|                 namespaces = ( | ||||
|                     self._get_pulsar_metadata(self.base_url + "/namespaces/%s" % tenant) | ||||
|                     or [] | ||||
|                 ) | ||||
|                 for namespace in namespaces: | ||||
|                     self.report.namespaces_scanned += 1 | ||||
|                     if self.config.namespace_patterns.allowed(namespace): | ||||
|                         # Get all topics (persistent, non-persistent and partitioned) belonging to a tenant/namespace | ||||
|                         # Four endpoint invocations are needed to get all topic metadata for a namespace | ||||
|                         topics = {} | ||||
|                         for url in topic_urls: | ||||
|                             # Topics are partitioned when admin url ends with /partitioned | ||||
|                             partitioned = url.endswith("/partitioned") | ||||
|                             # Get the topics for each type | ||||
|                             pulsar_topics = ( | ||||
|                                 self._get_pulsar_metadata(url.format(namespace)) or [] | ||||
|                             ) | ||||
|                             # Create a map of topics with their partitioned flag; the | ||||
|                             # partitioned info is added as a custom property later | ||||
|                             topics.update( | ||||
|                                 {topic: partitioned for topic in pulsar_topics} | ||||
|                             ) | ||||
| 
 | ||||
|                         # For all allowed topics get the metadata | ||||
|                         for topic, is_partitioned in topics.items(): | ||||
|                             self.report.topics_scanned += 1 | ||||
|                             if self.config.topic_patterns.allowed(topic): | ||||
| 
 | ||||
|                                 yield from self._extract_record(topic, is_partitioned) | ||||
|                                 # Add topic to checkpoint if stateful ingestion is enabled | ||||
|                                 if self.is_stateful_ingestion_configured(): | ||||
|                                     self._add_topic_to_checkpoint(topic) | ||||
|                             else: | ||||
|                                 self.report.report_topics_dropped(topic) | ||||
| 
 | ||||
|                         if self.is_stateful_ingestion_configured(): | ||||
|                             # Clean up stale entities. | ||||
|                             yield from self.gen_removed_entity_workunits() | ||||
| 
 | ||||
|                     else: | ||||
|                         self.report.report_namespaces_dropped(namespace) | ||||
|             else: | ||||
|                 self.report.report_tenants_dropped(tenant) | ||||
| 
 | ||||
|     def _add_topic_to_checkpoint(self, topic: str) -> None: | ||||
|         cur_checkpoint = self.get_current_checkpoint( | ||||
|             self.get_default_ingestion_job_id() | ||||
|         ) | ||||
| 
 | ||||
|         if cur_checkpoint is not None: | ||||
|             checkpoint_state = cast(KafkaCheckpointState, cur_checkpoint.state) | ||||
|             checkpoint_state.add_topic_urn( | ||||
|                 make_dataset_urn_with_platform_instance( | ||||
|                     platform=self.platform, | ||||
|                     name=topic, | ||||
|                     platform_instance=self.config.platform_instance, | ||||
|                     env=self.config.env, | ||||
|                 ) | ||||
|             ) | ||||
| 
 | ||||
|     def _is_token_authentication_configured(self) -> bool: | ||||
|         if self.config.token is not None: | ||||
|             return True | ||||
|         return False | ||||
| 
 | ||||
|     def _is_oauth_authentication_configured(self) -> bool: | ||||
|         if self.config.issuer_url is not None: | ||||
|             return True | ||||
|         return False | ||||
| 
 | ||||
|     def _get_schema_and_fields( | ||||
|         self, pulsar_topic: PulsarTopic, is_key_schema: bool | ||||
|     ) -> Tuple[Optional[PulsarSchema], List[SchemaField]]: | ||||
| 
 | ||||
|         pulsar_schema: Optional[PulsarSchema] = None | ||||
| 
 | ||||
|         schema_url = self.base_url + "/schemas/%s/%s/%s/schema" % ( | ||||
|             pulsar_topic.tenant, | ||||
|             pulsar_topic.namespace, | ||||
|             pulsar_topic.topic, | ||||
|         ) | ||||
| 
 | ||||
|         schema_payload = self._get_pulsar_metadata(schema_url) | ||||
| 
 | ||||
|         # Get the type and schema from the Pulsar Schema | ||||
|         if schema_payload is not None: | ||||
|             pulsar_schema = PulsarSchema(schema_payload) | ||||
| 
 | ||||
|         # Obtain the schema fields from schema for the topic. | ||||
|         fields: List[SchemaField] = [] | ||||
|         if pulsar_schema is not None: | ||||
|             fields = self._get_schema_fields( | ||||
|                 pulsar_topic=pulsar_topic, | ||||
|                 schema=pulsar_schema, | ||||
|                 is_key_schema=is_key_schema, | ||||
|             ) | ||||
|         return pulsar_schema, fields | ||||
| 
 | ||||
|     def _get_schema_fields( | ||||
|         self, pulsar_topic: PulsarTopic, schema: PulsarSchema, is_key_schema: bool | ||||
|     ) -> List[SchemaField]: | ||||
|         # Parse the schema and convert it to SchemaFields. | ||||
|         fields: List[SchemaField] = [] | ||||
|         if schema.schema_type == "AVRO" or schema.schema_type == "JSON": | ||||
|             # Extract fields from schema and get the FQN for the schema | ||||
|             fields = schema_util.avro_schema_to_mce_fields( | ||||
|                 schema.schema_str, is_key_schema=is_key_schema | ||||
|             ) | ||||
|         else: | ||||
|             self.report.report_warning( | ||||
|                 pulsar_topic.fullname, | ||||
|                 f"Parsing Pulsar schema type {schema.schema_type} is currently not implemented", | ||||
|             ) | ||||
|         return fields | ||||
| 
 | ||||
|     def _get_schema_metadata( | ||||
|         self, pulsar_topic: PulsarTopic, platform_urn: str | ||||
|     ) -> Tuple[Optional[PulsarSchema], Optional[SchemaMetadata]]: | ||||
| 
 | ||||
|         schema, fields = self._get_schema_and_fields( | ||||
|             pulsar_topic=pulsar_topic, is_key_schema=False | ||||
|         )  # type: Tuple[Optional[PulsarSchema], List[SchemaField]] | ||||
| 
 | ||||
|         # Create the schemaMetadata aspect. | ||||
|         if schema is not None: | ||||
|             md5_hash = md5(schema.schema_str.encode()).hexdigest() | ||||
| 
 | ||||
|             return schema, SchemaMetadata( | ||||
|                 schemaName=schema.schema_name, | ||||
|                 version=schema.schema_version, | ||||
|                 hash=md5_hash, | ||||
|                 platform=platform_urn, | ||||
|                 platformSchema=KafkaSchema( | ||||
|                     documentSchema=schema.schema_str if schema is not None else "", | ||||
|                     keySchema=None, | ||||
|                 ), | ||||
|                 fields=fields, | ||||
|             ) | ||||
|         return None, None | ||||
| 
 | ||||
|     def _extract_record( | ||||
|         self, topic: str, partitioned: bool | ||||
|     ) -> Iterable[MetadataWorkUnit]: | ||||
|         logger.info(f"topic = {topic}") | ||||
| 
 | ||||
|         # 1. Create and emit the default dataset for the topic. Extract type, tenant, namespace | ||||
|         # and topic name from full Pulsar topic name i.e. persistent://tenant/namespace/topic | ||||
|         pulsar_topic = PulsarTopic(topic) | ||||
| 
 | ||||
|         platform_urn = make_data_platform_urn(self.platform) | ||||
|         dataset_urn = make_dataset_urn_with_platform_instance( | ||||
|             platform=self.platform, | ||||
|             name=pulsar_topic.fullname, | ||||
|             platform_instance=self.config.platform_instance, | ||||
|             env=self.config.env, | ||||
|         ) | ||||
| 
 | ||||
|         status_wu = MetadataWorkUnit( | ||||
|             id=f"{dataset_urn}-status", | ||||
|             mcp=MetadataChangeProposalWrapper( | ||||
|                 entityType="dataset", | ||||
|                 changeType=ChangeTypeClass.UPSERT, | ||||
|                 entityUrn=dataset_urn, | ||||
|                 aspectName="status", | ||||
|                 aspect=StatusClass(removed=False), | ||||
|             ), | ||||
|         ) | ||||
|         self.report.report_workunit(status_wu) | ||||
|         yield status_wu | ||||
| 
 | ||||
|         # 2. Emit schemaMetadata aspect | ||||
|         schema, schema_metadata = self._get_schema_metadata(pulsar_topic, platform_urn) | ||||
|         if schema_metadata is not None: | ||||
|             schema_metadata_wu = MetadataWorkUnit( | ||||
|                 id=f"{dataset_urn}-schemaMetadata", | ||||
|                 mcp=MetadataChangeProposalWrapper( | ||||
|                     entityType="dataset", | ||||
|                     changeType=ChangeTypeClass.UPSERT, | ||||
|                     entityUrn=dataset_urn, | ||||
|                     aspectName="schemaMetadata", | ||||
|                     aspect=schema_metadata, | ||||
|                 ), | ||||
|             ) | ||||
|             self.report.report_workunit(schema_metadata_wu) | ||||
|             yield schema_metadata_wu | ||||
| 
 | ||||
|         # TODO Add topic properties (Pulsar 2.10.0 feature) | ||||
|         # 3. Construct and emit dataset properties aspect | ||||
|         if schema is not None: | ||||
|             schema_properties = { | ||||
|                 "schema_version": str(schema.schema_version), | ||||
|                 "schema_type": schema.schema_type, | ||||
|                 "partitioned": str(partitioned).lower(), | ||||
|             } | ||||
|             # Add some static properties to the schema properties | ||||
|             schema.properties.update(schema_properties) | ||||
| 
 | ||||
|             dataset_properties_wu = MetadataWorkUnit( | ||||
|                 id=f"{dataset_urn}-datasetProperties", | ||||
|                 mcp=MetadataChangeProposalWrapper( | ||||
|                     entityType="dataset", | ||||
|                     changeType=ChangeTypeClass.UPSERT, | ||||
|                     entityUrn=dataset_urn, | ||||
|                     aspectName="datasetProperties", | ||||
|                     aspect=DatasetPropertiesClass( | ||||
|                         description=schema.schema_description, | ||||
|                         customProperties=schema.properties, | ||||
|                     ), | ||||
|                 ), | ||||
|             ) | ||||
|             self.report.report_workunit(dataset_properties_wu) | ||||
|             yield dataset_properties_wu | ||||
| 
 | ||||
|         # 4. Emit browsePaths aspect | ||||
|         pulsar_path = ( | ||||
|             f"{pulsar_topic.tenant}/{pulsar_topic.namespace}/{pulsar_topic.topic}" | ||||
|         ) | ||||
|         browse_path_suffix = ( | ||||
|             f"{self.config.platform_instance}/{pulsar_path}" | ||||
|             if self.config.platform_instance | ||||
|             else pulsar_path | ||||
|         ) | ||||
| 
 | ||||
|         browse_path_wu = MetadataWorkUnit( | ||||
|             id=f"{dataset_urn}-browsePaths", | ||||
|             mcp=MetadataChangeProposalWrapper( | ||||
|                 entityType="dataset", | ||||
|                 changeType=ChangeTypeClass.UPSERT, | ||||
|                 entityUrn=dataset_urn, | ||||
|                 aspectName="browsePaths", | ||||
|                 aspect=BrowsePathsClass( | ||||
|                     [f"/{self.config.env.lower()}/{self.platform}/{browse_path_suffix}"] | ||||
|                 ), | ||||
|             ), | ||||
|         ) | ||||
|         self.report.report_workunit(browse_path_wu) | ||||
|         yield browse_path_wu | ||||
| 
 | ||||
|         # 5. Emit dataPlatformInstance aspect. | ||||
|         if self.config.platform_instance: | ||||
|             platform_instance_wu = MetadataWorkUnit( | ||||
|                 id=f"{dataset_urn}-dataPlatformInstance", | ||||
|                 mcp=MetadataChangeProposalWrapper( | ||||
|                     entityType="dataset", | ||||
|                     changeType=ChangeTypeClass.UPSERT, | ||||
|                     entityUrn=dataset_urn, | ||||
|                     aspectName="dataPlatformInstance", | ||||
|                     aspect=DataPlatformInstanceClass( | ||||
|                         platform=platform_urn, | ||||
|                         instance=make_dataplatform_instance_urn( | ||||
|                             self.platform, self.config.platform_instance | ||||
|                         ), | ||||
|                     ), | ||||
|                 ), | ||||
|             ) | ||||
|             self.report.report_workunit(platform_instance_wu) | ||||
|             yield platform_instance_wu | ||||
| 
 | ||||
|         # 6. Emit subtype aspect marking this as a "topic" | ||||
|         subtype_wu = MetadataWorkUnit( | ||||
|             id=f"{dataset_urn}-subTypes", | ||||
|             mcp=MetadataChangeProposalWrapper( | ||||
|                 entityType="dataset", | ||||
|                 changeType=ChangeTypeClass.UPSERT, | ||||
|                 entityUrn=dataset_urn, | ||||
|                 aspectName="subTypes", | ||||
|                 aspect=SubTypesClass(typeNames=["topic"]), | ||||
|             ), | ||||
|         ) | ||||
|         self.report.report_workunit(subtype_wu) | ||||
|         yield subtype_wu | ||||
| 
 | ||||
|         # 7. Emit domains aspect | ||||
|         domain_urn: Optional[str] = None | ||||
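|         # If multiple domain patterns match, the last matching domain in the config wins | ||||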
|         for domain, pattern in self.config.domain.items(): | ||||
|             if pattern.allowed(pulsar_topic.fullname): | ||||
|                 domain_urn = make_domain_urn(domain) | ||||
| 
 | ||||
|         if domain_urn: | ||||
|             wus = add_domain_to_entity_wu( | ||||
|                 entity_type="dataset", | ||||
|                 entity_urn=dataset_urn, | ||||
|                 domain_urn=domain_urn, | ||||
|             ) | ||||
|             for wu in wus: | ||||
|                 self.report.report_workunit(wu) | ||||
|                 yield wu | ||||
| 
 | ||||
|     def get_report(self): | ||||
|         return self.report | ||||
| 
 | ||||
|     def update_default_job_run_summary(self) -> None: | ||||
|         summary = self.get_job_run_summary(self.get_default_ingestion_job_id()) | ||||
|         if summary is not None: | ||||
|             # For now just add the config and the report. | ||||
|             summary.config = self.config.json() | ||||
|             summary.custom_summary = self.report.as_string() | ||||
|             summary.runStatus = ( | ||||
|                 JobStatusClass.FAILED | ||||
|                 if self.get_report().failures | ||||
|                 else JobStatusClass.COMPLETED | ||||
|             ) | ||||
| 
 | ||||
|     def close(self): | ||||
|         self.update_default_job_run_summary() | ||||
|         self.prepare_for_commit() | ||||
|         self.session.close() | ||||
							
								
								
									
111  metadata-ingestion/src/datahub/ingestion/source_config/pulsar.py (new file)
							| @ -0,0 +1,111 @@ | ||||
| import re | ||||
| from typing import Dict, List, Optional, Union | ||||
| from urllib.parse import urlparse | ||||
| 
 | ||||
| from pydantic import Field, validator | ||||
| 
 | ||||
| from datahub.configuration.common import AllowDenyPattern, ConfigurationError | ||||
| from datahub.configuration.source_common import DEFAULT_ENV, DatasetSourceConfigBase | ||||
| from datahub.ingestion.source.state.stateful_ingestion_base import ( | ||||
|     StatefulIngestionConfig, | ||||
|     StatefulIngestionConfigBase, | ||||
| ) | ||||
| from datahub.utilities import config_clean | ||||
| 
 | ||||
| 
 | ||||
| class PulsarSourceStatefulIngestionConfig(StatefulIngestionConfig): | ||||
|     """ | ||||
|     Specialization of the basic StatefulIngestionConfig to add custom config. | ||||
|     This will be used to override the stateful_ingestion config param of StatefulIngestionConfigBase | ||||
|     in the PulsarSourceConfig. | ||||
|     """ | ||||
| 
 | ||||
|     remove_stale_metadata: bool = True | ||||
| 
 | ||||
| 
 | ||||
| def _is_valid_hostname(hostname: str) -> bool: | ||||
|     """ | ||||
|     Loose ASCII hostname validation. A hostname is considered valid when its total length does not exceed 253 | ||||
|     characters, it contains only valid characters and each label is at most 63 octets. | ||||
|     """ | ||||
|     if len(hostname) > 253: | ||||
|         return False | ||||
|     # Hostnames ending on a dot are valid, if present strip exactly one | ||||
|     if hostname[-1] == ".": | ||||
|         hostname = hostname[:-1] | ||||
|     allowed = re.compile(r"(?!-)[A-Z\d-]{1,63}(?<!-)$", re.IGNORECASE) | ||||
|     return all(allowed.match(x) for x in hostname.split(".")) | ||||
| 
 | ||||
| 
 | ||||
| class PulsarSourceConfig(StatefulIngestionConfigBase, DatasetSourceConfigBase): | ||||
|     env: str = DEFAULT_ENV | ||||
|     # The web URL for the cluster. | ||||
|     web_service_url: str = "http://localhost:8080" | ||||
|     # Timeout in seconds: how long to wait for the Pulsar REST API to send data before giving up | ||||
|     timeout: int = 5 | ||||
|     # Mandatory for oauth authentication | ||||
|     issuer_url: Optional[str] = None | ||||
|     client_id: Optional[str] = None | ||||
|     client_secret: Optional[str] = None | ||||
|     # Mandatory for token authentication | ||||
|     token: Optional[str] = None | ||||
|     # Either a boolean, in which case it controls whether we verify the server's TLS certificate, or a string, | ||||
|     # in which case it must be a path to a CA bundle to use. | ||||
|     verify_ssl: Union[bool, str] = True | ||||
|     # By default, allow all topics and deny the pulsar system topics | ||||
|     tenant_patterns: AllowDenyPattern = AllowDenyPattern(allow=[".*"], deny=["pulsar"]) | ||||
|     namespace_patterns: AllowDenyPattern = AllowDenyPattern( | ||||
|         allow=[".*"], deny=["public/functions"] | ||||
|     ) | ||||
|     topic_patterns: AllowDenyPattern = AllowDenyPattern(allow=[".*"], deny=["/__.*$"]) | ||||
|     # Exclude the individual partition topics, e.g. topics ending in -partition-N where N is a number | ||||
|     exclude_individual_partitions: bool = True | ||||
|     # Listing all tenants requires the superUser role; alternatively, you can set the tenants you want to scrape | ||||
|     # using the tenant admin role | ||||
|     tenants: List[str] = [] | ||||
| 
 | ||||
|     domain: Dict[str, AllowDenyPattern] = dict() | ||||
|     # Custom Stateful Ingestion settings | ||||
|     stateful_ingestion: Optional[PulsarSourceStatefulIngestionConfig] = None | ||||
| 
 | ||||
|     # Placeholder for OpenId discovery document | ||||
|     oid_config: dict = Field(default_factory=dict) | ||||
| 
 | ||||
|     @validator("token") | ||||
|     def ensure_only_issuer_or_token( | ||||
|         cls, token: Optional[str], values: Dict[str, Optional[str]] | ||||
|     ) -> Optional[str]: | ||||
|         if token is not None and values.get("issuer_url") is not None: | ||||
|             raise ConfigurationError( | ||||
|                 "Expected only one authentication method, either issuer_url or token." | ||||
|             ) | ||||
|         return token | ||||
| 
 | ||||
|     @validator("client_secret", always=True) | ||||
|     def ensure_client_id_and_secret_for_issuer_url( | ||||
|         cls, client_secret: Optional[str], values: Dict[str, Optional[str]] | ||||
|     ) -> Optional[str]: | ||||
|         if values.get("issuer_url") is not None and ( | ||||
|             client_secret is None or values.get("client_id") is None | ||||
|         ): | ||||
|             raise ConfigurationError( | ||||
|                 "Missing configuration: client_id and client_secret are mandatory when issuer_url is set." | ||||
|             ) | ||||
|         return client_secret | ||||
| 
 | ||||
|     @validator("web_service_url") | ||||
|     def web_service_url_scheme_host_port(cls, val: str) -> str: | ||||
|         # Tokenize the web url | ||||
|         url = urlparse(val) | ||||
| 
 | ||||
|         if url.scheme not in ["http", "https"]: | ||||
|             raise ConfigurationError( | ||||
|                 f"Scheme should be http or https, found {url.scheme}" | ||||
|             ) | ||||
| 
 | ||||
|         if not _is_valid_hostname(str(url.hostname)): | ||||
|             raise ConfigurationError( | ||||
|                 f"Not a valid hostname, hostname contains invalid characters, found {url.hostname}" | ||||
|             ) | ||||
| 
 | ||||
|         return config_clean.remove_trailing_slashes(val) | ||||
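As a quick usage sketch of the configuration above (illustrative values; this snippet is not part of the committed source), the validators make token and OAuth authentication mutually exclusive and require a client id and secret whenever an issuer_url is given:

from datahub.configuration.common import ConfigurationError

# Token authentication on its own is accepted.
token_config = PulsarSourceConfig(
    web_service_url="http://localhost:8080", token="jwt_token"
)

# Combining a token with an issuer_url trips ensure_only_issuer_or_token.
try:
    PulsarSourceConfig(
        web_service_url="http://localhost:8080",
        token="jwt_token",
        issuer_url="http://localhost:8083/realms/pulsar",
    )
except ConfigurationError:
    pass  # "Expected only one authentication method, either issuer_url or token."

# OAuth needs client_id and client_secret alongside issuer_url
# (ensure_client_id_and_secret_for_issuer_url).
oauth_config = PulsarSourceConfig(
    web_service_url="http://localhost:8080",
    issuer_url="http://localhost:8083/realms/pulsar",
    client_id="client_id",
    client_secret="client_secret",
)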
| @ -0,0 +1,33 @@ | ||||
| from dataclasses import dataclass, field | ||||
| from typing import List, Optional | ||||
| 
 | ||||
| from datahub.ingestion.source.state.stateful_ingestion_base import ( | ||||
|     StatefulIngestionReport, | ||||
| ) | ||||
| 
 | ||||
| 
 | ||||
| @dataclass | ||||
| class PulsarSourceReport(StatefulIngestionReport): | ||||
|     pulsar_version: Optional[str] = None | ||||
|     tenants_scanned: Optional[int] = None | ||||
|     namespaces_scanned: Optional[int] = None | ||||
|     topics_scanned: Optional[int] = None | ||||
|     tenants_filtered: List[str] = field(default_factory=list) | ||||
|     namespaces_filtered: List[str] = field(default_factory=list) | ||||
|     topics_filtered: List[str] = field(default_factory=list) | ||||
|     soft_deleted_stale_entities: List[str] = field(default_factory=list) | ||||
| 
 | ||||
|     def report_pulsar_version(self, version: str) -> None: | ||||
|         self.pulsar_version = version | ||||
| 
 | ||||
|     def report_tenants_dropped(self, tenant: str) -> None: | ||||
|         self.tenants_filtered.append(tenant) | ||||
| 
 | ||||
|     def report_namespaces_dropped(self, namespace: str) -> None: | ||||
|         self.namespaces_filtered.append(namespace) | ||||
| 
 | ||||
|     def report_topics_dropped(self, topic: str) -> None: | ||||
|         self.topics_filtered.append(topic) | ||||
| 
 | ||||
|     def report_stale_entity_soft_deleted(self, urn: str) -> None: | ||||
|         self.soft_deleted_stale_entities.append(urn) | ||||
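For orientation, a minimal sketch of how a scan might feed this report (the method names come from the class above; the surrounding flow is illustrative, not quoted from the source):

report = PulsarSourceReport()
report.report_pulsar_version("2.9.1")  # hypothetical broker version string

# The default tenant_patterns deny the "pulsar" system tenant, so a scan would drop it here.
report.report_tenants_dropped("pulsar")
report.report_topics_dropped("non-persistent://t_1/ns_1/bad_topic")

assert report.tenants_filtered == ["pulsar"]
assert report.topics_filtered == ["non-persistent://t_1/ns_1/bad_topic"]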
							
								
								
									
239 metadata-ingestion/tests/unit/test_pulsar_source.py Normal file
							| @ -0,0 +1,239 @@ | ||||
| import unittest | ||||
| from typing import Any, Dict | ||||
| from unittest.mock import patch | ||||
| 
 | ||||
| import pytest | ||||
| 
 | ||||
| from datahub.configuration.common import ConfigurationError | ||||
| from datahub.emitter.mcp import MetadataChangeProposalWrapper | ||||
| from datahub.ingestion.api.common import PipelineContext | ||||
| from datahub.ingestion.source.pulsar import ( | ||||
|     PulsarSchema, | ||||
|     PulsarSource, | ||||
|     PulsarSourceConfig, | ||||
|     PulsarTopic, | ||||
| ) | ||||
| 
 | ||||
| mock_schema_response: Dict[str, Any] = { | ||||
|     "version": 1, | ||||
|     "type": "AVRO", | ||||
|     "timestamp": 0, | ||||
|     "data": '{"type":"record","name":"FooSchema","namespace":"foo.bar","doc":"Description of FooSchema","fields":[{"name":"field1","type":{"type":"string","avro.java.string":"String"},"doc":"Description of field1"},{"name":"field2","type":"long","doc":"Some description","default":0}]}', | ||||
|     "properties": {"__jsr310ConversionEnabled": "false", "__alwaysAllowNull": "true"}, | ||||
| } | ||||
| 
 | ||||
| 
 | ||||
| class TestPulsarSourceConfig: | ||||
|     def test_pulsar_source_config_valid_web_service_url(self): | ||||
|         assert ( | ||||
|             PulsarSourceConfig().web_service_url_scheme_host_port( | ||||
|                 "http://localhost:8080/" | ||||
|             ) | ||||
|             == "http://localhost:8080" | ||||
|         ) | ||||
| 
 | ||||
|     def test_pulsar_source_config_invalid_web_service_url_scheme(self): | ||||
|         with pytest.raises( | ||||
|             ConfigurationError, match=r"Scheme should be http or https, found ftp" | ||||
|         ): | ||||
|             PulsarSourceConfig().web_service_url_scheme_host_port( | ||||
|                 "ftp://localhost:8080/" | ||||
|             ) | ||||
| 
 | ||||
|     def test_pulsar_source_config_invalid_web_service_url_host(self): | ||||
|         with pytest.raises( | ||||
|             ConfigurationError, | ||||
|             match=r"Not a valid hostname, hostname contains invalid characters, found localhost&", | ||||
|         ): | ||||
|             PulsarSourceConfig().web_service_url_scheme_host_port( | ||||
|                 "http://localhost&:8080/" | ||||
|             ) | ||||
| 
 | ||||
| 
 | ||||
| class TestPulsarTopic: | ||||
|     def test_pulsar_source_parse_topic_string(self) -> None: | ||||
|         topic = "persistent://tenant/namespace/topic" | ||||
|         pulsar_topic = PulsarTopic(topic) | ||||
|         assert pulsar_topic.type == "persistent" | ||||
|         assert pulsar_topic.tenant == "tenant" | ||||
|         assert pulsar_topic.namespace == "namespace" | ||||
|         assert pulsar_topic.topic == "topic" | ||||
|         assert pulsar_topic.fullname == "persistent://tenant/namespace/topic" | ||||
| 
 | ||||
| 
 | ||||
| class TestPulsarSchema: | ||||
|     def test_pulsar_source_parse_pulsar_schema(self) -> None: | ||||
|         pulsar_schema = PulsarSchema(mock_schema_response) | ||||
|         assert pulsar_schema.schema_type == "AVRO" | ||||
|         assert ( | ||||
|             pulsar_schema.schema_str | ||||
|             == '{"type":"record","name":"FooSchema","namespace":"foo.bar","doc":"Description of FooSchema","fields":[{"name":"field1","type":{"type":"string","avro.java.string":"String"},"doc":"Description of field1"},{"name":"field2","type":"long","doc":"Some description","default":0}]}' | ||||
|         ) | ||||
|         assert pulsar_schema.schema_name == "foo.bar.FooSchema" | ||||
|         assert pulsar_schema.schema_version == 1 | ||||
|         assert pulsar_schema.schema_description == "Description of FooSchema" | ||||
|         assert pulsar_schema.properties == { | ||||
|             "__jsr310ConversionEnabled": "false", | ||||
|             "__alwaysAllowNull": "true", | ||||
|         } | ||||
| 
 | ||||
| 
 | ||||
| class TestPulsarSource(unittest.TestCase): | ||||
|     def test_pulsar_source_get_token_jwt(self): | ||||
|         ctx = PipelineContext(run_id="test") | ||||
|         pulsar_source = PulsarSource.create( | ||||
|             {"web_service_url": "http://localhost:8080", "token": "jwt_token"}, | ||||
|             ctx, | ||||
|         ) | ||||
|         assert pulsar_source.get_access_token() == "jwt_token" | ||||
| 
 | ||||
|     @patch("datahub.ingestion.source.pulsar.requests.get", autospec=True) | ||||
|     @patch("datahub.ingestion.source.pulsar.requests.post", autospec=True) | ||||
|     def test_pulsar_source_get_token_oauth(self, mock_post, mock_get): | ||||
|         ctx = PipelineContext(run_id="test") | ||||
|         mock_get.return_value.json.return_value = { | ||||
|             "token_endpoint": "http://127.0.0.1:8083/realms/pulsar/protocol/openid-connect/token" | ||||
|         } | ||||
| 
 | ||||
|         pulsar_source = PulsarSource.create( | ||||
|             { | ||||
|                 "web_service_url": "http://localhost:8080", | ||||
|                 "issuer_url": "http://localhost:8083/realms/pulsar", | ||||
|                 "client_id": "client_id", | ||||
|                 "client_secret": "client_secret", | ||||
|             }, | ||||
|             ctx, | ||||
|         ) | ||||
|         mock_post.return_value.json.return_value = {"access_token": "oauth_token"} | ||||
|         assert pulsar_source.get_access_token() == "oauth_token" | ||||
| 
 | ||||
|     @patch("datahub.ingestion.source.pulsar.requests.Session.get", autospec=True) | ||||
|     def test_pulsar_source_get_workunits_all_tenant(self, mock_session): | ||||
|         ctx = PipelineContext(run_id="test") | ||||
|         pulsar_source = PulsarSource.create( | ||||
|             { | ||||
|                 "web_service_url": "http://localhost:8080", | ||||
|             }, | ||||
|             ctx, | ||||
|         ) | ||||
| 
 | ||||
|         # Mock fetching Pulsar metadata | ||||
|         with patch( | ||||
|             "datahub.ingestion.source.pulsar.PulsarSource._get_pulsar_metadata" | ||||
|         ) as mock: | ||||
|             mock.side_effect = [ | ||||
|                 ["t_1"],  # tenant list | ||||
|                 ["t_1/ns_1"],  # namespaces list | ||||
|                 ["persistent://t_1/ns_1/topic_1"],  # persistent topic list | ||||
|                 [],  # persistent partitioned topic list | ||||
|                 [],  # non-persistent topic list | ||||
|                 [],  # non-persistent partitioned topic list | ||||
|                 mock_schema_response,  # schema for persistent://t_1/ns_1/topic_1 | ||||
|             ] | ||||
| 
 | ||||
|             work_units = list(pulsar_source.get_workunits()) | ||||
|             first_mcp = work_units[0].metadata | ||||
|             assert isinstance(first_mcp, MetadataChangeProposalWrapper) | ||||
| 
 | ||||
|             # Expected calls 7 | ||||
|             # http://localhost:8080/admin/v2/tenants | ||||
|             # http://localhost:8080/admin/v2/namespaces/t_1 | ||||
|             # http://localhost:8080/admin/v2/persistent/t_1/ns_1 | ||||
|             # http://localhost:8080/admin/v2/persistent/t_1/ns_1/partitioned | ||||
|             # http://localhost:8080/admin/v2/non-persistent/t_1/ns_1 | ||||
|             # http://localhost:8080/admin/v2/non-persistent/t_1/ns_1/partitioned | ||||
|             # http://localhost:8080/admin/v2/schemas/t_1/ns_1/topic_1/schema | ||||
|             assert mock.call_count == 7 | ||||
|             # expecting 5 MCPs for one topic with the default config | ||||
|             assert len(work_units) == 5 | ||||
| 
 | ||||
|     @patch("datahub.ingestion.source.pulsar.requests.Session.get", autospec=True) | ||||
|     def test_pulsar_source_get_workunits_custom_tenant(self, mock_session): | ||||
|         ctx = PipelineContext(run_id="test") | ||||
|         pulsar_source = PulsarSource.create( | ||||
|             { | ||||
|                 "web_service_url": "http://localhost:8080", | ||||
|                 "tenants": ["t_1", "t_2"], | ||||
|             }, | ||||
|             ctx, | ||||
|         ) | ||||
| 
 | ||||
|         # Mock fetching Pulsar metadata | ||||
|         with patch( | ||||
|             "datahub.ingestion.source.pulsar.PulsarSource._get_pulsar_metadata" | ||||
|         ) as mock: | ||||
|             mock.side_effect = [ | ||||
|                 ["t_1/ns_1"],  # namespaces list | ||||
|                 ["persistent://t_1/ns_1/topic_1"],  # topic list | ||||
|                 [],  # empty persistent partitioned topic list | ||||
|                 [],  # empty non-persistent topic list | ||||
|                 [],  # empty non-persistent partitioned topic list | ||||
|                 mock_schema_response,  # schema for persistent://t_1/ns_1/topic_1 | ||||
|                 [],  # no namespaces for tenant t_2 | ||||
|             ] | ||||
| 
 | ||||
|             work_units = list(pulsar_source.get_workunits()) | ||||
|             first_mcp = work_units[0].metadata | ||||
|             assert isinstance(first_mcp, MetadataChangeProposalWrapper) | ||||
| 
 | ||||
|             # Expected calls 7 | ||||
|             # http://localhost:8080/admin/v2/namespaces/t_1 | ||||
|             # http://localhost:8080/admin/v2/persistent/t_1/ns_1 | ||||
|             # http://localhost:8080/admin/v2/persistent/t_1/ns_1/partitioned | ||||
|             # http://localhost:8080/admin/v2/non-persistent/t_1/ns_1 | ||||
|             # http://localhost:8080/admin/v2/non-persistent/t_1/ns_1/partitioned | ||||
|             # http://localhost:8080/admin/v2/schemas/t_1/ns_1/topic_1/schema | ||||
|             # http://localhost:8080/admin/v2/namespaces/t_2 | ||||
|             assert mock.call_count == 7 | ||||
|             # expecting 5 MCPs for one topic with the default config | ||||
|             assert len(work_units) == 5 | ||||
| 
 | ||||
|     @patch("datahub.ingestion.source.pulsar.requests.Session.get", autospec=True) | ||||
|     def test_pulsar_source_get_workunits_patterns(self, mock_session): | ||||
|         ctx = PipelineContext(run_id="test") | ||||
|         pulsar_source = PulsarSource.create( | ||||
|             { | ||||
|                 "web_service_url": "http://localhost:8080", | ||||
|                 "tenants": ["t_1", "t_2", "bad_t_3"], | ||||
|                 "tenant_patterns": {"deny": ["bad_t_3"]}, | ||||
|                 "namespace_patterns": {"allow": [r"t_1/ns_1"]}, | ||||
|                 "topic_patterns": {"allow": [r"persistent://t_1/ns_1/topic_1"]}, | ||||
|             }, | ||||
|             ctx, | ||||
|         ) | ||||
| 
 | ||||
|         # Mock fetching Pulsar metadata | ||||
|         with patch( | ||||
|             "datahub.ingestion.source.pulsar.PulsarSource._get_pulsar_metadata" | ||||
|         ) as mock: | ||||
|             mock.side_effect = [ | ||||
|                 ["t_1/ns_1", "t_2/ns_1"],  # namespaces list | ||||
|                 [  # persistent topic list | ||||
|                     "persistent://t_1/ns_1/topic_1", | ||||
|                     "non-persistent://t_1/ns_1/bad_topic",  # will be filtered out by topic_patterns | ||||
|                 ], | ||||
|                 [],  # persistent partitioned topic list | ||||
|                 [],  # non-persistent topic list | ||||
|                 [],  # non-persistent partitioned topic list | ||||
|                 mock_schema_response,  # schema for persistent://t_1/ns_1/topic_1 | ||||
|                 [],  # no namespaces for tenant t_2 | ||||
|             ] | ||||
| 
 | ||||
|             work_units = list(pulsar_source.get_workunits()) | ||||
|             first_mcp = work_units[0].metadata | ||||
|             assert isinstance(first_mcp, MetadataChangeProposalWrapper) | ||||
| 
 | ||||
|             # Expected calls 7 | ||||
|             # http://localhost:8080/admin/v2/namespaces/t_1 | ||||
|             # http://localhost:8080/admin/v2/persistent/t_1/ns_1 | ||||
|             # http://localhost:8080/admin/v2/persistent/t_1/ns_1/partitioned | ||||
|             # http://localhost:8080/admin/v2/non-persistent/t_1/ns_1 | ||||
|             # http://localhost:8080/admin/v2/non-persistent/t_1/ns_1/partitioned | ||||
|             # http://localhost:8080/admin/v2/schemas/t_1/ns_1/topic_1/schema | ||||
|             # http://localhost:8080/admin/v2/namespaces/t_2 | ||||
|             assert mock.call_count == 7 | ||||
|             # expecting 5 MCPs for one topic with the default config | ||||
|             assert len(work_units) == 5 | ||||
| @ -456,6 +456,16 @@ | ||||
|       "logoUrl": "/assets/platforms/trinologo.png" | ||||
|     } | ||||
|   }, | ||||
|   { | ||||
|     "urn": "urn:li:dataPlatform:pulsar", | ||||
|     "aspect": { | ||||
|       "datasetNameDelimiter": ".", | ||||
|       "name": "pulsar", | ||||
|       "displayName": "Pulsar", | ||||
|       "type": "MESSAGE_BROKER", | ||||
|       "logoUrl": "/assets/platforms/pulsarlogo.png" | ||||
|     } | ||||
|   }, | ||||
|   { | ||||
|     "urn": "urn:li:dataPlatform:unknown", | ||||
|     "aspect": { | ||||
|  | ||||
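With this platform entry and its `.` dataset name delimiter, a Pulsar topic ends up addressable by a dataset URN along the following lines (a sketch using DataHub's generic URN builder; the exact dataset-name format the plugin emits is not shown in this excerpt):

from datahub.emitter.mce_builder import make_dataset_urn

# Hypothetical example for persistent://t_1/ns_1/topic_1 ingested into the PROD environment.
urn = make_dataset_urn(platform="pulsar", name="t_1.ns_1.topic_1", env="PROD")
print(urn)  # urn:li:dataset:(urn:li:dataPlatform:pulsar,t_1.ns_1.topic_1,PROD)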