mirror of
				https://github.com/datahub-project/datahub.git
				synced 2025-10-31 02:37:05 +00:00 
			
		
		
		
	feat(ingestion): add Pulsar source (#4721)
This commit is contained in:
		
							parent
							
								
									f099aeb550
								
							
						
					
					
						commit
						74d6d35881
					
				
							
								
								
									
										
											BIN
										
									
								
								datahub-web-react/src/images/pulsarlogo.png
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										
											BIN
										
									
								
								datahub-web-react/src/images/pulsarlogo.png
									
									
									
									
									
										Normal file
									
								
							
										
											Binary file not shown.
										
									
								
							| After Width: | Height: | Size: 27 KiB | 
| @ -206,6 +206,7 @@ plugins: Dict[str, Set[str]] = { | |||||||
|     "postgres": sql_common | {"psycopg2-binary", "GeoAlchemy2"}, |     "postgres": sql_common | {"psycopg2-binary", "GeoAlchemy2"}, | ||||||
|     "presto-on-hive": sql_common |     "presto-on-hive": sql_common | ||||||
|     | {"psycopg2-binary", "acryl-pyhive[hive]>=0.6.12", "pymysql>=1.0.2"}, |     | {"psycopg2-binary", "acryl-pyhive[hive]>=0.6.12", "pymysql>=1.0.2"}, | ||||||
|  |     "pulsar": {"requests"}, | ||||||
|     "redash": {"redash-toolbelt", "sql-metadata", "sqllineage==1.3.4"}, |     "redash": {"redash-toolbelt", "sql-metadata", "sqllineage==1.3.4"}, | ||||||
|     "redshift": sql_common |     "redshift": sql_common | ||||||
|     | {"sqlalchemy-redshift", "psycopg2-binary", "GeoAlchemy2", "sqllineage==1.3.4"}, |     | {"sqlalchemy-redshift", "psycopg2-binary", "GeoAlchemy2", "sqllineage==1.3.4"}, | ||||||
| @ -452,6 +453,7 @@ entry_points = { | |||||||
|         "nifi = datahub.ingestion.source.nifi:NifiSource", |         "nifi = datahub.ingestion.source.nifi:NifiSource", | ||||||
|         "powerbi = datahub.ingestion.source.powerbi:PowerBiDashboardSource", |         "powerbi = datahub.ingestion.source.powerbi:PowerBiDashboardSource", | ||||||
|         "presto-on-hive = datahub.ingestion.source.sql.presto_on_hive:PrestoOnHiveSource", |         "presto-on-hive = datahub.ingestion.source.sql.presto_on_hive:PrestoOnHiveSource", | ||||||
|  |         "pulsar = datahub.ingestion.source.pulsar:PulsarSource", | ||||||
|     ], |     ], | ||||||
|     "datahub.ingestion.sink.plugins": [ |     "datahub.ingestion.sink.plugins": [ | ||||||
|         "file = datahub.ingestion.sink.file:FileSink", |         "file = datahub.ingestion.sink.file:FileSink", | ||||||
|  | |||||||
							
								
								
									
										176
									
								
								metadata-ingestion/source_docs/pulsar.md
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										176
									
								
								metadata-ingestion/source_docs/pulsar.md
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,176 @@ | |||||||
|  | # Pulsar | ||||||
|  | 
 | ||||||
|  | <!-- Set Support Status --> | ||||||
|  | <!--  | ||||||
|  | --> | ||||||
|  |  | ||||||
|  | 
 | ||||||
|  | ## Integration Details | ||||||
|  | 
 | ||||||
|  | <!-- Plain-language description of what this integration is meant to do.  --> | ||||||
|  | <!-- Include details about where metadata is extracted from (ie. logs, source API, manifest, etc.)   --> | ||||||
|  | 
 | ||||||
|  | The Datahub Pulsar source plugin extracts `topic` and `schema` metadata from an Apache Pulsar instance and ingest the information into Datahub. The plugin uses the [Pulsar admin Rest API interface](https://pulsar.apache.org/admin-rest-api/#) to interact with the Pulsar instance. The following APIs are used in order to: | ||||||
|  | - [Get the list of existing tenants](https://pulsar.apache.org/admin-rest-api/#tag/tenants) | ||||||
|  | - [Get the list of namespaces associated with each tenant](https://pulsar.apache.org/admin-rest-api/#tag/namespaces) | ||||||
|  | - [Get the list of topics associated with each namespace](https://pulsar.apache.org/admin-rest-api/#tag/persistent-topic) | ||||||
|  |     - persistent topics | ||||||
|  |     - persistent partitioned topics | ||||||
|  |     - non-persistent topics | ||||||
|  |     - non-persistent partitioned topics | ||||||
|  | - [Get the latest schema associated with each topic](https://pulsar.apache.org/admin-rest-api/#tag/schemas) | ||||||
|  | 
 | ||||||
|  | The data is extracted on `tenant` and `namespace` basis, topics with corresponding schema (if available) are ingested as [Dataset](docs/generated/metamodel/entities/dataset.md) into Datahub. Some additional values like `schema description`, `schema_version`, `schema_type` and `partitioned` are included as `DatasetProperties`. | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | ### Concept Mapping | ||||||
|  | 
 | ||||||
|  | <!-- This should be a manual mapping of concepts from the source to the DataHub Metadata Model --> | ||||||
|  | <!-- Authors should provide as much context as possible about how this mapping was generated, including assumptions made, known shortcuts, & any other caveats --> | ||||||
|  | 
 | ||||||
|  | This ingestion source maps the following Source System Concepts to DataHub Concepts: | ||||||
|  | 
 | ||||||
|  | <!-- Remove all unnecessary/irrelevant DataHub Concepts --> | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | | Source Concept | DataHub Concept                                                    | Notes                                                                     | | ||||||
|  | |----------------|--------------------------------------------------------------------|---------------------------------------------------------------------------| | ||||||
|  | | `pulsar`       | [Data Platform](docs/generated/metamodel/entities/dataPlatform.md) |                                                                           | | ||||||
|  | | Pulsar Topic   | [Dataset](docs/generated/metamodel/entities/dataset.md)            | _subType_: `topic`                                                        | | ||||||
|  | | Pulsar Schema  | [SchemaField](docs/generated/metamodel/entities/schemaField.md)    | Maps to the fields defined within the `Avro` or `JSON` schema definition. |  | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | ### Supported Capabilities | ||||||
|  | 
 | ||||||
|  | <!-- This should be an auto-generated table of supported DataHub features/functionality --> | ||||||
|  | <!-- Each capability should link out to a feature guide --> | ||||||
|  | 
 | ||||||
|  | | Capability                                            | Status | Notes                                                                                                                                                                                                                                        | | ||||||
|  | |-------------------------------------------------------|:------:|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | ||||||
|  | | Data Container                                        |   ❌    |                                                                                                                                                                                                                                              | | ||||||
|  | | [Stateful Ingestion](./stateful_ingestion.md)         |   ✅    | Requires recipe configuration, stateful Ingestion is available only when a Platform Instance is assigned to this source.                                                                                                                     | | ||||||
|  | | Partition Support                                     |   ✅    | Requires recipe configuration, each individual partition topic can be ingest. Behind the scenes, a partitioned topic is actually implemented as N internal topics, where N is the number of partitions. This feature is disabled by default. | | ||||||
|  | | [Platform Instance](../../docs/platform-instances.md) |   ✅    | Requires recipe configuration and is mandatory for Stateful Ingestion. A Pulsar instance consists of one or more Pulsar clusters.                                                                                                            | | ||||||
|  | | [Data Domain](../../docs/domains.md)                  |   ✅    | Requires recipe configuration                                                                                                                                                                                                                | | ||||||
|  | | Dataset Profiling                                     |   ❌    |                                                                                                                                                                                                                                              | | ||||||
|  | | Dataset Usage                                         |   ❌    |                                                                                                                                                                                                                                              | | ||||||
|  | | Extract Descriptions                                  |   ❌    |                                                                                                                                                                                                                                              | | ||||||
|  | | Extract Lineage                                       |   ❌    |                                                                                                                                                                                                                                              | | ||||||
|  | | Extract Ownership                                     |   ❌    |                                                                                                                                                                                                                                              | | ||||||
|  | | Extract Tags                                          |   ❌    |                                                                                                                                                                                                                                              | | ||||||
|  | | ...                                                   |        | | ||||||
|  | 
 | ||||||
|  | ## Metadata Ingestion Quickstart | ||||||
|  | 
 | ||||||
|  | For context on getting started with ingestion, check out our [metadata ingestion guide](../README.md). | ||||||
|  | 
 | ||||||
|  | ### Prerequisites | ||||||
|  | 
 | ||||||
|  | In order to ingest metadata from Apache Pulsar, you will need: | ||||||
|  | 
 | ||||||
|  | * Access to a Pulsar Instance, if authentication is enabled a valid access token. | ||||||
|  | * Pulsar version >= 2.7.0 | ||||||
|  | * ... | ||||||
|  | 
 | ||||||
|  | > **_NOTE:_**  A _superUser_ role is required for listing all existing tenants within a Pulsar instance. | ||||||
|  | > | ||||||
|  | 
 | ||||||
|  | ### Install the Plugin(s) | ||||||
|  | 
 | ||||||
|  | Run the following commands to install the relevant plugin(s): | ||||||
|  | 
 | ||||||
|  | `pip install 'acryl-datahub[pulsar]'` | ||||||
|  | 
 | ||||||
|  | ### Configure the Ingestion Recipe(s) | ||||||
|  | 
 | ||||||
|  | Use the following recipe(s) to get started with ingestion. See [below](#config-details) for full configuration options. | ||||||
|  | 
 | ||||||
|  | _For general pointers on writing and running a recipe, see our [main recipe guide](../README.md#recipes)._ | ||||||
|  | 
 | ||||||
|  | #### Quickstart recipe | ||||||
|  | Getting started receipt | ||||||
|  | ```yml | ||||||
|  | source: | ||||||
|  |   type: pulsar | ||||||
|  |   config: | ||||||
|  |     # Required fields | ||||||
|  |     web_service_url: "http://localhost:8080" | ||||||
|  | 
 | ||||||
|  | sink: | ||||||
|  |   # sink configs | ||||||
|  | ``` | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | #### Example recipe with authentication | ||||||
|  | An example recipe for ingesting from a Pulsar instance with oauth authentication and ssl enabled. | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | ```yml | ||||||
|  | source:  | ||||||
|  |   type: "pulsar"  | ||||||
|  |   config: | ||||||
|  |     env: "TEST"  | ||||||
|  |     platform_instance: "local" | ||||||
|  |     ## Pulsar client connection config ##  | ||||||
|  |     web_service_url: "https://localhost:8443" | ||||||
|  |     verify_ssl: "/opt/certs/ca.cert.pem" | ||||||
|  |     # Issuer url for auth document, for example "http://localhost:8083/realms/pulsar" | ||||||
|  |     issuer_url: <issuer_url> | ||||||
|  |     client_id: ${CLIENT_ID} | ||||||
|  |     client_secret: ${CLIENT_SECRET} | ||||||
|  |     # Tenant list to scrape  | ||||||
|  |     tenants: | ||||||
|  |       - tenant_1 | ||||||
|  |       - tenant_2 | ||||||
|  |     # Topic filter pattern  | ||||||
|  |     topic_patterns: | ||||||
|  |       allow: | ||||||
|  |         - ".*sales.*" | ||||||
|  | 
 | ||||||
|  | sink: | ||||||
|  |   # sink configs | ||||||
|  | ``` | ||||||
|  | 
 | ||||||
|  | > **_NOTE:_**  Always use TLS encryption in a production environment and use variable substitution for sensitive information (e.g. ${CLIENT_ID} and ${CLIENT_SECRET}). | ||||||
|  | > | ||||||
|  | 
 | ||||||
|  | ## Config details | ||||||
|  | <details> | ||||||
|  |   <summary>View All Recipe Configuration Options</summary> | ||||||
|  | 
 | ||||||
|  | Note that a `.` is used to denote nested fields in the YAML recipe. | ||||||
|  | 
 | ||||||
|  | | Field                           | Required | Default                 | Description                                                                                                                                                     | | ||||||
|  | |---------------------------------|:--------:|-------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------| | ||||||
|  | | `env`                           |    ❌     | `PROD`                  | The data fabric, defaults to PROD                                                                                                                               | | ||||||
|  | | `platform_instance`             |    ❌     |                         | The Platform instance to use while constructing URNs. Mandatory for Stateful Ingestion                                                                          | | ||||||
|  | | `web_service_url`               |    ✅     | `http://localhost:8080` | The web URL for the cluster.                                                                                                                                    | | ||||||
|  | | `timeout`                       |    ❌     | `5`                     | Timout setting, how long to wait for the Pulsar rest api to send data before giving up                                                                          | | ||||||
|  | | `verify_ssl`                    |    ❌     | `True`                  | Either a boolean, in which case it controls whether we verify the server's TLS certificate, or a string, in which case it must be a path to a CA bundle to use. | | ||||||
|  | | `issuer_url`                    |    ❌     |                         | The complete URL for a Custom Authorization Server. Mandatory for OAuth based authentication.                                                                   | | ||||||
|  | | `client_id`                     |    ❌     |                         | The application's client ID                                                                                                                                     | | ||||||
|  | | `client_secret`                 |    ❌     |                         | The application's client secret                                                                                                                                 | | ||||||
|  | | `token`                         |    ❌     |                         | The access token for the application. Mandatory for token based authentication.                                                                                 | | ||||||
|  | | `tenant_patterns.allow`         |    ❌     | `.*`                    | List of regex patterns for tenants to include in ingestion. By default all tenants are allowed.                                                                 | | ||||||
|  | | `tenant_patterns.deny`          |    ❌     | `pulsar`                | List of regex patterns for tenants to exclude from ingestion. By default the Pulsar system tenant is denied.                                                    | | ||||||
|  | | `tenant_patterns.ignoreCase`    |    ❌     | `True`                  | Whether to ignore case sensitivity during tenant pattern matching.                                                                                              | | ||||||
|  | | `namespace_patterns.allow`      |    ❌     | `.*`                    | List of regex patterns for namespaces to include in ingestion. By default all namespaces are allowed.                                                           | | ||||||
|  | | `namespace_patterns.deny`       |    ❌     | `public/functions`      | List of regex patterns for namespaces to exclude from ingestion. By default the functions namespace is denied.                                                  | | ||||||
|  | | `namespace_patterns.ignoreCase` |    ❌     | `True`                  | Whether to ignore case sensitivity during namespace pattern matching.                                                                                           | | ||||||
|  | | `topic_patterns.allow`          |    ❌     | `.*`                    | List of regex patterns for topics to include in ingestion. By default all topics are allowed.                                                                   | | ||||||
|  | | `topic_patterns.deny`           |    ❌     | `/__.*$`                | List of regex patterns for topics to exclude from ingestion. By default the Pulsar system topics are denied.                                                    | | ||||||
|  | | `topic_patterns.ignoreCase`     |    ❌     | `True`                  | Whether to ignore case sensitivity during topic pattern matching.                                                                                               | | ||||||
|  | | `tenants`                       |    ❌     |                         | Listing all tenants requires superUser role, alternative you can set a list of tenants you want to scrape using the tenant admin role                           |  | ||||||
|  | | `exclude_individual_partitions` |    ❌     | `True`                  | Extract each individual partitioned topic. e.g. when turned off a topic with 100 partitions will result in 100 `Datesets`.                                      | | ||||||
|  | | `domain.domain_urn.allow`       |    ❌     |                         | List of regex patterns for topics to set domain_urn domain key. There can be multiple domain key specified.                                                     | | ||||||
|  | | `domain.domain_urn.deny`        |    ❌     |                         | List of regex patterns for topics to not assign domain_urn. There can be multiple domain key specified.                                                         | | ||||||
|  | | `domain.domain_urn.ignoreCase`  |    ❌     | `True`                  | Whether to ignore case sensitivity during pattern matching.There can be multiple domain key specified.                                                          | | ||||||
|  | | `stateful_ingestion`            |    ❌     |                         | see [Stateful Ingestion](./stateful_ingestion.md)                                                                                                               | | ||||||
|  | </details> | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | ## Troubleshooting | ||||||
|  | 
 | ||||||
|  | ### [Common Issue] | ||||||
|  | 
 | ||||||
|  | [Provide description of common issues with this integration and steps to resolve] | ||||||
							
								
								
									
										642
									
								
								metadata-ingestion/src/datahub/ingestion/source/pulsar.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										642
									
								
								metadata-ingestion/src/datahub/ingestion/source/pulsar.py
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,642 @@ | |||||||
|  | import json | ||||||
|  | import logging | ||||||
|  | import re | ||||||
|  | from dataclasses import dataclass | ||||||
|  | from hashlib import md5 | ||||||
|  | from typing import Iterable, List, Optional, Tuple, cast | ||||||
|  | 
 | ||||||
|  | import requests | ||||||
|  | 
 | ||||||
|  | from datahub.configuration.common import ConfigurationError | ||||||
|  | from datahub.emitter.mce_builder import ( | ||||||
|  |     make_data_platform_urn, | ||||||
|  |     make_dataplatform_instance_urn, | ||||||
|  |     make_dataset_urn_with_platform_instance, | ||||||
|  |     make_domain_urn, | ||||||
|  | ) | ||||||
|  | from datahub.emitter.mcp import MetadataChangeProposalWrapper | ||||||
|  | from datahub.emitter.mcp_builder import add_domain_to_entity_wu | ||||||
|  | from datahub.ingestion.api.common import PipelineContext | ||||||
|  | from datahub.ingestion.api.workunit import MetadataWorkUnit | ||||||
|  | from datahub.ingestion.extractor import schema_util | ||||||
|  | from datahub.ingestion.source.state.checkpoint import Checkpoint | ||||||
|  | from datahub.ingestion.source.state.kafka_state import KafkaCheckpointState | ||||||
|  | from datahub.ingestion.source.state.stateful_ingestion_base import ( | ||||||
|  |     JobId, | ||||||
|  |     StatefulIngestionSourceBase, | ||||||
|  | ) | ||||||
|  | from datahub.ingestion.source_config.pulsar import PulsarSourceConfig | ||||||
|  | from datahub.ingestion.source_report.pulsar import PulsarSourceReport | ||||||
|  | from datahub.metadata.com.linkedin.pegasus2avro.common import StatusClass | ||||||
|  | from datahub.metadata.com.linkedin.pegasus2avro.schema import ( | ||||||
|  |     KafkaSchema, | ||||||
|  |     SchemaField, | ||||||
|  |     SchemaMetadata, | ||||||
|  | ) | ||||||
|  | from datahub.metadata.schema_classes import ( | ||||||
|  |     BrowsePathsClass, | ||||||
|  |     ChangeTypeClass, | ||||||
|  |     DataPlatformInstanceClass, | ||||||
|  |     DatasetPropertiesClass, | ||||||
|  |     JobStatusClass, | ||||||
|  |     SubTypesClass, | ||||||
|  | ) | ||||||
|  | 
 | ||||||
|  | logger = logging.getLogger(__name__) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | class PulsarTopic(object): | ||||||
|  |     __slots__ = ["topic_parts", "fullname", "type", "tenant", "namespace", "topic"] | ||||||
|  | 
 | ||||||
|  |     def __init__(self, topic): | ||||||
|  |         topic_parts = re.split("[: /]", topic) | ||||||
|  |         self.fullname = topic | ||||||
|  |         self.type = topic_parts[0] | ||||||
|  |         self.tenant = topic_parts[3] | ||||||
|  |         self.namespace = topic_parts[4] | ||||||
|  |         self.topic = topic_parts[5] | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | class PulsarSchema(object): | ||||||
|  |     __slots__ = [ | ||||||
|  |         "schema_version", | ||||||
|  |         "schema_name", | ||||||
|  |         "schema_description", | ||||||
|  |         "schema_type", | ||||||
|  |         "schema_str", | ||||||
|  |         "properties", | ||||||
|  |     ] | ||||||
|  | 
 | ||||||
|  |     def __init__(self, schema): | ||||||
|  |         self.schema_version = schema.get("version") | ||||||
|  | 
 | ||||||
|  |         avro_schema = json.loads(schema.get("data")) | ||||||
|  |         self.schema_name = avro_schema.get("namespace") + "." + avro_schema.get("name") | ||||||
|  |         self.schema_description = avro_schema.get("doc") | ||||||
|  |         self.schema_type = schema.get("type") | ||||||
|  |         self.schema_str = schema.get("data") | ||||||
|  |         self.properties = schema.get("properties") | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | @dataclass | ||||||
|  | class PulsarSource(StatefulIngestionSourceBase): | ||||||
|  |     def __init__(self, config: PulsarSourceConfig, ctx: PipelineContext): | ||||||
|  |         super().__init__(config, ctx) | ||||||
|  |         self.platform: str = "pulsar" | ||||||
|  |         self.config: PulsarSourceConfig = config | ||||||
|  |         self.report: PulsarSourceReport = PulsarSourceReport() | ||||||
|  |         self.base_url: str = self.config.web_service_url + "/admin/v2" | ||||||
|  |         self.tenants: List[str] = config.tenants | ||||||
|  | 
 | ||||||
|  |         if ( | ||||||
|  |             self.is_stateful_ingestion_configured() | ||||||
|  |             and not self.config.platform_instance | ||||||
|  |         ): | ||||||
|  |             raise ConfigurationError( | ||||||
|  |                 "Enabling Pulsar stateful ingestion requires to specify a platform instance." | ||||||
|  |             ) | ||||||
|  | 
 | ||||||
|  |         self.session = requests.Session() | ||||||
|  |         self.session.verify = self.config.verify_ssl | ||||||
|  |         self.session.headers.update( | ||||||
|  |             { | ||||||
|  |                 "Content-Type": "application/json", | ||||||
|  |             } | ||||||
|  |         ) | ||||||
|  | 
 | ||||||
|  |         if self._is_oauth_authentication_configured(): | ||||||
|  |             # Get OpenId configuration from issuer, e.g. token_endpoint | ||||||
|  |             oid_config_url = ( | ||||||
|  |                 "%s/.well-known/openid-configuration" % self.config.issuer_url | ||||||
|  |             ) | ||||||
|  |             oid_config_response = requests.get( | ||||||
|  |                 oid_config_url, verify=False, allow_redirects=False | ||||||
|  |             ) | ||||||
|  | 
 | ||||||
|  |             if oid_config_response: | ||||||
|  |                 self.config.oid_config.update(oid_config_response.json()) | ||||||
|  |             else: | ||||||
|  |                 logger.error( | ||||||
|  |                     "Unexpected response while getting discovery document using %s : %s" | ||||||
|  |                     % (oid_config_url, oid_config_response) | ||||||
|  |                 ) | ||||||
|  | 
 | ||||||
|  |             if "token_endpoint" not in self.config.oid_config: | ||||||
|  |                 raise Exception( | ||||||
|  |                     "The token_endpoint is not set, please verify the configured issuer_url or" | ||||||
|  |                     " set oid_config.token_endpoint manually in the configuration file." | ||||||
|  |                 ) | ||||||
|  | 
 | ||||||
|  |         # Authentication configured | ||||||
|  |         if ( | ||||||
|  |             self._is_token_authentication_configured() | ||||||
|  |             or self._is_oauth_authentication_configured() | ||||||
|  |         ): | ||||||
|  |             # Update session header with Bearer token | ||||||
|  |             self.session.headers.update( | ||||||
|  |                 {"Authorization": f"Bearer {self.get_access_token()}"} | ||||||
|  |             ) | ||||||
|  | 
 | ||||||
|  |     def get_access_token(self) -> str: | ||||||
|  |         """ | ||||||
|  |         Returns an access token used for authentication, token comes from config or third party provider | ||||||
|  |         when issuer_url is provided | ||||||
|  |         """ | ||||||
|  |         # JWT, get access token (jwt) from config | ||||||
|  |         if self._is_token_authentication_configured(): | ||||||
|  |             return str(self.config.token) | ||||||
|  | 
 | ||||||
|  |         # OAuth, connect to issuer and return access token | ||||||
|  |         if self._is_oauth_authentication_configured(): | ||||||
|  |             assert self.config.client_id | ||||||
|  |             assert self.config.client_secret | ||||||
|  |             data = {"grant_type": "client_credentials"} | ||||||
|  |             try: | ||||||
|  |                 # Get a token from the issuer | ||||||
|  |                 token_endpoint = self.config.oid_config["token_endpoint"] | ||||||
|  |                 logger.info(f"Request access token from {token_endpoint}") | ||||||
|  |                 token_response = requests.post( | ||||||
|  |                     url=token_endpoint, | ||||||
|  |                     data=data, | ||||||
|  |                     verify=False, | ||||||
|  |                     allow_redirects=False, | ||||||
|  |                     auth=( | ||||||
|  |                         self.config.client_id, | ||||||
|  |                         self.config.client_secret, | ||||||
|  |                     ), | ||||||
|  |                 ) | ||||||
|  |                 token_response.raise_for_status() | ||||||
|  | 
 | ||||||
|  |                 return token_response.json()["access_token"] | ||||||
|  | 
 | ||||||
|  |             except requests.exceptions.RequestException as e: | ||||||
|  |                 logger.error(f"An error occurred while handling your request: {e}") | ||||||
|  |         # Failed to get an access token, | ||||||
|  |         raise ConfigurationError( | ||||||
|  |             f"Failed to get the Pulsar access token from token_endpoint {self.config.oid_config.get('token_endpoint')}." | ||||||
|  |             f" Please check your input configuration." | ||||||
|  |         ) | ||||||
|  | 
 | ||||||
|  |     def _get_pulsar_metadata(self, url): | ||||||
|  |         """ | ||||||
|  |         Interacts with the Pulsar Admin Api and returns Pulsar metadata. Invocations with insufficient privileges | ||||||
|  |         are logged. | ||||||
|  |         """ | ||||||
|  |         try: | ||||||
|  |             # Request the Pulsar metadata | ||||||
|  |             response = self.session.get(url, timeout=self.config.timeout) | ||||||
|  |             response.raise_for_status() | ||||||
|  |             # Return the response for status_code 200 | ||||||
|  |             return response.json() | ||||||
|  | 
 | ||||||
|  |         except requests.exceptions.HTTPError as http_error: | ||||||
|  |             # Topics can exist without a schema, log the warning and move on | ||||||
|  |             if http_error.response.status_code == 404 and "/schemas/" in url: | ||||||
|  |                 message = ( | ||||||
|  |                     f"Failed to get schema from schema registry. The topic is either schema-less or" | ||||||
|  |                     f" no messages have been written to the topic yet." | ||||||
|  |                     f" {http_error}" | ||||||
|  |                 ) | ||||||
|  |                 self.report.report_warning("NoSchemaFound", message) | ||||||
|  |             else: | ||||||
|  |                 # Authorization error | ||||||
|  |                 message = f"An HTTP error occurred: {http_error}" | ||||||
|  |                 self.report.report_warning("HTTPError", message) | ||||||
|  |         except requests.exceptions.RequestException as e: | ||||||
|  |             raise Exception( | ||||||
|  |                 f"An ambiguous exception occurred while handling the request: {e}" | ||||||
|  |             ) | ||||||
|  | 
 | ||||||
|  |     def is_checkpointing_enabled(self, job_id: JobId) -> bool: | ||||||
|  |         return job_id == ( | ||||||
|  |             self.get_default_ingestion_job_id() | ||||||
|  |             and self.is_stateful_ingestion_configured() | ||||||
|  |             and self.config.stateful_ingestion | ||||||
|  |             and self.config.stateful_ingestion.remove_stale_metadata | ||||||
|  |         ) | ||||||
|  | 
 | ||||||
|  |     def get_default_ingestion_job_id(self) -> JobId: | ||||||
|  |         """ | ||||||
|  |         Default ingestion job name that kafka provides. | ||||||
|  |         """ | ||||||
|  |         return JobId("ingest_from_pulsar_source") | ||||||
|  | 
 | ||||||
|  |     def create_checkpoint(self, job_id: JobId) -> Optional[Checkpoint]: | ||||||
|  |         """ | ||||||
|  |         Create a custom checkpoint with empty state for the job. | ||||||
|  |         """ | ||||||
|  |         assert self.ctx.pipeline_name is not None | ||||||
|  |         if job_id == self.get_default_ingestion_job_id(): | ||||||
|  |             return Checkpoint( | ||||||
|  |                 job_name=job_id, | ||||||
|  |                 pipeline_name=self.ctx.pipeline_name, | ||||||
|  |                 platform_instance_id=self.get_platform_instance_id(), | ||||||
|  |                 run_id=self.ctx.run_id, | ||||||
|  |                 config=self.config, | ||||||
|  |                 # TODO Create a PulsarCheckpointState ? | ||||||
|  |                 state=KafkaCheckpointState(), | ||||||
|  |             ) | ||||||
|  |         return None | ||||||
|  | 
 | ||||||
|  |     def get_platform_instance_id(self) -> str: | ||||||
|  |         assert self.config.platform_instance is not None | ||||||
|  |         return self.config.platform_instance | ||||||
|  | 
 | ||||||
|  |     @classmethod | ||||||
|  |     def create(cls, config_dict, ctx): | ||||||
|  |         config = PulsarSourceConfig.parse_obj(config_dict) | ||||||
|  | 
 | ||||||
|  |         # Do not include each individual partition for partitioned topics, | ||||||
|  |         if config.exclude_individual_partitions: | ||||||
|  |             config.topic_patterns.deny.append(r".*-partition-[0-9]+") | ||||||
|  | 
 | ||||||
|  |         return cls(config, ctx) | ||||||
|  | 
 | ||||||
|  |     def soft_delete_dataset(self, urn: str, type: str) -> Iterable[MetadataWorkUnit]: | ||||||
|  |         logger.debug(f"Soft-deleting stale entity of type {type} - {urn}.") | ||||||
|  |         mcp = MetadataChangeProposalWrapper( | ||||||
|  |             entityType="dataset", | ||||||
|  |             entityUrn=urn, | ||||||
|  |             changeType=ChangeTypeClass.UPSERT, | ||||||
|  |             aspectName="status", | ||||||
|  |             aspect=StatusClass(removed=True), | ||||||
|  |         ) | ||||||
|  |         wu = MetadataWorkUnit(id=f"soft-delete-{type}-{urn}", mcp=mcp) | ||||||
|  |         self.report.report_workunit(wu) | ||||||
|  |         self.report.report_stale_entity_soft_deleted(urn) | ||||||
|  |         yield wu | ||||||
|  | 
 | ||||||
|  |     def gen_removed_entity_workunits(self) -> Iterable[MetadataWorkUnit]: | ||||||
|  |         last_checkpoint = self.get_last_checkpoint( | ||||||
|  |             self.get_default_ingestion_job_id(), KafkaCheckpointState | ||||||
|  |         ) | ||||||
|  |         cur_checkpoint = self.get_current_checkpoint( | ||||||
|  |             self.get_default_ingestion_job_id() | ||||||
|  |         ) | ||||||
|  |         if ( | ||||||
|  |             self.config.stateful_ingestion | ||||||
|  |             and self.config.stateful_ingestion.remove_stale_metadata | ||||||
|  |             and last_checkpoint is not None | ||||||
|  |             and last_checkpoint.state is not None | ||||||
|  |             and cur_checkpoint is not None | ||||||
|  |             and cur_checkpoint.state is not None | ||||||
|  |         ): | ||||||
|  |             logger.debug("Checking for stale entity removal.") | ||||||
|  | 
 | ||||||
|  |             last_checkpoint_state = cast(KafkaCheckpointState, last_checkpoint.state) | ||||||
|  |             cur_checkpoint_state = cast(KafkaCheckpointState, cur_checkpoint.state) | ||||||
|  | 
 | ||||||
|  |             for topic_urn in last_checkpoint_state.get_topic_urns_not_in( | ||||||
|  |                 cur_checkpoint_state | ||||||
|  |             ): | ||||||
|  |                 yield from self.soft_delete_dataset(topic_urn, "topic") | ||||||
|  | 
 | ||||||
|  |     def get_workunits(self) -> Iterable[MetadataWorkUnit]: | ||||||
|  |         """ | ||||||
|  |         Interacts with the Pulsar Admin Api and loops over tenants, namespaces and topics. For every topic | ||||||
|  |         the schema information is retrieved if available. | ||||||
|  | 
 | ||||||
|  |         Pulsar web service admin rest api urls for retrieving topic information | ||||||
|  |             - [web_service_url]/admin/v2/persistent/{tenant}/{namespace} | ||||||
|  |             - [web_service_url]/admin/v2/persistent/{tenant}/{namespace}/partitioned | ||||||
|  |             - [web_service_url]/admin/v2/non-persistent/{tenant}/{namespace} | ||||||
|  |             - [web_service_url]/admin/v2/non-persistent/{tenant}/{namespace}/partitioned | ||||||
|  |         """ | ||||||
|  |         topic_urls = [ | ||||||
|  |             self.base_url + "/persistent/{}", | ||||||
|  |             self.base_url + "/persistent/{}/partitioned", | ||||||
|  |             self.base_url + "/non-persistent/{}", | ||||||
|  |             self.base_url + "/non-persistent/{}/partitioned", | ||||||
|  |         ] | ||||||
|  | 
 | ||||||
|  |         # Report the Pulsar broker version we are communicating with | ||||||
|  |         self.report.report_pulsar_version( | ||||||
|  |             self.session.get( | ||||||
|  |                 "%s/brokers/version" % self.base_url, | ||||||
|  |                 timeout=self.config.timeout, | ||||||
|  |             ).text | ||||||
|  |         ) | ||||||
|  | 
 | ||||||
|  |         # If no tenants are provided, request all tenants from cluster using /admin/v2/tenants endpoint. | ||||||
|  |         # Requesting cluster tenant information requires superuser privileges | ||||||
|  |         if not self.tenants: | ||||||
|  |             self.tenants = self._get_pulsar_metadata(self.base_url + "/tenants") or [] | ||||||
|  | 
 | ||||||
|  |         # Initialize counters | ||||||
|  |         self.report.tenants_scanned = 0 | ||||||
|  |         self.report.namespaces_scanned = 0 | ||||||
|  |         self.report.topics_scanned = 0 | ||||||
|  | 
 | ||||||
|  |         for tenant in self.tenants: | ||||||
|  |             self.report.tenants_scanned += 1 | ||||||
|  |             if self.config.tenant_patterns.allowed(tenant): | ||||||
|  |                 # Get namespaces belonging to a tenant, /admin/v2/%s/namespaces | ||||||
|  |                 # A tenant admin role has sufficient privileges to perform this action | ||||||
|  |                 namespaces = ( | ||||||
|  |                     self._get_pulsar_metadata(self.base_url + "/namespaces/%s" % tenant) | ||||||
|  |                     or [] | ||||||
|  |                 ) | ||||||
|  |                 for namespace in namespaces: | ||||||
|  |                     self.report.namespaces_scanned += 1 | ||||||
|  |                     if self.config.namespace_patterns.allowed(namespace): | ||||||
|  |                         # Get all topics (persistent, non-persistent and partitioned) belonging to a tenant/namespace | ||||||
|  |                         # Four endpoint invocations are needs to get all topic metadata for a namespace | ||||||
|  |                         topics = {} | ||||||
|  |                         for url in topic_urls: | ||||||
|  |                             # Topics are partitioned when admin url ends with /partitioned | ||||||
|  |                             partitioned = url.endswith("/partitioned") | ||||||
|  |                             # Get the topics for each type | ||||||
|  |                             pulsar_topics = ( | ||||||
|  |                                 self._get_pulsar_metadata(url.format(namespace)) or [] | ||||||
|  |                             ) | ||||||
|  |                             # Create a mesh of topics with partitioned values, the | ||||||
|  |                             # partitioned info is added as a custom properties later | ||||||
|  |                             topics.update( | ||||||
|  |                                 {topic: partitioned for topic in pulsar_topics} | ||||||
|  |                             ) | ||||||
|  | 
 | ||||||
|  |                         # For all allowed topics get the metadata | ||||||
|  |                         for topic, is_partitioned in topics.items(): | ||||||
|  |                             self.report.topics_scanned += 1 | ||||||
|  |                             if self.config.topic_patterns.allowed(topic): | ||||||
|  | 
 | ||||||
|  |                                 yield from self._extract_record(topic, is_partitioned) | ||||||
|  |                                 # Add topic to checkpoint if stateful ingestion is enabled | ||||||
|  |                                 if self.is_stateful_ingestion_configured(): | ||||||
|  |                                     self._add_topic_to_checkpoint(topic) | ||||||
|  |                             else: | ||||||
|  |                                 self.report.report_topics_dropped(topic) | ||||||
|  | 
 | ||||||
|  |                         if self.is_stateful_ingestion_configured(): | ||||||
|  |                             # Clean up stale entities. | ||||||
|  |                             yield from self.gen_removed_entity_workunits() | ||||||
|  | 
 | ||||||
|  |                     else: | ||||||
|  |                         self.report.report_namespaces_dropped(namespace) | ||||||
|  |             else: | ||||||
|  |                 self.report.report_tenants_dropped(tenant) | ||||||
|  | 
 | ||||||
|  |     def _add_topic_to_checkpoint(self, topic: str) -> None: | ||||||
|  |         cur_checkpoint = self.get_current_checkpoint( | ||||||
|  |             self.get_default_ingestion_job_id() | ||||||
|  |         ) | ||||||
|  | 
 | ||||||
|  |         if cur_checkpoint is not None: | ||||||
|  |             checkpoint_state = cast(KafkaCheckpointState, cur_checkpoint.state) | ||||||
|  |             checkpoint_state.add_topic_urn( | ||||||
|  |                 make_dataset_urn_with_platform_instance( | ||||||
|  |                     platform=self.platform, | ||||||
|  |                     name=topic, | ||||||
|  |                     platform_instance=self.config.platform_instance, | ||||||
|  |                     env=self.config.env, | ||||||
|  |                 ) | ||||||
|  |             ) | ||||||
|  | 
 | ||||||
|  |     def _is_token_authentication_configured(self) -> bool: | ||||||
|  |         if self.config.token is not None: | ||||||
|  |             return True | ||||||
|  |         return False | ||||||
|  | 
 | ||||||
|  |     def _is_oauth_authentication_configured(self) -> bool: | ||||||
|  |         if self.config.issuer_url is not None: | ||||||
|  |             return True | ||||||
|  |         return False | ||||||
|  | 
 | ||||||
|  |     def _get_schema_and_fields( | ||||||
|  |         self, pulsar_topic: PulsarTopic, is_key_schema: bool | ||||||
|  |     ) -> Tuple[Optional[PulsarSchema], List[SchemaField]]: | ||||||
|  | 
 | ||||||
|  |         pulsar_schema: Optional[PulsarSchema] = None | ||||||
|  | 
 | ||||||
|  |         schema_url = self.base_url + "/schemas/%s/%s/%s/schema" % ( | ||||||
|  |             pulsar_topic.tenant, | ||||||
|  |             pulsar_topic.namespace, | ||||||
|  |             pulsar_topic.topic, | ||||||
|  |         ) | ||||||
|  | 
 | ||||||
|  |         schema_payload = self._get_pulsar_metadata(schema_url) | ||||||
|  | 
 | ||||||
|  |         # Get the type and schema from the Pulsar Schema | ||||||
|  |         if schema_payload is not None: | ||||||
|  |             # pulsar_schema: Optional[PulsarSchema] = None | ||||||
|  |             pulsar_schema = PulsarSchema(schema_payload) | ||||||
|  | 
 | ||||||
|  |         # Obtain the schema fields from schema for the topic. | ||||||
|  |         fields: List[SchemaField] = [] | ||||||
|  |         if pulsar_schema is not None: | ||||||
|  |             fields = self._get_schema_fields( | ||||||
|  |                 pulsar_topic=pulsar_topic, | ||||||
|  |                 schema=pulsar_schema, | ||||||
|  |                 is_key_schema=is_key_schema, | ||||||
|  |             ) | ||||||
|  |         return pulsar_schema, fields | ||||||
|  | 
 | ||||||
|  |     def _get_schema_fields( | ||||||
|  |         self, pulsar_topic: PulsarTopic, schema: PulsarSchema, is_key_schema: bool | ||||||
|  |     ) -> List[SchemaField]: | ||||||
|  |         # Parse the schema and convert it to SchemaFields. | ||||||
|  |         fields: List[SchemaField] = [] | ||||||
|  |         if schema.schema_type == "AVRO" or schema.schema_type == "JSON": | ||||||
|  |             # Extract fields from schema and get the FQN for the schema | ||||||
|  |             fields = schema_util.avro_schema_to_mce_fields( | ||||||
|  |                 schema.schema_str, is_key_schema=is_key_schema | ||||||
|  |             ) | ||||||
|  |         else: | ||||||
|  |             self.report.report_warning( | ||||||
|  |                 pulsar_topic.fullname, | ||||||
|  |                 f"Parsing Pulsar schema type {schema.schema_type} is currently not implemented", | ||||||
|  |             ) | ||||||
|  |         return fields | ||||||
|  | 
 | ||||||
|  |     def _get_schema_metadata( | ||||||
|  |         self, pulsar_topic: PulsarTopic, platform_urn: str | ||||||
|  |     ) -> Tuple[Optional[PulsarSchema], Optional[SchemaMetadata]]: | ||||||
|  | 
 | ||||||
|  |         schema, fields = self._get_schema_and_fields( | ||||||
|  |             pulsar_topic=pulsar_topic, is_key_schema=False | ||||||
|  |         )  # type: Tuple[Optional[PulsarSchema], List[SchemaField]] | ||||||
|  | 
 | ||||||
|  |         # Create the schemaMetadata aspect. | ||||||
|  |         if schema is not None: | ||||||
|  |             md5_hash = md5(schema.schema_str.encode()).hexdigest() | ||||||
|  | 
 | ||||||
|  |             return schema, SchemaMetadata( | ||||||
|  |                 schemaName=schema.schema_name, | ||||||
|  |                 version=schema.schema_version, | ||||||
|  |                 hash=md5_hash, | ||||||
|  |                 platform=platform_urn, | ||||||
|  |                 platformSchema=KafkaSchema( | ||||||
|  |                     documentSchema=schema.schema_str if schema is not None else "", | ||||||
|  |                     keySchema=None, | ||||||
|  |                 ), | ||||||
|  |                 fields=fields, | ||||||
|  |             ) | ||||||
|  |         return None, None | ||||||
|  | 
 | ||||||
|  |     def _extract_record( | ||||||
|  |         self, topic: str, partitioned: bool | ||||||
|  |     ) -> Iterable[MetadataWorkUnit]: | ||||||
|  |         logger.info(f"topic = {topic}") | ||||||
|  | 
 | ||||||
|  |         # 1. Create and emit the default dataset for the topic. Extract type, tenant, namespace | ||||||
|  |         # and topic name from full Pulsar topic name i.e. persistent://tenant/namespace/topic | ||||||
|  |         pulsar_topic = PulsarTopic(topic) | ||||||
|  | 
 | ||||||
|  |         platform_urn = make_data_platform_urn(self.platform) | ||||||
|  |         dataset_urn = make_dataset_urn_with_platform_instance( | ||||||
|  |             platform=self.platform, | ||||||
|  |             name=pulsar_topic.fullname, | ||||||
|  |             platform_instance=self.config.platform_instance, | ||||||
|  |             env=self.config.env, | ||||||
|  |         ) | ||||||
|  | 
 | ||||||
|  |         status_wu = MetadataWorkUnit( | ||||||
|  |             id=f"{dataset_urn}-status", | ||||||
|  |             mcp=MetadataChangeProposalWrapper( | ||||||
|  |                 entityType="dataset", | ||||||
|  |                 changeType=ChangeTypeClass.UPSERT, | ||||||
|  |                 entityUrn=dataset_urn, | ||||||
|  |                 aspectName="status", | ||||||
|  |                 aspect=StatusClass(removed=False), | ||||||
|  |             ), | ||||||
|  |         ) | ||||||
|  |         self.report.report_workunit(status_wu) | ||||||
|  |         yield status_wu | ||||||
|  | 
 | ||||||
|  |         # 2. Emit schemaMetadata aspect | ||||||
|  |         schema, schema_metadata = self._get_schema_metadata(pulsar_topic, platform_urn) | ||||||
|  |         if schema_metadata is not None: | ||||||
|  |             schema_metadata_wu = MetadataWorkUnit( | ||||||
|  |                 id=f"{dataset_urn}-schemaMetadata", | ||||||
|  |                 mcp=MetadataChangeProposalWrapper( | ||||||
|  |                     entityType="dataset", | ||||||
|  |                     changeType=ChangeTypeClass.UPSERT, | ||||||
|  |                     entityUrn=dataset_urn, | ||||||
|  |                     aspectName="schemaMetadata", | ||||||
|  |                     aspect=schema_metadata, | ||||||
|  |                 ), | ||||||
|  |             ) | ||||||
|  |             self.report.report_workunit(schema_metadata_wu) | ||||||
|  |             yield schema_metadata_wu | ||||||
|  | 
 | ||||||
|  |         # TODO Add topic properties (Pulsar 2.10.0 feature) | ||||||
|  |         # 3. Construct and emit dataset properties aspect | ||||||
|  |         if schema is not None: | ||||||
|  |             schema_properties = { | ||||||
|  |                 "schema_version": str(schema.schema_version), | ||||||
|  |                 "schema_type": schema.schema_type, | ||||||
|  |                 "partitioned": str(partitioned).lower(), | ||||||
|  |             } | ||||||
|  |             # Add some static properties to the schema properties | ||||||
|  |             schema.properties.update(schema_properties) | ||||||
|  | 
 | ||||||
|  |             dataset_properties_wu = MetadataWorkUnit( | ||||||
|  |                 id=f"{dataset_urn}-datasetProperties", | ||||||
|  |                 mcp=MetadataChangeProposalWrapper( | ||||||
|  |                     entityType="dataset", | ||||||
|  |                     changeType=ChangeTypeClass.UPSERT, | ||||||
|  |                     entityUrn=dataset_urn, | ||||||
|  |                     aspectName="datasetProperties", | ||||||
|  |                     aspect=DatasetPropertiesClass( | ||||||
|  |                         description=schema.schema_description, | ||||||
|  |                         customProperties=schema.properties, | ||||||
|  |                     ), | ||||||
|  |                 ), | ||||||
|  |             ) | ||||||
|  |             self.report.report_workunit(dataset_properties_wu) | ||||||
|  |             yield dataset_properties_wu | ||||||
|  | 
 | ||||||
|  |         # 4. Emit browsePaths aspect | ||||||
|  |         pulsar_path = ( | ||||||
|  |             f"{pulsar_topic.tenant}/{pulsar_topic.namespace}/{pulsar_topic.topic}" | ||||||
|  |         ) | ||||||
|  |         browse_path_suffix = ( | ||||||
|  |             f"{self.config.platform_instance}/{pulsar_path}" | ||||||
|  |             if self.config.platform_instance | ||||||
|  |             else pulsar_path | ||||||
|  |         ) | ||||||
|  | 
 | ||||||
|  |         browse_path_wu = MetadataWorkUnit( | ||||||
|  |             id=f"{dataset_urn}-browsePaths", | ||||||
|  |             mcp=MetadataChangeProposalWrapper( | ||||||
|  |                 entityType="dataset", | ||||||
|  |                 changeType=ChangeTypeClass.UPSERT, | ||||||
|  |                 entityUrn=dataset_urn, | ||||||
|  |                 aspectName="browsePaths", | ||||||
|  |                 aspect=BrowsePathsClass( | ||||||
|  |                     [f"/{self.config.env.lower()}/{self.platform}/{browse_path_suffix}"] | ||||||
|  |                 ), | ||||||
|  |             ), | ||||||
|  |         ) | ||||||
|  |         self.report.report_workunit(browse_path_wu) | ||||||
|  |         yield browse_path_wu | ||||||
|  | 
 | ||||||
|  |         # 5. Emit dataPlatformInstance aspect. | ||||||
|  |         if self.config.platform_instance: | ||||||
|  |             platform_instance_wu = MetadataWorkUnit( | ||||||
|  |                 id=f"{dataset_urn}-dataPlatformInstance", | ||||||
|  |                 mcp=MetadataChangeProposalWrapper( | ||||||
|  |                     entityType="dataset", | ||||||
|  |                     changeType=ChangeTypeClass.UPSERT, | ||||||
|  |                     entityUrn=dataset_urn, | ||||||
|  |                     aspectName="dataPlatformInstance", | ||||||
|  |                     aspect=DataPlatformInstanceClass( | ||||||
|  |                         platform=platform_urn, | ||||||
|  |                         instance=make_dataplatform_instance_urn( | ||||||
|  |                             self.platform, self.config.platform_instance | ||||||
|  |                         ), | ||||||
|  |                     ), | ||||||
|  |                 ), | ||||||
|  |             ) | ||||||
|  |             self.report.report_workunit(platform_instance_wu) | ||||||
|  |             yield platform_instance_wu | ||||||
|  | 
 | ||||||
|  |         # 6. Emit subtype aspect marking this as a "topic" | ||||||
|  |         subtype_wu = MetadataWorkUnit( | ||||||
|  |             id=f"{dataset_urn}-subTypes", | ||||||
|  |             mcp=MetadataChangeProposalWrapper( | ||||||
|  |                 entityType="dataset", | ||||||
|  |                 changeType=ChangeTypeClass.UPSERT, | ||||||
|  |                 entityUrn=dataset_urn, | ||||||
|  |                 aspectName="subTypes", | ||||||
|  |                 aspect=SubTypesClass(typeNames=["topic"]), | ||||||
|  |             ), | ||||||
|  |         ) | ||||||
|  |         self.report.report_workunit(subtype_wu) | ||||||
|  |         yield subtype_wu | ||||||
|  | 
 | ||||||
|  |         # 7. Emit domains aspect | ||||||
|  |         domain_urn: Optional[str] = None | ||||||
|  |         for domain, pattern in self.config.domain.items(): | ||||||
|  |             if pattern.allowed(pulsar_topic.fullname): | ||||||
|  |                 domain_urn = make_domain_urn(domain) | ||||||
|  | 
 | ||||||
|  |         if domain_urn: | ||||||
|  |             wus = add_domain_to_entity_wu( | ||||||
|  |                 entity_type="dataset", | ||||||
|  |                 entity_urn=dataset_urn, | ||||||
|  |                 domain_urn=domain_urn, | ||||||
|  |             ) | ||||||
|  |             for wu in wus: | ||||||
|  |                 self.report.report_workunit(wu) | ||||||
|  |                 yield wu | ||||||
|  | 
 | ||||||
|  |     def get_report(self): | ||||||
|  |         return self.report | ||||||
|  | 
 | ||||||
|  |     def update_default_job_run_summary(self) -> None: | ||||||
|  |         summary = self.get_job_run_summary(self.get_default_ingestion_job_id()) | ||||||
|  |         if summary is not None: | ||||||
|  |             # For now just add the config and the report. | ||||||
|  |             summary.config = self.config.json() | ||||||
|  |             summary.custom_summary = self.report.as_string() | ||||||
|  |             summary.runStatus = ( | ||||||
|  |                 JobStatusClass.FAILED | ||||||
|  |                 if self.get_report().failures | ||||||
|  |                 else JobStatusClass.COMPLETED | ||||||
|  |             ) | ||||||
|  | 
 | ||||||
|  |     def close(self): | ||||||
|  |         self.update_default_job_run_summary() | ||||||
|  |         self.prepare_for_commit() | ||||||
|  |         self.session.close() | ||||||
							
								
								
									
										111
									
								
								metadata-ingestion/src/datahub/ingestion/source_config/pulsar.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										111
									
								
								metadata-ingestion/src/datahub/ingestion/source_config/pulsar.py
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,111 @@ | |||||||
|  | import re | ||||||
|  | from typing import Dict, List, Optional, Union | ||||||
|  | from urllib.parse import urlparse | ||||||
|  | 
 | ||||||
|  | from pydantic import Field, validator | ||||||
|  | 
 | ||||||
|  | from datahub.configuration.common import AllowDenyPattern, ConfigurationError | ||||||
|  | from datahub.configuration.source_common import DEFAULT_ENV, DatasetSourceConfigBase | ||||||
|  | from datahub.ingestion.source.state.stateful_ingestion_base import ( | ||||||
|  |     StatefulIngestionConfig, | ||||||
|  |     StatefulIngestionConfigBase, | ||||||
|  | ) | ||||||
|  | from datahub.utilities import config_clean | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | class PulsarSourceStatefulIngestionConfig(StatefulIngestionConfig): | ||||||
|  |     """ | ||||||
|  |     Specialization of the basic StatefulIngestionConfig to add custom config. | ||||||
|  |     This will be used to override the stateful_ingestion config param of StatefulIngestionConfigBase | ||||||
|  |     in the PulsarSourceConfig. | ||||||
|  |     """ | ||||||
|  | 
 | ||||||
|  |     remove_stale_metadata: bool = True | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def _is_valid_hostname(hostname: str) -> bool: | ||||||
|  |     """ | ||||||
|  |     Loosely ascii hostname validation. A hostname is considered valid when the total length does not exceed 253 | ||||||
|  |     characters, contains valid characters and are max 63 octets per label. | ||||||
|  |     """ | ||||||
|  |     if len(hostname) > 253: | ||||||
|  |         return False | ||||||
|  |     # Hostnames ending on a dot are valid, if present strip exactly one | ||||||
|  |     if hostname[-1] == ".": | ||||||
|  |         hostname = hostname[:-1] | ||||||
|  |     allowed = re.compile(r"(?!-)[A-Z\d-]{1,63}(?<!-)$", re.IGNORECASE) | ||||||
|  |     return all(allowed.match(x) for x in hostname.split(".")) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | class PulsarSourceConfig(StatefulIngestionConfigBase, DatasetSourceConfigBase): | ||||||
|  |     env: str = DEFAULT_ENV | ||||||
|  |     # The web URL for the cluster. | ||||||
|  |     web_service_url: str = "http://localhost:8080" | ||||||
|  |     # Timout setting, how long to wait for the Pulsar rest api to send data before giving up | ||||||
|  |     timeout: int = 5 | ||||||
|  |     # Mandatory for oauth authentication | ||||||
|  |     issuer_url: Optional[str] = None | ||||||
|  |     client_id: Optional[str] = None | ||||||
|  |     client_secret: Optional[str] = None | ||||||
|  |     # Mandatory for token authentication | ||||||
|  |     token: Optional[str] = None | ||||||
|  |     # Either a boolean, in which case it controls whether we verify the server's TLS certificate, or a string, | ||||||
|  |     # in which case it must be a path to a CA bundle to use. | ||||||
|  |     verify_ssl: Union[bool, str] = True | ||||||
|  |     # By default, allow all topics and deny the pulsar system topics | ||||||
|  |     tenant_patterns: AllowDenyPattern = AllowDenyPattern(allow=[".*"], deny=["pulsar"]) | ||||||
|  |     namespace_patterns: AllowDenyPattern = AllowDenyPattern( | ||||||
|  |         allow=[".*"], deny=["public/functions"] | ||||||
|  |     ) | ||||||
|  |     topic_patterns: AllowDenyPattern = AllowDenyPattern(allow=[".*"], deny=["/__.*$"]) | ||||||
|  |     # Exclude partition topics. e.g. topics ending on _partition_N where N is a number | ||||||
|  |     exclude_individual_partitions: bool = True | ||||||
|  |     # Listing all tenants requires superUser role, alternative you can set tenants you want to scrape | ||||||
|  |     # using the tenant admin role | ||||||
|  |     tenants: List[str] = [] | ||||||
|  | 
 | ||||||
|  |     domain: Dict[str, AllowDenyPattern] = dict() | ||||||
|  |     # Custom Stateful Ingestion settings | ||||||
|  |     stateful_ingestion: Optional[PulsarSourceStatefulIngestionConfig] = None | ||||||
|  | 
 | ||||||
|  |     # Placeholder for OpenId discovery document | ||||||
|  |     oid_config: dict = Field(default_factory=dict) | ||||||
|  | 
 | ||||||
|  |     @validator("token") | ||||||
|  |     def ensure_only_issuer_or_token( | ||||||
|  |         cls, token: Optional[str], values: Dict[str, Optional[str]] | ||||||
|  |     ) -> Optional[str]: | ||||||
|  |         if token is not None and values.get("issuer_url") is not None: | ||||||
|  |             raise ConfigurationError( | ||||||
|  |                 "Expected only one authentication method, either issuer_url or token." | ||||||
|  |             ) | ||||||
|  |         return token | ||||||
|  | 
 | ||||||
|  |     @validator("client_secret", always=True) | ||||||
|  |     def ensure_client_id_and_secret_for_issuer_url( | ||||||
|  |         cls, client_secret: Optional[str], values: Dict[str, Optional[str]] | ||||||
|  |     ) -> Optional[str]: | ||||||
|  |         if values.get("issuer_url") is not None and ( | ||||||
|  |             client_secret is None or values.get("client_id") is None | ||||||
|  |         ): | ||||||
|  |             raise ConfigurationError( | ||||||
|  |                 "Missing configuration: client_id and client_secret are mandatory when issuer_url is set." | ||||||
|  |             ) | ||||||
|  |         return client_secret | ||||||
|  | 
 | ||||||
|  |     @validator("web_service_url") | ||||||
|  |     def web_service_url_scheme_host_port(cls, val: str) -> str: | ||||||
|  |         # Tokenize the web url | ||||||
|  |         url = urlparse(val) | ||||||
|  | 
 | ||||||
|  |         if url.scheme not in ["http", "https"]: | ||||||
|  |             raise ConfigurationError( | ||||||
|  |                 f"Scheme should be http or https, found {url.scheme}" | ||||||
|  |             ) | ||||||
|  | 
 | ||||||
|  |         if not _is_valid_hostname(url.hostname.__str__()): | ||||||
|  |             raise ConfigurationError( | ||||||
|  |                 f"Not a valid hostname, hostname contains invalid characters, found {url.hostname}" | ||||||
|  |             ) | ||||||
|  | 
 | ||||||
|  |         return config_clean.remove_trailing_slashes(val) | ||||||
| @ -0,0 +1,33 @@ | |||||||
|  | from dataclasses import dataclass, field | ||||||
|  | from typing import List, Optional | ||||||
|  | 
 | ||||||
|  | from datahub.ingestion.source.state.stateful_ingestion_base import ( | ||||||
|  |     StatefulIngestionReport, | ||||||
|  | ) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | @dataclass | ||||||
|  | class PulsarSourceReport(StatefulIngestionReport): | ||||||
|  |     pulsar_version: Optional[str] = None | ||||||
|  |     tenants_scanned: Optional[int] = None | ||||||
|  |     namespaces_scanned: Optional[int] = None | ||||||
|  |     topics_scanned: Optional[int] = None | ||||||
|  |     tenants_filtered: List[str] = field(default_factory=list) | ||||||
|  |     namespaces_filtered: List[str] = field(default_factory=list) | ||||||
|  |     topics_filtered: List[str] = field(default_factory=list) | ||||||
|  |     soft_deleted_stale_entities: List[str] = field(default_factory=list) | ||||||
|  | 
 | ||||||
|  |     def report_pulsar_version(self, version: str) -> None: | ||||||
|  |         self.pulsar_version = version | ||||||
|  | 
 | ||||||
|  |     def report_tenants_dropped(self, tenant: str) -> None: | ||||||
|  |         self.tenants_filtered.append(tenant) | ||||||
|  | 
 | ||||||
|  |     def report_namespaces_dropped(self, namespace: str) -> None: | ||||||
|  |         self.namespaces_filtered.append(namespace) | ||||||
|  | 
 | ||||||
|  |     def report_topics_dropped(self, topic: str) -> None: | ||||||
|  |         self.topics_filtered.append(topic) | ||||||
|  | 
 | ||||||
|  |     def report_stale_entity_soft_deleted(self, urn: str) -> None: | ||||||
|  |         self.soft_deleted_stale_entities.append(urn) | ||||||
							
								
								
									
										239
									
								
								metadata-ingestion/tests/unit/test_pulsar_source.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										239
									
								
								metadata-ingestion/tests/unit/test_pulsar_source.py
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,239 @@ | |||||||
|  | import unittest | ||||||
|  | from typing import Any, Dict | ||||||
|  | from unittest.mock import patch | ||||||
|  | 
 | ||||||
|  | import pytest | ||||||
|  | 
 | ||||||
|  | from datahub.configuration.common import ConfigurationError | ||||||
|  | from datahub.emitter.mcp import MetadataChangeProposalWrapper | ||||||
|  | from datahub.ingestion.api.common import PipelineContext | ||||||
|  | from datahub.ingestion.source.pulsar import ( | ||||||
|  |     PulsarSchema, | ||||||
|  |     PulsarSource, | ||||||
|  |     PulsarSourceConfig, | ||||||
|  |     PulsarTopic, | ||||||
|  | ) | ||||||
|  | 
 | ||||||
|  | mock_schema_response: Dict[str, Any] = { | ||||||
|  |     "version": 1, | ||||||
|  |     "type": "AVRO", | ||||||
|  |     "timestamp": 0, | ||||||
|  |     "data": '{"type":"record","name":"FooSchema","namespace":"foo.bar","doc":"Description of FooSchema","fields":[{"name":"field1","type":{"type":"string","avro.java.string":"String"},"doc":"Description of field1"},{"name":"field2","type":"long","doc":"Some description","default":0}]}', | ||||||
|  |     "properties": {"__jsr310ConversionEnabled": "false", "__alwaysAllowNull": "true"}, | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | class TestPulsarSourceConfig: | ||||||
|  |     def test_pulsar_source_config_valid_web_service_url(self): | ||||||
|  |         assert ( | ||||||
|  |             PulsarSourceConfig().web_service_url_scheme_host_port( | ||||||
|  |                 "http://localhost:8080/" | ||||||
|  |             ) | ||||||
|  |             == "http://localhost:8080" | ||||||
|  |         ) | ||||||
|  | 
 | ||||||
|  |     def test_pulsar_source_config_invalid_web_service_url_scheme(self): | ||||||
|  |         with pytest.raises( | ||||||
|  |             ConfigurationError, match=r"Scheme should be http or https, found ftp" | ||||||
|  |         ): | ||||||
|  |             PulsarSourceConfig().web_service_url_scheme_host_port( | ||||||
|  |                 "ftp://localhost:8080/" | ||||||
|  |             ) | ||||||
|  | 
 | ||||||
|  |     def test_pulsar_source_config_invalid_web_service_url_host(self): | ||||||
|  |         with pytest.raises( | ||||||
|  |             ConfigurationError, | ||||||
|  |             match=r"Not a valid hostname, hostname contains invalid characters, found localhost&", | ||||||
|  |         ): | ||||||
|  |             PulsarSourceConfig().web_service_url_scheme_host_port( | ||||||
|  |                 "http://localhost&:8080/" | ||||||
|  |             ) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | class TestPulsarTopic: | ||||||
|  |     def test_pulsar_source_parse_topic_string(self) -> None: | ||||||
|  |         topic = "persistent://tenant/namespace/topic" | ||||||
|  |         pulsar_topic = PulsarTopic(topic) | ||||||
|  |         assert pulsar_topic.type == "persistent" | ||||||
|  |         assert pulsar_topic.tenant == "tenant" | ||||||
|  |         assert pulsar_topic.namespace == "namespace" | ||||||
|  |         assert pulsar_topic.topic == "topic" | ||||||
|  |         assert pulsar_topic.fullname == "persistent://tenant/namespace/topic" | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | class TestPulsarSchema: | ||||||
|  |     def test_pulsar_source_parse_pulsar_schema(self) -> None: | ||||||
|  |         pulsar_schema = PulsarSchema(mock_schema_response) | ||||||
|  |         assert pulsar_schema.schema_type == "AVRO" | ||||||
|  |         assert ( | ||||||
|  |             pulsar_schema.schema_str | ||||||
|  |             == '{"type":"record","name":"FooSchema","namespace":"foo.bar","doc":"Description of FooSchema","fields":[{"name":"field1","type":{"type":"string","avro.java.string":"String"},"doc":"Description of field1"},{"name":"field2","type":"long","doc":"Some description","default":0}]}' | ||||||
|  |         ) | ||||||
|  |         assert pulsar_schema.schema_name == "foo.bar.FooSchema" | ||||||
|  |         assert pulsar_schema.schema_version == 1 | ||||||
|  |         assert pulsar_schema.schema_description == "Description of FooSchema" | ||||||
|  |         assert pulsar_schema.properties == { | ||||||
|  |             "__jsr310ConversionEnabled": "false", | ||||||
|  |             "__alwaysAllowNull": "true", | ||||||
|  |         } | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | class TestPulsarSource(unittest.TestCase): | ||||||
|  |     def test_pulsar_source_get_token_jwt(self): | ||||||
|  |         ctx = PipelineContext(run_id="test") | ||||||
|  |         pulsar_source = PulsarSource.create( | ||||||
|  |             {"web_service_url": "http://localhost:8080", "token": "jwt_token"}, | ||||||
|  |             ctx, | ||||||
|  |         ) | ||||||
|  |         # source = PulsarSource( | ||||||
|  |         #    ctx=PipelineContext(run_id="pulsar-source-test"), | ||||||
|  |         #    config=self.token_config) | ||||||
|  |         assert pulsar_source.get_access_token() == "jwt_token" | ||||||
|  | 
 | ||||||
|  |     @patch("datahub.ingestion.source.pulsar.requests.get", autospec=True) | ||||||
|  |     @patch("datahub.ingestion.source.pulsar.requests.post", autospec=True) | ||||||
|  |     def test_pulsar_source_get_token_oauth(self, mock_post, mock_get): | ||||||
|  |         ctx = PipelineContext(run_id="test") | ||||||
|  |         mock_get.return_value.json.return_value = { | ||||||
|  |             "token_endpoint": "http://127.0.0.1:8083/realms/pulsar/protocol/openid-connect/token" | ||||||
|  |         } | ||||||
|  | 
 | ||||||
|  |         pulsar_source = PulsarSource.create( | ||||||
|  |             { | ||||||
|  |                 "web_service_url": "http://localhost:8080", | ||||||
|  |                 "issuer_url": "http://localhost:8083/realms/pulsar", | ||||||
|  |                 "client_id": "client_id", | ||||||
|  |                 "client_secret": "client_secret", | ||||||
|  |             }, | ||||||
|  |             ctx, | ||||||
|  |         ) | ||||||
|  |         mock_post.return_value.json.return_value = {"access_token": "oauth_token"} | ||||||
|  |         assert pulsar_source.get_access_token() == "oauth_token" | ||||||
|  | 
 | ||||||
|  |     @patch("datahub.ingestion.source.pulsar.requests.Session.get", autospec=True) | ||||||
|  |     def test_pulsar_source_get_workunits_all_tenant(self, mock_session): | ||||||
|  |         ctx = PipelineContext(run_id="test") | ||||||
|  |         pulsar_source = PulsarSource.create( | ||||||
|  |             { | ||||||
|  |                 "web_service_url": "http://localhost:8080", | ||||||
|  |             }, | ||||||
|  |             ctx, | ||||||
|  |         ) | ||||||
|  | 
 | ||||||
|  |         # Mock fetching Pulsar metadata | ||||||
|  |         with patch( | ||||||
|  |             "datahub.ingestion.source.pulsar.PulsarSource._get_pulsar_metadata" | ||||||
|  |         ) as mock: | ||||||
|  |             mock.side_effect = [ | ||||||
|  |                 ["t_1"],  # tenant list | ||||||
|  |                 ["t_1/ns_1"],  # namespaces list | ||||||
|  |                 ["persistent://t_1/ns_1/topic_1"],  # persistent topic list | ||||||
|  |                 [],  # persistent partitioned topic list | ||||||
|  |                 [],  # none-persistent topic list | ||||||
|  |                 [],  # none-persistent partitioned topic list | ||||||
|  |                 mock_schema_response, | ||||||
|  |             ]  # schema for persistent://t_1/ns_1/topic | ||||||
|  | 
 | ||||||
|  |             work_units = list(pulsar_source.get_workunits()) | ||||||
|  |             first_mcp = work_units[0].metadata | ||||||
|  |             assert isinstance(first_mcp, MetadataChangeProposalWrapper) | ||||||
|  | 
 | ||||||
|  |             # Expected calls 7 | ||||||
|  |             # http://localhost:8080/admin/v2/tenants | ||||||
|  |             # http://localhost:8080/admin/v2/namespaces/t_1 | ||||||
|  |             # http://localhost:8080/admin/v2/persistent/t_1/ns_1 | ||||||
|  |             # http://localhost:8080/admin/v2/persistent/t_1/ns_1/partitioned | ||||||
|  |             # http://localhost:8080/admin/v2/non-persistent/t_1/ns_1 | ||||||
|  |             # http://localhost:8080/admin/v2/non-persistent/t_1/ns_1/partitioned | ||||||
|  |             # http://localhost:8080/admin/v2/schemas/t_1/ns_1/topic_1/schema | ||||||
|  |             assert mock.call_count == 7 | ||||||
|  |             # expecting 5 mcp for one topic with default config | ||||||
|  |             assert len(work_units) == 5 | ||||||
|  | 
 | ||||||
|  |     @patch("datahub.ingestion.source.pulsar.requests.Session.get", autospec=True) | ||||||
|  |     def test_pulsar_source_get_workunits_custom_tenant(self, mock_session): | ||||||
|  |         ctx = PipelineContext(run_id="test") | ||||||
|  |         pulsar_source = PulsarSource.create( | ||||||
|  |             { | ||||||
|  |                 "web_service_url": "http://localhost:8080", | ||||||
|  |                 "tenants": ["t_1", "t_2"], | ||||||
|  |             }, | ||||||
|  |             ctx, | ||||||
|  |         ) | ||||||
|  | 
 | ||||||
|  |         # Mock fetching Pulsar metadata | ||||||
|  |         with patch( | ||||||
|  |             "datahub.ingestion.source.pulsar.PulsarSource._get_pulsar_metadata" | ||||||
|  |         ) as mock: | ||||||
|  |             mock.side_effect = [ | ||||||
|  |                 ["t_1/ns_1"],  # namespaces list | ||||||
|  |                 ["persistent://t_1/ns_1/topic_1"],  # topic list | ||||||
|  |                 [],  # empty persistent partitioned topic list | ||||||
|  |                 [],  # empty none-persistent topic list | ||||||
|  |                 [],  # empty none-persistent partitioned topic list | ||||||
|  |                 mock_schema_response,  # schema for persistent://t_1/ns_1/topic | ||||||
|  |                 [],  # no namespaces for tenant t_2 | ||||||
|  |             ] | ||||||
|  | 
 | ||||||
|  |             work_units = list(pulsar_source.get_workunits()) | ||||||
|  |             first_mcp = work_units[0].metadata | ||||||
|  |             assert isinstance(first_mcp, MetadataChangeProposalWrapper) | ||||||
|  | 
 | ||||||
|  |             # Expected calls 7 | ||||||
|  |             # http://localhost:8080/admin/v2/namespaces/t_1 | ||||||
|  |             # http://localhost:8080/admin/v2/persistent/t_1/ns_1 | ||||||
|  |             # http://localhost:8080/admin/v2/persistent/t_1/ns_1/partitioned | ||||||
|  |             # http://localhost:8080/admin/v2/non-persistent/t_1/ns_1 | ||||||
|  |             # http://localhost:8080/admin/v2/non-persistent/t_1/ns_1/partitioned | ||||||
|  |             # http://localhost:8080/admin/v2/schemas/t_1/ns_1/topic_1/schema | ||||||
|  |             # http://localhost:8080/admin/v2/namespaces/t_2 | ||||||
|  |             assert mock.call_count == 7 | ||||||
|  |             # expecting 5 mcp for one topic with default config | ||||||
|  |             assert len(work_units) == 5 | ||||||
|  | 
 | ||||||
|  |     @patch("datahub.ingestion.source.pulsar.requests.Session.get", autospec=True) | ||||||
|  |     def test_pulsar_source_get_workunits_patterns(self, mock_session): | ||||||
|  |         ctx = PipelineContext(run_id="test") | ||||||
|  |         pulsar_source = PulsarSource.create( | ||||||
|  |             { | ||||||
|  |                 "web_service_url": "http://localhost:8080", | ||||||
|  |                 "tenants": ["t_1", "t_2", "bad_t_3"], | ||||||
|  |                 "tenant_patterns": {"deny": ["bad_t_3"]}, | ||||||
|  |                 "namespace_patterns": {"allow": [r"t_1/ns_1"]}, | ||||||
|  |                 "topic_patterns": {"allow": [r"persistent://t_1/ns_1/topic_1"]}, | ||||||
|  |             }, | ||||||
|  |             ctx, | ||||||
|  |         ) | ||||||
|  | 
 | ||||||
|  |         # Mock fetching Pulsar metadata | ||||||
|  |         with patch( | ||||||
|  |             "datahub.ingestion.source.pulsar.PulsarSource._get_pulsar_metadata" | ||||||
|  |         ) as mock: | ||||||
|  |             mock.side_effect = [ | ||||||
|  |                 ["t_1/ns_1", "t_2/ns_1"],  # namespaces list | ||||||
|  |                 [ | ||||||
|  |                     "persistent://t_1/ns_1/topic_1",  # persistent topic list | ||||||
|  |                     "non-persistent://t_1/ns_1/bad_topic", | ||||||
|  |                 ],  # topic will be filtered out | ||||||
|  |                 [],  # persistent partitioned topic list | ||||||
|  |                 [],  # none-persistent topic list | ||||||
|  |                 [],  # none-persistent partitioned topic list | ||||||
|  |                 mock_schema_response,  # schema for persistent://t_1/ns_1/topic | ||||||
|  |                 [],  # no namespaces for tenant t_2 | ||||||
|  |             ] | ||||||
|  | 
 | ||||||
|  |             work_units = list(pulsar_source.get_workunits()) | ||||||
|  |             first_mcp = work_units[0].metadata | ||||||
|  |             assert isinstance(first_mcp, MetadataChangeProposalWrapper) | ||||||
|  | 
 | ||||||
|  |             # Expected calls 7 | ||||||
|  |             # http://localhost:8080/admin/v2/namespaces/t_1 | ||||||
|  |             # http://localhost:8080/admin/v2/persistent/t_1/ns_1 | ||||||
|  |             # http://localhost:8080/admin/v2/persistent/t_1/ns_1/partitioned | ||||||
|  |             # http://localhost:8080/admin/v2/non-persistent/t_1/ns_1 | ||||||
|  |             # http://localhost:8080/admin/v2/non-persistent/t_1/ns_1/partitioned | ||||||
|  |             # http://localhost:8080/admin/v2/schemas/t_1/ns_1/topic_1/schema | ||||||
|  |             # http://localhost:8080/admin/v2/namespaces/t_2 | ||||||
|  |             assert mock.call_count == 7 | ||||||
|  |             # expecting 5 mcp for one topic with default config | ||||||
|  |             assert len(work_units) == 5 | ||||||
| @ -456,6 +456,16 @@ | |||||||
|       "logoUrl": "/assets/platforms/trinologo.png" |       "logoUrl": "/assets/platforms/trinologo.png" | ||||||
|     } |     } | ||||||
|   }, |   }, | ||||||
|  |   { | ||||||
|  |     "urn": "urn:li:dataPlatform:pulsar", | ||||||
|  |     "aspect": { | ||||||
|  |       "datasetNameDelimiter": ".", | ||||||
|  |       "name": "pulsar", | ||||||
|  |       "displayName": "Pulsar", | ||||||
|  |       "type": "MESSAGE_BROKER", | ||||||
|  |       "logoUrl": "/assets/platforms/pulsarlogo.png" | ||||||
|  |     } | ||||||
|  |   }, | ||||||
|   { |   { | ||||||
|     "urn": "urn:li:dataPlatform:unknown", |     "urn": "urn:li:dataPlatform:unknown", | ||||||
|     "aspect": { |     "aspect": { | ||||||
|  | |||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user
	 vanmeete
						vanmeete