Azure datalake minor changes (#9407)
commit 9a3d599f30 (parent 4fdf14f2da)

The change renames the `AzureDatalakeConfig` config source to `AzureConfig` across the ingestion source, the datalake client factory, the docs, and the JSON schema, and adds prefix-based container/blob filtering plus credential-based file reads for Azure Datalake.
@@ -29,7 +29,7 @@ from metadata.generated.schema.entity.data.table import (
     TableType,
 )
 from metadata.generated.schema.entity.services.connections.database.datalakeConnection import (
-    AzureDatalakeConfig,
+    AzureConfig,
     DatalakeConnection,
     GCSConfig,
     S3Config,
@@ -193,11 +193,19 @@ class DatalakeSource(DatabaseServiceSource):  # pylint: disable=too-many-public-
         else:
             yield from self.fetch_s3_bucket_names()

-        if isinstance(self.service_connection.configSource, AzureDatalakeConfig):
+        if isinstance(self.service_connection.configSource, AzureConfig):
             yield from self.get_container_names()

     def get_container_names(self) -> Iterable[str]:
-        schema_names = self.client.list_containers(name_starts_with="")
         """
         To get schema names
         """
+        prefix = (
+            self.service_connection.bucketName
+            if self.service_connection.bucketName
+            else ""
+        )
+        schema_names = self.client.list_containers(name_starts_with=prefix)
         for schema in schema_names:
             schema_fqn = fqn.build(
                 self.metadata,
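For context on the new prefix handling: `BlobServiceClient.list_containers` in azure-storage-blob accepts a `name_starts_with` filter, which is what the `bucketName`-derived prefix feeds. A minimal sketch, with account URL, credential, and prefix as illustrative placeholders:

from azure.storage.blob import BlobServiceClient

# Placeholder account and credential; in the source these come from the
# service connection's securityConfig.
client = BlobServiceClient(
    account_url="https://<account_name>.blob.core.windows.net/",
    credential="<credential>",
)

# An empty prefix lists every container; a non-empty prefix narrows the
# result server-side, which is why bucketName falls back to "".
for container in client.list_containers(name_starts_with="raw-"):
    print(container.name)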
@@ -319,9 +327,9 @@ class DatalakeSource(DatabaseServiceSource):  # pylint: disable=too-many-public-
                     continue

                 yield table_name, TableType.Regular
-        if isinstance(self.service_connection.configSource, AzureDatalakeConfig):
+        if isinstance(self.service_connection.configSource, AzureConfig):
             files_names = self.get_tables(container_name=bucket_name)
-            for file in files_names.list_blobs():
+            for file in files_names.list_blobs(name_starts_with=prefix):
                 file_name = file.name
                 if "/" in file.name:
                     table_name = self.standardize_table_name(bucket_name, file_name)
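The per-container listing works the same way: `ContainerClient.list_blobs` also accepts `name_starts_with`. A sketch with placeholder names:

from azure.storage.blob import BlobServiceClient

client = BlobServiceClient(
    account_url="https://<account_name>.blob.core.windows.net/",
    credential="<credential>",
)
container_client = client.get_container_client("my-container")

# Only blobs whose names start with the prefix are returned; a name
# containing "/" is treated by the source as a file under a pseudo-folder.
for blob in container_client.list_blobs(name_starts_with="data/"):
    print(blob.name)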
@@ -376,12 +384,14 @@ class DatalakeSource(DatabaseServiceSource):  # pylint: disable=too-many-public-
                 data_frame = self.get_s3_files(
                     client=self.client, key=table_name, bucket_name=schema_name
                 )
-            if isinstance(self.service_connection.configSource, AzureDatalakeConfig):
+            if isinstance(self.service_connection.configSource, AzureConfig):
                 columns = None
                 connection_args = self.service_connection.configSource.securityConfig
+                storage_options = {
+                    "tenant_id": connection_args.tenantId,
+                    "client_id": connection_args.clientId,
+                    "client_secret": connection_args.clientSecret.get_secret_value(),
+                    "account_name": connection_args.accountName,
+                }
                 data_frame = self.get_azure_files(
                     client=self.client,
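The `storage_options` keys built here (tenant_id, client_id, client_secret, account_name) match what the adlfs/fsspec backend expects, so a DataFrame read can target the blob path directly. A hedged sketch, assuming adlfs is installed; the path and credential values are placeholders:

import pandas as pd

# Same four keys the diff assembles from securityConfig; adlfs uses them
# to authenticate the abfs:// filesystem behind pandas.
storage_options = {
    "tenant_id": "<tenant_id>",
    "client_id": "<client_id>",
    "client_secret": "<client_secret>",
    "account_name": "<account_name>",
}

df = pd.read_csv(
    "abfs://my-container/path/to/table.csv",
    storage_options=storage_options,
)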
@@ -565,7 +575,8 @@ class DatalakeSource(DatabaseServiceSource):  # pylint: disable=too-many-public-
             return False

     def close(self):
-        pass
+        if isinstance(self.service_connection.configSource, AzureConfig):
+            self.client.close()

     def get_status(self) -> SourceStatus:
         return self.status
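Closing the client is worthwhile because `BlobServiceClient` keeps an HTTP session open. A small sketch of the same idea in isolation, with placeholder values:

from azure.storage.blob import BlobServiceClient

client = BlobServiceClient(
    account_url="https://<account_name>.blob.core.windows.net/",
    credential="<credential>",
)
try:
    containers = [c.name for c in client.list_containers()]
finally:
    # Releases the underlying connection, as the source's close() now
    # does when the config source is AzureConfig.
    client.close()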
@@ -95,7 +95,7 @@ from metadata.generated.schema.entity.services.connections.database.databricksCo
     DatabricksConnection,
 )
 from metadata.generated.schema.entity.services.connections.database.datalakeConnection import (
-    AzureDatalakeConfig,
+    AzureConfig,
     DatalakeConnection,
     GCSConfig,
     S3Config,
@@ -1005,7 +1005,7 @@ def _(connection: DatalakeClient) -> None:
         else:
             connection.client.list_buckets()

-        if isinstance(config, AzureDatalakeConfig):
+        if isinstance(config, AzureConfig):
             connection.client.list_containers(name_starts_with="")

     except ClientError as err:
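The connection test relies on a cheap authenticated call failing fast on bad credentials. A sketch of that probe pattern (the function name is illustrative, not from the diff):

from azure.core.exceptions import ClientAuthenticationError

def test_azure_connection(client) -> None:
    try:
        # Forces one authenticated request without fetching any data.
        next(iter(client.list_containers(name_starts_with="")), None)
    except ClientAuthenticationError as err:
        raise RuntimeError(f"Azure authentication failed: {err}") from err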
@@ -1050,7 +1050,7 @@ def _(config: GCSConfig):


 @get_datalake_client.register
-def _(config: AzureDatalakeConfig):
+def _(config: AzureConfig):
     from azure.identity import ClientSecretCredential
     from azure.storage.blob import BlobServiceClient

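For reference, a sketch of what a factory registered for AzureConfig presumably builds from the securityConfig fields, using placeholder values:

from azure.identity import ClientSecretCredential
from azure.storage.blob import BlobServiceClient

# Service-principal credential from the app registration described in
# the docs change below.
credential = ClientSecretCredential(
    tenant_id="<tenant_id>",
    client_id="<client_id>",
    client_secret="<client_secret>",
)
client = BlobServiceClient(
    account_url="https://<account_name>.blob.core.windows.net/",
    credential=credential,
)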
@@ -275,6 +275,8 @@ We support two ways of authenticating to GCS:
 - `Storage Blob Data Contributor`
 - `Storage Queue Data Contributor`

+The current approach for authentication is based on `app registration`; reach out to us on [slack](https://slack.open-metadata.org/) if you find the need for another auth system.
+
 </Collapse>
@@ -16,7 +16,7 @@
     },
     "GCSConfig": {
       "title": "DataLake GCS Config Source",
-      "description": "DataLake Catalog and Manifest files in GCS storage. We will search for catalog.json and manifest.json.",
+      "description": "DataLake GCS storage will ingest metadata of files",
       "properties": {
         "securityConfig": {
           "title": "DataLake GCS Security Config",
@@ -26,7 +26,7 @@
     },
     "S3Config": {
       "title": "DataLake S3 Config Source",
-      "description": "DataLake Catalog and Manifest files in S3 bucket. We will search for catalog.json and manifest.json.",
+      "description": "DataLake S3 bucket will ingest metadata of files in bucket",
       "properties": {
         "securityConfig": {
           "title": "DataLake S3 Security Config",
@@ -34,9 +34,9 @@
         }
       }
     },
-    "AzureDatalakeConfig": {
+    "AzureConfig": {
       "title": "Azure Config Source",
-      "description": "Azure Datalake Storage",
+      "description": "Azure Datalake Storage will ingest files in container",
       "properties": {
         "securityConfig": {
           "title": "Azure Datalake Config Source",
@@ -63,7 +63,7 @@
           "$ref": "#/definitions/GCSConfig"
         },
         {
-          "$ref": "#/definitions/AzureDatalakeConfig"
+          "$ref": "#/definitions/AzureConfig"
         }
       ]
     },
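Putting the renamed definition together, a hypothetical `configSource` payload would carry the four securityConfig fields the ingestion code above reads (expressed here as a Python dict; values are placeholders):

# Hypothetical configSource instance after the rename; the field names
# mirror connection_args.tenantId/clientId/clientSecret/accountName
# used to build storage_options in the ingestion diff.
config_source = {
    "securityConfig": {
        "clientId": "<client_id>",
        "clientSecret": "<client_secret>",
        "tenantId": "<tenant_id>",
        "accountName": "<account_name>",
    }
}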