Fix failure in GCS datalake ingestions (#9336)

This commit is contained in:
Ayush Shah 2022-12-16 14:07:39 +05:30 committed by GitHub
parent c758d83276
commit 50d1538374
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -256,6 +256,14 @@ class DatalakeSource(DatabaseServiceSource): # pylint: disable=too-many-public-
bucket = self.client.get_bucket(bucket_name)
for key in bucket.list_blobs(prefix=prefix):
table_name = self.standardize_table_name(bucket_name, key.name)
# GCP blob listings also contain directory entries, so filter those out here
if table_name.endswith("/") or not self.check_valid_file_type(
key.name
):
logger.debug(
f"Object filtered due to unsupported file type: {key.name}"
)
continue
table_fqn = fqn.build(
self.metadata,
entity_type=Table,
@ -264,6 +272,7 @@ class DatalakeSource(DatabaseServiceSource): # pylint: disable=too-many-public-
schema_name=self.context.database_schema.name.__root__,
table_name=table_name,
)
if filter_by_table(
self.config.sourceConfig.config.tableFilterPattern,
table_fqn
@ -275,11 +284,6 @@ class DatalakeSource(DatabaseServiceSource): # pylint: disable=too-many-public-
"Object Filtered Out",
)
continue
if not self.check_valid_file_type(key.name):
logger.debug(
f"Object filtered due to unsupported file type: {key.name}"
)
continue
yield table_name, TableType.Regular
if isinstance(self.service_connection.configSource, S3Config):