From 9c9e885d77f74197df53506824c0f9c37a6fb7ad Mon Sep 17 00:00:00 2001 From: harshsoni2024 <64592571+harshsoni2024@users.noreply.github.com> Date: Thu, 15 May 2025 18:20:10 +0530 Subject: [PATCH] issue-20074: s3 objects get paginated response (#21208) --- .../ingestion/source/storage/s3/metadata.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/ingestion/src/metadata/ingestion/source/storage/s3/metadata.py b/ingestion/src/metadata/ingestion/source/storage/s3/metadata.py index 58592f6913a..5d79e3e6e2a 100644 --- a/ingestion/src/metadata/ingestion/source/storage/s3/metadata.py +++ b/ingestion/src/metadata/ingestion/source/storage/s3/metadata.py @@ -64,6 +64,7 @@ from metadata.readers.file.config_source_factory import get_reader from metadata.utils import fqn from metadata.utils.filters import filter_by_container from metadata.utils.logger import ingestion_logger +from metadata.utils.s3_utils import list_s3_objects from metadata.utils.tag_utils import get_ometa_tag_and_classification, get_tag_label logger = ingestion_logger() @@ -345,14 +346,13 @@ class S3Source(StorageServiceSource): try: prefix = self._get_sample_file_prefix(metadata_entry=metadata_entry) if prefix: - response = self.s3_client.list_objects_v2( - Bucket=bucket_response.name, Prefix=prefix - ) + kwargs = {"Bucket": bucket_response.name, "Prefix": prefix} + response = list_s3_objects(self.s3_client, **kwargs) # total depth is depth of prefix + depth of the metadata entry total_depth = metadata_entry.depth + len(prefix[:-1].split("/")) candidate_keys = { "/".join(entry.get("Key").split("/")[:total_depth]) + "/" - for entry in response[S3_CLIENT_ROOT_RESPONSE] + for entry in response if entry and entry.get("Key") and len(entry.get("Key").split("/")) > total_depth @@ -464,12 +464,11 @@ class S3Source(StorageServiceSource): parent: Optional[EntityReference] = None, ): bucket_name = bucket_response.name - response = self.s3_client.list_objects_v2( - Bucket=bucket_name, Prefix=metadata_entry.dataPath - ) + kwargs = {"Bucket": bucket_name, "Prefix": metadata_entry.dataPath} + response = list_s3_objects(self.s3_client, **kwargs) candidate_keys = [ entry["Key"] - for entry in response[S3_CLIENT_ROOT_RESPONSE] + for entry in response if entry and entry.get("Key") and not entry.get("Key").endswith("/") ] for key in candidate_keys: