Mirror of https://github.com/datahub-project/datahub.git (synced 2025-10-31 10:49:00 +00:00)
	feat(s3/ingest): performance improvements for get_dir_to_process and get_folder_info (#14709)
commit 5da54bf14d
parent f7ea7f033d
Author: Michael Maltese
@@ -126,7 +126,10 @@ def list_folders_path(


 def list_objects_recursive_path(
-    s3_uri: str, *, startswith: str, aws_config: Optional[AwsConnectionConfig]
+    s3_uri: str,
+    *,
+    startswith: str = "",
+    aws_config: Optional[AwsConnectionConfig] = None,
 ) -> Iterable["ObjectSummary"]:
     """
     Given an S3 URI to a folder or bucket, return all objects underneath that URI, optionally
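Both keyword arguments now have defaults, so the common case needs no extra arguments. A minimal usage sketch (the bucket, key layout, and prefix below are hypothetical):

    from datahub.ingestion.source.aws.s3_boto_utils import list_objects_recursive_path

    # startswith="" and aws_config=None are now implied.
    for obj in list_objects_recursive_path("s3://my-bucket/raw/events/"):
        print(obj.key, obj.size)  # boto3 ObjectSummary attributes

    # Passing a key prefix narrows the listing on the server side.
    for obj in list_objects_recursive_path("s3://my-bucket/raw/events/", startswith="part"):
        print(obj.key)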
@@ -194,6 +194,9 @@ class PathSpec(ConfigModel):
         return True

     def dir_allowed(self, path: str) -> bool:
+        if not path.endswith("/"):
+            path += "/"
+
         if self.glob_include.endswith("**"):
             return self.allowed(path, ignore_ext=True)

@@ -221,9 +224,8 @@ class PathSpec(ConfigModel):
                 ):
                     return False

-        file_name_pattern = self.include.rsplit("/", 1)[1]
         table_name, _ = self.extract_table_name_and_path(
-            os.path.join(path, file_name_pattern)
+            path + self.get_remaining_glob_include(path)
         )
         if not self.tables_filter_pattern.allowed(table_name):
             return False
@@ -571,3 +573,38 @@ class PathSpec(ConfigModel):
                 "/".join(path.split("/")[:depth]) + "/" + parsed_vars.named["table"]
             )
         return self._extract_table_name(parsed_vars.named), table_path
+
+    def has_correct_number_of_directory_components(self, path: str) -> bool:
+        """
+        Checks that a given path has the same number of components as the path spec
+        has directory components. Useful for checking if a path needs to descend further
+        into child directories or if the source can switch into file listing mode. If the
+        glob form of the path spec ends in "**", this always returns False.
+        """
+        if self.glob_include.endswith("**"):
+            return False
+
+        if not path.endswith("/"):
+            path += "/"
+        path_slash = path.count("/")
+        glob_slash = self.glob_include.count("/")
+        if path_slash == glob_slash:
+            return True
+        return False
+
+    def get_remaining_glob_include(self, path: str) -> str:
+        """
+        Given a path, return the remaining components of the path spec (if any
+        exist) in glob form. If the glob form of the path spec ends in "**", this
+        function's return value also always ends in "**", regardless of how
+        many components the input path has.
+        """
+        if not path.endswith("/"):
+            path += "/"
+        path_slash = path.count("/")
+        remainder = "/".join(self.glob_include.split("/")[path_slash:])
+        if remainder:
+            return remainder
+        if self.glob_include.endswith("**"):
+            return "**"
+        return ""
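Both new helpers reduce to counting slashes against glob_include, i.e. the include pattern with each {...} template replaced by "*". A standalone sketch of the same logic, using a hypothetical path spec:

    def remaining_glob_include(glob_include: str, path: str) -> str:
        # Mirrors PathSpec.get_remaining_glob_include: drop as many leading
        # glob components as the path already covers.
        if not path.endswith("/"):
            path += "/"
        remainder = "/".join(glob_include.split("/")[path.count("/"):])
        if remainder:
            return remainder
        return "**" if glob_include.endswith("**") else ""

    # include "s3://bucket/{table}/year={year}/*.json" compiles to this glob form:
    glob = "s3://bucket/*/year=*/*.json"

    assert remaining_glob_include(glob, "s3://bucket/tbl") == "year=*/*.json"
    assert remaining_glob_include(glob, "s3://bucket/tbl/year=2024/") == "*.json"

    # has_correct_number_of_directory_components is the same count compared for
    # equality: "s3://bucket/tbl/year=2024/" and the glob both contain 5 slashes,
    # so only the filename component remains and file listing can begin.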
@@ -3,14 +3,14 @@ import functools
 import logging
 import os
 import pathlib
+import posixpath
 import re
 import time
 from datetime import datetime
 from pathlib import PurePath
-from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Tuple
+from typing import Any, Dict, Iterable, List, Optional, Tuple

 import smart_open.compression as so_compression
-from more_itertools import peekable
 from pyspark.conf import SparkConf
 from pyspark.sql import SparkSession
 from pyspark.sql.dataframe import DataFrame
@@ -36,9 +36,7 @@ from datahub.ingestion.api.source import MetadataWorkUnitProcessor
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.aws.s3_boto_utils import (
     get_s3_tags,
-    list_folders,
     list_folders_path,
-    list_objects_recursive,
     list_objects_recursive_path,
 )
 from datahub.ingestion.source.aws.s3_util import (
@@ -83,9 +81,6 @@ from datahub.metadata.schema_classes import (
 from datahub.telemetry import stats, telemetry
 from datahub.utilities.perf_timer import PerfTimer

-if TYPE_CHECKING:
-    from mypy_boto3_s3.service_resource import Bucket
-
 # hide annoying debug errors from py4j
 logging.getLogger("py4j").setLevel(logging.ERROR)
 logger: logging.Logger = logging.getLogger(__name__)
@@ -872,35 +867,44 @@ class S3Source(StatefulIngestionSourceBase):

     def get_dir_to_process(
         self,
-        bucket_name: str,
-        folder: str,
+        uri: str,
         path_spec: PathSpec,
-        protocol: str,
         min: bool = False,
     ) -> List[str]:
-        # if len(path_spec.include.split("/")) == len(f"{protocol}{bucket_name}/{folder}".split("/")):
-        #    return [f"{protocol}{bucket_name}/{folder}"]
+        # Add any remaining parts of the path_spec before globs, excluding the
+        # final filename component, to the URI and prefix so that we don't
+        # unnecessarily list too many objects.
+        if not uri.endswith("/"):
+            uri += "/"
+        remaining = posixpath.dirname(path_spec.get_remaining_glob_include(uri)).split(
+            "*"
+        )[0]
+        uri += posixpath.dirname(remaining)
+        prefix = posixpath.basename(remaining)

-        iterator = list_folders(
-            bucket_name=bucket_name,
-            prefix=folder,
+        # Check if we're at the end of the include path. If so, no need to list sub-folders.
+        if path_spec.has_correct_number_of_directory_components(uri):
+            return [uri]
+
+        logger.debug(f"get_dir_to_process listing folders {uri=} {prefix=}")
+        iterator = list_folders_path(
+            s3_uri=uri,
+            startswith=prefix,
             aws_config=self.source_config.aws_config,
         )
-        iterator = peekable(iterator)
-        if iterator:
         sorted_dirs = sorted(
             iterator,
-                key=functools.cmp_to_key(partitioned_folder_comparator),
+            key=lambda dir: functools.cmp_to_key(partitioned_folder_comparator)(
+                dir.name
+            ),
             reverse=not min,
         )
         folders = []
         for dir in sorted_dirs:
-                if path_spec.dir_allowed(f"{protocol}{bucket_name}/{dir}/"):
+            if path_spec.dir_allowed(dir.path):
                 folders_list = self.get_dir_to_process(
-                        bucket_name=bucket_name,
-                        folder=dir + "/",
+                    uri=dir.path,
                     path_spec=path_spec,
-                        protocol=protocol,
                     min=min,
                 )
                 folders.extend(folders_list)
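To make the prefix derivation concrete: if the remaining glob for the current URI were "year=*/month=*/part*.json" (hypothetical), the arithmetic runs as below, so list_folders_path receives startswith="year=" and S3 filters the folder listing server-side instead of returning every child folder:

    import posixpath

    remaining_glob = "year=*/month=*/part*.json"

    # Drop the filename component, then keep the literal text before the first "*".
    remaining = posixpath.dirname(remaining_glob).split("*")[0]  # "year="
    extra_dirs = posixpath.dirname(remaining)   # "" (nothing literal to append to the URI)
    prefix = posixpath.basename(remaining)      # "year="

    # With a partially literal spec such as "year=2022/month=*/*.json", the same
    # arithmetic yields extra_dirs == "year=2022" and prefix == "month=", so the
    # source descends straight into year=2022/ and lists only month=* folders.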
@@ -909,18 +913,16 @@ class S3Source(StatefulIngestionSourceBase):
         if folders:
             return folders
         else:
-                return [f"{protocol}{bucket_name}/{folder}"]
-        return [f"{protocol}{bucket_name}/{folder}"]
+            return [uri]

     def get_folder_info(
         self,
         path_spec: PathSpec,
-        bucket: "Bucket",
-        prefix: str,
+        uri: str,
     ) -> Iterable[Folder]:
         """
-        Retrieves all the folders in a path by listing all the files in the prefix.
-        If the prefix is a full path then only that folder will be extracted.
+        Retrieves all the folders in a path by recursively listing all the files under the
+        given URI.

         A folder has creation and modification times, size, and a sample file path.
         - Creation time is the earliest creation time of all files in the folder.
@@ -930,8 +932,7 @@ class S3Source(StatefulIngestionSourceBase):

         Parameters:
         path_spec (PathSpec): The path specification used to determine partitioning.
-        bucket (Bucket): The S3 bucket object.
-        prefix (str): The prefix path in the S3 bucket to list objects from.
+        uri (str): The path in the S3 bucket to list objects from.

         Returns:
         List[Folder]: A list of Folder objects representing the partitions found.
@@ -947,12 +948,22 @@ class S3Source(StatefulIngestionSourceBase):
                 self.report.report_file_dropped(s3_uri)
             return allowed

+        # Add any remaining parts of the path_spec before globs to the URI and prefix,
+        # so that we don't unnecessarily list too many objects.
+        if not uri.endswith("/"):
+            uri += "/"
+        remaining = path_spec.get_remaining_glob_include(uri).split("*")[0]
+        uri += posixpath.dirname(remaining)
+        prefix = posixpath.basename(remaining)
+
         # Process objects in a memory-efficient streaming fashion
         # Instead of loading all objects into memory, we'll accumulate folder data incrementally
         folder_data: Dict[str, FolderInfo] = {}  # dirname -> FolderInfo

-        for obj in list_objects_recursive(
-            bucket.name, prefix, self.source_config.aws_config
+        logger.info(f"Listing objects under {repr(uri)} with {prefix=}")
+
+        for obj in list_objects_recursive_path(
+            uri, startswith=prefix, aws_config=self.source_config.aws_config
         ):
             s3_path = self.create_s3_path(obj.bucket_name, obj.key)
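Here the filename component is kept, unlike in get_dir_to_process, because a literal filename prefix can narrow the recursive object listing too. With a hypothetical remaining glob of "part*.json":

    import posixpath

    remaining = "part*.json".split("*")[0]      # "part"
    extra_dirs = posixpath.dirname(remaining)   # "" -> the URI is unchanged
    prefix = posixpath.basename(remaining)      # "part"

    # list_objects_recursive_path(uri, startswith="part", ...) then returns only
    # keys beginning with "part" under the partition folder.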
@@ -1047,7 +1058,7 @@ class S3Source(StatefulIngestionSourceBase):
             # This creates individual file-level datasets
             yield from self._process_simple_path(path_spec)

-    def _process_templated_path(self, path_spec: PathSpec) -> Iterable[BrowsePath]:  # noqa: C901
+    def _process_templated_path(self, path_spec: PathSpec) -> Iterable[BrowsePath]:
         """
         Process S3 paths containing {table} templates to create table-level datasets.

@@ -1133,20 +1144,12 @@ class S3Source(StatefulIngestionSourceBase):

                 # STEP 4: Process each table folder to create a table-level dataset
                 for folder in table_folders:
-                    bucket_name = get_bucket_name(folder.path)
-                    table_folder = get_bucket_relative_path(folder.path)
-                    bucket = s3.Bucket(bucket_name)
-
-                    # Create the full S3 path for this table
-                    table_s3_path = self.create_s3_path(bucket_name, table_folder)
-                    logger.info(
-                        f"Processing table folder: {table_folder} -> {table_s3_path}"
-                    )
+                    logger.info(f"Processing table path: {folder.path}")

                     # Extract table name using the ORIGINAL path spec pattern matching (not the modified one)
                     # This uses the compiled regex pattern to extract the table name from the full path
                     table_name, _ = self.extract_table_name_and_path(
-                        path_spec, table_s3_path
+                        path_spec, folder.path
                     )

                     # Apply table name filtering if configured
@@ -1155,90 +1158,52 @@ class S3Source(StatefulIngestionSourceBase):
                         continue

                     # STEP 5: Handle partition traversal based on configuration
-                    # Get all partition folders first
-                    all_partition_folders = list(
-                        list_folders(
-                            bucket_name, table_folder, self.source_config.aws_config
-                        )
-                    )
-                    logger.info(
-                        f"Found {len(all_partition_folders)} partition folders under table {table_name} using method {path_spec.traversal_method}"
-                    )
-
-                    if all_partition_folders:
-                        # Apply the same traversal logic as the original code
                     dirs_to_process = []

                     if path_spec.traversal_method == FolderTraversalMethod.ALL:
                         # Process ALL partitions (original behavior)
-                            dirs_to_process = all_partition_folders
+                        dirs_to_process = [folder.path]
                         logger.debug(
-                                f"Processing ALL {len(all_partition_folders)} partitions"
+                            f"Processing ALL partition folders under: {folder.path}"
                         )
                     else:
                         # Use the original get_dir_to_process logic for MIN/MAX
-                            protocol = "s3://"  # Default protocol for S3
-
                         if (
-                                path_spec.traversal_method
-                                == FolderTraversalMethod.MIN_MAX
-                                or path_spec.traversal_method
-                                == FolderTraversalMethod.MAX
+                            path_spec.traversal_method == FolderTraversalMethod.MIN_MAX
+                            or path_spec.traversal_method == FolderTraversalMethod.MAX
                         ):
                             # Get MAX partition using original logic
                             dirs_to_process_max = self.get_dir_to_process(
-                                    bucket_name=bucket_name,
-                                    folder=table_folder + "/",
+                                uri=folder.path,
                                 path_spec=path_spec,
-                                    protocol=protocol,
                                 min=False,
                             )
                             if dirs_to_process_max:
-                                    # Convert full S3 paths back to relative paths for processing
-                                    dirs_to_process.extend(
-                                        [
-                                            d.replace(f"{protocol}{bucket_name}/", "")
-                                            for d in dirs_to_process_max
-                                        ]
-                                    )
+                                dirs_to_process.extend(dirs_to_process_max)
                                 logger.debug(
                                     f"Added MAX partition: {dirs_to_process_max}"
                                 )

-                            if (
-                                path_spec.traversal_method
-                                == FolderTraversalMethod.MIN_MAX
-                            ):
+                        if path_spec.traversal_method == FolderTraversalMethod.MIN_MAX:
                             # Get MIN partition using original logic
                             dirs_to_process_min = self.get_dir_to_process(
-                                    bucket_name=bucket_name,
-                                    folder=table_folder + "/",
+                                uri=folder.path,
                                 path_spec=path_spec,
-                                    protocol=protocol,
                                 min=True,
                             )
                             if dirs_to_process_min:
-                                    # Convert full S3 paths back to relative paths for processing
-                                    dirs_to_process.extend(
-                                        [
-                                            d.replace(f"{protocol}{bucket_name}/", "")
-                                            for d in dirs_to_process_min
-                                        ]
-                                    )
+                                dirs_to_process.extend(dirs_to_process_min)
                                 logger.debug(
                                     f"Added MIN partition: {dirs_to_process_min}"
                                 )

                     # Process the selected partitions
                     all_folders = []
-                        for partition_folder in dirs_to_process:
-                            # Ensure we have a clean folder path
-                            clean_folder = partition_folder.rstrip("/")
-
-                            logger.info(f"Scanning files in partition: {clean_folder}")
+                    for partition_path in dirs_to_process:
+                        logger.info(f"Scanning files in partition: {partition_path}")
                         partition_files = list(
-                                self.get_folder_info(path_spec, bucket, clean_folder)
+                            self.get_folder_info(path_spec, partition_path)
                         )
                         all_folders.extend(partition_files)
@@ -1267,10 +1232,6 @@ class S3Source(StatefulIngestionSourceBase):
                         logger.warning(
                             f"No files found in processed partitions for table {table_name}"
                         )
-                    else:
-                        logger.warning(
-                            f"No partition folders found under table {table_name}"
-                        )

         except Exception as e:
             if isinstance(e, s3.meta.client.exceptions.NoSuchBucket):
@@ -8,13 +8,13 @@
         "json": {
             "customProperties": {
                 "schema_inferred_from": "gs://my-test-bucket/folder_a/folder_aa/folder_aaa/folder_aaaa/pokemon_abilities_yearwise_2021/month=march/part2.json",
-                "number_of_partitions": "6"
+                "number_of_partitions": "1"
             },
             "externalUrl": "https://console.cloud.google.com/storage/browser/my-test-bucket/folder_a/folder_aa/folder_aaa/folder_aaaa",
             "name": "folder_aaaa",
             "description": "",
             "created": {
-                "time": 1586847680000
+                "time": 1586847780000
             },
             "lastModified": {
                 "time": 1586847790000
@@ -628,9 +628,9 @@
     "aspect": {
         "json": {
             "minPartition": {
-                "partition": "partition_0=pokemon_abilities_yearwise_2019/partition_1=month=feb",
-                "createdTime": 1586847680000,
-                "lastModifiedTime": 1586847690000
+                "partition": "partition_0=pokemon_abilities_yearwise_2021/partition_1=month=march",
+                "createdTime": 1586847780000,
+                "lastModifiedTime": 1586847790000
             },
             "maxPartition": {
                 "partition": "partition_0=pokemon_abilities_yearwise_2021/partition_1=month=march",
@@ -8,13 +8,13 @@
         "json": {
             "customProperties": {
                 "schema_inferred_from": "s3://my-test-bucket/folder_a/folder_aa/folder_aaa/folder_aaaa/pokemon_abilities_yearwise_2021/month=march/part2.json",
-                "number_of_partitions": "6"
+                "number_of_partitions": "1"
             },
             "externalUrl": "https://us-east-1.console.aws.amazon.com/s3/buckets/my-test-bucket?prefix=folder_a/folder_aa/folder_aaa/folder_aaaa",
             "name": "folder_aaaa",
             "description": "",
             "created": {
-                "time": 1586847680000
+                "time": 1586847780000
             },
             "lastModified": {
                 "time": 1586847790000
@@ -628,9 +628,9 @@
     "aspect": {
         "json": {
             "minPartition": {
-                "partition": "partition_0=pokemon_abilities_yearwise_2019/partition_1=month=feb",
-                "createdTime": 1586847680000,
-                "lastModifiedTime": 1586847690000
+                "partition": "partition_0=pokemon_abilities_yearwise_2021/partition_1=month=march",
+                "createdTime": 1586847780000,
+                "lastModifiedTime": 1586847790000
             },
             "maxPartition": {
                 "partition": "partition_0=pokemon_abilities_yearwise_2021/partition_1=month=march",
@@ -2,7 +2,7 @@ import json
 import logging
 import os
 from datetime import datetime
-from unittest.mock import patch
+from unittest.mock import Mock, call, patch

 import moto.s3
 import pytest
@@ -11,6 +11,11 @@ from moto import mock_s3
 from pydantic import ValidationError

 from datahub.ingestion.run.pipeline import Pipeline, PipelineContext
+from datahub.ingestion.source.aws.aws_common import AwsConnectionConfig
+from datahub.ingestion.source.aws.s3_boto_utils import (
+    list_folders_path,
+    list_objects_recursive_path,
+)
 from datahub.ingestion.source.s3.source import S3Source
 from datahub.testing import mce_helpers

@@ -367,3 +372,158 @@ def test_data_lake_incorrect_config_raises_error(tmp_path, mock_time):
     }
     with pytest.raises(ValidationError, match=r"\*\*"):
         S3Source.create(source, ctx)
+
+
+@pytest.mark.parametrize(
+    "calls_test_tuple",
+    [
+        (
+            "partitions_and_filename_with_prefix",
+            {
+                "include": "s3://my-test-bucket/folder_a/folder_aa/folder_aaa/{table}/year={year}/month={month}/part*.json",
+                "tables_filter_pattern": {"allow": ["^pokemon_abilities_json$"]},
+            },
+            [
+                call.list_folders_path(
+                    "s3://my-test-bucket/folder_a/folder_aa/folder_aaa/"
+                ),
+                call.list_folders_path(
+                    s3_uri="s3://my-test-bucket/folder_a/folder_aa/folder_aaa/pokemon_abilities_json/",
+                    startswith="year=",
+                ),
+                call.list_folders_path(
+                    s3_uri="s3://my-test-bucket/folder_a/folder_aa/folder_aaa/pokemon_abilities_json/year=2022/",
+                    startswith="month=",
+                ),
+                call.list_objects_recursive_path(
+                    "s3://my-test-bucket/folder_a/folder_aa/folder_aaa/pokemon_abilities_json/year=2022/month=jan/",
+                    startswith="part",
+                ),
+            ],
+        ),
+        (
+            "filter_specific_partition",
+            {
+                "include": "s3://my-test-bucket/folder_a/folder_aa/folder_aaa/{table}/year=2022/month={month}/*.json",
+                "tables_filter_pattern": {"allow": ["^pokemon_abilities_json$"]},
+            },
+            [
+                call.list_folders_path(
+                    "s3://my-test-bucket/folder_a/folder_aa/folder_aaa/"
+                ),
+                call.list_folders_path(
+                    s3_uri="s3://my-test-bucket/folder_a/folder_aa/folder_aaa/pokemon_abilities_json/year=2022",
+                    startswith="month=",
+                ),
+                call.list_objects_recursive_path(
+                    "s3://my-test-bucket/folder_a/folder_aa/folder_aaa/pokemon_abilities_json/year=2022/month=jan/",
+                    startswith="",
+                ),
+            ],
+        ),
+        (
+            "partition_autodetection",
+            {
+                "include": "s3://my-test-bucket/folder_a/folder_aa/folder_aaa/{table}/",
+                "tables_filter_pattern": {"allow": ["^pokemon_abilities_json$"]},
+            },
+            [
+                call.list_folders_path(
+                    "s3://my-test-bucket/folder_a/folder_aa/folder_aaa/"
+                ),
+                call.list_folders_path(
+                    s3_uri="s3://my-test-bucket/folder_a/folder_aa/folder_aaa/pokemon_abilities_json/",
+                    startswith="",
+                ),
+                call.list_folders_path(
+                    s3_uri="s3://my-test-bucket/folder_a/folder_aa/folder_aaa/pokemon_abilities_json/year=2022/",
+                    startswith="",
+                ),
+                call.list_folders_path(
+                    s3_uri="s3://my-test-bucket/folder_a/folder_aa/folder_aaa/pokemon_abilities_json/year=2022/month=jan/",
+                    startswith="",
+                ),
+                call.list_objects_recursive_path(
+                    "s3://my-test-bucket/folder_a/folder_aa/folder_aaa/pokemon_abilities_json/year=2022/month=jan/",
+                    startswith="",
+                ),
+            ],
+        ),
+        (
+            "partitions_traversal_all",
+            {
+                "include": "s3://my-test-bucket/folder_a/folder_aa/folder_aaa/{table}/year={year}/month={month}/*.json",
+                "tables_filter_pattern": {"allow": ["^pokemon_abilities_json$"]},
+                "traversal_method": "ALL",
+            },
+            [
+                call.list_folders_path(
+                    "s3://my-test-bucket/folder_a/folder_aa/folder_aaa/"
+                ),
+                call.list_objects_recursive_path(
+                    "s3://my-test-bucket/folder_a/folder_aa/folder_aaa/pokemon_abilities_json/",
+                    startswith="year=",
+                ),
+            ],
+        ),
+        (
+            "filter_specific_partition_traversal_all",
+            {
+                "include": "s3://my-test-bucket/folder_a/folder_aa/folder_aaa/{table}/year=2022/month={month}/part*.json",
+                "tables_filter_pattern": {"allow": ["^pokemon_abilities_json$"]},
+                "traversal_method": "ALL",
+            },
+            [
+                call.list_folders_path(
+                    "s3://my-test-bucket/folder_a/folder_aa/folder_aaa/"
+                ),
+                call.list_objects_recursive_path(
+                    "s3://my-test-bucket/folder_a/folder_aa/folder_aaa/pokemon_abilities_json/year=2022",
+                    startswith="month=",
+                ),
+            ],
+        ),
+    ],
+    ids=lambda calls_test_tuple: calls_test_tuple[0],
+)
+def test_data_lake_s3_calls(s3_populate, calls_test_tuple):
+    _, path_spec, expected_calls = calls_test_tuple
+
+    ctx = PipelineContext(run_id="test-s3")
+    config = {
+        "path_specs": [path_spec],
+        "aws_config": {
+            "aws_region": "us-east-1",
+            "aws_access_key_id": "testing",
+            "aws_secret_access_key": "testing",
+        },
+    }
+    source = S3Source.create(config, ctx)
+
+    m = Mock()
+    m.list_folders_path.side_effect = list_folders_path
+    m.list_objects_recursive_path.side_effect = list_objects_recursive_path
+
+    with (
+        patch(
+            "datahub.ingestion.source.s3.source.list_folders_path", m.list_folders_path
+        ),
+        patch(
+            "datahub.ingestion.source.s3.source.list_objects_recursive_path",
+            m.list_objects_recursive_path,
+        ),
+    ):
+        for _ in source.get_workunits_internal():
+            pass
+
+    # Verify S3 calls. We're checking that we make the minimum necessary calls with
+    # prefixes when possible to reduce the amount of queries to the S3 API.
+    calls = []
+    for c in m.mock_calls:
+        if isinstance(c.kwargs, dict):  # type assertion
+            c.kwargs.pop("aws_config", None)
+        if len(c.args) == 3 and isinstance(c.args[2], AwsConnectionConfig):
+            c = getattr(call, c[0])(*(c.args[:2]), **c.kwargs)
+        calls.append(c)
+
+    assert calls == expected_calls
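The normalization loop above is needed because the first folder listing is invoked positionally while later calls use keyword arguments, and every real call carries an aws_config that the expected calls omit. A sketch of how a recorded mock call is rebuilt without the trailing config (the config value here is a stand-in):

    from unittest.mock import Mock, call

    m = Mock()
    m.list_folders_path("s3://bucket/a/", "year=", "FAKE_AWS_CONFIG")

    c = m.mock_calls[0]
    # c[0] is the call name; Call.args / Call.kwargs expose the recorded arguments.
    normalized = getattr(call, c[0])(*c.args[:2], **dict(c.kwargs))
    assert normalized == call.list_folders_path("s3://bucket/a/", "year=")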
@@ -331,6 +331,25 @@ def test_dir_allowed_with_debug_logging() -> None:
     assert result is True


+@pytest.mark.parametrize(
+    "include, allowed",
+    [
+        ("s3://bucket/{table}/1/*.json", "s3://bucket/table/1/"),
+        ("s3://bucket/{table}/1/*/*.json", "s3://bucket/table/1/"),
+        ("s3://bucket/{table}/1/*/*.json", "s3://bucket/table/1/2/"),
+    ],
+)
+def test_dir_allowed_with_table_filter_pattern(include: str, allowed: str) -> None:
+    """Test dir_allowed method with table filter patterns."""
+    path_spec = PathSpec(
+        include=include,
+        tables_filter_pattern=AllowDenyPattern(
+            allow=["^table$"],
+        ),
+    )
+    assert path_spec.dir_allowed(allowed) is True
+
+
 # Tests for get_parsable_include classmethod
 @pytest.mark.parametrize(
     "include, expected",
@@ -1,7 +1,7 @@
 import logging
 from datetime import datetime, timezone
 from typing import List, Tuple
-from unittest.mock import Mock, call
+from unittest.mock import Mock, call, patch

 import pytest
 from boto3.session import Session
@@ -314,7 +314,8 @@ def test_get_folder_info_returns_latest_file_in_each_folder(s3_resource):

     # act
     res = _get_s3_source(path_spec).get_folder_info(
-        path_spec, bucket, prefix="my-folder"
+        path_spec,
+        "s3://my-bucket/my-folder",
     )
     res = list(res)

@@ -329,12 +330,9 @@ def test_get_folder_info_ignores_disallowed_path(s3_resource, caplog):
     Test S3Source.get_folder_info skips disallowed files and logs a message
     """
     # arrange
-    path_spec = Mock(
-        spec=PathSpec,
+    path_spec = PathSpec(
         include="s3://my-bucket/{table}/{partition0}/*.csv",
-        table_name="{table}",
     )
-    path_spec.allowed = Mock(return_value=False)

     bucket = s3_resource.Bucket("my-bucket")
     bucket.create()
@@ -343,13 +341,17 @@ def test_get_folder_info_ignores_disallowed_path(s3_resource, caplog):
     s3_source = _get_s3_source(path_spec)

     # act
-    res = s3_source.get_folder_info(path_spec, bucket, prefix="my-folder")
+    with patch(
+        "datahub.ingestion.source.data_lake_common.path_spec.PathSpec.allowed"
+    ) as allowed:
+        allowed.return_value = False
+        res = s3_source.get_folder_info(path_spec, "s3://my-bucket/my-folder")
         res = list(res)

     # assert
     expected_called_s3_uri = "s3://my-bucket/my-folder/ignore/this/path/0001.csv"

-    assert path_spec.allowed.call_args_list == [call(expected_called_s3_uri)], (
+    assert allowed.call_args_list == [call(expected_called_s3_uri)], (
         "File should be checked if it's allowed"
     )
     assert f"File {expected_called_s3_uri} not allowed and skipping" in caplog.text, (
@@ -377,7 +379,8 @@ def test_get_folder_info_returns_expected_folder(s3_resource):

     # act
     res = _get_s3_source(path_spec).get_folder_info(
-        path_spec, bucket, prefix="my-folder"
+        path_spec,
+        "s3://my-bucket/my-folder",
     )
     res = list(res)
