Mirror of https://github.com/datahub-project/datahub.git (synced 2025-10-29 09:52:40 +00:00)
feat(s3/ingest): performance improvements for get_dir_to_process and get_folder_info (#14709)
This commit is contained in:
parent f7ea7f033d
commit 5da54bf14d
@@ -126,7 +126,10 @@ def list_folders_path(
def list_objects_recursive_path(
s3_uri: str, *, startswith: str, aws_config: Optional[AwsConnectionConfig]
s3_uri: str,
*,
startswith: str = "",
aws_config: Optional[AwsConnectionConfig] = None,
) -> Iterable["ObjectSummary"]:
"""
Given an S3 URI to a folder or bucket, return all objects underneath that URI, optionally
@@ -194,6 +194,9 @@ class PathSpec(ConfigModel):
return True

def dir_allowed(self, path: str) -> bool:
if not path.endswith("/"):
path += "/"

if self.glob_include.endswith("**"):
return self.allowed(path, ignore_ext=True)

@@ -221,9 +224,8 @@ class PathSpec(ConfigModel):
):
return False

file_name_pattern = self.include.rsplit("/", 1)[1]
table_name, _ = self.extract_table_name_and_path(
os.path.join(path, file_name_pattern)
path + self.get_remaining_glob_include(path)
)
if not self.tables_filter_pattern.allowed(table_name):
return False

@@ -571,3 +573,38 @@ class PathSpec(ConfigModel):
"/".join(path.split("/")[:depth]) + "/" + parsed_vars.named["table"]
)
return self._extract_table_name(parsed_vars.named), table_path

def has_correct_number_of_directory_components(self, path: str) -> bool:
"""
Checks that a given path has the same number of components as the path spec
has directory components. Useful for checking if a path needs to descend further
into child directories or if the source can switch into file listing mode. If the
glob form of the path spec ends in "**", this always returns False.
"""
if self.glob_include.endswith("**"):
return False

if not path.endswith("/"):
path += "/"
path_slash = path.count("/")
glob_slash = self.glob_include.count("/")
if path_slash == glob_slash:
return True
return False

def get_remaining_glob_include(self, path: str) -> str:
"""
Given a path, return the remaining components of the path spec (if any
exist) in glob form. If the glob form of the path spec ends in "**", this
function's return value also always ends in "**", regardless of how
many components the input path has.
"""
if not path.endswith("/"):
path += "/"
path_slash = path.count("/")
remainder = "/".join(self.glob_include.split("/")[path_slash:])
if remainder:
return remainder
if self.glob_include.endswith("**"):
return "**"
return ""
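Note: the two new helpers above are easiest to see with concrete inputs. Below is a minimal standalone sketch that mirrors their slash-counting logic outside of PathSpec; the real class derives glob_include from the include pattern by replacing {table}/{year}/... templates with "*", which is assumed here and passed in directly.

import posixpath


def has_correct_number_of_directory_components(glob_include: str, path: str) -> bool:
    # Mirrors PathSpec.has_correct_number_of_directory_components from the hunk above.
    if glob_include.endswith("**"):
        return False
    if not path.endswith("/"):
        path += "/"
    return path.count("/") == glob_include.count("/")


def get_remaining_glob_include(glob_include: str, path: str) -> str:
    # Mirrors PathSpec.get_remaining_glob_include from the hunk above.
    if not path.endswith("/"):
        path += "/"
    remainder = "/".join(glob_include.split("/")[path.count("/"):])
    if remainder:
        return remainder
    return "**" if glob_include.endswith("**") else ""


# Assumed glob for include ".../{table}/year={year}/month={month}/*.json":
glob = "s3://my-test-bucket/folder_a/folder_aa/folder_aaa/*/year=*/month=*/*.json"

print(get_remaining_glob_include(glob, "s3://my-test-bucket/folder_a/folder_aa/folder_aaa/pokemon_abilities_json/"))
# -> "year=*/month=*/*.json": still two directory levels plus a file pattern to resolve
print(has_correct_number_of_directory_components(glob, "s3://my-test-bucket/folder_a/folder_aa/folder_aaa/pokemon_abilities_json/year=2022/month=jan/"))
# -> True: the path is as deep as the spec's directory part, so listing can switch to objects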
@@ -3,14 +3,14 @@ import functools
import logging
import os
import pathlib
import posixpath
import re
import time
from datetime import datetime
from pathlib import PurePath
from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Tuple
from typing import Any, Dict, Iterable, List, Optional, Tuple

import smart_open.compression as so_compression
from more_itertools import peekable
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.dataframe import DataFrame
@@ -36,9 +36,7 @@ from datahub.ingestion.api.source import MetadataWorkUnitProcessor
from datahub.ingestion.api.workunit import MetadataWorkUnit
from datahub.ingestion.source.aws.s3_boto_utils import (
get_s3_tags,
list_folders,
list_folders_path,
list_objects_recursive,
list_objects_recursive_path,
)
from datahub.ingestion.source.aws.s3_util import (
@@ -83,9 +81,6 @@ from datahub.metadata.schema_classes import (
from datahub.telemetry import stats, telemetry
from datahub.utilities.perf_timer import PerfTimer

if TYPE_CHECKING:
from mypy_boto3_s3.service_resource import Bucket

# hide annoying debug errors from py4j
logging.getLogger("py4j").setLevel(logging.ERROR)
logger: logging.Logger = logging.getLogger(__name__)
@@ -872,55 +867,62 @@ class S3Source(StatefulIngestionSourceBase):

def get_dir_to_process(
self,
bucket_name: str,
folder: str,
uri: str,
path_spec: PathSpec,
protocol: str,
min: bool = False,
) -> List[str]:
# if len(path_spec.include.split("/")) == len(f"{protocol}{bucket_name}/{folder}".split("/")):
#     return [f"{protocol}{bucket_name}/{folder}"]
# Add any remaining parts of the path_spec before globs, excluding the
# final filename component, to the URI and prefix so that we don't
# unnecessarily list too many objects.
if not uri.endswith("/"):
uri += "/"
remaining = posixpath.dirname(path_spec.get_remaining_glob_include(uri)).split(
"*"
)[0]
uri += posixpath.dirname(remaining)
prefix = posixpath.basename(remaining)

iterator = list_folders(
bucket_name=bucket_name,
prefix=folder,
# Check if we're at the end of the include path. If so, no need to list sub-folders.
if path_spec.has_correct_number_of_directory_components(uri):
return [uri]

logger.debug(f"get_dir_to_process listing folders {uri=} {prefix=}")
iterator = list_folders_path(
s3_uri=uri,
startswith=prefix,
aws_config=self.source_config.aws_config,
)
iterator = peekable(iterator)
if iterator:
sorted_dirs = sorted(
iterator,
key=functools.cmp_to_key(partitioned_folder_comparator),
reverse=not min,
)
folders = []
for dir in sorted_dirs:
if path_spec.dir_allowed(f"{protocol}{bucket_name}/{dir}/"):
folders_list = self.get_dir_to_process(
bucket_name=bucket_name,
folder=dir + "/",
path_spec=path_spec,
protocol=protocol,
min=min,
)
folders.extend(folders_list)
if path_spec.traversal_method != FolderTraversalMethod.ALL:
return folders
if folders:
return folders
else:
return [f"{protocol}{bucket_name}/{folder}"]
return [f"{protocol}{bucket_name}/{folder}"]
sorted_dirs = sorted(
iterator,
key=lambda dir: functools.cmp_to_key(partitioned_folder_comparator)(
dir.name
),
reverse=not min,
)
folders = []
for dir in sorted_dirs:
if path_spec.dir_allowed(dir.path):
folders_list = self.get_dir_to_process(
uri=dir.path,
path_spec=path_spec,
min=min,
)
folders.extend(folders_list)
if path_spec.traversal_method != FolderTraversalMethod.ALL:
return folders
if folders:
return folders
else:
return [uri]

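Note: to make the uri/prefix narrowing above concrete, here is a small runnable walk-through of just that arithmetic (the get_dir_to_process variant, which drops the filename component before cutting at the first wildcard). The bucket layout and include patterns come from the tests added later in this commit; remaining_glob is assumed to be what PathSpec.get_remaining_glob_include would return at that point.

import posixpath

# Case 1: include ".../{table}/year={year}/month={month}/part*.json", listing at the table level.
uri = "s3://my-test-bucket/folder_a/folder_aa/folder_aaa/pokemon_abilities_json/"
remaining_glob = "year=*/month=*/part*.json"

remaining = posixpath.dirname(remaining_glob).split("*")[0]  # "year="
uri += posixpath.dirname(remaining)                          # unchanged: no literal dir before the wildcard
prefix = posixpath.basename(remaining)                       # "year="
print(uri, prefix)
# matches the expected list_folders_path(s3_uri=".../pokemon_abilities_json/", startswith="year=") call

# Case 2: include ".../{table}/year=2022/month={month}/*.json", so a literal
# "year=2022" directory sits before the first wildcard.
uri = "s3://my-test-bucket/folder_a/folder_aa/folder_aaa/pokemon_abilities_json/"
remaining_glob = "year=2022/month=*/*.json"

remaining = posixpath.dirname(remaining_glob).split("*")[0]  # "year=2022/month="
uri += posixpath.dirname(remaining)                          # ".../pokemon_abilities_json/year=2022"
prefix = posixpath.basename(remaining)                       # "month="
print(uri, prefix)
# matches list_folders_path(s3_uri=".../pokemon_abilities_json/year=2022", startswith="month=")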
def get_folder_info(
self,
path_spec: PathSpec,
bucket: "Bucket",
prefix: str,
uri: str,
) -> Iterable[Folder]:
"""
Retrieves all the folders in a path by listing all the files in the prefix.
If the prefix is a full path then only that folder will be extracted.
Retrieves all the folders in a path by recursively listing all the files under the
given URI.

A folder has creation and modification times, size, and a sample file path.
- Creation time is the earliest creation time of all files in the folder.
@@ -930,8 +932,7 @@ class S3Source(StatefulIngestionSourceBase):

Parameters:
path_spec (PathSpec): The path specification used to determine partitioning.
bucket (Bucket): The S3 bucket object.
prefix (str): The prefix path in the S3 bucket to list objects from.
uri (str): The path in the S3 bucket to list objects from.

Returns:
List[Folder]: A list of Folder objects representing the partitions found.
@@ -947,12 +948,22 @@ class S3Source(StatefulIngestionSourceBase):
self.report.report_file_dropped(s3_uri)
return allowed

# Add any remaining parts of the path_spec before globs to the URI and prefix,
# so that we don't unnecessarily list too many objects.
if not uri.endswith("/"):
uri += "/"
remaining = path_spec.get_remaining_glob_include(uri).split("*")[0]
uri += posixpath.dirname(remaining)
prefix = posixpath.basename(remaining)

# Process objects in a memory-efficient streaming fashion
# Instead of loading all objects into memory, we'll accumulate folder data incrementally
folder_data: Dict[str, FolderInfo] = {}  # dirname -> FolderInfo

for obj in list_objects_recursive(
bucket.name, prefix, self.source_config.aws_config
logger.info(f"Listing objects under {repr(uri)} with {prefix=}")

for obj in list_objects_recursive_path(
uri, startswith=prefix, aws_config=self.source_config.aws_config
):
s3_path = self.create_s3_path(obj.bucket_name, obj.key)

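Note: the "memory-efficient streaming" accumulation referenced above can be pictured with the sketch below. FolderInfo's exact fields are not part of this diff, so the dataclass here is a hypothetical stand-in; it only illustrates folding one ObjectSummary at a time into per-directory stats (earliest/latest timestamps, running size, one sample file), which is what keeps memory usage flat while the recursive listing streams by.

import posixpath
from dataclasses import dataclass
from datetime import datetime
from typing import Dict, Iterable


@dataclass
class FolderStats:  # stand-in for the source's FolderInfo accumulator
    creation_time: datetime      # earliest LastModified seen in the folder
    modification_time: datetime  # latest LastModified seen in the folder
    size: int                    # running total of object sizes
    sample_file: str             # one representative key, e.g. for schema inference


def accumulate(objects: Iterable) -> Dict[str, FolderStats]:
    """Fold a streamed object listing into one stats record per parent folder."""
    folders: Dict[str, FolderStats] = {}
    for obj in objects:  # e.g. ObjectSummary items from list_objects_recursive_path
        dirname = posixpath.dirname(obj.key)
        stats = folders.get(dirname)
        if stats is None:
            folders[dirname] = FolderStats(
                creation_time=obj.last_modified,
                modification_time=obj.last_modified,
                size=obj.size,
                sample_file=obj.key,
            )
        else:
            stats.creation_time = min(stats.creation_time, obj.last_modified)
            stats.modification_time = max(stats.modification_time, obj.last_modified)
            stats.size += obj.size
    return folders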
@@ -1047,7 +1058,7 @@ class S3Source(StatefulIngestionSourceBase):
# This creates individual file-level datasets
yield from self._process_simple_path(path_spec)

def _process_templated_path(self, path_spec: PathSpec) -> Iterable[BrowsePath]:  # noqa: C901
def _process_templated_path(self, path_spec: PathSpec) -> Iterable[BrowsePath]:
"""
Process S3 paths containing {table} templates to create table-level datasets.

@@ -1133,20 +1144,12 @@ class S3Source(StatefulIngestionSourceBase):

# STEP 4: Process each table folder to create a table-level dataset
for folder in table_folders:
bucket_name = get_bucket_name(folder.path)
table_folder = get_bucket_relative_path(folder.path)
bucket = s3.Bucket(bucket_name)

# Create the full S3 path for this table
table_s3_path = self.create_s3_path(bucket_name, table_folder)
logger.info(
f"Processing table folder: {table_folder} -> {table_s3_path}"
)
logger.info(f"Processing table path: {folder.path}")

# Extract table name using the ORIGINAL path spec pattern matching (not the modified one)
# This uses the compiled regex pattern to extract the table name from the full path
table_name, _ = self.extract_table_name_and_path(
path_spec, table_s3_path
path_spec, folder.path
)

# Apply table name filtering if configured
@@ -1155,121 +1158,79 @@ class S3Source(StatefulIngestionSourceBase):
continue

# STEP 5: Handle partition traversal based on configuration
# Get all partition folders first
all_partition_folders = list(
list_folders(
bucket_name, table_folder, self.source_config.aws_config
dirs_to_process = []

if path_spec.traversal_method == FolderTraversalMethod.ALL:
# Process ALL partitions (original behavior)
dirs_to_process = [folder.path]
logger.debug(
f"Processing ALL partition folders under: {folder.path}"
)
)
logger.info(
f"Found {len(all_partition_folders)} partition folders under table {table_name} using method {path_spec.traversal_method}"
)

if all_partition_folders:
# Apply the same traversal logic as the original code
dirs_to_process = []

if path_spec.traversal_method == FolderTraversalMethod.ALL:
# Process ALL partitions (original behavior)
dirs_to_process = all_partition_folders
logger.debug(
f"Processing ALL {len(all_partition_folders)} partitions"
else:
# Use the original get_dir_to_process logic for MIN/MAX
if (
path_spec.traversal_method == FolderTraversalMethod.MIN_MAX
or path_spec.traversal_method == FolderTraversalMethod.MAX
):
# Get MAX partition using original logic
dirs_to_process_max = self.get_dir_to_process(
uri=folder.path,
path_spec=path_spec,
min=False,
)

else:
# Use the original get_dir_to_process logic for MIN/MAX
protocol = "s3://"  # Default protocol for S3

if (
path_spec.traversal_method
== FolderTraversalMethod.MIN_MAX
or path_spec.traversal_method
== FolderTraversalMethod.MAX
):
# Get MAX partition using original logic
dirs_to_process_max = self.get_dir_to_process(
bucket_name=bucket_name,
folder=table_folder + "/",
path_spec=path_spec,
protocol=protocol,
min=False,
if dirs_to_process_max:
dirs_to_process.extend(dirs_to_process_max)
logger.debug(
f"Added MAX partition: {dirs_to_process_max}"
)
if dirs_to_process_max:
# Convert full S3 paths back to relative paths for processing
dirs_to_process.extend(
[
d.replace(f"{protocol}{bucket_name}/", "")
for d in dirs_to_process_max
]
)
logger.debug(
f"Added MAX partition: {dirs_to_process_max}"
)

if (
path_spec.traversal_method
== FolderTraversalMethod.MIN_MAX
):
# Get MIN partition using original logic
dirs_to_process_min = self.get_dir_to_process(
bucket_name=bucket_name,
folder=table_folder + "/",
path_spec=path_spec,
protocol=protocol,
min=True,
if path_spec.traversal_method == FolderTraversalMethod.MIN_MAX:
# Get MIN partition using original logic
dirs_to_process_min = self.get_dir_to_process(
uri=folder.path,
path_spec=path_spec,
min=True,
)
if dirs_to_process_min:
dirs_to_process.extend(dirs_to_process_min)
logger.debug(
f"Added MIN partition: {dirs_to_process_min}"
)
if dirs_to_process_min:
# Convert full S3 paths back to relative paths for processing
dirs_to_process.extend(
[
d.replace(f"{protocol}{bucket_name}/", "")
for d in dirs_to_process_min
]
)
logger.debug(
f"Added MIN partition: {dirs_to_process_min}"
)

# Process the selected partitions
all_folders = []
for partition_folder in dirs_to_process:
# Ensure we have a clean folder path
clean_folder = partition_folder.rstrip("/")
# Process the selected partitions
all_folders = []
for partition_path in dirs_to_process:
logger.info(f"Scanning files in partition: {partition_path}")
partition_files = list(
self.get_folder_info(path_spec, partition_path)
)
all_folders.extend(partition_files)

logger.info(f"Scanning files in partition: {clean_folder}")
partition_files = list(
self.get_folder_info(path_spec, bucket, clean_folder)
)
all_folders.extend(partition_files)
if all_folders:
# Use the most recent file across all processed partitions
latest_file = max(
all_folders, key=lambda x: x.modification_time
)

if all_folders:
# Use the most recent file across all processed partitions
latest_file = max(
all_folders, key=lambda x: x.modification_time
)
# Get partition information
partitions = [f for f in all_folders if f.is_partition]

# Get partition information
partitions = [f for f in all_folders if f.is_partition]
# Calculate total size of processed partitions
total_size = sum(f.size for f in all_folders)

# Calculate total size of processed partitions
total_size = sum(f.size for f in all_folders)

# Create ONE BrowsePath per table
# The key insight: we need to provide the sample file for schema inference
# but the table path should be extracted correctly by extract_table_name_and_path
yield BrowsePath(
file=latest_file.sample_file,  # Sample file for schema inference
timestamp=latest_file.modification_time,  # Latest timestamp
size=total_size,  # Size of processed partitions
partitions=partitions,  # Partition metadata
)
else:
logger.warning(
f"No files found in processed partitions for table {table_name}"
)
# Create ONE BrowsePath per table
# The key insight: we need to provide the sample file for schema inference
# but the table path should be extracted correctly by extract_table_name_and_path
yield BrowsePath(
file=latest_file.sample_file,  # Sample file for schema inference
timestamp=latest_file.modification_time,  # Latest timestamp
size=total_size,  # Size of processed partitions
partitions=partitions,  # Partition metadata
)
else:
logger.warning(
f"No partition folders found under table {table_name}"
f"No files found in processed partitions for table {table_name}"
)

except Exception as e:
@@ -8,13 +8,13 @@
"json": {
"customProperties": {
"schema_inferred_from": "gs://my-test-bucket/folder_a/folder_aa/folder_aaa/folder_aaaa/pokemon_abilities_yearwise_2021/month=march/part2.json",
"number_of_partitions": "6"
"number_of_partitions": "1"
},
"externalUrl": "https://console.cloud.google.com/storage/browser/my-test-bucket/folder_a/folder_aa/folder_aaa/folder_aaaa",
"name": "folder_aaaa",
"description": "",
"created": {
"time": 1586847680000
"time": 1586847780000
},
"lastModified": {
"time": 1586847790000
@@ -628,9 +628,9 @@
"aspect": {
"json": {
"minPartition": {
"partition": "partition_0=pokemon_abilities_yearwise_2019/partition_1=month=feb",
"createdTime": 1586847680000,
"lastModifiedTime": 1586847690000
"partition": "partition_0=pokemon_abilities_yearwise_2021/partition_1=month=march",
"createdTime": 1586847780000,
"lastModifiedTime": 1586847790000
},
"maxPartition": {
"partition": "partition_0=pokemon_abilities_yearwise_2021/partition_1=month=march",
@@ -8,13 +8,13 @@
"json": {
"customProperties": {
"schema_inferred_from": "s3://my-test-bucket/folder_a/folder_aa/folder_aaa/folder_aaaa/pokemon_abilities_yearwise_2021/month=march/part2.json",
"number_of_partitions": "6"
"number_of_partitions": "1"
},
"externalUrl": "https://us-east-1.console.aws.amazon.com/s3/buckets/my-test-bucket?prefix=folder_a/folder_aa/folder_aaa/folder_aaaa",
"name": "folder_aaaa",
"description": "",
"created": {
"time": 1586847680000
"time": 1586847780000
},
"lastModified": {
"time": 1586847790000
@@ -628,9 +628,9 @@
"aspect": {
"json": {
"minPartition": {
"partition": "partition_0=pokemon_abilities_yearwise_2019/partition_1=month=feb",
"createdTime": 1586847680000,
"lastModifiedTime": 1586847690000
"partition": "partition_0=pokemon_abilities_yearwise_2021/partition_1=month=march",
"createdTime": 1586847780000,
"lastModifiedTime": 1586847790000
},
"maxPartition": {
"partition": "partition_0=pokemon_abilities_yearwise_2021/partition_1=month=march",
@@ -2,7 +2,7 @@ import json
import logging
import os
from datetime import datetime
from unittest.mock import patch
from unittest.mock import Mock, call, patch

import moto.s3
import pytest
@@ -11,6 +11,11 @@ from moto import mock_s3
from pydantic import ValidationError

from datahub.ingestion.run.pipeline import Pipeline, PipelineContext
from datahub.ingestion.source.aws.aws_common import AwsConnectionConfig
from datahub.ingestion.source.aws.s3_boto_utils import (
list_folders_path,
list_objects_recursive_path,
)
from datahub.ingestion.source.s3.source import S3Source
from datahub.testing import mce_helpers
@@ -367,3 +372,158 @@ def test_data_lake_incorrect_config_raises_error(tmp_path, mock_time):
}
with pytest.raises(ValidationError, match=r"\*\*"):
S3Source.create(source, ctx)


@pytest.mark.parametrize(
"calls_test_tuple",
[
(
"partitions_and_filename_with_prefix",
{
"include": "s3://my-test-bucket/folder_a/folder_aa/folder_aaa/{table}/year={year}/month={month}/part*.json",
"tables_filter_pattern": {"allow": ["^pokemon_abilities_json$"]},
},
[
call.list_folders_path(
"s3://my-test-bucket/folder_a/folder_aa/folder_aaa/"
),
call.list_folders_path(
s3_uri="s3://my-test-bucket/folder_a/folder_aa/folder_aaa/pokemon_abilities_json/",
startswith="year=",
),
call.list_folders_path(
s3_uri="s3://my-test-bucket/folder_a/folder_aa/folder_aaa/pokemon_abilities_json/year=2022/",
startswith="month=",
),
call.list_objects_recursive_path(
"s3://my-test-bucket/folder_a/folder_aa/folder_aaa/pokemon_abilities_json/year=2022/month=jan/",
startswith="part",
),
],
),
(
"filter_specific_partition",
{
"include": "s3://my-test-bucket/folder_a/folder_aa/folder_aaa/{table}/year=2022/month={month}/*.json",
"tables_filter_pattern": {"allow": ["^pokemon_abilities_json$"]},
},
[
call.list_folders_path(
"s3://my-test-bucket/folder_a/folder_aa/folder_aaa/"
),
call.list_folders_path(
s3_uri="s3://my-test-bucket/folder_a/folder_aa/folder_aaa/pokemon_abilities_json/year=2022",
startswith="month=",
),
call.list_objects_recursive_path(
"s3://my-test-bucket/folder_a/folder_aa/folder_aaa/pokemon_abilities_json/year=2022/month=jan/",
startswith="",
),
],
),
(
"partition_autodetection",
{
"include": "s3://my-test-bucket/folder_a/folder_aa/folder_aaa/{table}/",
"tables_filter_pattern": {"allow": ["^pokemon_abilities_json$"]},
},
[
call.list_folders_path(
"s3://my-test-bucket/folder_a/folder_aa/folder_aaa/"
),
call.list_folders_path(
s3_uri="s3://my-test-bucket/folder_a/folder_aa/folder_aaa/pokemon_abilities_json/",
startswith="",
),
call.list_folders_path(
s3_uri="s3://my-test-bucket/folder_a/folder_aa/folder_aaa/pokemon_abilities_json/year=2022/",
startswith="",
),
call.list_folders_path(
s3_uri="s3://my-test-bucket/folder_a/folder_aa/folder_aaa/pokemon_abilities_json/year=2022/month=jan/",
startswith="",
),
call.list_objects_recursive_path(
"s3://my-test-bucket/folder_a/folder_aa/folder_aaa/pokemon_abilities_json/year=2022/month=jan/",
startswith="",
),
],
),
(
"partitions_traversal_all",
{
"include": "s3://my-test-bucket/folder_a/folder_aa/folder_aaa/{table}/year={year}/month={month}/*.json",
"tables_filter_pattern": {"allow": ["^pokemon_abilities_json$"]},
"traversal_method": "ALL",
},
[
call.list_folders_path(
"s3://my-test-bucket/folder_a/folder_aa/folder_aaa/"
),
call.list_objects_recursive_path(
"s3://my-test-bucket/folder_a/folder_aa/folder_aaa/pokemon_abilities_json/",
startswith="year=",
),
],
),
(
"filter_specific_partition_traversal_all",
{
"include": "s3://my-test-bucket/folder_a/folder_aa/folder_aaa/{table}/year=2022/month={month}/part*.json",
"tables_filter_pattern": {"allow": ["^pokemon_abilities_json$"]},
"traversal_method": "ALL",
},
[
call.list_folders_path(
"s3://my-test-bucket/folder_a/folder_aa/folder_aaa/"
),
call.list_objects_recursive_path(
"s3://my-test-bucket/folder_a/folder_aa/folder_aaa/pokemon_abilities_json/year=2022",
startswith="month=",
),
],
),
],
ids=lambda calls_test_tuple: calls_test_tuple[0],
)
def test_data_lake_s3_calls(s3_populate, calls_test_tuple):
_, path_spec, expected_calls = calls_test_tuple

ctx = PipelineContext(run_id="test-s3")
config = {
"path_specs": [path_spec],
"aws_config": {
"aws_region": "us-east-1",
"aws_access_key_id": "testing",
"aws_secret_access_key": "testing",
},
}
source = S3Source.create(config, ctx)

m = Mock()
m.list_folders_path.side_effect = list_folders_path
m.list_objects_recursive_path.side_effect = list_objects_recursive_path

with (
patch(
"datahub.ingestion.source.s3.source.list_folders_path", m.list_folders_path
),
patch(
"datahub.ingestion.source.s3.source.list_objects_recursive_path",
m.list_objects_recursive_path,
),
):
for _ in source.get_workunits_internal():
pass

# Verify S3 calls. We're checking that we make the minimum necessary calls with
# prefixes when possible to reduce the amount of queries to the S3 API.
calls = []
for c in m.mock_calls:
if isinstance(c.kwargs, dict):  # type assertion
c.kwargs.pop("aws_config", None)
if len(c.args) == 3 and isinstance(c.args[2], AwsConnectionConfig):
c = getattr(call, c[0])(*(c.args[:2]), **c.kwargs)
calls.append(c)

assert calls == expected_calls
@@ -331,6 +331,25 @@ def test_dir_allowed_with_debug_logging() -> None:
assert result is True


@pytest.mark.parametrize(
"include, allowed",
[
("s3://bucket/{table}/1/*.json", "s3://bucket/table/1/"),
("s3://bucket/{table}/1/*/*.json", "s3://bucket/table/1/"),
("s3://bucket/{table}/1/*/*.json", "s3://bucket/table/1/2/"),
],
)
def test_dir_allowed_with_table_filter_pattern(include: str, allowed: str) -> None:
"""Test dir_allowed method with table filter patterns."""
path_spec = PathSpec(
include=include,
tables_filter_pattern=AllowDenyPattern(
allow=["^table$"],
),
)
assert path_spec.dir_allowed(allowed) is True


# Tests for get_parsable_include classmethod
@pytest.mark.parametrize(
"include, expected",
@@ -1,7 +1,7 @@
import logging
from datetime import datetime, timezone
from typing import List, Tuple
from unittest.mock import Mock, call
from unittest.mock import Mock, call, patch

import pytest
from boto3.session import Session
@@ -314,7 +314,8 @@ def test_get_folder_info_returns_latest_file_in_each_folder(s3_resource):

# act
res = _get_s3_source(path_spec).get_folder_info(
path_spec, bucket, prefix="my-folder"
path_spec,
"s3://my-bucket/my-folder",
)
res = list(res)
@@ -329,12 +330,9 @@ def test_get_folder_info_ignores_disallowed_path(s3_resource, caplog):
Test S3Source.get_folder_info skips disallowed files and logs a message
"""
# arrange
path_spec = Mock(
spec=PathSpec,
path_spec = PathSpec(
include="s3://my-bucket/{table}/{partition0}/*.csv",
table_name="{table}",
)
path_spec.allowed = Mock(return_value=False)

bucket = s3_resource.Bucket("my-bucket")
bucket.create()
@@ -343,13 +341,17 @@ def test_get_folder_info_ignores_disallowed_path(s3_resource, caplog):
s3_source = _get_s3_source(path_spec)

# act
res = s3_source.get_folder_info(path_spec, bucket, prefix="my-folder")
res = list(res)
with patch(
"datahub.ingestion.source.data_lake_common.path_spec.PathSpec.allowed"
) as allowed:
allowed.return_value = False
res = s3_source.get_folder_info(path_spec, "s3://my-bucket/my-folder")
res = list(res)

# assert
expected_called_s3_uri = "s3://my-bucket/my-folder/ignore/this/path/0001.csv"

assert path_spec.allowed.call_args_list == [call(expected_called_s3_uri)], (
assert allowed.call_args_list == [call(expected_called_s3_uri)], (
"File should be checked if it's allowed"
)
assert f"File {expected_called_s3_uri} not allowed and skipping" in caplog.text, (
@@ -377,7 +379,8 @@ def test_get_folder_info_returns_expected_folder(s3_resource):

# act
res = _get_s3_source(path_spec).get_folder_info(
path_spec, bucket, prefix="my-folder"
path_spec,
"s3://my-bucket/my-folder",
)
res = list(res)