datahub/metadata-ingestion/tests/unit/glue/test_empty_s3_key_fix.py
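
"""Unit tests for the DataHub Glue source's handling of malformed S3 script
locations (bucket-only URIs with an empty key), which previously crashed
ingestion with botocore's ParamValidationError."""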

import datetime
from unittest.mock import patch

import botocore.exceptions
from botocore.stub import Stubber

from datahub.ingestion.api.common import PipelineContext
from datahub.ingestion.source.aws.glue import GlueSource, GlueSourceConfig


def test_empty_s3_key_handling():
    """Test that empty S3 keys in script paths are handled gracefully without crashing."""
    # Mock jobs with malformed S3 URIs (empty keys)
    get_jobs_response_with_empty_key = {
        "Jobs": [
            {
                "Name": "test-job-empty-key",
                "Description": "Job with empty S3 key",
                "Role": "arn:aws:iam::123412341234:role/service-role/AWSGlueServiceRole-glue-crawler",
                "CreatedOn": datetime.datetime(2021, 6, 10, 16, 51, 25, 690000),
                "LastModifiedOn": datetime.datetime(2021, 6, 10, 16, 55, 35, 307000),
                "ExecutionProperty": {"MaxConcurrentRuns": 1},
                "Command": {
                    "Name": "glueetl",
                    # This S3 URI has no key/path component - just a bucket
                    "ScriptLocation": "s3://bucket-only/",
                    "PythonVersion": "3",
                },
            },
            {
                "Name": "test-job-bucket-only",
                "Description": "Job with bucket-only S3 URI",
                "Role": "arn:aws:iam::123412341234:role/service-role/AWSGlueServiceRole-glue-crawler",
                "CreatedOn": datetime.datetime(2021, 6, 10, 16, 51, 25, 690000),
                "LastModifiedOn": datetime.datetime(2021, 6, 10, 16, 55, 35, 307000),
                "ExecutionProperty": {"MaxConcurrentRuns": 1},
                "Command": {
                    "Name": "glueetl",
                    # This S3 URI has no key/path component - just a bucket without a trailing slash
                    "ScriptLocation": "s3://bucket-only",
                    "PythonVersion": "3",
                },
            },
        ]
    }

    # Configuration with minimal settings
    config = GlueSourceConfig(
        aws_region="us-west-2",
        extract_transforms=True,
    )
    ctx = PipelineContext(run_id="test_run")

    # Create the Glue source instance
    glue_source_instance = GlueSource(config=config, ctx=ctx)

    # Mock the Glue client responses
    with Stubber(glue_source_instance.glue_client) as glue_stubber:
        # Add a response for get_databases (required by get_workunits_internal)
        glue_stubber.add_response("get_databases", {"DatabaseList": []}, {})
        # Add a response for get_jobs
        glue_stubber.add_response("get_jobs", get_jobs_response_with_empty_key, {})

        # Capture the report counter before processing
        initial_invalid_count = (
            glue_source_instance.report.num_job_script_location_invalid
        )

        # Call the method that processes jobs - this should not crash
        workunits = list(glue_source_instance.get_workunits_internal())

        # Verify that the invalid script location counter was incremented:
        # it should increase by 2 since we have 2 jobs with empty keys
        expected_invalid_count = initial_invalid_count + 2
        assert (
            glue_source_instance.report.num_job_script_location_invalid
            == expected_invalid_count
        )

        # Verify that we still get workunits for the jobs themselves (DataFlow
        # entities) even though their script processing failed
        job_names = {"test-job-empty-key", "test-job-bucket-only"}
        dataflow_workunits = [wu for wu in workunits if wu.id in job_names]
        # Should have DataFlow entities for both jobs
        assert len(dataflow_workunits) == 2

        # No S3 get_object call is ever made for these jobs (empty keys are
        # skipped), so reaching this point without a ParamValidationError
        # implicitly confirms the fix.
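

# A stricter variant of the empty-key check above (an added sketch, not one of
# the original tests): it assumes, as the comments above state, that the source
# validates the script location *before* calling S3, so get_object is never
# invoked for a bucket-only URI.
def test_empty_s3_key_makes_no_s3_calls():
    config = GlueSourceConfig(aws_region="us-west-2", extract_transforms=True)
    glue_source_instance = GlueSource(
        config=config, ctx=PipelineContext(run_id="test_run")
    )
    flow_urn = "urn:li:dataFlow:(glue,test-job,PROD)"
    with patch.object(glue_source_instance.s3_client, "get_object") as mock_s3_get:
        # An empty key should be rejected during validation and return None...
        assert (
            glue_source_instance.get_dataflow_graph("s3://bucket-only/", flow_urn)
            is None
        )
        # ...without the mocked S3 client ever being touched
        mock_s3_get.assert_not_called()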


def test_get_dataflow_graph_with_empty_key_directly():
    """Test the get_dataflow_graph method directly with empty S3 keys."""
    config = GlueSourceConfig(
        aws_region="us-west-2",
        extract_transforms=True,
    )
    ctx = PipelineContext(run_id="test_run")
    glue_source_instance = GlueSource(config=config, ctx=ctx)

    # Test case 1: S3 URI with trailing slash (empty key)
    flow_urn = "urn:li:dataFlow:(glue,test-job,PROD)"
    result = glue_source_instance.get_dataflow_graph("s3://bucket-only/", flow_urn)
    assert result is None
    assert glue_source_instance.report.num_job_script_location_invalid == 1

    # Test case 2: S3 URI without trailing slash (still an empty key)
    result = glue_source_instance.get_dataflow_graph("s3://bucket-only", flow_urn)
    assert result is None
    assert glue_source_instance.report.num_job_script_location_invalid == 2

    # Test case 3: a valid S3 URI should pass validation and reach the S3 call
    # (the call itself is mocked, so no network access happens)
    with patch.object(glue_source_instance.s3_client, "get_object") as mock_s3_get:
        mock_s3_get.side_effect = Exception(
            "S3 call should happen but we're mocking it"
        )
        try:
            glue_source_instance.get_dataflow_graph(
                "s3://valid-bucket/path/to/script.py", flow_urn
            )
        except Exception as e:
            # Reaching the S3 call and receiving the mocked exception proves
            # that validation passed
            assert "S3 call should happen" in str(e)
        # Guard against the try block silently succeeding: the mocked S3 call
        # must actually have been reached
        mock_s3_get.assert_called_once()

    # The invalid counter should remain unchanged (not incremented for a valid path)
    assert glue_source_instance.report.num_job_script_location_invalid == 2


def test_original_error_scenario():
    """Test the exact scenario from the original error to ensure it's fixed."""
    config = GlueSourceConfig(
        aws_region="us-west-2",
        extract_transforms=True,
    )
    ctx = PipelineContext(run_id="test_run")
    glue_source_instance = GlueSource(config=config, ctx=ctx)

    # Before the fix, this scenario triggered the original ParamValidationError:
    # "Invalid length for parameter Key, value: 0, valid min length: 1"
    flow_urn = "urn:li:dataFlow:(glue,problematic-job,PROD)"

    # This call should no longer raise ParamValidationError
    result = glue_source_instance.get_dataflow_graph("s3://bucket/", flow_urn)

    # Should return None instead of crashing
    assert result is None
    # Should increment the invalid counter
    assert glue_source_instance.report.num_job_script_location_invalid == 1
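

# For context, the guard exercised by these tests presumably looks something
# like the following sketch (hypothetical code, not the actual glue.py
# implementation):
#
#     bucket, _, key = script_path.replace("s3://", "").partition("/")
#     if not key:
#         self.report.num_job_script_location_invalid += 1
#         return None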


def test_param_validation_error_handling():
    """Test that ParamValidationError from S3 get_object is properly caught and handled."""
    config = GlueSourceConfig(
        aws_region="us-west-2",
        extract_transforms=True,
    )
    ctx = PipelineContext(run_id="test_run")
    glue_source_instance = GlueSource(config=config, ctx=ctx)
    flow_urn = "urn:li:dataFlow:(glue,test-job,PROD)"

    # Test case 1: Mock ParamValidationError during the S3 get_object call
    with patch.object(glue_source_instance.s3_client, "get_object") as mock_s3_get:
        # Simulate the ParamValidationError that would occur with invalid S3 parameters
        mock_s3_get.side_effect = botocore.exceptions.ParamValidationError(
            report="Invalid length for parameter Key, value: 0, valid min length: 1"
        )
        initial_invalid_count = (
            glue_source_instance.report.num_job_script_location_invalid
        )

        result = glue_source_instance.get_dataflow_graph(
            "s3://test-bucket/invalid-key", flow_urn
        )

        # Should return None instead of crashing
        assert result is None
        # Should increment the invalid script location counter
        assert (
            glue_source_instance.report.num_job_script_location_invalid
            == initial_invalid_count + 1
        )
        # Verify the S3 get_object was actually called (so we know the error
        # handling is in the right place)
        mock_s3_get.assert_called_once_with(Bucket="test-bucket", Key="invalid-key")

    # Test case 2: a different ParamValidationError scenario
    with patch.object(glue_source_instance.s3_client, "get_object") as mock_s3_get:
        # Simulate another type of ParamValidationError
        mock_s3_get.side_effect = botocore.exceptions.ParamValidationError(
            report="Parameter validation failed: Invalid bucket name"
        )
        initial_invalid_count = (
            glue_source_instance.report.num_job_script_location_invalid
        )

        result = glue_source_instance.get_dataflow_graph(
            "s3://invalid-bucket-name/script.py", flow_urn
        )

        # Should return None instead of crashing
        assert result is None
        # Should increment the invalid script location counter
        assert (
            glue_source_instance.report.num_job_script_location_invalid
            == initial_invalid_count + 1
        )
        # Verify the S3 get_object was called with the expected parameters
        mock_s3_get.assert_called_once_with(
            Bucket="invalid-bucket-name", Key="script.py"
        )
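

# These tests are written for pytest collection; the block below is only a
# convenience for running the file directly as a quick smoke check.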
if __name__ == "__main__":
    test_empty_s3_key_handling()
    test_get_dataflow_graph_with_empty_key_directly()
    test_original_error_scenario()
    test_param_validation_error_handling()
    print("All tests passed!")