# datahub/metadata-ingestion/tests/unit/glue/test_empty_s3_key_fix.py
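"""Unit tests for graceful handling of empty S3 keys in Glue job script locations.

These tests cover the fix for the boto3 error "Invalid length for parameter
Key, value: 0, valid min length: 1": GlueSource.get_dataflow_graph is expected
to return None for malformed script locations and to increment
report.num_job_script_location_invalid instead of raising.
"""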

import datetime
from unittest.mock import patch

import botocore.exceptions
from botocore.stub import Stubber

from datahub.ingestion.api.common import PipelineContext
from datahub.ingestion.source.aws.glue import GlueSource, GlueSourceConfig


def test_empty_s3_key_handling():
    """Test that empty S3 keys in script paths are handled gracefully without crashing."""
    # Mock jobs whose ScriptLocation is a malformed S3 URI (bucket only, empty key)
    get_jobs_response_with_empty_key = {
        "Jobs": [
            {
                "Name": "test-job-empty-key",
                "Description": "Job with empty S3 key",
                "Role": "arn:aws:iam::123412341234:role/service-role/AWSGlueServiceRole-glue-crawler",
                "CreatedOn": datetime.datetime(2021, 6, 10, 16, 51, 25, 690000),
                "LastModifiedOn": datetime.datetime(2021, 6, 10, 16, 55, 35, 307000),
                "ExecutionProperty": {"MaxConcurrentRuns": 1},
                "Command": {
                    "Name": "glueetl",
                    # This S3 URI has no key/path component - just a bucket
                    "ScriptLocation": "s3://bucket-only/",
                    "PythonVersion": "3",
                },
            },
            {
                "Name": "test-job-bucket-only",
                "Description": "Job with bucket-only S3 URI",
                "Role": "arn:aws:iam::123412341234:role/service-role/AWSGlueServiceRole-glue-crawler",
                "CreatedOn": datetime.datetime(2021, 6, 10, 16, 51, 25, 690000),
                "LastModifiedOn": datetime.datetime(2021, 6, 10, 16, 55, 35, 307000),
                "ExecutionProperty": {"MaxConcurrentRuns": 1},
                "Command": {
                    "Name": "glueetl",
                    # This S3 URI has no key/path component - just a bucket without a trailing slash
                    "ScriptLocation": "s3://bucket-only",
                    "PythonVersion": "3",
                },
            },
        ]
    }

    # Configuration with minimal settings
    config = GlueSourceConfig(
        aws_region="us-west-2",
        extract_transforms=True,
    )
    ctx = PipelineContext(run_id="test_run")

    # Create Glue source instance
    glue_source_instance = GlueSource(config=config, ctx=ctx)

    # Stub the Glue client responses
    with Stubber(glue_source_instance.glue_client) as glue_stubber:
        # Add response for get_databases (required by get_workunits_internal)
        glue_stubber.add_response("get_databases", {"DatabaseList": []}, {})
        # Add response for get_jobs
        glue_stubber.add_response("get_jobs", get_jobs_response_with_empty_key, {})

        # Capture the report counter before processing
        initial_invalid_count = (
            glue_source_instance.report.num_job_script_location_invalid
        )

        # Call the method that processes jobs - this should not crash
        workunits = list(glue_source_instance.get_workunits_internal())

        # Verify that the invalid script location counter was incremented by 2,
        # since both jobs have empty keys
        expected_invalid_count = initial_invalid_count + 2
        assert (
            glue_source_instance.report.num_job_script_location_invalid
            == expected_invalid_count
        )

        # Verify that we still get workunits for the jobs themselves (DataFlow
        # entities) even though their script processing failed
        job_names = {"test-job-empty-key", "test-job-bucket-only"}
        dataflow_workunits = [wu for wu in workunits if wu.id in job_names]
        assert len(dataflow_workunits) == 2  # DataFlow entities for both jobs

        # No S3 get_object call is ever made (empty keys are skipped), which
        # implicitly verifies that we don't crash with a ParamValidationError
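
# For reference, both malformed ScriptLocation values above split into a bucket
# and an empty key. A sketch with urllib.parse (illustrative only; not
# necessarily how GlueSource itself parses the URI):
#
#     from urllib.parse import urlparse
#
#     urlparse("s3://bucket-only/")  # netloc="bucket-only", path="/"
#     urlparse("s3://bucket-only")   # netloc="bucket-only", path=""
#
# In both cases path.strip("/") is empty, so there is no object key to fetch.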


def test_get_dataflow_graph_with_empty_key_directly():
    """Test the get_dataflow_graph method directly with empty S3 keys."""
    config = GlueSourceConfig(
        aws_region="us-west-2",
        extract_transforms=True,
    )
    ctx = PipelineContext(run_id="test_run")
    glue_source_instance = GlueSource(config=config, ctx=ctx)

    # Test case 1: S3 URI with trailing slash (empty key)
    flow_urn = "urn:li:dataFlow:(glue,test-job,PROD)"
    result = glue_source_instance.get_dataflow_graph("s3://bucket-only/", flow_urn)
    assert result is None
    assert glue_source_instance.report.num_job_script_location_invalid == 1

    # Test case 2: S3 URI without trailing slash (still an empty key)
    result = glue_source_instance.get_dataflow_graph("s3://bucket-only", flow_urn)
    assert result is None
    assert glue_source_instance.report.num_job_script_location_invalid == 2

    # Test case 3: a valid S3 URI should pass validation and reach the (mocked)
    # S3 call, proving that the validation doesn't break valid paths
    with patch.object(glue_source_instance.s3_client, "get_object") as mock_s3_get:
        mock_s3_get.side_effect = Exception(
            "S3 call should happen but we're mocking it"
        )
        try:
            glue_source_instance.get_dataflow_graph(
                "s3://valid-bucket/path/to/script.py", flow_urn
            )
        except Exception as e:
            # Reaching the mocked S3 call and getting our exception proves
            # that validation passed
            assert "S3 call should happen" in str(e)
        # Guard against the exception being swallowed upstream: the S3 call
        # must actually have been attempted
        mock_s3_get.assert_called_once()

    # Invalid count should remain the same (not incremented for the valid path)
    assert glue_source_instance.report.num_job_script_location_invalid == 2


def test_original_error_scenario():
    """Test the exact scenario from the original error to ensure it's fixed."""
    config = GlueSourceConfig(
        aws_region="us-west-2",
        extract_transforms=True,
    )
    ctx = PipelineContext(run_id="test_run")
    glue_source_instance = GlueSource(config=config, ctx=ctx)

    # This input used to trigger the original ParamValidationError:
    # "Invalid length for parameter Key, value: 0, valid min length: 1"
    flow_urn = "urn:li:dataFlow:(glue,problematic-job,PROD)"

    # This call should not raise ParamValidationError anymore
    result = glue_source_instance.get_dataflow_graph("s3://bucket/", flow_urn)

    # Should return None instead of crashing
    assert result is None
    # Should increment the invalid counter
    assert glue_source_instance.report.num_job_script_location_invalid == 1
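
# Context for the scenario above (a note, not exercised by the test directly):
# boto3 validates parameters client-side against the S3 service model, so
# s3_client.get_object(Bucket="bucket", Key="") raises
# botocore.exceptions.ParamValidationError before any request is sent -
# which is why an empty key used to crash ingestion.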


def test_param_validation_error_handling():
    """Test that ParamValidationError from S3 get_object is properly caught and handled."""
    config = GlueSourceConfig(
        aws_region="us-west-2",
        extract_transforms=True,
    )
    ctx = PipelineContext(run_id="test_run")
    glue_source_instance = GlueSource(config=config, ctx=ctx)
    flow_urn = "urn:li:dataFlow:(glue,test-job,PROD)"

    # Test case 1: mock a ParamValidationError during the S3 get_object call
    with patch.object(glue_source_instance.s3_client, "get_object") as mock_s3_get:
        # Simulate the ParamValidationError that would occur with invalid S3 parameters
        mock_s3_get.side_effect = botocore.exceptions.ParamValidationError(
            report="Invalid length for parameter Key, value: 0, valid min length: 1"
        )
        initial_invalid_count = (
            glue_source_instance.report.num_job_script_location_invalid
        )

        result = glue_source_instance.get_dataflow_graph(
            "s3://test-bucket/invalid-key", flow_urn
        )

        # Should return None instead of crashing
        assert result is None
        # Should increment the invalid script location counter
        assert (
            glue_source_instance.report.num_job_script_location_invalid
            == initial_invalid_count + 1
        )
        # Verify that S3 get_object was actually called, so we know the error
        # handling sits in the right place
        mock_s3_get.assert_called_once_with(Bucket="test-bucket", Key="invalid-key")

    # Test case 2: a different ParamValidationError scenario
    with patch.object(glue_source_instance.s3_client, "get_object") as mock_s3_get:
        # Simulate another type of ParamValidationError
        mock_s3_get.side_effect = botocore.exceptions.ParamValidationError(
            report="Parameter validation failed: Invalid bucket name"
        )
        initial_invalid_count = (
            glue_source_instance.report.num_job_script_location_invalid
        )

        result = glue_source_instance.get_dataflow_graph(
            "s3://invalid-bucket-name/script.py", flow_urn
        )

        # Should return None instead of crashing
        assert result is None
        # Should increment the invalid script location counter
        assert (
            glue_source_instance.report.num_job_script_location_invalid
            == initial_invalid_count + 1
        )
        # Verify that S3 get_object was called with the expected parameters
        mock_s3_get.assert_called_once_with(
            Bucket="invalid-bucket-name", Key="script.py"
        )
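
# These are plain pytest-style test functions; the __main__ block below is just
# a convenience runner. Under pytest, something like the following should work
# (path taken from this file's location in the repo):
#
#     pytest metadata-ingestion/tests/unit/glue/test_empty_s3_key_fix.py -v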


if __name__ == "__main__":
    test_empty_s3_key_handling()
    test_get_dataflow_graph_with_empty_key_directly()
    test_original_error_scenario()
    test_param_validation_error_handling()
    print("All tests passed!")