import datetime
from unittest.mock import patch

import botocore.exceptions
from botocore.stub import Stubber

from datahub.ingestion.api.common import PipelineContext
from datahub.ingestion.source.aws.glue import GlueSource, GlueSourceConfig
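
# These tests exercise the guard in GlueSource.get_dataflow_graph that rejects
# S3 script locations whose key is empty before s3_client.get_object is ever
# called. Conceptually, the guard behaves something like this sketch (for
# orientation only - not the actual implementation):
#
#     bucket, _, key = script_location.replace("s3://", "", 1).partition("/")
#     if not key:
#         self.report.num_job_script_location_invalid += 1
#         return None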


def test_empty_s3_key_handling():
    """Test that empty S3 keys in script paths are handled gracefully without crashing."""

    # Mock jobs with empty S3 keys (malformed S3 URIs)
    get_jobs_response_with_empty_key = {
        "Jobs": [
            {
                "Name": "test-job-empty-key",
                "Description": "Job with empty S3 key",
                "Role": "arn:aws:iam::123412341234:role/service-role/AWSGlueServiceRole-glue-crawler",
                "CreatedOn": datetime.datetime(2021, 6, 10, 16, 51, 25, 690000),
                "LastModifiedOn": datetime.datetime(2021, 6, 10, 16, 55, 35, 307000),
                "ExecutionProperty": {"MaxConcurrentRuns": 1},
                "Command": {
                    "Name": "glueetl",
                    # This S3 URI has no key/path component - just a bucket
                    "ScriptLocation": "s3://bucket-only/",
                    "PythonVersion": "3",
                },
            },
            {
                "Name": "test-job-bucket-only",
                "Description": "Job with bucket-only S3 URI",
                "Role": "arn:aws:iam::123412341234:role/service-role/AWSGlueServiceRole-glue-crawler",
                "CreatedOn": datetime.datetime(2021, 6, 10, 16, 51, 25, 690000),
                "LastModifiedOn": datetime.datetime(2021, 6, 10, 16, 55, 35, 307000),
                "ExecutionProperty": {"MaxConcurrentRuns": 1},
                "Command": {
                    "Name": "glueetl",
                    # This S3 URI has no key/path component - just a bucket, no trailing slash
                    "ScriptLocation": "s3://bucket-only",
                    "PythonVersion": "3",
                },
            },
        ]
    }

    # Configuration with minimal settings
    config = GlueSourceConfig(
        aws_region="us-west-2",
        extract_transforms=True,
    )

    ctx = PipelineContext(run_id="test_run")

    # Create the Glue source instance
    glue_source_instance = GlueSource(config=config, ctx=ctx)

    # Stub the Glue client responses
    with Stubber(glue_source_instance.glue_client) as glue_stubber:
        # Add response for get_databases (required by get_workunits_internal)
        glue_stubber.add_response("get_databases", {"DatabaseList": []}, {})

        # Add response for get_jobs
        glue_stubber.add_response("get_jobs", get_jobs_response_with_empty_key, {})

        # Record the report counter before processing
        initial_invalid_count = (
            glue_source_instance.report.num_job_script_location_invalid
        )

        # Call the method that processes jobs - this should not crash
        workunits = list(glue_source_instance.get_workunits_internal())

        # Verify that the invalid script location counter was incremented,
        # by 2, since both jobs have empty keys
        expected_invalid_count = initial_invalid_count + 2
        assert (
            glue_source_instance.report.num_job_script_location_invalid
            == expected_invalid_count
        )

        # Verify that we still get workunits for the jobs themselves (DataFlow
        # entities) even though their script processing failed
        job_names = {"test-job-empty-key", "test-job-bucket-only"}
        dataflow_workunits = [wu for wu in workunits if wu.id in job_names]
        assert (
            len(dataflow_workunits) == 2
        )  # Should have DataFlow entities for both jobs

        # No S3 get_object calls should have been made, since jobs with empty
        # keys are skipped; this implicitly tests that we don't crash with the
        # ParamValidationError.
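
        # As a final sanity check, confirm every stubbed Glue response above
        # was consumed; the Stubber would also have raised on any un-stubbed call.
        glue_stubber.assert_no_pending_responses()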


def test_get_dataflow_graph_with_empty_key_directly():
    """Test the get_dataflow_graph method directly with empty S3 keys."""

    config = GlueSourceConfig(
        aws_region="us-west-2",
        extract_transforms=True,
    )

    ctx = PipelineContext(run_id="test_run")
    glue_source_instance = GlueSource(config=config, ctx=ctx)

    # Test case 1: S3 URI with trailing slash (empty key)
    flow_urn = "urn:li:dataFlow:(glue,test-job,PROD)"
    result = glue_source_instance.get_dataflow_graph("s3://bucket-only/", flow_urn)

    assert result is None
    assert glue_source_instance.report.num_job_script_location_invalid == 1

    # Test case 2: S3 URI without trailing slash (still an empty key)
    result = glue_source_instance.get_dataflow_graph("s3://bucket-only", flow_urn)

    assert result is None
    assert glue_source_instance.report.num_job_script_location_invalid == 2

    # Test case 3: a valid S3 URI should still pass validation. No real S3
    # call is made; reaching the mocked get_object proves that our validation
    # does not break valid paths.
    with patch.object(glue_source_instance.s3_client, "get_object") as mock_s3_get:
        mock_s3_get.side_effect = Exception(
            "S3 call should happen but we're mocking it"
        )

        try:
            glue_source_instance.get_dataflow_graph(
                "s3://valid-bucket/path/to/script.py", flow_urn
            )
        except Exception as e:
            # Should reach the S3 call and get our mocked exception, proving
            # validation passed
            assert "S3 call should happen" in str(e)

        # Invalid count should remain the same (not incremented for a valid path)
        assert glue_source_instance.report.num_job_script_location_invalid == 2


def test_original_error_scenario():
    """Test the exact scenario from the original error to ensure it's fixed."""

    config = GlueSourceConfig(
        aws_region="us-west-2",
        extract_transforms=True,
    )

    ctx = PipelineContext(run_id="test_run")
    glue_source_instance = GlueSource(config=config, ctx=ctx)

    # This scenario used to trigger the original ParamValidationError:
    # "Invalid length for parameter Key, value: 0, valid min length: 1"
    flow_urn = "urn:li:dataFlow:(glue,problematic-job,PROD)"

    # This call should no longer raise a ParamValidationError
    result = glue_source_instance.get_dataflow_graph("s3://bucket/", flow_urn)

    # Should return None instead of crashing
    assert result is None

    # Should increment the invalid counter
    assert glue_source_instance.report.num_job_script_location_invalid == 1


def test_param_validation_error_handling():
    """Test that ParamValidationError from S3 get_object is properly caught and handled."""

    config = GlueSourceConfig(
        aws_region="us-west-2",
        extract_transforms=True,
    )

    ctx = PipelineContext(run_id="test_run")
    glue_source_instance = GlueSource(config=config, ctx=ctx)

    flow_urn = "urn:li:dataFlow:(glue,test-job,PROD)"

    # Test case 1: mock a ParamValidationError during the S3 get_object call
    with patch.object(glue_source_instance.s3_client, "get_object") as mock_s3_get:
        # Simulate the ParamValidationError that would occur with invalid S3 parameters
        mock_s3_get.side_effect = botocore.exceptions.ParamValidationError(
            report="Invalid length for parameter Key, value: 0, valid min length: 1"
        )

        initial_invalid_count = (
            glue_source_instance.report.num_job_script_location_invalid
        )

        result = glue_source_instance.get_dataflow_graph(
            "s3://test-bucket/invalid-key", flow_urn
        )

        # Should return None instead of crashing
        assert result is None

        # Should increment the invalid script location counter
        assert (
            glue_source_instance.report.num_job_script_location_invalid
            == initial_invalid_count + 1
        )

        # Verify the S3 get_object was actually called, so we know the error
        # handling sits in the right place
        mock_s3_get.assert_called_once_with(Bucket="test-bucket", Key="invalid-key")
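
        # Note that "s3://test-bucket/invalid-key" is structurally valid; the
        # ParamValidationError here comes from the mock's side_effect, letting
        # the error-handling branch be exercised in isolation.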

    # Test case 2: a different ParamValidationError scenario
    with patch.object(glue_source_instance.s3_client, "get_object") as mock_s3_get:
        # Simulate another type of ParamValidationError
        mock_s3_get.side_effect = botocore.exceptions.ParamValidationError(
            report="Parameter validation failed: Invalid bucket name"
        )

        initial_invalid_count = (
            glue_source_instance.report.num_job_script_location_invalid
        )

        result = glue_source_instance.get_dataflow_graph(
            "s3://invalid-bucket-name/script.py", flow_urn
        )

        # Should return None instead of crashing
        assert result is None

        # Should increment the invalid script location counter
        assert (
            glue_source_instance.report.num_job_script_location_invalid
            == initial_invalid_count + 1
        )

        # Verify the S3 get_object was called with the expected parameters
        mock_s3_get.assert_called_once_with(
            Bucket="invalid-bucket-name", Key="script.py"
        )


if __name__ == "__main__":
    test_empty_s3_key_handling()
    test_get_dataflow_graph_with_empty_key_directly()
    test_original_error_scenario()
    test_param_validation_error_handling()
    print("All tests passed!")