# datahub/metadata-ingestion/tests/unit/glue/test_empty_s3_key_fix.py
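"""Unit tests for graceful handling of empty S3 keys in Glue job script locations.

These tests cover the fix for the boto3 error "Invalid length for parameter
Key, value: 0, valid min length: 1": GlueSource.get_dataflow_graph is expected
to return None for malformed script locations and to increment
report.num_job_script_location_invalid instead of raising.
"""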

import datetime
from unittest.mock import patch

import botocore.exceptions
from botocore.stub import Stubber

from datahub.ingestion.api.common import PipelineContext
from datahub.ingestion.source.aws.glue import GlueSource, GlueSourceConfig


def test_empty_s3_key_handling():
    """Test that empty S3 keys in script paths are handled gracefully without crashing."""
    # Mock jobs whose ScriptLocation is a malformed S3 URI (bucket only, empty key)
    get_jobs_response_with_empty_key = {
        "Jobs": [
            {
                "Name": "test-job-empty-key",
                "Description": "Job with empty S3 key",
                "Role": "arn:aws:iam::123412341234:role/service-role/AWSGlueServiceRole-glue-crawler",
                "CreatedOn": datetime.datetime(2021, 6, 10, 16, 51, 25, 690000),
                "LastModifiedOn": datetime.datetime(2021, 6, 10, 16, 55, 35, 307000),
                "ExecutionProperty": {"MaxConcurrentRuns": 1},
                "Command": {
                    "Name": "glueetl",
                    # This S3 URI has no key/path component - just a bucket
                    "ScriptLocation": "s3://bucket-only/",
                    "PythonVersion": "3",
                },
            },
            {
                "Name": "test-job-bucket-only",
                "Description": "Job with bucket-only S3 URI",
                "Role": "arn:aws:iam::123412341234:role/service-role/AWSGlueServiceRole-glue-crawler",
                "CreatedOn": datetime.datetime(2021, 6, 10, 16, 51, 25, 690000),
                "LastModifiedOn": datetime.datetime(2021, 6, 10, 16, 55, 35, 307000),
                "ExecutionProperty": {"MaxConcurrentRuns": 1},
                "Command": {
                    "Name": "glueetl",
                    # This S3 URI has no key/path component - just a bucket without a trailing slash
                    "ScriptLocation": "s3://bucket-only",
                    "PythonVersion": "3",
                },
            },
        ]
    }

    # Configuration with minimal settings
    config = GlueSourceConfig(
        aws_region="us-west-2",
        extract_transforms=True,
    )
    ctx = PipelineContext(run_id="test_run")

    # Create Glue source instance
    glue_source_instance = GlueSource(config=config, ctx=ctx)

    # Stub the Glue client responses
    with Stubber(glue_source_instance.glue_client) as glue_stubber:
        # Add response for get_databases (required by get_workunits_internal)
        glue_stubber.add_response("get_databases", {"DatabaseList": []}, {})
        # Add response for get_jobs
        glue_stubber.add_response("get_jobs", get_jobs_response_with_empty_key, {})

        # Capture the report counter before processing
        initial_invalid_count = (
            glue_source_instance.report.num_job_script_location_invalid
        )

        # Call the method that processes jobs - this should not crash
        workunits = list(glue_source_instance.get_workunits_internal())

        # Verify that the invalid script location counter was incremented by 2,
        # since both jobs have empty keys
        expected_invalid_count = initial_invalid_count + 2
        assert (
            glue_source_instance.report.num_job_script_location_invalid
            == expected_invalid_count
        )

        # Verify that we still get workunits for the jobs themselves (DataFlow
        # entities) even though their script processing failed
        job_names = {"test-job-empty-key", "test-job-bucket-only"}
        dataflow_workunits = [wu for wu in workunits if wu.id in job_names]
        assert len(dataflow_workunits) == 2  # DataFlow entities for both jobs

        # No S3 get_object call is ever made (empty keys are skipped), which
        # implicitly verifies that we don't crash with a ParamValidationError
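
# For reference, both malformed ScriptLocation values above split into a bucket
# and an empty key. A sketch with urllib.parse (illustrative only; not
# necessarily how GlueSource itself parses the URI):
#
#     from urllib.parse import urlparse
#
#     urlparse("s3://bucket-only/")  # netloc="bucket-only", path="/"
#     urlparse("s3://bucket-only")   # netloc="bucket-only", path=""
#
# In both cases path.strip("/") is empty, so there is no object key to fetch.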


def test_get_dataflow_graph_with_empty_key_directly():
    """Test the get_dataflow_graph method directly with empty S3 keys."""
    config = GlueSourceConfig(
        aws_region="us-west-2",
        extract_transforms=True,
    )
    ctx = PipelineContext(run_id="test_run")
    glue_source_instance = GlueSource(config=config, ctx=ctx)

    # Test case 1: S3 URI with trailing slash (empty key)
    flow_urn = "urn:li:dataFlow:(glue,test-job,PROD)"
    result = glue_source_instance.get_dataflow_graph("s3://bucket-only/", flow_urn)
    assert result is None
    assert glue_source_instance.report.num_job_script_location_invalid == 1

    # Test case 2: S3 URI without trailing slash (still an empty key)
    result = glue_source_instance.get_dataflow_graph("s3://bucket-only", flow_urn)
    assert result is None
    assert glue_source_instance.report.num_job_script_location_invalid == 2

    # Test case 3: a valid S3 URI should pass validation and reach the (mocked)
    # S3 call, proving that the validation doesn't break valid paths
    with patch.object(glue_source_instance.s3_client, "get_object") as mock_s3_get:
        mock_s3_get.side_effect = Exception(
            "S3 call should happen but we're mocking it"
        )
        try:
            glue_source_instance.get_dataflow_graph(
                "s3://valid-bucket/path/to/script.py", flow_urn
            )
        except Exception as e:
            # Reaching the mocked S3 call and getting our exception proves
            # that validation passed
            assert "S3 call should happen" in str(e)
        # Guard against the exception being swallowed upstream: the S3 call
        # must actually have been attempted
        mock_s3_get.assert_called_once()

    # Invalid count should remain the same (not incremented for the valid path)
    assert glue_source_instance.report.num_job_script_location_invalid == 2


def test_original_error_scenario():
    """Test the exact scenario from the original error to ensure it's fixed."""
    config = GlueSourceConfig(
        aws_region="us-west-2",
        extract_transforms=True,
    )
    ctx = PipelineContext(run_id="test_run")
    glue_source_instance = GlueSource(config=config, ctx=ctx)

    # This input used to trigger the original ParamValidationError:
    # "Invalid length for parameter Key, value: 0, valid min length: 1"
    flow_urn = "urn:li:dataFlow:(glue,problematic-job,PROD)"

    # This call should not raise ParamValidationError anymore
    result = glue_source_instance.get_dataflow_graph("s3://bucket/", flow_urn)

    # Should return None instead of crashing
    assert result is None
    # Should increment the invalid counter
    assert glue_source_instance.report.num_job_script_location_invalid == 1
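
# Context for the scenario above (a note, not exercised by the test directly):
# boto3 validates parameters client-side against the S3 service model, so
# s3_client.get_object(Bucket="bucket", Key="") raises
# botocore.exceptions.ParamValidationError before any request is sent -
# which is why an empty key used to crash ingestion.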


def test_param_validation_error_handling():
    """Test that ParamValidationError from S3 get_object is properly caught and handled."""
    config = GlueSourceConfig(
        aws_region="us-west-2",
        extract_transforms=True,
    )
    ctx = PipelineContext(run_id="test_run")
    glue_source_instance = GlueSource(config=config, ctx=ctx)
    flow_urn = "urn:li:dataFlow:(glue,test-job,PROD)"

    # Test case 1: mock a ParamValidationError during the S3 get_object call
    with patch.object(glue_source_instance.s3_client, "get_object") as mock_s3_get:
        # Simulate the ParamValidationError that would occur with invalid S3 parameters
        mock_s3_get.side_effect = botocore.exceptions.ParamValidationError(
            report="Invalid length for parameter Key, value: 0, valid min length: 1"
        )
        initial_invalid_count = (
            glue_source_instance.report.num_job_script_location_invalid
        )

        result = glue_source_instance.get_dataflow_graph(
            "s3://test-bucket/invalid-key", flow_urn
        )

        # Should return None instead of crashing
        assert result is None
        # Should increment the invalid script location counter
        assert (
            glue_source_instance.report.num_job_script_location_invalid
            == initial_invalid_count + 1
        )
        # Verify that S3 get_object was actually called, so we know the error
        # handling sits in the right place
        mock_s3_get.assert_called_once_with(Bucket="test-bucket", Key="invalid-key")

    # Test case 2: a different ParamValidationError scenario
    with patch.object(glue_source_instance.s3_client, "get_object") as mock_s3_get:
        # Simulate another type of ParamValidationError
        mock_s3_get.side_effect = botocore.exceptions.ParamValidationError(
            report="Parameter validation failed: Invalid bucket name"
        )
        initial_invalid_count = (
            glue_source_instance.report.num_job_script_location_invalid
        )

        result = glue_source_instance.get_dataflow_graph(
            "s3://invalid-bucket-name/script.py", flow_urn
        )

        # Should return None instead of crashing
        assert result is None
        # Should increment the invalid script location counter
        assert (
            glue_source_instance.report.num_job_script_location_invalid
            == initial_invalid_count + 1
        )
        # Verify that S3 get_object was called with the expected parameters
        mock_s3_get.assert_called_once_with(
            Bucket="invalid-bucket-name", Key="script.py"
        )
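
# These are plain pytest-style test functions; the __main__ block below is just
# a convenience runner. Under pytest, something like the following should work
# (path taken from this file's location in the repo):
#
#     pytest metadata-ingestion/tests/unit/glue/test_empty_s3_key_fix.py -v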


if __name__ == "__main__":
    test_empty_s3_key_handling()
    test_get_dataflow_graph_with_empty_key_directly()
    test_original_error_scenario()
    test_param_validation_error_handling()
    print("All tests passed!")