# datahub/metadata-ingestion/tests/unit/test_library_examples.py

"""
Test that all library examples can at least be compiled and imported.

This test doesn't execute the examples (they require a running DataHub instance),
but it verifies that:
1. The syntax is valid (no compilation errors)
2. All imports resolve correctly
3. No obvious runtime errors in module-level code

This catches common issues like:
- Missing imports
- Typos in class/function names
- Invalid syntax
"""

import os
import py_compile
import subprocess
import sys
from pathlib import Path

import pytest

# Location of the library examples: "examples/library" under the third parent
# directory of this test file.
EXAMPLES_DIR = Path(__file__).parent.parent.parent.joinpath("examples", "library")


def get_library_examples():
    """Return the sorted names of the ``.py`` files directly in EXAMPLES_DIR.

    Entries are bare file names (not paths) taken from a non-recursive
    directory listing. An empty list is returned when EXAMPLES_DIR does not
    exist.
    """
    if EXAMPLES_DIR.exists():
        return sorted(
            entry for entry in os.listdir(EXAMPLES_DIR) if entry.endswith(".py")
        )
    return []


@pytest.mark.parametrize("example_file", get_library_examples())
def test_example_compiles(example_file):
    """Check that one example file byte-compiles, i.e. contains valid syntax.

    ``example_file`` is a file name relative to EXAMPLES_DIR.
    """
    source_path = EXAMPLES_DIR / example_file
    try:
        # doraise=True turns a compile error into a PyCompileError exception.
        py_compile.compile(source_path, doraise=True)
    except py_compile.PyCompileError as compile_error:
        pytest.fail(f"Compilation failed for {example_file}: {compile_error}")


@pytest.mark.parametrize("example_file", get_library_examples())
def test_example_imports(example_file):
    """
    Test that each example file byte-compiles in a separate interpreter process.

    The check runs ``python -m py_compile`` on the file using the interpreter
    that is running the test session. Note that ``py_compile`` only compiles
    the source: the module is never executed, so import statements are not
    resolved here and the example's main logic (which would require a DataHub
    instance) never runs.

    ``example_file`` is a file name relative to EXAMPLES_DIR.
    """
    filepath = EXAMPLES_DIR / example_file

    # Use sys.executable rather than a bare "python": the latter may be missing
    # from PATH or point at a different interpreter than the one under test.
    result = subprocess.run(
        [sys.executable, "-m", "py_compile", str(filepath)],
        capture_output=True,
        text=True,
    )

    # A non-zero exit status means py_compile rejected the file; surface both
    # output streams so the failure can be diagnosed from the test report.
    if result.returncode != 0:
        pytest.fail(
            f"Import check failed for {example_file}:\n"
            f"stdout: {result.stdout}\n"
            f"stderr: {result.stderr}"
        )


def test_all_examples_accounted_for():
    """Sanity-check example discovery: non-empty, more than 50, known files present."""
    discovered = get_library_examples()
    assert len(discovered) > 0, "No example files found in examples/library"

    # Discovery is expected to find more than fifty example files.
    example_count = len(discovered)
    assert example_count > 50, f"Expected 50+ examples, found {example_count}"

    # Specific example files that discovery must pick up.
    required_examples = (
        "notebook_create.py",
        "notebook_add_content.py",
        "notebook_add_owner.py",
        "notebook_add_tags.py",
        "data_platform_create.py",
    )
    for required in required_examples:
        assert required in discovered, f"New example {required} not found in library"


# ==================== Unit tests for refactored examples ====================
# These tests verify that examples produce valid metadata structures


def test_create_notebook_metadata():
    """Check that create_notebook_metadata builds a notebookInfo MCP from its inputs."""
    from examples.library.notebook_create import create_notebook_metadata

    # Inputs that are asserted on again below.
    notebook_urn = "urn:li:notebook:(test,test_notebook)"
    actor_urn = "urn:li:corpuser:test"
    created_at_millis = 1234567890000

    mcp = create_notebook_metadata(
        notebook_urn=notebook_urn,
        title="Test Notebook",
        description="A test notebook",
        external_url="https://example.com/notebook",
        custom_properties={"key": "value"},
        actor=actor_urn,
        timestamp_millis=created_at_millis,
    )

    # Envelope: the MCP targets the notebook and carries a notebookInfo aspect.
    assert mcp.entityUrn == notebook_urn
    assert mcp.aspectName == "notebookInfo"
    assert mcp.aspect is not None

    # Payload: every input is reflected on the aspect.
    info = mcp.aspect
    assert info.title == "Test Notebook"
    assert info.description == "A test notebook"
    assert info.externalUrl == "https://example.com/notebook"
    assert info.customProperties == {"key": "value"}

    created_stamp = info.changeAuditStamps.created
    assert created_stamp.actor == actor_urn
    assert created_stamp.time == created_at_millis


def test_create_notebook_main_with_mock_emitter():
    """Check that notebook_create.main emits exactly one notebookInfo MCP."""
    from unittest import mock

    from examples.library.notebook_create import main

    emitter = mock.Mock()
    main(emitter=emitter)

    # Exactly one emit call is expected.
    assert emitter.emit.call_count == 1

    # The MCP is the first positional argument of the recorded emit call.
    positional_args = emitter.emit.call_args[0]
    emitted_mcp = positional_args[0]
    assert emitted_mcp.entityUrn == "urn:li:notebook:(querybook,customer_analysis_2024)"
    assert emitted_mcp.aspectName == "notebookInfo"
    assert emitted_mcp.aspect.title == "Customer Segmentation Analysis 2024"


def test_query_dataset_deprecation_not_deprecated():
    """Check the result when the dataset's aspect lookup returns None."""
    from unittest import mock

    from datahub.sdk import DatasetUrn
    from examples.library.dataset_query_deprecation import query_dataset_deprecation

    # Stub dataset whose aspect lookup yields nothing, served by a stub client.
    dataset_stub = mock.Mock()
    dataset_stub._get_aspect.return_value = None
    client_stub = mock.Mock()
    client_stub.entities.get.return_value = dataset_stub

    urn = DatasetUrn(platform="test", name="test_table", env="PROD")
    is_deprecated, note, decommission_time = query_dataset_deprecation(
        client_stub, urn
    )

    # No deprecation info: flag is falsy and both details are absent.
    assert not is_deprecated
    assert note is None
    assert decommission_time is None


def test_query_dataset_deprecation_deprecated():
    """Check the result when the dataset's aspect lookup returns a deprecation."""
    from unittest import mock

    from datahub.metadata.schema_classes import DeprecationClass
    from datahub.sdk import DatasetUrn
    from examples.library.dataset_query_deprecation import query_dataset_deprecation

    # Values placed on the stubbed aspect and expected back from the query.
    expected_note = "This table is deprecated"
    expected_decommission_time = 1234567890000

    # Stub dataset that hands back a deprecation aspect, served by a stub client.
    dataset_stub = mock.Mock()
    dataset_stub._get_aspect.return_value = DeprecationClass(
        deprecated=True,
        note=expected_note,
        decommissionTime=expected_decommission_time,
        actor="urn:li:corpuser:test",
    )
    client_stub = mock.Mock()
    client_stub.entities.get.return_value = dataset_stub

    urn = DatasetUrn(platform="test", name="test_table", env="PROD")
    is_deprecated, note, decommission_time = query_dataset_deprecation(
        client_stub, urn
    )

    assert is_deprecated
    assert note == expected_note
    assert decommission_time == expected_decommission_time


def test_add_terms_to_dataset():
    """Check that add_terms_to_dataset adds two terms and calls update once."""
    from unittest import mock

    from datahub.sdk import DatasetUrn, GlossaryTermUrn
    from examples.library.dataset_add_term import add_terms_to_dataset

    # Stub client: entity lookups return the stub dataset, and resolve.term
    # returns a fixed URN (presumably used for the plain-name term — confirm
    # against the example's implementation).
    client_stub = mock.Mock()
    dataset_stub = mock.Mock()
    client_stub.entities.get.return_value = dataset_stub
    client_stub.resolve.term.return_value = GlossaryTermUrn("ResolvedTerm")

    terms = [
        GlossaryTermUrn("Classification.HighlyConfidential"),
        "PII",  # plain name rather than a URN object
    ]
    add_terms_to_dataset(
        client=client_stub,
        dataset_urn=DatasetUrn(platform="test", name="test_table", env="PROD"),
        term_urns=terms,
    )

    # One add_term call per requested term.
    assert dataset_stub.add_term.call_count == 2

    # The first call received the URN that was passed in explicitly.
    first_call_positional = dataset_stub.add_term.call_args_list[0][0]
    first_term = first_call_positional[0]
    assert str(first_term) == "urn:li:glossaryTerm:Classification.HighlyConfidential"

    # The dataset was written back through the client exactly once.
    assert client_stub.entities.update.call_count == 1