datahub/metadata-ingestion/tests/unit/cli/dataset/test_dataset_cmd.py

import shutil
from pathlib import Path
from unittest.mock import MagicMock, patch

import pytest
from click.testing import CliRunner

from datahub.cli.specific.dataset_cli import dataset
from tests.test_helpers.mce_helpers import check_goldens_stream

TEST_RESOURCES_DIR = Path(__file__).parent / "test_resources"


@pytest.fixture
def test_yaml_file():
    """Creates a temporary test yaml file for testing."""
    # Define test data path
    test_file = TEST_RESOURCES_DIR / "dataset.yaml"

    # Create a temporary copy to work with
    temp_file = Path(f"{test_file}.tmp")
    shutil.copyfile(test_file, temp_file)

    yield temp_file

    # Clean up
    if temp_file.exists():
        temp_file.unlink()


@pytest.fixture
def invalid_value_yaml_file():
    """Creates a temporary  yaml file - correctly formatted but bad datatype for testing."""
    invalid_content = """
## This file is intentionally malformed
- id: user.badformat
  platform: hive
  schema:
    fields:
    - id: ip
      type: bad_type
      description: The IP address
    """

    # Create a temporary file
    temp_file = TEST_RESOURCES_DIR / "invalid_dataset.yaml.tmp"
    with open(temp_file, "w") as f:
        f.write(invalid_content)

    yield temp_file

    # Clean up
    if temp_file.exists():
        temp_file.unlink()


@pytest.fixture
def malformed_yaml_file():
    """Creates a temporary malformed yaml file for testing."""
    malformed_content = """
## This file is intentionally malformed
- id: user.badformat
  platform: hive
  schema:
    fields:
    - id: ip
      type string  # Missing colon here
      description: The IP address
    """

    # Create a temporary file
    temp_file = TEST_RESOURCES_DIR / "malformed_dataset.yaml.tmp"
    with open(temp_file, "w") as f:
        f.write(malformed_content)

    yield temp_file

    # Clean up
    if temp_file.exists():
        temp_file.unlink()


@pytest.fixture
def fixable_yaml_file():
    """Creates a temporary yaml file with fixable formatting issues."""
    fixable_content = """
## This file has fixable formatting issues
- id: user.fixable
  platform: hive
  schema:
    fields:
    - id: ip
      type: string
      description: The IP address
    - id:  user_id     # Extra spaces
      type: string
      description:    The user ID   # Extra spaces
    """

    temp_file = TEST_RESOURCES_DIR / "fixable_dataset.yaml.tmp"
    with open(temp_file, "w") as f:
        f.write(fixable_content)

    yield temp_file

    # Clean up
    if temp_file.exists():
        temp_file.unlink()


class TestDatasetCli:
    def test_dataset_file_command_exists(self):
        """Test that the dataset file command exists."""
        runner = CliRunner()
        result = runner.invoke(dataset, ["--help"])
        assert result.exit_code == 0
        assert "file" in result.output

    @patch("datahub.cli.specific.dataset_cli.Dataset")
    def test_lint_check_no_issues(self, mock_dataset, test_yaml_file):
        """Test the lintCheck option when no issues are found."""
        # Setup mocks
        mock_dataset_instance = MagicMock()
        mock_dataset.from_yaml.return_value = [mock_dataset_instance]
        mock_dataset_instance.to_yaml.return_value = None

        # Mock filecmp.cmp to return True (files match)
        with patch("filecmp.cmp", return_value=True):
            runner = CliRunner()
            result = runner.invoke(
                dataset, ["file", "--lintCheck", str(test_yaml_file)]
            )

            # Verify
            assert result.exit_code == 0
            assert "No differences found" in result.output
            mock_dataset.from_yaml.assert_called_once()
            mock_dataset_instance.to_yaml.assert_called_once()

    @patch("datahub.cli.specific.dataset_cli.Dataset")
    @patch("os.system")
    def test_lint_check_with_issues(self, mock_system, mock_dataset, fixable_yaml_file):
        """Test the lintCheck option when issues are found."""
        # Setup mocks
        mock_dataset_instance = MagicMock()
        mock_dataset.from_yaml.return_value = [mock_dataset_instance]

        # Mock filecmp.cmp to return False (files don't match)
        with patch("filecmp.cmp", return_value=False):
            runner = CliRunner()
            result = runner.invoke(
                dataset, ["file", "--lintCheck", str(fixable_yaml_file)]
            )

            # Verify
            assert result.exit_code == 0
            assert "To fix these differences" in result.output
            mock_dataset.from_yaml.assert_called_once()
            mock_dataset_instance.to_yaml.assert_called_once()
            mock_system.assert_called_once()  # Should call diff

    @patch("datahub.cli.specific.dataset_cli.Dataset")
    @patch("os.system")
    @patch("shutil.copyfile")
    def test_lint_fix(
        self, mock_copyfile, mock_system, mock_dataset, fixable_yaml_file
    ):
        """Test the lintFix option."""
        # Setup mocks
        mock_dataset_instance = MagicMock()
        mock_dataset.from_yaml.return_value = [mock_dataset_instance]

        # Mock filecmp.cmp to return False (files don't match)
        with patch("filecmp.cmp", return_value=False):
            runner = CliRunner()
            result = runner.invoke(
                dataset, ["file", "--lintCheck", "--lintFix", str(fixable_yaml_file)]
            )

            # Verify
            assert result.exit_code == 0
            assert "Fixed linting issues" in result.output

            # Check that copyfile was called twice:
            # 1. To copy the original file to the temp file
            # 2. To copy the fixed temp file back to the original
            assert mock_copyfile.call_count == 2

            # The second call should copy from temp file to the original
            mock_copyfile.call_args_list[1][0][0]  # Source of second call
            assert mock_copyfile.call_args_list[1][0][1] == str(
                fixable_yaml_file
            )  # Destination

    @patch("datahub.cli.specific.dataset_cli.Dataset")
    def test_error_handling(self, mock_dataset, malformed_yaml_file):
        """Test error handling when processing a malformed yaml file."""
        # Setup mock to raise an exception
        mock_dataset.from_yaml.side_effect = Exception("YAML parsing error")

        runner = CliRunner()
        result = runner.invoke(
            dataset, ["file", "--lintCheck", str(malformed_yaml_file)]
        )

        # Verify exception is properly handled
        assert result.exit_code != 0
        mock_dataset.from_yaml.assert_called_once()

    def test_temporary_file_cleanup(self, test_yaml_file):
        """Test that temporary files are properly cleaned up."""
        # Count files in the directory before
        files_before = len(list(TEST_RESOURCES_DIR.glob("*.tmp")))

        runner = CliRunner()
        with patch("datahub.cli.specific.dataset_cli.Dataset"), patch(
            "filecmp.cmp", return_value=True
        ):
            runner.invoke(dataset, ["file", "--lintCheck", str(test_yaml_file)])

        # Count files after
        files_after = len(list(TEST_RESOURCES_DIR.glob("*.tmp")))

        # Should be same count (our fixture creates one tmp file)
        assert files_before == files_after

    @patch("datahub.cli.specific.dataset_cli.Dataset")
    def test_multiple_datasets_in_file(self, mock_dataset, test_yaml_file):
        """Test handling of multiple datasets defined in a single file."""
        # Create mock dataset instances
        mock_dataset1 = MagicMock()
        mock_dataset2 = MagicMock()
        mock_dataset.from_yaml.return_value = [mock_dataset1, mock_dataset2]

        with patch("filecmp.cmp", return_value=True):
            runner = CliRunner()
            result = runner.invoke(
                dataset, ["file", "--lintCheck", str(test_yaml_file)]
            )

            # Verify
            assert result.exit_code == 0
            assert "No differences found" in result.output

            # Verify both dataset instances had to_yaml called
            mock_dataset1.to_yaml.assert_called_once()
            mock_dataset2.to_yaml.assert_called_once()

    @patch("datahub.cli.specific.dataset_cli.get_default_graph")
    def test_dry_run_sync(self, mock_get_default_graph, test_yaml_file):
        mock_graph = MagicMock()
        mock_graph.exists.return_value = True
        mock_get_default_graph.return_value.__enter__.return_value = mock_graph

        runner = CliRunner()
        result = runner.invoke(
            dataset, ["sync", "--dry-run", "--to-datahub", "-f", str(test_yaml_file)]
        )

        # Verify
        assert result.exit_code == 0
        assert not mock_get_default_graph.emit.called

    @patch("datahub.cli.specific.dataset_cli.get_default_graph")
    def test_dry_run_sync_fail_bad_type(
        self, mock_get_default_graph, invalid_value_yaml_file
    ):
        mock_graph = MagicMock()
        mock_graph.exists.return_value = True
        mock_get_default_graph.return_value.__enter__.return_value = mock_graph

        runner = CliRunner()
        result = runner.invoke(
            dataset,
            ["sync", "--dry-run", "--to-datahub", "-f", str(invalid_value_yaml_file)],
        )

        # Verify
        assert result.exit_code != 0
        assert not mock_get_default_graph.emit.called
        assert "Type bad_type is not a valid primitive type" in result.output

    @patch("datahub.cli.specific.dataset_cli.get_default_graph")
    def test_dry_run_sync_fail_missing_ref(
        self, mock_get_default_graph, test_yaml_file
    ):
        mock_graph = MagicMock()
        mock_graph.exists.return_value = False
        mock_get_default_graph.return_value.__enter__.return_value = mock_graph

        runner = CliRunner()
        result = runner.invoke(
            dataset, ["sync", "--dry-run", "--to-datahub", "-f", str(test_yaml_file)]
        )

        # Verify
        assert result.exit_code != 0
        assert not mock_get_default_graph.emit.called
        assert "missing entity reference" in result.output

    @patch("datahub.cli.specific.dataset_cli.get_default_graph")
    def test_run_sync(self, mock_get_default_graph, test_yaml_file):
        mock_graph = MagicMock()
        mock_graph.exists.return_value = True
        mock_get_default_graph.return_value.__enter__.return_value = mock_graph

        emitted_items = []

        def capture_emit(item, *args, **kwargs):
            emitted_items.append(item)
            return None

        mock_graph.emit.side_effect = capture_emit

        runner = CliRunner()
        result = runner.invoke(
            dataset, ["sync", "--to-datahub", "-f", str(test_yaml_file)]
        )

        # Verify
        assert result.exit_code == 0
        assert mock_graph.emit.called

        golden_file = Path(TEST_RESOURCES_DIR / "golden_test_dataset_sync_mpcs.json")
        check_goldens_stream(emitted_items, golden_file)

    @patch("datahub.cli.specific.dataset_cli.get_default_graph")
    def test_run_sync_fail(self, mock_get_default_graph, invalid_value_yaml_file):
        mock_graph = MagicMock()
        mock_graph.exists.return_value = True
        mock_get_default_graph.return_value.__enter__.return_value = mock_graph

        runner = CliRunner()
        result = runner.invoke(
            dataset, ["sync", "--to-datahub", "-f", str(invalid_value_yaml_file)]
        )

        # Verify
        assert result.exit_code != 0
        assert not mock_get_default_graph.emit.called
        assert "is not a valid primitive type" in result.output

    @patch("datahub.cli.specific.dataset_cli.get_default_graph")
    def test_run_upsert_fail(self, mock_get_default_graph, invalid_value_yaml_file):
        mock_graph = MagicMock()
        mock_graph.exists.return_value = True
        mock_get_default_graph.return_value.__enter__.return_value = mock_graph

        runner = CliRunner()
        result = runner.invoke(dataset, ["upsert", "-f", str(invalid_value_yaml_file)])

        # Verify
        assert result.exit_code != 0
        assert not mock_get_default_graph.emit.called
        assert "is not a valid primitive type" in result.output

    @patch("datahub.cli.specific.dataset_cli.get_default_graph")
    def test_sync_from_datahub_fail(self, mock_get_default_graph, test_yaml_file):
        mock_graph = MagicMock()
        mock_graph.exists.return_value = False
        mock_get_default_graph.return_value.__enter__.return_value = mock_graph

        runner = CliRunner()
        result = runner.invoke(
            dataset, ["sync", "--dry-run", "--from-datahub", "-f", str(test_yaml_file)]
        )

        # Verify
        assert result.exit_code != 0
        assert "does not exist" in result.output