from pathlib import PosixPath
from typing import Any, Dict, Optional, Union, cast
from unittest.mock import patch

import pytest
from freezegun import freeze_time
from iceberg.core.filesystem.file_status import FileStatus
from iceberg.core.filesystem.local_filesystem import LocalFileSystem

from datahub.ingestion.run.pipeline import Pipeline
from datahub.ingestion.source.iceberg.iceberg import IcebergSource
from datahub.ingestion.source.state.checkpoint import Checkpoint
from datahub.ingestion.source.state.entity_removal_state import GenericCheckpointState
from tests.test_helpers import mce_helpers
from tests.test_helpers.state_helpers import (
    run_and_get_pipeline,
    validate_all_providers_have_committed_successfully,
)

FROZEN_TIME = "2020-04-14 07:00:00"
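# The stateful-ingestion test below points its "datahub" state provider at this
# (mocked) DataHub GMS endpoint.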
GMS_PORT = 8080
GMS_SERVER = f"http://localhost:{GMS_PORT}"


def get_current_checkpoint_from_pipeline(
    pipeline: Pipeline,
) -> Optional[Checkpoint[GenericCheckpointState]]:
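    """Return the checkpoint recorded by the source's stale-entity-removal job."""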
    iceberg_source = cast(IcebergSource, pipeline.source)
    return iceberg_source.get_current_checkpoint(
        iceberg_source.stale_entity_removal_handler.job_id
    )


@freeze_time(FROZEN_TIME)
@pytest.mark.integration
def test_iceberg_ingest(pytestconfig, tmp_path, mock_time):
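    """Ingest a local Iceberg table and compare the emitted MCEs against a golden file."""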
    test_resources_dir = pytestconfig.rootpath / "tests/integration/iceberg/"

    # Run the metadata ingestion pipeline.
    pipeline = Pipeline.create(
        {
            "run_id": "iceberg-test",
            "source": {
                "type": "iceberg",
                "config": {
                    "localfs": str(test_resources_dir / "test_data/ingest_test"),
                    "user_ownership_property": "owner",
                    "group_ownership_property": "owner",
                },
            },
            "sink": {
                "type": "file",
                "config": {
                    "filename": f"{tmp_path}/iceberg_mces.json",
                },
            },
        }
    )
    pipeline.run()
    pipeline.raise_from_status()

    # Verify the output.
    mce_helpers.check_golden_file(
        pytestconfig,
        output_path=tmp_path / "iceberg_mces.json",
        golden_path=test_resources_dir
        / "test_data/ingest_test/iceberg_mces_golden.json",
    )


@freeze_time(FROZEN_TIME)
@pytest.mark.integration
def test_iceberg_stateful_ingest(pytestconfig, tmp_path, mock_time, mock_datahub_graph):
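    """Run the pipeline twice, first against run1 and then against run2 (where a
    table has been deleted), and verify that stateful ingestion reports the
    deleted table as removed.
    """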
    test_resources_dir = (
        pytestconfig.rootpath / "tests/integration/iceberg/test_data/stateful_test"
    )
    platform_instance = "test_platform_instance"

    scd_before_deletion: Dict[str, Any] = {
        "localfs": str(test_resources_dir / "run1"),
        "user_ownership_property": "owner",
        "group_ownership_property": "owner",
        "platform_instance": platform_instance,
        # Enable stateful ingestion.
        "stateful_ingestion": {
            "enabled": True,
            "remove_stale_metadata": True,
            "fail_safe_threshold": 100.0,
            "state_provider": {
                "type": "datahub",
                "config": {"datahub_api": {"server": GMS_SERVER}},
            },
        },
    }
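
    # Identical to the run1 config above, except that it reads from run2, where
    # one of the tables has been deleted.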
    scd_after_deletion: Dict[str, Any] = {
        "localfs": str(test_resources_dir / "run2"),
        "user_ownership_property": "owner",
        "group_ownership_property": "owner",
        "platform_instance": platform_instance,
        # Enable stateful ingestion.
        "stateful_ingestion": {
            "enabled": True,
            "remove_stale_metadata": True,
            "fail_safe_threshold": 100.0,
            "state_provider": {
                "type": "datahub",
                "config": {"datahub_api": {"server": GMS_SERVER}},
            },
        },
    }
    pipeline_config_dict: Dict[str, Any] = {
        "source": {
            "type": "iceberg",
            "config": scd_before_deletion,
        },
        "sink": {
            # We are not really interested in the resulting events for this test.
            "type": "console"
        },
        "pipeline_name": "test_pipeline",
    }

    with patch(
        "datahub.ingestion.source.state_provider.datahub_ingestion_checkpointing_provider.DataHubGraph",
        mock_datahub_graph,
    ) as mock_checkpoint:
        # Both checkpoint and reporting will use the same mocked graph instance.
        mock_checkpoint.return_value = mock_datahub_graph

        # Do the first run of the pipeline and get the default job's checkpoint.
        pipeline_run1 = run_and_get_pipeline(pipeline_config_dict)
        checkpoint1 = get_current_checkpoint_from_pipeline(pipeline_run1)

        assert checkpoint1
        assert checkpoint1.state

        # Switch to the iceberg config where a table has been deleted.
        pipeline_config_dict["source"]["config"] = scd_after_deletion
        # Capture the MCEs of the second run to validate Status(removed=true).
        deleted_mces_path = f"{tmp_path}/iceberg_deleted_mces.json"
        pipeline_config_dict["sink"]["type"] = "file"
        pipeline_config_dict["sink"]["config"] = {"filename": deleted_mces_path}

        # Do the second run of the pipeline.
        pipeline_run2 = run_and_get_pipeline(pipeline_config_dict)
        checkpoint2 = get_current_checkpoint_from_pipeline(pipeline_run2)

        assert checkpoint2
        assert checkpoint2.state

        # Perform all assertions on the states. The deleted table should not be
        # part of the second state.
        state1 = checkpoint1.state
        state2 = checkpoint2.state
        difference_urns = list(
            state1.get_urns_not_in(type="dataset", other_checkpoint_state=state2)
        )

        assert len(difference_urns) == 1

        urn1 = "urn:li:dataset:(urn:li:dataPlatform:iceberg,test_platform_instance.namespace.iceberg_test_2,PROD)"

        assert urn1 in difference_urns

        # Validate that all providers have committed successfully.
        validate_all_providers_have_committed_successfully(
            pipeline=pipeline_run1, expected_providers=1
        )
        validate_all_providers_have_committed_successfully(
            pipeline=pipeline_run2, expected_providers=1
        )

        # Verify the output.
        mce_helpers.check_golden_file(
            pytestconfig,
            output_path=deleted_mces_path,
            golden_path=test_resources_dir / "iceberg_deleted_table_mces_golden.json",
        )


@freeze_time(FROZEN_TIME)
@pytest.mark.integration
def test_iceberg_profiling(pytestconfig, tmp_path, mock_time):
    """
    This test uses a table created with https://github.com/tabular-io/docker-spark-iceberg.
    Here are the DDL statements that you can execute with `spark-sql`:
    ```SQL
    CREATE TABLE datahub.integration.profiling (
        field_int bigint COMMENT 'An integer field',
        field_str string COMMENT 'A string field',
        field_timestamp timestamp COMMENT 'A timestamp field')
    USING iceberg;

    INSERT INTO datahub.integration.profiling VALUES (1, 'row1', current_timestamp()), (2, 'row2', null);
    INSERT INTO datahub.integration.profiling VALUES (3, 'row3', current_timestamp()), (4, 'row4', null);
    ```

    When importing the metadata files into this test, we need to create a `version-hint.text` with a value that
    reflects the version of the table, and then change the code in `TestLocalFileSystem._replace_path()` accordingly.
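    The `version-hint.text` file holds just the table's version number, e.g. a
    single line containing `2` for a table whose current metadata file is v2.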
    """
    test_resources_dir = (
        pytestconfig.rootpath / "tests/integration/iceberg/test_data/profiling_test"
    )

    # Run the metadata ingestion pipeline.
    pipeline = Pipeline.create(
        {
            "run_id": "iceberg-test",
            "source": {
                "type": "iceberg",
                "config": {
                    "localfs": str(test_resources_dir),
                    "user_ownership_property": "owner",
                    "group_ownership_property": "owner",
                    "max_path_depth": 3,
                    "profiling": {
                        "enabled": True,
                    },
                    "table_pattern": {"allow": ["datahub.integration.profiling"]},
                },
            },
            "sink": {
                "type": "file",
                "config": {
                    "filename": f"{tmp_path}/iceberg_mces.json",
                },
            },
        }
    )

    class TestLocalFileSystem(LocalFileSystem):
        # This class acts as a wrapper on LocalFileSystem to intercept calls using a path location.
        # The wrapper will normalize those paths to be usable by the test.
        fs: LocalFileSystem

        @staticmethod
        def _replace_path(path: Union[str, PosixPath]) -> str:
            # When the Iceberg table was created, its warehouse folder was '/home/iceberg/warehouse'. Iceberg tables
            # are not portable, so we need to replace the warehouse folder by the test location at runtime.
            normalized_path: str = str(path).replace(
                "/home/iceberg/warehouse", str(test_resources_dir)
            )

            # When the Iceberg table was created, a postgres catalog was used instead of a HadoopCatalog. The HadoopCatalog
            # expects a file named 'v{}.metadata.json' where {} is the version number from 'version-hint.text'. Since
            # 'v2.metadata.json' does not exist, we redirect the call to '00002-cc241948-4c12-46d0-9a75-ce3578ec03d4.metadata.json'.
            if normalized_path.endswith("v2.metadata.json"):
                return normalized_path.replace(
                    "v2.metadata.json",
                    "00002-cc241948-4c12-46d0-9a75-ce3578ec03d4.metadata.json",
                )
            return normalized_path

        def __init__(self, fs: LocalFileSystem) -> None:
            # Store the wrapped filesystem on the class: 'fs' is only annotated above,
            # and the static fix_path() below reads it through the class, so an
            # instance attribute alone would raise AttributeError there.
            TestLocalFileSystem.fs = fs

        def open(self, path: str, mode: str = "rb") -> object:
            return self.fs.open(TestLocalFileSystem._replace_path(path), mode)

        def delete(self, path: str) -> None:
            self.fs.delete(TestLocalFileSystem._replace_path(path))

        def stat(self, path: str) -> FileStatus:
            return self.fs.stat(TestLocalFileSystem._replace_path(path))

        @staticmethod
        def fix_path(path: str) -> str:
            return TestLocalFileSystem.fs.fix_path(
                TestLocalFileSystem._replace_path(path)
            )

        def create(self, path: str, overwrite: bool = False) -> object:
            return self.fs.create(TestLocalFileSystem._replace_path(path), overwrite)

        def rename(self, src: str, dest: str) -> bool:
            return self.fs.rename(
                TestLocalFileSystem._replace_path(src),
                TestLocalFileSystem._replace_path(dest),
            )

        def exists(self, path: str) -> bool:
            return self.fs.exists(TestLocalFileSystem._replace_path(path))

    local_fs_wrapper: TestLocalFileSystem = TestLocalFileSystem(
        LocalFileSystem.get_instance()
    )
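
    # Route every LocalFileSystem.get_instance() call through the wrapper while the
    # pipeline runs, so that all paths are normalized for the test environment.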
    with patch.object(LocalFileSystem, "get_instance", return_value=local_fs_wrapper):
        pipeline.run()
        pipeline.raise_from_status()

    # Verify the output.
    mce_helpers.check_golden_file(
        pytestconfig,
        output_path=tmp_path / "iceberg_mces.json",
        golden_path=test_resources_dir / "iceberg_mces_golden.json",
    )