mirror of
https://github.com/datahub-project/datahub.git
synced 2025-12-11 18:16:58 +00:00
fix(ingestion/iceberg): Improve iceberg source resiliency to server errors (#14731)
This commit is contained in:
parent
3fbef4a632
commit
5f23652fd3
@ -12,7 +12,7 @@ from pyiceberg.exceptions import (
|
|||||||
NoSuchNamespaceError,
|
NoSuchNamespaceError,
|
||||||
NoSuchPropertyException,
|
NoSuchPropertyException,
|
||||||
NoSuchTableError,
|
NoSuchTableError,
|
||||||
ServerError,
|
RESTError,
|
||||||
)
|
)
|
||||||
from pyiceberg.schema import Schema, SchemaVisitorPerPrimitiveType, visit
|
from pyiceberg.schema import Schema, SchemaVisitorPerPrimitiveType, visit
|
||||||
from pyiceberg.table import Table
|
from pyiceberg.table import Table
|
||||||
@ -154,6 +154,10 @@ class IcebergSource(StatefulIngestionSourceBase):
|
|||||||
self.report: IcebergSourceReport = IcebergSourceReport()
|
self.report: IcebergSourceReport = IcebergSourceReport()
|
||||||
self.config: IcebergSourceConfig = config
|
self.config: IcebergSourceConfig = config
|
||||||
self.ctx: PipelineContext = ctx
|
self.ctx: PipelineContext = ctx
|
||||||
|
self.stamping_processor = AutoSystemMetadata(
|
||||||
|
self.ctx
|
||||||
|
) # single instance used only when processing namespaces
|
||||||
|
self.namespaces: List[Tuple[Identifier, str]] = []
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def create(cls, config_dict: Dict, ctx: PipelineContext) -> "IcebergSource":
|
def create(cls, config_dict: Dict, ctx: PipelineContext) -> "IcebergSource":
|
||||||
@ -246,6 +250,13 @@ class IcebergSource(StatefulIngestionSourceBase):
|
|||||||
context=str(namespace),
|
context=str(namespace),
|
||||||
exc=e,
|
exc=e,
|
||||||
)
|
)
|
||||||
|
except RESTError as e:
|
||||||
|
self.report.warning(
|
||||||
|
title="Iceberg REST Server Error",
|
||||||
|
message="Iceberg REST Server returned error status when trying to list tables for a namespace, skipping it.",
|
||||||
|
context=str(namespace),
|
||||||
|
exc=e,
|
||||||
|
)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
self.report.report_failure(
|
self.report.report_failure(
|
||||||
title="Error when processing a namespace",
|
title="Error when processing a namespace",
|
||||||
@ -322,10 +333,10 @@ class IcebergSource(StatefulIngestionSourceBase):
|
|||||||
context=dataset_name,
|
context=dataset_name,
|
||||||
exc=e,
|
exc=e,
|
||||||
)
|
)
|
||||||
except ServerError as e:
|
except RESTError as e:
|
||||||
self.report.warning(
|
self.report.warning(
|
||||||
title="Iceberg REST Server Error",
|
title="Iceberg REST Server Error",
|
||||||
message="Iceberg returned 500 HTTP status when trying to process a table, skipping it.",
|
message="Iceberg REST Server returned error status when trying to process a table, skipping it.",
|
||||||
context=dataset_name,
|
context=dataset_name,
|
||||||
exc=e,
|
exc=e,
|
||||||
)
|
)
|
||||||
@ -365,7 +376,7 @@ class IcebergSource(StatefulIngestionSourceBase):
|
|||||||
)
|
)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
catalog = self.config.get_catalog()
|
self.catalog = self.config.get_catalog()
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
self.report.report_failure(
|
self.report.report_failure(
|
||||||
title="Failed to initialize catalog object",
|
title="Failed to initialize catalog object",
|
||||||
@ -375,33 +386,7 @@ class IcebergSource(StatefulIngestionSourceBase):
|
|||||||
return
|
return
|
||||||
|
|
||||||
try:
|
try:
|
||||||
stamping_processor = AutoSystemMetadata(self.ctx)
|
yield from self._process_namespaces()
|
||||||
namespace_ids = self._get_namespaces(catalog)
|
|
||||||
namespaces: List[Tuple[Identifier, str]] = []
|
|
||||||
for namespace in namespace_ids:
|
|
||||||
namespace_repr = ".".join(namespace)
|
|
||||||
LOGGER.debug(f"Processing namespace {namespace_repr}")
|
|
||||||
namespace_urn = make_container_urn(
|
|
||||||
NamespaceKey(
|
|
||||||
namespace=namespace_repr,
|
|
||||||
platform=self.platform,
|
|
||||||
instance=self.config.platform_instance,
|
|
||||||
env=self.config.env,
|
|
||||||
)
|
|
||||||
)
|
|
||||||
namespace_properties: Properties = catalog.load_namespace_properties(
|
|
||||||
namespace
|
|
||||||
)
|
|
||||||
namespaces.append((namespace, namespace_urn))
|
|
||||||
for aspect in self._create_iceberg_namespace_aspects(
|
|
||||||
namespace, namespace_properties
|
|
||||||
):
|
|
||||||
yield stamping_processor.stamp_wu(
|
|
||||||
MetadataChangeProposalWrapper(
|
|
||||||
entityUrn=namespace_urn, aspect=aspect
|
|
||||||
).as_workunit()
|
|
||||||
)
|
|
||||||
LOGGER.debug("Namespaces ingestion completed")
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
self.report.report_failure(
|
self.report.report_failure(
|
||||||
title="Failed to list namespaces",
|
title="Failed to list namespaces",
|
||||||
@ -415,13 +400,70 @@ class IcebergSource(StatefulIngestionSourceBase):
|
|||||||
args_list=[
|
args_list=[
|
||||||
(dataset_path, namespace_urn)
|
(dataset_path, namespace_urn)
|
||||||
for dataset_path, namespace_urn in self._get_datasets(
|
for dataset_path, namespace_urn in self._get_datasets(
|
||||||
catalog, namespaces
|
self.catalog, self.namespaces
|
||||||
)
|
)
|
||||||
],
|
],
|
||||||
max_workers=self.config.processing_threads,
|
max_workers=self.config.processing_threads,
|
||||||
):
|
):
|
||||||
yield wu
|
yield wu
|
||||||
|
|
||||||
|
def _try_processing_namespace(
    self, namespace: Identifier
) -> Iterable[MetadataWorkUnit]:
    """Emit the work units for a single namespace, never letting an error escape.

    The namespace properties are loaded from ``self.catalog``, turned into
    aspects by ``_create_iceberg_namespace_aspects`` and each aspect is
    yielded as a work unit stamped by ``self.stamping_processor``.

    The ``(namespace, urn)`` pair is appended to ``self.namespaces`` only after
    every aspect has been yielded, so a namespace that failed here is not
    recorded there.

    Args:
        namespace: Identifier of the namespace; its parts are joined with "."
            to build the container key and the report context.

    Yields:
        One stamped work unit per namespace aspect.

    Error handling (nothing is re-raised):
        * ``NoSuchNamespaceError`` -> reported as a warning.
        * ``RESTError`` -> reported as a warning.
        * any other ``Exception`` -> reported as a failure.
    """
    namespace_repr = ".".join(namespace)
    try:
        LOGGER.debug(f"Processing namespace {namespace_repr}")
        namespace_urn = make_container_urn(
            NamespaceKey(
                namespace=namespace_repr,
                platform=self.platform,
                instance=self.config.platform_instance,
                env=self.config.env,
            )
        )

        # Load the properties before yielding anything: if this call fails,
        # no work unit has been emitted for the namespace yet.
        namespace_properties: Properties = self.catalog.load_namespace_properties(
            namespace
        )
        for aspect in self._create_iceberg_namespace_aspects(
            namespace, namespace_properties
        ):
            yield self.stamping_processor.stamp_wu(
                MetadataChangeProposalWrapper(
                    entityUrn=namespace_urn, aspect=aspect
                ).as_workunit()
            )
        # Register the namespace only once all of its aspects were produced.
        self.namespaces.append((namespace, namespace_urn))
    except NoSuchNamespaceError as e:
        self.report.report_warning(
            title="Failed to retrieve namespace properties",
            message="Couldn't find the namespace, was it deleted during the ingestion?",
            context=namespace_repr,
            exc=e,
        )
    except RESTError as e:
        # Use the dotted representation here as well, so every report entry
        # produced by this method identifies the namespace the same way.
        self.report.warning(
            title="Iceberg REST Server Error",
            message="Iceberg REST Server returned error status when trying to retrieve namespace properties, skipping it.",
            context=namespace_repr,
            exc=e,
        )
    except Exception as e:
        self.report.report_failure(
            title="Failed to process namespace",
            message="Unhandled exception happened during processing of the namespace",
            context=namespace_repr,
            exc=e,
        )
|
||||||
|
|
||||||
|
def _process_namespaces(self) -> Iterable[MetadataWorkUnit]:
    """Yield the work units of every namespace listed for ``self.catalog``.

    Each namespace returned by ``_get_namespaces`` is delegated to
    ``_try_processing_namespace``; a debug message is logged once all of them
    have been iterated.
    """
    for namespace_id in self._get_namespaces(self.catalog):
        yield from self._try_processing_namespace(namespace_id)

    LOGGER.debug("Namespaces ingestion completed")
|
||||||
|
|
||||||
def _create_iceberg_table_aspects(
|
def _create_iceberg_table_aspects(
|
||||||
self, dataset_name: str, table: Table, namespace_urn: str
|
self, dataset_name: str, table: Table, namespace_urn: str
|
||||||
) -> Iterable[_Aspect]:
|
) -> Iterable[_Aspect]:
|
||||||
|
|||||||
@ -12,6 +12,7 @@ from pyiceberg.exceptions import (
|
|||||||
NoSuchNamespaceError,
|
NoSuchNamespaceError,
|
||||||
NoSuchPropertyException,
|
NoSuchPropertyException,
|
||||||
NoSuchTableError,
|
NoSuchTableError,
|
||||||
|
RESTError,
|
||||||
ServerError,
|
ServerError,
|
||||||
)
|
)
|
||||||
from pyiceberg.io.pyarrow import PyArrowFileIO
|
from pyiceberg.io.pyarrow import PyArrowFileIO
|
||||||
@ -573,6 +574,8 @@ class MockCatalogExceptionListingTables(MockCatalog):
|
|||||||
def list_tables(self, namespace: str) -> Iterable[Tuple[str, str]]:
|
def list_tables(self, namespace: str) -> Iterable[Tuple[str, str]]:
|
||||||
if namespace == ("no_such_namespace",):
|
if namespace == ("no_such_namespace",):
|
||||||
raise NoSuchNamespaceError()
|
raise NoSuchNamespaceError()
|
||||||
|
if namespace == ("rest_error",):
|
||||||
|
raise RESTError()
|
||||||
if namespace == ("generic_exception",):
|
if namespace == ("generic_exception",):
|
||||||
raise Exception()
|
raise Exception()
|
||||||
return super().list_tables(namespace)
|
return super().list_tables(namespace)
|
||||||
@ -583,6 +586,17 @@ class MockCatalogExceptionListingNamespaces(MockCatalog):
|
|||||||
raise Exception("Test exception")
|
raise Exception("Test exception")
|
||||||
|
|
||||||
|
|
||||||
|
class MockCatalogExceptionRetrievingNamespaceProperties(MockCatalog):
    """Mock catalog whose namespace-properties lookup fails for marker namespaces."""

    def load_namespace_properties(self, namespace: Tuple[str, ...]) -> Dict[str, str]:
        """Raise for the marker namespaces, otherwise defer to the parent mock.

        ``("no_such_namespace",)`` raises ``NoSuchNamespaceError``,
        ``("rest_error",)`` raises ``RESTError`` and ``("generic_exception",)``
        raises a plain ``Exception``.
        """
        failing_namespaces = (
            (("no_such_namespace",), NoSuchNamespaceError),
            (("rest_error",), RESTError),
            (("generic_exception",), Exception),
        )
        for marker, error_type in failing_namespaces:
            if namespace == marker:
                raise error_type()
        return super().load_namespace_properties(namespace)
|
||||||
|
|
||||||
|
|
||||||
def test_exception_while_listing_namespaces() -> None:
|
def test_exception_while_listing_namespaces() -> None:
|
||||||
source = with_iceberg_source(processing_threads=2)
|
source = with_iceberg_source(processing_threads=2)
|
||||||
mock_catalog = MockCatalogExceptionListingNamespaces({})
|
mock_catalog = MockCatalogExceptionListingNamespaces({})
|
||||||
@ -595,9 +609,9 @@ def test_exception_while_listing_namespaces() -> None:
|
|||||||
assert source.report.failures.total_elements == 1
|
assert source.report.failures.total_elements == 1
|
||||||
|
|
||||||
|
|
||||||
def test_known_exception_while_listing_tables() -> None:
|
def test_known_exception_while_retrieving_namespace_properties() -> None:
|
||||||
source = with_iceberg_source(processing_threads=2)
|
source = with_iceberg_source(processing_threads=2)
|
||||||
mock_catalog = MockCatalogExceptionListingTables(
|
mock_catalog = MockCatalogExceptionRetrievingNamespaceProperties(
|
||||||
{
|
{
|
||||||
"namespaceA": {
|
"namespaceA": {
|
||||||
"table1": lambda: Table(
|
"table1": lambda: Table(
|
||||||
@ -614,6 +628,7 @@ def test_known_exception_while_listing_tables() -> None:
|
|||||||
)
|
)
|
||||||
},
|
},
|
||||||
"no_such_namespace": {},
|
"no_such_namespace": {},
|
||||||
|
"rest_error": {},
|
||||||
"namespaceB": {
|
"namespaceB": {
|
||||||
"table2": lambda: Table(
|
"table2": lambda: Table(
|
||||||
identifier=("namespaceB", "table2"),
|
identifier=("namespaceB", "table2"),
|
||||||
@ -675,7 +690,7 @@ def test_known_exception_while_listing_tables() -> None:
|
|||||||
) as get_catalog:
|
) as get_catalog:
|
||||||
get_catalog.return_value = mock_catalog
|
get_catalog.return_value = mock_catalog
|
||||||
wu: List[MetadataWorkUnit] = [*source.get_workunits_internal()]
|
wu: List[MetadataWorkUnit] = [*source.get_workunits_internal()]
|
||||||
# ingested 5 tables (6 MCPs each) and 5 namespaces (4 MCPs each), despite exception
|
# ingested 5 tables (6 MCPs each) and 4 namespaces (4 MCPs each), we will not ingest namespaces at all if we fail to get their properties
|
||||||
expected_wu_urns = [
|
expected_wu_urns = [
|
||||||
"urn:li:dataset:(urn:li:dataPlatform:iceberg,namespaceA.table1,PROD)",
|
"urn:li:dataset:(urn:li:dataPlatform:iceberg,namespaceA.table1,PROD)",
|
||||||
"urn:li:dataset:(urn:li:dataPlatform:iceberg,namespaceB.table2,PROD)",
|
"urn:li:dataset:(urn:li:dataPlatform:iceberg,namespaceB.table2,PROD)",
|
||||||
@ -684,7 +699,6 @@ def test_known_exception_while_listing_tables() -> None:
|
|||||||
"urn:li:dataset:(urn:li:dataPlatform:iceberg,namespaceD.table5,PROD)",
|
"urn:li:dataset:(urn:li:dataPlatform:iceberg,namespaceD.table5,PROD)",
|
||||||
] * MCPS_PER_TABLE + [
|
] * MCPS_PER_TABLE + [
|
||||||
"urn:li:container:390e031441265aae5b7b7ae8d51b0c1f",
|
"urn:li:container:390e031441265aae5b7b7ae8d51b0c1f",
|
||||||
"urn:li:container:9cb5e87ec392b231720f23bf00d6f6aa",
|
|
||||||
"urn:li:container:74727446a56420d80ff3b1abf2a18087",
|
"urn:li:container:74727446a56420d80ff3b1abf2a18087",
|
||||||
"urn:li:container:3f9a24213cca64ab22e409d1b9a94789",
|
"urn:li:container:3f9a24213cca64ab22e409d1b9a94789",
|
||||||
"urn:li:container:38a0583b0305ec5066cb708199f6848c",
|
"urn:li:container:38a0583b0305ec5066cb708199f6848c",
|
||||||
@ -698,7 +712,224 @@ def test_known_exception_while_listing_tables() -> None:
|
|||||||
urns,
|
urns,
|
||||||
expected_wu_urns,
|
expected_wu_urns,
|
||||||
)
|
)
|
||||||
assert source.report.warnings.total_elements == 1
|
assert source.report.warnings.total_elements == 2
|
||||||
|
assert source.report.failures.total_elements == 0
|
||||||
|
assert source.report.tables_scanned == 5
|
||||||
|
|
||||||
|
|
||||||
|
def test_unknown_exception_while_retrieving_namespace_properties() -> None:
    """An unexpected error while loading namespace properties is a failure.

    The ``generic_exception`` namespace makes the mock catalog raise a plain
    ``Exception``; the remaining four namespaces and their five tables must
    still be ingested, with exactly one failure and no warnings reported.
    """

    def _lazy_table(namespace: str, name: str, location: str):
        # Returns a zero-argument factory so the table (and its FileIO) is
        # only constructed when the factory is called.
        def _build() -> Table:
            return Table(
                identifier=(namespace, name),
                metadata=TableMetadataV2(
                    partition_specs=[PartitionSpec(spec_id=0)],
                    location=location,
                    last_column_id=0,
                    schemas=[Schema(schema_id=0)],
                ),
                metadata_location=location,
                io=PyArrowFileIO(),
                catalog=None,
            )

        return _build

    source = with_iceberg_source(processing_threads=2)
    mock_catalog = MockCatalogExceptionRetrievingNamespaceProperties(
        {
            "namespaceA": {
                "table1": _lazy_table(
                    "namespaceA", "table1", "s3://abcdefg/namespaceA/table1"
                ),
            },
            "generic_exception": {},
            "namespaceB": {
                "table2": _lazy_table(
                    "namespaceB", "table2", "s3://abcdefg/namespaceB/table2"
                ),
                "table3": _lazy_table(
                    "namespaceB", "table3", "s3://abcdefg/namespaceB/table3"
                ),
            },
            "namespaceC": {
                "table4": _lazy_table(
                    "namespaceC", "table4", "s3://abcdefg/namespaceC/table4"
                ),
            },
            "namespaceD": {
                # The location intentionally keeps the original fixture value.
                "table5": _lazy_table(
                    "namespaceD", "table5", "s3://abcdefg/namespaceA/table5"
                ),
            },
        }
    )
    with patch(
        "datahub.ingestion.source.iceberg.iceberg.IcebergSourceConfig.get_catalog"
    ) as get_catalog:
        get_catalog.return_value = mock_catalog
        wu: List[MetadataWorkUnit] = [*source.get_workunits_internal()]

        # 5 tables (6 MCPs each) and 4 namespaces (4 MCPs each) are ingested
        # despite the exception raised for the fifth namespace.
        table_urns = [
            "urn:li:dataset:(urn:li:dataPlatform:iceberg,namespaceA.table1,PROD)",
            "urn:li:dataset:(urn:li:dataPlatform:iceberg,namespaceB.table2,PROD)",
            "urn:li:dataset:(urn:li:dataPlatform:iceberg,namespaceB.table3,PROD)",
            "urn:li:dataset:(urn:li:dataPlatform:iceberg,namespaceC.table4,PROD)",
            "urn:li:dataset:(urn:li:dataPlatform:iceberg,namespaceD.table5,PROD)",
        ]
        namespace_urns = [
            "urn:li:container:390e031441265aae5b7b7ae8d51b0c1f",
            "urn:li:container:74727446a56420d80ff3b1abf2a18087",
            "urn:li:container:3f9a24213cca64ab22e409d1b9a94789",
            "urn:li:container:38a0583b0305ec5066cb708199f6848c",
        ]
        expected_wu_urns = (
            table_urns * MCPS_PER_TABLE + namespace_urns * MCPS_PER_NAMESPACE
        )
        assert len(wu) == len(expected_wu_urns)

        actual_urns = []
        for work_unit in wu:
            assert isinstance(work_unit.metadata, MetadataChangeProposalWrapper)
            actual_urns.append(work_unit.metadata.entityUrn)
        TestCase().assertCountEqual(actual_urns, expected_wu_urns)

        assert source.report.warnings.total_elements == 0
        assert source.report.failures.total_elements == 1
        assert source.report.tables_scanned == 5
|
||||||
|
|
||||||
|
|
||||||
|
def test_known_exception_while_listing_tables() -> None:
|
||||||
|
source = with_iceberg_source(processing_threads=2)
|
||||||
|
mock_catalog = MockCatalogExceptionListingTables(
|
||||||
|
{
|
||||||
|
"namespaceA": {
|
||||||
|
"table1": lambda: Table(
|
||||||
|
identifier=("namespaceA", "table1"),
|
||||||
|
metadata=TableMetadataV2(
|
||||||
|
partition_specs=[PartitionSpec(spec_id=0)],
|
||||||
|
location="s3://abcdefg/namespaceA/table1",
|
||||||
|
last_column_id=0,
|
||||||
|
schemas=[Schema(schema_id=0)],
|
||||||
|
),
|
||||||
|
metadata_location="s3://abcdefg/namespaceA/table1",
|
||||||
|
io=PyArrowFileIO(),
|
||||||
|
catalog=None,
|
||||||
|
)
|
||||||
|
},
|
||||||
|
"no_such_namespace": {},
|
||||||
|
"rest_error": {},
|
||||||
|
"namespaceB": {
|
||||||
|
"table2": lambda: Table(
|
||||||
|
identifier=("namespaceB", "table2"),
|
||||||
|
metadata=TableMetadataV2(
|
||||||
|
partition_specs=[PartitionSpec(spec_id=0)],
|
||||||
|
location="s3://abcdefg/namespaceB/table2",
|
||||||
|
last_column_id=0,
|
||||||
|
schemas=[Schema(schema_id=0)],
|
||||||
|
),
|
||||||
|
metadata_location="s3://abcdefg/namespaceB/table2",
|
||||||
|
io=PyArrowFileIO(),
|
||||||
|
catalog=None,
|
||||||
|
),
|
||||||
|
"table3": lambda: Table(
|
||||||
|
identifier=("namespaceB", "table3"),
|
||||||
|
metadata=TableMetadataV2(
|
||||||
|
partition_specs=[PartitionSpec(spec_id=0)],
|
||||||
|
location="s3://abcdefg/namespaceB/table3",
|
||||||
|
last_column_id=0,
|
||||||
|
schemas=[Schema(schema_id=0)],
|
||||||
|
),
|
||||||
|
metadata_location="s3://abcdefg/namespaceB/table3",
|
||||||
|
io=PyArrowFileIO(),
|
||||||
|
catalog=None,
|
||||||
|
),
|
||||||
|
},
|
||||||
|
"namespaceC": {
|
||||||
|
"table4": lambda: Table(
|
||||||
|
identifier=("namespaceC", "table4"),
|
||||||
|
metadata=TableMetadataV2(
|
||||||
|
partition_specs=[PartitionSpec(spec_id=0)],
|
||||||
|
location="s3://abcdefg/namespaceC/table4",
|
||||||
|
last_column_id=0,
|
||||||
|
schemas=[Schema(schema_id=0)],
|
||||||
|
),
|
||||||
|
metadata_location="s3://abcdefg/namespaceC/table4",
|
||||||
|
io=PyArrowFileIO(),
|
||||||
|
catalog=None,
|
||||||
|
)
|
||||||
|
},
|
||||||
|
"namespaceD": {
|
||||||
|
"table5": lambda: Table(
|
||||||
|
identifier=("namespaceD", "table5"),
|
||||||
|
metadata=TableMetadataV2(
|
||||||
|
partition_specs=[PartitionSpec(spec_id=0)],
|
||||||
|
location="s3://abcdefg/namespaceA/table5",
|
||||||
|
last_column_id=0,
|
||||||
|
schemas=[Schema(schema_id=0)],
|
||||||
|
),
|
||||||
|
metadata_location="s3://abcdefg/namespaceA/table5",
|
||||||
|
io=PyArrowFileIO(),
|
||||||
|
catalog=None,
|
||||||
|
)
|
||||||
|
},
|
||||||
|
}
|
||||||
|
)
|
||||||
|
with patch(
|
||||||
|
"datahub.ingestion.source.iceberg.iceberg.IcebergSourceConfig.get_catalog"
|
||||||
|
) as get_catalog:
|
||||||
|
get_catalog.return_value = mock_catalog
|
||||||
|
wu: List[MetadataWorkUnit] = [*source.get_workunits_internal()]
|
||||||
|
# ingested 5 tables (6 MCPs each) and 6 namespaces (4 MCPs each), despite exception
|
||||||
|
expected_wu_urns = [
|
||||||
|
"urn:li:dataset:(urn:li:dataPlatform:iceberg,namespaceA.table1,PROD)",
|
||||||
|
"urn:li:dataset:(urn:li:dataPlatform:iceberg,namespaceB.table2,PROD)",
|
||||||
|
"urn:li:dataset:(urn:li:dataPlatform:iceberg,namespaceB.table3,PROD)",
|
||||||
|
"urn:li:dataset:(urn:li:dataPlatform:iceberg,namespaceC.table4,PROD)",
|
||||||
|
"urn:li:dataset:(urn:li:dataPlatform:iceberg,namespaceD.table5,PROD)",
|
||||||
|
] * MCPS_PER_TABLE + [
|
||||||
|
"urn:li:container:390e031441265aae5b7b7ae8d51b0c1f",
|
||||||
|
"urn:li:container:9cb5e87ec392b231720f23bf00d6f6aa",
|
||||||
|
"urn:li:container:74727446a56420d80ff3b1abf2a18087",
|
||||||
|
"urn:li:container:3f9a24213cca64ab22e409d1b9a94789",
|
||||||
|
"urn:li:container:38a0583b0305ec5066cb708199f6848c",
|
||||||
|
"urn:li:container:7b510fcb61d4977da0b1707e533999d8",
|
||||||
|
] * MCPS_PER_NAMESPACE
|
||||||
|
assert len(wu) == len(expected_wu_urns)
|
||||||
|
urns = []
|
||||||
|
for unit in wu:
|
||||||
|
assert isinstance(unit.metadata, MetadataChangeProposalWrapper)
|
||||||
|
urns.append(unit.metadata.entityUrn)
|
||||||
|
TestCase().assertCountEqual(
|
||||||
|
urns,
|
||||||
|
expected_wu_urns,
|
||||||
|
)
|
||||||
|
assert source.report.warnings.total_elements == 2
|
||||||
assert source.report.failures.total_elements == 0
|
assert source.report.failures.total_elements == 0
|
||||||
assert source.report.tables_scanned == 5
|
assert source.report.tables_scanned == 5
|
||||||
|
|
||||||
@ -1009,6 +1240,9 @@ def test_handle_expected_exceptions() -> None:
|
|||||||
def _raise_server_error():
|
def _raise_server_error():
|
||||||
raise ServerError()
|
raise ServerError()
|
||||||
|
|
||||||
|
def _raise_rest_error():
|
||||||
|
raise RESTError()
|
||||||
|
|
||||||
def _raise_fileio_error():
|
def _raise_fileio_error():
|
||||||
raise ValueError("Could not initialize FileIO: abc.dummy.fileio")
|
raise ValueError("Could not initialize FileIO: abc.dummy.fileio")
|
||||||
|
|
||||||
@ -1069,6 +1303,7 @@ def test_handle_expected_exceptions() -> None:
|
|||||||
"table8": _raise_no_such_iceberg_table_exception,
|
"table8": _raise_no_such_iceberg_table_exception,
|
||||||
"table9": _raise_server_error,
|
"table9": _raise_server_error,
|
||||||
"table10": _raise_fileio_error,
|
"table10": _raise_fileio_error,
|
||||||
|
"table11": _raise_rest_error,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
@ -1095,7 +1330,9 @@ def test_handle_expected_exceptions() -> None:
|
|||||||
urns,
|
urns,
|
||||||
expected_wu_urns,
|
expected_wu_urns,
|
||||||
)
|
)
|
||||||
assert source.report.warnings.total_elements == 6
|
assert (
|
||||||
|
source.report.warnings.total_elements == 6
|
||||||
|
) # ServerError and RESTError exceptions are caught together
|
||||||
assert source.report.failures.total_elements == 0
|
assert source.report.failures.total_elements == 0
|
||||||
assert source.report.tables_scanned == 4
|
assert source.report.tables_scanned == 4
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user