diff --git a/ingestion/src/metadata/ingestion/ometa/mixins/table_mixin.py b/ingestion/src/metadata/ingestion/ometa/mixins/table_mixin.py index 3886b4fcbb1..9f10a226d9d 100644 --- a/ingestion/src/metadata/ingestion/ometa/mixins/table_mixin.py +++ b/ingestion/src/metadata/ingestion/ometa/mixins/table_mixin.py @@ -13,6 +13,7 @@ Mixin class containing Table specific methods To be used by OpenMetadata class """ +import base64 import traceback from typing import List, Optional, Type, TypeVar @@ -56,6 +57,7 @@ class OMetaTableMixin: client: REST + # pylint: disable=too-many-nested-blocks def ingest_table_sample_data( self, table: Table, sample_data: TableData ) -> Optional[TableData]: @@ -67,9 +69,37 @@ class OMetaTableMixin: """ resp = None try: + # Pre-process sample data to handle binary/non-UTF-8 data before serialization + if sample_data and sample_data.rows: + + for row in sample_data.rows: + if not row: + continue + for col_idx, value in enumerate(row): + # Handle binary data explicitly + if isinstance(value, bytes): + # Convert binary data to Base64-encoded string + try: + row[ + col_idx + ] = f"[base64]{base64.b64encode(value).decode('ascii', errors='ignore')}" + except Exception as _: + row[col_idx] = f"[binary]{value}" + + try: + data = sample_data.model_dump_json() + except Exception as _: + logger.debug(traceback.format_exc()) + logger.warning( + f"Error serializing sample data for {table.fullyQualifiedName.root}" + " please check if the data is valid" + ) + return None + + # Now safely serialize to JSON resp = self.client.put( f"{self.get_suffix(Table)}/{table.id.root}/sampleData", - data=sample_data.model_dump_json(), + data=data, ) except Exception as exc: logger.debug(traceback.format_exc()) diff --git a/ingestion/tests/integration/ometa/test_ometa_table_api.py b/ingestion/tests/integration/ometa/test_ometa_table_api.py index f8e673b5dcc..81ce6d8346c 100644 --- a/ingestion/tests/integration/ometa/test_ometa_table_api.py +++ 
def test_ingest_sample_data_with_binary_data(self):
    """
    Sample data whose cells hold raw bytes can be ingested

    A table named "random" is created (or updated) with a reference to
    `self.create_schema_entity`. Two single-column, single-row payloads are
    then pushed through `ingest_table_sample_data`: one mixing ASCII text
    with NUL/control and high (>0x7f) bytes, one made only of NUL/control
    bytes. Each returned value must compare equal to the payload object
    that was sent.
    """
    create_request = get_create_entity(
        entity=Table,
        name="random",
        reference=self.create_schema_entity.fullyQualifiedName,
    )
    table: Table = self.metadata.create_or_update(data=create_request)

    binary_cells = (
        b"data\x00\x01\x02\x8e\xba\xab\xf0",
        b"\x00\x01\x02",
    )
    for cell in binary_cells:
        payload = TableData(columns=["id"], rows=[[cell]])
        ingested = self.metadata.ingest_table_sample_data(table, payload)
        # The comparison is made against the payload as it is *after* the
        # call, so it does not pin a specific encoding of the bytes cell.
        assert ingested == payload