Fix #20236: Handle Sample Data with non-utf8 characters (#20380)

(cherry picked from commit e6b7b89f8634b2429e6f26e8809fa51ab295df47)
This commit is contained in:
Mayur Singal 2025-03-27 14:20:26 +05:30 committed by OpenMetadata Release Bot
parent 44856348bf
commit d1f022b1d6
2 changed files with 52 additions and 1 deletion

View File

@ -13,6 +13,7 @@ Mixin class containing Table specific methods
To be used by OpenMetadata class
"""
import base64
import traceback
from typing import List, Optional, Type, TypeVar
@ -56,6 +57,7 @@ class OMetaTableMixin:
client: REST
# pylint: disable=too-many-nested-blocks
def ingest_table_sample_data(
self, table: Table, sample_data: TableData
) -> Optional[TableData]:
@ -67,9 +69,37 @@ class OMetaTableMixin:
"""
resp = None
try:
# Pre-process sample data to handle binary/non-UTF-8 data before serialization
if sample_data and sample_data.rows:
for row in sample_data.rows:
if not row:
continue
for col_idx, value in enumerate(row):
# Handle binary data explicitly
if isinstance(value, bytes):
# Convert binary data to Base64-encoded string
try:
row[
col_idx
] = f"[base64]{base64.b64encode(value).decode('ascii', errors='ignore')}"
except Exception as _:
row[col_idx] = f"[binary]{value}"
try:
data = sample_data.model_dump_json()
except Exception as _:
logger.debug(traceback.format_exc())
logger.warning(
f"Error serializing sample data for {table.fullyQualifiedName.root}"
" please check if the data is valid"
)
return None
# Send the already-serialized JSON payload to the sampleData endpoint
resp = self.client.put(
f"{self.get_suffix(Table)}/{table.id.root}/sampleData",
data=sample_data.model_dump_json(),
data=data,
)
except Exception as exc:
logger.debug(traceback.format_exc())

View File

@ -673,3 +673,24 @@ class OMetaTableTest(TestCase):
)
assert res.name == name
def test_ingest_sample_data_with_binary_data(self):
    """
    Test ingesting sample data with binary data

    Each payload is sent as the single cell of a one-column sample and the
    returned row is checked against the ``[base64]``-prefixed string that
    the ingestion step is expected to produce for ``bytes`` values. Two
    payloads are covered: one that is not valid UTF-8 and one made only of
    control bytes.
    """
    # Local import so the test does not depend on the module-level import block.
    import base64  # pylint: disable=import-outside-toplevel

    table: Table = self.metadata.create_or_update(
        data=get_create_entity(
            entity=Table,
            name="random",
            reference=self.create_schema_entity.fullyQualifiedName,
        )
    )

    binary_payloads = (
        b"data\x00\x01\x02\x8e\xba\xab\xf0",  # contains bytes that are not valid UTF-8
        b"\x00\x01\x02",  # control bytes only
    )
    for raw in binary_payloads:
        sample_data = TableData(columns=["id"], rows=[[raw]])
        res = self.metadata.ingest_table_sample_data(table, sample_data)

        # `sample_data` is rewritten in place by the ingestion call, so this
        # equality only proves the round trip, not the encoding itself.
        assert res == sample_data

        # Independent check of the encoded value that was actually stored.
        expected = f"[base64]{base64.b64encode(raw).decode('ascii')}"
        assert res.rows == [[expected]]