Fix #20689: Trino Column validation errors for highly complex fields (#22421)

* Fix: Trino Column validation errors for highly complex fields

* addressed copilot comms

* fixed tests

* fixed tests and addressed comms

* missed file
This commit is contained in:
Suman Maharana 2025-07-28 11:11:44 +05:30 committed by GitHub
parent d0739f0a23
commit 54dcdc7d82
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 92 additions and 4 deletions

View File

@ -444,8 +444,13 @@ class ColumnTypeParser:
if not data_type:
return {"dataType": "NULL", "dataTypeDisplay": dtype}
data_length = 1
if re.match(r".*(\([\w]*\))", dtype):
data_length = re.match(r".*\(([\w]*)\)", dtype).groups()[0]
match = re.match(r".*\(([\w]*)\)", dtype)
if match:
try:
data_length = int(match.groups()[0])
except ValueError:
# If conversion fails, keep it as 1 (default)
data_length = 1
return {
"dataType": data_type,
"dataTypeDisplay": data_type,

View File

@ -160,7 +160,7 @@
{
"dataType": "VARCHAR",
"dataTypeDisplay": "VARCHAR",
"dataLength": "16777216"
"dataLength": 16777216
},
{
"dataType": "DATE",

View File

@ -1,5 +1,6 @@
from unittest import TestCase
from metadata.ingestion.source.database.column_type_parser import ColumnTypeParser
from metadata.ingestion.source.database.trino.metadata import (
parse_array_data_type,
parse_row_data_type,
@ -21,6 +22,7 @@ RAW_ROW_DATA_TYPES = [
"row(a array(string))",
"row(bigquerytestdatatype51 array(row(bigquery_test_datatype_511 array(string))))",
"row(record_1 row(record_2 row(record_3 row(record_4 string))))",
"row(splash row(color varchar, icon varchar, custom_story_board varchar, custom_android_splash varchar), android row(firebase_app_id varchar, google_services_json varchar, whitelisted_audience varchar, android_client_id varchar, build_preview row(download_url varchar, app_version varchar)), name varchar, app_icon varchar, url_scheme varchar, universal_link varchar, universal_links array(varchar), ios row(ios_client_id varchar))",
]
EXPECTED_ROW_DATA_TYPES = [
@ -29,10 +31,37 @@ EXPECTED_ROW_DATA_TYPES = [
"struct<a:array<string>>",
"struct<bigquerytestdatatype51:array<struct<bigquery_test_datatype_511:array<string>>>>",
"struct<record_1:struct<record_2:struct<record_3:struct<record_4:string>>>>",
"struct<splash:struct<color:varchar,icon:varchar,custom_story_board:varchar,custom_android_splash:varchar>,android:struct<firebase_app_id:varchar,google_services_json:varchar,whitelisted_audience:varchar,android_client_id:varchar,build_preview:struct<download_url:varchar,app_version:varchar>>,name:varchar,app_icon:varchar,url_scheme:varchar,universal_link:varchar,universal_links:array<varchar>,ios:struct<ios_client_id:varchar>>",
]
# Test cases for dataLength conversion - only for types that should have dataLength
DATA_LENGTH_TEST_CASES = [
("varchar(20)", 20),
("varchar(255)", 255),
("char(10)", 10),
("timestamp(3)", 3),
("timestamp(6)", 6),
("time(3)", 3),
]
# Test cases for non-numeric length values (should default to 1)
NON_NUMERIC_LENGTH_TEST_CASES = [
("varchar(abc)", 1),
("char(xyz)", 1),
("timestamp(precision)", 1),
]
# Test cases for types that should NOT have dataLength field
NO_DATA_LENGTH_TEST_CASES = [
"varchar", # No length specified
"int", # No length specified
"bigint", # No length specified
"date", # No length specified
"timestamp", # No length specified (without precision)
]
class SouceConnectionTest(TestCase):
class SourceConnectionTest(TestCase):
def test_array_datatype(self):
for i in range(len(RAW_ARRAY_DATA_TYPES)):
parsed_type = parse_array_data_type(RAW_ARRAY_DATA_TYPES[i])
@ -42,3 +71,57 @@ class SouceConnectionTest(TestCase):
for i in range(len(RAW_ROW_DATA_TYPES)):
parsed_type = parse_row_data_type(RAW_ROW_DATA_TYPES[i])
assert parsed_type == EXPECTED_ROW_DATA_TYPES[i]
def test_data_length_conversion(self):
"""Test that dataLength is properly converted to integer for various data types"""
parser = ColumnTypeParser()
for dtype, expected_length in DATA_LENGTH_TEST_CASES:
with self.subTest(dtype=dtype):
result = parser._parse_primitive_datatype_string(dtype)
self.assertIn("dataLength", result)
self.assertIsInstance(result["dataLength"], int)
self.assertEqual(result["dataLength"], expected_length)
self.assertEqual(result["dataType"], dtype.split("(")[0].upper())
def test_non_numeric_length_handling(self):
"""Test that non-numeric length values default to 1"""
parser = ColumnTypeParser()
for dtype, expected_length in NON_NUMERIC_LENGTH_TEST_CASES:
with self.subTest(dtype=dtype):
result = parser._parse_primitive_datatype_string(dtype)
self.assertIn("dataLength", result)
self.assertIsInstance(result["dataLength"], int)
self.assertEqual(result["dataLength"], expected_length)
self.assertEqual(result["dataType"], dtype.split("(")[0].upper())
def test_no_data_length_for_simple_types(self):
"""Test that simple types without length specification don't have dataLength field"""
parser = ColumnTypeParser()
for dtype in NO_DATA_LENGTH_TEST_CASES:
with self.subTest(dtype=dtype):
result = parser._parse_primitive_datatype_string(dtype)
# These types should not have dataLength field
self.assertNotIn("dataLength", result)
def test_data_length_type_safety(self):
"""Test that dataLength is always an integer type when present"""
parser = ColumnTypeParser()
test_dtypes = [
"varchar(100)",
"char(50)",
"timestamp(3)",
"decimal(10,2)",
]
for dtype in test_dtypes:
with self.subTest(dtype=dtype):
result = parser._parse_primitive_datatype_string(dtype)
# Verify dataLength is always an integer when present
if "dataLength" in result:
self.assertIsInstance(result["dataLength"], int)
# Verify dataLength is never a string
self.assertNotIsInstance(result["dataLength"], str)