From 54dcdc7d82eb8c4ebae8d570c162f5cd8d1087ae Mon Sep 17 00:00:00 2001
From: Suman Maharana
Date: Mon, 28 Jul 2025 11:11:44 +0530
Subject: [PATCH] Fix #20689: Trino Column validation errors for highly
 complex fields (#22421)

* Fix: Trino Column validation errors for highly complex fields

* addressed copilot comments

* fixed tests

* fixed tests and addressed comments

* missed file
---
 .../source/database/column_type_parser.py  |  9 +-
 .../expected_output_column_parser.json     |  2 +-
 .../tests/unit/test_trino_complex_types.py | 85 ++++++++++++++++++-
 3 files changed, 92 insertions(+), 4 deletions(-)

diff --git a/ingestion/src/metadata/ingestion/source/database/column_type_parser.py b/ingestion/src/metadata/ingestion/source/database/column_type_parser.py
index 88d5c08ed6e..8a48c184b28 100644
--- a/ingestion/src/metadata/ingestion/source/database/column_type_parser.py
+++ b/ingestion/src/metadata/ingestion/source/database/column_type_parser.py
@@ -444,8 +444,13 @@ class ColumnTypeParser:
         if not data_type:
             return {"dataType": "NULL", "dataTypeDisplay": dtype}
         data_length = 1
-        if re.match(r".*(\([\w]*\))", dtype):
-            data_length = re.match(r".*\(([\w]*)\)", dtype).groups()[0]
+        match = re.match(r".*\(([\w]*)\)", dtype)
+        if match:
+            try:
+                data_length = int(match.groups()[0])
+            except ValueError:
+                # If conversion fails, keep it as 1 (default)
+                data_length = 1
         return {
             "dataType": data_type,
             "dataTypeDisplay": data_type,
diff --git a/ingestion/tests/unit/resources/expected_output_column_parser.json b/ingestion/tests/unit/resources/expected_output_column_parser.json
index 12ba3e93030..c536046ca4b 100644
--- a/ingestion/tests/unit/resources/expected_output_column_parser.json
+++ b/ingestion/tests/unit/resources/expected_output_column_parser.json
@@ -160,7 +160,7 @@
     {
         "dataType": "VARCHAR",
         "dataTypeDisplay": "VARCHAR",
-        "dataLength": "16777216"
+        "dataLength": 16777216
     },
     {
         "dataType": "DATE",
diff --git a/ingestion/tests/unit/test_trino_complex_types.py b/ingestion/tests/unit/test_trino_complex_types.py
index 11a5f299ea1..7dda3a11631 100644
--- a/ingestion/tests/unit/test_trino_complex_types.py
+++ b/ingestion/tests/unit/test_trino_complex_types.py
@@ -1,5 +1,6 @@
 from unittest import TestCase
 
+from metadata.ingestion.source.database.column_type_parser import ColumnTypeParser
 from metadata.ingestion.source.database.trino.metadata import (
     parse_array_data_type,
     parse_row_data_type,
@@ -21,6 +22,7 @@ RAW_ROW_DATA_TYPES = [
     "row(a array(string))",
     "row(bigquerytestdatatype51 array(row(bigquery_test_datatype_511 array(string))))",
     "row(record_1 row(record_2 row(record_3 row(record_4 string))))",
+    "row(splash row(color varchar, icon varchar, custom_story_board varchar, custom_android_splash varchar), android row(firebase_app_id varchar, google_services_json varchar, whitelisted_audience varchar, android_client_id varchar, build_preview row(download_url varchar, app_version varchar)), name varchar, app_icon varchar, url_scheme varchar, universal_link varchar, universal_links array(varchar), ios row(ios_client_id varchar))",
 ]
 
 EXPECTED_ROW_DATA_TYPES = [
@@ -29,10 +31,37 @@ EXPECTED_ROW_DATA_TYPES = [
     "struct<a:array<string>>",
     "struct<bigquerytestdatatype51:array<struct<bigquery_test_datatype_511:array<string>>>>",
     "struct<record_1:struct<record_2:struct<record_3:struct<record_4:string>>>>",
+    "struct<splash:struct<color:varchar,icon:varchar,custom_story_board:varchar,custom_android_splash:varchar>,android:struct<firebase_app_id:varchar,google_services_json:varchar,whitelisted_audience:varchar,android_client_id:varchar,build_preview:struct<download_url:varchar,app_version:varchar>>,name:varchar,app_icon:varchar,url_scheme:varchar,universal_link:varchar,universal_links:array<varchar>,ios:struct<ios_client_id:varchar>>",
+]
+
+# Test cases for dataLength conversion - only for types that should have dataLength
+DATA_LENGTH_TEST_CASES = [
+    ("varchar(20)", 20),
+    ("varchar(255)", 255),
+    ("char(10)", 10),
+    ("timestamp(3)", 3),
+    ("timestamp(6)", 6),
+    ("time(3)", 3),
+]
+
+# Test cases for non-numeric length values (should default to 1)
+NON_NUMERIC_LENGTH_TEST_CASES = [
+    ("varchar(abc)", 1),
+    ("char(xyz)", 1),
+    ("timestamp(precision)", 1),
+]
+
+# Test cases for types that should NOT have dataLength field
+NO_DATA_LENGTH_TEST_CASES = [
+    "varchar",  # No length specified
+    "int",  # No length specified
+    "bigint",  # No length specified
+    "date",  # No length specified
+    "timestamp",  # No length specified (without precision)
 ]
 
 
-class SouceConnectionTest(TestCase):
+class SourceConnectionTest(TestCase):
     def test_array_datatype(self):
         for i in range(len(RAW_ARRAY_DATA_TYPES)):
             parsed_type = parse_array_data_type(RAW_ARRAY_DATA_TYPES[i])
@@ -42,3 +71,57 @@ class SouceConnectionTest(TestCase):
         for i in range(len(RAW_ROW_DATA_TYPES)):
             parsed_type = parse_row_data_type(RAW_ROW_DATA_TYPES[i])
             assert parsed_type == EXPECTED_ROW_DATA_TYPES[i]
+
+    def test_data_length_conversion(self):
+        """Test that dataLength is properly converted to integer for various data types"""
+        parser = ColumnTypeParser()
+
+        for dtype, expected_length in DATA_LENGTH_TEST_CASES:
+            with self.subTest(dtype=dtype):
+                result = parser._parse_primitive_datatype_string(dtype)
+                self.assertIn("dataLength", result)
+                self.assertIsInstance(result["dataLength"], int)
+                self.assertEqual(result["dataLength"], expected_length)
+                self.assertEqual(result["dataType"], dtype.split("(")[0].upper())
+
+    def test_non_numeric_length_handling(self):
+        """Test that non-numeric length values default to 1"""
+        parser = ColumnTypeParser()
+
+        for dtype, expected_length in NON_NUMERIC_LENGTH_TEST_CASES:
+            with self.subTest(dtype=dtype):
+                result = parser._parse_primitive_datatype_string(dtype)
+                self.assertIn("dataLength", result)
+                self.assertIsInstance(result["dataLength"], int)
+                self.assertEqual(result["dataLength"], expected_length)
+                self.assertEqual(result["dataType"], dtype.split("(")[0].upper())
+
+    def test_no_data_length_for_simple_types(self):
+        """Test that simple types without length specification don't have dataLength field"""
+        parser = ColumnTypeParser()
+
+        for dtype in NO_DATA_LENGTH_TEST_CASES:
+            with self.subTest(dtype=dtype):
+                result = parser._parse_primitive_datatype_string(dtype)
+                # These types should not have dataLength field
+                self.assertNotIn("dataLength", result)
+
+    def test_data_length_type_safety(self):
+        """Test that dataLength is always an integer type when present"""
+        parser = ColumnTypeParser()
+
+        test_dtypes = [
+            "varchar(100)",
+            "char(50)",
+            "timestamp(3)",
+            "decimal(10,2)",
+        ]
+
+        for dtype in test_dtypes:
+            with self.subTest(dtype=dtype):
+                result = parser._parse_primitive_datatype_string(dtype)
+                # Verify dataLength is always an integer when present
+                if "dataLength" in result:
+                    self.assertIsInstance(result["dataLength"], int)
+                    # Verify dataLength is never a string
+                    self.assertNotIsInstance(result["dataLength"], str)
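
A quick, illustrative sketch of the behaviour the patch asserts (not part of the patch itself; it mirrors the new tests in test_trino_complex_types.py and the corrected expected_output_column_parser.json entry, and assumes the OpenMetadata ingestion package is installed):

from metadata.ingestion.source.database.column_type_parser import ColumnTypeParser

parser = ColumnTypeParser()

# Numeric length/precision now comes back as an int instead of a string,
# matching the corrected "dataLength": 16777216 fixture above.
result = parser._parse_primitive_datatype_string("varchar(16777216)")
assert result["dataType"] == "VARCHAR"
assert isinstance(result["dataLength"], int) and result["dataLength"] == 16777216

# A non-numeric token inside the parentheses falls back to the default length of 1
# rather than propagating a non-integer dataLength into column validation.
fallback = parser._parse_primitive_datatype_string("varchar(abc)")
assert fallback["dataLength"] == 1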