Enhance SQL column processing for BigQuery ingestion (#20408)

- Refactored the handling of nested columns in `sql_column_handler.py` to prioritize source-provided children, ensuring they override any derived children.
- Removed the overridden `_process_col_type` method in `bigquery/metadata.py` to streamline column type handling, so that BigQuery uses the standard, inherited implementation.

This update improves the accuracy of column metadata processing and simplifies the codebase.
2026-01-08 13:36:32 +00:00 · 2025-03-25 14:40:46 +05:30 · 2025-03-25 14:40:46 +05:30 · 1434b5dba2
commit 1434b5dba2
parent d344caa8c7
2 changed files with 9 additions and 16 deletions
--- a/ingestion/src/metadata/ingestion/source/database/bigquery/metadata.py
+++ b/ingestion/src/metadata/ingestion/source/database/bigquery/metadata.py
@@ -1020,13 +1020,3 @@ class BigquerySource(LifeCycleQueryMixin, CommonDbSourceService, MultiDBSource):
                )
        else:
            yield from super().mark_tables_as_deleted()
-
-    def _process_col_type(self, column: dict, schema: str) -> Tuple:
-        """
-        Override the parent method to always return parsed_string as None for BigQuery.
-        This ensures we always use the standard column type handling path.
-        """
-        data_type_display, arr_data_type, _ = super()._process_col_type(column, schema)
-        # For BigQuery, we always want to force parsed_string to None
-        # This ensures we use the standard column type handling path
-        return data_type_display, arr_data_type, None
--- a/ingestion/src/metadata/ingestion/source/database/sql_column_handler.py
+++ b/ingestion/src/metadata/ingestion/source/database/sql_column_handler.py
@@ -299,12 +299,6 @@ class SqlColumnHandlerMixin:
                    arrayDataType=arr_data_type,
                    ordinalPosition=column.get("ordinalPosition"),
                )
-                if column.get("children"):
-                    om_column.children = [
-                        process_column(children) for children in column.get("children")
-                    ]
-                    if not arr_data_type:
-                        om_column.arrayDataType = DataType.UNKNOWN.value
                if precision:
                    # Precision and scale must be integer values
                    om_column.precision = int(precision[0])
@@ -314,6 +308,15 @@ class SqlColumnHandlerMixin:
                    column=column, parsed_string=parsed_string
                )
                om_column = col_obj
+
+                if column.get("children"):
+                    # Prioritize source-provided children for column processing.
+                    # If 'children' are directly provided in the source metadata,
+                    # process and assign them to the output column, overriding any derived children.
+                    # Currently, this is only used for BigQuery.
+                    om_column.children = [
+                        process_column(children) for children in column.get("children")
+                    ]
            om_column.tags = self.get_column_tag_labels(
                table_name=table_name, column=column
            )