Enhance SQL column processing for BigQuery ingestion (#20408)

- Refactored the handling of nested columns in `sql_column_handler.py` to prioritize source-provided children, ensuring they override any derived children.
- Removed the `_process_col_type` override in `bigquery/metadata.py` to streamline column type handling, so that BigQuery uses the standard (inherited) path.

This update improves the accuracy of column metadata processing and simplifies the codebase.
This commit is contained in:
Ayush Shah 2025-03-25 14:40:46 +05:30 committed by GitHub
parent d344caa8c7
commit 1434b5dba2
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 9 additions and 16 deletions

View File

@@ -1020,13 +1020,3 @@ class BigquerySource(LifeCycleQueryMixin, CommonDbSourceService, MultiDBSource):
)
else:
yield from super().mark_tables_as_deleted()
def _process_col_type(self, column: dict, schema: str) -> Tuple:
    """
    Resolve a column's type via the parent implementation, dropping the
    parsed string.

    The first two values produced by the parent's ``_process_col_type`` are
    passed through untouched. The third value is always replaced with
    ``None`` so that BigQuery columns go through the standard column type
    handling path.
    """
    # The parent's third value is intentionally discarded for BigQuery.
    display_type, array_type, _unused = super()._process_col_type(column, schema)
    return display_type, array_type, None

View File

@@ -299,12 +299,6 @@ class SqlColumnHandlerMixin:
arrayDataType=arr_data_type,
ordinalPosition=column.get("ordinalPosition"),
)
if column.get("children"):
om_column.children = [
process_column(children) for children in column.get("children")
]
if not arr_data_type:
om_column.arrayDataType = DataType.UNKNOWN.value
if precision:
# Precision and scale must be integer values
om_column.precision = int(precision[0])
@@ -314,6 +308,15 @@ class SqlColumnHandlerMixin:
column=column, parsed_string=parsed_string
)
om_column = col_obj
if column.get("children"):
# Prioritize source-provided children for column processing.
# If 'children' are directly provided in the source metadata,
# process and assign them to the output column, overriding any derived children.
# Currently, this is only used for BigQuery.
om_column.children = [
process_column(children) for children in column.get("children")
]
om_column.tags = self.get_column_tag_labels(
table_name=table_name, column=column
)