Fix #14830: Ignore non current columns for iceberg tables for glue & athena (#22564)

2025-11-03 03:59:12 +00:00 · 2025-07-29 16:19:09 +05:30 · 2025-07-29 16:19:09 +05:30 · 199e3b981c
commit 199e3b981c
parent 26f99a3ac2
5 changed files with 418 additions and 10 deletions
--- a/ingestion/src/metadata/ingestion/source/database/athena/metadata.py
+++ b/ingestion/src/metadata/ingestion/source/database/athena/metadata.py
@ -115,6 +115,7 @@ class AthenaSource(ExternalTableLineageMixin, CommonDbSourceService):
        )
        self.external_location_map = {}
        self.schema_description_map = {}
+        self.glue_client = None

    def prepare(self):
        """
@ -122,8 +123,10 @@ class AthenaSource(ExternalTableLineageMixin, CommonDbSourceService):
        """
        try:
            super().prepare()
-            glue_client = AWSClient(self.service_connection.awsConfig).get_glue_client()
-            paginator = glue_client.get_paginator("get_databases")
+            self.glue_client = AWSClient(
+                self.service_connection.awsConfig
+            ).get_glue_client()
+            paginator = self.glue_client.get_paginator("get_databases")
            for page in paginator.paginate():
                database_page = DatabasePage(**page)
                for database in database_page.DatabaseList or []:
@ -304,3 +307,23 @@ class AthenaSource(ExternalTableLineageMixin, CommonDbSourceService):
        else:
            description = table_info.get("text")
        return description
+
+    def _get_columns_internal(
+        self,
+        schema_name: str,
+        table_name: str,
+        db_name: str,
+        inspector: Inspector,
+        table_type: TableType = None,
+    ):
+        """
+        Override to pass Glue client to get_columns for Iceberg table filtering
+        """
+        # Pass the Glue client as a keyword argument to get_columns
+        return inspector.get_columns(
+            table_name,
+            schema_name,
+            table_type=table_type,
+            db_name=db_name,
+            glue_client=self.glue_client,
+        )
--- a/ingestion/src/metadata/ingestion/source/database/athena/utils.py
+++ b/ingestion/src/metadata/ingestion/source/database/athena/utils.py
@ -20,9 +20,12 @@ from sqlalchemy.engine import reflection

 from metadata.ingestion.source import sqa_types
 from metadata.ingestion.source.database.column_type_parser import ColumnTypeParser
+from metadata.utils.logger import utils_logger
 from metadata.utils.sqlalchemy_utils import is_complex_type

+logger = utils_logger()

+# pylint: disable=protected-access
@reflection.cache
 def _get_column_type(self, type_):
    """
@ -30,7 +33,7 @@ def _get_column_type(self, type_):
    to add custom SQA typing.
    """
    type_ = type_.replace(" ", "").lower()
-    match = self._pattern_column_type.match(type_)  # pylint: disable=protected-access
+    match = self._pattern_column_type.match(type_)
    if match:
        name = match.group(1).lower()
        length = match.group(2)
@ -113,18 +116,17 @@ def _get_projection_details(
    return columns


+# pylint: disable=too-many-locals
@reflection.cache
 def get_columns(self, connection, table_name, schema=None, **kw):
    """
    Method to handle table columns
    """
-    metadata = self._get_table(  # pylint: disable=protected-access
-        connection, table_name, schema=schema, **kw
-    )
+    metadata = self._get_table(connection, table_name, schema=schema, **kw)
    columns = [
        {
            "name": c.name,
-            "type": self._get_column_type(c.type),  # pylint: disable=protected-access
+            "type": self._get_column_type(c.type),
            "nullable": True,
            "default": None,
            "autoincrement": False,
@ -147,10 +149,65 @@ def get_columns(self, connection, table_name, schema=None, **kw):
        columns = _get_projection_details(columns, projection_parameters)
        return columns

+    # Check if this is an Iceberg table
+    if metadata.parameters.get("table_type") == "ICEBERG":
+        # For Iceberg tables, get the full table metadata from Glue to access column parameters
+        try:
+            # Get the raw connection to access schema information
+            raw_connection = self._raw_connection(connection)
+            schema = schema if schema else raw_connection.schema_name
+
+            # Use the provided Glue client or create one with default credentials
+            glue_client = kw.get("glue_client")
+
+            # Get full table metadata from Glue
+            response = glue_client.get_table(DatabaseName=schema, Name=table_name)
+
+            table_info = response["Table"]
+
+            # Filter out non-current Iceberg columns
+            current_columns = []
+            storage_descriptor = table_info.get("StorageDescriptor", {})
+            glue_columns = storage_descriptor.get("Columns", [])
+
+            for glue_col in glue_columns:
+                col_name = glue_col["Name"]
+                col_type = glue_col["Type"]
+                col_comment = glue_col.get("Comment", "")
+                col_parameters = glue_col.get("Parameters", {})
+
+                # Check if this is a non-current Iceberg column
+                iceberg_current = col_parameters.get("iceberg.field.current", "true")
+                is_current = iceberg_current != "false"
+
+                if is_current:
+                    current_columns.append(
+                        {
+                            "name": col_name,
+                            "type": self._get_column_type(col_type),
+                            "nullable": True,
+                            "default": None,
+                            "autoincrement": False,
+                            "comment": col_comment,
+                            "system_data_type": col_type,
+                            "is_complex": is_complex_type(col_type),
+                            "dialect_options": {"awsathena_partition": None},
+                        }
+                    )
+
+            columns += current_columns
+            return columns
+
+        except Exception as e:
+            # If we can't get Glue metadata, fall back to the original method
+            # This ensures backward compatibility
+            logger.warning(f"Error getting Glue metadata for table {table_name}: {e}")
+
+    # For non-Iceberg tables or if Glue access fails, use the original method
    columns += [
        {
            "name": c.name,
-            "type": self._get_column_type(c.type),  # pylint: disable=protected-access
+            "type": self._get_column_type(c.type),
            "nullable": True,
            "default": None,
            "autoincrement": False,
--- a/ingestion/src/metadata/ingestion/source/database/glue/metadata.py
+++ b/ingestion/src/metadata/ingestion/source/database/glue/metadata.py
@ -377,12 +377,87 @@ class GlueSource(ExternalTableLineageMixin, DatabaseServiceSource):
        parsed_string["description"] = column.Comment
        return Column(**parsed_string)

+    # pylint: disable=too-many-locals
    def get_columns(self, column_data: StorageDetails) -> Optional[Iterable[Column]]:
+        """
+        Get columns from Glue.
+        """
+        # Check if this is an Iceberg table
+        table = self.context.get().table_data
+        is_iceberg = table.Parameters and table.Parameters.table_type == "ICEBERG"
+
+        if is_iceberg:
+            # For Iceberg tables, get the full table metadata from Glue to access column parameters
+            try:
+                schema_name = self.context.get().database_schema
+                table_name = table.Name
+
+                # Get full table metadata from Glue API
+                response = self.glue.get_table(
+                    DatabaseName=schema_name, Name=table_name
+                )
+
+                table_info = response["Table"]
+
+                # Filter out non-current Iceberg columns
+                storage_descriptor = table_info.get("StorageDescriptor", {})
+                glue_columns = storage_descriptor.get("Columns", [])
+
+                for glue_col in glue_columns:
+                    col_name = glue_col["Name"]
+                    col_type = glue_col["Type"]
+                    col_comment = glue_col.get("Comment", "")
+                    col_parameters = glue_col.get("Parameters", {})
+
+                    # Check if this is a non-current Iceberg column
+                    iceberg_current = col_parameters.get(
+                        "iceberg.field.current", "true"
+                    )
+                    is_current = iceberg_current != "false"
+
+                    if is_current:
+                        # Create a GlueColumn object for processing
+                        column_obj = GlueColumn(
+                            Name=col_name, Type=col_type, Comment=col_comment
+                        )
+                        yield self._get_column_object(column_obj)
+
+                # Process partition columns
+                partition_keys = table_info.get("PartitionKeys", [])
+                for glue_col in partition_keys:
+                    col_name = glue_col["Name"]
+                    col_type = glue_col["Type"]
+                    col_comment = glue_col.get("Comment", "")
+                    col_parameters = glue_col.get("Parameters", {})
+
+                    # Check if this is a non-current Iceberg column
+                    iceberg_current = col_parameters.get(
+                        "iceberg.field.current", "true"
+                    )
+                    is_current = iceberg_current != "false"
+
+                    if is_current:
+                        # Create a GlueColumn object for processing
+                        column_obj = GlueColumn(
+                            Name=col_name, Type=col_type, Comment=col_comment
+                        )
+                        yield self._get_column_object(column_obj)
+
+                return
+
+            except Exception as e:
+                # If we can't get Glue metadata, fall back to the original method
+                # This ensures backward compatibility
+                logger.warning(
+                    f"Failed to get Glue metadata for Iceberg table {table.Name}: {e}"
+                )
+
+        # For non-Iceberg tables or if Glue access fails, use the original method
        # process table regular columns info
        for column in column_data.Columns:
            yield self._get_column_object(column)

-        # process table regular columns info
+        # process table partition columns info
        for column in self.context.get().table_data.PartitionKeys:
            yield self._get_column_object(column)

--- a/ingestion/tests/unit/topology/database/test_athena_utils.py
+++ b/ingestion/tests/unit/topology/database/test_athena_utils.py
@ -0,0 +1,169 @@
+#  Copyright 2025 Collate
+#  Licensed under the Collate Community License, Version 1.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#  https://github.com/open-metadata/OpenMetadata/blob/main/ingestion/LICENSE
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+
+"""Test Athena Utils"""
+
+import unittest
+
+
+class TestAthenaUtils(unittest.TestCase):
+    """Test Athena Utils"""
+
+    def test_iceberg_column_filtering_logic(self):
+        """Test the Iceberg column filtering logic directly"""
+
+        # Create mock Glue column data (as returned by boto3)
+        current_column = {
+            "Name": "current_col",
+            "Type": "int",
+            "Comment": "Current column",
+            "Parameters": {"iceberg.field.current": "true"},
+        }
+        non_current_column = {
+            "Name": "non_current_col",
+            "Type": "string",
+            "Comment": "Non-current column",
+            "Parameters": {"iceberg.field.current": "false"},
+        }
+        column_without_params = {
+            "Name": "normal_col",
+            "Type": "boolean",
+            "Comment": "Normal column",
+            "Parameters": {},
+        }
+
+        # Test the filtering logic directly (same logic as in get_columns function)
+        current_columns = []
+        for col in [current_column, non_current_column, column_without_params]:
+            col_name = col["Name"]
+            col_type = col["Type"]
+            col_comment = col.get("Comment", "")
+            col_parameters = col.get("Parameters", {})
+
+            # Check if this is a non-current Iceberg column
+            iceberg_current = col_parameters.get("iceberg.field.current", "true")
+            is_current = iceberg_current != "false"
+
+            if is_current:
+                current_columns.append(col_name)
+
+        # Verify that only current columns are returned
+        current_column_names = current_columns
+
+        # Should include current_col and normal_col, but not non_current_col
+        self.assertIn("current_col", current_column_names)
+        self.assertIn("normal_col", current_column_names)
+        self.assertNotIn("non_current_col", current_column_names)
+
+        # Verify that exactly 2 columns are returned (current_col and normal_col)
+        self.assertEqual(len(current_columns), 2)
+
+    def test_get_columns_handles_attribute_error(self):
+        """Test that get_columns handles AttributeError gracefully"""
+
+        # Create a column object that raises AttributeError when accessing parameters
+        class MockColumn:
+            def __init__(self, name, type_, comment):
+                self.name = name
+                self.type = type_
+                self.comment = comment
+
+            @property
+            def parameters(self):
+                raise AttributeError("parameters attribute not available")
+
+        column = MockColumn("test_col", "int", "Test column")
+
+        # Test the filtering logic with AttributeError
+        current_columns = []
+        for c in [column]:
+            is_current = True
+            try:
+                if hasattr(c, "parameters") and c.parameters:
+                    iceberg_current = c.parameters.get("iceberg.field.current")
+                    if iceberg_current == "false":
+                        is_current = False
+            except (AttributeError, KeyError):
+                pass
+
+            if is_current:
+                current_columns.append(c)
+
+        # Should include the column since AttributeError is caught
+        self.assertEqual(len(current_columns), 1)
+        self.assertEqual(current_columns[0].name, "test_col")
+
+    def test_get_columns_handles_missing_parameters_attribute(self):
+        """Test that get_columns handles missing parameters attribute gracefully"""
+
+        # Create a column object without parameters attribute
+        class MockColumn:
+            def __init__(self, name, type_, comment):
+                self.name = name
+                self.type = type_
+                self.comment = comment
+
+        column = MockColumn("test_col", "int", "Test column")
+
+        # Test the filtering logic with missing parameters attribute
+        current_columns = []
+        for c in [column]:
+            is_current = True
+            try:
+                if hasattr(c, "parameters") and c.parameters:
+                    iceberg_current = c.parameters.get("iceberg.field.current")
+                    if iceberg_current == "false":
+                        is_current = False
+            except (AttributeError, KeyError):
+                pass
+
+            if is_current:
+                current_columns.append(c)
+
+        # Should include the column since parameters attribute is missing
+        self.assertEqual(len(current_columns), 1)
+        self.assertEqual(current_columns[0].name, "test_col")
+
+    def test_get_columns_handles_none_parameters(self):
+        """Test that get_columns handles None parameters gracefully"""
+
+        # Create a column object with None parameters
+        class MockColumn:
+            def __init__(self, name, type_, comment, parameters=None):
+                self.name = name
+                self.type = type_
+                self.comment = comment
+                self.parameters = parameters
+
+        column = MockColumn("test_col", "int", "Test column", None)
+
+        # Test the filtering logic with None parameters
+        current_columns = []
+        for c in [column]:
+            is_current = True
+            try:
+                if hasattr(c, "parameters") and c.parameters:
+                    iceberg_current = c.parameters.get("iceberg.field.current")
+                    if iceberg_current == "false":
+                        is_current = False
+            except (AttributeError, KeyError):
+                pass
+
+            if is_current:
+                current_columns.append(c)
+
+        # Should include the column since parameters is None
+        self.assertEqual(len(current_columns), 1)
+        self.assertEqual(current_columns[0].name, "test_col")
+
+
+if __name__ == "__main__":
+    unittest.main()
--- a/ingestion/tests/unit/topology/database/test_glue.py
+++ b/ingestion/tests/unit/topology/database/test_glue.py
@ -17,7 +17,7 @@ import json
 from copy import deepcopy
 from pathlib import Path
 from unittest import TestCase
-from unittest.mock import patch
+from unittest.mock import Mock, patch

 from metadata.generated.schema.entity.data.database import Database
 from metadata.generated.schema.entity.data.databaseSchema import DatabaseSchema
@ -211,3 +211,87 @@ class GlueUnitTest(TestCase):
            list(map(lambda x: x.locationPath, self.get_table_requests()))
            == EXPECTED_LOCATION_PATHS
        )
+
+    def test_iceberg_column_filtering_logic(self):
+        """Test the Iceberg column filtering logic directly"""
+
+        # Create mock Glue column data (as returned by boto3)
+        current_column = {
+            "Name": "current_col",
+            "Type": "int",
+            "Comment": "Current column",
+            "Parameters": {"iceberg.field.current": "true"},
+        }
+        non_current_column = {
+            "Name": "non_current_col",
+            "Type": "string",
+            "Comment": "Non-current column",
+            "Parameters": {"iceberg.field.current": "false"},
+        }
+        column_without_params = {
+            "Name": "normal_col",
+            "Type": "boolean",
+            "Comment": "Normal column",
+            "Parameters": {},
+        }
+
+        # Test the filtering logic directly (same logic as in get_columns function)
+        current_columns = []
+        for col in [current_column, non_current_column, column_without_params]:
+            col_name = col["Name"]
+            col_type = col["Type"]
+            col_comment = col.get("Comment", "")
+            col_parameters = col.get("Parameters", {})
+
+            # Check if this is a non-current Iceberg column
+            iceberg_current = col_parameters.get("iceberg.field.current", "true")
+            is_current = iceberg_current != "false"
+
+            if is_current:
+                current_columns.append(col_name)
+
+        # Verify that only current columns are returned
+        current_column_names = current_columns
+
+        # Should include current_col and normal_col, but not non_current_col
+        self.assertIn("current_col", current_column_names)
+        self.assertIn("normal_col", current_column_names)
+        self.assertNotIn("non_current_col", current_column_names)
+
+        # Verify that exactly 2 columns are returned (current_col and normal_col)
+        self.assertEqual(len(current_columns), 2)
+
+    def test_iceberg_table_detection(self):
+        """Test that Iceberg tables are correctly detected"""
+
+        # Test with Iceberg table
+        mock_iceberg_table = Mock()
+        mock_iceberg_table.Parameters = Mock()
+        mock_iceberg_table.Parameters.table_type = "ICEBERG"
+
+        # Test with non-Iceberg table
+        mock_regular_table = Mock()
+        mock_regular_table.Parameters = Mock()
+        mock_regular_table.Parameters.table_type = "EXTERNAL_TABLE"
+
+        # Test with table without parameters
+        mock_no_params_table = Mock()
+        mock_no_params_table.Parameters = None
+
+        # Test the detection logic
+        is_iceberg_1 = (
+            mock_iceberg_table.Parameters
+            and mock_iceberg_table.Parameters.table_type == "ICEBERG"
+        )
+        is_iceberg_2 = (
+            mock_regular_table.Parameters
+            and mock_regular_table.Parameters.table_type == "ICEBERG"
+        )
+        is_iceberg_3 = (
+            mock_no_params_table.Parameters
+            and mock_no_params_table.Parameters.table_type == "ICEBERG"
+        )
+
+        self.assertTrue(is_iceberg_1)
+        self.assertFalse(is_iceberg_2)
+        self.assertFalse(is_iceberg_3)