mirror of
https://github.com/open-metadata/OpenMetadata.git
synced 2025-11-03 03:59:12 +00:00
This commit is contained in:
parent
26f99a3ac2
commit
199e3b981c
@ -115,6 +115,7 @@ class AthenaSource(ExternalTableLineageMixin, CommonDbSourceService):
|
||||
)
|
||||
self.external_location_map = {}
|
||||
self.schema_description_map = {}
|
||||
self.glue_client = None
|
||||
|
||||
def prepare(self):
|
||||
"""
|
||||
@ -122,8 +123,10 @@ class AthenaSource(ExternalTableLineageMixin, CommonDbSourceService):
|
||||
"""
|
||||
try:
|
||||
super().prepare()
|
||||
glue_client = AWSClient(self.service_connection.awsConfig).get_glue_client()
|
||||
paginator = glue_client.get_paginator("get_databases")
|
||||
self.glue_client = AWSClient(
|
||||
self.service_connection.awsConfig
|
||||
).get_glue_client()
|
||||
paginator = self.glue_client.get_paginator("get_databases")
|
||||
for page in paginator.paginate():
|
||||
database_page = DatabasePage(**page)
|
||||
for database in database_page.DatabaseList or []:
|
||||
@ -304,3 +307,23 @@ class AthenaSource(ExternalTableLineageMixin, CommonDbSourceService):
|
||||
else:
|
||||
description = table_info.get("text")
|
||||
return description
|
||||
|
||||
def _get_columns_internal(
    self,
    schema_name: str,
    table_name: str,
    db_name: str,
    inspector: Inspector,
    table_type: TableType = None,
):
    """
    Override that forwards this source's Glue client to ``inspector.get_columns``
    so it can be used for Iceberg table filtering.

    The table and schema names are passed positionally; ``table_type``,
    ``db_name`` and ``glue_client`` travel as keyword arguments.
    """
    # Everything beyond table/schema goes through as keyword arguments
    forwarded_kwargs = {
        "table_type": table_type,
        "db_name": db_name,
        "glue_client": self.glue_client,
    }
    return inspector.get_columns(table_name, schema_name, **forwarded_kwargs)
|
||||
|
||||
@ -20,9 +20,12 @@ from sqlalchemy.engine import reflection
|
||||
|
||||
from metadata.ingestion.source import sqa_types
|
||||
from metadata.ingestion.source.database.column_type_parser import ColumnTypeParser
|
||||
from metadata.utils.logger import utils_logger
|
||||
from metadata.utils.sqlalchemy_utils import is_complex_type
|
||||
|
||||
logger = utils_logger()
|
||||
|
||||
# pylint: disable=protected-access
|
||||
@reflection.cache
|
||||
def _get_column_type(self, type_):
|
||||
"""
|
||||
@ -30,7 +33,7 @@ def _get_column_type(self, type_):
|
||||
to add custom SQA typing.
|
||||
"""
|
||||
type_ = type_.replace(" ", "").lower()
|
||||
match = self._pattern_column_type.match(type_) # pylint: disable=protected-access
|
||||
match = self._pattern_column_type.match(type_)
|
||||
if match:
|
||||
name = match.group(1).lower()
|
||||
length = match.group(2)
|
||||
@ -113,18 +116,17 @@ def _get_projection_details(
|
||||
return columns
|
||||
|
||||
|
||||
# pylint: disable=too-many-locals
|
||||
@reflection.cache
|
||||
def get_columns(self, connection, table_name, schema=None, **kw):
|
||||
"""
|
||||
Method to handle table columns
|
||||
"""
|
||||
metadata = self._get_table( # pylint: disable=protected-access
|
||||
connection, table_name, schema=schema, **kw
|
||||
)
|
||||
metadata = self._get_table(connection, table_name, schema=schema, **kw)
|
||||
columns = [
|
||||
{
|
||||
"name": c.name,
|
||||
"type": self._get_column_type(c.type), # pylint: disable=protected-access
|
||||
"type": self._get_column_type(c.type),
|
||||
"nullable": True,
|
||||
"default": None,
|
||||
"autoincrement": False,
|
||||
@ -147,10 +149,65 @@ def get_columns(self, connection, table_name, schema=None, **kw):
|
||||
columns = _get_projection_details(columns, projection_parameters)
|
||||
return columns
|
||||
|
||||
# Check if this is an Iceberg table
|
||||
if metadata.parameters.get("table_type") == "ICEBERG":
|
||||
# For Iceberg tables, get the full table metadata from Glue to access column parameters
|
||||
try:
|
||||
# Get the raw connection to access schema information
|
||||
raw_connection = self._raw_connection(connection)
|
||||
schema = schema if schema else raw_connection.schema_name
|
||||
|
||||
# Use the provided Glue client or create one with default credentials
|
||||
glue_client = kw.get("glue_client")
|
||||
|
||||
# Get full table metadata from Glue
|
||||
response = glue_client.get_table(DatabaseName=schema, Name=table_name)
|
||||
|
||||
table_info = response["Table"]
|
||||
|
||||
# Filter out non-current Iceberg columns
|
||||
current_columns = []
|
||||
storage_descriptor = table_info.get("StorageDescriptor", {})
|
||||
glue_columns = storage_descriptor.get("Columns", [])
|
||||
|
||||
for glue_col in glue_columns:
|
||||
col_name = glue_col["Name"]
|
||||
col_type = glue_col["Type"]
|
||||
col_comment = glue_col.get("Comment", "")
|
||||
col_parameters = glue_col.get("Parameters", {})
|
||||
|
||||
# Check if this is a non-current Iceberg column
|
||||
iceberg_current = col_parameters.get("iceberg.field.current", "true")
|
||||
is_current = iceberg_current != "false"
|
||||
|
||||
if is_current:
|
||||
current_columns.append(
|
||||
{
|
||||
"name": col_name,
|
||||
"type": self._get_column_type(col_type),
|
||||
"nullable": True,
|
||||
"default": None,
|
||||
"autoincrement": False,
|
||||
"comment": col_comment,
|
||||
"system_data_type": col_type,
|
||||
"is_complex": is_complex_type(col_type),
|
||||
"dialect_options": {"awsathena_partition": None},
|
||||
}
|
||||
)
|
||||
|
||||
columns += current_columns
|
||||
return columns
|
||||
|
||||
except Exception as e:
|
||||
# If we can't get Glue metadata, fall back to the original method
|
||||
# This ensures backward compatibility
|
||||
logger.warning(f"Error getting Glue metadata for table {table_name}: {e}")
|
||||
|
||||
# For non-Iceberg tables or if Glue access fails, use the original method
|
||||
columns += [
|
||||
{
|
||||
"name": c.name,
|
||||
"type": self._get_column_type(c.type), # pylint: disable=protected-access
|
||||
"type": self._get_column_type(c.type),
|
||||
"nullable": True,
|
||||
"default": None,
|
||||
"autoincrement": False,
|
||||
|
||||
@ -377,12 +377,87 @@ class GlueSource(ExternalTableLineageMixin, DatabaseServiceSource):
|
||||
parsed_string["description"] = column.Comment
|
||||
return Column(**parsed_string)
|
||||
|
||||
# pylint: disable=too-many-locals
def get_columns(self, column_data: StorageDetails) -> Optional[Iterable[Column]]:
    """
    Get columns from Glue.

    When the table in the current context has ``Parameters.table_type`` equal
    to ``"ICEBERG"``, the table is fetched again through ``self.glue.get_table``
    so that the per-column ``Parameters`` can be inspected, and columns flagged
    with ``iceberg.field.current == "false"`` are left out.

    For every other table, or when the Iceberg lookup raises, the columns in
    ``column_data.Columns`` are yielded first, followed by the table's
    ``PartitionKeys``, without any filtering.

    Args:
        column_data: storage details whose ``Columns`` are used on the
            non-Iceberg / fallback path.

    Yields:
        The result of ``self._get_column_object`` for each kept column.
    """
    table = self.context.get().table_data
    is_iceberg = table.Parameters and table.Parameters.table_type == "ICEBERG"

    if is_iceberg:
        try:
            # Build the complete Iceberg column list BEFORE yielding anything.
            # If the lookup or a column conversion fails half-way, nothing has
            # been emitted yet, so the fallback below cannot duplicate columns.
            iceberg_columns = list(self._get_current_iceberg_columns(table.Name))
        except Exception as e:
            # If we can't get Glue metadata, fall back to the original method
            # This ensures backward compatibility
            logger.warning(
                f"Failed to get Glue metadata for Iceberg table {table.Name}: {e}"
            )
        else:
            yield from iceberg_columns
            return

    # For non-Iceberg tables or if Glue access fails, use the original method
    # process table regular columns info
    for column in column_data.Columns:
        yield self._get_column_object(column)

    # process table partition columns info
    for column in self.context.get().table_data.PartitionKeys:
        yield self._get_column_object(column)

def _get_current_iceberg_columns(self, table_name: str) -> Iterable[Column]:
    """
    Yield column objects for the current (non-dropped) columns of an Iceberg table.

    Reads the table from the Glue API using the schema held in the current
    context, then walks the storage-descriptor columns followed by the
    partition keys, skipping any entry whose ``iceberg.field.current``
    parameter is the string ``"false"``.
    """
    response = self.glue.get_table(
        DatabaseName=self.context.get().database_schema, Name=table_name
    )
    table_info = response["Table"]
    storage_descriptor = table_info.get("StorageDescriptor", {})

    # Regular columns first, partition keys after: same order as the fallback path
    glue_columns = [
        *storage_descriptor.get("Columns", []),
        *table_info.get("PartitionKeys", []),
    ]

    for glue_col in glue_columns:
        col_parameters = glue_col.get("Parameters", {})
        # A missing flag counts as current; only an explicit "false" hides a column
        if col_parameters.get("iceberg.field.current", "true") == "false":
            continue

        column_obj = GlueColumn(
            Name=glue_col["Name"],
            Type=glue_col["Type"],
            Comment=glue_col.get("Comment", ""),
        )
        yield self._get_column_object(column_obj)
|
||||
|
||||
|
||||
169
ingestion/tests/unit/topology/database/test_athena_utils.py
Normal file
169
ingestion/tests/unit/topology/database/test_athena_utils.py
Normal file
@ -0,0 +1,169 @@
|
||||
# Copyright 2025 Collate
|
||||
# Licensed under the Collate Community License, Version 1.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
# https://github.com/open-metadata/OpenMetadata/blob/main/ingestion/LICENSE
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""Test Athena Utils"""
|
||||
|
||||
import unittest
|
||||
|
||||
|
||||
class TestAthenaUtils(unittest.TestCase):
    """Test Athena Utils

    These cases check stand-alone copies of the Iceberg "current column"
    rules. They do not import or call the ingestion code itself, so they pin
    the intended rule rather than the production implementation.
    """

    @staticmethod
    def _is_current_glue_column(glue_column):
        """Rule for boto3-style column dicts: hidden only when flagged "false"."""
        parameters = glue_column.get("Parameters", {})
        return parameters.get("iceberg.field.current", "true") != "false"

    @staticmethod
    def _is_current_column_object(column):
        """Rule for column objects whose ``parameters`` may be missing, None or raising."""
        # getattr with a default covers both a missing attribute and a property
        # that raises AttributeError, so no try/except is needed here.
        parameters = getattr(column, "parameters", None)
        if not parameters:
            return True
        return parameters.get("iceberg.field.current") != "false"

    def _assert_column_is_kept(self, column):
        """Assert that filtering a one-element list keeps exactly that column."""
        kept = [c for c in [column] if self._is_current_column_object(c)]
        self.assertEqual(len(kept), 1)
        self.assertEqual(kept[0].name, "test_col")

    def test_iceberg_column_filtering_logic(self):
        """Test the Iceberg column filtering logic directly"""

        # Mock Glue column data (as returned by boto3)
        glue_columns = [
            {
                "Name": "current_col",
                "Type": "int",
                "Comment": "Current column",
                "Parameters": {"iceberg.field.current": "true"},
            },
            {
                "Name": "non_current_col",
                "Type": "string",
                "Comment": "Non-current column",
                "Parameters": {"iceberg.field.current": "false"},
            },
            {
                "Name": "normal_col",
                "Type": "boolean",
                "Comment": "Normal column",
                "Parameters": {},
            },
        ]

        current_column_names = [
            col["Name"] for col in glue_columns if self._is_current_glue_column(col)
        ]

        # current_col and normal_col survive, in input order; non_current_col is dropped
        self.assertEqual(current_column_names, ["current_col", "normal_col"])

    def test_get_columns_handles_attribute_error(self):
        """Test that get_columns handles AttributeError gracefully"""

        # Column object that raises AttributeError when accessing parameters
        class MockColumn:
            def __init__(self, name, type_, comment):
                self.name = name
                self.type = type_
                self.comment = comment

            @property
            def parameters(self):
                raise AttributeError("parameters attribute not available")

        # The column is kept because the failing attribute access is treated
        # the same as "no parameters"
        self._assert_column_is_kept(MockColumn("test_col", "int", "Test column"))

    def test_get_columns_handles_missing_parameters_attribute(self):
        """Test that get_columns handles missing parameters attribute gracefully"""

        # Column object without parameters attribute
        class MockColumn:
            def __init__(self, name, type_, comment):
                self.name = name
                self.type = type_
                self.comment = comment

        # Should include the column since parameters attribute is missing
        self._assert_column_is_kept(MockColumn("test_col", "int", "Test column"))

    def test_get_columns_handles_none_parameters(self):
        """Test that get_columns handles None parameters gracefully"""

        # Column object with None parameters
        class MockColumn:
            def __init__(self, name, type_, comment, parameters=None):
                self.name = name
                self.type = type_
                self.comment = comment
                self.parameters = parameters

        # Should include the column since parameters is None
        self._assert_column_is_kept(
            MockColumn("test_col", "int", "Test column", None)
        )
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
@ -17,7 +17,7 @@ import json
|
||||
from copy import deepcopy
|
||||
from pathlib import Path
|
||||
from unittest import TestCase
|
||||
from unittest.mock import patch
|
||||
from unittest.mock import Mock, patch
|
||||
|
||||
from metadata.generated.schema.entity.data.database import Database
|
||||
from metadata.generated.schema.entity.data.databaseSchema import DatabaseSchema
|
||||
@ -211,3 +211,87 @@ class GlueUnitTest(TestCase):
|
||||
list(map(lambda x: x.locationPath, self.get_table_requests()))
|
||||
== EXPECTED_LOCATION_PATHS
|
||||
)
|
||||
|
||||
def test_iceberg_column_filtering_logic(self):
|
||||
"""Test the Iceberg column filtering logic directly"""
|
||||
|
||||
# Create mock Glue column data (as returned by boto3)
|
||||
current_column = {
|
||||
"Name": "current_col",
|
||||
"Type": "int",
|
||||
"Comment": "Current column",
|
||||
"Parameters": {"iceberg.field.current": "true"},
|
||||
}
|
||||
non_current_column = {
|
||||
"Name": "non_current_col",
|
||||
"Type": "string",
|
||||
"Comment": "Non-current column",
|
||||
"Parameters": {"iceberg.field.current": "false"},
|
||||
}
|
||||
column_without_params = {
|
||||
"Name": "normal_col",
|
||||
"Type": "boolean",
|
||||
"Comment": "Normal column",
|
||||
"Parameters": {},
|
||||
}
|
||||
|
||||
# Test the filtering logic directly (same logic as in get_columns function)
|
||||
current_columns = []
|
||||
for col in [current_column, non_current_column, column_without_params]:
|
||||
col_name = col["Name"]
|
||||
col_type = col["Type"]
|
||||
col_comment = col.get("Comment", "")
|
||||
col_parameters = col.get("Parameters", {})
|
||||
|
||||
# Check if this is a non-current Iceberg column
|
||||
iceberg_current = col_parameters.get("iceberg.field.current", "true")
|
||||
is_current = iceberg_current != "false"
|
||||
|
||||
if is_current:
|
||||
current_columns.append(col_name)
|
||||
|
||||
# Verify that only current columns are returned
|
||||
current_column_names = current_columns
|
||||
|
||||
# Should include current_col and normal_col, but not non_current_col
|
||||
self.assertIn("current_col", current_column_names)
|
||||
self.assertIn("normal_col", current_column_names)
|
||||
self.assertNotIn("non_current_col", current_column_names)
|
||||
|
||||
# Verify that exactly 2 columns are returned (current_col and normal_col)
|
||||
self.assertEqual(len(current_columns), 2)
|
||||
|
||||
def test_iceberg_table_detection(self):
|
||||
"""Test that Iceberg tables are correctly detected"""
|
||||
|
||||
# Test with Iceberg table
|
||||
mock_iceberg_table = Mock()
|
||||
mock_iceberg_table.Parameters = Mock()
|
||||
mock_iceberg_table.Parameters.table_type = "ICEBERG"
|
||||
|
||||
# Test with non-Iceberg table
|
||||
mock_regular_table = Mock()
|
||||
mock_regular_table.Parameters = Mock()
|
||||
mock_regular_table.Parameters.table_type = "EXTERNAL_TABLE"
|
||||
|
||||
# Test with table without parameters
|
||||
mock_no_params_table = Mock()
|
||||
mock_no_params_table.Parameters = None
|
||||
|
||||
# Test the detection logic
|
||||
is_iceberg_1 = (
|
||||
mock_iceberg_table.Parameters
|
||||
and mock_iceberg_table.Parameters.table_type == "ICEBERG"
|
||||
)
|
||||
is_iceberg_2 = (
|
||||
mock_regular_table.Parameters
|
||||
and mock_regular_table.Parameters.table_type == "ICEBERG"
|
||||
)
|
||||
is_iceberg_3 = (
|
||||
mock_no_params_table.Parameters
|
||||
and mock_no_params_table.Parameters.table_type == "ICEBERG"
|
||||
)
|
||||
|
||||
self.assertTrue(is_iceberg_1)
|
||||
self.assertFalse(is_iceberg_2)
|
||||
self.assertFalse(is_iceberg_3)
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user