fix(ingestion): updated json_extract function for postgres db #12604 (#14533)

This commit is contained in:
Anush Kumar 2025-08-21 15:42:51 -07:00 committed by GitHub
parent 829cf62fc3
commit bf09c5eebf
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 20 additions and 8 deletions

View File

@ -104,6 +104,22 @@ class DataHubDatabaseReader:
ORDER BY mav.urn ORDER BY mav.urn
""" """
def _get_json_extract_expression(self) -> str:
    """
    Build the SQL fragment that reads the "removed" key from the JSON metadata column.

    The fragment is selected by inspecting ``self.engine.dialect.name``.

    Returns:
        The cast-and-extract expression when the dialect name is "postgresql";
        the JSON_EXTRACT expression for every other dialect.
    """
    is_postgres = self.engine.dialect.name == "postgresql"
    if not is_postgres:
        # Any non-PostgreSQL dialect (e.g., MySQL) falls back to JSON_EXTRACT.
        return "JSON_EXTRACT(metadata, '$.removed')"
    # PostgreSQL: cast metadata to json, pull out 'removed', then cast the result to boolean.
    return "((metadata::json)->>'removed')::boolean"
def query(self, set_structured_properties_filter: bool) -> str: def query(self, set_structured_properties_filter: bool) -> str:
""" """
Main query that gets data for specified date range with appropriate filters. Main query that gets data for specified date range with appropriate filters.
@ -125,7 +141,7 @@ class DataHubDatabaseReader:
LEFT JOIN ( LEFT JOIN (
SELECT SELECT
*, *,
JSON_EXTRACT(metadata, '$.removed') as removed {self._get_json_extract_expression()} as removed
FROM {self.engine.dialect.identifier_preparer.quote(self.config.database_table_name)} FROM {self.engine.dialect.identifier_preparer.quote(self.config.database_table_name)}
WHERE aspect = 'status' WHERE aspect = 'status'
AND version = 0 AND version = 0
@ -241,15 +257,10 @@ class DataHubDatabaseReader:
"end_createdon": end_date.strftime(DATETIME_FORMAT), "end_createdon": end_date.strftime(DATETIME_FORMAT),
"limit": limit, "limit": limit,
"offset": offset, "offset": offset,
# Always pass exclude_aspects as a tuple, postgres doesn't support lists
"exclude_aspects": tuple(self.config.exclude_aspects),
} }
# Add exclude_aspects if needed
if (
hasattr(self.config, "exclude_aspects")
and self.config.exclude_aspects
):
params["exclude_aspects"] = tuple(self.config.exclude_aspects)
logger.info( logger.info(
f"Querying data from {start_date.strftime(DATETIME_FORMAT)} to {end_date.strftime(DATETIME_FORMAT)} " f"Querying data from {start_date.strftime(DATETIME_FORMAT)} to {end_date.strftime(DATETIME_FORMAT)} "
f"with limit {limit} and offset {offset} (inclusive range)" f"with limit {limit} and offset {offset} (inclusive range)"

View File

@ -304,4 +304,5 @@ def test_get_rows_for_date_range_exclude_aspects(mock_reader):
# Assert # Assert
called_params = mock_reader.execute_server_cursor.call_args[0][1] called_params = mock_reader.execute_server_cursor.call_args[0][1]
assert "exclude_aspects" in called_params assert "exclude_aspects" in called_params
assert isinstance(called_params["exclude_aspects"], tuple)
assert called_params["exclude_aspects"] == ("aspect1", "aspect2") assert called_params["exclude_aspects"] == ("aspect1", "aspect2")