feat(ingest): hide cartesian product warnings in GE profiler (#7096)

This commit is contained in:
Harshal Sheth 2023-01-24 02:30:50 -05:00 committed by GitHub
parent dbc89c8680
commit ceaeb42a2b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 29 additions and 1 deletions

View File

@ -24,6 +24,7 @@ from typing import (
) )
import sqlalchemy as sa import sqlalchemy as sa
import sqlalchemy.sql.compiler
from great_expectations.core.util import convert_to_json_serializable from great_expectations.core.util import convert_to_json_serializable
from great_expectations.data_context import BaseDataContext from great_expectations.data_context import BaseDataContext
from great_expectations.data_context.types.base import ( from great_expectations.data_context.types.base import (
@ -58,6 +59,7 @@ from datahub.metadata.schema_classes import (
from datahub.telemetry import stats, telemetry from datahub.telemetry import stats, telemetry
from datahub.utilities.perf_timer import PerfTimer from datahub.utilities.perf_timer import PerfTimer
from datahub.utilities.sqlalchemy_query_combiner import ( from datahub.utilities.sqlalchemy_query_combiner import (
IS_SQLALCHEMY_1_4,
SQLAlchemyQueryCombiner, SQLAlchemyQueryCombiner,
get_query_columns, get_query_columns,
) )
@ -683,6 +685,21 @@ class DatahubGEProfiler:
# make the threading code work correctly. As such, we need to make sure we've # make the threading code work correctly. As such, we need to make sure we've
# got an engine here. # got an engine here.
self.base_engine = conn.engine self.base_engine = conn.engine
if IS_SQLALCHEMY_1_4:
# SQLAlchemy 1.4 added a statement "linter", which issues warnings about cartesian products in SELECT statements.
# Changelog: https://docs.sqlalchemy.org/en/14/changelog/migration_14.html#change-4737.
# Code: https://github.com/sqlalchemy/sqlalchemy/blob/2f91dd79310657814ad28b6ef64f91fff7a007c9/lib/sqlalchemy/sql/compiler.py#L549
#
# The query combiner does indeed produce queries with cartesian products, but they are
# safe because each "FROM" clause only returns one row, so the cartesian product
# is also always a single row. As such, we disable the linter here.
# Modified from https://github.com/sqlalchemy/sqlalchemy/blob/2f91dd79310657814ad28b6ef64f91fff7a007c9/lib/sqlalchemy/engine/create.py#L612
self.base_engine.dialect.compiler_linting &= ( # type: ignore[attr-defined]
~sqlalchemy.sql.compiler.COLLECT_CARTESIAN_PRODUCTS # type: ignore[attr-defined]
)
self.platform = platform self.platform = platform
@contextlib.contextmanager @contextlib.contextmanager

View File

@ -13,6 +13,7 @@ import greenlet
import sqlalchemy import sqlalchemy
import sqlalchemy.engine import sqlalchemy.engine
import sqlalchemy.sql import sqlalchemy.sql
from packaging import version
from sqlalchemy.engine import Connection from sqlalchemy.engine import Connection
from sqlalchemy.orm.exc import MultipleResultsFound, NoResultFound from sqlalchemy.orm.exc import MultipleResultsFound, NoResultFound
@ -20,6 +21,12 @@ from datahub.ingestion.api.report import Report
logger: logging.Logger = logging.getLogger(__name__) logger: logging.Logger = logging.getLogger(__name__)
# The type annotations for SA 1.3.x don't have the __version__ attribute,
# so we need to ignore the error here.
SQLALCHEMY_VERSION = sqlalchemy.__version__ # type: ignore[attr-defined]
IS_SQLALCHEMY_1_4 = version.parse(SQLALCHEMY_VERSION) >= version.parse("1.4.0")
MAX_QUERIES_TO_COMBINE_AT_ONCE = 40 MAX_QUERIES_TO_COMBINE_AT_ONCE = 40
@ -333,7 +340,11 @@ class SQLAlchemyQueryCombiner:
# Extract the results into a result for each query. # Extract the results into a result for each query.
index = 0 index = 0
for _, query_future in pending_queue.items(): for _, query_future in pending_queue.items():
cols = query_future.query.columns query = query_future.query
if IS_SQLALCHEMY_1_4:
# On 1.4, it prints a warning if we don't call subquery.
query = query.subquery() # type: ignore
cols = query.columns
data = {} data = {}
for col in cols: for col in cols: