From e2844b6c95821655f083b47b569c11cdfdad733f Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Fri, 2 May 2025 13:04:53 -0700 Subject: [PATCH] fix(ingest): move to acryl-great-expectations (#13398) --- metadata-ingestion/setup.py | 25 ++++++++++++++----- .../ingestion/source/ge_data_profiler.py | 25 +++++++++++++++++++ 2 files changed, 44 insertions(+), 6 deletions(-) diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py index ada682ae7d..f437e175af 100644 --- a/metadata-ingestion/setup.py +++ b/metadata-ingestion/setup.py @@ -131,6 +131,22 @@ cachetools_lib = { "cachetools", } +great_expectations_lib = { + # 1. Our original dep was this: + # "great-expectations>=0.15.12, <=0.15.50", + # 2. For hive, we had additional restrictions: + # Due to https://github.com/great-expectations/great_expectations/issues/6146, + # we cannot allow 0.15.{23-26}. This was fixed in 0.15.27 by + # https://github.com/great-expectations/great_expectations/pull/6149. + # "great-expectations != 0.15.23, != 0.15.24, != 0.15.25, != 0.15.26", + # 3. Since then, we've ended up forking great-expectations in order to + # add pydantic 2.x support. The fork is pretty simple + # https://github.com/great-expectations/great_expectations/compare/0.15.50...hsheth2:great_expectations:0.15.50-pydantic-2-patch?expand=1 + # This was derived from work done by @jskrzypek in + # https://github.com/datahub-project/datahub/issues/8115#issuecomment-2264219783 + "acryl-great-expectations==0.15.50.1", +} + sql_common_slim = { # Required for all SQL sources. # This is temporary lower bound that we're open to loosening/tightening as requirements show up @@ -140,8 +156,8 @@ sql_common = ( { *sql_common_slim, # Required for SQL profiling. 
- "great-expectations>=0.15.12, <=0.15.50", - *pydantic_no_v2, # because of great-expectations + *great_expectations_lib, + "pydantic<2", # keeping this for now, but can be removed eventually # scipy version restricted to reduce backtracking, used by great-expectations, "scipy>=1.7.2", # GE added handling for higher version of jinja2 @@ -450,10 +466,7 @@ plugins: Dict[str, Set[str]] = { | pyhive_common | { "databricks-dbapi", - # Due to https://github.com/great-expectations/great_expectations/issues/6146, - # we cannot allow 0.15.{23-26}. This was fixed in 0.15.27 by - # https://github.com/great-expectations/great_expectations/pull/6149. - "great-expectations != 0.15.23, != 0.15.24, != 0.15.25, != 0.15.26", + *great_expectations_lib, }, # keep in sync with presto-on-hive until presto-on-hive will be removed "hive-metastore": sql_common diff --git a/metadata-ingestion/src/datahub/ingestion/source/ge_data_profiler.py b/metadata-ingestion/src/datahub/ingestion/source/ge_data_profiler.py index 8762b59760..6514e74dfc 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/ge_data_profiler.py +++ b/metadata-ingestion/src/datahub/ingestion/source/ge_data_profiler.py @@ -5,6 +5,7 @@ import concurrent.futures import contextlib import dataclasses import functools +import importlib.metadata import json import logging import re @@ -84,6 +85,30 @@ if TYPE_CHECKING: from pyathena.cursor import Cursor assert MARKUPSAFE_PATCHED + +# We need to ensure that acryl-great-expectations is installed +# and great-expectations is not installed. 
+# Probe package metadata for both distributions. A flag stays False when the
+# lookup raises PackageNotFoundError, i.e. that distribution is not installed.
+acryl_gx_version = original_gx_version = False
+with contextlib.suppress(importlib.metadata.PackageNotFoundError):
+    acryl_gx_version = bool(importlib.metadata.distribution("acryl-great-expectations"))
+with contextlib.suppress(importlib.metadata.PackageNotFoundError):
+    original_gx_version = bool(importlib.metadata.distribution("great-expectations"))
+
+# Fail at import time whenever the upstream great-expectations distribution is
+# present, whether alongside the acryl fork or in place of it.
+if original_gx_version and acryl_gx_version:
+    raise RuntimeError(
+        "acryl-great-expectations and great-expectations cannot both be installed because their files will conflict. "
+        "You will need to (1) uninstall great-expectations and (2) re-install acryl-great-expectations. "
+        "See https://github.com/pypa/pip/issues/4625."
+    )
+if original_gx_version:
+    raise RuntimeError(
+        "We expect acryl-great-expectations to be installed, but great-expectations is installed instead."
+    )
+
 logger: logging.Logger = logging.getLogger(__name__)
 
 _original_get_column_median = SqlAlchemyDataset.get_column_median