from typing import Dict, Set
import setuptools
package_metadata: dict = {}
with open("./src/datahub/_version.py") as fp:
exec(fp.read(), package_metadata)
_version: str = package_metadata["__version__"]
_self_pin = (
f"=={_version}"
if not (_version.endswith(("dev0", "dev1")) or "docker" in _version)
else ""
)
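# For illustration, hypothetical version strings (not read from _version.py) and the
# pin they would produce for the companion packages referenced further below:
#   _version = "0.14.1"        -> _self_pin == "==0.14.1"   (releases pin companions to the same version)
#   _version = "0.14.1.1dev0"  -> _self_pin == ""           (dev/docker builds accept any companion version)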
base_requirements = {
# Our min version of typing_extensions is somewhat constrained by Airflow.
"typing_extensions>=4.5.0",
# Actual dependencies.
"typing-inspect",
# pydantic 1.8.2 is incompatible with mypy 0.910.
# See https://github.com/samuelcolvin/pydantic/pull/3175#issuecomment-995382910.
# pydantic 1.10.3 is incompatible with typing-extensions 4.1.1 - https://github.com/pydantic/pydantic/issues/4885
"pydantic>=1.10.0,!=1.10.3",
"mixpanel>=4.9.0",
# Airflow depends on fairly old versions of sentry-sdk, so we want to be loose with our constraints.
"sentry-sdk",
}
framework_common = {
# Avoiding click 8.2.0 due to https://github.com/pallets/click/issues/2894
"click>=7.1.2, !=8.2.0",
"click-default-group",
"PyYAML",
"toml>=0.10.0",
# In Python 3.10+, importlib_metadata is included in the standard library.
"importlib_metadata>=4.0.0; python_version < '3.10'",
"docker",
"expandvars>=0.6.5",
"avro-gen3==0.7.16",
# "avro-gen3 @ git+https://github.com/acryldata/avro_gen@master#egg=avro-gen3",
# avro has historically made breaking changes, so we have a cautious upper bound.
"avro>=1.11.3,<1.13",
"python-dateutil>=2.8.0",
"tabulate",
"progressbar2",
"psutil>=5.8.0",
"Deprecated",
"humanfriendly",
"packaging",
"aiohttp<4",
"cached_property",
"ijson",
"click-spinner",
"requests_file",
"jsonref",
"jsonschema",
"ruamel.yaml",
}
pydantic_no_v2 = {
# pydantic 2 makes major, backwards-incompatible changes - https://github.com/pydantic/pydantic/issues/4887
# Tags sources that still require the pydantic v1 API.
"pydantic<2",
}
rest_common = {"requests", "requests_file"}
kafka_common = {
# Note that confluent_kafka 1.9.0 introduced a hard compatibility break, and
# requires librdkafka >=1.9.0. This is generally not an issue, since they
# now provide prebuilt wheels for most platforms, including M1 Macs and
# Linux aarch64 (e.g. Docker's linux/arm64). Installing confluent_kafka
# from source remains a pain.
# The 2.8.1 release of confluent-kafka only shipped a source distribution,
# with no prebuilt wheels.
# See https://github.com/confluentinc/confluent-kafka-python/issues/1927
"confluent_kafka[schemaregistry,avro]>=1.9.0, != 2.8.1",
# We currently require both Avro libraries. The codegen uses avro-python3 (above)
# schema parsers at runtime for generating and reading JSON into Python objects.
# At the same time, we use Kafka's AvroSerializer, which internally relies on
# fastavro for serialization. We do not use confluent_kafka[avro], since it
# is incompatible with its own dep on avro-python3.
"fastavro>=1.2.0",
}
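# A rough sketch of the two code paths described above (illustrative only; the real
# logic lives in the codegen'd classes and the Kafka emitter/sink, and the schema
# file name below is hypothetical):
#
#   import avro.schema
#   parsed = avro.schema.parse(open("MetadataChangeEvent.avsc").read())  # avro library
#
#   from confluent_kafka.schema_registry.avro import AvroSerializer
#   serializer = AvroSerializer(schema_registry_client, schema_str)  # relies on fastavro internally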
kafka_protobuf = {
"networkx>=2.6.2",
# Required to generate protobuf python modules from the schema downloaded from the schema registry
# NOTE: potential conflict with feast also depending on grpcio
"grpcio>=1.44.0,<2",
"grpcio-tools>=1.44.0,<2",
}
usage_common = {
"sqlparse",
}
sqlglot_lib = {
# We heavily monkeypatch sqlglot.
# We used to maintain an acryl-sqlglot fork: https://github.com/tobymao/sqlglot/compare/main...hsheth2:sqlglot:main?expand=1
# but no longer do.
"sqlglot[rs]==26.26.0",
"patchy==2.8.0",
}
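# A minimal sketch of patchy-style monkeypatching (illustrative only; the function
# name and patch below are hypothetical, and the real patches applied to sqlglot
# live elsewhere in this package):
#
#   import patchy
#   patchy.patch(
#       some_sqlglot_function,
#       """\
#       @@ -1,2 +1,2 @@
#        def some_sqlglot_function(x):
#       -    return old_behavior(x)
#       +    return new_behavior(x)
#       """,
#   )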
classification_lib = {
"acryl-datahub-classify==0.0.11",
# schwifty is needed for the classify plugin, but 2024.08.0 dropped Python 3.8 compatibility.
"schwifty<2024.08.0",
# This is a bit of a hack. Because we download the SpaCy model at runtime in the classify plugin,
# we need pip to be available.
"pip",
# We were seeing an error like this `numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject`
# with numpy 2.0. This likely indicates a mismatch between scikit-learn and numpy versions.
# https://stackoverflow.com/questions/40845304/runtimewarning-numpy-dtype-size-changed-may-indicate-binary-incompatibility
"numpy<2",
}
dbt_common = {
*sqlglot_lib,
"more_itertools",
}
cachetools_lib = {
"cachetools",
}
great_expectations_lib = {
# 1. Our original dep was this:
# "great-expectations>=0.15.12, <=0.15.50",
# 2. For hive, we had additional restrictions:
# Due to https://github.com/great-expectations/great_expectations/issues/6146,
# we cannot allow 0.15.{23-26}. This was fixed in 0.15.27 by
# https://github.com/great-expectations/great_expectations/pull/6149.
# "great-expectations != 0.15.23, != 0.15.24, != 0.15.25, != 0.15.26",
# 3. Since then, we've ended up forking great-expectations in order to
# add pydantic 2.x support. The fork is pretty simple
# https://github.com/great-expectations/great_expectations/compare/0.15.50...hsheth2:great_expectations:0.15.50-pydantic-2-patch?expand=1
# This was derived from work done by @jskrzypek in
# https://github.com/datahub-project/datahub/issues/8115#issuecomment-2264219783
"acryl-great-expectations==0.15.50.1",
}
sqlalchemy_lib = {
# Required for all SQL sources.
# This is a temporary lower bound that we're open to loosening/tightening as requirements show up.
"sqlalchemy>=1.4.39, <2",
}
sql_common = (
{
*sqlalchemy_lib,
# Required for SQL profiling.
*great_expectations_lib,
# scipy version restricted to reduce backtracking; it is used by great-expectations.
"scipy>=1.7.2",
# GE added handling for higher versions of jinja2:
# https://github.com/great-expectations/great_expectations/pull/5382/files
# datahub does not depend on traitlets directly, but great-expectations does.
# https://github.com/ipython/traitlets/issues/741
"traitlets!=5.2.2",
# GE depends on IPython - we have no direct dependency on it.
# IPython 8.22.0 added a dependency on traitlets 5.13.x, but only declared a
# version requirement of traitlets>5.
# See https://github.com/ipython/ipython/issues/14352.
# This issue was fixed by https://github.com/ipython/ipython/pull/14353,
# which first appeared in IPython 8.22.1.
# As such, we just need to avoid that version in order to get the
# dependencies that we need. IPython probably should've yanked 8.22.0.
"IPython!=8.22.0",
"greenlet",
*cachetools_lib,
}
| usage_common
| sqlglot_lib
| classification_lib
)
aws_common = {
# AWS Python SDK
"boto3",
# Deal with a version incompatibility between botocore (used by boto3) and urllib3.
# See https://github.com/boto/botocore/pull/2563.
"botocore!=1.23.0",
}
path_spec_common = {
"parse>=1.19.0",
"wcmatch",
}
looker_common = {
# Looker Python SDK
"looker-sdk>=23.0.0",
# This version of lkml contains a fix for parsing lists in
# LookML files with spaces between an item and the following comma.
# See https://github.com/joshtemple/lkml/issues/73.
"lkml>=1.3.4",
*sqlglot_lib,
"GitPython>2",
# python-liquid 2 includes a bunch of breaking changes.
# See https://jg-rp.github.io/liquid/migration/
# Eventually we should fully upgrade to v2, but that will require
# us to drop Python 3.8 support first.
"python-liquid<2",
"deepmerge>=1.1.1",
}
bigquery_common = {
# Google cloud logging library
"google-cloud-logging<=3.5.0",
"google-cloud-bigquery",
"google-cloud-datacatalog>=1.5.0",
"google-cloud-resource-manager",
"more-itertools>=8.12.0",
"sqlalchemy-bigquery>=1.4.1",
*path_spec_common,
}
clickhouse_common = {
# Clickhouse 0.2.0 adds support for SQLAlchemy 1.4.x
# Disallow 0.2.5 because of https://github.com/xzkostyan/clickhouse-sqlalchemy/issues/272.
# Note that there's also a known issue around nested map types: https://github.com/xzkostyan/clickhouse-sqlalchemy/issues/269.
# zstd needs to be pinned because the latest version causes issues on arm
"zstd<1.5.6.8",
"clickhouse-sqlalchemy>=0.2.0,<0.2.5",
}
redshift_common = {
# sqlalchemy-redshift 0.8.3 adds support for SQLAlchemy 1.4.x
"sqlalchemy-redshift>=0.8.3",
"GeoAlchemy2",
"redshift-connector>=2.1.5",
*path_spec_common,
}
snowflake_common = {
# Lower bound due to https://github.com/snowflakedb/snowflake-sqlalchemy/issues/350
#
# Upper bound <1.7.4: Version 1.7.4 of snowflake-sqlalchemy introduced a bug that breaks
# table column name reflection for non-uppercase table names. While we do not
# use this method directly, it is used by great-expectations during profiling.
#
# See: https://github.com/snowflakedb/snowflake-sqlalchemy/compare/v1.7.3...v1.7.4
#
# The exact cause of the breakage in v1.7.4 is unclear, but it may be related to
# changes in the _get_table_columns function. I initially suspected PR #541
# (https://github.com/snowflakedb/snowflake-sqlalchemy/pull/541), but that has been
# present since v1.7.0 and does not appear to cause issues.
#
# Reflection failures for case-sensitive object names are a known issue:
# https://github.com/snowflakedb/snowflake-sqlalchemy/issues/388
#
# As of May 2025, snowflake-sqlalchemy is in maintenance mode. I have commented on the
# above issue and we are pinning to a safe version.
"snowflake-sqlalchemy>=1.4.3, <1.7.4",
"snowflake-connector-python>=3.4.0",
"pandas",
"cryptography",
"msal",
*cachetools_lib,
}
trino = {
"trino[sqlalchemy]>=0.308",
}
pyhive_common = {
# DataHub maintains a fork of PyHive
# - 0.6.11 adds support for table comments and column comments,
# and also releases HTTP and HTTPS transport schemes
# - 0.6.12 adds support for Spark Thrift Server
# - 0.6.13 adds a small fix for Databricks
# - 0.6.14 uses pure-sasl instead of sasl so it builds on Python 3.11
# - 0.6.15 adds support for thrift > 0.14 (cherry-picked from https://github.com/apache/thrift/pull/2491)
# - 0.6.16 fixes a regression in 0.6.15 (https://github.com/acryldata/PyHive/pull/9)
"acryl-pyhive[hive-pure-sasl]==0.6.16",
# As per https://github.com/datahub-project/datahub/issues/8405
# and https://github.com/dropbox/PyHive/issues/417, version 0.14.0
# of thrift broke PyHive's hive+http transport.
# Fixed by https://github.com/apache/thrift/pull/2491 in version 0.17.0
# which is unfortunately not on PyPi.
# Instead, we put the fix in our PyHive fork, so no thrift pin is needed.
}
microsoft_common = {
"msal>=1.31.1",
}
iceberg_common = {
# Iceberg Python SDK
# The lower bound is kept at 0.4.0 because newer versions require pydantic v2; bump this once we're ready to move to pydantic v2.
"pyiceberg>=0.4.0",
*cachetools_lib,
}
mssql_common = {
"sqlalchemy-pytds>=0.3",
"pyOpenSSL",
}
postgres_common = {
"psycopg2-binary",
"GeoAlchemy2",
}
s3_base = {
*aws_common,
"more-itertools>=8.12.0",
"parse>=1.19.0",
"pyarrow>=6.0.1",
"tableschema>=1.20.2",
# ujson 5.2.0 has the JSONDecodeError exception type, which we need for error handling.
"ujson>=5.2.0",
"smart-open[s3]>=5.2.1",
# moto 5.0.0 drops support for Python 3.7
"moto[s3]<5.0.0",
*path_spec_common,
}
threading_timeout_common = {
"stopit==1.1.2",
# stopit uses pkg_resources internally, which means there's an implied
# dependency on setuptools.
"setuptools",
}
abs_base = {
"azure-core>=1.31.0",
"azure-identity>=1.21.0",
"azure-storage-blob>=12.19.0",
"azure-storage-file-datalake>=12.14.0",
"more-itertools>=8.12.0",
"pyarrow>=6.0.1",
"smart-open[azure]>=5.2.1",
"tableschema>=1.20.2",
"ujson>=5.2.0",
*path_spec_common,
}
data_lake_profiling = {
"pydeequ>=1.1.0",
"pyspark~=3.5.0",
# cachetools is used by the profiling config
*cachetools_lib,
}
delta_lake = {
*s3_base,
*abs_base,
# Version 0.18.0 broken on ARM Macs: https://github.com/delta-io/delta-rs/issues/2577
"deltalake>=0.6.3, != 0.6.4, != 0.18.0; platform_system == 'Darwin' and platform_machine == 'arm64'",
"deltalake>=0.6.3, != 0.6.4; platform_system != 'Darwin' or platform_machine != 'arm64'",
}
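# For illustration: pip evaluates the PEP 508 environment markers above at install
# time, so Apple Silicon Macs get the first spec (which also excludes 0.18.0) and
# every other platform gets the second. Roughly what happens under the hood, using
# the packaging library:
#
#   from packaging.markers import Marker
#   Marker("platform_system == 'Darwin' and platform_machine == 'arm64'").evaluate()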
powerbi_report_server = {"requests", "requests_ntlm"}
slack = {
"slack-sdk==3.18.1",
"tenacity>=8.0.1",
}
databricks = {
# 0.1.11 appears to have authentication issues with azure databricks
# 0.22.0 has support for `include_browse` in metadata list apis
"databricks-sdk>=0.30.0",
"pyspark~=3.5.0",
"requests",
# Version 2.4.0 includes the sqlalchemy dialect, and 2.8.0 includes some bug fixes.
# Version 3.0.0 requires SQLAlchemy > 2.0.21.
"databricks-sql-connector>=2.8.0,<3.0.0",
# Due to https://github.com/databricks/databricks-sql-python/issues/326
# databricks-sql-connector<3.0.0 requires pandas<2.2.0
"pandas<2.2.0",
}
mysql = {"pymysql>=1.0.2"}
sac = {
"requests",
"pyodata>=1.11.1",
"Authlib",
}
superset_common = {
"requests",
*sqlglot_lib,
}
# Note: for all of these, framework_common will be added.
plugins: Dict[str, Set[str]] = {
# Sink plugins.
"datahub-kafka": kafka_common,
"datahub-rest": rest_common,
"sync-file-emitter": {"filelock"},
"datahub-lite": {
"duckdb>=1.0.0",
# duckdb dropped support for python 3.8 in 1.3.0
"duckdb<1.3.0; python_version < '3.9'",
"fastapi",
"uvicorn",
},
# Integrations.
"airflow": {
f"acryl-datahub-airflow-plugin{_self_pin}",
},
"circuit-breaker": {
"gql>=3.3.0",
"gql[requests]>=3.3.0",
},
# TODO: Eventually we should reorganize our imports so that this depends on sqlalchemy_lib
# but not the full sql_common.
"datahub": sql_common | mysql | kafka_common,
"great-expectations": {
f"acryl-datahub-gx-plugin{_self_pin}",
},
# Misc plugins.
"sql-parser": sqlglot_lib,
# Source plugins
# sqlalchemy-bigquery is included here since it provides an implementation of
# a SQLAlchemy-compliant STRUCT type definition.
"athena": sql_common
# We need to exclude tenacity 8.4.0, since that release was missing its
# asyncio submodule.
# https://github.com/jd/tenacity/issues/471
| {
"PyAthena[SQLAlchemy]>=2.6.0,<3.0.0",
"sqlalchemy-bigquery>=1.4.1",
"tenacity!=8.4.0",
},
"azure-ad": set(),
"bigquery": sql_common
| bigquery_common
| sqlglot_lib
| classification_lib
| {
"google-cloud-datacatalog-lineage==0.2.2",
},
"bigquery-slim": bigquery_common,
"bigquery-queries": sql_common | bigquery_common | sqlglot_lib,
"clickhouse": sql_common | clickhouse_common,
"clickhouse-usage": sql_common | usage_common | clickhouse_common,
"cockroachdb": sql_common | postgres_common | {"sqlalchemy-cockroachdb<2.0.0"},
"datahub-lineage-file": set(),
"datahub-business-glossary": set(),
"delta-lake": {*data_lake_profiling, *delta_lake},
"dbt": {"requests"} | dbt_common | aws_common,
"dbt-cloud": {"requests"} | dbt_common,
"dremio": {"requests"} | sql_common,
"druid": sql_common | {"pydruid>=0.6.2"},
"dynamodb": aws_common | classification_lib,
# Starting with 7.14.0, the Python client checks whether it is connected to a genuine
# Elasticsearch server and throws UnsupportedProductError if it is not.
# https://www.elastic.co/guide/en/elasticsearch/client/python-api/current/release-notes.html#rn-7-14-0
# https://github.com/elastic/elasticsearch-py/issues/1639#issuecomment-883587433
"elasticsearch": {"elasticsearch==7.13.4", *cachetools_lib},
"cassandra": {
"cassandra-driver>=3.28.0",
# We were seeing an error like this `numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject`
# with numpy 2.0. This likely indicates a mismatch between scikit-learn and numpy versions.
# https://stackoverflow.com/questions/40845304/runtimewarning-numpy-dtype-size-changed-may-indicate-binary-incompatibility
"numpy<2",
*cachetools_lib,
},
"feast": {
"feast>=0.34.0,<1",
"flask-openid>=1.3.0",
"dask[dataframe]<2024.7.0",
# We were seeing an error like this `numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject`
# with numpy 2.0. This likely indicates a mismatch between scikit-learn and numpy versions.
# https://stackoverflow.com/questions/40845304/runtimewarning-numpy-dtype-size-changed-may-indicate-binary-incompatibility
"numpy<2",
},
"grafana": {"requests"},
"glue": aws_common | cachetools_lib,
# hdbcli is officially supported by SAP; sqlalchemy-hana is built on top of it but is not officially supported.
"hana": sql_common
| {
"sqlalchemy-hana>=0.5.0; platform_machine != 'aarch64' and platform_machine != 'arm64'",
"hdbcli>=2.11.20; platform_machine != 'aarch64' and platform_machine != 'arm64'",
},
"hive": sql_common
| pyhive_common
| {
"databricks-dbapi",
*great_expectations_lib,
},
# Keep in sync with presto-on-hive until presto-on-hive is removed.
"hive-metastore": sql_common
| pyhive_common
| {"psycopg2-binary", "pymysql>=1.0.2"},
"iceberg": iceberg_common,
"iceberg-catalog": aws_common,
"json-schema": set(),
"kafka": kafka_common | kafka_protobuf,
"kafka-connect": sql_common | {"requests", "JPype1"},
"ldap": {"python-ldap>=2.4"},
"looker": looker_common,
"lookml": looker_common,
"metabase": {"requests"} | sqlglot_lib,
"mlflow": {
"mlflow-skinny>=2.3.0,<2.21.0",
# Pinned to avoid the breaking change introduced in MLflow 2.21.0 where search_registered_models injects an implicit filter
# https://github.com/mlflow/mlflow/pull/14795
# Upper bound can be removed once the upstream issue is resolved,
# or we have a reliable and backward-compatible way to handle prompt filtering.
# It's technically wrong for packages to depend on setuptools, but mlflow seems to do it anyway.
"setuptools",
},
"datahub-debug": {
"dnspython==2.7.0",
"requests"
},
"mode": {"requests", "python-liquid", "tenacity>=8.0.1"} | sqlglot_lib,
"mongodb": {"pymongo[srv]>=3.11", "packaging"},
"mssql": sql_common | mssql_common,
"mssql-odbc": sql_common | mssql_common | {"pyodbc"},
"mysql": sql_common | mysql,
# mariadb should have the same dependencies as mysql
"mariadb": sql_common | mysql,
"okta": {"okta~=1.7.0", "nest-asyncio"},
"oracle": sql_common | {"oracledb"},
"postgres": sql_common | postgres_common,
"presto": sql_common | pyhive_common | trino,
# presto-on-hive is an alias for hive-metastore and needs to be kept in sync
"presto-on-hive": sql_common
| pyhive_common
| {"psycopg2-binary", "pymysql>=1.0.2"},
"pulsar": {"requests"},
"redash": {"redash-toolbelt", "sql-metadata"} | sqlglot_lib,
"redshift": sql_common
| redshift_common
| usage_common
| sqlglot_lib
| classification_lib
| {"db-dtypes"} # Pandas extension data types
| cachetools_lib,
"s3": {*s3_base, *data_lake_profiling},
"gcs": {*s3_base, *data_lake_profiling},
"abs": {*abs_base, *data_lake_profiling},
"sagemaker": aws_common,
"salesforce": {"simple-salesforce", *cachetools_lib},
"snowflake": snowflake_common | sql_common | usage_common | sqlglot_lib,
"snowflake-slim": snowflake_common,
"snowflake-summary": snowflake_common | sql_common | usage_common | sqlglot_lib,
"snowflake-queries": snowflake_common | sql_common | usage_common | sqlglot_lib,
"sqlalchemy": sql_common,
"sql-queries": usage_common | sqlglot_lib,
"slack": slack,
"superset": superset_common,
"preset": superset_common,
"tableau": {"tableauserverclient>=0.24.0"} | sqlglot_lib,
"teradata": sql_common
| usage_common
| sqlglot_lib
| {
# On 2024-10-30, teradatasqlalchemy 20.0.0.2 was released. This version seemed to cause issues
# in our CI, so we're pinning the version for now.
"teradatasqlalchemy>=17.20.0.0,<=20.0.0.2",
},
"trino": sql_common | trino,
"starburst-trino-usage": sql_common | usage_common | trino,
"nifi": {"requests", "packaging", "requests-gssapi"},
"powerbi": (
microsoft_common
| {"lark[regex]==1.1.4", "sqlparse", "more-itertools"}
| sqlglot_lib
| threading_timeout_common
),
"powerbi-report-server": powerbi_report_server,
"vertica": sql_common | {"vertica-sqlalchemy-dialect[vertica-python]==0.0.8.2"},
"unity-catalog": databricks | sql_common,
# databricks is alias for unity-catalog and needs to be kept in sync
"databricks": databricks | sql_common,
"fivetran": snowflake_common | bigquery_common | sqlalchemy_lib | sqlglot_lib,
"qlik-sense": sqlglot_lib | {"requests", "websocket-client"},
"sigma": sqlglot_lib | {"requests"},
"sac": sac,
"neo4j": {"pandas", "neo4j"},
"vertexai": {"google-cloud-aiplatform>=1.80.0"},
}
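# For illustration, given the extras_require wiring further below: installing a plugin
# extra pulls in framework_common plus the plugin's own set (plus the pydantic<2 pin,
# unless the plugin is on the pydantic-v2-tested list). A hypothetical shell example:
#
#   pip install 'acryl-datahub[bigquery]'
#   # => framework_common | pydantic_no_v2 | plugins["bigquery"]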
# This is mainly used to exclude plugins from the Docker image.
all_exclude_plugins: Set[str] = {
# The Airflow extra is only retained for compatibility, but new users should
# be using the datahub-airflow-plugin package instead.
"airflow",
# The great-expectations extra is only retained for compatibility, but new users should
# be using the datahub-gx-plugin package instead.
"great-expectations",
# SQL Server ODBC requires additional drivers, and so we don't want to keep
# it included in the default "all" installation.
"mssql-odbc",
# duckdb doesn't have a prebuilt wheel for Linux armv7l or aarch64, so we
# simply exclude it.
"datahub-lite",
# Feast tends to have overly restrictive dependencies and hence doesn't
# play nice with the "all" installation.
"feast",
}
mypy_stubs = {
"types-dataclasses",
"types-six",
"types-python-dateutil",
# We need to avoid 2.31.0.5 and 2.31.0.4 due to
# https://github.com/python/typeshed/issues/10764. Once that
# issue is resolved, we can remove the upper bound and change it
# to a != constraint.
# We have a PR up to fix the underlying issue: https://github.com/python/typeshed/pull/10776.
"types-requests>=2.28.11.6,<=2.31.0.3",
"types-toml",
"types-PyMySQL",
"types-PyYAML",
"types-cachetools",
# versions 0.1.13 and 0.1.14 seem to have issues
"types-click==0.1.12",
# The boto3-stubs package seems to have regularly breaking minor releases,
# we pin to a specific version to avoid this.
"boto3-stubs[s3,glue,sagemaker,sts,dynamodb, lakeformation]==1.28.15",
"mypy-boto3-sagemaker==1.28.15", # For some reason, above pin only restricts `mypy-boto3-sagemaker<1.29.0,>=1.28.0`
"types-tabulate",
# avrogen package requires this
"types-pytz",
"types-pyOpenSSL",
"types-click-spinner>=0.1.13.1",
"types-ujson>=5.2.0",
"types-Deprecated",
"types-protobuf>=4.21.0.1",
"sqlalchemy2-stubs",
}
test_api_requirements = {
"pytest>=6.2.2",
"pytest-timeout",
# deepdiff 8.0.0 is missing its numpy requirement.
"deepdiff!=8.0.0",
"orderly-set!=5.4.0", # 5.4.0 uses invalid types on Python 3.8
"PyYAML",
"pytest-docker>=1.1.0",
}
debug_requirements = {
"memray",
}
lint_requirements = {
# This is pinned only to avoid spurious errors in CI.
# We should make an effort to keep it up to date.
"ruff==0.11.7",
"mypy==1.14.1",
}
base_dev_requirements = {
*base_requirements,
*framework_common,
*mypy_stubs,
*s3_base,
*lint_requirements,
*test_api_requirements,
"coverage>=5.1",
"faker>=18.4.0",
"pytest-asyncio>=0.16.0",
"pytest-cov>=2.8.1",
"pytest-random-order~=1.1.0",
"requests-mock",
"freezegun",
"jsonpickle",
"build",
"twine",
*list(
dependency
for plugin in [
"abs",
"athena",
"bigquery",
"clickhouse",
"clickhouse-usage",
"cockroachdb",
"delta-lake",
"dremio",
"druid",
"elasticsearch",
"feast",
"iceberg",
"iceberg-catalog",
"mlflow",
"json-schema",
"ldap",
"looker",
"lookml",
"glue",
"mariadb",
"okta",
"oracle",
"postgres",
"sagemaker",
"kafka",
"datahub-rest",
"datahub-lite",
"presto",
"redash",
"redshift",
"s3",
"snowflake",
"slack",
"tableau",
"teradata",
"trino",
"hive",
"starburst-trino-usage",
"powerbi",
"powerbi-report-server",
"salesforce",
"unity-catalog",
"nifi",
"vertica",
"mode",
"fivetran",
"kafka-connect",
"qlik-sense",
"sigma",
"sac",
"cassandra",
"neo4j",
"vertexai",
]
if plugin
for dependency in plugins[plugin]
),
*pydantic_no_v2,
}
dev_requirements = {
*base_dev_requirements,
}
full_test_dev_requirements = {
*list(
dependency
for plugin in [
"athena",
"circuit-breaker",
"clickhouse",
"delta-lake",
"druid",
"feast",
"hana",
"hive",
"iceberg",
"iceberg-catalog",
"kafka-connect",
"ldap",
"mongodb",
"slack",
"mssql",
"mysql",
"mariadb",
"redash",
"vertica",
"vertexai",
]
if plugin
for dependency in plugins[plugin]
),
}
entry_points = {
"console_scripts": ["datahub = datahub.entrypoints:main"],
"datahub.ingestion.source.plugins": [
"abs = datahub.ingestion.source.abs.source:ABSSource",
"csv-enricher = datahub.ingestion.source.csv_enricher:CSVEnricherSource",
"file = datahub.ingestion.source.file:GenericFileSource",
"datahub = datahub.ingestion.source.datahub.datahub_source:DataHubSource",
"sqlalchemy = datahub.ingestion.source.sql.sql_generic:SQLAlchemyGenericSource",
"athena = datahub.ingestion.source.sql.athena:AthenaSource",
"azure-ad = datahub.ingestion.source.identity.azure_ad:AzureADSource",
"bigquery = datahub.ingestion.source.bigquery_v2.bigquery:BigqueryV2Source",
"bigquery-queries = datahub.ingestion.source.bigquery_v2.bigquery_queries:BigQueryQueriesSource",
"clickhouse = datahub.ingestion.source.sql.clickhouse:ClickHouseSource",
"clickhouse-usage = datahub.ingestion.source.usage.clickhouse_usage:ClickHouseUsageSource",
"cockroachdb = datahub.ingestion.source.sql.cockroachdb:CockroachDBSource",
"delta-lake = datahub.ingestion.source.delta_lake:DeltaLakeSource",
"s3 = datahub.ingestion.source.s3:S3Source",
"dbt = datahub.ingestion.source.dbt.dbt_core:DBTCoreSource",
"dbt-cloud = datahub.ingestion.source.dbt.dbt_cloud:DBTCloudSource",
"dremio = datahub.ingestion.source.dremio.dremio_source:DremioSource",
"druid = datahub.ingestion.source.sql.druid:DruidSource",
"dynamodb = datahub.ingestion.source.dynamodb.dynamodb:DynamoDBSource",
"elasticsearch = datahub.ingestion.source.elastic_search:ElasticsearchSource",
"feast = datahub.ingestion.source.feast:FeastRepositorySource",
"grafana = datahub.ingestion.source.grafana.grafana_source:GrafanaSource",
"glue = datahub.ingestion.source.aws.glue:GlueSource",
"sagemaker = datahub.ingestion.source.aws.sagemaker:SagemakerSource",
"hana = datahub.ingestion.source.sql.hana:HanaSource",
"hive = datahub.ingestion.source.sql.hive:HiveSource",
"hive-metastore = datahub.ingestion.source.sql.hive_metastore:HiveMetastoreSource",
"json-schema = datahub.ingestion.source.schema.json_schema:JsonSchemaSource",
"kafka = datahub.ingestion.source.kafka.kafka:KafkaSource",
"kafka-connect = datahub.ingestion.source.kafka_connect.kafka_connect:KafkaConnectSource",
"ldap = datahub.ingestion.source.ldap:LDAPSource",
"looker = datahub.ingestion.source.looker.looker_source:LookerDashboardSource",
"lookml = datahub.ingestion.source.looker.lookml_source:LookMLSource",
"datahub-gc = datahub.ingestion.source.gc.datahub_gc:DataHubGcSource",
"datahub-debug = datahub.ingestion.source.debug.datahub_debug:DataHubDebugSource",
"datahub-apply = datahub.ingestion.source.apply.datahub_apply:DataHubApplySource",
"datahub-lineage-file = datahub.ingestion.source.metadata.lineage:LineageFileSource",
"datahub-business-glossary = datahub.ingestion.source.metadata.business_glossary:BusinessGlossaryFileSource",
"mlflow = datahub.ingestion.source.mlflow:MLflowSource",
"mode = datahub.ingestion.source.mode:ModeSource",
"mongodb = datahub.ingestion.source.mongodb:MongoDBSource",
"mssql = datahub.ingestion.source.sql.mssql:SQLServerSource",
"mysql = datahub.ingestion.source.sql.mysql:MySQLSource",
"mariadb = datahub.ingestion.source.sql.mariadb.MariaDBSource",
"okta = datahub.ingestion.source.identity.okta:OktaSource",
"oracle = datahub.ingestion.source.sql.oracle:OracleSource",
"postgres = datahub.ingestion.source.sql.postgres:PostgresSource",
"redash = datahub.ingestion.source.redash:RedashSource",
"redshift = datahub.ingestion.source.redshift.redshift:RedshiftSource",
"slack = datahub.ingestion.source.slack.slack:SlackSource",
"snowflake = datahub.ingestion.source.snowflake.snowflake_v2:SnowflakeV2Source",
"snowflake-summary = datahub.ingestion.source.snowflake.snowflake_summary:SnowflakeSummarySource",
"snowflake-queries = datahub.ingestion.source.snowflake.snowflake_queries:SnowflakeQueriesSource",
"superset = datahub.ingestion.source.superset:SupersetSource",
"preset = datahub.ingestion.source.preset:PresetSource",
"tableau = datahub.ingestion.source.tableau.tableau:TableauSource",
"openapi = datahub.ingestion.source.openapi:OpenApiSource",
"metabase = datahub.ingestion.source.metabase:MetabaseSource",
"teradata = datahub.ingestion.source.sql.teradata:TeradataSource",
"trino = datahub.ingestion.source.sql.trino:TrinoSource",
"starburst-trino-usage = datahub.ingestion.source.usage.starburst_trino_usage:TrinoUsageSource",
"nifi = datahub.ingestion.source.nifi:NifiSource",
"powerbi = datahub.ingestion.source.powerbi.powerbi:PowerBiDashboardSource",
"powerbi-report-server = datahub.ingestion.source.powerbi_report_server:PowerBiReportServerDashboardSource",
"iceberg = datahub.ingestion.source.iceberg.iceberg:IcebergSource",
"vertica = datahub.ingestion.source.sql.vertica:VerticaSource",
"presto = datahub.ingestion.source.sql.presto:PrestoSource",
# This is only here for backward compatibility. Use the `hive-metastore` source instead.
"presto-on-hive = datahub.ingestion.source.sql.hive_metastore:HiveMetastoreSource",
"pulsar = datahub.ingestion.source.pulsar:PulsarSource",
"salesforce = datahub.ingestion.source.salesforce:SalesforceSource",
"demo-data = datahub.ingestion.source.demo_data.DemoDataSource",
"unity-catalog = datahub.ingestion.source.unity.source:UnityCatalogSource",
"gcs = datahub.ingestion.source.gcs.gcs_source:GCSSource",
"sql-queries = datahub.ingestion.source.sql_queries:SqlQueriesSource",
"fivetran = datahub.ingestion.source.fivetran.fivetran:FivetranSource",
"qlik-sense = datahub.ingestion.source.qlik_sense.qlik_sense:QlikSenseSource",
"sigma = datahub.ingestion.source.sigma.sigma:SigmaSource",
"sac = datahub.ingestion.source.sac.sac:SACSource",
"cassandra = datahub.ingestion.source.cassandra.cassandra:CassandraSource",
"neo4j = datahub.ingestion.source.neo4j.neo4j_source:Neo4jSource",
"vertexai = datahub.ingestion.source.vertexai.vertexai:VertexAISource",
"hex = datahub.ingestion.source.hex.hex:HexSource",
],
"datahub.ingestion.transformer.plugins": [
"pattern_cleanup_ownership = datahub.ingestion.transformer.pattern_cleanup_ownership:PatternCleanUpOwnership",
"simple_remove_dataset_ownership = datahub.ingestion.transformer.remove_dataset_ownership:SimpleRemoveDatasetOwnership",
"mark_dataset_status = datahub.ingestion.transformer.mark_dataset_status:MarkDatasetStatus",
"set_dataset_browse_path = datahub.ingestion.transformer.add_dataset_browse_path:AddDatasetBrowsePathTransformer",
"add_dataset_ownership = datahub.ingestion.transformer.add_dataset_ownership:AddDatasetOwnership",
"simple_add_dataset_ownership = datahub.ingestion.transformer.add_dataset_ownership:SimpleAddDatasetOwnership",
"pattern_add_dataset_ownership = datahub.ingestion.transformer.add_dataset_ownership:PatternAddDatasetOwnership",
"add_dataset_domain = datahub.ingestion.transformer.dataset_domain:AddDatasetDomain",
"simple_add_dataset_domain = datahub.ingestion.transformer.dataset_domain:SimpleAddDatasetDomain",
"pattern_add_dataset_domain = datahub.ingestion.transformer.dataset_domain:PatternAddDatasetDomain",
"add_dataset_tags = datahub.ingestion.transformer.add_dataset_tags:AddDatasetTags",
"simple_add_dataset_tags = datahub.ingestion.transformer.add_dataset_tags:SimpleAddDatasetTags",
"pattern_add_dataset_tags = datahub.ingestion.transformer.add_dataset_tags:PatternAddDatasetTags",
"extract_dataset_tags = datahub.ingestion.transformer.extract_dataset_tags:ExtractDatasetTags",
"add_dataset_terms = datahub.ingestion.transformer.add_dataset_terms:AddDatasetTerms",
"simple_add_dataset_terms = datahub.ingestion.transformer.add_dataset_terms:SimpleAddDatasetTerms",
"pattern_add_dataset_terms = datahub.ingestion.transformer.add_dataset_terms:PatternAddDatasetTerms",
"add_dataset_properties = datahub.ingestion.transformer.add_dataset_properties:AddDatasetProperties",
"simple_add_dataset_properties = datahub.ingestion.transformer.add_dataset_properties:SimpleAddDatasetProperties",
"pattern_add_dataset_schema_terms = datahub.ingestion.transformer.add_dataset_schema_terms:PatternAddDatasetSchemaTerms",
"pattern_add_dataset_schema_tags = datahub.ingestion.transformer.add_dataset_schema_tags:PatternAddDatasetSchemaTags",
"extract_ownership_from_tags = datahub.ingestion.transformer.extract_ownership_from_tags:ExtractOwnersFromTagsTransformer",
"add_dataset_dataproduct = datahub.ingestion.transformer.add_dataset_dataproduct:AddDatasetDataProduct",
"simple_add_dataset_dataproduct = datahub.ingestion.transformer.add_dataset_dataproduct:SimpleAddDatasetDataProduct",
"pattern_add_dataset_dataproduct = datahub.ingestion.transformer.add_dataset_dataproduct:PatternAddDatasetDataProduct",
"replace_external_url = datahub.ingestion.transformer.replace_external_url:ReplaceExternalUrlDataset",
"replace_external_url_container = datahub.ingestion.transformer.replace_external_url:ReplaceExternalUrlContainer",
"pattern_cleanup_dataset_usage_user = datahub.ingestion.transformer.pattern_cleanup_dataset_usage_user:PatternCleanupDatasetUsageUser",
"domain_mapping_based_on_tags = datahub.ingestion.transformer.dataset_domain_based_on_tags:DatasetTagDomainMapper",
"tags_to_term = datahub.ingestion.transformer.tags_to_terms:TagsToTermMapper",
],
"datahub.ingestion.sink.plugins": [
"file = datahub.ingestion.sink.file:FileSink",
"console = datahub.ingestion.sink.console:ConsoleSink",
"blackhole = datahub.ingestion.sink.blackhole:BlackHoleSink",
"datahub-kafka = datahub.ingestion.sink.datahub_kafka:DatahubKafkaSink",
"datahub-rest = datahub.ingestion.sink.datahub_rest:DatahubRestSink",
"datahub-lite = datahub.ingestion.sink.datahub_lite:DataHubLiteSink",
],
"datahub.ingestion.checkpointing_provider.plugins": [
"datahub = datahub.ingestion.source.state_provider.datahub_ingestion_checkpointing_provider:DatahubIngestionCheckpointingProvider",
"file = datahub.ingestion.source.state_provider.file_ingestion_checkpointing_provider:FileIngestionCheckpointingProvider",
],
"datahub.ingestion.reporting_provider.plugins": [
"datahub = datahub.ingestion.reporting.datahub_ingestion_run_summary_provider:DatahubIngestionRunSummaryProvider",
"file = datahub.ingestion.reporting.file_reporter:FileReporter",
],
"datahub.custom_packages": [],
"datahub.fs.plugins": [
"s3 = datahub.ingestion.fs.s3_fs:S3FileSystem",
"file = datahub.ingestion.fs.local_fs:LocalFileSystem",
"http = datahub.ingestion.fs.http_fs:HttpFileSystem",
],
}
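# A rough sketch of how these entry points are consumed at runtime (illustrative only;
# the actual lookup is handled by DataHub's plugin registry, and on Python < 3.10 it
# goes through the importlib_metadata backport pinned above):
#
#   from importlib.metadata import entry_points
#   eps = entry_points().select(group="datahub.ingestion.source.plugins")  # Python 3.10+ API
#   mysql_source_cls = next(ep for ep in eps if ep.name == "mysql").load()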
setuptools.setup(
# Package metadata.
name=package_metadata["__package_name__"],
version=_version,
url="https://docs.datahub.com/",
project_urls={
"Documentation": "https://docs.datahub.com/docs/",
"Source": "https://github.com/datahub-project/datahub",
"Changelog": "https://github.com/datahub-project/datahub/releases",
"Releases": "https://github.com/acryldata/datahub/releases",
},
license="Apache License 2.0",
description="A CLI to work with DataHub metadata",
long_description="""\
The `acryl-datahub` package contains a CLI and SDK for interacting with DataHub,
as well as an integration framework for pulling/pushing metadata from external systems.
See the [DataHub docs](https://docs.datahub.com/docs/metadata-ingestion).
""",
long_description_content_type="text/markdown",
classifiers=[
"Development Status :: 5 - Production/Stable",
"Programming Language :: Python",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3 :: Only",
"Intended Audience :: Developers",
"Intended Audience :: Information Technology",
"Intended Audience :: System Administrators",
"License :: OSI Approved",
"License :: OSI Approved :: Apache Software License",
"Operating System :: Unix",
"Operating System :: POSIX :: Linux",
"Environment :: Console",
"Environment :: MacOS X",
"Topic :: Software Development",
],
# Package info.
zip_safe=False,
python_requires=">=3.8",
package_dir={"": "src"},
packages=setuptools.find_namespace_packages(where="./src"),
package_data={
"datahub": ["py.typed"],
"datahub.metadata": ["schema.avsc"],
"datahub.metadata.schemas": ["*.avsc"],
"datahub.ingestion.source.powerbi": ["powerbi-lexical-grammar.rule"],
},
entry_points=entry_points,
# Dependencies.
install_requires=list(base_requirements | framework_common),
extras_require={
"base": list(framework_common),
**{
plugin: list(
framework_common
| (
# While pydantic v2 support is experimental, require that all plugins
# continue to use v1. This will ensure that no ingestion recipes break.
pydantic_no_v2
if plugin
not in {
"airflow",
"datahub-rest",
"datahub-kafka",
"sync-file-emitter",
"sql-parser",
# Some sources have been manually tested for compatibility with pydantic v2.
"iceberg",
"feast",
"bigquery-slim",
"snowflake-slim",
"mysql", # tested in smoke-test
}
else set()
)
| dependencies
)
for (plugin, dependencies) in plugins.items()
},
"all": list(
framework_common.union(
*[
requirements
for plugin, requirements in plugins.items()
if plugin not in all_exclude_plugins
]
)
),
"cloud": ["acryl-datahub-cloud"],
"dev": list(dev_requirements),
"lint": list(lint_requirements),
"testing-utils": list(test_api_requirements), # To import `datahub.testing`
"integration-tests": list(full_test_dev_requirements),
"debug": list(debug_requirements),
},
)
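# Example installs against the extras defined above (illustrative; pick the extras you need):
#
#   pip install acryl-datahub                 # core CLI and framework only
#   pip install 'acryl-datahub[snowflake]'    # adds the snowflake source plugin
#   pip install -e '.[dev]'                   # editable install with dev tooling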