# Copyright 2025 Collate
# Licensed under the Collate Community License, Version 1.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# https://github.com/open-metadata/OpenMetadata/blob/main/ingestion/LICENSE
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Python Dependencies
"""
import sys
from typing import Dict, List, Set
from setuptools import setup
# Add here versions required for multiple plugins
VERSIONS = {
"airflow": "apache-airflow==2.10.5",
"adlfs": "adlfs>=2023.1.0",
"avro": "avro>=1.11.3,<1.12",
"boto3": "boto3>=1.20,<2.0", # No need to add botocore separately. It's a dep from boto3
"geoalchemy2": "GeoAlchemy2~=0.12",
"google-cloud-monitoring": "google-cloud-monitoring>=2.0.0",
"google-cloud-storage": "google-cloud-storage>=1.43.0",
"gcsfs": "gcsfs>=2023.1.0",
"great-expectations": "great-expectations~=0.18.0",
"great-expectations-1xx": "great-expectations~=1.0",
"grpc-tools": "grpcio-tools>=1.47.2",
"msal": "msal~=1.2",
"neo4j": "neo4j~=5.3",
"pandas": "pandas~=2.0.0",
"pyarrow": "pyarrow~=16.0",
"pydantic": "pydantic~=2.0,>=2.7.0",
"pydantic-settings": "pydantic-settings~=2.0,>=2.7.0",
"pydomo": "pydomo~=0.3",
"pymysql": "pymysql~=1.0",
"pyodbc": "pyodbc>=4.0.35,<5",
"numpy": "numpy<2",
"scikit-learn": "scikit-learn~=1.0", # Python 3.7 only goes up to 1.0.2
"packaging": "packaging",
"azure-storage-blob": "azure-storage-blob~=12.14",
"azure-identity": "azure-identity~=1.12",
"sqlalchemy-databricks": "sqlalchemy-databricks~=0.1",
"databricks-sdk": "databricks-sdk>=0.18.0,<0.20.0",
"trino": "trino[sqlalchemy]",
"spacy": "spacy<3.8",
"looker-sdk": "looker-sdk>=22.20.0,!=24.18.0",
"lkml": "lkml~=1.3",
"tableau": "tableauserverclient==0.25", # higher versions require urllib3>2.0 which conflicts other libs
"pyhive": "pyhive[hive_pure_sasl]~=0.7",
"mongo": "pymongo~=4.3",
"redshift": "sqlalchemy-redshift==0.8.12",
"snowflake": "snowflake-sqlalchemy~=1.4",
"elasticsearch8": "elasticsearch8~=8.9.0",
"giturlparse": "giturlparse",
"validators": "validators~=0.22.0",
"teradata": "teradatasqlalchemy==20.0.0.2",
"cockroach": "sqlalchemy-cockroachdb~=2.0",
"cassandra": "cassandra-driver>=3.28.0",
"opensearch": "opensearch-py~=2.4.0",
"pydoris": "pydoris==1.0.2",
"pyiceberg": "pyiceberg==0.5.1",
"google-cloud-bigtable": "google-cloud-bigtable>=2.0.0",
"pyathena": "pyathena~=3.0",
"sqlalchemy-bigquery": "sqlalchemy-bigquery>=1.2.2",
"presidio-analyzer": "presidio-analyzer==2.2.358",
}
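# Note (illustrative, not part of the original build logic): the plugin sets
# below reference these shared pins by key, e.g.
#   VERSIONS["pymysql"]  # -> "pymysql~=1.0"
# so bumping a constraint here updates every plugin that consumes it.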
COMMONS = {
"datalake": {
VERSIONS["avro"],
VERSIONS["boto3"],
VERSIONS["pandas"],
VERSIONS["pyarrow"],
VERSIONS["numpy"],
# python-snappy does not work well on 3.11 https://github.com/aio-libs/aiokafka/discussions/931
# Using this as an alternative
"cramjam~=2.7",
},
"hive": {
"presto-types-parser>=0.0.2",
VERSIONS["pyhive"],
},
"kafka": {
VERSIONS["avro"],
"confluent_kafka>=2.1.1,<=2.6.1",
"fastavro>=1.2.0",
# Due to https://github.com/grpc/grpc/issues/30843#issuecomment-1303816925
# use >= v1.47.2 https://github.com/grpc/grpc/blob/v1.47.2/tools/distrib/python/grpcio_tools/grpc_version.py#L17
VERSIONS[
"grpc-tools"
], # grpcio-tools already depends on grpcio. No need to add separately
"protobuf",
},
"postgres": {
VERSIONS["pymysql"],
"psycopg2-binary",
VERSIONS["geoalchemy2"],
VERSIONS["packaging"],
}, # Added as PostgreSQL & Greenplum use common packages.
}
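# Note (illustrative): these shared groups are unpacked into the plugin sets
# below, e.g. "greenplum": {*COMMONS["postgres"]} pulls in pymysql,
# psycopg2-binary, GeoAlchemy2 and packaging in one go.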
DATA_DIFF = {
driver: f"collate-data-diff[{driver}]"
# data-diff supports a different set of drivers out of the box than
# OpenMetadata; the extras are described here:
# https://github.com/open-metadata/collate-data-diff/blob/main/pyproject.toml#L68
# install all data diffs with "pip install collate-data-diff[all-dbs]"
for driver in [
"clickhouse",
# "duckdb", # Not supported by OpenMetadata
"mssql",
"mysql",
"oracle",
# "postgresql", we dont use this as it installs psycopg2 which interferes with psycopg2-binary
"presto",
"redshift",
"snowflake",
"trino",
"vertica",
]
}
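# Note (illustrative): the comprehension above produces entries such as
#   DATA_DIFF["mysql"]  # -> "collate-data-diff[mysql]"
# i.e. the base collate-data-diff package plus the driver-specific extra.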
base_requirements = {
"antlr4-python3-runtime==4.9.2",
VERSIONS["azure-identity"],
"azure-keyvault-secrets", # Azure Key Vault SM
VERSIONS["boto3"], # Required in base for the secrets manager
"cached-property==1.5.2", # LineageParser
"chardet==4.0.0", # Used in the profiler
"cryptography>=42.0.0",
"google-cloud-secret-manager>=2.19.0,<2.20.1",
"google-crc32c",
"email-validator>=2.0", # For the pydantic generated models for Email
"importlib-metadata>=4.13.0", # From airflow constraints
"Jinja2>=2.11.3",
"jsonpatch<2.0, >=1.24",
"memory-profiler",
"mypy_extensions>=0.4.3",
VERSIONS["pydantic"],
VERSIONS["pydantic-settings"],
VERSIONS["pymysql"],
"python-dateutil>=2.8.1",
"PyYAML~=6.0",
"requests>=2.23",
"requests-aws4auth~=1.1", # Only depends on requests as external package. Leaving as base.
"sqlalchemy>=1.4.0,<2",
"collate-sqllineage~=1.6.0",
"tabulate==0.9.0",
"typing-inspect",
"packaging", # For version parsing
"setuptools~=70.0",
"shapely",
"collate-data-diff",
# TODO: Remove once we have an updated data-diff version
"snowflake-connector-python>=3.13.1,<4.0.0",
"mysql-connector-python>=8.0.29;python_version<'3.9'",
"mysql-connector-python>=9.1;python_version>='3.9'",
}
plugins: Dict[str, Set[str]] = {
"airflow": {
"opentelemetry-exporter-otlp==1.27.0",
"protobuf<5",
"attrs",
VERSIONS["airflow"],
}, # Same as ingestion container. For development.
"amundsen": {VERSIONS["neo4j"]},
"athena": {VERSIONS["pyathena"]},
"atlas": {},
"azuresql": {VERSIONS["pyodbc"]},
"azure-sso": {VERSIONS["msal"]},
"backup": {VERSIONS["boto3"], VERSIONS["azure-identity"], "azure-storage-blob"},
"bigquery": {
"cachetools",
"google-cloud-datacatalog>=3.6.2",
"google-cloud-logging",
VERSIONS["pyarrow"],
VERSIONS["numpy"],
"sqlalchemy-bigquery>=1.2.2",
},
"bigtable": {
VERSIONS["google-cloud-bigtable"],
VERSIONS["pandas"],
VERSIONS["numpy"],
},
"clickhouse": {
"clickhouse-driver~=0.2",
"clickhouse-sqlalchemy~=0.2.0",
DATA_DIFF["clickhouse"],
},
"dagster": {
"croniter<3",
VERSIONS["pymysql"],
"psycopg2-binary",
VERSIONS["geoalchemy2"],
"dagster_graphql>=1.8.0",
},
"dbt": {
"google-cloud",
VERSIONS["boto3"],
VERSIONS["google-cloud-storage"],
"collate-dbt-artifacts-parser",
VERSIONS["azure-storage-blob"],
VERSIONS["azure-identity"],
},
"db2": {"ibm-db-sa~=0.4.1", "ibm-db>=2.0.0"},
"db2-ibmi": {"sqlalchemy-ibmi~=0.9.3"},
"databricks": {
VERSIONS["sqlalchemy-databricks"],
VERSIONS["databricks-sdk"],
"ndg-httpsclient~=0.5.1",
"pyOpenSSL~=24.1.0",
"pyasn1~=0.6.0",
# databricks has a dependency on pyhive for metadata as well as profiler
VERSIONS["pyhive"],
},
"datalake-azure": {
VERSIONS["azure-storage-blob"],
VERSIONS["azure-identity"],
VERSIONS["adlfs"],
*COMMONS["datalake"],
},
"datalake-gcs": {
VERSIONS["google-cloud-monitoring"],
VERSIONS["google-cloud-storage"],
VERSIONS["gcsfs"],
*COMMONS["datalake"],
},
"datalake-s3": {
*COMMONS["datalake"],
},
"deltalake": {
"delta-spark<=2.3.0",
"deltalake~=0.17,<0.20",
}, # TODO: remove pinning to under 0.20 after https://github.com/open-metadata/OpenMetadata/issues/17909
"deltalake-storage": {"deltalake~=0.17"},
"deltalake-spark": {"delta-spark<=2.3.0"},
"domo": {VERSIONS["pydomo"]},
"doris": {"pydoris==1.0.2"},
"druid": {"pydruid>=0.6.5"},
"dynamodb": {VERSIONS["boto3"]},
"elasticsearch": {
VERSIONS["elasticsearch8"],
"httpx>=0.23.0",
}, # also requires requests-aws4auth which is in base
"opensearch": {VERSIONS["opensearch"]},
"exasol": {"sqlalchemy_exasol>=5,<6"},
"glue": {VERSIONS["boto3"]},
"great-expectations": {VERSIONS["great-expectations"]},
"great-expectations-1xx": {VERSIONS["great-expectations-1xx"]},
"greenplum": {*COMMONS["postgres"]},
"cockroach": {
VERSIONS["cockroach"],
"psycopg2-binary",
},
"hive": {
*COMMONS["hive"],
"thrift>=0.13,<1",
# Replacing sasl with pure-sasl based on https://github.com/cloudera/python-sasl/issues/30 for py 3.11
"pure-sasl",
"thrift-sasl~=0.4",
"impyla~=0.18.0",
},
"iceberg": {
VERSIONS["pyiceberg"],
# Forcing the version of a few packages so they play nicely with other requirements.
VERSIONS["pydantic"],
VERSIONS["adlfs"],
VERSIONS["gcsfs"],
VERSIONS["pyarrow"],
},
"impala": {
"presto-types-parser>=0.0.2",
"impyla[kerberos]~=0.18.0",
"thrift>=0.13,<1",
"pure-sasl",
"thrift-sasl~=0.4",
},
"kafka": {*COMMONS["kafka"]},
"kafkaconnect": {"kafka-connect-py==0.10.11"},
"kinesis": {VERSIONS["boto3"]},
"looker": {
VERSIONS["looker-sdk"],
VERSIONS["lkml"],
"gitpython~=3.1.34",
VERSIONS["giturlparse"],
"python-liquid",
},
"mlflow": {"mlflow-skinny>=2.3.0"},
"mongo": {VERSIONS["mongo"], VERSIONS["pandas"], VERSIONS["numpy"]},
"cassandra": {VERSIONS["cassandra"]},
"couchbase": {"couchbase~=4.1"},
"mssql": {
"sqlalchemy-pytds~=0.3",
DATA_DIFF["mssql"],
},
"mssql-odbc": {
VERSIONS["pyodbc"],
DATA_DIFF["mssql"],
},
"mysql": {
VERSIONS["pymysql"],
DATA_DIFF["mysql"],
},
"nifi": {}, # uses requests
"openlineage": {*COMMONS["kafka"]},
"oracle": {"cx_Oracle>=8.3.0,<9", "oracledb~=1.2", DATA_DIFF["oracle"]},
"pgspider": {"psycopg2-binary", "sqlalchemy-pgspider"},
"pinotdb": {"pinotdb~=5.0"},
"postgres": {*COMMONS["postgres"]},
"powerbi": {
VERSIONS["msal"],
VERSIONS["boto3"],
VERSIONS["google-cloud-storage"],
VERSIONS["azure-storage-blob"],
VERSIONS["azure-identity"],
},
"qliksense": {"websocket-client~=1.6.1"},
"presto": {*COMMONS["hive"], DATA_DIFF["presto"]},
"pymssql": {"pymssql~=2.2.0"},
"quicksight": {VERSIONS["boto3"]},
"redash": {VERSIONS["packaging"]},
"redpanda": {*COMMONS["kafka"]},
"redshift": {
# Going higher has memory and performance issues
VERSIONS["redshift"],
"psycopg2-binary",
VERSIONS["geoalchemy2"],
},
"sagemaker": {VERSIONS["boto3"]},
"salesforce": {"simple_salesforce~=1.11", "authlib>=1.3.1"},
"sample-data": {
VERSIONS["avro"],
VERSIONS["grpc-tools"],
VERSIONS["sqlalchemy-bigquery"],
VERSIONS["presidio-analyzer"],
},
"sap-hana": {"hdbcli", "sqlalchemy-hana"},
"sas": {},
"singlestore": {VERSIONS["pymysql"]},
"sklearn": {VERSIONS["scikit-learn"]},
"snowflake": {VERSIONS["snowflake"], DATA_DIFF["snowflake"]},
"superset": {}, # uses requests
"tableau": {VERSIONS["tableau"], VERSIONS["validators"], VERSIONS["packaging"]},
"teradata": {VERSIONS["teradata"]},
"trino": {VERSIONS["trino"], DATA_DIFF["trino"]},
"vertica": {"sqlalchemy-vertica[vertica-python]>=0.0.5", DATA_DIFF["vertica"]},
"pii-processor": {
VERSIONS["spacy"],
VERSIONS["pandas"],
VERSIONS["numpy"],
VERSIONS["presidio-analyzer"],
},
"presidio-analyzer": {VERSIONS["presidio-analyzer"]},
}
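# Note (illustrative): every key in `plugins` is exposed as a pip extra of the
# same name through `extras_require` in setup() below, so the "mysql" entry
# here is what backs `pip install ".[mysql]"`.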
dev = {
"black==22.3.0",
"datamodel-code-generator==0.25.6",
"boto3-stubs",
"mypy-boto3-glue",
"isort",
"pre-commit",
"pycln",
"pylint~=3.2.0", # 3.3.0+ breaks our current linting
# For publishing
"twine",
"build",
*plugins["sample-data"],
}
# Dependencies for unit testing in addition to dev dependencies and plugins
test_unit = {
"pytest==7.0.0",
"pytest-cov",
"pytest-order",
"dirty-equals",
"faker==37.1.0", # The version needs to be fixed to prevent flaky tests!
# TODO: Remove once no unit test requires testcontainers
"testcontainers",
}
test = {
# Install Airflow as it's not part of the `all` plugin
"opentelemetry-exporter-otlp==1.27.0",
VERSIONS["airflow"],
"boto3-stubs",
"mypy-boto3-glue",
"coverage",
# Install GE because it's not in the `all` plugin
VERSIONS["great-expectations"],
"basedpyright~=1.14",
"pytest==7.0.0",
"pytest-cov",
"pytest-order",
"dirty-equals",
# install dbt dependency
"collate-dbt-artifacts-parser",
"freezegun",
VERSIONS["sqlalchemy-databricks"],
VERSIONS["databricks-sdk"],
VERSIONS["scikit-learn"],
VERSIONS["pyarrow"],
VERSIONS["trino"],
VERSIONS["spacy"],
VERSIONS["pydomo"],
VERSIONS["looker-sdk"],
VERSIONS["lkml"],
VERSIONS["tableau"],
VERSIONS["pyhive"],
VERSIONS["mongo"],
VERSIONS["cassandra"],
VERSIONS["redshift"],
VERSIONS["snowflake"],
VERSIONS["elasticsearch8"],
VERSIONS["giturlparse"],
VERSIONS["avro"], # Sample Data
VERSIONS["grpc-tools"],
VERSIONS["neo4j"],
VERSIONS["cockroach"],
VERSIONS["pydoris"],
VERSIONS["pyiceberg"],
"testcontainers==3.7.1;python_version<'3.9'",
"testcontainers~=4.8.0;python_version>='3.9'",
"minio==7.2.5",
*plugins["mlflow"],
*plugins["datalake-s3"],
*plugins["kafka"],
"kafka-python==2.0.2",
*plugins["pii-processor"],
"requests==2.31.0",
f"{DATA_DIFF['mysql']}",
*plugins["deltalake"],
*plugins["datalake-gcs"],
*plugins["pgspider"],
*plugins["clickhouse"],
*plugins["mssql"],
*plugins["dagster"],
*plugins["oracle"],
*plugins["mssql"],
VERSIONS["validators"],
VERSIONS["pyathena"],
VERSIONS["pyiceberg"],
VERSIONS["pydoris"],
"python-liquid",
VERSIONS["google-cloud-bigtable"],
*plugins["bigquery"],
"faker==37.1.0", # The version needs to be fixed to prevent flaky tests!
}
if sys.version_info >= (3, 9):
    test.add("locust~=2.32.0")
e2e_test = {
# playwright dependencies
"pytest-playwright",
"pytest-base-url",
}
# Define playwright_dependencies as a set of packages required for Playwright tests
# These packages correspond to the ingestion connectors used in Playwright tests
playwright_dependencies = {
*plugins["mysql"],
*plugins["bigquery"],
*plugins["kafka"],
*plugins["mlflow"],
*plugins["snowflake"],
*plugins["superset"],
*plugins["postgres"],
*plugins["redshift"],
*plugins["airflow"],
*plugins["datalake-s3"],
*plugins["dbt"],
*plugins["presidio-analyzer"],
*e2e_test
# Add other plugins as needed for Playwright tests
}
def filter_requirements(filtered: Set[str]) -> List[str]:
    """Return the base requirements plus every plugin's requirements,
    excluding the plugins listed in `filtered`."""
    return list(
        base_requirements.union(
            *[
                requirements
                for plugin, requirements in plugins.items()
                if plugin not in filtered
            ]
        )
    )
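# Illustrative usage: the "all" extra below is built as
#   filter_requirements({"airflow", "db2", "great-expectations"})
# i.e. the base requirements plus every plugin's requirements except those
# of the excluded plugins.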
setup(
install_requires=list(base_requirements),
extras_require={
"dev": list(dev),
"test": list(test),
"test-unit": list(test_unit),
"e2e_test": list(e2e_test),
"data-insight": list(plugins["elasticsearch"]),
**{plugin: list(dependencies) for (plugin, dependencies) in plugins.items()},
# FIXME: all-dev-env is a temporary solution to install all dependencies except
# those that might conflict with each other or cause issues in the dev environment
# This covers all development cases where none of the plugins are used
"all-dev-env": filter_requirements(
{"airflow", "db2", "great-expectations", "pymssql"}
),
# end-of-fixme
"all": filter_requirements({"airflow", "db2", "great-expectations"}),
"playwright": list(playwright_dependencies),
"slim": filter_requirements(
{
"airflow",
"db2",
"great-expectations",
"deltalake",
"deltalake-spark",
"sklearn",
}
),
},
)
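# Illustrative usage (assuming the published distribution is named
# "openmetadata-ingestion"; the name is not declared in this file):
#   pip install "openmetadata-ingestion[mysql]"        # one connector
#   pip install "openmetadata-ingestion[mysql,kafka]"  # several extras at once
#   pip install "openmetadata-ingestion[all]"          # everything except the excluded plugins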