diff --git a/ingestion/setup.py b/ingestion/setup.py index 4bd9c2e8464..d282179bd4b 100644 --- a/ingestion/setup.py +++ b/ingestion/setup.py @@ -22,182 +22,201 @@ def get_long_description(): return description +# Add here versions required for multiple plugins +VERSIONS = { + "airflow": "apache-airflow==2.3.3", + "avro-python3": "avro-python3~=1.10", + "boto3": "boto3~=1.26", # No need to add botocore separately. It's a dep from boto3 + "geoalchemy2": "GeoAlchemy2~=0.12", + "google-cloud-storage": "google-cloud-storage==1.43.0", + "great-expectations": "great-expectations~=0.15.0", + "grpc-tools": "grpcio-tools==1.47.2", + "msal": "msal~=1.2", + "neo4j": "neo4j~=4.4.0", + "pandas": "pandas==1.3.5", + "pyarrow": "pyarrow~=8.0", + "pydomo": "pydomo~=0.3", + "pymysql": "pymysql>=1.0.2", + "pyodbc": "pyodbc>=4.0.35,<5", + "scikit-learn": "scikit-learn~=1.0", # Python 3.7 only goes up to 1.0.2 +} + +COMMONS = { + "datalake": {VERSIONS["boto3"], VERSIONS["pandas"], VERSIONS["pyarrow"]}, + "hive": { + "presto-types-parser>=0.0.2", + "pyhive~=0.6", + }, + "kafka": { + "avro~=1.11", + VERSIONS["avro-python3"], + "confluent_kafka==1.8.2", + "fastavro>=1.2.0", + # Due to https://github.com/grpc/grpc/issues/30843#issuecomment-1303816925 + # we use v1.47.2 https://github.com/grpc/grpc/blob/v1.47.2/tools/distrib/python/grpcio_tools/grpc_version.py#L17 + VERSIONS[ + "grpc-tools" + ], # grpcio-tools already depends on grpcio. No need to add separately + "protobuf", + }, +} + + base_requirements = { + "antlr4-python3-runtime==4.9.2", + VERSIONS["avro-python3"], # Used in sample data + VERSIONS["boto3"], # Required in base for the secrets manager + "cached-property==1.5.2", + "chardet==4.0.0", + "croniter~=1.3.0", + "cryptography", "commonregex", - "idna<3,>=2.5", - "mypy_extensions>=0.4.3", - "typing-inspect", - "pydantic~=1.9.0", "email-validator>=1.0.3", "google>=3.0.0", "google-auth>=1.33.0", - "python-dateutil>=2.8.1", - "wheel~=0.38.4", - "setuptools~=65.6.3", - "python-jose==3.3.0", - "sqlalchemy>=1.4.0", - "requests>=2.23", - "cryptography", - "Jinja2>=2.11.3", - "PyYAML", - "jsonschema", - "sqllineage==1.3.7", - "antlr4-python3-runtime==4.9.2", - "boto3~=1.19.12", - "botocore==1.22.12", - "avro-python3==1.10.2", - # https://github.com/grpc/grpc/issues/30843#issuecomment-1303816925 - "grpcio<1.48.1", - "grpcio-tools==1.47.2", - # compatibility requirements for 3.7 - "typing-compat~=0.1.0", + VERSIONS["grpc-tools"], # Used in sample data + "idna<3,>=2.5", "importlib-metadata~=4.12.0", # From airflow constraints - "croniter~=1.3.0", - "requests-aws4auth==1.1.2", - "pymysql>=1.0.2", - "cached-property==1.5.2", - "pandas==1.3.5", - "chardet==4.0.0", + "Jinja2>=2.11.3", + "jsonschema", + "mypy_extensions>=0.4.3", + VERSIONS["pandas"], # to be removed from base + "pydantic~=1.10", + VERSIONS["pymysql"], + "python-dateutil>=2.8.1", + "python-jose~=3.3", + "PyYAML", + "requests>=2.23", + "requests-aws4auth~=1.1", # Only depends on requests as external package. Leaving as base. + "setuptools~=65.6.3", + "sqlalchemy>=1.4.0", + "sqllineage==1.3.7", + "typing-compat~=0.1.0", # compatibility requirements for 3.7 + "typing-inspect", + "wheel~=0.38.4", } -datalake_common = { - "pandas==1.3.5", - "pyarrow==6.0.1", -} - plugins: Dict[str, Set[str]] = { "airflow": { "apache-airflow==2.3.3" }, # Same as ingestion container. For development. - "airflow-container-1.10.15": {"markupsafe==2.0.1 ", "requests==2.23.0"}, - "amundsen": {"neo4j~=4.4.0"}, + "amundsen": {VERSIONS["neo4j"]}, "athena": {"PyAthena[SQLAlchemy]"}, "atlas": {}, - "azuresql": {"pyodbc"}, + "azuresql": {VERSIONS["pyodbc"]}, + "azure-sso": {VERSIONS["msal"]}, + "backup": {VERSIONS["boto3"], "azure-identity", "azure-storage-blob"}, "bigquery": { - "sqlalchemy-bigquery>=1.2.2", - "pyarrow~=6.0.1", + "cachetools", "google-cloud-datacatalog==3.6.2", + "google-cloud-logging", + VERSIONS["pyarrow"], + "sqlalchemy-bigquery>=1.2.2", }, - "bigquery-usage": {"google-cloud-logging", "cachetools"}, - "docker": {"python_on_whales==0.55.0"}, - "backup": {"boto3~=1.19.12", "azure-identity", "azure-storage-blob"}, - "dagster": {"pymysql>=1.0.2", "psycopg2-binary", "GeoAlchemy2", "dagster_graphql"}, - "datalake-s3": { - "s3fs==0.4.2", - "boto3~=1.19.12", - *datalake_common, + "clickhouse": {"clickhouse-driver~=0.2", "clickhouse-sqlalchemy~=0.2"}, + "dagster": { + VERSIONS["pymysql"], + "psycopg2-binary", + VERSIONS["geoalchemy2"], + "dagster_graphql~=1.1", + }, + "dbt": {"google-cloud", VERSIONS["boto3"], VERSIONS["google-cloud-storage"]}, + "db2": {"ibm-db-sa~=0.3"}, + "databricks": {"sqlalchemy-databricks~=0.1"}, + "datalake-azure": { + "azure-storage-blob~=12.14", + "azure-identity~=1.12", + "adlfs>=2022.2.0", # Python 3.7 does only support up to 2022.2.0 + *COMMONS["datalake"], }, "datalake-gcs": { - "google-cloud-storage==1.43.0", - "gcsfs==2022.5.0", - *datalake_common, + VERSIONS["google-cloud-storage"], + "gcsfs==2022.11.0", + *COMMONS["datalake"], }, - "dbt": {"google-cloud", "boto3", "google-cloud-storage==1.43.0"}, - "druid": {"pydruid>=0.6.2"}, - "elasticsearch": {"elasticsearch==7.13.1", "requests-aws4auth==1.1.2"}, - "glue": {"boto3~=1.19.12"}, - "dynamodb": {"boto3~=1.19.12"}, - "sagemaker": {"boto3~=1.19.12"}, + "datalake-s3": { + # requires aiobotocore + # https://github.com/fsspec/s3fs/blob/9bf99f763edaf7026318e150c4bd3a8d18bb3a00/requirements.txt#L1 + # however, the latest version of `s3fs` conflicts its `aiobotocore` dep with `boto3`'s dep on `botocore`. + # Leaving this marked to the automatic resolution to speed up installation. + "s3fs==0.4.2", + *COMMONS["datalake"], + }, + "deltalake": {"delta-spark~=2.2"}, + "docker": {"python_on_whales==0.55.0"}, + "domo": {VERSIONS["pydomo"]}, + "druid": {"pydruid>=0.6.5"}, + "dynamodb": {VERSIONS["boto3"]}, + "elasticsearch": { + "elasticsearch>=7.17,<8" + }, # also requires requests-aws4auth which is in base + "glue": {VERSIONS["boto3"]}, + "great-expectations": {VERSIONS["great-expectations"]}, "hive": { - "pyhive~=0.6.5", - "thrift~=0.13.0", - "sasl==0.3.1", - "thrift-sasl==0.4.3", - "presto-types-parser==0.0.2", - }, - "kafka": { - "confluent_kafka==1.8.2", - "fastavro>=1.2.0", - "avro-python3==1.10.2", - "avro==1.11.1", - "grpcio-tools", - "protobuf", - }, - "kinesis": {"boto3~=1.19.12"}, - "redpanda": { - "confluent_kafka==1.8.2", - "fastavro>=1.2.0", - "avro-python3==1.10.2", - "avro==1.11.1", - "grpcio-tools", - "protobuf", + *COMMONS["hive"], + "thrift>=0.13,<1", + "sasl~=0.3", + "thrift-sasl~=0.4", }, + "kafka": {*COMMONS["kafka"]}, + "kinesis": {VERSIONS["boto3"]}, "ldap-users": {"ldap3==2.9.1"}, "looker": {"looker-sdk>=22.20.0"}, - "mssql": {"sqlalchemy-pytds>=0.3"}, + "mlflow": {"mlflow-skinny~=1.30"}, + "mssql": {"sqlalchemy-pytds~=0.3"}, + "mssql-odbc": {VERSIONS["pyodbc"]}, + "mysql": {VERSIONS["pymysql"]}, + "nifi": {}, # uses requests + "okta": {"okta~=2.3"}, + "oracle": {"cx_Oracle>=8.3.0,<9", "oracledb~=1.2"}, + "pinotdb": {"pinotdb~=0.3"}, + "postgres": {VERSIONS["pymysql"], "psycopg2-binary", VERSIONS["geoalchemy2"]}, + "powerbi": {VERSIONS["msal"]}, + "presto": {*COMMONS["hive"]}, "pymssql": {"pymssql==2.2.5"}, - "mssql-odbc": {"pyodbc"}, - "mysql": {"pymysql>=1.0.2"}, - "oracle": {"cx_Oracle", "oracledb==1.0.3"}, - "powerbi": {"msal==1.17.0"}, - "presto": {"pyhive~=0.6.3"}, - "trino": {"trino[sqlalchemy]"}, - "postgres": {"pymysql>=1.0.2", "psycopg2-binary", "GeoAlchemy2"}, - "redash": {"redash-toolbelt==0.1.8"}, - "redshift": {"sqlalchemy-redshift==0.8.9", "psycopg2-binary", "GeoAlchemy2"}, - "redshift-usage": { - "sqlalchemy-redshift==0.8.9", + "redash": {"redash-toolbelt~=0.1"}, + "redpanda": {*COMMONS["kafka"]}, + "redshift": { + "sqlalchemy-redshift~=0.8", "psycopg2-binary", - "GeoAlchemy2", + VERSIONS["geoalchemy2"], }, - "snowflake": {"snowflake-sqlalchemy~=1.4.3"}, - "snowflake-usage": {"snowflake-sqlalchemy~=1.4.3"}, - "superset": {}, - "tableau": {"tableau-api-lib==0.1.50"}, + "sagemaker": {VERSIONS["boto3"]}, + "salesforce": {"simple_salesforce==1.11.4"}, + "singlestore": {VERSIONS["pymysql"]}, + "sklearn": {VERSIONS["scikit-learn"]}, + "snowflake": {"snowflake-sqlalchemy~=1.4"}, + "superset": {}, # uses requests + "tableau": {"tableau-api-lib~=0.1"}, + "trino": {"trino[sqlalchemy]"}, "vertica": {"sqlalchemy-vertica[vertica-python]>=0.0.5"}, - "webhook-server": {}, - "salesforce": {"simple_salesforce~=1.11.4"}, - "okta": {"okta~=2.3.0"}, - "mlflow": {"mlflow-skinny~=1.26.1"}, - "sklearn": {"scikit-learn==1.0.2"}, - "db2": {"ibm-db-sa==0.3.8"}, - "clickhouse": {"clickhouse-driver==0.2.5", "clickhouse-sqlalchemy==0.2.3"}, - "databricks": {"sqlalchemy-databricks==0.1.0"}, - "singlestore": {"pymysql>=1.0.2"}, - "azure-sso": {"msal~=1.17.0"}, - "deltalake": {"delta-spark~=2.2.0"}, - "great-expectations": {"great-expectations~=0.15.0"}, - "pinotdb": {"pinotdb~=0.3.11"}, - "nifi": {}, - "domo": {"pydomo~=0.3.0.5"}, - "datalake-azure": { - "azure-storage-blob~=12.14.1", - "azure-identity~=1.12.0", - "adlfs==2022.2.0", - *datalake_common, - }, } + dev = { - "datamodel-code-generator==0.15.0", "black==22.3.0", - "pycln==1.3.2", + "datamodel-code-generator==0.15.0", "docker", - "google-cloud-storage==1.43.0", + "isort", + "pre-commit", + "pycln", + "pylint", "twine", } + test = { - "isort==5.10.1", - "pre-commit", - "pylint", + VERSIONS["airflow"], + "coverage", + VERSIONS["google-cloud-storage"], + VERSIONS["great-expectations"], + "moto==4.0.8", + VERSIONS["neo4j"], + VERSIONS["pandas"], + VERSIONS["pydomo"], "pytest==7.0.0", "pytest-cov", "pytest-order", - "coverage", - # sklearn integration - "scikit-learn==1.0.2", - "pandas==1.3.5", - # great_expectations tests - "great-expectations~=0.15.0", - # Airflow tests - "apache-airflow==2.3.3", - # Domo test - "pydomo~=0.3.0.5", - # mock boto3 functions - "moto==4.0.8", - # amundsen - "neo4j~=4.4.0", + VERSIONS["scikit-learn"], } build_options = {"includes": ["_cffi_backend"]} @@ -243,7 +262,6 @@ setup( if plugin not in { "airflow", - "airflow-container-1.10.15", "db2", "great-expectations", } diff --git a/ingestion/src/metadata/__version__.py b/ingestion/src/metadata/__version__.py index 03f24932057..9081d0b168a 100644 --- a/ingestion/src/metadata/__version__.py +++ b/ingestion/src/metadata/__version__.py @@ -13,11 +13,43 @@ Module for getting versions of OpenMetadata and python """ import os +import re import sys -import pkg_resources +try: + from importlib.metadata import version +except ImportError: + from importlib_metadata import version -version = pkg_resources.require("openmetadata-ingestion")[0].version + +class VersionParsingException(Exception): + """ + Used when we cannot parse version information from a string + """ + + +def get_version_from_string(raw_version: str) -> str: + """ + Given a raw version string, such as `0.10.1.dev0` or + `0.11.0-SNAPSHOT`, we should extract the major.minor.patch + :param raw_version: raw string with version info + :return: Clean version string + """ + try: + return re.match(r"\d+.\d+.\d+", raw_version).group(0) + except AttributeError as err: + raise VersionParsingException( + f"Can't extract version from {raw_version}: {err}" + ) + + +def get_client_version() -> str: + """ + Get openmetadata-ingestion module version + :return: client version + """ + raw_version = version("openmetadata-ingestion") + return get_version_from_string(raw_version) def get_metadata_version() -> str: @@ -28,7 +60,7 @@ def get_metadata_version() -> str: metadata_pkg_dir = os.path.join(os.path.dirname(__file__), "..", "..") metadata_pkg_dir = os.path.abspath(metadata_pkg_dir) - return f"metadata {version} from {metadata_pkg_dir} (python {get_major_minor_version()})" + return f"metadata {get_client_version()} from {metadata_pkg_dir} (python {get_major_minor_version()})" def get_major_minor_version() -> str: diff --git a/ingestion/src/metadata/ingestion/ometa/mixins/server_mixin.py b/ingestion/src/metadata/ingestion/ometa/mixins/server_mixin.py index bf4df006ba8..49350af6e4c 100755 --- a/ingestion/src/metadata/ingestion/ometa/mixins/server_mixin.py +++ b/ingestion/src/metadata/ingestion/ometa/mixins/server_mixin.py @@ -13,25 +13,13 @@ Mixin class containing Server and client specific methods To be used by OpenMetadata class """ -import re - -try: - from importlib.metadata import version -except ImportError: - from importlib_metadata import version - +from metadata.__version__ import get_client_version, get_version_from_string from metadata.ingestion.ometa.client import REST from metadata.utils.logger import ometa_logger logger = ometa_logger() -class VersionParsingException(Exception): - """ - Used when we cannot parse version information from a string - """ - - class VersionMismatchException(Exception): """ Used when server and client versions do not match @@ -53,21 +41,6 @@ class OMetaServerMixin: client: REST - @staticmethod - def get_version_from_string(raw_version: str) -> str: - """ - Given a raw version string, such as `0.10.1.dev0` or - `0.11.0-SNAPSHOT`, we should extract the major.minor.patch - :param raw_version: raw string with version info - :return: Clean version string - """ - try: - return re.match(r"\d+.\d+.\d+", raw_version).group(0) - except AttributeError as err: - raise VersionParsingException( - f"Can't extract version from {raw_version}: {err}" - ) - def get_server_version(self) -> str: """ Run endpoint /version to check server version @@ -77,15 +50,7 @@ class OMetaServerMixin: raw_version = self.client.get("/version")["version"] except KeyError: raise VersionNotFoundException("Cannot Find Version at api/v1/version") - return self.get_version_from_string(raw_version) - - def get_client_version(self) -> str: - """ - Get openmetadata-ingestion module version - :return: client version - """ - raw_version = version("openmetadata-ingestion") - return self.get_version_from_string(raw_version) + return get_version_from_string(raw_version) def validate_versions(self) -> None: """ @@ -95,7 +60,7 @@ class OMetaServerMixin: logger.debug("Validating client and server versions") server_version = self.get_server_version() - client_version = self.get_client_version() + client_version = get_client_version() if server_version != client_version: raise VersionMismatchException( diff --git a/ingestion/src/metadata/parsers/schema_parsers.py b/ingestion/src/metadata/parsers/schema_parsers.py index 7c63dfc15a6..7de84b465f3 100644 --- a/ingestion/src/metadata/parsers/schema_parsers.py +++ b/ingestion/src/metadata/parsers/schema_parsers.py @@ -15,9 +15,6 @@ Hosts the singledispatch to get the schema parsers from typing import List, Optional from metadata.generated.schema.type.schema import FieldModel, SchemaType -from metadata.parsers.avro_parser import parse_avro_schema -from metadata.parsers.json_schema_parser import parse_json_schema -from metadata.parsers.protobuf_parser import ProtobufParser, ProtobufParserConfig from metadata.utils.dispatch import enum_register schema_parser_config_registry = enum_register() @@ -29,10 +26,14 @@ class InvalidSchemaTypeException(Exception): """ +# Load parsers only on demand +# pylint: disable=import-outside-toplevel @schema_parser_config_registry.add(SchemaType.Avro.value.lower()) def load_avro_parser( topic_name: str, schema_text: str # pylint: disable=unused-argument ) -> Optional[List[FieldModel]]: + from metadata.parsers.avro_parser import parse_avro_schema + return parse_avro_schema(schema_text) @@ -40,6 +41,8 @@ def load_avro_parser( def load_protobuf_parser( topic_name: str, schema_text: str ) -> Optional[List[FieldModel]]: + from metadata.parsers.protobuf_parser import ProtobufParser, ProtobufParserConfig + protobuf_parser = ProtobufParser( config=ProtobufParserConfig(schema_name=topic_name, schema_text=schema_text) ) @@ -50,6 +53,8 @@ def load_protobuf_parser( def load_json_schema_parser( topic_name: str, schema_text: str # pylint: disable=unused-argument ) -> Optional[List[FieldModel]]: + from metadata.parsers.json_schema_parser import parse_json_schema + return parse_json_schema(schema_text) diff --git a/ingestion/tests/unit/test_server_mixin.py b/ingestion/tests/unit/test_version.py similarity index 55% rename from ingestion/tests/unit/test_server_mixin.py rename to ingestion/tests/unit/test_version.py index 9959c8928c0..d4dd05e34ba 100644 --- a/ingestion/tests/unit/test_server_mixin.py +++ b/ingestion/tests/unit/test_version.py @@ -14,28 +14,20 @@ Validate Server Mixin version methods from unittest import TestCase -from metadata.ingestion.ometa.mixins.server_mixin import OMetaServerMixin +from metadata.__version__ import get_version_from_string -class OMetaServerTest(TestCase): +class OMetaVersionTest(TestCase): """ Check version methods """ - mixin = OMetaServerMixin() - def test_get_version_from_string(self): """ We should be able to parse regular version responses """ - self.assertEqual("0.11.0", self.mixin.get_version_from_string("0.11.0.dev0")) - self.assertEqual("0.11.0", self.mixin.get_version_from_string("0.11.0")) - self.assertEqual( - "1111.11.111", self.mixin.get_version_from_string("1111.11.111") - ) - self.assertEqual( - "1111.11.111", self.mixin.get_version_from_string("1111.11.111-SNAPSHOT") - ) - self.assertEqual( - "0.11.1", self.mixin.get_version_from_string("0.11.1.0.0.1.patch") - ) + self.assertEqual("0.11.0", get_version_from_string("0.11.0.dev0")) + self.assertEqual("0.11.0", get_version_from_string("0.11.0")) + self.assertEqual("1111.11.111", get_version_from_string("1111.11.111")) + self.assertEqual("1111.11.111", get_version_from_string("1111.11.111-SNAPSHOT")) + self.assertEqual("0.11.1", get_version_from_string("0.11.1.0.0.1.patch"))