chore(cli): drop support for python 3.7 (#9731)

Harshal Sheth 2024-01-29 10:50:47 -08:00 committed by GitHub
parent f3cc4e068a
commit 1498c36875
22 changed files with 799 additions and 857 deletions

View File

@@ -31,7 +31,7 @@ jobs:
       # DATAHUB_LOOKML_GIT_TEST_SSH_KEY: ${{ secrets.DATAHUB_LOOKML_GIT_TEST_SSH_KEY }}
     strategy:
       matrix:
-        python-version: ["3.7", "3.10"]
+        python-version: ["3.8", "3.10"]
         command:
           [
             "testQuick",
@@ -40,7 +40,7 @@ jobs:
             "testIntegrationBatch2",
           ]
         include:
-          - python-version: "3.7"
+          - python-version: "3.8"
           - python-version: "3.10"
       fail-fast: false
     steps:

View File

@@ -24,7 +24,7 @@ source venv/bin/activate # activate the environment
 Once inside the virtual environment, install `datahub` using the following commands
 ```shell
-# Requires Python 3.7+
+# Requires Python 3.8+
 python3 -m pip install --upgrade pip wheel setuptools
 python3 -m pip install --upgrade acryl-datahub
 # validate that the install was successful
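Since the quickstart now assumes Python 3.8+, a quick pre-flight check before running the pip commands above can save a confusing failure later. A minimal sketch; the version bound simply mirrors the requirement stated in the doc:

```python
import sys

# Abort early if the interpreter is older than the documented minimum.
assert sys.version_info >= (3, 8), (
    f"acryl-datahub now requires Python 3.8+, found {sys.version_info[:2]}"
)
print("Python version OK:", sys.version.split()[0])
```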

View File

@@ -10,8 +10,10 @@ This file documents any backwards-incompatible changes in DataHub and assists pe
 - Neo4j 5.x, may require migration from 4.x
 - Build requires JDK17 (Runtime Java 11)
 - Build requires Docker Compose > 2.20
+- #9731 - The `acryl-datahub` CLI now requires Python 3.8+
 - #9601 - The Unity Catalog(UC) ingestion source config `include_metastore` is now disabled by default. This change will affect the urns of all entities in the workspace.<br/>
   Entity Hierarchy with `include_metastore: true` (Old)
   ```
   - UC Metastore
     - Catalog
@@ -20,15 +22,18 @@ This file documents any backwards-incompatible changes in DataHub and assists pe
   ```
   Entity Hierarchy with `include_metastore: false` (New)
   ```
   - Catalog
     - Schema
       - Table
   ```
   We recommend using `platform_instance` for differentiating across metastores.
   If stateful ingestion is enabled, running ingestion with latest cli version will perform all required cleanup. Otherwise, we recommend soft deleting all databricks data via the DataHub CLI:
   `datahub delete --platform databricks --soft` and then reingesting with latest cli version.
 - #9601 - The Unity Catalog(UC) ingestion source config `include_hive_metastore` is now enabled by default. This requires config `warehouse_id` to be set. You can disable `include_hive_metastore` by setting it to `False` to avoid ingesting legacy hive metastore catalog in Databricks.
 ### Potential Downtime
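For anyone applying the `include_metastore` / `include_hive_metastore` changes above, here is a hedged sketch of a programmatic recipe. The `Pipeline` import matches the one used elsewhere in this change; the `unity-catalog` source type string, the `workspace_url`/`token` keys, and the sink details are assumptions for illustration, while `include_metastore`, `include_hive_metastore`, `warehouse_id`, and `platform_instance` are the options named in the note:

```python
from datahub.ingestion.run.pipeline import Pipeline

# Hypothetical recipe: only the flags discussed in the release note are taken
# from it; everything else (type name, URLs, credentials) is a placeholder.
pipeline = Pipeline.create(
    {
        "source": {
            "type": "unity-catalog",  # assumed type name for the UC source
            "config": {
                "workspace_url": "https://<workspace>.cloud.databricks.com",
                "token": "<personal-access-token>",
                "include_metastore": False,       # new default
                "include_hive_metastore": True,   # new default; needs warehouse_id
                "warehouse_id": "<sql-warehouse-id>",
                "platform_instance": "<metastore-name>",  # differentiate metastores
            },
        },
        "sink": {
            "type": "datahub-rest",
            "config": {"server": "http://localhost:8080"},
        },
    }
)
pipeline.run()
pipeline.raise_from_status()
```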

View File

@@ -22,7 +22,7 @@ If you're interested in a managed version, [Acryl Data](https://www.acryldata.io
 | Linux | [Docker for Linux](https://docs.docker.com/desktop/install/linux-install/) and [Docker Compose](https://docs.docker.com/compose/install/linux/) |
 - **Launch the Docker engine** from command line or the desktop app.
-- Ensure you have **Python 3.7+** installed & configured. (Check using `python3 --version`).
+- Ensure you have **Python 3.8+** installed & configured. (Check using `python3 --version`).
 :::note Docker Resource Allocation

View File

@@ -18,16 +18,10 @@ _version: str = package_metadata["__version__"]
 _self_pin = f"=={_version}" if not _version.endswith("dev0") else ""
-rest_common = {"requests", "requests_file"}
 base_requirements = {
-    # Compatibility.
-    "dataclasses>=0.6; python_version < '3.7'",
-    "mypy_extensions>=0.4.3",
+    f"acryl-datahub[datahub-rest]{_self_pin}",
     # Actual dependencies.
-    "pydantic>=1.5.1",
     "apache-airflow >= 2.0.2",
-    *rest_common,
 }
 plugins: Dict[str, Set[str]] = {
@@ -42,9 +36,8 @@ plugins: Dict[str, Set[str]] = {
     },
     "plugin-v1": set(),
     "plugin-v2": {
-        # The v2 plugin requires Python 3.8+.
         f"acryl-datahub[sql-parser]{_self_pin}",
-        "openlineage-airflow==1.2.0; python_version >= '3.8'",
+        "openlineage-airflow==1.2.0",
     },
 }
@@ -144,7 +137,6 @@ setuptools.setup(
         "Programming Language :: Python",
         "Programming Language :: Python :: 3",
         "Programming Language :: Python :: 3 :: Only",
-        "Programming Language :: Python :: 3.7",
         "Programming Language :: Python :: 3.8",
         "Programming Language :: Python :: 3.9",
         "Programming Language :: Python :: 3.10",
@@ -161,7 +153,7 @@ setuptools.setup(
     ],
     # Package info.
     zip_safe=False,
-    python_requires=">=3.7",
+    python_requires=">=3.8",
     package_data={
         "datahub_airflow_plugin": ["py.typed"],
     },
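The removed `; python_version >= '3.8'` suffix above is a PEP 508 environment marker; once `python_requires=">=3.8"` guarantees the interpreter floor, the marker is redundant. A small sketch of how such markers evaluate, using the `packaging` library (an assumption for illustration, not a dependency declared in this setup file):

```python
from packaging.markers import Marker

# The marker that previously gated openlineage-airflow on newer interpreters.
marker = Marker("python_version >= '3.8'")

# Evaluates against the running interpreter; on any environment that can
# install a package declaring python_requires=">=3.8", this is always True,
# which is why the marker could be dropped.
print(marker.evaluate())
```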

View File

@@ -1,7 +1,6 @@
 import datetime
 import json
 import os
-import sys
 from contextlib import contextmanager
 from typing import Iterator
 from unittest import mock
@@ -318,8 +317,7 @@ def test_lineage_backend(mock_emit, inlets, outlets, capture_executions):
     # Check that the right things were emitted.
     assert mock_emitter.emit.call_count == 17 if capture_executions else 9
-    # Running further checks based on python version because args only exists in python 3.8+
-    if sys.version_info > (3, 8):
+    # TODO: Replace this with a golden file-based comparison.
     assert mock_emitter.method_calls[0].args[0].aspectName == "dataFlowInfo"
     assert (
         mock_emitter.method_calls[0].args[0].entityUrn
@@ -344,9 +342,7 @@ def test_lineage_backend(mock_emit, inlets, outlets, capture_executions):
         == "urn:li:dataJob:(urn:li:dataFlow:(airflow,test_lineage_is_sent_to_backend,prod),task2)"
     )
-    assert (
-        mock_emitter.method_calls[4].args[0].aspectName == "dataJobInputOutput"
-    )
+    assert mock_emitter.method_calls[4].args[0].aspectName == "dataJobInputOutput"
     assert (
         mock_emitter.method_calls[4].args[0].entityUrn
         == "urn:li:dataJob:(urn:li:dataFlow:(airflow,test_lineage_is_sent_to_backend,prod),task2)"

View File

@@ -17,7 +17,7 @@ def get_coverage_arg(test_name) {
 task checkPythonVersion(type: Exec) {
   commandLine python_executable, '-c',
-    'import sys; assert (3, 11) > sys.version_info >= (3, 7), f"Python version {sys.version_info[:2]} not allowed"'
+    'import sys; assert (3, 11) > sys.version_info >= (3, 8), f"Python version {sys.version_info[:2]} not allowed"'
 }
 task environmentSetup(type: Exec, dependsOn: checkPythonVersion) {

View File

@@ -3,14 +3,16 @@
 ## Installing the CLI
 Make sure you have installed DataHub CLI before following this guide.
 ```shell
-# Requires Python 3.7+
+# Requires Python 3.8+
 python3 -m pip install --upgrade pip wheel setuptools
 python3 -m pip install --upgrade acryl-datahub
 # validate that the install was successful
 datahub version
 # If you see "command not found", try running this instead: python3 -m datahub version
 ```
 Check out the [CLI Installation Guide](../docs/cli.md#installation) for more installation options and troubleshooting tips.
 After that, install the required plugin for the ingestion.
@@ -18,10 +20,13 @@ After that, install the required plugin for the ingestion.
 ```shell
 pip install 'acryl-datahub[datahub-rest]' # install the required plugin
 ```
 Check out the [alternative installation options](../docs/cli.md#alternate-installation-options) for more reference.
 ## Configuring a Recipe
 Create a recipe.yml file that defines the source and sink for metadata, as shown below.
 ```yaml
 # my_reipe.yml
 source:
@@ -39,6 +44,7 @@ sink:
 For more information and examples on configuring recipes, please refer to [Recipes](recipe_overview.md).
 ## Ingesting Metadata
 You can run ingestion using `datahub ingest` like below.
 ```shell
@@ -48,6 +54,7 @@ datahub ingest -c <path_to_recipe_file.yml>
 ## Reference
 Please refer the following pages for advanced guids on CLI ingestion.
 - [Reference for `datahub ingest` command](../docs/cli.md#ingest)
 - [UI Ingestion Guide](../docs/ui-ingestion.md)

View File

@@ -9,10 +9,10 @@ Also take a look at the guide to [adding a source](./adding-source.md).
 ### Requirements
-1. Python 3.7+ must be installed in your host environment.
+1. Python 3.8+ must be installed in your host environment.
 2. Java 17 (gradle won't work with newer or older versions)
-4. On Debian/Ubuntu: `sudo apt install python3-dev python3-venv`
-5. On Fedora (if using LDAP source integration): `sudo yum install openldap-devel`
+3. On Debian/Ubuntu: `sudo apt install python3-dev python3-venv`
+4. On Fedora (if using LDAP source integration): `sudo yum install openldap-devel`
 ### Set up your Python environment

View File

@@ -1,4 +1,3 @@
-import sys
 from typing import Dict, Set
 import setuptools
@@ -11,7 +10,6 @@ with open("./src/datahub/__init__.py") as fp:
 base_requirements = {
     # Typing extension should be >=3.10.0.2 ideally but we can't restrict due to a Airflow 2.1 dependency conflict.
     "typing_extensions>=3.7.4.3",
-    "mypy_extensions>=0.4.3",
     # Actual dependencies.
     "typing-inspect",
     # pydantic 1.8.2 is incompatible with mypy 0.910.
@@ -48,9 +46,7 @@ framework_common = {
     "click-spinner",
     "requests_file",
     "jsonref",
-    # jsonschema drops python 3.7 support in v4.18.0
-    "jsonschema<=4.17.3; python_version < '3.8'",
-    "jsonschema; python_version >= '3.8'",
+    "jsonschema",
     "ruamel.yaml",
 }
@@ -463,7 +459,7 @@ base_dev_requirements = {
     "black==22.12.0",
     "coverage>=5.1",
     "faker>=18.4.0",
-    "flake8>=3.8.3",  # DEPRECATION: Once we drop Python 3.7, we can pin to 6.x.
+    "flake8>=6.0.0",
     "flake8-tidy-imports>=4.3.0",
     "flake8-bugbear==23.3.12",
     "isort>=5.7.0",
@@ -489,9 +485,9 @@ base_dev_requirements = {
     "delta-lake",
     "druid",
     "elasticsearch",
-    "feast" if sys.version_info >= (3, 8) else None,
-    "iceberg" if sys.version_info >= (3, 8) else None,
-    "mlflow" if sys.version_info >= (3, 8) else None,
+    "feast",
+    "iceberg",
+    "mlflow",
     "json-schema",
     "ldap",
     "looker",
@@ -544,14 +540,14 @@ full_test_dev_requirements = {
     "clickhouse",
     "delta-lake",
     "druid",
-    "feast" if sys.version_info >= (3, 8) else None,
+    "feast",
     "hana",
     "hive",
-    "iceberg" if sys.version_info >= (3, 8) else None,
+    "iceberg",
     "kafka-connect",
     "ldap",
     "mongodb",
-    "mssql" if sys.version_info >= (3, 8) else None,
+    "mssql",
     "mysql",
     "mariadb",
     "redash",
@@ -699,7 +695,6 @@ See the [DataHub docs](https://datahubproject.io/docs/metadata-ingestion).
         "Programming Language :: Python",
         "Programming Language :: Python :: 3",
         "Programming Language :: Python :: 3 :: Only",
-        "Programming Language :: Python :: 3.7",
         "Programming Language :: Python :: 3.8",
         "Programming Language :: Python :: 3.9",
         "Programming Language :: Python :: 3.10",
@@ -716,7 +711,7 @@ See the [DataHub docs](https://datahubproject.io/docs/metadata-ingestion).
     ],
     # Package info.
     zip_safe=False,
-    python_requires=">=3.7",
+    python_requires=">=3.8",
     package_dir={"": "src"},
     packages=setuptools.find_namespace_packages(where="./src"),
     package_data={

View File

@@ -16,16 +16,9 @@ def nice_version_name() -> str:
     return __version__
-if sys.version_info < (3, 7):
+if sys.version_info < (3, 8):
     warnings.warn(
-        "DataHub requires Python 3.7 or newer. "
-        "Please upgrade your Python version to continue using DataHub.",
-        FutureWarning,
-        stacklevel=2,
-    )
-elif sys.version_info < (3, 8):
-    warnings.warn(
-        "DataHub will require Python 3.8 or newer soon. "
+        "DataHub requires Python 3.8 or newer. "
         "Please upgrade your Python version to continue using DataHub.",
         FutureWarning,
         stacklevel=2,
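With the guard above, importing `datahub` on Python 3.7 now emits a `FutureWarning` rather than failing outright. A minimal sketch of how a CI job could promote that warning to a hard error; this relies only on standard-library behavior, not on DataHub internals:

```python
import warnings

# Promote the deprecation warning to an error so an unsupported interpreter
# fails fast, e.g. in a CI sanity-check step.
warnings.simplefilter("error", FutureWarning)

import datahub  # on Python < 3.8 this import now raises instead of warning

print(datahub.__version__)
```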

View File

@@ -2,11 +2,10 @@ import dataclasses
 import json
 import logging
 import pprint
-import sys
 from dataclasses import dataclass
 from datetime import datetime, timedelta
 from enum import Enum
-from typing import Any, Dict, Optional
+from typing import Any, Optional
 import humanfriendly
 import pydantic
@@ -19,12 +18,6 @@ from datahub.utilities.lossy_collections import LossyList
 logger = logging.getLogger(__name__)
 LogLevel = Literal["ERROR", "WARNING", "INFO", "DEBUG"]
-# The sort_dicts option was added in Python 3.8.
-if sys.version_info >= (3, 8):
-    PPRINT_OPTIONS = {"sort_dicts": False}
-else:
-    PPRINT_OPTIONS: Dict = {}
 @runtime_checkable
 class SupportsAsObj(Protocol):
@@ -32,14 +25,6 @@
     ...
-def _stacklevel_if_supported(level: int) -> dict:
-    # The logging module added support for stacklevel in Python 3.8.
-    if sys.version_info >= (3, 8):
-        return {"stacklevel": level}
-    else:
-        return {}
 @dataclass
 class Report(SupportsAsObj):
     @staticmethod
@@ -95,7 +80,7 @@ class Report(SupportsAsObj):
         }
     def as_string(self) -> str:
-        return pprint.pformat(self.as_obj(), width=150, **PPRINT_OPTIONS)
+        return pprint.pformat(self.as_obj(), width=150, sort_dicts=False)
     def as_json(self) -> str:
         return json.dumps(self.as_obj())
@@ -118,7 +103,7 @@ class ReportAttribute(BaseModel):
         return log_levels[self.severity]
     def log(self, msg: str) -> None:
-        logger.log(level=self.logger_sev, msg=msg, **_stacklevel_if_supported(3))
+        logger.log(level=self.logger_sev, msg=msg, stacklevel=3)
 class EntityFilterReport(ReportAttribute):
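Both simplifications above lean on stdlib features that only exist from Python 3.8 onward. A small standalone sketch of the two calls the report code now uses unconditionally:

```python
import logging
import pprint

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("report-demo")

report = {"workunits_produced": 42, "failures": [], "start_time": "2024-01-29"}

# sort_dicts=False (Python 3.8+) keeps insertion order instead of sorting keys.
print(pprint.pformat(report, width=150, sort_dicts=False))

# stacklevel (Python 3.8+) attributes the log record to the caller's caller,
# which is what the removed _stacklevel_if_supported() helper emulated.
def log_via_helper(msg: str) -> None:
    logger.info(msg, stacklevel=2)

log_via_helper("emitted from the perspective of the calling line")
```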

View File

@@ -1,8 +1,3 @@
-import sys
-if sys.version_info < (3, 8):
-    raise ImportError("Feast is only supported on Python 3.8+")
 from dataclasses import dataclass
 from typing import Dict, Iterable, List, Optional, Tuple, Union

View File

@@ -1,8 +1,3 @@
-import sys
-if sys.version_info < (3, 8):
-    raise ImportError("Iceberg is only supported on Python 3.8+")
 import json
 import logging
 import uuid

View File

@@ -1,9 +1,3 @@
-import sys
-if sys.version_info < (3, 8):
-    raise ImportError("MLflow is only supported on Python 3.8+")
 from dataclasses import dataclass
 from typing import Any, Callable, Iterable, Optional, TypeVar, Union

View File

@@ -1,7 +1,7 @@
 from collections import Counter
 from typing import Any, Counter as CounterType, Dict, Sequence, Tuple, Union
-from mypy_extensions import TypedDict
+from typing_extensions import TypedDict
 class BasicSchemaDescription(TypedDict):
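`typing_extensions.TypedDict` is a drop-in replacement for the `mypy_extensions` import removed here, so class definitions like the one above keep working unchanged. A minimal sketch with a hypothetical class and fields (not the real keys of `BasicSchemaDescription`):

```python
from typing_extensions import TypedDict


class ExampleSchemaDescription(TypedDict):
    # Illustrative keys only; the real TypedDicts in this module define their own.
    version: int
    num_fields: int


desc: ExampleSchemaDescription = {"version": 1, "num_fields": 12}
print(desc["num_fields"])
```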

View File

@@ -1,6 +1,3 @@
-import sys
-import pytest
 from freezegun import freeze_time
 from datahub.ingestion.run.pipeline import Pipeline
@@ -8,10 +5,6 @@ from tests.test_helpers import mce_helpers
 FROZEN_TIME = "2020-04-14 07:00:00"
-pytestmark = pytest.mark.skipif(
-    sys.version_info < (3, 8), reason="requires python 3.8 or higher"
-)
 @freeze_time(FROZEN_TIME)
 def test_feast_repository_ingest(pytestconfig, tmp_path, mock_time):

View File

@@ -1,5 +1,4 @@
 import subprocess
-import sys
 from typing import Any, Dict, List
 from unittest.mock import patch
@@ -15,13 +14,7 @@ from tests.test_helpers.state_helpers import (
     validate_all_providers_have_committed_successfully,
 )
-pytestmark = [
-    pytest.mark.integration_batch_1,
-    # Skip tests if not on Python 3.8 or higher.
-    pytest.mark.skipif(
-        sys.version_info < (3, 8), reason="Requires python 3.8 or higher"
-    ),
-]
+pytestmark = pytest.mark.integration_batch_1
 FROZEN_TIME = "2020-04-14 07:00:00"
 GMS_PORT = 8080
 GMS_SERVER = f"http://localhost:{GMS_PORT}"

View File

@@ -1,6 +1,3 @@
-import sys
-if sys.version_info >= (3, 8):
 from pathlib import Path
 from typing import Any, Dict, TypeVar
@@ -12,14 +9,17 @@ if sys.version_info >= (3, 8):
 T = TypeVar("T")
 @pytest.fixture
 def tracking_uri(tmp_path: Path) -> str:
     return str(tmp_path / "mlruns")
 @pytest.fixture
 def sink_file_path(tmp_path: Path) -> str:
     return str(tmp_path / "mlflow_source_mcps.json")
 @pytest.fixture
 def pipeline_config(tracking_uri: str, sink_file_path: str) -> Dict[str, Any]:
     source_type = "mlflow"
@@ -39,6 +39,7 @@ if sys.version_info >= (3, 8):
         },
     }
 @pytest.fixture
 def generate_mlflow_data(tracking_uri: str) -> None:
     client = MlflowClient(tracking_uri=tracking_uri)
@@ -80,6 +81,7 @@ if sys.version_info >= (3, 8):
         stage="Archived",
     )
 def test_ingestion(
     pytestconfig,
     mock_time,

View File

@@ -1,6 +1,5 @@
 import os
 import subprocess
-import sys
 import time
 import pytest
@@ -9,10 +8,6 @@ from tests.test_helpers import mce_helpers
 from tests.test_helpers.click_helpers import run_datahub_cmd
 from tests.test_helpers.docker_helpers import cleanup_image, wait_for_port
-pytestmark = pytest.mark.skipif(
-    sys.version_info < (3, 8), reason="requires python 3.8 or higher"
-)
 @pytest.fixture(scope="module")
 def mssql_runner(docker_compose_runner, pytestconfig):

View File

@@ -1,12 +1,9 @@
-import sys
 import uuid
 from decimal import Decimal
 from typing import Any, Optional
 import pytest
 from pydantic import ValidationError
-if sys.version_info >= (3, 8):
 from pyiceberg.schema import Schema
 from pyiceberg.types import (
     BinaryType,
@@ -51,9 +48,6 @@ if sys.version_info >= (3, 8):
     TimeTypeClass,
 )
-pytestmark = pytest.mark.skipif(
-    sys.version_info < (3, 8), reason="requires python 3.8 or higher"
-)
 def with_iceberg_source() -> IcebergSource:
     catalog: IcebergCatalogConfig = IcebergCatalogConfig(
@@ -64,12 +58,14 @@ if sys.version_info >= (3, 8):
         config=IcebergSourceConfig(catalog=catalog),
     )
 def with_iceberg_profiler() -> IcebergProfiler:
     iceberg_source_instance = with_iceberg_source()
     return IcebergProfiler(
         iceberg_source_instance.report, iceberg_source_instance.config.profiling
     )
 def assert_field(
     schema_field: SchemaField,
     expected_description: Optional[str],
@@ -86,6 +82,7 @@ if sys.version_info >= (3, 8):
         schema_field.type.type, expected_type
     ), f"Field type {schema_field.type.type} is different from expected type {expected_type}"
 def test_config_no_catalog():
     """
     Test when no Iceberg catalog is provided.
@@ -93,6 +90,7 @@ if sys.version_info >= (3, 8):
     with pytest.raises(ValidationError, match="catalog"):
         IcebergSourceConfig()  # type: ignore
 def test_config_catalog_not_configured():
     """
     Test when an Iceberg catalog is provided, but not properly configured.
@@ -106,12 +104,14 @@ if sys.version_info >= (3, 8):
     with pytest.raises(ValidationError, match="type"):
         IcebergCatalogConfig(conf={})  # type: ignore
 def test_config_for_tests():
     """
     Test valid iceberg source that will be used in unit tests.
     """
     with_iceberg_source()
 @pytest.mark.parametrize(
     "iceberg_type, expected_schema_field_type",
     [
@@ -159,9 +159,7 @@ if sys.version_info >= (3, 8):
         ),
     ]:
         schema = Schema(column)
-        schema_fields = iceberg_source_instance._get_schema_fields_for_schema(
-            schema
-        )
+        schema_fields = iceberg_source_instance._get_schema_fields_for_schema(schema)
         assert (
             len(schema_fields) == 1
         ), f"Expected 1 field, but got {len(schema_fields)}"
@@ -172,6 +170,7 @@ if sys.version_info >= (3, 8):
             expected_schema_field_type,
         )
 @pytest.mark.parametrize(
     "iceberg_type, expected_array_nested_type",
     [
@@ -241,9 +240,7 @@ if sys.version_info >= (3, 8):
     ]:
         iceberg_source_instance = with_iceberg_source()
         schema = Schema(list_column)
-        schema_fields = iceberg_source_instance._get_schema_fields_for_schema(
-            schema
-        )
+        schema_fields = iceberg_source_instance._get_schema_fields_for_schema(schema)
         assert (
             len(schema_fields) == 1
         ), f"Expected 1 field, but got {len(schema_fields)}"
@@ -258,6 +255,7 @@ if sys.version_info >= (3, 8):
             expected_array_nested_type
         ], f"List Field nested type {arrayType.nestedType} was expected to be {expected_array_nested_type}"
 @pytest.mark.parametrize(
     "iceberg_type, expected_map_type",
     [
@@ -327,9 +325,7 @@ if sys.version_info >= (3, 8):
     ]:
         iceberg_source_instance = with_iceberg_source()
         schema = Schema(map_column)
-        schema_fields = iceberg_source_instance._get_schema_fields_for_schema(
-            schema
-        )
+        schema_fields = iceberg_source_instance._get_schema_fields_for_schema(schema)
         # Converting an Iceberg Map type will be done by creating an array of struct(key, value) records.
         # The first field will be the array.
         assert (
@@ -350,6 +346,7 @@ if sys.version_info >= (3, 8):
             expected_map_type,
         )
 @pytest.mark.parametrize(
     "iceberg_type, expected_schema_field_type",
     [
@@ -394,9 +391,7 @@ if sys.version_info >= (3, 8):
     iceberg_source_instance = with_iceberg_source()
     schema = Schema(struct_column)
     schema_fields = iceberg_source_instance._get_schema_fields_for_schema(schema)
-    assert (
-        len(schema_fields) == 2
-    ), f"Expected 2 fields, but got {len(schema_fields)}"
+    assert len(schema_fields) == 2, f"Expected 2 fields, but got {len(schema_fields)}"
     assert_field(
         schema_fields[0], struct_column.doc, struct_column.optional, RecordTypeClass
     )
@@ -404,6 +399,7 @@ if sys.version_info >= (3, 8):
         schema_fields[1], field1.doc, field1.optional, expected_schema_field_type
     )
 @pytest.mark.parametrize(
     "value_type, value, expected_value",
     [
@@ -444,6 +440,7 @@ if sys.version_info >= (3, 8):
         == expected_value
     )
 def test_avro_decimal_bytes_nullable() -> None:
     """
     The following test exposes a problem with decimal (bytes) not preserving extra attributes like _nullable. Decimal (fixed) and Boolean for example do.
@@ -457,9 +454,7 @@ if sys.version_info >= (3, 8):
     print(
         f"Original avro schema string: {decimal_avro_schema_string}"
     )
-    print(
-        f"After avro parsing, _nullable attribute is missing: {decimal_avro_schema}"
-    )
+    print(f"After avro parsing, _nullable attribute is missing: {decimal_avro_schema}")
     decimal_fixed_avro_schema_string = """{"type": "record", "name": "__struct_", "fields": [{"type": {"type": "fixed", "logicalType": "decimal", "precision": 3, "scale": 2, "native_data_type": "decimal(3, 2)", "_nullable": false, "name": "bogusName", "size": 16}, "name": "required_field", "doc": "required field documentation"}]}"""
     decimal_fixed_avro_schema = avro.schema.parse(decimal_fixed_avro_schema_string)

View File

@@ -1,6 +1,3 @@
-import sys
-if sys.version_info >= (3, 8):
 import datetime
 from pathlib import Path
 from typing import Any, TypeVar, Union
@@ -16,10 +13,12 @@ if sys.version_info >= (3, 8):
 T = TypeVar("T")
 @pytest.fixture
 def tracking_uri(tmp_path: Path) -> str:
     return str(tmp_path / "mlruns")
 @pytest.fixture
 def source(tracking_uri: str) -> MLflowSource:
     return MLflowSource(
@@ -27,11 +26,13 @@ if sys.version_info >= (3, 8):
         config=MLflowConfig(tracking_uri=tracking_uri),
     )
 @pytest.fixture
 def registered_model(source: MLflowSource) -> RegisteredModel:
     model_name = "abc"
     return RegisteredModel(name=model_name)
 @pytest.fixture
 def model_version(
     source: MLflowSource,
@@ -44,6 +45,7 @@ if sys.version_info >= (3, 8):
         creation_timestamp=datetime.datetime.now(),
     )
 def dummy_search_func(page_token: Union[None, str], **kwargs: Any) -> PagedList[T]:
     dummy_pages = dict(
         page_1=PagedList(items=["a", "b"], token="page_2"),
@@ -61,6 +63,7 @@ if sys.version_info >= (3, 8):
     )
     return page_to_return
 def test_stages(source):
     mlflow_registered_model_stages = {
         "Production",
@@ -76,6 +79,7 @@ if sys.version_info >= (3, 8):
         "mlflow_" + str(stage).lower() for stage in mlflow_registered_model_stages
     }
 def test_config_model_name_separator(source, model_version):
     name_version_sep = "+"
     source.config.model_name_separator = name_version_sep
@@ -88,6 +92,7 @@ if sys.version_info >= (3, 8):
     assert urn == expected_urn
 def test_model_without_run(source, registered_model, model_version):
     run = source._get_mlflow_run(model_version)
     wu = source._get_ml_model_properties_workunit(
@@ -100,6 +105,7 @@ if sys.version_info >= (3, 8):
     assert aspect.hyperParams is None
     assert aspect.trainingMetrics is None
 def test_traverse_mlflow_search_func(source):
     expected_items = ["a", "b", "c", "d", "e"]
@@ -107,15 +113,15 @@ if sys.version_info >= (3, 8):
     assert items == expected_items
 def test_traverse_mlflow_search_func_with_kwargs(source):
     expected_items = ["A", "B", "C", "D", "E"]
-    items = list(
-        source._traverse_mlflow_search_func(dummy_search_func, case="upper")
-    )
+    items = list(source._traverse_mlflow_search_func(dummy_search_func, case="upper"))
     assert items == expected_items
 def test_make_external_link_local(source, model_version):
     expected_url = None
@@ -123,6 +129,7 @@ if sys.version_info >= (3, 8):
     assert url == expected_url
 def test_make_external_link_remote(source, model_version):
     tracking_uri_remote = "https://dummy-mlflow-tracking-server.org"
     source.client = MlflowClient(tracking_uri=tracking_uri_remote)