Mirror of https://github.com/datahub-project/datahub.git (synced 2025-11-09 16:03:31 +00:00)

chore(cli): drop support for python 3.7 (#9731)

commit 1498c36875 (parent f3cc4e068a)

.github/workflows/metadata-ingestion.yml (vendored)
@@ -31,7 +31,7 @@ jobs:
      # DATAHUB_LOOKML_GIT_TEST_SSH_KEY: ${{ secrets.DATAHUB_LOOKML_GIT_TEST_SSH_KEY }}
    strategy:
      matrix:
-        python-version: ["3.7", "3.10"]
+        python-version: ["3.8", "3.10"]
        command:
          [
            "testQuick",
@@ -40,7 +40,7 @@ jobs:
            "testIntegrationBatch2",
          ]
        include:
-          - python-version: "3.7"
+          - python-version: "3.8"
          - python-version: "3.10"
      fail-fast: false
    steps:
@@ -24,7 +24,7 @@ source venv/bin/activate # activate the environment
Once inside the virtual environment, install `datahub` using the following commands

```shell
-# Requires Python 3.7+
+# Requires Python 3.8+
python3 -m pip install --upgrade pip wheel setuptools
python3 -m pip install --upgrade acryl-datahub
# validate that the install was successful
@@ -10,8 +10,10 @@ This file documents any backwards-incompatible changes in DataHub and assists pe
- Neo4j 5.x, may require migration from 4.x
- Build requires JDK17 (Runtime Java 11)
- Build requires Docker Compose > 2.20
+- #9731 - The `acryl-datahub` CLI now requires Python 3.8+
- #9601 - The Unity Catalog (UC) ingestion source config `include_metastore` is now disabled by default. This change will affect the urns of all entities in the workspace.<br/>
  Entity Hierarchy with `include_metastore: true` (Old)

  ```
  - UC Metastore
    - Catalog
@@ -20,15 +22,18 @@ This file documents any backwards-incompatible changes in DataHub and assists pe
  ```

  Entity Hierarchy with `include_metastore: false` (New)

  ```
  - Catalog
    - Schema
      - Table
  ```

  We recommend using `platform_instance` for differentiating across metastores.

  If stateful ingestion is enabled, running ingestion with the latest CLI version will perform all required cleanup. Otherwise, we recommend soft deleting all Databricks data via the DataHub CLI:
  `datahub delete --platform databricks --soft` and then reingesting with the latest CLI version.

- #9601 - The Unity Catalog (UC) ingestion source config `include_hive_metastore` is now enabled by default. This requires the config `warehouse_id` to be set. You can disable `include_hive_metastore` by setting it to `False` to avoid ingesting the legacy Hive metastore catalog in Databricks.

### Potential Downtime
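To make the `platform_instance` recommendation above concrete: a minimal, hypothetical recipe for ingesting one metastore, written as the kind of config dict that the ingestion tests later in this diff pass to `Pipeline.create`. The `unity-catalog` source type name and the `workspace_url`/`token` fields are assumptions for illustration; `include_metastore`, `include_hive_metastore`, `warehouse_id`, and `platform_instance` are the settings discussed above.

```python
# Sketch only: one recipe per metastore, using platform_instance to keep URNs distinct
# now that include_metastore defaults to False. Connection values are placeholders.
from datahub.ingestion.run.pipeline import Pipeline

config = {
    "run_id": "unity-catalog-metastore-a",
    "source": {
        "type": "unity-catalog",  # assumed source type name
        "config": {
            "workspace_url": "https://example.cloud.databricks.com",  # placeholder
            "token": "<databricks-personal-access-token>",  # placeholder
            "include_metastore": False,  # the new default described above
            "include_hive_metastore": True,  # new default; requires warehouse_id
            "warehouse_id": "<sql-warehouse-id>",
            "platform_instance": "metastore_a",  # differentiates metastores in URNs
        },
    },
    "sink": {"type": "datahub-rest", "config": {"server": "http://localhost:8080"}},
}

if __name__ == "__main__":
    Pipeline.create(config).run()
```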
@@ -22,7 +22,7 @@ If you're interested in a managed version, [Acryl Data](https://www.acryldata.io
| Linux | [Docker for Linux](https://docs.docker.com/desktop/install/linux-install/) and [Docker Compose](https://docs.docker.com/compose/install/linux/) |

- **Launch the Docker engine** from command line or the desktop app.
-- Ensure you have **Python 3.7+** installed & configured. (Check using `python3 --version`).
+- Ensure you have **Python 3.8+** installed & configured. (Check using `python3 --version`).

:::note Docker Resource Allocation
@@ -18,16 +18,10 @@ _version: str = package_metadata["__version__"]
_self_pin = f"=={_version}" if not _version.endswith("dev0") else ""


rest_common = {"requests", "requests_file"}

base_requirements = {
    # Compatibility.
-    "dataclasses>=0.6; python_version < '3.7'",
    "mypy_extensions>=0.4.3",
    f"acryl-datahub[datahub-rest]{_self_pin}",
    # Actual dependencies.
    "pydantic>=1.5.1",
    "apache-airflow >= 2.0.2",
    *rest_common,
}

plugins: Dict[str, Set[str]] = {
@@ -42,9 +36,8 @@ plugins: Dict[str, Set[str]] = {
    },
    "plugin-v1": set(),
    "plugin-v2": {
-        # The v2 plugin requires Python 3.8+.
        f"acryl-datahub[sql-parser]{_self_pin}",
-        "openlineage-airflow==1.2.0; python_version >= '3.8'",
+        "openlineage-airflow==1.2.0",
    },
}

@@ -144,7 +137,6 @@ setuptools.setup(
    "Programming Language :: Python",
    "Programming Language :: Python :: 3",
    "Programming Language :: Python :: 3 :: Only",
-    "Programming Language :: Python :: 3.7",
    "Programming Language :: Python :: 3.8",
    "Programming Language :: Python :: 3.9",
    "Programming Language :: Python :: 3.10",
@@ -161,7 +153,7 @@ setuptools.setup(
    ],
    # Package info.
    zip_safe=False,
-    python_requires=">=3.7",
+    python_requires=">=3.8",
    package_data={
        "datahub_airflow_plugin": ["py.typed"],
    },
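For context on the lines deleted above: suffixes such as `; python_version >= '3.8'` and `; python_version < '3.7'` are PEP 508 environment markers that pip evaluates against the running interpreter, and they become redundant once `python_requires=">=3.8"` rules out older interpreters entirely. A small sketch of how such a marker evaluates, using the third-party `packaging` library (illustrative only, not something this commit touches):

```python
# Evaluate a PEP 508 environment marker against the current interpreter.
# Requires `pip install packaging`; illustrative, not part of the commit.
from packaging.requirements import Requirement

req = Requirement("openlineage-airflow==1.2.0; python_version >= '3.8'")
print(req.name)               # openlineage-airflow
print(str(req.specifier))     # ==1.2.0
print(req.marker.evaluate())  # True on Python 3.8+, False on 3.7
```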
@@ -1,7 +1,6 @@
import datetime
import json
import os
-import sys
from contextlib import contextmanager
from typing import Iterator
from unittest import mock
@@ -318,8 +317,7 @@ def test_lineage_backend(mock_emit, inlets, outlets, capture_executions):
# Check that the right things were emitted.
assert mock_emitter.emit.call_count == 17 if capture_executions else 9

-# Running further checks based on python version because args only exists in python 3.8+
-if sys.version_info > (3, 8):
# TODO: Replace this with a golden file-based comparison.
assert mock_emitter.method_calls[0].args[0].aspectName == "dataFlowInfo"
assert (
mock_emitter.method_calls[0].args[0].entityUrn
@@ -344,9 +342,7 @@ def test_lineage_backend(mock_emit, inlets, outlets, capture_executions):
== "urn:li:dataJob:(urn:li:dataFlow:(airflow,test_lineage_is_sent_to_backend,prod),task2)"
)

-assert (
-    mock_emitter.method_calls[4].args[0].aspectName == "dataJobInputOutput"
-)
+assert mock_emitter.method_calls[4].args[0].aspectName == "dataJobInputOutput"
assert (
mock_emitter.method_calls[4].args[0].entityUrn
== "urn:li:dataJob:(urn:li:dataFlow:(airflow,test_lineage_is_sent_to_backend,prod),task2)"
@@ -17,7 +17,7 @@ def get_coverage_arg(test_name) {

task checkPythonVersion(type: Exec) {
  commandLine python_executable, '-c',
-    'import sys; assert (3, 11) > sys.version_info >= (3, 7), f"Python version {sys.version_info[:2]} not allowed"'
+    'import sys; assert (3, 11) > sys.version_info >= (3, 8), f"Python version {sys.version_info[:2]} not allowed"'
}

task environmentSetup(type: Exec, dependsOn: checkPythonVersion) {
@@ -3,14 +3,16 @@
## Installing the CLI

Make sure you have installed DataHub CLI before following this guide.

```shell
-# Requires Python 3.7+
+# Requires Python 3.8+
python3 -m pip install --upgrade pip wheel setuptools
python3 -m pip install --upgrade acryl-datahub
# validate that the install was successful
datahub version
# If you see "command not found", try running this instead: python3 -m datahub version
```

Check out the [CLI Installation Guide](../docs/cli.md#installation) for more installation options and troubleshooting tips.

After that, install the required plugin for the ingestion.
@@ -18,10 +20,13 @@ After that, install the required plugin for the ingestion.
```shell
pip install 'acryl-datahub[datahub-rest]'  # install the required plugin
```

Check out the [alternative installation options](../docs/cli.md#alternate-installation-options) for more reference.

## Configuring a Recipe

Create a recipe.yml file that defines the source and sink for metadata, as shown below.

```yaml
# my_recipe.yml
source:
@@ -39,6 +44,7 @@ sink:
For more information and examples on configuring recipes, please refer to [Recipes](recipe_overview.md).

## Ingesting Metadata

You can run ingestion using `datahub ingest` like below.

```shell
@@ -48,6 +54,7 @@ datahub ingest -c <path_to_recipe_file.yml>
## Reference

Please refer to the following pages for advanced guides on CLI ingestion.

- [Reference for `datahub ingest` command](../docs/cli.md#ingest)
- [UI Ingestion Guide](../docs/ui-ingestion.md)
@@ -9,10 +9,10 @@ Also take a look at the guide to [adding a source](./adding-source.md).

### Requirements

-1. Python 3.7+ must be installed in your host environment.
+1. Python 3.8+ must be installed in your host environment.
2. Java 17 (gradle won't work with newer or older versions)
-4. On Debian/Ubuntu: `sudo apt install python3-dev python3-venv`
-5. On Fedora (if using LDAP source integration): `sudo yum install openldap-devel`
+3. On Debian/Ubuntu: `sudo apt install python3-dev python3-venv`
+4. On Fedora (if using LDAP source integration): `sudo yum install openldap-devel`

### Set up your Python environment
@@ -1,4 +1,3 @@
-import sys
from typing import Dict, Set

import setuptools
@@ -11,7 +10,6 @@ with open("./src/datahub/__init__.py") as fp:
base_requirements = {
    # Typing extension should be >=3.10.0.2 ideally but we can't restrict due to a Airflow 2.1 dependency conflict.
    "typing_extensions>=3.7.4.3",
    "mypy_extensions>=0.4.3",
    # Actual dependencies.
    "typing-inspect",
    # pydantic 1.8.2 is incompatible with mypy 0.910.
@@ -48,9 +46,7 @@ framework_common = {
    "click-spinner",
    "requests_file",
    "jsonref",
-    # jsonschema drops python 3.7 support in v4.18.0
-    "jsonschema<=4.17.3; python_version < '3.8'",
-    "jsonschema; python_version >= '3.8'",
+    "jsonschema",
    "ruamel.yaml",
}

@@ -463,7 +459,7 @@ base_dev_requirements = {
    "black==22.12.0",
    "coverage>=5.1",
    "faker>=18.4.0",
-    "flake8>=3.8.3",  # DEPRECATION: Once we drop Python 3.7, we can pin to 6.x.
+    "flake8>=6.0.0",
    "flake8-tidy-imports>=4.3.0",
    "flake8-bugbear==23.3.12",
    "isort>=5.7.0",
@@ -489,9 +485,9 @@ base_dev_requirements = {
    "delta-lake",
    "druid",
    "elasticsearch",
-    "feast" if sys.version_info >= (3, 8) else None,
-    "iceberg" if sys.version_info >= (3, 8) else None,
-    "mlflow" if sys.version_info >= (3, 8) else None,
+    "feast",
+    "iceberg",
+    "mlflow",
    "json-schema",
    "ldap",
    "looker",
@@ -544,14 +540,14 @@ full_test_dev_requirements = {
    "clickhouse",
    "delta-lake",
    "druid",
-    "feast" if sys.version_info >= (3, 8) else None,
+    "feast",
    "hana",
    "hive",
-    "iceberg" if sys.version_info >= (3, 8) else None,
+    "iceberg",
    "kafka-connect",
    "ldap",
    "mongodb",
-    "mssql" if sys.version_info >= (3, 8) else None,
+    "mssql",
    "mysql",
    "mariadb",
    "redash",
@@ -699,7 +695,6 @@ See the [DataHub docs](https://datahubproject.io/docs/metadata-ingestion).
    "Programming Language :: Python",
    "Programming Language :: Python :: 3",
    "Programming Language :: Python :: 3 :: Only",
-    "Programming Language :: Python :: 3.7",
    "Programming Language :: Python :: 3.8",
    "Programming Language :: Python :: 3.9",
    "Programming Language :: Python :: 3.10",
@@ -716,7 +711,7 @@ See the [DataHub docs](https://datahubproject.io/docs/metadata-ingestion).
    ],
    # Package info.
    zip_safe=False,
-    python_requires=">=3.7",
+    python_requires=">=3.8",
    package_dir={"": "src"},
    packages=setuptools.find_namespace_packages(where="./src"),
    package_data={
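The `"feast" if sys.version_info >= (3, 8) else None` entries deleted above only existed so that 3.8-only plugins could be skipped on Python 3.7, which meant a `None` placeholder had to be tolerated or filtered somewhere downstream. A standalone sketch of that before/after pattern (the variable name and the `discard` step are illustrative, not the actual setup.py code):

```python
import sys

# Before the 3.8 floor: conditional members left None placeholders behind.
dev_plugins = {
    "feast" if sys.version_info >= (3, 8) else None,
    "mlflow" if sys.version_info >= (3, 8) else None,
    "mysql",
}
dev_plugins.discard(None)  # the placeholder had to be stripped on Python 3.7

# After the 3.8 floor: the same set can be written unconditionally.
dev_plugins = {"feast", "mlflow", "mysql"}
print(sorted(dev_plugins))
```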
@@ -16,16 +16,9 @@ def nice_version_name() -> str:
    return __version__


-if sys.version_info < (3, 7):
+if sys.version_info < (3, 8):
    warnings.warn(
-        "DataHub requires Python 3.7 or newer. "
-        "Please upgrade your Python version to continue using DataHub.",
-        FutureWarning,
-        stacklevel=2,
-    )
-elif sys.version_info < (3, 8):
-    warnings.warn(
-        "DataHub will require Python 3.8 or newer soon. "
+        "DataHub requires Python 3.8 or newer. "
        "Please upgrade your Python version to continue using DataHub.",
        FutureWarning,
        stacklevel=2,
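The guard above is also a handy ad hoc check before upgrading the CLI on an existing environment; a minimal standalone sketch (illustrative, not part of the commit):

```python
# Pre-upgrade sanity check mirroring the guard above (illustrative only).
import sys

if sys.version_info < (3, 8):
    raise SystemExit(
        f"Python {sys.version_info[0]}.{sys.version_info[1]} detected; "
        "the acryl-datahub CLI now requires Python 3.8 or newer."
    )
print("Python version is new enough for the acryl-datahub CLI.")
```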
@@ -2,11 +2,10 @@ import dataclasses
import json
import logging
import pprint
-import sys
from dataclasses import dataclass
from datetime import datetime, timedelta
from enum import Enum
-from typing import Any, Dict, Optional
+from typing import Any, Optional

import humanfriendly
import pydantic
@@ -19,12 +18,6 @@ from datahub.utilities.lossy_collections import LossyList
logger = logging.getLogger(__name__)
LogLevel = Literal["ERROR", "WARNING", "INFO", "DEBUG"]

-# The sort_dicts option was added in Python 3.8.
-if sys.version_info >= (3, 8):
-    PPRINT_OPTIONS = {"sort_dicts": False}
-else:
-    PPRINT_OPTIONS: Dict = {}


@runtime_checkable
class SupportsAsObj(Protocol):
@@ -32,14 +25,6 @@ class SupportsAsObj(Protocol):
        ...


-def _stacklevel_if_supported(level: int) -> dict:
-    # The logging module added support for stacklevel in Python 3.8.
-    if sys.version_info >= (3, 8):
-        return {"stacklevel": level}
-    else:
-        return {}


@dataclass
class Report(SupportsAsObj):
    @staticmethod
@@ -95,7 +80,7 @@ class Report(SupportsAsObj):
        }

    def as_string(self) -> str:
-        return pprint.pformat(self.as_obj(), width=150, **PPRINT_OPTIONS)
+        return pprint.pformat(self.as_obj(), width=150, sort_dicts=False)

    def as_json(self) -> str:
        return json.dumps(self.as_obj())
@@ -118,7 +103,7 @@ class ReportAttribute(BaseModel):
        return log_levels[self.severity]

    def log(self, msg: str) -> None:
-        logger.log(level=self.logger_sev, msg=msg, **_stacklevel_if_supported(3))
+        logger.log(level=self.logger_sev, msg=msg, stacklevel=3)


class EntityFilterReport(ReportAttribute):
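Both helpers removed above existed only because `pprint`'s `sort_dicts` argument and `logging`'s `stacklevel` argument were introduced in Python 3.8; with 3.8 as the floor they can be passed directly, as the new lines do. A self-contained sketch of the two arguments (illustrative, separate from the report classes):

```python
# Illustrates the two Python 3.8+ keyword arguments that made the shims unnecessary.
import logging
import pprint

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("report-demo")

data = {"zebra": 1, "apple": 2}
# sort_dicts=False preserves insertion order instead of sorting keys alphabetically.
print(pprint.pformat(data, width=150, sort_dicts=False))


def log_from_helper(msg: str) -> None:
    # stacklevel=2 attributes the record to this helper's caller, not the helper itself.
    logger.log(level=logging.INFO, msg=msg, stacklevel=2)


log_from_helper("emitted with the caller's file and line number")
```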
@@ -1,8 +1,3 @@
-import sys
-
-if sys.version_info < (3, 8):
-    raise ImportError("Feast is only supported on Python 3.8+")
-
from dataclasses import dataclass
from typing import Dict, Iterable, List, Optional, Tuple, Union
@@ -1,8 +1,3 @@
-import sys
-
-if sys.version_info < (3, 8):
-    raise ImportError("Iceberg is only supported on Python 3.8+")
-
import json
import logging
import uuid
@@ -1,9 +1,3 @@
-import sys
-
-if sys.version_info < (3, 8):
-    raise ImportError("MLflow is only supported on Python 3.8+")
-
-
from dataclasses import dataclass
from typing import Any, Callable, Iterable, Optional, TypeVar, Union
@@ -1,7 +1,7 @@
from collections import Counter
from typing import Any, Counter as CounterType, Dict, Sequence, Tuple, Union

-from mypy_extensions import TypedDict
+from typing_extensions import TypedDict


class BasicSchemaDescription(TypedDict):
@@ -1,6 +1,3 @@
-import sys
-
import pytest
from freezegun import freeze_time

from datahub.ingestion.run.pipeline import Pipeline
@@ -8,10 +5,6 @@ from tests.test_helpers import mce_helpers

FROZEN_TIME = "2020-04-14 07:00:00"

-pytestmark = pytest.mark.skipif(
-    sys.version_info < (3, 8), reason="requires python 3.8 or higher"
-)
-

@freeze_time(FROZEN_TIME)
def test_feast_repository_ingest(pytestconfig, tmp_path, mock_time):
@@ -1,5 +1,4 @@
import subprocess
-import sys
from typing import Any, Dict, List
from unittest.mock import patch

@@ -15,13 +14,7 @@ from tests.test_helpers.state_helpers import (
    validate_all_providers_have_committed_successfully,
)

-pytestmark = [
-    pytest.mark.integration_batch_1,
-    # Skip tests if not on Python 3.8 or higher.
-    pytest.mark.skipif(
-        sys.version_info < (3, 8), reason="Requires python 3.8 or higher"
-    ),
-]
+pytestmark = pytest.mark.integration_batch_1
FROZEN_TIME = "2020-04-14 07:00:00"
GMS_PORT = 8080
GMS_SERVER = f"http://localhost:{GMS_PORT}"
@@ -1,27 +1,27 @@
-import sys
from pathlib import Path
from typing import Any, Dict, TypeVar

-if sys.version_info >= (3, 8):
from pathlib import Path
from typing import Any, Dict, TypeVar
import pytest
from mlflow import MlflowClient

import pytest
from mlflow import MlflowClient
from datahub.ingestion.run.pipeline import Pipeline
from tests.test_helpers import mce_helpers

from datahub.ingestion.run.pipeline import Pipeline
from tests.test_helpers import mce_helpers
T = TypeVar("T")

T = TypeVar("T")

@pytest.fixture
def tracking_uri(tmp_path: Path) -> str:
@pytest.fixture
def tracking_uri(tmp_path: Path) -> str:
return str(tmp_path / "mlruns")

@pytest.fixture
def sink_file_path(tmp_path: Path) -> str:

@pytest.fixture
def sink_file_path(tmp_path: Path) -> str:
return str(tmp_path / "mlflow_source_mcps.json")

@pytest.fixture
def pipeline_config(tracking_uri: str, sink_file_path: str) -> Dict[str, Any]:

@pytest.fixture
def pipeline_config(tracking_uri: str, sink_file_path: str) -> Dict[str, Any]:
source_type = "mlflow"
return {
"run_id": "mlflow-source-test",
@@ -39,8 +39,9 @@ if sys.version_info >= (3, 8):
},
}

@pytest.fixture
def generate_mlflow_data(tracking_uri: str) -> None:

@pytest.fixture
def generate_mlflow_data(tracking_uri: str) -> None:
client = MlflowClient(tracking_uri=tracking_uri)
experiment_name = "test-experiment"
run_name = "test-run"
@@ -80,13 +81,14 @@ if sys.version_info >= (3, 8):
stage="Archived",
)

def test_ingestion(

def test_ingestion(
pytestconfig,
mock_time,
sink_file_path,
pipeline_config,
generate_mlflow_data,
):
):
print(f"MCPs file path: {sink_file_path}")
golden_file_path = (
pytestconfig.rootpath / "tests/integration/mlflow/mlflow_mcps_golden.json"
@@ -1,6 +1,5 @@
import os
import subprocess
-import sys
import time

import pytest
@@ -9,10 +8,6 @@ from tests.test_helpers import mce_helpers
from tests.test_helpers.click_helpers import run_datahub_cmd
from tests.test_helpers.docker_helpers import cleanup_image, wait_for_port

-pytestmark = pytest.mark.skipif(
-    sys.version_info < (3, 8), reason="requires python 3.8 or higher"
-)
-

@pytest.fixture(scope="module")
def mssql_runner(docker_compose_runner, pytestconfig):
@@ -1,14 +1,11 @@
-import sys
import uuid
from decimal import Decimal
from typing import Any, Optional

import pytest
from pydantic import ValidationError

-if sys.version_info >= (3, 8):
from pyiceberg.schema import Schema
from pyiceberg.types import (
from pyiceberg.schema import Schema
from pyiceberg.types import (
BinaryType,
BooleanType,
DateType,
@@ -29,17 +26,17 @@ if sys.version_info >= (3, 8):
TimestamptzType,
TimeType,
UUIDType,
)
)

from datahub.ingestion.api.common import PipelineContext
from datahub.ingestion.source.iceberg.iceberg import (
from datahub.ingestion.api.common import PipelineContext
from datahub.ingestion.source.iceberg.iceberg import (
IcebergProfiler,
IcebergSource,
IcebergSourceConfig,
)
from datahub.ingestion.source.iceberg.iceberg_common import IcebergCatalogConfig
from datahub.metadata.com.linkedin.pegasus2avro.schema import ArrayType, SchemaField
from datahub.metadata.schema_classes import (
)
from datahub.ingestion.source.iceberg.iceberg_common import IcebergCatalogConfig
from datahub.metadata.com.linkedin.pegasus2avro.schema import ArrayType, SchemaField
from datahub.metadata.schema_classes import (
ArrayTypeClass,
BooleanTypeClass,
BytesTypeClass,
@@ -49,13 +46,10 @@ if sys.version_info >= (3, 8):
RecordTypeClass,
StringTypeClass,
TimeTypeClass,
)
)

-pytestmark = pytest.mark.skipif(
-    sys.version_info < (3, 8), reason="requires python 3.8 or higher"
-)

def with_iceberg_source() -> IcebergSource:
def with_iceberg_source() -> IcebergSource:
catalog: IcebergCatalogConfig = IcebergCatalogConfig(
name="test", type="rest", config={}
)
@@ -64,18 +58,20 @@ if sys.version_info >= (3, 8):
config=IcebergSourceConfig(catalog=catalog),
)

def with_iceberg_profiler() -> IcebergProfiler:

def with_iceberg_profiler() -> IcebergProfiler:
iceberg_source_instance = with_iceberg_source()
return IcebergProfiler(
iceberg_source_instance.report, iceberg_source_instance.config.profiling
)

def assert_field(

def assert_field(
schema_field: SchemaField,
expected_description: Optional[str],
expected_nullable: bool,
expected_type: Any,
) -> None:
) -> None:
assert (
schema_field.description == expected_description
), f"Field description '{schema_field.description}' is different from expected description '{expected_description}'"
@@ -86,14 +82,16 @@ if sys.version_info >= (3, 8):
schema_field.type.type, expected_type
), f"Field type {schema_field.type.type} is different from expected type {expected_type}"

def test_config_no_catalog():

def test_config_no_catalog():
"""
Test when no Iceberg catalog is provided.
"""
with pytest.raises(ValidationError, match="catalog"):
IcebergSourceConfig()  # type: ignore

def test_config_catalog_not_configured():

def test_config_catalog_not_configured():
"""
Test when an Iceberg catalog is provided, but not properly configured.
"""
@@ -106,13 +104,15 @@ if sys.version_info >= (3, 8):
with pytest.raises(ValidationError, match="type"):
IcebergCatalogConfig(conf={})  # type: ignore

def test_config_for_tests():

def test_config_for_tests():
"""
Test valid iceberg source that will be used in unit tests.
"""
with_iceberg_source()

@pytest.mark.parametrize(

@pytest.mark.parametrize(
"iceberg_type, expected_schema_field_type",
[
(BinaryType(), BytesTypeClass),
@@ -142,10 +142,10 @@ if sys.version_info >= (3, 8):
StringTypeClass,
),
],
)
def test_iceberg_primitive_type_to_schema_field(
)
def test_iceberg_primitive_type_to_schema_field(
iceberg_type: PrimitiveType, expected_schema_field_type: Any
) -> None:
) -> None:
"""
Test converting a primitive typed Iceberg field to a SchemaField
"""
@@ -159,9 +159,7 @@ if sys.version_info >= (3, 8):
),
]:
schema = Schema(column)
-schema_fields = iceberg_source_instance._get_schema_fields_for_schema(
-    schema
-)
+schema_fields = iceberg_source_instance._get_schema_fields_for_schema(schema)
assert (
len(schema_fields) == 1
), f"Expected 1 field, but got {len(schema_fields)}"
@@ -172,7 +170,8 @@ if sys.version_info >= (3, 8):
expected_schema_field_type,
)

@pytest.mark.parametrize(

@pytest.mark.parametrize(
"iceberg_type, expected_array_nested_type",
[
(BinaryType(), "bytes"),
@@ -202,10 +201,10 @@ if sys.version_info >= (3, 8):
"uuid",
),
],
)
def test_iceberg_list_to_schema_field(
)
def test_iceberg_list_to_schema_field(
iceberg_type: PrimitiveType, expected_array_nested_type: Any
) -> None:
) -> None:
"""
Test converting a list typed Iceberg field to an ArrayType SchemaField, including the list nested type.
"""
@@ -241,9 +240,7 @@ if sys.version_info >= (3, 8):
]:
iceberg_source_instance = with_iceberg_source()
schema = Schema(list_column)
-schema_fields = iceberg_source_instance._get_schema_fields_for_schema(
-    schema
-)
+schema_fields = iceberg_source_instance._get_schema_fields_for_schema(schema)
assert (
len(schema_fields) == 1
), f"Expected 1 field, but got {len(schema_fields)}"
@@ -258,7 +255,8 @@ if sys.version_info >= (3, 8):
expected_array_nested_type
], f"List Field nested type {arrayType.nestedType} was expected to be {expected_array_nested_type}"

@pytest.mark.parametrize(

@pytest.mark.parametrize(
"iceberg_type, expected_map_type",
[
(BinaryType(), BytesTypeClass),
@@ -288,10 +286,10 @@ if sys.version_info >= (3, 8):
StringTypeClass,
),
],
)
def test_iceberg_map_to_schema_field(
)
def test_iceberg_map_to_schema_field(
iceberg_type: PrimitiveType, expected_map_type: Any
) -> None:
) -> None:
"""
Test converting a map typed Iceberg field to a MapType SchemaField, where the key is the same type as the value.
"""
@@ -327,9 +325,7 @@ if sys.version_info >= (3, 8):
]:
iceberg_source_instance = with_iceberg_source()
schema = Schema(map_column)
-schema_fields = iceberg_source_instance._get_schema_fields_for_schema(
-    schema
-)
+schema_fields = iceberg_source_instance._get_schema_fields_for_schema(schema)
# Converting an Iceberg Map type will be done by creating an array of struct(key, value) records.
# The first field will be the array.
assert (
@@ -350,7 +346,8 @@ if sys.version_info >= (3, 8):
expected_map_type,
)

@pytest.mark.parametrize(

@pytest.mark.parametrize(
"iceberg_type, expected_schema_field_type",
[
(BinaryType(), BytesTypeClass),
@@ -380,10 +377,10 @@ if sys.version_info >= (3, 8):
StringTypeClass,
),
],
)
def test_iceberg_struct_to_schema_field(
)
def test_iceberg_struct_to_schema_field(
iceberg_type: PrimitiveType, expected_schema_field_type: Any
) -> None:
) -> None:
"""
Test converting a struct typed Iceberg field to a RecordType SchemaField.
"""
@@ -394,9 +391,7 @@ if sys.version_info >= (3, 8):
iceberg_source_instance = with_iceberg_source()
schema = Schema(struct_column)
schema_fields = iceberg_source_instance._get_schema_fields_for_schema(schema)
-assert (
-    len(schema_fields) == 2
-), f"Expected 2 fields, but got {len(schema_fields)}"
+assert len(schema_fields) == 2, f"Expected 2 fields, but got {len(schema_fields)}"
assert_field(
schema_fields[0], struct_column.doc, struct_column.optional, RecordTypeClass
)
@@ -404,7 +399,8 @@ if sys.version_info >= (3, 8):
schema_fields[1], field1.doc, field1.optional, expected_schema_field_type
)

@pytest.mark.parametrize(

@pytest.mark.parametrize(
"value_type, value, expected_value",
[
(BinaryType(), bytes([1, 2, 3, 4, 5]), "b'\\x01\\x02\\x03\\x04\\x05'"),
@@ -434,17 +430,18 @@ if sys.version_info >= (3, 8):
"00010203-0405-0607-0809-0a0b0c0d0e0f",
),
],
)
def test_iceberg_profiler_value_render(
)
def test_iceberg_profiler_value_render(
value_type: IcebergType, value: Any, expected_value: Optional[str]
) -> None:
) -> None:
iceberg_profiler_instance = with_iceberg_profiler()
assert (
iceberg_profiler_instance._render_value("a.dataset", value_type, value)
== expected_value
)

def test_avro_decimal_bytes_nullable() -> None:

def test_avro_decimal_bytes_nullable() -> None:
"""
The following test exposes a problem with decimal (bytes) not preserving extra attributes like _nullable. Decimal (fixed) and Boolean for example do.
NOTE: This bug was by-passed by mapping the Decimal type to fixed instead of bytes.
@@ -457,9 +454,7 @@ if sys.version_info >= (3, 8):
print(
f"Original avro schema string: {decimal_avro_schema_string}"
)
-print(
-    f"After avro parsing, _nullable attribute is missing: {decimal_avro_schema}"
-)
+print(f"After avro parsing, _nullable attribute is missing: {decimal_avro_schema}")

decimal_fixed_avro_schema_string = """{"type": "record", "name": "__struct_", "fields": [{"type": {"type": "fixed", "logicalType": "decimal", "precision": 3, "scale": 2, "native_data_type": "decimal(3, 2)", "_nullable": false, "name": "bogusName", "size": 16}, "name": "required_field", "doc": "required field documentation"}]}"""
decimal_fixed_avro_schema = avro.schema.parse(decimal_fixed_avro_schema_string)
@@ -1,42 +1,43 @@
-import sys
import datetime
from pathlib import Path
from typing import Any, TypeVar, Union

-if sys.version_info >= (3, 8):
import datetime
from pathlib import Path
from typing import Any, TypeVar, Union
import pytest
from mlflow import MlflowClient
from mlflow.entities.model_registry import RegisteredModel
from mlflow.entities.model_registry.model_version import ModelVersion
from mlflow.store.entities import PagedList

import pytest
from mlflow import MlflowClient
from mlflow.entities.model_registry import RegisteredModel
from mlflow.entities.model_registry.model_version import ModelVersion
from mlflow.store.entities import PagedList
from datahub.ingestion.api.common import PipelineContext
from datahub.ingestion.source.mlflow import MLflowConfig, MLflowSource

from datahub.ingestion.api.common import PipelineContext
from datahub.ingestion.source.mlflow import MLflowConfig, MLflowSource
T = TypeVar("T")

T = TypeVar("T")

@pytest.fixture
def tracking_uri(tmp_path: Path) -> str:
@pytest.fixture
def tracking_uri(tmp_path: Path) -> str:
return str(tmp_path / "mlruns")

@pytest.fixture
def source(tracking_uri: str) -> MLflowSource:

@pytest.fixture
def source(tracking_uri: str) -> MLflowSource:
return MLflowSource(
ctx=PipelineContext(run_id="mlflow-source-test"),
config=MLflowConfig(tracking_uri=tracking_uri),
)

@pytest.fixture
def registered_model(source: MLflowSource) -> RegisteredModel:

@pytest.fixture
def registered_model(source: MLflowSource) -> RegisteredModel:
model_name = "abc"
return RegisteredModel(name=model_name)

@pytest.fixture
def model_version(

@pytest.fixture
def model_version(
source: MLflowSource,
registered_model: RegisteredModel,
) -> ModelVersion:
) -> ModelVersion:
version = "1"
return ModelVersion(
name=registered_model.name,
@@ -44,7 +45,8 @@ if sys.version_info >= (3, 8):
creation_timestamp=datetime.datetime.now(),
)

def dummy_search_func(page_token: Union[None, str], **kwargs: Any) -> PagedList[T]:

def dummy_search_func(page_token: Union[None, str], **kwargs: Any) -> PagedList[T]:
dummy_pages = dict(
page_1=PagedList(items=["a", "b"], token="page_2"),
page_2=PagedList(items=["c", "d"], token="page_3"),
@@ -61,7 +63,8 @@ if sys.version_info >= (3, 8):
)
return page_to_return

def test_stages(source):

def test_stages(source):
mlflow_registered_model_stages = {
"Production",
"Staging",
@@ -76,7 +79,8 @@ if sys.version_info >= (3, 8):
"mlflow_" + str(stage).lower() for stage in mlflow_registered_model_stages
}

def test_config_model_name_separator(source, model_version):

def test_config_model_name_separator(source, model_version):
name_version_sep = "+"
source.config.model_name_separator = name_version_sep
expected_model_name = (
@@ -88,7 +92,8 @@ if sys.version_info >= (3, 8):

assert urn == expected_urn

def test_model_without_run(source, registered_model, model_version):

def test_model_without_run(source, registered_model, model_version):
run = source._get_mlflow_run(model_version)
wu = source._get_ml_model_properties_workunit(
registered_model=registered_model,
@@ -100,30 +105,32 @@ if sys.version_info >= (3, 8):
assert aspect.hyperParams is None
assert aspect.trainingMetrics is None

def test_traverse_mlflow_search_func(source):

def test_traverse_mlflow_search_func(source):
expected_items = ["a", "b", "c", "d", "e"]

items = list(source._traverse_mlflow_search_func(dummy_search_func))

assert items == expected_items

def test_traverse_mlflow_search_func_with_kwargs(source):

def test_traverse_mlflow_search_func_with_kwargs(source):
expected_items = ["A", "B", "C", "D", "E"]

-items = list(
-    source._traverse_mlflow_search_func(dummy_search_func, case="upper")
-)
+items = list(source._traverse_mlflow_search_func(dummy_search_func, case="upper"))

assert items == expected_items

def test_make_external_link_local(source, model_version):

def test_make_external_link_local(source, model_version):
expected_url = None

url = source._make_external_url(model_version)

assert url == expected_url

def test_make_external_link_remote(source, model_version):

def test_make_external_link_remote(source, model_version):
tracking_uri_remote = "https://dummy-mlflow-tracking-server.org"
source.client = MlflowClient(tracking_uri=tracking_uri_remote)
expected_url = f"{tracking_uri_remote}/#/models/{model_version.name}/versions/{model_version.version}"