Fixes #7509 - implement import path migration script (#7515)

* Added script to migrate imports

* Added tests

* Updated doc to include info about import migration

* Renamed test resource files
Teddy 2022-09-22 14:43:25 +02:00 committed by GitHub
parent d1a3fa4928
commit c34281251f
7 changed files with 163 additions and 2 deletions


@@ -0,0 +1,62 @@
# Copyright 2021 Collate
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
utility to update import for migration from v0.11.5 to 0.12
"""
import os
from metadata.utils.logger import cli_logger
logger = cli_logger()
V115_IMPORT_STRING = "from openmetadata."
V12_IMPORT_STRING = "from openmetadata_managed_apis."
def run_openmetadata_imports_migration(dir_path: str) -> None:
"""Given a path to the DAG folder we'll look for openmetadata import and update the package to
`openmetadata_managed_apis`
Args:
dir_path (str): path to the DAG folder
"""
if not os.path.isdir(dir_path):
logger.error(f"{dir_path} is not a valid directory")
raise ValueError
for root, _, filenames in os.walk(dir_path):
logger.info(
f"{len(filenames)} files found in `{root}`."
"\nChecking for imports in the following files:\n\t{file_list}".format(
file_list="\n\t".join(filenames)
)
)
for filename in filenames:
logger.info(f"Checking imports in {filename}")
if os.path.splitext(filename)[1] == ".py":
with open(
os.path.join(root, filename), "r", encoding="utf-8"
) as dag_fle:
fle_data = dag_fle.read()
if V115_IMPORT_STRING in fle_data:
fle_data = fle_data.replace(V115_IMPORT_STRING, V12_IMPORT_STRING)
with open(
os.path.join(root, filename), "w", encoding="utf-8"
) as dag_file:
dag_file.write(fle_data)
logger.info(
f"Imports found in {filename}. Replaced `{V115_IMPORT_STRING}` with `{V12_IMPORT_STRING}`"
)
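
For illustration only (not part of the commit): the migration is a plain substring replacement, so only `from openmetadata.` imports are rewritten and everything else in the file is left untouched. A minimal sketch of the effect on one generated DAG line:

```python
# Minimal sketch of the substring swap the script applies to each .py file.
V115_IMPORT_STRING = "from openmetadata."
V12_IMPORT_STRING = "from openmetadata_managed_apis."

line = "from openmetadata.workflows import workflow_factory"
migrated = line.replace(V115_IMPORT_STRING, V12_IMPORT_STRING)
assert migrated == "from openmetadata_managed_apis.workflows import workflow_factory"
```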


@@ -23,6 +23,9 @@ from metadata.__version__ import get_metadata_version
from metadata.cli.backup import run_backup
from metadata.cli.docker import BACKEND_DATABASES, run_docker
from metadata.cli.ingest import run_ingest
from metadata.cli.openmetadata_imports_migration import (
    run_openmetadata_imports_migration,
)
from metadata.config.common import load_config_file
from metadata.orm_profiler.api.workflow import ProfilerWorkflow
from metadata.test_suite.api.workflow import TestSuiteWorkflow
@@ -324,4 +327,20 @@ def backup(
    )


@metadata.command()
@click.option("-d", "--dir-path", default="/ingestion/examples/airflow/dags")
def openmetadata_imports_migration(
    dir_path: str,
) -> None:
    """Update DAG files generated when creating a workflow in 0.11 and before.

    In 0.12 the Airflow managed API package name changed from `openmetadata`
    to `openmetadata_managed_apis`, hence breaking existing DAGs.

    Args:
        dir_path (str): path to the DAG folder
    """
    run_openmetadata_imports_migration(dir_path)


metadata.add_command(check)


@@ -0,0 +1,10 @@
"""
This file has been generated from dag_runner.j2
"""
from airflow import DAG
from openmetadata.workflows import workflow_factory
workflow = workflow_factory.WorkflowFactory.create(
    "/airflow/dag_generated_configs/local_redshift_profiler_e9AziRXs.json"
)
workflow.generate_dag(globals())


@@ -0,0 +1,8 @@
"""
This file has been generated from dag_runner.j2
"""
from airflow import DAG
from openmetadata.workflows import workflow_factory
workflow = workflow_factory.WorkflowFactory.create("/airflow/dag_generated_configs/local_redshift_profiler_e9AziRXs.json")
workflow.generate_dag(globals())
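
Both resource files above are pre-migration fixtures; after the script runs, their import line should read as follows (shown here for reference, not part of the diff):

```python
# Expected post-migration import in both generated DAG fixtures.
from openmetadata_managed_apis.workflows import workflow_factory
```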


@@ -0,0 +1,58 @@
# Copyright 2021 Collate
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
test import migration cli script
"""
import os
from unittest import TestCase
from metadata.cli.openmetadata_imports_migration import (
run_openmetadata_imports_migration,
)
class TestOpenmetadataImportsMigration(TestCase):
"""Test class for the cli scrip test"""
store = dict()
path_to_ress_dir = os.path.join(os.path.dirname(__file__), "resources")
@classmethod
def setUpClass(cls) -> None:
for root, _, filenames in os.walk(cls.path_to_ress_dir):
for filename in filenames:
with open(os.path.join(root, filename), "r", encoding="utf-8") as fle:
cls.store[os.path.join(root, filename)] = fle.read()
def test_run_openmetadata_imports_migration(self):
"""test the run openmetadata function"""
run_openmetadata_imports_migration(self.path_to_ress_dir)
failures = []
for root, _, filenames in os.walk(self.path_to_ress_dir):
for filename in filenames:
if os.path.splitext(filename)[1] == ".py":
with open(
os.path.join(root, filename), "r", encoding="utf-8"
) as fle:
data = fle.read()
if "from openmetadata_managed_apis." not in data:
failures.append(filename)
assert not failures
@classmethod
def tearDownClass(cls) -> None:
for file_path, file_content in cls.store.items():
with open(file_path, "w", encoding="utf-8") as fle:
fle.write(file_content)
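
To run this test in isolation, something like the following should work; the file path is an assumption, as the diff does not show file names:

```bash
# Path is illustrative; adjust to the actual location of the test module.
python -m pytest ingestion/tests/unit/test_openmetadata_imports_migration.py
```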


@@ -41,7 +41,8 @@ internals as well. Main topics to consider:
1. Tests now run with the Test Suite workflow and cannot be configured in the Profiler Workflow
2. Any past test data will be cleaned up during the upgrade to 0.12.0, as the internal data storage has been improved
3. The Profiler Ingestion Pipelines will be cleaned up during the upgrade to 0.12.0 as well
4. You will see broken profiler DAGs in Airflow; you can simply delete these DAGs
### DBT Tests Integration
@@ -60,7 +61,7 @@ In the `processor` you can now configure:
- `profileSample` to specify the % of the table to run the profiling on
- `columnConfig.profileQuery` as a query to use to sample the data of the table
- `columnConfig.excludeColumns` and `columnConfig.includeColumns` to mark which columns to skip.
- In `columnConfig.includeColumns` we can also specify a list of `metrics` to run from our supported metrics, as sketched below.
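
As a sketch only, a profiler `processor` section combining these options might look like the following; the exact nesting of `includeColumns` entries (shown here as objects with a hypothetical `columnName` key) and the metric names should be checked against the 0.12 JSON schemas:

```yaml
processor:
  type: "orm-profiler"
  config:
    profileSample: 60                        # profile 60% of the table
    columnConfig:
      profileQuery: "SELECT * FROM orders WHERE region = 'EU'"  # illustrative sampling query
      excludeColumns:                        # columns to skip entirely
        - "internal_id"
      includeColumns:                        # assumed shape: one entry per column
        - columnName: "amount"
          metrics:                           # metric names are illustrative
            - "MEAN"
            - "MAX"
```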
### Profiler Multithreading for Snowflake users
@@ -85,6 +86,9 @@ to Airflow 2.3.3.
If you are using your own Airflow to prepare the ingestion from the UI and it is stuck on version 2.1.4, which you cannot upgrade, but you want to use OM 0.12, reach out to us.
**Note**
If you are using the `openmetadata/ingestion` Docker image and you have upgraded to 0.12.0 reusing volumes mounted to the `openmetadata/ingestion:0.11.5` container, you will need to run the `metadata openmetadata-imports-migration` command inside the `openmetadata/ingestion:0.12.x` container. Indeed, `openmetadata-airflow-managed-apis` has been renamed to `openmetadata-managed-apis`, and its import path has changed from `import openmetadata` to `import openmetadata_managed_apis`. If the import paths are not updated through the `metadata` command, existing DAGs will break. By default the command looks for DAGs stored in `/ingestion/examples/airflow/dags`. If you have changed where generated DAGs are stored, you can specify the path to your DAGs with `metadata openmetadata-imports-migration -d <path/to/folder>`.
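
For example (the custom path below is illustrative):

```bash
# Run inside the openmetadata/ingestion:0.12.x container.
# With no arguments it scans the default folder /ingestion/examples/airflow/dags.
metadata openmetadata-imports-migration

# If your generated DAGs are stored elsewhere:
metadata openmetadata-imports-migration -d /opt/airflow/dags
```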
### Connector Improvements
- **Oracle**: In `0.11.x` and previous releases, we were using the [Cx_Oracle](https://oracle.github.io/python-cx_Oracle/) driver to extract the metadata from Oracle. The drawback of using this driver was that it required the Oracle Client libraries to be installed on the host machine in order to run the ingestion. With the `0.12` release, we will be using the [python-oracledb](https://oracle.github.io/python-oracledb/) driver, which is an upgraded version of `Cx_Oracle`. `python-oracledb` in `Thin` mode does not need the Oracle Client libraries.