Fix #3850 - JSON Schemas for lineage (#3880)

* Use new JSON schemas in lineage

* Add sqlite
This commit is contained in:
Pere Miquel Brull 2022-04-06 12:24:15 +02:00 committed by GitHub
parent b93c4611bb
commit 4e55ea0154
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 209 additions and 125 deletions

View File

@ -1,115 +0,0 @@
# Copyright 2021 Collate
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
OpenMetadata Airflow Lineage Backend
"""
import json
import os
from typing import Optional
from airflow.configuration import conf
# TODO refactor https://github.com/open-metadata/OpenMetadata/issues/3844
class OpenMetadataLineageConfig(MetadataServerConfig):
"""
Base class for OpenMetada lineage config
Attributes
airflow_service_name (str): name of the service
api_endpoint (str): the endpoint for the API
auth_provider_type (str):
secret_key (str):
"""
airflow_service_name: str = "airflow"
api_endpoint: str = "http://localhost:8585"
auth_provider_type: str = "no-auth"
secret_key: Optional[str] = None
def get_lineage_config() -> OpenMetadataLineageConfig:
"""
Load the lineage config from airflow_provider_openmetadata.cfg.
"""
airflow_service_name = conf.get("lineage", "airflow_service_name", fallback=None)
if airflow_service_name:
api_endpoint = conf.get(
"lineage", "openmetadata_api_endpoint", fallback="http://localhost:8585"
)
auth_provider_type = conf.get(
"lineage", "auth_provider_type", fallback="no-auth"
)
secret_key = conf.get("lineage", "secret_key", fallback=None)
api_version = conf.get("lineage", "api_version", fallback="v1")
retry = conf.get("lineage", "retry", fallback=3)
retry_wait = conf.get("lineage", "retry_wait", fallback=3)
org_url = conf.get("lineage", "org_url", fallback=None)
client_id = conf.get("lineage", "client_id", fallback=None)
private_key = conf.get("lineage", "private_key", fallback=None)
domain = conf.get("lineage", "domain", fallback=None)
email = conf.get("lineage", "email", fallback=None)
audience = conf.get(
"lineage", "audience", fallback="https://www.googleapis.com/oauth2/v4/token"
)
auth_header = conf.get("lineage", "auth_header", fallback="Authorization")
authority = conf.get("lineage", "authority", fallback="")
scopes = conf.get("lineage", "scopes", fallback=[])
return OpenMetadataLineageConfig.parse_obj(
{
"airflow_service_name": airflow_service_name,
"api_endpoint": api_endpoint,
"auth_provider_type": auth_provider_type,
"secret_key": secret_key,
"audience": audience,
"auth_header": auth_header,
"email": email,
"domain": domain,
"private_key": private_key,
"client_id": client_id,
"org_url": org_url,
"retry_wait": retry_wait,
"retry": retry,
"api_version": api_version,
"authority": authority,
"scopes": scopes,
}
)
openmetadata_config_file = os.getenv("OPENMETADATA_LINEAGE_CONFIG")
if openmetadata_config_file:
with open(openmetadata_config_file, encoding="utf-8") as config_file:
config = json.load(config_file)
return OpenMetadataLineageConfig.parse_obj(config)
return OpenMetadataLineageConfig.parse_obj(
{
"airflow_service_name": "airflow",
"api_endpoint": "http://localhost:8585/api",
"auth_provider_type": "no-auth",
}
)
def get_metadata_config(config: OpenMetadataLineageConfig) -> MetadataServerConfig:
"""
Return MetadataServerConfig to interact with the API.
:param config: get_lineage_config()
"""
return MetadataServerConfig.parse_obj(
{
"api_endpoint": config.api_endpoint,
"auth_provider_type": config.auth_provider_type,
"secret_key": config.secret_key,
}
)

View File

@ -0,0 +1,16 @@
# Copyright 2021 Collate
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
OpenMetadata Airflow Lineage Backend config commons & cte
"""
LINEAGE = "lineage"

View File

@ -0,0 +1,89 @@
# Copyright 2021 Collate
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
OpenMetadata Airflow Lineage Backend
"""
import json
import os
from airflow.configuration import conf
from pydantic import BaseModel
from airflow_provider_openmetadata.lineage.config.commons import LINEAGE
from airflow_provider_openmetadata.lineage.config.providers import (
provider_config_registry,
)
from metadata.generated.schema.metadataIngestion.workflow import (
AuthProvider,
OpenMetadataServerConfig,
)
class AirflowLineageConfig(BaseModel):
airflow_service_name: str
metadata_config: OpenMetadataServerConfig
def parse_airflow_config(airflow_service_name: str) -> AirflowLineageConfig:
"""
Get airflow config from airflow.cfg and parse it
to the config model
"""
auth_provider_type = conf.get(
LINEAGE, "auth_provider_type", fallback=AuthProvider.no_auth.value
)
if auth_provider_type == AuthProvider.no_auth.value:
security_config = None
else:
load_security_config_fn = provider_config_registry.registry(auth_provider_type)
security_config = load_security_config_fn()
return AirflowLineageConfig(
airflow_service_name=airflow_service_name,
metadata_config=OpenMetadataServerConfig(
hostPort=conf.get(
LINEAGE,
"openmetadata_api_endpoint",
fallback="http://localhost:8585/api",
),
authProvider=auth_provider_type,
securityConfig=security_config,
),
)
def get_lineage_config() -> AirflowLineageConfig:
"""
Load the lineage config from airflow.cfg, from
a JSON file path configures as env in OPENMETADATA_LINEAGE_CONFIG
or return a default config.
"""
airflow_service_name = conf.get(LINEAGE, "airflow_service_name", fallback=None)
if airflow_service_name:
return parse_airflow_config(airflow_service_name)
openmetadata_config_file = os.getenv("OPENMETADATA_LINEAGE_CONFIG")
# If config file, parse the JSON config, that should conform to AirflowLineageConfig
if openmetadata_config_file:
with open(openmetadata_config_file, encoding="utf-8") as config_file:
config = json.load(config_file)
return AirflowLineageConfig.parse_obj(config)
# If nothing is configured, let's use a default for local
return AirflowLineageConfig(
airflow_service_name="airflow",
metadata_config=OpenMetadataServerConfig(
hostPort="http://localhost:8585/api",
),
)

View File

@ -0,0 +1,85 @@
# Copyright 2021 Collate
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
OpenMetadata Airflow Lineage Backend security providers config
"""
from collections import namedtuple
from airflow.configuration import conf
from airflow_provider_openmetadata.lineage.config.commons import LINEAGE
from metadata.generated.schema.metadataIngestion.workflow import (
Auth0SSOConfig,
AuthProvider,
GoogleSSOConfig,
OktaSSOConfig,
)
def register_provider_config():
"""
Helps us register custom functions to parse provider config
"""
registry = dict()
def add(provider: str):
def inner(fn):
registry[provider] = fn
return fn
return inner
Register = namedtuple("Register", ["add", "registry"])
return Register(add, registry)
provider_config_registry = register_provider_config()
@provider_config_registry.add(AuthProvider.google.value)
def load_google_auth() -> GoogleSSOConfig:
"""
Load config for Google Auth
"""
return GoogleSSOConfig(
secretKey=conf.get(LINEAGE, "secret_key"),
audience=conf.get(
LINEAGE, "audience", fallback="https://www.googleapis.com/oauth2/v4/token"
),
)
@provider_config_registry.add(AuthProvider.okta.value)
def load_okta_auth() -> OktaSSOConfig:
"""
Load config for Google Auth
"""
return OktaSSOConfig(
clientId=conf.get(LINEAGE, "client_id"),
orgURL=conf.get(LINEAGE, "org_url"),
privateKey=conf.get(LINEAGE, "private_key"),
email=conf.get(LINEAGE, "email"),
scopes=conf.get(LINEAGE, "scopes", fallback=[]),
)
@provider_config_registry.add(AuthProvider.auth0.value)
def load_auth0_auth() -> Auth0SSOConfig:
"""
Load config for Google Auth
"""
return Auth0SSOConfig(
clientId=conf.get(LINEAGE, "client_id"),
secretKey=conf.get(LINEAGE, "secret_key"),
domain=conf.get(LINEAGE, "domain"),
)

View File

@ -18,9 +18,9 @@ from typing import TYPE_CHECKING, Dict, List, Optional
from airflow.lineage.backend import LineageBackend
from airflow_provider_openmetadata.lineage.config import (
from airflow_provider_openmetadata.lineage.config.loader import (
AirflowLineageConfig,
get_lineage_config,
get_metadata_config,
)
from airflow_provider_openmetadata.lineage.utils import get_xlets, parse_lineage
from metadata.ingestion.ometa.ometa_api import OpenMetadata
@ -78,9 +78,8 @@ class OpenMetadataLineageBackend(LineageBackend):
"""
try:
config = get_lineage_config()
metadata_config = get_metadata_config(config)
client = OpenMetadata(metadata_config)
config: AirflowLineageConfig = get_lineage_config()
client = OpenMetadata(config.metadata_config)
op_inlets = get_xlets(operator, "_inlets")
op_outlets = get_xlets(operator, "_outlets")

View File

@ -18,7 +18,7 @@ from typing import TYPE_CHECKING, Dict, List, Optional
from airflow.configuration import conf
from airflow_provider_openmetadata.lineage.config import OpenMetadataLineageConfig
from airflow_provider_openmetadata.lineage.config.loader import AirflowLineageConfig
from metadata.generated.schema.api.data.createPipeline import CreatePipelineRequest
from metadata.generated.schema.api.lineage.addLineage import AddLineageRequest
from metadata.generated.schema.api.services.createPipelineService import (
@ -308,7 +308,7 @@ def add_status(
# pylint: disable=too-many-arguments,too-many-locals
def parse_lineage(
config: OpenMetadataLineageConfig,
config: AirflowLineageConfig,
context: Dict,
operator: "BaseOperator",
inlets: List,
@ -382,7 +382,7 @@ def parse_lineage(
def get_or_create_pipeline_service(
operator: "BaseOperator", client: OpenMetadata, config: OpenMetadataLineageConfig
operator: "BaseOperator", client: OpenMetadata, config: AirflowLineageConfig
) -> PipelineService:
"""
Check if we already have the airflow instance as a PipelineService,

View File

@ -11,7 +11,6 @@
"""
Generic source to build SQL connectors.
"""
import copy
import json
import logging
import re
@ -209,7 +208,6 @@ class SQLSource(Source[OMetaDatabaseAndTable]):
inspectors = self.get_databases()
for inspector in inspectors:
schema_names = inspector.get_schema_names()
print(schema_names)
for schema in schema_names:
# clear any previous source database state
self.database_source_state.clear()

View File

@ -17,6 +17,9 @@ from urllib.parse import quote_plus
from metadata.generated.schema.entity.services.connections.database.mysqlConnection import (
MysqlConnection,
)
from metadata.generated.schema.entity.services.connections.database.sqliteConnection import (
SQLiteConnection,
)
@singledispatch
@ -53,3 +56,12 @@ def _(connection: MysqlConnection):
url = f"{url}?{params}"
return url
@get_connection_url.register
def _(connection: SQLiteConnection):
"""
SQLite is only used for testing with the in-memory db
"""
return f"{connection.scheme.value}:///:memory:"