Mirror of https://github.com/open-metadata/OpenMetadata.git (synced 2025-11-14 01:40:08 +00:00)

Merge pull request #96 from open-metadata/ingestion_refactor

Ingestion refactor

This commit is contained in commit 3857f72c7b
@@ -9,7 +9,7 @@
       "username": "sa",
       "password": "test!Password",
       "include_pattern": {
-        "include": ["catalog_test.*"]
+        "excludes": ["catalog_test.*"]
       }
     }
   },
@@ -7,13 +7,7 @@
       "host_port": "localhost:5432",
       "database": "pagila",
       "service_name": "local_postgres",
-      "service_type": "Postgres",
-      "include_pattern": {
-        "filter": [
-          "pg_catalog.*[a-zA-Z0-9]*",
-          "information_schema.*[a-zA-Z0-9]*"
-        ]
-      }
+      "service_type": "Postgres"
     }
   },
   "processor": {
@@ -1,37 +0,0 @@
-{
-  "source": {
-    "type": "redshift-sql",
-    "config": {
-      "host_port": "cluster.name.region.redshift.amazonaws.com:5439",
-      "username": "username",
-      "password": "strong_password",
-      "database": "dev",
-      "service_name": "aws_redshift",
-      "service_type": "Redshift"
-    }
-  },
-  "processor": {
-    "type": "pii-tags",
-    "config": {
-    }
-  },
-  "sink": {
-    "type": "metadata-rest-tables",
-    "config": {
-    }
-  },
-  "metadata_server": {
-    "type": "metadata-server",
-    "config": {
-      "api_endpoint": "http://localhost:8585/api",
-      "auth_provider_type": "no-auth"
-    }
-  },
-  "cron": {
-    "minute": "*/5",
-    "hour": null,
-    "day": null,
-    "month": null,
-    "day_of_week": null
-  }
-}
@@ -10,7 +10,7 @@
       "service_name": "snowflake",
       "service_type": "Snowflake",
       "include_pattern": {
-        "include": [
+        "includes": [
           "(\\w)*.tpcds_sf100tcl.catalog_page",
           "(\\w)*.tpcds_sf100tcl.time_dim",
           "(\\w)*.tpcds_sf10tcl.catalog_page"
ingestion/requirements.txt (new file, 14 lines)
@@ -0,0 +1,14 @@
+click~=7.1.2
+pydantic~=1.7.4
+expandvars~=0.6.5
+requests~=2.25.1
+python-dateutil~=2.8.1
+SQLAlchemy~=1.4.5
+pandas~=1.2.4
+Faker~=8.1.1
+elasticsearch~=7.12.0
+spacy~=3.0.5
+commonregex~=1.5.4
+setuptools~=57.0.0
+PyHive~=0.6.4
+ldap3~=2.9.1
@@ -101,19 +101,17 @@ build_options = {"includes": ["_cffi_backend"]}

 setup(
     name="metadata",
-    version=get_version(),
-    url="https://github.com/streamlinedata/metadata",
+    version="0.2.0",
+    url="https://github.com/open-metadata/OpenMetadata",
     author="Metadata Committers",
     license="Apache License 2.0",
     description="Ingestion Framework for OpenMetadata",
     long_description="Ingestion Framework for OpenMetadata",
     long_description_content_type="text/markdown",
     python_requires=">=3.8",
     options={"build_exe": build_options},
     package_dir={"": "src"},
     packages=find_namespace_packages(where='src', exclude=['tests*']),
-    dependency_links=['git+git://github.com/djacobs/PyAPNs.git#egg=apns',
-                      'git+https://github.com/StreamlineData/sdscheduler.git#egg=simplescheduler'],
     entry_points={
         "console_scripts": ["metadata = metadata.cmd:metadata"],
         "metadata.ingestion.source.plugins": [
@@ -1,20 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import click
-
-
-@click.group()
-def check() -> None:
-    pass
@@ -21,7 +21,6 @@ import sys
 import click
 from pydantic import ValidationError

-from metadata.check.check_cli import check
 from metadata.config.config_loader import load_config_file
 from metadata.ingestion.workflow.workflow import Workflow

@@ -35,11 +34,14 @@ BASE_LOGGING_FORMAT = (
 )
 logging.basicConfig(format=BASE_LOGGING_FORMAT)

+@click.group()
+def check() -> None:
+    pass
+

 @click.group()
 @click.option("--debug/--no-debug", default=False)
 def metadata(debug: bool) -> None:
-    if debug or os.getenv("METADATA_DEBUG", False):
+    if os.getenv("METADATA_DEBUG", False):
         logging.getLogger().setLevel(logging.INFO)
         logging.getLogger("metadata").setLevel(logging.DEBUG)
     else:
@@ -52,12 +54,11 @@ def metadata(debug: bool) -> None:
     "-c",
     "--config",
     type=click.Path(exists=True, dir_okay=False),
-    help="Config file in .toml or .yaml format",
+    help="Workflow config",
     required=True,
 )
 def ingest(config: str) -> None:
     """Main command for ingesting metadata into Metadata"""
-
     config_file = pathlib.Path(config)
     workflow_config = load_config_file(config_file)

@@ -71,6 +72,7 @@ def ingest(config: str) -> None:

     workflow.execute()
     ret = workflow.print_status()
+    workflow.stop()
     sys.exit(ret)

@@ -42,48 +42,18 @@ class ConfigModel(BaseModel):

 class DynamicTypedConfig(ConfigModel):
     type: str
-    # This config type is declared Optional[Any] here. The eventual parser for the
-    # specified type is responsible for further validation.
     config: Optional[Any]


-class MetaError(Exception):
-    """A base class for all meta exceptions"""
-
-
-class WorkflowExecutionError(MetaError):
+class WorkflowExecutionError(Exception):
     """An error occurred when executing the workflow"""


-class OperationalError(WorkflowExecutionError):
-    """An error occurred because of client-provided metadata"""
-
-    message: str
-    info: dict
-
-    def __init__(self, message: str, info: dict = None):
-        self.message = message
-        if info:
-            self.info = info
-        else:
-            self.info = {}
-
-
-class ConfigurationError(MetaError):
-    """A configuration error has happened"""
-
-
-class ConfigurationMechanism(ABC):
-    @abstractmethod
-    def load_config(self, config_fp: IO) -> dict:
-        pass
-
-
 class IncludeFilterPattern(ConfigModel):
     """A class to store allow deny regexes"""

-    include: List[str] = [".*"]
-    filter: List[str] = []
+    includes: List[str] = [".*"]
+    excludes: List[str] = []
     alphabet: str = "[A-Za-z0-9 _.-]"

     @property
@@ -96,11 +66,11 @@ class IncludeFilterPattern(ConfigModel):

     def included(self, string: str) -> bool:
         try:
-            for filter in self.filter:
-                if re.match(filter, string):
+            for exclude in self.excludes:
+                if re.match(exclude, string):
                     return False

-            for include in self.include:
+            for include in self.includes:
                 if re.match(include, string):
                     return True
             return False
@@ -108,17 +78,11 @@ class IncludeFilterPattern(ConfigModel):
             raise Exception("Regex Error: {}".format(err))

     def is_fully_specified_include_list(self) -> bool:
-        """
-        If the allow patterns are literals and not full regexes, then it is considered
-        fully specified. This is useful if you want to convert a 'list + filter'
-        pattern into a 'search for the ones that are allowed' pattern, which can be
-        much more efficient in some cases.
-        """
-        for include_pattern in self.include:
+        for include_pattern in self.includes:
             if not self.alphabet_pattern.match(include_pattern):
                 return False
         return True

     def get_allowed_list(self):
         assert self.is_fully_specified_include_list()
-        return [a for a in self.include if self.included(a)]
+        return [a for a in self.includes if self.included(a)]
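To make the renamed fields concrete, here is a minimal standalone sketch of the include/exclude matching that IncludeFilterPattern.included() implements after this refactor. It is an illustrative re-implementation with hypothetical table names, not the library code itself (the real class lives in metadata/config/common.py):

    import re

    def included(name, includes=(".*",), excludes=()):
        # Excludes win over includes, mirroring IncludeFilterPattern.included()
        for pattern in excludes:
            if re.match(pattern, name):
                return False
        return any(re.match(pattern, name) for pattern in includes)

    # e.g. drop system schemas while keeping everything else
    print(included("pg_catalog.pg_class", excludes=["pg_catalog.*"]))  # False
    print(included("public.orders"))                                   # True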
@@ -28,7 +28,7 @@ class ProcessorStatus(Status):
     warnings: List[Any] = field(default_factory=list)
     failures: List[Any] = field(default_factory=list)

-    def records_processed(self, record: Record):
+    def processed(self, record: Record):
         self.records += 1

     def warning(self, info: Any) -> None:
@@ -28,7 +28,7 @@ class SourceStatus(Status):
     warnings: Dict[str, List[str]] = field(default_factory=dict)
     failures: Dict[str, List[str]] = field(default_factory=dict)

-    def records_produced(self, record: Record) -> None:
+    def scanned(self, record: Record) -> None:
         self.records += 1

     def warning(self, key: str, reason: str) -> None:
@@ -44,6 +44,7 @@ class MetadataUsageBulkSink(BulkSink):
         self.client = REST(self.metadata_config)
         self.status = BulkSinkStatus()
         self.tables_dict = {}
+        self.table_join_dict = {}
         self.__map_tables()

     def __map_tables(self):
@@ -74,7 +75,8 @@ class MetadataUsageBulkSink(BulkSink):
             try:
                 self.client.publish_usage_for_a_table(table_entity, table_usage_request)
             except APIError as err:
-                logger.error("Failed to update usage and query join {}".format(err))
+                self.status.failures.append(table_usage_request)
+                logger.error("Failed to update usage for {} {}".format(table_usage.table, err))

             table_join_request = self.__get_table_joins(table_usage)
             logger.debug("table join request {}".format(table_join_request))
@@ -82,7 +84,8 @@ class MetadataUsageBulkSink(BulkSink):
                 if table_join_request is not None and len(table_join_request.columnJoins) > 0:
                     self.client.publish_frequently_joined_with(table_entity, table_join_request)
             except APIError as err:
-                logger.error("Failed to update usage and query join {}".format(err))
+                self.status.failures.append(table_join_request)
+                logger.error("Failed to update query join for {}, {}".format(table_usage.table, err))

         else:
             logger.warning("Table does not exist, skipping usage publish {}, {}".format(table_usage.table,
@@ -90,21 +93,32 @@ class MetadataUsageBulkSink(BulkSink):

     def __get_table_joins(self, table_usage):
         table_joins: TableJoins = TableJoins(columnJoins=[], startDate=table_usage.date)
+        column_joins_dict = {}
+        joined_with = {}
         for column_join in table_usage.joins:
             if column_join.table_column is None or len(column_join.joined_with) == 0:
                 continue
-            logger.debug("main column join {}".format(column_join.table_column))
+            if column_join.table_column.column in column_joins_dict.keys():
+                joined_with = column_joins_dict[column_join.table_column.column]
+            else:
+                column_joins_dict[column_join.table_column.column] = {}
+
             main_column_fqdn = self.__get_column_fqdn(column_join.table_column)
-            logger.debug("main column fqdn join {}".format(main_column_fqdn))
-            joined_with = []
             for column in column_join.joined_with:
-                logger.debug("joined column {}".format(column))
                 joined_column_fqdn = self.__get_column_fqdn(column)
-                logger.debug("joined column fqdn {}".format(joined_column_fqdn))
-                if joined_column_fqdn is not None:
-                    joined_with.append(ColumnJoinedWith(fullyQualifiedName=joined_column_fqdn, joinCount=1))
-            table_joins.columnJoins.append(ColumnJoins(columnName=column_join.table_column.column,
-                                                       joinedWith=joined_with))
+                if joined_column_fqdn in joined_with.keys():
+                    column_joined_with = joined_with[joined_column_fqdn]
+                    column_joined_with.joinCount += 1
+                    joined_with[joined_column_fqdn] = column_joined_with
+                else:
+                    joined_with[joined_column_fqdn] = ColumnJoinedWith(fullyQualifiedName=joined_column_fqdn,
+                                                                       joinCount=1)
+            column_joins_dict[column_join.table_column.column] = joined_with
+
+        for key, value in column_joins_dict.items():
+            table_joins.columnJoins.append(ColumnJoins(columnName=key,
+                                                       joinedWith=list(value.values())))
         return table_joins

     def __get_column_fqdn(self, table_column: TableColumn):
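As a plain-Python illustration of the aggregation the reworked __get_table_joins now performs (hypothetical column names, plain dicts standing in for the ColumnJoins/ColumnJoinedWith models):

    from collections import defaultdict

    # (main column, joined column FQDN) pairs as they might arrive from usage parsing
    observed_joins = [
        ("order_id", "shop.orders.order_id"),
        ("order_id", "shop.orders.order_id"),
        ("order_id", "shop.order_items.order_id"),
    ]

    # Keep one entry per joined column and count repeats, instead of
    # appending a fresh joinCount=1 record for every occurrence.
    column_joins = defaultdict(dict)
    for main_column, joined_fqdn in observed_joins:
        joined_with = column_joins[main_column]
        joined_with[joined_fqdn] = joined_with.get(joined_fqdn, 0) + 1

    print(dict(column_joins))
    # {'order_id': {'shop.orders.order_id': 2, 'shop.order_items.order_id': 1}}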
@@ -31,7 +31,7 @@ from metadata.ingestion.models.table_queries import TableUsageRequest, ColumnJoi
 from metadata.ingestion.ometa.auth_provider import MetadataServerConfig, AuthenticationProvider, \
     GoogleAuthenticationProvider, NoOpAuthenticationProvider, OktaAuthenticationProvider
 from metadata.ingestion.ometa.credentials import URL, get_api_version
-from metadata.generated.schema.entity.data.table import TableEntity
+from metadata.generated.schema.entity.data.table import TableEntity, TableJoins
 from metadata.generated.schema.entity.data.database import DatabaseEntity

 logger = logging.getLogger(__name__)
@@ -296,11 +296,11 @@ class REST(object):
     def publish_usage_for_a_table(self, table: TableEntity, table_usage_request: TableUsageRequest) -> None:
         """publish usage details for a table"""
         resp = self.post('/usage/table/{}'.format(table.id.__root__), data=table_usage_request.json())
-        # self.post('/usage/compute.percentile/table/{}'.format(table.id.__root__), table_usage_request.date)
+        logger.debug("published table usage {}".format(resp))

-    def publish_frequently_joined_with(self, table: TableEntity, table_join_request: ColumnJoinsList) -> None:
+    def publish_frequently_joined_with(self, table: TableEntity, table_join_request: TableJoins) -> None:
         """publish frequently joined with for a table"""
-        print(table_join_request.json())
+        logger.debug(table_join_request.json())
         logger.info("table join request {}".format(table_join_request.json()))
         resp = self.put('/tables/{}/joins'.format(table.id.__root__), data=table_join_request.json())
         logger.debug("published frequently joined with {}".format(resp))
@@ -52,7 +52,7 @@ class QueryParserProcessor(Processor):
         try:
             start_date = datetime.datetime.strptime(record.analysis_date, '%Y-%m-%d %H:%M:%S').date()
             parser = Parser(record.sql)
-            columns_dict = {} if parser.columns_dict == None else parser.columns_dict
+            columns_dict = {} if parser.columns_dict is None else parser.columns_dict
             query_parser_data = QueryParserData(tables=parser.tables,
                                                 tables_aliases=parser.tables_aliases,
                                                 columns=columns_dict,
@@ -60,8 +60,8 @@ class QueryParserProcessor(Processor):
                                                 sql=record.sql,
                                                 date=start_date.strftime('%Y-%m-%d'))
         except Exception as err:
-            logger.error(record.sql)
-            logger.error(err)
+            logger.debug(record.sql)
+            logger.debug(err)
             query_parser_data = None
             pass

@@ -16,11 +16,11 @@
 from typing import Optional
 from urllib.parse import quote_plus

-from .sql_source import SQLAlchemyConfig, SQLAlchemySource
+from .sql_source import SQLConnectionConfig, SQLSource
 from ..ometa.auth_provider import MetadataServerConfig


-class AthenaConfig(SQLAlchemyConfig):
+class AthenaConfig(SQLConnectionConfig):
     scheme: str = "awsathena+rest"
     username: Optional[str] = None
     password: Optional[str] = None
@@ -29,7 +29,7 @@ class AthenaConfig(SQLAlchemyConfig):
     s3_staging_dir: str
     work_group: str

-    def get_sql_alchemy_url(self):
+    def get_connection_url(self):
         url = f"{self.scheme}://"
         if self.username:
             url += f"{quote_plus(self.username)}"
@@ -46,9 +46,9 @@ class AthenaConfig(SQLAlchemyConfig):
         return url


-class AthenaSource(SQLAlchemySource):
+class AthenaSource(SQLSource):
     def __init__(self, config, metadata_config, ctx):
-        super().__init__(config, metadata_config, ctx, "athena")
+        super().__init__(config, metadata_config, ctx)

     @classmethod
     def create(cls, config_dict, metadata_config_dict, ctx):
@@ -17,26 +17,32 @@ from typing import Optional, Tuple

 # This import verifies that the dependencies are available.

-from .sql_source import BasicSQLAlchemyConfig, SQLAlchemySource
+from .sql_source import SQLConnectionConfig, SQLSource
 from ..ometa.auth_provider import MetadataServerConfig


-class BigQueryConfig(BasicSQLAlchemyConfig):
+class BigQueryConfig(SQLConnectionConfig, SQLSource):
     scheme = "bigquery"
     project_id: Optional[str] = None

-    def get_sql_alchemy_url(self):
+    def get_connection_url(self):
         if self.project_id:
             return f"{self.scheme}://{self.project_id}"
         return f"{self.scheme}://"

-    def get_identifier(self, schema: str, table: str) -> str:
-        if self.project_id:
-            return f"{self.project_id}.{schema}.{table}"
-        return f"{schema}.{table}"
+
+class BigQuerySource(SQLSource):
+    def __init__(self, config, metadata_config, ctx):
+        super().__init__(config, metadata_config, ctx)
+
+    @classmethod
+    def create(cls, config_dict, metadata_config_dict, ctx):
+        config = BigQueryConfig.parse_obj(config_dict)
+        metadata_config = MetadataServerConfig.parse_obj(metadata_config_dict)
+        return cls(config, metadata_config, ctx)

     def standardize_schema_table_names(
         self, schema: str, table: str
     ) -> Tuple[str, str]:
         segments = table.split(".")
         if len(segments) != 2:
@@ -44,14 +50,3 @@ class BigQueryConfig(BasicSQLAlchemyConfig):
         if segments[0] != schema:
             raise ValueError(f"schema {schema} does not match table {table}")
         return segments[0], segments[1]
-
-
-class BigQuerySource(SQLAlchemySource):
-    def __init__(self, config, metadata_config, ctx):
-        super().__init__(config, metadata_config, ctx, "bigquery")
-
-    @classmethod
-    def create(cls, config_dict, metadata_config_dict, ctx):
-        config = BigQueryConfig.parse_obj(config_dict)
-        metadata_config = MetadataServerConfig.parse_obj(metadata_config_dict)
-        return cls(config, metadata_config, ctx)
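A small standalone sketch of what standardize_schema_table_names does for BigQuery-style dataset.table names. This is a simplified illustration under the assumption that a malformed name is rejected with an error; the real method is the one shown in the hunks above:

    from typing import Tuple

    def standardize_schema_table_names(schema: str, table: str) -> Tuple[str, str]:
        # BigQuery reports tables as "<dataset>.<table>"; split and sanity-check
        segments = table.split(".")
        if len(segments) != 2:
            raise ValueError(f"expected 'dataset.table', got {table}")
        if segments[0] != schema:
            raise ValueError(f"schema {schema} does not match table {table}")
        return segments[0], segments[1]

    print(standardize_schema_table_names("tpcds", "tpcds.catalog_page"))  # ('tpcds', 'catalog_page')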
@@ -17,8 +17,8 @@ from pyhive import hive  # noqa: F401
 from pyhive.sqlalchemy_hive import HiveDate, HiveDecimal, HiveTimestamp

 from .sql_source import (
-    BasicSQLAlchemyConfig,
-    SQLAlchemySource,
+    SQLConnectionConfig,
+    SQLSource,
     register_custom_type,
 )
 from ..ometa.auth_provider import MetadataServerConfig
@@ -28,13 +28,16 @@ register_custom_type(HiveTimestamp, "TIME")
 register_custom_type(HiveDecimal, "NUMBER")


-class HiveConfig(BasicSQLAlchemyConfig):
+class HiveConfig(SQLConnectionConfig):
     scheme = "hive"

+    def get_connection_url(self):
+        return super().get_connection_url()
+

-class HiveSource(SQLAlchemySource):
+class HiveSource(SQLSource):
     def __init__(self, config, metadata_config, ctx):
-        super().__init__(config, metadata_config, ctx, "hive")
+        super().__init__(config, metadata_config, ctx)

     @classmethod
     def create(cls, config_dict, metadata_config_dict, ctx):
@@ -16,24 +16,21 @@
 # This import verifies that the dependencies are available.
 import sqlalchemy_pytds  # noqa: F401

-from .sql_source import BasicSQLAlchemyConfig, SQLAlchemySource
+from .sql_source import SQLConnectionConfig, SQLSource
 from ..ometa.auth_provider import MetadataServerConfig


-class SQLServerConfig(BasicSQLAlchemyConfig):
+class SQLServerConfig(SQLConnectionConfig):
     host_port = "localhost:1433"
     scheme = "mssql+pytds"

-    def get_identifier(self, schema: str, table: str) -> str:
-        regular = f"{schema}.{table}"
-        if self.database:
-            return f"{self.database}.{regular}"
-        return regular
+    def get_connection_url(self):
+        return super().get_connection_url()


-class SQLServerSource(SQLAlchemySource):
+class SQLServerSource(SQLSource):
     def __init__(self, config, metadata_config, ctx):
-        super().__init__(config, metadata_config, ctx, "mssql")
+        super().__init__(config, metadata_config, ctx)

     @classmethod
     def create(cls, config_dict, metadata_config_dict, ctx):
@@ -13,19 +13,18 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import pymysql  # noqa: F401
-
-from .sql_source import BasicSQLAlchemyConfig, SQLAlchemySource
+from .sql_source import SQLSource, SQLConnectionConfig
 from ..ometa.auth_provider import MetadataServerConfig


-class MySQLConfig(BasicSQLAlchemyConfig):
-    # defaults
+class MySQLConfig(SQLConnectionConfig):
     host_port = "localhost:3306"
     scheme = "mysql+pymysql"

+    def get_connection_url(self):
+        return super().get_connection_url()

-class MySQLSource(SQLAlchemySource):
+
+class MySQLSource(SQLSource):
     def __init__(self, config, metadata_config, ctx):
         super().__init__(config, metadata_config, ctx)

@@ -16,18 +16,18 @@
 # This import verifies that the dependencies are available.
 import cx_Oracle  # noqa: F401

-from .sql_source import BasicSQLAlchemyConfig, SQLAlchemySource
+from .sql_source import SQLSource, SQLConnectionConfig
 from ..ometa.auth_provider import MetadataServerConfig


-class OracleConfig(BasicSQLAlchemyConfig):
+class OracleConfig(SQLConnectionConfig):
     # defaults
     scheme = "oracle+cx_oracle"


-class OracleSource(SQLAlchemySource):
+class OracleSource(SQLSource):
     def __init__(self, config, metadata_config, ctx):
-        super().__init__(config, metadata_config, ctx, "oracle")
+        super().__init__(config, metadata_config, ctx)

     @classmethod
     def create(cls, config_dict, metadata_config_dict, ctx):
@@ -24,8 +24,8 @@ from metadata.ingestion.models.ometa_table_db import OMetaDatabaseAndTable
 import pymysql  # noqa: F401

 from metadata.generated.schema.entity.data.table import TableEntity, Column
-from metadata.ingestion.source.sql_source_common import SQLAlchemyHelper, SQLSourceStatus
-from .sql_source import BasicSQLAlchemyConfig
+from metadata.ingestion.source.sql_alchemy_helper import SQLAlchemyHelper, SQLSourceStatus
+from .sql_source import SQLConnectionConfig
 from metadata.ingestion.api.source import Source, SourceStatus
 from metadata.ingestion.models.table_metadata import DatabaseMetadata
 from itertools import groupby
@@ -38,27 +38,18 @@ from ...utils.helpers import get_service_or_create
 TableKey = namedtuple('TableKey', ['schema', 'table_name'])


-class PostgresSourceConfig(BasicSQLAlchemyConfig):
+class PostgresSourceConfig(SQLConnectionConfig):
     # defaults
     scheme = "postgresql+psycopg2"
     service_name = "postgres"
     service_type = "POSTGRES"

-    def get_sql_alchemy_url(self):
-        url = f"{self.scheme}://"
-        if self.username:
-            url += f"{self.username}"
-        if self.password:
-            url += f":{self.password}"
-        url += "@"
-        url += f"{self.host_port}"
-        if self.database:
-            url += f"/{self.database}"
-        return url
-
     def get_service_type(self) -> DatabaseServiceType:
         return DatabaseServiceType[self.service_type]

+    def get_connection_url(self):
+        return super().get_connection_url()
+

 def get_table_key(row: Dict[str, Any]) -> Union[TableKey, None]:
     """
@@ -73,7 +64,6 @@ def get_table_key(row: Dict[str, Any]) -> Union[TableKey, None]:


 class PostgresSource(Source):
-    # SELECT statement from mysql information_schema to extract table and column metadata
     SQL_STATEMENT = """
     SELECT
     c.table_catalog as cluster, c.table_schema as schema, c.table_name as name, pgtd.description as description
@@ -106,7 +96,7 @@ class PostgresSource(Source):
         self.status = SQLSourceStatus()
         self.service = get_service_or_create(config, metadata_config)
         self.include_pattern = IncludeFilterPattern
-        self.pattern = config.include_pattern
+        self.pattern = config

     @classmethod
     def create(cls, config_dict, metadata_config_dict, ctx):
@@ -131,7 +121,6 @@ class PostgresSource(Source):
         Using itertools.groupby and raw level iterator, it groups to table and yields TableMetadata
         :return:
         """
-        counter = 0
         for key, group in groupby(self._get_raw_extract_iter(), get_table_key):
             columns = []
             for row in group:
@@ -139,7 +128,7 @@ class PostgresSource(Source):
                 col_type = ''
                 if row['col_type'].upper() == 'CHARACTER VARYING':
                     col_type = 'VARCHAR'
-                elif row['col_type'].upper() == 'CHARACTER':
+                elif row['col_type'].upper() == 'CHARACTER' or row['col_type'].upper() == 'NAME':
                     col_type = 'CHAR'
                 elif row['col_type'].upper() == 'INTEGER':
                     col_type = 'INT'
@@ -149,28 +138,29 @@ class PostgresSource(Source):
                     col_type = 'DOUBLE'
                 elif row['col_type'].upper() == 'OID':
                     col_type = 'NUMBER'
-                elif row['col_type'].upper() == 'NAME':
-                    col_type = 'CHAR'
+                elif row['col_type'].upper() == 'ARRAY':
+                    col_type = 'ARRAY'
+                elif row['col_type'].upper() == 'BOOLEAN':
+                    col_type = 'BOOLEAN'
                 else:
-                    col_type = row['col_type'].upper()
-                if not self.include_pattern.included(self.pattern, last_row[1]):
-                    self.status.report_dropped(last_row['name'])
+                    col_type = None
+                if not self.pattern.include_pattern.included(f'{last_row[1]}.{last_row[2]}'):
+                    self.status.filtered(f'{last_row[1]}.{last_row[2]}', "pattern not allowed", last_row[2])
                     continue
-                columns.append(Column(name=row['col_name'], description=row['col_description'],
-                                      columnDataType=col_type, ordinalPosition=int(row['col_sort_order'])))
+                if col_type is not None:
+                    columns.append(Column(name=row['col_name'], description=row['col_description'],
+                                          columnDataType=col_type, ordinalPosition=int(row['col_sort_order'])))
             table_metadata = TableEntity(name=last_row['name'],
                                          description=last_row['description'],
                                          columns=columns)

-            self.status.report_table_scanned(table_metadata.name)
+            self.status.scanned(table_metadata.name.__root__)

             dm = DatabaseEntity(id=uuid.uuid4(),
                                 name=row['schema'],
                                 description=row['description'] if row['description'] is not None else ' ',
                                 service=EntityReference(id=self.service.id, type=self.SERVICE_TYPE))
             table_and_db = OMetaDatabaseAndTable(table=table_metadata, database=dm)
-            self.status.records_produced(dm)
             yield table_and_db

     def close(self):
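For quick reference, a compact standalone restatement of the Postgres column-type handling this hunk converges on. It is illustrative only and limited to the mappings visible in the diff; after the change an unrecognised type maps to None and the column is skipped rather than passed through verbatim:

    def map_postgres_col_type(col_type: str):
        mapping = {
            "CHARACTER VARYING": "VARCHAR",
            "CHARACTER": "CHAR",
            "NAME": "CHAR",
            "INTEGER": "INT",
            "OID": "NUMBER",
            "ARRAY": "ARRAY",
            "BOOLEAN": "BOOLEAN",
        }
        # None means "unsupported": the source now drops the column instead of guessing
        return mapping.get(col_type.upper())

    print(map_postgres_col_type("character varying"))  # VARCHAR
    print(map_postgres_col_type("jsonb"))              # None -> column skipped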
@@ -17,13 +17,13 @@ import logging
 from typing import Optional

 from metadata.ingestion.ometa.auth_provider import MetadataServerConfig
-from metadata.ingestion.source.sql_source import SQLAlchemySource, BasicSQLAlchemyConfig
+from metadata.ingestion.source.sql_source import SQLSource, SQLConnectionConfig
 from metadata.ingestion.api.source import SourceStatus

 logger = logging.getLogger(__name__)


-class RedshiftConfig(BasicSQLAlchemyConfig):
+class RedshiftConfig(SQLConnectionConfig):
     scheme = "postgresql+psycopg2"
     where_clause: Optional[str] = None
     duration: int = 1
@@ -34,8 +34,11 @@ class RedshiftConfig(BasicSQLAlchemyConfig):
             return f"{self.database}.{regular}"
         return regular

+    def get_connection_url(self):
+        return super().get_connection_url()
+

-class RedshiftSource(SQLAlchemySource):
+class RedshiftSource(SQLSource):

     def __init__(self, config, metadata_config, ctx):
         super().__init__(config, metadata_config, ctx)
@@ -1,200 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# This import verifies that the dependencies are available.
-import logging
-import uuid
-
-import pymysql  # noqa: F401
-from pydantic import ValidationError
-
-from metadata.generated.schema.entity.data.table import Column, TableEntity
-from metadata.generated.schema.entity.data.database import DatabaseEntity
-from metadata.generated.schema.type.entityReference import EntityReference
-from metadata.ingestion.models.ometa_table_db import OMetaDatabaseAndTable
-from metadata.ingestion.ometa.auth_provider import MetadataServerConfig
-from metadata.ingestion.source.sql_source_common import BasicSQLQueryConfig, SQLAlchemyHelper, SQLSourceStatus
-from metadata.ingestion.api.source import Source, SourceStatus
-from itertools import groupby
-from typing import Iterator, Union, Dict, Any, Iterable
-from collections import namedtuple
-
-from metadata.utils.helpers import get_service_or_create
-
-TableKey = namedtuple('TableKey', ['schema', 'table_name'])
-
-
-class RedshiftConfig(BasicSQLQueryConfig):
-    scheme = "redshift"
-    where_clause: str = None
-    cluster_source: str = "CURRENT_DATABASE()"
-    api_endpoint: str = None
-    service_type: str = "REDSHIFT"
-    service_name: str = "aws_redshift"
-
-
-def get_table_key(row: Dict[str, Any]) -> Union[TableKey, None]:
-    """
-    Table key consists of schema and table name
-    :param row:
-    :return:
-    """
-    if row:
-        return TableKey(schema=row['schema'], table_name=row['name'])
-
-    return None
-
-
-logger = logging.getLogger(__name__)
-
-
-class RedshiftSQLSource(Source):
-    # SELECT statement from mysql information_schema to extract table and column metadata
-    SQL_STATEMENT = """
-    SELECT
-      *
-    FROM (
-        SELECT
-          {cluster_source} as cluster,
-          c.table_schema as schema,
-          c.table_name as name,
-          pgtd.description as description,
-          c.column_name as col_name,
-          c.data_type as col_type,
-          pgcd.description as col_description,
-          ordinal_position as col_sort_order
-        FROM INFORMATION_SCHEMA.COLUMNS c
-        INNER JOIN
-          pg_catalog.pg_statio_all_tables as st on c.table_schema=st.schemaname and c.table_name=st.relname
-        LEFT JOIN
-          pg_catalog.pg_description pgcd on pgcd.objoid=st.relid and pgcd.objsubid=c.ordinal_position
-        LEFT JOIN
-          pg_catalog.pg_description pgtd on pgtd.objoid=st.relid and pgtd.objsubid=0
-
-        UNION
-
-        SELECT
-          {cluster_source} as cluster,
-          view_schema as schema,
-          view_name as name,
-          NULL as description,
-          column_name as col_name,
-          data_type as col_type,
-          NULL as col_description,
-          ordinal_position as col_sort_order
-        FROM
-          PG_GET_LATE_BINDING_VIEW_COLS()
-          COLS(view_schema NAME, view_name NAME, column_name NAME, data_type VARCHAR, ordinal_position INT)
-    )
-
-    {where_clause_suffix}
-    ORDER by cluster, schema, name, col_sort_order ;
-    """
-
-    # CONFIG KEYS
-    WHERE_CLAUSE_SUFFIX_KEY = 'where_clause'
-    CLUSTER_SOURCE = 'cluster_source'
-    CLUSTER_KEY = 'cluster_key'
-    USE_CATALOG_AS_CLUSTER_NAME = 'use_catalog_as_cluster_name'
-    DATABASE_KEY = 'database_key'
-    SERVICE_TYPE = 'REDSHIFT'
-    DEFAULT_CLUSTER_SOURCE = 'CURRENT_DATABASE()'
-
-    def __init__(self, config, metadata_config, ctx):
-        super().__init__(ctx)
-        self.sql_stmt = RedshiftSQLSource.SQL_STATEMENT.format(
-            where_clause_suffix=config.where_clause,
-            cluster_source=config.cluster_source,
-            database=config.database
-        )
-        self.alchemy_helper = SQLAlchemyHelper(config, metadata_config, ctx, "Redshift", self.sql_stmt)
-        self.config = config
-        self.metadata_config = metadata_config
-        self._extract_iter: Union[None, Iterator] = None
-        self._database = 'redshift'
-        self.report = SQLSourceStatus()
-        self.service = get_service_or_create(config, metadata_config)
-
-    @classmethod
-    def create(cls, config_dict, metadata_config_dict, ctx):
-        config = RedshiftConfig.parse_obj(config_dict)
-        metadata_config = MetadataServerConfig.parse_obj(metadata_config_dict)
-        return cls(config, metadata_config, ctx)
-
-    def prepare(self):
-        pass
-
-    def _get_raw_extract_iter(self) -> Iterable[Dict[str, Any]]:
-        """
-        Provides iterator of result row from SQLAlchemy helper
-        :return:
-        """
-        rows = self.alchemy_helper.execute_query()
-        for row in rows:
-            yield row
-
-    def next_record(self) -> Iterable[OMetaDatabaseAndTable]:
-        """
-        Using itertools.groupby and raw level iterator, it groups to table and yields TableMetadata
-        :return:
-        """
-        for key, group in groupby(self._get_raw_extract_iter(), get_table_key):
-            try:
-                columns = []
-                for row in group:
-                    last_row = row
-                    col_type = ''
-                    if row['col_type'].upper() == 'CHARACTER VARYING':
-                        col_type = 'VARCHAR'
-                    elif row['col_type'].upper() == 'CHARACTER':
-                        col_type = 'CHAR'
-                    elif row['col_type'].upper() == 'INTEGER':
-                        col_type = 'INT'
-                    elif row['col_type'].upper() == 'TIMESTAMP WITHOUT TIME ZONE':
-                        col_type = 'TIMESTAMP'
-                    elif row['col_type'].upper() == 'DOUBLE PRECISION':
-                        col_type = 'DOUBLE'
-                    elif row['col_type'].upper() == 'OID':
-                        col_type = 'NUMBER'
-                    elif row['col_type'].upper() == 'NAME':
-                        col_type = 'CHAR'
-                    else:
-                        col_type = row['col_type'].upper()
-                    columns.append(Column(name=row['col_name'], description=row['col_description'],
-                                          columnDataType=col_type,
-                                          ordinalPosition=int(row['col_sort_order'])))
-                db = DatabaseEntity(id=uuid.uuid4(),
-                                    name=last_row['schema'],
-                                    description=last_row['description'] if last_row['description'] is not None else ' ',
-                                    service=EntityReference(id=self.service.id, type=self.config.service_type))
-                table = TableEntity(name=last_row['name'],
-                                    columns=columns)
-                table_and_db = OMetaDatabaseAndTable(table=table, database=db)
-                self.report.report_table_scanned(table.name)
-                self.report.records_produced(table.name)
-                yield table_and_db
-            except ValidationError as err:
-                logger.info("Dropped Table {} due to {}".format(row['name'], err))
-                self.report.report_dropped(row['name'])
-                continue
-
-    def get_report(self):
-        return self.report
-
-    def close(self):
-        self.alchemy_helper.close()
-
-    def get_status(self) -> SourceStatus:
-        return self.report
@@ -17,7 +17,7 @@
 import logging
 from metadata.ingestion.models.table_queries import TableQuery
 from metadata.ingestion.ometa.auth_provider import MetadataServerConfig
-from metadata.ingestion.source.sql_source_common import SQLAlchemyHelper, SQLSourceStatus
+from metadata.ingestion.source.sql_alchemy_helper import SQLAlchemyHelper, SQLSourceStatus
 from metadata.ingestion.api.source import Source, SourceStatus
 from typing import Iterator, Union, Dict, Any, Iterable
 from metadata.utils.helpers import get_start_and_end
@@ -99,8 +99,8 @@ class RedshiftUsageSource(Source):
         """
         for row in self._get_raw_extract_iter():
             tq = TableQuery(row['query'], row['label'], row['userid'], row['xid'], row['pid'], str(row['starttime']),
-                            str(row['endtime']), str(row['analysis_date']), row['duration'], row['database'], row['aborted'], row['sql'])
-            self.status.records_produced(tq)
+                            str(row['endtime']), str(row['analysis_date']), row['duration'], row['database'],
+                            row['aborted'], row['sql'])
             yield tq

     def close(self):
@@ -293,8 +293,7 @@ class SampleTableSource(Source):
         for table in self.tables['tables']:
             table_metadata = TableEntity(**table)
             table_and_db = OMetaDatabaseAndTable(table=table_metadata, database=db)
-            self.status.report_table_scanned(table_metadata.name.__root__)
-            self.status.records_produced(table_metadata.name.__root__)
+            self.status.scanned(table_metadata.name.__root__)
             yield table_and_db

     def close(self):
@@ -15,12 +15,11 @@

 from typing import Optional

-import snowflake.sqlalchemy
 from snowflake.sqlalchemy import custom_types

 from .sql_source import (
-    BasicSQLAlchemyConfig,
-    SQLAlchemySource,
+    SQLConnectionConfig,
+    SQLSource,
     register_custom_type,
 )
 from ..ometa.auth_provider import MetadataServerConfig
@@ -30,7 +29,7 @@ register_custom_type(custom_types.TIMESTAMP_LTZ, "TIME")
 register_custom_type(custom_types.TIMESTAMP_NTZ, "TIME")


-class SnowflakeConfig(BasicSQLAlchemyConfig):
+class SnowflakeConfig(SQLConnectionConfig):
     scheme = "snowflake"
     account: str
     database: str  # database is required
@@ -38,8 +37,8 @@ class SnowflakeConfig(BasicSQLAlchemyConfig):
     role: Optional[str]
     duration: Optional[int]

-    def get_sql_alchemy_url(self):
-        connect_string = super().get_sql_alchemy_url()
+    def get_connection_url(self):
+        connect_string = super().get_connection_url()
         options = {
             "account": self.account,
             "warehouse": self.warehouse,
@@ -50,14 +49,10 @@ class SnowflakeConfig(BasicSQLAlchemyConfig):
         connect_string = f"{connect_string}?{params}"
         return connect_string

-    def get_identifier(self, schema: str, table: str) -> str:
-        regular = super().get_identifier(schema, table)
-        return f"{self.database}.{regular}"
-

-class SnowflakeSource(SQLAlchemySource):
+class SnowflakeSource(SQLSource):
     def __init__(self, config, metadata_config, ctx):
-        super().__init__(config, metadata_config, ctx, "snowflake")
+        super().__init__(config, metadata_config, ctx)

     @classmethod
     def create(cls, config_dict, metadata_config_dict, ctx):
@@ -16,7 +16,7 @@
 # This import verifies that the dependencies are available.
 from metadata.ingestion.models.table_queries import TableQuery
 from metadata.ingestion.ometa.auth_provider import MetadataServerConfig
-from metadata.ingestion.source.sql_source_common import SQLAlchemyHelper, SQLSourceStatus
+from metadata.ingestion.source.sql_alchemy_helper import SQLAlchemyHelper, SQLSourceStatus
 from metadata.ingestion.api.source import Source, SourceStatus
 from typing import Iterator, Union, Dict, Any, Iterable

@@ -83,7 +83,7 @@ class SnowflakeUsageSource(Source):
         for row in self._get_raw_extract_iter():
             tq = TableQuery(row['query'], row['label'], 0, 0, 0, str(row['starttime']),
                             str(row['endtime']), str(row['starttime'])[0:19], 2, row['database'], 0, row['sql'])
-            self.report.records_produced(tq)
+            self.report.scanned(tq)
             yield tq

     def get_report(self):
@@ -13,71 +13,18 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from abc import abstractmethod
-from metadata.config.common import ConfigModel
-from typing import Any, Iterable, List, Optional, Tuple
-from dataclasses import dataclass, field
+from typing import Any, Iterable
 from metadata.ingestion.api.common import WorkflowContext
-from metadata.ingestion.api.source import SourceStatus
 from sqlalchemy import create_engine
+from .sql_source import SQLConnectionConfig, SQLSourceStatus
 from metadata.ingestion.ometa.auth_provider import MetadataServerConfig
 
 
-@dataclass
-class SQLSourceStatus(SourceStatus):
-    tables_scanned = 0
-    filtered: List[str] = field(default_factory=list)
-
-    def report_table_scanned(self, table_name: str) -> None:
-        self.tables_scanned += 1
-
-    def report_dropped(self, table_name: str) -> None:
-        self.filtered.append(table_name)
-
-
-class SQLAlchemyConfig(ConfigModel):
-    options: dict = {}
-
-    @abstractmethod
-    def get_sql_alchemy_url(self):
-        pass
-
-    def get_identifier(self, schema: str, table: str) -> str:
-        return f"{schema}.{table}"
-
-    def standardize_schema_table_names(
-        self, schema: str, table: str
-    ) -> Tuple[str, str]:
-        # Some SQLAlchemy dialects need a standardization step to clean the schema
-        # and table names. See BigQuery for an example of when this is useful.
-        return schema, table
-
-
-class BasicSQLQueryConfig(SQLAlchemyConfig):
-    username: Optional[str] = None
-    password: Optional[str] = None
-    host_port: str
-    database: Optional[str] = None
-    scheme: str
-
-    def get_sql_alchemy_url(self):
-        url = f"{self.scheme}://"
-        if self.username:
-            url += f"{self.username}"
-        if self.password:
-            url += f":{self.password}"
-        url += "@"
-        url += f"{self.host_port}"
-        if self.database:
-            url += f"/{self.database}"
-        return url
-
-
 class SQLAlchemyHelper:
     """A helper class for all SQL Sources that use SQLAlchemy to extend"""
 
-    def __init__(self, config: SQLAlchemyConfig, metadata_config: MetadataServerConfig,
+    def __init__(self, config: SQLConnectionConfig, metadata_config: MetadataServerConfig,
                  ctx: WorkflowContext, platform: str, query: str):
         self.config = config
         self.platform = platform
@@ -89,7 +36,7 @@ class SQLAlchemyHelper:
         """
         Create a SQLAlchemy connection to Database
         """
-        engine = create_engine(self.config.get_sql_alchemy_url())
+        engine = create_engine(self.config.get_connection_url())
         conn = engine.connect()
         return conn
 
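Stripped of the class around it, the connection step the helper now performs amounts to the following. Only get_connection_url() and the SQLAlchemy calls come from the change; the function wrapper, the text() call, and the variable names are illustrative.

from sqlalchemy import create_engine, text


def run_extract_query(config, query: str):
    # `config` is any concrete SQLConnectionConfig; the helper builds its engine
    # from the URL the config assembles and executes the extraction query on it.
    engine = create_engine(config.get_connection_url())
    conn = engine.connect()
    try:
        return list(conn.execute(text(query)))
    finally:
        conn.close()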
@@ -44,40 +44,20 @@ logger: logging.Logger = logging.getLogger(__name__)
 
 @dataclass
 class SQLSourceStatus(SourceStatus):
-    tables_scanned: List[str] = field(default_factory=list)
-    filtered: List[str] = field(default_factory=list)
+    success: List[str] = field(default_factory=list)
+    failures: List[str] = field(default_factory=list)
+    warnings: List[str] = field(default_factory=list)
 
-    def report_table_scanned(self, table_name: str) -> None:
-        self.tables_scanned.append(table_name)
+    def scanned(self, table_name: str) -> None:
+        self.success.append(table_name)
         logger.info('Table Scanned: {}'.format(table_name))
 
-    def report_dropped(self, table_name: str, err: str, dataset_name: str = None, col_type: str = None) -> None:
-        self.filtered.append(table_name)
-        logger.error("Dropped Table {} due to {}".format(dataset_name, err))
-        logger.error("column type {}".format(col_type))
+    def filtered(self, table_name: str, err: str, dataset_name: str = None, col_type: str = None) -> None:
+        self.warnings.append(table_name)
+        logger.warning("Dropped Table {} due to {}".format(dataset_name, err))
 
 
-class SQLAlchemyConfig(ConfigModel):
-    env: str = "PROD"
-    options: dict = {}
-    include_pattern: IncludeFilterPattern
-
-    @abstractmethod
-    def get_sql_alchemy_url(self):
-        pass
-
-    def get_identifier(self, schema: str, table: str) -> str:
-        return f"{schema}.{table}"
-
-    def standardize_schema_table_names(
-        self, schema: str, table: str
-    ) -> Tuple[str, str]:
-        # Some SQLAlchemy dialects need a standardization step to clean the schema
-        # and table names. See BigQuery for an example of when this is useful.
-        return schema, table
-
-
-class BasicSQLAlchemyConfig(SQLAlchemyConfig):
+class SQLConnectionConfig(ConfigModel):
     username: Optional[str] = None
     password: Optional[str] = None
     host_port: str
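A quick illustration of the renamed status API (the table names are made up): scanned() files a success, while filtered() files a warning and logs why the table was dropped instead of treating it as an error.

from metadata.ingestion.source.sql_source import SQLSourceStatus

status = SQLSourceStatus()
status.scanned("local_postgres.public.actor")
status.filtered("local_postgres.pg_catalog.pg_class",
                "Schema pattern not allowed",
                dataset_name="pg_catalog.pg_class")
print(status.success)   # ['local_postgres.public.actor']
print(status.warnings)  # ['local_postgres.pg_catalog.pg_class']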
@@ -85,8 +65,11 @@ class BasicSQLAlchemyConfig(SQLAlchemyConfig):
     scheme: str
     service_name: str
     service_type: str
+    options: dict = {}
+    include_pattern: IncludeFilterPattern = IncludeFilterPattern.allow_all()
 
-    def get_sql_alchemy_url(self):
+    @abstractmethod
+    def get_connection_url(self):
         url = f"{self.scheme}://"
         if self.username:
             url += f"{self.username}"
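Spelled out with concrete values, the URL assembly above behaves as follows; this is a worked example with made-up credentials, following the same optional database suffix the removed BasicSQLQueryConfig used, and the actual scheme string for any given connector lives in its own config class.

scheme = "postgresql"
username, password = "openmetadata_user", "secret"
host_port, database = "localhost:5432", "pagila"

url = f"{scheme}://"
if username:
    url += f"{username}"
if password:
    url += f":{password}"
url += "@"
url += f"{host_port}"
if database:
    url += f"/{database}"

assert url == "postgresql://openmetadata_user:secret@localhost:5432/pagila"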
@@ -101,8 +84,11 @@ class BasicSQLAlchemyConfig(SQLAlchemyConfig):
     def get_service_type(self) -> DatabaseServiceType:
         return DatabaseServiceType[self.service_type]
 
+    def get_service_name(self) -> str:
+        return self.service_name
 
-_field_type_mapping: Dict[Type[types.TypeEngine], str] = {
+_column_type_mapping: Dict[Type[types.TypeEngine], str] = {
     types.Integer: "INT",
     types.Numeric: "INT",
     types.Boolean: "BOOLEAN",
@@ -123,7 +109,7 @@ _field_type_mapping: Dict[Type[types.TypeEngine], str] = {
     types.JSON: "JSON"
 }
 
-_known_unknown_field_types: Set[Type[types.TypeEngine]] = {
+_known_unknown_column_types: Set[Type[types.TypeEngine]] = {
     types.Interval,
     types.CLOB,
 }
@@ -133,25 +119,25 @@ def register_custom_type(
     tp: Type[types.TypeEngine], output: str = None
 ) -> None:
     if output:
-        _field_type_mapping[tp] = output
+        _column_type_mapping[tp] = output
     else:
-        _known_unknown_field_types.add(tp)
+        _known_unknown_column_types.add(tp)
 
 
-def get_column_type(sql_report: SQLSourceStatus, dataset_name: str, column_type: Any) -> str:
+def get_column_type(status: SQLSourceStatus, dataset_name: str, column_type: Any) -> str:
     type_class: Optional[str] = None
-    for sql_type in _field_type_mapping.keys():
+    for sql_type in _column_type_mapping.keys():
         if isinstance(column_type, sql_type):
-            type_class = _field_type_mapping[sql_type]
+            type_class = _column_type_mapping[sql_type]
             break
     if type_class is None:
-        for sql_type in _known_unknown_field_types:
+        for sql_type in _known_unknown_column_types:
             if isinstance(column_type, sql_type):
                 type_class = "NULL"
                 break
 
     if type_class is None:
-        sql_report.warning(
+        status.warning(
             dataset_name, f"unable to map type {column_type!r} to metadata schema"
         )
         type_class = "NULL"
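Connectors can still extend the renamed mappings through register_custom_type(); a sketch follows, where the two SQLAlchemy types and the "BYTES" label are illustrative choices rather than mappings this change adds.

from sqlalchemy import types

from metadata.ingestion.source.sql_source import register_custom_type

# Map a dialect-specific type to a metadata column type...
register_custom_type(types.LargeBinary, "BYTES")
# ...or mark it as known-but-unmappable so get_column_type() returns "NULL"
# for it without logging a warning.
register_custom_type(types.PickleType)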
@@ -159,10 +145,10 @@ def get_column_type(sql_report: SQLSourceStatus, dataset_name: str, column_type:
     return type_class
 
 
-class SQLAlchemySource(Source):
+class SQLSource(Source):
 
-    def __init__(self, config: SQLAlchemyConfig, metadata_config: MetadataServerConfig,
-                 ctx: WorkflowContext, connector: str = None):
+    def __init__(self, config: SQLConnectionConfig, metadata_config: MetadataServerConfig,
+                 ctx: WorkflowContext):
         super().__init__(ctx)
         self.config = config
         self.metadata_config = metadata_config
@@ -176,20 +162,25 @@ class SQLAlchemySource(Source):
     def create(cls, config_dict: dict, metadata_config_dict: dict, ctx: WorkflowContext):
         pass
 
+    def standardize_schema_table_names(
+        self, schema: str, table: str
+    ) -> Tuple[str, str]:
+        return schema, table
+
     def next_record(self) -> Iterable[OMetaDatabaseAndTable]:
         sql_config = self.config
-        url = sql_config.get_sql_alchemy_url()
+        url = sql_config.get_connection_url()
         logger.debug(f"sql_alchemy_url={url}")
         engine = create_engine(url, **sql_config.options)
         inspector = inspect(engine)
         for schema in inspector.get_schema_names():
             if not sql_config.include_pattern.included(schema):
-                self.status.report_dropped(schema, "Schema pattern not allowed")
+                self.status.filtered(schema, "Schema pattern not allowed")
                 continue
             logger.debug("total tables {}".format(inspector.get_table_names(schema)))
             for table in inspector.get_table_names(schema):
                 try:
-                    schema, table = sql_config.standardize_schema_table_names(schema, table)
+                    schema, table = self.standardize_schema_table_names(schema, table)
                     pk_constraints = inspector.get_pk_constraint(table, schema)
                     pk_columns = pk_constraints['column_constraints'] if len(
                         pk_constraints) > 0 and "column_constraints" in pk_constraints.keys() else {}
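Because the standardization hook now lives on the source rather than the config, a dialect source overrides it directly. A hypothetical example in the spirit of the BigQuery note the removed comment carried; the class name and the splitting rule are assumptions.

from typing import Tuple

from metadata.ingestion.source.sql_source import SQLSource


class MyBigQueryLikeSource(SQLSource):
    """Hypothetical dialect source whose inspector reports tables as 'dataset.table'."""

    def standardize_schema_table_names(self, schema: str, table: str) -> Tuple[str, str]:
        # If the reflected table name already embeds its schema, split it apart so the
        # rest of next_record() sees clean (schema, table) pairs.
        if "." in table:
            schema, table = table.split(".", 1)
        return schema, table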
@@ -203,11 +194,11 @@ class SQLAlchemySource(Source):
                         if 'column_names' in constraint.keys():
                             unique_columns = constraint['column_names']
 
-                    dataset_name = sql_config.get_identifier(schema, table)
-                    self.status.report_table_scanned('{}.{}'.format(self.config.service_name, dataset_name))
+                    dataset_name = f"{schema}.{table}"
+                    self.status.scanned('{}.{}'.format(self.config.get_service_name(), dataset_name))
                     if not sql_config.include_pattern.included(dataset_name):
-                        self.status.report_dropped('{}.{}'.format(self.config.service_name, dataset_name),
+                        self.status.filtered('{}.{}'.format(self.config.get_service_name(), dataset_name),
                                                    "Table pattern not allowed")
                         continue
 
                     columns = inspector.get_columns(table, schema)
@@ -216,15 +207,9 @@ class SQLAlchemySource(Source):
                         table_info: dict = inspector.get_table_comment(table, schema)
                     except NotImplementedError:
                         description: Optional[str] = None
-                        properties: Dict[str, str] = {}
                     else:
                         description = table_info["text"]
 
-                    # The "properties" field is a non-standard addition to SQLAlchemy's interface.
-                    properties = table_info.get("properties", {})
-                    # TODO: capture inspector.get_pk_constraint
-                    # TODO: capture inspector.get_sorted_table_and_fkc_names
-
                     table_columns = []
                     row_order = 1
                     for column in columns:
@@ -255,12 +240,11 @@ class SQLAlchemySource(Source):
                                   columns=table_columns)
 
                     table_and_db = OMetaDatabaseAndTable(table=table, database=db)
-                    self.status.records_produced(table.name)
                     yield table_and_db
                 except ValidationError as err:
                     logger.error(err)
-                    self.status.report_dropped('{}.{}'.format(self.config.service_name, dataset_name),
+                    self.status.filtered('{}.{}'.format(self.config.service_name, dataset_name),
                                                "Validation error")
                     continue
 
     def close(self):
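Downstream, each record yielded by next_record() carries the table together with its database, so a consumer only needs to iterate. A toy illustration; constructing `source` (a concrete SQLSource subclass wired with its config, metadata config, and workflow context) is elided here.

# Toy consumer: each record is an OMetaDatabaseAndTable pairing a Table with
# its parent Database, so the sink can create or update both together.
for record in source.next_record():
    print(record.table.name)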
@@ -42,7 +42,6 @@ def get_table_column_join(table, table_aliases, joins):
     except ValueError as err:
         logger.error("Error in parsing sql query joins {}".format(err))
         pass
 
     return TableColumnJoin(table_column=table_column, joined_with=joined_with)
 
-
@@ -118,6 +118,8 @@ class Workflow:
             if hasattr(self, 'sink'):
                 self.sink.write_record(processed_record)
         self.report['sink'] = self.sink.get_status().as_obj()
 
+    def stop(self):
         if hasattr(self, 'processor'):
             self.processor.close()
         if hasattr(self, 'stage'):
@@ -41,7 +41,7 @@ def get_service_or_create(config, metadata_config) -> DatabaseServiceEntity:
     if service is not None:
         return service
     else:
-        service = {'jdbc': {'connectionUrl': config.get_sql_alchemy_url(), 'driverClass': 'jdbc'},
+        service = {'jdbc': {'connectionUrl': config.get_connection_url(), 'driverClass': 'jdbc'},
                    'name': config.service_name, 'description': '', 'serviceType': config.get_service_type()}
         created_service = client.create_database_service(CreateDatabaseServiceEntityRequest(**service))
         return created_service
@@ -24,4 +24,4 @@ services:
     volumes:
       - ./setup:/setup
     ports:
-      - 51433:1433
+      - 1433:1433