Fix #4767: Refactor Usage & Lineage (#5190)

Mayur Singal 2022-06-03 13:42:28 +05:30 committed by GitHub
parent 37057c79a2
commit b3428771a3
31 changed files with 786 additions and 629 deletions

View File

@ -35,6 +35,18 @@
"queryLogFilePath": { "queryLogFilePath": {
"description": "Configuration to set the file path for query logs", "description": "Configuration to set the file path for query logs",
"type": "string" "type": "string"
},
"schemaFilterPattern": {
"description": "Regex to only fetch tables or databases that matches the pattern.",
"$ref": "../type/filterPattern.json#/definitions/filterPattern"
},
"tableFilterPattern": {
"description": "Regex exclude tables or databases that matches the pattern.",
"$ref": "../type/filterPattern.json#/definitions/filterPattern"
},
"databaseFilterPattern": {
"description": "Regex to only fetch databases that matches the pattern.",
"$ref": "../type/filterPattern.json#/definitions/filterPattern"
} }
}, },
"additionalProperties": false "additionalProperties": false

View File

@ -0,0 +1,46 @@
{
"$id": "https://open-metadata.org/schema/entity/data/queryParserData.json",
"$schema": "http://json-schema.org/draft-07/schema#",
"title": "Query Parser Data",
"description": "This schema defines type of query parser data",
"type": "object",
"properties": {
"tables": {
"description": "List of tables used in query",
"type": "array",
"items": {
"type": "string"
}
},
"tableAliases": {
"description": "Table names mapped with alias used in query",
"type": "object"
},
"columns": {
"description": "Table columns used in query",
"type": "object"
},
"database": {
"description": "Database of the associated with query",
"type": "string"
},
"sql": {
"description": "SQL query",
"type": "string"
},
"serviceName": {
"description": "Name that identifies this database service.",
"type": "string"
},
"date": {
"description": "Date of execution of SQL query",
"type": "string"
},
"databaseSchema": {
"description": "Database schema of the associated with query",
"type": "string"
}
},
"required": ["sql", "serviceName", "tables", "database"],
"additionalProperties": false
}
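
For reference, a record produced by the query parser can be checked against this schema with the jsonschema library; the dictionary below is a trimmed, hand-copied subset of the definition above, and the sample values are made up.

from jsonschema import validate

# Trimmed copy of queryParserData.json: enough to exercise the required fields.
QUERY_PARSER_DATA_SCHEMA = {
    "type": "object",
    "properties": {
        "tables": {"type": "array", "items": {"type": "string"}},
        "tableAliases": {"type": "object"},
        "columns": {"type": "object"},
        "database": {"type": "string"},
        "sql": {"type": "string"},
        "serviceName": {"type": "string"},
        "date": {"type": "string"},
        "databaseSchema": {"type": "string"},
    },
    "required": ["sql", "serviceName", "tables", "database"],
}

record = {
    "sql": "SELECT id FROM shopify.dim_customer",
    "serviceName": "local_mysql",
    "tables": ["shopify.dim_customer"],
    "database": "ecommerce_db",
    "tableAliases": {},
    "columns": {"select": ["id"]},
}
validate(instance=record, schema=QUERY_PARSER_DATA_SCHEMA)  # raises ValidationError on bad input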

View File

@ -0,0 +1,47 @@
{
"$id": "https://open-metadata.org/schema/entity/data/tableQuery.json",
"$schema": "http://json-schema.org/draft-07/schema#",
"title": "Table Query",
"description": "This schema defines structure of table query",
"type": "object",
"properties": {
"query": {
"description": "SQL query",
"type": "string"
},
"userName": {
"description": "Name of the user that executed the SQL query",
"type": "string"
},
"startTime": {
"description": "Start time of execution of SQL query",
"type": "string"
},
"endTime": {
"description": "End time of execution of SQL query",
"type": "string"
},
"analysisDate": {
"description": "Date of execution of SQL query",
"$ref": "./basic.json#/definitions/dateTime"
},
"aborted": {
"description": "Flag to check if query was aborted during execution",
"type": "boolean"
},
"serviceName": {
"description": "Name that identifies this database service.",
"type": "string"
},
"database": {
"description": "Database of the associated with query",
"type": "string"
},
"databaseSchema": {
"description": "Database schema of the associated with query",
"type": "string"
}
},
"required": ["query", "sql", "serviceName"],
"additionalProperties": false
}

View File

@ -0,0 +1,81 @@
{
"$id": "https://open-metadata.org/schema/entity/data/tableUsageCount.json",
"$schema": "http://json-schema.org/draft-07/schema#",
"title": "Table Usage Count",
"description": "This model is the linking between the usage stage and bulksink steps",
"type": "object",
"definitions": {
"tableColumn": {
"type": "object",
"properties": {
"table": {
"description": "Name of the table",
"type": "string"
},
"column": {
"description": "Name of the column",
"type": "string"
}
}
},
"tableColumnJoin": {
"type": "object",
"properties": {
"tableColumn": {
"description": "Source table column",
"$ref": "#/definitions/tableColumn"
},
"joinedWith": {
"description": "List of table columns with which the table is joined with",
"type": "array",
"items": {
"$ref": "#/definitions/tableColumn"
}
}
}
}
},
"properties": {
"table": {
"description": "Name of the table",
"type": "string"
},
"date": {
"description": "Date of execution of SQL query",
"type": "string"
},
"database": {
"description": "Database of the associated with table",
"type": "string"
},
"count": {
"description": "Usage count of table",
"type": "integer",
"default": 1
},
"databaseSchema": {
"description": "Database schema of the associated with table",
"type": "string"
},
"sqlQueries": {
"description": "List of SQL Queries associated with table",
"type": "array",
"items": {
"$ref": "../entity/data/table.json#/definitions/sqlQuery"
}
},
"joins": {
"description": "List of joins associated with table",
"type": "array",
"items": {
"$ref": "#/definitions/tableColumnJoin"
}
},
"serviceName": {
"description": "Name that identifies this database service.",
"type": "string"
}
},
"required": ["tableName", "date", "database", "serviceName"],
"additionalProperties": false
}
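
Since this model links the usage stage output to the bulk sink, several staged records for the same table eventually need to be merged. Here is a plain-dict sketch of that aggregation; the bulk sink does the equivalent keyed by table entity id, and the sample records below are illustrative only.

from collections import defaultdict

def merge_usage(records):
    # Aggregate usage counts and SQL queries per (serviceName, database, table) key.
    merged = defaultdict(lambda: {"count": 0, "sqlQueries": []})
    for rec in records:
        key = (rec["serviceName"], rec["database"], rec["table"])
        merged[key]["count"] += rec.get("count", 1)
        merged[key]["sqlQueries"].extend(rec.get("sqlQueries", []))
    return dict(merged)

usage = merge_usage([
    {"serviceName": "snow", "database": "sales", "table": "orders", "count": 2, "sqlQueries": ["q1"]},
    {"serviceName": "snow", "database": "sales", "table": "orders", "count": 1, "sqlQueries": ["q2"]},
])
assert usage[("snow", "sales", "orders")]["count"] == 3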

View File

@ -0,0 +1,20 @@
{
"$id": "https://open-metadata.org/schema/entity/data/usageRequest.json",
"$schema": "http://json-schema.org/draft-07/schema#",
"title": "Usage Request",
"description": "This schema defines type of table usage request used to publish the usage count on a perticular date",
"javaType": "org.openmetadata.catalog.type.UsageRequest",
"type": "object",
"properties": {
"date": {
"description": "Date of execution of SQL query",
"type": "string"
},
"count": {
"description": "Usage count of table",
"type": "integer"
}
},
"required": ["date", "count"],
"additionalProperties": false
}

View File

@ -4,7 +4,6 @@ source:
serviceConnection: serviceConnection:
config: config:
type: BigQuery type: BigQuery
enablePolicyTagImport: true
projectId: project_id projectId: project_id
credentials: credentials:
gcsConfig: gcsConfig:
@ -23,8 +22,7 @@ source:
queryLogDuration: '1' queryLogDuration: '1'
processor: processor:
type: query-parser type: query-parser
config: config: {}
filter: ''
stage: stage:
type: table-usage type: table-usage
config: config:

View File

@ -13,8 +13,7 @@ source:
queryLogDuration: '1' queryLogDuration: '1'
processor: processor:
type: query-parser type: query-parser
config: config: {}
filter: ''
stage: stage:
type: table-usage type: table-usage
config: config:

View File

@ -11,11 +11,9 @@ source:
sourceConfig: sourceConfig:
config: config:
queryLogDuration: '1' queryLogDuration: '1'
queryLogFilePath: <path to query log file>
processor: processor:
type: query-parser type: query-parser
config: config: {}
filter: ''
stage: stage:
type: table-usage type: table-usage
config: config:

View File

@ -13,8 +13,7 @@ source:
queryLogDuration: '1' queryLogDuration: '1'
processor: processor:
type: query-parser type: query-parser
config: config: {}
filter: ''
stage: stage:
type: table-usage type: table-usage
config: config:

View File

@ -14,8 +14,7 @@ source:
resultLimit: 1000 resultLimit: 1000
processor: processor:
type: query-parser type: query-parser
config: config: {}
filter: ''
stage: stage:
type: table-usage type: table-usage
config: config:

View File

@ -8,12 +8,15 @@
"sampleDataFolder": "./examples/sample_data" "sampleDataFolder": "./examples/sample_data"
} }
}, },
"sourceConfig": {} "sourceConfig": {
"config":{
"type":"DatabaseUsage"
}
}
}, },
"processor": { "processor": {
"type": "query-parser", "type": "query-parser",
"config": { "config": {
"filter": ""
} }
}, },
"stage": { "stage": {

View File

@ -43,7 +43,7 @@ base_requirements = {
"Jinja2>=2.11.3", "Jinja2>=2.11.3",
"PyYAML", "PyYAML",
"jsonschema", "jsonschema",
"sqllineage==1.3.3", "sqllineage==1.3.5",
"antlr4-python3-runtime==4.9.2", "antlr4-python3-runtime==4.9.2",
# compatibility requirements for 3.7 # compatibility requirements for 3.7
"typing-compat~=0.1.0", "typing-compat~=0.1.0",
@ -61,7 +61,7 @@ base_plugins = {
"query-parser", "query-parser",
"metadata-usage", "metadata-usage",
"file-stage", "file-stage",
"sql-metadata~=2.0.0", "sql-metadata~=2.5.0",
} }
plugins: Dict[str, Set[str]] = { plugins: Dict[str, Set[str]] = {
"airflow": { "airflow": {

View File

@ -15,23 +15,29 @@ from typing import List, Optional
from metadata.config.common import ConfigModel from metadata.config.common import ConfigModel
from metadata.generated.schema.entity.data.database import Database from metadata.generated.schema.entity.data.database import Database
from metadata.generated.schema.entity.data.table import ColumnJoins, Table, TableJoins from metadata.generated.schema.entity.data.table import (
ColumnJoins,
JoinedWith,
SqlQuery,
Table,
TableJoins,
)
from metadata.generated.schema.entity.services.connections.metadata.openMetadataConnection import ( from metadata.generated.schema.entity.services.connections.metadata.openMetadataConnection import (
OpenMetadataConnection, OpenMetadataConnection,
) )
from metadata.generated.schema.type.tableUsageCount import TableColumn, TableUsageCount
from metadata.generated.schema.type.usageRequest import UsageRequest
from metadata.ingestion.api.bulk_sink import BulkSink, BulkSinkStatus from metadata.ingestion.api.bulk_sink import BulkSink, BulkSinkStatus
from metadata.ingestion.models.table_queries import (
ColumnJoinedWith,
TableColumn,
TableUsageCount,
TableUsageRequest,
)
from metadata.ingestion.ometa.client import APIError from metadata.ingestion.ometa.client import APIError
from metadata.ingestion.ometa.ometa_api import OpenMetadata from metadata.ingestion.ometa.ometa_api import OpenMetadata
from metadata.utils import fqn from metadata.utils import fqn
from metadata.utils.helpers import _get_formmated_table_name from metadata.utils.helpers import get_formatted_entity_name
from metadata.utils.logger import ingestion_logger from metadata.utils.logger import ingestion_logger
from metadata.utils.sql_lineage import ingest_lineage_by_query from metadata.utils.sql_lineage import (
get_column_fqn,
ingest_lineage_by_query,
search_table_entities,
)
logger = ingestion_logger() logger = ingestion_logger()
@ -57,6 +63,7 @@ class MetadataUsageBulkSink(BulkSink):
self.metadata = OpenMetadata(self.metadata_config) self.metadata = OpenMetadata(self.metadata_config)
self.status = BulkSinkStatus() self.status = BulkSinkStatus()
self.table_join_dict = {} self.table_join_dict = {}
self.table_usage_map = {}
self.today = datetime.today().strftime("%Y-%m-%d") self.today = datetime.today().strftime("%Y-%m-%d")
@classmethod @classmethod
@ -64,13 +71,12 @@ class MetadataUsageBulkSink(BulkSink):
config = MetadataUsageSinkConfig.parse_obj(config_dict) config = MetadataUsageSinkConfig.parse_obj(config_dict)
return cls(config, metadata_config) return cls(config, metadata_config)
def handle_work_unit_start(self, wu): def ingest_sql_queries_lineage(
pass self, queries: List[SqlQuery], database: str
) -> None:
def handle_work_unit_end(self, wu): """
pass Method to ingest lineage by sql queries
"""
def ingest_sql_queries_lineage(self, queries, database):
for query in queries: for query in queries:
ingest_lineage_by_query( ingest_lineage_by_query(
self.metadata, self.metadata,
@ -79,74 +85,35 @@ class MetadataUsageBulkSink(BulkSink):
database=database, database=database,
) )
def write_records(self) -> None: def __populate_table_usage_map(
usage_records = [json.loads(l) for l in self.file_handler.readlines()] self, table_entity: Table, table_usage: TableUsageCount
table_usage_map = {} ) -> None:
for record in usage_records: """
table_usage = TableUsageCount(**json.loads(record)) Method Either initialise the map data or
self.service_name = table_usage.service_name update existing data with information from new queries on the same table
if "." in table_usage.table: """
( if not self.table_usage_map.get(table_entity.id.__root__):
table_usage.database_schema, self.table_usage_map[table_entity.id.__root__] = {
table_usage.table, "table_entity": table_entity,
) = table_usage.table.split(".")[-2:] "usage_count": table_usage.count,
table_entities = self.__get_table_entity( "sql_queries": table_usage.sqlQueries,
table_usage.database, table_usage.database_schema, table_usage.table "usage_date": table_usage.date,
) "database": table_usage.database,
else: "database_schema": table_usage.databaseSchema,
table_entities = self.metadata.es_search_from_service( }
entity_type=Table, else:
service_name=self.service_name, self.table_usage_map[table_entity.id.__root__][
filters={ "usage_count"
"database": table_usage.database, ] += table_usage.count
"database_schema": None, self.table_usage_map[table_entity.id.__root__]["sql_queries"].extend(
"name": table_usage.table, table_usage.sqlQueries
}, )
)
for table_entity in table_entities or []:
if table_entity is not None:
if not table_usage_map.get(table_entity.id.__root__):
table_usage_map[table_entity.id.__root__] = {
"table_entity": table_entity,
"usage_count": table_usage.count,
"sql_queries": table_usage.sql_queries,
"usage_date": table_usage.date,
"database": table_usage.database,
}
else:
table_usage_map[table_entity.id.__root__][
"usage_count"
] += table_usage.count
table_usage_map[table_entity.id.__root__]["sql_queries"].extend(
table_usage.sql_queries
)
table_join_request = self.__get_table_joins(table_usage)
logger.debug("table join request {}".format(table_join_request))
try:
if (
table_join_request is not None
and len(table_join_request.columnJoins) > 0
):
self.metadata.publish_frequently_joined_with(
table_entity, table_join_request
)
except APIError as err:
self.status.failures.append(table_join_request)
logger.error(
"Failed to update query join for {}, {}".format(
table_usage.table, err
)
)
else: def __publish_usage_records(self) -> None:
logger.warning( """
"Table does not exist, skipping usage publish {}, {}".format( Method to publish SQL Queries, Table Usage & Lineage
table_usage.table, table_usage.database """
) for _, value_dict in self.table_usage_map.items():
)
self.status.warnings.append(f"Table: {table_usage.table}")
for table_id, value_dict in table_usage_map.items():
self.metadata.ingest_table_queries_data( self.metadata.ingest_table_queries_data(
table=value_dict["table_entity"], table=value_dict["table_entity"],
table_queries=value_dict["sql_queries"], table_queries=value_dict["sql_queries"],
@ -154,8 +121,8 @@ class MetadataUsageBulkSink(BulkSink):
self.ingest_sql_queries_lineage( self.ingest_sql_queries_lineage(
value_dict["sql_queries"], value_dict["database"] value_dict["sql_queries"], value_dict["database"]
) )
table_usage_request = TableUsageRequest( table_usage_request = UsageRequest(
date=value_dict["usage_date"], count=value_dict["usage_count"] date=self.today, count=value_dict["usage_count"]
) )
try: try:
self.metadata.publish_table_usage( self.metadata.publish_table_usage(
@ -179,47 +146,100 @@ class MetadataUsageBulkSink(BulkSink):
self.status.failures.append( self.status.failures.append(
"Table: {}".format(value_dict["table_entity"].name.__root__) "Table: {}".format(value_dict["table_entity"].name.__root__)
) )
def write_records(self) -> None:
for usage_record in self.file_handler.readlines():
record = json.loads(usage_record)
table_usage = TableUsageCount(**json.loads(record))
self.service_name = table_usage.serviceName
if "." in table_usage.table:
databaseSchema, table = fqn.split(table_usage.table)[-2:]
table_usage.table = table
if not table_usage.databaseSchema:
table_usage.databaseSchema = databaseSchema
table_usage.database = get_formatted_entity_name(table_usage.database)
table_usage.databaseSchema = get_formatted_entity_name(
table_usage.databaseSchema
)
table_usage.table = get_formatted_entity_name(table_usage.table)
table_entities = search_table_entities(
self.metadata,
table_usage.serviceName,
table_usage.database,
table_usage.databaseSchema,
table_usage.table,
)
for table_entity in table_entities or []:
if table_entity is not None:
self.__populate_table_usage_map(
table_usage=table_usage, table_entity=table_entity
)
table_join_request = self.__get_table_joins(table_usage)
logger.debug("table join request {}".format(table_join_request))
try:
if (
table_join_request is not None
and len(table_join_request.columnJoins) > 0
):
self.metadata.publish_frequently_joined_with(
table_entity, table_join_request
)
except APIError as err:
self.status.failures.append(table_join_request)
logger.error(
"Failed to update query join for {}, {}".format(
table_usage.table, err
)
)
else:
logger.warning(
"Table does not exist, skipping usage publish {}, {}".format(
table_usage.table, table_usage.database
)
)
self.status.warnings.append(f"Table: {table_usage.table}")
self.__publish_usage_records()
try: try:
self.metadata.compute_percentile(Table, self.today) self.metadata.compute_percentile(Table, self.today)
self.metadata.compute_percentile(Database, self.today) self.metadata.compute_percentile(Database, self.today)
except APIError: except APIError:
logger.error("Failed to publish compute.percentile") logger.error("Failed to publish compute.percentile")
def __get_table_joins(self, table_usage): def __get_table_joins(self, table_usage: TableUsageCount) -> TableJoins:
table_joins: TableJoins = TableJoins( table_joins: TableJoins = TableJoins(
columnJoins=[], directTableJoins=[], startDate=table_usage.date columnJoins=[], directTableJoins=[], startDate=table_usage.date
) )
"""
Method to get Table Joins
"""
column_joins_dict = {} column_joins_dict = {}
for column_join in table_usage.joins: for column_join in table_usage.joins:
joined_with = {} joined_with = {}
if column_join.table_column is None or len(column_join.joined_with) == 0: if column_join.tableColumn is None or len(column_join.joinedWith) == 0:
continue continue
if column_join.table_column.column in column_joins_dict.keys():
joined_with = column_joins_dict[column_join.table_column.column] if column_join.tableColumn.column in column_joins_dict.keys():
joined_with = column_joins_dict[column_join.tableColumn.column]
else: else:
column_joins_dict[column_join.table_column.column] = {} column_joins_dict[column_join.tableColumn.column] = {}
main_column_fqdn = self.__get_column_fqdn(
table_usage.database, for column in column_join.joinedWith:
table_usage.database_schema, joined_column_fqn = self.__get_column_fqn(
column_join.table_column, table_usage.database, table_usage.databaseSchema, column
)
for column in column_join.joined_with:
joined_column_fqdn = self.__get_column_fqdn(
table_usage.database, table_usage.database_schema, column
) )
if str(joined_column_fqdn) in joined_with.keys(): if str(joined_column_fqn) in joined_with.keys():
column_joined_with = joined_with[str(joined_column_fqdn)] column_joined_with = joined_with[str(joined_column_fqn)]
column_joined_with.joinCount += 1 column_joined_with.joinCount += 1
joined_with[str(joined_column_fqdn)] = column_joined_with joined_with[str(joined_column_fqn)] = column_joined_with
elif joined_column_fqdn is not None: elif joined_column_fqn is not None:
joined_with[str(joined_column_fqdn)] = ColumnJoinedWith( joined_with[str(joined_column_fqn)] = JoinedWith(
fullyQualifiedName=str(joined_column_fqdn), joinCount=1 fullyQualifiedName=str(joined_column_fqn), joinCount=1
) )
else: else:
logger.info( logger.debug(
f"Skipping join columns for {column} {joined_column_fqdn}" f"Skipping join columns for {column} {joined_column_fqn}"
) )
column_joins_dict[column_join.table_column.column] = joined_with column_joins_dict[column_join.tableColumn.column] = joined_with
for key, value in column_joins_dict.items(): for key, value in column_joins_dict.items():
table_joins.columnJoins.append( table_joins.columnJoins.append(
@ -227,44 +247,24 @@ class MetadataUsageBulkSink(BulkSink):
) )
return table_joins return table_joins
def __get_column_fqdn( def __get_column_fqn(
self, database: str, database_schema: str, table_column: TableColumn self, database: str, database_schema: str, table_column: TableColumn
): ) -> Optional[str]:
table_entities = self.__get_table_entity( """
database, database_schema, table_column.table Method to get column fqn
"""
table_entities = search_table_entities(
self.metadata,
self.service_name,
database,
database_schema,
table_column.table,
) )
if not table_entities: if not table_entities:
return None return None
for table_entity in table_entities:
for tbl_column in table_entity.columns:
if table_column.column.lower() == tbl_column.name.__root__.lower():
return tbl_column.fullyQualifiedName.__root__
def __get_table_entity( for table_entity in table_entities:
self, database_name: str, database_schema: str, table_name: str return get_column_fqn(table_entity, table_column.column)
) -> Optional[List[Table]]:
table_fqn = fqn.build(
self.metadata,
entity_type=Table,
service_name=self.service_name,
database_name=database_name,
schema_name=database_schema,
table_name=table_name,
)
table_fqn = _get_formmated_table_name(table_fqn)
table_entity = self.metadata.get_by_name(Table, fqn=table_fqn)
if table_entity:
return [table_entity]
es_result = self.metadata.es_search_from_service(
entity_type=Table,
service_name=self.service_name,
filters={
"database": database_name,
"database_schema": database_schema,
"name": table_name,
},
)
return es_result
def get_status(self): def get_status(self):
return self.status return self.status
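
To make the join bookkeeping in __get_table_joins easier to follow, here is a self-contained sketch of the joinCount accumulation for a single source column, using plain strings for fully qualified column names instead of the generated models; the helper name is hypothetical.

from collections import Counter

def count_join_partners(partner_fqns):
    # For one source column, count how often each partner column appears in joins.
    joined_with = Counter()
    for partner_fqn in partner_fqns:
        joined_with[partner_fqn] += 1
    return [
        {"fullyQualifiedName": name, "joinCount": count}
        for name, count in joined_with.items()
    ]

print(count_join_partners([
    "snow.sales.orders.customer_id",
    "snow.sales.customers.customer_id",
    "snow.sales.customers.customer_id",
]))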

View File

@ -1,98 +0,0 @@
# Copyright 2021 Collate
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Dict, List, Optional
from pydantic import BaseModel
from metadata.generated.schema.entity.data.table import ColumnJoins, SqlQuery
from metadata.ingestion.models.json_serializable import JsonSerializable
class TableQuery(JsonSerializable):
""" """
def __init__(
self,
query: str,
user_name: str,
starttime: str,
endtime: str,
analysis_date: str,
database: str,
aborted: bool,
sql: str,
service_name: str,
) -> None:
""" """
self.query = query
self.user_name = user_name
self.starttime = starttime
self.endtime = endtime
self.analysis_date = analysis_date
self.database = database
self.aborted = aborted
self.sql = sql
self.service_name = service_name
class TableColumn(BaseModel):
table: str
column: str
class TableColumnJoin(BaseModel):
table_column: Optional[TableColumn] = None
joined_with: Optional[List[TableColumn]] = None
TableColumnJoins = List[TableColumnJoin]
class TableUsageCount(BaseModel):
table: str
date: str
database: str
database_schema: Optional[str]
sql_queries: List[SqlQuery]
count: int = 1
joins: TableColumnJoins
service_name: str
class QueryParserData(BaseModel):
tables: List[str]
tables_aliases: Dict[str, str]
columns: Dict[str, List[object]]
date: str
database: str
sql: str
service_name: str
class Config:
arbitrary_types_allowed = True
class TableUsageRequest(BaseModel):
date: str
count: int
class ColumnJoinsList(BaseModel):
__root__: List[ColumnJoins]
class ColumnJoinedWith(BaseModel):
fullyQualifiedName: str
joinCount: int
TablesUsage = List[TableUsageCount]

View File

@ -27,7 +27,7 @@ from metadata.generated.schema.entity.data.table import (
TableJoins, TableJoins,
TableProfile, TableProfile,
) )
from metadata.ingestion.models.table_queries import TableUsageRequest from metadata.generated.schema.type.usageRequest import UsageRequest
from metadata.ingestion.ometa.client import REST from metadata.ingestion.ometa.client import REST
from metadata.ingestion.ometa.utils import ometa_logger from metadata.ingestion.ometa.utils import ometa_logger
@ -115,7 +115,7 @@ class OMetaTableMixin:
) )
def publish_table_usage( def publish_table_usage(
self, table: Table, table_usage_request: TableUsageRequest self, table: Table, table_usage_request: UsageRequest
) -> None: ) -> None:
""" """
POST usage details for a Table POST usage details for a Table
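
With UsageRequest now coming from the generated schema, publishing a day's usage for a resolved table looks roughly like the sketch below; the client and table are assumed to be obtained elsewhere (for example via OpenMetadata(...) and metadata.get_by_name), and the wrapper function is hypothetical.

from metadata.generated.schema.entity.data.table import Table
from metadata.generated.schema.type.usageRequest import UsageRequest
from metadata.ingestion.ometa.ometa_api import OpenMetadata

def publish_daily_usage(metadata: OpenMetadata, table: Table, day: str, count: int) -> None:
    # Build the shared request type and POST it for the table
    usage_request = UsageRequest(date=day, count=count)
    metadata.publish_table_usage(table, usage_request)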

View File

@ -76,7 +76,3 @@ def _(arg) -> str:
Models with __root__ Models with __root__
""" """
return str(arg.__root__) return str(arg.__root__)
def _get_formmated_table_name(table_name):
return table_name.replace("[", "").replace("]", "")

View File

@ -22,19 +22,11 @@ from metadata.config.common import ConfigModel
from metadata.generated.schema.entity.services.connections.metadata.openMetadataConnection import ( from metadata.generated.schema.entity.services.connections.metadata.openMetadataConnection import (
OpenMetadataConnection, OpenMetadataConnection,
) )
from metadata.generated.schema.type.queryParserData import QueryParserData
from metadata.generated.schema.type.tableQuery import TableQuery
from metadata.ingestion.api.processor import Processor, ProcessorStatus from metadata.ingestion.api.processor import Processor, ProcessorStatus
from metadata.ingestion.models.table_queries import QueryParserData, TableQuery
from metadata.utils.logger import ingestion_logger from metadata.utils.logger import ingestion_logger
class QueryParserProcessorConfig(ConfigModel):
"""
Query parser pydantic configuration model
"""
filter: Optional[str] = None
logger = ingestion_logger() logger = ingestion_logger()
@ -52,12 +44,12 @@ class QueryParserProcessor(Processor):
status (ProcessorStatus): status (ProcessorStatus):
""" """
config: QueryParserProcessorConfig config: ConfigModel
status: ProcessorStatus status: ProcessorStatus
def __init__( def __init__(
self, self,
config: QueryParserProcessorConfig, config: ConfigModel,
metadata_config: OpenMetadataConnection, metadata_config: OpenMetadataConnection,
): ):
@ -69,26 +61,30 @@ class QueryParserProcessor(Processor):
def create( def create(
cls, config_dict: dict, metadata_config: OpenMetadataConnection, **kwargs cls, config_dict: dict, metadata_config: OpenMetadataConnection, **kwargs
): ):
config = QueryParserProcessorConfig.parse_obj(config_dict) config = ConfigModel.parse_obj(config_dict)
return cls(config, metadata_config) return cls(config, metadata_config)
def process(self, record: TableQuery) -> QueryParserData: def process(self, record: TableQuery) -> Optional[QueryParserData]:
query_parser_data = None
try: try:
start_date = record.analysis_date if not record.query:
if isinstance(record.analysis_date, str): return
start_date = record.analysisDate.__root__
if isinstance(record.analysisDate, str):
start_date = datetime.datetime.strptime( start_date = datetime.datetime.strptime(
str(record.analysis_date), "%Y-%m-%d %H:%M:%S" str(record.analysisDate), "%Y-%m-%d %H:%M:%S"
).date() ).date()
parser = Parser(record.sql) parser = Parser(record.query)
columns_dict = {} if parser.columns_dict is None else parser.columns_dict columns_dict = {} if parser.columns_dict is None else parser.columns_dict
query_parser_data = QueryParserData( query_parser_data = QueryParserData(
tables=parser.tables, tables=parser.tables,
tables_aliases=parser.tables_aliases, tableAliases=parser.tables_aliases,
columns=columns_dict, columns=columns_dict,
database=record.database, database=record.database,
sql=record.sql, databaseSchema=record.databaseSchema,
sql=record.query,
date=start_date.strftime("%Y-%m-%d"), date=start_date.strftime("%Y-%m-%d"),
service_name=record.service_name, serviceName=record.serviceName,
) )
# pylint: disable=broad-except # pylint: disable=broad-except
except Exception as err: except Exception as err:
@ -96,8 +92,6 @@ class QueryParserProcessor(Processor):
logger.debug(record.sql) logger.debug(record.sql)
logger.debug(traceback.format_exc()) logger.debug(traceback.format_exc())
logger.error(err) logger.error(err)
query_parser_data = None
return query_parser_data return query_parser_data
def close(self): def close(self):
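
The processor hands the raw SQL to sql_metadata.Parser and only reads three attributes from it. A standalone sketch of those attributes on a toy query; the commented values are indicative, not exact.

from sql_metadata import Parser

sql = (
    "SELECT o.order_id, c.email "
    "FROM sales.orders o JOIN sales.customers c ON o.customer_id = c.customer_id"
)
parser = Parser(sql)
print(parser.tables)          # e.g. ['sales.orders', 'sales.customers']
print(parser.tables_aliases)  # e.g. {'o': 'sales.orders', 'c': 'sales.customers'}
print(parser.columns_dict)    # e.g. {'select': [...], 'join': [...]}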

View File

@ -15,7 +15,7 @@ import collections
import logging as log import logging as log
import os import os
from datetime import datetime from datetime import datetime
from typing import Iterable from typing import Any, Dict, Iterable, Optional
from google.cloud import logging from google.cloud import logging
@ -31,34 +31,29 @@ from metadata.generated.schema.entity.services.databaseService import (
from metadata.generated.schema.metadataIngestion.workflow import ( from metadata.generated.schema.metadataIngestion.workflow import (
Source as WorkflowSource, Source as WorkflowSource,
) )
from metadata.ingestion.api.source import InvalidSourceException, Source, SourceStatus from metadata.generated.schema.type.tableQuery import TableQuery
from metadata.ingestion.models.table_queries import TableQuery from metadata.ingestion.api.source import InvalidSourceException
from metadata.ingestion.source.database.common_db_source import SQLSourceStatus from metadata.ingestion.source.database.usage_source import UsageSource
from metadata.utils.credentials import set_google_credentials from metadata.utils.credentials import set_google_credentials
from metadata.utils.helpers import get_start_and_end
logger = log.getLogger(__name__) logger = log.getLogger(__name__)
class BigqueryUsageSource(Source[TableQuery]): class BigqueryUsageSource(UsageSource):
SERVICE_TYPE = DatabaseServiceType.BigQuery.value SERVICE_TYPE = DatabaseServiceType.BigQuery.value
scheme = "bigquery" scheme = "bigquery"
def __init__(self, config: WorkflowSource, metadata_config: OpenMetadataConnection): def __init__(self, config: WorkflowSource, metadata_config: OpenMetadataConnection):
super().__init__() super().__init__(config, metadata_config)
self.temp_credentials = None self.temp_credentials = None
self.metadata_config = metadata_config
self.config = config
self.service_connection = config.serviceConnection.__root__.config
# Used as db # Used as db
self.project_id = ( self.project_id = (
self.service_connection.projectId self.connection.projectId or self.connection.credentials.gcsConfig.projectId
or self.service_connection.credentials.gcsConfig.projectId
) )
self.logger_name = "cloudaudit.googleapis.com%2Fdata_access" self.logger_name = "cloudaudit.googleapis.com%2Fdata_access"
self.status = SQLSourceStatus() self.logging_client = logging.Client()
self.usage_logger = self.logging_client.logger(self.logger_name)
logger.debug("Listing entries for logger {}:".format(self.usage_logger.name))
@classmethod @classmethod
def create(cls, config_dict, metadata_config: OpenMetadataConnection): def create(cls, config_dict, metadata_config: OpenMetadataConnection):
@ -75,69 +70,47 @@ class BigqueryUsageSource(Source[TableQuery]):
return cls(config, metadata_config) return cls(config, metadata_config)
def prepare(self): def _get_raw_extract_iter(self) -> Optional[Iterable[Dict[str, Any]]]:
pass entries = self.usage_logger.list_entries()
for entry in entries:
def next_record(self) -> Iterable[TableQuery]: timestamp = entry.timestamp.isoformat()
logging_client = logging.Client() timestamp = datetime.strptime(timestamp[0:10], "%Y-%m-%d")
usage_logger = logging_client.logger(self.logger_name) if timestamp >= self.start and timestamp <= self.end:
logger.debug("Listing entries for logger {}:".format(usage_logger.name)) if ("query" in str(entry.payload)) and type(
start, end = get_start_and_end(self.config.sourceConfig.config.queryLogDuration) entry.payload
try: ) == collections.OrderedDict:
entries = usage_logger.list_entries() payload = list(entry.payload.items())[-1][1]
for entry in entries: if "jobChange" in payload:
timestamp = entry.timestamp.isoformat() logger.debug(f"\nEntries: {payload}")
timestamp = datetime.strptime(timestamp[0:10], "%Y-%m-%d") if "queryConfig" in payload["jobChange"]["job"]["jobConfig"]:
if timestamp >= start and timestamp <= end: queryConfig = payload["jobChange"]["job"]["jobConfig"][
if ("query" in str(entry.payload)) and type(
entry.payload
) == collections.OrderedDict:
payload = list(entry.payload.items())[-1][1]
if "jobChange" in payload:
logger.debug(f"\nEntries: {payload}")
if (
"queryConfig" "queryConfig"
in payload["jobChange"]["job"]["jobConfig"] ]
): else:
queryConfig = payload["jobChange"]["job"]["jobConfig"][ continue
"queryConfig" jobStats = payload["jobChange"]["job"]["jobStats"]
] statementType = ""
else: if hasattr(queryConfig, "statementType"):
continue statementType = queryConfig["statementType"]
jobStats = payload["jobChange"]["job"]["jobStats"] database = self.project_id
statementType = "" analysis_date = str(
if hasattr(queryConfig, "statementType"): datetime.strptime(
statementType = queryConfig["statementType"] jobStats["startTime"][0:19], "%Y-%m-%dT%H:%M:%S"
database = self.project_id ).strftime("%Y-%m-%d %H:%M:%S")
analysis_date = str( )
datetime.strptime( logger.debug(f"Query :{statementType}:{queryConfig['query']}")
jobStats["startTime"][0:19], "%Y-%m-%dT%H:%M:%S" tq = TableQuery(
).strftime("%Y-%m-%d %H:%M:%S") query=queryConfig["query"],
) userName=entry.resource.labels["project_id"],
logger.debug( startTime=str(jobStats["startTime"]),
f"Query :{statementType}:{queryConfig['query']}" endTime=str(jobStats["endTime"]),
) analysisDate=analysis_date,
tq = TableQuery( aborted=0,
query=statementType, database=str(database),
user_name=entry.resource.labels["project_id"], serviceName=self.config.serviceName,
starttime=str(jobStats["startTime"]), databaseSchema=None,
endtime=str(jobStats["endTime"]), )
analysis_date=analysis_date, yield tq
aborted=0,
database=str(database),
sql=queryConfig["query"],
service_name=self.config.serviceName,
)
yield tq
except Exception as err:
logger.error(repr(err))
def get_status(self) -> SourceStatus:
return self.status
def test_connection(self) -> SourceStatus:
pass
def close(self): def close(self):
super().close() super().close()

View File

@ -13,7 +13,6 @@ Clickhouse usage module
""" """
import ast import ast
from typing import Iterable
from metadata.generated.schema.entity.services.connections.database.clickhouseConnection import ( from metadata.generated.schema.entity.services.connections.database.clickhouseConnection import (
ClickhouseConnection, ClickhouseConnection,
@ -22,29 +21,17 @@ from metadata.generated.schema.metadataIngestion.workflow import (
Source as WorkflowSource, Source as WorkflowSource,
) )
from metadata.generated.schema.metadataIngestion.workflow import WorkflowConfig from metadata.generated.schema.metadataIngestion.workflow import WorkflowConfig
from metadata.ingestion.api.source import InvalidSourceException, Source, SourceStatus from metadata.ingestion.api.source import InvalidSourceException
# This import verifies that the dependencies are available.
from metadata.ingestion.models.table_queries import TableQuery
from metadata.ingestion.source.database.common_db_source import SQLSourceStatus
from metadata.ingestion.source.database.usage_source import UsageSource from metadata.ingestion.source.database.usage_source import UsageSource
from metadata.utils.connections import get_connection, test_connection
from metadata.utils.helpers import get_start_and_end
from metadata.utils.sql_queries import CLICKHOUSE_SQL_USAGE_STATEMENT from metadata.utils.sql_queries import CLICKHOUSE_SQL_USAGE_STATEMENT
class ClickhouseUsageSource(UsageSource): class ClickhouseUsageSource(UsageSource):
def __init__(self, config: WorkflowSource, metadata_config: WorkflowConfig): def __init__(self, config: WorkflowSource, metadata_config: WorkflowConfig):
super().__init__(config, metadata_config) super().__init__(config, metadata_config)
self.config = config
self.connection = config.serviceConnection.__root__.config
start, end = get_start_and_end(self.config.sourceConfig.config.queryLogDuration)
self.analysis_date = start
self.sql_stmt = CLICKHOUSE_SQL_USAGE_STATEMENT.format( self.sql_stmt = CLICKHOUSE_SQL_USAGE_STATEMENT.format(
start_time=start, end_time=end start_time=self.start, end_time=self.end
) )
self.report = SQLSourceStatus()
self.engine = get_connection(self.connection)
@classmethod @classmethod
def create(cls, config_dict, metadata_config: WorkflowConfig): def create(cls, config_dict, metadata_config: WorkflowConfig):
@ -57,27 +44,12 @@ class ClickhouseUsageSource(UsageSource):
return cls(config, metadata_config) return cls(config, metadata_config)
def next_record(self) -> Iterable[TableQuery]: def get_database_name(self, data: dict) -> str:
""" """
Using itertools.groupby and raw level iterator, Method to fetch database name from row data
it groups to table and yields TableMetadata
:return:
""" """
for row in self._get_raw_extract_iter(): database = "default"
database = "default" if data["database_name"]:
if row["database_name"]: database_list = ast.literal_eval(data["database_name"])
database_list = ast.literal_eval(row["database_name"]) database = database_list[0] if len(database_list) == 1 else "default"
database = database_list[0] if len(database_list) == 1 else "default" return database
table_query = TableQuery(
query=row["query_id"],
user_name=row["user_name"],
starttime=str(row["start_time"]),
endtime=str(row["end_time"]),
analysis_date=self.analysis_date,
aborted=row["aborted"],
database=database,
sql=row["query_text"],
service_name=self.config.serviceName,
)
yield table_query
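
ClickHouse reports the database name as a string-encoded list, which is why get_database_name decodes it with ast.literal_eval. A tiny standalone version of that decoding, with a hypothetical function name:

import ast

def decode_clickhouse_database(raw: str) -> str:
    # Decode ClickHouse's string-encoded database list, falling back to "default".
    if not raw:
        return "default"
    database_list = ast.literal_eval(raw)
    return database_list[0] if len(database_list) == 1 else "default"

assert decode_clickhouse_database("['ecommerce']") == "ecommerce"
assert decode_clickhouse_database("") == "default"
assert decode_clickhouse_database("['db_a', 'db_b']") == "default"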

View File

@ -23,18 +23,15 @@ from metadata.generated.schema.metadataIngestion.workflow import (
from metadata.generated.schema.metadataIngestion.workflow import WorkflowConfig from metadata.generated.schema.metadataIngestion.workflow import WorkflowConfig
from metadata.ingestion.api.source import InvalidSourceException from metadata.ingestion.api.source import InvalidSourceException
from metadata.ingestion.source.database.usage_source import UsageSource from metadata.ingestion.source.database.usage_source import UsageSource
# This import verifies that the dependencies are available.
from metadata.utils.helpers import get_start_and_end
from metadata.utils.sql_queries import MSSQL_SQL_USAGE_STATEMENT from metadata.utils.sql_queries import MSSQL_SQL_USAGE_STATEMENT
class MssqlUsageSource(UsageSource): class MssqlUsageSource(UsageSource):
def __init__(self, config: WorkflowSource, metadata_config: OpenMetadataConnection): def __init__(self, config: WorkflowSource, metadata_config: OpenMetadataConnection):
super().__init__(config, metadata_config) super().__init__(config, metadata_config)
start, end = get_start_and_end(config.sourceConfig.config.queryLogDuration) self.sql_stmt = MSSQL_SQL_USAGE_STATEMENT.format(
self.analysis_date = start start_date=self.start, end_date=self.end
self.sql_stmt = MSSQL_SQL_USAGE_STATEMENT.format(start_date=start, end_date=end) )
@classmethod @classmethod
def create(cls, config_dict, metadata_config: WorkflowConfig): def create(cls, config_dict, metadata_config: WorkflowConfig):

View File

@ -27,7 +27,6 @@ from metadata.generated.schema.metadataIngestion.workflow import (
from metadata.generated.schema.metadataIngestion.workflow import WorkflowConfig from metadata.generated.schema.metadataIngestion.workflow import WorkflowConfig
from metadata.ingestion.api.source import InvalidSourceException from metadata.ingestion.api.source import InvalidSourceException
from metadata.ingestion.source.database.usage_source import UsageSource from metadata.ingestion.source.database.usage_source import UsageSource
from metadata.utils.helpers import get_start_and_end
# pylint: disable=useless-super-delegation # pylint: disable=useless-super-delegation
from metadata.utils.logger import ingestion_logger from metadata.utils.logger import ingestion_logger
@ -37,24 +36,14 @@ logger = ingestion_logger()
class RedshiftUsageSource(UsageSource): class RedshiftUsageSource(UsageSource):
# SELECT statement from mysql information_schema to extract table and column metadata
SQL_STATEMENT = REDSHIFT_SQL_STATEMENT SQL_STATEMENT = REDSHIFT_SQL_STATEMENT
# CONFIG KEYS
WHERE_CLAUSE_SUFFIX_KEY = "where_clause"
CLUSTER_SOURCE = "cluster_source"
CLUSTER_KEY = "cluster_key"
USE_CATALOG_AS_CLUSTER_NAME = "use_catalog_as_cluster_name"
DATABASE_KEY = "database_key"
SERVICE_TYPE = DatabaseServiceType.Redshift.value
DEFAULT_CLUSTER_SOURCE = "CURRENT_DATABASE()"
def __init__(self, config: WorkflowSource, metadata_config: WorkflowConfig): def __init__(self, config: WorkflowSource, metadata_config: WorkflowConfig):
super().__init__(config, metadata_config) super().__init__(config, metadata_config)
start, end = get_start_and_end(self.config.sourceConfig.config.queryLogDuration)
self.sql_stmt = RedshiftUsageSource.SQL_STATEMENT.format( self.sql_stmt = RedshiftUsageSource.SQL_STATEMENT.format(
start_time=start, end_time=end start_time=self.start, end_time=self.end
) )
self.analysis_date = start
self._extract_iter: Union[None, Iterator] = None self._extract_iter: Union[None, Iterator] = None
self._database = "redshift" self._database = "redshift"

View File

@ -12,7 +12,7 @@
import csv import csv
import json import json
from datetime import datetime from datetime import datetime
from typing import Iterable from typing import Dict, Iterable, Optional
from metadata.generated.schema.entity.services.connections.database.sampleDataConnection import ( from metadata.generated.schema.entity.services.connections.database.sampleDataConnection import (
SampleDataConnection, SampleDataConnection,
@ -27,23 +27,27 @@ from metadata.generated.schema.entity.services.databaseService import (
from metadata.generated.schema.metadataIngestion.workflow import ( from metadata.generated.schema.metadataIngestion.workflow import (
Source as WorkflowSource, Source as WorkflowSource,
) )
from metadata.ingestion.api.source import InvalidSourceException, Source from metadata.generated.schema.type.tableQuery import TableQuery
from metadata.ingestion.models.table_queries import TableQuery from metadata.ingestion.api.source import InvalidSourceException
from metadata.ingestion.ometa.ometa_api import OpenMetadata from metadata.ingestion.ometa.ometa_api import OpenMetadata
from metadata.ingestion.source.database.common_db_source import SQLSourceStatus
from metadata.ingestion.source.database.sample_data import SampleDataSourceStatus from metadata.ingestion.source.database.sample_data import SampleDataSourceStatus
from metadata.ingestion.source.database.usage_source import UsageSource
class SampleUsageSource(Source[TableQuery]): class SampleUsageSource(UsageSource):
service_type = DatabaseServiceType.BigQuery.value service_type = DatabaseServiceType.BigQuery.value
def __init__(self, config: WorkflowSource, metadata_config: OpenMetadataConnection): def __init__(self, config: WorkflowSource, metadata_config: OpenMetadataConnection):
super().__init__()
self.status = SampleDataSourceStatus() self.status = SampleDataSourceStatus()
self.config = config self.config = config
self.service_connection = config.serviceConnection.__root__.config self.service_connection = config.serviceConnection.__root__.config
self.source_config = config.sourceConfig.config
self.metadata_config = metadata_config self.metadata_config = metadata_config
self.report = SQLSourceStatus()
self.metadata = OpenMetadata(metadata_config) self.metadata = OpenMetadata(metadata_config)
self.analysis_date = datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S")
self.service_json = json.load( self.service_json = json.load(
open( open(
@ -70,29 +74,16 @@ class SampleUsageSource(Source[TableQuery]):
) )
return cls(config, metadata_config) return cls(config, metadata_config)
def prepare(self): def _get_raw_extract_iter(self) -> Optional[Iterable[Dict[str, str]]]:
pass
def next_record(self) -> Iterable[TableQuery]:
for row in self.query_logs: for row in self.query_logs:
tq = TableQuery( yield TableQuery(
query=row["query"], query=row["query"],
user_name="", userName="",
starttime="", startTime="",
endtime="", endTime="",
analysis_date=datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S"), analysisDate=self.analysis_date,
database="ecommerce_db",
aborted=False, aborted=False,
sql=row["query"], database="ecommerce_db",
service_name=self.config.serviceName, serviceName=self.config.serviceName,
databaseSchema="shopify",
) )
yield tq
def close(self):
pass
def get_status(self):
return self.status
def test_connection(self) -> None:
pass

View File

@ -12,11 +12,8 @@
Snowflake usage module Snowflake usage module
""" """
import traceback
from datetime import timedelta from datetime import timedelta
from typing import Any, Dict, Iterable, Iterator, Union from typing import Iterable, Iterator, Union
from sqlalchemy import inspect
from metadata.generated.schema.entity.services.connections.database.snowflakeConnection import ( from metadata.generated.schema.entity.services.connections.database.snowflakeConnection import (
SnowflakeConnection, SnowflakeConnection,
@ -31,17 +28,17 @@ from metadata.generated.schema.metadataIngestion.workflow import (
Source as WorkflowSource, Source as WorkflowSource,
) )
from metadata.generated.schema.metadataIngestion.workflow import WorkflowConfig from metadata.generated.schema.metadataIngestion.workflow import WorkflowConfig
from metadata.generated.schema.type.tableQuery import TableQuery
from metadata.ingestion.api.source import InvalidSourceException from metadata.ingestion.api.source import InvalidSourceException
# This import verifies that the dependencies are available. # This import verifies that the dependencies are available.
from metadata.ingestion.models.table_queries import TableQuery
from metadata.ingestion.source.database.usage_source import UsageSource from metadata.ingestion.source.database.usage_source import UsageSource
from metadata.utils.connections import get_connection from metadata.utils.connections import get_connection
from metadata.utils.helpers import get_start_and_end
from metadata.utils.logger import ingestion_logger from metadata.utils.logger import ingestion_logger
from metadata.utils.sql_queries import SNOWFLAKE_SQL_STATEMENT from metadata.utils.sql_queries import SNOWFLAKE_SQL_STATEMENT
logger = ingestion_logger() logger = ingestion_logger()
SNOWFLAKE_ABORTED_CODE = "1969"
class SnowflakeUsageSource(UsageSource): class SnowflakeUsageSource(UsageSource):
@ -60,12 +57,10 @@ class SnowflakeUsageSource(UsageSource):
def __init__(self, config: WorkflowSource, metadata_config: OpenMetadataConnection): def __init__(self, config: WorkflowSource, metadata_config: OpenMetadataConnection):
super().__init__(config, metadata_config) super().__init__(config, metadata_config)
start, end = get_start_and_end(self.config.sourceConfig.config.queryLogDuration) self.end = self.end + timedelta(days=1)
end = end + timedelta(days=1)
self.analysis_date = start
self.sql_stmt = SnowflakeUsageSource.SQL_STATEMENT.format( self.sql_stmt = SnowflakeUsageSource.SQL_STATEMENT.format(
start_date=start, start_date=self.start,
end_date=end, end_date=self.end,
result_limit=self.config.sourceConfig.config.resultLimit, result_limit=self.config.sourceConfig.config.resultLimit,
) )
self._extract_iter: Union[None, Iterator] = None self._extract_iter: Union[None, Iterator] = None
@ -81,9 +76,9 @@ class SnowflakeUsageSource(UsageSource):
) )
return cls(config, metadata_config) return cls(config, metadata_config)
def _get_raw_extract_iter(self) -> Iterable[Dict[str, Any]]: def _get_raw_extract_iter(self) -> Iterable[TableQuery]:
if self.config.serviceConnection.__root__.config.database: if self.config.serviceConnection.__root__.config.database:
yield from super(SnowflakeUsageSource, self)._get_raw_extract_iter() yield from super()._get_raw_extract_iter()
else: else:
query = "SHOW DATABASES" query = "SHOW DATABASES"
results = self.engine.execute(query) results = self.engine.execute(query)
@ -96,35 +91,28 @@ class SnowflakeUsageSource(UsageSource):
self.engine = get_connection(self.connection) self.engine = get_connection(self.connection)
rows = self.engine.execute(self.sql_stmt) rows = self.engine.execute(self.sql_stmt)
for row in rows: for row in rows:
yield row yield TableQuery(
query=row["query_text"],
userName=row["user_name"],
startTime=str(row["start_time"]),
endTime=str(row["end_time"]),
analysisDate=self.analysis_date,
aborted=self.get_aborted_status(row),
database=self.get_database_name(row),
serviceName=self.config.serviceName,
databaseSchema=row["schema_name"],
)
def next_record(self) -> Iterable[TableQuery]: def get_database_name(self, data: dict) -> str:
""" """
Using itertools.groupby and raw level iterator, Method to get database name
it groups to table and yields TableMetadata
:return:
""" """
for row in self._get_raw_extract_iter(): if not data["database_name"] and self.connection.database:
try: return self.connection.database
table_query = TableQuery( return data["database_name"]
query=row["query_type"],
user_name=row["user_name"], def get_aborted_status(self, data: dict) -> bool:
starttime=str(row["start_time"]), """
endtime=str(row["end_time"]), Method to get aborted status of query
analysis_date=self.analysis_date, """
aborted="1969" in str(row["end_time"]), return SNOWFLAKE_ABORTED_CODE in str(data["end_time"])
database=row["database_name"],
sql=row["query_text"],
service_name=self.config.serviceName,
)
if not row["database_name"] and self.connection.database:
TableQuery.database = self.connection.database
logger.debug(f"Parsed Query: {row['query_text']}")
if row["schema_name"] is not None:
self.report.scanned(f"{row['database_name']}.{row['schema_name']}")
else:
self.report.scanned(f"{row['database_name']}")
yield table_query
except Exception as err:
logger.debug(traceback.format_exc())
logger.debug(repr(err))

View File

@ -12,7 +12,8 @@
Usage Source Module Usage Source Module
""" """
import csv import csv
from typing import Any, Dict, Iterable import traceback
from typing import Iterable, Optional
from metadata.generated.schema.entity.services.connections.metadata.openMetadataConnection import ( from metadata.generated.schema.entity.services.connections.metadata.openMetadataConnection import (
OpenMetadataConnection, OpenMetadataConnection,
@ -20,13 +21,17 @@ from metadata.generated.schema.entity.services.connections.metadata.openMetadata
from metadata.generated.schema.metadataIngestion.workflow import ( from metadata.generated.schema.metadataIngestion.workflow import (
Source as WorkflowSource, Source as WorkflowSource,
) )
from metadata.ingestion.api.source import InvalidSourceException, Source, SourceStatus
# This import verifies that the dependencies are available. # This import verifies that the dependencies are available.
from metadata.ingestion.models.table_queries import TableQuery from metadata.generated.schema.type.tableQuery import TableQuery
from metadata.ingestion.api.source import Source, SourceStatus
from metadata.ingestion.source.database.common_db_source import SQLSourceStatus from metadata.ingestion.source.database.common_db_source import SQLSourceStatus
from metadata.utils.connections import get_connection, test_connection from metadata.utils.connections import get_connection, test_connection
from metadata.utils.filters import filter_by_database, filter_by_schema
from metadata.utils.helpers import get_start_and_end from metadata.utils.helpers import get_start_and_end
from metadata.utils.logger import ingestion_logger
logger = ingestion_logger()
class UsageSource(Source[TableQuery]): class UsageSource(Source[TableQuery]):
@ -35,39 +40,63 @@ class UsageSource(Source[TableQuery]):
self.config = config self.config = config
self.metadata_config = metadata_config self.metadata_config = metadata_config
self.connection = config.serviceConnection.__root__.config self.connection = config.serviceConnection.__root__.config
start, end = get_start_and_end(self.config.sourceConfig.config.queryLogDuration) self.source_config = self.config.sourceConfig.config
self.analysis_date = start self.start, self.end = get_start_and_end(self.source_config.queryLogDuration)
self.analysis_date = self.start
self.report = SQLSourceStatus() self.report = SQLSourceStatus()
self.engine = get_connection(self.connection) self.engine = get_connection(self.connection)
def prepare(self): def prepare(self):
return super().prepare() return super().prepare()
def _get_raw_extract_iter(self) -> Iterable[Dict[str, Any]]: def get_database_name(self, data: dict) -> str:
"""
Method to get database name
"""
return data.get("database_name")
def get_aborted_status(self, data: dict) -> bool:
"""
Method to get aborted status of query
"""
return data.get("aborted", False)
def _get_raw_extract_iter(self) -> Optional[Iterable[TableQuery]]:
"""
If queryLogFilePath available in config iterate through log file
otherwise execute the sql query to fetch TableQuery data
"""
if self.config.sourceConfig.config.queryLogFilePath: if self.config.sourceConfig.config.queryLogFilePath:
with open(self.config.sourceConfig.config.queryLogFilePath, "r") as fin: with open(self.config.sourceConfig.config.queryLogFilePath, "r") as fin:
for i in csv.DictReader(fin): for i in csv.DictReader(fin):
query_dict = dict(i) query_dict = dict(i)
row = { yield TableQuery(
"query_type": query_dict.get("query"), query=query_dict["query_text"],
"user_name": query_dict.get("user_name", ""), userName=query_dict.get("user_name", ""),
"start_time": query_dict.get("start_time", ""), startTime=query_dict.get("start_time", ""),
"end_time": query_dict.get("end_time", ""), endTime=query_dict.get("end_time", ""),
"aborted": query_dict.get("aborted", False), analysisDate=self.analysis_date,
"database_name": query_dict.get( aborted=self.get_aborted_status(query_dict),
"database_name", database=self.get_database_name(query_dict),
self.connection.database serviceName=self.config.serviceName,
if self.connection.database databaseSchema=query_dict.get("schema_name"),
else "default", )
),
"query_text": query_dict.get("query"),
"schema_name": query_dict.get("schema_name"),
}
yield row
else: else:
rows = self.engine.execute(self.sql_stmt) rows = self.engine.execute(self.sql_stmt)
for row in rows: for row in rows:
yield row row = dict(row)
yield TableQuery(
query=row["query_text"],
userName=row["user_name"],
startTime=str(row["start_time"]),
endTime=str(row["end_time"]),
analysisDate=self.analysis_date,
aborted=self.get_aborted_status(row),
database=self.get_database_name(row),
serviceName=self.config.serviceName,
databaseSchema=row["schema_name"],
)
def next_record(self) -> Iterable[TableQuery]: def next_record(self) -> Iterable[TableQuery]:
""" """
@ -75,24 +104,32 @@ class UsageSource(Source[TableQuery]):
it groups to table and yields TableMetadata it groups to table and yields TableMetadata
:return: :return:
""" """
for table_query in self._get_raw_extract_iter():
if table_query:
if filter_by_database(
self.source_config.databaseFilterPattern,
database_name=table_query.database,
):
continue
if filter_by_schema(
self.source_config.schemaFilterPattern,
schema_name=table_query.databaseSchema,
):
continue
for row in self._get_raw_extract_iter(): try:
table_query = TableQuery( yield table_query
query=row["query_type"], logger.debug(f"Parsed Query: {table_query.query}")
user_name=row["user_name"], if not table_query.databaseSchema:
starttime=str(row["start_time"]), self.report.scanned(
endtime=str(row["end_time"]), f"{table_query.database}.{table_query.databaseSchema}"
analysis_date=self.analysis_date, )
aborted=row["aborted"], else:
database=row["database_name"], self.report.scanned(f"{table_query.database}")
sql=row["query_text"], yield table_query
service_name=self.config.serviceName, except Exception as err:
) logger.debug(traceback.format_exc())
if not row["schema_name"]: logger.error(str(err))
self.report.scanned(f"{row['database_name']}.{row['schema_name']}")
else:
self.report.scanned(f"{row['database_name']}")
yield table_query
def get_report(self): def get_report(self):
""" """

View File

@ -16,13 +16,13 @@ from metadata.generated.schema.entity.data.table import SqlQuery
from metadata.generated.schema.entity.services.connections.metadata.openMetadataConnection import ( from metadata.generated.schema.entity.services.connections.metadata.openMetadataConnection import (
OpenMetadataConnection, OpenMetadataConnection,
) )
from metadata.ingestion.api.stage import Stage, StageStatus from metadata.generated.schema.type.queryParserData import QueryParserData
from metadata.ingestion.models.table_queries import ( from metadata.generated.schema.type.tableUsageCount import (
QueryParserData,
TableColumn, TableColumn,
TableColumnJoin, TableColumnJoin,
TableUsageCount, TableUsageCount,
) )
from metadata.ingestion.api.stage import Stage, StageStatus
from metadata.ingestion.stage.file import FileStageConfig from metadata.ingestion.stage.file import FileStageConfig
from metadata.utils.logger import ingestion_logger from metadata.utils.logger import ingestion_logger
@@ -58,7 +58,7 @@ def get_table_column_join(table, table_aliases, joins, database):
        )
    except ValueError as err:
        logger.error("Error in parsing sql query joins {}".format(err))
-    return TableColumnJoin(table_column=table_column, joined_with=joined_with)
+    return TableColumnJoin(tableColumn=table_column, joinedWith=joined_with)
class TableUsageStage(Stage[QueryParserData]):
@@ -104,7 +104,7 @@ class TableUsageStage(Stage[QueryParserData]):
                table_usage_count.joins.append(
                    get_table_column_join(
                        table,
-                        record.tables_aliases,
+                        record.tableAliases,
                        record.columns["join"],
                        record.database,
                    )
@@ -114,7 +114,7 @@ class TableUsageStage(Stage[QueryParserData]):
                if record.columns.get("join") is not None:
                    tbl_column_join = get_table_column_join(
                        table,
-                        record.tables_aliases,
+                        record.tableAliases,
                        record.columns["join"],
                        record.database,
                    )
@@ -126,8 +126,9 @@ class TableUsageStage(Stage[QueryParserData]):
                    database=record.database,
                    date=record.date,
                    joins=joins,
-                    service_name=record.service_name,
-                    sql_queries=[],
+                    serviceName=record.serviceName,
+                    sqlQueries=[],
+                    databaseSchema=record.databaseSchema,
                )
        except Exception as exc:
@@ -140,7 +141,7 @@ class TableUsageStage(Stage[QueryParserData]):
    def close(self):
        for key, value in self.table_usage.items():
-            value.sql_queries = self.table_queries.get(key, [])
+            value.sqlQueries = self.table_queries.get(key, [])
            data = value.json()
            self.file.write(json.dumps(data))
            self.file.write("\n")

View File

@@ -51,7 +51,9 @@ def get_query_from_dict(data: Dict[str, Optional[str]]) -> str:
    )
-def get_entity_from_es_result(entity_list: Optional[List[T]]) -> Optional[T]:
+def get_entity_from_es_result(
+    entity_list: Optional[List[T]], fetch_multiple_entities: bool = False
+) -> Optional[T]:
    """
    Return a single element from an entity list obtained
    from an ES query
@@ -59,6 +61,8 @@ def get_entity_from_es_result(entity_list: Optional[List[T]]) -> Optional[T]:
    :return: single entity
    """
    if entity_list and len(entity_list):
+        if fetch_multiple_entities:
+            return entity_list
        return entity_list[0]
    logger.warning("ES Query was empty")

View File

@@ -14,9 +14,12 @@ Filter information has been taken from the
ES indexes definitions
"""
import re
-from typing import List, Optional, Type, TypeVar
-from antlr4 import *
+from typing import List, Optional, Type, TypeVar, Union
+from antlr4.CommonTokenStream import CommonTokenStream
+from antlr4.error.ErrorStrategy import BailErrorStrategy
+from antlr4.InputStream import InputStream
+from antlr4.tree.Tree import ParseTreeWalker
from pydantic import BaseModel
from metadata.antlr.split_listener import SplitListener
@@ -121,6 +124,7 @@ def _(
    schema_name: Optional[str],
    table_name: str,
    retries: int = 3,
+    fetch_multiple_entities: bool = False,
) -> Optional[str]:
    """
    Building logic for tables
@@ -148,10 +152,16 @@ def _(
        },
        retries=retries,
    )
-    entity: Optional[Table] = get_entity_from_es_result(entity_list=es_result)
-    return str(entity.fullyQualifiedName.__root__) if entity else None
-    return _build(service_name, database_name, schema_name, table_name)
+    entity: Optional[Union[Table, List[Table]]] = get_entity_from_es_result(
+        entity_list=es_result, fetch_multiple_entities=fetch_multiple_entities
+    )
+    if not entity:
+        return None
+    if fetch_multiple_entities:
+        return [str(table.fullyQualifiedName.__root__) for table in entity]
+    return str(entity.fullyQualifiedName.__root__)
+    fqn = _build(service_name, database_name, schema_name, table_name)
+    return [fqn] if fetch_multiple_entities else fqn
@fqn_build_registry.add(DatabaseSchema)
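A sketch of a call site for the extended builder; the service, database, and table names are made up, and it assumes an already initialized OpenMetadata client.

from typing import List, Optional, Union

from metadata.generated.schema.entity.data.table import Table
from metadata.ingestion.ometa.ometa_api import OpenMetadata
from metadata.utils import fqn

def resolve_table_fqns(metadata: OpenMetadata, table_name: str) -> Optional[Union[str, List[str]]]:
    # With fetch_multiple_entities=True the builder returns every matching FQN
    # found through Elasticsearch instead of only the first one.
    return fqn.build(
        metadata,
        entity_type=Table,
        service_name="mysql_prod",   # made-up service
        database_name="shopify",     # made-up database
        schema_name=None,            # unknown schema, resolved via ES
        table_name=table_name,
        fetch_multiple_entities=True,
    )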

View File

@@ -10,7 +10,7 @@
# limitations under the License.
from datetime import datetime, timedelta
-from typing import Any, Dict, Iterable
+from typing import Any, Dict, Iterable, Optional
from metadata.generated.schema.api.services.createDashboardService import (
    CreateDashboardServiceRequest,
@@ -46,7 +46,7 @@ def get_start_and_end(duration):
    start = (today + timedelta(0 - duration)).replace(
        hour=0, minute=0, second=0, microsecond=0
    )
-    end = (today + timedelta(3)).replace(hour=0, minute=0, second=0, microsecond=0)
+    end = today.replace(hour=0, minute=0, second=0, microsecond=0)
    return start, end
@@ -185,8 +185,9 @@ def datetime_to_ts(date: datetime) -> int:
    return int(date.timestamp() * 1_000)
-def _get_formmated_table_name(table_name):
-    return table_name.replace("[", "").replace("]", "")
+def get_formatted_entity_name(name: str) -> Optional[str]:
+    if name:
+        return name.replace("[", "").replace("]", "")
def get_raw_extract_iter(alchemy_helper) -> Iterable[Dict[str, Any]]:
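Assuming the two helpers touched above are imported from metadata.utils.helpers, a quick illustration of the renamed formatter and the tightened usage window (the datetimes depend on the current date):

from metadata.utils.helpers import get_formatted_entity_name, get_start_and_end

assert get_formatted_entity_name("[dbo].[dim_address]") == "dbo.dim_address"
assert get_formatted_entity_name(None) is None

# The usage window now ends at the start of today instead of three days ahead.
start, end = get_start_and_end(duration=1)
print(start, end)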

View File

@@ -13,33 +13,99 @@ Helper functions to handle SQL lineage operations
"""
import traceback
from logging.config import DictConfigurator
+from typing import List, Optional
from metadata.generated.schema.api.lineage.addLineage import AddLineageRequest
from metadata.generated.schema.entity.data.table import Table
-from metadata.generated.schema.type.entityLineage import EntitiesEdge
+from metadata.generated.schema.type.entityLineage import (
+    ColumnLineage,
+    EntitiesEdge,
+    LineageDetails,
+)
from metadata.generated.schema.type.entityReference import EntityReference
+from metadata.ingestion.ometa.client import APIError
from metadata.ingestion.ometa.ometa_api import OpenMetadata
from metadata.utils import fqn
-from metadata.utils.helpers import _get_formmated_table_name
+from metadata.utils.helpers import get_formatted_entity_name
from metadata.utils.logger import utils_logger
-# Prevent sqllineage from modifying the logger config
-def configure(self):
-    pass
-DictConfigurator.configure = configure
logger = utils_logger()
+column_lineage_map = {}
-def _separate_fqn(database, fqn):
-    database_schema, table = fqn.split(".")[-2:]
-    if not database_schema:
-        database_schema = None
-    return {"database": database, "database_schema": database_schema, "name": table}
+def split_raw_table_name(database: str, raw_name: str) -> dict:
+    database_schema = None
+    if "." in raw_name:
+        database_schema, table = fqn.split(raw_name)[-2:]
+        if database_schema == "<default>":
+            database_schema = None
+    return {"database": database, "database_schema": database_schema, "table": table}
def get_column_fqn(table_entity: Table, column: str) -> Optional[str]:
"""
Get fqn of column if exist in table entity
"""
if not table_entity:
return
for tbl_column in table_entity.columns:
if column.lower() == tbl_column.name.__root__.lower():
return tbl_column.fullyQualifiedName.__root__
def search_table_entities(
metadata: OpenMetadata,
service_name: str,
database: str,
database_schema: Optional[str],
table: str,
) -> Optional[List[Table]]:
"""
Method to get table entity from database, database_schema & table name
"""
try:
table_fqns = fqn.build(
metadata,
entity_type=Table,
service_name=service_name,
database_name=database,
schema_name=database_schema,
table_name=table,
fetch_multiple_entities=True,
)
table_entities = []
for table_fqn in table_fqns or []:
try:
table_entity = metadata.get_by_name(Table, fqn=table_fqn)
table_entities.append(table_entity)
except APIError:
logger.debug(f"Table not found for fqn: {fqn}")
return table_entities
except Exception as err:
logger.debug(traceback.format_exc())
logger.error(err)
def get_column_lineage(
to_entity: Table,
from_entity: Table,
to_table_raw_name: str,
from_table_raw_name: str,
) -> List[ColumnLineage]:
column_lineage = []
if column_lineage_map.get(to_table_raw_name) and column_lineage_map.get(
to_table_raw_name
).get(from_table_raw_name):
for to_col, from_col in column_lineage_map.get(to_table_raw_name).get(
from_table_raw_name
):
to_col_fqn = get_column_fqn(to_entity, to_col)
from_col_fqn = get_column_fqn(from_entity, from_col)
if to_col_fqn and from_col_fqn:
column_lineage.append(
ColumnLineage(fromColumns=[from_col_fqn], toColumn=to_col_fqn)
)
return column_lineage
def _create_lineage_by_table_name(
@@ -48,53 +114,46 @@ def _create_lineage_by_table_name(
    to_table: str,
    service_name: str,
    database: str,
+    query: str,
):
    """
    This method is to create a lineage between two tables
    """
    try:
-        from_table = str(from_table).replace("<default>", "")
-        to_table = str(to_table).replace("<default>", "")
-        from_fqn = fqn.build(
-            metadata,
-            entity_type=Table,
-            service_name=service_name,
-            database_name=database,
-            schema_name=None,  # TODO: Split table name
-            table_name=_get_formmated_table_name(str(from_table)),
-        )
-        from_entity: Table = metadata.get_by_name(entity=Table, fqn=from_fqn)
-        if not from_entity:
-            table_obj = _separate_fqn(database=database, fqn=from_fqn)
-            multiple_from_fqns = metadata.es_search_from_service(
-                entity_type=Table,
-                service_name=service_name,
-                filters=table_obj,
-            )
-        else:
-            multiple_from_fqns = [from_entity]
-        to_fqn = fqn.build(
-            metadata,
-            entity_type=Table,
-            service_name=service_name,
-            database_name=database,
-            schema_name=None,  # TODO: Split table name
-            table_name=_get_formmated_table_name(str(to_table)),
-        )
-        to_entity: Table = metadata.get_by_name(entity=Table, fqn=to_fqn)
-        if not to_entity:
-            table_obj = _separate_fqn(database=database, fqn=to_fqn)
-            multiple_to_fqns = metadata.es_search_from_service(
-                entity_type=Table,
-                service_name=service_name,
-                filters=table_obj,
-            )
-        else:
-            multiple_to_fqns = [to_entity]
-        if not multiple_to_fqns or not multiple_from_fqns:
-            return None
-        for from_entity in multiple_from_fqns:
-            for to_entity in multiple_to_fqns:
+        from_raw_name = get_formatted_entity_name(str(from_table))
+        from_table_obj = split_raw_table_name(database=database, raw_name=from_raw_name)
+        from_entities = search_table_entities(
+            table=from_table_obj.get("table"),
+            database_schema=from_table_obj.get("database_schema"),
+            database=from_table_obj.get("database"),
+            metadata=metadata,
+            service_name=service_name,
+        )
+        to_raw_name = get_formatted_entity_name(str(from_table))
+        to_table_obj = split_raw_table_name(database=database, raw_name=to_raw_name)
+        to_entities = search_table_entities(
+            table=to_table_obj.get("table"),
+            database_schema=to_table_obj.get("database_schema"),
+            database=to_table_obj.get("database"),
+            metadata=metadata,
+            service_name=service_name,
+        )
+        if not to_entities or not from_entities:
+            return None
+        for from_entity in from_entities:
+            for to_entity in to_entities:
+                col_lineage = get_column_lineage(
+                    to_entity=to_entity,
+                    to_table_raw_name=str(to_table),
+                    from_entity=from_entity,
+                    from_table_raw_name=str(from_table),
+                )
+                lineage_details = None
+                if col_lineage:
+                    lineage_details = LineageDetails(
+                        sqlQuery=query, columnsLineage=col_lineage
+                    )
                lineage = AddLineageRequest(
                    edge=EntitiesEdge(
                        fromEntity=EntityReference(
@@ -107,13 +166,37 @@ def _create_lineage_by_table_name(
                        ),
                    )
                )
+                if lineage_details:
+                    lineage.edge.lineageDetails = lineage_details
                created_lineage = metadata.add_lineage(lineage)
                logger.info(f"Successfully added Lineage {created_lineage}")
    except Exception as err:
        logger.debug(traceback.format_exc())
-        logger.error(err)
+        logger.error(traceback.format_exc())
def populate_column_lineage_map(raw_column_lineage):
lineage_map = {}
if not raw_column_lineage or len(raw_column_lineage[0]) != 2:
return
for source, target in raw_column_lineage:
if lineage_map.get(str(target.parent)):
ele = lineage_map.get(str(target.parent))
if ele.get(str(source.parent)):
ele[str(source.parent)].append(
(
target.raw_name,
source.raw_name,
)
)
else:
ele[str(source.parent)] = [(target.raw_name, source.raw_name)]
else:
lineage_map[str(target.parent)] = {
str(source.parent): [(target.raw_name, source.raw_name)]
}
return lineage_map
def ingest_lineage_by_query(
@@ -123,12 +206,22 @@ def ingest_lineage_by_query(
    This method parses the query to get source, target and intermediate table names to create lineage,
    and returns True if target table is found to create lineage otherwise returns False.
    """
+    # Prevent sqllineage from modifying the logger config
+    # Disable the DictConfigurator.configure method while importing LineageRunner
+    configure = DictConfigurator.configure
+    DictConfigurator.configure = lambda _: None
    from sqllineage.runner import LineageRunner
+    # Reverting changes after import is done
+    DictConfigurator.configure = configure
+    column_lineage_map.clear()
    try:
        result = LineageRunner(query)
        if not result.target_tables:
            return False
+        raw_column_lineage = result.get_column_lineage()
+        column_lineage_map.update(populate_column_lineage_map(raw_column_lineage))
        for intermediate_table in result.intermediate_tables:
            for source_table in result.source_tables:
                _create_lineage_by_table_name(
@@ -137,6 +230,7 @@ def ingest_lineage_by_query(
                    to_table=intermediate_table,
                    service_name=service_name,
                    database=database,
+                    query=query,
                )
        for target_table in result.target_tables:
            _create_lineage_by_table_name(
@@ -145,6 +239,7 @@ def ingest_lineage_by_query(
                to_table=target_table,
                service_name=service_name,
                database=database,
+                query=query,
            )
        if not result.intermediate_tables:
            for target_table in result.target_tables:
@@ -155,6 +250,7 @@ def ingest_lineage_by_query(
                    to_table=target_table,
                    service_name=service_name,
                    database=database,
+                    query=query,
                )
        return True
    except Exception as err:

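To see what column_lineage_map ends up holding, a toy query can be pushed through sqllineage the same way ingest_lineage_by_query does. The table and column names are made up, and this assumes the helpers above live in metadata.utils.sql_lineage.

from logging.config import DictConfigurator

# Same logger-config guard used above while importing LineageRunner.
configure = DictConfigurator.configure
DictConfigurator.configure = lambda _: None
from sqllineage.runner import LineageRunner
DictConfigurator.configure = configure

from metadata.utils.sql_lineage import populate_column_lineage_map

query = (
    "INSERT INTO shopify.dim_address_clean "
    "SELECT address_id, shop_id FROM shopify.dim_address"
)
raw_column_lineage = LineageRunner(query).get_column_lineage()
# Expected shape: {target_table: {source_table: [(target_col, source_col), ...]}}
print(populate_column_lineage_map(raw_column_lineage))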
View File

@@ -62,7 +62,7 @@ from metadata.generated.schema.tests.table.tableRowCountToEqual import (
)
from metadata.generated.schema.tests.tableTest import TableTestCase, TableTestType
from metadata.generated.schema.type.entityReference import EntityReference
-from metadata.ingestion.models.table_queries import TableUsageRequest
+from metadata.generated.schema.type.usageRequest import UsageRequest
from metadata.ingestion.ometa.client import APIError
from metadata.ingestion.ometa.ometa_api import OpenMetadata
@@ -340,7 +340,7 @@ class OMetaTableTest(TestCase):
            entity=Table, fqn=self.entity.fullyQualifiedName
        )
-        usage = TableUsageRequest(date="2021-10-20", count=10)
+        usage = UsageRequest(date="2021-10-20", count=10)
        self.metadata.publish_table_usage(res, usage)

View File

@@ -28,12 +28,16 @@ config = """
            "sampleDataFolder": "ingestion/examples/sample_data"
        }
    },
-    "sourceConfig": {}
+    "sourceConfig": {
+        "config":{
+            "type": "DatabaseUsage"
+        }
+    }
},
"processor": {
    "type": "query-parser",
    "config": {
-        "filter": ""
    }
},
"stage": {
@@ -64,31 +68,31 @@ class QueryParserTest(TestCase):
        Check the join count
        """
        expected_result = {
"shopify.dim_address": 100, "shopify.dim_address": 200,
"shopify.shop": 150, "shopify.shop": 300,
"shopify.dim_customer": 125, "shopify.dim_customer": 250,
"dim_customer": 38, "dim_customer": 76,
"shopify.dim_location": 75, "shopify.dim_location": 150,
"dim_location.shop_id": 25, "dim_location.shop_id": 50,
"shop": 28, "shop": 56,
"shop_id": 25, "shop_id": 50,
"shopify.dim_staff": 75, "shopify.dim_staff": 150,
"shopify.fact_line_item": 100, "shopify.fact_line_item": 200,
"shopify.fact_order": 155, "shopify.fact_order": 310,
"shopify.product": 5, "shopify.product": 10,
"shopify.fact_sale": 260, "shopify.fact_sale": 520,
"dim_address": 12, "dim_address": 24,
"api": 2, "api": 4,
"dim_location": 4, "dim_location": 8,
"product": 16, "product": 32,
"dim_staff": 5, "dim_staff": 10,
"fact_line_item": 17, "fact_line_item": 34,
"fact_order": 15, "fact_order": 30,
"fact_sale": 27, "fact_sale": 54,
"fact_session": 31, "fact_session": 62,
"raw_customer": 10, "raw_customer": 20,
"raw_order": 13, "raw_order": 26,
"raw_product_catalog": 6, "raw_product_catalog": 12,
} }
workflow = Workflow.create(json.loads(config)) workflow = Workflow.create(json.loads(config))
workflow.execute() workflow.execute()