Mirror of https://github.com/open-metadata/OpenMetadata.git (synced 2025-10-29 09:42:23 +00:00)
parent 37057c79a2
commit b3428771a3
@@ -35,6 +35,18 @@
     "queryLogFilePath": {
       "description": "Configuration to set the file path for query logs",
       "type": "string"
+    },
+    "schemaFilterPattern": {
+      "description": "Regex to only fetch tables or databases that matches the pattern.",
+      "$ref": "../type/filterPattern.json#/definitions/filterPattern"
+    },
+    "tableFilterPattern": {
+      "description": "Regex exclude tables or databases that matches the pattern.",
+      "$ref": "../type/filterPattern.json#/definitions/filterPattern"
+    },
+    "databaseFilterPattern": {
+      "description": "Regex to only fetch databases that matches the pattern.",
+      "$ref": "../type/filterPattern.json#/definitions/filterPattern"
     }
   },
   "additionalProperties": false
@@ -0,0 +1,46 @@
+{
+  "$id": "https://open-metadata.org/schema/entity/data/queryParserData.json",
+  "$schema": "http://json-schema.org/draft-07/schema#",
+  "title": "Query Parser Data",
+  "description": "This schema defines type of query parser data",
+  "type": "object",
+  "properties": {
+    "tables": {
+      "description": "List of tables used in query",
+      "type": "array",
+      "items": {
+        "type": "string"
+      }
+    },
+    "tableAliases": {
+      "description": "Table names mapped with alias used in query",
+      "type": "object"
+    },
+    "columns": {
+      "description": "Table columns used in query",
+      "type": "object"
+    },
+    "database": {
+      "description": "Database of the associated with query",
+      "type": "string"
+    },
+    "sql": {
+      "description": "SQL query",
+      "type": "string"
+    },
+    "serviceName": {
+      "description": "Name that identifies this database service.",
+      "type": "string"
+    },
+    "date": {
+      "description": "Date of execution of SQL query",
+      "type": "string"
+    },
+    "databaseSchema": {
+      "description": "Database schema of the associated with query",
+      "type": "string"
+    }
+  },
+  "required": ["sql", "serviceName", "tables", "database"],
+  "additionalProperties": false
+}
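For orientation, records matching this schema show up later in the diff as the generated pydantic model metadata.generated.schema.type.queryParserData.QueryParserData. A minimal sketch of constructing one, assuming the generated class keeps the schema's field names (the values here are made up for illustration):

from metadata.generated.schema.type.queryParserData import QueryParserData

# Hypothetical record; field names mirror the schema above.
parsed = QueryParserData(
    tables=["dim_address"],
    tableAliases={"a": "dim_address"},
    columns={"select": ["address_id", "shop_id"]},
    database="ecommerce_db",
    databaseSchema="shopify",
    sql="SELECT a.address_id, a.shop_id FROM dim_address a",
    serviceName="sample_data",
    date="2022-04-01",
)
print(parsed.tables, parsed.serviceName)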
@@ -0,0 +1,47 @@
+{
+  "$id": "https://open-metadata.org/schema/entity/data/tableQuery.json",
+  "$schema": "http://json-schema.org/draft-07/schema#",
+  "title": "Table Query",
+  "description": "This schema defines structure of table query",
+  "type": "object",
+  "properties": {
+    "query": {
+      "description": "SQL query",
+      "type": "string"
+    },
+    "userName": {
+      "description": "Name of the user that executed the SQL query",
+      "type": "string"
+    },
+    "startTime": {
+      "description": "Start time of execution of SQL query",
+      "type": "string"
+    },
+    "endTime": {
+      "description": "End time of execution of SQL query",
+      "type": "string"
+    },
+    "analysisDate": {
+      "description": "Date of execution of SQL query",
+      "$ref": "./basic.json#/definitions/dateTime"
+    },
+    "aborted": {
+      "description": "Flag to check if query was aborted during execution",
+      "type": "boolean"
+    },
+    "serviceName": {
+      "description": "Name that identifies this database service.",
+      "type": "string"
+    },
+    "database": {
+      "description": "Database of the associated with query",
+      "type": "string"
+    },
+    "databaseSchema": {
+      "description": "Database schema of the associated with query",
+      "type": "string"
+    }
+  },
+  "required": ["query", "sql", "serviceName"],
+  "additionalProperties": false
+}
@@ -0,0 +1,81 @@
+{
+  "$id": "https://open-metadata.org/schema/entity/data/tableUsageCount.json",
+  "$schema": "http://json-schema.org/draft-07/schema#",
+  "title": "Table Usage Count",
+  "description": "This model is the linking between the usage stage and bulksink steps",
+  "type": "object",
+  "definitions": {
+    "tableColumn": {
+      "type": "object",
+      "properties": {
+        "table": {
+          "description": "Name of the table",
+          "type": "string"
+        },
+        "column": {
+          "description": "Name of the column",
+          "type": "string"
+        }
+      }
+    },
+    "tableColumnJoin": {
+      "type": "object",
+      "properties": {
+        "tableColumn": {
+          "description": "Source table column",
+          "$ref": "#/definitions/tableColumn"
+        },
+        "joinedWith": {
+          "description": "List of table columns with which the table is joined with",
+          "type": "array",
+          "items": {
+            "$ref": "#/definitions/tableColumn"
+          }
+        }
+      }
+    }
+  },
+  "properties": {
+    "table": {
+      "description": "Name of the table",
+      "type": "string"
+    },
+    "date": {
+      "description": "Date of execution of SQL query",
+      "type": "string"
+    },
+    "database": {
+      "description": "Database of the associated with table",
+      "type": "string"
+    },
+    "count": {
+      "description": "Usage count of table",
+      "type": "integer",
+      "default": 1
+    },
+    "databaseSchema": {
+      "description": "Database schema of the associated with table",
+      "type": "string"
+    },
+    "sqlQueries": {
+      "description": "List of SQL Queries associated with table",
+      "type": "array",
+      "items": {
+        "$ref": "../entity/data/table.json#/definitions/sqlQuery"
+      }
+    },
+    "joins": {
+      "description": "List of joins associated with table",
+      "type": "array",
+      "items": {
+        "$ref": "#/definitions/tableColumnJoin"
+      }
+    },
+    "serviceName": {
+      "description": "Name that identifies this database service.",
+      "type": "string"
+    }
+  },
+  "required": ["tableName", "date", "database", "serviceName"],
+  "additionalProperties": false
+}
@@ -0,0 +1,20 @@
+{
+  "$id": "https://open-metadata.org/schema/entity/data/usageRequest.json",
+  "$schema": "http://json-schema.org/draft-07/schema#",
+  "title": "Usage Request",
+  "description": "This schema defines type of table usage request used to publish the usage count on a perticular date",
+  "javaType": "org.openmetadata.catalog.type.UsageRequest",
+  "type": "object",
+  "properties": {
+    "date": {
+      "description": "Date of execution of SQL query",
+      "type": "string"
+    },
+    "count": {
+      "description": "Usage count of table",
+      "type": "integer"
+    }
+  },
+  "required": ["date", "count"],
+  "additionalProperties": false
+}
@@ -4,7 +4,6 @@ source:
   serviceConnection:
     config:
       type: BigQuery
-      enablePolicyTagImport: true
      projectId: project_id
      credentials:
        gcsConfig:
@@ -23,8 +22,7 @@ source:
      queryLogDuration: '1'
 processor:
   type: query-parser
-  config:
-    filter: ''
+  config: {}
 stage:
   type: table-usage
   config:
@@ -13,8 +13,7 @@ source:
      queryLogDuration: '1'
 processor:
   type: query-parser
-  config:
-    filter: ''
+  config: {}
 stage:
   type: table-usage
   config:
@@ -11,11 +11,9 @@ source:
   sourceConfig:
     config:
       queryLogDuration: '1'
-      queryLogFilePath: <path to query log file>
 processor:
   type: query-parser
-  config:
-    filter: ''
+  config: {}
 stage:
   type: table-usage
   config:
@@ -13,8 +13,7 @@ source:
      queryLogDuration: '1'
 processor:
   type: query-parser
-  config:
-    filter: ''
+  config: {}
 stage:
   type: table-usage
   config:
@@ -14,8 +14,7 @@ source:
      resultLimit: 1000
 processor:
   type: query-parser
-  config:
-    filter: ''
+  config: {}
 stage:
   type: table-usage
   config:
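The repeated processor change in these workflow samples swaps an empty config: key plus the dropped filter option for an explicit empty mapping. A small sketch of the difference that presumably motivates it, assuming PyYAML (already a base requirement in setup.py): an empty YAML value loads as None, while {} loads as a dict.

import yaml

# 'config:' with nothing under it parses to None; 'config: {}' parses to an empty dict,
# so downstream code can treat it uniformly as a mapping.
old = yaml.safe_load("processor:\n  type: query-parser\n  config:\n")
new = yaml.safe_load("processor:\n  type: query-parser\n  config: {}\n")
print(old["processor"]["config"])  # None
print(new["processor"]["config"])  # {}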
@@ -8,12 +8,15 @@
       "sampleDataFolder": "./examples/sample_data"
     }
   },
-  "sourceConfig": {}
+  "sourceConfig": {
+    "config": {
+      "type": "DatabaseUsage"
+    }
+  }
 },
 "processor": {
   "type": "query-parser",
   "config": {
-    "filter": ""
   }
 },
 "stage": {
@@ -43,7 +43,7 @@ base_requirements = {
     "Jinja2>=2.11.3",
     "PyYAML",
     "jsonschema",
-    "sqllineage==1.3.3",
+    "sqllineage==1.3.5",
     "antlr4-python3-runtime==4.9.2",
     # compatibility requirements for 3.7
     "typing-compat~=0.1.0",
@@ -61,7 +61,7 @@ base_plugins = {
     "query-parser",
     "metadata-usage",
     "file-stage",
-    "sql-metadata~=2.0.0",
+    "sql-metadata~=2.5.0",
 }
 plugins: Dict[str, Set[str]] = {
     "airflow": {
@@ -15,23 +15,29 @@ from typing import List, Optional
 
 from metadata.config.common import ConfigModel
 from metadata.generated.schema.entity.data.database import Database
-from metadata.generated.schema.entity.data.table import ColumnJoins, Table, TableJoins
+from metadata.generated.schema.entity.data.table import (
+    ColumnJoins,
+    JoinedWith,
+    SqlQuery,
+    Table,
+    TableJoins,
+)
 from metadata.generated.schema.entity.services.connections.metadata.openMetadataConnection import (
     OpenMetadataConnection,
 )
+from metadata.generated.schema.type.tableUsageCount import TableColumn, TableUsageCount
+from metadata.generated.schema.type.usageRequest import UsageRequest
 from metadata.ingestion.api.bulk_sink import BulkSink, BulkSinkStatus
-from metadata.ingestion.models.table_queries import (
-    ColumnJoinedWith,
-    TableColumn,
-    TableUsageCount,
-    TableUsageRequest,
-)
 from metadata.ingestion.ometa.client import APIError
 from metadata.ingestion.ometa.ometa_api import OpenMetadata
 from metadata.utils import fqn
-from metadata.utils.helpers import _get_formmated_table_name
+from metadata.utils.helpers import get_formatted_entity_name
 from metadata.utils.logger import ingestion_logger
-from metadata.utils.sql_lineage import ingest_lineage_by_query
+from metadata.utils.sql_lineage import (
+    get_column_fqn,
+    ingest_lineage_by_query,
+    search_table_entities,
+)
 
 logger = ingestion_logger()
 
@ -57,6 +63,7 @@ class MetadataUsageBulkSink(BulkSink):
|
|||||||
self.metadata = OpenMetadata(self.metadata_config)
|
self.metadata = OpenMetadata(self.metadata_config)
|
||||||
self.status = BulkSinkStatus()
|
self.status = BulkSinkStatus()
|
||||||
self.table_join_dict = {}
|
self.table_join_dict = {}
|
||||||
|
self.table_usage_map = {}
|
||||||
self.today = datetime.today().strftime("%Y-%m-%d")
|
self.today = datetime.today().strftime("%Y-%m-%d")
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
@@ -64,13 +71,12 @@ class MetadataUsageBulkSink(BulkSink):
         config = MetadataUsageSinkConfig.parse_obj(config_dict)
         return cls(config, metadata_config)
 
-    def handle_work_unit_start(self, wu):
-        pass
-
-    def handle_work_unit_end(self, wu):
-        pass
-
-    def ingest_sql_queries_lineage(self, queries, database):
+    def ingest_sql_queries_lineage(
+        self, queries: List[SqlQuery], database: str
+    ) -> None:
+        """
+        Method to ingest lineage by sql queries
+        """
         for query in queries:
             ingest_lineage_by_query(
                 self.metadata,
|
|||||||
database=database,
|
database=database,
|
||||||
)
|
)
|
||||||
|
|
||||||
def write_records(self) -> None:
|
def __populate_table_usage_map(
|
||||||
usage_records = [json.loads(l) for l in self.file_handler.readlines()]
|
self, table_entity: Table, table_usage: TableUsageCount
|
||||||
table_usage_map = {}
|
) -> None:
|
||||||
for record in usage_records:
|
"""
|
||||||
table_usage = TableUsageCount(**json.loads(record))
|
Method Either initialise the map data or
|
||||||
self.service_name = table_usage.service_name
|
update existing data with information from new queries on the same table
|
||||||
if "." in table_usage.table:
|
"""
|
||||||
(
|
if not self.table_usage_map.get(table_entity.id.__root__):
|
||||||
table_usage.database_schema,
|
self.table_usage_map[table_entity.id.__root__] = {
|
||||||
table_usage.table,
|
"table_entity": table_entity,
|
||||||
) = table_usage.table.split(".")[-2:]
|
"usage_count": table_usage.count,
|
||||||
table_entities = self.__get_table_entity(
|
"sql_queries": table_usage.sqlQueries,
|
||||||
table_usage.database, table_usage.database_schema, table_usage.table
|
"usage_date": table_usage.date,
|
||||||
)
|
"database": table_usage.database,
|
||||||
else:
|
"database_schema": table_usage.databaseSchema,
|
||||||
table_entities = self.metadata.es_search_from_service(
|
}
|
||||||
entity_type=Table,
|
else:
|
||||||
service_name=self.service_name,
|
self.table_usage_map[table_entity.id.__root__][
|
||||||
filters={
|
"usage_count"
|
||||||
"database": table_usage.database,
|
] += table_usage.count
|
||||||
"database_schema": None,
|
self.table_usage_map[table_entity.id.__root__]["sql_queries"].extend(
|
||||||
"name": table_usage.table,
|
table_usage.sqlQueries
|
||||||
},
|
)
|
||||||
)
|
|
||||||
for table_entity in table_entities or []:
|
|
||||||
if table_entity is not None:
|
|
||||||
if not table_usage_map.get(table_entity.id.__root__):
|
|
||||||
table_usage_map[table_entity.id.__root__] = {
|
|
||||||
"table_entity": table_entity,
|
|
||||||
"usage_count": table_usage.count,
|
|
||||||
"sql_queries": table_usage.sql_queries,
|
|
||||||
"usage_date": table_usage.date,
|
|
||||||
"database": table_usage.database,
|
|
||||||
}
|
|
||||||
else:
|
|
||||||
table_usage_map[table_entity.id.__root__][
|
|
||||||
"usage_count"
|
|
||||||
] += table_usage.count
|
|
||||||
table_usage_map[table_entity.id.__root__]["sql_queries"].extend(
|
|
||||||
table_usage.sql_queries
|
|
||||||
)
|
|
||||||
table_join_request = self.__get_table_joins(table_usage)
|
|
||||||
logger.debug("table join request {}".format(table_join_request))
|
|
||||||
try:
|
|
||||||
if (
|
|
||||||
table_join_request is not None
|
|
||||||
and len(table_join_request.columnJoins) > 0
|
|
||||||
):
|
|
||||||
self.metadata.publish_frequently_joined_with(
|
|
||||||
table_entity, table_join_request
|
|
||||||
)
|
|
||||||
except APIError as err:
|
|
||||||
self.status.failures.append(table_join_request)
|
|
||||||
logger.error(
|
|
||||||
"Failed to update query join for {}, {}".format(
|
|
||||||
table_usage.table, err
|
|
||||||
)
|
|
||||||
)
|
|
||||||
|
|
||||||
else:
|
def __publish_usage_records(self) -> None:
|
||||||
logger.warning(
|
"""
|
||||||
"Table does not exist, skipping usage publish {}, {}".format(
|
Method to publish SQL Queries, Table Usage & Lineage
|
||||||
table_usage.table, table_usage.database
|
"""
|
||||||
)
|
for _, value_dict in self.table_usage_map.items():
|
||||||
)
|
|
||||||
self.status.warnings.append(f"Table: {table_usage.table}")
|
|
||||||
|
|
||||||
for table_id, value_dict in table_usage_map.items():
|
|
||||||
self.metadata.ingest_table_queries_data(
|
self.metadata.ingest_table_queries_data(
|
||||||
table=value_dict["table_entity"],
|
table=value_dict["table_entity"],
|
||||||
table_queries=value_dict["sql_queries"],
|
table_queries=value_dict["sql_queries"],
|
||||||
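The new __populate_table_usage_map accumulates usage per table entity id before anything is published. A standalone sketch of the same accumulation pattern with plain dicts standing in for the pydantic models (names here are illustrative, not the project's API):

from typing import Dict, List


def accumulate_usage(
    usage_map: Dict[str, dict], entity_id: str, count: int, queries: List[str]
) -> None:
    # Initialise the entry on first sight, otherwise add counts and extend queries.
    if entity_id not in usage_map:
        usage_map[entity_id] = {"usage_count": count, "sql_queries": list(queries)}
    else:
        usage_map[entity_id]["usage_count"] += count
        usage_map[entity_id]["sql_queries"].extend(queries)


usage_map: Dict[str, dict] = {}
accumulate_usage(usage_map, "table-uuid-1", 2, ["SELECT 1"])
accumulate_usage(usage_map, "table-uuid-1", 3, ["SELECT 2"])
print(usage_map["table-uuid-1"]["usage_count"])  # 5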
@@ -154,8 +121,8 @@ class MetadataUsageBulkSink(BulkSink):
             self.ingest_sql_queries_lineage(
                 value_dict["sql_queries"], value_dict["database"]
             )
-            table_usage_request = TableUsageRequest(
-                date=value_dict["usage_date"], count=value_dict["usage_count"]
+            table_usage_request = UsageRequest(
+                date=self.today, count=value_dict["usage_count"]
             )
             try:
                 self.metadata.publish_table_usage(
@@ -179,47 +146,100 @@ class MetadataUsageBulkSink(BulkSink):
                 self.status.failures.append(
                     "Table: {}".format(value_dict["table_entity"].name.__root__)
                 )
 
+    def write_records(self) -> None:
+        for usage_record in self.file_handler.readlines():
+            record = json.loads(usage_record)
+            table_usage = TableUsageCount(**json.loads(record))
+            self.service_name = table_usage.serviceName
+            if "." in table_usage.table:
+                databaseSchema, table = fqn.split(table_usage.table)[-2:]
+                table_usage.table = table
+                if not table_usage.databaseSchema:
+                    table_usage.databaseSchema = databaseSchema
+            table_usage.database = get_formatted_entity_name(table_usage.database)
+            table_usage.databaseSchema = get_formatted_entity_name(
+                table_usage.databaseSchema
+            )
+            table_usage.table = get_formatted_entity_name(table_usage.table)
+            table_entities = search_table_entities(
+                self.metadata,
+                table_usage.serviceName,
+                table_usage.database,
+                table_usage.databaseSchema,
+                table_usage.table,
+            )
+            for table_entity in table_entities or []:
+                if table_entity is not None:
+                    self.__populate_table_usage_map(
+                        table_usage=table_usage, table_entity=table_entity
+                    )
+                    table_join_request = self.__get_table_joins(table_usage)
+                    logger.debug("table join request {}".format(table_join_request))
+                    try:
+                        if (
+                            table_join_request is not None
+                            and len(table_join_request.columnJoins) > 0
+                        ):
+                            self.metadata.publish_frequently_joined_with(
+                                table_entity, table_join_request
+                            )
+                    except APIError as err:
+                        self.status.failures.append(table_join_request)
+                        logger.error(
+                            "Failed to update query join for {}, {}".format(
+                                table_usage.table, err
+                            )
+                        )
+                else:
+                    logger.warning(
+                        "Table does not exist, skipping usage publish {}, {}".format(
+                            table_usage.table, table_usage.database
+                        )
+                    )
+                    self.status.warnings.append(f"Table: {table_usage.table}")
+        self.__publish_usage_records()
         try:
             self.metadata.compute_percentile(Table, self.today)
             self.metadata.compute_percentile(Database, self.today)
         except APIError:
             logger.error("Failed to publish compute.percentile")
 
-    def __get_table_joins(self, table_usage):
+    def __get_table_joins(self, table_usage: TableUsageCount) -> TableJoins:
         table_joins: TableJoins = TableJoins(
             columnJoins=[], directTableJoins=[], startDate=table_usage.date
         )
+        """
+        Method to get Table Joins
+        """
         column_joins_dict = {}
         for column_join in table_usage.joins:
             joined_with = {}
-            if column_join.table_column is None or len(column_join.joined_with) == 0:
+            if column_join.tableColumn is None or len(column_join.joinedWith) == 0:
                 continue
-            if column_join.table_column.column in column_joins_dict.keys():
-                joined_with = column_joins_dict[column_join.table_column.column]
+            if column_join.tableColumn.column in column_joins_dict.keys():
+                joined_with = column_joins_dict[column_join.tableColumn.column]
             else:
-                column_joins_dict[column_join.table_column.column] = {}
-                main_column_fqdn = self.__get_column_fqdn(
-                    table_usage.database,
-                    table_usage.database_schema,
-                    column_join.table_column,
-                )
-            for column in column_join.joined_with:
-                joined_column_fqdn = self.__get_column_fqdn(
-                    table_usage.database, table_usage.database_schema, column
+                column_joins_dict[column_join.tableColumn.column] = {}
+
+            for column in column_join.joinedWith:
+                joined_column_fqn = self.__get_column_fqn(
+                    table_usage.database, table_usage.databaseSchema, column
                 )
-                if str(joined_column_fqdn) in joined_with.keys():
-                    column_joined_with = joined_with[str(joined_column_fqdn)]
+                if str(joined_column_fqn) in joined_with.keys():
+                    column_joined_with = joined_with[str(joined_column_fqn)]
                     column_joined_with.joinCount += 1
-                    joined_with[str(joined_column_fqdn)] = column_joined_with
-                elif joined_column_fqdn is not None:
-                    joined_with[str(joined_column_fqdn)] = ColumnJoinedWith(
-                        fullyQualifiedName=str(joined_column_fqdn), joinCount=1
+                    joined_with[str(joined_column_fqn)] = column_joined_with
+                elif joined_column_fqn is not None:
+                    joined_with[str(joined_column_fqn)] = JoinedWith(
+                        fullyQualifiedName=str(joined_column_fqn), joinCount=1
                     )
                 else:
-                    logger.info(
-                        f"Skipping join columns for {column} {joined_column_fqdn}"
+                    logger.debug(
+                        f"Skipping join columns for {column} {joined_column_fqn}"
                     )
-            column_joins_dict[column_join.table_column.column] = joined_with
+            column_joins_dict[column_join.tableColumn.column] = joined_with
 
         for key, value in column_joins_dict.items():
             table_joins.columnJoins.append(
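The rewritten join handling above boils down to counting how often each pair of fully qualified columns appears together in parsed joins. A toy sketch of that counting idea with plain dicts (the real code wraps the counts in JoinedWith objects; the sample column names are invented):

from collections import defaultdict

# join_counts[source_column][joined_column_fqn] -> number of observed joins
join_counts = defaultdict(lambda: defaultdict(int))

observed_joins = [
    ("dim_address.address_id", "fact_order.address_id"),
    ("dim_address.address_id", "fact_order.address_id"),
    ("dim_address.shop_id", "dim_shop.shop_id"),
]
for source_column, joined_column_fqn in observed_joins:
    join_counts[source_column][joined_column_fqn] += 1

print(dict(join_counts["dim_address.address_id"]))  # {'fact_order.address_id': 2}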
@@ -227,44 +247,24 @@ class MetadataUsageBulkSink(BulkSink):
             )
         return table_joins
 
-    def __get_column_fqdn(
+    def __get_column_fqn(
         self, database: str, database_schema: str, table_column: TableColumn
-    ):
-        table_entities = self.__get_table_entity(
-            database, database_schema, table_column.table
+    ) -> Optional[str]:
+        """
+        Method to get column fqn
+        """
+        table_entities = search_table_entities(
+            self.metadata,
+            self.service_name,
+            database,
+            database_schema,
+            table_column.table,
         )
         if not table_entities:
             return None
-        for table_entity in table_entities:
-            for tbl_column in table_entity.columns:
-                if table_column.column.lower() == tbl_column.name.__root__.lower():
-                    return tbl_column.fullyQualifiedName.__root__
-
-    def __get_table_entity(
-        self, database_name: str, database_schema: str, table_name: str
-    ) -> Optional[List[Table]]:
-        table_fqn = fqn.build(
-            self.metadata,
-            entity_type=Table,
-            service_name=self.service_name,
-            database_name=database_name,
-            schema_name=database_schema,
-            table_name=table_name,
-        )
-        table_fqn = _get_formmated_table_name(table_fqn)
-        table_entity = self.metadata.get_by_name(Table, fqn=table_fqn)
-        if table_entity:
-            return [table_entity]
-        es_result = self.metadata.es_search_from_service(
-            entity_type=Table,
-            service_name=self.service_name,
-            filters={
-                "database": database_name,
-                "database_schema": database_schema,
-                "name": table_name,
-            },
-        )
-        return es_result
+        for table_entity in table_entities:
+            return get_column_fqn(table_entity, table_column.column)
 
     def get_status(self):
         return self.status
@@ -1,98 +0,0 @@
-#  Copyright 2021 Collate
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#  http://www.apache.org/licenses/LICENSE-2.0
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-
-from typing import Dict, List, Optional
-
-from pydantic import BaseModel
-
-from metadata.generated.schema.entity.data.table import ColumnJoins, SqlQuery
-from metadata.ingestion.models.json_serializable import JsonSerializable
-
-
-class TableQuery(JsonSerializable):
-    """ """
-
-    def __init__(
-        self,
-        query: str,
-        user_name: str,
-        starttime: str,
-        endtime: str,
-        analysis_date: str,
-        database: str,
-        aborted: bool,
-        sql: str,
-        service_name: str,
-    ) -> None:
-        """ """
-        self.query = query
-        self.user_name = user_name
-        self.starttime = starttime
-        self.endtime = endtime
-        self.analysis_date = analysis_date
-        self.database = database
-        self.aborted = aborted
-        self.sql = sql
-        self.service_name = service_name
-
-
-class TableColumn(BaseModel):
-    table: str
-    column: str
-
-
-class TableColumnJoin(BaseModel):
-    table_column: Optional[TableColumn] = None
-    joined_with: Optional[List[TableColumn]] = None
-
-
-TableColumnJoins = List[TableColumnJoin]
-
-
-class TableUsageCount(BaseModel):
-    table: str
-    date: str
-    database: str
-    database_schema: Optional[str]
-    sql_queries: List[SqlQuery]
-    count: int = 1
-    joins: TableColumnJoins
-    service_name: str
-
-
-class QueryParserData(BaseModel):
-    tables: List[str]
-    tables_aliases: Dict[str, str]
-    columns: Dict[str, List[object]]
-    date: str
-    database: str
-    sql: str
-    service_name: str
-
-    class Config:
-        arbitrary_types_allowed = True
-
-
-class TableUsageRequest(BaseModel):
-    date: str
-    count: int
-
-
-class ColumnJoinsList(BaseModel):
-    __root__: List[ColumnJoins]
-
-
-class ColumnJoinedWith(BaseModel):
-    fullyQualifiedName: str
-    joinCount: int
-
-
-TablesUsage = List[TableUsageCount]
@@ -27,7 +27,7 @@ from metadata.generated.schema.entity.data.table import (
     TableJoins,
     TableProfile,
 )
-from metadata.ingestion.models.table_queries import TableUsageRequest
+from metadata.generated.schema.type.usageRequest import UsageRequest
 from metadata.ingestion.ometa.client import REST
 from metadata.ingestion.ometa.utils import ometa_logger
 
@@ -115,7 +115,7 @@ class OMetaTableMixin:
         )
 
     def publish_table_usage(
-        self, table: Table, table_usage_request: TableUsageRequest
+        self, table: Table, table_usage_request: UsageRequest
     ) -> None:
         """
         POST usage details for a Table
@@ -76,7 +76,3 @@ def _(arg) -> str:
     Models with __root__
     """
     return str(arg.__root__)
-
-
-def _get_formmated_table_name(table_name):
-    return table_name.replace("[", "").replace("]", "")
@@ -22,19 +22,11 @@ from metadata.config.common import ConfigModel
 from metadata.generated.schema.entity.services.connections.metadata.openMetadataConnection import (
     OpenMetadataConnection,
 )
+from metadata.generated.schema.type.queryParserData import QueryParserData
+from metadata.generated.schema.type.tableQuery import TableQuery
 from metadata.ingestion.api.processor import Processor, ProcessorStatus
-from metadata.ingestion.models.table_queries import QueryParserData, TableQuery
 from metadata.utils.logger import ingestion_logger
 
-
-class QueryParserProcessorConfig(ConfigModel):
-    """
-    Query parser pydantic configuration model
-    """
-
-    filter: Optional[str] = None
-
-
 logger = ingestion_logger()
 
 
@@ -52,12 +44,12 @@ class QueryParserProcessor(Processor):
     status (ProcessorStatus):
     """
 
-    config: QueryParserProcessorConfig
+    config: ConfigModel
     status: ProcessorStatus
 
     def __init__(
         self,
-        config: QueryParserProcessorConfig,
+        config: ConfigModel,
         metadata_config: OpenMetadataConnection,
     ):
@@ -69,26 +61,30 @@ class QueryParserProcessor(Processor):
     def create(
         cls, config_dict: dict, metadata_config: OpenMetadataConnection, **kwargs
     ):
-        config = QueryParserProcessorConfig.parse_obj(config_dict)
+        config = ConfigModel.parse_obj(config_dict)
         return cls(config, metadata_config)
 
-    def process(self, record: TableQuery) -> QueryParserData:
+    def process(self, record: TableQuery) -> Optional[QueryParserData]:
+        query_parser_data = None
         try:
-            start_date = record.analysis_date
-            if isinstance(record.analysis_date, str):
+            if not record.query:
+                return
+            start_date = record.analysisDate.__root__
+            if isinstance(record.analysisDate, str):
                 start_date = datetime.datetime.strptime(
-                    str(record.analysis_date), "%Y-%m-%d %H:%M:%S"
+                    str(record.analysisDate), "%Y-%m-%d %H:%M:%S"
                 ).date()
-            parser = Parser(record.sql)
+            parser = Parser(record.query)
             columns_dict = {} if parser.columns_dict is None else parser.columns_dict
             query_parser_data = QueryParserData(
                 tables=parser.tables,
-                tables_aliases=parser.tables_aliases,
+                tableAliases=parser.tables_aliases,
                 columns=columns_dict,
                 database=record.database,
-                sql=record.sql,
+                databaseSchema=record.databaseSchema,
+                sql=record.query,
                 date=start_date.strftime("%Y-%m-%d"),
-                service_name=record.service_name,
+                serviceName=record.serviceName,
             )
         # pylint: disable=broad-except
         except Exception as err:
@@ -96,8 +92,6 @@ class QueryParserProcessor(Processor):
             logger.debug(record.sql)
             logger.debug(traceback.format_exc())
             logger.error(err)
-            query_parser_data = None
-
         return query_parser_data
 
     def close(self):
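The processor delegates the actual SQL parsing to sql-metadata's Parser (bumped to ~=2.5.0 in setup.py above), reading its tables, tables_aliases, and columns_dict attributes. A minimal sketch of those calls, assuming sql-metadata 2.x is installed; the query is made up:

from sql_metadata import Parser

parser = Parser(
    "SELECT a.address_id, s.shop_id "
    "FROM dim_address a JOIN dim_shop s ON a.shop_id = s.shop_id"
)
print(parser.tables)          # ['dim_address', 'dim_shop']
print(parser.tables_aliases)  # {'a': 'dim_address', 's': 'dim_shop'}
print(parser.columns_dict)    # columns grouped by clause, e.g. {'select': [...], 'join': [...]}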
@@ -15,7 +15,7 @@ import collections
 import logging as log
 import os
 from datetime import datetime
-from typing import Iterable
+from typing import Any, Dict, Iterable, Optional
 
 from google.cloud import logging
 
@@ -31,34 +31,29 @@ from metadata.generated.schema.entity.services.databaseService import (
 from metadata.generated.schema.metadataIngestion.workflow import (
     Source as WorkflowSource,
 )
-from metadata.ingestion.api.source import InvalidSourceException, Source, SourceStatus
-from metadata.ingestion.models.table_queries import TableQuery
-from metadata.ingestion.source.database.common_db_source import SQLSourceStatus
+from metadata.generated.schema.type.tableQuery import TableQuery
+from metadata.ingestion.api.source import InvalidSourceException
+from metadata.ingestion.source.database.usage_source import UsageSource
 from metadata.utils.credentials import set_google_credentials
-from metadata.utils.helpers import get_start_and_end
 
 logger = log.getLogger(__name__)
 
 
-class BigqueryUsageSource(Source[TableQuery]):
+class BigqueryUsageSource(UsageSource):
     SERVICE_TYPE = DatabaseServiceType.BigQuery.value
     scheme = "bigquery"
 
     def __init__(self, config: WorkflowSource, metadata_config: OpenMetadataConnection):
-        super().__init__()
+        super().__init__(config, metadata_config)
         self.temp_credentials = None
-        self.metadata_config = metadata_config
-        self.config = config
-        self.service_connection = config.serviceConnection.__root__.config
 
         # Used as db
         self.project_id = (
-            self.service_connection.projectId
-            or self.service_connection.credentials.gcsConfig.projectId
+            self.connection.projectId or self.connection.credentials.gcsConfig.projectId
        )
 
         self.logger_name = "cloudaudit.googleapis.com%2Fdata_access"
-        self.status = SQLSourceStatus()
+        self.logging_client = logging.Client()
+        self.usage_logger = self.logging_client.logger(self.logger_name)
+        logger.debug("Listing entries for logger {}:".format(self.usage_logger.name))
 
     @classmethod
     def create(cls, config_dict, metadata_config: OpenMetadataConnection):
@@ -75,69 +70,47 @@ class BigqueryUsageSource(Source[TableQuery]):
 
         return cls(config, metadata_config)
 
-    def prepare(self):
-        pass
-
-    def next_record(self) -> Iterable[TableQuery]:
-        logging_client = logging.Client()
-        usage_logger = logging_client.logger(self.logger_name)
-        logger.debug("Listing entries for logger {}:".format(usage_logger.name))
-        start, end = get_start_and_end(self.config.sourceConfig.config.queryLogDuration)
-        try:
-            entries = usage_logger.list_entries()
-            for entry in entries:
-                timestamp = entry.timestamp.isoformat()
-                timestamp = datetime.strptime(timestamp[0:10], "%Y-%m-%d")
-                if timestamp >= start and timestamp <= end:
-                    if ("query" in str(entry.payload)) and type(
-                        entry.payload
-                    ) == collections.OrderedDict:
-                        payload = list(entry.payload.items())[-1][1]
-                        if "jobChange" in payload:
-                            logger.debug(f"\nEntries: {payload}")
-                            if (
-                                "queryConfig"
-                                in payload["jobChange"]["job"]["jobConfig"]
-                            ):
-                                queryConfig = payload["jobChange"]["job"]["jobConfig"][
-                                    "queryConfig"
-                                ]
-                            else:
-                                continue
-                            jobStats = payload["jobChange"]["job"]["jobStats"]
-                            statementType = ""
-                            if hasattr(queryConfig, "statementType"):
-                                statementType = queryConfig["statementType"]
-                            database = self.project_id
-                            analysis_date = str(
-                                datetime.strptime(
-                                    jobStats["startTime"][0:19], "%Y-%m-%dT%H:%M:%S"
-                                ).strftime("%Y-%m-%d %H:%M:%S")
-                            )
-                            logger.debug(
-                                f"Query :{statementType}:{queryConfig['query']}"
-                            )
-                            tq = TableQuery(
-                                query=statementType,
-                                user_name=entry.resource.labels["project_id"],
-                                starttime=str(jobStats["startTime"]),
-                                endtime=str(jobStats["endTime"]),
-                                analysis_date=analysis_date,
-                                aborted=0,
-                                database=str(database),
-                                sql=queryConfig["query"],
-                                service_name=self.config.serviceName,
-                            )
-                            yield tq
-
-        except Exception as err:
-            logger.error(repr(err))
-
-    def get_status(self) -> SourceStatus:
-        return self.status
-
-    def test_connection(self) -> SourceStatus:
-        pass
+    def _get_raw_extract_iter(self) -> Optional[Iterable[Dict[str, Any]]]:
+        entries = self.usage_logger.list_entries()
+        for entry in entries:
+            timestamp = entry.timestamp.isoformat()
+            timestamp = datetime.strptime(timestamp[0:10], "%Y-%m-%d")
+            if timestamp >= self.start and timestamp <= self.end:
+                if ("query" in str(entry.payload)) and type(
+                    entry.payload
+                ) == collections.OrderedDict:
+                    payload = list(entry.payload.items())[-1][1]
+                    if "jobChange" in payload:
+                        logger.debug(f"\nEntries: {payload}")
+                        if "queryConfig" in payload["jobChange"]["job"]["jobConfig"]:
+                            queryConfig = payload["jobChange"]["job"]["jobConfig"][
+                                "queryConfig"
+                            ]
+                        else:
+                            continue
+                        jobStats = payload["jobChange"]["job"]["jobStats"]
+                        statementType = ""
+                        if hasattr(queryConfig, "statementType"):
+                            statementType = queryConfig["statementType"]
+                        database = self.project_id
+                        analysis_date = str(
+                            datetime.strptime(
+                                jobStats["startTime"][0:19], "%Y-%m-%dT%H:%M:%S"
+                            ).strftime("%Y-%m-%d %H:%M:%S")
+                        )
+                        logger.debug(f"Query :{statementType}:{queryConfig['query']}")
+                        tq = TableQuery(
+                            query=queryConfig["query"],
+                            userName=entry.resource.labels["project_id"],
+                            startTime=str(jobStats["startTime"]),
+                            endTime=str(jobStats["endTime"]),
+                            analysisDate=analysis_date,
+                            aborted=0,
+                            database=str(database),
+                            serviceName=self.config.serviceName,
+                            databaseSchema=None,
+                        )
+                        yield tq
 
     def close(self):
         super().close()
@@ -13,7 +13,6 @@ Clickhouse usage module
 """
 
 import ast
-from typing import Iterable
 
 from metadata.generated.schema.entity.services.connections.database.clickhouseConnection import (
     ClickhouseConnection,
@@ -22,29 +21,17 @@ from metadata.generated.schema.metadataIngestion.workflow import (
     Source as WorkflowSource,
 )
 from metadata.generated.schema.metadataIngestion.workflow import WorkflowConfig
-from metadata.ingestion.api.source import InvalidSourceException, Source, SourceStatus
-
-# This import verifies that the dependencies are available.
-from metadata.ingestion.models.table_queries import TableQuery
-from metadata.ingestion.source.database.common_db_source import SQLSourceStatus
+from metadata.ingestion.api.source import InvalidSourceException
+
 from metadata.ingestion.source.database.usage_source import UsageSource
-from metadata.utils.connections import get_connection, test_connection
-from metadata.utils.helpers import get_start_and_end
 from metadata.utils.sql_queries import CLICKHOUSE_SQL_USAGE_STATEMENT
 
 
 class ClickhouseUsageSource(UsageSource):
     def __init__(self, config: WorkflowSource, metadata_config: WorkflowConfig):
         super().__init__(config, metadata_config)
-        self.config = config
-        self.connection = config.serviceConnection.__root__.config
-        start, end = get_start_and_end(self.config.sourceConfig.config.queryLogDuration)
-        self.analysis_date = start
         self.sql_stmt = CLICKHOUSE_SQL_USAGE_STATEMENT.format(
-            start_time=start, end_time=end
+            start_time=self.start, end_time=self.end
         )
-        self.report = SQLSourceStatus()
-        self.engine = get_connection(self.connection)
 
     @classmethod
     def create(cls, config_dict, metadata_config: WorkflowConfig):
@@ -57,27 +44,12 @@ class ClickhouseUsageSource(UsageSource):
 
         return cls(config, metadata_config)
 
-    def next_record(self) -> Iterable[TableQuery]:
+    def get_database_name(self, data: dict) -> str:
         """
-        Using itertools.groupby and raw level iterator,
-        it groups to table and yields TableMetadata
-        :return:
+        Method to fetch database name from row data
         """
-        for row in self._get_raw_extract_iter():
-            database = "default"
-            if row["database_name"]:
-                database_list = ast.literal_eval(row["database_name"])
-                database = database_list[0] if len(database_list) == 1 else "default"
-
-            table_query = TableQuery(
-                query=row["query_id"],
-                user_name=row["user_name"],
-                starttime=str(row["start_time"]),
-                endtime=str(row["end_time"]),
-                analysis_date=self.analysis_date,
-                aborted=row["aborted"],
-                database=database,
-                sql=row["query_text"],
-                service_name=self.config.serviceName,
-            )
-            yield table_query
+        database = "default"
+        if data["database_name"]:
+            database_list = ast.literal_eval(data["database_name"])
+            database = database_list[0] if len(database_list) == 1 else "default"
+        return database
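get_database_name above handles ClickHouse query-log rows where the databases column arrives as a stringified list. A quick sketch of the ast.literal_eval behaviour it relies on (sample values are illustrative):

import ast

raw = "['ecommerce_db']"  # value as it might appear in a query-log row
database_list = ast.literal_eval(raw)
database = database_list[0] if len(database_list) == 1 else "default"
print(database)  # ecommerce_db

# A query touching several databases falls back to "default".
print(ast.literal_eval("['db1', 'db2']"))  # ['db1', 'db2']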
@@ -23,18 +23,15 @@ from metadata.generated.schema.metadataIngestion.workflow import (
 from metadata.generated.schema.metadataIngestion.workflow import WorkflowConfig
 from metadata.ingestion.api.source import InvalidSourceException
 from metadata.ingestion.source.database.usage_source import UsageSource
-
-# This import verifies that the dependencies are available.
-from metadata.utils.helpers import get_start_and_end
 from metadata.utils.sql_queries import MSSQL_SQL_USAGE_STATEMENT
 
 
 class MssqlUsageSource(UsageSource):
     def __init__(self, config: WorkflowSource, metadata_config: OpenMetadataConnection):
         super().__init__(config, metadata_config)
-        start, end = get_start_and_end(config.sourceConfig.config.queryLogDuration)
-        self.analysis_date = start
-        self.sql_stmt = MSSQL_SQL_USAGE_STATEMENT.format(start_date=start, end_date=end)
+        self.sql_stmt = MSSQL_SQL_USAGE_STATEMENT.format(
+            start_date=self.start, end_date=self.end
+        )
 
     @classmethod
     def create(cls, config_dict, metadata_config: WorkflowConfig):
@@ -27,7 +27,6 @@ from metadata.generated.schema.metadataIngestion.workflow import (
 from metadata.generated.schema.metadataIngestion.workflow import WorkflowConfig
 from metadata.ingestion.api.source import InvalidSourceException
 from metadata.ingestion.source.database.usage_source import UsageSource
-from metadata.utils.helpers import get_start_and_end
 
 # pylint: disable=useless-super-delegation
 from metadata.utils.logger import ingestion_logger
@@ -37,24 +36,14 @@ logger = ingestion_logger()
 
 
 class RedshiftUsageSource(UsageSource):
-    # SELECT statement from mysql information_schema to extract table and column metadata
     SQL_STATEMENT = REDSHIFT_SQL_STATEMENT
-    # CONFIG KEYS
-    WHERE_CLAUSE_SUFFIX_KEY = "where_clause"
-    CLUSTER_SOURCE = "cluster_source"
-    CLUSTER_KEY = "cluster_key"
-    USE_CATALOG_AS_CLUSTER_NAME = "use_catalog_as_cluster_name"
-    DATABASE_KEY = "database_key"
-    SERVICE_TYPE = DatabaseServiceType.Redshift.value
-    DEFAULT_CLUSTER_SOURCE = "CURRENT_DATABASE()"
 
     def __init__(self, config: WorkflowSource, metadata_config: WorkflowConfig):
         super().__init__(config, metadata_config)
-        start, end = get_start_and_end(self.config.sourceConfig.config.queryLogDuration)
         self.sql_stmt = RedshiftUsageSource.SQL_STATEMENT.format(
-            start_time=start, end_time=end
+            start_time=self.start, end_time=self.end
         )
-        self.analysis_date = start
         self._extract_iter: Union[None, Iterator] = None
         self._database = "redshift"
 
@@ -12,7 +12,7 @@
 import csv
 import json
 from datetime import datetime
-from typing import Iterable
+from typing import Dict, Iterable, Optional
 
 from metadata.generated.schema.entity.services.connections.database.sampleDataConnection import (
     SampleDataConnection,
@@ -27,23 +27,27 @@ from metadata.generated.schema.entity.services.databaseService import (
 from metadata.generated.schema.metadataIngestion.workflow import (
     Source as WorkflowSource,
 )
-from metadata.ingestion.api.source import InvalidSourceException, Source
-from metadata.ingestion.models.table_queries import TableQuery
+from metadata.generated.schema.type.tableQuery import TableQuery
+from metadata.ingestion.api.source import InvalidSourceException
 from metadata.ingestion.ometa.ometa_api import OpenMetadata
+from metadata.ingestion.source.database.common_db_source import SQLSourceStatus
 from metadata.ingestion.source.database.sample_data import SampleDataSourceStatus
+from metadata.ingestion.source.database.usage_source import UsageSource
 
 
-class SampleUsageSource(Source[TableQuery]):
+class SampleUsageSource(UsageSource):
 
     service_type = DatabaseServiceType.BigQuery.value
 
     def __init__(self, config: WorkflowSource, metadata_config: OpenMetadataConnection):
-        super().__init__()
         self.status = SampleDataSourceStatus()
         self.config = config
         self.service_connection = config.serviceConnection.__root__.config
+        self.source_config = config.sourceConfig.config
         self.metadata_config = metadata_config
+        self.report = SQLSourceStatus()
         self.metadata = OpenMetadata(metadata_config)
+        self.analysis_date = datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S")
 
         self.service_json = json.load(
             open(
@@ -70,29 +74,16 @@ class SampleUsageSource(Source[TableQuery]):
         )
         return cls(config, metadata_config)
 
-    def prepare(self):
-        pass
-
-    def next_record(self) -> Iterable[TableQuery]:
+    def _get_raw_extract_iter(self) -> Optional[Iterable[Dict[str, str]]]:
         for row in self.query_logs:
-            tq = TableQuery(
+            yield TableQuery(
                 query=row["query"],
-                user_name="",
-                starttime="",
-                endtime="",
-                analysis_date=datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S"),
-                database="ecommerce_db",
+                userName="",
+                startTime="",
+                endTime="",
+                analysisDate=self.analysis_date,
                 aborted=False,
-                sql=row["query"],
-                service_name=self.config.serviceName,
+                database="ecommerce_db",
+                serviceName=self.config.serviceName,
+                databaseSchema="shopify",
             )
-            yield tq
-
-    def close(self):
-        pass
-
-    def get_status(self):
-        return self.status
-
-    def test_connection(self) -> None:
-        pass
|||||||
@ -12,11 +12,8 @@
|
|||||||
Snowflake usage module
|
Snowflake usage module
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import traceback
|
|
||||||
from datetime import timedelta
|
from datetime import timedelta
|
||||||
from typing import Any, Dict, Iterable, Iterator, Union
|
from typing import Iterable, Iterator, Union
|
||||||
|
|
||||||
from sqlalchemy import inspect
|
|
||||||
|
|
||||||
from metadata.generated.schema.entity.services.connections.database.snowflakeConnection import (
|
from metadata.generated.schema.entity.services.connections.database.snowflakeConnection import (
|
||||||
SnowflakeConnection,
|
SnowflakeConnection,
|
||||||
@@ -31,17 +28,17 @@ from metadata.generated.schema.metadataIngestion.workflow import (
     Source as WorkflowSource,
 )
 from metadata.generated.schema.metadataIngestion.workflow import WorkflowConfig
+from metadata.generated.schema.type.tableQuery import TableQuery
 from metadata.ingestion.api.source import InvalidSourceException
 
 # This import verifies that the dependencies are available.
-from metadata.ingestion.models.table_queries import TableQuery
 from metadata.ingestion.source.database.usage_source import UsageSource
 from metadata.utils.connections import get_connection
-from metadata.utils.helpers import get_start_and_end
 from metadata.utils.logger import ingestion_logger
 from metadata.utils.sql_queries import SNOWFLAKE_SQL_STATEMENT
 
 logger = ingestion_logger()
+
+SNOWFLAKE_ABORTED_CODE = "1969"
 
 
 class SnowflakeUsageSource(UsageSource):
@ -60,12 +57,10 @@ class SnowflakeUsageSource(UsageSource):
|
|||||||
|
|
||||||
def __init__(self, config: WorkflowSource, metadata_config: OpenMetadataConnection):
|
def __init__(self, config: WorkflowSource, metadata_config: OpenMetadataConnection):
|
||||||
super().__init__(config, metadata_config)
|
super().__init__(config, metadata_config)
|
||||||
start, end = get_start_and_end(self.config.sourceConfig.config.queryLogDuration)
|
self.end = self.end + timedelta(days=1)
|
||||||
end = end + timedelta(days=1)
|
|
||||||
self.analysis_date = start
|
|
||||||
self.sql_stmt = SnowflakeUsageSource.SQL_STATEMENT.format(
|
self.sql_stmt = SnowflakeUsageSource.SQL_STATEMENT.format(
|
||||||
start_date=start,
|
start_date=self.start,
|
||||||
end_date=end,
|
end_date=self.end,
|
||||||
result_limit=self.config.sourceConfig.config.resultLimit,
|
result_limit=self.config.sourceConfig.config.resultLimit,
|
||||||
)
|
)
|
||||||
self._extract_iter: Union[None, Iterator] = None
|
self._extract_iter: Union[None, Iterator] = None
|
||||||
@ -81,9 +76,9 @@ class SnowflakeUsageSource(UsageSource):
|
|||||||
)
|
)
|
||||||
return cls(config, metadata_config)
|
return cls(config, metadata_config)
|
||||||
|
|
||||||
def _get_raw_extract_iter(self) -> Iterable[Dict[str, Any]]:
|
def _get_raw_extract_iter(self) -> Iterable[TableQuery]:
|
||||||
if self.config.serviceConnection.__root__.config.database:
|
if self.config.serviceConnection.__root__.config.database:
|
||||||
yield from super(SnowflakeUsageSource, self)._get_raw_extract_iter()
|
yield from super()._get_raw_extract_iter()
|
||||||
else:
|
else:
|
||||||
query = "SHOW DATABASES"
|
query = "SHOW DATABASES"
|
||||||
results = self.engine.execute(query)
|
results = self.engine.execute(query)
|
||||||
@ -96,35 +91,28 @@ class SnowflakeUsageSource(UsageSource):
|
|||||||
self.engine = get_connection(self.connection)
|
self.engine = get_connection(self.connection)
|
||||||
rows = self.engine.execute(self.sql_stmt)
|
rows = self.engine.execute(self.sql_stmt)
|
||||||
for row in rows:
|
for row in rows:
|
||||||
yield row
|
yield TableQuery(
|
||||||
|
query=row["query_text"],
|
||||||
|
userName=row["user_name"],
|
||||||
|
startTime=str(row["start_time"]),
|
||||||
|
endTime=str(row["end_time"]),
|
||||||
|
analysisDate=self.analysis_date,
|
||||||
|
aborted=self.get_aborted_status(row),
|
||||||
|
database=self.get_database_name(row),
|
||||||
|
serviceName=self.config.serviceName,
|
||||||
|
databaseSchema=row["schema_name"],
|
||||||
|
)
|
||||||
|
|
||||||
def next_record(self) -> Iterable[TableQuery]:
|
def get_database_name(self, data: dict) -> str:
|
||||||
"""
|
"""
|
||||||
Using itertools.groupby and raw level iterator,
|
Method to get database name
|
||||||
it groups to table and yields TableMetadata
|
|
||||||
:return:
|
|
||||||
"""
|
"""
|
||||||
for row in self._get_raw_extract_iter():
|
if not data["database_name"] and self.connection.database:
|
||||||
try:
|
return self.connection.database
|
||||||
table_query = TableQuery(
|
return data["database_name"]
|
||||||
query=row["query_type"],
|
|
||||||
user_name=row["user_name"],
|
def get_aborted_status(self, data: dict) -> bool:
|
||||||
starttime=str(row["start_time"]),
|
"""
|
||||||
endtime=str(row["end_time"]),
|
Method to get aborted status of query
|
||||||
analysis_date=self.analysis_date,
|
"""
|
||||||
aborted="1969" in str(row["end_time"]),
|
return SNOWFLAKE_ABORTED_CODE in str(data["end_time"])
|
||||||
database=row["database_name"],
|
|
||||||
sql=row["query_text"],
|
|
||||||
service_name=self.config.serviceName,
|
|
||||||
)
|
|
||||||
if not row["database_name"] and self.connection.database:
|
|
||||||
TableQuery.database = self.connection.database
|
|
||||||
logger.debug(f"Parsed Query: {row['query_text']}")
|
|
||||||
if row["schema_name"] is not None:
|
|
||||||
self.report.scanned(f"{row['database_name']}.{row['schema_name']}")
|
|
||||||
else:
|
|
||||||
self.report.scanned(f"{row['database_name']}")
|
|
||||||
yield table_query
|
|
||||||
except Exception as err:
|
|
||||||
logger.debug(traceback.format_exc())
|
|
||||||
logger.debug(repr(err))
|
|
||||||
|
|||||||
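A small, hedged illustration of the new aborted check (not part of the commit; the helper name is invented, the constant value comes from the hunk above): the source now flags a row as aborted when the "1969" marker shows up in its end_time column.

# Minimal sketch of the aborted-status check introduced above.
SNOWFLAKE_ABORTED_CODE = "1969"


def is_aborted(row: dict) -> bool:
    # Mirrors SnowflakeUsageSource.get_aborted_status from the diff above
    return SNOWFLAKE_ABORTED_CODE in str(row["end_time"])


print(is_aborted({"end_time": "1969-12-31 23:59:59"}))  # True
print(is_aborted({"end_time": "2022-05-10 12:00:00"}))  # False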
@@ -12,7 +12,8 @@
 Usage Souce Module
 """
 import csv
-from typing import Any, Dict, Iterable
+import traceback
+from typing import Iterable, Optional

 from metadata.generated.schema.entity.services.connections.metadata.openMetadataConnection import (
     OpenMetadataConnection,
@@ -20,13 +21,17 @@ from metadata.generated.schema.entity.services.connections.metadata.openMetadata
 from metadata.generated.schema.metadataIngestion.workflow import (
     Source as WorkflowSource,
 )
-from metadata.ingestion.api.source import InvalidSourceException, Source, SourceStatus

 # This import verifies that the dependencies are available.
-from metadata.ingestion.models.table_queries import TableQuery
+from metadata.generated.schema.type.tableQuery import TableQuery
+from metadata.ingestion.api.source import Source, SourceStatus
 from metadata.ingestion.source.database.common_db_source import SQLSourceStatus
 from metadata.utils.connections import get_connection, test_connection
+from metadata.utils.filters import filter_by_database, filter_by_schema
 from metadata.utils.helpers import get_start_and_end
+from metadata.utils.logger import ingestion_logger

+logger = ingestion_logger()


 class UsageSource(Source[TableQuery]):
@@ -35,39 +40,63 @@ class UsageSource(Source[TableQuery]):
         self.config = config
         self.metadata_config = metadata_config
         self.connection = config.serviceConnection.__root__.config
-        start, end = get_start_and_end(self.config.sourceConfig.config.queryLogDuration)
-        self.analysis_date = start
+        self.source_config = self.config.sourceConfig.config
+        self.start, self.end = get_start_and_end(self.source_config.queryLogDuration)
+        self.analysis_date = self.start
         self.report = SQLSourceStatus()
         self.engine = get_connection(self.connection)

     def prepare(self):
         return super().prepare()

-    def _get_raw_extract_iter(self) -> Iterable[Dict[str, Any]]:
+    def get_database_name(self, data: dict) -> str:
+        """
+        Method to get database name
+        """
+        return data.get("database_name")
+
+    def get_aborted_status(self, data: dict) -> bool:
+        """
+        Method to get aborted status of query
+        """
+        return data.get("aborted", False)
+
+    def _get_raw_extract_iter(self) -> Optional[Iterable[TableQuery]]:
+        """
+        If queryLogFilePath available in config iterate through log file
+        otherwise execute the sql query to fetch TableQuery data
+        """
         if self.config.sourceConfig.config.queryLogFilePath:
             with open(self.config.sourceConfig.config.queryLogFilePath, "r") as fin:
                 for i in csv.DictReader(fin):
                     query_dict = dict(i)
-                    row = {
-                        "query_type": query_dict.get("query"),
-                        "user_name": query_dict.get("user_name", ""),
-                        "start_time": query_dict.get("start_time", ""),
-                        "end_time": query_dict.get("end_time", ""),
-                        "aborted": query_dict.get("aborted", False),
-                        "database_name": query_dict.get(
-                            "database_name",
-                            self.connection.database
-                            if self.connection.database
-                            else "default",
-                        ),
-                        "query_text": query_dict.get("query"),
-                        "schema_name": query_dict.get("schema_name"),
-                    }
-                    yield row
+                    yield TableQuery(
+                        query=query_dict["query_text"],
+                        userName=query_dict.get("user_name", ""),
+                        startTime=query_dict.get("start_time", ""),
+                        endTime=query_dict.get("end_time", ""),
+                        analysisDate=self.analysis_date,
+                        aborted=self.get_aborted_status(query_dict),
+                        database=self.get_database_name(query_dict),
+                        serviceName=self.config.serviceName,
+                        databaseSchema=query_dict.get("schema_name"),
+                    )
         else:
             rows = self.engine.execute(self.sql_stmt)
             for row in rows:
-                yield row
+                row = dict(row)
+                print(row)
+                yield TableQuery(
+                    query=row["query_text"],
+                    userName=row["user_name"],
+                    startTime=str(row["start_time"]),
+                    endTime=str(row["end_time"]),
+                    analysisDate=self.analysis_date,
+                    aborted=self.get_aborted_status(row),
+                    database=self.get_database_name(row),
+                    serviceName=self.config.serviceName,
+                    databaseSchema=row["schema_name"],
+                )

     def next_record(self) -> Iterable[TableQuery]:
         """
@@ -75,24 +104,32 @@ class UsageSource(Source[TableQuery]):
         it groups to table and yields TableMetadata
         :return:
         """
+        for table_query in self._get_raw_extract_iter():
+            if table_query:
+                if filter_by_database(
+                    self.source_config.databaseFilterPattern,
+                    database_name=table_query.database,
+                ):
+                    continue
+                if filter_by_schema(
+                    self.source_config.schemaFilterPattern,
+                    schema_name=table_query.databaseSchema,
+                ):
+                    continue

-        for row in self._get_raw_extract_iter():
-            table_query = TableQuery(
-                query=row["query_type"],
-                user_name=row["user_name"],
-                starttime=str(row["start_time"]),
-                endtime=str(row["end_time"]),
-                analysis_date=self.analysis_date,
-                aborted=row["aborted"],
-                database=row["database_name"],
-                sql=row["query_text"],
-                service_name=self.config.serviceName,
-            )
-            if not row["schema_name"]:
-                self.report.scanned(f"{row['database_name']}.{row['schema_name']}")
-            else:
-                self.report.scanned(f"{row['database_name']}")
-            yield table_query
+            try:
+                yield table_query
+                logger.debug(f"Parsed Query: {table_query.query}")
+                if not table_query.databaseSchema:
+                    self.report.scanned(
+                        f"{table_query.database}.{table_query.databaseSchema}"
+                    )
+                else:
+                    self.report.scanned(f"{table_query.database}")
+                yield table_query
+            except Exception as err:
+                logger.debug(traceback.format_exc())
+                logger.error(str(err))

     def get_report(self):
         """
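For reference, a hedged sketch of the query-log CSV shape that the reworked _get_raw_extract_iter() can consume through csv.DictReader; the column names follow the keys read in the hunk above, while the sample values are invented.

# Illustrative only: a tiny in-memory query log with the columns the
# usage source reads (query_text, user_name, start_time, end_time,
# aborted, database_name, schema_name).
import csv
import io

SAMPLE_QUERY_LOG = """query_text,user_name,start_time,end_time,aborted,database_name,schema_name
SELECT * FROM dim_address,admin,2022-05-01 10:00:00,2022-05-01 10:00:02,False,ecommerce_db,shopify
"""

for entry in csv.DictReader(io.StringIO(SAMPLE_QUERY_LOG)):
    print(entry["query_text"], entry.get("database_name"), entry.get("schema_name"))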
@@ -16,13 +16,13 @@ from metadata.generated.schema.entity.data.table import SqlQuery
 from metadata.generated.schema.entity.services.connections.metadata.openMetadataConnection import (
     OpenMetadataConnection,
 )
-from metadata.ingestion.api.stage import Stage, StageStatus
-from metadata.ingestion.models.table_queries import (
-    QueryParserData,
+from metadata.generated.schema.type.queryParserData import QueryParserData
+from metadata.generated.schema.type.tableUsageCount import (
     TableColumn,
     TableColumnJoin,
     TableUsageCount,
 )
+from metadata.ingestion.api.stage import Stage, StageStatus
 from metadata.ingestion.stage.file import FileStageConfig
 from metadata.utils.logger import ingestion_logger

@@ -58,7 +58,7 @@ def get_table_column_join(table, table_aliases, joins, database):
         )
     except ValueError as err:
         logger.error("Error in parsing sql query joins {}".format(err))
-    return TableColumnJoin(table_column=table_column, joined_with=joined_with)
+    return TableColumnJoin(tableColumn=table_column, joinedWith=joined_with)


 class TableUsageStage(Stage[QueryParserData]):
@@ -104,7 +104,7 @@ class TableUsageStage(Stage[QueryParserData]):
                     table_usage_count.joins.append(
                         get_table_column_join(
                             table,
-                            record.tables_aliases,
+                            record.tableAliases,
                             record.columns["join"],
                             record.database,
                         )
@@ -114,7 +114,7 @@ class TableUsageStage(Stage[QueryParserData]):
                 if record.columns.get("join") is not None:
                     tbl_column_join = get_table_column_join(
                         table,
-                        record.tables_aliases,
+                        record.tableAliases,
                         record.columns["join"],
                         record.database,
                     )
@@ -126,8 +126,9 @@ class TableUsageStage(Stage[QueryParserData]):
                         database=record.database,
                         date=record.date,
                         joins=joins,
-                        service_name=record.service_name,
-                        sql_queries=[],
+                        serviceName=record.serviceName,
+                        sqlQueries=[],
+                        databaseSchema=record.databaseSchema,
                     )

         except Exception as exc:
@@ -140,7 +141,7 @@ class TableUsageStage(Stage[QueryParserData]):

     def close(self):
         for key, value in self.table_usage.items():
-            value.sql_queries = self.table_queries.get(key, [])
+            value.sqlQueries = self.table_queries.get(key, [])
             data = value.json()
             self.file.write(json.dumps(data))
             self.file.write("\n")
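The close() change above keeps the stage file format as it was: one JSON document per line, each produced by passing the pydantic value through value.json() and json.dumps. A hedged sketch of that write loop, with an invented helper name:

# Illustrative only: write usage records the same way the stage's close()
# does above, one JSON document per line.
import json


def write_usage_stage_file(records, path: str) -> None:
    with open(path, "w") as stage_file:
        for record in records:
            stage_file.write(json.dumps(record.json()))
            stage_file.write("\n")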
@@ -51,7 +51,9 @@ def get_query_from_dict(data: Dict[str, Optional[str]]) -> str:
     )


-def get_entity_from_es_result(entity_list: Optional[List[T]]) -> Optional[T]:
+def get_entity_from_es_result(
+    entity_list: Optional[List[T]], fetch_multiple_entities: bool = False
+) -> Optional[T]:
     """
     Return a single element from an entity list obtained
     from an ES query
@@ -59,6 +61,8 @@ def get_entity_from_es_result(entity_list: Optional[List[T]]) -> Optional[T]:
     :return: single entity
     """
     if entity_list and len(entity_list):
+        if fetch_multiple_entities:
+            return entity_list
         return entity_list[0]

     logger.warning("ES Query was empty")
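A runnable sketch of the new fetch_multiple_entities switch (illustrative, using a generic stand-in for the helper): with the flag set the whole ES hit list is returned, otherwise only the first element, and an empty result still yields None.

# Illustrative stand-in for get_entity_from_es_result with the new flag.
from typing import List, Optional, TypeVar, Union

T = TypeVar("T")


def from_es_result(
    entity_list: Optional[List[T]], fetch_multiple_entities: bool = False
) -> Optional[Union[T, List[T]]]:
    if entity_list and len(entity_list):
        if fetch_multiple_entities:
            return entity_list
        return entity_list[0]
    return None


print(from_es_result(["table_a", "table_b"]))  # table_a
print(from_es_result(["table_a", "table_b"], fetch_multiple_entities=True))  # ['table_a', 'table_b']
print(from_es_result([]))  # None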
@@ -14,9 +14,12 @@ Filter information has been taken from the
 ES indexes definitions
 """
 import re
-from typing import List, Optional, Type, TypeVar
+from typing import List, Optional, Type, TypeVar, Union

-from antlr4 import *
+from antlr4.CommonTokenStream import CommonTokenStream
+from antlr4.error.ErrorStrategy import BailErrorStrategy
+from antlr4.InputStream import InputStream
+from antlr4.tree.Tree import ParseTreeWalker
 from pydantic import BaseModel

 from metadata.antlr.split_listener import SplitListener
@@ -121,6 +124,7 @@ def _(
     schema_name: Optional[str],
     table_name: str,
     retries: int = 3,
+    fetch_multiple_entities: bool = False,
 ) -> Optional[str]:
     """
     Building logic for tables
@@ -148,10 +152,16 @@ def _(
             },
             retries=retries,
         )
-        entity: Optional[Table] = get_entity_from_es_result(entity_list=es_result)
-        return str(entity.fullyQualifiedName.__root__) if entity else None
-
-    return _build(service_name, database_name, schema_name, table_name)
+        entity: Optional[Union[Table, List[Table]]] = get_entity_from_es_result(
+            entity_list=es_result, fetch_multiple_entities=fetch_multiple_entities
+        )
+        if not entity:
+            return None
+        if fetch_multiple_entities:
+            return [str(table.fullyQualifiedName.__root__) for table in entity]
+        return str(entity.fullyQualifiedName.__root__)
+    fqn = _build(service_name, database_name, schema_name, table_name)
+    return [fqn] if fetch_multiple_entities else fqn


 @fqn_build_registry.add(DatabaseSchema)
@@ -10,7 +10,7 @@
 # limitations under the License.

 from datetime import datetime, timedelta
-from typing import Any, Dict, Iterable
+from typing import Any, Dict, Iterable, Optional

 from metadata.generated.schema.api.services.createDashboardService import (
     CreateDashboardServiceRequest,
@@ -46,7 +46,7 @@ def get_start_and_end(duration):
     start = (today + timedelta(0 - duration)).replace(
         hour=0, minute=0, second=0, microsecond=0
     )
-    end = (today + timedelta(3)).replace(hour=0, minute=0, second=0, microsecond=0)
+    end = today.replace(hour=0, minute=0, second=0, microsecond=0)
     return start, end


@@ -185,8 +185,9 @@ def datetime_to_ts(date: datetime) -> int:
     return int(date.timestamp() * 1_000)


-def _get_formmated_table_name(table_name):
-    return table_name.replace("[", "").replace("]", "")
+def get_formatted_entity_name(name: str) -> Optional[str]:
+    if name:
+        return name.replace("[", "").replace("]", "")


 def get_raw_extract_iter(alchemy_helper) -> Iterable[Dict[str, Any]]:
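Two behavioural notes from the helpers hunks above, captured in a hedged sketch (the surrounding today definition is assumed, since it sits outside the hunk): get_formatted_entity_name strips the bracket quoting some engines emit, and get_start_and_end now ends the query-log window at the start of the current day instead of three days ahead.

# Illustrative only: behaviour implied by the helpers diff above.
from datetime import datetime, timedelta
from typing import Optional


def get_formatted_entity_name(name: str) -> Optional[str]:
    # "[dbo].[orders]" -> "dbo.orders"
    if name:
        return name.replace("[", "").replace("]", "")
    return None


def get_start_and_end(duration: int):
    # `today` is assumed to be the current UTC time, as in the original helper.
    today = datetime.utcnow()
    start = (today + timedelta(0 - duration)).replace(
        hour=0, minute=0, second=0, microsecond=0
    )
    # Changed in the hunk above: the window now ends today, not today + 3 days.
    end = today.replace(hour=0, minute=0, second=0, microsecond=0)
    return start, end


print(get_formatted_entity_name("[dbo].[orders]"))  # dbo.orders
print(get_start_and_end(1))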
@@ -13,33 +13,99 @@ Helper functions to handle SQL lineage operations
 """
 import traceback
 from logging.config import DictConfigurator
+from typing import List, Optional

 from metadata.generated.schema.api.lineage.addLineage import AddLineageRequest
 from metadata.generated.schema.entity.data.table import Table
-from metadata.generated.schema.type.entityLineage import EntitiesEdge
+from metadata.generated.schema.type.entityLineage import (
+    ColumnLineage,
+    EntitiesEdge,
+    LineageDetails,
+)
 from metadata.generated.schema.type.entityReference import EntityReference
+from metadata.ingestion.ometa.client import APIError
 from metadata.ingestion.ometa.ometa_api import OpenMetadata
 from metadata.utils import fqn
-from metadata.utils.helpers import _get_formmated_table_name
+from metadata.utils.helpers import get_formatted_entity_name
 from metadata.utils.logger import utils_logger


-# Prevent sqllineage from modifying the logger config
-def configure(self):
-    pass
-
-
-DictConfigurator.configure = configure
-
-
 logger = utils_logger()
+column_lineage_map = {}


-def _separate_fqn(database, fqn):
-    database_schema, table = fqn.split(".")[-2:]
-    if not database_schema:
-        database_schema = None
-    return {"database": database, "database_schema": database_schema, "name": table}
+def split_raw_table_name(database: str, raw_name: str) -> dict:
+    database_schema = None
+    if "." in raw_name:
+        database_schema, table = fqn.split(raw_name)[-2:]
+        if database_schema == "<default>":
+            database_schema = None
+    return {"database": database, "database_schema": database_schema, "table": table}
+
+
+def get_column_fqn(table_entity: Table, column: str) -> Optional[str]:
+    """
+    Get fqn of column if exist in table entity
+    """
+    if not table_entity:
+        return
+    for tbl_column in table_entity.columns:
+        if column.lower() == tbl_column.name.__root__.lower():
+            return tbl_column.fullyQualifiedName.__root__
+
+
+def search_table_entities(
+    metadata: OpenMetadata,
+    service_name: str,
+    database: str,
+    database_schema: Optional[str],
+    table: str,
+) -> Optional[List[Table]]:
+    """
+    Method to get table entity from database, database_schema & table name
+    """
+    try:
+        table_fqns = fqn.build(
+            metadata,
+            entity_type=Table,
+            service_name=service_name,
+            database_name=database,
+            schema_name=database_schema,
+            table_name=table,
+            fetch_multiple_entities=True,
+        )
+        table_entities = []
+        for table_fqn in table_fqns or []:
+            try:
+                table_entity = metadata.get_by_name(Table, fqn=table_fqn)
+                table_entities.append(table_entity)
+            except APIError:
+                logger.debug(f"Table not found for fqn: {fqn}")
+        return table_entities
+    except Exception as err:
+        logger.debug(traceback.format_exc())
+        logger.error(err)
+
+
+def get_column_lineage(
+    to_entity: Table,
+    from_entity: Table,
+    to_table_raw_name: str,
+    from_table_raw_name: str,
+) -> List[ColumnLineage]:
+    column_lineage = []
+    if column_lineage_map.get(to_table_raw_name) and column_lineage_map.get(
+        to_table_raw_name
+    ).get(from_table_raw_name):
+        for to_col, from_col in column_lineage_map.get(to_table_raw_name).get(
+            from_table_raw_name
+        ):
+            to_col_fqn = get_column_fqn(to_entity, to_col)
+            from_col_fqn = get_column_fqn(from_entity, from_col)
+            if to_col_fqn and from_col_fqn:
+                column_lineage.append(
+                    ColumnLineage(fromColumns=[from_col_fqn], toColumn=to_col_fqn)
+                )
+    return column_lineage


 def _create_lineage_by_table_name(
@@ -48,53 +114,46 @@ def _create_lineage_by_table_name(
     to_table: str,
     service_name: str,
     database: str,
+    query: str,
 ):
     """
     This method is to create a lineage between two tables
     """

     try:
-        from_table = str(from_table).replace("<default>", "")
-        to_table = str(to_table).replace("<default>", "")
-        from_fqn = fqn.build(
-            metadata,
-            entity_type=Table,
+        from_raw_name = get_formatted_entity_name(str(from_table))
+        from_table_obj = split_raw_table_name(database=database, raw_name=from_raw_name)
+        from_entities = search_table_entities(
+            table=from_table_obj.get("table"),
+            database_schema=from_table_obj.get("database_schema"),
+            database=from_table_obj.get("database"),
+            metadata=metadata,
             service_name=service_name,
-            database_name=database,
-            schema_name=None,  # TODO: Split table name
-            table_name=_get_formmated_table_name(str(from_table)),
         )
-        from_entity: Table = metadata.get_by_name(entity=Table, fqn=from_fqn)
-        if not from_entity:
-            table_obj = _separate_fqn(database=database, fqn=from_fqn)
-            multiple_from_fqns = metadata.es_search_from_service(
-                entity_type=Table,
-                service_name=service_name,
-                filters=table_obj,
-            )
-        else:
-            multiple_from_fqns = [from_entity]
-        to_fqn = fqn.build(
-            metadata,
-            entity_type=Table,
+        to_raw_name = get_formatted_entity_name(str(from_table))
+        to_table_obj = split_raw_table_name(database=database, raw_name=to_raw_name)
+        to_entities = search_table_entities(
+            table=to_table_obj.get("table"),
+            database_schema=to_table_obj.get("database_schema"),
+            database=to_table_obj.get("database"),
+            metadata=metadata,
             service_name=service_name,
-            database_name=database,
-            schema_name=None,  # TODO: Split table name
-            table_name=_get_formmated_table_name(str(to_table)),
         )
-        to_entity: Table = metadata.get_by_name(entity=Table, fqn=to_fqn)
-        if not to_entity:
-            table_obj = _separate_fqn(database=database, fqn=to_fqn)
-            multiple_to_fqns = metadata.es_search_from_service(
-                entity_type=Table,
-                service_name=service_name,
-                filters=table_obj,
-            )
-        else:
-            multiple_to_fqns = [to_entity]
-        if not multiple_to_fqns or not multiple_from_fqns:
+        if not to_entities or not from_entities:
             return None
-        for from_entity in multiple_from_fqns:
-            for to_entity in multiple_to_fqns:
+        for from_entity in from_entities:
+            for to_entity in to_entities:
+                col_lineage = get_column_lineage(
+                    to_entity=to_entity,
+                    to_table_raw_name=str(to_table),
+                    from_entity=from_entity,
+                    from_table_raw_name=str(from_table),
+                )
+                lineage_details = None
+                if col_lineage:
+                    lineage_details = LineageDetails(
+                        sqlQuery=query, columnsLineage=col_lineage
+                    )
                 lineage = AddLineageRequest(
                     edge=EntitiesEdge(
                         fromEntity=EntityReference(
@@ -107,13 +166,37 @@ def _create_lineage_by_table_name(
                         ),
                     )
                 )
+                if lineage_details:
+                    lineage.edge.lineageDetails = lineage_details
                 created_lineage = metadata.add_lineage(lineage)
                 logger.info(f"Successfully added Lineage {created_lineage}")

     except Exception as err:
         logger.debug(traceback.format_exc())
-        logger.error(err)
+        logger.error(traceback.format_exc())


+def populate_column_lineage_map(raw_column_lineage):
+    lineage_map = {}
+    if not raw_column_lineage or len(raw_column_lineage[0]) != 2:
+        return
+    for source, target in raw_column_lineage:
+        if lineage_map.get(str(target.parent)):
+            ele = lineage_map.get(str(target.parent))
+            if ele.get(str(source.parent)):
+                ele[str(source.parent)].append(
+                    (
+                        target.raw_name,
+                        source.raw_name,
+                    )
+                )
+            else:
+                ele[str(source.parent)] = [(target.raw_name, source.raw_name)]
+        else:
+            lineage_map[str(target.parent)] = {
+                str(source.parent): [(target.raw_name, source.raw_name)]
+            }
+    return lineage_map
+
+
 def ingest_lineage_by_query(
@@ -123,12 +206,22 @@ def ingest_lineage_by_query(
     This method parses the query to get source, target and intermediate table names to create lineage,
     and returns True if target table is found to create lineage otherwise returns False.
     """
+    # Prevent sqllineage from modifying the logger config
+    # Disable the DictConfigurator.configure method while importing LineageRunner
+    configure = DictConfigurator.configure
+    DictConfigurator.configure = lambda _: None
     from sqllineage.runner import LineageRunner

+    # Reverting changes after import is done
+    DictConfigurator.configure = configure
+    column_lineage_map.clear()
+
     try:
         result = LineageRunner(query)
         if not result.target_tables:
             return False
+        raw_column_lineage = result.get_column_lineage()
+        column_lineage_map.update(populate_column_lineage_map(raw_column_lineage))
         for intermediate_table in result.intermediate_tables:
             for source_table in result.source_tables:
                 _create_lineage_by_table_name(
@@ -137,6 +230,7 @@ def ingest_lineage_by_query(
                     to_table=intermediate_table,
                     service_name=service_name,
                     database=database,
+                    query=query,
                 )
         for target_table in result.target_tables:
             _create_lineage_by_table_name(
@@ -145,6 +239,7 @@ def ingest_lineage_by_query(
                 to_table=target_table,
                 service_name=service_name,
                 database=database,
+                query=query,
             )
         if not result.intermediate_tables:
             for target_table in result.target_tables:
@@ -155,6 +250,7 @@ def ingest_lineage_by_query(
                     to_table=target_table,
                     service_name=service_name,
                     database=database,
+                    query=query,
                 )
         return True
     except Exception as err:
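A condensed, functionally similar sketch of the column-lineage map that populate_column_lineage_map() builds from sqllineage column pairs (illustrative only; the Column dataclass is a stand-in that mimics just the .parent and .raw_name attributes the helper reads):

# Illustrative only: a condensed equivalent of populate_column_lineage_map().
from collections import defaultdict
from dataclasses import dataclass


@dataclass(frozen=True)
class Column:
    # Stand-in for the sqllineage column objects; only the attributes the
    # helper reads (.parent and .raw_name) are modelled here.
    parent: str
    raw_name: str


def build_column_lineage_map(raw_column_lineage):
    # Shape: {target_table: {source_table: [(target_col, source_col), ...]}}
    lineage_map = defaultdict(lambda: defaultdict(list))
    for source, target in raw_column_lineage or []:
        lineage_map[str(target.parent)][str(source.parent)].append(
            (target.raw_name, source.raw_name)
        )
    return {target: dict(sources) for target, sources in lineage_map.items()}


pairs = [
    (Column("db.src_orders", "id"), Column("db.dim_orders", "order_id")),
    (Column("db.src_orders", "total"), Column("db.dim_orders", "amount")),
]
print(build_column_lineage_map(pairs))
# {'db.dim_orders': {'db.src_orders': [('order_id', 'id'), ('amount', 'total')]}}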
@@ -62,7 +62,7 @@ from metadata.generated.schema.tests.table.tableRowCountToEqual import (
 )
 from metadata.generated.schema.tests.tableTest import TableTestCase, TableTestType
 from metadata.generated.schema.type.entityReference import EntityReference
-from metadata.ingestion.models.table_queries import TableUsageRequest
+from metadata.generated.schema.type.usageRequest import UsageRequest
 from metadata.ingestion.ometa.client import APIError
 from metadata.ingestion.ometa.ometa_api import OpenMetadata

@@ -340,7 +340,7 @@ class OMetaTableTest(TestCase):
             entity=Table, fqn=self.entity.fullyQualifiedName
         )

-        usage = TableUsageRequest(date="2021-10-20", count=10)
+        usage = UsageRequest(date="2021-10-20", count=10)

         self.metadata.publish_table_usage(res, usage)
@@ -28,12 +28,16 @@ config = """
       "sampleDataFolder": "ingestion/examples/sample_data"
     }
   },
-  "sourceConfig": {}
+  "sourceConfig": {
+    "config":{
+      "type": "DatabaseUsage"
+    }
+
+  }
 },
 "processor": {
   "type": "query-parser",
   "config": {
-    "filter": ""
   }
 },
 "stage": {
@@ -64,31 +68,31 @@ class QueryParserTest(TestCase):
         Check the join count
         """
         expected_result = {
-            "shopify.dim_address": 100,
-            "shopify.shop": 150,
-            "shopify.dim_customer": 125,
-            "dim_customer": 38,
-            "shopify.dim_location": 75,
-            "dim_location.shop_id": 25,
-            "shop": 28,
-            "shop_id": 25,
-            "shopify.dim_staff": 75,
-            "shopify.fact_line_item": 100,
-            "shopify.fact_order": 155,
-            "shopify.product": 5,
-            "shopify.fact_sale": 260,
-            "dim_address": 12,
-            "api": 2,
-            "dim_location": 4,
-            "product": 16,
-            "dim_staff": 5,
-            "fact_line_item": 17,
-            "fact_order": 15,
-            "fact_sale": 27,
-            "fact_session": 31,
-            "raw_customer": 10,
-            "raw_order": 13,
-            "raw_product_catalog": 6,
+            "shopify.dim_address": 200,
+            "shopify.shop": 300,
+            "shopify.dim_customer": 250,
+            "dim_customer": 76,
+            "shopify.dim_location": 150,
+            "dim_location.shop_id": 50,
+            "shop": 56,
+            "shop_id": 50,
+            "shopify.dim_staff": 150,
+            "shopify.fact_line_item": 200,
+            "shopify.fact_order": 310,
+            "shopify.product": 10,
+            "shopify.fact_sale": 520,
+            "dim_address": 24,
+            "api": 4,
+            "dim_location": 8,
+            "product": 32,
+            "dim_staff": 10,
+            "fact_line_item": 34,
+            "fact_order": 30,
+            "fact_sale": 54,
+            "fact_session": 62,
+            "raw_customer": 20,
+            "raw_order": 26,
+            "raw_product_catalog": 12,
         }
         workflow = Workflow.create(json.loads(config))
         workflow.execute()