* fix: removed sqlparse dependency for system metrics
* fix: update sample query
* fix: move system test os retrieval to `.get()`
* fix: move os.environ to `get`
This commit is contained in:
parent
6e92dc6938
commit
1e86b6533c
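As a quick illustration of the last two commit items, here is a minimal sketch (not taken from the commit itself) of why the tests switch to `os.environ.get()`: subscript access raises `KeyError` when a variable is unset, while `.get()` returns `None` (or a default), so the manually-run system tests can still be collected. The `E2E_*` names below are the ones used by the tests in this commit.

```python
import os

# os.environ["E2E_REDSHIFT_USERNAME"] raises KeyError when the variable is unset;
# .get() returns None instead, so module import and test collection do not fail.
username = os.environ.get("E2E_REDSHIFT_USERNAME")
password = os.environ.get("E2E_REDSHIFT_PASSWORD", "")  # optional default value
```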
@ -228,7 +228,7 @@ class SystemMetric(Metric, ABC):
     """Abstract class for system metrics"""
 
     @abstractmethod
-    def sql(self):
+    def sql(self, session: Session, **kwargs):
         """SQL query to get system Metric"""
 
 
@ -0,0 +1,35 @@
# Copyright 2021 Collate
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
DML Operation class and mapper
"""

from enum import Enum

from metadata.generated.schema.entity.data.table import DmlOperationType


class DatabaseDMLOperations(Enum):
    """enum of supported DML operation on database engine side"""

    INSERT = "INSERT"
    UPDATE = "UPDATE"
    DELETE = "DELETE"
    MERGE = "MERGE"


DML_OPERATION_MAP = {
    DatabaseDMLOperations.INSERT.value: DmlOperationType.INSERT.value,
    DatabaseDMLOperations.MERGE.value: DmlOperationType.UPDATE.value,
    DatabaseDMLOperations.UPDATE.value: DmlOperationType.UPDATE.value,
    DatabaseDMLOperations.DELETE.value: DmlOperationType.DELETE.value,
}
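A minimal usage sketch for the mapper above (assuming the OpenMetadata ingestion package is installed; the printed value assumes the generated `DmlOperationType` enum uses the plain operation names, which is what the mapping implies):

```python
from metadata.profiler.metrics.system.dml_operation import (
    DML_OPERATION_MAP,
    DatabaseDMLOperations,
)

# A MERGE on the engine side is reported as an UPDATE operation in OpenMetadata.
operation = DML_OPERATION_MAP[DatabaseDMLOperations.MERGE.value]
print(operation)  # expected: "UPDATE"
```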
@ -0,0 +1,54 @@
# Copyright 2021 Collate
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Bigquery System Metric Queries
"""
from datetime import datetime

from pydantic import BaseModel

from metadata.profiler.metrics.system.dml_operation import DatabaseDMLOperations


class BigQueryQueryResult(BaseModel):
    table_name: dict
    timestamp: datetime
    query_type: str
    dml_statistics: dict


DML_STAT_TO_DML_STATEMENT_MAPPING = {
    "inserted_row_count": DatabaseDMLOperations.INSERT.value,
    "deleted_row_count": DatabaseDMLOperations.DELETE.value,
    "updated_row_count": DatabaseDMLOperations.UPDATE.value,
}

JOBS = """
    SELECT
        statement_type,
        start_time,
        destination_table,
        dml_statistics
    FROM
        `region-{usage_location}`.INFORMATION_SCHEMA.JOBS
    WHERE
        DATE(creation_time) >= CURRENT_DATE() - 1 AND
        destination_table.dataset_id = '{dataset_id}' AND
        destination_table.project_id = '{project_id}' AND
        statement_type IN (
            '{insert}',
            '{update}',
            '{delete}',
            '{merge}'
        )
    ORDER BY creation_time DESC;
"""
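For reference, the profiler formats this template before executing it; the sketch below mirrors the call made later in this diff, with illustrative project and dataset names (not part of the commit):

```python
from metadata.profiler.metrics.system.dml_operation import DatabaseDMLOperations
from metadata.profiler.metrics.system.queries.bigquery import JOBS

# "my-project", "dbt_jaffle" and "us" are placeholder values for illustration only.
query = JOBS.format(
    usage_location="us",
    dataset_id="dbt_jaffle",
    project_id="my-project",
    insert=DatabaseDMLOperations.INSERT.value,
    update=DatabaseDMLOperations.UPDATE.value,
    delete=DatabaseDMLOperations.DELETE.value,
    merge=DatabaseDMLOperations.MERGE.value,
)
# `query` is then passed to session.execute(text(query)) by the system metric code.
```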
@ -0,0 +1,105 @@
# Copyright 2021 Collate
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Redshift System Metric Queries and query operations
"""

from typing import List

from sqlalchemy import text
from sqlalchemy.orm import Session

from metadata.utils.profiler_utils import QueryResult

STL_QUERY = """
    with data as (
        select
            {alias}.*
        from
            pg_catalog.stl_insert si
            {join_type} join pg_catalog.stl_delete sd on si.query = sd.query
        where
            {condition}
    )
    SELECT
        SUM(data."rows") AS "rows",
        sti."database",
        sti."schema",
        sti."table",
        sq.text,
        DATE_TRUNC('second', data.starttime) AS starttime
    FROM
        data
        INNER JOIN pg_catalog.svv_table_info sti ON data.tbl = sti.table_id
        INNER JOIN pg_catalog.stl_querytext sq ON data.query = sq.query
    where
        sti."database" = '{database}' AND
        sti."schema" = '{schema}' AND
        "rows" != 0 AND
        DATE(data.starttime) >= CURRENT_DATE - 1
    GROUP BY 2,3,4,5,6
    ORDER BY 6 DESC
"""


def get_query_results(
    session: Session,
    query,
    operation,
) -> List[QueryResult]:
    """get query results from the database

    Args:
        session (Session): session
        query (_type_): query
        operation (_type_): operation

    Returns:
        List[QueryResult]:
    """
    cursor = session.execute(text(query))
    results = [
        QueryResult(
            database_name=row.database,
            schema_name=row.schema,
            table_name=row.table,
            query_text=row.text,
            query_type=operation,
            timestamp=row.starttime,
            rows=row.rows,
        )
        for row in cursor
    ]

    return results


def get_metric_result(ddls: List[QueryResult], table_name: str) -> List:
    """Given query results, return the metric result

    Args:
        ddls (List[QueryResult]): list of query results
        table_name (str): table name

    Returns:
        List:
    """
    return [
        {
            "timestamp": int(ddl.timestamp.timestamp() * 1000),
            "operation": ddl.query_type,
            "rowsAffected": ddl.rows,
        }
        for ddl in ddls
        if ddl.table_name == table_name
    ]
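The template above is specialized three ways in the system metric code shown later in this diff; the sketch below just spells out those combinations so the join logic is easier to read (database and schema names are illustrative, taken from the test files):

```python
from metadata.profiler.metrics.system.queries.redshift import STL_QUERY

# Pure inserts: queries present in stl_insert with no matching stl_delete entry.
insert_query = STL_QUERY.format(
    alias="si", join_type="LEFT", condition="sd.query is null",
    database="dev", schema="dbt_jaffle",
)

# Pure deletes: queries present in stl_delete with no matching stl_insert entry.
delete_query = STL_QUERY.format(
    alias="sd", join_type="RIGHT", condition="si.query is null",
    database="dev", schema="dbt_jaffle",
)

# Updates: queries that show up in both stl_insert and stl_delete.
update_query = STL_QUERY.format(
    alias="si", join_type="INNER", condition="sd.query is not null",
    database="dev", schema="dbt_jaffle",
)
```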
@ -0,0 +1,99 @@
# Copyright 2021 Collate
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Snowflake System Metric Queries and query operations
"""

import re
from typing import Optional

from sqlalchemy.engine.row import Row

from metadata.utils.logger import profiler_logger
from metadata.utils.profiler_utils import QueryResult, get_identifiers_from_string

logger = profiler_logger()

INFORMATION_SCHEMA_QUERY = """
    SELECT * FROM "SNOWFLAKE"."ACCOUNT_USAGE"."QUERY_HISTORY"
    WHERE
    start_time>= DATEADD('DAY', -1, CURRENT_TIMESTAMP)
    AND QUERY_TEXT ILIKE '%{tablename}%'
    AND QUERY_TYPE IN (
        '{insert}',
        '{update}',
        '{delete}',
        '{merge}'
    )
    AND EXECUTION_STATUS = 'SUCCESS';
"""

RESULT_SCAN = """
    SELECT *
    FROM TABLE(RESULT_SCAN('{query_id}'));
"""


def get_snowflake_system_queries(
    row: Row, database: str, schema: str
) -> Optional[QueryResult]:
    """get snowflake system queries for a specific database and schema. Inspecting the
    query text is the only reliable way to get the DDL operation target, as the query
    history fields do not expose the affected table. The lookup is done with a regex:

    1. Match the query text and extract the target identifier
    2. Split the identifier into database, schema and table names and keep the row only
       if it belongs to the profiled database and schema

    Args:
        row (Row): row from the snowflake system queries table
        database (str): database name
        schema (str): schema name
    Returns:
        QueryResult: pydantic object with the query result
    """

    try:
        logger.debug(f"Trying to parse query:\n{row.QUERY_TEXT}\n")

        pattern = r"(?:(INSERT\s*INTO\s*|INSERT\s*OVERWRITE\s*INTO\s*|UPDATE\s*|MERGE\s*INTO\s*|DELETE\s*FROM\s*))([\w._\"]+)(?=[\s*\n])"  # pylint: disable=line-too-long
        match = re.match(pattern, row.QUERY_TEXT, re.IGNORECASE)
        try:
            identifier = match.group(2)
        except (IndexError, AttributeError):
            logger.debug("Could not find identifier in query. Skipping row.")
            return None

        database_name, schema_name, table_name = get_identifiers_from_string(identifier)

        if not all([database_name, schema_name, table_name]):
            logger.debug(
                "Missing database, schema, or table. Can't link operation to table entity in OpenMetadata."
            )
            return None

        if (
            database.lower() == database_name.lower()
            and schema.lower() == schema_name.lower()
        ):
            return QueryResult(
                query_id=row.QUERY_ID,
                database_name=database_name.lower(),
                schema_name=schema_name.lower(),
                table_name=table_name.lower(),
                query_text=row.QUERY_TEXT,
                query_type=row.QUERY_TYPE,
                timestamp=row.START_TIME,
            )
    except Exception:
        return None

    return None
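To make the regex concrete, here is a small standalone sketch showing how the second capture group yields the identifier that `get_identifiers_from_string` then splits (the sample statement is taken from the Snowflake test docstring later in this commit):

```python
import re

# Same pattern as in get_snowflake_system_queries, split only for line length.
PATTERN = (
    r"(?:(INSERT\s*INTO\s*|INSERT\s*OVERWRITE\s*INTO\s*|UPDATE\s*"
    r"|MERGE\s*INTO\s*|DELETE\s*FROM\s*))([\w._\"]+)(?=[\s*\n])"
)

query_text = "INSERT INTO TEST_DB.TEST_SCHEMA.NEW_TAB VALUES (1, 'FOO')"
match = re.match(PATTERN, query_text, re.IGNORECASE)
print(match.group(2))  # TEST_DB.TEST_SCHEMA.NEW_TAB
# get_identifiers_from_string(match.group(2)) then splits this into
# ('TEST_DB', 'TEST_SCHEMA', 'NEW_TAB').
```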
@ -8,31 +8,47 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+# pylint: disable=unused-argument
 """
 System Metric
 """
 
 import traceback
-from collections import defaultdict, namedtuple
-from enum import Enum
+from collections import defaultdict
 from textwrap import dedent
 from typing import Dict, List, Optional
 
-import sqlparse
 from sqlalchemy import text
 from sqlalchemy.orm import DeclarativeMeta, Session
 
-from metadata.generated.schema.entity.data.table import DmlOperationType
 from metadata.generated.schema.entity.services.connections.database.bigQueryConnection import (
     BigQueryConnection,
 )
 from metadata.profiler.metrics.core import SystemMetric
+from metadata.profiler.metrics.system.dml_operation import (
+    DML_OPERATION_MAP,
+    DatabaseDMLOperations,
+)
+from metadata.profiler.metrics.system.queries.bigquery import (
+    DML_STAT_TO_DML_STATEMENT_MAPPING,
+    JOBS,
+    BigQueryQueryResult,
+)
+from metadata.profiler.metrics.system.queries.redshift import (
+    STL_QUERY,
+    get_metric_result,
+    get_query_results,
+)
+from metadata.profiler.metrics.system.queries.snowflake import (
+    INFORMATION_SCHEMA_QUERY,
+    RESULT_SCAN,
+    get_snowflake_system_queries,
+)
 from metadata.profiler.orm.registry import Dialects
 from metadata.utils.dispatch import valuedispatch
 from metadata.utils.helpers import deep_size_of_dict
 from metadata.utils.logger import profiler_logger
-from metadata.utils.profiler_utils import clean_up_query, get_snowflake_system_queries
+from metadata.utils.profiler_utils import get_value_from_cache, set_cache
 
 logger = profiler_logger()
 

@ -44,22 +60,6 @@ def recursive_dic():
     return defaultdict(recursive_dic)
 
 
-class DatabaseDMLOperations(Enum):
-    """enum of supported DML operation on database engine side"""
-
-    INSERT = "INSERT"
-    UPDATE = "UPDATE"
-    DELETE = "DELETE"
-    MERGE = "MERGE"
-
-
-DML_OPERATION_MAP = {
-    DatabaseDMLOperations.INSERT.value: DmlOperationType.INSERT.value,
-    DatabaseDMLOperations.MERGE.value: DmlOperationType.UPDATE.value,
-    DatabaseDMLOperations.UPDATE.value: DmlOperationType.UPDATE.value,
-    DatabaseDMLOperations.DELETE.value: DmlOperationType.DELETE.value,
-}
-
-
 SYSTEM_QUERY_RESULT_CACHE = recursive_dic()
 
 

@ -108,100 +108,83 @@ def _(
         List[Dict]:
     """
     logger.debug(f"Fetching system metrics for {dialect}")
-    dml_stat_to_dml_statement_mapping = {
-        "inserted_row_count": DatabaseDMLOperations.INSERT.value,
-        "deleted_row_count": DatabaseDMLOperations.DELETE.value,
-        "updated_row_count": DatabaseDMLOperations.UPDATE.value,
-    }
     project_id = session.get_bind().url.host
-    dataset_id = table.__table_args__["schema"]
+    dataset_id = table.__table_args__["schema"]  # type: ignore
 
-    jobs = dedent(
-        f"""
-        SELECT
-            statement_type,
-            start_time,
-            destination_table,
-            dml_statistics
-        FROM
-            `region-{conn_config.usageLocation}`.INFORMATION_SCHEMA.JOBS
-        WHERE
-            DATE(creation_time) >= CURRENT_DATE() - 1 AND
-            destination_table.dataset_id = '{dataset_id}' AND
-            destination_table.project_id = '{project_id}' AND
-            statement_type IN (
-                '{DatabaseDMLOperations.INSERT.value}',
-                '{DatabaseDMLOperations.DELETE.value}',
-                '{DatabaseDMLOperations.UPDATE.value}',
-                '{DatabaseDMLOperations.MERGE.value}'
-            )
-        ORDER BY creation_time DESC;
-        """
-    )
-
     metric_results: List[Dict] = []
-    QueryResult = namedtuple(
-        "QueryResult",
-        "query_type,timestamp,destination_table,dml_statistics",
-    )
+    # QueryResult = namedtuple(
+    #     "QueryResult",
+    #     "query_type,timestamp,destination_table,dml_statistics",
+    # )
 
-    if (
-        "query_results"
-        in SYSTEM_QUERY_RESULT_CACHE[Dialects.BigQuery][project_id][dataset_id]
-    ):
-        # we'll try to get the cached data first
-        query_results = SYSTEM_QUERY_RESULT_CACHE[Dialects.BigQuery][project_id][
-            dataset_id
-        ]["query_results"]
-    else:
-        cursor_jobs = session.execute(text(jobs))
-        query_results = [
-            QueryResult(
-                row.statement_type,
-                row.start_time,
-                row.destination_table,
-                row.dml_statistics,
-            )
-            for row in cursor_jobs.fetchall()
-        ]
-        SYSTEM_QUERY_RESULT_CACHE[Dialects.BigQuery][project_id][dataset_id][
-            "query_results"
-        ] = query_results
+    jobs = get_value_from_cache(
+        SYSTEM_QUERY_RESULT_CACHE, f"{Dialects.BigQuery}.{project_id}.{dataset_id}.jobs"
+    )
 
-    for row_jobs in query_results:
-        if row_jobs.destination_table.get("table_id") == table.__tablename__:
+    if not jobs:
+        cursor_jobs = session.execute(
+            text(
+                JOBS.format(
+                    usage_location=conn_config.usageLocation,
+                    dataset_id=dataset_id,
+                    project_id=project_id,
+                    insert=DatabaseDMLOperations.INSERT.value,
+                    update=DatabaseDMLOperations.UPDATE.value,
+                    delete=DatabaseDMLOperations.DELETE.value,
+                    merge=DatabaseDMLOperations.MERGE.value,
+                )
+            )
+        )
+        jobs = [
+            BigQueryQueryResult(
+                query_type=row.statement_type,
+                timestamp=row.start_time,
+                table_name=row.destination_table,
+                dml_statistics=row.dml_statistics,
+            )
+            for row in cursor_jobs
+        ]
+        set_cache(
+            SYSTEM_QUERY_RESULT_CACHE,
+            f"{Dialects.BigQuery}.{project_id}.{dataset_id}.jobs",
+            jobs,
+        )
 
+    for job in jobs:
+        if job.table_name.get("table_id") == table.__tablename__:  # type: ignore
             rows_affected = None
             try:
-                if row_jobs.query_type == DatabaseDMLOperations.INSERT.value:
-                    rows_affected = row_jobs.dml_statistics.get("inserted_row_count")
-                if row_jobs.query_type == DatabaseDMLOperations.DELETE.value:
-                    rows_affected = row_jobs.dml_statistics.get("deleted_row_count")
-                if row_jobs.query_type == DatabaseDMLOperations.UPDATE.value:
-                    rows_affected = row_jobs.dml_statistics.get("updated_row_count")
+                if job.query_type == DatabaseDMLOperations.INSERT.value:
+                    rows_affected = job.dml_statistics.get("inserted_row_count")
+                if job.query_type == DatabaseDMLOperations.DELETE.value:
+                    rows_affected = job.dml_statistics.get("deleted_row_count")
+                if job.query_type == DatabaseDMLOperations.UPDATE.value:
+                    rows_affected = job.dml_statistics.get("updated_row_count")
             except AttributeError:
                 logger.debug(traceback.format_exc())
                 rows_affected = None
 
-            if row_jobs.query_type == DatabaseDMLOperations.MERGE.value:
-                for indx, key in enumerate(row_jobs.dml_statistics):
-                    if row_jobs.dml_statistics[key] != 0:
+            if job.query_type == DatabaseDMLOperations.MERGE.value:
+                for indx, key in enumerate(job.dml_statistics):
+                    if job.dml_statistics[key] != 0:
                         metric_results.append(
                             {
                                 # Merge statement can include multiple DML operations
                                 # We are padding timestamps by 0,1,2 millisecond to avoid
                                 # duplicate timestamps
-                                "timestamp": int(row_jobs.timestamp.timestamp() * 1000)
+                                "timestamp": int(job.timestamp.timestamp() * 1000)
                                 + indx,
-                                "operation": dml_stat_to_dml_statement_mapping.get(key),
-                                "rowsAffected": row_jobs.dml_statistics[key],
+                                "operation": DML_STAT_TO_DML_STATEMENT_MAPPING.get(key),
+                                "rowsAffected": job.dml_statistics[key],
                             }
                         )
                 continue
 
             metric_results.append(
                 {
-                    "timestamp": int(row_jobs.timestamp.timestamp() * 1000),
-                    "operation": row_jobs.query_type,
+                    "timestamp": int(job.timestamp.timestamp() * 1000),
+                    "operation": job.query_type,
                     "rowsAffected": rows_affected,
                 }
             )
@ -229,237 +212,147 @@ def _(
     """
     logger.debug(f"Fetching system metrics for {dialect}")
     database = session.get_bind().url.database
-    schema = table.__table_args__["schema"]
+    schema = table.__table_args__["schema"]  # type: ignore
 
-    stl_deleted = dedent(
-        f"""
-        SELECT
-            SUM(si."rows") AS "rows",
-            sti."database",
-            sti."schema",
-            sti."table",
-            sq.text,
-            DATE_TRUNC('second', si.starttime) AS starttime
-        FROM
-            pg_catalog.stl_delete si
-            INNER JOIN pg_catalog.svv_table_info sti ON si.tbl = sti.table_id
-            INNER JOIN pg_catalog.stl_querytext sq ON si.query = sq.query
-        WHERE
-            sti."database" = '{database}' AND
-            sti."schema" = '{schema}' AND
-            "rows" != 0 AND
-            DATE(starttime) >= CURRENT_DATE - 1
-        GROUP BY 2,3,4,5,6
-        ORDER BY 6 desc
-        """
-    )
-
-    stl_insert = dedent(
-        f"""
-        SELECT
-            SUM(si."rows") AS "rows",
-            sti."database",
-            sti."schema",
-            sti."table",
-            sq.text,
-            DATE_TRUNC('second', si.starttime) AS starttime
-        FROM
-            pg_catalog.stl_insert si
-            INNER JOIN pg_catalog.svv_table_info sti ON si.tbl = sti.table_id
-            INNER JOIN pg_catalog.stl_querytext sq ON si.query = sq.query
-        WHERE
-            sti."database" = '{database}' AND
-            sti."schema" = '{schema}' AND
-            "rows" != 0 AND
-            DATE(starttime) >= CURRENT_DATE - 1
-        GROUP BY 2,3,4,5,6
-        ORDER BY 6 desc
-        """
-    )
 
     metric_results: List[Dict] = []
-    QueryResult = namedtuple(
-        "QueryResult",
-        "database_name,schema_name,table_name,query_text,timestamp,rowsAffected",
-    )
 
-    if (
-        "query_results_inserted"
-        in SYSTEM_QUERY_RESULT_CACHE[Dialects.Redshift][database][schema]
-    ):
-        # we'll try to get the cached data first
-        query_results_inserted = SYSTEM_QUERY_RESULT_CACHE[Dialects.Redshift][database][
-            schema
-        ]["query_results_inserted"]
-    else:
-        cursor_insert = session.execute(text(stl_insert))
-        query_results_inserted = [
-            QueryResult(
-                row.database,
-                row.schema,
-                row.table,
-                sqlparse.parse(clean_up_query(row.text))[0],
-                row.starttime,
-                row.rows,
-            )
-            for row in cursor_insert.fetchall()
-        ]
-        SYSTEM_QUERY_RESULT_CACHE[Dialects.Redshift][database][schema][
-            "query_results_inserted"
-        ] = query_results_inserted
+    # get inserts ddl queries
+    inserts = get_value_from_cache(
+        SYSTEM_QUERY_RESULT_CACHE, f"{Dialects.Redshift}.{database}.{schema}.inserts"
+    )
+    if not inserts:
+        insert_query = STL_QUERY.format(
+            alias="si",
+            join_type="LEFT",
+            condition="sd.query is null",
+            database=database,
+            schema=schema,
+        )
+        inserts = get_query_results(
+            session,
+            insert_query,
+            DatabaseDMLOperations.INSERT.value,
+        )
+        set_cache(
+            SYSTEM_QUERY_RESULT_CACHE,
+            f"{Dialects.Redshift}.{database}.{schema}.inserts",
+            inserts,
+        )
+    metric_results.extend(get_metric_result(inserts, table.__tablename__))  # type: ignore
 
-    if (
-        "query_results_deleted"
-        in SYSTEM_QUERY_RESULT_CACHE[Dialects.Redshift][database][schema]
-    ):
-        # we'll try to get the cached data first
-        query_results_deleted = SYSTEM_QUERY_RESULT_CACHE[Dialects.Redshift][database][
-            schema
-        ]["query_results_deleted"]
-    else:
-        cursor_deleted = session.execute(text(stl_deleted))
-        query_results_deleted = [
-            QueryResult(
-                row.database,
-                row.schema,
-                row.table,
-                sqlparse.parse(clean_up_query(row.text))[0],
-                row.starttime,
-                row.rows,
-            )
-            for row in cursor_deleted.fetchall()
-        ]
-        SYSTEM_QUERY_RESULT_CACHE[Dialects.Redshift][database][schema][
-            "query_results_deleted"
-        ] = query_results_deleted
+    # get deletes ddl queries
+    deletes = get_value_from_cache(
+        SYSTEM_QUERY_RESULT_CACHE, f"{Dialects.Redshift}.{database}.{schema}.deletes"
+    )
+    if not deletes:
+        delete_query = STL_QUERY.format(
+            alias="sd",
+            join_type="RIGHT",
+            condition="si.query is null",
+            database=database,
+            schema=schema,
+        )
+        deletes = get_query_results(
+            session,
+            delete_query,
+            DatabaseDMLOperations.DELETE.value,
+        )
+        set_cache(
+            SYSTEM_QUERY_RESULT_CACHE,
+            f"{Dialects.Redshift}.{database}.{schema}.deletes",
+            deletes,
+        )
+    metric_results.extend(get_metric_result(deletes, table.__tablename__))  # type: ignore
 
-    for row_inserted in query_results_inserted:
-        if row_inserted.table_name == table.__tablename__:
-            query_text = row_inserted.query_text
-            operation = next(
-                (
-                    token.value.upper()
-                    for token in query_text.tokens
-                    if token.ttype is sqlparse.tokens.DML
-                    and token.value.upper()
-                    in DmlOperationType._member_names_  # pylint: disable=protected-access
-                ),
-                None,
-            )
-            if operation:
-                metric_results.append(
-                    {
-                        "timestamp": int(row_inserted.timestamp.timestamp() * 1000),
-                        "operation": operation,
-                        "rowsAffected": row_inserted.rowsAffected,
-                    }
-                )
-
-    for row_deleted in query_results_deleted:
-        if row_deleted.table_name == table.__tablename__:
-            query_text = row_deleted.query_text
-            operation = next(
-                (
-                    token.value.upper()
-                    for token in query_text.tokens
-                    if token.ttype is sqlparse.tokens.DML and token.value != "UPDATE"
-                ),
-                None,
-            )
-            if operation:
-                metric_results.append(
-                    {
-                        "timestamp": int(row_deleted.timestamp.timestamp() * 1000),
-                        "operation": operation,
-                        "rowsAffected": row_deleted.rowsAffected,
-                    }
-                )
+    # get updates ddl queries
+    updates = get_value_from_cache(
+        SYSTEM_QUERY_RESULT_CACHE, f"{Dialects.Redshift}.{database}.{schema}.updates"
+    )
+    if not updates:
+        update_query = STL_QUERY.format(
+            alias="si",
+            join_type="INNER",
+            condition="sd.query is not null",
+            database=database,
+            schema=schema,
+        )
+        updates = get_query_results(
+            session,
+            update_query,
+            DatabaseDMLOperations.UPDATE.value,
+        )
+        set_cache(
+            SYSTEM_QUERY_RESULT_CACHE,
+            f"{Dialects.Redshift}.{database}.{schema}.updates",
+            updates,
+        )
+    metric_results.extend(get_metric_result(updates, table.__tablename__))  # type: ignore
 
     return metric_results
 
 
-# @get_system_metrics_for_dialect.register(Dialects.Snowflake)
-# def _(
-#     dialect: str,
-#     session: Session,
-#     table: DeclarativeMeta,
-#     *args,
-#     **kwargs,
-# ) -> Optional[List[Dict]]:
-#     """Fetch system metrics for Snowflake. query_history will return maximum 10K rows in one request.
-#     We'll be fetching all the queries ran for the past 24 hours and filtered on specific query types
-#     (INSERTS, MERGE, DELETE, UPDATE).
-#
-#     To get the number of rows affected we'll use the specific query ID.
-#
-#     Args:
-#         dialect (str): dialect
-#         session (Session): session object
-#
-#     Returns:
-#         Dict: system metric
-#     """
-#     logger.debug(f"Fetching system metrics for {dialect}")
-#     database = session.get_bind().url.database
-#     schema = table.__table_args__["schema"]
-#
-#     metric_results: List[Dict] = []
-#
-#     information_schema_query_history = f"""
-#     SELECT * FROM "SNOWFLAKE"."ACCOUNT_USAGE"."QUERY_HISTORY"
-#     WHERE
-#     start_time>= DATEADD('DAY', -1, CURRENT_TIMESTAMP)
-#     AND QUERY_TYPE IN (
-#         '{DatabaseDMLOperations.INSERT.value}',
-#         '{DatabaseDMLOperations.UPDATE.value}',
-#         '{DatabaseDMLOperations.DELETE.value}',
-#         '{DatabaseDMLOperations.MERGE.value}'
-#     )
-#     AND EXECUTION_STATUS = 'SUCCESS';
-#     """
-#     result_scan = """
-#     SELECT *
-#     FROM TABLE(RESULT_SCAN('{query_id}'));
-#     """
-#
-#     if (
-#         "query_results"
-#         in SYSTEM_QUERY_RESULT_CACHE[Dialects.Snowflake][database][schema]
-#     ):
-#         # we'll try to get the cached data first
-#         query_results = SYSTEM_QUERY_RESULT_CACHE[Dialects.Snowflake][database][schema][
-#             "query_results"
-#         ]
-#     else:
-#         rows = session.execute(text(information_schema_query_history))
-#         query_results = []
-#         for row in rows:
-#             result = get_snowflake_system_queries(row, database, schema)
-#             if result:
-#                 query_results.append(result)
-#         SYSTEM_QUERY_RESULT_CACHE[Dialects.Snowflake][database][schema][
-#             "query_results"
-#         ] = query_results
-#
-#     for query_result in query_results:
-#         if table.__tablename__.lower() == query_result.table_name:
-#             cursor_for_result_scan = session.execute(
-#                 text(dedent(result_scan.format(query_id=query_result.query_id)))
-#             )
-#             row_for_result_scan = cursor_for_result_scan.first()
-#
-#             metric_results.append(
-#                 {
-#                     "timestamp": int(query_result.timestamp.timestamp() * 1000),
-#                     "operation": DML_OPERATION_MAP.get(query_result.query_type),
-#                     "rowsAffected": row_for_result_scan[0]
-#                     if row_for_result_scan
-#                     else None,
-#                 }
-#             )
-#
-#     return metric_results
+@get_system_metrics_for_dialect.register(Dialects.Snowflake)
+def _(
+    dialect: str,
+    session: Session,
+    table: DeclarativeMeta,
+    *args,
+    **kwargs,
+) -> Optional[List[Dict]]:
+    """Fetch system metrics for Snowflake. query_history will return maximum 10K rows in one request.
+    We'll be fetching all the queries run for the past 24 hours and filtered on specific query types
+    (INSERTS, MERGE, DELETE, UPDATE).
+
+    :warning: Unlike redshift and bigquery results are not cached as we'll be looking
+    at DDL for each table
+
+    To get the number of rows affected we'll use the specific query ID.
+
+    Args:
+        dialect (str): dialect
+        session (Session): session object
+
+    Returns:
+        Dict: system metric
+    """
+    logger.debug(f"Fetching system metrics for {dialect}")
+    database = session.get_bind().url.database
+    schema = table.__table_args__["schema"]  # type: ignore
+
+    metric_results: List[Dict] = []
+
+    rows = session.execute(
+        text(
+            INFORMATION_SCHEMA_QUERY.format(
+                tablename=table.__tablename__,  # type: ignore
+                insert=DatabaseDMLOperations.INSERT.value,
+                update=DatabaseDMLOperations.UPDATE.value,
+                delete=DatabaseDMLOperations.DELETE.value,
+                merge=DatabaseDMLOperations.MERGE.value,
+            )
+        )
+    )
+    query_results = []
+    for row in rows:
+        result = get_snowflake_system_queries(row, database, schema)
+        if result:
+            query_results.append(result)
+
+    for query_result in query_results:
+        cursor_for_result_scan = session.execute(
+            text(dedent(RESULT_SCAN.format(query_id=query_result.query_id)))
+        )
+        row_for_result_scan = cursor_for_result_scan.first()
+
+        metric_results.append(
+            {
+                "timestamp": int(query_result.timestamp.timestamp() * 1000),
+                "operation": DML_OPERATION_MAP.get(query_result.query_type),
+                "rowsAffected": row_for_result_scan[0] if row_for_result_scan else None,
+            }
+        )
+
+    return metric_results
 
 
 class System(SystemMetric):

@ -513,7 +406,7 @@ class System(SystemMetric):
         system_metrics = get_system_metrics_for_dialect(
             session.get_bind().dialect.name,
             session=session,
-            table=self.table,
+            table=self.table,  # pylint: disable=no-member
             conn_config=conn_config,
         )
         self._manage_cache()
@ -12,15 +12,34 @@
 """Profiler utils class and functions"""
 
 import re
-from collections import namedtuple
-from typing import Optional
+from collections import defaultdict
+from datetime import datetime
+from functools import reduce
+from typing import Optional, Tuple
 
 import sqlparse
-from sqlalchemy.engine.row import Row
-from sqlparse.sql import Identifier
+from pydantic import BaseModel
 
+from metadata.utils.logger import profiler_logger
 from metadata.utils.sqa_utils import is_array
 
+logger = profiler_logger()
+
+PARSING_TIMEOUT = 10
+
+
+class QueryResult(BaseModel):
+    """System metric query result shared by Redshift and Snowflake"""
+
+    database_name: str
+    schema_name: str
+    table_name: str
+    query_type: str
+    timestamp: datetime
+    query_id: Optional[str] = None
+    query_text: Optional[str] = None
+    rows: Optional[int] = None
+
+
 class ColumnLike:
     """We don't have column information at this stage (only metric entities)

@ -57,60 +76,58 @@ def clean_up_query(query: str) -> str:
     return sqlparse.format(query, strip_comments=True).replace("\\n", "")
 
 
-def get_snowflake_system_queries(
-    row: Row, database: str, schema: str
-) -> Optional["QueryResult"]:
-    """get snowflake system queries for a specific database and schema. Parsing the query
-    is the only reliable way to get the DDL operation as fields in the table are not.
+def get_identifiers_from_string(
+    identifier: str,
+) -> Tuple[Optional[str], Optional[str], Optional[str]]:
+    """given a string identifier try to fetch the database, schema and table names.
+    Quoted and unquoted identifiers are supported; when the identifier is only partially
+    qualified, the missing leading parts are padded with `None` on the left of the tuple
+    (e.g. `"SCHEMA.TABLE"` yields `(None, schema, table)`).
 
     Args:
-        row (dict): row from the snowflake system queries table
-        database (str): database name
-        schema (str): schema name
+        identifier (str): table identifier
     Returns:
-        QueryResult: namedtuple with the query result
+        Tuple[str, str, str]: database, schema and table names
     """
+    pattern = r"\"([^\"]+)\"|(\w+(?:\.\w+)*(?:\.\w+)*)"
+    matches = re.findall(pattern, identifier)
 
-    QueryResult = namedtuple(
-        "QueryResult",
-        "query_id,database_name,schema_name,table_name,query_text,query_type,timestamp",
-    )
+    values = []
+    for match in matches:
+        if match[0] != "":
+            values.append(match[0])
+        if match[1] != "":
+            split_match = match[1].split(".")
+            values.extend(split_match)
 
-    try:
-        parsed_query = sqlparse.parse(clean_up_query(row.query_text))[0]
-        identifier = next(
-            (
-                query_el
-                for query_el in parsed_query.tokens
-                if isinstance(query_el, Identifier)
-            ),
-            None,
-        )
-        if not identifier:
-            return None
-        values = identifier.value.split(".")
-        database_name, schema_name, table_name = ([None] * (3 - len(values))) + values
+    database_name, schema_name, table_name = ([None] * (3 - len(values))) + values
+    return database_name, schema_name, table_name
 
-        if not all([database_name, schema_name, table_name]):
-            return None
 
-        # clean up table name
-        table_name = re.sub(r"\s.*", "", table_name).strip()
+def get_value_from_cache(cache: dict, key: str):
+    """given a dict of cache and a dotted key, return the value if it exists
 
-        if (
-            database.lower() == database_name.lower()
-            and schema.lower() == schema_name.lower()
-        ):
-            return QueryResult(
-                row.query_id,
-                database_name.lower(),
-                schema_name.lower(),
-                table_name.lower(),
-                parsed_query,
-                row.query_type,
-                row.start_time,
-            )
-    except Exception:
-        return None
-
-    return None
+    Args:
+        cache (dict): dict of cache
+        key (str): key to look for in the cache
+    """
+    try:
+        return reduce(dict.get, key.split("."), cache)
+    except TypeError:
+        return None
+
+
+def set_cache(cache: defaultdict, key: str, value):
+    """given a dict of cache, a dotted key and a value, set the value in the cache
+
+    Args:
+        cache (dict): dict of cache
+        key (str): key to set in the cache
+        value: value to set in the cache
+    """
+    split_key = key.split(".")
+    for indx, key_ in enumerate(split_key):
+        if indx == len(split_key) - 1:
+            cache[key_] = value
+            break
+        cache = cache[key_]
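A hedged usage sketch for the three helpers above. The nested-cache layout mirrors the `recursive_dic` defaultdict used by the system metrics module; the cache key and cached value are illustrative.

```python
from collections import defaultdict

from metadata.utils.profiler_utils import (
    get_identifiers_from_string,
    get_value_from_cache,
    set_cache,
)

# Identifiers may be fully or partially qualified; missing parts come back as None.
print(get_identifiers_from_string("TEST_DB.TEST_SCHEMA.NEW_TAB"))  # ('TEST_DB', 'TEST_SCHEMA', 'NEW_TAB')
print(get_identifiers_from_string('"TEST_SCHEMA"."NEW_TAB"'))      # (None, 'TEST_SCHEMA', 'NEW_TAB')


# A nested defaultdict so that set_cache can walk intermediate keys.
def recursive_dic():
    return defaultdict(recursive_dic)


cache = recursive_dic()
set_cache(cache, "Redshift.dev.dbt_jaffle.inserts", ["cached rows here"])
print(get_value_from_cache(cache, "Redshift.dev.dbt_jaffle.inserts"))  # ['cached rows here']
print(get_value_from_cache(cache, "Redshift.dev.missing.key"))         # None
```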
@ -0,0 +1,179 @@
# Copyright 2021 Collate
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Validate bigquery system metrics (will be disabled by default). To be run manually

How to use this test
--------------------

1. Comment out the @pytest.mark.skip decorator on line 31
2. Make sure you have set up the right environment variables for the bigquery database
   check the config file at "cli_e2e/database/bigquery/bigquery.yaml". The metadata
   ingestion will ingest data from the `dbt_jaffle` schema.
3. Prior to running this test you will need to execute DDLs in the `dbt_jaffle` schema.
   We will need to perform at least one `DELETE`, `INSERT`, `UPDATE` on any table from the schema.
   query example:
    ```
    INSERT INTO dbt_jaffle.Person VALUES
    ('John', 'Doe', 'II'),
    ('Jane', 'Doe', 'II'),
    ('Jeff', 'Doe', 'II')

    UPDATE dbt_jaffle.Person SET add = 'IV' WHERE first_name = 'John';

    MERGE INTO dbt_jaffle.Person NT USING (SELECT 'Jeff' AS first_name, 'Doe' AS last_name, NULL AS add) N ON NT.first_name = N.first_name
    WHEN MATCHED THEN UPDATE SET NT.first_name = N.first_name;

    DELETE FROM dbt_jaffle.Person WHERE first_name = 'John';
    ```
4. Once you have performed the above steps, run the test with the following command:
   `python -m pytest tests/integration/orm_profiler/system/test_bigquery_system_metrics.py` from the ingestion directory.
   You can also perform the same action with your IDE.

:warning: the profiler workflow will be run for the table set in `PROFILER_TABLE_FILTER_PATTERN`
"""

import os
import pathlib
from copy import deepcopy
from unittest import TestCase

import pytest
import yaml

from metadata.generated.schema.entity.data.table import SystemProfile
from metadata.generated.schema.entity.services.connections.metadata.openMetadataConnection import (
    OpenMetadataConnection,
)
from metadata.ingestion.api.workflow import Workflow
from metadata.ingestion.ometa.ometa_api import OpenMetadata
from metadata.profiler.api.workflow import ProfilerWorkflow
from metadata.utils.time_utils import (
    get_beginning_of_day_timestamp_mill,
    get_end_of_day_timestamp_mill,
)

TESTS_ROOT_DIR = pathlib.Path(__file__).parent.parent.parent.parent
BIGQUERY_CONFIG_FILE = "cli_e2e/database/bigquery/bigquery.yaml"
FULL_CONFIG_PATH = pathlib.Path(TESTS_ROOT_DIR, BIGQUERY_CONFIG_FILE)
DATABASE_FILTER = {
    "includes": os.environ.get("E2E_BQ_PROJECT_ID"),
    "excludes": None,
}
SCHEMA_FILTER = {
    "includes": "dbt_jaffle",
    "excludes": None,
}
TABLE_FILTER = {
    "includes": "Person",
    "excludes": None,
}


@pytest.mark.skip(
    reason="Disabled by default. Should be run manually on system metric updates"
)
class TestBigquerySystem(TestCase):
    """Test class for bigquery system metrics"""

    taxonomy = os.environ.get("E2E_BQ_PROJECT_ID_TAXONOMY")
    private_key_id = os.environ.get("E2E_BQ_PRIVATE_KEY_ID")
    private_key = os.environ.get("E2E_BQ_PRIVATE_KEY")
    project_id = DATABASE_FILTER["includes"]
    client_email = os.environ.get("E2E_BQ_CLIENT_EMAIL")
    client_id = os.environ.get("E2E_BQ_CLIENT_ID")

    full_config_path = FULL_CONFIG_PATH

    schema = SCHEMA_FILTER["includes"]
    table = TABLE_FILTER["includes"]

    @classmethod
    def setUpClass(cls) -> None:
        """set up class"""
        with open(cls.full_config_path, "r", encoding="utf-8") as file:
            cls.config = yaml.safe_load(file)

        # set up the config to filter from the `dbt_jaffle` schema
        cls.config["source"]["sourceConfig"]["config"]["schemaFilterPattern"] = {
            "includes": [cls.schema],
        }
        cls.config["source"]["sourceConfig"]["config"]["tableFilterPattern"] = {
            "includes": [cls.table],
        }
        cls.config["source"]["serviceConnection"]["config"]["credentials"]["gcpConfig"][
            "projectId"
        ] = cls.project_id
        cls.config["source"]["serviceConnection"]["config"]["credentials"]["gcpConfig"][
            "privateKeyId"
        ] = cls.private_key_id
        cls.config["source"]["serviceConnection"]["config"]["credentials"]["gcpConfig"][
            "privateKey"
        ] = cls.private_key
        cls.config["source"]["serviceConnection"]["config"]["credentials"]["gcpConfig"][
            "clientEmail"
        ] = cls.client_email
        cls.config["source"]["serviceConnection"]["config"]["credentials"]["gcpConfig"][
            "clientId"
        ] = cls.client_id
        cls.config["source"]["serviceConnection"]["config"]["taxonomyProjectID"] = [
            cls.taxonomy
        ]

        # set metadata config
        cls.metadata_config_dict = cls.config["workflowConfig"][
            "openMetadataServerConfig"
        ]
        cls.metadata_config = OpenMetadataConnection.parse_obj(cls.metadata_config_dict)
        cls.metadata = OpenMetadata(cls.metadata_config)

        # run the ingestion workflow
        ingestion_workflow = Workflow.create(cls.config)
        ingestion_workflow.execute()
        ingestion_workflow.raise_from_status()
        ingestion_workflow.print_status()
        ingestion_workflow.stop()

        # get table fqn
        cls.table_fqn = f"{cls.config['source']['serviceName']}.{cls.project_id}.{cls.schema}.{cls.table}"

    def test_bigquery_system_metrics(self):
        """run profiler workflow and check the system metrics"""
        config = deepcopy(self.config)
        # update the config to run the profiler workflow
        config["source"]["sourceConfig"]["config"] = {
            "type": "Profiler",
            "generateSampleData": True,
            "timeoutSeconds": 5400,
            "tableFilterPattern": {
                "includes": [self.table],
            },
        }
        config["processor"] = {
            "type": "orm-profiler",
            "config": {},
        }
        profiler_workflow = ProfilerWorkflow.create(config)
        profiler_workflow.execute()
        profiler_workflow.raise_from_status()
        profiler_workflow.print_status()
        profiler_workflow.stop()

        # get latest profile metrics
        profile = self.metadata.get_profile_data(
            self.table_fqn,
            get_beginning_of_day_timestamp_mill(days=1),
            get_end_of_day_timestamp_mill(),
            profile_type=SystemProfile,
        )
        ddl_operations = [prl.operation.value for prl in profile.entities]
        assert set(ddl_operations) == set(["INSERT", "UPDATE", "DELETE"])
@ -0,0 +1,146 @@
# Copyright 2021 Collate
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Validate redshift system metrics (will be disabled by default). To be run manually

How to use this test
--------------------

1. Comment out the @pytest.mark.skip decorator on line 31
2. Make sure you have set up the right environment variables for the redshift database
   check the config file at "cli_e2e/database/redshift/redshift.yaml". The metadata
   ingestion will ingest data from the `dbt_jaffle` schema.
3. Prior to running this test you will need to execute DDLs in the `dbt_jaffle` schema.
   We will need to perform at least one `DELETE`, `INSERT`, `UPDATE` on any table from the schema.
4. Once you have performed the above steps, run the test with the following command:
   `python -m pytest tests/integration/orm_profiler/system/test_redshift_system_metrics.py` from the ingestion directory.
   You can also perform the same action with your IDE.

:warning: the profiler workflow will be run for the table set in `PROFILER_TABLE_FILTER_PATTERN`
"""

import os
import pathlib
from copy import deepcopy
from unittest import TestCase

import pytest
import yaml

from metadata.generated.schema.entity.data.table import SystemProfile
from metadata.generated.schema.entity.services.connections.metadata.openMetadataConnection import (
    OpenMetadataConnection,
)
from metadata.ingestion.api.workflow import Workflow
from metadata.ingestion.ometa.ometa_api import OpenMetadata
from metadata.profiler.api.workflow import ProfilerWorkflow
from metadata.utils.time_utils import (
    get_beginning_of_day_timestamp_mill,
    get_end_of_day_timestamp_mill,
)

TESTS_ROOT_DIR = pathlib.Path(__file__).parent.parent.parent.parent
REDSHIFT_CONFIG_FILE = "cli_e2e/database/redshift/redshift.yaml"
FULL_CONFIG_PATH = pathlib.Path(TESTS_ROOT_DIR, REDSHIFT_CONFIG_FILE)
DATABASE_FILTER = {
    "includes": "dev",
    "excludes": None,
}
SCHEMA_FILTER = {
    "includes": "dbt_jaffle",
    "excludes": None,
}
TABLE_FILTER = {
    "includes": "boolean_test",
    "excludes": None,
}


@pytest.mark.skip(
    reason="Disabled by default. Should be run manually on system metric updates"
)
class TestRedshiftSystem(TestCase):
    """Test class for redshift system metrics"""

    hostPort = os.environ.get("E2E_REDSHIFT_HOST_PORT")
    username = os.environ.get("E2E_REDSHIFT_USERNAME")
    password = os.environ.get("E2E_REDSHIFT_PASSWORD")
    database = DATABASE_FILTER["includes"]

    full_config_path = FULL_CONFIG_PATH

    schema = SCHEMA_FILTER["includes"]
    table = TABLE_FILTER["includes"]

    @classmethod
    def setUpClass(cls) -> None:
        """set up class"""
        with open(cls.full_config_path, "r", encoding="utf-8") as file:
            cls.config = yaml.safe_load(file)

        # set up the config to filter from the `dbt_jaffle` schema
        cls.config["source"]["sourceConfig"]["config"]["schemaFilterPattern"] = {
            "includes": [cls.schema],
        }
        cls.config["source"]["serviceConnection"]["config"]["hostPort"] = cls.hostPort
        cls.config["source"]["serviceConnection"]["config"]["username"] = cls.username
        cls.config["source"]["serviceConnection"]["config"]["password"] = cls.password
        cls.config["source"]["serviceConnection"]["config"]["database"] = cls.database

        # set metadata config
        cls.metadata_config_dict = cls.config["workflowConfig"][
            "openMetadataServerConfig"
        ]
        cls.metadata_config = OpenMetadataConnection.parse_obj(cls.metadata_config_dict)
        cls.metadata = OpenMetadata(cls.metadata_config)

        # run the ingestion workflow
        ingestion_workflow = Workflow.create(cls.config)
        ingestion_workflow.execute()
        ingestion_workflow.raise_from_status()
        ingestion_workflow.print_status()
        ingestion_workflow.stop()

        # get table fqn
        cls.table_fqn = f"{cls.config['source']['serviceName']}.{cls.database}.{cls.schema}.{cls.table}"

    def test_redshift_system_metrics(self):
        """run profiler workflow and check the system metrics"""
        config = deepcopy(self.config)
        # update the config to run the profiler workflow
        config["source"]["sourceConfig"]["config"] = {
            "type": "Profiler",
            "generateSampleData": True,
            "timeoutSeconds": 5400,
            "tableFilterPattern": {
                "includes": [self.table],
            },
        }
        config["processor"] = {
            "type": "orm-profiler",
            "config": {},
        }
        profiler_workflow = ProfilerWorkflow.create(config)
        profiler_workflow.execute()
        profiler_workflow.raise_from_status()
        profiler_workflow.print_status()
        profiler_workflow.stop()

        # get latest profile metrics
        profile = self.metadata.get_profile_data(
            self.table_fqn,
            get_beginning_of_day_timestamp_mill(days=1),
            get_end_of_day_timestamp_mill(),
            profile_type=SystemProfile,
        )
        ddl_operations = [prl.operation.value for prl in profile.entities]
        assert set(ddl_operations) == set(["INSERT", "UPDATE", "DELETE"])
@ -0,0 +1,170 @@
# Copyright 2021 Collate
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Validate snowflake system metrics (disabled by default). To be run manually.

How to use this test
--------------------

1. Comment out the @pytest.mark.skip decorator on line 31
2. Make sure you have set up the right environment variables for the snowflake database;
   check the config file at "cli_e2e/database/snowflake/snowflake.yaml". The metadata
   ingestion will ingest data from the `TEST_DB` schema.
3. Prior to running this test you will need to execute DML statements in the `TEST_DB` schema.
   We will need to perform at least one `DELETE`, `INSERT`, `UPDATE` on any table from the schema.
   Query example:
   ```
   INSERT INTO TEST_DB.TEST_SCHEMA.NEW_TAB VALUES
       (1, 'FOO'),
       (2, 'BAR'),
       (3, 'BAZZ')

   INSERT OVERWRITE INTO TEST_DB.TEST_SCHEMA.NEW_TAB VALUES
       (4, 'FOOBAR'),
       (5, 'FOOBAZZ'),
       (6, 'BARBAZZ')

   UPDATE TEST_DB.TEST_SCHEMA.NEW_TAB SET NAME='BABAR' WHERE id = 6;

   MERGE INTO TEST_DB.TEST_SCHEMA.NEW_TAB NT USING (SELECT 5 AS id, 'BAR' AS NAME) N ON NT.id = N.id
   WHEN MATCHED THEN UPDATE SET NT.NAME = N.NAME;

   DELETE FROM TEST_DB.TEST_SCHEMA.NEW_TAB WHERE ID = 4;
   ```
4. Once you have performed the above steps, run the test with the following command:
   `python -m pytest tests/integration/orm_profiler/system/test_snowflake_system_metrics.py` from the ingestion directory.
   You can also perform the same action with your IDE.

:warning: the profiler workflow will be run for the table set in `PROFILER_TABLE_FILTER_PATTERN`
"""

import os
import pathlib
from copy import deepcopy
from unittest import TestCase

import pytest
import yaml

from metadata.generated.schema.entity.data.table import SystemProfile
from metadata.generated.schema.entity.services.connections.metadata.openMetadataConnection import (
    OpenMetadataConnection,
)
from metadata.ingestion.api.workflow import Workflow
from metadata.ingestion.ometa.ometa_api import OpenMetadata
from metadata.profiler.api.workflow import ProfilerWorkflow
from metadata.utils.time_utils import (
    get_beginning_of_day_timestamp_mill,
    get_end_of_day_timestamp_mill,
)

TESTS_ROOT_DIR = pathlib.Path(__file__).parent.parent.parent.parent
SNOWFLAKE_CONFIG_FILE = "cli_e2e/database/snowflake/snowflake.yaml"
FULL_CONFIG_PATH = pathlib.Path(TESTS_ROOT_DIR, SNOWFLAKE_CONFIG_FILE)
DATABASE_FILTER = {
    "includes": "TEST_DB",
    "excludes": None,
}
SCHEMA_FILTER = {
    "includes": "TEST_SCHEMA",
    "excludes": None,
}
TABLE_FILTER = {
    "includes": "NEW_TAB",
    "excludes": None,
}


@pytest.mark.skip(
    reason="Disabled by default. Should be run manually on system metric updates"
)
class TestSnowflakeSystem(TestCase):
    """Test class for snowflake system metrics"""

    account = os.environ.get("E2E_SNOWFLAKE_ACCOUNT")
    warehouse = os.environ.get("E2E_SNOWFLAKE_WAREHOUSE")
    username = os.environ.get("E2E_SNOWFLAKE_USERNAME")
    password = os.environ.get("E2E_SNOWFLAKE_PASSWORD")
    database = DATABASE_FILTER["includes"]

    full_config_path = FULL_CONFIG_PATH

    schema = SCHEMA_FILTER["includes"]
    table = TABLE_FILTER["includes"]

    @classmethod
    def setUpClass(cls) -> None:
        """set up class"""
        with open(cls.full_config_path, "r", encoding="utf-8") as file:
            cls.config = yaml.safe_load(file)

        # set up the config to filter from the test schema
        cls.config["source"]["sourceConfig"]["config"]["schemaFilterPattern"] = {
            "includes": [cls.schema],
        }
        cls.config["source"]["sourceConfig"]["config"]["tableFilterPattern"] = {
            "includes": [cls.table],
        }
        cls.config["source"]["serviceConnection"]["config"]["account"] = cls.account
        cls.config["source"]["serviceConnection"]["config"]["warehouse"] = cls.warehouse
        cls.config["source"]["serviceConnection"]["config"]["username"] = cls.username
        cls.config["source"]["serviceConnection"]["config"]["password"] = cls.password
        cls.config["source"]["serviceConnection"]["config"]["database"] = cls.database

        # set metadata config
        cls.metadata_config_dict = cls.config["workflowConfig"][
            "openMetadataServerConfig"
        ]
        cls.metadata_config = OpenMetadataConnection.parse_obj(cls.metadata_config_dict)
        cls.metadata = OpenMetadata(cls.metadata_config)

        # run the ingestion workflow
        ingestion_workflow = Workflow.create(cls.config)
        ingestion_workflow.execute()
        ingestion_workflow.raise_from_status()
        ingestion_workflow.print_status()
        ingestion_workflow.stop()

        # get table fqn
        cls.table_fqn = f"{cls.config['source']['serviceName']}.{cls.database}.{cls.schema}.{cls.table}"

    def test_snowflake_system_metrics(self):
        """run profiler workflow and check the system metrics"""
        config = deepcopy(self.config)
        # update the config to run the profiler workflow
        config["source"]["sourceConfig"]["config"] = {
            "type": "Profiler",
            "generateSampleData": True,
            "timeoutSeconds": 5400,
            "tableFilterPattern": {
                "includes": [self.table],
            },
        }
        config["processor"] = {
            "type": "orm-profiler",
            "config": {},
        }
        profiler_workflow = ProfilerWorkflow.create(config)
        profiler_workflow.execute()
        profiler_workflow.raise_from_status()
        profiler_workflow.print_status()
        profiler_workflow.stop()

        # get latest profile metrics
        profile = self.metadata.get_profile_data(
            self.table_fqn,
            get_beginning_of_day_timestamp_mill(days=1),
            get_end_of_day_timestamp_mill(),
            profile_type=SystemProfile,
        )
        ddl_operations = [prl.operation.value for prl in profile.entities]
        assert set(ddl_operations) == set(["INSERT", "UPDATE", "DELETE"])
@ -21,7 +21,6 @@ from copy import deepcopy
from datetime import datetime, timedelta
from unittest import TestCase

import pytest
from sqlalchemy import Column, DateTime, Integer, String, create_engine
from sqlalchemy.orm import declarative_base

@ -65,7 +65,7 @@ class Row:
        start_time,
        query_text,
    ):
        self.query_id = query_id
        self.QUERY_ID = query_id
        self.query_type = query_type
        self.QUERY_TYPE = query_type
        self.start_time = start_time
        self.START_TIME = start_time
        self.query_text = query_text
        self.QUERY_TEXT = query_text
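The attributes of the `Row` test fixture switch to uppercase in the hunk above, presumably to mirror the uppercase column names returned by Snowflake's query history that the reworked `get_snowflake_system_queries` reads. A small usage sketch under that assumption (`Row` here is the fixture from the conftest above, so this snippet is illustrative rather than standalone):

```python
from datetime import datetime

row = Row(
    query_id="1",
    query_type="INSERT",
    start_time=datetime.now(),
    query_text="INSERT INTO DATABASE.SCHEMA.TABLE1 (col1) VALUES (1)",
)
assert row.QUERY_TYPE == "INSERT"  # attributes are exposed in uppercase after this change
```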
@ -17,12 +17,22 @@ import os
from datetime import datetime
from unittest import TestCase

import pytest
from sqlalchemy import Column, create_engine
from sqlalchemy.orm import declarative_base, sessionmaker
from sqlalchemy.sql.sqltypes import Integer, String

from metadata.profiler.metrics.hybrid.histogram import Histogram
from metadata.utils.profiler_utils import ColumnLike, get_snowflake_system_queries
from metadata.profiler.metrics.system.queries.snowflake import (
    get_snowflake_system_queries,
)
from metadata.profiler.metrics.system.system import recursive_dic
from metadata.utils.profiler_utils import (
    ColumnLike,
    get_identifiers_from_string,
    get_value_from_cache,
    set_cache,
)
from metadata.utils.sqa_utils import handle_array, is_array

from .conftest import Row

@ -145,7 +155,7 @@ def test_column_like_object():
def test_get_snowflake_system_queries():
    """Test get snowflake system queries"""
    row = Row(
        query_id=1,
        query_id="1",
        query_type="INSERT",
        start_time=datetime.now(),
        query_text="INSERT INTO DATABASE.SCHEMA.TABLE1 (col1, col2) VALUES (1, 'a'), (2, 'b')",

@ -153,7 +163,7 @@ def test_get_snowflake_system_queries():

    query_result = get_snowflake_system_queries(row, "DATABASE", "SCHEMA")  # type: ignore
    assert query_result
    assert query_result.query_id == 1
    assert query_result.query_id == "1"
    assert query_result.query_type == "INSERT"
    assert query_result.database_name == "database"
    assert query_result.schema_name == "schema"

@ -169,3 +179,81 @@ def test_get_snowflake_system_queries():
    query_result = get_snowflake_system_queries(row, "DATABASE", "SCHEMA")  # type: ignore

    assert not query_result


@pytest.mark.parametrize(
    "query, expected",
    [
        (
            "INSERT INTO DATABASE.SCHEMA.TABLE1 (col1, col2) VALUES (1, 'a'), (2, 'b')",
            "INSERT",
        ),
        (
            "INSERT OVERWRITE INTO DATABASE.SCHEMA.TABLE1 (col1, col2) VALUES (1, 'a'), (2, 'b')",
            "INSERT",
        ),
        (
            "MERGE INTO DATABASE.SCHEMA.TABLE1 (col1, col2) VALUES (1, 'a'), (2, 'b')",
            "MERGE",
        ),
        ("DELETE FROM DATABASE.SCHEMA.TABLE1 WHERE val = 9999", "MERGE"),
        ("UPDATE DATABASE.SCHEMA.TABLE1 SET col1 = 1 WHERE val = 9999", "UPDATE"),
    ],
)
def test_get_snowflake_system_queries_all_dll(query, expected):
    """test we can get all dml queries
    reference https://docs.snowflake.com/en/sql-reference/sql-dml
    """
    row = Row(
        query_id=1,
        query_type=expected,
        start_time=datetime.now(),
        query_text=query,
    )

    query_result = get_snowflake_system_queries(row, "DATABASE", "SCHEMA")  # type: ignore

    assert query_result
    assert query_result.query_type == expected
    assert query_result.database_name == "database"
    assert query_result.schema_name == "schema"
    assert query_result.table_name == "table1"


@pytest.mark.parametrize(
    "identifier, expected",
    [
        ("DATABASE.SCHEMA.TABLE1", ("DATABASE", "SCHEMA", "TABLE1")),
        ('DATABASE.SCHEMA."TABLE.DOT"', ("DATABASE", "SCHEMA", "TABLE.DOT")),
        ('DATABASE."SCHEMA.DOT".TABLE', ("DATABASE", "SCHEMA.DOT", "TABLE")),
        ('"DATABASE.DOT".SCHEMA.TABLE', ("DATABASE.DOT", "SCHEMA", "TABLE")),
        ('DATABASE."SCHEMA.DOT"."TABLE.DOT"', ("DATABASE", "SCHEMA.DOT", "TABLE.DOT")),
        ('"DATABASE.DOT"."SCHEMA.DOT".TABLE', ("DATABASE.DOT", "SCHEMA.DOT", "TABLE")),
        (
            '"DATABASE.DOT"."SCHEMA.DOT"."TABLE.DOT"',
            ("DATABASE.DOT", "SCHEMA.DOT", "TABLE.DOT"),
        ),
    ],
)
def test_get_identifiers_from_string(identifier, expected):
    """test get identifiers from string"""
    assert get_identifiers_from_string(identifier) == expected
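

# --- Illustrative sketch (editor addition, not part of the module under test) -----------
# The parametrized cases above can be satisfied with a simple regex that keeps
# double-quoted segments (e.g. "SCHEMA.DOT") atomic while splitting on dots.
# The real logic lives in metadata.utils.profiler_utils.get_identifiers_from_string
# and may differ from this approximation.
def _split_identifier_sketch(identifier: str) -> tuple:
    import re

    parts = re.findall(r'"([^"]+)"|([^".]+)', identifier)
    return tuple(quoted or bare for quoted, bare in parts)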


def test_cache_func():
    """test get and set cache"""
    cache_dict = recursive_dic()
    cache_value = [1, 2, 3, 4, 5]
    new_cache_value = [6, 7, 8, 9, 10]

    cache = get_value_from_cache(cache_dict, "key1.key2.key3")
    assert not cache

    set_cache(cache_dict, "key1.key2.key3", cache_value)
    cache = get_value_from_cache(cache_dict, "key1.key2.key3")
    assert cache == cache_value

    # calling set_cache on the same key will reset the cache
    set_cache(cache_dict, "key1.key2.key3", new_cache_value)
    cache = get_value_from_cache(cache_dict, "key1.key2.key3")
    assert cache == new_cache_value
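For orientation, a minimal sketch of the dotted-key cache semantics the test above exercises, assuming `recursive_dic` is a recursively nested defaultdict and that `get_value_from_cache`/`set_cache` walk the dot-separated path. The names below are illustrative stand-ins, not the library implementations.

```python
from collections import defaultdict


def nested_dict_sketch():
    """Infinitely nestable dict, similar in spirit to recursive_dic()."""
    return defaultdict(nested_dict_sketch)


def get_cached_sketch(cache, key: str):
    """Walk a dot-separated key ("key1.key2.key3") and return the cached value, if any."""
    node = cache
    for part in key.split("."):
        node = node.get(part) if isinstance(node, dict) else None
        if node is None:
            return None
    return node


def set_cached_sketch(cache, key: str, value):
    """Set (or reset) the value stored under a dot-separated key."""
    *parents, leaf = key.split(".")
    node = cache
    for part in parents:
        node = node[part]
    node[leaf] = value
```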