Improve Redshift Usage query (#881)

* Improve Redshift Usage query

* Improve Redshift Usage query

* Improve Redshift Usage query
This commit is contained in:
Sriharsha Chintalapani 2021-10-20 21:50:02 -07:00 committed by GitHub
parent 39a7b3e8c6
commit 29c87f77d0
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 33 additions and 46 deletions

View File

@ -27,28 +27,20 @@ class TableQuery(JsonSerializable):
def __init__( def __init__(
self, self,
query: str, query: str,
label: str, user_name: str,
userid: int,
xid: int,
pid: int,
starttime: str, starttime: str,
endtime: str, endtime: str,
analysis_date: str, analysis_date: str,
duration: int,
database: str, database: str,
aborted: bool, aborted: bool,
sql: str, sql: str,
) -> None: ) -> None:
""" """ """ """
self.query = query self.query = query
self.label = label self.user_name = user_name
self.userid = userid
self.xid = xid
self.pid = pid
self.starttime = starttime self.starttime = starttime
self.endtime = endtime self.endtime = endtime
self.analysis_date = analysis_date self.analysis_date = analysis_date
self.duration = duration
self.database = database self.database = database
self.aborted = aborted self.aborted = aborted
self.sql = sql self.sql = sql

View File

@ -15,6 +15,7 @@
import datetime import datetime
import logging import logging
import traceback
from typing import Optional from typing import Optional
from sql_metadata import Parser from sql_metadata import Parser

View File

@ -33,29 +33,26 @@ logger = logging.getLogger(__name__)
class RedshiftUsageSource(Source): class RedshiftUsageSource(Source):
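# SELECT statement against Redshift system tables (stl_scan, svv_table_info, stl_query, svl_user_info) to extract per-query table usage # SELECT statement against Redshift system tables (stl_scan, svv_table_info, stl_query, svl_user_info) to extract per-query table usage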
SQL_STATEMENT = """ SQL_STATEMENT = """
WITH query_sql AS ( SELECT DISTINCT ss.userid,
SELECT ss.query,
query, sui.usename,
LISTAGG(text) WITHIN GROUP (ORDER BY sequence) AS sql ss.tbl,
FROM stl_querytext sq.querytxt,
GROUP BY 1 sti.database,
) sti.schema,
sti.table,
SELECT sq.starttime,
q.query, q.label, userid, xid, pid, starttime, endtime, sq.endtime,
DATEDIFF(milliseconds, starttime, endtime) AS duration, sq.aborted
TRIM(database) AS database, FROM stl_scan ss
'{start_date}' as analysis_date, JOIN svv_table_info sti ON ss.tbl = sti.table_id
(CASE aborted WHEN 1 THEN TRUE ELSE FALSE END) AS aborted, JOIN stl_query sq ON ss.query = sq.query
sql JOIN svl_user_info sui ON sq.userid = sui.usesysid
FROM WHERE ss.starttime >= '{start_time}'
stl_query q JOIN query_sql qs ON (q.query = qs.query) AND ss.starttime < '{end_time}'
WHERE AND sq.aborted = 0
endtime between '{start_date}' and '{end_date}' ORDER BY ss.endtime DESC;
{where_clause} """
ORDER BY starttime;
"""
# CONFIG KEYS # CONFIG KEYS
WHERE_CLAUSE_SUFFIX_KEY = "where_clause" WHERE_CLAUSE_SUFFIX_KEY = "where_clause"
CLUSTER_SOURCE = "cluster_source" CLUSTER_SOURCE = "cluster_source"
@ -69,8 +66,9 @@ class RedshiftUsageSource(Source):
super().__init__(ctx) super().__init__(ctx)
start, end = get_start_and_end(config.duration) start, end = get_start_and_end(config.duration)
self.sql_stmt = RedshiftUsageSource.SQL_STATEMENT.format( self.sql_stmt = RedshiftUsageSource.SQL_STATEMENT.format(
where_clause=config.where_clause, start_date=start, end_date=end start_time=start, end_time=end
) )
self.analysis_date = start
self.alchemy_helper = SQLAlchemyHelper( self.alchemy_helper = SQLAlchemyHelper(
config, metadata_config, ctx, "Redshift", self.sql_stmt config, metadata_config, ctx, "Redshift", self.sql_stmt
) )
@ -103,18 +101,14 @@ class RedshiftUsageSource(Source):
""" """
for row in self._get_raw_extract_iter(): for row in self._get_raw_extract_iter():
tq = TableQuery( tq = TableQuery(
row["query"], query=row["query"],
row["label"], user_name=row["usename"],
row["userid"], starttime=str(row["starttime"]),
row["xid"], endtime=str(row["endtime"]),
row["pid"], analysis_date=str(self.analysis_date),
str(row["starttime"]), database=row["database"],
str(row["endtime"]), aborted=row["aborted"],
str(row["analysis_date"]), sql=row["querytxt"],
row["duration"],
row["database"],
row["aborted"],
row["sql"],
) )
yield tq yield tq