diff --git a/ingestion/src/metadata/utils/sql_queries.py b/ingestion/src/metadata/utils/sql_queries.py index ad7be0c88c3..56c1d2bdb44 100644 --- a/ingestion/src/metadata/utils/sql_queries.py +++ b/ingestion/src/metadata/utils/sql_queries.py @@ -1,26 +1,61 @@ import textwrap REDSHIFT_SQL_STATEMENT = """ - SELECT DISTINCT ss.userid, - ss.query query_type, - sui.usename user_name, - ss.tbl, - sq.querytxt query_text, - sti.database database_name, - sti.schema schema_name, - sti.table, - sq.starttime start_time, - sq.endtime end_time, - sq.aborted aborted - FROM stl_scan ss - JOIN svv_table_info sti ON ss.tbl = sti.table_id - JOIN stl_query sq ON ss.query = sq.query - JOIN svl_user_info sui ON sq.userid = sui.usesysid - WHERE ss.starttime >= '{start_time}' - AND ss.starttime < '{end_time}' - AND sq.aborted = 0 - ORDER BY ss.endtime DESC; - """ + WITH + queries AS ( + SELECT * + FROM pg_catalog.stl_query + WHERE userid > 1 + -- Filter out all automated & cursor queries + AND querytxt NOT ILIKE 'fetch %' + AND querytxt NOT ILIKE 'padb_fetch_sample: %' + AND querytxt NOT ILIKE 'Undoing % transactions on table % with current xid%' + AND aborted = 0 + AND starttime >= '{start_time}' + AND starttime < '{end_time}' + ), + full_queries AS ( + SELECT + query, + LISTAGG(CASE WHEN LEN(RTRIM(text)) = 0 THEN text ELSE RTRIM(text) END, '') + WITHIN GROUP (ORDER BY sequence) AS query_text + FROM pg_catalog.stl_querytext + WHERE sequence < 327 -- each chunk contains up to 200, RS has a maximum str length of 65535. + GROUP BY query + ), + raw_scans AS ( + -- We have one row per table per slice so we need to get rid of the dupes + SELECT distinct query, tbl + FROM pg_catalog.stl_scan + ), + scans AS ( + SELECT DISTINCT + query, + sti.database AS database_name, + sti.schema AS schema_name + FROM raw_scans AS s + INNER JOIN pg_catalog.svv_table_info AS sti + ON (s.tbl)::oid = sti.table_id + ) + SELECT DISTINCT + q.userid, + s.query AS query_id, + RTRIM(u.usename) AS user_name, + fq.query_text, + s.database_name, + s.schema_name, + q.starttime AS start_time, + q.endtime AS end_time, + q.aborted AS aborted + FROM scans AS s + INNER JOIN queries AS q + ON s.query = q.query + INNER JOIN full_queries AS fq + ON s.query = fq.query + INNER JOIN pg_catalog.pg_user AS u + ON q.userid = u.usesysid + ORDER BY q.endtime DESC +""" REDSHIFT_GET_ALL_RELATION_INFO = """ SELECT