fix(ingest/redshift): reduce sequence limit for LISTAGG (#11621)

Co-authored-by: treff7es <treff7es@gmail.com>
Co-authored-by: Aseem Bansal <asmbansal2@gmail.com>
This commit is contained in:
Harshal Sheth 2024-10-17 10:08:37 -07:00 committed by GitHub
parent 6b09346ca5
commit 68cd17b34e
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 143 additions and 178 deletions

View File

@ -4,6 +4,12 @@ from typing import List
redshift_datetime_format = "%Y-%m-%d %H:%M:%S" redshift_datetime_format = "%Y-%m-%d %H:%M:%S"
# See https://stackoverflow.com/questions/72770890/redshift-result-size-exceeds-listagg-limit-on-svl-statementtext
# for why we need to limit the size of the query text.
# We use 290 instead of the standard 320, because escape characters can add to the length.
_QUERY_SEQUENCE_LIMIT = 290
class RedshiftCommonQuery: class RedshiftCommonQuery:
CREATE_TEMP_TABLE_CLAUSE = "create temp table" CREATE_TEMP_TABLE_CLAUSE = "create temp table"
CREATE_TEMPORARY_TABLE_CLAUSE = "create temporary table" CREATE_TEMPORARY_TABLE_CLAUSE = "create temporary table"
@ -487,9 +493,8 @@ class RedshiftProvisionedQuery(RedshiftCommonQuery):
def list_insert_create_queries_sql( def list_insert_create_queries_sql(
db_name: str, start_time: datetime, end_time: datetime db_name: str, start_time: datetime, end_time: datetime
) -> str: ) -> str:
return """ return """\
with query_txt as with query_txt as (
(
select select
query, query,
pid, pid,
@ -497,10 +502,9 @@ class RedshiftProvisionedQuery(RedshiftCommonQuery):
when LEN(RTRIM(text)) = 0 then text when LEN(RTRIM(text)) = 0 then text
else RTRIM(text) else RTRIM(text)
end) within group ( end) within group (
order by order by sequence
sequence) as ddl ) as ddl
from from (
(
select select
query, query,
pid, pid,
@ -509,15 +513,15 @@ class RedshiftProvisionedQuery(RedshiftCommonQuery):
from from
STL_QUERYTEXT STL_QUERYTEXT
where where
sequence < 320 sequence < {_QUERY_SEQUENCE_LIMIT}
order by order by
sequence sequence
) )
group by group by
query, query,
pid pid
) )
select select
distinct tbl as target_table_id, distinct tbl as target_table_id,
sti.schema as target_schema, sti.schema as target_schema,
sti.table as target_table, sti.table as target_table,
@ -527,23 +531,23 @@ class RedshiftProvisionedQuery(RedshiftCommonQuery):
sq.query as query_id, sq.query as query_id,
min(si.starttime) as timestamp, min(si.starttime) as timestamp,
ANY_VALUE(pid) as session_id ANY_VALUE(pid) as session_id
from from
stl_insert as si stl_insert as si
left join SVV_TABLE_INFO sti on left join SVV_TABLE_INFO sti on
sti.table_id = tbl sti.table_id = tbl
left join svl_user_info sui on left join svl_user_info sui on
si.userid = sui.usesysid si.userid = sui.usesysid
left join query_txt sq on left join query_txt sq on
si.query = sq.query si.query = sq.query
left join stl_load_commits slc on left join stl_load_commits slc on
slc.query = si.query slc.query = si.query
where where
sui.usename <> 'rdsdb' sui.usename <> 'rdsdb'
and cluster = '{db_name}' and cluster = '{db_name}'
and slc.query IS NULL and slc.query IS NULL
and si.starttime >= '{start_time}' and si.starttime >= '{start_time}'
and si.starttime < '{end_time}' and si.starttime < '{end_time}'
group by group by
target_table_id, target_table_id,
target_schema, target_schema,
target_table, target_table,
@ -552,6 +556,7 @@ class RedshiftProvisionedQuery(RedshiftCommonQuery):
ddl, ddl,
sq.query sq.query
""".format( """.format(
_QUERY_SEQUENCE_LIMIT=_QUERY_SEQUENCE_LIMIT,
# We need the original database name for filtering # We need the original database name for filtering
db_name=db_name, db_name=db_name,
start_time=start_time.strftime(redshift_datetime_format), start_time=start_time.strftime(redshift_datetime_format),
@ -564,11 +569,11 @@ class RedshiftProvisionedQuery(RedshiftCommonQuery):
end_time_str: str = end_time.strftime(redshift_datetime_format) end_time_str: str = end_time.strftime(redshift_datetime_format)
return rf"""-- DataHub Redshift Source temp table DDL query return rf"""\
select -- DataHub Redshift Source temp table DDL query
select
* *
from from (
(
select select
session_id, session_id,
transaction_id, transaction_id,
@ -580,8 +585,7 @@ class RedshiftProvisionedQuery(RedshiftCommonQuery):
partition by session_id, TRIM(query_text) partition by session_id, TRIM(query_text)
order by start_time desc order by start_time desc
) rn ) rn
from from (
(
select select
pid as session_id, pid as session_id,
xid as transaction_id, xid as transaction_id,
@ -589,8 +593,7 @@ class RedshiftProvisionedQuery(RedshiftCommonQuery):
type, type,
query_text, query_text,
userid userid
from from (
(
select select
starttime, starttime,
pid, pid,
@ -610,8 +613,7 @@ class RedshiftProvisionedQuery(RedshiftCommonQuery):
type in ('DDL', 'QUERY') type in ('DDL', 'QUERY')
AND starttime >= '{start_time_str}' AND starttime >= '{start_time_str}'
AND starttime < '{end_time_str}' AND starttime < '{end_time_str}'
-- See https://stackoverflow.com/questions/72770890/redshift-result-size-exceeds-listagg-limit-on-svl-statementtext AND sequence < {_QUERY_SEQUENCE_LIMIT}
AND sequence < 320
group by group by
starttime, starttime,
pid, pid,
@ -624,7 +626,8 @@ class RedshiftProvisionedQuery(RedshiftCommonQuery):
xid, xid,
type, type,
userid userid
asc) asc
)
where where
type in ('DDL', 'QUERY') type in ('DDL', 'QUERY')
) )
@ -639,8 +642,8 @@ class RedshiftProvisionedQuery(RedshiftCommonQuery):
-- We need to filter out our query and it was not possible earlier when we did not have any comment in the query -- We need to filter out our query and it was not possible earlier when we did not have any comment in the query
and query_text not ilike '%https://stackoverflow.com/questions/72770890/redshift-result-size-exceeds-listagg-limit-on-svl-statementtext%' and query_text not ilike '%https://stackoverflow.com/questions/72770890/redshift-result-size-exceeds-listagg-limit-on-svl-statementtext%'
) )
where where
rn = 1 rn = 1
""" """

View File

@ -56,45 +56,7 @@ def mock_stl_insert_table_cursor(cursor: MagicMock) -> None:
query_vs_cursor_mocker = { query_vs_cursor_mocker = {
( (
"-- DataHub Redshift Source temp table DDL query\n select\n *\n " "\\\n-- DataHub Redshift Source temp table DDL query\nselect\n *\nfrom (\n select\n session_id,\n transaction_id,\n start_time,\n userid,\n REGEXP_REPLACE(REGEXP_SUBSTR(REGEXP_REPLACE(query_text,'\\\\\\\\n','\\\\n'), '(CREATE(?:[\\\\n\\\\s\\\\t]+(?:temp|temporary))?(?:[\\\\n\\\\s\\\\t]+)table(?:[\\\\n\\\\s\\\\t]+)[^\\\\n\\\\s\\\\t()-]+)', 0, 1, 'ipe'),'[\\\\n\\\\s\\\\t]+',' ',1,'p') as create_command,\n query_text,\n row_number() over (\n partition by session_id, TRIM(query_text)\n order by start_time desc\n ) rn\n from (\n select\n pid as session_id,\n xid as transaction_id,\n starttime as start_time,\n type,\n query_text,\n userid\n from (\n select\n starttime,\n pid,\n xid,\n type,\n userid,\n LISTAGG(case\n when LEN(RTRIM(text)) = 0 then text\n else RTRIM(text)\n end,\n '') within group (\n order by sequence\n ) as query_text\n from\n SVL_STATEMENTTEXT\n where\n type in ('DDL', 'QUERY')\n AND starttime >= '2024-01-01 12:00:00'\n AND starttime < '2024-01-10 12:00:00'\n AND sequence < 290\n group by\n starttime,\n pid,\n xid,\n type,\n userid\n order by\n starttime,\n pid,\n xid,\n type,\n userid\n asc\n )\n where\n type in ('DDL', 'QUERY')\n )\n where\n (create_command ilike 'create temp table %'\n or create_command ilike 'create temporary table %'\n -- we want to get all the create table statements and not just temp tables if non temp table is created and dropped in the same transaction\n or create_command ilike 'create table %')\n -- Redshift creates temp tables with the following names: volt_tt_%. 
We need to filter them out.\n and query_text not ilike 'CREATE TEMP TABLE volt_tt_%'\n and create_command not like 'CREATE TEMP TABLE volt_tt_'\n -- We need to filter out our query and it was not possible earlier when we did not have any comment in the query\n and query_text not ilike '%https://stackoverflow.com/questions/72770890/redshift-result-size-exceeds-listagg-limit-on-svl-statementtext%'\n\n)\nwhere\n rn = 1\n "
"from\n (\n select\n session_id,\n "
" transaction_id,\n start_time,\n userid,\n "
" REGEXP_REPLACE(REGEXP_SUBSTR(REGEXP_REPLACE(query_text,'\\\\\\\\n','\\\\n'), '(CREATE(?:["
"\\\\n\\\\s\\\\t]+(?:temp|temporary))?(?:[\\\\n\\\\s\\\\t]+)table(?:[\\\\n\\\\s\\\\t]+)["
"^\\\\n\\\\s\\\\t()-]+)', 0, 1, 'ipe'),'[\\\\n\\\\s\\\\t]+',' ',1,'p') as create_command,\n "
" query_text,\n row_number() over (\n partition "
"by session_id, TRIM(query_text)\n order by start_time desc\n ) rn\n "
" from\n (\n select\n pid "
"as session_id,\n xid as transaction_id,\n starttime "
"as start_time,\n type,\n query_text,\n "
" userid\n from\n (\n "
"select\n starttime,\n pid,\n "
" xid,\n type,\n userid,\n "
" LISTAGG(case\n when LEN(RTRIM(text)) = 0 then text\n "
" else RTRIM(text)\n end,\n "
" '') within group (\n order by sequence\n "
" ) as query_text\n from\n "
"SVL_STATEMENTTEXT\n where\n type in ('DDL', "
"'QUERY')\n AND starttime >= '2024-01-01 12:00:00'\n "
" AND starttime < '2024-01-10 12:00:00'\n -- See "
"https://stackoverflow.com/questions/72770890/redshift-result-size-exceeds-listagg-limit-on-svl"
"-statementtext\n AND sequence < 320\n group by\n "
" starttime,\n pid,\n "
"xid,\n type,\n userid\n "
" order by\n starttime,\n pid,\n "
" xid,\n type,\n userid\n "
" asc)\n where\n type in ('DDL', "
"'QUERY')\n )\n where\n (create_command ilike "
"'create temp table %'\n or create_command ilike 'create temporary table %'\n "
" -- we want to get all the create table statements and not just temp tables "
"if non temp table is created and dropped in the same transaction\n or "
"create_command ilike 'create table %')\n -- Redshift creates temp tables with "
"the following names: volt_tt_%. We need to filter them out.\n and query_text not "
"ilike 'CREATE TEMP TABLE volt_tt_%'\n and create_command not like 'CREATE TEMP "
"TABLE volt_tt_'\n -- We need to filter out our query and it was not possible "
"earlier when we did not have any comment in the query\n and query_text not ilike "
"'%https://stackoverflow.com/questions/72770890/redshift-result-size-exceeds-listagg-limit-on-svl"
"-statementtext%'\n\n )\n where\n rn = 1\n "
): mock_temp_table_cursor, ): mock_temp_table_cursor,
"select * from test_collapse_temp_lineage": mock_stl_insert_table_cursor, "select * from test_collapse_temp_lineage": mock_stl_insert_table_cursor,
} }