perf(ingest/redshift): limit copy lineage (#11662)

Co-authored-by: Mayuri Nehate <33225191+mayurinehate@users.noreply.github.com>
This commit is contained in:
Harshal Sheth 2024-11-20 00:29:05 -08:00 committed by GitHub
parent 8638bf974a
commit 524ef8c6d0
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -9,6 +9,8 @@ redshift_datetime_format = "%Y-%m-%d %H:%M:%S"
# We use 290 instead instead of the standard 320, because escape characters can add to the length. # We use 290 instead instead of the standard 320, because escape characters can add to the length.
_QUERY_SEQUENCE_LIMIT = 290 _QUERY_SEQUENCE_LIMIT = 290
_MAX_COPY_ENTRIES_PER_TABLE = 20
class RedshiftCommonQuery: class RedshiftCommonQuery:
CREATE_TEMP_TABLE_CLAUSE = "create temp table" CREATE_TEMP_TABLE_CLAUSE = "create temp table"
@ -293,28 +295,37 @@ SELECT schemaname as schema_name,
def list_copy_commands_sql( def list_copy_commands_sql(
db_name: str, start_time: datetime, end_time: datetime db_name: str, start_time: datetime, end_time: datetime
) -> str: ) -> str:
return """ return """\
select SELECT DISTINCT
distinct target_schema,
"schema" as target_schema, target_table,
"table" as target_table, filename
c.file_name as filename FROM (
from SELECT
SYS_QUERY_DETAIL as si sti."schema" AS target_schema,
join SYS_LOAD_DETAIL as c on sti."table" AS target_table,
si.query_id = c.query_id c.file_name AS filename,
join SVV_TABLE_INFO sti on ROW_NUMBER() OVER (
sti.table_id = si.table_id PARTITION BY sti."schema", sti."table"
where ORDER BY si.start_time DESC
database = '{db_name}' ) AS rn
and si.start_time >= '{start_time}' FROM
and si.start_time < '{end_time}' SYS_QUERY_DETAIL AS si
order by target_schema, target_table, si.start_time asc JOIN SYS_LOAD_DETAIL AS c ON si.query_id = c.query_id
JOIN SVV_TABLE_INFO sti ON sti.table_id = si.table_id
WHERE
sti.database = '{db_name}'
AND si.start_time >= '{start_time}'
AND si.start_time < '{end_time}'
) subquery
WHERE rn <= {_MAX_COPY_ENTRIES_PER_TABLE}
ORDER BY target_schema, target_table, filename
""".format( """.format(
# We need the original database name for filtering # We need the original database name for filtering
db_name=db_name, db_name=db_name,
start_time=start_time.strftime(redshift_datetime_format), start_time=start_time.strftime(redshift_datetime_format),
end_time=end_time.strftime(redshift_datetime_format), end_time=end_time.strftime(redshift_datetime_format),
_MAX_COPY_ENTRIES_PER_TABLE=_MAX_COPY_ENTRIES_PER_TABLE,
) )
@staticmethod @staticmethod