mirror of
https://github.com/datahub-project/datahub.git
synced 2025-12-17 04:53:46 +00:00
perf(ingest/redshift): limit copy lineage (#11662)
Co-authored-by: Mayuri Nehate <33225191+mayurinehate@users.noreply.github.com>
This commit is contained in:
parent
8638bf974a
commit
524ef8c6d0
@ -9,6 +9,8 @@ redshift_datetime_format = "%Y-%m-%d %H:%M:%S"
|
|||||||
# We use 290 instead instead of the standard 320, because escape characters can add to the length.
|
# We use 290 instead instead of the standard 320, because escape characters can add to the length.
|
||||||
_QUERY_SEQUENCE_LIMIT = 290
|
_QUERY_SEQUENCE_LIMIT = 290
|
||||||
|
|
||||||
|
_MAX_COPY_ENTRIES_PER_TABLE = 20
|
||||||
|
|
||||||
|
|
||||||
class RedshiftCommonQuery:
|
class RedshiftCommonQuery:
|
||||||
CREATE_TEMP_TABLE_CLAUSE = "create temp table"
|
CREATE_TEMP_TABLE_CLAUSE = "create temp table"
|
||||||
@ -293,28 +295,37 @@ SELECT schemaname as schema_name,
|
|||||||
def list_copy_commands_sql(
|
def list_copy_commands_sql(
|
||||||
db_name: str, start_time: datetime, end_time: datetime
|
db_name: str, start_time: datetime, end_time: datetime
|
||||||
) -> str:
|
) -> str:
|
||||||
return """
|
return """\
|
||||||
select
|
SELECT DISTINCT
|
||||||
distinct
|
target_schema,
|
||||||
"schema" as target_schema,
|
target_table,
|
||||||
"table" as target_table,
|
filename
|
||||||
c.file_name as filename
|
FROM (
|
||||||
from
|
SELECT
|
||||||
SYS_QUERY_DETAIL as si
|
sti."schema" AS target_schema,
|
||||||
join SYS_LOAD_DETAIL as c on
|
sti."table" AS target_table,
|
||||||
si.query_id = c.query_id
|
c.file_name AS filename,
|
||||||
join SVV_TABLE_INFO sti on
|
ROW_NUMBER() OVER (
|
||||||
sti.table_id = si.table_id
|
PARTITION BY sti."schema", sti."table"
|
||||||
where
|
ORDER BY si.start_time DESC
|
||||||
database = '{db_name}'
|
) AS rn
|
||||||
and si.start_time >= '{start_time}'
|
FROM
|
||||||
and si.start_time < '{end_time}'
|
SYS_QUERY_DETAIL AS si
|
||||||
order by target_schema, target_table, si.start_time asc
|
JOIN SYS_LOAD_DETAIL AS c ON si.query_id = c.query_id
|
||||||
""".format(
|
JOIN SVV_TABLE_INFO sti ON sti.table_id = si.table_id
|
||||||
|
WHERE
|
||||||
|
sti.database = '{db_name}'
|
||||||
|
AND si.start_time >= '{start_time}'
|
||||||
|
AND si.start_time < '{end_time}'
|
||||||
|
) subquery
|
||||||
|
WHERE rn <= {_MAX_COPY_ENTRIES_PER_TABLE}
|
||||||
|
ORDER BY target_schema, target_table, filename
|
||||||
|
""".format(
|
||||||
# We need the original database name for filtering
|
# We need the original database name for filtering
|
||||||
db_name=db_name,
|
db_name=db_name,
|
||||||
start_time=start_time.strftime(redshift_datetime_format),
|
start_time=start_time.strftime(redshift_datetime_format),
|
||||||
end_time=end_time.strftime(redshift_datetime_format),
|
end_time=end_time.strftime(redshift_datetime_format),
|
||||||
|
_MAX_COPY_ENTRIES_PER_TABLE=_MAX_COPY_ENTRIES_PER_TABLE,
|
||||||
)
|
)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user