-- NOTE(review): tail of the column_to_table_mapping population statement that
-- begins above this chunk — kept verbatim.
FROM profiler_data_time_series pdts
WHERE pdts.extension = 'table.columnProfile'
  AND CHAR_LENGTH(pdts.entityFQNHash) - CHAR_LENGTH(REPLACE(pdts.entityFQNHash, '.', '')) >= 4;

-- Pre-compute the entityReference JSON once per table (instead of once per
-- profile row) so the batched UPDATE below only does hash-key lookups.
CREATE TEMPORARY TABLE IF NOT EXISTS table_entity_references (
    table_hash VARCHAR(768) PRIMARY KEY,
    entity_reference JSON
    -- NOTE(review): removed `INDEX idx_hash (table_hash)` — it duplicated the
    -- PRIMARY KEY on the same single column and only added write overhead.
) ENGINE=InnoDB;

-- Materialize one entityReference per table involved in column profiles.
INSERT INTO table_entity_references (table_hash, entity_reference)
SELECT DISTINCT
    ctm.table_hash,
    JSON_OBJECT(
        'id', te.json -> '$.id',
        'type', 'table',
        'fullyQualifiedName', te.json -> '$.fullyQualifiedName',
        'name', te.name
    ) AS entity_reference
FROM column_to_table_mapping ctm
INNER JOIN table_entity te ON ctm.table_hash = te.fqnHash;

DELIMITER $$
-- Rewrites every 'table.columnProfile' row of profiler_data_time_series into
-- the new envelope JSON (id / entityReference / timestamp / profileData /
-- profileType), processing in 15-day timestamp windows to bound UPDATE size.
CREATE PROCEDURE MigrateColumnProfiles()
BEGIN
    DECLARE current_start_timestamp BIGINT;
    DECLARE current_end_timestamp BIGINT;
    DECLARE max_timestamp BIGINT;
    DECLARE batch_interval_ms BIGINT DEFAULT 1296000000; -- 15 days in ms (15 * 24 * 60 * 60 * 1000)
    -- FIX(review): dropped `rows_updated` — it was declared DEFAULT 1, tested
    -- in the loop condition, and never assigned, so the `rows_updated > 0`
    -- guard was constantly true (dead code). It is NOT replaced with
    -- ROW_COUNT(): an empty 15-day window would return 0 and terminate the
    -- migration early even though later windows still have rows.
    DECLARE total_processed BIGINT DEFAULT 0;
    DECLARE batch_count INT DEFAULT 0;

    -- Session tuning for the large joins/updates (4 GiB tmp tables, 512 MiB
    -- sort buffer). Session-scoped only; resets when the connection closes.
    SET SESSION tmp_table_size = 4294967296;
    SET SESSION max_heap_table_size = 4294967296;
    SET SESSION sort_buffer_size = 536870912;

    -- Full timestamp range of the rows to migrate. If no rows exist, both
    -- variables stay NULL, the WHILE condition is UNKNOWN, and the loop is
    -- skipped entirely.
    SELECT
        MIN(timestamp),
        MAX(timestamp)
    INTO current_start_timestamp, max_timestamp
    FROM profiler_data_time_series
    WHERE extension = 'table.columnProfile';

    -- Temporary covering index for the (extension, timestamp) range scans;
    -- dropped after the loop. NOTE(review): errors if it already exists from
    -- an aborted earlier run — confirm re-run procedure drops it first.
    CREATE INDEX idx_pdts_timestamp_ext ON profiler_data_time_series(extension, timestamp);

    -- Process in half-open [start, end) timestamp windows.
    migration_loop: WHILE current_start_timestamp <= max_timestamp DO
        SET current_end_timestamp = current_start_timestamp + batch_interval_ms;
        SET batch_count = batch_count + 1;

        UPDATE profiler_data_time_series pdts
        INNER JOIN column_to_table_mapping ctm ON pdts.entityFQNHash = ctm.column_hash
        INNER JOIN table_entity_references ter ON ctm.table_hash = ter.table_hash
        SET pdts.json = JSON_OBJECT(
            'id', UUID(),
            'entityReference', ter.entity_reference,
            'timestamp', pdts.timestamp,
            'profileData', pdts.json,
            'profileType', 'column'
        )
        WHERE pdts.extension = 'table.columnProfile'
          AND pdts.timestamp >= current_start_timestamp
          AND pdts.timestamp < current_end_timestamp
          -- Already-migrated rows carry a '$.profileData' key, so re-runs of
          -- this script skip them (idempotent).
          AND pdts.json->>'$.profileData' IS NULL;

        -- FIX(review): the original never incremented total_processed, so the
        -- 50M-row safety limit below could never trigger and the reported
        -- total was always 0. ROW_COUNT() must be read immediately after the
        -- UPDATE, before any other statement.
        SET total_processed = total_processed + ROW_COUNT();

        -- Advance to the next timestamp window.
        SET current_start_timestamp = current_end_timestamp;

        -- Safety valve against runaway migrations.
        IF total_processed > 50000000 OR batch_count > 1000 THEN
            SELECT CONCAT('Safety limit reached. Total processed: ', total_processed,
                          ', Batches: ', batch_count) AS safety_stop;
            LEAVE migration_loop;
        END IF;

    END WHILE;

    DROP INDEX idx_pdts_timestamp_ext ON profiler_data_time_series;

END$$
DELIMITER ;

-- Execute the migration
CALL MigrateColumnProfiles();

-- Clean up the procedure
DROP PROCEDURE MigrateColumnProfiles;

-- Clean up temporary tables
DROP TEMPORARY TABLE IF EXISTS column_to_table_mapping;
DROP TEMPORARY TABLE IF EXISTS table_entity_references;

-- Drop temporary indexes after migration
DROP INDEX idx_pdts_entityFQNHash ON profiler_data_time_series;