feat(catalog): report total table files size in dataset profile (#14312)

This commit is contained in:
Chakru 2025-08-05 22:00:30 +05:30 committed by GitHub
parent 38cd4d02d0
commit 59384dfc9b
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 72 additions and 0 deletions

View File

@ -352,6 +352,10 @@ class TableOpsDelegate extends TableOrViewOpsDelegate<TableMetadata> {
if (totalRecordsStr != null) {
dataSetProfile.setRowCount(Long.parseLong(totalRecordsStr));
}
String totalFileSizeStr = currentSnapshot.summary().get(SnapshotSummary.TOTAL_FILE_SIZE_PROP);
if (totalFileSizeStr != null) {
dataSetProfile.setSizeInBytes(Long.parseLong(totalFileSizeStr));
}
}
return dataSetProfile;

View File

@ -32,9 +32,13 @@ import io.datahubproject.metadata.context.ActorContext;
import io.datahubproject.metadata.context.OperationContext;
import io.datahubproject.schematron.converters.avro.AvroSchemaConverter;
import java.time.Instant;
import java.util.HashMap;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import org.apache.iceberg.Schema;
import org.apache.iceberg.Snapshot;
import org.apache.iceberg.SnapshotSummary;
import org.apache.iceberg.TableMetadata;
import org.apache.iceberg.TableMetadataParser;
import org.apache.iceberg.avro.AvroSchemaUtil;
@ -426,4 +430,68 @@ public class TableOpsDelegateTest {
when(mockWarehouse.getIcebergMetadata(identifier)).thenReturn(Optional.empty());
assertNull(tableDelegate.refresh());
}
/**
 * Checks that {@code getDataSetProfile} copies the snapshot summary's total file size into the
 * profile's size-in-bytes field, alongside the row count and column count.
 */
@Test
public void testGetDataSetProfileWithTotalFileSize() {
  // Snapshot whose summary carries both a record count and a total file size.
  Map<String, String> summaryProps = new HashMap<>();
  summaryProps.put(SnapshotSummary.TOTAL_RECORDS_PROP, "1000");
  summaryProps.put(SnapshotSummary.TOTAL_FILE_SIZE_PROP, "5242880"); // 5 * 1024 * 1024 bytes
  Snapshot snapshotStub = mock(Snapshot.class);
  when(snapshotStub.summary()).thenReturn(summaryProps);

  // Table metadata with a two-field schema and the snapshot above as the current one.
  Schema twoFieldSchema =
      new Schema(
          Types.NestedField.required(1, "id", Types.LongType.get()),
          Types.NestedField.optional(2, "data", Types.StringType.get()));
  TableMetadata metadataStub = mock(TableMetadata.class);
  when(metadataStub.schema()).thenReturn(twoFieldSchema);
  when(metadataStub.currentSnapshot()).thenReturn(snapshotStub);

  // A delegate built through its constructor so the real getDataSetProfile logic runs.
  TableOpsDelegate delegateUnderTest =
      new TableOpsDelegate(
          mockWarehouse, identifier, mockEntityService, mockOperationContext, mockFileIOFactory);
  DatasetProfile profile = delegateUnderTest.getDataSetProfile(metadataStub);

  // Column count is expected to equal the number of schema fields; the rest come from the summary.
  assertEquals(profile.getColumnCount().longValue(), 2L);
  assertEquals(profile.getRowCount().longValue(), 1000L);
  assertEquals(profile.getSizeInBytes().longValue(), 5242880L);
}
/**
 * Checks that {@code getDataSetProfile} leaves the profile's size-in-bytes unset when the snapshot
 * summary has no total-file-size entry, while still reporting row and column counts.
 */
@Test
public void testGetDataSetProfileWithoutTotalFileSize() {
  // Snapshot whose summary holds only a record count; the file-size key is deliberately absent.
  Map<String, String> summaryProps = new HashMap<>();
  summaryProps.put(SnapshotSummary.TOTAL_RECORDS_PROP, "500");
  Snapshot snapshotStub = mock(Snapshot.class);
  when(snapshotStub.summary()).thenReturn(summaryProps);

  // Table metadata with a two-field schema and the snapshot above as the current one.
  Schema twoFieldSchema =
      new Schema(
          Types.NestedField.required(1, "id", Types.LongType.get()),
          Types.NestedField.optional(2, "data", Types.StringType.get()));
  TableMetadata metadataStub = mock(TableMetadata.class);
  when(metadataStub.schema()).thenReturn(twoFieldSchema);
  when(metadataStub.currentSnapshot()).thenReturn(snapshotStub);

  // A delegate built through its constructor so the real getDataSetProfile logic runs.
  TableOpsDelegate delegateUnderTest =
      new TableOpsDelegate(
          mockWarehouse, identifier, mockEntityService, mockOperationContext, mockFileIOFactory);
  DatasetProfile profile = delegateUnderTest.getDataSetProfile(metadataStub);

  assertEquals(profile.getColumnCount().longValue(), 2L);
  assertEquals(profile.getRowCount().longValue(), 500L);
  // With no file-size property in the summary, the size field must stay null.
  assertNull(profile.getSizeInBytes());
}
}