	feat(ingest): ensure payload size constraints for queryProperties, querySubjects and upstreamLineage aspects (#14919)
Co-authored-by: Claude <noreply@anthropic.com>
Author: Sergio Gómez Villamor
Parent: 40b51ac2da
Commit: e847b58472
					
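All three new checks in this patch share one technique: serialize each candidate item, add up the JSON sizes, and accept items greedily in priority order until one would push the aspect past the payload budget; from that point on, everything of equal or lower priority is dropped. A minimal standalone sketch of the idea (the function name, item shape and the 15MB budget are illustrative assumptions, not the patch's API):

import json
from typing import Any, Dict, List, Tuple

# Assumed budget for illustration, mirroring the 15MB INGEST_MAX_PAYLOAD_BYTES default.
PAYLOAD_BUDGET = 15 * 1024 * 1024


def greedy_trim(items: List[Dict[str, Any]], budget: int = PAYLOAD_BUDGET) -> Tuple[List[Dict[str, Any]], int]:
    """Accept items in the given priority order until one no longer fits.

    Once a single item is skipped, everything after it is dropped too,
    mirroring the first_skip_done flag in the patch.
    """
    accepted: List[Dict[str, Any]] = []
    total = 0
    for item in items:
        size = len(json.dumps(item))
        if total + size < budget:
            accepted.append(item)
            total += size
        else:
            break
    return accepted, len(items) - len(accepted)


if __name__ == "__main__":
    # Ten ~4MB subjects against a 15MB budget: only the first three fit.
    subjects = [{"entity": f"urn:li:dataset:table_{i}", "pad": "x" * 4_000_000} for i in range(10)]
    kept, skipped = greedy_trim(subjects)
    print(f"kept={len(kept)} skipped={skipped}")

The processor below applies this idea per aspect type with different priority tiers: table-level query subjects before column-level ones, and upstreams before DATASET, FIELD_SET and NONE fine-grained lineages.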
| @@ -1,5 +1,6 @@ | |||||||
| import json | import json | ||||||
| import logging | import logging | ||||||
|  | import os | ||||||
| from typing import TYPE_CHECKING, Iterable, List | from typing import TYPE_CHECKING, Iterable, List | ||||||
| 
 | 
 | ||||||
| from datahub.emitter.rest_emitter import INGEST_MAX_PAYLOAD_BYTES | from datahub.emitter.rest_emitter import INGEST_MAX_PAYLOAD_BYTES | ||||||
| @@ -7,15 +8,36 @@ from datahub.emitter.serialization_helper import pre_json_transform | |||||||
| from datahub.ingestion.api.workunit import MetadataWorkUnit | from datahub.ingestion.api.workunit import MetadataWorkUnit | ||||||
| from datahub.metadata.schema_classes import ( | from datahub.metadata.schema_classes import ( | ||||||
|     DatasetProfileClass, |     DatasetProfileClass, | ||||||
|  |     QueryPropertiesClass, | ||||||
|  |     QuerySubjectsClass, | ||||||
|     SchemaFieldClass, |     SchemaFieldClass, | ||||||
|     SchemaMetadataClass, |     SchemaMetadataClass, | ||||||
|  |     UpstreamLineageClass, | ||||||
| ) | ) | ||||||
| 
 | 
 | ||||||
| if TYPE_CHECKING: | if TYPE_CHECKING: | ||||||
|     from datahub.ingestion.api.source import SourceReport |     from datahub.ingestion.api.source import SourceReport | ||||||
| 
 | 
 | ||||||
|  | 
 | ||||||
|  | # TODO: ordering | ||||||
|  | # When we trim collections of data (e.g. fields in schema, upstream lineage, query subjects), those collections are | ||||||
|  | # typically unordered, so we should consider sorting them by some criteria (e.g. size, alphabetically) to make the | ||||||
|  | # trimming deterministic, predictable and, more importantly, consistent across executions. | ||||||
|  | # This matters most for schemaMetadata, as currently we may trim some fields while still adding nested ones, | ||||||
|  | # which can lead to poor schema rendering in the UI. | ||||||
|  | 
 | ||||||
| logger = logging.getLogger(__name__) | logger = logging.getLogger(__name__) | ||||||
| 
 | 
 | ||||||
|  | DEFAULT_QUERY_PROPERTIES_STATEMENT_MAX_PAYLOAD_BYTES = 5 * 1024 * 1024  # 5MB | ||||||
|  | QUERY_PROPERTIES_STATEMENT_MAX_PAYLOAD_BYTES = int( | ||||||
|  |     os.environ.get( | ||||||
|  |         "QUERY_PROPERTIES_STATEMENT_MAX_PAYLOAD_BYTES", | ||||||
|  |         DEFAULT_QUERY_PROPERTIES_STATEMENT_MAX_PAYLOAD_BYTES, | ||||||
|  |     ) | ||||||
|  | ) | ||||||
|  | 
 | ||||||
|  | QUERY_STATEMENT_TRUNCATION_BUFFER = 100 | ||||||
|  | 
 | ||||||
| 
 | 
 | ||||||
| class EnsureAspectSizeProcessor: | class EnsureAspectSizeProcessor: | ||||||
|     def __init__( |     def __init__( | ||||||
| @@ -81,6 +103,274 @@ class EnsureAspectSizeProcessor: | |||||||
| 
 | 
 | ||||||
|         schema.fields = accepted_fields |         schema.fields = accepted_fields | ||||||
| 
 | 
 | ||||||
|  |     def ensure_query_subjects_size( | ||||||
|  |         self, entity_urn: str, query_subjects: QuerySubjectsClass | ||||||
|  |     ) -> None: | ||||||
|  |         """ | ||||||
|  |         Ensure query subjects aspect does not exceed allowed size by removing column-level lineage first, | ||||||
|  |         then table lineage if necessary. | ||||||
|  |         """ | ||||||
|  |         if not query_subjects.subjects: | ||||||
|  |             return | ||||||
|  | 
 | ||||||
|  |         total_subjects_size = 0 | ||||||
|  |         accepted_table_level_subjects = [] | ||||||
|  |         accepted_column_level_subjects = [] | ||||||
|  |         column_level_subjects_with_sizes = [] | ||||||
|  |         table_level_subjects_with_sizes = [] | ||||||
|  | 
 | ||||||
|  |         # Separate column-level and table-level subjects | ||||||
|  |         for subject in query_subjects.subjects: | ||||||
|  |             subject_size = len(json.dumps(pre_json_transform(subject.to_obj()))) | ||||||
|  | 
 | ||||||
|  |             if subject.entity.startswith("urn:li:schemaField:"): | ||||||
|  |                 column_level_subjects_with_sizes.append((subject, subject_size)) | ||||||
|  |             else: | ||||||
|  |                 table_level_subjects_with_sizes.append((subject, subject_size)) | ||||||
|  | 
 | ||||||
|  |         # Once one subject doesn't fit, stop adding the rest to prevent inconsistencies | ||||||
|  |         first_skip_done = False | ||||||
|  | 
 | ||||||
|  |         # First, try to include all table-level subjects | ||||||
|  |         for subject, subject_size in table_level_subjects_with_sizes: | ||||||
|  |             if total_subjects_size + subject_size < self.payload_constraint: | ||||||
|  |                 accepted_table_level_subjects.append(subject) | ||||||
|  |                 total_subjects_size += subject_size | ||||||
|  |             else: | ||||||
|  |                 first_skip_done = True | ||||||
|  |                 break | ||||||
|  | 
 | ||||||
|  |         # Then, add column-level subjects if there's remaining space | ||||||
|  |         # Only process if we successfully included all table-level subjects | ||||||
|  |         if not first_skip_done: | ||||||
|  |             for subject, subject_size in column_level_subjects_with_sizes: | ||||||
|  |                 if total_subjects_size + subject_size < self.payload_constraint: | ||||||
|  |                     accepted_column_level_subjects.append(subject) | ||||||
|  |                     total_subjects_size += subject_size | ||||||
|  |                 else: | ||||||
|  |                     first_skip_done = True | ||||||
|  |                     break | ||||||
|  | 
 | ||||||
|  |         if first_skip_done: | ||||||
|  |             # Log aggregate warnings | ||||||
|  |             table_level_skipped_count = len(table_level_subjects_with_sizes) - len( | ||||||
|  |                 accepted_table_level_subjects | ||||||
|  |             ) | ||||||
|  |             column_level_skipped_count = len(column_level_subjects_with_sizes) - len( | ||||||
|  |                 accepted_column_level_subjects | ||||||
|  |             ) | ||||||
|  | 
 | ||||||
|  |             self._maybe_warn_query_subjects( | ||||||
|  |                 entity_urn, table_level_skipped_count, "table-level lineage subjects" | ||||||
|  |             ) | ||||||
|  |             self._maybe_warn_query_subjects( | ||||||
|  |                 entity_urn, column_level_skipped_count, "column-level lineage subjects" | ||||||
|  |             ) | ||||||
|  | 
 | ||||||
|  |         query_subjects.subjects = ( | ||||||
|  |             accepted_table_level_subjects + accepted_column_level_subjects | ||||||
|  |         ) | ||||||
|  | 
 | ||||||
|  |     def _maybe_warn_query_subjects( | ||||||
|  |         self, entity_urn: str, skipped_count: int, item_type: str | ||||||
|  |     ) -> None: | ||||||
|  |         """Log warning for query subjects truncation if any items were skipped.""" | ||||||
|  |         if skipped_count > 0: | ||||||
|  |             self.report.warning( | ||||||
|  |                 title="Query subjects truncated due to size constraint", | ||||||
|  |                 message="Query subjects contained too much data and would have caused ingestion to fail", | ||||||
|  |                 context=f"Skipped {skipped_count} {item_type} for {entity_urn} due to aspect size constraints", | ||||||
|  |             ) | ||||||
|  | 
 | ||||||
|  |     def _maybe_warn_upstream_lineage( | ||||||
|  |         self, entity_urn: str, skipped_count: int, item_type: str | ||||||
|  |     ) -> None: | ||||||
|  |         """Log warning for upstream lineage truncation if any items were skipped.""" | ||||||
|  |         if skipped_count > 0: | ||||||
|  |             self.report.warning( | ||||||
|  |                 title="Upstream lineage truncated due to size constraint", | ||||||
|  |                 message="Upstream lineage contained too much data and would have caused ingestion to fail", | ||||||
|  |                 context=f"Skipped {skipped_count} {item_type} for {entity_urn} due to aspect size constraints", | ||||||
|  |             ) | ||||||
|  | 
 | ||||||
|  |     def ensure_upstream_lineage_size(  # noqa: C901 | ||||||
|  |         self, entity_urn: str, upstream_lineage: UpstreamLineageClass | ||||||
|  |     ) -> None: | ||||||
|  |         """ | ||||||
|  |         Ensure upstream lineage aspect does not exceed allowed size by removing lineage in priority order: | ||||||
|  |         first NONE fine-grained lineages (lowest priority), then FIELD_SET fine-grained lineages, | ||||||
|  |         then DATASET fine-grained lineages, and finally upstreams (highest priority). | ||||||
|  |         """ | ||||||
|  |         if not upstream_lineage.fineGrainedLineages and not upstream_lineage.upstreams: | ||||||
|  |             return | ||||||
|  | 
 | ||||||
|  |         total_lineage_size = 0 | ||||||
|  |         accepted_upstreams = [] | ||||||
|  |         accepted_dataset_fg_lineages = [] | ||||||
|  |         accepted_field_set_fg_lineages = [] | ||||||
|  |         accepted_none_fg_lineages = [] | ||||||
|  |         upstream_items_with_sizes = [] | ||||||
|  |         dataset_fg_items_with_sizes = [] | ||||||
|  |         field_set_fg_items_with_sizes = [] | ||||||
|  |         none_fg_items_with_sizes = [] | ||||||
|  | 
 | ||||||
|  |         # Add upstreams (highest priority) | ||||||
|  |         if upstream_lineage.upstreams: | ||||||
|  |             for upstream in upstream_lineage.upstreams: | ||||||
|  |                 upstream_size = len(json.dumps(pre_json_transform(upstream.to_obj()))) | ||||||
|  |                 upstream_items_with_sizes.append((upstream, upstream_size)) | ||||||
|  | 
 | ||||||
|  |         # Separate fine-grained lineage items by upstreamType: DATASET > FIELD_SET > NONE | ||||||
|  |         if upstream_lineage.fineGrainedLineages: | ||||||
|  |             for fg_lineage in upstream_lineage.fineGrainedLineages: | ||||||
|  |                 fg_lineage_size = len( | ||||||
|  |                     json.dumps(pre_json_transform(fg_lineage.to_obj())) | ||||||
|  |                 ) | ||||||
|  | 
 | ||||||
|  |                 upstream_type_str = str(fg_lineage.upstreamType) | ||||||
|  |                 if upstream_type_str == "DATASET": | ||||||
|  |                     dataset_fg_items_with_sizes.append((fg_lineage, fg_lineage_size)) | ||||||
|  |                 elif upstream_type_str == "FIELD_SET": | ||||||
|  |                     field_set_fg_items_with_sizes.append((fg_lineage, fg_lineage_size)) | ||||||
|  |                 elif upstream_type_str == "NONE": | ||||||
|  |                     none_fg_items_with_sizes.append((fg_lineage, fg_lineage_size)) | ||||||
|  | 
 | ||||||
|  |         # Once one item doesn't fit, stop adding the rest to prevent inconsistencies | ||||||
|  |         first_skip_done = False | ||||||
|  | 
 | ||||||
|  |         # First, include all upstreams (highest priority) | ||||||
|  |         for item, item_size in upstream_items_with_sizes: | ||||||
|  |             if total_lineage_size + item_size < self.payload_constraint: | ||||||
|  |                 accepted_upstreams.append(item) | ||||||
|  |                 total_lineage_size += item_size | ||||||
|  |             else: | ||||||
|  |                 first_skip_done = True | ||||||
|  |                 break | ||||||
|  | 
 | ||||||
|  |         # Second, include DATASET fine-grained lineages if no upstreams were skipped | ||||||
|  |         if not first_skip_done: | ||||||
|  |             for fg_lineage, fg_lineage_size in dataset_fg_items_with_sizes: | ||||||
|  |                 if total_lineage_size + fg_lineage_size < self.payload_constraint: | ||||||
|  |                     accepted_dataset_fg_lineages.append(fg_lineage) | ||||||
|  |                     total_lineage_size += fg_lineage_size | ||||||
|  |                 else: | ||||||
|  |                     first_skip_done = True | ||||||
|  |                     break | ||||||
|  | 
 | ||||||
|  |         # Third, include FIELD_SET fine-grained lineages if no higher priority items were skipped | ||||||
|  |         if not first_skip_done: | ||||||
|  |             for fg_lineage, fg_lineage_size in field_set_fg_items_with_sizes: | ||||||
|  |                 if total_lineage_size + fg_lineage_size < self.payload_constraint: | ||||||
|  |                     accepted_field_set_fg_lineages.append(fg_lineage) | ||||||
|  |                     total_lineage_size += fg_lineage_size | ||||||
|  |                 else: | ||||||
|  |                     first_skip_done = True | ||||||
|  |                     break | ||||||
|  | 
 | ||||||
|  |         # Finally, include NONE fine-grained lineages if no higher priority items were skipped | ||||||
|  |         if not first_skip_done: | ||||||
|  |             for fg_lineage, fg_lineage_size in none_fg_items_with_sizes: | ||||||
|  |                 if total_lineage_size + fg_lineage_size < self.payload_constraint: | ||||||
|  |                     accepted_none_fg_lineages.append(fg_lineage) | ||||||
|  |                     total_lineage_size += fg_lineage_size | ||||||
|  |                 else: | ||||||
|  |                     first_skip_done = True | ||||||
|  |                     break | ||||||
|  | 
 | ||||||
|  |         # Log aggregate warnings instead of per-item warnings | ||||||
|  |         if first_skip_done: | ||||||
|  |             upstreams_skipped_count = len(upstream_items_with_sizes) - len( | ||||||
|  |                 accepted_upstreams | ||||||
|  |             ) | ||||||
|  |             dataset_fg_skipped_count = len(dataset_fg_items_with_sizes) - len( | ||||||
|  |                 accepted_dataset_fg_lineages | ||||||
|  |             ) | ||||||
|  |             field_set_fg_skipped_count = len(field_set_fg_items_with_sizes) - len( | ||||||
|  |                 accepted_field_set_fg_lineages | ||||||
|  |             ) | ||||||
|  |             none_fg_skipped_count = len(none_fg_items_with_sizes) - len( | ||||||
|  |                 accepted_none_fg_lineages | ||||||
|  |             ) | ||||||
|  | 
 | ||||||
|  |             self._maybe_warn_upstream_lineage( | ||||||
|  |                 entity_urn, upstreams_skipped_count, "upstream datasets" | ||||||
|  |             ) | ||||||
|  |             self._maybe_warn_upstream_lineage( | ||||||
|  |                 entity_urn, | ||||||
|  |                 dataset_fg_skipped_count, | ||||||
|  |                 "dataset-level fine-grained lineages", | ||||||
|  |             ) | ||||||
|  |             self._maybe_warn_upstream_lineage( | ||||||
|  |                 entity_urn, | ||||||
|  |                 field_set_fg_skipped_count, | ||||||
|  |                 "field-set-level fine-grained lineages", | ||||||
|  |             ) | ||||||
|  |             self._maybe_warn_upstream_lineage( | ||||||
|  |                 entity_urn, none_fg_skipped_count, "none-level fine-grained lineages" | ||||||
|  |             ) | ||||||
|  | 
 | ||||||
|  |         # Combine all accepted fine-grained lineages | ||||||
|  |         accepted_fine_grained_lineages = ( | ||||||
|  |             accepted_dataset_fg_lineages | ||||||
|  |             + accepted_field_set_fg_lineages | ||||||
|  |             + accepted_none_fg_lineages | ||||||
|  |         ) | ||||||
|  | 
 | ||||||
|  |         upstream_lineage.upstreams = accepted_upstreams | ||||||
|  |         upstream_lineage.fineGrainedLineages = ( | ||||||
|  |             accepted_fine_grained_lineages if accepted_fine_grained_lineages else None | ||||||
|  |         ) | ||||||
|  | 
 | ||||||
|  |     def ensure_query_properties_size( | ||||||
|  |         self, entity_urn: str, query_properties: QueryPropertiesClass | ||||||
|  |     ) -> None: | ||||||
|  |         """ | ||||||
|  |         Ensure query properties aspect does not exceed allowed size by truncating the query statement value. | ||||||
|  |         Uses a configurable max payload size that is the minimum of QUERY_PROPERTIES_STATEMENT_MAX_PAYLOAD_BYTES | ||||||
|  |         and INGEST_MAX_PAYLOAD_BYTES. | ||||||
|  | 
 | ||||||
|  |         We have found surprisingly large query statements (e.g. 20MB+) that caused ingestion to fail; | ||||||
|  |         typically an INSERT INTO ... VALUES statement with a huge list of values. | ||||||
|  |         """ | ||||||
|  |         if not query_properties.statement or not query_properties.statement.value: | ||||||
|  |             return | ||||||
|  | 
 | ||||||
|  |         max_payload_size = min( | ||||||
|  |             QUERY_PROPERTIES_STATEMENT_MAX_PAYLOAD_BYTES, self.payload_constraint | ||||||
|  |         ) | ||||||
|  | 
 | ||||||
|  |         current_size = len(json.dumps(pre_json_transform(query_properties.to_obj()))) | ||||||
|  | 
 | ||||||
|  |         if current_size < max_payload_size: | ||||||
|  |             return | ||||||
|  | 
 | ||||||
|  |         reduction_needed = ( | ||||||
|  |             current_size - max_payload_size + QUERY_STATEMENT_TRUNCATION_BUFFER | ||||||
|  |         ) | ||||||
|  | 
 | ||||||
|  |         statement_value_size = len(query_properties.statement.value) | ||||||
|  |         original_statement_size = statement_value_size | ||||||
|  | 
 | ||||||
|  |         # Only truncate if reduction is actually needed and possible | ||||||
|  |         if statement_value_size > reduction_needed > 0: | ||||||
|  |             new_statement_length = statement_value_size - reduction_needed | ||||||
|  |             truncated_statement = query_properties.statement.value[ | ||||||
|  |                 :new_statement_length | ||||||
|  |             ] | ||||||
|  | 
 | ||||||
|  |             truncation_message = f"... [original value was {original_statement_size} bytes and truncated to {new_statement_length} bytes]" | ||||||
|  |             query_properties.statement.value = truncated_statement + truncation_message | ||||||
|  | 
 | ||||||
|  |             self.report.warning( | ||||||
|  |                 title="Query properties truncated due to size constraint", | ||||||
|  |                 message="Query properties contained too much data and would have caused ingestion to fail", | ||||||
|  |                 context=f"Query statement was truncated from {original_statement_size} to {new_statement_length} characters for {entity_urn} due to aspect size constraints", | ||||||
|  |             ) | ||||||
|  |         else: | ||||||
|  |             logger.warning( | ||||||
|  |                 f"Cannot truncate query statement for {entity_urn} as it is smaller than or equal to the required reduction size {reduction_needed}. This means 'ensure_query_properties_size' must be extended to trim fields other than the statement." | ||||||
|  |             ) | ||||||
|  | 
 | ||||||
|     def ensure_aspect_size( |     def ensure_aspect_size( | ||||||
|         self, |         self, | ||||||
|         stream: Iterable[MetadataWorkUnit], |         stream: Iterable[MetadataWorkUnit], | ||||||
| @@ -96,4 +386,10 @@ class EnsureAspectSizeProcessor: | |||||||
|                 self.ensure_schema_metadata_size(wu.get_urn(), schema) |                 self.ensure_schema_metadata_size(wu.get_urn(), schema) | ||||||
|             elif profile := wu.get_aspect_of_type(DatasetProfileClass): |             elif profile := wu.get_aspect_of_type(DatasetProfileClass): | ||||||
|                 self.ensure_dataset_profile_size(wu.get_urn(), profile) |                 self.ensure_dataset_profile_size(wu.get_urn(), profile) | ||||||
|  |             elif query_subjects := wu.get_aspect_of_type(QuerySubjectsClass): | ||||||
|  |                 self.ensure_query_subjects_size(wu.get_urn(), query_subjects) | ||||||
|  |             elif upstream_lineage := wu.get_aspect_of_type(UpstreamLineageClass): | ||||||
|  |                 self.ensure_upstream_lineage_size(wu.get_urn(), upstream_lineage) | ||||||
|  |             elif query_properties := wu.get_aspect_of_type(QueryPropertiesClass): | ||||||
|  |                 self.ensure_query_properties_size(wu.get_urn(), query_properties) | ||||||
|             yield wu |             yield wu | ||||||
|  | |||||||
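For queryProperties the patch truncates the statement text instead of dropping items. The effective limit is the minimum of QUERY_PROPERTIES_STATEMENT_MAX_PAYLOAD_BYTES (5MB by default, overridable through the environment variable of the same name) and the overall payload constraint. A quick worked check of the truncation arithmetic, with sizes that are assumptions for illustration only:

# Constants as defined in the patch; the concrete sizes below are made up for the example.
QUERY_PROPERTIES_STATEMENT_MAX_PAYLOAD_BYTES = 5 * 1024 * 1024
QUERY_STATEMENT_TRUNCATION_BUFFER = 100

current_size = 6 * 1024 * 1024              # serialized queryProperties aspect, ~6MB (assumed)
statement_value_size = current_size - 2048  # the statement accounts for almost all of it (assumed)

reduction_needed = (
    current_size - QUERY_PROPERTIES_STATEMENT_MAX_PAYLOAD_BYTES + QUERY_STATEMENT_TRUNCATION_BUFFER
)
new_statement_length = statement_value_size - reduction_needed

print(reduction_needed)      # 1048676 bytes to shave off, including the 100-byte buffer
print(new_statement_length)  # 5240732 characters of the original statement are kept (~5MB)

The kept prefix then gets a suffix of the form "... [original value was N bytes and truncated to M bytes]", which is presumably what the small buffer leaves room for.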
| @@ -15,20 +15,33 @@ from datahub.ingestion.api.source import SourceReport | |||||||
| from datahub.ingestion.api.workunit import MetadataWorkUnit | from datahub.ingestion.api.workunit import MetadataWorkUnit | ||||||
| from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent | from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent | ||||||
| from datahub.metadata.schema_classes import ( | from datahub.metadata.schema_classes import ( | ||||||
|  |     AuditStampClass, | ||||||
|     ChangeTypeClass, |     ChangeTypeClass, | ||||||
|     DatasetFieldProfileClass, |     DatasetFieldProfileClass, | ||||||
|  |     DatasetLineageTypeClass, | ||||||
|     DatasetProfileClass, |     DatasetProfileClass, | ||||||
|     DatasetSnapshotClass, |     DatasetSnapshotClass, | ||||||
|  |     FineGrainedLineageClass, | ||||||
|  |     FineGrainedLineageDownstreamTypeClass, | ||||||
|  |     FineGrainedLineageUpstreamTypeClass, | ||||||
|     GenericAspectClass, |     GenericAspectClass, | ||||||
|     MetadataChangeProposalClass, |     MetadataChangeProposalClass, | ||||||
|     NumberTypeClass, |     NumberTypeClass, | ||||||
|     OtherSchemaClass, |     OtherSchemaClass, | ||||||
|  |     QueryLanguageClass, | ||||||
|  |     QueryPropertiesClass, | ||||||
|  |     QuerySourceClass, | ||||||
|  |     QueryStatementClass, | ||||||
|  |     QuerySubjectClass, | ||||||
|  |     QuerySubjectsClass, | ||||||
|     SchemaFieldClass, |     SchemaFieldClass, | ||||||
|     SchemaFieldDataTypeClass, |     SchemaFieldDataTypeClass, | ||||||
|     SchemaMetadataClass, |     SchemaMetadataClass, | ||||||
|     StatusClass, |     StatusClass, | ||||||
|     StringTypeClass, |     StringTypeClass, | ||||||
|     SubTypesClass, |     SubTypesClass, | ||||||
|  |     UpstreamClass, | ||||||
|  |     UpstreamLineageClass, | ||||||
| ) | ) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| @@ -112,6 +125,192 @@ def proper_schema_metadata() -> SchemaMetadataClass: | |||||||
|     ) |     ) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | def proper_query_subjects() -> QuerySubjectsClass: | ||||||
|  |     subjects = [ | ||||||
|  |         QuerySubjectClass( | ||||||
|  |             entity="urn:li:dataset:(urn:li:dataPlatform:hive,db1.table1,PROD)" | ||||||
|  |         ), | ||||||
|  |         QuerySubjectClass( | ||||||
|  |             entity="urn:li:dataset:(urn:li:dataPlatform:hive,db1.table2,PROD)" | ||||||
|  |         ), | ||||||
|  |         QuerySubjectClass( | ||||||
|  |             entity="urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:hive,db1.table1,PROD),col1)" | ||||||
|  |         ), | ||||||
|  |         QuerySubjectClass( | ||||||
|  |             entity="urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:hive,db1.table2,PROD),col2)" | ||||||
|  |         ), | ||||||
|  |     ] | ||||||
|  |     return QuerySubjectsClass(subjects=subjects) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def too_big_query_subjects() -> QuerySubjectsClass: | ||||||
|  |     subjects = [] | ||||||
|  | 
 | ||||||
|  |     # Add a few table-level subjects | ||||||
|  |     for i in range(5): | ||||||
|  |         subjects.append( | ||||||
|  |             QuerySubjectClass( | ||||||
|  |                 entity=f"urn:li:dataset:(urn:li:dataPlatform:hive,db.table{i},PROD)" | ||||||
|  |             ) | ||||||
|  |         ) | ||||||
|  | 
 | ||||||
|  |     # Add many column-level subjects with very large entity URNs to exceed the 15MB constraint | ||||||
|  |     # Each URN will be about 40KB, so 500 subjects should create ~20MB of data | ||||||
|  |     for i in range(500): | ||||||
|  |         large_table_name = "a" * 20000  # Very large table name | ||||||
|  |         large_column_name = "b" * 20000  # Very large column name | ||||||
|  |         subjects.append( | ||||||
|  |             QuerySubjectClass( | ||||||
|  |                 entity=f"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:hive,{large_table_name}_{i},PROD),{large_column_name}_{i})" | ||||||
|  |             ) | ||||||
|  |         ) | ||||||
|  | 
 | ||||||
|  |     return QuerySubjectsClass(subjects=subjects) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def proper_upstream_lineage() -> UpstreamLineageClass: | ||||||
|  |     upstreams = [ | ||||||
|  |         UpstreamClass( | ||||||
|  |             dataset="urn:li:dataset:(urn:li:dataPlatform:hive,db1.table1,PROD)", | ||||||
|  |             type=DatasetLineageTypeClass.TRANSFORMED, | ||||||
|  |         ), | ||||||
|  |         UpstreamClass( | ||||||
|  |             dataset="urn:li:dataset:(urn:li:dataPlatform:hive,db1.table2,PROD)", | ||||||
|  |             type=DatasetLineageTypeClass.TRANSFORMED, | ||||||
|  |         ), | ||||||
|  |     ] | ||||||
|  |     fine_grained_lineages = [ | ||||||
|  |         FineGrainedLineageClass( | ||||||
|  |             upstreamType=FineGrainedLineageUpstreamTypeClass.DATASET, | ||||||
|  |             downstreamType=FineGrainedLineageDownstreamTypeClass.FIELD, | ||||||
|  |             upstreams=["urn:li:dataset:(urn:li:dataPlatform:hive,db1.table3,PROD)"], | ||||||
|  |             downstreams=[ | ||||||
|  |                 "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:hive,db1.target,PROD),col1)" | ||||||
|  |             ], | ||||||
|  |         ), | ||||||
|  |         FineGrainedLineageClass( | ||||||
|  |             upstreamType=FineGrainedLineageUpstreamTypeClass.FIELD_SET, | ||||||
|  |             downstreamType=FineGrainedLineageDownstreamTypeClass.FIELD, | ||||||
|  |             upstreams=[ | ||||||
|  |                 "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:hive,db1.table4,PROD),col2)" | ||||||
|  |             ], | ||||||
|  |             downstreams=[ | ||||||
|  |                 "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:hive,db1.target,PROD),col2)" | ||||||
|  |             ], | ||||||
|  |         ), | ||||||
|  |     ] | ||||||
|  |     return UpstreamLineageClass( | ||||||
|  |         upstreams=upstreams, fineGrainedLineages=fine_grained_lineages | ||||||
|  |     ) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def too_big_upstream_lineage() -> UpstreamLineageClass: | ||||||
|  |     upstreams = [] | ||||||
|  |     fine_grained_lineages = [] | ||||||
|  | 
 | ||||||
|  |     # Add upstreams (highest priority) | ||||||
|  |     for i in range(5): | ||||||
|  |         upstreams.append( | ||||||
|  |             UpstreamClass( | ||||||
|  |                 dataset=f"urn:li:dataset:(urn:li:dataPlatform:hive,upstream_table_{i},PROD)", | ||||||
|  |                 type=DatasetLineageTypeClass.TRANSFORMED, | ||||||
|  |             ) | ||||||
|  |         ) | ||||||
|  | 
 | ||||||
|  |     # Add DATASET fine-grained lineages with large URNs | ||||||
|  |     for i in range(200): | ||||||
|  |         large_dataset_name = "a" * 20000 | ||||||
|  |         large_downstream_name = "b" * 20000 | ||||||
|  |         fine_grained_lineages.append( | ||||||
|  |             FineGrainedLineageClass( | ||||||
|  |                 upstreamType=FineGrainedLineageUpstreamTypeClass.DATASET, | ||||||
|  |                 downstreamType=FineGrainedLineageDownstreamTypeClass.FIELD, | ||||||
|  |                 upstreams=[ | ||||||
|  |                     f"urn:li:dataset:(urn:li:dataPlatform:hive,{large_dataset_name}_{i},PROD)" | ||||||
|  |                 ], | ||||||
|  |                 downstreams=[ | ||||||
|  |                     f"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:hive,target,PROD),{large_downstream_name}_{i})" | ||||||
|  |                 ], | ||||||
|  |             ) | ||||||
|  |         ) | ||||||
|  | 
 | ||||||
|  |     # Add FIELD_SET fine-grained lineages with large URNs | ||||||
|  |     for i in range(200): | ||||||
|  |         large_upstream_name = "c" * 20000 | ||||||
|  |         large_downstream_name = "d" * 20000 | ||||||
|  |         fine_grained_lineages.append( | ||||||
|  |             FineGrainedLineageClass( | ||||||
|  |                 upstreamType=FineGrainedLineageUpstreamTypeClass.FIELD_SET, | ||||||
|  |                 downstreamType=FineGrainedLineageDownstreamTypeClass.FIELD, | ||||||
|  |                 upstreams=[ | ||||||
|  |                     f"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:hive,source,PROD),{large_upstream_name}_{i})" | ||||||
|  |                 ], | ||||||
|  |                 downstreams=[ | ||||||
|  |                     f"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:hive,target,PROD),{large_downstream_name}_{i})" | ||||||
|  |                 ], | ||||||
|  |             ) | ||||||
|  |         ) | ||||||
|  | 
 | ||||||
|  |     # Add NONE fine-grained lineages with large URNs (lowest priority) | ||||||
|  |     for i in range(200): | ||||||
|  |         large_downstream_name = "e" * 20000 | ||||||
|  |         fine_grained_lineages.append( | ||||||
|  |             FineGrainedLineageClass( | ||||||
|  |                 upstreamType=FineGrainedLineageUpstreamTypeClass.NONE, | ||||||
|  |                 downstreamType=FineGrainedLineageDownstreamTypeClass.FIELD, | ||||||
|  |                 downstreams=[ | ||||||
|  |                     f"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:hive,target,PROD),{large_downstream_name}_{i})" | ||||||
|  |                 ], | ||||||
|  |             ) | ||||||
|  |         ) | ||||||
|  | 
 | ||||||
|  |     return UpstreamLineageClass( | ||||||
|  |         upstreams=upstreams, fineGrainedLineages=fine_grained_lineages | ||||||
|  |     ) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def proper_query_properties() -> QueryPropertiesClass: | ||||||
|  |     # Create a query properties with a reasonably sized statement (~1KB) | ||||||
|  |     query_statement = ( | ||||||
|  |         "SELECT * FROM table1 WHERE column1 = 'value' AND column2 > 100;" * 20 | ||||||
|  |     ) | ||||||
|  | 
 | ||||||
|  |     return QueryPropertiesClass( | ||||||
|  |         statement=QueryStatementClass( | ||||||
|  |             value=query_statement, | ||||||
|  |             language=QueryLanguageClass.SQL, | ||||||
|  |         ), | ||||||
|  |         source=QuerySourceClass.SYSTEM, | ||||||
|  |         created=AuditStampClass(time=1000000000000, actor="urn:li:corpuser:test"), | ||||||
|  |         lastModified=AuditStampClass(time=1000000000000, actor="urn:li:corpuser:test"), | ||||||
|  |     ) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def too_big_query_properties() -> QueryPropertiesClass: | ||||||
|  |     # Create a query properties with a very large statement (~6MB, exceeding the 5MB default limit) | ||||||
|  |     # This is larger than the QUERY_PROPERTIES_STATEMENT_MAX_PAYLOAD_BYTES default (5MB) | ||||||
|  |     large_query_statement = ( | ||||||
|  |         "SELECT col1, col2, col3, col4, col5, col6, col7, col8, col9, col10, " | ||||||
|  |         "col11, col12, col13, col14, col15, col16, col17, col18, col19, col20 " | ||||||
|  |         "FROM very_long_table_name_with_lots_of_characters_to_make_it_big " | ||||||
|  |         "WHERE condition1 = 'some_very_long_value_with_lots_of_text' " | ||||||
|  |         "AND condition2 IN ('value1', 'value2', 'value3', 'value4') " | ||||||
|  |         "ORDER BY col1, col2, col3, col4, col5 LIMIT 1000;" | ||||||
|  |     ) * 15000  # ~6MB | ||||||
|  | 
 | ||||||
|  |     return QueryPropertiesClass( | ||||||
|  |         statement=QueryStatementClass( | ||||||
|  |             value=large_query_statement, | ||||||
|  |             language=QueryLanguageClass.SQL, | ||||||
|  |         ), | ||||||
|  |         source=QuerySourceClass.SYSTEM, | ||||||
|  |         created=AuditStampClass(time=1000000000000, actor="urn:li:corpuser:test"), | ||||||
|  |         lastModified=AuditStampClass(time=1000000000000, actor="urn:li:corpuser:test"), | ||||||
|  |         name="Large Test Query", | ||||||
|  |         description="A test query with a very large statement", | ||||||
|  |     ) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
| def proper_dataset_profile() -> DatasetProfileClass: | def proper_dataset_profile() -> DatasetProfileClass: | ||||||
|     sample_values = [ |     sample_values = [ | ||||||
|         "23483295", |         "23483295", | ||||||
| @@ -344,3 +543,254 @@ def test_wu_processor_not_triggered_by_unhandled_aspects( | |||||||
|     ] |     ] | ||||||
|     ensure_schema_metadata_size_mock.assert_not_called() |     ensure_schema_metadata_size_mock.assert_not_called() | ||||||
|     ensure_dataset_profile_size_mock.assert_not_called() |     ensure_dataset_profile_size_mock.assert_not_called() | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | @freeze_time("2023-01-02 00:00:00") | ||||||
|  | def test_ensure_size_of_proper_query_subjects(processor): | ||||||
|  |     query_subjects = proper_query_subjects() | ||||||
|  |     orig_repr = json.dumps(query_subjects.to_obj()) | ||||||
|  |     processor.ensure_query_subjects_size( | ||||||
|  |         "urn:li:query:(urn:li:dataPlatform:hive, dummy_query, DEV)", query_subjects | ||||||
|  |     ) | ||||||
|  |     assert orig_repr == json.dumps(query_subjects.to_obj()), ( | ||||||
|  |         "Aspect was modified in case where workunit processor should have been no-op" | ||||||
|  |     ) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | @freeze_time("2023-01-02 00:00:00") | ||||||
|  | def test_ensure_size_of_too_big_query_subjects(processor): | ||||||
|  |     query_subjects = too_big_query_subjects() | ||||||
|  |     assert len(query_subjects.subjects) == 505  # 5 table + 500 column subjects | ||||||
|  | 
 | ||||||
|  |     # Verify that the initial size exceeds the default payload constraint | ||||||
|  |     initial_size = len(json.dumps(query_subjects.to_obj())) | ||||||
|  |     expected_size = 20 * 1024 * 1024  # 20MB | ||||||
|  |     assert initial_size == pytest.approx(expected_size, rel=0.05), ( | ||||||
|  |         f"Initial size {initial_size} should be around 20MB (±5%), got {initial_size / (1024 * 1024):.1f}MB" | ||||||
|  |     ) | ||||||
|  |     assert initial_size > INGEST_MAX_PAYLOAD_BYTES, ( | ||||||
|  |         f"Initial size {initial_size} should exceed payload constraint {INGEST_MAX_PAYLOAD_BYTES}" | ||||||
|  |     ) | ||||||
|  | 
 | ||||||
|  |     processor.ensure_query_subjects_size( | ||||||
|  |         "urn:li:query:(urn:li:dataPlatform:hive, dummy_query, DEV)", query_subjects | ||||||
|  |     ) | ||||||
|  | 
 | ||||||
|  |     # Should be significantly reduced due to size constraints | ||||||
|  |     # With ~20MB of data needing to be reduced to ~15MB, we expect ~25% reduction (125 subjects) | ||||||
|  |     # So final count should be around 380, using 400 as upper bound with buffer | ||||||
|  |     assert len(query_subjects.subjects) < 400, ( | ||||||
|  |         "Query subjects has not been properly truncated" | ||||||
|  |     ) | ||||||
|  | 
 | ||||||
|  |     # Check that table-level subjects are prioritized (should still be present) | ||||||
|  |     table_subjects = [ | ||||||
|  |         s | ||||||
|  |         for s in query_subjects.subjects | ||||||
|  |         if not s.entity.startswith("urn:li:schemaField:") | ||||||
|  |     ] | ||||||
|  |     assert len(table_subjects) > 0, ( | ||||||
|  |         "Table-level subjects should be prioritized and present" | ||||||
|  |     ) | ||||||
|  | 
 | ||||||
|  |     # The aspect should not exceed acceptable size | ||||||
|  |     assert len(json.dumps(query_subjects.to_obj())) < INGEST_MAX_PAYLOAD_BYTES, ( | ||||||
|  |         "Aspect exceeded acceptable size" | ||||||
|  |     ) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | @freeze_time("2023-01-02 00:00:00") | ||||||
|  | @patch( | ||||||
|  |     "datahub.ingestion.api.auto_work_units.auto_ensure_aspect_size.EnsureAspectSizeProcessor.ensure_query_subjects_size" | ||||||
|  | ) | ||||||
|  | def test_wu_processor_triggered_by_query_subjects_aspect( | ||||||
|  |     ensure_query_subjects_size_mock, processor | ||||||
|  | ): | ||||||
|  |     ret = [  # noqa: F841 | ||||||
|  |         *processor.ensure_aspect_size( | ||||||
|  |             [ | ||||||
|  |                 MetadataChangeProposalWrapper( | ||||||
|  |                     entityUrn="urn:li:query:(urn:li:dataPlatform:hive, dummy_query, DEV)", | ||||||
|  |                     aspect=proper_query_subjects(), | ||||||
|  |                 ).as_workunit() | ||||||
|  |             ] | ||||||
|  |         ) | ||||||
|  |     ] | ||||||
|  |     ensure_query_subjects_size_mock.assert_called_once() | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | @freeze_time("2023-01-02 00:00:00") | ||||||
|  | def test_ensure_size_of_proper_upstream_lineage(processor): | ||||||
|  |     upstream_lineage = proper_upstream_lineage() | ||||||
|  |     orig_repr = json.dumps(upstream_lineage.to_obj()) | ||||||
|  |     processor.ensure_upstream_lineage_size( | ||||||
|  |         "urn:li:dataset:(urn:li:dataPlatform:hive, dummy_dataset, DEV)", | ||||||
|  |         upstream_lineage, | ||||||
|  |     ) | ||||||
|  |     assert orig_repr == json.dumps(upstream_lineage.to_obj()), ( | ||||||
|  |         "Aspect was modified in case where workunit processor should have been no-op" | ||||||
|  |     ) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | @freeze_time("2023-01-02 00:00:00") | ||||||
|  | def test_ensure_size_of_too_big_upstream_lineage(processor): | ||||||
|  |     upstream_lineage = too_big_upstream_lineage() | ||||||
|  |     assert len(upstream_lineage.upstreams) == 5  # 5 upstreams | ||||||
|  |     assert upstream_lineage.fineGrainedLineages is not None | ||||||
|  |     assert ( | ||||||
|  |         len(upstream_lineage.fineGrainedLineages) == 600 | ||||||
|  |     )  # 200 DATASET + 200 FIELD_SET + 200 NONE | ||||||
|  | 
 | ||||||
|  |     # Verify that the initial size exceeds the default payload constraint | ||||||
|  |     initial_size = len(json.dumps(upstream_lineage.to_obj())) | ||||||
|  |     expected_size = 20 * 1024 * 1024  # 20MB | ||||||
|  |     assert initial_size == pytest.approx(expected_size, rel=0.05), ( | ||||||
|  |         f"Initial size {initial_size} should be around 20MB (±5%), got {initial_size / (1024 * 1024):.1f}MB" | ||||||
|  |     ) | ||||||
|  |     assert initial_size > INGEST_MAX_PAYLOAD_BYTES, ( | ||||||
|  |         f"Initial size {initial_size} should exceed payload constraint {INGEST_MAX_PAYLOAD_BYTES}" | ||||||
|  |     ) | ||||||
|  | 
 | ||||||
|  |     processor.ensure_upstream_lineage_size( | ||||||
|  |         "urn:li:dataset:(urn:li:dataPlatform:hive, dummy_dataset, DEV)", | ||||||
|  |         upstream_lineage, | ||||||
|  |     ) | ||||||
|  | 
 | ||||||
|  |     # Should be significantly reduced due to size constraints | ||||||
|  |     # With ~20MB of data needing to be reduced to ~15MB, we expect ~25% reduction | ||||||
|  |     # Total items: 5 upstreams + 600 fine-grained = 605, expect around ~450 after 25% reduction | ||||||
|  |     total_items = len(upstream_lineage.upstreams) + ( | ||||||
|  |         len(upstream_lineage.fineGrainedLineages) | ||||||
|  |         if upstream_lineage.fineGrainedLineages | ||||||
|  |         else 0 | ||||||
|  |     ) | ||||||
|  |     assert total_items < 500, "Upstream lineage has not been properly truncated" | ||||||
|  | 
 | ||||||
|  |     # Check that upstreams are prioritized (should still be present) | ||||||
|  |     assert len(upstream_lineage.upstreams) > 0, ( | ||||||
|  |         "Upstreams should be prioritized and present" | ||||||
|  |     ) | ||||||
|  | 
 | ||||||
|  |     # Check that DATASET fine-grained lineages are prioritized over FIELD_SET and NONE | ||||||
|  |     if upstream_lineage.fineGrainedLineages: | ||||||
|  |         dataset_count = sum( | ||||||
|  |             1 | ||||||
|  |             for fg in upstream_lineage.fineGrainedLineages | ||||||
|  |             if str(fg.upstreamType) == "DATASET" | ||||||
|  |         ) | ||||||
|  |         field_set_count = sum( | ||||||
|  |             1 | ||||||
|  |             for fg in upstream_lineage.fineGrainedLineages | ||||||
|  |             if str(fg.upstreamType) == "FIELD_SET" | ||||||
|  |         ) | ||||||
|  |         none_count = sum( | ||||||
|  |             1 | ||||||
|  |             for fg in upstream_lineage.fineGrainedLineages | ||||||
|  |             if str(fg.upstreamType) == "NONE" | ||||||
|  |         ) | ||||||
|  | 
 | ||||||
|  |         # DATASET should be prioritized over FIELD_SET and NONE | ||||||
|  |         assert dataset_count >= field_set_count, ( | ||||||
|  |             "DATASET fine-grained lineages should be prioritized" | ||||||
|  |         ) | ||||||
|  |         assert dataset_count >= none_count, ( | ||||||
|  |             "DATASET fine-grained lineages should be prioritized over NONE" | ||||||
|  |         ) | ||||||
|  | 
 | ||||||
|  |     # The aspect should not exceed acceptable size | ||||||
|  |     assert len(json.dumps(upstream_lineage.to_obj())) < INGEST_MAX_PAYLOAD_BYTES, ( | ||||||
|  |         "Aspect exceeded acceptable size" | ||||||
|  |     ) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | @freeze_time("2023-01-02 00:00:00") | ||||||
|  | @patch( | ||||||
|  |     "datahub.ingestion.api.auto_work_units.auto_ensure_aspect_size.EnsureAspectSizeProcessor.ensure_upstream_lineage_size" | ||||||
|  | ) | ||||||
|  | def test_wu_processor_triggered_by_upstream_lineage_aspect( | ||||||
|  |     ensure_upstream_lineage_size_mock, processor | ||||||
|  | ): | ||||||
|  |     ret = [  # noqa: F841 | ||||||
|  |         *processor.ensure_aspect_size( | ||||||
|  |             [ | ||||||
|  |                 MetadataChangeProposalWrapper( | ||||||
|  |                     entityUrn="urn:li:dataset:(urn:li:dataPlatform:hive, dummy_dataset, DEV)", | ||||||
|  |                     aspect=proper_upstream_lineage(), | ||||||
|  |                 ).as_workunit() | ||||||
|  |             ] | ||||||
|  |         ) | ||||||
|  |     ] | ||||||
|  |     ensure_upstream_lineage_size_mock.assert_called_once() | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | @freeze_time("2023-01-02 00:00:00") | ||||||
|  | def test_ensure_size_of_proper_query_properties(processor): | ||||||
|  |     query_properties = proper_query_properties() | ||||||
|  |     original_statement = query_properties.statement.value | ||||||
|  | 
 | ||||||
|  |     # Verify initial size is reasonable (under 5MB) | ||||||
|  |     initial_size = len(json.dumps(query_properties.to_obj())) | ||||||
|  |     assert initial_size < 5 * 1024 * 1024, "Test query properties should be under 5MB" | ||||||
|  | 
 | ||||||
|  |     processor.ensure_query_properties_size("urn:li:query:test", query_properties) | ||||||
|  | 
 | ||||||
|  |     # Statement should remain unchanged for properly sized query properties | ||||||
|  |     assert query_properties.statement.value == original_statement | ||||||
|  |     assert len(processor.report.warnings) == 0 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | @freeze_time("2023-01-02 00:00:00") | ||||||
|  | def test_ensure_size_of_too_big_query_properties(processor): | ||||||
|  |     query_properties = too_big_query_properties() | ||||||
|  |     original_statement_size = len(query_properties.statement.value) | ||||||
|  | 
 | ||||||
|  |     # Verify the initial size is about 5.5MB and definitely larger than the 5MB limit | ||||||
|  |     initial_size = len(json.dumps(query_properties.to_obj())) | ||||||
|  |     expected_initial_size = 5.5 * 1024 * 1024  # ~5.5MB | ||||||
|  |     assert initial_size == pytest.approx(expected_initial_size, rel=0.1), ( | ||||||
|  |         f"Expected initial size ~{expected_initial_size}, got {initial_size}" | ||||||
|  |     ) | ||||||
|  |     assert initial_size > 5 * 1024 * 1024, "Test data should exceed 5MB limit" | ||||||
|  | 
 | ||||||
|  |     processor.ensure_query_properties_size("urn:li:query:test", query_properties) | ||||||
|  | 
 | ||||||
|  |     # Statement should be truncated | ||||||
|  |     assert len(query_properties.statement.value) < original_statement_size | ||||||
|  | 
 | ||||||
|  |     # Should contain truncation message | ||||||
|  |     assert "... [original value was" in query_properties.statement.value | ||||||
|  |     assert ( | ||||||
|  |         f"{original_statement_size} bytes and truncated to" | ||||||
|  |         in query_properties.statement.value | ||||||
|  |     ) | ||||||
|  |     assert query_properties.statement.value.endswith(" bytes]") | ||||||
|  | 
 | ||||||
|  |     # Final size should be within constraints, i.e. <= 5MB + buffer | ||||||
|  |     final_size = len(json.dumps(query_properties.to_obj())) | ||||||
|  |     expected_final_size = 5 * 1024 * 1024 + 100  # 5MB + buffer | ||||||
|  |     assert final_size <= expected_final_size, ( | ||||||
|  |         f"Final size {final_size} should be <= {expected_final_size}" | ||||||
|  |     ) | ||||||
|  | 
 | ||||||
|  |     # Should have logged a warning | ||||||
|  |     assert len(processor.report.warnings) == 1 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | @freeze_time("2023-01-02 00:00:00") | ||||||
|  | @patch( | ||||||
|  |     "datahub.ingestion.api.auto_work_units.auto_ensure_aspect_size.EnsureAspectSizeProcessor.ensure_query_properties_size" | ||||||
|  | ) | ||||||
|  | def test_wu_processor_triggered_by_query_properties_aspect( | ||||||
|  |     ensure_query_properties_size_mock, processor | ||||||
|  | ): | ||||||
|  |     list( | ||||||
|  |         processor.ensure_aspect_size( | ||||||
|  |             [ | ||||||
|  |                 MetadataChangeProposalWrapper( | ||||||
|  |                     entityUrn="urn:li:query:test", | ||||||
|  |                     aspect=proper_query_properties(), | ||||||
|  |                 ).as_workunit() | ||||||
|  |             ] | ||||||
|  |         ) | ||||||
|  |     ) | ||||||
|  |     ensure_query_properties_size_mock.assert_called_once() | ||||||
|  | |||||||
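The TODO near the top of the patch points out that trimming unordered collections is not deterministic across runs. One possible follow-up, sketched here only as an idea and not part of the merged change, is to sort items by serialized size before the greedy pass so the same subset is always kept:

import json
from typing import Any, Dict, List


def deterministic_trim(items: List[Dict[str, Any]], budget: int) -> List[Dict[str, Any]]:
    # Key each item by (size, canonical JSON) so ties break deterministically,
    # then run the same greedy acceptance loop as the patch.
    keyed = []
    for item in items:
        blob = json.dumps(item, sort_keys=True)
        keyed.append((len(blob), blob, item))
    keyed.sort(key=lambda t: (t[0], t[1]))

    accepted: List[Dict[str, Any]] = []
    total = 0
    for size, _, item in keyed:
        if total + size < budget:
            accepted.append(item)
            total += size
        else:
            break
    return accepted

Sorting smallest-first keeps the largest number of items under the budget; sorting largest-first would favor the biggest entries instead. Either way, the result no longer depends on the source's iteration order.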