mirror of https://github.com/datahub-project/datahub.git, synced 2025-10-31 02:37:05 +00:00
	feat(ingest): ensure payload size constraints for queryProperties, querySubjects and upstreamLineage aspects (#14919)
Co-authored-by: Claude <noreply@anthropic.com>
This commit is contained in:
parent 40b51ac2da
commit e847b58472
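All of the checks added below size an aspect the same way: serialize it with the pre-JSON transform and compare the JSON length against the payload limit. A minimal sketch of that measurement, reusing the names imported in the diff (the helper names here are illustrative, not part of the commit):

import json

from datahub.emitter.rest_emitter import INGEST_MAX_PAYLOAD_BYTES
from datahub.emitter.serialization_helper import pre_json_transform


def aspect_payload_size(aspect) -> int:
    # Same measurement the processor applies to each subject, upstream, and statement.
    return len(json.dumps(pre_json_transform(aspect.to_obj())))


def fits_in_payload(aspect, limit: int = INGEST_MAX_PAYLOAD_BYTES) -> bool:
    return aspect_payload_size(aspect) < limit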
				| @ -1,5 +1,6 @@ | ||||
| import json | ||||
| import logging | ||||
| import os | ||||
| from typing import TYPE_CHECKING, Iterable, List | ||||
| 
 | ||||
| from datahub.emitter.rest_emitter import INGEST_MAX_PAYLOAD_BYTES | ||||
| @ -7,15 +8,36 @@ from datahub.emitter.serialization_helper import pre_json_transform | ||||
| from datahub.ingestion.api.workunit import MetadataWorkUnit | ||||
| from datahub.metadata.schema_classes import ( | ||||
|     DatasetProfileClass, | ||||
|     QueryPropertiesClass, | ||||
|     QuerySubjectsClass, | ||||
|     SchemaFieldClass, | ||||
|     SchemaMetadataClass, | ||||
|     UpstreamLineageClass, | ||||
| ) | ||||
| 
 | ||||
| if TYPE_CHECKING: | ||||
|     from datahub.ingestion.api.source import SourceReport | ||||
| 
 | ||||
| 
 | ||||
| # TODO: ordering | ||||
| # When we trim collections of data (e.g. schema fields, upstream lineage, query subjects), those collections are | ||||
| # typically unordered, so we should consider sorting them by some criteria (e.g. size, alphabetically) to make the | ||||
| # trimming deterministic, predictable and, more importantly, consistent across executions. | ||||
| # This matters most for schemaMetadata: we may currently trim fields while adding nested ones, which can lead to | ||||
| # poor schema rendering in the UI. | ||||
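As a sketch of the ordering idea in the TODO above (editorial, not part of this commit), items could be sorted by a stable key before the greedy size check so repeated runs trim the same elements; the key below (serialized size, then the serialized form itself) is only one possible choice and reuses this module's json and pre_json_transform imports:

def deterministic_order(items):
    # Stable, content-based ordering so trimming drops the same items on every run.
    def sort_key(item):
        serialized = json.dumps(pre_json_transform(item.to_obj()), sort_keys=True)
        return (len(serialized), serialized)

    return sorted(items, key=sort_key)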
| 
 | ||||
| logger = logging.getLogger(__name__) | ||||
| 
 | ||||
| DEFAULT_QUERY_PROPERTIES_STATEMENT_MAX_PAYLOAD_BYTES = 5 * 1024 * 1024  # 5MB | ||||
| QUERY_PROPERTIES_STATEMENT_MAX_PAYLOAD_BYTES = int( | ||||
|     os.environ.get( | ||||
|         "QUERY_PROPERTIES_STATEMENT_MAX_PAYLOAD_BYTES", | ||||
|         DEFAULT_QUERY_PROPERTIES_STATEMENT_MAX_PAYLOAD_BYTES, | ||||
|     ) | ||||
| ) | ||||
| 
 | ||||
| QUERY_STATEMENT_TRUNCATION_BUFFER = 100 | ||||
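# Illustrative note (editorial, not part of the commit): the statement cap can be lowered for a
# constrained deployment by setting QUERY_PROPERTIES_STATEMENT_MAX_PAYLOAD_BYTES in the environment
# before this module is imported, e.g. QUERY_PROPERTIES_STATEMENT_MAX_PAYLOAD_BYTES=1048576 for 1 MiB.
# Whatever value is configured, ensure_query_properties_size below effectively caps it at
# min(QUERY_PROPERTIES_STATEMENT_MAX_PAYLOAD_BYTES, INGEST_MAX_PAYLOAD_BYTES), as its docstring describes.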
| 
 | ||||
| 
 | ||||
| class EnsureAspectSizeProcessor: | ||||
|     def __init__( | ||||
| @ -81,6 +103,274 @@ class EnsureAspectSizeProcessor: | ||||
| 
 | ||||
|         schema.fields = accepted_fields | ||||
| 
 | ||||
|     def ensure_query_subjects_size( | ||||
|         self, entity_urn: str, query_subjects: QuerySubjectsClass | ||||
|     ) -> None: | ||||
|         """ | ||||
|         Ensure query subjects aspect does not exceed allowed size by removing column-level lineage first, | ||||
|         then table-level lineage if necessary. | ||||
|         """ | ||||
|         if not query_subjects.subjects: | ||||
|             return | ||||
| 
 | ||||
|         total_subjects_size = 0 | ||||
|         accepted_table_level_subjects = [] | ||||
|         accepted_column_level_subjects = [] | ||||
|         column_level_subjects_with_sizes = [] | ||||
|         table_level_subjects_with_sizes = [] | ||||
| 
 | ||||
|         # Separate column-level and table-level subjects | ||||
|         for subject in query_subjects.subjects: | ||||
|             subject_size = len(json.dumps(pre_json_transform(subject.to_obj()))) | ||||
| 
 | ||||
|             if subject.entity.startswith("urn:li:schemaField:"): | ||||
|                 column_level_subjects_with_sizes.append((subject, subject_size)) | ||||
|             else: | ||||
|                 table_level_subjects_with_sizes.append((subject, subject_size)) | ||||
| 
 | ||||
|         # Once we find one that doesn't fit, stop everything else to prevent inconsistencies | ||||
|         first_skip_done = False | ||||
| 
 | ||||
|         # First, try to include all table-level subjects | ||||
|         for subject, subject_size in table_level_subjects_with_sizes: | ||||
|             if total_subjects_size + subject_size < self.payload_constraint: | ||||
|                 accepted_table_level_subjects.append(subject) | ||||
|                 total_subjects_size += subject_size | ||||
|             else: | ||||
|                 first_skip_done = True | ||||
|                 break | ||||
| 
 | ||||
|         # Then, add column-level subjects if there's remaining space | ||||
|         # Only process if we successfully included all table-level subjects | ||||
|         if not first_skip_done: | ||||
|             for subject, subject_size in column_level_subjects_with_sizes: | ||||
|                 if total_subjects_size + subject_size < self.payload_constraint: | ||||
|                     accepted_column_level_subjects.append(subject) | ||||
|                     total_subjects_size += subject_size | ||||
|                 else: | ||||
|                     first_skip_done = True | ||||
|                     break | ||||
| 
 | ||||
|         if first_skip_done: | ||||
|             # Log aggregate warnings | ||||
|             table_level_skipped_count = len(table_level_subjects_with_sizes) - len( | ||||
|                 accepted_table_level_subjects | ||||
|             ) | ||||
|             column_level_skipped_count = len(column_level_subjects_with_sizes) - len( | ||||
|                 accepted_column_level_subjects | ||||
|             ) | ||||
| 
 | ||||
|             self._maybe_warn_query_subjects( | ||||
|                 entity_urn, table_level_skipped_count, "table-level lineage subjects" | ||||
|             ) | ||||
|             self._maybe_warn_query_subjects( | ||||
|                 entity_urn, column_level_skipped_count, "column-level lineage subjects" | ||||
|             ) | ||||
| 
 | ||||
|         query_subjects.subjects = ( | ||||
|             accepted_table_level_subjects + accepted_column_level_subjects | ||||
|         ) | ||||
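        # Note (editorial, not part of the commit): the greedy fill above adds table-level subjects
        # until one no longer fits; if that happens, the remaining table-level and all column-level
        # subjects are dropped. Column-level subjects are only considered once every table-level
        # subject fits, and are likewise kept only as a leading prefix. ensure_upstream_lineage_size
        # below applies the same pattern across four priority tiers.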
| 
 | ||||
|     def _maybe_warn_query_subjects( | ||||
|         self, entity_urn: str, skipped_count: int, item_type: str | ||||
|     ) -> None: | ||||
|         """Log warning for query subjects truncation if any items were skipped.""" | ||||
|         if skipped_count > 0: | ||||
|             self.report.warning( | ||||
|                 title="Query subjects truncated due to size constraint", | ||||
|                 message="Query subjects contained too much data and would have caused ingestion to fail", | ||||
|                 context=f"Skipped {skipped_count} {item_type} for {entity_urn} due to aspect size constraints", | ||||
|             ) | ||||
| 
 | ||||
|     def _maybe_warn_upstream_lineage( | ||||
|         self, entity_urn: str, skipped_count: int, item_type: str | ||||
|     ) -> None: | ||||
|         """Log warning for upstream lineage truncation if any items were skipped.""" | ||||
|         if skipped_count > 0: | ||||
|             self.report.warning( | ||||
|                 title="Upstream lineage truncated due to size constraint", | ||||
|                 message="Upstream lineage contained too much data and would have caused ingestion to fail", | ||||
|                 context=f"Skipped {skipped_count} {item_type} for {entity_urn} due to aspect size constraints", | ||||
|             ) | ||||
| 
 | ||||
|     def ensure_upstream_lineage_size(  # noqa: C901 | ||||
|         self, entity_urn: str, upstream_lineage: UpstreamLineageClass | ||||
|     ) -> None: | ||||
|         """ | ||||
|         Ensure upstream lineage aspect does not exceed allowed size by removing lineage in priority order: | ||||
|         first NONE fine-grained lineages (lowest priority), then FIELD_SET fine-grained lineages, | ||||
|         then DATASET fine-grained lineages, and finally upstreams (highest priority). | ||||
|         """ | ||||
|         if not upstream_lineage.fineGrainedLineages and not upstream_lineage.upstreams: | ||||
|             return | ||||
| 
 | ||||
|         total_lineage_size = 0 | ||||
|         accepted_upstreams = [] | ||||
|         accepted_dataset_fg_lineages = [] | ||||
|         accepted_field_set_fg_lineages = [] | ||||
|         accepted_none_fg_lineages = [] | ||||
|         upstream_items_with_sizes = [] | ||||
|         dataset_fg_items_with_sizes = [] | ||||
|         field_set_fg_items_with_sizes = [] | ||||
|         none_fg_items_with_sizes = [] | ||||
| 
 | ||||
|         # Add upstreams (highest priority) | ||||
|         if upstream_lineage.upstreams: | ||||
|             for upstream in upstream_lineage.upstreams: | ||||
|                 upstream_size = len(json.dumps(pre_json_transform(upstream.to_obj()))) | ||||
|                 upstream_items_with_sizes.append((upstream, upstream_size)) | ||||
| 
 | ||||
|         # Separate fine-grained lineage items by upstreamType: DATASET > FIELD_SET > NONE | ||||
|         if upstream_lineage.fineGrainedLineages: | ||||
|             for fg_lineage in upstream_lineage.fineGrainedLineages: | ||||
|                 fg_lineage_size = len( | ||||
|                     json.dumps(pre_json_transform(fg_lineage.to_obj())) | ||||
|                 ) | ||||
| 
 | ||||
|                 upstream_type_str = str(fg_lineage.upstreamType) | ||||
|                 if upstream_type_str == "DATASET": | ||||
|                     dataset_fg_items_with_sizes.append((fg_lineage, fg_lineage_size)) | ||||
|                 elif upstream_type_str == "FIELD_SET": | ||||
|                     field_set_fg_items_with_sizes.append((fg_lineage, fg_lineage_size)) | ||||
|                 elif upstream_type_str == "NONE": | ||||
|                     none_fg_items_with_sizes.append((fg_lineage, fg_lineage_size)) | ||||
| 
 | ||||
|         # Once we find one that doesn't fit, stop everything else to prevent inconsistencies | ||||
|         first_skip_done = False | ||||
| 
 | ||||
|         # First, include all upstreams (highest priority) | ||||
|         for item, item_size in upstream_items_with_sizes: | ||||
|             if total_lineage_size + item_size < self.payload_constraint: | ||||
|                 accepted_upstreams.append(item) | ||||
|                 total_lineage_size += item_size | ||||
|             else: | ||||
|                 first_skip_done = True | ||||
|                 break | ||||
| 
 | ||||
|         # Second, include DATASET fine-grained lineages if no upstreams were skipped | ||||
|         if not first_skip_done: | ||||
|             for fg_lineage, fg_lineage_size in dataset_fg_items_with_sizes: | ||||
|                 if total_lineage_size + fg_lineage_size < self.payload_constraint: | ||||
|                     accepted_dataset_fg_lineages.append(fg_lineage) | ||||
|                     total_lineage_size += fg_lineage_size | ||||
|                 else: | ||||
|                     first_skip_done = True | ||||
|                     break | ||||
| 
 | ||||
|         # Third, include FIELD_SET fine-grained lineages if no higher priority items were skipped | ||||
|         if not first_skip_done: | ||||
|             for fg_lineage, fg_lineage_size in field_set_fg_items_with_sizes: | ||||
|                 if total_lineage_size + fg_lineage_size < self.payload_constraint: | ||||
|                     accepted_field_set_fg_lineages.append(fg_lineage) | ||||
|                     total_lineage_size += fg_lineage_size | ||||
|                 else: | ||||
|                     first_skip_done = True | ||||
|                     break | ||||
| 
 | ||||
|         # Finally, include NONE fine-grained lineages if no higher priority items were skipped | ||||
|         if not first_skip_done: | ||||
|             for fg_lineage, fg_lineage_size in none_fg_items_with_sizes: | ||||
|                 if total_lineage_size + fg_lineage_size < self.payload_constraint: | ||||
|                     accepted_none_fg_lineages.append(fg_lineage) | ||||
|                     total_lineage_size += fg_lineage_size | ||||
|                 else: | ||||
|                     first_skip_done = True | ||||
|                     break | ||||
| 
 | ||||
|         # Log aggregate warnings instead of per-item warnings | ||||
|         if first_skip_done: | ||||
|             upstreams_skipped_count = len(upstream_items_with_sizes) - len( | ||||
|                 accepted_upstreams | ||||
|             ) | ||||
|             dataset_fg_skipped_count = len(dataset_fg_items_with_sizes) - len( | ||||
|                 accepted_dataset_fg_lineages | ||||
|             ) | ||||
|             field_set_fg_skipped_count = len(field_set_fg_items_with_sizes) - len( | ||||
|                 accepted_field_set_fg_lineages | ||||
|             ) | ||||
|             none_fg_skipped_count = len(none_fg_items_with_sizes) - len( | ||||
|                 accepted_none_fg_lineages | ||||
|             ) | ||||
| 
 | ||||
|             self._maybe_warn_upstream_lineage( | ||||
|                 entity_urn, upstreams_skipped_count, "upstream datasets" | ||||
|             ) | ||||
|             self._maybe_warn_upstream_lineage( | ||||
|                 entity_urn, | ||||
|                 dataset_fg_skipped_count, | ||||
|                 "dataset-level fine-grained lineages", | ||||
|             ) | ||||
|             self._maybe_warn_upstream_lineage( | ||||
|                 entity_urn, | ||||
|                 field_set_fg_skipped_count, | ||||
|                 "field-set-level fine-grained lineages", | ||||
|             ) | ||||
|             self._maybe_warn_upstream_lineage( | ||||
|                 entity_urn, none_fg_skipped_count, "none-level fine-grained lineages" | ||||
|             ) | ||||
| 
 | ||||
|         # Combine all accepted fine-grained lineages | ||||
|         accepted_fine_grained_lineages = ( | ||||
|             accepted_dataset_fg_lineages | ||||
|             + accepted_field_set_fg_lineages | ||||
|             + accepted_none_fg_lineages | ||||
|         ) | ||||
| 
 | ||||
|         upstream_lineage.upstreams = accepted_upstreams | ||||
|         upstream_lineage.fineGrainedLineages = ( | ||||
|             accepted_fine_grained_lineages if accepted_fine_grained_lineages else None | ||||
|         ) | ||||
| 
 | ||||
|     def ensure_query_properties_size( | ||||
|         self, entity_urn: str, query_properties: QueryPropertiesClass | ||||
|     ) -> None: | ||||
|         """ | ||||
|         Ensure query properties aspect does not exceed allowed size by truncating the query statement value. | ||||
|         Uses a configurable max payload size that is the minimum between QUERY_PROPERTIES_STATEMENT_MAX_PAYLOAD_BYTES | ||||
|         and INGEST_MAX_PAYLOAD_BYTES. | ||||
| 
 | ||||
|         We have found surprisingly large query statements (e.g. 20MB+) that caused ingestion to fail; | ||||
|         these were INSERT INTO ... VALUES statements with huge lists of values. | ||||
|         """ | ||||
|         if not query_properties.statement or not query_properties.statement.value: | ||||
|             return | ||||
| 
 | ||||
|         max_payload_size = min( | ||||
|             QUERY_PROPERTIES_STATEMENT_MAX_PAYLOAD_BYTES, self.payload_constraint | ||||
|         ) | ||||
| 
 | ||||
|         current_size = len(json.dumps(pre_json_transform(query_properties.to_obj()))) | ||||
| 
 | ||||
|         if current_size < max_payload_size: | ||||
|             return | ||||
| 
 | ||||
|         reduction_needed = ( | ||||
|             current_size - max_payload_size + QUERY_STATEMENT_TRUNCATION_BUFFER | ||||
|         ) | ||||
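        # Illustrative numbers (editorial): with the default 5 MiB cap and an aspect that serializes
        # to ~5.5 MiB, reduction_needed is ~0.5 MiB plus the 100-byte buffer, i.e. roughly the last
        # 524,388 characters of the statement are removed before the truncation notice is appended.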
| 
 | ||||
|         statement_value_size = len(query_properties.statement.value) | ||||
|         original_statement_size = statement_value_size | ||||
| 
 | ||||
|         # Only truncate if reduction is actually needed and possible | ||||
|         if statement_value_size > reduction_needed > 0: | ||||
|             new_statement_length = statement_value_size - reduction_needed | ||||
|             truncated_statement = query_properties.statement.value[ | ||||
|                 :new_statement_length | ||||
|             ] | ||||
| 
 | ||||
|             truncation_message = f"... [original value was {original_statement_size} bytes and truncated to {new_statement_length} bytes]" | ||||
|             query_properties.statement.value = truncated_statement + truncation_message | ||||
| 
 | ||||
|             self.report.warning( | ||||
|                 title="Query properties truncated due to size constraint", | ||||
|                 message="Query properties contained too much data and would have caused ingestion to fail", | ||||
|                 context=f"Query statement was truncated from {original_statement_size} to {new_statement_length} characters for {entity_urn} due to aspect size constraints", | ||||
|             ) | ||||
|         else: | ||||
|             logger.warning( | ||||
|                 f"Cannot truncate query statement for {entity_urn} as it is smaller than or equal to the required reduction size {reduction_needed}. This means 'ensure_query_properties_size' must be extended to trim fields other than the statement." | ||||
|             ) | ||||
| 
 | ||||
|     def ensure_aspect_size( | ||||
|         self, | ||||
|         stream: Iterable[MetadataWorkUnit], | ||||
| @ -96,4 +386,10 @@ class EnsureAspectSizeProcessor: | ||||
|                 self.ensure_schema_metadata_size(wu.get_urn(), schema) | ||||
|             elif profile := wu.get_aspect_of_type(DatasetProfileClass): | ||||
|                 self.ensure_dataset_profile_size(wu.get_urn(), profile) | ||||
|             elif query_subjects := wu.get_aspect_of_type(QuerySubjectsClass): | ||||
|                 self.ensure_query_subjects_size(wu.get_urn(), query_subjects) | ||||
|             elif upstream_lineage := wu.get_aspect_of_type(UpstreamLineageClass): | ||||
|                 self.ensure_upstream_lineage_size(wu.get_urn(), upstream_lineage) | ||||
|             elif query_properties := wu.get_aspect_of_type(QueryPropertiesClass): | ||||
|                 self.ensure_query_properties_size(wu.get_urn(), query_properties) | ||||
|             yield wu | ||||
|  | ||||
| @ -15,20 +15,33 @@ from datahub.ingestion.api.source import SourceReport | ||||
| from datahub.ingestion.api.workunit import MetadataWorkUnit | ||||
| from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent | ||||
| from datahub.metadata.schema_classes import ( | ||||
|     AuditStampClass, | ||||
|     ChangeTypeClass, | ||||
|     DatasetFieldProfileClass, | ||||
|     DatasetLineageTypeClass, | ||||
|     DatasetProfileClass, | ||||
|     DatasetSnapshotClass, | ||||
|     FineGrainedLineageClass, | ||||
|     FineGrainedLineageDownstreamTypeClass, | ||||
|     FineGrainedLineageUpstreamTypeClass, | ||||
|     GenericAspectClass, | ||||
|     MetadataChangeProposalClass, | ||||
|     NumberTypeClass, | ||||
|     OtherSchemaClass, | ||||
|     QueryLanguageClass, | ||||
|     QueryPropertiesClass, | ||||
|     QuerySourceClass, | ||||
|     QueryStatementClass, | ||||
|     QuerySubjectClass, | ||||
|     QuerySubjectsClass, | ||||
|     SchemaFieldClass, | ||||
|     SchemaFieldDataTypeClass, | ||||
|     SchemaMetadataClass, | ||||
|     StatusClass, | ||||
|     StringTypeClass, | ||||
|     SubTypesClass, | ||||
|     UpstreamClass, | ||||
|     UpstreamLineageClass, | ||||
| ) | ||||
| 
 | ||||
| 
 | ||||
| @ -112,6 +125,192 @@ def proper_schema_metadata() -> SchemaMetadataClass: | ||||
|     ) | ||||
| 
 | ||||
| 
 | ||||
| def proper_query_subjects() -> QuerySubjectsClass: | ||||
|     subjects = [ | ||||
|         QuerySubjectClass( | ||||
|             entity="urn:li:dataset:(urn:li:dataPlatform:hive,db1.table1,PROD)" | ||||
|         ), | ||||
|         QuerySubjectClass( | ||||
|             entity="urn:li:dataset:(urn:li:dataPlatform:hive,db1.table2,PROD)" | ||||
|         ), | ||||
|         QuerySubjectClass( | ||||
|             entity="urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:hive,db1.table1,PROD),col1)" | ||||
|         ), | ||||
|         QuerySubjectClass( | ||||
|             entity="urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:hive,db1.table2,PROD),col2)" | ||||
|         ), | ||||
|     ] | ||||
|     return QuerySubjectsClass(subjects=subjects) | ||||
| 
 | ||||
| 
 | ||||
| def too_big_query_subjects() -> QuerySubjectsClass: | ||||
|     subjects = [] | ||||
| 
 | ||||
|     # Add a few table-level subjects | ||||
|     for i in range(5): | ||||
|         subjects.append( | ||||
|             QuerySubjectClass( | ||||
|                 entity=f"urn:li:dataset:(urn:li:dataPlatform:hive,db.table{i},PROD)" | ||||
|             ) | ||||
|         ) | ||||
| 
 | ||||
|     # Add many column-level subjects with very large entity URNs to exceed the 15MB constraint | ||||
|     # Each URN will be about 40KB, so 500 subjects should create ~20MB of data | ||||
|     for i in range(500): | ||||
|         large_table_name = "a" * 20000  # Very large table name | ||||
|         large_column_name = "b" * 20000  # Very large column name | ||||
|         subjects.append( | ||||
|             QuerySubjectClass( | ||||
|                 entity=f"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:hive,{large_table_name}_{i},PROD),{large_column_name}_{i})" | ||||
|             ) | ||||
|         ) | ||||
| 
 | ||||
|     return QuerySubjectsClass(subjects=subjects) | ||||
| 
 | ||||
| 
 | ||||
| def proper_upstream_lineage() -> UpstreamLineageClass: | ||||
|     upstreams = [ | ||||
|         UpstreamClass( | ||||
|             dataset="urn:li:dataset:(urn:li:dataPlatform:hive,db1.table1,PROD)", | ||||
|             type=DatasetLineageTypeClass.TRANSFORMED, | ||||
|         ), | ||||
|         UpstreamClass( | ||||
|             dataset="urn:li:dataset:(urn:li:dataPlatform:hive,db1.table2,PROD)", | ||||
|             type=DatasetLineageTypeClass.TRANSFORMED, | ||||
|         ), | ||||
|     ] | ||||
|     fine_grained_lineages = [ | ||||
|         FineGrainedLineageClass( | ||||
|             upstreamType=FineGrainedLineageUpstreamTypeClass.DATASET, | ||||
|             downstreamType=FineGrainedLineageDownstreamTypeClass.FIELD, | ||||
|             upstreams=["urn:li:dataset:(urn:li:dataPlatform:hive,db1.table3,PROD)"], | ||||
|             downstreams=[ | ||||
|                 "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:hive,db1.target,PROD),col1)" | ||||
|             ], | ||||
|         ), | ||||
|         FineGrainedLineageClass( | ||||
|             upstreamType=FineGrainedLineageUpstreamTypeClass.FIELD_SET, | ||||
|             downstreamType=FineGrainedLineageDownstreamTypeClass.FIELD, | ||||
|             upstreams=[ | ||||
|                 "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:hive,db1.table4,PROD),col2)" | ||||
|             ], | ||||
|             downstreams=[ | ||||
|                 "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:hive,db1.target,PROD),col2)" | ||||
|             ], | ||||
|         ), | ||||
|     ] | ||||
|     return UpstreamLineageClass( | ||||
|         upstreams=upstreams, fineGrainedLineages=fine_grained_lineages | ||||
|     ) | ||||
| 
 | ||||
| 
 | ||||
| def too_big_upstream_lineage() -> UpstreamLineageClass: | ||||
|     upstreams = [] | ||||
|     fine_grained_lineages = [] | ||||
| 
 | ||||
|     # Add upstreams (highest priority) | ||||
|     for i in range(5): | ||||
|         upstreams.append( | ||||
|             UpstreamClass( | ||||
|                 dataset=f"urn:li:dataset:(urn:li:dataPlatform:hive,upstream_table_{i},PROD)", | ||||
|                 type=DatasetLineageTypeClass.TRANSFORMED, | ||||
|             ) | ||||
|         ) | ||||
| 
 | ||||
|     # Add DATASET fine-grained lineages with large URNs | ||||
|     for i in range(200): | ||||
|         large_dataset_name = "a" * 20000 | ||||
|         large_downstream_name = "b" * 20000 | ||||
|         fine_grained_lineages.append( | ||||
|             FineGrainedLineageClass( | ||||
|                 upstreamType=FineGrainedLineageUpstreamTypeClass.DATASET, | ||||
|                 downstreamType=FineGrainedLineageDownstreamTypeClass.FIELD, | ||||
|                 upstreams=[ | ||||
|                     f"urn:li:dataset:(urn:li:dataPlatform:hive,{large_dataset_name}_{i},PROD)" | ||||
|                 ], | ||||
|                 downstreams=[ | ||||
|                     f"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:hive,target,PROD),{large_downstream_name}_{i})" | ||||
|                 ], | ||||
|             ) | ||||
|         ) | ||||
| 
 | ||||
|     # Add FIELD_SET fine-grained lineages with large URNs | ||||
|     for i in range(200): | ||||
|         large_upstream_name = "c" * 20000 | ||||
|         large_downstream_name = "d" * 20000 | ||||
|         fine_grained_lineages.append( | ||||
|             FineGrainedLineageClass( | ||||
|                 upstreamType=FineGrainedLineageUpstreamTypeClass.FIELD_SET, | ||||
|                 downstreamType=FineGrainedLineageDownstreamTypeClass.FIELD, | ||||
|                 upstreams=[ | ||||
|                     f"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:hive,source,PROD),{large_upstream_name}_{i})" | ||||
|                 ], | ||||
|                 downstreams=[ | ||||
|                     f"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:hive,target,PROD),{large_downstream_name}_{i})" | ||||
|                 ], | ||||
|             ) | ||||
|         ) | ||||
| 
 | ||||
|     # Add NONE fine-grained lineages with large URNs (lowest priority) | ||||
|     for i in range(200): | ||||
|         large_downstream_name = "e" * 20000 | ||||
|         fine_grained_lineages.append( | ||||
|             FineGrainedLineageClass( | ||||
|                 upstreamType=FineGrainedLineageUpstreamTypeClass.NONE, | ||||
|                 downstreamType=FineGrainedLineageDownstreamTypeClass.FIELD, | ||||
|                 downstreams=[ | ||||
|                     f"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:hive,target,PROD),{large_downstream_name}_{i})" | ||||
|                 ], | ||||
|             ) | ||||
|         ) | ||||
| 
 | ||||
|     return UpstreamLineageClass( | ||||
|         upstreams=upstreams, fineGrainedLineages=fine_grained_lineages | ||||
|     ) | ||||
| 
 | ||||
| 
 | ||||
| def proper_query_properties() -> QueryPropertiesClass: | ||||
|     # Create a query properties with a reasonably sized statement (~1KB) | ||||
|     query_statement = ( | ||||
|         "SELECT * FROM table1 WHERE column1 = 'value' AND column2 > 100;" * 20 | ||||
|     ) | ||||
| 
 | ||||
|     return QueryPropertiesClass( | ||||
|         statement=QueryStatementClass( | ||||
|             value=query_statement, | ||||
|             language=QueryLanguageClass.SQL, | ||||
|         ), | ||||
|         source=QuerySourceClass.SYSTEM, | ||||
|         created=AuditStampClass(time=1000000000000, actor="urn:li:corpuser:test"), | ||||
|         lastModified=AuditStampClass(time=1000000000000, actor="urn:li:corpuser:test"), | ||||
|     ) | ||||
| 
 | ||||
| 
 | ||||
| def too_big_query_properties() -> QueryPropertiesClass: | ||||
|     # Create a query properties with a very large statement (~6MB, exceeding the 5MB default limit) | ||||
|     # This is larger than the QUERY_PROPERTIES_STATEMENT_MAX_PAYLOAD_BYTES default (5MB) | ||||
|     large_query_statement = ( | ||||
|         "SELECT col1, col2, col3, col4, col5, col6, col7, col8, col9, col10, " | ||||
|         "col11, col12, col13, col14, col15, col16, col17, col18, col19, col20 " | ||||
|         "FROM very_long_table_name_with_lots_of_characters_to_make_it_big " | ||||
|         "WHERE condition1 = 'some_very_long_value_with_lots_of_text' " | ||||
|         "AND condition2 IN ('value1', 'value2', 'value3', 'value4') " | ||||
|         "ORDER BY col1, col2, col3, col4, col5 LIMIT 1000;" | ||||
|     ) * 15000  # ~6MB | ||||
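    # Rough arithmetic (editorial): the repeated block is ~375 characters, so 15000 repetitions give
    # roughly 5.6MB of statement text, which the assertion in the test below treats as ~5.5MiB
    # (within 10%) and which comfortably exceeds the 5MB default statement cap.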
| 
 | ||||
|     return QueryPropertiesClass( | ||||
|         statement=QueryStatementClass( | ||||
|             value=large_query_statement, | ||||
|             language=QueryLanguageClass.SQL, | ||||
|         ), | ||||
|         source=QuerySourceClass.SYSTEM, | ||||
|         created=AuditStampClass(time=1000000000000, actor="urn:li:corpuser:test"), | ||||
|         lastModified=AuditStampClass(time=1000000000000, actor="urn:li:corpuser:test"), | ||||
|         name="Large Test Query", | ||||
|         description="A test query with a very large statement", | ||||
|     ) | ||||
| 
 | ||||
| 
 | ||||
| def proper_dataset_profile() -> DatasetProfileClass: | ||||
|     sample_values = [ | ||||
|         "23483295", | ||||
| @ -344,3 +543,254 @@ def test_wu_processor_not_triggered_by_unhandled_aspects( | ||||
|     ] | ||||
|     ensure_schema_metadata_size_mock.assert_not_called() | ||||
|     ensure_dataset_profile_size_mock.assert_not_called() | ||||
| 
 | ||||
| 
 | ||||
| @freeze_time("2023-01-02 00:00:00") | ||||
| def test_ensure_size_of_proper_query_subjects(processor): | ||||
|     query_subjects = proper_query_subjects() | ||||
|     orig_repr = json.dumps(query_subjects.to_obj()) | ||||
|     processor.ensure_query_subjects_size( | ||||
|         "urn:li:query:(urn:li:dataPlatform:hive, dummy_query, DEV)", query_subjects | ||||
|     ) | ||||
|     assert orig_repr == json.dumps(query_subjects.to_obj()), ( | ||||
|         "Aspect was modified in case where workunit processor should have been no-op" | ||||
|     ) | ||||
| 
 | ||||
| 
 | ||||
| @freeze_time("2023-01-02 00:00:00") | ||||
| def test_ensure_size_of_too_big_query_subjects(processor): | ||||
|     query_subjects = too_big_query_subjects() | ||||
|     assert len(query_subjects.subjects) == 505  # 5 table + 500 column subjects | ||||
| 
 | ||||
|     # Verify that the initial size exceeds the default payload constraint | ||||
|     initial_size = len(json.dumps(query_subjects.to_obj())) | ||||
|     expected_size = 20 * 1024 * 1024  # 20MB | ||||
|     assert initial_size == pytest.approx(expected_size, rel=0.05), ( | ||||
|         f"Initial size {initial_size} should be around 20MB (±5%), got {initial_size / (1024 * 1024):.1f}MB" | ||||
|     ) | ||||
|     assert initial_size > INGEST_MAX_PAYLOAD_BYTES, ( | ||||
|         f"Initial size {initial_size} should exceed payload constraint {INGEST_MAX_PAYLOAD_BYTES}" | ||||
|     ) | ||||
| 
 | ||||
|     processor.ensure_query_subjects_size( | ||||
|         "urn:li:query:(urn:li:dataPlatform:hive, dummy_query, DEV)", query_subjects | ||||
|     ) | ||||
| 
 | ||||
|     # Should be significantly reduced due to size constraints | ||||
|     # With ~20MB of data needing to be reduced to ~15MB, we expect ~25% reduction (125 subjects) | ||||
|     # So final count should be around 380, using 400 as upper bound with buffer | ||||
|     assert len(query_subjects.subjects) < 400, ( | ||||
|         "Query subjects has not been properly truncated" | ||||
|     ) | ||||
| 
 | ||||
|     # Check that table-level subjects are prioritized (should still be present) | ||||
|     table_subjects = [ | ||||
|         s | ||||
|         for s in query_subjects.subjects | ||||
|         if not s.entity.startswith("urn:li:schemaField:") | ||||
|     ] | ||||
|     assert len(table_subjects) > 0, ( | ||||
|         "Table-level subjects should be prioritized and present" | ||||
|     ) | ||||
| 
 | ||||
|     # The aspect should not exceed acceptable size | ||||
|     assert len(json.dumps(query_subjects.to_obj())) < INGEST_MAX_PAYLOAD_BYTES, ( | ||||
|         "Aspect exceeded acceptable size" | ||||
|     ) | ||||
| 
 | ||||
| 
 | ||||
| @freeze_time("2023-01-02 00:00:00") | ||||
| @patch( | ||||
|     "datahub.ingestion.api.auto_work_units.auto_ensure_aspect_size.EnsureAspectSizeProcessor.ensure_query_subjects_size" | ||||
| ) | ||||
| def test_wu_processor_triggered_by_query_subjects_aspect( | ||||
|     ensure_query_subjects_size_mock, processor | ||||
| ): | ||||
|     ret = [  # noqa: F841 | ||||
|         *processor.ensure_aspect_size( | ||||
|             [ | ||||
|                 MetadataChangeProposalWrapper( | ||||
|                     entityUrn="urn:li:query:(urn:li:dataPlatform:hive, dummy_query, DEV)", | ||||
|                     aspect=proper_query_subjects(), | ||||
|                 ).as_workunit() | ||||
|             ] | ||||
|         ) | ||||
|     ] | ||||
|     ensure_query_subjects_size_mock.assert_called_once() | ||||
| 
 | ||||
| 
 | ||||
| @freeze_time("2023-01-02 00:00:00") | ||||
| def test_ensure_size_of_proper_upstream_lineage(processor): | ||||
|     upstream_lineage = proper_upstream_lineage() | ||||
|     orig_repr = json.dumps(upstream_lineage.to_obj()) | ||||
|     processor.ensure_upstream_lineage_size( | ||||
|         "urn:li:dataset:(urn:li:dataPlatform:hive, dummy_dataset, DEV)", | ||||
|         upstream_lineage, | ||||
|     ) | ||||
|     assert orig_repr == json.dumps(upstream_lineage.to_obj()), ( | ||||
|         "Aspect was modified in case where workunit processor should have been no-op" | ||||
|     ) | ||||
| 
 | ||||
| 
 | ||||
| @freeze_time("2023-01-02 00:00:00") | ||||
| def test_ensure_size_of_too_big_upstream_lineage(processor): | ||||
|     upstream_lineage = too_big_upstream_lineage() | ||||
|     assert len(upstream_lineage.upstreams) == 5  # 5 upstreams | ||||
|     assert upstream_lineage.fineGrainedLineages is not None | ||||
|     assert ( | ||||
|         len(upstream_lineage.fineGrainedLineages) == 600 | ||||
|     )  # 200 DATASET + 200 FIELD_SET + 200 NONE | ||||
| 
 | ||||
|     # Verify that the initial size exceeds the default payload constraint | ||||
|     initial_size = len(json.dumps(upstream_lineage.to_obj())) | ||||
|     expected_size = 20 * 1024 * 1024  # 20MB | ||||
|     assert initial_size == pytest.approx(expected_size, rel=0.05), ( | ||||
|         f"Initial size {initial_size} should be around 20MB (±5%), got {initial_size / (1024 * 1024):.1f}MB" | ||||
|     ) | ||||
|     assert initial_size > INGEST_MAX_PAYLOAD_BYTES, ( | ||||
|         f"Initial size {initial_size} should exceed payload constraint {INGEST_MAX_PAYLOAD_BYTES}" | ||||
|     ) | ||||
| 
 | ||||
|     processor.ensure_upstream_lineage_size( | ||||
|         "urn:li:dataset:(urn:li:dataPlatform:hive, dummy_dataset, DEV)", | ||||
|         upstream_lineage, | ||||
|     ) | ||||
| 
 | ||||
|     # Should be significantly reduced due to size constraints | ||||
|     # With ~20MB of data needing to be reduced to ~15MB, we expect ~25% reduction | ||||
|     # Total items: 5 upstreams + 600 fine-grained = 605, expect around ~450 after 25% reduction | ||||
|     total_items = len(upstream_lineage.upstreams) + ( | ||||
|         len(upstream_lineage.fineGrainedLineages) | ||||
|         if upstream_lineage.fineGrainedLineages | ||||
|         else 0 | ||||
|     ) | ||||
|     assert total_items < 500, "Upstream lineage has not been properly truncated" | ||||
| 
 | ||||
|     # Check that upstreams are prioritized (should still be present) | ||||
|     assert len(upstream_lineage.upstreams) > 0, ( | ||||
|         "Upstreams should be prioritized and present" | ||||
|     ) | ||||
| 
 | ||||
|     # Check that DATASET fine-grained lineages are prioritized over FIELD_SET and NONE | ||||
|     if upstream_lineage.fineGrainedLineages: | ||||
|         dataset_count = sum( | ||||
|             1 | ||||
|             for fg in upstream_lineage.fineGrainedLineages | ||||
|             if str(fg.upstreamType) == "DATASET" | ||||
|         ) | ||||
|         field_set_count = sum( | ||||
|             1 | ||||
|             for fg in upstream_lineage.fineGrainedLineages | ||||
|             if str(fg.upstreamType) == "FIELD_SET" | ||||
|         ) | ||||
|         none_count = sum( | ||||
|             1 | ||||
|             for fg in upstream_lineage.fineGrainedLineages | ||||
|             if str(fg.upstreamType) == "NONE" | ||||
|         ) | ||||
| 
 | ||||
|         # DATASET should be prioritized over FIELD_SET and NONE | ||||
|         assert dataset_count >= field_set_count, ( | ||||
|             "DATASET fine-grained lineages should be prioritized" | ||||
|         ) | ||||
|         assert dataset_count >= none_count, ( | ||||
|             "DATASET fine-grained lineages should be prioritized over NONE" | ||||
|         ) | ||||
| 
 | ||||
|     # The aspect should not exceed acceptable size | ||||
|     assert len(json.dumps(upstream_lineage.to_obj())) < INGEST_MAX_PAYLOAD_BYTES, ( | ||||
|         "Aspect exceeded acceptable size" | ||||
|     ) | ||||
| 
 | ||||
| 
 | ||||
| @freeze_time("2023-01-02 00:00:00") | ||||
| @patch( | ||||
|     "datahub.ingestion.api.auto_work_units.auto_ensure_aspect_size.EnsureAspectSizeProcessor.ensure_upstream_lineage_size" | ||||
| ) | ||||
| def test_wu_processor_triggered_by_upstream_lineage_aspect( | ||||
|     ensure_upstream_lineage_size_mock, processor | ||||
| ): | ||||
|     ret = [  # noqa: F841 | ||||
|         *processor.ensure_aspect_size( | ||||
|             [ | ||||
|                 MetadataChangeProposalWrapper( | ||||
|                     entityUrn="urn:li:dataset:(urn:li:dataPlatform:hive, dummy_dataset, DEV)", | ||||
|                     aspect=proper_upstream_lineage(), | ||||
|                 ).as_workunit() | ||||
|             ] | ||||
|         ) | ||||
|     ] | ||||
|     ensure_upstream_lineage_size_mock.assert_called_once() | ||||
| 
 | ||||
| 
 | ||||
| @freeze_time("2023-01-02 00:00:00") | ||||
| def test_ensure_size_of_proper_query_properties(processor): | ||||
|     query_properties = proper_query_properties() | ||||
|     original_statement = query_properties.statement.value | ||||
| 
 | ||||
|     # Verify initial size is reasonable (under 5MB) | ||||
|     initial_size = len(json.dumps(query_properties.to_obj())) | ||||
|     assert initial_size < 5 * 1024 * 1024, "Test query properties should be under 5MB" | ||||
| 
 | ||||
|     processor.ensure_query_properties_size("urn:li:query:test", query_properties) | ||||
| 
 | ||||
|     # Statement should remain unchanged for properly sized query properties | ||||
|     assert query_properties.statement.value == original_statement | ||||
|     assert len(processor.report.warnings) == 0 | ||||
| 
 | ||||
| 
 | ||||
| @freeze_time("2023-01-02 00:00:00") | ||||
| def test_ensure_size_of_too_big_query_properties(processor): | ||||
|     query_properties = too_big_query_properties() | ||||
|     original_statement_size = len(query_properties.statement.value) | ||||
| 
 | ||||
|     # Verify the initial size is about 5.5MB and definitely exceeds the 5MB limit | ||||
|     initial_size = len(json.dumps(query_properties.to_obj())) | ||||
|     expected_initial_size = 5.5 * 1024 * 1024  # ~5.5MB | ||||
|     assert initial_size == pytest.approx(expected_initial_size, rel=0.1), ( | ||||
|         f"Expected initial size ~{expected_initial_size}, got {initial_size}" | ||||
|     ) | ||||
|     assert initial_size > 5 * 1024 * 1024, "Test data should exceed 5MB limit" | ||||
| 
 | ||||
|     processor.ensure_query_properties_size("urn:li:query:test", query_properties) | ||||
| 
 | ||||
|     # Statement should be truncated | ||||
|     assert len(query_properties.statement.value) < original_statement_size | ||||
| 
 | ||||
|     # Should contain truncation message | ||||
|     assert "... [original value was" in query_properties.statement.value | ||||
|     assert ( | ||||
|         f"{original_statement_size} bytes and truncated to" | ||||
|         in query_properties.statement.value | ||||
|     ) | ||||
|     assert query_properties.statement.value.endswith(" bytes]") | ||||
| 
 | ||||
|     # Final size should be within constraints, i.e. <= 5MB + buffer | ||||
|     final_size = len(json.dumps(query_properties.to_obj())) | ||||
|     expected_final_size = 5 * 1024 * 1024 + 100  # 5MB + buffer | ||||
|     assert final_size <= expected_final_size, ( | ||||
|         f"Final size {final_size} should be <= {expected_final_size}" | ||||
|     ) | ||||
| 
 | ||||
|     # Should have logged a warning | ||||
|     assert len(processor.report.warnings) == 1 | ||||
| 
 | ||||
| 
 | ||||
| @freeze_time("2023-01-02 00:00:00") | ||||
| @patch( | ||||
|     "datahub.ingestion.api.auto_work_units.auto_ensure_aspect_size.EnsureAspectSizeProcessor.ensure_query_properties_size" | ||||
| ) | ||||
| def test_wu_processor_triggered_by_query_properties_aspect( | ||||
|     ensure_query_properties_size_mock, processor | ||||
| ): | ||||
|     list( | ||||
|         processor.ensure_aspect_size( | ||||
|             [ | ||||
|                 MetadataChangeProposalWrapper( | ||||
|                     entityUrn="urn:li:query:test", | ||||
|                     aspect=proper_query_properties(), | ||||
|                 ).as_workunit() | ||||
|             ] | ||||
|         ) | ||||
|     ) | ||||
|     ensure_query_properties_size_mock.assert_called_once() | ||||
|  | ||||
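For orientation, a minimal sketch (editorial, mirroring the test wiring above) of how the new checks run end to end; `processor` stands in for the test fixture, i.e. an EnsureAspectSizeProcessor built from a SourceReport, and the URN is made up:

from datahub.emitter.mcp import MetadataChangeProposalWrapper

work_units = [
    MetadataChangeProposalWrapper(
        entityUrn="urn:li:query:example",
        aspect=too_big_query_properties(),
    ).as_workunit()
]

for wu in processor.ensure_aspect_size(work_units):
    # Oversized statements, subjects, and lineage entries are trimmed in place
    # before each workunit is yielded onward.
    ...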
Sergio Gómez Villamor