feat(ingest): ensure payload size constraints for queryProperties, querySubjects and upstreamLineage aspects (#14919)
Co-authored-by: Claude <noreply@anthropic.com>
commit e847b58472 (parent 40b51ac2da)
@@ -1,5 +1,6 @@
 import json
 import logging
+import os
 from typing import TYPE_CHECKING, Iterable, List
 
 from datahub.emitter.rest_emitter import INGEST_MAX_PAYLOAD_BYTES
@@ -7,15 +8,36 @@ from datahub.emitter.serialization_helper import pre_json_transform
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.metadata.schema_classes import (
     DatasetProfileClass,
+    QueryPropertiesClass,
+    QuerySubjectsClass,
     SchemaFieldClass,
     SchemaMetadataClass,
+    UpstreamLineageClass,
 )
 
 if TYPE_CHECKING:
     from datahub.ingestion.api.source import SourceReport
 
+
+# TODO: ordering
+# In the cases where we trim collections of data (e.g. fields in schema, upstream lineage, query subjects), given
+# those collections are typically unordered, we should consider sorting them by some criteria (e.g. size, alphabetically)
+# so that the trimming is deterministic and predictable and, more importantly, consistent across executions.
+# In the case of schemaMetadata, that's more relevant as currently we may be trimming fields while adding nested ones,
+# which may lead to poor schema rendering in the UI.
+
 logger = logging.getLogger(__name__)
+
+DEFAULT_QUERY_PROPERTIES_STATEMENT_MAX_PAYLOAD_BYTES = 5 * 1024 * 1024  # 5MB
+QUERY_PROPERTIES_STATEMENT_MAX_PAYLOAD_BYTES = int(
+    os.environ.get(
+        "QUERY_PROPERTIES_STATEMENT_MAX_PAYLOAD_BYTES",
+        DEFAULT_QUERY_PROPERTIES_STATEMENT_MAX_PAYLOAD_BYTES,
+    )
+)
+
+QUERY_STATEMENT_TRUNCATION_BUFFER = 100
 
 
 class EnsureAspectSizeProcessor:
     def __init__(
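As a quick illustration of how the new constants behave at runtime, the sketch below resolves the statement cap the same way the module does: read the environment variable once, fall back to the 5MB default, and later combine it with the emitter's overall limit via min(). The 15MB value used for the emitter limit here is an assumption for illustration only; the real value comes from datahub.emitter.rest_emitter.INGEST_MAX_PAYLOAD_BYTES.

import os

# Assumed stand-in for datahub.emitter.rest_emitter.INGEST_MAX_PAYLOAD_BYTES,
# used only to make this sketch self-contained.
ASSUMED_INGEST_MAX_PAYLOAD_BYTES = 15 * 1024 * 1024

DEFAULT_STATEMENT_CAP = 5 * 1024 * 1024  # same 5MB default as in the diff
statement_cap = int(
    os.environ.get("QUERY_PROPERTIES_STATEMENT_MAX_PAYLOAD_BYTES", DEFAULT_STATEMENT_CAP)
)

# Effective limit for queryProperties: whichever constraint is tighter.
effective_limit = min(statement_cap, ASSUMED_INGEST_MAX_PAYLOAD_BYTES)
print(f"query statements are truncated once the aspect exceeds ~{effective_limit} bytes")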
@@ -81,6 +103,274 @@ class EnsureAspectSizeProcessor:
 
         schema.fields = accepted_fields
 
+    def ensure_query_subjects_size(
+        self, entity_urn: str, query_subjects: QuerySubjectsClass
+    ) -> None:
+        """
+        Ensure query subjects aspect does not exceed allowed size by removing column-level lineage first,
+        then table lineage if necessary.
+        """
+        if not query_subjects.subjects:
+            return
+
+        total_subjects_size = 0
+        accepted_table_level_subjects = []
+        accepted_column_level_subjects = []
+        column_level_subjects_with_sizes = []
+        table_level_subjects_with_sizes = []
+
+        # Separate column-level and table-level subjects
+        for subject in query_subjects.subjects:
+            subject_size = len(json.dumps(pre_json_transform(subject.to_obj())))
+
+            if subject.entity.startswith("urn:li:schemaField:"):
+                column_level_subjects_with_sizes.append((subject, subject_size))
+            else:
+                table_level_subjects_with_sizes.append((subject, subject_size))
+
+        # Once we find one that doesn't fit, stop everything else to prevent inconsistencies
+        first_skip_done = False
+
+        # First, try to include all table-level subjects
+        for subject, subject_size in table_level_subjects_with_sizes:
+            if total_subjects_size + subject_size < self.payload_constraint:
+                accepted_table_level_subjects.append(subject)
+                total_subjects_size += subject_size
+            else:
+                first_skip_done = True
+                break
+
+        # Then, add column-level subjects if there's remaining space
+        # Only process if we successfully included all table-level subjects
+        if not first_skip_done:
+            for subject, subject_size in column_level_subjects_with_sizes:
+                if total_subjects_size + subject_size < self.payload_constraint:
+                    accepted_column_level_subjects.append(subject)
+                    total_subjects_size += subject_size
+                else:
+                    first_skip_done = True
+                    break
+
+        if first_skip_done:
+            # Log aggregate warnings
+            table_level_skipped_count = len(table_level_subjects_with_sizes) - len(
+                accepted_table_level_subjects
+            )
+            column_level_skipped_count = len(column_level_subjects_with_sizes) - len(
+                accepted_column_level_subjects
+            )
+
+            self._maybe_warn_query_subjects(
+                entity_urn, table_level_skipped_count, "table-level lineage subjects"
+            )
+            self._maybe_warn_query_subjects(
+                entity_urn, column_level_skipped_count, "column-level lineage subjects"
+            )
+
+        query_subjects.subjects = (
+            accepted_table_level_subjects + accepted_column_level_subjects
+        )
+
+    def _maybe_warn_query_subjects(
+        self, entity_urn: str, skipped_count: int, item_type: str
+    ) -> None:
+        """Log warning for query subjects truncation if any items were skipped."""
+        if skipped_count > 0:
+            self.report.warning(
+                title="Query subjects truncated due to size constraint",
+                message="Query subjects contained too much data and would have caused ingestion to fail",
+                context=f"Skipped {skipped_count} {item_type} for {entity_urn} due to aspect size constraints",
+            )
+
+    def _maybe_warn_upstream_lineage(
+        self, entity_urn: str, skipped_count: int, item_type: str
+    ) -> None:
+        """Log warning for upstream lineage truncation if any items were skipped."""
+        if skipped_count > 0:
+            self.report.warning(
+                title="Upstream lineage truncated due to size constraint",
+                message="Upstream lineage contained too much data and would have caused ingestion to fail",
+                context=f"Skipped {skipped_count} {item_type} for {entity_urn} due to aspect size constraints",
+            )
+
+    def ensure_upstream_lineage_size(  # noqa: C901
+        self, entity_urn: str, upstream_lineage: UpstreamLineageClass
+    ) -> None:
+        """
+        Ensure upstream lineage aspect does not exceed allowed size by removing lineage in priority order:
+        first NONE fine-grained lineages (lowest priority), then FIELD_SET fine-grained lineages,
+        then DATASET fine-grained lineages, and finally upstreams (highest priority).
+        """
+        if not upstream_lineage.fineGrainedLineages and not upstream_lineage.upstreams:
+            return
+
+        total_lineage_size = 0
+        accepted_upstreams = []
+        accepted_dataset_fg_lineages = []
+        accepted_field_set_fg_lineages = []
+        accepted_none_fg_lineages = []
+        upstream_items_with_sizes = []
+        dataset_fg_items_with_sizes = []
+        field_set_fg_items_with_sizes = []
+        none_fg_items_with_sizes = []
+
+        # Add upstreams (highest priority)
+        if upstream_lineage.upstreams:
+            for upstream in upstream_lineage.upstreams:
+                upstream_size = len(json.dumps(pre_json_transform(upstream.to_obj())))
+                upstream_items_with_sizes.append((upstream, upstream_size))
+
+        # Separate fine-grained lineage items by upstreamType: DATASET > FIELD_SET > NONE
+        if upstream_lineage.fineGrainedLineages:
+            for fg_lineage in upstream_lineage.fineGrainedLineages:
+                fg_lineage_size = len(
+                    json.dumps(pre_json_transform(fg_lineage.to_obj()))
+                )
+
+                upstream_type_str = str(fg_lineage.upstreamType)
+                if upstream_type_str == "DATASET":
+                    dataset_fg_items_with_sizes.append((fg_lineage, fg_lineage_size))
+                elif upstream_type_str == "FIELD_SET":
+                    field_set_fg_items_with_sizes.append((fg_lineage, fg_lineage_size))
+                elif upstream_type_str == "NONE":
+                    none_fg_items_with_sizes.append((fg_lineage, fg_lineage_size))
+
+        # Once we find one that doesn't fit, stop everything else to prevent inconsistencies
+        first_skip_done = False
+
+        # First, include all upstreams (highest priority)
+        for item, item_size in upstream_items_with_sizes:
+            if total_lineage_size + item_size < self.payload_constraint:
+                accepted_upstreams.append(item)
+                total_lineage_size += item_size
+            else:
+                first_skip_done = True
+                break
+
+        # Second, include DATASET fine-grained lineages if no upstreams were skipped
+        if not first_skip_done:
+            for fg_lineage, fg_lineage_size in dataset_fg_items_with_sizes:
+                if total_lineage_size + fg_lineage_size < self.payload_constraint:
+                    accepted_dataset_fg_lineages.append(fg_lineage)
+                    total_lineage_size += fg_lineage_size
+                else:
+                    first_skip_done = True
+                    break
+
+        # Third, include FIELD_SET fine-grained lineages if no higher priority items were skipped
+        if not first_skip_done:
+            for fg_lineage, fg_lineage_size in field_set_fg_items_with_sizes:
+                if total_lineage_size + fg_lineage_size < self.payload_constraint:
+                    accepted_field_set_fg_lineages.append(fg_lineage)
+                    total_lineage_size += fg_lineage_size
+                else:
+                    first_skip_done = True
+                    break
+
+        # Finally, include NONE fine-grained lineages if no higher priority items were skipped
+        if not first_skip_done:
+            for fg_lineage, fg_lineage_size in none_fg_items_with_sizes:
+                if total_lineage_size + fg_lineage_size < self.payload_constraint:
+                    accepted_none_fg_lineages.append(fg_lineage)
+                    total_lineage_size += fg_lineage_size
+                else:
+                    first_skip_done = True
+                    break
+
+        # Log aggregate warnings instead of per-item warnings
+        if first_skip_done:
+            upstreams_skipped_count = len(upstream_items_with_sizes) - len(
+                accepted_upstreams
+            )
+            dataset_fg_skipped_count = len(dataset_fg_items_with_sizes) - len(
+                accepted_dataset_fg_lineages
+            )
+            field_set_fg_skipped_count = len(field_set_fg_items_with_sizes) - len(
+                accepted_field_set_fg_lineages
+            )
+            none_fg_skipped_count = len(none_fg_items_with_sizes) - len(
+                accepted_none_fg_lineages
+            )
+
+            self._maybe_warn_upstream_lineage(
+                entity_urn, upstreams_skipped_count, "upstream datasets"
+            )
+            self._maybe_warn_upstream_lineage(
+                entity_urn,
+                dataset_fg_skipped_count,
+                "dataset-level fine-grained lineages",
+            )
+            self._maybe_warn_upstream_lineage(
+                entity_urn,
+                field_set_fg_skipped_count,
+                "field-set-level fine-grained lineages",
+            )
+            self._maybe_warn_upstream_lineage(
+                entity_urn, none_fg_skipped_count, "none-level fine-grained lineages"
+            )
+
+        # Combine all accepted fine-grained lineages
+        accepted_fine_grained_lineages = (
+            accepted_dataset_fg_lineages
+            + accepted_field_set_fg_lineages
+            + accepted_none_fg_lineages
+        )
+
+        upstream_lineage.upstreams = accepted_upstreams
+        upstream_lineage.fineGrainedLineages = (
+            accepted_fine_grained_lineages if accepted_fine_grained_lineages else None
+        )
+
+    def ensure_query_properties_size(
+        self, entity_urn: str, query_properties: QueryPropertiesClass
+    ) -> None:
+        """
+        Ensure query properties aspect does not exceed allowed size by truncating the query statement value.
+        Uses a configurable max payload size that is the minimum between QUERY_PROPERTIES_STATEMENT_MAX_PAYLOAD_BYTES
+        and INGEST_MAX_PAYLOAD_BYTES.
+
+        We have found surprisingly large query statements (e.g. 20MB+) that caused ingestion to fail;
+        that was INSERT INTO VALUES with huge list of values.
+        """
+        if not query_properties.statement or not query_properties.statement.value:
+            return
+
+        max_payload_size = min(
+            QUERY_PROPERTIES_STATEMENT_MAX_PAYLOAD_BYTES, self.payload_constraint
+        )
+
+        current_size = len(json.dumps(pre_json_transform(query_properties.to_obj())))
+
+        if current_size < max_payload_size:
+            return
+
+        reduction_needed = (
+            current_size - max_payload_size + QUERY_STATEMENT_TRUNCATION_BUFFER
+        )
+
+        statement_value_size = len(query_properties.statement.value)
+        original_statement_size = statement_value_size
+
+        # Only truncate if reduction is actually needed and possible
+        if statement_value_size > reduction_needed > 0:
+            new_statement_length = statement_value_size - reduction_needed
+            truncated_statement = query_properties.statement.value[
+                :new_statement_length
+            ]
+
+            truncation_message = f"... [original value was {original_statement_size} bytes and truncated to {new_statement_length} bytes]"
+            query_properties.statement.value = truncated_statement + truncation_message
+
+            self.report.warning(
+                title="Query properties truncated due to size constraint",
+                message="Query properties contained too much data and would have caused ingestion to fail",
+                context=f"Query statement was truncated from {original_statement_size} to {new_statement_length} characters for {entity_urn} due to aspect size constraints",
+            )
+        else:
+            logger.warning(
+                f"Cannot truncate query statement for {entity_urn} as it is smaller than or equal to the required reduction size {reduction_needed}. That means that 'ensure_query_properties_size' must be extended to trim other fields different than statement."
+            )
+
     def ensure_aspect_size(
         self,
         stream: Iterable[MetadataWorkUnit],
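Both ensure_query_subjects_size and ensure_upstream_lineage_size follow the same greedy, budget-based pattern: size each item by its serialized JSON form, accept items bucket by bucket in priority order, and stop entirely at the first item that does not fit so the kept set stays internally consistent. A simplified, standalone sketch of that pattern using plain dicts (not the DataHub classes) might look like this:

import json
from typing import Any, List

def fill_by_priority(buckets: List[List[Any]], budget: int) -> List[Any]:
    """Keep items while the running JSON size stays under budget; the first
    item that does not fit stops everything, including lower-priority buckets."""
    accepted: List[Any] = []
    total = 0
    for bucket in buckets:
        for item in bucket:
            size = len(json.dumps(item))
            if total + size < budget:
                accepted.append(item)
                total += size
            else:
                return accepted
    return accepted

# Toy data: table-level entries are prioritized over column-level ones.
tables = [{"entity": f"table_{i}"} for i in range(3)]
columns = [{"entity": "column_" + "x" * 50} for _ in range(100)]
print(len(fill_by_priority([tables, columns], budget=2_000)))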
@@ -96,4 +386,10 @@ class EnsureAspectSizeProcessor:
                 self.ensure_schema_metadata_size(wu.get_urn(), schema)
             elif profile := wu.get_aspect_of_type(DatasetProfileClass):
                 self.ensure_dataset_profile_size(wu.get_urn(), profile)
+            elif query_subjects := wu.get_aspect_of_type(QuerySubjectsClass):
+                self.ensure_query_subjects_size(wu.get_urn(), query_subjects)
+            elif upstream_lineage := wu.get_aspect_of_type(UpstreamLineageClass):
+                self.ensure_upstream_lineage_size(wu.get_urn(), upstream_lineage)
+            elif query_properties := wu.get_aspect_of_type(QueryPropertiesClass):
+                self.ensure_query_properties_size(wu.get_urn(), query_properties)
             yield wu
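The hunks below add the corresponding unit tests. For orientation, an end-to-end call through the new routing might look like the following sketch; it is modeled on those tests and assumes a `processor` built the same way as the tests' (not shown) `processor` fixture, i.e. an EnsureAspectSizeProcessor constructed with a SourceReport.

from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.metadata.schema_classes import QuerySubjectClass, QuerySubjectsClass

def run_through_processor(processor):
    # Wrap a querySubjects aspect into a workunit, as the tests below do.
    wu = MetadataChangeProposalWrapper(
        entityUrn="urn:li:query:example",  # hypothetical URN, for illustration only
        aspect=QuerySubjectsClass(
            subjects=[
                QuerySubjectClass(
                    entity="urn:li:dataset:(urn:li:dataPlatform:hive,db.table,PROD)"
                )
            ]
        ),
    ).as_workunit()
    # ensure_aspect_size now routes querySubjects, upstreamLineage and
    # queryProperties aspects to the new trimmers before yielding each workunit.
    return list(processor.ensure_aspect_size([wu]))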
@@ -15,20 +15,33 @@ from datahub.ingestion.api.source import SourceReport
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent
 from datahub.metadata.schema_classes import (
+    AuditStampClass,
     ChangeTypeClass,
     DatasetFieldProfileClass,
+    DatasetLineageTypeClass,
     DatasetProfileClass,
     DatasetSnapshotClass,
+    FineGrainedLineageClass,
+    FineGrainedLineageDownstreamTypeClass,
+    FineGrainedLineageUpstreamTypeClass,
     GenericAspectClass,
     MetadataChangeProposalClass,
     NumberTypeClass,
     OtherSchemaClass,
+    QueryLanguageClass,
+    QueryPropertiesClass,
+    QuerySourceClass,
+    QueryStatementClass,
+    QuerySubjectClass,
+    QuerySubjectsClass,
     SchemaFieldClass,
     SchemaFieldDataTypeClass,
     SchemaMetadataClass,
     StatusClass,
     StringTypeClass,
     SubTypesClass,
+    UpstreamClass,
+    UpstreamLineageClass,
 )
 
 
@@ -112,6 +125,192 @@ def proper_schema_metadata() -> SchemaMetadataClass:
     )
 
 
+def proper_query_subjects() -> QuerySubjectsClass:
+    subjects = [
+        QuerySubjectClass(
+            entity="urn:li:dataset:(urn:li:dataPlatform:hive,db1.table1,PROD)"
+        ),
+        QuerySubjectClass(
+            entity="urn:li:dataset:(urn:li:dataPlatform:hive,db1.table2,PROD)"
+        ),
+        QuerySubjectClass(
+            entity="urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:hive,db1.table1,PROD),col1)"
+        ),
+        QuerySubjectClass(
+            entity="urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:hive,db1.table2,PROD),col2)"
+        ),
+    ]
+    return QuerySubjectsClass(subjects=subjects)
+
+
+def too_big_query_subjects() -> QuerySubjectsClass:
+    subjects = []
+
+    # Add a few table-level subjects
+    for i in range(5):
+        subjects.append(
+            QuerySubjectClass(
+                entity=f"urn:li:dataset:(urn:li:dataPlatform:hive,db.table{i},PROD)"
+            )
+        )
+
+    # Add many column-level subjects with very large entity URNs to exceed the 15MB constraint
+    # Each URN will be about 40KB, so 500 subjects should create ~20MB of data
+    for i in range(500):
+        large_table_name = "a" * 20000  # Very large table name
+        large_column_name = "b" * 20000  # Very large column name
+        subjects.append(
+            QuerySubjectClass(
+                entity=f"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:hive,{large_table_name}_{i},PROD),{large_column_name}_{i})"
+            )
+        )
+
+    return QuerySubjectsClass(subjects=subjects)
+
+
+def proper_upstream_lineage() -> UpstreamLineageClass:
+    upstreams = [
+        UpstreamClass(
+            dataset="urn:li:dataset:(urn:li:dataPlatform:hive,db1.table1,PROD)",
+            type=DatasetLineageTypeClass.TRANSFORMED,
+        ),
+        UpstreamClass(
+            dataset="urn:li:dataset:(urn:li:dataPlatform:hive,db1.table2,PROD)",
+            type=DatasetLineageTypeClass.TRANSFORMED,
+        ),
+    ]
+    fine_grained_lineages = [
+        FineGrainedLineageClass(
+            upstreamType=FineGrainedLineageUpstreamTypeClass.DATASET,
+            downstreamType=FineGrainedLineageDownstreamTypeClass.FIELD,
+            upstreams=["urn:li:dataset:(urn:li:dataPlatform:hive,db1.table3,PROD)"],
+            downstreams=[
+                "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:hive,db1.target,PROD),col1)"
+            ],
+        ),
+        FineGrainedLineageClass(
+            upstreamType=FineGrainedLineageUpstreamTypeClass.FIELD_SET,
+            downstreamType=FineGrainedLineageDownstreamTypeClass.FIELD,
+            upstreams=[
+                "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:hive,db1.table4,PROD),col2)"
+            ],
+            downstreams=[
+                "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:hive,db1.target,PROD),col2)"
+            ],
+        ),
+    ]
+    return UpstreamLineageClass(
+        upstreams=upstreams, fineGrainedLineages=fine_grained_lineages
+    )
+
+
+def too_big_upstream_lineage() -> UpstreamLineageClass:
+    upstreams = []
+    fine_grained_lineages = []
+
+    # Add upstreams (highest priority)
+    for i in range(5):
+        upstreams.append(
+            UpstreamClass(
+                dataset=f"urn:li:dataset:(urn:li:dataPlatform:hive,upstream_table_{i},PROD)",
+                type=DatasetLineageTypeClass.TRANSFORMED,
+            )
+        )
+
+    # Add DATASET fine-grained lineages with large URNs
+    for i in range(200):
+        large_dataset_name = "a" * 20000
+        large_downstream_name = "b" * 20000
+        fine_grained_lineages.append(
+            FineGrainedLineageClass(
+                upstreamType=FineGrainedLineageUpstreamTypeClass.DATASET,
+                downstreamType=FineGrainedLineageDownstreamTypeClass.FIELD,
+                upstreams=[
+                    f"urn:li:dataset:(urn:li:dataPlatform:hive,{large_dataset_name}_{i},PROD)"
+                ],
+                downstreams=[
+                    f"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:hive,target,PROD),{large_downstream_name}_{i})"
+                ],
+            )
+        )
+
+    # Add FIELD_SET fine-grained lineages with large URNs
+    for i in range(200):
+        large_upstream_name = "c" * 20000
+        large_downstream_name = "d" * 20000
+        fine_grained_lineages.append(
+            FineGrainedLineageClass(
+                upstreamType=FineGrainedLineageUpstreamTypeClass.FIELD_SET,
+                downstreamType=FineGrainedLineageDownstreamTypeClass.FIELD,
+                upstreams=[
+                    f"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:hive,source,PROD),{large_upstream_name}_{i})"
+                ],
+                downstreams=[
+                    f"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:hive,target,PROD),{large_downstream_name}_{i})"
+                ],
+            )
+        )
+
+    # Add NONE fine-grained lineages with large URNs (lowest priority)
+    for i in range(200):
+        large_downstream_name = "e" * 20000
+        fine_grained_lineages.append(
+            FineGrainedLineageClass(
+                upstreamType=FineGrainedLineageUpstreamTypeClass.NONE,
+                downstreamType=FineGrainedLineageDownstreamTypeClass.FIELD,
+                downstreams=[
+                    f"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:hive,target,PROD),{large_downstream_name}_{i})"
+                ],
+            )
+        )
+
+    return UpstreamLineageClass(
+        upstreams=upstreams, fineGrainedLineages=fine_grained_lineages
+    )
+
+
+def proper_query_properties() -> QueryPropertiesClass:
+    # Create a query properties with a reasonably sized statement (~1KB)
+    query_statement = (
+        "SELECT * FROM table1 WHERE column1 = 'value' AND column2 > 100;" * 20
+    )
+
+    return QueryPropertiesClass(
+        statement=QueryStatementClass(
+            value=query_statement,
+            language=QueryLanguageClass.SQL,
+        ),
+        source=QuerySourceClass.SYSTEM,
+        created=AuditStampClass(time=1000000000000, actor="urn:li:corpuser:test"),
+        lastModified=AuditStampClass(time=1000000000000, actor="urn:li:corpuser:test"),
+    )
+
+
+def too_big_query_properties() -> QueryPropertiesClass:
+    # Create a query properties with a very large statement (~6MB, exceeding the 5MB default limit)
+    # This is larger than the QUERY_PROPERTIES_STATEMENT_MAX_PAYLOAD_BYTES default (5MB)
+    large_query_statement = (
+        "SELECT col1, col2, col3, col4, col5, col6, col7, col8, col9, col10, "
+        "col11, col12, col13, col14, col15, col16, col17, col18, col19, col20 "
+        "FROM very_long_table_name_with_lots_of_characters_to_make_it_big "
+        "WHERE condition1 = 'some_very_long_value_with_lots_of_text' "
+        "AND condition2 IN ('value1', 'value2', 'value3', 'value4') "
+        "ORDER BY col1, col2, col3, col4, col5 LIMIT 1000;"
+    ) * 15000  # ~6MB
+
+    return QueryPropertiesClass(
+        statement=QueryStatementClass(
+            value=large_query_statement,
+            language=QueryLanguageClass.SQL,
+        ),
+        source=QuerySourceClass.SYSTEM,
+        created=AuditStampClass(time=1000000000000, actor="urn:li:corpuser:test"),
+        lastModified=AuditStampClass(time=1000000000000, actor="urn:li:corpuser:test"),
+        name="Large Test Query",
+        description="A test query with a very large statement",
+    )
+
+
 def proper_dataset_profile() -> DatasetProfileClass:
     sample_values = [
         "23483295",
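A rough back-of-the-envelope check of the fixture sizes that the tests below assert (JSON quoting and the URN scaffolding add a little on top of the raw character counts):

# too_big_query_subjects: 500 column-level subjects, each URN padded with ~40KB.
query_subjects_bytes = 500 * (20_000 + 20_000)

# too_big_upstream_lineage: 200 DATASET + 200 FIELD_SET (~40KB each) + 200 NONE (~20KB each).
upstream_lineage_bytes = 200 * 40_000 + 200 * 40_000 + 200 * 20_000

print(query_subjects_bytes / (1024 * 1024))    # ≈ 19.1 MiB
print(upstream_lineage_bytes / (1024 * 1024))  # ≈ 19.1 MiB, i.e. roughly the ~20MB the tests expect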
@@ -344,3 +543,254 @@ def test_wu_processor_not_triggered_by_unhandled_aspects(
     ]
     ensure_schema_metadata_size_mock.assert_not_called()
     ensure_dataset_profile_size_mock.assert_not_called()
+
+
+@freeze_time("2023-01-02 00:00:00")
+def test_ensure_size_of_proper_query_subjects(processor):
+    query_subjects = proper_query_subjects()
+    orig_repr = json.dumps(query_subjects.to_obj())
+    processor.ensure_query_subjects_size(
+        "urn:li:query:(urn:li:dataPlatform:hive, dummy_query, DEV)", query_subjects
+    )
+    assert orig_repr == json.dumps(query_subjects.to_obj()), (
+        "Aspect was modified in case where workunit processor should have been no-op"
+    )
+
+
+@freeze_time("2023-01-02 00:00:00")
+def test_ensure_size_of_too_big_query_subjects(processor):
+    query_subjects = too_big_query_subjects()
+    assert len(query_subjects.subjects) == 505  # 5 table + 500 column subjects
+
+    # Verify that the initial size exceeds the default payload constraint
+    initial_size = len(json.dumps(query_subjects.to_obj()))
+    expected_size = 20 * 1024 * 1024  # 20MB
+    assert initial_size == pytest.approx(expected_size, rel=0.05), (
+        f"Initial size {initial_size} should be around 20MB (±5%), got {initial_size / (1024 * 1024):.1f}MB"
+    )
+    assert initial_size > INGEST_MAX_PAYLOAD_BYTES, (
+        f"Initial size {initial_size} should exceed payload constraint {INGEST_MAX_PAYLOAD_BYTES}"
+    )
+
+    processor.ensure_query_subjects_size(
+        "urn:li:query:(urn:li:dataPlatform:hive, dummy_query, DEV)", query_subjects
+    )
+
+    # Should be significantly reduced due to size constraints
+    # With ~20MB of data needing to be reduced to ~15MB, we expect ~25% reduction (125 subjects)
+    # So final count should be around 380, using 400 as upper bound with buffer
+    assert len(query_subjects.subjects) < 400, (
+        "Query subjects has not been properly truncated"
+    )
+
+    # Check that table-level subjects are prioritized (should still be present)
+    table_subjects = [
+        s
+        for s in query_subjects.subjects
+        if not s.entity.startswith("urn:li:schemaField:")
+    ]
+    assert len(table_subjects) > 0, (
+        "Table-level subjects should be prioritized and present"
+    )
+
+    # The aspect should not exceed acceptable size
+    assert len(json.dumps(query_subjects.to_obj())) < INGEST_MAX_PAYLOAD_BYTES, (
+        "Aspect exceeded acceptable size"
+    )
+
+
+@freeze_time("2023-01-02 00:00:00")
+@patch(
+    "datahub.ingestion.api.auto_work_units.auto_ensure_aspect_size.EnsureAspectSizeProcessor.ensure_query_subjects_size"
+)
+def test_wu_processor_triggered_by_query_subjects_aspect(
+    ensure_query_subjects_size_mock, processor
+):
+    ret = [  # noqa: F841
+        *processor.ensure_aspect_size(
+            [
+                MetadataChangeProposalWrapper(
+                    entityUrn="urn:li:query:(urn:li:dataPlatform:hive, dummy_query, DEV)",
+                    aspect=proper_query_subjects(),
+                ).as_workunit()
+            ]
+        )
+    ]
+    ensure_query_subjects_size_mock.assert_called_once()
+
+
+@freeze_time("2023-01-02 00:00:00")
+def test_ensure_size_of_proper_upstream_lineage(processor):
+    upstream_lineage = proper_upstream_lineage()
+    orig_repr = json.dumps(upstream_lineage.to_obj())
+    processor.ensure_upstream_lineage_size(
+        "urn:li:dataset:(urn:li:dataPlatform:hive, dummy_dataset, DEV)",
+        upstream_lineage,
+    )
+    assert orig_repr == json.dumps(upstream_lineage.to_obj()), (
+        "Aspect was modified in case where workunit processor should have been no-op"
+    )
+
+
+@freeze_time("2023-01-02 00:00:00")
+def test_ensure_size_of_too_big_upstream_lineage(processor):
+    upstream_lineage = too_big_upstream_lineage()
+    assert len(upstream_lineage.upstreams) == 5  # 5 upstreams
+    assert upstream_lineage.fineGrainedLineages is not None
+    assert (
+        len(upstream_lineage.fineGrainedLineages) == 600
+    )  # 200 DATASET + 200 FIELD_SET + 200 NONE
+
+    # Verify that the initial size exceeds the default payload constraint
+    initial_size = len(json.dumps(upstream_lineage.to_obj()))
+    expected_size = 20 * 1024 * 1024  # 20MB
+    assert initial_size == pytest.approx(expected_size, rel=0.05), (
+        f"Initial size {initial_size} should be around 20MB (±5%), got {initial_size / (1024 * 1024):.1f}MB"
+    )
+    assert initial_size > INGEST_MAX_PAYLOAD_BYTES, (
+        f"Initial size {initial_size} should exceed payload constraint {INGEST_MAX_PAYLOAD_BYTES}"
+    )
+
+    processor.ensure_upstream_lineage_size(
+        "urn:li:dataset:(urn:li:dataPlatform:hive, dummy_dataset, DEV)",
+        upstream_lineage,
+    )
+
+    # Should be significantly reduced due to size constraints
+    # With ~20MB of data needing to be reduced to ~15MB, we expect ~25% reduction
+    # Total items: 5 upstreams + 600 fine-grained = 605, expect around ~450 after 25% reduction
+    total_items = len(upstream_lineage.upstreams) + (
+        len(upstream_lineage.fineGrainedLineages)
+        if upstream_lineage.fineGrainedLineages
+        else 0
+    )
+    assert total_items < 500, "Upstream lineage has not been properly truncated"
+
+    # Check that upstreams are prioritized (should still be present)
+    assert len(upstream_lineage.upstreams) > 0, (
+        "Upstreams should be prioritized and present"
+    )
+
+    # Check that DATASET fine-grained lineages are prioritized over FIELD_SET and NONE
+    if upstream_lineage.fineGrainedLineages:
+        dataset_count = sum(
+            1
+            for fg in upstream_lineage.fineGrainedLineages
+            if str(fg.upstreamType) == "DATASET"
+        )
+        field_set_count = sum(
+            1
+            for fg in upstream_lineage.fineGrainedLineages
+            if str(fg.upstreamType) == "FIELD_SET"
+        )
+        none_count = sum(
+            1
+            for fg in upstream_lineage.fineGrainedLineages
+            if str(fg.upstreamType) == "NONE"
+        )
+
+        # DATASET should be prioritized over FIELD_SET and NONE
+        assert dataset_count >= field_set_count, (
+            "DATASET fine-grained lineages should be prioritized"
+        )
+        assert dataset_count >= none_count, (
+            "DATASET fine-grained lineages should be prioritized over NONE"
+        )
+
+    # The aspect should not exceed acceptable size
+    assert len(json.dumps(upstream_lineage.to_obj())) < INGEST_MAX_PAYLOAD_BYTES, (
+        "Aspect exceeded acceptable size"
+    )
+
+
+@freeze_time("2023-01-02 00:00:00")
+@patch(
+    "datahub.ingestion.api.auto_work_units.auto_ensure_aspect_size.EnsureAspectSizeProcessor.ensure_upstream_lineage_size"
+)
+def test_wu_processor_triggered_by_upstream_lineage_aspect(
+    ensure_upstream_lineage_size_mock, processor
+):
+    ret = [  # noqa: F841
+        *processor.ensure_aspect_size(
+            [
+                MetadataChangeProposalWrapper(
+                    entityUrn="urn:li:dataset:(urn:li:dataPlatform:hive, dummy_dataset, DEV)",
+                    aspect=proper_upstream_lineage(),
+                ).as_workunit()
+            ]
+        )
+    ]
+    ensure_upstream_lineage_size_mock.assert_called_once()
+
+
+@freeze_time("2023-01-02 00:00:00")
+def test_ensure_size_of_proper_query_properties(processor):
+    query_properties = proper_query_properties()
+    original_statement = query_properties.statement.value
+
+    # Verify initial size is reasonable (under 5MB)
+    initial_size = len(json.dumps(query_properties.to_obj()))
+    assert initial_size < 5 * 1024 * 1024, "Test query properties should be under 5MB"
+
+    processor.ensure_query_properties_size("urn:li:query:test", query_properties)
+
+    # Statement should remain unchanged for properly sized query properties
+    assert query_properties.statement.value == original_statement
+    assert len(processor.report.warnings) == 0
+
+
+@freeze_time("2023-01-02 00:00:00")
+def test_ensure_size_of_too_big_query_properties(processor):
+    query_properties = too_big_query_properties()
+    original_statement_size = len(query_properties.statement.value)
+
+    # Verify the initial size is about 5.5MB and definitely larger than the 5MB limit
+    initial_size = len(json.dumps(query_properties.to_obj()))
+    expected_initial_size = 5.5 * 1024 * 1024  # ~5.5MB
+    assert initial_size == pytest.approx(expected_initial_size, rel=0.1), (
+        f"Expected initial size ~{expected_initial_size}, got {initial_size}"
+    )
+    assert initial_size > 5 * 1024 * 1024, "Test data should exceed 5MB limit"
+
+    processor.ensure_query_properties_size("urn:li:query:test", query_properties)
+
+    # Statement should be truncated
+    assert len(query_properties.statement.value) < original_statement_size
+
+    # Should contain truncation message
+    assert "... [original value was" in query_properties.statement.value
+    assert (
+        f"{original_statement_size} bytes and truncated to"
+        in query_properties.statement.value
+    )
+    assert query_properties.statement.value.endswith(" bytes]")
+
+    # Final size should be within constraints, i.e. <= 5MB + buffer
+    final_size = len(json.dumps(query_properties.to_obj()))
+    expected_final_size = 5 * 1024 * 1024 + 100  # 5MB + buffer
+    assert final_size <= expected_final_size, (
+        f"Final size {final_size} should be <= {expected_final_size}"
+    )
+
+    # Should have logged a warning
+    assert len(processor.report.warnings) == 1
+
+
+@freeze_time("2023-01-02 00:00:00")
+@patch(
+    "datahub.ingestion.api.auto_work_units.auto_ensure_aspect_size.EnsureAspectSizeProcessor.ensure_query_properties_size"
+)
+def test_wu_processor_triggered_by_query_properties_aspect(
+    ensure_query_properties_size_mock, processor
+):
+    list(
+        processor.ensure_aspect_size(
+            [
+                MetadataChangeProposalWrapper(
+                    entityUrn="urn:li:query:test",
+                    aspect=proper_query_properties(),
+                ).as_workunit()
+            ]
+        )
+    )
+    ensure_query_properties_size_mock.assert_called_once()
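Finally, a worked example of the truncation arithmetic the last two tests rely on, using round illustrative numbers rather than the exact fixture sizes:

max_payload_size = 5 * 1024 * 1024   # effective cap (5MB default), i.e. 5_242_880
buffer = 100                         # QUERY_STATEMENT_TRUNCATION_BUFFER
current_size = 5_800_000             # serialized aspect size, a bit over the cap
statement_len = 5_700_000            # len(query_properties.statement.value)

reduction_needed = current_size - max_payload_size + buffer  # 557_220
new_statement_length = statement_len - reduction_needed      # 5_142_780

# The statement shrinks by reduction_needed characters, so the serialized aspect
# lands at roughly max_payload_size, which is why the test accepts any final
# size <= 5MB plus the 100-byte buffer.
print(reduction_needed, new_statement_length)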