mirror of
https://github.com/datahub-project/datahub.git
synced 2025-08-21 07:38:13 +00:00
feat(ingest): fix bugs in SqlParsingAggregator (#9926)
This commit is contained in:
parent
92b1cfa194
commit
1736edf8f5
@ -80,7 +80,7 @@ class SchemaResolver(Closeable, SchemaResolverInterface):
|
|||||||
def schema_count(self) -> int:
|
def schema_count(self) -> int:
|
||||||
return int(
|
return int(
|
||||||
self._schema_cache.sql_query(
|
self._schema_cache.sql_query(
|
||||||
f"SELECT COUNT(*) FROM {self._schema_cache.tablename} WHERE is_missing"
|
f"SELECT COUNT(*) FROM {self._schema_cache.tablename} WHERE NOT is_missing"
|
||||||
)[0][0]
|
)[0][0]
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -115,6 +115,7 @@ class SqlAggregatorReport(Report):
|
|||||||
_aggregator: "SqlParsingAggregator"
|
_aggregator: "SqlParsingAggregator"
|
||||||
query_log_path: Optional[str] = None
|
query_log_path: Optional[str] = None
|
||||||
|
|
||||||
|
# Observed queries.
|
||||||
num_observed_queries: int = 0
|
num_observed_queries: int = 0
|
||||||
num_observed_queries_failed: int = 0
|
num_observed_queries_failed: int = 0
|
||||||
num_observed_queries_column_timeout: int = 0
|
num_observed_queries_column_timeout: int = 0
|
||||||
@ -123,6 +124,7 @@ class SqlAggregatorReport(Report):
|
|||||||
default_factory=LossyList
|
default_factory=LossyList
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Views.
|
||||||
num_view_definitions: int = 0
|
num_view_definitions: int = 0
|
||||||
num_views_failed: int = 0
|
num_views_failed: int = 0
|
||||||
num_views_column_timeout: int = 0
|
num_views_column_timeout: int = 0
|
||||||
@ -131,28 +133,30 @@ class SqlAggregatorReport(Report):
|
|||||||
default_factory=LossyDict
|
default_factory=LossyDict
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Other lineage loading metrics.
|
||||||
num_known_query_lineage: int = 0
|
num_known_query_lineage: int = 0
|
||||||
num_known_mapping_lineage: int = 0
|
num_known_mapping_lineage: int = 0
|
||||||
num_table_renames: int = 0
|
num_table_renames: int = 0
|
||||||
|
|
||||||
num_queries_with_temp_tables_in_session: int = 0
|
# Temp tables.
|
||||||
|
|
||||||
num_unique_query_fingerprints: Optional[int] = None
|
|
||||||
|
|
||||||
# Lineage-related.
|
|
||||||
num_urns_with_lineage: Optional[int] = None
|
|
||||||
num_temp_sessions: Optional[int] = None
|
num_temp_sessions: Optional[int] = None
|
||||||
num_inferred_temp_schemas: Optional[int] = None
|
num_inferred_temp_schemas: Optional[int] = None
|
||||||
|
num_queries_with_temp_tables_in_session: int = 0
|
||||||
queries_with_temp_upstreams: LossyDict[QueryId, LossyList] = dataclasses.field(
|
queries_with_temp_upstreams: LossyDict[QueryId, LossyList] = dataclasses.field(
|
||||||
default_factory=LossyDict
|
default_factory=LossyDict
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Lineage-related.
|
||||||
|
schema_resolver_count: Optional[int] = None
|
||||||
|
num_unique_query_fingerprints: Optional[int] = None
|
||||||
|
num_urns_with_lineage: Optional[int] = None
|
||||||
num_queries_entities_generated: int = 0
|
num_queries_entities_generated: int = 0
|
||||||
|
|
||||||
# Usage-related.
|
# Usage-related.
|
||||||
usage_skipped_missing_timestamp: int = 0
|
usage_skipped_missing_timestamp: int = 0
|
||||||
|
|
||||||
def compute_stats(self) -> None:
|
def compute_stats(self) -> None:
|
||||||
|
self.schema_resolver_count = self._aggregator._schema_resolver.schema_count()
|
||||||
self.num_unique_query_fingerprints = len(self._aggregator._query_map)
|
self.num_unique_query_fingerprints = len(self._aggregator._query_map)
|
||||||
|
|
||||||
self.num_urns_with_lineage = len(self._aggregator._lineage_map)
|
self.num_urns_with_lineage = len(self._aggregator._lineage_map)
|
||||||
@ -865,6 +869,9 @@ class SqlParsingAggregator:
|
|||||||
confidenceScore=queries_map[query_id].confidence_score,
|
confidenceScore=queries_map[query_id].confidence_score,
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
upstream_aspect.fineGrainedLineages = (
|
||||||
|
upstream_aspect.fineGrainedLineages or None
|
||||||
|
)
|
||||||
|
|
||||||
yield MetadataChangeProposalWrapper(
|
yield MetadataChangeProposalWrapper(
|
||||||
entityUrn=downstream_urn,
|
entityUrn=downstream_urn,
|
||||||
|
@ -19,8 +19,7 @@
|
|||||||
"dataset": "urn:li:dataset:(urn:li:dataPlatform:s3,bucket1/key1,PROD)",
|
"dataset": "urn:li:dataset:(urn:li:dataPlatform:s3,bucket1/key1,PROD)",
|
||||||
"type": "COPY"
|
"type": "COPY"
|
||||||
}
|
}
|
||||||
],
|
]
|
||||||
"fineGrainedLineages": []
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
@ -44,8 +43,7 @@
|
|||||||
"dataset": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.bar,PROD)",
|
"dataset": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.bar,PROD)",
|
||||||
"type": "COPY"
|
"type": "COPY"
|
||||||
}
|
}
|
||||||
],
|
]
|
||||||
"fineGrainedLineages": []
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
@ -69,8 +67,7 @@
|
|||||||
"dataset": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo,PROD)",
|
"dataset": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo,PROD)",
|
||||||
"type": "COPY"
|
"type": "COPY"
|
||||||
}
|
}
|
||||||
],
|
]
|
||||||
"fineGrainedLineages": []
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -205,8 +205,7 @@
|
|||||||
"type": "TRANSFORMED",
|
"type": "TRANSFORMED",
|
||||||
"query": "urn:li:query:3e85e6f353c7fa33d6514cb090482852064d23df6491c9a8ae28be0d990a3c71"
|
"query": "urn:li:query:3e85e6f353c7fa33d6514cb090482852064d23df6491c9a8ae28be0d990a3c71"
|
||||||
}
|
}
|
||||||
],
|
]
|
||||||
"fineGrainedLineages": []
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
@ -1,6 +1,31 @@
|
|||||||
from datahub.sql_parsing.schema_resolver import SchemaResolver, _TableName
|
from datahub.sql_parsing.schema_resolver import SchemaResolver, _TableName
|
||||||
|
|
||||||
|
|
||||||
|
def test_basic_schema_resolver():
|
||||||
|
schema_resolver = SchemaResolver(
|
||||||
|
platform="redshift",
|
||||||
|
env="PROD",
|
||||||
|
graph=None,
|
||||||
|
)
|
||||||
|
|
||||||
|
schema_resolver.add_raw_schema_info(
|
||||||
|
urn="urn:li:dataset:(urn:li:dataPlatform:redshift,my_db.public.test_table,PROD)",
|
||||||
|
schema_info={"name": "STRING"},
|
||||||
|
)
|
||||||
|
|
||||||
|
urn, schema = schema_resolver.resolve_table(
|
||||||
|
_TableName(database="my_db", db_schema="public", table="test_table")
|
||||||
|
)
|
||||||
|
assert (
|
||||||
|
urn
|
||||||
|
== "urn:li:dataset:(urn:li:dataPlatform:redshift,my_db.public.test_table,PROD)"
|
||||||
|
)
|
||||||
|
assert schema
|
||||||
|
assert schema["name"]
|
||||||
|
|
||||||
|
assert schema_resolver.schema_count() == 1
|
||||||
|
|
||||||
|
|
||||||
def test_get_urn_for_table_lowercase():
|
def test_get_urn_for_table_lowercase():
|
||||||
schema_resolver = SchemaResolver(
|
schema_resolver = SchemaResolver(
|
||||||
platform="mssql",
|
platform="mssql",
|
||||||
|
Loading…
x
Reference in New Issue
Block a user