mirror of
https://github.com/datahub-project/datahub.git
synced 2025-08-15 12:46:53 +00:00
feat(ingest/powerbi): improve reporting around m-query parser (#11763)
This commit is contained in:
parent
4634bbcae0
commit
e609ff810d
@ -278,6 +278,9 @@ s3_base = {
|
||||
|
||||
threading_timeout_common = {
|
||||
"stopit==1.1.2",
|
||||
# stopit uses pkg_resources internally, which means there's an implied
|
||||
# dependency on setuptools.
|
||||
"setuptools",
|
||||
}
|
||||
|
||||
abs_base = {
|
||||
|
@ -20,6 +20,7 @@ from datahub.ingestion.source.state.stateful_ingestion_base import (
|
||||
StatefulIngestionConfigBase,
|
||||
)
|
||||
from datahub.utilities.lossy_collections import LossyList
|
||||
from datahub.utilities.perf_timer import PerfTimer
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@ -190,6 +191,15 @@ class PowerBiDashboardSourceReport(StaleEntityRemovalSourceReport):
|
||||
filtered_dashboards: List[str] = dataclass_field(default_factory=list)
|
||||
filtered_charts: List[str] = dataclass_field(default_factory=list)
|
||||
|
||||
m_query_parse_timer: PerfTimer = dataclass_field(default_factory=PerfTimer)
|
||||
m_query_parse_attempts: int = 0
|
||||
m_query_parse_successes: int = 0
|
||||
m_query_parse_timeouts: int = 0
|
||||
m_query_parse_validation_errors: int = 0
|
||||
m_query_parse_unexpected_character_errors: int = 0
|
||||
m_query_parse_unknown_errors: int = 0
|
||||
m_query_resolver_errors: int = 0
|
||||
|
||||
def report_dashboards_scanned(self, count: int = 1) -> None:
|
||||
self.dashboards_scanned += count
|
||||
|
||||
|
@ -74,7 +74,9 @@ def get_upstream_tables(
|
||||
)
|
||||
|
||||
try:
|
||||
parse_tree: Tree = _parse_expression(table.expression)
|
||||
with reporter.m_query_parse_timer:
|
||||
reporter.m_query_parse_attempts += 1
|
||||
parse_tree: Tree = _parse_expression(table.expression)
|
||||
|
||||
valid, message = validator.validate_parse_tree(
|
||||
parse_tree, native_query_enabled=config.native_query_parsing
|
||||
@ -87,10 +89,12 @@ def get_upstream_tables(
|
||||
message="DataAccess function is not present in M-Query expression",
|
||||
context=f"table-full-name={table.full_name}, expression={table.expression}, message={message}",
|
||||
)
|
||||
reporter.m_query_parse_validation_errors += 1
|
||||
return []
|
||||
except KeyboardInterrupt:
|
||||
raise
|
||||
except TimeoutException:
|
||||
reporter.m_query_parse_timeouts += 1
|
||||
reporter.warning(
|
||||
title="M-Query Parsing Timeout",
|
||||
message=f"M-Query parsing timed out after {_M_QUERY_PARSE_TIMEOUT} seconds. Lineage for this table will not be extracted.",
|
||||
@ -102,8 +106,10 @@ def get_upstream_tables(
|
||||
) as e: # TODO: Debug why BaseException is needed here and below.
|
||||
if isinstance(e, lark.exceptions.UnexpectedCharacters):
|
||||
error_type = "Unexpected Character Error"
|
||||
reporter.m_query_parse_unexpected_character_errors += 1
|
||||
else:
|
||||
error_type = "Unknown Parsing Error"
|
||||
reporter.m_query_parse_unknown_errors += 1
|
||||
|
||||
reporter.warning(
|
||||
title="Unable to extract lineage from M-Query expression",
|
||||
@ -112,10 +118,10 @@ def get_upstream_tables(
|
||||
exc=e,
|
||||
)
|
||||
return []
|
||||
reporter.m_query_parse_successes += 1
|
||||
|
||||
lineage: List[resolver.Lineage] = []
|
||||
try:
|
||||
lineage = resolver.MQueryResolver(
|
||||
lineage: List[resolver.Lineage] = resolver.MQueryResolver(
|
||||
table=table,
|
||||
parse_tree=parse_tree,
|
||||
reporter=reporter,
|
||||
@ -126,14 +132,14 @@ def get_upstream_tables(
|
||||
platform_instance_resolver=platform_instance_resolver,
|
||||
)
|
||||
|
||||
return lineage
|
||||
|
||||
except BaseException as e:
|
||||
reporter.m_query_resolver_errors += 1
|
||||
reporter.warning(
|
||||
title="Unknown M-Query Pattern",
|
||||
message="Encountered a unknown M-Query Expression",
|
||||
context=f"table-full-name={table.full_name}, expression={table.expression}, message={e}",
|
||||
exc=e,
|
||||
)
|
||||
|
||||
logger.debug(f"Stack trace for {table.full_name}:", exc_info=e)
|
||||
|
||||
return lineage
|
||||
return []
|
||||
|
@ -356,8 +356,9 @@ class MQueryResolver(AbstractDataAccessMQueryResolver, ABC):
|
||||
)
|
||||
if arg_list is None:
|
||||
self.reporter.report_warning(
|
||||
f"{self.table.full_name}-arg-list",
|
||||
f"Argument list not found for data-access-function {data_access_func}",
|
||||
title="M-Query Resolver Error",
|
||||
message="Unable to extract lineage from parsed M-Query expression (missing argument list)",
|
||||
context=f"{self.table.full_name}: argument list not found for data-access-function {data_access_func}",
|
||||
)
|
||||
return None
|
||||
|
||||
@ -377,8 +378,9 @@ class MQueryResolver(AbstractDataAccessMQueryResolver, ABC):
|
||||
f"Function invocation without argument in expression = {invoke_expression.pretty()}"
|
||||
)
|
||||
self.reporter.report_warning(
|
||||
f"{self.table.full_name}-variable-statement",
|
||||
"Function invocation without argument",
|
||||
title="M-Query Resolver Error",
|
||||
message="Unable to extract lineage from parsed M-Query expression (function invocation without argument)",
|
||||
context=f"{self.table.full_name}: function invocation without argument",
|
||||
)
|
||||
return None
|
||||
|
||||
@ -403,8 +405,9 @@ class MQueryResolver(AbstractDataAccessMQueryResolver, ABC):
|
||||
f"Either list_expression or type_expression is not found = {invoke_expression.pretty()}"
|
||||
)
|
||||
self.reporter.report_warning(
|
||||
f"{self.table.full_name}-variable-statement",
|
||||
"Function argument expression is not supported",
|
||||
title="M-Query Resolver Error",
|
||||
message="Unable to extract lineage from parsed M-Query expression (function argument expression is not supported)",
|
||||
context=f"{self.table.full_name}: function argument expression is not supported",
|
||||
)
|
||||
return None
|
||||
|
||||
|
@ -90,7 +90,9 @@ class SchemaResolver(Closeable, SchemaResolverInterface):
|
||||
)[0][0]
|
||||
)
|
||||
|
||||
def get_urn_for_table(self, table: _TableName, lower: bool = False) -> str:
|
||||
def get_urn_for_table(
|
||||
self, table: _TableName, lower: bool = False, mixed: bool = False
|
||||
) -> str:
|
||||
# TODO: Validate that this is the correct 2/3 layer hierarchy for the platform.
|
||||
|
||||
table_name = ".".join(
|
||||
@ -101,7 +103,10 @@ class SchemaResolver(Closeable, SchemaResolverInterface):
|
||||
|
||||
if lower:
|
||||
table_name = table_name.lower()
|
||||
platform_instance = platform_instance.lower() if platform_instance else None
|
||||
if not mixed:
|
||||
platform_instance = (
|
||||
platform_instance.lower() if platform_instance else None
|
||||
)
|
||||
|
||||
if self.platform == "bigquery":
|
||||
# Normalize shard numbers and other BigQuery weirdness.
|
||||
@ -131,6 +136,20 @@ class SchemaResolver(Closeable, SchemaResolverInterface):
|
||||
if schema_info:
|
||||
return urn_lower, schema_info
|
||||
|
||||
# Our treatment of platform instances when lowercasing urns
|
||||
# is inconsistent. In some places (e.g. Snowflake), we lowercase
|
||||
# the table names but not the platform instance. In other places
|
||||
# (e.g. Databricks), we lowercase everything because it happens
|
||||
# via the automatic lowercasing helper.
|
||||
# See https://github.com/datahub-project/datahub/pull/8928.
|
||||
# While we have this sort of inconsistency, we should also
|
||||
# check the mixed case urn, as a last resort.
|
||||
urn_mixed = self.get_urn_for_table(table, lower=True, mixed=True)
|
||||
if urn_mixed not in {urn, urn_lower}:
|
||||
schema_info = self._resolve_schema_info(urn_mixed)
|
||||
if schema_info:
|
||||
return urn_mixed, schema_info
|
||||
|
||||
if self._prefers_urn_lower():
|
||||
return urn_lower, None
|
||||
else:
|
||||
|
@ -294,6 +294,9 @@ class BatchPartitionExecutor(Closeable):
|
||||
def _handle_batch_completion(
|
||||
batch: List[_BatchPartitionWorkItem], future: Future
|
||||
) -> None:
|
||||
nonlocal workers_available
|
||||
workers_available += 1
|
||||
|
||||
with clearinghouse_state_lock:
|
||||
for item in batch:
|
||||
keys_no_longer_in_flight.add(item.key)
|
||||
|
@ -9,7 +9,6 @@ logger: logging.Logger = logging.getLogger(__name__)
|
||||
class PerfTimer(AbstractContextManager):
|
||||
"""
|
||||
A context manager that gives easy access to elapsed time for performance measurement.
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self) -> None:
|
||||
|
@ -41,6 +41,11 @@ def test_get_urn_for_table_lowercase():
|
||||
== "urn:li:dataset:(urn:li:dataPlatform:mssql,uppercased-instance.database.dataset.table,PROD)"
|
||||
)
|
||||
|
||||
assert (
|
||||
schema_resolver.get_urn_for_table(table=table, lower=True, mixed=True)
|
||||
== "urn:li:dataset:(urn:li:dataPlatform:mssql,Uppercased-Instance.database.dataset.table,PROD)"
|
||||
)
|
||||
|
||||
|
||||
def test_get_urn_for_table_not_lower_should_keep_capital_letters():
|
||||
schema_resolver = SchemaResolver(
|
||||
|
@ -8,7 +8,7 @@ from datahub.utilities.perf_timer import PerfTimer
|
||||
approx = partial(pytest.approx, rel=2e-2)
|
||||
|
||||
|
||||
def test_perf_timer_simple():
|
||||
def test_perf_timer_simple() -> None:
|
||||
with PerfTimer() as timer:
|
||||
time.sleep(0.4)
|
||||
assert approx(timer.elapsed_seconds()) == 0.4
|
||||
@ -16,7 +16,7 @@ def test_perf_timer_simple():
|
||||
assert approx(timer.elapsed_seconds()) == 0.4
|
||||
|
||||
|
||||
def test_perf_timer_paused_timer():
|
||||
def test_perf_timer_paused_timer() -> None:
|
||||
with PerfTimer() as current_timer:
|
||||
time.sleep(0.5)
|
||||
assert approx(current_timer.elapsed_seconds()) == 0.5
|
||||
@ -29,7 +29,7 @@ def test_perf_timer_paused_timer():
|
||||
assert approx(current_timer.elapsed_seconds()) == 0.7
|
||||
|
||||
|
||||
def test_generator_with_paused_timer():
|
||||
def test_generator_with_paused_timer() -> None:
|
||||
n = 4
|
||||
|
||||
def generator_function():
|
||||
@ -46,3 +46,15 @@ def test_generator_with_paused_timer():
|
||||
seq = generator_function()
|
||||
list([i for i in seq])
|
||||
assert approx(outer_timer.elapsed_seconds()) == 1 + 0.2 * n + 0.2 * n
|
||||
|
||||
|
||||
def test_perf_timer_reuse() -> None:
|
||||
timer = PerfTimer()
|
||||
|
||||
with timer:
|
||||
time.sleep(0.2)
|
||||
|
||||
with timer:
|
||||
time.sleep(0.3)
|
||||
|
||||
assert approx(timer.elapsed_seconds()) == 0.5
|
||||
|
Loading…
x
Reference in New Issue
Block a user