mirror of
https://github.com/datahub-project/datahub.git
synced 2025-11-09 07:53:33 +00:00
fix(bigquery): add dataset_id for bigquery (#4932)
This commit is contained in:
parent
1de2e2c85f
commit
2bb2c5243c
@ -677,7 +677,10 @@ class DatahubGEProfiler:
|
|||||||
yield GEContext(data_context, datasource_name)
|
yield GEContext(data_context, datasource_name)
|
||||||
|
|
||||||
def generate_profiles(
|
def generate_profiles(
|
||||||
self, requests: List[GEProfilerRequest], max_workers: int
|
self,
|
||||||
|
requests: List[GEProfilerRequest],
|
||||||
|
max_workers: int,
|
||||||
|
platform: Optional[str] = None,
|
||||||
) -> Iterable[Tuple[GEProfilerRequest, Optional[DatasetProfileClass]]]:
|
) -> Iterable[Tuple[GEProfilerRequest, Optional[DatasetProfileClass]]]:
|
||||||
with PerfTimer() as timer, concurrent.futures.ThreadPoolExecutor(
|
with PerfTimer() as timer, concurrent.futures.ThreadPoolExecutor(
|
||||||
max_workers=max_workers
|
max_workers=max_workers
|
||||||
@ -704,6 +707,7 @@ class DatahubGEProfiler:
|
|||||||
self._generate_profile_from_request,
|
self._generate_profile_from_request,
|
||||||
query_combiner,
|
query_combiner,
|
||||||
request,
|
request,
|
||||||
|
platform=platform,
|
||||||
)
|
)
|
||||||
for request in requests
|
for request in requests
|
||||||
]
|
]
|
||||||
@ -751,10 +755,12 @@ class DatahubGEProfiler:
|
|||||||
self,
|
self,
|
||||||
query_combiner: SQLAlchemyQueryCombiner,
|
query_combiner: SQLAlchemyQueryCombiner,
|
||||||
request: GEProfilerRequest,
|
request: GEProfilerRequest,
|
||||||
|
platform: Optional[str] = None,
|
||||||
) -> Tuple[GEProfilerRequest, Optional[DatasetProfileClass]]:
|
) -> Tuple[GEProfilerRequest, Optional[DatasetProfileClass]]:
|
||||||
return request, self._generate_single_profile(
|
return request, self._generate_single_profile(
|
||||||
query_combiner=query_combiner,
|
query_combiner=query_combiner,
|
||||||
pretty_name=request.pretty_name,
|
pretty_name=request.pretty_name,
|
||||||
|
platform=platform,
|
||||||
**request.batch_kwargs,
|
**request.batch_kwargs,
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -781,6 +787,7 @@ class DatahubGEProfiler:
|
|||||||
table: str = None,
|
table: str = None,
|
||||||
partition: Optional[str] = None,
|
partition: Optional[str] = None,
|
||||||
custom_sql: Optional[str] = None,
|
custom_sql: Optional[str] = None,
|
||||||
|
platform: Optional[str] = None,
|
||||||
**kwargs: Any,
|
**kwargs: Any,
|
||||||
) -> Optional[DatasetProfileClass]:
|
) -> Optional[DatasetProfileClass]:
|
||||||
bigquery_temp_table: Optional[str] = None
|
bigquery_temp_table: Optional[str] = None
|
||||||
@ -820,6 +827,7 @@ class DatahubGEProfiler:
|
|||||||
ge_context,
|
ge_context,
|
||||||
ge_config,
|
ge_config,
|
||||||
pretty_name=pretty_name,
|
pretty_name=pretty_name,
|
||||||
|
platform=platform,
|
||||||
)
|
)
|
||||||
|
|
||||||
profile = _SingleDatasetProfiler(
|
profile = _SingleDatasetProfiler(
|
||||||
@ -852,6 +860,7 @@ class DatahubGEProfiler:
|
|||||||
ge_context: GEContext,
|
ge_context: GEContext,
|
||||||
batch_kwargs: dict,
|
batch_kwargs: dict,
|
||||||
pretty_name: str,
|
pretty_name: str,
|
||||||
|
platform: Optional[str] = None,
|
||||||
) -> Dataset:
|
) -> Dataset:
|
||||||
# This is effectively emulating the beginning of the process that
|
# This is effectively emulating the beginning of the process that
|
||||||
# is followed by GE itself. In particular, we simply want to construct
|
# is followed by GE itself. In particular, we simply want to construct
|
||||||
@ -878,4 +887,12 @@ class DatahubGEProfiler:
|
|||||||
**batch_kwargs,
|
**batch_kwargs,
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
|
if platform is not None and platform == "bigquery":
|
||||||
|
name_parts = pretty_name.split(".")
|
||||||
|
if len(name_parts) != 3:
|
||||||
|
logger.error(
|
||||||
|
f"Unexpected {pretty_name} while profiling. Should have 3 parts but has {len(name_parts)} parts."
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
batch.engine.dialect.dataset_id = name_parts[1]
|
||||||
return batch
|
return batch
|
||||||
|
|||||||
@ -730,7 +730,9 @@ class SQLAlchemySource(StatefulIngestionSourceBase):
|
|||||||
)
|
)
|
||||||
|
|
||||||
if profiler and profile_requests:
|
if profiler and profile_requests:
|
||||||
yield from self.loop_profiler(profile_requests, profiler)
|
yield from self.loop_profiler(
|
||||||
|
profile_requests, profiler, platform=self.platform
|
||||||
|
)
|
||||||
|
|
||||||
if self.is_stateful_ingestion_configured():
|
if self.is_stateful_ingestion_configured():
|
||||||
# Clean up stale entities.
|
# Clean up stale entities.
|
||||||
@ -1329,10 +1331,13 @@ class SQLAlchemySource(StatefulIngestionSourceBase):
|
|||||||
)
|
)
|
||||||
|
|
||||||
def loop_profiler(
|
def loop_profiler(
|
||||||
self, profile_requests: List["GEProfilerRequest"], profiler: "DatahubGEProfiler"
|
self,
|
||||||
|
profile_requests: List["GEProfilerRequest"],
|
||||||
|
profiler: "DatahubGEProfiler",
|
||||||
|
platform: Optional[str] = None,
|
||||||
) -> Iterable[MetadataWorkUnit]:
|
) -> Iterable[MetadataWorkUnit]:
|
||||||
for request, profile in profiler.generate_profiles(
|
for request, profile in profiler.generate_profiles(
|
||||||
profile_requests, self.config.profiling.max_workers
|
profile_requests, self.config.profiling.max_workers, platform=platform
|
||||||
):
|
):
|
||||||
if profile is None:
|
if profile is None:
|
||||||
continue
|
continue
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user