mirror of
				https://github.com/datahub-project/datahub.git
				synced 2025-11-04 12:51:23 +00:00 
			
		
		
		
	fix(bigquery): add dataset_id for bigquery (#4932)
This commit is contained in:
		
							parent
							
								
									1de2e2c85f
								
							
						
					
					
						commit
						2bb2c5243c
					
				@ -677,7 +677,10 @@ class DatahubGEProfiler:
 | 
				
			|||||||
            yield GEContext(data_context, datasource_name)
 | 
					            yield GEContext(data_context, datasource_name)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def generate_profiles(
 | 
					    def generate_profiles(
 | 
				
			||||||
        self, requests: List[GEProfilerRequest], max_workers: int
 | 
					        self,
 | 
				
			||||||
 | 
					        requests: List[GEProfilerRequest],
 | 
				
			||||||
 | 
					        max_workers: int,
 | 
				
			||||||
 | 
					        platform: Optional[str] = None,
 | 
				
			||||||
    ) -> Iterable[Tuple[GEProfilerRequest, Optional[DatasetProfileClass]]]:
 | 
					    ) -> Iterable[Tuple[GEProfilerRequest, Optional[DatasetProfileClass]]]:
 | 
				
			||||||
        with PerfTimer() as timer, concurrent.futures.ThreadPoolExecutor(
 | 
					        with PerfTimer() as timer, concurrent.futures.ThreadPoolExecutor(
 | 
				
			||||||
            max_workers=max_workers
 | 
					            max_workers=max_workers
 | 
				
			||||||
@ -704,6 +707,7 @@ class DatahubGEProfiler:
 | 
				
			|||||||
                            self._generate_profile_from_request,
 | 
					                            self._generate_profile_from_request,
 | 
				
			||||||
                            query_combiner,
 | 
					                            query_combiner,
 | 
				
			||||||
                            request,
 | 
					                            request,
 | 
				
			||||||
 | 
					                            platform=platform,
 | 
				
			||||||
                        )
 | 
					                        )
 | 
				
			||||||
                        for request in requests
 | 
					                        for request in requests
 | 
				
			||||||
                    ]
 | 
					                    ]
 | 
				
			||||||
@ -751,10 +755,12 @@ class DatahubGEProfiler:
 | 
				
			|||||||
        self,
 | 
					        self,
 | 
				
			||||||
        query_combiner: SQLAlchemyQueryCombiner,
 | 
					        query_combiner: SQLAlchemyQueryCombiner,
 | 
				
			||||||
        request: GEProfilerRequest,
 | 
					        request: GEProfilerRequest,
 | 
				
			||||||
 | 
					        platform: Optional[str] = None,
 | 
				
			||||||
    ) -> Tuple[GEProfilerRequest, Optional[DatasetProfileClass]]:
 | 
					    ) -> Tuple[GEProfilerRequest, Optional[DatasetProfileClass]]:
 | 
				
			||||||
        return request, self._generate_single_profile(
 | 
					        return request, self._generate_single_profile(
 | 
				
			||||||
            query_combiner=query_combiner,
 | 
					            query_combiner=query_combiner,
 | 
				
			||||||
            pretty_name=request.pretty_name,
 | 
					            pretty_name=request.pretty_name,
 | 
				
			||||||
 | 
					            platform=platform,
 | 
				
			||||||
            **request.batch_kwargs,
 | 
					            **request.batch_kwargs,
 | 
				
			||||||
        )
 | 
					        )
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@ -781,6 +787,7 @@ class DatahubGEProfiler:
 | 
				
			|||||||
        table: str = None,
 | 
					        table: str = None,
 | 
				
			||||||
        partition: Optional[str] = None,
 | 
					        partition: Optional[str] = None,
 | 
				
			||||||
        custom_sql: Optional[str] = None,
 | 
					        custom_sql: Optional[str] = None,
 | 
				
			||||||
 | 
					        platform: Optional[str] = None,
 | 
				
			||||||
        **kwargs: Any,
 | 
					        **kwargs: Any,
 | 
				
			||||||
    ) -> Optional[DatasetProfileClass]:
 | 
					    ) -> Optional[DatasetProfileClass]:
 | 
				
			||||||
        bigquery_temp_table: Optional[str] = None
 | 
					        bigquery_temp_table: Optional[str] = None
 | 
				
			||||||
@ -820,6 +827,7 @@ class DatahubGEProfiler:
 | 
				
			|||||||
                    ge_context,
 | 
					                    ge_context,
 | 
				
			||||||
                    ge_config,
 | 
					                    ge_config,
 | 
				
			||||||
                    pretty_name=pretty_name,
 | 
					                    pretty_name=pretty_name,
 | 
				
			||||||
 | 
					                    platform=platform,
 | 
				
			||||||
                )
 | 
					                )
 | 
				
			||||||
 | 
					
 | 
				
			||||||
                profile = _SingleDatasetProfiler(
 | 
					                profile = _SingleDatasetProfiler(
 | 
				
			||||||
@ -852,6 +860,7 @@ class DatahubGEProfiler:
 | 
				
			|||||||
        ge_context: GEContext,
 | 
					        ge_context: GEContext,
 | 
				
			||||||
        batch_kwargs: dict,
 | 
					        batch_kwargs: dict,
 | 
				
			||||||
        pretty_name: str,
 | 
					        pretty_name: str,
 | 
				
			||||||
 | 
					        platform: Optional[str] = None,
 | 
				
			||||||
    ) -> Dataset:
 | 
					    ) -> Dataset:
 | 
				
			||||||
        # This is effectively emulating the beginning of the process that
 | 
					        # This is effectively emulating the beginning of the process that
 | 
				
			||||||
        # is followed by GE itself. In particular, we simply want to construct
 | 
					        # is followed by GE itself. In particular, we simply want to construct
 | 
				
			||||||
@ -878,4 +887,12 @@ class DatahubGEProfiler:
 | 
				
			|||||||
                **batch_kwargs,
 | 
					                **batch_kwargs,
 | 
				
			||||||
            },
 | 
					            },
 | 
				
			||||||
        )
 | 
					        )
 | 
				
			||||||
 | 
					        if platform is not None and platform == "bigquery":
 | 
				
			||||||
 | 
					            name_parts = pretty_name.split(".")
 | 
				
			||||||
 | 
					            if len(name_parts) != 3:
 | 
				
			||||||
 | 
					                logger.error(
 | 
				
			||||||
 | 
					                    f"Unexpected {pretty_name} while profiling. Should have 3 parts but has {len(name_parts)} parts."
 | 
				
			||||||
 | 
					                )
 | 
				
			||||||
 | 
					            else:
 | 
				
			||||||
 | 
					                batch.engine.dialect.dataset_id = name_parts[1]
 | 
				
			||||||
        return batch
 | 
					        return batch
 | 
				
			||||||
 | 
				
			|||||||
@ -730,7 +730,9 @@ class SQLAlchemySource(StatefulIngestionSourceBase):
 | 
				
			|||||||
                    )
 | 
					                    )
 | 
				
			||||||
 | 
					
 | 
				
			||||||
            if profiler and profile_requests:
 | 
					            if profiler and profile_requests:
 | 
				
			||||||
                yield from self.loop_profiler(profile_requests, profiler)
 | 
					                yield from self.loop_profiler(
 | 
				
			||||||
 | 
					                    profile_requests, profiler, platform=self.platform
 | 
				
			||||||
 | 
					                )
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        if self.is_stateful_ingestion_configured():
 | 
					        if self.is_stateful_ingestion_configured():
 | 
				
			||||||
            # Clean up stale entities.
 | 
					            # Clean up stale entities.
 | 
				
			||||||
@ -1329,10 +1331,13 @@ class SQLAlchemySource(StatefulIngestionSourceBase):
 | 
				
			|||||||
            )
 | 
					            )
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def loop_profiler(
 | 
					    def loop_profiler(
 | 
				
			||||||
        self, profile_requests: List["GEProfilerRequest"], profiler: "DatahubGEProfiler"
 | 
					        self,
 | 
				
			||||||
 | 
					        profile_requests: List["GEProfilerRequest"],
 | 
				
			||||||
 | 
					        profiler: "DatahubGEProfiler",
 | 
				
			||||||
 | 
					        platform: Optional[str] = None,
 | 
				
			||||||
    ) -> Iterable[MetadataWorkUnit]:
 | 
					    ) -> Iterable[MetadataWorkUnit]:
 | 
				
			||||||
        for request, profile in profiler.generate_profiles(
 | 
					        for request, profile in profiler.generate_profiles(
 | 
				
			||||||
            profile_requests, self.config.profiling.max_workers
 | 
					            profile_requests, self.config.profiling.max_workers, platform=platform
 | 
				
			||||||
        ):
 | 
					        ):
 | 
				
			||||||
            if profile is None:
 | 
					            if profile is None:
 | 
				
			||||||
                continue
 | 
					                continue
 | 
				
			||||||
 | 
				
			|||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user