mirror of
https://github.com/datahub-project/datahub.git
synced 2025-12-29 10:57:52 +00:00
fix(bigquery): reduce number of calls for details of partitioning (#5014)
This commit is contained in:
parent
05310e4912
commit
912ce11821
@ -332,6 +332,7 @@ class BigQuerySource(SQLAlchemySource):
|
||||
self.report: BigQueryReport = BigQueryReport()
|
||||
self.lineage_metadata: Optional[Dict[str, Set[str]]] = None
|
||||
self.maximum_shard_ids: Dict[str, str] = dict()
|
||||
self.partition_info: Dict[str, str] = dict()
|
||||
atexit.register(cleanup, config)
|
||||
|
||||
def get_db_name(self, inspector: Inspector = None) -> str:
|
||||
@ -703,15 +704,35 @@ class BigQuerySource(SQLAlchemySource):
|
||||
else:
|
||||
return True
|
||||
|
||||
def add_information_for_schema(self, inspector: Inspector, schema: str) -> None:
    """Cache the partitioning column of every partitioned table in ``schema``.

    Runs a single query against the schema's ``INFORMATION_SCHEMA.COLUMNS``
    view and records each returned row in ``self.partition_info`` under the
    key ``"<project_id>.<schema>.<table>"``. The same mapping is then
    published on ``self.report.partition_info``.

    Args:
        inspector: Used only to resolve the project id via ``get_db_name``.
        schema: Name of the schema whose columns are scanned.
    """
    url = self.config.get_sql_alchemy_url()
    engine = create_engine(url, **self.config.options)
    project_id = self.get_db_name(inspector)
    # project_id and schema are parts of a backtick-quoted table path, so
    # they have to be interpolated; bind parameters cannot stand in for
    # identifiers.
    sql = f"""
        select table_name, column_name
        from `{project_id}.{schema}.INFORMATION_SCHEMA.COLUMNS`
        where is_partitioning_column = 'YES';
    """
    try:
        with engine.connect() as con:
            for table, partition_column in con.execute(sql):
                self.partition_info[
                    f"{project_id}.{schema}.{table}"
                ] = partition_column
    finally:
        # This engine is created per call, so release its connection pool
        # instead of leaving one pool behind for every schema processed.
        engine.dispose()
    self.report.partition_info = self.partition_info
|
||||
|
||||
def get_extra_tags(
    self, inspector: Inspector, schema: str, table: str
) -> Dict[str, List[str]]:
    """Return the extra column tags for ``table``.

    The partitioning column is read from the ``self.partition_info`` cache,
    which ``add_information_for_schema`` fills with one query per schema,
    so no per-table partition query is issued here.

    Args:
        inspector: Used only to resolve the project id via ``get_db_name``.
        schema: Schema that contains the table.
        table: Table name, without project or schema qualification.

    Returns:
        A mapping from column name to the list of tags for that column.
        It holds the partition-key tag for the table's partitioning column,
        or is empty when the cache has no entry for the table.
    """
    extra_tags: Dict[str, List[str]] = {}
    project_id = self.get_db_name(inspector)

    # Key format must match the one written by add_information_for_schema.
    partition_column = self.partition_info.get(f"{project_id}.{schema}.{table}")
    if partition_column is not None:
        extra_tags[partition_column] = [Constants.TAG_PARTITION_KEY]
    return extra_tags
|
||||
|
||||
def generate_partition_profiler_query(
|
||||
|
||||
@ -718,6 +718,7 @@ class SQLAlchemySource(StatefulIngestionSourceBase):
|
||||
if not sql_config.schema_pattern.allowed(schema):
|
||||
self.report.report_dropped(f"{schema}.*")
|
||||
continue
|
||||
self.add_information_for_schema(inspector, schema)
|
||||
|
||||
yield from self.gen_schema_containers(schema, db_name)
|
||||
|
||||
@ -871,6 +872,9 @@ class SQLAlchemySource(StatefulIngestionSourceBase):
|
||||
except Exception as e:
|
||||
self.report.report_failure(f"{schema}", f"Tables error: {e}")
|
||||
|
||||
def add_information_for_schema(self, inspector: Inspector, schema: str) -> None:
    """Per-schema hook for subclasses; the base implementation does nothing.

    Args:
        inspector: Inspector for the database being processed.
        schema: Name of the schema about to be processed.
    """
    return None
|
||||
|
||||
def get_extra_tags(
|
||||
self, inspector: Inspector, schema: str, table: str
|
||||
) -> Optional[Dict[str, List[str]]]:
|
||||
|
||||
@ -31,3 +31,4 @@ class BigQueryReport(SQLSourceReport):
|
||||
audit_start_time: Optional[str] = None
|
||||
audit_end_time: Optional[str] = None
|
||||
upstream_lineage: Dict = field(default_factory=dict)
|
||||
partition_info: Dict[str, str] = field(default_factory=dict)
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user