fix(bigquery): reduce number of calls for details of partitioning (#5014)

This commit is contained in:
Aseem Bansal 2022-05-27 13:09:08 +05:30 committed by GitHub
parent 05310e4912
commit 912ce11821
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 31 additions and 5 deletions

View File

@ -332,6 +332,7 @@ class BigQuerySource(SQLAlchemySource):
self.report: BigQueryReport = BigQueryReport()
self.lineage_metadata: Optional[Dict[str, Set[str]]] = None
self.maximum_shard_ids: Dict[str, str] = dict()
self.partition_info: Dict[str, str] = dict()
atexit.register(cleanup, config)
def get_db_name(self, inspector: Inspector = None) -> str:
@ -703,15 +704,35 @@ class BigQuerySource(SQLAlchemySource):
else:
return True
def add_information_for_schema(self, inspector: Inspector, schema: str) -> None:
    """Cache the partitioning column of each partitioned table in ``schema``.

    Issues one query against the schema's ``INFORMATION_SCHEMA.COLUMNS`` view,
    filtered on ``is_partitioning_column = 'YES'``, and stores every returned
    ``(table_name, column_name)`` pair in ``self.partition_info`` under the key
    ``"{project_id}.{schema}.{table}"``. The accumulated mapping is then
    published on ``self.report.partition_info``.

    Args:
        inspector: Passed to ``get_db_name`` to resolve the project id.
        schema: Name of the schema whose columns are queried.
    """
    url = self.config.get_sql_alchemy_url()
    engine = create_engine(url, **self.config.options)
    try:
        project_id = self.get_db_name(inspector)
        # Identifiers cannot be bound as query parameters, so the fully
        # qualified view name is interpolated. The text sent to the server is
        # unchanged from the previous triple-quoted form.
        sql = (
            "\n"
            "select table_name, column_name\n"
            f"from `{project_id}.{schema}.INFORMATION_SCHEMA.COLUMNS`\n"
            "where is_partitioning_column = 'YES';\n"
        )
        with engine.connect() as con:
            for table, partition_column in con.execute(sql).fetchall():
                self.partition_info[
                    f"{project_id}.{schema}.{table}"
                ] = partition_column
        self.report.partition_info = self.partition_info
    finally:
        # The engine is created for this call only; release its connection
        # pool so one is not left behind for every schema that is processed.
        engine.dispose()
# Build the extra tags for the columns of one table, returned as a mapping of
# column name -> list of tags. Only Constants.TAG_PARTITION_KEY is ever added.
#
# NOTE(review): this block comes from a rendered diff that carries no +/-
# markers. The get_latest_partition lookup and the self.partition_info lookup
# below look like the removed and the added side of the same change rather
# than code meant to run together -- confirm against the committed file.
def get_extra_tags(
self, inspector: Inspector, schema: str, table: str
) -> Dict[str, List[str]]:
extra_tags: Dict[str, List[str]] = {}
# Per-table lookup through get_latest_partition (defined outside this view);
# when it yields a partition, that partition's column is tagged.
partition: Optional[BigQueryPartitionColumn] = self.get_latest_partition(
schema, table
)
if partition:
extra_tags[partition.column_name] = [Constants.TAG_PARTITION_KEY]
# Lookup in self.partition_info, keyed by "project_id.schema.table"; the
# stored value is used as the name of the column to tag.
project_id = self.get_db_name(inspector)
partition_lookup_key = f"{project_id}.{schema}.{table}"
if partition_lookup_key in self.partition_info:
extra_tags[self.partition_info[partition_lookup_key]] = [
Constants.TAG_PARTITION_KEY
]
return extra_tags
def generate_partition_profiler_query(

View File

@ -718,6 +718,7 @@ class SQLAlchemySource(StatefulIngestionSourceBase):
if not sql_config.schema_pattern.allowed(schema):
self.report.report_dropped(f"{schema}.*")
continue
self.add_information_for_schema(inspector, schema)
yield from self.gen_schema_containers(schema, db_name)
@ -871,6 +872,9 @@ class SQLAlchemySource(StatefulIngestionSourceBase):
except Exception as e:
self.report.report_failure(f"{schema}", f"Tables error: {e}")
def add_information_for_schema(self, inspector: Inspector, schema: str) -> None:
    """Hook invoked once per allowed schema; the base implementation does nothing.

    Subclasses may override this to gather schema-level information before the
    schema's containers are generated.
    """
    return None
def get_extra_tags(
self, inspector: Inspector, schema: str, table: str
) -> Optional[Dict[str, List[str]]]:

View File

@ -31,3 +31,4 @@ class BigQueryReport(SQLSourceReport):
audit_start_time: Optional[str] = None
audit_end_time: Optional[str] = None
upstream_lineage: Dict = field(default_factory=dict)
partition_info: Dict[str, str] = field(default_factory=dict)