docs(bigquery): profiling report enhancement (#5342)

This commit is contained in:
Mugdha Hardikar 2022-07-06 18:38:35 +05:30 committed by GitHub
parent d60f789519
commit 77eaab609d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 29 additions and 1 deletion

View File

@ -460,6 +460,30 @@ class BigQuerySource(SQLAlchemySource):
return None return None
project_id = self.get_db_name(inspector) project_id = self.get_db_name(inspector)
_client: BigQueryClient = BigQueryClient(project=project_id) _client: BigQueryClient = BigQueryClient(project=project_id)
# Read the metadata of every table in the schema so it can be included in the report.
base_query = (
f"SELECT "
f"table_id, "
f"size_bytes, "
f"last_modified_time, "
f"row_count, "
f"FROM {schema}.__TABLES__"
)
all_tables = _client.query(base_query)
report_tables: List[str] = [
"table_id, size_bytes, last_modified_time, row_count"
]
for table_row in all_tables:
report_tables.append(
f"{table_row.table_id}, {table_row.size_bytes}, {table_row.last_modified_time}, {table_row.row_count}"
)
report_key = f"{self._get_project_id(inspector)}.{schema}"
self.report.table_metadata[report_key] = report_tables
self.report.profile_table_selection_criteria[report_key] = (
"no constraint" if profile_clause == "" else profile_clause.lstrip(" WHERE")
)
# Read the filtered tables. TODO: drop this second query and filter the results of the query above locally instead.
query = ( query = (
f"SELECT " f"SELECT "
f"table_id, " f"table_id, "
@ -483,6 +507,7 @@ class BigQuerySource(SQLAlchemySource):
logger.debug( logger.debug(
f"Generated profiling candidates for {schema}: {_profile_candidates}" f"Generated profiling candidates for {schema}: {_profile_candidates}"
) )
self.report.selected_profile_tables[report_key] = _profile_candidates
return _profile_candidates return _profile_candidates
def _get_bigquery_log_entries( def _get_bigquery_log_entries(

View File

@ -1,6 +1,6 @@
from dataclasses import dataclass, field from dataclasses import dataclass, field
from datetime import datetime from datetime import datetime
from typing import Dict, Optional from typing import Dict, List, Optional
import pydantic import pydantic
@ -33,3 +33,6 @@ class BigQueryReport(SQLSourceReport):
audit_end_time: Optional[str] = None audit_end_time: Optional[str] = None
upstream_lineage: Dict = field(default_factory=dict) upstream_lineage: Dict = field(default_factory=dict)
partition_info: Dict[str, str] = field(default_factory=dict) partition_info: Dict[str, str] = field(default_factory=dict)
table_metadata: Dict[str, List[str]] = field(default_factory=dict)
profile_table_selection_criteria: Dict[str, str] = field(default_factory=dict)
selected_profile_tables: Dict[str, List[str]] = field(default_factory=dict)