diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/bigquery.py b/metadata-ingestion/src/datahub/ingestion/source/sql/bigquery.py index 21aad90e8d..947f9b9271 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/bigquery.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/bigquery.py @@ -460,6 +460,30 @@ class BigQuerySource(SQLAlchemySource): return None project_id = self.get_db_name(inspector) _client: BigQueryClient = BigQueryClient(project=project_id) + # Reading all tables' metadata to report + base_query = ( + f"SELECT " + f"table_id, " + f"size_bytes, " + f"last_modified_time, " + f"row_count, " + f"FROM {schema}.__TABLES__" + ) + all_tables = _client.query(base_query) + report_tables: List[str] = [ + "table_id, size_bytes, last_modified_time, row_count" + ] + for table_row in all_tables: + report_tables.append( + f"{table_row.table_id}, {table_row.size_bytes}, {table_row.last_modified_time}, {table_row.row_count}" + ) + report_key = f"{self._get_project_id(inspector)}.{schema}" + self.report.table_metadata[report_key] = report_tables + self.report.profile_table_selection_criteria[report_key] = ( + "no constraint" if profile_clause == "" else profile_clause.lstrip(" WHERE") + ) + + # reading filtered tables. TODO: remove this call and apply local filtering on above query results. query = ( f"SELECT " f"table_id, " @@ -483,6 +507,7 @@ class BigQuerySource(SQLAlchemySource): logger.debug( f"Generated profiling candidates for {schema}: {_profile_candidates}" ) + self.report.selected_profile_tables[report_key] = _profile_candidates return _profile_candidates def _get_bigquery_log_entries( diff --git a/metadata-ingestion/src/datahub/ingestion/source_report/sql/bigquery.py b/metadata-ingestion/src/datahub/ingestion/source_report/sql/bigquery.py index 3e46df61af..e857716464 100644 --- a/metadata-ingestion/src/datahub/ingestion/source_report/sql/bigquery.py +++ b/metadata-ingestion/src/datahub/ingestion/source_report/sql/bigquery.py @@ -1,6 +1,6 @@ from dataclasses import dataclass, field from datetime import datetime -from typing import Dict, Optional +from typing import Dict, List, Optional import pydantic @@ -33,3 +33,6 @@ class BigQueryReport(SQLSourceReport): audit_end_time: Optional[str] = None upstream_lineage: Dict = field(default_factory=dict) partition_info: Dict[str, str] = field(default_factory=dict) + table_metadata: Dict[str, List[str]] = field(default_factory=dict) + profile_table_selection_criteria: Dict[str, str] = field(default_factory=dict) + selected_profile_tables: Dict[str, List[str]] = field(default_factory=dict)