From 6db0925b26fdc2155fec30f37831329ec4655194 Mon Sep 17 00:00:00 2001 From: Tamas Nemeth Date: Thu, 6 Oct 2022 18:20:49 +0200 Subject: [PATCH] fix(ingest): bigquery-beta - fix for missing key error if dataset was empty (#6133) --- .../docs/sources/bigquery/bigquery-beta_pre.md | 3 +++ .../ingestion/source/bigquery_v2/bigquery.py | 15 ++++++++++++++- .../datahub/ingestion/source/bigquery_v2/usage.py | 3 ++- 3 files changed, 19 insertions(+), 2 deletions(-) diff --git a/metadata-ingestion/docs/sources/bigquery/bigquery-beta_pre.md b/metadata-ingestion/docs/sources/bigquery/bigquery-beta_pre.md index c2fa46ba0a..873f337ca8 100644 --- a/metadata-ingestion/docs/sources/bigquery/bigquery-beta_pre.md +++ b/metadata-ingestion/docs/sources/bigquery/bigquery-beta_pre.md @@ -25,6 +25,9 @@ If you have multiple projects in your BigQuery setup, the role should be granted | `bigquery.readsessions.getData` | Get data from the read session. | | `resourcemanager.projects.get` | Retrieve project names and metadata. | +You can use the following predefined IAM role which has all the needed permissions as well: +- [roles/bigquery.metadataViewer](https://cloud.google.com/bigquery/docs/access-control#bigquery.metadataViewer) + ##### Lineage/usage generation requirements Additional requirements needed on the top of the basic requirements. diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py index e54faea2e7..ffdae318d9 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py @@ -433,11 +433,17 @@ class BigqueryV2Source(StatefulIngestionSourceBase, TestableSource): yield wu def get_workunits(self) -> Iterable[WorkUnit]: - + logger.info("Getting projects") conn: bigquery.Client = self.get_bigquery_client() self.add_config_to_report() projects: List[BigqueryProject] = BigQueryDataDictionary.get_projects(conn) + if len(projects) == 0: + logger.warning( + "Get projects didn't return any project. Maybe resourcemanager.projects.get permission is missing for the service account. You can assign predefined roles/bigquery.metadataViewer role to your service account." + ) + return + for project_id in projects: if not self.config.project_id_pattern.allowed(project_id.id): self.report.report_dropped(project_id.id) @@ -446,6 +452,7 @@ class BigqueryV2Source(StatefulIngestionSourceBase, TestableSource): yield from self._process_project(conn, project_id) if self.config.profiling.enabled: + logger.info("Starting profiling...") yield from self.profiler.get_workunits(self.db_tables) # Clean up stale entities if configured. @@ -475,6 +482,12 @@ class BigqueryV2Source(StatefulIngestionSourceBase, TestableSource): ) return None + if len(bigquery_project.datasets) == 0: + logger.warning( + f"No dataset found in {project_id}. Either there are no datasets in this project or missing bigquery.datasets.get permission. You can assign predefined roles/bigquery.metadataViewer role to your service account." + ) + return + for bigquery_dataset in bigquery_project.datasets: if not self.config.dataset_pattern.allowed(bigquery_dataset.name): diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/usage.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/usage.py index 119d8fa49b..7227c209f8 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/usage.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/usage.py @@ -761,7 +761,8 @@ class BigQueryUsageExtractor: try: resource = event.read_event.resource.get_sanitized_table_ref() if ( - resource.table_identifier.get_table_display_name() + resource.table_identifier.dataset not in tables + or resource.table_identifier.get_table_display_name() not in tables[resource.table_identifier.dataset] ): logger.debug(f"Skipping non existing {resource} from usage")