feat(ingest): add lineage_client_project_id field to the BigQuery config (#4138)

* feat(ingest): add lineage_client_project_id field to the bigquery config

* fix linting issues

* add type annotation for arguments
This commit is contained in:
Vishal Shah 2022-02-28 14:19:23 -05:00 committed by GitHub
parent 88d1c96fff
commit 93ff09517b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 35 additions and 10 deletions

View File

@ -40,6 +40,7 @@ source:
# - "schema.table.column" # - "schema.table.column"
# deny: # deny:
# - "*.*.*" # - "*.*.*"
#lineage_client_project_id: project-id-1234567
## see https://datahubproject.io/docs/metadata-ingestion/sink_docs/datahub for complete documentation ## see https://datahubproject.io/docs/metadata-ingestion/sink_docs/datahub for complete documentation
sink: sink:

View File

@ -139,6 +139,7 @@ As a SQL-based service, the Athena integration is also supported by our SQL prof
| `domain.domain_key.allow` | | | List of regex patterns for tables/schemas to set domain_key domain key (domain_key can be any string like `sales`). There can be multiple domain keys specified. | | `domain.domain_key.allow` | | | List of regex patterns for tables/schemas to set domain_key domain key (domain_key can be any string like `sales`). There can be multiple domain keys specified. |
| `domain.domain_key.deny` | | | List of regex patterns for tables/schemas to not assign domain_key. There can be multiple domain keys specified. | | `domain.domain_key.deny` | | | List of regex patterns for tables/schemas to not assign domain_key. There can be multiple domain keys specified. |
| `domain.domain_key.ignoreCase` | | `True` | Whether to ignore case sensitivity during pattern matching. There can be multiple domain keys specified. | | `domain.domain_key.ignoreCase` | | `True` | Whether to ignore case sensitivity during pattern matching. There can be multiple domain keys specified. |
| `lineage_client_project_id` | | `None` | The project to use when creating the BigQuery and GCP Logging clients used for lineage extraction. If left empty, the value of `project_id` is used. |
The following parameters are only relevant if include_table_lineage is set to true: The following parameters are only relevant if include_table_lineage is set to true:

View File

@ -236,6 +236,7 @@ def create_credential_temp_file(credential: BigQueryCredential) -> str:
class BigQueryConfig(BaseTimeWindowConfig, SQLAlchemyConfig): class BigQueryConfig(BaseTimeWindowConfig, SQLAlchemyConfig):
scheme: str = "bigquery" scheme: str = "bigquery"
project_id: Optional[str] = None project_id: Optional[str] = None
lineage_client_project_id: Optional[str] = None
log_page_size: Optional[pydantic.PositiveInt] = 1000 log_page_size: Optional[pydantic.PositiveInt] = 1000
credential: Optional[BigQueryCredential] credential: Optional[BigQueryCredential]
@ -304,20 +305,29 @@ class BigQuerySource(SQLAlchemySource):
def _compute_big_query_lineage(self) -> None: def _compute_big_query_lineage(self) -> None:
if self.config.include_table_lineage: if self.config.include_table_lineage:
lineage_client_project_id = self._get_lineage_client_project_id()
if self.config.use_exported_bigquery_audit_metadata: if self.config.use_exported_bigquery_audit_metadata:
self._compute_bigquery_lineage_via_exported_bigquery_audit_metadata() self._compute_bigquery_lineage_via_exported_bigquery_audit_metadata(
lineage_client_project_id
)
else: else:
self._compute_bigquery_lineage_via_gcp_logging() self._compute_bigquery_lineage_via_gcp_logging(
lineage_client_project_id
)
if self.lineage_metadata is not None: if self.lineage_metadata is not None:
logger.info( logger.info(
f"Built lineage map containing {len(self.lineage_metadata)} entries." f"Built lineage map containing {len(self.lineage_metadata)} entries."
) )
def _compute_bigquery_lineage_via_gcp_logging(self) -> None: def _compute_bigquery_lineage_via_gcp_logging(
self, lineage_client_project_id: Optional[str]
) -> None:
logger.info("Populating lineage info via GCP audit logs") logger.info("Populating lineage info via GCP audit logs")
try: try:
_clients: List[GCPLoggingClient] = self._make_bigquery_client() _clients: List[GCPLoggingClient] = self._make_bigquery_client(
lineage_client_project_id
)
log_entries: Iterable[AuditLogEntry] = self._get_bigquery_log_entries( log_entries: Iterable[AuditLogEntry] = self._get_bigquery_log_entries(
_clients _clients
) )
@ -331,10 +341,12 @@ class BigQuerySource(SQLAlchemySource):
e, e,
) )
def _compute_bigquery_lineage_via_exported_bigquery_audit_metadata(self) -> None: def _compute_bigquery_lineage_via_exported_bigquery_audit_metadata(
self, lineage_client_project_id: Optional[str]
) -> None:
logger.info("Populating lineage info via exported GCP audit logs") logger.info("Populating lineage info via exported GCP audit logs")
try: try:
_client: BigQueryClient = BigQueryClient(project=self.config.project_id) _client: BigQueryClient = BigQueryClient(project=lineage_client_project_id)
exported_bigquery_audit_metadata: Iterable[ exported_bigquery_audit_metadata: Iterable[
BigQueryAuditMetadata BigQueryAuditMetadata
] = self._get_exported_bigquery_audit_metadata(_client) ] = self._get_exported_bigquery_audit_metadata(_client)
@ -350,17 +362,28 @@ class BigQuerySource(SQLAlchemySource):
e, e,
) )
def _make_bigquery_client(self) -> List[GCPLoggingClient]: def _make_bigquery_client(
self, lineage_client_project_id: Optional[str]
) -> List[GCPLoggingClient]:
# See https://github.com/googleapis/google-cloud-python/issues/2674 for # See https://github.com/googleapis/google-cloud-python/issues/2674 for
# why we disable gRPC here. # why we disable gRPC here.
client_options = self.config.extra_client_options.copy() client_options = self.config.extra_client_options.copy()
client_options["_use_grpc"] = False client_options["_use_grpc"] = False
project_id = self.config.project_id if lineage_client_project_id is not None:
if project_id is not None: return [
return [GCPLoggingClient(**client_options, project=project_id)] GCPLoggingClient(**client_options, project=lineage_client_project_id)
]
else: else:
return [GCPLoggingClient(**client_options)] return [GCPLoggingClient(**client_options)]
def _get_lineage_client_project_id(self) -> Optional[str]:
project_id: Optional[str] = (
self.config.lineage_client_project_id
if self.config.lineage_client_project_id
else self.config.project_id
)
return project_id
def _get_bigquery_log_entries( def _get_bigquery_log_entries(
self, clients: List[GCPLoggingClient] self, clients: List[GCPLoggingClient]
) -> Iterable[AuditLogEntry]: ) -> Iterable[AuditLogEntry]: