mirror of
https://github.com/datahub-project/datahub.git
synced 2025-08-22 16:18:10 +00:00
feat(ingest): add lineage_client_project_id field to the BigQuery config (#4138)
* feat(ingest): add lineage_client_project_id field to the bigquery config * fix linting issues * add type annotation for arguments
This commit is contained in:
parent
88d1c96fff
commit
93ff09517b
@ -40,6 +40,7 @@ source:
|
|||||||
# - "schema.table.column"
|
# - "schema.table.column"
|
||||||
# deny:
|
# deny:
|
||||||
# - "*.*.*"
|
# - "*.*.*"
|
||||||
|
#lineage_client_project_id: project-id-1234567
|
||||||
|
|
||||||
## see https://datahubproject.io/docs/metadata-ingestion/sink_docs/datahub for complete documentation
|
## see https://datahubproject.io/docs/metadata-ingestion/sink_docs/datahub for complete documentation
|
||||||
sink:
|
sink:
|
||||||
|
@ -139,6 +139,7 @@ As a SQL-based service, the Athena integration is also supported by our SQL prof
|
|||||||
| `domain.domain_key.allow` | | | List of regex patterns for tables/schemas to assign to the domain_key domain (domain_key can be any string, like `sales`). Multiple domain keys can be specified. |
|
| `domain.domain_key.allow` | | | List of regex patterns for tables/schemas to assign to the domain_key domain (domain_key can be any string, like `sales`). Multiple domain keys can be specified. |
|
||||||
| `domain.domain_key.deny` | | | List of regex patterns for tables/schemas that should not be assigned to the domain_key domain. Multiple domain keys can be specified. |
|
| `domain.domain_key.deny` | | | List of regex patterns for tables/schemas that should not be assigned to the domain_key domain. Multiple domain keys can be specified. |
|
||||||
| `domain.domain_key.ignoreCase` | | `True` | Whether to ignore case sensitivity during pattern matching. Multiple domain keys can be specified. |
|
| `domain.domain_key.ignoreCase` | | `True` | Whether to ignore case sensitivity during pattern matching. Multiple domain keys can be specified. |
|
||||||
|
| `lineage_client_project_id` | | `None` | The project to use when creating the BigQuery client for lineage extraction. If left empty, the value of `project_id` is used instead. |
|
||||||
|
|
||||||
|
|
||||||
The following parameters are only relevant if include_table_lineage is set to true:
|
The following parameters are only relevant if include_table_lineage is set to true:
|
||||||
|
@ -236,6 +236,7 @@ def create_credential_temp_file(credential: BigQueryCredential) -> str:
|
|||||||
class BigQueryConfig(BaseTimeWindowConfig, SQLAlchemyConfig):
|
class BigQueryConfig(BaseTimeWindowConfig, SQLAlchemyConfig):
|
||||||
scheme: str = "bigquery"
|
scheme: str = "bigquery"
|
||||||
project_id: Optional[str] = None
|
project_id: Optional[str] = None
|
||||||
|
lineage_client_project_id: Optional[str] = None
|
||||||
|
|
||||||
log_page_size: Optional[pydantic.PositiveInt] = 1000
|
log_page_size: Optional[pydantic.PositiveInt] = 1000
|
||||||
credential: Optional[BigQueryCredential]
|
credential: Optional[BigQueryCredential]
|
||||||
@ -304,20 +305,29 @@ class BigQuerySource(SQLAlchemySource):
|
|||||||
|
|
||||||
def _compute_big_query_lineage(self) -> None:
|
def _compute_big_query_lineage(self) -> None:
|
||||||
if self.config.include_table_lineage:
|
if self.config.include_table_lineage:
|
||||||
|
lineage_client_project_id = self._get_lineage_client_project_id()
|
||||||
if self.config.use_exported_bigquery_audit_metadata:
|
if self.config.use_exported_bigquery_audit_metadata:
|
||||||
self._compute_bigquery_lineage_via_exported_bigquery_audit_metadata()
|
self._compute_bigquery_lineage_via_exported_bigquery_audit_metadata(
|
||||||
|
lineage_client_project_id
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
self._compute_bigquery_lineage_via_gcp_logging()
|
self._compute_bigquery_lineage_via_gcp_logging(
|
||||||
|
lineage_client_project_id
|
||||||
|
)
|
||||||
|
|
||||||
if self.lineage_metadata is not None:
|
if self.lineage_metadata is not None:
|
||||||
logger.info(
|
logger.info(
|
||||||
f"Built lineage map containing {len(self.lineage_metadata)} entries."
|
f"Built lineage map containing {len(self.lineage_metadata)} entries."
|
||||||
)
|
)
|
||||||
|
|
||||||
def _compute_bigquery_lineage_via_gcp_logging(self) -> None:
|
def _compute_bigquery_lineage_via_gcp_logging(
|
||||||
|
self, lineage_client_project_id: Optional[str]
|
||||||
|
) -> None:
|
||||||
logger.info("Populating lineage info via GCP audit logs")
|
logger.info("Populating lineage info via GCP audit logs")
|
||||||
try:
|
try:
|
||||||
_clients: List[GCPLoggingClient] = self._make_bigquery_client()
|
_clients: List[GCPLoggingClient] = self._make_bigquery_client(
|
||||||
|
lineage_client_project_id
|
||||||
|
)
|
||||||
log_entries: Iterable[AuditLogEntry] = self._get_bigquery_log_entries(
|
log_entries: Iterable[AuditLogEntry] = self._get_bigquery_log_entries(
|
||||||
_clients
|
_clients
|
||||||
)
|
)
|
||||||
@ -331,10 +341,12 @@ class BigQuerySource(SQLAlchemySource):
|
|||||||
e,
|
e,
|
||||||
)
|
)
|
||||||
|
|
||||||
def _compute_bigquery_lineage_via_exported_bigquery_audit_metadata(self) -> None:
|
def _compute_bigquery_lineage_via_exported_bigquery_audit_metadata(
|
||||||
|
self, lineage_client_project_id: Optional[str]
|
||||||
|
) -> None:
|
||||||
logger.info("Populating lineage info via exported GCP audit logs")
|
logger.info("Populating lineage info via exported GCP audit logs")
|
||||||
try:
|
try:
|
||||||
_client: BigQueryClient = BigQueryClient(project=self.config.project_id)
|
_client: BigQueryClient = BigQueryClient(project=lineage_client_project_id)
|
||||||
exported_bigquery_audit_metadata: Iterable[
|
exported_bigquery_audit_metadata: Iterable[
|
||||||
BigQueryAuditMetadata
|
BigQueryAuditMetadata
|
||||||
] = self._get_exported_bigquery_audit_metadata(_client)
|
] = self._get_exported_bigquery_audit_metadata(_client)
|
||||||
@ -350,17 +362,28 @@ class BigQuerySource(SQLAlchemySource):
|
|||||||
e,
|
e,
|
||||||
)
|
)
|
||||||
|
|
||||||
def _make_bigquery_client(self) -> List[GCPLoggingClient]:
|
def _make_bigquery_client(
|
||||||
|
self, lineage_client_project_id: Optional[str]
|
||||||
|
) -> List[GCPLoggingClient]:
|
||||||
# See https://github.com/googleapis/google-cloud-python/issues/2674 for
|
# See https://github.com/googleapis/google-cloud-python/issues/2674 for
|
||||||
# why we disable gRPC here.
|
# why we disable gRPC here.
|
||||||
client_options = self.config.extra_client_options.copy()
|
client_options = self.config.extra_client_options.copy()
|
||||||
client_options["_use_grpc"] = False
|
client_options["_use_grpc"] = False
|
||||||
project_id = self.config.project_id
|
if lineage_client_project_id is not None:
|
||||||
if project_id is not None:
|
return [
|
||||||
return [GCPLoggingClient(**client_options, project=project_id)]
|
GCPLoggingClient(**client_options, project=lineage_client_project_id)
|
||||||
|
]
|
||||||
else:
|
else:
|
||||||
return [GCPLoggingClient(**client_options)]
|
return [GCPLoggingClient(**client_options)]
|
||||||
|
|
||||||
|
def _get_lineage_client_project_id(self) -> Optional[str]:
|
||||||
|
project_id: Optional[str] = (
|
||||||
|
self.config.lineage_client_project_id
|
||||||
|
if self.config.lineage_client_project_id
|
||||||
|
else self.config.project_id
|
||||||
|
)
|
||||||
|
return project_id
|
||||||
|
|
||||||
def _get_bigquery_log_entries(
|
def _get_bigquery_log_entries(
|
||||||
self, clients: List[GCPLoggingClient]
|
self, clients: List[GCPLoggingClient]
|
||||||
) -> Iterable[AuditLogEntry]:
|
) -> Iterable[AuditLogEntry]:
|
||||||
|
Loading…
x
Reference in New Issue
Block a user