BigQuery lineage mapping (#2849)

* lineage-boilerplate-code-added

* bigquery-lineage-completed

* lineage-code-optimised

* code-cleaned

* code-smell-fixed

* code-refined

* added-database-name-in-fqdn

* code-smell-removed

* exception-handled

* removed-print-statement

* formatted-setup.py

* added-sqllineage-version

* included-valid-type-none

* debug-added-in-logger

* lineage-logic-updated

* updated-lineage-params

* global-config-variables-removed

* return-type-updated-in-entity

* updated-condition-in-from-entity

* imported-sqllineage-inside-method

* code-smell-removed

* code-formatted
codingwithabhi 2022-02-24 11:59:00 +05:30 committed by GitHub
parent 1fb0e7c489
commit 2ef2edbf17
3 changed files with 67 additions and 7 deletions

View File

@@ -70,7 +70,7 @@ plugins: Dict[str, Set[str]] = {
"pyarrow~=6.0.1",
"google-cloud-datacatalog==3.6.2",
},
"bigquery-usage": {"google-cloud-logging", "cachetools"},
"bigquery-usage": {"google-cloud-logging", "cachetools", "sqllineage==1.3.3"},
# "docker": {"docker==5.0.3"},
"docker": {"python_on_whales==0.34.0"},
"backup": {"boto3~=1.19.12"},

View File

@@ -14,6 +14,7 @@ import collections
# This import verifies that the dependencies are available.
import logging as log
import os
import traceback
from datetime import datetime
from typing import Iterable
@@ -24,10 +25,11 @@ from metadata.generated.schema.entity.services.databaseService import (
)
from metadata.ingestion.api.source import Source, SourceStatus
from metadata.ingestion.models.table_queries import TableQuery
from metadata.ingestion.ometa.ometa_api import OpenMetadata
from metadata.ingestion.ometa.openmetadata_rest import MetadataServerConfig
from metadata.ingestion.source.bigquery import BigQueryConfig, BigquerySource
from metadata.ingestion.source.sql_alchemy_helper import SQLSourceStatus
from metadata.utils.helpers import get_start_and_end
from metadata.utils.helpers import get_start_and_end, ingest_lineage
logger = log.getLogger(__name__)
@@ -39,7 +41,7 @@ class BigqueryUsageSource(Source[TableQuery]):
def __init__(self, config, metadata_config, ctx):
super().__init__(ctx)
self.temp_credentials = None
self.metadata_config = metadata_config
self.config = config
self.project_id = self.config.project_id
self.logger_name = "cloudaudit.googleapis.com%2Fdata_access"
@@ -64,7 +66,6 @@ class BigqueryUsageSource(Source[TableQuery]):
def get_connection_url(self):
if self.project_id:
print(f"{self.scheme}://{self.project_id}")
return f"{self.scheme}://{self.project_id}"
return f"{self.scheme}://"
@@ -130,12 +131,19 @@ class BigqueryUsageSource(Source[TableQuery]):
service_name=self.config.service_name,
)
yield tq
query_info = {
"sql": tq.sql,
"from_type": "table",
"to_type": "table",
"service_name": self.config.service_name,
}
ingest_lineage(query_info, self.metadata_config)
except Exception as err:
logger.error(repr(err))
def close(self):
pass
def get_status(self) -> SourceStatus:
return self.status
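
For illustration, the payload built above for a single audit-log query might look like the following; the SQL text and service name are made up:

# Hypothetical lineage payload handed to ingest_lineage for one parsed query.
query_info = {
    "sql": "INSERT INTO sales.daily_totals SELECT * FROM sales.orders",
    "from_type": "table",  # entity type of the edge's source reference
    "to_type": "table",    # entity type of the edge's target reference
    "service_name": "local_bigquery",  # assumed BigQuery service name in OpenMetadata
}
# ingest_lineage(query_info, self.metadata_config) then parses the SQL with
# sqllineage and publishes table-level lineage edges through the metadata API.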

View File

@@ -8,10 +8,13 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import traceback
from datetime import datetime, timedelta
from typing import Any, Dict, Iterable, List
from metadata.generated.schema.api.lineage.addLineage import AddLineageRequest
from metadata.generated.schema.api.services.createDashboardService import (
CreateDashboardServiceRequest,
)
@@ -27,11 +30,14 @@ from metadata.generated.schema.api.services.createPipelineService import (
from metadata.generated.schema.api.services.createStorageService import (
CreateStorageServiceRequest,
)
from metadata.generated.schema.entity.data.table import Table
from metadata.generated.schema.entity.services.dashboardService import DashboardService
from metadata.generated.schema.entity.services.databaseService import DatabaseService
from metadata.generated.schema.entity.services.messagingService import MessagingService
from metadata.generated.schema.entity.services.pipelineService import PipelineService
from metadata.generated.schema.entity.services.storageService import StorageService
from metadata.generated.schema.type.entityLineage import EntitiesEdge
from metadata.generated.schema.type.entityReference import EntityReference
from metadata.ingestion.ometa.ometa_api import OpenMetadata
logger = logging.getLogger(__name__)
@@ -183,6 +189,52 @@ def datetime_to_ts(date: datetime) -> int:
return int(date.timestamp())
def create_lineage(from_table, to_table, query_info, metadata):
try:
from_fqdn = f"{query_info.get('service_name')}.{from_table}"
from_entity: Table = metadata.get_by_name(entity=Table, fqdn=from_fqdn)
to_fqdn = f"{query_info.get('service_name')}.{to_table}"
to_entity: Table = metadata.get_by_name(entity=Table, fqdn=to_fqdn)
if not from_entity or not to_entity:
return None
lineage = AddLineageRequest(
edge=EntitiesEdge(
fromEntity=EntityReference(
id=from_entity.id.__root__,
type=query_info["from_type"],
),
toEntity=EntityReference(
id=to_entity.id.__root__,
type=query_info["to_type"],
),
)
)
created_lineage = metadata.add_lineage(lineage)
logger.info(f"Successfully added Lineage {created_lineage}")
except Exception as err:
logger.debug(traceback.format_exc())
logger.error(err)
def ingest_lineage(query_info, metadata_config):
from sqllineage.runner import LineageRunner
result = LineageRunner(query_info["sql"])
metadata = OpenMetadata(metadata_config)
for intermediate_table in result.intermediate_tables:
for source_table in result.source_tables:
create_lineage(source_table, intermediate_table, query_info, metadata)
for target_table in result.target_tables:
create_lineage(intermediate_table, target_table, query_info, metadata)
if not result.intermediate_tables:
for target_table in result.target_tables:
for source_table in result.source_tables:
create_lineage(source_table, target_table, query_info, metadata)
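
# --- Hedged sketch, not part of this change ---------------------------------
# The loops in ingest_lineage above pair tables in a fixed order. This
# standalone helper mirrors that walk with plain strings and made-up names,
# so the fan-out is easy to see without calling sqllineage.
def _pair_tables(source_tables, intermediate_tables, target_tables):
    """Yield (from_table, to_table) pairs the way ingest_lineage walks them."""
    for intermediate in intermediate_tables:
        for source in source_tables:
            yield source, intermediate
        for target in target_tables:
            yield intermediate, target
    if not intermediate_tables:
        for target in target_tables:
            for source in source_tables:
                yield source, target

# A query staged through a temporary table yields two hops:
print(list(_pair_tables(["sales.orders"], ["sales.tmp_stage"], ["sales.daily_totals"])))
# [('sales.orders', 'sales.tmp_stage'), ('sales.tmp_stage', 'sales.daily_totals')]
# Each pair becomes a create_lineage call, which resolves both sides by the
# fully qualified name "<service_name>.<table>" before posting the edge.
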
def get_raw_extract_iter(alchemy_helper) -> Iterable[Dict[str, Any]]:
"""
Provides iterator of result row from SQLAlchemy helper