mirror of
https://github.com/open-metadata/OpenMetadata.git
synced 2025-08-15 12:37:18 +00:00
Bigquery lineage mapping (#2849)
* lineage-boilerplate-code-added * bigquery-lineage-completed * lineage-code-optimised * code-cleaned * code-smell-fixed * code-refined * added-database-name-in-fqdn * code-smell-removed * exception-handled * removed-print-statement * formatted-setup.py * added-sqllineage-version * included-valid-type-none * debug-added-in-logger * lineage-logic-updated * updated-lineage-params * global-config-variables-removed * return-type-updated-in-entity * updated-conditionin-from-entity * imported-sqllineage-inside-method * code-smell-removed * code-formatted
This commit is contained in:
parent
1fb0e7c489
commit
2ef2edbf17
@ -70,7 +70,7 @@ plugins: Dict[str, Set[str]] = {
|
|||||||
"pyarrow~=6.0.1",
|
"pyarrow~=6.0.1",
|
||||||
"google-cloud-datacatalog==3.6.2",
|
"google-cloud-datacatalog==3.6.2",
|
||||||
},
|
},
|
||||||
"bigquery-usage": {"google-cloud-logging", "cachetools"},
|
"bigquery-usage": {"google-cloud-logging", "cachetools", "sqllineage==1.3.3"},
|
||||||
# "docker": {"docker==5.0.3"},
|
# "docker": {"docker==5.0.3"},
|
||||||
"docker": {"python_on_whales==0.34.0"},
|
"docker": {"python_on_whales==0.34.0"},
|
||||||
"backup": {"boto3~=1.19.12"},
|
"backup": {"boto3~=1.19.12"},
|
||||||
|
@ -14,6 +14,7 @@ import collections
|
|||||||
# This import verifies that the dependencies are available.
|
# This import verifies that the dependencies are available.
|
||||||
import logging as log
|
import logging as log
|
||||||
import os
|
import os
|
||||||
|
import traceback
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from typing import Iterable
|
from typing import Iterable
|
||||||
|
|
||||||
@ -24,10 +25,11 @@ from metadata.generated.schema.entity.services.databaseService import (
|
|||||||
)
|
)
|
||||||
from metadata.ingestion.api.source import Source, SourceStatus
|
from metadata.ingestion.api.source import Source, SourceStatus
|
||||||
from metadata.ingestion.models.table_queries import TableQuery
|
from metadata.ingestion.models.table_queries import TableQuery
|
||||||
|
from metadata.ingestion.ometa.ometa_api import OpenMetadata
|
||||||
from metadata.ingestion.ometa.openmetadata_rest import MetadataServerConfig
|
from metadata.ingestion.ometa.openmetadata_rest import MetadataServerConfig
|
||||||
from metadata.ingestion.source.bigquery import BigQueryConfig, BigquerySource
|
from metadata.ingestion.source.bigquery import BigQueryConfig, BigquerySource
|
||||||
from metadata.ingestion.source.sql_alchemy_helper import SQLSourceStatus
|
from metadata.ingestion.source.sql_alchemy_helper import SQLSourceStatus
|
||||||
from metadata.utils.helpers import get_start_and_end
|
from metadata.utils.helpers import get_start_and_end, ingest_lineage
|
||||||
|
|
||||||
logger = log.getLogger(__name__)
|
logger = log.getLogger(__name__)
|
||||||
|
|
||||||
@ -39,7 +41,7 @@ class BigqueryUsageSource(Source[TableQuery]):
|
|||||||
def __init__(self, config, metadata_config, ctx):
|
def __init__(self, config, metadata_config, ctx):
|
||||||
super().__init__(ctx)
|
super().__init__(ctx)
|
||||||
self.temp_credentials = None
|
self.temp_credentials = None
|
||||||
|
self.metadata_config = metadata_config
|
||||||
self.config = config
|
self.config = config
|
||||||
self.project_id = self.config.project_id
|
self.project_id = self.config.project_id
|
||||||
self.logger_name = "cloudaudit.googleapis.com%2Fdata_access"
|
self.logger_name = "cloudaudit.googleapis.com%2Fdata_access"
|
||||||
@ -64,7 +66,6 @@ class BigqueryUsageSource(Source[TableQuery]):
|
|||||||
|
|
||||||
def get_connection_url(self):
|
def get_connection_url(self):
|
||||||
if self.project_id:
|
if self.project_id:
|
||||||
print(f"{self.scheme}://{self.project_id}")
|
|
||||||
return f"{self.scheme}://{self.project_id}"
|
return f"{self.scheme}://{self.project_id}"
|
||||||
return f"{self.scheme}://"
|
return f"{self.scheme}://"
|
||||||
|
|
||||||
@ -130,12 +131,19 @@ class BigqueryUsageSource(Source[TableQuery]):
|
|||||||
service_name=self.config.service_name,
|
service_name=self.config.service_name,
|
||||||
)
|
)
|
||||||
yield tq
|
yield tq
|
||||||
|
|
||||||
|
query_info = {
|
||||||
|
"sql": tq.sql,
|
||||||
|
"from_type": "table",
|
||||||
|
"to_type": "table",
|
||||||
|
"service_name": self.config.service_name,
|
||||||
|
}
|
||||||
|
|
||||||
|
ingest_lineage(query_info, self.metadata_config)
|
||||||
|
|
||||||
except Exception as err:
|
except Exception as err:
|
||||||
logger.error(repr(err))
|
logger.error(repr(err))
|
||||||
|
|
||||||
def close(self):
|
|
||||||
pass
|
|
||||||
|
|
||||||
def get_status(self) -> SourceStatus:
|
def get_status(self) -> SourceStatus:
|
||||||
return self.status
|
return self.status
|
||||||
|
|
||||||
|
@ -8,10 +8,13 @@
|
|||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
# See the License for the specific language governing permissions and
|
# See the License for the specific language governing permissions and
|
||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
|
|
||||||
import logging
|
import logging
|
||||||
|
import traceback
|
||||||
from datetime import datetime, timedelta
|
from datetime import datetime, timedelta
|
||||||
from typing import Any, Dict, Iterable, List
|
from typing import Any, Dict, Iterable, List
|
||||||
|
|
||||||
|
from metadata.generated.schema.api.lineage.addLineage import AddLineageRequest
|
||||||
from metadata.generated.schema.api.services.createDashboardService import (
|
from metadata.generated.schema.api.services.createDashboardService import (
|
||||||
CreateDashboardServiceRequest,
|
CreateDashboardServiceRequest,
|
||||||
)
|
)
|
||||||
@ -27,11 +30,14 @@ from metadata.generated.schema.api.services.createPipelineService import (
|
|||||||
from metadata.generated.schema.api.services.createStorageService import (
|
from metadata.generated.schema.api.services.createStorageService import (
|
||||||
CreateStorageServiceRequest,
|
CreateStorageServiceRequest,
|
||||||
)
|
)
|
||||||
|
from metadata.generated.schema.entity.data.table import Table
|
||||||
from metadata.generated.schema.entity.services.dashboardService import DashboardService
|
from metadata.generated.schema.entity.services.dashboardService import DashboardService
|
||||||
from metadata.generated.schema.entity.services.databaseService import DatabaseService
|
from metadata.generated.schema.entity.services.databaseService import DatabaseService
|
||||||
from metadata.generated.schema.entity.services.messagingService import MessagingService
|
from metadata.generated.schema.entity.services.messagingService import MessagingService
|
||||||
from metadata.generated.schema.entity.services.pipelineService import PipelineService
|
from metadata.generated.schema.entity.services.pipelineService import PipelineService
|
||||||
from metadata.generated.schema.entity.services.storageService import StorageService
|
from metadata.generated.schema.entity.services.storageService import StorageService
|
||||||
|
from metadata.generated.schema.type.entityLineage import EntitiesEdge
|
||||||
|
from metadata.generated.schema.type.entityReference import EntityReference
|
||||||
from metadata.ingestion.ometa.ometa_api import OpenMetadata
|
from metadata.ingestion.ometa.ometa_api import OpenMetadata
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
@ -183,6 +189,52 @@ def datetime_to_ts(date: datetime) -> int:
|
|||||||
return int(date.timestamp())
|
return int(date.timestamp())
|
||||||
|
|
||||||
|
|
||||||
|
def create_lineage(from_table, to_table, query_info, metadata):
|
||||||
|
try:
|
||||||
|
from_fqdn = f"{query_info.get('service_name')}.{from_table}"
|
||||||
|
from_entity: Table = metadata.get_by_name(entity=Table, fqdn=from_fqdn)
|
||||||
|
to_fqdn = f"{query_info.get('service_name')}.{to_table}"
|
||||||
|
to_entity: Table = metadata.get_by_name(entity=Table, fqdn=to_fqdn)
|
||||||
|
if not from_entity or not to_entity:
|
||||||
|
return None
|
||||||
|
|
||||||
|
lineage = AddLineageRequest(
|
||||||
|
edge=EntitiesEdge(
|
||||||
|
fromEntity=EntityReference(
|
||||||
|
id=from_entity.id.__root__,
|
||||||
|
type=query_info["from_type"],
|
||||||
|
),
|
||||||
|
toEntity=EntityReference(
|
||||||
|
id=to_entity.id.__root__,
|
||||||
|
type=query_info["to_type"],
|
||||||
|
),
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
created_lineage = metadata.add_lineage(lineage)
|
||||||
|
logger.info(f"Successfully added Lineage {created_lineage}")
|
||||||
|
|
||||||
|
except Exception as err:
|
||||||
|
logger.debug(traceback.print_exc())
|
||||||
|
logger.error(err)
|
||||||
|
|
||||||
|
|
||||||
|
def ingest_lineage(query_info, metadata_config):
|
||||||
|
from sqllineage.runner import LineageRunner
|
||||||
|
|
||||||
|
result = LineageRunner(query_info["sql"])
|
||||||
|
metadata = OpenMetadata(metadata_config)
|
||||||
|
for intermediate_table in result.intermediate_tables:
|
||||||
|
for source_table in result.source_tables:
|
||||||
|
create_lineage(source_table, intermediate_table, query_info, metadata)
|
||||||
|
for target_table in result.target_tables:
|
||||||
|
create_lineage(intermediate_table, target_table, query_info, metadata)
|
||||||
|
if not result.intermediate_tables:
|
||||||
|
for target_table in result.target_tables:
|
||||||
|
for source_table in result.source_tables:
|
||||||
|
create_lineage(source_table, target_table, query_info, metadata)
|
||||||
|
|
||||||
|
|
||||||
def get_raw_extract_iter(alchemy_helper) -> Iterable[Dict[str, Any]]:
|
def get_raw_extract_iter(alchemy_helper) -> Iterable[Dict[str, Any]]:
|
||||||
"""
|
"""
|
||||||
Provides iterator of result row from SQLAlchemy helper
|
Provides iterator of result row from SQLAlchemy helper
|
||||||
|
Loading…
x
Reference in New Issue
Block a user