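"""Postgres connector integration tests: metadata, lineage, usage, and profiler
workflows run against a Postgres test container that exposes the dvdrental database."""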

import sys
import time
from os import path
import pytest
from _openmetadata_testutils.postgres.conftest import postgres_container
from metadata.generated.schema.api.services.createDatabaseService import (
    CreateDatabaseServiceRequest,
)
from metadata.generated.schema.entity.data.table import Table
from metadata.generated.schema.entity.services.connections.database.common.basicAuth import (
    BasicAuth,
)
from metadata.generated.schema.entity.services.connections.database.postgresConnection import (
    PostgresConnection,
)
from metadata.generated.schema.entity.services.databaseService import (
    DatabaseConnection,
    DatabaseService,
    DatabaseServiceType,
)
from metadata.generated.schema.metadataIngestion.databaseServiceProfilerPipeline import (
    DatabaseServiceProfilerPipeline,
)
from metadata.generated.schema.metadataIngestion.databaseServiceQueryLineagePipeline import (
    DatabaseServiceQueryLineagePipeline,
)
from metadata.generated.schema.metadataIngestion.databaseServiceQueryUsagePipeline import (
    DatabaseUsageConfigType,
)
from metadata.generated.schema.metadataIngestion.workflow import (
    LogLevels,
    OpenMetadataWorkflowConfig,
    Processor,
    Sink,
    Source,
    SourceConfig,
    WorkflowConfig,
)
from metadata.ingestion.lineage.sql_lineage import search_cache
from metadata.ingestion.models.custom_pydantic import CustomSecretStr
from metadata.ingestion.ometa.client import APIError
from metadata.ingestion.ometa.ometa_api import OpenMetadata
from metadata.workflow.metadata import MetadataWorkflow
from metadata.workflow.profiler import ProfilerWorkflow
from metadata.workflow.usage import UsageWorkflow

if not sys.version_info >= (3, 9):
    pytest.skip("requires python 3.9+", allow_module_level=True)
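

# Creates a Postgres DatabaseService pointing at the test container's dvdrental database
# and hard-deletes it (recursively) when the module's tests are done. The real password is
# re-attached to the returned entity so downstream workflows can connect, since the API
# response masks secrets.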
@pytest.fixture(scope="module")
def db_service(metadata, postgres_container):
    service = CreateDatabaseServiceRequest(
        name="docker_test_db",
        serviceType=DatabaseServiceType.Postgres,
        connection=DatabaseConnection(
            config=PostgresConnection(
                username=postgres_container.username,
                authType=BasicAuth(password=postgres_container.password),
                hostPort="localhost:"
                + postgres_container.get_exposed_port(postgres_container.port),
                database="dvdrental",
            )
        ),
    )
    service_entity = metadata.create_or_update(data=service)
    service_entity.connection.config.authType.password = CustomSecretStr(
        postgres_container.password
    )
    yield service_entity
    try:
        metadata.delete(
            DatabaseService, service_entity.id, recursive=True, hard_delete=True
        )
    except APIError as error:
        if error.status_code == 404:
            pass
        else:
            raise
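

# Runs a plain metadata ingestion workflow so the dvdrental tables exist in
# OpenMetadata before the lineage, usage, and profiler workflows run.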
@pytest.fixture(scope="module")
def ingest_metadata(db_service, metadata: OpenMetadata):
    search_cache.clear()
    workflow_config = OpenMetadataWorkflowConfig(
        source=Source(
            type=db_service.connection.config.type.value.lower(),
            serviceName=db_service.fullyQualifiedName.root,
            serviceConnection=db_service.connection,
            sourceConfig=SourceConfig(config={}),
        ),
        sink=Sink(
            type="metadata-rest",
            config={},
        ),
        workflowConfig=WorkflowConfig(openMetadataServerConfig=metadata.config),
    )
    metadata_ingestion = MetadataWorkflow.create(workflow_config)
    metadata_ingestion.execute()
    return
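

# Runs the postgres-lineage workflow against the ingested service, with debug logging.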
@pytest.fixture(scope="module")
def ingest_postgres_lineage(db_service, ingest_metadata, metadata: OpenMetadata):
    search_cache.clear()
    workflow_config = OpenMetadataWorkflowConfig(
        source=Source(
            type="postgres-lineage",
            serviceName=db_service.fullyQualifiedName.root,
            serviceConnection=db_service.connection,
            sourceConfig=SourceConfig(config=DatabaseServiceQueryLineagePipeline()),
        ),
        sink=Sink(
            type="metadata-rest",
            config={},
        ),
        workflowConfig=WorkflowConfig(
            loggerLevel=LogLevels.DEBUG, openMetadataServerConfig=metadata.config
        ),
    )
    metadata_ingestion = MetadataWorkflow.create(workflow_config)
    metadata_ingestion.execute()
    metadata_ingestion.raise_from_status()
    return
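

# Ingests lineage from a query log CSV that contains some queries referencing
# non-existent tables: those must surface as "Table entity not found" failures, while
# the valid queries must still create lineage edges between the dvdrental tables.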
def test_ingest_query_log(db_service, ingest_metadata, metadata: OpenMetadata):
    search_cache.clear()
    reindex_search(
        metadata
    )  # since query cache is stored in ES, we need to reindex to avoid having a stale cache
    workflow_config = {
        "source": {
            "type": "query-log-lineage",
            "serviceName": db_service.fullyQualifiedName.root,
            "sourceConfig": {
                "config": {
                    "type": "DatabaseLineage",
                    "queryLogFilePath": path.dirname(__file__) + "/bad_query_log.csv",
                }
            },
        },
        "sink": {"type": "metadata-rest", "config": {}},
        "workflowConfig": {
            "loggerLevel": "DEBUG",
            "openMetadataServerConfig": metadata.config.model_dump(),
        },
    }
    metadata_ingestion = MetadataWorkflow.create(workflow_config)
    metadata_ingestion.execute()
    assert len(metadata_ingestion.source.status.failures) == 2
    for failure in metadata_ingestion.source.status.failures:
        assert "Table entity not found" in failure.error
    customer_table: Table = metadata.get_by_name(
        Table,
        f"{db_service.fullyQualifiedName.root}.dvdrental.public.customer",
        nullable=False,
    )
    actor_table: Table = metadata.get_by_name(
        Table,
        f"{db_service.fullyQualifiedName.root}.dvdrental.public.actor",
        nullable=False,
    )
    staff_table: Table = metadata.get_by_name(
        Table,
        f"{db_service.fullyQualifiedName.root}.dvdrental.public.staff",
        nullable=False,
    )
    edge = metadata.get_lineage_edge(
        str(customer_table.id.root), str(actor_table.id.root)
    )
    assert edge is not None
    edge = metadata.get_lineage_edge(
        str(customer_table.id.root), str(staff_table.id.root)
    )
    assert edge is not None
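

# Runs the ORM profiler workflow against the ingested Postgres service.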
@pytest.fixture(scope="module")
def run_profiler_workflow(ingest_metadata, db_service, metadata):
    workflow_config = OpenMetadataWorkflowConfig(
        source=Source(
            type=db_service.connection.config.type.value.lower(),
            serviceName=db_service.fullyQualifiedName.root,
            serviceConnection=db_service.connection,
            sourceConfig=SourceConfig(config=DatabaseServiceProfilerPipeline()),
        ),
        processor=Processor(
            type="orm-profiler",
            config={},
        ),
        sink=Sink(
            type="metadata-rest",
            config={},
        ),
        workflowConfig=WorkflowConfig(
            loggerLevel=LogLevels.DEBUG, openMetadataServerConfig=metadata.config
        ),
    )
    metadata_ingestion = ProfilerWorkflow.create(workflow_config.model_dump())
    search_cache.clear()
    metadata_ingestion.execute()
    return
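

# Runs the postgres-usage workflow: query parsing, table-usage staging under /tmp,
# and the metadata-usage bulk sink.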
@pytest.fixture(scope="module")
def ingest_query_usage(ingest_metadata, db_service, metadata):
    search_cache.clear()
    workflow_config = {
        "source": {
            "type": "postgres-usage",
            "serviceName": db_service.fullyQualifiedName.root,
            "serviceConnection": db_service.connection.model_dump(),
            "sourceConfig": {
                "config": {"type": DatabaseUsageConfigType.DatabaseUsage.value}
            },
        },
        "processor": {"type": "query-parser", "config": {}},
        "stage": {
            "type": "table-usage",
            "config": {
                "filename": "/tmp/postgres_usage",
            },
        },
        "bulkSink": {
            "type": "metadata-usage",
            "config": {
                "filename": "/tmp/postgres_usage",
            },
        },
        "sink": {"type": "metadata-rest", "config": {}},
        "workflowConfig": {
            "loggerLevel": "DEBUG",
            "openMetadataServerConfig": metadata.config.model_dump(),
        },
    }
    workflow = UsageWorkflow.create(workflow_config)
    search_cache.clear()
    workflow.execute()
    workflow.raise_from_status()
    return
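

# Fully qualified name of the dvdrental database within the test service.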
@pytest.fixture(scope="module")
def db_fqn(db_service: DatabaseService):
    return ".".join(
        [
            db_service.fullyQualifiedName.root,
            db_service.connection.config.database,
        ]
    )
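

# After usage ingestion, the public.actor table should have queries attached to it.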
def test_query_usage(
    ingest_query_usage,
    db_service,
    metadata,
    db_fqn,
):
    table = metadata.get_by_name(Table, ".".join([db_fqn, "public", "actor"]))
    queries = metadata.get_entity_queries(table.id)
    # TODO: this should return 2 queries, but in CI it sometimes returns only 1
    assert 1 <= len(queries) <= 2
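

# Passes as long as the profiler workflow fixture executes without raising.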
def test_profiler(run_profiler_workflow):
    pass
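

# Passes as long as the lineage workflow fixture completes successfully.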
def test_db_lineage(ingest_postgres_lineage):
    pass
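

# Plain helper (not a fixture) so tests can re-run the usage workflow on demand.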
def run_usage_workflow(db_service, metadata):
    workflow_config = {
        "source": {
            "type": "postgres-usage",
            "serviceName": db_service.fullyQualifiedName.root,
            "serviceConnection": db_service.connection.model_dump(),
            "sourceConfig": {
                "config": {"type": DatabaseUsageConfigType.DatabaseUsage.value}
            },
        },
        "processor": {"type": "query-parser", "config": {}},
        "stage": {
            "type": "table-usage",
            "config": {
                "filename": "/tmp/postgres_usage",
            },
        },
        "bulkSink": {
            "type": "metadata-usage",
            "config": {
                "filename": "/tmp/postgres_usage",
            },
        },
        "sink": {"type": "metadata-rest", "config": {}},
        "workflowConfig": {
            "loggerLevel": "DEBUG",
            "openMetadataServerConfig": metadata.config.model_dump(),
        },
    }
    workflow = UsageWorkflow.create(workflow_config)
    search_cache.clear()
    workflow.execute()
    workflow.raise_from_status()
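

# Exercises usage ingestion around a hard delete: run usage twice, hard-delete the
# service, re-ingest its metadata, and run usage again. Marked xfail; see the issue
# referenced in the decorator.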
@pytest.mark.xfail(
    reason="'metadata.ingestion.lineage.sql_lineage.search_cache' gets corrupted with invalid data."
    " See issue https://github.com/open-metadata/OpenMetadata/issues/16408"
)
def test_usage_delete_usage(db_service, ingest_postgres_lineage, metadata):
    workflow_config = {
        "source": {
            "type": "postgres-usage",
            "serviceName": db_service.fullyQualifiedName.root,
            "serviceConnection": db_service.connection.model_dump(),
            "sourceConfig": {
                "config": {"type": DatabaseUsageConfigType.DatabaseUsage.value}
            },
        },
        "processor": {"type": "query-parser", "config": {}},
        "stage": {
            "type": "table-usage",
            "config": {
                "filename": "/tmp/postgres_usage",
            },
        },
        "bulkSink": {
            "type": "metadata-usage",
            "config": {
                "filename": "/tmp/postgres_usage",
            },
        },
        "sink": {"type": "metadata-rest", "config": {}},
        "workflowConfig": {
            "loggerLevel": "DEBUG",
            "openMetadataServerConfig": metadata.config.model_dump(),
        },
    }
    workflow = UsageWorkflow.create(workflow_config)
    search_cache.clear()
    workflow.execute()
    workflow.raise_from_status()
    run_usage_workflow(db_service, metadata)
    metadata.delete(DatabaseService, db_service.id, hard_delete=True, recursive=True)
    workflow_config = OpenMetadataWorkflowConfig(
        source=Source(
            type=db_service.connection.config.type.value.lower(),
            serviceName=db_service.fullyQualifiedName.root,
            serviceConnection=db_service.connection,
            sourceConfig=SourceConfig(config={}),
        ),
        sink=Sink(
            type="metadata-rest",
            config={},
        ),
        workflowConfig=WorkflowConfig(openMetadataServerConfig=metadata.config),
    )
    metadata_ingestion = MetadataWorkflow.create(workflow_config)
    metadata_ingestion.execute()
    metadata_ingestion.raise_from_status()
    run_usage_workflow(db_service, metadata)
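

# Triggers a run of the SearchIndexingApplication and polls its status: wait for any
# in-flight run to finish, trigger a new one, and wait until it reports success or
# the timeout is hit.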
def reindex_search(metadata: OpenMetadata, timeout=60):
    start = time.time()
    status = None
    while status is None or status == "running":
        response = metadata.client.get(
            "/apps/name/SearchIndexingApplication/status?offset=0&limit=1"
        )
        status = response["data"][0]["status"]
        if time.time() - start > timeout:
            raise TimeoutError("Timed out waiting for reindexing to start")
        time.sleep(1)
    # app interactivity is not immediate (probably bc async operations), so we wait a bit
    time.sleep(0.5)
    metadata.client.post("/apps/trigger/SearchIndexingApplication")
    time.sleep(0.5)  # here too
    while status != "success":
        response = metadata.client.get(
            "/apps/name/SearchIndexingApplication/status?offset=0&limit=1"
        )
        status = response["data"][0]["status"]
        if time.time() - start > timeout:
            raise TimeoutError("Timed out waiting for reindexing to complete")
        time.sleep(1)