Minor: Add extended glossary (#14918)

* Add Nested glossary to extended sample data

* Fix lint

* Add Stress test config

* Fix Lint
This commit is contained in:
Ayush Shah 2024-01-31 20:43:04 +05:30 committed by GitHub
parent a715bdef3e
commit 0d66c0e514
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 207 additions and 137 deletions

View File

@ -9,6 +9,8 @@ source:
connectionOptions: connectionOptions:
sampleDataFolder: "./examples/sample_data" sampleDataFolder: "./examples/sample_data"
extendedSampleDataFolder: "./examples/extended_sample_data" extendedSampleDataFolder: "./examples/extended_sample_data"
includeGlossary: True
includeLineageStressTesting: True
sourceConfig: {} sourceConfig: {}
sink: sink:
type: metadata-rest type: metadata-rest

View File

@ -24,6 +24,10 @@ from metadata.generated.schema.api.data.createDatabase import CreateDatabaseRequ
from metadata.generated.schema.api.data.createDatabaseSchema import ( from metadata.generated.schema.api.data.createDatabaseSchema import (
CreateDatabaseSchemaRequest, CreateDatabaseSchemaRequest,
) )
from metadata.generated.schema.api.data.createGlossary import CreateGlossaryRequest
from metadata.generated.schema.api.data.createGlossaryTerm import (
CreateGlossaryTermRequest,
)
from metadata.generated.schema.api.data.createTable import CreateTableRequest from metadata.generated.schema.api.data.createTable import CreateTableRequest
from metadata.generated.schema.api.lineage.addLineage import AddLineageRequest from metadata.generated.schema.api.lineage.addLineage import AddLineageRequest
from metadata.generated.schema.entity.data.dashboard import Dashboard from metadata.generated.schema.entity.data.dashboard import Dashboard
@ -91,10 +95,20 @@ class ExtendedSampleDataSource(Source): # pylint: disable=too-many-instance-att
self.store_table_fqn = set() self.store_table_fqn = set()
self.store_data_model_fqn = [] self.store_data_model_fqn = []
self.store_dashboard_fqn = [] self.store_dashboard_fqn = []
self.main_glossary = None
self.glossary_term_list = []
sample_data_folder = self.service_connection.connectionOptions.__root__.get( sample_data_folder = self.service_connection.connectionOptions.__root__.get(
"sampleDataFolder" "sampleDataFolder"
) )
self.include_glossary = self.service_connection.connectionOptions.__root__.get(
"includeGlossary"
)
self.include_lineage_stress_testing = (
self.service_connection.connectionOptions.__root__.get(
"includeLineageStressTesting"
)
)
extneded_sample_data_folder = ( extneded_sample_data_folder = (
self.service_connection.connectionOptions.__root__.get( self.service_connection.connectionOptions.__root__.get(
"extendedSampleDataFolder" "extendedSampleDataFolder"
@ -225,43 +239,71 @@ class ExtendedSampleDataSource(Source): # pylint: disable=too-many-instance-att
) )
) )
def generate_sample_data(self): # pylint: disable=too-many-locals def generate_sample_data(
self,
): # pylint: disable=too-many-locals,too-many-statements
""" """
Generate sample data for dashboard and database service, Generate sample data for dashboard and database service,
with lineage between them, having long names, special characters and description with lineage between them, having long names, special characters and description
""" """
if self.include_glossary:
db = self.create_database_request("extended_sample_data", self.generate_text()) yield from self.create_glossary()
yield Either(right=db) yield from self.create_glossary_term()
if self.include_lineage_stress_testing:
schema = self.create_database_schema_request( db = self.create_database_request(
"extended_sample_database_schema", self.generate_text(), db "extended_sample_data", self.generate_text()
)
yield Either(right=schema)
for table in self.extended_tables["tables"]:
text = self.generate_text()
table_request = self.create_table_request(
table["name"], text, schema, table
) )
yield Either(right=table_request) yield Either(right=db)
downstream_node_fqn_table = fqn.build(
self.metadata,
entity_type=Table,
service_name=self.database_service.name.__root__,
database_name=db.name.__root__,
schema_name=schema.name.__root__,
table_name=table_request.name.__root__,
)
to_table = self.metadata.get_by_name(
entity=Table, fqn=downstream_node_fqn_table
)
main_table = to_table
self.store_table_fqn.add(downstream_node_fqn_table)
schema = self.create_database_schema_request(
"extended_sample_database_schema", self.generate_text(), db
)
yield Either(right=schema)
for table in self.extended_tables["tables"]:
text = self.generate_text()
table_request = self.create_table_request(
table["name"], text, schema, table
)
yield Either(right=table_request)
downstream_node_fqn_table = fqn.build(
self.metadata,
entity_type=Table,
service_name=self.database_service.name.__root__,
database_name=db.name.__root__,
schema_name=schema.name.__root__,
table_name=table_request.name.__root__,
)
to_table = self.metadata.get_by_name(
entity=Table, fqn=downstream_node_fqn_table
)
main_table = to_table
self.store_table_fqn.add(downstream_node_fqn_table)
for _ in range(40):
# 40 Dynamic Lineage Depths
name = self.generate_name()
table_request = self.create_table_request(name, text, schema, table)
yield Either(right=table_request)
upstream_node_fqn_table = fqn.build(
self.metadata,
entity_type=Table,
service_name=self.database_service.name.__root__,
database_name=db.name.__root__,
schema_name=schema.name.__root__,
table_name=table_request.name.__root__,
)
from_table = self.metadata.get_by_name(
entity=Table, fqn=upstream_node_fqn_table
)
yield from self.create_depth_nodes(
from_table=from_table, to_table=to_table
)
to_table = from_table
for _ in range(40): for _ in range(40):
# 40 Dynamic Lineage Depths
name = self.generate_name() name = self.generate_name()
table_request = self.create_table_request(name, text, schema, table) table_request = self.create_table_request(
name, text, schema, self.extended_tables["tables"][-1]
)
yield Either(right=table_request) yield Either(right=table_request)
upstream_node_fqn_table = fqn.build( upstream_node_fqn_table = fqn.build(
self.metadata, self.metadata,
@ -274,128 +316,76 @@ class ExtendedSampleDataSource(Source): # pylint: disable=too-many-instance-att
from_table = self.metadata.get_by_name( from_table = self.metadata.get_by_name(
entity=Table, fqn=upstream_node_fqn_table entity=Table, fqn=upstream_node_fqn_table
) )
yield from self.create_depth_nodes(
from_table=from_table, to_table=main_table
)
for table in self.tables["tables"]:
text = self.generate_text()
table_request = self.create_table_request(
table["name"], text, schema, table
)
yield Either(right=table_request)
table_entity_fqn = fqn.build(
self.metadata,
entity_type=Table,
service_name=self.database_service.name.__root__,
database_name=db.name.__root__,
schema_name=schema.name.__root__,
table_name=table_request.name.__root__,
)
from_table = self.metadata.get_by_name(
entity=Table, fqn=table_entity_fqn
)
yield from self.create_depth_nodes( yield from self.create_depth_nodes(
from_table=from_table, to_table=to_table from_table=from_table, to_table=to_table
) )
to_table = from_table
for _ in range(40):
name = self.generate_name()
table_request = self.create_table_request(
name, text, schema, self.extended_tables["tables"][-1]
)
yield Either(right=table_request)
upstream_node_fqn_table = fqn.build(
self.metadata,
entity_type=Table,
service_name=self.database_service.name.__root__,
database_name=db.name.__root__,
schema_name=schema.name.__root__,
table_name=table_request.name.__root__,
)
from_table = self.metadata.get_by_name(
entity=Table, fqn=upstream_node_fqn_table
)
yield from self.create_depth_nodes(
from_table=from_table, to_table=main_table
)
for table in self.tables["tables"]:
text = self.generate_text()
table_request = self.create_table_request(
table["name"], text, schema, table
)
yield Either(right=table_request)
table_entity_fqn = fqn.build(
self.metadata,
entity_type=Table,
service_name=self.database_service.name.__root__,
database_name=db.name.__root__,
schema_name=schema.name.__root__,
table_name=table_request.name.__root__,
)
from_table = self.metadata.get_by_name(entity=Table, fqn=table_entity_fqn)
yield from self.create_depth_nodes(from_table=from_table, to_table=to_table)
self.dashboard_service_json["name"] = name
self.dashboard_service_json["description"] = text
for chart in self.charts["charts"]:
yield Either(
right=CreateChartRequest(
name=chart["name"],
displayName=chart["displayName"],
description=chart["description"],
chartType=get_standard_chart_type(chart["chartType"]),
sourceUrl=chart["sourceUrl"],
service=self.dashboard_service.fullyQualifiedName,
)
)
for data_model in self.data_models["datamodels"]:
name = self.generate_name()
text = self.generate_text()
data_model_request = self.create_dashboard_data_model_request(
name, text, data_model
)
yield Either(right=data_model_request)
data_model_entity_fqn = fqn.build(
self.metadata,
entity_type=DashboardDataModel,
service_name=self.dashboard_service.name.__root__,
data_model_name=data_model_request.name.__root__,
)
self.store_data_model_fqn.append(data_model_entity_fqn)
for table_fqn in self.store_table_fqn:
from_table = self.metadata.get_by_name(entity=Table, fqn=table_fqn)
for dashboard_datamodel_fqn in self.store_data_model_fqn:
to_datamodel = self.metadata.get_by_name(
entity=DashboardDataModel, fqn=dashboard_datamodel_fqn
)
self.dashboard_service_json["name"] = name
self.dashboard_service_json["description"] = text
for chart in self.charts["charts"]:
yield Either( yield Either(
right=AddLineageRequest( right=CreateChartRequest(
edge=EntitiesEdge( name=chart["name"],
fromEntity=EntityReference( displayName=chart["displayName"],
id=from_table.id.__root__, type="table" description=chart["description"],
), chartType=get_standard_chart_type(chart["chartType"]),
toEntity=EntityReference( sourceUrl=chart["sourceUrl"],
id=to_datamodel.id.__root__,
type="dashboardDataModel",
),
lineageDetails=LineageDetails(
source=LineageSource.DashboardLineage
),
)
)
)
for dashboard in self.dashboards["dashboards"]:
dashboard_request = CreateDashboardRequest(
name=dashboard["name"],
displayName=dashboard["displayName"],
description=dashboard["description"],
sourceUrl=dashboard["sourceUrl"],
charts=dashboard["charts"],
service=self.dashboard_service.fullyQualifiedName, service=self.dashboard_service.fullyQualifiedName,
) )
yield Either(right=dashboard_request) )
dashboard_fqn = fqn.build( for data_model in self.data_models["datamodels"]:
self.metadata, name = self.generate_name()
entity_type=Dashboard, text = self.generate_text()
service_name=self.dashboard_service.name.__root__, data_model_request = self.create_dashboard_data_model_request(
dashboard_name=dashboard_request.name.__root__, name, text, data_model
) )
to_dashboard = self.metadata.get_by_name( yield Either(right=data_model_request)
entity=Dashboard, fqn=dashboard_fqn data_model_entity_fqn = fqn.build(
self.metadata,
entity_type=DashboardDataModel,
service_name=self.dashboard_service.name.__root__,
data_model_name=data_model_request.name.__root__,
)
self.store_data_model_fqn.append(data_model_entity_fqn)
for table_fqn in self.store_table_fqn:
from_table = self.metadata.get_by_name(entity=Table, fqn=table_fqn)
for dashboard_datamodel_fqn in self.store_data_model_fqn:
to_datamodel = self.metadata.get_by_name(
entity=DashboardDataModel, fqn=dashboard_datamodel_fqn
) )
yield Either( yield Either(
right=AddLineageRequest( right=AddLineageRequest(
edge=EntitiesEdge( edge=EntitiesEdge(
fromEntity=EntityReference( fromEntity=EntityReference(
id=to_datamodel.id.__root__, id=from_table.id.__root__, type="table"
type="dashboardDataModel",
), ),
toEntity=EntityReference( toEntity=EntityReference(
id=to_dashboard.id.__root__, type="dashboard" id=to_datamodel.id.__root__,
type="dashboardDataModel",
), ),
lineageDetails=LineageDetails( lineageDetails=LineageDetails(
source=LineageSource.DashboardLineage source=LineageSource.DashboardLineage
@ -403,6 +393,84 @@ class ExtendedSampleDataSource(Source): # pylint: disable=too-many-instance-att
) )
) )
) )
for dashboard in self.dashboards["dashboards"]:
dashboard_request = CreateDashboardRequest(
name=dashboard["name"],
displayName=dashboard["displayName"],
description=dashboard["description"],
sourceUrl=dashboard["sourceUrl"],
charts=dashboard["charts"],
service=self.dashboard_service.fullyQualifiedName,
)
yield Either(right=dashboard_request)
dashboard_fqn = fqn.build(
self.metadata,
entity_type=Dashboard,
service_name=self.dashboard_service.name.__root__,
dashboard_name=dashboard_request.name.__root__,
)
to_dashboard = self.metadata.get_by_name(
entity=Dashboard, fqn=dashboard_fqn
)
yield Either(
right=AddLineageRequest(
edge=EntitiesEdge(
fromEntity=EntityReference(
id=to_datamodel.id.__root__,
type="dashboardDataModel",
),
toEntity=EntityReference(
id=to_dashboard.id.__root__, type="dashboard"
),
lineageDetails=LineageDetails(
source=LineageSource.DashboardLineage
),
)
)
)
def create_glossary(self):
self.main_glossary = CreateGlossaryRequest(
name="NestedGlossaryTest",
displayName="NestedGlossaryTest",
description="Description of test glossary",
)
yield Either(right=self.main_glossary)
def create_glossary_term(self):
"""
Create Glossary Terms
"""
for _ in range(20):
random_name = self.fake.first_name()
yield Either(
right=CreateGlossaryTermRequest(
glossary="NestedGlossaryTest",
name=random_name,
displayName=random_name,
description="Test glossary term ",
parent=self.glossary_term_list[-1]
if len(self.glossary_term_list) > 3
else None,
)
)
if len(self.glossary_term_list) > 3 and self.glossary_term_list[-1]:
self.glossary_term_list.append(
f"{self.glossary_term_list[-1]}.{random_name}"
)
else:
self.glossary_term_list.append(f"NestedGlossaryTest.{random_name}")
for _ in range(500):
random_name = self.fake.first_name()
yield Either(
right=CreateGlossaryTermRequest(
glossary="NestedGlossaryTest",
name=random_name,
displayName=random_name,
description="Test glossary term 1",
)
)
def generate_name(self): def generate_name(self):
return f"Sample-@!3_(%t3st@)%_^{self.fake.name()}" return f"Sample-@!3_(%t3st@)%_^{self.fake.name()}"