Minor: Add extended glossary (#14918)

* Add Nested glossary to extended sample data

* Fix lint

* Add Stress test config

* Fix Lint
This commit is contained in:
Ayush Shah 2024-01-31 20:43:04 +05:30 committed by GitHub
parent a715bdef3e
commit 0d66c0e514
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 207 additions and 137 deletions

View File

@ -9,6 +9,8 @@ source:
connectionOptions:
sampleDataFolder: "./examples/sample_data"
extendedSampleDataFolder: "./examples/extended_sample_data"
includeGlossary: True
includeLineageStressTesting: True
sourceConfig: {}
sink:
type: metadata-rest

View File

@ -24,6 +24,10 @@ from metadata.generated.schema.api.data.createDatabase import CreateDatabaseRequ
from metadata.generated.schema.api.data.createDatabaseSchema import (
CreateDatabaseSchemaRequest,
)
from metadata.generated.schema.api.data.createGlossary import CreateGlossaryRequest
from metadata.generated.schema.api.data.createGlossaryTerm import (
CreateGlossaryTermRequest,
)
from metadata.generated.schema.api.data.createTable import CreateTableRequest
from metadata.generated.schema.api.lineage.addLineage import AddLineageRequest
from metadata.generated.schema.entity.data.dashboard import Dashboard
@ -91,10 +95,20 @@ class ExtendedSampleDataSource(Source): # pylint: disable=too-many-instance-att
self.store_table_fqn = set()
self.store_data_model_fqn = []
self.store_dashboard_fqn = []
self.main_glossary = None
self.glossary_term_list = []
sample_data_folder = self.service_connection.connectionOptions.__root__.get(
"sampleDataFolder"
)
self.include_glossary = self.service_connection.connectionOptions.__root__.get(
"includeGlossary"
)
self.include_lineage_stress_testing = (
self.service_connection.connectionOptions.__root__.get(
"includeLineageStressTesting"
)
)
extneded_sample_data_folder = (
self.service_connection.connectionOptions.__root__.get(
"extendedSampleDataFolder"
@ -225,43 +239,71 @@ class ExtendedSampleDataSource(Source): # pylint: disable=too-many-instance-att
)
)
def generate_sample_data(self): # pylint: disable=too-many-locals
def generate_sample_data(
self,
): # pylint: disable=too-many-locals,too-many-statements
"""
Generate sample data for dashboard and database service,
with lineage between them, having long names, special characters and description
"""
db = self.create_database_request("extended_sample_data", self.generate_text())
yield Either(right=db)
schema = self.create_database_schema_request(
"extended_sample_database_schema", self.generate_text(), db
)
yield Either(right=schema)
for table in self.extended_tables["tables"]:
text = self.generate_text()
table_request = self.create_table_request(
table["name"], text, schema, table
if self.include_glossary:
yield from self.create_glossary()
yield from self.create_glossary_term()
if self.include_lineage_stress_testing:
db = self.create_database_request(
"extended_sample_data", self.generate_text()
)
yield Either(right=table_request)
downstream_node_fqn_table = fqn.build(
self.metadata,
entity_type=Table,
service_name=self.database_service.name.__root__,
database_name=db.name.__root__,
schema_name=schema.name.__root__,
table_name=table_request.name.__root__,
)
to_table = self.metadata.get_by_name(
entity=Table, fqn=downstream_node_fqn_table
)
main_table = to_table
self.store_table_fqn.add(downstream_node_fqn_table)
yield Either(right=db)
schema = self.create_database_schema_request(
"extended_sample_database_schema", self.generate_text(), db
)
yield Either(right=schema)
for table in self.extended_tables["tables"]:
text = self.generate_text()
table_request = self.create_table_request(
table["name"], text, schema, table
)
yield Either(right=table_request)
downstream_node_fqn_table = fqn.build(
self.metadata,
entity_type=Table,
service_name=self.database_service.name.__root__,
database_name=db.name.__root__,
schema_name=schema.name.__root__,
table_name=table_request.name.__root__,
)
to_table = self.metadata.get_by_name(
entity=Table, fqn=downstream_node_fqn_table
)
main_table = to_table
self.store_table_fqn.add(downstream_node_fqn_table)
for _ in range(40):
# 40 Dynamic Lineage Depths
name = self.generate_name()
table_request = self.create_table_request(name, text, schema, table)
yield Either(right=table_request)
upstream_node_fqn_table = fqn.build(
self.metadata,
entity_type=Table,
service_name=self.database_service.name.__root__,
database_name=db.name.__root__,
schema_name=schema.name.__root__,
table_name=table_request.name.__root__,
)
from_table = self.metadata.get_by_name(
entity=Table, fqn=upstream_node_fqn_table
)
yield from self.create_depth_nodes(
from_table=from_table, to_table=to_table
)
to_table = from_table
for _ in range(40):
# 40 Dynamic Lineage Depths
name = self.generate_name()
table_request = self.create_table_request(name, text, schema, table)
table_request = self.create_table_request(
name, text, schema, self.extended_tables["tables"][-1]
)
yield Either(right=table_request)
upstream_node_fqn_table = fqn.build(
self.metadata,
@ -274,128 +316,76 @@ class ExtendedSampleDataSource(Source): # pylint: disable=too-many-instance-att
from_table = self.metadata.get_by_name(
entity=Table, fqn=upstream_node_fqn_table
)
yield from self.create_depth_nodes(
from_table=from_table, to_table=main_table
)
for table in self.tables["tables"]:
text = self.generate_text()
table_request = self.create_table_request(
table["name"], text, schema, table
)
yield Either(right=table_request)
table_entity_fqn = fqn.build(
self.metadata,
entity_type=Table,
service_name=self.database_service.name.__root__,
database_name=db.name.__root__,
schema_name=schema.name.__root__,
table_name=table_request.name.__root__,
)
from_table = self.metadata.get_by_name(
entity=Table, fqn=table_entity_fqn
)
yield from self.create_depth_nodes(
from_table=from_table, to_table=to_table
)
to_table = from_table
for _ in range(40):
name = self.generate_name()
table_request = self.create_table_request(
name, text, schema, self.extended_tables["tables"][-1]
)
yield Either(right=table_request)
upstream_node_fqn_table = fqn.build(
self.metadata,
entity_type=Table,
service_name=self.database_service.name.__root__,
database_name=db.name.__root__,
schema_name=schema.name.__root__,
table_name=table_request.name.__root__,
)
from_table = self.metadata.get_by_name(
entity=Table, fqn=upstream_node_fqn_table
)
yield from self.create_depth_nodes(
from_table=from_table, to_table=main_table
)
for table in self.tables["tables"]:
text = self.generate_text()
table_request = self.create_table_request(
table["name"], text, schema, table
)
yield Either(right=table_request)
table_entity_fqn = fqn.build(
self.metadata,
entity_type=Table,
service_name=self.database_service.name.__root__,
database_name=db.name.__root__,
schema_name=schema.name.__root__,
table_name=table_request.name.__root__,
)
from_table = self.metadata.get_by_name(entity=Table, fqn=table_entity_fqn)
yield from self.create_depth_nodes(from_table=from_table, to_table=to_table)
self.dashboard_service_json["name"] = name
self.dashboard_service_json["description"] = text
for chart in self.charts["charts"]:
yield Either(
right=CreateChartRequest(
name=chart["name"],
displayName=chart["displayName"],
description=chart["description"],
chartType=get_standard_chart_type(chart["chartType"]),
sourceUrl=chart["sourceUrl"],
service=self.dashboard_service.fullyQualifiedName,
)
)
for data_model in self.data_models["datamodels"]:
name = self.generate_name()
text = self.generate_text()
data_model_request = self.create_dashboard_data_model_request(
name, text, data_model
)
yield Either(right=data_model_request)
data_model_entity_fqn = fqn.build(
self.metadata,
entity_type=DashboardDataModel,
service_name=self.dashboard_service.name.__root__,
data_model_name=data_model_request.name.__root__,
)
self.store_data_model_fqn.append(data_model_entity_fqn)
for table_fqn in self.store_table_fqn:
from_table = self.metadata.get_by_name(entity=Table, fqn=table_fqn)
for dashboard_datamodel_fqn in self.store_data_model_fqn:
to_datamodel = self.metadata.get_by_name(
entity=DashboardDataModel, fqn=dashboard_datamodel_fqn
)
self.dashboard_service_json["name"] = name
self.dashboard_service_json["description"] = text
for chart in self.charts["charts"]:
yield Either(
right=AddLineageRequest(
edge=EntitiesEdge(
fromEntity=EntityReference(
id=from_table.id.__root__, type="table"
),
toEntity=EntityReference(
id=to_datamodel.id.__root__,
type="dashboardDataModel",
),
lineageDetails=LineageDetails(
source=LineageSource.DashboardLineage
),
)
)
)
for dashboard in self.dashboards["dashboards"]:
dashboard_request = CreateDashboardRequest(
name=dashboard["name"],
displayName=dashboard["displayName"],
description=dashboard["description"],
sourceUrl=dashboard["sourceUrl"],
charts=dashboard["charts"],
right=CreateChartRequest(
name=chart["name"],
displayName=chart["displayName"],
description=chart["description"],
chartType=get_standard_chart_type(chart["chartType"]),
sourceUrl=chart["sourceUrl"],
service=self.dashboard_service.fullyQualifiedName,
)
yield Either(right=dashboard_request)
dashboard_fqn = fqn.build(
self.metadata,
entity_type=Dashboard,
service_name=self.dashboard_service.name.__root__,
dashboard_name=dashboard_request.name.__root__,
)
to_dashboard = self.metadata.get_by_name(
entity=Dashboard, fqn=dashboard_fqn
)
for data_model in self.data_models["datamodels"]:
name = self.generate_name()
text = self.generate_text()
data_model_request = self.create_dashboard_data_model_request(
name, text, data_model
)
yield Either(right=data_model_request)
data_model_entity_fqn = fqn.build(
self.metadata,
entity_type=DashboardDataModel,
service_name=self.dashboard_service.name.__root__,
data_model_name=data_model_request.name.__root__,
)
self.store_data_model_fqn.append(data_model_entity_fqn)
for table_fqn in self.store_table_fqn:
from_table = self.metadata.get_by_name(entity=Table, fqn=table_fqn)
for dashboard_datamodel_fqn in self.store_data_model_fqn:
to_datamodel = self.metadata.get_by_name(
entity=DashboardDataModel, fqn=dashboard_datamodel_fqn
)
yield Either(
right=AddLineageRequest(
edge=EntitiesEdge(
fromEntity=EntityReference(
id=to_datamodel.id.__root__,
type="dashboardDataModel",
id=from_table.id.__root__, type="table"
),
toEntity=EntityReference(
id=to_dashboard.id.__root__, type="dashboard"
id=to_datamodel.id.__root__,
type="dashboardDataModel",
),
lineageDetails=LineageDetails(
source=LineageSource.DashboardLineage
@ -403,6 +393,84 @@ class ExtendedSampleDataSource(Source): # pylint: disable=too-many-instance-att
)
)
)
for dashboard in self.dashboards["dashboards"]:
dashboard_request = CreateDashboardRequest(
name=dashboard["name"],
displayName=dashboard["displayName"],
description=dashboard["description"],
sourceUrl=dashboard["sourceUrl"],
charts=dashboard["charts"],
service=self.dashboard_service.fullyQualifiedName,
)
yield Either(right=dashboard_request)
dashboard_fqn = fqn.build(
self.metadata,
entity_type=Dashboard,
service_name=self.dashboard_service.name.__root__,
dashboard_name=dashboard_request.name.__root__,
)
to_dashboard = self.metadata.get_by_name(
entity=Dashboard, fqn=dashboard_fqn
)
yield Either(
right=AddLineageRequest(
edge=EntitiesEdge(
fromEntity=EntityReference(
id=to_datamodel.id.__root__,
type="dashboardDataModel",
),
toEntity=EntityReference(
id=to_dashboard.id.__root__, type="dashboard"
),
lineageDetails=LineageDetails(
source=LineageSource.DashboardLineage
),
)
)
)
def create_glossary(self):
self.main_glossary = CreateGlossaryRequest(
name="NestedGlossaryTest",
displayName="NestedGlossaryTest",
description="Description of test glossary",
)
yield Either(right=self.main_glossary)
def create_glossary_term(self):
"""
Create Glossary Terms
"""
for _ in range(20):
random_name = self.fake.first_name()
yield Either(
right=CreateGlossaryTermRequest(
glossary="NestedGlossaryTest",
name=random_name,
displayName=random_name,
description="Test glossary term ",
parent=self.glossary_term_list[-1]
if len(self.glossary_term_list) > 3
else None,
)
)
if len(self.glossary_term_list) > 3 and self.glossary_term_list[-1]:
self.glossary_term_list.append(
f"{self.glossary_term_list[-1]}.{random_name}"
)
else:
self.glossary_term_list.append(f"NestedGlossaryTest.{random_name}")
for _ in range(500):
random_name = self.fake.first_name()
yield Either(
right=CreateGlossaryTermRequest(
glossary="NestedGlossaryTest",
name=random_name,
displayName=random_name,
description="Test glossary term 1",
)
)
def generate_name(self):
return f"Sample-@!3_(%t3st@)%_^{self.fake.name()}"