Fix #13336: Clean Mark All Deleted Table Flag (#13344)

This commit is contained in:
Mayur Singal 2023-10-06 16:04:54 +05:30 committed by GitHub
parent 3b7f023bdc
commit c0ababd8ad
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 21 additions and 78 deletions

View File

@ -187,3 +187,8 @@ CREATE TABLE IF NOT EXISTS doc_store (
);
-- Remove Mark All Deleted Field
-- Strips the `sourceConfig.config.markAllDeletedTables` key from the stored JSON
-- of every ingestion pipeline row whose `pipelineType` is 'metadata'.
-- JSON_REMOVE leaves a document unchanged when the path does not exist, so rows
-- that never had the flag are rewritten with the same content.
UPDATE ingestion_pipeline_entity
SET json = JSON_REMOVE(json, '$.sourceConfig.config.markAllDeletedTables')
WHERE JSON_EXTRACT(json, '$.pipelineType') = 'metadata';

View File

@ -196,3 +196,7 @@ CREATE TABLE IF NOT EXISTS doc_store (
UNIQUE (fqnHash)
);
CREATE INDEX page_name_index ON doc_store USING btree (name);
-- Remove Mark All Deleted Field
-- Strips the `sourceConfig.config.markAllDeletedTables` key from the stored JSON
-- of every ingestion pipeline row whose `pipelineType` is 'metadata'.
-- `#-` deletes the element at the given path from the jsonb value;
-- `#>>` extracts the element at the given path as text for the comparison.
UPDATE ingestion_pipeline_entity
SET json = json::jsonb #- '{sourceConfig,config,markAllDeletedTables}'
WHERE json #>> '{pipelineType}' = 'metadata';

View File

@ -56,7 +56,6 @@ from metadata.ingestion.api.delete import delete_entity_from_source
from metadata.ingestion.api.models import Either
from metadata.ingestion.api.steps import Source
from metadata.ingestion.api.topology_runner import TopologyRunnerMixin
from metadata.ingestion.models.delete_entity import DeleteEntity
from metadata.ingestion.models.life_cycle import OMetaLifeCycleData
from metadata.ingestion.models.ometa_classification import OMetaTagAndClassification
from metadata.ingestion.models.topology import (
@ -429,39 +428,6 @@ class DatabaseServiceSource(
self.database_source_state.add(table_fqn)
def fetch_all_schema_and_delete_tables(self) -> Iterable[Either[DeleteEntity]]:
"""
Fetch all schemas of the current database and delete tables.

Builds the FQN of the database held in ``self.context``, lists every
``DatabaseSchema`` registered under it, and delegates to
``delete_entity_from_source`` for the ``Table`` entities of each schema
and for the ``DatabaseSchema`` entities of the database. Everything
produced by those calls is yielded back to the caller.
"""
# FQN of the database currently being processed, scoped to this service.
database_fqn = fqn.build(
self.metadata,
entity_type=Database,
service_name=self.config.serviceName,
database_name=self.context.database.name.__root__,
)
# All schemas listed by the metadata client for that database.
schema_list = self.metadata.list_all_entities(
entity=DatabaseSchema, params={"database": database_fqn}
)
for schema in schema_list:
# Tables are compared against the FQNs collected in database_source_state.
yield from delete_entity_from_source(
metadata=self.metadata,
entity_type=Table,
entity_source_state=self.database_source_state,
mark_deleted_entity=self.source_config.markDeletedTables,
params={"databaseSchema": schema.fullyQualifiedName.__root__},
)
# Delete the schema
# The source state here is the list of schema FQNs that pass the filters.
# NOTE(review): indentation is lost in this view, so it is unclear whether this
# call runs once per schema or once after the loop - confirm against the file.
yield from delete_entity_from_source(
metadata=self.metadata,
entity_type=DatabaseSchema,
entity_source_state=list(
self._get_filtered_schema_names(return_fqn=True, add_to_status=False)
),
mark_deleted_entity=self.source_config.markDeletedTables,
params={"database": database_fqn},
)
def _get_filtered_schema_names(
self, return_fqn: bool = False, add_to_status: bool = True
) -> Iterable[str]:
@ -490,25 +456,19 @@ class DatabaseServiceSource(
logger.info(
f"Mark Deleted Tables set to True. Processing database [{self.context.database.name.__root__}]"
)
# If markAllDeletedTables is True, all tables Which are not in FilterPattern will be deleted
if self.source_config.markAllDeletedTables:
yield from self.fetch_all_schema_and_delete_tables()
schema_fqn_list = self._get_filtered_schema_names(
return_fqn=True, add_to_status=False
)
# If markAllDeletedTables is False (Default), Only delete tables which are deleted from the datasource
else:
schema_fqn_list = self._get_filtered_schema_names(
return_fqn=True, add_to_status=False
for schema_fqn in schema_fqn_list:
yield from delete_entity_from_source(
metadata=self.metadata,
entity_type=Table,
entity_source_state=self.database_source_state,
mark_deleted_entity=self.source_config.markDeletedTables,
params={"database": schema_fqn},
)
for schema_fqn in schema_fqn_list:
yield from delete_entity_from_source(
metadata=self.metadata,
entity_type=Table,
entity_source_state=self.database_source_state,
mark_deleted_entity=self.source_config.markDeletedTables,
params={"database": schema_fqn},
)
def yield_life_cycle_data(self, _) -> Iterable[Either[OMetaLifeCycleData]]:
"""
Get the life cycle data of the table

View File

@ -252,7 +252,6 @@ caption="Configure Metadata Ingestion Page" /%}
- **Include tags (toggle)**: Set the Include tags toggle to control whether or not to include tags as part of metadata ingestion.
- **Enable Debug Log (toggle)**: Set the Enable Debug Log toggle to set the default log level to debug, these logs can be viewed later in Airflow.
- **Mark Deleted Tables (toggle)**: This is an optional configuration for enabling soft deletion of tables. When this option is enabled, only tables that have been deleted from the source will be soft deleted, and this will apply solely to the schema that is currently being ingested via the pipeline. Any related entities such as test suites or lineage information that were associated with those tables will also be deleted.
- **Mark All Deleted Tables (toggle)**: This is an optional configuration for enabling soft deletion of tables. When this option is enabled, only tables that have been deleted from the source will be soft deleted, and this will apply to all the schemas available in the data source. Any related entities such as test suites or lineage information that were associated with those tables will also be deleted. Do not enable this option when you have multiple metadata ingestion pipelines. Also make sure to enable the markDeletedTables option for this to work.
- **Auto Tag PII (toggle)**: Auto PII tagging checks the column names to mark columns with the PII Sensitive/NonSensitive tag.
{% /extraContent %}

View File

@ -118,8 +118,7 @@ Lets start with an example of fetching metadata from a database service, i.e.
- Include Views - to generate lineage
- Include Tags
- Enable Debug Log: We recommend enabling the debug log.
- Mark Deleted Tables, or
- Mark All Deleted Tables.
- Mark Deleted Tables
- **View Definition Parsing Timeout Limit:** The default is set to 300.
{% image

View File

@ -11,7 +11,6 @@ slug: /main-concepts/metadata-standard/schemas/metadataingestion/databaseservice
- **`type`**: Pipeline type. Refer to *#/definitions/databaseMetadataConfigType*. Default: `DatabaseMetadata`.
- **`markDeletedTables`** *(boolean)*: This is an optional configuration for enabling soft deletion of tables. When this option is enabled, only tables that have been deleted from the source will be soft deleted, and this will apply solely to the schema that is currently being ingested via the pipeline. Any related entities such as test suites or lineage information that were associated with those tables will also be deleted. Default: `True`.
- **`markAllDeletedTables`** *(boolean)*: This is an optional configuration for enabling soft deletion of tables. When this option is enabled, only tables that have been deleted from the source will be soft deleted, and this will apply to all the schemas available in the data source. Any related entities such as test suites or lineage information that were associated with those tables will also be deleted. Do not enable this option when you have multiple metadata ingestion pipelines. Also make sure to enable the markDeletedTables option for this to work. Default: `False`.
- **`includeTables`** *(boolean)*: Optional configuration to turn off fetching metadata for tables. Default: `True`.
- **`includeViews`** *(boolean)*: Optional configuration to turn off fetching metadata for views. Default: `True`.
- **`includeTags`** *(boolean)*: Optional configuration to toggle the tags ingestion. Default: `True`.

View File

@ -25,12 +25,6 @@
"default": true,
"title": "Mark Deleted Tables"
},
"markAllDeletedTables": {
"description": "This is an optional configuration for enabling soft deletion of tables. When this option is enabled, only tables that have been deleted from the source will be soft deleted, and this will apply to all the schemas available in the data source. Any related entities such as test suites or lineage information that were associated with those tables will also be deleted. Do not enable this option when you have multiple metadata ingestion pipelines. Also make sure to enable the markDeletedTables option for this to work.",
"type": "boolean",
"default": false,
"title": "Mark All Deleted Tables"
},
"includeTables": {
"description": "Optional configuration to turn off fetching metadata for tables.",
"type": "boolean",

View File

@ -101,27 +101,10 @@ Here are some examples of scenarios where tables will **NOT** get soft deleted i
- If you already have `SchemaA` & `SchemaB` ingested in OpenMetadata, then later you apply a `Schema Filter Pattern` to exclude `SchemaB`, then no table from `SchemaB` will be deleted.
- If you already have `SchemaA` & `SchemaB` ingested in OpenMetadata and for this ingestion pipeline you have applied a `Schema Filter Pattern` to include only `SchemaA`, then any table deleted from `SchemaB` will not be deleted (since it is ignored in the ingestion).
$$
$$section
### Mark All Deleted Tables $(id="markAllDeletedTables")
This is an optional configuration that needs to be enabled on top of **Mark Deleted Tables**.
When this option is enabled, only tables that have been deleted from the source will be soft deleted, and this will apply to ALL the schemas available in the data source. Any related entities such as test suites or lineage information that were associated with those tables will also be deleted.
**It is recommended to be cautious while enabling this flag if you have multiple ingestion pipelines running for the same service, because it is possible that a pipeline with this flag enabled might delete the tables ingested by other pipelines.**
Here are some examples of scenarios where tables will get soft deleted if this flag is enabled.
- If no filters were applied, but a table was deleted from the data source, then the same table will be soft deleted from OpenMetadata as well.
- If you have applied a Schema Filter Pattern to include `SchemaA` then any table deleted from `SchemaA` will also be soft deleted from OpenMetadata.
- If `TableA` was already ingested in OpenMetadata, then later you apply a `Table Filter Pattern` to exclude `TableA` then `TableA` will get soft deleted from OpenMetadata.
- If you already have `SchemaA` & `SchemaB` ingested in OpenMetadata, then later you apply a `Schema Filter Pattern` to exclude `SchemaB`, ALL tables from `SchemaB` will be deleted due to this ingestion pipeline. This might be useful if you want to remove a full schema from OpenMetadata that you forgot to filter out the first time.
In such cases you may delete the table/schema manually from UI.
$$
$$section
### View Definition Parsing Timeout Limit $(id="viewParsingTimeoutLimit")