diff --git a/openmetadata-spec/src/main/resources/json/schema/entity/data/table.json b/openmetadata-spec/src/main/resources/json/schema/entity/data/table.json index f9a46c465fc..7df71c30353 100644 --- a/openmetadata-spec/src/main/resources/json/schema/entity/data/table.json +++ b/openmetadata-spec/src/main/resources/json/schema/entity/data/table.json @@ -724,16 +724,16 @@ "javaType": "org.openmetadata.schema.type.TableProfilerConfig", "description": "This schema defines the type for Table profile config.", "properties": { - "profileSample": { - "description": "Percentage of data or no. of rows we want to execute the profiler and tests on", - "type": "number", - "default": null - }, "profileSampleType": { "$ref": "#/definitions/profileSampleType" }, + "profileSample": { + "description": "Percentage of data or no. of rows used to compute the profiler metrics and run data quality tests", + "type": "number", + "default": null + }, "sampleDataCount": { - "description": "Number of row of sample data to be generated", + "description": "Number of sample rows to ingest when 'Generate Sample Data' is enabled", "type": "integer", "default": 50, "title": "Sample Data Rows Count" diff --git a/openmetadata-spec/src/main/resources/json/schema/metadataIngestion/databaseServiceProfilerPipeline.json b/openmetadata-spec/src/main/resources/json/schema/metadataIngestion/databaseServiceProfilerPipeline.json index d8e8a35eba2..2c759080e8b 100644 --- a/openmetadata-spec/src/main/resources/json/schema/metadataIngestion/databaseServiceProfilerPipeline.json +++ b/openmetadata-spec/src/main/resources/json/schema/metadataIngestion/databaseServiceProfilerPipeline.json @@ -39,30 +39,14 @@ "default": true, "title": "Include Views" }, - "processPiiSensitive": { - "description": "Optional configuration to automatically tag columns that might contain sensitive information", + "useFqnForFiltering": { + "description": "Regex will be applied on fully qualified name (e.g 
service_name.db_name.schema_name.table_name) instead of raw name (e.g. table_name)", "type": "boolean", "default": false, - "title": "Auto Tag PII" - }, - "confidence": { - "description": "Set the Confidence value for which you want the column to be marked", - "type": "number", - "default": 80, - "title": "Confidence" - }, - "profileSample": { - "description": "Percentage of data or no. of rows we want to execute the profiler and tests on", - "type": "number", - "default": null, - "title": "Profile Sample" - }, - "profileSampleType": { - "$ref": "../entity/data/table.json#/definitions/profileSampleType", - "title": "Profile Sample Type" + "title": "Use FQN For Filtering" }, "generateSampleData": { - "description": "Option to turn on/off generating sample data.", + "description": "Option to turn on/off generating sample data. If enabled, profiler will ingest sample data for each table.", "type": "boolean", "default": true, "title": "Generate Sample Data" @@ -73,8 +57,30 @@ "default": true, "title": "Compute Metrics" }, + "processPiiSensitive": { + "description": "Optional configuration to automatically tag columns that might contain sensitive information", + "type": "boolean", + "default": false, + "title": "Auto Tag PII" + }, + "confidence": { + "description": "Set the Confidence value for which you want the column to be tagged as PII. Confidence value ranges from 0 to 100. A higher number will yield fewer false positives but more false negatives. A lower number will yield more false positives but fewer false negatives.", + "type": "number", + "default": 80, + "title": "PII Inference Confidence Level" + }, + "profileSampleType": { + "$ref": "../entity/data/table.json#/definitions/profileSampleType", + "title": "Profile Sample Type" + }, + "profileSample": { + "description": "Percentage of data or no. 
of rows used to compute the profiler metrics and run data quality tests", + "type": "number", + "default": null, + "title": "Profile Sample" + }, "sampleDataCount": { - "description": "Number of row of sample data to be generated", + "description": "Number of sample rows to ingest when 'Generate Sample Data' is enabled", "type": "integer", "default": 50, "title": "Sample Data Rows Count" @@ -90,12 +96,6 @@ "type": "integer", "default": 43200, "title": "Timeout (in sec.)" - }, - "useFqnForFiltering": { - "description": "Regex will be applied on fully qualified name (e.g service_name.db_name.schema_name.table_name) instead of raw name (e.g. table_name)", - "type": "boolean", - "default": false, - "title": "Use FQN For Filtering" } }, "additionalProperties": false diff --git a/openmetadata-ui/src/main/resources/ui/public/locales/en-US/Database/workflows/profiler.md b/openmetadata-ui/src/main/resources/ui/public/locales/en-US/Database/workflows/profiler.md index 065637d5222..9c1ecef65bf 100644 --- a/openmetadata-ui/src/main/resources/ui/public/locales/en-US/Database/workflows/profiler.md +++ b/openmetadata-ui/src/main/resources/ui/public/locales/en-US/Database/workflows/profiler.md @@ -55,12 +55,46 @@ Checkout [this](https://docs.open-metadata.org/connectors/ingestion/workflows/me $$ $$section -### Profile Sample $(id="profileSample") -Percentage of data or number of rows to use when sampling tables. +### Enable Debug Logs $(id="enableDebugLog") -By default, the profiler will run against the entire table. +Set the `Enable Debug Log` toggle to set the logging level of the process to debug. You can check these logs in the Ingestion tab of the service and dig deeper into any errors you might find. $$ +$$section +### Include Views $(id="includeViews") +If activated the profiler will compute metric for view entity types. Note that it can have a negative impact on the profiler performance. 
+$$ + +$$section +### Use FQN For Filtering Views $(id="useFqnForFiltering") +Set this flag when you want to apply the filters on Fully Qualified Names (e.g `service_name.db_name.schema_name.table_name`) instead of applying them to the raw name of the asset (e.g `table_name`). + +This Flag is useful in scenarios when you have different schemas with same name in multiple databases, or tables with same name in different schemas, and you want to filter out only one of them. + +Checkout [this](https://docs.open-metadata.org/connectors/ingestion/workflows/metadata/filter-patterns/database#table-filter-pattern) document for further examples on how to use this field. +$$ + +$$section +### Ingest Sample Data $(id="generateSampleData") + +Set the Ingest Sample Data toggle to control whether to ingest sample data as part of profiler ingestion. If this is enabled, 50 rows will be ingested by default. You can update the number of rows in the "DatabaseServiceProfilerPipeline Advanced Config" section (i.e. `Sample Data Rows Count` setting). +$$ + +$$section +### Compute Metrics $(id="computeMetrics") + +Set the `Compute Metrics` toggle off to not perform any metric computation during the profiler ingestion workflow. Used in combination with `Ingest Sample Data` toggle on allows you to only ingest sample data. +$$ + +$$section +### Auto Tag PII $(id="processPiiSensitive") + +Set the `Auto Tag PII` toggle to control whether to automatically tag columns that might contain sensitive information as part of profiler ingestion. + +If `Ingest Sample Data` is enabled, OpenMetadata will leverage machine learning to infer which column may contain PII sensitive data. If disabled, OpenMetadata will infer this information from the column name. Use the `PII Inference Confidence Level` setting in the "DatabaseServiceProfilerPipeline Advanced Config" to set the confidence level when inferring the PII status of a column. 
+$$ + + $$section ### Profile Sample Type $(id="profileSampleType") The sample type can be set to either: @@ -69,6 +103,21 @@ The sample type can be set to either: * **Row Count**: this will use a number of rows to sample the table (e.g. if table has 100 rows, and we set row count to 10, the profiler will use 10 random rows to compute the metrics). $$ +$$section +### Profile Sample $(id="profileSample") +Percentage of data or number of rows to use when sampling tables to compute the profiler metrics. By default (i.e. if left blank), the profiler will run against the entire table. +$$ + +$$section +### PII Inference Confidence Level $(id="confidence") +Confidence level to use when inferring whether a column should be flagged as PII or not (between 0 and 100). A number closer to 100 will yield fewer false positives but potentially more false negatives. +$$ + +$$section +### Sample Data Rows Count $(id="sampleDataCount") +Set the number of rows to ingest when `Ingest Sample Data` toggle is on. Defaults to 50. +$$ + $$section ### Thread Count $(id="threadCount") Number of threads that will be used when computing the profiler metrics. A high number can have negative performance effect. @@ -84,26 +133,6 @@ This will set the duration a profiling job against a table should wait before in It is important to note that the profiler will wait for the hanging query to **terminate** before killing the execution. If there is a risk for your profiling job to hang, it is important to also set a query/connection timeout on your database engine. The default value for the profiler timeout is 12 hours. $$ -$$section -### Ingest Sample Data $(id="generateSampleData") - -Set the Ingest Sample Data toggle to control whether to ingest sample data as part of profiler ingestion. If this is enabled, 100 rows will be ingested by default. -$$ - -$$section -### Enable Debug Logs $(id="enableDebugLog") - -Set the `Enable Debug Log` toggle to set the logging level of the process to debug. 
You can check these logs in the Ingestion tab of the service and dig deeper into any errors you might find. -$$ - -$$section -### Auto Tag PII $(id="processPiiSensitive") - -Set the `Auto Tag PII` toggle to control whether to automatically tag columns that might contain sensitive information as part of profiler ingestion. - -If `Ingest Sample Data` is enabled, OpenMetadata will leverage machine learning to infer which column may contain PII sensitive data. If disabled, OpenMetadata will infer this information from the column name. -$$ - $$section ### Number of Retries $(id="retries") diff --git a/openmetadata-ui/src/main/resources/ui/src/constants/Services.constant.ts b/openmetadata-ui/src/main/resources/ui/src/constants/Services.constant.ts index 0cd8c7464f1..4bf9dfdd672 100644 --- a/openmetadata-ui/src/main/resources/ui/src/constants/Services.constant.ts +++ b/openmetadata-ui/src/main/resources/ui/src/constants/Services.constant.ts @@ -425,6 +425,10 @@ export const ADVANCED_PROPERTIES = [ 'connectionOptions', 'scheme', 'sampleDataStorageConfig', + 'confidence', + 'sampleDataCount', + 'threadCount', + 'timeoutSeconds', ]; export const PIPELINE_SERVICE_PLATFORM = 'Airflow';