Minor: Auto classification UI (#18785)

* Add the auto classification pipeline option in the service ingestion

* Localization changes for other languages

* Improve the logic for getSupportedPipelineTypes function and add unit tests for newly added logic

* Add playwright tests for the auto classification feature

* Improve the getSupportedPipelineTypes function logic to reduce the cognitive complexity

* update md docs

* Add classificationFilterPattern in the UI schema form order

* fix logs from backend for auto classification

* Changes to view the auto classification logs

* Fix the sonar errors

---------

Co-authored-by: Pere Miquel Brull <peremiquelbrull@gmail.com>
This commit is contained in:
Aniket Katkar 2024-11-27 14:51:59 +05:30 committed by GitHub
parent e8031bcc0e
commit c101c7cf30
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
28 changed files with 518 additions and 68 deletions

View File

@ -52,6 +52,8 @@ public interface PipelineServiceClientInterface {
"ingestion_task",
PipelineType.PROFILER.toString(),
"profiler_task",
PipelineType.AUTO_CLASSIFICATION.toString(),
"auto_classification_task",
PipelineType.LINEAGE.toString(),
"lineage_task",
PipelineType.DBT.toString(),

View File

@ -0,0 +1,118 @@
/*
* Copyright 2024 Collate.
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import test from '@playwright/test';
import { PLAYWRIGHT_INGESTION_TAG_OBJ } from '../../constant/config';
import MysqlIngestionClass from '../../support/entity/ingestion/MySqlIngestionClass';
import { addAndTriggerAutoClassificationPipeline } from '../../utils/autoClassification';
import { redirectToHomePage } from '../../utils/common';
import { settingClick, SettingOptionsType } from '../../utils/sidebar';
// Service under test: MySQL service restricted to the single table that
// contains known sensitive columns, so tag assertions are deterministic.
const mysqlService = new MysqlIngestionClass(['sensitive_customers']);

// use the admin user to login
test.use({
  storageState: 'playwright/.auth/admin.json',
  // Tracing/video differ between OSS and hosted CI to keep OSS runs light
  trace: process.env.PLAYWRIGHT_IS_OSS ? 'off' : 'on-first-retry',
  video: process.env.PLAYWRIGHT_IS_OSS ? 'on' : 'off',
});

test.describe.configure({
  // 11 minutes max for ingestion tests
  timeout: 11 * 60 * 1000,
});
// NOTE: the describe callback must be synchronous — Playwright does not
// support async `test.describe` callbacks (the previous `async` was a bug).
test.describe('Auto Classification', PLAYWRIGHT_INGESTION_TAG_OBJ, () => {
  test('should be able to auto classify data', async ({ page }) => {
    await redirectToHomePage(page);

    await settingClick(
      page,
      mysqlService.category as unknown as SettingOptionsType
    );

    // Create and ingest service data
    await mysqlService.createService(page);
    await addAndTriggerAutoClassificationPipeline(page, mysqlService);

    // Check if the classification is successful
    const getDatabases = page.waitForResponse(
      (response) =>
        response.url().includes('/api/v1/databases?service=') &&
        response.request().method() === 'GET' &&
        response.status() === 200
    );

    // Click on databases tab
    await page.click('.ant-tabs-nav-list [data-testid="databases"]');
    await getDatabases;

    // Drill down: database -> schema -> table
    await page
      .getByTestId('child-asset-name-link')
      .getByText('default')
      .click();

    await page.waitForSelector('[data-testid="cypress_integrations_test_db"]');

    // Click on the database schema name
    await page.getByTestId('cypress_integrations_test_db').click();

    await page.waitForSelector('[data-testid="sensitive_customers"]');

    // Click on the table name
    await page.getByTestId('sensitive_customers').click();

    // Verify the sensitive tags applied by the classification run
    await test
      .expect(
        page.locator(
          `[data-row-key*="user_name"] [data-testid="tag-PII.Sensitive"] `
        )
      )
      .toBeAttached();
    await test
      .expect(
        page.locator(`[data-row-key*="SSN"] [data-testid="tag-PII.Sensitive"] `)
      )
      .toBeAttached();
    await test
      .expect(
        page.locator(
          `[data-row-key*="DWH_X10"] [data-testid="tag-PII.Sensitive"] `
        )
      )
      .toBeAttached();

    // Verify the non sensitive tags
    await test
      .expect(
        page.locator(
          `[data-row-key*="address"] [data-testid="tag-PII.NonSensitive"] `
        )
      )
      .toBeAttached();

    // Delete the created service
    await settingClick(
      page,
      mysqlService.category as unknown as SettingOptionsType
    );

    await mysqlService.deleteService(page);
  });
});

View File

@ -33,17 +33,18 @@ import {
import ServiceBaseClass from './ServiceBaseClass';
class MysqlIngestionClass extends ServiceBaseClass {
name: string;
name = '';
tableFilter: string[];
profilerTable = 'alert_entity';
constructor() {
super(
Services.Database,
`pw-mysql-with-%-${uuid()}`,
'Mysql',
'bot_entity'
);
this.tableFilter = ['bot_entity', 'alert_entity', 'chart_entity'];
constructor(tableFilter?: string[]) {
const serviceName = `pw-mysql-with-%-${uuid()}`;
super(Services.Database, serviceName, 'Mysql', 'bot_entity');
this.name = serviceName;
this.tableFilter = tableFilter ?? [
'bot_entity',
'alert_entity',
'chart_entity',
];
}
async createService(page: Page) {

View File

@ -98,10 +98,10 @@ class ServiceBaseClass {
await testConnection(page);
}
await this.submitService(this.serviceName, page);
await this.submitService(page);
if (this.shouldAddIngestion) {
await this.addIngestionPipeline(this.serviceName, page);
await this.addIngestionPipeline(page);
}
}
@ -149,7 +149,7 @@ class ServiceBaseClass {
// Handle validate ingestion details in respective service here
}
async addIngestionPipeline(serviceName: string, page: Page) {
async addIngestionPipeline(page: Page) {
await page.click('[data-testid="add-ingestion-button"]');
// Add ingestion page
@ -191,7 +191,7 @@ class ServiceBaseClass {
await this.handleIngestionRetry('metadata', page);
}
async submitService(serviceName: string, page: Page) {
async submitService(page: Page) {
await page.click('[data-testid="submit-btn"]');
await page.waitForSelector('[data-testid="success-line"]', {
state: 'visible',

View File

@ -0,0 +1,88 @@
/*
* Copyright 2024 Collate.
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import { Page } from '@playwright/test';
import MysqlIngestionClass from '../support/entity/ingestion/MySqlIngestionClass';
import { getApiContext, toastNotification } from './common';
import { visitServiceDetailsPage } from './service';
/**
 * Adds an Auto Classification ingestion pipeline to the given MySQL service
 * via the UI, then triggers a run of it and waits for the run to settle.
 *
 * Flow: visit service page -> Ingestions tab -> add "autoClassification"
 * pipeline -> fill form -> schedule with None -> deploy -> look up the
 * deployed pipeline over the API -> trigger it -> wait for completion.
 *
 * @param page Playwright page already authenticated as an admin user.
 * @param mysqlService the service fixture the pipeline is attached to.
 */
export const addAndTriggerAutoClassificationPipeline = async (
  page: Page,
  mysqlService: MysqlIngestionClass
) => {
  const { apiContext } = await getApiContext(page);

  await visitServiceDetailsPage(
    page,
    {
      type: mysqlService.category,
      name: mysqlService.name,
      displayName: mysqlService.name,
    },
    true
  );

  // Add auto classification ingestion
  await page.click('[data-testid="ingestions"]');
  await page.click('[data-testid="add-new-ingestion-button"]');
  await page.waitForSelector('[data-menu-id*="autoClassification"]');
  await page.click('[data-menu-id*="autoClassification"]');

  // Fill the auto classification form details
  await page.waitForSelector('#root\\/tableFilterPattern\\/includes');
  await mysqlService.fillIngestionDetails(page);
  // Enable the classification toggle so PII tags are actually inferred
  await page.click('#root\\/enableAutoClassification');
  await page.click('[data-testid="submit-btn"]');

  // Make sure we create ingestion with None schedule to avoid conflict between Airflow and Argo behavior
  await mysqlService.scheduleIngestion(page);

  await page.click('[data-testid="view-service-button"]');

  // Header available once page loads
  await page.getByTestId('loader').waitFor({ state: 'detached' });
  await page.getByTestId('ingestions').click();
  await page
    .getByLabel('Ingestions')
    .getByTestId('loader')
    .waitFor({ state: 'detached' });

  // Look up the just-deployed pipeline by service + type to get its name
  const response = await apiContext
    .get(
      `/api/v1/services/ingestionPipelines?service=${encodeURIComponent(
        mysqlService.name
      )}&pipelineType=autoClassification&serviceType=databaseService&limit=1`
    )
    .then((res) => res.json());

  // need manual wait to settle down the deployed pipeline, before triggering the pipeline
  await page.waitForTimeout(3000);

  // Trigger the pipeline from the row's "more actions" menu
  await page.click(
    `[data-row-key*="${response.data[0].name}"] [data-testid="more-actions"]`
  );
  await page.getByTestId('run-button').click();

  await toastNotification(page, `Pipeline triggered successfully!`);

  // need manual wait to make sure we are awaiting on latest run results
  await page.waitForTimeout(2000);

  await mysqlService.handleIngestionRetry('autoClassification', page);
};

View File

@ -0,0 +1,126 @@
# Auto Classification
Auto Classification Pipeline Configuration.
The main goal of this pipeline is bringing in Sample Data from your sources, as well as using NLP models to
automatically classify your data based on PII (Personally Identifiable Information) and other sensitive information.
## Configuration
$$section
### Database Filter Pattern $(id="databaseFilterPattern")
Database filter patterns to control whether to include database as part of metadata ingestion.
**Include**: Explicitly include databases by adding a list of regular expressions to the `Include` field. OpenMetadata will include all databases with names matching one or more of the supplied regular expressions. All other databases will be excluded.
For example, to include only those databases whose name starts with the word `demo`, add the regex pattern in the include field as `^demo.*`.
**Exclude**: Explicitly exclude databases by adding a list of regular expressions to the `Exclude` field. OpenMetadata will exclude all databases with names matching one or more of the supplied regular expressions. All other databases will be included.
For example, to exclude all databases with the name containing the word `demo`, add the regex pattern in the exclude field as `.*demo.*`.
Checkout [this](https://docs.open-metadata.org/connectors/ingestion/workflows/metadata/filter-patterns/database#database-filter-pattern) document for further examples on database filter patterns.
$$
$$section
### Schema Filter Pattern $(id="schemaFilterPattern")
Schema filter patterns are used to control whether to include schemas as part of metadata ingestion.
**Include**: Explicitly include schemas by adding a list of regular expressions to the `Include` field. OpenMetadata will include all schemas with names matching one or more of the supplied regular expressions. All other schemas will be excluded.
For example, to include only those schemas whose name starts with the word `demo`, add the regex pattern in the include field as `^demo.*`.
**Exclude**: Explicitly exclude schemas by adding a list of regular expressions to the `Exclude` field. OpenMetadata will exclude all schemas with names matching one or more of the supplied regular expressions. All other schemas will be included.
For example, to exclude all schemas with the name containing the word `demo`, add regex pattern in the exclude field as `.*demo.*`.
Checkout [this](https://docs.open-metadata.org/connectors/ingestion/workflows/metadata/filter-patterns/database#database-filter-pattern) document for further examples on schema filter patterns.
$$
$$section
### Table Filter Pattern $(id="tableFilterPattern")
Table filter patterns are used to control whether to include tables as part of metadata ingestion.
**Include**: Explicitly include tables by adding a list of regular expressions to the `Include` field. OpenMetadata will include all tables with names matching one or more of the supplied regular expressions. All other tables will be excluded.
For example, to include only those tables whose name starts with the word `demo`, add the regex pattern in the include field as `^demo.*`.
**Exclude**: Explicitly exclude tables by adding a list of regular expressions to the `Exclude` field. OpenMetadata will exclude all tables with names matching one or more of the supplied regular expressions. All other tables will be included.
For example, to exclude all tables with the name containing the word `demo`, add the regex pattern in the exclude field as `.*demo.*`.
Checkout [this](https://docs.open-metadata.org/connectors/ingestion/workflows/metadata/filter-patterns/database#table-filter-pattern) document for further examples on table filter patterns.
$$
$$section
### Enable Debug Logs $(id="enableDebugLog")
Set the `Enable Debug Log` toggle to set the logging level of the process to debug. You can check these logs in the Ingestion tab of the service and dig deeper into any errors you might find.
$$
$$section
### Include Views $(id="includeViews")
If activated, the profiler will compute metrics for view entity types. Note that this can have a negative impact on the profiler performance.
$$
$$section
### Use FQN For Filtering Views $(id="useFqnForFiltering")
Set this flag when you want to apply the filters on Fully Qualified Names (e.g `service_name.db_name.schema_name.table_name`) instead of applying them to the raw name of the asset (e.g `table_name`).
This Flag is useful in scenarios when you have different schemas with same name in multiple databases, or tables with same name in different schemas, and you want to filter out only one of them.
Checkout [this](https://docs.open-metadata.org/connectors/ingestion/workflows/metadata/filter-patterns/database#table-filter-pattern) document for further examples on how to use this field.
$$
$$section
### Store Sample Data $(id="storeSampleData")
Set the Store Sample Data toggle to control whether to store sample data as part of Auto Classification workflow. If this is enabled, 100 rows will be ingested by default. You can update the number of rows in the "DatabaseServiceProfilerPipeline Advanced Config" section (i.e. `Sample Data Rows Count` setting).
If disabled, OpenMetadata will not store any sample data, but will still use it on-the-fly to compute the Auto Classification.
$$
$$section
### Enable Auto Classification $(id="enableAutoClassification")
Set the Enable Auto Classification toggle to control whether to automatically tag columns that might contain sensitive information.
Use the `Confidence` setting to set the confidence level when inferring the tags for a column.
$$
$$section
### PII Inference Confidence Level $(id="confidence")
Confidence level to use when inferring whether the classification should be applied to a column or not (between 0 and 100). A number closer to 100 will yield fewer false positives but potentially more false negatives.
$$
$$section
### Profile Sample Type $(id="profileSampleType")
The sample type can be set to either:
* **Percentage**: this will use a percentage to sample the table (e.g. if the table has 100 rows, and we set the sample percentage to 50%, the profiler will use 50 random rows to compute the metrics).
* **Row Count**: this will use a number of rows to sample the table (e.g. if table has 100 rows, and we set row count to 10, the profiler will use 10 random rows to compute the metrics).
$$
$$section
### Profile Sample $(id="profileSample")
Percentage of data or number of rows to use when sampling tables to compute the profiler metrics. By default (i.e. if left blank), the profiler will run against the entire table.
$$
$$section
### Sampling Method Type $(id="samplingMethodType")
**This parameter is effective for Snowflake only**
The sampling method type can be set to **BERNOULLI** or **SYSTEM**. You can find the difference between the two values in the Snowflake documentation. When you choose **BERNOULLI**, it will scan all rows in the table even if a small value is set for **Profile Sample**. However, it has fewer restrictions than **SYSTEM**.
If no option is chosen, the default is **BERNOULLI**.
$$
$$section
### Sample Data Rows Count $(id="sampleDataCount")
Set the number of rows to ingest when `Ingest Sample Data` toggle is on. Defaults to 50.
$$

View File

@ -74,27 +74,12 @@ This Flag is useful in scenarios when you have different schemas with same name
Checkout [this](https://docs.open-metadata.org/connectors/ingestion/workflows/metadata/filter-patterns/database#table-filter-pattern) document for further examples on how to use this field.
$$
$$section
### Ingest Sample Data $(id="generateSampleData")
Set the Ingest Sample Data toggle to control whether to ingest sample data as part of profiler ingestion. If this is enabled, 100 rows will be ingested by default. You can update the number of rows in the "DatabaseServiceProfilerPipeline Advanced Config" section (i.e. `Sample Data Rows Count` setting).
$$
$$section
### Compute Metrics $(id="computeMetrics")
Set the `Compute Metrics` toggle off to not perform any metric computation during the profiler ingestion workflow. Used in combination with `Ingest Sample Data` toggle on allows you to only ingest sample data.
$$
$$section
### Auto Tag PII $(id="processPiiSensitive")
Set the `Auto Tag PII` toggle to control whether to automatically tag columns that might contain sensitive information as part of profiler ingestion.
If `Ingest Sample Data` is enabled, OpenMetadata will leverage machine learning to infer which column may contain PII sensitive data. If disabled, OpenMetadata will infer this information from the column name. Use the `Confidence` setting in the "DatabaseServiceProfilerPipeline Advanced Config" to set the confience level when infering the PII status of a column.
$$
$$section
### Profile Sample Type $(id="profileSampleType")
The sample type can be set to either:
@ -115,12 +100,7 @@ $$section
The sampling method type can be set to **BERNOULLI** or **SYSTEM**. You can find the difference of two values in the document of the Snowflake. When you choice **BERNOULLI**, it will scan full rows in the table even though small value is set at the **Profile Sample**. However, it has less restlictions than **SYSTEM**.
If no option is choiced, the default is **BERNOULLI**.
$$
$$section
### PII Inference Confidence Level $(id="confidence")
Confidence level to use when infering whether a column shoul be flagged as PII or not (between 0 and 100). A number closer to 100 will yield less false positive but potentially more false negative.
If no option is chosen, the default is **BERNOULLI**.
$$
$$section

View File

@ -323,6 +323,7 @@ export const INGESTION_WORKFLOW_UI_SCHEMA = {
'databaseFilterPattern',
'schemaFilterPattern',
'tableFilterPattern',
'classificationFilterPattern',
'enableDebugLog',
'*',
],
@ -493,5 +494,6 @@ export const SERVICE_INGESTION_PIPELINE_TYPES = [
PipelineType.Usage,
PipelineType.Lineage,
PipelineType.Profiler,
PipelineType.AutoClassification,
PipelineType.Dbt,
];

View File

@ -115,6 +115,7 @@
"authentication-uri": "Authentifizierungs-URI",
"authority": "Behörde",
"authorize-app": "Authorize {{app}}",
"auto-classification": "Automatische Klassifizierung",
"auto-pii-confidence-score": "Auto PII-Vertrauensscore",
"auto-tag-pii-uppercase": "Auto PII-Tag",
"automatically-generate": "Automatisch generieren",

View File

@ -115,6 +115,7 @@
"authentication-uri": "Authentication URI",
"authority": "Authority",
"authorize-app": "Authorize {{app}}",
"auto-classification": "Auto Classification",
"auto-pii-confidence-score": "Auto PII Confidence Score",
"auto-tag-pii-uppercase": "Auto Tag PII",
"automatically-generate": "Automatically Generate",

View File

@ -115,6 +115,7 @@
"authentication-uri": "URI de autenticación",
"authority": "Autoridad",
"authorize-app": "Autorizar {{app}}",
"auto-classification": "Clasificación automática",
"auto-pii-confidence-score": "Nivel de Confianza de Auto PII",
"auto-tag-pii-uppercase": "Etiqueta de información personal identificable automática",
"automatically-generate": "Generar automáticamente",

View File

@ -115,6 +115,7 @@
"authentication-uri": "URI d'Authentification",
"authority": "Autorité",
"authorize-app": "Authorize {{app}}",
"auto-classification": "Classification Automatique",
"auto-pii-confidence-score": "Score de Confiance Auto PII",
"auto-tag-pii-uppercase": "Balise Auto PII",
"automatically-generate": "Générer Automatiquement",

View File

@ -115,6 +115,7 @@
"authentication-uri": "URI de autenticación",
"authority": "Autoridade",
"authorize-app": "Autorizar {{app}}",
"auto-classification": "Clasificación automática",
"auto-pii-confidence-score": "Puntuación de confianza automática de PII",
"auto-tag-pii-uppercase": "Etiquetado automático de PII",
"automatically-generate": "Xerar automaticamente",

View File

@ -115,6 +115,7 @@
"authentication-uri": "URI אימות",
"authority": "רשות",
"authorize-app": "אמת את {{app}}",
"auto-classification": "סיווג אוטומטי",
"auto-pii-confidence-score": "ציון ביטחון PII אוטומטי",
"auto-tag-pii-uppercase": "תיוג PII אוטומטי",
"automatically-generate": "צור באופן אוטומטי",

View File

@ -115,6 +115,7 @@
"authentication-uri": "認証URI",
"authority": "Authority",
"authorize-app": "Authorize {{app}}",
"auto-classification": "自動分類",
"auto-pii-confidence-score": "Auto PII Confidence Score",
"auto-tag-pii-uppercase": "自動PIIタグ",
"automatically-generate": "自動生成",

View File

@ -115,6 +115,7 @@
"authentication-uri": "Authenticatie-URI",
"authority": "Autoriteit",
"authorize-app": "Applicatie autoriseren {{app}}",
"auto-classification": "Automatische classificatie",
"auto-pii-confidence-score": "Automatische PII-vertrouwensscore",
"auto-tag-pii-uppercase": "Automatisch taggen van PII",
"automatically-generate": "Automatisch genereren",

View File

@ -115,6 +115,7 @@
"authentication-uri": "آدرس URI احراز هویت",
"authority": "مرجع",
"authorize-app": "مجوز دادن به {{app}}",
"auto-classification": "طبقه‌بندی خودکار",
"auto-pii-confidence-score": "امتیاز اعتماد PII خودکار",
"auto-tag-pii-uppercase": "برچسب PII خودکار",
"automatically-generate": "تولید خودکار",

View File

@ -115,6 +115,7 @@
"authentication-uri": "URI de Autenticação",
"authority": "Autoridade",
"authorize-app": "Autorizar {{app}}",
"auto-classification": "Classificação Automática",
"auto-pii-confidence-score": "Pontuação de Confiança Automática PII",
"auto-tag-pii-uppercase": "Auto Tag PII",
"automatically-generate": "Gerar Automaticamente",

View File

@ -115,6 +115,7 @@
"authentication-uri": "URI de Autenticação",
"authority": "Autoridade",
"authorize-app": "Autorizar {{app}}",
"auto-classification": "Classificação Automática",
"auto-pii-confidence-score": "Pontuação de Confiança Automática PII",
"auto-tag-pii-uppercase": "Etiqueta Automática PII",
"automatically-generate": "Gerar Automaticamente",

View File

@ -115,6 +115,7 @@
"authentication-uri": "URI аутентификации",
"authority": "Власть",
"authorize-app": "Authorize {{app}}",
"auto-classification": "Автоклассификация",
"auto-pii-confidence-score": "Оценка достоверности Auto PII",
"auto-tag-pii-uppercase": "Автотег PII",
"automatically-generate": "Автоматически генерировать",

View File

@ -115,6 +115,7 @@
"authentication-uri": "URI การรับรอง",
"authority": "อำนาจ",
"authorize-app": "อนุญาต {{app}}",
"auto-classification": "การจำแนกประเภทอัตโนมัติ",
"auto-pii-confidence-score": "คะแนนความมั่นใจ PII อัตโนมัติ",
"auto-tag-pii-uppercase": "แท็ก PII อัตโนมัติ",
"automatically-generate": "สร้างโดยอัตโนมัติ",

View File

@ -115,6 +115,7 @@
"authentication-uri": "鉴权 URI",
"authority": "授权",
"authorize-app": "授权{{app}}",
"auto-classification": "自动分类",
"auto-pii-confidence-score": "自动计算 PII 信任值",
"auto-tag-pii-uppercase": "自动标记 PII",
"automatically-generate": "自动生成",

View File

@ -21,6 +21,7 @@ export interface IngestionPipelineLogByIdInterface {
data_insight_task?: string;
dbt_task?: string;
elasticsearch_reindex_task?: string;
auto_classification_task?: string;
total?: string;
after?: string;
}

View File

@ -112,40 +112,45 @@ const LogsViewerPage = () => {
switch (pipelineType || ingestionDetails?.pipelineType) {
case PipelineType.Metadata:
setLogs(logs.concat(res.data?.ingestion_task || ''));
setLogs(logs.concat(res.data?.ingestion_task ?? ''));
break;
case PipelineType.Application:
setLogs(logs.concat(res.data?.application_task || ''));
setLogs(logs.concat(res.data?.application_task ?? ''));
break;
case PipelineType.Profiler:
setLogs(logs.concat(res.data?.profiler_task || ''));
setLogs(logs.concat(res.data?.profiler_task ?? ''));
break;
case PipelineType.Usage:
setLogs(logs.concat(res.data?.usage_task || ''));
setLogs(logs.concat(res.data?.usage_task ?? ''));
break;
case PipelineType.Lineage:
setLogs(logs.concat(res.data?.lineage_task || ''));
setLogs(logs.concat(res.data?.lineage_task ?? ''));
break;
case PipelineType.Dbt:
setLogs(logs.concat(res.data?.dbt_task || ''));
setLogs(logs.concat(res.data?.dbt_task ?? ''));
break;
case PipelineType.TestSuite:
setLogs(logs.concat(res.data?.test_suite_task || ''));
setLogs(logs.concat(res.data?.test_suite_task ?? ''));
break;
case PipelineType.DataInsight:
setLogs(logs.concat(res.data?.data_insight_task || ''));
setLogs(logs.concat(res.data?.data_insight_task ?? ''));
break;
case PipelineType.ElasticSearchReindex:
setLogs(logs.concat(res.data?.elasticsearch_reindex_task || ''));
setLogs(logs.concat(res.data?.elasticsearch_reindex_task ?? ''));
break;
case PipelineType.AutoClassification:
setLogs(logs.concat(res.data?.auto_classification_task ?? ''));
break;

View File

@ -0,0 +1,101 @@
/*
* Copyright 2024 Collate.
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import { PipelineType } from '../generated/api/services/ingestionPipelines/createIngestionPipeline';
import { DatabaseServiceType } from '../generated/entity/services/databaseService';
import { MetadataServiceType } from '../generated/entity/services/metadataService';
import { ServicesType } from '../interface/service.interface';
import { getSupportedPipelineTypes } from './IngestionUtils';
describe('getSupportedPipelineTypes', () => {
  // Fixed: test name previously read "should return only return metadata..."
  it('should return only metadata pipeline types if config is undefined', () => {
    // No connection at all -> function falls back to Metadata only
    const serviceDetails = {};

    const result = getSupportedPipelineTypes(serviceDetails as ServicesType);

    expect(result).toEqual([PipelineType.Metadata]);
  });

  it('should return supported pipeline types based on config', () => {
    const serviceDetails: ServicesType = {
      id: '',
      name: '',
      serviceType: DatabaseServiceType.Athena,
      connection: {
        config: {
          supportsMetadataExtraction: true,
          supportsUsageExtraction: true,
          supportsLineageExtraction: true,
          supportsProfiler: true,
          supportsDBTExtraction: true,
          supportsViewLineageExtraction: true,
        },
      },
    };

    const result = getSupportedPipelineTypes(serviceDetails);

    // supportsProfiler yields both Profiler and AutoClassification;
    // the two lineage flags are de-duplicated to a single Lineage entry
    expect(result).toEqual([
      PipelineType.Metadata,
      PipelineType.Usage,
      PipelineType.Lineage,
      PipelineType.Profiler,
      PipelineType.AutoClassification,
      PipelineType.Dbt,
    ]);
  });

  it('should return empty array if no pipeline types are supported', () => {
    // Config exists but every supports* flag is absent/falsy
    const serviceDetails = {
      id: '',
      name: '',
      serviceType: DatabaseServiceType.Athena,
      connection: {
        config: {},
      },
    };

    const result = getSupportedPipelineTypes(serviceDetails);

    expect(result).toEqual([]);
  });

  it('should include DataInsight if supportsDataInsightExtraction is true', () => {
    const serviceDetails: ServicesType = {
      id: '',
      name: '',
      serviceType: MetadataServiceType.Alation,
      connection: {
        config: {
          supportsDataInsightExtraction: true,
        },
      },
    };

    const result = getSupportedPipelineTypes(serviceDetails);

    expect(result).toContain(PipelineType.DataInsight);
  });

  it('should include ElasticSearchReindex if supportsElasticSearchReindexingExtraction is true', () => {
    const serviceDetails = {
      id: '',
      name: '',
      serviceType: MetadataServiceType.AlationSink,
      connection: {
        config: {
          supportsElasticSearchReindexingExtraction: true,
        },
      },
    };

    const result = getSupportedPipelineTypes(serviceDetails);

    expect(result).toContain(PipelineType.ElasticSearchReindex);
  });
});

View File

@ -14,7 +14,7 @@
import { Typography } from 'antd';
import { ExpandableConfig } from 'antd/lib/table/interface';
import { t } from 'i18next';
import { isEmpty, isUndefined, startCase } from 'lodash';
import { isEmpty, isUndefined, startCase, uniq } from 'lodash';
import { ServiceTypes } from 'Models';
import React from 'react';
import ErrorPlaceHolder from '../components/common/ErrorWithPlaceholder/ErrorPlaceHolder';
@ -44,7 +44,6 @@ import {
IngestionPipeline,
StepSummary,
} from '../generated/entity/services/ingestionPipelines/ingestionPipeline';
import { Connection as MetadataConnection } from '../generated/entity/services/metadataService';
import { SearchSourceAlias } from '../interface/search.interface';
import { DataObj, ServicesType } from '../interface/service.interface';
import { Transi18next } from './CommonUtils';
@ -141,32 +140,33 @@ export const getBreadCrumbsArray = (
};
export const getSupportedPipelineTypes = (serviceDetails: ServicesType) => {
let pipelineType = [];
const pipelineType: PipelineType[] = [];
const config = serviceDetails?.connection?.config as Connection;
if (config) {
config?.supportsMetadataExtraction &&
pipelineType.push(PipelineType.Metadata);
config?.supportsUsageExtraction && pipelineType.push(PipelineType.Usage);
(config?.supportsLineageExtraction ||
config?.supportsViewLineageExtraction) &&
pipelineType.push(PipelineType.Lineage);
config?.supportsProfiler && pipelineType.push(PipelineType.Profiler);
config?.supportsDBTExtraction && pipelineType.push(PipelineType.Dbt);
(config as MetadataConnection)?.supportsDataInsightExtraction &&
pipelineType.push(PipelineType.DataInsight);
(config as MetadataConnection)?.supportsElasticSearchReindexingExtraction &&
pipelineType.push(PipelineType.ElasticSearchReindex);
} else {
pipelineType = [
PipelineType.Metadata,
PipelineType.Usage,
PipelineType.Lineage,
PipelineType.Profiler,
PipelineType.Dbt,
];
if (isUndefined(config)) {
return [PipelineType.Metadata];
}
return pipelineType;
const pipelineMapping: { [key: string]: PipelineType[] } = {
supportsMetadataExtraction: [PipelineType.Metadata],
supportsUsageExtraction: [PipelineType.Usage],
supportsLineageExtraction: [PipelineType.Lineage],
supportsViewLineageExtraction: [PipelineType.Lineage],
supportsProfiler: [PipelineType.Profiler, PipelineType.AutoClassification],
supportsDBTExtraction: [PipelineType.Dbt],
supportsDataInsightExtraction: [PipelineType.DataInsight],
supportsElasticSearchReindexingExtraction: [
PipelineType.ElasticSearchReindex,
],
};
Object.keys(pipelineMapping).forEach((key) => {
if (config[key as keyof Connection]) {
pipelineType.push(...pipelineMapping[key]);
}
});
return uniq(pipelineType);
};
export const getIngestionTypes = (

View File

@ -104,10 +104,15 @@ describe('Ingestion Workflow tests', () => {
WorkflowType.Usage,
ServiceCategory.PIPELINE_SERVICES
);
const autoClassificationSchema = getSchemaByWorkflowType(
WorkflowType.AutoClassification,
ServiceCategory.DATABASE_SERVICES
);
expect(metadataSchema).toBeDefined();
expect(profilerSchema).toBeDefined();
expect(usageSchema).toBeDefined();
expect(autoClassificationSchema).toBeDefined();
});
it('should getSchemaByWorkflowType return a default object with for an unknown workflow type', () => {

View File

@ -19,6 +19,7 @@ import {
} from '../generated/api/services/ingestionPipelines/createIngestionPipeline';
import apiServiceMetadataPipeline from '../jsons/ingestionSchemas/apiServiceMetadataPipeline.json';
import dashboardMetadataPipeline from '../jsons/ingestionSchemas/dashboardServiceMetadataPipeline.json';
import databaseAutoClassificationPipeline from '../jsons/ingestionSchemas/databaseServiceAutoClassificationPipeline.json';
import databaseMetadataPipeline from '../jsons/ingestionSchemas/databaseServiceMetadataPipeline.json';
import databaseProfilerPipeline from '../jsons/ingestionSchemas/databaseServiceProfilerPipeline.json';
import databaseLineagePipeline from '../jsons/ingestionSchemas/databaseServiceQueryLineagePipeline.json';
@ -98,6 +99,12 @@ export const getSchemaByWorkflowType = (
...databaseProfilerPipeline,
};
break;
case WorkflowType.AutoClassification:
schema = {
...databaseAutoClassificationPipeline,
};
break;
case WorkflowType.Usage:
schema = {