mirror of
https://github.com/open-metadata/OpenMetadata.git
synced 2025-09-01 05:03:10 +00:00

* Add PIICategoryTags and some utilities on top of them. * Fix static-check * Add test for fqn representation * Add NEREntityGeneralTags.json from Collate * Add test to check PIICategoryTags agree with the ones used by OM server * Add LabelExtractor * Fix style * Add ignore superfluous-parens for pylint * Add comment as per PR review * Fix not-updated PII-IT * Remove duplicated IT test for PII --------- Co-authored-by: Pere Menal <pere.menal@getcollate.io> Co-authored-by: Sriharsha Chintalapani <harshach@users.noreply.github.com>
340 lines
12 KiB
Python
340 lines
12 KiB
Python
# Copyright 2025 Collate
|
|
# Licensed under the Collate Community License, Version 1.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
# https://github.com/open-metadata/OpenMetadata/blob/main/ingestion/LICENSE
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
"""
|
|
Test Processor Class
|
|
"""
|
|
|
|
import datetime
|
|
from unittest import TestCase
|
|
|
|
from metadata.generated.schema.api.data.createDatabase import CreateDatabaseRequest
|
|
from metadata.generated.schema.api.data.createDatabaseSchema import (
|
|
CreateDatabaseSchemaRequest,
|
|
)
|
|
from metadata.generated.schema.api.data.createTable import CreateTableRequest
|
|
from metadata.generated.schema.api.services.createDatabaseService import (
|
|
CreateDatabaseServiceRequest,
|
|
)
|
|
from metadata.generated.schema.entity.data.table import (
|
|
Column,
|
|
ColumnName,
|
|
DataType,
|
|
TableData,
|
|
)
|
|
from metadata.generated.schema.entity.services.connections.database.common.basicAuth import (
|
|
BasicAuth,
|
|
)
|
|
from metadata.generated.schema.entity.services.connections.database.mysqlConnection import (
|
|
MysqlConnection,
|
|
)
|
|
from metadata.generated.schema.entity.services.connections.metadata.openMetadataConnection import (
|
|
OpenMetadataConnection,
|
|
)
|
|
from metadata.generated.schema.entity.services.databaseService import (
|
|
DatabaseConnection,
|
|
DatabaseService,
|
|
DatabaseServiceType,
|
|
)
|
|
from metadata.generated.schema.metadataIngestion.databaseServiceAutoClassificationPipeline import (
|
|
DatabaseServiceAutoClassificationPipeline,
|
|
)
|
|
from metadata.generated.schema.metadataIngestion.workflow import (
|
|
OpenMetadataWorkflowConfig,
|
|
Source,
|
|
SourceConfig,
|
|
WorkflowConfig,
|
|
)
|
|
from metadata.generated.schema.security.client.openMetadataJWTClientConfig import (
|
|
OpenMetadataJWTClientConfig,
|
|
)
|
|
from metadata.generated.schema.type.tagLabel import TagFQN, TagLabel
|
|
from metadata.ingestion.models.table_metadata import ColumnTag
|
|
from metadata.ingestion.ometa.ometa_api import OpenMetadata
|
|
from metadata.pii.processor import PIIProcessor
|
|
from metadata.profiler.api.models import ProfilerResponse
|
|
from metadata.sampler.models import SampleData, SamplerResponse
|
|
|
|
# Sample rows handed to the PII processor by the test below.
# Row layout follows the column order:
#   customer_id, first_name, last_name, first_order, random, number_of_orders
table_data = TableData(
    columns=[
        ColumnName(column_name)
        for column_name in (
            "customer_id",
            "first_name",
            "last_name",
            "first_order",
            # Apply a random name to force the NER scanner execution here
            "random",
            "number_of_orders",
        )
    ],
    rows=[
        [30, "Christina", "W.", datetime.date(2018, 3, 2), "christina@hotmail.com", 2],
        [73, "Alan", "B.", None, "joshua.alan@yahoo.com", None],
        [71, "Gerald", "C.", datetime.date(2018, 1, 18), "geraldc@gmail.com", 3],
        [35, "Sara", "T.", datetime.date(2018, 2, 21), "saratimithi@godesign.com", 2],
        [22, "Sean", "H.", datetime.date(2018, 1, 26), "heroldsean@google.com", 3],
        [50, "Billy", "L.", datetime.date(2018, 1, 5), "bliam@random.com", 2],
        [76, "Barbara", "W.", datetime.date(2018, 3, 23), "bmwastin@gmail.co.in", 1],
        [5, "Katherine", "R.", None, None, None],
        [31, "Jane", "G.", datetime.date(2018, 2, 17), "gg34jane@hammer.com", 1],
        [45, "Scott", "B.", None, None, None],
        [21, "Willie", "H.", datetime.date(2018, 3, 28), "12hwilliejose@gmail.com", 1],
        [18, "Johnny", "K.", datetime.date(2018, 2, 27), "johnnykk@dexter.com", 1],
        [6, "Sarah", "R.", datetime.date(2018, 2, 19), "rrsarah@britinia.com", 1],
        [56, "Joshua", "K.", None, None, None],
        [79, "Jack", "R.", datetime.date(2018, 2, 28), "jack.mm@people.co.in", 2],
        [94, "Gregory", "H.", datetime.date(2018, 1, 4), "peter.gregory@japer.com", 2],
        [83, "Virginia", "R.", None, None, None],
        [17, "Kimberly", "R.", None, None, None],
        [2, "Shawn", "M.", datetime.date(2018, 1, 11), "shawn344@gmail.com", 1],
        [60, "Norma", "W.", None, None, None],
        [87, "Phillip", "B.", None, None, None],
    ],
)
|
|
|
|
|
|
# Tags the test expects back from the PII processor, in order. Every entry
# shares the same table FQN prefix and the same label metadata
# (source / labelType / state); only the column and the tag FQN vary.
EXPECTED_COLUMN_TAGS = [
    ColumnTag(
        column_fqn=f"test-service-table-patch.test-db.test-schema.customers.{column}",
        tag_label=TagLabel(
            tagFQN=TagFQN(tag),
            source="Classification",
            labelType="Automated",
            state="Suggested",
        ),
    )
    for column, tag in (
        ("first_name", "General.Person"),
        ("first_name", "PII.Sensitive"),
        ("first_order", "General.DateTime"),
        ("first_order", "PII.NonSensitive"),
        ("random", "General.Email"),
        ("random", "PII.Sensitive"),
    )
]
|
|
|
|
|
|
class PiiProcessorTest(TestCase):
    """
    Integration test for the PII processor.

    ``setUpClass`` creates a service -> database -> schema -> table hierarchy
    through the OpenMetadata client configured below (``hostPort`` points at
    ``http://localhost:8585/api``). The test then runs ``PIIProcessor`` over
    ``table_data`` for that table and compares the returned column tags with
    ``EXPECTED_COLUMN_TAGS``. ``tearDownClass`` removes the service again.
    """

    # Connection settings shared by the metadata client and the processor.
    server_config = OpenMetadataConnection(
        hostPort="http://localhost:8585/api",
        authProvider="openmetadata",
        securityConfig=OpenMetadataJWTClientConfig(
            jwtToken="eyJraWQiOiJHYjM4OWEtOWY3Ni1nZGpzLWE5MmotMDI0MmJrOTQzNTYiLCJ0eXAiOiJKV1QiLCJh"
            "bGciOiJSUzI1NiJ9.eyJzdWIiOiJhZG1pbiIsImlzQm90IjpmYWxzZSwiaXNzIjoib3Blbi1tZXRhZGF0YS5vc"
            "mciLCJpYXQiOjE2NjM5Mzg0NjIsImVtYWlsIjoiYWRtaW5Ab3Blbm1ldGFkYXRhLm9yZyJ9.tS8um_5DKu7Hgz"
            "GBzS1VTA5uUjKWOCU0B_j08WXBiEC0mr0zNREkqVfwFDD-d24HlNEbrqioLsBuFRiwIWKc1m_ZlVQbG7P36RUx"
            "huv2vbSp80FKyNM-Tj93FDzq91jsyNmsQhyNv_fNr3TXfzzSPjHt8Go0FMMP66weoKMgW2PbXlhVKwEuXUHyakL"
            "Lzewm9UMeQaEiRzhiTMU3UkLXcKbYEJJvfNFcLwSl9W8JCO_l0Yj3ud-qt_nQYEZwqW6u5nfdQllN133iikV4fM"
            "5QZsMCnm8Rq1mvLR0y9bmJiD7fwM1tmJ791TUWqmKaTnP49U493VanKpUAfzIiOiIbhg"
        ),
    )

    # Auto-classification workflow configuration passed to the processor.
    workflow_config = OpenMetadataWorkflowConfig(
        source=Source(
            type="mysql",
            serviceName="test",
            sourceConfig=SourceConfig(
                config=DatabaseServiceAutoClassificationPipeline(
                    confidence=85,
                    enableAutoClassification=True,
                )
            ),
        ),
        workflowConfig=WorkflowConfig(openMetadataServerConfig=server_config),
    )

    # Client used by the fixtures to create and delete entities.
    metadata = OpenMetadata(server_config)
    # The processor gets its own client instance built from the same config.
    pii_processor = PIIProcessor(
        config=workflow_config, metadata=OpenMetadata(server_config)
    )

    @classmethod
    def setUpClass(cls) -> None:
        """
        Prepare ingredients

        Creates (or updates) the database service, database, schema and the
        ``customers`` table, and stores the resulting table entity on
        ``cls.table_entity`` for the test to use.
        """
        service = CreateDatabaseServiceRequest(
            name="test-service-table-patch",
            serviceType=DatabaseServiceType.Mysql,
            connection=DatabaseConnection(
                config=MysqlConnection(
                    username="username",
                    authType=BasicAuth(
                        password="password",
                    ),
                    hostPort="http://localhost:1234",
                )
            ),
        )
        service_entity = cls.metadata.create_or_update(data=service)

        create_db = CreateDatabaseRequest(
            name="test-db",
            service=service_entity.fullyQualifiedName,
        )

        create_db_entity = cls.metadata.create_or_update(data=create_db)

        create_schema = CreateDatabaseSchemaRequest(
            name="test-schema",
            database=create_db_entity.fullyQualifiedName,
        )

        create_schema_entity = cls.metadata.create_or_update(data=create_schema)

        created_table = CreateTableRequest(
            name="customers",
            columns=[
                Column(name="customer_id", dataType=DataType.INT),
                Column(name="first_name", dataType=DataType.VARCHAR, dataLength=20),
                Column(name="last_name", dataType=DataType.VARCHAR, dataLength=20),
                Column(name="first_order", dataType=DataType.DATE),
                # Column is named "random" so it lines up with the sample data
                # columns and with the column FQNs in EXPECTED_COLUMN_TAGS.
                Column(name="random", dataType=DataType.VARCHAR, dataLength=20),
                Column(name="number_of_orders", dataType=DataType.BIGINT),
            ],
            databaseSchema=create_schema_entity.fullyQualifiedName,
        )
        cls.table_entity = cls.metadata.create_or_update(data=created_table)

    @classmethod
    def tearDownClass(cls) -> None:
        """
        Clean up

        Looks up the service created in ``setUpClass`` by name and deletes it
        with ``recursive=True`` and ``hard_delete=True``.
        """
        service_id = str(
            cls.metadata.get_by_name(
                entity=DatabaseService, fqn="test-service-table-patch"
            ).id.root
        )

        cls.metadata.delete(
            entity=DatabaseService,
            entity_id=service_id,
            recursive=True,
            hard_delete=True,
        )

    def test_ner_scanner_process(self):
        """
        test function for ner Scanner

        Runs the PII processor on the sample data and checks, pairwise and in
        order, that each returned column tag has the expected column FQN and
        tag FQN.
        """

        record = SamplerResponse(
            table=self.table_entity,
            sample_data=SampleData(data=table_data),
        )

        updated_record: ProfilerResponse = self.pii_processor.run(record)
        # NOTE(review): zip() stops at the shorter sequence, so this loop does
        # not fail if the processor returns fewer tags than expected (or none).
        # Consider asserting on the number of returned tags as well.
        for expected, updated in zip(EXPECTED_COLUMN_TAGS, updated_record.column_tags):
            self.assertEqual(expected.column_fqn, updated.column_fqn)
            self.assertEqual(expected.tag_label.tagFQN, updated.tag_label.tagFQN)
|