OpenMetadata/ingestion/tests/integration/pii/test_pii_processor.py
Pere Menal-Ferrer 1e86f9870f
FIX #1464 (#21520)
* Add PIICategoryTags and some utilities on top of them.

* Fix static-check

* Add test for fqn representation

* Add NEREntityGeneralTags.json from Collate

* Add test to check PIICategoryTags agree with the ones used by OM server

* Add LabelExtractor

* Fix style

* Add ignore superflous-parens for pylint

* Ass comment as per PR review

* Fix not-updated PII-IT

* Remove duplicated IT test for PII

---------

Co-authored-by: Pere Menal <pere.menal@getcollate.io>
Co-authored-by: Sriharsha Chintalapani <harshach@users.noreply.github.com>
2025-06-09 16:05:35 -07:00

340 lines
12 KiB
Python

# Copyright 2025 Collate
# Licensed under the Collate Community License, Version 1.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# https://github.com/open-metadata/OpenMetadata/blob/main/ingestion/LICENSE
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Test Processor Class
"""
import datetime
from unittest import TestCase
from metadata.generated.schema.api.data.createDatabase import CreateDatabaseRequest
from metadata.generated.schema.api.data.createDatabaseSchema import (
CreateDatabaseSchemaRequest,
)
from metadata.generated.schema.api.data.createTable import CreateTableRequest
from metadata.generated.schema.api.services.createDatabaseService import (
CreateDatabaseServiceRequest,
)
from metadata.generated.schema.entity.data.table import (
Column,
ColumnName,
DataType,
TableData,
)
from metadata.generated.schema.entity.services.connections.database.common.basicAuth import (
BasicAuth,
)
from metadata.generated.schema.entity.services.connections.database.mysqlConnection import (
MysqlConnection,
)
from metadata.generated.schema.entity.services.connections.metadata.openMetadataConnection import (
OpenMetadataConnection,
)
from metadata.generated.schema.entity.services.databaseService import (
DatabaseConnection,
DatabaseService,
DatabaseServiceType,
)
from metadata.generated.schema.metadataIngestion.databaseServiceAutoClassificationPipeline import (
DatabaseServiceAutoClassificationPipeline,
)
from metadata.generated.schema.metadataIngestion.workflow import (
OpenMetadataWorkflowConfig,
Source,
SourceConfig,
WorkflowConfig,
)
from metadata.generated.schema.security.client.openMetadataJWTClientConfig import (
OpenMetadataJWTClientConfig,
)
from metadata.generated.schema.type.tagLabel import TagFQN, TagLabel
from metadata.ingestion.models.table_metadata import ColumnTag
from metadata.ingestion.ometa.ometa_api import OpenMetadata
from metadata.pii.processor import PIIProcessor
from metadata.profiler.api.models import ProfilerResponse
from metadata.sampler.models import SampleData, SamplerResponse
table_data = TableData(
columns=[
ColumnName("customer_id"),
ColumnName("first_name"),
ColumnName("last_name"),
ColumnName("first_order"),
# Apply a random name to force the NER scanner execution here
ColumnName("random"),
ColumnName("number_of_orders"),
],
rows=[
[
30,
"Christina",
"W.",
datetime.date(2018, 3, 2),
"christina@hotmail.com",
2,
],
[73, "Alan", "B.", None, "joshua.alan@yahoo.com", None],
[71, "Gerald", "C.", datetime.date(2018, 1, 18), "geraldc@gmail.com", 3],
[35, "Sara", "T.", datetime.date(2018, 2, 21), "saratimithi@godesign.com", 2],
[22, "Sean", "H.", datetime.date(2018, 1, 26), "heroldsean@google.com", 3],
[50, "Billy", "L.", datetime.date(2018, 1, 5), "bliam@random.com", 2],
[
76,
"Barbara",
"W.",
datetime.date(2018, 3, 23),
"bmwastin@gmail.co.in",
1,
],
[5, "Katherine", "R.", None, None, None],
[31, "Jane", "G.", datetime.date(2018, 2, 17), "gg34jane@hammer.com", 1],
[45, "Scott", "B.", None, None, None],
[21, "Willie", "H.", datetime.date(2018, 3, 28), "12hwilliejose@gmail.com", 1],
[18, "Johnny", "K.", datetime.date(2018, 2, 27), "johnnykk@dexter.com", 1],
[6, "Sarah", "R.", datetime.date(2018, 2, 19), "rrsarah@britinia.com", 1],
[56, "Joshua", "K.", None, None, None],
[79, "Jack", "R.", datetime.date(2018, 2, 28), "jack.mm@people.co.in", 2],
[94, "Gregory", "H.", datetime.date(2018, 1, 4), "peter.gregory@japer.com", 2],
[83, "Virginia", "R.", None, None, None],
[17, "Kimberly", "R.", None, None, None],
[2, "Shawn", "M.", datetime.date(2018, 1, 11), "shawn344@gmail.com", 1],
[60, "Norma", "W.", None, None, None],
[87, "Phillip", "B.", None, None, None],
],
)
EXPECTED_COLUMN_TAGS = [
ColumnTag(
column_fqn="test-service-table-patch.test-db.test-schema.customers.first_name",
tag_label=TagLabel(
tagFQN=TagFQN("General.Person"),
source="Classification",
labelType="Automated",
state="Suggested",
),
),
ColumnTag(
column_fqn="test-service-table-patch.test-db.test-schema.customers.first_name",
tag_label=TagLabel(
tagFQN=TagFQN("PII.Sensitive"),
source="Classification",
labelType="Automated",
state="Suggested",
),
),
ColumnTag(
column_fqn="test-service-table-patch.test-db.test-schema.customers.first_order",
tag_label=TagLabel(
tagFQN=TagFQN("General.DateTime"),
source="Classification",
labelType="Automated",
state="Suggested",
),
),
ColumnTag(
column_fqn="test-service-table-patch.test-db.test-schema.customers.first_order",
tag_label=TagLabel(
tagFQN=TagFQN("PII.NonSensitive"),
source="Classification",
labelType="Automated",
state="Suggested",
),
),
ColumnTag(
column_fqn="test-service-table-patch.test-db.test-schema.customers.random",
tag_label=TagLabel(
tagFQN=TagFQN("General.Email"),
source="Classification",
labelType="Automated",
state="Suggested",
),
),
ColumnTag(
column_fqn="test-service-table-patch.test-db.test-schema.customers.random",
tag_label=TagLabel(
tagFQN=TagFQN("PII.Sensitive"),
source="Classification",
labelType="Automated",
state="Suggested",
),
),
]
class PiiProcessorTest(TestCase):
"""
Run this integration test with different type of column name
to attach PII Tags
"""
server_config = OpenMetadataConnection(
hostPort="http://localhost:8585/api",
authProvider="openmetadata",
securityConfig=OpenMetadataJWTClientConfig(
jwtToken="eyJraWQiOiJHYjM4OWEtOWY3Ni1nZGpzLWE5MmotMDI0MmJrOTQzNTYiLCJ0eXAiOiJKV1QiLCJh"
"bGciOiJSUzI1NiJ9.eyJzdWIiOiJhZG1pbiIsImlzQm90IjpmYWxzZSwiaXNzIjoib3Blbi1tZXRhZGF0YS5vc"
"mciLCJpYXQiOjE2NjM5Mzg0NjIsImVtYWlsIjoiYWRtaW5Ab3Blbm1ldGFkYXRhLm9yZyJ9.tS8um_5DKu7Hgz"
"GBzS1VTA5uUjKWOCU0B_j08WXBiEC0mr0zNREkqVfwFDD-d24HlNEbrqioLsBuFRiwIWKc1m_ZlVQbG7P36RUx"
"huv2vbSp80FKyNM-Tj93FDzq91jsyNmsQhyNv_fNr3TXfzzSPjHt8Go0FMMP66weoKMgW2PbXlhVKwEuXUHyakL"
"Lzewm9UMeQaEiRzhiTMU3UkLXcKbYEJJvfNFcLwSl9W8JCO_l0Yj3ud-qt_nQYEZwqW6u5nfdQllN133iikV4fM"
"5QZsMCnm8Rq1mvLR0y9bmJiD7fwM1tmJ791TUWqmKaTnP49U493VanKpUAfzIiOiIbhg"
),
)
workflow_config = OpenMetadataWorkflowConfig(
source=Source(
type="mysql",
serviceName="test",
sourceConfig=SourceConfig(
config=DatabaseServiceAutoClassificationPipeline(
confidence=85,
enableAutoClassification=True,
)
),
),
workflowConfig=WorkflowConfig(openMetadataServerConfig=server_config),
)
metadata = OpenMetadata(server_config)
pii_processor = PIIProcessor(
config=workflow_config, metadata=OpenMetadata(server_config)
)
@classmethod
def setUpClass(cls) -> None:
"""
Prepare ingredients
"""
service = CreateDatabaseServiceRequest(
name="test-service-table-patch",
serviceType=DatabaseServiceType.Mysql,
connection=DatabaseConnection(
config=MysqlConnection(
username="username",
authType=BasicAuth(
password="password",
),
hostPort="http://localhost:1234",
)
),
)
service_entity = cls.metadata.create_or_update(data=service)
create_db = CreateDatabaseRequest(
name="test-db",
service=service_entity.fullyQualifiedName,
)
create_db_entity = cls.metadata.create_or_update(data=create_db)
create_schema = CreateDatabaseSchemaRequest(
name="test-schema",
database=create_db_entity.fullyQualifiedName,
)
create_schema_entity = cls.metadata.create_or_update(data=create_schema)
created_table = CreateTableRequest(
name="customers",
columns=[
Column(name="customer_id", dataType=DataType.INT),
Column(name="first_name", dataType=DataType.VARCHAR, dataLength=20),
Column(name="last_name", dataType=DataType.VARCHAR, dataLength=20),
Column(name="first_order", dataType=DataType.DATE),
Column(name="customer_email", dataType=DataType.VARCHAR, dataLength=20),
Column(name="number_of_orders", dataType=DataType.BIGINT),
],
databaseSchema=create_schema_entity.fullyQualifiedName,
)
cls.table_entity = cls.metadata.create_or_update(data=created_table)
@classmethod
def tearDownClass(cls) -> None:
"""
Clean up
"""
service_id = str(
cls.metadata.get_by_name(
entity=DatabaseService, fqn="test-service-table-patch"
).id.root
)
cls.metadata.delete(
entity=DatabaseService,
entity_id=service_id,
recursive=True,
hard_delete=True,
)
@classmethod
def setUpClass(cls) -> None:
"""
Prepare ingredients
"""
service = CreateDatabaseServiceRequest(
name="test-service-table-patch",
serviceType=DatabaseServiceType.Mysql,
connection=DatabaseConnection(
config=MysqlConnection(
username="username",
authType=BasicAuth(
password="password",
),
hostPort="http://localhost:1234",
)
),
)
service_entity = cls.metadata.create_or_update(data=service)
create_db = CreateDatabaseRequest(
name="test-db",
service=service_entity.fullyQualifiedName,
)
create_db_entity = cls.metadata.create_or_update(data=create_db)
create_schema = CreateDatabaseSchemaRequest(
name="test-schema",
database=create_db_entity.fullyQualifiedName,
)
create_schema_entity = cls.metadata.create_or_update(data=create_schema)
created_table = CreateTableRequest(
name="customers",
columns=[
Column(name="customer_id", dataType=DataType.INT),
Column(name="first_name", dataType=DataType.VARCHAR, dataLength=20),
Column(name="last_name", dataType=DataType.VARCHAR, dataLength=20),
Column(name="first_order", dataType=DataType.DATE),
Column(name="random", dataType=DataType.VARCHAR, dataLength=20),
Column(name="number_of_orders", dataType=DataType.BIGINT),
],
databaseSchema=create_schema_entity.fullyQualifiedName,
)
cls.table_entity = cls.metadata.create_or_update(data=created_table)
def test_ner_scanner_process(self):
"""
test function for ner Scanner
"""
record = SamplerResponse(
table=self.table_entity,
sample_data=SampleData(data=table_data),
)
updated_record: ProfilerResponse = self.pii_processor.run(record)
for expected, updated in zip(EXPECTED_COLUMN_TAGS, updated_record.column_tags):
self.assertEqual(expected.column_fqn, updated.column_fqn)
self.assertEqual(expected.tag_label.tagFQN, updated.tag_label.tagFQN)