mirror of
https://github.com/open-metadata/OpenMetadata.git
synced 2025-10-18 12:18:35 +00:00
improvement in pii tagging (#10696)
* improvement in pii tagging * fix conflict and changes as per comment * Added confidence field * changes as per comments * Apply suggestions from code review Co-authored-by: Teddy <teddy.crepineau@gmail.com> --------- Co-authored-by: Ashish Gupta <ashish@getcollate.io> Co-authored-by: Pere Miquel Brull <peremiquelbrull@gmail.com> Co-authored-by: Teddy <teddy.crepineau@gmail.com>
This commit is contained in:
parent
4853dfc6c8
commit
46afe69811
@ -13,8 +13,9 @@
|
||||
Processor util to fetch pii sensitive columns
|
||||
"""
|
||||
import logging
|
||||
import re
|
||||
import traceback
|
||||
from enum import Enum
|
||||
from enum import Enum, auto
|
||||
from typing import Optional, Tuple
|
||||
|
||||
from metadata.generated.schema.entity.classification.tag import Tag
|
||||
@ -25,11 +26,73 @@ from metadata.utils import fqn
|
||||
PII = "PII"
|
||||
|
||||
|
||||
class PiiTypes(Enum):
    """PiiTypes enumerates the different types of PII data.

    Members are used as the keys of the ColumnNameScanner pattern tables.
    Values are assigned by ``auto()`` and therefore follow declaration
    order, so new members should be appended rather than inserted.
    """

    NONE = auto()
    UNSUPPORTED = auto()
    PHONE = auto()
    EMAIL = auto()
    CREDIT_CARD = auto()
    ADDRESS = auto()
    ADDRESS_LOCATION = auto()
    PERSON = auto()
    LOCATION = auto()
    BIRTH_DATE = auto()
    GENDER = auto()
    NATIONALITY = auto()
    IP_ADDRESS = auto()
    SSN = auto()
    USER_NAME = auto()
    PASSWORD = auto()
    ETHNICITY = auto()
    TAX_ID = auto()
    KEY = auto()
    BANKACC = auto()
|
||||
|
||||
|
||||
class TagType(Enum):
    """Tag names that the PII processor can assign to a column.

    The member value is the tag name passed to ``fqn.build`` under the
    ``PII`` classification.
    """

    SENSITIVE = "Sensitive"
    NONSENSITIVE = "NonSensitive"
|
||||
|
||||
|
||||
class ColumnNameScanner:
    """
    Column Name Scanner to scan column name

    Exposes two class-level tables of compiled, case-insensitive patterns
    keyed by PiiTypes: ``sensitive_regex`` and ``non_sensitive_regex``.
    Every pattern is wrapped in ``^.*`` / ``.*$`` so it can be applied with
    ``Pattern.match`` against a whole column name.
    """

    # The outermost iterable of a class-level comprehension is evaluated in
    # class scope, so the literal tuples below are safe to use here.
    sensitive_regex = {
        pii_type: re.compile(pattern, re.IGNORECASE)
        for pii_type, pattern in (
            (PiiTypes.PASSWORD, "^.*password.*$"),
            (PiiTypes.USER_NAME, "^.*user(id|name|).*$"),
            (PiiTypes.KEY, "^.*(key).*$"),
            (PiiTypes.SSN, "^.*(ssn|social).*$"),
            (PiiTypes.CREDIT_CARD, "^.*(card).*$"),
            (PiiTypes.BANKACC, "^.*(bank|acc|amount).*$"),
            (PiiTypes.EMAIL, "^.*(email|e-mail|mail).*$"),
        )
    }

    non_sensitive_regex = {
        pii_type: re.compile(pattern, re.IGNORECASE)
        for pii_type, pattern in (
            (
                PiiTypes.PERSON,
                "^.*(firstname|fname|lastname|lname|"
                "fullname|maidenname|_name|"
                "nickname|name_suffix|name).*$",
            ),
            (
                PiiTypes.BIRTH_DATE,
                "^.*(date_of_birth|dateofbirth|dob|"
                "birthday|date_of_death|dateofdeath).*$",
            ),
            (PiiTypes.GENDER, "^.*(gender).*$"),
            (PiiTypes.NATIONALITY, "^.*(nationality).*$"),
            (
                PiiTypes.ADDRESS,
                "^.*(address|city|state|county|country|"
                "zipcode|zip|postal|zone|borough).*$",
            ),
            (PiiTypes.PHONE, "^.*(phone).*$"),
        )
    }
|
||||
|
||||
|
||||
class NEREntity(Enum):
|
||||
CREDIT_CARD = TagType.SENSITIVE.value
|
||||
EMAIL_ADDRESS = TagType.SENSITIVE.value
|
||||
@ -42,7 +105,7 @@ class NEREntity(Enum):
|
||||
US_DRIVER_LICENSE = TagType.SENSITIVE.value
|
||||
DATE_TIME = TagType.NONSENSITIVE.value
|
||||
URL = TagType.SENSITIVE.value
|
||||
US_BANK_NUMBER = TagType.NONSENSITIVE.value
|
||||
US_BANK_NUMBER = TagType.SENSITIVE.value
|
||||
US_SSN = TagType.SENSITIVE.value
|
||||
PERSON = TagType.SENSITIVE.value
|
||||
US_PASSPORT = TagType.SENSITIVE.value
|
||||
@ -78,11 +141,21 @@ class NERScanner:
|
||||
most_used_label_occurrence = score[1]
|
||||
return label_score or (None, None)
|
||||
|
||||
def column_name_scan(self, column_name: str):
    """Classify a column from its name alone.

    The sensitive patterns of ColumnNameScanner are tried first, then the
    non-sensitive ones.

    Returns:
        ``(tag name, 1)`` for the first table with a matching pattern, or
        ``None`` when no pattern matches.
    """
    # Order matters: a name matching both tables is reported as sensitive.
    pattern_tables = (
        (ColumnNameScanner.sensitive_regex, TagType.SENSITIVE),
        (ColumnNameScanner.non_sensitive_regex, TagType.NONSENSITIVE),
    )
    for patterns, tag in pattern_tables:
        if any(regex.match(column_name) for regex in patterns.values()):
            return tag.value, 1

    return None
|
||||
|
||||
def scan(self, text) -> Tuple[str, float]:
|
||||
"""Scan the text and return an pii tag fqn and confidence/score"""
|
||||
|
||||
logging.debug("Processing '%s'", text)
|
||||
pii_tag_fqn = ""
|
||||
labels_score = {}
|
||||
self.text = [str(row) for row in text if row is not None]
|
||||
for row in self.text:
|
||||
@ -106,19 +179,23 @@ class NERScanner:
|
||||
|
||||
label, score = self.get_highest_score_label(labels_score)
|
||||
if label and score:
|
||||
label_type = NEREntity.__members__.get(
|
||||
tag_type = NEREntity.__members__.get(
|
||||
label, TagType.NONSENSITIVE.value
|
||||
).value
|
||||
pii_tag_fqn = fqn.build(
|
||||
self.metadata,
|
||||
entity_type=Tag,
|
||||
classification_name=PII,
|
||||
tag_name=label_type,
|
||||
)
|
||||
return tag_type, score
|
||||
|
||||
return pii_tag_fqn or "", score or 0
|
||||
return "", 0
|
||||
|
||||
def process(self, table_data: TableData, table_entity: Table, client: OpenMetadata):
|
||||
def process(
|
||||
self,
|
||||
table_data: TableData,
|
||||
table_entity: Table,
|
||||
client: OpenMetadata,
|
||||
thresold_confidence: float,
|
||||
):
|
||||
"""
|
||||
process function to start processing sample data
|
||||
"""
|
||||
len_of_rows = len(table_data.rows[0]) if table_data.rows else 0
|
||||
for idx in range(len_of_rows):
|
||||
pii_found = False
|
||||
@ -128,11 +205,19 @@ class NERScanner:
|
||||
continue
|
||||
if pii_found is True:
|
||||
continue
|
||||
pii_tag_fqn, confidence = self.scan([row[idx] for row in table_data.rows])
|
||||
if pii_tag_fqn and confidence >= 0.8:
|
||||
tag_type, confidence = self.column_name_scan(
|
||||
table_data.columns[idx].__root__
|
||||
) or self.scan([row[idx] for row in table_data.rows])
|
||||
if tag_type and confidence >= thresold_confidence / 100:
|
||||
tag_fqn = fqn.build(
|
||||
self.metadata,
|
||||
entity_type=Tag,
|
||||
classification_name=PII,
|
||||
tag_name=tag_type,
|
||||
)
|
||||
client.patch_column_tag(
|
||||
entity_id=table_entity.id,
|
||||
column_name=table_entity.columns[idx].name.__root__,
|
||||
tag_fqn=pii_tag_fqn,
|
||||
tag_fqn=tag_fqn,
|
||||
is_suggested=True,
|
||||
)
|
||||
|
@ -508,6 +508,7 @@ class Profiler(Generic[TMetric]):
|
||||
sample_data,
|
||||
self.profiler_interface.table_entity, # type: ignore
|
||||
self.profiler_interface.ometa_client, # type: ignore
|
||||
self.profiler_interface.source_config.confidence,
|
||||
)
|
||||
except Exception as exc:
|
||||
logger.warning(
|
||||
|
@ -685,12 +685,10 @@ UPDATED_TABLE_ENTITY = [
|
||||
fullyQualifiedName="test-service-table-patch.test-db.test-schema.customers.first_name",
|
||||
tags=[
|
||||
TagLabel(
|
||||
tagFQN=TagFQN(__root__="PII.Sensitive"),
|
||||
tagFQN=TagFQN(__root__="PII.NonSensitive"),
|
||||
description=(
|
||||
(
|
||||
"PII which if lost, compromised, or disclosed without authorization, could result in "
|
||||
"substantial harm, embarrassment, inconvenience, or unfairness to an individual."
|
||||
)
|
||||
"PII which is easily accessible from public sources and can include zip code, "
|
||||
"race, gender, and date of birth."
|
||||
),
|
||||
source="Classification",
|
||||
labelType="Automated",
|
||||
@ -716,7 +714,19 @@ UPDATED_TABLE_ENTITY = [
|
||||
dataTypeDisplay="varchar",
|
||||
description=None,
|
||||
fullyQualifiedName="test-service-table-patch.test-db.test-schema.customers.last_name",
|
||||
tags=[],
|
||||
tags=[
|
||||
TagLabel(
|
||||
tagFQN=TagFQN(__root__="PII.NonSensitive"),
|
||||
description=(
|
||||
"PII which is easily accessible from public sources and can include zip code, "
|
||||
"race, gender, and date of birth."
|
||||
),
|
||||
source="Classification",
|
||||
labelType="Automated",
|
||||
state="Suggested",
|
||||
href=None,
|
||||
)
|
||||
],
|
||||
constraint=None,
|
||||
ordinalPosition=None,
|
||||
jsonSchema=None,
|
||||
@ -884,7 +894,10 @@ class PiiProcessorTest(TestCase):
|
||||
TABLE_ENTITY.id = table_entity.id
|
||||
|
||||
self.nerscanner_processor.process(
|
||||
table_data=table_data, table_entity=TABLE_ENTITY, client=self.metadata
|
||||
table_data=table_data,
|
||||
table_entity=TABLE_ENTITY,
|
||||
client=self.metadata,
|
||||
thresold_confidence=85,
|
||||
)
|
||||
updated_table_entity = self.metadata.get_by_id(
|
||||
entity=Table, entity_id=table_entity.id, fields=["tags"]
|
||||
|
@ -36,6 +36,11 @@
|
||||
"type": "boolean",
|
||||
"default": false
|
||||
},
|
||||
"confidence": {
|
||||
"description": "Confidence threshold, as a percentage, that a PII detection must reach for the column to be tagged.",
|
||||
"type": "number",
|
||||
"default": 80
|
||||
},
|
||||
"generateSampleData": {
|
||||
"description": "Option to turn on/off generating sample data.",
|
||||
"type": "boolean",
|
||||
|
@ -194,6 +194,7 @@ const AddIngestion = ({
|
||||
resultLimit: sourceConfig?.resultLimit ?? 1000,
|
||||
metadataToESConfig: undefined,
|
||||
dbtUpdateDescriptions: sourceConfig?.dbtUpdateDescriptions ?? false,
|
||||
confidence: sourceConfig?.confidence,
|
||||
dbtClassificationName:
|
||||
sourceConfig?.dbtClassificationName ?? DBT_CLASSIFICATION_DEFAULT_VALUE, // default value from Json Schema
|
||||
}),
|
||||
@ -425,6 +426,7 @@ const AddIngestion = ({
|
||||
threadCount,
|
||||
timeoutSeconds,
|
||||
processPii,
|
||||
confidence,
|
||||
} = state;
|
||||
switch (type) {
|
||||
case PipelineType.Usage: {
|
||||
@ -460,6 +462,7 @@ const AddIngestion = ({
|
||||
type: profilerIngestionType,
|
||||
generateSampleData: ingestSampleData,
|
||||
profileSample: profileSample,
|
||||
confidence: processPii ? confidence : undefined,
|
||||
profileSampleType: profileSampleType,
|
||||
threadCount: threadCount,
|
||||
timeoutSeconds: timeoutSeconds,
|
||||
|
@ -103,6 +103,7 @@ const mockConfigureIngestion: ConfigureIngestionProps = {
|
||||
threadCount: 5,
|
||||
timeoutSeconds: 43200,
|
||||
useFqnFilter: false,
|
||||
confidence: 80,
|
||||
} as unknown as AddIngestionState,
|
||||
};
|
||||
|
||||
|
@ -85,6 +85,7 @@ const ConfigureIngestion = ({
|
||||
topicFilterPattern,
|
||||
useFqnFilter,
|
||||
processPii,
|
||||
confidence,
|
||||
overrideOwner,
|
||||
} = useMemo(
|
||||
() => ({
|
||||
@ -124,6 +125,7 @@ const ConfigureIngestion = ({
|
||||
useFqnFilter: data.useFqnFilter,
|
||||
processPii: data.processPii,
|
||||
overrideOwner: data.overrideOwner,
|
||||
confidence: data.confidence,
|
||||
}),
|
||||
[data]
|
||||
);
|
||||
@ -150,6 +152,11 @@ const ConfigureIngestion = ({
|
||||
profileSample: profileSample ?? undefined,
|
||||
});
|
||||
|
||||
// Passes the confidence value on through onChange; a null value is
// normalised to undefined first.
const handleConfidenceScore = (confidence: number | undefined | null) => {
  const nextConfidence = confidence ?? undefined;

  return onChange({ confidence: nextConfidence });
};
|
||||
|
||||
const handleProfileSampleTypeChange = (value: ProfileSampleType) => {
|
||||
onChange({
|
||||
profileSampleType: value,
|
||||
@ -481,20 +488,33 @@ const ConfigureIngestion = ({
|
||||
|
||||
// Renders the "auto tag PII" toggle for the profiler workflow and, while the
// toggle is on, the slider used to pick the confidence value.
const getProcessPiiTogglesForProfiler = () => {
  return (
    <Fragment>
      <Field>
        <div className="tw-flex tw-gap-1">
          <label>{t('label.auto-tag-pii-uppercase')}</label>
          <ToggleSwitchV1
            checked={processPii}
            handleCheck={handleProcessPii}
            testId="include-lineage"
          />
        </div>
        <p className="tw-text-grey-muted tw-mt-3">
          {t('message.process-pii-sensitive-column-message-profiler')}
        </p>
        {processPii && (
          <>
            {getSeparator('')}
            <Typography.Paragraph className="text-grey-muted m-t-0 m-b-xs text-sm">
              {t('message.confidence-percentage-message')}
            </Typography.Paragraph>
            {/*
              Fall back to 80 only when no value is set. `??` keeps an
              explicit 0, so the slider shows the value that is submitted.
            */}
            <SliderWithInput
              value={confidence ?? 80}
              onChange={handleConfidenceScore}
            />
          </>
        )}
      </Field>
    </Fragment>
  );
};
|
||||
|
||||
|
@ -135,6 +135,7 @@ export interface AddIngestionState {
|
||||
useFqnFilter: boolean;
|
||||
processPii: boolean;
|
||||
overrideOwner: boolean;
|
||||
confidence?: number;
|
||||
}
|
||||
|
||||
export enum ShowFilter {
|
||||
|
@ -910,6 +910,7 @@
|
||||
"click-text-to-view-details": "Click <0>{{text}}</0> to view details.",
|
||||
"closed-this-task": "closed this task",
|
||||
"collaborate-with-other-user": "to collaborate with other users.",
|
||||
"confidence-percentage-message": "Set the confidence level for the NLP model to use when inferring whether a column contains PII data or not.",
|
||||
"configure-a-service-description": "Enter a unique service name. The name must be unique across the category of services. For e.g., among database services, both MySQL and Snowflake cannot have the same service name (E.g. customer_data). However, different service categories (dashboard, pipeline) can have the same service name. Spaces are not supported in the service name. Characters like - _ are supported. Also, add a description.",
|
||||
"configure-airflow": "To set up metadata extraction through UI, you first need to configure and connect to Airflow. For more details visit our",
|
||||
"configure-dbt-model-description": "A dbt model provides transformation logic that creates a table from raw data. Lineage traces the path of data across tables, but a dbt model provides specifics. Select the required dbt source provider and fill in the mandatory fields. Integrate with dbt from OpenMetadata to view the models used to generate tables.",
|
||||
|
@ -910,6 +910,7 @@
|
||||
"click-text-to-view-details": "Click <0>{{text}}</0> to view details.",
|
||||
"closed-this-task": "closed this task",
|
||||
"collaborate-with-other-user": "to collaborate with other users.",
|
||||
"confidence-percentage-message": "Entrer le niveau de confiance à utiliser pour le modèle NLP lorsque celui-ci détermine si un champ contient des données IIP.",
|
||||
"configure-a-service-description": "Enter a unique service name. The name must be unique across the category of services. For e.g., among database services, both MySQL and Snowflake cannot have the same service name (E.g. customer_data). However, different service categories (dashboard, pipeline) can have the same service name. Spaces are not supported in the service name. Characters like - _ are supported. Also, add a description.",
|
||||
"configure-airflow": "To set up metadata extraction through UI, you first need to configure and connect to Airflow. For more details visit our",
|
||||
"configure-dbt-model-description": "A dbt model provides transformation logic that creates a table from raw data. Lineage traces the path of data across tables, but a dbt model provides specifics. Select the required dbt source provider and fill in the mandatory fields. Integrate with dbt from OpenMetadata to view the models used to generate tables.",
|
||||
|
@ -910,6 +910,7 @@
|
||||
"click-text-to-view-details": "Click <0>{{text}}</0> to view details.",
|
||||
"closed-this-task": "このタスクを閉じました",
|
||||
"collaborate-with-other-user": "to collaborate with other users.",
|
||||
"confidence-percentage-message": "Set the confidence level for the NLP model to use when inferring whether a column contains PII data or not.",
|
||||
"configure-a-service-description": "Enter a unique service name. The name must be unique across the category of services. For e.g., among database services, both MySQL and Snowflake cannot have the same service name (E.g. customer_data). However, different service categories (dashboard, pipeline) can have the same service name. Spaces are not supported in the service name. Characters like - _ are supported. Also, add a description.",
|
||||
"configure-airflow": "To set up metadata extraction through UI, you first need to configure and connect to Airflow. For more details visit our",
|
||||
"configure-dbt-model-description": "A dbt model provides transformation logic that creates a table from raw data. Lineage traces the path of data across tables, but a dbt model provides specifics. Select the required dbt source provider and fill in the mandatory fields. Integrate with dbt from OpenMetadata to view the models used to generate tables.",
|
||||
|
@ -910,6 +910,7 @@
|
||||
"click-text-to-view-details": "Click <0>{{text}}</0> to view details.",
|
||||
"closed-this-task": "closed this task",
|
||||
"collaborate-with-other-user": "to collaborate with other users.",
|
||||
"confidence-percentage-message": "Set the confidence level for the NLP model to use when inferring whether a column contains PII data or not.",
|
||||
"configure-a-service-description": "Enter a unique service name. The name must be unique across the category of services. For e.g., among database services, both MySQL and Snowflake cannot have the same service name (E.g. customer_data). However, different service categories (dashboard, pipeline) can have the same service name. Spaces are not supported in the service name. Characters like - _ are supported. Also, add a description.",
|
||||
"configure-airflow": "To set up metadata extraction through UI, you first need to configure and connect to Airflow. For more details visit our",
|
||||
"configure-dbt-model-description": "A dbt model provides transformation logic that creates a table from raw data. Lineage traces the path of data across tables, but a dbt model provides specifics. Select the required dbt source provider and fill in the mandatory fields. Integrate with dbt from OpenMetadata to view the models used to generate tables.",
|
||||
|
Loading…
x
Reference in New Issue
Block a user