improvement in pii tagging (#10696)

* improvement in pii tagging

* fix conflict and changes as per comment

* Added confidence field

* changes as per comments

* Apply suggestions from code review

Co-authored-by: Teddy <teddy.crepineau@gmail.com>

---------

Co-authored-by: Ashish Gupta <ashish@getcollate.io>
Co-authored-by: Pere Miquel Brull <peremiquelbrull@gmail.com>
Co-authored-by: Teddy <teddy.crepineau@gmail.com>
This commit is contained in:
NiharDoshi99 2023-03-28 19:37:48 +05:30 committed by GitHub
parent 4853dfc6c8
commit 46afe69811
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
12 changed files with 169 additions and 36 deletions

View File

@ -13,8 +13,9 @@
Processor util to fetch pii sensitive columns
"""
import logging
import re
import traceback
from enum import Enum
from enum import Enum, auto
from typing import Optional, Tuple
from metadata.generated.schema.entity.classification.tag import Tag
@ -25,11 +26,73 @@ from metadata.utils import fqn
PII = "PII"
class PiiTypes(Enum):
    """PiiTypes enumerates the different types of PII data"""

    # Values are assigned with auto(); only the member identity is used
    # (as keys in ColumnNameScanner's regex tables), so the concrete
    # integer values are not significant.
    NONE = auto()
    UNSUPPORTED = auto()
    PHONE = auto()
    EMAIL = auto()
    CREDIT_CARD = auto()
    ADDRESS = auto()
    ADDRESS_LOCATION = auto()
    PERSON = auto()
    LOCATION = auto()
    BIRTH_DATE = auto()
    GENDER = auto()
    NATIONALITY = auto()
    IP_ADDRESS = auto()
    SSN = auto()
    USER_NAME = auto()
    PASSWORD = auto()
    ETHNICITY = auto()
    TAX_ID = auto()
    KEY = auto()
    BANKACC = auto()
class TagType(Enum):
    """Sensitivity buckets used as PII tag names.

    The string values are used directly as tag names under the PII
    classification (e.g. "PII.Sensitive") when building tag FQNs.
    """

    SENSITIVE = "Sensitive"
    NONSENSITIVE = "NonSensitive"
class ColumnNameScanner:
    """
    Column Name Scanner to scan column name
    """

    # Pre-compiled, case-insensitive patterns keyed by the PII type they
    # detect. Patterns are matched with `.match()`, so the "^.*...*$"
    # wrapping makes them behave like a substring search over the whole
    # column name. Callers check sensitive_regex before
    # non_sensitive_regex, so a name matching both is tagged Sensitive.
    sensitive_regex = {
        PiiTypes.PASSWORD: re.compile("^.*password.*$", re.IGNORECASE),
        PiiTypes.USER_NAME: re.compile("^.*user(id|name|).*$", re.IGNORECASE),
        PiiTypes.KEY: re.compile("^.*(key).*$", re.IGNORECASE),
        PiiTypes.SSN: re.compile("^.*(ssn|social).*$", re.IGNORECASE),
        PiiTypes.CREDIT_CARD: re.compile("^.*(card).*$", re.IGNORECASE),
        PiiTypes.BANKACC: re.compile("^.*(bank|acc|amount).*$", re.IGNORECASE),
        PiiTypes.EMAIL: re.compile("^.*(email|e-mail|mail).*$", re.IGNORECASE),
    }
    non_sensitive_regex = {
        PiiTypes.PERSON: re.compile(
            "^.*(firstname|fname|lastname|lname|"
            "fullname|maidenname|_name|"
            "nickname|name_suffix|name).*$",
            re.IGNORECASE,
        ),
        PiiTypes.BIRTH_DATE: re.compile(
            "^.*(date_of_birth|dateofbirth|dob|"
            "birthday|date_of_death|dateofdeath).*$",
            re.IGNORECASE,
        ),
        PiiTypes.GENDER: re.compile("^.*(gender).*$", re.IGNORECASE),
        PiiTypes.NATIONALITY: re.compile("^.*(nationality).*$", re.IGNORECASE),
        PiiTypes.ADDRESS: re.compile(
            "^.*(address|city|state|county|country|"
            "zipcode|zip|postal|zone|borough).*$",
            re.IGNORECASE,
        ),
        PiiTypes.PHONE: re.compile("^.*(phone).*$", re.IGNORECASE),
    }
class NEREntity(Enum):
CREDIT_CARD = TagType.SENSITIVE.value
EMAIL_ADDRESS = TagType.SENSITIVE.value
@ -42,7 +105,7 @@ class NEREntity(Enum):
US_DRIVER_LICENSE = TagType.SENSITIVE.value
DATE_TIME = TagType.NONSENSITIVE.value
URL = TagType.SENSITIVE.value
US_BANK_NUMBER = TagType.NONSENSITIVE.value
US_BANK_NUMBER = TagType.SENSITIVE.value
US_SSN = TagType.SENSITIVE.value
PERSON = TagType.SENSITIVE.value
US_PASSPORT = TagType.SENSITIVE.value
@ -78,11 +141,21 @@ class NERScanner:
most_used_label_occurrence = score[1]
return label_score or (None, None)
def column_name_scan(self, column_name: str):
    """Classify a column from its name alone.

    Checks the name against ColumnNameScanner's sensitive patterns
    first, then the non-sensitive ones. Returns a ``(tag_name,
    confidence)`` pair with confidence fixed at 1, or ``None`` when no
    pattern matches (so callers can fall back to content scanning).
    """
    if any(
        pattern.match(column_name)
        for pattern in ColumnNameScanner.sensitive_regex.values()
    ):
        return TagType.SENSITIVE.value, 1
    if any(
        pattern.match(column_name)
        for pattern in ColumnNameScanner.non_sensitive_regex.values()
    ):
        return TagType.NONSENSITIVE.value, 1
    return None
def scan(self, text) -> Tuple[str, float]:
"""Scan the text and return an pii tag fqn and confidence/score"""
logging.debug("Processing '%s'", text)
pii_tag_fqn = ""
labels_score = {}
self.text = [str(row) for row in text if row is not None]
for row in self.text:
@ -106,19 +179,23 @@ class NERScanner:
label, score = self.get_highest_score_label(labels_score)
if label and score:
label_type = NEREntity.__members__.get(
tag_type = NEREntity.__members__.get(
label, TagType.NONSENSITIVE.value
).value
pii_tag_fqn = fqn.build(
self.metadata,
entity_type=Tag,
classification_name=PII,
tag_name=label_type,
)
return tag_type, score
return pii_tag_fqn or "", score or 0
return "", 0
def process(self, table_data: TableData, table_entity: Table, client: OpenMetadata):
def process(
self,
table_data: TableData,
table_entity: Table,
client: OpenMetadata,
thresold_confidence: float,
):
"""
process function to start processing sample data
"""
len_of_rows = len(table_data.rows[0]) if table_data.rows else 0
for idx in range(len_of_rows):
pii_found = False
@ -128,11 +205,19 @@ class NERScanner:
continue
if pii_found is True:
continue
pii_tag_fqn, confidence = self.scan([row[idx] for row in table_data.rows])
if pii_tag_fqn and confidence >= 0.8:
tag_type, confidence = self.column_name_scan(
table_data.columns[idx].__root__
) or self.scan([row[idx] for row in table_data.rows])
if tag_type and confidence >= thresold_confidence / 100:
tag_fqn = fqn.build(
self.metadata,
entity_type=Tag,
classification_name=PII,
tag_name=tag_type,
)
client.patch_column_tag(
entity_id=table_entity.id,
column_name=table_entity.columns[idx].name.__root__,
tag_fqn=pii_tag_fqn,
tag_fqn=tag_fqn,
is_suggested=True,
)

View File

@ -508,6 +508,7 @@ class Profiler(Generic[TMetric]):
sample_data,
self.profiler_interface.table_entity, # type: ignore
self.profiler_interface.ometa_client, # type: ignore
self.profiler_interface.source_config.confidence,
)
except Exception as exc:
logger.warning(

View File

@ -685,12 +685,10 @@ UPDATED_TABLE_ENTITY = [
fullyQualifiedName="test-service-table-patch.test-db.test-schema.customers.first_name",
tags=[
TagLabel(
tagFQN=TagFQN(__root__="PII.Sensitive"),
tagFQN=TagFQN(__root__="PII.NonSensitive"),
description=(
(
"PII which if lost, compromised, or disclosed without authorization, could result in "
"substantial harm, embarrassment, inconvenience, or unfairness to an individual."
)
"PII which is easily accessible from public sources and can include zip code, "
"race, gender, and date of birth."
),
source="Classification",
labelType="Automated",
@ -716,7 +714,19 @@ UPDATED_TABLE_ENTITY = [
dataTypeDisplay="varchar",
description=None,
fullyQualifiedName="test-service-table-patch.test-db.test-schema.customers.last_name",
tags=[],
tags=[
TagLabel(
tagFQN=TagFQN(__root__="PII.NonSensitive"),
description=(
"PII which is easily accessible from public sources and can include zip code, "
"race, gender, and date of birth."
),
source="Classification",
labelType="Automated",
state="Suggested",
href=None,
)
],
constraint=None,
ordinalPosition=None,
jsonSchema=None,
@ -884,7 +894,10 @@ class PiiProcessorTest(TestCase):
TABLE_ENTITY.id = table_entity.id
self.nerscanner_processor.process(
table_data=table_data, table_entity=TABLE_ENTITY, client=self.metadata
table_data=table_data,
table_entity=TABLE_ENTITY,
client=self.metadata,
thresold_confidence=85,
)
updated_table_entity = self.metadata.get_by_id(
entity=Table, entity_id=table_entity.id, fields=["tags"]

View File

@ -36,6 +36,11 @@
"type": "boolean",
"default": false
},
"confidence": {
"description": "Confidence threshold (0-100) that a column's PII score must reach before the column is tagged",
"type": "number",
"default": 80
},
"generateSampleData": {
"description": "Option to turn on/off generating sample data.",
"type": "boolean",

View File

@ -194,6 +194,7 @@ const AddIngestion = ({
resultLimit: sourceConfig?.resultLimit ?? 1000,
metadataToESConfig: undefined,
dbtUpdateDescriptions: sourceConfig?.dbtUpdateDescriptions ?? false,
confidence: sourceConfig?.confidence,
dbtClassificationName:
sourceConfig?.dbtClassificationName ?? DBT_CLASSIFICATION_DEFAULT_VALUE, // default value from Json Schema
}),
@ -425,6 +426,7 @@ const AddIngestion = ({
threadCount,
timeoutSeconds,
processPii,
confidence,
} = state;
switch (type) {
case PipelineType.Usage: {
@ -460,6 +462,7 @@ const AddIngestion = ({
type: profilerIngestionType,
generateSampleData: ingestSampleData,
profileSample: profileSample,
confidence: processPii ? confidence : undefined,
profileSampleType: profileSampleType,
threadCount: threadCount,
timeoutSeconds: timeoutSeconds,

View File

@ -103,6 +103,7 @@ const mockConfigureIngestion: ConfigureIngestionProps = {
threadCount: 5,
timeoutSeconds: 43200,
useFqnFilter: false,
confidence: 80,
} as unknown as AddIngestionState,
};

View File

@ -85,6 +85,7 @@ const ConfigureIngestion = ({
topicFilterPattern,
useFqnFilter,
processPii,
confidence,
overrideOwner,
} = useMemo(
() => ({
@ -124,6 +125,7 @@ const ConfigureIngestion = ({
useFqnFilter: data.useFqnFilter,
processPii: data.processPii,
overrideOwner: data.overrideOwner,
confidence: data.confidence,
}),
[data]
);
@ -150,6 +152,11 @@ const ConfigureIngestion = ({
profileSample: profileSample ?? undefined,
});
const handleConfidenceScore = (confidence: number | undefined | null) =>
onChange({
confidence: confidence ?? undefined,
});
const handleProfileSampleTypeChange = (value: ProfileSampleType) => {
onChange({
profileSampleType: value,
@ -481,20 +488,33 @@ const ConfigureIngestion = ({
const getProcessPiiTogglesForProfiler = () => {
return (
<Field>
<div className="tw-flex tw-gap-1">
<label>{t('label.auto-tag-pii-uppercase')}</label>
<ToggleSwitchV1
checked={processPii}
handleCheck={handleProcessPii}
testId="include-lineage"
/>
</div>
<p className="tw-text-grey-muted tw-mt-3">
{t('message.process-pii-sensitive-column-message-profiler')}
</p>
{getSeparator('')}
</Field>
<Fragment>
<Field>
<div className="tw-flex tw-gap-1">
<label>{t('label.auto-tag-pii-uppercase')}</label>
<ToggleSwitchV1
checked={processPii}
handleCheck={handleProcessPii}
testId="include-lineage"
/>
</div>
<p className="tw-text-grey-muted tw-mt-3">
{t('message.process-pii-sensitive-column-message-profiler')}
</p>
{processPii && (
<>
{getSeparator('')}
<Typography.Paragraph className="text-grey-muted m-t-0 m-b-xs text-sm">
{t('message.confidence-percentage-message')}
</Typography.Paragraph>
<SliderWithInput
value={confidence || 80}
onChange={handleConfidenceScore}
/>
</>
)}
</Field>
</Fragment>
);
};

View File

@ -135,6 +135,7 @@ export interface AddIngestionState {
useFqnFilter: boolean;
processPii: boolean;
overrideOwner: boolean;
confidence?: number;
}
export enum ShowFilter {

View File

@ -910,6 +910,7 @@
"click-text-to-view-details": "Click <0>{{text}}</0> to view details.",
"closed-this-task": "closed this task",
"collaborate-with-other-user": "to collaborate with other users.",
"confidence-percentage-message": "Set the confidence level for the NLP model to use when inferring whether a column contains PII data or not.",
"configure-a-service-description": "Enter a unique service name. The name must be unique across the category of services. For e.g., among database services, both MySQL and Snowflake cannot have the same service name (E.g. customer_data). However, different service categories (dashboard, pipeline) can have the same service name. Spaces are not supported in the service name. Characters like - _ are supported. Also, add a description.",
"configure-airflow": "To set up metadata extraction through UI, you first need to configure and connect to Airflow. For more details visit our",
"configure-dbt-model-description": "A dbt model provides transformation logic that creates a table from raw data. Lineage traces the path of data across tables, but a dbt model provides specifics. Select the required dbt source provider and fill in the mandatory fields. Integrate with dbt from OpenMetadata to view the models used to generate tables.",

View File

@ -910,6 +910,7 @@
"click-text-to-view-details": "Click <0>{{text}}</0> to view details.",
"closed-this-task": "closed this task",
"collaborate-with-other-user": "to collaborate with other users.",
"confidence-percentage-message": "Entrer le niveau de confiance à utiliser pour le modèle NLP lorsque celui-ci détermine si un champ contient des données IIP.",
"configure-a-service-description": "Enter a unique service name. The name must be unique across the category of services. For e.g., among database services, both MySQL and Snowflake cannot have the same service name (E.g. customer_data). However, different service categories (dashboard, pipeline) can have the same service name. Spaces are not supported in the service name. Characters like - _ are supported. Also, add a description.",
"configure-airflow": "To set up metadata extraction through UI, you first need to configure and connect to Airflow. For more details visit our",
"configure-dbt-model-description": "A dbt model provides transformation logic that creates a table from raw data. Lineage traces the path of data across tables, but a dbt model provides specifics. Select the required dbt source provider and fill in the mandatory fields. Integrate with dbt from OpenMetadata to view the models used to generate tables.",

View File

@ -910,6 +910,7 @@
"click-text-to-view-details": "Click <0>{{text}}</0> to view details.",
"closed-this-task": "このタスクを閉じました",
"collaborate-with-other-user": "to collaborate with other users.",
"confidence-percentage-message": "Set the confidence level for the NLP model to use when inferring whether a column contains PII data or not.",
"configure-a-service-description": "Enter a unique service name. The name must be unique across the category of services. For e.g., among database services, both MySQL and Snowflake cannot have the same service name (E.g. customer_data). However, different service categories (dashboard, pipeline) can have the same service name. Spaces are not supported in the service name. Characters like - _ are supported. Also, add a description.",
"configure-airflow": "To set up metadata extraction through UI, you first need to configure and connect to Airflow. For more details visit our",
"configure-dbt-model-description": "A dbt model provides transformation logic that creates a table from raw data. Lineage traces the path of data across tables, but a dbt model provides specifics. Select the required dbt source provider and fill in the mandatory fields. Integrate with dbt from OpenMetadata to view the models used to generate tables.",

View File

@ -910,6 +910,7 @@
"click-text-to-view-details": "Click <0>{{text}}</0> to view details.",
"closed-this-task": "closed this task",
"collaborate-with-other-user": "to collaborate with other users.",
"confidence-percentage-message": "Set the confidence level for the NLP model to use when inferring whether a column contains PII data or not.",
"configure-a-service-description": "Enter a unique service name. The name must be unique across the category of services. For e.g., among database services, both MySQL and Snowflake cannot have the same service name (E.g. customer_data). However, different service categories (dashboard, pipeline) can have the same service name. Spaces are not supported in the service name. Characters like - _ are supported. Also, add a description.",
"configure-airflow": "To set up metadata extraction through UI, you first need to configure and connect to Airflow. For more details visit our",
"configure-dbt-model-description": "A dbt model provides transformation logic that creates a table from raw data. Lineage traces the path of data across tables, but a dbt model provides specifics. Select the required dbt source provider and fill in the mandatory fields. Integrate with dbt from OpenMetadata to view the models used to generate tables.",