Improvement in PII tagging (#10696)

* Improvement in PII tagging

* Fix conflicts and apply changes as per review comments

* Added confidence field

* Changes as per review comments

* Apply suggestions from code review

Co-authored-by: Teddy <teddy.crepineau@gmail.com>

---------

Co-authored-by: Ashish Gupta <ashish@getcollate.io>
Co-authored-by: Pere Miquel Brull <peremiquelbrull@gmail.com>
Co-authored-by: Teddy <teddy.crepineau@gmail.com>
NiharDoshi99 committed 2023-03-28 19:37:48 +05:30 (committed by GitHub)
parent 4853dfc6c8
commit 46afe69811
12 changed files with 169 additions and 36 deletions

View File (PII scanner processor util, Python)

@@ -13,8 +13,9 @@
 Processor util to fetch pii sensitive columns
 """
 import logging
+import re
 import traceback
-from enum import Enum
+from enum import Enum, auto
 from typing import Optional, Tuple
 from metadata.generated.schema.entity.classification.tag import Tag
@@ -25,11 +26,73 @@ from metadata.utils import fqn
 PII = "PII"
 
 
+class PiiTypes(Enum):
+    """PiiTypes enumerates the different types of PII data"""
+
+    NONE = auto()
+    UNSUPPORTED = auto()
+    PHONE = auto()
+    EMAIL = auto()
+    CREDIT_CARD = auto()
+    ADDRESS = auto()
+    ADDRESS_LOCATION = auto()
+    PERSON = auto()
+    LOCATION = auto()
+    BIRTH_DATE = auto()
+    GENDER = auto()
+    NATIONALITY = auto()
+    IP_ADDRESS = auto()
+    SSN = auto()
+    USER_NAME = auto()
+    PASSWORD = auto()
+    ETHNICITY = auto()
+    TAX_ID = auto()
+    KEY = auto()
+    BANKACC = auto()
+
+
 class TagType(Enum):
     SENSITIVE = "Sensitive"
     NONSENSITIVE = "NonSensitive"
 
 
+class ColumnNameScanner:
+    """
+    Column Name Scanner to scan column name
+    """
+
+    sensitive_regex = {
+        PiiTypes.PASSWORD: re.compile("^.*password.*$", re.IGNORECASE),
+        PiiTypes.USER_NAME: re.compile("^.*user(id|name|).*$", re.IGNORECASE),
+        PiiTypes.KEY: re.compile("^.*(key).*$", re.IGNORECASE),
+        PiiTypes.SSN: re.compile("^.*(ssn|social).*$", re.IGNORECASE),
+        PiiTypes.CREDIT_CARD: re.compile("^.*(card).*$", re.IGNORECASE),
+        PiiTypes.BANKACC: re.compile("^.*(bank|acc|amount).*$", re.IGNORECASE),
+        PiiTypes.EMAIL: re.compile("^.*(email|e-mail|mail).*$", re.IGNORECASE),
+    }
+    non_sensitive_regex = {
+        PiiTypes.PERSON: re.compile(
+            "^.*(firstname|fname|lastname|lname|"
+            "fullname|maidenname|_name|"
+            "nickname|name_suffix|name).*$",
+            re.IGNORECASE,
+        ),
+        PiiTypes.BIRTH_DATE: re.compile(
+            "^.*(date_of_birth|dateofbirth|dob|"
+            "birthday|date_of_death|dateofdeath).*$",
+            re.IGNORECASE,
+        ),
+        PiiTypes.GENDER: re.compile("^.*(gender).*$", re.IGNORECASE),
+        PiiTypes.NATIONALITY: re.compile("^.*(nationality).*$", re.IGNORECASE),
+        PiiTypes.ADDRESS: re.compile(
+            "^.*(address|city|state|county|country|"
+            "zipcode|zip|postal|zone|borough).*$",
+            re.IGNORECASE,
+        ),
+        PiiTypes.PHONE: re.compile("^.*(phone).*$", re.IGNORECASE),
+    }
+
+
 class NEREntity(Enum):
     CREDIT_CARD = TagType.SENSITIVE.value
     EMAIL_ADDRESS = TagType.SENSITIVE.value
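
For context, the ColumnNameScanner added above is a pure regex heuristic: sensitive patterns are tried before non-sensitive ones, and the first match wins with a fixed confidence of 1. A minimal standalone sketch of the same idea (pattern tables trimmed for brevity; structure follows the diff):

import re
from typing import Optional, Tuple

# Trimmed-down pattern tables mirroring the ones added in this change
SENSITIVE = [re.compile(p, re.IGNORECASE) for p in (r"^.*password.*$", r"^.*(ssn|social).*$")]
NON_SENSITIVE = [re.compile(p, re.IGNORECASE) for p in (r"^.*(gender).*$", r"^.*(phone).*$")]

def column_name_scan(column_name: str) -> Optional[Tuple[str, int]]:
    """Return (tag, confidence) on a name match, else None."""
    for pattern in SENSITIVE:  # sensitive patterns take priority
        if pattern.match(column_name):
            return "Sensitive", 1
    for pattern in NON_SENSITIVE:
        if pattern.match(column_name):
            return "NonSensitive", 1
    return None  # caller falls through to the NER scan

print(column_name_scan("user_password"))  # ('Sensitive', 1)
print(column_name_scan("phone_number"))   # ('NonSensitive', 1)
print(column_name_scan("order_total"))    # None
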
@@ -42,7 +105,7 @@ class NEREntity(Enum):
     US_DRIVER_LICENSE = TagType.SENSITIVE.value
     DATE_TIME = TagType.NONSENSITIVE.value
     URL = TagType.SENSITIVE.value
-    US_BANK_NUMBER = TagType.NONSENSITIVE.value
+    US_BANK_NUMBER = TagType.SENSITIVE.value
     US_SSN = TagType.SENSITIVE.value
     PERSON = TagType.SENSITIVE.value
     US_PASSPORT = TagType.SENSITIVE.value
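
Note the one functional change in this hunk: US_BANK_NUMBER is reclassified from NonSensitive to Sensitive. Because several NEREntity members share the same value, Python treats the later ones as enum aliases, which is why the lookup in scan() goes through NEREntity.__members__ (which includes aliases) rather than NEREntity(label). A small sketch of that lookup, with the missing-label fallback handled explicitly:

from enum import Enum

class TagType(Enum):
    SENSITIVE = "Sensitive"
    NONSENSITIVE = "NonSensitive"

class NEREntity(Enum):
    CREDIT_CARD = TagType.SENSITIVE.value
    DATE_TIME = TagType.NONSENSITIVE.value
    US_BANK_NUMBER = TagType.SENSITIVE.value  # alias of CREDIT_CARD; reclassified here

def tag_for(label: str) -> str:
    # Unknown NER labels default to NonSensitive, mirroring the
    # NEREntity.__members__.get(...) lookup used in scan()
    member = NEREntity.__members__.get(label)
    return member.value if member else TagType.NONSENSITIVE.value

print(tag_for("US_BANK_NUMBER"))   # Sensitive
print(tag_for("BRAND_NEW_LABEL"))  # NonSensitive
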
@@ -78,11 +141,21 @@ class NERScanner:
             most_used_label_occurrence = score[1]
         return label_score or (None, None)
 
+    def column_name_scan(self, column_name: str):
+        for pii_type_pattern in ColumnNameScanner.sensitive_regex.values():
+            if pii_type_pattern.match(column_name) is not None:
+                return TagType.SENSITIVE.value, 1
+
+        for pii_type_pattern in ColumnNameScanner.non_sensitive_regex.values():
+            if pii_type_pattern.match(column_name) is not None:
+                return TagType.NONSENSITIVE.value, 1
+
+        return None
+
     def scan(self, text) -> Tuple[str, float]:
         """Scan the text and return an pii tag fqn and confidence/score"""
         logging.debug("Processing '%s'", text)
-        pii_tag_fqn = ""
         labels_score = {}
         self.text = [str(row) for row in text if row is not None]
         for row in self.text:
@@ -106,19 +179,23 @@ class NERScanner:
         label, score = self.get_highest_score_label(labels_score)
         if label and score:
-            label_type = NEREntity.__members__.get(
+            tag_type = NEREntity.__members__.get(
                 label, TagType.NONSENSITIVE.value
             ).value
-            pii_tag_fqn = fqn.build(
-                self.metadata,
-                entity_type=Tag,
-                classification_name=PII,
-                tag_name=label_type,
-            )
+            return tag_type, score
 
-        return pii_tag_fqn or "", score or 0
+        return "", 0
 
-    def process(self, table_data: TableData, table_entity: Table, client: OpenMetadata):
+    def process(
+        self,
+        table_data: TableData,
+        table_entity: Table,
+        client: OpenMetadata,
+        thresold_confidence: float,
+    ):
+        """
+        process function to start processing sample data
+        """
         len_of_rows = len(table_data.rows[0]) if table_data.rows else 0
         for idx in range(len_of_rows):
             pii_found = False
@@ -128,11 +205,19 @@ class NERScanner:
                 continue
             if pii_found is True:
                 continue
-            pii_tag_fqn, confidence = self.scan([row[idx] for row in table_data.rows])
-            if pii_tag_fqn and confidence >= 0.8:
+            tag_type, confidence = self.column_name_scan(
+                table_data.columns[idx].__root__
+            ) or self.scan([row[idx] for row in table_data.rows])
+            if tag_type and confidence >= thresold_confidence / 100:
+                tag_fqn = fqn.build(
+                    self.metadata,
+                    entity_type=Tag,
+                    classification_name=PII,
+                    tag_name=tag_type,
+                )
                 client.patch_column_tag(
                     entity_id=table_entity.id,
                     column_name=table_entity.columns[idx].name.__root__,
-                    tag_fqn=pii_tag_fqn,
+                    tag_fqn=tag_fqn,
                     is_suggested=True,
                 )
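
Putting the pieces of process() together: a column-name hit short-circuits the more expensive NER scan (the name scan returns None on no match, so the or-expression falls through), and the new thresold_confidence parameter is a percentage compared against a 0-1 score, so a name match (confidence 1) clears any threshold up to 100. A compact sketch of the per-column decision, with the two scanners injected as plain callables so the snippet runs standalone:

# Sketch of the decision in process(); names simplified from the diff
def decide(column_name, column_values, thresold_confidence, name_scan, ner_scan):
    # A name match returns (tag, 1); None falls through to the NER scan
    tag_type, confidence = name_scan(column_name) or ner_scan(column_values)
    # thresold_confidence is a percent (e.g. 80); scan scores are 0..1
    if tag_type and confidence >= thresold_confidence / 100:
        return tag_type
    return None

print(decide("email", ["a@b.com"], 80,
             name_scan=lambda name: ("Sensitive", 1) if "mail" in name else None,
             ner_scan=lambda values: ("", 0)))  # Sensitive
print(decide("notes", ["hello"], 80,
             name_scan=lambda name: None,
             ner_scan=lambda values: ("Sensitive", 0.6)))  # None: 0.6 < 0.8
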

View File (profiler core, Python)

@@ -508,6 +508,7 @@ class Profiler(Generic[TMetric]):
                 sample_data,
                 self.profiler_interface.table_entity,  # type: ignore
                 self.profiler_interface.ometa_client,  # type: ignore
+                self.profiler_interface.source_config.confidence,
             )
         except Exception as exc:
             logger.warning(

View File (PII processor unit tests, Python)

@@ -685,12 +685,10 @@ UPDATED_TABLE_ENTITY = [
         fullyQualifiedName="test-service-table-patch.test-db.test-schema.customers.first_name",
         tags=[
             TagLabel(
-                tagFQN=TagFQN(__root__="PII.Sensitive"),
+                tagFQN=TagFQN(__root__="PII.NonSensitive"),
                 description=(
-                    (
-                        "PII which if lost, compromised, or disclosed without authorization, could result in "
-                        "substantial harm, embarrassment, inconvenience, or unfairness to an individual."
-                    )
+                    "PII which is easily accessible from public sources and can include zip code, "
+                    "race, gender, and date of birth."
                 ),
                 source="Classification",
                 labelType="Automated",
@@ -716,7 +714,19 @@
         dataTypeDisplay="varchar",
         description=None,
         fullyQualifiedName="test-service-table-patch.test-db.test-schema.customers.last_name",
-        tags=[],
+        tags=[
+            TagLabel(
+                tagFQN=TagFQN(__root__="PII.NonSensitive"),
+                description=(
+                    "PII which is easily accessible from public sources and can include zip code, "
+                    "race, gender, and date of birth."
+                ),
+                source="Classification",
+                labelType="Automated",
+                state="Suggested",
+                href=None,
+            )
+        ],
         constraint=None,
         ordinalPosition=None,
         jsonSchema=None,
@@ -884,7 +894,10 @@ class PiiProcessorTest(TestCase):
         TABLE_ENTITY.id = table_entity.id
         self.nerscanner_processor.process(
-            table_data=table_data, table_entity=TABLE_ENTITY, client=self.metadata
+            table_data=table_data,
+            table_entity=TABLE_ENTITY,
+            client=self.metadata,
+            thresold_confidence=85,
         )
         updated_table_entity = self.metadata.get_by_id(
             entity=Table, entity_id=table_entity.id, fields=["tags"]

View File (profiler pipeline JSON schema)

@@ -36,6 +36,11 @@
       "type": "boolean",
       "default": false
     },
+    "confidence": {
+      "description": "Set the Confidence value for which you want the column to be marked",
+      "type": "number",
+      "default": 80
+    },
     "generateSampleData": {
       "description": "Option to turn on/off generating sample data.",
       "type": "boolean",

View File (AddIngestion component, TypeScript)

@@ -194,6 +194,7 @@ const AddIngestion = ({
       resultLimit: sourceConfig?.resultLimit ?? 1000,
       metadataToESConfig: undefined,
       dbtUpdateDescriptions: sourceConfig?.dbtUpdateDescriptions ?? false,
+      confidence: sourceConfig?.confidence,
       dbtClassificationName:
         sourceConfig?.dbtClassificationName ?? DBT_CLASSIFICATION_DEFAULT_VALUE, // default value from Json Schema
     }),
@@ -425,6 +426,7 @@ const AddIngestion = ({
       threadCount,
       timeoutSeconds,
       processPii,
+      confidence,
     } = state;
     switch (type) {
       case PipelineType.Usage: {
@@ -460,6 +462,7 @@ const AddIngestion = ({
           type: profilerIngestionType,
           generateSampleData: ingestSampleData,
           profileSample: profileSample,
+          confidence: processPii ? confidence : undefined,
           profileSampleType: profileSampleType,
           threadCount: threadCount,
           timeoutSeconds: timeoutSeconds,

View File (ConfigureIngestion test mock, TypeScript)

@@ -103,6 +103,7 @@ const mockConfigureIngestion: ConfigureIngestionProps = {
     threadCount: 5,
     timeoutSeconds: 43200,
     useFqnFilter: false,
+    confidence: 80,
   } as unknown as AddIngestionState,
 };

View File (ConfigureIngestion component, TypeScript)

@@ -85,6 +85,7 @@ const ConfigureIngestion = ({
     topicFilterPattern,
     useFqnFilter,
     processPii,
+    confidence,
     overrideOwner,
   } = useMemo(
     () => ({
@@ -124,6 +125,7 @@ const ConfigureIngestion = ({
       useFqnFilter: data.useFqnFilter,
       processPii: data.processPii,
       overrideOwner: data.overrideOwner,
+      confidence: data.confidence,
     }),
     [data]
   );
@@ -150,6 +152,11 @@ const ConfigureIngestion = ({
       profileSample: profileSample ?? undefined,
     });
 
+  const handleConfidenceScore = (confidence: number | undefined | null) =>
+    onChange({
+      confidence: confidence ?? undefined,
+    });
+
   const handleProfileSampleTypeChange = (value: ProfileSampleType) => {
     onChange({
       profileSampleType: value,
@@ -481,20 +488,33 @@ const ConfigureIngestion = ({
   const getProcessPiiTogglesForProfiler = () => {
     return (
-      <Field>
-        <div className="tw-flex tw-gap-1">
-          <label>{t('label.auto-tag-pii-uppercase')}</label>
-          <ToggleSwitchV1
-            checked={processPii}
-            handleCheck={handleProcessPii}
-            testId="include-lineage"
-          />
-        </div>
-        <p className="tw-text-grey-muted tw-mt-3">
-          {t('message.process-pii-sensitive-column-message-profiler')}
-        </p>
-        {getSeparator('')}
-      </Field>
+      <Fragment>
+        <Field>
+          <div className="tw-flex tw-gap-1">
+            <label>{t('label.auto-tag-pii-uppercase')}</label>
+            <ToggleSwitchV1
+              checked={processPii}
+              handleCheck={handleProcessPii}
+              testId="include-lineage"
+            />
+          </div>
+          <p className="tw-text-grey-muted tw-mt-3">
+            {t('message.process-pii-sensitive-column-message-profiler')}
+          </p>
+          {processPii && (
+            <>
+              {getSeparator('')}
+              <Typography.Paragraph className="text-grey-muted m-t-0 m-b-xs text-sm">
+                {t('message.confidence-percentage-message')}
+              </Typography.Paragraph>
+              <SliderWithInput
+                value={confidence || 80}
+                onChange={handleConfidenceScore}
+              />
+            </>
+          )}
+        </Field>
+      </Fragment>
     );
   };

View File (AddIngestionState interface, TypeScript)

@@ -135,6 +135,7 @@ export interface AddIngestionState {
   useFqnFilter: boolean;
   processPii: boolean;
   overrideOwner: boolean;
+  confidence?: number;
 }
 
 export enum ShowFilter {

View File (English locale strings)

@@ -910,6 +910,7 @@
   "click-text-to-view-details": "Click <0>{{text}}</0> to view details.",
   "closed-this-task": "closed this task",
   "collaborate-with-other-user": "to collaborate with other users.",
+  "confidence-percentage-message": "Set the confidence level for the NLP model to use when infering whether a column contains PII data or not.",
   "configure-a-service-description": "Enter a unique service name. The name must be unique across the category of services. For e.g., among database services, both MySQL and Snowflake cannot have the same service name (E.g. customer_data). However, different service categories (dashboard, pipeline) can have the same service name. Spaces are not supported in the service name. Characters like - _ are supported. Also, add a description.",
   "configure-airflow": "To set up metadata extraction through UI, you first need to configure and connect to Airflow. For more details visit our",
   "configure-dbt-model-description": "A dbt model provides transformation logic that creates a table from raw data. Lineage traces the path of data across tables, but a dbt model provides specifics. Select the required dbt source provider and fill in the mandatory fields. Integrate with dbt from OpenMetadata to view the models used to generate tables.",

View File (French locale strings)

@@ -910,6 +910,7 @@
   "click-text-to-view-details": "Click <0>{{text}}</0> to view details.",
   "closed-this-task": "closed this task",
   "collaborate-with-other-user": "to collaborate with other users.",
+  "confidence-percentage-message": "Entrer le niveau de confiance à utiliser pour le modèle NLP lorsque celui-ci détermine si un champs contain des données IIP.",
   "configure-a-service-description": "Enter a unique service name. The name must be unique across the category of services. For e.g., among database services, both MySQL and Snowflake cannot have the same service name (E.g. customer_data). However, different service categories (dashboard, pipeline) can have the same service name. Spaces are not supported in the service name. Characters like - _ are supported. Also, add a description.",
   "configure-airflow": "To set up metadata extraction through UI, you first need to configure and connect to Airflow. For more details visit our",
   "configure-dbt-model-description": "A dbt model provides transformation logic that creates a table from raw data. Lineage traces the path of data across tables, but a dbt model provides specifics. Select the required dbt source provider and fill in the mandatory fields. Integrate with dbt from OpenMetadata to view the models used to generate tables.",

View File (Japanese locale strings)

@@ -910,6 +910,7 @@
   "click-text-to-view-details": "Click <0>{{text}}</0> to view details.",
   "closed-this-task": "このタスクを閉じました",
   "collaborate-with-other-user": "to collaborate with other users.",
+  "confidence-percentage-message": "Set the confidence level for the NLP model to use when infering whether a column contains PII data or not.",
   "configure-a-service-description": "Enter a unique service name. The name must be unique across the category of services. For e.g., among database services, both MySQL and Snowflake cannot have the same service name (E.g. customer_data). However, different service categories (dashboard, pipeline) can have the same service name. Spaces are not supported in the service name. Characters like - _ are supported. Also, add a description.",
   "configure-airflow": "To set up metadata extraction through UI, you first need to configure and connect to Airflow. For more details visit our",
   "configure-dbt-model-description": "A dbt model provides transformation logic that creates a table from raw data. Lineage traces the path of data across tables, but a dbt model provides specifics. Select the required dbt source provider and fill in the mandatory fields. Integrate with dbt from OpenMetadata to view the models used to generate tables.",

View File (additional locale strings)

@@ -910,6 +910,7 @@
   "click-text-to-view-details": "Click <0>{{text}}</0> to view details.",
   "closed-this-task": "closed this task",
   "collaborate-with-other-user": "to collaborate with other users.",
+  "confidence-percentage-message": "Set the confidence level for the NLP model to use when infering whether a column contains PII data or not.",
   "configure-a-service-description": "Enter a unique service name. The name must be unique across the category of services. For e.g., among database services, both MySQL and Snowflake cannot have the same service name (E.g. customer_data). However, different service categories (dashboard, pipeline) can have the same service name. Spaces are not supported in the service name. Characters like - _ are supported. Also, add a description.",
   "configure-airflow": "To set up metadata extraction through UI, you first need to configure and connect to Airflow. For more details visit our",
   "configure-dbt-model-description": "A dbt model provides transformation logic that creates a table from raw data. Lineage traces the path of data across tables, but a dbt model provides specifics. Select the required dbt source provider and fill in the mandatory fields. Integrate with dbt from OpenMetadata to view the models used to generate tables.",