feat(ingest): refactor classification mixin interface, support new info types (#6545)

This commit is contained in:
Mayuri Nehate 2022-11-25 18:48:42 +05:30 committed by GitHub
parent 5fd5866a03
commit 7a8e36d57d
5 changed files with 12 additions and 266 deletions

View File

@ -29,8 +29,8 @@ DataHub Classifier is the default classifier implementation, which uses [acryl-d
| Field | Required | Type | Description | Default |
| --- | --- | --- | --- | --- |
| confidence_level_threshold | | number | | 0.6 |
| info_types | | list[string] | List of infotypes to be predicted. By default, all supported infotypes are considered. If specified, this should be a subset of ['Email_Address', 'Gender', 'Credit_Debit_Card_Number', 'Phone_Number', 'Street_Address', 'Full_Name', 'Age'] | None |
| info_types_config | | Dict[str, InfoTypeConfig] | Configuration details for infotypes | See [datahub_classifier.py](../../../metadata-ingestion/src/datahub/ingestion/glossary/datahub_classifier.py) |
| info_types | | list[string] | List of infotypes to be predicted. By default, all supported infotypes are considered. If specified, this should be a subset of ['Email_Address', 'Gender', 'Credit_Debit_Card_Number', 'Phone_Number', 'Street_Address', 'Full_Name', 'Age', 'IBAN', 'US_Social_Security_Number', 'Vehicle_Identification_Number', 'IP_Address_v4', 'IP_Address_v6', 'US_Driving_License_Number', 'Swift_Code'] | None |
| info_types_config | | Dict[str, InfoTypeConfig] | Configuration details for infotypes | See [reference_input.py](https://github.com/acryldata/datahub-classify/blob/main/datahub-classify/src/datahub_classify/reference_input.py) for default configuration. |
| info_types_config.`key`.prediction_factors_and_weights | ❓ (required if info_types_config.`key` is set) | Dict[str,number] | Factors and their weights to consider when predicting info types | |
| info_types_config.`key`.name | | NameFactorConfig (see below for fields) | | |
| info_types_config.`key`.name.regex | | Array of string | List of regex patterns the column name follows for the info type | ['.*'] |
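For illustration, a hedged sketch of how the options documented above might be assembled in Python. The lowercase keys follow the aliases shown in the table; the exact validation behaviour (for example, merging a partial override with the packaged defaults) is an assumption, not part of this commit.

```python
# Hedged sketch, not from this commit: constructing the documented classifier
# options in Python. Field names follow the table above; merge/validation
# behaviour is assumed.
from datahub.ingestion.glossary.datahub_classifier import DataHubClassifierConfig

classifier_config = DataHubClassifierConfig.parse_obj(
    {
        "confidence_level_threshold": 0.7,
        # Restrict prediction to a subset of the supported info types.
        "info_types": ["Email_Address", "Phone_Number", "IBAN"],
        # Override only the column-name regexes for a single info type.
        "info_types_config": {
            "Email_Address": {
                "prediction_factors_and_weights": {
                    "Name": 0.4,
                    "Description": 0,
                    "Datatype": 0,
                    "Values": 0.6,
                },
                "name": {"regex": ["^.*mail.*$", "email"]},
            }
        },
    }
)
print(classifier_config.confidence_level_threshold)
```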

View File

@ -177,7 +177,7 @@ snowflake_common = {
"pandas",
"cryptography",
"msal",
"acryl-datahub-classify>=0.0.3",
"acryl-datahub-classify>=0.0.4",
# spacy version restricted to reduce backtracking, used by acryl-datahub-classify,
"spacy==3.4.3",
}

View File

@ -1,8 +1,7 @@
import logging
from typing import List, Optional
from typing import Dict, List, Optional
from datahub_classify.helper_classes import ColumnInfo, Metadata
from pandas import DataFrame
from typing_extensions import Protocol
from datahub.configuration.common import ConfigurationError
@ -106,7 +105,7 @@ class ClassificationMixin:
self: ClassificationSourceProtocol,
dataset_name: str,
schema_metadata: SchemaMetadata,
sample_data: DataFrame,
sample_data: Dict[str, list],
) -> None:
assert self.config.classification
@ -130,8 +129,8 @@ class ClassificationMixin:
"Dataset_Name": dataset_name,
}
),
values=sample_data[field.fieldPath].values
if field.fieldPath in sample_data.columns
values=sample_data[field.fieldPath]
if field.fieldPath in sample_data.keys()
else [],
)
)
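For context, a standalone sketch (not part of the diff) of the refactored contract: `sample_data` is now a plain mapping of column name to sampled values, so the lookup above no longer goes through pandas. Column names and values below are made up.

```python
from typing import Dict

# Illustrative sample data; a real source pulls this from the warehouse.
sample_data: Dict[str, list] = {
    "email": ["alice@example.com", "bob@example.com"],
    "age": [34, 27],
}


def values_for(field_path: str, data: Dict[str, list]) -> list:
    # Mirrors the mixin's behaviour above: plain dict lookup with an
    # empty-list fallback for columns that were not sampled.
    return data[field_path] if field_path in data else []


assert values_for("email", sample_data) == ["alice@example.com", "bob@example.com"]
assert values_for("not_sampled", sample_data) == []
```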

View File

@ -2,269 +2,13 @@ from typing import Any, Dict, List, Optional
from datahub_classify.helper_classes import ColumnInfo
from datahub_classify.infotype_predictor import predict_infotypes
from datahub_classify.reference_input import input1 as default_config
from pydantic.class_validators import root_validator
from pydantic.fields import Field
from datahub.configuration.common import ConfigModel
from datahub.ingestion.glossary.classifier import Classifier
default_config = {
"Email_Address": {
"Prediction_Factors_and_Weights": {
"Name": 0.4,
"Description": 0,
"Datatype": 0,
"Values": 0.6,
},
"Name": {
"regex": [
"^.*mail.*id.*$",
"^.*id.*mail.*$",
"^.*mail.*add.*$",
"^.*add.*mail.*$",
"email",
"mail",
]
},
"Description": {
"regex": ["^.*mail.*id.*$", "^.*mail.*add.*$", "email", "mail"]
},
"Datatype": {"type": ["str"]},
"Values": {
"prediction_type": "regex",
"regex": ["[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Z|a-z]{2,}"],
"library": [],
},
},
"Gender": {
"Prediction_Factors_and_Weights": {
"Name": 0.4,
"Description": 0,
"Datatype": 0,
"Values": 0.6,
},
"Name": {"regex": ["^.*gender.*$", "^.*sex.*$", "gender", "sex"]},
"Description": {"regex": ["^.*gender.*$", "^.*sex.*$", "gender", "sex"]},
"Datatype": {"type": ["int", "str"]},
"Values": {
"prediction_type": "regex",
"regex": ["male", "female", "man", "woman", "m", "f", "w", "men", "women"],
"library": [],
},
},
"Credit_Debit_Card_Number": {
"Prediction_Factors_and_Weights": {
"Name": 0.4,
"Description": 0,
"Datatype": 0,
"Values": 0.6,
},
"Name": {
"regex": [
"^.*card.*number.*$",
"^.*number.*card.*$",
"^.*credit.*card.*$",
"^.*debit.*card.*$",
]
},
"Description": {
"regex": [
"^.*card.*number.*$",
"^.*number.*card.*$",
"^.*credit.*card.*$",
"^.*debit.*card.*$",
]
},
"Datatype": {"type": ["str", "int"]},
"Values": {
"prediction_type": "regex",
"regex": [
"^4[0-9]{12}(?:[0-9]{3})?$",
"^(?:5[1-5][0-9]{2}|222[1-9]|22[3-9][0-9]|2[3-6][0-9]{2}|27[01][0-9]|2720)[0-9]{12}$",
"^3[47][0-9]{13}$",
"^3(?:0[0-5]|[68][0-9])[0-9]{11}$",
"^6(?:011|5[0-9]{2})[0-9]{12}$",
"^(?:2131|1800|35\\d{3})\\d{11}$",
"^(6541|6556)[0-9]{12}$",
"^389[0-9]{11}$",
"^63[7-9][0-9]{13}$",
"^9[0-9]{15}$",
"^(6304|6706|6709|6771)[0-9]{12,15}$",
"^(5018|5020|5038|6304|6759|6761|6763)[0-9]{8,15}$",
"^(62[0-9]{14,17})$",
"^(?:4[0-9]{12}(?:[0-9]{3})?|5[1-5][0-9]{14})$",
"^(4903|4905|4911|4936|6333|6759)[0-9]{12}|(4903|4905|4911|4936|6333|6759)[0-9]{14}|(4903|4905|4911|4936|6333|6759)[0-9]{15}|564182[0-9]{10}|564182[0-9]{12}|564182[0-9]{13}|633110[0-9]{10}|633110[0-9]{12}|633110[0-9]{13}$",
"^(6334|6767)[0-9]{12}|(6334|6767)[0-9]{14}|(6334|6767)[0-9]{15}$",
],
"library": [],
},
},
"Phone_Number": {
"Prediction_Factors_and_Weights": {
"Name": 0.4,
"Description": 0,
"Datatype": 0,
"Values": 0.6,
},
"Name": {
"regex": [
".*phone.*(num|no).*",
".*(num|no).*phone.*",
".*[^a-z]+ph[^a-z]+.*(num|no).*",
".*(num|no).*[^a-z]+ph[^a-z]+.*",
".*mobile.*(num|no).*",
".*(num|no).*mobile.*",
".*telephone.*(num|no).*",
".*(num|no).*telephone.*",
".*cell.*(num|no).*",
".*(num|no).*cell.*",
".*contact.*(num|no).*",
".*(num|no).*contact.*",
".*landline.*(num|no).*",
".*(num|no).*landline.*",
".*fax.*(num|no).*",
".*(num|no).*fax.*",
"phone",
"telephone",
"landline",
"mobile",
"tel",
"fax",
"cell",
"contact",
]
},
"Description": {
"regex": [
".*phone.*(num|no).*",
".*(num|no).*phone.*",
".*[^a-z]+ph[^a-z]+.*(num|no).*",
".*(num|no).*[^a-z]+ph[^a-z]+.*",
".*mobile.*(num|no).*",
".*(num|no).*mobile.*",
".*telephone.*(num|no).*",
".*(num|no).*telephone.*",
".*cell.*(num|no).*",
".*(num|no).*cell.*",
".*contact.*(num|no).*",
".*(num|no).*contact.*",
".*landline.*(num|no).*",
".*(num|no).*landline.*",
".*fax.*(num|no).*",
".*(num|no).*fax.*",
"phone",
"telephone",
"landline",
"mobile",
"tel",
"fax",
"cell",
"contact",
]
},
"Datatype": {"type": ["int", "str"]},
"Values": {
"prediction_type": "library",
"regex": [],
"library": ["phonenumbers"],
},
},
"Street_Address": {
"Prediction_Factors_and_Weights": {
"Name": 0.5,
"Description": 0,
"Datatype": 0,
"Values": 0.5,
},
"Name": {
"regex": [
".*street.*add.*",
".*add.*street.*",
".*full.*add.*",
".*add.*full.*",
".*mail.*add.*",
".*add.*mail.*",
"add[^a-z]+",
"address",
"street",
]
},
"Description": {
"regex": [
".*street.*add.*",
".*add.*street.*",
".*full.*add.*",
".*add.*full.*",
".*mail.*add.*",
".*add.*mail.*",
"add[^a-z]+",
"address",
"street",
]
},
"Datatype": {"type": ["str"]},
"Values": {"prediction_type": "library", "regex": [], "library": ["spacy"]},
},
"Full_Name": {
"Prediction_Factors_and_Weights": {
"Name": 0.3,
"Description": 0,
"Datatype": 0,
"Values": 0.7,
},
"Name": {
"regex": [
".*person.*name.*",
".*name.*person.*",
".*user.*name.*",
".*name.*user.*",
".*full.*name.*",
".*name.*full.*",
"fullname",
"name",
"person",
"user",
]
},
"Description": {
"regex": [
".*person.*name.*",
".*name.*person.*",
".*user.*name.*",
".*name.*user.*",
".*full.*name.*",
".*name.*full.*",
"fullname",
"name",
"person",
"user",
]
},
"Datatype": {"type": ["str"]},
"Values": {"prediction_type": "library", "regex": [], "library": ["spacy"]},
},
"Age": {
"Prediction_Factors_and_Weights": {
"Name": 0.65,
"Description": 0,
"Datatype": 0,
"Values": 0.35,
},
"Name": {
"regex": ["age[^a-z]+.*", ".*[^a-z]+age", ".*[^a-z]+age[^a-z]+.*", "age"]
},
"Description": {
"regex": ["age[^a-z]+.*", ".*[^a-z]+age", ".*[^a-z]+age[^a-z]+.*", "age"]
},
"Datatype": {"type": ["int"]},
"Values": {
"prediction_type": "library",
"regex": [],
"library": ["rule_based_logic"],
},
},
}
class NameFactorConfig(ConfigModel):
regex: List[str] = Field(
@ -322,6 +66,7 @@ class InfoTypeConfig(ConfigModel):
Values: Optional[ValuesFactorConfig] = Field(alias="values")
# TODO: Generate Classification doc (classification.md) from python source.
class DataHubClassifierConfig(ConfigModel):
confidence_level_threshold: float = Field(
default=0.6,
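The inlined defaults removed above now come from the acryl-datahub-classify package via the `input1` import. Assuming the packaged `input1` keeps the same nesting as the removed dict, it can still be inspected directly; the snippet below is illustrative and not part of the diff.

```python
# Illustrative only: reading the packaged defaults that replace the dict
# removed above. Assumes input1 keeps the same nesting as the removed dict.
from datahub_classify.reference_input import input1 as default_config

# e.g. the default column-name regexes used for the Email_Address info type
email_name_regexes = default_config["Email_Address"]["Name"]["regex"]
print(email_name_regexes)
```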

View File

@ -747,6 +747,8 @@ class SnowflakeV2Source(
foreignKeys=foreign_keys,
)
# TODO: classification is only run for snowflake tables.
# Should we run classification for snowflake views as well?
if isinstance(
table, SnowflakeTable
) and self.is_classification_enabled_for_table(dataset_name):
@ -756,7 +758,7 @@ class SnowflakeV2Source(
]
logger.debug(f"Classifying Table {dataset_name}")
self.classify_schema_fields(
dataset_name, schema_metadata, table.sample_data
dataset_name, schema_metadata, table.sample_data.to_dict(orient="list")
)
return schema_metadata
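A small standalone example (not from this commit) of what `to_dict(orient="list")` produces, i.e. the `Dict[str, list]` shape that `classify_schema_fields` now expects; the sample table is made up.

```python
import pandas as pd

# Made-up sample table; the real sample_data comes from Snowflake profiling.
sample_df = pd.DataFrame({"FIRST_NAME": ["Alice", "Bob"], "AGE": [34, 27]})

# to_dict(orient="list") yields the plain column -> value-list mapping that
# the refactored classify_schema_fields expects.
as_dict = sample_df.to_dict(orient="list")
assert as_dict == {"FIRST_NAME": ["Alice", "Bob"], "AGE": [34, 27]}
```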