mirror of
https://github.com/datahub-project/datahub.git
synced 2025-12-24 08:28:12 +00:00
feat(ingest): refactor classification mixin interface, support new info types (#6545)
This commit is contained in:
parent
5fd5866a03
commit
7a8e36d57d
@ -29,8 +29,8 @@ DataHub Classifier is the default classifier implementation, which uses [acryl-d
|
||||
| Field | Required | Type | Description | Default |
|
||||
| --- | --- | --- | --- | -- |
|
||||
| confidence_level_threshold | | number | | 0.6 |
|
||||
| info_types | | list[string] | List of infotypes to be predicted. By default, all supported infotypes are considered. If specified, this should be a subset of ['Email_Address', 'Gender', 'Credit_Debit_Card_Number', 'Phone_Number', 'Street_Address', 'Full_Name', 'Age'] | None |
|
||||
| info_types_config | Configuration details for infotypes | Dict[str, InfoTypeConfig] | | See [datahub_classifier.py](../../../metadata-ingestion/src/datahub/ingestion/glossary/datahub_classifier.py) |
|
||||
| info_types | | list[string] | List of infotypes to be predicted. By default, all supported infotypes are considered. If specified, this should be a subset of ['Email_Address', 'Gender', 'Credit_Debit_Card_Number', 'Phone_Number', 'Street_Address', 'Full_Name', 'Age', 'IBAN', 'US_Social_Security_Number', 'Vehicle_Identification_Number', 'IP_Address_v4', 'IP_Address_v6', 'US_Driving_License_Number', 'Swift_Code'] | None |
|
||||
| info_types_config | Configuration details for infotypes | Dict[str, InfoTypeConfig] | | See [reference_input.py](https://github.com/acryldata/datahub-classify/blob/main/datahub-classify/src/datahub_classify/reference_input.py) for default configuration. |
|
||||
| info_types_config.`key`.prediction_factors_and_weights | ❓ (required if info_types_config.`key` is set) | Dict[str,number] | Factors and their weights to consider when predicting info types | |
|
||||
| info_types_config.`key`.name | | NameFactorConfig (see below for fields) | | |
|
||||
| info_types_config.`key`.name.regex | | Array of string | List of regex patterns the column name follows for the info type | ['.*'] |
|
||||
|
||||
@ -177,7 +177,7 @@ snowflake_common = {
|
||||
"pandas",
|
||||
"cryptography",
|
||||
"msal",
|
||||
"acryl-datahub-classify>=0.0.3",
|
||||
"acryl-datahub-classify>=0.0.4",
|
||||
# spacy version restricted to reduce backtracking, used by acryl-datahub-classify,
|
||||
"spacy==3.4.3",
|
||||
}
|
||||
|
||||
@ -1,8 +1,7 @@
|
||||
import logging
|
||||
from typing import List, Optional
|
||||
from typing import Dict, List, Optional
|
||||
|
||||
from datahub_classify.helper_classes import ColumnInfo, Metadata
|
||||
from pandas import DataFrame
|
||||
from typing_extensions import Protocol
|
||||
|
||||
from datahub.configuration.common import ConfigurationError
|
||||
@ -106,7 +105,7 @@ class ClassificationMixin:
|
||||
self: ClassificationSourceProtocol,
|
||||
dataset_name: str,
|
||||
schema_metadata: SchemaMetadata,
|
||||
sample_data: DataFrame,
|
||||
sample_data: Dict[str, list],
|
||||
) -> None:
|
||||
|
||||
assert self.config.classification
|
||||
@ -130,8 +129,8 @@ class ClassificationMixin:
|
||||
"Dataset_Name": dataset_name,
|
||||
}
|
||||
),
|
||||
values=sample_data[field.fieldPath].values
|
||||
if field.fieldPath in sample_data.columns
|
||||
values=sample_data[field.fieldPath]
|
||||
if field.fieldPath in sample_data.keys()
|
||||
else [],
|
||||
)
|
||||
)
|
||||
|
||||
@ -2,269 +2,13 @@ from typing import Any, Dict, List, Optional
|
||||
|
||||
from datahub_classify.helper_classes import ColumnInfo
|
||||
from datahub_classify.infotype_predictor import predict_infotypes
|
||||
from datahub_classify.reference_input import input1 as default_config
|
||||
from pydantic.class_validators import root_validator
|
||||
from pydantic.fields import Field
|
||||
|
||||
from datahub.configuration.common import ConfigModel
|
||||
from datahub.ingestion.glossary.classifier import Classifier
|
||||
|
||||
default_config = {
|
||||
"Email_Address": {
|
||||
"Prediction_Factors_and_Weights": {
|
||||
"Name": 0.4,
|
||||
"Description": 0,
|
||||
"Datatype": 0,
|
||||
"Values": 0.6,
|
||||
},
|
||||
"Name": {
|
||||
"regex": [
|
||||
"^.*mail.*id.*$",
|
||||
"^.*id.*mail.*$",
|
||||
"^.*mail.*add.*$",
|
||||
"^.*add.*mail.*$",
|
||||
"email",
|
||||
"mail",
|
||||
]
|
||||
},
|
||||
"Description": {
|
||||
"regex": ["^.*mail.*id.*$", "^.*mail.*add.*$", "email", "mail"]
|
||||
},
|
||||
"Datatype": {"type": ["str"]},
|
||||
"Values": {
|
||||
"prediction_type": "regex",
|
||||
"regex": ["[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Z|a-z]{2,}"],
|
||||
"library": [],
|
||||
},
|
||||
},
|
||||
"Gender": {
|
||||
"Prediction_Factors_and_Weights": {
|
||||
"Name": 0.4,
|
||||
"Description": 0,
|
||||
"Datatype": 0,
|
||||
"Values": 0.6,
|
||||
},
|
||||
"Name": {"regex": ["^.*gender.*$", "^.*sex.*$", "gender", "sex"]},
|
||||
"Description": {"regex": ["^.*gender.*$", "^.*sex.*$", "gender", "sex"]},
|
||||
"Datatype": {"type": ["int", "str"]},
|
||||
"Values": {
|
||||
"prediction_type": "regex",
|
||||
"regex": ["male", "female", "man", "woman", "m", "f", "w", "men", "women"],
|
||||
"library": [],
|
||||
},
|
||||
},
|
||||
"Credit_Debit_Card_Number": {
|
||||
"Prediction_Factors_and_Weights": {
|
||||
"Name": 0.4,
|
||||
"Description": 0,
|
||||
"Datatype": 0,
|
||||
"Values": 0.6,
|
||||
},
|
||||
"Name": {
|
||||
"regex": [
|
||||
"^.*card.*number.*$",
|
||||
"^.*number.*card.*$",
|
||||
"^.*credit.*card.*$",
|
||||
"^.*debit.*card.*$",
|
||||
]
|
||||
},
|
||||
"Description": {
|
||||
"regex": [
|
||||
"^.*card.*number.*$",
|
||||
"^.*number.*card.*$",
|
||||
"^.*credit.*card.*$",
|
||||
"^.*debit.*card.*$",
|
||||
]
|
||||
},
|
||||
"Datatype": {"type": ["str", "int"]},
|
||||
"Values": {
|
||||
"prediction_type": "regex",
|
||||
"regex": [
|
||||
"^4[0-9]{12}(?:[0-9]{3})?$",
|
||||
"^(?:5[1-5][0-9]{2}|222[1-9]|22[3-9][0-9]|2[3-6][0-9]{2}|27[01][0-9]|2720)[0-9]{12}$",
|
||||
"^3[47][0-9]{13}$",
|
||||
"^3(?:0[0-5]|[68][0-9])[0-9]{11}$",
|
||||
"^6(?:011|5[0-9]{2})[0-9]{12}$",
|
||||
"^(?:2131|1800|35\\d{3})\\d{11}$",
|
||||
"^(6541|6556)[0-9]{12}$",
|
||||
"^389[0-9]{11}$",
|
||||
"^63[7-9][0-9]{13}$",
|
||||
"^9[0-9]{15}$",
|
||||
"^(6304|6706|6709|6771)[0-9]{12,15}$",
|
||||
"^(5018|5020|5038|6304|6759|6761|6763)[0-9]{8,15}$",
|
||||
"^(62[0-9]{14,17})$",
|
||||
"^(?:4[0-9]{12}(?:[0-9]{3})?|5[1-5][0-9]{14})$",
|
||||
"^(4903|4905|4911|4936|6333|6759)[0-9]{12}|(4903|4905|4911|4936|6333|6759)[0-9]{14}|(4903|4905|4911|4936|6333|6759)[0-9]{15}|564182[0-9]{10}|564182[0-9]{12}|564182[0-9]{13}|633110[0-9]{10}|633110[0-9]{12}|633110[0-9]{13}$",
|
||||
"^(6334|6767)[0-9]{12}|(6334|6767)[0-9]{14}|(6334|6767)[0-9]{15}$",
|
||||
],
|
||||
"library": [],
|
||||
},
|
||||
},
|
||||
"Phone_Number": {
|
||||
"Prediction_Factors_and_Weights": {
|
||||
"Name": 0.4,
|
||||
"Description": 0,
|
||||
"Datatype": 0,
|
||||
"Values": 0.6,
|
||||
},
|
||||
"Name": {
|
||||
"regex": [
|
||||
".*phone.*(num|no).*",
|
||||
".*(num|no).*phone.*",
|
||||
".*[^a-z]+ph[^a-z]+.*(num|no).*",
|
||||
".*(num|no).*[^a-z]+ph[^a-z]+.*",
|
||||
".*mobile.*(num|no).*",
|
||||
".*(num|no).*mobile.*",
|
||||
".*telephone.*(num|no).*",
|
||||
".*(num|no).*telephone.*",
|
||||
".*cell.*(num|no).*",
|
||||
".*(num|no).*cell.*",
|
||||
".*contact.*(num|no).*",
|
||||
".*(num|no).*contact.*",
|
||||
".*landline.*(num|no).*",
|
||||
".*(num|no).*landline.*",
|
||||
".*fax.*(num|no).*",
|
||||
".*(num|no).*fax.*",
|
||||
"phone",
|
||||
"telephone",
|
||||
"landline",
|
||||
"mobile",
|
||||
"tel",
|
||||
"fax",
|
||||
"cell",
|
||||
"contact",
|
||||
]
|
||||
},
|
||||
"Description": {
|
||||
"regex": [
|
||||
".*phone.*(num|no).*",
|
||||
".*(num|no).*phone.*",
|
||||
".*[^a-z]+ph[^a-z]+.*(num|no).*",
|
||||
".*(num|no).*[^a-z]+ph[^a-z]+.*",
|
||||
".*mobile.*(num|no).*",
|
||||
".*(num|no).*mobile.*",
|
||||
".*telephone.*(num|no).*",
|
||||
".*(num|no).*telephone.*",
|
||||
".*cell.*(num|no).*",
|
||||
".*(num|no).*cell.*",
|
||||
".*contact.*(num|no).*",
|
||||
".*(num|no).*contact.*",
|
||||
".*landline.*(num|no).*",
|
||||
".*(num|no).*landline.*",
|
||||
".*fax.*(num|no).*",
|
||||
".*(num|no).*fax.*",
|
||||
"phone",
|
||||
"telephone",
|
||||
"landline",
|
||||
"mobile",
|
||||
"tel",
|
||||
"fax",
|
||||
"cell",
|
||||
"contact",
|
||||
]
|
||||
},
|
||||
"Datatype": {"type": ["int", "str"]},
|
||||
"Values": {
|
||||
"prediction_type": "library",
|
||||
"regex": [],
|
||||
"library": ["phonenumbers"],
|
||||
},
|
||||
},
|
||||
"Street_Address": {
|
||||
"Prediction_Factors_and_Weights": {
|
||||
"Name": 0.5,
|
||||
"Description": 0,
|
||||
"Datatype": 0,
|
||||
"Values": 0.5,
|
||||
},
|
||||
"Name": {
|
||||
"regex": [
|
||||
".*street.*add.*",
|
||||
".*add.*street.*",
|
||||
".*full.*add.*",
|
||||
".*add.*full.*",
|
||||
".*mail.*add.*",
|
||||
".*add.*mail.*",
|
||||
"add[^a-z]+",
|
||||
"address",
|
||||
"street",
|
||||
]
|
||||
},
|
||||
"Description": {
|
||||
"regex": [
|
||||
".*street.*add.*",
|
||||
".*add.*street.*",
|
||||
".*full.*add.*",
|
||||
".*add.*full.*",
|
||||
".*mail.*add.*",
|
||||
".*add.*mail.*",
|
||||
"add[^a-z]+",
|
||||
"address",
|
||||
"street",
|
||||
]
|
||||
},
|
||||
"Datatype": {"type": ["str"]},
|
||||
"Values": {"prediction_type": "library", "regex": [], "library": ["spacy"]},
|
||||
},
|
||||
"Full_Name": {
|
||||
"Prediction_Factors_and_Weights": {
|
||||
"Name": 0.3,
|
||||
"Description": 0,
|
||||
"Datatype": 0,
|
||||
"Values": 0.7,
|
||||
},
|
||||
"Name": {
|
||||
"regex": [
|
||||
".*person.*name.*",
|
||||
".*name.*person.*",
|
||||
".*user.*name.*",
|
||||
".*name.*user.*",
|
||||
".*full.*name.*",
|
||||
".*name.*full.*",
|
||||
"fullname",
|
||||
"name",
|
||||
"person",
|
||||
"user",
|
||||
]
|
||||
},
|
||||
"Description": {
|
||||
"regex": [
|
||||
".*person.*name.*",
|
||||
".*name.*person.*",
|
||||
".*user.*name.*",
|
||||
".*name.*user.*",
|
||||
".*full.*name.*",
|
||||
".*name.*full.*",
|
||||
"fullname",
|
||||
"name",
|
||||
"person",
|
||||
"user",
|
||||
]
|
||||
},
|
||||
"Datatype": {"type": ["str"]},
|
||||
"Values": {"prediction_type": "library", "regex": [], "library": ["spacy"]},
|
||||
},
|
||||
"Age": {
|
||||
"Prediction_Factors_and_Weights": {
|
||||
"Name": 0.65,
|
||||
"Description": 0,
|
||||
"Datatype": 0,
|
||||
"Values": 0.35,
|
||||
},
|
||||
"Name": {
|
||||
"regex": ["age[^a-z]+.*", ".*[^a-z]+age", ".*[^a-z]+age[^a-z]+.*", "age"]
|
||||
},
|
||||
"Description": {
|
||||
"regex": ["age[^a-z]+.*", ".*[^a-z]+age", ".*[^a-z]+age[^a-z]+.*", "age"]
|
||||
},
|
||||
"Datatype": {"type": ["int"]},
|
||||
"Values": {
|
||||
"prediction_type": "library",
|
||||
"regex": [],
|
||||
"library": ["rule_based_logic"],
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
class NameFactorConfig(ConfigModel):
|
||||
regex: List[str] = Field(
|
||||
@ -322,6 +66,7 @@ class InfoTypeConfig(ConfigModel):
|
||||
Values: Optional[ValuesFactorConfig] = Field(alias="values")
|
||||
|
||||
|
||||
# TODO: Generate Classification doc (classification.md) from python source.
|
||||
class DataHubClassifierConfig(ConfigModel):
|
||||
confidence_level_threshold: float = Field(
|
||||
default=0.6,
|
||||
|
||||
@ -747,6 +747,8 @@ class SnowflakeV2Source(
|
||||
foreignKeys=foreign_keys,
|
||||
)
|
||||
|
||||
# TODO: classification is only run for snowflake tables.
|
||||
# Should we run classification for snowflake views as well?
|
||||
if isinstance(
|
||||
table, SnowflakeTable
|
||||
) and self.is_classification_enabled_for_table(dataset_name):
|
||||
@ -756,7 +758,7 @@ class SnowflakeV2Source(
|
||||
]
|
||||
logger.debug(f"Classifying Table {dataset_name}")
|
||||
self.classify_schema_fields(
|
||||
dataset_name, schema_metadata, table.sample_data
|
||||
dataset_name, schema_metadata, table.sample_data.to_dict(orient="list")
|
||||
)
|
||||
|
||||
return schema_metadata
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user