feat(ingest): refactor classification mixin interface, support new info types (#6545)

This commit is contained in:
Mayuri Nehate 2022-11-25 18:48:42 +05:30 committed by GitHub
parent 5fd5866a03
commit 7a8e36d57d
5 changed files with 12 additions and 266 deletions

View File

@ -29,8 +29,8 @@ DataHub Classifier is the default classifier implementation, which uses [acryl-d
| Field | Required | Type | Description | Default |
| --- | --- | --- | --- | --- |
| confidence_level_threshold | | number | | 0.6 |
| info_types | | list[string] | List of infotypes to be predicted. By default, all supported infotypes are considered. If specified, this should be a subset of ['Email_Address', 'Gender', 'Credit_Debit_Card_Number', 'Phone_Number', 'Street_Address', 'Full_Name', 'Age'] | None |
| info_types_config | | Dict[str, InfoTypeConfig] | Configuration details for infotypes | See [datahub_classifier.py](../../../metadata-ingestion/src/datahub/ingestion/glossary/datahub_classifier.py) |
| info_types | | list[string] | List of infotypes to be predicted. By default, all supported infotypes are considered. If specified, this should be a subset of ['Email_Address', 'Gender', 'Credit_Debit_Card_Number', 'Phone_Number', 'Street_Address', 'Full_Name', 'Age', 'IBAN', 'US_Social_Security_Number', 'Vehicle_Identification_Number', 'IP_Address_v4', 'IP_Address_v6', 'US_Driving_License_Number', 'Swift_Code'] | None |
| info_types_config | | Dict[str, InfoTypeConfig] | Configuration details for infotypes | See [reference_input.py](https://github.com/acryldata/datahub-classify/blob/main/datahub-classify/src/datahub_classify/reference_input.py) for default configuration. |
| info_types_config.`key`.prediction_factors_and_weights | ❓ (required if info_types_config.`key` is set) | Dict[str,number] | Factors and their weights to consider when predicting info types | |
| info_types_config.`key`.name | | NameFactorConfig (see below for fields) | | |
| info_types_config.`key`.name.regex | | Array of string | List of regex patterns the column name follows for the info type | ['.*'] |
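For illustration, a hedged sketch of how the options documented above might be assembled in Python. The lowercase keys follow the aliases shown in the table; the exact validation behaviour (for example, merging a partial override with the packaged defaults) is an assumption, not part of this commit.

```python
# Hedged sketch, not from this commit: constructing the documented classifier
# options in Python. Field names follow the table above; merge/validation
# behaviour is assumed.
from datahub.ingestion.glossary.datahub_classifier import DataHubClassifierConfig

classifier_config = DataHubClassifierConfig.parse_obj(
    {
        "confidence_level_threshold": 0.7,
        # Restrict prediction to a subset of the supported info types.
        "info_types": ["Email_Address", "Phone_Number", "IBAN"],
        # Override only the column-name regexes for a single info type.
        "info_types_config": {
            "Email_Address": {
                "prediction_factors_and_weights": {
                    "Name": 0.4,
                    "Description": 0,
                    "Datatype": 0,
                    "Values": 0.6,
                },
                "name": {"regex": ["^.*mail.*$", "email"]},
            }
        },
    }
)
print(classifier_config.confidence_level_threshold)
```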

View File

@ -177,7 +177,7 @@ snowflake_common = {
"pandas",
"cryptography",
"msal",
"acryl-datahub-classify>=0.0.3",
"acryl-datahub-classify>=0.0.4",
# spacy version restricted to reduce backtracking, used by acryl-datahub-classify,
"spacy==3.4.3",
}

View File

@ -1,8 +1,7 @@
import logging
from typing import List, Optional
from typing import Dict, List, Optional
from datahub_classify.helper_classes import ColumnInfo, Metadata
from pandas import DataFrame
from typing_extensions import Protocol
from datahub.configuration.common import ConfigurationError
@ -106,7 +105,7 @@ class ClassificationMixin:
self: ClassificationSourceProtocol,
dataset_name: str,
schema_metadata: SchemaMetadata,
sample_data: DataFrame,
sample_data: Dict[str, list],
) -> None:
assert self.config.classification
@ -130,8 +129,8 @@ class ClassificationMixin:
"Dataset_Name": dataset_name,
}
),
values=sample_data[field.fieldPath].values
if field.fieldPath in sample_data.columns
values=sample_data[field.fieldPath]
if field.fieldPath in sample_data.keys()
else [],
)
)
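For context, a standalone sketch (not part of the diff) of the refactored contract: `sample_data` is now a plain mapping of column name to sampled values, so the lookup above no longer goes through pandas. Column names and values below are made up.

```python
from typing import Dict

# Illustrative sample data; a real source pulls this from the warehouse.
sample_data: Dict[str, list] = {
    "email": ["alice@example.com", "bob@example.com"],
    "age": [34, 27],
}


def values_for(field_path: str, data: Dict[str, list]) -> list:
    # Mirrors the mixin's behaviour above: plain dict lookup with an
    # empty-list fallback for columns that were not sampled.
    return data[field_path] if field_path in data else []


assert values_for("email", sample_data) == ["alice@example.com", "bob@example.com"]
assert values_for("not_sampled", sample_data) == []
```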

View File

@ -2,269 +2,13 @@ from typing import Any, Dict, List, Optional
from datahub_classify.helper_classes import ColumnInfo
from datahub_classify.infotype_predictor import predict_infotypes
from datahub_classify.reference_input import input1 as default_config
from pydantic.class_validators import root_validator
from pydantic.fields import Field
from datahub.configuration.common import ConfigModel
from datahub.ingestion.glossary.classifier import Classifier
default_config = {
"Email_Address": {
"Prediction_Factors_and_Weights": {
"Name": 0.4,
"Description": 0,
"Datatype": 0,
"Values": 0.6,
},
"Name": {
"regex": [
"^.*mail.*id.*$",
"^.*id.*mail.*$",
"^.*mail.*add.*$",
"^.*add.*mail.*$",
"email",
"mail",
]
},
"Description": {
"regex": ["^.*mail.*id.*$", "^.*mail.*add.*$", "email", "mail"]
},
"Datatype": {"type": ["str"]},
"Values": {
"prediction_type": "regex",
"regex": ["[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Z|a-z]{2,}"],
"library": [],
},
},
"Gender": {
"Prediction_Factors_and_Weights": {
"Name": 0.4,
"Description": 0,
"Datatype": 0,
"Values": 0.6,
},
"Name": {"regex": ["^.*gender.*$", "^.*sex.*$", "gender", "sex"]},
"Description": {"regex": ["^.*gender.*$", "^.*sex.*$", "gender", "sex"]},
"Datatype": {"type": ["int", "str"]},
"Values": {
"prediction_type": "regex",
"regex": ["male", "female", "man", "woman", "m", "f", "w", "men", "women"],
"library": [],
},
},
"Credit_Debit_Card_Number": {
"Prediction_Factors_and_Weights": {
"Name": 0.4,
"Description": 0,
"Datatype": 0,
"Values": 0.6,
},
"Name": {
"regex": [
"^.*card.*number.*$",
"^.*number.*card.*$",
"^.*credit.*card.*$",
"^.*debit.*card.*$",
]
},
"Description": {
"regex": [
"^.*card.*number.*$",
"^.*number.*card.*$",
"^.*credit.*card.*$",
"^.*debit.*card.*$",
]
},
"Datatype": {"type": ["str", "int"]},
"Values": {
"prediction_type": "regex",
"regex": [
"^4[0-9]{12}(?:[0-9]{3})?$",
"^(?:5[1-5][0-9]{2}|222[1-9]|22[3-9][0-9]|2[3-6][0-9]{2}|27[01][0-9]|2720)[0-9]{12}$",
"^3[47][0-9]{13}$",
"^3(?:0[0-5]|[68][0-9])[0-9]{11}$",
"^6(?:011|5[0-9]{2})[0-9]{12}$",
"^(?:2131|1800|35\\d{3})\\d{11}$",
"^(6541|6556)[0-9]{12}$",
"^389[0-9]{11}$",
"^63[7-9][0-9]{13}$",
"^9[0-9]{15}$",
"^(6304|6706|6709|6771)[0-9]{12,15}$",
"^(5018|5020|5038|6304|6759|6761|6763)[0-9]{8,15}$",
"^(62[0-9]{14,17})$",
"^(?:4[0-9]{12}(?:[0-9]{3})?|5[1-5][0-9]{14})$",
"^(4903|4905|4911|4936|6333|6759)[0-9]{12}|(4903|4905|4911|4936|6333|6759)[0-9]{14}|(4903|4905|4911|4936|6333|6759)[0-9]{15}|564182[0-9]{10}|564182[0-9]{12}|564182[0-9]{13}|633110[0-9]{10}|633110[0-9]{12}|633110[0-9]{13}$",
"^(6334|6767)[0-9]{12}|(6334|6767)[0-9]{14}|(6334|6767)[0-9]{15}$",
],
"library": [],
},
},
"Phone_Number": {
"Prediction_Factors_and_Weights": {
"Name": 0.4,
"Description": 0,
"Datatype": 0,
"Values": 0.6,
},
"Name": {
"regex": [
".*phone.*(num|no).*",
".*(num|no).*phone.*",
".*[^a-z]+ph[^a-z]+.*(num|no).*",
".*(num|no).*[^a-z]+ph[^a-z]+.*",
".*mobile.*(num|no).*",
".*(num|no).*mobile.*",
".*telephone.*(num|no).*",
".*(num|no).*telephone.*",
".*cell.*(num|no).*",
".*(num|no).*cell.*",
".*contact.*(num|no).*",
".*(num|no).*contact.*",
".*landline.*(num|no).*",
".*(num|no).*landline.*",
".*fax.*(num|no).*",
".*(num|no).*fax.*",
"phone",
"telephone",
"landline",
"mobile",
"tel",
"fax",
"cell",
"contact",
]
},
"Description": {
"regex": [
".*phone.*(num|no).*",
".*(num|no).*phone.*",
".*[^a-z]+ph[^a-z]+.*(num|no).*",
".*(num|no).*[^a-z]+ph[^a-z]+.*",
".*mobile.*(num|no).*",
".*(num|no).*mobile.*",
".*telephone.*(num|no).*",
".*(num|no).*telephone.*",
".*cell.*(num|no).*",
".*(num|no).*cell.*",
".*contact.*(num|no).*",
".*(num|no).*contact.*",
".*landline.*(num|no).*",
".*(num|no).*landline.*",
".*fax.*(num|no).*",
".*(num|no).*fax.*",
"phone",
"telephone",
"landline",
"mobile",
"tel",
"fax",
"cell",
"contact",
]
},
"Datatype": {"type": ["int", "str"]},
"Values": {
"prediction_type": "library",
"regex": [],
"library": ["phonenumbers"],
},
},
"Street_Address": {
"Prediction_Factors_and_Weights": {
"Name": 0.5,
"Description": 0,
"Datatype": 0,
"Values": 0.5,
},
"Name": {
"regex": [
".*street.*add.*",
".*add.*street.*",
".*full.*add.*",
".*add.*full.*",
".*mail.*add.*",
".*add.*mail.*",
"add[^a-z]+",
"address",
"street",
]
},
"Description": {
"regex": [
".*street.*add.*",
".*add.*street.*",
".*full.*add.*",
".*add.*full.*",
".*mail.*add.*",
".*add.*mail.*",
"add[^a-z]+",
"address",
"street",
]
},
"Datatype": {"type": ["str"]},
"Values": {"prediction_type": "library", "regex": [], "library": ["spacy"]},
},
"Full_Name": {
"Prediction_Factors_and_Weights": {
"Name": 0.3,
"Description": 0,
"Datatype": 0,
"Values": 0.7,
},
"Name": {
"regex": [
".*person.*name.*",
".*name.*person.*",
".*user.*name.*",
".*name.*user.*",
".*full.*name.*",
".*name.*full.*",
"fullname",
"name",
"person",
"user",
]
},
"Description": {
"regex": [
".*person.*name.*",
".*name.*person.*",
".*user.*name.*",
".*name.*user.*",
".*full.*name.*",
".*name.*full.*",
"fullname",
"name",
"person",
"user",
]
},
"Datatype": {"type": ["str"]},
"Values": {"prediction_type": "library", "regex": [], "library": ["spacy"]},
},
"Age": {
"Prediction_Factors_and_Weights": {
"Name": 0.65,
"Description": 0,
"Datatype": 0,
"Values": 0.35,
},
"Name": {
"regex": ["age[^a-z]+.*", ".*[^a-z]+age", ".*[^a-z]+age[^a-z]+.*", "age"]
},
"Description": {
"regex": ["age[^a-z]+.*", ".*[^a-z]+age", ".*[^a-z]+age[^a-z]+.*", "age"]
},
"Datatype": {"type": ["int"]},
"Values": {
"prediction_type": "library",
"regex": [],
"library": ["rule_based_logic"],
},
},
}
class NameFactorConfig(ConfigModel):
regex: List[str] = Field(
@ -322,6 +66,7 @@ class InfoTypeConfig(ConfigModel):
Values: Optional[ValuesFactorConfig] = Field(alias="values")
# TODO: Generate Classification doc (classification.md) from python source.
class DataHubClassifierConfig(ConfigModel):
confidence_level_threshold: float = Field(
default=0.6,
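The inlined defaults removed above now come from the acryl-datahub-classify package via the `input1` import. Assuming the packaged `input1` keeps the same nesting as the removed dict, it can still be inspected directly; the snippet below is illustrative and not part of the diff.

```python
# Illustrative only: reading the packaged defaults that replace the dict
# removed above. Assumes input1 keeps the same nesting as the removed dict.
from datahub_classify.reference_input import input1 as default_config

# e.g. the default column-name regexes used for the Email_Address info type
email_name_regexes = default_config["Email_Address"]["Name"]["regex"]
print(email_name_regexes)
```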

View File

@ -747,6 +747,8 @@ class SnowflakeV2Source(
foreignKeys=foreign_keys,
)
# TODO: classification is only run for snowflake tables.
# Should we run classification for snowflake views as well?
if isinstance(
table, SnowflakeTable
) and self.is_classification_enabled_for_table(dataset_name):
@ -756,7 +758,7 @@ class SnowflakeV2Source(
]
logger.debug(f"Classifying Table {dataset_name}")
self.classify_schema_fields(
dataset_name, schema_metadata, table.sample_data
dataset_name, schema_metadata, table.sample_data.to_dict(orient="list")
)
return schema_metadata
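A small standalone example (not from this commit) of what `to_dict(orient="list")` produces, i.e. the `Dict[str, list]` shape that `classify_schema_fields` now expects; the sample table is made up.

```python
import pandas as pd

# Made-up sample table; the real sample_data comes from Snowflake profiling.
sample_df = pd.DataFrame({"FIRST_NAME": ["Alice", "Bob"], "AGE": [34, 27]})

# to_dict(orient="list") yields the plain column -> value-list mapping that
# the refactored classify_schema_fields expects.
as_dict = sample_df.to_dict(orient="list")
assert as_dict == {"FIRST_NAME": ["Alice", "Bob"], "AGE": [34, 27]}
```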