feat(classification): configurable minimum values threshold (#8186)

2025-12-24 16:38:19 +00:00 · 2023-06-08 09:58:13 +05:30 · 2023-06-08 09:58:13 +05:30 · ac06cf3d3f
commit ac06cf3d3f
parent 6813f4af1a
5 changed files with 111 additions and 43 deletions
--- a/metadata-ingestion/docs/dev_guides/classification.md
+++ b/metadata-ingestion/docs/dev_guides/classification.md
@ -6,20 +6,20 @@ The classification feature enables sources to be configured to automatically pre

 Note that a `.` is used to denote nested fields in the YAML recipe.

-| Field | Required | Type | Description | Default |
-| ---   | ---      | ---  | --- | -- |
-| enabled |  | boolean | Whether classification should be used to auto-detect glossary terms | False |
-| sample_size |  | int | Number of sample values used for classification. | 100 |
-| info_type_to_term |  | Dict[str,string] | Optional mapping to provide glossary term identifier for info type.  | By default, info type is used as glossary term identifier. |
-| classifiers |  | Array of object | Classifiers to use to auto-detect glossary terms. If more than one classifier, infotype predictions from the classifier defined later in sequence take precedance. | [{'type': 'datahub', 'config': None}] |
-| table_pattern |  | AllowDenyPattern (see below for fields) | Regex patterns to filter tables for classification. This is used in combination with other patterns in parent config. Specify regex to match the entire table name in `database.schema.table` format. e.g. to match all tables starting with customer in Customer database and public schema, use the regex 'Customer.public.customer.*' | {'allow': ['.*'], 'deny': [], 'ignoreCase': True} |
-| table_pattern.allow |  | Array of string | List of regex patterns to include in ingestion | ['.*'] |
-| table_pattern.deny |  | Array of string | List of regex patterns to exclude from ingestion. | [] |
-| table_pattern.ignoreCase |  | boolean | Whether to ignore case sensitivity during pattern matching. | True |
-| column_pattern |  | AllowDenyPattern (see below for fields) | Regex patterns to filter columns for classification. This is used in combination with other patterns in parent config. Specify regex to match the column name in `database.schema.table.column` format. | {'allow': ['.*'], 'deny': [], 'ignoreCase': True} |
-| column_pattern.allow |  | Array of string | List of regex patterns to include in ingestion | ['.*'] |
-| column_pattern.deny |  | Array of string | List of regex patterns to exclude from ingestion. | [] |
-| column_pattern.ignoreCase |  | boolean | Whether to ignore case sensitivity during pattern matching. | True |
+| Field                     | Required | Type                                    | Description                                                                                                                                                                                                                                                                                                                              | Default                                                    |
+| ------------------------- | -------- | --------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------- |
+| enabled                   |          | boolean                                 | Whether classification should be used to auto-detect glossary terms                                                                                                                                                                                                                                                                      | False                                                      |
+| sample_size               |          | int                                     | Number of sample values used for classification.                                                                                                                                                                                                                                                                                         | 100                                                        |
+| info_type_to_term         |          | Dict[str,string]                        | Optional mapping to provide glossary term identifier for info type.                                                                                                                                                                                                                                                                      | By default, info type is used as glossary term identifier. |
+| classifiers               |          | Array of object                         | Classifiers to use to auto-detect glossary terms. If more than one classifier, infotype predictions from the classifier defined later in sequence take precedence.                                                                                                                       | [{'type': 'datahub', 'config': None}]                      |
+| table_pattern             |          | AllowDenyPattern (see below for fields) | Regex patterns to filter tables for classification. This is used in combination with other patterns in parent config. Specify regex to match the entire table name in `database.schema.table` format. e.g. to match all tables starting with customer in Customer database and public schema, use the regex 'Customer.public.customer.*' | {'allow': ['.*'], 'deny': [], 'ignoreCase': True}          |
+| table_pattern.allow       |          | Array of string                         | List of regex patterns to include in ingestion                                                                                                                                                                                                                                                                                           | ['.*']                                                     |
+| table_pattern.deny        |          | Array of string                         | List of regex patterns to exclude from ingestion.                                                                                                                                                                                                                                                                                        | []                                                         |
+| table_pattern.ignoreCase  |          | boolean                                 | Whether to ignore case sensitivity during pattern matching.                                                                                                                                                                                                                                                                              | True                                                       |
+| column_pattern            |          | AllowDenyPattern (see below for fields) | Regex patterns to filter columns for classification. This is used in combination with other patterns in parent config. Specify regex to match the column name in `database.schema.table.column` format.                                                                                                                                  | {'allow': ['.*'], 'deny': [], 'ignoreCase': True}          |
+| column_pattern.allow      |          | Array of string                         | List of regex patterns to include in ingestion                                                                                                                                                                                                                                                                                           | ['.*']                                                     |
+| column_pattern.deny       |          | Array of string                         | List of regex patterns to exclude from ingestion.                                                                                                                                                                                                                                                                                        | []                                                         |
+| column_pattern.ignoreCase |          | boolean                                 | Whether to ignore case sensitivity during pattern matching.                                                                                                                                                                                                                                                                              | True                                                       |

 ## DataHub Classifier

@ -27,22 +27,39 @@ DataHub Classifier is the default classifier implementation, which uses [acryl-d

 ### Config Details

-| Field | Required | Type | Description | Default |
-| ---   | ---      | ---  | --- | -- |
-| confidence_level_threshold |  | number |  | 0.68 |
-| info_types |  | list[string] | List of infotypes to be predicted. By default, all supported infotypes are considered. If specified. this should be subset of `['Email_Address', 'Gender', 'Credit_Debit_Card_Number', 'Phone_Number', 'Street_Address', 'Full_Name', 'Age', 'IBAN', 'US_Social_Security_Number', 'Vehicle_Identification_Number', 'IP_Address_v4', 'IP_Address_v6', 'US_Driving_License_Number', 'Swift_Code']` | None |
-| info_types_config | Configuration details for infotypes | Dict[str, InfoTypeConfig] |  | See [reference_input.py](https://github.com/acryldata/datahub-classify/blob/main/datahub-classify/src/datahub_classify/reference_input.py) for default configuration. |
-| info_types_config.`key`.prediction_factors_and_weights | ❓ (required if info_types_config.`key` is set) | Dict[str,number] | Factors and their weights to consider when predicting info types |  |
-| info_types_config.`key`.name |  | NameFactorConfig (see below for fields) |  |  |
-| info_types_config.`key`.name.regex |  | Array of string | List of regex patterns the column name follows for the info type | ['.*'] |
-| info_types_config.`key`.description |  | DescriptionFactorConfig (see below for fields) |  |  |
-| info_types_config.`key`.description.regex |  | Array of string | List of regex patterns the column description follows for the info type | ['.*'] |
-| info_types_config.`key`.datatype |  | DataTypeFactorConfig (see below for fields) |  |  |
-| info_types_config.`key`.datatype.type |  | Array of string | List of data types for the info type | ['.*'] |
-| info_types_config.`key`.values |  | ValuesFactorConfig (see below for fields) |  |  |
-| info_types_config.`key`.values.prediction_type | ❓ (required if info_types_config.`key`.values is set) | string |  | None |
-| info_types_config.`key`.values.regex |  | Array of string | List of regex patterns the column value follows for the info type | None |
-| info_types_config.`key`.values.library |  | Array of string | Library used for prediction | None |
+| Field                                                  | Required                                              | Type                                           | Description                                                                                                                                               | Default                                                                                                                                                               |
+| ------------------------------------------------------ | ----------------------------------------------------- | ---------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| confidence_level_threshold                             |                                                       | number                                         |                                                                                                                                                           | 0.68                                                                                                                                                                  |
+| info_types                                             |                                                       | list[string]                                   | List of infotypes to be predicted. By default, all supported infotypes are considered, along with any custom infotypes configured in `info_types_config`. | None                                                                                                                                                                  |
+| info_types_config                                      |                                                       | Dict[str, InfoTypeConfig]                      | Configuration details for infotypes                                                                                                                       | See [reference_input.py](https://github.com/acryldata/datahub-classify/blob/main/datahub-classify/src/datahub_classify/reference_input.py) for default configuration. |
+| info_types_config.`key`.prediction_factors_and_weights | ❓ (required if info_types_config.`key` is set)        | Dict[str,number]                               | Factors and their weights to consider when predicting info types                                                                                          |                                                                                                                                                                       |
+| info_types_config.`key`.name                           |                                                       | NameFactorConfig (see below for fields)        |                                                                                                                                                           |                                                                                                                                                                       |
+| info_types_config.`key`.name.regex                     |                                                       | Array of string                                | List of regex patterns the column name follows for the info type                                                                                          | ['.*']                                                                                                                                                                |
+| info_types_config.`key`.description                    |                                                       | DescriptionFactorConfig (see below for fields) |                                                                                                                                                           |                                                                                                                                                                       |
+| info_types_config.`key`.description.regex              |                                                       | Array of string                                | List of regex patterns the column description follows for the info type                                                                                   | ['.*']                                                                                                                                                                |
+| info_types_config.`key`.datatype                       |                                                       | DataTypeFactorConfig (see below for fields)    |                                                                                                                                                           |                                                                                                                                                                       |
+| info_types_config.`key`.datatype.type                  |                                                       | Array of string                                | List of data types for the info type                                                                                                                      | ['.*']                                                                                                                                                                |
+| info_types_config.`key`.values                         |                                                       | ValuesFactorConfig (see below for fields)      |                                                                                                                                                           |                                                                                                                                                                       |
+| info_types_config.`key`.values.prediction_type         | ❓ (required if info_types_config.`key`.values is set) | string                                         |                                                                                                                                                           | None                                                                                                                                                                  |
+| info_types_config.`key`.values.regex                   |                                                       | Array of string                                | List of regex patterns the column value follows for the info type                                                                                         | None                                                                                                                                                                  |
+| info_types_config.`key`.values.library                 |                                                       | Array of string                                | Library used for prediction                                                                                                                               | None                                                                                                                                                                  |
+| minimum_values_threshold                               |                                                       | number                                         | Minimum number of non-null column values required to process `values` prediction factor.                                                                  | 50                                                                                                                                                                    |
+
+### Supported infotypes
+- `Email_Address`
+- `Gender`
+- `Credit_Debit_Card_Number`
+- `Phone_Number`
+- `Street_Address`
+- `Full_Name`
+- `Age`
+- `IBAN`
+- `US_Social_Security_Number`
+- `Vehicle_Identification_Number`
+- `IP_Address_v4`
+- `IP_Address_v6`
+- `US_Driving_License_Number`
+- `Swift_Code`

 ### Supported sources

@ -74,7 +91,7 @@ source:
        - type: datahub          
 ```

-#### Example with Advanced Configuration: Specifying custom info_types_config
+#### Example with Advanced Configuration: Customizing configuration for supported info types

 ```yml
 source:
@ -377,3 +394,44 @@ source:
                    - rule_based_logic

 ```
+
+
+#### Example with Advanced Configuration: Specifying custom info type
+
+```yml
+source:
+  type: snowflake
+  config:
+    env: PROD
+    # Coordinates
+    account_id: account_name
+    warehouse: "COMPUTE_WH"
+
+    # Credentials
+    username: user
+    password: pass
+    role: "sysadmin"
+
+    # Options
+    top_n_queries: 10
+    email_domain: mycompany.com
+
+    classification:
+      enabled: True
+      classifiers:
+        - type: datahub          
+          config:
+            confidence_level_threshold: 0.7
+            minimum_values_threshold: 10
+            info_types_config:
+              CloudRegion:
+                prediction_factors_and_weights:
+                  name: 0
+                  description: 0
+                  datatype: 0
+                  values: 1
+                values:
+                  prediction_type: regex
+                  regex:
+                    - "(af|ap|ca|eu|me|sa|us)-(central|north|(north(?:east|west))|south|south(?:east|west)|east|west)-\\d+"
+                  library: []
--- a/metadata-ingestion/setup.py
+++ b/metadata-ingestion/setup.py
@ -193,7 +193,7 @@ snowflake_common = {
    "pandas",
    "cryptography",
    "msal",
-    "acryl-datahub-classify==0.0.7",
+    "acryl-datahub-classify==0.0.8",
    # spacy version restricted to reduce backtracking, used by acryl-datahub-classify,
    "spacy==3.4.3",
 }
--- a/metadata-ingestion/src/datahub/ingestion/glossary/datahub_classifier.py
+++ b/metadata-ingestion/src/datahub/ingestion/glossary/datahub_classifier.py
@ -92,13 +92,18 @@ class DataHubClassifierConfig(ConfigModel):
    info_types: Optional[List[str]] = Field(
        default=None,
        init=False,
-        description=f"List of infotypes to be predicted. By default, all supported infotypes are considered. If specified. this should be subset of {list(default_config.keys())}.",
+        description="List of infotypes to be predicted. By default, all supported infotypes are considered, along with any custom infotypes configured in `info_types_config`.",
    )
    info_types_config: Dict[str, InfoTypeConfig] = Field(
        default=DEFAULT_CLASSIFIER_CONFIG,
        init=False,
        description="Configuration details for infotypes. See [reference_input.py](https://github.com/acryldata/datahub-classify/blob/main/datahub-classify/src/datahub_classify/reference_input.py) for default configuration.",
    )
+    minimum_values_threshold: int = Field(
+        default=50,
+        init=False,
+        description="Minimum number of non-null column values required to process `values` prediction factor.",
+    )

    @validator("info_types_config")
    def input_config_selectively_overrides_default_config(cls, info_types_config):
@ -164,9 +169,12 @@ class DataHubClassifier(Classifier):

    def classify(self, columns: List[ColumnInfo]) -> List[ColumnInfo]:
        columns = predict_infotypes(
-            columns,
-            self.config.confidence_level_threshold,
-            {k: v.dict() for k, v in self.config.info_types_config.items()},
-            self.config.info_types,
+            column_infos=columns,
+            confidence_level_threshold=self.config.confidence_level_threshold,
+            global_config={
+                k: v.dict() for k, v in self.config.info_types_config.items()
+            },
+            infotypes=self.config.info_types,
+            minimum_values_threshold=self.config.minimum_values_threshold,
        )
        return columns
--- a/metadata-ingestion/tests/integration/snowflake/test_snowflake.py
+++ b/metadata-ingestion/tests/integration/snowflake/test_snowflake.py
@ -76,14 +76,15 @@ def test_snowflake_basic(pytestconfig, tmp_path, mock_time, mock_datahub_graph):

        mock_sample_values.return_value = pd.DataFrame(
            data={
-                "col_1": [random.randint(0, 100) for i in range(1, 100)],
-                "col_2": [random_email() for i in range(1, 100)],
-                "col_3": [random_cloud_region() for i in range(1, 100)],
+                "col_1": [random.randint(0, 100) for i in range(20)],
+                "col_2": [random_email() for i in range(20)],
+                "col_3": [random_cloud_region() for i in range(20)],
            }
        )

        datahub_classifier_config = DataHubClassifierConfig()
        datahub_classifier_config.confidence_level_threshold = 0.58
+        datahub_classifier_config.minimum_values_threshold = 10
        datahub_classifier_config.info_types_config = {
            "Age": InfoTypeConfig(
                Prediction_Factors_and_Weights=PredictionFactorsAndWeights(
--- a/metadata-ingestion/tests/integration/snowflake/test_snowflake_legacy_lineage.py
+++ b/metadata-ingestion/tests/integration/snowflake/test_snowflake_legacy_lineage.py
@ -50,14 +50,15 @@ def test_snowflake_basic(pytestconfig, tmp_path, mock_time, mock_datahub_graph):

        mock_sample_values.return_value = pd.DataFrame(
            data={
-                "col_1": [random.randint(0, 100) for i in range(1, 100)],
-                "col_2": [random_email() for i in range(1, 100)],
-                "col_3": [random_cloud_region() for i in range(1, 100)],
+                "col_1": [random.randint(0, 100) for i in range(20)],
+                "col_2": [random_email() for i in range(20)],
+                "col_3": [random_cloud_region() for i in range(20)],
            }
        )

        datahub_classifier_config = DataHubClassifierConfig()
        datahub_classifier_config.confidence_level_threshold = 0.58
+        datahub_classifier_config.minimum_values_threshold = 10
        datahub_classifier_config.info_types_config = {
            "Age": InfoTypeConfig(
                Prediction_Factors_and_Weights=PredictionFactorsAndWeights(