mirror of
https://github.com/datahub-project/datahub.git
synced 2025-12-24 16:38:19 +00:00
feat(classification): configurable minimum values threshold (#8186)
This commit is contained in:
parent
6813f4af1a
commit
ac06cf3d3f
@ -6,20 +6,20 @@ The classification feature enables sources to be configured to automatically pre
|
||||
|
||||
Note that a `.` is used to denote nested fields in the YAML recipe.
|
||||
|
||||
| Field | Required | Type | Description | Default |
|
||||
| --- | --- | --- | --- | -- |
|
||||
| enabled | | boolean | Whether classification should be used to auto-detect glossary terms | False |
|
||||
| sample_size | | int | Number of sample values used for classification. | 100 |
|
||||
| info_type_to_term | | Dict[str,string] | Optional mapping to provide glossary term identifier for info type. | By default, info type is used as glossary term identifier. |
|
||||
| classifiers | | Array of object | Classifiers to use to auto-detect glossary terms. If more than one classifier, infotype predictions from the classifier defined later in sequence take precedance. | [{'type': 'datahub', 'config': None}] |
|
||||
| table_pattern | | AllowDenyPattern (see below for fields) | Regex patterns to filter tables for classification. This is used in combination with other patterns in parent config. Specify regex to match the entire table name in `database.schema.table` format. e.g. to match all tables starting with customer in Customer database and public schema, use the regex 'Customer.public.customer.*' | {'allow': ['.*'], 'deny': [], 'ignoreCase': True} |
|
||||
| table_pattern.allow | | Array of string | List of regex patterns to include in ingestion | ['.*'] |
|
||||
| table_pattern.deny | | Array of string | List of regex patterns to exclude from ingestion. | [] |
|
||||
| table_pattern.ignoreCase | | boolean | Whether to ignore case sensitivity during pattern matching. | True |
|
||||
| column_pattern | | AllowDenyPattern (see below for fields) | Regex patterns to filter columns for classification. This is used in combination with other patterns in parent config. Specify regex to match the column name in `database.schema.table.column` format. | {'allow': ['.*'], 'deny': [], 'ignoreCase': True} |
|
||||
| column_pattern.allow | | Array of string | List of regex patterns to include in ingestion | ['.*'] |
|
||||
| column_pattern.deny | | Array of string | List of regex patterns to exclude from ingestion. | [] |
|
||||
| column_pattern.ignoreCase | | boolean | Whether to ignore case sensitivity during pattern matching. | True |
|
||||
| Field | Required | Type | Description | Default |
|
||||
| ------------------------- | -------- | --------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------- |
|
||||
| enabled | | boolean | Whether classification should be used to auto-detect glossary terms | False |
|
||||
| sample_size | | int | Number of sample values used for classification. | 100 |
|
||||
| info_type_to_term | | Dict[str,string] | Optional mapping to provide glossary term identifier for info type. | By default, info type is used as glossary term identifier. |
|
||||
| classifiers | | Array of object | Classifiers to use to auto-detect glossary terms. If more than one classifier is specified, infotype predictions from the classifier defined later in sequence take precedence. | [{'type': 'datahub', 'config': None}] |
|
||||
| table_pattern | | AllowDenyPattern (see below for fields) | Regex patterns to filter tables for classification. This is used in combination with other patterns in parent config. Specify regex to match the entire table name in `database.schema.table` format. e.g. to match all tables starting with customer in Customer database and public schema, use the regex 'Customer.public.customer.*' | {'allow': ['.*'], 'deny': [], 'ignoreCase': True} |
|
||||
| table_pattern.allow | | Array of string | List of regex patterns to include in ingestion | ['.*'] |
|
||||
| table_pattern.deny | | Array of string | List of regex patterns to exclude from ingestion. | [] |
|
||||
| table_pattern.ignoreCase | | boolean | Whether to ignore case sensitivity during pattern matching. | True |
|
||||
| column_pattern | | AllowDenyPattern (see below for fields) | Regex patterns to filter columns for classification. This is used in combination with other patterns in parent config. Specify regex to match the column name in `database.schema.table.column` format. | {'allow': ['.*'], 'deny': [], 'ignoreCase': True} |
|
||||
| column_pattern.allow | | Array of string | List of regex patterns to include in ingestion | ['.*'] |
|
||||
| column_pattern.deny | | Array of string | List of regex patterns to exclude from ingestion. | [] |
|
||||
| column_pattern.ignoreCase | | boolean | Whether to ignore case sensitivity during pattern matching. | True |
|
||||
|
||||
## DataHub Classifier
|
||||
|
||||
@ -27,22 +27,39 @@ DataHub Classifier is the default classifier implementation, which uses [acryl-d
|
||||
|
||||
### Config Details
|
||||
|
||||
| Field | Required | Type | Description | Default |
|
||||
| --- | --- | --- | --- | -- |
|
||||
| confidence_level_threshold | | number | | 0.68 |
|
||||
| info_types | | list[string] | List of infotypes to be predicted. By default, all supported infotypes are considered. If specified. this should be subset of `['Email_Address', 'Gender', 'Credit_Debit_Card_Number', 'Phone_Number', 'Street_Address', 'Full_Name', 'Age', 'IBAN', 'US_Social_Security_Number', 'Vehicle_Identification_Number', 'IP_Address_v4', 'IP_Address_v6', 'US_Driving_License_Number', 'Swift_Code']` | None |
|
||||
| info_types_config | Configuration details for infotypes | Dict[str, InfoTypeConfig] | | See [reference_input.py](https://github.com/acryldata/datahub-classify/blob/main/datahub-classify/src/datahub_classify/reference_input.py) for default configuration. |
|
||||
| info_types_config.`key`.prediction_factors_and_weights | ❓ (required if info_types_config.`key` is set) | Dict[str,number] | Factors and their weights to consider when predicting info types | |
|
||||
| info_types_config.`key`.name | | NameFactorConfig (see below for fields) | | |
|
||||
| info_types_config.`key`.name.regex | | Array of string | List of regex patterns the column name follows for the info type | ['.*'] |
|
||||
| info_types_config.`key`.description | | DescriptionFactorConfig (see below for fields) | | |
|
||||
| info_types_config.`key`.description.regex | | Array of string | List of regex patterns the column description follows for the info type | ['.*'] |
|
||||
| info_types_config.`key`.datatype | | DataTypeFactorConfig (see below for fields) | | |
|
||||
| info_types_config.`key`.datatype.type | | Array of string | List of data types for the info type | ['.*'] |
|
||||
| info_types_config.`key`.values | | ValuesFactorConfig (see below for fields) | | |
|
||||
| info_types_config.`key`.values.prediction_type | ❓ (required if info_types_config.`key`.values is set) | string | | None |
|
||||
| info_types_config.`key`.values.regex | | Array of string | List of regex patterns the column value follows for the info type | None |
|
||||
| info_types_config.`key`.values.library | | Array of string | Library used for prediction | None |
|
||||
| Field | Required | Type | Description | Default |
|
||||
| ------------------------------------------------------ | ----------------------------------------------------- | ---------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| confidence_level_threshold | | number | | 0.68 |
|
||||
| info_types | | list[string] | List of infotypes to be predicted. By default, all supported infotypes are considered, along with any custom infotypes configured in `info_types_config`. | None |
|
||||
| info_types_config | | Dict[str, InfoTypeConfig] | Configuration details for infotypes | See [reference_input.py](https://github.com/acryldata/datahub-classify/blob/main/datahub-classify/src/datahub_classify/reference_input.py) for default configuration. |
|
||||
| info_types_config.`key`.prediction_factors_and_weights | ❓ (required if info_types_config.`key` is set) | Dict[str,number] | Factors and their weights to consider when predicting info types | |
|
||||
| info_types_config.`key`.name | | NameFactorConfig (see below for fields) | | |
|
||||
| info_types_config.`key`.name.regex | | Array of string | List of regex patterns the column name follows for the info type | ['.*'] |
|
||||
| info_types_config.`key`.description | | DescriptionFactorConfig (see below for fields) | | |
|
||||
| info_types_config.`key`.description.regex | | Array of string | List of regex patterns the column description follows for the info type | ['.*'] |
|
||||
| info_types_config.`key`.datatype | | DataTypeFactorConfig (see below for fields) | | |
|
||||
| info_types_config.`key`.datatype.type | | Array of string | List of data types for the info type | ['.*'] |
|
||||
| info_types_config.`key`.values | | ValuesFactorConfig (see below for fields) | | |
|
||||
| info_types_config.`key`.values.prediction_type | ❓ (required if info_types_config.`key`.values is set) | string | | None |
|
||||
| info_types_config.`key`.values.regex | | Array of string | List of regex patterns the column value follows for the info type | None |
|
||||
| info_types_config.`key`.values.library | | Array of string | Library used for prediction | None |
|
||||
| minimum_values_threshold | | int | Minimum number of non-null column values required to process the `values` prediction factor. | 50 |
|
||||
| |
|
||||
### Supported infotypes
|
||||
- `Email_Address`
|
||||
- `Gender`
|
||||
- `Credit_Debit_Card_Number`
|
||||
- `Phone_Number`
|
||||
- `Street_Address`
|
||||
- `Full_Name`
|
||||
- `Age`
|
||||
- `IBAN`
|
||||
- `US_Social_Security_Number`
|
||||
- `Vehicle_Identification_Number`
|
||||
- `IP_Address_v4`
|
||||
- `IP_Address_v6`
|
||||
- `US_Driving_License_Number`
|
||||
- `Swift_Code`
|
||||
|
||||
### Supported sources
|
||||
|
||||
@ -74,7 +91,7 @@ source:
|
||||
- type: datahub
|
||||
```
|
||||
|
||||
#### Example with Advanced Configuration: Specifying custom info_types_config
|
||||
#### Example with Advanced Configuration: Customizing configuration for supported info types
|
||||
|
||||
```yml
|
||||
source:
|
||||
@ -377,3 +394,44 @@ source:
|
||||
- rule_based_logic
|
||||
|
||||
```
|
||||
|
||||
|
||||
#### Example with Advanced Configuration: Specifying custom info type
|
||||
|
||||
```yml
|
||||
source:
|
||||
type: snowflake
|
||||
config:
|
||||
env: PROD
|
||||
# Coordinates
|
||||
account_id: account_name
|
||||
warehouse: "COMPUTE_WH"
|
||||
|
||||
# Credentials
|
||||
username: user
|
||||
password: pass
|
||||
role: "sysadmin"
|
||||
|
||||
# Options
|
||||
top_n_queries: 10
|
||||
email_domain: mycompany.com
|
||||
|
||||
classification:
|
||||
enabled: True
|
||||
classifiers:
|
||||
- type: datahub
|
||||
config:
|
||||
confidence_level_threshold: 0.7
|
||||
minimum_values_threshold: 10
|
||||
info_types_config:
|
||||
CloudRegion:
|
||||
prediction_factors_and_weights:
|
||||
name: 0
|
||||
description: 0
|
||||
datatype: 0
|
||||
values: 1
|
||||
values:
|
||||
prediction_type: regex
|
||||
regex:
|
||||
- "(af|ap|ca|eu|me|sa|us)-(central|north|(north(?:east|west))|south|south(?:east|west)|east|west)-\\d+"
|
||||
library: []
|
||||
|
||||
@ -193,7 +193,7 @@ snowflake_common = {
|
||||
"pandas",
|
||||
"cryptography",
|
||||
"msal",
|
||||
"acryl-datahub-classify==0.0.7",
|
||||
"acryl-datahub-classify==0.0.8",
|
||||
# spacy version restricted to reduce backtracking, used by acryl-datahub-classify,
|
||||
"spacy==3.4.3",
|
||||
}
|
||||
|
||||
@ -92,13 +92,18 @@ class DataHubClassifierConfig(ConfigModel):
|
||||
info_types: Optional[List[str]] = Field(
|
||||
default=None,
|
||||
init=False,
|
||||
description=f"List of infotypes to be predicted. By default, all supported infotypes are considered. If specified. this should be subset of {list(default_config.keys())}.",
|
||||
description="List of infotypes to be predicted. By default, all supported infotypes are considered, along with any custom infotypes configured in `info_types_config`.",
|
||||
)
|
||||
info_types_config: Dict[str, InfoTypeConfig] = Field(
|
||||
default=DEFAULT_CLASSIFIER_CONFIG,
|
||||
init=False,
|
||||
description="Configuration details for infotypes. See [reference_input.py](https://github.com/acryldata/datahub-classify/blob/main/datahub-classify/src/datahub_classify/reference_input.py) for default configuration.",
|
||||
)
|
||||
minimum_values_threshold: int = Field(
|
||||
default=50,
|
||||
init=False,
|
||||
description="Minimum number of non-null column values required to process `values` prediction factor.",
|
||||
)
|
||||
|
||||
@validator("info_types_config")
|
||||
def input_config_selectively_overrides_default_config(cls, info_types_config):
|
||||
@ -164,9 +169,12 @@ class DataHubClassifier(Classifier):
|
||||
|
||||
def classify(self, columns: List[ColumnInfo]) -> List[ColumnInfo]:
|
||||
columns = predict_infotypes(
|
||||
columns,
|
||||
self.config.confidence_level_threshold,
|
||||
{k: v.dict() for k, v in self.config.info_types_config.items()},
|
||||
self.config.info_types,
|
||||
column_infos=columns,
|
||||
confidence_level_threshold=self.config.confidence_level_threshold,
|
||||
global_config={
|
||||
k: v.dict() for k, v in self.config.info_types_config.items()
|
||||
},
|
||||
infotypes=self.config.info_types,
|
||||
minimum_values_threshold=self.config.minimum_values_threshold,
|
||||
)
|
||||
return columns
|
||||
|
||||
@ -76,14 +76,15 @@ def test_snowflake_basic(pytestconfig, tmp_path, mock_time, mock_datahub_graph):
|
||||
|
||||
mock_sample_values.return_value = pd.DataFrame(
|
||||
data={
|
||||
"col_1": [random.randint(0, 100) for i in range(1, 100)],
|
||||
"col_2": [random_email() for i in range(1, 100)],
|
||||
"col_3": [random_cloud_region() for i in range(1, 100)],
|
||||
"col_1": [random.randint(0, 100) for i in range(20)],
|
||||
"col_2": [random_email() for i in range(20)],
|
||||
"col_3": [random_cloud_region() for i in range(20)],
|
||||
}
|
||||
)
|
||||
|
||||
datahub_classifier_config = DataHubClassifierConfig()
|
||||
datahub_classifier_config.confidence_level_threshold = 0.58
|
||||
datahub_classifier_config.minimum_values_threshold = 10
|
||||
datahub_classifier_config.info_types_config = {
|
||||
"Age": InfoTypeConfig(
|
||||
Prediction_Factors_and_Weights=PredictionFactorsAndWeights(
|
||||
|
||||
@ -50,14 +50,15 @@ def test_snowflake_basic(pytestconfig, tmp_path, mock_time, mock_datahub_graph):
|
||||
|
||||
mock_sample_values.return_value = pd.DataFrame(
|
||||
data={
|
||||
"col_1": [random.randint(0, 100) for i in range(1, 100)],
|
||||
"col_2": [random_email() for i in range(1, 100)],
|
||||
"col_3": [random_cloud_region() for i in range(1, 100)],
|
||||
"col_1": [random.randint(0, 100) for i in range(20)],
|
||||
"col_2": [random_email() for i in range(20)],
|
||||
"col_3": [random_cloud_region() for i in range(20)],
|
||||
}
|
||||
)
|
||||
|
||||
datahub_classifier_config = DataHubClassifierConfig()
|
||||
datahub_classifier_config.confidence_level_threshold = 0.58
|
||||
datahub_classifier_config.minimum_values_threshold = 10
|
||||
datahub_classifier_config.info_types_config = {
|
||||
"Age": InfoTypeConfig(
|
||||
Prediction_Factors_and_Weights=PredictionFactorsAndWeights(
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user