datahub/metadata-ingestion/tests/unit/test_classification.py
ethan-cartwright dfb2f7e596
feat(classifier): Add support for excluding list of exact column names (#9472)
Co-authored-by: Ethan Cartwright <ethan.cartwright@acryl.io>
2024-01-16 19:12:23 -08:00

229 lines
7.7 KiB
Python

import pytest
from pydantic import ValidationError
from datahub.ingestion.glossary.classifier import ClassificationConfig
from datahub.ingestion.glossary.datahub_classifier import (
DataHubClassifier,
DataHubClassifierConfig,
)
def test_default_classification_config():
    """Check the first classifier of a ClassificationConfig built with only ``enabled=True``.

    The first entry is expected to have type ``"datahub"`` and no attached
    config.
    """
    first_classifier = ClassificationConfig(enabled=True).classifiers[0]

    assert first_classifier.type == "datahub"
    assert first_classifier.config is None
def test_default_datahub_classifier_config():
    """Check that ``DataHubClassifier.create(config_dict=None)`` yields an ``info_types_config``."""
    classifier = DataHubClassifier.create(config_dict=None)
    resolved_config = classifier.config

    assert resolved_config.info_types_config is not None
def test_selective_datahub_classifier_config_override():
    """Check classifier configs built from partial overrides.

    Two cases are exercised:

    * overriding only ``confidence_level_threshold`` still leaves
      ``info_types_config`` populated;
    * overriding only ``Prediction_Factors_and_Weights`` for ``"Age"`` keeps
      the supplied weights, while ``Age.Name``, ``Age.Values`` and the
      ``"Email_Address"`` entry are all still present.
    """
    # Case 1: only the threshold is overridden.
    threshold_only = DataHubClassifier.create(
        config_dict={"confidence_level_threshold": 0.7}
    )
    assert threshold_only.config.info_types_config is not None

    # Case 2: the threshold plus the factor weights of a single info type.
    age_weights = {
        "Name": 0.5,
        "Description": 0,
        "Datatype": 0,
        "Values": 0.5,
    }
    override_dict = {
        "confidence_level_threshold": 0.7,
        "info_types_config": {
            "Age": {
                "Prediction_Factors_and_Weights": age_weights,
            },
        },
    }
    weights_only = DataHubClassifier.create(config_dict=override_dict).config

    info_types = weights_only.info_types_config
    assert info_types is not None

    age = info_types["Age"]
    assert age.Prediction_Factors_and_Weights.Name == 0.5
    assert age.Prediction_Factors_and_Weights.Values == 0.5
    assert age.Name is not None
    assert age.Values is not None
    assert info_types["Email_Address"] is not None
def test_custom_info_type_config():
    """Check that a user-defined info type is accepted next to the existing ones.

    A ``"CloudRegion"`` info type is supplied with factor weights, name
    regexes and a regex-based values predictor. The resulting config must
    keep the supplied weights and must still have truthy ``"Age"`` and
    ``"Email_Address"`` entries.
    """
    cloud_region = {
        "Prediction_Factors_and_Weights": {
            "Name": 0.5,
            "Description": 0,
            "Datatype": 0,
            "Values": 0.5,
        },
        "Name": {
            "regex": [
                ".*region.*id",
                ".*cloud.*region.*",
            ]
        },
        "Values": {
            "prediction_type": "regex",
            "regex": [
                r"(af|ap|ca|eu|me|sa|us)-(central|north|(north(?:east|west))|south|south(?:east|west)|east|west)-\d+"
            ],
        },
    }
    classifier = DataHubClassifier.create(
        config_dict={
            "confidence_level_threshold": 0.7,
            "info_types_config": {
                "CloudRegion": cloud_region,
            },
        }
    )
    resolved = classifier.config

    assert resolved.info_types_config
    info_types = resolved.info_types_config

    region_weights = info_types["CloudRegion"].Prediction_Factors_and_Weights
    assert region_weights.Name == 0.5
    assert (
        info_types["CloudRegion"].Prediction_Factors_and_Weights.Values == 0.5
    )

    # The info types that were not supplied here must still be present.
    assert info_types["Age"]
    assert info_types["Email_Address"]
def test_incorrect_custom_info_type_config():
    """Check that malformed custom info types fail validation.

    Two invalid ``"CloudRegion"`` definitions are parsed with
    ``DataHubClassifierConfig.parse_obj``:

    * one that gives ``"Values"`` a weight of 0.5 but has no ``"Values"``
      section, expected to raise a ``ValidationError`` matching
      "Missing Configuration for Prediction Factor";
    * one whose ``"Values"`` section uses ``prediction_type`` ``"library"``,
      expected to raise a ``ValidationError`` matching
      "Invalid Prediction Type".
    """

    def build_payload(**extra_sections):
        # Return a brand-new dict tree on every call so the two cases never
        # share any object that gets handed to parse_obj.
        cloud_region = {
            "Prediction_Factors_and_Weights": {
                "Name": 0.5,
                "Description": 0,
                "Datatype": 0,
                "Values": 0.5,
            },
            "Name": {
                "regex": [
                    ".*region.*id",
                    ".*cloud.*region.*",
                ]
            },
        }
        cloud_region.update(extra_sections)
        return {
            "confidence_level_threshold": 0.7,
            "info_types_config": {
                "CloudRegion": cloud_region,
            },
        }

    # Case 1: the "Values" factor carries weight but has no configuration.
    with pytest.raises(
        ValidationError, match="Missing Configuration for Prediction Factor"
    ):
        DataHubClassifierConfig.parse_obj(build_payload())

    # Case 2: "Values" is configured with the "library" prediction type.
    with pytest.raises(ValidationError, match="Invalid Prediction Type"):
        DataHubClassifierConfig.parse_obj(
            build_payload(
                Values={"prediction_type": "library", "library": ["spacy"]}
            )
        )
def test_exclude_name_config():
    """Check that an ``ExcludeName`` list given for ``"Email_Address"`` is kept on the resulting config."""
    excluded_columns = ["email_sent", "email_received"]
    email_address = {
        "Prediction_Factors_and_Weights": {
            "Name": 1,
            "Description": 0,
            "Datatype": 0,
            "Values": 0,
        },
        "ExcludeName": excluded_columns,
        "Name": {
            "regex": [
                "^.*mail.*id.*$",
                "^.*id.*mail.*$",
                "^.*mail.*add.*$",
                "^.*add.*mail.*$",
                "email",
                "mail",
            ]
        },
        "Description": {"regex": []},
        "Datatype": {"type": ["str"]},
        "Values": {"prediction_type": "regex", "regex": [], "library": []},
    }
    classifier = DataHubClassifier.create(
        config_dict={
            "confidence_level_threshold": 0.7,
            "info_types_config": {"Email_Address": email_address},
        }
    )
    config = classifier.config

    assert config.info_types_config["Email_Address"].ExcludeName is not None
    # Compare against a fresh literal rather than the list that was passed in.
    assert config.info_types_config["Email_Address"].ExcludeName == [
        "email_sent",
        "email_received",
    ]
def test_no_exclude_name_config():
    """Check that ``ExcludeName`` is ``None`` when ``"Email_Address"`` is configured without it."""
    name_patterns = [
        "^.*mail.*id.*$",
        "^.*id.*mail.*$",
        "^.*mail.*add.*$",
        "^.*add.*mail.*$",
        "email",
        "mail",
    ]
    email_address = {
        "Prediction_Factors_and_Weights": {
            "Name": 1,
            "Description": 0,
            "Datatype": 0,
            "Values": 0,
        },
        "Name": {"regex": name_patterns},
        "Description": {"regex": []},
        "Datatype": {"type": ["str"]},
        "Values": {"prediction_type": "regex", "regex": [], "library": []},
    }
    classifier = DataHubClassifier.create(
        config_dict={
            "confidence_level_threshold": 0.7,
            "info_types_config": {"Email_Address": email_address},
        }
    )
    config = classifier.config

    assert config.info_types_config["Email_Address"].ExcludeName is None