feat(ingest): fix validators (#10115)

Co-authored-by: Tamas Nemeth <treff7es@gmail.com>
This commit is contained in:
Harshal Sheth 2024-03-27 15:20:55 -07:00 committed by GitHub
parent 07ef677ad3
commit 25d9d6656c
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
14 changed files with 51 additions and 50 deletions

View File

@@ -3,7 +3,7 @@ from typing import Dict, Optional, Set
 from pydantic import validator
 from pydantic.fields import Field
-from datahub.configuration.common import ConfigModel, ConfigurationError
+from datahub.configuration.common import ConfigModel
 from datahub.configuration.validate_field_deprecation import pydantic_field_deprecated
 from datahub.metadata.schema_classes import FabricTypeClass
@@ -45,7 +45,7 @@ class EnvConfigMixin(ConfigModel):
     @validator("env")
     def env_must_be_one_of(cls, v: str) -> str:
         if v.upper() not in ALL_ENV_TYPES:
-            raise ConfigurationError(f"env must be one of {ALL_ENV_TYPES}, found {v}")
+            raise ValueError(f"env must be one of {ALL_ENV_TYPES}, found {v}")
         return v.upper()

View File

@@ -122,7 +122,7 @@ class BigQueryAuditLogApi:
             )
             for i, entry in enumerate(list_entries):
-                if i % 1000 == 0:
+                if i > 0 and i % 1000 == 0:
                     logger.info(
                         f"Loaded {i} log entries from GCP Log for {client.project}"
                     )

View File

@@ -14,7 +14,7 @@ from okta.models import Group, GroupProfile, User, UserProfile, UserStatus
 from pydantic import validator
 from pydantic.fields import Field
-from datahub.configuration.common import ConfigModel, ConfigurationError
+from datahub.configuration.common import ConfigModel
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
@@ -157,7 +157,7 @@ class OktaConfig(StatefulIngestionConfigBase, ConfigModel):
     @validator("okta_users_search")
     def okta_users_one_of_filter_or_search(cls, v, values):
         if v and values["okta_users_filter"]:
-            raise ConfigurationError(
+            raise ValueError(
                 "Only one of okta_users_filter or okta_users_search can be set"
             )
         return v
@@ -165,7 +165,7 @@ class OktaConfig(StatefulIngestionConfigBase, ConfigModel):
     @validator("okta_groups_search")
     def okta_groups_one_of_filter_or_search(cls, v, values):
         if v and values["okta_groups_filter"]:
-            raise ConfigurationError(
+            raise ValueError(
                 "Only one of okta_groups_filter or okta_groups_search can be set"
             )
         return v

View File

@@ -8,7 +8,7 @@ from pydantic import Field, validator
 from typing_extensions import ClassVar
 from datahub.configuration import ConfigModel
-from datahub.configuration.common import AllowDenyPattern, ConfigurationError
+from datahub.configuration.common import AllowDenyPattern
 from datahub.configuration.source_common import (
     EnvConfigMixin,
     PlatformInstanceConfigMixin,
@@ -59,11 +59,11 @@ class NamingPattern(ConfigModel):
         for v in variables:
             if v not in self.ALLOWED_VARS:
-                raise ConfigurationError(
+                raise ValueError(
                     f"Failed to find {v} in allowed_variables {self.ALLOWED_VARS}"
                 )
         if at_least_one and len(variables) == 0:
-            raise ConfigurationError(
+            raise ValueError(
                 f"Failed to find any variable assigned to pattern {self.pattern}. Must have at least one. {self.allowed_docstring()}"
             )
         return True

View File

@@ -8,11 +8,7 @@ from pydantic.fields import Field
 import datahub.metadata.schema_classes as models
 from datahub.cli.cli_utils import get_aspects_for_entity
-from datahub.configuration.common import (
-    ConfigModel,
-    ConfigurationError,
-    VersionedConfig,
-)
+from datahub.configuration.common import ConfigModel, VersionedConfig
 from datahub.configuration.config_loader import load_config_file
 from datahub.configuration.source_common import EnvConfigMixin
 from datahub.emitter.mce_builder import (
@@ -57,11 +53,19 @@ class EntityConfig(EnvConfigMixin):
     def type_must_be_supported(cls, v: str) -> str:
         allowed_types = ["dataset"]
         if v not in allowed_types:
-            raise ConfigurationError(
+            raise ValueError(
                 f"Type must be one of {allowed_types}, {v} is not yet supported."
             )
         return v
+
+    @validator("name")
+    def validate_name(cls, v: str) -> str:
+        if v.startswith("urn:li:"):
+            raise ValueError(
+                "Name should not start with urn:li: - use a plain name, not an urn"
+            )
+        return v
 class FineGrainedLineageConfig(ConfigModel):
     upstreamType: str = "FIELD_SET"
@@ -79,7 +83,7 @@ class FineGrainedLineageConfig(ConfigModel):
                 FineGrainedLineageUpstreamType.NONE,
             ]
             if v not in allowed_types:
-                raise ConfigurationError(
+                raise ValueError(
                     f"Upstream Type must be one of {allowed_types}, {v} is not yet supported."
                 )
             return v

View File

@@ -7,7 +7,7 @@ from typing import Dict, Iterable, Optional, Tuple
 from pydantic import validator
 from pydantic.fields import Field
-from datahub.configuration.common import ConfigModel, ConfigurationError
+from datahub.configuration.common import ConfigModel
 from datahub.emitter.mce_builder import make_tag_urn
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
@@ -87,9 +87,7 @@ class OpenApiConfig(ConfigModel):
         cls, bearer_token: Optional[str], values: Dict
     ) -> Optional[str]:
         if bearer_token is not None and values.get("token") is not None:
-            raise ConfigurationError(
-                "Unable to use 'token' and 'bearer_token' together."
-            )
+            raise ValueError("Unable to use 'token' and 'bearer_token' together.")
         return bearer_token
     def get_swagger(self) -> Dict:

View File

@@ -2,7 +2,7 @@ import re
 import pydantic
-from datahub.configuration.common import ConfigModel, ConfigurationError
+from datahub.configuration.common import ConfigModel
 # Regexp for sharded tables.
 # A sharded table is a table that has a suffix of the form _yyyymmdd or yyyymmdd, where yyyymmdd is a date.
@@ -38,7 +38,7 @@ class BigQueryBaseConfig(ConfigModel):
         try:
             re.compile(v)
         except Exception as e:
-            raise ConfigurationError(
+            raise ValueError(
                 f"sharded_table_pattern configuration pattern is invalid. The exception was: {e}"
             )
         return v

View File

@@ -2,7 +2,7 @@ from typing import Any, Dict
 import pydantic
-from datahub.configuration.common import ConfigModel, ConfigurationError
+from datahub.configuration.common import ConfigModel
 class CSVEnricherConfig(ConfigModel):
@@ -24,7 +24,7 @@ class CSVEnricherConfig(ConfigModel):
     @pydantic.validator("write_semantics")
     def validate_write_semantics(cls, write_semantics: str) -> str:
         if write_semantics.lower() not in {"patch", "override"}:
-            raise ConfigurationError(
+            raise ValueError(
                 "write_semantics cannot be any other value than PATCH or OVERRIDE. Default value is PATCH. "
                 "For PATCH semantics consider using the datahub-rest sink or "
                 "provide a datahub_api: configuration on your ingestion recipe"
@@ -34,7 +34,7 @@ class CSVEnricherConfig(ConfigModel):
     @pydantic.validator("array_delimiter")
     def validator_diff(cls, array_delimiter: str, values: Dict[str, Any]) -> str:
         if array_delimiter == values["delimiter"]:
-            raise ConfigurationError(
+            raise ValueError(
                 "array_delimiter and delimiter are the same. Please choose different delimiters."
             )
         return array_delimiter

View File

@@ -5,7 +5,7 @@ from typing import Any, Dict, Optional
 import pydantic
 from pydantic.fields import Field
-from datahub.configuration.common import ConfigModel, ConfigurationError
+from datahub.configuration.common import ConfigModel
 logger = logging.getLogger(__name__)
@@ -34,7 +34,7 @@ class OperationConfig(ConfigModel):
             and profile_day_of_week is None
             and profile_date_of_month is None
         ):
-            raise ConfigurationError(
+            raise ValueError(
                 "Lower freq profiling setting is enabled but no day of week or date of month is specified. Profiling will be done.",
             )
         return values
@@ -45,7 +45,7 @@ class OperationConfig(ConfigModel):
         if profile_day_of_week is None:
             return None
         if profile_day_of_week < 0 or profile_day_of_week > 6:
-            raise ConfigurationError(
+            raise ValueError(
                 f"Invalid value {profile_day_of_week} for profile_day_of_week. Must be between 0 to 6 (both inclusive)."
             )
         return profile_day_of_week
@@ -56,7 +56,7 @@ class OperationConfig(ConfigModel):
         if profile_date_of_month is None:
             return None
         if profile_date_of_month < 1 or profile_date_of_month > 31:
-            raise ConfigurationError(
+            raise ValueError(
                 f"Invalid value {profile_date_of_month} for profile_date_of_month. Must be between 1 to 31 (both inclusive)."
             )
         return profile_date_of_month

View File

@@ -4,7 +4,7 @@ from urllib.parse import urlparse
 from pydantic import Field, validator
-from datahub.configuration.common import AllowDenyPattern, ConfigurationError
+from datahub.configuration.common import AllowDenyPattern
 from datahub.configuration.source_common import (
     EnvConfigMixin,
     PlatformInstanceConfigMixin,
@@ -102,7 +102,7 @@ class PulsarSourceConfig(
         cls, token: Optional[str], values: Dict[str, Optional[str]]
     ) -> Optional[str]:
         if token is not None and values.get("issuer_url") is not None:
-            raise ConfigurationError(
+            raise ValueError(
                 "Expected only one authentication method, either issuer_url or token."
             )
         return token
@@ -114,7 +114,7 @@ class PulsarSourceConfig(
         if values.get("issuer_url") is not None and (
             client_secret is None or values.get("client_id") is None
         ):
-            raise ConfigurationError(
+            raise ValueError(
                 "Missing configuration: client_id and client_secret are mandatory when issuer_url is set."
             )
         return client_secret
@@ -125,12 +125,10 @@ class PulsarSourceConfig(
         url = urlparse(val)
         if url.scheme not in ["http", "https"]:
-            raise ConfigurationError(
-                f"Scheme should be http or https, found {url.scheme}"
-            )
+            raise ValueError(f"Scheme should be http or https, found {url.scheme}")
         if not _is_valid_hostname(url.hostname.__str__()):
-            raise ConfigurationError(
+            raise ValueError(
                 f"Not a valid hostname, hostname contains invalid characters, found {url.hostname}"
             )

View File

@@ -8,7 +8,7 @@ from typing import Any, Dict, List, Optional
 import pydantic
 from datahub.configuration import ConfigModel
-from datahub.configuration.common import AllowDenyPattern, ConfigurationError
+from datahub.configuration.common import AllowDenyPattern
 from datahub.configuration.source_common import EnvConfigMixin
 from datahub.configuration.validate_field_removal import pydantic_removed_field
 from datahub.configuration.validate_multiline_string import pydantic_multiline_string
@@ -151,7 +151,7 @@ class BigQueryUsageConfig(BigQueryBaseConfig, EnvConfigMixin, BaseUsageConfig):
     @pydantic.validator("use_exported_bigquery_audit_metadata")
     def use_exported_bigquery_audit_metadata_uses_v2(cls, v, values):
         if v is True and not values["use_v2_audit_metadata"]:
-            raise ConfigurationError(
+            raise ValueError(
                 "To use exported BigQuery audit metadata, you must also use v2 audit metadata"
             )
         return v

View File

@@ -6,7 +6,6 @@ from typing import Any, Dict, List, Tuple
 import pydantic
 import pytest
-from datahub.configuration.common import ConfigurationError
 from datahub.ingestion.source.elastic_search import (
     CollapseUrns,
     ElasticsearchSourceConfig,
@@ -19,7 +18,7 @@ logger = logging.getLogger(__name__)
 def test_elasticsearch_throws_error_wrong_operation_config():
-    with pytest.raises(ConfigurationError):
+    with pytest.raises(pydantic.ValidationError):
         ElasticsearchSourceConfig.parse_obj(
             {
                 "profiling": {

View File

@@ -3,8 +3,8 @@ from typing import List
 import pytest
 import yaml
+from pydantic import ValidationError
-from datahub.configuration.common import ConfigurationError
 from datahub.ingestion.source.metadata.lineage import LineageConfig, _get_lineage_mcp
 from datahub.metadata.schema_classes import FineGrainedLineageClass, UpstreamClass
@@ -196,25 +196,25 @@ def test_basic_lineage_finegrained_upstream_urns(basic_mcp):
 def test_unsupported_entity_type():
     """
     Checks to see how we handle the case of unsupported entity types.
-    If validation is working correctly, it should raise a ConfigurationError
+    If validation is working correctly, it should raise a ValidationError
     """
-    with pytest.raises(ConfigurationError):
+    with pytest.raises(ValidationError):
         unsupported_entity_type_mcp()
 def test_unsupported_upstream_entity_type():
     """
     Checks to see how invalid types work in the upstream node.
-    If validation is working correctly, it should raise a ConfigurationError
+    If validation is working correctly, it should raise a ValidationError
     """
-    with pytest.raises(ConfigurationError):
+    with pytest.raises(ValidationError):
         unsupported_upstream_entity_type_mcp()
 def test_unsupported_entity_env():
     """
     Checks to see how invalid envs work.
-    If validation is working correctly, it should raise a ConfigurationError
+    If validation is working correctly, it should raise a ValidationError
     """
-    with pytest.raises(ConfigurationError):
+    with pytest.raises(ValidationError):
         unsupported_entity_env_mcp()

View File

@@ -4,7 +4,6 @@ from unittest.mock import patch
 import pytest
-from datahub.configuration.common import ConfigurationError
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.source.pulsar import (
@@ -24,6 +23,9 @@ mock_schema_response: Dict[str, Any] = {
 class TestPulsarSourceConfig:
+    # TODO: While these tests work, we really shouldn't be calling pydantic
+    # validator methods directly.
+
     def test_pulsar_source_config_valid_web_service_url(self):
         assert (
             PulsarSourceConfig().web_service_url_scheme_host_port(
@@ -34,7 +36,7 @@ class TestPulsarSourceConfig:
     def test_pulsar_source_config_invalid_web_service_url_scheme(self):
         with pytest.raises(
-            ConfigurationError, match=r"Scheme should be http or https, found ftp"
+            ValueError, match=r"Scheme should be http or https, found ftp"
         ):
             PulsarSourceConfig().web_service_url_scheme_host_port(
                 "ftp://localhost:8080/"
@@ -42,7 +44,7 @@ class TestPulsarSourceConfig:
     def test_pulsar_source_config_invalid_web_service_url_host(self):
         with pytest.raises(
-            ConfigurationError,
+            ValueError,
             match=r"Not a valid hostname, hostname contains invalid characters, found localhost&",
         ):
             PulsarSourceConfig().web_service_url_scheme_host_port(