import dataclasses
import re
import unittest.mock
from abc import ABC, abstractmethod
from enum import auto
from typing import (
    IO,
    TYPE_CHECKING,
    Annotated,
    Any,
    ClassVar,
    Dict,
    List,
    Optional,
    Type,
    TypeVar,
    Union,
    runtime_checkable,
)

import pydantic
import pydantic_core
from cached_property import cached_property
from pydantic import BaseModel, Extra, ValidationError
from pydantic.fields import Field
from typing_extensions import Protocol, Self

from datahub.configuration._config_enum import ConfigEnum as ConfigEnum
from datahub.configuration.pydantic_migration_helpers import PYDANTIC_VERSION_2
from datahub.utilities.dedup_list import deduplicate_list

REDACT_KEYS = {
    "password",
    "token",
    "secret",
    "options",
    "sqlalchemy_uri",
}
REDACT_SUFFIXES = {
    "_password",
    "-password",
    "_secret",
    "-secret",
    "_token",
    "-token",
    "_key",
    "-key",
    "_key_id",
    "-key-id",
}


def _should_redact_key(key: Union[str, int]) -> bool:
    return isinstance(key, str) and (
        key in REDACT_KEYS or any(key.endswith(suffix) for suffix in REDACT_SUFFIXES)
    )


def _redact_value(value: Any) -> Any:
    if isinstance(value, str):
        # If it's just a variable reference, it's ok to show as-is.
        if value.startswith("$"):
            return value
        return "********"
    elif value is None:
        return None
    elif isinstance(value, bool):
        # We don't have any sensitive boolean fields.
        return value
    elif isinstance(value, list) and not value:
        # Empty states are fine.
        return []
    elif isinstance(value, dict) and not value:
        return {}
    else:
        return "********"


def redact_raw_config(obj: Any) -> Any:
    if isinstance(obj, dict):
        return {
            k: _redact_value(v) if _should_redact_key(k) else redact_raw_config(v)
            for k, v in obj.items()
        }
    elif isinstance(obj, list):
        return [redact_raw_config(v) for v in obj]
    else:
        return obj
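

# Illustrative usage sketch (the recipe dict below is made up): redact_raw_config
# walks a nested structure and masks any value whose key matches REDACT_KEYS or
# ends with one of REDACT_SUFFIXES, while leaving "$VAR" references visible.
#
#     redact_raw_config(
#         {
#             "host": "localhost",
#             "password": "hunter2",
#             "api_token": "$API_TOKEN",
#             "options": {"pool_size": 5},
#         }
#     )
#     # -> {"host": "localhost", "password": "********",
#     #     "api_token": "$API_TOKEN", "options": "********"}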


if TYPE_CHECKING:
    AnyType = TypeVar("AnyType")
    HiddenFromDocs = Annotated[AnyType, ...]
else:
    HiddenFromDocs = pydantic.json_schema.SkipJsonSchema

LaxStr = Annotated[str, pydantic.BeforeValidator(lambda v: str(v))]
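

# Illustrative sketch (assumes pydantic v2; the model below is hypothetical):
# HiddenFromDocs drops a field from the generated JSON schema, and LaxStr
# coerces non-string inputs (e.g. ints parsed from YAML) to str.
#
#     class _Example(BaseModel):
#         internal_flag: HiddenFromDocs[bool] = False
#         version: LaxStr = "1"
#
#     _Example.model_validate({"version": 2}).version  # -> "2"
#     "internal_flag" in _Example.model_json_schema().get("properties", {})  # -> False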


@dataclasses.dataclass(frozen=True)
class SupportedSources:
    sources: List[str]

    def __get_pydantic_json_schema__(
        self,
        core_schema: pydantic_core.core_schema.CoreSchema,
        handler: pydantic.GetJsonSchemaHandler,
    ) -> pydantic.json_schema.JsonSchemaValue:
        json_schema = handler(core_schema)
        json_schema.setdefault("schema_extra", {})["supported_sources"] = self.sources
        return json_schema
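

# Illustrative sketch (assumes pydantic v2; the field and source names are made
# up): attaching SupportedSources as Annotated metadata stamps the field's JSON
# schema, which downstream doc generation can read back.
#
#     class _Conn(BaseModel):
#         uri: Annotated[str, SupportedSources(["snowflake", "redshift"])]
#
#     _Conn.model_json_schema()["properties"]["uri"]["schema_extra"]
#     # -> {'supported_sources': ['snowflake', 'redshift']}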


class ConfigModel(BaseModel):
    class Config:
        @staticmethod
        def _schema_extra(schema: Dict[str, Any], model: Type["ConfigModel"]) -> None:
            # We use the custom "hidden_from_docs" attribute to hide fields from the
            # autogenerated docs.
            remove_fields = []
            for key, prop in schema.get("properties", {}).items():
                if prop.get("hidden_from_docs"):
                    remove_fields.append(key)

            for key in remove_fields:
                del schema["properties"][key]

        # This is purely to suppress pydantic's warnings, since this class is used everywhere.
        if PYDANTIC_VERSION_2:
            extra = "forbid"
            ignored_types = (cached_property,)
            json_schema_extra = _schema_extra
        else:
            extra = Extra.forbid
            underscore_attrs_are_private = True
            keep_untouched = (
                cached_property,
            )  # needed to allow cached_property to work. See https://github.com/samuelcolvin/pydantic/issues/1241 for more info.
            schema_extra = _schema_extra

    @classmethod
    def parse_obj_allow_extras(cls, obj: Any) -> Self:
        if PYDANTIC_VERSION_2:
            try:
                with unittest.mock.patch.dict(
                    cls.model_config,  # type: ignore
                    {"extra": "allow"},
                    clear=False,
                ):
                    cls.model_rebuild(force=True)  # type: ignore
                    return cls.parse_obj(obj)
            finally:
                cls.model_rebuild(force=True)  # type: ignore
        else:
            with unittest.mock.patch.object(cls.Config, "extra", pydantic.Extra.allow):
                return cls.parse_obj(obj)
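

# Illustrative example (hypothetical model and input): ConfigModel normally
# rejects unknown keys, but parse_obj_allow_extras temporarily patches the
# model config to accept them.
#
#     class _Strict(ConfigModel):
#         name: str
#
#     _Strict.parse_obj_allow_extras({"name": "a", "unknown": 1})  # parses fine
#     _Strict.parse_obj({"name": "a", "unknown": 1})  # raises ValidationError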


class PermissiveConfigModel(ConfigModel):
    # A permissive config model that allows extra fields.
    # This is useful for cases where we want to strongly type certain fields,
    # but still allow the user to pass in arbitrary fields that we don't care about.
    # It is usually used for argument bags that are passed through to third-party libraries.
    class Config:
        if PYDANTIC_VERSION_2:  # noqa: SIM108
            extra = "allow"
        else:
            extra = Extra.allow
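

# Illustrative example (hypothetical model): extra keys are kept on the instance
# instead of being rejected, which suits pass-through argument bags.
#
#     class _Args(PermissiveConfigModel):
#         timeout: int = 30
#
#     args = _Args.parse_obj({"timeout": 10, "pool_size": 5})
#     (args.timeout, args.pool_size)  # -> (10, 5)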


class ConnectionModel(BaseModel):
    """Represents the config associated with a connection"""

    class Config:
        if PYDANTIC_VERSION_2:  # noqa: SIM108
            extra = "allow"
        else:
            extra = Extra.allow
            underscore_attrs_are_private = True


class TransformerSemantics(ConfigEnum):
    """Describes semantics for aspect changes"""

    OVERWRITE = auto()  # Apply changes blindly
    PATCH = auto()  # Only apply differences from what exists already on the server


class TransformerSemanticsConfigModel(ConfigModel):
    semantics: TransformerSemantics = TransformerSemantics.OVERWRITE
    replace_existing: bool = False


class DynamicTypedConfig(ConfigModel):
    # Once support for discriminated unions gets merged into Pydantic, we can
    # simplify this configuration and validation.
    # See https://github.com/samuelcolvin/pydantic/pull/2336.
    type: str = Field(
        description="The type of the dynamic object",
    )
    # This config type is declared Optional[Any] here. The eventual parser for the
    # specified type is responsible for further validation.
    config: Optional[Any] = Field(
        default=None,
        description="The configuration required for initializing the state provider. Default: The datahub_api config if set at pipeline level. Otherwise, the default DatahubClientConfig. See the defaults (https://github.com/datahub-project/datahub/blob/master/metadata-ingestion/src/datahub/ingestion/graph/client.py#L19).",
    )
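

# Illustrative example (the "type" value below is hypothetical): a dynamic
# config is just a registry key plus an opaque config blob; whichever class
# "type" resolves to validates the nested config later.
#
#     DynamicTypedConfig.parse_obj(
#         {"type": "datahub", "config": {"server": "http://localhost:8080"}}
#     )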


# TODO: Many of these exception types are fairly specialized and shouldn't live in a common module.
class MetaError(Exception):
    """A base class for all meta exceptions."""


class PipelineExecutionError(MetaError):
    """An error occurred when executing the pipeline."""


class GraphError(MetaError):
    """An error in communicating with the DataHub Graph."""


class OperationalError(GraphError):
    """A GraphError with extra debug annotations."""

    message: str
    info: dict

    def __init__(self, message: str, info: Optional[dict] = None):
        self.message = message
        self.info = info or {}


class ConfigurationError(MetaError):
    """A configuration error."""


class IgnorableError(MetaError):
    """An error that can be ignored."""


class TraceTimeoutError(OperationalError):
    """Failure to complete an API Trace within the timeout."""


class TraceValidationError(OperationalError):
    """Failure to complete the expected write operation."""


@runtime_checkable
class ExceptionWithProps(Protocol):
    def get_telemetry_props(self) -> Dict[str, Any]: ...


def should_show_stack_trace(exc: Exception) -> bool:
    # Unless the exception is a ValidationError or explicitly opts out of stack traces,
    # we should show the stack trace.
    if isinstance(exc, ValidationError) or isinstance(exc.__cause__, ValidationError):
        return False
    return getattr(exc, "SHOW_STACK_TRACE", True)
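

# Illustrative example (hypothetical exception class): an exception can opt out
# of stack traces by setting SHOW_STACK_TRACE = False; ValidationErrors are
# suppressed automatically.
#
#     class _UserFacingError(Exception):
#         SHOW_STACK_TRACE = False
#
#     should_show_stack_trace(_UserFacingError("bad input"))  # -> False
#     should_show_stack_trace(RuntimeError("boom"))  # -> True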


class ConfigurationWarning(Warning):
    """A configuration warning."""


class ConfigurationMechanism(ABC):
    @abstractmethod
    def load_config(self, config_fp: IO) -> dict:
        pass


class AllowDenyPattern(ConfigModel):
    """A class to store allow/deny regexes."""

    # This regex is used to check if a given rule is a regex expression or a literal.
    # Note that this is not a perfect check. For example, the '.' character should
    # be considered a regex special character, but it's used frequently in literal
    # patterns and hence we allow it anyway.
    IS_SIMPLE_REGEX: ClassVar = re.compile(r"^[A-Za-z0-9 _.-]+$")

    allow: List[str] = Field(
        default=[".*"],
        description="List of regex patterns to include in ingestion.",
    )
    deny: List[str] = Field(
        default=[],
        description="List of regex patterns to exclude from ingestion.",
    )
    ignoreCase: Optional[bool] = Field(
        default=True,
        description="Whether to ignore case during pattern matching.",
    )  # Name comparisons should default to ignoring case

    @property
    def regex_flags(self) -> int:
        return re.IGNORECASE if self.ignoreCase else 0

    @classmethod
    def allow_all(cls) -> "AllowDenyPattern":
        return AllowDenyPattern()

    def allowed(self, string: str) -> bool:
        if self.denied(string):
            return False
        return any(
            re.match(allow_pattern, string, self.regex_flags)
            for allow_pattern in self.allow
        )

    def denied(self, string: str) -> bool:
        for deny_pattern in self.deny:
            if re.match(deny_pattern, string, self.regex_flags):
                return True
        return False

    def is_fully_specified_allow_list(self) -> bool:
        """
        If the allow patterns are literals and not full regexes, then it is considered
        fully specified. This is useful if you want to convert a 'list + filter'
        pattern into a 'search for the ones that are allowed' pattern, which can be
        much more efficient in some cases.
        """
        return all(
            self.IS_SIMPLE_REGEX.match(allow_pattern) for allow_pattern in self.allow
        )

    def get_allowed_list(self) -> List[str]:
        """Return the allowed strings as a list, after applying deny patterns."""
        if not self.is_fully_specified_allow_list():
            raise ValueError(
                "allow list must be fully specified to get list of allowed strings"
            )
        return [a for a in self.allow if not self.denied(a)]

    def __eq__(self, other):  # type: ignore
        return isinstance(other, self.__class__) and self.__dict__ == other.__dict__
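

# Illustrative example (patterns below are made up): deny rules take precedence
# over allow rules, and matching ignores case by default.
#
#     pattern = AllowDenyPattern(allow=[r"schema1\..*"], deny=[r".*\.tmp_.*"])
#     pattern.allowed("schema1.users")  # -> True
#     pattern.allowed("SCHEMA1.events")  # -> True (ignoreCase defaults to True)
#     pattern.allowed("schema1.tmp_staging")  # -> False (denied)
#     pattern.allowed("schema2.users")  # -> False (not in the allow list)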


class KeyValuePattern(ConfigModel):
    """
    The key-value pattern is used to map a regex pattern to a set of values.
    For example, you can use it to map a table name to a list of tags to apply to it.
    """

    rules: Dict[str, List[str]] = {".*": []}
    first_match_only: bool = Field(
        default=True,
        description="Whether to stop after the first match. If false, all matching rules will be applied.",
    )

    @classmethod
    def all(cls) -> "KeyValuePattern":
        return KeyValuePattern()

    def value(self, string: str) -> List[str]:
        matching_keys = [key for key in self.rules if re.match(key, string)]
        if not matching_keys:
            return []
        elif self.first_match_only:
            return self.rules[matching_keys[0]]
        else:
            return deduplicate_list(
                [v for key in matching_keys for v in self.rules[key]]
            )
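

# Illustrative example (the rules below are made up): rule keys are regexes
# matched against the input; with first_match_only=False, values from every
# matching rule are concatenated and deduplicated.
#
#     tags = KeyValuePattern(
#         rules={r"sales\..*": ["pii"], r".*\.orders": ["finance"]},
#         first_match_only=False,
#     )
#     tags.value("sales.orders")  # -> ["pii", "finance"]
#     tags.value("analytics.events")  # -> []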


class VersionedConfig(ConfigModel):
    version: LaxStr = "1"
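

# Illustrative example: LaxStr lets numeric versions (common in YAML) coerce
# cleanly to strings.
#
#     VersionedConfig.parse_obj({"version": 1}).version  # -> "1"
#     VersionedConfig.parse_obj({"version": "1.1"}).version  # -> "1.1"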