mirror of
https://github.com/datahub-project/datahub.git
synced 2025-09-03 06:13:14 +00:00
refactor(ingest): streamline pydantic configs (#6011)
This commit is contained in:
parent
9d0a2dea53
commit
f227bd982b
@ -40,15 +40,12 @@ class BaseTimeWindowConfig(ConfigModel):
|
||||
# `start_time` and `end_time` will be populated by the pre-validators.
|
||||
# However, we must specify a "default" value here or pydantic will complain
|
||||
# if those fields are not set by the user.
|
||||
end_time: datetime = Field(default=None, description="Latest date of usage to consider. Default: Current time in UTC") # type: ignore
|
||||
end_time: datetime = Field(
|
||||
default_factory=lambda: datetime.now(tz=timezone.utc),
|
||||
description="Latest date of usage to consider. Default: Current time in UTC",
|
||||
)
|
||||
start_time: datetime = Field(default=None, description="Earliest date of usage to consider. Default: Last full day in UTC (or hour, depending on `bucket_duration`)") # type: ignore
|
||||
|
||||
@pydantic.validator("end_time", pre=True, always=True)
|
||||
def default_end_time(
|
||||
cls, v: Any, *, values: Dict[str, Any], **kwargs: Any
|
||||
) -> datetime:
|
||||
return v or datetime.now(tz=timezone.utc)
|
||||
|
||||
@pydantic.validator("start_time", pre=True, always=True)
|
||||
def default_start_time(
|
||||
cls, v: Any, *, values: Dict[str, Any], **kwargs: Any
|
||||
|
@ -14,6 +14,7 @@ def pydantic_renamed_field(
|
||||
old_name: str,
|
||||
new_name: str,
|
||||
transform: Callable = _default_rename_transform,
|
||||
print_warning: bool = True,
|
||||
) -> classmethod:
|
||||
def _validate_field_rename(cls: Type, values: dict) -> dict:
|
||||
if old_name in values:
|
||||
@ -22,10 +23,12 @@ def pydantic_renamed_field(
|
||||
f"Cannot specify both {old_name} and {new_name} in the same config. Note that {old_name} has been deprecated in favor of {new_name}."
|
||||
)
|
||||
else:
|
||||
warnings.warn(
|
||||
f"The {old_name} is deprecated, please use {new_name} instead.",
|
||||
UserWarning,
|
||||
)
|
||||
if print_warning:
|
||||
warnings.warn(
|
||||
f"The {old_name} is deprecated, please use {new_name} instead.",
|
||||
UserWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
values[new_name] = transform(values.pop(old_name))
|
||||
return values
|
||||
|
||||
|
@ -291,6 +291,7 @@ class DBTConfig(StatefulIngestionConfigBase):
|
||||
description="When enabled, applies the mappings that are defined through the `query_tag_mapping` directives.",
|
||||
)
|
||||
write_semantics: str = Field(
|
||||
# TODO: Replace with the WriteSemantics enum.
|
||||
default="PATCH",
|
||||
description='Whether the new tags, terms and owners to be added will override the existing ones added only by this source or not. Value for this config can be "PATCH" or "OVERRIDE"',
|
||||
)
|
||||
|
@ -2,17 +2,18 @@ import datetime
|
||||
import json
|
||||
import logging
|
||||
import os.path
|
||||
import pathlib
|
||||
from dataclasses import dataclass, field
|
||||
from enum import auto
|
||||
from io import BufferedReader
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, Iterable, Iterator, Optional, Tuple, Union
|
||||
|
||||
import ijson
|
||||
from pydantic import root_validator, validator
|
||||
from pydantic import validator
|
||||
from pydantic.fields import Field
|
||||
|
||||
from datahub.configuration.common import ConfigEnum, ConfigModel
|
||||
from datahub.configuration.validate_field_rename import pydantic_renamed_field
|
||||
from datahub.ingestion.api.common import PipelineContext
|
||||
from datahub.ingestion.api.decorators import (
|
||||
SupportStatus,
|
||||
@ -43,8 +44,10 @@ class FileReadMode(ConfigEnum):
|
||||
|
||||
|
||||
class FileSourceConfig(ConfigModel):
|
||||
filename: Optional[str] = Field(None, description="Path to file to ingest.")
|
||||
path: str = Field(
|
||||
filename: Optional[str] = Field(
|
||||
None, description="[deprecated in favor or `path`] The file to ingest."
|
||||
)
|
||||
path: pathlib.Path = Field(
|
||||
description="Path to folder or file to ingest. If pointed to a folder, all files with extension {file_extension} (default json) within that folder will be processed."
|
||||
)
|
||||
file_extension: str = Field(
|
||||
@ -61,18 +64,9 @@ class FileSourceConfig(ConfigModel):
|
||||
100 * 1000 * 1000 # Must be at least 100MB before we use streaming mode
|
||||
)
|
||||
|
||||
@root_validator(pre=True)
|
||||
def filename_populates_path_if_present(
|
||||
cls, values: Dict[str, Any]
|
||||
) -> Dict[str, Any]:
|
||||
if "path" not in values and "filename" in values:
|
||||
values["path"] = values["filename"]
|
||||
elif values.get("filename"):
|
||||
raise ValueError(
|
||||
"Both path and filename should not be provided together. Use one. We recommend using path. filename is deprecated."
|
||||
)
|
||||
|
||||
return values
|
||||
_filename_populates_path_if_present = pydantic_renamed_field(
|
||||
"filename", "path", print_warning=False
|
||||
)
|
||||
|
||||
@validator("file_extension", always=True)
|
||||
def add_leading_dot_to_extension(cls, v: str) -> str:
|
||||
@ -179,16 +173,13 @@ class GenericFileSource(TestableSource):
|
||||
return cls(ctx, config)
|
||||
|
||||
def get_filenames(self) -> Iterable[str]:
|
||||
is_file = os.path.isfile(self.config.path)
|
||||
is_dir = os.path.isdir(self.config.path)
|
||||
if is_file:
|
||||
if self.config.path.is_file():
|
||||
self.report.total_num_files = 1
|
||||
return [self.config.path]
|
||||
if is_dir:
|
||||
p = Path(self.config.path)
|
||||
return [str(self.config.path)]
|
||||
elif self.config.path.is_dir():
|
||||
files_and_stats = [
|
||||
(str(x), os.path.getsize(x))
|
||||
for x in list(p.glob(f"*{self.config.file_extension}"))
|
||||
for x in list(self.config.path.glob(f"*{self.config.file_extension}"))
|
||||
if x.is_file()
|
||||
]
|
||||
self.report.total_num_files = len(files_and_stats)
|
||||
|
Loading…
x
Reference in New Issue
Block a user