fix(ingest): clarify s3/s3a requirements and platform defaults (#4263)
commit b2b8826118
parent 2a5cf3dd07
```diff
@@ -10,7 +10,7 @@ This source is in **Beta** and under active development. Not yet considered read
 
 ## Setup
 
-To install this plugin, run `pip install 'acryl-datahub[data-lake]'`. Note that because the profiling is run with PySpark, we require Spark 3.0.3 with Hadoop 3.2 to be installed (see [compatibility](#compatibility) for more details).
+To install this plugin, run `pip install 'acryl-datahub[data-lake]'`. Note that because the profiling is run with PySpark, we require Spark 3.0.3 with Hadoop 3.2 to be installed (see [compatibility](#compatibility) for more details). If profiling, make sure that permissions for **s3a://** access are set because Spark and Hadoop use the s3a:// protocol to interface with AWS (schema inference outside of profiling requires s3:// access).
 
 The data lake connector extracts schemas and profiles from a variety of file formats (see below for an exhaustive list).
 Individual files are ingested as tables, and profiles are computed similar to the [SQL profiler](./sql_profiles.md).
```
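As a rough illustration of the s3:// vs. s3a:// distinction described above (a sketch under assumed bucket/key names, not part of the commit): schema inference opens the object directly over s3://, while PySpark-based profiling goes through Hadoop's s3a:// filesystem connector, which needs its own permissions and the Spark 3.0.3 / Hadoop 3.2 setup mentioned in the docs.

```python
from pyspark.sql import SparkSession
from smart_open import open as smart_open

path = "my-bucket/datasets/example.parquet"  # hypothetical bucket/key

# Schema inference: plain S3 object access, so s3:// permissions suffice.
with smart_open(f"s3://{path}", "rb") as f:
    magic = f.read(4)

# Profiling: Spark/Hadoop read the same object through the s3a:// connector,
# which is configured separately (hadoop-aws plus S3A credentials).
spark = SparkSession.builder.getOrCreate()
df = spark.read.parquet(f"s3a://{path}")
print(df.count())
```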
```diff
@@ -94,7 +94,7 @@ Note that a `.` is used to denote nested fields in the YAML recipe.
 | Field               | Required | Default      | Description |
 | ------------------- | -------- | ------------ | ----------- |
 | `env`               |          | `PROD`       | Environment to use in namespace when constructing URNs. |
-| `platform`          | ✅       |              | Platform to use in namespace when constructing URNs. |
+| `platform`          |          | Autodetected | Platform to use in namespace when constructing URNs. If left blank, local paths will correspond to `file` and S3 paths will correspond to `s3`. |
 | `base_path`         | ✅       |              | Path of the base folder to crawl. Unless `schema_patterns` and `profile_patterns` are set, the connector will ingest all files in this folder. |
 | `path_spec`         |          |              | Format string for constructing table identifiers from the relative path. See the above [setup section](#setup) for details. |
 | `use_relative_path` |          | `False`      | Whether to use the relative path when constructing URNs. Has no effect when a `path_spec` is provided. |
```
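To make the new `platform` default concrete, here is a hypothetical pair of recipe `config` blocks (expressed as Python dicts; field names are those from the table above, and the autodetection itself is the pydantic validator added at the end of this diff):

```python
# `platform` may now be omitted; it is inferred from `base_path`.
s3_config = {
    "base_path": "s3://my-bucket/datasets",  # platform resolves to "s3"
}
local_config = {
    "base_path": "/data/warehouse",  # platform would resolve to "file"
    "platform": "file",              # an explicit value is still honored as-is
}
```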
```diff
@@ -1,6 +1,7 @@
 import logging
 import os
 from datetime import datetime
+from enum import Enum
 from math import log10
 from typing import Any, Dict, Iterable, List, Optional
 
```
```diff
@@ -246,7 +247,10 @@ class DataLakeSource(Source):
 
         return cls(config, ctx)
 
-    def read_file_spark(self, file: str) -> Optional[DataFrame]:
+    def read_file_spark(self, file: str, is_aws: bool) -> Optional[DataFrame]:
 
+        if is_aws:
+            file = f"s3a://{file}"
+
         extension = os.path.splitext(file)[1]
 
```
```diff
@@ -297,7 +301,7 @@ class DataLakeSource(Source):
         return df.toDF(*(c.replace(".", "_") for c in df.columns))
 
     def get_table_schema(
-        self, file_path: str, table_name: str
+        self, file_path: str, table_name: str, is_aws: bool
     ) -> Iterable[MetadataWorkUnit]:
 
         data_platform_urn = make_data_platform_urn(self.source_config.platform)
```
```diff
@@ -307,10 +311,6 @@ class DataLakeSource(Source):
 
         dataset_name = os.path.basename(file_path)
 
-        # if no path spec is provided and the file is in S3, then use the S3 path to construct an URN
-        if is_s3_uri(file_path) and self.source_config.path_spec is None:
-            dataset_urn = make_s3_urn(file_path, self.source_config.env)
-
         dataset_snapshot = DatasetSnapshot(
             urn=dataset_urn,
             aspects=[],
```
```diff
@@ -322,13 +322,15 @@ class DataLakeSource(Source):
         )
         dataset_snapshot.aspects.append(dataset_properties)
 
-        if file_path.startswith("s3a://"):
+        if is_aws:
             if self.source_config.aws_config is None:
                 raise ValueError("AWS config is required for S3 file sources")
 
             s3_client = self.source_config.aws_config.get_s3_client()
 
-            file = smart_open(file_path, "rb", transport_params={"client": s3_client})
+            file = smart_open(
+                f"s3://{file_path}", "rb", transport_params={"client": s3_client}
+            )
 
         else:
 
```
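For context, a minimal sketch (not part of the commit) of how smart_open opens an S3 object when handed the connector's scheme-less `bucket/key` string plus an explicit boto3 client, mirroring the new call above; the bucket and key are hypothetical:

```python
import boto3
from smart_open import open as smart_open

file_path = "my-bucket/datasets/foo.parquet"  # stored as "bucket/key", no scheme

# Passing an explicit client lets smart_open reuse the connector's AWS session.
s3_client = boto3.client("s3")

# The s3:// scheme is added only at read time; Spark profiling adds s3a:// instead.
with smart_open(f"s3://{file_path}", "rb", transport_params={"client": s3_client}) as f:
    first_bytes = f.read(16)
```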
```diff
@@ -416,7 +418,7 @@ class DataLakeSource(Source):
         return ".".join(name_components)
 
     def ingest_table(
-        self, full_path: str, relative_path: str
+        self, full_path: str, relative_path: str, is_aws: bool
     ) -> Iterable[MetadataWorkUnit]:
 
         table_name = self.get_table_name(relative_path, full_path)
```
```diff
@@ -425,14 +427,14 @@ class DataLakeSource(Source):
         logger.debug(
             f"Ingesting {full_path}: making table schemas {datetime.now().strftime('%d/%m/%Y %H:%M:%S')}"
         )
-        yield from self.get_table_schema(full_path, table_name)
+        yield from self.get_table_schema(full_path, table_name, is_aws)
 
         # If profiling is not enabled, skip the rest
         if not self.source_config.profiling.enabled:
             return
 
         # read in the whole table with Spark for profiling
-        table = self.read_file_spark(full_path)
+        table = self.read_file_spark(full_path, is_aws)
 
         # if table is not readable, skip
         if table is None:
```
```diff
@@ -513,7 +515,7 @@ class DataLakeSource(Source):
         s3 = self.source_config.aws_config.get_s3_resource()
         bucket = s3.Bucket(plain_base_path.split("/")[0])
 
-        unordered_files = []
+        base_obj_paths = []
 
         for obj in bucket.objects.filter(
             Prefix=plain_base_path.split("/", maxsplit=1)[1]
```
```diff
@@ -534,16 +536,16 @@ class DataLakeSource(Source):
             if self.source_config.ignore_dotfiles and file.startswith("."):
                 continue
 
-            obj_path = f"s3a://{obj.bucket_name}/{obj.key}"
+            base_obj_path = f"{obj.bucket_name}/{obj.key}"
 
-            unordered_files.append(obj_path)
+            base_obj_paths.append(base_obj_path)
 
-        for aws_file in sorted(unordered_files):
+        for aws_file in sorted(base_obj_paths):
 
-            relative_path = "./" + aws_file[len(f"s3a://{plain_base_path}") :]
+            relative_path = "./" + aws_file[len(plain_base_path) :]
 
             # pass in the same relative_path as the full_path for S3 files
-            yield from self.ingest_table(aws_file, relative_path)
+            yield from self.ingest_table(aws_file, relative_path, is_aws=True)
 
     def get_workunits_local(self) -> Iterable[MetadataWorkUnit]:
         for root, dirs, files in os.walk(self.source_config.base_path):
```
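A small worked example (hypothetical bucket and keys) of the path handling above: listed objects are now tracked as scheme-less `bucket/key` strings, the relative path is taken against the stripped base path, and the scheme is only prepended at read time.

```python
# Assumes base_path was "s3://my-bucket/datasets/", stripped of its scheme:
plain_base_path = "my-bucket/datasets/"

# f"{obj.bucket_name}/{obj.key}" for one listed object:
base_obj_path = "my-bucket/datasets/2021/sales.csv"

relative_path = "./" + base_obj_path[len(plain_base_path):]
print(relative_path)  # ./2021/sales.csv

# ingest_table(base_obj_path, relative_path, is_aws=True) then lets
# read_file_spark prepend s3a:// and smart_open prepend s3:// as needed.
```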
```diff
@@ -562,7 +564,7 @@ class DataLakeSource(Source):
                 if not self.source_config.schema_patterns.allowed(full_path):
                     continue
 
-                yield from self.ingest_table(full_path, relative_path)
+                yield from self.ingest_table(full_path, relative_path, is_aws=False)
 
     def get_workunits(self) -> Iterable[MetadataWorkUnit]:
 
```
```diff
@@ -6,14 +6,15 @@ import pydantic
 from datahub.configuration.common import AllowDenyPattern, ConfigModel
 from datahub.emitter.mce_builder import DEFAULT_ENV
 from datahub.ingestion.source.aws.aws_common import AwsSourceConfig
+from datahub.ingestion.source.aws.s3_util import is_s3_uri
 from datahub.ingestion.source.data_lake.profiling import DataLakeProfilerConfig
 
 
 class DataLakeSourceConfig(ConfigModel):
 
     env: str = DEFAULT_ENV
-    platform: str
     base_path: str
+    platform: str = ""  # overwritten by validator below
 
     use_relative_path: bool = False
     ignore_dotfiles: bool = True
```
```diff
@@ -40,6 +41,15 @@ class DataLakeSourceConfig(ConfigModel):
             profiling.allow_deny_patterns = values["profile_patterns"]
         return values
 
+    @pydantic.validator("platform", always=True)
+    def validate_platform(cls, value: str, values: Dict[str, Any]) -> Optional[str]:
+        if value != "":
+            return value
+
+        if is_s3_uri(values["base_path"]):
+            return "s3"
+        return "file"
+
     @pydantic.validator("path_spec", always=True)
     def validate_path_spec(
         cls, value: Optional[str], values: Dict[str, Any]
```
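For reference, a self-contained sketch of the validator pattern added above (pydantic v1 style, matching the `@pydantic.validator` usage in this file). The `is_s3_uri` helper here is only a stand-in for `datahub.ingestion.source.aws.s3_util.is_s3_uri`; the exact prefixes it recognizes are an assumption.

```python
import pydantic


def is_s3_uri(uri: str) -> bool:
    # Stand-in for datahub.ingestion.source.aws.s3_util.is_s3_uri (assumed prefixes).
    return uri.startswith(("s3://", "s3a://", "s3n://"))


class ExampleConfig(pydantic.BaseModel):
    # base_path is declared before platform so it is already available in
    # `values` when the platform validator runs.
    base_path: str
    platform: str = ""  # overwritten by the validator below when left blank

    @pydantic.validator("platform", always=True)
    def validate_platform(cls, value: str, values: dict) -> str:
        if value != "":
            return value  # an explicit platform wins
        return "s3" if is_s3_uri(values["base_path"]) else "file"


print(ExampleConfig(base_path="s3://bucket/prefix").platform)  # s3
print(ExampleConfig(base_path="/tmp/data").platform)           # file
```

This ordering requirement is likely why the commit re-declares `platform` after `base_path` in `DataLakeSourceConfig`: pydantic only exposes previously declared fields through `values`.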