Mirror of https://github.com/datahub-project/datahub.git (synced 2025-12-03 22:23:37 +00:00)
feat(ingest): add option for external Spark cluster (#4571)
* Add option for configuring spark cluster manager

Co-authored-by: Ravindra Lanka <rslanka@gmail.com>
commit 030d25f0a1
parent 95d6bcd1a0
@@ -112,6 +112,7 @@ Note that a `.` is used to denote nested fields in the YAML recipe.
 | `profile_patterns.deny` | | | List of regex patterns for tables to not profile (a table must also be ingested for profiling). Defaults to none. |
 | `profile_patterns.ignoreCase` | | `True` | Whether to ignore case sensitivity during pattern matching of tables to profile. |
 | `profiling.enabled` | | `False` | Whether profiling should be done. |
+| `profiling.spark_cluster_manager` | | `None` | Spark master URL. See [Spark docs](https://spark.apache.org/docs/latest/submitting-applications.html#master-urls) for details. |
 | `profiling.profile_table_level_only` | | `False` | Whether to perform profiling at table-level only or include column-level profiling as well. |
 | `profiling.max_number_of_fields_to_profile` | | `None` | A positive integer that specifies the maximum number of columns to profile for any table. `None` implies all columns. The cost of profiling goes up significantly as the number of columns to profile goes up. |
 | `profiling.include_field_null_count` | | `True` | Whether to profile for the number of nulls for each column. |
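In a recipe, the new `profiling.spark_cluster_manager` option documented above sits under `profiling` next to the existing flags. A minimal sketch of the source section in dict form (the YAML recipe maps to the same nested structure); the source type name `data_lake` and the master URL are illustrative assumptions, not taken from this diff:

```python
# Illustrative recipe fragment (Python dict mirroring the YAML structure).
# "data_lake" and the master URL below are assumptions for the example.
recipe_source_section = {
    "type": "data_lake",
    "config": {
        "profiling": {
            "enabled": True,
            # New in this change: point profiling at an external Spark cluster
            # instead of the default local mode.
            "spark_cluster_manager": "spark://spark-master.example.internal:7077",
        },
    },
}
```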
@@ -179,6 +179,10 @@ class DataLakeSource(Source):
 
         conf = SparkConf()
 
+        # None by default, which corresponds to local
+        if self.source_config.profiling.spark_cluster_manager:
+            conf.setMaster(self.source_config.profiling.spark_cluster_manager)
+
         conf.set(
             "spark.jars.packages",
             ",".join(
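The hunk above only calls `setMaster` when a cluster manager URL is configured; otherwise the master is left unset and, per the comment in the diff, Spark falls back to local mode. A standalone sketch of the same conditional-master pattern (the helper name and example URL are assumptions):

```python
from typing import Optional

from pyspark import SparkConf
from pyspark.sql import SparkSession


def build_profiling_session(spark_cluster_manager: Optional[str] = None) -> SparkSession:
    """Sketch of the conditional-master pattern from the diff above.

    Only set the master when a cluster manager URL is configured; when it is
    None, the master stays unset and (per the diff's comment) Spark runs locally.
    """
    conf = SparkConf()
    if spark_cluster_manager:
        conf.setMaster(spark_cluster_manager)
    return SparkSession.builder.config(conf=conf).getOrCreate()


# Example (URL is an assumption): attach profiling to a standalone Spark cluster.
# spark = build_profiling_session("spark://spark-master.example.internal:7077")
```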
@@ -62,6 +62,7 @@ def null_str(value: Any) -> Optional[str]:
 class DataLakeProfilerConfig(ConfigModel):
     enabled: bool = False
 
+    spark_cluster_manager: Optional[str] = None
     # These settings will override the ones below.
     profile_table_level_only: bool = False
 
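Because `DataLakeProfilerConfig` is a pydantic-based `ConfigModel`, the new `spark_cluster_manager` field is parsed straight from the recipe and defaults to `None`. A minimal sketch, assuming plain pydantic in place of DataHub's `ConfigModel` base class (class name and values are illustrative):

```python
from typing import Optional

from pydantic import BaseModel


class ProfilerConfigSketch(BaseModel):
    """Stand-in for DataLakeProfilerConfig; field names match the diff above."""

    enabled: bool = False
    spark_cluster_manager: Optional[str] = None
    profile_table_level_only: bool = False


# Unset -> None -> profiling runs against a local Spark master.
print(ProfilerConfigSketch(enabled=True).spark_cluster_manager)  # None

# Set -> forwarded to SparkConf.setMaster() by the source (see previous hunk).
cfg = ProfilerConfigSketch(
    enabled=True,
    spark_cluster_manager="spark://spark-master.example.internal:7077",
)
print(cfg.spark_cluster_manager)
```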