feat(ingest): add option for external Spark cluster (#4571)

* Add option for configuring spark cluster manager

Co-authored-by: Ravindra Lanka <rslanka@gmail.com>

Co-authored-by: Ravindra Lanka <rslanka@gmail.com>
This commit is contained in:
Kevin Hu 2022-04-04 18:56:50 -04:00 committed by GitHub
parent 95d6bcd1a0
commit 030d25f0a1
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 6 additions and 0 deletions

View File

@ -112,6 +112,7 @@ Note that a `.` is used to denote nested fields in the YAML recipe.
| `profile_patterns.deny` | | | List of regex patterns for tables to not profile (a table must also be ingested for profiling). Defaults to none. |
| `profile_patterns.ignoreCase` | | `True` | Whether to ignore case when matching table names against the profiling patterns. |
| `profiling.enabled` | | `False` | Whether profiling should be done. |
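| `profiling.spark_cluster_manager` | | `None` | Spark master URL of the cluster to run profiling on. When left as `None`, no master is set, which corresponds to running Spark locally. See [Spark docs](https://spark.apache.org/docs/latest/submitting-applications.html#master-urls) for details. |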
| `profiling.profile_table_level_only` | | `False` | Whether to perform profiling at table-level only or include column-level profiling as well. |
| `profiling.max_number_of_fields_to_profile` | | `None` | A positive integer that specifies the maximum number of columns to profile for any table. `None` implies all columns. The cost of profiling goes up significantly as the number of columns to profile goes up. |
| `profiling.include_field_null_count` | | `True` | Whether to profile for the number of nulls for each column. |

View File

@ -179,6 +179,10 @@ class DataLakeSource(Source):
conf = SparkConf()
# spark_cluster_manager is None by default, in which case no master is set (corresponds to local).
if self.source_config.profiling.spark_cluster_manager:
conf.setMaster(self.source_config.profiling.spark_cluster_manager)
conf.set(
"spark.jars.packages",
",".join(

View File

@ -62,6 +62,7 @@ def null_str(value: Any) -> Optional[str]:
class DataLakeProfilerConfig(ConfigModel):
enabled: bool = False
spark_cluster_manager: Optional[str] = None
# These settings will override the ones below.
profile_table_level_only: bool = False