diff --git a/metadata-ingestion/source_docs/data_lake.md b/metadata-ingestion/source_docs/data_lake.md
index a2a702e4e7..6042ca3056 100644
--- a/metadata-ingestion/source_docs/data_lake.md
+++ b/metadata-ingestion/source_docs/data_lake.md
@@ -112,6 +112,7 @@ Note that a `.` is used to denote nested fields in the YAML recipe.
 | `profile_patterns.deny` | | | List of regex patterns for tables to not profile (a must also be ingested for profiling). Defaults to none. |
 | `profile_patterns.ignoreCase` | | `True` | Whether to ignore case sensitivity during pattern matching of tables to profile. |
 | `profiling.enabled` | | `False` | Whether profiling should be done. |
+| `profiling.spark_cluster_manager` | | `None` | Spark master URL. See [Spark docs](https://spark.apache.org/docs/latest/submitting-applications.html#master-urls) for details. |
 | `profiling.profile_table_level_only` | | `False` | Whether to perform profiling at table-level only or include column-level profiling as well. |
 | `profiling.max_number_of_fields_to_profile` | | `None` | A positive integer that specifies the maximum number of columns to profile for any table. `None` implies all columns. The cost of profiling goes up significantly as the number of columns to profile goes up. |
 | `profiling.include_field_null_count` | | `True` | Whether to profile for the number of nulls for each column. |
diff --git a/metadata-ingestion/src/datahub/ingestion/source/data_lake/__init__.py b/metadata-ingestion/src/datahub/ingestion/source/data_lake/__init__.py
index ee3dabbf34..259b36889c 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/data_lake/__init__.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/data_lake/__init__.py
@@ -179,6 +179,10 @@ class DataLakeSource(Source):
 
         conf = SparkConf()
 
+        # None by default, which corresponds to local
+        if self.source_config.profiling.spark_cluster_manager:
+            conf.setMaster(self.source_config.profiling.spark_cluster_manager)
+
         conf.set(
             "spark.jars.packages",
             ",".join(
diff --git a/metadata-ingestion/src/datahub/ingestion/source/data_lake/profiling.py b/metadata-ingestion/src/datahub/ingestion/source/data_lake/profiling.py
index 3c31e2c4df..845fe8f1cb 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/data_lake/profiling.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/data_lake/profiling.py
@@ -62,6 +62,7 @@ def null_str(value: Any) -> Optional[str]:
 
 class DataLakeProfilerConfig(ConfigModel):
     enabled: bool = False
+    spark_cluster_manager: Optional[str] = None
 
     # These settings will override the ones below.
     profile_table_level_only: bool = False
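
For context, below is a minimal recipe sketch showing how the new `profiling.spark_cluster_manager` option would be set. The source type, `base_path`, and the master URL are illustrative placeholders, not values taken from this change; any master URL accepted by the linked Spark docs should work, and omitting the option keeps the default `None` (local execution).

```yml
# Hypothetical data-lake ingestion recipe; base_path and the master URL are placeholders.
source:
  type: data-lake
  config:
    env: PROD
    base_path: /path/to/data
    profiling:
      enabled: true
      # New option added by this change; leave it unset to run Spark locally.
      spark_cluster_manager: spark://my-spark-master:7077

sink:
  type: datahub-rest
  config:
    server: http://localhost:8080
```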