mirror of https://github.com/open-metadata/OpenMetadata.git
synced 2025-09-26 17:34:41 +00:00
MINOR: Update table profile config to add spark configs. Update spark config to add temp path (#22646)

* Update table profile config to add spark configs. Update spark config to add temp path
* Add default null value for sparkTableProfilerConfig
* Fix TableProfileConfig extension DAO query
* Update generated TypeScript types
* Implemented Dependency Injection for the ProfilerProcessorConfig

Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>

This commit is contained in:
parent e098635842
commit f578a81277
@@ -13,6 +13,7 @@ OpenMetadata package initialization.
 """
 from typing import Type
 
+from metadata.profiler.api.models import ProfilerProcessorConfig
 from metadata.profiler.metrics.registry import Metrics
 from metadata.profiler.registry import MetricRegistry
 from metadata.profiler.source.database.base.profiler_resolver import (
@@ -29,3 +30,4 @@ container = DependencyContainer()
 container.register(SourceLoader, DefaultSourceLoader)
 container.register(Type[MetricRegistry], lambda: Metrics)
 container.register(Type[ProfilerResolver], lambda: DefaultProfilerResolver)
+container.register(Type[ProfilerProcessorConfig], lambda: ProfilerProcessorConfig)
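
For context, the new registration binds the ProfilerProcessorConfig class itself (not an instance) to a zero-argument factory in the dependency container. A minimal sketch of that register/resolve pattern, using a toy container rather than OpenMetadata's actual DependencyContainer:

# Toy illustration of the registration pattern above; the real
# DependencyContainer lives in metadata.utils.dependency_injector.
from typing import Any, Callable, Dict, Type


class ToyContainer:
    def __init__(self) -> None:
        self._factories: Dict[Any, Callable[[], Any]] = {}

    def register(self, key: Any, factory: Callable[[], Any]) -> None:
        # Store a zero-argument factory under a hashable key
        # (typing aliases like Type[X] are hashable).
        self._factories[key] = factory

    def resolve(self, key: Any) -> Any:
        return self._factories[key]()


class ProfilerProcessorConfig:  # stand-in for the real pydantic model
    pass


container = ToyContainer()
# Registering the class itself lets deployments swap in a subclass,
# e.g. one carrying spark-specific profiler settings.
container.register(Type[ProfilerProcessorConfig], lambda: ProfilerProcessorConfig)
assert container.resolve(Type[ProfilerProcessorConfig]) is ProfilerProcessorConfig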

@@ -12,7 +12,7 @@
 Profiler Processor Step
 """
 import traceback
-from typing import Optional, cast
+from typing import Optional, Type, cast
 
 from metadata.generated.schema.entity.services.ingestionPipelines.status import (
     StackTraceError,
@@ -31,6 +31,11 @@ from metadata.ingestion.ometa.ometa_api import OpenMetadata
 from metadata.profiler.api.models import ProfilerProcessorConfig, ProfilerResponse
 from metadata.profiler.processor.core import Profiler
 from metadata.profiler.source.model import ProfilerSourceAndEntity
+from metadata.utils.dependency_injector.dependency_injector import (
+    DependencyNotFoundError,
+    Inject,
+    inject,
+)
 
 
 class ProfilerProcessor(Processor):
@@ -39,11 +44,21 @@ class ProfilerProcessor(Processor):
     the OpenMetadataSource and compute the metrics.
     """
 
-    def __init__(self, config: OpenMetadataWorkflowConfig):
+    @inject
+    def __init__(
+        self,
+        config: OpenMetadataWorkflowConfig,
+        profiler_config_class: Inject[Type[ProfilerProcessorConfig]] = None,
+    ):
+        if profiler_config_class is None:
+            raise DependencyNotFoundError(
+                "ProfilerProcessorConfig class not found. Please ensure the ProfilerProcessorConfig is properly registered."
+            )
+
         super().__init__()
 
         self.config = config
-        self.profiler_config = ProfilerProcessorConfig.model_validate(
+        self.profiler_config = profiler_config_class.model_validate(
             self.config.processor.model_dump().get("config")
         )
         self.source_config: DatabaseServiceProfilerPipeline = cast(
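
This @inject pattern repeats in every Python hunk below: the injected parameter defaults to None, the decorator fills it from the container, and a None check guards against a missing registration. A hedged sketch of how such a decorator can work; the real one is in metadata.utils.dependency_injector, and this toy version assumes resolution keyed by the parameter's annotation:

# Assumption-based toy version of an @inject decorator.
import functools
import inspect
from typing import Any, Callable, Dict

REGISTRY: Dict[Any, Callable[[], Any]] = {}  # toy stand-in for the container


def inject(func: Callable) -> Callable:
    sig = inspect.signature(func)

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        bound = sig.bind_partial(*args, **kwargs)
        for name, param in sig.parameters.items():
            # Fill only parameters the caller omitted whose annotation
            # is registered in the container.
            if name not in bound.arguments and param.annotation in REGISTRY:
                kwargs[name] = REGISTRY[param.annotation]()
        return func(*args, **kwargs)

    return wrapper


class ProfilerProcessorConfig:  # stand-in for the real model
    pass


REGISTRY[ProfilerProcessorConfig] = lambda: ProfilerProcessorConfig


@inject
def build(profiler_config_class: ProfilerProcessorConfig = None):
    return profiler_config_class


assert build() is ProfilerProcessorConfig  # supplied by the decorator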

@@ -61,16 +61,23 @@ class ProfilerSource(ProfilerSourceInterface):
     Base class for the profiler source
     """
 
+    @inject
     def __init__(
         self,
         config: OpenMetadataWorkflowConfig,
         database: Database,
         ometa_client: OpenMetadata,
         global_profiler_configuration: ProfilerConfiguration,
+        profiler_config_class: Inject[Type[ProfilerProcessorConfig]] = None,
     ):
+        if profiler_config_class is None:
+            raise DependencyNotFoundError(
+                "ProfilerProcessorConfig class not found. Please ensure the ProfilerProcessorConfig is properly registered."
+            )
+
         self.config = config
         self.service_conn_config = self._copy_service_config(config, database)
-        self.profiler_config = ProfilerProcessorConfig.model_validate(
+        self.profiler_config = profiler_config_class.model_validate(
             config.processor.model_dump().get("config")
         )
         self.ometa_client = ometa_client

@@ -13,7 +13,7 @@ Data Sampler for the PII Workflow
 """
 import traceback
 from copy import deepcopy
-from typing import Optional, cast
+from typing import Optional, Type, cast
 
 from metadata.generated.schema.entity.data.database import Database
 from metadata.generated.schema.entity.data.table import Table
@@ -42,6 +42,11 @@ from metadata.sampler.config import get_config_for_table
 from metadata.sampler.models import SampleConfig, SampleData, SamplerResponse
 from metadata.sampler.sampler_interface import SamplerInterface
 from metadata.utils.bigquery_utils import copy_service_config
+from metadata.utils.dependency_injector.dependency_injector import (
+    DependencyNotFoundError,
+    Inject,
+    inject,
+)
 from metadata.utils.profiler_utils import get_context_entities
 from metadata.utils.service_spec.service_spec import import_sampler_class
 
@@ -49,7 +54,18 @@ from metadata.utils.service_spec.service_spec import import_sampler_class
 class SamplerProcessor(Processor):
     """Use the profiler interface to fetch the sample data"""
 
-    def __init__(self, config: OpenMetadataWorkflowConfig, metadata: OpenMetadata):
+    @inject
+    def __init__(
+        self,
+        config: OpenMetadataWorkflowConfig,
+        metadata: OpenMetadata,
+        profiler_config_class: Inject[Type[ProfilerProcessorConfig]] = None,
+    ):
+        if profiler_config_class is None:
+            raise DependencyNotFoundError(
+                "ProfilerProcessorConfig class not found. Please ensure the ProfilerProcessorConfig is properly registered."
+            )
+
         super().__init__()
 
         self.config = config
@@ -60,7 +76,7 @@ class SamplerProcessor(Processor):
             self.config.source.sourceConfig.config,
         )  # Used to satisfy type checked
         # We still rely on the orm-processor. We should decouple this in the future
-        self.profiler_config = ProfilerProcessorConfig.model_validate(
+        self.profiler_config = profiler_config_class.model_validate(
            self.config.processor.model_dump().get("config")
        )

@@ -44,6 +44,11 @@ from metadata.utils.class_helper import (
     get_service_type_from_source_type,
 )
 from metadata.utils.constants import CUSTOM_CONNECTOR_PREFIX
+from metadata.utils.dependency_injector.dependency_injector import (
+    DependencyNotFoundError,
+    Inject,
+    inject,
+)
 from metadata.utils.importer import (
     DynamicImportException,
     MissingPluginException,
@@ -178,12 +183,20 @@ class IngestionWorkflow(BaseWorkflow, ABC):
                 f" using the secrets manager provider [{self.metadata.config.secretsManagerProvider}]: {exc}"
             )
 
-    def validate(self):
+    @inject
+    def validate(
+        self, profiler_config_class: Inject[Type[ProfilerProcessorConfig]] = None
+    ):
+        if profiler_config_class is None:
+            raise DependencyNotFoundError(
+                "ProfilerProcessorConfig class not found. Please ensure the ProfilerProcessorConfig is properly registered."
+            )
+
         try:
             if not self.config.source.serviceConnection.root.config.supportsProfiler:
                 raise AttributeError()
         except AttributeError:
-            if ProfilerProcessorConfig.model_validate(
+            if profiler_config_class.model_validate(
                 self.config.processor.model_dump().get("config")
             ).ignoreValidation:
                 logger.debug(

@@ -932,6 +932,15 @@ public interface CollectionDAO {
       parts = {":extensionPrefix", ".%"})
   String extensionPrefix);
 
+  @SqlQuery(
+      "SELECT id, extension, json "
+          + "FROM entity_extension "
+          + "WHERE id IN (<ids>) AND extension = :extension "
+          + "ORDER BY id, extension")
+  @RegisterRowMapper(ExtensionRecordWithIdMapper.class)
+  List<ExtensionRecordWithId> getExtensionBatch(
+      @BindList("ids") List<String> ids, @Bind("extension") String extension);
+
   @SqlQuery(
       "SELECT id, extension, json, jsonschema "
          + "FROM entity_extension "
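
The new getExtensionBatch query fetches the table profiler config extension for many tables in one round trip instead of one query per table. JDBI's @BindList expands <ids> into an IN list at execution time; assuming example UUIDs and assuming TABLE_PROFILER_CONFIG_EXTENSION resolves to the key shown here, the rendered SQL looks roughly like:

-- Illustrative expansion; the ids and extension key are example values.
SELECT id, extension, json
FROM entity_extension
WHERE id IN ('uuid-1', 'uuid-2', 'uuid-3')
  AND extension = 'table.tableProfilerConfig'
ORDER BY id, extension;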

@@ -1857,7 +1857,7 @@ public class TableRepository extends EntityRepository<Table> {
     List<CollectionDAO.ExtensionRecordWithId> records =
         daoCollection
             .entityExtensionDAO()
-            .getExtensionsBatch(entityListToStrings(tables), TABLE_PROFILER_CONFIG_EXTENSION);
+            .getExtensionBatch(entityListToStrings(tables), TABLE_PROFILER_CONFIG_EXTENSION);
 
     for (CollectionDAO.ExtensionRecordWithId record : records) {
       try {

@@ -758,6 +758,39 @@
         }
       }
     },
+    "sparkTableProfilerConfig": {
+      "type": "object",
+      "javaType": "org.openmetadata.schema.type.SparkTableProfilerConfig",
+      "description": "Table Specific configuration for Profiling it with a Spark Engine. It is ignored for other engines.",
+      "properties": {
+        "partitioning": {
+          "type": "object",
+          "description": "When reading big tables from sources, we optimize the reading by partitioning the data. This configuration is responsible for it.",
+          "properties": {
+            "partitionColumn": {
+              "type": "string",
+              "description": "Column to partition on. It should be a date, timestamp or integer column. It is important for the data to be reasonably equally distributed across the partitions.",
+              "default": null
+            },
+            "lowerBound": {
+              "type": "string",
+              "description": "Lower bound of the partition range. If not provided, it will be fetched from the source.",
+              "default": null
+            },
+            "upperBound": {
+              "type": "string",
+              "description": "Upper bound of the partition range. If not provided, it will be fetched from the source.",
+              "default": null
+            }
+          },
+          "additionalProperties": false,
+          "required": [
+            "partitionColumn"
+          ]
+        }
+      },
+      "additionalProperties": false
+    },
     "tableProfilerConfig": {
       "type": "object",
       "javaType": "org.openmetadata.schema.type.TableProfilerConfig",
@@ -821,6 +854,11 @@
         "type": "boolean",
         "default": true,
         "title": "Compute Column Metrics"
       },
+      "sparkTableProfilerConfig": {
+        "description": "Table Specific configuration for Profiling it with a Spark Engine. It is ignored for other engines.",
+        "$ref": "#/definitions/sparkTableProfilerConfig",
+        "default": null
+      }
     }
   },
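
For reference, an illustrative tableProfilerConfig payload the extended schema would accept. The column name and bounds are made-up values; only partitionColumn is required, and lowerBound/upperBound are fetched from the source when omitted:

{
  "tableProfilerConfig": {
    "computeColumnMetrics": true,
    "sparkTableProfilerConfig": {
      "partitioning": {
        "partitionColumn": "created_at",
        "lowerBound": "2024-01-01",
        "upperBound": "2024-12-31"
      }
    }
  }
}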

@@ -16,8 +16,19 @@
       "type": "string"
     },
     "config": {
-      "description": "Additional Spark configuration properties as key-value pairs.",
-      "$ref": "../../type/basic.json#/definitions/map"
+      "type": "object",
+      "properties": {
+        "tempPath": {
+          "description": "Temporary path to store the data.",
+          "type": "string",
+          "default": "/tmp/openmetadata"
+        },
+        "extraConfig": {
+          "title": "Additional Spark Configuration",
+          "description": "Additional Spark configuration properties as key-value pairs.",
+          "$ref": "../../type/basic.json#/definitions/map"
+        }
+      }
     }
   },
   "required": ["type", "remote"],
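
An illustrative Spark engine config under the new shape. The remote URL and Spark properties are example values; tempPath falls back to /tmp/openmetadata when omitted:

{
  "type": "Spark",
  "remote": "sc://spark-connect.example.org:15002",
  "config": {
    "tempPath": "/tmp/openmetadata",
    "extraConfig": {
      "spark.executor.memory": "4g",
      "spark.sql.shuffle.partitions": "64"
    }
  }
}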

@@ -841,6 +841,11 @@ export interface TableProfilerConfig {
      */
     sampleDataCount?: number;
     samplingMethodType?: SamplingMethodType;
+    /**
+     * Table Specific configuration for Profiling it with a Spark Engine. It is ignored for
+     * other engines.
+     */
+    sparkTableProfilerConfig?: SparkTableProfilerConfig;
     [property: string]: any;
 }
 
@@ -923,6 +928,38 @@ export enum SamplingMethodType {
     System = "SYSTEM",
 }
 
+/**
+ * Table Specific configuration for Profiling it with a Spark Engine. It is ignored for
+ * other engines.
+ */
+export interface SparkTableProfilerConfig {
+    /**
+     * When reading big tables from sources, we optimize the reading by partitioning the data.
+     * This configuration is responsible for it.
+     */
+    partitioning?: Partitioning;
+}
+
+/**
+ * When reading big tables from sources, we optimize the reading by partitioning the data.
+ * This configuration is responsible for it.
+ */
+export interface Partitioning {
+    /**
+     * Lower bound of the partition range. If not provided, it will be fetched from the source.
+     */
+    lowerBound?: string;
+    /**
+     * Column to partition on. It should be a date, timestamp or integer column. It is important
+     * for the data to be reasonably equally distributed across the partitions.
+     */
+    partitionColumn: string;
+    /**
+     * Upper bound of the partition range. If not provided, it will be fetched from the source.
+     */
+    upperBound?: string;
+}
+
 /**
  * This schema defines the type used for describing different types of tables.
  */
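
A quick illustration of the generated types in use; the column and bounds are made-up values:

// Type-checks against the generated interfaces above.
const sparkConfig: SparkTableProfilerConfig = {
    partitioning: {
        partitionColumn: "customer_id",
        lowerBound: "0",
        upperBound: "1000000",
    },
};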

@@ -2049,17 +2049,26 @@ export interface ProcessingEngine {
     /**
      * The type of the engine configuration
      */
-    type: ProcessingEngineType;
-    /**
-     * Additional Spark configuration properties as key-value pairs.
-     */
-    config?: { [key: string]: any };
+    type: ProcessingEngineType;
+    config?: Config;
     /**
      * Spark Connect Remote URL.
      */
     remote?: string;
 }
 
+export interface Config {
+    /**
+     * Additional Spark configuration properties as key-value pairs.
+     */
+    extraConfig?: { [key: string]: any };
+    /**
+     * Temporary path to store the data.
+     */
+    tempPath?: string;
+    [property: string]: any;
+}
+
 /**
  * The type of the engine configuration
  */

@@ -1206,6 +1206,11 @@ export interface TableProfilerConfig {
      */
     sampleDataCount?: number;
     samplingMethodType?: SamplingMethodType;
+    /**
+     * Table Specific configuration for Profiling it with a Spark Engine. It is ignored for
+     * other engines.
+     */
+    sparkTableProfilerConfig?: SparkTableProfilerConfig;
     [property: string]: any;
 }
 
@@ -1272,6 +1277,38 @@ export enum PartitionIntervalUnit {
     Year = "YEAR",
 }
 
+/**
+ * Table Specific configuration for Profiling it with a Spark Engine. It is ignored for
+ * other engines.
+ */
+export interface SparkTableProfilerConfig {
+    /**
+     * When reading big tables from sources, we optimize the reading by partitioning the data.
+     * This configuration is responsible for it.
+     */
+    partitioning?: Partitioning;
+}
+
+/**
+ * When reading big tables from sources, we optimize the reading by partitioning the data.
+ * This configuration is responsible for it.
+ */
+export interface Partitioning {
+    /**
+     * Lower bound of the partition range. If not provided, it will be fetched from the source.
+     */
+    lowerBound?: string;
+    /**
+     * Column to partition on. It should be a date, timestamp or integer column. It is important
+     * for the data to be reasonably equally distributed across the partitions.
+     */
+    partitionColumn: string;
+    /**
+     * Upper bound of the partition range. If not provided, it will be fetched from the source.
+     */
+    upperBound?: string;
+}
+
 /**
  * This schema defines the type used for describing different types of tables.
  */

@@ -2560,17 +2560,26 @@ export interface ProcessingEngine {
     /**
      * The type of the engine configuration
      */
-    type: ProcessingEngineType;
-    /**
-     * Additional Spark configuration properties as key-value pairs.
-     */
-    config?: { [key: string]: any };
+    type: ProcessingEngineType;
+    config?: Config;
     /**
      * Spark Connect Remote URL.
      */
     remote?: string;
 }
 
+export interface Config {
+    /**
+     * Additional Spark configuration properties as key-value pairs.
+     */
+    extraConfig?: { [key: string]: any };
+    /**
+     * Temporary path to store the data.
+     */
+    tempPath?: string;
+    [property: string]: any;
+}
+
 /**
  * The type of the engine configuration
  */

@@ -122,17 +122,26 @@ export interface ProcessingEngine {
     /**
      * The type of the engine configuration
      */
-    type: Type;
-    /**
-     * Additional Spark configuration properties as key-value pairs.
-     */
-    config?: { [key: string]: any };
+    type: Type;
+    config?: Config;
     /**
      * Spark Connect Remote URL.
      */
     remote?: string;
 }
 
+export interface Config {
+    /**
+     * Additional Spark configuration properties as key-value pairs.
+     */
+    extraConfig?: { [key: string]: any };
+    /**
+     * Temporary path to store the data.
+     */
+    tempPath?: string;
+    [property: string]: any;
+}
+
 /**
  * The type of the engine configuration
  */

@@ -14,10 +14,7 @@
  * This schema defines the configuration for a Spark Engine runner.
  */
 export interface SparkEngineConfig {
-    /**
-     * Additional Spark configuration properties as key-value pairs.
-     */
-    config?: { [key: string]: any };
+    config?: Config;
     /**
      * Spark Connect Remote URL.
      */
@@ -25,6 +22,18 @@ export interface SparkEngineConfig {
     type: Type;
 }
 
+export interface Config {
+    /**
+     * Additional Spark configuration properties as key-value pairs.
+     */
+    extraConfig?: { [key: string]: any };
+    /**
+     * Temporary path to store the data.
+     */
+    tempPath?: string;
+    [property: string]: any;
+}
+
 export enum Type {
     Spark = "Spark",
 }

@@ -5621,17 +5621,26 @@ export interface ProcessingEngine {
     /**
      * The type of the engine configuration
      */
-    type: ProcessingEngineType;
-    /**
-     * Additional Spark configuration properties as key-value pairs.
-     */
-    config?: { [key: string]: any };
+    type: ProcessingEngineType;
+    config?: Config;
     /**
      * Spark Connect Remote URL.
      */
     remote?: string;
 }
 
+export interface Config {
+    /**
+     * Additional Spark configuration properties as key-value pairs.
+     */
+    extraConfig?: { [key: string]: any };
+    /**
+     * Temporary path to store the data.
+     */
+    tempPath?: string;
+    [property: string]: any;
+}
+
 /**
  * The type of the engine configuration
  */