Fies #6183 - Ability to set profile sample at the profilier workflow level (#6292)

Fies  #6183 - Ability to set profile sample at the profilier workflow level (#6292)
This commit is contained in:
Teddy 2022-07-25 12:08:20 +02:00 committed by GitHub
parent 67339a6c25
commit aae4410c93
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 131 additions and 6 deletions

View File

@ -25,6 +25,13 @@
"description": "Option to turn on/off generating sample data.", "description": "Option to turn on/off generating sample data.",
"type": "boolean", "type": "boolean",
"default": true "default": true
},
"profileSample": {
"description": "Percentage of data used to execute the profiler for the whole workflow. This percentage will be applied to all tables in the workflow. Represented in the range (0, 100].",
"type": "number",
"exclusiveMinimum": 0,
"maximum": 100,
"default": null
} }
}, },
"additionalProperties": false "additionalProperties": false

View File

@ -138,8 +138,8 @@ class ProfilerWorkflow:
processor_config=self.config.processor or ProfilerProcessorConfig(), processor_config=self.config.processor or ProfilerProcessorConfig(),
metadata_config=self.metadata_config, metadata_config=self.metadata_config,
_from="orm_profiler", _from="orm_profiler",
# Pass the processor_interface as kwargs for the profiler
processor_interface=self.processor_interface, processor_interface=self.processor_interface,
workflow_profile_sample=self.source_config.profileSample,
) )
def create_engine_for_session(self, service_connection_config): def create_engine_for_session(self, service_connection_config):

View File

@ -84,6 +84,7 @@ class OrmProfilerProcessor(Processor[Table]):
config: ProfilerProcessorConfig, config: ProfilerProcessorConfig,
metadata_config: OpenMetadataConnection, metadata_config: OpenMetadataConnection,
processor_interface: InterfaceProtocol, processor_interface: InterfaceProtocol,
workflow_profile_sample: Optional[float] = None,
): ):
super().__init__() super().__init__()
self.config = config self.config = config
@ -94,7 +95,9 @@ class OrmProfilerProcessor(Processor[Table]):
# OpenMetadata client to fetch tables # OpenMetadata client to fetch tables
self.metadata = OpenMetadata(self.metadata_config) self.metadata = OpenMetadata(self.metadata_config)
self.processor_interface = processor_interface self.processor_interface = processor_interface
self.workflow_profile_sample = workflow_profile_sample
@classmethod @classmethod
def create( def create(
@ -110,12 +113,18 @@ class OrmProfilerProcessor(Processor[Table]):
config = ProfilerProcessorConfig.parse_obj(config_dict) config = ProfilerProcessorConfig.parse_obj(config_dict)
processor_interface = kwargs.get("processor_interface") processor_interface = kwargs.get("processor_interface")
workflow_profile_sample = kwargs.get("workflow_profile_sample")
if not processor_interface: if not processor_interface:
raise ValueError( raise ValueError(
"Cannot initialise the ProfilerProcessor without processor interface object" "Cannot initialise the ProfilerProcessor without processor interface object"
) )
return cls(config, metadata_config, processor_interface=processor_interface) return cls(
config,
metadata_config,
processor_interface=processor_interface,
workflow_profile_sample=workflow_profile_sample,
)
def get_table_profile_sample(self, table: Table) -> Optional[float]: def get_table_profile_sample(self, table: Table) -> Optional[float]:
""" """
@ -132,6 +141,14 @@ class OrmProfilerProcessor(Processor[Table]):
if my_record_tests and my_record_tests.profile_sample: if my_record_tests and my_record_tests.profile_sample:
return my_record_tests.profile_sample return my_record_tests.profile_sample
if self.workflow_profile_sample:
if (
table.profileSample is not None
and self.workflow_profile_sample != table.profileSample
):
return table.profileSample
return self.workflow_profile_sample
return table.profileSample or None return table.profileSample or None
def get_partition_details( def get_partition_details(

View File

@ -69,6 +69,15 @@ class User(Base):
age = Column(Integer) age = Column(Integer)
class NewUser(Base):
__tablename__ = "new_users"
id = Column(Integer, primary_key=True)
name = Column(String(256))
fullname = Column(String(256))
nickname = Column(String(256))
age = Column(Integer)
class ProfilerWorkflowTest(TestCase): class ProfilerWorkflowTest(TestCase):
""" """
Run the end to end workflow and validate Run the end to end workflow and validate
@ -87,8 +96,8 @@ class ProfilerWorkflowTest(TestCase):
""" """
Prepare Ingredients Prepare Ingredients
""" """
cls.session.execute("DROP TABLE IF EXISTS USERS")
User.__table__.create(bind=cls.engine) User.__table__.create(bind=cls.engine)
NewUser.__table__.create(bind=cls.engine)
data = [ data = [
User(name="John", fullname="John Doe", nickname="johnny b goode", age=30), User(name="John", fullname="John Doe", nickname="johnny b goode", age=30),
@ -97,6 +106,15 @@ class ProfilerWorkflowTest(TestCase):
cls.session.add_all(data) cls.session.add_all(data)
cls.session.commit() cls.session.commit()
new_user = [
NewUser(
name="John", fullname="John Doe", nickname="johnny b goode", age=30
),
NewUser(name="Jane", fullname="Jone Doe", nickname=None, age=31),
]
cls.session.add_all(new_user)
cls.session.commit()
ingestion_workflow = Workflow.create(ingestion_config) ingestion_workflow = Workflow.create(ingestion_config)
ingestion_workflow.execute() ingestion_workflow.execute()
ingestion_workflow.raise_from_status() ingestion_workflow.raise_from_status()
@ -122,6 +140,9 @@ class ProfilerWorkflowTest(TestCase):
hard_delete=True, hard_delete=True,
) )
NewUser.__table__.drop(bind=cls.engine)
User.__table__.drop(bind=cls.engine)
def test_ingestion(self): def test_ingestion(self):
""" """
Validate that the ingestion ran correctly Validate that the ingestion ran correctly
@ -138,7 +159,12 @@ class ProfilerWorkflowTest(TestCase):
on top of the Users table on top of the Users table
""" """
workflow_config = deepcopy(ingestion_config) workflow_config = deepcopy(ingestion_config)
workflow_config["source"]["sourceConfig"]["config"].update({"type": "Profiler"}) workflow_config["source"]["sourceConfig"]["config"].update(
{
"type": "Profiler",
"fqnFilterPattern": {"includes": ["test_sqlite.main.main.users"]},
}
)
workflow_config["processor"] = { workflow_config["processor"] = {
"type": "orm-profiler", "type": "orm-profiler",
"config": { "config": {
@ -198,3 +224,27 @@ class ProfilerWorkflowTest(TestCase):
with pytest.raises(WorkflowExecutionError): with pytest.raises(WorkflowExecutionError):
profiler_workflow.raise_from_status() profiler_workflow.raise_from_status()
def test_worflow_sample_profile(self):
"""Test the worflow sample profile gets propagated down to the table profileSample"""
workflow_config = deepcopy(ingestion_config)
workflow_config["source"]["sourceConfig"]["config"].update(
{
"type": "Profiler",
"profileSample": 50,
"fqnFilterPattern": {"includes": ["test_sqlite.main.main.new_users"]},
}
)
workflow_config["processor"] = {"type": "orm-profiler", "config": {}}
profiler_workflow = ProfilerWorkflow.create(workflow_config)
profiler_workflow.execute()
profiler_workflow.print_status()
profiler_workflow.stop()
table = self.metadata.get_by_name(
entity=Table,
fqn="test_sqlite.main.main.new_users",
fields=["profileSample"],
)
assert table.profileSample == 50

View File

@ -169,6 +169,9 @@ const AddIngestion = ({
const [enableDebugLog, setEnableDebugLog] = useState( const [enableDebugLog, setEnableDebugLog] = useState(
data?.loggerLevel === LogLevels.Debug data?.loggerLevel === LogLevels.Debug
); );
const [profileSample, setProfileSample] = useState(
(data?.sourceConfig.config as ConfigClass)?.profileSample
);
const [dashboardFilterPattern, setDashboardFilterPattern] = const [dashboardFilterPattern, setDashboardFilterPattern] =
useState<FilterPattern>( useState<FilterPattern>(
(data?.sourceConfig.config as ConfigClass)?.dashboardFilterPattern ?? (data?.sourceConfig.config as ConfigClass)?.dashboardFilterPattern ??
@ -469,6 +472,7 @@ const AddIngestion = ({
), ),
type: profilerIngestionType, type: profilerIngestionType,
generateSampleData: ingestSampleData, generateSampleData: ingestSampleData,
profileSample: profileSample,
}; };
} }
case PipelineType.Metadata: case PipelineType.Metadata:
@ -628,6 +632,7 @@ const AddIngestion = ({
handleIngestSampleData={() => setIngestSampleData((pre) => !pre)} handleIngestSampleData={() => setIngestSampleData((pre) => !pre)}
handleIngestionName={(val) => setIngestionName(val)} handleIngestionName={(val) => setIngestionName(val)}
handleMarkDeletedTables={() => setMarkDeletedTables((pre) => !pre)} handleMarkDeletedTables={() => setMarkDeletedTables((pre) => !pre)}
handleProfileSample={(val) => setProfileSample(val)}
handleQueryLogDuration={(val) => setQueryLogDuration(val)} handleQueryLogDuration={(val) => setQueryLogDuration(val)}
handleResultLimit={(val) => setResultLimit(val)} handleResultLimit={(val) => setResultLimit(val)}
handleShowFilter={handleShowFilter} handleShowFilter={handleShowFilter}
@ -640,6 +645,7 @@ const AddIngestion = ({
markDeletedTables={markDeletedTables} markDeletedTables={markDeletedTables}
pipelineFilterPattern={pipelineFilterPattern} pipelineFilterPattern={pipelineFilterPattern}
pipelineType={pipelineType} pipelineType={pipelineType}
profileSample={profileSample}
queryLogDuration={queryLogDuration} queryLogDuration={queryLogDuration}
resultLimit={resultLimit} resultLimit={resultLimit}
schemaFilterPattern={schemaFilterPattern} schemaFilterPattern={schemaFilterPattern}

View File

@ -83,6 +83,7 @@ const mockConfigureIngestion: ConfigureIngestionProps = {
handleIncludeTags: jest.fn(), handleIncludeTags: jest.fn(),
handleIngestionName: jest.fn(), handleIngestionName: jest.fn(),
handleMarkDeletedTables: jest.fn(), handleMarkDeletedTables: jest.fn(),
handleProfileSample: jest.fn(),
handleQueryLogDuration: jest.fn(), handleQueryLogDuration: jest.fn(),
handleResultLimit: jest.fn(), handleResultLimit: jest.fn(),
handleStageFileLocation: jest.fn(), handleStageFileLocation: jest.fn(),

View File

@ -13,11 +13,11 @@
import { isNil } from 'lodash'; import { isNil } from 'lodash';
import { EditorContentRef } from 'Models'; import { EditorContentRef } from 'Models';
import React, { Fragment, useRef } from 'react'; import React, { Fragment, useRef, useState } from 'react';
import { FilterPatternEnum } from '../../../enums/filterPattern.enum'; import { FilterPatternEnum } from '../../../enums/filterPattern.enum';
import { ServiceCategory } from '../../../enums/service.enum'; import { ServiceCategory } from '../../../enums/service.enum';
import { PipelineType } from '../../../generated/entity/services/ingestionPipelines/ingestionPipeline'; import { PipelineType } from '../../../generated/entity/services/ingestionPipelines/ingestionPipeline';
import { getSeparator } from '../../../utils/CommonUtils'; import { getSeparator, errorMsg } from '../../../utils/CommonUtils';
import { Button } from '../../buttons/Button/Button'; import { Button } from '../../buttons/Button/Button';
import FilterPattern from '../../common/FilterPattern/FilterPattern'; import FilterPattern from '../../common/FilterPattern/FilterPattern';
import RichTextEditor from '../../common/rich-text-editor/RichTextEditor'; import RichTextEditor from '../../common/rich-text-editor/RichTextEditor';
@ -56,6 +56,7 @@ const ConfigureIngestion = ({
stageFileLocation, stageFileLocation,
resultLimit, resultLimit,
enableDebugLog, enableDebugLog,
profileSample,
handleEnableDebugLog, handleEnableDebugLog,
getExcludeValue, getExcludeValue,
getIncludeValue, getIncludeValue,
@ -69,6 +70,7 @@ const ConfigureIngestion = ({
handleIngestSampleData, handleIngestSampleData,
handleDatasetServiceName, handleDatasetServiceName,
handleQueryLogDuration, handleQueryLogDuration,
handleProfileSample,
handleStageFileLocation, handleStageFileLocation,
handleResultLimit, handleResultLimit,
onCancel, onCancel,
@ -112,6 +114,44 @@ const ConfigureIngestion = ({
); );
}; };
const [profileSampleError, setProfileSampleError] = useState(false);
const handleProfileSampleValidation = (profileSampleValue: number) => {
let errMsg;
if (profileSampleValue < 0 || profileSampleValue > 99) {
errMsg = true;
} else {
errMsg = false;
}
setProfileSampleError(errMsg);
handleProfileSample(profileSampleValue);
};
const getProfileSample = () => {
return (
<div>
<label>Profile Sample</label>
<p className="tw-text-grey-muted tw-mt-1 tw-mb-2 tw-text-sm">
This is an optional percentage used to compute the table profile.
Should be between 0 and 99.
</p>
<input
className="tw-form-inputs tw-form-inputs-padding tw-w-24"
data-testid="profileSample"
id="profileSample"
name="profileSample"
placeholder="75"
type="number"
value={profileSample}
onChange={(e) =>
handleProfileSampleValidation(parseInt(e.target.value))
}
/>
{profileSampleError && errorMsg('Value must be between 0 and 99.')}
</div>
);
};
const getDatabaseFieldToggles = () => { const getDatabaseFieldToggles = () => {
return ( return (
<> <>
@ -459,6 +499,8 @@ const ConfigureIngestion = ({
</div> </div>
<div>{getProfilerFilterPatternField()}</div> <div>{getProfilerFilterPatternField()}</div>
{getSeparator('')} {getSeparator('')}
{getProfileSample()}
{getSeparator('')}
{getIngestSampleToggle( {getIngestSampleToggle(
'Ingest Sample Data', 'Ingest Sample Data',
'Extract sample data from each profile' 'Extract sample data from each profile'

View File

@ -72,6 +72,7 @@ export interface ConfigureIngestionProps {
includeTags: boolean; includeTags: boolean;
markDeletedTables?: boolean; markDeletedTables?: boolean;
enableDebugLog: boolean; enableDebugLog: boolean;
profileSample?: number;
ingestSampleData: boolean; ingestSampleData: boolean;
pipelineType: PipelineType; pipelineType: PipelineType;
showDatabaseFilter: boolean; showDatabaseFilter: boolean;
@ -97,6 +98,7 @@ export interface ConfigureIngestionProps {
getIncludeValue: (value: string[], type: FilterPatternEnum) => void; getIncludeValue: (value: string[], type: FilterPatternEnum) => void;
getExcludeValue: (value: string[], type: FilterPatternEnum) => void; getExcludeValue: (value: string[], type: FilterPatternEnum) => void;
handleShowFilter: (value: boolean, type: FilterPatternEnum) => void; handleShowFilter: (value: boolean, type: FilterPatternEnum) => void;
handleProfileSample: (value: number) => void;
handleQueryLogDuration: (value: number) => void; handleQueryLogDuration: (value: number) => void;
handleStageFileLocation: (value: string) => void; handleStageFileLocation: (value: string) => void;
handleResultLimit: (value: number) => void; handleResultLimit: (value: number) => void;