mirror of
				https://github.com/open-metadata/OpenMetadata.git
				synced 2025-10-31 10:39:30 +00:00 
			
		
		
		
	GEN-996 - Allow PII Processor without storing Sample Data (#17927)
* GEN-996 - Allow PII Processor without storing Sample Data * fix import * fix import
This commit is contained in:
		
							parent
							
								
									01e4b04573
								
							
						
					
					
						commit
						ad03f9e237
					
				| @ -565,9 +565,9 @@ class MetadataRestSink(Sink):  # pylint: disable=too-many-public-methods | |||||||
|             f"Successfully ingested profile metrics for {record.table.fullyQualifiedName.root}" |             f"Successfully ingested profile metrics for {record.table.fullyQualifiedName.root}" | ||||||
|         ) |         ) | ||||||
| 
 | 
 | ||||||
|         if record.sample_data: |         if record.sample_data and record.sample_data.store: | ||||||
|             table_data = self.metadata.ingest_table_sample_data( |             table_data = self.metadata.ingest_table_sample_data( | ||||||
|                 table=record.table, sample_data=record.sample_data |                 table=record.table, sample_data=record.sample_data.data | ||||||
|             ) |             ) | ||||||
|             if not table_data: |             if not table_data: | ||||||
|                 self.status.failed( |                 self.status.failed( | ||||||
|  | |||||||
| @ -171,7 +171,7 @@ class PIIProcessor(Processor): | |||||||
|                 col_tags = self.process_column( |                 col_tags = self.process_column( | ||||||
|                     idx=idx, |                     idx=idx, | ||||||
|                     column=column, |                     column=column, | ||||||
|                     table_data=record.sample_data, |                     table_data=record.sample_data.data, | ||||||
|                     confidence_threshold=self.confidence_threshold, |                     confidence_threshold=self.confidence_threshold, | ||||||
|                 ) |                 ) | ||||||
|                 if col_tags: |                 if col_tags: | ||||||
|  | |||||||
| @ -17,9 +17,10 @@ multiple profilers per table and columns. | |||||||
| """ | """ | ||||||
| from typing import List, Optional, Type, Union | from typing import List, Optional, Type, Union | ||||||
| 
 | 
 | ||||||
| from pydantic import ConfigDict | from pydantic import ConfigDict, Field | ||||||
| from sqlalchemy import Column | from sqlalchemy import Column | ||||||
| from sqlalchemy.orm import DeclarativeMeta | from sqlalchemy.orm import DeclarativeMeta | ||||||
|  | from typing_extensions import Annotated | ||||||
| 
 | 
 | ||||||
| from metadata.config.common import ConfigModel | from metadata.config.common import ConfigModel | ||||||
| from metadata.generated.schema.api.data.createTableProfile import ( | from metadata.generated.schema.api.data.createTableProfile import ( | ||||||
| @ -37,6 +38,7 @@ from metadata.generated.schema.entity.services.connections.connectionBasicType i | |||||||
| ) | ) | ||||||
| from metadata.generated.schema.tests.customMetric import CustomMetric | from metadata.generated.schema.tests.customMetric import CustomMetric | ||||||
| from metadata.generated.schema.type.basic import FullyQualifiedEntityName | from metadata.generated.schema.type.basic import FullyQualifiedEntityName | ||||||
|  | from metadata.ingestion.models.custom_pydantic import BaseModel | ||||||
| from metadata.ingestion.models.table_metadata import ColumnTag | from metadata.ingestion.models.table_metadata import ColumnTag | ||||||
| from metadata.profiler.metrics.core import Metric, MetricTypes | from metadata.profiler.metrics.core import Metric, MetricTypes | ||||||
| from metadata.profiler.processor.models import ProfilerDef | from metadata.profiler.processor.models import ProfilerDef | ||||||
| @ -104,6 +106,15 @@ class ProfilerProcessorConfig(ConfigModel): | |||||||
|     databaseConfig: Optional[List[DatabaseAndSchemaConfig]] = [] |     databaseConfig: Optional[List[DatabaseAndSchemaConfig]] = [] | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | class SampleData(BaseModel): | ||||||
|  |     """TableData wrapper to handle ephemeral SampleData""" | ||||||
|  | 
 | ||||||
|  |     data: Annotated[TableData, Field(None, description="Table Sample Data")] | ||||||
|  |     store: Annotated[ | ||||||
|  |         bool, Field(False, description="Is the sample data should be stored or not") | ||||||
|  |     ] | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
| class ProfilerResponse(ConfigModel): | class ProfilerResponse(ConfigModel): | ||||||
|     """ |     """ | ||||||
|     ORM Profiler processor response. |     ORM Profiler processor response. | ||||||
| @ -114,7 +125,7 @@ class ProfilerResponse(ConfigModel): | |||||||
| 
 | 
 | ||||||
|     table: Table |     table: Table | ||||||
|     profile: CreateTableProfileRequest |     profile: CreateTableProfileRequest | ||||||
|     sample_data: Optional[TableData] = None |     sample_data: Optional[SampleData] = None | ||||||
|     column_tags: Optional[List[ColumnTag]] = None |     column_tags: Optional[List[ColumnTag]] = None | ||||||
| 
 | 
 | ||||||
|     def __str__(self): |     def __str__(self): | ||||||
|  | |||||||
| @ -33,7 +33,6 @@ from metadata.generated.schema.entity.data.table import ( | |||||||
|     ColumnProfile, |     ColumnProfile, | ||||||
|     ColumnProfilerConfig, |     ColumnProfilerConfig, | ||||||
|     SystemProfile, |     SystemProfile, | ||||||
|     TableData, |  | ||||||
|     TableProfile, |     TableProfile, | ||||||
| ) | ) | ||||||
| from metadata.generated.schema.settings.settings import Settings | from metadata.generated.schema.settings.settings import Settings | ||||||
| @ -41,7 +40,7 @@ from metadata.generated.schema.tests.customMetric import ( | |||||||
|     CustomMetric as CustomMetricEntity, |     CustomMetric as CustomMetricEntity, | ||||||
| ) | ) | ||||||
| from metadata.generated.schema.type.basic import Timestamp | from metadata.generated.schema.type.basic import Timestamp | ||||||
| from metadata.profiler.api.models import ProfilerResponse, ThreadPoolMetrics | from metadata.profiler.api.models import ProfilerResponse, SampleData, ThreadPoolMetrics | ||||||
| from metadata.profiler.interface.profiler_interface import ProfilerInterface | from metadata.profiler.interface.profiler_interface import ProfilerInterface | ||||||
| from metadata.profiler.metrics.core import ( | from metadata.profiler.metrics.core import ( | ||||||
|     ComposedMetric, |     ComposedMetric, | ||||||
| @ -492,7 +491,12 @@ class Profiler(Generic[TMetric]): | |||||||
|             ) |             ) | ||||||
|             self.compute_metrics() |             self.compute_metrics() | ||||||
| 
 | 
 | ||||||
|         if self.source_config.generateSampleData: |         # We need the sample data for Sample Data or PII Sensitive processing. | ||||||
|  |         # We'll nullify the Sample Data after the PII processing so that it's not stored. | ||||||
|  |         if ( | ||||||
|  |             self.source_config.generateSampleData | ||||||
|  |             or self.source_config.processPiiSensitive | ||||||
|  |         ): | ||||||
|             sample_data = self.generate_sample_data() |             sample_data = self.generate_sample_data() | ||||||
|         else: |         else: | ||||||
|             sample_data = None |             sample_data = None | ||||||
| @ -510,7 +514,7 @@ class Profiler(Generic[TMetric]): | |||||||
|         return table_profile |         return table_profile | ||||||
| 
 | 
 | ||||||
|     @calculate_execution_time(store=False) |     @calculate_execution_time(store=False) | ||||||
|     def generate_sample_data(self) -> Optional[TableData]: |     def generate_sample_data(self) -> Optional[SampleData]: | ||||||
|         """Fetch and ingest sample data |         """Fetch and ingest sample data | ||||||
| 
 | 
 | ||||||
|         Returns: |         Returns: | ||||||
| @ -532,7 +536,10 @@ class Profiler(Generic[TMetric]): | |||||||
|                     SAMPLE_DATA_DEFAULT_COUNT, self.profiler_interface.sample_data_count |                     SAMPLE_DATA_DEFAULT_COUNT, self.profiler_interface.sample_data_count | ||||||
|                 ) |                 ) | ||||||
|             ] |             ] | ||||||
|             return table_data |             return SampleData( | ||||||
|  |                 data=table_data, store=self.source_config.generateSampleData | ||||||
|  |             ) | ||||||
|  | 
 | ||||||
|         except Exception as err: |         except Exception as err: | ||||||
|             logger.debug(traceback.format_exc()) |             logger.debug(traceback.format_exc()) | ||||||
|             logger.warning(f"Error fetching sample data: {err}") |             logger.warning(f"Error fetching sample data: {err}") | ||||||
|  | |||||||
| @ -15,11 +15,28 @@ JSON workflows to the profiler | |||||||
| """ | """ | ||||||
| from typing import List, Optional | from typing import List, Optional | ||||||
| 
 | 
 | ||||||
| from pydantic import BaseModel, validator | from pydantic import BaseModel, BeforeValidator | ||||||
|  | from typing_extensions import Annotated | ||||||
| 
 | 
 | ||||||
| from metadata.profiler.metrics.registry import Metrics | from metadata.profiler.metrics.registry import Metrics | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | def valid_metric(value: str): | ||||||
|  |     """ | ||||||
|  |     Validate that the input metrics are correctly named | ||||||
|  |     and can be found in the Registry | ||||||
|  |     """ | ||||||
|  |     if not Metrics.get(value.upper()): | ||||||
|  |         raise ValueError( | ||||||
|  |             f"Metric name {value} is not a proper metric name from the Registry" | ||||||
|  |         ) | ||||||
|  | 
 | ||||||
|  |     return value.upper() | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | ValidMetric = Annotated[str, BeforeValidator(valid_metric)] | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
| class ProfilerDef(BaseModel): | class ProfilerDef(BaseModel): | ||||||
|     """ |     """ | ||||||
|     Incoming profiler definition from the |     Incoming profiler definition from the | ||||||
| @ -30,26 +47,4 @@ class ProfilerDef(BaseModel): | |||||||
|     timeout_seconds: Optional[ |     timeout_seconds: Optional[ | ||||||
|         int |         int | ||||||
|     ] = None  # Stop running a query after X seconds and continue |     ] = None  # Stop running a query after X seconds and continue | ||||||
|     metrics: Optional[ |     metrics: Optional[List[ValidMetric]] = None | ||||||
|         List[str] |  | ||||||
|     ] = None  # names of currently supported Static and Composed metrics |  | ||||||
|     # TBD: |  | ||||||
|     # time_metrics: List[TimeMetricDef] = None |  | ||||||
|     # custom_metrics: List[CustomMetricDef] = None |  | ||||||
|     # rule_metrics: ... |  | ||||||
| 
 |  | ||||||
|     # pylint: disable=no-self-argument |  | ||||||
|     @validator("metrics", each_item=True) |  | ||||||
|     def valid_metric(cls, value): |  | ||||||
|         """ |  | ||||||
|         We are using cls as per pydantic docs |  | ||||||
| 
 |  | ||||||
|         Validate that the input metrics are correctly named |  | ||||||
|         and can be found in the Registry |  | ||||||
|         """ |  | ||||||
|         if not Metrics.get(value.upper()): |  | ||||||
|             raise ValueError( |  | ||||||
|                 f"Metric name {value} is not a proper metric name from the Registry" |  | ||||||
|             ) |  | ||||||
| 
 |  | ||||||
|         return value.upper() |  | ||||||
|  | |||||||
| @ -64,7 +64,7 @@ from metadata.generated.schema.type.tagLabel import TagFQN, TagLabel | |||||||
| from metadata.ingestion.models.table_metadata import ColumnTag | from metadata.ingestion.models.table_metadata import ColumnTag | ||||||
| from metadata.ingestion.ometa.ometa_api import OpenMetadata | from metadata.ingestion.ometa.ometa_api import OpenMetadata | ||||||
| from metadata.pii.processor import PIIProcessor | from metadata.pii.processor import PIIProcessor | ||||||
| from metadata.profiler.api.models import ProfilerResponse | from metadata.profiler.api.models import ProfilerResponse, SampleData | ||||||
| 
 | 
 | ||||||
| table_data = TableData( | table_data = TableData( | ||||||
|     columns=[ |     columns=[ | ||||||
| @ -314,7 +314,7 @@ class PiiProcessorTest(TestCase): | |||||||
|                     ) |                     ) | ||||||
|                 ) |                 ) | ||||||
|             ), |             ), | ||||||
|             sample_data=table_data, |             sample_data=SampleData(data=table_data), | ||||||
|         ) |         ) | ||||||
| 
 | 
 | ||||||
|         updated_record: ProfilerResponse = self.pii_processor.run(record) |         updated_record: ProfilerResponse = self.pii_processor.run(record) | ||||||
|  | |||||||
							
								
								
									
										25
									
								
								ingestion/tests/unit/profiler/test_profiler_models.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										25
									
								
								ingestion/tests/unit/profiler/test_profiler_models.py
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,25 @@ | |||||||
|  | #  Copyright 2021 Collate | ||||||
|  | #  Licensed under the Apache License, Version 2.0 (the "License"); | ||||||
|  | #  you may not use this file except in compliance with the License. | ||||||
|  | #  You may obtain a copy of the License at | ||||||
|  | #  http://www.apache.org/licenses/LICENSE-2.0 | ||||||
|  | #  Unless required by applicable law or agreed to in writing, software | ||||||
|  | #  distributed under the License is distributed on an "AS IS" BASIS, | ||||||
|  | #  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||||||
|  | #  See the License for the specific language governing permissions and | ||||||
|  | #  limitations under the License. | ||||||
|  | """Profiler models behave properly""" | ||||||
|  | import pytest | ||||||
|  | 
 | ||||||
|  | from metadata.profiler.processor.models import ProfilerDef | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def test_valid_metrics(): | ||||||
|  |     """ | ||||||
|  |     Test that the metrics are valid | ||||||
|  |     """ | ||||||
|  |     profiler_def = ProfilerDef(name="test", metrics=["count"]) | ||||||
|  |     assert profiler_def.metrics == ["COUNT"] | ||||||
|  | 
 | ||||||
|  |     with pytest.raises(ValueError): | ||||||
|  |         ProfilerDef(name="test", metrics=["potato"]) | ||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user
	 Pere Miquel Brull
						Pere Miquel Brull