mirror of
				https://github.com/open-metadata/OpenMetadata.git
				synced 2025-10-31 02:29:03 +00:00 
			
		
		
		
	GEN-996 - Allow PII Processor without storing Sample Data (#17927)
* GEN-996 - Allow PII Processor without storing Sample Data * fix import * fix import
This commit is contained in:
		
							parent
							
								
									01e4b04573
								
							
						
					
					
						commit
						ad03f9e237
					
				| @ -565,9 +565,9 @@ class MetadataRestSink(Sink):  # pylint: disable=too-many-public-methods | ||||
|             f"Successfully ingested profile metrics for {record.table.fullyQualifiedName.root}" | ||||
|         ) | ||||
| 
 | ||||
|         if record.sample_data: | ||||
|         if record.sample_data and record.sample_data.store: | ||||
|             table_data = self.metadata.ingest_table_sample_data( | ||||
|                 table=record.table, sample_data=record.sample_data | ||||
|                 table=record.table, sample_data=record.sample_data.data | ||||
|             ) | ||||
|             if not table_data: | ||||
|                 self.status.failed( | ||||
|  | ||||
| @ -171,7 +171,7 @@ class PIIProcessor(Processor): | ||||
|                 col_tags = self.process_column( | ||||
|                     idx=idx, | ||||
|                     column=column, | ||||
|                     table_data=record.sample_data, | ||||
|                     table_data=record.sample_data.data, | ||||
|                     confidence_threshold=self.confidence_threshold, | ||||
|                 ) | ||||
|                 if col_tags: | ||||
|  | ||||
| @ -17,9 +17,10 @@ multiple profilers per table and columns. | ||||
| """ | ||||
| from typing import List, Optional, Type, Union | ||||
| 
 | ||||
| from pydantic import ConfigDict | ||||
| from pydantic import ConfigDict, Field | ||||
| from sqlalchemy import Column | ||||
| from sqlalchemy.orm import DeclarativeMeta | ||||
| from typing_extensions import Annotated | ||||
| 
 | ||||
| from metadata.config.common import ConfigModel | ||||
| from metadata.generated.schema.api.data.createTableProfile import ( | ||||
| @ -37,6 +38,7 @@ from metadata.generated.schema.entity.services.connections.connectionBasicType i | ||||
| ) | ||||
| from metadata.generated.schema.tests.customMetric import CustomMetric | ||||
| from metadata.generated.schema.type.basic import FullyQualifiedEntityName | ||||
| from metadata.ingestion.models.custom_pydantic import BaseModel | ||||
| from metadata.ingestion.models.table_metadata import ColumnTag | ||||
| from metadata.profiler.metrics.core import Metric, MetricTypes | ||||
| from metadata.profiler.processor.models import ProfilerDef | ||||
| @ -104,6 +106,15 @@ class ProfilerProcessorConfig(ConfigModel): | ||||
|     databaseConfig: Optional[List[DatabaseAndSchemaConfig]] = [] | ||||
| 
 | ||||
| 
 | ||||
| class SampleData(BaseModel): | ||||
|     """TableData wrapper to handle ephemeral SampleData""" | ||||
| 
 | ||||
|     data: Annotated[TableData, Field(None, description="Table Sample Data")] | ||||
|     store: Annotated[ | ||||
|         bool, Field(False, description="Is the sample data should be stored or not") | ||||
|     ] | ||||
| 
 | ||||
| 
 | ||||
| class ProfilerResponse(ConfigModel): | ||||
|     """ | ||||
|     ORM Profiler processor response. | ||||
| @ -114,7 +125,7 @@ class ProfilerResponse(ConfigModel): | ||||
| 
 | ||||
|     table: Table | ||||
|     profile: CreateTableProfileRequest | ||||
|     sample_data: Optional[TableData] = None | ||||
|     sample_data: Optional[SampleData] = None | ||||
|     column_tags: Optional[List[ColumnTag]] = None | ||||
| 
 | ||||
|     def __str__(self): | ||||
|  | ||||
| @ -33,7 +33,6 @@ from metadata.generated.schema.entity.data.table import ( | ||||
|     ColumnProfile, | ||||
|     ColumnProfilerConfig, | ||||
|     SystemProfile, | ||||
|     TableData, | ||||
|     TableProfile, | ||||
| ) | ||||
| from metadata.generated.schema.settings.settings import Settings | ||||
| @ -41,7 +40,7 @@ from metadata.generated.schema.tests.customMetric import ( | ||||
|     CustomMetric as CustomMetricEntity, | ||||
| ) | ||||
| from metadata.generated.schema.type.basic import Timestamp | ||||
| from metadata.profiler.api.models import ProfilerResponse, ThreadPoolMetrics | ||||
| from metadata.profiler.api.models import ProfilerResponse, SampleData, ThreadPoolMetrics | ||||
| from metadata.profiler.interface.profiler_interface import ProfilerInterface | ||||
| from metadata.profiler.metrics.core import ( | ||||
|     ComposedMetric, | ||||
| @ -492,7 +491,12 @@ class Profiler(Generic[TMetric]): | ||||
|             ) | ||||
|             self.compute_metrics() | ||||
| 
 | ||||
|         if self.source_config.generateSampleData: | ||||
|         # We need the sample data for Sample Data or PII Sensitive processing. | ||||
|         # We'll nullify the Sample Data after the PII processing so that it's not stored. | ||||
|         if ( | ||||
|             self.source_config.generateSampleData | ||||
|             or self.source_config.processPiiSensitive | ||||
|         ): | ||||
|             sample_data = self.generate_sample_data() | ||||
|         else: | ||||
|             sample_data = None | ||||
| @ -510,7 +514,7 @@ class Profiler(Generic[TMetric]): | ||||
|         return table_profile | ||||
| 
 | ||||
|     @calculate_execution_time(store=False) | ||||
|     def generate_sample_data(self) -> Optional[TableData]: | ||||
|     def generate_sample_data(self) -> Optional[SampleData]: | ||||
|         """Fetch and ingest sample data | ||||
| 
 | ||||
|         Returns: | ||||
| @ -532,7 +536,10 @@ class Profiler(Generic[TMetric]): | ||||
|                     SAMPLE_DATA_DEFAULT_COUNT, self.profiler_interface.sample_data_count | ||||
|                 ) | ||||
|             ] | ||||
|             return table_data | ||||
|             return SampleData( | ||||
|                 data=table_data, store=self.source_config.generateSampleData | ||||
|             ) | ||||
| 
 | ||||
|         except Exception as err: | ||||
|             logger.debug(traceback.format_exc()) | ||||
|             logger.warning(f"Error fetching sample data: {err}") | ||||
|  | ||||
| @ -15,11 +15,28 @@ JSON workflows to the profiler | ||||
| """ | ||||
| from typing import List, Optional | ||||
| 
 | ||||
| from pydantic import BaseModel, validator | ||||
| from pydantic import BaseModel, BeforeValidator | ||||
| from typing_extensions import Annotated | ||||
| 
 | ||||
| from metadata.profiler.metrics.registry import Metrics | ||||
| 
 | ||||
| 
 | ||||
| def valid_metric(value: str): | ||||
|     """ | ||||
|     Validate that the input metrics are correctly named | ||||
|     and can be found in the Registry | ||||
|     """ | ||||
|     if not Metrics.get(value.upper()): | ||||
|         raise ValueError( | ||||
|             f"Metric name {value} is not a proper metric name from the Registry" | ||||
|         ) | ||||
| 
 | ||||
|     return value.upper() | ||||
| 
 | ||||
| 
 | ||||
| ValidMetric = Annotated[str, BeforeValidator(valid_metric)] | ||||
| 
 | ||||
| 
 | ||||
| class ProfilerDef(BaseModel): | ||||
|     """ | ||||
|     Incoming profiler definition from the | ||||
| @ -30,26 +47,4 @@ class ProfilerDef(BaseModel): | ||||
|     timeout_seconds: Optional[ | ||||
|         int | ||||
|     ] = None  # Stop running a query after X seconds and continue | ||||
|     metrics: Optional[ | ||||
|         List[str] | ||||
|     ] = None  # names of currently supported Static and Composed metrics | ||||
|     # TBD: | ||||
|     # time_metrics: List[TimeMetricDef] = None | ||||
|     # custom_metrics: List[CustomMetricDef] = None | ||||
|     # rule_metrics: ... | ||||
| 
 | ||||
|     # pylint: disable=no-self-argument | ||||
|     @validator("metrics", each_item=True) | ||||
|     def valid_metric(cls, value): | ||||
|         """ | ||||
|         We are using cls as per pydantic docs | ||||
| 
 | ||||
|         Validate that the input metrics are correctly named | ||||
|         and can be found in the Registry | ||||
|         """ | ||||
|         if not Metrics.get(value.upper()): | ||||
|             raise ValueError( | ||||
|                 f"Metric name {value} is not a proper metric name from the Registry" | ||||
|             ) | ||||
| 
 | ||||
|         return value.upper() | ||||
|     metrics: Optional[List[ValidMetric]] = None | ||||
|  | ||||
| @ -64,7 +64,7 @@ from metadata.generated.schema.type.tagLabel import TagFQN, TagLabel | ||||
| from metadata.ingestion.models.table_metadata import ColumnTag | ||||
| from metadata.ingestion.ometa.ometa_api import OpenMetadata | ||||
| from metadata.pii.processor import PIIProcessor | ||||
| from metadata.profiler.api.models import ProfilerResponse | ||||
| from metadata.profiler.api.models import ProfilerResponse, SampleData | ||||
| 
 | ||||
| table_data = TableData( | ||||
|     columns=[ | ||||
| @ -314,7 +314,7 @@ class PiiProcessorTest(TestCase): | ||||
|                     ) | ||||
|                 ) | ||||
|             ), | ||||
|             sample_data=table_data, | ||||
|             sample_data=SampleData(data=table_data), | ||||
|         ) | ||||
| 
 | ||||
|         updated_record: ProfilerResponse = self.pii_processor.run(record) | ||||
|  | ||||
							
								
								
									
										25
									
								
								ingestion/tests/unit/profiler/test_profiler_models.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										25
									
								
								ingestion/tests/unit/profiler/test_profiler_models.py
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,25 @@ | ||||
| #  Copyright 2021 Collate | ||||
| #  Licensed under the Apache License, Version 2.0 (the "License"); | ||||
| #  you may not use this file except in compliance with the License. | ||||
| #  You may obtain a copy of the License at | ||||
| #  http://www.apache.org/licenses/LICENSE-2.0 | ||||
| #  Unless required by applicable law or agreed to in writing, software | ||||
| #  distributed under the License is distributed on an "AS IS" BASIS, | ||||
| #  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||||
| #  See the License for the specific language governing permissions and | ||||
| #  limitations under the License. | ||||
| """Profiler models behave properly""" | ||||
| import pytest | ||||
| 
 | ||||
| from metadata.profiler.processor.models import ProfilerDef | ||||
| 
 | ||||
| 
 | ||||
| def test_valid_metrics(): | ||||
|     """ | ||||
|     Test that the metrics are valid | ||||
|     """ | ||||
|     profiler_def = ProfilerDef(name="test", metrics=["count"]) | ||||
|     assert profiler_def.metrics == ["COUNT"] | ||||
| 
 | ||||
|     with pytest.raises(ValueError): | ||||
|         ProfilerDef(name="test", metrics=["potato"]) | ||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user
	 Pere Miquel Brull
						Pere Miquel Brull