2023-10-09 20:47:19 +02:00

179 lines
5.8 KiB
Python

# Copyright 2021 Collate
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Processor util to fetch pii sensitive columns
"""
import traceback
from typing import List, Optional, cast
from metadata.generated.schema.entity.data.table import Column, TableData
from metadata.generated.schema.metadataIngestion.databaseServiceProfilerPipeline import (
DatabaseServiceProfilerPipeline,
)
from metadata.generated.schema.metadataIngestion.workflow import (
OpenMetadataWorkflowConfig,
)
from metadata.generated.schema.type.tagLabel import (
LabelType,
State,
TagLabel,
TagSource,
)
from metadata.ingestion.api.models import Either, StackTraceError
from metadata.ingestion.api.parser import parse_workflow_config_gracefully
from metadata.ingestion.api.step import Step
from metadata.ingestion.api.steps import Processor
from metadata.ingestion.models.table_metadata import ColumnTag
from metadata.ingestion.ometa.ometa_api import OpenMetadata
from metadata.pii.constants import PII
from metadata.pii.scanners.column_name_scanner import ColumnNameScanner
from metadata.pii.scanners.ner_scanner import NERScanner
from metadata.profiler.api.models import ProfilerResponse
from metadata.utils.logger import profiler_logger
# Module-level logger obtained from the profiler logging helper
logger = profiler_logger()
class PIIProcessor(Processor):
    """
    Processor step that suggests PII tags for the columns of a profiled table.

    Every column is first checked by name with ``ColumnNameScanner``. When the
    name gives no result and sample data is available, the column values are
    passed to the ``NERScanner``. A tag is only kept when the scanner
    confidence reaches the threshold configured in the profiler pipeline.
    """

    def __init__(
        self,
        config: OpenMetadataWorkflowConfig,
        metadata: OpenMetadata,
    ):
        """
        Args:
            config: parsed workflow configuration; its source config is
                treated as a ``DatabaseServiceProfilerPipeline``.
            metadata: OpenMetadata client, stored on the instance.
        """
        super().__init__()
        self.config = config
        self.metadata = metadata

        # Init and type the source config
        self.source_config: DatabaseServiceProfilerPipeline = cast(
            DatabaseServiceProfilerPipeline, self.config.source.sourceConfig.config
        )  # Used to satisfy type checked

        # Created on first access through the `ner_scanner` property
        self._ner_scanner: Optional[NERScanner] = None
        # Percentage value: it is divided by 100 before being compared
        # against the scanner confidence in `process_column`
        self.confidence_threshold = self.source_config.confidence

    @property
    def ner_scanner(self) -> NERScanner:
        """Load the NER Scanner only if called"""
        if self._ner_scanner is None:
            self._ner_scanner = NERScanner()
        return self._ner_scanner

    @classmethod
    def create(cls, config_dict: dict, metadata: OpenMetadata) -> "Step":
        """
        Alternate constructor: parse the raw workflow config dict
        and build the processor from it.
        """
        config = parse_workflow_config_gracefully(config_dict)
        return cls(config=config, metadata=metadata)

    def close(self) -> None:
        """Nothing to close"""

    @staticmethod
    def build_column_tag(tag_fqn: str, column_fqn: str) -> ColumnTag:
        """
        Build the ColumnTag linking a column to a suggested tag.

        The label is created as an automated, suggested Classification tag.
        Nothing is sent to the API here; the ColumnTag is only returned.

        Args:
            tag_fqn: fully qualified name of the tag to suggest.
            column_fqn: fully qualified name of the column to tag.
        """
        tag_label = TagLabel(
            tagFQN=tag_fqn,
            source=TagSource.Classification,
            state=State.Suggested,
            labelType=LabelType.Automated,
        )
        return ColumnTag(column_fqn=column_fqn, tag_label=tag_label)

    def process_column(
        self,
        idx: int,
        column: Column,
        table_data: Optional[TableData],
        confidence_threshold: float,
    ) -> Optional[List[ColumnTag]]:
        """
        Tag a column with PII if we find it using our scanners

        Args:
            idx: position of the column, used to pick its value out of
                each sample data row.
            column: column to analyse.
            table_data: sample data of the table, if any.
            confidence_threshold: minimum confidence, as a percentage.

        Returns:
            A list with the suggested ColumnTag, or None when the column
            already has a PII tag or no scanner result passes the threshold.
        """
        # If the column already has PII tags, we skip the processing
        if any(PII in tag.tagFQN.__root__ for tag in column.tags or []):
            return None

        # Scan by column name. If no results there, check the sample data, if any
        tag_and_confidence = ColumnNameScanner.scan(column.name.__root__)
        if (
            not tag_and_confidence
            and table_data
            # Sample data without rows is handled as "no sample data" instead
            # of failing on the iteration below.
            # NOTE(review): assumes `rows` is optional in TableData - confirm
            and table_data.rows is not None
        ):
            # NOTE(review): relies on each row being ordered like the
            # table columns so that `idx` points to this column - confirm
            tag_and_confidence = self.ner_scanner.scan(
                [row[idx] for row in table_data.rows]
            )

        if (
            tag_and_confidence
            and tag_and_confidence.tag_fqn
            and tag_and_confidence.confidence >= confidence_threshold / 100
        ):
            # We support returning +1 tags for a single column in _run
            return [
                self.build_column_tag(
                    tag_fqn=tag_and_confidence.tag_fqn,
                    column_fqn=column.fullyQualifiedName.__root__,
                )
            ]

        return None

    def _run(
        self,
        record: ProfilerResponse,
    ) -> Either[ProfilerResponse]:
        """
        Main entrypoint for the scanner.

        Adds PII tagging based on the column names
        and TableData. The computed tags are stored in
        `record.column_tags` and the record is always returned
        on the right side of the Either.
        """
        # We don't always need to process
        if not self.source_config.processPiiSensitive:
            return Either(right=record)

        column_tags = []

        for idx, column in enumerate(record.table.columns):
            try:
                col_tags = self.process_column(
                    idx=idx,
                    column=column,
                    table_data=record.sample_data,
                    confidence_threshold=self.confidence_threshold,
                )
                if col_tags:
                    column_tags.extend(col_tags)
            except Exception as err:
                # A failure on one column is reported in the step status
                # and must not prevent the remaining columns from being tagged
                self.status.failed(
                    StackTraceError(
                        name=record.table.fullyQualifiedName.__root__,
                        error=f"Error computing PII tags for [{column}] - [{err}]",
                        stack_trace=traceback.format_exc(),
                    )
                )

        record.column_tags = column_tags
        return Either(right=record)