mirror of
				https://github.com/open-metadata/OpenMetadata.git
				synced 2025-10-31 02:29:03 +00:00 
			
		
		
		
	 46afe69811
			
		
	
	
		46afe69811
		
			
		
	
	
	
	
		
			
			* improvement in pii tagging * fix conflict and changes as per comment * Added confidence field * changes as per comments * Apply suggestions from code review Co-authored-by: Teddy <teddy.crepineau@gmail.com> --------- Co-authored-by: Ashish Gupta <ashish@getcollate.io> Co-authored-by: Pere Miquel Brull <peremiquelbrull@gmail.com> Co-authored-by: Teddy <teddy.crepineau@gmail.com>
		
			
				
	
	
		
			916 lines
		
	
	
		
			28 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			916 lines
		
	
	
		
			28 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| #  Copyright 2021 Collate
 | |
| #  Licensed under the Apache License, Version 2.0 (the "License");
 | |
| #  you may not use this file except in compliance with the License.
 | |
| #  You may obtain a copy of the License at
 | |
| #  http://www.apache.org/licenses/LICENSE-2.0
 | |
| #  Unless required by applicable law or agreed to in writing, software
 | |
| #  distributed under the License is distributed on an "AS IS" BASIS,
 | |
| #  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 | |
| #  See the License for the specific language governing permissions and
 | |
| #  limitations under the License.
 | |
| """
 | |
| Test Processor Class
 | |
| """
 | |
| 
 | |
| import datetime
 | |
| from unittest import TestCase
 | |
| 
 | |
| from metadata.generated.schema.api.data.createDatabase import CreateDatabaseRequest
 | |
| from metadata.generated.schema.api.data.createDatabaseSchema import (
 | |
|     CreateDatabaseSchemaRequest,
 | |
| )
 | |
| from metadata.generated.schema.api.data.createTable import CreateTableRequest
 | |
| from metadata.generated.schema.api.services.createDatabaseService import (
 | |
|     CreateDatabaseServiceRequest,
 | |
| )
 | |
| from metadata.generated.schema.entity.data.table import (
 | |
|     Column,
 | |
|     ColumnName,
 | |
|     DataType,
 | |
|     Table,
 | |
|     TableData,
 | |
|     TableType,
 | |
| )
 | |
| from metadata.generated.schema.entity.services.connections.database.mysqlConnection import (
 | |
|     MysqlConnection,
 | |
| )
 | |
| from metadata.generated.schema.entity.services.connections.metadata.openMetadataConnection import (
 | |
|     OpenMetadataConnection,
 | |
| )
 | |
| from metadata.generated.schema.entity.services.databaseService import (
 | |
|     DatabaseConnection,
 | |
|     DatabaseService,
 | |
|     DatabaseServiceType,
 | |
| )
 | |
| from metadata.generated.schema.security.client.openMetadataJWTClientConfig import (
 | |
|     OpenMetadataJWTClientConfig,
 | |
| )
 | |
| from metadata.generated.schema.type.basic import AnyUrl, Href
 | |
| from metadata.generated.schema.type.entityReference import EntityReference
 | |
| from metadata.generated.schema.type.tagLabel import (
 | |
|     LabelType,
 | |
|     State,
 | |
|     TagFQN,
 | |
|     TagLabel,
 | |
|     TagSource,
 | |
| )
 | |
| from metadata.ingestion.ometa.ometa_api import OpenMetadata
 | |
| from metadata.ingestion.processor.pii import NERScanner
 | |
| 
 | |
def _mock_personal_tag() -> TagLabel:
    """Return a fresh ``PersonalData.Personal`` label in the Automated/Suggested state."""
    return TagLabel(
        tagFQN="PersonalData.Personal",
        description=None,
        source="Classification",
        labelType="Automated",
        state="Suggested",
        href=None,
    )


def _mock_column(name, data_type, position, description="", tags=None) -> Column:
    """Build one input column; every attribute that is not a parameter is passed as None."""
    return Column(
        name=name,
        displayName=None,
        dataType=data_type,
        arrayDataType=None,
        dataLength=None,
        precision=None,
        scale=None,
        dataTypeDisplay=None,
        description=description,
        fullyQualifiedName=None,
        tags=tags,
        constraint=None,
        ordinalPosition=position,
        jsonSchema=None,
        children=None,
        customMetrics=None,
        profile=None,
    )


# Create-table request with six columns. Two of them (credit_card_amount and
# is_customer) already carry a PersonalData.Personal tag; the others are untagged.
# NOTE(review): the last three columns all use ordinalPosition=4 -- this looks
# like a copy-paste leftover; confirm before relying on the positions.
MOCK_TABLE: CreateTableRequest = CreateTableRequest(
    name="DataSet Input",
    displayName="DataSet Input",
    description="this is a description for dataset input",
    tableType=TableType.Regular.value,
    columns=[
        _mock_column(
            "amount",
            DataType.DOUBLE.value,
            1,
            description="This is description for amount",
        ),
        _mock_column("bank_transfer_amount", DataType.DOUBLE.value, 2),
        _mock_column("coupon_amount", DataType.DOUBLE.value, 3),
        _mock_column(
            "credit_card_amount",
            DataType.DOUBLE.value,
            4,
            tags=[_mock_personal_tag()],
        ),
        _mock_column("FirstName", DataType.STRING.value, 4),
        _mock_column(
            "is_customer",
            DataType.BOOLEAN.value,
            4,
            tags=[_mock_personal_tag()],
        ),
    ],
    tableConstraints=None,
    tablePartition=None,
    tableProfilerConfig=None,
    owner=None,
    databaseSchema="default.default.schema",
    tags=None,
    viewDefinition=None,
    extension=None,
)
 | |
| 
 | |
def _expected_tag(tag_fqn) -> TagLabel:
    """Return an Automated/Suggested Classification label for ``tag_fqn``.

    ``tag_fqn`` is forwarded untouched, so callers may pass either a plain
    string or a ``TagFQN`` instance.
    """
    return TagLabel(
        tagFQN=tag_fqn,
        description=None,
        source="Classification",
        labelType="Automated",
        state="Suggested",
        href=None,
    )


def _expected_column(name, data_type, position, tags, description="") -> Column:
    """Build one expected column; every attribute that is not a parameter is passed as None."""
    return Column(
        name=name,
        displayName=None,
        dataType=data_type,
        arrayDataType=None,
        dataLength=None,
        precision=None,
        scale=None,
        dataTypeDisplay=None,
        description=description,
        fullyQualifiedName=None,
        tags=tags,
        constraint=None,
        ordinalPosition=position,
        jsonSchema=None,
        children=None,
        customMetrics=None,
        profile=None,
    )


# Expected columns: same six column names, data types and ordinal positions as
# the MOCK_TABLE request, but with PII tags attached. The *_amount columns gain
# PII.Sensitive, FirstName gains PII.NonSensitive, and is_customer keeps only
# its PersonalData.Personal tag.
EXPECTED_COLUMNS = [
    _expected_column(
        "amount",
        "DOUBLE",
        1,
        [_expected_tag(TagFQN(__root__="PII.Sensitive"))],
        description="This is description for amount",
    ),
    _expected_column(
        "bank_transfer_amount",
        "DOUBLE",
        2,
        [_expected_tag(TagFQN(__root__="PII.Sensitive"))],
    ),
    _expected_column(
        "coupon_amount",
        "DOUBLE",
        3,
        [_expected_tag(TagFQN(__root__="PII.Sensitive"))],
    ),
    _expected_column(
        "credit_card_amount",
        "DOUBLE",
        4,
        [
            _expected_tag("PersonalData.Personal"),
            _expected_tag(TagFQN(__root__="PII.Sensitive")),
        ],
    ),
    _expected_column(
        "FirstName",
        "STRING",
        4,
        [_expected_tag(TagFQN(__root__="PII.NonSensitive"))],
    ),
    _expected_column(
        "is_customer",
        "BOOLEAN",
        4,
        [_expected_tag("PersonalData.Personal")],
    ),
]
 | |
| 
 | |
# Header names of the sample result set, in the same order as the row values.
_SAMPLE_COLUMN_NAMES = (
    "customer_id",
    "first_name",
    "last_name",
    "first_order",
    "customer_email",
    "number_of_orders",
)

# Sample rows; the first_order, customer_email and number_of_orders slots may be None.
_SAMPLE_ROWS = [
    [30, "Christina", "W.", datetime.date(2018, 3, 2), "christina@hotmail.com", 2],
    [73, "Alan", "B.", None, "joshua.alan@yahoo.com", None],
    [71, "Gerald", "C.", datetime.date(2018, 1, 18), "geraldc@gmail.com", 3],
    [35, "Sara", "T.", datetime.date(2018, 2, 21), "saratimithi@godesign.com", 2],
    [22, "Sean", "H.", datetime.date(2018, 1, 26), "heroldsean@google.com", 3],
    [50, "Billy", "L.", datetime.date(2018, 1, 5), "bliam@random.com", 2],
    [76, "Barbara", "W.", datetime.date(2018, 3, 23), "bmwastin@gmail.co.in", 1],
    [5, "Katherine", "R.", None, None, None],
    [31, "Jane", "G.", datetime.date(2018, 2, 17), "gg34jane@hammer.com", 1],
    [45, "Scott", "B.", None, None, None],
    [21, "Willie", "H.", datetime.date(2018, 3, 28), "12hwilliejose@gmail.com", 1],
    [18, "Johnny", "K.", datetime.date(2018, 2, 27), "johnnykk@dexter.com", 1],
    [6, "Sarah", "R.", datetime.date(2018, 2, 19), "rrsarah@britinia.com", 1],
    [56, "Joshua", "K.", None, None, None],
    [79, "Jack", "R.", datetime.date(2018, 2, 28), "jack.mm@people.co.in", 2],
    [94, "Gregory", "H.", datetime.date(2018, 1, 4), "peter.gregory@japer.com", 2],
    [83, "Virginia", "R.", None, None, None],
    [17, "Kimberly", "R.", None, None, None],
    [2, "Shawn", "M.", datetime.date(2018, 1, 11), "shawn344@gmail.com", 1],
    [60, "Norma", "W.", None, None, None],
    [87, "Phillip", "B.", None, None, None],
]

# Sample data for the customers table: six named columns and 21 rows.
table_data = TableData(
    columns=[ColumnName(__root__=header) for header in _SAMPLE_COLUMN_NAMES],
    rows=_SAMPLE_ROWS,
)
 | |
| 
 | |
# Fully qualified name of the customers table; column FQNs append ".<column>".
_CUSTOMERS_FQN = "aws_redshift1.dev.dbt_jaffle.customers"


def _local_href(api_path) -> Href:
    """Wrap ``api_path`` on the http://localhost:8585 server into an ``Href``."""
    return Href(
        __root__=AnyUrl(
            f"http://localhost:8585{api_path}",
            scheme="http",
            host="localhost",
            host_type="int_domain",
            port="8585",
            path=api_path,
        )
    )


def _entity_reference(ref_id, ref_type, name, fqn, api_path) -> EntityReference:
    """Build a non-deleted ``EntityReference`` whose href points at ``api_path``."""
    return EntityReference(
        id=ref_id,
        type=ref_type,
        name=name,
        fullyQualifiedName=fqn,
        description=None,
        displayName=None,
        deleted=False,
        href=_local_href(api_path),
    )


def _customers_column(
    name, data_type, length, display, description=None, tags=None
) -> Column:
    """Build one customers column with constraint "NULL" and an FQN under the table."""
    return Column(
        name=ColumnName(__root__=name),
        displayName=None,
        dataType=data_type,
        arrayDataType=None,
        dataLength=length,
        precision=None,
        scale=None,
        dataTypeDisplay=display,
        description=description,
        fullyQualifiedName=f"{_CUSTOMERS_FQN}.{name}",
        tags=tags,
        constraint="NULL",
        ordinalPosition=None,
        jsonSchema=None,
        children=None,
        customMetrics=None,
        profile=None,
    )


# Table entity for the customers table. Only customer_id starts with a tag
# (PII.Sensitive); the other five columns are untagged.
# NOTE(review): customer_email is declared with dataType "DATE" here -- possibly
# copied from first_order; confirm whether that is intentional.
TABLE_ENTITY = Table(
    id="c6e75645-62e3-4110-8040-faa0e1ae3289",
    name="customers",
    displayName=None,
    fullyQualifiedName="aws_redshift1.dev.dbt_jaffle.customers",
    description=None,
    version=0.7,
    updatedAt=1676984225597,
    updatedBy="admin",
    href=_local_href("/api/v1/tables/c6e75645-62e3-4110-8040-faa0e1ae3289"),
    tableType="Local",
    columns=[
        _customers_column(
            "customer_id",
            "INT",
            1,
            "int",
            description="This is an ID identifing a unique customer",
            tags=[
                TagLabel(
                    tagFQN="PII.Sensitive",
                    source=TagSource.Classification.value,
                    labelType=LabelType.Automated.value,
                    state=State.Suggested.value,
                )
            ],
        ),
        _customers_column("first_name", "VARCHAR", 10, "varchar(10)"),
        _customers_column("last_name", "VARCHAR", 2, "varchar(2)"),
        _customers_column("first_order", "DATE", 1, "date"),
        _customers_column("customer_email", "DATE", 1, "date"),
        _customers_column("number_of_orders", "BIGINT", 1, "bigint"),
    ],
    tableConstraints=None,
    tablePartition=None,
    owner=None,
    databaseSchema=_entity_reference(
        "9db326f8-c23c-49c5-bc75-865cb8e87981",
        "databaseSchema",
        "dbt_jaffle",
        "aws_redshift1.dev.dbt_jaffle",
        "/api/v1/databaseSchemas/9db326f8-c23c-49c5-bc75-865cb8e87981",
    ),
    database=_entity_reference(
        "f74772d0-2827-442a-8aa4-3dfd136f0c53",
        "database",
        "dev",
        "aws_redshift1.dev",
        "/api/v1/databases/f74772d0-2827-442a-8aa4-3dfd136f0c53",
    ),
    service=_entity_reference(
        "31964ed7-8b76-468f-8f1d-d3839792a3b0",
        "databaseService",
        "aws_redshift1",
        "aws_redshift1",
        "/api/v1/services/databaseServices/31964ed7-8b76-468f-8f1d-d3839792a3b0",
    ),
    serviceType="Redshift",
    location=None,
    viewDefinition=None,
    tags=None,
    usageSummary=None,
    followers=None,
    joins=None,
    sampleData=None,
    tableProfilerConfig=None,
    profile=None,
    dataModel=None,
    changeDescription=None,
    deleted=False,
    extension=None,
)
 | |
| 
 | |
# Description text carried by the PII.NonSensitive labels below.
_NON_SENSITIVE_DESCRIPTION = (
    "PII which is easily accessible from public sources and can include zip code, "
    "race, gender, and date of birth."
)

# Description text carried by the PII.Sensitive label below.
_SENSITIVE_DESCRIPTION = (
    "PII which if lost, compromised, or disclosed without authorization, could result in"
    " substantial harm, embarrassment, inconvenience, or unfairness to an individual."
)


def _updated_tag(tag_name, description) -> TagLabel:
    """Return an Automated/Suggested Classification label for ``tag_name``."""
    return TagLabel(
        tagFQN=TagFQN(__root__=tag_name),
        description=description,
        source="Classification",
        labelType="Automated",
        state="Suggested",
        href=None,
    )


def _updated_column(name, data_type, display, tags, length=None) -> Column:
    """Build one column whose FQN sits under the test-service-table-patch customers table."""
    return Column(
        name=ColumnName(__root__=name),
        displayName=None,
        dataType=data_type,
        arrayDataType=None,
        dataLength=length,
        precision=None,
        scale=None,
        dataTypeDisplay=display,
        description=None,
        fullyQualifiedName=f"test-service-table-patch.test-db.test-schema.customers.{name}",
        tags=tags,
        constraint=None,
        ordinalPosition=None,
        jsonSchema=None,
        children=None,
        customMetrics=None,
        profile=None,
    )


# Despite the name, this is a list of columns rather than a Table entity.
# customer_id and number_of_orders carry no tags, customer_email carries
# PII.Sensitive, and the remaining columns carry PII.NonSensitive.
UPDATED_TABLE_ENTITY = [
    _updated_column("customer_id", "INT", "int", []),
    _updated_column(
        "first_name",
        "VARCHAR",
        "varchar",
        [_updated_tag("PII.NonSensitive", _NON_SENSITIVE_DESCRIPTION)],
        length=20,
    ),
    _updated_column(
        "last_name",
        "VARCHAR",
        "varchar",
        [_updated_tag("PII.NonSensitive", _NON_SENSITIVE_DESCRIPTION)],
        length=20,
    ),
    _updated_column(
        "first_order",
        "DATE",
        "date",
        [_updated_tag("PII.NonSensitive", _NON_SENSITIVE_DESCRIPTION)],
    ),
    _updated_column(
        "customer_email",
        "VARCHAR",
        "date",
        [_updated_tag("PII.Sensitive", _SENSITIVE_DESCRIPTION)],
        length=20,
    ),
    _updated_column("number_of_orders", "BIGINT", "bigint", []),
]
 | |
| 
 | |
| 
 | |
class PiiProcessorTest(TestCase):
    """
    Integration test for PII tagging through the NER scanner.

    The test creates a service -> database -> schema -> ``customers`` table
    hierarchy on the OpenMetadata server configured in ``__init__``, runs
    ``NERScanner.process`` over the module-level sample data and compares the
    tags stored on every column with ``UPDATED_TABLE_ENTITY``.
    """

    def __init__(
        self,
        methodName="runTest",
    ) -> None:
        """
        Build the OpenMetadata client and the NER scanner used by the tests.

        Args:
            methodName: name of the test method to run. Defaults to
                ``"runTest"``, the same default ``unittest.TestCase`` uses,
                so the class can be instantiated like any other test case.
        """
        super().__init__(methodName)
        # NOTE(review): hard-coded JWT for the server at localhost:8585;
        # presumably the default local development admin token - confirm
        # before pointing this test at any other environment.
        server_config = OpenMetadataConnection(
            hostPort="http://localhost:8585/api",
            authProvider="openmetadata",
            securityConfig=OpenMetadataJWTClientConfig(
                jwtToken="eyJraWQiOiJHYjM4OWEtOWY3Ni1nZGpzLWE5MmotMDI0MmJrOTQzNTYiLCJ0eXAiOiJKV1QiLCJh"
                "bGciOiJSUzI1NiJ9.eyJzdWIiOiJhZG1pbiIsImlzQm90IjpmYWxzZSwiaXNzIjoib3Blbi1tZXRhZGF0YS5vc"
                "mciLCJpYXQiOjE2NjM5Mzg0NjIsImVtYWlsIjoiYWRtaW5Ab3Blbm1ldGFkYXRhLm9yZyJ9.tS8um_5DKu7Hgz"
                "GBzS1VTA5uUjKWOCU0B_j08WXBiEC0mr0zNREkqVfwFDD-d24HlNEbrqioLsBuFRiwIWKc1m_ZlVQbG7P36RUx"
                "huv2vbSp80FKyNM-Tj93FDzq91jsyNmsQhyNv_fNr3TXfzzSPjHt8Go0FMMP66weoKMgW2PbXlhVKwEuXUHyakL"
                "Lzewm9UMeQaEiRzhiTMU3UkLXcKbYEJJvfNFcLwSl9W8JCO_l0Yj3ud-qt_nQYEZwqW6u5nfdQllN133iikV4fM"
                "5QZsMCnm8Rq1mvLR0y9bmJiD7fwM1tmJ791TUWqmKaTnP49U493VanKpUAfzIiOiIbhg"
            ),
        )
        self.metadata = OpenMetadata(server_config)
        self.nerscanner_processor = NERScanner(self.metadata)

    def test_nerscanner_process(self):
        """
        Run the NER scanner on the sample table and check the resulting tags.
        """
        service = CreateDatabaseServiceRequest(
            name="test-service-table-patch",
            serviceType=DatabaseServiceType.Mysql,
            connection=DatabaseConnection(
                config=MysqlConnection(
                    username="username",
                    password="password",
                    hostPort="http://localhost:1234",
                )
            ),
        )
        service_entity = self.metadata.create_or_update(data=service)
        # Register the teardown right away: a failing assertion or an error
        # while creating the child entities must not leave the service (and
        # everything under it) behind on the server.
        self.addCleanup(
            self.metadata.delete,
            entity=DatabaseService,
            entity_id=service_entity.id,
            recursive=True,
            hard_delete=True,
        )

        create_db = CreateDatabaseRequest(
            name="test-db",
            service=service_entity.fullyQualifiedName,
        )
        create_db_entity = self.metadata.create_or_update(data=create_db)

        create_schema = CreateDatabaseSchemaRequest(
            name="test-schema",
            database=create_db_entity.fullyQualifiedName,
        )
        create_schema_entity = self.metadata.create_or_update(data=create_schema)

        created_table = CreateTableRequest(
            name="customers",
            columns=[
                Column(name="customer_id", dataType=DataType.INT),
                Column(name="first_name", dataType=DataType.VARCHAR, dataLength=20),
                Column(name="last_name", dataType=DataType.VARCHAR, dataLength=20),
                Column(name="first_order", dataType=DataType.DATE),
                Column(name="customer_email", dataType=DataType.VARCHAR, dataLength=20),
                Column(name="number_of_orders", dataType=DataType.BIGINT),
            ],
            databaseSchema=create_schema_entity.fullyQualifiedName,
        )
        table_entity = self.metadata.create_or_update(data=created_table)
        # Point the module-level fixture at the table that was just created.
        TABLE_ENTITY.id = table_entity.id

        # NOTE(review): the keyword is spelled "thresold" on purpose here; it
        # presumably mirrors the parameter name of NERScanner.process -
        # confirm against the callee before renaming.
        self.nerscanner_processor.process(
            table_data=table_data,
            table_entity=TABLE_ENTITY,
            client=self.metadata,
            thresold_confidence=85,
        )

        # Re-read the table so the comparison uses what the server stored.
        updated_table_entity = self.metadata.get_by_id(
            entity=Table, entity_id=table_entity.id, fields=["tags"]
        )
        for expected, actual in zip(UPDATED_TABLE_ENTITY, updated_table_entity.columns):
            self.assertEqual(expected.tags, actual.tags)
 |