Ensure recognizers are created (#23645)

* Add the migration classes and data for recognizers

This is so that we can run a migration that sets `json->recognizers` of `PII.Sensitive` and `PII.NonSensitive` tags from json values.

The issue with normal migrations was that the value of recognizers was too long to be persisted in the server migrations log.

Created a common `migration.utils.v1110.MigrationProcessBase`

* Ensure building automatically with the right parameters

* Update typescript types
This commit is contained in:
Eugenio 2025-10-07 17:13:35 +02:00 committed by GitHub
parent c4a4b22295
commit a6ac42371d
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
23 changed files with 1142 additions and 3 deletions

View File

@ -0,0 +1,167 @@
[
{
"name": "DateRecognizer",
"displayName": "Date Recognizer",
"description": "Recognize date using regex.",
"enabled": true,
"isSystemDefault": true,
"recognizerConfig": {
"type": "predefined",
"name": "DateRecognizer"
},
"confidenceThreshold": 0.6,
"target": "content"
},
{
"name": "PhoneRecognizer",
"displayName": "Phone Recognizer",
"description": "Recognize multi-regional phone numbers.",
"enabled": true,
"isSystemDefault": true,
"recognizerConfig": {
"type": "predefined",
"name": "PhoneRecognizer"
},
"confidenceThreshold": 0.6,
"target": "content"
},
{
"name": "UrlRecognizer",
"displayName": "Url Recognizer",
"description": "Recognize urls using regex.",
"enabled": true,
"isSystemDefault": true,
"recognizerConfig": {
"type": "predefined",
"name": "UrlRecognizer"
},
"confidenceThreshold": 0.6,
"target": "content"
},
{
"name": "SpacyRecognizer",
"displayName": "Recognizer using spaCy NLP model",
"description": "Recognize PII entities using a spaCy NLP model.",
"enabled": true,
"isSystemDefault": true,
"recognizerConfig": {
"type": "predefined",
"name": "SpacyRecognizer",
"supportedEntities": [
"DATE_TIME",
"NRP",
"LOCATION"
]
},
"confidenceThreshold": 0.6,
"target": "content"
},
{
"displayName": "Date time column name",
"name": "date_time",
"description": "A regex recognizer for column names",
"enabled": true,
"isSystemDefault": true,
"recognizerConfig": {
"type": "pattern",
"supportedLanguage": "en",
"patterns": [
{
"name": "date_time_pattern_0",
"regex": "^.*(date|time|dob|birthday|dod).*$",
"score": 0.6
}
],
"supportedEntity": "DATE_TIME",
"regexFlags": {
"dotAll": true,
"multiline": true,
"ignoreCase": true
},
"context": []
},
"confidenceThreshold": 0.6,
"target": "column_name"
},
{
"displayName": "Nrp column name",
"name": "nrp",
"description": "A regex recognizer for column names",
"enabled": true,
"isSystemDefault": true,
"recognizerConfig": {
"type": "pattern",
"supportedLanguage": "en",
"patterns": [
{
"name": "nrp_pattern_0",
"regex": "^.*(gender|nationality).*$",
"score": 0.6
}
],
"supportedEntity": "NRP",
"regexFlags": {
"dotAll": true,
"multiline": true,
"ignoreCase": true
},
"context": []
},
"confidenceThreshold": 0.6,
"target": "column_name"
},
{
"displayName": "Location column name",
"name": "location",
"description": "A regex recognizer for column names",
"enabled": true,
"isSystemDefault": true,
"recognizerConfig": {
"type": "pattern",
"supportedLanguage": "en",
"patterns": [
{
"name": "location_pattern_0",
"regex": "^.*(address|city|state|county|country|zipcode|zip|postal|zone|borough).*$",
"score": 0.6
}
],
"supportedEntity": "LOCATION",
"regexFlags": {
"dotAll": true,
"multiline": true,
"ignoreCase": true
},
"context": []
},
"confidenceThreshold": 0.6,
"target": "column_name"
},
{
"displayName": "Phone number column name",
"name": "phone_number",
"description": "A regex recognizer for column names",
"enabled": true,
"isSystemDefault": true,
"recognizerConfig": {
"type": "pattern",
"supportedLanguage": "en",
"patterns": [
{
"name": "phone_number_pattern_0",
"regex": "^.*(phone).*$",
"score": 0.6
}
],
"supportedEntity": "PHONE_NUMBER",
"regexFlags": {
"dotAll": true,
"multiline": true,
"ignoreCase": true
},
"context": []
},
"confidenceThreshold": 0.6,
"target": "column_name"
}
]

View File

@ -0,0 +1,741 @@
[
{
"name": "EnglishCreditCardRecognizer",
"displayName": "English Credit Card Recognizer",
"description": "Recognize common credit card numbers using regex + checksum.",
"enabled": true,
"isSystemDefault": true,
"recognizerConfig": {
"type": "predefined",
"name": "CreditCardRecognizer",
"supportedLanguage": "en",
"context": [
"credit",
"card",
"visa",
"mastercard",
"cc",
"amex",
"discover",
"jcb",
"diners",
"maestro",
"instapayment"
]
},
"confidenceThreshold": 0.6,
"target": "content"
},
{
"name": "SpanishCreditCardRecognizer",
"displayName": "Spanish Credit Card Recognizer",
"description": "Recognize common credit card numbers using regex + checksum.",
"enabled": true,
"isSystemDefault": true,
"recognizerConfig": {
"type": "predefined",
"name": "CreditCardRecognizer",
"supportedLanguage": "es",
"context": [
"tarjeta",
"credito",
"visa",
"mastercard",
"cc",
"amex",
"discover",
"jcb",
"diners",
"maestro",
"instapayment"
]
},
"confidenceThreshold": 0.6,
"target": "content"
},
{
"name": "ItalianCreditCardRecognizer",
"displayName": "Italian Credit Card Recognizer",
"description": "Recognize common credit card numbers using regex + checksum.",
"enabled": true,
"isSystemDefault": true,
"recognizerConfig": {
"type": "predefined",
"name": "CreditCardRecognizer",
"supportedLanguage": "it"
},
"confidenceThreshold": 0.6,
"target": "content"
},
{
"name": "PolishCreditCardRecognizer",
"displayName": "Polish Credit Card Recognizer",
"description": "Recognize common credit card numbers using regex + checksum.",
"enabled": true,
"isSystemDefault": true,
"recognizerConfig": {
"type": "predefined",
"name": "CreditCardRecognizer",
"supportedLanguage": "pl"
},
"confidenceThreshold": 0.6,
"target": "content"
},
{
"name": "UsBankRecognizer",
"displayName": "Us Bank Recognizer",
"description": "Recognizes US bank number using regex.",
"enabled": true,
"isSystemDefault": true,
"recognizerConfig": {
"type": "predefined",
"name": "UsBankRecognizer",
"supportedLanguage": "en"
},
"confidenceThreshold": 0.6,
"target": "content"
},
{
"name": "UsLicenseRecognizer",
"displayName": "Us License Recognizer",
"description": "Recognizes US driver license using regex.",
"enabled": true,
"isSystemDefault": true,
"recognizerConfig": {
"type": "predefined",
"name": "UsLicenseRecognizer",
"supportedLanguage": "en"
},
"confidenceThreshold": 0.6,
"target": "content"
},
{
"name": "UsItinRecognizer",
"displayName": "Us Itin Recognizer",
"description": "Recognizes US ITIN (Individual Taxpayer Identification Number) using regex.",
"enabled": true,
"isSystemDefault": true,
"recognizerConfig": {
"type": "predefined",
"name": "UsItinRecognizer",
"supportedLanguage": "en"
},
"confidenceThreshold": 0.6,
"target": "content"
},
{
"name": "UsPassportRecognizer",
"displayName": "Us Passport Recognizer",
"description": "Recognizes US Passport number using regex.",
"enabled": true,
"isSystemDefault": true,
"recognizerConfig": {
"type": "predefined",
"name": "UsPassportRecognizer",
"supportedLanguage": "en"
},
"confidenceThreshold": 0.6,
"target": "content"
},
{
"name": "UsSsnRecognizer",
"displayName": "Us Ssn Recognizer",
"description": "Recognize US Social Security Number (SSN) using regex.",
"enabled": true,
"isSystemDefault": true,
"recognizerConfig": {
"type": "predefined",
"name": "UsSsnRecognizer",
"supportedLanguage": "en"
},
"confidenceThreshold": 0.6,
"target": "content"
},
{
"name": "NhsRecognizer",
"displayName": "Nhs Recognizer",
"description": "Recognizes NHS number using regex and checksum.",
"enabled": true,
"isSystemDefault": true,
"recognizerConfig": {
"type": "predefined",
"name": "NhsRecognizer",
"supportedLanguage": "en"
},
"confidenceThreshold": 0.6,
"target": "content"
},
{
"name": "UkNinoRecognizer",
"displayName": "Uk Nino Recognizer",
"description": "Recognizes UK National Insurance Number using regex.",
"enabled": true,
"isSystemDefault": true,
"recognizerConfig": {
"type": "predefined",
"name": "UkNinoRecognizer",
"supportedLanguage": "en"
},
"confidenceThreshold": 0.6,
"target": "content"
},
{
"name": "SgFinRecognizer",
"displayName": "Sg Fin Recognizer",
"description": "Recognize SG FIN/NRIC number using regex.",
"enabled": true,
"isSystemDefault": true,
"recognizerConfig": {
"type": "predefined",
"name": "SgFinRecognizer",
"supportedLanguage": "en"
},
"confidenceThreshold": 0.6,
"target": "content"
},
{
"name": "AuAbnRecognizer",
"displayName": "Au Abn Recognizer",
"description": "Recognizes Australian Business Number (\\\"ABN\\\").<br/><br/>The Australian Business Number (ABN) is a unique 11 digit identifier issued to all entities registered in the Australian Business Register (ABR). The 11 digit ABN is structured as a 9 digit identifier<br/><br/>with two leading check digits.<br/><br/>The leading check digits are derived using a modulus 89 calculation.<br/><br/>This recognizer identifies ABN using regex, context words and checksum.<br/><br/>Reference: https://abr.business.gov.au/Help/AbnFormat",
"enabled": true,
"isSystemDefault": true,
"recognizerConfig": {
"type": "predefined",
"name": "AuAbnRecognizer",
"supportedLanguage": "en"
},
"confidenceThreshold": 0.6,
"target": "content"
},
{
"name": "AuAcnRecognizer",
"displayName": "Au Acn Recognizer",
"description": "Recognizes Australian Company Number (\\\"ACN\\\").<br/><br/>The Australian Company Number (ACN) is a nine digit number with the last digit being a check digit calculated using a modified modulus 10 calculation.<br/><br/>This recognizer identifies ACN using regex, context words, and checksum.<br/><br/>Reference: https://asic.gov.au/",
"enabled": true,
"isSystemDefault": true,
"recognizerConfig": {
"type": "predefined",
"name": "AuAcnRecognizer",
"supportedLanguage": "en"
},
"confidenceThreshold": 0.6,
"target": "content"
},
{
"name": "AuTfnRecognizer",
"displayName": "Au Tfn Recognizer",
"description": "Recognizes Australian Tax File Numbers (\\\"TFN\\\").<br/><br/>The tax file number (TFN) is a unique identifier issued by the Australian Taxation Office to each taxpaying entity, an individual, company,<br/><br/>superannuation fund, partnership, or trust.<br/><br/>The TFN consists of a nine digit number, usually presented in the format NNN NNN NNN.<br/><br/>TFN includes a check digit for detecting erroneous number based on simple modulo 11.<br/><br/>This recognizer uses regex, context words,<br/><br/>and checksum to identify TFN.<br/><br/>Reference: https://www.ato.gov.au/individuals/tax-file-number/",
"enabled": true,
"isSystemDefault": true,
"recognizerConfig": {
"type": "predefined",
"name": "AuTfnRecognizer",
"supportedLanguage": "en"
},
"confidenceThreshold": 0.6,
"target": "content"
},
{
"name": "AuMedicareRecognizer",
"displayName": "Au Medicare Recognizer",
"description": "Recognizes Australian Medicare number using regex, context words, and checksum.<br/><br/>Medicare number is a unique identifier issued by Australian Government that enables the cardholder to receive rebates of medical expenses under Australia's Medicare system.<br/><br/>It uses a modulus 10 checksum scheme to validate the number.<br/><br/>Reference: https://en.wikipedia.org/wiki/Medicare_card_(Australia)",
"enabled": true,
"isSystemDefault": true,
"recognizerConfig": {
"type": "predefined",
"name": "AuMedicareRecognizer",
"supportedLanguage": "en"
},
"confidenceThreshold": 0.6,
"target": "content"
},
{
"name": "InPanRecognizer",
"displayName": "In Pan Recognizer",
"description": "Recognizes Indian Permanent Account Number (\\\"PAN\\\").<br/><br/>The Permanent Account Number (PAN) is a ten digit alpha-numeric code with the last digit being a check digit calculated using a modified modulus 10 calculation.<br/><br/>This recognizer identifies PAN using regex and context words.<br/><br/>Reference: https://en.wikipedia.org/wiki/Permanent_account_number<br/><br/>https://incometaxindia.gov.in/Forms/tps/1.Permanent%20Account%20Number%20(PAN).pdf",
"enabled": true,
"isSystemDefault": true,
"recognizerConfig": {
"type": "predefined",
"name": "InPanRecognizer",
"supportedLanguage": "en"
},
"confidenceThreshold": 0.6,
"target": "content"
},
{
"name": "InAadhaarRecognizer",
"displayName": "In Aadhaar Recognizer",
"description": "Recognizes Indian UIDAI Person Identification Number (\\\"AADHAAR\\\").<br/><br/>Reference: https://en.wikipedia.org/wiki/Aadhaar<br/><br/>A 12 digit unique number that is issued to each individual by Government of India",
"enabled": true,
"isSystemDefault": true,
"recognizerConfig": {
"type": "predefined",
"name": "InAadhaarRecognizer",
"supportedLanguage": "en"
},
"confidenceThreshold": 0.6,
"target": "content"
},
{
"name": "InVehicleRegistrationRecognizer",
"displayName": "In Vehicle Registration Recognizer",
"description": "Recognizes Indian Vehicle Registration Number issued by RTO.<br/><br/>Reference(s):<br/><br/>https://en.wikipedia.org/wiki/Vehicle_registration_plates_of_India<br/><br/>https://en.wikipedia.org/wiki/Regional_Transport_Office<br/><br/>https://en.wikipedia.org/wiki/List_of_Regional_Transport_Office_districts_in_India<br/><br/>The registration scheme changed over time with multiple formats in play over the years<br/><br/>India has multiple active patterns for registration plates issued to different vehicle categories",
"enabled": true,
"isSystemDefault": true,
"recognizerConfig": {
"type": "predefined",
"name": "InVehicleRegistrationRecognizer",
"supportedLanguage": "en"
},
"confidenceThreshold": 0.6,
"target": "content"
},
{
"name": "InPassportRecognizer",
"displayName": "In Passport Recognizer",
"description": "Recognizes Indian Passport Number.<br/><br/>Indian Passport Number is an eight digit alphanumeric number.<br/><br/>Reference:<br/><br/>https://www.bajajallianz.com/blog/travel-insurance-articles/where-is-passport-number-in-indian-passport.html",
"enabled": true,
"isSystemDefault": true,
"recognizerConfig": {
"type": "predefined",
"name": "InPassportRecognizer",
"supportedLanguage": "en"
},
"confidenceThreshold": 0.6,
"target": "content"
},
{
"name": "EsNifRecognizer",
"displayName": "Es Nif Recognizer",
"description": "Recognize NIF number using regex and checksum.",
"enabled": true,
"isSystemDefault": true,
"recognizerConfig": {
"type": "predefined",
"name": "EsNifRecognizer",
"supportedLanguage": "es"
},
"confidenceThreshold": 0.6,
"target": "content"
},
{
"name": "EsNieRecognizer",
"displayName": "Es Nie Recognizer",
"description": "Recognize NIE number using regex and checksum.<br/><br/>Reference(s):<br/><br/>https://es.wikipedia.org/wiki/N%C3%BAmero_de_identidad_de_extranjero<br/><br/>https://www.interior.gob.es/opencms/ca/servicios-al-ciudadano/tramites-y-gestiones/dni/calculo-del-digito-de-control-del-nif-nie/",
"enabled": true,
"isSystemDefault": true,
"recognizerConfig": {
"type": "predefined",
"name": "EsNieRecognizer",
"supportedLanguage": "es"
},
"confidenceThreshold": 0.6,
"target": "content"
},
{
"name": "ItDriverLicenseRecognizer",
"displayName": "It Driver License Recognizer",
"description": "Recognizes IT Driver License using regex.",
"enabled": true,
"isSystemDefault": true,
"recognizerConfig": {
"type": "predefined",
"name": "ItDriverLicenseRecognizer",
"supportedLanguage": "it"
},
"confidenceThreshold": 0.6,
"target": "content"
},
{
"name": "ItFiscalCodeRecognizer",
"displayName": "It Fiscal Code Recognizer",
"description": "Recognizes IT Fiscal Code using regex.",
"enabled": true,
"isSystemDefault": true,
"recognizerConfig": {
"type": "predefined",
"name": "ItFiscalCodeRecognizer",
"supportedLanguage": "it"
},
"confidenceThreshold": 0.6,
"target": "content"
},
{
"name": "ItVatCodeRecognizer",
"displayName": "It Vat Code Recognizer",
"description": "Recognizes Italian VAT code using regex and checksum.<br/><br/>For more information about italian VAT code:<br/><br/>https://en.wikipedia.org/wiki/VAT_identification_number#:~:text=%5B2%5D)-,Italy,-Partita%20IVA",
"enabled": true,
"isSystemDefault": true,
"recognizerConfig": {
"type": "predefined",
"name": "ItVatCodeRecognizer",
"supportedLanguage": "it"
},
"confidenceThreshold": 0.6,
"target": "content"
},
{
"name": "ItIdentityCardRecognizer",
"displayName": "It Identity Card Recognizer",
"description": "Recognizes Italian Identity Card number using case-insensitive regex.",
"enabled": true,
"isSystemDefault": true,
"recognizerConfig": {
"type": "predefined",
"name": "ItIdentityCardRecognizer",
"supportedLanguage": "it"
},
"confidenceThreshold": 0.6,
"target": "content"
},
{
"name": "ItPassportRecognizer",
"displayName": "It Passport Recognizer",
"description": "Recognizes IT Passport number using case-insensitive regex.",
"enabled": true,
"isSystemDefault": true,
"recognizerConfig": {
"type": "predefined",
"name": "ItPassportRecognizer",
"supportedLanguage": "it"
},
"confidenceThreshold": 0.6,
"target": "content"
},
{
"name": "PlPeselRecognizer",
"displayName": "Pl Pesel Recognizer",
"description": "Recognize PESEL number using regex and checksum.<br/><br/>For more information about PESEL: https://en.wikipedia.org/wiki/PESEL",
"enabled": true,
"isSystemDefault": true,
"recognizerConfig": {
"type": "predefined",
"name": "PlPeselRecognizer",
"supportedLanguage": "pl"
},
"confidenceThreshold": 0.6,
"target": "content"
},
{
"name": "CryptoRecognizer",
"displayName": "Crypto Recognizer",
"description": "Recognize common crypto account numbers using regex + checksum.",
"enabled": true,
"isSystemDefault": true,
"recognizerConfig": {
"type": "predefined",
"name": "CryptoRecognizer"
},
"confidenceThreshold": 0.6,
"target": "content"
},
{
"name": "EmailRecognizer",
"displayName": "Email Recognizer",
"description": "Recognize email addresses using regex.",
"enabled": true,
"isSystemDefault": true,
"recognizerConfig": {
"type": "predefined",
"name": "EmailRecognizer"
},
"confidenceThreshold": 0.6,
"target": "content"
},
{
"name": "IbanRecognizer",
"displayName": "Iban Recognizer",
"description": "Recognize IBAN code using regex and checksum.",
"enabled": true,
"isSystemDefault": true,
"recognizerConfig": {
"type": "predefined",
"name": "IbanRecognizer"
},
"confidenceThreshold": 0.6,
"target": "content"
},
{
"name": "IpRecognizer",
"displayName": "Ip Recognizer",
"description": "Recognize IP address using regex.",
"enabled": true,
"isSystemDefault": true,
"recognizerConfig": {
"type": "predefined",
"name": "IpRecognizer"
},
"confidenceThreshold": 0.6,
"target": "content"
},
{
"name": "MedicalLicenseRecognizer",
"displayName": "Medical License Recognizer",
"description": "Recognize common Medical license numbers using regex + checksum.",
"enabled": true,
"isSystemDefault": true,
"recognizerConfig": {
"type": "predefined",
"name": "MedicalLicenseRecognizer"
},
"confidenceThreshold": 0.6,
"target": "content"
},
{
"name": "InVoterRecognizer",
"displayName": "In Voter Recognizer",
"description": "Recognize Indian Voter/Election Id(EPIC).<br/><br/>The Elector's Photo Identity Card or Voter id is a ten digit alpha-numeric code issued by Election Commission of India to adult domiciles who have reached the age of 18<br/><br/>Ref: https://en.wikipedia.org/wiki/Voter_ID_(India)",
"enabled": true,
"isSystemDefault": true,
"recognizerConfig": {
"type": "predefined",
"name": "InVoterRecognizer"
},
"confidenceThreshold": 0.6,
"target": "content"
},
{
"name": "AbaRoutingRecognizer",
"displayName": "ABA Routing Recognizer",
"description": "Recognize American Banking Association (ABA) routing number.<br/><br/>Also known as routing transit number (RTN) and used to identify financial institutions and process transactions.",
"enabled": true,
"isSystemDefault": true,
"recognizerConfig": {
"type": "predefined",
"name": "AbaRoutingRecognizer"
},
"confidenceThreshold": 0.6,
"target": "content"
},
{
"name": "FiPersonalIdentityCodeRecognizer",
"displayName": "FI Personal Identity Code Recognizer",
"description": "Recognizes and validates Finnish Personal Identity Codes (Henkilötunnus).",
"enabled": true,
"isSystemDefault": true,
"recognizerConfig": {
"type": "predefined",
"name": "FiPersonalIdentityCodeRecognizer",
"supportedLanguage": "fi"
},
"confidenceThreshold": 0.6,
"target": "content"
},
{
"name": "SgUenRecognizer",
"displayName": "Singaporean UEN recognizer",
"description": "Recognize Singapore UEN (Unique Entity Number) using regex.",
"enabled": true,
"isSystemDefault": true,
"recognizerConfig": {
"type": "predefined",
"name": "SgUenRecognizer"
},
"confidenceThreshold": 0.6,
"target": "content"
},
{
"name": "SpacyRecognizer",
"displayName": "Recognizer using spaCy NLP model",
"description": "Recognize PII entities using a spaCy NLP model.",
"enabled": true,
"isSystemDefault": true,
"recognizerConfig": {
"type": "predefined",
"name": "SpacyRecognizer",
"supportedEntities": [
"PERSON"
]
},
"confidenceThreshold": 0.6,
"target": "content"
},
{
"displayName": "US SSN column name",
"name": "us_ssn",
"description": "A regex recognizer for column names",
"enabled": true,
"isSystemDefault": true,
"recognizerConfig": {
"type": "pattern",
"supportedLanguage": "en",
"patterns": [
{
"name": "us_ssn_pattern_0",
"regex": "^.*(ssn|social).*$",
"score": 0.6
}
],
"supportedEntity": "US_SSN",
"regexFlags": {
"dotAll": true,
"multiline": true,
"ignoreCase": true
},
"context": []
},
"confidenceThreshold": 0.6,
"target": "column_name"
},
{
"displayName": "Credit card column name",
"name": "credit_card",
"description": "A regex recognizer for column names",
"enabled": true,
"isSystemDefault": true,
"recognizerConfig": {
"type": "pattern",
"supportedLanguage": "en",
"patterns": [
{
"name": "credit_card_pattern_0",
"regex": "^.*(credit).*(card).*$",
"score": 0.6
}
],
"supportedEntity": "CREDIT_CARD",
"regexFlags": {
"dotAll": true,
"multiline": true,
"ignoreCase": true
},
"context": []
},
"confidenceThreshold": 0.6,
"target": "column_name"
},
{
"displayName": "US bank number column name",
"name": "us_bank_number",
"description": "A regex recognizer for column names",
"enabled": true,
"isSystemDefault": true,
"recognizerConfig": {
"type": "pattern",
"supportedLanguage": "en",
"patterns": [
{
"name": "us_bank_number_pattern_0",
"regex": "\\b(account|acct|acc)[_-]?(number|num|no)\\b",
"score": 0.6
},
{
"name": "us_bank_number_pattern_1",
"regex": "\\bbank[_-]?(account|number|num|no)?\\b",
"score": 0.6
}
],
"supportedEntity": "US_BANK_NUMBER",
"regexFlags": {
"dotAll": true,
"multiline": true,
"ignoreCase": true
},
"context": []
},
"confidenceThreshold": 0.6,
"target": "column_name"
},
{
"displayName": "Iban code column name",
"name": "iban_code",
"description": "A regex recognizer for column names",
"enabled": true,
"isSystemDefault": true,
"recognizerConfig": {
"type": "pattern",
"supportedLanguage": "en",
"patterns": [
{
"name": "iban_code_pattern_0",
"regex": "\\b(account|acct|acc)[_-]?(number|num|no)\\b",
"score": 0.6
},
{
"name": "iban_code_pattern_1",
"regex": "\\bbank[_-]?(account|number|num|no)?\\b",
"score": 0.6
},
{
"name": "iban_code_pattern_2",
"regex": "\\biban(?:[_]?(number|code))?\\b",
"score": 0.6
},
{
"name": "iban_code_pattern_3",
"regex": "\\bbank[_]?iban\\b",
"score": 0.6
},
{
"name": "iban_code_pattern_4",
"regex": "\\binternational[_]?(account|bank[_]?number)\\b",
"score": 0.6
}
],
"supportedEntity": "IBAN_CODE",
"regexFlags": {
"dotAll": true,
"multiline": true,
"ignoreCase": true
},
"context": []
},
"confidenceThreshold": 0.6,
"target": "column_name"
},
{
"displayName": "Email address column name",
"name": "email_address",
"description": "A regex recognizer for column names",
"enabled": true,
"isSystemDefault": true,
"recognizerConfig": {
"type": "pattern",
"supportedLanguage": "en",
"patterns": [
{
"name": "email_address_pattern_0",
"regex": "^(email|e-mail|mail)(.*address)?$",
"score": 0.6
}
],
"supportedEntity": "EMAIL_ADDRESS",
"regexFlags": {
"dotAll": true,
"multiline": true,
"ignoreCase": true
},
"context": []
},
"confidenceThreshold": 0.6,
"target": "column_name"
},
{
"displayName": "Person column name",
"name": "person",
"description": "A regex recognizer for column names",
"enabled": true,
"isSystemDefault": true,
"recognizerConfig": {
"type": "pattern",
"supportedLanguage": "en",
"patterns": [
{
"name": "person_pattern_0",
"regex": "^.*(user|client|person|first|last|maiden|nick).*(name).*$",
"score": 0.6
}
],
"supportedEntity": "PERSON",
"regexFlags": {
"dotAll": true,
"multiline": true,
"ignoreCase": true
},
"context": []
},
"confidenceThreshold": 0.6,
"target": "column_name"
}
]

View File

@ -219,6 +219,8 @@ class PresidioRecognizerFactory:
args["supported_language"] = supported_language
if context := config.context:
args["context"] = context
if supported_entities := config.supportedEntities:
args["supported_entities"] = [entity.value for entity in supported_entities]
return predefined_class(**args)

View File

@ -14,7 +14,8 @@ from metadata.pii.tag_processor import TagAnalyzerGenerator, TagProcessor
def create_pii_processor(
metadata: OpenMetadata[Any, Any], openmetadata_config: OpenMetadataWorkflowConfig
) -> AutoClassificationProcessor:
if getattr(openmetadata_config.processor, "type") == "tag-pii-processor":
processor_type = getattr(openmetadata_config.processor, "type", "tag-pii-processor")
if processor_type == "tag-pii-processor":
return TagProcessor(
config=parse_workflow_config_gracefully(openmetadata_config.model_dump()),
metadata=metadata,

View File

@ -68,7 +68,7 @@ def build_auto_classification_workflow_config(
config={},
),
processor=Processor(
type="orm-profiler",
type="tag-pii-processor",
config={},
),
workflowConfig=WorkflowConfig(

View File

@ -74,6 +74,8 @@ public interface MigrationProcess {
String getPostDDLScriptFilePath();
String getMigrationsDir();
// Handle non-transactional SQL here, for example changes to table structure (DDL).
Map<String, QueryStatus> runSchemaChanges(boolean isForceMigration);

View File

@ -91,6 +91,11 @@ public class MigrationProcessImpl implements MigrationProcess {
return migrationFile.getPostDDLScriptFile();
}
/**
 * Returns the migrations directory for this process.
 *
 * @return the value of {@code migrationFile.getDirPath()}
 */
@Override
public String getMigrationsDir() {
  final String migrationsDir = migrationFile.getDirPath();
  return migrationsDir;
}
@Override
public Map<String, QueryStatus> runSchemaChanges(boolean isForceMigration) {
return performSqlExecutionAndUpdate(

View File

@ -0,0 +1,18 @@
package org.openmetadata.service.migration.mysql.v1110;
import lombok.extern.slf4j.Slf4j;
import org.openmetadata.service.migration.utils.MigrationFile;
import org.openmetadata.service.migration.utils.v1110.MigrationProcessBase;
/**
 * MySQL variant of the v1.11.0 migration; it only supplies the MySQL-specific UPDATE template
 * and inherits everything else from {@link MigrationProcessBase}.
 */
@Slf4j
public class Migration extends MigrationProcessBase {

  /**
   * {@link String#format} template with two {@code %s} placeholders, in order: the JSON text
   * cast to JSON and stored at {@code $.recognizers}, and the fully qualified name of the tag
   * row to update.
   *
   * <p>NOTE(review): MySQL interprets backslash escape sequences inside quoted string literals
   * unless the NO_BACKSLASH_ESCAPES SQL mode is enabled. Confirm that payloads containing
   * backslashes (for example regex word boundaries) survive being placed in the first
   * placeholder unchanged.
   */
  private static final String UPDATE_RECOGNIZERS_FORMAT =
      "UPDATE tag "
          + "SET json = JSON_SET(json, '$.recognizers', CAST('%s' AS JSON)) "
          + "WHERE JSON_EXTRACT(json, '$.fullyQualifiedName') = '%s'";

  public Migration(MigrationFile migrationFile) {
    super(migrationFile);
  }

  /** Returns the MySQL UPDATE template used to set a tag's recognizers. */
  @Override
  protected String getQueryFormat() {
    return UPDATE_RECOGNIZERS_FORMAT;
  }
}

View File

@ -0,0 +1,18 @@
package org.openmetadata.service.migration.postgres.v1110;
import lombok.extern.slf4j.Slf4j;
import org.openmetadata.service.migration.utils.MigrationFile;
import org.openmetadata.service.migration.utils.v1110.MigrationProcessBase;
/**
 * PostgreSQL variant of the v1.11.0 migration; it only supplies the PostgreSQL-specific UPDATE
 * template and inherits everything else from {@link MigrationProcessBase}.
 */
@Slf4j
public class Migration extends MigrationProcessBase {

  /**
   * {@link String#format} template with two {@code %s} placeholders, in order: the JSON text
   * cast to {@code jsonb} and stored under the {@code recognizers} key, and the fully qualified
   * name of the tag row to update.
   */
  private static final String UPDATE_RECOGNIZERS_FORMAT =
      "UPDATE tag "
          + "SET json = jsonb_set(json, '{recognizers}', '%s'::jsonb) "
          + "WHERE json->>'fullyQualifiedName' = '%s'";

  public Migration(MigrationFile migrationFile) {
    super(migrationFile);
  }

  /** Returns the PostgreSQL UPDATE template used to set a tag's recognizers. */
  @Override
  protected String getQueryFormat() {
    return UPDATE_RECOGNIZERS_FORMAT;
  }
}

View File

@ -154,6 +154,10 @@ public class MigrationFile implements Comparable<MigrationFile> {
return postDDLScripts;
}
/**
 * Returns the absolute path of this migration file's {@code dir}.
 *
 * @return the result of {@code dir.getAbsolutePath()}
 */
public String getDirPath() {
  return dir.getAbsolutePath();
}
private int[] convertToNumber(String version) {
final String[] split = version.split("\\-")[0].split("\\.");
int[] numbers = new int[split.length];

View File

@ -0,0 +1,77 @@
package org.openmetadata.service.migration.utils.v1110;
import static org.openmetadata.service.util.EntityUtil.hash;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Map;
import lombok.extern.slf4j.Slf4j;
import org.openmetadata.service.migration.QueryStatus;
import org.openmetadata.service.migration.api.MigrationProcessImpl;
import org.openmetadata.service.migration.utils.MigrationFile;
/**
 * Shared logic for the v1.11.0 migration that sets the {@code recognizers} attribute of the
 * {@code PII.Sensitive} and {@code PII.NonSensitive} tags from JSON data files stored under the
 * migration directory.
 *
 * <p>Subclasses provide the database-specific UPDATE statement through {@link
 * #getQueryFormat()}. The JSON payload is replaced by a short placeholder wherever the statement
 * is recorded (server migration log and returned status map), so that only the executed
 * statement carries the full data.
 */
@Slf4j
public abstract class MigrationProcessBase extends MigrationProcessImpl {
  /** Tag fully qualified name mapped to its data file, relative to the migrations directory. */
  private static final Map<String, String> PATH_BY_TAG =
      Map.of(
          "PII.Sensitive", "data/tags/Sensitive.json",
          "PII.NonSensitive", "data/tags/NonSensitive.json");

  /** Stand-in for the JSON payload in every recorded (non-executed) form of the statement. */
  private static final String TRUNCATED_PAYLOAD = "[ ... data truncated ... ]";

  public MigrationProcessBase(MigrationFile migrationFile) {
    super(migrationFile);
  }

  /**
   * Runs the regular post-DDL scripts and then updates the recognizers of every tag listed in
   * {@link #PATH_BY_TAG}.
   *
   * @param isForceMigration forwarded to the parent implementation and to the tag updates
   * @return the parent's status map, extended with one entry per tag update that was attempted
   */
  @Override
  public Map<String, QueryStatus> runPostDDLScripts(boolean isForceMigration) {
    Map<String, QueryStatus> result = super.runPostDDLScripts(isForceMigration);
    PATH_BY_TAG.forEach(
        (tagFqn, relativePath) -> {
          try {
            updateTagRecognizers(tagFqn, relativePath, result, isForceMigration);
          } catch (Exception e) {
            // NOTE(review): this also catches the RuntimeException that updateTagRecognizers
            // throws for non-forced runs, so a failed update is logged and recorded as FAILURE
            // in the result map but never aborts the migration. Confirm this is intended.
            LOG.error("Failed to update recognizers for tag: {}", tagFqn, e);
          }
        });
    return result;
  }

  /**
   * Reads the JSON data file of one tag and writes it into the tag row.
   *
   * @param tagFqn fully qualified name of the tag to update
   * @param relativePath data file location, relative to the migrations directory
   * @param results status map that receives the outcome, keyed by the truncated statement
   * @param isForceMigration when {@code false}, an execution failure is rethrown
   * @throws IOException if the data file cannot be read
   * @throws RuntimeException if the statement fails and {@code isForceMigration} is false
   */
  private void updateTagRecognizers(
      String tagFqn,
      String relativePath,
      Map<String, QueryStatus> results,
      boolean isForceMigration)
      throws IOException {
    Path dataPath = Paths.get(this.getMigrationsDir(), relativePath);
    if (!Files.exists(dataPath)) {
      LOG.warn("Tag data file not found: {}", dataPath);
      return;
    }

    String jsonContent = Files.readString(dataPath);
    String queryFormat = getQueryFormat();
    // Single quotes are doubled because the payload is spliced into a quoted SQL literal.
    String updateQuery = String.format(queryFormat, jsonContent.replace("'", "''"), tagFqn);
    // Same statement without the payload; this is the only form that gets recorded.
    String truncatedQuery = String.format(queryFormat, TRUNCATED_PAYLOAD, tagFqn);

    try {
      handle.execute(updateQuery);
      migrationDAO.upsertServerMigrationSQL(getVersion(), truncatedQuery, hash(truncatedQuery));
      // Keyed by the truncated statement, like the failure branch, so the status map does not
      // carry the whole JSON payload as a key.
      results.put(
          truncatedQuery,
          new QueryStatus(QueryStatus.Status.SUCCESS, "Successfully Executed Query"));
    } catch (Exception e) {
      String message = String.format("Failed to run sql: [%s] due to [%s]", truncatedQuery, e);
      results.put(truncatedQuery, new QueryStatus(QueryStatus.Status.FAILURE, message));
      if (!isForceMigration) {
        throw new RuntimeException(message, e);
      }
    }
  }

  /**
   * Returns the database-specific UPDATE template.
   *
   * @return a {@link String#format} template with two {@code %s} placeholders: first the JSON
   *     payload, then the tag's fully qualified name
   */
  protected abstract String getQueryFormat();
}

View File

@ -63,6 +63,24 @@
"confidenceThreshold": 0.6,
"target": "content"
},
{
"name": "SpacyRecognizer",
"displayName": "Recognizer using spaCy NLP model",
"description": "Recognize PII entities using a spaCy NLP model.",
"enabled": true,
"isSystemDefault": true,
"recognizerConfig": {
"type": "predefined",
"name": "SpacyRecognizer",
"supportedEntities": [
"DATE_TIME",
"NRP",
"LOCATION"
]
},
"confidenceThreshold": 0.6,
"target": "content"
},
{
"displayName": "Date time column name",
"name": "date_time",
@ -724,7 +742,10 @@
"isSystemDefault": true,
"recognizerConfig": {
"type": "predefined",
"name": "SpacyRecognizer"
"name": "SpacyRecognizer",
"supportedEntities": [
"PERSON"
]
},
"confidenceThreshold": 0.6,
"target": "content"

View File

@ -70,6 +70,13 @@
"type": "string"
},
"default": []
},
"supportedEntities": {
"description": "PII (Personally Identifiable Information) tags for classification and detection of sensitive data",
"type": "array",
"items": {
"$ref": "piiEntity.json"
}
}
},
"required": ["type", "name"],

View File

@ -289,6 +289,11 @@ export interface RecognizerConfig {
* Name of the recognizer (defaults to class name if not provided)
*/
name?: Name;
/**
* PII (Personally Identifiable Information) tags for classification and detection of
* sensitive data
*/
supportedEntities?: PIIEntity[];
}
/**

View File

@ -261,6 +261,11 @@ export interface RecognizerConfig {
* Name of the recognizer (defaults to class name if not provided)
*/
name?: Name;
/**
* PII (Personally Identifiable Information) tags for classification and detection of
* sensitive data
*/
supportedEntities?: PIIEntity[];
}
/**

View File

@ -369,6 +369,11 @@ export interface RecognizerConfig {
* Name of the recognizer (defaults to class name if not provided)
*/
name?: Name;
/**
* PII (Personally Identifiable Information) tags for classification and detection of
* sensitive data
*/
supportedEntities?: PIIEntity[];
}
/**

View File

@ -435,6 +435,11 @@ export interface RecognizerConfig {
* Name of the recognizer (defaults to class name if not provided)
*/
name?: Name;
/**
* PII (Personally Identifiable Information) tags for classification and detection of
* sensitive data
*/
supportedEntities?: PIIEntity[];
}
/**

View File

@ -23,6 +23,11 @@ export interface PredefinedRecognizer {
* Name of the recognizer (defaults to class name if not provided)
*/
name: Name;
/**
* PII (Personally Identifiable Information) tags for classification and detection of
* sensitive data
*/
supportedEntities?: PIIEntity[];
/**
* Language supported by this recognizer (ISO 639-1 code)
*/
@ -77,3 +82,49 @@ export enum Name {
UsPassportRecognizer = "UsPassportRecognizer",
UsSsnRecognizer = "UsSsnRecognizer",
}
/**
 * Enum of PII (Personally Identifiable Information) tags for classification and detection
 * of sensitive data. Based on Presidio supported entities
 * (https://microsoft.github.io/presidio/supported_entities/).
 *
 * Every member is a string enum entry whose value is the upper-snake-case entity
 * identifier (for example `CreditCard` maps to "CREDIT_CARD"). It is the element type of
 * the optional `supportedEntities` list on recognizer configurations.
 */
export enum PIIEntity {
AuAbn = "AU_ABN",
AuAcn = "AU_ACN",
AuMedicare = "AU_MEDICARE",
AuTfn = "AU_TFN",
CreditCard = "CREDIT_CARD",
Crypto = "CRYPTO",
DateTime = "DATE_TIME",
EmailAddress = "EMAIL_ADDRESS",
EsNie = "ES_NIE",
EsNif = "ES_NIF",
FiPersonalIdentityCode = "FI_PERSONAL_IDENTITY_CODE",
IPAddress = "IP_ADDRESS",
IbanCode = "IBAN_CODE",
InAadhaar = "IN_AADHAAR",
InPan = "IN_PAN",
InPassport = "IN_PASSPORT",
InVehicleRegistration = "IN_VEHICLE_REGISTRATION",
InVoter = "IN_VOTER",
ItDriverLicense = "IT_DRIVER_LICENSE",
ItFiscalCode = "IT_FISCAL_CODE",
ItIdentityCard = "IT_IDENTITY_CARD",
ItPassport = "IT_PASSPORT",
ItVatCode = "IT_VAT_CODE",
Location = "LOCATION",
MedicalLicense = "MEDICAL_LICENSE",
Nrp = "NRP",
Person = "PERSON",
PhoneNumber = "PHONE_NUMBER",
PlPesel = "PL_PESEL",
SgNricFin = "SG_NRIC_FIN",
SgUen = "SG_UEN",
URL = "URL",
UkNhs = "UK_NHS",
UsBankNumber = "US_BANK_NUMBER",
UsDriverLicense = "US_DRIVER_LICENSE",
UsItin = "US_ITIN",
UsPassport = "US_PASSPORT",
UsSsn = "US_SSN",
}

View File

@ -206,6 +206,11 @@ export interface RecognizerConfig {
* Name of the recognizer (defaults to class name if not provided)
*/
name?: Name;
/**
* PII (Personally Identifiable Information) tags for classification and detection of
* sensitive data
*/
supportedEntities?: PIIEntity[];
}
/**