| 
									
										
										
										
											2023-05-17 15:58:17 +02:00
										 |  |  | #  Copyright 2021 Collate | 
					
						
							|  |  |  | #  Licensed under the Apache License, Version 2.0 (the "License"); | 
					
						
							|  |  |  | #  you may not use this file except in compliance with the License. | 
					
						
							|  |  |  | #  You may obtain a copy of the License at | 
					
						
							|  |  |  | #  http://www.apache.org/licenses/LICENSE-2.0 | 
					
						
							|  |  |  | #  Unless required by applicable law or agreed to in writing, software | 
					
						
							|  |  |  | #  distributed under the License is distributed on an "AS IS" BASIS, | 
					
						
							|  |  |  | #  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | 
					
						
							|  |  |  | #  See the License for the specific language governing permissions and | 
					
						
							|  |  |  | #  limitations under the License. | 
					
						
							|  |  |  | """
 | 
					
						
							|  |  |  | Test NER Scanner | 
					
						
							|  |  |  | """
 | 
					
						
							| 
									
										
										
										
											2024-09-06 08:54:23 +02:00
										 |  |  | from typing import Any | 
					
						
							| 
									
										
										
										
											2023-05-17 15:58:17 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-09-04 12:11:07 +02:00
										 |  |  | import pytest | 
					
						
							| 
									
										
										
										
											2023-05-17 15:58:17 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-09-04 12:11:07 +02:00
										 |  |  | from metadata.pii.scanners.ner_scanner import NERScanner, StringAnalysis | 
					
						
							| 
									
										
										
										
											2023-05-17 15:58:17 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-09-04 12:11:07 +02:00
										 |  |  | @pytest.fixture | 
					
						
							|  |  |  | def scanner() -> NERScanner: | 
					
						
							|  |  |  |     """Return the scanner""" | 
					
						
							|  |  |  |     return NERScanner() | 
					
						
							| 
									
										
										
										
											2023-05-18 12:53:22 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-09-04 12:11:07 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  | def test_scanner_none(scanner): | 
					
						
							|  |  |  |     assert scanner.scan(list(range(100))) is None | 
					
						
							|  |  |  |     assert ( | 
					
						
							|  |  |  |         scanner.scan( | 
					
						
							|  |  |  |             " ".split( | 
					
						
							|  |  |  |                 "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nam consequat quam sagittis convallis cursus." | 
					
						
							| 
									
										
										
										
											2023-05-17 15:58:17 +02:00
										 |  |  |             ) | 
					
						
							|  |  |  |         ) | 
					
						
							| 
									
										
										
										
											2024-09-04 12:11:07 +02:00
										 |  |  |     ) is None | 
					
						
							| 
									
										
										
										
											2023-05-17 15:58:17 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-05-19 18:21:01 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-09-04 12:11:07 +02:00
										 |  |  | def test_scanner_sensitive(scanner): | 
					
						
							|  |  |  |     assert ( | 
					
						
							|  |  |  |         scanner.scan( | 
					
						
							|  |  |  |             [ | 
					
						
							|  |  |  |                 "geraldc@gmail.com", | 
					
						
							|  |  |  |                 "saratimithi@godesign.com", | 
					
						
							|  |  |  |                 "heroldsean@google.com", | 
					
						
							|  |  |  |             ] | 
					
						
							|  |  |  |         ).tag_fqn | 
					
						
							|  |  |  |         == "PII.Sensitive" | 
					
						
							|  |  |  |     ) | 
					
						
							|  |  |  |     assert ( | 
					
						
							|  |  |  |         scanner.scan(["im ok", "saratimithi@godesign.com", "not sensitive"]).tag_fqn | 
					
						
							|  |  |  |         == "PII.Sensitive" | 
					
						
							|  |  |  |     ) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def test_scanner_nonsensitive(scanner): | 
					
						
							|  |  |  |     assert ( | 
					
						
							|  |  |  |         scanner.scan( | 
					
						
							|  |  |  |             [ | 
					
						
							|  |  |  |                 "Washington", | 
					
						
							|  |  |  |                 "Alaska", | 
					
						
							|  |  |  |                 "Netherfield Lea Street", | 
					
						
							|  |  |  |             ] | 
					
						
							|  |  |  |         ).tag_fqn | 
					
						
							|  |  |  |         == "PII.NonSensitive" | 
					
						
							|  |  |  |     ) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def test_get_highest_score_label(scanner): | 
					
						
							|  |  |  |     """Validate that even with score clashes, we only get one result back""" | 
					
						
							|  |  |  |     assert scanner.get_highest_score_label( | 
					
						
							|  |  |  |         { | 
					
						
							|  |  |  |             "PII.Sensitive": StringAnalysis(score=0.9, appearances=1), | 
					
						
							|  |  |  |             "PII.NonSensitive": StringAnalysis(score=0.8, appearances=1), | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  |     ) == ("PII.Sensitive", 0.9) | 
					
						
							|  |  |  |     assert scanner.get_highest_score_label( | 
					
						
							|  |  |  |         { | 
					
						
							|  |  |  |             "PII.Sensitive": StringAnalysis(score=1.0, appearances=1), | 
					
						
							|  |  |  |             "PII.NonSensitive": StringAnalysis(score=1.0, appearances=1), | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  |     ) == ("PII.Sensitive", 1.0) | 
					
						
							| 
									
										
										
										
											2024-09-06 08:54:23 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | @pytest.mark.parametrize( | 
					
						
							|  |  |  |     "data,is_json", | 
					
						
							|  |  |  |     [ | 
					
						
							|  |  |  |         ("potato", (False, None)), | 
					
						
							|  |  |  |         ("1", (False, None)), | 
					
						
							|  |  |  |         ('{"key": "value"}', (True, {"key": "value"})), | 
					
						
							|  |  |  |         ( | 
					
						
							|  |  |  |             '{"key": "value", "key2": "value2"}', | 
					
						
							|  |  |  |             (True, {"key": "value", "key2": "value2"}), | 
					
						
							|  |  |  |         ), | 
					
						
							|  |  |  |         ('["potato"]', (True, ["potato"])), | 
					
						
							|  |  |  |     ], | 
					
						
							|  |  |  | ) | 
					
						
							|  |  |  | def test_is_json_data(scanner, data: Any, is_json: bool): | 
					
						
							|  |  |  |     """Assert we are flagging JSON data correctly""" | 
					
						
							|  |  |  |     assert scanner.is_json_data(data) == is_json | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def test_scanner_with_json(scanner): | 
					
						
							|  |  |  |     """Test the scanner with JSON data""" | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     assert ( | 
					
						
							|  |  |  |         scanner.scan( | 
					
						
							|  |  |  |             [ | 
					
						
							|  |  |  |                 '{"email": "johndoe@example.com", "address": {"street": "123 Main St"}}', | 
					
						
							|  |  |  |                 '{"email": "potato", "age": 30, "preferences": {"newsletter": true, "notifications": "email"}}', | 
					
						
							|  |  |  |             ] | 
					
						
							|  |  |  |         ).tag_fqn | 
					
						
							|  |  |  |         == "PII.Sensitive" | 
					
						
							|  |  |  |     ) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     assert ( | 
					
						
							|  |  |  |         scanner.scan( | 
					
						
							|  |  |  |             [ | 
					
						
							|  |  |  |                 '{"email": "foo", "address": {"street": "bar"}}', | 
					
						
							|  |  |  |                 '{"email": "potato", "age": 30, "preferences": {"newsletter": true, "notifications": "email"}}', | 
					
						
							|  |  |  |             ] | 
					
						
							|  |  |  |         ) | 
					
						
							|  |  |  |         is None | 
					
						
							|  |  |  |     ) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def test_scanner_with_lists(scanner): | 
					
						
							|  |  |  |     """Test the scanner with list data""" | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     assert scanner.scan(["foo", "bar", "biz"]) is None | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     assert ( | 
					
						
							|  |  |  |         scanner.scan(["foo", "bar", "johndoe@example.com"]).tag_fqn == "PII.Sensitive" | 
					
						
							|  |  |  |     ) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     assert ( | 
					
						
							|  |  |  |         scanner.scan( | 
					
						
							|  |  |  |             [ | 
					
						
							|  |  |  |                 '{"emails": ["johndoe@example.com", "lima@example.com"]}', | 
					
						
							|  |  |  |                 '{"emails": ["foo", "bar", "biz"]}', | 
					
						
							|  |  |  |             ] | 
					
						
							|  |  |  |         ).tag_fqn | 
					
						
							|  |  |  |         == "PII.Sensitive" | 
					
						
							|  |  |  |     ) |