From 9eb6b2d68dd72efdfeb36fa49b43d6b4346fc3f4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Felix=20L=C3=BCdin?= <13187726+Masterchen09@users.noreply.github.com>
Date: Tue, 16 Apr 2024 21:48:48 +0200
Subject: [PATCH] fix(ingest): improve performance of get_allowed_list in AllowDenyPattern when dealing with large lists (#10219)

---
 .../src/datahub/configuration/common.py | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/metadata-ingestion/src/datahub/configuration/common.py b/metadata-ingestion/src/datahub/configuration/common.py
index dabf9f2006..7aaa1706a6 100644
--- a/metadata-ingestion/src/datahub/configuration/common.py
+++ b/metadata-ingestion/src/datahub/configuration/common.py
@@ -243,15 +243,21 @@ class AllowDenyPattern(ConfigModel):
         return AllowDenyPattern()
 
     def allowed(self, string: str) -> bool:
-        for deny_pattern in self.deny:
-            if re.match(deny_pattern, string, self.regex_flags):
-                return False
+        if self._denied(string):
+            return False
 
         return any(
             re.match(allow_pattern, string, self.regex_flags)
             for allow_pattern in self.allow
         )
 
+    def _denied(self, string: str) -> bool:
+        for deny_pattern in self.deny:
+            if re.match(deny_pattern, string, self.regex_flags):
+                return True
+
+        return False
+
     def is_fully_specified_allow_list(self) -> bool:
         """
         If the allow patterns are literals and not full regexes, then it is considered
@@ -265,8 +271,11 @@ class AllowDenyPattern(ConfigModel):
 
     def get_allowed_list(self) -> List[str]:
         """Return the list of allowed strings as a list, after taking into account deny patterns, if possible"""
-        assert self.is_fully_specified_allow_list()
-        return [a for a in self.allow if self.allowed(a)]
+        if not self.is_fully_specified_allow_list():
+            raise ValueError(
+                "allow list must be fully specified to get list of allowed strings"
+            )
+        return [a for a in self.allow if not self._denied(a)]
 
     def __eq__(self, other):  # type: ignore
         return isinstance(other, self.__class__) and self.__dict__ == other.__dict__