mirror of
				https://github.com/datahub-project/datahub.git
				synced 2025-11-03 20:27:50 +00:00 
			
		
		
		
	fix(ingest): add support for database and table patterns to glue source (#2339)
This commit is contained in:
		
							parent
							
								
									6e762ce3bc
								
							
						
					
					
						commit
						c1f3eaed35
					
				@ -367,7 +367,8 @@ source:
 | 
			
		||||
  config:
 | 
			
		||||
    aws_region: aws_region_name # i.e. "eu-west-1"
 | 
			
		||||
    env: environment used for the DatasetSnapshot URN, one of "DEV", "EI", "PROD" or "CORP". # Optional, defaults to "PROD".
 | 
			
		||||
    databases: list of databases to process. # Optional, if not specified then all databases will be processed.
 | 
			
		||||
    database_pattern: # Optional, to filter databases scanned, same as schema_pattern above.
 | 
			
		||||
    table_pattern: # Optional, to filter tables scanned, same as table_pattern above.
 | 
			
		||||
    aws_access_key_id # Optional. If not specified, credentials are picked up according to boto3 rules.
 | 
			
		||||
    # See https://boto3.amazonaws.com/v1/documentation/api/latest/guide/credentials.html
 | 
			
		||||
    aws_secret_access_key # Optional.
 | 
			
		||||
 | 
			
		||||
@ -79,10 +79,10 @@ plugins: Dict[str, Set[str]] = {
 | 
			
		||||
    "ldap": {"python-ldap>=2.4"},
 | 
			
		||||
    "druid": sql_common | {"pydruid>=0.6.2"},
 | 
			
		||||
    "mongodb": {"pymongo>=3.11"},
 | 
			
		||||
    "glue": {"boto3"},
 | 
			
		||||
    # Sink plugins.
 | 
			
		||||
    "datahub-kafka": kafka_common,
 | 
			
		||||
    "datahub-rest": {"requests>=2.25.1"},
 | 
			
		||||
    "glue": {"boto3"},
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
dev_requirements = {
 | 
			
		||||
 | 
			
		||||
@ -54,6 +54,11 @@ class AllowDenyPattern(ConfigModel):
 | 
			
		||||
 | 
			
		||||
    allow: List[str] = [".*"]
 | 
			
		||||
    deny: List[str] = []
 | 
			
		||||
    alphabet: str = "[A-Za-z0-9 _.-]"
 | 
			
		||||
 | 
			
		||||
    @property
 | 
			
		||||
    def alphabet_pattern(self):
 | 
			
		||||
        return re.compile(f"^{self.alphabet}+$")
 | 
			
		||||
 | 
			
		||||
    @classmethod
 | 
			
		||||
    def allow_all(cls):
 | 
			
		||||
@ -69,3 +74,20 @@ class AllowDenyPattern(ConfigModel):
 | 
			
		||||
                return True
 | 
			
		||||
 | 
			
		||||
        return False
 | 
			
		||||
 | 
			
		||||
    def is_fully_specified_allow_list(self) -> bool:
 | 
			
		||||
        """
 | 
			
		||||
        If the allow patterns are literals and not full regexes, then it is considered
 | 
			
		||||
        fully specified. This is useful if you want to convert a 'list + filter'
 | 
			
		||||
        pattern into a 'search for the ones that are allowed' pattern, which can be
 | 
			
		||||
        much more efficient in some cases.
 | 
			
		||||
        """
 | 
			
		||||
        for allow_pattern in self.allow:
 | 
			
		||||
            if not self.alphabet_pattern.match(allow_pattern):
 | 
			
		||||
                return False
 | 
			
		||||
        return True
 | 
			
		||||
 | 
			
		||||
    def get_allowed_list(self):
 | 
			
		||||
        """Return the list of allowed strings as a list, after taking into account deny patterns, if possible"""
 | 
			
		||||
        assert self.is_fully_specified_allow_list()
 | 
			
		||||
        return [a for a in self.allow if self.allowed(a)]
 | 
			
		||||
 | 
			
		||||
@ -1,10 +1,12 @@
 | 
			
		||||
import time
 | 
			
		||||
from dataclasses import dataclass
 | 
			
		||||
from dataclasses import field as dataclass_field
 | 
			
		||||
from typing import Dict, Iterable, List, Optional
 | 
			
		||||
 | 
			
		||||
import boto3
 | 
			
		||||
 | 
			
		||||
from datahub.configuration import ConfigModel
 | 
			
		||||
from datahub.configuration.common import AllowDenyPattern
 | 
			
		||||
from datahub.ingestion.api.common import PipelineContext
 | 
			
		||||
from datahub.ingestion.api.source import Source, SourceReport
 | 
			
		||||
from datahub.ingestion.source.metadata_common import MetadataWorkUnit
 | 
			
		||||
@ -38,7 +40,8 @@ from datahub.metadata.schema_classes import (
 | 
			
		||||
 | 
			
		||||
class GlueSourceConfig(ConfigModel):
 | 
			
		||||
    env: str = "PROD"
 | 
			
		||||
    databases: Optional[List[str]] = None
 | 
			
		||||
    database_pattern: AllowDenyPattern = AllowDenyPattern.allow_all()
 | 
			
		||||
    table_pattern: AllowDenyPattern = AllowDenyPattern.allow_all()
 | 
			
		||||
    aws_access_key_id: Optional[str] = None
 | 
			
		||||
    aws_secret_access_key: Optional[str] = None
 | 
			
		||||
    aws_session_token: Optional[str] = None
 | 
			
		||||
@ -72,10 +75,14 @@ class GlueSourceConfig(ConfigModel):
 | 
			
		||||
@dataclass
 | 
			
		||||
class GlueSourceReport(SourceReport):
 | 
			
		||||
    tables_scanned = 0
 | 
			
		||||
    filtered: List[str] = dataclass_field(default_factory=list)
 | 
			
		||||
 | 
			
		||||
    def report_table_scanned(self) -> None:
 | 
			
		||||
        self.tables_scanned += 1
 | 
			
		||||
 | 
			
		||||
    def report_table_dropped(self, table: str) -> None:
 | 
			
		||||
        self.filtered.append(table)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class GlueSource(Source):
 | 
			
		||||
    source_config: GlueSourceConfig
 | 
			
		||||
@ -87,7 +94,6 @@ class GlueSource(Source):
 | 
			
		||||
        self.report = GlueSourceReport()
 | 
			
		||||
        self.glue_client = config.glue_client
 | 
			
		||||
        self.env = config.env
 | 
			
		||||
        self.databases = config.databases
 | 
			
		||||
 | 
			
		||||
    @classmethod
 | 
			
		||||
    def create(cls, config_dict, ctx):
 | 
			
		||||
@ -95,7 +101,7 @@ class GlueSource(Source):
 | 
			
		||||
        return cls(config, ctx)
 | 
			
		||||
 | 
			
		||||
    def get_workunits(self) -> Iterable[MetadataWorkUnit]:
 | 
			
		||||
        def get_all_tables(database_names: Optional[List[str]]):
 | 
			
		||||
        def get_all_tables():
 | 
			
		||||
            def get_tables_from_database(database_name: str, tables: List):
 | 
			
		||||
                kwargs = {"DatabaseName": database_name}
 | 
			
		||||
                while True:
 | 
			
		||||
@ -119,22 +125,28 @@ class GlueSource(Source):
 | 
			
		||||
                        break
 | 
			
		||||
                return tables
 | 
			
		||||
 | 
			
		||||
            if database_names:
 | 
			
		||||
            if self.source_config.database_pattern.is_fully_specified_allow_list():
 | 
			
		||||
                all_tables: List = []
 | 
			
		||||
                database_names = self.source_config.database_pattern.get_allowed_list()
 | 
			
		||||
                for database in database_names:
 | 
			
		||||
                    all_tables += get_tables_from_database(database, all_tables)
 | 
			
		||||
            else:
 | 
			
		||||
                all_tables = get_tables_from_all_databases()
 | 
			
		||||
            return all_tables
 | 
			
		||||
 | 
			
		||||
        tables = get_all_tables(self.databases)
 | 
			
		||||
        tables = get_all_tables()
 | 
			
		||||
 | 
			
		||||
        for table in tables:
 | 
			
		||||
            table_name = table["Name"]
 | 
			
		||||
            database_name = table["DatabaseName"]
 | 
			
		||||
            table_name = table["Name"]
 | 
			
		||||
            full_table_name = f"{database_name}.{table_name}"
 | 
			
		||||
 | 
			
		||||
            self.report.report_table_scanned()
 | 
			
		||||
            if not self.source_config.database_pattern.allowed(
 | 
			
		||||
                database_name
 | 
			
		||||
            ) or not self.source_config.table_pattern.allowed(full_table_name):
 | 
			
		||||
                self.report.report_table_dropped(full_table_name)
 | 
			
		||||
                continue
 | 
			
		||||
 | 
			
		||||
            mce = self._extract_record(table, full_table_name)
 | 
			
		||||
            workunit = MetadataWorkUnit(id=f"glue-{full_table_name}", mce=mce)
 | 
			
		||||
            self.report.report_workunit(workunit)
 | 
			
		||||
 | 
			
		||||
@ -19,3 +19,17 @@ def test_single_table():
 | 
			
		||||
def test_default_deny():
 | 
			
		||||
    pattern = AllowDenyPattern(allow=["foo.mytable"])
 | 
			
		||||
    assert not pattern.allowed("foo.bar")
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def test_fully_speced():
 | 
			
		||||
    pattern = AllowDenyPattern(allow=["foo.mytable"])
 | 
			
		||||
    assert pattern.is_fully_specified_allow_list()
 | 
			
		||||
    pattern = AllowDenyPattern(allow=["foo.*", "foo.table"])
 | 
			
		||||
    assert not pattern.is_fully_specified_allow_list()
 | 
			
		||||
    pattern = AllowDenyPattern(allow=["foo.?", "foo.table"])
 | 
			
		||||
    assert not pattern.is_fully_specified_allow_list()
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def test_is_allowed():
 | 
			
		||||
    pattern = AllowDenyPattern(allow=["foo.mytable"], deny=["foo.*"])
 | 
			
		||||
    assert pattern.get_allowed_list() == []
 | 
			
		||||
 | 
			
		||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user