diff --git a/metadata-ingestion/src/datahub/configuration/git.py b/metadata-ingestion/src/datahub/configuration/git.py index 80eb41c100..9ea9007553 100644 --- a/metadata-ingestion/src/datahub/configuration/git.py +++ b/metadata-ingestion/src/datahub/configuration/git.py @@ -1,10 +1,12 @@ import os -from typing import Any, Dict, Optional +import pathlib +from typing import Any, Dict, Optional, Union from pydantic import Field, FilePath, SecretStr, validator from datahub.configuration.common import ConfigModel from datahub.configuration.validate_field_rename import pydantic_renamed_field +from datahub.ingestion.source.git.git_import import GitClone _GITHUB_PREFIX = "https://github.com/" _GITLAB_PREFIX = "https://gitlab.com/" @@ -141,3 +143,22 @@ class GitInfo(GitReference): if "branch" in self.__fields_set__: return self.branch return None + + def clone( + self, + tmp_path: Union[pathlib.Path, str], + fallback_deploy_key: Optional[SecretStr] = None, + ) -> pathlib.Path: + """Clones the repo into a temporary directory and returns the path to the checkout.""" + + assert self.repo_ssh_locator + + git_clone = GitClone(str(tmp_path)) + + checkout_dir = git_clone.clone( + ssh_key=self.deploy_key or fallback_deploy_key, + repo_url=self.repo_ssh_locator, + branch=self.branch_for_clone, + ) + + return checkout_dir diff --git a/metadata-ingestion/src/datahub/ingestion/source/git/git_import.py b/metadata-ingestion/src/datahub/ingestion/source/git/git_import.py index 55eeb2bc6d..2122374c1e 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/git/git_import.py +++ b/metadata-ingestion/src/datahub/ingestion/source/git/git_import.py @@ -6,6 +6,7 @@ from typing import Optional from uuid import uuid4 import git +from git.util import remove_password_if_present from pydantic import SecretStr logger = logging.getLogger(__name__) @@ -53,7 +54,10 @@ class GitClone: " -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no" ) logger.debug(f"ssh_command={git_ssh_cmd}") - logger.info(f"⏳ Cloning repo '{repo_url}', this can take some time...") + + logger.info( + f"⏳ Cloning repo '{self.sanitize_repo_url(repo_url)}', this can take some time..." + ) self.last_repo_cloned = git.Repo.clone_from( repo_url, checkout_dir, @@ -69,3 +73,26 @@ class GitClone: def get_last_repo_cloned(self) -> Optional[git.Repo]: return self.last_repo_cloned + + @staticmethod + def sanitize_repo_url(repo_url: str) -> str: + """Sanitizes the repo URL for logging purposes. + + Args: + repo_url (str): The repository URL. + + Returns: + str: The sanitized repository URL. + + Examples: + >>> GitClone.sanitize_repo_url("https://username:password@github.com/org/repo.git") + 'https://*****:*****@github.com/org/repo.git' + + >>> GitClone.sanitize_repo_url("https://github.com/org/repo.git") + 'https://github.com/org/repo.git' + + >>> GitClone.sanitize_repo_url("git@github.com:org/repo.git") + 'git@github.com:org/repo.git' + """ + + return remove_password_if_present([repo_url])[0] diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_source.py b/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_source.py index 93c405f0a3..b76bef49a7 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_source.py @@ -301,13 +301,13 @@ class LookMLSourceConfig( ) -> Optional[pydantic.DirectoryPath]: if v is None: git_info: Optional[GitInfo] = values.get("git_info") - if git_info and git_info.deploy_key: - # We have git_info populated correctly, base folder is not needed - pass + if git_info: + if not git_info.deploy_key: + logger.warning( + "git_info is provided, but no SSH key is present. If the repo is not public, we'll fail to clone it." + ) else: - raise ValueError( - "base_folder is not provided. Neither has a github deploy_key or deploy_key_file been provided" - ) + raise ValueError("Neither base_folder nor git_info has been provided.") return v @@ -1831,14 +1831,8 @@ class LookMLSource(StatefulIngestionSourceBase): assert self.source_config.git_info # we don't have a base_folder, so we need to clone the repo and process it locally start_time = datetime.now() - git_clone = GitClone(tmp_dir) - # Github info deploy key is always populated - assert self.source_config.git_info.deploy_key - assert self.source_config.git_info.repo_ssh_locator - checkout_dir = git_clone.clone( - ssh_key=self.source_config.git_info.deploy_key, - repo_url=self.source_config.git_info.repo_ssh_locator, - branch=self.source_config.git_info.branch_for_clone, + checkout_dir = self.source_config.git_info.clone( + tmp_path=tmp_dir, ) self.reporter.git_clone_latency = datetime.now() - start_time self.source_config.base_folder = checkout_dir.resolve() @@ -1853,29 +1847,20 @@ class LookMLSource(StatefulIngestionSourceBase): for project, p_ref in self.source_config.project_dependencies.items(): # If we were given GitHub info, we need to clone the project. if isinstance(p_ref, GitInfo): - assert p_ref.repo_ssh_locator - - p_cloner = GitClone(f"{tmp_dir}/_included_/{project}") try: - p_checkout_dir = p_cloner.clone( - ssh_key=( - # If a deploy key was provided, use it. Otherwise, fall back - # to the main project deploy key. - p_ref.deploy_key - or ( - self.source_config.git_info.deploy_key - if self.source_config.git_info - else None - ) - ), - repo_url=p_ref.repo_ssh_locator, - branch=p_ref.branch_for_clone, + p_checkout_dir = p_ref.clone( + tmp_path=f"{tmp_dir}/_included_/{project}", + # If a deploy key was provided, use it. Otherwise, fall back + # to the main project deploy key, if present. + fallback_deploy_key=self.source_config.git_info.deploy_key + if self.source_config.git_info + else None, ) p_ref = p_checkout_dir.resolve() except Exception as e: logger.warning( - f"Failed to clone remote project {project}. This can lead to failures in parsing lookml files later on: {e}", + f"Failed to clone project dependency {project}. This can lead to failures in parsing lookml files later on: {e}", ) visited_projects.add(project) continue @@ -1910,68 +1895,73 @@ class LookMLSource(StatefulIngestionSourceBase): return manifest = self.get_manifest_if_present(project_path) - if manifest: - # Special case handling if the root project has a name in the manifest file. - if project_name == _BASE_PROJECT_NAME and manifest.project_name: - if ( - self.source_config.project_name is not None - and manifest.project_name != self.source_config.project_name - ): - logger.warning( - f"The project name in the manifest file '{manifest.project_name}'" - f"does not match the configured project name '{self.source_config.project_name}'. " - "This can lead to failures in LookML include resolution and lineage generation." - ) - elif self.source_config.project_name is None: - self.source_config.project_name = manifest.project_name + if not manifest: + return - # Clone the remote project dependencies. - for remote_project in manifest.remote_dependencies: - if remote_project.name in project_visited: - continue + # Special case handling if the root project has a name in the manifest file. + if project_name == _BASE_PROJECT_NAME and manifest.project_name: + if ( + self.source_config.project_name is not None + and manifest.project_name != self.source_config.project_name + ): + logger.warning( + f"The project name in the manifest file '{manifest.project_name}'" + f"does not match the configured project name '{self.source_config.project_name}'. " + "This can lead to failures in LookML include resolution and lineage generation." + ) + elif self.source_config.project_name is None: + self.source_config.project_name = manifest.project_name - p_cloner = GitClone(f"{tmp_dir}/_remote_/{project_name}") - try: - # TODO: For 100% correctness, we should be consulting - # the manifest lock file for the exact ref to use. + # Clone the remote project dependencies. + for remote_project in manifest.remote_dependencies: + if remote_project.name in project_visited: + continue + if remote_project.name in self.base_projects_folder: + # In case a remote_dependency is specified in the project_dependencies config, + # we don't need to clone it again. + continue - p_checkout_dir = p_cloner.clone( - ssh_key=( - self.source_config.git_info.deploy_key - if self.source_config.git_info - else None - ), - repo_url=remote_project.url, - ) + p_cloner = GitClone(f"{tmp_dir}/_remote_/{remote_project.name}") + try: + # TODO: For 100% correctness, we should be consulting + # the manifest lock file for the exact ref to use. - self.base_projects_folder[ - remote_project.name - ] = p_checkout_dir.resolve() - repo = p_cloner.get_last_repo_cloned() - assert repo - remote_git_info = GitInfo( - url_template=remote_project.url, - repo="dummy/dummy", # set to dummy values to bypass validation - branch=repo.active_branch.name, - ) - remote_git_info.repo = ( - "" # set to empty because url already contains the full path - ) - self.remote_projects_git_info[remote_project.name] = remote_git_info + p_checkout_dir = p_cloner.clone( + ssh_key=( + self.source_config.git_info.deploy_key + if self.source_config.git_info + else None + ), + repo_url=remote_project.url, + ) - except Exception as e: - logger.warning( - f"Failed to clone remote project {project_name}. This can lead to failures in parsing lookml files later on", - e, - ) - project_visited.add(project_name) - else: - self._recursively_check_manifests( - tmp_dir, remote_project.name, project_visited - ) + self.base_projects_folder[ + remote_project.name + ] = p_checkout_dir.resolve() + repo = p_cloner.get_last_repo_cloned() + assert repo + remote_git_info = GitInfo( + url_template=remote_project.url, + repo="dummy/dummy", # set to dummy values to bypass validation + branch=repo.active_branch.name, + ) + remote_git_info.repo = ( + "" # set to empty because url already contains the full path + ) + self.remote_projects_git_info[remote_project.name] = remote_git_info - for project in manifest.local_dependencies: - self._recursively_check_manifests(tmp_dir, project, project_visited) + except Exception as e: + logger.warning( + f"Failed to clone remote project {project_name}. This can lead to failures in parsing lookml files later on: {e}", + ) + project_visited.add(project_name) + else: + self._recursively_check_manifests( + tmp_dir, remote_project.name, project_visited + ) + + for project in manifest.local_dependencies: + self._recursively_check_manifests(tmp_dir, project, project_visited) def get_internal_workunits(self) -> Iterable[MetadataWorkUnit]: # noqa: C901 assert self.source_config.base_folder diff --git a/metadata-ingestion/tests/integration/git/test_git_clone.py b/metadata-ingestion/tests/integration/git/test_git_clone.py index 3436c692f5..2428a6dfb1 100644 --- a/metadata-ingestion/tests/integration/git/test_git_clone.py +++ b/metadata-ingestion/tests/integration/git/test_git_clone.py @@ -1,3 +1,4 @@ +import doctest import os import pytest @@ -81,6 +82,15 @@ def test_github_branch(): assert config.branch_for_clone == "main" +def test_sanitize_repo_url(): + import datahub.ingestion.source.git.git_import + + assert doctest.testmod(datahub.ingestion.source.git.git_import) == ( + 0, + 3, + ) # 0 failures, 3 tests + + def test_git_clone_public(tmp_path): git_clone = GitClone(str(tmp_path)) checkout_dir = git_clone.clone( diff --git a/metadata-ingestion/tests/integration/lookml/test_lookml.py b/metadata-ingestion/tests/integration/lookml/test_lookml.py index a71b597863..1ed0d05c84 100644 --- a/metadata-ingestion/tests/integration/lookml/test_lookml.py +++ b/metadata-ingestion/tests/integration/lookml/test_lookml.py @@ -799,7 +799,7 @@ def test_lookml_base_folder(): ) with pytest.raises( - pydantic.ValidationError, match=r"base_folder.+not provided.+deploy_key" + pydantic.ValidationError, match=r"base_folder.+nor.+git_info.+provided" ): LookMLSourceConfig.parse_obj({"api": fake_api})