fix(ingest/lookml): make deploy key optional (#9378)

This commit is contained in:
Harshal Sheth 2023-12-07 17:18:16 -05:00 committed by GitHub
parent 724736939a
commit f03c66ca1f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 138 additions and 90 deletions

View File

@ -1,10 +1,12 @@
import os import os
from typing import Any, Dict, Optional import pathlib
from typing import Any, Dict, Optional, Union
from pydantic import Field, FilePath, SecretStr, validator from pydantic import Field, FilePath, SecretStr, validator
from datahub.configuration.common import ConfigModel from datahub.configuration.common import ConfigModel
from datahub.configuration.validate_field_rename import pydantic_renamed_field from datahub.configuration.validate_field_rename import pydantic_renamed_field
from datahub.ingestion.source.git.git_import import GitClone
_GITHUB_PREFIX = "https://github.com/" _GITHUB_PREFIX = "https://github.com/"
_GITLAB_PREFIX = "https://gitlab.com/" _GITLAB_PREFIX = "https://gitlab.com/"
@ -141,3 +143,22 @@ class GitInfo(GitReference):
if "branch" in self.__fields_set__: if "branch" in self.__fields_set__:
return self.branch return self.branch
return None return None
def clone(
    self,
    tmp_path: Union[pathlib.Path, str],
    fallback_deploy_key: Optional[SecretStr] = None,
) -> pathlib.Path:
    """Clone the repo into a temporary directory and return the checkout path.

    Args:
        tmp_path: Directory handed to ``GitClone`` as its working location.
        fallback_deploy_key: SSH key used only when this object has no
            ``deploy_key`` of its own. When neither is set, ``None`` is
            passed as the ``ssh_key`` to ``GitClone.clone``.

    Returns:
        The checkout directory returned by ``GitClone.clone``.

    Raises:
        ValueError: If ``repo_ssh_locator`` is not set.
    """
    # Raise explicitly instead of using `assert`: assertions are stripped
    # under `python -O`, which would let a missing locator reach the git call.
    if not self.repo_ssh_locator:
        raise ValueError("Cannot clone the repo: repo_ssh_locator is not set.")

    git_clone = GitClone(str(tmp_path))
    checkout_dir = git_clone.clone(
        # Prefer this repo's own key; otherwise use the caller-supplied one.
        ssh_key=self.deploy_key or fallback_deploy_key,
        repo_url=self.repo_ssh_locator,
        branch=self.branch_for_clone,
    )
    return checkout_dir

View File

@ -6,6 +6,7 @@ from typing import Optional
from uuid import uuid4 from uuid import uuid4
import git import git
from git.util import remove_password_if_present
from pydantic import SecretStr from pydantic import SecretStr
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@ -53,7 +54,10 @@ class GitClone:
" -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no" " -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no"
) )
logger.debug(f"ssh_command={git_ssh_cmd}") logger.debug(f"ssh_command={git_ssh_cmd}")
logger.info(f"⏳ Cloning repo '{repo_url}', this can take some time...")
logger.info(
f"⏳ Cloning repo '{self.sanitize_repo_url(repo_url)}', this can take some time..."
)
self.last_repo_cloned = git.Repo.clone_from( self.last_repo_cloned = git.Repo.clone_from(
repo_url, repo_url,
checkout_dir, checkout_dir,
@ -69,3 +73,26 @@ class GitClone:
def get_last_repo_cloned(self) -> Optional[git.Repo]: def get_last_repo_cloned(self) -> Optional[git.Repo]:
return self.last_repo_cloned return self.last_repo_cloned
@staticmethod
def sanitize_repo_url(repo_url: str) -> str:
    """Return a copy of the repo URL that is safe to write to logs.

    Delegates to GitPython's ``remove_password_if_present``. As the
    examples below show, embedded credentials are masked and URLs
    without credentials come back unchanged.

    Args:
        repo_url (str): The repository URL.

    Returns:
        str: The sanitized repository URL.

    Examples:
        >>> GitClone.sanitize_repo_url("https://username:password@github.com/org/repo.git")
        'https://*****:*****@github.com/org/repo.git'

        >>> GitClone.sanitize_repo_url("https://github.com/org/repo.git")
        'https://github.com/org/repo.git'

        >>> GitClone.sanitize_repo_url("git@github.com:org/repo.git")
        'git@github.com:org/repo.git'
    """
    # The helper works on a command-line style list, so wrap and unwrap.
    sanitized_args = remove_password_if_present([repo_url])
    return sanitized_args[0]

View File

@ -301,13 +301,13 @@ class LookMLSourceConfig(
) -> Optional[pydantic.DirectoryPath]: ) -> Optional[pydantic.DirectoryPath]:
if v is None: if v is None:
git_info: Optional[GitInfo] = values.get("git_info") git_info: Optional[GitInfo] = values.get("git_info")
if git_info and git_info.deploy_key: if git_info:
# We have git_info populated correctly, base folder is not needed if not git_info.deploy_key:
pass logger.warning(
else: "git_info is provided, but no SSH key is present. If the repo is not public, we'll fail to clone it."
raise ValueError(
"base_folder is not provided. Neither has a github deploy_key or deploy_key_file been provided"
) )
else:
raise ValueError("Neither base_folder nor git_info has been provided.")
return v return v
@ -1831,14 +1831,8 @@ class LookMLSource(StatefulIngestionSourceBase):
assert self.source_config.git_info assert self.source_config.git_info
# we don't have a base_folder, so we need to clone the repo and process it locally # we don't have a base_folder, so we need to clone the repo and process it locally
start_time = datetime.now() start_time = datetime.now()
git_clone = GitClone(tmp_dir) checkout_dir = self.source_config.git_info.clone(
# Github info deploy key is always populated tmp_path=tmp_dir,
assert self.source_config.git_info.deploy_key
assert self.source_config.git_info.repo_ssh_locator
checkout_dir = git_clone.clone(
ssh_key=self.source_config.git_info.deploy_key,
repo_url=self.source_config.git_info.repo_ssh_locator,
branch=self.source_config.git_info.branch_for_clone,
) )
self.reporter.git_clone_latency = datetime.now() - start_time self.reporter.git_clone_latency = datetime.now() - start_time
self.source_config.base_folder = checkout_dir.resolve() self.source_config.base_folder = checkout_dir.resolve()
@ -1853,29 +1847,20 @@ class LookMLSource(StatefulIngestionSourceBase):
for project, p_ref in self.source_config.project_dependencies.items(): for project, p_ref in self.source_config.project_dependencies.items():
# If we were given GitHub info, we need to clone the project. # If we were given GitHub info, we need to clone the project.
if isinstance(p_ref, GitInfo): if isinstance(p_ref, GitInfo):
assert p_ref.repo_ssh_locator
p_cloner = GitClone(f"{tmp_dir}/_included_/{project}")
try: try:
p_checkout_dir = p_cloner.clone( p_checkout_dir = p_ref.clone(
ssh_key=( tmp_path=f"{tmp_dir}/_included_/{project}",
# If a deploy key was provided, use it. Otherwise, fall back # If a deploy key was provided, use it. Otherwise, fall back
# to the main project deploy key. # to the main project deploy key, if present.
p_ref.deploy_key fallback_deploy_key=self.source_config.git_info.deploy_key
or (
self.source_config.git_info.deploy_key
if self.source_config.git_info if self.source_config.git_info
else None else None,
)
),
repo_url=p_ref.repo_ssh_locator,
branch=p_ref.branch_for_clone,
) )
p_ref = p_checkout_dir.resolve() p_ref = p_checkout_dir.resolve()
except Exception as e: except Exception as e:
logger.warning( logger.warning(
f"Failed to clone remote project {project}. This can lead to failures in parsing lookml files later on: {e}", f"Failed to clone project dependency {project}. This can lead to failures in parsing lookml files later on: {e}",
) )
visited_projects.add(project) visited_projects.add(project)
continue continue
@ -1910,7 +1895,9 @@ class LookMLSource(StatefulIngestionSourceBase):
return return
manifest = self.get_manifest_if_present(project_path) manifest = self.get_manifest_if_present(project_path)
if manifest: if not manifest:
return
# Special case handling if the root project has a name in the manifest file. # Special case handling if the root project has a name in the manifest file.
if project_name == _BASE_PROJECT_NAME and manifest.project_name: if project_name == _BASE_PROJECT_NAME and manifest.project_name:
if ( if (
@ -1929,8 +1916,12 @@ class LookMLSource(StatefulIngestionSourceBase):
for remote_project in manifest.remote_dependencies: for remote_project in manifest.remote_dependencies:
if remote_project.name in project_visited: if remote_project.name in project_visited:
continue continue
if remote_project.name in self.base_projects_folder:
# In case a remote_dependency is specified in the project_dependencies config,
# we don't need to clone it again.
continue
p_cloner = GitClone(f"{tmp_dir}/_remote_/{project_name}") p_cloner = GitClone(f"{tmp_dir}/_remote_/{remote_project.name}")
try: try:
# TODO: For 100% correctness, we should be consulting # TODO: For 100% correctness, we should be consulting
# the manifest lock file for the exact ref to use. # the manifest lock file for the exact ref to use.
@ -1961,8 +1952,7 @@ class LookMLSource(StatefulIngestionSourceBase):
except Exception as e: except Exception as e:
logger.warning( logger.warning(
f"Failed to clone remote project {project_name}. This can lead to failures in parsing lookml files later on", f"Failed to clone remote project {project_name}. This can lead to failures in parsing lookml files later on: {e}",
e,
) )
project_visited.add(project_name) project_visited.add(project_name)
else: else:

View File

@ -1,3 +1,4 @@
import doctest
import os import os
import pytest import pytest
@ -81,6 +82,15 @@ def test_github_branch():
assert config.branch_for_clone == "main" assert config.branch_for_clone == "main"
def test_sanitize_repo_url():
    """Run the doctests embedded in the git_import module.

    The attempted count is pinned so the test fails if the doctest
    examples are silently dropped or stop being collected.
    """
    import datahub.ingestion.source.git.git_import as git_import_module

    failure_count, attempted_count = doctest.testmod(git_import_module)
    assert failure_count == 0  # 0 failures
    assert attempted_count == 3  # 3 tests
def test_git_clone_public(tmp_path): def test_git_clone_public(tmp_path):
git_clone = GitClone(str(tmp_path)) git_clone = GitClone(str(tmp_path))
checkout_dir = git_clone.clone( checkout_dir = git_clone.clone(

View File

@ -799,7 +799,7 @@ def test_lookml_base_folder():
) )
with pytest.raises( with pytest.raises(
pydantic.ValidationError, match=r"base_folder.+not provided.+deploy_key" pydantic.ValidationError, match=r"base_folder.+nor.+git_info.+provided"
): ):
LookMLSourceConfig.parse_obj({"api": fake_api}) LookMLSourceConfig.parse_obj({"api": fake_api})