fix(ingest/lookml): make deploy key optional (#9378)

This commit is contained in:
Harshal Sheth 2023-12-07 17:18:16 -05:00 committed by GitHub
parent 724736939a
commit f03c66ca1f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 138 additions and 90 deletions

View File

@ -1,10 +1,12 @@
import os import os
from typing import Any, Dict, Optional import pathlib
from typing import Any, Dict, Optional, Union
from pydantic import Field, FilePath, SecretStr, validator from pydantic import Field, FilePath, SecretStr, validator
from datahub.configuration.common import ConfigModel from datahub.configuration.common import ConfigModel
from datahub.configuration.validate_field_rename import pydantic_renamed_field from datahub.configuration.validate_field_rename import pydantic_renamed_field
from datahub.ingestion.source.git.git_import import GitClone
_GITHUB_PREFIX = "https://github.com/" _GITHUB_PREFIX = "https://github.com/"
_GITLAB_PREFIX = "https://gitlab.com/" _GITLAB_PREFIX = "https://gitlab.com/"
@ -141,3 +143,22 @@ class GitInfo(GitReference):
if "branch" in self.__fields_set__: if "branch" in self.__fields_set__:
return self.branch return self.branch
return None return None
def clone(
    self,
    tmp_path: Union[pathlib.Path, str],
    fallback_deploy_key: Optional[SecretStr] = None,
) -> pathlib.Path:
    """Check out this repository under ``tmp_path`` and return the checkout directory.

    Args:
        tmp_path: Directory handed (as a string) to ``GitClone`` as the
            location for the clone.
        fallback_deploy_key: SSH key used when this object's own
            ``deploy_key`` is unset/falsy. May be ``None``, in which case
            no key is passed to the clone.

    Returns:
        The path returned by ``GitClone.clone`` for the checked-out repo.
    """
    # The SSH locator is used as the clone URL, so it must be populated.
    assert self.repo_ssh_locator

    cloner = GitClone(str(tmp_path))
    # Prefer the key configured on this repo; otherwise use the caller's fallback.
    ssh_key = self.deploy_key or fallback_deploy_key
    return cloner.clone(
        ssh_key=ssh_key,
        repo_url=self.repo_ssh_locator,
        branch=self.branch_for_clone,
    )

View File

@ -6,6 +6,7 @@ from typing import Optional
from uuid import uuid4 from uuid import uuid4
import git import git
from git.util import remove_password_if_present
from pydantic import SecretStr from pydantic import SecretStr
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@ -53,7 +54,10 @@ class GitClone:
" -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no" " -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no"
) )
logger.debug(f"ssh_command={git_ssh_cmd}") logger.debug(f"ssh_command={git_ssh_cmd}")
logger.info(f"⏳ Cloning repo '{repo_url}', this can take some time...")
logger.info(
f"⏳ Cloning repo '{self.sanitize_repo_url(repo_url)}', this can take some time..."
)
self.last_repo_cloned = git.Repo.clone_from( self.last_repo_cloned = git.Repo.clone_from(
repo_url, repo_url,
checkout_dir, checkout_dir,
@ -69,3 +73,26 @@ class GitClone:
def get_last_repo_cloned(self) -> Optional[git.Repo]: def get_last_repo_cloned(self) -> Optional[git.Repo]:
return self.last_repo_cloned return self.last_repo_cloned
@staticmethod
def sanitize_repo_url(repo_url: str) -> str:
    """Return a copy of ``repo_url`` that is safe to write to logs.

    Delegates to GitPython's ``remove_password_if_present``; the examples
    below show the resulting output for credentialed and plain URLs.

    Args:
        repo_url (str): The repository URL.

    Returns:
        str: The sanitized repository URL.

    Examples:
        >>> GitClone.sanitize_repo_url("https://username:password@github.com/org/repo.git")
        'https://*****:*****@github.com/org/repo.git'
        >>> GitClone.sanitize_repo_url("https://github.com/org/repo.git")
        'https://github.com/org/repo.git'
        >>> GitClone.sanitize_repo_url("git@github.com:org/repo.git")
        'git@github.com:org/repo.git'
    """
    # The helper works on a list of command-line arguments, so wrap the
    # single URL in a list and take the one result back out.
    sanitized_args = remove_password_if_present([repo_url])
    return sanitized_args[0]

View File

@ -301,13 +301,13 @@ class LookMLSourceConfig(
) -> Optional[pydantic.DirectoryPath]: ) -> Optional[pydantic.DirectoryPath]:
if v is None: if v is None:
git_info: Optional[GitInfo] = values.get("git_info") git_info: Optional[GitInfo] = values.get("git_info")
if git_info and git_info.deploy_key: if git_info:
# We have git_info populated correctly, base folder is not needed if not git_info.deploy_key:
pass logger.warning(
"git_info is provided, but no SSH key is present. If the repo is not public, we'll fail to clone it."
)
else: else:
raise ValueError( raise ValueError("Neither base_folder nor git_info has been provided.")
"base_folder is not provided. Neither has a github deploy_key or deploy_key_file been provided"
)
return v return v
@ -1831,14 +1831,8 @@ class LookMLSource(StatefulIngestionSourceBase):
assert self.source_config.git_info assert self.source_config.git_info
# we don't have a base_folder, so we need to clone the repo and process it locally # we don't have a base_folder, so we need to clone the repo and process it locally
start_time = datetime.now() start_time = datetime.now()
git_clone = GitClone(tmp_dir) checkout_dir = self.source_config.git_info.clone(
# Github info deploy key is always populated tmp_path=tmp_dir,
assert self.source_config.git_info.deploy_key
assert self.source_config.git_info.repo_ssh_locator
checkout_dir = git_clone.clone(
ssh_key=self.source_config.git_info.deploy_key,
repo_url=self.source_config.git_info.repo_ssh_locator,
branch=self.source_config.git_info.branch_for_clone,
) )
self.reporter.git_clone_latency = datetime.now() - start_time self.reporter.git_clone_latency = datetime.now() - start_time
self.source_config.base_folder = checkout_dir.resolve() self.source_config.base_folder = checkout_dir.resolve()
@ -1853,29 +1847,20 @@ class LookMLSource(StatefulIngestionSourceBase):
for project, p_ref in self.source_config.project_dependencies.items(): for project, p_ref in self.source_config.project_dependencies.items():
# If we were given GitHub info, we need to clone the project. # If we were given GitHub info, we need to clone the project.
if isinstance(p_ref, GitInfo): if isinstance(p_ref, GitInfo):
assert p_ref.repo_ssh_locator
p_cloner = GitClone(f"{tmp_dir}/_included_/{project}")
try: try:
p_checkout_dir = p_cloner.clone( p_checkout_dir = p_ref.clone(
ssh_key=( tmp_path=f"{tmp_dir}/_included_/{project}",
# If a deploy key was provided, use it. Otherwise, fall back # If a deploy key was provided, use it. Otherwise, fall back
# to the main project deploy key. # to the main project deploy key, if present.
p_ref.deploy_key fallback_deploy_key=self.source_config.git_info.deploy_key
or ( if self.source_config.git_info
self.source_config.git_info.deploy_key else None,
if self.source_config.git_info
else None
)
),
repo_url=p_ref.repo_ssh_locator,
branch=p_ref.branch_for_clone,
) )
p_ref = p_checkout_dir.resolve() p_ref = p_checkout_dir.resolve()
except Exception as e: except Exception as e:
logger.warning( logger.warning(
f"Failed to clone remote project {project}. This can lead to failures in parsing lookml files later on: {e}", f"Failed to clone project dependency {project}. This can lead to failures in parsing lookml files later on: {e}",
) )
visited_projects.add(project) visited_projects.add(project)
continue continue
@ -1910,68 +1895,73 @@ class LookMLSource(StatefulIngestionSourceBase):
return return
manifest = self.get_manifest_if_present(project_path) manifest = self.get_manifest_if_present(project_path)
if manifest: if not manifest:
# Special case handling if the root project has a name in the manifest file. return
if project_name == _BASE_PROJECT_NAME and manifest.project_name:
if (
self.source_config.project_name is not None
and manifest.project_name != self.source_config.project_name
):
logger.warning(
f"The project name in the manifest file '{manifest.project_name}'"
f"does not match the configured project name '{self.source_config.project_name}'. "
"This can lead to failures in LookML include resolution and lineage generation."
)
elif self.source_config.project_name is None:
self.source_config.project_name = manifest.project_name
# Clone the remote project dependencies. # Special case handling if the root project has a name in the manifest file.
for remote_project in manifest.remote_dependencies: if project_name == _BASE_PROJECT_NAME and manifest.project_name:
if remote_project.name in project_visited: if (
continue self.source_config.project_name is not None
and manifest.project_name != self.source_config.project_name
):
logger.warning(
f"The project name in the manifest file '{manifest.project_name}'"
f"does not match the configured project name '{self.source_config.project_name}'. "
"This can lead to failures in LookML include resolution and lineage generation."
)
elif self.source_config.project_name is None:
self.source_config.project_name = manifest.project_name
p_cloner = GitClone(f"{tmp_dir}/_remote_/{project_name}") # Clone the remote project dependencies.
try: for remote_project in manifest.remote_dependencies:
# TODO: For 100% correctness, we should be consulting if remote_project.name in project_visited:
# the manifest lock file for the exact ref to use. continue
if remote_project.name in self.base_projects_folder:
# In case a remote_dependency is specified in the project_dependencies config,
# we don't need to clone it again.
continue
p_checkout_dir = p_cloner.clone( p_cloner = GitClone(f"{tmp_dir}/_remote_/{remote_project.name}")
ssh_key=( try:
self.source_config.git_info.deploy_key # TODO: For 100% correctness, we should be consulting
if self.source_config.git_info # the manifest lock file for the exact ref to use.
else None
),
repo_url=remote_project.url,
)
self.base_projects_folder[ p_checkout_dir = p_cloner.clone(
remote_project.name ssh_key=(
] = p_checkout_dir.resolve() self.source_config.git_info.deploy_key
repo = p_cloner.get_last_repo_cloned() if self.source_config.git_info
assert repo else None
remote_git_info = GitInfo( ),
url_template=remote_project.url, repo_url=remote_project.url,
repo="dummy/dummy", # set to dummy values to bypass validation )
branch=repo.active_branch.name,
)
remote_git_info.repo = (
"" # set to empty because url already contains the full path
)
self.remote_projects_git_info[remote_project.name] = remote_git_info
except Exception as e: self.base_projects_folder[
logger.warning( remote_project.name
f"Failed to clone remote project {project_name}. This can lead to failures in parsing lookml files later on", ] = p_checkout_dir.resolve()
e, repo = p_cloner.get_last_repo_cloned()
) assert repo
project_visited.add(project_name) remote_git_info = GitInfo(
else: url_template=remote_project.url,
self._recursively_check_manifests( repo="dummy/dummy", # set to dummy values to bypass validation
tmp_dir, remote_project.name, project_visited branch=repo.active_branch.name,
) )
remote_git_info.repo = (
"" # set to empty because url already contains the full path
)
self.remote_projects_git_info[remote_project.name] = remote_git_info
for project in manifest.local_dependencies: except Exception as e:
self._recursively_check_manifests(tmp_dir, project, project_visited) logger.warning(
f"Failed to clone remote project {project_name}. This can lead to failures in parsing lookml files later on: {e}",
)
project_visited.add(project_name)
else:
self._recursively_check_manifests(
tmp_dir, remote_project.name, project_visited
)
for project in manifest.local_dependencies:
self._recursively_check_manifests(tmp_dir, project, project_visited)
def get_internal_workunits(self) -> Iterable[MetadataWorkUnit]: # noqa: C901 def get_internal_workunits(self) -> Iterable[MetadataWorkUnit]: # noqa: C901
assert self.source_config.base_folder assert self.source_config.base_folder

View File

@ -1,3 +1,4 @@
import doctest
import os import os
import pytest import pytest
@ -81,6 +82,15 @@ def test_github_branch():
assert config.branch_for_clone == "main" assert config.branch_for_clone == "main"
def test_sanitize_repo_url():
    """Run the doctests embedded in the git_import module and check the tally."""
    import datahub.ingestion.source.git.git_import as git_import_module

    outcome = doctest.testmod(git_import_module)
    # Expected tally: 0 failures, 3 tests.
    assert outcome == (0, 3)
def test_git_clone_public(tmp_path): def test_git_clone_public(tmp_path):
git_clone = GitClone(str(tmp_path)) git_clone = GitClone(str(tmp_path))
checkout_dir = git_clone.clone( checkout_dir = git_clone.clone(

View File

@ -799,7 +799,7 @@ def test_lookml_base_folder():
) )
with pytest.raises( with pytest.raises(
pydantic.ValidationError, match=r"base_folder.+not provided.+deploy_key" pydantic.ValidationError, match=r"base_folder.+nor.+git_info.+provided"
): ):
LookMLSourceConfig.parse_obj({"api": fake_api}) LookMLSourceConfig.parse_obj({"api": fake_api})