mirror of
https://github.com/datahub-project/datahub.git
synced 2025-11-03 12:16:10 +00:00
fix(ingest/lookml): make deploy key optional (#9378)
This commit is contained in:
parent
724736939a
commit
f03c66ca1f
@ -1,10 +1,12 @@
|
|||||||
import os
|
import os
|
||||||
from typing import Any, Dict, Optional
|
import pathlib
|
||||||
|
from typing import Any, Dict, Optional, Union
|
||||||
|
|
||||||
from pydantic import Field, FilePath, SecretStr, validator
|
from pydantic import Field, FilePath, SecretStr, validator
|
||||||
|
|
||||||
from datahub.configuration.common import ConfigModel
|
from datahub.configuration.common import ConfigModel
|
||||||
from datahub.configuration.validate_field_rename import pydantic_renamed_field
|
from datahub.configuration.validate_field_rename import pydantic_renamed_field
|
||||||
|
from datahub.ingestion.source.git.git_import import GitClone
|
||||||
|
|
||||||
_GITHUB_PREFIX = "https://github.com/"
|
_GITHUB_PREFIX = "https://github.com/"
|
||||||
_GITLAB_PREFIX = "https://gitlab.com/"
|
_GITLAB_PREFIX = "https://gitlab.com/"
|
||||||
@ -141,3 +143,22 @@ class GitInfo(GitReference):
|
|||||||
if "branch" in self.__fields_set__:
|
if "branch" in self.__fields_set__:
|
||||||
return self.branch
|
return self.branch
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
def clone(
    self,
    tmp_path: Union[pathlib.Path, str],
    fallback_deploy_key: Optional[SecretStr] = None,
) -> pathlib.Path:
    """Check out this repository under ``tmp_path`` and return the checkout path.

    Args:
        tmp_path: Directory handed to ``GitClone`` (converted to ``str``).
        fallback_deploy_key: SSH key used only when this object's own
            ``deploy_key`` is unset/falsy. May be ``None``, in which case
            no key is passed to the cloner.

    Returns:
        Whatever ``GitClone.clone`` returns for the checkout location.
    """
    assert self.repo_ssh_locator

    cloner = GitClone(str(tmp_path))

    # Prefer the key configured on this repo; otherwise use the caller's fallback.
    ssh_key = self.deploy_key or fallback_deploy_key

    return cloner.clone(
        ssh_key=ssh_key,
        repo_url=self.repo_ssh_locator,
        branch=self.branch_for_clone,
    )
|
||||||
|
|||||||
@ -6,6 +6,7 @@ from typing import Optional
|
|||||||
from uuid import uuid4
|
from uuid import uuid4
|
||||||
|
|
||||||
import git
|
import git
|
||||||
|
from git.util import remove_password_if_present
|
||||||
from pydantic import SecretStr
|
from pydantic import SecretStr
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
@ -53,7 +54,10 @@ class GitClone:
|
|||||||
" -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no"
|
" -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no"
|
||||||
)
|
)
|
||||||
logger.debug(f"ssh_command={git_ssh_cmd}")
|
logger.debug(f"ssh_command={git_ssh_cmd}")
|
||||||
logger.info(f"⏳ Cloning repo '{repo_url}', this can take some time...")
|
|
||||||
|
logger.info(
|
||||||
|
f"⏳ Cloning repo '{self.sanitize_repo_url(repo_url)}', this can take some time..."
|
||||||
|
)
|
||||||
self.last_repo_cloned = git.Repo.clone_from(
|
self.last_repo_cloned = git.Repo.clone_from(
|
||||||
repo_url,
|
repo_url,
|
||||||
checkout_dir,
|
checkout_dir,
|
||||||
@ -69,3 +73,26 @@ class GitClone:
|
|||||||
|
|
||||||
def get_last_repo_cloned(self) -> Optional[git.Repo]:
|
def get_last_repo_cloned(self) -> Optional[git.Repo]:
|
||||||
return self.last_repo_cloned
|
return self.last_repo_cloned
|
||||||
|
|
||||||
|
@staticmethod
def sanitize_repo_url(repo_url: str) -> str:
    """Return a copy of ``repo_url`` that is safe to write to logs.

    Delegates to GitPython's ``remove_password_if_present``, so any
    credentials embedded in the URL are masked while URLs without
    credentials pass through unchanged.

    Args:
        repo_url (str): The repository URL.

    Returns:
        str: The sanitized repository URL.

    Examples:
        >>> GitClone.sanitize_repo_url("https://username:password@github.com/org/repo.git")
        'https://*****:*****@github.com/org/repo.git'

        >>> GitClone.sanitize_repo_url("https://github.com/org/repo.git")
        'https://github.com/org/repo.git'

        >>> GitClone.sanitize_repo_url("git@github.com:org/repo.git")
        'git@github.com:org/repo.git'
    """
    # The helper works on argument lists, so wrap the URL and unwrap the result.
    sanitized_args = remove_password_if_present([repo_url])
    return sanitized_args[0]
|
||||||
|
|||||||
@ -301,13 +301,13 @@ class LookMLSourceConfig(
|
|||||||
) -> Optional[pydantic.DirectoryPath]:
|
) -> Optional[pydantic.DirectoryPath]:
|
||||||
if v is None:
|
if v is None:
|
||||||
git_info: Optional[GitInfo] = values.get("git_info")
|
git_info: Optional[GitInfo] = values.get("git_info")
|
||||||
if git_info and git_info.deploy_key:
|
if git_info:
|
||||||
# We have git_info populated correctly, base folder is not needed
|
if not git_info.deploy_key:
|
||||||
pass
|
logger.warning(
|
||||||
|
"git_info is provided, but no SSH key is present. If the repo is not public, we'll fail to clone it."
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
raise ValueError(
|
raise ValueError("Neither base_folder nor git_info has been provided.")
|
||||||
"base_folder is not provided. Neither has a github deploy_key or deploy_key_file been provided"
|
|
||||||
)
|
|
||||||
return v
|
return v
|
||||||
|
|
||||||
|
|
||||||
@ -1831,14 +1831,8 @@ class LookMLSource(StatefulIngestionSourceBase):
|
|||||||
assert self.source_config.git_info
|
assert self.source_config.git_info
|
||||||
# we don't have a base_folder, so we need to clone the repo and process it locally
|
# we don't have a base_folder, so we need to clone the repo and process it locally
|
||||||
start_time = datetime.now()
|
start_time = datetime.now()
|
||||||
git_clone = GitClone(tmp_dir)
|
checkout_dir = self.source_config.git_info.clone(
|
||||||
# Github info deploy key is always populated
|
tmp_path=tmp_dir,
|
||||||
assert self.source_config.git_info.deploy_key
|
|
||||||
assert self.source_config.git_info.repo_ssh_locator
|
|
||||||
checkout_dir = git_clone.clone(
|
|
||||||
ssh_key=self.source_config.git_info.deploy_key,
|
|
||||||
repo_url=self.source_config.git_info.repo_ssh_locator,
|
|
||||||
branch=self.source_config.git_info.branch_for_clone,
|
|
||||||
)
|
)
|
||||||
self.reporter.git_clone_latency = datetime.now() - start_time
|
self.reporter.git_clone_latency = datetime.now() - start_time
|
||||||
self.source_config.base_folder = checkout_dir.resolve()
|
self.source_config.base_folder = checkout_dir.resolve()
|
||||||
@ -1853,29 +1847,20 @@ class LookMLSource(StatefulIngestionSourceBase):
|
|||||||
for project, p_ref in self.source_config.project_dependencies.items():
|
for project, p_ref in self.source_config.project_dependencies.items():
|
||||||
# If we were given GitHub info, we need to clone the project.
|
# If we were given GitHub info, we need to clone the project.
|
||||||
if isinstance(p_ref, GitInfo):
|
if isinstance(p_ref, GitInfo):
|
||||||
assert p_ref.repo_ssh_locator
|
|
||||||
|
|
||||||
p_cloner = GitClone(f"{tmp_dir}/_included_/{project}")
|
|
||||||
try:
|
try:
|
||||||
p_checkout_dir = p_cloner.clone(
|
p_checkout_dir = p_ref.clone(
|
||||||
ssh_key=(
|
tmp_path=f"{tmp_dir}/_included_/{project}",
|
||||||
# If a deploy key was provided, use it. Otherwise, fall back
|
# If a deploy key was provided, use it. Otherwise, fall back
|
||||||
# to the main project deploy key.
|
# to the main project deploy key, if present.
|
||||||
p_ref.deploy_key
|
fallback_deploy_key=self.source_config.git_info.deploy_key
|
||||||
or (
|
if self.source_config.git_info
|
||||||
self.source_config.git_info.deploy_key
|
else None,
|
||||||
if self.source_config.git_info
|
|
||||||
else None
|
|
||||||
)
|
|
||||||
),
|
|
||||||
repo_url=p_ref.repo_ssh_locator,
|
|
||||||
branch=p_ref.branch_for_clone,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
p_ref = p_checkout_dir.resolve()
|
p_ref = p_checkout_dir.resolve()
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning(
|
logger.warning(
|
||||||
f"Failed to clone remote project {project}. This can lead to failures in parsing lookml files later on: {e}",
|
f"Failed to clone project dependency {project}. This can lead to failures in parsing lookml files later on: {e}",
|
||||||
)
|
)
|
||||||
visited_projects.add(project)
|
visited_projects.add(project)
|
||||||
continue
|
continue
|
||||||
@ -1910,68 +1895,73 @@ class LookMLSource(StatefulIngestionSourceBase):
|
|||||||
return
|
return
|
||||||
|
|
||||||
manifest = self.get_manifest_if_present(project_path)
|
manifest = self.get_manifest_if_present(project_path)
|
||||||
if manifest:
|
if not manifest:
|
||||||
# Special case handling if the root project has a name in the manifest file.
|
return
|
||||||
if project_name == _BASE_PROJECT_NAME and manifest.project_name:
|
|
||||||
if (
|
|
||||||
self.source_config.project_name is not None
|
|
||||||
and manifest.project_name != self.source_config.project_name
|
|
||||||
):
|
|
||||||
logger.warning(
|
|
||||||
f"The project name in the manifest file '{manifest.project_name}'"
|
|
||||||
f"does not match the configured project name '{self.source_config.project_name}'. "
|
|
||||||
"This can lead to failures in LookML include resolution and lineage generation."
|
|
||||||
)
|
|
||||||
elif self.source_config.project_name is None:
|
|
||||||
self.source_config.project_name = manifest.project_name
|
|
||||||
|
|
||||||
# Clone the remote project dependencies.
|
# Special case handling if the root project has a name in the manifest file.
|
||||||
for remote_project in manifest.remote_dependencies:
|
if project_name == _BASE_PROJECT_NAME and manifest.project_name:
|
||||||
if remote_project.name in project_visited:
|
if (
|
||||||
continue
|
self.source_config.project_name is not None
|
||||||
|
and manifest.project_name != self.source_config.project_name
|
||||||
|
):
|
||||||
|
logger.warning(
|
||||||
|
f"The project name in the manifest file '{manifest.project_name}'"
|
||||||
|
f"does not match the configured project name '{self.source_config.project_name}'. "
|
||||||
|
"This can lead to failures in LookML include resolution and lineage generation."
|
||||||
|
)
|
||||||
|
elif self.source_config.project_name is None:
|
||||||
|
self.source_config.project_name = manifest.project_name
|
||||||
|
|
||||||
p_cloner = GitClone(f"{tmp_dir}/_remote_/{project_name}")
|
# Clone the remote project dependencies.
|
||||||
try:
|
for remote_project in manifest.remote_dependencies:
|
||||||
# TODO: For 100% correctness, we should be consulting
|
if remote_project.name in project_visited:
|
||||||
# the manifest lock file for the exact ref to use.
|
continue
|
||||||
|
if remote_project.name in self.base_projects_folder:
|
||||||
|
# In case a remote_dependency is specified in the project_dependencies config,
|
||||||
|
# we don't need to clone it again.
|
||||||
|
continue
|
||||||
|
|
||||||
p_checkout_dir = p_cloner.clone(
|
p_cloner = GitClone(f"{tmp_dir}/_remote_/{remote_project.name}")
|
||||||
ssh_key=(
|
try:
|
||||||
self.source_config.git_info.deploy_key
|
# TODO: For 100% correctness, we should be consulting
|
||||||
if self.source_config.git_info
|
# the manifest lock file for the exact ref to use.
|
||||||
else None
|
|
||||||
),
|
|
||||||
repo_url=remote_project.url,
|
|
||||||
)
|
|
||||||
|
|
||||||
self.base_projects_folder[
|
p_checkout_dir = p_cloner.clone(
|
||||||
remote_project.name
|
ssh_key=(
|
||||||
] = p_checkout_dir.resolve()
|
self.source_config.git_info.deploy_key
|
||||||
repo = p_cloner.get_last_repo_cloned()
|
if self.source_config.git_info
|
||||||
assert repo
|
else None
|
||||||
remote_git_info = GitInfo(
|
),
|
||||||
url_template=remote_project.url,
|
repo_url=remote_project.url,
|
||||||
repo="dummy/dummy", # set to dummy values to bypass validation
|
)
|
||||||
branch=repo.active_branch.name,
|
|
||||||
)
|
|
||||||
remote_git_info.repo = (
|
|
||||||
"" # set to empty because url already contains the full path
|
|
||||||
)
|
|
||||||
self.remote_projects_git_info[remote_project.name] = remote_git_info
|
|
||||||
|
|
||||||
except Exception as e:
|
self.base_projects_folder[
|
||||||
logger.warning(
|
remote_project.name
|
||||||
f"Failed to clone remote project {project_name}. This can lead to failures in parsing lookml files later on",
|
] = p_checkout_dir.resolve()
|
||||||
e,
|
repo = p_cloner.get_last_repo_cloned()
|
||||||
)
|
assert repo
|
||||||
project_visited.add(project_name)
|
remote_git_info = GitInfo(
|
||||||
else:
|
url_template=remote_project.url,
|
||||||
self._recursively_check_manifests(
|
repo="dummy/dummy", # set to dummy values to bypass validation
|
||||||
tmp_dir, remote_project.name, project_visited
|
branch=repo.active_branch.name,
|
||||||
)
|
)
|
||||||
|
remote_git_info.repo = (
|
||||||
|
"" # set to empty because url already contains the full path
|
||||||
|
)
|
||||||
|
self.remote_projects_git_info[remote_project.name] = remote_git_info
|
||||||
|
|
||||||
for project in manifest.local_dependencies:
|
except Exception as e:
|
||||||
self._recursively_check_manifests(tmp_dir, project, project_visited)
|
logger.warning(
|
||||||
|
f"Failed to clone remote project {project_name}. This can lead to failures in parsing lookml files later on: {e}",
|
||||||
|
)
|
||||||
|
project_visited.add(project_name)
|
||||||
|
else:
|
||||||
|
self._recursively_check_manifests(
|
||||||
|
tmp_dir, remote_project.name, project_visited
|
||||||
|
)
|
||||||
|
|
||||||
|
for project in manifest.local_dependencies:
|
||||||
|
self._recursively_check_manifests(tmp_dir, project, project_visited)
|
||||||
|
|
||||||
def get_internal_workunits(self) -> Iterable[MetadataWorkUnit]: # noqa: C901
|
def get_internal_workunits(self) -> Iterable[MetadataWorkUnit]: # noqa: C901
|
||||||
assert self.source_config.base_folder
|
assert self.source_config.base_folder
|
||||||
|
|||||||
@ -1,3 +1,4 @@
|
|||||||
|
import doctest
|
||||||
import os
|
import os
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
@ -81,6 +82,15 @@ def test_github_branch():
|
|||||||
assert config.branch_for_clone == "main"
|
assert config.branch_for_clone == "main"
|
||||||
|
|
||||||
|
|
||||||
|
def test_sanitize_repo_url():
    """Run the doctests embedded in the git_import module and require that all pass."""
    import datahub.ingestion.source.git.git_import as git_import_module

    outcome = doctest.testmod(git_import_module)

    # (failed, attempted): 0 failures, 3 tests
    assert outcome == (0, 3)
|
||||||
|
|
||||||
|
|
||||||
def test_git_clone_public(tmp_path):
|
def test_git_clone_public(tmp_path):
|
||||||
git_clone = GitClone(str(tmp_path))
|
git_clone = GitClone(str(tmp_path))
|
||||||
checkout_dir = git_clone.clone(
|
checkout_dir = git_clone.clone(
|
||||||
|
|||||||
@ -799,7 +799,7 @@ def test_lookml_base_folder():
|
|||||||
)
|
)
|
||||||
|
|
||||||
with pytest.raises(
|
with pytest.raises(
|
||||||
pydantic.ValidationError, match=r"base_folder.+not provided.+deploy_key"
|
pydantic.ValidationError, match=r"base_folder.+nor.+git_info.+provided"
|
||||||
):
|
):
|
||||||
LookMLSourceConfig.parse_obj({"api": fake_api})
|
LookMLSourceConfig.parse_obj({"api": fake_api})
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user