refactor(github): change github reference to git references (#7308)

This commit is contained in:
Aseem Bansal 2023-02-10 21:07:44 +05:30 committed by GitHub
parent a60d27b40f
commit ca65f9db18
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 51 additions and 50 deletions

View File

@ -13,7 +13,7 @@ _GITHUB_URL_TEMPLATE = "{repo_url}/blob/{branch}/{file_path}"
_GITLAB_URL_TEMPLATE = "{repo_url}/-/blob/{branch}/{file_path}" _GITLAB_URL_TEMPLATE = "{repo_url}/-/blob/{branch}/{file_path}"
class GitHubReference(ConfigModel): class GitReference(ConfigModel):
"""Reference to a hosted Git repository. Used to generate "view source" links.""" """Reference to a hosted Git repository. Used to generate "view source" links."""
repo: str = Field( repo: str = Field(
@ -72,7 +72,7 @@ class GitHubReference(ConfigModel):
) )
class GitHubInfo(GitHubReference): class GitInfo(GitReference):
"""A reference to a Git repository, including a deploy key that can be used to clone it.""" """A reference to a Git repository, including a deploy key that can be used to clone it."""
deploy_key_file: Optional[FilePath] = Field( deploy_key_file: Optional[FilePath] = Field(

View File

@ -9,7 +9,8 @@ import dateutil.parser
import requests import requests
from pydantic import BaseModel, Field, validator from pydantic import BaseModel, Field, validator
from datahub.configuration.github import GitHubReference from datahub.configuration.git import GitReference
from datahub.configuration.validate_field_rename import pydantic_renamed_field
from datahub.ingestion.api.decorators import ( from datahub.ingestion.api.decorators import (
SupportStatus, SupportStatus,
capability, capability,
@ -53,11 +54,13 @@ class DBTCoreConfig(DBTCommonConfig):
description="When fetching manifest files from s3, configuration for aws connection details", description="When fetching manifest files from s3, configuration for aws connection details",
) )
github_info: Optional[GitHubReference] = Field( git_info: Optional[GitReference] = Field(
None, None,
description="Reference to your github location to enable easy navigation from DataHub to your dbt files.", description="Reference to your git location to enable easy navigation from DataHub to your dbt files.",
) )
_github_info_deprecated = pydantic_renamed_field("github_info", "git_info")
@property @property
def s3_client(self): def s3_client(self):
assert self.aws_connection assert self.aws_connection
@ -476,8 +479,8 @@ class DBTCoreSource(DBTSourceBase):
return all_nodes, additional_custom_props return all_nodes, additional_custom_props
def get_external_url(self, node: DBTNode) -> Optional[str]: def get_external_url(self, node: DBTNode) -> Optional[str]:
if self.config.github_info and node.dbt_file_path: if self.config.git_info and node.dbt_file_path:
return self.config.github_info.get_url_for_file_path(node.dbt_file_path) return self.config.git_info.get_url_for_file_path(node.dbt_file_path)
return None return None
def get_platform_instance_id(self) -> str: def get_platform_instance_id(self) -> str:

View File

@ -19,8 +19,9 @@ from pydantic.fields import Field
import datahub.emitter.mce_builder as builder import datahub.emitter.mce_builder as builder
from datahub.configuration import ConfigModel from datahub.configuration import ConfigModel
from datahub.configuration.common import AllowDenyPattern, ConfigurationError from datahub.configuration.common import AllowDenyPattern, ConfigurationError
from datahub.configuration.github import GitHubInfo from datahub.configuration.git import GitInfo
from datahub.configuration.source_common import EnvBasedSourceConfigBase from datahub.configuration.source_common import EnvBasedSourceConfigBase
from datahub.configuration.validate_field_rename import pydantic_renamed_field
from datahub.emitter.mce_builder import make_schema_field_urn from datahub.emitter.mce_builder import make_schema_field_urn
from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.ingestion.api.common import PipelineContext from datahub.ingestion.api.common import PipelineContext
@ -169,15 +170,16 @@ class LookerConnectionDefinition(ConfigModel):
class LookMLSourceConfig(LookerCommonConfig, StatefulIngestionConfigBase): class LookMLSourceConfig(LookerCommonConfig, StatefulIngestionConfigBase):
github_info: Optional[GitHubInfo] = Field( git_info: Optional[GitInfo] = Field(
None, None,
description="Reference to your github location. If present, supplies handy links to your lookml on the dataset entity page.", description="Reference to your git location. If present, supplies handy links to your lookml on the dataset entity page.",
) )
_github_info_deprecated = pydantic_renamed_field("github_info", "git_info")
base_folder: Optional[pydantic.DirectoryPath] = Field( base_folder: Optional[pydantic.DirectoryPath] = Field(
None, None,
description="Required if not providing github configuration and deploy keys. A pointer to a local directory (accessible to the ingestion system) where the root of the LookML repo has been checked out (typically via a git clone). This is typically the root folder where the `*.model.lkml` and `*.view.lkml` files are stored. e.g. If you have checked out your LookML repo under `/Users/jdoe/workspace/my-lookml-repo`, then set `base_folder` to `/Users/jdoe/workspace/my-lookml-repo`.", description="Required if not providing github configuration and deploy keys. A pointer to a local directory (accessible to the ingestion system) where the root of the LookML repo has been checked out (typically via a git clone). This is typically the root folder where the `*.model.lkml` and `*.view.lkml` files are stored. e.g. If you have checked out your LookML repo under `/Users/jdoe/workspace/my-lookml-repo`, then set `base_folder` to `/Users/jdoe/workspace/my-lookml-repo`.",
) )
project_dependencies: Dict[str, Union[pydantic.DirectoryPath, GitHubInfo]] = Field( project_dependencies: Dict[str, Union[pydantic.DirectoryPath, GitInfo]] = Field(
{}, {},
description="A map of project_name to local directory (accessible to the ingestion system) or Git credentials. " description="A map of project_name to local directory (accessible to the ingestion system) or Git credentials. "
"Every local_dependencies or private remote_dependency listed in the main project's manifest.lkml file should have a corresponding entry here. " "Every local_dependencies or private remote_dependency listed in the main project's manifest.lkml file should have a corresponding entry here. "
@ -284,9 +286,9 @@ class LookMLSourceConfig(LookerCommonConfig, StatefulIngestionConfigBase):
cls, v: Optional[pydantic.DirectoryPath], values: Dict[str, Any] cls, v: Optional[pydantic.DirectoryPath], values: Dict[str, Any]
) -> Optional[pydantic.DirectoryPath]: ) -> Optional[pydantic.DirectoryPath]:
if v is None: if v is None:
github_info: Optional[GitHubInfo] = values.get("github_info", None) git_info: Optional[GitInfo] = values.get("git_info", None)
if github_info and github_info.deploy_key: if git_info and git_info.deploy_key:
# We have github_info populated correctly, base folder is not needed # We have git_info populated correctly, base folder is not needed
pass pass
else: else:
raise ValueError( raise ValueError(
@ -1082,7 +1084,7 @@ class LookMLSource(StatefulIngestionSourceBase):
# This is populated during the git clone step. # This is populated during the git clone step.
base_projects_folder: Dict[str, pathlib.Path] = {} base_projects_folder: Dict[str, pathlib.Path] = {}
remote_projects_github_info: Dict[str, GitHubInfo] = {} remote_projects_git_info: Dict[str, GitInfo] = {}
def __init__(self, config: LookMLSourceConfig, ctx: PipelineContext): def __init__(self, config: LookMLSourceConfig, ctx: PipelineContext):
super().__init__(config, ctx) super().__init__(config, ctx)
@ -1306,17 +1308,17 @@ class LookMLSource(StatefulIngestionSourceBase):
name=looker_view.id.view_name, customProperties=custom_properties name=looker_view.id.view_name, customProperties=custom_properties
) )
maybe_github_info = self.source_config.project_dependencies.get( maybe_git_info = self.source_config.project_dependencies.get(
looker_view.id.project_name, looker_view.id.project_name,
self.remote_projects_github_info.get(looker_view.id.project_name), self.remote_projects_git_info.get(looker_view.id.project_name),
) )
if isinstance(maybe_github_info, GitHubInfo): if isinstance(maybe_git_info, GitInfo):
github_info: Optional[GitHubInfo] = maybe_github_info git_info: Optional[GitInfo] = maybe_git_info
else: else:
github_info = self.source_config.github_info git_info = self.source_config.git_info
if github_info is not None and file_path: if git_info is not None and file_path:
# It should be that looker_view.id.project_name is the base project. # It should be that looker_view.id.project_name is the base project.
github_file_url = github_info.get_url_for_file_path(file_path) github_file_url = git_info.get_url_for_file_path(file_path)
dataset_props.externalUrl = github_file_url dataset_props.externalUrl = github_file_url
return dataset_props return dataset_props
@ -1424,17 +1426,17 @@ class LookMLSource(StatefulIngestionSourceBase):
with tempfile.TemporaryDirectory("lookml_tmp") as tmp_dir: with tempfile.TemporaryDirectory("lookml_tmp") as tmp_dir:
# Clone the base_folder if necessary. # Clone the base_folder if necessary.
if not self.source_config.base_folder: if not self.source_config.base_folder:
assert self.source_config.github_info assert self.source_config.git_info
# we don't have a base_folder, so we need to clone the repo and process it locally # we don't have a base_folder, so we need to clone the repo and process it locally
start_time = datetime.now() start_time = datetime.now()
git_clone = GitClone(tmp_dir) git_clone = GitClone(tmp_dir)
# github info deploy key is always populated # github info deploy key is always populated
assert self.source_config.github_info.deploy_key assert self.source_config.git_info.deploy_key
assert self.source_config.github_info.repo_ssh_locator assert self.source_config.git_info.repo_ssh_locator
checkout_dir = git_clone.clone( checkout_dir = git_clone.clone(
ssh_key=self.source_config.github_info.deploy_key, ssh_key=self.source_config.git_info.deploy_key,
repo_url=self.source_config.github_info.repo_ssh_locator, repo_url=self.source_config.git_info.repo_ssh_locator,
branch=self.source_config.github_info.branch_for_clone, branch=self.source_config.git_info.branch_for_clone,
) )
self.reporter.git_clone_latency = datetime.now() - start_time self.reporter.git_clone_latency = datetime.now() - start_time
self.source_config.base_folder = checkout_dir.resolve() self.source_config.base_folder = checkout_dir.resolve()
@ -1447,7 +1449,7 @@ class LookMLSource(StatefulIngestionSourceBase):
# We clone everything that we're pointed at. # We clone everything that we're pointed at.
for project, p_ref in self.source_config.project_dependencies.items(): for project, p_ref in self.source_config.project_dependencies.items():
# If we were given GitHub info, we need to clone the project. # If we were given GitHub info, we need to clone the project.
if isinstance(p_ref, GitHubInfo): if isinstance(p_ref, GitInfo):
assert p_ref.repo_ssh_locator assert p_ref.repo_ssh_locator
p_cloner = GitClone(f"{tmp_dir}/_included_/{project}") p_cloner = GitClone(f"{tmp_dir}/_included_/{project}")
@ -1458,8 +1460,8 @@ class LookMLSource(StatefulIngestionSourceBase):
# to the main project deploy key. # to the main project deploy key.
p_ref.deploy_key p_ref.deploy_key
or ( or (
self.source_config.github_info.deploy_key self.source_config.git_info.deploy_key
if self.source_config.github_info if self.source_config.git_info
else None else None
) )
), ),
@ -1512,8 +1514,8 @@ class LookMLSource(StatefulIngestionSourceBase):
p_checkout_dir = p_cloner.clone( p_checkout_dir = p_cloner.clone(
ssh_key=( ssh_key=(
self.source_config.github_info.deploy_key self.source_config.git_info.deploy_key
if self.source_config.github_info if self.source_config.git_info
else None else None
), ),
repo_url=remote_project.url, repo_url=remote_project.url,
@ -1524,17 +1526,15 @@ class LookMLSource(StatefulIngestionSourceBase):
] = p_checkout_dir.resolve() ] = p_checkout_dir.resolve()
repo = p_cloner.get_last_repo_cloned() repo = p_cloner.get_last_repo_cloned()
assert repo assert repo
remote_github_info = GitHubInfo( remote_git_info = GitInfo(
url_template=remote_project.url, url_template=remote_project.url,
repo="dummy/dummy", # set to dummy values to bypass validation repo="dummy/dummy", # set to dummy values to bypass validation
branch=repo.active_branch.name, branch=repo.active_branch.name,
) )
remote_github_info.repo = ( remote_git_info.repo = (
"" # set to empty because url already contains the full path "" # set to empty because url already contains the full path
) )
self.remote_projects_github_info[ self.remote_projects_git_info[remote_project.name] = remote_git_info
remote_project.name
] = remote_github_info
except Exception as e: except Exception as e:
logger.warning( logger.warning(

View File

@ -4,7 +4,7 @@ import pytest
from pydantic import SecretStr from pydantic import SecretStr
from datahub.configuration.common import ConfigurationWarning from datahub.configuration.common import ConfigurationWarning
from datahub.configuration.github import GitHubInfo, GitHubReference from datahub.configuration.git import GitInfo, GitReference
from datahub.ingestion.source.git.git_import import GitClone from datahub.ingestion.source.git.git_import import GitClone
LOOKML_TEST_SSH_KEY = os.environ.get("DATAHUB_LOOKML_GIT_TEST_SSH_KEY") LOOKML_TEST_SSH_KEY = os.environ.get("DATAHUB_LOOKML_GIT_TEST_SSH_KEY")
@ -12,13 +12,11 @@ LOOKML_TEST_SSH_KEY = os.environ.get("DATAHUB_LOOKML_GIT_TEST_SSH_KEY")
def test_base_url_guessing(): def test_base_url_guessing():
# Basic GitHub repo. # Basic GitHub repo.
config = GitHubInfo( config = GitInfo(repo="https://github.com/datahub-project/datahub", branch="master")
repo="https://github.com/datahub-project/datahub", branch="master"
)
assert config.repo_ssh_locator == "git@github.com:datahub-project/datahub.git" assert config.repo_ssh_locator == "git@github.com:datahub-project/datahub.git"
# Defaults to GitHub. # Defaults to GitHub.
config = GitHubInfo(repo="datahub-project/datahub", branch="master") config = GitInfo(repo="datahub-project/datahub", branch="master")
assert ( assert (
config.get_url_for_file_path("docker/README.md") config.get_url_for_file_path("docker/README.md")
== "https://github.com/datahub-project/datahub/blob/master/docker/README.md" == "https://github.com/datahub-project/datahub/blob/master/docker/README.md"
@ -26,7 +24,7 @@ def test_base_url_guessing():
assert config.repo_ssh_locator == "git@github.com:datahub-project/datahub.git" assert config.repo_ssh_locator == "git@github.com:datahub-project/datahub.git"
# GitLab repo (notice the trailing slash). # GitLab repo (notice the trailing slash).
config_ref = GitHubReference( config_ref = GitReference(
repo="https://gitlab.com/gitlab-tests/sample-project/", branch="master" repo="https://gitlab.com/gitlab-tests/sample-project/", branch="master"
) )
assert ( assert (
@ -35,7 +33,7 @@ def test_base_url_guessing():
) )
# Three-tier GitLab repo. # Three-tier GitLab repo.
config = GitHubInfo( config = GitInfo(
repo="https://gitlab.com/gitlab-com/gl-infra/reliability", branch="master" repo="https://gitlab.com/gitlab-com/gl-infra/reliability", branch="master"
) )
assert ( assert (
@ -47,7 +45,7 @@ def test_base_url_guessing():
) )
# Overrides. # Overrides.
config = GitHubInfo( config = GitInfo(
repo="https://gitea.com/gitea/tea", repo="https://gitea.com/gitea/tea",
branch="main", branch="main",
url_template="https://gitea.com/gitea/tea/src/branch/{branch}/{file_path}", url_template="https://gitea.com/gitea/tea/src/branch/{branch}/{file_path}",
@ -60,7 +58,7 @@ def test_base_url_guessing():
# Deprecated: base_url. # Deprecated: base_url.
with pytest.warns(ConfigurationWarning, match="base_url is deprecated"): with pytest.warns(ConfigurationWarning, match="base_url is deprecated"):
config = GitHubInfo.parse_obj( config = GitInfo.parse_obj(
dict( dict(
repo="https://github.com/datahub-project/datahub", repo="https://github.com/datahub-project/datahub",
branch="master", branch="master",
@ -70,12 +68,12 @@ def test_base_url_guessing():
def test_github_branch(): def test_github_branch():
config = GitHubInfo( config = GitInfo(
repo="owner/repo", repo="owner/repo",
) )
assert config.branch_for_clone is None assert config.branch_for_clone is None
config = GitHubInfo( config = GitInfo(
repo="owner/repo", repo="owner/repo",
branch="main", branch="main",
) )

View File

@ -337,7 +337,7 @@ def test_lookml_bad_sql_parser(pytestconfig, tmp_path, mock_time):
@freeze_time(FROZEN_TIME) @freeze_time(FROZEN_TIME)
def test_lookml_github_info(pytestconfig, tmp_path, mock_time): def test_lookml_git_info(pytestconfig, tmp_path, mock_time):
"""Add github info to config""" """Add github info to config"""
test_resources_dir = pytestconfig.rootpath / "tests/integration/lookml" test_resources_dir = pytestconfig.rootpath / "tests/integration/lookml"
mce_out = "lookml_mces_with_external_urls.json" mce_out = "lookml_mces_with_external_urls.json"