mirror of
https://github.com/datahub-project/datahub.git
synced 2025-09-03 14:23:03 +00:00
refactor(github): change github reference to git references (#7308)
This commit is contained in:
parent
a60d27b40f
commit
ca65f9db18
@ -13,7 +13,7 @@ _GITHUB_URL_TEMPLATE = "{repo_url}/blob/{branch}/{file_path}"
|
|||||||
_GITLAB_URL_TEMPLATE = "{repo_url}/-/blob/{branch}/{file_path}"
|
_GITLAB_URL_TEMPLATE = "{repo_url}/-/blob/{branch}/{file_path}"
|
||||||
|
|
||||||
|
|
||||||
class GitHubReference(ConfigModel):
|
class GitReference(ConfigModel):
|
||||||
"""Reference to a hosted Git repository. Used to generate "view source" links."""
|
"""Reference to a hosted Git repository. Used to generate "view source" links."""
|
||||||
|
|
||||||
repo: str = Field(
|
repo: str = Field(
|
||||||
@ -72,7 +72,7 @@ class GitHubReference(ConfigModel):
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
class GitHubInfo(GitHubReference):
|
class GitInfo(GitReference):
|
||||||
"""A reference to a Git repository, including a deploy key that can be used to clone it."""
|
"""A reference to a Git repository, including a deploy key that can be used to clone it."""
|
||||||
|
|
||||||
deploy_key_file: Optional[FilePath] = Field(
|
deploy_key_file: Optional[FilePath] = Field(
|
@ -9,7 +9,8 @@ import dateutil.parser
|
|||||||
import requests
|
import requests
|
||||||
from pydantic import BaseModel, Field, validator
|
from pydantic import BaseModel, Field, validator
|
||||||
|
|
||||||
from datahub.configuration.github import GitHubReference
|
from datahub.configuration.git import GitReference
|
||||||
|
from datahub.configuration.validate_field_rename import pydantic_renamed_field
|
||||||
from datahub.ingestion.api.decorators import (
|
from datahub.ingestion.api.decorators import (
|
||||||
SupportStatus,
|
SupportStatus,
|
||||||
capability,
|
capability,
|
||||||
@ -53,11 +54,13 @@ class DBTCoreConfig(DBTCommonConfig):
|
|||||||
description="When fetching manifest files from s3, configuration for aws connection details",
|
description="When fetching manifest files from s3, configuration for aws connection details",
|
||||||
)
|
)
|
||||||
|
|
||||||
github_info: Optional[GitHubReference] = Field(
|
git_info: Optional[GitReference] = Field(
|
||||||
None,
|
None,
|
||||||
description="Reference to your github location to enable easy navigation from DataHub to your dbt files.",
|
description="Reference to your git location to enable easy navigation from DataHub to your dbt files.",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
_github_info_deprecated = pydantic_renamed_field("github_info", "git_info")
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def s3_client(self):
|
def s3_client(self):
|
||||||
assert self.aws_connection
|
assert self.aws_connection
|
||||||
@ -476,8 +479,8 @@ class DBTCoreSource(DBTSourceBase):
|
|||||||
return all_nodes, additional_custom_props
|
return all_nodes, additional_custom_props
|
||||||
|
|
||||||
def get_external_url(self, node: DBTNode) -> Optional[str]:
|
def get_external_url(self, node: DBTNode) -> Optional[str]:
|
||||||
if self.config.github_info and node.dbt_file_path:
|
if self.config.git_info and node.dbt_file_path:
|
||||||
return self.config.github_info.get_url_for_file_path(node.dbt_file_path)
|
return self.config.git_info.get_url_for_file_path(node.dbt_file_path)
|
||||||
return None
|
return None
|
||||||
|
|
||||||
def get_platform_instance_id(self) -> str:
|
def get_platform_instance_id(self) -> str:
|
||||||
|
@ -19,8 +19,9 @@ from pydantic.fields import Field
|
|||||||
import datahub.emitter.mce_builder as builder
|
import datahub.emitter.mce_builder as builder
|
||||||
from datahub.configuration import ConfigModel
|
from datahub.configuration import ConfigModel
|
||||||
from datahub.configuration.common import AllowDenyPattern, ConfigurationError
|
from datahub.configuration.common import AllowDenyPattern, ConfigurationError
|
||||||
from datahub.configuration.github import GitHubInfo
|
from datahub.configuration.git import GitInfo
|
||||||
from datahub.configuration.source_common import EnvBasedSourceConfigBase
|
from datahub.configuration.source_common import EnvBasedSourceConfigBase
|
||||||
|
from datahub.configuration.validate_field_rename import pydantic_renamed_field
|
||||||
from datahub.emitter.mce_builder import make_schema_field_urn
|
from datahub.emitter.mce_builder import make_schema_field_urn
|
||||||
from datahub.emitter.mcp import MetadataChangeProposalWrapper
|
from datahub.emitter.mcp import MetadataChangeProposalWrapper
|
||||||
from datahub.ingestion.api.common import PipelineContext
|
from datahub.ingestion.api.common import PipelineContext
|
||||||
@ -169,15 +170,16 @@ class LookerConnectionDefinition(ConfigModel):
|
|||||||
|
|
||||||
|
|
||||||
class LookMLSourceConfig(LookerCommonConfig, StatefulIngestionConfigBase):
|
class LookMLSourceConfig(LookerCommonConfig, StatefulIngestionConfigBase):
|
||||||
github_info: Optional[GitHubInfo] = Field(
|
git_info: Optional[GitInfo] = Field(
|
||||||
None,
|
None,
|
||||||
description="Reference to your github location. If present, supplies handy links to your lookml on the dataset entity page.",
|
description="Reference to your git location. If present, supplies handy links to your lookml on the dataset entity page.",
|
||||||
)
|
)
|
||||||
|
_github_info_deprecated = pydantic_renamed_field("github_info", "git_info")
|
||||||
base_folder: Optional[pydantic.DirectoryPath] = Field(
|
base_folder: Optional[pydantic.DirectoryPath] = Field(
|
||||||
None,
|
None,
|
||||||
description="Required if not providing github configuration and deploy keys. A pointer to a local directory (accessible to the ingestion system) where the root of the LookML repo has been checked out (typically via a git clone). This is typically the root folder where the `*.model.lkml` and `*.view.lkml` files are stored. e.g. If you have checked out your LookML repo under `/Users/jdoe/workspace/my-lookml-repo`, then set `base_folder` to `/Users/jdoe/workspace/my-lookml-repo`.",
|
description="Required if not providing github configuration and deploy keys. A pointer to a local directory (accessible to the ingestion system) where the root of the LookML repo has been checked out (typically via a git clone). This is typically the root folder where the `*.model.lkml` and `*.view.lkml` files are stored. e.g. If you have checked out your LookML repo under `/Users/jdoe/workspace/my-lookml-repo`, then set `base_folder` to `/Users/jdoe/workspace/my-lookml-repo`.",
|
||||||
)
|
)
|
||||||
project_dependencies: Dict[str, Union[pydantic.DirectoryPath, GitHubInfo]] = Field(
|
project_dependencies: Dict[str, Union[pydantic.DirectoryPath, GitInfo]] = Field(
|
||||||
{},
|
{},
|
||||||
description="A map of project_name to local directory (accessible to the ingestion system) or Git credentials. "
|
description="A map of project_name to local directory (accessible to the ingestion system) or Git credentials. "
|
||||||
"Every local_dependencies or private remote_dependency listed in the main project's manifest.lkml file should have a corresponding entry here. "
|
"Every local_dependencies or private remote_dependency listed in the main project's manifest.lkml file should have a corresponding entry here. "
|
||||||
@ -284,9 +286,9 @@ class LookMLSourceConfig(LookerCommonConfig, StatefulIngestionConfigBase):
|
|||||||
cls, v: Optional[pydantic.DirectoryPath], values: Dict[str, Any]
|
cls, v: Optional[pydantic.DirectoryPath], values: Dict[str, Any]
|
||||||
) -> Optional[pydantic.DirectoryPath]:
|
) -> Optional[pydantic.DirectoryPath]:
|
||||||
if v is None:
|
if v is None:
|
||||||
github_info: Optional[GitHubInfo] = values.get("github_info", None)
|
git_info: Optional[GitInfo] = values.get("git_info", None)
|
||||||
if github_info and github_info.deploy_key:
|
if git_info and git_info.deploy_key:
|
||||||
# We have github_info populated correctly, base folder is not needed
|
# We have git_info populated correctly, base folder is not needed
|
||||||
pass
|
pass
|
||||||
else:
|
else:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
@ -1082,7 +1084,7 @@ class LookMLSource(StatefulIngestionSourceBase):
|
|||||||
|
|
||||||
# This is populated during the git clone step.
|
# This is populated during the git clone step.
|
||||||
base_projects_folder: Dict[str, pathlib.Path] = {}
|
base_projects_folder: Dict[str, pathlib.Path] = {}
|
||||||
remote_projects_github_info: Dict[str, GitHubInfo] = {}
|
remote_projects_git_info: Dict[str, GitInfo] = {}
|
||||||
|
|
||||||
def __init__(self, config: LookMLSourceConfig, ctx: PipelineContext):
|
def __init__(self, config: LookMLSourceConfig, ctx: PipelineContext):
|
||||||
super().__init__(config, ctx)
|
super().__init__(config, ctx)
|
||||||
@ -1306,17 +1308,17 @@ class LookMLSource(StatefulIngestionSourceBase):
|
|||||||
name=looker_view.id.view_name, customProperties=custom_properties
|
name=looker_view.id.view_name, customProperties=custom_properties
|
||||||
)
|
)
|
||||||
|
|
||||||
maybe_github_info = self.source_config.project_dependencies.get(
|
maybe_git_info = self.source_config.project_dependencies.get(
|
||||||
looker_view.id.project_name,
|
looker_view.id.project_name,
|
||||||
self.remote_projects_github_info.get(looker_view.id.project_name),
|
self.remote_projects_git_info.get(looker_view.id.project_name),
|
||||||
)
|
)
|
||||||
if isinstance(maybe_github_info, GitHubInfo):
|
if isinstance(maybe_git_info, GitInfo):
|
||||||
github_info: Optional[GitHubInfo] = maybe_github_info
|
git_info: Optional[GitInfo] = maybe_git_info
|
||||||
else:
|
else:
|
||||||
github_info = self.source_config.github_info
|
git_info = self.source_config.git_info
|
||||||
if github_info is not None and file_path:
|
if git_info is not None and file_path:
|
||||||
# It should be that looker_view.id.project_name is the base project.
|
# It should be that looker_view.id.project_name is the base project.
|
||||||
github_file_url = github_info.get_url_for_file_path(file_path)
|
github_file_url = git_info.get_url_for_file_path(file_path)
|
||||||
dataset_props.externalUrl = github_file_url
|
dataset_props.externalUrl = github_file_url
|
||||||
|
|
||||||
return dataset_props
|
return dataset_props
|
||||||
@ -1424,17 +1426,17 @@ class LookMLSource(StatefulIngestionSourceBase):
|
|||||||
with tempfile.TemporaryDirectory("lookml_tmp") as tmp_dir:
|
with tempfile.TemporaryDirectory("lookml_tmp") as tmp_dir:
|
||||||
# Clone the base_folder if necessary.
|
# Clone the base_folder if necessary.
|
||||||
if not self.source_config.base_folder:
|
if not self.source_config.base_folder:
|
||||||
assert self.source_config.github_info
|
assert self.source_config.git_info
|
||||||
# we don't have a base_folder, so we need to clone the repo and process it locally
|
# we don't have a base_folder, so we need to clone the repo and process it locally
|
||||||
start_time = datetime.now()
|
start_time = datetime.now()
|
||||||
git_clone = GitClone(tmp_dir)
|
git_clone = GitClone(tmp_dir)
|
||||||
# github info deploy key is always populated
|
# github info deploy key is always populated
|
||||||
assert self.source_config.github_info.deploy_key
|
assert self.source_config.git_info.deploy_key
|
||||||
assert self.source_config.github_info.repo_ssh_locator
|
assert self.source_config.git_info.repo_ssh_locator
|
||||||
checkout_dir = git_clone.clone(
|
checkout_dir = git_clone.clone(
|
||||||
ssh_key=self.source_config.github_info.deploy_key,
|
ssh_key=self.source_config.git_info.deploy_key,
|
||||||
repo_url=self.source_config.github_info.repo_ssh_locator,
|
repo_url=self.source_config.git_info.repo_ssh_locator,
|
||||||
branch=self.source_config.github_info.branch_for_clone,
|
branch=self.source_config.git_info.branch_for_clone,
|
||||||
)
|
)
|
||||||
self.reporter.git_clone_latency = datetime.now() - start_time
|
self.reporter.git_clone_latency = datetime.now() - start_time
|
||||||
self.source_config.base_folder = checkout_dir.resolve()
|
self.source_config.base_folder = checkout_dir.resolve()
|
||||||
@ -1447,7 +1449,7 @@ class LookMLSource(StatefulIngestionSourceBase):
|
|||||||
# We clone everything that we're pointed at.
|
# We clone everything that we're pointed at.
|
||||||
for project, p_ref in self.source_config.project_dependencies.items():
|
for project, p_ref in self.source_config.project_dependencies.items():
|
||||||
# If we were given GitHub info, we need to clone the project.
|
# If we were given GitHub info, we need to clone the project.
|
||||||
if isinstance(p_ref, GitHubInfo):
|
if isinstance(p_ref, GitInfo):
|
||||||
assert p_ref.repo_ssh_locator
|
assert p_ref.repo_ssh_locator
|
||||||
|
|
||||||
p_cloner = GitClone(f"{tmp_dir}/_included_/{project}")
|
p_cloner = GitClone(f"{tmp_dir}/_included_/{project}")
|
||||||
@ -1458,8 +1460,8 @@ class LookMLSource(StatefulIngestionSourceBase):
|
|||||||
# to the main project deploy key.
|
# to the main project deploy key.
|
||||||
p_ref.deploy_key
|
p_ref.deploy_key
|
||||||
or (
|
or (
|
||||||
self.source_config.github_info.deploy_key
|
self.source_config.git_info.deploy_key
|
||||||
if self.source_config.github_info
|
if self.source_config.git_info
|
||||||
else None
|
else None
|
||||||
)
|
)
|
||||||
),
|
),
|
||||||
@ -1512,8 +1514,8 @@ class LookMLSource(StatefulIngestionSourceBase):
|
|||||||
|
|
||||||
p_checkout_dir = p_cloner.clone(
|
p_checkout_dir = p_cloner.clone(
|
||||||
ssh_key=(
|
ssh_key=(
|
||||||
self.source_config.github_info.deploy_key
|
self.source_config.git_info.deploy_key
|
||||||
if self.source_config.github_info
|
if self.source_config.git_info
|
||||||
else None
|
else None
|
||||||
),
|
),
|
||||||
repo_url=remote_project.url,
|
repo_url=remote_project.url,
|
||||||
@ -1524,17 +1526,15 @@ class LookMLSource(StatefulIngestionSourceBase):
|
|||||||
] = p_checkout_dir.resolve()
|
] = p_checkout_dir.resolve()
|
||||||
repo = p_cloner.get_last_repo_cloned()
|
repo = p_cloner.get_last_repo_cloned()
|
||||||
assert repo
|
assert repo
|
||||||
remote_github_info = GitHubInfo(
|
remote_git_info = GitInfo(
|
||||||
url_template=remote_project.url,
|
url_template=remote_project.url,
|
||||||
repo="dummy/dummy", # set to dummy values to bypass validation
|
repo="dummy/dummy", # set to dummy values to bypass validation
|
||||||
branch=repo.active_branch.name,
|
branch=repo.active_branch.name,
|
||||||
)
|
)
|
||||||
remote_github_info.repo = (
|
remote_git_info.repo = (
|
||||||
"" # set to empty because url already contains the full path
|
"" # set to empty because url already contains the full path
|
||||||
)
|
)
|
||||||
self.remote_projects_github_info[
|
self.remote_projects_git_info[remote_project.name] = remote_git_info
|
||||||
remote_project.name
|
|
||||||
] = remote_github_info
|
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning(
|
logger.warning(
|
||||||
|
@ -4,7 +4,7 @@ import pytest
|
|||||||
from pydantic import SecretStr
|
from pydantic import SecretStr
|
||||||
|
|
||||||
from datahub.configuration.common import ConfigurationWarning
|
from datahub.configuration.common import ConfigurationWarning
|
||||||
from datahub.configuration.github import GitHubInfo, GitHubReference
|
from datahub.configuration.git import GitInfo, GitReference
|
||||||
from datahub.ingestion.source.git.git_import import GitClone
|
from datahub.ingestion.source.git.git_import import GitClone
|
||||||
|
|
||||||
LOOKML_TEST_SSH_KEY = os.environ.get("DATAHUB_LOOKML_GIT_TEST_SSH_KEY")
|
LOOKML_TEST_SSH_KEY = os.environ.get("DATAHUB_LOOKML_GIT_TEST_SSH_KEY")
|
||||||
@ -12,13 +12,11 @@ LOOKML_TEST_SSH_KEY = os.environ.get("DATAHUB_LOOKML_GIT_TEST_SSH_KEY")
|
|||||||
|
|
||||||
def test_base_url_guessing():
|
def test_base_url_guessing():
|
||||||
# Basic GitHub repo.
|
# Basic GitHub repo.
|
||||||
config = GitHubInfo(
|
config = GitInfo(repo="https://github.com/datahub-project/datahub", branch="master")
|
||||||
repo="https://github.com/datahub-project/datahub", branch="master"
|
|
||||||
)
|
|
||||||
assert config.repo_ssh_locator == "git@github.com:datahub-project/datahub.git"
|
assert config.repo_ssh_locator == "git@github.com:datahub-project/datahub.git"
|
||||||
|
|
||||||
# Defaults to GitHub.
|
# Defaults to GitHub.
|
||||||
config = GitHubInfo(repo="datahub-project/datahub", branch="master")
|
config = GitInfo(repo="datahub-project/datahub", branch="master")
|
||||||
assert (
|
assert (
|
||||||
config.get_url_for_file_path("docker/README.md")
|
config.get_url_for_file_path("docker/README.md")
|
||||||
== "https://github.com/datahub-project/datahub/blob/master/docker/README.md"
|
== "https://github.com/datahub-project/datahub/blob/master/docker/README.md"
|
||||||
@ -26,7 +24,7 @@ def test_base_url_guessing():
|
|||||||
assert config.repo_ssh_locator == "git@github.com:datahub-project/datahub.git"
|
assert config.repo_ssh_locator == "git@github.com:datahub-project/datahub.git"
|
||||||
|
|
||||||
# GitLab repo (notice the trailing slash).
|
# GitLab repo (notice the trailing slash).
|
||||||
config_ref = GitHubReference(
|
config_ref = GitReference(
|
||||||
repo="https://gitlab.com/gitlab-tests/sample-project/", branch="master"
|
repo="https://gitlab.com/gitlab-tests/sample-project/", branch="master"
|
||||||
)
|
)
|
||||||
assert (
|
assert (
|
||||||
@ -35,7 +33,7 @@ def test_base_url_guessing():
|
|||||||
)
|
)
|
||||||
|
|
||||||
# Three-tier GitLab repo.
|
# Three-tier GitLab repo.
|
||||||
config = GitHubInfo(
|
config = GitInfo(
|
||||||
repo="https://gitlab.com/gitlab-com/gl-infra/reliability", branch="master"
|
repo="https://gitlab.com/gitlab-com/gl-infra/reliability", branch="master"
|
||||||
)
|
)
|
||||||
assert (
|
assert (
|
||||||
@ -47,7 +45,7 @@ def test_base_url_guessing():
|
|||||||
)
|
)
|
||||||
|
|
||||||
# Overrides.
|
# Overrides.
|
||||||
config = GitHubInfo(
|
config = GitInfo(
|
||||||
repo="https://gitea.com/gitea/tea",
|
repo="https://gitea.com/gitea/tea",
|
||||||
branch="main",
|
branch="main",
|
||||||
url_template="https://gitea.com/gitea/tea/src/branch/{branch}/{file_path}",
|
url_template="https://gitea.com/gitea/tea/src/branch/{branch}/{file_path}",
|
||||||
@ -60,7 +58,7 @@ def test_base_url_guessing():
|
|||||||
|
|
||||||
# Deprecated: base_url.
|
# Deprecated: base_url.
|
||||||
with pytest.warns(ConfigurationWarning, match="base_url is deprecated"):
|
with pytest.warns(ConfigurationWarning, match="base_url is deprecated"):
|
||||||
config = GitHubInfo.parse_obj(
|
config = GitInfo.parse_obj(
|
||||||
dict(
|
dict(
|
||||||
repo="https://github.com/datahub-project/datahub",
|
repo="https://github.com/datahub-project/datahub",
|
||||||
branch="master",
|
branch="master",
|
||||||
@ -70,12 +68,12 @@ def test_base_url_guessing():
|
|||||||
|
|
||||||
|
|
||||||
def test_github_branch():
|
def test_github_branch():
|
||||||
config = GitHubInfo(
|
config = GitInfo(
|
||||||
repo="owner/repo",
|
repo="owner/repo",
|
||||||
)
|
)
|
||||||
assert config.branch_for_clone is None
|
assert config.branch_for_clone is None
|
||||||
|
|
||||||
config = GitHubInfo(
|
config = GitInfo(
|
||||||
repo="owner/repo",
|
repo="owner/repo",
|
||||||
branch="main",
|
branch="main",
|
||||||
)
|
)
|
||||||
|
@ -337,7 +337,7 @@ def test_lookml_bad_sql_parser(pytestconfig, tmp_path, mock_time):
|
|||||||
|
|
||||||
|
|
||||||
@freeze_time(FROZEN_TIME)
|
@freeze_time(FROZEN_TIME)
|
||||||
def test_lookml_github_info(pytestconfig, tmp_path, mock_time):
|
def test_lookml_git_info(pytestconfig, tmp_path, mock_time):
|
||||||
"""Add github info to config"""
|
"""Add github info to config"""
|
||||||
test_resources_dir = pytestconfig.rootpath / "tests/integration/lookml"
|
test_resources_dir = pytestconfig.rootpath / "tests/integration/lookml"
|
||||||
mce_out = "lookml_mces_with_external_urls.json"
|
mce_out = "lookml_mces_with_external_urls.json"
|
||||||
|
Loading…
x
Reference in New Issue
Block a user