diff --git a/metadata-ingestion/src/datahub/configuration/github.py b/metadata-ingestion/src/datahub/configuration/git.py similarity index 98% rename from metadata-ingestion/src/datahub/configuration/github.py rename to metadata-ingestion/src/datahub/configuration/git.py index 0a45fecb16..0c7d64d4aa 100644 --- a/metadata-ingestion/src/datahub/configuration/github.py +++ b/metadata-ingestion/src/datahub/configuration/git.py @@ -13,7 +13,7 @@ _GITHUB_URL_TEMPLATE = "{repo_url}/blob/{branch}/{file_path}" _GITLAB_URL_TEMPLATE = "{repo_url}/-/blob/{branch}/{file_path}" -class GitHubReference(ConfigModel): +class GitReference(ConfigModel): """Reference to a hosted Git repository. Used to generate "view source" links.""" repo: str = Field( @@ -72,7 +72,7 @@ class GitHubReference(ConfigModel): ) -class GitHubInfo(GitHubReference): +class GitInfo(GitReference): """A reference to a Git repository, including a deploy key that can be used to clone it.""" deploy_key_file: Optional[FilePath] = Field( diff --git a/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_core.py b/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_core.py index 8023d24b7a..d4492bbf1e 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_core.py +++ b/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_core.py @@ -9,7 +9,8 @@ import dateutil.parser import requests from pydantic import BaseModel, Field, validator -from datahub.configuration.github import GitHubReference +from datahub.configuration.git import GitReference +from datahub.configuration.validate_field_rename import pydantic_renamed_field from datahub.ingestion.api.decorators import ( SupportStatus, capability, @@ -53,11 +54,13 @@ class DBTCoreConfig(DBTCommonConfig): description="When fetching manifest files from s3, configuration for aws connection details", ) - github_info: Optional[GitHubReference] = Field( + git_info: Optional[GitReference] = Field( None, - description="Reference to your github location to enable easy navigation from DataHub to your dbt files.", + description="Reference to your git location to enable easy navigation from DataHub to your dbt files.", ) + _github_info_deprecated = pydantic_renamed_field("github_info", "git_info") + @property def s3_client(self): assert self.aws_connection @@ -476,8 +479,8 @@ class DBTCoreSource(DBTSourceBase): return all_nodes, additional_custom_props def get_external_url(self, node: DBTNode) -> Optional[str]: - if self.config.github_info and node.dbt_file_path: - return self.config.github_info.get_url_for_file_path(node.dbt_file_path) + if self.config.git_info and node.dbt_file_path: + return self.config.git_info.get_url_for_file_path(node.dbt_file_path) return None def get_platform_instance_id(self) -> str: diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_source.py b/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_source.py index d42c43777f..edec408e82 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_source.py @@ -19,8 +19,9 @@ from pydantic.fields import Field import datahub.emitter.mce_builder as builder from datahub.configuration import ConfigModel from datahub.configuration.common import AllowDenyPattern, ConfigurationError -from datahub.configuration.github import GitHubInfo +from datahub.configuration.git import GitInfo from datahub.configuration.source_common import EnvBasedSourceConfigBase +from datahub.configuration.validate_field_rename import pydantic_renamed_field from datahub.emitter.mce_builder import make_schema_field_urn from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.ingestion.api.common import PipelineContext @@ -169,15 +170,16 @@ class LookerConnectionDefinition(ConfigModel): class LookMLSourceConfig(LookerCommonConfig, StatefulIngestionConfigBase): - github_info: Optional[GitHubInfo] = Field( + git_info: Optional[GitInfo] = Field( None, - description="Reference to your github location. If present, supplies handy links to your lookml on the dataset entity page.", + description="Reference to your git location. If present, supplies handy links to your lookml on the dataset entity page.", ) + _github_info_deprecated = pydantic_renamed_field("github_info", "git_info") base_folder: Optional[pydantic.DirectoryPath] = Field( None, description="Required if not providing github configuration and deploy keys. A pointer to a local directory (accessible to the ingestion system) where the root of the LookML repo has been checked out (typically via a git clone). This is typically the root folder where the `*.model.lkml` and `*.view.lkml` files are stored. e.g. If you have checked out your LookML repo under `/Users/jdoe/workspace/my-lookml-repo`, then set `base_folder` to `/Users/jdoe/workspace/my-lookml-repo`.", ) - project_dependencies: Dict[str, Union[pydantic.DirectoryPath, GitHubInfo]] = Field( + project_dependencies: Dict[str, Union[pydantic.DirectoryPath, GitInfo]] = Field( {}, description="A map of project_name to local directory (accessible to the ingestion system) or Git credentials. " "Every local_dependencies or private remote_dependency listed in the main project's manifest.lkml file should have a corresponding entry here. " @@ -284,9 +286,9 @@ class LookMLSourceConfig(LookerCommonConfig, StatefulIngestionConfigBase): cls, v: Optional[pydantic.DirectoryPath], values: Dict[str, Any] ) -> Optional[pydantic.DirectoryPath]: if v is None: - github_info: Optional[GitHubInfo] = values.get("github_info", None) - if github_info and github_info.deploy_key: - # We have github_info populated correctly, base folder is not needed + git_info: Optional[GitInfo] = values.get("git_info", None) + if git_info and git_info.deploy_key: + # We have git_info populated correctly, base folder is not needed pass else: raise ValueError( @@ -1082,7 +1084,7 @@ class LookMLSource(StatefulIngestionSourceBase): # This is populated during the git clone step. base_projects_folder: Dict[str, pathlib.Path] = {} - remote_projects_github_info: Dict[str, GitHubInfo] = {} + remote_projects_git_info: Dict[str, GitInfo] = {} def __init__(self, config: LookMLSourceConfig, ctx: PipelineContext): super().__init__(config, ctx) @@ -1306,17 +1308,17 @@ class LookMLSource(StatefulIngestionSourceBase): name=looker_view.id.view_name, customProperties=custom_properties ) - maybe_github_info = self.source_config.project_dependencies.get( + maybe_git_info = self.source_config.project_dependencies.get( looker_view.id.project_name, - self.remote_projects_github_info.get(looker_view.id.project_name), + self.remote_projects_git_info.get(looker_view.id.project_name), ) - if isinstance(maybe_github_info, GitHubInfo): - github_info: Optional[GitHubInfo] = maybe_github_info + if isinstance(maybe_git_info, GitInfo): + git_info: Optional[GitInfo] = maybe_git_info else: - github_info = self.source_config.github_info - if github_info is not None and file_path: + git_info = self.source_config.git_info + if git_info is not None and file_path: # It should be that looker_view.id.project_name is the base project. - github_file_url = github_info.get_url_for_file_path(file_path) + github_file_url = git_info.get_url_for_file_path(file_path) dataset_props.externalUrl = github_file_url return dataset_props @@ -1424,17 +1426,17 @@ class LookMLSource(StatefulIngestionSourceBase): with tempfile.TemporaryDirectory("lookml_tmp") as tmp_dir: # Clone the base_folder if necessary. if not self.source_config.base_folder: - assert self.source_config.github_info + assert self.source_config.git_info # we don't have a base_folder, so we need to clone the repo and process it locally start_time = datetime.now() git_clone = GitClone(tmp_dir) # github info deploy key is always populated - assert self.source_config.github_info.deploy_key - assert self.source_config.github_info.repo_ssh_locator + assert self.source_config.git_info.deploy_key + assert self.source_config.git_info.repo_ssh_locator checkout_dir = git_clone.clone( - ssh_key=self.source_config.github_info.deploy_key, - repo_url=self.source_config.github_info.repo_ssh_locator, - branch=self.source_config.github_info.branch_for_clone, + ssh_key=self.source_config.git_info.deploy_key, + repo_url=self.source_config.git_info.repo_ssh_locator, + branch=self.source_config.git_info.branch_for_clone, ) self.reporter.git_clone_latency = datetime.now() - start_time self.source_config.base_folder = checkout_dir.resolve() @@ -1447,7 +1449,7 @@ class LookMLSource(StatefulIngestionSourceBase): # We clone everything that we're pointed at. for project, p_ref in self.source_config.project_dependencies.items(): # If we were given GitHub info, we need to clone the project. - if isinstance(p_ref, GitHubInfo): + if isinstance(p_ref, GitInfo): assert p_ref.repo_ssh_locator p_cloner = GitClone(f"{tmp_dir}/_included_/{project}") @@ -1458,8 +1460,8 @@ class LookMLSource(StatefulIngestionSourceBase): # to the main project deploy key. p_ref.deploy_key or ( - self.source_config.github_info.deploy_key - if self.source_config.github_info + self.source_config.git_info.deploy_key + if self.source_config.git_info else None ) ), @@ -1512,8 +1514,8 @@ class LookMLSource(StatefulIngestionSourceBase): p_checkout_dir = p_cloner.clone( ssh_key=( - self.source_config.github_info.deploy_key - if self.source_config.github_info + self.source_config.git_info.deploy_key + if self.source_config.git_info else None ), repo_url=remote_project.url, @@ -1524,17 +1526,15 @@ class LookMLSource(StatefulIngestionSourceBase): ] = p_checkout_dir.resolve() repo = p_cloner.get_last_repo_cloned() assert repo - remote_github_info = GitHubInfo( + remote_git_info = GitInfo( url_template=remote_project.url, repo="dummy/dummy", # set to dummy values to bypass validation branch=repo.active_branch.name, ) - remote_github_info.repo = ( + remote_git_info.repo = ( "" # set to empty because url already contains the full path ) - self.remote_projects_github_info[ - remote_project.name - ] = remote_github_info + self.remote_projects_git_info[remote_project.name] = remote_git_info except Exception as e: logger.warning( diff --git a/metadata-ingestion/tests/integration/git/test_git_clone.py b/metadata-ingestion/tests/integration/git/test_git_clone.py index 04a133df99..35fab522d5 100644 --- a/metadata-ingestion/tests/integration/git/test_git_clone.py +++ b/metadata-ingestion/tests/integration/git/test_git_clone.py @@ -4,7 +4,7 @@ import pytest from pydantic import SecretStr from datahub.configuration.common import ConfigurationWarning -from datahub.configuration.github import GitHubInfo, GitHubReference +from datahub.configuration.git import GitInfo, GitReference from datahub.ingestion.source.git.git_import import GitClone LOOKML_TEST_SSH_KEY = os.environ.get("DATAHUB_LOOKML_GIT_TEST_SSH_KEY") @@ -12,13 +12,11 @@ LOOKML_TEST_SSH_KEY = os.environ.get("DATAHUB_LOOKML_GIT_TEST_SSH_KEY") def test_base_url_guessing(): # Basic GitHub repo. - config = GitHubInfo( - repo="https://github.com/datahub-project/datahub", branch="master" - ) + config = GitInfo(repo="https://github.com/datahub-project/datahub", branch="master") assert config.repo_ssh_locator == "git@github.com:datahub-project/datahub.git" # Defaults to GitHub. - config = GitHubInfo(repo="datahub-project/datahub", branch="master") + config = GitInfo(repo="datahub-project/datahub", branch="master") assert ( config.get_url_for_file_path("docker/README.md") == "https://github.com/datahub-project/datahub/blob/master/docker/README.md" @@ -26,7 +24,7 @@ def test_base_url_guessing(): assert config.repo_ssh_locator == "git@github.com:datahub-project/datahub.git" # GitLab repo (notice the trailing slash). - config_ref = GitHubReference( + config_ref = GitReference( repo="https://gitlab.com/gitlab-tests/sample-project/", branch="master" ) assert ( @@ -35,7 +33,7 @@ def test_base_url_guessing(): ) # Three-tier GitLab repo. - config = GitHubInfo( + config = GitInfo( repo="https://gitlab.com/gitlab-com/gl-infra/reliability", branch="master" ) assert ( @@ -47,7 +45,7 @@ def test_base_url_guessing(): ) # Overrides. - config = GitHubInfo( + config = GitInfo( repo="https://gitea.com/gitea/tea", branch="main", url_template="https://gitea.com/gitea/tea/src/branch/{branch}/{file_path}", @@ -60,7 +58,7 @@ def test_base_url_guessing(): # Deprecated: base_url. with pytest.warns(ConfigurationWarning, match="base_url is deprecated"): - config = GitHubInfo.parse_obj( + config = GitInfo.parse_obj( dict( repo="https://github.com/datahub-project/datahub", branch="master", @@ -70,12 +68,12 @@ def test_base_url_guessing(): def test_github_branch(): - config = GitHubInfo( + config = GitInfo( repo="owner/repo", ) assert config.branch_for_clone is None - config = GitHubInfo( + config = GitInfo( repo="owner/repo", branch="main", ) diff --git a/metadata-ingestion/tests/integration/lookml/test_lookml.py b/metadata-ingestion/tests/integration/lookml/test_lookml.py index 018077f826..e122dc6d3b 100644 --- a/metadata-ingestion/tests/integration/lookml/test_lookml.py +++ b/metadata-ingestion/tests/integration/lookml/test_lookml.py @@ -337,7 +337,7 @@ def test_lookml_bad_sql_parser(pytestconfig, tmp_path, mock_time): @freeze_time(FROZEN_TIME) -def test_lookml_github_info(pytestconfig, tmp_path, mock_time): +def test_lookml_git_info(pytestconfig, tmp_path, mock_time): """Add github info to config""" test_resources_dir = pytestconfig.rootpath / "tests/integration/lookml" mce_out = "lookml_mces_with_external_urls.json"