diff --git a/bootstrap/sql/com.mysql.cj.jdbc.Driver/v012__create_db_connection_info.sql b/bootstrap/sql/com.mysql.cj.jdbc.Driver/v012__create_db_connection_info.sql index c1c4876aab3..946bdca8a51 100644 --- a/bootstrap/sql/com.mysql.cj.jdbc.Driver/v012__create_db_connection_info.sql +++ b/bootstrap/sql/com.mysql.cj.jdbc.Driver/v012__create_db_connection_info.sql @@ -2,3 +2,13 @@ UPDATE metadata_service_entity SET json = JSON_REMOVE(json, '$.openMetadataServerConnection.secretsManagerCredentials') where name = 'OpenMetadata'; + +-- Rename githubCredentials to gitCredentials +UPDATE dashboard_service_entity +SET json = JSON_INSERT( + JSON_REMOVE(json, '$.connection.config.githubCredentials'), + '$.connection.config.gitCredentials', + JSON_EXTRACT(json, '$.connection.config.githubCredentials') + ) +WHERE serviceType = 'Looker' + AND JSON_EXTRACT(json, '$.connection.config.githubCredentials') IS NOT NULL; diff --git a/bootstrap/sql/org.postgresql.Driver/v012__create_db_connection_info.sql b/bootstrap/sql/org.postgresql.Driver/v012__create_db_connection_info.sql index 67d7579ff1b..f3516e5129b 100644 --- a/bootstrap/sql/org.postgresql.Driver/v012__create_db_connection_info.sql +++ b/bootstrap/sql/org.postgresql.Driver/v012__create_db_connection_info.sql @@ -2,3 +2,9 @@ UPDATE metadata_service_entity SET json = json::jsonb #- '{openMetadataServerConnection.secretsManagerCredentials}' where name = 'OpenMetadata'; + +-- Rename githubCredentials to gitCredentials +UPDATE dashboard_service_entity +SET json = jsonb_set(json, '{connection,config,gitCredentials}', json#>'{connection,config,githubCredentials}') + where serviceType = 'Looker' + and json#>'{connection,config,githubCredentials}' is not null; \ No newline at end of file diff --git a/ingestion/src/metadata/ingestion/source/dashboard/looker/connection.py b/ingestion/src/metadata/ingestion/source/dashboard/looker/connection.py index ea300ab708a..aecddda9f40 100644 --- 
a/ingestion/src/metadata/ingestion/source/dashboard/looker/connection.py +++ b/ingestion/src/metadata/ingestion/source/dashboard/looker/connection.py @@ -61,8 +61,17 @@ def test_connection( """ assert client.all_lookml_models(limit=1) + def validate_api_version(): + """ + Make sure we get a True + """ + assert "4.0" in ( + api_version.version for api_version in client.versions().supported_versions + ) + test_fn = { "CheckAccess": client.me, + "ValidateVersion": validate_api_version, "ListDashboards": lambda: client.all_dashboards(fields="id,title"), "ListLookMLModels": list_datamodels_test, } diff --git a/ingestion/src/metadata/ingestion/source/dashboard/looker/links.py b/ingestion/src/metadata/ingestion/source/dashboard/looker/links.py new file mode 100644 index 00000000000..b0238cda5b5 --- /dev/null +++ b/ingestion/src/metadata/ingestion/source/dashboard/looker/links.py @@ -0,0 +1,30 @@ +# Copyright 2021 Collate +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +LookML Link handler +""" +from urllib.parse import unquote, urlparse + + +def get_path_from_link(link: str) -> str: + """ + Given the `lookml_link` property from an explore, + get the source file path to fetch the file from Git. + + Note that we cannot directly use the `source_file` + property since it does not give us the actual path, + only the file name. 
+ + The usual shape will be: + /projects//files/?params + """ + parsed = urlparse(unquote(link)) + return parsed.path.split("/files/")[-1] diff --git a/ingestion/src/metadata/ingestion/source/dashboard/looker/metadata.py b/ingestion/src/metadata/ingestion/source/dashboard/looker/metadata.py index 3336f412600..2d38ea4b18b 100644 --- a/ingestion/src/metadata/ingestion/source/dashboard/looker/metadata.py +++ b/ingestion/src/metadata/ingestion/source/dashboard/looker/metadata.py @@ -21,7 +21,7 @@ Notes: import traceback from datetime import datetime -from typing import Iterable, List, Optional, Sequence, Set, Union, cast +from typing import Dict, Iterable, List, Optional, Sequence, Set, Type, Union, cast from looker_sdk.sdk.api40.methods import Looker40SDK from looker_sdk.sdk.api40.models import Dashboard as LookerDashboard @@ -31,6 +31,7 @@ from looker_sdk.sdk.api40.models import ( LookmlModel, LookmlModelExplore, LookmlModelNavExplore, + Project, ) from pydantic import ValidationError @@ -61,6 +62,9 @@ from metadata.generated.schema.entity.services.dashboardService import ( from metadata.generated.schema.metadataIngestion.workflow import ( Source as WorkflowSource, ) +from metadata.generated.schema.security.credentials.bitbucketCredentials import ( + BitBucketCredentials, +) from metadata.generated.schema.security.credentials.githubCredentials import ( GitHubCredentials, ) @@ -73,12 +77,17 @@ from metadata.ingestion.source.dashboard.dashboard_service import ( DashboardUsage, ) from metadata.ingestion.source.dashboard.looker.columns import get_columns_from_model +from metadata.ingestion.source.dashboard.looker.links import get_path_from_link from metadata.ingestion.source.dashboard.looker.models import ( Includes, LookMlView, ViewName, ) from metadata.ingestion.source.dashboard.looker.parser import LkmlParser +from metadata.readers.api_reader import ReadersCredentials +from metadata.readers.base import Reader +from metadata.readers.bitbucket import BitBucketReader 
+from metadata.readers.credentials import get_credentials_from_url from metadata.readers.github import GitHubReader from metadata.utils import fqn from metadata.utils.filters import filter_by_chart, filter_by_datamodel @@ -118,6 +127,7 @@ def build_datamodel_name(model_name: str, explore_name: str) -> str: return clean_dashboard_name(model_name + "_" + explore_name) +# pylint: disable=too-many-public-methods class LookerSource(DashboardServiceSource): """ Looker Source Class. @@ -137,8 +147,10 @@ class LookerSource(DashboardServiceSource): super().__init__(config, metadata_config) self.today = datetime.now().strftime("%Y-%m-%d") - self._parser = None self._explores_cache = {} + self._repo_credentials: Optional[ReadersCredentials] = None + self._reader_class: Optional[Type[Reader]] = None + self._project_parsers: Optional[Dict[str, LkmlParser]] = None @classmethod def create( @@ -153,24 +165,91 @@ class LookerSource(DashboardServiceSource): return cls(config, metadata_config) @property - def parser(self) -> Optional[LkmlParser]: - if not self._parser and self.github_credentials: - self._parser = LkmlParser(reader=GitHubReader(self.github_credentials)) + def parser(self) -> Optional[Dict[str, LkmlParser]]: + if self.repository_credentials: + return self._project_parsers - return self._parser + return None + + @parser.setter + def parser(self, all_lookml_models: Sequence[LookmlModel]) -> None: + """ + Initialize the project parsers. + + Each LookML model is linked to a Looker Project. Each project can be + hosted in different GitHub repositories. + + Here we will prepare the Readers for each project and the LookML parser. + + We are assuming that each Git repo is based under the same owner + and can be accessed with the same token. If we have + any errors obtaining the git project information, we will default + to the incoming GitHub Credentials. 
+ """ + if self.repository_credentials: + all_projects: Set[str] = {model.project_name for model in all_lookml_models} + self._project_parsers: Dict[str, LkmlParser] = { + project_name: LkmlParser( + reader=self.reader( + credentials=self.get_lookml_project_credentials( + project_name=project_name + ) + ) + ) + for project_name in all_projects + } + + logger.info(f"We found the following parsers:\n {self._project_parsers}") + + def get_lookml_project_credentials(self, project_name: str) -> GitHubCredentials: + """ + Given a lookml project, get its git URL and build the credentials + """ + try: + project: Project = self.client.project(project_id=project_name) + return get_credentials_from_url( + original=self.repository_credentials, url=project.git_remote_url + ) + except Exception as err: + logger.error( + f"Error trying to build project credentials - [{err}]. We'll use the default ones." + ) + return self.repository_credentials @property - def github_credentials(self) -> Optional[GitHubCredentials]: + def reader(self) -> Optional[Type[Reader]]: + """ + Depending on the type of the credentials we'll need a different reader + """ + if not self._reader_class: + + if self.service_connection.gitCredentials and isinstance( + self.service_connection.gitCredentials, GitHubCredentials + ): + self._reader_class = GitHubReader + + if self.service_connection.gitCredentials and isinstance( + self.service_connection.gitCredentials, BitBucketCredentials + ): + self._reader_class = BitBucketReader + + return self._reader_class + + @property + def repository_credentials(self) -> Optional[ReadersCredentials]: """ Check if the credentials are informed and return them. 
We either get GitHubCredentials or `NoGitHubCredentials` """ - if self.service_connection.githubCredentials and isinstance( - self.service_connection.githubCredentials, GitHubCredentials - ): - return self.service_connection.githubCredentials - return None + if not self._repo_credentials: + + if self.service_connection.gitCredentials and isinstance( + self.service_connection.gitCredentials, GitHubCredentials + ): + self._repo_credentials = self.service_connection.gitCredentials + + return self._repo_credentials def list_datamodels(self) -> Iterable[LookmlModelExplore]: """ @@ -182,6 +261,11 @@ class LookerSource(DashboardServiceSource): all_lookml_models: Sequence[ LookmlModel ] = self.client.all_lookml_models() + + # Then, gather their information and build the parser + self.parser = all_lookml_models + + # Finally, iterate through them to ingest Explores and Views yield from self.fetch_lookml_explores(all_lookml_models) except Exception as err: logger.debug(traceback.format_exc()) @@ -255,7 +339,7 @@ class LookerSource(DashboardServiceSource): # We can get VIEWs from the JOINs to know the dependencies # We will only try and fetch if we have the credentials - if self.github_credentials: + if self.repository_credentials: for view in model.joins: if filter_by_datamodel( self.source_config.dataModelFilterPattern, view.name @@ -290,11 +374,17 @@ class LookerSource(DashboardServiceSource): file definition and add it here """ # Only look to parse if creds are in - if self.github_credentials: + if self.repository_credentials: try: - # This will only parse if the file has not been parsed yet - self.parser.parse_file(Includes(explore.source_file)) - return self.parser.parsed_files.get(Includes(explore.source_file)) + project_parser = self.parser.get(explore.project_name) + if project_parser: + # This will only parse if the file has not been parsed yet + project_parser.parse_file( + Includes(get_path_from_link(explore.lookml_link)) + ) + return 
project_parser.parsed_files.get( + Includes(get_path_from_link(explore.lookml_link)) + ) except Exception as err: logger.warning(f"Exception getting the model sql: {err}") @@ -310,24 +400,29 @@ class LookerSource(DashboardServiceSource): Every visited view, will be cached so that we don't need to process everything again. """ - view: Optional[LookMlView] = self.parser.find_view( - view_name=view_name, path=Includes(explore.source_file) - ) - if view: - yield CreateDashboardDataModelRequest( - name=build_datamodel_name(explore.model_name, view.name), - displayName=view.name, - description=view.description, - service=self.context.dashboard_service.fullyQualifiedName.__root__, - dataModelType=DataModelType.LookMlView.value, - serviceType=DashboardServiceType.Looker.value, - columns=get_columns_from_model(view), - sql=self.parser.parsed_files.get(Includes(view.source_file)), + project_parser = self.parser.get(explore.project_name) + if project_parser: + + view: Optional[LookMlView] = project_parser.find_view( + view_name=view_name, + path=Includes(get_path_from_link(explore.lookml_link)), ) - self.status.scanned(f"Data Model Scanned: {view.name}") - yield from self.add_view_lineage(view, explore) + if view: + yield CreateDashboardDataModelRequest( + name=build_datamodel_name(explore.model_name, view.name), + displayName=view.name, + description=view.description, + service=self.context.dashboard_service.fullyQualifiedName.__root__, + dataModelType=DataModelType.LookMlView.value, + serviceType=DashboardServiceType.Looker.value, + columns=get_columns_from_model(view), + sql=project_parser.parsed_files.get(Includes(view.source_file)), + ) + self.status.scanned(f"Data Model Scanned: {view.name}") + + yield from self.add_view_lineage(view, explore) def add_view_lineage( self, view: LookMlView, explore: LookmlModelExplore diff --git a/ingestion/src/metadata/ingestion/source/dashboard/looker/parser.py b/ingestion/src/metadata/ingestion/source/dashboard/looker/parser.py index 
0da310558b6..aec093b4c6c 100644 --- a/ingestion/src/metadata/ingestion/source/dashboard/looker/parser.py +++ b/ingestion/src/metadata/ingestion/source/dashboard/looker/parser.py @@ -134,3 +134,9 @@ class LkmlParser: # We might not find the view ever return self.get_view_from_cache(view_name) + + def __repr__(self): + """ + Customize string repr for logs + """ + return f"Parser at [{self.reader.credentials.repositoryOwner}/{self.reader.credentials.repositoryName}]" diff --git a/ingestion/src/metadata/ingestion/source/pipeline/spline/__init__.py b/ingestion/src/metadata/ingestion/source/pipeline/spline/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/ingestion/src/metadata/ingestion/source/pipeline/spline/client.py b/ingestion/src/metadata/ingestion/source/pipeline/spline/client.py index 31d375c1d84..bff917ecbfc 100644 --- a/ingestion/src/metadata/ingestion/source/pipeline/spline/client.py +++ b/ingestion/src/metadata/ingestion/source/pipeline/spline/client.py @@ -17,16 +17,16 @@ from typing import List from metadata.generated.schema.entity.services.connections.pipeline.splineConnection import ( SplineConnection, ) -from metadata.ingestion.ometa.client import REST, APIError, ClientConfig +from metadata.ingestion.ometa.client import REST, ClientConfig from metadata.ingestion.source.pipeline.spline.models import ( ExecutionDetail, ExecutionEvents, ) from metadata.utils.constants import AUTHORIZATION_HEADER, NO_ACCESS_TOKEN +from metadata.utils.helpers import clean_uri from metadata.utils.logger import ingestion_logger logger = ingestion_logger() -from metadata.utils.helpers import clean_uri class SplineClient: @@ -34,7 +34,6 @@ class SplineClient: Wrapper on top of Spline REST API """ - # pylint: disable=too-many-arguments def __init__(self, config: SplineConnection): self.config = config client_config: ClientConfig = ClientConfig( diff --git a/ingestion/src/metadata/ingestion/source/pipeline/spline/metadata.py 
b/ingestion/src/metadata/ingestion/source/pipeline/spline/metadata.py index 10d585efcb9..7cdeb2c85af 100644 --- a/ingestion/src/metadata/ingestion/source/pipeline/spline/metadata.py +++ b/ingestion/src/metadata/ingestion/source/pipeline/spline/metadata.py @@ -109,9 +109,9 @@ class SplineSource(PipelineServiceSource): def _get_table_entity( self, database_name: str, schema_name: str, table_name: str - ) -> Table: + ) -> Optional[Table]: if not table_name: - return + return None for service_name in self.source_config.dbServiceNames: table_fqn = fqn.build( metadata=self.metadata, @@ -122,19 +122,21 @@ class SplineSource(PipelineServiceSource): database_name=database_name, ) if table_fqn: - table_entity = self.metadata.get_by_name(entity=Table, fqn=table_fqn) + table_entity: Table = self.metadata.get_by_name( + entity=Table, fqn=table_fqn + ) if table_entity: return table_entity return None - def _get_table_from_datasource_name(self, datasource: str): + def _get_table_from_datasource_name(self, datasource: str) -> Optional[Table]: if ( not datasource and not datasource.startswith("dbfs") and not datasource.startswith("jdbc") ): - return + return None try: schema_name = None @@ -153,6 +155,8 @@ class SplineSource(PipelineServiceSource): logger.debug(traceback.format_exc()) logger.warning(f"failed to parse datasource details due to: {exc}") + return None + def yield_pipeline_lineage_details( self, pipeline_details: ExecutionEvent ) -> Optional[Iterable[AddLineageRequest]]: diff --git a/ingestion/src/metadata/ingestion/source/pipeline/spline/models.py b/ingestion/src/metadata/ingestion/source/pipeline/spline/models.py index 8e3662408fc..b976ef289f2 100644 --- a/ingestion/src/metadata/ingestion/source/pipeline/spline/models.py +++ b/ingestion/src/metadata/ingestion/source/pipeline/spline/models.py @@ -8,7 +8,9 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. - +""" +Spline connector API response models +""" from typing import List, Optional from pydantic import BaseModel diff --git a/ingestion/src/metadata/ingestion/source/pipeline/spline/utils.py b/ingestion/src/metadata/ingestion/source/pipeline/spline/utils.py index 7909f11cdc5..3d894c63dac 100644 --- a/ingestion/src/metadata/ingestion/source/pipeline/spline/utils.py +++ b/ingestion/src/metadata/ingestion/source/pipeline/spline/utils.py @@ -8,7 +8,9 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - +""" +Spline source processing utilities +""" import traceback from typing import Optional, Tuple @@ -36,13 +38,15 @@ def parse_dbfs_path(path: str) -> Optional[str]: return None -def clean_name(name: str) -> str: +def clean_name(name: str) -> Optional[str]: """ replace empty string with None """ if name: return name + return None + def parse_jdbc_url(url: str) -> Tuple[Optional[str], Optional[str], Optional[str]]: """ @@ -52,7 +56,7 @@ def parse_jdbc_url(url: str) -> Tuple[Optional[str], Optional[str], Optional[str lexer = JdbcUriLexer(InputStream(url)) stream = CommonTokenStream(lexer) parser = JdbcUriParser(stream) - parser._errHandler = BailErrorStrategy() # pylint: disable=protected-acc ess + parser._errHandler = BailErrorStrategy() # pylint: disable=protected-access tree = parser.jdbcUrl() schema_table = tree.schemaTable() if schema_table: diff --git a/ingestion/src/metadata/readers/api_reader.py b/ingestion/src/metadata/readers/api_reader.py new file mode 100644 index 00000000000..65ff42c5470 --- /dev/null +++ b/ingestion/src/metadata/readers/api_reader.py @@ -0,0 +1,60 @@ +# Copyright 2021 Collate +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +GitHub client to read files with token auth +""" + +from abc import ABC +from typing import Dict, Union + +from metadata.generated.schema.security.credentials.bitbucketCredentials import ( + BitBucketCredentials, +) +from metadata.generated.schema.security.credentials.githubCredentials import ( + GitHubCredentials, +) +from metadata.readers.base import Reader +from metadata.utils.logger import ingestion_logger + +logger = ingestion_logger() + +ReadersCredentials = Union[GitHubCredentials, BitBucketCredentials] + + +class ApiReader(Reader, ABC): + """ + Generic API Reader + """ + + def __init__(self, credentials: ReadersCredentials): + + self._auth_headers = None + self.credentials = credentials + + @property + def auth_headers(self) -> Dict[str, str]: + """ + Build the headers to authenticate + to the API + """ + if self._auth_headers is None and self.credentials.token: + self._auth_headers = { + "Authorization": f"Bearer {self.credentials.token.__root__.get_secret_value()}" + } + + return self._auth_headers + + @staticmethod + def _build_url(*parts: str): + """ + Build URL parts + """ + return "/".join(parts) diff --git a/ingestion/src/metadata/readers/bitbucket.py b/ingestion/src/metadata/readers/bitbucket.py new file mode 100644 index 00000000000..9163377875e --- /dev/null +++ b/ingestion/src/metadata/readers/bitbucket.py @@ -0,0 +1,76 @@ +# Copyright 2021 Collate +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +GitHub client to read files with token auth +""" +import traceback +from enum import Enum + +import requests + +from metadata.generated.schema.security.credentials.bitbucketCredentials import ( + BitBucketCredentials, +) +from metadata.readers.api_reader import ApiReader +from metadata.readers.base import ReadException +from metadata.utils.logger import ingestion_logger + +logger = ingestion_logger() + + +HOST = "https://api.bitbucket.org/2.0" + + +class UrlParts(Enum): + REPOS = "repositories" + SRC = "src" + + +class BitBucketReader(ApiReader): + """ + Handle calls to the GitHub API against a repo + """ + + credentials: BitBucketCredentials + + def read(self, path: str) -> str: + """ + Read a file from a GitHub Repo and return its + contents as a string + https://docs.github.com/en/rest/repos/contents?apiVersion=2022-11-28#get-repository-content + + This does not care if the path starts with `/` or not. 
+ """ + try: + res = requests.get( + self._build_url( + HOST, + UrlParts.REPOS.value, + self.credentials.repositoryOwner.__root__, + self.credentials.repositoryName.__root__, + UrlParts.SRC.value, + self.credentials.branch, + path, + ), + headers=self.auth_headers, + timeout=30, + ) + if res.status_code == 200: + return res.text + + # If we don't get a 200, raise + res.raise_for_status() + + except Exception as err: + logger.debug(traceback.format_exc()) + raise ReadException(f"Error fetching file [{path}] from repo: {err}") + + raise ReadException(f"Could not fetch file [{path}] from repo") diff --git a/ingestion/src/metadata/readers/credentials.py b/ingestion/src/metadata/readers/credentials.py new file mode 100644 index 00000000000..421b7cba3d7 --- /dev/null +++ b/ingestion/src/metadata/readers/credentials.py @@ -0,0 +1,62 @@ +# Copyright 2021 Collate +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +""" +Helper to manage readers' credentials functionalities +""" +from metadata.generated.schema.security.credentials.gitCredentials import RepositoryName +from metadata.readers.api_reader import ReadersCredentials +from metadata.utils.logger import ingestion_logger + +logger = ingestion_logger() + + +def update_repository_name( + original: ReadersCredentials, name: str +) -> ReadersCredentials: + """ + Given an original set of credentials and a new repository name, + return the updated credentials + """ + updated = original.copy(deep=True) + updated.repositoryName = RepositoryName(__root__=name) + + return updated + + +def get_credentials_from_url( + original: ReadersCredentials, url: str +) -> ReadersCredentials: + """ + Given a default set of credentials and a git URL, check if the + owner of the original credentials is part of the new URL. + + If it is, return updated credentials with the new repository name. + + If not, return the original credentials. + + This is just a quick sanity check. Worst case scenario, we won't be able to pick + up information, which would still not happen since we work with a single + token which cannot have permissions on different owners. + """ + if original.repositoryOwner.__root__ not in url: + logger.warning( + f"Default repository owner [{original.repositoryOwner.__root__}] not found in [{url}]." + " We'll use the default reader credentials." 
+ ) + return original + + # Your typical URL is git@bitbucket.org:owner/repo.git + # or git@github.com:owner/repo.git + url_repository = url.split(original.repositoryOwner.__root__ + "/")[-1] + repo_name = url_repository.replace(".git", "") + + return update_repository_name(original=original, name=repo_name) diff --git a/ingestion/src/metadata/readers/github.py b/ingestion/src/metadata/readers/github.py index d26b5bbd2fe..e3cdc94249d 100644 --- a/ingestion/src/metadata/readers/github.py +++ b/ingestion/src/metadata/readers/github.py @@ -21,7 +21,8 @@ import requests from metadata.generated.schema.security.credentials.githubCredentials import ( GitHubCredentials, ) -from metadata.readers.base import Reader, ReadException +from metadata.readers.api_reader import ApiReader +from metadata.readers.base import ReadException from metadata.utils.constants import UTF_8 from metadata.utils.logger import ingestion_logger @@ -36,35 +37,12 @@ class UrlParts(Enum): CONTENTS = "contents" -class GitHubReader(Reader): +class GitHubReader(ApiReader): """ Handle calls to the GitHub API against a repo """ - def __init__(self, credentials: GitHubCredentials): - self.credentials = credentials - - self._auth_headers = None - - @property - def auth_headers(self) -> Dict[str, str]: - """ - Build the headers to authenticate - to the API - """ - if self._auth_headers is None: - self._auth_headers = { - "Authorization": f"Bearer {self.credentials.token.get_secret_value()}" - } - - return self._auth_headers - - @staticmethod - def _build_url(*parts: str): - """ - Build URL parts - """ - return "/".join(parts) + credentials: GitHubCredentials @staticmethod def _decode_content(json_response: Dict[str, Any]) -> str: @@ -80,14 +58,16 @@ class GitHubReader(Reader): Read a file from a GitHub Repo and return its contents as a string https://docs.github.com/en/rest/repos/contents?apiVersion=2022-11-28#get-repository-content + + This does not care if the path starts with `/` or not. 
""" try: res = requests.get( self._build_url( HOST, UrlParts.REPOS.value, - self.credentials.repositoryOwner, - self.credentials.repositoryName, + self.credentials.repositoryOwner.__root__, + self.credentials.repositoryName.__root__, UrlParts.CONTENTS.value, path, ), diff --git a/ingestion/tests/unit/readers/__init__.py b/ingestion/tests/unit/readers/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/ingestion/tests/unit/readers/test_credentials.py b/ingestion/tests/unit/readers/test_credentials.py new file mode 100644 index 00000000000..8cfe68c91db --- /dev/null +++ b/ingestion/tests/unit/readers/test_credentials.py @@ -0,0 +1,120 @@ +# Copyright 2021 Collate +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +""" +Test Credentials helpers +""" +from unittest import TestCase + +from metadata.generated.schema.security.credentials.bitbucketCredentials import ( + BitBucketCredentials, +) +from metadata.generated.schema.security.credentials.githubCredentials import ( + GitHubCredentials, +) +from metadata.ingestion.models.custom_pydantic import CustomSecretStr +from metadata.readers.credentials import ( + get_credentials_from_url, + update_repository_name, +) + + +class TestCreds(TestCase): + """ + Validate credentials scenarios + """ + + def test_update_repository_name(self): + """ + Check we get new creds without updating the original + """ + + original = GitHubCredentials( + repositoryOwner="owner", + repositoryName="name", + token="token", + ) + + updated = update_repository_name(original=original, name="new_name") + + self.assertEqual(original.repositoryName.__root__, "name") + self.assertEqual(updated.repositoryName.__root__, "new_name") + self.assertEqual( + updated.repositoryOwner.__root__, original.repositoryOwner.__root__ + ) + self.assertEqual(updated.token.__root__, original.token.__root__) + + bb_original = BitBucketCredentials( + repositoryOwner="owner", + repositoryName="name", + token="token", + branch="branch", + ) + + bb_updated = update_repository_name(original=bb_original, name="new_name") + + self.assertEqual(bb_original.repositoryName.__root__, "name") + self.assertEqual(bb_updated.repositoryName.__root__, "new_name") + self.assertEqual( + bb_updated.repositoryOwner.__root__, bb_original.repositoryOwner.__root__ + ) + self.assertEqual(bb_updated.token.__root__, bb_original.token.__root__) + self.assertEqual(bb_updated.branch, bb_original.branch) + + def test_get_credentials_from_url(self): + """ + With and without the right owner + """ + url = "git@github.com:owner/repo.git" + + original = GitHubCredentials( + repositoryOwner="owner", + repositoryName="name", + token="token", + ) + + updated = get_credentials_from_url(original=original, url=url) + 
self.assertEqual(updated.repositoryName.__root__, "repo") + + original_not_owner = GitHubCredentials( + repositoryOwner="not_owner", + repositoryName="name", + token="token", + ) + + updated_not_owner = get_credentials_from_url( + original=original_not_owner, url=url + ) + self.assertEqual(updated_not_owner, original_not_owner) + + bb_url = "git@gitbucket.org:owner/repo.git" + + bb_original = BitBucketCredentials( + repositoryOwner="owner", + repositoryName="name", + token="token", + branch="branch", + ) + + bb_updated = get_credentials_from_url(original=bb_original, url=bb_url) + self.assertEqual(bb_updated.repositoryName.__root__, "repo") + + bb_original_not_owner = BitBucketCredentials( + repositoryOwner="not_owner", + repositoryName="name", + token="token", + branch="branch", + ) + + bb_updated_not_owner = get_credentials_from_url( + original=bb_original_not_owner, url=bb_url + ) + self.assertEqual(bb_updated_not_owner, bb_original_not_owner) diff --git a/ingestion/tests/unit/readers/test_github_reader.py b/ingestion/tests/unit/readers/test_github_reader.py new file mode 100644 index 00000000000..59abeb128c1 --- /dev/null +++ b/ingestion/tests/unit/readers/test_github_reader.py @@ -0,0 +1,50 @@ +# Copyright 2021 Collate +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +""" +Test GitHub Reader +""" +from unittest import TestCase + +from metadata.generated.schema.security.credentials.githubCredentials import ( + GitHubCredentials, +) +from metadata.readers.github import GitHubReader + + +class TestGitHubReader(TestCase): + """ + Validate the github reader against the OM repo + """ + + def test_headers(self): + """ + We build the headers correctly + """ + creds = GitHubCredentials( + repositoryName="name", repositoryOwner="owner", token="token" + ) + + reader = GitHubReader(creds) + + self.assertEqual(reader.auth_headers, {"Authorization": "Bearer token"}) + + def test_read(self): + """ + We can read the OM README + """ + creds = GitHubCredentials( + repositoryName="OpenMetadata", + repositoryOwner="open-metadata", + ) + + reader = GitHubReader(creds) + self.assertIsNotNone(reader.read("README.md")) diff --git a/ingestion/tests/unit/topology/dashboard/test_looker_lkml_parser.py b/ingestion/tests/unit/topology/dashboard/test_looker_lkml_parser.py index 113208f1dec..a0b91d91141 100644 --- a/ingestion/tests/unit/topology/dashboard/test_looker_lkml_parser.py +++ b/ingestion/tests/unit/topology/dashboard/test_looker_lkml_parser.py @@ -14,6 +14,7 @@ Test the lkml parser from pathlib import Path from unittest import TestCase +from metadata.ingestion.source.dashboard.looker.links import get_path_from_link from metadata.ingestion.source.dashboard.looker.parser import ( Includes, LkmlParser, @@ -136,3 +137,20 @@ class TestLkmlParser(TestCase): "views/cats.view.lkml": [], }, ) + + def test_get_path_from_link(self): + """ + Validate utility + """ + simple_link = "/projects/my_project/files/hello.explore.lkml" + self.assertEqual(get_path_from_link(simple_link), "hello.explore.lkml") + + link = "/projects/my_project/files/hello%2Fexplores%2Fmy_explore.explore.lkml?line=13" + self.assertEqual( + get_path_from_link(link), "hello/explores/my_explore.explore.lkml" + ) + + link_no_files = "hello%2Fexplores%2Fmy_explore.explore.lkml?line=13" + 
self.assertEqual( + get_path_from_link(link_no_files), "hello/explores/my_explore.explore.lkml" + ) diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/secrets/converter/ClassConverterFactory.java b/openmetadata-service/src/main/java/org/openmetadata/service/secrets/converter/ClassConverterFactory.java index 08eb44a43a8..c7b41c9ff55 100644 --- a/openmetadata-service/src/main/java/org/openmetadata/service/secrets/converter/ClassConverterFactory.java +++ b/openmetadata-service/src/main/java/org/openmetadata/service/secrets/converter/ClassConverterFactory.java @@ -21,6 +21,7 @@ import org.openmetadata.schema.entity.automations.Workflow; import org.openmetadata.schema.metadataIngestion.DbtPipeline; import org.openmetadata.schema.metadataIngestion.dbtconfig.DbtGCSConfig; import org.openmetadata.schema.security.credentials.GCSCredentials; +import org.openmetadata.schema.services.connections.dashboard.LookerConnection; import org.openmetadata.schema.services.connections.dashboard.SupersetConnection; import org.openmetadata.schema.services.connections.dashboard.TableauConnection; import org.openmetadata.schema.services.connections.database.BigQueryConnection; @@ -49,6 +50,7 @@ public final class ClassConverterFactory { Map.entry(GCSConfig.class, new GCSConfigClassConverter()), Map.entry(GCSCredentials.class, new GcsCredentialsClassConverter()), Map.entry(GcsConnection.class, new GcsConnectionClassConverter()), + Map.entry(LookerConnection.class, new LookerConnectionClassConverter()), Map.entry(OpenMetadataConnection.class, new OpenMetadataConnectionClassConverter()), Map.entry(SSOAuthMechanism.class, new SSOAuthMechanismClassConverter()), Map.entry(SupersetConnection.class, new SupersetConnectionClassConverter()), diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/secrets/converter/LookerConnectionClassConverter.java b/openmetadata-service/src/main/java/org/openmetadata/service/secrets/converter/LookerConnectionClassConverter.java 
new file mode 100644 index 00000000000..895514d8925 --- /dev/null +++ b/openmetadata-service/src/main/java/org/openmetadata/service/secrets/converter/LookerConnectionClassConverter.java @@ -0,0 +1,27 @@ +package org.openmetadata.service.secrets.converter; + +import java.util.List; +import org.openmetadata.schema.security.credentials.BitBucketCredentials; +import org.openmetadata.schema.security.credentials.GitHubCredentials; +import org.openmetadata.schema.services.connections.dashboard.LookerConnection; +import org.openmetadata.service.util.JsonUtils; + +public class LookerConnectionClassConverter extends ClassConverter { + + private static final List<Class<?>> CREDENTIALS_CLASSES = + List.of(GitHubCredentials.class, BitBucketCredentials.class); + + public LookerConnectionClassConverter() { + super(LookerConnection.class); + } + + @Override + public Object convert(Object object) { + LookerConnection lookerConnection = (LookerConnection) JsonUtils.convertValue(object, this.clazz); + + tryToConvertOrFail(lookerConnection.getGitCredentials(), CREDENTIALS_CLASSES) + .ifPresent(lookerConnection::setGitCredentials); + + return lookerConnection; + } +} diff --git a/openmetadata-service/src/main/resources/json/data/testConnections/dashboard/looker.json b/openmetadata-service/src/main/resources/json/data/testConnections/dashboard/looker.json index 5ef0a54c1e8..7f9871fd61f 100644 --- a/openmetadata-service/src/main/resources/json/data/testConnections/dashboard/looker.json +++ b/openmetadata-service/src/main/resources/json/data/testConnections/dashboard/looker.json @@ -11,6 +11,13 @@ "shortCircuit": true, "mandatory": true }, + { + "name": "ValidateVersion", + "description": "Validate that the API version supports the SDK 4.0", + "errorMessage": "API Version 4.0 is not listed on your instance supported versions.
Note that 4.0 is the stable version: https://cloud.google.com/looker/docs/api-sdk", + "shortCircuit": true, + "mandatory": true + }, { "name": "ListDashboards", "description": "The user has permissions to list a non-empty list of dashboards", diff --git a/openmetadata-service/src/test/java/org/openmetadata/service/secrets/converter/ClassConverterFactoryTest.java b/openmetadata-service/src/test/java/org/openmetadata/service/secrets/converter/ClassConverterFactoryTest.java index ec51032d7e7..05e08067580 100644 --- a/openmetadata-service/src/test/java/org/openmetadata/service/secrets/converter/ClassConverterFactoryTest.java +++ b/openmetadata-service/src/test/java/org/openmetadata/service/secrets/converter/ClassConverterFactoryTest.java @@ -12,7 +12,9 @@ import org.openmetadata.schema.entity.automations.Workflow; import org.openmetadata.schema.metadataIngestion.DbtPipeline; import org.openmetadata.schema.metadataIngestion.dbtconfig.DbtGCSConfig; import org.openmetadata.schema.security.credentials.GCSCredentials; +import org.openmetadata.schema.services.connections.dashboard.LookerConnection; import org.openmetadata.schema.services.connections.dashboard.SupersetConnection; +import org.openmetadata.schema.services.connections.dashboard.TableauConnection; import org.openmetadata.schema.services.connections.database.BigQueryConnection; import org.openmetadata.schema.services.connections.database.DatalakeConnection; import org.openmetadata.schema.services.connections.database.datalake.GCSConfig; @@ -26,16 +28,18 @@ public class ClassConverterFactoryTest { @ValueSource( classes = { AirflowConnection.class, + BigQueryConnection.class, DatalakeConnection.class, + DbtGCSConfig.class, DbtPipeline.class, + GCSConfig.class, + GCSCredentials.class, + GcsConnection.class, + LookerConnection.class, + OpenMetadataConnection.class, SSOAuthMechanism.class, SupersetConnection.class, - GCSCredentials.class, - OpenMetadataConnection.class, - GcsConnection.class, - GCSConfig.class, - 
BigQueryConnection.class, - DbtGCSConfig.class, + TableauConnection.class, TestServiceConnectionRequest.class, Workflow.class }) @@ -45,6 +49,6 @@ public class ClassConverterFactoryTest { @Test void testClassConvertedMapIsNotModified() { - assertEquals(ClassConverterFactory.getConverterMap().size(), 14); + assertEquals(ClassConverterFactory.getConverterMap().size(), 15); } } diff --git a/openmetadata-spec/src/main/resources/json/schema/entity/services/connections/dashboard/lookerConnection.json b/openmetadata-spec/src/main/resources/json/schema/entity/services/connections/dashboard/lookerConnection.json index 68779d57dd7..09726306d35 100644 --- a/openmetadata-spec/src/main/resources/json/schema/entity/services/connections/dashboard/lookerConnection.json +++ b/openmetadata-spec/src/main/resources/json/schema/entity/services/connections/dashboard/lookerConnection.json @@ -12,7 +12,7 @@ "enum": ["Looker"], "default": "Looker" }, - "noGitHubCredentials": { + "noGitCredentials": { "title": "No GitHub Credentials", "description": "Do not set any credentials. Note that credentials are required to extract .lkml views and their lineage.", "type": "object", @@ -44,15 +44,18 @@ "type": "string", "format": "uri" }, - "githubCredentials": { + "gitCredentials": { "title": "GitHub Credentials", "description": "Credentials to extract the .lkml files from a repository. 
This is required to get all the lineage and definitions.", "oneOf": [ { - "$ref": "#/definitions/noGitHubCredentials" + "$ref": "#/definitions/noGitCredentials" }, { "$ref": "../../../../security/credentials/githubCredentials.json" + }, + { + "$ref": "../../../../security/credentials/bitbucketCredentials.json" } ] }, diff --git a/openmetadata-spec/src/main/resources/json/schema/security/credentials/bitbucketCredentials.json b/openmetadata-spec/src/main/resources/json/schema/security/credentials/bitbucketCredentials.json new file mode 100644 index 00000000000..c3fcc71a140 --- /dev/null +++ b/openmetadata-spec/src/main/resources/json/schema/security/credentials/bitbucketCredentials.json @@ -0,0 +1,40 @@ +{ + "$id": "https://open-metadata.org/security/credentials/bitbucketCredentials.json", + "$schema": "http://json-schema.org/draft-07/schema#", + "title": "BitBucketCredentials", + "description": "Credentials for a BitBucket repository", + "type": "object", + "javaType": "org.openmetadata.schema.security.credentials.BitBucketCredentials", + "definitions": { + "bitbucketType": { + "description": "BitBucket Credentials type", + "type": "string", + "enum": ["BitBucket"], + "default": "BitBucket" + } + }, + "properties": { + "type": { + "title": "Credentials Type", + "description": "Credentials Type", + "$ref": "#/definitions/bitbucketType", + "default": "BitBucket" + }, + "repositoryOwner": { + "$ref": "gitCredentials.json#/definitions/repositoryOwner" + }, + "repositoryName": { + "$ref": "gitCredentials.json#/definitions/repositoryName" + }, + "token": { + "$ref": "gitCredentials.json#/definitions/token" + }, + "branch": { + "title": "Main Branch", + "description": "Main production branch of the repository. 
E.g., `main`", + "type": "string" + } + }, + "additionalProperties": false, + "required": ["repositoryOwner", "repositoryName", "branch"] +} diff --git a/openmetadata-spec/src/main/resources/json/schema/security/credentials/gitCredentials.json b/openmetadata-spec/src/main/resources/json/schema/security/credentials/gitCredentials.json new file mode 100644 index 00000000000..6ba362882c5 --- /dev/null +++ b/openmetadata-spec/src/main/resources/json/schema/security/credentials/gitCredentials.json @@ -0,0 +1,26 @@ +{ + "$id": "https://open-metadata.org/security/credentials/gitCredentials.json", + "$schema": "http://json-schema.org/draft-07/schema#", + "title": "GitCredentials", + "description": "Credentials for a generic Git repository", + "type": "object", + "javaType": "org.openmetadata.schema.security.credentials.GitCredentials", + "definitions": { + "repositoryOwner": { + "title": "Repository Owner", + "description": "The owner (user or organization) of a Git repository. For example, in https://github.com/open-metadata/OpenMetadata, the owner is `open-metadata`.", + "type": "string" + }, + "repositoryName": { + "title": "Repository Name", + "description": "The name of a Git repository. For example, in https://github.com/open-metadata/OpenMetadata, the name is `OpenMetadata`.", + "type": "string" + }, + "token": { + "title": "API Token", + "description": "Token to use the API. 
This is required for private repositories and to ensure we don't hit API limits.", + "type": "string", + "format": "password" + } + } +} diff --git a/openmetadata-spec/src/main/resources/json/schema/security/credentials/githubCredentials.json b/openmetadata-spec/src/main/resources/json/schema/security/credentials/githubCredentials.json index 7c08cf1cf7b..17c650a08be 100644 --- a/openmetadata-spec/src/main/resources/json/schema/security/credentials/githubCredentials.json +++ b/openmetadata-spec/src/main/resources/json/schema/security/credentials/githubCredentials.json @@ -5,22 +5,29 @@ "description": "Credentials for a GitHub repository", "type": "object", "javaType": "org.openmetadata.schema.security.credentials.GitHubCredentials", + "definitions": { + "githubType": { + "description": "GitHub Credentials type", + "type": "string", + "enum": ["GitHub"], + "default": "GitHub" + } + }, "properties": { + "type": { + "title": "Credentials Type", + "description": "Credentials Type", + "$ref": "#/definitions/githubType", + "default": "GitHub" + }, "repositoryOwner": { - "title": "Repository Owner", - "description": "The owner (user or organization) of a GitHub repository. For example, in https://github.com/open-metadata/OpenMetadata, the owner is `open-metadata`.", - "type": "string" + "$ref": "gitCredentials.json#/definitions/repositoryOwner" }, "repositoryName": { - "title": "Repository Name", - "description": "The name of a GitHub repository. For example, in https://github.com/open-metadata/OpenMetadata, the name is `OpenMetadata`.", - "type": "string" + "$ref": "gitCredentials.json#/definitions/repositoryName" }, "token": { - "title": "API Token", - "description": "Token to use the API. This is required for private repositories and to ensure we don't hit API limits.", - "type": "string", - "format": "password" + "$ref": "gitCredentials.json#/definitions/token" } }, "additionalProperties": false,