From 1ecf5607c7aab339d7ebd33d8bdbc19be3d76b27 Mon Sep 17 00:00:00 2001 From: Pere Miquel Brull Date: Thu, 29 Jun 2023 16:14:17 +0200 Subject: [PATCH] Looker - Fix file extension and blob import (#12232) * Fix file extension and blob import * Fix file extension and blob import --- .../source/dashboard/looker/parser.py | 93 ++++++++++++++++--- ingestion/src/metadata/readers/base.py | 28 ++++++ ingestion/src/metadata/readers/bitbucket.py | 47 ++++++++++ ingestion/src/metadata/readers/github.py | 52 ++++++++++- ingestion/src/metadata/readers/local.py | 10 ++ .../unit/resources/lkml/cats.explore.lkml | 2 +- .../tests/unit/topology/dashboard/__init__.py | 0 .../dashboard/test_looker_lkml_parser.py | 12 +++ .../dashboard/test_lookml_bitbucket_reader.py | 60 ++++++++++++ .../dashboard/test_lookml_github_reader.py | 59 ++++++++++++ 10 files changed, 350 insertions(+), 13 deletions(-) create mode 100644 ingestion/tests/unit/topology/dashboard/__init__.py create mode 100644 ingestion/tests/unit/topology/dashboard/test_lookml_bitbucket_reader.py create mode 100644 ingestion/tests/unit/topology/dashboard/test_lookml_github_reader.py diff --git a/ingestion/src/metadata/ingestion/source/dashboard/looker/parser.py b/ingestion/src/metadata/ingestion/source/dashboard/looker/parser.py index 7087edd3e9a..71b36441ae7 100644 --- a/ingestion/src/metadata/ingestion/source/dashboard/looker/parser.py +++ b/ingestion/src/metadata/ingestion/source/dashboard/looker/parser.py @@ -11,7 +11,9 @@ """ .lkml files parser """ +import fnmatch import traceback +from pathlib import Path from typing import Dict, List, Optional import lkml @@ -28,6 +30,8 @@ from metadata.utils.logger import ingestion_logger logger = ingestion_logger() +EXTENSIONS = (".lkml", ".lookml") + class LkmlParser: """ @@ -61,6 +65,18 @@ class LkmlParser: self.reader = reader + self._file_tree: Optional[List[Includes]] = None + + @property + def file_tree(self) -> List[Includes]: + """ + Parse the file tree of the repo + """ + if not self._file_tree: + self._file_tree = self.reader.get_tree() + + return self._file_tree or [] + def parse_file(self, path: Includes) -> Optional[List[Includes]]: """ Internal parser. Parse the file and cache the views @@ -79,17 +95,7 @@ class LkmlParser: return [] try: - file = self.reader.read(path) - lkml_file = LkmlFile.parse_obj(lkml.load(file)) - self.parsed_files[path] = file - - # Cache everything - self._visited_files[path] = lkml_file.includes - for view in lkml_file.views: - view.source_file = path - self._views_cache[view.name] = view - - return lkml_file.includes + return self._process_file(path) except ReadException as err: logger.debug(traceback.format_exc()) @@ -104,6 +110,71 @@ class LkmlParser: return None + def _process_file(self, path: Includes) -> Optional[List[Includes]]: + """ + Processing of a single path + """ + file = self._read_file(path) + lkml_file = LkmlFile.parse_obj(lkml.load(file)) + self.parsed_files[path] = file + + # Cache everything + expanded_includes = self._expand_includes(lkml_file.includes) + self._visited_files[path] = expanded_includes + for view in lkml_file.views: + view.source_file = path + self._views_cache[view.name] = view + + return expanded_includes + + def _expand_includes( + self, includes: Optional[List[Includes]] + ) -> Optional[List[Includes]]: + """ + If we have * in includes, expand them based on the file tree + """ + if not includes: + return includes + + return [expanded for path in includes for expanded in self._expand(path)] + + def _expand(self, path: Includes) -> List[Includes]: + """ + Match files in tree if there's any * in the include + """ + suffixes = Path(path).suffixes + if "*" in path: + if set(suffixes).intersection(set(EXTENSIONS)): + return fnmatch.filter(self.file_tree, path) + for suffix in EXTENSIONS: + res = fnmatch.filter(self.file_tree, Includes(str(path) + suffix)) + if res: + return res + # Nothing matched, we cannot find the file + logger.warning(f"We could not match any file from the include {path}") + return [] + + return [path] + + def _read_file(self, path: Includes) -> str: + """ + Read the LookML file + """ + suffixes = Path(path).suffixes + + # Check if any suffix is in our extension list + if not set(suffixes).intersection(set(EXTENSIONS)): + for suffix in EXTENSIONS: + try: + return self.reader.read(path + suffix) + except ReadException as err: + logger.debug(f"Error trying to read the file [{path}]: {err}") + + else: + return self.reader.read(path) + + raise ReadException(f"Error trying to read the file [{path}]") + def get_view_from_cache(self, view_name: ViewName) -> Optional[LookMlView]: """ Check if view is cached, and return it. diff --git a/ingestion/src/metadata/readers/base.py b/ingestion/src/metadata/readers/base.py index 70a12e11605..00ecbdebb08 100644 --- a/ingestion/src/metadata/readers/base.py +++ b/ingestion/src/metadata/readers/base.py @@ -11,7 +11,13 @@ """ Base local reader """ +import traceback from abc import ABC, abstractmethod +from typing import List, Optional + +from metadata.utils.logger import ingestion_logger + +logger = ingestion_logger() class ReadException(Exception): @@ -21,9 +27,31 @@ class ReadException(Exception): class Reader(ABC): + """ + Abstract class for all readers + """ + @abstractmethod def read(self, path: str) -> str: """ Given a string, return a string """ raise NotImplementedError("Missing read implementation") + + @abstractmethod + def _get_tree(self) -> List[str]: + """ + Return the filenames of the root + """ + raise NotImplementedError("Missing get_tree implementation") + + def get_tree(self) -> Optional[List[str]]: + """ + If something happens, return None + """ + try: + return self._get_tree() + except Exception as err: + logger.debug(traceback.format_exc()) + logger.error(f"Error getting file tree [{err}]") + return None diff --git a/ingestion/src/metadata/readers/bitbucket.py b/ingestion/src/metadata/readers/bitbucket.py index 9163377875e..3fef32c153a 100644 --- a/ingestion/src/metadata/readers/bitbucket.py +++ b/ingestion/src/metadata/readers/bitbucket.py @@ -13,6 +13,7 @@ GitHub client to read files with token auth """ import traceback from enum import Enum +from typing import List import requests @@ -27,6 +28,7 @@ logger = ingestion_logger() HOST = "https://api.bitbucket.org/2.0" +PAGE_LENGTH = 100 class UrlParts(Enum): @@ -74,3 +76,48 @@ class BitBucketReader(ApiReader): raise ReadException(f"Error fetching file [{path}] from repo: {err}") raise ReadException(f"Could not fetch file [{path}] from repo") + + def _get_files_from_dir(self, url: str) -> List[str]: + """ + Run the request and return the page results + """ + res = requests.get( + url=url + "/?fields=values.path", + headers=self.auth_headers, + timeout=30, + ) + + if res.status_code == 200: + files = [] + json_res = res.json() + for file in json_res.get("values") or []: + path = file.get("path") + new_url = url + "/" + path.split("/")[-1] + + # If we have a file, append. Otherwise, call again + if "." in path: + files.append(path) + else: + files.extend(self._get_files_from_dir(new_url)) + + return files + + # If we don't get a 200, raise + res.raise_for_status() + raise RuntimeError("Could not fetch the tree") + + def _get_tree(self) -> List[str]: + """ + Paginate over the results + """ + + url = self._build_url( + HOST, + UrlParts.REPOS.value, + self.credentials.repositoryOwner.__root__, + self.credentials.repositoryName.__root__, + UrlParts.SRC.value, + self.credentials.branch, + ) + + return self._get_files_from_dir(url) diff --git a/ingestion/src/metadata/readers/github.py b/ingestion/src/metadata/readers/github.py index e3cdc94249d..7d0aa552f10 100644 --- a/ingestion/src/metadata/readers/github.py +++ b/ingestion/src/metadata/readers/github.py @@ -14,7 +14,7 @@ GitHub client to read files with token auth import base64 import traceback from enum import Enum -from typing import Any, Dict +from typing import Any, Dict, List, Optional import requests @@ -85,3 +85,53 @@ class GitHubReader(ApiReader): raise ReadException(f"Error fetching file [{path}] from repo: {err}") raise ReadException(f"Could not fetch file [{path}] from repo") + + def _get_default_branch(self) -> str: + """ + Get repo default branch + """ + res = requests.get( + self._build_url( + HOST, + UrlParts.REPOS.value, + self.credentials.repositoryOwner.__root__, + self.credentials.repositoryName.__root__, + ), + headers=self.auth_headers, + timeout=30, + ) + if res.status_code == 200: + return res.json().get("default_branch") + + # If we don't get a 200, raise + res.raise_for_status() + raise RuntimeError("Could not fetch the default branch") + + def _get_tree(self) -> Optional[List[str]]: + """ + Use the GitHub Tree API + """ + # First, get the default branch + branch = self._get_default_branch() + if branch: + res = requests.get( + self._build_url( + HOST, + UrlParts.REPOS.value, + self.credentials.repositoryOwner.__root__, + self.credentials.repositoryName.__root__, + "git", + "trees", + f"{branch}?recursive=1", + ), + headers=self.auth_headers, + timeout=30, + ) + if res.status_code == 200: + return [elem.get("path") for elem in res.json().get("tree")] + + # If we don't get a 200, raise + res.raise_for_status() + + # If we don't find a branch, return None + return None diff --git a/ingestion/src/metadata/readers/local.py b/ingestion/src/metadata/readers/local.py index 4ffe487606d..6e0370c1c75 100644 --- a/ingestion/src/metadata/readers/local.py +++ b/ingestion/src/metadata/readers/local.py @@ -13,6 +13,7 @@ Local Reader """ import traceback from pathlib import Path +from typing import List, Optional from metadata.readers.base import Reader, ReadException from metadata.utils.constants import UTF_8 @@ -40,3 +41,12 @@ class LocalReader(Reader): except Exception as err: logger.debug(traceback.format_exc()) raise ReadException(f"Error reading file [{path}] locally: {err}") + + def _get_tree(self) -> Optional[List[str]]: + """ + Return the tree with the files relative to the base path + """ + return [ + str(path).replace(str(self.base_path) + "/", "") + for path in Path(self.base_path).rglob("*") + ] diff --git a/ingestion/tests/unit/resources/lkml/cats.explore.lkml b/ingestion/tests/unit/resources/lkml/cats.explore.lkml index 5d22542b57b..43724ffd4fc 100644 --- a/ingestion/tests/unit/resources/lkml/cats.explore.lkml +++ b/ingestion/tests/unit/resources/lkml/cats.explore.lkml @@ -1,4 +1,4 @@ -include: "views/cats.view.lkml" +include: "*/cats.view" include: "views/dogs.view.lkml" diff --git a/ingestion/tests/unit/topology/dashboard/__init__.py b/ingestion/tests/unit/topology/dashboard/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/ingestion/tests/unit/topology/dashboard/test_looker_lkml_parser.py b/ingestion/tests/unit/topology/dashboard/test_looker_lkml_parser.py index a0b91d91141..b0ddb5a7f2e 100644 --- a/ingestion/tests/unit/topology/dashboard/test_looker_lkml_parser.py +++ b/ingestion/tests/unit/topology/dashboard/test_looker_lkml_parser.py @@ -154,3 +154,15 @@ class TestLkmlParser(TestCase): self.assertEqual( get_path_from_link(link_no_files), "hello/explores/my_explore.explore.lkml" ) + + def test_expand(self): + """ + We can expand a single Path. We are looking for "*/cats.view", which will + match a file in the resources directory "cats.view.lkml" + """ + path = Includes("*/cats.view") + + reader = LocalReader(BASE_PATH) + parser = LkmlParser(reader) + + self.assertIn("cats.view.lkml", parser._expand(path)[0]) diff --git a/ingestion/tests/unit/topology/dashboard/test_lookml_bitbucket_reader.py b/ingestion/tests/unit/topology/dashboard/test_lookml_bitbucket_reader.py new file mode 100644 index 00000000000..bae2fad1ced --- /dev/null +++ b/ingestion/tests/unit/topology/dashboard/test_lookml_bitbucket_reader.py @@ -0,0 +1,60 @@ +# Copyright 2021 Collate +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Test GitHub Reader +""" +from unittest import TestCase + +from metadata.generated.schema.security.credentials.bitbucketCredentials import ( + BitBucketCredentials, +) +from metadata.ingestion.source.dashboard.looker.models import Includes, ViewName +from metadata.ingestion.source.dashboard.looker.parser import LkmlParser +from metadata.readers.bitbucket import BitBucketReader + + +class TestLookMLBitBucketReader(TestCase): + """ + Validate the github reader against the OM repo + """ + + creds = BitBucketCredentials( + repositoryName="api", + repositoryOwner="pmbrull-trial-api", + branch="main", + ) + + reader = BitBucketReader(creds) + parser = LkmlParser(reader) + + def test_lookml_read_and_parse(self): + """ + We can parse the explore file. + + We'll expand and find views from https://bitbucket.org/pmbrull-trial-api/api/src/main + """ + + explore_file = "cats.explore.lkml" + self.parser.parse_file(Includes(explore_file)) + + contents = self.parser.parsed_files.get(Includes(explore_file)) + + # Check file contents + self.assertIn("explore: cats", contents) + + view = self.parser.find_view( + view_name=ViewName("cats"), path=Includes(explore_file) + ) + + # We can get views that are resolved even if the include does not contain `.lkml` + self.assertIsNotNone(view) + self.assertEqual(view.name, "cats") diff --git a/ingestion/tests/unit/topology/dashboard/test_lookml_github_reader.py b/ingestion/tests/unit/topology/dashboard/test_lookml_github_reader.py new file mode 100644 index 00000000000..a420c8ab300 --- /dev/null +++ b/ingestion/tests/unit/topology/dashboard/test_lookml_github_reader.py @@ -0,0 +1,59 @@ +# Copyright 2021 Collate +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Test GitHub Reader +""" +from unittest import TestCase + +from metadata.generated.schema.security.credentials.githubCredentials import ( + GitHubCredentials, +) +from metadata.ingestion.source.dashboard.looker.models import Includes, ViewName +from metadata.ingestion.source.dashboard.looker.parser import LkmlParser +from metadata.readers.github import GitHubReader + + +class TestLookMLGitHubReader(TestCase): + """ + Validate the github reader against the OM repo + """ + + creds = GitHubCredentials( + repositoryName="lookml-sample", + repositoryOwner="open-metadata", + ) + + reader = GitHubReader(creds) + parser = LkmlParser(reader) + + def test_lookml_read_and_parse(self): + """ + We can parse the explore file. + + We'll expand and find views from https://github.com/open-metadata/lookml-sample/blob/main/cats.explore.lkml + """ + + explore_file = "cats.explore.lkml" + self.parser.parse_file(Includes(explore_file)) + + contents = self.parser.parsed_files.get(Includes(explore_file)) + + # Check file contents + self.assertIn("explore: cats", contents) + + view = self.parser.find_view( + view_name=ViewName("cats"), path=Includes(explore_file) + ) + + # We can get views that are resolved even if the include does not contain `.lkml` + self.assertIsNotNone(view) + self.assertEqual(view.name, "cats")