Looker - Fix file extension and blob import (#12232)

* Fix file extension and blob import

* Fix file extension and blob import
This commit is contained in:
Pere Miquel Brull 2023-06-29 16:14:17 +02:00 committed by GitHub
parent 4aab0fd6fb
commit 1ecf5607c7
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
10 changed files with 350 additions and 13 deletions

View File

@ -11,7 +11,9 @@
"""
.lkml files parser
"""
import fnmatch
import traceback
from pathlib import Path
from typing import Dict, List, Optional
import lkml
@ -28,6 +30,8 @@ from metadata.utils.logger import ingestion_logger
logger = ingestion_logger()
EXTENSIONS = (".lkml", ".lookml")
class LkmlParser:
"""
@ -61,6 +65,18 @@ class LkmlParser:
self.reader = reader
self._file_tree: Optional[List[Includes]] = None
@property
def file_tree(self) -> List[Includes]:
"""
Parse the file tree of the repo
"""
if not self._file_tree:
self._file_tree = self.reader.get_tree()
return self._file_tree or []
def parse_file(self, path: Includes) -> Optional[List[Includes]]:
"""
Internal parser. Parse the file and cache the views
@ -79,17 +95,7 @@ class LkmlParser:
return []
try:
file = self.reader.read(path)
lkml_file = LkmlFile.parse_obj(lkml.load(file))
self.parsed_files[path] = file
# Cache everything
self._visited_files[path] = lkml_file.includes
for view in lkml_file.views:
view.source_file = path
self._views_cache[view.name] = view
return lkml_file.includes
return self._process_file(path)
except ReadException as err:
logger.debug(traceback.format_exc())
@ -104,6 +110,71 @@ class LkmlParser:
return None
def _process_file(self, path: Includes) -> Optional[List[Includes]]:
"""
Processing of a single path
"""
file = self._read_file(path)
lkml_file = LkmlFile.parse_obj(lkml.load(file))
self.parsed_files[path] = file
# Cache everything
expanded_includes = self._expand_includes(lkml_file.includes)
self._visited_files[path] = expanded_includes
for view in lkml_file.views:
view.source_file = path
self._views_cache[view.name] = view
return expanded_includes
def _expand_includes(
self, includes: Optional[List[Includes]]
) -> Optional[List[Includes]]:
"""
If we have * in includes, expand them based on the file tree
"""
if not includes:
return includes
return [expanded for path in includes for expanded in self._expand(path)]
def _expand(self, path: Includes) -> List[Includes]:
"""
Match files in tree if there's any * in the include
"""
suffixes = Path(path).suffixes
if "*" in path:
if set(suffixes).intersection(set(EXTENSIONS)):
return fnmatch.filter(self.file_tree, path)
for suffix in EXTENSIONS:
res = fnmatch.filter(self.file_tree, Includes(str(path) + suffix))
if res:
return res
# Nothing matched, we cannot find the file
logger.warning(f"We could not match any file from the include {path}")
return []
return [path]
def _read_file(self, path: Includes) -> str:
"""
Read the LookML file
"""
suffixes = Path(path).suffixes
# Check if any suffix is in our extension list
if not set(suffixes).intersection(set(EXTENSIONS)):
for suffix in EXTENSIONS:
try:
return self.reader.read(path + suffix)
except ReadException as err:
logger.debug(f"Error trying to read the file [{path}]: {err}")
else:
return self.reader.read(path)
raise ReadException(f"Error trying to read the file [{path}]")
def get_view_from_cache(self, view_name: ViewName) -> Optional[LookMlView]:
"""
Check if view is cached, and return it.

View File

@ -11,7 +11,13 @@
"""
Base local reader
"""
import traceback
from abc import ABC, abstractmethod
from typing import List, Optional
from metadata.utils.logger import ingestion_logger
logger = ingestion_logger()
class ReadException(Exception):
@ -21,9 +27,31 @@ class ReadException(Exception):
class Reader(ABC):
"""
Abstract class for all readers
"""
@abstractmethod
def read(self, path: str) -> str:
"""
Given a string, return a string
"""
raise NotImplementedError("Missing read implementation")
@abstractmethod
def _get_tree(self) -> List[str]:
"""
Return the filenames of the root
"""
raise NotImplementedError("Missing get_tree implementation")
def get_tree(self) -> Optional[List[str]]:
"""
If something happens, return None
"""
try:
return self._get_tree()
except Exception as err:
logger.debug(traceback.format_exc())
logger.error(f"Error getting file tree [{err}]")
return None

View File

@ -13,6 +13,7 @@ GitHub client to read files with token auth
"""
import traceback
from enum import Enum
from typing import List
import requests
@ -27,6 +28,7 @@ logger = ingestion_logger()
HOST = "https://api.bitbucket.org/2.0"
PAGE_LENGTH = 100
class UrlParts(Enum):
@ -74,3 +76,48 @@ class BitBucketReader(ApiReader):
raise ReadException(f"Error fetching file [{path}] from repo: {err}")
raise ReadException(f"Could not fetch file [{path}] from repo")
def _get_files_from_dir(self, url: str) -> List[str]:
"""
Run the request and return the page results
"""
res = requests.get(
url=url + "/?fields=values.path",
headers=self.auth_headers,
timeout=30,
)
if res.status_code == 200:
files = []
json_res = res.json()
for file in json_res.get("values") or []:
path = file.get("path")
new_url = url + "/" + path.split("/")[-1]
# If we have a file, append. Otherwise, call again
if "." in path:
files.append(path)
else:
files.extend(self._get_files_from_dir(new_url))
return files
# If we don't get a 200, raise
res.raise_for_status()
raise RuntimeError("Could not fetch the tree")
def _get_tree(self) -> List[str]:
"""
Paginate over the results
"""
url = self._build_url(
HOST,
UrlParts.REPOS.value,
self.credentials.repositoryOwner.__root__,
self.credentials.repositoryName.__root__,
UrlParts.SRC.value,
self.credentials.branch,
)
return self._get_files_from_dir(url)

View File

@ -14,7 +14,7 @@ GitHub client to read files with token auth
import base64
import traceback
from enum import Enum
from typing import Any, Dict
from typing import Any, Dict, List, Optional
import requests
@ -85,3 +85,53 @@ class GitHubReader(ApiReader):
raise ReadException(f"Error fetching file [{path}] from repo: {err}")
raise ReadException(f"Could not fetch file [{path}] from repo")
def _get_default_branch(self) -> str:
"""
Get repo default branch
"""
res = requests.get(
self._build_url(
HOST,
UrlParts.REPOS.value,
self.credentials.repositoryOwner.__root__,
self.credentials.repositoryName.__root__,
),
headers=self.auth_headers,
timeout=30,
)
if res.status_code == 200:
return res.json().get("default_branch")
# If we don't get a 200, raise
res.raise_for_status()
raise RuntimeError("Could not fetch the default branch")
def _get_tree(self) -> Optional[List[str]]:
"""
Use the GitHub Tree API
"""
# First, get the default branch
branch = self._get_default_branch()
if branch:
res = requests.get(
self._build_url(
HOST,
UrlParts.REPOS.value,
self.credentials.repositoryOwner.__root__,
self.credentials.repositoryName.__root__,
"git",
"trees",
f"{branch}?recursive=1",
),
headers=self.auth_headers,
timeout=30,
)
if res.status_code == 200:
return [elem.get("path") for elem in res.json().get("tree")]
# If we don't get a 200, raise
res.raise_for_status()
# If we don't find a branch, return None
return None

View File

@ -13,6 +13,7 @@ Local Reader
"""
import traceback
from pathlib import Path
from typing import List, Optional
from metadata.readers.base import Reader, ReadException
from metadata.utils.constants import UTF_8
@ -40,3 +41,12 @@ class LocalReader(Reader):
except Exception as err:
logger.debug(traceback.format_exc())
raise ReadException(f"Error reading file [{path}] locally: {err}")
def _get_tree(self) -> Optional[List[str]]:
"""
Return the tree with the files relative to the base path
"""
return [
str(path).replace(str(self.base_path) + "/", "")
for path in Path(self.base_path).rglob("*")
]

View File

@ -1,4 +1,4 @@
include: "views/cats.view.lkml"
include: "*/cats.view"
include: "views/dogs.view.lkml"

View File

@ -154,3 +154,15 @@ class TestLkmlParser(TestCase):
self.assertEqual(
get_path_from_link(link_no_files), "hello/explores/my_explore.explore.lkml"
)
def test_expand(self):
"""
We can expand a single Path. We are looking for "*/cats.view", which will
match a file in the resources directory "cats.view.lkml"
"""
path = Includes("*/cats.view")
reader = LocalReader(BASE_PATH)
parser = LkmlParser(reader)
self.assertIn("cats.view.lkml", parser._expand(path)[0])

View File

@ -0,0 +1,60 @@
# Copyright 2021 Collate
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Test GitHub Reader
"""
from unittest import TestCase
from metadata.generated.schema.security.credentials.bitbucketCredentials import (
BitBucketCredentials,
)
from metadata.ingestion.source.dashboard.looker.models import Includes, ViewName
from metadata.ingestion.source.dashboard.looker.parser import LkmlParser
from metadata.readers.bitbucket import BitBucketReader
class TestLookMLBitBucketReader(TestCase):
"""
Validate the github reader against the OM repo
"""
creds = BitBucketCredentials(
repositoryName="api",
repositoryOwner="pmbrull-trial-api",
branch="main",
)
reader = BitBucketReader(creds)
parser = LkmlParser(reader)
def test_lookml_read_and_parse(self):
"""
We can parse the explore file.
We'll expand and find views from https://bitbucket.org/pmbrull-trial-api/api/src/main
"""
explore_file = "cats.explore.lkml"
self.parser.parse_file(Includes(explore_file))
contents = self.parser.parsed_files.get(Includes(explore_file))
# Check file contents
self.assertIn("explore: cats", contents)
view = self.parser.find_view(
view_name=ViewName("cats"), path=Includes(explore_file)
)
# We can get views that are resolved even if the include does not contain `.lkml`
self.assertIsNotNone(view)
self.assertEqual(view.name, "cats")

View File

@ -0,0 +1,59 @@
# Copyright 2021 Collate
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Test GitHub Reader
"""
from unittest import TestCase
from metadata.generated.schema.security.credentials.githubCredentials import (
GitHubCredentials,
)
from metadata.ingestion.source.dashboard.looker.models import Includes, ViewName
from metadata.ingestion.source.dashboard.looker.parser import LkmlParser
from metadata.readers.github import GitHubReader
class TestLookMLGitHubReader(TestCase):
"""
Validate the github reader against the OM repo
"""
creds = GitHubCredentials(
repositoryName="lookml-sample",
repositoryOwner="open-metadata",
)
reader = GitHubReader(creds)
parser = LkmlParser(reader)
def test_lookml_read_and_parse(self):
"""
We can parse the explore file.
We'll expand and find views from https://github.com/open-metadata/lookml-sample/blob/main/cats.explore.lkml
"""
explore_file = "cats.explore.lkml"
self.parser.parse_file(Includes(explore_file))
contents = self.parser.parsed_files.get(Includes(explore_file))
# Check file contents
self.assertIn("explore: cats", contents)
view = self.parser.find_view(
view_name=ViewName("cats"), path=Includes(explore_file)
)
# We can get views that are resolved even if the include does not contain `.lkml`
self.assertIsNotNone(view)
self.assertEqual(view.name, "cats")