Looker - Fix file extension and blob import (#12232)

* Fix file extension and blob import

* Fix file extension and blob import
This commit is contained in:
Pere Miquel Brull 2023-06-29 16:14:17 +02:00 committed by GitHub
parent 4aab0fd6fb
commit 1ecf5607c7
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
10 changed files with 350 additions and 13 deletions

View File

@ -11,7 +11,9 @@
""" """
.lkml files parser .lkml files parser
""" """
import fnmatch
import traceback import traceback
from pathlib import Path
from typing import Dict, List, Optional from typing import Dict, List, Optional
import lkml import lkml
@ -28,6 +30,8 @@ from metadata.utils.logger import ingestion_logger
logger = ingestion_logger() logger = ingestion_logger()
EXTENSIONS = (".lkml", ".lookml")
class LkmlParser: class LkmlParser:
""" """
@ -61,6 +65,18 @@ class LkmlParser:
self.reader = reader self.reader = reader
self._file_tree: Optional[List[Includes]] = None
@property
def file_tree(self) -> List[Includes]:
    """
    File tree of the repo, fetched lazily through the reader.

    The reader is only queried while no tree has been stored yet
    (`self._file_tree is None`). A tree that came back empty is stored as
    well, so an empty listing is not requested again on every access. If
    the reader returns None, nothing usable is stored and the next access
    asks the reader again.

    Returns:
        The stored tree, or an empty list when there is none.
    """
    # `is None` instead of truthiness: an empty list is a valid, final answer
    if self._file_tree is None:
        self._file_tree = self.reader.get_tree()

    return self._file_tree or []
def parse_file(self, path: Includes) -> Optional[List[Includes]]: def parse_file(self, path: Includes) -> Optional[List[Includes]]:
""" """
Internal parser. Parse the file and cache the views Internal parser. Parse the file and cache the views
@ -79,17 +95,7 @@ class LkmlParser:
return [] return []
try: try:
file = self.reader.read(path) return self._process_file(path)
lkml_file = LkmlFile.parse_obj(lkml.load(file))
self.parsed_files[path] = file
# Cache everything
self._visited_files[path] = lkml_file.includes
for view in lkml_file.views:
view.source_file = path
self._views_cache[view.name] = view
return lkml_file.includes
except ReadException as err: except ReadException as err:
logger.debug(traceback.format_exc()) logger.debug(traceback.format_exc())
@ -104,6 +110,71 @@ class LkmlParser:
return None return None
def _process_file(self, path: Includes) -> Optional[List[Includes]]:
    """
    Read, parse and cache one LookML file.

    The raw text is stored in `parsed_files`, the expanded includes in
    `_visited_files`, and every view of the file in `_views_cache` with
    `source_file` pointing back at `path`.

    Returns:
        The includes of the file after wildcard expansion.
    """
    raw_content = self._read_file(path)
    parsed = LkmlFile.parse_obj(lkml.load(raw_content))
    self.parsed_files[path] = raw_content

    # Resolve wildcard includes before recording what this file points to
    resolved_includes = self._expand_includes(parsed.includes)
    self._visited_files[path] = resolved_includes

    # Remember each view together with the file it was declared in
    for lkml_view in parsed.views:
        lkml_view.source_file = path
        self._views_cache[lkml_view.name] = lkml_view

    return resolved_includes
def _expand_includes(
self, includes: Optional[List[Includes]]
) -> Optional[List[Includes]]:
"""
If we have * in includes, expand them based on the file tree
"""
if not includes:
return includes
return [expanded for path in includes for expanded in self._expand(path)]
def _expand(self, path: Includes) -> List[Includes]:
    """
    Match files in the tree if there is any `*` in the include.

    - Without a `*`, the include is returned as-is in a one-element list.
    - With a `*` and one of the known EXTENSIONS among its suffixes, the
      include is used as an fnmatch pattern against the file tree.
    - With a `*` but no known extension, each extension is appended in
      turn and the first one that matches anything wins.

    A warning is logged whenever a wildcard include matches no file.

    Args:
        path: include as written in the LookML file

    Returns:
        The matching paths from the file tree; empty when nothing matches.
    """
    # No wildcard: nothing to resolve against the tree
    if "*" not in path:
        return [path]

    if set(Path(path).suffixes).intersection(EXTENSIONS):
        matches = fnmatch.filter(self.file_tree, path)
    else:
        matches = []
        for suffix in EXTENSIONS:
            matches = fnmatch.filter(self.file_tree, Includes(str(path) + suffix))
            if matches:
                break

    # Nothing matched, we cannot find the file
    if not matches:
        logger.warning(f"We could not match any file from the include {path}")

    return matches
def _read_file(self, path: Includes) -> str:
    """
    Read the LookML file.

    If one of the suffixes of `path` is a known extension, the path is
    read as given. Otherwise each extension in EXTENSIONS is appended in
    turn and the first successful read is returned.

    Args:
        path: include to read, with or without extension

    Returns:
        The content handed back by the reader.

    Raises:
        ReadException: when the path cannot be read with any extension.
            The last reader error is chained as the cause.
    """
    # Check if any suffix is in our extension list
    if set(Path(path).suffixes).intersection(EXTENSIONS):
        return self.reader.read(path)

    last_error = None
    for suffix in EXTENSIONS:
        candidate = path + suffix
        try:
            return self.reader.read(candidate)
        except ReadException as err:
            last_error = err
            # Log the path we actually tried, not the bare include
            logger.debug(f"Error trying to read the file [{candidate}]: {err}")

    raise ReadException(f"Error trying to read the file [{path}]") from last_error
def get_view_from_cache(self, view_name: ViewName) -> Optional[LookMlView]: def get_view_from_cache(self, view_name: ViewName) -> Optional[LookMlView]:
""" """
Check if view is cached, and return it. Check if view is cached, and return it.

View File

@ -11,7 +11,13 @@
""" """
Base local reader Base local reader
""" """
import traceback
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
from typing import List, Optional
from metadata.utils.logger import ingestion_logger
logger = ingestion_logger()
class ReadException(Exception): class ReadException(Exception):
@ -21,9 +27,31 @@ class ReadException(Exception):
class Reader(ABC):
    """
    Base interface for readers: read a single path and list the
    available file names.
    """

    @abstractmethod
    def read(self, path: str) -> str:
        """
        Take a path as a string and hand back a string
        """
        raise NotImplementedError("Missing read implementation")

    @abstractmethod
    def _get_tree(self) -> List[str]:
        """
        Reader-specific listing of the file names under the root
        """
        raise NotImplementedError("Missing get_tree implementation")

    def get_tree(self) -> Optional[List[str]]:
        """
        Safe wrapper around `_get_tree`: any exception is logged and
        turned into a None result.
        """
        try:
            tree = self._get_tree()
        except Exception as exc:
            logger.debug(traceback.format_exc())
            logger.error(f"Error getting file tree [{exc}]")
            return None

        return tree

View File

@ -13,6 +13,7 @@ GitHub client to read files with token auth
""" """
import traceback import traceback
from enum import Enum from enum import Enum
from typing import List
import requests import requests
@ -27,6 +28,7 @@ logger = ingestion_logger()
HOST = "https://api.bitbucket.org/2.0" HOST = "https://api.bitbucket.org/2.0"
PAGE_LENGTH = 100
class UrlParts(Enum): class UrlParts(Enum):
@ -74,3 +76,48 @@ class BitBucketReader(ApiReader):
raise ReadException(f"Error fetching file [{path}] from repo: {err}") raise ReadException(f"Error fetching file [{path}] from repo: {err}")
raise ReadException(f"Could not fetch file [{path}] from repo") raise ReadException(f"Could not fetch file [{path}] from repo")
def _get_files_from_dir(self, url: str) -> List[str]:
    """
    List every file under the directory `url`, recursing into
    sub-directories and following the pages of each listing.

    Each request asks for PAGE_LENGTH entries and for the `path`, `type`
    and `next` fields only. Entries are told apart through their `type`
    (`commit_directory` marks a directory in the Bitbucket listing). If an
    entry carries no `type`, we fall back to treating a path with a `.` in
    it as a file.

    Args:
        url: API url of the directory to list, without trailing slash

    Returns:
        The paths of all the files found.

    Raises:
        requests.HTTPError / RuntimeError: when a page does not answer
            with a 200.
    """
    files = []
    page_url = f"{url}/?fields=values.path,values.type,next&pagelen={PAGE_LENGTH}"

    while page_url:
        res = requests.get(
            url=page_url,
            headers=self.auth_headers,
            timeout=30,
        )
        if res.status_code != 200:
            # If we don't get a 200, raise
            res.raise_for_status()
            raise RuntimeError("Could not fetch the tree")

        json_res = res.json()
        for entry in json_res.get("values") or []:
            path = entry.get("path")
            if not path:
                continue

            entry_type = entry.get("type")
            if entry_type:
                is_directory = entry_type == "commit_directory"
            else:
                # No type in the answer: keep the extension-based guess
                is_directory = "." not in path

            # If we have a file, append. Otherwise, list the directory too
            if is_directory:
                files.extend(
                    self._get_files_from_dir(url + "/" + path.split("/")[-1])
                )
            else:
                files.append(path)

        # Absent on the last page, which ends the loop
        page_url = json_res.get("next")

    return files
def _get_tree(self) -> List[str]:
    """
    Build the src url from the configured owner, repository and branch,
    and collect the files found under it.
    """
    creds = self.credentials
    root_url = self._build_url(
        HOST,
        UrlParts.REPOS.value,
        creds.repositoryOwner.__root__,
        creds.repositoryName.__root__,
        UrlParts.SRC.value,
        creds.branch,
    )

    return self._get_files_from_dir(root_url)

View File

@ -14,7 +14,7 @@ GitHub client to read files with token auth
import base64 import base64
import traceback import traceback
from enum import Enum from enum import Enum
from typing import Any, Dict from typing import Any, Dict, List, Optional
import requests import requests
@ -85,3 +85,53 @@ class GitHubReader(ApiReader):
raise ReadException(f"Error fetching file [{path}] from repo: {err}") raise ReadException(f"Error fetching file [{path}] from repo: {err}")
raise ReadException(f"Could not fetch file [{path}] from repo") raise ReadException(f"Could not fetch file [{path}] from repo")
def _get_default_branch(self) -> str:
    """
    Ask the repo endpoint for the `default_branch` of the configured
    repository. Raises when the request does not answer with a 200.
    """
    repo_url = self._build_url(
        HOST,
        UrlParts.REPOS.value,
        self.credentials.repositoryOwner.__root__,
        self.credentials.repositoryName.__root__,
    )
    response = requests.get(
        repo_url,
        headers=self.auth_headers,
        timeout=30,
    )

    if response.status_code != 200:
        # Let requests raise on HTTP errors, otherwise raise ourselves
        response.raise_for_status()
        raise RuntimeError("Could not fetch the default branch")

    return response.json().get("default_branch")
def _get_tree(self) -> Optional[List[str]]:
    """
    Use the GitHub Tree API to list the files of the default branch.

    Directory entries (`type == "tree"`) are left out so that only file
    paths are returned. A response without a `tree` key yields an empty
    list instead of failing.

    Returns:
        The file paths of the recursive tree, or None when no default
        branch is found or the request does not answer with a 200
        without raising.
    """
    # First, get the default branch
    branch = self._get_default_branch()
    if not branch:
        # If we don't find a branch, return None
        return None

    res = requests.get(
        self._build_url(
            HOST,
            UrlParts.REPOS.value,
            self.credentials.repositoryOwner.__root__,
            self.credentials.repositoryName.__root__,
            "git",
            "trees",
            f"{branch}?recursive=1",
        ),
        headers=self.auth_headers,
        timeout=30,
    )

    if res.status_code != 200:
        # If we don't get a 200, raise
        res.raise_for_status()
        return None

    return [
        elem["path"]
        for elem in res.json().get("tree") or []
        if elem.get("path") and elem.get("type") != "tree"
    ]

View File

@ -13,6 +13,7 @@ Local Reader
""" """
import traceback import traceback
from pathlib import Path from pathlib import Path
from typing import List, Optional
from metadata.readers.base import Reader, ReadException from metadata.readers.base import Reader, ReadException
from metadata.utils.constants import UTF_8 from metadata.utils.constants import UTF_8
@ -40,3 +41,12 @@ class LocalReader(Reader):
except Exception as err: except Exception as err:
logger.debug(traceback.format_exc()) logger.debug(traceback.format_exc())
raise ReadException(f"Error reading file [{path}] locally: {err}") raise ReadException(f"Error reading file [{path}] locally: {err}")
def _get_tree(self) -> Optional[List[str]]:
"""
Return the tree with the files relative to the base path
"""
return [
str(path).replace(str(self.base_path) + "/", "")
for path in Path(self.base_path).rglob("*")
]

View File

@ -1,4 +1,4 @@
include: "views/cats.view.lkml" include: "*/cats.view"
include: "views/dogs.view.lkml" include: "views/dogs.view.lkml"

View File

@ -154,3 +154,15 @@ class TestLkmlParser(TestCase):
self.assertEqual( self.assertEqual(
get_path_from_link(link_no_files), "hello/explores/my_explore.explore.lkml" get_path_from_link(link_no_files), "hello/explores/my_explore.explore.lkml"
) )
def test_expand(self):
    """
    A single include can be expanded: the pattern "*/cats.view" has to
    resolve to the "cats.view.lkml" file of the resources directory.
    """
    lkml_parser = LkmlParser(LocalReader(BASE_PATH))

    expanded = lkml_parser._expand(Includes("*/cats.view"))

    self.assertIn("cats.view.lkml", expanded[0])

View File

@ -0,0 +1,60 @@
# Copyright 2021 Collate
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Test BitBucket Reader
"""
from unittest import TestCase
from metadata.generated.schema.security.credentials.bitbucketCredentials import (
BitBucketCredentials,
)
from metadata.ingestion.source.dashboard.looker.models import Includes, ViewName
from metadata.ingestion.source.dashboard.looker.parser import LkmlParser
from metadata.readers.bitbucket import BitBucketReader
class TestLookMLBitBucketReader(TestCase):
    """
    Validate the BitBucket reader against the sample repo at
    https://bitbucket.org/pmbrull-trial-api/api/src/main
    """

    creds = BitBucketCredentials(
        repositoryName="api",
        repositoryOwner="pmbrull-trial-api",
        branch="main",
    )

    reader = BitBucketReader(creds)
    parser = LkmlParser(reader)

    def test_lookml_read_and_parse(self):
        """
        The explore file is read and parsed, and the views it includes
        are expanded and found in the repo.
        """
        explore_path = Includes("cats.explore.lkml")

        self.parser.parse_file(explore_path)
        raw_explore = self.parser.parsed_files.get(explore_path)

        # The stored text is the content of the explore file
        self.assertIn("explore: cats", raw_explore)

        cats_view = self.parser.find_view(
            view_name=ViewName("cats"), path=explore_path
        )

        # Views are resolved even if the include does not contain `.lkml`
        self.assertIsNotNone(cats_view)
        self.assertEqual(cats_view.name, "cats")

View File

@ -0,0 +1,59 @@
# Copyright 2021 Collate
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Test GitHub Reader
"""
from unittest import TestCase
from metadata.generated.schema.security.credentials.githubCredentials import (
GitHubCredentials,
)
from metadata.ingestion.source.dashboard.looker.models import Includes, ViewName
from metadata.ingestion.source.dashboard.looker.parser import LkmlParser
from metadata.readers.github import GitHubReader
class TestLookMLGitHubReader(TestCase):
    """
    Validate the GitHub reader against the sample repo at
    https://github.com/open-metadata/lookml-sample
    """

    creds = GitHubCredentials(
        repositoryName="lookml-sample",
        repositoryOwner="open-metadata",
    )

    reader = GitHubReader(creds)
    parser = LkmlParser(reader)

    def test_lookml_read_and_parse(self):
        """
        The explore file is read and parsed, and the views it includes
        are expanded and found in the repo.
        """
        explore_path = Includes("cats.explore.lkml")

        self.parser.parse_file(explore_path)
        raw_explore = self.parser.parsed_files.get(explore_path)

        # The stored text is the content of the explore file
        self.assertIn("explore: cats", raw_explore)

        cats_view = self.parser.find_view(
            view_name=ViewName("cats"), path=explore_path
        )

        # Views are resolved even if the include does not contain `.lkml`
        self.assertIsNotNone(cats_view)
        self.assertEqual(cats_view.name, "cats")