feat(ingest/lookml): support complex lookml manifests (#9688)

This commit is contained in:
Harshal Sheth 2024-01-22 16:23:56 -08:00 committed by GitHub
parent 943bb57cbc
commit b94d463fe0
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 109 additions and 57 deletions

View File

@ -149,7 +149,7 @@ looker_common = {
# This version of lkml contains a fix for parsing lists in
# LookML files with spaces between an item and the following comma.
# See https://github.com/joshtemple/lkml/issues/73.
"lkml>=1.3.0b5",
"lkml>=1.3.4",
"sql-metadata==2.2.2",
*sqllineage_lib,
"GitPython>2",

View File

@ -0,0 +1,28 @@
import pathlib
from typing import Union
import lkml
import lkml.simple
import lkml.tree
# Patch lkml so that manifest.lkml files can be parsed: the keywords listed
# below are appended to lkml's plural keys.
# PLURAL_KEYS is an immutable tuple rather than a list, so it cannot be
# extended in place. Instead, the name is rebound in both lkml.simple and
# lkml.tree so that the two modules end up with the same extended tuple.
lkml.simple.PLURAL_KEYS = lkml.tree.PLURAL_KEYS = (
    *lkml.simple.PLURAL_KEYS,
    "local_dependency",
    "remote_dependency",
    "constant",
    "override_constant",
)
def load_lkml(path: Union[str, pathlib.Path]) -> dict:
    """Load a LookML file from disk and return it as a dictionary.

    Use this helper instead of calling ``lkml.load`` directly: going through
    this module ensures that our patches to lkml are applied.

    Args:
        path: Location of the LookML file to read.

    Returns:
        The dictionary produced by ``lkml.load`` for the file's contents.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file's contents are not valid UTF-8.
    """
    # Decode explicitly as UTF-8 rather than relying on the platform's locale
    # encoding, which varies between machines and would make the same file
    # parse differently (or fail to decode) depending on where ingestion runs.
    with open(path, "r", encoding="utf-8") as file:
        return lkml.load(file)

View File

@ -49,6 +49,7 @@ from datahub.ingestion.api.source import MetadataWorkUnitProcessor, SourceCapabi
from datahub.ingestion.api.workunit import MetadataWorkUnit
from datahub.ingestion.source.common.subtypes import DatasetSubTypes
from datahub.ingestion.source.git.git_import import GitClone
from datahub.ingestion.source.looker.lkml_patched import load_lkml
from datahub.ingestion.source.looker.looker_common import (
CORPUSER_DATAHUB,
LookerCommonConfig,
@ -98,13 +99,6 @@ logger = logging.getLogger(__name__)
_BASE_PROJECT_NAME = "__BASE"
# Patch lkml to support the local_dependency and remote_dependency keywords.
lkml.simple.PLURAL_KEYS = (
*lkml.simple.PLURAL_KEYS,
"local_dependency",
"remote_dependency",
)
_EXPLORE_FILE_EXTENSION = ".explore.lkml"
_VIEW_FILE_EXTENSION = ".view.lkml"
_MODEL_FILE_EXTENSION = ".model.lkml"
@ -384,10 +378,9 @@ class LookerModel:
]
for included_file in explore_files:
try:
with open(included_file, "r") as file:
parsed = lkml.load(file)
included_explores = parsed.get("explores", [])
explores.extend(included_explores)
parsed = load_lkml(included_file)
included_explores = parsed.get("explores", [])
explores.extend(included_explores)
except Exception as e:
reporter.report_warning(
path, f"Failed to load {included_file} due to {e}"
@ -514,24 +507,23 @@ class LookerModel:
f"Will be loading {included_file}, traversed here via {traversal_path}"
)
try:
with open(included_file, "r") as file:
parsed = lkml.load(file)
seen_so_far.add(included_file)
if "includes" in parsed: # we have more includes to resolve!
resolved.extend(
LookerModel.resolve_includes(
parsed["includes"],
resolved_project_name,
root_project_name,
base_projects_folder,
included_file,
reporter,
seen_so_far,
traversal_path=traversal_path
+ "."
+ pathlib.Path(included_file).stem,
)
parsed = load_lkml(included_file)
seen_so_far.add(included_file)
if "includes" in parsed: # we have more includes to resolve!
resolved.extend(
LookerModel.resolve_includes(
parsed["includes"],
resolved_project_name,
root_project_name,
base_projects_folder,
included_file,
reporter,
seen_so_far,
traversal_path=traversal_path
+ "."
+ pathlib.Path(included_file).stem,
)
)
except Exception as e:
reporter.report_warning(
path, f"Failed to load {included_file} due to {e}"
@ -648,21 +640,20 @@ class LookerViewFileLoader:
self.reporter.report_failure(path, f"failed to load view file: {e}")
return None
try:
with open(path, "r") as file:
logger.debug(f"Loading viewfile {path}")
parsed = lkml.load(file)
looker_viewfile = LookerViewFile.from_looker_dict(
absolute_file_path=path,
looker_view_file_dict=parsed,
project_name=project_name,
root_project_name=self._root_project_name,
base_projects_folder=self._base_projects_folder,
raw_file_content=raw_file_content,
reporter=reporter,
)
logger.debug(f"adding viewfile for path {path} to the cache")
self.viewfile_cache[path] = looker_viewfile
return looker_viewfile
logger.debug(f"Loading viewfile {path}")
parsed = load_lkml(path)
looker_viewfile = LookerViewFile.from_looker_dict(
absolute_file_path=path,
looker_view_file_dict=parsed,
project_name=project_name,
root_project_name=self._root_project_name,
base_projects_folder=self._base_projects_folder,
raw_file_content=raw_file_content,
reporter=reporter,
)
logger.debug(f"adding viewfile for path {path} to the cache")
self.viewfile_cache[path] = looker_viewfile
return looker_viewfile
except Exception as e:
self.reporter.report_failure(path, f"failed to load view file: {e}")
return None
@ -1498,17 +1489,16 @@ class LookMLSource(StatefulIngestionSourceBase):
)
def _load_model(self, path: str) -> LookerModel:
with open(path, "r") as file:
logger.debug(f"Loading model from file {path}")
parsed = lkml.load(file)
looker_model = LookerModel.from_looker_dict(
parsed,
_BASE_PROJECT_NAME,
self.source_config.project_name,
self.base_projects_folder,
path,
self.reporter,
)
logger.debug(f"Loading model from file {path}")
parsed = load_lkml(path)
looker_model = LookerModel.from_looker_dict(
parsed,
_BASE_PROJECT_NAME,
self.source_config.project_name,
self.base_projects_folder,
path,
self.reporter,
)
return looker_model
def _platform_names_have_2_parts(self, platform: str) -> bool:
@ -1797,8 +1787,7 @@ class LookMLSource(StatefulIngestionSourceBase):
def get_manifest_if_present(self, folder: pathlib.Path) -> Optional[LookerManifest]:
manifest_file = folder / "manifest.lkml"
if manifest_file.exists():
with manifest_file.open() as fp:
manifest_dict = lkml.load(fp)
manifest_dict = load_lkml(manifest_file)
manifest = LookerManifest(
project_name=manifest_dict.get("project_name"),

View File

@ -0,0 +1,23 @@
project_name: "complex-manifest-project"
constant: CONNECTION_NAME {
value: "choose-connection"
export: override_required
}
constant: other_variable {
value: "other-variable"
export: override_required
}
local_dependency: {
project: "looker-hub"
}
remote_dependency: remote-proj-1 {
override_constant: schema_name {value: "mycorp_prod" }
override_constant: choose-connection {value: "snowflake-conn-main"}
}
remote_dependency: remote-proj-2 {
}

View File

@ -16,6 +16,7 @@ from datahub.ingestion.source.looker.lookml_source import (
LookerModel,
LookerRefinementResolver,
LookMLSourceConfig,
load_lkml,
)
from datahub.metadata.schema_classes import (
DatasetSnapshotClass,
@ -852,3 +853,14 @@ def test_same_name_views_different_file_path(pytestconfig, tmp_path, mock_time):
output_path=tmp_path / mce_out,
golden_path=test_resources_dir / mce_out,
)
def test_manifest_parser(pytestconfig: pytest.Config) -> None:
    """Smoke-test that a complex manifest file parses to a non-empty result."""
    # We monkeypatch the lkml library, so this test exists mainly to confirm
    # that the patched parser is permissive enough not to crash on the
    # manifest sample.
    manifest_path = (
        pytestconfig.rootpath
        / "tests/integration/lookml"
        / "lkml_manifest_samples/complex-manifest.lkml"
    )

    parsed_manifest = load_lkml(manifest_path)
    assert parsed_manifest