2021-02-11 12:53:44 -08:00
|
|
|
import json
|
2021-05-17 11:42:12 -07:00
|
|
|
import os
|
|
|
|
from typing import Union
|
2021-02-11 23:14:20 -08:00
|
|
|
|
2021-02-11 12:53:44 -08:00
|
|
|
import deepdiff
|
|
|
|
|
|
|
|
|
2021-05-17 11:42:12 -07:00
|
|
|
def load_json_file(filename: Union[str, os.PathLike]) -> object:
    """Load and return the parsed contents of a JSON file.

    Args:
        filename: Path to the JSON file (string or path-like object).

    Returns:
        The deserialized JSON value (typically a dict or list).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file contents are not valid JSON.
    """
    # open() accepts os.PathLike directly, so no str() conversion is needed.
    # Pin the encoding: JSON is UTF-8 by spec, and relying on the platform
    # default encoding would make parsing behavior machine-dependent.
    with open(filename, encoding="utf-8") as f:
        return json.load(f)
|
|
|
|
|
|
|
|
|
2021-05-17 11:42:12 -07:00
|
|
|
def assert_mces_equal(output: object, golden: object) -> None:
    """Assert deep equality of two lists of MCE JSON objects.

    Run-specific timestamps written by the ETL pipeline are excluded from
    the comparison, so two otherwise-identical runs compare equal.

    Raises:
        AssertionError: If the objects differ anywhere outside the ignored
            timestamp paths; the message is the DeepDiff report.
    """
    # This method assumes we're given a list of MCE json objects.
    #
    # Each ignored path is a deepdiff-rendered location of a timestamp
    # field. A couple of concrete examples of what gets matched:
    #   root[0]['proposedSnapshot']['com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot']['aspects'][0]['com.linkedin.pegasus2avro.common.Ownership']['lastModified']['time']
    #   root[0]['proposedSnapshot']['com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot']['aspects'][1]['com.linkedin.pegasus2avro.dataset.UpstreamLineage']['upstreams'][0]['auditStamp']['time']
    #
    # The four stamp fields share one path shape, so build the regexes
    # from a single template rather than spelling each one out.
    timestamp_fields = ("created", "lastModified", "createStamp", "auditStamp")
    path_template = (
        r"root\[\d+\]\['proposedSnapshot'\].+\['aspects'\].+"
        r"\['%s'\]\['time'\]"
    )
    ignore_paths = {path_template % field for field in timestamp_fields}

    difference = deepdiff.DeepDiff(golden, output, exclude_regex_paths=ignore_paths)
    assert not difference, str(difference)
|