# datahub/metadata-ingestion/tests/unit/serde/test_serde.py

import io
import json
import pathlib

import fastavro
import pytest
from _pytest.config import Config as PytestConfig
from click.testing import CliRunner

from datahub.entrypoints import datahub
from datahub.ingestion.run.pipeline import Pipeline
from datahub.ingestion.source.mce_file import iterate_mce_file
from datahub.metadata.schema_classes import SCHEMA_JSON_STR, MetadataChangeEventClass
from tests.test_helpers import mce_helpers

# The current PytestConfig solution is somewhat ugly and not ideal.
# However, it is currently the best solution available, as the type itself is not
# exported: https://docs.pytest.org/en/stable/reference.html#config.
# As pytest's type support improves, this will likely change.
# TODO: revisit pytestconfig as https://github.com/pytest-dev/pytest/issues/7469 progresses.


@pytest.mark.parametrize(
    "json_filename",
    [
        # General-purpose case.
        "tests/unit/serde/test_serde_large.json",
        # Chart info's input list must be represented correctly.
        "tests/unit/serde/test_serde_chart_snapshot.json",
    ],
)
def test_serde_to_json(
    pytestconfig: PytestConfig, tmp_path: pathlib.Path, json_filename: str
) -> None:
    """Round-trip a golden JSON file through a file-to-file pipeline.

    The golden file (resolved against the pytest root path) is used as the
    ``file`` source, and a fresh ``output.json`` under ``tmp_path`` as the
    ``file`` sink. After the run, both files are loaded and must compare equal.
    """
    source_path = pytestconfig.rootpath / json_filename
    sink_path = tmp_path / "output.json"

    recipe = {
        "source": {"type": "file", "config": {"filename": str(source_path)}},
        "sink": {"type": "file", "config": {"filename": str(sink_path)}},
    }
    ingestion = Pipeline.create(recipe)
    ingestion.run()
    ingestion.raise_from_status()

    written = mce_helpers.load_json_file(sink_path)
    expected = mce_helpers.load_json_file(source_path)
    assert expected == written


@pytest.mark.parametrize(
    "json_filename",
    [
        "tests/unit/serde/test_serde_large.json",
        "tests/unit/serde/test_serde_chart_snapshot.json",
    ],
)
def test_serde_to_avro(pytestconfig: PytestConfig, json_filename: str) -> None:
    """Check that MCEs survive a JSON -> MCE -> Avro -> MCE round trip.

    The MCE objects read from the JSON file are written to an in-memory Avro
    container using the schema in ``SCHEMA_JSON_STR``, read back, and rebuilt
    into ``MetadataChangeEventClass`` instances. The original and rebuilt
    objects are compared through their string representations.
    """
    json_path = pytestconfig.rootpath / json_filename
    mces = list(iterate_mce_file(str(json_path)))

    # Serialize to Avro.
    parsed_schema = fastavro.parse_schema(json.loads(SCHEMA_JSON_STR))
    fo = io.BytesIO()
    out_records = [mce.to_obj(tuples=True) for mce in mces]
    fastavro.writer(fo, parsed_schema, out_records)

    # Deserialize from Avro; rewind first so the reader starts at the header.
    fo.seek(0)
    in_records = list(fastavro.reader(fo))
    in_mces = [
        MetadataChangeEventClass.from_obj(record, tuples=True) for record in in_records
    ]

    # Check diff. The length check comes first because zip() would otherwise
    # silently stop at the shorter sequence.
    assert len(mces) == len(
        in_mces
    ), f"expected {len(mces)} MCEs after the Avro round trip, got {len(in_mces)}"
    for index, (original, restored) in enumerate(zip(mces, in_mces)):
        assert str(original) == str(
            restored
        ), f"MCE at index {index} changed after the Avro round trip"


@pytest.mark.parametrize(
    "json_filename",
    [
        # General-purpose case.
        "tests/unit/serde/test_serde_large.json",
        # Backwards compatibility when all union types are specified.
        "tests/unit/serde/test_serde_backwards_compat.json",
        # The sample MCE files must be valid.
        "examples/mce_files/single_mce.json",
        "examples/mce_files/mce_list.json",
        "examples/mce_files/bootstrap_mce.json",
    ],
)
def test_check_mce_schema(pytestconfig: PytestConfig, json_filename: str) -> None:
    """Invoke the ``check mce-file`` CLI command on a file and require exit code 0.

    The file path is resolved against the pytest root path and passed to the
    ``datahub`` click entry point through ``CliRunner``.
    """
    target = pytestconfig.rootpath / json_filename
    cli_args = ["check", "mce-file", str(target)]

    outcome = CliRunner().invoke(datahub, cli_args)
    assert outcome.exit_code == 0
fix(ingest): add test for avro serialization and deserialization (#2351) 2021-04-07 21:30:21 -07:00			`import io`
			`import json`
fix(ingest): enable mypy `disallow_incomplete_defs` and `disallow_untyped_decorators` (#2393) 2021-04-14 13:40:24 -07:00			`import pathlib`
fix(ingest): add test for avro serialization and deserialization (#2351) 2021-04-07 21:30:21 -07:00
			`import fastavro`
feat(ingest): various minor fixes (#2246) 2021-03-18 02:05:05 -04:00			`import pytest`
fix(ingest): enable mypy `disallow_incomplete_defs` and `disallow_untyped_decorators` (#2393) 2021-04-14 13:40:24 -07:00			`from _pytest.config import Config as PytestConfig`
fix(ingest): various avro codegen fixes (#2232) 2021-03-15 15:27:30 -07:00			`from click.testing import CliRunner`
Add serde tests 2021-02-11 18:31:15 -08:00
fix(ingest): various avro codegen fixes (#2232) 2021-03-15 15:27:30 -07:00			`from datahub.entrypoints import datahub`
gometa -> datahub 2021-02-15 15:04:21 -08:00			`from datahub.ingestion.run.pipeline import Pipeline`
fix(ingest): add test for avro serialization and deserialization (#2351) 2021-04-07 21:30:21 -07:00			`from datahub.ingestion.source.mce_file import iterate_mce_file`
			`from datahub.metadata.schema_classes import SCHEMA_JSON_STR, MetadataChangeEventClass`
refactor(ingest): update test harness to use a compose file per test (#2392) 2021-04-13 17:30:24 -07:00			`from tests.test_helpers import mce_helpers`
Add isort to CI 2021-02-11 23:14:20 -08:00
fix(ingest): enable mypy `disallow_incomplete_defs` and `disallow_untyped_decorators` (#2393) 2021-04-14 13:40:24 -07:00			`# The current PytestConfig solution is somewhat ugly and not ideal.`
			`# However, it is currently the best solution available, as the type itself is not`
			`# exported: https://docs.pytest.org/en/stable/reference.html#config.`
			`# As pytest's type support improves, this will likely change.`
			`# TODO: revisit pytestconfig as https://github.com/pytest-dev/pytest/issues/7469 progresses.`

Add serde tests 2021-02-11 18:31:15 -08:00
feat(ingest): various minor fixes (#2246) 2021-03-18 02:05:05 -04:00			`@pytest.mark.parametrize(`
			`"json_filename",`
			`[`
			`# Normal test.`
			`"tests/unit/serde/test_serde_large.json",`
			`# Ensure correct representation of chart info's input list.`
			`"tests/unit/serde/test_serde_chart_snapshot.json",`
			`],`
			`)`
fix(ingest): enable mypy `disallow_incomplete_defs` and `disallow_untyped_decorators` (#2393) 2021-04-14 13:40:24 -07:00			`def test_serde_to_json(`
			`pytestconfig: PytestConfig, tmp_path: pathlib.Path, json_filename: str`
			`) -> None:`
feat(ingest): various minor fixes (#2246) 2021-03-18 02:05:05 -04:00			`golden_file = pytestconfig.rootpath / json_filename`
Add serde tests 2021-02-11 18:31:15 -08:00
feat(ingest): various minor fixes (#2246) 2021-03-18 02:05:05 -04:00			`output_filename = "output.json"`
Fix serde test 2021-02-11 21:17:59 -08:00			`output_file = tmp_path / output_filename`

Black 2021-02-11 21:34:36 -08:00			`pipeline = Pipeline.create(`
			`{`
Use type + config everywhere 2021-02-12 12:05:41 -08:00			`"source": {"type": "file", "config": {"filename": str(golden_file)}},`
			`"sink": {"type": "file", "config": {"filename": str(output_file)}},`
Fix serde test 2021-02-11 21:17:59 -08:00			`}`
Black 2021-02-11 21:34:36 -08:00			`)`
Fix serde test 2021-02-11 21:17:59 -08:00			`pipeline.run()`
feat(ingest): improve error reporting for pipelines (#2121) 2021-02-18 11:15:13 -08:00			`pipeline.raise_from_status()`
Add serde tests 2021-02-11 18:31:15 -08:00
			`output = mce_helpers.load_json_file(tmp_path / output_filename)`
			`golden = mce_helpers.load_json_file(golden_file)`
feat(ingest): various minor fixes (#2246) 2021-03-18 02:05:05 -04:00			`assert golden == output`


fix(ingest): add test for avro serialization and deserialization (#2351) 2021-04-07 21:30:21 -07:00			`@pytest.mark.parametrize(`
			`"json_filename",`
			`[`
			`"tests/unit/serde/test_serde_large.json",`
			`"tests/unit/serde/test_serde_chart_snapshot.json",`
			`],`
			`)`
fix(ingest): enable mypy `disallow_incomplete_defs` and `disallow_untyped_decorators` (#2393) 2021-04-14 13:40:24 -07:00			`def test_serde_to_avro(pytestconfig: PytestConfig, json_filename: str) -> None:`
fix(ingest): add test for avro serialization and deserialization (#2351) 2021-04-07 21:30:21 -07:00			`# In this test, we want to read in from JSON -> MCE object.`
			`# Next we serialize from MCE to Avro and then deserialize back to MCE.`
			`# Finally, we want to compare the two MCE objects.`

			`json_path = pytestconfig.rootpath / json_filename`
			`mces = list(iterate_mce_file(str(json_path)))`

			`# Serialize to Avro.`
			`parsed_schema = fastavro.parse_schema(json.loads(SCHEMA_JSON_STR))`
			`fo = io.BytesIO()`
			`out_records = [mce.to_obj(tuples=True) for mce in mces]`
			`fastavro.writer(fo, parsed_schema, out_records)`

			`# Deserialized from Avro.`
			`fo.seek(0)`
			`in_records = list(fastavro.reader(fo))`
			`in_mces = [`
			`MetadataChangeEventClass.from_obj(record, tuples=True) for record in in_records`
			`]`

			`# Check diff`
			`assert len(mces) == len(in_mces)`
			`for i in range(len(mces)):`
			`assert str(mces[i]) == str(in_mces[i])`


feat(ingest): various minor fixes (#2246) 2021-03-18 02:05:05 -04:00			`@pytest.mark.parametrize(`
			`"json_filename",`
			`[`
			`# Normal test.`
			`"tests/unit/serde/test_serde_large.json",`
			`# Check for backwards compatability with specifying all union types.`
			`"tests/unit/serde/test_serde_backwards_compat.json",`
			`# Ensure sample MCE files are valid.`
			`"examples/mce_files/single_mce.json",`
			`"examples/mce_files/mce_list.json",`
			`"examples/mce_files/bootstrap_mce.json",`
			`],`
			`)`
fix(ingest): enable mypy `disallow_incomplete_defs` and `disallow_untyped_decorators` (#2393) 2021-04-14 13:40:24 -07:00			`def test_check_mce_schema(pytestconfig: PytestConfig, json_filename: str) -> None:`
feat(ingest): various minor fixes (#2246) 2021-03-18 02:05:05 -04:00			`json_file_path = pytestconfig.rootpath / json_filename`
fix(ingest): various avro codegen fixes (#2232) 2021-03-15 15:27:30 -07:00
			`runner = CliRunner()`
			`result = runner.invoke(datahub, ["check", "mce-file", f"{json_file_path}"])`
			`assert result.exit_code == 0`