datahub/metadata-ingestion/tests/unit/serde/test_serde.py

import io
import json
import pathlib

import fastavro
import pytest
from freezegun import freeze_time

import datahub.metadata.schema_classes as models
from datahub.cli.json_file import check_mce_file
from datahub.ingestion.run.pipeline import Pipeline
from datahub.ingestion.source.file import iterate_mce_file
from datahub.metadata.schema_classes import MetadataChangeEventClass
from datahub.metadata.schemas import getMetadataChangeEventSchema
from tests.test_helpers import mce_helpers
from tests.test_helpers.click_helpers import run_datahub_cmd
from tests.test_helpers.type_helpers import PytestConfig

# Timestamp passed to `freeze_time` by the decorated tests in this module.
FROZEN_TIME = "2021-07-22 18:54:06"


@freeze_time(FROZEN_TIME)
@pytest.mark.parametrize(
    "json_filename",
    [
        # Normal test.
        "tests/unit/serde/test_serde_large.json",
        # Ensure correct representation of chart info's input list.
        "tests/unit/serde/test_serde_chart_snapshot.json",
        # Check usage stats as well.
        "tests/unit/serde/test_serde_usage.json",
        # Profiles with the MetadataChangeProposal format.
        "tests/unit/serde/test_serde_profile.json",
    ],
)
def test_serde_to_json(
    pytestconfig: PytestConfig, tmp_path: pathlib.Path, json_filename: str
) -> None:
    """Run a file-to-file pipeline and compare its output with its input.

    The parametrized JSON file serves both as the pipeline's source and as
    the golden file that the sink's output is checked against.
    """
    source_json = pytestconfig.rootpath / json_filename
    output_filename = "output.json"
    sink_json = tmp_path / output_filename

    # Read from the golden file and write straight back out to a temp file.
    recipe = {
        "source": {"type": "file", "config": {"filename": str(source_json)}},
        "sink": {"type": "file", "config": {"filename": str(sink_json)}},
        "run_id": "serde_test",
    }
    serde_pipeline = Pipeline.create(recipe)
    serde_pipeline.run()
    serde_pipeline.raise_from_status()

    mce_helpers.check_golden_file(
        pytestconfig,
        output_path=f"{tmp_path}/{output_filename}",
        golden_path=source_json,
    )


@pytest.mark.parametrize(
    "json_filename",
    [
        "tests/unit/serde/test_serde_large.json",
        "tests/unit/serde/test_serde_chart_snapshot.json",
    ],
)
@freeze_time(FROZEN_TIME)
def test_serde_to_avro(pytestconfig: PytestConfig, json_filename: str) -> None:
    """Round-trip MCEs through Avro and check that they come back unchanged.

    The MCE objects are read from the parametrized JSON file, written to an
    in-memory Avro container, read back, and rebuilt into MCE objects. The
    rebuilt objects must equal the originals, position by position.
    """
    json_path = pytestconfig.rootpath / json_filename
    mces = list(iterate_mce_file(str(json_path)))

    # Serialize to Avro.
    parsed_schema = fastavro.parse_schema(json.loads(getMetadataChangeEventSchema()))
    fo = io.BytesIO()
    out_records = [mce.to_obj(tuples=True) for mce in mces]
    fastavro.writer(fo, parsed_schema, out_records)

    # Deserialize from Avro; rewind first so the reader starts at the header.
    fo.seek(0)
    in_records = list(fastavro.reader(fo, return_record_name=True))
    in_mces = [
        MetadataChangeEventClass.from_obj(record, tuples=True) for record in in_records
    ]

    # Check diff. The length check comes first because zip() would otherwise
    # silently stop at the shorter of the two lists.
    assert len(mces) == len(in_mces)
    for index, (original, round_tripped) in enumerate(zip(mces, in_mces)):
        assert (
            original == round_tripped
        ), f"MCE at index {index} changed after the Avro round trip"


@pytest.mark.parametrize(
    "json_filename",
    [
        # Normal test.
        "tests/unit/serde/test_serde_large.json",
        # Check for backwards compatibility with specifying all union types.
        "tests/unit/serde/test_serde_backwards_compat.json",
        # Usage stats.
        "tests/unit/serde/test_serde_usage.json",
        # Profiles with the MetadataChangeProposal format.
        "tests/unit/serde/test_serde_profile.json",
        # Ensure sample MCE files are valid.
        "examples/mce_files/single_mce.json",
        "examples/mce_files/mce_list.json",
        "examples/mce_files/bootstrap_mce.json",
    ],
)
@freeze_time(FROZEN_TIME)
def test_check_mce_schema(pytestconfig: PytestConfig, json_filename: str) -> None:
    """Invoke the `check mce-file` command on the parametrized JSON file."""
    mce_path = pytestconfig.rootpath / json_filename
    command = ["check", "mce-file", str(mce_path)]

    run_datahub_cmd(command)


@pytest.mark.parametrize(
    "json_filename",
    [
        # Extra field.
        "tests/unit/serde/test_serde_extra_field.json",
        # Missing fields.
        "tests/unit/serde/test_serde_missing_field.json",
    ],
)
def test_check_mce_schema_failure(
    pytestconfig: PytestConfig, json_filename: str
) -> None:
    """Expect `check_mce_file` to raise on the parametrized malformed file."""
    malformed_path = pytestconfig.rootpath / json_filename
    # Either exception type counts as a rejection of the file.
    rejection_errors = (ValueError, AssertionError)

    with pytest.raises(rejection_errors):
        check_mce_file(str(malformed_path))


def test_field_discriminator() -> None:
    """Check that a cost object built with a fieldDiscriminator validates."""
    # Build the inner cost value first; it selects `costCode` via the discriminator.
    cost_value = models.CostCostClass(
        fieldDiscriminator=models.CostCostDiscriminatorClass.costCode,
        costCode="sampleCostCode",
    )
    cost_object = models.CostClass(
        costType=models.CostTypeClass.ORG_COST_TYPE,
        cost=cost_value,
    )

    assert cost_object.validate()
fix(ingest): add test for avro serialization and deserialization (#2351) 2021-04-07 21:30:21 -07:00			`import io`
			`import json`
fix(ingest): enable mypy `disallow_incomplete_defs` and `disallow_untyped_decorators` (#2393) 2021-04-14 13:40:24 -07:00			`import pathlib`
fix(ingest): add test for avro serialization and deserialization (#2351) 2021-04-07 21:30:21 -07:00
			`import fastavro`
feat(ingest): various minor fixes (#2246) 2021-03-18 02:05:05 -04:00			`import pytest`
feat(deletes): add run commands (list, show, rollback) to datahub ingest (#2960) 2021-07-29 20:04:40 -07:00			`from freezegun import freeze_time`
Add serde tests 2021-02-11 18:31:15 -08:00
fix(ingest): remove datahub.metadata import shortcut (#2449) 2021-04-30 21:10:12 -07:00			`import datahub.metadata.schema_classes as models`
feat(ingest): stricter deserialization for MCE JSONs (#2976) 2021-07-28 14:50:21 -07:00			`from datahub.cli.json_file import check_mce_file`
gometa -> datahub 2021-02-15 15:04:21 -08:00			`from datahub.ingestion.run.pipeline import Pipeline`
feat: usage stats (part 1) (#2750) Co-authored-by: Gabe Lyons <itsgabelyons@gmail.com> 2021-06-24 17:11:00 -07:00			`from datahub.ingestion.source.file import iterate_mce_file`
feat(ingest): expose additional types to Python via codegen (#2712) 2021-06-17 10:04:28 -07:00			`from datahub.metadata.schema_classes import MetadataChangeEventClass`
			`from datahub.metadata.schemas import getMetadataChangeEventSchema`
refactor(ingest): update test harness to use a compose file per test (#2392) 2021-04-13 17:30:24 -07:00			`from tests.test_helpers import mce_helpers`
refactor(test): replace `CliRunner` with `run_datahub_cmd` method (#3746) 2021-12-16 23:07:38 -05:00			`from tests.test_helpers.click_helpers import run_datahub_cmd`
feat(ingest): refactor mce comparison and add pytest update golden files option (#2812) 2021-06-30 16:53:20 -07:00			`from tests.test_helpers.type_helpers import PytestConfig`
fix(ingest): enable mypy `disallow_incomplete_defs` and `disallow_untyped_decorators` (#2393) 2021-04-14 13:40:24 -07:00
feat(deletes): add run commands (list, show, rollback) to datahub ingest (#2960) 2021-07-29 20:04:40 -07:00			`FROZEN_TIME = "2021-07-22 18:54:06"`
Add serde tests 2021-02-11 18:31:15 -08:00
feat(deletes): add run commands (list, show, rollback) to datahub ingest (#2960) 2021-07-29 20:04:40 -07:00
			`@freeze_time(FROZEN_TIME)`
feat(ingest): various minor fixes (#2246) 2021-03-18 02:05:05 -04:00			`@pytest.mark.parametrize(`
			`"json_filename",`
			`[`
			`# Normal test.`
			`"tests/unit/serde/test_serde_large.json",`
			`# Ensure correct representation of chart info's input list.`
			`"tests/unit/serde/test_serde_chart_snapshot.json",`
feat: usage stats (part 1) (#2750) Co-authored-by: Gabe Lyons <itsgabelyons@gmail.com> 2021-06-24 17:11:00 -07:00			`# Check usage stats as well.`
			`"tests/unit/serde/test_serde_usage.json",`
Introducing TimeSeries Aspects + Dataset Profile (Stats) Aspect (#2983) Co-authored-by: Dexter Lee <dexter@acryl.io> Co-authored-by: Harshal Sheth <hsheth2@gmail.com> Co-authored-by: Ravindra Lanka <rlanka@acryl.io> Co-authored-by: Shirshanka Das <shirshanka@apache.org> 2021-07-30 17:41:03 -07:00			`# Profiles with the MetadataChangeProposal format.`
			`"tests/unit/serde/test_serde_profile.json",`
feat(ingest): various minor fixes (#2246) 2021-03-18 02:05:05 -04:00			`],`
			`)`
fix(ingest): enable mypy `disallow_incomplete_defs` and `disallow_untyped_decorators` (#2393) 2021-04-14 13:40:24 -07:00			`def test_serde_to_json(`
			`pytestconfig: PytestConfig, tmp_path: pathlib.Path, json_filename: str`
			`) -> None:`
feat(ingest): various minor fixes (#2246) 2021-03-18 02:05:05 -04:00			`golden_file = pytestconfig.rootpath / json_filename`
Add serde tests 2021-02-11 18:31:15 -08:00
feat(ingest): various minor fixes (#2246) 2021-03-18 02:05:05 -04:00			`output_filename = "output.json"`
Fix serde test 2021-02-11 21:17:59 -08:00			`output_file = tmp_path / output_filename`

Black 2021-02-11 21:34:36 -08:00			`pipeline = Pipeline.create(`
			`{`
Use type + config everywhere 2021-02-12 12:05:41 -08:00			`"source": {"type": "file", "config": {"filename": str(golden_file)}},`
			`"sink": {"type": "file", "config": {"filename": str(output_file)}},`
feat(deletes): add run commands (list, show, rollback) to datahub ingest (#2960) 2021-07-29 20:04:40 -07:00			`"run_id": "serde_test",`
Fix serde test 2021-02-11 21:17:59 -08:00			`}`
Black 2021-02-11 21:34:36 -08:00			`)`
Fix serde test 2021-02-11 21:17:59 -08:00			`pipeline.run()`
feat(ingest): improve error reporting for pipelines (#2121) 2021-02-18 11:15:13 -08:00			`pipeline.raise_from_status()`
Add serde tests 2021-02-11 18:31:15 -08:00
feat: Adding support for nested schemas in ingestion and visualization (#3079) 2021-08-11 15:47:18 -07:00			`mce_helpers.check_golden_file(`
			`pytestconfig,`
			`output_path=f"{tmp_path}/{output_filename}",`
			`golden_path=golden_file,`
			`)`
feat(ingest): various minor fixes (#2246) 2021-03-18 02:05:05 -04:00

fix(ingest): add test for avro serialization and deserialization (#2351) 2021-04-07 21:30:21 -07:00			`@pytest.mark.parametrize(`
			`"json_filename",`
			`[`
			`"tests/unit/serde/test_serde_large.json",`
			`"tests/unit/serde/test_serde_chart_snapshot.json",`
			`],`
			`)`
feat(deletes): add run commands (list, show, rollback) to datahub ingest (#2960) 2021-07-29 20:04:40 -07:00			`@freeze_time(FROZEN_TIME)`
fix(ingest): enable mypy `disallow_incomplete_defs` and `disallow_untyped_decorators` (#2393) 2021-04-14 13:40:24 -07:00			`def test_serde_to_avro(pytestconfig: PytestConfig, json_filename: str) -> None:`
fix(ingest): add test for avro serialization and deserialization (#2351) 2021-04-07 21:30:21 -07:00			`# In this test, we want to read in from JSON -> MCE object.`
			`# Next we serialize from MCE to Avro and then deserialize back to MCE.`
			`# Finally, we want to compare the two MCE objects.`

			`json_path = pytestconfig.rootpath / json_filename`
			`mces = list(iterate_mce_file(str(json_path)))`

			`# Serialize to Avro.`
feat(ingest): expose additional types to Python via codegen (#2712) 2021-06-17 10:04:28 -07:00			`parsed_schema = fastavro.parse_schema(json.loads(getMetadataChangeEventSchema()))`
fix(ingest): add test for avro serialization and deserialization (#2351) 2021-04-07 21:30:21 -07:00			`fo = io.BytesIO()`
			`out_records = [mce.to_obj(tuples=True) for mce in mces]`
			`fastavro.writer(fo, parsed_schema, out_records)`

			`# Deserialized from Avro.`
			`fo.seek(0)`
feat(entities): add markdown description update/viewer feature in dataset, datajob, dataflow, chart and dashboard, update ui/ux (#2707) 2021-06-17 06:48:27 +08:00			`in_records = list(fastavro.reader(fo, return_record_name=True))`
fix(ingest): add test for avro serialization and deserialization (#2351) 2021-04-07 21:30:21 -07:00			`in_mces = [`
			`MetadataChangeEventClass.from_obj(record, tuples=True) for record in in_records`
			`]`

			`# Check diff`
			`assert len(mces) == len(in_mces)`
			`for i in range(len(mces)):`
fix(ingest): streamline codegen init methods (#2400) 2021-04-14 19:25:57 -07:00			`assert mces[i] == in_mces[i]`
fix(ingest): add test for avro serialization and deserialization (#2351) 2021-04-07 21:30:21 -07:00

feat(ingest): various minor fixes (#2246) 2021-03-18 02:05:05 -04:00			`@pytest.mark.parametrize(`
			`"json_filename",`
			`[`
			`# Normal test.`
			`"tests/unit/serde/test_serde_large.json",`
			`# Check for backwards compatibility with specifying all union types.`
			`"tests/unit/serde/test_serde_backwards_compat.json",`
feat: usage stats (part 1) (#2750) Co-authored-by: Gabe Lyons <itsgabelyons@gmail.com> 2021-06-24 17:11:00 -07:00			`# Usage stats.`
			`"tests/unit/serde/test_serde_usage.json",`
Introducing TimeSeries Aspects + Dataset Profile (Stats) Aspect (#2983) Co-authored-by: Dexter Lee <dexter@acryl.io> Co-authored-by: Harshal Sheth <hsheth2@gmail.com> Co-authored-by: Ravindra Lanka <rlanka@acryl.io> Co-authored-by: Shirshanka Das <shirshanka@apache.org> 2021-07-30 17:41:03 -07:00			`# Profiles with the MetadataChangeProposal format.`
			`"tests/unit/serde/test_serde_profile.json",`
feat(ingest): various minor fixes (#2246) 2021-03-18 02:05:05 -04:00			`# Ensure sample MCE files are valid.`
			`"examples/mce_files/single_mce.json",`
			`"examples/mce_files/mce_list.json",`
			`"examples/mce_files/bootstrap_mce.json",`
			`],`
			`)`
feat(deletes): add run commands (list, show, rollback) to datahub ingest (#2960) 2021-07-29 20:04:40 -07:00			`@freeze_time(FROZEN_TIME)`
fix(ingest): enable mypy `disallow_incomplete_defs` and `disallow_untyped_decorators` (#2393) 2021-04-14 13:40:24 -07:00			`def test_check_mce_schema(pytestconfig: PytestConfig, json_filename: str) -> None:`
feat(ingest): various minor fixes (#2246) 2021-03-18 02:05:05 -04:00			`json_file_path = pytestconfig.rootpath / json_filename`
fix(ingest): various avro codegen fixes (#2232) 2021-03-15 15:27:30 -07:00
refactor(test): replace `CliRunner` with `run_datahub_cmd` method (#3746) 2021-12-16 23:07:38 -05:00			`run_datahub_cmd(["check", "mce-file", f"{json_file_path}"])`
fix(ingest): properly handle fieldDiscriminator with restli (#2408) 2021-04-16 09:42:52 -07:00

feat(ingest): stricter deserialization for MCE JSONs (#2976) 2021-07-28 14:50:21 -07:00			`@pytest.mark.parametrize(`
			`"json_filename",`
			`[`
			`# Extra field.`
			`"tests/unit/serde/test_serde_extra_field.json",`
			`# Missing fields.`
			`"tests/unit/serde/test_serde_missing_field.json",`
			`],`
			`)`
			`def test_check_mce_schema_failure(`
			`pytestconfig: PytestConfig, json_filename: str`
			`) -> None:`
			`json_file_path = pytestconfig.rootpath / json_filename`

			`with pytest.raises((ValueError, AssertionError)):`
			`check_mce_file(str(json_file_path))`


fix(ingest): properly handle fieldDiscriminator with restli (#2408) 2021-04-16 09:42:52 -07:00			`def test_field_discriminator() -> None:`
			`cost_object = models.CostClass(`
			`costType=models.CostTypeClass.ORG_COST_TYPE,`
			`cost=models.CostCostClass(`
			`fieldDiscriminator=models.CostCostDiscriminatorClass.costCode,`
			`costCode="sampleCostCode",`
			`),`
			`)`

			`assert cost_object.validate()`