fix(ingest): various avro codegen fixes (#2232)

Harshal Sheth 2021-03-15 15:27:30 -07:00 committed by GitHub
parent d895202d52
commit aa6bc15cd7
19 changed files with 4133 additions and 40436 deletions


@@ -4,6 +4,7 @@ output
src/datahub/metadata/
pvenv36/
bq_credentials.json
+/tmp
# Byte-compiled / optimized / DLL files
__pycache__/

File diff suppressed because it is too large.


@@ -107,7 +107,7 @@ def create_ownership_aspect_mce(directive: Directive) -> MetadataChangeEventClass:
owners=[
OwnerClass(
owner=owner_name_to_urn(clean_owner_name(owner)),
-type=OwnershipTypeClass.DATAOWNER,  # type: ignore
+type=OwnershipTypeClass.DATAOWNER,
)
for owner in directive.owners
],
@@ -130,7 +130,7 @@ def create_lineage_aspect_mce(directive: Directive) -> MetadataChangeEventClass:
upstreams=[
UpstreamClass(
dataset=dataset_name_to_urn(upstream),
-type=DatasetLineageTypeClass.TRANSFORMED,  # type: ignore
+type=DatasetLineageTypeClass.TRANSFORMED,
auditStamp=AuditStampClass(
time=int(time.time() * 1000),
actor="urn:li:corpuser:datahub",


@@ -0,0 +1,17 @@
#!/bin/bash
set -euxo pipefail
# We allow for failures in this step. Usually you'll be invoking this
# script to fix a build failure.
pytest --basetemp=tmp || true
# Update the golden files.
cp tmp/test_serde_large0/output.json tests/unit/serde/test_serde_large.json
cp tmp/test_ldap_ingest0/ldap_mces.json tests/integration/ldap/ldap_mce_golden.json
cp tmp/test_mysql_ingest0/mysql_mces.json tests/integration/mysql/mysql_mce_golden.json
cp tmp/test_mssql_ingest0/mssql_mces.json tests/integration/sql_server/mssql_mce_golden.json
# Print success message.
set +x
echo ''
echo 'Make sure to check `git diff` to verify the changes!'


@@ -37,7 +37,7 @@ framework_common = {
"click>=7.1.1",
"pyyaml>=5.4.1",
"toml>=0.10.0",
"avro-gen3==0.3.2",
"avro-gen3==0.3.3",
"avro-python3>=1.8.2",
}


@@ -10,6 +10,7 @@ from datahub.configuration.toml import TomlConfigurationMechanism
from datahub.configuration.yaml import YamlConfigurationMechanism
from datahub.ingestion.run.pipeline import Pipeline
from datahub.ingestion.sink.sink_registry import sink_registry
+from datahub.ingestion.source.mce_file import check_mce_file
from datahub.ingestion.source.source_registry import source_registry
logger = logging.getLogger(__name__)
@@ -37,7 +38,7 @@ def datahub():
@click.option(
"-c",
"--config",
-type=click.Path(exists=True),
+type=click.Path(exists=True, dir_okay=False),
help="Config file in .toml or .yaml format",
required=True,
)
@@ -77,6 +78,8 @@ def ingest(config: str):
@datahub.command(context_settings=DEFAULT_CONTEXT_SETTINGS)
def ingest_list_plugins():
"""List enabled ingestion plugins"""
click.secho("Sources:", bold=True)
click.echo(str(source_registry))
click.echo()
@@ -84,3 +87,17 @@ def ingest_list_plugins():
click.echo(str(sink_registry))
click.echo()
click.echo('If a plugin is disabled, try running: pip install ".[<plugin>]"')
+@datahub.group()
+def check():
+    pass
+
+
+@check.command()
+@click.argument("json-file", type=click.Path(exists=True, dir_okay=False))
+def mce_file(json_file: str):
+    """Check the schema of a MCE JSON file"""
+    report = check_mce_file(json_file)
+    click.echo(report)
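
The new `check` group gives the CLI a schema-validation entry point. A minimal sketch of driving it from Python with click's CliRunner, mirroring the unit tests later in this commit (the JSON path is a placeholder, not a file shipped here):

# Sketch only: invoke the new `datahub check mce-file` command programmatically.
from click.testing import CliRunner

from datahub.entrypoints import datahub

runner = CliRunner()
# "path/to/mces.json" is a placeholder; point this at a real MCE dump.
result = runner.invoke(datahub, ["check", "mce-file", "path/to/mces.json"])
print(result.exit_code, result.output)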


@@ -14,7 +14,9 @@ class WorkUnitMCEExtractor(Extractor):
def get_records(self, workunit) -> Iterable[RecordEnvelope[MetadataChangeEvent]]:
if len(workunit.mce.proposedSnapshot.aspects) == 0:
-raise AttributeError('every mce must have at least one aspect')
+raise AttributeError("every mce must have at least one aspect")
+if not workunit.mce.validate():
+    raise ValueError(f"source produced an invalid MCE: {workunit.mce}")
yield RecordEnvelope(workunit.mce, {})
def close(self):


@@ -41,7 +41,7 @@ _field_type_mapping = {
def _get_column_type(field_type) -> SchemaFieldDataType:
tp = field_type
-if hasattr(tp, 'type'):
+if hasattr(tp, "type"):
tp = tp.type
tp = str(tp)
TypeClass: Any = _field_type_mapping.get(tp)
@@ -56,7 +56,7 @@ def avro_schema_to_mce_fields(avro_schema_string: str) -> List[SchemaField]:
"""Converts an avro schema into a schema compatible with MCE"""
# Handle some library compatibility issues.
-if hasattr(avro.schema, 'parse'):
+if hasattr(avro.schema, "parse"):
schema_parse_fn = avro.schema.parse
else:
schema_parse_fn = avro.schema.Parse
@@ -69,7 +69,9 @@ def avro_schema_to_mce_fields(avro_schema_string: str) -> List[SchemaField]:
fieldPath=parsed_field.name,
nativeDataType=str(parsed_field.type),
type=_get_column_type(parsed_field.type),
-description=parsed_field.props.get('doc', None),
+description=parsed_field.props.get("doc", None),
+recursive=False,
+nullable=(parsed_field.type == "null"),
)
fields.append(field)
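
A hedged sketch of what the converter above consumes and produces; the record literal is invented for illustration, and the import path is an assumption about this repo's module layout:

# Sketch only: run a tiny Avro record through avro_schema_to_mce_fields.
from datahub.ingestion.extractor.schema_util import avro_schema_to_mce_fields  # assumed path

avro_schema = """
{
  "type": "record",
  "name": "User",
  "fields": [
    {"name": "id", "type": "string", "doc": "primary key"}
  ]
}
"""
for field in avro_schema_to_mce_fields(avro_schema):
    # Each SchemaField now carries explicit recursive/nullable values.
    print(field.fieldPath, field.nativeDataType, field.nullable)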


@@ -92,6 +92,7 @@ class KafkaSource(Source):
metadata_record = MetadataChangeEvent()
dataset_snapshot = DatasetSnapshot(
urn=f"urn:li:dataset:(urn:li:dataPlatform:{platform},{dataset_name},{env})",
+aspects=[],  # we append to this list later on
)
dataset_snapshot.aspects.append(Status(removed=False))
metadata_record.proposedSnapshot = dataset_snapshot


@@ -30,6 +30,8 @@ class MetadataFileSource(Source):
for i, obj in enumerate(mce_obj_list):
mce: MetadataChangeEvent = MetadataChangeEvent.from_obj(obj)
+if not mce.validate():
+    raise ValueError(f"failed to parse into valid MCE: {obj}")
wu = MetadataWorkUnit(f"file://{self.config.filename}:{i}", mce)
self.report.report_workunit(wu)
yield wu
@@ -39,3 +41,10 @@
def close(self):
pass
+def check_mce_file(filepath: str) -> str:
+    mce_source = MetadataFileSource.create({"filename": filepath}, None)
+    for _ in mce_source.get_workunits():
+        pass
+    return f"{mce_source.get_report().workunits_produced} MCEs found - all valid"


@@ -162,6 +162,7 @@ def get_schema_metadata(
type=get_column_type(sql_report, dataset_name, column["type"]),
description=column.get("comment", None),
nullable=column["nullable"],
+recursive=False,
)
canonical_schema.append(field)
@@ -229,6 +230,8 @@ class SQLAlchemySource(Source):
if description is not None:
dataset_properties = DatasetPropertiesClass(
description=description,
+tags=[],
+customProperties={},
# uri=dataset_name,
)
dataset_snapshot.aspects.append(dataset_properties)
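
As with `aspects=[]` and `recursive=False` elsewhere in this commit, the explicit `tags=[]` and `customProperties={}` suggest the avro-gen3 0.3.3 codegen no longer supplies defaults for these fields. A hedged sketch of constructing the aspect with every field spelled out (the description value and import path are assumptions):

# Sketch only: build a DatasetProperties aspect with all fields explicit.
from datahub.metadata.schema_classes import DatasetPropertiesClass  # assumed path

dataset_properties = DatasetPropertiesClass(
    description="Example table description",  # placeholder
    tags=[],
    customProperties={},
)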


@@ -8,25 +8,15 @@
{
"com.linkedin.pegasus2avro.identity.CorpUserInfo": {
"active": true,
-"displayName": {
-"string": "Bart Simpson"
-},
+"displayName": "Bart Simpson",
"email": "",
-"title": {
-"string": "Mr. Boss"
-},
+"title": "Mr. Boss",
"managerUrn": null,
"departmentId": null,
"departmentName": null,
-"firstName": {
-"string": "Bart"
-},
-"lastName": {
-"string": "Simpson"
-},
-"fullName": {
-"string": "Bart Simpson"
-},
+"firstName": "Bart",
+"lastName": "Simpson",
+"fullName": "Bart Simpson",
"countryCode": null
}
}
@@ -44,29 +34,15 @@
{
"com.linkedin.pegasus2avro.identity.CorpUserInfo": {
"active": true,
-"displayName": {
-"string": "Homer Simpson"
-},
+"displayName": "Homer Simpson",
"email": "hsimpson",
-"title": {
-"string": "Mr. Everything"
-},
-"managerUrn": {
-"string": "urn:li:corpuser:bsimpson"
-},
+"title": "Mr. Everything",
+"managerUrn": "urn:li:corpuser:bsimpson",
"departmentId": null,
-"departmentName": {
-"string": "1001"
-},
-"firstName": {
-"string": "Homer"
-},
-"lastName": {
-"string": "Simpson"
-},
-"fullName": {
-"string": "Homer Simpson"
-},
+"departmentName": "1001",
+"firstName": "Homer",
+"lastName": "Simpson",
+"fullName": "Homer Simpson",
"countryCode": null
}
}
@@ -84,23 +60,15 @@
{
"com.linkedin.pegasus2avro.identity.CorpUserInfo": {
"active": true,
-"displayName": {
-"string": "Lisa Simpson"
-},
+"displayName": "Lisa Simpson",
"email": "",
"title": null,
"managerUrn": null,
"departmentId": null,
"departmentName": null,
-"firstName": {
-"string": "Lisa"
-},
-"lastName": {
-"string": "Simpson"
-},
-"fullName": {
-"string": "Lisa Simpson"
-},
+"firstName": "Lisa",
+"lastName": "Simpson",
+"fullName": "Lisa Simpson",
"countryCode": null
}
}
@@ -118,23 +86,15 @@
{
"com.linkedin.pegasus2avro.identity.CorpUserInfo": {
"active": true,
-"displayName": {
-"string": "Maggie Simpson"
-},
+"displayName": "Maggie Simpson",
"email": "",
"title": null,
"managerUrn": null,
"departmentId": null,
"departmentName": null,
-"firstName": {
-"string": "Maggie"
-},
-"lastName": {
-"string": "Simpson"
-},
-"fullName": {
-"string": "Maggie Simpson"
-},
+"firstName": "Maggie",
+"lastName": "Simpson",
+"fullName": "Maggie Simpson",
"countryCode": null
}
}

File diff suppressed because it is too large.


@@ -7,8 +7,11 @@ source:
password: example
database: metagalaxy
host_port: localhost:53306
+schema_pattern:
+  allow:
+    - "^metagalaxy"
sink:
type: file
config:
-filename: './mysql_mces.json'
+filename: "./mysql_mces.json"


@@ -29,6 +29,6 @@ def test_mssql_ingest(sql_server, pytestconfig, tmp_path, mock_time):
# Verify the output.
golden = mce_helpers.load_json_file(
-str(test_resources_dir / "mssql_mces_golden.json")
+str(test_resources_dir / "mssql_mce_golden.json")
)
mce_helpers.assert_mces_equal(output, golden)


@@ -1,5 +1,7 @@
import mce_helpers
+from click.testing import CliRunner
+from datahub.entrypoints import datahub
from datahub.ingestion.run.pipeline import Pipeline
@@ -24,3 +26,23 @@ def test_serde_large(pytestconfig, tmp_path):
output = mce_helpers.load_json_file(tmp_path / output_filename)
golden = mce_helpers.load_json_file(golden_file)
mce_helpers.assert_mces_equal(output, golden)
+def test_check_mce_schema(pytestconfig):
+    json_filename = "test_serde_large.json"
+    test_resources_dir = pytestconfig.rootpath / "tests/unit/serde"
+    json_file_path = test_resources_dir / json_filename
+
+    runner = CliRunner()
+    result = runner.invoke(datahub, ["check", "mce-file", f"{json_file_path}"])
+    assert result.exit_code == 0
+
+
+def test_reader_allows_verbose_unions(pytestconfig):
+    json_filename = "test_serde_backwards_compat.json"
+    test_resources_dir = pytestconfig.rootpath / "tests/unit/serde"
+    json_file_path = test_resources_dir / json_filename
+
+    runner = CliRunner()
+    result = runner.invoke(datahub, ["check", "mce-file", f"{json_file_path}"])
+    assert result.exit_code == 0


@@ -0,0 +1,229 @@
[
{
"auditHeader": null,
"proposedSnapshot": {
"com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": {
"urn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,bigquery-public-data.covid19_geotab_mobility_impact.airport_traffic,PROD)",
"aspects": [
{
"com.linkedin.pegasus2avro.dataset.DatasetProperties": {
"description": {
"string": "This dataset shows traffic to and from the Airport as a Percentage of the Traffic volume during the baseline period. The baseline period used for computing this metric is from 1st Feb to 15th March 2020. The dataset gets updated daily."
},
"uri": null,
"tags": [],
"customProperties": {}
}
},
{
"com.linkedin.pegasus2avro.schema.SchemaMetadata": {
"schemaName": "bigquery-public-data.covid19_geotab_mobility_impact.airport_traffic",
"platform": "urn:li:dataPlatform:bigquery",
"version": 0,
"created": {
"time": 1615444202056,
"actor": "urn:li:corpuser:etl",
"impersonator": null
},
"lastModified": {
"time": 1615444202056,
"actor": "urn:li:corpuser:etl",
"impersonator": null
},
"deleted": null,
"dataset": null,
"cluster": null,
"hash": "",
"platformSchema": {
"com.linkedin.pegasus2avro.schema.MySqlDDL": {
"tableSchema": ""
}
},
"fields": [
{
"fieldPath": "aggregation_method",
"jsonPath": null,
"nullable": true,
"description": {
"string": "Aggregation period used to compute this metric"
},
"type": {
"type": {
"com.linkedin.pegasus2avro.schema.StringType": {}
}
},
"nativeDataType": "String()",
"recursive": false,
"globalTags": null
},
{
"fieldPath": "date",
"jsonPath": null,
"nullable": true,
"description": {
"string": "Date of the data"
},
"type": {
"type": {
"com.linkedin.pegasus2avro.schema.DateType": {}
}
},
"nativeDataType": "DATE()",
"recursive": false,
"globalTags": null
},
{
"fieldPath": "version",
"jsonPath": null,
"nullable": true,
"description": {
"string": "Version of the table"
},
"type": {
"type": {
"com.linkedin.pegasus2avro.schema.StringType": {}
}
},
"nativeDataType": "String()",
"recursive": false,
"globalTags": null
},
{
"fieldPath": "airport_name",
"jsonPath": null,
"nullable": true,
"description": {
"string": "Aggregation period used to compute this metric"
},
"type": {
"type": {
"com.linkedin.pegasus2avro.schema.StringType": {}
}
},
"nativeDataType": "String()",
"recursive": false,
"globalTags": null
},
{
"fieldPath": "percent_of_baseline",
"jsonPath": null,
"nullable": true,
"description": {
"string": "Proportion of trips on this date as compared to Avg number of trips on the same day of week in baseline period i.e 1st February 2020 - 15th March 2020"
},
"type": {
"type": {
"com.linkedin.pegasus2avro.schema.NumberType": {}
}
},
"nativeDataType": "Float()",
"recursive": false,
"globalTags": null
},
{
"fieldPath": "center_point_geom",
"jsonPath": null,
"nullable": true,
"description": {
"string": "Geographic representation of the centroid of the Airport polygon"
},
"type": {
"type": {
"com.linkedin.pegasus2avro.schema.NullType": {}
}
},
"nativeDataType": "NullType()",
"recursive": false,
"globalTags": null
},
{
"fieldPath": "city",
"jsonPath": null,
"nullable": true,
"description": {
"string": "City within which the Airport is located"
},
"type": {
"type": {
"com.linkedin.pegasus2avro.schema.StringType": {}
}
},
"nativeDataType": "String()",
"recursive": false,
"globalTags": null
},
{
"fieldPath": "state_region",
"jsonPath": null,
"nullable": true,
"description": {
"string": "State within which the Airport is located"
},
"type": {
"type": {
"com.linkedin.pegasus2avro.schema.StringType": {}
}
},
"nativeDataType": "String()",
"recursive": false,
"globalTags": null
},
{
"fieldPath": "country_iso_code_2",
"jsonPath": null,
"nullable": true,
"description": {
"string": "ISO 3166-2 code representing the county and subdivision within which the Airport is located"
},
"type": {
"type": {
"com.linkedin.pegasus2avro.schema.StringType": {}
}
},
"nativeDataType": "String()",
"recursive": false,
"globalTags": null
},
{
"fieldPath": "country_name",
"jsonPath": null,
"nullable": true,
"description": {
"string": "Full text name of the country within which the Airport is located"
},
"type": {
"type": {
"com.linkedin.pegasus2avro.schema.StringType": {}
}
},
"nativeDataType": "String()",
"recursive": false,
"globalTags": null
},
{
"fieldPath": "airport_geom",
"jsonPath": null,
"nullable": true,
"description": {
"string": "Geographic representation of the Airport polygon"
},
"type": {
"type": {
"com.linkedin.pegasus2avro.schema.NullType": {}
}
},
"nativeDataType": "NullType()",
"recursive": false,
"globalTags": null
}
],
"primaryKeys": null,
"foreignKeysSpecs": null
}
}
]
}
},
"proposedDelta": null
}
]

File diff suppressed because it is too large.