# datahub/metadata-ingestion/scripts/avro_codegen.py

import json
from pathlib import Path
from typing import Union

import click
from avrogen import write_schema_files

# Banner written at the top of every generated file: it disables flake8 for
# the file, marks the file as autogenerated, and opens a black "fmt: off"
# region.
autogen_header = (
    "# flake8: noqa\n"
    "\n"
    "# This file is autogenerated by /metadata-ingestion/scripts/avro_codegen.py\n"
    "# Do not modify manually!\n"
    "\n"
    "# fmt: off\n"
)


def suppress_checks_in_file(filepath: Union[str, Path]) -> None:
    """
    Wraps an autogenerated file in lint/format suppression markers, in place.

    After the call the file consists of:
        - ``autogen_header`` (flake8 suppression, an "autogenerated" note and
          a ``# fmt: off`` marker),
        - the file's previous contents,
        - a closing ``# fmt: on`` line.

    Args:
        filepath: Path of the file to rewrite. It is opened in ``r+`` mode,
            so it must already exist.
    """

    with open(filepath, "r+") as f:
        contents = f.read()

        # The closing marker must sit on a line of its own. Without this, a
        # file lacking a trailing newline would end in e.g. `x = 1# fmt: on`.
        if contents and not contents.endswith("\n"):
            contents += "\n"

        # The rewritten text is always longer than what was read, so
        # overwriting from offset 0 cannot leave stale bytes behind.
        f.seek(0, 0)
        f.write(autogen_header)
        f.write(contents)
        f.write("# fmt: on\n")


def _collapse_java_string(node: dict) -> Union[dict, str]:
    """Return the plain "string" primitive for the Java-annotated string type.

    Used as a ``json`` ``object_hook``: it receives every decoded JSON object
    and returns its replacement. Only an object with exactly the two keys
    below is rewritten; key order in the source file does not matter.
    """
    java_string_type = {"type": "string", "avro.java.string": "String"}
    return "string" if node == java_string_type else node


@click.command()
@click.argument("schema_file", type=click.Path(exists=True))
@click.argument("outdir", type=click.Path())
def generate(schema_file: str, outdir: str) -> None:
    """Generate Python classes in OUTDIR from the Avro schema in SCHEMA_FILE.

    Every Python file under OUTDIR then gets the autogenerated-file header,
    and OUTDIR/__init__.py is emptied before the header is added.
    """
    # JSON text is UTF-8 by specification; do not depend on the locale default.
    with open(schema_file, encoding="utf-8") as f:
        # Rewrite {"type": "string", "avro.java.string": "String"} to "string"
        # while decoding. Comparing parsed objects instead of serialized text
        # keeps the rewrite independent of key order and whitespace.
        schema = json.load(f, object_hook=_collapse_java_string)

    write_schema_files(json.dumps(schema, indent=2), outdir)

    out_path = Path(outdir)
    with open(out_path / "__init__.py", "w"):
        # Truncate this file.
        pass

    # Add headers for all generated files
    for generated_file in out_path.glob("**/*.py"):
        suppress_checks_in_file(generated_file)


# Run the `generate` click command only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    generate()