2021-02-05 21:03:04 -08:00
|
|
|
import json
|
2021-06-17 10:04:28 -07:00
|
|
|
import types
|
|
|
|
import unittest.mock
|
2021-06-11 09:44:18 -07:00
|
|
|
from pathlib import Path
|
2021-06-17 10:04:28 -07:00
|
|
|
from typing import Dict, Iterable, List, Union
|
2021-02-05 21:03:04 -08:00
|
|
|
|
2021-06-17 10:04:28 -07:00
|
|
|
import avro.schema
|
2021-02-05 21:03:04 -08:00
|
|
|
import click
|
2021-02-11 23:14:20 -08:00
|
|
|
from avrogen import write_schema_files
|
2021-02-05 21:03:04 -08:00
|
|
|
|
2021-06-17 10:04:28 -07:00
|
|
|
|
|
|
|
def load_schema_file(schema_file: str) -> str:
    """Read an Avro schema file and return it re-serialized with 2-space indentation.

    Round-tripping through ``json.loads``/``json.dumps`` both validates that the
    file contains well-formed JSON and normalizes its whitespace so the text is
    stable regardless of how the source file was formatted.

    Args:
        schema_file: Path to the ``.avsc`` file to read.

    Returns:
        The schema JSON re-serialized with ``indent=2``.

    Raises:
        OSError: If the file cannot be opened or read.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Explicit encoding: .avsc files are JSON, which is UTF-8 by spec. Relying
    # on the platform default encoding can fail (e.g. cp1252 on Windows).
    with open(schema_file, encoding="utf-8") as f:
        raw_schema_text = f.read()

    redo_spaces = json.dumps(json.loads(raw_schema_text), indent=2)

    return redo_spaces
|
|
|
|
|
|
|
|
|
|
|
|
def merge_schemas(schemas: List[str]) -> str:
    """Merge multiple Avro schema JSON strings into a single schema document.

    The schemas are combined into one Avro union (with "null" as the first
    branch) and parsed with the avro library, which resolves cross-schema name
    references. Duplicate named types across the inputs are deduplicated by
    patching the avro name registry (see below).

    Args:
        schemas: Avro schema documents, each as a JSON string.

    Returns:
        The merged union schema re-serialized as indented JSON text.
    """
    # Combine schemas.
    schemas_obj = [json.loads(schema) for schema in schemas]
    # "null" first makes the whole document a valid Avro union type.
    merged = ["null"] + schemas_obj

    # Deduplicate repeated names.
    # Replacement for avro.schema.Names.Register: the stock implementation
    # raises on a duplicate fullname, but the same named type legitimately
    # appears in several input schemas here, so we keep the first definition
    # and silently skip later ones.
    # NOTE(review): relies on avro's internal `_names` dict and the Register
    # method signature — verify when upgrading the avro package.
    def Register(self, schema):
        if schema.fullname in self._names:
            # print(f"deduping {schema.fullname}")
            pass
        else:
            self._names[schema.fullname] = schema

    # Patch only for the duration of the parse so the rest of the process
    # sees unmodified avro behavior.
    with unittest.mock.patch("avro.schema.Names.Register", Register):
        cleaned_schema = avro.schema.SchemaFromJSONData(merged)

    # Convert back to an Avro schema JSON representation.
    # avro's to_json() output can contain mappingproxy objects, which the
    # default JSONEncoder cannot serialize; unwrap them into plain dicts.
    class MappingProxyEncoder(json.JSONEncoder):
        def default(self, obj):
            if isinstance(obj, types.MappingProxyType):
                return dict(obj)
            return json.JSONEncoder.default(self, obj)

    out_schema = cleaned_schema.to_json()
    encoded = json.dumps(out_schema, cls=MappingProxyEncoder, indent=2)
    return encoded
|
|
|
|
|
|
|
|
|
2021-06-11 09:44:18 -07:00
|
|
|
# Header/footer text wrapped around every generated .py file: disables flake8
# and black ("fmt: off"/"fmt: on") since autogenerated code is not lint-clean,
# and warns maintainers not to edit the files by hand.
autogen_header = """# flake8: noqa

# This file is autogenerated by /metadata-ingestion/scripts/avro_codegen.py
# Do not modify manually!

# fmt: off
"""
autogen_footer = "# fmt: on\n"
|
2021-06-11 09:44:18 -07:00
|
|
|
|
|
|
|
|
|
|
|
def suppress_checks_in_file(filepath: Union[str, Path]) -> None:
    """
    Adds a couple lines to the top of an autogenerated file:

    - Comments to suppress flake8 and black.
    - A note stating that the file was autogenerated.

    Args:
        filepath: Path to the generated Python file to rewrite in place.
    """
    # Explicit UTF-8: generated files may contain non-ASCII text (e.g. schema
    # doc strings), and the platform default encoding is not guaranteed to
    # round-trip it.
    with open(filepath, "r+", encoding="utf-8") as f:
        contents = f.read()

        # Rewind and rewrite: header + original contents + footer. The new
        # content is strictly longer than the old, so no truncation is needed.
        f.seek(0, 0)
        f.write(autogen_header)
        f.write(contents)
        f.write(autogen_footer)
|
2021-03-18 02:05:05 -04:00
|
|
|
|
|
|
|
|
2021-06-17 10:04:28 -07:00
|
|
|
# Code templates appended to the generated schemas/__init__.py so consumers
# can fetch raw schema text by name. `load_schema_method` is emitted once;
# `individual_schema_method` is formatted once per schema and produces a
# memoized accessor (e.g. getFooSchema()) that reads the sibling .avsc file.
load_schema_method = """
import functools
import pathlib


def _load_schema(schema_name: str) -> str:
    return (pathlib.Path(__file__).parent / f"{schema_name}.avsc").read_text()
"""
individual_schema_method = """
@functools.lru_cache(maxsize=None)
def get{schema_name}Schema() -> str:
    return _load_schema("{schema_name}")
"""
|
2021-02-05 21:03:04 -08:00
|
|
|
|
2021-06-17 10:04:28 -07:00
|
|
|
|
|
|
|
def make_load_schema_methods(schemas: Iterable[str]) -> str:
    """Render the shared schema loader plus one memoized accessor per schema name."""
    accessors = [
        individual_schema_method.format(schema_name=name) for name in schemas
    ]
    return load_schema_method + "".join(accessors)
|
2021-02-05 21:03:04 -08:00
|
|
|
|
|
|
|
|
2021-06-17 10:04:28 -07:00
|
|
|
@click.command()
@click.argument("schema_files", type=click.Path(exists=True), nargs=-1, required=True)
@click.argument("outdir", type=click.Path(), required=True)
def generate(schema_files: List[str], outdir: str) -> None:
    """Generate Python classes from Avro SCHEMA_FILES into OUTDIR.

    Steps:
      1. Load and normalize each input schema.
      2. Merge them into one union schema and run avrogen code generation.
      3. Save the raw .avsc files alongside the generated code, plus memoized
         accessor functions for reading them back.
      4. Prepend lint-suppression headers to every generated .py file.
    """
    # Keyed by file stem (the schema name) -> normalized schema JSON text.
    schemas: Dict[str, str] = {}
    for schema_file in schema_files:
        schema = load_schema_file(schema_file)
        schemas[Path(schema_file).stem] = schema

    merged_schema = merge_schemas(list(schemas.values()))

    write_schema_files(merged_schema, outdir)
    with open(f"{outdir}/__init__.py", "w"):
        # Truncate this file.
        pass

    # Save raw schema files in codegen as well.
    schema_save_dir = Path(outdir) / "schemas"
    # exist_ok so the codegen can be re-run over a previous output directory
    # without failing on the already-created schemas/ folder.
    schema_save_dir.mkdir(parents=True, exist_ok=True)
    for schema_out_file, schema in schemas.items():
        (schema_save_dir / f"{schema_out_file}.avsc").write_text(schema)

    # Add load_schema method.
    with open(schema_save_dir / "__init__.py", "a") as schema_dir_init:
        schema_dir_init.write(make_load_schema_methods(schemas.keys()))

    # Add headers for all generated files
    generated_files = Path(outdir).glob("**/*.py")
    for file in generated_files:
        suppress_checks_in_file(file)
|
|
|
|
|
2021-02-05 21:03:04 -08:00
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Click parses SCHEMA_FILES and OUTDIR from sys.argv.
    generate()
|