2021-02-05 21:03:04 -08:00
|
|
|
import json
|
2021-06-17 10:04:28 -07:00
|
|
|
import types
|
|
|
|
import unittest.mock
|
2021-06-11 09:44:18 -07:00
|
|
|
from pathlib import Path
|
2021-06-17 10:04:28 -07:00
|
|
|
from typing import Dict, Iterable, List, Union
|
2021-02-05 21:03:04 -08:00
|
|
|
|
2021-06-17 10:04:28 -07:00
|
|
|
import avro.schema
|
2021-02-05 21:03:04 -08:00
|
|
|
import click
|
2021-02-11 23:14:20 -08:00
|
|
|
from avrogen import write_schema_files
|
2021-02-05 21:03:04 -08:00
|
|
|
|
2021-06-17 10:04:28 -07:00
|
|
|
|
|
|
|
def load_schema_file(schema_file: str) -> str:
    """Read an Avro schema file and return it re-serialized as normalized JSON.

    The schema is parsed and re-dumped with a 2-space indent so that every
    schema string handled by this script has consistent formatting,
    regardless of how the source .avsc file was formatted.

    Args:
        schema_file: Path to the .avsc (JSON) schema file.

    Returns:
        The schema as a JSON string with normalized whitespace.

    Raises:
        OSError: If the file cannot be read.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Explicit encoding so the result does not depend on the platform's
    # default locale encoding.
    with open(schema_file, encoding="utf-8") as f:
        raw_schema_text = f.read()

    # Round-trip through json to normalize indentation and spacing.
    redo_spaces = json.dumps(json.loads(raw_schema_text), indent=2)

    return redo_spaces
|
def merge_schemas(schemas: List[str]) -> str:
    """Merge several Avro schema JSON strings into a single union schema string."""
    # Build a union type: "null" plus every parsed input schema.
    union = ["null"]
    for raw in schemas:
        union.append(json.loads(raw))

    # Replacement for avro.schema.Names.add_name that tolerates duplicate
    # names: it unconditionally records the schema instead of complaining.
    def _permissive_add_name(self, name_attr, space_attr, new_schema):
        resolved = avro.schema.Name(name_attr, space_attr, self.default_namespace)
        self.names[resolved.fullname] = new_schema
        return resolved

    with unittest.mock.patch("avro.schema.Names.add_name", _permissive_add_name):
        cleaned_schema = avro.schema.make_avsc_object(union)

    # avro's to_json() output can contain mappingproxy objects; this encoder
    # converts them to plain dicts so json.dumps can serialize them.
    class _MappingProxyEncoder(json.JSONEncoder):
        def default(self, obj):
            if isinstance(obj, types.MappingProxyType):
                return dict(obj)
            return super().default(obj)

    # Convert back to an Avro schema JSON representation.
    return json.dumps(cleaned_schema.to_json(), cls=_MappingProxyEncoder, indent=2)
|
2021-06-11 09:44:18 -07:00
|
|
|
autogen_header = """# flake8: noqa
|
2021-02-05 21:03:04 -08:00
|
|
|
|
2021-06-11 09:44:18 -07:00
|
|
|
# This file is autogenerated by /metadata-ingestion/scripts/avro_codegen.py
|
|
|
|
# Do not modify manually!
|
|
|
|
|
|
|
|
# fmt: off
|
|
|
|
"""
|
2021-10-18 18:35:27 -04:00
|
|
|
autogen_footer = """
|
|
|
|
# fmt: on
|
|
|
|
"""
|
2021-06-11 09:44:18 -07:00
|
|
|
|
|
|
|
|
|
|
|
def suppress_checks_in_file(filepath: Union[str, Path]) -> None:
    """
    Adds a couple lines to the top of an autogenerated file:

    - Comments to suppress flake8 and black.
    - A note stating that the file was autogenerated.

    The footer re-enabling black formatting is appended at the end.

    Args:
        filepath: Path of the generated Python file to rewrite in place.
    """
    # Explicit encoding so the rewrite does not depend on the platform's
    # default locale encoding.
    with open(filepath, "r+", encoding="utf-8") as f:
        contents = f.read()

        # Rewind to the start, then write header + original body + footer.
        # The file only ever grows, so nothing needs truncating.
        f.seek(0, 0)
        f.write(autogen_header)
        f.write(contents)
        f.write(autogen_footer)
2021-10-18 18:35:27 -04:00
|
|
|
def add_avro_python3_warning(filepath: Path) -> None:
    """Prepend a runtime check for the conflicting 'avro-python3' package.

    The rewritten module warns at import time if it detects that the
    avro-python3 package has overwritten files from the 'avro' package.
    """
    original = filepath.read_text()

    filepath.write_text(
        f"""
# The SchemaFromJSONData method only exists in avro-python3, but is called make_avsc_object in avro.
# We can use this fact to detect conflicts between the two packages. Pip won't detect those conflicts
# because both are namespace packages, and hence are allowed to overwrite files from each other.
# This means that installation order matters, which is a pretty unintuitive outcome.
# See https://github.com/pypa/pip/issues/4625 for details.
try:
    from avro.schema import SchemaFromJSONData
    import warnings

    warnings.warn("It seems like 'avro-python3' is installed, which conflicts with the 'avro' package used by datahub. "
                  + "Try running `pip uninstall avro-python3 && pip install --upgrade --force-reinstall avro` to fix this issue.")
except ImportError:
    pass

{original}
"""
    )
|
2021-06-17 10:04:28 -07:00
|
|
|
load_schema_method = """
|
|
|
|
import functools
|
|
|
|
import pathlib
|
|
|
|
|
|
|
|
def _load_schema(schema_name: str) -> str:
|
|
|
|
return (pathlib.Path(__file__).parent / f"{schema_name}.avsc").read_text()
|
|
|
|
"""
|
|
|
|
individual_schema_method = """
|
|
|
|
@functools.lru_cache(maxsize=None)
|
|
|
|
def get{schema_name}Schema() -> str:
|
|
|
|
return _load_schema("{schema_name}")
|
|
|
|
"""
|
2021-02-05 21:03:04 -08:00
|
|
|
|
2021-06-17 10:04:28 -07:00
|
|
|
|
|
|
|
def make_load_schema_methods(schemas: Iterable[str]) -> str:
    """Render the schemas/__init__.py source: the shared loader followed by
    one cached get<Name>Schema() accessor per schema name."""
    accessors = [
        individual_schema_method.format(schema_name=name) for name in schemas
    ]
    return load_schema_method + "".join(accessors)
|
2021-06-17 10:04:28 -07:00
|
|
|
@click.command()
@click.argument("schema_files", type=click.Path(exists=True), nargs=-1, required=True)
@click.argument("outdir", type=click.Path(), required=True)
def generate(schema_files: List[str], outdir: str) -> None:
    """Generate Python classes from the given Avro schema files into outdir."""
    # Load and normalize each schema, keyed by its filename stem.
    schemas: Dict[str, str] = {
        Path(schema_file).stem: load_schema_file(schema_file)
        for schema_file in schema_files
    }

    # Run avrogen over the union of all schemas.
    merged_schema = merge_schemas(list(schemas.values()))
    write_schema_files(merged_schema, outdir)

    # Schema files post-processing.
    out_path = Path(outdir)
    (out_path / "__init__.py").write_text("# This file is intentionally empty.\n")
    add_avro_python3_warning(out_path / "schema_classes.py")

    # Save raw schema files in codegen as well.
    schema_save_dir = out_path / "schemas"
    schema_save_dir.mkdir()
    for schema_out_file, schema in schemas.items():
        (schema_save_dir / f"{schema_out_file}.avsc").write_text(schema)

    # Add load_schema method.
    with open(schema_save_dir / "__init__.py", "a") as schema_dir_init:
        schema_dir_init.write(make_load_schema_methods(schemas.keys()))

    # Add headers for all generated files
    for generated_file in out_path.glob("**/*.py"):
        suppress_checks_in_file(generated_file)
2021-02-05 21:03:04 -08:00
|
|
|
|
|
|
|
# Script entry point: delegate to the click command (click parses sys.argv).
if __name__ == "__main__":
    generate()