feat: remove jq requirement + tweak modeldocgen args (#6904)

Co-authored-by: Tamas Nemeth <treff7es@gmail.com>
Harshal Sheth authored on 2022-12-30 14:02:57 -05:00, committed by GitHub
parent b796db1caf
commit 62a2aa94f6
7 changed files with 43 additions and 66 deletions

View File

@@ -17,7 +17,6 @@ RUN apt-get update && apt-get install -y \
     && apt-get install -y -qq \
     # gcc \
     make \
-    jq \
     python3-ldap \
     libldap2-dev \
     libsasl2-dev \

View File

@@ -230,8 +230,8 @@ datahub delete --urn "urn:li:dataset:(urn:li:dataPlatform:hive,SampleHiveDataset

 The `get` command allows you to easily retrieve metadata from DataHub, by using the REST API. This works for both versioned aspects and timeseries aspects. For timeseries aspects, it fetches the latest value.
 For example the following command gets the ownership aspect from the dataset `urn:li:dataset:(urn:li:dataPlatform:hive,SampleHiveDataset,PROD)`
-```console
-datahub get --urn "urn:li:dataset:(urn:li:dataPlatform:hive,SampleHiveDataset,PROD)" --aspect ownership | jq
+```shell-session
+$ datahub get --urn "urn:li:dataset:(urn:li:dataPlatform:hive,SampleHiveDataset,PROD)" --aspect ownership
 {
   "value": {
     "com.linkedin.metadata.snapshot.DatasetSnapshot": {

View File

@@ -19,8 +19,7 @@ Before you go further, ensure you have the following installed:

 * [Python >=3.7.0](https://www.python.org/downloads/)
 * [Docker](https://docs.docker.com/get-docker/)
-* [jq](https://stedolan.github.io/jq/download/)
-* [Docker Compose](https://github.com/docker/compose/blob/master/INSTALL.md) - if using Linux
+* [Docker Compose v2](https://docs.docker.com/compose/install/) - may be bundled with docker

 :::note
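
Since the new bullet notes that Compose v2 "may be bundled with docker", a quick way to confirm it is present is the `docker compose` plugin subcommand (illustrative check, not part of this commit; the exact version string will vary):

```shell-session
$ docker compose version
Docker Compose version v2.x.x
```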

View File

@@ -16,9 +16,7 @@ Tested & confirmed config: 2 CPUs, 8GB RAM, 2GB Swap area, and 10GB disk space.
 :::

-2. Install [jq](https://stedolan.github.io/jq/download/)
-3. Launch the Docker Engine from command line or the desktop app.
+2. Launch the Docker Engine from command line or the desktop app.

 3. Install the DataHub CLI

@@ -26,9 +24,8 @@ Tested & confirmed config: 2 CPUs, 8GB RAM, 2GB Swap area, and 10GB disk space.
    b. Run the following commands in your terminal

-   ```
+   ```sh
    python3 -m pip install --upgrade pip wheel setuptools
    python3 -m pip uninstall datahub acryl-datahub || true # sanity check - ok if it fails
    python3 -m pip install --upgrade acryl-datahub
    datahub version
    ```
@@ -88,7 +85,7 @@ Tested & confirmed config: 2 CPUs, 8GB RAM, 2GB Swap area, and 10GB disk space.
 5. To ingest the sample metadata, run the following CLI command from your terminal

-   ```
+   ```bash
    datahub docker ingest-sample-data
    ```
@@ -110,13 +107,13 @@ Command not found: datahub

 If running the datahub cli produces "command not found" errors inside your terminal, your system may be defaulting to an
 older version of Python. Try prefixing your `datahub` commands with `python3 -m`:

-```
+```bash
 python3 -m datahub docker quickstart
 ```

 Another possibility is that your system PATH does not include pip's `$HOME/.local/bin` directory. On linux, you can add this to your `~/.bashrc`:

-```
+```bash
 if [ -d "$HOME/.local/bin" ] ; then
     PATH="$HOME/.local/bin:$PATH"
 fi
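
After appending that snippet to `~/.bashrc`, reloading the shell configuration should make the `datahub` entry point resolvable; a quick sanity check (illustrative, not part of this commit):

```bash
source ~/.bashrc
datahub version
```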

View File

@@ -98,7 +98,7 @@ EOF
     fi

     printf "✨ Setting up prerequisities\n"
-    brew install "jq"
+    # none for now, since jq was removed

     printf "\e[38;2;0;255;0m✅ Done\e[38;2;255;255;255m\n"
 }

View File

@@ -1,7 +1,9 @@
 import glob
 import json
 import logging
+import os
 import re
+import shutil
 import unittest.mock
 from dataclasses import Field, dataclass, field
 from enum import auto
@@ -135,12 +137,7 @@ def load_schema_file(schema_file: str) -> None:
         # probably an aspect schema
         record_schema: avro.schema.RecordSchema = avro_schema
         aspect_def = record_schema.get_prop("Aspect")
-        try:
-            aspect_definition = AspectDefinition(**aspect_def)
-        except Exception as e:
-            import pdb
-
-            breakpoint()
+        aspect_definition = AspectDefinition(**aspect_def)

         aspect_definition.schema = record_schema
         aspect_registry[aspect_definition.name] = aspect_definition
@@ -255,8 +252,9 @@ def make_entity_docs(entity_display_name: str, graph: RelationshipGraph) -> str:
     timeseries_aspects_section = ""
     for aspect in entity_def.aspects or []:
-        aspect_definition: AspectDefinition = aspect_registry.get(aspect)
+        aspect_definition: AspectDefinition = aspect_registry[aspect]
         assert aspect_definition
+        assert aspect_definition.schema
         deprecated_message = (
             " (Deprecated)"
             if aspect_definition.schema.get_prop("Deprecated")
@@ -270,7 +268,7 @@ def make_entity_docs(entity_display_name: str, graph: RelationshipGraph) -> str:
             f"\n### {aspect}{deprecated_message}{timeseries_qualifier}\n"
         )
         this_aspect_doc += f"{aspect_definition.schema.get_prop('doc')}\n"
-        this_aspect_doc += f"<details>\n<summary>Schema</summary>\n\n"
+        this_aspect_doc += "<details>\n<summary>Schema</summary>\n\n"
         # breakpoint()
         this_aspect_doc += f"```javascript\n{json.dumps(aspect_definition.schema.to_json(), indent=2)}\n```\n</details>\n"
@@ -287,20 +285,20 @@ def make_entity_docs(entity_display_name: str, graph: RelationshipGraph) -> str:
     relationships_section = "\n## Relationships\n"
     adjacency = graph.get_adjacency(entity_def.display_name)

     if adjacency.self_loop:
-        relationships_section += f"\n### Self\nThese are the relationships to itself, stored in this entity's aspects"
+        relationships_section += "\n### Self\nThese are the relationships to itself, stored in this entity's aspects"
         for relnship in adjacency.self_loop:
             relationships_section += (
                 f"\n- {relnship.name} ({relnship.doc[1:] if relnship.doc else ''})"
             )

     if adjacency.outgoing:
-        relationships_section += f"\n### Outgoing\nThese are the relationships stored in this entity's aspects"
+        relationships_section += "\n### Outgoing\nThese are the relationships stored in this entity's aspects"
         relationships_section += make_relnship_docs(
             adjacency.outgoing, direction="outgoing"
         )

     if adjacency.incoming:
-        relationships_section += f"\n### Incoming\nThese are the relationships stored in other entity's aspects"
+        relationships_section += "\n### Incoming\nThese are the relationships stored in other entity's aspects"
         relationships_section += make_relnship_docs(
             adjacency.incoming, direction="incoming"
         )
@@ -405,9 +403,6 @@ def generate_stitched_record(relnships_graph: RelationshipGraph) -> List[Any]:
                     f_field.globalTags.tags.append(
                         TagAssociationClass(tag="urn:li:tag:Temporal")
                     )
-                import pdb
-
-                # breakpoint()
                 if "Searchable" in json_dict:
                     f_field.globalTags = f_field.globalTags or GlobalTagsClass(
                         tags=[]
@@ -533,7 +528,7 @@ def get_sorted_entity_names(
         (x, y) for (x, y) in entity_names if y.category == EntityCategory.CORE
     ]
     priority_bearing_core_entities = [(x, y) for (x, y) in core_entities if y.priority]
-    priority_bearing_core_entities.sort(key=lambda x: x[1].priority)
+    priority_bearing_core_entities.sort(key=lambda t: t[1].priority)
     priority_bearing_core_entities = [x for (x, y) in priority_bearing_core_entities]

     non_priority_core_entities = [x for (x, y) in core_entities if not y.priority]
@@ -570,6 +565,7 @@ def preprocess_markdown(markdown_contents: str) -> str:
     content_swap_register = {}
     while inline_pattern.search(markdown_contents, pos=pos):
         match = inline_pattern.search(markdown_contents, pos=pos)
+        assert match
         file_name = match.group(1)
         with open(file_name, "r") as fp:
             inline_content = fp.read()
@@ -587,7 +583,9 @@ def preprocess_markdown(markdown_contents: str) -> str:

 @click.command()
-@click.argument("schema_files", type=click.Path(exists=True), nargs=-1, required=True)
+@click.argument("schemas_root", type=click.Path(exists=True), required=True)
+@click.option("--registry", type=click.Path(exists=True), required=True)
+@click.option("--generated-docs-dir", type=click.Path(exists=True), required=True)
 @click.option("--server", type=str, required=False)
 @click.option("--file", type=str, required=False)
 @click.option(
@@ -596,7 +594,9 @@ def preprocess_markdown(markdown_contents: str) -> str:
 @click.option("--png", type=str, required=False)
 @click.option("--extra-docs", type=str, required=False)
 def generate(
-    schema_files: List[str],
+    schemas_root: str,
+    registry: str,
+    generated_docs_dir: str,
     server: Optional[str],
     file: Optional[str],
     dot: Optional[str],
@@ -619,40 +619,39 @@ def generate(
             final_markdown = preprocess_markdown(file_contents)
             entity_extra_docs[entity_name] = final_markdown

-    for schema_file in schema_files:
-        if schema_file.endswith(".yml") or schema_file.endswith(".yaml"):
-            # registry file
-            load_registry_file(schema_file)
-        else:
-            # schema file
-            load_schema_file(schema_file)
+    # registry file
+    load_registry_file(registry)
+
+    # schema files
+    for schema_file in Path(schemas_root).glob("**/*.avsc"):
+        if (
+            schema_file.name in {"MetadataChangeEvent.avsc"}
+            or json.loads(schema_file.read_text()).get("Aspect") is not None
+        ):
+            load_schema_file(str(schema_file))

     if entity_extra_docs:
         for entity_name in entity_extra_docs:
-            entity_registry.get(entity_name).doc_file_contents = entity_extra_docs[
+            entity_registry[entity_name].doc_file_contents = entity_extra_docs[
                 entity_name
             ]

     relationship_graph = RelationshipGraph()
     events = generate_stitched_record(relationship_graph)

-    generated_docs_dir = "../docs/generated/metamodel"
-    import shutil
-
     shutil.rmtree(f"{generated_docs_dir}/entities", ignore_errors=True)
-    entity_names = [(x, entity_registry.get(x)) for x in generated_documentation]
+    entity_names = [(x, entity_registry[x]) for x in generated_documentation]
     sorted_entity_names = get_sorted_entity_names(entity_names)

     index = 0
     for category, sorted_entities in sorted_entity_names:
         for entity_name in sorted_entities:
-            entity_def = entity_registry.get(entity_name)
+            entity_def = entity_registry[entity_name]
             entity_category = entity_def.category
             entity_dir = f"{generated_docs_dir}/entities/"
-            import os
-
             os.makedirs(entity_dir, exist_ok=True)

View File

@@ -2,32 +2,15 @@
 set -euo pipefail

 OUTDIR=./generated/docs
+DOCS_OUTDIR=../docs/generated/metamodel

+# Note: this assumes that datahub has already been built with `./gradlew build`.
 DATAHUB_ROOT=..
-REGISTRY_ROOT="$DATAHUB_ROOT/metadata-models/src/main/resources"
 SCHEMAS_ROOT="$DATAHUB_ROOT/metadata-events/mxe-schemas/src/mainGeneratedAvroSchema/avro/"
-FILES="$REGISTRY_ROOT/entity-registry.yml $SCHEMAS_ROOT/com/linkedin/mxe/MetadataChangeEvent.avsc"
+ENTITY_REGISTRY="$DATAHUB_ROOT/metadata-models/src/main/resources/entity-registry.yml"
 METADATA_MODEL_DOCS_ROOT="$DATAHUB_ROOT/metadata-models/docs"

-# Since we depend on jq, check if jq is installed
-if ! which jq > /dev/null; then
-    echo "jq is not installed. Please install jq and rerun (https://stedolan.github.io/jq/)"
-    exit 1
-fi
-
-find $SCHEMAS_ROOT -name "*.avsc" | sort | while read file
-do
-    # Add all other files that are aspects but not included in the above
-    if (jq '.Aspect' -e $file > /dev/null)
-    then
-        FILES="${FILES} ${file}"
-    fi
-    echo $FILES > /tmp/docgen_files.txt
-done
-
-FILES=$(cat /tmp/docgen_files.txt)
-
 rm -r $OUTDIR || true

-python scripts/modeldocgen.py $FILES --file generated/docs/metadata_model_mces.json --extra-docs ${METADATA_MODEL_DOCS_ROOT} $@
+python scripts/modeldocgen.py $SCHEMAS_ROOT --registry $ENTITY_REGISTRY --generated-docs-dir $DOCS_OUTDIR --file generated/docs/metadata_model_mces.json --extra-docs ${METADATA_MODEL_DOCS_ROOT} $@

 ## Full version of this command that generates dot files and png files (requires pydot and graphviz)
-# python scripts/modeldocgen.py $FILES --dot generated/docs/metadata_graph.dot --file generated/docs/metadata_model_mces.json --extra-docs ${METADATA_MODEL_DOCS_ROOT} --png generated/docs/metadata_graph.png $@
+# python scripts/modeldocgen.py $SCHEMAS_ROOT --registry $ENTITY_REGISTRY --generated-docs-dir $DOCS_OUTDIR --dot generated/docs/metadata_graph.dot --file generated/docs/metadata_model_mces.json --extra-docs ${METADATA_MODEL_DOCS_ROOT} --png generated/docs/metadata_graph.png $@
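
For reference, with the variables above expanded, the rewritten script reduces to a single invocation; run from the directory containing `scripts/modeldocgen.py` (one level below the repo root, per `DATAHUB_ROOT=..`), it would look roughly like this (illustrative, assuming the layout shown in this diff):

```sh
python scripts/modeldocgen.py \
  ../metadata-events/mxe-schemas/src/mainGeneratedAvroSchema/avro/ \
  --registry ../metadata-models/src/main/resources/entity-registry.yml \
  --generated-docs-dir ../docs/generated/metamodel \
  --file generated/docs/metadata_model_mces.json \
  --extra-docs ../metadata-models/docs
```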