feat: remove jq requirement + tweak modeldocgen args (#6904)

Co-authored-by: Tamas Nemeth <treff7es@gmail.com>

parent b796db1caf
commit 62a2aa94f6
@@ -17,7 +17,6 @@ RUN apt-get update && apt-get install -y \
     && apt-get install -y -qq \
     # gcc \
     make \
-    jq \
     python3-ldap \
     libldap2-dev \
     libsasl2-dev \
@@ -230,8 +230,8 @@ datahub delete --urn "urn:li:dataset:(urn:li:dataPlatform:hive,SampleHiveDataset

 The `get` command allows you to easily retrieve metadata from DataHub, by using the REST API. This works for both versioned aspects and timeseries aspects. For timeseries aspects, it fetches the latest value.

 For example the following command gets the ownership aspect from the dataset `urn:li:dataset:(urn:li:dataPlatform:hive,SampleHiveDataset,PROD)`

-```console
-datahub get --urn "urn:li:dataset:(urn:li:dataPlatform:hive,SampleHiveDataset,PROD)" --aspect ownership | jq
+```shell-session
+$ datahub get --urn "urn:li:dataset:(urn:li:dataPlatform:hive,SampleHiveDataset,PROD)" --aspect ownership
 {
   "value": {
     "com.linkedin.metadata.snapshot.DatasetSnapshot": {
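With the `| jq` pipe gone from this doc, any post-processing of `get` output can stay in the standard library. A minimal sketch, assuming the `datahub` CLI is on `PATH` and prints the aspect as JSON on stdout (as the new example output above shows):

```python
import json
import subprocess

# Fetch the ownership aspect and parse it with the stdlib instead of jq.
result = subprocess.run(
    [
        "datahub", "get",
        "--urn", "urn:li:dataset:(urn:li:dataPlatform:hive,SampleHiveDataset,PROD)",
        "--aspect", "ownership",
    ],
    capture_output=True,
    text=True,
    check=True,
)
aspect = json.loads(result.stdout)
print(json.dumps(aspect, indent=2))
```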
@@ -19,8 +19,7 @@ Before you go further, ensure you have the following installed:

 * [Python >=3.7.0](https://www.python.org/downloads/)
 * [Docker](https://docs.docker.com/get-docker/)
-* [jq](https://stedolan.github.io/jq/download/)
 * [Docker Compose](https://github.com/docker/compose/blob/master/INSTALL.md) - if using Linux
 * [Docker Compose v2](https://docs.docker.com/compose/install/) - may be bundled with docker

 :::note
@@ -16,9 +16,7 @@ Tested & confirmed config: 2 CPUs, 8GB RAM, 2GB Swap area, and 10GB disk space.

 :::

-2. Install [jq](https://stedolan.github.io/jq/download/)
-
-3. Launch the Docker Engine from command line or the desktop app.
+2. Launch the Docker Engine from command line or the desktop app.

 3. Install the DataHub CLI
@@ -26,9 +24,8 @@ Tested & confirmed config: 2 CPUs, 8GB RAM, 2GB Swap area, and 10GB disk space.

    b. Run the following commands in your terminal

-   ```
+   ```sh
    python3 -m pip install --upgrade pip wheel setuptools
    python3 -m pip uninstall datahub acryl-datahub || true  # sanity check - ok if it fails
    python3 -m pip install --upgrade acryl-datahub
    datahub version
    ```
@@ -88,7 +85,7 @@ Tested & confirmed config: 2 CPUs, 8GB RAM, 2GB Swap area, and 10GB disk space.

 5. To ingest the sample metadata, run the following CLI command from your terminal

-   ```
+   ```bash
    datahub docker ingest-sample-data
    ```
@@ -110,13 +107,13 @@ Command not found: datahub

 If running the datahub cli produces "command not found" errors inside your terminal, your system may be defaulting to an
 older version of Python. Try prefixing your `datahub` commands with `python3 -m`:

-```
+```bash
 python3 -m datahub docker quickstart
 ```

 Another possibility is that your system PATH does not include pip's `$HOME/.local/bin` directory. On linux, you can add this to your `~/.bashrc`:

-```
+```bash
 if [ -d "$HOME/.local/bin" ] ; then
     PATH="$HOME/.local/bin:$PATH"
 fi
@@ -98,7 +98,7 @@ EOF
 fi

 printf "✨ Setting up prerequisities\n"
-brew install "jq"
+# none for now, since jq was removed

 printf "\e[38;2;0;255;0m✅ Done\e[38;2;255;255;255m\n"
 }
@@ -1,7 +1,9 @@
 import glob
 import json
 import logging
+import os
 import re
+import shutil
 import unittest.mock
 from dataclasses import Field, dataclass, field
 from enum import auto
@@ -135,12 +137,7 @@ def load_schema_file(schema_file: str) -> None:
         # probably an aspect schema
         record_schema: avro.schema.RecordSchema = avro_schema
         aspect_def = record_schema.get_prop("Aspect")
-        try:
-            aspect_definition = AspectDefinition(**aspect_def)
-        except Exception as e:
-            import pdb
-
-            breakpoint()
+        aspect_definition = AspectDefinition(**aspect_def)

         aspect_definition.schema = record_schema
         aspect_registry[aspect_definition.name] = aspect_definition
@@ -255,8 +252,9 @@ def make_entity_docs(entity_display_name: str, graph: RelationshipGraph) -> str:
     timeseries_aspects_section = ""

     for aspect in entity_def.aspects or []:
-        aspect_definition: AspectDefinition = aspect_registry.get(aspect)
+        aspect_definition: AspectDefinition = aspect_registry[aspect]
         assert aspect_definition
         assert aspect_definition.schema
         deprecated_message = (
             " (Deprecated)"
             if aspect_definition.schema.get_prop("Deprecated")
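This hunk, and the matching ones further down, replace `dict.get(key)` with direct indexing. `.get` returns `None` on a miss, so the failure only surfaces at the later `assert` (and its `Optional` return type clashes with the `AspectDefinition` annotation); indexing raises a `KeyError` naming the missing aspect at the lookup itself. A toy sketch of the difference, using a stand-in registry dict:

```python
registry = {"ownership": "aspect-definition"}

# .get defers the failure: a miss yields None, which only trips a later assert.
missing = registry.get("ownershipp")  # hypothetical typo'd key
print(missing)  # None

# Indexing fails fast, with the offending key in the exception.
try:
    registry["ownershipp"]
except KeyError as err:
    print(f"lookup failed immediately: {err}")
```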
@@ -270,7 +268,7 @@ def make_entity_docs(entity_display_name: str, graph: RelationshipGraph) -> str:
             f"\n### {aspect}{deprecated_message}{timeseries_qualifier}\n"
         )
         this_aspect_doc += f"{aspect_definition.schema.get_prop('doc')}\n"
-        this_aspect_doc += f"<details>\n<summary>Schema</summary>\n\n"
+        this_aspect_doc += "<details>\n<summary>Schema</summary>\n\n"
         # breakpoint()
         this_aspect_doc += f"```javascript\n{json.dumps(aspect_definition.schema.to_json(), indent=2)}\n```\n</details>\n"
@@ -287,20 +285,20 @@ def make_entity_docs(entity_display_name: str, graph: RelationshipGraph) -> str:
     relationships_section = "\n## Relationships\n"
     adjacency = graph.get_adjacency(entity_def.display_name)
     if adjacency.self_loop:
-        relationships_section += f"\n### Self\nThese are the relationships to itself, stored in this entity's aspects"
+        relationships_section += "\n### Self\nThese are the relationships to itself, stored in this entity's aspects"
         for relnship in adjacency.self_loop:
             relationships_section += (
                 f"\n- {relnship.name} ({relnship.doc[1:] if relnship.doc else ''})"
             )

     if adjacency.outgoing:
-        relationships_section += f"\n### Outgoing\nThese are the relationships stored in this entity's aspects"
+        relationships_section += "\n### Outgoing\nThese are the relationships stored in this entity's aspects"
         relationships_section += make_relnship_docs(
             adjacency.outgoing, direction="outgoing"
         )

     if adjacency.incoming:
-        relationships_section += f"\n### Incoming\nThese are the relationships stored in other entity's aspects"
+        relationships_section += "\n### Incoming\nThese are the relationships stored in other entity's aspects"
         relationships_section += make_relnship_docs(
             adjacency.incoming, direction="incoming"
         )
@@ -405,9 +403,6 @@ def generate_stitched_record(relnships_graph: RelationshipGraph) -> List[Any]:
                     f_field.globalTags.tags.append(
                         TagAssociationClass(tag="urn:li:tag:Temporal")
                     )
-                import pdb
-
-                # breakpoint()
                 if "Searchable" in json_dict:
                     f_field.globalTags = f_field.globalTags or GlobalTagsClass(
                         tags=[]
@@ -533,7 +528,7 @@ def get_sorted_entity_names(
         (x, y) for (x, y) in entity_names if y.category == EntityCategory.CORE
     ]
     priority_bearing_core_entities = [(x, y) for (x, y) in core_entities if y.priority]
-    priority_bearing_core_entities.sort(key=lambda x: x[1].priority)
+    priority_bearing_core_entities.sort(key=lambda t: t[1].priority)
     priority_bearing_core_entities = [x for (x, y) in priority_bearing_core_entities]

     non_priority_core_entities = [x for (x, y) in core_entities if not y.priority]
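The lambda rename here is cosmetic; the surrounding two-phase ordering is the interesting part. A toy sketch of the same pattern, with a hypothetical `Entity` class standing in for the real entity definitions:

```python
from dataclasses import dataclass
from typing import List, Optional, Tuple

@dataclass
class Entity:  # hypothetical stand-in for the real entity definition
    priority: Optional[int] = None

entity_names: List[Tuple[str, Entity]] = [
    ("chart", Entity(priority=2)),
    ("dataset", Entity(priority=1)),
    ("mlModel", Entity()),
]

# Entities with an explicit priority come first, sorted by that priority...
prioritized = [(x, y) for (x, y) in entity_names if y.priority]
prioritized.sort(key=lambda t: t[1].priority)
# ...followed by everything without one.
ordered = [x for (x, _) in prioritized] + [x for (x, y) in entity_names if not y.priority]
print(ordered)  # ['dataset', 'chart', 'mlModel']
```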
@@ -570,6 +565,7 @@ def preprocess_markdown(markdown_contents: str) -> str:
     content_swap_register = {}
     while inline_pattern.search(markdown_contents, pos=pos):
         match = inline_pattern.search(markdown_contents, pos=pos)
+        assert match
         file_name = match.group(1)
         with open(file_name, "r") as fp:
             inline_content = fp.read()
@@ -587,7 +583,9 @@ def preprocess_markdown(markdown_contents: str) -> str:


 @click.command()
-@click.argument("schema_files", type=click.Path(exists=True), nargs=-1, required=True)
+@click.argument("schemas_root", type=click.Path(exists=True), required=True)
+@click.option("--registry", type=click.Path(exists=True), required=True)
+@click.option("--generated-docs-dir", type=click.Path(exists=True), required=True)
 @click.option("--server", type=str, required=False)
 @click.option("--file", type=str, required=False)
 @click.option(
@@ -596,7 +594,9 @@ def preprocess_markdown(markdown_contents: str) -> str:
 @click.option("--png", type=str, required=False)
 @click.option("--extra-docs", type=str, required=False)
 def generate(
-    schema_files: List[str],
+    schemas_root: str,
+    registry: str,
+    generated_docs_dir: str,
     server: Optional[str],
     file: Optional[str],
     dot: Optional[str],
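The two hunks above are the "tweak modeldocgen args" half of the commit: the variadic `schema_files` argument becomes a single `schemas_root` directory plus explicit `--registry` and `--generated-docs-dir` options. A stripped-down sketch of the resulting click interface (not the real script, whose `generate` does far more):

```python
import click

@click.command()
@click.argument("schemas_root", type=click.Path(exists=True), required=True)
@click.option("--registry", type=click.Path(exists=True), required=True)
@click.option("--generated-docs-dir", type=click.Path(exists=True), required=True)
def generate(schemas_root: str, registry: str, generated_docs_dir: str) -> None:
    # The real script walks schemas_root for *.avsc files, loads the registry,
    # and writes docs under generated_docs_dir; here we just echo the arguments.
    click.echo(f"schemas_root={schemas_root}")
    click.echo(f"registry={registry}")
    click.echo(f"generated_docs_dir={generated_docs_dir}")

if __name__ == "__main__":
    generate()
```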
@@ -619,40 +619,39 @@ def generate(
         final_markdown = preprocess_markdown(file_contents)
         entity_extra_docs[entity_name] = final_markdown

-    for schema_file in schema_files:
-        if schema_file.endswith(".yml") or schema_file.endswith(".yaml"):
-            # registry file
-            load_registry_file(schema_file)
-        else:
-            # schema file
-            load_schema_file(schema_file)
+    # registry file
+    load_registry_file(registry)
+
+    # schema files
+    for schema_file in Path(schemas_root).glob("**/*.avsc"):
+        if (
+            schema_file.name in {"MetadataChangeEvent.avsc"}
+            or json.loads(schema_file.read_text()).get("Aspect") is not None
+        ):
+            load_schema_file(str(schema_file))

     if entity_extra_docs:
         for entity_name in entity_extra_docs:
-            entity_registry.get(entity_name).doc_file_contents = entity_extra_docs[
+            entity_registry[entity_name].doc_file_contents = entity_extra_docs[
                 entity_name
             ]

     relationship_graph = RelationshipGraph()
     events = generate_stitched_record(relationship_graph)

-    generated_docs_dir = "../docs/generated/metamodel"
-    import shutil
-
     shutil.rmtree(f"{generated_docs_dir}/entities", ignore_errors=True)
-    entity_names = [(x, entity_registry.get(x)) for x in generated_documentation]
+    entity_names = [(x, entity_registry[x]) for x in generated_documentation]

     sorted_entity_names = get_sorted_entity_names(entity_names)

     index = 0
     for category, sorted_entities in sorted_entity_names:
         for entity_name in sorted_entities:
-            entity_def = entity_registry.get(entity_name)
+            entity_def = entity_registry[entity_name]

             entity_category = entity_def.category
             entity_dir = f"{generated_docs_dir}/entities/"
-            import os
-
             os.makedirs(entity_dir, exist_ok=True)
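This is where the jq dependency actually disappears: instead of the shell computing a file list with `jq '.Aspect'` (see the script hunk below), the Python side now walks the schema tree itself. A self-contained sketch of the same filter, with `schemas/` as a placeholder root:

```python
import json
from pathlib import Path

schemas_root = Path("schemas")  # placeholder; the real value comes from SCHEMAS_ROOT

for schema_file in sorted(schemas_root.glob("**/*.avsc")):
    payload = json.loads(schema_file.read_text())
    # Keep MetadataChangeEvent.avsc plus any schema carrying an "Aspect" property,
    # mirroring the old `jq '.Aspect' -e "$file"` check.
    if schema_file.name == "MetadataChangeEvent.avsc" or payload.get("Aspect") is not None:
        print(f"would load {schema_file}")
```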
@@ -2,32 +2,15 @@
 set -euo pipefail

 OUTDIR=./generated/docs
+DOCS_OUTDIR=../docs/generated/metamodel

 # Note: this assumes that datahub has already been built with `./gradlew build`.
 DATAHUB_ROOT=..
-REGISTRY_ROOT="$DATAHUB_ROOT/metadata-models/src/main/resources"
 SCHEMAS_ROOT="$DATAHUB_ROOT/metadata-events/mxe-schemas/src/mainGeneratedAvroSchema/avro/"
-FILES="$REGISTRY_ROOT/entity-registry.yml $SCHEMAS_ROOT/com/linkedin/mxe/MetadataChangeEvent.avsc"
+ENTITY_REGISTRY="$DATAHUB_ROOT/metadata-models/src/main/resources/entity-registry.yml"
 METADATA_MODEL_DOCS_ROOT="$DATAHUB_ROOT/metadata-models/docs"
-# Since we depend on jq, check if jq is installed
-if ! which jq > /dev/null; then
-    echo "jq is not installed. Please install jq and rerun (https://stedolan.github.io/jq/)"
-    exit 1
-fi
-
-find $SCHEMAS_ROOT -name "*.avsc" | sort | while read file
-do
-    # Add all other files that are aspects but not included in the above
-    if (jq '.Aspect' -e $file > /dev/null)
-    then
-        FILES="${FILES} ${file}"
-    fi
-    echo $FILES > /tmp/docgen_files.txt
-done
-
-FILES=$(cat /tmp/docgen_files.txt)
-

 rm -r $OUTDIR || true
-python scripts/modeldocgen.py $FILES --file generated/docs/metadata_model_mces.json --extra-docs ${METADATA_MODEL_DOCS_ROOT} $@
+python scripts/modeldocgen.py $SCHEMAS_ROOT --registry $ENTITY_REGISTRY --generated-docs-dir $DOCS_OUTDIR --file generated/docs/metadata_model_mces.json --extra-docs ${METADATA_MODEL_DOCS_ROOT} $@
 ## Full version of this command that generates dot files and png files (requires pydot and graphviz)
-# python scripts/modeldocgen.py $FILES --dot generated/docs/metadata_graph.dot --file generated/docs/metadata_model_mces.json --extra-docs ${METADATA_MODEL_DOCS_ROOT} --png generated/docs/metadata_graph.png $@
+# python scripts/modeldocgen.py $SCHEMAS_ROOT --registry $ENTITY_REGISTRY --generated-docs-dir $DOCS_OUTDIR --dot generated/docs/metadata_graph.dot --file generated/docs/metadata_model_mces.json --extra-docs ${METADATA_MODEL_DOCS_ROOT} --png generated/docs/metadata_graph.png $@