feat: remove jq requirement + tweak modeldocgen args (#6904)

Co-authored-by: Tamas Nemeth <treff7es@gmail.com>
Harshal Sheth authored on 2022-12-30 14:02:57 -05:00, committed by GitHub
parent b796db1caf
commit 62a2aa94f6
7 changed files with 43 additions and 66 deletions

View File

@@ -17,7 +17,6 @@ RUN apt-get update && apt-get install -y \
     && apt-get install -y -qq \
     # gcc \
     make \
-    jq \
     python3-ldap \
     libldap2-dev \
     libsasl2-dev \

View File

@@ -230,8 +230,8 @@ datahub delete --urn "urn:li:dataset:(urn:li:dataPlatform:hive,SampleHiveDataset

 The `get` command allows you to easily retrieve metadata from DataHub, by using the REST API. This works for both versioned aspects and timeseries aspects. For timeseries aspects, it fetches the latest value.
 For example the following command gets the ownership aspect from the dataset `urn:li:dataset:(urn:li:dataPlatform:hive,SampleHiveDataset,PROD)`
-```console
-datahub get --urn "urn:li:dataset:(urn:li:dataPlatform:hive,SampleHiveDataset,PROD)" --aspect ownership | jq
+```shell-session
+$ datahub get --urn "urn:li:dataset:(urn:li:dataPlatform:hive,SampleHiveDataset,PROD)" --aspect ownership
 {
   "value": {
     "com.linkedin.metadata.snapshot.DatasetSnapshot": {

View File

@@ -19,8 +19,7 @@ Before you go further, ensure you have the following installed:

 * [Python >=3.7.0](https://www.python.org/downloads/)
 * [Docker](https://docs.docker.com/get-docker/)
-* [jq](https://stedolan.github.io/jq/download/)
-* [Docker Compose](https://github.com/docker/compose/blob/master/INSTALL.md) - if using Linux
+* [Docker Compose v2](https://docs.docker.com/compose/install/) - may be bundled with docker

 :::note
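
Since the new bullet notes that Compose v2 "may be bundled with docker", a quick way to confirm it is present is the `docker compose` plugin subcommand (illustrative check, not part of this commit; the exact version string will vary):

```shell-session
$ docker compose version
Docker Compose version v2.x.x
```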

View File

@@ -16,9 +16,7 @@ Tested & confirmed config: 2 CPUs, 8GB RAM, 2GB Swap area, and 10GB disk space.
 :::

-2. Install [jq](https://stedolan.github.io/jq/download/)
-3. Launch the Docker Engine from command line or the desktop app.
+2. Launch the Docker Engine from command line or the desktop app.

 3. Install the DataHub CLI

@@ -26,9 +24,8 @@ Tested & confirmed config: 2 CPUs, 8GB RAM, 2GB Swap area, and 10GB disk space.
    b. Run the following commands in your terminal

-   ```
+   ```sh
    python3 -m pip install --upgrade pip wheel setuptools
    python3 -m pip uninstall datahub acryl-datahub || true # sanity check - ok if it fails
    python3 -m pip install --upgrade acryl-datahub
    datahub version
    ```
@@ -88,7 +85,7 @@ Tested & confirmed config: 2 CPUs, 8GB RAM, 2GB Swap area, and 10GB disk space.
 5. To ingest the sample metadata, run the following CLI command from your terminal

-   ```
+   ```bash
    datahub docker ingest-sample-data
    ```
@@ -110,13 +107,13 @@ Command not found: datahub

 If running the datahub cli produces "command not found" errors inside your terminal, your system may be defaulting to an
 older version of Python. Try prefixing your `datahub` commands with `python3 -m`:

-```
+```bash
 python3 -m datahub docker quickstart
 ```

 Another possibility is that your system PATH does not include pip's `$HOME/.local/bin` directory. On linux, you can add this to your `~/.bashrc`:

-```
+```bash
 if [ -d "$HOME/.local/bin" ] ; then
     PATH="$HOME/.local/bin:$PATH"
 fi
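
After appending that snippet to `~/.bashrc`, reloading the shell configuration should make the `datahub` entry point resolvable; a quick sanity check (illustrative, not part of this commit):

```bash
source ~/.bashrc
datahub version
```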

View File

@@ -98,7 +98,7 @@ EOF
     fi

     printf "✨ Setting up prerequisities\n"
-    brew install "jq"
+    # none for now, since jq was removed

     printf "\e[38;2;0;255;0m✅ Done\e[38;2;255;255;255m\n"
 }

View File

@@ -1,7 +1,9 @@
 import glob
 import json
 import logging
+import os
 import re
+import shutil
 import unittest.mock
 from dataclasses import Field, dataclass, field
 from enum import auto
@@ -135,12 +137,7 @@ def load_schema_file(schema_file: str) -> None:
         # probably an aspect schema
         record_schema: avro.schema.RecordSchema = avro_schema
         aspect_def = record_schema.get_prop("Aspect")
-        try:
-            aspect_definition = AspectDefinition(**aspect_def)
-        except Exception as e:
-            import pdb
-
-            breakpoint()
+        aspect_definition = AspectDefinition(**aspect_def)

         aspect_definition.schema = record_schema
         aspect_registry[aspect_definition.name] = aspect_definition
@@ -255,8 +252,9 @@ def make_entity_docs(entity_display_name: str, graph: RelationshipGraph) -> str:
     timeseries_aspects_section = ""
     for aspect in entity_def.aspects or []:
-        aspect_definition: AspectDefinition = aspect_registry.get(aspect)
+        aspect_definition: AspectDefinition = aspect_registry[aspect]
         assert aspect_definition
+        assert aspect_definition.schema
         deprecated_message = (
             " (Deprecated)"
             if aspect_definition.schema.get_prop("Deprecated")
@@ -270,7 +268,7 @@ def make_entity_docs(entity_display_name: str, graph: RelationshipGraph) -> str:
             f"\n### {aspect}{deprecated_message}{timeseries_qualifier}\n"
         )
         this_aspect_doc += f"{aspect_definition.schema.get_prop('doc')}\n"
-        this_aspect_doc += f"<details>\n<summary>Schema</summary>\n\n"
+        this_aspect_doc += "<details>\n<summary>Schema</summary>\n\n"
         # breakpoint()
         this_aspect_doc += f"```javascript\n{json.dumps(aspect_definition.schema.to_json(), indent=2)}\n```\n</details>\n"
@@ -287,20 +285,20 @@ def make_entity_docs(entity_display_name: str, graph: RelationshipGraph) -> str:
     relationships_section = "\n## Relationships\n"
     adjacency = graph.get_adjacency(entity_def.display_name)

     if adjacency.self_loop:
-        relationships_section += f"\n### Self\nThese are the relationships to itself, stored in this entity's aspects"
+        relationships_section += "\n### Self\nThese are the relationships to itself, stored in this entity's aspects"
         for relnship in adjacency.self_loop:
             relationships_section += (
                 f"\n- {relnship.name} ({relnship.doc[1:] if relnship.doc else ''})"
             )

     if adjacency.outgoing:
-        relationships_section += f"\n### Outgoing\nThese are the relationships stored in this entity's aspects"
+        relationships_section += "\n### Outgoing\nThese are the relationships stored in this entity's aspects"
         relationships_section += make_relnship_docs(
             adjacency.outgoing, direction="outgoing"
         )

     if adjacency.incoming:
-        relationships_section += f"\n### Incoming\nThese are the relationships stored in other entity's aspects"
+        relationships_section += "\n### Incoming\nThese are the relationships stored in other entity's aspects"
         relationships_section += make_relnship_docs(
             adjacency.incoming, direction="incoming"
         )
@@ -405,9 +403,6 @@ def generate_stitched_record(relnships_graph: RelationshipGraph) -> List[Any]:
                     f_field.globalTags.tags.append(
                         TagAssociationClass(tag="urn:li:tag:Temporal")
                     )
-                import pdb
-
-                # breakpoint()
                 if "Searchable" in json_dict:
                     f_field.globalTags = f_field.globalTags or GlobalTagsClass(
                         tags=[]
@@ -533,7 +528,7 @@ def get_sorted_entity_names(
         (x, y) for (x, y) in entity_names if y.category == EntityCategory.CORE
     ]
     priority_bearing_core_entities = [(x, y) for (x, y) in core_entities if y.priority]
-    priority_bearing_core_entities.sort(key=lambda x: x[1].priority)
+    priority_bearing_core_entities.sort(key=lambda t: t[1].priority)
     priority_bearing_core_entities = [x for (x, y) in priority_bearing_core_entities]

     non_priority_core_entities = [x for (x, y) in core_entities if not y.priority]
@@ -570,6 +565,7 @@ def preprocess_markdown(markdown_contents: str) -> str:
     content_swap_register = {}
     while inline_pattern.search(markdown_contents, pos=pos):
         match = inline_pattern.search(markdown_contents, pos=pos)
+        assert match
         file_name = match.group(1)
         with open(file_name, "r") as fp:
             inline_content = fp.read()
@@ -587,7 +583,9 @@ def preprocess_markdown(markdown_contents: str) -> str:

 @click.command()
-@click.argument("schema_files", type=click.Path(exists=True), nargs=-1, required=True)
+@click.argument("schemas_root", type=click.Path(exists=True), required=True)
+@click.option("--registry", type=click.Path(exists=True), required=True)
+@click.option("--generated-docs-dir", type=click.Path(exists=True), required=True)
 @click.option("--server", type=str, required=False)
 @click.option("--file", type=str, required=False)
 @click.option(
@@ -596,7 +594,9 @@ def preprocess_markdown(markdown_contents: str) -> str:
 @click.option("--png", type=str, required=False)
 @click.option("--extra-docs", type=str, required=False)
 def generate(
-    schema_files: List[str],
+    schemas_root: str,
+    registry: str,
+    generated_docs_dir: str,
     server: Optional[str],
     file: Optional[str],
     dot: Optional[str],
@@ -619,40 +619,39 @@ def generate(
             final_markdown = preprocess_markdown(file_contents)
             entity_extra_docs[entity_name] = final_markdown

-    for schema_file in schema_files:
-        if schema_file.endswith(".yml") or schema_file.endswith(".yaml"):
-            # registry file
-            load_registry_file(schema_file)
-        else:
-            # schema file
-            load_schema_file(schema_file)
+    # registry file
+    load_registry_file(registry)
+
+    # schema files
+    for schema_file in Path(schemas_root).glob("**/*.avsc"):
+        if (
+            schema_file.name in {"MetadataChangeEvent.avsc"}
+            or json.loads(schema_file.read_text()).get("Aspect") is not None
+        ):
+            load_schema_file(str(schema_file))

     if entity_extra_docs:
         for entity_name in entity_extra_docs:
-            entity_registry.get(entity_name).doc_file_contents = entity_extra_docs[
+            entity_registry[entity_name].doc_file_contents = entity_extra_docs[
                 entity_name
             ]

     relationship_graph = RelationshipGraph()
     events = generate_stitched_record(relationship_graph)

-    generated_docs_dir = "../docs/generated/metamodel"
-    import shutil
-
     shutil.rmtree(f"{generated_docs_dir}/entities", ignore_errors=True)
-    entity_names = [(x, entity_registry.get(x)) for x in generated_documentation]
+    entity_names = [(x, entity_registry[x]) for x in generated_documentation]
     sorted_entity_names = get_sorted_entity_names(entity_names)

     index = 0
     for category, sorted_entities in sorted_entity_names:
         for entity_name in sorted_entities:
-            entity_def = entity_registry.get(entity_name)
+            entity_def = entity_registry[entity_name]
             entity_category = entity_def.category
             entity_dir = f"{generated_docs_dir}/entities/"
-            import os
-
             os.makedirs(entity_dir, exist_ok=True)

View File

@@ -2,32 +2,15 @@
 set -euo pipefail

 OUTDIR=./generated/docs
+DOCS_OUTDIR=../docs/generated/metamodel

+# Note: this assumes that datahub has already been built with `./gradlew build`.
 DATAHUB_ROOT=..
-REGISTRY_ROOT="$DATAHUB_ROOT/metadata-models/src/main/resources"
 SCHEMAS_ROOT="$DATAHUB_ROOT/metadata-events/mxe-schemas/src/mainGeneratedAvroSchema/avro/"
-FILES="$REGISTRY_ROOT/entity-registry.yml $SCHEMAS_ROOT/com/linkedin/mxe/MetadataChangeEvent.avsc"
+ENTITY_REGISTRY="$DATAHUB_ROOT/metadata-models/src/main/resources/entity-registry.yml"
 METADATA_MODEL_DOCS_ROOT="$DATAHUB_ROOT/metadata-models/docs"

-# Since we depend on jq, check if jq is installed
-if ! which jq > /dev/null; then
-    echo "jq is not installed. Please install jq and rerun (https://stedolan.github.io/jq/)"
-    exit 1
-fi
-
-find $SCHEMAS_ROOT -name "*.avsc" | sort | while read file
-do
-    # Add all other files that are aspects but not included in the above
-    if (jq '.Aspect' -e $file > /dev/null)
-    then
-        FILES="${FILES} ${file}"
-    fi
-    echo $FILES > /tmp/docgen_files.txt
-done
-
-FILES=$(cat /tmp/docgen_files.txt)
-
 rm -r $OUTDIR || true

-python scripts/modeldocgen.py $FILES --file generated/docs/metadata_model_mces.json --extra-docs ${METADATA_MODEL_DOCS_ROOT} $@
+python scripts/modeldocgen.py $SCHEMAS_ROOT --registry $ENTITY_REGISTRY --generated-docs-dir $DOCS_OUTDIR --file generated/docs/metadata_model_mces.json --extra-docs ${METADATA_MODEL_DOCS_ROOT} $@

 ## Full version of this command that generates dot files and png files (requires pydot and graphviz)
-# python scripts/modeldocgen.py $FILES --dot generated/docs/metadata_graph.dot --file generated/docs/metadata_model_mces.json --extra-docs ${METADATA_MODEL_DOCS_ROOT} --png generated/docs/metadata_graph.png $@
+# python scripts/modeldocgen.py $SCHEMAS_ROOT --registry $ENTITY_REGISTRY --generated-docs-dir $DOCS_OUTDIR --dot generated/docs/metadata_graph.dot --file generated/docs/metadata_model_mces.json --extra-docs ${METADATA_MODEL_DOCS_ROOT} --png generated/docs/metadata_graph.png $@
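
For reference, with the variables above expanded, the rewritten script reduces to a single invocation; run from the directory containing `scripts/modeldocgen.py` (one level below the repo root, per `DATAHUB_ROOT=..`), it would look roughly like this (illustrative, assuming the layout shown in this diff):

```sh
python scripts/modeldocgen.py \
  ../metadata-events/mxe-schemas/src/mainGeneratedAvroSchema/avro/ \
  --registry ../metadata-models/src/main/resources/entity-registry.yml \
  --generated-docs-dir ../docs/generated/metamodel \
  --file generated/docs/metadata_model_mces.json \
  --extra-docs ../metadata-models/docs
```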