docs: Graph onboarding demo

2025-12-29 02:48:24 +00:00 · 2020-06-26 01:10:44 -07:00 · 2020-06-26 01:10:44 -07:00 · 9501e9bd70
commit 9501e9bd70
parent 34d6f4ed09
3 changed files with 540 additions and 0 deletions
--- a/docker/rebuild-all/docker-compose.yml
+++ b/docker/rebuild-all/docker-compose.yml
@ -0,0 +1,268 @@
+---
+version: '3.5'
+services:
+  mysql:
+    container_name: mysql
+    hostname: mysql
+    image: mysql:5.7
+    restart: always
+    command: --character-set-server=utf8mb4 --collation-server=utf8mb4_unicode_ci
+    environment:
+      MYSQL_DATABASE: 'datahub'
+      MYSQL_USER: 'datahub'
+      MYSQL_PASSWORD: 'datahub'
+      MYSQL_ROOT_PASSWORD: 'datahub'
+    ports:
+      - "3306:3306"
+    volumes:
+      - ../mysql/init.sql:/docker-entrypoint-initdb.d/init.sql
+      - mysqldata:/var/lib/mysql
+
+  zookeeper:
+    image: confluentinc/cp-zookeeper:5.4.0
+    hostname: zookeeper
+    container_name: zookeeper
+    ports:
+      - "2181:2181"
+    environment:
+      ZOOKEEPER_CLIENT_PORT: 2181
+      ZOOKEEPER_TICK_TIME: 2000
+    volumes:
+      - zkdata:/var/opt/zookeeper
+
+  broker:
+    image: confluentinc/cp-kafka:5.4.0
+    hostname: broker
+    container_name: broker
+    depends_on:
+      - zookeeper
+    ports:
+      - "29092:29092"
+      - "9092:9092"
+    environment:
+      KAFKA_BROKER_ID: 1
+      KAFKA_ZOOKEEPER_CONNECT: 'zookeeper:2181'
+      KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: PLAINTEXT:PLAINTEXT,PLAINTEXT_HOST:PLAINTEXT
+      KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://broker:29092,PLAINTEXT_HOST://localhost:9092
+      KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1
+      KAFKA_GROUP_INITIAL_REBALANCE_DELAY_MS: 0
+
+  kafka-rest-proxy:
+    image: confluentinc/cp-kafka-rest:5.4.0
+    hostname: kafka-rest-proxy
+    container_name: kafka-rest-proxy
+    ports:
+      - "8082:8082"
+    environment:
+      KAFKA_REST_LISTENERS: http://0.0.0.0:8082/
+      KAFKA_REST_SCHEMA_REGISTRY_URL: http://schema-registry:8081/
+      KAFKA_REST_HOST_NAME: kafka-rest-proxy
+      KAFKA_REST_BOOTSTRAP_SERVERS: PLAINTEXT://broker:29092
+    depends_on:
+      - zookeeper
+      - broker
+      - schema-registry
+
+  kafka-topics-ui:
+    image: landoop/kafka-topics-ui:0.9.4
+    hostname: kafka-topics-ui
+    container_name: kafka-topics-ui
+    ports:
+      - "18000:8000"
+    environment:
+      KAFKA_REST_PROXY_URL: "http://kafka-rest-proxy:8082/"
+      PROXY: "true"
+    depends_on:
+      - zookeeper
+      - broker
+      - schema-registry
+      - kafka-rest-proxy
+
+  # This "container" is a workaround to pre-create topics
+  kafka-setup:
+    build:
+      context: ../kafka
+    hostname: kafka-setup
+    container_name: kafka-setup
+    depends_on:
+      - broker
+      - schema-registry
+    environment:
+      - KAFKA_ZOOKEEPER_CONNECT=zookeeper:2181
+      - KAFKA_BOOTSTRAP_SERVER=broker:29092
+
+  schema-registry:
+    image: confluentinc/cp-schema-registry:5.4.0
+    hostname: schema-registry
+    container_name: schema-registry
+    depends_on:
+      - zookeeper
+      - broker
+    ports:
+      - "8081:8081"
+    environment:
+      SCHEMA_REGISTRY_HOST_NAME: schema-registry
+      SCHEMA_REGISTRY_KAFKASTORE_CONNECTION_URL: 'zookeeper:2181'
+
+  schema-registry-ui:
+    image: landoop/schema-registry-ui:latest
+    container_name: schema-registry-ui
+    hostname: schema-registry-ui
+    ports:
+      - "8000:8000"
+    environment:
+      SCHEMAREGISTRY_URL: 'http://schema-registry:8081'
+      ALLOW_GLOBAL: 'true'
+      ALLOW_TRANSITIVE: 'true'
+      ALLOW_DELETION: 'true'
+      READONLY_MODE: 'true'
+      PROXY: 'true'
+    depends_on:
+      - schema-registry
+
+  elasticsearch:
+    image: docker.elastic.co/elasticsearch/elasticsearch:5.6.8
+    container_name: elasticsearch
+    hostname: elasticsearch
+    ports:
+      - "9200:9200"
+    environment:
+      - discovery.type=single-node
+      - xpack.security.enabled=false
+      - "ES_JAVA_OPTS=-Xms1g -Xmx1g"
+    volumes:
+      - esdata:/usr/share/elasticsearch/data
+
+  kibana:
+    image: docker.elastic.co/kibana/kibana:5.6.8
+    container_name: kibana
+    hostname: kibana
+    ports:
+      - "5601:5601"
+    environment:
+      - SERVER_HOST=0.0.0.0
+      - ELASTICSEARCH_URL=http://elasticsearch:9200
+    depends_on:
+      - elasticsearch
+
+  neo4j:
+    image: neo4j:3.5.7
+    hostname: neo4j
+    container_name: neo4j
+    environment:
+      NEO4J_AUTH: 'neo4j/datahub'
+    ports:
+      - "7474:7474"
+      - "7687:7687"
+    volumes:
+      - neo4jdata:/data
+
+  # This "container" is a workaround to pre-create search indices
+  elasticsearch-setup:
+    build:
+      context: ../elasticsearch
+    hostname: elasticsearch-setup
+    container_name: elasticsearch-setup
+    depends_on:
+      - elasticsearch
+    environment:
+      - ELASTICSEARCH_HOST=elasticsearch
+      - ELASTICSEARCH_PORT=9200
+
+  datahub-gms:
+    build:
+      context: ../../
+      dockerfile: docker/gms/Dockerfile
+    hostname: datahub-gms
+    container_name: datahub-gms
+    ports:
+      - "8080:8080"
+    environment:
+      - EBEAN_DATASOURCE_USERNAME=datahub
+      - EBEAN_DATASOURCE_PASSWORD=datahub
+      - EBEAN_DATASOURCE_HOST=mysql:3306
+      - EBEAN_DATASOURCE_URL=jdbc:mysql://mysql:3306/datahub?verifyServerCertificate=false&useSSL=true&useUnicode=yes&characterEncoding=UTF-8
+      - EBEAN_DATASOURCE_DRIVER=com.mysql.jdbc.Driver
+      - KAFKA_BOOTSTRAP_SERVER=broker:29092
+      - KAFKA_SCHEMAREGISTRY_URL=http://schema-registry:8081
+      - ELASTICSEARCH_HOST=elasticsearch
+      - ELASTICSEARCH_PORT=9200
+      - NEO4J_HOST=neo4j:7474
+      - NEO4J_URI=bolt://neo4j
+      - NEO4J_USERNAME=neo4j
+      - NEO4J_PASSWORD=datahub
+    depends_on:
+      - elasticsearch-setup
+      - kafka-setup
+      - mysql
+      - neo4j
+
+  datahub-frontend:
+    build:
+      context: ../../
+      dockerfile: docker/frontend/Dockerfile
+    hostname: datahub-frontend
+    container_name: datahub-frontend
+    ports:
+      - "9001:9001"
+    environment:
+      - DATAHUB_GMS_HOST=datahub-gms
+      - DATAHUB_GMS_PORT=8080
+      - DATAHUB_SECRET=YouKnowNothing
+      - DATAHUB_APP_VERSION=1.0
+      - DATAHUB_PLAY_MEM_BUFFER_SIZE=10MB
+    depends_on:
+      - datahub-gms
+
+  datahub-mae-consumer:
+    build:
+      context: ../../
+      dockerfile: docker/mae-consumer/Dockerfile
+    hostname: datahub-mae-consumer
+    container_name: datahub-mae-consumer
+    ports:
+        - "9091:9091"
+    environment:
+      - KAFKA_BOOTSTRAP_SERVER=broker:29092
+      - KAFKA_SCHEMAREGISTRY_URL=http://schema-registry:8081
+      - ELASTICSEARCH_HOST=elasticsearch
+      - ELASTICSEARCH_PORT=9200
+      - NEO4J_HOST=neo4j:7474
+      - NEO4J_URI=bolt://neo4j
+      - NEO4J_USERNAME=neo4j
+      - NEO4J_PASSWORD=datahub
+    depends_on:
+      - kafka-setup
+      - elasticsearch-setup
+      - neo4j
+    command: "sh -c 'while ping -c1 kafka-setup &>/dev/null; do echo waiting for kafka-setup... && sleep 1; done; \
+                  echo kafka-setup done! && /start.sh'"
+
+  datahub-mce-consumer:
+    build:
+      context: ../../
+      dockerfile: docker/mce-consumer/Dockerfile
+    hostname: datahub-mce-consumer
+    container_name: datahub-mce-consumer
+    ports:
+      - "9090:9090"
+    environment:
+      - KAFKA_BOOTSTRAP_SERVER=broker:29092
+      - KAFKA_SCHEMAREGISTRY_URL=http://schema-registry:8081
+      - GMS_HOST=datahub-gms
+      - GMS_PORT=8080
+    depends_on:
+      - kafka-setup
+      - datahub-gms
+    command: "sh -c 'while ping -c1 kafka-setup &>/dev/null; do echo waiting for kafka-setup... && sleep 1; done; \
+                  echo kafka-setup done! && /start.sh'"
+
+networks:
+  default:
+    name: datahub_network
+
+volumes:
+  mysqldata:
+  esdata:
+  neo4jdata:
+  zkdata:
--- a/docker/rebuild-all/rebuild-all.sh
+++ b/docker/rebuild-all/rebuild-all.sh
@ -0,0 +1,4 @@
+#!/bin/bash
+
+DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
+cd $DIR && docker-compose pull && docker-compose -p datahub up
--- a/docs/demo/graph-onboarding.md
+++ b/docs/demo/graph-onboarding.md
@ -0,0 +1,268 @@
+# Onboarding to GMA Graph - Adding a new relationship type
+
+Steps for this already detailed in https://github.com/linkedin/datahub/blob/master/docs/how/graph-onboarding.md
+
+For this exercise, we'll add a new relationship type `FollowedBy` which is extracted out of `Follow` aspect. For that, we first need to add `Follow` aspect.
+
+## 1. Onboard `Follow` aspect
+Referring to the guide https://github.com/linkedin/datahub/blob/master/docs/how/add-new-aspect.md
+
+### 1.1 Model new aspect
+* Follow.pdl
+```
+namespace com.linkedin.common
+
+/**
+ * Follow information of an entity.
+ */
+record Follow {
+
+  /**
+   * List of followers of an entity.
+   */
+  followers: array[FollowAction]
+}
+```
+
+* FollowAction.pdl
+```
+namespace com.linkedin.common
+
+/**
+ * Follow Action of an entity.
+ */
+record FollowAction {
+
+  /**
+   * Follower (User or a Group) of an entity
+   */
+  follower: FollowerType
+
+  /**
+   * Audit stamp containing who last modified the record and when.
+   */
+  lastModified: optional AuditStamp
+}
+```
+
+* FollowerType.pdl
+```
+namespace com.linkedin.common
+
+/**
+ * A union of all supported follower types
+ */
+typeref FollowerType = union[
+  corpUser: CorpuserUrn,
+  corpGroup: CorpGroupUrn
+]
+```
+
+### 1.2 Update aspect union for dataset
+```
+namespace com.linkedin.metadata.aspect
+
+import com.linkedin.common.Follow
+import com.linkedin.common.InstitutionalMemory
+import com.linkedin.common.Ownership
+import com.linkedin.common.Status
+import com.linkedin.dataset.DatasetDeprecation
+import com.linkedin.dataset.DatasetProperties
+import com.linkedin.dataset.UpstreamLineage
+import com.linkedin.schema.SchemaMetadata
+
+/**
+ * A union of all supported metadata aspects for a Dataset
+ */
+typeref DatasetAspect = union[
+  DatasetProperties,
+  DatasetDeprecation,
+  Follow,
+  UpstreamLineage,
+  InstitutionalMemory,
+  Ownership,
+  Status,
+  SchemaMetadata
+]
+```
+
+## 2. Create `FollowedBy` relationship
+```
+namespace com.linkedin.metadata.relationship
+
+/**
+ * A generic model for the Followed-By relationship
+ */
+@pairings = [ {
+  "destination" : "com.linkedin.common.urn.CorpuserUrn",
+  "source" : "com.linkedin.common.urn.DatasetUrn"
+}, {
+  "destination" : "com.linkedin.common.urn.CorpGroupUrn",
+  "source" : "com.linkedin.common.urn.DatasetUrn"
+} ]
+record FollowedBy includes BaseRelationship {
+}
+```
+
+## 3. Build the repo to generate Java classes for newly added models
+```
+./gradlew build -Prest.model.compatibility=ignore
+```
+
+You can verify that API definitions for /dataset endpoint of GMS as well as MXE schemas are automatically updated to include new model changes.
+
+## 4. Create `FollowedBy` relationship builder from `Follow` aspect
+```java
+package com.linkedin.metadata.builders.graph.relationship;
+
+import com.linkedin.common.Follow;
+import com.linkedin.common.urn.Urn;
+import com.linkedin.metadata.builders.graph.GraphBuilder;
+import com.linkedin.metadata.relationship.FollowedBy;
+import java.util.Collections;
+import java.util.List;
+import java.util.Objects;
+import java.util.stream.Collectors;
+import javax.annotation.Nonnull;
+
+import static com.linkedin.metadata.dao.internal.BaseGraphWriterDAO.RemovalOption.*;
+
+
+public class FollowedByBuilderFromFollow extends BaseRelationshipBuilder<Follow> {
+
+  public FollowedByBuilderFromFollow() {
+    super(Follow.class);
+  }
+
+  @Nonnull
+  @Override
+  public <URN extends Urn> List<GraphBuilder.RelationshipUpdates> buildRelationships(@Nonnull URN urn,
+      @Nonnull Follow follow) {
+    final List<FollowedBy> followedByList = follow.getFollowers().stream().map(followAction -> {
+      if (followAction.getFollower().isCorpUser()) {
+        return new FollowedBy().setSource(urn).setDestination(followAction.getFollower().getCorpUser());
+      }
+      if (followAction.getFollower().isCorpGroup()) {
+        return new FollowedBy().setSource(urn).setDestination(followAction.getFollower().getCorpGroup());
+      }
+      return null;
+    }).filter(Objects::nonNull).collect(Collectors.toList());
+
+    return Collections.singletonList(new GraphBuilder.RelationshipUpdates(followedByList, REMOVE_ALL_EDGES_FROM_SOURCE));
+  }
+}
+```
+
+## 5. Update set of relationship builders for dataset by adding `FollowedByBuilderFromFollow`
+
+```java
+package com.linkedin.metadata.builders.graph;
+
+import com.linkedin.common.urn.DatasetUrn;
+import com.linkedin.data.template.RecordTemplate;
+import com.linkedin.metadata.builders.graph.relationship.BaseRelationshipBuilder;
+import com.linkedin.metadata.builders.graph.relationship.DownstreamOfBuilderFromUpstreamLineage;
+import com.linkedin.metadata.builders.graph.relationship.FollowedByBuilderFromFollow;
+import com.linkedin.metadata.builders.graph.relationship.OwnedByBuilderFromOwnership;
+import com.linkedin.metadata.entity.DatasetEntity;
+import com.linkedin.metadata.snapshot.DatasetSnapshot;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+import javax.annotation.Nonnull;
+
+public class DatasetGraphBuilder extends BaseGraphBuilder<DatasetSnapshot> {
+  private static final Set<BaseRelationshipBuilder> RELATIONSHIP_BUILDERS =
+      Collections.unmodifiableSet(new HashSet<BaseRelationshipBuilder>() {
+        {
+          add(new DownstreamOfBuilderFromUpstreamLineage());
+          add(new FollowedByBuilderFromFollow());
+          add(new OwnedByBuilderFromOwnership());
+        }
+      });
+
+  public DatasetGraphBuilder() {
+    super(DatasetSnapshot.class, RELATIONSHIP_BUILDERS);
+  }
+
+  @Nonnull
+  @Override
+  protected List<? extends RecordTemplate> buildEntities(@Nonnull DatasetSnapshot snapshot) {
+    final DatasetUrn urn = snapshot.getUrn();
+    final DatasetEntity entity = new DatasetEntity().setUrn(urn)
+        .setName(urn.getDatasetNameEntity())
+        .setPlatform(urn.getPlatformEntity())
+        .setOrigin(urn.getOriginEntity());
+
+    setRemovedProperty(snapshot, entity);
+
+    return Collections.singletonList(entity);
+  }
+}
+```
+
+## 6. Rebuild & restart all containers with new changes
+
+This is all the code change we need to do to enable linking datasets and corp users (groups as well).
+Now we can re-build & start all Docker images.
+
+```
+./docker/rebuild-all/rebuild-all.sh
+```
+
+## 7. That's it. Let's test our new feature!
+
+Let's ingest a user first
+```
+curl 'http://localhost:8080/corpUsers?action=ingest' -X POST -H 'X-RestLi-Protocol-Version:2.0.0' --data '
+{
+  "snapshot": {
+    "aspects": [{
+      "com.linkedin.identity.CorpUserInfo": {
+        "active": true,
+        "displayName": "Foo Bar",
+        "fullName": "Foo Bar",
+        "email": "fbar@linkedin.com"
+      }
+    }],
+    "urn": "urn:li:corpuser:fbar"
+  }
+}'
+```
+
+And now let's ingest a dataset with two aspects: Ownership & Follow
+```
+curl 'http://localhost:8080/datasets?action=ingest' -X POST -H 'X-RestLi-Protocol-Version:2.0.0' --data '
+{
+  "snapshot": {
+    "aspects": [{
+      "com.linkedin.common.Ownership": {
+        "owners": [{
+            "owner": "urn:li:corpuser:fbar",
+            "type": "DATAOWNER"
+        }],
+        "lastModified": {
+            "time": 0,
+            "actor": "urn:li:corpuser:fbar"
+        }
+      }
+    },
+    {
+      "com.linkedin.common.Follow": {
+        "followers": [{
+          "follower": {
+            "corpUser": "urn:li:corpuser:fbar"
+          }
+        }]
+      }
+    }],
+    "urn": "urn:li:dataset:(urn:li:dataPlatform:foo,bar,PROD)"
+  }
+}'
+```
+
+Ownership aspect will help create an `OwnedBy` edge between user & dataset nodes. That existed already.
+Now that we added follow aspect, we'll also be able to see a `FollowedBy` edge between same user & dataset nodes.
+
+You can confirm this by connecting to Neo4j browser on http://localhost:7474/browser