docs: Graph onboarding demo

This commit is contained in:
Kerem Sahin 2020-06-26 01:10:44 -07:00
parent 34d6f4ed09
commit 9501e9bd70
3 changed files with 540 additions and 0 deletions

View File

@ -0,0 +1,268 @@
---
version: '3.5'
services:
mysql:
container_name: mysql
hostname: mysql
image: mysql:5.7
restart: always
command: --character-set-server=utf8mb4 --collation-server=utf8mb4_unicode_ci
environment:
MYSQL_DATABASE: 'datahub'
MYSQL_USER: 'datahub'
MYSQL_PASSWORD: 'datahub'
MYSQL_ROOT_PASSWORD: 'datahub'
ports:
- "3306:3306"
volumes:
- ../mysql/init.sql:/docker-entrypoint-initdb.d/init.sql
- mysqldata:/var/lib/mysql
zookeeper:
image: confluentinc/cp-zookeeper:5.4.0
hostname: zookeeper
container_name: zookeeper
ports:
- "2181:2181"
environment:
ZOOKEEPER_CLIENT_PORT: 2181
ZOOKEEPER_TICK_TIME: 2000
volumes:
- zkdata:/var/opt/zookeeper
broker:
image: confluentinc/cp-kafka:5.4.0
hostname: broker
container_name: broker
depends_on:
- zookeeper
ports:
- "29092:29092"
- "9092:9092"
environment:
KAFKA_BROKER_ID: 1
KAFKA_ZOOKEEPER_CONNECT: 'zookeeper:2181'
KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: PLAINTEXT:PLAINTEXT,PLAINTEXT_HOST:PLAINTEXT
KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://broker:29092,PLAINTEXT_HOST://localhost:9092
KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1
KAFKA_GROUP_INITIAL_REBALANCE_DELAY_MS: 0
kafka-rest-proxy:
image: confluentinc/cp-kafka-rest:5.4.0
hostname: kafka-rest-proxy
container_name: kafka-rest-proxy
ports:
- "8082:8082"
environment:
KAFKA_REST_LISTENERS: http://0.0.0.0:8082/
KAFKA_REST_SCHEMA_REGISTRY_URL: http://schema-registry:8081/
KAFKA_REST_HOST_NAME: kafka-rest-proxy
KAFKA_REST_BOOTSTRAP_SERVERS: PLAINTEXT://broker:29092
depends_on:
- zookeeper
- broker
- schema-registry
kafka-topics-ui:
image: landoop/kafka-topics-ui:0.9.4
hostname: kafka-topics-ui
container_name: kafka-topics-ui
ports:
- "18000:8000"
environment:
KAFKA_REST_PROXY_URL: "http://kafka-rest-proxy:8082/"
PROXY: "true"
depends_on:
- zookeeper
- broker
- schema-registry
- kafka-rest-proxy
# This "container" is a workaround to pre-create topics
kafka-setup:
build:
context: ../kafka
hostname: kafka-setup
container_name: kafka-setup
depends_on:
- broker
- schema-registry
environment:
- KAFKA_ZOOKEEPER_CONNECT=zookeeper:2181
- KAFKA_BOOTSTRAP_SERVER=broker:29092
schema-registry:
image: confluentinc/cp-schema-registry:5.4.0
hostname: schema-registry
container_name: schema-registry
depends_on:
- zookeeper
- broker
ports:
- "8081:8081"
environment:
SCHEMA_REGISTRY_HOST_NAME: schema-registry
SCHEMA_REGISTRY_KAFKASTORE_CONNECTION_URL: 'zookeeper:2181'
schema-registry-ui:
image: landoop/schema-registry-ui:latest
container_name: schema-registry-ui
hostname: schema-registry-ui
ports:
- "8000:8000"
environment:
SCHEMAREGISTRY_URL: 'http://schema-registry:8081'
ALLOW_GLOBAL: 'true'
ALLOW_TRANSITIVE: 'true'
ALLOW_DELETION: 'true'
READONLY_MODE: 'true'
PROXY: 'true'
depends_on:
- schema-registry
elasticsearch:
image: docker.elastic.co/elasticsearch/elasticsearch:5.6.8
container_name: elasticsearch
hostname: elasticsearch
ports:
- "9200:9200"
environment:
- discovery.type=single-node
- xpack.security.enabled=false
- "ES_JAVA_OPTS=-Xms1g -Xmx1g"
volumes:
- esdata:/usr/share/elasticsearch/data
kibana:
image: docker.elastic.co/kibana/kibana:5.6.8
container_name: kibana
hostname: kibana
ports:
- "5601:5601"
environment:
- SERVER_HOST=0.0.0.0
- ELASTICSEARCH_URL=http://elasticsearch:9200
depends_on:
- elasticsearch
neo4j:
image: neo4j:3.5.7
hostname: neo4j
container_name: neo4j
environment:
NEO4J_AUTH: 'neo4j/datahub'
ports:
- "7474:7474"
- "7687:7687"
volumes:
- neo4jdata:/data
# This "container" is a workaround to pre-create search indices
elasticsearch-setup:
build:
context: ../elasticsearch
hostname: elasticsearch-setup
container_name: elasticsearch-setup
depends_on:
- elasticsearch
environment:
- ELASTICSEARCH_HOST=elasticsearch
- ELASTICSEARCH_PORT=9200
datahub-gms:
build:
context: ../../
dockerfile: docker/gms/Dockerfile
hostname: datahub-gms
container_name: datahub-gms
ports:
- "8080:8080"
environment:
- EBEAN_DATASOURCE_USERNAME=datahub
- EBEAN_DATASOURCE_PASSWORD=datahub
- EBEAN_DATASOURCE_HOST=mysql:3306
- EBEAN_DATASOURCE_URL=jdbc:mysql://mysql:3306/datahub?verifyServerCertificate=false&useSSL=true&useUnicode=yes&characterEncoding=UTF-8
- EBEAN_DATASOURCE_DRIVER=com.mysql.jdbc.Driver
- KAFKA_BOOTSTRAP_SERVER=broker:29092
- KAFKA_SCHEMAREGISTRY_URL=http://schema-registry:8081
- ELASTICSEARCH_HOST=elasticsearch
- ELASTICSEARCH_PORT=9200
- NEO4J_HOST=neo4j:7474
- NEO4J_URI=bolt://neo4j
- NEO4J_USERNAME=neo4j
- NEO4J_PASSWORD=datahub
depends_on:
- elasticsearch-setup
- kafka-setup
- mysql
- neo4j
datahub-frontend:
build:
context: ../../
dockerfile: docker/frontend/Dockerfile
hostname: datahub-frontend
container_name: datahub-frontend
ports:
- "9001:9001"
environment:
- DATAHUB_GMS_HOST=datahub-gms
- DATAHUB_GMS_PORT=8080
- DATAHUB_SECRET=YouKnowNothing
- DATAHUB_APP_VERSION=1.0
- DATAHUB_PLAY_MEM_BUFFER_SIZE=10MB
depends_on:
- datahub-gms
datahub-mae-consumer:
build:
context: ../../
dockerfile: docker/mae-consumer/Dockerfile
hostname: datahub-mae-consumer
container_name: datahub-mae-consumer
ports:
- "9091:9091"
environment:
- KAFKA_BOOTSTRAP_SERVER=broker:29092
- KAFKA_SCHEMAREGISTRY_URL=http://schema-registry:8081
- ELASTICSEARCH_HOST=elasticsearch
- ELASTICSEARCH_PORT=9200
- NEO4J_HOST=neo4j:7474
- NEO4J_URI=bolt://neo4j
- NEO4J_USERNAME=neo4j
- NEO4J_PASSWORD=datahub
depends_on:
- kafka-setup
- elasticsearch-setup
- neo4j
command: "sh -c 'while ping -c1 kafka-setup &>/dev/null; do echo waiting for kafka-setup... && sleep 1; done; \
echo kafka-setup done! && /start.sh'"
datahub-mce-consumer:
build:
context: ../../
dockerfile: docker/mce-consumer/Dockerfile
hostname: datahub-mce-consumer
container_name: datahub-mce-consumer
ports:
- "9090:9090"
environment:
- KAFKA_BOOTSTRAP_SERVER=broker:29092
- KAFKA_SCHEMAREGISTRY_URL=http://schema-registry:8081
- GMS_HOST=datahub-gms
- GMS_PORT=8080
depends_on:
- kafka-setup
- datahub-gms
command: "sh -c 'while ping -c1 kafka-setup &>/dev/null; do echo waiting for kafka-setup... && sleep 1; done; \
echo kafka-setup done! && /start.sh'"
networks:
default:
name: datahub_network
volumes:
mysqldata:
esdata:
neo4jdata:
zkdata:

View File

@ -0,0 +1,4 @@
#!/bin/bash
DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
cd $DIR && docker-compose pull && docker-compose -p datahub up

View File

@ -0,0 +1,268 @@
# Onboarding to GMA Graph - Adding a new relationship type
Steps for this already detailed in https://github.com/linkedin/datahub/blob/master/docs/how/graph-onboarding.md
For this exercise, we'll add a new relationship type `FollowedBy` which is extracted out of `Follow` aspect. For that, we first need to add `Follow` aspect.
## 1. Onboard `Follow` aspect
Referring to the guide https://github.com/linkedin/datahub/blob/master/docs/how/add-new-aspect.md
### 1.1 Model new aspect
* Follow.pdl
```
namespace com.linkedin.common
/**
* Follow information of an entity.
*/
record Follow {
/**
* List of followers of an entity.
*/
followers: array[FollowAction]
}
```
* FollowAction.pdl
```
namespace com.linkedin.common
/**
* Follow Action of an entity.
*/
record FollowAction {
/**
* Follower (User or a Group) of an entity
*/
follower: FollowerType
/**
* Audit stamp containing who last modified the record and when.
*/
lastModified: optional AuditStamp
}
```
* FollowerType.pdl
```
namespace com.linkedin.common
/**
* A union of all supported follower types
*/
typeref FollowerType = union[
corpUser: CorpuserUrn,
corpGroup: CorpGroupUrn
]
```
### 1.2 Update aspect union for dataset
```
namespace com.linkedin.metadata.aspect
import com.linkedin.common.Follow
import com.linkedin.common.InstitutionalMemory
import com.linkedin.common.Ownership
import com.linkedin.common.Status
import com.linkedin.dataset.DatasetDeprecation
import com.linkedin.dataset.DatasetProperties
import com.linkedin.dataset.UpstreamLineage
import com.linkedin.schema.SchemaMetadata
/**
* A union of all supported metadata aspects for a Dataset
*/
typeref DatasetAspect = union[
DatasetProperties,
DatasetDeprecation,
Follow,
UpstreamLineage,
InstitutionalMemory,
Ownership,
Status,
SchemaMetadata
]
```
## 2. Create `FollowedBy` relationship
```
namespace com.linkedin.metadata.relationship
/**
* A generic model for the Followed-By relationship
*/
@pairings = [ {
"destination" : "com.linkedin.common.urn.CorpuserUrn",
"source" : "com.linkedin.common.urn.DatasetUrn"
}, {
"destination" : "com.linkedin.common.urn.CorpGroupUrn",
"source" : "com.linkedin.common.urn.DatasetUrn"
} ]
record FollowedBy includes BaseRelationship {
}
```
## 3. Build the repo to generate Java classes for newly added models
```
./gradlew build -Prest.model.compatibility=ignore
```
You can verify that API definitions for /dataset endpoint of GMS as well as MXE schemas are automatically updated to include new model changes.
## 4. Create `FollowedBy` relationship builder from `Follow` aspect
```java
package com.linkedin.metadata.builders.graph.relationship;
import com.linkedin.common.Follow;
import com.linkedin.common.urn.Urn;
import com.linkedin.metadata.builders.graph.GraphBuilder;
import com.linkedin.metadata.relationship.FollowedBy;
import java.util.Collections;
import java.util.List;
import java.util.Objects;
import java.util.stream.Collectors;
import javax.annotation.Nonnull;
import static com.linkedin.metadata.dao.internal.BaseGraphWriterDAO.RemovalOption.*;
public class FollowedByBuilderFromFollow extends BaseRelationshipBuilder<Follow> {
public FollowedByBuilderFromFollow() {
super(Follow.class);
}
@Nonnull
@Override
public <URN extends Urn> List<GraphBuilder.RelationshipUpdates> buildRelationships(@Nonnull URN urn,
@Nonnull Follow follow) {
final List<FollowedBy> followedByList = follow.getFollowers().stream().map(followAction -> {
if (followAction.getFollower().isCorpUser()) {
return new FollowedBy().setSource(urn).setDestination(followAction.getFollower().getCorpUser());
}
if (followAction.getFollower().isCorpGroup()) {
return new FollowedBy().setSource(urn).setDestination(followAction.getFollower().getCorpGroup());
}
return null;
}).filter(Objects::nonNull).collect(Collectors.toList());
return Collections.singletonList(new GraphBuilder.RelationshipUpdates(followedByList, REMOVE_ALL_EDGES_FROM_SOURCE));
}
}
```
## 5. Update set of relationship builders for dataset by adding `FollowedByBuilderFromFollow`
```java
package com.linkedin.metadata.builders.graph;
import com.linkedin.common.urn.DatasetUrn;
import com.linkedin.data.template.RecordTemplate;
import com.linkedin.metadata.builders.graph.relationship.BaseRelationshipBuilder;
import com.linkedin.metadata.builders.graph.relationship.DownstreamOfBuilderFromUpstreamLineage;
import com.linkedin.metadata.builders.graph.relationship.FollowedByBuilderFromFollow;
import com.linkedin.metadata.builders.graph.relationship.OwnedByBuilderFromOwnership;
import com.linkedin.metadata.entity.DatasetEntity;
import com.linkedin.metadata.snapshot.DatasetSnapshot;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import javax.annotation.Nonnull;
public class DatasetGraphBuilder extends BaseGraphBuilder<DatasetSnapshot> {
private static final Set<BaseRelationshipBuilder> RELATIONSHIP_BUILDERS =
Collections.unmodifiableSet(new HashSet<BaseRelationshipBuilder>() {
{
add(new DownstreamOfBuilderFromUpstreamLineage());
add(new FollowedByBuilderFromFollow());
add(new OwnedByBuilderFromOwnership());
}
});
public DatasetGraphBuilder() {
super(DatasetSnapshot.class, RELATIONSHIP_BUILDERS);
}
@Nonnull
@Override
protected List<? extends RecordTemplate> buildEntities(@Nonnull DatasetSnapshot snapshot) {
final DatasetUrn urn = snapshot.getUrn();
final DatasetEntity entity = new DatasetEntity().setUrn(urn)
.setName(urn.getDatasetNameEntity())
.setPlatform(urn.getPlatformEntity())
.setOrigin(urn.getOriginEntity());
setRemovedProperty(snapshot, entity);
return Collections.singletonList(entity);
}
}
```
## 6. Rebuild & restart all containers with new changes
This is all the code change we need to do to enable linking datasets and corp users (groups as well).
Now we can re-build & start all Docker images.
```
./docker/rebuild-all/rebuild-all.sh
```
## 7. That's it. Let's test our new feature!
Let's ingest a user first
```
curl 'http://localhost:8080/corpUsers?action=ingest' -X POST -H 'X-RestLi-Protocol-Version:2.0.0' --data '
{
"snapshot": {
"aspects": [{
"com.linkedin.identity.CorpUserInfo": {
"active": true,
"displayName": "Foo Bar",
"fullName": "Foo Bar",
"email": "fbar@linkedin.com"
}
}],
"urn": "urn:li:corpuser:fbar"
}
}'
```
And now let's ingest a dataset with two aspects: Ownership & Follow
```
curl 'http://localhost:8080/datasets?action=ingest' -X POST -H 'X-RestLi-Protocol-Version:2.0.0' --data '
{
"snapshot": {
"aspects": [{
"com.linkedin.common.Ownership": {
"owners": [{
"owner": "urn:li:corpuser:fbar",
"type": "DATAOWNER"
}],
"lastModified": {
"time": 0,
"actor": "urn:li:corpuser:fbar"
}
}
},
{
"com.linkedin.common.Follow": {
"followers": [{
"follower": {
"corpUser": "urn:li:corpuser:fbar"
}
}]
}
}],
"urn": "urn:li:dataset:(urn:li:dataPlatform:foo,bar,PROD)"
}
}'
```
Ownership aspect will help create an `OwnedBy` edge between user & dataset nodes. That existed already.
Now that we added follow aspect, we'll also be able to see a `FollowedBy` edge between same user & dataset nodes.
You can confirm this by connecting to Neo4j browser on http://localhost:7474/browser