From 8cfcefb19c79db006c1d3c6ae526c99b89a43df8 Mon Sep 17 00:00:00 2001
From: Tamas Nemeth
Date: Sat, 5 Nov 2022 19:37:50 +0100
Subject: [PATCH] fix(spark-lineage): smoke test fixes, M1 support (#6372)

---
 .../docker/SparkBase.Dockerfile               | 21 ++++++++++----
 .../spark-smoke-test/docker/build_images.sh   |  2 +-
 .../docker/spark-docker-compose.yml           |  1 -
 .../golden_json/JavaHdfsIn2HdfsOut1.json      |  8 +++---
 .../golden_json/JavaHdfsIn2HdfsOut2.json      |  6 ++--
 .../JavaHdfsIn2HiveCreateInsertTable.json     | 12 ++++----
 .../JavaHdfsIn2HiveCreateTable.json           |  6 ++--
 .../golden_json/JavaHiveInHiveOut.json        |  6 ++--
 .../golden_json/PythonHdfsIn2HdfsOut1.json    |  6 ++--
 .../golden_json/PythonHdfsIn2HdfsOut2.json    |  6 ++--
 .../PythonHdfsIn2HiveCreateInsertTable.json   | 12 ++++----
 .../PythonHdfsIn2HiveCreateTable.json         |  4 +--
 .../golden_json/PythonHiveInHiveOut.json      |  4 +--
 .../python_test_run.sh                        | 12 ++++----
 .../setup_spark_smoke_test.sh                 |  6 +++-
 .../spark-lineage/spark-smoke-test/smoke.sh   | 28 ++++++++++++++-----
 .../test-spark-lineage/java_test_run.sh       | 12 ++++----
 .../spark-smoke-test/test_e2e.py              | 13 ++++++++-
 18 files changed, 102 insertions(+), 63 deletions(-)

diff --git a/metadata-integration/java/spark-lineage/spark-smoke-test/docker/SparkBase.Dockerfile b/metadata-integration/java/spark-lineage/spark-smoke-test/docker/SparkBase.Dockerfile
index f0095d67f4..119338be6c 100644
--- a/metadata-integration/java/spark-lineage/spark-smoke-test/docker/SparkBase.Dockerfile
+++ b/metadata-integration/java/spark-lineage/spark-smoke-test/docker/SparkBase.Dockerfile
@@ -1,4 +1,4 @@
-FROM rappdw/docker-java-python:openjdk1.8.0_171-python3.6.6
+FROM python:3.9
 
 ARG shared_workspace=/opt/workspace
 
@@ -7,21 +7,32 @@ ENV SHARED_WORKSPACE=${shared_workspace}
 
 # -- Layer: Apache Spark
 
-ARG spark_version=2.4.8
+ARG spark_version=3.2.0
 ARG hadoop_version=2.7
 
 RUN apt-get update -y && \
-    apt-get install -y curl && \
+    apt-get install -y --no-install-recommends curl gnupg software-properties-common && \
+    apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv-keys 0xB1998361219BD9C9 && \
+    curl https://cdn.azul.com/zulu/bin/zulu-repo_1.0.0-3_all.deb -o /tmp/zulu-repo_1.0.0-3_all.deb && \
+    apt-get install /tmp/zulu-repo_1.0.0-3_all.deb && \
+    apt-get update && \
+# apt-cache search zulu && \
+    apt-get install -y --no-install-recommends zulu11-jre && \
+    apt-get clean && \
     curl -sS https://archive.apache.org/dist/spark/spark-${spark_version}/spark-${spark_version}-bin-hadoop${hadoop_version}.tgz -o spark.tgz && \
     tar -xf spark.tgz && \
     mv spark-${spark_version}-bin-hadoop${hadoop_version} /usr/bin/ && \
     mkdir /usr/bin/spark-${spark_version}-bin-hadoop${hadoop_version}/logs && \
-    rm spark.tgz
+    rm spark.tgz && \
+    rm -rf /var/tmp/* /tmp/* /var/lib/apt/lists/*
+
+RUN set -e; \
+    pip install JPype1
 
 ENV SPARK_HOME /usr/bin/spark-${spark_version}-bin-hadoop${hadoop_version}
 ENV SPARK_MASTER_HOST spark-master
 ENV SPARK_MASTER_PORT 7077
-ENV PYSPARK_PYTHON python2.7
+ENV PYSPARK_PYTHON python3.9
 ENV PATH=$PATH:$SPARK_HOME/bin
 
 COPY workspace $SHARED_WORKSPACE
diff --git a/metadata-integration/java/spark-lineage/spark-smoke-test/docker/build_images.sh b/metadata-integration/java/spark-lineage/spark-smoke-test/docker/build_images.sh
index 863f493214..a1edcdaf41 100755
--- a/metadata-integration/java/spark-lineage/spark-smoke-test/docker/build_images.sh
+++ b/metadata-integration/java/spark-lineage/spark-smoke-test/docker/build_images.sh
@@ -1,4 +1,4 @@
-
+#!/bin/bash -xe
 
 #Remove old configuration
 rm -rf workspace
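A note on the SparkBase.Dockerfile change above: the old rappdw/docker-java-python base image is published for amd64 only, which appears to be what broke this smoke test on Apple M1 machines. The new python:3.9 base is multi-arch, and a Java 11 runtime is layered on top from Azul's Zulu apt repository, which ships arm64 builds. A minimal sanity check of the rebuilt image could look like the following; the spark-base tag is an illustrative assumption, not a name taken from this patch.

    # assuming build_images.sh tagged the base image spark-base (hypothetical tag)
    docker run --rm spark-base java -version       # expect Zulu OpenJDK 11
    docker run --rm spark-base python3 --version   # expect Python 3.9.x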
diff --git a/metadata-integration/java/spark-lineage/spark-smoke-test/docker/spark-docker-compose.yml b/metadata-integration/java/spark-lineage/spark-smoke-test/docker/spark-docker-compose.yml
index e225332ea8..865c5dee0f 100644
--- a/metadata-integration/java/spark-lineage/spark-smoke-test/docker/spark-docker-compose.yml
+++ b/metadata-integration/java/spark-lineage/spark-smoke-test/docker/spark-docker-compose.yml
@@ -1,5 +1,4 @@
 version: "3.6"
-
 services:
   spark-master:
     image: spark-master
diff --git a/metadata-integration/java/spark-lineage/spark-smoke-test/golden_json/JavaHdfsIn2HdfsOut1.json b/metadata-integration/java/spark-lineage/spark-smoke-test/golden_json/JavaHdfsIn2HdfsOut1.json
index 754c4f59ac..6a47211972 100644
--- a/metadata-integration/java/spark-lineage/spark-smoke-test/golden_json/JavaHdfsIn2HdfsOut1.json
+++ b/metadata-integration/java/spark-lineage/spark-smoke-test/golden_json/JavaHdfsIn2HdfsOut1.json
@@ -76,11 +76,11 @@
       {
         "com.linkedin.datajob.DataJobInputOutput": {
           "inputDatasets": [
-            "urn:li:dataset:(urn:li:dataPlatform:hdfs,file:/opt/workspace/resources/data/in1.csv,PROD)",
-            "urn:li:dataset:(urn:li:dataPlatform:hdfs,file:/opt/workspace/resources/data/in2.csv,PROD)"
+            "urn:li:dataset:(urn:li:dataPlatform:file,file:/opt/workspace/resources/data/in1.csv,PROD)",
+            "urn:li:dataset:(urn:li:dataPlatform:file,file:/opt/workspace/resources/data/in2.csv,PROD)"
           ],
           "outputDatasets": [
-            "urn:li:dataset:(urn:li:dataPlatform:hdfs,file:/opt/workspace/resources/data/JavaHdfsIn2HdfsOut1/out.csv,PROD)"
+            "urn:li:dataset:(urn:li:dataPlatform:file,file:/opt/workspace/resources/data/JavaHdfsIn2HdfsOut1/out.csv,PROD)"
           ]
         }
       }
@@ -88,4 +88,4 @@
       }
     }
   }
-}
\ No newline at end of file
+}
diff --git a/metadata-integration/java/spark-lineage/spark-smoke-test/golden_json/JavaHdfsIn2HdfsOut2.json b/metadata-integration/java/spark-lineage/spark-smoke-test/golden_json/JavaHdfsIn2HdfsOut2.json
index b64aaf5c87..bc73d93364 100644
--- a/metadata-integration/java/spark-lineage/spark-smoke-test/golden_json/JavaHdfsIn2HdfsOut2.json
+++ b/metadata-integration/java/spark-lineage/spark-smoke-test/golden_json/JavaHdfsIn2HdfsOut2.json
@@ -62,11 +62,11 @@
       {
         "com.linkedin.datajob.DataJobInputOutput": {
           "inputDatasets": [
-            "urn:li:dataset:(urn:li:dataPlatform:hdfs,file:/opt/workspace/resources/data/in1.csv,PROD)",
-            "urn:li:dataset:(urn:li:dataPlatform:hdfs,file:/opt/workspace/resources/data/in2.csv,PROD)"
+            "urn:li:dataset:(urn:li:dataPlatform:file,file:/opt/workspace/resources/data/in1.csv,PROD)",
+            "urn:li:dataset:(urn:li:dataPlatform:file,file:/opt/workspace/resources/data/in2.csv,PROD)"
           ],
           "outputDatasets": [
-            "urn:li:dataset:(urn:li:dataPlatform:hdfs,file:/opt/workspace/resources/data/JavaHdfsIn2HdfsOut2/out.csv,PROD)"
+            "urn:li:dataset:(urn:li:dataPlatform:file,file:/opt/workspace/resources/data/JavaHdfsIn2HdfsOut2/out.csv,PROD)"
           ]
         }
       },
"urn:li:dataset:(urn:li:dataPlatform:hdfs,file:/opt/workspace/resources/data/in2.csv,PROD)" + "urn:li:dataset:(urn:li:dataPlatform:file,file:/opt/workspace/resources/data/in1.csv,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:file,file:/opt/workspace/resources/data/in2.csv,PROD)" ], "outputDatasets": [ "urn:li:dataset:(urn:li:dataPlatform:hive,JavaHdfsIn2HiveCreateInsertTable.foo4,PROD)" @@ -114,8 +114,8 @@ { "com.linkedin.datajob.DataJobInputOutput": { "inputDatasets": [ - "urn:li:dataset:(urn:li:dataPlatform:hdfs,file:/opt/workspace/resources/data/in1.csv,PROD)", - "urn:li:dataset:(urn:li:dataPlatform:hdfs,file:/opt/workspace/resources/data/in2.csv,PROD)" + "urn:li:dataset:(urn:li:dataPlatform:file,file:/opt/workspace/resources/data/in1.csv,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:file,file:/opt/workspace/resources/data/in2.csv,PROD)" ], "outputDatasets": [ "urn:li:dataset:(urn:li:dataPlatform:hive,javahdfsin2hivecreateinserttable.foo4,PROD)" @@ -179,8 +179,8 @@ { "com.linkedin.datajob.DataJobInputOutput": { "inputDatasets": [ - "urn:li:dataset:(urn:li:dataPlatform:hdfs,file:/opt/workspace/resources/data/in1.csv,PROD)", - "urn:li:dataset:(urn:li:dataPlatform:hdfs,file:/opt/workspace/resources/data/in2.csv,PROD)" + "urn:li:dataset:(urn:li:dataPlatform:file,file:/opt/workspace/resources/data/in1.csv,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:file,file:/opt/workspace/resources/data/in2.csv,PROD)" ], "outputDatasets": [ "urn:li:dataset:(urn:li:dataPlatform:hive,javahdfsin2hivecreateinserttable.foo4,PROD)" diff --git a/metadata-integration/java/spark-lineage/spark-smoke-test/golden_json/JavaHdfsIn2HiveCreateTable.json b/metadata-integration/java/spark-lineage/spark-smoke-test/golden_json/JavaHdfsIn2HiveCreateTable.json index 2d1d8930c8..7bc876a4f3 100644 --- a/metadata-integration/java/spark-lineage/spark-smoke-test/golden_json/JavaHdfsIn2HiveCreateTable.json +++ b/metadata-integration/java/spark-lineage/spark-smoke-test/golden_json/JavaHdfsIn2HiveCreateTable.json @@ -76,11 +76,11 @@ { "com.linkedin.datajob.DataJobInputOutput": { "inputDatasets": [ - "urn:li:dataset:(urn:li:dataPlatform:hdfs,file:/opt/workspace/resources/data/in1.csv,PROD)", - "urn:li:dataset:(urn:li:dataPlatform:hdfs,file:/opt/workspace/resources/data/in2.csv,PROD)" + "urn:li:dataset:(urn:li:dataPlatform:file,file:/opt/workspace/resources/data/in1.csv,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:file,file:/opt/workspace/resources/data/in2.csv,PROD)" ], "outputDatasets": [ - "urn:li:dataset:(urn:li:dataPlatform:hive,JavaHdfsIn2HiveCreateTable.foo3,PROD)" + "urn:li:dataset:(urn:li:dataPlatform:file,JavaHdfsIn2HiveCreateTable.foo3,PROD)" ] } } diff --git a/metadata-integration/java/spark-lineage/spark-smoke-test/golden_json/JavaHiveInHiveOut.json b/metadata-integration/java/spark-lineage/spark-smoke-test/golden_json/JavaHiveInHiveOut.json index b999b07c4f..8cbf34dae1 100644 --- a/metadata-integration/java/spark-lineage/spark-smoke-test/golden_json/JavaHiveInHiveOut.json +++ b/metadata-integration/java/spark-lineage/spark-smoke-test/golden_json/JavaHiveInHiveOut.json @@ -62,11 +62,11 @@ { "com.linkedin.datajob.DataJobInputOutput": { "inputDatasets": [ - "urn:li:dataset:(urn:li:dataPlatform:hdfs,file:/opt/workspace/resources/data/in1.csv,PROD)", - "urn:li:dataset:(urn:li:dataPlatform:hdfs,file:/opt/workspace/resources/data/in2.csv,PROD)" + "urn:li:dataset:(urn:li:dataPlatform:file,file:/opt/workspace/resources/data/in1.csv,PROD)", + 
"urn:li:dataset:(urn:li:dataPlatform:file,file:/opt/workspace/resources/data/in2.csv,PROD)" ], "outputDatasets": [ - "urn:li:dataset:(urn:li:dataPlatform:hive,JavaHiveInHiveOut.foo5,PROD)" + "urn:li:dataset:(urn:li:dataPlatform:file,JavaHiveInHiveOut.foo5,PROD)" ] } }, diff --git a/metadata-integration/java/spark-lineage/spark-smoke-test/golden_json/PythonHdfsIn2HdfsOut1.json b/metadata-integration/java/spark-lineage/spark-smoke-test/golden_json/PythonHdfsIn2HdfsOut1.json index b7ea7abf73..c6937ee3d8 100644 --- a/metadata-integration/java/spark-lineage/spark-smoke-test/golden_json/PythonHdfsIn2HdfsOut1.json +++ b/metadata-integration/java/spark-lineage/spark-smoke-test/golden_json/PythonHdfsIn2HdfsOut1.json @@ -50,11 +50,11 @@ { "com.linkedin.datajob.DataJobInputOutput": { "inputDatasets": [ - "urn:li:dataset:(urn:li:dataPlatform:hdfs,file:/opt/workspace/resources/data/in1.csv,PROD)", - "urn:li:dataset:(urn:li:dataPlatform:hdfs,file:/opt/workspace/resources/data/in2.csv,PROD)" + "urn:li:dataset:(urn:li:dataPlatform:file,file:/opt/workspace/resources/data/in1.csv,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:file,file:/opt/workspace/resources/data/in2.csv,PROD)" ], "outputDatasets": [ - "urn:li:dataset:(urn:li:dataPlatform:hdfs,file:/opt/workspace/resources/data/PythonHdfsIn2HdfsOut1/out.csv,PROD)" + "urn:li:dataset:(urn:li:dataPlatform:file,file:/opt/workspace/resources/data/PythonHdfsIn2HdfsOut1/out.csv,PROD)" ] } }, diff --git a/metadata-integration/java/spark-lineage/spark-smoke-test/golden_json/PythonHdfsIn2HdfsOut2.json b/metadata-integration/java/spark-lineage/spark-smoke-test/golden_json/PythonHdfsIn2HdfsOut2.json index 146bafe322..2c0b699503 100644 --- a/metadata-integration/java/spark-lineage/spark-smoke-test/golden_json/PythonHdfsIn2HdfsOut2.json +++ b/metadata-integration/java/spark-lineage/spark-smoke-test/golden_json/PythonHdfsIn2HdfsOut2.json @@ -102,11 +102,11 @@ { "com.linkedin.datajob.DataJobInputOutput": { "inputDatasets": [ - "urn:li:dataset:(urn:li:dataPlatform:hdfs,file:/opt/workspace/resources/data/in1.csv,PROD)", - "urn:li:dataset:(urn:li:dataPlatform:hdfs,file:/opt/workspace/resources/data/in2.csv,PROD)" + "urn:li:dataset:(urn:li:dataPlatform:file,file:/opt/workspace/resources/data/in1.csv,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:file,file:/opt/workspace/resources/data/in2.csv,PROD)" ], "outputDatasets": [ - "urn:li:dataset:(urn:li:dataPlatform:hdfs,file:/opt/workspace/resources/data/PythonHdfsIn2HdfsOut2/out2.csv,PROD)" + "urn:li:dataset:(urn:li:dataPlatform:file,file:/opt/workspace/resources/data/PythonHdfsIn2HdfsOut2/out2.csv,PROD)" ] } }, diff --git a/metadata-integration/java/spark-lineage/spark-smoke-test/golden_json/PythonHdfsIn2HiveCreateInsertTable.json b/metadata-integration/java/spark-lineage/spark-smoke-test/golden_json/PythonHdfsIn2HiveCreateInsertTable.json index 34d751486d..62e70ec00f 100644 --- a/metadata-integration/java/spark-lineage/spark-smoke-test/golden_json/PythonHdfsIn2HiveCreateInsertTable.json +++ b/metadata-integration/java/spark-lineage/spark-smoke-test/golden_json/PythonHdfsIn2HiveCreateInsertTable.json @@ -55,8 +55,8 @@ { "com.linkedin.datajob.DataJobInputOutput": { "inputDatasets": [ - "urn:li:dataset:(urn:li:dataPlatform:hdfs,file:/opt/workspace/resources/data/in1.csv,PROD)", - "urn:li:dataset:(urn:li:dataPlatform:hdfs,file:/opt/workspace/resources/data/in2.csv,PROD)" + "urn:li:dataset:(urn:li:dataPlatform:file,file:/opt/workspace/resources/data/in1.csv,PROD)", + 
"urn:li:dataset:(urn:li:dataPlatform:file,file:/opt/workspace/resources/data/in2.csv,PROD)" ], "outputDatasets": [ "urn:li:dataset:(urn:li:dataPlatform:hive,PythonHdfsIn2HiveCreateInsertTable.foo4,PROD)" @@ -123,8 +123,8 @@ { "com.linkedin.datajob.DataJobInputOutput": { "inputDatasets": [ - "urn:li:dataset:(urn:li:dataPlatform:hdfs,file:/opt/workspace/resources/data/in1.csv,PROD)", - "urn:li:dataset:(urn:li:dataPlatform:hdfs,file:/opt/workspace/resources/data/in2.csv,PROD)" + "urn:li:dataset:(urn:li:dataPlatform:file,file:/opt/workspace/resources/data/in1.csv,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:file,file:/opt/workspace/resources/data/in2.csv,PROD)" ], "outputDatasets": [ "urn:li:dataset:(urn:li:dataPlatform:hive,pythonhdfsin2hivecreateinserttable.foo4,PROD)" @@ -154,8 +154,8 @@ { "com.linkedin.datajob.DataJobInputOutput": { "inputDatasets": [ - "urn:li:dataset:(urn:li:dataPlatform:hdfs,file:/opt/workspace/resources/data/in1.csv,PROD)", - "urn:li:dataset:(urn:li:dataPlatform:hdfs,file:/opt/workspace/resources/data/in2.csv,PROD)" + "urn:li:dataset:(urn:li:dataPlatform:file,file:/opt/workspace/resources/data/in1.csv,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:file,file:/opt/workspace/resources/data/in2.csv,PROD)" ], "outputDatasets": [ "urn:li:dataset:(urn:li:dataPlatform:hive,pythonhdfsin2hivecreateinserttable.foo4,PROD)" diff --git a/metadata-integration/java/spark-lineage/spark-smoke-test/golden_json/PythonHdfsIn2HiveCreateTable.json b/metadata-integration/java/spark-lineage/spark-smoke-test/golden_json/PythonHdfsIn2HiveCreateTable.json index dd206c4ce5..78c258d819 100644 --- a/metadata-integration/java/spark-lineage/spark-smoke-test/golden_json/PythonHdfsIn2HiveCreateTable.json +++ b/metadata-integration/java/spark-lineage/spark-smoke-test/golden_json/PythonHdfsIn2HiveCreateTable.json @@ -76,8 +76,8 @@ { "com.linkedin.datajob.DataJobInputOutput": { "inputDatasets": [ - "urn:li:dataset:(urn:li:dataPlatform:hdfs,file:/opt/workspace/resources/data/in1.csv,PROD)", - "urn:li:dataset:(urn:li:dataPlatform:hdfs,file:/opt/workspace/resources/data/in2.csv,PROD)" + "urn:li:dataset:(urn:li:dataPlatform:file,file:/opt/workspace/resources/data/in1.csv,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:file,file:/opt/workspace/resources/data/in2.csv,PROD)" ], "outputDatasets": [ "urn:li:dataset:(urn:li:dataPlatform:hive,PythonHdfsIn2HiveCreateTable.foo3,PROD)" diff --git a/metadata-integration/java/spark-lineage/spark-smoke-test/golden_json/PythonHiveInHiveOut.json b/metadata-integration/java/spark-lineage/spark-smoke-test/golden_json/PythonHiveInHiveOut.json index ff9c31633c..09e9eabef4 100644 --- a/metadata-integration/java/spark-lineage/spark-smoke-test/golden_json/PythonHiveInHiveOut.json +++ b/metadata-integration/java/spark-lineage/spark-smoke-test/golden_json/PythonHiveInHiveOut.json @@ -164,8 +164,8 @@ { "com.linkedin.datajob.DataJobInputOutput": { "inputDatasets": [ - "urn:li:dataset:(urn:li:dataPlatform:hdfs,file:/opt/workspace/resources/data/in1.csv,PROD)", - "urn:li:dataset:(urn:li:dataPlatform:hdfs,file:/opt/workspace/resources/data/in2.csv,PROD)" + "urn:li:dataset:(urn:li:dataPlatform:file,file:/opt/workspace/resources/data/in1.csv,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:file,file:/opt/workspace/resources/data/in2.csv,PROD)" ], "outputDatasets": [ "urn:li:dataset:(urn:li:dataPlatform:hive,PythonHiveInHiveOut.foo5,PROD)" diff --git a/metadata-integration/java/spark-lineage/spark-smoke-test/python-spark-lineage-test/python_test_run.sh 
diff --git a/metadata-integration/java/spark-lineage/spark-smoke-test/python-spark-lineage-test/python_test_run.sh b/metadata-integration/java/spark-lineage/spark-smoke-test/python-spark-lineage-test/python_test_run.sh
index 41c506f34c..429f692500 100755
--- a/metadata-integration/java/spark-lineage/spark-smoke-test/python-spark-lineage-test/python_test_run.sh
+++ b/metadata-integration/java/spark-lineage/spark-smoke-test/python-spark-lineage-test/python_test_run.sh
@@ -7,25 +7,25 @@
 saluation () {
 
   echo "--------------------------------------------------------"
-  echo "Starting exectuion"
+  echo "Starting execution $1"
   echo "--------------------------------------------------------"
 
 }
 
-saluation
+saluation "HdfsIn2HdfsOut1.py"
 spark-submit --properties-file $2 HdfsIn2HdfsOut1.py
 
-saluation
+saluation "HdfsIn2HdfsOut2.py"
 spark-submit --properties-file $2 HdfsIn2HdfsOut2.py
 
-saluation
+saluation "HdfsIn2HiveCreateTable.py"
 spark-submit --properties-file $2 HdfsIn2HiveCreateTable.py
 
-saluation
+saluation "HdfsIn2HiveCreateInsertTable.py"
 spark-submit --properties-file $2 HdfsIn2HiveCreateInsertTable.py
 
-saluation
+saluation "HiveInHiveOut.py"
 spark-submit --properties-file $2 HiveInHiveOut.py
diff --git a/metadata-integration/java/spark-lineage/spark-smoke-test/setup_spark_smoke_test.sh b/metadata-integration/java/spark-lineage/spark-smoke-test/setup_spark_smoke_test.sh
index 6572ffa0f5..33cac9d562 100755
--- a/metadata-integration/java/spark-lineage/spark-smoke-test/setup_spark_smoke_test.sh
+++ b/metadata-integration/java/spark-lineage/spark-smoke-test/setup_spark_smoke_test.sh
@@ -1,6 +1,9 @@
-#!/bin/bash
+#!/bin/bash -x
 
 set -e
+
+SMOKE_TEST_ROOT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
+
 pip install -r requirements.txt
 
 echo "--------------------------------------------------------------------"
@@ -25,6 +28,7 @@ echo "--------------------------------------------------------------------"
 echo "Bringing up spark cluster"
 echo "--------------------------------------------------------------------"
 
+cd "${SMOKE_TEST_ROOT_DIR}"/docker
 #bring up spark cluster
 docker-compose -f spark-docker-compose.yml up -d
diff --git a/metadata-integration/java/spark-lineage/spark-smoke-test/smoke.sh b/metadata-integration/java/spark-lineage/spark-smoke-test/smoke.sh
index 5e3e70ca7f..8f2f21d3b0 100755
--- a/metadata-integration/java/spark-lineage/spark-smoke-test/smoke.sh
+++ b/metadata-integration/java/spark-lineage/spark-smoke-test/smoke.sh
@@ -1,4 +1,4 @@
-#!/bin/bash
+#!/bin/bash -x
 set -e
 
 # Script assumptions:
@@ -7,6 +7,24 @@ set -e
 #   - pytest is installed
 #   - requests is installed
 
+is_healthy() {
+    local service="$1"
+    local -r -i max_attempts="$2"; shift
+    local -i attempt_num=1
+
+    until [ -n "$(docker ps -f name="$service" -f "health=healthy"|tail -n +2)" ]
+    do
+        if (( attempt_num == max_attempts ))
+        then
+            echo "Attempt $attempt_num failed and there are no more attempts left!"
+            return 1
+        else
+            echo "Attempt $attempt_num failed! Trying again in $attempt_num seconds..."
+            sleep $(( attempt_num++ ))
+        fi
+    done
+}
+
 DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
 
 cd "$DIR"
@@ -22,12 +40,8 @@ echo "--------------------------------------------------------------------"
 pwd
 ../../../
 
-datahub docker quickstart \
-    --build-locally \
-    --quickstart-compose-file ../../../../docker/docker-compose.yml \
-    --quickstart-compose-file ../../../../docker/docker-compose.override.yml \
-    --quickstart-compose-file ../../../../docker/docker-compose.dev.yml \
-    --dump-logs-on-failure
+../../../../docker/dev.sh -d
+is_healthy "datahub-gms" 60
 
 echo "--------------------------------------------------------------------"
 echo "Setup environment for pytest"
diff --git a/metadata-integration/java/spark-lineage/spark-smoke-test/test-spark-lineage/java_test_run.sh b/metadata-integration/java/spark-lineage/spark-smoke-test/test-spark-lineage/java_test_run.sh
index d5830f375c..ca736c78f6 100755
--- a/metadata-integration/java/spark-lineage/spark-smoke-test/test-spark-lineage/java_test_run.sh
+++ b/metadata-integration/java/spark-lineage/spark-smoke-test/test-spark-lineage/java_test_run.sh
@@ -1,24 +1,24 @@
 saluation () {
 
   echo "--------------------------------------------------------"
-  echo "Starting exectuion"
+  echo "Starting execution $1"
   echo "--------------------------------------------------------"
 
 }
 
-saluation
+saluation "test.spark.lineage.HdfsIn2HdfsOut1"
 $1/bin/spark-submit --properties-file $2 --class test.spark.lineage.HdfsIn2HdfsOut1 build/libs/test-spark-lineage.jar
 
-saluation
+saluation "test.spark.lineage.HdfsIn2HdfsOut2"
 $1/bin/spark-submit --properties-file $2 --class test.spark.lineage.HdfsIn2HdfsOut2 build/libs/test-spark-lineage.jar
 
-saluation
+saluation "test.spark.lineage.HdfsIn2HiveCreateTable"
 $1/bin/spark-submit --properties-file $2 --class test.spark.lineage.HdfsIn2HiveCreateTable build/libs/test-spark-lineage.jar
 
-saluation
+saluation "test.spark.lineage.HdfsIn2HiveCreateInsertTable"
 $1/bin/spark-submit --properties-file $2 --class test.spark.lineage.HdfsIn2HiveCreateInsertTable build/libs/test-spark-lineage.jar
 
-saluation
+saluation "test.spark.lineage.HiveInHiveOut"
 $1/bin/spark-submit --properties-file $2 --class test.spark.lineage.HiveInHiveOut build/libs/test-spark-lineage.jar
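Both runner scripts above share the same positional arguments: java_test_run.sh uses $1 as the Spark home whose bin/spark-submit it calls, and both it and python_test_run.sh read $2 as the Spark properties file that wires up the DataHub lineage listener. An illustrative invocation follows; the Spark home matches the SPARK_HOME baked into SparkBase.Dockerfile, while the properties-file path is an assumption:

    # $1 = Spark home, $2 = properties file enabling the DataHub lineage listener
    ./java_test_run.sh /usr/bin/spark-3.2.0-bin-hadoop2.7 /opt/workspace/spark.properties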
diff --git a/metadata-integration/java/spark-lineage/spark-smoke-test/test_e2e.py b/metadata-integration/java/spark-lineage/spark-smoke-test/test_e2e.py
index 47e86bdb56..e5d83279d2 100644
--- a/metadata-integration/java/spark-lineage/spark-smoke-test/test_e2e.py
+++ b/metadata-integration/java/spark-lineage/spark-smoke-test/test_e2e.py
@@ -17,6 +17,7 @@ from jsoncomparison import Compare, NO_DIFF
 GMS_ENDPOINT = "http://localhost:8080"
 GOLDEN_FILES_PATH = "./spark-smoke-test/golden_json/"
 golden_files = os.listdir(GOLDEN_FILES_PATH)
+print(golden_files)
 [file_name.strip(".json") for file_name in golden_files]
 
 restli_default_headers = {
@@ -59,6 +60,15 @@ def test_healthchecks(wait_for_healthchecks):
     pass
 
 
+def sort_aspects(input):
+    print(input)
+    item_id = list(input["value"].keys())[0]
+    input["value"][item_id]["aspects"] = sorted(
+        input["value"][item_id]["aspects"], key=lambda x: list(x.keys())[0]
+    )
+    return input
+
+
 @pytest.mark.dependency(depends=["test_healthchecks"])
 @pytest.mark.parametrize("json_file", golden_files, )
 def test_ingestion_via_rest(json_file):
@@ -71,7 +81,9 @@ def test_ingestion_via_rest(json_file):
     print(url)
     response = requests.get(url)
     response.raise_for_status()
-    data = response.json()
+
+    data = sort_aspects(response.json())
+    value = sort_aspects(value)
     diff = json_compare.check(value, data)
     print(urn)
     if diff != NO_DIFF:
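The sort_aspects helper added above exists because the aspects array in a GMS snapshot response can come back in an order that differs from the golden file, which makes a raw JSON comparison flaky; sorting both sides by each aspect's single key makes the comparison order-insensitive. The same normalization can be sketched with jq for anyone inspecting the files by hand, assuming the golden file carries the same top-level value object as the GMS response:

    # order each aspects array by the first key of each aspect object
    jq '.value[].aspects |= sort_by(keys[0])' golden_json/JavaHdfsIn2HdfsOut1.json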