import time
import urllib.parse

import pytest
import requests

from datahub.cli.docker import check_local_docker_containers
from datahub.ingestion.run.pipeline import Pipeline

# Endpoints of the locally-running DataHub quickstart stack.
GMS_ENDPOINT = "http://localhost:8080"
FRONTEND_ENDPOINT = "http://localhost:9002"
KAFKA_BROKER = "localhost:9092"

# Sample payloads consumed by the ingestion smoke tests.
bootstrap_sample_data = "../metadata-ingestion/examples/mce_files/bootstrap_mce.json"
usage_sample_data = (
    "../metadata-ingestion/tests/integration/bigquery-usage/bigquery_usages_golden.json"
)
bq_sample_data = "./sample_bq_data.json"

restli_default_headers = {
    "X-RestLi-Protocol-Version": "2.0.0",
}

# Kafka emission is asynchronous; give downstream consumers time to process.
kafka_post_ingestion_wait_sec = 60


@pytest.fixture(scope="session")
def wait_for_healthchecks():
    # Simply assert that everything is healthy, but don't wait.
    assert not check_local_docker_containers()
    yield


@pytest.mark.dependency()
def test_healthchecks(wait_for_healthchecks):
    # Call to wait_for_healthchecks fixture will do the actual functionality.
    pass


def ingest_file(filename: str) -> None:
    """Run a file-source -> datahub-rest-sink ingestion pipeline for `filename`.

    Raises if the pipeline reports any failures.
    """
    pipeline = Pipeline.create(
        {
            "source": {
                "type": "file",
                "config": {"filename": filename},
            },
            "sink": {
                "type": "datahub-rest",
                "config": {"server": GMS_ENDPOINT},
            },
        }
    )
    pipeline.run()
    pipeline.raise_from_status()


@pytest.mark.dependency(depends=["test_healthchecks"])
def test_ingestion_via_rest(wait_for_healthchecks):
    ingest_file(bootstrap_sample_data)


@pytest.mark.dependency(depends=["test_healthchecks"])
def test_ingestion_usage_via_rest(wait_for_healthchecks):
    ingest_file(usage_sample_data)


@pytest.mark.dependency(depends=["test_healthchecks"])
def test_ingestion_via_kafka(wait_for_healthchecks):
    pipeline = Pipeline.create(
        {
            "source": {
                "type": "file",
                "config": {"filename": bq_sample_data},
            },
            "sink": {
                "type": "datahub-kafka",
                "config": {
                    "connection": {
                        "bootstrap": KAFKA_BROKER,
                    }
                },
            },
        }
    )
    pipeline.run()
    pipeline.raise_from_status()

    # Since Kafka emission is asynchronous, we must wait a little bit so that
    # the changes are actually processed.
    time.sleep(kafka_post_ingestion_wait_sec)


@pytest.mark.dependency(
    depends=[
        "test_ingestion_via_rest",
        "test_ingestion_via_kafka",
        "test_ingestion_usage_via_rest",
    ]
)
def test_run_ingestion(wait_for_healthchecks):
    # Dummy test so that future ones can just depend on this one.
    pass


@pytest.mark.dependency(depends=["test_healthchecks", "test_run_ingestion"])
def test_gms_get_user():
    """Fetch a known corp user entity directly from GMS via the Rest.li API."""
    username = "jdoe"
    urn = f"urn:li:corpuser:{username}"
    response = requests.get(
        f"{GMS_ENDPOINT}/entities/{urllib.parse.quote(urn)}",
        headers={
            **restli_default_headers,
        },
    )
    response.raise_for_status()
    data = response.json()

    assert data["value"]
    assert data["value"]["com.linkedin.metadata.snapshot.CorpUserSnapshot"]
    assert data["value"]["com.linkedin.metadata.snapshot.CorpUserSnapshot"]["urn"] == urn


@pytest.mark.parametrize(
    "platform,dataset_name,env",
    [
        (
            # This one tests the bootstrap sample data.
            "urn:li:dataPlatform:kafka",
            "SampleKafkaDataset",
            "PROD",
        ),
        (
            # This one tests BigQuery ingestion.
            "urn:li:dataPlatform:bigquery",
            "bigquery-public-data.covid19_geotab_mobility_impact.us_border_wait_times",
            "PROD",
        ),
    ],
)
@pytest.mark.dependency(depends=["test_healthchecks", "test_run_ingestion"])
def test_gms_get_dataset(platform, dataset_name, env):
    """Fetch a dataset entity from GMS for each parametrized (platform, name, env).

    NOTE: the parameters must not be reassigned inside the body — doing so would
    hard-code one case and silently skip the others.
    """
    urn = f"urn:li:dataset:({platform},{dataset_name},{env})"
    response = requests.get(
        f"{GMS_ENDPOINT}/entities/{urllib.parse.quote(urn)}",
        headers={
            **restli_default_headers,
            "X-RestLi-Method": "get",
        },
    )
    response.raise_for_status()
    res_data = response.json()

    assert res_data["value"]
    assert res_data["value"]["com.linkedin.metadata.snapshot.DatasetSnapshot"]
    assert res_data["value"]["com.linkedin.metadata.snapshot.DatasetSnapshot"]["urn"] == urn


@pytest.mark.parametrize(
    "query,min_expected_results",
    [
        ("covid", 1),
        ("sample", 3),
    ],
)
@pytest.mark.dependency(depends=["test_healthchecks", "test_run_ingestion"])
def test_gms_search_dataset(query, min_expected_results):
    """Search GMS for datasets and expect at least `min_expected_results` hits."""
    json = {"input": f"{query}", "entity": "dataset", "start": 0, "count": 10}
    print(json)
    response = requests.post(
        f"{GMS_ENDPOINT}/entities?action=search",
        headers=restli_default_headers,
        json=json,
    )
    response.raise_for_status()
    res_data = response.json()

    assert res_data["value"]
    assert res_data["value"]["numEntities"] >= min_expected_results
    assert len(res_data["value"]["entities"]) >= min_expected_results


@pytest.mark.dependency(depends=["test_healthchecks", "test_run_ingestion"])
def test_gms_usage_fetch():
    """Query usage stats for a known BigQuery dataset ingested from the golden file.

    The expected counts (3 buckets, 12 fields, 7 queries, 1 user) mirror the
    contents of the bigquery_usages_golden.json sample data.
    """
    response = requests.post(
        f"{GMS_ENDPOINT}/usageStats?action=queryRange",
        headers=restli_default_headers,
        json={
            "resource": "urn:li:dataset:(urn:li:dataPlatform:bigquery,harshal-playground-306419.test_schema.excess_deaths_derived,PROD)",
            "duration": "DAY",
            "rangeFromEnd": "ALL",
        },
    )
    response.raise_for_status()

    data = response.json()["value"]

    assert len(data["buckets"]) == 3
    assert data["buckets"][0]["metrics"]["topSqlQueries"]

    fields = data["aggregations"].pop("fields")
    assert len(fields) == 12
    assert fields[0]["count"] == 7

    users = data["aggregations"].pop("users")
    assert len(users) == 1
    assert users[0]["count"] == 7

    assert data["aggregations"] == {
        # "fields" and "users" already popped out
        "totalSqlQueries": 7,
        "uniqueUserCount": 1,
    }


@pytest.fixture(scope="session")
def frontend_session(wait_for_healthchecks):
    """Log into the DataHub frontend once and share the session for all tests."""
    session = requests.Session()

    headers = {
        "Content-Type": "application/json",
    }
    data = '{"username":"datahub", "password":"datahub"}'
    response = session.post(f"{FRONTEND_ENDPOINT}/logIn", headers=headers, data=data)
    response.raise_for_status()

    yield session


@pytest.mark.dependency(depends=["test_healthchecks"])
def test_frontend_auth(frontend_session):
    pass


@pytest.mark.dependency(depends=["test_healthchecks", "test_run_ingestion"])
def test_frontend_browse_datasets(frontend_session):
    """Browse the prod dataset tree via GraphQL; the root should only have groups."""
    json = {
        "query": """query browse($input: BrowseInput!) {\n browse(input: $input) {\n start\n count\n total\n groups { name } entities {\n ... on Dataset {\n urn\n name\n }\n }\n }\n }""",
        "variables": {"input": {"type": "DATASET", "path": ["prod"]}},
    }
    response = frontend_session.post(f"{FRONTEND_ENDPOINT}/api/v2/graphql", json=json)

    response.raise_for_status()
    res_data = response.json()

    assert res_data
    assert res_data["data"]
    assert res_data["data"]["browse"]
    # At the "prod" root we expect only platform groups, no direct entities.
    assert len(res_data["data"]["browse"]["entities"]) == 0
    assert len(res_data["data"]["browse"]["groups"]) > 0


@pytest.mark.parametrize(
    "query,min_expected_results",
    [
        ("covid", 1),
        ("sample", 3),
    ],
)
@pytest.mark.dependency(depends=["test_healthchecks", "test_run_ingestion"])
def test_frontend_search_datasets(frontend_session, query, min_expected_results):
    """Search for datasets through the frontend GraphQL API."""
    json = {
        "query": """query search($input: SearchInput!) {\n search(input: $input) {\n start\n count\n total\n searchResults {\n entity {\n ... on Dataset {\n urn\n name\n }\n }\n }\n }\n }""",
        "variables": {
            "input": {"type": "DATASET", "query": f"{query}", "start": 0, "count": 10}
        },
    }
    response = frontend_session.post(f"{FRONTEND_ENDPOINT}/api/v2/graphql", json=json)

    response.raise_for_status()
    res_data = response.json()

    assert res_data
    assert res_data["data"]
    assert res_data["data"]["search"]
    assert res_data["data"]["search"]["total"] >= min_expected_results
    assert len(res_data["data"]["search"]["searchResults"]) >= min_expected_results


@pytest.mark.dependency(depends=["test_healthchecks", "test_run_ingestion"])
def test_frontend_user_info(frontend_session):
    """Fetch the logged-in user's corpUser record via the frontend GraphQL API."""
    urn = "urn:li:corpuser:datahub"
    json = {
        "query": """query corpUser($urn: String!) {\n corpUser(urn: $urn) {\n urn\n username\n editableInfo {\n pictureLink\n }\n info {\n firstName\n fullName\n title\n email\n }\n }\n }""",
        "variables": {"urn": urn},
    }
    response = frontend_session.post(f"{FRONTEND_ENDPOINT}/api/v2/graphql", json=json)

    response.raise_for_status()
    res_data = response.json()

    assert res_data
    assert res_data["data"]
    assert res_data["data"]["corpUser"]
    assert res_data["data"]["corpUser"]["urn"] == urn


@pytest.mark.parametrize(
    "platform,dataset_name,env",
    [
        (
            # This one tests the bootstrap sample data.
            "urn:li:dataPlatform:kafka",
            "SampleKafkaDataset",
            "PROD",
        ),
        (
            # This one tests BigQuery ingestion.
            "urn:li:dataPlatform:bigquery",
            "bigquery-public-data.covid19_geotab_mobility_impact.us_border_wait_times",
            "PROD",
        ),
    ],
)
@pytest.mark.dependency(depends=["test_healthchecks", "test_run_ingestion"])
def test_frontend_datasets(frontend_session, platform, dataset_name, env):
    """Fetch a dataset entity through the frontend GraphQL API."""
    urn = f"urn:li:dataset:({platform},{dataset_name},{env})"
    json = {
        "query": """query getDataset($urn: String!) {\n dataset(urn: $urn) {\n urn\n name\n description\n platform {\n urn\n }\n schemaMetadata {\n name\n version\n createdAt\n }\n }\n }""",
        "variables": {"urn": urn},
    }
    # Basic dataset info.
    response = frontend_session.post(f"{FRONTEND_ENDPOINT}/api/v2/graphql", json=json)

    response.raise_for_status()
    res_data = response.json()

    assert res_data
    assert res_data["data"]
    assert res_data["data"]["dataset"]
    assert res_data["data"]["dataset"]["urn"] == urn
    assert res_data["data"]["dataset"]["name"] == dataset_name
    assert res_data["data"]["dataset"]["platform"]["urn"] == platform


@pytest.mark.dependency(depends=["test_healthchecks", "test_run_ingestion"])
def test_ingest_with_system_metadata():
    """Ingest an entity with fully-populated systemMetadata."""
    response = requests.post(
        f"{GMS_ENDPOINT}/entities?action=ingest",
        headers=restli_default_headers,
        json={
            "entity": {
                "value": {
                    "com.linkedin.metadata.snapshot.CorpUserSnapshot": {
                        "urn": "urn:li:corpuser:datahub",
                        "aspects": [
                            {
                                "com.linkedin.identity.CorpUserInfo": {
                                    "active": True,
                                    "displayName": "Data Hub",
                                    "email": "datahub@linkedin.com",
                                    "title": "CEO",
                                    "fullName": "Data Hub",
                                }
                            }
                        ],
                    }
                }
            },
            "systemMetadata": {
                "lastObserved": 1628097379571,
                "runId": "af0fe6e4-f547-11eb-81b2-acde48001122",
            },
        },
    )
    response.raise_for_status()


@pytest.mark.dependency(depends=["test_healthchecks", "test_run_ingestion"])
def test_ingest_with_blank_system_metadata():
    """Ingest an entity with an empty systemMetadata object."""
    response = requests.post(
        f"{GMS_ENDPOINT}/entities?action=ingest",
        headers=restli_default_headers,
        json={
            "entity": {
                "value": {
                    "com.linkedin.metadata.snapshot.CorpUserSnapshot": {
                        "urn": "urn:li:corpuser:datahub",
                        "aspects": [
                            {
                                "com.linkedin.identity.CorpUserInfo": {
                                    "active": True,
                                    "displayName": "Data Hub",
                                    "email": "datahub@linkedin.com",
                                    "title": "CEO",
                                    "fullName": "Data Hub",
                                }
                            }
                        ],
                    }
                }
            },
            "systemMetadata": {},
        },
    )
    response.raise_for_status()


@pytest.mark.dependency(depends=["test_healthchecks", "test_run_ingestion"])
def test_ingest_without_system_metadata():
    """Ingest an entity with the systemMetadata field omitted entirely."""
    response = requests.post(
        f"{GMS_ENDPOINT}/entities?action=ingest",
        headers=restli_default_headers,
        json={
            "entity": {
                "value": {
                    "com.linkedin.metadata.snapshot.CorpUserSnapshot": {
                        "urn": "urn:li:corpuser:datahub",
                        "aspects": [
                            {
                                "com.linkedin.identity.CorpUserInfo": {
                                    "active": True,
                                    "displayName": "Data Hub",
                                    "email": "datahub@linkedin.com",
                                    "title": "CEO",
                                    "fullName": "Data Hub",
                                }
                            }
                        ],
                    }
                }
            },
        },
    )
    response.raise_for_status()