From a113e4357ea46b15f6cb5402eade5685fd18902c Mon Sep 17 00:00:00 2001 From: Vincenzo Lavorini Date: Fri, 25 Feb 2022 08:29:01 +0100 Subject: [PATCH] fix(ingest): openapi - add support for user, password auth (#4086) --- metadata-ingestion/source_docs/openapi.md | 13 ++++- .../src/datahub/ingestion/source/openapi.py | 48 +++++++++++++++---- .../ingestion/source/openapi_parser.py | 44 +++++++++++++---- .../openapi/openapi_mces_golden.json | 6 +-- 4 files changed, 87 insertions(+), 24 deletions(-) mode change 100644 => 100755 metadata-ingestion/source_docs/openapi.md mode change 100644 => 100755 metadata-ingestion/src/datahub/ingestion/source/openapi.py mode change 100644 => 100755 metadata-ingestion/src/datahub/ingestion/source/openapi_parser.py mode change 100644 => 100755 metadata-ingestion/tests/integration/openapi/openapi_mces_golden.json diff --git a/metadata-ingestion/source_docs/openapi.md b/metadata-ingestion/source_docs/openapi.md old mode 100644 new mode 100755 index 4f5b42e5b4..b63721f7f8 --- a/metadata-ingestion/source_docs/openapi.md +++ b/metadata-ingestion/source_docs/openapi.md @@ -30,7 +30,9 @@ source: name: test_endpoint # this name will appear in DatHub url: https://test_endpoint.com/ swagger_file: classicapi/doc/swagger.json # where to search for the OpenApi definitions - get_token: True # optional, if you need to get an authentication token beforehand + get_token: # optional, if you need to get an authentication token beforehand + request_type: get + url_complement: api/authentication/login?username={username}&password={password} username: your_username # optional password: your_password # optional forced_examples: # optionals @@ -137,6 +139,15 @@ and this URL will be called to get back the needed metadata. ## Config details +### Token authentication + +If this tool needs to get an access token to interrogate the endpoints, this can be requested. 
Two methods are available at the moment: + +* 'get' : this requires the username/password combination to be present in the url. Note that {username} and {password} are mandatory placeholders. They will be replaced with the true credentials at runtime. Note that username and password will be sent in the request address, so it's insecure. If your provider allows for the other method, please go for it. +* 'post' : username and password will be inserted in the body of the POST request + +In both cases, username and password are the ones defined in the configuration file. + ### Getting dataset metadata from `forced_example` Suppose you have an endpoint defined in the swagger file, but without example given, and the tool is diff --git a/metadata-ingestion/src/datahub/ingestion/source/openapi.py b/metadata-ingestion/src/datahub/ingestion/source/openapi.py old mode 100644 new mode 100755 index abe5c11816..11a1380c51 --- a/metadata-ingestion/src/datahub/ingestion/source/openapi.py +++ b/metadata-ingestion/src/datahub/ingestion/source/openapi.py @@ -2,7 +2,7 @@ import logging import time import warnings from abc import ABC -from typing import Dict, Generator, Iterable, Tuple +from typing import Dict, Generator, Iterable, Optional, Tuple from datahub.configuration.common import ConfigModel from datahub.emitter.mce_builder import make_tag_urn @@ -43,20 +43,46 @@ class OpenApiConfig(ConfigModel): username: str = "" password: str = "" forced_examples: dict = {} - token: str = "" - get_token: bool = False + token: Optional[str] = None + get_token: dict = {} def get_swagger(self) -> Dict: - if self.get_token: # token based authentication, to be tested - if self.token == "": + if self.get_token or self.token is not None: + if self.token is not None: + ... + else: + assert ( + "url_complement" in self.get_token.keys() + ), "When 'request_type' is set to 'get', an url_complement is needed for the request." 
+ if self.get_token["request_type"] == "get": + assert ( + "{username}" in self.get_token["url_complement"] + ), "we expect the keyword {username} to be present in the url" + assert ( + "{password}" in self.get_token["url_complement"] + ), "we expect the keyword {password} to be present in the url" + url4req = self.get_token["url_complement"].replace( + "{username}", self.username + ) + url4req = url4req.replace("{password}", self.password) + elif self.get_token["request_type"] == "post": + url4req = self.get_token["url_complement"] + else: + raise KeyError( + "This tool accepts only 'get' and 'post' as method for getting tokens" + ) self.token = get_tok( - url=self.url, username=self.username, password=self.password + url=self.url, + username=self.username, + password=self.password, + tok_url=url4req, + method=self.get_token["request_type"], ) - sw_dict = get_swag_json( self.url, token=self.token, swagger_file=self.swagger_file ) # load the swagger file - else: + + else: # using basic auth for accessing endpoints sw_dict = get_swag_json( self.url, username=self.username, @@ -102,7 +128,9 @@ class APISource(Source, ABC): elif status_code == 504: self.report.report_warning(key=key, reason="Timeout for reaching endpoint") else: - raise Exception(f"Unable to retrieve endpoint, response code {status_code}") + raise Exception( + f"Unable to retrieve endpoint, response code {status_code}, key {key}" + ) def init_dataset( self, endpoint_k: str, endpoint_dets: dict @@ -269,7 +297,7 @@ class APISource(Source, ABC): class OpenApiSource(APISource): def __init__(self, config: OpenApiConfig, ctx: PipelineContext): - super().__init__(config, ctx, "OpenApi") + super().__init__(config, ctx, "openapi") @classmethod def create(cls, config_dict, ctx): diff --git a/metadata-ingestion/src/datahub/ingestion/source/openapi_parser.py b/metadata-ingestion/src/datahub/ingestion/source/openapi_parser.py old mode 100644 new mode 100755 index e7675fc2ed..c2f87b55d8 --- 
a/metadata-ingestion/src/datahub/ingestion/source/openapi_parser.py +++ b/metadata-ingestion/src/datahub/ingestion/source/openapi_parser.py @@ -172,7 +172,8 @@ def get_endpoints(sw_dict: dict) -> dict: # noqa: C901 if "parameters" in p_o["get"].keys(): url_details[p_k]["parameters"] = p_o["get"]["parameters"] - return url_details + ord_d = dict(sorted(url_details.items())) # sorting for convenience + return ord_d def guessing_url_name(url: str, examples: dict) -> str: @@ -211,6 +212,10 @@ def guessing_url_name(url: str, examples: dict) -> str: ex2use = root elif root[:-1] in examples.keys(): ex2use = root[:-1] + elif root.replace("/", ".") in examples.keys(): + ex2use = root.replace("/", ".") + elif root[:-1].replace("/", ".") in examples.keys(): + ex2use = root[:-1].replace("/", ".") else: return url @@ -332,19 +337,38 @@ def extract_fields( return [], {} -def get_tok(url: str, username: str = "", password: str = "") -> str: +def get_tok( + url: str, + username: str = "", + password: str = "", + tok_url: str = "", + method: str = "post", +) -> str: """ Trying to post username/password to get auth. 
- Simplified version: it expect a POST at api/authenticate """ - data = {"username": username, "password": password} - url2post = url + "api/authenticate/" - response = requests.post(url2post, data=data) - if response.status_code == 200: - cont = json.loads(response.content) - return cont["tokens"]["access"] + token = "" + url4req = url + tok_url + if method == "post": + # this will make a POST call with username and password + data = {"username": username, "password": password} + # url2post = url + "api/authenticate/" + response = requests.post(url4req, data=data) + if response.status_code == 200: + cont = json.loads(response.content) + token = cont["tokens"]["access"] + elif method == "get": + # this will make a GET call with username and password + response = requests.get(url4req) + if response.status_code == 200: + cont = json.loads(response.content) + token = cont["token"] else: - raise Exception("Unable to get a valid token") + raise ValueError(f"Method unrecognised: {method}") + if token != "": + return token + else: + raise Exception(f"Unable to get a valid token: {response.text}") def set_metadata( diff --git a/metadata-ingestion/tests/integration/openapi/openapi_mces_golden.json b/metadata-ingestion/tests/integration/openapi/openapi_mces_golden.json old mode 100644 new mode 100755 index 2a297fa2fa..f58b97c36e --- a/metadata-ingestion/tests/integration/openapi/openapi_mces_golden.json +++ b/metadata-ingestion/tests/integration/openapi/openapi_mces_golden.json @@ -3,7 +3,7 @@ "auditHeader": null, "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:OpenApi,test_openapi.root,PROD)", + "urn": "urn:li:dataset:(urn:li:dataPlatform:openapi,test_openapi.root,PROD)", "aspects": [ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { @@ -95,7 +95,7 @@ "auditHeader": null, "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": 
"urn:li:dataset:(urn:li:dataPlatform:OpenApi,test_openapi.v2,PROD)", + "urn": "urn:li:dataset:(urn:li:dataPlatform:openapi,test_openapi.v2,PROD)", "aspects": [ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { @@ -183,4 +183,4 @@ "properties": null } } -] \ No newline at end of file +]