diff --git a/ingestion/src/metadata/ingestion/source/api/rest/connection.py b/ingestion/src/metadata/ingestion/source/api/rest/connection.py index d8d3bed10ed..9c0c4e9c9ac 100644 --- a/ingestion/src/metadata/ingestion/source/api/rest/connection.py +++ b/ingestion/src/metadata/ingestion/source/api/rest/connection.py @@ -28,6 +28,11 @@ from metadata.generated.schema.entity.services.connections.testConnectionResult ) from metadata.ingestion.connections.test_connections import test_connection_steps from metadata.ingestion.ometa.ometa_api import OpenMetadata +from metadata.ingestion.source.api.rest.parser import ( + OpenAPIParseError, + parse_openapi_schema, + validate_openapi_schema, +) from metadata.utils.constants import THREE_MIN @@ -75,16 +80,17 @@ def test_connection( def custom_schema_exec(): try: - if client.json() is not None and client.json().get("openapi") is not None: + schema = parse_openapi_schema(client) + if validate_openapi_schema(schema): return [] raise InvalidOpenAPISchemaError( - "Provided schema is not valid OpenAPI JSON schema" - ) - except: - raise InvalidOpenAPISchemaError( - "Provided schema is not valid OpenAPI JSON schema" + "Provided schema is not valid OpenAPI specification" ) + except OpenAPIParseError as e: + raise InvalidOpenAPISchemaError(f"Failed to parse OpenAPI schema: {e}") + except Exception as e: + raise InvalidOpenAPISchemaError(f"Error validating OpenAPI schema: {e}") test_fn = {"CheckURL": custom_url_exec, "CheckSchema": custom_schema_exec} diff --git a/ingestion/src/metadata/ingestion/source/api/rest/metadata.py b/ingestion/src/metadata/ingestion/source/api/rest/metadata.py index ee3a520a3c8..4eaf0a0db9e 100644 --- a/ingestion/src/metadata/ingestion/source/api/rest/metadata.py +++ b/ingestion/src/metadata/ingestion/source/api/rest/metadata.py @@ -40,6 +40,7 @@ from metadata.ingestion.api.steps import InvalidSourceException from metadata.ingestion.ometa.ometa_api import OpenMetadata from metadata.ingestion.source.api.api_service import ApiServiceSource from metadata.ingestion.source.api.rest.models import RESTCollection, RESTEndpoint +from metadata.ingestion.source.api.rest.parser import parse_openapi_schema from metadata.utils import fqn from metadata.utils.filters import filter_by_collection from metadata.utils.helpers import clean_uri @@ -77,7 +78,7 @@ class RestSource(ApiServiceSource): Here is where filtering happens """ try: - self.json_response = self.connection.json() + self.json_response = parse_openapi_schema(self.connection) collections_list = [] tags_collection_set = set() if self.json_response.get("tags", []): diff --git a/ingestion/src/metadata/ingestion/source/api/rest/parser.py b/ingestion/src/metadata/ingestion/source/api/rest/parser.py new file mode 100644 index 00000000000..32237911fc1 --- /dev/null +++ b/ingestion/src/metadata/ingestion/source/api/rest/parser.py @@ -0,0 +1,103 @@ +# Copyright 2024 Collate +# Licensed under the Collate Community License, Version 1.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# https://github.com/open-metadata/OpenMetadata/blob/main/ingestion/LICENSE +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +OpenAPI schema parser for both JSON and YAML formats +""" +import json +from typing import Any, Dict + +import yaml +from requests.models import Response + +from metadata.utils.logger import ingestion_logger + +logger = ingestion_logger() + + +class OpenAPIParseError(Exception): + """ + Exception raised when OpenAPI schema cannot be parsed as either JSON or YAML + """ + + +def parse_openapi_schema(response: Response) -> Dict[str, Any]: + """ + Parse OpenAPI schema from HTTP response. + Supports both JSON and YAML formats. + + Args: + response: HTTP response containing OpenAPI schema + + Returns: + Parsed OpenAPI schema as dictionary + + Raises: + OpenAPIParseError: If content cannot be parsed as either JSON or YAML + """ + content = response.text + content_type = response.headers.get("content-type", "").lower() + + # Try to determine format from content-type header + if "json" in content_type: + try: + return json.loads(content) + except json.JSONDecodeError as e: + logger.warning(f"Failed to parse as JSON despite content-type: {e}") + elif "yaml" in content_type or "yml" in content_type: + try: + return yaml.safe_load(content) + except yaml.YAMLError as e: + logger.warning(f"Failed to parse as YAML despite content-type: {e}") + + # If content-type is not definitive or parsing failed, try both formats + + # First try JSON (backward compatibility) + try: + parsed = json.loads(content) + logger.debug("Successfully parsed OpenAPI schema as JSON") + return parsed + except json.JSONDecodeError: + logger.debug("Content is not valid JSON, trying YAML") + + # Then try YAML + try: + parsed = yaml.safe_load(content) + if parsed is None: + raise OpenAPIParseError("YAML parsing returned None") + logger.debug("Successfully parsed OpenAPI schema as YAML") + return parsed + except yaml.YAMLError as e: + logger.error(f"Failed to parse as YAML: {e}") + + # If both formats fail, raise an error + raise OpenAPIParseError( + "Failed to parse OpenAPI schema as either JSON or YAML. " + "Please ensure the schema is valid OpenAPI specification." + ) + + +def validate_openapi_schema(schema: Dict[str, Any]) -> bool: + """ + Validate that the parsed schema is a valid OpenAPI specification. + + Args: + schema: Parsed schema dictionary + + Returns: + True if schema appears to be valid OpenAPI, False otherwise + """ + # Check for required OpenAPI fields + if not isinstance(schema, dict): + return False + + # OpenAPI 3.x uses "openapi" field, OpenAPI 2.x uses "swagger" field + return schema.get("openapi") is not None or schema.get("swagger") is not None diff --git a/ingestion/tests/unit/topology/api/test_openapi_parser.py b/ingestion/tests/unit/topology/api/test_openapi_parser.py new file mode 100644 index 00000000000..a512364332e --- /dev/null +++ b/ingestion/tests/unit/topology/api/test_openapi_parser.py @@ -0,0 +1,200 @@ +# Copyright 2024 Collate +# Licensed under the Collate Community License, Version 1.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# https://github.com/open-metadata/OpenMetadata/blob/main/ingestion/LICENSE +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Test OpenAPI schema parser for both JSON and YAML formats +""" + +import json +from unittest import TestCase +from unittest.mock import Mock + +import yaml + +from metadata.ingestion.source.api.rest.parser import ( + OpenAPIParseError, + parse_openapi_schema, + validate_openapi_schema, +) + + +class TestOpenAPIParser(TestCase): + """Test cases for OpenAPI schema parser""" + + def setUp(self): + """Set up test data""" + self.openapi_data = { + "openapi": "3.0.3", + "info": {"title": "Test API", "version": "1.0.0"}, + "paths": { + "/pets": { + "get": { + "tags": ["pets"], + "summary": "List all pets", + "operationId": "listPets", + } + } + }, + "tags": [{"name": "pets", "description": "Everything about pets"}], + } + + self.swagger_data = { + "swagger": "2.0", + "info": {"title": "Test API", "version": "1.0.0"}, + "paths": { + "/pets": { + "get": { + "tags": ["pets"], + "summary": "List all pets", + "operationId": "listPets", + } + } + }, + "tags": [{"name": "pets", "description": "Everything about pets"}], + } + + def test_parse_json_openapi_schema(self): + """Test parsing valid JSON OpenAPI schema""" + mock_response = Mock() + mock_response.text = json.dumps(self.openapi_data) + mock_response.headers = {"content-type": "application/json"} + + result = parse_openapi_schema(mock_response) + + self.assertEqual(result, self.openapi_data) + + def test_parse_yaml_openapi_schema(self): + """Test parsing valid YAML OpenAPI schema""" + mock_response = Mock() + mock_response.text = yaml.dump(self.openapi_data) + mock_response.headers = {"content-type": "application/yaml"} + + result = parse_openapi_schema(mock_response) + + self.assertEqual(result, self.openapi_data) + + def test_parse_json_without_content_type(self): + """Test parsing JSON when content-type is not specified""" + mock_response = Mock() + mock_response.text = json.dumps(self.openapi_data) + mock_response.headers = {} + + result = parse_openapi_schema(mock_response) + + self.assertEqual(result, self.openapi_data) + + def test_parse_yaml_without_content_type(self): + """Test parsing YAML when content-type is not specified""" + mock_response = Mock() + # Use YAML format that's not valid JSON to ensure YAML parser is used + yaml_content = """ +openapi: 3.0.3 +info: + title: Test API + version: 1.0.0 +paths: + /pets: + get: + tags: + - pets + summary: List all pets + operationId: listPets +tags: + - name: pets + description: Everything about pets +""" + mock_response.text = yaml_content + mock_response.headers = {} + + result = parse_openapi_schema(mock_response) + + self.assertEqual(result["openapi"], "3.0.3") + self.assertEqual(result["info"]["title"], "Test API") + + def test_parse_yml_content_type(self): + """Test parsing with content-type: application/yml""" + mock_response = Mock() + mock_response.text = yaml.dump(self.openapi_data) + mock_response.headers = {"content-type": "application/yml"} + + result = parse_openapi_schema(mock_response) + + self.assertEqual(result, self.openapi_data) + + def test_parse_text_yaml_content_type(self): + """Test parsing with content-type: text/yaml""" + mock_response = Mock() + mock_response.text = yaml.dump(self.openapi_data) + mock_response.headers = {"content-type": "text/yaml"} + + result = parse_openapi_schema(mock_response) + + self.assertEqual(result, self.openapi_data) + + def test_parse_empty_content(self): + """Test parsing empty content""" + mock_response = Mock() + mock_response.text = "" + mock_response.headers = {} + + with self.assertRaises(OpenAPIParseError): + parse_openapi_schema(mock_response) + + def test_parse_none_yaml_content(self): + """Test parsing YAML that returns None""" + mock_response = Mock() + mock_response.text = "# This is just a comment\n" + mock_response.headers = {} + + with self.assertRaises(OpenAPIParseError): + parse_openapi_schema(mock_response) + + def test_validate_openapi_schema_valid_openapi3(self): + """Test validation of valid OpenAPI 3.x schema""" + self.assertTrue(validate_openapi_schema(self.openapi_data)) + + def test_validate_openapi_schema_valid_swagger2(self): + """Test validation of valid Swagger 2.0 schema""" + self.assertTrue(validate_openapi_schema(self.swagger_data)) + + def test_validate_openapi_schema_invalid_not_dict(self): + """Test validation of invalid schema (not a dictionary)""" + self.assertFalse(validate_openapi_schema("not a dict")) + self.assertFalse(validate_openapi_schema(123)) + self.assertFalse(validate_openapi_schema([])) + + def test_validate_openapi_schema_invalid_missing_fields(self): + """Test validation of invalid schema (missing required fields)""" + invalid_schema = {"info": {"title": "Test API", "version": "1.0.0"}} + self.assertFalse(validate_openapi_schema(invalid_schema)) + + def test_parse_json_with_wrong_content_type_fallback(self): + """Test parsing JSON with wrong content-type, should fallback to JSON parser""" + mock_response = Mock() + mock_response.text = json.dumps(self.openapi_data) + mock_response.headers = {"content-type": "text/plain"} # Wrong content-type + + result = parse_openapi_schema(mock_response) + + self.assertEqual(result, self.openapi_data) + + def test_parse_yaml_with_wrong_content_type_fallback(self): + """Test parsing YAML with wrong content-type, should fallback to YAML parser""" + mock_response = Mock() + # YAML that's not valid JSON + yaml_content = "openapi: '3.0.3'\ninfo:\n title: Test\n version: '1.0'" + mock_response.text = yaml_content + mock_response.headers = {"content-type": "text/plain"} # Wrong content-type + + result = parse_openapi_schema(mock_response) + + self.assertEqual(result["openapi"], "3.0.3") + self.assertEqual(result["info"]["title"], "Test") diff --git a/ingestion/tests/unit/topology/api/test_rest.py b/ingestion/tests/unit/topology/api/test_rest.py index 96070379dc7..ab32afffbbc 100644 --- a/ingestion/tests/unit/topology/api/test_rest.py +++ b/ingestion/tests/unit/topology/api/test_rest.py @@ -276,7 +276,7 @@ class RESTTest(TestCase): ): collections = list(self.rest_source.get_api_collections()) MOCK_COLLECTIONS_COPY = deepcopy(MOCK_COLLECTIONS) - MOCK_COLLECTIONS_COPY[2].description = None + MOCK_COLLECTIONS_COPY[2].description = Markdown(root="Operations about user") assert collections == MOCK_COLLECTIONS_COPY def test_generate_collection_url(self):