Add OpenAPI YAML format support for REST API ingestion (#22304)

* Initial plan

* Implement OpenAPI YAML support with backward JSON compatibility

Co-authored-by: harshach <38649+harshach@users.noreply.github.com>

* fix tests & lint

---------

Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>
Co-authored-by: harshach <38649+harshach@users.noreply.github.com>
Co-authored-by: ulixius9 <mayursingal9@gmail.com>
This commit is contained in:
Copilot 2025-08-12 18:22:41 +05:30 committed by ulixius9
parent 13d73de29f
commit a4da357892
5 changed files with 318 additions and 8 deletions

View File

@ -28,6 +28,11 @@ from metadata.generated.schema.entity.services.connections.testConnectionResult
)
from metadata.ingestion.connections.test_connections import test_connection_steps
from metadata.ingestion.ometa.ometa_api import OpenMetadata
from metadata.ingestion.source.api.rest.parser import (
OpenAPIParseError,
parse_openapi_schema,
validate_openapi_schema,
)
from metadata.utils.constants import THREE_MIN
@ -75,16 +80,17 @@ def test_connection(
def custom_schema_exec():
try:
if client.json() is not None and client.json().get("openapi") is not None:
schema = parse_openapi_schema(client)
if validate_openapi_schema(schema):
return []
raise InvalidOpenAPISchemaError(
"Provided schema is not valid OpenAPI JSON schema"
)
except:
raise InvalidOpenAPISchemaError(
"Provided schema is not valid OpenAPI JSON schema"
"Provided schema is not valid OpenAPI specification"
)
except OpenAPIParseError as e:
raise InvalidOpenAPISchemaError(f"Failed to parse OpenAPI schema: {e}")
except Exception as e:
raise InvalidOpenAPISchemaError(f"Error validating OpenAPI schema: {e}")
test_fn = {"CheckURL": custom_url_exec, "CheckSchema": custom_schema_exec}

View File

@ -40,6 +40,7 @@ from metadata.ingestion.api.steps import InvalidSourceException
from metadata.ingestion.ometa.ometa_api import OpenMetadata
from metadata.ingestion.source.api.api_service import ApiServiceSource
from metadata.ingestion.source.api.rest.models import RESTCollection, RESTEndpoint
from metadata.ingestion.source.api.rest.parser import parse_openapi_schema
from metadata.utils import fqn
from metadata.utils.filters import filter_by_collection
from metadata.utils.helpers import clean_uri
@ -77,7 +78,7 @@ class RestSource(ApiServiceSource):
Here is where filtering happens
"""
try:
self.json_response = self.connection.json()
self.json_response = parse_openapi_schema(self.connection)
collections_list = []
tags_collection_set = set()
if self.json_response.get("tags", []):

View File

@ -0,0 +1,103 @@
# Copyright 2024 Collate
# Licensed under the Collate Community License, Version 1.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# https://github.com/open-metadata/OpenMetadata/blob/main/ingestion/LICENSE
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
OpenAPI schema parser for both JSON and YAML formats
"""
import json
from typing import Any, Dict
import yaml
from requests.models import Response
from metadata.utils.logger import ingestion_logger
logger = ingestion_logger()
class OpenAPIParseError(Exception):
"""
Exception raised when OpenAPI schema cannot be parsed as either JSON or YAML
"""
def parse_openapi_schema(response: Response) -> Dict[str, Any]:
"""
Parse OpenAPI schema from HTTP response.
Supports both JSON and YAML formats.
Args:
response: HTTP response containing OpenAPI schema
Returns:
Parsed OpenAPI schema as dictionary
Raises:
OpenAPIParseError: If content cannot be parsed as either JSON or YAML
"""
content = response.text
content_type = response.headers.get("content-type", "").lower()
# Try to determine format from content-type header
if "json" in content_type:
try:
return json.loads(content)
except json.JSONDecodeError as e:
logger.warning(f"Failed to parse as JSON despite content-type: {e}")
elif "yaml" in content_type or "yml" in content_type:
try:
return yaml.safe_load(content)
except yaml.YAMLError as e:
logger.warning(f"Failed to parse as YAML despite content-type: {e}")
# If content-type is not definitive or parsing failed, try both formats
# First try JSON (backward compatibility)
try:
parsed = json.loads(content)
logger.debug("Successfully parsed OpenAPI schema as JSON")
return parsed
except json.JSONDecodeError:
logger.debug("Content is not valid JSON, trying YAML")
# Then try YAML
try:
parsed = yaml.safe_load(content)
if parsed is None:
raise OpenAPIParseError("YAML parsing returned None")
logger.debug("Successfully parsed OpenAPI schema as YAML")
return parsed
except yaml.YAMLError as e:
logger.error(f"Failed to parse as YAML: {e}")
# If both formats fail, raise an error
raise OpenAPIParseError(
"Failed to parse OpenAPI schema as either JSON or YAML. "
"Please ensure the schema is valid OpenAPI specification."
)
def validate_openapi_schema(schema: Dict[str, Any]) -> bool:
"""
Validate that the parsed schema is a valid OpenAPI specification.
Args:
schema: Parsed schema dictionary
Returns:
True if schema appears to be valid OpenAPI, False otherwise
"""
# Check for required OpenAPI fields
if not isinstance(schema, dict):
return False
# OpenAPI 3.x uses "openapi" field, OpenAPI 2.x uses "swagger" field
return schema.get("openapi") is not None or schema.get("swagger") is not None

View File

@ -0,0 +1,200 @@
# Copyright 2024 Collate
# Licensed under the Collate Community License, Version 1.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# https://github.com/open-metadata/OpenMetadata/blob/main/ingestion/LICENSE
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Test OpenAPI schema parser for both JSON and YAML formats
"""
import json
from unittest import TestCase
from unittest.mock import Mock
import yaml
from metadata.ingestion.source.api.rest.parser import (
OpenAPIParseError,
parse_openapi_schema,
validate_openapi_schema,
)
class TestOpenAPIParser(TestCase):
"""Test cases for OpenAPI schema parser"""
def setUp(self):
"""Set up test data"""
self.openapi_data = {
"openapi": "3.0.3",
"info": {"title": "Test API", "version": "1.0.0"},
"paths": {
"/pets": {
"get": {
"tags": ["pets"],
"summary": "List all pets",
"operationId": "listPets",
}
}
},
"tags": [{"name": "pets", "description": "Everything about pets"}],
}
self.swagger_data = {
"swagger": "2.0",
"info": {"title": "Test API", "version": "1.0.0"},
"paths": {
"/pets": {
"get": {
"tags": ["pets"],
"summary": "List all pets",
"operationId": "listPets",
}
}
},
"tags": [{"name": "pets", "description": "Everything about pets"}],
}
def test_parse_json_openapi_schema(self):
"""Test parsing valid JSON OpenAPI schema"""
mock_response = Mock()
mock_response.text = json.dumps(self.openapi_data)
mock_response.headers = {"content-type": "application/json"}
result = parse_openapi_schema(mock_response)
self.assertEqual(result, self.openapi_data)
def test_parse_yaml_openapi_schema(self):
"""Test parsing valid YAML OpenAPI schema"""
mock_response = Mock()
mock_response.text = yaml.dump(self.openapi_data)
mock_response.headers = {"content-type": "application/yaml"}
result = parse_openapi_schema(mock_response)
self.assertEqual(result, self.openapi_data)
def test_parse_json_without_content_type(self):
"""Test parsing JSON when content-type is not specified"""
mock_response = Mock()
mock_response.text = json.dumps(self.openapi_data)
mock_response.headers = {}
result = parse_openapi_schema(mock_response)
self.assertEqual(result, self.openapi_data)
def test_parse_yaml_without_content_type(self):
"""Test parsing YAML when content-type is not specified"""
mock_response = Mock()
# Use YAML format that's not valid JSON to ensure YAML parser is used
yaml_content = """
openapi: 3.0.3
info:
title: Test API
version: 1.0.0
paths:
/pets:
get:
tags:
- pets
summary: List all pets
operationId: listPets
tags:
- name: pets
description: Everything about pets
"""
mock_response.text = yaml_content
mock_response.headers = {}
result = parse_openapi_schema(mock_response)
self.assertEqual(result["openapi"], "3.0.3")
self.assertEqual(result["info"]["title"], "Test API")
def test_parse_yml_content_type(self):
"""Test parsing with content-type: application/yml"""
mock_response = Mock()
mock_response.text = yaml.dump(self.openapi_data)
mock_response.headers = {"content-type": "application/yml"}
result = parse_openapi_schema(mock_response)
self.assertEqual(result, self.openapi_data)
def test_parse_text_yaml_content_type(self):
"""Test parsing with content-type: text/yaml"""
mock_response = Mock()
mock_response.text = yaml.dump(self.openapi_data)
mock_response.headers = {"content-type": "text/yaml"}
result = parse_openapi_schema(mock_response)
self.assertEqual(result, self.openapi_data)
def test_parse_empty_content(self):
"""Test parsing empty content"""
mock_response = Mock()
mock_response.text = ""
mock_response.headers = {}
with self.assertRaises(OpenAPIParseError):
parse_openapi_schema(mock_response)
def test_parse_none_yaml_content(self):
"""Test parsing YAML that returns None"""
mock_response = Mock()
mock_response.text = "# This is just a comment\n"
mock_response.headers = {}
with self.assertRaises(OpenAPIParseError):
parse_openapi_schema(mock_response)
def test_validate_openapi_schema_valid_openapi3(self):
"""Test validation of valid OpenAPI 3.x schema"""
self.assertTrue(validate_openapi_schema(self.openapi_data))
def test_validate_openapi_schema_valid_swagger2(self):
"""Test validation of valid Swagger 2.0 schema"""
self.assertTrue(validate_openapi_schema(self.swagger_data))
def test_validate_openapi_schema_invalid_not_dict(self):
"""Test validation of invalid schema (not a dictionary)"""
self.assertFalse(validate_openapi_schema("not a dict"))
self.assertFalse(validate_openapi_schema(123))
self.assertFalse(validate_openapi_schema([]))
def test_validate_openapi_schema_invalid_missing_fields(self):
"""Test validation of invalid schema (missing required fields)"""
invalid_schema = {"info": {"title": "Test API", "version": "1.0.0"}}
self.assertFalse(validate_openapi_schema(invalid_schema))
def test_parse_json_with_wrong_content_type_fallback(self):
"""Test parsing JSON with wrong content-type, should fallback to JSON parser"""
mock_response = Mock()
mock_response.text = json.dumps(self.openapi_data)
mock_response.headers = {"content-type": "text/plain"} # Wrong content-type
result = parse_openapi_schema(mock_response)
self.assertEqual(result, self.openapi_data)
def test_parse_yaml_with_wrong_content_type_fallback(self):
"""Test parsing YAML with wrong content-type, should fallback to YAML parser"""
mock_response = Mock()
# YAML that's not valid JSON
yaml_content = "openapi: '3.0.3'\ninfo:\n title: Test\n version: '1.0'"
mock_response.text = yaml_content
mock_response.headers = {"content-type": "text/plain"} # Wrong content-type
result = parse_openapi_schema(mock_response)
self.assertEqual(result["openapi"], "3.0.3")
self.assertEqual(result["info"]["title"], "Test")

View File

@ -276,7 +276,7 @@ class RESTTest(TestCase):
):
collections = list(self.rest_source.get_api_collections())
MOCK_COLLECTIONS_COPY = deepcopy(MOCK_COLLECTIONS)
MOCK_COLLECTIONS_COPY[2].description = None
MOCK_COLLECTIONS_COPY[2].description = Markdown(root="Operations about user")
assert collections == MOCK_COLLECTIONS_COPY
def test_generate_collection_url(self):