mirror of
https://github.com/open-metadata/OpenMetadata.git
synced 2025-11-04 12:36:23 +00:00
* Added protobuf parser complex schema support * Added options keyword in proto testing
This commit is contained in:
parent
91f930d4c4
commit
87c8254c38
@ -47,6 +47,7 @@ from metadata.parsers.schema_parsers import (
|
|||||||
)
|
)
|
||||||
from metadata.utils import fqn
|
from metadata.utils import fqn
|
||||||
from metadata.utils.logger import ingestion_logger
|
from metadata.utils.logger import ingestion_logger
|
||||||
|
from metadata.utils.messaging_utils import merge_and_clean_protobuf_schema
|
||||||
|
|
||||||
logger = ingestion_logger()
|
logger = ingestion_logger()
|
||||||
|
|
||||||
@ -75,6 +76,7 @@ class CommonBrokerSource(MessagingServiceSource, ABC):
|
|||||||
self.generate_sample_data = self.config.sourceConfig.config.generateSampleData
|
self.generate_sample_data = self.config.sourceConfig.config.generateSampleData
|
||||||
self.admin_client = self.connection.admin_client
|
self.admin_client = self.connection.admin_client
|
||||||
self.schema_registry_client = self.connection.schema_registry_client
|
self.schema_registry_client = self.connection.schema_registry_client
|
||||||
|
self.context.processed_schemas = {}
|
||||||
if self.generate_sample_data:
|
if self.generate_sample_data:
|
||||||
self.consumer_client = self.connection.consumer_client
|
self.consumer_client = self.connection.consumer_client
|
||||||
|
|
||||||
@ -125,9 +127,14 @@ class CommonBrokerSource(MessagingServiceSource, ABC):
|
|||||||
raise InvalidSchemaTypeException(
|
raise InvalidSchemaTypeException(
|
||||||
f"Cannot find {schema_type} in parser providers registry."
|
f"Cannot find {schema_type} in parser providers registry."
|
||||||
)
|
)
|
||||||
schema_fields = load_parser_fn(
|
schema_text = topic_schema.schema_str
|
||||||
topic_details.topic_name, topic_schema.schema_str
|
|
||||||
)
|
# In protobuf schema, we need to merge all the schema text with references
|
||||||
|
if schema_type == SchemaType.Protobuf.value.lower():
|
||||||
|
schema_text = merge_and_clean_protobuf_schema(
|
||||||
|
self._get_schema_text_with_references(schema=topic_schema)
|
||||||
|
)
|
||||||
|
schema_fields = load_parser_fn(topic_details.topic_name, schema_text)
|
||||||
|
|
||||||
topic.messageSchema = Topic(
|
topic.messageSchema = Topic(
|
||||||
schemaText=topic_schema.schema_str,
|
schemaText=topic_schema.schema_str,
|
||||||
@ -195,6 +202,38 @@ class CommonBrokerSource(MessagingServiceSource, ABC):
|
|||||||
f"Exception adding properties to topic [{topic.name}]: {exc}"
|
f"Exception adding properties to topic [{topic.name}]: {exc}"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def _get_schema_text_with_references(self, schema) -> Optional[str]:
|
||||||
|
"""
|
||||||
|
Returns the schema text with references resolved using recursive calls
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
if schema:
|
||||||
|
schema_text = schema.schema_str
|
||||||
|
for reference in schema.references or []:
|
||||||
|
if not self.context.processed_schemas.get(reference.name):
|
||||||
|
self.context.processed_schemas[reference.name] = True
|
||||||
|
reference_schema = (
|
||||||
|
self.schema_registry_client.get_latest_version(
|
||||||
|
reference.name
|
||||||
|
)
|
||||||
|
)
|
||||||
|
if reference_schema.schema.references:
|
||||||
|
schema_text = (
|
||||||
|
schema_text
|
||||||
|
+ self._get_schema_text_with_references(
|
||||||
|
reference_schema.schema
|
||||||
|
)
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
schema_text = (
|
||||||
|
schema_text + reference_schema.schema.schema_str
|
||||||
|
)
|
||||||
|
return schema_text
|
||||||
|
except Exception as exc:
|
||||||
|
logger.debug(traceback.format_exc())
|
||||||
|
logger.warning(f"Failed to get schema with references: {exc}")
|
||||||
|
return None
|
||||||
|
|
||||||
def _parse_topic_metadata(self, topic_name: str) -> Optional[Schema]:
|
def _parse_topic_metadata(self, topic_name: str) -> Optional[Schema]:
|
||||||
try:
|
try:
|
||||||
if self.schema_registry_client:
|
if self.schema_registry_client:
|
||||||
|
|||||||
43
ingestion/src/metadata/utils/messaging_utils.py
Normal file
43
ingestion/src/metadata/utils/messaging_utils.py
Normal file
@ -0,0 +1,43 @@
|
|||||||
|
# Copyright 2024 Collate
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
"""
|
||||||
|
Helpers module for messaging sources
|
||||||
|
"""
|
||||||
|
|
||||||
|
import re
|
||||||
|
import traceback
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
from metadata.utils.logger import utils_logger
|
||||||
|
|
||||||
|
logger = utils_logger()
|
||||||
|
|
||||||
|
|
||||||
|
def merge_and_clean_protobuf_schema(schema_text: Optional[str]) -> Optional[str]:
|
||||||
|
"""
|
||||||
|
Remove the import and extra syntax lines for a schema with references
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
lines = schema_text.splitlines() if schema_text else []
|
||||||
|
new_lines = []
|
||||||
|
for i, line in enumerate(lines):
|
||||||
|
if not re.search(r'import ".*";', line) and not re.search(
|
||||||
|
r"option .*;", line
|
||||||
|
):
|
||||||
|
if re.search(r'\s*syntax\s*=\s*"proto\d+";\s*', line) and i != 0:
|
||||||
|
continue
|
||||||
|
new_lines.append(line)
|
||||||
|
return "\n".join(new_lines)
|
||||||
|
except Exception as exc:
|
||||||
|
logger.debug(traceback.format_exc())
|
||||||
|
logger.warning(f"Failed to merge and clean protobuf schema: {exc}")
|
||||||
|
return None
|
||||||
@ -0,0 +1,9 @@
|
|||||||
|
syntax = "proto3";
|
||||||
|
|
||||||
|
message Address {
|
||||||
|
string street = 1;
|
||||||
|
string city = 2;
|
||||||
|
string state = 3;
|
||||||
|
string zip = 4;
|
||||||
|
string country = 5;
|
||||||
|
}
|
||||||
10
ingestion/tests/unit/resources/protobuf_parser/company.proto
Normal file
10
ingestion/tests/unit/resources/protobuf_parser/company.proto
Normal file
@ -0,0 +1,10 @@
|
|||||||
|
syntax = "proto3";
|
||||||
|
|
||||||
|
import "address.proto";
|
||||||
|
import "contact.proto";
|
||||||
|
|
||||||
|
message Company {
|
||||||
|
string name = 1;
|
||||||
|
Address address = 2;
|
||||||
|
Contact contact = 3;
|
||||||
|
}
|
||||||
@ -0,0 +1,9 @@
|
|||||||
|
syntax = "proto3";
|
||||||
|
|
||||||
|
option java_package = "group.org.prod.schemas.contact";
|
||||||
|
option java_outer_classname = "ContactProto";
|
||||||
|
|
||||||
|
message Contact {
|
||||||
|
string email = 1;
|
||||||
|
string phone = 2;
|
||||||
|
}
|
||||||
@ -0,0 +1,8 @@
|
|||||||
|
syntax = "proto3";
|
||||||
|
|
||||||
|
import "company.proto";
|
||||||
|
|
||||||
|
message Department {
|
||||||
|
string name = 1;
|
||||||
|
Company company = 2;
|
||||||
|
}
|
||||||
@ -0,0 +1,14 @@
|
|||||||
|
syntax = "proto3";
|
||||||
|
|
||||||
|
import "team.proto";
|
||||||
|
import "contact.proto";
|
||||||
|
|
||||||
|
option java_package = "group.org.prod.schemas.customers";
|
||||||
|
option java_outer_classname = "EmployeeProto";
|
||||||
|
|
||||||
|
message Employee {
|
||||||
|
string first_name = 1;
|
||||||
|
string last_name = 2;
|
||||||
|
Team team = 3;
|
||||||
|
Contact contact = 4;
|
||||||
|
}
|
||||||
@ -0,0 +1,8 @@
|
|||||||
|
syntax = "proto3";
|
||||||
|
|
||||||
|
import "department.proto";
|
||||||
|
|
||||||
|
message Team {
|
||||||
|
string name = 1;
|
||||||
|
Department department = 2;
|
||||||
|
}
|
||||||
@ -13,10 +13,12 @@
|
|||||||
Protobuf parser tests
|
Protobuf parser tests
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
import os
|
||||||
from unittest import TestCase
|
from unittest import TestCase
|
||||||
|
|
||||||
from metadata.generated.schema.entity.data.table import Column
|
from metadata.generated.schema.entity.data.table import Column
|
||||||
from metadata.parsers.protobuf_parser import ProtobufParser, ProtobufParserConfig
|
from metadata.parsers.protobuf_parser import ProtobufParser, ProtobufParserConfig
|
||||||
|
from metadata.utils.messaging_utils import merge_and_clean_protobuf_schema
|
||||||
|
|
||||||
|
|
||||||
class ProtobufParserTests(TestCase):
|
class ProtobufParserTests(TestCase):
|
||||||
@ -94,3 +96,33 @@ class ProtobufParserTests(TestCase):
|
|||||||
parsed_schema = self.protobuf_parser.parse_protobuf_schema(cls=Column)
|
parsed_schema = self.protobuf_parser.parse_protobuf_schema(cls=Column)
|
||||||
field_types = {str(field.dataType.name) for field in parsed_schema[0].children}
|
field_types = {str(field.dataType.name) for field in parsed_schema[0].children}
|
||||||
self.assertEqual(field_types, {"INT", "ENUM", "RECORD", "STRING", "BOOLEAN"})
|
self.assertEqual(field_types, {"INT", "ENUM", "RECORD", "STRING", "BOOLEAN"})
|
||||||
|
|
||||||
|
def test_complex_protobuf_schema_files(self):
|
||||||
|
"""
|
||||||
|
We'll read the files under ./ingestion/tests/unit/resources/protobuf_parser and parse them
|
||||||
|
This will be similar in way to how we get the data from kafka source
|
||||||
|
"""
|
||||||
|
resource_path = "./ingestion/tests/unit/resources/protobuf_parser/"
|
||||||
|
schema_name = "employee"
|
||||||
|
file_list = os.listdir(resource_path)
|
||||||
|
schema_text = ""
|
||||||
|
for file_name in file_list:
|
||||||
|
file_path = os.path.join(resource_path, file_name)
|
||||||
|
with open(file_path, "r") as file:
|
||||||
|
schema_text = schema_text + file.read()
|
||||||
|
schema_text = merge_and_clean_protobuf_schema(schema_text)
|
||||||
|
protobuf_parser = ProtobufParser(
|
||||||
|
config=ProtobufParserConfig(
|
||||||
|
schema_name=schema_name, schema_text=schema_text
|
||||||
|
)
|
||||||
|
)
|
||||||
|
parsed_schema = protobuf_parser.parse_protobuf_schema()
|
||||||
|
self.assertEqual(parsed_schema[0].name.__root__, "Employee")
|
||||||
|
self.assertEqual(len(parsed_schema[0].children), 4)
|
||||||
|
self.assertEqual(parsed_schema[0].children[3].name.__root__, "contact")
|
||||||
|
self.assertEqual(
|
||||||
|
parsed_schema[0].children[3].children[0].name.__root__, "email"
|
||||||
|
)
|
||||||
|
self.assertEqual(
|
||||||
|
parsed_schema[0].children[3].children[1].name.__root__, "phone"
|
||||||
|
)
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user