diff --git a/ingestion/src/metadata/ingestion/source/database/column_type_parser.py b/ingestion/src/metadata/ingestion/source/database/column_type_parser.py index a9b2210d5b7..3749b3d6338 100644 --- a/ingestion/src/metadata/ingestion/source/database/column_type_parser.py +++ b/ingestion/src/metadata/ingestion/source/database/column_type_parser.py @@ -1,6 +1,7 @@ import re from typing import Any, Dict, List, Optional, Tuple, Type, Union +from sqlalchemy.dialects.postgresql import BYTEA from sqlalchemy.sql import sqltypes as types from sqlalchemy.types import TypeEngine @@ -59,6 +60,7 @@ class ColumnTypeParser: # Custom wrapper types enriching SQA type system sqa_types.SQAMap: "MAP", sqa_types.SQAStruct: "STRUCT", + BYTEA: "BYTEA", } _SOURCE_TYPE_TO_OM_TYPE = { @@ -172,6 +174,7 @@ class ColumnTypeParser: "UUID": "UUID", "POINT": "POINT", "POLYGON": "POLYGON", + "BYTEA": "BYTEA", } _COMPLEX_TYPE = re.compile("^(struct|map|array|uniontype)") diff --git a/ingestion/src/metadata/orm_profiler/orm/converter.py b/ingestion/src/metadata/orm_profiler/orm/converter.py index 75acde0a36a..0421525e569 100644 --- a/ingestion/src/metadata/orm_profiler/orm/converter.py +++ b/ingestion/src/metadata/orm_profiler/orm/converter.py @@ -66,6 +66,7 @@ _TYPE_MAP = { DataType.ENUM: sqlalchemy.Enum, DataType.JSON: sqlalchemy.JSON, DataType.UUID: CustomTypes.UUID.value, + DataType.BYTEA: CustomTypes.BYTEA.value, } SQA_RESERVED_ATTRIBUTES = ["metadata"] diff --git a/ingestion/src/metadata/orm_profiler/orm/registry.py b/ingestion/src/metadata/orm_profiler/orm/registry.py index d3881764edf..889feffe00f 100644 --- a/ingestion/src/metadata/orm_profiler/orm/registry.py +++ b/ingestion/src/metadata/orm_profiler/orm/registry.py @@ -18,6 +18,7 @@ from sqlalchemy import Date, DateTime, Integer, Numeric, Time from sqlalchemy.sql.sqltypes import Concatenable, Enum from metadata.ingestion.source import sqa_types +from metadata.orm_profiler.orm.types.bytea_to_string import ByteaToHex from metadata.orm_profiler.orm.types.hex_byte_string import HexByteString from metadata.orm_profiler.orm.types.uuid import UUIDString from metadata.orm_profiler.registry import TypeRegistry @@ -26,6 +27,7 @@ from metadata.orm_profiler.registry import TypeRegistry class CustomTypes(TypeRegistry): BYTES = HexByteString UUID = UUIDString + BYTEA = ByteaToHex class Dialects(Enum): diff --git a/ingestion/src/metadata/orm_profiler/orm/types/bytea_to_string.py b/ingestion/src/metadata/orm_profiler/orm/types/bytea_to_string.py new file mode 100644 index 00000000000..a95f52a05be --- /dev/null +++ b/ingestion/src/metadata/orm_profiler/orm/types/bytea_to_string.py @@ -0,0 +1,52 @@ +# Copyright 2021 Collate +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Expand sqlalchemy types to map them to OpenMetadata DataType +""" +# pylint: disable=duplicate-code + +from typing import Optional + +from sqlalchemy.sql.sqltypes import String, TypeDecorator + + +class ByteaToHex(TypeDecorator): + """convert bytea type to string""" + + impl = String + cache_ok = True + + @property + def python_type(self): + return str + + @staticmethod + def validate(value: bytes): + """ + Make sure the data is of correct type + """ + if not isinstance(value, (memoryview, bytes)): + raise TypeError("ByteaToString columns support only memoryview values.") + + def process_result_value(self, value: str, dialect) -> Optional[str]: + """This is executed during result retrieval + + Args: + value: database record + dialect: database dialect + Returns: + hex string representation of the byte value + """ + if not value: + return None + self.validate(value) + return value.hex() diff --git a/ingestion/tests/unit/profiler/custom_types/test_custom_types.py b/ingestion/tests/unit/profiler/custom_types/test_custom_types.py new file mode 100644 index 00000000000..c4f34ef2458 --- /dev/null +++ b/ingestion/tests/unit/profiler/custom_types/test_custom_types.py @@ -0,0 +1,82 @@ +# Copyright 2021 Collate +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Test SQA custom types are correctly maped""" + +from unittest import TestCase + +from sqlalchemy import TEXT, Column, Integer, String, create_engine, func +from sqlalchemy.orm import Session, declarative_base + +from metadata.orm_profiler.orm.types.bytea_to_string import ByteaToHex + +Base = declarative_base() + + +class User(Base): + __tablename__ = "users" + id = Column(Integer, primary_key=True) + name = Column(String(256)) + fullname = Column(String(256)) + nickname = Column(String(256)) + comments = Column(TEXT) + age = Column(Integer) + config = Column(ByteaToHex) + + +class TestCustomTypes(TestCase): + """test for customer sqa types""" + + engine = create_engine("sqlite+pysqlite:///:memory:", echo=False, future=True) + session = Session(engine) + + @classmethod + def setUpClass(cls) -> None: + User.__table__.create(bind=cls.engine) + + for i in range(10): + data = [ + User( + name="John", + fullname="John Doe", + nickname="johnny b goode", + comments="no comments", + age=30, + config=memoryview(b"foo"), + ), + User( + name="Jane", + fullname="Jone Doe", + nickname=None, + comments="maybe some comments", + age=31, + config=memoryview(b"bar"), + ), + User( + name="John", + fullname="John Doe", + nickname=None, + comments=None, + age=None, + config=memoryview(b"fooBar"), + ), + ] + cls.session.add_all(data) + cls.session.commit() + + def test_bytea_to_hex(self): + """test ByteaToHex correctly returns an hex from a memoryview value""" + assert isinstance(self.session.query(User.config).first().config, str) + + @classmethod + def tearDownClass(cls) -> None: + User.__table__.drop(bind=cls.engine) + return super().tearDownClass() diff --git a/openmetadata-spec/src/main/resources/json/schema/entity/data/table.json b/openmetadata-spec/src/main/resources/json/schema/entity/data/table.json index d870ac99397..1c1342f8881 100644 --- a/openmetadata-spec/src/main/resources/json/schema/entity/data/table.json +++ b/openmetadata-spec/src/main/resources/json/schema/entity/data/table.json @@ -94,7 +94,8 @@ "VARIANT", "GEOMETRY", "POINT", - "POLYGON" + "POLYGON", + "BYTEA" ] }, "constraint": {