# OpenMetadata/ingestion/tests/unit/test_helpers.py

#  Copyright 2021 Collate
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#  http://www.apache.org/licenses/LICENSE-2.0
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
"""
Test helpers module
"""
import uuid
from unittest import TestCase

from metadata.generated.schema.entity.data.table import Column, DataType, Table
from metadata.generated.schema.type.tagLabel import (
    LabelType,
    State,
    TagLabel,
    TagSource,
)
from metadata.utils.helpers import (
    clean_up_starting_ending_double_quotes_in_string,
    deep_size_of_dict,
    format_large_string_numbers,
    get_entity_tier_from_tags,
    is_safe_sql_query,
    list_to_dict,
)


class TestHelpers(TestCase):
    """
    Unit tests for the functions imported from ``metadata.utils.helpers``.

    All checks use ``unittest`` assertion methods rather than bare ``assert``
    statements: bare asserts are removed when Python runs with ``-O`` (which
    would make the tests pass vacuously) and the assertion methods produce
    richer failure messages.
    """

    def test_list_to_dict(self):
        """list_to_dict maps "key=value" items to a dict; [] and None give {}"""
        original = ["key=value", "a=b"]

        self.assertEqual(list_to_dict(original=original), {"key": "value", "a": "b"})
        self.assertEqual(list_to_dict([]), {})
        self.assertEqual(list_to_dict(None), {})

    def test_clean_up_starting_ending_double_quotes_in_string(self):
        """surrounding double quotes are expected to be stripped from the string"""
        input_ = '"password"'
        output_ = "password"

        self.assertEqual(
            clean_up_starting_ending_double_quotes_in_string(input_), output_
        )

    def test_get_entity_tier_from_tags(self):
        """test correct entity tier are returned"""
        # Entity carrying both a Tier tag and an unrelated classification tag.
        table_entity_w_tier = Table(
            id=uuid.uuid4(),
            name="table_entity_test",
            columns=[Column(name="col1", dataType=DataType.STRING)],
            tags=[
                TagLabel(
                    tagFQN="Tier.Tier1",
                    source=TagSource.Classification,
                    labelType=LabelType.Automated,
                    state=State.Confirmed,
                ),
                TagLabel(
                    tagFQN="Foo.Bar",
                    source=TagSource.Classification,
                    labelType=LabelType.Automated,
                    state=State.Confirmed,
                ),
            ],
        )

        self.assertEqual(
            get_entity_tier_from_tags(table_entity_w_tier.tags), "Tier.Tier1"
        )

        # Entity with no Tier tag at all: the helper is expected to return None.
        table_entity_wo_tier = Table(
            id=uuid.uuid4(),
            name="table_entity_test",
            columns=[Column(name="col1", dataType=DataType.STRING)],
            tags=[
                TagLabel(
                    tagFQN="Foo.Bar",
                    source=TagSource.Classification,
                    labelType=LabelType.Automated,
                    state=State.Confirmed,
                )
            ],
        )

        self.assertIsNone(get_entity_tier_from_tags(table_entity_wo_tier.tags))

    def test_deep_size_of_dict(self):
        """test deep size of dict"""
        test_dict = {
            "a": 1,
            "b": {"c": 2, "d": {"e": "Hello World", "f": [4, 5, 6]}},
        }

        # Measure once and check both bounds against the same value.
        # NOTE(review): the unit is presumably bytes and the exact figure is
        # interpreter dependent, hence a range instead of an exact number.
        size = deep_size_of_dict(test_dict)

        self.assertGreaterEqual(size, 1000)
        self.assertLessEqual(size, 1500)

    def test_is_safe_sql_query(self):
        """Test is_safe_sql_query function"""

        delete_query = """
         DELETE FROM airflow_task_instance
         WHERE dag_id = 'test_dag_id'
         """

        drop_query = """
         DROP TABLE IF EXISTS test_table
         """

        create_query = """
         CREATE TABLE test_table (
             id INT,
             name VARCHAR(255)
         )
         """

        select_query = """
         SELECT * FROM test_table
         """

        cte_query = """
         WITH foo AS (
             SELECT * FROM test_table
         )
         SELECT * FROM foo
         """

        transaction_query = """
         BEGIN TRAN T1;  
             UPDATE table1 ...;  
             BEGIN TRAN M2 WITH MARK;  
                 UPDATE table2 ...;  
                 SELECT * from table1;  
             COMMIT TRAN M2;  
             UPDATE table3 ...;  
         COMMIT TRAN T1;  
         """

        # Statements that modify data/schema, or wrap such statements in a
        # transaction, must be rejected.
        unsafe_queries = {
            "delete": delete_query,
            "drop": drop_query,
            "create": create_query,
            "transaction": transaction_query,
        }
        # Read-only statements must be accepted.
        safe_queries = {
            "select": select_query,
            "cte": cte_query,
        }

        for label, query in unsafe_queries.items():
            with self.subTest(query=label):
                self.assertFalse(is_safe_sql_query(query))

        for label, query in safe_queries.items():
            with self.subTest(query=label):
                self.assertTrue(is_safe_sql_query(query))

    def test_format_large_string_numbers(self):
        """test format_large_string_numbers"""
        # (input number, expected rendering with three decimals and a suffix)
        cases = (
            (1000, "1.000K"),
            (1001, "1.001K"),
            (1000000, "1.000M"),
            (1000000000, "1.000B"),
            (1000000000000, "1.000T"),
        )

        for number, expected in cases:
            with self.subTest(number=number):
                self.assertEqual(format_large_string_numbers(number), expected)
Fix #6489 - Update backup CLI (#6488) Fix #6489 - Update backup CLI (#6488) 2022-08-03 12:01:57 +02:00			`# Copyright 2021 Collate`
			`# Licensed under the Apache License, Version 2.0 (the "License");`
			`# you may not use this file except in compliance with the License.`
			`# You may obtain a copy of the License at`
			`# http://www.apache.org/licenses/LICENSE-2.0`
			`# Unless required by applicable law or agreed to in writing, software`
			`# distributed under the License is distributed on an "AS IS" BASIS,`
			`# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`# See the License for the specific language governing permissions and`
			`# limitations under the License.`
			`"""`
			`Test helpers module`
			`"""`
Fixes #8206 -- Implement data insight refinement for entity report data (#8351) * added data insight module to refine and ingest data * Added json schema for data insight workflow * Added mixin for data insight * Added tests for data insight workflow and methods * Fixed python style * Fixed python linting * Fixed unit test * Added data insight worflow + workflow logging * Fix python style * Renamed analytic -> data_insight + added test to make + fixed style * Fixed java style * Fixed code smells + python style * Ignore ES linting for now * Fixed comments from review * fix python formatting * Fixe PipelineType attribute for data insight 2022-10-26 11:18:08 +02:00			`import uuid`
Fix #6489 - Update backup CLI (#6488) Fix #6489 - Update backup CLI (#6488) 2022-08-03 12:01:57 +02:00			`from unittest import TestCase`

Fixes #8206 -- Implement data insight refinement for entity report data (#8351) * added data insight module to refine and ingest data * Added json schema for data insight workflow * Added mixin for data insight * Added tests for data insight workflow and methods * Fixed python style * Fixed python linting * Fixed unit test * Added data insight worflow + workflow logging * Fix python style * Renamed analytic -> data_insight + added test to make + fixed style * Fixed java style * Fixed code smells + python style * Ignore ES linting for now * Fixed comments from review * fix python formatting * Fixe PipelineType attribute for data insight 2022-10-26 11:18:08 +02:00			`from metadata.generated.schema.entity.data.table import Column, DataType, Table`
			`from metadata.generated.schema.type.tagLabel import (`
			`LabelType,`
			`State,`
			`TagLabel,`
			`TagSource,`
			`)`
added logic to clean up quotes from table constraints (#7959) 2022-10-05 16:09:33 +02:00			`from metadata.utils.helpers import (`
			`clean_up_starting_ending_double_quotes_in_string,`
Fixes #11384 - Implement mem. optimization for sys. metrics (#11460) * fix: optimize system metrics retrieval for memory * fix: ran python linting * fix: logic to retrieve unique system metrics operations * fix: added logic to clean up query before parsing it * fix: added E2E tests for rds, bq, snflk system metrics * fix: ran python linting * fix: fix postgres query + add default byte size to env var * fix: ran python linting 2023-05-09 12:05:35 +02:00			`deep_size_of_dict,`
fix: increase floating point precision (#14827) 2024-01-24 09:19:19 +01:00			`format_large_string_numbers,`
Fixes #8206 -- Implement data insight refinement for entity report data (#8351) * added data insight module to refine and ingest data * Added json schema for data insight workflow * Added mixin for data insight * Added tests for data insight workflow and methods * Fixed python style * Fixed python linting * Fixed unit test * Added data insight worflow + workflow logging * Fix python style * Renamed analytic -> data_insight + added test to make + fixed style * Fixed java style * Fixed code smells + python style * Ignore ES linting for now * Fixed comments from review * fix python formatting * Fixe PipelineType attribute for data insight 2022-10-26 11:18:08 +02:00			`get_entity_tier_from_tags,`
Fixes Issue #11438 - Implement threshold and startegy for custom SQL (#11847) * feat: Add threshold and strategy logic on the custom SQL object test * feat: ran python linting * feat: added safety checks for custom sql query * feat: ran python linting 2023-06-02 09:41:31 +02:00			`is_safe_sql_query,`
added logic to clean up quotes from table constraints (#7959) 2022-10-05 16:09:33 +02:00			`list_to_dict,`
			`)`
Fix #6489 - Update backup CLI (#6488) Fix #6489 - Update backup CLI (#6488) 2022-08-03 12:01:57 +02:00

			`class TestHelpers(TestCase):`
			`"""`
			`Test helpers module`
			`"""`

			`def test_list_to_dict(self):`
			`original = ["key=value", "a=b"]`

			`self.assertEqual(list_to_dict(original=original), {"key": "value", "a": "b"})`
			`self.assertEqual(list_to_dict([]), {})`
			`self.assertEqual(list_to_dict(None), {})`
added logic to clean up quotes from table constraints (#7959) 2022-10-05 16:09:33 +02:00
			`def test_clean_up_starting_ending_double_quotes_in_string(self):`
			`input_ = '"password"'`
			`output_ = "password"`

			`assert clean_up_starting_ending_double_quotes_in_string(input_) == output_`
Fixes #8206 -- Implement data insight refinement for entity report data (#8351) * added data insight module to refine and ingest data * Added json schema for data insight workflow * Added mixin for data insight * Added tests for data insight workflow and methods * Fixed python style * Fixed python linting * Fixed unit test * Added data insight worflow + workflow logging * Fix python style * Renamed analytic -> data_insight + added test to make + fixed style * Fixed java style * Fixed code smells + python style * Ignore ES linting for now * Fixed comments from review * fix python formatting * Fixe PipelineType attribute for data insight 2022-10-26 11:18:08 +02:00
			`def test_get_entity_tier_from_tags(self):`
			`"""test correct entity tier are returned"""`
			`table_entity_w_tier = Table(`
			`id=uuid.uuid4(),`
			`name="table_entity_test",`
			`columns=[Column(name="col1", dataType=DataType.STRING)],`
			`tags=[`
			`TagLabel(`
			`tagFQN="Tier.Tier1",`
Fixes #10480 Glossary rename results in rename of Classification with… (#10486) * Fixes #10480 Glossary rename results in rename of Classification with the same name * Rename TagSource Tag to Classification 2023-03-09 00:30:36 -08:00			`source=TagSource.Classification,`
Fixes #8206 -- Implement data insight refinement for entity report data (#8351) * added data insight module to refine and ingest data * Added json schema for data insight workflow * Added mixin for data insight * Added tests for data insight workflow and methods * Fixed python style * Fixed python linting * Fixed unit test * Added data insight worflow + workflow logging * Fix python style * Renamed analytic -> data_insight + added test to make + fixed style * Fixed java style * Fixed code smells + python style * Ignore ES linting for now * Fixed comments from review * fix python formatting * Fixe PipelineType attribute for data insight 2022-10-26 11:18:08 +02:00			`labelType=LabelType.Automated,`
			`state=State.Confirmed,`
			`),`
			`TagLabel(`
			`tagFQN="Foo.Bar",`
Fixes #10480 Glossary rename results in rename of Classification with… (#10486) * Fixes #10480 Glossary rename results in rename of Classification with the same name * Rename TagSource Tag to Classification 2023-03-09 00:30:36 -08:00			`source=TagSource.Classification,`
Fixes #8206 -- Implement data insight refinement for entity report data (#8351) * added data insight module to refine and ingest data * Added json schema for data insight workflow * Added mixin for data insight * Added tests for data insight workflow and methods * Fixed python style * Fixed python linting * Fixed unit test * Added data insight worflow + workflow logging * Fix python style * Renamed analytic -> data_insight + added test to make + fixed style * Fixed java style * Fixed code smells + python style * Ignore ES linting for now * Fixed comments from review * fix python formatting * Fixe PipelineType attribute for data insight 2022-10-26 11:18:08 +02:00			`labelType=LabelType.Automated,`
			`state=State.Confirmed,`
			`),`
			`],`
			`)`

			`assert get_entity_tier_from_tags(table_entity_w_tier.tags) == "Tier.Tier1"`

			`table_entity_wo_tier = Table(`
			`id=uuid.uuid4(),`
			`name="table_entity_test",`
			`columns=[Column(name="col1", dataType=DataType.STRING)],`
			`tags=[`
			`TagLabel(`
			`tagFQN="Foo.Bar",`
Fixes #10480 Glossary rename results in rename of Classification with… (#10486) * Fixes #10480 Glossary rename results in rename of Classification with the same name * Rename TagSource Tag to Classification 2023-03-09 00:30:36 -08:00			`source=TagSource.Classification,`
Fixes #8206 -- Implement data insight refinement for entity report data (#8351) * added data insight module to refine and ingest data * Added json schema for data insight workflow * Added mixin for data insight * Added tests for data insight workflow and methods * Fixed python style * Fixed python linting * Fixed unit test * Added data insight worflow + workflow logging * Fix python style * Renamed analytic -> data_insight + added test to make + fixed style * Fixed java style * Fixed code smells + python style * Ignore ES linting for now * Fixed comments from review * fix python formatting * Fixe PipelineType attribute for data insight 2022-10-26 11:18:08 +02:00			`labelType=LabelType.Automated,`
			`state=State.Confirmed,`
			`)`
			`],`
			`)`

			`assert get_entity_tier_from_tags(table_entity_wo_tier.tags) is None`
Fixes #11384 - Implement mem. optimization for sys. metrics (#11460) * fix: optimize system metrics retrieval for memory * fix: ran python linting * fix: logic to retrieve unique system metrics operations * fix: added logic to clean up query before parsing it * fix: added E2E tests for rds, bq, snflk system metrics * fix: ran python linting * fix: fix postgres query + add default byte size to env var * fix: ran python linting 2023-05-09 12:05:35 +02:00
			`def test_deep_size_of_dict(self):`
			`"""test deep size of dict"""`
			`test_dict = {`
			`"a": 1,`
			`"b": {"c": 2, "d": {"e": "Hello World", "f": [4, 5, 6]}},`
			`}`

			`assert deep_size_of_dict(test_dict) >= 1000`
			`assert deep_size_of_dict(test_dict) <= 1500`
Fixes Issue #11438 - Implement threshold and startegy for custom SQL (#11847) * feat: Add threshold and strategy logic on the custom SQL object test * feat: ran python linting * feat: added safety checks for custom sql query * feat: ran python linting 2023-06-02 09:41:31 +02:00
			`def test_is_safe_sql_query(self):`
			`"""Test is_safe_sql_query function"""`

			`delete_query = """`
			`DELETE FROM airflow_task_instance`
			`WHERE dag_id = 'test_dag_id'`
			`"""`

			`drop_query = """`
			`DROP TABLE IF EXISTS test_table`
			`"""`

			`create_query = """`
			`CREATE TABLE test_table (`
			`id INT,`
			`name VARCHAR(255)`
			`)`
			`"""`

			`select_query = """`
			`SELECT * FROM test_table`
			`"""`

			`cte_query = """`
			`WITH foo AS (`
			`SELECT * FROM test_table`
			`)`
			`SELECT * FROM foo`
			`"""`

			`transaction_query = """`
			`BEGIN TRAN T1;`
			`UPDATE table1 ...;`
			`BEGIN TRAN M2 WITH MARK;`
			`UPDATE table2 ...;`
			`SELECT * from table1;`
			`COMMIT TRAN M2;`
			`UPDATE table3 ...;`
			`COMMIT TRAN T1;`
			`"""`

			`self.assertFalse(is_safe_sql_query(delete_query))`
			`self.assertFalse(is_safe_sql_query(drop_query))`
			`self.assertFalse(is_safe_sql_query(create_query))`
			`self.assertTrue(is_safe_sql_query(select_query))`
			`self.assertTrue(is_safe_sql_query(cte_query))`
			`self.assertFalse(is_safe_sql_query(transaction_query))`
fix: increase floating point precision (#14827) 2024-01-24 09:19:19 +01:00
			`def test_format_large_string_numbers(self):`
			`"""test format_large_string_numbers"""`
			`assert format_large_string_numbers(1000) == "1.000K"`
			`assert format_large_string_numbers(1001) == "1.001K"`
			`assert format_large_string_numbers(1000000) == "1.000M"`
			`assert format_large_string_numbers(1000000000) == "1.000B"`
			`assert format_large_string_numbers(1000000000000) == "1.000T"`