# datahub/metadata-ingestion/tests/unit/test_bigquery_sql_lineage.py

from datahub.utilities.bigquery_sql_parser import BigQuerySQLParser


def test_bigquery_sql_lineage_hash_as_comment_sign_is_accepted():
    """Check that `#` comments in a query do not disturb table extraction.

    The statement mixes a block comment, `--` comments and `#` comments around
    a CREATE OR REPLACE TABLE ... AS SELECT; only the table named in the FROM
    clause is expected back from ``get_tables()``.
    """
    query = """
/*
HERE IS A STANDARD COMMENT BLOCK
THIS WILL NOT BREAK sqllineage
*/
CREATE OR REPLACE TABLE `project.dataset.trg_tbl`AS
#This, comment will not break sqllineage
SELECT foo
-- this comment will not break sqllineage either
# this comment will not break sqllineage either
FROM `project.dataset.src_tbl`
        """
    expected_tables = ["project.dataset.src_tbl"]

    tables = BigQuerySQLParser(sql_query=query).get_tables()

    assert tables == expected_tables


def test_bigquery_sql_lineage_camel_case_table():
    """
    Exercise the parameter that disables sqllineage lowercasing.
    BigQuery allows uppercase characters in dataset and table names.
    Lowercasing them, which is the default, breaks lineage construction in these cases.

    Here only the source table name is camel-cased.
    """
    query = """
/*
HERE IS A STANDARD COMMENT BLOCK
THIS WILL NOT BREAK sqllineage
*/
CREATE OR REPLACE TABLE `project.dataset.trg_tbl`AS
#This, comment will not break sqllineage
SELECT foo, bar
-- this comment will not break sqllineage either
# this comment will not break sqllineage either
FROM `project.dataset.CamelCaseTable`
        """
    expected_tables = ["project.dataset.CamelCaseTable"]

    tables = BigQuerySQLParser(sql_query=query, use_raw_names=True).get_tables()

    assert tables == expected_tables


def test_bigquery_sql_lineage_camel_case_dataset():
    """
    Exercise the parameter that disables sqllineage lowercasing.
    BigQuery allows uppercase characters in dataset and table names.
    Lowercasing them, which is the default, breaks lineage construction in these cases.

    Here only the source dataset name is camel-cased.
    """
    query = """
/*
HERE IS A STANDARD COMMENT BLOCK
THIS WILL NOT BREAK sqllineage
*/
CREATE OR REPLACE TABLE `project.dataset.trg_tbl`AS
#This, comment will not break sqllineage
SELECT foo, bar
-- this comment will not break sqllineage either
# this comment will not break sqllineage either
FROM `project.DataSet.table`
        """
    expected_tables = ["project.DataSet.table"]

    tables = BigQuerySQLParser(sql_query=query, use_raw_names=True).get_tables()

    assert tables == expected_tables


def test_bigquery_sql_lineage_camel_case_table_and_dataset():
    """
    Exercise the parameter that disables sqllineage lowercasing.
    BigQuery allows uppercase characters in dataset and table names.
    Lowercasing them, which is the default, breaks lineage construction in these cases.

    Here both the source dataset and the source table are camel-cased.
    """
    query = """
/*
HERE IS A STANDARD COMMENT BLOCK
THIS WILL NOT BREAK sqllineage
*/
CREATE OR REPLACE TABLE `project.dataset.trg_tbl`AS
#This, comment will not break sqllineage
SELECT foo, bar
-- this comment will not break sqllineage either
# this comment will not break sqllineage either
FROM `project.DataSet.CamelTable`
        """
    expected_tables = ["project.DataSet.CamelTable"]

    tables = BigQuerySQLParser(sql_query=query, use_raw_names=True).get_tables()

    assert tables == expected_tables


def test_bigquery_sql_lineage_camel_case_table_and_dataset_subquery():
    """
    Exercise the parameter that disables sqllineage lowercasing.
    BigQuery allows uppercase characters in dataset and table names.
    Lowercasing them, which is the default, breaks lineage construction in these cases.

    Here the camel-cased source table is referenced from inside a subquery.
    """
    query = """
/*
HERE IS A STANDARD COMMENT BLOCK
THIS WILL NOT BREAK sqllineage
*/
CREATE OR REPLACE TABLE `project.dataset.trg_tbl`AS
#This, comment will not break sqllineage
SELECT foo, bar
-- this comment will not break sqllineage either
# this comment will not break sqllineage either
FROM (
    # this comment will not break sqllineage either
    SELECT * FROM `project.DataSet.CamelTable`
)
        """
    expected_tables = ["project.DataSet.CamelTable"]

    tables = BigQuerySQLParser(sql_query=query, use_raw_names=True).get_tables()

    assert tables == expected_tables


def test_bigquery_sql_lineage_camel_case_table_and_dataset_joins():
    """
    Exercise the parameter that disables sqllineage lowercasing.
    BigQuery allows uppercase characters in dataset and table names.
    Lowercasing them, which is the default, breaks lineage construction in these cases.

    Here three camel-cased tables are combined with INNER JOIN and LEFT JOIN.
    """
    query = """
/*
HERE IS A STANDARD COMMENT BLOCK
THIS WILL NOT BREAK sqllineage
*/
CREATE OR REPLACE TABLE `project.dataset.trg_tbl`AS
#This, comment will not break sqllineage
SELECT foo, bar
-- this comment will not break sqllineage either
# this comment will not break sqllineage either
FROM `project.DataSet1.CamelTable`
INNER JOIN `project.DataSet2.CamelTable2`
    ON b.id = a.id
LEFT JOIN `project.DataSet3.CamelTable3`
    on c.id = b.id
        """
    # The comparison is against a list, so the order of the entries matters.
    expected_tables = [
        "project.DataSet1.CamelTable",
        "project.DataSet2.CamelTable2",
        "project.DataSet3.CamelTable3",
    ]

    tables = BigQuerySQLParser(sql_query=query, use_raw_names=True).get_tables()

    assert tables == expected_tables


def test_bigquery_sql_lineage_camel_case_table_and_dataset_joins_and_subquery():
    """
    Exercise the parameter that disables sqllineage lowercasing.
    BigQuery allows uppercase characters in dataset and table names.
    Lowercasing them, which is the default, breaks lineage construction in these cases.

    Here aliased camel-cased tables are joined, one of them through a subquery.
    """
    query = """
/*
HERE IS A STANDARD COMMENT BLOCK
THIS WILL NOT BREAK sqllineage
*/
CREATE OR REPLACE TABLE `project.dataset.trg_tbl`AS
#This, comment will not break sqllineage
SELECT foo, bar
-- this comment will not break sqllineage either
# this comment will not break sqllineage either
FROM `project.DataSet1.CamelTable` a
INNER JOIN `project.DataSet2.CamelTable2` b
    ON b.id = a.id
LEFT JOIN (SELECT * FROM `project.DataSet3.CamelTable3`) c
    ON c.id = b.id
        """
    # The comparison is against a list, so the order of the entries matters.
    expected_tables = [
        "project.DataSet1.CamelTable",
        "project.DataSet2.CamelTable2",
        "project.DataSet3.CamelTable3",
    ]

    tables = BigQuerySQLParser(sql_query=query, use_raw_names=True).get_tables()

    assert tables == expected_tables


def test_bigquery_sql_lineage_keyword_data_is_accepted():
    """Check that a CTE named ``data`` is accepted by the parser.

    Only the table read inside the CTE is expected from ``get_tables()``.
    """
    query = """
            WITH data AS (
                SELECT
                    *,
                    'foo' AS bar
                FROM `project.example_dataset.example_table`
            )
            SELECT * FROM data
        """
    expected_tables = ["project.example_dataset.example_table"]

    tables = BigQuerySQLParser(sql_query=query).get_tables()

    assert tables == expected_tables


def test_bigquery_sql_lineage_keyword_admin_is_accepted():
    """Check that a CTE named ``admin`` is accepted by the parser.

    Only the table read inside the CTE is expected from ``get_tables()``.
    """
    query = """
            WITH admin AS (
                SELECT *
                FROM `project.example_dataset.example_table`
            )
            SELECT * FROM admin
        """
    expected_tables = ["project.example_dataset.example_table"]

    tables = BigQuerySQLParser(sql_query=query).get_tables()

    assert tables == expected_tables


def test_bigquery_sql_lineage_cte_alias_as_keyword_is_accepted():
    """Check that a CTE named ``map`` is accepted by the parser.

    The CTE joins a nested subquery over one source table with a second
    source table; both source tables are expected from ``get_tables()``.
    """
    query = """
CREATE OR REPLACE TABLE `project.dataset.test_table` AS
WITH map AS (
        SELECT a.col_1,
               b.col_2
          FROM (
                SELECT DISTINCT *
                  FROM (
                        SELECT col_1
                          FROM `project.dataset.source_table_a`
                       )
               ) a
          JOIN `project.dataset.source_table_b` b
            ON a.col_1 = b.col_1
       )
SELECT *
  FROM map
        """
    # The comparison is against a list, so the order of the entries matters.
    expected_tables = [
        "project.dataset.source_table_a",
        "project.dataset.source_table_b",
    ]

    tables = BigQuerySQLParser(sql_query=query).get_tables()

    assert tables == expected_tables


def test_bigquery_sql_lineage_create_or_replace_view_name_with_hyphens_is_accepted():
    """Check that an unquoted, hyphenated target view name is accepted.

    The view is defined as a UNION over two source tables, one unquoted and
    one backtick-quoted; both are expected from ``get_tables()``.
    """
    query = """
            CREATE OR REPLACE VIEW test-project.dataset.test_view AS
            SELECT *
            FROM project.dataset.src_table_a
            UNION
            SELECT * FROM `project.dataset.src_table_b`
        """
    # The comparison is against a list, so the order of the entries matters.
    expected_tables = [
        "project.dataset.src_table_a",
        "project.dataset.src_table_b",
    ]

    tables = BigQuerySQLParser(sql_query=query).get_tables()

    assert tables == expected_tables


def test_bigquery_sql_lineage_source_table_name_with_hyphens_is_accepted():
    """Check that an unquoted, hyphenated source table name is accepted.

    The hyphenated name is expected back unchanged from ``get_tables()``.
    """
    query = """
            CREATE OR REPLACE VIEW `project.dataset.test_view` AS
            SELECT *
            FROM test-project.dataset.src_table
        """
    expected_tables = ["test-project.dataset.src_table"]

    tables = BigQuerySQLParser(sql_query=query).get_tables()

    assert tables == expected_tables


def test_bigquery_sql_lineage_from_as_column_name_is_accepted():
    """Check that a column named ``from`` does not confuse table extraction.

    The query selects ``x.from``; only the aliased source table is expected
    from ``get_tables()``.
    """
    query = """
            CREATE OR REPLACE VIEW `project.dataset.test_view` AS
            SELECT x.from AS col
            FROM project.dataset.src_table AS x
        """
    expected_tables = ["project.dataset.src_table"]

    tables = BigQuerySQLParser(sql_query=query).get_tables()

    assert tables == expected_tables
fix(bigquery): improve handling of extracted audit log sql queries (#4735) 2022-05-03 11:13:23 +02:00			`from datahub.utilities.bigquery_sql_parser import BigQuerySQLParser`


			`def test_bigquery_sql_lineage_hash_as_comment_sign_is_accepted():`
			`parser = BigQuerySQLParser(`
			`sql_query="""`
			`/*`
			`HERE IS A STANDARD COMMENT BLOCK`
			`THIS WILL NOT BREAK sqllineage`
			`*/`
			CREATE OR REPLACE TABLE `project.dataset.trg_tbl`AS
			`#This, comment will not break sqllineage`
			`SELECT foo`
			`-- this comment will not break sqllineage either`
			`# this comment will not break sqllineage either`
			FROM `project.dataset.src_tbl`
			`"""`
			`)`

			`assert parser.get_tables() == ["project.dataset.src_tbl"]`


refactor(ingest): bigquery-lineage - allow tables and datasets in uppercase (#6739) 2022-12-14 10:58:03 -03:00			`def test_bigquery_sql_lineage_camel_case_table():`
			`"""`
			`This test aims to test the parameter to ignore sqllineage lowercasing.`
			`On the BigQuery service, it's possible to use uppercase name un datasets and tables.`
			`The lowercasing, by default, breaks the lineage construction in these cases.`
			`"""`
			`parser = BigQuerySQLParser(`
			`sql_query="""`
			`/*`
			`HERE IS A STANDARD COMMENT BLOCK`
			`THIS WILL NOT BREAK sqllineage`
			`*/`
			CREATE OR REPLACE TABLE `project.dataset.trg_tbl`AS
			`#This, comment will not break sqllineage`
			`SELECT foo, bar`
			`-- this comment will not break sqllineage either`
			`# this comment will not break sqllineage either`
			FROM `project.dataset.CamelCaseTable`
			`""",`
			`use_raw_names=True,`
			`)`

			`assert parser.get_tables() == ["project.dataset.CamelCaseTable"]`


			`def test_bigquery_sql_lineage_camel_case_dataset():`
			`"""`
			`This test aims to test the parameter to ignore sqllineage lowercasing.`
			`On the BigQuery service, it's possible to use uppercase name un datasets and tables.`
			`The lowercasing, by default, breaks the lineage construction in these cases.`
			`"""`
			`parser = BigQuerySQLParser(`
			`sql_query="""`
			`/*`
			`HERE IS A STANDARD COMMENT BLOCK`
			`THIS WILL NOT BREAK sqllineage`
			`*/`
			CREATE OR REPLACE TABLE `project.dataset.trg_tbl`AS
			`#This, comment will not break sqllineage`
			`SELECT foo, bar`
			`-- this comment will not break sqllineage either`
			`# this comment will not break sqllineage either`
			FROM `project.DataSet.table`
			`""",`
			`use_raw_names=True,`
			`)`

			`assert parser.get_tables() == ["project.DataSet.table"]`


			`def test_bigquery_sql_lineage_camel_case_table_and_dataset():`
			`"""`
			`This test aims to test the parameter to ignore sqllineage lowercasing.`
			`On the BigQuery service, it's possible to use uppercase name un datasets and tables.`
			`The lowercasing, by default, breaks the lineage construction in these cases.`
			`"""`
			`parser = BigQuerySQLParser(`
			`sql_query="""`
			`/*`
			`HERE IS A STANDARD COMMENT BLOCK`
			`THIS WILL NOT BREAK sqllineage`
			`*/`
			CREATE OR REPLACE TABLE `project.dataset.trg_tbl`AS
			`#This, comment will not break sqllineage`
			`SELECT foo, bar`
			`-- this comment will not break sqllineage either`
			`# this comment will not break sqllineage either`
			FROM `project.DataSet.CamelTable`
			`""",`
			`use_raw_names=True,`
			`)`

			`assert parser.get_tables() == ["project.DataSet.CamelTable"]`


			`def test_bigquery_sql_lineage_camel_case_table_and_dataset_subquery():`
			`"""`
			`This test aims to test the parameter to ignore sqllineage lowercasing.`
			`On the BigQuery service, it's possible to use uppercase name un datasets and tables.`
			`The lowercasing, by default, breaks the lineage construction in these cases.`
			`"""`
			`parser = BigQuerySQLParser(`
			`sql_query="""`
			`/*`
			`HERE IS A STANDARD COMMENT BLOCK`
			`THIS WILL NOT BREAK sqllineage`
			`*/`
			CREATE OR REPLACE TABLE `project.dataset.trg_tbl`AS
			`#This, comment will not break sqllineage`
			`SELECT foo, bar`
			`-- this comment will not break sqllineage either`
			`# this comment will not break sqllineage either`
			`FROM (`
			`# this comment will not break sqllineage either`
			SELECT * FROM `project.DataSet.CamelTable`
			`)`
			`""",`
			`use_raw_names=True,`
			`)`

			`assert parser.get_tables() == ["project.DataSet.CamelTable"]`


			`def test_bigquery_sql_lineage_camel_case_table_and_dataset_joins():`
			`"""`
			`This test aims to test the parameter to ignore sqllineage lowercasing.`
			`On the BigQuery service, it's possible to use uppercase name un datasets and tables.`
			`The lowercasing, by default, breaks the lineage construction in these cases.`
			`"""`
			`parser = BigQuerySQLParser(`
			`sql_query="""`
			`/*`
			`HERE IS A STANDARD COMMENT BLOCK`
			`THIS WILL NOT BREAK sqllineage`
			`*/`
			CREATE OR REPLACE TABLE `project.dataset.trg_tbl`AS
			`#This, comment will not break sqllineage`
			`SELECT foo, bar`
			`-- this comment will not break sqllineage either`
			`# this comment will not break sqllineage either`
			FROM `project.DataSet1.CamelTable`
			INNER JOIN `project.DataSet2.CamelTable2`
			`ON b.id = a.id`
			LEFT JOIN `project.DataSet3.CamelTable3`
			`on c.id = b.id`
			`""",`
			`use_raw_names=True,`
			`)`

			`assert parser.get_tables() == [`
			`"project.DataSet1.CamelTable",`
			`"project.DataSet2.CamelTable2",`
			`"project.DataSet3.CamelTable3",`
			`]`


			`def test_bigquery_sql_lineage_camel_case_table_and_dataset_joins_and_subquery():`
			`"""`
			`This test aims to test the parameter to ignore sqllineage lowercasing.`
			`On the BigQuery service, it's possible to use uppercase name un datasets and tables.`
			`The lowercasing, by default, breaks the lineage construction in these cases.`
			`"""`
			`parser = BigQuerySQLParser(`
			`sql_query="""`
			`/*`
			`HERE IS A STANDARD COMMENT BLOCK`
			`THIS WILL NOT BREAK sqllineage`
			`*/`
			CREATE OR REPLACE TABLE `project.dataset.trg_tbl`AS
			`#This, comment will not break sqllineage`
			`SELECT foo, bar`
			`-- this comment will not break sqllineage either`
			`# this comment will not break sqllineage either`
			FROM `project.DataSet1.CamelTable` a
			INNER JOIN `project.DataSet2.CamelTable2` b
			`ON b.id = a.id`
			LEFT JOIN (SELECT * FROM `project.DataSet3.CamelTable3`) c
			`ON c.id = b.id`
			`""",`
			`use_raw_names=True,`
			`)`

			`assert parser.get_tables() == [`
			`"project.DataSet1.CamelTable",`
			`"project.DataSet2.CamelTable2",`
			`"project.DataSet3.CamelTable3",`
			`]`


fix(bigquery): improve handling of extracted audit log sql queries (#4735) 2022-05-03 11:13:23 +02:00			`def test_bigquery_sql_lineage_keyword_data_is_accepted():`
			`parser = BigQuerySQLParser(`
			`sql_query="""`
			`WITH data AS (`
			`SELECT`
			`*,`
			`'foo' AS bar`
			FROM `project.example_dataset.example_table`
			`)`
			`SELECT * FROM data`
			`"""`
			`)`

			`assert parser.get_tables() == ["project.example_dataset.example_table"]`


			`def test_bigquery_sql_lineage_keyword_admin_is_accepted():`
			`parser = BigQuerySQLParser(`
			`sql_query="""`
			`WITH admin AS (`
			`SELECT *`
			FROM `project.example_dataset.example_table`
			`)`
			`SELECT * FROM admin`
			`"""`
			`)`

			`assert parser.get_tables() == ["project.example_dataset.example_table"]`


			`def test_bigquery_sql_lineage_cte_alias_as_keyword_is_accepted():`
			`parser = BigQuerySQLParser(`
			`sql_query="""`
			CREATE OR REPLACE TABLE `project.dataset.test_table` AS
			`WITH map AS (`
			`SELECT a.col_1,`
			`b.col_2`
			`FROM (`
			`SELECT DISTINCT *`
			`FROM (`
			`SELECT col_1`
			FROM `project.dataset.source_table_a`
			`)`
			`) a`
			JOIN `project.dataset.source_table_b` b
			`ON a.col_1 = b.col_1`
			`)`
			`SELECT *`
			`FROM map`
			`"""`
			`)`

			`assert parser.get_tables() == [`
			`"project.dataset.source_table_a",`
			`"project.dataset.source_table_b",`
			`]`


			`def test_bigquery_sql_lineage_create_or_replace_view_name_with_hyphens_is_accepted():`
			`parser = BigQuerySQLParser(`
			`sql_query="""`
			`CREATE OR REPLACE VIEW test-project.dataset.test_view AS`
			`SELECT *`
			`FROM project.dataset.src_table_a`
			`UNION`
			SELECT * FROM `project.dataset.src_table_b`
			`"""`
			`)`

			`assert parser.get_tables() == [`
			`"project.dataset.src_table_a",`
			`"project.dataset.src_table_b",`
			`]`


			`def test_bigquery_sql_lineage_source_table_name_with_hyphens_is_accepted():`
			`parser = BigQuerySQLParser(`
			`sql_query="""`
			CREATE OR REPLACE VIEW `project.dataset.test_view` AS
			`SELECT *`
			`FROM test-project.dataset.src_table`
			`"""`
			`)`

			`assert parser.get_tables() == ["test-project.dataset.src_table"]`


			`def test_bigquery_sql_lineage_from_as_column_name_is_accepted():`
			`parser = BigQuerySQLParser(`
			`sql_query="""`
			CREATE OR REPLACE VIEW `project.dataset.test_view` AS
			`SELECT x.from AS col`
			`FROM project.dataset.src_table AS x`
			`"""`
			`)`

			`assert parser.get_tables() == ["project.dataset.src_table"]`