fix(ingest): remove original_table_name logic in sql source (#8130)

Harshal Sheth authored on 2023-05-31 15:58:09 -07:00; committed by GitHub
parent 4ade1311f1
commit 60dd9ef187
3 changed files with 11 additions and 89 deletions
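
The two base-class hooks deleted below, standardize_schema_table_names and normalise_dataset_name, are identity functions in SQLAlchemySource (their removed bodies just return their inputs), and the original_table_name bookkeeping only mattered when those hooks actually rewrote a name. A minimal standalone sketch of the deleted behaviour, with the methods reduced to free functions (hypothetical names and layout, not the actual source):

from typing import Dict, Tuple


def standardize_schema_table_names(schema: str, entity: str) -> Tuple[str, str]:
    # Removed base-class hook: a pass-through. Per the deleted comment, only
    # dialects like BigQuery ever needed to override it.
    return schema, entity


def normalise_dataset_name(dataset_name: str) -> str:
    # Removed base-class hook: also a pass-through.
    return dataset_name


def original_table_name_props(dataset_name: str, table: str) -> Dict[str, str]:
    # Removed bookkeeping: treat the last dotted component of the dataset name
    # as the "normalised" table and record the original only when they differ
    # (the real code also required a non-empty properties dict).
    normalised_table = dataset_name.split(".")[-1]
    return {"original_table_name": table} if normalised_table != table else {}


# With both hooks returning their inputs unchanged, the dataset name ends in
# the untouched table name, so the property is presumably never emitted and
# DatasetPropertiesClass can take name=table directly.
print(original_table_name_props("public.orders", "orders"))  # -> {}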

View File

@@ -3,7 +3,12 @@ from dataclasses import dataclass, field
 from typing import Dict, Iterable, List, Optional, Tuple, Type, Union, ValuesView
 
 import bson
+import bson.dbref
+import bson.int64
+import bson.objectid
+import bson.timestamp
 import pymongo
+import pymongo.collection
 from packaging import version
 from pydantic import PositiveInt, validator
 from pydantic.fields import Field
@@ -199,7 +204,7 @@ def construct_schema_pymongo(
 @platform_name("MongoDB")
 @config_class(MongoDBConfig)
 @support_status(SupportStatus.CERTIFIED)
-@capability(SourceCapability.LINEAGE_COARSE, "Enabled by default")
+@capability(SourceCapability.SCHEMA_METADATA, "Enabled by default")
 @dataclass
 class MongoDBSource(Source):
     """

View File

@@ -517,13 +517,6 @@ class SQLAlchemySource(StatefulIngestionSourceBase):
             profile_requests, profiler, platform=self.platform
         )
 
-    def standardize_schema_table_names(
-        self, schema: str, entity: str
-    ) -> Tuple[str, str]:
-        # Some SQLAlchemy dialects need a standardization step to clean the schema
-        # and table names. See BigQuery for an example of when this is useful.
-        return schema, entity
-
     def get_identifier(
         self, *, schema: str, entity: str, inspector: Inspector, **kwargs: Any
     ) -> str:
@@ -572,9 +565,6 @@ class SQLAlchemySource(StatefulIngestionSourceBase):
             fk_dict["name"], foreign_fields, source_fields, foreign_dataset
         )
 
-    def normalise_dataset_name(self, dataset_name: str) -> str:
-        return dataset_name
-
     def loop_tables(  # noqa: C901
         self,
         inspector: Inspector,
@@ -584,15 +574,10 @@ class SQLAlchemySource(StatefulIngestionSourceBase):
         tables_seen: Set[str] = set()
         try:
             for table in inspector.get_table_names(schema):
-                schema, table = self.standardize_schema_table_names(
-                    schema=schema, entity=table
-                )
-
                 dataset_name = self.get_identifier(
                     schema=schema, entity=table, inspector=inspector
                 )
-                dataset_name = self.normalise_dataset_name(dataset_name)
 
                 if dataset_name not in tables_seen:
                     tables_seen.add(dataset_name)
                 else:
@@ -650,18 +635,8 @@ class SQLAlchemySource(StatefulIngestionSourceBase):
                     inspector, schema, table
                 )
 
-                # Tablename might be different from the real table if we ran some normalisation ont it.
-                # Getting normalized table name from the dataset_name
-                # Table is the last item in the dataset name
-                normalised_table = table
-                splits = dataset_name.split(".")
-                if splits:
-                    normalised_table = splits[-1]
-                if properties and normalised_table != table:
-                    properties["original_table_name"] = table
-
                 dataset_properties = DatasetPropertiesClass(
-                    name=normalised_table,
+                    name=table,
                     description=description,
                     customProperties=properties,
                 )
@@ -866,14 +841,9 @@ class SQLAlchemySource(StatefulIngestionSourceBase):
     ) -> Iterable[Union[SqlWorkUnit, MetadataWorkUnit]]:
         try:
             for view in inspector.get_view_names(schema):
-                schema, view = self.standardize_schema_table_names(
-                    schema=schema, entity=view
-                )
                 dataset_name = self.get_identifier(
                     schema=schema, entity=view, inspector=inspector
                 )
-                dataset_name = self.normalise_dataset_name(dataset_name)
-
                 self.report.report_entity_scanned(dataset_name, ent_type="view")
 
                 if not sql_config.view_pattern.allowed(dataset_name):
@@ -1068,9 +1038,6 @@ class SQLAlchemySource(StatefulIngestionSourceBase):
                 logger.debug("Source does not support generating profile candidates.")
 
         for table in inspector.get_table_names(schema):
-            schema, table = self.standardize_schema_table_names(
-                schema=schema, entity=table
-            )
            dataset_name = self.get_identifier(
                schema=schema, entity=table, inspector=inspector
            )
@@ -1081,8 +1048,6 @@ class SQLAlchemySource(StatefulIngestionSourceBase):
                    self.report.report_dropped(f"profile of {dataset_name}")
                continue
 
-            dataset_name = self.normalise_dataset_name(dataset_name)
-
            if dataset_name not in tables_seen:
                tables_seen.add(dataset_name)
            else:
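
For dialects that still need to adjust naming, the extension point that survives this change is get_identifier, kept as context in the hunks above. A hedged sketch of overriding it in a hypothetical subclass; the import paths are assumptions, and this is illustration rather than part of the commit:

from typing import Any

from sqlalchemy.engine.reflection import Inspector

# Assumed import path for the class shown in this diff.
from datahub.ingestion.source.sql.sql_common import SQLAlchemySource


class LowercasingSQLAlchemySource(SQLAlchemySource):
    """Hypothetical dialect that lower-cases rendered dataset names."""

    def get_identifier(
        self, *, schema: str, entity: str, inspector: Inspector, **kwargs: Any
    ) -> str:
        # Normalise the final rendered name in one place, instead of mutating
        # schema/table before the call as the removed hooks used to allow.
        dataset_name = super().get_identifier(
            schema=schema, entity=entity, inspector=inspector, **kwargs
        )
        return dataset_name.lower()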

View File

@@ -265,13 +265,9 @@ class VerticaSource(SQLAlchemySource):
     ) -> Iterable[Union[SqlWorkUnit, MetadataWorkUnit]]:
         try:
             for view in inspector.get_view_names(schema):
-                schema, view = self.standardize_schema_table_names(
-                    schema=schema, entity=view
-                )
                 dataset_name = self.get_identifier(
                     schema=schema, entity=view, inspector=inspector
                 )
-                dataset_name = self.normalise_dataset_name(dataset_name)
 
                 self.report.report_entity_scanned(dataset_name, ent_type="view")
@@ -396,13 +392,9 @@ class VerticaSource(SQLAlchemySource):
         try:
             # table_tags = self.get_extra_tags(inspector, schema, "projection")
             for projection in inspector.get_projection_names(schema):  # type: ignore
-                schema, projection = self.standardize_schema_table_names(
-                    schema=schema, entity=projection
-                )
                 dataset_name = self.get_identifier(
                     schema=schema, entity=projection, inspector=inspector
                 )
-                dataset_name = self.normalise_dataset_name(dataset_name)
                 if dataset_name not in projections_seen:
                     projections_seen.add(dataset_name)
                 else:
@@ -479,18 +471,8 @@ class VerticaSource(SQLAlchemySource):
                     inspector, schema, projection
                 )
 
-                # Tablename might be different from the real table if we ran some normalisation ont it.
-                # Getting normalized table name from the dataset_name
-                # Table is the last item in the dataset name
-                normalised_table = projection
-                splits = dataset_name.split(".")
-                if splits:
-                    normalised_table = splits[-1]
-                if properties and normalised_table != projection:
-                    properties["original_table_name"] = projection
-
                 dataset_properties = DatasetPropertiesClass(
-                    name=normalised_table,
+                    name=projection,
                     description=description,
                     customProperties=properties,
                 )
@@ -618,14 +600,10 @@ class VerticaSource(SQLAlchemySource):
         models_seen: Set[str] = set()
         try:
             for models in inspector.get_models_names(schema):  # type: ignore
-                schema, models = self.standardize_schema_table_names(
-                    schema=schema, entity=models
-                )
                 dataset_name = self.get_identifier(
                     schema="Entities", entity=models, inspector=inspector
                 )
-                dataset_name = self.normalise_dataset_name(dataset_name)
 
                 if dataset_name not in models_seen:
                     models_seen.add(dataset_name)
                 else:
@@ -687,22 +665,12 @@ class VerticaSource(SQLAlchemySource):
                 description, properties, location = self.get_model_properties(
                     inspector, schema, table
                 )
 
-                # Tablename might be different from the real table if we ran some normalisation ont it.
-                # Getting normalized table name from the dataset_name
-                # Table is the last item in the dataset name
-                normalised_table = table
-                splits = dataset_name.split(".")
-                if splits:
-                    normalised_table = splits[-1]
-                if properties and normalised_table != table:
-                    properties["original_table_name"] = table
-
                 dataset_properties = DatasetPropertiesClass(
-                    name=normalised_table,
+                    name=table,
                     description=description,
                     customProperties=properties,
                 )
 
                 dataset_snapshot.aspects.append(dataset_properties)
                 schema_fields = self.get_schema_fields(dataset_name, columns)
@@ -796,14 +764,10 @@ class VerticaSource(SQLAlchemySource):
         oauth_seen: Set[str] = set()
         try:
             for oauth in inspector.get_Oauth_names(schema):  # type: ignore
-                schema, oauth = self.standardize_schema_table_names(
-                    schema=schema, entity=oauth
-                )
                 dataset_name = self.get_identifier(
                     schema=schema, entity=oauth, inspector=inspector
                 )
-                dataset_name = self.normalise_dataset_name(dataset_name)
 
                 if dataset_name not in oauth_seen:
                     oauth_seen.add(dataset_name)
                 else:
@@ -861,17 +825,9 @@ class VerticaSource(SQLAlchemySource):
                 description, properties, location_urn = self.get_oauth_properties(
                     inspector, schema, oauth
                 )
 
-                # Tablename might be different from the real table if we ran some normalisation ont it.
-                # Getting normalized table name from the dataset_name
-                # Table is the last item in the dataset name
-                normalised_table = oauth
-                splits = dataset_name.split(".")
-                if splits:
-                    normalised_table = splits[-1]
-                if properties and normalised_table != oauth:
-                    properties["original_table_name"] = oauth
-
                 dataset_properties = DatasetPropertiesClass(
-                    name=normalised_table,
+                    name=oauth,
                     description=description,
                     customProperties=properties,
                 )
@@ -966,9 +922,6 @@ class VerticaSource(SQLAlchemySource):
             profile_candidates = None  # Default value if profile candidates not available.
         yield from super().loop_profiler_requests(inspector, schema, sql_config)
         for projection in inspector.get_projection_names(schema):  # type: ignore
-            schema, projection = self.standardize_schema_table_names(
-                schema=schema, entity=projection
-            )
            dataset_name = self.get_identifier(
                schema=schema, entity=projection, inspector=inspector
            )
@@ -979,7 +932,6 @@ class VerticaSource(SQLAlchemySource):
                if self.config.profiling.report_dropped_profiles:
                    self.report.report_dropped(f"profile of {dataset_name}")
                continue
-            dataset_name = self.normalise_dataset_name(dataset_name)
            if dataset_name not in tables_seen:
                tables_seen.add(dataset_name)
            else: