Merge pull request #150 from open-metadata/setup-optimize

Setup optimize
2025-10-13 17:58:36 +00:00 · 2021-08-14 00:25:16 +05:30 · 2021-08-14 00:25:16 +05:30 · 82bec0e50f
commit 82bec0e50f
parent 21ae968689 03ad583744
42 changed files with 267 additions and 434 deletions
--- a/docker/metadata/docker-compose.yml
+++ b/docker/metadata/docker-compose.yml
@ -41,10 +41,8 @@ services:
      - 9200:9200
      - 9300:9300

-  catalog:
-    build:
-      context: ../../.
-      dockerfile: docker/metadata/Dockerfile
+  openmetadata-server:
+    image: openmetadata/server:latest
    expose:
      - 8585
      - 9200
@ -62,9 +60,7 @@ services:
      - "localhost:172.16.239.11"

  ingestion:
-    build:
-      context: ../../ingestion/.
-      dockerfile: Dockerfile
+    image: openmetadata/ingestion:latest
    expose:
      - 7777
    ports:
@ -75,20 +71,6 @@ services:
      - "localhost:172.16.239.10"
      - "localhost:172.16.239.11"
      - "localhost:172.16.239.12"
-      - "localhost:172.16.239.13"
-
-  postgres:
-    image: postgres
-    restart: always
-    environment:
-      POSTGRES_DB: pagila
-      POSTGRES_USER: openmetadata_user
-      POSTGRES_PASSWORD: openmetadata_password
-    ports:
-      - 5433:5432
-    networks:
-      app_net:
-        ipv4_address: 172.16.239.13

 networks:
  app_net:
--- a/docs/install/metadata-ingestion/connectors/mssql.md
+++ b/docs/install/metadata-ingestion/connectors/mssql.md
@ -95,7 +95,7 @@ Add Optional `pii-tags` processor and `metadata-rest-tables` sink along with `me
    }
  },
  "processor": {
-    "type": "pii-tags",
+    "type": "pii",
    "config": {
    }
  },
--- a/docs/install/metadata-ingestion/connectors/mysql.md
+++ b/docs/install/metadata-ingestion/connectors/mysql.md
@ -94,7 +94,7 @@ Add Optional `pii-tags` processor and `metadata-rest-tables` sink along with `me
    }
  },
  "processor": {
-    "type": "pii-tags",
+    "type": "pii",
    "config": {
      "api_endpoint": "http://localhost:8585/api"
    }
--- a/docs/install/metadata-ingestion/connectors/postgres.md
+++ b/docs/install/metadata-ingestion/connectors/postgres.md
@ -95,7 +95,7 @@ Add Optional `pii-tags` processor and `metadata-rest-tables` sink along with `me
    }
  },
  "processor": {
-    "type": "pii-tags",
+    "type": "pii",
    "config": {}
  },
  "sink": {
--- a/ingestion/Dockerfile
+++ b/ingestion/Dockerfile
@ -1,8 +1,7 @@
-FROM python:3.9.2
+FROM python:3.8.10

 EXPOSE 7777

-COPY ./examples /openmetadata-ingestion/examples
 COPY ./pipelines /openmetadata-ingestion/pipelines
 COPY ./ingestion_scheduler /openmetadata-ingestion/ingestion_scheduler
 COPY ./ingestion_dependency.sh /openmetadata-ingestion/ingestion_dependency.sh
--- a/ingestion/examples/workflows/bigquery.json
+++ b/ingestion/examples/workflows/bigquery.json
@ -10,7 +10,7 @@
    }
  },
  "processor": {
-    "type": "pii-tags",
+    "type": "pii",
    "config": {
      "api_endpoint": "http://localhost:8585/api"
    }
--- a/ingestion/examples/workflows/hive.json
+++ b/ingestion/examples/workflows/hive.json
@ -8,14 +8,12 @@
    }
  },
  "processor": {
-    "type": "pii-tags",
-    "config": {
-    }
+    "type": "pii",
+    "config": {}
  },
  "sink": {
    "type": "metadata-rest-tables",
-    "config": {
-    }
+    "config": {}
  },
  "metadata_server": {
    "type": "metadata-server",
--- a/ingestion/examples/workflows/mssql.json
+++ b/ingestion/examples/workflows/mssql.json
@ -14,7 +14,7 @@
    }
  },
  "processor": {
-    "type": "pii-tags",
+    "type": "pii",
    "config": {
    }
  },
--- a/ingestion/examples/workflows/postgres.json
+++ b/ingestion/examples/workflows/postgres.json
@ -11,7 +11,7 @@
    }
  },
  "processor": {
-    "type": "pii-tags",
+    "type": "pii",
    "config": {}
  },
  "sink": {
--- a/ingestion/examples/workflows/redshift.json
+++ b/ingestion/examples/workflows/redshift.json
@ -11,16 +11,14 @@
    }
  },
  "processor": {
-    "type": "pii-tags",
-    "config": {
-    }
+    "type": "pii",
+    "config": {}
  },
  "sink": {
    "type": "metadata-rest-tables",
-    "config": {
-    }
+    "config": {}
  },
-   "metadata_server": {
+  "metadata_server": {
    "type": "metadata-server",
    "config": {
      "api_endpoint": "http://localhost:8585/api",
--- a/ingestion/examples/workflows/redshift_usage.json
+++ b/ingestion/examples/workflows/redshift_usage.json
@ -19,7 +19,7 @@
    }
  },
  "stage": {
-    "type": "table-usage-stage",
+    "type": "table-usage",
    "config": {
      "filename": "/tmp/redshift_usage"
    }
--- a/ingestion/examples/workflows/snowflake.json
+++ b/ingestion/examples/workflows/snowflake.json
@ -19,7 +19,7 @@
    }
  },
  "processor": {
-    "type": "pii-tags",
+    "type": "pii",
    "config": {}
  },
  "sink": {
--- a/ingestion/examples/workflows/snowflake_usage.json
+++ b/ingestion/examples/workflows/snowflake_usage.json
@ -19,7 +19,7 @@
    }
  },
  "stage": {
-    "type": "table-usage-stage",
+    "type": "table-usage",
    "config": {
      "filename": "/tmp/snowflake_usage"
    }
--- a/ingestion/ingestion_dependency.sh
+++ b/ingestion/ingestion_dependency.sh
@ -17,7 +17,9 @@
 #

 set -euo pipefail
-pip install --upgrade pip setuptools openmetadata-ingestion==0.2.1 apns
-pip install openmetadata-ingestion[mysql,sample-tables,elasticsearch]
+# Install the pinned ingestion package together with setuptools and apns.
+pip install --upgrade setuptools openmetadata-ingestion==0.2.1 apns
+# Disabled alternative: install the spaCy English model from a pinned wheel
+# instead of `spacy download`. The cleanup step stays commented out together
+# with the download, since no wheel file is created while these are disabled.
+# wget https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.0.0/en_core_web_sm-3.0.0-py3-none-any.whl
+# pip install en_core_web_sm-3.0.0-py3-none-any.whl
+# rm -rf en_core_web_sm-3.0.0-py3-none-any.whl
 python -m spacy download en_core_web_sm
 pip install "simplescheduler@git+https://github.com/StreamlineData/sdscheduler.git#egg=simplescheduler"
--- a/ingestion/pipelines/metadata_to_es.json
+++ b/ingestion/pipelines/metadata_to_es.json
@ -1,10 +1,10 @@
 {
  "source": {
-    "type": "metadata-rest-tables",
+    "type": "metadata_es",
    "config": {}
  },
  "stage": {
-    "type": "file-stage",
+    "type": "file",
    "config": {
      "filename": "/tmp/tables.txt"
    }
--- a/ingestion/pipelines/mysql.json
+++ b/ingestion/pipelines/mysql.json
@ -12,7 +12,7 @@
    }
  },
  "processor": {
-    "type": "pii-tags",
+    "type": "pii",
    "config": {
      "api_endpoint": "http://localhost:8585/api"
    }
--- a/ingestion/pipelines/sample_tables.json
+++ b/ingestion/pipelines/sample_tables.json
@ -9,7 +9,7 @@
    }
  },
  "processor": {
-    "type": "pii-tags",
+    "type": "pii",
    "config": {
    }
  },
--- a/ingestion/pipelines/sample_usage.json
+++ b/ingestion/pipelines/sample_usage.json
@ -15,7 +15,7 @@
    }
  },
  "stage": {
-    "type": "table-usage-stage",
+    "type": "table-usage",
    "config": {
      "filename": "/tmp/sample_usage"
    }
--- a/ingestion/setup.py
+++ b/ingestion/setup.py
@ -44,7 +44,7 @@ base_requirements = {
    "typing_extensions>=3.7.4",  # comma required: adjacent string literals are otherwise concatenated
    "mypy_extensions>=0.4.3",
    "typing-inspect",
-    "pydantic~=1.7.4",
+    "pydantic==1.7.4",
    "pydantic[email]>=1.7.2",
    "google>=3.0.0",
    "google-auth>=1.33.0",
@ -54,13 +54,14 @@ base_requirements = {
    "python-jose==3.3.0",
    "okta==1.7.0",
    "pandas~=1.3.1",
-    "sqlalchemy>=1.3.24",
-    "sql-metadata~=2.0.0",
-    "spacy==3.0.5",
-    "requests~=2.25.1"
+    "sqlalchemy>=1.3.24"
+    "sql-metadata~=2.0.0"
+    "spacy==3.0.5"
+    "requests~=2.25.1",
+    "en_core_web_sm@https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.0.0/en_core_web_sm-3.0.0.tar.gz#egg=en_core_web"
 }
 base_plugins = {
-    "pii-tags",
+    "pii-processor",
    "query-parser",
    "metadata-usage",
    "file-stage",
@ -110,44 +111,6 @@ setup(
    packages=find_namespace_packages(where='./src', exclude=['tests*']),
    entry_points={
        "console_scripts": ["metadata = metadata.cmd:metadata"],
-        "metadata.ingestion.source.plugins": [
-            "mysql = metadata.ingestion.source.mysql:MySQLSource",
-            "postgres = metadata.ingestion.source.postgres:PostgresSource",
-            "snowflake = metadata.ingestion.source.snowflake:SnowflakeSource",
-            "redshift = metadata.ingestion.source.redshift:RedshiftSource",
-            "redshift-sql = metadata.ingestion.source.redshift_sql:RedshiftSQLSource",
-            "bigquery = metadata.ingestion.source.bigquery:BigQuerySource",
-            "athena = metadata.ingestion.source.athena:AthenaSource",
-            "oracle = metadata.ingestion.source.oracle:OracleSource",
-            "mssql = metadata.ingestion.source.mssql:SQLServerSource",
-            "hive = metadata.ingestion.source.hive:HiveSource",
-            "sample-tables = metadata.ingestion.source.sample_data_generator:SampleTableSource",
-            "sample-users = metadata.ingestion.source.sample_data_generator:SampleUserSource",
-            "sample-usage = metadata.ingestion.source.sample_data_generator:SampleUsageSource",
-            "metadata-rest-tables = metadata.ingestion.source.metadata_rest:MetadataTablesRestSource",
-            "redshift-usage = metadata.ingestion.source.redshift_usage:RedshiftUsageSource",
-            "snowflake-usage = metadata.ingestion.source.snowflake_usage:SnowflakeUsageSource",
-            "ldap-users = metadata.ingestion.source.ldap_source:LDAPUserSource"
-        ],
-        "metadata.ingestion.sink.plugins": [
-            "file = metadata.ingestion.sink.file:FileSink",
-            "console = metadata.ingestion.sink.console:ConsoleSink",
-            "metadata-rest-tables = metadata.ingestion.sink.metadata_tables_rest:MetadataTablesRestSink",
-            "metadata-rest-users = metadata.ingestion.sink.metadata_users_rest:MetadataUsersRestSink",
-            "ldap-rest-users = metadata.ingestion.sink.ldap_add_user:LdapUserRestSink"
-        ],
-        "metadata.ingestion.processor.plugins": [
-            "pii-tags = metadata.ingestion.processor.pii_processor:PIIProcessor",
-            "query-parser =  metadata.ingestion.processor.query_parser:QueryParserProcessor",
-        ],
-        "metadata.ingestion.stage.plugins": [
-            "file-stage = metadata.ingestion.stage.file:FileStage",
-            "table-usage-stage = metadata.ingestion.stage.table_usage_stage:TableUsageStage"
-        ],
-        "metadata.ingestion.bulksink.plugins": [
-            "elasticsearch = metadata.ingestion.bulksink.elastic_search:ElasticSearchBulkSink",
-            "metadata-usage = metadata.ingestion.bulksink.metadata_usage_rest:MetadataUsageBulkSink",
-        ],
    },
    install_requires=list(base_requirements),
    extras_require={
--- a/ingestion/src/metadata/cmd.py
+++ b/ingestion/src/metadata/cmd.py
@ -71,8 +71,8 @@ def ingest(config: str) -> None:
        sys.exit(1)

    workflow.execute()
-    ret = workflow.print_status()
    workflow.stop()
+    ret = workflow.print_status()
    sys.exit(ret)


--- a/ingestion/src/metadata/ingestion/bulksink/bulk_sink_registry.py
+++ b/ingestion/src/metadata/ingestion/bulksink/bulk_sink_registry.py
@ -1,22 +0,0 @@
-#  Licensed to the Apache Software Foundation (ASF) under one or more
-#  contributor license agreements. See the NOTICE file distributed with
-#  this work for additional information regarding copyright ownership.
-#  The ASF licenses this file to You under the Apache License, Version 2.0
-#  (the "License"); you may not use this file except in compliance with
-#  the License. You may obtain a copy of the License at
-#
-#  http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-
-from metadata.ingestion.api.bulk_sink import BulkSink
-from metadata.ingestion.api.registry import Registry
-
-bulk_sink_registry = Registry[BulkSink]()
-bulk_sink_registry.load("metadata.ingestion.bulksink.plugins")
-
-
--- a/ingestion/src/metadata/ingestion/bulksink/elastic_search.py
+++ b/ingestion/src/metadata/ingestion/bulksink/elastic_search.py
@ -30,7 +30,7 @@ class ElasticSearchConfig(ConfigModel):
    batch_size: Optional[int] = 10000


-class ElasticSearchBulkSink(BulkSink):
+class ElasticsearchBulkSink(BulkSink):
    """
    Elasticsearch Publisher uses Bulk API to load data from JSON file.
    A new index is created and data is uploaded into it. After the upload
--- a/ingestion/src/metadata/ingestion/bulksink/metadata_usage_rest.py
+++ b/ingestion/src/metadata/ingestion/bulksink/metadata_usage_rest.py
--- a/ingestion/src/metadata/ingestion/processor/pii_processor.py
+++ b/ingestion/src/metadata/ingestion/processor/pii_processor.py
@ -160,19 +160,19 @@ class ColumnNameScanner(Scanner):
        return list(types)


-class PIIProcessorConfig(ConfigModel):
+class PiiProcessorConfig(ConfigModel):
+    """Configuration for the PII processor; every field is optional and defaults to None."""
+    # NOTE(review): how `filter` is interpreted is not visible in this hunk — confirm against the processor body.
    filter: Optional[str] = None
+    # NOTE(review): presumably the metadata server API base URL (example configs pass "http://localhost:8585/api") — confirm.
    api_endpoint: Optional[str] = None
    auth_provider_type: Optional[str] = None


-class PIIProcessor(Processor):
-    config: PIIProcessorConfig
+class PiiProcessor(Processor):
+    config: PiiProcessorConfig
    metadata_config: MetadataServerConfig
    status: ProcessorStatus
    client: REST

-    def __init__(self, ctx: WorkflowContext, config: PIIProcessorConfig, metadata_config: MetadataServerConfig):
+    def __init__(self, ctx: WorkflowContext, config: PiiProcessorConfig, metadata_config: MetadataServerConfig):
        super().__init__(ctx)
        self.config = config
        self.metadata_config = metadata_config
@ -184,7 +184,7 @@ class PIIProcessor(Processor):

    @classmethod
    def create(cls, config_dict: dict, metadata_config_dict: dict, ctx: WorkflowContext):
-        config = PIIProcessorConfig.parse_obj(config_dict)
+        config = PiiProcessorConfig.parse_obj(config_dict)
        metadata_config = MetadataServerConfig.parse_obj(metadata_config_dict)
        return cls(ctx, config, metadata_config)

--- a/ingestion/src/metadata/ingestion/processor/processor_registry.py
+++ b/ingestion/src/metadata/ingestion/processor/processor_registry.py
@ -1,23 +0,0 @@
-#  Licensed to the Apache Software Foundation (ASF) under one or more
-#  contributor license agreements. See the NOTICE file distributed with
-#  this work for additional information regarding copyright ownership.
-#  The ASF licenses this file to You under the Apache License, Version 2.0
-#  (the "License"); you may not use this file except in compliance with
-#  the License. You may obtain a copy of the License at
-#
-#  http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-
-from metadata.ingestion.api.processor import Processor
-from metadata.ingestion.api.registry import Registry
-
-processor_registry = Registry[Processor]()
-processor_registry.load("metadata.ingestion.processor.plugins")
-
-
-
--- a/ingestion/src/metadata/ingestion/sink/ldap_rest_users.py
+++ b/ingestion/src/metadata/ingestion/sink/ldap_rest_users.py
@ -29,7 +29,7 @@ class LDAPSourceConfig(ConfigModel):
    api_end_point: str


-class LdapUserRestSink(Sink):
+class LdapRestUsersSink(Sink):
    config: LDAPSourceConfig
    status: SinkStatus

--- a/ingestion/src/metadata/ingestion/sink/metadata_rest_tables.py
+++ b/ingestion/src/metadata/ingestion/sink/metadata_rest_tables.py
@ -33,7 +33,7 @@ class MetadataTablesSinkConfig(ConfigModel):
    api_endpoint: str = None


-class MetadataTablesRestSink(Sink):
+class MetadataRestTablesSink(Sink):
    config: MetadataTablesSinkConfig
    status: SinkStatus

--- a/ingestion/src/metadata/ingestion/sink/metadata_rest_users.py
+++ b/ingestion/src/metadata/ingestion/sink/metadata_rest_users.py
@ -30,7 +30,7 @@ class MetadataUsersSinkConfig(ConfigModel):
    api_end_point: str = None


-class MetadataUsersRestSink(Sink):
+class MetadataRestUsersSink(Sink):
    config: MetadataUsersSinkConfig
    metadata_config: MetadataServerConfig
    status: SinkStatus
--- a/ingestion/src/metadata/ingestion/sink/sink_registry.py
+++ b/ingestion/src/metadata/ingestion/sink/sink_registry.py
@ -1,22 +0,0 @@
-#  Licensed to the Apache Software Foundation (ASF) under one or more
-#  contributor license agreements. See the NOTICE file distributed with
-#  this work for additional information regarding copyright ownership.
-#  The ASF licenses this file to You under the Apache License, Version 2.0
-#  (the "License"); you may not use this file except in compliance with
-#  the License. You may obtain a copy of the License at
-#
-#  http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-
-from metadata.ingestion.api.registry import Registry
-from metadata.ingestion.api.sink import Sink
-
-sink_registry = Registry[Sink]()
-sink_registry.load("metadata.ingestion.sink.plugins")
-# These sinks are always enabled
-assert sink_registry.get("file")
--- a/ingestion/src/metadata/ingestion/source/bigquery.py
+++ b/ingestion/src/metadata/ingestion/source/bigquery.py
@ -31,7 +31,7 @@ class BigQueryConfig(SQLConnectionConfig, SQLSource):
        return f"{self.scheme}://"


-class BigQuerySource(SQLSource):
+class BigquerySource(SQLSource):
    def __init__(self, config, metadata_config, ctx):
        super().__init__(config, metadata_config, ctx)

--- a/ingestion/src/metadata/ingestion/source/ldap_source.py
+++ b/ingestion/src/metadata/ingestion/source/ldap_source.py
@ -32,7 +32,7 @@ class LDAPUserConfig(ConfigModel):
    password: str


-class LDAPUserSource(Source):
+class LdapUsersSource(Source):
    config: LDAPUserConfig
    status: SourceStatus

--- a/ingestion/src/metadata/ingestion/source/metadata_rest.py
+++ b/ingestion/src/metadata/ingestion/source/metadata_rest.py
@ -30,7 +30,7 @@ class MetadataTablesRestSourceConfig(ConfigModel):
    api_endpoint: Optional[str] = None


-class MetadataTablesRestSource(Source):
+class MetadataEsSource(Source):
    config: MetadataTablesRestSourceConfig
    report: SourceStatus

--- a/ingestion/src/metadata/ingestion/source/mssql.py
+++ b/ingestion/src/metadata/ingestion/source/mssql.py
@ -20,7 +20,7 @@ from .sql_source import SQLConnectionConfig, SQLSource
 from ..ometa.auth_provider import MetadataServerConfig


-class SQLServerConfig(SQLConnectionConfig):
+class MssqlConfig(SQLConnectionConfig):
    host_port = "localhost:1433"
    scheme = "mssql+pytds"

@ -28,12 +28,12 @@ class SQLServerConfig(SQLConnectionConfig):
        return super().get_connection_url()


-class SQLServerSource(SQLSource):
+class MssqlSource(SQLSource):
    def __init__(self, config, metadata_config, ctx):
        super().__init__(config, metadata_config, ctx)

    @classmethod
    def create(cls, config_dict, metadata_config_dict, ctx):
-        config = SQLServerConfig.parse_obj(config_dict)
+        config = MssqlConfig.parse_obj(config_dict)
        metadata_config = MetadataServerConfig.parse_obj(metadata_config_dict)
        return cls(config, metadata_config, ctx)
--- a/ingestion/src/metadata/ingestion/source/mysql.py
+++ b/ingestion/src/metadata/ingestion/source/mysql.py
@ -24,7 +24,7 @@ class MySQLConfig(SQLConnectionConfig):
    def get_connection_url(self):
        return super().get_connection_url()

-class MySQLSource(SQLSource):
+class MysqlSource(SQLSource):
    def __init__(self, config, metadata_config, ctx):
        super().__init__(config, metadata_config, ctx)

--- a/ingestion/src/metadata/ingestion/source/sample_data_generator.py
+++ b/ingestion/src/metadata/ingestion/source/sample_data_generator.py
@ -14,45 +14,50 @@
 #  limitations under the License.

 import csv
-import json
+import pandas as pd
 import uuid
 import os
-from datetime import datetime
-
-import pandas as pd
-import random
-import string
-import logging
-
-from faker import Faker
+import json
 from collections import namedtuple
-from typing import Iterable, Dict, Any, List, Union
-from metadata.generated.schema.api.services.createDatabaseService import CreateDatabaseServiceEntityRequest
-from metadata.generated.schema.entity.services.databaseService import DatabaseServiceEntity
+from dataclasses import dataclass, field
+from typing import Iterable, List, Dict, Any, Union
 from metadata.config.common import ConfigModel
 from metadata.generated.schema.entity.data.table import TableEntity
 from metadata.generated.schema.entity.data.database import DatabaseEntity
 from metadata.generated.schema.type.entityReference import EntityReference
-from metadata.ingestion.api.source import Source, SourceStatus
-from dataclasses import dataclass, field
+from metadata.ingestion.api.source import SourceStatus, Source
 from metadata.ingestion.models.ometa_table_db import OMetaDatabaseAndTable
-from metadata.ingestion.models.table_metadata import DatabaseMetadata
-from metadata.ingestion.models.table_queries import TableQuery
-from metadata.ingestion.models.user import User
 from metadata.ingestion.ometa.auth_provider import MetadataServerConfig
 from metadata.ingestion.ometa.client import REST
+from metadata.generated.schema.api.services.createDatabaseService import CreateDatabaseServiceEntityRequest
+from metadata.generated.schema.entity.services.databaseService import DatabaseServiceEntity

 COLUMN_NAME = 'Column'
 KEY_TYPE = 'Key type'
 DATA_TYPE = 'Data type'
-FAKER_METHOD = 'Faker method'
 COL_DESCRIPTION = 'Description'
-
-logger = logging.getLogger(__name__)
-
 TableKey = namedtuple('TableKey', ['schema', 'table_name'])


+def get_service_or_create(service_json, metadata_config) -> DatabaseServiceEntity:
+    client = REST(metadata_config)
+    service = client.get_database_service(service_json['name'])
+    if service is not None:
+        return service
+    else:
+        created_service = client.create_database_service(CreateDatabaseServiceEntityRequest(**service_json))
+        return created_service
+
+
+def get_table_key(row: Dict[str, Any]) -> Union[TableKey, None]:
+    """
+    Table key consists of schema and table name
+    :param row:
+    :return:
+    """
+    return TableKey(schema=row['schema'], table_name=row['table_name'])
+
+
 class SampleTableSourceConfig(ConfigModel):
    sample_schema_folder: str
    service_name: str
@ -64,19 +69,6 @@ class SampleTableSourceConfig(ConfigModel):
        return self.sample_schema_folder


-class SampleUserSourceConfig(ConfigModel):
-    no_of_users: int
-
-
-def get_table_key(row: Dict[str, Any]) -> Union[TableKey, None]:
-    """
-    Table key consists of schema and table name
-    :param row:
-    :return:
-    """
-    return TableKey(schema=row['schema'], table_name=row['table_name'])
-
-
@dataclass
 class SampleTableSourceStatus(SourceStatus):
    tables_scanned: List[str] = field(default_factory=list)
@ -85,14 +77,6 @@ class SampleTableSourceStatus(SourceStatus):
        self.tables_scanned.append(table_name)


-@dataclass
-class SampleUserSourceStatus(SourceStatus):
-    users_scanned: List[str] = field(default_factory=list)
-
-    def report_table_scanned(self, user_name: str) -> None:
-        self.users_scanned.append(user_name)
-
-
 class TableSchema:
    def __init__(self, filename):
        # error if the file is not csv file
@ -122,31 +106,6 @@ class TableSchema:
        return [c[COLUMN_NAME] for c in self.columns]


-class DataGenerator:
-    def __init__(self, schemas):
-        if not schemas:
-            raise Exception('Input schemas should be an array of one or more TableSchemas')
-
-        self.schemas = schemas
-
-        # validate that each FK is a PK in one of the input schemas
-        # TODO
-
-        self.table_to_schema = dict((s.get_name(), s) for s in schemas)
-
-    def generate_data(self, table_name, number_of_rows):
-        fake = Faker()
-        schema = self.table_to_schema[table_name]
-        data = {}
-        for c in schema.get_schema():
-            if not c[FAKER_METHOD]:
-                logging.debug('{} has no faker method input'.format(c))
-                continue
-            fn = getattr(fake, c[FAKER_METHOD])
-            data[c[COLUMN_NAME]] = [fn() for _ in range(number_of_rows)]
-        return pd.DataFrame(data)
-
-
 class SampleTableMetadataGenerator:
    def __init__(self, table_to_df_dict, table_to_schema_map):
        self.table_to_df_dict = table_to_df_dict
@ -210,61 +169,7 @@ class SampleTableMetadataGenerator:
        return sorted_row_dict


-class SampleUserMetadataGenerator:
-
-    def __init__(self, number_of_users):
-        self.number_of_users = number_of_users
-
-    def generate_sample_user(self):
-        schema = dict()
-        fake = Faker()
-        # columns that use faker
-        schema['email'] = lambda: None
-        schema['first_name'] = lambda: fake.first_name()
-        schema['last_name'] = lambda: fake.last_name()
-        schema['full_name'] = lambda: None
-        schema['github_username'] = lambda: None
-        schema['team_name'] = lambda: random.choice(
-            ['Data_Infra', 'Infra', 'Payments', 'Legal', 'Dev_Platform', 'Trust', 'Marketplace'])
-        schema['employee_type'] = lambda: None
-        schema['manager_email'] = lambda: fake.email()
-        schema['slack_id'] = lambda: None
-        schema['role_name'] = lambda: random.choices(
-            ['ROLE_ENGINEER', 'ROLE_DATA_SCIENTIST', 'ROLE_ADMIN'], weights=[40, 40, 10])[0]
-        data = {}
-
-        for k in schema.keys():
-            data[k] = [schema[k]() for _ in range(self.number_of_users)]
-
-        # fill in the columns that can be derived from the random data above
-        for i in range(self.number_of_users):
-            data['full_name'][i] = data['first_name'][i] + ' ' + data['last_name'][i]
-            username = data['first_name'][i].lower() + '_' + data['last_name'][i].lower() + random.choice(
-                string.digits)
-            data['slack_id'][i] = username
-            data['github_username'][i] = username
-            data['email'][i] = username + '@gmail.com'
-            data['employee_type'] = data['role_name']
-
-        pd_rows = pd.DataFrame(data)
-        row_dict = []
-        for index, row in pd_rows.iterrows():
-            row_dict.append(row)
-
-        return row_dict
-
-
-def get_service_or_create(service_json, metadata_config) -> DatabaseServiceEntity:
-    client = REST(metadata_config)
-    service = client.get_database_service(service_json['name'])
-    if service is not None:
-        return service
-    else:
-        created_service = client.create_database_service(CreateDatabaseServiceEntityRequest(**service_json))
-        return created_service
-
-
-class SampleTableSource(Source):
+class SampleTablesSource(Source):

    def __init__(self, config: SampleTableSourceConfig, metadata_config: MetadataServerConfig, ctx):
        super().__init__(ctx)
@ -302,80 +207,3 @@ class SampleTableSource(Source):

    def get_status(self):
        return self.status
-
-
-class SampleUsageSource(Source):
-
-    def __init__(self, config: SampleTableSourceConfig, metadata_config: MetadataServerConfig, ctx):
-        super().__init__(ctx)
-        self.status = SampleTableSourceStatus()
-        self.config = config
-        self.metadata_config = metadata_config
-        self.client = REST(metadata_config)
-        self.service_json = json.load(open(config.sample_schema_folder + "/service.json", 'r'))
-        self.query_log_csv = config.sample_schema_folder + "/query_log"
-        with open(self.query_log_csv, 'r') as fin:
-            self.query_logs = [dict(i) for i in csv.DictReader(fin)]
-        self.service = get_service_or_create(self.service_json, metadata_config)
-
-    @classmethod
-    def create(cls, config_dict, metadata_config_dict, ctx):
-        config = SampleTableSourceConfig.parse_obj(config_dict)
-        metadata_config = MetadataServerConfig.parse_obj(metadata_config_dict)
-        return cls(config, metadata_config, ctx)
-
-    def prepare(self):
-        pass
-
-    def next_record(self) -> Iterable[TableQuery]:
-        for row in self.query_logs:
-            tq = TableQuery(row['query'], '', 100, 0, 0, '',
-                            '', datetime.today().strftime('%Y-%m-%d %H:%M:%S'), 100, 'shopify',
-                            False, row['query'])
-            yield tq
-
-    def close(self):
-        pass
-
-    def get_status(self):
-        return self.status
-
-
-class SampleUserSource(Source):
-
-    def __init__(self, config: SampleUserSourceConfig, metadata_config: MetadataServerConfig, ctx):
-        super().__init__(ctx)
-        self.status = SampleUserSourceStatus()
-        metadata_gen = SampleUserMetadataGenerator(config.no_of_users)
-        self.sample_columns = metadata_gen.generate_sample_user()
-
-    @classmethod
-    def create(cls, config_dict, metadata_config_dict, ctx):
-        config = SampleUserSourceConfig.parse_obj(config_dict)
-        metadata_config = MetadataServerConfig.parse_obj(metadata_config_dict)
-        return cls(config, metadata_config, ctx)
-
-    def prepare(self):
-        pass
-
-    def next_record(self) -> Iterable[DatabaseMetadata]:
-        for user in self.sample_columns:
-            user_metadata = User(user['email'],
-                                 user['first_name'],
-                                 user['last_name'],
-                                 user['full_name'],
-                                 user['github_username'],
-                                 user['team_name'],
-                                 user['employee_type'],
-                                 user['manager_email'],
-                                 user['slack_id'],
-                                 True,
-                                 0)
-            self.status.report_table_scanned(user['github_username'])
-            yield user_metadata
-
-    def close(self):
-        pass
-
-    def get_status(self):
-        return self.status
--- a/ingestion/src/metadata/ingestion/source/sample_usage.py
+++ b/ingestion/src/metadata/ingestion/source/sample_usage.py
@ -0,0 +1,46 @@
+import json
+import csv
+from metadata.ingestion.api.source import Source
+from .sample_tables import SampleTableSourceConfig, SampleTableSourceStatus, get_service_or_create
+from metadata.ingestion.ometa.auth_provider import MetadataServerConfig
+from metadata.ingestion.models.table_queries import TableQuery
+from typing import Iterable
+from datetime import datetime
+from metadata.ingestion.ometa.client import REST
+
+
+class SampleUsageSource(Source):
+
+    def __init__(self, config: SampleTableSourceConfig, metadata_config: MetadataServerConfig, ctx):
+        super().__init__(ctx)
+        self.status = SampleTableSourceStatus()
+        self.config = config
+        self.metadata_config = metadata_config
+        self.client = REST(metadata_config)
+        self.service_json = json.load(open(config.sample_schema_folder + "/service.json", 'r'))
+        self.query_log_csv = config.sample_schema_folder + "/query_log"
+        with open(self.query_log_csv, 'r') as fin:
+            self.query_logs = [dict(i) for i in csv.DictReader(fin)]
+        self.service = get_service_or_create(self.service_json, metadata_config)
+
+    @classmethod
+    def create(cls, config_dict, metadata_config_dict, ctx):
+        config = SampleTableSourceConfig.parse_obj(config_dict)
+        metadata_config = MetadataServerConfig.parse_obj(metadata_config_dict)
+        return cls(config, metadata_config, ctx)
+
+    def prepare(self):
+        pass
+
+    def next_record(self) -> Iterable[TableQuery]:
+        for row in self.query_logs:
+            tq = TableQuery(row['query'], '', 100, 0, 0, '',
+                            '', datetime.today().strftime('%Y-%m-%d %H:%M:%S'), 100, 'shopify',
+                            False, row['query'])
+            yield tq
+
+    def close(self):
+        pass
+
+    def get_status(self):
+        return self.status
--- a/ingestion/src/metadata/ingestion/source/sample_users.py
+++ b/ingestion/src/metadata/ingestion/source/sample_users.py
@ -0,0 +1,122 @@
+#  Licensed to the Apache Software Foundation (ASF) under one or more
+#  contributor license agreements. See the NOTICE file distributed with
+#  this work for additional information regarding copyright ownership.
+#  The ASF licenses this file to You under the Apache License, Version 2.0
+#  (the "License"); you may not use this file except in compliance with
+#  the License. You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+
+import random
+import string
+import pandas as pd
+from faker import Faker
+from typing import Iterable, List
+from dataclasses import dataclass, field
+from metadata.config.common import ConfigModel
+from metadata.ingestion.api.source import Source, SourceStatus
+from metadata.ingestion.ometa.auth_provider import MetadataServerConfig
+from metadata.ingestion.models.table_metadata import DatabaseMetadata
+from metadata.ingestion.models.user import User
+
+
+class SampleUserSourceConfig(ConfigModel):
+    """Configuration for the sample users source."""
+    # Number of sample users to generate; handed to SampleUserMetadataGenerator.
+    no_of_users: int
+
+
+@dataclass
+class SampleUserSourceStatus(SourceStatus):
+    """Status record that collects the names of the users emitted by the source."""
+    # Names reported so far, in the order they were reported.
+    users_scanned: List[str] = field(default_factory=list)
+
+    def report_table_scanned(self, user_name: str) -> None:
+        """Record ``user_name`` as scanned.
+
+        Despite the method name, the value stored is a user name, not a table.
+
+        :param user_name: name appended to ``users_scanned``
+        """
+        self.users_scanned.append(user_name)
+
+
+class SampleUserMetadataGenerator:
+    """Generates randomized sample user rows using Faker and ``random``."""
+
+    def __init__(self, number_of_users):
+        """
+        :param number_of_users: how many sample user rows to generate
+        """
+        self.number_of_users = number_of_users
+
+    def generate_sample_user(self):
+        """Build ``number_of_users`` sample user rows.
+
+        Names, manager email, team and role are drawn at random; the remaining
+        columns are derived from them so that each row is self-consistent.
+
+        :return: list of pandas Series, one per user, each holding the columns
+            ``email``, ``first_name``, ``last_name``, ``full_name``,
+            ``github_username``, ``team_name``, ``employee_type``,
+            ``manager_email``, ``slack_id`` and ``role_name``
+        """
+        schema = dict()
+        fake = Faker()
+        # Columns produced by ``lambda: None`` are placeholders that are filled
+        # in below from the randomly generated columns.
+        schema['email'] = lambda: None
+        schema['first_name'] = lambda: fake.first_name()
+        schema['last_name'] = lambda: fake.last_name()
+        schema['full_name'] = lambda: None
+        schema['github_username'] = lambda: None
+        schema['team_name'] = lambda: random.choice(
+            ['Data_Infra', 'Infra', 'Payments', 'Legal', 'Dev_Platform', 'Trust', 'Marketplace'])
+        schema['employee_type'] = lambda: None
+        schema['manager_email'] = lambda: fake.email()
+        schema['slack_id'] = lambda: None
+        # ``weights`` are relative, so they do not need to sum to 100.
+        schema['role_name'] = lambda: random.choices(
+            ['ROLE_ENGINEER', 'ROLE_DATA_SCIENTIST', 'ROLE_ADMIN'], weights=[40, 40, 10])[0]
+
+        # One independent list per column, each with one entry per user.
+        data = {
+            column: [make_value() for _ in range(self.number_of_users)]
+            for column, make_value in schema.items()
+        }
+
+        # fill in the columns that can be derived from the random data above
+        for i in range(self.number_of_users):
+            first_name = data['first_name'][i]
+            last_name = data['last_name'][i]
+            data['full_name'][i] = first_name + ' ' + last_name
+            # A trailing random digit is appended to the lowercased name.
+            username = first_name.lower() + '_' + last_name.lower() + random.choice(
+                string.digits)
+            data['slack_id'][i] = username
+            data['github_username'][i] = username
+            data['email'][i] = username + '@gmail.com'
+            # Assign per row so ``employee_type`` keeps its own list instead of
+            # being rebound to (and aliased with) the whole ``role_name`` list.
+            data['employee_type'][i] = data['role_name'][i]
+
+        # Callers index each row by column name, so hand back the row Series.
+        return [row for _, row in pd.DataFrame(data).iterrows()]
+
+
+class SampleUsersSource(Source):
+    """Source that emits randomly generated sample ``User`` records."""
+
+    def __init__(self, config: SampleUserSourceConfig, metadata_config: MetadataServerConfig, ctx):
+        """Generate the sample rows up front.
+
+        :param config: source configuration; ``no_of_users`` sets how many rows are generated
+        :param metadata_config: metadata server configuration
+        :param ctx: workflow context forwarded to the ``Source`` base class
+        """
+        super().__init__(ctx)
+        self.status = SampleUserSourceStatus()
+        # Keep both configs on the instance, as SampleUsageSource does.
+        self.config = config
+        self.metadata_config = metadata_config
+        metadata_gen = SampleUserMetadataGenerator(config.no_of_users)
+        # Despite the attribute name, this holds one generated row per user.
+        self.sample_columns = metadata_gen.generate_sample_user()
+
+    @classmethod
+    def create(cls, config_dict, metadata_config_dict, ctx):
+        """Build the source from raw config dictionaries.
+
+        :param config_dict: dictionary parsed into ``SampleUserSourceConfig``
+        :param metadata_config_dict: dictionary parsed into ``MetadataServerConfig``
+        :param ctx: workflow context
+        :return: a new ``SampleUsersSource`` instance
+        """
+        config = SampleUserSourceConfig.parse_obj(config_dict)
+        metadata_config = MetadataServerConfig.parse_obj(metadata_config_dict)
+        return cls(config, metadata_config, ctx)
+
+    def prepare(self):
+        """No preparation is needed; the rows are generated in ``__init__``."""
+        pass
+
+    def next_record(self) -> Iterable[User]:
+        """Yield a ``User`` for every generated row.
+
+        Each user's ``github_username`` is recorded in the status object just
+        before the user is yielded.
+        """
+        for user in self.sample_columns:
+            # NOTE(review): the trailing ``True`` and ``0`` fill positional
+            # slots defined by ``User``, which is not visible here -- confirm.
+            user_metadata = User(user['email'],
+                                 user['first_name'],
+                                 user['last_name'],
+                                 user['full_name'],
+                                 user['github_username'],
+                                 user['team_name'],
+                                 user['employee_type'],
+                                 user['manager_email'],
+                                 user['slack_id'],
+                                 True,
+                                 0)
+            self.status.report_table_scanned(user['github_username'])
+            yield user_metadata
+
+    def close(self):
+        """Nothing to release."""
+        pass
+
+    def get_status(self):
+        """Return the status object holding the reported user names."""
+        return self.status
--- a/ingestion/src/metadata/ingestion/source/snowflake_usage.py
+++ b/ingestion/src/metadata/ingestion/source/snowflake_usage.py
@ -28,7 +28,7 @@ class SnowflakeUsageSource(Source):
    # SELECT statement from mysql information_schema to extract table and column metadata
    SQL_STATEMENT = """
        select query_id as query,Query_text as sql,query_type as label,
-        database_name as database,start_time as starttime,end_time as endtime
+        database_name as database,start_time as starttime,end_time as endtime,schema_name
        from table(information_schema.query_history(
        end_time_range_start=>to_timestamp_ltz('{start_date}'),
        end_time_range_end=>to_timestamp_ltz('{end_date}')));
@ -83,7 +83,7 @@ class SnowflakeUsageSource(Source):
        for row in self._get_raw_extract_iter():
            tq = TableQuery(row['query'], row['label'], 0, 0, 0, str(row['starttime']),
                            str(row['endtime']), str(row['starttime'])[0:19], 2, row['database'], 0, row['sql'])
-            self.report.scanned(tq)
+            self.report.scanned(f"{row['database']}.{row['schema_name']}")
            yield tq

    def get_report(self):
--- a/ingestion/src/metadata/ingestion/source/source_registry.py
+++ b/ingestion/src/metadata/ingestion/source/source_registry.py
@ -1,22 +0,0 @@
-#  Licensed to the Apache Software Foundation (ASF) under one or more
-#  contributor license agreements. See the NOTICE file distributed with
-#  this work for additional information regarding copyright ownership.
-#  The ASF licenses this file to You under the Apache License, Version 2.0
-#  (the "License"); you may not use this file except in compliance with
-#  the License. You may obtain a copy of the License at
-#
-#  http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-
-from metadata.ingestion.api.registry import Registry
-from metadata.ingestion.api.source import Source
-
-source_registry = Registry[Source]()
-source_registry.load("metadata.ingestion.source.plugins")
-
-# This source is always enabled
--- a/ingestion/src/metadata/ingestion/stage/stage_registry.py
+++ b/ingestion/src/metadata/ingestion/stage/stage_registry.py
@ -1,22 +0,0 @@
-#  Licensed to the Apache Software Foundation (ASF) under one or more
-#  contributor license agreements. See the NOTICE file distributed with
-#  this work for additional information regarding copyright ownership.
-#  The ASF licenses this file to You under the Apache License, Version 2.0
-#  (the "License"); you may not use this file except in compliance with
-#  the License. You may obtain a copy of the License at
-#
-#  http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-
-from metadata.ingestion.api.registry import Registry
-from metadata.ingestion.api.stage import Stage
-
-stage_registry = Registry[Stage]()
-stage_registry.load("metadata.ingestion.stage.plugins")
-
-
--- a/ingestion/src/metadata/ingestion/stage/table_usage_stage.py
+++ b/ingestion/src/metadata/ingestion/stage/table_usage_stage.py
--- a/ingestion/src/metadata/ingestion/workflow/workflow.py
+++ b/ingestion/src/metadata/ingestion/workflow/workflow.py
@ -31,11 +31,8 @@ from metadata.ingestion.api.processor import Processor
 from metadata.ingestion.api.sink import Sink
 from metadata.ingestion.api.source import Source
 from metadata.ingestion.api.stage import Stage
-from metadata.ingestion.bulksink.bulk_sink_registry import bulk_sink_registry
-from metadata.ingestion.sink.sink_registry import sink_registry
-from metadata.ingestion.source.source_registry import source_registry
-from metadata.ingestion.processor.processor_registry import processor_registry
-from metadata.ingestion.stage.stage_registry import stage_registry
+from metadata.ingestion.api.registry import Registry
+from metadata.ingestion.api.source import Source

 logger = logging.getLogger(__name__)

@ -61,9 +58,10 @@ class Workflow:
    def __init__(self, config: WorkflowConfig):
        self.config = config
        self.ctx = WorkflowContext(workflow_id=self.config.run_id)
-
        source_type = self.config.source.type
-        source_class = source_registry.get(source_type)
+        source_registry = Registry[Source]()
+        source_class = source_registry.get('metadata.ingestion.source.{}.{}Source'.format(
+            source_type.replace('-', '_'), ''.join([i.title() for i in source_type.replace('-', '_').split('_')])))
        metadata_config = self.config.metadata_server.dict().get("config", {})
        self.source: Source = source_class.create(
            self.config.source.dict().get("config", {}), metadata_config, self.ctx
@ -74,28 +72,36 @@ class Workflow:

        if self.config.processor:
            processor_type = self.config.processor.type
-            processor_class = processor_registry.get(processor_type)
+            processor_registry = Registry[Processor]()
+            processor_class = processor_registry.get('metadata.ingestion.processor.{}.{}Processor'.format(
+                processor_type.replace('-', '_'), ''.join([i.title() for i in processor_type.replace('-', '_').split('_')])))
            processor_config = self.config.processor.dict().get("config", {})
            self.processor: Processor = processor_class.create(processor_config, metadata_config, self.ctx)
            logger.debug(f"Processor Type: {processor_type}, {processor_class} configured")

        if self.config.stage:
            stage_type = self.config.stage.type
-            stage_class = stage_registry.get(stage_type)
+            stage_registry = Registry[Stage]()
+            stage_class = stage_registry.get('metadata.ingestion.stage.{}.{}Stage'.format(
+                stage_type.replace('-', '_'), ''.join([i.title() for i in stage_type.replace('-', '_').split('_')])))
            stage_config = self.config.stage.dict().get("config", {})
            self.stage: Stage = stage_class.create(stage_config, metadata_config, self.ctx)
            logger.debug(f"Stage Type: {stage_type}, {stage_class} configured")

        if self.config.sink:
            sink_type = self.config.sink.type
-            sink_class = sink_registry.get(sink_type)
+            sink_registry = Registry[Sink]()
+            sink_class = sink_registry.get('metadata.ingestion.sink.{}.{}Sink'.format(
+                sink_type.replace('-', '_'), ''.join([i.title() for i in sink_type.replace('-', '_').split('_')])))
            sink_config = self.config.sink.dict().get("config", {})
            self.sink: Sink = sink_class.create(sink_config, metadata_config, self.ctx)
            logger.debug(f"Sink type:{self.config.sink.type},{sink_class} configured")

        if self.config.bulk_sink:
            bulk_sink_type = self.config.bulk_sink.type
-            bulk_sink_class = bulk_sink_registry.get(bulk_sink_type)
+            bulk_sink_registry = Registry[BulkSink]()
+            bulk_sink_class = bulk_sink_registry.get('metadata.ingestion.bulksink.{}.{}BulkSink'.format(
+                bulk_sink_type.replace('-', '_'), ''.join([i.title() for i in bulk_sink_type.replace('-', '_').split('_')])))
            bulk_sink_config = self.config.bulk_sink.dict().get("config", {})
            self.bulk_sink: BulkSink = bulk_sink_class.create(bulk_sink_config, metadata_config, self.ctx)
            logger.info(f"BulkSink type:{self.config.bulk_sink.type},{bulk_sink_class} configured")