Setup.py Refactored, ES port fix (#521)

* Pylint build failure fixed

* Setup & dependency modified, Data profiler default to False, ES port fix

* Profiler requirements refactored

* Setup.py requirement fix

* openmetadata-ingestion version upgrade
This commit is contained in:
Ayush Shah 2021-09-19 13:59:14 +05:30 committed by GitHub
parent 3937971959
commit 7652baa00d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 13 additions and 149 deletions

View File

@ -1,103 +0,0 @@
---
description: This guide will help you install the Redshift connector and run it manually
---
# Redshift
{% hint style="info" %}
**Prerequisites**
OpenMetadata is built using Java, DropWizard, Jetty, and MySQL.
1. Python 3.7 or above
{% endhint %}
### Install from PyPI or Source
{% tabs %}
{% tab title="Install Using PyPI" %}
```bash
pip install 'openmetadata-ingestion[redshift]'
python -m spacy download en_core_web_sm
```
{% endtab %}
{% endtabs %}
## Run Manually
```bash
metadata ingest -c ./examples/workflows/redshift.json
```
### Configuration
{% code title="redshift.json" %}
```javascript
{
"source": {
"type": "redshift",
"config": {
"host_port": "redshift-cluster-1.clot5cqn1cnb.us-west-2.redshift.amazonaws.com:5439",
"username": "awsuser",
"password": "focguC-kaqqe5-nepsok",
"database": "warehouse",
"service_name": "aws_redshift",
"filter_pattern": {
"excludes": ["information_schema.*", "[\\w]*event_vw.*"]
}
}
},
...
```
{% endcode %}
1. **username** - pass the Redshift username. We recommend creating a user with read-only permissions to all the databases in your Redshift installation
2. **password** - password for the username
3. **service\_name** - Service name for this Redshift cluster. If you added the Redshift cluster through the OpenMetadata UI, make sure this service name matches the one used there.
4. **filter\_pattern** - It contains `includes` and `excludes` options for choosing which dataset name patterns you want to ingest into OpenMetadata.
## Publish to OpenMetadata
Below is the configuration to publish Redshift data into the OpenMetadata service.
Optionally add the `pii` processor and `metadata-rest-tables` sink along with the `metadata-server` config.
{% code title="redshift.json" %}
```javascript
{
"source": {
"type": "redshift",
"config": {
"host_port": "redshift-cluster-1.clot5cqn1cnb.us-west-2.redshift.amazonaws.com:5439",
"username": "awsuser",
"password": "focguC-kaqqe5-nepsok",
"database": "warehouse",
"service_name": "aws_redshift",
"filter_pattern": {
"excludes": ["information_schema.*", "[\\w]*event_vw.*"]
}
}
},
"sink": {
"type": "metadata-rest",
"config": {}
},
"metadata_server": {
"type": "metadata-server",
"config": {
"api_endpoint": "http://localhost:8585/api",
"auth_provider_type": "no-auth"
}
},
"cron": {
"minute": "*/5",
"hour": null,
"day": null,
"month": null,
"day_of_week": null
}
}
```
{% endcode %}

View File

@ -1,33 +0,0 @@
{
"source": {
"type": "redshift",
"config": {
"host_port": "redshift-cluster-1.clot5cqn1cnb.us-west-2.redshift.amazonaws.com:5439",
"username": "awsuser",
"password": "focguC-kaqqe5-nepsok",
"database": "warehouse",
"service_name": "aws_redshift",
"filter_pattern": {
"excludes": ["information_schema.*", "[\\w]*event_vw.*"]
}
}
},
"sink": {
"type": "metadata-rest",
"config": {}
},
"metadata_server": {
"type": "metadata-server",
"config": {
"api_endpoint": "http://localhost:8585/api",
"auth_provider_type": "no-auth"
}
},
"cron": {
"minute": "*/5",
"hour": null,
"day": null,
"month": null,
"day_of_week": null
}
}

View File

@ -14,7 +14,7 @@
"index_topics": "true",
"index_dashboards": "true",
"es_host": "localhost",
"es_port": 9300
"es_port": 9200
}
},
"metadata_server": {

View File

@ -41,9 +41,6 @@ scheduler_requirements = {
"simplescheduler@git+git://github.com/open-metadata/simplescheduler.git#egg=simplescheduler"
}
profiler_requirements = {
"openmetadata-data-profiler@git+git://github.com/open-metadata/data-profiler.git#egg=openmetadata-data-profiler"
}
base_requirements = {
"commonregex",
@ -65,10 +62,14 @@ base_requirements = {
"okta>=1.7.0",
"sqlalchemy>=1.3.24",
"sql-metadata~=2.0.0",
"spacy==3.0.5",
"requests~=2.25.1",
"en_core_web_sm@https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.0.0/en_core_web_sm-3.0.0.tar.gz#egg=en_core_web"
"requests~=2.25.1"
}
pii_requirements = {
"en_core_web_sm@https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.0.0/en_core_web_sm-3.0.0.tar.gz#egg=en_core_web",
"pandas~=1.3.1",
"spacy==3.0.5"
}
base_plugins = {
"query-parser",
"metadata-usage",
@ -88,16 +89,16 @@ plugins: Dict[str, Set[str]] = {
"mssql-odbc": {"pyodbc"},
"mysql": {"pymysql>=1.0.2"},
"oracle": {"cx_Oracle"},
"pii-processor": {"pandas~=1.3.1"},
"pii-processor": pii_requirements,
"presto": {"pyhive~=0.6.3"},
"postgres": {"pymysql>=1.0.2", "psycopg2-binary", "GeoAlchemy2"},
"redshift": {"sqlalchemy-redshift", "GeoAlchemy2", "psycopg2-binary"},
"redshift-usage": {"sqlalchemy-redshift", "psycopg2-binary", "GeoAlchemy2"},
"scheduler": scheduler_requirements,
"data-profiler": profiler_requirements,
"data-profiler": {"openmetadata-data-profiler"},
"snowflake": {"snowflake-sqlalchemy<=1.2.4"},
"snowflake-usage": {"snowflake-sqlalchemy<=1.2.4"},
"sample-data": {"faker~=8.1.1"},
"sample-data": {"faker~=8.1.1","pandas~=1.3.1"},
"superset": {},
"tableau": {"tableau-api-lib==0.1.22"},
"vertica": {"sqlalchemy-vertica[vertica-python]>=0.0.5"}
@ -106,7 +107,7 @@ plugins: Dict[str, Set[str]] = {
build_options = {"includes": ["_cffi_backend"]}
setup(
name="openmetadata-ingestion",
version="0.2.2",
version="0.3.0",
url="https://open-metadata.org/",
author="OpenMetadata Committers",
license="Apache License 2.0",

View File

@ -21,7 +21,6 @@ from typing import Any, Dict, Iterable, List, Optional, Set, Tuple, Type
from urllib.parse import quote_plus
from pydantic import ValidationError
from metadata.config.common import ConfigurationError
from metadata.generated.schema.entity.services.databaseService import DatabaseServiceType
from metadata.ingestion.models.ometa_table_db import OMetaDatabaseAndTable
@ -78,7 +77,7 @@ class SQLConnectionConfig(ConfigModel):
include_views: Optional[bool] = True
include_tables: Optional[bool] = True
generate_sample_data: Optional[bool] = True
data_profiler_enabled: Optional[bool] = True
data_profiler_enabled: Optional[bool] = False
data_profiler_offset: Optional[int] = 0
data_profiler_limit: Optional[int] = 50000
filter_pattern: IncludeFilterPattern = IncludeFilterPattern.allow_all()