refactor(py3): Refactor all ETL scripts to use Python 3 exclusively (#1710)

* refactor(py3): Refactor all ETL scripts to use Python 3 exclusively

Fix https://github.com/linkedin/datahub/issues/1688

* Update requirements.txt
Mars Lan 2020-06-25 15:16:04 -07:00 committed by GitHub
parent 60b7c63b26
commit fa9fe5e110
11 changed files with 50 additions and 42 deletions

View File

@ -4,14 +4,20 @@
1. Before running any metadata ingestion job, you should make sure that DataHub backend services are all running. Easiest
way to do that is through [Docker images](../docker).
2. You also need to build the `mxe-schemas` module as below.
```
./gradlew :metadata-events:mxe-schemas:build
```
This is needed to generate `MetadataChangeEvent.avsc` which is the schema for `MetadataChangeEvent` Kafka topic.
3. Before launching each ETL ingestion pipeline, you can install/verify the library versions as below.
```
pip install --user -r requirements.txt
```
```
./gradlew :metadata-events:mxe-schemas:build
```
This is needed to generate `MetadataChangeEvent.avsc` which is the schema for `MetadataChangeEvent` Kafka topic.
3. All the scripts are written using Python 3 and most likely won't work with Python 2.x interpreters.
You can verify the version of your Python using the following command.
```
python --version
```
We recommend using [pyenv](https://github.com/pyenv/pyenv) to install and manage your Python environment.
4. Before launching each ETL ingestion pipeline, you can install/verify the library versions as below.
```
pip install --user -r requirements.txt
```
## MCE Producer/Consumer CLI
`mce_cli.py` script provides a convenient way to produce a list of MCEs from a data file.
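
For context, the produce path that `mce_cli.py` implements amounts to loading the MCE Avro schema and handing each record from the data file to an `AvroProducer`. A minimal sketch of that flow, assuming a local broker and schema registry; the data-file name and connection settings are placeholders, not taken from this commit:
```
import ast

from confluent_kafka import avro
from confluent_kafka.avro import AvroProducer

TOPIC = "MetadataChangeEvent"
SCHEMA_PATH = "../../metadata-events/mxe-schemas/src/renamed/avro/com/linkedin/mxe/MetadataChangeEvent.avsc"
DATA_FILE = "bootstrap_mce.dat"  # placeholder: one MCE dict literal per line

conf = {
    "bootstrap.servers": "localhost:9092",            # placeholder broker
    "schema.registry.url": "http://localhost:8081",   # placeholder registry
}

# Load the MetadataChangeEvent schema and produce one record per line of the data file.
producer = AvroProducer(conf, default_value_schema=avro.load(SCHEMA_PATH))
with open(DATA_FILE) as f:
    for line in f:
        if not line.strip():
            continue
        producer.produce(topic=TOPIC, value=ast.literal_eval(line.strip()))
producer.flush()
```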

View File

@ -1,6 +1,8 @@
#! /usr/bin/python
import sys
import time
from confluent_kafka import avro
from confluent_kafka.avro import AvroProducer
from pyhive import hive
from TCLIService.ttypes import TOperationState
@ -16,7 +18,7 @@ def hive_query(query):
Execute the query to the HiveStore.
"""
cursor = hive.connect(HIVESTORE).cursor()
cursor.execute(query, async=True)
cursor.execute(query, async_=True)
status = cursor.poll().operationState
while status in (TOperationState.INITIALIZED_STATE, TOperationState.RUNNING_STATE):
logs = cursor.fetch_logs()
@ -49,9 +51,6 @@ def produce_hive_dataset_mce(mce):
"""
Produce MetadataChangeEvent records.
"""
from confluent_kafka import avro
from confluent_kafka.avro import AvroProducer
conf = {'bootstrap.servers': BOOTSTRAP,
'schema.registry.url': SCHEMAREGISTRY}
record_schema = avro.load(AVROLOADPATH)
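
The key Python 3 fix in this script is `async_=True`: `async` became a reserved keyword in Python 3.7, so the old `async=True` call no longer even parses and PyHive exposes the trailing-underscore spelling instead. A runnable sketch of that asynchronous polling pattern, assuming a reachable HiveServer2 host:
```
import time

from pyhive import hive
from TCLIService.ttypes import TOperationState

HIVESTORE = 'localhost'  # placeholder HiveServer2 host


def hive_query(query):
    """Run a query asynchronously and block until it finishes."""
    cursor = hive.connect(HIVESTORE).cursor()
    cursor.execute(query, async_=True)  # 'async=True' is a syntax error on Python 3.7+
    status = cursor.poll().operationState
    while status in (TOperationState.INITIALIZED_STATE, TOperationState.RUNNING_STATE):
        time.sleep(1)
        status = cursor.poll().operationState
    return cursor.fetchall()


print(hive_query('SHOW DATABASES'))
```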

View File

@ -1,2 +1,4 @@
confluent-kafka[avro]==1.1.0
pyhive==0.6.1
avro-python3==1.8.2
confluent-kafka==1.4.0
pyhive==0.6.1
thrift-sasl==0.4.2

View File

@ -2,7 +2,9 @@
import sys
import time
from kazoo.client import KazooClient
from confluent.schemaregistry.client import CachedSchemaRegistryClient
from confluent_kafka import avro
from confluent_kafka.avro.cached_schema_registry_client import CachedSchemaRegistryClient
from confluent_kafka.avro import AvroProducer
ZOOKEEPER='localhost:2181'
AVROLOADPATH = '../../metadata-events/mxe-schemas/src/renamed/avro/com/linkedin/mxe/MetadataChangeEvent.avsc'
@ -15,7 +17,7 @@ def build_kafka_dataset_mce(dataset_name, schema, schema_version):
"""
Create the MetadataChangeEvent via dataset_name and schema.
"""
actor, sys_time = "urn:li:corpuser:", long(time.time())
actor, sys_time = "urn:li:corpuser:", time.time()
schema_name = {"schemaName":dataset_name,"platform":"urn:li:dataPlatform:kafka","version":schema_version,"created":{"time":sys_time,"actor":actor},
"lastModified":{"time":sys_time,"actor":actor},"hash":"","platformSchema":{"documentSchema": schema},
"fields":[{"fieldPath":"","description":"","nativeDataType":"string","type":{"type":{"com.linkedin.pegasus2avro.schema.StringType":{}}}}]}
@ -31,9 +33,6 @@ def produce_kafka_dataset_mce(mce):
"""
Produce MetadataChangeEvent records.
"""
from confluent_kafka import avro
from confluent_kafka.avro import AvroProducer
conf = {'bootstrap.servers': BOOTSTRAP,
'schema.registry.url': SCHEMAREGISTRY}
record_schema = avro.load(AVROLOADPATH)
@ -58,7 +57,11 @@ for dataset_name in topics:
continue
topic = dataset_name + '-value'
schema_id, schema, schema_version = client.get_latest_schema(topic)
print topic
if schema_id is None:
print(f"Skipping topic without schema: {topic}")
continue
print(topic)
build_kafka_dataset_mce(dataset_name, str(schema), int(schema_version))
sys.exit(0)
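
Two Python 3 details surface here: `long` no longer exists (the plain `time.time()` value is used instead), and the schema-registry client is imported from `confluent_kafka.avro` rather than the old standalone `confluent` package. A small sketch of the topic/schema discovery loop the script runs; the ZooKeeper address comes from the diff, while the registry URL is a placeholder assumption:
```
from confluent_kafka.avro.cached_schema_registry_client import CachedSchemaRegistryClient
from kazoo.client import KazooClient

ZOOKEEPER = 'localhost:2181'              # as in the script above
SCHEMAREGISTRY = 'http://localhost:8081'  # placeholder registry URL

# Discover topics from ZooKeeper, then look up the latest value schema for each.
zk = KazooClient(hosts=ZOOKEEPER, read_only=True)
zk.start()
topics = zk.get_children('/brokers/topics')
zk.stop()

client = CachedSchemaRegistryClient({'url': SCHEMAREGISTRY})
for dataset_name in topics:
    schema_id, schema, schema_version = client.get_latest_schema(dataset_name + '-value')
    if schema_id is None:
        print(f"Skipping topic without schema: {dataset_name}")
        continue
    print(dataset_name, schema_version)
```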

View File

@ -1,3 +1,3 @@
confluent-kafka[avro]==1.1.0
python-schema-registry-client==1.2.1
avro-python3==1.8.2
confluent-kafka==1.4.0
kazoo==2.5.0

View File

@ -1,6 +1,8 @@
#! /usr/bin/python
import sys
import ldap
from confluent_kafka import avro
from confluent_kafka.avro import AvroProducer
from ldap.controls import SimplePagedResultsControl
from distutils.version import LooseVersion
@ -10,7 +12,7 @@ LDAPSERVER ='LDAPSERVER'
BASEDN ='BASEDN'
LDAPUSER = 'LDAPUSER'
LDAPPASSWORD = 'LDAPPASSWORD'
PAGESIZE = PAGESIZE
PAGESIZE = 20
ATTRLIST = ['cn', 'title', 'mail', 'sAMAccountName', 'department','manager']
SEARCHFILTER='SEARCHFILTER'
@ -81,9 +83,6 @@ def produce_corp_user_mce(mce):
"""
Produce MetadataChangeEvent records
"""
from confluent_kafka import avro
from confluent_kafka.avro import AvroProducer
conf = {'bootstrap.servers': BOOTSTRAP,
'schema.registry.url': SCHEMAREGISTRY}
record_schema = avro.load(AVROLOADPATH)
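
Besides the import changes, the commit replaces the `PAGESIZE = PAGESIZE` placeholder with a concrete page size of 20. The script walks the directory with an LDAP paged search; a minimal sketch of that paging loop with python-ldap 3.x, where the server, bind credentials, and filter are placeholder assumptions:
```
import ldap
from ldap.controls import SimplePagedResultsControl

LDAPSERVER = 'ldap://ldap.example.com'    # placeholder
BASEDN = 'dc=example,dc=com'              # placeholder
LDAPUSER = 'cn=reader,dc=example,dc=com'  # placeholder
LDAPPASSWORD = 'secret'                   # placeholder
PAGESIZE = 20
ATTRLIST = ['cn', 'title', 'mail', 'sAMAccountName', 'department', 'manager']
SEARCHFILTER = '(objectClass=person)'     # placeholder

conn = ldap.initialize(LDAPSERVER)
conn.simple_bind_s(LDAPUSER, LDAPPASSWORD)

# Ask the server for PAGESIZE entries at a time; the returned cookie points at the next page.
page_control = SimplePagedResultsControl(True, size=PAGESIZE, cookie='')
while True:
    msgid = conn.search_ext(BASEDN, ldap.SCOPE_SUBTREE, SEARCHFILTER,
                            ATTRLIST, serverctrls=[page_control])
    _, results, _, serverctrls = conn.result3(msgid)
    for dn, attrs in results:
        print(dn)
    cookies = [c.cookie for c in serverctrls
               if c.controlType == SimplePagedResultsControl.controlType]
    if not cookies or not cookies[0]:
        break
    page_control.cookie = cookies[0]
```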

View File

@ -1,2 +1,3 @@
confluent-kafka[avro]==1.1.0
avro-python3==1.8.2
confluent-kafka==1.4.0
python-ldap==3.2.0

View File

@ -1,6 +1,11 @@
#! /usr/bin/python
import argparse
import ast
from confluent_kafka import avro
from confluent_kafka.avro import AvroConsumer
from confluent_kafka.avro import AvroProducer
from confluent_kafka.avro.serializer import SerializerError
topic = "MetadataChangeEvent"
@ -13,9 +18,6 @@ def produce(conf, data_file, schema_record):
"""
Produce MetadataChangeEvent records
"""
from confluent_kafka.avro import AvroProducer
import ast
producer = AvroProducer(conf, default_value_schema=avro.load(schema_record))
print("Producing MetadataChangeEvent records to topic {}. ^c to exit.".format(topic))
@ -36,7 +38,7 @@ def produce(conf, data_file, schema_record):
break
except ValueError as e:
print ("Message serialization failed {}".format(e))
continue
break
print("Flushing records...")
producer.flush()
@ -46,9 +48,6 @@ def consume(conf, schema_record):
"""
Consume MetadataChangeEvent records
"""
from confluent_kafka.avro import AvroConsumer
from confluent_kafka.avro.serializer import SerializerError
print("Consuming MetadataChangeEvent records from topic {} with group {}. ^c to exit.".format(topic, conf["group.id"]))
c = AvroConsumer(conf, reader_value_schema=avro.load(schema_record))
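
On the consume side, the script wraps an `AvroConsumer` with the MCE schema as the reader schema and polls until interrupted. A self-contained sketch of that loop; the broker, registry URL, and group id are placeholder assumptions:
```
from confluent_kafka import avro
from confluent_kafka.avro import AvroConsumer
from confluent_kafka.avro.serializer import SerializerError

topic = "MetadataChangeEvent"
SCHEMA_PATH = "../../metadata-events/mxe-schemas/src/renamed/avro/com/linkedin/mxe/MetadataChangeEvent.avsc"

conf = {
    "bootstrap.servers": "localhost:9092",           # placeholder broker
    "schema.registry.url": "http://localhost:8081",  # placeholder registry
    "group.id": "mce-cli-demo",                      # placeholder consumer group
}

c = AvroConsumer(conf, reader_value_schema=avro.load(SCHEMA_PATH))
c.subscribe([topic])
try:
    while True:
        try:
            msg = c.poll(1)
        except SerializerError as e:
            print("Message deserialization failed: {}".format(e))
            break
        if msg is None:
            continue
        if msg.error():
            print("Consumer error: {}".format(msg.error()))
            continue
        print(msg.value())
except KeyboardInterrupt:
    pass
finally:
    c.close()
```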

View File

@ -1,3 +1,2 @@
avro-python3==1.8.2; python_version == '3.7'
confluent-kafka==1.1.0; python_version == '3.7'
confluent-kafka[avro]==1.1.0; python_version < '3.7'
avro-python3==1.8.2
confluent-kafka==1.4.0

View File

@ -3,6 +3,8 @@ import sys
import time
import mysql.connector
from mysql.connector import Error
from confluent_kafka import avro
from confluent_kafka.avro import AvroProducer
HOST = 'HOST'
DATABASE = 'DATABASE'
@ -19,7 +21,7 @@ def build_mysql_dataset_mce(dataset_name, schema, schema_version):
"""
Create the MetadataChangeEvent via dataset_name and schema.
"""
actor, fields, sys_time = "urn:li:corpuser:datahub", [], long(time.time())
actor, fields, sys_time = "urn:li:corpuser:datahub", [], time.time()
owner = {"owners":[{"owner":actor,"type":"DATAOWNER"}],"lastModified":{"time":0,"actor":actor}}
@ -41,9 +43,6 @@ def produce_mysql_dataset_mce(mce):
"""
Produce MetadataChangeEvent records.
"""
from confluent_kafka import avro
from confluent_kafka.avro import AvroProducer
conf = {'bootstrap.servers': BOOTSTRAP,
'schema.registry.url': SCHEMAREGISTRY}
record_schema = avro.load(AVROLOADPATH)
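
As in the Kafka script, `long(time.time())` becomes a plain `time.time()` because Python 3 folds `long` into `int`. For reference, a minimal sketch of how a script like this can pull per-column metadata out of MySQL before assembling the MCE fields; the connection settings and query are illustrative assumptions, not taken from the commit:
```
import mysql.connector
from mysql.connector import Error

HOST = 'localhost'    # placeholder
DATABASE = 'datahub'  # placeholder
USER = 'datahub'      # placeholder
PASSWORD = 'datahub'  # placeholder

conn = None
try:
    conn = mysql.connector.connect(host=HOST, database=DATABASE, user=USER, password=PASSWORD)
    cursor = conn.cursor()
    # Column-level metadata is the raw material for the SchemaField entries in the MCE.
    cursor.execute(
        "SELECT table_name, column_name, data_type, column_comment "
        "FROM information_schema.columns WHERE table_schema = %s "
        "ORDER BY table_name, ordinal_position", (DATABASE,))
    for table_name, column_name, data_type, comment in cursor.fetchall():
        print(table_name, column_name, data_type, comment)
except Error as e:
    print("Failed to read schema metadata: {}".format(e))
finally:
    if conn is not None and conn.is_connected():
        conn.close()
```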

View File

@ -1,2 +1,3 @@
confluent-kafka[avro]==1.1.0
avro-python3==1.8.2
confluent-kafka==1.4.0
mysql-connector==2.2.9