Mirror of https://github.com/datahub-project/datahub.git (synced 2025-12-28 10:28:22 +00:00)
Commit 0401cdf31d
.travis.yml (new file, +15 lines)
@@ -0,0 +1,15 @@
+language: java
+
+jdk:
+- oraclejdk8
+
+before_install:
+# download play 2.2.4
+- wget http://downloads.typesafe.com/play/2.2.4/play-2.2.4.zip
+- unzip play-2.2.4.zip && rm play-2.2.4.zip && mv play-2.2.4 $HOME/
+- export PLAY_HOME="$HOME/play-2.2.4"
+- echo $PLAY_HOME
+
+# change the build file stack size
+- sed -i -e 's/-Xss1M/-Xss2M/g' $PLAY_HOME/framework/build
+- cat $PLAY_HOME/framework/build

@@ -75,11 +75,11 @@ idea {
   }
 }
 
-task "build" (type: Exec, dependsOn: playCompile) {
+task "build" (type: Exec, dependsOn: playCompile, overwrite: true) {
   commandLine playExec, 'stage'
 }
 
-task "dist" (type: Exec) {
+task "dist" (type: Exec, overwrite: true) {
   commandLine playExec, 'dist'
 }
 
@@ -48,8 +48,8 @@ subprojects {
   }
 }
 
-ext.externalDependency = [//"mysql" : "mysql:mysql-connector-java:5.1.36",
-    //"jython" : "org.python:jython-standalone:2.7.0",
+ext.externalDependency = ["mysql" : "mysql:mysql-connector-java:5.1.36",
+    "jython" : "org.python:jython-standalone:2.7.0",
     "testng" : "org.testng:testng:6.9.6",
     "hadoop_common" : "org.apache.hadoop:hadoop-common:2.7.1",
     "hadoop_client" : "org.apache.hadoop:hadoop-mapreduce-client-core:2.7.1",
@@ -25,10 +25,10 @@ dependencies {
   compile externalDependency.hive_exec
   compile files("extralibs/terajdbc4-15.00.00.20.jar")
   compile files("extralibs/tdgssconfig-15.00.00.20.jar")
-  // compile externalDependency.jython
-  // compile externalDependency.mysql
-  compile files("extralibs/mysql-connector-java-5.1.36.jar") // externalDependency.mysql
-  compile files("extralibs/jython-standalone-2.7.0.jar") //externalDependency.jython
+  compile externalDependency.jython
+  compile externalDependency.mysql
+  // compile files("extralibs/mysql-connector-java-5.1.36.jar")
+  // compile files("extralibs/jython-standalone-2.7.0.jar")
   provided project(":hadoop-dataset-extractor-standalone")
   testCompile externalDependency.testng
 }
@@ -1,6 +1,4 @@
 Please get the extra library files, which may not be available in Maven/TypeSafe repository or Artifactory, and put them here. For example:
 
 * https://downloads.teradata.com/download/connectivity/jdbc-driver
-* http://dev.mysql.com/downloads/connector/j
 * http://download.oracle.com/otn/utilities_drivers/jdbc/121010/ojdbc7.jar
-* http://www.jython.org/downloads.html
@@ -463,7 +463,7 @@ class TeradataExtract:
     data_with_column = map(lambda x:dict(zip(columns, x)), rows_data)
     return ref_urn, json.dumps({'sample': data_with_column})
 
-  def run(self, database_name, table_name, schema_output_file, sample_output_file):
+  def run(self, database_name, table_name, schema_output_file, sample_output_file, sample=True):
     """
     The entrance of the class, extract schema and sample data
     Notice the database need to have a order that the databases have more info (DWH_STG) should be scaned first.
@@ -480,9 +480,6 @@ class TeradataExtract:
     schema_json = open(schema_output_file, 'wb')
     os.chmod(schema_output_file, 0666)
 
-    open(sample_output_file, 'wb')
-    os.chmod(sample_output_file, 0666)
-    sample_file_writer = FileWriter(sample_output_file)
 
     if database_name is None and table_name is None: # default route: process everything
       for database_name in self.databases:
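Worth noting why these three lines move (into the `if sample:` block in the next hunk) rather than stay here: `open(path, 'wb')` creates or truncates the file as a side effect, so doing it unconditionally leaves an empty, world-writable sample file even on schema-only runs. A small illustration of that side effect, using a hypothetical path:

import os

path = '/tmp/sample_output.dat'  # hypothetical path, for illustration only
open(path, 'wb')                 # creates (or truncates) the file immediately
os.chmod(path, 0o666)            # and this widens its permissions,
                                 # even if no sample data is ever written
print(os.path.exists(path), os.path.getsize(path))  # -> True 0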
@@ -506,25 +503,31 @@ class TeradataExtract:
       f_log.write("Get view info %12s [%s -> %s]\n" % (database_name, str(begin), str(end)))
 
     scaned_dict = {} # a cache of {name : {urn : _, data : _}} to avoid repeat computing
-    # collect sample data
-    for onedatabase in schema:
-      database_name = onedatabase['database']
-      if 'tables' in onedatabase:
-        alltables = onedatabase['tables']
-      else:
-        alltables = onedatabase['views']
-
-      for onetable in alltables:
-        table_name = onetable['original_name'].split('.')[1]
-        if table_name in scaned_dict:
-          sample_record = SampleDataRecord('teradata', '/' + database_name + '/' + table_name,
-                                           scaned_dict[table_name]['ref_urn'], scaned_dict[table_name]['data'])
-        else:
-          (ref_urn, sample_data) = self.get_sample_data(database_name, table_name)
-          sample_record = SampleDataRecord('teradata', '/' + database_name + '/' + table_name, '', sample_data)
-          scaned_dict[table_name] = {'ref_urn': ref_urn, 'data': sample_data}
-        sample_file_writer.append(sample_record)
-    sample_file_writer.close()
+    if sample:
+      open(sample_output_file, 'wb')
+      os.chmod(sample_output_file, 0666)
+      sample_file_writer = FileWriter(sample_output_file)
+
+      # collect sample data
+      for onedatabase in schema:
+        database_name = onedatabase['database']
+        if 'tables' in onedatabase:
+          alltables = onedatabase['tables']
+        else:
+          alltables = onedatabase['views']
+
+        for onetable in alltables:
+          table_name = onetable['original_name'].split('.')[1]
+          if table_name in scaned_dict:
+            sample_record = SampleDataRecord('teradata', '/' + database_name + '/' + table_name,
+                                             scaned_dict[table_name]['ref_urn'], scaned_dict[table_name]['data'])
+          else:
+            (ref_urn, sample_data) = self.get_sample_data(database_name, table_name)
+            sample_record = SampleDataRecord('teradata', '/' + database_name + '/' + table_name, '', sample_data)
+            scaned_dict[table_name] = {'ref_urn': ref_urn, 'data': sample_data}
+          sample_file_writer.append(sample_record)
+      sample_file_writer.close()
+
 
     # print 'byte size of schema : ' + str(sys.getsizeof(schema))
     schema_json.write(json.dumps(schema, indent=None) + '\n')
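To make the behavioral change easier to follow outside the diff: the hunk above puts all sample-data work behind the new `sample` flag, so a `sample=False` run writes only the schema file. Below is a minimal, self-contained sketch of that control flow. `FileWriter` and `get_sample_data` here are simplified stand-ins for the classes used in the source (the real ones live elsewhere in the repository), and the file names are hypothetical.

import json

# Illustrative stand-ins; the real FileWriter and get_sample_data are
# defined elsewhere in the repository.
class FileWriter(object):
    def __init__(self, path):
        self.f = open(path, 'w')

    def append(self, record):
        self.f.write(json.dumps(record) + '\n')

    def close(self):
        self.f.close()

def get_sample_data(database_name, table_name):
    # placeholder: the real method queries Teradata for sample rows
    return 'ref:' + table_name, [{'col': 1}]

def run(schema, schema_output_file, sample_output_file, sample=True):
    scaned_dict = {}  # cache of {table : {'ref_urn': _, 'data': _}}
    if sample:
        # the sample file is only created/truncated when sampling is enabled
        writer = FileWriter(sample_output_file)
        for onedatabase in schema:
            database_name = onedatabase['database']
            alltables = onedatabase.get('tables', onedatabase.get('views', []))
            for onetable in alltables:
                table_name = onetable['original_name'].split('.')[1]
                if table_name not in scaned_dict:
                    ref_urn, data = get_sample_data(database_name, table_name)
                    scaned_dict[table_name] = {'ref_urn': ref_urn, 'data': data}
                writer.append(scaned_dict[table_name])
        writer.close()
    # the schema file is written regardless of the flag
    with open(schema_output_file, 'w') as schema_json:
        schema_json.write(json.dumps(schema) + '\n')

# hypothetical input; sample=False mirrors the commit's new call site
schema = [{'database': 'DWH_STG', 'tables': [{'original_name': 'DWH_STG.t1'}]}]
run(schema, 'schema.json', 'sample.json', sample=False)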
@@ -554,7 +557,7 @@ if __name__ == "__main__":
     index_type = {'P': 'Primary Index', 'K': 'Primary Key', 'S': 'Secondary Index', 'Q': 'Partitioned Primary Index',
                   'J': 'Join Index', 'U': 'Unique Index'}
 
-    e.run(None, None, args[Constant.TD_SCHEMA_OUTPUT_KEY], args[Constant.TD_SAMPLE_OUTPUT_KEY])
+    e.run(None, None, args[Constant.TD_SCHEMA_OUTPUT_KEY], args[Constant.TD_SAMPLE_OUTPUT_KEY], sample=False)
   finally:
     e.conn_td.close()
 
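Because the new parameter defaults to True, any other caller of run() is unaffected; only this entry point opts out of sample collection. A toy demonstration of that compatibility property (signature copied from the diff, body reduced to just the flag):

# signature copied from the diff; body reduced to just the flag
def run(database_name, table_name, schema_output_file, sample_output_file, sample=True):
    return sample

# existing callers that don't pass the new argument keep the old behavior
assert run(None, None, 'schema.json', 'sample.json') is True
# only this job's entry point opts out of sample collection
assert run(None, None, 'schema.json', 'sample.json', sample=False) is False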
@@ -86,15 +86,23 @@ idea {
   }
 }
 
-task "build" (type: Exec, dependsOn: playClean) {
+// delegate gradle java task to play command
+task "build" (type: Exec, dependsOn: playClean, overwrite: true) {
   commandLine playExec, 'stage'
 }
 
-task "dist" (type: Exec) {
+task "assemble" (type: Exec, dependsOn: playClean, overwrite: true) {
+  commandLine playExec, 'stage'
+}
+
+task "dist" (type: Exec, overwrite: true) {
   commandLine playExec, 'dist'
 }
 
+task "check" (overwrite: true) {
+  // skip gradle check of this repository
+}
 /*
 // optional: if using 'eclipse' plugin
 eclipse {