mirror of
https://github.com/datahub-project/datahub.git
synced 2025-10-11 17:08:23 +00:00

* Use WH_APP_FOLDER property as the base directory for temp files in Oracle ETL, instead of the full path defined in wh_etl_job_property, which is often /var/tmp/{something}. * Move common code to FileUtil.py and move Voldemort's temp files too. * Move Kafka ETL temp files. * Move Espresso ETL temp files. * Move Multiproduct ETL temp files. * Move CodeSearch ETL temp files. * Move Teradata ETL temp files.
315 lines
12 KiB
Python
315 lines
12 KiB
Python
#
|
|
# Copyright 2015 LinkedIn Corp. All rights reserved.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
#
|
|
|
|
import os
|
|
import re
|
|
import sys
|
|
import datetime
|
|
import xml.etree.ElementTree as ET
|
|
|
|
from jython import requests
|
|
from wherehows.common import Constant
|
|
from wherehows.common.schemas import MultiproductProjectRecord
|
|
from wherehows.common.schemas import MultiproductRepoRecord
|
|
from wherehows.common.schemas import MultiproductRepoOwnerRecord
|
|
from wherehows.common.writers import FileWriter
|
|
from org.slf4j import LoggerFactory
|
|
|
|
import FileUtil
|
|
|
|
|
|
class MultiproductLoad:
    """Extract Multiproduct, git project/repo and ACL owner metadata.

    The job runs in four steps (see ``run``):

    1. ``get_multiproducts``  - fetch the product list from the Multiproduct
       service into ``self.multiproduct``.
    2. ``get_project_repo``   - fetch every git project referenced by those
       products, write project records and fill ``self.git_repo``.
    3. ``merge_product_repo`` - join both dictionaries on the
       ``"<scm>:<repo fullname>"`` key and write repo records.
    4. ``get_acl_owners``     - download the ``*.acl`` files of every repo
       and write one owner record per listed owner.

    NOTE: this script targets Jython / Python 2 (it relies on ``long``), so
    no Python-3-only syntax is used here.
    """

    def __init__(self, args):
        """Read job settings from ``args`` and open the three output writers.

        :param args: mapping of job properties, indexed by the ``Constant``
                     keys used below.
        """
        self.logger = LoggerFactory.getLogger('jython script : ' + self.__class__.__name__)
        requests.packages.urllib3.disable_warnings()

        # Keep the job properties on the instance: the fetch methods need the
        # service URLs and must not depend on a module-level ``args`` global.
        self.args = args
        self.app_id = int(args[Constant.APP_ID_KEY])
        self.wh_exec_id = long(args[Constant.WH_EXEC_ID_KEY])

        temp_dir = FileUtil.etl_temp_dir(args, "MULTIPRODUCT")
        self.project_writer = FileWriter(os.path.join(temp_dir, args[Constant.GIT_PROJECT_OUTPUT_KEY]))
        self.repo_writer = FileWriter(os.path.join(temp_dir, args[Constant.PRODUCT_REPO_OUTPUT_KEY]))
        self.repo_owner_writer = FileWriter(os.path.join(temp_dir, args[Constant.PRODUCT_REPO_OWNER_OUTPUT_KEY]))

        # "<scm>:<repo fullname>" -> product attributes (from the Multiproduct service)
        self.multiproduct = {}
        # "git:<repo fullname>" -> repo attributes (from the git project XML)
        self.git_repo = {}
        # every MultiproductRepoRecord written by merge_product_repo()
        self.product_repo = []

    def _content_type(self, resp):
        """Return the media type of ``resp`` without parameters ('' if the header is absent)."""
        return resp.headers.get('content-type', '').split(';')[0]

    def _fetch_acl(self, url, label):
        """GET ``url``; return the response, or None after logging a failure.

        :param url:   address to download.
        :param label: text placed after "GET " in the non-200 debug message.
        """
        try:
            resp = requests.get(url, verify=False)
        except Exception as ex:
            self.logger.info("Error getting acl {} - {}".format(url, ex.message))
            return None
        if resp.status_code != 200:
            self.logger.debug('Request Error: GET {} - {}'.format(label, resp.status_code))
            return None
        return resp

    def get_multiproducts(self):
        """Fetch all products and owners of Multiproduct into ``self.multiproduct``.

        :raises Exception: when the service does not answer with HTTP 200.
        """
        resp = requests.get(self.args[Constant.MULTIPRODUCT_SERVICE_URL], verify=False)
        if resp.status_code != 200:
            # This means something went wrong.
            raise Exception('Request Error', 'GET /api/v1/mpl {}'.format(resp.status_code))

        # scm type -> pattern extracting the repo full name from the trunk URI
        repo_name_patterns = {
            'git': re.compile(r":(.*)\.git$"),
            'svn': re.compile(r"/(.*)/trunk$"),
        }

        if self._content_type(resp) == 'application/json':
            for product_name, product_info in resp.json()['products'].items():
                scm_type = product_info["scm"]["name"]
                repo_name_pattern = repo_name_patterns.get(scm_type)
                if repo_name_pattern is None:
                    # Without this guard the key of the previous product would
                    # be reused and its entry silently overwritten.
                    self.logger.debug("Unsupported scm type {} for product {}".format(scm_type, product_name))
                    continue

                try:
                    repo_fullname = repo_name_pattern.search(product_info["uris"]["trunk"]).group(1)
                except (AttributeError, KeyError, TypeError):
                    # no regex match, missing "uris"/"trunk", or a non-string URI
                    self.logger.debug(
                        "Error parsing repo full name {} - {}".format(product_name, product_info.get("uris")))
                    continue

                self.multiproduct[scm_type + ':' + repo_fullname] = {
                    "scm_repo_fullname": repo_fullname,
                    "scm_type": scm_type,
                    "multiproduct_name": product_name,
                    "product_type": product_info["type"],
                    "namespace": product_info["org"],
                    "owner_name": ",".join(product_info["owners"]),
                    "product_version": product_info["product-version"]
                }

        self.logger.info("Fetched {} Multiproducts".format(len(self.multiproduct)))

    def get_project_repo(self):
        """Fetch detail and repos of all git projects.

        Writes one MultiproductProjectRecord per project through
        ``self.project_writer`` (closed on exit, even on error) and records
        every mainline repository in ``self.git_repo``.
        """
        re_git_project_name = re.compile(r"(.*)/(.*)$")
        re_git_repo_name = re.compile(r"git://[\w\.-]+/(.*)\.git$")

        project_nonexist = []
        project_names = {}  # project name -> number of repos, for projects already written

        try:
            for product in self.multiproduct.values():
                if product["scm_type"] == 'svn':
                    continue

                project_match = re_git_project_name.search(product['scm_repo_fullname'])
                if project_match is None:
                    # repo full name without a "<project>/<repo>" shape
                    self.logger.debug(
                        "Error parsing project name from repo {}".format(product['scm_repo_fullname']))
                    continue
                project_name = project_match.group(1)
                if project_name in project_names:
                    continue

                project_url = '{}/{}?format=xml'.format(self.args[Constant.GIT_URL_PREFIX], project_name)
                try:
                    resp = requests.get(project_url, verify=False)
                except Exception as ex:
                    self.logger.info("Error getting /{}.xml - {}".format(project_name, ex.message))
                    continue
                if resp.status_code != 200:
                    # This means something went wrong.
                    self.logger.debug('Request Error: GET /{}.xml {}'.format(project_name, resp.status_code))
                    project_nonexist.append(project_name)
                    continue

                if self._content_type(resp) != 'application/xml':
                    continue

                xml = ET.fromstring(resp.content)
                owner_node = xml.find('owner')
                current_project = MultiproductProjectRecord(
                    self.app_id,
                    xml.find('slug').text,
                    'git',
                    owner_node.attrib['kind'],
                    owner_node.text,
                    xml.find('created-at').text,
                    xml.find('license').text,
                    self.trim_newline(xml.find('description').text),
                    self.wh_exec_id
                )

                project_repo_names = []
                for repo in xml.findall('repositories/mainlines/repository'):
                    repo_fullname = re_git_repo_name.search(repo.find('clone_url').text).group(1)
                    project_repo_names.append(repo_fullname)
                    repo_owner = repo.find('owner')
                    self.git_repo['git:' + repo_fullname] = {
                        'scm_repo_fullname': repo_fullname,
                        'scm_type': 'git',
                        'repo_id': repo.find('id').text,
                        'project': project_name,
                        'owner_type': repo_owner.attrib['kind'],
                        'owner_name': repo_owner.text
                    }

                project_repo_num = len(project_repo_names)
                current_project.setRepos(project_repo_num, ','.join(project_repo_names))
                self.project_writer.append(current_project)
                project_names[project_name] = project_repo_num
        finally:
            # release the output file even when a response cannot be parsed
            self.project_writer.close()

        self.logger.info("Finish Fetching git projects and repos")
        self.logger.debug('Non-exist projects: {}'.format(project_nonexist))

    def merge_product_repo(self):
        """Merge multiproduct and repo into the same product_repo store.

        Every git repo becomes one record, enriched with product information
        when a product shares its key. Products without a matching git repo
        get a record with repo id 0 and no project / owner type.
        """
        try:
            for key, repo in self.git_repo.items():
                record = MultiproductRepoRecord(
                    self.app_id,
                    repo['scm_repo_fullname'],
                    repo['scm_type'],
                    int(repo['repo_id']),
                    repo['project'],
                    repo['owner_type'],
                    repo['owner_name'],
                    self.wh_exec_id
                )
                product = self.multiproduct.get(key)
                if product is not None:
                    record.setMultiproductInfo(
                        product["multiproduct_name"],
                        product["product_type"],
                        product["product_version"],
                        product["namespace"]
                    )
                self.repo_writer.append(record)
                self.product_repo.append(record)

            for key, product in self.multiproduct.items():
                if key in self.git_repo:
                    continue  # already written by the loop above
                record = MultiproductRepoRecord(
                    self.app_id,
                    product["scm_repo_fullname"],
                    product["scm_type"],
                    0,
                    None,
                    None,
                    product["owner_name"],
                    self.wh_exec_id
                )
                record.setMultiproductInfo(
                    product["multiproduct_name"],
                    product["product_type"],
                    product["product_version"],
                    product["namespace"]
                )
                self.repo_writer.append(record)
                self.product_repo.append(record)
        finally:
            self.repo_writer.close()

        self.logger.info("Merged products and repos, total {} records".format(len(self.product_repo)))

    def get_acl_owners(self):
        """Fetch owners information from acl.

        For every record in ``self.product_repo`` the ACL listing page is
        downloaded, each linked ``*.acl`` file is fetched, and one
        MultiproductRepoOwnerRecord per owner is written through
        ``self.repo_owner_writer`` (closed on exit, even on error).
        """
        re_acl_owners = re.compile(r"owners\:\s*\[([^\[\]]+)\]")
        re_acl_path = re.compile(r"paths\:\s*\[([^\[\]]+)\]")
        # NOTE(review): the separator before "view=markup" is matched both raw
        # ("&") and HTML-escaped ("&amp;"); listing pages normally escape it -
        # confirm against a real svn listing.
        re_svn_acl_url = re.compile(
            r'href=\"[\w\/\-]+[\/\:]acl\/([\w\-\/]+)\.acl(\?revision=\d+)&(?:amp;)?view=markup\"')
        re_git_acl_url = re.compile(r'href=\"[\w\/\-]+\/source\/([\w\:]*)acl\/([\w\-]+)\.acl\"')

        owner_count = 0
        try:
            for repo in self.product_repo:
                repo_fullname = repo.getScmRepoFullname()
                scm_type = repo.getScmType()
                repo_id = repo.getRepoId()

                if scm_type == "git":
                    repo_url = '{}/{}/source/acl'.format(self.args[Constant.GIT_URL_PREFIX], repo_fullname)
                    re_acl_url = re_git_acl_url
                elif scm_type == "svn":
                    repo_url = '{}/{}/acl'.format(self.args[Constant.SVN_URL_PREFIX], repo_fullname)
                    re_acl_url = re_svn_acl_url
                else:
                    # never query the URL left over from the previous repo
                    self.logger.debug("Unsupported scm type {} for repo {}".format(scm_type, repo_fullname))
                    continue

                listing = self._fetch_acl(repo_url, 'repo {} acls'.format(repo))
                if listing is None or self._content_type(listing) != 'text/html':
                    continue

                for acl_link in re_acl_url.finditer(listing.content):
                    if scm_type == "git":
                        acl_name = acl_link.group(2)
                        commit_hash = acl_link.group(1)
                        full_acl_url = '{}/{}/raw/{}acl/{}.acl'.format(
                            self.args[Constant.GIT_URL_PREFIX], repo_fullname, commit_hash, acl_name)
                    else:
                        acl_name = acl_link.group(1)
                        revision = acl_link.group(2)  # "?revision=<n>" query string
                        full_acl_url = '{}/{}.acl{}'.format(repo_url, acl_name, revision)

                    acl_resp = self._fetch_acl(full_acl_url, 'acl {}'.format(full_acl_url))
                    if acl_resp is None:
                        continue

                    owners_string = re_acl_owners.search(acl_resp.content)
                    if not owners_string:
                        continue
                    path_string = re_acl_path.search(acl_resp.content)
                    owners = self.parse_owners(owners_string.group(1))
                    paths = self.trim_path(path_string.group(1)) if path_string else None

                    # sort_id keeps the order in which owners appear in the acl file
                    for sort_id, owner in enumerate(owners):
                        owner_record = MultiproductRepoOwnerRecord(
                            self.app_id,
                            repo_fullname,
                            scm_type,
                            repo_id,
                            acl_name.title(),
                            owner,
                            sort_id,
                            paths,
                            self.wh_exec_id
                        )
                        self.repo_owner_writer.append(owner_record)
                    owner_count += len(owners)
        finally:
            self.repo_owner_writer.close()

        self.logger.info('Finish Fetching acl owners, total {} records'.format(owner_count))

    def trim_newline(self, line):
        """Return ``line`` with CR/LF turned into spaces, ASCII-encoded (non-ASCII dropped).

        Returns None for None or an empty string.
        """
        if not line:
            return None
        return line.replace('\n', ' ').replace('\r', ' ').encode('ascii', 'ignore')

    def trim_path(self, line):
        """Strip ``line``, flatten CR/LF into spaces and unescape HTML apostrophes.

        NOTE(review): the entity in the original literal was not recoverable,
        so both the numeric (``&#39;``) and the named (``&apos;``) form are
        converted to a plain apostrophe - confirm which one the server emits.
        """
        flattened = line.strip().replace('\n', ' ').replace('\r', ' ')
        return flattened.replace('&#39;', "'").replace('&apos;', "'")

    def parse_owners(self, line):
        """Split the body of an ``owners: [...]`` list into owner names.

        Entries are separated by commas and/or line breaks; empty entries and
        entries starting with ``#`` are dropped.
        """
        owners = []
        for row in line.splitlines():
            for entry in row.split(','):
                entry = entry.strip()
                if entry and not entry.startswith('#'):
                    owners.append(entry)
        return owners

    def run(self):
        """Run the four extraction steps in order and log the elapsed wall-clock window."""
        time_format = "%H:%M:%S"
        begin = datetime.datetime.now().strftime(time_format)
        self.get_multiproducts()
        self.get_project_repo()
        self.merge_product_repo()
        mid = datetime.datetime.now().strftime(time_format)
        self.logger.info("Finish getting multiproducts and repos [{} -> {}]".format(str(begin), str(mid)))
        self.get_acl_owners()
        end = datetime.datetime.now().strftime(time_format)
        self.logger.info("Extract Multiproduct and gitli metadata [{} -> {}]".format(str(begin), str(end)))
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: the job properties arrive as the first command-line
    # argument. The value stays bound to the module-level name `args`, because
    # other code in this file may look that name up as a global.
    args = sys.argv[1]
    MultiproductLoad(args).run()
|