/**
 * Copyright 2015 LinkedIn Corp. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 */

package models.kafka;

import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.node.ObjectNode;
import com.google.common.collect.ImmutableList;
import controllers.Application;
import java.util.Date;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import models.daos.DatasetDao;
import models.daos.DatasetInfoDao;
import org.apache.avro.generic.GenericData;
import org.apache.commons.lang3.StringUtils;
import play.Logger;
import play.libs.Json;
import wherehows.common.schemas.DatasetRecord;
import wherehows.common.schemas.Record;
import wherehows.common.utils.StringUtil;
import wherehows.dao.DatasetClassificationDao;
import wherehows.models.DatasetClassification;
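
/**
 * Kafka consumer processor for Gobblin tracking audit events: Dali retention/purge audit events
 * are currently ignored (owner updates are disabled), while MetadataFileClassifier events drive
 * updates to the HDFS dataset schema, properties and classification result.
 */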
public class GobblinTrackingAuditProcessor extends KafkaConsumerProcessor {

  private static final String DALI_LIMITED_RETENTION_AUDITOR = "DaliLimitedRetentionAuditor";
  private static final String DALI_AUTOPURGED_AUDITOR = "DaliAutoPurgeAuditor";
  private static final String DS_IGNORE_IDPC_AUDITOR = "DsIgnoreIDPCAuditor";
  private static final String METADATA_FILE_CLASSIFIER = "MetadataFileClassifier";
  private static final String DATASET_URN_PREFIX = "hdfs://";
  private static final String DATASET_OWNER_SOURCE = "IDPC";

  private DatasetClassificationDao datasetClassificationDao;

  public GobblinTrackingAuditProcessor() {
    this.datasetClassificationDao = Application.daoFactory.getDatasetClassificationDao();
  }

  // TODO: Make these regex patterns part of job file
  private static final Pattern LOCATION_PREFIX_PATTERN = Pattern.compile("/[^/]+(/[^/]+)?");

  private static final Pattern SHORT_NAME_PATTERN = Pattern.compile("(/[^/]+/[^/]+)$");
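
  // Patterns for deriving a dataset's parent name from its HDFS path; the first pattern that
  // matches the full path wins (see getParentName below).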
  private static final List<Pattern> PARENT_PATTERNS =
      ImmutableList.<Pattern>builder().add(Pattern.compile("/data/external/gobblin/(.+)"))
          .add(Pattern.compile("/data/(databases|dbchange|external)/.+"))
          .add(Pattern.compile("/([^/]*data)/tracking/.+"))
          .add(Pattern.compile("/([^/]*data)/derived/.+"))
          .add(Pattern.compile("/(data)/service/.+"))
          .add(Pattern.compile("/([^/]+)/.+"))
          .build();
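
  // Datasets whose paths match any of these patterns (temporary, staging, distcp or intermediate
  // output locations) are skipped entirely.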
  private static final List<Pattern> BLACKLISTED_DATASET_PATTERNS =
      ImmutableList.<Pattern>builder().add(Pattern.compile("(\\b|_)temporary(\\b|_)"))
          .add(Pattern.compile("(\\b|_)temp(\\b|_)"))
          .add(Pattern.compile("(\\b|_)tmp(\\b|_)"))
          .add(Pattern.compile("(\\b|_)staging(\\b|_)"))
          .add(Pattern.compile("(\\b|_)stg(\\b|_)"))
          .add(Pattern.compile("_distcp_"))
          .add(Pattern.compile("/output/"))
          .build();

  /**
   * Process a Gobblin tracking event audit record.
   * @param record the Gobblin tracking audit event
   * @param topic the Kafka topic the event was consumed from
   * @return always null; events are handled in place and nothing is forwarded
   * @throws Exception if updating the dataset fails
   */
  public Record process(GenericData.Record record, String topic) throws Exception {
    if (record == null || record.get("name") == null) {
      return null;
    }

    final String name = record.get("name").toString();
    // only handle "DaliLimitedRetentionAuditor", "DaliAutoPurgeAuditor" and "DsIgnoreIDPCAuditor"
    if (name.equals(DALI_LIMITED_RETENTION_AUDITOR) || name.equals(DALI_AUTOPURGED_AUDITOR) || name.equals(
        DS_IGNORE_IDPC_AUDITOR)) {
      // TODO: Re-enable this once it's fixed.
      //updateKafkaDatasetOwner(record);
    } else if (name.equals(METADATA_FILE_CLASSIFIER)) {
      updateHdfsDatasetSchema(record);
    }

    return null;
  }
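
  /**
   * Updates dataset ownership from a Dali auditor event, provided the event does not carry
   * an error flag ("HasError").
   */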
  private void updateKafkaDatasetOwner(GenericData.Record record) throws Exception {
    Long timestamp = (Long) record.get("timestamp");
    Map<String, String> metadata = StringUtil.convertObjectMapToStringMap(record.get("metadata"));

    String hasError = metadata.get("HasError");
    if (!"true".equalsIgnoreCase(hasError)) {
      String datasetPath = metadata.get("DatasetPath");
      String datasetUrn = DATASET_URN_PREFIX + (datasetPath.startsWith("/") ? "" : "/") + datasetPath;
      String ownerUrns = metadata.get("OwnerURNs");
      DatasetInfoDao.updateKafkaDatasetOwner(datasetUrn, ownerUrns, DATASET_OWNER_SOURCE, timestamp);
    }
  }
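
  /**
   * Creates or updates the HDFS dataset record (schema, properties, location prefix, parent name)
   * from a MetadataFileClassifier event, then stores the dataset classification result.
   */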
  private void updateHdfsDatasetSchema(GenericData.Record record) throws Exception {
    Long timestamp = (Long) record.get("timestamp");
    Map<String, String> metadata = StringUtil.convertObjectMapToStringMap(record.get("metadata"));

    String datasetName = metadata.get("dataset");
    if (StringUtils.isEmpty(datasetName) || isDatasetNameBlacklisted(datasetName)) {
      Logger.info("Skipped processing metadata event for dataset {}", datasetName);
      return;
    }

    DatasetRecord dataset = new DatasetRecord();
    dataset.setName(getShortName(datasetName));
    dataset.setUrn(DATASET_URN_PREFIX + datasetName);
    dataset.setSchema(metadata.get("schema"));
    dataset.setSchemaType("JSON");
    dataset.setSource("Hdfs");
    dataset.setParentName(getParentName(datasetName));
    dataset.setDatasetType("hdfs");
    dataset.setIsActive(true);
    dataset.setSourceModifiedTime(getSourceModifiedTime(metadata.get("modificationTime")));

    Matcher matcher = LOCATION_PREFIX_PATTERN.matcher(datasetName);
    if (matcher.lookingAt()) {
      dataset.setLocationPrefix(matcher.group());
    }

    ObjectNode properties = Json.newObject();
    properties.put("owner", metadata.get("owner"));
    properties.put("group", metadata.get("group"));
    properties.put("file_permission", metadata.get("permission"));
    properties.put("codec", metadata.get("codec"));
    properties.put("storage", metadata.get("storage"));
    properties.put("cluster", metadata.get("cluster"));
    properties.put("abstract_path", metadata.get("abstractPath"));
    dataset.setProperties(new ObjectMapper().writeValueAsString(properties));

    Logger.info("Updating dataset {}", datasetName);
    DatasetDao.setDatasetRecord(dataset);
    updateDatasetClassificationResult(metadata);
  }
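
  /**
   * Persists the classification result carried in the event metadata; failures are logged
   * and swallowed so they do not abort dataset processing.
   */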
  private void updateDatasetClassificationResult(Map<String, String> metadata) {
    try {
      String urn = DATASET_URN_PREFIX + metadata.get("dataset");
      String classificationResult = metadata.get("classificationResult");

      DatasetClassification record = new DatasetClassification(urn, classificationResult, new Date());
      datasetClassificationDao.updateDatasetClassification(record);
    } catch (Exception e) {
      Logger.error("Unable to update classification result due to {}", e.getMessage());
    }
  }
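
  /**
   * Returns true if the dataset path matches any of the blacklisted (temporary/staging) patterns.
   */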
  private boolean isDatasetNameBlacklisted(String datasetName) {
    for (Pattern pattern : BLACKLISTED_DATASET_PATTERNS) {
      if (pattern.matcher(datasetName).find()) {
        return true;
      }
    }
    return false;
  }
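
  /**
   * Extracts the short name, i.e. the last two path segments of the dataset path, or "" if
   * the path has fewer than two segments.
   */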
  private String getShortName(String datasetName) {
    Matcher matcher = SHORT_NAME_PATTERN.matcher(datasetName);
    if (matcher.find()) {
      return matcher.group();
    }
    return "";
  }
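
  /**
   * Derives the parent name from the first PARENT_PATTERNS entry that matches the full dataset
   * path (its captured group), or "" if no pattern matches.
   */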
  private String getParentName(String datasetName) {
    for (Pattern pattern : PARENT_PATTERNS) {
      Matcher matcher = pattern.matcher(datasetName);
      if (matcher.matches()) {
        return matcher.group(1);
      }
    }
    return "";
  }
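
  /**
   * Converts the HDFS modification time from milliseconds to seconds, or returns null if absent.
   */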
  private String getSourceModifiedTime(String hdfsModifiedTime) {
    if (hdfsModifiedTime == null) {
      return null;
    }
    return Long.toString(Long.parseLong(hdfsModifiedTime) / 1000);
  }
}