// mirror of https://github.com/datahub-project/datahub.git
// synced 2025-07-11 19:23:02 +00:00
/**
 * Copyright 2015 LinkedIn Corp. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 */
package metadata.etl.kafka;

import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.avro.generic.GenericData;

import wherehows.common.schemas.GobblinTrackingCompactionRecord;
import wherehows.common.schemas.Record;

/**
|
||
|
* Kafka message processor for Gobblin tracking event compaction topic
|
||
|
*/
|
||
|
public class GobblinTrackingCompactionProcessor extends KafkaConsumerProcessor {
|
||
|
|
||
|
// for data tracking urn, such as '/data/tracking/ab-c/daily-dedup/2016/...'
|
||
|
// cut into 3 pieces: dataset + partitionType + partition
|
||
|
private final String UrnRegex = "^(\\/\\w+\\/\\w+\\/[\\w-]+)\\/([\\w-]+)\\/(\\d+.*)$";
|
||
|
private final Pattern UrnPattern = Pattern.compile(UrnRegex);
|
||
|
|
||
|
|
||
|
/**
|
||
|
* Process a Gobblin tracking event compaction record
|
||
|
* @param record
|
||
|
* @param topic
|
||
|
* @throws Exception
|
||
|
*/
|
||
|
@Override
|
||
|
public Record process(GenericData.Record record, String topic) throws Exception {
|
||
|
GobblinTrackingCompactionRecord eventRecord = null;
|
||
|
|
||
|
// only handle namespace "compaction.tracking.events"
|
||
|
if (record != null && record.get("namespace").equals("compaction.tracking.events")) {
|
||
|
final String name = (String) record.get("name");
|
||
|
|
||
|
// for event name "CompactionCompleted" or "CompactionRecordCounts"
|
||
|
if (name.equals("CompactionCompleted") || name.equals("CompactionRecordCounts")) {
|
||
|
logger.info("Processing Gobblin tracking event record: " + name);
|
||
|
final long timestamp = (long) record.get("timestamp");
|
||
|
final Map<String, String> metadata = (Map<String, String>) record.get("metadata");
|
||
|
|
||
|
final String jobContext = "Gobblin:" + name;
|
||
|
final String cluster = parseClusterIdentifier(metadata.get("clusterIdentifier")).get("cluster");
|
||
|
final String projectName = metadata.get("azkabanProjectName");
|
||
|
final String flowId = metadata.get("azkabanFlowId");
|
||
|
final String jobId = metadata.get("azkabanJobId");
|
||
|
final int execId = Integer.parseInt(metadata.get("azkabanExecId"));
|
||
|
|
||
|
// final String metricContextId = metadata.get("metricContextID");
|
||
|
// final String metricContextName = metadata.get("metricContextName");
|
||
|
final String dedupeStatus = metadata.get("dedupeStatus");
|
||
|
|
||
|
String dataset = null;
|
||
|
String partitionType = null;
|
||
|
String partitionName = null;
|
||
|
long recordCount = 0;
|
||
|
long lateRecordCount = 0;
|
||
|
|
||
|
if (name.equals("CompactionCompleted")) {
|
||
|
dataset = metadata.get("datasetUrn");
|
||
|
partitionName = metadata.get("partition");
|
||
|
recordCount = parseLong(metadata.get("recordCount"));
|
||
|
}
|
||
|
// name = "CompactionRecordCounts"
|
||
|
else {
|
||
|
final Matcher m = UrnPattern.matcher(metadata.get("DatasetOutputPath"));
|
||
|
if (m.find()) {
|
||
|
dataset = m.group(1);
|
||
|
partitionType = m.group(2);
|
||
|
partitionName = m.group(3);
|
||
|
}
|
||
|
|
||
|
recordCount = parseLong(metadata.get("RegularRecordCount"));
|
||
|
lateRecordCount = parseLong(metadata.get("LateRecordCount"));
|
||
|
}
|
||
|
|
||
|
eventRecord = new GobblinTrackingCompactionRecord(timestamp, jobContext,
|
||
|
cluster, projectName, flowId, jobId, execId);
|
||
|
eventRecord.setDatasetUrn(dataset, partitionType, partitionName);
|
||
|
eventRecord.setRecordCount(recordCount);
|
||
|
eventRecord.setLateRecordCount(lateRecordCount);
|
||
|
eventRecord.setDedupeStatus(dedupeStatus);
|
||
|
}
|
||
|
}
|
||
|
return eventRecord;
|
||
|
}
|
||
|
}
|