2022-12-29 09:58:06 -08:00
|
|
|
/*
|
|
|
|
* Copyright 2021 Collate
|
|
|
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
* you may not use this file except in compliance with the License.
|
|
|
|
* You may obtain a copy of the License at
|
|
|
|
* http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
* Unless required by applicable law or agreed to in writing, software
|
|
|
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
* See the License for the specific language governing permissions and
|
|
|
|
* limitations under the License.
|
|
|
|
*/
|
|
|
|
|
|
|
|
package org.openmetadata.csv;
|
|
|
|
|
|
|
|
import static org.openmetadata.common.utils.CommonUtil.listOf;
|
|
|
|
import static org.openmetadata.common.utils.CommonUtil.listOrEmpty;
|
|
|
|
import static org.openmetadata.common.utils.CommonUtil.nullOrEmpty;
|
|
|
|
|
2024-10-01 00:12:43 +05:30
|
|
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
2022-12-29 09:58:06 -08:00
|
|
|
import java.io.IOException;
|
2024-10-01 00:12:43 +05:30
|
|
|
import java.io.StringReader;
|
2022-12-29 09:58:06 -08:00
|
|
|
import java.io.StringWriter;
|
|
|
|
import java.util.ArrayList;
|
2023-01-29 16:56:53 -08:00
|
|
|
import java.util.Arrays;
|
2024-10-01 00:12:43 +05:30
|
|
|
import java.util.Collections;
|
2022-12-29 09:58:06 -08:00
|
|
|
import java.util.List;
|
2024-10-01 00:12:43 +05:30
|
|
|
import java.util.Map;
|
|
|
|
import java.util.regex.Pattern;
|
2022-12-29 09:58:06 -08:00
|
|
|
import java.util.stream.Collectors;
|
|
|
|
import org.apache.commons.csv.CSVFormat;
|
|
|
|
import org.apache.commons.csv.CSVFormat.Builder;
|
2024-10-01 00:12:43 +05:30
|
|
|
import org.apache.commons.csv.CSVParser;
|
2022-12-29 09:58:06 -08:00
|
|
|
import org.apache.commons.csv.CSVPrinter;
|
|
|
|
import org.apache.commons.csv.CSVRecord;
|
|
|
|
import org.openmetadata.schema.type.EntityReference;
|
|
|
|
import org.openmetadata.schema.type.TagLabel;
|
|
|
|
import org.openmetadata.schema.type.csv.CsvFile;
|
|
|
|
import org.openmetadata.schema.type.csv.CsvHeader;
|
|
|
|
|
|
|
|
public final class CsvUtil {
|
2023-01-05 11:22:33 -08:00
|
|
|
// Delimiter placed between the fields of one record; a field containing it gets quoted.
public static final String SEPARATOR = ",";
|
|
|
|
// Delimiter placed between multiple values packed into a single CSV field.
public static final String FIELD_SEPARATOR = ";";
|
2024-07-29 23:06:39 -07:00
|
|
|
|
|
|
|
// Delimiter between the two halves of a pair, e.g. "type:name" for owners or "key:value" for
// extension entries.
public static final String ENTITY_TYPE_SEPARATOR = ":";
|
2023-01-05 11:22:33 -08:00
|
|
|
// CRLF line terminator.
public static final String LINE_SEPARATOR = "\r\n";
|
2022-12-29 09:58:06 -08:00
|
|
|
|
2024-10-01 00:12:43 +05:30
|
|
|
// Delimiter between the elements of a list value nested inside a single field value.
public static final String INTERNAL_ARRAY_SEPARATOR = "|";
|
|
|
|
|
2022-12-29 09:58:06 -08:00
|
|
|
private CsvUtil() {
  // Static helpers only; the private constructor prevents instantiation.
}
|
|
|
|
|
|
|
|
public static String formatCsv(CsvFile csvFile) throws IOException {
|
|
|
|
// CSV file is generated by the backend and the data exported is expected to be correct. Hence,
|
|
|
|
// no validation
|
|
|
|
StringWriter writer = new StringWriter();
|
|
|
|
List<String> headers = getHeaders(csvFile.getHeaders());
|
|
|
|
CSVFormat csvFormat =
|
|
|
|
Builder.create(CSVFormat.DEFAULT).setHeader(headers.toArray(new String[0])).build();
|
|
|
|
try (CSVPrinter printer = new CSVPrinter(writer, csvFormat)) {
|
2023-05-10 20:30:17 +05:30
|
|
|
for (List<String> csvRecord : listOrEmpty(csvFile.getRecords())) {
|
|
|
|
printer.printRecord(csvRecord);
|
2022-12-29 09:58:06 -08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
return writer.toString();
|
|
|
|
}
|
|
|
|
|
2023-04-11 20:00:45 -07:00
|
|
|
/** Get headers from CsvHeaders */
|
2022-12-29 09:58:06 -08:00
|
|
|
public static List<String> getHeaders(List<CsvHeader> csvHeaders) {
|
|
|
|
List<String> headers = new ArrayList<>();
|
|
|
|
for (CsvHeader header : csvHeaders) {
|
2023-02-05 01:30:51 +02:00
|
|
|
String headerString = header.getName();
|
|
|
|
if (Boolean.TRUE.equals(header.getRequired()))
|
|
|
|
headerString = String.format("%s*", header.getName());
|
2023-01-29 16:56:53 -08:00
|
|
|
headers.add(headerString);
|
2022-12-29 09:58:06 -08:00
|
|
|
}
|
|
|
|
return headers;
|
|
|
|
}
|
|
|
|
|
2023-05-10 20:30:17 +05:30
|
|
|
public static String recordToString(CSVRecord csvRecord) {
|
|
|
|
return recordToString(csvRecord.toList());
|
2022-12-29 09:58:06 -08:00
|
|
|
}
|
|
|
|
|
|
|
|
public static String recordToString(List<String> fields) {
|
2023-01-29 16:56:53 -08:00
|
|
|
return nullOrEmpty(fields)
|
|
|
|
? ""
|
2023-05-10 20:30:17 +05:30
|
|
|
: fields.stream().map(CsvUtil::quoteCsvField).collect(Collectors.joining(SEPARATOR));
|
2022-12-29 09:58:06 -08:00
|
|
|
}
|
|
|
|
|
|
|
|
public static String recordToString(String[] fields) {
|
2023-01-29 16:56:53 -08:00
|
|
|
return recordToString(Arrays.asList(fields));
|
2022-12-29 09:58:06 -08:00
|
|
|
}
|
|
|
|
|
|
|
|
public static List<String> fieldToStrings(String field) {
|
|
|
|
// Split a field that contains multiple strings separated by FIELD_SEPARATOR
|
2024-08-20 15:38:22 -07:00
|
|
|
return field == null || field.isBlank() ? null : listOf(field.split(FIELD_SEPARATOR));
|
2022-12-29 09:58:06 -08:00
|
|
|
}
|
|
|
|
|
2024-07-29 23:06:39 -07:00
|
|
|
public static List<String> fieldToEntities(String field) {
|
|
|
|
// Split a field that contains multiple strings separated by FIELD_SEPARATOR
|
|
|
|
return field == null ? null : listOf(field.split(ENTITY_TYPE_SEPARATOR));
|
|
|
|
}
|
|
|
|
|
2024-10-01 00:12:43 +05:30
|
|
|
public static List<String> fieldToInternalArray(String field) {
|
|
|
|
// Split a fieldValue that contains multiple elements of an array separated by
|
|
|
|
// INTERNAL_ARRAY_SEPARATOR
|
|
|
|
if (field == null || field.isBlank()) {
|
|
|
|
return Collections.emptyList();
|
|
|
|
}
|
|
|
|
return listOf(field.split(Pattern.quote(INTERNAL_ARRAY_SEPARATOR)));
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Parses a field containing key-value pairs separated by semicolons, correctly handling quotes.
|
|
|
|
* Each key-value pair may also be enclosed in quotes, especially if it contains delimiter like (SEPARATOR , FIELD_SEPARATOR).
|
|
|
|
* Input Example:
|
|
|
|
* "key1:value1;key2:value2;\"key3:value;with;semicolon\""
|
|
|
|
* Output: [key1:value1, key2:value2, key3:value;with;semicolon]
|
|
|
|
*
|
|
|
|
*/
|
|
|
|
public static List<String> fieldToExtensionStrings(String field) throws IOException {
|
|
|
|
if (field == null || field.isBlank()) {
|
|
|
|
return List.of();
|
|
|
|
}
|
|
|
|
|
|
|
|
// Replace semicolons within quoted strings with a placeholder
|
|
|
|
String preprocessedField =
|
|
|
|
Pattern.compile("\"([^\"]*)\"") // Matches content inside double quotes
|
|
|
|
.matcher(field)
|
|
|
|
.replaceAll(mr -> "\"" + mr.group(1).replace(";", "__SEMICOLON__") + "\"");
|
|
|
|
|
|
|
|
preprocessedField = preprocessedField.replace("\n", "\\n").replace("\"", "\\\"");
|
|
|
|
|
|
|
|
CSVFormat format =
|
|
|
|
CSVFormat.DEFAULT
|
|
|
|
.withDelimiter(';')
|
|
|
|
.withQuote('"')
|
|
|
|
.withRecordSeparator(null)
|
|
|
|
.withIgnoreSurroundingSpaces(true)
|
|
|
|
.withIgnoreEmptyLines(true)
|
|
|
|
.withEscape('\\'); // Use backslash for escaping special characters
|
|
|
|
|
|
|
|
try (CSVParser parser = CSVParser.parse(new StringReader(preprocessedField), format)) {
|
|
|
|
return parser.getRecords().stream()
|
|
|
|
.flatMap(CSVRecord::stream)
|
|
|
|
.map(
|
|
|
|
value ->
|
|
|
|
value
|
|
|
|
.replace("__SEMICOLON__", ";")
|
|
|
|
.replace("\\n", "\n")) // Restore original semicolons and newlines
|
|
|
|
.map(
|
|
|
|
value ->
|
|
|
|
value.startsWith("\"") && value.endsWith("\"") // Remove outer quotes if present
|
|
|
|
? value.substring(1, value.length() - 1)
|
|
|
|
: value)
|
|
|
|
.toList();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2022-12-29 09:58:06 -08:00
|
|
|
public static String quote(String field) {
|
|
|
|
return String.format("\"%s\"", field);
|
|
|
|
}
|
|
|
|
|
|
|
|
/** Quote a CSV field made of multiple strings that has SEPARATOR or FIELD_SEPARATOR with " " */
|
|
|
|
public static String quoteField(List<String> field) {
|
|
|
|
return nullOrEmpty(field)
|
|
|
|
? ""
|
2023-05-10 20:30:17 +05:30
|
|
|
: field.stream().map(CsvUtil::quoteCsvField).collect(Collectors.joining(FIELD_SEPARATOR));
|
2022-12-29 09:58:06 -08:00
|
|
|
}
|
|
|
|
|
2023-11-21 07:45:06 -08:00
|
|
|
public static void addField(List<String> csvRecord, Boolean field) {
|
2023-05-10 20:30:17 +05:30
|
|
|
csvRecord.add(field == null ? "" : field.toString());
|
2023-01-29 16:56:53 -08:00
|
|
|
}
|
|
|
|
|
2023-05-10 20:30:17 +05:30
|
|
|
public static List<String> addField(List<String> csvRecord, String field) {
|
|
|
|
csvRecord.add(field);
|
|
|
|
return csvRecord;
|
2022-12-29 09:58:06 -08:00
|
|
|
}
|
|
|
|
|
2023-05-10 20:30:17 +05:30
|
|
|
public static List<String> addFieldList(List<String> csvRecord, List<String> field) {
|
|
|
|
csvRecord.add(quoteField(field));
|
|
|
|
return csvRecord;
|
2022-12-29 09:58:06 -08:00
|
|
|
}
|
|
|
|
|
2023-05-10 20:30:17 +05:30
|
|
|
public static List<String> addEntityReferences(
|
|
|
|
List<String> csvRecord, List<EntityReference> refs) {
|
|
|
|
csvRecord.add(
|
2022-12-29 09:58:06 -08:00
|
|
|
nullOrEmpty(refs)
|
|
|
|
? null
|
|
|
|
: refs.stream()
|
|
|
|
.map(EntityReference::getFullyQualifiedName)
|
|
|
|
.collect(Collectors.joining(FIELD_SEPARATOR)));
|
2023-05-10 20:30:17 +05:30
|
|
|
return csvRecord;
|
2022-12-29 09:58:06 -08:00
|
|
|
}
|
|
|
|
|
2023-05-10 20:30:17 +05:30
|
|
|
public static List<String> addEntityReference(List<String> csvRecord, EntityReference ref) {
|
|
|
|
csvRecord.add(nullOrEmpty(ref) ? null : ref.getFullyQualifiedName());
|
|
|
|
return csvRecord;
|
2022-12-29 09:58:06 -08:00
|
|
|
}
|
|
|
|
|
2023-05-10 20:30:17 +05:30
|
|
|
public static List<String> addTagLabels(List<String> csvRecord, List<TagLabel> tags) {
|
|
|
|
csvRecord.add(
|
2022-12-29 09:58:06 -08:00
|
|
|
nullOrEmpty(tags)
|
|
|
|
? null
|
2024-04-07 02:21:56 +05:30
|
|
|
: tags.stream()
|
|
|
|
.filter(
|
|
|
|
tagLabel ->
|
|
|
|
tagLabel.getSource().equals(TagLabel.TagSource.CLASSIFICATION)
|
|
|
|
&& !tagLabel.getTagFQN().split("\\.")[0].equals("Tier")
|
|
|
|
&& !tagLabel.getLabelType().equals(TagLabel.LabelType.DERIVED))
|
|
|
|
.map(TagLabel::getTagFQN)
|
|
|
|
.collect(Collectors.joining(FIELD_SEPARATOR)));
|
|
|
|
|
|
|
|
return csvRecord;
|
|
|
|
}
|
|
|
|
|
|
|
|
public static List<String> addGlossaryTerms(List<String> csvRecord, List<TagLabel> tags) {
|
|
|
|
csvRecord.add(
|
|
|
|
nullOrEmpty(tags)
|
|
|
|
? null
|
|
|
|
: tags.stream()
|
|
|
|
.filter(
|
|
|
|
tagLabel ->
|
|
|
|
tagLabel.getSource().equals(TagLabel.TagSource.GLOSSARY)
|
|
|
|
&& !tagLabel.getTagFQN().split("\\.")[0].equals("Tier"))
|
|
|
|
.map(TagLabel::getTagFQN)
|
|
|
|
.collect(Collectors.joining(FIELD_SEPARATOR)));
|
|
|
|
|
|
|
|
return csvRecord;
|
|
|
|
}
|
|
|
|
|
|
|
|
public static List<String> addTagTiers(List<String> csvRecord, List<TagLabel> tags) {
|
|
|
|
csvRecord.add(
|
|
|
|
nullOrEmpty(tags)
|
|
|
|
? null
|
|
|
|
: tags.stream()
|
|
|
|
.filter(
|
|
|
|
tagLabel ->
|
|
|
|
tagLabel.getSource().equals(TagLabel.TagSource.CLASSIFICATION)
|
|
|
|
&& tagLabel.getTagFQN().split("\\.")[0].equals("Tier"))
|
|
|
|
.map(TagLabel::getTagFQN)
|
|
|
|
.collect(Collectors.joining(FIELD_SEPARATOR)));
|
|
|
|
|
2023-05-10 20:30:17 +05:30
|
|
|
return csvRecord;
|
2022-12-29 09:58:06 -08:00
|
|
|
}
|
2023-04-02 15:04:00 -07:00
|
|
|
|
2024-07-29 23:06:39 -07:00
|
|
|
public static void addOwners(List<String> csvRecord, List<EntityReference> owners) {
|
|
|
|
csvRecord.add(
|
|
|
|
nullOrEmpty(owners)
|
|
|
|
? null
|
|
|
|
: owners.stream()
|
|
|
|
.map(owner -> (owner.getType() + ENTITY_TYPE_SEPARATOR + owner.getName()))
|
|
|
|
.collect(Collectors.joining(FIELD_SEPARATOR)));
|
2023-06-19 03:13:05 -07:00
|
|
|
}
|
|
|
|
|
2024-08-20 15:38:22 -07:00
|
|
|
public static void addReviewers(List<String> csvRecord, List<EntityReference> reviewers) {
|
|
|
|
csvRecord.add(
|
|
|
|
nullOrEmpty(reviewers)
|
|
|
|
? null
|
|
|
|
: reviewers.stream()
|
|
|
|
.map(reviewer -> (reviewer.getType() + ENTITY_TYPE_SEPARATOR + reviewer.getName()))
|
|
|
|
.collect(Collectors.joining(FIELD_SEPARATOR)));
|
|
|
|
}
|
|
|
|
|
2023-05-10 20:30:17 +05:30
|
|
|
private static String quoteCsvField(String str) {
|
|
|
|
if (str.contains(SEPARATOR) || str.contains(FIELD_SEPARATOR)) {
|
|
|
|
return quote(str);
|
|
|
|
}
|
|
|
|
return str;
|
2023-04-02 15:04:00 -07:00
|
|
|
}
|
2024-10-01 00:12:43 +05:30
|
|
|
|
|
|
|
public static List<String> addExtension(List<String> csvRecord, Object extension) {
|
|
|
|
if (extension == null) {
|
|
|
|
csvRecord.add(null);
|
|
|
|
return csvRecord;
|
|
|
|
}
|
|
|
|
|
|
|
|
ObjectMapper objectMapper = new ObjectMapper();
|
|
|
|
Map<String, Object> extensionMap = objectMapper.convertValue(extension, Map.class);
|
|
|
|
|
|
|
|
String extensionString =
|
|
|
|
extensionMap.entrySet().stream()
|
|
|
|
.map(
|
|
|
|
entry -> {
|
|
|
|
String key = entry.getKey();
|
|
|
|
Object value = entry.getValue();
|
|
|
|
return CsvUtil.quoteCsvField(key + ENTITY_TYPE_SEPARATOR + formatValue(value));
|
|
|
|
})
|
|
|
|
.collect(Collectors.joining(FIELD_SEPARATOR));
|
|
|
|
|
|
|
|
csvRecord.add(extensionString);
|
|
|
|
return csvRecord;
|
|
|
|
}
|
|
|
|
|
|
|
|
private static String formatValue(Object value) {
|
|
|
|
if (value instanceof Map) {
|
|
|
|
return formatMapValue((Map<String, Object>) value);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (value instanceof List) {
|
|
|
|
return formatListValue((List<?>) value);
|
|
|
|
}
|
|
|
|
|
|
|
|
return value != null ? value.toString() : "";
|
|
|
|
}
|
|
|
|
|
|
|
|
private static String formatMapValue(Map<String, Object> valueMap) {
|
|
|
|
if (isEntityReference(valueMap)) {
|
|
|
|
return formatEntityReference(valueMap);
|
|
|
|
} else if (isTimeInterval(valueMap)) {
|
|
|
|
return formatTimeInterval(valueMap);
|
|
|
|
}
|
|
|
|
|
|
|
|
return valueMap.toString();
|
|
|
|
}
|
|
|
|
|
|
|
|
private static String formatListValue(List<?> list) {
|
|
|
|
if (list.isEmpty()) {
|
|
|
|
return "";
|
|
|
|
}
|
|
|
|
|
|
|
|
if (list.get(0) instanceof Map && isEnumWithDescriptions((Map<String, Object>) list.get(0))) {
|
|
|
|
return list.stream()
|
|
|
|
.map(item -> ((Map<String, Object>) item).get("key").toString())
|
|
|
|
.collect(Collectors.joining(INTERNAL_ARRAY_SEPARATOR));
|
|
|
|
} else if (list.get(0) instanceof Map) {
|
|
|
|
return list.stream()
|
|
|
|
.map(item -> formatMapValue((Map<String, Object>) item))
|
|
|
|
.collect(Collectors.joining(INTERNAL_ARRAY_SEPARATOR));
|
|
|
|
} else {
|
|
|
|
return list.stream()
|
|
|
|
.map(Object::toString)
|
|
|
|
.collect(Collectors.joining(INTERNAL_ARRAY_SEPARATOR));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
private static boolean isEntityReference(Map<String, Object> valueMap) {
|
|
|
|
return valueMap.containsKey("type") && valueMap.containsKey("fullyQualifiedName");
|
|
|
|
}
|
|
|
|
|
|
|
|
private static boolean isTimeInterval(Map<String, Object> valueMap) {
|
|
|
|
return valueMap.containsKey("start") && valueMap.containsKey("end");
|
|
|
|
}
|
|
|
|
|
|
|
|
private static boolean isEnumWithDescriptions(Map<String, Object> valueMap) {
|
|
|
|
return valueMap.containsKey("key") && valueMap.containsKey("description");
|
|
|
|
}
|
|
|
|
|
|
|
|
private static String formatEntityReference(Map<String, Object> valueMap) {
|
|
|
|
return valueMap.get("type") + ENTITY_TYPE_SEPARATOR + valueMap.get("fullyQualifiedName");
|
|
|
|
}
|
|
|
|
|
|
|
|
private static String formatTimeInterval(Map<String, Object> valueMap) {
|
|
|
|
return valueMap.get("start") + ENTITY_TYPE_SEPARATOR + valueMap.get("end");
|
|
|
|
}
|
2022-12-29 09:58:06 -08:00
|
|
|
}
|