diff --git a/gms/api/src/main/pegasus/com/linkedin/ml/MLFeature.pdl b/gms/api/src/main/pegasus/com/linkedin/ml/MLFeature.pdl new file mode 100644 index 0000000000..c90fad1546 --- /dev/null +++ b/gms/api/src/main/pegasus/com/linkedin/ml/MLFeature.pdl @@ -0,0 +1,41 @@ +namespace com.linkedin.ml + +import com.linkedin.common.ChangeAuditStamps +import com.linkedin.common.MLFeatureUrn +import com.linkedin.common.VersionTag +import com.linkedin.common.Ownership +import com.linkedin.common.InstitutionalMemory +import com.linkedin.common.Status +import com.linkedin.common.Deprecation +import com.linkedin.ml.metadata.MLFeatureProperties + +/** + * MLFeature spec. for a feature store. A collection of MLFeature metadata schema that can evolve over time. + */ +record MLFeature includes MLFeatureKey, ChangeAuditStamps { + + /** + * Ownership Info + */ + ownership: optional Ownership + + /** + * MLFeature Properties + */ + featureProperties: optional MLFeatureProperties + + /** + * Institutional Memory + */ + institutionalMemory: optional InstitutionalMemory + + /** + * Status + */ + status: optional Status + + /** + * Deprecation + */ + deprecation: optional Deprecation +} diff --git a/gms/api/src/main/pegasus/com/linkedin/ml/MLFeatureKey.pdl b/gms/api/src/main/pegasus/com/linkedin/ml/MLFeatureKey.pdl new file mode 100644 index 0000000000..7678800db0 --- /dev/null +++ b/gms/api/src/main/pegasus/com/linkedin/ml/MLFeatureKey.pdl @@ -0,0 +1,25 @@ +namespace com.linkedin.ml + +/** + * Key for MLFeature resource + */ +record MLFeatureKey { + + /** + * ML Feature Namespace e.g. 
{db}.{table}, /dir/subdir/{name}, or {name} + */ + @validate.strlen = { + "max" : 500, + "min" : 1 + } + featureNamespace: string + + /** + * Feature Name + */ + @validate.strlen = { + "max" : 500, + "min" : 1 + } + featureName: string +} diff --git a/gms/api/src/main/pegasus/com/linkedin/ml/MLModel.pdl b/gms/api/src/main/pegasus/com/linkedin/ml/MLModel.pdl new file mode 100644 index 0000000000..b9f9c58d23 --- /dev/null +++ b/gms/api/src/main/pegasus/com/linkedin/ml/MLModel.pdl @@ -0,0 +1,102 @@ +namespace com.linkedin.ml + +import com.linkedin.common.ChangeAuditStamps +import com.linkedin.common.MLModelUrn +import com.linkedin.common.VersionTag +import com.linkedin.common.Ownership +import com.linkedin.common.InstitutionalMemory +import com.linkedin.common.Status +import com.linkedin.common.Cost +import com.linkedin.common.Deprecation +import com.linkedin.ml.metadata.MLModelProperties +import com.linkedin.ml.metadata.IntendedUse +import com.linkedin.ml.metadata.MLModelFactors +import com.linkedin.ml.metadata.Metrics +import com.linkedin.ml.metadata.EvaluationData +import com.linkedin.ml.metadata.TrainingData +import com.linkedin.ml.metadata.QuantitativeAnalyses +import com.linkedin.ml.metadata.EthicalConsiderations +import com.linkedin.ml.metadata.CaveatsAndRecommendations +import com.linkedin.ml.metadata.SourceCode + + +/** + * MLModel spec. for a model store. A collection of MLModel metadata schemas that can evolve over time. 
+ */ +record MLModel includes MLModelKey, ChangeAuditStamps { + + /** + * Ownership Info + */ + ownership: optional Ownership + + /** + * MLModel Properties + */ + mlModelProperties: optional MLModelProperties + + /** + * Intended Use + */ + intendedUse: optional IntendedUse + + /** + * MLModel Factors + */ + mlModelFactors: optional MLModelFactors + + /** + * Metrics + */ + metrics: optional Metrics + + /** + * Evaluation Data + */ + evaluationData: optional EvaluationData + + /** + * Training Data + */ + trainingData: optional TrainingData + + /** + * Quantitative Analyses + */ + quantitativeAnalyses: optional QuantitativeAnalyses + + /** + * Ethical Considerations + */ + ethicalConsiderations: optional EthicalConsiderations + + /** + * Caveats and Recommendations + */ + caveatsAndRecommendations: optional CaveatsAndRecommendations + + /** + * Institutional Memory + */ + institutionalMemory: optional InstitutionalMemory + + /** + * Source Code + */ + sourceCode: optional SourceCode + + /** + * Status + */ + status: optional Status + + /** + * Cost + */ + cost: optional Cost + + /** + * Deprecation + */ + deprecation: optional Deprecation +} diff --git a/gms/api/src/main/pegasus/com/linkedin/ml/MLModelKey.pdl b/gms/api/src/main/pegasus/com/linkedin/ml/MLModelKey.pdl new file mode 100644 index 0000000000..5f4d0e2efb --- /dev/null +++ b/gms/api/src/main/pegasus/com/linkedin/ml/MLModelKey.pdl @@ -0,0 +1,30 @@ +namespace com.linkedin.ml + +import com.linkedin.common.DataPlatformUrn +import com.linkedin.common.FabricType + +/** + * Key for MLModel resource + */ +record MLModelKey { + + /** + * Standardized platform urn where ML Model is defined. The data platform Urn (urn:li:platform:{dataScienceplatform_name}) + */ + @validate.`com.linkedin.dataset.rest.validator.DataPlatformValidator` = { } + platform: DataPlatformUrn + + /** + * ML Model name e.g. 
{db}.{table}, /dir/subdir/{name}, or {name} + */ + @validate.strlen = { + "max" : 500, + "min" : 1 + } + name: string + + /** + * Fabric type where ML Model belongs to or where it was generated. + */ + origin: FabricType +} diff --git a/li-utils/src/main/java/com/linkedin/common/urn/MLFeatureUrn.java b/li-utils/src/main/java/com/linkedin/common/urn/MLFeatureUrn.java new file mode 100644 index 0000000000..1354bd3bfa --- /dev/null +++ b/li-utils/src/main/java/com/linkedin/common/urn/MLFeatureUrn.java @@ -0,0 +1,27 @@ +package com.linkedin.common.urn; + +public final class MLFeatureUrn extends Urn { + + public static final String ENTITY_TYPE = "mlFeature"; + + private static final String CONTENT_FORMAT = "(%s,%s)"; // one placeholder per constructor argument: namespace, name + + private final String mlFeatureNamespace; + + private final String mlFeatureName; + + public MLFeatureUrn(String mlFeatureNamespace, String mlFeatureName) { + super(ENTITY_TYPE, String.format(CONTENT_FORMAT, mlFeatureNamespace, mlFeatureName)); + this.mlFeatureNamespace = mlFeatureNamespace; + this.mlFeatureName = mlFeatureName; + } + + public String getMlFeatureName() { + return mlFeatureName; + } + + public String getMlFeatureNamespace() { + return mlFeatureNamespace; + } + +} diff --git a/li-utils/src/main/java/com/linkedin/common/urn/MLModelUrn.java b/li-utils/src/main/java/com/linkedin/common/urn/MLModelUrn.java new file mode 100644 index 0000000000..eb81c9a64f --- /dev/null +++ b/li-utils/src/main/java/com/linkedin/common/urn/MLModelUrn.java @@ -0,0 +1,50 @@ +package com.linkedin.common.urn; + +import java.net.URISyntaxException; + +import com.linkedin.common.FabricType; + +import static com.linkedin.common.urn.UrnUtils.toFabricType; + + +public final class MLModelUrn extends Urn { + + public static final String ENTITY_TYPE = "mlModel"; + + private static final String CONTENT_FORMAT = "(%s,%s,%s)"; + + private final DataPlatformUrn platformEntity; + + private final String mlModelNameEntity; + + private final FabricType originEntity; + + public 
MLModelUrn(DataPlatformUrn platform, String mlModelName, FabricType origin) { + super(ENTITY_TYPE, String.format(CONTENT_FORMAT, platform.toString(), mlModelName, origin.name())); + this.platformEntity = platform; + this.mlModelNameEntity = mlModelName; + this.originEntity = origin; + } + + public DataPlatformUrn getPlatformEntity() { + return platformEntity; + } + + public String getMlModelNameEntity() { + return mlModelNameEntity; + } + + public FabricType getOriginEntity() { + return originEntity; + } + + public static MLModelUrn createFromString(String rawUrn) throws URISyntaxException { + String content = new Urn(rawUrn).getContent(); + String[] parts = content.substring(1, content.length() - 1).split(","); + return new MLModelUrn(DataPlatformUrn.createFromString(parts[0]), parts[1], toFabricType(parts[2])); + } + + public static MLModelUrn deserialize(String rawUrn) throws URISyntaxException { + return createFromString(rawUrn); + } +} diff --git a/li-utils/src/main/pegasus/com/linkedin/common/MLFeatureDataType.pdl b/li-utils/src/main/pegasus/com/linkedin/common/MLFeatureDataType.pdl new file mode 100644 index 0000000000..86b142b3fc --- /dev/null +++ b/li-utils/src/main/pegasus/com/linkedin/common/MLFeatureDataType.pdl @@ -0,0 +1,83 @@ +namespace com.linkedin.common + +/** + * MLFeature Data Type + */ +enum MLFeatureDataType { + + /** + * Useless data is unique, discrete data with no potential relationship with the outcome variable. + * A useless feature has high cardinality. An example would be bank account numbers that were generated randomly. + */ + USELESS + + /** + * Nominal data is made of discrete values with no numerical relationship between the different categories — mean and median are meaningless. + * Animal species is one example. For example, pig is not higher than bird and lower than fish. + */ + NOMINAL + + /** + * Ordinal data are discrete integers that can be ranked or sorted. 
+ * For example, the distance between first and second may not be the same as the distance between second and third. + */ + ORDINAL + + /** + * Binary data is discrete data that can be in only one of two categories — either yes or no, 1 or 0, off or on, etc + */ + BINARY + + /** + * Count data is discrete whole number data — no negative numbers here. + * Count data often has many small values, such as zero and one. + */ + COUNT + + /** + * Time data is a cyclical, repeating continuous form of data. + * The relevant time features can be any period— daily, weekly, monthly, annual, etc. + */ + TIME + + /** + * Interval data has equal spaces between the numbers and does not represent a temporal pattern. + * Examples include percentages, temperatures, and income. + */ + INTERVAL + + /** + * Image Data + */ + IMAGE + + /** + * Video Data + */ + VIDEO + + /** + * Audio Data + */ + AUDIO + + /** + * Text Data + */ + TEXT + + /** + * Mapping Data Type ex: dict, map + */ + MAP + + /** + * Sequence Data Type ex: list, tuple, range + */ + SEQUENCE + + /** + * Set Data Type ex: set, frozenset + */ + SET +} diff --git a/li-utils/src/main/pegasus/com/linkedin/common/MLFeatureUrn.pdl b/li-utils/src/main/pegasus/com/linkedin/common/MLFeatureUrn.pdl new file mode 100644 index 0000000000..e199f3e575 --- /dev/null +++ b/li-utils/src/main/pegasus/com/linkedin/common/MLFeatureUrn.pdl @@ -0,0 +1,27 @@ +namespace com.linkedin.common + +/** + * Standardized MLFeature identifier. 
+ */ +@java.class = "com.linkedin.common.urn.MLFeatureUrn" +@validate.`com.linkedin.common.validator.TypedUrnValidator` = { + "accessible" : true, + "owningTeam" : "urn:li:internalTeam:datahub", + "entityType" : "mlFeature", + "constructable" : true, + "namespace" : "li", + "name" : "MLFeature", + "doc" : "Standardized MLFeature identifier.", + "owners" : [ "urn:li:corpuser:fbar", "urn:li:corpuser:bfoo" ], + "fields" : [ { + "name" : "mlFeatureNamespace", + "type" : "string", + "doc" : "Namespace for the MLFeature" + }, { "type" : "string", + "name" : "mlFeatureName", + "doc" : "Name of the MLFeature", + "maxLength" : 210 + }], + "maxLength" : 284 +} +typeref MLFeatureUrn = string diff --git a/li-utils/src/main/pegasus/com/linkedin/common/MLModelUrn.pdl b/li-utils/src/main/pegasus/com/linkedin/common/MLModelUrn.pdl new file mode 100644 index 0000000000..ea4dfbf69b --- /dev/null +++ b/li-utils/src/main/pegasus/com/linkedin/common/MLModelUrn.pdl @@ -0,0 +1,32 @@ +namespace com.linkedin.common + +/** + * Standardized MLModel identifier. + */ +@java.class = "com.linkedin.common.urn.MLModelUrn" +@validate.`com.linkedin.common.validator.TypedUrnValidator` = { + "accessible" : true, + "owningTeam" : "urn:li:internalTeam:datahub", + "entityType" : "mlModel", + "constructable" : true, + "namespace" : "li", + "name" : "MlModel", + "doc" : "Standardized model identifier.", + "owners" : [ "urn:li:corpuser:fbar", "urn:li:corpuser:bfoo" ], + "fields" : [ { + "name" : "platform", + "type" : "com.linkedin.common.urn.DataPlatformUrn", + "doc" : "Standardized platform urn for the MLModel." + }, { + "name" : "mlModelName", + "doc" : "Name of the MLModel", + "type" : "string", + "maxLength" : 210 + }, { + "name" : "origin", + "type" : "com.linkedin.common.FabricType", + "doc" : "Fabric type where model belongs to or where it was generated." 
+ } ], + "maxLength" : 284 +} +typeref MLModelUrn = string diff --git a/metadata-models/src/main/pegasus/com/linkedin/common/Cost.pdl b/metadata-models/src/main/pegasus/com/linkedin/common/Cost.pdl new file mode 100644 index 0000000000..c092679fdc --- /dev/null +++ b/metadata-models/src/main/pegasus/com/linkedin/common/Cost.pdl @@ -0,0 +1,17 @@ +namespace com.linkedin.common + +/** + * Cost Details for an Entity + */ +record Cost { + + /** + * Type of Cost Code + */ + costType: CostType + + /** + * Code to which the Cost of this entity should be attributed to + */ + cost: CostValue +} diff --git a/metadata-models/src/main/pegasus/com/linkedin/common/CostType.pdl b/metadata-models/src/main/pegasus/com/linkedin/common/CostType.pdl new file mode 100644 index 0000000000..a7e3509ef1 --- /dev/null +++ b/metadata-models/src/main/pegasus/com/linkedin/common/CostType.pdl @@ -0,0 +1,12 @@ +namespace com.linkedin.common + +/** + * Type of Cost Code + */ +enum CostType { + + /** + * Org Cost Type to which the Cost of this entity should be attributed to + */ + ORG_COST_TYPE +} diff --git a/metadata-models/src/main/pegasus/com/linkedin/common/CostValue.pdl b/metadata-models/src/main/pegasus/com/linkedin/common/CostValue.pdl new file mode 100644 index 0000000000..8109cf9bd9 --- /dev/null +++ b/metadata-models/src/main/pegasus/com/linkedin/common/CostValue.pdl @@ -0,0 +1,9 @@ +namespace com.linkedin.common + +/** + * A union of all supported Cost Value types + */ +typeref CostValue = union[ + costId: double + costCode: string +] diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/aspect/MLFeatureAspect.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/aspect/MLFeatureAspect.pdl new file mode 100644 index 0000000000..7b57ee085f --- /dev/null +++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/aspect/MLFeatureAspect.pdl @@ -0,0 +1,18 @@ +namespace com.linkedin.metadata.aspect + +import com.linkedin.common.InstitutionalMemory +import 
com.linkedin.common.Ownership +import com.linkedin.common.Status +import com.linkedin.ml.metadata.MLFeatureProperties +import com.linkedin.common.Deprecation + +/** + * A union of all supported metadata aspects for a MLFeature + */ +typeref MLFeatureAspect = union[ + Ownership, + MLFeatureProperties, + InstitutionalMemory, + Status, + Deprecation +] diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/aspect/MLModelAspect.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/aspect/MLModelAspect.pdl new file mode 100644 index 0000000000..da28620c06 --- /dev/null +++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/aspect/MLModelAspect.pdl @@ -0,0 +1,38 @@ +namespace com.linkedin.metadata.aspect + +import com.linkedin.common.InstitutionalMemory +import com.linkedin.common.Ownership +import com.linkedin.common.Status +import com.linkedin.ml.metadata.CaveatsAndRecommendations +import com.linkedin.ml.metadata.EthicalConsiderations +import com.linkedin.ml.metadata.EvaluationData +import com.linkedin.ml.metadata.IntendedUse +import com.linkedin.ml.metadata.Metrics +import com.linkedin.ml.metadata.MLModelFactorPrompts +import com.linkedin.ml.metadata.MLModelProperties +import com.linkedin.ml.metadata.QuantitativeAnalyses +import com.linkedin.ml.metadata.TrainingData +import com.linkedin.common.Cost +import com.linkedin.common.Deprecation +import com.linkedin.ml.metadata.SourceCode + +/** + * A union of all supported metadata aspects for a ML Model + */ +typeref MLModelAspect = union[ + Ownership, + MLModelProperties, + IntendedUse, + MLModelFactorPrompts, + Metrics, + EvaluationData, + TrainingData, + QuantitativeAnalyses, + EthicalConsiderations, + CaveatsAndRecommendations, + InstitutionalMemory, + SourceCode, + Status, + Cost, + Deprecation +] diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/snapshot/MLFeatureSnapshot.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/snapshot/MLFeatureSnapshot.pdl new file 
mode 100644 index 0000000000..5a2345d238 --- /dev/null +++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/snapshot/MLFeatureSnapshot.pdl @@ -0,0 +1,17 @@ +namespace com.linkedin.metadata.snapshot + +import com.linkedin.common.MLFeatureUrn +import com.linkedin.metadata.aspect.MLFeatureAspect + +record MLFeatureSnapshot { + + /** + * URN for the entity the metadata snapshot is associated with. + */ + urn: MLFeatureUrn + + /** + * The list of metadata aspects associated with the MLFeature. Depending on the use case, this can either be all, or a selection, of supported aspects. + */ + aspects: array[MLFeatureAspect] +} diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/snapshot/MLModelSnapshot.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/snapshot/MLModelSnapshot.pdl new file mode 100644 index 0000000000..01088fce06 --- /dev/null +++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/snapshot/MLModelSnapshot.pdl @@ -0,0 +1,20 @@ +namespace com.linkedin.metadata.snapshot + +import com.linkedin.common.MLModelUrn +import com.linkedin.metadata.aspect.MLModelAspect + +/** + * MLModel Snapshot entity details. + */ +record MLModelSnapshot { + + /** + * URN for the entity the metadata snapshot is associated with. + */ + urn: MLModelUrn + + /** + * The list of metadata aspects associated with the MLModel. Depending on the use case, this can either be all, or a selection, of supported aspects. 
+ */ + aspects: array[MLModelAspect] +} diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/snapshot/Snapshot.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/snapshot/Snapshot.pdl index c883cff4d2..e83994d4db 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/snapshot/Snapshot.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/snapshot/Snapshot.pdl @@ -8,4 +8,6 @@ typeref Snapshot = union[ CorpUserSnapshot, DatasetSnapshot, DataProcessSnapshot, + MLModelSnapshot, + MLFeatureSnapshot ] \ No newline at end of file diff --git a/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/BaseData.pdl b/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/BaseData.pdl new file mode 100644 index 0000000000..a84c0372e9 --- /dev/null +++ b/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/BaseData.pdl @@ -0,0 +1,24 @@ +namespace com.linkedin.ml.metadata + +import com.linkedin.common.DatasetUrn + +/** + * BaseData record + */ +record BaseData { + + /** + * What dataset were used in the MLModel? + */ + dataset: DatasetUrn + + /** + * Why was this dataset chosen? + */ + motivation: optional string + + /** + * How was the data preprocessed (e.g., tokenization of sentences, cropping of images, any filtering such as dropping images without faces)? + */ + preProcessing: optional array[string] +} diff --git a/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/CaveatDetails.pdl b/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/CaveatDetails.pdl new file mode 100644 index 0000000000..22937b76fd --- /dev/null +++ b/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/CaveatDetails.pdl @@ -0,0 +1,23 @@ +namespace com.linkedin.ml.metadata + +/** + * This section should list additional concerns that were not covered in the previous sections. For example, did the results suggest any further testing? Were there any relevant groups that were not represented in the evaluation dataset? 
Are there additional recommendations for model use? + */ +record CaveatDetails { + + /** + * Did the results suggest any further testing? + */ + needsFurtherTesting: optional boolean + + /** + * Caveat Description + * For ex: Given gender classes are binary (male/not male), which we include as male/female. Further work needed to evaluate across a spectrum of genders. + */ + caveatDescription: optional string + + /** + * Relevant groups that were not represented in the evaluation dataset? + */ + groupsNotRepresented: optional array[string] +} diff --git a/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/CaveatsAndRecommendations.pdl b/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/CaveatsAndRecommendations.pdl new file mode 100644 index 0000000000..68b483c91c --- /dev/null +++ b/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/CaveatsAndRecommendations.pdl @@ -0,0 +1,22 @@ +namespace com.linkedin.ml.metadata + +/** + * This section should list additional concerns that were not covered in the previous sections. For example, did the results suggest any further testing? Were there any relevant groups that were not represented in the evaluation dataset? Are there additional recommendations for model use? + */ +record CaveatsAndRecommendations { + + /** + * This section should list additional concerns that were not covered in the previous sections. For example, did the results suggest any further testing? Were there any relevant groups that were not represented in the evaluation dataset? + */ + caveats: optional CaveatDetails + + /** + * Recommendations on where this MLModel should be used. 
+ */ + recommendations: optional string + + /** + * Ideal characteristics of an evaluation dataset for this MLModel + */ + idealDatasetCharacteristics: optional array[string] +} diff --git a/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/EthicalConsiderations.pdl b/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/EthicalConsiderations.pdl new file mode 100644 index 0000000000..b635ba91d2 --- /dev/null +++ b/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/EthicalConsiderations.pdl @@ -0,0 +1,32 @@ +namespace com.linkedin.ml.metadata + +/** + * This section is intended to demonstrate the ethical considerations that went into MLModel development, surfacing ethical challenges and solutions to stakeholders. + */ +record EthicalConsiderations { + + /** + * Does the MLModel use any sensitive data (e.g., protected classes)? + */ + data: optional array[string] + + /** + * Is the MLModel intended to inform decisions about matters central to human life or flourishing – e.g., health or safety? Or could it be used in such a way? + */ + humanLife: optional array[string] + + /** + * What risk mitigation strategies were used during MLModel development? + */ + mitigations: optional array[string] + + /** + * What risks may be present in MLModel usage? Try to identify the potential recipients, likelihood, and magnitude of harms. If these cannot be determined, note that they were considered but remain unknown. + */ + risksAndHarms: optional array[string] + + /** + * Are there any known MLModel use cases that are especially fraught? 
This may connect directly to the intended use section + */ + useCases: optional array[string] +} diff --git a/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/EvaluationData.pdl b/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/EvaluationData.pdl new file mode 100644 index 0000000000..0f029b61d4 --- /dev/null +++ b/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/EvaluationData.pdl @@ -0,0 +1,14 @@ +namespace com.linkedin.ml.metadata + +import com.linkedin.common.DatasetUrn + +/** + * All referenced datasets would ideally point to any set of documents that provide visibility into the source and composition of the dataset. + */ +record EvaluationData { + + /** + * Details on the dataset(s) used for the quantitative analyses in the MLModel + */ + evaluationData: array[BaseData] +} diff --git a/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/HyperParameterValueType.pdl b/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/HyperParameterValueType.pdl new file mode 100644 index 0000000000..ea11e6b369 --- /dev/null +++ b/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/HyperParameterValueType.pdl @@ -0,0 +1,6 @@ +namespace com.linkedin.ml.metadata + +/** + * A union of all supported value types for a hyperparameter value + */ +typeref HyperParameterValueType = union[string, int, float, double, boolean] diff --git a/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/IntendedUse.pdl b/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/IntendedUse.pdl new file mode 100644 index 0000000000..1ed5b528a1 --- /dev/null +++ b/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/IntendedUse.pdl @@ -0,0 +1,22 @@ +namespace com.linkedin.ml.metadata + +/** + * Intended Use for the ML Model + */ +record IntendedUse { + + /** + * Primary Use cases for the MLModel. 
+ */ + primaryUses: optional array[string] + + /** + * Primary Intended Users - For example, was the MLModel developed for entertainment purposes, for hobbyists, or enterprise solutions? + */ + primaryUsers: optional array[IntendedUserType] + + /** + * Highlight technology that the MLModel might easily be confused with, or related contexts that users could try to apply the MLModel to. + */ + outOfScopeUses: optional array[string] +} diff --git a/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/IntendedUserType.pdl b/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/IntendedUserType.pdl new file mode 100644 index 0000000000..4a85d919bb --- /dev/null +++ b/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/IntendedUserType.pdl @@ -0,0 +1,22 @@ +namespace com.linkedin.ml.metadata + +/** + * Primary Intended User Types or User Categories + */ +enum IntendedUserType { + + /** + * Developed for Enterprise Users + */ + ENTERPRISE + + /** + * Developed for Hobbyists + */ + HOBBY + + /** + * Developed for Entertainment Purposes + */ + ENTERTAINMENT +} diff --git a/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/MLFeatureProperties.pdl b/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/MLFeatureProperties.pdl new file mode 100644 index 0000000000..59f5f2990c --- /dev/null +++ b/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/MLFeatureProperties.pdl @@ -0,0 +1,25 @@ +namespace com.linkedin.ml.metadata + +import com.linkedin.common.MLFeatureDataType +import com.linkedin.common.VersionTag + +/** + * Properties associated with a MLFeature + */ +record MLFeatureProperties { + + /** + * Documentation of the MLFeature + */ + description: optional string + + /** + * Data Type of the MLFeature + */ + dataType: optional MLFeatureDataType + + /** + * Version of the MLFeature + */ + version: optional VersionTag +} diff --git a/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/MLModelFactorPrompts.pdl 
b/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/MLModelFactorPrompts.pdl new file mode 100644 index 0000000000..3b50dd8d82 --- /dev/null +++ b/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/MLModelFactorPrompts.pdl @@ -0,0 +1,17 @@ +namespace com.linkedin.ml.metadata + +/** + * Prompts which affect the performance of the MLModel + */ +record MLModelFactorPrompts { + + /** + * What are foreseeable salient factors for which MLModel performance may vary, and how were these determined? + */ + relevantFactors: optional array[MLModelFactors] + + /** + * Which factors are being reported, and why were these chosen? + */ + evaluationFactors: optional array[MLModelFactors] +} diff --git a/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/MLModelFactors.pdl b/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/MLModelFactors.pdl new file mode 100644 index 0000000000..5495ade1fd --- /dev/null +++ b/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/MLModelFactors.pdl @@ -0,0 +1,25 @@ +namespace com.linkedin.ml.metadata + +/** + * Factors affecting the performance of the MLModel. + */ +record MLModelFactors { + + /** + * Groups refers to distinct categories with similar characteristics that are present in the evaluation data instances. + * For human-centric machine learning MLModels, groups are people who share one or multiple characteristics. + */ + groups: optional array[string] + + /** + * The performance of a MLModel can vary depending on what instruments were used to capture the input to the MLModel. + * For example, a face detection model may perform differently depending on the camera’s hardware and software, + * including lens, image stabilization, high dynamic range techniques, and background blurring for portrait mode. + */ + instrumentation: optional array[string] + + /** + * A further factor affecting MLModel performance is the environment in which it is deployed. 
+ */ + environment: optional array[string] +} diff --git a/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/MLModelProperties.pdl b/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/MLModelProperties.pdl new file mode 100644 index 0000000000..19040aa801 --- /dev/null +++ b/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/MLModelProperties.pdl @@ -0,0 +1,46 @@ +namespace com.linkedin.ml.metadata + +import com.linkedin.common.MLFeatureUrn +import com.linkedin.common.Time +import com.linkedin.common.VersionTag + +/** + * Properties associated with a ML Model + */ +record MLModelProperties { + + /** + * Documentation of the MLModel + */ + description: optional string + + /** + * Date when the MLModel was developed + */ + date: optional Time + + /** + * Version of the MLModel + */ + version: optional VersionTag + + /** + * Type of Algorithm or MLModel such as whether it is a Naive Bayes classifier, Convolutional Neural Network, etc + */ + type: optional string + + /** + * Hyper Parameters of the MLModel + */ + hyperParameters: optional map[string, HyperParameterValueType] + + /** + * List of features used for MLModel training + */ + mlFeatures: optional array[MLFeatureUrn] + + /** + * Tags for the MLModel + */ + tags: array[string] = [ ] +} diff --git a/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/Metrics.pdl b/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/Metrics.pdl new file mode 100644 index 0000000000..6d735700fc --- /dev/null +++ b/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/Metrics.pdl @@ -0,0 +1,17 @@ +namespace com.linkedin.ml.metadata + +/** + * Metrics to be featured for the MLModel. + */ +record Metrics { + + /** + * Measures of MLModel performance + */ + performanceMeasures: optional array[string] + + /** + * Decision Thresholds used (if any)? 
+ */ + decisionThreshold: optional array[string] +} diff --git a/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/QuantitativeAnalyses.pdl b/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/QuantitativeAnalyses.pdl new file mode 100644 index 0000000000..e792974e27 --- /dev/null +++ b/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/QuantitativeAnalyses.pdl @@ -0,0 +1,17 @@ +namespace com.linkedin.ml.metadata + +/** + * Quantitative analyses should be disaggregated, that is, broken down by the chosen factors. Quantitative analyses should provide the results of evaluating the MLModel according to the chosen metrics, providing confidence interval values when possible. + */ +record QuantitativeAnalyses { + + /** + * Link to a dashboard with results showing how the MLModel performed with respect to each factor + */ + unitaryResults: optional ResultsType + + /** + * Link to a dashboard with results showing how the MLModel performed with respect to the intersection of evaluated factors? 
+ */ + intersectionalResults: optional ResultsType +} diff --git a/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/ResultsType.pdl b/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/ResultsType.pdl new file mode 100644 index 0000000000..c44bc9bffc --- /dev/null +++ b/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/ResultsType.pdl @@ -0,0 +1,6 @@ +namespace com.linkedin.ml.metadata + +/** + * A union of all supported value types for a result + */ +typeref ResultsType = union[string] diff --git a/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/SourceCode.pdl b/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/SourceCode.pdl new file mode 100644 index 0000000000..a1a3b015f3 --- /dev/null +++ b/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/SourceCode.pdl @@ -0,0 +1,12 @@ +namespace com.linkedin.ml.metadata + +/** + * Source Code + */ +record SourceCode { + + /** + * Source Code along with types + */ + sourceCode: array[SourceCodeUrl] +} diff --git a/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/SourceCodeUrl.pdl b/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/SourceCodeUrl.pdl new file mode 100644 index 0000000000..86d26397be --- /dev/null +++ b/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/SourceCodeUrl.pdl @@ -0,0 +1,19 @@ +namespace com.linkedin.ml.metadata + +import com.linkedin.common.Url + +/** + * Source Code Url Entity + */ +record SourceCodeUrl { + + /** + * Source Code Url Types + */ + type: SourceCodeUrlType + + /** + * Source Code Url + */ + sourceCodeUrl: Url +} diff --git a/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/SourceCodeUrlType.pdl b/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/SourceCodeUrlType.pdl new file mode 100644 index 0000000000..d62094bf4e --- /dev/null +++ b/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/SourceCodeUrlType.pdl @@ -0,0 +1,22 @@ +namespace com.linkedin.ml.metadata + +/** + * Source Code 
Url Types + */ +enum SourceCodeUrlType { + + /** + * MLModel Source Code + */ + ML_MODEL_SOURCE_CODE + + /** + * Training Pipeline Source Code + */ + TRAINING_PIPELINE_SOURCE_CODE + + /** + * Evaluation Pipeline Source Code + */ + EVALUATION_PIPELINE_SOURCE_CODE +} diff --git a/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/TrainingData.pdl b/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/TrainingData.pdl new file mode 100644 index 0000000000..d647192c71 --- /dev/null +++ b/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/TrainingData.pdl @@ -0,0 +1,14 @@ +namespace com.linkedin.ml.metadata + +import com.linkedin.common.DatasetUrn + +/** + * Ideally, the MLModel card would contain as much information about the training data as the evaluation data. However, there might be cases where it is not feasible to provide this level of detailed information about the training data. For example, the data may be proprietary, or require a non-disclosure agreement. In these cases, we advocate for basic details about the distributions over groups in the data, as well as any other details that could inform stakeholders on the kinds of biases the model may have encoded. + */ +record TrainingData { + + /** + * Details on the dataset(s) used for training the MLModel + */ + trainingData: array[BaseData] +}