diff --git a/datahub-web-react/src/app/entity/glossaryTerm/profile/GlossaryTermHeader.tsx b/datahub-web-react/src/app/entity/glossaryTerm/profile/GlossaryTermHeader.tsx index 75530f326d..e3d5a84be9 100644 --- a/datahub-web-react/src/app/entity/glossaryTerm/profile/GlossaryTermHeader.tsx +++ b/datahub-web-react/src/app/entity/glossaryTerm/profile/GlossaryTermHeader.tsx @@ -1,9 +1,7 @@ -import { Avatar, Divider, Space, Tooltip, Typography } from 'antd'; +import { Divider, Space, Typography } from 'antd'; import React from 'react'; -import { Link } from 'react-router-dom'; -import { EntityType } from '../../../../types.generated'; import { useEntityRegistry } from '../../../useEntityRegistry'; -import defaultAvatar from '../../../../images/default_avatar.png'; +import { AvatarsGroup } from '../../../shared/avatar'; type Props = { definition: string; @@ -26,26 +24,7 @@ export default function GlossaryTermHeader({ definition, sourceRef, sourceUrl, o )} - {ownership && ( - - {ownership?.owners?.map((owner) => ( - - - - - - ))} - - )} + ); diff --git a/metadata-ingestion/README.md b/metadata-ingestion/README.md index 49256d9e67..3e89485167 100644 --- a/metadata-ingestion/README.md +++ b/metadata-ingestion/README.md @@ -38,6 +38,7 @@ Sources: | [athena](./source_docs/athena.md) | `pip install 'acryl-datahub[athena]'` | AWS Athena source | | [bigquery](./source_docs/bigquery.md) | `pip install 'acryl-datahub[bigquery]'` | BigQuery source | | [bigquery-usage](./source_docs/bigquery.md) | `pip install 'acryl-datahub[bigquery-usage]'` | BigQuery usage statistics source | +| [datahub-business-glossary](./source_docs/business_glossary.md) | _no additional dependencies_ | Business Glossary File source | | [dbt](./source_docs/dbt.md) | _no additional dependencies_ | dbt source | | [druid](./source_docs/druid.md) | `pip install 'acryl-datahub[druid]'` | Druid Source | | [feast](./source_docs/feast.md) | `pip install 'acryl-datahub[feast]'` | Feast source | diff --git 
a/metadata-ingestion/examples/bootstrap_data/business_glossary.yml b/metadata-ingestion/examples/bootstrap_data/business_glossary.yml new file mode 100644 index 0000000000..8e6b3cd1ee --- /dev/null +++ b/metadata-ingestion/examples/bootstrap_data/business_glossary.yml @@ -0,0 +1,55 @@ +version: 1 +source: DataHub +owners: + users: + - mjames +url: "https://github.com/linkedin/datahub/" +nodes: + - name: Classification + description: A set of terms related to Data Classification + terms: + - name: Sensitive + description: Sensitive Data + - name: Confidential + description: Confidential Data + - name: HighlyConfidential + description: Highly Confidential Data + - name: PersonalInformation + description: All terms related to personal information + owners: + users: + - mjames + terms: + - name: Email + description: An individual's email address + inherits: + - Classification.Confidential + owners: + groups: + - Trust and Safety + - name: Address + description: A physical address + - name: Gender + description: The gender identity of the individual + inherits: + - Classification.Sensitive + - name: ClientsAndAccounts + description: Provides basic concepts such as account, account holder, account provider, relationship manager that are commonly used by financial services providers to describe customers and to determine counterparty identities + owners: + groups: + - finance + terms: + - name: Account + description: Container for records associated with a business arrangement for regular transactions and services + term_source: "EXTERNAL" + source_ref: FIBO + source_url: "https://spec.edmcouncil.org/fibo/ontology/FBC/ProductsAndServices/ClientsAndAccounts/Account" + inherits: + - Classification.HighlyConfidential + contains: + - ClientsAndAccounts.Balance + - name: Balance + description: Amount of money available or owed + term_source: "EXTERNAL" + source_ref: FIBO + source_url: 
"https://spec.edmcouncil.org/fibo/ontology/FBC/ProductsAndServices/ClientsAndAccounts/Balance" diff --git a/metadata-ingestion/examples/recipes/business_glossary_to_datahub.yml b/metadata-ingestion/examples/recipes/business_glossary_to_datahub.yml new file mode 100644 index 0000000000..f260320064 --- /dev/null +++ b/metadata-ingestion/examples/recipes/business_glossary_to_datahub.yml @@ -0,0 +1,11 @@ +source: + type: datahub-business-glossary + config: + file: ./examples/bootstrap_data/business_glossary.yml + + +sink: + type: datahub-rest + config: + server: http://localhost:8080 + diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py index fdd6dce8fc..b52f536b7d 100644 --- a/metadata-ingestion/setup.py +++ b/metadata-ingestion/setup.py @@ -79,6 +79,7 @@ plugins: Dict[str, Set[str]] = { "azure": set(), "bigquery": sql_common | {"pybigquery >= 0.6.0"}, "bigquery-usage": {"google-cloud-logging", "cachetools"}, + "datahub-business-glossary": set(), "dbt": set(), "druid": sql_common | {"pydruid>=0.6.2"}, "feast": {"docker"}, @@ -232,6 +233,7 @@ entry_points = { "ldap = datahub.ingestion.source.ldap:LDAPSource", "looker = datahub.ingestion.source.looker:LookerDashboardSource", "lookml = datahub.ingestion.source.lookml:LookMLSource", + "datahub-business-glossary = datahub.ingestion.source.metadata.business_glossary:BusinessGlossaryFileSource", "mongodb = datahub.ingestion.source.mongodb:MongoDBSource", "mssql = datahub.ingestion.source.sql.mssql:SQLServerSource", "mysql = datahub.ingestion.source.sql.mysql:MySQLSource", diff --git a/metadata-ingestion/source_docs/business_glossary.md b/metadata-ingestion/source_docs/business_glossary.md new file mode 100644 index 0000000000..cf0ed1dcb3 --- /dev/null +++ b/metadata-ingestion/source_docs/business_glossary.md @@ -0,0 +1,49 @@ +# Business Glossary + +For context on getting started with ingestion, check out our [metadata ingestion guide](../README.md). + +## Setup + +Works with `acryl-datahub` out of the box. 
+ +## Capabilities + +This plugin pulls business glossary metadata from a YAML-formatted file. An example of one such file is located in the examples directory [here](../examples/bootstrap_data/business_glossary.yml). + +## Quickstart recipe + +Check out the following recipe to get started with ingestion! See [below](#config-details) for full configuration options. + +For general pointers on writing and running a recipe, see our [main recipe guide](../README.md#recipes). + +```yml +source: + type: datahub-business-glossary + config: + # Coordinates + file: /path/to/business_glossary_yaml + +sink: + # sink configs +``` + +## Config details + +Note that a `.` is used to denote nested fields in the YAML recipe. + +| Field | Required | Default | Description | +| ---------- | -------- | ------- | ----------------------- | +| `file` | ✅ | | Path to business glossary file to ingest. | + +### Business Glossary File Format + +The business glossary file format is best understood by reading the sample business glossary checked in [here](../examples/bootstrap_data/business_glossary.yml). + +## Compatibility + +Compatible with version 1 of the business glossary format. +This source will evolve as we publish newer versions of the format. + +## Questions + +If you've got any questions on configuring this source, feel free to ping us on [our Slack](https://slack.datahubproject.io/)! 
diff --git a/metadata-ingestion/src/datahub/emitter/mce_builder.py b/metadata-ingestion/src/datahub/emitter/mce_builder.py
index 0029782798..6267fd6b25 100644
--- a/metadata-ingestion/src/datahub/emitter/mce_builder.py
+++ b/metadata-ingestion/src/datahub/emitter/mce_builder.py
@@ -47,6 +47,10 @@ def make_user_urn(username: str) -> str:
     return f"urn:li:corpuser:{username}"
 
 
+def make_group_urn(groupname: str) -> str:
+    return f"urn:li:corpGroup:{groupname}"
+
+
 def make_tag_urn(tag: str) -> str:
     return f"urn:li:tag:{tag}"
 
diff --git a/metadata-ingestion/src/datahub/ingestion/source/metadata/business_glossary.py b/metadata-ingestion/src/datahub/ingestion/source/metadata/business_glossary.py
new file mode 100644
index 0000000000..ac5110185f
--- /dev/null
+++ b/metadata-ingestion/src/datahub/ingestion/source/metadata/business_glossary.py
@@ -0,0 +1,273 @@
+import logging
+from dataclasses import dataclass, field
+from typing import Any, Iterable, List, Optional, Union
+
+from pydantic import validator
+
+import datahub.metadata.schema_classes as models
+from datahub.configuration.common import ConfigModel
+from datahub.configuration.config_loader import load_config_file
+from datahub.emitter.mce_builder import get_sys_time, make_group_urn, make_user_urn
+from datahub.ingestion.api.source import Source, SourceReport
+from datahub.ingestion.api.workunit import MetadataWorkUnit, UsageStatsWorkUnit
+
+logger = logging.getLogger(__name__)
+
+
+valid_status: models.StatusClass = models.StatusClass(removed=False)
+auditStamp = models.AuditStampClass(
+    time=get_sys_time(), actor="urn:li:corpUser:restEmitter"
+)
+
+
+class Owners(ConfigModel):
+    users: Optional[List[str]]
+    groups: Optional[List[str]]
+
+
+class GlossaryTermConfig(ConfigModel):
+    name: str
+    description: str
+    term_source: Optional[str]
+    source_ref: Optional[str]
+    source_url: Optional[str]
+    owners: Optional[Owners]
+    inherits: Optional[List[str]]
+    contains: Optional[List[str]]
+
+
+class GlossaryNodeConfig(ConfigModel):
+    name: str
+    description: str
+    owners: Optional[Owners]
+    terms: Optional[List[GlossaryTermConfig]]
+    nodes: Optional[List["GlossaryNodeConfig"]]
+
+
+GlossaryNodeConfig.update_forward_refs()
+
+
+class DefaultConfig(ConfigModel):
+    """Holds defaults for populating fields in glossary terms"""
+
+    source: str
+    url: str
+    owners: Owners
+    source_type: Optional[str] = "INTERNAL"
+
+
+class BusinessGlossarySourceConfig(ConfigModel):
+    file: str
+
+
+class BusinessGlossaryConfig(DefaultConfig):
+    version: str
+    nodes: List[GlossaryNodeConfig]
+
+    @validator("version")
+    def version_must_be_1(cls, v):
+        """Reject any glossary file whose declared format version is not "1"."""
+        if v != "1":
+            raise ValueError("Only version 1 is supported")
+        # pydantic assigns the validator's return value to the field; hand v back.
+        return v
+
+
+def make_glossary_node_urn(path: List[str]) -> str:
+    """Build a glossaryNode URN by joining the path components with dots."""
+    return "urn:li:glossaryNode:" + ".".join(path)
+
+
+def make_glossary_term_urn(path: List[str]) -> str:
+    """Build a glossaryTerm URN by joining the path components with dots."""
+    return "urn:li:glossaryTerm:" + ".".join(path)
+
+
+def get_owners(owners: Owners) -> models.OwnershipClass:
+    """Build an Ownership aspect from the configured users and groups."""
+    owners_meta: List[models.OwnerClass] = []
+    if owners.users is not None:
+        owners_meta = owners_meta + [
+            models.OwnerClass(
+                owner=make_user_urn(o),
+                type=models.OwnershipTypeClass.DEVELOPER,
+            )
+            for o in owners.users
+        ]
+    if owners.groups is not None:
+        owners_meta = owners_meta + [
+            models.OwnerClass(
+                owner=make_group_urn(o),
+                type=models.OwnershipTypeClass.DEVELOPER,
+            )
+            for o in owners.groups
+        ]
+    return models.OwnershipClass(owners=owners_meta)
+
+
+def get_mces(
+    glossary: BusinessGlossaryConfig,
+) -> List[models.MetadataChangeEventClass]:
+    """Walk every top-level node of the glossary and collect the resulting MCEs."""
+    events: List[models.MetadataChangeEventClass] = []
+    path: List[str] = []
+    root_owners = get_owners(glossary.owners)
+
+    for node in glossary.nodes:
+        events += get_mces_from_node(
+            node,
+            path + [node.name],
+            parentNode=None,
+            parentOwners=root_owners,
+            defaults=glossary,
+        )
+    return events
+
+
+def get_mce_from_snapshot(snapshot: Any) -> models.MetadataChangeEventClass:
+    """Wrap a snapshot in a MetadataChangeEvent."""
+    return models.MetadataChangeEventClass(
+        proposedSnapshot=snapshot,
+        systemMetadata=models.SystemMetadataClass(runId="test-glossary"),
+    )
+
+
+def get_mces_from_node(
+    glossaryNode: GlossaryNodeConfig,
+    path: List[str],
+    parentNode: Optional[str],
+    parentOwners: models.OwnershipClass,
+    defaults: DefaultConfig,
+) -> List[models.MetadataChangeEventClass]:
+    """Emit the MCE for one node, then recurse into its child nodes and terms."""
+    node_urn = make_glossary_node_urn(path)
+    node_info = models.GlossaryNodeInfoClass(
+        definition=glossaryNode.description,
+        parentNode=parentNode,
+    )
+    node_owners = parentOwners
+    if glossaryNode.owners is not None:
+        assert glossaryNode.owners is not None
+        node_owners = get_owners(glossaryNode.owners)
+
+    node_snapshot = models.GlossaryNodeSnapshotClass(
+        urn=node_urn,
+        aspects=[node_info, node_owners, valid_status],
+    )
+    mces = [get_mce_from_snapshot(node_snapshot)]
+    if glossaryNode.nodes:
+        for node in glossaryNode.nodes:
+            mces += get_mces_from_node(
+                node,
+                path + [node.name],
+                parentNode=node_urn,
+                parentOwners=node_owners,
+                defaults=defaults,
+            )
+
+    if glossaryNode.terms:
+        for term in glossaryNode.terms:
+            mces += get_mces_from_term(
+                term,
+                path + [term.name],
+                parentNode=node_urn,
+                parentOwnership=node_owners,
+                defaults=defaults,
+            )
+    return mces
+
+
+def get_mces_from_term(
+    glossaryTerm: GlossaryTermConfig,
+    path: List[str],
+    parentNode: str,
+    parentOwnership: models.OwnershipClass,
+    defaults: DefaultConfig,
+) -> List[models.MetadataChangeEventClass]:
+    """Emit the MCE for one term, falling back to the glossary defaults."""
+    term_urn = make_glossary_term_urn(path)
+    aspects: List[
+        Union[
+            models.GlossaryTermInfoClass,
+            models.GlossaryRelatedTermsClass,
+            models.OwnershipClass,
+            models.StatusClass,
+            models.GlossaryTermKeyClass,
+            models.BrowsePathsClass,
+        ]
+    ] = []
+    term_info = models.GlossaryTermInfoClass(
+        definition=glossaryTerm.description,
+        termSource=glossaryTerm.term_source  # type: ignore
+        if glossaryTerm.term_source is not None
+        else defaults.source_type,
+        sourceRef=glossaryTerm.source_ref
+        if glossaryTerm.source_ref
+        else defaults.source,
+        sourceUrl=glossaryTerm.source_url if glossaryTerm.source_url else defaults.url,
+        parentNode=parentNode,
+    )
+    aspects.append(term_info)
+
+    isA_related = None
+    hasA_related = None
+    if glossaryTerm.inherits is not None:
+        assert glossaryTerm.inherits is not None
+        isA_related = [make_glossary_term_urn([term]) for term in glossaryTerm.inherits]
+    if glossaryTerm.contains is not None:
+        assert glossaryTerm.contains is not None
+        hasA_related = [
+            make_glossary_term_urn([term]) for term in glossaryTerm.contains
+        ]
+
+    if isA_related is not None or hasA_related is not None:
+        relatedTerms = models.GlossaryRelatedTermsClass(
+            isRelatedTerms=isA_related, hasRelatedTerms=hasA_related
+        )
+        aspects.append(relatedTerms)
+
+    ownership: models.OwnershipClass = parentOwnership
+    if glossaryTerm.owners is not None:
+        assert glossaryTerm.owners is not None
+        ownership = get_owners(glossaryTerm.owners)
+    aspects.append(ownership)
+
+    term_browse = models.BrowsePathsClass(paths=["/" + "/".join(path)])
+    aspects.append(term_browse)
+
+    term_snapshot: models.GlossaryTermSnapshotClass = models.GlossaryTermSnapshotClass(
+        urn=term_urn,
+        aspects=aspects,
+    )
+    return [get_mce_from_snapshot(term_snapshot)]
+
+
+@dataclass
+class BusinessGlossaryFileSource(Source):
+    """Ingestion source that turns a business glossary file into work units."""
+
+    config: BusinessGlossarySourceConfig
+    report: SourceReport = field(default_factory=SourceReport)
+
+    @classmethod
+    def create(cls, config_dict, ctx):
+        config = BusinessGlossarySourceConfig.parse_obj(config_dict)
+        return cls(ctx, config)
+
+    def load_glossary_config(self, file_name: str) -> BusinessGlossaryConfig:
+        config = load_config_file(file_name)
+        glossary_cfg = BusinessGlossaryConfig.parse_obj(config)
+        return glossary_cfg
+
+    def get_workunits(self) -> Iterable[Union[MetadataWorkUnit, UsageStatsWorkUnit]]:
+        glossary_config = self.load_glossary_config(self.config.file)
+        for mce in get_mces(glossary_config):
+            wu = MetadataWorkUnit(f"{mce.proposedSnapshot.urn}", mce=mce)
+            self.report.report_workunit(wu)
+            yield wu
+
+    def get_report(self):
+        return self.report
+
+    def close(self):
+        pass
diff --git a/metadata-ingestion/src/datahub/metadata/schema.avsc b/metadata-ingestion/src/datahub/metadata/schema.avsc
index 7b8ea15223..492060358a 100644
--- a/metadata-ingestion/src/datahub/metadata/schema.avsc
+++ b/metadata-ingestion/src/datahub/metadata/schema.avsc
@@ -1075,7 +1075,7 @@
             },
             "type": "string",
             "name": "pictureLink",
-            "default": "https://raw.githubusercontent.com/linkedin/datahub/master/datahub-web/packages/data-portal/public/assets/images/default_avatar.png",
+            "default": "https://raw.githubusercontent.com/linkedin/datahub/master/datahub-web-react/src/images/default_avatar.png",
             "doc": "A URL which points to a picture which user wants to set as a profile photo"
           }
         ],
diff --git a/metadata-ingestion/src/datahub/metadata/schema_classes.py b/metadata-ingestion/src/datahub/metadata/schema_classes.py
index c49c284f66..2ba336aa8d 100644
--- a/metadata-ingestion/src/datahub/metadata/schema_classes.py
+++ b/metadata-ingestion/src/datahub/metadata/schema_classes.py
@@ -3718,7 +3718,7 @@ class CorpUserEditableInfoClass(DictWrapper):
         else:
             self.skills = skills
         if pictureLink is None:
-            # default: 'https://raw.githubusercontent.com/linkedin/datahub/master/datahub-web/packages/data-portal/public/assets/images/default_avatar.png'
+            # default: 'https://raw.githubusercontent.com/linkedin/datahub/master/datahub-web-react/src/images/default_avatar.png'
             self.pictureLink = self.RECORD_SCHEMA.field_map["pictureLink"].default
         else:
             self.pictureLink = pictureLink
diff --git a/metadata-ingestion/src/datahub/metadata/schemas/MetadataChangeEvent.avsc b/metadata-ingestion/src/datahub/metadata/schemas/MetadataChangeEvent.avsc
index 2b7596dc7a..0ed27dc039 100644
--- a/metadata-ingestion/src/datahub/metadata/schemas/MetadataChangeEvent.avsc
+++ b/metadata-ingestion/src/datahub/metadata/schemas/MetadataChangeEvent.avsc
@@ -1056,7 +1056,7 @@
             "name": "pictureLink",
             "type": "string",
             "doc": "A URL which points to a picture which user wants to set as a profile photo",
-            "default": "https://raw.githubusercontent.com/linkedin/datahub/master/datahub-web/packages/data-portal/public/assets/images/default_avatar.png",
+            "default": "https://raw.githubusercontent.com/linkedin/datahub/master/datahub-web-react/src/images/default_avatar.png",
             "java": {
               "class": "com.linkedin.pegasus2avro.common.url.Url",
               "coercerClass": "com.linkedin.pegasus2avro.common.url.UrlCoercer"