diff --git a/datahub-web-react/src/app/entity/glossaryTerm/profile/GlossaryTermHeader.tsx b/datahub-web-react/src/app/entity/glossaryTerm/profile/GlossaryTermHeader.tsx
index 75530f326d..e3d5a84be9 100644
--- a/datahub-web-react/src/app/entity/glossaryTerm/profile/GlossaryTermHeader.tsx
+++ b/datahub-web-react/src/app/entity/glossaryTerm/profile/GlossaryTermHeader.tsx
@@ -1,9 +1,7 @@
-import { Avatar, Divider, Space, Tooltip, Typography } from 'antd';
+import { Divider, Space, Typography } from 'antd';
import React from 'react';
-import { Link } from 'react-router-dom';
-import { EntityType } from '../../../../types.generated';
import { useEntityRegistry } from '../../../useEntityRegistry';
-import defaultAvatar from '../../../../images/default_avatar.png';
+import { AvatarsGroup } from '../../../shared/avatar';
type Props = {
definition: string;
@@ -26,26 +24,7 @@ export default function GlossaryTermHeader({ definition, sourceRef, sourceUrl, o
)}
- {ownership && (
-
- {ownership?.owners?.map((owner) => (
-
-
-
-
-
- ))}
-
- )}
+
>
);
diff --git a/metadata-ingestion/README.md b/metadata-ingestion/README.md
index 49256d9e67..3e89485167 100644
--- a/metadata-ingestion/README.md
+++ b/metadata-ingestion/README.md
@@ -38,6 +38,7 @@ Sources:
| [athena](./source_docs/athena.md) | `pip install 'acryl-datahub[athena]'` | AWS Athena source |
| [bigquery](./source_docs/bigquery.md) | `pip install 'acryl-datahub[bigquery]'` | BigQuery source |
| [bigquery-usage](./source_docs/bigquery.md) | `pip install 'acryl-datahub[bigquery-usage]'` | BigQuery usage statistics source |
+| [datahub-business-glossary](./source_docs/business_glossary.md) | _no additional dependencies_ | Business Glossary File source |
| [dbt](./source_docs/dbt.md) | _no additional dependencies_ | dbt source |
| [druid](./source_docs/druid.md) | `pip install 'acryl-datahub[druid]'` | Druid Source |
| [feast](./source_docs/feast.md) | `pip install 'acryl-datahub[feast]'` | Feast source |
diff --git a/metadata-ingestion/examples/bootstrap_data/business_glossary.yml b/metadata-ingestion/examples/bootstrap_data/business_glossary.yml
new file mode 100644
index 0000000000..8e6b3cd1ee
--- /dev/null
+++ b/metadata-ingestion/examples/bootstrap_data/business_glossary.yml
@@ -0,0 +1,55 @@
+version: 1
+source: DataHub
+owners:
+ users:
+ - mjames
+url: "https://github.com/linkedin/datahub/"
+nodes:
+ - name: Classification
+ description: A set of terms related to Data Classification
+ terms:
+ - name: Sensitive
+ description: Sensitive Data
+ - name: Confidential
+ description: Confidential Data
+ - name: HighlyConfidential
+ description: Highly Confidential Data
+ - name: PersonalInformation
+ description: All terms related to personal information
+ owners:
+ users:
+ - mjames
+ terms:
+ - name: Email
+ description: An individual's email address
+ inherits:
+ - Classification.Confidential
+ owners:
+ groups:
+ - Trust and Safety
+ - name: Address
+ description: A physical address
+ - name: Gender
+ description: The gender identity of the individual
+ inherits:
+ - Classification.Sensitive
+ - name: ClientsAndAccounts
+ description: Provides basic concepts such as account, account holder, account provider, relationship manager that are commonly used by financial services providers to describe customers and to determine counterparty identities
+ owners:
+ groups:
+ - finance
+ terms:
+ - name: Account
+ description: Container for records associated with a business arrangement for regular transactions and services
+ term_source: "EXTERNAL"
+ source_ref: FIBO
+ source_url: "https://spec.edmcouncil.org/fibo/ontology/FBC/ProductsAndServices/ClientsAndAccounts/Account"
+ inherits:
+ - Classification.HighlyConfidential
+ contains:
+ - ClientsAndAccounts.Balance
+ - name: Balance
+ description: Amount of money available or owed
+ term_source: "EXTERNAL"
+ source_ref: FIBO
+ source_url: "https://spec.edmcouncil.org/fibo/ontology/FBC/ProductsAndServices/ClientsAndAccounts/Balance"
diff --git a/metadata-ingestion/examples/recipes/business_glossary_to_datahub.yml b/metadata-ingestion/examples/recipes/business_glossary_to_datahub.yml
new file mode 100644
index 0000000000..f260320064
--- /dev/null
+++ b/metadata-ingestion/examples/recipes/business_glossary_to_datahub.yml
@@ -0,0 +1,11 @@
+source:
+ type: datahub-business-glossary
+ config:
+ file: ./examples/bootstrap_data/business_glossary.yml
+
+
+sink:
+ type: datahub-rest
+ config:
+ server: http://localhost:8080
+
diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py
index fdd6dce8fc..b52f536b7d 100644
--- a/metadata-ingestion/setup.py
+++ b/metadata-ingestion/setup.py
@@ -79,6 +79,7 @@ plugins: Dict[str, Set[str]] = {
"azure": set(),
"bigquery": sql_common | {"pybigquery >= 0.6.0"},
"bigquery-usage": {"google-cloud-logging", "cachetools"},
+ "datahub-business-glossary": set(),
"dbt": set(),
"druid": sql_common | {"pydruid>=0.6.2"},
"feast": {"docker"},
@@ -232,6 +233,7 @@ entry_points = {
"ldap = datahub.ingestion.source.ldap:LDAPSource",
"looker = datahub.ingestion.source.looker:LookerDashboardSource",
"lookml = datahub.ingestion.source.lookml:LookMLSource",
+ "datahub-business-glossary = datahub.ingestion.source.metadata.business_glossary:BusinessGlossaryFileSource",
"mongodb = datahub.ingestion.source.mongodb:MongoDBSource",
"mssql = datahub.ingestion.source.sql.mssql:SQLServerSource",
"mysql = datahub.ingestion.source.sql.mysql:MySQLSource",
diff --git a/metadata-ingestion/source_docs/business_glossary.md b/metadata-ingestion/source_docs/business_glossary.md
new file mode 100644
index 0000000000..cf0ed1dcb3
--- /dev/null
+++ b/metadata-ingestion/source_docs/business_glossary.md
@@ -0,0 +1,49 @@
+# Business Glossary
+
+For context on getting started with ingestion, check out our [metadata ingestion guide](../README.md).
+
+## Setup
+
+Works with `acryl-datahub` out of the box.
+
+## Capabilities
+
+This plugin pulls business glossary metadata from a yaml-formatted file. An example of one such file is located in the examples directory [here](../examples/bootstrap_data/business_glossary.yml).
+
+## Quickstart recipe
+
+Check out the following recipe to get started with ingestion! See [below](#config-details) for full configuration options.
+
+For general pointers on writing and running a recipe, see our [main recipe guide](../README.md#recipes).
+
+```yml
+source:
+ type: datahub-business-glossary
+ config:
+ # Coordinates
+ file: /path/to/business_glossary_yaml
+
+sink:
+ # sink configs
+```
+
+## Config details
+
+Note that a `.` is used to denote nested fields in the YAML recipe.
+
+| Field | Required | Default | Description |
+| ---------- | -------- | ------- | ----------------------- |
+| `file` | ✅ | | Path to business glossary file to ingest. |
+
+### Business Glossary File Format
+
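+The file starts with the top-level fields `version`, `source`, `url` and `owners`, followed by a list of `nodes`. Each node has a `name` and a `description` and may define its own `owners`, `terms` and nested `nodes`; each term has a `name` and a `description` plus the optional fields `term_source`, `source_ref`, `source_url`, `owners`, `inherits` and `contains`. See the [sample business glossary](../examples/bootstrap_data/business_glossary.yml) for a complete example.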
+
+## Compatibility
+
+Compatible with version 1 of the business glossary format.
+The source will evolve as we publish newer versions of this format.
+
+## Questions
+
+If you've got any questions on configuring this source, feel free to ping us on [our Slack](https://slack.datahubproject.io/)!
diff --git a/metadata-ingestion/src/datahub/emitter/mce_builder.py b/metadata-ingestion/src/datahub/emitter/mce_builder.py
index 0029782798..6267fd6b25 100644
--- a/metadata-ingestion/src/datahub/emitter/mce_builder.py
+++ b/metadata-ingestion/src/datahub/emitter/mce_builder.py
@@ -47,6 +47,10 @@ def make_user_urn(username: str) -> str:
return f"urn:li:corpuser:{username}"
+def make_group_urn(groupname: str) -> str:
+    """Return the corpGroup URN for ``groupname``.
+
+    The name is placed verbatim after the ``urn:li:corpGroup:`` prefix; no
+    escaping or validation is applied.
+    """
+    prefix = "urn:li:corpGroup:"
+    return f"{prefix}{groupname}"
+
+
def make_tag_urn(tag: str) -> str:
return f"urn:li:tag:{tag}"
diff --git a/metadata-ingestion/src/datahub/ingestion/source/metadata/business_glossary.py b/metadata-ingestion/src/datahub/ingestion/source/metadata/business_glossary.py
new file mode 100644
index 0000000000..ac5110185f
--- /dev/null
+++ b/metadata-ingestion/src/datahub/ingestion/source/metadata/business_glossary.py
@@ -0,0 +1,261 @@
+import logging
+from dataclasses import dataclass, field
+from typing import Any, Iterable, List, Optional, Union
+
+from pydantic import validator
+
+import datahub.metadata.schema_classes as models
+from datahub.configuration.common import ConfigModel
+from datahub.configuration.config_loader import load_config_file
+from datahub.emitter.mce_builder import get_sys_time, make_group_urn, make_user_urn
+from datahub.ingestion.api.source import Source, SourceReport
+from datahub.ingestion.api.workunit import MetadataWorkUnit, UsageStatsWorkUnit
+
+logger = logging.getLogger(__name__)
+
+
+# Status aspect marking an entity as not removed; attached to every glossary node snapshot.
+valid_status: models.StatusClass = models.StatusClass(removed=False)
+# Audit stamp built once, when this module is imported (get_sys_time() runs at load time).
+# NOTE(review): nothing else in this module references auditStamp, and its actor URN is
+# spelled "corpUser" while make_user_urn() emits "corpuser" -- confirm which is intended.
+auditStamp = models.AuditStampClass(
+ time=get_sys_time(), actor="urn:li:corpUser:restEmitter"
+)
+
+
+class Owners(ConfigModel):
+ """Owners of a glossary, node, or term, given as user names and/or group names."""
+
+ # User names; get_owners() turns each one into a URN with make_user_urn().
+ users: Optional[List[str]]
+ # Group names; get_owners() turns each one into a URN with make_group_urn().
+ groups: Optional[List[str]]
+
+
+class GlossaryTermConfig(ConfigModel):
+ """A single glossary term as written in the glossary file."""
+
+ # Last component of the term's URN path and browse path.
+ name: str
+ # Used as the term's definition.
+ description: str
+ # Emitted as termSource; when None, the file-level source_type is used instead.
+ term_source: Optional[str]
+ # Emitted as sourceRef; when unset or empty, the file-level source is used instead.
+ source_ref: Optional[str]
+ # Emitted as sourceUrl; when unset or empty, the file-level url is used instead.
+ source_url: Optional[str]
+ # When set, replaces the ownership inherited from the enclosing node.
+ owners: Optional[Owners]
+ # Names of other terms, emitted as isRelatedTerms; each name is used verbatim in the URN.
+ inherits: Optional[List[str]]
+ # Names of other terms, emitted as hasRelatedTerms; each name is used verbatim in the URN.
+ contains: Optional[List[str]]
+
+
+class GlossaryNodeConfig(ConfigModel):
+ """A glossary node: a named grouping that may hold terms and nested nodes."""
+
+ # Appended to the parent path to form this node's URN path.
+ name: str
+ # Used as the node's definition.
+ description: str
+ # When set, replaces the ownership inherited from the parent (or the file-level owners).
+ owners: Optional[Owners]
+ # Terms that live directly under this node.
+ terms: Optional[List[GlossaryTermConfig]]
+ # Child nodes; the annotation is a string because the class refers to itself.
+ nodes: Optional[List["GlossaryNodeConfig"]]
+
+
+# Resolve the self-referencing "GlossaryNodeConfig" annotation now that the class exists.
+GlossaryNodeConfig.update_forward_refs()
+
+
+class DefaultConfig(ConfigModel):
+ """Holds defaults for populating fields in glossary terms"""
+
+ # Default sourceRef for terms that do not set source_ref.
+ source: str
+ # Default sourceUrl for terms that do not set source_url.
+ url: str
+ # File-level owners; inherited by root nodes that do not declare their own owners.
+ owners: Owners
+ # Default termSource for terms whose term_source is None.
+ source_type: Optional[str] = "INTERNAL"
+
+
+class BusinessGlossarySourceConfig(ConfigModel):
+ """Recipe configuration for the business glossary source."""
+
+ # Path of the glossary file; handed to load_config_file() by the source.
+ file: str
+
+
+class BusinessGlossaryConfig(DefaultConfig):
+    """Top-level schema of a business glossary file.
+
+    Adds the format ``version`` and the list of root ``nodes`` to the
+    file-wide defaults inherited from ``DefaultConfig``.
+    """
+
+    version: str
+    nodes: List[GlossaryNodeConfig]
+
+    @validator("version")
+    def version_must_be_1(cls, v):
+        """Accept only version "1" of the glossary format.
+
+        Raises:
+            ValueError: if ``v`` is anything other than "1".
+        """
+        if v != "1":
+            raise ValueError("Only version 1 is supported")
+        # A pydantic validator's return value becomes the field value, so the
+        # validated version has to be returned; otherwise ``version`` ends up None.
+        return v
+
+
+def make_glossary_node_urn(path: List[str]) -> str:
+    """Build a glossaryNode URN from the node's path components, joined with dots."""
+    dotted_path = ".".join(path)
+    return f"urn:li:glossaryNode:{dotted_path}"
+
+
+def make_glossary_term_urn(path: List[str]) -> str:
+    """Build a glossaryTerm URN from the term's path components, joined with dots."""
+    dotted_path = ".".join(path)
+    return f"urn:li:glossaryTerm:{dotted_path}"
+
+
+def get_owners(owners: Owners) -> models.OwnershipClass:
+    """Translate an ``Owners`` config into an ownership aspect.
+
+    Users are listed first, then groups.  Every owner is given the
+    ``DEVELOPER`` ownership type.  A missing ``users`` or ``groups`` list
+    contributes no owners.
+    """
+    owner_urns: List[str] = []
+    if owners.users is not None:
+        owner_urns.extend(make_user_urn(user) for user in owners.users)
+    if owners.groups is not None:
+        owner_urns.extend(make_group_urn(group) for group in owners.groups)
+
+    return models.OwnershipClass(
+        owners=[
+            models.OwnerClass(owner=urn, type=models.OwnershipTypeClass.DEVELOPER)
+            for urn in owner_urns
+        ]
+    )
+
+
+def get_mces(
+    glossary: BusinessGlossaryConfig,
+) -> List[models.MetadataChangeEventClass]:
+    """Convert a parsed glossary into a flat list of metadata change events.
+
+    Each root node is walked with no parent node.  The file-level owners act
+    as the inherited ownership, and the glossary itself supplies the defaults
+    used when populating terms.
+    """
+    top_level_owners = get_owners(glossary.owners)
+    collected: List[models.MetadataChangeEventClass] = []
+    for root in glossary.nodes:
+        collected.extend(
+            get_mces_from_node(
+                root,
+                [root.name],
+                parentNode=None,
+                parentOwners=top_level_owners,
+                defaults=glossary,
+            )
+        )
+    return collected
+
+
+def get_mce_from_snapshot(snapshot: Any) -> models.MetadataChangeEventClass:
+    """Wrap an entity snapshot in a MetadataChangeEvent.
+
+    NOTE(review): every event is stamped with the fixed run id "test-glossary";
+    confirm that a hard-coded run id is what is wanted here.
+    """
+    run_metadata = models.SystemMetadataClass(runId="test-glossary")
+    return models.MetadataChangeEventClass(
+        proposedSnapshot=snapshot,
+        systemMetadata=run_metadata,
+    )
+
+
+def get_mces_from_node(
+    glossaryNode: GlossaryNodeConfig,
+    path: List[str],
+    parentNode: Optional[str],
+    parentOwners: models.OwnershipClass,
+    defaults: DefaultConfig,
+) -> List[models.MetadataChangeEventClass]:
+    """Emit events for a glossary node and, recursively, everything beneath it.
+
+    The returned list starts with the node's own event, followed by the events
+    of its child nodes and then those of its terms.  A node without its own
+    ``owners`` uses ``parentOwners``; whichever ownership applies is also
+    handed down to the node's children and terms.
+
+    Args:
+        glossaryNode: the node to convert.
+        path: path components of this node, ending with the node's own name.
+        parentNode: URN of the enclosing node, or None for a root node.
+        parentOwners: ownership to fall back on when the node declares none.
+        defaults: defaults forwarded unchanged to child nodes and terms.
+    """
+    this_urn = make_glossary_node_urn(path)
+    info_aspect = models.GlossaryNodeInfoClass(
+        definition=glossaryNode.description,
+        parentNode=parentNode,
+    )
+    effective_owners = (
+        get_owners(glossaryNode.owners)
+        if glossaryNode.owners is not None
+        else parentOwners
+    )
+    snapshot = models.GlossaryNodeSnapshotClass(
+        urn=this_urn,
+        aspects=[info_aspect, effective_owners, valid_status],
+    )
+    collected = [get_mce_from_snapshot(snapshot)]
+
+    # Child nodes come before terms, matching the order of the emitted events.
+    for child in glossaryNode.nodes or []:
+        collected.extend(
+            get_mces_from_node(
+                child,
+                path + [child.name],
+                parentNode=this_urn,
+                parentOwners=effective_owners,
+                defaults=defaults,
+            )
+        )
+
+    for term in glossaryNode.terms or []:
+        collected.extend(
+            get_mces_from_term(
+                term,
+                path + [term.name],
+                parentNode=this_urn,
+                parentOwnership=effective_owners,
+                defaults=defaults,
+            )
+        )
+    return collected
+
+
+def get_mces_from_term(
+    glossaryTerm: GlossaryTermConfig,
+    path: List[str],
+    parentNode: str,
+    parentOwnership: models.OwnershipClass,
+    defaults: DefaultConfig,
+) -> List[models.MetadataChangeEventClass]:
+    """Build the single event describing one glossary term.
+
+    The term snapshot carries, in order: the term info, the related-terms
+    aspect (only when ``inherits`` or ``contains`` is given), the ownership,
+    and a browse path made from ``path``.
+
+    Args:
+        glossaryTerm: the term to convert.
+        path: path components of the term, ending with the term's own name.
+        parentNode: URN of the node that contains the term.
+        parentOwnership: ownership used when the term declares no owners.
+        defaults: fallbacks for the term's source type, source ref and source URL.
+    """
+    term_urn = make_glossary_term_urn(path)
+
+    # term_source falls back only when it is None; source_ref and source_url
+    # also fall back when they are empty strings.
+    term_source = (
+        defaults.source_type
+        if glossaryTerm.term_source is None
+        else glossaryTerm.term_source
+    )
+    info_aspect = models.GlossaryTermInfoClass(
+        definition=glossaryTerm.description,
+        termSource=term_source,  # type: ignore
+        sourceRef=glossaryTerm.source_ref or defaults.source,
+        sourceUrl=glossaryTerm.source_url or defaults.url,
+        parentNode=parentNode,
+    )
+    aspects: List[
+        Union[
+            models.GlossaryTermInfoClass,
+            models.GlossaryRelatedTermsClass,
+            models.OwnershipClass,
+            models.StatusClass,
+            models.GlossaryTermKeyClass,
+            models.BrowsePathsClass,
+        ]
+    ] = [info_aspect]
+
+    # Each related term name is used verbatim as the whole URN path.
+    inherited_urns = (
+        None
+        if glossaryTerm.inherits is None
+        else [make_glossary_term_urn([name]) for name in glossaryTerm.inherits]
+    )
+    contained_urns = (
+        None
+        if glossaryTerm.contains is None
+        else [make_glossary_term_urn([name]) for name in glossaryTerm.contains]
+    )
+    if not (inherited_urns is None and contained_urns is None):
+        aspects.append(
+            models.GlossaryRelatedTermsClass(
+                isRelatedTerms=inherited_urns, hasRelatedTerms=contained_urns
+            )
+        )
+
+    aspects.append(
+        get_owners(glossaryTerm.owners)
+        if glossaryTerm.owners is not None
+        else parentOwnership
+    )
+    aspects.append(models.BrowsePathsClass(paths=["/" + "/".join(path)]))
+
+    snapshot = models.GlossaryTermSnapshotClass(urn=term_urn, aspects=aspects)
+    return [get_mce_from_snapshot(snapshot)]
+
+
+@dataclass
+class BusinessGlossaryFileSource(Source):
+    """Ingestion source that reads a business glossary file and yields one
+    work unit per generated metadata change event."""
+
+    config: BusinessGlossarySourceConfig
+    report: SourceReport = field(default_factory=SourceReport)
+
+    @classmethod
+    def create(cls, config_dict, ctx):
+        """Parse ``config_dict`` into a source config and build the source for ``ctx``."""
+        return cls(ctx, BusinessGlossarySourceConfig.parse_obj(config_dict))
+
+    def load_glossary_config(self, file_name: str) -> BusinessGlossaryConfig:
+        """Load ``file_name`` and validate its contents as a business glossary."""
+        raw_glossary = load_config_file(file_name)
+        return BusinessGlossaryConfig.parse_obj(raw_glossary)
+
+    def get_workunits(self) -> Iterable[Union[MetadataWorkUnit, UsageStatsWorkUnit]]:
+        """Yield a work unit for each event, reporting it before it is yielded.
+
+        The snapshot URN is passed as the work unit's first argument.
+        """
+        glossary = self.load_glossary_config(self.config.file)
+        for event in get_mces(glossary):
+            work_unit = MetadataWorkUnit(f"{event.proposedSnapshot.urn}", mce=event)
+            self.report.report_workunit(work_unit)
+            yield work_unit
+
+    def get_report(self):
+        """Return the report that accumulates the emitted work units."""
+        return self.report
+
+    def close(self):
+        """Nothing to release."""
+        pass
diff --git a/metadata-ingestion/src/datahub/metadata/schema.avsc b/metadata-ingestion/src/datahub/metadata/schema.avsc
index 7b8ea15223..492060358a 100644
--- a/metadata-ingestion/src/datahub/metadata/schema.avsc
+++ b/metadata-ingestion/src/datahub/metadata/schema.avsc
@@ -1075,7 +1075,7 @@
},
"type": "string",
"name": "pictureLink",
- "default": "https://raw.githubusercontent.com/linkedin/datahub/master/datahub-web/packages/data-portal/public/assets/images/default_avatar.png",
+ "default": "https://raw.githubusercontent.com/linkedin/datahub/master/datahub-web-react/src/images/default_avatar.png",
"doc": "A URL which points to a picture which user wants to set as a profile photo"
}
],
diff --git a/metadata-ingestion/src/datahub/metadata/schema_classes.py b/metadata-ingestion/src/datahub/metadata/schema_classes.py
index c49c284f66..2ba336aa8d 100644
--- a/metadata-ingestion/src/datahub/metadata/schema_classes.py
+++ b/metadata-ingestion/src/datahub/metadata/schema_classes.py
@@ -3718,7 +3718,7 @@ class CorpUserEditableInfoClass(DictWrapper):
else:
self.skills = skills
if pictureLink is None:
- # default: 'https://raw.githubusercontent.com/linkedin/datahub/master/datahub-web/packages/data-portal/public/assets/images/default_avatar.png'
+ # default: 'https://raw.githubusercontent.com/linkedin/datahub/master/datahub-web-react/src/images/default_avatar.png'
self.pictureLink = self.RECORD_SCHEMA.field_map["pictureLink"].default
else:
self.pictureLink = pictureLink
diff --git a/metadata-ingestion/src/datahub/metadata/schemas/MetadataChangeEvent.avsc b/metadata-ingestion/src/datahub/metadata/schemas/MetadataChangeEvent.avsc
index 2b7596dc7a..0ed27dc039 100644
--- a/metadata-ingestion/src/datahub/metadata/schemas/MetadataChangeEvent.avsc
+++ b/metadata-ingestion/src/datahub/metadata/schemas/MetadataChangeEvent.avsc
@@ -1056,7 +1056,7 @@
"name": "pictureLink",
"type": "string",
"doc": "A URL which points to a picture which user wants to set as a profile photo",
- "default": "https://raw.githubusercontent.com/linkedin/datahub/master/datahub-web/packages/data-portal/public/assets/images/default_avatar.png",
+ "default": "https://raw.githubusercontent.com/linkedin/datahub/master/datahub-web-react/src/images/default_avatar.png",
"java": {
"class": "com.linkedin.pegasus2avro.common.url.Url",
"coercerClass": "com.linkedin.pegasus2avro.common.url.UrlCoercer"