mirror of
https://github.com/datahub-project/datahub.git
synced 2025-08-23 08:38:02 +00:00
1 line
20 KiB
JavaScript
1 line
20 KiB
JavaScript
/**
 * Minified webpack chunk 92637 (single module 23078) pushed onto
 * `self.webpackChunkdocs_website`. The module is the compiled MDX page
 * "Terms" (slug "/api/tutorials/terms", docs version "1.1.0").
 *
 * Exports registered through `a.d(t, {...})`:
 *   assets -> p ({}), contentTitle -> u ("Terms"), default -> h (page component),
 *   frontMatter -> m, metadata -> d, toc -> c (heading list used for `g = {toc: c}`).
 *
 * Local helpers (minifier-assigned names):
 *   o(e, t, a) - sets e[t] = a; when `t` already exists in `e` it is redefined via
 *                Object.defineProperty as enumerable/configurable/writable. Returns e.
 *   l(e, t)    - copies the own property descriptors of `t` (defaulting to {}) onto `e`
 *                and returns `e`; falls back to a per-key defineProperty loop when
 *                Object.getOwnPropertyDescriptors is unavailable.
 *   i(e, t)    - returns a new object with the own enumerable keys (string and symbol)
 *                of `e`, excluding any key listed in array `t`; returns {} for null/undefined `e`.
 *   h(e)       - page component: splits `components` off the props, merges `g` and the
 *                remaining props, adds `{components, mdxType: "MDXLayout"}`, and passes that
 *                plus the page content as arguments to `r.yg("wrapper", ...)`.
 *
 * `r`, `n` and `s` come from modules 15680, 53720 and 5400; `n.A` / `s.A` are used with
 * mdxType "Tabs" / "TabItem". NOTE(review): `r.yg` looks like the MDX element factory —
 * confirm against the bundled module, it is not visible here.
 *
 * NOTE(review): this appears to be generated build output; the metadata below names
 * docs/api/tutorials/terms.md as the edit URL, so content fixes presumably belong there.
 * Content issues spotted in the embedded strings (left untouched here on purpose):
 *   - "orgarnization" (intro paragraph), "Read : read", "how can add terms",
 *     "The following code remove a term", "has been removed to `user_name` column";
 *   - a stray "```" at the end of the curl "Expected Response" sample under Read Terms;
 *   - GraphQL snippets are tagged language-json and JSON responses language-python.
 *
 * NOTE(review): the physical line breaks in this copy fall inside string literals and
 * expressions (the upstream file is reported as a single line), and the trailing " |"
 * after the final ");" looks like mirror-page residue — verify against the real artifact.
 */
"use strict";(self.webpackChunkdocs_website=self.webpackChunkdocs_website||[]).push([[92637],{23078:(e,t,a)=>{a.r(t),a.d(t,{assets:()=>p,contentTitle:()=>u,default:()=>h,frontMatter:()=>m,metadata:()=>d,toc:()=>c});a(96540);var r=a(15680),n=a(53720),s=a(5400);function o(e,t,a){return t in e?Object.defineProperty(e,t,{value:a,enumerable:!0,configurable:!0,writable:!0}):e[t]=a,e}function l(e,t){return t=null!=t?t:{},Object.getOwnPropertyDescriptors?Object.defineProperties(e,Object.getOwnPropertyDescriptors(t)):function(e,t){var a=Object.keys(e);if(Object.getOwnPropertySymbols){var r=Object.getOwnPropertySymbols(e);t&&(r=r.filter((function(t){return Object.getOwnPropertyDescriptor(e,t).enumerable}))),a.push.apply(a,r)}return a}(Object(t)).forEach((function(a){Object.defineProperty(e,a,Object.getOwnPropertyDescriptor(t,a))})),e}function i(e,t){if(null==e)return{};var a,r,n=function(e,t){if(null==e)return{};var a,r,n={},s=Object.keys(e);for(r=0;r<s.length;r++)a=s[r],t.indexOf(a)>=0||(n[a]=e[a]);return n}(e,t);if(Object.getOwnPropertySymbols){var s=Object.getOwnPropertySymbols(e);for(r=0;r<s.length;r++)a=s[r],t.indexOf(a)>=0||Object.prototype.propertyIsEnumerable.call(e,a)&&(n[a]=e[a])}return n}const m={title:"Terms",slug:"/api/tutorials/terms",custom_edit_url:"https://github.com/datahub-project/datahub/blob/master/docs/api/tutorials/terms.md"},u="Terms",d={unversionedId:"docs/api/tutorials/terms",id:"version-1.1.0/docs/api/tutorials/terms",title:"Terms",description:"Why Would You Use Terms on 
Datasets?",source:"@site/versioned_docs/version-1.1.0/docs/api/tutorials/terms.md",sourceDirName:"docs/api/tutorials",slug:"/api/tutorials/terms",permalink:"/docs/1.1.0/api/tutorials/terms",draft:!1,editUrl:"https://github.com/datahub-project/datahub/blob/master/docs/api/tutorials/terms.md",tags:[],version:"1.1.0",frontMatter:{title:"Terms",slug:"/api/tutorials/terms",custom_edit_url:"https://github.com/datahub-project/datahub/blob/master/docs/api/tutorials/terms.md"},sidebar:"overviewSidebar",previous:{title:"Tags",permalink:"/docs/1.1.0/api/tutorials/tags"},next:{title:"Emitting Patch Updates to DataHub",permalink:"/docs/1.1.0/advanced/patch"}},p={},c=[{value:"Why Would You Use Terms on Datasets?",id:"why-would-you-use-terms-on-datasets",level:2},{value:"Goal Of This Guide",id:"goal-of-this-guide",level:3},{value:"Prerequisites",id:"prerequisites",level:2},{value:"Create Terms",id:"create-terms",level:2},{value:"Expected Outcome of Creating Terms",id:"expected-outcome-of-creating-terms",level:3},{value:"Read Terms",id:"read-terms",level:2},{value:"Add Terms",id:"add-terms",level:2},{value:"Add Terms to a dataset",id:"add-terms-to-a-dataset",level:3},{value:"Add Terms to a Column of a Dataset",id:"add-terms-to-a-column-of-a-dataset",level:3},{value:"Expected Outcome of Adding Terms",id:"expected-outcome-of-adding-terms",level:3},{value:"Remove Terms",id:"remove-terms",level:2},{value:"Expected Outcome of Removing Terms",id:"expected-outcome-of-removing-terms",level:3}],g={toc:c},y="wrapper";function h(e){var{components:t}=e,a=i(e,["components"]);return(0,r.yg)(y,l(function(e){for(var t=1;t<arguments.length;t++){var a=null!=arguments[t]?arguments[t]:{},r=Object.keys(a);"function"==typeof Object.getOwnPropertySymbols&&(r=r.concat(Object.getOwnPropertySymbols(a).filter((function(e){return Object.getOwnPropertyDescriptor(a,e).enumerable})))),r.forEach((function(t){o(e,t,a[t])}))}return 
e}({},g,a),{components:t,mdxType:"MDXLayout"}),(0,r.yg)("h1",{id:"terms"},"Terms"),(0,r.yg)("h2",{id:"why-would-you-use-terms-on-datasets"},"Why Would You Use Terms on Datasets?"),(0,r.yg)("p",null,"The Business Glossary(Term) feature in DataHub helps you use a shared vocabulary within the orgarnization, by providing a framework for defining a standardized set of data concepts and then associating them with the physical assets that exist within your data ecosystem."),(0,r.yg)("p",null,"For more information about terms, refer to ",(0,r.yg)("a",{parentName:"p",href:"/docs/1.1.0/glossary/business-glossary"},"About DataHub Business Glossary"),"."),(0,r.yg)("h3",{id:"goal-of-this-guide"},"Goal Of This Guide"),(0,r.yg)("p",null,"This guide will show you how to"),(0,r.yg)("ul",null,(0,r.yg)("li",{parentName:"ul"},"Create: create a term."),(0,r.yg)("li",{parentName:"ul"},"Read : read terms attached to a dataset."),(0,r.yg)("li",{parentName:"ul"},"Add: add a term to a column of a dataset or a dataset itself."),(0,r.yg)("li",{parentName:"ul"},"Remove: remove a term from a dataset.")),(0,r.yg)("h2",{id:"prerequisites"},"Prerequisites"),(0,r.yg)("p",null,"For this tutorial, you need to deploy DataHub Quickstart and ingest sample data.\nFor detailed information, please refer to ",(0,r.yg)("a",{parentName:"p",href:"/docs/1.1.0/quickstart"},"Datahub Quickstart Guide"),"."),(0,r.yg)("admonition",{type:"note"},(0,r.yg)("p",{parentName:"admonition"},"Before modifying terms, you need to ensure the target dataset is already present in your DataHub instance.\nIf you attempt to manipulate entities that do not exist, your operation will fail.\nIn this guide, we will be using data from sample ingestion.")),(0,r.yg)("p",null,"For more information on how to set up for GraphQL, please refer to ",(0,r.yg)("a",{parentName:"p",href:"/docs/1.1.0/api/graphql/how-to-set-up-graphql"},"How To Set Up GraphQL"),"."),(0,r.yg)("h2",{id:"create-terms"},"Create Terms"),(0,r.yg)("p",null,"The following 
code creates a term ",(0,r.yg)("inlineCode",{parentName:"p"},"Rate of Return"),"."),(0,r.yg)(n.A,{mdxType:"Tabs"},(0,r.yg)(s.A,{value:"graphql",label:"GraphQL",default:!0,mdxType:"TabItem"},(0,r.yg)("pre",null,(0,r.yg)("code",{parentName:"pre",className:"language-json"},'mutation createGlossaryTerm {\n createGlossaryTerm(input: {\n name: "Rate of Return",\n id: "rateofreturn",\n description: "A rate of return (RoR) is the net gain or loss of an investment over a specified time period."\n },\n )\n}\n')),(0,r.yg)("p",null,"If you see the following response, the operation was successful:"),(0,r.yg)("pre",null,(0,r.yg)("code",{parentName:"pre",className:"language-python"},'{\n "data": {\n "createGlossaryTerm": "urn:li:glossaryTerm:rateofreturn"\n },\n "extensions": {}\n}\n'))),(0,r.yg)(s.A,{value:"curl",label:"Curl",mdxType:"TabItem"},(0,r.yg)("pre",null,(0,r.yg)("code",{parentName:"pre",className:"language-shell"},'curl --location --request POST \'http://localhost:8080/api/graphql\' \\\n--header \'Authorization: Bearer <my-access-token>\' \\\n--header \'Content-Type: application/json\' \\\n--data-raw \'{ "query": "mutation createGlossaryTerm { createGlossaryTerm(input: { name: \\"Rate of Return\\", id:\\"rateofreturn\\", description: \\"A rate of return (RoR) is the net gain or loss of an investment over a specified time period.\\" }) }", "variables":{}}\'\n')),(0,r.yg)("p",null,"Expected Response:"),(0,r.yg)("pre",null,(0,r.yg)("code",{parentName:"pre",className:"language-json"},'{\n "data": { "createGlossaryTerm": "urn:li:glossaryTerm:rateofreturn" },\n "extensions": {}\n}\n'))),(0,r.yg)(s.A,{value:"python",label:"Python",mdxType:"TabItem"},(0,r.yg)("pre",null,(0,r.yg)("code",{parentName:"pre",className:"language-python"},'# Inlined from /metadata-ingestion/examples/library/create_term.py\nimport logging\n\nfrom datahub.emitter.mce_builder import make_term_urn\nfrom datahub.emitter.mcp import MetadataChangeProposalWrapper\nfrom datahub.emitter.rest_emitter import 
DatahubRestEmitter\n\n# Imports for metadata model classes\nfrom datahub.metadata.schema_classes import GlossaryTermInfoClass\n\nlog = logging.getLogger(__name__)\nlogging.basicConfig(level=logging.INFO)\n\nterm_urn = make_term_urn("rateofreturn")\nterm_properties_aspect = GlossaryTermInfoClass(\n definition="A rate of return (RoR) is the net gain or loss of an investment over a specified time period.",\n name="Rate of Return",\n termSource="",\n)\n\nevent: MetadataChangeProposalWrapper = MetadataChangeProposalWrapper(\n entityUrn=term_urn,\n aspect=term_properties_aspect,\n)\n\n# Create rest emitter\nrest_emitter = DatahubRestEmitter(gms_server="http://localhost:8080")\nrest_emitter.emit(event)\nlog.info(f"Created term {term_urn}")\n\n')))),(0,r.yg)("h3",{id:"expected-outcome-of-creating-terms"},"Expected Outcome of Creating Terms"),(0,r.yg)("p",null,"You can now see the new term ",(0,r.yg)("inlineCode",{parentName:"p"},"Rate of Return")," has been created."),(0,r.yg)("p",{align:"center"},(0,r.yg)("img",{width:"70%",src:"https://raw.githubusercontent.com/datahub-project/static-assets/main/imgs/apis/tutorials/term-created.png"})),(0,r.yg)("p",null,"We can also verify this operation by programmatically searching ",(0,r.yg)("inlineCode",{parentName:"p"},"Rate of Return")," term after running this code using the ",(0,r.yg)("inlineCode",{parentName:"p"},"datahub")," cli."),(0,r.yg)("pre",null,(0,r.yg)("code",{parentName:"pre",className:"language-shell"},'datahub get --urn "urn:li:glossaryTerm:rateofreturn" --aspect glossaryTermInfo\n\n{\n "glossaryTermInfo": {\n "definition": "A rate of return (RoR) is the net gain or loss of an investment over a specified time period.",\n "name": "Rate of Return",\n "termSource": "INTERNAL"\n }\n}\n')),(0,r.yg)("h2",{id:"read-terms"},"Read 
Terms"),(0,r.yg)(n.A,{mdxType:"Tabs"},(0,r.yg)(s.A,{value:"graphql",label:"GraphQL",default:!0,mdxType:"TabItem"},(0,r.yg)("pre",null,(0,r.yg)("code",{parentName:"pre",className:"language-json"},'query {\n dataset(urn: "urn:li:dataset:(urn:li:dataPlatform:hive,fct_users_created,PROD)") {\n glossaryTerms {\n terms {\n term {\n urn\n glossaryTermInfo {\n name\n description\n }\n }\n }\n }\n }\n}\n')),(0,r.yg)("p",null,"If you see the following response, the operation was successful:"),(0,r.yg)("pre",null,(0,r.yg)("code",{parentName:"pre",className:"language-python"},'{\n "data": {\n "dataset": {\n "glossaryTerms": {\n "terms": [\n {\n "term": {\n "urn": "urn:li:glossaryTerm:CustomerAccount",\n "glossaryTermInfo": {\n "name": "CustomerAccount",\n "description": "account that represents an identified, named collection of balances and cumulative totals used to summarize customer transaction-related activity over a designated period of time"\n }\n }\n }\n ]\n }\n }\n },\n "extensions": {}\n}\n'))),(0,r.yg)(s.A,{value:"curl",label:"Curl",mdxType:"TabItem"},(0,r.yg)("pre",null,(0,r.yg)("code",{parentName:"pre",className:"language-shell"},"curl --location --request POST 'http://localhost:8080/api/graphql' \\\n--header 'Authorization: Bearer <my-access-token>' \\\n--header 'Content-Type: application/json' \\\n--data-raw '{ \"query\": \"{dataset(urn: \\\"urn:li:dataset:(urn:li:dataPlatform:hive,fct_users_created,PROD)\\\") {glossaryTerms {terms {term {urn glossaryTermInfo { name description } } } } } }\", \"variables\":{}}'\n")),(0,r.yg)("p",null,"Expected Response:"),(0,r.yg)("pre",null,(0,r.yg)("code",{parentName:"pre",className:"language-json"},'{"data":{"dataset":{"glossaryTerms":{"terms":[{"term":{"urn":"urn:li:glossaryTerm:CustomerAccount","glossaryTermInfo":{"name":"CustomerAccount","description":"account that represents an identified, named collection of balances and cumulative totals used to summarize customer transaction-related activity over a designated period of 
time"}}}]}}},"extensions":{}}```\n'))),(0,r.yg)(s.A,{value:"python",label:"Python",mdxType:"TabItem"},(0,r.yg)("pre",null,(0,r.yg)("code",{parentName:"pre",className:"language-python"},'# Inlined from /metadata-ingestion/examples/library/dataset_query_terms.py\nfrom datahub.sdk import DataHubClient, DatasetUrn\n\nclient = DataHubClient.from_env()\n\ndataset = client.entities.get(\n DatasetUrn(platform="hive", name="realestate_db.sales", env="PROD")\n)\n\nprint(dataset.terms)\n\n')))),(0,r.yg)("h2",{id:"add-terms"},"Add Terms"),(0,r.yg)("h3",{id:"add-terms-to-a-dataset"},"Add Terms to a dataset"),(0,r.yg)("p",null,"The following code shows you how can add terms to a dataset.\nIn the following code, we add a term ",(0,r.yg)("inlineCode",{parentName:"p"},"Rate of Return")," to a dataset named ",(0,r.yg)("inlineCode",{parentName:"p"},"fct_users_created"),"."),(0,r.yg)(n.A,{mdxType:"Tabs"},(0,r.yg)(s.A,{value:"graphql",label:"GraphQL",default:!0,mdxType:"TabItem"},(0,r.yg)("pre",null,(0,r.yg)("code",{parentName:"pre",className:"language-json"},'mutation addTerms {\n addTerms(\n input: {\n termUrns: ["urn:li:glossaryTerm:rateofreturn"],\n resourceUrn: "urn:li:dataset:(urn:li:dataPlatform:hive,fct_users_created,PROD)",\n }\n )\n}\n')),(0,r.yg)("p",null,"If you see the following response, the operation was successful:"),(0,r.yg)("pre",null,(0,r.yg)("code",{parentName:"pre",className:"language-python"},'{\n "data": {\n "addTerms": true\n },\n "extensions": {}\n}\n'))),(0,r.yg)(s.A,{value:"curl",label:"Curl",mdxType:"TabItem"},(0,r.yg)("pre",null,(0,r.yg)("code",{parentName:"pre",className:"language-shell"},'curl --location --request POST \'http://localhost:8080/api/graphql\' \\\n--header \'Authorization: Bearer <my-access-token>\' \\\n--header \'Content-Type: application/json\' \\\n--data-raw \'{ "query": "mutation addTerm { addTerms(input: { termUrns: [\\"urn:li:glossaryTerm:rateofreturn\\"], resourceUrn: 
\\"urn:li:dataset:(urn:li:dataPlatform:hive,fct_users_created,PROD)\\" }) }", "variables":{}}\'\n')),(0,r.yg)("p",null,"Expected Response:"),(0,r.yg)("pre",null,(0,r.yg)("code",{parentName:"pre",className:"language-json"},'{ "data": { "addTerms": true }, "extensions": {} }\n'))),(0,r.yg)(s.A,{value:"python",label:"Python",mdxType:"TabItem"},(0,r.yg)("pre",null,(0,r.yg)("code",{parentName:"pre",className:"language-python"},'# Inlined from /metadata-ingestion/examples/library/dataset_add_term.py\nfrom datahub.sdk import DataHubClient, DatasetUrn, GlossaryTermUrn\n\nclient = DataHubClient.from_env()\n\ndataset = client.entities.get(\n DatasetUrn(platform="hive", name="realestate_db.sales", env="PROD")\n)\ndataset.add_term(GlossaryTermUrn("Classification.HighlyConfidential"))\n\nclient.entities.update(dataset)\n\n')))),(0,r.yg)("h3",{id:"add-terms-to-a-column-of-a-dataset"},"Add Terms to a Column of a Dataset"),(0,r.yg)(n.A,{mdxType:"Tabs"},(0,r.yg)(s.A,{value:"graphql",label:"GraphQL",mdxType:"TabItem"},(0,r.yg)("pre",null,(0,r.yg)("code",{parentName:"pre",className:"language-json"},'mutation addTerms {\n addTerms(\n input: {\n termUrns: ["urn:li:glossaryTerm:rateofreturn"],\n resourceUrn: "urn:li:dataset:(urn:li:dataPlatform:hive,fct_users_created,PROD)",\n subResourceType:DATASET_FIELD,\n subResource:"user_name"})\n}\n'))),(0,r.yg)(s.A,{value:"curl",label:"Curl",mdxType:"TabItem"},(0,r.yg)("pre",null,(0,r.yg)("code",{parentName:"pre",className:"language-shell"},'curl --location --request POST \'http://localhost:8080/api/graphql\' \\\n--header \'Authorization: Bearer <my-access-token>\' \\\n--header \'Content-Type: application/json\' \\\n--data-raw \'{ "query": "mutation addTerms { addTerms(input: { termUrns: [\\"urn:li:glossaryTerm:rateofreturn\\"], resourceUrn: \\"urn:li:dataset:(urn:li:dataPlatform:hive,fct_users_created,PROD)\\", subResourceType: DATASET_FIELD, subResource: \\"user_name\\" }) }", "variables":{}}\'\n')),(0,r.yg)("p",null,"Expected 
Response:"),(0,r.yg)("pre",null,(0,r.yg)("code",{parentName:"pre",className:"language-json"},'{ "data": { "addTerms": true }, "extensions": {} }\n'))),(0,r.yg)(s.A,{value:"python",label:"Python",mdxType:"TabItem"},(0,r.yg)("pre",null,(0,r.yg)("code",{parentName:"pre",className:"language-python"},'# Inlined from /metadata-ingestion/examples/library/dataset_add_column_term.py\nfrom datahub.sdk import DataHubClient, DatasetUrn, GlossaryTermUrn\n\nclient = DataHubClient.from_env()\n\ndataset = client.entities.get(\n DatasetUrn(platform="hive", name="realestate_db.sales", env="PROD")\n)\n\ndataset["address.zipcode"].add_term(GlossaryTermUrn("Classification.Location"))\n\nclient.entities.update(dataset)\n\n')))),(0,r.yg)("h3",{id:"expected-outcome-of-adding-terms"},"Expected Outcome of Adding Terms"),(0,r.yg)("p",null,"You can now see ",(0,r.yg)("inlineCode",{parentName:"p"},"Rate of Return")," term has been added to ",(0,r.yg)("inlineCode",{parentName:"p"},"user_name")," column."),(0,r.yg)("p",{align:"center"},(0,r.yg)("img",{width:"70%",src:"https://raw.githubusercontent.com/datahub-project/static-assets/main/imgs/apis/tutorials/term-added.png"})),(0,r.yg)("h2",{id:"remove-terms"},"Remove Terms"),(0,r.yg)("p",null,"The following code remove a term from a dataset.\nAfter running this code, ",(0,r.yg)("inlineCode",{parentName:"p"},"Rate of Return")," term will be removed from a ",(0,r.yg)("inlineCode",{parentName:"p"},"user_name")," column."),(0,r.yg)(n.A,{mdxType:"Tabs"},(0,r.yg)(s.A,{value:"graphql",label:"GraphQL",default:!0,mdxType:"TabItem"},(0,r.yg)("pre",null,(0,r.yg)("code",{parentName:"pre",className:"language-json"},'mutation removeTerm {\n removeTerm(\n input: {\n termUrn: "urn:li:glossaryTerm:rateofreturn",\n resourceUrn: "urn:li:dataset:(urn:li:dataPlatform:hive,fct_users_created,PROD)",\n subResourceType:DATASET_FIELD,\n subResource:"user_name"})\n}\n')),(0,r.yg)("p",null,"Note that you can also remove a term from a dataset if you don't specify 
",(0,r.yg)("inlineCode",{parentName:"p"},"subResourceType")," and ",(0,r.yg)("inlineCode",{parentName:"p"},"subResource"),"."),(0,r.yg)("pre",null,(0,r.yg)("code",{parentName:"pre",className:"language-json"},'mutation removeTerm {\n removeTerm(\n input: {\n termUrn: "urn:li:glossaryTerm:rateofreturn",\n resourceUrn: "urn:li:dataset:(urn:li:dataPlatform:hive,fct_users_created,PROD)",\n })\n}\n')),(0,r.yg)("p",null,"Also note that you can remove terms from multiple entities or subresource using ",(0,r.yg)("inlineCode",{parentName:"p"},"batchRemoveTerms"),"."),(0,r.yg)("pre",null,(0,r.yg)("code",{parentName:"pre",className:"language-json"},'mutation batchRemoveTerms {\n batchRemoveTerms(\n input: {\n termUrns: ["urn:li:glossaryTerm:rateofreturn"],\n resources: [\n { resourceUrn:"urn:li:dataset:(urn:li:dataPlatform:hdfs,SampleHdfsDataset,PROD)"} ,\n { resourceUrn:"urn:li:dataset:(urn:li:dataPlatform:hive,fct_users_created,PROD)"} ,]\n }\n )\n}\n'))),(0,r.yg)(s.A,{value:"curl",label:"Curl",mdxType:"TabItem"},(0,r.yg)("pre",null,(0,r.yg)("code",{parentName:"pre",className:"language-shell"},'curl --location --request POST \'http://localhost:8080/api/graphql\' \\\n--header \'Authorization: Bearer <my-access-token>\' \\\n--header \'Content-Type: application/json\' \\\n--data-raw \'{ "query": "mutation removeTerm { removeTerm(input: { termUrn: \\"urn:li:glossaryTerm:rateofreturn\\", resourceUrn: \\"urn:li:dataset:(urn:li:dataPlatform:hdfs,SampleHdfsDataset,PROD)\\" }) }", "variables":{}}\'\n'))),(0,r.yg)(s.A,{value:"python",label:"Python",mdxType:"TabItem"},(0,r.yg)("pre",null,(0,r.yg)("code",{parentName:"pre",className:"language-python"},'# Inlined from /metadata-ingestion/examples/library/dataset_remove_term_execute_graphql.py\n# read-modify-write requires access to the DataHubGraph (RestEmitter is not enough)\nfrom datahub.ingestion.graph.client import DatahubClientConfig, DataHubGraph\n\ngms_endpoint = "http://localhost:8080"\ngraph = 
DataHubGraph(DatahubClientConfig(server=gms_endpoint))\n\n# Query multiple aspects from entity\nquery = """\nmutation batchRemoveTerms {\n batchRemoveTerms(\n input: {\n termUrns: ["urn:li:glossaryTerm:rateofreturn"],\n resources: [\n { resourceUrn:"urn:li:dataset:(urn:li:dataPlatform:hdfs,SampleHdfsDataset,PROD)"} ,\n { resourceUrn:"urn:li:dataset:(urn:li:dataPlatform:hive,fct_users_created,PROD)"} ,]\n }\n )\n}\n"""\nresult = graph.execute_graphql(query=query)\n\nprint(result)\n\n')))),(0,r.yg)("h3",{id:"expected-outcome-of-removing-terms"},"Expected Outcome of Removing Terms"),(0,r.yg)("p",null,"You can now see ",(0,r.yg)("inlineCode",{parentName:"p"},"Rate of Return")," term has been removed to ",(0,r.yg)("inlineCode",{parentName:"p"},"user_name")," column."),(0,r.yg)("p",{align:"center"},(0,r.yg)("img",{width:"70%",src:"https://raw.githubusercontent.com/datahub-project/static-assets/main/imgs/apis/tutorials/term-removed.png"})))}h.isMDXComponent=!0}}]); |