datahub/assets/js/134c2490.e718a600.js

1 line
9.2 KiB
JavaScript

"use strict";(self.webpackChunkdocs_website=self.webpackChunkdocs_website||[]).push([[48685],{22447:(a,t,e)=>{e.r(t),e.d(t,{assets:()=>m,contentTitle:()=>s,default:()=>w,frontMatter:()=>p,metadata:()=>u,toc:()=>b});e(96540);var n=e(15680),o=e(53720),l=e(5400);function r(a,t,e){return t in a?Object.defineProperty(a,t,{value:e,enumerable:!0,configurable:!0,writable:!0}):a[t]=e,a}function i(a,t){return t=null!=t?t:{},Object.getOwnPropertyDescriptors?Object.defineProperties(a,Object.getOwnPropertyDescriptors(t)):function(a,t){var e=Object.keys(a);if(Object.getOwnPropertySymbols){var n=Object.getOwnPropertySymbols(a);t&&(n=n.filter((function(t){return Object.getOwnPropertyDescriptor(a,t).enumerable}))),e.push.apply(e,n)}return e}(Object(t)).forEach((function(e){Object.defineProperty(a,e,Object.getOwnPropertyDescriptor(t,e))})),a}function d(a,t){if(null==a)return{};var e,n,o=function(a,t){if(null==a)return{};var e,n,o={},l=Object.keys(a);for(n=0;n<l.length;n++)e=l[n],t.indexOf(e)>=0||(o[e]=a[e]);return o}(a,t);if(Object.getOwnPropertySymbols){var l=Object.getOwnPropertySymbols(a);for(n=0;n<l.length;n++)e=l[n],t.indexOf(e)>=0||Object.prototype.propertyIsEnumerable.call(a,e)&&(o[e]=a[e])}return o}const p={title:"DataFlow & DataJob",slug:"/api/tutorials/dataflow-datajob",custom_edit_url:"https://github.com/datahub-project/datahub/blob/master/docs/api/tutorials/dataflow-datajob.md"},s="DataFlow & DataJob",u={unversionedId:"docs/api/tutorials/dataflow-datajob",id:"docs/api/tutorials/dataflow-datajob",title:"DataFlow & DataJob",description:"Why Would You Use DataFlow and DataJob?",source:"@site/genDocs/docs/api/tutorials/dataflow-datajob.md",sourceDirName:"docs/api/tutorials",slug:"/api/tutorials/dataflow-datajob",permalink:"/docs/api/tutorials/dataflow-datajob",draft:!1,editUrl:"https://github.com/datahub-project/datahub/blob/master/docs/api/tutorials/dataflow-datajob.md",tags:[],version:"current",frontMatter:{title:"DataFlow & DataJob",slug:"/api/tutorials/dataflow-datajob",custom_edit_url:"https://github.com/datahub-project/datahub/blob/master/docs/api/tutorials/dataflow-datajob.md"},sidebar:"overviewSidebar",previous:{title:"Dashboard & Chart",permalink:"/docs/api/tutorials/dashboard-chart"},next:{title:"MLModel & MLModelGroup",permalink:"/docs/api/tutorials/mlmodel-mlmodelgroup"}},m={},b=[{value:"Why Would You Use DataFlow and DataJob?",id:"why-would-you-use-dataflow-and-datajob",level:2},{value:"Goal Of This Guide",id:"goal-of-this-guide",level:3},{value:"Prerequisites",id:"prerequisites",level:2},{value:"Create DataFlow",id:"create-dataflow",level:2},{value:"Create DataJob",id:"create-datajob",level:2},{value:"Read DataFlow",id:"read-dataflow",level:2},{value:"Example Output",id:"example-output",level:4},{value:"Read DataJob",id:"read-datajob",level:2},{value:"Example Output",id:"example-output-1",level:4}],f={toc:b},c="wrapper";function w(a){var{components:t}=a,e=d(a,["components"]);return(0,n.yg)(c,i(function(a){for(var t=1;t<arguments.length;t++){var e=null!=arguments[t]?arguments[t]:{},n=Object.keys(e);"function"==typeof Object.getOwnPropertySymbols&&(n=n.concat(Object.getOwnPropertySymbols(e).filter((function(a){return Object.getOwnPropertyDescriptor(e,a).enumerable})))),n.forEach((function(t){r(a,t,e[t])}))}return a}({},f,e),{components:t,mdxType:"MDXLayout"}),(0,n.yg)("h1",{id:"dataflow--datajob"},"DataFlow & DataJob"),(0,n.yg)("h2",{id:"why-would-you-use-dataflow-and-datajob"},"Why Would You Use DataFlow and DataJob?"),(0,n.yg)("p",null,"The DataFlow and DataJob entities are used to represent data processing pipelines and jobs within a data ecosystem. They allow users to define, manage, and monitor the flow of data through various stages of processing, from ingestion to transformation and storage."),(0,n.yg)("h3",{id:"goal-of-this-guide"},"Goal Of This Guide"),(0,n.yg)("p",null,"This guide will show you how to"),(0,n.yg)("ul",null,(0,n.yg)("li",{parentName:"ul"},"Create a DataFlow."),(0,n.yg)("li",{parentName:"ul"},"Create a Datajob with a DataFlow.")),(0,n.yg)("h2",{id:"prerequisites"},"Prerequisites"),(0,n.yg)("p",null,"For this tutorial, you need to deploy DataHub Quickstart and ingest sample data.\nFor detailed steps, please refer to ",(0,n.yg)("a",{parentName:"p",href:"/docs/quickstart"},"Datahub Quickstart Guide"),"."),(0,n.yg)("h2",{id:"create-dataflow"},"Create DataFlow"),(0,n.yg)(o.A,{mdxType:"Tabs"},(0,n.yg)(l.A,{value:"python",label:"Python",default:!0,mdxType:"TabItem"},(0,n.yg)("pre",null,(0,n.yg)("code",{parentName:"pre",className:"language-python"},'# Inlined from /metadata-ingestion/examples/library/create_dataflow.py\nfrom datahub.metadata.urns import TagUrn\nfrom datahub.sdk import DataFlow, DataHubClient\n\nclient = DataHubClient.from_env()\n\ndataflow = DataFlow(\n name="example_dataflow",\n platform="airflow",\n description="airflow pipeline for production",\n tags=[TagUrn(name="production"), TagUrn(name="data_engineering")],\n)\n\nclient.entities.upsert(dataflow)\n\n')))),(0,n.yg)("h2",{id:"create-datajob"},"Create DataJob"),(0,n.yg)("p",null,"DataJob must be associated with a DataFlow. You can create a DataJob by providing the DataFlow object or the DataFlow URN and its platform instance."),(0,n.yg)(o.A,{mdxType:"Tabs"},(0,n.yg)(l.A,{value:"python",label:"Create DataJob with a DataFlow Object",default:!0,mdxType:"TabItem"},"```python # Inlined from /metadata-ingestion/examples/library/create_datajob.py from datahub.metadata.urns import DatasetUrn, TagUrn from datahub.sdk import DataFlow, DataHubClient, DataJob",(0,n.yg)("p",null,"client = DataHubClient.from_env()"),(0,n.yg)("h1",{id:"datajob-will-inherit-the-platform-and-platform-instance-from-the-flow"},"datajob will inherit the platform and platform instance from the flow"),(0,n.yg)("p",null,'dataflow = DataFlow(\nplatform="airflow",\nname="example_dag",\nplatform_instance="PROD",\ndescription="example dataflow",\ntags=','[TagUrn(name="tag1"), TagUrn(name="tag2")]',",\n)"),(0,n.yg)("p",null,'datajob = DataJob(\nname="example_datajob",\nflow=dataflow,\ninlets=','[\nDatasetUrn(platform="hdfs", name="dataset1", env="PROD"),\n]',",\noutlets=",'[\nDatasetUrn(platform="hdfs", name="dataset2", env="PROD"),\n]',",\n)"),(0,n.yg)("p",null,"client.entities.upsert(dataflow)\nclient.entities.upsert(datajob)"),(0,n.yg)("pre",null,(0,n.yg)("code",{parentName:"pre"},'\n</TabItem>\n<TabItem value="python" label="Create DataJob with DataFlow URN">\n```python\n# Inlined from /metadata-ingestion/examples/library/create_datajob_with_flow_urn.py\nfrom datahub.metadata.urns import DataFlowUrn, DatasetUrn\nfrom datahub.sdk import DataHubClient, DataJob\n\nclient = DataHubClient.from_env()\n\n# datajob will inherit the platform and platform instance from the flow\n\ndatajob = DataJob(\n name="example_datajob",\n flow_urn=DataFlowUrn(\n orchestrator="airflow",\n flow_id="example_dag",\n cluster="PROD",\n ),\n platform_instance="PROD",\n inlets=[\n DatasetUrn(platform="hdfs", name="dataset1", env="PROD"),\n ],\n outlets=[\n DatasetUrn(platform="hdfs", name="dataset2", env="PROD"),\n ],\n)\n\nclient.entities.upsert(datajob)\n\n')))),(0,n.yg)("h2",{id:"read-dataflow"},"Read DataFlow"),(0,n.yg)("pre",null,(0,n.yg)("code",{parentName:"pre",className:"language-python"},'# Inlined from /metadata-ingestion/examples/library/read_dataflow.py\nfrom datahub.metadata.urns import TagUrn\nfrom datahub.sdk import DataFlow, DataHubClient\n\nclient = DataHubClient.from_env()\n\ndataflow = DataFlow(\n name="example_dataflow",\n platform="airflow",\n description="airflow pipeline for production",\n tags=[TagUrn(name="production"), TagUrn(name="data_engineering")],\n)\n\nclient.entities.upsert(dataflow)\n\ndataflow_entity = client.entities.get(dataflow.urn)\nprint("DataFlow name:", dataflow_entity.name)\nprint("DataFlow platform:", dataflow_entity.platform)\nprint("DataFlow description:", dataflow_entity.description)\n\n')),(0,n.yg)("h4",{id:"example-output"},"Example Output"),(0,n.yg)("pre",null,(0,n.yg)("code",{parentName:"pre",className:"language-python"},">> DataFlow name: example_dataflow\n>> DataFlow platform: urn:li:dataPlatform:airflow\n>> DataFlow description: airflow pipeline for production\n")),(0,n.yg)("h2",{id:"read-datajob"},"Read DataJob"),(0,n.yg)("pre",null,(0,n.yg)("code",{parentName:"pre",className:"language-python"},'# Inlined from /metadata-ingestion/examples/library/read_datajob.py\nfrom datahub.sdk import DataFlow, DataHubClient, DataJob\n\nclient = DataHubClient.from_env()\n\ndataflow = DataFlow(\n platform="airflow",\n name="example_dag",\n platform_instance="PROD",\n)\n\n# datajob will inherit the platform and platform instance from the flow\ndatajob = DataJob(\n name="example_datajob",\n description="example datajob",\n flow=dataflow,\n)\n\nclient.entities.upsert(dataflow)\nclient.entities.upsert(datajob)\n\ndatajob_entity = client.entities.get(datajob.urn)\n\nprint("DataJob name:", datajob_entity.name)\nprint("DataJob Flow URN:", datajob_entity.flow_urn)\nprint("DataJob description:", datajob_entity.description)\n\n')),(0,n.yg)("h4",{id:"example-output-1"},"Example Output"),(0,n.yg)("pre",null,(0,n.yg)("code",{parentName:"pre",className:"language-python"},">> DataJob name: example_datajob\n>> DataJob Flow URN: urn:li:dataFlow:(airflow,PROD.example_dag,PROD)\n>> DataJob description: example datajob\n")))}w.isMDXComponent=!0}}]);