"use strict";(self.webpackChunkdocs_website=self.webpackChunkdocs_website||[]).push([[48685],{22447:(a,t,e)=>{e.r(t),e.d(t,{assets:()=>m,contentTitle:()=>s,default:()=>w,frontMatter:()=>p,metadata:()=>u,toc:()=>b});e(96540);var n=e(15680),o=e(53720),l=e(5400);function r(a,t,e){return t in a?Object.defineProperty(a,t,{value:e,enumerable:!0,configurable:!0,writable:!0}):a[t]=e,a}function i(a,t){return t=null!=t?t:{},Object.getOwnPropertyDescriptors?Object.defineProperties(a,Object.getOwnPropertyDescriptors(t)):function(a,t){var e=Object.keys(a);if(Object.getOwnPropertySymbols){var n=Object.getOwnPropertySymbols(a);t&&(n=n.filter((function(t){return Object.getOwnPropertyDescriptor(a,t).enumerable}))),e.push.apply(e,n)}return e}(Object(t)).forEach((function(e){Object.defineProperty(a,e,Object.getOwnPropertyDescriptor(t,e))})),a}function d(a,t){if(null==a)return{};var e,n,o=function(a,t){if(null==a)return{};var e,n,o={},l=Object.keys(a);for(n=0;n=0||(o[e]=a[e]);return o}(a,t);if(Object.getOwnPropertySymbols){var l=Object.getOwnPropertySymbols(a);for(n=0;n=0||Object.prototype.propertyIsEnumerable.call(a,e)&&(o[e]=a[e])}return o}const p={title:"DataFlow & DataJob",slug:"/api/tutorials/dataflow-datajob",custom_edit_url:"https://github.com/datahub-project/datahub/blob/master/docs/api/tutorials/dataflow-datajob.md"},s="DataFlow & DataJob",u={unversionedId:"docs/api/tutorials/dataflow-datajob",id:"docs/api/tutorials/dataflow-datajob",title:"DataFlow & DataJob",description:"Why Would You Use DataFlow and DataJob?",source:"@site/genDocs/docs/api/tutorials/dataflow-datajob.md",sourceDirName:"docs/api/tutorials",slug:"/api/tutorials/dataflow-datajob",permalink:"/docs/api/tutorials/dataflow-datajob",draft:!1,editUrl:"https://github.com/datahub-project/datahub/blob/master/docs/api/tutorials/dataflow-datajob.md",tags:[],version:"current",frontMatter:{title:"DataFlow & DataJob",slug:"/api/tutorials/dataflow-datajob",custom_edit_url:"https://github.com/datahub-project/datahub/blob/master/docs/api/tutorials/dataflow-datajob.md"},sidebar:"overviewSidebar",previous:{title:"Dashboard & Chart",permalink:"/docs/api/tutorials/dashboard-chart"},next:{title:"MLModel & MLModelGroup",permalink:"/docs/api/tutorials/mlmodel-mlmodelgroup"}},m={},b=[{value:"Why Would You Use DataFlow and DataJob?",id:"why-would-you-use-dataflow-and-datajob",level:2},{value:"Goal Of This Guide",id:"goal-of-this-guide",level:3},{value:"Prerequisites",id:"prerequisites",level:2},{value:"Create DataFlow",id:"create-dataflow",level:2},{value:"Create DataJob",id:"create-datajob",level:2},{value:"Read DataFlow",id:"read-dataflow",level:2},{value:"Example Output",id:"example-output",level:4},{value:"Read DataJob",id:"read-datajob",level:2},{value:"Example Output",id:"example-output-1",level:4}],f={toc:b},c="wrapper";function w(a){var{components:t}=a,e=d(a,["components"]);return(0,n.yg)(c,i(function(a){for(var t=1;t\n\n```python\n# Inlined from /metadata-ingestion/examples/library/create_datajob_with_flow_urn.py\nfrom datahub.metadata.urns import DataFlowUrn, DatasetUrn\nfrom datahub.sdk import DataHubClient, DataJob\n\nclient = DataHubClient.from_env()\n\n# datajob will inherit the platform and platform instance from the flow\n\ndatajob = DataJob(\n name="example_datajob",\n flow_urn=DataFlowUrn(\n orchestrator="airflow",\n flow_id="example_dag",\n cluster="PROD",\n ),\n platform_instance="PROD",\n inlets=[\n DatasetUrn(platform="hdfs", name="dataset1", env="PROD"),\n ],\n outlets=[\n DatasetUrn(platform="hdfs", name="dataset2", env="PROD"),\n ],\n)\n\nclient.entities.upsert(datajob)\n\n')))),(0,n.yg)("h2",{id:"read-dataflow"},"Read DataFlow"),(0,n.yg)("pre",null,(0,n.yg)("code",{parentName:"pre",className:"language-python"},'# Inlined from /metadata-ingestion/examples/library/read_dataflow.py\nfrom datahub.metadata.urns import TagUrn\nfrom datahub.sdk import DataFlow, DataHubClient\n\nclient = DataHubClient.from_env()\n\ndataflow = DataFlow(\n name="example_dataflow",\n platform="airflow",\n description="airflow pipeline for production",\n tags=[TagUrn(name="production"), TagUrn(name="data_engineering")],\n)\n\nclient.entities.upsert(dataflow)\n\ndataflow_entity = client.entities.get(dataflow.urn)\nprint("DataFlow name:", dataflow_entity.name)\nprint("DataFlow platform:", dataflow_entity.platform)\nprint("DataFlow description:", dataflow_entity.description)\n\n')),(0,n.yg)("h4",{id:"example-output"},"Example Output"),(0,n.yg)("pre",null,(0,n.yg)("code",{parentName:"pre",className:"language-python"},">> DataFlow name: example_dataflow\n>> DataFlow platform: urn:li:dataPlatform:airflow\n>> DataFlow description: airflow pipeline for production\n")),(0,n.yg)("h2",{id:"read-datajob"},"Read DataJob"),(0,n.yg)("pre",null,(0,n.yg)("code",{parentName:"pre",className:"language-python"},'# Inlined from /metadata-ingestion/examples/library/read_datajob.py\nfrom datahub.sdk import DataFlow, DataHubClient, DataJob\n\nclient = DataHubClient.from_env()\n\ndataflow = DataFlow(\n platform="airflow",\n name="example_dag",\n platform_instance="PROD",\n)\n\n# datajob will inherit the platform and platform instance from the flow\ndatajob = DataJob(\n name="example_datajob",\n description="example datajob",\n flow=dataflow,\n)\n\nclient.entities.upsert(dataflow)\nclient.entities.upsert(datajob)\n\ndatajob_entity = client.entities.get(datajob.urn)\n\nprint("DataJob name:", datajob_entity.name)\nprint("DataJob Flow URN:", datajob_entity.flow_urn)\nprint("DataJob description:", datajob_entity.description)\n\n')),(0,n.yg)("h4",{id:"example-output-1"},"Example Output"),(0,n.yg)("pre",null,(0,n.yg)("code",{parentName:"pre",className:"language-python"},">> DataJob name: example_datajob\n>> DataJob Flow URN: urn:li:dataFlow:(airflow,PROD.example_dag,PROD)\n>> DataJob description: example datajob\n")))}w.isMDXComponent=!0}}]);