mirror of
https://github.com/datahub-project/datahub.git
synced 2025-07-23 09:32:04 +00:00
147 lines
117 KiB
HTML
147 lines
117 KiB
HTML
<!doctype html>
|
||
<html lang="en" dir="ltr" class="docs-wrapper docs-doc-page docs-version-current plugin-docs plugin-id-default docs-doc-id-metadata-integration/java/acryl-spark-lineage/README" data-has-hydrated="false">
|
||
<head>
|
||
<meta charset="UTF-8">
|
||
<meta name="generator" content="Docusaurus v2.4.3">
|
||
<title data-rh="true">Spark | DataHub</title><meta data-rh="true" name="viewport" content="width=device-width,initial-scale=1"><meta data-rh="true" name="twitter:card" content="summary_large_image"><meta data-rh="true" property="og:url" content="https://docs.datahub.com/docs/metadata-integration/java/acryl-spark-lineage"><meta data-rh="true" name="docusaurus_locale" content="en"><meta data-rh="true" name="docsearch:language" content="en"><meta data-rh="true" name="docusaurus_version" content="current"><meta data-rh="true" name="docusaurus_tag" content="docs-default-current"><meta data-rh="true" name="docsearch:version" content="current"><meta data-rh="true" name="docsearch:docusaurus_tag" content="docs-default-current"><meta data-rh="true" property="og:title" content="Spark | DataHub"><meta data-rh="true" name="description" content="To integrate Spark with DataHub, we provide a lightweight Java agent that listens for Spark application and job events"><meta data-rh="true" property="og:description" content="To integrate Spark with DataHub, we provide a lightweight Java agent that listens for Spark application and job events"><link data-rh="true" rel="icon" href="/img/favicon.ico"><link data-rh="true" rel="canonical" href="https://docs.datahub.com/docs/metadata-integration/java/acryl-spark-lineage"><link data-rh="true" rel="alternate" href="https://docs.datahub.com/docs/metadata-integration/java/acryl-spark-lineage" hreflang="en"><link data-rh="true" rel="alternate" href="https://docs.datahub.com/docs/metadata-integration/java/acryl-spark-lineage" hreflang="x-default"><link data-rh="true" rel="preconnect" href="https://RK0UG797F3-dsn.algolia.net" crossorigin="anonymous"><link rel="alternate" type="application/rss+xml" href="/learn/rss.xml" title="DataHub RSS Feed">
|
||
<link rel="alternate" type="application/atom+xml" href="/learn/atom.xml" title="DataHub Atom Feed">
|
||
|
||
<link rel="preconnect" href="https://www.google-analytics.com">
|
||
<link rel="preconnect" href="https://www.googletagmanager.com">
|
||
<script async src="https://www.googletagmanager.com/gtag/js?id=G-PKGVLETT4C"></script>
|
||
<script>function gtag(){dataLayer.push(arguments)}window.dataLayer=window.dataLayer||[],gtag("js",new Date),gtag("config","G-PKGVLETT4C",{})</script>
|
||
<link rel="preconnect" href="https://www.googletagmanager.com">
|
||
<script>window.dataLayer=window.dataLayer||[]</script>
|
||
<script>!function(e,t,a,n,g){e[n]=e[n]||[],e[n].push({"gtm.start":(new Date).getTime(),event:"gtm.js"});var m=t.getElementsByTagName(a)[0],r=t.createElement(a);r.async=!0,r.src="https://www.googletagmanager.com/gtm.js?id=GTM-5M8T9HNN",m.parentNode.insertBefore(r,m)}(window,document,"script","dataLayer")</script>
|
||
|
||
|
||
<link rel="search" type="application/opensearchdescription+xml" title="DataHub" href="/opensearch.xml">
|
||
|
||
|
||
|
||
|
||
<meta httpequiv="Content-Security-Policy" content="frame-ancestors 'self' https://*.acryl.io https://acryldata.io http://localhost:*">
|
||
<link rel="stylesheet" href="https://fonts.googleapis.com/css2?family=Manrope:wght@400;500;700&display=swap">
|
||
<script src="https://tools.luckyorange.com/core/lo.js?site-id=28ea8a38" async defer="defer"></script>
|
||
<script src="/scripts/rb2b.js" async defer="defer"></script>
|
||
<script src="https://app.revenuehero.io/scheduler.min.js"></script>
|
||
<script src="https://tag.clearbitscripts.com/v1/pk_2e321cabe30432a5c44c0424781aa35f/tags.js" referrerpolicy="strict-origin-when-cross-origin"></script>
|
||
<script src="/scripts/reo.js"></script>
|
||
<script id="runllm-widget-script" type="module" src="https://widget.runllm.com" crossorigin="true" runllm-name="DataHub" runllm-assistant-id="81" runllm-position="BOTTOM_RIGHT" runllm-keyboard-shortcut="Mod+j" runllm-preset="docusaurus" runllm-theme-color="#1890FF" runllm-brand-logo="https://docs.datahub.com/img/datahub-logo-color-mark.svg" runllm-community-url="https://datahub.com/slack" runllm-community-type="slack" runllm-disable-ask-a-person="true" async></script><link rel="stylesheet" href="/assets/css/styles.d8fe2eb8.css">
|
||
<link rel="preload" href="/assets/js/runtime~main.310f59c4.js" as="script">
|
||
<link rel="preload" href="/assets/js/main.49198d73.js" as="script">
|
||
</head>
|
||
<body class="navigation-with-keyboard">
|
||
<noscript><iframe src="https://www.googletagmanager.com/ns.html?id=GTM-5M8T9HNN" height="0" width="0" style="display:none;visibility:hidden"></iframe></noscript>
|
||
|
||
<script>!function(){function t(t){document.documentElement.setAttribute("data-theme",t)}var e=function(){var t=null;try{t=new URLSearchParams(window.location.search).get("docusaurus-theme")}catch(t){}return t}()||function(){var t=null;try{t=localStorage.getItem("theme")}catch(t){}return t}();t(null!==e?e:"light")}(),document.documentElement.setAttribute("data-announcement-bar-initially-dismissed",function(){try{return"true"===localStorage.getItem("docusaurus.announcement.dismiss")}catch(t){}return!1}())</script><div id="__docusaurus">
|
||
<div role="region" aria-label="Skip to main content"><a class="skipToContent_fXgn" href="#__docusaurus_skipToContent_fallback">Skip to main content</a></div><div class="announcementBar_mb4j" style="background-color:transparent;color:#ffffff" role="banner"><div class="content_knG7 announcementBarContent_xLdY"><div class="shimmer-banner"><p>DataHub Secures $35 Million Series B</p><a href="https://datahub.com/news/series-b-announcement/" target="_blank" class="button"><div>Read the announcement<span> →</span></div></a></div></div></div><nav aria-label="Main" class="navbar navbar--fixed-top"><div class="navbar__inner"><div class="navbar__items"><button aria-label="Toggle navigation bar" aria-expanded="false" class="navbar__toggle clean-btn" type="button"><svg width="30" height="30" viewBox="0 0 30 30" aria-hidden="true"><path stroke="currentColor" stroke-linecap="round" stroke-miterlimit="10" stroke-width="2" d="M4 7h22M4 15h22M4 23h22"></path></svg></button><a href="https://datahub.com" target="_blank" rel="noopener noreferrer" class="navbar__brand"><div class="navbar__logo"><img src="/img/datahub-logo-color-light-horizontal.svg" alt="DataHub Logo" class="themedImage_ToTc themedImage--light_HNdA"><img src="/img/datahub-logo-color-dark-horizontal.svg" alt="DataHub Logo" class="themedImage_ToTc themedImage--dark_i4oU"></div></a><div class="navbar__item dropdown dropdown--hoverable"><a href="#" aria-haspopup="true" aria-expanded="false" role="button" class="navbar__link versionNavItem_cbn8">Next</a><ul class="dropdown__menu"><li><a aria-current="page" class="dropdown__link dropdown__link--active" href="/docs/metadata-integration/java/acryl-spark-lineage">Next</a></li><li><a class="dropdown__link" href="/docs/1.1.0/metadata-integration/java/acryl-spark-lineage">1.1.0</a></li><li><hr class="dropdown-separator" style="margin: 0.4rem;"></li><li><div class="dropdown__link"><b>Archived versions</b></div></li><li>
|
||
<a class="dropdown__link" href="https://docs-website-t9sv4w3gr-acryldata.vercel.app/docs/features">1.0.0
|
||
<svg width="12" height="12" aria-hidden="true" viewBox="0 0 24 24"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg>
|
||
</a>
|
||
</li><li>
|
||
<a class="dropdown__link" href="https://docs-website-t9sv4w3gr-acryldata.vercel.app/docs/0.15.0/features">0.15.0
|
||
<svg width="12" height="12" aria-hidden="true" viewBox="0 0 24 24"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg>
|
||
</a>
|
||
</li><li>
|
||
<a class="dropdown__link" href="https://docs-website-8jkm4uler-acryldata.vercel.app/docs/0.14.1/features">0.14.1
|
||
<svg width="12" height="12" aria-hidden="true" viewBox="0 0 24 24"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg>
|
||
</a>
|
||
</li><li>
|
||
<a class="dropdown__link" href="https://docs-website-eue2qafvn-acryldata.vercel.app/docs/features">0.14.0
|
||
<svg width="12" height="12" aria-hidden="true" viewBox="0 0 24 24"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg>
|
||
</a>
|
||
</li><li>
|
||
<a class="dropdown__link" href="https://docs-website-psat3nzgi-acryldata.vercel.app/docs/features">0.13.1
|
||
<svg width="12" height="12" aria-hidden="true" viewBox="0 0 24 24"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg>
|
||
</a>
|
||
</li><li>
|
||
<a class="dropdown__link" href="https://docs-website-lzxh86531-acryldata.vercel.app/docs/features">0.13.0
|
||
<svg width="12" height="12" aria-hidden="true" viewBox="0 0 24 24"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg>
|
||
</a>
|
||
</li><li>
|
||
<a class="dropdown__link" href="https://docs-website-2uuxmgza2-acryldata.vercel.app/docs/features">0.12.1
|
||
<svg width="12" height="12" aria-hidden="true" viewBox="0 0 24 24"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg>
|
||
</a>
|
||
</li><li>
|
||
<a class="dropdown__link" href="https://docs-website-irpoe2osc-acryldata.vercel.app/docs/features">0.11.0
|
||
<svg width="12" height="12" aria-hidden="true" viewBox="0 0 24 24"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg>
|
||
</a>
|
||
</li><li>
|
||
<a class="dropdown__link" href="https://docs-website-1gv2yzn9d-acryldata.vercel.app/docs/features">0.10.5
|
||
<svg width="12" height="12" aria-hidden="true" viewBox="0 0 24 24"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg>
|
||
</a>
|
||
</li></ul></div></div><div class="navbar__items navbar__items--right"><a aria-current="page" class="navbar__item navbar__link navbar__link--active" href="/docs">Docs</a><a class="navbar__item navbar__link" href="/integrations">Integrations</a><div class="navbar__item dropdown dropdown--hoverable"><a href="#" aria-haspopup="true" aria-expanded="false" role="button" class="navbar__link">Learn</a><ul class="dropdown__menu dropdown__menu_Z8FC"><div class="wrapper_kp81"><div><a href="https://datahub.com/weekly-demo" target="_blank" rel="noopener noreferrer" class="card_BUD7"><div class="icon_BgHd"><img src="/img/icon-join-slack.png" alt="Weekly Demo"></div><div class="title_c7DP">Weekly Demo</div></a></div><div><a href="https://datahub.com/use-cases" target="_blank" rel="noopener noreferrer" class="card_BUD7"><div class="icon_BgHd"><img src="/img/icon-forum.png" alt="Use Cases"></div><div class="title_c7DP">Use Cases</div></a></div><div><a href="httpps://datahub.com/adoption-stories" target="_blank" rel="noopener noreferrer" class="card_BUD7"><div class="icon_BgHd"><img src="/img/icon-events.png" alt="Adoption Stories"></div><div class="title_c7DP">Adoption Stories</div></a></div><div><a href="https://medium.com/datahub-project" target="_blank" rel="noopener noreferrer" class="card_BUD7"><div class="icon_BgHd"><img src="/img/icon-champions.png" alt="Blog"></div><div class="title_c7DP">Blog</div></a></div><div><a href="https://www.youtube.com/channel/UC3qFQC5IiwR5fvWEqi_tJ5w" target="_blank" rel="noopener noreferrer" class="card_BUD7"><div class="icon_BgHd"><img src="/img/icon-share-your-journey.png" alt="Youtube"></div><div class="title_c7DP">Youtube</div></a></div></div></ul></div><div class="navbar__item dropdown dropdown--hoverable"><a href="#" aria-haspopup="true" aria-expanded="false" role="button" class="navbar__link">Community</a><ul class="dropdown__menu dropdown__menu_Z8FC"><div class="wrapper_kp81"><div><a href="https://datahub.com/slack/" target="_blank" rel="noopener noreferrer" class="card_BUD7"><div class="icon_BgHd"><img src="/img/icon-join-slack.png" alt="Join Slack"></div><div class="title_c7DP">Join Slack</div></a></div><div><a href="https://datahub.com/events" target="_blank" rel="noopener noreferrer" class="card_BUD7"><div class="icon_BgHd"><img src="/img/icon-events.png" alt="Events"></div><div class="title_c7DP">Events</div></a></div><div><a href="https://datahub.com/champions/" target="_blank" rel="noopener noreferrer" class="card_BUD7"><div class="icon_BgHd"><img src="/img/icon-champions.png" alt="Champions"></div><div class="title_c7DP">Champions</div></a></div><div><a href="https://datahub.com/share-your-journey/" target="_blank" rel="noopener noreferrer" class="card_BUD7"><div class="icon_BgHd"><img src="/img/icon-share-your-journey.png" alt="Share Your Journey"></div><div class="title_c7DP">Share Your Journey</div></a></div></div></ul></div><a href="https://datahub.com/products/why-datahub-cloud/" target="_blank" rel="noopener noreferrer" class="navbar__item navbar__link">
|
||
<style>
|
||
.cloud-cta {
|
||
color: var(--ifm-menu-color-active);
|
||
font-weight: 600;
|
||
background: linear-gradient(40deg, var(--ifm-menu-color-active), var(--ifm-menu-color-active));
|
||
background-size: 200% 100%;
|
||
-webkit-background-clip: text;
|
||
background-clip: text;
|
||
transition: background-image 0.3s ease;
|
||
}
|
||
.cloud-cta:hover {
|
||
color: transparent;
|
||
background: linear-gradient(40deg, var(--ifm-menu-color-active), #ff1493);
|
||
background-size: 200% 100%;
|
||
-webkit-background-clip: text;
|
||
background-clip: text;
|
||
animation: gradientShift 3s ease infinite;
|
||
}
|
||
@keyframes gradientShift {
|
||
0%, 100% { background-position: 0% 50%; }
|
||
50% { background-position: 100% 50%; }
|
||
}
|
||
</style>
|
||
<div class="cloud-cta">Get Cloud</div>
|
||
</a><a href="https://datahub.com/slack?utm_source=docs&utm_medium=header&utm_campaign=docs_header" target="_blank" rel="noopener noreferrer" class="navbar__item navbar__link">
|
||
<style>
|
||
.slack-logo:hover {
|
||
opacity: 0.8;
|
||
}
|
||
</style>
|
||
<img class="slack-logo" src="https://upload.wikimedia.org/wikipedia/commons/d/d5/Slack_icon_2019.svg" , alt="slack" , height="20px" style="margin: 10px 0 0 0;">
|
||
</a><div class="searchBox_ZlJk"><button type="button" class="DocSearch DocSearch-Button" aria-label="Search"><span class="DocSearch-Button-Container"><svg width="20" height="20" class="DocSearch-Search-Icon" viewBox="0 0 20 20" aria-hidden="true"><path d="M14.386 14.386l4.0877 4.0877-4.0877-4.0877c-2.9418 2.9419-7.7115 2.9419-10.6533 0-2.9419-2.9418-2.9419-7.7115 0-10.6533 2.9418-2.9419 7.7115-2.9419 10.6533 0 2.9419 2.9418 2.9419 7.7115 0 10.6533z" stroke="currentColor" fill="none" fill-rule="evenodd" stroke-linecap="round" stroke-linejoin="round"></path></svg><span class="DocSearch-Button-Placeholder">Search</span></span><span class="DocSearch-Button-Keys"></span></button></div></div></div><div role="presentation" class="navbar-sidebar__backdrop"></div></nav><div id="__docusaurus_skipToContent_fallback" class="main-wrapper mainWrapper_z2l0 docsWrapper_BCFX"><button aria-label="Scroll back to top" class="clean-btn theme-back-to-top-button backToTopButton_sjWU" type="button"></button><div class="docPage__5DB"><aside class="theme-doc-sidebar-container docSidebarContainer_b6E3"><div class="sidebarViewport_Xe31"><div class="sidebar_njMd"><nav aria-label="Docs sidebar" class="menu thin-scrollbar menu_SIkG menuWithAnnouncementBar_GW3s"><ul class="theme-doc-sidebar-menu menu__list"><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-1 menuHtmlItem_M9Kj menu__list-item"><div>Getting Started</div></li><li class="theme-doc-sidebar-item-category theme-doc-sidebar-item-category-level-1 menu__list-item menu__list-item--collapsed"><div class="menu__list-item-collapsible"><a class="menu__link menu__link--sublist" aria-expanded="false" href="/docs/features">What Is DataHub?</a><button aria-label="Toggle the collapsible sidebar category 'What Is DataHub?'" type="button" class="clean-btn menu__caret"></button></div></li><li class="theme-doc-sidebar-item-category theme-doc-sidebar-item-category-level-1 menu__list-item menu__list-item--collapsed"><div class="menu__list-item-collapsible"><a class="menu__link menu__link--sublist" aria-expanded="false" href="/docs/category/features">Features</a><button aria-label="Toggle the collapsible sidebar category 'Features'" type="button" class="clean-btn menu__caret"></button></div></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-1 menuHtmlItem_M9Kj menu__list-item"><div>DataHub Cloud</div></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-1 menu__list-item"><a class="menu__link" href="/docs/managed-datahub/managed-datahub-overview">DataHub Cloud Overview</a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-1 menu__list-item"><a class="menu__link" href="/docs/managed-datahub/welcome-acryl">Getting Started with DataHub Cloud</a></li><li class="theme-doc-sidebar-item-category theme-doc-sidebar-item-category-level-1 menu__list-item menu__list-item--collapsed"><div class="menu__list-item-collapsible"><a class="menu__link menu__link--sublist menu__link--sublist-caret" aria-expanded="false" href="/docs/authentication/guides/sso/initialize-oidc">Configure Single Sign-On</a></div></li><li class="theme-doc-sidebar-item-category theme-doc-sidebar-item-category-level-1 menu__list-item menu__list-item--collapsed"><div class="menu__list-item-collapsible"><a class="menu__link menu__link--sublist menu__link--sublist-caret" aria-expanded="false" href="/docs/managed-datahub/remote-executor/about">Remote Executor</a></div></li><li class="theme-doc-sidebar-item-category theme-doc-sidebar-item-category-level-1 menu__list-item menu__list-item--collapsed"><div class="menu__list-item-collapsible"><a class="menu__link menu__link--sublist menu__link--sublist-caret" aria-expanded="false" href="/docs/managed-datahub/datahub-api/entity-events-api">DataHub API</a></div></li><li class="theme-doc-sidebar-item-category theme-doc-sidebar-item-category-level-1 menu__list-item menu__list-item--collapsed"><div class="menu__list-item-collapsible"><a class="menu__link menu__link--sublist menu__link--sublist-caret" aria-expanded="false" href="/docs/managed-datahub/slack/saas-slack-app">Slack</a></div></li><li class="theme-doc-sidebar-item-category theme-doc-sidebar-item-category-level-1 menu__list-item menu__list-item--collapsed"><div class="menu__list-item-collapsible"><a class="menu__link menu__link--sublist menu__link--sublist-caret" aria-expanded="false" href="/docs/managed-datahub/operator-guide/setting-up-events-api-on-aws-eventbridge">Operator Guides</a></div></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-1 menu__list-item saasOnly"><a class="menu__link" href="/docs/managed-datahub/approval-workflows">Change Proposals & Approval Workflows</a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-1 menu__list-item"><a class="menu__link" href="/docs/managed-datahub/chrome-extension">Cloud Chrome Extension</a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-1 menu__list-item saasOnly"><a class="menu__link" href="/docs/managed-datahub/subscription-and-notification">Subscriptions & Notifications</a></li><li class="theme-doc-sidebar-item-category theme-doc-sidebar-item-category-level-1 menu__list-item menu__list-item--collapsed"><div class="menu__list-item-collapsible"><a class="menu__link menu__link--sublist menu__link--sublist-caret" aria-expanded="false" href="/docs/managed-datahub/release-notes/v_0_3_12">DataHub Cloud Release History</a></div></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-1 menuHtmlItem_M9Kj menu__list-item"><div>Integrations</div></li><li class="theme-doc-sidebar-item-category theme-doc-sidebar-item-category-level-1 menu__list-item menu__list-item--collapsed"><div class="menu__list-item-collapsible"><a class="menu__link menu__link--sublist" aria-expanded="false" href="/docs/metadata-ingestion">Overview</a><button aria-label="Toggle the collapsible sidebar category 'Overview'" type="button" class="clean-btn menu__caret"></button></div></li><li class="theme-doc-sidebar-item-category theme-doc-sidebar-item-category-level-1 menu__list-item menu__list-item--collapsed"><div class="menu__list-item-collapsible"><a class="menu__link menu__link--sublist menu__link--sublist-caret" aria-expanded="false" href="/docs/metadata-ingestion/cli-ingestion">Quickstart Guides</a></div></li><li class="theme-doc-sidebar-item-category theme-doc-sidebar-item-category-level-1 menu__list-item"><div class="menu__list-item-collapsible"><a class="menu__link menu__link--sublist menu__link--active" aria-expanded="true" href="/docs/metadata-ingestion/source_overview">Sources</a><button aria-label="Toggle the collapsible sidebar category 'Sources'" type="button" class="clean-btn menu__caret"></button></div><ul style="display:block;overflow:visible;height:auto" class="menu__list"><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-2 menu__list-item"><a class="menu__link" tabindex="0" href="/docs/lineage/airflow">Airflow</a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-2 menu__list-item"><a class="menu__link" tabindex="0" href="/docs/lineage/dagster">Dagster</a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-2 menu__list-item"><a class="menu__link" tabindex="0" href="/docs/lineage/prefect">Prefect</a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-2 menu__list-item"><a class="menu__link menu__link--active" aria-current="page" tabindex="0" href="/docs/metadata-integration/java/acryl-spark-lineage">Spark</a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-2 menu__list-item"><a class="menu__link" tabindex="0" href="/docs/metadata-ingestion/integration_docs/great-expectations">Great Expectations</a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-2 menu__list-item"><a class="menu__link" tabindex="0" href="/docs/metadata-integration/java/datahub-protobuf">Protobuf Schemas</a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-2 menu__list-item"><a class="menu__link" tabindex="0" href="/docs/generated/ingestion/sources/abs">ABS Data Lake</a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-2 menu__list-item"><a class="menu__link" tabindex="0" href="/docs/generated/ingestion/sources/athena">Athena</a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-2 menu__list-item"><a class="menu__link" tabindex="0" href="/docs/generated/ingestion/sources/azure-ad">Azure AD</a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-2 menu__list-item"><a class="menu__link" tabindex="0" href="/docs/generated/ingestion/sources/bigquery">BigQuery</a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-2 menu__list-item"><a class="menu__link" tabindex="0" href="/docs/generated/ingestion/sources/business-glossary">Business Glossary</a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-2 menu__list-item"><a class="menu__link" tabindex="0" href="/docs/generated/ingestion/sources/cassandra">Cassandra</a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-2 menu__list-item"><a class="menu__link" tabindex="0" href="/docs/generated/ingestion/sources/clickhouse">ClickHouse</a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-2 menu__list-item"><a class="menu__link" tabindex="0" href="/docs/generated/ingestion/sources/cockroachdb">CockroachDB</a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-2 menu__list-item"><a class="menu__link" tabindex="0" href="/docs/generated/ingestion/sources/csv-enricher">CSV Enricher</a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-2 menu__list-item"><a class="menu__link" tabindex="0" href="/docs/generated/ingestion/sources/databricks">Databricks</a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-2 menu__list-item"><a class="menu__link" tabindex="0" href="/docs/generated/ingestion/sources/datahub">DataHub</a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-2 menu__list-item"><a class="menu__link" tabindex="0" href="/docs/generated/ingestion/sources/datahubapply">DataHubApply</a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-2 menu__list-item"><a class="menu__link" tabindex="0" href="/docs/generated/ingestion/sources/datahubdebug">DataHubDebug</a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-2 menu__list-item"><a class="menu__link" tabindex="0" href="/docs/generated/ingestion/sources/datahubgc">DataHubGc</a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-2 menu__list-item"><a class="menu__link" tabindex="0" href="/docs/generated/ingestion/sources/datahubmockdata">DataHubMockData</a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-2 menu__list-item"><a class="menu__link" tabindex="0" href="/docs/generated/ingestion/sources/dbt">dbt</a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-2 menu__list-item"><a class="menu__link" tabindex="0" href="/docs/generated/ingestion/sources/delta-lake">Delta Lake</a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-2 menu__list-item"><a class="menu__link" tabindex="0" href="/docs/generated/ingestion/sources/demo-data">Demo Data</a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-2 menu__list-item"><a class="menu__link" tabindex="0" href="/docs/generated/ingestion/sources/dremio">Dremio</a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-2 menu__list-item"><a class="menu__link" tabindex="0" href="/docs/generated/ingestion/sources/druid">Druid</a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-2 menu__list-item"><a class="menu__link" tabindex="0" href="/docs/generated/ingestion/sources/dynamodb">DynamoDB</a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-2 menu__list-item"><a class="menu__link" tabindex="0" href="/docs/generated/ingestion/sources/elasticsearch">Elasticsearch</a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-2 menu__list-item"><a class="menu__link" tabindex="0" href="/docs/generated/ingestion/sources/feast">Feast</a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-2 menu__list-item"><a class="menu__link" tabindex="0" href="/docs/generated/ingestion/sources/file-based-lineage">File Based Lineage</a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-2 menu__list-item"><a class="menu__link" tabindex="0" href="/docs/generated/ingestion/sources/fivetran">Fivetran</a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-2 menu__list-item"><a class="menu__link" tabindex="0" href="/docs/generated/ingestion/sources/glue">Glue</a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-2 menu__list-item"><a class="menu__link" tabindex="0" href="/docs/generated/ingestion/sources/gcs">Google Cloud Storage</a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-2 menu__list-item"><a class="menu__link" tabindex="0" href="/docs/generated/ingestion/sources/grafana">Grafana</a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-2 menu__list-item"><a class="menu__link" tabindex="0" href="/docs/generated/ingestion/sources/hex">Hex</a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-2 menu__list-item"><a class="menu__link" tabindex="0" href="/docs/generated/ingestion/sources/hive">Hive</a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-2 menu__list-item"><a class="menu__link" tabindex="0" href="/docs/generated/ingestion/sources/hive-metastore">Hive Metastore</a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-2 menu__list-item"><a class="menu__link" tabindex="0" href="/docs/generated/ingestion/sources/iceberg">Iceberg</a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-2 menu__list-item"><a class="menu__link" tabindex="0" href="/docs/generated/ingestion/sources/json-schema">JSON Schemas</a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-2 menu__list-item"><a class="menu__link" tabindex="0" href="/docs/generated/ingestion/sources/kafka">Kafka</a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-2 menu__list-item"><a class="menu__link" tabindex="0" href="/docs/generated/ingestion/sources/kafka-connect">Kafka Connect</a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-2 menu__list-item"><a class="menu__link" tabindex="0" href="/docs/generated/ingestion/sources/ldap">LDAP</a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-2 menu__list-item"><a class="menu__link" tabindex="0" href="/docs/generated/ingestion/sources/looker">Looker</a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-2 menu__list-item"><a class="menu__link" tabindex="0" href="/docs/generated/ingestion/sources/mariadb">MariaDB</a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-2 menu__list-item"><a class="menu__link" tabindex="0" href="/docs/generated/ingestion/sources/metabase">Metabase</a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-2 menu__list-item"><a class="menu__link" tabindex="0" href="/docs/generated/ingestion/sources/metadata-file">Metadata File</a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-2 menu__list-item"><a class="menu__link" tabindex="0" href="/docs/generated/ingestion/sources/mssql">Microsoft SQL Server</a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-2 menu__list-item"><a class="menu__link" tabindex="0" href="/docs/generated/ingestion/sources/mlflow">MLflow</a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-2 menu__list-item"><a class="menu__link" tabindex="0" href="/docs/generated/ingestion/sources/mode">Mode</a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-2 menu__list-item"><a class="menu__link" tabindex="0" href="/docs/generated/ingestion/sources/mongodb">MongoDB</a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-2 menu__list-item"><a class="menu__link" tabindex="0" href="/docs/generated/ingestion/sources/mysql">MySQL</a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-2 menu__list-item"><a class="menu__link" tabindex="0" href="/docs/generated/ingestion/sources/neo4j">Neo4j</a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-2 menu__list-item"><a class="menu__link" tabindex="0" href="/docs/generated/ingestion/sources/nifi">NiFi</a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-2 menu__list-item"><a class="menu__link" tabindex="0" href="/docs/generated/ingestion/sources/okta">Okta</a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-2 menu__list-item"><a class="menu__link" tabindex="0" href="/docs/generated/ingestion/sources/openapi">OpenAPI</a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-2 menu__list-item"><a class="menu__link" tabindex="0" href="/docs/generated/ingestion/sources/oracle">Oracle</a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-2 menu__list-item"><a class="menu__link" tabindex="0" href="/docs/generated/ingestion/sources/postgres">Postgres</a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-2 menu__list-item"><a class="menu__link" tabindex="0" href="/docs/generated/ingestion/sources/powerbi">PowerBI</a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-2 menu__list-item"><a class="menu__link" tabindex="0" href="/docs/generated/ingestion/sources/powerbi-report-server">PowerBI Report Server</a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-2 menu__list-item"><a class="menu__link" tabindex="0" href="/docs/generated/ingestion/sources/preset">Preset</a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-2 menu__list-item"><a class="menu__link" tabindex="0" href="/docs/generated/ingestion/sources/presto">Presto</a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-2 menu__list-item"><a class="menu__link" tabindex="0" href="/docs/generated/ingestion/sources/pulsar">Pulsar</a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-2 menu__list-item"><a class="menu__link" tabindex="0" href="/docs/generated/ingestion/sources/qlik-sense">Qlik Sense</a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-2 menu__list-item"><a class="menu__link" tabindex="0" href="/docs/generated/ingestion/sources/redash">Redash</a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-2 menu__list-item"><a class="menu__link" tabindex="0" href="/docs/generated/ingestion/sources/redshift">Redshift</a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-2 menu__list-item"><a class="menu__link" tabindex="0" href="/docs/generated/ingestion/sources/s3">S3 / Local Files</a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-2 menu__list-item"><a class="menu__link" tabindex="0" href="/docs/generated/ingestion/sources/sagemaker">SageMaker</a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-2 menu__list-item"><a class="menu__link" tabindex="0" href="/docs/generated/ingestion/sources/salesforce">Salesforce</a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-2 menu__list-item"><a class="menu__link" tabindex="0" href="/docs/generated/ingestion/sources/sac">SAP Analytics Cloud</a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-2 menu__list-item"><a class="menu__link" tabindex="0" href="/docs/generated/ingestion/sources/hana">SAP HANA</a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-2 menu__list-item"><a class="menu__link" tabindex="0" href="/docs/generated/ingestion/sources/sigma">Sigma</a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-2 menu__list-item"><a class="menu__link" tabindex="0" href="/docs/generated/ingestion/sources/slack">Slack</a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-2 menu__list-item"><a class="menu__link" tabindex="0" href="/docs/generated/ingestion/sources/snowflake">Snowflake</a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-2 menu__list-item"><a class="menu__link" tabindex="0" href="/docs/generated/ingestion/sources/sql-queries">SQL Queries</a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-2 menu__list-item"><a class="menu__link" tabindex="0" href="/docs/generated/ingestion/sources/sqlalchemy">SQLAlchemy</a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-2 menu__list-item"><a class="menu__link" tabindex="0" href="/docs/generated/ingestion/sources/superset">Superset</a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-2 menu__list-item"><a class="menu__link" tabindex="0" href="/docs/generated/ingestion/sources/tableau">Tableau</a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-2 menu__list-item"><a class="menu__link" tabindex="0" href="/docs/generated/ingestion/sources/teradata">Teradata</a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-2 menu__list-item"><a class="menu__link" tabindex="0" href="/docs/generated/ingestion/sources/trino">Trino</a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-2 menu__list-item"><a class="menu__link" tabindex="0" href="/docs/generated/ingestion/sources/vertexai">Vertex AI</a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-2 menu__list-item"><a class="menu__link" tabindex="0" href="/docs/generated/ingestion/sources/vertica">Vertica</a></li></ul></li><li class="theme-doc-sidebar-item-category theme-doc-sidebar-item-category-level-1 menu__list-item menu__list-item--collapsed"><div class="menu__list-item-collapsible"><a class="menu__link menu__link--sublist menu__link--sublist-caret" aria-expanded="false" href="/docs/metadata-ingestion/schedule_docs/intro">Advanced Guides</a></div></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-1 menuHtmlItem_M9Kj menu__list-item"><div>API & SDKs</div></li><li class="theme-doc-sidebar-item-category theme-doc-sidebar-item-category-level-1 menu__list-item menu__list-item--collapsed"><div class="menu__list-item-collapsible"><a class="menu__link menu__link--sublist menu__link--sublist-caret" aria-expanded="false" href="/docs/metadata-modeling/metadata-model">DataHub's Open Metadata Standard</a></div></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-1 menu__list-item"><a class="menu__link" href="/docs/what-is-datahub/datahub-concepts">Concepts</a></li><li class="theme-doc-sidebar-item-category theme-doc-sidebar-item-category-level-1 menu__list-item menu__list-item--collapsed"><div class="menu__list-item-collapsible"><a class="menu__link menu__link--sublist" aria-expanded="false" href="/docs/metadata-standards">Metadata Standards</a><button aria-label="Toggle the collapsible sidebar category 'Metadata Standards'" type="button" class="clean-btn menu__caret"></button></div></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-1 menu__list-item"><a class="menu__link" href="/docs/api/datahub-apis">APIs and SDKs Overview</a></li><li class="theme-doc-sidebar-item-category theme-doc-sidebar-item-category-level-1 menu__list-item menu__list-item--collapsed"><div class="menu__list-item-collapsible"><a class="menu__link menu__link--sublist menu__link--sublist-caret" aria-expanded="false" href="/docs/api/graphql/overview">API</a></div></li><li class="theme-doc-sidebar-item-category theme-doc-sidebar-item-category-level-1 menu__list-item menu__list-item--collapsed"><div class="menu__list-item-collapsible"><a class="menu__link menu__link--sublist menu__link--sublist-caret" aria-expanded="false" href="/docs/metadata-ingestion/as-a-library">Python SDK</a></div></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-1 menu__list-item"><a class="menu__link" href="/docs/metadata-integration/java/as-a-library">Java SDK</a></li><li class="theme-doc-sidebar-item-category theme-doc-sidebar-item-category-level-1 menu__list-item menu__list-item--collapsed"><div class="menu__list-item-collapsible"><a class="menu__link menu__link--sublist" aria-expanded="false" href="/docs/cli">DataHub CLI</a><button aria-label="Toggle the collapsible sidebar category 'DataHub CLI'" type="button" class="clean-btn menu__caret"></button></div></li><li class="theme-doc-sidebar-item-category theme-doc-sidebar-item-category-level-1 menu__list-item menu__list-item--collapsed"><div class="menu__list-item-collapsible"><a class="menu__link menu__link--sublist" aria-expanded="false" href="/docs/act-on-metadata">DataHub Actions</a><button aria-label="Toggle the collapsible sidebar category 'DataHub Actions'" type="button" class="clean-btn menu__caret"></button></div></li><li class="theme-doc-sidebar-item-category theme-doc-sidebar-item-category-level-1 menu__list-item menu__list-item--collapsed"><div class="menu__list-item-collapsible"><a class="menu__link menu__link--sublist menu__link--sublist-caret" aria-expanded="false" href="/docs/api/tutorials/datasets">Guides</a></div></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-1 menuHtmlItem_M9Kj menu__list-item"><div>Admin</div></li><li class="theme-doc-sidebar-item-category theme-doc-sidebar-item-category-level-1 menu__list-item menu__list-item--collapsed"><div class="menu__list-item-collapsible"><a class="menu__link menu__link--sublist menu__link--sublist-caret" aria-expanded="false" href="/docs/authentication">Authentication</a></div></li><li class="theme-doc-sidebar-item-category theme-doc-sidebar-item-category-level-1 menu__list-item menu__list-item--collapsed"><div class="menu__list-item-collapsible"><a class="menu__link menu__link--sublist menu__link--sublist-caret" aria-expanded="false" href="/docs/authorization">Authorization</a></div></li><li class="theme-doc-sidebar-item-category theme-doc-sidebar-item-category-level-1 menu__list-item menu__list-item--collapsed"><div class="menu__list-item-collapsible"><a class="menu__link menu__link--sublist menu__link--sublist-caret" aria-expanded="false" href="/docs/how/delete-metadata">Advanced Guides</a></div></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-1 menuHtmlItem_M9Kj menu__list-item"><div>Deployment</div></li><li class="theme-doc-sidebar-item-category theme-doc-sidebar-item-category-level-1 menu__list-item menu__list-item--collapsed"><div class="menu__list-item-collapsible"><a class="menu__link menu__link--sublist" aria-expanded="false" href="/docs/category/deployment-guides">Deployment Guides</a><button aria-label="Toggle the collapsible sidebar category 'Deployment Guides'" type="button" class="clean-btn menu__caret"></button></div></li><li class="theme-doc-sidebar-item-category theme-doc-sidebar-item-category-level-1 menu__list-item menu__list-item--collapsed"><div class="menu__list-item-collapsible"><a class="menu__link menu__link--sublist menu__link--sublist-caret" aria-expanded="false" href="/docs/deploy/confluent-cloud">Advanced Guides</a></div></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-1 menuHtmlItem_M9Kj menu__list-item"><div>Developers</div></li><li class="theme-doc-sidebar-item-category theme-doc-sidebar-item-category-level-1 menu__list-item menu__list-item--collapsed"><div class="menu__list-item-collapsible"><a class="menu__link menu__link--sublist menu__link--sublist-caret" aria-expanded="false" href="/docs/architecture/architecture">Architecture</a></div></li><li class="theme-doc-sidebar-item-category theme-doc-sidebar-item-category-level-1 menu__list-item menu__list-item--collapsed"><div class="menu__list-item-collapsible"><a class="menu__link menu__link--sublist menu__link--sublist-caret" aria-expanded="false" href="/docs/developers">Developing on DataHub</a></div></li><li class="theme-doc-sidebar-item-category theme-doc-sidebar-item-category-level-1 menu__list-item menu__list-item--collapsed"><div class="menu__list-item-collapsible"><a class="menu__link menu__link--sublist menu__link--sublist-caret" aria-expanded="false" href="/docs/advanced/mcp-mcl">Advanced Guides</a></div></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-1 menuHtmlItem_M9Kj menu__list-item"><div>Community</div></li><li class="theme-doc-sidebar-item-category theme-doc-sidebar-item-category-level-1 menu__list-item menu__list-item--collapsed"><div class="menu__list-item-collapsible"><a class="menu__link menu__link--sublist" aria-expanded="false" href="/docs/category/community">Community</a><button aria-label="Toggle the collapsible sidebar category 'Community'" type="button" class="clean-btn menu__caret"></button></div></li><li class="theme-doc-sidebar-item-category theme-doc-sidebar-item-category-level-1 menu__list-item menu__list-item--collapsed"><div class="menu__list-item-collapsible"><a class="menu__link menu__link--sublist menu__link--sublist-caret" aria-expanded="false" href="/docs/releases">Release History</a></div></li></ul></nav></div></div></aside><main class="docMainContainer_gTbr"><div class="container padding-top--md padding-bottom--lg"><div class="row"><div class="col docItemCol_VOVn"><div class="docItemContainer_Djhp"><article><nav class="theme-doc-breadcrumbs breadcrumbsContainer_Z_bl" aria-label="Breadcrumbs"><ul class="breadcrumbs" itemscope="" itemtype="https://schema.org/BreadcrumbList"><li class="breadcrumbs__item"><a aria-label="Home page" class="breadcrumbs__link" href="/"><svg viewBox="0 0 24 24" class="breadcrumbHomeIcon_YNFT"><path d="M10 19v-5h4v5c0 .55.45 1 1 1h3c.55 0 1-.45 1-1v-7h1.7c.46 0 .68-.57.33-.87L12.67 3.6c-.38-.34-.96-.34-1.34 0l-8.36 7.53c-.34.3-.13.87.33.87H5v7c0 .55.45 1 1 1h3c.55 0 1-.45 1-1z" fill="currentColor"></path></svg></a></li><li itemscope="" itemprop="itemListElement" itemtype="https://schema.org/ListItem" class="breadcrumbs__item"><a class="breadcrumbs__link" itemprop="item" href="/docs/metadata-ingestion/source_overview"><span itemprop="name">Sources</span></a><meta itemprop="position" content="1"></li><li itemscope="" itemprop="itemListElement" itemtype="https://schema.org/ListItem" class="breadcrumbs__item breadcrumbs__item--active"><span class="breadcrumbs__link" itemprop="name">Spark</span><meta itemprop="position" content="2"></li></ul></nav><span class="theme-doc-version-badge badge badge--secondary">Version: Next</span><div class="tocCollapsible_ETCw theme-doc-toc-mobile tocMobile_ITEo"><button type="button" class="clean-btn tocCollapsibleButton_TO0P">On this page</button></div><div class="theme-doc-markdown markdown"><h1>Spark</h1><p>To integrate Spark with DataHub, we provide a lightweight Java agent that listens for Spark application and job events
|
||
and pushes metadata out to DataHub in real-time. The agent listens to events such as application start/end, and
|
||
SQLExecution start/end to create pipelines (i.e. DataJob) and tasks (i.e. DataFlow) in Datahub along with lineage to
|
||
datasets that are being read from and written to. Read on to learn how to configure this for different Spark scenarios.</p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="configuring-spark-agent">Configuring Spark agent<a href="#configuring-spark-agent" class="hash-link" aria-label="Direct link to Configuring Spark agent" title="Direct link to Configuring Spark agent"></a></h2><p>The Spark agent can be configured using a config file or while creating a Spark Session. If you are using Spark on
|
||
Databricks, refer to <a href="#configuration-instructions-databricks">Configuration Instructions for Databricks</a>.</p><h3 class="anchor anchorWithStickyNavbar_LWe7" id="before-you-begin-versions-and-release-notes">Before you begin: Versions and Release Notes<a href="#before-you-begin-versions-and-release-notes" class="hash-link" aria-label="Direct link to Before you begin: Versions and Release Notes" title="Direct link to Before you begin: Versions and Release Notes"></a></h3><p>Versioning of the jar artifact will follow the semantic versioning of the
|
||
main <a href="https://github.com/datahub-project/datahub" target="_blank" rel="noopener noreferrer">DataHub repo</a> and release notes will be
|
||
available <a href="https://github.com/datahub-project/datahub/releases" target="_blank" rel="noopener noreferrer">here</a>.
|
||
Always check <a href="https://search.maven.org/search?q=a:acryl-spark-lineage" target="_blank" rel="noopener noreferrer">the Maven central repository</a> for the latest
|
||
released version.</p><h3 class="anchor anchorWithStickyNavbar_LWe7" id="configuration-instructions-spark-submit">Configuration Instructions: spark-submit<a href="#configuration-instructions-spark-submit" class="hash-link" aria-label="Direct link to Configuration Instructions: spark-submit" title="Direct link to Configuration Instructions: spark-submit"></a></h3><p>When running jobs using spark-submit, the agent needs to be configured in the config file.</p><div class="language-text codeBlockContainer_Ckt0 theme-code-block" style="--prism-color:#bfc7d5;--prism-background-color:#292d3e"><div class="codeBlockContent_biex"><pre tabindex="0" class="prism-code language-text codeBlock_bY9V thin-scrollbar"><code class="codeBlockLines_e6Vv"><span class="token-line" style="color:#bfc7d5"><span class="token plain">#Configuring DataHub spark agent jar</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">spark.jars.packages io.acryl:acryl-spark-lineage:0.2.17</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">spark.extraListeners datahub.spark.DatahubSparkListener</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">spark.datahub.rest.server http://localhost:8080</span><br></span></code></pre><div class="buttonGroup__atx"><button type="button" aria-label="Copy code to clipboard" title="Copy" class="clean-btn"><span class="copyButtonIcons_eSgA" aria-hidden="true"><svg viewBox="0 0 24 24" class="copyButtonIcon_y97N"><path fill="currentColor" d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"></path></svg><svg viewBox="0 0 24 24" class="copyButtonSuccessIcon_LjdS"><path fill="currentColor" d="M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z"></path></svg></span></button></div></div></div><h2 class="anchor anchorWithStickyNavbar_LWe7" id="spark-submit-command-line">spark-submit command line<a href="#spark-submit-command-line" class="hash-link" aria-label="Direct link to spark-submit command line" title="Direct link to spark-submit command line"></a></h2><div class="language-sh codeBlockContainer_Ckt0 theme-code-block" style="--prism-color:#bfc7d5;--prism-background-color:#292d3e"><div class="codeBlockContent_biex"><pre tabindex="0" class="prism-code language-sh codeBlock_bY9V thin-scrollbar"><code class="codeBlockLines_e6Vv"><span class="token-line" style="color:#bfc7d5"><span class="token plain">spark-submit --packages io.acryl:acryl-spark-lineage:0.2.17 --conf "spark.extraListeners=datahub.spark.DatahubSparkListener" my_spark_job_to_run.py</span><br></span></code></pre><div class="buttonGroup__atx"><button type="button" aria-label="Copy code to clipboard" title="Copy" class="clean-btn"><span class="copyButtonIcons_eSgA" aria-hidden="true"><svg viewBox="0 0 24 24" class="copyButtonIcon_y97N"><path fill="currentColor" d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"></path></svg><svg viewBox="0 0 24 24" class="copyButtonSuccessIcon_LjdS"><path fill="currentColor" d="M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z"></path></svg></span></button></div></div></div><h3 class="anchor anchorWithStickyNavbar_LWe7" id="configuration-instructions-amazon-emr">Configuration Instructions: Amazon EMR<a href="#configuration-instructions-amazon-emr" class="hash-link" aria-label="Direct link to Configuration Instructions: Amazon EMR" title="Direct link to Configuration Instructions: Amazon EMR"></a></h3><p>Set the following spark-defaults configuration properties as it
|
||
stated <a href="https://docs.aws.amazon.com/emr/latest/ReleaseGuide/emr-spark-configure.html" target="_blank" rel="noopener noreferrer">here</a></p><div class="language-text codeBlockContainer_Ckt0 theme-code-block" style="--prism-color:#bfc7d5;--prism-background-color:#292d3e"><div class="codeBlockContent_biex"><pre tabindex="0" class="prism-code language-text codeBlock_bY9V thin-scrollbar"><code class="codeBlockLines_e6Vv"><span class="token-line" style="color:#bfc7d5"><span class="token plain">spark.jars.packages io.acryl:acryl-spark-lineage:0.2.17</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">spark.extraListeners datahub.spark.DatahubSparkListener</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">spark.datahub.rest.server https://your_datahub_host/gms</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">#If you have authentication set up then you also need to specify the Datahub access token</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">spark.datahub.rest.token yourtoken</span><br></span></code></pre><div class="buttonGroup__atx"><button type="button" aria-label="Copy code to clipboard" title="Copy" class="clean-btn"><span class="copyButtonIcons_eSgA" aria-hidden="true"><svg viewBox="0 0 24 24" class="copyButtonIcon_y97N"><path fill="currentColor" d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"></path></svg><svg viewBox="0 0 24 24" class="copyButtonSuccessIcon_LjdS"><path fill="currentColor" d="M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z"></path></svg></span></button></div></div></div><h3 class="anchor anchorWithStickyNavbar_LWe7" id="configuration-instructions-notebooks">Configuration Instructions: Notebooks<a href="#configuration-instructions-notebooks" class="hash-link" aria-label="Direct link to Configuration Instructions: Notebooks" title="Direct link to Configuration Instructions: Notebooks"></a></h3><p>When running interactive jobs from a notebook, the listener can be configured while building the Spark Session.</p><div class="language-python codeBlockContainer_Ckt0 theme-code-block" style="--prism-color:#bfc7d5;--prism-background-color:#292d3e"><div class="codeBlockContent_biex"><pre tabindex="0" class="prism-code language-python codeBlock_bY9V thin-scrollbar"><code class="codeBlockLines_e6Vv"><span class="token-line" style="color:#bfc7d5"><span class="token plain">spark </span><span class="token operator" style="color:rgb(137, 221, 255)">=</span><span class="token plain"> SparkSession</span><span class="token punctuation" style="color:rgb(199, 146, 234)">.</span><span class="token plain">builder</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain"></span><span class="token punctuation" style="color:rgb(199, 146, 234)">.</span><span class="token plain">master</span><span class="token punctuation" style="color:rgb(199, 146, 234)">(</span><span class="token string" style="color:rgb(195, 232, 141)">"spark://spark-master:7077"</span><span class="token punctuation" style="color:rgb(199, 146, 234)">)</span><span class="token plain"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain"></span><span class="token punctuation" style="color:rgb(199, 146, 234)">.</span><span class="token plain">appName</span><span class="token punctuation" style="color:rgb(199, 146, 234)">(</span><span class="token string" style="color:rgb(195, 232, 141)">"test-application"</span><span class="token punctuation" style="color:rgb(199, 146, 234)">)</span><span class="token plain"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain"></span><span class="token punctuation" style="color:rgb(199, 146, 234)">.</span><span class="token plain">config</span><span class="token punctuation" style="color:rgb(199, 146, 234)">(</span><span class="token string" style="color:rgb(195, 232, 141)">"spark.jars.packages"</span><span class="token punctuation" style="color:rgb(199, 146, 234)">,</span><span class="token plain"> </span><span class="token string" style="color:rgb(195, 232, 141)">"io.acryl:acryl-spark-lineage:0.2.17"</span><span class="token punctuation" style="color:rgb(199, 146, 234)">)</span><span class="token plain"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain"></span><span class="token punctuation" style="color:rgb(199, 146, 234)">.</span><span class="token plain">config</span><span class="token punctuation" style="color:rgb(199, 146, 234)">(</span><span class="token string" style="color:rgb(195, 232, 141)">"spark.extraListeners"</span><span class="token punctuation" style="color:rgb(199, 146, 234)">,</span><span class="token plain"> </span><span class="token string" style="color:rgb(195, 232, 141)">"datahub.spark.DatahubSparkListener"</span><span class="token punctuation" style="color:rgb(199, 146, 234)">)</span><span class="token plain"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain"></span><span class="token punctuation" style="color:rgb(199, 146, 234)">.</span><span class="token plain">config</span><span class="token punctuation" style="color:rgb(199, 146, 234)">(</span><span class="token string" style="color:rgb(195, 232, 141)">"spark.datahub.rest.server"</span><span class="token punctuation" style="color:rgb(199, 146, 234)">,</span><span class="token plain"> </span><span class="token string" style="color:rgb(195, 232, 141)">"http://localhost:8080"</span><span class="token punctuation" style="color:rgb(199, 146, 234)">)</span><span class="token plain"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain"></span><span class="token punctuation" style="color:rgb(199, 146, 234)">.</span><span class="token plain">enableHiveSupport</span><span class="token punctuation" style="color:rgb(199, 146, 234)">(</span><span class="token punctuation" style="color:rgb(199, 146, 234)">)</span><span class="token plain"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain"></span><span class="token punctuation" style="color:rgb(199, 146, 234)">.</span><span class="token plain">getOrCreate</span><span class="token punctuation" style="color:rgb(199, 146, 234)">(</span><span class="token punctuation" style="color:rgb(199, 146, 234)">)</span><br></span></code></pre><div class="buttonGroup__atx"><button type="button" aria-label="Copy code to clipboard" title="Copy" class="clean-btn"><span class="copyButtonIcons_eSgA" aria-hidden="true"><svg viewBox="0 0 24 24" class="copyButtonIcon_y97N"><path fill="currentColor" d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"></path></svg><svg viewBox="0 0 24 24" class="copyButtonSuccessIcon_LjdS"><path fill="currentColor" d="M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z"></path></svg></span></button></div></div></div><h3 class="anchor anchorWithStickyNavbar_LWe7" id="configuration-instructions-standalone-java-applications">Configuration Instructions: Standalone Java Applications<a href="#configuration-instructions-standalone-java-applications" class="hash-link" aria-label="Direct link to Configuration Instructions: Standalone Java Applications" title="Direct link to Configuration Instructions: Standalone Java Applications"></a></h3><p>The configuration for standalone Java apps is very similar.</p><div class="language-java codeBlockContainer_Ckt0 theme-code-block" style="--prism-color:#bfc7d5;--prism-background-color:#292d3e"><div class="codeBlockContent_biex"><pre tabindex="0" class="prism-code language-java codeBlock_bY9V thin-scrollbar"><code class="codeBlockLines_e6Vv"><span class="token-line" style="color:#bfc7d5"><span class="token plain">spark </span><span class="token operator" style="color:rgb(137, 221, 255)">=</span><span class="token class-name" style="color:rgb(255, 203, 107)">SparkSession</span><span class="token punctuation" style="color:rgb(199, 146, 234)">.</span><span class="token plain"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain"></span><span class="token function" style="color:rgb(130, 170, 255)">builder</span><span class="token punctuation" style="color:rgb(199, 146, 234)">(</span><span class="token punctuation" style="color:rgb(199, 146, 234)">)</span><span class="token plain"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(199, 146, 234)">.</span><span class="token plain"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain"></span><span class="token function" style="color:rgb(130, 170, 255)">appName</span><span class="token punctuation" style="color:rgb(199, 146, 234)">(</span><span class="token string" style="color:rgb(195, 232, 141)">"test-application"</span><span class="token punctuation" style="color:rgb(199, 146, 234)">)</span><span class="token plain"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(199, 146, 234)">.</span><span class="token plain"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain"></span><span class="token function" style="color:rgb(130, 170, 255)">config</span><span class="token punctuation" style="color:rgb(199, 146, 234)">(</span><span class="token string" style="color:rgb(195, 232, 141)">"spark.master"</span><span class="token punctuation" style="color:rgb(199, 146, 234)">,</span><span class="token string" style="color:rgb(195, 232, 141)">"spark://spark-master:7077"</span><span class="token punctuation" style="color:rgb(199, 146, 234)">)</span><span class="token plain"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(199, 146, 234)">.</span><span class="token plain"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain"></span><span class="token function" style="color:rgb(130, 170, 255)">config</span><span class="token punctuation" style="color:rgb(199, 146, 234)">(</span><span class="token string" style="color:rgb(195, 232, 141)">"spark.jars.packages"</span><span class="token punctuation" style="color:rgb(199, 146, 234)">,</span><span class="token string" style="color:rgb(195, 232, 141)">"io.acryl:acryl-spark-lineage:0.2.17"</span><span class="token punctuation" style="color:rgb(199, 146, 234)">)</span><span class="token plain"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(199, 146, 234)">.</span><span class="token plain"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain"></span><span class="token function" style="color:rgb(130, 170, 255)">config</span><span class="token punctuation" style="color:rgb(199, 146, 234)">(</span><span class="token string" style="color:rgb(195, 232, 141)">"spark.extraListeners"</span><span class="token punctuation" style="color:rgb(199, 146, 234)">,</span><span class="token string" style="color:rgb(195, 232, 141)">"datahub.spark.DatahubSparkListener"</span><span class="token punctuation" style="color:rgb(199, 146, 234)">)</span><span class="token plain"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(199, 146, 234)">.</span><span class="token plain"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain"></span><span class="token function" style="color:rgb(130, 170, 255)">config</span><span class="token punctuation" style="color:rgb(199, 146, 234)">(</span><span class="token string" style="color:rgb(195, 232, 141)">"spark.datahub.rest.server"</span><span class="token punctuation" style="color:rgb(199, 146, 234)">,</span><span class="token string" style="color:rgb(195, 232, 141)">"http://localhost:8080"</span><span class="token punctuation" style="color:rgb(199, 146, 234)">)</span><span class="token plain"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(199, 146, 234)">.</span><span class="token plain"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain"></span><span class="token function" style="color:rgb(130, 170, 255)">enableHiveSupport</span><span class="token punctuation" style="color:rgb(199, 146, 234)">(</span><span class="token punctuation" style="color:rgb(199, 146, 234)">)</span><span class="token plain"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(199, 146, 234)">.</span><span class="token plain"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain"></span><span class="token function" style="color:rgb(130, 170, 255)">getOrCreate</span><span class="token punctuation" style="color:rgb(199, 146, 234)">(</span><span class="token punctuation" style="color:rgb(199, 146, 234)">)</span><span class="token punctuation" style="color:rgb(199, 146, 234)">;</span><br></span></code></pre><div class="buttonGroup__atx"><button type="button" aria-label="Copy code to clipboard" title="Copy" class="clean-btn"><span class="copyButtonIcons_eSgA" aria-hidden="true"><svg viewBox="0 0 24 24" class="copyButtonIcon_y97N"><path fill="currentColor" d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"></path></svg><svg viewBox="0 0 24 24" class="copyButtonSuccessIcon_LjdS"><path fill="currentColor" d="M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z"></path></svg></span></button></div></div></div><h3 class="anchor anchorWithStickyNavbar_LWe7" id="configuration-instructions-databricks">Configuration Instructions: Databricks<a href="#configuration-instructions-databricks" class="hash-link" aria-label="Direct link to Configuration Instructions: Databricks" title="Direct link to Configuration Instructions: Databricks"></a></h3><p>The Spark agent can be configured using Databricks
|
||
Cluster <a href="https://docs.databricks.com/clusters/configure.html#spark-configuration" target="_blank" rel="noopener noreferrer">Spark configuration</a>
|
||
and <a href="https://docs.databricks.com/clusters/configure.html#init-scripts" target="_blank" rel="noopener noreferrer">Init script</a>.</p><p><a href="https://docs.databricks.com/security/secrets/secrets.html" target="_blank" rel="noopener noreferrer">Databricks Secrets</a> can be leveraged to store sensitive
|
||
information like tokens.</p><ul><li><p>Download <code>datahub-spark-lineage</code> jar
|
||
from <a href="https://s01.oss.sonatype.org/content/groups/public/io/acryl/acryl-spark-lineage/" target="_blank" rel="noopener noreferrer">the Maven central repository</a>.</p></li><li><p>Create <code>init.sh</code> with below content</p><div class="language-sh codeBlockContainer_Ckt0 theme-code-block" style="--prism-color:#bfc7d5;--prism-background-color:#292d3e"><div class="codeBlockContent_biex"><pre tabindex="0" class="prism-code language-sh codeBlock_bY9V thin-scrollbar"><code class="codeBlockLines_e6Vv"><span class="token-line" style="color:#bfc7d5"><span class="token plain">#!/bin/bash</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">cp /dbfs/datahub/datahub-spark-lineage*.jar /databricks/jars</span><br></span></code></pre><div class="buttonGroup__atx"><button type="button" aria-label="Copy code to clipboard" title="Copy" class="clean-btn"><span class="copyButtonIcons_eSgA" aria-hidden="true"><svg viewBox="0 0 24 24" class="copyButtonIcon_y97N"><path fill="currentColor" d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"></path></svg><svg viewBox="0 0 24 24" class="copyButtonSuccessIcon_LjdS"><path fill="currentColor" d="M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z"></path></svg></span></button></div></div></div></li><li><p>Install and configure <a href="https://docs.databricks.com/dev-tools/cli/index.html" target="_blank" rel="noopener noreferrer">Databricks CLI</a>.</p></li><li><p>Copy jar and init script to Databricks File System(DBFS) using Databricks CLI.</p><div class="language-sh codeBlockContainer_Ckt0 theme-code-block" style="--prism-color:#bfc7d5;--prism-background-color:#292d3e"><div class="codeBlockContent_biex"><pre tabindex="0" class="prism-code language-sh codeBlock_bY9V thin-scrollbar"><code class="codeBlockLines_e6Vv"><span class="token-line" style="color:#bfc7d5"><span class="token plain">databricks fs mkdirs dbfs:/datahub</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">databricks fs cp --overwrite datahub-spark-lineage*.jar dbfs:/datahub</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">databricks fs cp --overwrite init.sh dbfs:/datahub</span><br></span></code></pre><div class="buttonGroup__atx"><button type="button" aria-label="Copy code to clipboard" title="Copy" class="clean-btn"><span class="copyButtonIcons_eSgA" aria-hidden="true"><svg viewBox="0 0 24 24" class="copyButtonIcon_y97N"><path fill="currentColor" d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"></path></svg><svg viewBox="0 0 24 24" class="copyButtonSuccessIcon_LjdS"><path fill="currentColor" d="M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z"></path></svg></span></button></div></div></div></li><li><p>Open Databricks Cluster configuration page. Click the <strong>Advanced Options</strong> toggle. Click the <strong>Spark</strong> tab. Add below
|
||
configurations under <code>Spark Config</code>.</p><div class="language-text codeBlockContainer_Ckt0 theme-code-block" style="--prism-color:#bfc7d5;--prism-background-color:#292d3e"><div class="codeBlockContent_biex"><pre tabindex="0" class="prism-code language-text codeBlock_bY9V thin-scrollbar"><code class="codeBlockLines_e6Vv"><span class="token-line" style="color:#bfc7d5"><span class="token plain">spark.extraListeners datahub.spark.DatahubSparkListener</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">spark.datahub.rest.server http://localhost:8080</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">spark.datahub.stage_metadata_coalescing true</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">spark.datahub.databricks.cluster cluster-name<any preferred cluster identifier></span><br></span></code></pre><div class="buttonGroup__atx"><button type="button" aria-label="Copy code to clipboard" title="Copy" class="clean-btn"><span class="copyButtonIcons_eSgA" aria-hidden="true"><svg viewBox="0 0 24 24" class="copyButtonIcon_y97N"><path fill="currentColor" d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"></path></svg><svg viewBox="0 0 24 24" class="copyButtonSuccessIcon_LjdS"><path fill="currentColor" d="M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z"></path></svg></span></button></div></div></div></li><li><p>Click the <strong>Init Scripts</strong> tab. Set cluster init script as <code>dbfs:/datahub/init.sh</code>.</p></li><li><p>Configuring DataHub authentication token</p><ul><li><p>Add below config in cluster spark config.</p><div class="language-text codeBlockContainer_Ckt0 theme-code-block" style="--prism-color:#bfc7d5;--prism-background-color:#292d3e"><div class="codeBlockContent_biex"><pre tabindex="0" class="prism-code language-text codeBlock_bY9V thin-scrollbar"><code class="codeBlockLines_e6Vv"><span class="token-line" style="color:#bfc7d5"><span class="token plain">spark.datahub.rest.token <token></span><br></span></code></pre><div class="buttonGroup__atx"><button type="button" aria-label="Copy code to clipboard" title="Copy" class="clean-btn"><span class="copyButtonIcons_eSgA" aria-hidden="true"><svg viewBox="0 0 24 24" class="copyButtonIcon_y97N"><path fill="currentColor" d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"></path></svg><svg viewBox="0 0 24 24" class="copyButtonSuccessIcon_LjdS"><path fill="currentColor" d="M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z"></path></svg></span></button></div></div></div></li><li><p>Alternatively, Databricks secrets can be used to secure token.</p><ul><li><p>Create secret using Databricks CLI.</p><div class="language-sh codeBlockContainer_Ckt0 theme-code-block" style="--prism-color:#bfc7d5;--prism-background-color:#292d3e"><div class="codeBlockContent_biex"><pre tabindex="0" class="prism-code language-sh codeBlock_bY9V thin-scrollbar"><code class="codeBlockLines_e6Vv"><span class="token-line" style="color:#bfc7d5"><span class="token plain">databricks secrets create-scope --scope datahub --initial-manage-principal users</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">databricks secrets put --scope datahub --key rest-token</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">databricks secrets list --scope datahub &lt;&lt;Edit prompted file with token value&gt;&gt;</span><br></span></code></pre><div class="buttonGroup__atx"><button type="button" aria-label="Copy code to clipboard" title="Copy" class="clean-btn"><span class="copyButtonIcons_eSgA" aria-hidden="true"><svg viewBox="0 0 24 24" class="copyButtonIcon_y97N"><path fill="currentColor" d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"></path></svg><svg viewBox="0 0 24 24" class="copyButtonSuccessIcon_LjdS"><path fill="currentColor" d="M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z"></path></svg></span></button></div></div></div></li><li><p>Add in spark config</p><div class="language-text codeBlockContainer_Ckt0 theme-code-block" style="--prism-color:#bfc7d5;--prism-background-color:#292d3e"><div class="codeBlockContent_biex"><pre tabindex="0" class="prism-code language-text codeBlock_bY9V thin-scrollbar"><code class="codeBlockLines_e6Vv"><span class="token-line" style="color:#bfc7d5"><span class="token plain">spark.datahub.rest.token {{secrets/datahub/rest-token}}</span><br></span></code></pre><div class="buttonGroup__atx"><button type="button" aria-label="Copy code to clipboard" title="Copy" class="clean-btn"><span class="copyButtonIcons_eSgA" aria-hidden="true"><svg viewBox="0 0 24 24" class="copyButtonIcon_y97N"><path fill="currentColor" d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"></path></svg><svg viewBox="0 0 24 24" class="copyButtonSuccessIcon_LjdS"><path fill="currentColor" d="M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z"></path></svg></span></button></div></div></div></li></ul></li></ul></li></ul><h2 class="anchor anchorWithStickyNavbar_LWe7" id="configuration-options">Configuration Options<a href="#configuration-options" class="hash-link" aria-label="Direct link to Configuration Options" title="Direct link to Configuration Options"></a></h2><table><thead><tr><th>Field</th><th>Required</th><th>Default</th><th>Description</th><th></th></tr></thead><tbody><tr><td>spark.jars.packages</td><td>✅</td><td></td><td>Set with latest/required version io.acryl:acryl-spark-lineage:0.2.15</td><td></td></tr><tr><td>spark.extraListeners</td><td>✅</td><td></td><td>datahub.spark.DatahubSparkListener</td><td></td></tr><tr><td>spark.datahub.emitter</td><td></td><td>rest</td><td>Specify the ways to emit metadata. By default it sends to DataHub using REST emitter. Valid options are rest, kafka or file</td><td></td></tr><tr><td>spark.datahub.rest.server</td><td></td><td>http://localhost:8080</td><td>Datahub server url eg:<a href="http://localhost:8080" target="_blank" rel="noopener noreferrer">http://localhost:8080</a></td><td></td></tr><tr><td>spark.datahub.rest.token</td><td></td><td></td><td>Authentication token.</td><td></td></tr><tr><td>spark.datahub.rest.disable_ssl_verification</td><td></td><td>false</td><td>Disable SSL certificate validation. Caution: Only use this if you know what you are doing!</td><td></td></tr><tr><td>spark.datahub.rest.disable_chunked_encoding</td><td></td><td>false</td><td>Disable Chunked Transfer Encoding. In some environment chunked encoding causes issues. With this config option it can be disabled.</td><td></td></tr><tr><td>spark.datahub.rest.max_retries</td><td></td><td>0</td><td>Number of times a request retried if failed</td><td></td></tr><tr><td>spark.datahub.rest.retry_interval</td><td></td><td>10</td><td>Number of seconds to wait between retries</td><td></td></tr><tr><td>spark.datahub.file.filename</td><td></td><td></td><td>The file where metadata will be written if file emitter is set</td><td></td></tr><tr><td>spark.datahub.kafka.bootstrap</td><td></td><td></td><td>The Kafka bootstrap server url to use if the Kafka emitter is set</td><td></td></tr><tr><td>spark.datahub.kafka.schema_registry_url</td><td></td><td></td><td>The Schema registry url to use if the Kafka emitter is set</td><td></td></tr><tr><td>spark.datahub.kafka.schema_registry_config.</td><td></td><td></td><td>Additional config to pass in to the Schema Registry Client</td><td></td></tr><tr><td>spark.datahub.kafka.producer_config.</td><td></td><td></td><td>Additional config to pass in to the Kafka producer. For example: <code>--conf "spark.datahub.kafka.producer_config.client.id=my_client_id"</code></td><td></td></tr><tr><td>spark.datahub.metadata.pipeline.platformInstance</td><td></td><td></td><td>Pipeline level platform instance</td><td></td></tr><tr><td>spark.datahub.metadata.dataset.platformInstance</td><td></td><td></td><td>dataset level platform instance (it is usefult to set if you have it in your glue ingestion)</td><td></td></tr><tr><td>spark.datahub.metadata.dataset.env</td><td></td><td>PROD</td><td><a href="/docs/graphql/enums#fabrictype">Supported values</a>. In all other cases, will fallback to PROD</td><td></td></tr><tr><td>spark.datahub.metadata.dataset.hivePlatformAlias</td><td></td><td>hive</td><td>By default, datahub assigns Hive-like tables to the Hive platform. If you are using Glue as your Hive metastore, set this config flag to <code>glue</code></td><td></td></tr><tr><td>spark.datahub.metadata.include_scheme</td><td></td><td>true</td><td>Include scheme from the path URI (e.g. hdfs://, s3://) in the dataset URN. We recommend setting this value to false, it is set to true for backwards compatibility with previous versions</td><td></td></tr><tr><td>spark.datahub.metadata.remove_partition_pattern</td><td></td><td></td><td>Remove partition pattern. (e.g. /partition=\d+) It change database/table/partition=123 to database/table</td><td></td></tr><tr><td>spark.datahub.coalesce_jobs</td><td></td><td>true</td><td>Only one datajob(task) will be emitted containing all input and output datasets for the spark application</td><td></td></tr><tr><td>spark.datahub.parent.datajob_urn</td><td></td><td></td><td>Specified dataset will be set as upstream dataset for datajob created. Effective only when spark.datahub.coalesce_jobs is set to true</td><td></td></tr><tr><td>spark.datahub.metadata.dataset.materialize</td><td></td><td>false</td><td>Materialize Datasets in DataHub</td><td></td></tr><tr><td>spark.datahub.platform.s3.path_spec_list</td><td></td><td></td><td>List of pathspec per platform</td><td></td></tr><tr><td>spark.datahub.metadata.dataset.include_schema_metadata</td><td>false</td><td></td><td>Emit dataset schema metadata based on the spark execution. It is recommended to get schema information from platform specific DataHub sources as this is less reliable</td><td></td></tr><tr><td>spark.datahub.flow_name</td><td></td><td></td><td>If it is set it will be used as the DataFlow name otherwise it uses spark app name as flow_name</td><td></td></tr><tr><td>spark.datahub.file_partition_regexp</td><td></td><td></td><td>Strip partition part from the path if path end matches with the specified regexp. Example <code>year=.*/month=.*/day=.*</code></td><td></td></tr><tr><td>spark.datahub.tags</td><td></td><td></td><td>Comma separated list of tags to attach to the DataFlow</td><td></td></tr><tr><td>spark.datahub.domains</td><td></td><td></td><td>Comma separated list of domain urns to attach to the DataFlow</td><td></td></tr><tr><td>spark.datahub.stage_metadata_coalescing</td><td></td><td></td><td>Normally it coalesces and sends metadata at the onApplicationEnd event which is never called on Databricks or on Glue. You should enable this on Databricks if you want coalesced run.</td><td></td></tr><tr><td>spark.datahub.patch.enabled</td><td></td><td>false</td><td>Set this to true to send lineage as a patch, which appends rather than overwrites existing Dataset lineage edges. By default, it is disabled.</td><td></td></tr><tr><td>spark.datahub.metadata.dataset.lowerCaseUrns</td><td></td><td>false</td><td>Set this to true to lowercase dataset urns. By default, it is disabled.</td><td></td></tr><tr><td>spark.datahub.disableSymlinkResolution</td><td></td><td>false</td><td>Set this to true if you prefer using the s3 location instead of the Hive table. By default, it is disabled.</td><td></td></tr><tr><td>spark.datahub.s3.bucket</td><td></td><td></td><td>The name of the bucket where metadata will be written if s3 emitter is set</td><td></td></tr><tr><td>spark.datahub.s3.prefix</td><td></td><td></td><td>The prefix for the file where metadata will be written on s3 if s3 emitter is set</td><td></td></tr><tr><td>spark.datahub.s3.filename</td><td></td><td></td><td>The name of the file where metadata will be written if it is not set random filename will be used on s3 if s3 emitter is set</td><td></td></tr><tr><td>spark.datahub.s3.filename</td><td></td><td></td><td>The name of the file where metadata will be written if it is not set random filename will be used on s3 if s3 emitter is set</td><td></td></tr><tr><td>spark.datahub.log.mcps</td><td></td><td>true</td><td>Set this to true to log MCPS to the log. By default, it is enabled.</td><td></td></tr><tr><td>spark.datahub.legacyLineageCleanup.enabled</td><td></td><td>false</td><td>Set this to true to remove legacy lineages from older Spark Plugin runs. This will remove those lineages from the Datasets which it adds to DataJob. By default, it is disabled.</td><td></td></tr></tbody></table><h2 class="anchor anchorWithStickyNavbar_LWe7" id="what-to-expect-the-metadata-model">What to Expect: The Metadata Model<a href="#what-to-expect-the-metadata-model" class="hash-link" aria-label="Direct link to What to Expect: The Metadata Model" title="Direct link to What to Expect: The Metadata Model"></a></h2><p>As of current writing, the Spark agent produces metadata related to the Spark job, tasks and lineage edges to datasets.</p><ul><li>A pipeline is created per Spark <master, appName>.</li><li>A task is created per unique Spark query execution within an app.</li></ul><p>For Spark on Databricks,</p><ul><li>A pipeline is created per<ul><li>cluster_identifier: specified with spark.datahub.databricks.cluster</li><li>applicationID: on every restart of the cluster new spark applicationID will be created.</li></ul></li><li>A task is created per unique Spark query execution.</li></ul><h3 class="anchor anchorWithStickyNavbar_LWe7" id="custom-properties--relating-to-spark-ui">Custom properties & relating to Spark UI<a href="#custom-properties--relating-to-spark-ui" class="hash-link" aria-label="Direct link to Custom properties & relating to Spark UI" title="Direct link to Custom properties & relating to Spark UI"></a></h3><p>The following custom properties in pipelines and tasks relate to the Spark UI:</p><ul><li>appName and appId in a pipeline can be used to determine the Spark application</li><li>Other custom properties of pipelines and tasks capture the start and end times of execution etc.</li></ul><p>For Spark on Databricks, pipeline start time is the cluster start time.</p><h3 class="anchor anchorWithStickyNavbar_LWe7" id="spark-versions-supported">Spark versions supported<a href="#spark-versions-supported" class="hash-link" aria-label="Direct link to Spark versions supported" title="Direct link to Spark versions supported"></a></h3><p>Supports Spark 3.x series.</p><h3 class="anchor anchorWithStickyNavbar_LWe7" id="environments-tested-with">Environments tested with<a href="#environments-tested-with" class="hash-link" aria-label="Direct link to Environments tested with" title="Direct link to Environments tested with"></a></h3><p>This initial release has been tested with the following environments:</p><ul><li>spark-submit of Python/Java applications to local and remote servers</li><li>Standalone Java applications</li><li>Databricks Standalone Cluster</li><li>EMR</li></ul><p>Testing with Databricks Standard and High-concurrency Cluster is not done yet.</p><h3 class="anchor anchorWithStickyNavbar_LWe7" id="configuring-hdfs-based-dataset-urns">Configuring Hdfs based dataset URNs<a href="#configuring-hdfs-based-dataset-urns" class="hash-link" aria-label="Direct link to Configuring Hdfs based dataset URNs" title="Direct link to Configuring Hdfs based dataset URNs"></a></h3><p>Spark emits lineage between datasets. It has its own logic for generating urns. Python sources emit metadata of
|
||
datasets. To link these 2 things, urns generated by both have to match.
|
||
This section will help you to match urns to that of other ingestion sources.
|
||
By default, URNs are created using
|
||
template <code>urn:li:dataset:(urn:li:dataPlatform:<$platform>,<platformInstance>.<name>,<env>)</code>. We can configure these 4
|
||
things to generate the desired urn.</p><p><strong>Platform</strong>:
|
||
Hdfs-based platforms supported explicitly:</p><ul><li>AWS S3 (s3)</li><li>Google Cloud Storage (gcs)</li><li>local ( local file system) (local)
|
||
All other platforms will have "hdfs" as a platform.</li></ul><p><strong>Name</strong>:
|
||
By default, the name is the complete path. For Hdfs base datasets, tables can be at different levels in the path than
|
||
that of the actual file read due to various reasons like partitioning, and sharding. 'path_spec' is used to alter the
|
||
name.
|
||
{table} marker is used to specify the table level. Below are a few examples. One can specify multiple path_specs for
|
||
different paths specified in the <code>path_spec_list</code>. Each actual path is matched against all path_spes present in the
|
||
list. First, one to match will be used to generate urn.</p><p><strong>path_spec Examples</strong></p><div class="codeBlockContainer_Ckt0 theme-code-block" style="--prism-color:#bfc7d5;--prism-background-color:#292d3e"><div class="codeBlockContent_biex"><pre tabindex="0" class="prism-code language-text codeBlock_bY9V thin-scrollbar"><code class="codeBlockLines_e6Vv"><span class="token-line" style="color:#bfc7d5"><span class="token plain">spark.datahub.platform.s3.path_spec_list=s3://my-bucket/foo/{table}/year=*/month=*/day=*/*,s3://my-other-bucket/foo/{table}/year=*/month=*/day=*/*"</span><br></span></code></pre><div class="buttonGroup__atx"><button type="button" aria-label="Copy code to clipboard" title="Copy" class="clean-btn"><span class="copyButtonIcons_eSgA" aria-hidden="true"><svg viewBox="0 0 24 24" class="copyButtonIcon_y97N"><path fill="currentColor" d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"></path></svg><svg viewBox="0 0 24 24" class="copyButtonSuccessIcon_LjdS"><path fill="currentColor" d="M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z"></path></svg></span></button></div></div></div><table><thead><tr><th>Absolute path</th><th>path_spec</th><th>Urn</th></tr></thead><tbody><tr><td>s3://my-bucket/foo/tests/bar.avro</td><td>Not provided</td><td>urn:li:dataset:(urn:li:dataPlatform:s3,my-bucket/foo/tests/bar.avro,PROD)</td></tr><tr><td>s3://my-bucket/foo/tests/bar.avro</td><td>s3://my-bucket/foo/{table}/<!-- -->*</td><td>urn:li:dataset:(urn:li:dataPlatform:s3,my-bucket/foo/tests,PROD)</td></tr><tr><td>s3://my-bucket/foo/tests/bar.avro</td><td>s3://my-bucket/foo/tests/{table}</td><td>urn:li:dataset:(urn:li:dataPlatform:s3,my-bucket/foo/tests/bar.avro,PROD)</td></tr><tr><td>gs://my-bucket/foo/tests/bar.avro</td><td>gs://my-bucket/{table}/<em>/</em></td><td>urn:li:dataset:(urn:li:dataPlatform:gcs,my-bucket/foo,PROD)</td></tr><tr><td>gs://my-bucket/foo/tests/bar.avro</td><td>gs://my-bucket/{table}</td><td>urn:li:dataset:(urn:li:dataPlatform:gcs,my-bucket/foo,PROD)</td></tr><tr><td>file:///my-bucket/foo/tests/bar.avro</td><td>file:///my-bucket/<em>/</em>/{table}</td><td>urn:li:dataset:(urn:li:dataPlatform:local,my-bucket/foo/tests/bar.avro,PROD)</td></tr></tbody></table><p><strong>platform instance and env:</strong></p><p>The default value for env is 'PROD' and the platform instance is None. env and platform instances can be set for all
|
||
datasets using configurations 'spark.datahub.metadata.dataset.env' and 'spark.datahub.metadata.dataset.platformInstace'.
|
||
If spark is processing data that belongs to a different env or platform instance, then 'path_alias' can be used to
|
||
specify <code>path_spec</code> specific values of these. 'path_alias' groups the 'path_spec_list', its env, and platform instance
|
||
together.</p><p>path_alias_list Example:</p><p>The below example explains the configuration of the case, where files from 2 buckets are being processed in a single
|
||
spark application and files from my-bucket are supposed to have "instance1" as platform instance and "PROD" as env, and
|
||
files from bucket2 should have env "DEV" in their dataset URNs.</p><div class="codeBlockContainer_Ckt0 theme-code-block" style="--prism-color:#bfc7d5;--prism-background-color:#292d3e"><div class="codeBlockContent_biex"><pre tabindex="0" class="prism-code language-text codeBlock_bY9V thin-scrollbar"><code class="codeBlockLines_e6Vv"><span class="token-line" style="color:#bfc7d5"><span class="token plain">spark.datahub.platform.s3.path_alias_list : path1,path2</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">spark.datahub.platform.s3.path1.env : PROD</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">spark.datahub.platform.s3.path1.path_spec_list: s3://my-bucket/*/*/{table}</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">spark.datahub.platform.s3.path1.platform_instance : instance-1</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">spark.datahub.platform.s3.path2.env: DEV</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">spark.datahub.platform.s3.path2.path_spec_list: s3://bucket2/*/{table}</span><br></span></code></pre><div class="buttonGroup__atx"><button type="button" aria-label="Copy code to clipboard" title="Copy" class="clean-btn"><span class="copyButtonIcons_eSgA" aria-hidden="true"><svg viewBox="0 0 24 24" class="copyButtonIcon_y97N"><path fill="currentColor" d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"></path></svg><svg viewBox="0 0 24 24" class="copyButtonSuccessIcon_LjdS"><path fill="currentColor" d="M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z"></path></svg></span></button></div></div></div><h3 class="anchor anchorWithStickyNavbar_LWe7" id="important-notes-on-usage">Important notes on usage<a href="#important-notes-on-usage" class="hash-link" aria-label="Direct link to Important notes on usage" title="Direct link to Important notes on usage"></a></h3><ul><li>It is advisable to ensure appName is used appropriately to ensure you can trace lineage from a pipeline back to your
|
||
source code.</li><li>If multiple apps with the same appName run concurrently, dataset-lineage will be captured correctly but the
|
||
custom-properties e.g. app-id, SQLQueryId would be unreliable. We expect this to be quite rare.</li><li>If spark execution fails, then an empty pipeline would still get created, but it may not have any tasks.</li><li>For HDFS sources, the folder (name) is regarded as the dataset (name) to align with typical storage of parquet/csv
|
||
formats.</li></ul><h3 class="anchor anchorWithStickyNavbar_LWe7" id="debugging">Debugging<a href="#debugging" class="hash-link" aria-label="Direct link to Debugging" title="Direct link to Debugging"></a></h3><ul><li>Following info logs are generated</li></ul><p>On Spark context startup</p><div class="language-text codeBlockContainer_Ckt0 theme-code-block" style="--prism-color:#bfc7d5;--prism-background-color:#292d3e"><div class="codeBlockContent_biex"><pre tabindex="0" class="prism-code language-text codeBlock_bY9V thin-scrollbar"><code class="codeBlockLines_e6Vv"><span class="token-line" style="color:#bfc7d5"><span class="token plain">YY/MM/DD HH:mm:ss INFO DatahubSparkListener: DatahubSparkListener initialised.</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">YY/MM/DD HH:mm:ss INFO SparkContext: Registered listener datahub.spark.DatahubSparkListener</span><br></span></code></pre><div class="buttonGroup__atx"><button type="button" aria-label="Copy code to clipboard" title="Copy" class="clean-btn"><span class="copyButtonIcons_eSgA" aria-hidden="true"><svg viewBox="0 0 24 24" class="copyButtonIcon_y97N"><path fill="currentColor" d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"></path></svg><svg viewBox="0 0 24 24" class="copyButtonSuccessIcon_LjdS"><path fill="currentColor" d="M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z"></path></svg></span></button></div></div></div><p>On application start</p><div class="language-text codeBlockContainer_Ckt0 theme-code-block" style="--prism-color:#bfc7d5;--prism-background-color:#292d3e"><div class="codeBlockContent_biex"><pre tabindex="0" class="prism-code language-text codeBlock_bY9V thin-scrollbar"><code class="codeBlockLines_e6Vv"><span class="token-line" style="color:#bfc7d5"><span class="token plain">YY/MM/DD HH:mm:ss INFO DatahubSparkListener: Application started: SparkListenerApplicationStart(AppName,Some(local-1644489736794),1644489735772,user,None,None)</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">YY/MM/DD HH:mm:ss INFO McpEmitter: REST Emitter Configuration: GMS url <rest.server></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">YY/MM/DD HH:mm:ss INFO McpEmitter: REST Emitter Configuration: Token XXXXX</span><br></span></code></pre><div class="buttonGroup__atx"><button type="button" aria-label="Copy code to clipboard" title="Copy" class="clean-btn"><span class="copyButtonIcons_eSgA" aria-hidden="true"><svg viewBox="0 0 24 24" class="copyButtonIcon_y97N"><path fill="currentColor" d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"></path></svg><svg viewBox="0 0 24 24" class="copyButtonSuccessIcon_LjdS"><path fill="currentColor" d="M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z"></path></svg></span></button></div></div></div><p>On pushing data to server</p><div class="language-text codeBlockContainer_Ckt0 theme-code-block" style="--prism-color:#bfc7d5;--prism-background-color:#292d3e"><div class="codeBlockContent_biex"><pre tabindex="0" class="prism-code language-text codeBlock_bY9V thin-scrollbar"><code class="codeBlockLines_e6Vv"><span class="token-line" style="color:#bfc7d5"><span class="token plain">YY/MM/DD HH:mm:ss INFO McpEmitter: MetadataWriteResponse(success=true, responseContent={"value":"<URN>"}, underlyingResponse=HTTP/1.1 200 OK [Date: day, DD month year HH:mm:ss GMT, Content-Type: application/json, X-RestLi-Protocol-Version: 2.0.0, Content-Length: 97, Server: Jetty(9.4.46.v20220331)] [Content-Length: 97,Chunked: false])</span><br></span></code></pre><div class="buttonGroup__atx"><button type="button" aria-label="Copy code to clipboard" title="Copy" class="clean-btn"><span class="copyButtonIcons_eSgA" aria-hidden="true"><svg viewBox="0 0 24 24" class="copyButtonIcon_y97N"><path fill="currentColor" d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"></path></svg><svg viewBox="0 0 24 24" class="copyButtonSuccessIcon_LjdS"><path fill="currentColor" d="M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z"></path></svg></span></button></div></div></div><p>On application end</p><div class="language-text codeBlockContainer_Ckt0 theme-code-block" style="--prism-color:#bfc7d5;--prism-background-color:#292d3e"><div class="codeBlockContent_biex"><pre tabindex="0" class="prism-code language-text codeBlock_bY9V thin-scrollbar"><code class="codeBlockLines_e6Vv"><span class="token-line" style="color:#bfc7d5"><span class="token plain">YY/MM/DD HH:mm:ss INFO DatahubSparkListener: Application ended : AppName AppID</span><br></span></code></pre><div class="buttonGroup__atx"><button type="button" aria-label="Copy code to clipboard" title="Copy" class="clean-btn"><span class="copyButtonIcons_eSgA" aria-hidden="true"><svg viewBox="0 0 24 24" class="copyButtonIcon_y97N"><path fill="currentColor" d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"></path></svg><svg viewBox="0 0 24 24" class="copyButtonSuccessIcon_LjdS"><path fill="currentColor" d="M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z"></path></svg></span></button></div></div></div><ul><li>To enable debugging logs, add below configuration in log4j.properties file</li></ul><div class="language-properties codeBlockContainer_Ckt0 theme-code-block" style="--prism-color:#bfc7d5;--prism-background-color:#292d3e"><div class="codeBlockContent_biex"><pre tabindex="0" class="prism-code language-properties codeBlock_bY9V thin-scrollbar"><code class="codeBlockLines_e6Vv"><span class="token-line" style="color:#bfc7d5"><span class="token plain">log4j.logger.datahub.spark=DEBUG</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">log4j.logger.datahub.client.rest=DEBUG</span><br></span></code></pre><div class="buttonGroup__atx"><button type="button" aria-label="Copy code to clipboard" title="Copy" class="clean-btn"><span class="copyButtonIcons_eSgA" aria-hidden="true"><svg viewBox="0 0 24 24" class="copyButtonIcon_y97N"><path fill="currentColor" d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"></path></svg><svg viewBox="0 0 24 24" class="copyButtonSuccessIcon_LjdS"><path fill="currentColor" d="M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z"></path></svg></span></button></div></div></div><h2 class="anchor anchorWithStickyNavbar_LWe7" id="how-to-build">How to build<a href="#how-to-build" class="hash-link" aria-label="Direct link to How to build" title="Direct link to How to build"></a></h2><p>Use Java 8 to build the project. The project uses Gradle as the build tool. To build the project, run the following command:</p><div class="language-shell codeBlockContainer_Ckt0 theme-code-block" style="--prism-color:#bfc7d5;--prism-background-color:#292d3e"><div class="codeBlockContent_biex"><pre tabindex="0" class="prism-code language-shell codeBlock_bY9V thin-scrollbar"><code class="codeBlockLines_e6Vv"><span class="token-line" style="color:#bfc7d5"><span class="token plain">./gradlew -PjavaClassVersionDefault</span><span class="token operator" style="color:rgb(137, 221, 255)">=</span><span class="token number" style="color:rgb(247, 140, 108)">8</span><span class="token plain"> :metadata-integration:java:acryl-spark-lineage:shadowJar</span><br></span></code></pre><div class="buttonGroup__atx"><button type="button" aria-label="Copy code to clipboard" title="Copy" class="clean-btn"><span class="copyButtonIcons_eSgA" aria-hidden="true"><svg viewBox="0 0 24 24" class="copyButtonIcon_y97N"><path fill="currentColor" d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"></path></svg><svg viewBox="0 0 24 24" class="copyButtonSuccessIcon_LjdS"><path fill="currentColor" d="M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z"></path></svg></span></button></div></div></div><h2 class="anchor anchorWithStickyNavbar_LWe7" id="known-limitations">Known limitations<a href="#known-limitations" class="hash-link" aria-label="Direct link to Known limitations" title="Direct link to Known limitations"></a></h2><ul><li></li></ul><h2 class="anchor anchorWithStickyNavbar_LWe7" id="changelog">Changelog<a href="#changelog" class="hash-link" aria-label="Direct link to Changelog" title="Direct link to Changelog"></a></h2><h3 class="anchor anchorWithStickyNavbar_LWe7" id="version-0217">Version 0.2.17<a href="#version-0217" class="hash-link" aria-label="Direct link to Version 0.2.17" title="Direct link to Version 0.2.17"></a></h3><ul><li><p><em>Major changes</em>:</p><ul><li>Finegrained lineage is emitted on the DataJob and not on the emitted Datasets. This is the correct behaviour which was not correct earlier. This causes earlier emitted finegrained lineages won't be overwritten by the new ones.
|
||
You can remove the old lineages by setting <code>spark.datahub.legacyLineageCleanup.enabled=true</code>. Make sure you have the latest server if you enable with patch support. (this was introduced since 0.2.17-rc5)</li></ul></li><li><p><em>Changes</em>:</p><ul><li>OpenLineage 1.25.0 upgrade</li><li>Add option to disable chunked encoding in the datahub rest sink -> <code>spark.datahub.rest.disable_chunked_encoding</code></li><li>Add option to specify the mcp kafka topic for the datahub kafka sink -> <code>spark.datahub.kafka.mcp_topic</code></li><li>Add option to remove legacy lineages from older Spark Plugin runs. This will remove those lineages from the Datasets which it adds to DataJob -> <code>spark.datahub.legacyLineageCleanup.enabled</code></li></ul></li><li><p><em>Fixes</em>:</p><ul><li>Fix handling map transformation in the lineage. Earlier it generated wrong lineage for map transformation.</li></ul></li></ul><h3 class="anchor anchorWithStickyNavbar_LWe7" id="version-0216">Version 0.2.16<a href="#version-0216" class="hash-link" aria-label="Direct link to Version 0.2.16" title="Direct link to Version 0.2.16"></a></h3><ul><li>Remove logging DataHub config into logs</li></ul><h3 class="anchor anchorWithStickyNavbar_LWe7" id="version-0215">Version 0.2.15<a href="#version-0215" class="hash-link" aria-label="Direct link to Version 0.2.15" title="Direct link to Version 0.2.15"></a></h3><ul><li>Add Kafka emitter to emit lineage to kafka</li><li>Add File emitter to emit lineage to file</li><li>Add S3 emitter to save mcps to s3</li><li>Upgrading OpenLineage to 1.19.0</li><li>Renaming project to acryl-datahub-spark-lineage</li><li>Supporting OpenLineage 1.17+ glue identifier changes</li><li>Fix handling OpenLineage input/output where wasn't any facet attached</li></ul><h3 class="anchor anchorWithStickyNavbar_LWe7" id="version-0214">Version 0.2.14<a href="#version-0214" class="hash-link" aria-label="Direct link to Version 0.2.14" title="Direct link to Version 0.2.14"></a></h3><ul><li>Fix warning about MeterFilter warning from Micrometer</li></ul><h3 class="anchor anchorWithStickyNavbar_LWe7" id="version-0213">Version 0.2.13<a href="#version-0213" class="hash-link" aria-label="Direct link to Version 0.2.13" title="Direct link to Version 0.2.13"></a></h3><ul><li>Add kafka emitter to emit lineage to kafka</li></ul><h3 class="anchor anchorWithStickyNavbar_LWe7" id="version-0212">Version 0.2.12<a href="#version-0212" class="hash-link" aria-label="Direct link to Version 0.2.12" title="Direct link to Version 0.2.12"></a></h3><ul><li>Silencing some chatty warnings in RddPathUtils</li></ul><h3 class="anchor anchorWithStickyNavbar_LWe7" id="version-0211">Version 0.2.11<a href="#version-0211" class="hash-link" aria-label="Direct link to Version 0.2.11" title="Direct link to Version 0.2.11"></a></h3><ul><li>Add option to lowercase dataset URNs</li><li>Add option to set platform instance and/or env per platform with <code>spark.datahub.platform.<platform_name>.env</code> and <code>spark.datahub.platform.<platform_name>.platform_instance</code> config parameter</li><li>Fixing platform instance setting for datasets when <code>spark.datahub.metadata.dataset.platformInstance</code> is set</li><li>Fixing column level lineage support when patch is enabled</li></ul></div><footer class="theme-doc-footer docusaurus-mt-lg"><div class="slackUtm_uoBr"><div class="slackUtm_uoBr"><hr>Need more help? Join the conversation in <a href="https://datahub.com/slack?utm_source=docs&utm_medium=footer&utm_campaign=docs_footer&utm_content=metadata-integration/java/acryl-spark-lineage/README">Slack!</a></div></div><div class="theme-doc-footer-edit-meta-row row"><div class="col"><a href="https://github.com/datahub-project/datahub/blob/master/metadata-integration/java/acryl-spark-lineage/README.md" target="_blank" rel="noreferrer noopener" class="theme-edit-this-page"><svg fill="currentColor" height="20" width="20" viewBox="0 0 40 40" class="iconEdit_Z9Sw" aria-hidden="true"><g><path d="m34.5 11.7l-3 3.1-6.3-6.3 3.1-3q0.5-0.5 1.2-0.5t1.1 0.5l3.9 3.9q0.5 0.4 0.5 1.1t-0.5 1.2z m-29.5 17.1l18.4-18.5 6.3 6.3-18.4 18.4h-6.3v-6.2z"></path></g></svg>Edit this page</a></div><div class="col lastUpdated_VsjB"></div></div></footer><div class="feedbackWrapper_mUHF"><div class="feedbackWidget_PX4d"><div class="feedbackButtons_wn3V"><strong>Is this page helpful?</strong><div><button class="feedbackButton_UgQs"><span role="img" aria-label="like" class="anticon anticon-like"><svg viewBox="64 64 896 896" focusable="false" data-icon="like" width="1em" height="1em" fill="currentColor" aria-hidden="true"><path d="M885.9 533.7c16.8-22.2 26.1-49.4 26.1-77.7 0-44.9-25.1-87.4-65.5-111.1a67.67 67.67 0 00-34.3-9.3H572.4l6-122.9c1.4-29.7-9.1-57.9-29.5-79.4A106.62 106.62 0 00471 99.9c-52 0-98 35-111.8 85.1l-85.9 311H144c-17.7 0-32 14.3-32 32v364c0 17.7 14.3 32 32 32h601.3c9.2 0 18.2-1.8 26.5-5.4 47.6-20.3 78.3-66.8 78.3-118.4 0-12.6-1.8-25-5.4-37 16.8-22.2 26.1-49.4 26.1-77.7 0-12.6-1.8-25-5.4-37 16.8-22.2 26.1-49.4 26.1-77.7-.2-12.6-2-25.1-5.6-37.1zM184 852V568h81v284h-81zm636.4-353l-21.9 19 13.9 25.4a56.2 56.2 0 016.9 27.3c0 16.5-7.2 32.2-19.6 43l-21.9 19 13.9 25.4a56.2 56.2 0 016.9 27.3c0 16.5-7.2 32.2-19.6 43l-21.9 19 13.9 25.4a56.2 56.2 0 016.9 27.3c0 22.4-13.2 42.6-33.6 51.8H329V564.8l99.5-360.5a44.1 44.1 0 0142.2-32.3c7.6 0 15.1 2.2 21.1 6.7 9.9 7.4 15.2 18.6 14.6 30.5l-9.6 198.4h314.4C829 418.5 840 436.9 840 456c0 16.5-7.2 32.1-19.6 43z"></path></svg></span></button><button class="feedbackButton_UgQs"><span role="img" aria-label="dislike" class="anticon anticon-dislike"><svg viewBox="64 64 896 896" focusable="false" data-icon="dislike" width="1em" height="1em" fill="currentColor" aria-hidden="true"><path d="M885.9 490.3c3.6-12 5.4-24.4 5.4-37 0-28.3-9.3-55.5-26.1-77.7 3.6-12 5.4-24.4 5.4-37 0-28.3-9.3-55.5-26.1-77.7 3.6-12 5.4-24.4 5.4-37 0-51.6-30.7-98.1-78.3-118.4a66.1 66.1 0 00-26.5-5.4H144c-17.7 0-32 14.3-32 32v364c0 17.7 14.3 32 32 32h129.3l85.8 310.8C372.9 889 418.9 924 470.9 924c29.7 0 57.4-11.8 77.9-33.4 20.5-21.5 31-49.7 29.5-79.4l-6-122.9h239.9c12.1 0 23.9-3.2 34.3-9.3 40.4-23.5 65.5-66.1 65.5-111 0-28.3-9.3-55.5-26.1-77.7zM184 456V172h81v284h-81zm627.2 160.4H496.8l9.6 198.4c.6 11.9-4.7 23.1-14.6 30.5-6.1 4.5-13.6 6.8-21.1 6.7a44.28 44.28 0 01-42.2-32.3L329 459.2V172h415.4a56.85 56.85 0 0133.6 51.8c0 9.7-2.3 18.9-6.9 27.3l-13.9 25.4 21.9 19a56.76 56.76 0 0119.6 43c0 9.7-2.3 18.9-6.9 27.3l-13.9 25.4 21.9 19a56.76 56.76 0 0119.6 43c0 9.7-2.3 18.9-6.9 27.3l-14 25.5 21.9 19a56.76 56.76 0 0119.6 43c0 19.1-11 37.5-28.8 48.4z"></path></svg></span></button></div></div></div></div></article><nav class="pagination-nav docusaurus-mt-lg" aria-label="Docs pages"><a class="pagination-nav__link pagination-nav__link--prev" href="/docs/lineage/prefect"><div class="pagination-nav__sublabel">Previous</div><div class="pagination-nav__label">Prefect Integration with DataHub</div></a><a class="pagination-nav__link pagination-nav__link--next" href="/docs/metadata-ingestion/integration_docs/great-expectations"><div class="pagination-nav__sublabel">Next</div><div class="pagination-nav__label">Great Expectations</div></a></nav></div></div><div class="col col--3"><div class="tableOfContents_bqdL thin-scrollbar theme-doc-toc-desktop"><ul class="table-of-contents table-of-contents__left-border"><li><a href="#configuring-spark-agent" class="table-of-contents__link toc-highlight">Configuring Spark agent</a><ul><li><a href="#before-you-begin-versions-and-release-notes" class="table-of-contents__link toc-highlight">Before you begin: Versions and Release Notes</a></li><li><a href="#configuration-instructions-spark-submit" class="table-of-contents__link toc-highlight">Configuration Instructions: spark-submit</a></li></ul></li><li><a href="#spark-submit-command-line" class="table-of-contents__link toc-highlight">spark-submit command line</a><ul><li><a href="#configuration-instructions-amazon-emr" class="table-of-contents__link toc-highlight">Configuration Instructions: Amazon EMR</a></li><li><a href="#configuration-instructions-notebooks" class="table-of-contents__link toc-highlight">Configuration Instructions: Notebooks</a></li><li><a href="#configuration-instructions-standalone-java-applications" class="table-of-contents__link toc-highlight">Configuration Instructions: Standalone Java Applications</a></li><li><a href="#configuration-instructions-databricks" class="table-of-contents__link toc-highlight">Configuration Instructions: Databricks</a></li></ul></li><li><a href="#configuration-options" class="table-of-contents__link toc-highlight">Configuration Options</a></li><li><a href="#what-to-expect-the-metadata-model" class="table-of-contents__link toc-highlight">What to Expect: The Metadata Model</a><ul><li><a href="#custom-properties--relating-to-spark-ui" class="table-of-contents__link toc-highlight">Custom properties & relating to Spark UI</a></li><li><a href="#spark-versions-supported" class="table-of-contents__link toc-highlight">Spark versions supported</a></li><li><a href="#environments-tested-with" class="table-of-contents__link toc-highlight">Environments tested with</a></li><li><a href="#configuring-hdfs-based-dataset-urns" class="table-of-contents__link toc-highlight">Configuring Hdfs based dataset URNs</a></li><li><a href="#important-notes-on-usage" class="table-of-contents__link toc-highlight">Important notes on usage</a></li><li><a href="#debugging" class="table-of-contents__link toc-highlight">Debugging</a></li></ul></li><li><a href="#how-to-build" class="table-of-contents__link toc-highlight">How to build</a></li><li><a href="#known-limitations" class="table-of-contents__link toc-highlight">Known limitations</a></li><li><a href="#changelog" class="table-of-contents__link toc-highlight">Changelog</a><ul><li><a href="#version-0217" class="table-of-contents__link toc-highlight">Version 0.2.17</a></li><li><a href="#version-0216" class="table-of-contents__link toc-highlight">Version 0.2.16</a></li><li><a href="#version-0215" class="table-of-contents__link toc-highlight">Version 0.2.15</a></li><li><a href="#version-0214" class="table-of-contents__link toc-highlight">Version 0.2.14</a></li><li><a href="#version-0213" class="table-of-contents__link toc-highlight">Version 0.2.13</a></li><li><a href="#version-0212" class="table-of-contents__link toc-highlight">Version 0.2.12</a></li><li><a href="#version-0211" class="table-of-contents__link toc-highlight">Version 0.2.11</a></li></ul></li></ul></div></div></div></div></main></div></div><footer class="footer footer--dark"><div class="container container-fluid"><div class="row footer__links"><div class="col footer__col"><div class="footer__title">Docs</div><ul class="footer__items clean-list"><li class="footer__item"><a class="footer__link-item" href="/docs/">Introduction</a></li><li class="footer__item"><a class="footer__link-item" href="/docs/quickstart">Quickstart</a></li></ul></div><div class="col footer__col"><div class="footer__title">Community</div><ul class="footer__items clean-list"><li class="footer__item"><a href="https://datahub.com/slack" target="_blank" rel="noopener noreferrer" class="footer__link-item">Slack<svg width="13.5" height="13.5" aria-hidden="true" viewBox="0 0 24 24" class="iconExternalLink_nPIU"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg></a></li><li class="footer__item"><a href="https://www.youtube.com/channel/UC3qFQC5IiwR5fvWEqi_tJ5w" target="_blank" rel="noopener noreferrer" class="footer__link-item">YouTube<svg width="13.5" height="13.5" aria-hidden="true" viewBox="0 0 24 24" class="iconExternalLink_nPIU"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg></a></li><li class="footer__item"><a href="https://medium.com/datahub-project" target="_blank" rel="noopener noreferrer" class="footer__link-item">Blog<svg width="13.5" height="13.5" aria-hidden="true" viewBox="0 0 24 24" class="iconExternalLink_nPIU"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg></a></li><li class="footer__item"><a class="footer__link-item" href="/docs/townhalls">Town Halls</a></li><li class="footer__item"><a href="https://datahub.com/adoption-stories/" target="_blank" rel="noopener noreferrer" class="footer__link-item">Adoption<svg width="13.5" height="13.5" aria-hidden="true" viewBox="0 0 24 24" class="iconExternalLink_nPIU"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg></a></li></ul></div><div class="col footer__col"><div class="footer__title">More</div><ul class="footer__items clean-list"><li class="footer__item"><a href="https://demo.datahub.com/" target="_blank" rel="noopener noreferrer" class="footer__link-item">Demo</a></li><li class="footer__item"><a href="https://feature-requests.datahubproject.io/roadmap" target="_blank" rel="noopener noreferrer" class="footer__link-item">Roadmap<svg width="13.5" height="13.5" aria-hidden="true" viewBox="0 0 24 24" class="iconExternalLink_nPIU"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg></a></li><li class="footer__item"><a class="footer__link-item" href="/docs/contributing">Contributing</a></li><li class="footer__item"><a href="https://github.com/datahub-project/datahub" target="_blank" rel="noopener noreferrer" class="footer__link-item">GitHub<svg width="13.5" height="13.5" aria-hidden="true" viewBox="0 0 24 24" class="iconExternalLink_nPIU"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg></a></li><li class="footer__item"><a href="https://feature-requests.datahubproject.io/" target="_blank" rel="noopener noreferrer" class="footer__link-item">Feature Requests<svg width="13.5" height="13.5" aria-hidden="true" viewBox="0 0 24 24" class="iconExternalLink_nPIU"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg></a></li></ul></div></div><div class="footer__bottom text--center"><div class="footer__copyright">Copyright © 2015-2025 DataHub Project Authors.</div></div></div></footer></div>
|
||
<script src="/assets/js/runtime~main.310f59c4.js"></script>
|
||
<script src="/assets/js/main.49198d73.js"></script>
|
||
</body>
|
||
</html> |