147 lines
117 KiB
HTML
Raw Normal View History

<!doctype html>
<html lang="en" dir="ltr" class="docs-wrapper docs-doc-page docs-version-current plugin-docs plugin-id-default docs-doc-id-metadata-integration/java/acryl-spark-lineage/README" data-has-hydrated="false">
<head>
<meta charset="UTF-8">
<meta name="generator" content="Docusaurus v2.4.3">
<title data-rh="true">Spark | DataHub</title><meta data-rh="true" name="viewport" content="width=device-width,initial-scale=1"><meta data-rh="true" name="twitter:card" content="summary_large_image"><meta data-rh="true" property="og:url" content="https://docs.datahub.com/docs/metadata-integration/java/acryl-spark-lineage"><meta data-rh="true" name="docusaurus_locale" content="en"><meta data-rh="true" name="docsearch:language" content="en"><meta data-rh="true" name="docusaurus_version" content="current"><meta data-rh="true" name="docusaurus_tag" content="docs-default-current"><meta data-rh="true" name="docsearch:version" content="current"><meta data-rh="true" name="docsearch:docusaurus_tag" content="docs-default-current"><meta data-rh="true" property="og:title" content="Spark | DataHub"><meta data-rh="true" name="description" content="To integrate Spark with DataHub, we provide a lightweight Java agent that listens for Spark application and job events"><meta data-rh="true" property="og:description" content="To integrate Spark with DataHub, we provide a lightweight Java agent that listens for Spark application and job events"><link data-rh="true" rel="icon" href="/img/favicon.ico"><link data-rh="true" rel="canonical" href="https://docs.datahub.com/docs/metadata-integration/java/acryl-spark-lineage"><link data-rh="true" rel="alternate" href="https://docs.datahub.com/docs/metadata-integration/java/acryl-spark-lineage" hreflang="en"><link data-rh="true" rel="alternate" href="https://docs.datahub.com/docs/metadata-integration/java/acryl-spark-lineage" hreflang="x-default"><link data-rh="true" rel="preconnect" href="https://RK0UG797F3-dsn.algolia.net" crossorigin="anonymous"><link rel="alternate" type="application/rss+xml" href="/learn/rss.xml" title="DataHub RSS Feed">
<link rel="alternate" type="application/atom+xml" href="/learn/atom.xml" title="DataHub Atom Feed">
<link rel="preconnect" href="https://www.google-analytics.com">
<link rel="preconnect" href="https://www.googletagmanager.com">
<script async src="https://www.googletagmanager.com/gtag/js?id=G-PKGVLETT4C"></script>
<!-- Google Analytics (gtag.js) bootstrap: declares gtag(), which pushes its arguments object onto dataLayer; makes sure window.dataLayer exists; then queues a "js" entry with the current Date and a "config" entry for property G-PKGVLETT4C with an empty options object. -->
<script>function gtag(){dataLayer.push(arguments)}window.dataLayer=window.dataLayer||[],gtag("js",new Date),gtag("config","G-PKGVLETT4C",{})</script>
<link rel="preconnect" href="https://www.googletagmanager.com">
<!-- Google Tag Manager loader: makes sure window.dataLayer exists, pushes a "gtm.js" event whose "gtm.start" field is the current time in milliseconds, then creates an async script element pointing at gtm.js for container GTM-5M8T9HNN and inserts it before the first script element in the document. -->
<script>window.dataLayer=window.dataLayer||[]</script>
<script>!function(e,t,a,n,g){e[n]=e[n]||[],e[n].push({"gtm.start":(new Date).getTime(),event:"gtm.js"});var m=t.getElementsByTagName(a)[0],r=t.createElement(a);r.async=!0,r.src="https://www.googletagmanager.com/gtm.js?id=GTM-5M8T9HNN",m.parentNode.insertBefore(r,m)}(window,document,"script","dataLayer")</script>
<link rel="search" type="application/opensearchdescription+xml" title="DataHub" href="/opensearch.xml">
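<!-- NOTE(review): browsers ignore the frame-ancestors directive when CSP is delivered through a meta element; to actually restrict framing it must also be sent as an HTTP response header. -->
<meta http-equiv="Content-Security-Policy" content="frame-ancestors &#39;self&#39; https://*.acryl.io https://acryldata.io http://localhost:*">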
<link rel="stylesheet" href="https://fonts.googleapis.com/css2?family=Manrope:wght@400;500;700&display=swap">
<script src="https://tools.luckyorange.com/core/lo.js?site-id=28ea8a38" async defer="defer"></script>
<script src="/scripts/rb2b.js" async defer="defer"></script>
<script src="https://app.revenuehero.io/scheduler.min.js"></script>
<script src="https://tag.clearbitscripts.com/v1/pk_2e321cabe30432a5c44c0424781aa35f/tags.js" referrerpolicy="strict-origin-when-cross-origin"></script>
<script src="/scripts/reo.js"></script>
<script id="runllm-widget-script" type="module" src="https://widget.runllm.com" crossorigin="true" runllm-name="DataHub" runllm-assistant-id="81" runllm-position="BOTTOM_RIGHT" runllm-keyboard-shortcut="Mod+j" runllm-preset="docusaurus" runllm-theme-color="#1890FF" runllm-brand-logo="https://docs.datahub.com/img/datahub-logo-color-mark.svg" runllm-community-url="https://datahub.com/slack" runllm-community-type="slack" runllm-disable-ask-a-person="true" async></script><link rel="stylesheet" href="/assets/css/styles.d8fe2eb8.css">
<link rel="preload" href="/assets/js/runtime~main.310f59c4.js" as="script">
<link rel="preload" href="/assets/js/main.49198d73.js" as="script">
</head>
<body class="navigation-with-keyboard">
<noscript><iframe src="https://www.googletagmanager.com/ns.html?id=GTM-5M8T9HNN" height="0" width="0" style="display:none;visibility:hidden"></iframe></noscript>
<!-- Inline theme bootstrap: sets data-theme on the root element from the "docusaurus-theme" query parameter, otherwise from localStorage "theme", otherwise "light". It then sets data-announcement-bar-initially-dismissed to whether localStorage "docusaurus.announcement.dismiss" equals "true". Each lookup is wrapped in try/catch so a failure falls through to the default. -->
<script>!function(){function t(t){document.documentElement.setAttribute("data-theme",t)}var e=function(){var t=null;try{t=new URLSearchParams(window.location.search).get("docusaurus-theme")}catch(t){}return t}()||function(){var t=null;try{t=localStorage.getItem("theme")}catch(t){}return t}();t(null!==e?e:"light")}(),document.documentElement.setAttribute("data-announcement-bar-initially-dismissed",function(){try{return"true"===localStorage.getItem("docusaurus.announcement.dismiss")}catch(t){}return!1}())</script><div id="__docusaurus">
<div role="region" aria-label="Skip to main content"><a class="skipToContent_fXgn" href="#__docusaurus_skipToContent_fallback">Skip to main content</a></div><div class="announcementBar_mb4j" style="background-color:transparent;color:#ffffff" role="banner"><div class="content_knG7 announcementBarContent_xLdY"><div class="shimmer-banner"><p>DataHub Secures $35 Million Series B</p><a href="https://datahub.com/news/series-b-announcement/" target="_blank" class="button"><div>Read the announcement<span></span></div></a></div></div></div><nav aria-label="Main" class="navbar navbar--fixed-top"><div class="navbar__inner"><div class="navbar__items"><button aria-label="Toggle navigation bar" aria-expanded="false" class="navbar__toggle clean-btn" type="button"><svg width="30" height="30" viewBox="0 0 30 30" aria-hidden="true"><path stroke="currentColor" stroke-linecap="round" stroke-miterlimit="10" stroke-width="2" d="M4 7h22M4 15h22M4 23h22"></path></svg></button><a href="https://datahub.com" target="_blank" rel="noopener noreferrer" class="navbar__brand"><div class="navbar__logo"><img src="/img/datahub-logo-color-light-horizontal.svg" alt="DataHub Logo" class="themedImage_ToTc themedImage--light_HNdA"><img src="/img/datahub-logo-color-dark-horizontal.svg" alt="DataHub Logo" class="themedImage_ToTc themedImage--dark_i4oU"></div></a><div class="navbar__item dropdown dropdown--hoverable"><a href="#" aria-haspopup="true" aria-expanded="false" role="button" class="navbar__link versionNavItem_cbn8">Next</a><ul class="dropdown__menu"><li><a aria-current="page" class="dropdown__link dropdown__link--active" href="/docs/metadata-integration/java/acryl-spark-lineage">Next</a></li><li><a class="dropdown__link" href="/docs/1.1.0/metadata-integration/java/acryl-spark-lineage">1.1.0</a></li><li><hr class="dropdown-separator" style="margin: 0.4rem;"></li><li><div class="dropdown__link"><b>Archived versions</b></div></li><li>
<a class="dropdown__link" href="https://docs-website-t9sv4w3gr-acryldata.vercel.app/docs/features">1.0.0
<svg width="12" height="12" aria-hidden="true" viewBox="0 0 24 24"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg>
</a>
</li><li>
<a class="dropdown__link" href="https://docs-website-t9sv4w3gr-acryldata.vercel.app/docs/0.15.0/features">0.15.0
<svg width="12" height="12" aria-hidden="true" viewBox="0 0 24 24"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg>
</a>
</li><li>
<a class="dropdown__link" href="https://docs-website-8jkm4uler-acryldata.vercel.app/docs/0.14.1/features">0.14.1
<svg width="12" height="12" aria-hidden="true" viewBox="0 0 24 24"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg>
</a>
</li><li>
<a class="dropdown__link" href="https://docs-website-eue2qafvn-acryldata.vercel.app/docs/features">0.14.0
<svg width="12" height="12" aria-hidden="true" viewBox="0 0 24 24"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg>
</a>
</li><li>
<a class="dropdown__link" href="https://docs-website-psat3nzgi-acryldata.vercel.app/docs/features">0.13.1
<svg width="12" height="12" aria-hidden="true" viewBox="0 0 24 24"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg>
</a>
</li><li>
<a class="dropdown__link" href="https://docs-website-lzxh86531-acryldata.vercel.app/docs/features">0.13.0
<svg width="12" height="12" aria-hidden="true" viewBox="0 0 24 24"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg>
</a>
</li><li>
<a class="dropdown__link" href="https://docs-website-2uuxmgza2-acryldata.vercel.app/docs/features">0.12.1
<svg width="12" height="12" aria-hidden="true" viewBox="0 0 24 24"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg>
</a>
</li><li>
<a class="dropdown__link" href="https://docs-website-irpoe2osc-acryldata.vercel.app/docs/features">0.11.0
<svg width="12" height="12" aria-hidden="true" viewBox="0 0 24 24"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg>
</a>
</li><li>
<a class="dropdown__link" href="https://docs-website-1gv2yzn9d-acryldata.vercel.app/docs/features">0.10.5
<svg width="12" height="12" aria-hidden="true" viewBox="0 0 24 24"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg>
</a>
</li></ul></div></div><div class="navbar__items navbar__items--right"><a aria-current="page" class="navbar__item navbar__link navbar__link--active" href="/docs">Docs</a><a class="navbar__item navbar__link" href="/integrations">Integrations</a><div class="navbar__item dropdown dropdown--hoverable"><a href="#" aria-haspopup="true" aria-expanded="false" role="button" class="navbar__link">Learn</a><ul class="dropdown__menu dropdown__menu_Z8FC"><div class="wrapper_kp81"><div><a href="https://datahub.com/weekly-demo" target="_blank" rel="noopener noreferrer" class="card_BUD7"><div class="icon_BgHd"><img src="/img/icon-join-slack.png" alt="Weekly Demo"></div><div class="title_c7DP">Weekly Demo</div></a></div><div><a href="https://datahub.com/use-cases" target="_blank" rel="noopener noreferrer" class="card_BUD7"><div class="icon_BgHd"><img src="/img/icon-forum.png" alt="Use Cases"></div><div class="title_c7DP">Use Cases</div></a></div><div><a href="https://datahub.com/adoption-stories" target="_blank" rel="noopener noreferrer" class="card_BUD7"><div class="icon_BgHd"><img src="/img/icon-events.png" alt="Adoption Stories"></div><div class="title_c7DP">Adoption Stories</div></a></div><div><a href="https://medium.com/datahub-project" target="_blank" rel="noopener noreferrer" class="card_BUD7"><div class="icon_BgHd"><img src="/img/icon-champions.png" alt="Blog"></div><div class="title_c7DP">Blog</div></a></div><div><a href="https://www.youtube.com/channel/UC3qFQC5IiwR5fvWEqi_tJ5w" target="_blank" rel="noopener noreferrer" class="card_BUD7"><div class="icon_BgHd"><img src="/img/icon-share-your-journey.png" alt="Youtube"></div><div class="title_c7DP">Youtube</div></a></div></div></ul></div><div class="navbar__item dropdown dropdown--hoverable"><a href="#" aria-haspopup="true" aria-expanded="false" role="button" class="navbar__link">Community</a><ul class="dropdown__menu dropdown__menu_Z8FC"><div class="wrapper_kp81"><div><a href="https://datahub.com/slack/" target="_blank" 
rel="noopener noreferrer" class="card_BUD7"><div class="icon_BgHd"><img src="/img/icon-join-slack.png" alt="Join Slack"></div><div class="title_c7DP">Join Slack</div></a></div><div><a href="https://datahub.com/events" target="_blank" rel="noopener noreferrer" class="card_BUD7"><div class="icon_BgHd"><img src="/img/icon-events.png" alt="Events"></div><div class="title_c7DP">Events</div></a></div><div><a href="https://datahub.com/champions/" target="_blank" rel="noopener noreferrer" class="card_BUD7"><div class="icon_BgHd"><img src="/img/icon-champions.png" alt="Champions"></div><div class="title_c7DP">Champions</div></a></div><div><a href="https://datahub.com/share-your-journey/" target="_blank" rel="noopener noreferrer" class="card_BUD7"><div class="icon_BgHd"><img src="/img/icon-share-your-journey.png" alt="Share Your Journey"></div><div class="title_c7DP">Share Your Journey</div></a></div></div></ul></div><a href="https://datahub.com/products/why-datahub-cloud/" target="_blank" rel="noopener noreferrer" class="navbar__item navbar__link">
<style>
/* "Get Cloud" call-to-action label: bold text in the active menu colour at rest. */
.cloud-cta {
color: var(--ifm-menu-color-active);
font-weight: 600;
/* Both gradient stops are the same colour, so the resting background is visually flat. */
background: linear-gradient(40deg, var(--ifm-menu-color-active), var(--ifm-menu-color-active));
/* Background is twice the element width so the hover animation has room to slide it. */
background-size: 200% 100%;
/* Clip the background to the glyph shapes (prefixed and standard forms). */
-webkit-background-clip: text;
background-clip: text;
transition: background-image 0.3s ease;
}
/* Hover state: the text colour goes transparent so the text-clipped gradient shows through the letters. */
.cloud-cta:hover {
color: transparent;
background: linear-gradient(40deg, var(--ifm-menu-color-active), #ff1493);
/* Re-declared after the background shorthand, which resets these longhands. */
background-size: 200% 100%;
-webkit-background-clip: text;
background-clip: text;
animation: gradientShift 3s ease infinite;
}
/* Slides the background horizontally from 0% to 100% and back over one cycle. */
@keyframes gradientShift {
0%, 100% { background-position: 0% 50%; }
50% { background-position: 100% 50%; }
}
</style>
<div class="cloud-cta">Get Cloud</div>
</a><a href="https://datahub.com/slack?utm_source=docs&amp;utm_medium=header&amp;utm_campaign=docs_header" target="_blank" rel="noopener noreferrer" class="navbar__item navbar__link">
<style>
/* Slightly fade the Slack icon while the pointer is over it. */
.slack-logo:hover {
opacity: 0.8;
}
</style>
<img class="slack-logo" src="https://upload.wikimedia.org/wikipedia/commons/d/d5/Slack_icon_2019.svg" alt="slack" height="20" style="margin: 10px 0 0 0;">
</a><div class="searchBox_ZlJk"><button type="button" class="DocSearch DocSearch-Button" aria-label="Search"><span class="DocSearch-Button-Container"><svg width="20" height="20" class="DocSearch-Search-Icon" viewBox="0 0 20 20" aria-hidden="true"><path d="M14.386 14.386l4.0877 4.0877-4.0877-4.0877c-2.9418 2.9419-7.7115 2.9419-10.6533 0-2.9419-2.9418-2.9419-7.7115 0-10.6533 2.9418-2.9419 7.7115-2.9419 10.6533 0 2.9419 2.9418 2.9419 7.7115 0 10.6533z" stroke="currentColor" fill="none" fill-rule="evenodd" stroke-linecap="round" stroke-linejoin="round"></path></svg><span class="DocSearch-Button-Placeholder">Search</span></span><span class="DocSearch-Button-Keys"></span></button></div></div></div><div role="presentation" class="navbar-sidebar__backdrop"></div></nav><div id="__docusaurus_skipToContent_fallback" class="main-wrapper mainWrapper_z2l0 docsWrapper_BCFX"><button aria-label="Scroll back to top" class="clean-btn theme-back-to-top-button backToTopButton_sjWU" type="button"></button><div class="docPage__5DB"><aside class="theme-doc-sidebar-container docSidebarContainer_b6E3"><div class="sidebarViewport_Xe31"><div class="sidebar_njMd"><nav aria-label="Docs sidebar" class="menu thin-scrollbar menu_SIkG menuWithAnnouncementBar_GW3s"><ul class="theme-doc-sidebar-menu menu__list"><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-1 menuHtmlItem_M9Kj menu__list-item"><div>Getting Started</div></li><li class="theme-doc-sidebar-item-category theme-doc-sidebar-item-category-level-1 menu__list-item menu__list-item--collapsed"><div class="menu__list-item-collapsible"><a class="menu__link menu__link--sublist" aria-expanded="false" href="/docs/features">What Is DataHub?</a><button aria-label="Toggle the collapsible sidebar category &#x27;What Is DataHub?&#x27;" type="button" class="clean-btn menu__caret"></button></div></li><li class="theme-doc-sidebar-item-category theme-doc-sidebar-item-category-level-1 menu__list-item menu__list-item--collapsed"><div 
class="menu__list-item-collapsible"><a class="menu__link menu__link--sublist" aria-expanded="false" href="/docs/category/features">Features</a><button aria-label="Toggle the collapsible sidebar category &#x27;Features&#x27;" type="button" class="clean-btn menu__caret"></button></div></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-1 menuHtmlItem_M9Kj menu__list-item"><div>DataHub Cloud</div></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-1 menu__list-item"><a class="menu__link" href="/docs/managed-datahub/managed-datahub-overview">DataHub Cloud Overview</a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-1 menu__list-item"><a class="menu__link" href="/docs/managed-datahub/welcome-acryl">Getting Started with DataHub Cloud</a></li><li class="theme-doc-sidebar-item-category theme-doc-sidebar-item-category-level-1 menu__list-item menu__list-item--collapsed"><div class="menu__list-item-collapsible"><a class="menu__link menu__link--sublist menu__link--sublist-caret" aria-expanded="false" href="/docs/authentication/guides/sso/initialize-oidc">Configure Single Sign-On</a></div></li><li class="theme-doc-sidebar-item-category theme-doc-sidebar-item-category-level-1 menu__list-item menu__list-item--collapsed"><div class="menu__list-item-collapsible"><a class="menu__link menu__link--sublist menu__link--sublist-caret" aria-expanded="false" href="/docs/managed-datahub/remote-executor/about">Remote Executor</a></div></li><li class="theme-doc-sidebar-item-category theme-doc-sidebar-item-category-level-1 menu__list-item menu__list-item--collapsed"><div class="menu__list-item-collapsible"><a class="menu__link menu__link--sublist menu__link--sublist-caret" aria-expanded="false" href="/docs/managed-datahub/datahub-api/entity-events-api">DataHub API</a></div></li><li class="theme-doc-sidebar-item-category theme-doc-sidebar-item-category-level-1 menu__list-item menu__list-item--collapsed"><div 
class="menu__list-item-collapsible"><a class="menu__link menu__link--sublist menu__link--subli
and pushes metadata out to DataHub in real-time. The agent listens to events such as application start/end, and
SQLExecution start/end to create pipelines (i.e. DataFlow) and tasks (i.e. DataJob) in DataHub along with lineage to
datasets that are being read from and written to. Read on to learn how to configure this for different Spark scenarios.</p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="configuring-spark-agent">Configuring Spark agent<a href="#configuring-spark-agent" class="hash-link" aria-label="Direct link to Configuring Spark agent" title="Direct link to Configuring Spark agent"></a></h2><p>The Spark agent can be configured using a config file or while creating a Spark Session. If you are using Spark on
Databricks, refer to <a href="#configuration-instructions-databricks">Configuration Instructions for Databricks</a>.</p><h3 class="anchor anchorWithStickyNavbar_LWe7" id="before-you-begin-versions-and-release-notes">Before you begin: Versions and Release Notes<a href="#before-you-begin-versions-and-release-notes" class="hash-link" aria-label="Direct link to Before you begin: Versions and Release Notes" title="Direct link to Before you begin: Versions and Release Notes"></a></h3><p>Versioning of the jar artifact will follow the semantic versioning of the
main <a href="https://github.com/datahub-project/datahub" target="_blank" rel="noopener noreferrer">DataHub repo</a> and release notes will be
available <a href="https://github.com/datahub-project/datahub/releases" target="_blank" rel="noopener noreferrer">here</a>.
Always check <a href="https://search.maven.org/search?q=a:acryl-spark-lineage" target="_blank" rel="noopener noreferrer">the Maven central repository</a> for the latest
released version.</p><h3 class="anchor anchorWithStickyNavbar_LWe7" id="configuration-instructions-spark-submit">Configuration Instructions: spark-submit<a href="#configuration-instructions-spark-submit" class="hash-link" aria-label="Direct link to Configuration Instructions: spark-submit" title="Direct link to Configuration Instructions: spark-submit"></a></h3><p>When running jobs using spark-submit, the agent needs to be configured in the config file.</p><div class="language-text codeBlockContainer_Ckt0 theme-code-block" style="--prism-color:#bfc7d5;--prism-background-color:#292d3e"><div class="codeBlockContent_biex"><pre tabindex="0" class="prism-code language-text codeBlock_bY9V thin-scrollbar"><code class="codeBlockLines_e6Vv"><span class="token-line" style="color:#bfc7d5"><span class="token plain">#Configuring DataHub spark agent jar</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">spark.jars.packages io.acryl:acryl-spark-lineage:0.2.17</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">spark.extraListeners datahub.spark.DatahubSparkListener</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">spark.datahub.rest.server http://localhost:8080</span><br></span></code></pre><div class="buttonGroup__atx"><button type="button" aria-label="Copy code to clipboard" title="Copy" class="clean-btn"><span class="copyButtonIcons_eSgA" aria-hidden="true"><svg viewBox="0 0 24 24" class="copyButtonIcon_y97N"><path fill="currentColor" d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"></path></svg><svg viewBox="0 0 24 24" class="copyButtonSuccessIcon_LjdS"><path fill="currentColor" d="M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z"></path></svg></span></button></div></div></div><h2 class="anchor anchorWithStickyNavbar_LWe7" id="spark-submit-command-line">spark-submit command 
line<a href="#spark-submit-command-line" class="hash-link" aria-label="Direct link to spark-submit command line" title="Direct link to spark-submit command line"></a></h2><div class="language-sh codeBlockContainer_Ckt0 theme-code-block" style="--prism-color:#bfc7d5;--prism-background-color:#292d3e"><div class="codeBlockContent_biex"><pre tabindex="0" class="prism-code language-sh codeBlock_bY9V thin-scrollbar"><code class="codeBlockLines_e6Vv"><span class="token-line" style="color:#bfc7d5"><span class="token plain">spark-submit --packages io.acryl:acryl-spark-lineage:0.2.17 --conf &quot;spark.extraListeners=datahub.spark.DatahubSparkListener&quot; my_spark_job_to_run.py</span><br></span></code></pre><div class="buttonGroup__atx"><button type="button" aria-label="Copy code to clipboard" title="Copy" class="clean-btn"><span class="copyButtonIcons_eSgA" aria-hidden="true"><svg viewBox="0 0 24 24" class="copyButtonIcon_y97N"><path fill="currentColor" d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"></path></svg><svg viewBox="0 0 24 24" class="copyButtonSuccessIcon_LjdS"><path fill="currentColor" d="M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z"></path></svg></span></button></div></div></div><h3 class="anchor anchorWithStickyNavbar_LWe7" id="configuration-instructions-amazon-emr">Configuration Instructions: Amazon EMR<a href="#configuration-instructions-amazon-emr" class="hash-link" aria-label="Direct link to Configuration Instructions: Amazon EMR" title="Direct link to Configuration Instructions: Amazon EMR"></a></h3><p>Set the following spark-defaults configuration properties as it
stated <a href="https://docs.aws.amazon.com/emr/latest/ReleaseGuide/emr-spark-configure.html" target="_blank" rel="noopener noreferrer">here</a></p><div class="language-text codeBlockContainer_Ckt0 theme-code-block" style="--prism-color:#bfc7d5;--prism-background-color:#292d3e"><div class="codeBlockContent_biex"><pre tabindex="0" class="prism-code language-text codeBlock_bY9V thin-scrollbar"><code class="codeBlockLines_e6Vv"><span class="token-line" style="color:#bfc7d5"><span class="token plain">spark.jars.packages io.acryl:acryl-spark-lineage:0.2.17</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">spark.extraListeners datahub.spark.DatahubSparkListener</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">spark.datahub.rest.server https://your_datahub_host/gms</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">#If you have authentication set up then you also need to specify the Datahub access token</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">spark.datahub.rest.token yourtoken</span><br></span></code></pre><div class="buttonGroup__atx"><button type="button" aria-label="Copy code to clipboard" title="Copy" class="clean-btn"><span class="copyButtonIcons_eSgA" aria-hidden="true"><svg viewBox="0 0 24 24" class="copyButtonIcon_y97N"><path fill="currentColor" d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"></path></svg><svg viewBox="0 0 24 24" class="copyButtonSuccessIcon_LjdS"><path fill="currentColor" d="M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z"></path></svg></span></button></div></div></div><h3 class="anchor anchorWithStickyNavbar_LWe7" id="configuration-instructions-notebooks">Configuration Instructions: Notebooks<a href="#configuration-instructions-notebooks" class="hash-link" aria-label="Direct link to 
Configuration Instructions: Notebooks" title="Direct link to Configuration Instructions: Notebooks"></a></h3><p>When running interactive jobs from a notebook, the listener can be configured while building the Spark Session.</p><div class="language-python codeBlockContainer_Ckt0 theme-code-block" style="--prism-color:#bfc7d5;--prism-background-color:#292d3e"><div class="codeBlockContent_biex"><pre tabindex="0" class="prism-code language-python codeBlock_bY9V thin-scrollbar"><code class="codeBlockLines_e6Vv"><span class="token-line" style="color:#bfc7d5"><span class="token plain">spark </span><span class="token operator" style="color:rgb(137, 221, 255)">=</span><span class="token plain"> SparkSession</span><span class="token punctuation" style="color:rgb(199, 146, 234)">.</span><span class="token plain">builder</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain"></span><span class="token punctuation" style="color:rgb(199, 146, 234)">.</span><span class="token plain">master</span><span class="token punctuation" style="color:rgb(199, 146, 234)">(</span><span class="token string" style="color:rgb(195, 232, 141)">&quot;spark://spark-master:7077&quot;</span><span class="token punctuation" style="color:rgb(199, 146, 234)">)</span><span class="token plain"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain"></span><span class="token punctuation" style="color:rgb(199, 146, 234)">.</span><span class="token plain">appName</span><span class="token punctuation" style="color:rgb(199, 146, 234)">(</span><span class="token string" style="color:rgb(195, 232, 141)">&quot;test-application&quot;</span><span class="token punctuation" style="color:rgb(199, 146, 234)">)</span><span class="token plain"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain"></span><span class="token punctuation" style="color:rgb(199, 146, 234)">.</span><span class="token plain">config</span><span 
class="token pu
Cluster <a href="https://docs.databricks.com/clusters/configure.html#spark-configuration" target="_blank" rel="noopener noreferrer">Spark configuration</a>
and <a href="https://docs.databricks.com/clusters/configure.html#init-scripts" target="_blank" rel="noopener noreferrer">Init script</a>.</p><p><a href="https://docs.databricks.com/security/secrets/secrets.html" target="_blank" rel="noopener noreferrer">Databricks Secrets</a> can be leveraged to store sensitive
information like tokens.</p><ul><li><p>Download <code>datahub-spark-lineage</code> jar
from <a href="https://s01.oss.sonatype.org/content/groups/public/io/acryl/acryl-spark-lineage/" target="_blank" rel="noopener noreferrer">the Maven central repository</a>.</p></li><li><p>Create <code>init.sh</code> with below content</p><div class="language-sh codeBlockContainer_Ckt0 theme-code-block" style="--prism-color:#bfc7d5;--prism-background-color:#292d3e"><div class="codeBlockContent_biex"><pre tabindex="0" class="prism-code language-sh codeBlock_bY9V thin-scrollbar"><code class="codeBlockLines_e6Vv"><span class="token-line" style="color:#bfc7d5"><span class="token plain">#!/bin/bash</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">cp /dbfs/datahub/datahub-spark-lineage*.jar /databricks/jars</span><br></span></code></pre><div class="buttonGroup__atx"><button type="button" aria-label="Copy code to clipboard" title="Copy" class="clean-btn"><span class="copyButtonIcons_eSgA" aria-hidden="true"><svg viewBox="0 0 24 24" class="copyButtonIcon_y97N"><path fill="currentColor" d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"></path></svg><svg viewBox="0 0 24 24" class="copyButtonSuccessIcon_LjdS"><path fill="currentColor" d="M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z"></path></svg></span></button></div></div></div></li><li><p>Install and configure <a href="https://docs.databricks.com/dev-tools/cli/index.html" target="_blank" rel="noopener noreferrer">Databricks CLI</a>.</p></li><li><p>Copy jar and init script to Databricks File System(DBFS) using Databricks CLI.</p><div class="language-sh codeBlockContainer_Ckt0 theme-code-block" style="--prism-color:#bfc7d5;--prism-background-color:#292d3e"><div class="codeBlockContent_biex"><pre tabindex="0" class="prism-code language-sh codeBlock_bY9V thin-scrollbar"><code class="codeBlockLines_e6Vv"><span class="token-line" style="color:#bfc7d5"><span class="token plain">databricks fs mkdirs 
dbfs:/datahub</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">databricks fs cp --overwrite datahub-spark-lineage*.jar dbfs:/datahub</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">databricks fs cp --overwrite init.sh dbfs:/datahub</span><br></span></code></pre><div class="buttonGroup__atx"><button type="button" aria-label="Copy code to clipboard" title="Copy" class="clean-btn"><span class="copyButtonIcons_eSgA" aria-hidden="true"><svg viewBox="0 0 24 24" class="copyButtonIcon_y97N"><path fill="currentColor" d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"></path></svg><svg viewBox="0 0 24 24" class="copyButtonSuccessIcon_LjdS"><path fill="currentColor" d="M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z"></path></svg></span></button></div></div></div></li><li><p>Open Databricks Cluster configuration page. Click the <strong>Advanced Options</strong> toggle. Click the <strong>Spark</strong> tab. Add below
configurations under <code>Spark Config</code>.</p><div class="language-text codeBlockContainer_Ckt0 theme-code-block" style="--prism-color:#bfc7d5;--prism-background-color:#292d3e"><div class="codeBlockContent_biex"><pre tabindex="0" class="prism-code language-text codeBlock_bY9V thin-scrollbar"><code class="codeBlockLines_e6Vv"><span class="token-line" style="color:#bfc7d5"><span class="token plain">spark.extraListeners datahub.spark.DatahubSparkListener</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">spark.datahub.rest.server http://localhost:8080</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">spark.datahub.stage_metadata_coalescing true</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">spark.datahub.databricks.cluster cluster-name&lt;any preferred cluster identifier&gt;</span><br></span></code></pre><div class="buttonGroup__atx"><button type="button" aria-label="Copy code to clipboard" title="Copy" class="clean-btn"><span class="copyButtonIcons_eSgA" aria-hidden="true"><svg viewBox="0 0 24 24" class="copyButtonIcon_y97N"><path fill="currentColor" d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"></path></svg><svg viewBox="0 0 24 24" class="copyButtonSuccessIcon_LjdS"><path fill="currentColor" d="M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z"></path></svg></span></button></div></div></div></li><li><p>Click the <strong>Init Scripts</strong> tab. 
Set cluster init script as <code>dbfs:/datahub/init.sh</code>.</p></li><li><p>Configuring DataHub authentication token</p><ul><li><p>Add below config in cluster spark config.</p><div class="language-text codeBlockContainer_Ckt0 theme-code-block" style="--prism-color:#bfc7d5;--prism-background-color:#292d3e"><div class="codeBlockContent_biex"><pre tabindex="0" class="prism-code language-text codeBlock_bY9V thin-scrollbar"><code class="codeBlockLines_e6Vv"><span class="token-line" style="color:#bfc7d5"><span class="token plain">spark.datahub.rest.token &lt;token&gt;</span><br></span></code></pre><div class="buttonGroup__atx"><button type="button" aria-label="Copy code to clipboard" title="Copy" class="clean-btn"><span class="copyButtonIcons_eSgA" aria-hidden="true"><svg viewBox="0 0 24 24" class="copyButtonIcon_y97N"><path fill="currentColor" d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"></path></svg><svg viewBox="0 0 24 24" class="copyButtonSuccessIcon_LjdS"><path fill="currentColor" d="M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z"></path></svg></span></button></div></div></div></li><li><p>Alternatively, Databricks secrets can be used to secure token.</p><ul><li><p>Create secret using Databricks CLI.</p><div class="language-sh codeBlockContainer_Ckt0 theme-code-block" style="--prism-color:#bfc7d5;--prism-background-color:#292d3e"><div class="codeBlockContent_biex"><pre tabindex="0" class="prism-code language-sh codeBlock_bY9V thin-scrollbar"><code class="codeBlockLines_e6Vv"><span class="token-line" style="color:#bfc7d5"><span class="token plain">databricks secrets create-scope --scope datahub --initial-manage-principal users</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">databricks secrets put --scope datahub --key rest-token</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">databricks secrets 
list --scope datahub &amp;lt;&amp;lt;Edit prompted file with token value&amp;gt;&amp;gt;</span><br></span></code></pre><div class="buttonGroup__atx"><button type="button" aria-label="Copy code to clipboard" title="Copy" class="clean-btn"><span class="copyButtonIcons_eSgA" aria-hidden="true"><svg viewBox="0 0 24 24" class="copyButtonIcon_y97N"><path fill="currentColor" d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2
datasets. To link these 2 things, urns generated by both have to match.
This section will help you match urns to those of other ingestion sources.
By default, URNs are created using
template <code>urn:li:dataset:(urn:li:dataPlatform:&lt;$platform&gt;,&lt;platformInstance&gt;.&lt;name&gt;,&lt;env&gt;)</code>. We can configure these 4
things to generate the desired urn.</p><p><strong>Platform</strong>:
HDFS-based platforms supported explicitly:</p><ul><li>AWS S3 (s3)</li><li>Google Cloud Storage (gcs)</li><li>local (local file system) (local)</li></ul><p>
All other platforms will have &quot;hdfs&quot; as a platform.</p><p><strong>Name</strong>:
By default, the name is the complete path. For HDFS-based datasets, tables can be at different levels in the path than
that of the actual file read due to various reasons like partitioning and sharding. &#x27;path_spec&#x27; is used to alter the
name.
The {table} marker is used to specify the table level. Below are a few examples. One can specify multiple path_specs for
different paths specified in the <code>path_spec_list</code>. Each actual path is matched against all path_specs present in the
list. The first one to match will be used to generate the urn.</p><p><strong>path_spec Examples</strong></p><div class="codeBlockContainer_Ckt0 theme-code-block" style="--prism-color:#bfc7d5;--prism-background-color:#292d3e"><div class="codeBlockContent_biex"><pre tabindex="0" class="prism-code language-text codeBlock_bY9V thin-scrollbar"><code class="codeBlockLines_e6Vv"><span class="token-line" style="color:#bfc7d5"><span class="token plain">spark.datahub.platform.s3.path_spec_list=s3://my-bucket/foo/{table}/year=*/month=*/day=*/*,s3://my-other-bucket/foo/{table}/year=*/month=*/day=*/*</span><br></span></code></pre><div class="buttonGroup__atx"><button type="button" aria-label="Copy code to clipboard" title="Copy" class="clean-btn"><span class="copyButtonIcons_eSgA" aria-hidden="true"><svg viewBox="0 0 24 24" class="copyButtonIcon_y97N"><path fill="currentColor" d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"></path></svg><svg viewBox="0 0 24 24" class="copyButtonSuccessIcon_LjdS"><path fill="currentColor" d="M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z"></path></svg></span></button></div></div></div><table><thead><tr><th>Absolute path</th><th>path_spec</th><th>Urn</th></tr></thead><tbody><tr><td>s3://my-bucket/foo/tests/bar.avro</td><td>Not provided</td><td>urn:li:dataset:(urn:li:dataPlatform:s3,my-bucket/foo/tests/bar.avro,PROD)</td></tr><tr><td>s3://my-bucket/foo/tests/bar.avro</td><td>s3://my-bucket/foo/{table}/<!-- 
-->*</td><td>urn:li:dataset:(urn:li:dataPlatform:s3,my-bucket/foo/tests,PROD)</td></tr><tr><td>s3://my-bucket/foo/tests/bar.avro</td><td>s3://my-bucket/foo/tests/{table}</td><td>urn:li:dataset:(urn:li:dataPlatform:s3,my-bucket/foo/tests/bar.avro,PROD)</td></tr><tr><td>gs://my-bucket/foo/tests/bar.avro</td><td>gs://my-bucket/{table}/*/*</td><td>urn:li:dataset:(urn:li:dataPlatform:gcs,my-bucket/foo,PROD)</td></tr><tr><td>gs://my-bucket/foo/tests/bar.avro</td><td>gs://my-bucket/{table}</td><td>urn:li:dataset:(urn:li:dataPlatform:gcs,my-bucket/foo,PROD)</td></tr><tr><td>file:///my-bucket/foo/tests/bar.avro</td><td>file:///my-bucket/*/*/{table}</td><td>urn:li:dataset:(urn:li:dataPlatform:local,my-bucket/foo/tests/bar.avro,PROD)</td></tr></tbody></table><p><strong>platform instance and env:</strong></p><p>The default value for env is &#x27;PROD&#x27; and the platform instance is None. env and platform instances can be set for all
datasets using configurations &#x27;spark.datahub.metadata.dataset.env&#x27; and &#x27;spark.datahub.metadata.dataset.platformInstance&#x27;.
If spark is processing data that belongs to a different env or platform instance, then &#x27;path_alias&#x27; can be used to
specify <code>path_spec</code> specific values of these. &#x27;path_alias&#x27; groups the &#x27;path_spec_list&#x27;, its env, and platform instance
together.</p><p>path_alias_list Example:</p><p>The below example explains the configuration of the case, where files from 2 buckets are being processed in a single
spark application and files from my-bucket are supposed to have &quot;instance-1&quot; as platform instance and &quot;PROD&quot; as env, and
files from bucket2 should have env &quot;DEV&quot; in their dataset URNs.</p><div class="codeBlockContainer_Ckt0 theme-code-block" style="--prism-color:#bfc7d5;--prism-background-color:#292d3e"><div class="codeBlockContent_biex"><pre tabindex="0" class="prism-code language-text codeBlock_bY9V thin-scrollbar"><code class="codeBlockLines_e6Vv"><span class="token-line" style="color:#bfc7d5"><span class="token plain">spark.datahub.platform.s3.path_alias_list : path1,path2</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">spark.datahub.platform.s3.path1.env : PROD</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">spark.datahub.platform.s3.path1.path_spec_list: s3://my-bucket/*/*/{table}</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">spark.datahub.platform.s3.path1.platform_instance : instance-1</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">spark.datahub.platform.s3.path2.env: DEV</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">spark.datahub.platform.s3.path2.path_spec_list: s3://bucket2/*/{table}</span><br></span></code></pre><div class="buttonGroup__atx"><button type="button" aria-label="Copy code to clipboard" title="Copy" class="clean-btn"><span class="copyButtonIcons_eSgA" aria-hidden="true"><svg viewBox="0 0 24 24" class="copyButtonIcon_y97N"><path fill="currentColor" d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"></path></svg><svg viewBox="0 0 24 24" class="copyButtonSuccessIcon_LjdS"><path fill="currentColor" d="M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z"></path></svg></span></button></div></div></div><h3 class="anchor anchorWithStickyNavbar_LWe7" id="important-notes-on-usage">Important notes on usage<a href="#important-notes-on-usage" class="hash-link" 
aria-label="Direct link to Important notes on usage" title="Direct link to Important notes on usage"></a></h3><ul><li>It is advisable to ensure appName is used appropriately to ensure you can trace lineage from a pipeline back to your
source code.</li><li>If multiple apps with the same appName run concurrently, dataset-lineage will be captured correctly but the
custom-properties e.g. app-id, SQLQueryId would be unreliable. We expect this to be quite rare.</li><li>If spark execution fails, then an empty pipeline would still get created, but it may not have any tasks.</li><li>For HDFS sources, the folder (name) is regarded as the dataset (name) to align with typical storage of parquet/csv
formats.</li></ul><h3 class="anchor anchorWithStickyNavbar_LWe7" id="debugging">Debugging<a href="#debugging" class="hash-link" aria-label="Direct link to Debugging" title="Direct link to Debugging"></a></h3><ul><li>Following info logs are generated</li></ul><p>On Spark context startup</p><div class="language-text codeBlockContainer_Ckt0 theme-code-block" style="--prism-color:#bfc7d5;--prism-background-color:#292d3e"><div class="codeBlockContent_biex"><pre tabindex="0" class="prism-code language-text codeBlock_bY9V thin-scrollbar"><code class="codeBlockLines_e6Vv"><span class="token-line" style="color:#bfc7d5"><span class="token plain">YY/MM/DD HH:mm:ss INFO DatahubSparkListener: DatahubSparkListener initialised.</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">YY/MM/DD HH:mm:ss INFO SparkContext: Registered listener datahub.spark.DatahubSparkListener</span><br></span></code></pre><div class="buttonGroup__atx"><button type="button" aria-label="Copy code to clipboard" title="Copy" class="clean-btn"><span class="copyButtonIcons_eSgA" aria-hidden="true"><svg viewBox="0 0 24 24" class="copyButtonIcon_y97N"><path fill="currentColor" d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"></path></svg><svg viewBox="0 0 24 24" class="copyButtonSuccessIcon_LjdS"><path fill="currentColor" d="M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z"></path></svg></span></button></div></div></div><p>On application start</p><div class="language-text codeBlockContainer_Ckt0 theme-code-block" style="--prism-color:#bfc7d5;--prism-background-color:#292d3e"><div class="codeBlockContent_biex"><pre tabindex="0" class="prism-code language-text codeBlock_bY9V thin-scrollbar"><code class="codeBlockLines_e6Vv"><span class="token-line" style="color:#bfc7d5"><span class="token plain">YY/MM/DD HH:mm:ss INFO DatahubSparkListener: Application started: 
SparkListenerApplicationStart(AppName,Some(local-1644489736794),1644489735772,user,None,None)</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">YY/MM/DD HH:mm:ss INFO McpEmitter: REST Emitter Configuration: GMS url &lt;rest.server&gt;</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">YY/MM/DD HH:mm:ss INFO McpEmitter: REST Emitter Configuration: Token XXXXX</span><br></span></code></pre><div class="buttonGroup__atx"><button type="button" aria-label="Copy code to clipboard" title="Copy" class="clean-btn"><span class="copyButtonIcons_eSgA" aria-hidden="true"><svg viewBox="0 0 24 24" class="copyButtonIcon_y97N"><path fill="currentColor" d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"></path></svg><svg viewBox="0 0 24 24" class="copyButtonSuccessIcon_LjdS"><path fill="currentColor" d="M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z"></path></svg></span></button></div></div></div><p>On pushing data to server</p><div class="language-text codeBlockContainer_Ckt0 theme-code-block" style="--prism-color:#bfc7d5;--prism-background-color:#292d3e"><div class="codeBlockContent_biex"><pre tabindex="0" class="prism-code language-text codeBlock_bY9V thin-scrollbar"><code class="codeBlockLines_e6Vv"><span class="token-line" style="color:#bfc7d5"><span class="token plain">YY/MM/DD HH:mm:ss INFO McpEmitter: MetadataWriteResponse(success=true, responseContent={&quot;value&quot;:&quot;&lt;URN&gt;&quot;}, underlyingResponse=HTTP/1.1 200 OK [Date: day, DD month year HH:mm:ss GMT, Content-Type: application/json, X-RestLi-Protocol-Version: 2.0.0, Content-Length: 97, Server: Jetty(9.4.46.v20220331)] [Content-Length: 97,Chunked: false])</span><br></span></code></pre><div class="buttonGroup__atx"><button type="button" aria-label="Copy code to clipboard" title="Copy" class="clean-btn"><span class="copyButtonIcons_eSgA" 
aria-hidden="true"><svg viewBox="0 0 24 24" class="copyButtonIcon_y97N"><path fill="currentColor" d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V2
You can remove the old lineages by setting <code>spark.datahub.legacyLineageCleanup.enabled=true</code>. Make sure you have the latest server if you enable with patch support. (this was introduced since 0.2.17-rc5)</li></ul></li><li><p><em>Changes</em>:</p><ul><li>OpenLineage 1.25.0 upgrade</li><li>Add option to disable chunked encoding in the datahub rest sink -&gt; <code>spark.datahub.rest.disable_chunked_encoding</code></li><li>Add option to specify the mcp kafka topic for the datahub kafka sink -&gt; <code>spark.datahub.kafka.mcp_topic</code></li><li>Add option to remove legacy lineages from older Spark Plugin runs. This will remove those lineages from the Datasets which it adds to DataJob -&gt; <code>spark.datahub.legacyLineageCleanup.enabled</code></li></ul></li><li><p><em>Fixes</em>:</p><ul><li>Fix handling map transformation in the lineage. Earlier it generated wrong lineage for map transformation.</li></ul></li></ul><h3 class="anchor anchorWithStickyNavbar_LWe7" id="version-0216">Version 0.2.16<a href="#version-0216" class="hash-link" aria-label="Direct link to Version 0.2.16" title="Direct link to Version 0.2.16"></a></h3><ul><li>Remove logging DataHub config into logs</li></ul><h3 class="anchor anchorWithStickyNavbar_LWe7" id="version-0215">Version 0.2.15<a href="#version-0215" class="hash-link" aria-label="Direct link to Version 0.2.15" title="Direct link to Version 0.2.15"></a></h3><ul><li>Add Kafka emitter to emit lineage to kafka</li><li>Add File emitter to emit lineage to file</li><li>Add S3 emitter to save mcps to s3</li><li>Upgrading OpenLineage to 1.19.0</li><li>Renaming project to acryl-datahub-spark-lineage</li><li>Supporting OpenLineage 1.17+ glue identifier changes</li><li>Fix handling OpenLineage input/output where wasn&#x27;t any facet attached</li></ul><h3 class="anchor anchorWithStickyNavbar_LWe7" id="version-0214">Version 0.2.14<a href="#version-0214" class="hash-link" aria-label="Direct link to Version 0.2.14" title="Direct link to 
Version 0.2.14"></a></h3><ul><li>Fix warning about MeterFilter warning from Micrometer</li></ul><h3 class="anchor anchorWithStickyNavbar_LWe7" id="version-0213">Version 0.2.13<a href="#version-0213" class="hash-link" aria-label="Direct link to Version 0.2.13" title="Direct link to Version 0.2.13"></a></h3><ul><li>Add kafka emitter to emit lineage to kafka</li></ul><h3 class="anchor anchorWithStickyNavbar_LWe7" id="version-0212">Version 0.2.12<a href="#version-0212" class="hash-link" aria-label="Direct link to Version 0.2.12" title="Direct link to Version 0.2.12"></a></h3><ul><li>Silencing some chatty warnings in RddPathUtils</li></ul><h3 class="anchor anchorWithStickyNavbar_LWe7" id="version-0211">Version 0.2.11<a href="#version-0211" class="hash-link" aria-label="Direct link to Version 0.2.11" title="Direct link to Version 0.2.11"></a></h3><ul><li>Add option to lowercase dataset URNs</li><li>Add option to set platform instance and/or env per platform with <code>spark.datahub.platform.&lt;platform_name&gt;.env</code> and <code>spark.datahub.platform.&lt;platform_name&gt;.platform_instance</code> config parameter</li><li>Fixing platform instance setting for datasets when <code>spark.datahub.metadata.dataset.platformInstance</code> is set</li><li>Fixing column level lineage support when patch is enabled</li></ul></div><footer class="theme-doc-footer docusaurus-mt-lg"><div class="slackUtm_uoBr"><div class="slackUtm_uoBr"><hr>Need more help? 
Join the conversation in <a href="https://datahub.com/slack?utm_source=docs&amp;utm_medium=footer&amp;utm_campaign=docs_footer&amp;utm_content=metadata-integration/java/acryl-spark-lineage/README">Slack!</a></div></div><div class="theme-doc-footer-edit-meta-row row"><div class="col"><a href="https://github.com/datahub-project/datahub/blob/master/metadata-integration/java/acryl-spark-lineage/README.md" target="_blank" rel="noreferrer noopener" class="theme-edit-this-page"><svg fill="currentColor" height="20" width="20" viewBox="0 0 40 40" class="iconEdit_Z9Sw" aria-hidden="true"><g><path d="m34.5 11.7l-3 3.1-6.3-6
<script src="/assets/js/runtime~main.310f59c4.js"></script>
<script src="/assets/js/main.49198d73.js"></script>
</body>
</html>