mirror of
https://github.com/datahub-project/datahub.git
synced 2025-07-23 17:39:59 +00:00
147 lines
117 KiB
HTML
147 lines
117 KiB
HTML
![]() |
<!doctype html>
|
|||
|
<html lang="en" dir="ltr" class="docs-wrapper docs-doc-page docs-version-current plugin-docs plugin-id-default docs-doc-id-metadata-integration/java/acryl-spark-lineage/README" data-has-hydrated="false">
|
|||
|
<head>
|
|||
|
<meta charset="UTF-8">
|
|||
|
<meta name="generator" content="Docusaurus v2.4.3">
|
|||
|
<title data-rh="true">Spark | DataHub</title><meta data-rh="true" name="viewport" content="width=device-width,initial-scale=1"><meta data-rh="true" name="twitter:card" content="summary_large_image"><meta data-rh="true" property="og:url" content="https://docs.datahub.com/docs/metadata-integration/java/acryl-spark-lineage"><meta data-rh="true" name="docusaurus_locale" content="en"><meta data-rh="true" name="docsearch:language" content="en"><meta data-rh="true" name="docusaurus_version" content="current"><meta data-rh="true" name="docusaurus_tag" content="docs-default-current"><meta data-rh="true" name="docsearch:version" content="current"><meta data-rh="true" name="docsearch:docusaurus_tag" content="docs-default-current"><meta data-rh="true" property="og:title" content="Spark | DataHub"><meta data-rh="true" name="description" content="To integrate Spark with DataHub, we provide a lightweight Java agent that listens for Spark application and job events"><meta data-rh="true" property="og:description" content="To integrate Spark with DataHub, we provide a lightweight Java agent that listens for Spark application and job events"><link data-rh="true" rel="icon" href="/img/favicon.ico"><link data-rh="true" rel="canonical" href="https://docs.datahub.com/docs/metadata-integration/java/acryl-spark-lineage"><link data-rh="true" rel="alternate" href="https://docs.datahub.com/docs/metadata-integration/java/acryl-spark-lineage" hreflang="en"><link data-rh="true" rel="alternate" href="https://docs.datahub.com/docs/metadata-integration/java/acryl-spark-lineage" hreflang="x-default"><link data-rh="true" rel="preconnect" href="https://RK0UG797F3-dsn.algolia.net" crossorigin="anonymous"><link rel="alternate" type="application/rss+xml" href="/learn/rss.xml" title="DataHub RSS Feed">
|
|||
|
<link rel="alternate" type="application/atom+xml" href="/learn/atom.xml" title="DataHub Atom Feed">
|
|||
|
|
|||
|
<link rel="preconnect" href="https://www.google-analytics.com">
|
|||
|
<link rel="preconnect" href="https://www.googletagmanager.com">
|
|||
|
<script async src="https://www.googletagmanager.com/gtag/js?id=G-PKGVLETT4C"></script>
|
|||
|
<script>function gtag(){dataLayer.push(arguments)}window.dataLayer=window.dataLayer||[],gtag("js",new Date),gtag("config","G-PKGVLETT4C",{})</script>
|
|||
|
<link rel="preconnect" href="https://www.googletagmanager.com">
|
|||
|
<script>window.dataLayer=window.dataLayer||[]</script>
|
|||
|
<script>!function(e,t,a,n,g){e[n]=e[n]||[],e[n].push({"gtm.start":(new Date).getTime(),event:"gtm.js"});var m=t.getElementsByTagName(a)[0],r=t.createElement(a);r.async=!0,r.src="https://www.googletagmanager.com/gtm.js?id=GTM-5M8T9HNN",m.parentNode.insertBefore(r,m)}(window,document,"script","dataLayer")</script>
|
|||
|
|
|||
|
|
|||
|
<link rel="search" type="application/opensearchdescription+xml" title="DataHub" href="/opensearch.xml">
|
|||
|
|
|||
|
|
|||
|
|
|||
|
|
|||
|
<meta http-equiv="Content-Security-Policy" content="frame-ancestors 'self' https://*.acryl.io https://acryldata.io http://localhost:*">
|
|||
|
<link rel="stylesheet" href="https://fonts.googleapis.com/css2?family=Manrope:wght@400;500;700&display=swap">
|
|||
|
<script src="https://tools.luckyorange.com/core/lo.js?site-id=28ea8a38" async defer="defer"></script>
|
|||
|
<script src="/scripts/rb2b.js" async defer="defer"></script>
|
|||
|
<script src="https://app.revenuehero.io/scheduler.min.js"></script>
|
|||
|
<script src="https://tag.clearbitscripts.com/v1/pk_2e321cabe30432a5c44c0424781aa35f/tags.js" referrerpolicy="strict-origin-when-cross-origin"></script>
|
|||
|
<script src="/scripts/reo.js"></script>
|
|||
|
<script id="runllm-widget-script" type="module" src="https://widget.runllm.com" crossorigin="true" runllm-name="DataHub" runllm-assistant-id="81" runllm-position="BOTTOM_RIGHT" runllm-keyboard-shortcut="Mod+j" runllm-preset="docusaurus" runllm-theme-color="#1890FF" runllm-brand-logo="https://docs.datahub.com/img/datahub-logo-color-mark.svg" runllm-community-url="https://datahub.com/slack" runllm-community-type="slack" runllm-disable-ask-a-person="true" async></script><link rel="stylesheet" href="/assets/css/styles.d8fe2eb8.css">
|
|||
|
<link rel="preload" href="/assets/js/runtime~main.310f59c4.js" as="script">
|
|||
|
<link rel="preload" href="/assets/js/main.49198d73.js" as="script">
|
|||
|
</head>
|
|||
|
<body class="navigation-with-keyboard">
|
|||
|
<noscript><iframe src="https://www.googletagmanager.com/ns.html?id=GTM-5M8T9HNN" height="0" width="0" style="display:none;visibility:hidden"></iframe></noscript>
|
|||
|
|
|||
|
<script>!function(){function t(t){document.documentElement.setAttribute("data-theme",t)}var e=function(){var t=null;try{t=new URLSearchParams(window.location.search).get("docusaurus-theme")}catch(t){}return t}()||function(){var t=null;try{t=localStorage.getItem("theme")}catch(t){}return t}();t(null!==e?e:"light")}(),document.documentElement.setAttribute("data-announcement-bar-initially-dismissed",function(){try{return"true"===localStorage.getItem("docusaurus.announcement.dismiss")}catch(t){}return!1}())</script><div id="__docusaurus">
|
|||
|
<div role="region" aria-label="Skip to main content"><a class="skipToContent_fXgn" href="#__docusaurus_skipToContent_fallback">Skip to main content</a></div><div class="announcementBar_mb4j" style="background-color:transparent;color:#ffffff" role="banner"><div class="content_knG7 announcementBarContent_xLdY"><div class="shimmer-banner"><p>DataHub Secures $35 Million Series B</p><a href="https://datahub.com/news/series-b-announcement/" target="_blank" class="button"><div>Read the announcement<span> →</span></div></a></div></div></div><nav aria-label="Main" class="navbar navbar--fixed-top"><div class="navbar__inner"><div class="navbar__items"><button aria-label="Toggle navigation bar" aria-expanded="false" class="navbar__toggle clean-btn" type="button"><svg width="30" height="30" viewBox="0 0 30 30" aria-hidden="true"><path stroke="currentColor" stroke-linecap="round" stroke-miterlimit="10" stroke-width="2" d="M4 7h22M4 15h22M4 23h22"></path></svg></button><a href="https://datahub.com" target="_blank" rel="noopener noreferrer" class="navbar__brand"><div class="navbar__logo"><img src="/img/datahub-logo-color-light-horizontal.svg" alt="DataHub Logo" class="themedImage_ToTc themedImage--light_HNdA"><img src="/img/datahub-logo-color-dark-horizontal.svg" alt="DataHub Logo" class="themedImage_ToTc themedImage--dark_i4oU"></div></a><div class="navbar__item dropdown dropdown--hoverable"><a href="#" aria-haspopup="true" aria-expanded="false" role="button" class="navbar__link versionNavItem_cbn8">Next</a><ul class="dropdown__menu"><li><a aria-current="page" class="dropdown__link dropdown__link--active" href="/docs/metadata-integration/java/acryl-spark-lineage">Next</a></li><li><a class="dropdown__link" href="/docs/1.1.0/metadata-integration/java/acryl-spark-lineage">1.1.0</a></li><li><hr class="dropdown-separator" style="margin: 0.4rem;"></li><li><div class="dropdown__link"><b>Archived versions</b></div></li><li>
|
|||
|
<a class="dropdown__link" href="https://docs-website-t9sv4w3gr-acryldata.vercel.app/docs/features">1.0.0
|
|||
|
<svg width="12" height="12" aria-hidden="true" viewBox="0 0 24 24"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg>
|
|||
|
</a>
|
|||
|
</li><li>
|
|||
|
<a class="dropdown__link" href="https://docs-website-t9sv4w3gr-acryldata.vercel.app/docs/0.15.0/features">0.15.0
|
|||
|
<svg width="12" height="12" aria-hidden="true" viewBox="0 0 24 24"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg>
|
|||
|
</a>
|
|||
|
</li><li>
|
|||
|
<a class="dropdown__link" href="https://docs-website-8jkm4uler-acryldata.vercel.app/docs/0.14.1/features">0.14.1
|
|||
|
<svg width="12" height="12" aria-hidden="true" viewBox="0 0 24 24"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg>
|
|||
|
</a>
|
|||
|
</li><li>
|
|||
|
<a class="dropdown__link" href="https://docs-website-eue2qafvn-acryldata.vercel.app/docs/features">0.14.0
|
|||
|
<svg width="12" height="12" aria-hidden="true" viewBox="0 0 24 24"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg>
|
|||
|
</a>
|
|||
|
</li><li>
|
|||
|
<a class="dropdown__link" href="https://docs-website-psat3nzgi-acryldata.vercel.app/docs/features">0.13.1
|
|||
|
<svg width="12" height="12" aria-hidden="true" viewBox="0 0 24 24"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg>
|
|||
|
</a>
|
|||
|
</li><li>
|
|||
|
<a class="dropdown__link" href="https://docs-website-lzxh86531-acryldata.vercel.app/docs/features">0.13.0
|
|||
|
<svg width="12" height="12" aria-hidden="true" viewBox="0 0 24 24"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg>
|
|||
|
</a>
|
|||
|
</li><li>
|
|||
|
<a class="dropdown__link" href="https://docs-website-2uuxmgza2-acryldata.vercel.app/docs/features">0.12.1
|
|||
|
<svg width="12" height="12" aria-hidden="true" viewBox="0 0 24 24"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg>
|
|||
|
</a>
|
|||
|
</li><li>
|
|||
|
<a class="dropdown__link" href="https://docs-website-irpoe2osc-acryldata.vercel.app/docs/features">0.11.0
|
|||
|
<svg width="12" height="12" aria-hidden="true" viewBox="0 0 24 24"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg>
|
|||
|
</a>
|
|||
|
</li><li>
|
|||
|
<a class="dropdown__link" href="https://docs-website-1gv2yzn9d-acryldata.vercel.app/docs/features">0.10.5
|
|||
|
<svg width="12" height="12" aria-hidden="true" viewBox="0 0 24 24"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg>
|
|||
|
</a>
|
|||
|
</li></ul></div></div><div class="navbar__items navbar__items--right"><a aria-current="page" class="navbar__item navbar__link navbar__link--active" href="/docs">Docs</a><a class="navbar__item navbar__link" href="/integrations">Integrations</a><div class="navbar__item dropdown dropdown--hoverable"><a href="#" aria-haspopup="true" aria-expanded="false" role="button" class="navbar__link">Learn</a><ul class="dropdown__menu dropdown__menu_Z8FC"><div class="wrapper_kp81"><div><a href="https://datahub.com/weekly-demo" target="_blank" rel="noopener noreferrer" class="card_BUD7"><div class="icon_BgHd"><img src="/img/icon-join-slack.png" alt="Weekly Demo"></div><div class="title_c7DP">Weekly Demo</div></a></div><div><a href="https://datahub.com/use-cases" target="_blank" rel="noopener noreferrer" class="card_BUD7"><div class="icon_BgHd"><img src="/img/icon-forum.png" alt="Use Cases"></div><div class="title_c7DP">Use Cases</div></a></div><div><a href="https://datahub.com/adoption-stories" target="_blank" rel="noopener noreferrer" class="card_BUD7"><div class="icon_BgHd"><img src="/img/icon-events.png" alt="Adoption Stories"></div><div class="title_c7DP">Adoption Stories</div></a></div><div><a href="https://medium.com/datahub-project" target="_blank" rel="noopener noreferrer" class="card_BUD7"><div class="icon_BgHd"><img src="/img/icon-champions.png" alt="Blog"></div><div class="title_c7DP">Blog</div></a></div><div><a href="https://www.youtube.com/channel/UC3qFQC5IiwR5fvWEqi_tJ5w" target="_blank" rel="noopener noreferrer" class="card_BUD7"><div class="icon_BgHd"><img src="/img/icon-share-your-journey.png" alt="Youtube"></div><div class="title_c7DP">Youtube</div></a></div></div></ul></div><div class="navbar__item dropdown dropdown--hoverable"><a href="#" aria-haspopup="true" aria-expanded="false" role="button" class="navbar__link">Community</a><ul class="dropdown__menu dropdown__menu_Z8FC"><div class="wrapper_kp81"><div><a href="https://datahub.com/slack/" target="_blank" 
rel="noopener noreferrer" class="card_BUD7"><div class="icon_BgHd"><img src="/img/icon-join-slack.png" alt="Join Slack"></div><div class="title_c7DP">Join Slack</div></a></div><div><a href="https://datahub.com/events" target="_blank" rel="noopener noreferrer" class="card_BUD7"><div class="icon_BgHd"><img src="/img/icon-events.png" alt="Events"></div><div class="title_c7DP">Events</div></a></div><div><a href="https://datahub.com/champions/" target="_blank" rel="noopener noreferrer" class="card_BUD7"><div class="icon_BgHd"><img src="/img/icon-champions.png" alt="Champions"></div><div class="title_c7DP">Champions</div></a></div><div><a href="https://datahub.com/share-your-journey/" target="_blank" rel="noopener noreferrer" class="card_BUD7"><div class="icon_BgHd"><img src="/img/icon-share-your-journey.png" alt="Share Your Journey"></div><div class="title_c7DP">Share Your Journey</div></a></div></div></ul></div><a href="https://datahub.com/products/why-datahub-cloud/" target="_blank" rel="noopener noreferrer" class="navbar__item navbar__link">
|
|||
|
<style>
|
|||
|
.cloud-cta {
|
|||
|
color: var(--ifm-menu-color-active);
|
|||
|
font-weight: 600;
|
|||
|
background: linear-gradient(40deg, var(--ifm-menu-color-active), var(--ifm-menu-color-active));
|
|||
|
background-size: 200% 100%;
|
|||
|
-webkit-background-clip: text;
|
|||
|
background-clip: text;
|
|||
|
transition: background-image 0.3s ease;
|
|||
|
}
|
|||
|
.cloud-cta:hover {
|
|||
|
color: transparent;
|
|||
|
background: linear-gradient(40deg, var(--ifm-menu-color-active), #ff1493);
|
|||
|
background-size: 200% 100%;
|
|||
|
-webkit-background-clip: text;
|
|||
|
background-clip: text;
|
|||
|
animation: gradientShift 3s ease infinite;
|
|||
|
}
|
|||
|
@keyframes gradientShift {
|
|||
|
0%, 100% { background-position: 0% 50%; }
|
|||
|
50% { background-position: 100% 50%; }
|
|||
|
}
|
|||
|
</style>
|
|||
|
<div class="cloud-cta">Get Cloud</div>
|
|||
|
</a><a href="https://datahub.com/slack?utm_source=docs&utm_medium=header&utm_campaign=docs_header" target="_blank" rel="noopener noreferrer" class="navbar__item navbar__link">
|
|||
|
<style>
|
|||
|
.slack-logo:hover {
|
|||
|
opacity: 0.8;
|
|||
|
}
|
|||
|
</style>
|
|||
|
<img class="slack-logo" src="https://upload.wikimedia.org/wikipedia/commons/d/d5/Slack_icon_2019.svg" alt="slack" height="20" style="margin: 10px 0 0 0;">
|
|||
|
</a><div class="searchBox_ZlJk"><button type="button" class="DocSearch DocSearch-Button" aria-label="Search"><span class="DocSearch-Button-Container"><svg width="20" height="20" class="DocSearch-Search-Icon" viewBox="0 0 20 20" aria-hidden="true"><path d="M14.386 14.386l4.0877 4.0877-4.0877-4.0877c-2.9418 2.9419-7.7115 2.9419-10.6533 0-2.9419-2.9418-2.9419-7.7115 0-10.6533 2.9418-2.9419 7.7115-2.9419 10.6533 0 2.9419 2.9418 2.9419 7.7115 0 10.6533z" stroke="currentColor" fill="none" fill-rule="evenodd" stroke-linecap="round" stroke-linejoin="round"></path></svg><span class="DocSearch-Button-Placeholder">Search</span></span><span class="DocSearch-Button-Keys"></span></button></div></div></div><div role="presentation" class="navbar-sidebar__backdrop"></div></nav><div id="__docusaurus_skipToContent_fallback" class="main-wrapper mainWrapper_z2l0 docsWrapper_BCFX"><button aria-label="Scroll back to top" class="clean-btn theme-back-to-top-button backToTopButton_sjWU" type="button"></button><div class="docPage__5DB"><aside class="theme-doc-sidebar-container docSidebarContainer_b6E3"><div class="sidebarViewport_Xe31"><div class="sidebar_njMd"><nav aria-label="Docs sidebar" class="menu thin-scrollbar menu_SIkG menuWithAnnouncementBar_GW3s"><ul class="theme-doc-sidebar-menu menu__list"><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-1 menuHtmlItem_M9Kj menu__list-item"><div>Getting Started</div></li><li class="theme-doc-sidebar-item-category theme-doc-sidebar-item-category-level-1 menu__list-item menu__list-item--collapsed"><div class="menu__list-item-collapsible"><a class="menu__link menu__link--sublist" aria-expanded="false" href="/docs/features">What Is DataHub?</a><button aria-label="Toggle the collapsible sidebar category 'What Is DataHub?'" type="button" class="clean-btn menu__caret"></button></div></li><li class="theme-doc-sidebar-item-category theme-doc-sidebar-item-category-level-1 menu__list-item menu__list-item--collapsed"><div 
class="menu__list-item-collapsible"><a class="menu__link menu__link--sublist" aria-expanded="false" href="/docs/category/features">Features</a><button aria-label="Toggle the collapsible sidebar category 'Features'" type="button" class="clean-btn menu__caret"></button></div></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-1 menuHtmlItem_M9Kj menu__list-item"><div>DataHub Cloud</div></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-1 menu__list-item"><a class="menu__link" href="/docs/managed-datahub/managed-datahub-overview">DataHub Cloud Overview</a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-1 menu__list-item"><a class="menu__link" href="/docs/managed-datahub/welcome-acryl">Getting Started with DataHub Cloud</a></li><li class="theme-doc-sidebar-item-category theme-doc-sidebar-item-category-level-1 menu__list-item menu__list-item--collapsed"><div class="menu__list-item-collapsible"><a class="menu__link menu__link--sublist menu__link--sublist-caret" aria-expanded="false" href="/docs/authentication/guides/sso/initialize-oidc">Configure Single Sign-On</a></div></li><li class="theme-doc-sidebar-item-category theme-doc-sidebar-item-category-level-1 menu__list-item menu__list-item--collapsed"><div class="menu__list-item-collapsible"><a class="menu__link menu__link--sublist menu__link--sublist-caret" aria-expanded="false" href="/docs/managed-datahub/remote-executor/about">Remote Executor</a></div></li><li class="theme-doc-sidebar-item-category theme-doc-sidebar-item-category-level-1 menu__list-item menu__list-item--collapsed"><div class="menu__list-item-collapsible"><a class="menu__link menu__link--sublist menu__link--sublist-caret" aria-expanded="false" href="/docs/managed-datahub/datahub-api/entity-events-api">DataHub API</a></div></li><li class="theme-doc-sidebar-item-category theme-doc-sidebar-item-category-level-1 menu__list-item menu__list-item--collapsed"><div 
class="menu__list-item-collapsible"><a class="menu__link menu__link--sublist menu__link--subli
|
|||
|
and pushes metadata out to DataHub in real-time. The agent listens to events such as application start/end, and
|
|||
|
SQLExecution start/end to create pipelines (i.e. DataFlow) and tasks (i.e. DataJob) in DataHub along with lineage to
|
|||
|
datasets that are being read from and written to. Read on to learn how to configure this for different Spark scenarios.</p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="configuring-spark-agent">Configuring Spark agent<a href="#configuring-spark-agent" class="hash-link" aria-label="Direct link to Configuring Spark agent" title="Direct link to Configuring Spark agent"></a></h2><p>The Spark agent can be configured using a config file or while creating a Spark Session. If you are using Spark on
|
|||
|
Databricks, refer to <a href="#configuration-instructions-databricks">Configuration Instructions for Databricks</a>.</p><h3 class="anchor anchorWithStickyNavbar_LWe7" id="before-you-begin-versions-and-release-notes">Before you begin: Versions and Release Notes<a href="#before-you-begin-versions-and-release-notes" class="hash-link" aria-label="Direct link to Before you begin: Versions and Release Notes" title="Direct link to Before you begin: Versions and Release Notes"></a></h3><p>Versioning of the jar artifact will follow the semantic versioning of the
|
|||
|
main <a href="https://github.com/datahub-project/datahub" target="_blank" rel="noopener noreferrer">DataHub repo</a> and release notes will be
|
|||
|
available <a href="https://github.com/datahub-project/datahub/releases" target="_blank" rel="noopener noreferrer">here</a>.
|
|||
|
Always check <a href="https://search.maven.org/search?q=a:acryl-spark-lineage" target="_blank" rel="noopener noreferrer">the Maven central repository</a> for the latest
|
|||
|
released version.</p><h3 class="anchor anchorWithStickyNavbar_LWe7" id="configuration-instructions-spark-submit">Configuration Instructions: spark-submit<a href="#configuration-instructions-spark-submit" class="hash-link" aria-label="Direct link to Configuration Instructions: spark-submit" title="Direct link to Configuration Instructions: spark-submit"></a></h3><p>When running jobs using spark-submit, the agent needs to be configured in the config file.</p><div class="language-text codeBlockContainer_Ckt0 theme-code-block" style="--prism-color:#bfc7d5;--prism-background-color:#292d3e"><div class="codeBlockContent_biex"><pre tabindex="0" class="prism-code language-text codeBlock_bY9V thin-scrollbar"><code class="codeBlockLines_e6Vv"><span class="token-line" style="color:#bfc7d5"><span class="token plain">#Configuring DataHub spark agent jar</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">spark.jars.packages io.acryl:acryl-spark-lineage:0.2.17</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">spark.extraListeners datahub.spark.DatahubSparkListener</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">spark.datahub.rest.server http://localhost:8080</span><br></span></code></pre><div class="buttonGroup__atx"><button type="button" aria-label="Copy code to clipboard" title="Copy" class="clean-btn"><span class="copyButtonIcons_eSgA" aria-hidden="true"><svg viewBox="0 0 24 24" class="copyButtonIcon_y97N"><path fill="currentColor" d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"></path></svg><svg viewBox="0 0 24 24" class="copyButtonSuccessIcon_LjdS"><path fill="currentColor" d="M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z"></path></svg></span></button></div></div></div><h2 class="anchor anchorWithStickyNavbar_LWe7" id="spark-submit-command-line">spark-submit command 
line<a href="#spark-submit-command-line" class="hash-link" aria-label="Direct link to spark-submit command line" title="Direct link to spark-submit command line"></a></h2><div class="language-sh codeBlockContainer_Ckt0 theme-code-block" style="--prism-color:#bfc7d5;--prism-background-color:#292d3e"><div class="codeBlockContent_biex"><pre tabindex="0" class="prism-code language-sh codeBlock_bY9V thin-scrollbar"><code class="codeBlockLines_e6Vv"><span class="token-line" style="color:#bfc7d5"><span class="token plain">spark-submit --packages io.acryl:acryl-spark-lineage:0.2.17 --conf "spark.extraListeners=datahub.spark.DatahubSparkListener" my_spark_job_to_run.py</span><br></span></code></pre><div class="buttonGroup__atx"><button type="button" aria-label="Copy code to clipboard" title="Copy" class="clean-btn"><span class="copyButtonIcons_eSgA" aria-hidden="true"><svg viewBox="0 0 24 24" class="copyButtonIcon_y97N"><path fill="currentColor" d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"></path></svg><svg viewBox="0 0 24 24" class="copyButtonSuccessIcon_LjdS"><path fill="currentColor" d="M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z"></path></svg></span></button></div></div></div><h3 class="anchor anchorWithStickyNavbar_LWe7" id="configuration-instructions-amazon-emr">Configuration Instructions: Amazon EMR<a href="#configuration-instructions-amazon-emr" class="hash-link" aria-label="Direct link to Configuration Instructions: Amazon EMR" title="Direct link to Configuration Instructions: Amazon EMR"></a></h3><p>Set the following spark-defaults configuration properties as it
|
|||
|
stated <a href="https://docs.aws.amazon.com/emr/latest/ReleaseGuide/emr-spark-configure.html" target="_blank" rel="noopener noreferrer">here</a></p><div class="language-text codeBlockContainer_Ckt0 theme-code-block" style="--prism-color:#bfc7d5;--prism-background-color:#292d3e"><div class="codeBlockContent_biex"><pre tabindex="0" class="prism-code language-text codeBlock_bY9V thin-scrollbar"><code class="codeBlockLines_e6Vv"><span class="token-line" style="color:#bfc7d5"><span class="token plain">spark.jars.packages io.acryl:acryl-spark-lineage:0.2.17</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">spark.extraListeners datahub.spark.DatahubSparkListener</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">spark.datahub.rest.server https://your_datahub_host/gms</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">#If you have authentication set up then you also need to specify the Datahub access token</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">spark.datahub.rest.token yourtoken</span><br></span></code></pre><div class="buttonGroup__atx"><button type="button" aria-label="Copy code to clipboard" title="Copy" class="clean-btn"><span class="copyButtonIcons_eSgA" aria-hidden="true"><svg viewBox="0 0 24 24" class="copyButtonIcon_y97N"><path fill="currentColor" d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"></path></svg><svg viewBox="0 0 24 24" class="copyButtonSuccessIcon_LjdS"><path fill="currentColor" d="M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z"></path></svg></span></button></div></div></div><h3 class="anchor anchorWithStickyNavbar_LWe7" id="configuration-instructions-notebooks">Configuration Instructions: Notebooks<a href="#configuration-instructions-notebooks" class="hash-link" aria-label="Direct link to 
Configuration Instructions: Notebooks" title="Direct link to Configuration Instructions: Notebooks"></a></h3><p>When running interactive jobs from a notebook, the listener can be configured while building the Spark Session.</p><div class="language-python codeBlockContainer_Ckt0 theme-code-block" style="--prism-color:#bfc7d5;--prism-background-color:#292d3e"><div class="codeBlockContent_biex"><pre tabindex="0" class="prism-code language-python codeBlock_bY9V thin-scrollbar"><code class="codeBlockLines_e6Vv"><span class="token-line" style="color:#bfc7d5"><span class="token plain">spark </span><span class="token operator" style="color:rgb(137, 221, 255)">=</span><span class="token plain"> SparkSession</span><span class="token punctuation" style="color:rgb(199, 146, 234)">.</span><span class="token plain">builder</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain"></span><span class="token punctuation" style="color:rgb(199, 146, 234)">.</span><span class="token plain">master</span><span class="token punctuation" style="color:rgb(199, 146, 234)">(</span><span class="token string" style="color:rgb(195, 232, 141)">"spark://spark-master:7077"</span><span class="token punctuation" style="color:rgb(199, 146, 234)">)</span><span class="token plain"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain"></span><span class="token punctuation" style="color:rgb(199, 146, 234)">.</span><span class="token plain">appName</span><span class="token punctuation" style="color:rgb(199, 146, 234)">(</span><span class="token string" style="color:rgb(195, 232, 141)">"test-application"</span><span class="token punctuation" style="color:rgb(199, 146, 234)">)</span><span class="token plain"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain"></span><span class="token punctuation" style="color:rgb(199, 146, 234)">.</span><span class="token plain">config</span><span class="token pu
|
|||
|
Cluster <a href="https://docs.databricks.com/clusters/configure.html#spark-configuration" target="_blank" rel="noopener noreferrer">Spark configuration</a>
|
|||
|
and <a href="https://docs.databricks.com/clusters/configure.html#init-scripts" target="_blank" rel="noopener noreferrer">Init script</a>.</p><p><a href="https://docs.databricks.com/security/secrets/secrets.html" target="_blank" rel="noopener noreferrer">Databricks Secrets</a> can be leveraged to store sensitive
|
|||
|
information like tokens.</p><ul><li><p>Download <code>datahub-spark-lineage</code> jar
|
|||
|
from <a href="https://s01.oss.sonatype.org/content/groups/public/io/acryl/acryl-spark-lineage/" target="_blank" rel="noopener noreferrer">the Maven central repository</a>.</p></li><li><p>Create <code>init.sh</code> with below content</p><div class="language-sh codeBlockContainer_Ckt0 theme-code-block" style="--prism-color:#bfc7d5;--prism-background-color:#292d3e"><div class="codeBlockContent_biex"><pre tabindex="0" class="prism-code language-sh codeBlock_bY9V thin-scrollbar"><code class="codeBlockLines_e6Vv"><span class="token-line" style="color:#bfc7d5"><span class="token plain">#!/bin/bash</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">cp /dbfs/datahub/datahub-spark-lineage*.jar /databricks/jars</span><br></span></code></pre><div class="buttonGroup__atx"><button type="button" aria-label="Copy code to clipboard" title="Copy" class="clean-btn"><span class="copyButtonIcons_eSgA" aria-hidden="true"><svg viewBox="0 0 24 24" class="copyButtonIcon_y97N"><path fill="currentColor" d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"></path></svg><svg viewBox="0 0 24 24" class="copyButtonSuccessIcon_LjdS"><path fill="currentColor" d="M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z"></path></svg></span></button></div></div></div></li><li><p>Install and configure <a href="https://docs.databricks.com/dev-tools/cli/index.html" target="_blank" rel="noopener noreferrer">Databricks CLI</a>.</p></li><li><p>Copy jar and init script to Databricks File System(DBFS) using Databricks CLI.</p><div class="language-sh codeBlockContainer_Ckt0 theme-code-block" style="--prism-color:#bfc7d5;--prism-background-color:#292d3e"><div class="codeBlockContent_biex"><pre tabindex="0" class="prism-code language-sh codeBlock_bY9V thin-scrollbar"><code class="codeBlockLines_e6Vv"><span class="token-line" style="color:#bfc7d5"><span class="token plain">databricks fs mkdirs 
dbfs:/datahub</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">databricks fs cp --overwrite datahub-spark-lineage*.jar dbfs:/datahub</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">databricks fs cp --overwrite init.sh dbfs:/datahub</span><br></span></code></pre><div class="buttonGroup__atx"><button type="button" aria-label="Copy code to clipboard" title="Copy" class="clean-btn"><span class="copyButtonIcons_eSgA" aria-hidden="true"><svg viewBox="0 0 24 24" class="copyButtonIcon_y97N"><path fill="currentColor" d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"></path></svg><svg viewBox="0 0 24 24" class="copyButtonSuccessIcon_LjdS"><path fill="currentColor" d="M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z"></path></svg></span></button></div></div></div></li><li><p>Open Databricks Cluster configuration page. Click the <strong>Advanced Options</strong> toggle. Click the <strong>Spark</strong> tab. Add below
|
|||
|
configurations under <code>Spark Config</code>.</p><div class="language-text codeBlockContainer_Ckt0 theme-code-block" style="--prism-color:#bfc7d5;--prism-background-color:#292d3e"><div class="codeBlockContent_biex"><pre tabindex="0" class="prism-code language-text codeBlock_bY9V thin-scrollbar"><code class="codeBlockLines_e6Vv"><span class="token-line" style="color:#bfc7d5"><span class="token plain">spark.extraListeners datahub.spark.DatahubSparkListener</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">spark.datahub.rest.server http://localhost:8080</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">spark.datahub.stage_metadata_coalescing true</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">spark.datahub.databricks.cluster cluster-name<any preferred cluster identifier></span><br></span></code></pre><div class="buttonGroup__atx"><button type="button" aria-label="Copy code to clipboard" title="Copy" class="clean-btn"><span class="copyButtonIcons_eSgA" aria-hidden="true"><svg viewBox="0 0 24 24" class="copyButtonIcon_y97N"><path fill="currentColor" d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"></path></svg><svg viewBox="0 0 24 24" class="copyButtonSuccessIcon_LjdS"><path fill="currentColor" d="M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z"></path></svg></span></button></div></div></div></li><li><p>Click the <strong>Init Scripts</strong> tab. 
Set cluster init script as <code>dbfs:/datahub/init.sh</code>.</p></li><li><p>Configuring DataHub authentication token</p><ul><li><p>Add below config in cluster spark config.</p><div class="language-text codeBlockContainer_Ckt0 theme-code-block" style="--prism-color:#bfc7d5;--prism-background-color:#292d3e"><div class="codeBlockContent_biex"><pre tabindex="0" class="prism-code language-text codeBlock_bY9V thin-scrollbar"><code class="codeBlockLines_e6Vv"><span class="token-line" style="color:#bfc7d5"><span class="token plain">spark.datahub.rest.token <token></span><br></span></code></pre><div class="buttonGroup__atx"><button type="button" aria-label="Copy code to clipboard" title="Copy" class="clean-btn"><span class="copyButtonIcons_eSgA" aria-hidden="true"><svg viewBox="0 0 24 24" class="copyButtonIcon_y97N"><path fill="currentColor" d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"></path></svg><svg viewBox="0 0 24 24" class="copyButtonSuccessIcon_LjdS"><path fill="currentColor" d="M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z"></path></svg></span></button></div></div></div></li><li><p>Alternatively, Databricks secrets can be used to secure token.</p><ul><li><p>Create secret using Databricks CLI.</p><div class="language-sh codeBlockContainer_Ckt0 theme-code-block" style="--prism-color:#bfc7d5;--prism-background-color:#292d3e"><div class="codeBlockContent_biex"><pre tabindex="0" class="prism-code language-sh codeBlock_bY9V thin-scrollbar"><code class="codeBlockLines_e6Vv"><span class="token-line" style="color:#bfc7d5"><span class="token plain">databricks secrets create-scope --scope datahub --initial-manage-principal users</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">databricks secrets put --scope datahub --key rest-token</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">databricks secrets list 
--scope datahub &lt;&lt;Edit prompted file with token value&gt;&gt;</span><br></span></code></pre><div class="buttonGroup__atx"><button type="button" aria-label="Copy code to clipboard" title="Copy" class="clean-btn"><span class="copyButtonIcons_eSgA" aria-hidden="true"><svg viewBox="0 0 24 24" class="copyButtonIcon_y97N"><path fill="currentColor" d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2
|
|||
|
datasets. To link these 2 things, urns generated by both have to match.
|
|||
|
This section will help you match URNs to those of other ingestion sources.
|
|||
|
By default, URNs are created using
|
|||
|
template <code>urn:li:dataset:(urn:li:dataPlatform:&lt;$platform&gt;,&lt;platformInstance&gt;.&lt;name&gt;,&lt;env&gt;)</code>. We can configure these 4
|
|||
|
things to generate the desired urn.</p><p><strong>Platform</strong>:
|
|||
|
HDFS-based platforms supported explicitly:</p><ul><li>AWS S3 (s3)</li><li>Google Cloud Storage (gcs)</li><li>Local file system (local)
|
|||
|
All other platforms will have "hdfs" as a platform.</li></ul><p><strong>Name</strong>:
|
|||
|
By default, the name is the complete path. For HDFS-based datasets, tables can be at different levels in the path than
|
|||
|
that of the actual file read, for reasons such as partitioning and sharding. 'path_spec' is used to alter the
|
|||
|
name.
|
|||
|
The {table} marker is used to specify the table level. Below are a few examples. One can specify multiple path_specs for
|
|||
|
different paths specified in the <code>path_spec_list</code>. Each actual path is matched against all path_specs present in the
|
|||
|
list. First, one to match will be used to generate urn.</p><p><strong>path_spec Examples</strong></p><div class="codeBlockContainer_Ckt0 theme-code-block" style="--prism-color:#bfc7d5;--prism-background-color:#292d3e"><div class="codeBlockContent_biex"><pre tabindex="0" class="prism-code language-text codeBlock_bY9V thin-scrollbar"><code class="codeBlockLines_e6Vv"><span class="token-line" style="color:#bfc7d5"><span class="token plain">spark.datahub.platform.s3.path_spec_list=s3://my-bucket/foo/{table}/year=*/month=*/day=*/*,s3://my-other-bucket/foo/{table}/year=*/month=*/day=*/*"</span><br></span></code></pre><div class="buttonGroup__atx"><button type="button" aria-label="Copy code to clipboard" title="Copy" class="clean-btn"><span class="copyButtonIcons_eSgA" aria-hidden="true"><svg viewBox="0 0 24 24" class="copyButtonIcon_y97N"><path fill="currentColor" d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"></path></svg><svg viewBox="0 0 24 24" class="copyButtonSuccessIcon_LjdS"><path fill="currentColor" d="M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z"></path></svg></span></button></div></div></div><table><thead><tr><th>Absolute path</th><th>path_spec</th><th>Urn</th></tr></thead><tbody><tr><td>s3://my-bucket/foo/tests/bar.avro</td><td>Not provided</td><td>urn:li:dataset:(urn:li:dataPlatform:s3,my-bucket/foo/tests/bar.avro,PROD)</td></tr><tr><td>s3://my-bucket/foo/tests/bar.avro</td><td>s3://my-bucket/foo/{table}/<!-- 
-->*</td><td>urn:li:dataset:(urn:li:dataPlatform:s3,my-bucket/foo/tests,PROD)</td></tr><tr><td>s3://my-bucket/foo/tests/bar.avro</td><td>s3://my-bucket/foo/tests/{table}</td><td>urn:li:dataset:(urn:li:dataPlatform:s3,my-bucket/foo/tests/bar.avro,PROD)</td></tr><tr><td>gs://my-bucket/foo/tests/bar.avro</td><td>gs://my-bucket/{table}/<em>/</em></td><td>urn:li:dataset:(urn:li:dataPlatform:gcs,my-bucket/foo,PROD)</td></tr><tr><td>gs://my-bucket/foo/tests/bar.avro</td><td>gs://my-bucket/{table}</td><td>urn:li:dataset:(urn:li:dataPlatform:gcs,my-bucket/foo,PROD)</td></tr><tr><td>file:///my-bucket/foo/tests/bar.avro</td><td>file:///my-bucket/<em>/</em>/{table}</td><td>urn:li:dataset:(urn:li:dataPlatform:local,my-bucket/foo/tests/bar.avro,PROD)</td></tr></tbody></table><p><strong>platform instance and env:</strong></p><p>The default value for env is 'PROD' and the platform instance is None. env and platform instances can be set for all
|
|||
|
datasets using configurations 'spark.datahub.metadata.dataset.env' and 'spark.datahub.metadata.dataset.platformInstance'.
|
|||
|
If spark is processing data that belongs to a different env or platform instance, then 'path_alias' can be used to
|
|||
|
specify <code>path_spec</code> specific values of these. 'path_alias' groups the 'path_spec_list', its env, and platform instance
|
|||
|
together.</p><p>path_alias_list Example:</p><p>The below example explains the configuration of the case, where files from 2 buckets are being processed in a single
|
|||
|
spark application and files from my-bucket are supposed to have "instance-1" as platform instance and "PROD" as env, and
|
|||
|
files from bucket2 should have env "DEV" in their dataset URNs.</p><div class="codeBlockContainer_Ckt0 theme-code-block" style="--prism-color:#bfc7d5;--prism-background-color:#292d3e"><div class="codeBlockContent_biex"><pre tabindex="0" class="prism-code language-text codeBlock_bY9V thin-scrollbar"><code class="codeBlockLines_e6Vv"><span class="token-line" style="color:#bfc7d5"><span class="token plain">spark.datahub.platform.s3.path_alias_list : path1,path2</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">spark.datahub.platform.s3.path1.env : PROD</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">spark.datahub.platform.s3.path1.path_spec_list: s3://my-bucket/*/*/{table}</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">spark.datahub.platform.s3.path1.platform_instance : instance-1</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">spark.datahub.platform.s3.path2.env: DEV</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">spark.datahub.platform.s3.path2.path_spec_list: s3://bucket2/*/{table}</span><br></span></code></pre><div class="buttonGroup__atx"><button type="button" aria-label="Copy code to clipboard" title="Copy" class="clean-btn"><span class="copyButtonIcons_eSgA" aria-hidden="true"><svg viewBox="0 0 24 24" class="copyButtonIcon_y97N"><path fill="currentColor" d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"></path></svg><svg viewBox="0 0 24 24" class="copyButtonSuccessIcon_LjdS"><path fill="currentColor" d="M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z"></path></svg></span></button></div></div></div><h3 class="anchor anchorWithStickyNavbar_LWe7" id="important-notes-on-usage">Important notes on usage<a href="#important-notes-on-usage" class="hash-link" aria-label="Direct link 
to Important notes on usage" title="Direct link to Important notes on usage"></a></h3><ul><li>It is advisable to ensure appName is used appropriately to ensure you can trace lineage from a pipeline back to your
|
|||
|
source code.</li><li>If multiple apps with the same appName run concurrently, dataset-lineage will be captured correctly but the
|
|||
|
custom-properties e.g. app-id, SQLQueryId would be unreliable. We expect this to be quite rare.</li><li>If spark execution fails, then an empty pipeline would still get created, but it may not have any tasks.</li><li>For HDFS sources, the folder (name) is regarded as the dataset (name) to align with typical storage of parquet/csv
|
|||
|
formats.</li></ul><h3 class="anchor anchorWithStickyNavbar_LWe7" id="debugging">Debugging<a href="#debugging" class="hash-link" aria-label="Direct link to Debugging" title="Direct link to Debugging"></a></h3><ul><li>Following info logs are generated</li></ul><p>On Spark context startup</p><div class="language-text codeBlockContainer_Ckt0 theme-code-block" style="--prism-color:#bfc7d5;--prism-background-color:#292d3e"><div class="codeBlockContent_biex"><pre tabindex="0" class="prism-code language-text codeBlock_bY9V thin-scrollbar"><code class="codeBlockLines_e6Vv"><span class="token-line" style="color:#bfc7d5"><span class="token plain">YY/MM/DD HH:mm:ss INFO DatahubSparkListener: DatahubSparkListener initialised.</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">YY/MM/DD HH:mm:ss INFO SparkContext: Registered listener datahub.spark.DatahubSparkListener</span><br></span></code></pre><div class="buttonGroup__atx"><button type="button" aria-label="Copy code to clipboard" title="Copy" class="clean-btn"><span class="copyButtonIcons_eSgA" aria-hidden="true"><svg viewBox="0 0 24 24" class="copyButtonIcon_y97N"><path fill="currentColor" d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"></path></svg><svg viewBox="0 0 24 24" class="copyButtonSuccessIcon_LjdS"><path fill="currentColor" d="M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z"></path></svg></span></button></div></div></div><p>On application start</p><div class="language-text codeBlockContainer_Ckt0 theme-code-block" style="--prism-color:#bfc7d5;--prism-background-color:#292d3e"><div class="codeBlockContent_biex"><pre tabindex="0" class="prism-code language-text codeBlock_bY9V thin-scrollbar"><code class="codeBlockLines_e6Vv"><span class="token-line" style="color:#bfc7d5"><span class="token plain">YY/MM/DD HH:mm:ss INFO DatahubSparkListener: Application started: 
SparkListenerApplicationStart(AppName,Some(local-1644489736794),1644489735772,user,None,None)</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">YY/MM/DD HH:mm:ss INFO McpEmitter: REST Emitter Configuration: GMS url <rest.server></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">YY/MM/DD HH:mm:ss INFO McpEmitter: REST Emitter Configuration: Token XXXXX</span><br></span></code></pre><div class="buttonGroup__atx"><button type="button" aria-label="Copy code to clipboard" title="Copy" class="clean-btn"><span class="copyButtonIcons_eSgA" aria-hidden="true"><svg viewBox="0 0 24 24" class="copyButtonIcon_y97N"><path fill="currentColor" d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"></path></svg><svg viewBox="0 0 24 24" class="copyButtonSuccessIcon_LjdS"><path fill="currentColor" d="M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z"></path></svg></span></button></div></div></div><p>On pushing data to server</p><div class="language-text codeBlockContainer_Ckt0 theme-code-block" style="--prism-color:#bfc7d5;--prism-background-color:#292d3e"><div class="codeBlockContent_biex"><pre tabindex="0" class="prism-code language-text codeBlock_bY9V thin-scrollbar"><code class="codeBlockLines_e6Vv"><span class="token-line" style="color:#bfc7d5"><span class="token plain">YY/MM/DD HH:mm:ss INFO McpEmitter: MetadataWriteResponse(success=true, responseContent={"value":"<URN>"}, underlyingResponse=HTTP/1.1 200 OK [Date: day, DD month year HH:mm:ss GMT, Content-Type: application/json, X-RestLi-Protocol-Version: 2.0.0, Content-Length: 97, Server: Jetty(9.4.46.v20220331)] [Content-Length: 97,Chunked: false])</span><br></span></code></pre><div class="buttonGroup__atx"><button type="button" aria-label="Copy code to clipboard" title="Copy" class="clean-btn"><span class="copyButtonIcons_eSgA" aria-hidden="true"><svg viewBox="0 0 24 24" 
class="copyButtonIcon_y97N"><path fill="currentColor" d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V2
|
|||
|
You can remove the old lineages by setting <code>spark.datahub.legacyLineageCleanup.enabled=true</code>. Make sure you have the latest server if you enable with patch support. (this was introduced since 0.2.17-rc5)</li></ul></li><li><p><em>Changes</em>:</p><ul><li>OpenLineage 1.25.0 upgrade</li><li>Add option to disable chunked encoding in the datahub rest sink -> <code>spark.datahub.rest.disable_chunked_encoding</code></li><li>Add option to specify the mcp kafka topic for the datahub kafka sink -> <code>spark.datahub.kafka.mcp_topic</code></li><li>Add option to remove legacy lineages from older Spark Plugin runs. This will remove those lineages from the Datasets which it adds to DataJob -> <code>spark.datahub.legacyLineageCleanup.enabled</code></li></ul></li><li><p><em>Fixes</em>:</p><ul><li>Fix handling map transformation in the lineage. Earlier it generated wrong lineage for map transformation.</li></ul></li></ul><h3 class="anchor anchorWithStickyNavbar_LWe7" id="version-0216">Version 0.2.16<a href="#version-0216" class="hash-link" aria-label="Direct link to Version 0.2.16" title="Direct link to Version 0.2.16"></a></h3><ul><li>Remove logging DataHub config into logs</li></ul><h3 class="anchor anchorWithStickyNavbar_LWe7" id="version-0215">Version 0.2.15<a href="#version-0215" class="hash-link" aria-label="Direct link to Version 0.2.15" title="Direct link to Version 0.2.15"></a></h3><ul><li>Add Kafka emitter to emit lineage to kafka</li><li>Add File emitter to emit lineage to file</li><li>Add S3 emitter to save mcps to s3</li><li>Upgrading OpenLineage to 1.19.0</li><li>Renaming project to acryl-datahub-spark-lineage</li><li>Supporting OpenLineage 1.17+ glue identifier changes</li><li>Fix handling OpenLineage input/output where wasn't any facet attached</li></ul><h3 class="anchor anchorWithStickyNavbar_LWe7" id="version-0214">Version 0.2.14<a href="#version-0214" class="hash-link" aria-label="Direct link to Version 0.2.14" title="Direct link to Version 
0.2.14"></a></h3><ul><li>Fix warning about MeterFilter warning from Micrometer</li></ul><h3 class="anchor anchorWithStickyNavbar_LWe7" id="version-0213">Version 0.2.13<a href="#version-0213" class="hash-link" aria-label="Direct link to Version 0.2.13" title="Direct link to Version 0.2.13"></a></h3><ul><li>Add kafka emitter to emit lineage to kafka</li></ul><h3 class="anchor anchorWithStickyNavbar_LWe7" id="version-0212">Version 0.2.12<a href="#version-0212" class="hash-link" aria-label="Direct link to Version 0.2.12" title="Direct link to Version 0.2.12"></a></h3><ul><li>Silencing some chatty warnings in RddPathUtils</li></ul><h3 class="anchor anchorWithStickyNavbar_LWe7" id="version-0211">Version 0.2.11<a href="#version-0211" class="hash-link" aria-label="Direct link to Version 0.2.11" title="Direct link to Version 0.2.11"></a></h3><ul><li>Add option to lowercase dataset URNs</li><li>Add option to set platform instance and/or env per platform with <code>spark.datahub.platform.<platform_name>.env</code> and <code>spark.datahub.platform.<platform_name>.platform_instance</code> config parameter</li><li>Fixing platform instance setting for datasets when <code>spark.datahub.metadata.dataset.platformInstance</code> is set</li><li>Fixing column level lineage support when patch is enabled</li></ul></div><footer class="theme-doc-footer docusaurus-mt-lg"><div class="slackUtm_uoBr"><div class="slackUtm_uoBr"><hr>Need more help? 
Join the conversation in <a href="https://datahub.com/slack?utm_source=docs&utm_medium=footer&utm_campaign=docs_footer&utm_content=metadata-integration/java/acryl-spark-lineage/README">Slack!</a></div></div><div class="theme-doc-footer-edit-meta-row row"><div class="col"><a href="https://github.com/datahub-project/datahub/blob/master/metadata-integration/java/acryl-spark-lineage/README.md" target="_blank" rel="noreferrer noopener" class="theme-edit-this-page"><svg fill="currentColor" height="20" width="20" viewBox="0 0 40 40" class="iconEdit_Z9Sw" aria-hidden="true"><g><path d="m34.5 11.7l-3 3.1-6.3-6
|
|||
|
<script src="/assets/js/runtime~main.310f59c4.js"></script>
|
|||
|
<script src="/assets/js/main.49198d73.js"></script>
|
|||
|
</body>
|
|||
|
</html>
|