datahub/docs/pyspark/index.html

98 lines
75 KiB
HTML
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

<!doctype html>
<html lang="en" dir="ltr" class="docs-wrapper docs-doc-page docs-version-current plugin-docs plugin-id-default docs-doc-id-docs/PYSPARK" data-has-hydrated="false">
<head>
<meta charset="UTF-8">
<meta name="generator" content="Docusaurus v2.4.3">
<title data-rh="true">Optional PySpark Support for S3 Source | DataHub</title><meta data-rh="true" name="viewport" content="width=device-width,initial-scale=1"><meta data-rh="true" name="twitter:card" content="summary_large_image"><meta data-rh="true" property="og:url" content="https://docs.datahub.com/docs/pyspark"><meta data-rh="true" name="docusaurus_locale" content="en"><meta data-rh="true" name="docsearch:language" content="en"><meta data-rh="true" name="docusaurus_version" content="current"><meta data-rh="true" name="docusaurus_tag" content="docs-default-current"><meta data-rh="true" name="docsearch:version" content="current"><meta data-rh="true" name="docsearch:docusaurus_tag" content="docs-default-current"><meta data-rh="true" property="og:title" content="Optional PySpark Support for S3 Source | DataHub"><meta data-rh="true" name="description" content="DataHub&#x27;s S3 source now supports optional PySpark installation through the s3-slim variant. This allows users to choose a lightweight installation when data lake profiling is not needed."><meta data-rh="true" property="og:description" content="DataHub&#x27;s S3 source now supports optional PySpark installation through the s3-slim variant. This allows users to choose a lightweight installation when data lake profiling is not needed."><link data-rh="true" rel="icon" href="/img/favicon.ico"><link data-rh="true" rel="canonical" href="https://docs.datahub.com/docs/pyspark"><link data-rh="true" rel="alternate" href="https://docs.datahub.com/docs/pyspark" hreflang="en"><link data-rh="true" rel="alternate" href="https://docs.datahub.com/docs/pyspark" hreflang="x-default"><link data-rh="true" rel="preconnect" href="https://RK0UG797F3-dsn.algolia.net" crossorigin="anonymous"><link rel="alternate" type="application/rss+xml" href="/learn/rss.xml" title="DataHub RSS Feed">
<link rel="alternate" type="application/atom+xml" href="/learn/atom.xml" title="DataHub Atom Feed">
<link rel="preconnect" href="https://www.google-analytics.com">
<link rel="preconnect" href="https://www.googletagmanager.com">
<!-- gtag.js: fetch the library asynchronously, then queue the initial "js" and "config" commands on window.dataLayer. -->
<script async src="https://www.googletagmanager.com/gtag/js?id=G-PKGVLETT4C"></script>
<script>function gtag(){dataLayer.push(arguments)}window.dataLayer=window.dataLayer||[],gtag("js",new Date),gtag("config","G-PKGVLETT4C",{})</script>
<!-- gtm.js loader: pushes a gtm.start event onto window.dataLayer and inserts an async script element for the GTM-5M8T9HNN container before the first script element in the document. -->
<script>window.dataLayer=window.dataLayer||[]</script>
<script>!function(e,t,a,n,g){e[n]=e[n]||[],e[n].push({"gtm.start":(new Date).getTime(),event:"gtm.js"});var m=t.getElementsByTagName(a)[0],r=t.createElement(a);r.async=!0,r.src="https://www.googletagmanager.com/gtm.js?id=GTM-5M8T9HNN",m.parentNode.insertBefore(r,m)}(window,document,"script","dataLayer")</script>
<link rel="search" type="application/opensearchdescription+xml" title="DataHub" href="/opensearch.xml">
<!-- NOTE(review): browsers ignore the frame-ancestors directive when a policy is delivered via a meta element; to actually restrict framing, also send this policy as a Content-Security-Policy HTTP response header. -->
<meta http-equiv="Content-Security-Policy" content="frame-ancestors &#39;self&#39; https://*.acryl.io https://acryldata.io http://localhost:*">
<link rel="stylesheet" href="https://fonts.googleapis.com/css2?family=Manrope:wght@400;500;700&display=swap">
<!-- Fetched without blocking the parser: async where supported, with defer as the fallback for browsers that only implement defer. -->
<script src="https://tools.luckyorange.com/core/lo.js?site-id=28ea8a38" async defer></script>
<script src="/scripts/rb2b.js" async defer></script>
<script src="https://app.revenuehero.io/scheduler.min.js"></script>
<script src="https://tag.clearbitscripts.com/v1/pk_2e321cabe30432a5c44c0424781aa35f/tags.js" referrerpolicy="strict-origin-when-cross-origin"></script>
<script src="/scripts/reo.js"></script>
<script id="runllm-widget-script" type="module" src="https://widget.runllm.com" crossorigin="true" runllm-name="DataHub" runllm-assistant-id="81" runllm-position="BOTTOM_RIGHT" runllm-keyboard-shortcut="Mod+j" runllm-preset="docusaurus" runllm-theme-color="#1890FF" runllm-brand-logo="https://docs.datahub.com/img/datahub-logo-color-mark.svg" runllm-community-url="https://datahub.com/slack" runllm-community-type="slack" runllm-disable-ask-a-person="true" async></script><link rel="stylesheet" href="/assets/css/styles.1e47f27f.css">
<link rel="preload" href="/assets/js/runtime~main.a43ac34d.js" as="script">
<link rel="preload" href="/assets/js/main.86f3a4df.js" as="script">
</head>
<body class="navigation-with-keyboard">
<noscript><iframe src="https://www.googletagmanager.com/ns.html?id=GTM-5M8T9HNN" height="0" width="0" style="display:none;visibility:hidden"></iframe></noscript>
<script>!function(){function t(t){document.documentElement.setAttribute("data-theme",t)}var e=function(){var t=null;try{t=new URLSearchParams(window.location.search).get("docusaurus-theme")}catch(t){}return t}()||function(){var t=null;try{t=localStorage.getItem("theme")}catch(t){}return t}();t(null!==e?e:"light")}(),document.documentElement.setAttribute("data-announcement-bar-initially-dismissed",function(){try{return"true"===localStorage.getItem("docusaurus.announcement.dismiss")}catch(t){}return!1}())</script><div id="__docusaurus">
<div role="region" aria-label="Skip to main content"><a class="skipToContent_fXgn" href="#__docusaurus_skipToContent_fallback">Skip to main content</a></div><div class="announcementBar_mb4j" style="background-color:transparent;color:#ffffff" role="banner"><div class="content_knG7 announcementBarContent_xLdY"><div class="shimmer-banner"><p><strong>CONTEXT:</strong> The Future of Agentic AI is On Demand</p><a href="https://datahub.com/resources/context/?utm_term=docs" target="_blank" class="button"><div>Watch Now<span></span></div></a></div></div></div><nav aria-label="Main" class="navbar navbar--fixed-top"><div class="navbar__inner"><div class="navbar__items"><button aria-label="Toggle navigation bar" aria-expanded="false" class="navbar__toggle clean-btn" type="button"><svg width="30" height="30" viewBox="0 0 30 30" aria-hidden="true"><path stroke="currentColor" stroke-linecap="round" stroke-miterlimit="10" stroke-width="2" d="M4 7h22M4 15h22M4 23h22"></path></svg></button><a href="https://datahub.com" target="_blank" rel="noopener noreferrer" class="navbar__brand"><div class="navbar__logo"><img src="/img/datahub-logo-color-light-horizontal.svg" alt="DataHub Logo" class="themedImage_ToTc themedImage--light_HNdA"><img src="/img/datahub-logo-color-dark-horizontal.svg" alt="DataHub Logo" class="themedImage_ToTc themedImage--dark_i4oU"></div></a><div class="navbar__item dropdown dropdown--hoverable"><a href="#" aria-haspopup="true" aria-expanded="false" role="button" class="navbar__link versionNavItem_cbn8">Next</a><ul class="dropdown__menu"><li><a aria-current="page" class="dropdown__link dropdown__link--active" href="/docs/pyspark">Next</a></li><li><a class="dropdown__link" href="/docs/1.1.0/features">1.1.0</a></li><li><hr class="dropdown-separator" style="margin: 0.4rem;"></li><li><div class="dropdown__link"><b>Archived versions</b></div></li><li>
<a class="dropdown__link" href="https://docs-website-t9sv4w3gr-acryldata.vercel.app/docs/features">1.0.0
<svg width="12" height="12" aria-hidden="true" viewBox="0 0 24 24"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg>
</a>
</li><li>
<a class="dropdown__link" href="https://docs-website-t9sv4w3gr-acryldata.vercel.app/docs/0.15.0/features">0.15.0
<svg width="12" height="12" aria-hidden="true" viewBox="0 0 24 24"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg>
</a>
</li><li>
<a class="dropdown__link" href="https://docs-website-8jkm4uler-acryldata.vercel.app/docs/0.14.1/features">0.14.1
<svg width="12" height="12" aria-hidden="true" viewBox="0 0 24 24"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg>
</a>
</li><li>
<a class="dropdown__link" href="https://docs-website-eue2qafvn-acryldata.vercel.app/docs/features">0.14.0
<svg width="12" height="12" aria-hidden="true" viewBox="0 0 24 24"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg>
</a>
</li><li>
<a class="dropdown__link" href="https://docs-website-psat3nzgi-acryldata.vercel.app/docs/features">0.13.1
<svg width="12" height="12" aria-hidden="true" viewBox="0 0 24 24"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg>
</a>
</li><li>
<a class="dropdown__link" href="https://docs-website-lzxh86531-acryldata.vercel.app/docs/features">0.13.0
<svg width="12" height="12" aria-hidden="true" viewBox="0 0 24 24"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg>
</a>
</li><li>
<a class="dropdown__link" href="https://docs-website-2uuxmgza2-acryldata.vercel.app/docs/features">0.12.1
<svg width="12" height="12" aria-hidden="true" viewBox="0 0 24 24"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg>
</a>
</li><li>
<a class="dropdown__link" href="https://docs-website-irpoe2osc-acryldata.vercel.app/docs/features">0.11.0
<svg width="12" height="12" aria-hidden="true" viewBox="0 0 24 24"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg>
</a>
</li><li>
<a class="dropdown__link" href="https://docs-website-1gv2yzn9d-acryldata.vercel.app/docs/features">0.10.5
<svg width="12" height="12" aria-hidden="true" viewBox="0 0 24 24"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg>
</a>
</li></ul></div></div><div class="navbar__items navbar__items--right"><a aria-current="page" class="navbar__item navbar__link navbar__link--active" href="/docs">Docs</a><a class="navbar__item navbar__link" href="/integrations">Integrations</a><a href="https://datahub.com/slack?utm_source=docs&amp;utm_medium=header&amp;utm_campaign=docs_header" target="_blank" rel="noopener noreferrer" class="navbar__item navbar__link">
<!-- Slack icon link: dim the icon slightly on hover. -->
<style>
  .slack-logo:hover {
    opacity: 0.8;
  }
</style>
<img class="slack-logo" src="https://upload.wikimedia.org/wikipedia/commons/d/d5/Slack_icon_2019.svg" alt="Slack" height="20" style="margin: 10px 0 0 0;">
</a><a href="https://github.com/datahub-project/datahub" target="_blank" rel="noopener noreferrer" class="navbar__item navbar__link">
<!-- GitHub icon link: dim the icon slightly on hover. -->
<style>
  .github-logo:hover {
    opacity: 0.8;
  }
</style>
<img class="github-logo" src="https://upload.wikimedia.org/wikipedia/commons/9/91/Octicons-mark-github.svg" alt="GitHub" height="20" style="margin: 10px 0 0 0;">
</a><div class="searchBox_ZlJk"><button type="button" class="DocSearch DocSearch-Button" aria-label="Search"><span class="DocSearch-Button-Container"><svg width="20" height="20" class="DocSearch-Search-Icon" viewBox="0 0 20 20" aria-hidden="true"><path d="M14.386 14.386l4.0877 4.0877-4.0877-4.0877c-2.9418 2.9419-7.7115 2.9419-10.6533 0-2.9419-2.9418-2.9419-7.7115 0-10.6533 2.9418-2.9419 7.7115-2.9419 10.6533 0 2.9419 2.9418 2.9419 7.7115 0 10.6533z" stroke="currentColor" fill="none" fill-rule="evenodd" stroke-linecap="round" stroke-linejoin="round"></path></svg><span class="DocSearch-Button-Placeholder">Search</span></span><span class="DocSearch-Button-Keys"></span></button></div></div></div><div role="presentation" class="navbar-sidebar__backdrop"></div></nav><div id="__docusaurus_skipToContent_fallback" class="main-wrapper mainWrapper_z2l0 docsWrapper_BCFX"><button aria-label="Scroll back to top" class="clean-btn theme-back-to-top-button backToTopButton_sjWU" type="button"></button><div class="docPage__5DB"><main class="docMainContainer_gTbr docMainContainerEnhanced_Uz_u"><div class="container padding-top--md padding-bottom--lg"><div class="row"><div class="col docItemCol_VOVn"><div class="docItemContainer_Djhp"><article><span class="theme-doc-version-badge badge badge--secondary">Version: Next</span><div class="tocCollapsible_ETCw theme-doc-toc-mobile tocMobile_ITEo"><button type="button" class="clean-btn tocCollapsibleButton_TO0P">On this page</button></div><div class="theme-doc-markdown markdown"><h1>Optional PySpark Support for S3 Source</h1><p>DataHub&#x27;s S3 source now supports optional PySpark installation through the <code>s3-slim</code> variant. 
This allows users to choose a lightweight installation when data lake profiling is not needed.</p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="overview">Overview<a href="#overview" class="hash-link" aria-label="Direct link to Overview" title="Direct link to Overview"></a></h2><p>The S3 source includes PySpark by default for backward compatibility and profiling support. For users who only need metadata extraction without profiling, the <code>s3-slim</code> variant provides a ~500MB smaller installation.</p><p><strong>Current implementation status:</strong></p><ul><li><strong>S3</strong>: SparkProfiler pattern fully implemented (optional PySpark)</li><li><strong>ABS</strong>: Not yet implemented (still requires PySpark for profiling)</li><li><strong>Unity Catalog</strong>: Not affected by this change (uses separate profiling mechanisms)</li><li><strong>GCS</strong>: Does not support profiling</li></ul><blockquote><p><strong>Note:</strong> This change implements the SparkProfiler pattern for S3 only. The same pattern can be applied to other sources (ABS, etc.) in future PRs.</p></blockquote><h2 class="anchor anchorWithStickyNavbar_LWe7" id="pyspark-version">PySpark Version<a href="#pyspark-version" class="hash-link" aria-label="Direct link to PySpark Version" title="Direct link to PySpark Version"></a></h2><blockquote><p><strong>Current Version:</strong> PySpark 3.5.x (3.5.6)</p><p>PySpark 4.0 support is planned for a future release. 
Until then, all DataHub components use PySpark 3.5.x for compatibility and stability.</p></blockquote><h2 class="anchor anchorWithStickyNavbar_LWe7" id="installation-options">Installation Options<a href="#installation-options" class="hash-link" aria-label="Direct link to Installation Options" title="Direct link to Installation Options"></a></h2><h3 class="anchor anchorWithStickyNavbar_LWe7" id="standard-installation-includes-pyspark">Standard Installation (includes PySpark)<a href="#standard-installation-includes-pyspark" class="hash-link" aria-label="Direct link to Standard Installation (includes PySpark)" title="Direct link to Standard Installation (includes PySpark)"></a></h3><div class="language-bash codeBlockContainer_Ckt0 theme-code-block" style="--prism-color:#bfc7d5;--prism-background-color:#292d3e"><div class="codeBlockContent_biex"><pre tabindex="0" class="prism-code language-bash codeBlock_bY9V thin-scrollbar"><code class="codeBlockLines_e6Vv"><span class="token-line" style="color:#bfc7d5"><span class="token plain">pip </span><span class="token function" style="color:rgb(130, 170, 255)">install</span><span class="token plain"> </span><span class="token string" style="color:rgb(195, 232, 141)">&#x27;acryl-datahub[s3]&#x27;</span><span class="token plain"> </span><span class="token comment" style="color:rgb(105, 112, 152);font-style:italic"># S3 with PySpark/profiling support</span><br></span></code></pre><div class="buttonGroup__atx"><button type="button" aria-label="Copy code to clipboard" title="Copy" class="clean-btn"><span class="copyButtonIcons_eSgA" aria-hidden="true"><svg viewBox="0 0 24 24" class="copyButtonIcon_y97N"><path fill="currentColor" d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"></path></svg><svg viewBox="0 0 24 24" class="copyButtonSuccessIcon_LjdS"><path fill="currentColor" 
d="M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z"></path></svg></span></button></div></div></div><h3 class="anchor anchorWithStickyNavbar_LWe7" id="lightweight-installation-without-pyspark">Lightweight Installation (without PySpark)<a href="#lightweight-installation-without-pyspark" class="hash-link" aria-label="Direct link to Lightweight Installation (without PySpark)" title="Direct link to Lightweight Installation (without PySpark)"></a></h3><p>For installations where you don&#x27;t need profiling capabilities and want to save ~500MB:</p><div class="language-bash codeBlockContainer_Ckt0 theme-code-block" style="--prism-color:#bfc7d5;--prism-background-color:#292d3e"><div class="codeBlockContent_biex"><pre tabindex="0" class="prism-code language-bash codeBlock_bY9V thin-scrollbar"><code class="codeBlockLines_e6Vv"><span class="token-line" style="color:#bfc7d5"><span class="token plain">pip </span><span class="token function" style="color:rgb(130, 170, 255)">install</span><span class="token plain"> </span><span class="token string" style="color:rgb(195, 232, 141)">&#x27;acryl-datahub[s3-slim]&#x27;</span><span class="token plain"> </span><span class="token comment" style="color:rgb(105, 112, 152);font-style:italic"># S3 without profiling (~500MB smaller)</span><br></span></code></pre><div class="buttonGroup__atx"><button type="button" aria-label="Copy code to clipboard" title="Copy" class="clean-btn"><span class="copyButtonIcons_eSgA" aria-hidden="true"><svg viewBox="0 0 24 24" class="copyButtonIcon_y97N"><path fill="currentColor" d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"></path></svg><svg viewBox="0 0 24 24" class="copyButtonSuccessIcon_LjdS"><path fill="currentColor" d="M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z"></path></svg></span></button></div></div></div><p><strong>Recommendation:</strong> Use <code>s3-slim</code> when profiling is not 
needed.</p><p>The <code>data-lake-profiling</code> dependencies (included in standard <code>s3</code> by default):</p><ul><li><code>pyspark~=3.5.6</code></li><li><code>pydeequ&gt;=1.1.0</code></li><li>Profiling dependencies (cachetools)</li></ul><blockquote><p><strong>Note:</strong> In a future major release (e.g., DataHub 2.0), the <code>s3-slim</code> variant may become the default, and PySpark will be truly optional. This current approach provides backward compatibility while giving users time to adapt.</p></blockquote><h3 class="anchor anchorWithStickyNavbar_LWe7" id="whats-included">What&#x27;s Included<a href="#whats-included" class="hash-link" aria-label="Direct link to What&#x27;s Included" title="Direct link to What&#x27;s Included"></a></h3><p><strong>S3 source:</strong></p><p>Standard <code>s3</code> extra:</p><ul><li>✅ Metadata extraction (schemas, tables, file listing)</li><li>✅ Data format detection (Parquet, Avro, CSV, JSON, etc.)</li><li>✅ Schema inference from files</li><li>✅ Table and column-level metadata</li><li>✅ Tags and properties extraction</li><li>✅ Data profiling (min/max, nulls, distinct counts)</li><li>✅ Data quality checks (PyDeequ-based)</li><li>Includes: PySpark 3.5.6 + PyDeequ</li></ul><p><code>s3-slim</code> variant:</p><ul><li>✅ All metadata features (same as above)</li><li>❌ Data profiling disabled</li><li>No PySpark dependencies (~500MB smaller)</li></ul><h2 class="anchor anchorWithStickyNavbar_LWe7" id="feature-comparison">Feature Comparison<a href="#feature-comparison" class="hash-link" aria-label="Direct link to Feature Comparison" title="Direct link to Feature Comparison"></a></h2><table><thead><tr><th>Feature</th><th><code>s3-slim</code></th><th>Standard <code>s3</code></th></tr></thead><tbody><tr><td><strong>Metadata extraction</strong></td><td>✅ Full support</td><td>✅ Full support</td></tr><tr><td><strong>Schema inference</strong></td><td>✅ Full support</td><td>✅ Full support</td></tr><tr><td><strong>Tags &amp; 
properties</strong></td><td>✅ Full support</td><td>✅ Full support</td></tr><tr><td><strong>Data profiling</strong></td><td>❌ Not available</td><td>✅ Full profiling</td></tr><tr><td><strong>Installation size</strong></td><td>~200MB</td><td>~700MB</td></tr><tr><td><strong>Install time</strong></td><td>Fast</td><td>Slower (PySpark build)</td></tr><tr><td><strong>PySpark dependencies</strong></td><td>❌ None</td><td>✅ PySpark 3.5.6 + PyDeequ</td></tr></tbody></table><h2 class="anchor anchorWithStickyNavbar_LWe7" id="configuration">Configuration<a href="#configuration" class="hash-link" aria-label="Direct link to Configuration" title="Direct link to Configuration"></a></h2><h3 class="anchor anchorWithStickyNavbar_LWe7" id="with-standard-installation-pyspark-included">With Standard Installation (PySpark included)<a href="#with-standard-installation-pyspark-included" class="hash-link" aria-label="Direct link to With Standard Installation (PySpark included)" title="Direct link to With Standard Installation (PySpark included)"></a></h3><p>When you install <code>acryl-datahub[s3]</code>, profiling works out of the box:</p><div class="language-yaml codeBlockContainer_Ckt0 theme-code-block" style="--prism-color:#bfc7d5;--prism-background-color:#292d3e"><div class="codeBlockContent_biex"><pre tabindex="0" class="prism-code language-yaml codeBlock_bY9V thin-scrollbar"><code class="codeBlockLines_e6Vv"><span class="token-line" style="color:#bfc7d5"><span class="token key atrule">source</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain"> </span><span class="token key atrule">type</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain"> s3</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain"> </span><span class="token key atrule">config</span><span 
class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain"> </span><span class="token key atrule">path_specs</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(199, 146, 234)">-</span><span class="token plain"> </span><span class="token key atrule">include</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain"> s3</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain">//my</span><span class="token punctuation" style="color:rgb(199, 146, 234)">-</span><span class="token plain">bucket/data/</span><span class="token important">**/*.parquet</span><span class="token plain"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain"> </span><span class="token key atrule">profiling</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain"> </span><span class="token key atrule">enabled</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain"> </span><span class="token boolean important" style="color:rgb(255, 88, 116)">true</span><span class="token plain"> </span><span class="token comment" style="color:rgb(105, 112, 152);font-style:italic"># Works seamlessly with standard installation</span><span class="token plain"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain"> </span><span class="token key atrule">profile_table_level_only</span><span class="token punctuation" style="color:rgb(199, 146, 
234)">:</span><span class="token plain"> </span><span class="token boolean important" style="color:rgb(255, 88, 116)">false</span><br></span></code></pre><div class="buttonGroup__atx"><button type="button" aria-label="Copy code to clipboard" title="Copy" class="clean-btn"><span class="copyButtonIcons_eSgA" aria-hidden="true"><svg viewBox="0 0 24 24" class="copyButtonIcon_y97N"><path fill="currentColor" d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"></path></svg><svg viewBox="0 0 24 24" class="copyButtonSuccessIcon_LjdS"><path fill="currentColor" d="M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z"></path></svg></span></button></div></div></div><h3 class="anchor anchorWithStickyNavbar_LWe7" id="with-slim-installation-no-pyspark">With Slim Installation (no PySpark)<a href="#with-slim-installation-no-pyspark" class="hash-link" aria-label="Direct link to With Slim Installation (no PySpark)" title="Direct link to With Slim Installation (no PySpark)"></a></h3><p>When you install <code>s3-slim</code>, disable profiling in your config:</p><div class="language-yaml codeBlockContainer_Ckt0 theme-code-block" style="--prism-color:#bfc7d5;--prism-background-color:#292d3e"><div class="codeBlockContent_biex"><pre tabindex="0" class="prism-code language-yaml codeBlock_bY9V thin-scrollbar"><code class="codeBlockLines_e6Vv"><span class="token-line" style="color:#bfc7d5"><span class="token key atrule">source</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain"> </span><span class="token key atrule">type</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain"> s3</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain"> </span><span class="token key atrule">config</span><span 
class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain"> </span><span class="token key atrule">path_specs</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(199, 146, 234)">-</span><span class="token plain"> </span><span class="token key atrule">include</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain"> s3</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain">//my</span><span class="token punctuation" style="color:rgb(199, 146, 234)">-</span><span class="token plain">bucket/data/</span><span class="token important">**/*.parquet</span><span class="token plain"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain"> </span><span class="token key atrule">profiling</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain"> </span><span class="token key atrule">enabled</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain"> </span><span class="token boolean important" style="color:rgb(255, 88, 116)">false</span><span class="token plain"> </span><span class="token comment" style="color:rgb(105, 112, 152);font-style:italic"># Required for s3-slim installation</span><br></span></code></pre><div class="buttonGroup__atx"><button type="button" aria-label="Copy code to clipboard" title="Copy" class="clean-btn"><span class="copyButtonIcons_eSgA" aria-hidden="true"><svg viewBox="0 0 24 24" class="copyButtonIcon_y97N"><path 
fill="currentColor" d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"></path></svg><svg viewBox="0 0 24 24" class="copyButtonSuccessIcon_LjdS"><path fill="currentColor" d="M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z"></path></svg></span></button></div></div></div><p><strong>If you enable profiling with s3-slim installation</strong>, you&#x27;ll see a clear error message at runtime:</p><div class="codeBlockContainer_Ckt0 theme-code-block" style="--prism-color:#bfc7d5;--prism-background-color:#292d3e"><div class="codeBlockContent_biex"><pre tabindex="0" class="prism-code language-text codeBlock_bY9V thin-scrollbar"><code class="codeBlockLines_e6Vv"><span class="token-line" style="color:#bfc7d5"><span class="token plain">RuntimeError: PySpark is not installed, but is required for S3 profiling.</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">Please install with: pip install &#x27;acryl-datahub[s3]&#x27;</span><br></span></code></pre><div class="buttonGroup__atx"><button type="button" aria-label="Copy code to clipboard" title="Copy" class="clean-btn"><span class="copyButtonIcons_eSgA" aria-hidden="true"><svg viewBox="0 0 24 24" class="copyButtonIcon_y97N"><path fill="currentColor" d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"></path></svg><svg viewBox="0 0 24 24" class="copyButtonSuccessIcon_LjdS"><path fill="currentColor" d="M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z"></path></svg></span></button></div></div></div><h2 class="anchor anchorWithStickyNavbar_LWe7" id="developer-guide">Developer Guide<a href="#developer-guide" class="hash-link" aria-label="Direct link to Developer Guide" title="Direct link to Developer Guide"></a></h2><h3 class="anchor anchorWithStickyNavbar_LWe7" id="implementation-pattern">Implementation Pattern<a 
href="#implementation-pattern" class="hash-link" aria-label="Direct link to Implementation Pattern" title="Direct link to Implementation Pattern"></a></h3><p>The S3 source demonstrates the recommended pattern for isolating PySpark-dependent code. This pattern can be applied to ABS and other sources in future PRs.</p><p><strong>Architecture (currently implemented for S3 only):</strong></p><ol><li><strong>Main source class</strong> (<code>source.py</code>) - Contains no PySpark imports at module level</li><li><strong>Profiler class</strong> (<code>profiling.py</code>) - Encapsulates all PySpark/PyDeequ logic in <code>SparkProfiler</code> class</li><li><strong>Conditional instantiation</strong> - <code>SparkProfiler</code> created only when profiling is enabled</li><li><strong>TYPE_CHECKING imports</strong> - Type annotations use TYPE_CHECKING block for optional dependencies</li></ol><p><strong>Key Benefits:</strong></p><ul><li>✅ Type safety preserved (mypy passes without issues)</li><li>✅ Proper code layer separation</li><li>✅ Works with both standard and <code>-slim</code> installations</li><li>✅ Clear error messages when dependencies missing</li><li>✅ Pattern can be reused for ABS and other sources</li></ul><p><strong>Example structure:</strong></p><div class="language-python codeBlockContainer_Ckt0 theme-code-block" style="--prism-color:#bfc7d5;--prism-background-color:#292d3e"><div class="codeBlockContent_biex"><pre tabindex="0" class="prism-code language-python codeBlock_bY9V thin-scrollbar"><code class="codeBlockLines_e6Vv"><span class="token-line" style="color:#bfc7d5"><span class="token comment" style="color:rgb(105, 112, 152);font-style:italic"># source.py</span><span class="token plain"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain"></span><span class="token keyword" style="font-style:italic">if</span><span class="token plain"> TYPE_CHECKING</span><span class="token punctuation" style="color:rgb(199, 146, 
234)">:</span><span class="token plain"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain"> </span><span class="token keyword" style="font-style:italic">from</span><span class="token plain"> datahub</span><span class="token punctuation" style="color:rgb(199, 146, 234)">.</span><span class="token plain">ingestion</span><span class="token punctuation" style="color:rgb(199, 146, 234)">.</span><span class="token plain">source</span><span class="token punctuation" style="color:rgb(199, 146, 234)">.</span><span class="token plain">s3</span><span class="token punctuation" style="color:rgb(199, 146, 234)">.</span><span class="token plain">profiling </span><span class="token keyword" style="font-style:italic">import</span><span class="token plain"> SparkProfiler</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain"></span><span class="token keyword" style="font-style:italic">class</span><span class="token plain"> </span><span class="token class-name" style="color:rgb(255, 203, 107)">S3Source</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain"> profiler</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain"> Optional</span><span class="token punctuation" style="color:rgb(199, 146, 234)">[</span><span class="token string" style="color:rgb(195, 232, 141)">&quot;SparkProfiler&quot;</span><span class="token punctuation" style="color:rgb(199, 146, 234)">]</span><span class="token plain"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain"> 
</span><span class="token keyword" style="font-style:italic">def</span><span class="token plain"> </span><span class="token function" style="color:rgb(130, 170, 255)">__init__</span><span class="token punctuation" style="color:rgb(199, 146, 234)">(</span><span class="token plain">self</span><span class="token punctuation" style="color:rgb(199, 146, 234)">,</span><span class="token plain"> config</span><span class="token punctuation" style="color:rgb(199, 146, 234)">,</span><span class="token plain"> ctx</span><span class="token punctuation" style="color:rgb(199, 146, 234)">)</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain"> </span><span class="token keyword" style="font-style:italic">if</span><span class="token plain"> config</span><span class="token punctuation" style="color:rgb(199, 146, 234)">.</span><span class="token plain">is_profiling_enabled</span><span class="token punctuation" style="color:rgb(199, 146, 234)">(</span><span class="token punctuation" style="color:rgb(199, 146, 234)">)</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain"> </span><span class="token keyword" style="font-style:italic">from</span><span class="token plain"> datahub</span><span class="token punctuation" style="color:rgb(199, 146, 234)">.</span><span class="token plain">ingestion</span><span class="token punctuation" style="color:rgb(199, 146, 234)">.</span><span class="token plain">source</span><span class="token punctuation" style="color:rgb(199, 146, 234)">.</span><span class="token plain">s3</span><span class="token punctuation" style="color:rgb(199, 146, 234)">.</span><span class="token plain">profiling </span><span class="token keyword" style="font-style:italic">import</span><span 
class="token plain"> SparkProfiler</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain"> self</span><span class="token punctuation" style="color:rgb(199, 146, 234)">.</span><span class="token plain">profiler </span><span class="token operator" style="color:rgb(137, 221, 255)">=</span><span class="token plain"> SparkProfiler</span><span class="token punctuation" style="color:rgb(199, 146, 234)">(</span><span class="token punctuation" style="color:rgb(199, 146, 234)">.</span><span class="token punctuation" style="color:rgb(199, 146, 234)">.</span><span class="token punctuation" style="color:rgb(199, 146, 234)">.</span><span class="token punctuation" style="color:rgb(199, 146, 234)">)</span><span class="token plain"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain"> </span><span class="token keyword" style="font-style:italic">else</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain"> self</span><span class="token punctuation" style="color:rgb(199, 146, 234)">.</span><span class="token plain">profiler </span><span class="token operator" style="color:rgb(137, 221, 255)">=</span><span class="token plain"> </span><span class="token boolean" style="color:rgb(255, 88, 116)">None</span><br></span></code></pre><div class="buttonGroup__atx"><button type="button" aria-label="Copy code to clipboard" title="Copy" class="clean-btn"><span class="copyButtonIcons_eSgA" aria-hidden="true"><svg viewBox="0 0 24 24" class="copyButtonIcon_y97N"><path fill="currentColor" d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"></path></svg><svg viewBox="0 0 24 24" class="copyButtonSuccessIcon_LjdS"><path fill="currentColor" 
d="M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z"></path></svg></span></button></div></div></div><div class="language-python codeBlockContainer_Ckt0 theme-code-block" style="--prism-color:#bfc7d5;--prism-background-color:#292d3e"><div class="codeBlockContent_biex"><pre tabindex="0" class="prism-code language-python codeBlock_bY9V thin-scrollbar"><code class="codeBlockLines_e6Vv"><span class="token-line" style="color:#bfc7d5"><span class="token comment" style="color:rgb(105, 112, 152);font-style:italic"># profiling.py</span><span class="token plain"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain"></span><span class="token keyword" style="font-style:italic">class</span><span class="token plain"> </span><span class="token class-name" style="color:rgb(255, 203, 107)">SparkProfiler</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain"> </span><span class="token triple-quoted-string string" style="color:rgb(195, 232, 141)">&quot;&quot;&quot;Encapsulates all PySpark/PyDeequ profiling logic.&quot;&quot;&quot;</span><span class="token plain"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain"> </span><span class="token keyword" style="font-style:italic">def</span><span class="token plain"> </span><span class="token function" style="color:rgb(130, 170, 255)">init_spark</span><span class="token punctuation" style="color:rgb(199, 146, 234)">(</span><span class="token plain">self</span><span class="token punctuation" style="color:rgb(199, 146, 234)">)</span><span class="token plain"> </span><span class="token operator" style="color:rgb(137, 221, 255)">-</span><span class="token operator" style="color:rgb(137, 221, 
255)">&gt;</span><span class="token plain"> Any</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain"> </span><span class="token comment" style="color:rgb(105, 112, 152);font-style:italic"># Spark session initialization</span><span class="token plain"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain"> </span><span class="token keyword" style="font-style:italic">def</span><span class="token plain"> </span><span class="token function" style="color:rgb(130, 170, 255)">read_file_spark</span><span class="token punctuation" style="color:rgb(199, 146, 234)">(</span><span class="token plain">self</span><span class="token punctuation" style="color:rgb(199, 146, 234)">,</span><span class="token plain"> </span><span class="token builtin" style="color:rgb(130, 170, 255)">file</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain"> </span><span class="token builtin" style="color:rgb(130, 170, 255)">str</span><span class="token punctuation" style="color:rgb(199, 146, 234)">,</span><span class="token plain"> ext</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain"> </span><span class="token builtin" style="color:rgb(130, 170, 255)">str</span><span class="token punctuation" style="color:rgb(199, 146, 234)">)</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain"> </span><span class="token comment" style="color:rgb(105, 112, 152);font-style:italic"># File reading with Spark</span><span class="token plain"></span><br></span><span 
class="token-line" style="color:#bfc7d5"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain"> </span><span class="token keyword" style="font-style:italic">def</span><span class="token plain"> </span><span class="token function" style="color:rgb(130, 170, 255)">get_table_profile</span><span class="token punctuation" style="color:rgb(199, 146, 234)">(</span><span class="token plain">self</span><span class="token punctuation" style="color:rgb(199, 146, 234)">,</span><span class="token plain"> table_data</span><span class="token punctuation" style="color:rgb(199, 146, 234)">,</span><span class="token plain"> dataset_urn</span><span class="token punctuation" style="color:rgb(199, 146, 234)">)</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain"> </span><span class="token comment" style="color:rgb(105, 112, 152);font-style:italic"># Table profiling coordination</span><br></span></code></pre><div class="buttonGroup__atx"><button type="button" aria-label="Copy code to clipboard" title="Copy" class="clean-btn"><span class="copyButtonIcons_eSgA" aria-hidden="true"><svg viewBox="0 0 24 24" class="copyButtonIcon_y97N"><path fill="currentColor" d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"></path></svg><svg viewBox="0 0 24 24" class="copyButtonSuccessIcon_LjdS"><path fill="currentColor" d="M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z"></path></svg></span></button></div></div></div><p>For more details, see the <a href="/docs/metadata-ingestion/adding-source#31-using-optional-dependencies-eg-pyspark">Adding a Metadata Ingestion Source</a> guide.</p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="troubleshooting">Troubleshooting<a href="#troubleshooting" 
class="hash-link" aria-label="Direct link to Troubleshooting" title="Direct link to Troubleshooting"></a></h2><h3 class="anchor anchorWithStickyNavbar_LWe7" id="error-pyspark-is-not-installed-but-is-required-for-profiling">Error: &quot;PySpark is not installed, but is required for profiling&quot;<a href="#error-pyspark-is-not-installed-but-is-required-for-profiling" class="hash-link" aria-label="Direct link to Error: &quot;PySpark is not installed, but is required for profiling&quot;" title="Direct link to Error: &quot;PySpark is not installed, but is required for profiling&quot;"></a></h3><p><strong>Problem:</strong> You installed a <code>-slim</code> variant but have profiling enabled in your config.</p><p><strong>Solutions:</strong></p><ol><li><p><strong>Recommended:</strong> Use standard installation with PySpark:</p><div class="language-bash codeBlockContainer_Ckt0 theme-code-block" style="--prism-color:#bfc7d5;--prism-background-color:#292d3e"><div class="codeBlockContent_biex"><pre tabindex="0" class="prism-code language-bash codeBlock_bY9V thin-scrollbar"><code class="codeBlockLines_e6Vv"><span class="token-line" style="color:#bfc7d5"><span class="token plain">pip uninstall acryl-datahub</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">pip </span><span class="token function" style="color:rgb(130, 170, 255)">install</span><span class="token plain"> </span><span class="token string" style="color:rgb(195, 232, 141)">&#x27;acryl-datahub[s3]&#x27;</span><span class="token plain"> </span><span class="token comment" style="color:rgb(105, 112, 152);font-style:italic"># For S3 profiling</span><br></span></code></pre><div class="buttonGroup__atx"><button type="button" aria-label="Copy code to clipboard" title="Copy" class="clean-btn"><span class="copyButtonIcons_eSgA" aria-hidden="true"><svg viewBox="0 0 24 24" class="copyButtonIcon_y97N"><path fill="currentColor" d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 
0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"></path></svg><svg viewBox="0 0 24 24" class="copyButtonSuccessIcon_LjdS"><path fill="currentColor" d="M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z"></path></svg></span></button></div></div></div></li><li><p><strong>Alternative:</strong> Disable profiling in your recipe:</p><div class="language-yaml codeBlockContainer_Ckt0 theme-code-block" style="--prism-color:#bfc7d5;--prism-background-color:#292d3e"><div class="codeBlockContent_biex"><pre tabindex="0" class="prism-code language-yaml codeBlock_bY9V thin-scrollbar"><code class="codeBlockLines_e6Vv"><span class="token-line" style="color:#bfc7d5"><span class="token key atrule">profiling</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain"> </span><span class="token key atrule">enabled</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain"> </span><span class="token boolean important" style="color:rgb(255, 88, 116)">false</span><br></span></code></pre><div class="buttonGroup__atx"><button type="button" aria-label="Copy code to clipboard" title="Copy" class="clean-btn"><span class="copyButtonIcons_eSgA" aria-hidden="true"><svg viewBox="0 0 24 24" class="copyButtonIcon_y97N"><path fill="currentColor" d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"></path></svg><svg viewBox="0 0 24 24" class="copyButtonSuccessIcon_LjdS"><path fill="currentColor" d="M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z"></path></svg></span></button></div></div></div></li></ol><h3 class="anchor anchorWithStickyNavbar_LWe7" id="verifying-installation">Verifying Installation<a href="#verifying-installation" class="hash-link" aria-label="Direct link to Verifying Installation" title="Direct link to 
Verifying Installation"></a></h3><p>Check if PySpark is installed:</p><div class="language-bash codeBlockContainer_Ckt0 theme-code-block" style="--prism-color:#bfc7d5;--prism-background-color:#292d3e"><div class="codeBlockContent_biex"><pre tabindex="0" class="prism-code language-bash codeBlock_bY9V thin-scrollbar"><code class="codeBlockLines_e6Vv"><span class="token-line" style="color:#bfc7d5"><span class="token comment" style="color:rgb(105, 112, 152);font-style:italic"># Check installed packages</span><span class="token plain"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">pip list </span><span class="token operator" style="color:rgb(137, 221, 255)">|</span><span class="token plain"> </span><span class="token function" style="color:rgb(130, 170, 255)">grep</span><span class="token plain"> pyspark</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain"></span><span class="token comment" style="color:rgb(105, 112, 152);font-style:italic"># Test import in Python</span><span class="token plain"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">python -c </span><span class="token string" style="color:rgb(195, 232, 141)">&quot;import pyspark; print(pyspark.__version__)&quot;</span><br></span></code></pre><div class="buttonGroup__atx"><button type="button" aria-label="Copy code to clipboard" title="Copy" class="clean-btn"><span class="copyButtonIcons_eSgA" aria-hidden="true"><svg viewBox="0 0 24 24" class="copyButtonIcon_y97N"><path fill="currentColor" d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"></path></svg><svg viewBox="0 0 24 24" class="copyButtonSuccessIcon_LjdS"><path fill="currentColor" 
d="M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z"></path></svg></span></button></div></div></div><p>Expected output:</p><ul><li>Standard installation (<code>s3</code>): Shows <code>pyspark 3.5.x</code></li><li>Slim installation (<code>s3-slim</code>): Import fails or package not found</li></ul><h2 class="anchor anchorWithStickyNavbar_LWe7" id="migration-guide">Migration Guide<a href="#migration-guide" class="hash-link" aria-label="Direct link to Migration Guide" title="Direct link to Migration Guide"></a></h2><h3 class="anchor anchorWithStickyNavbar_LWe7" id="upgrading-from-previous-versions">Upgrading from Previous Versions<a href="#upgrading-from-previous-versions" class="hash-link" aria-label="Direct link to Upgrading from Previous Versions" title="Direct link to Upgrading from Previous Versions"></a></h3><p><strong>No action required!</strong> This change is fully backward compatible:</p><div class="language-bash codeBlockContainer_Ckt0 theme-code-block" style="--prism-color:#bfc7d5;--prism-background-color:#292d3e"><div class="codeBlockContent_biex"><pre tabindex="0" class="prism-code language-bash codeBlock_bY9V thin-scrollbar"><code class="codeBlockLines_e6Vv"><span class="token-line" style="color:#bfc7d5"><span class="token comment" style="color:rgb(105, 112, 152);font-style:italic"># Existing installations continue to work exactly as before</span><span class="token plain"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">pip </span><span class="token function" style="color:rgb(130, 170, 255)">install</span><span class="token plain"> </span><span class="token string" style="color:rgb(195, 232, 141)">&#x27;acryl-datahub[s3]&#x27;</span><span class="token plain"> </span><span class="token comment" style="color:rgb(105, 112, 152);font-style:italic"># Still includes PySpark by default (profiling supported)</span><br></span></code></pre><div class="buttonGroup__atx"><button type="button" aria-label="Copy code to 
clipboard" title="Copy" class="clean-btn"><span class="copyButtonIcons_eSgA" aria-hidden="true"><svg viewBox="0 0 24 24" class="copyButtonIcon_y97N"><path fill="currentColor" d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"></path></svg><svg viewBox="0 0 24 24" class="copyButtonSuccessIcon_LjdS"><path fill="currentColor" d="M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z"></path></svg></span></button></div></div></div><p><strong>Recommended: Optimize installations</strong></p><ul><li><strong>S3 with profiling:</strong> Keep using <code>acryl-datahub[s3]</code> (includes PySpark)</li><li><strong>S3 without profiling:</strong> Switch to <code>acryl-datahub[s3-slim]</code> to save ~500MB</li></ul><div class="language-bash codeBlockContainer_Ckt0 theme-code-block" style="--prism-color:#bfc7d5;--prism-background-color:#292d3e"><div class="codeBlockContent_biex"><pre tabindex="0" class="prism-code language-bash codeBlock_bY9V thin-scrollbar"><code class="codeBlockLines_e6Vv"><span class="token-line" style="color:#bfc7d5"><span class="token comment" style="color:rgb(105, 112, 152);font-style:italic"># Recommended installations</span><span class="token plain"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">pip </span><span class="token function" style="color:rgb(130, 170, 255)">install</span><span class="token plain"> </span><span class="token string" style="color:rgb(195, 232, 141)">&#x27;acryl-datahub[s3]&#x27;</span><span class="token plain"> </span><span class="token comment" style="color:rgb(105, 112, 152);font-style:italic"># S3 with profiling support</span><span class="token plain"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">pip </span><span class="token function" style="color:rgb(130, 170, 255)">install</span><span class="token plain"> </span><span class="token string" style="color:rgb(195, 
232, 141)">&#x27;acryl-datahub[s3-slim]&#x27;</span><span class="token plain"> </span><span class="token comment" style="color:rgb(105, 112, 152);font-style:italic"># S3 metadata only (no profiling)</span><br></span></code></pre><div class="buttonGroup__atx"><button type="button" aria-label="Copy code to clipboard" title="Copy" class="clean-btn"><span class="copyButtonIcons_eSgA" aria-hidden="true"><svg viewBox="0 0 24 24" class="copyButtonIcon_y97N"><path fill="currentColor" d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"></path></svg><svg viewBox="0 0 24 24" class="copyButtonSuccessIcon_LjdS"><path fill="currentColor" d="M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z"></path></svg></span></button></div></div></div><h3 class="anchor anchorWithStickyNavbar_LWe7" id="no-breaking-changes">No Breaking Changes<a href="#no-breaking-changes" class="hash-link" aria-label="Direct link to No Breaking Changes" title="Direct link to No Breaking Changes"></a></h3><p>This implementation maintains full backward compatibility:</p><ul><li>Standard <code>s3</code> extra includes PySpark (unchanged behavior)</li><li>All existing recipes and configs continue to work</li><li>New <code>s3-slim</code> variant available for users who want smaller installations</li><li>Future DataHub 2.0 may flip defaults, but provides migration path</li></ul><h2 class="anchor anchorWithStickyNavbar_LWe7" id="benefits-for-datahub-actions">Benefits for DataHub Actions<a href="#benefits-for-datahub-actions" class="hash-link" aria-label="Direct link to Benefits for DataHub Actions" title="Direct link to Benefits for DataHub Actions"></a></h2><p><a href="https://github.com/datahub-project/datahub/tree/master/datahub-actions" target="_blank" rel="noopener noreferrer">DataHub Actions</a> depends on <code>acryl-datahub</code> and can benefit from <code>s3-slim</code> when profiling is not needed:</p><h3 class="anchor 
anchorWithStickyNavbar_LWe7" id="reduced-installation-size">Reduced Installation Size<a href="#reduced-installation-size" class="hash-link" aria-label="Direct link to Reduced Installation Size" title="Direct link to Reduced Installation Size"></a></h3><p>DataHub Actions typically doesn&#x27;t need data lake profiling capabilities since it focuses on reacting to metadata events, not extracting metadata from data lakes. Use <code>s3-slim</code> to reduce footprint:</p><div class="language-bash codeBlockContainer_Ckt0 theme-code-block" style="--prism-color:#bfc7d5;--prism-background-color:#292d3e"><div class="codeBlockContent_biex"><pre tabindex="0" class="prism-code language-bash codeBlock_bY9V thin-scrollbar"><code class="codeBlockLines_e6Vv"><span class="token-line" style="color:#bfc7d5"><span class="token comment" style="color:rgb(105, 112, 152);font-style:italic"># If Actions needs S3 metadata access but not profiling</span><span class="token plain"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">pip </span><span class="token function" style="color:rgb(130, 170, 255)">install</span><span class="token plain"> acryl-datahub-actions</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">pip </span><span class="token function" style="color:rgb(130, 170, 255)">install</span><span class="token plain"> </span><span class="token string" style="color:rgb(195, 232, 141)">&#x27;acryl-datahub[s3-slim]&#x27;</span><span class="token plain"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain"></span><span class="token comment" style="color:rgb(105, 112, 152);font-style:italic"># Result: ~500MB smaller than standard s3 extra</span><span class="token plain"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token 
plain"></span><span class="token comment" style="color:rgb(105, 112, 152);font-style:italic"># If Actions needs full S3 with profiling</span><span class="token plain"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">pip </span><span class="token function" style="color:rgb(130, 170, 255)">install</span><span class="token plain"> acryl-datahub-actions</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">pip </span><span class="token function" style="color:rgb(130, 170, 255)">install</span><span class="token plain"> </span><span class="token string" style="color:rgb(195, 232, 141)">&#x27;acryl-datahub[s3]&#x27;</span><span class="token plain"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain"></span><span class="token comment" style="color:rgb(105, 112, 152);font-style:italic"># Result: Includes PySpark for profiling capabilities</span><br></span></code></pre><div class="buttonGroup__atx"><button type="button" aria-label="Copy code to clipboard" title="Copy" class="clean-btn"><span class="copyButtonIcons_eSgA" aria-hidden="true"><svg viewBox="0 0 24 24" class="copyButtonIcon_y97N"><path fill="currentColor" d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"></path></svg><svg viewBox="0 0 24 24" class="copyButtonSuccessIcon_LjdS"><path fill="currentColor" d="M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z"></path></svg></span></button></div></div></div><h3 class="anchor anchorWithStickyNavbar_LWe7" id="faster-deployment">Faster Deployment<a href="#faster-deployment" class="hash-link" aria-label="Direct link to Faster Deployment" title="Direct link to Faster Deployment"></a></h3><p>Actions services using <code>s3-slim</code> deploy faster in containerized environments:</p><ul><li><strong>Faster pip install</strong>: No PySpark compilation required</li><li><strong>Smaller 
Docker images</strong>: Reduced base image size</li><li><strong>Quicker cold starts</strong>: Less code to load and initialize</li></ul><h3 class="anchor anchorWithStickyNavbar_LWe7" id="fewer-dependency-conflicts">Fewer Dependency Conflicts<a href="#fewer-dependency-conflicts" class="hash-link" aria-label="Direct link to Fewer Dependency Conflicts" title="Direct link to Fewer Dependency Conflicts"></a></h3><p>Actions workflows often integrate with other tools (Slack, Teams, email services). Using <code>s3-slim</code> reduces:</p><ul><li>Python version constraint conflicts</li><li>Java/Spark runtime conflicts in restricted environments</li><li>Transitive dependency version mismatches</li></ul><h3 class="anchor anchorWithStickyNavbar_LWe7" id="when-actions-needs-profiling">When Actions Needs Profiling<a href="#when-actions-needs-profiling" class="hash-link" aria-label="Direct link to When Actions Needs Profiling" title="Direct link to When Actions Needs Profiling"></a></h3><p>If your Actions workflow needs to trigger data lake profiling jobs, use the standard extra:</p><div class="language-bash codeBlockContainer_Ckt0 theme-code-block" style="--prism-color:#bfc7d5;--prism-background-color:#292d3e"><div class="codeBlockContent_biex"><pre tabindex="0" class="prism-code language-bash codeBlock_bY9V thin-scrollbar"><code class="codeBlockLines_e6Vv"><span class="token-line" style="color:#bfc7d5"><span class="token comment" style="color:rgb(105, 112, 152);font-style:italic"># Actions with data lake profiling capability</span><span class="token plain"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">pip </span><span class="token function" style="color:rgb(130, 170, 255)">install</span><span class="token plain"> </span><span class="token string" style="color:rgb(195, 232, 141)">&#x27;acryl-datahub-actions&#x27;</span><span class="token plain"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token 
plain">pip </span><span class="token function" style="color:rgb(130, 170, 255)">install</span><span class="token plain"> </span><span class="token string" style="color:rgb(195, 232, 141)">&#x27;acryl-datahub[s3]&#x27;</span><span class="token plain"> </span><span class="token comment" style="color:rgb(105, 112, 152);font-style:italic"># Includes PySpark by default</span><br></span></code></pre><div class="buttonGroup__atx"><button type="button" aria-label="Copy code to clipboard" title="Copy" class="clean-btn"><span class="copyButtonIcons_eSgA" aria-hidden="true"><svg viewBox="0 0 24 24" class="copyButtonIcon_y97N"><path fill="currentColor" d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"></path></svg><svg viewBox="0 0 24 24" class="copyButtonSuccessIcon_LjdS"><path fill="currentColor" d="M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z"></path></svg></span></button></div></div></div><p><strong>Common Actions use cases that DON&#x27;T need PySpark:</strong></p><ul><li>Slack notifications on schema changes</li><li>Propagating tags and terms to downstream systems</li><li>Triggering dbt runs on metadata updates</li><li>Sending emails on data quality failures</li><li>Creating Jira tickets for governance issues</li><li>Updating external catalogs (e.g., Alation, Collibra)</li></ul><p><strong>Rare Actions use cases that MIGHT need PySpark:</strong></p><ul><li>Custom actions that programmatically trigger S3 profiling</li><li>Actions that directly process data lake files (not typical)</li></ul><h2 class="anchor anchorWithStickyNavbar_LWe7" id="benefits-summary">Benefits Summary<a href="#benefits-summary" class="hash-link" aria-label="Direct link to Benefits Summary" title="Direct link to Benefits Summary"></a></h2><p><strong>Backward compatible</strong>: Standard <code>s3</code> extra unchanged, existing users unaffected
<br><strong>Smaller installations</strong>: Save ~500MB with <code>s3-slim</code>
<br><strong>Faster setup</strong>: No PySpark compilation with <code>s3-slim</code>
<br><strong>Flexible deployment</strong>: Choose based on profiling needs
<br><strong>Type safety maintained</strong>: Refactored with proper code layer separation (mypy passes)
<br><strong>Clear error messages</strong>: Runtime errors guide users to correct installation<br>
<strong>Actions-friendly</strong>: DataHub Actions benefits from reduced footprint with <code>s3-slim</code></p><p><strong>Key Takeaways:</strong></p><ul><li>Use <code>s3</code> if you need S3 profiling, <code>s3-slim</code> if you don&#x27;t</li><li>Pattern can be applied to other sources (ABS, etc.) in future PRs</li><li>Existing installations continue working without changes</li></ul></div><footer class="theme-doc-footer docusaurus-mt-lg"><div class="slackUtm_uoBr"><div class="slackUtm_uoBr"><hr>Need more help? Join the conversation in <a href="https://datahub.com/slack?utm_source=docs&amp;utm_medium=footer&amp;utm_campaign=docs_footer&amp;utm_content=docs/PYSPARK">Slack!</a></div></div><div class="theme-doc-footer-edit-meta-row row"><div class="col"><a href="https://github.com/datahub-project/datahub/blob/master/docs/PYSPARK.md" target="_blank" rel="noreferrer noopener" class="theme-edit-this-page"><svg fill="currentColor" height="20" width="20" viewBox="0 0 40 40" class="iconEdit_Z9Sw" aria-hidden="true"><g><path d="m34.5 11.7l-3 3.1-6.3-6.3 3.1-3q0.5-0.5 1.2-0.5t1.1 0.5l3.9 3.9q0.5 0.4 0.5 1.1t-0.5 1.2z m-29.5 17.1l18.4-18.5 6.3 6.3-18.4 18.4h-6.3v-6.2z"></path></g></svg>Edit this page</a></div><div class="col lastUpdated_VsjB"></div></div></footer><div class="feedbackWrapper_mUHF"><div class="feedbackWidget_PX4d"><div class="feedbackButtons_wn3V"><strong>Is this page helpful?</strong><div><button class="feedbackButton_UgQs"><span role="img" aria-label="like" class="anticon anticon-like"><svg viewBox="64 64 896 896" focusable="false" data-icon="like" width="1em" height="1em" fill="currentColor" aria-hidden="true"><path d="M885.9 533.7c16.8-22.2 26.1-49.4 26.1-77.7 0-44.9-25.1-87.4-65.5-111.1a67.67 67.67 0 00-34.3-9.3H572.4l6-122.9c1.4-29.7-9.1-57.9-29.5-79.4A106.62 106.62 0 00471 99.9c-52 0-98 35-111.8 85.1l-85.9 311H144c-17.7 0-32 14.3-32 32v364c0 17.7 14.3 32 32 32h601.3c9.2 0 18.2-1.8 26.5-5.4 47.6-20.3 78.3-66.8 78.3-118.4 0-12.6-1.8-25-5.4-37 16.8-22.2 
26.1-49.4 26.1-77.7 0-12.6-1.8-25-5.4-37 16.8-22.2 26.1-49.4 26.1-77.7-.2-12.6-2-25.1-5.6-37.1zM184 852V568h81v284h-81zm636.4-353l-21.9 19 13.9 25.4a56.2 56.2 0 016.9 27.3c0 16.5-7.2 32.2-19.6 43l-21.9 19 13.9 25.4a56.2 56.2 0 016.9 27.3c0 16.5-7.2 32.2-19.6 43l-21.9 19 13.9 25.4a56.2 56.2 0 016.9 27.3c0 22.4-13.2 42.6-33.6 51.8H329V564.8l99.5-360.5a44.1 44.1 0 0142.2-32.3c7.6 0 15.1 2.2 21.1 6.7 9.9 7.4 15.2 18.6 14.6 30.5l-9.6 198.4h314.4C829 418.5 840 436.9 840 456c0 16.5-7.2 32.1-19.6 43z"></path></svg></span></button><button class="feedbackButton_UgQs"><span role="img" aria-label="dislike" class="anticon anticon-dislike"><svg viewBox="64 64 896 896" focusable="false" data-icon="dislike" width="1em" height="1em" fill="currentColor" aria-hidden="true"><path d="M885.9 490.3c3.6-12 5.4-24.4 5.4-37 0-28.3-9.3-55.5-26.1-77.7 3.6-12 5.4-24.4 5.4-37 0-28.3-9.3-55.5-26.1-77.7 3.6-12 5.4-24.4 5.4-37 0-51.6-30.7-98.1-78.3-118.4a66.1 66.1 0 00-26.5-5.4H144c-17.7 0-32 14.3-32 32v364c0 17.7 14.3 32 32 32h129.3l85.8 310.8C372.9 889 418.9 924 470.9 924c29.7 0 57.4-11.8 77.9-33.4 20.5-21.5 31-49.7 29.5-79.4l-6-122.9h239.9c12.1 0 23.9-3.2 34.3-9.3 40.4-23.5 65.5-66.1 65.5-111 0-28.3-9.3-55.5-26.1-77.7zM184 456V172h81v284h-81zm627.2 160.4H496.8l9.6 198.4c.6 11.9-4.7 23.1-14.6 30.5-6.1 4.5-13.6 6.8-21.1 6.7a44.28 44.28 0 01-42.2-32.3L329 459.2V172h415.4a56.85 56.85 0 0133.6 51.8c0 9.7-2.3 18.9-6.9 27.3l-13.9 25.4 21.9 19a56.76 56.76 0 0119.6 43c0 9.7-2.3 18.9-6.9 27.3l-13.9 25.4 21.9 19a56.76 56.76 0 0119.6 43c0 9.7-2.3 18.9-6.9 27.3l-14 25.5 21.9 19a56.76 56.76 0 0119.6 43c0 19.1-11 37.5-28.8 48.4z"></path></svg></span></button></div></div></div></div></article><nav class="pagination-nav docusaurus-mt-lg" aria-label="Docs pages"></nav></div></div><div class="col col--3"><div class="tableOfContents_bqdL thin-scrollbar theme-doc-toc-desktop"><ul class="table-of-contents table-of-contents__left-border"><li><a href="#overview" class="table-of-contents__link 
toc-highlight">Overview</a></li><li><a href="#pyspark-version" class="table-of-contents__link toc-highlight">PySpark Version</a></li><li><a href="#installation-options" class="table-of-contents__link toc-highlight">Installation Options</a><ul><li><a href="#standard-installation-includes-pyspark" class="table-of-contents__link toc-highlight">Standard Installation (includes PySpark)</a></li><li><a href="#lightweight-installation-without-pyspark" class="table-of-contents__link toc-highlight">Lightweight Installation (without PySpark)</a></li><li><a href="#whats-included" class="table-of-contents__link toc-highlight">What&#39;s Included</a></li></ul></li><li><a href="#feature-comparison" class="table-of-contents__link toc-highlight">Feature Comparison</a></li><li><a href="#configuration" class="table-of-contents__link toc-highlight">Configuration</a><ul><li><a href="#with-standard-installation-pyspark-included" class="table-of-contents__link toc-highlight">With Standard Installation (PySpark included)</a></li><li><a href="#with-slim-installation-no-pyspark" class="table-of-contents__link toc-highlight">With Slim Installation (no PySpark)</a></li></ul></li><li><a href="#developer-guide" class="table-of-contents__link toc-highlight">Developer Guide</a><ul><li><a href="#implementation-pattern" class="table-of-contents__link toc-highlight">Implementation Pattern</a></li></ul></li><li><a href="#troubleshooting" class="table-of-contents__link toc-highlight">Troubleshooting</a><ul><li><a href="#error-pyspark-is-not-installed-but-is-required-for-profiling" class="table-of-contents__link toc-highlight">Error: &quot;PySpark is not installed, but is required for profiling&quot;</a></li><li><a href="#verifying-installation" class="table-of-contents__link toc-highlight">Verifying Installation</a></li></ul></li><li><a href="#migration-guide" class="table-of-contents__link toc-highlight">Migration Guide</a><ul><li><a href="#upgrading-from-previous-versions" 
class="table-of-contents__link toc-highlight">Upgrading from Previous Versions</a></li><li><a href="#no-breaking-changes" class="table-of-contents__link toc-highlight">No Breaking Changes</a></li></ul></li><li><a href="#benefits-for-datahub-actions" class="table-of-contents__link toc-highlight">Benefits for DataHub Actions</a><ul><li><a href="#reduced-installation-size" class="table-of-contents__link toc-highlight">Reduced Installation Size</a></li><li><a href="#faster-deployment" class="table-of-contents__link toc-highlight">Faster Deployment</a></li><li><a href="#fewer-dependency-conflicts" class="table-of-contents__link toc-highlight">Fewer Dependency Conflicts</a></li><li><a href="#when-actions-needs-profiling" class="table-of-contents__link toc-highlight">When Actions Needs Profiling</a></li></ul></li><li><a href="#benefits-summary" class="table-of-contents__link toc-highlight">Benefits Summary</a></li></ul></div></div></div></div></main></div></div><footer class="footer footer--dark"><div class="container container-fluid"><div class="row footer__links"><div class="col footer__col"><div class="footer__title">Docs</div><ul class="footer__items clean-list"><li class="footer__item"><a class="footer__link-item" href="/docs/">Introduction</a></li><li class="footer__item"><a class="footer__link-item" href="/docs/quickstart">Quickstart</a></li></ul></div><div class="col footer__col"><div class="footer__title">Community</div><ul class="footer__items clean-list"><li class="footer__item"><a href="https://datahub.com/slack" target="_blank" rel="noopener noreferrer" class="footer__link-item">Slack<svg width="13.5" height="13.5" aria-hidden="true" viewBox="0 0 24 24" class="iconExternalLink_nPIU"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg></a></li><li class="footer__item"><a href="https://www.youtube.com/channel/UC3qFQC5IiwR5fvWEqi_tJ5w" target="_blank" 
rel="noopener noreferrer" class="footer__link-item">YouTube<svg width="13.5" height="13.5" aria-hidden="true" viewBox="0 0 24 24" class="iconExternalLink_nPIU"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg></a></li><li class="footer__item"><a href="https://medium.com/datahub-project" target="_blank" rel="noopener noreferrer" class="footer__link-item">Blog<svg width="13.5" height="13.5" aria-hidden="true" viewBox="0 0 24 24" class="iconExternalLink_nPIU"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg></a></li><li class="footer__item"><a class="footer__link-item" href="/docs/townhalls">Town Halls</a></li><li class="footer__item"><a href="https://datahub.com/resources/?2004611554=dh-stories" target="_blank" rel="noopener noreferrer" class="footer__link-item">Customer Stories<svg width="13.5" height="13.5" aria-hidden="true" viewBox="0 0 24 24" class="iconExternalLink_nPIU"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg></a></li></ul></div><div class="col footer__col"><div class="footer__title">More</div><ul class="footer__items clean-list"><li class="footer__item"><a href="https://demo.datahub.com/" target="_blank" rel="noopener noreferrer" class="footer__link-item">Demo</a></li><li class="footer__item"><a href="https://feature-requests.datahubproject.io/roadmap" target="_blank" rel="noopener noreferrer" class="footer__link-item">Roadmap<svg width="13.5" height="13.5" aria-hidden="true" viewBox="0 0 24 24" class="iconExternalLink_nPIU"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg></a></li><li class="footer__item"><a class="footer__link-item" 
href="/docs/contributing">Contributing</a></li><li class="footer__item"><a href="https://github.com/datahub-project/datahub" target="_blank" rel="noopener noreferrer" class="footer__link-item">GitHub<svg width="13.5" height="13.5" aria-hidden="true" viewBox="0 0 24 24" class="iconExternalLink_nPIU"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg></a></li><li class="footer__item"><a href="https://feature-requests.datahubproject.io/" target="_blank" rel="noopener noreferrer" class="footer__link-item">Feature Requests<svg width="13.5" height="13.5" aria-hidden="true" viewBox="0 0 24 24" class="iconExternalLink_nPIU"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg></a></li></ul></div></div><div class="footer__bottom text--center"><div class="footer__copyright">Copyright © 2015-2025 DataHub Project Authors.</div></div></div></footer></div>
<script src="/assets/js/runtime~main.a43ac34d.js"></script>
<script src="/assets/js/main.86f3a4df.js"></script>
</body>
</html>