datahub/docs/pyspark/index.html
2025-11-20 14:25:26 +00:00

98 lines
75 KiB
HTML
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

<!doctype html>
<html lang="en" dir="ltr" class="docs-wrapper docs-doc-page docs-version-current plugin-docs plugin-id-default docs-doc-id-docs/PYSPARK" data-has-hydrated="false">
<head>
<meta charset="UTF-8">
<meta name="generator" content="Docusaurus v2.4.3">
<title data-rh="true">Optional PySpark Support for S3 Source | DataHub</title><meta data-rh="true" name="viewport" content="width=device-width,initial-scale=1"><meta data-rh="true" name="twitter:card" content="summary_large_image"><meta data-rh="true" property="og:url" content="https://docs.datahub.com/docs/pyspark"><meta data-rh="true" name="docusaurus_locale" content="en"><meta data-rh="true" name="docsearch:language" content="en"><meta data-rh="true" name="docusaurus_version" content="current"><meta data-rh="true" name="docusaurus_tag" content="docs-default-current"><meta data-rh="true" name="docsearch:version" content="current"><meta data-rh="true" name="docsearch:docusaurus_tag" content="docs-default-current"><meta data-rh="true" property="og:title" content="Optional PySpark Support for S3 Source | DataHub"><meta data-rh="true" name="description" content="DataHub&#x27;s S3 source now supports optional PySpark installation through the s3-slim variant. This allows users to choose a lightweight installation when data lake profiling is not needed."><meta data-rh="true" property="og:description" content="DataHub&#x27;s S3 source now supports optional PySpark installation through the s3-slim variant. This allows users to choose a lightweight installation when data lake profiling is not needed."><link data-rh="true" rel="icon" href="/img/favicon.ico"><link data-rh="true" rel="canonical" href="https://docs.datahub.com/docs/pyspark"><link data-rh="true" rel="alternate" href="https://docs.datahub.com/docs/pyspark" hreflang="en"><link data-rh="true" rel="alternate" href="https://docs.datahub.com/docs/pyspark" hreflang="x-default"><link data-rh="true" rel="preconnect" href="https://RK0UG797F3-dsn.algolia.net" crossorigin="anonymous"><link rel="alternate" type="application/rss+xml" href="/learn/rss.xml" title="DataHub RSS Feed">
<link rel="alternate" type="application/atom+xml" href="/learn/atom.xml" title="DataHub Atom Feed">
<link rel="preconnect" href="https://www.google-analytics.com">
<link rel="preconnect" href="https://www.googletagmanager.com">
<script async src="https://www.googletagmanager.com/gtag/js?id=G-PKGVLETT4C"></script>
<script>function gtag(){dataLayer.push(arguments)}window.dataLayer=window.dataLayer||[],gtag("js",new Date),gtag("config","G-PKGVLETT4C",{})</script>
<link rel="preconnect" href="https://www.googletagmanager.com">
<script>window.dataLayer=window.dataLayer||[]</script>
<script>!function(e,t,a,n,g){e[n]=e[n]||[],e[n].push({"gtm.start":(new Date).getTime(),event:"gtm.js"});var m=t.getElementsByTagName(a)[0],r=t.createElement(a);r.async=!0,r.src="https://www.googletagmanager.com/gtm.js?id=GTM-5M8T9HNN",m.parentNode.insertBefore(r,m)}(window,document,"script","dataLayer")</script>
<link rel="search" type="application/opensearchdescription+xml" title="DataHub" href="/opensearch.xml">
<meta httpequiv="Content-Security-Policy" content="frame-ancestors &#39;self&#39; https://*.acryl.io https://acryldata.io http://localhost:*">
<link rel="stylesheet" href="https://fonts.googleapis.com/css2?family=Manrope:wght@400;500;700&display=swap">
<script src="https://tools.luckyorange.com/core/lo.js?site-id=28ea8a38" async defer="defer"></script>
<script src="/scripts/rb2b.js" async defer="defer"></script>
<script src="https://app.revenuehero.io/scheduler.min.js"></script>
<script src="https://tag.clearbitscripts.com/v1/pk_2e321cabe30432a5c44c0424781aa35f/tags.js" referrerpolicy="strict-origin-when-cross-origin"></script>
<script src="/scripts/reo.js"></script>
<script id="runllm-widget-script" type="module" src="https://widget.runllm.com" crossorigin="true" runllm-name="DataHub" runllm-assistant-id="81" runllm-position="BOTTOM_RIGHT" runllm-keyboard-shortcut="Mod+j" runllm-preset="docusaurus" runllm-theme-color="#1890FF" runllm-brand-logo="https://docs.datahub.com/img/datahub-logo-color-mark.svg" runllm-community-url="https://datahub.com/slack" runllm-community-type="slack" runllm-disable-ask-a-person="true" async></script><link rel="stylesheet" href="/assets/css/styles.1e47f27f.css">
<link rel="preload" href="/assets/js/runtime~main.7bcf907e.js" as="script">
<link rel="preload" href="/assets/js/main.99352010.js" as="script">
</head>
<body class="navigation-with-keyboard">
<noscript><iframe src="https://www.googletagmanager.com/ns.html?id=GTM-5M8T9HNN" height="0" width="0" style="display:none;visibility:hidden"></iframe></noscript>
<script>!function(){function t(t){document.documentElement.setAttribute("data-theme",t)}var e=function(){var t=null;try{t=new URLSearchParams(window.location.search).get("docusaurus-theme")}catch(t){}return t}()||function(){var t=null;try{t=localStorage.getItem("theme")}catch(t){}return t}();t(null!==e?e:"light")}(),document.documentElement.setAttribute("data-announcement-bar-initially-dismissed",function(){try{return"true"===localStorage.getItem("docusaurus.announcement.dismiss")}catch(t){}return!1}())</script><div id="__docusaurus">
<div role="region" aria-label="Skip to main content"><a class="skipToContent_fXgn" href="#__docusaurus_skipToContent_fallback">Skip to main content</a></div><div class="announcementBar_mb4j" style="background-color:transparent;color:#ffffff" role="banner"><div class="content_knG7 announcementBarContent_xLdY"><div class="shimmer-banner"><p><strong>CONTEXT:</strong> The Future of Agentic AI is On Demand</p><a href="https://datahub.com/resources/context/?utm_term=docs" target="_blank" class="button"><div>Watch Now<span></span></div></a></div></div></div><nav aria-label="Main" class="navbar navbar--fixed-top"><div class="navbar__inner"><div class="navbar__items"><button aria-label="Toggle navigation bar" aria-expanded="false" class="navbar__toggle clean-btn" type="button"><svg width="30" height="30" viewBox="0 0 30 30" aria-hidden="true"><path stroke="currentColor" stroke-linecap="round" stroke-miterlimit="10" stroke-width="2" d="M4 7h22M4 15h22M4 23h22"></path></svg></button><a href="https://datahub.com" target="_blank" rel="noopener noreferrer" class="navbar__brand"><div class="navbar__logo"><img src="/img/datahub-logo-color-light-horizontal.svg" alt="DataHub Logo" class="themedImage_ToTc themedImage--light_HNdA"><img src="/img/datahub-logo-color-dark-horizontal.svg" alt="DataHub Logo" class="themedImage_ToTc themedImage--dark_i4oU"></div></a><div class="navbar__item dropdown dropdown--hoverable"><a href="#" aria-haspopup="true" aria-expanded="false" role="button" class="navbar__link versionNavItem_cbn8">Next</a><ul class="dropdown__menu"><li><a aria-current="page" class="dropdown__link dropdown__link--active" href="/docs/pyspark">Next</a></li><li><a class="dropdown__link" href="/docs/1.1.0/features">1.1.0</a></li><li><hr class="dropdown-separator" style="margin: 0.4rem;"></li><li><div class="dropdown__link"><b>Archived versions</b></div></li><li>
<a class="dropdown__link" href="https://docs-website-t9sv4w3gr-acryldata.vercel.app/docs/features">1.0.0
<svg width="12" height="12" aria-hidden="true" viewBox="0 0 24 24"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg>
</a>
</li><li>
<a class="dropdown__link" href="https://docs-website-t9sv4w3gr-acryldata.vercel.app/docs/0.15.0/features">0.15.0
<svg width="12" height="12" aria-hidden="true" viewBox="0 0 24 24"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg>
</a>
</li><li>
<a class="dropdown__link" href="https://docs-website-8jkm4uler-acryldata.vercel.app/docs/0.14.1/features">0.14.1
<svg width="12" height="12" aria-hidden="true" viewBox="0 0 24 24"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg>
</a>
</li><li>
<a class="dropdown__link" href="https://docs-website-eue2qafvn-acryldata.vercel.app/docs/features">0.14.0
<svg width="12" height="12" aria-hidden="true" viewBox="0 0 24 24"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg>
</a>
</li><li>
<a class="dropdown__link" href="https://docs-website-psat3nzgi-acryldata.vercel.app/docs/features">0.13.1
<svg width="12" height="12" aria-hidden="true" viewBox="0 0 24 24"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg>
</a>
</li><li>
<a class="dropdown__link" href="https://docs-website-lzxh86531-acryldata.vercel.app/docs/features">0.13.0
<svg width="12" height="12" aria-hidden="true" viewBox="0 0 24 24"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg>
</a>
</li><li>
<a class="dropdown__link" href="https://docs-website-2uuxmgza2-acryldata.vercel.app/docs/features">0.12.1
<svg width="12" height="12" aria-hidden="true" viewBox="0 0 24 24"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg>
</a>
</li><li>
<a class="dropdown__link" href="https://docs-website-irpoe2osc-acryldata.vercel.app/docs/features">0.11.0
<svg width="12" height="12" aria-hidden="true" viewBox="0 0 24 24"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg>
</a>
</li><li>
<a class="dropdown__link" href="https://docs-website-1gv2yzn9d-acryldata.vercel.app/docs/features">0.10.5
<svg width="12" height="12" aria-hidden="true" viewBox="0 0 24 24"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg>
</a>
</li></ul></div></div><div class="navbar__items navbar__items--right"><a aria-current="page" class="navbar__item navbar__link navbar__link--active" href="/docs">Docs</a><a class="navbar__item navbar__link" href="/integrations">Integrations</a><a href="https://datahub.com/slack?utm_source=docs&amp;utm_medium=header&amp;utm_campaign=docs_header" target="_blank" rel="noopener noreferrer" class="navbar__item navbar__link">
<style>
.slack-logo:hover {
opacity: 0.8;
}
</style>
<img class="slack-logo" src="https://upload.wikimedia.org/wikipedia/commons/d/d5/Slack_icon_2019.svg" , alt="slack" , height="20px" style="margin: 10px 0 0 0;">
</a><a href="https://github.com/datahub-project/datahub" target="_blank" rel="noopener noreferrer" class="navbar__item navbar__link">
<style>
.github-logo:hover {
opacity: 0.8;
}
</style>
<img class="github-logo" src="https://upload.wikimedia.org/wikipedia/commons/9/91/Octicons-mark-github.svg" , alt="slack" , height="20px" style="margin: 10px 0 0 0;">
</a><div class="searchBox_ZlJk"><button type="button" class="DocSearch DocSearch-Button" aria-label="Search"><span class="DocSearch-Button-Container"><svg width="20" height="20" class="DocSearch-Search-Icon" viewBox="0 0 20 20" aria-hidden="true"><path d="M14.386 14.386l4.0877 4.0877-4.0877-4.0877c-2.9418 2.9419-7.7115 2.9419-10.6533 0-2.9419-2.9418-2.9419-7.7115 0-10.6533 2.9418-2.9419 7.7115-2.9419 10.6533 0 2.9419 2.9418 2.9419 7.7115 0 10.6533z" stroke="currentColor" fill="none" fill-rule="evenodd" stroke-linecap="round" stroke-linejoin="round"></path></svg><span class="DocSearch-Button-Placeholder">Search</span></span><span class="DocSearch-Button-Keys"></span></button></div></div></div><div role="presentation" class="navbar-sidebar__backdrop"></div></nav><div id="__docusaurus_skipToContent_fallback" class="main-wrapper mainWrapper_z2l0 docsWrapper_BCFX"><button aria-label="Scroll back to top" class="clean-btn theme-back-to-top-button backToTopButton_sjWU" type="button"></button><div class="docPage__5DB"><main class="docMainContainer_gTbr docMainContainerEnhanced_Uz_u"><div class="container padding-top--md padding-bottom--lg"><div class="row"><div class="col docItemCol_VOVn"><div class="docItemContainer_Djhp"><article><span class="theme-doc-version-badge badge badge--secondary">Version: Next</span><div class="tocCollapsible_ETCw theme-doc-toc-mobile tocMobile_ITEo"><button type="button" class="clean-btn tocCollapsibleButton_TO0P">On this page</button></div><div class="theme-doc-markdown markdown"><h1>Optional PySpark Support for S3 Source</h1><p>DataHub&#x27;s S3 source now supports optional PySpark installation through the <code>s3-slim</code> variant. This allows users to choose a lightweight installation when data lake profiling is not needed.</p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="overview">Overview<a href="#overview" class="hash-link" aria-label="Direct link to Overview" title="Direct link to Overview"></a></h2><p>The S3 source includes PySpark by default for backward compatibility and profiling support. For users who only need metadata extraction without profiling, the <code>s3-slim</code> variant provides a ~500MB smaller installation.</p><p><strong>Current implementation status:</strong></p><ul><li><strong>S3</strong>: SparkProfiler pattern fully implemented (optional PySpark)</li><li><strong>ABS</strong>: Not yet implemented (still requires PySpark for profiling)</li><li><strong>Unity Catalog</strong>: Not affected by this change (uses separate profiling mechanisms)</li><li><strong>GCS</strong>: Does not support profiling</li></ul><blockquote><p><strong>Note:</strong> This change implements the SparkProfiler pattern for S3 only. The same pattern can be applied to other sources (ABS, etc.) in future PRs.</p></blockquote><h2 class="anchor anchorWithStickyNavbar_LWe7" id="pyspark-version">PySpark Version<a href="#pyspark-version" class="hash-link" aria-label="Direct link to PySpark Version" title="Direct link to PySpark Version"></a></h2><blockquote><p><strong>Current Version:</strong> PySpark 3.5.x (3.5.6)</p><p>PySpark 4.0 support is planned for a future release. Until then, all DataHub components use PySpark 3.5.x for compatibility and stability.</p></blockquote><h2 class="anchor anchorWithStickyNavbar_LWe7" id="installation-options">Installation Options<a href="#installation-options" class="hash-link" aria-label="Direct link to Installation Options" title="Direct link to Installation Options"></a></h2><h3 class="anchor anchorWithStickyNavbar_LWe7" id="standard-installation-includes-pyspark">Standard Installation (includes PySpark)<a href="#standard-installation-includes-pyspark" class="hash-link" aria-label="Direct link to Standard Installation (includes PySpark)" title="Direct link to Standard Installation (includes PySpark)"></a></h3><div class="language-bash codeBlockContainer_Ckt0 theme-code-block" style="--prism-color:#bfc7d5;--prism-background-color:#292d3e"><div class="codeBlockContent_biex"><pre tabindex="0" class="prism-code language-bash codeBlock_bY9V thin-scrollbar"><code class="codeBlockLines_e6Vv"><span class="token-line" style="color:#bfc7d5"><span class="token plain">pip </span><span class="token function" style="color:rgb(130, 170, 255)">install</span><span class="token plain"> </span><span class="token string" style="color:rgb(195, 232, 141)">&#x27;acryl-datahub[s3]&#x27;</span><span class="token plain"> </span><span class="token comment" style="color:rgb(105, 112, 152);font-style:italic"># S3 with PySpark/profiling support</span><br></span></code></pre><div class="buttonGroup__atx"><button type="button" aria-label="Copy code to clipboard" title="Copy" class="clean-btn"><span class="copyButtonIcons_eSgA" aria-hidden="true"><svg viewBox="0 0 24 24" class="copyButtonIcon_y97N"><path fill="currentColor" d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"></path></svg><svg viewBox="0 0 24 24" class="copyButtonSuccessIcon_LjdS"><path fill="currentColor" d="M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z"></path></svg></span></button></div></div></div><h3 class="anchor anchorWithStickyNavbar_LWe7" id="lightweight-installation-without-pyspark">Lightweight Installation (without PySpark)<a href="#lightweight-installation-without-pyspark" class="hash-link" aria-label="Direct link to Lightweight Installation (without PySpark)" title="Direct link to Lightweight Installation (without PySpark)"></a></h3><p>For installations where you don&#x27;t need profiling capabilities and want to save ~500MB:</p><div class="language-bash codeBlockContainer_Ckt0 theme-code-block" style="--prism-color:#bfc7d5;--prism-background-color:#292d3e"><div class="codeBlockContent_biex"><pre tabindex="0" class="prism-code language-bash codeBlock_bY9V thin-scrollbar"><code class="codeBlockLines_e6Vv"><span class="token-line" style="color:#bfc7d5"><span class="token plain">pip </span><span class="token function" style="color:rgb(130, 170, 255)">install</span><span class="token plain"> </span><span class="token string" style="color:rgb(195, 232, 141)">&#x27;acryl-datahub[s3-slim]&#x27;</span><span class="token plain"> </span><span class="token comment" style="color:rgb(105, 112, 152);font-style:italic"># S3 without profiling (~500MB smaller)</span><br></span></code></pre><div class="buttonGroup__atx"><button type="button" aria-label="Copy code to clipboard" title="Copy" class="clean-btn"><span class="copyButtonIcons_eSgA" aria-hidden="true"><svg viewBox="0 0 24 24" class="copyButtonIcon_y97N"><path fill="currentColor" d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"></path></svg><svg viewBox="0 0 24 24" class="copyButtonSuccessIcon_LjdS"><path fill="currentColor" d="M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z"></path></svg></span></button></div></div></div><p><strong>Recommendation:</strong> Use <code>s3-slim</code> when profiling is not needed.</p><p>The <code>data-lake-profiling</code> dependencies (included in standard <code>s3</code> by default):</p><ul><li><code>pyspark~=3.5.6</code></li><li><code>pydeequ&gt;=1.1.0</code></li><li>Profiling dependencies (cachetools)</li></ul><blockquote><p><strong>Note:</strong> In a future major release (e.g., DataHub 2.0), the <code>s3-slim</code> variant may become the default, and PySpark will be truly optional. This current approach provides backward compatibility while giving users time to adapt.</p></blockquote><h3 class="anchor anchorWithStickyNavbar_LWe7" id="whats-included">What&#x27;s Included<a href="#whats-included" class="hash-link" aria-label="Direct link to What&#x27;s Included" title="Direct link to What&#x27;s Included"></a></h3><p><strong>S3 source:</strong></p><p>Standard <code>s3</code> extra:</p><ul><li>✅ Metadata extraction (schemas, tables, file listing)</li><li>✅ Data format detection (Parquet, Avro, CSV, JSON, etc.)</li><li>✅ Schema inference from files</li><li>✅ Table and column-level metadata</li><li>✅ Tags and properties extraction</li><li>✅ Data profiling (min/max, nulls, distinct counts)</li><li>✅ Data quality checks (PyDeequ-based)</li><li>Includes: PySpark 3.5.6 + PyDeequ</li></ul><p><code>s3-slim</code> variant:</p><ul><li>✅ All metadata features (same as above)</li><li>❌ Data profiling disabled</li><li>No PySpark dependencies (~500MB smaller)</li></ul><h2 class="anchor anchorWithStickyNavbar_LWe7" id="feature-comparison">Feature Comparison<a href="#feature-comparison" class="hash-link" aria-label="Direct link to Feature Comparison" title="Direct link to Feature Comparison"></a></h2><table><thead><tr><th>Feature</th><th><code>s3-slim</code></th><th>Standard <code>s3</code></th></tr></thead><tbody><tr><td><strong>Metadata extraction</strong></td><td>✅ Full support</td><td>✅ Full support</td></tr><tr><td><strong>Schema inference</strong></td><td>✅ Full support</td><td>✅ Full support</td></tr><tr><td><strong>Tags &amp; properties</strong></td><td>✅ Full support</td><td>✅ Full support</td></tr><tr><td><strong>Data profiling</strong></td><td>❌ Not available</td><td>✅ Full profiling</td></tr><tr><td><strong>Installation size</strong></td><td>~200MB</td><td>~700MB</td></tr><tr><td><strong>Install time</strong></td><td>Fast</td><td>Slower (PySpark build)</td></tr><tr><td><strong>PySpark dependencies</strong></td><td>❌ None</td><td>✅ PySpark 3.5.6 + PyDeequ</td></tr></tbody></table><h2 class="anchor anchorWithStickyNavbar_LWe7" id="configuration">Configuration<a href="#configuration" class="hash-link" aria-label="Direct link to Configuration" title="Direct link to Configuration"></a></h2><h3 class="anchor anchorWithStickyNavbar_LWe7" id="with-standard-installation-pyspark-included">With Standard Installation (PySpark included)<a href="#with-standard-installation-pyspark-included" class="hash-link" aria-label="Direct link to With Standard Installation (PySpark included)" title="Direct link to With Standard Installation (PySpark included)"></a></h3><p>When you install <code>acryl-datahub[s3]</code>, profiling works out of the box:</p><div class="language-yaml codeBlockContainer_Ckt0 theme-code-block" style="--prism-color:#bfc7d5;--prism-background-color:#292d3e"><div class="codeBlockContent_biex"><pre tabindex="0" class="prism-code language-yaml codeBlock_bY9V thin-scrollbar"><code class="codeBlockLines_e6Vv"><span class="token-line" style="color:#bfc7d5"><span class="token key atrule">source</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain"> </span><span class="token key atrule">type</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain"> s3</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain"> </span><span class="token key atrule">config</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain"> </span><span class="token key atrule">path_specs</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(199, 146, 234)">-</span><span class="token plain"> </span><span class="token key atrule">include</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain"> s3</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain">//my</span><span class="token punctuation" style="color:rgb(199, 146, 234)">-</span><span class="token plain">bucket/data/</span><span class="token important">**/*.parquet</span><span class="token plain"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain"> </span><span class="token key atrule">profiling</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain"> </span><span class="token key atrule">enabled</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain"> </span><span class="token boolean important" style="color:rgb(255, 88, 116)">true</span><span class="token plain"> </span><span class="token comment" style="color:rgb(105, 112, 152);font-style:italic"># Works seamlessly with standard installation</span><span class="token plain"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain"> </span><span class="token key atrule">profile_table_level_only</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain"> </span><span class="token boolean important" style="color:rgb(255, 88, 116)">false</span><br></span></code></pre><div class="buttonGroup__atx"><button type="button" aria-label="Copy code to clipboard" title="Copy" class="clean-btn"><span class="copyButtonIcons_eSgA" aria-hidden="true"><svg viewBox="0 0 24 24" class="copyButtonIcon_y97N"><path fill="currentColor" d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"></path></svg><svg viewBox="0 0 24 24" class="copyButtonSuccessIcon_LjdS"><path fill="currentColor" d="M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z"></path></svg></span></button></div></div></div><h3 class="anchor anchorWithStickyNavbar_LWe7" id="with-slim-installation-no-pyspark">With Slim Installation (no PySpark)<a href="#with-slim-installation-no-pyspark" class="hash-link" aria-label="Direct link to With Slim Installation (no PySpark)" title="Direct link to With Slim Installation (no PySpark)"></a></h3><p>When you install <code>s3-slim</code>, disable profiling in your config:</p><div class="language-yaml codeBlockContainer_Ckt0 theme-code-block" style="--prism-color:#bfc7d5;--prism-background-color:#292d3e"><div class="codeBlockContent_biex"><pre tabindex="0" class="prism-code language-yaml codeBlock_bY9V thin-scrollbar"><code class="codeBlockLines_e6Vv"><span class="token-line" style="color:#bfc7d5"><span class="token key atrule">source</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain"> </span><span class="token key atrule">type</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain"> s3</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain"> </span><span class="token key atrule">config</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain"> </span><span class="token key atrule">path_specs</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(199, 146, 234)">-</span><span class="token plain"> </span><span class="token key atrule">include</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain"> s3</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain">//my</span><span class="token punctuation" style="color:rgb(199, 146, 234)">-</span><span class="token plain">bucket/data/</span><span class="token important">**/*.parquet</span><span class="token plain"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain"> </span><span class="token key atrule">profiling</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain"> </span><span class="token key atrule">enabled</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain"> </span><span class="token boolean important" style="color:rgb(255, 88, 116)">false</span><span class="token plain"> </span><span class="token comment" style="color:rgb(105, 112, 152);font-style:italic"># Required for s3-slim installation</span><br></span></code></pre><div class="buttonGroup__atx"><button type="button" aria-label="Copy code to clipboard" title="Copy" class="clean-btn"><span class="copyButtonIcons_eSgA" aria-hidden="true"><svg viewBox="0 0 24 24" class="copyButtonIcon_y97N"><path fill="currentColor" d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"></path></svg><svg viewBox="0 0 24 24" class="copyButtonSuccessIcon_LjdS"><path fill="currentColor" d="M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z"></path></svg></span></button></div></div></div><p><strong>If you enable profiling with s3-slim installation</strong>, you&#x27;ll see a clear error message at runtime:</p><div class="codeBlockContainer_Ckt0 theme-code-block" style="--prism-color:#bfc7d5;--prism-background-color:#292d3e"><div class="codeBlockContent_biex"><pre tabindex="0" class="prism-code language-text codeBlock_bY9V thin-scrollbar"><code class="codeBlockLines_e6Vv"><span class="token-line" style="color:#bfc7d5"><span class="token plain">RuntimeError: PySpark is not installed, but is required for S3 profiling.</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">Please install with: pip install &#x27;acryl-datahub[s3]&#x27;</span><br></span></code></pre><div class="buttonGroup__atx"><button type="button" aria-label="Copy code to clipboard" title="Copy" class="clean-btn"><span class="copyButtonIcons_eSgA" aria-hidden="true"><svg viewBox="0 0 24 24" class="copyButtonIcon_y97N"><path fill="currentColor" d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"></path></svg><svg viewBox="0 0 24 24" class="copyButtonSuccessIcon_LjdS"><path fill="currentColor" d="M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z"></path></svg></span></button></div></div></div><h2 class="anchor anchorWithStickyNavbar_LWe7" id="developer-guide">Developer Guide<a href="#developer-guide" class="hash-link" aria-label="Direct link to Developer Guide" title="Direct link to Developer Guide"></a></h2><h3 class="anchor anchorWithStickyNavbar_LWe7" id="implementation-pattern">Implementation Pattern<a href="#implementation-pattern" class="hash-link" aria-label="Direct link to Implementation Pattern" title="Direct link to Implementation Pattern"></a></h3><p>The S3 source demonstrates the recommended pattern for isolating PySpark-dependent code. This pattern can be applied to ABS and other sources in future PRs.</p><p><strong>Architecture (currently implemented for S3 only):</strong></p><ol><li><strong>Main source class</strong> (<code>source.py</code>) - Contains no PySpark imports at module level</li><li><strong>Profiler class</strong> (<code>profiling.py</code>) - Encapsulates all PySpark/PyDeequ logic in <code>SparkProfiler</code> class</li><li><strong>Conditional instantiation</strong> - <code>SparkProfiler</code> created only when profiling is enabled</li><li><strong>TYPE_CHECKING imports</strong> - Type annotations use TYPE_CHECKING block for optional dependencies</li></ol><p><strong>Key Benefits:</strong></p><ul><li>✅ Type safety preserved (mypy passes without issues)</li><li>✅ Proper code layer separation</li><li>✅ Works with both standard and <code>-slim</code> installations</li><li>✅ Clear error messages when dependencies missing</li><li>✅ Pattern can be reused for ABS and other sources</li></ul><p><strong>Example structure:</strong></p><div class="language-python codeBlockContainer_Ckt0 theme-code-block" style="--prism-color:#bfc7d5;--prism-background-color:#292d3e"><div class="codeBlockContent_biex"><pre tabindex="0" class="prism-code language-python codeBlock_bY9V thin-scrollbar"><code class="codeBlockLines_e6Vv"><span class="token-line" style="color:#bfc7d5"><span class="token comment" style="color:rgb(105, 112, 152);font-style:italic"># source.py</span><span class="token plain"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain"></span><span class="token keyword" style="font-style:italic">if</span><span class="token plain"> TYPE_CHECKING</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain"> </span><span class="token keyword" style="font-style:italic">from</span><span class="token plain"> datahub</span><span class="token punctuation" style="color:rgb(199, 146, 234)">.</span><span class="token plain">ingestion</span><span class="token punctuation" style="color:rgb(199, 146, 234)">.</span><span class="token plain">source</span><span class="token punctuation" style="color:rgb(199, 146, 234)">.</span><span class="token plain">s3</span><span class="token punctuation" style="color:rgb(199, 146, 234)">.</span><span class="token plain">profiling </span><span class="token keyword" style="font-style:italic">import</span><span class="token plain"> SparkProfiler</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain"></span><span class="token keyword" style="font-style:italic">class</span><span class="token plain"> </span><span class="token class-name" style="color:rgb(255, 203, 107)">S3Source</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain"> profiler</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain"> Optional</span><span class="token punctuation" style="color:rgb(199, 146, 234)">[</span><span class="token string" style="color:rgb(195, 232, 141)">&quot;SparkProfiler&quot;</span><span class="token punctuation" style="color:rgb(199, 146, 234)">]</span><span class="token plain"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain"> </span><span class="token keyword" style="font-style:italic">def</span><span class="token plain"> </span><span class="token function" style="color:rgb(130, 170, 255)">__init__</span><span class="token punctuation" style="color:rgb(199, 146, 234)">(</span><span class="token plain">self</span><span class="token punctuation" style="color:rgb(199, 146, 234)">,</span><span class="token plain"> config</span><span class="token punctuation" style="color:rgb(199, 146, 234)">,</span><span class="token plain"> ctx</span><span class="token punctuation" style="color:rgb(199, 146, 234)">)</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain"> </span><span class="token keyword" style="font-style:italic">if</span><span class="token plain"> config</span><span class="token punctuation" style="color:rgb(199, 146, 234)">.</span><span class="token plain">is_profiling_enabled</span><span class="token punctuation" style="color:rgb(199, 146, 234)">(</span><span class="token punctuation" style="color:rgb(199, 146, 234)">)</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain"> </span><span class="token keyword" style="font-style:italic">from</span><span class="token plain"> datahub</span><span class="token punctuation" style="color:rgb(199, 146, 234)">.</span><span class="token plain">ingestion</span><span class="token punctuation" style="color:rgb(199, 146, 234)">.</span><span class="token plain">source</span><span class="token punctuation" style="color:rgb(199, 146, 234)">.</span><span class="token plain">s3</span><span class="token punctuation" style="color:rgb(199, 146, 234)">.</span><span class="token plain">profiling </span><span class="token keyword" style="font-style:italic">import</span><span class="token plain"> SparkProfiler</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain"> self</span><span class="token punctuation" style="color:rgb(199, 146, 234)">.</span><span class="token plain">profiler </span><span class="token operator" style="color:rgb(137, 221, 255)">=</span><span class="token plain"> SparkProfiler</span><span class="token punctuation" style="color:rgb(199, 146, 234)">(</span><span class="token punctuation" style="color:rgb(199, 146, 234)">.</span><span class="token punctuation" style="color:rgb(199, 146, 234)">.</span><span class="token punctuation" style="color:rgb(199, 146, 234)">.</span><span class="token punctuation" style="color:rgb(199, 146, 234)">)</span><span class="token plain"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain"> </span><span class="token keyword" style="font-style:italic">else</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain"> self</span><span class="token punctuation" style="color:rgb(199, 146, 234)">.</span><span class="token plain">profiler </span><span class="token operator" style="color:rgb(137, 221, 255)">=</span><span class="token plain"> </span><span class="token boolean" style="color:rgb(255, 88, 116)">None</span><br></span></code></pre><div class="buttonGroup__atx"><button type="button" aria-label="Copy code to clipboard" title="Copy" class="clean-btn"><span class="copyButtonIcons_eSgA" aria-hidden="true"><svg viewBox="0 0 24 24" class="copyButtonIcon_y97N"><path fill="currentColor" d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"></path></svg><svg viewBox="0 0 24 24" class="copyButtonSuccessIcon_LjdS"><path fill="currentColor" d="M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z"></path></svg></span></button></div></div></div><div class="language-python codeBlockContainer_Ckt0 theme-code-block" style="--prism-color:#bfc7d5;--prism-background-color:#292d3e"><div class="codeBlockContent_biex"><pre tabindex="0" class="prism-code language-python codeBlock_bY9V thin-scrollbar"><code class="codeBlockLines_e6Vv"><span class="token-line" style="color:#bfc7d5"><span class="token comment" style="color:rgb(105, 112, 152);font-style:italic"># profiling.py</span><span class="token plain"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain"></span><span class="token keyword" style="font-style:italic">class</span><span class="token plain"> </span><span class="token class-name" style="color:rgb(255, 203, 107)">SparkProfiler</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain"> </span><span class="token triple-quoted-string string" style="color:rgb(195, 232, 141)">&quot;&quot;&quot;Encapsulates all PySpark/PyDeequ profiling logic.&quot;&quot;&quot;</span><span class="token plain"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain"> </span><span class="token keyword" style="font-style:italic">def</span><span class="token plain"> </span><span class="token function" style="color:rgb(130, 170, 255)">init_spark</span><span class="token punctuation" style="color:rgb(199, 146, 234)">(</span><span class="token plain">self</span><span class="token punctuation" style="color:rgb(199, 146, 234)">)</span><span class="token plain"> </span><span class="token operator" style="color:rgb(137, 221, 255)">-</span><span class="token operator" style="color:rgb(137, 221, 255)">&gt;</span><span class="token plain"> Any</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain"> </span><span class="token comment" style="color:rgb(105, 112, 152);font-style:italic"># Spark session initialization</span><span class="token plain"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain"> </span><span class="token keyword" style="font-style:italic">def</span><span class="token plain"> </span><span class="token function" style="color:rgb(130, 170, 255)">read_file_spark</span><span class="token punctuation" style="color:rgb(199, 146, 234)">(</span><span class="token plain">self</span><span class="token punctuation" style="color:rgb(199, 146, 234)">,</span><span class="token plain"> </span><span class="token builtin" style="color:rgb(130, 170, 255)">file</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain"> </span><span class="token builtin" style="color:rgb(130, 170, 255)">str</span><span class="token punctuation" style="color:rgb(199, 146, 234)">,</span><span class="token plain"> ext</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain"> </span><span class="token builtin" style="color:rgb(130, 170, 255)">str</span><span class="token punctuation" style="color:rgb(199, 146, 234)">)</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain"> </span><span class="token comment" style="color:rgb(105, 112, 152);font-style:italic"># File reading with Spark</span><span class="token plain"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain"> </span><span class="token keyword" style="font-style:italic">def</span><span class="token plain"> </span><span class="token function" style="color:rgb(130, 170, 255)">get_table_profile</span><span class="token punctuation" style="color:rgb(199, 146, 234)">(</span><span class="token plain">self</span><span class="token punctuation" style="color:rgb(199, 146, 234)">,</span><span class="token plain"> table_data</span><span class="token punctuation" style="color:rgb(199, 146, 234)">,</span><span class="token plain"> dataset_urn</span><span class="token punctuation" style="color:rgb(199, 146, 234)">)</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain"> </span><span class="token comment" style="color:rgb(105, 112, 152);font-style:italic"># Table profiling coordination</span><br></span></code></pre><div class="buttonGroup__atx"><button type="button" aria-label="Copy code to clipboard" title="Copy" class="clean-btn"><span class="copyButtonIcons_eSgA" aria-hidden="true"><svg viewBox="0 0 24 24" class="copyButtonIcon_y97N"><path fill="currentColor" d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"></path></svg><svg viewBox="0 0 24 24" class="copyButtonSuccessIcon_LjdS"><path fill="currentColor" d="M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z"></path></svg></span></button></div></div></div><p>For more details, see the <a href="/docs/metadata-ingestion/adding-source#31-using-optional-dependencies-eg-pyspark">Adding a Metadata Ingestion Source</a> guide.</p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="troubleshooting">Troubleshooting<a href="#troubleshooting" class="hash-link" aria-label="Direct link to Troubleshooting" title="Direct link to Troubleshooting"></a></h2><h3 class="anchor anchorWithStickyNavbar_LWe7" id="error-pyspark-is-not-installed-but-is-required-for-profiling">Error: &quot;PySpark is not installed, but is required for profiling&quot;<a href="#error-pyspark-is-not-installed-but-is-required-for-profiling" class="hash-link" aria-label="Direct link to Error: &quot;PySpark is not installed, but is required for profiling&quot;" title="Direct link to Error: &quot;PySpark is not installed, but is required for profiling&quot;"></a></h3><p><strong>Problem:</strong> You installed a <code>-slim</code> variant but have profiling enabled in your config.</p><p><strong>Solutions:</strong></p><ol><li><p><strong>Recommended:</strong> Use standard installation with PySpark:</p><div class="language-bash codeBlockContainer_Ckt0 theme-code-block" style="--prism-color:#bfc7d5;--prism-background-color:#292d3e"><div class="codeBlockContent_biex"><pre tabindex="0" class="prism-code language-bash codeBlock_bY9V thin-scrollbar"><code class="codeBlockLines_e6Vv"><span class="token-line" style="color:#bfc7d5"><span class="token plain">pip uninstall acryl-datahub</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">pip </span><span class="token function" style="color:rgb(130, 170, 255)">install</span><span class="token plain"> </span><span class="token string" style="color:rgb(195, 232, 141)">&#x27;acryl-datahub[s3]&#x27;</span><span class="token plain"> </span><span class="token comment" style="color:rgb(105, 112, 152);font-style:italic"># For S3 profiling</span><br></span></code></pre><div class="buttonGroup__atx"><button type="button" aria-label="Copy code to clipboard" title="Copy" class="clean-btn"><span class="copyButtonIcons_eSgA" aria-hidden="true"><svg viewBox="0 0 24 24" class="copyButtonIcon_y97N"><path fill="currentColor" d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"></path></svg><svg viewBox="0 0 24 24" class="copyButtonSuccessIcon_LjdS"><path fill="currentColor" d="M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z"></path></svg></span></button></div></div></div></li><li><p><strong>Alternative:</strong> Disable profiling in your recipe:</p><div class="language-yaml codeBlockContainer_Ckt0 theme-code-block" style="--prism-color:#bfc7d5;--prism-background-color:#292d3e"><div class="codeBlockContent_biex"><pre tabindex="0" class="prism-code language-yaml codeBlock_bY9V thin-scrollbar"><code class="codeBlockLines_e6Vv"><span class="token-line" style="color:#bfc7d5"><span class="token key atrule">profiling</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain"> </span><span class="token key atrule">enabled</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain"> </span><span class="token boolean important" style="color:rgb(255, 88, 116)">false</span><br></span></code></pre><div class="buttonGroup__atx"><button type="button" aria-label="Copy code to clipboard" title="Copy" class="clean-btn"><span class="copyButtonIcons_eSgA" aria-hidden="true"><svg viewBox="0 0 24 24" class="copyButtonIcon_y97N"><path fill="currentColor" d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"></path></svg><svg viewBox="0 0 24 24" class="copyButtonSuccessIcon_LjdS"><path fill="currentColor" d="M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z"></path></svg></span></button></div></div></div></li></ol><h3 class="anchor anchorWithStickyNavbar_LWe7" id="verifying-installation">Verifying Installation<a href="#verifying-installation" class="hash-link" aria-label="Direct link to Verifying Installation" title="Direct link to Verifying Installation"></a></h3><p>Check if PySpark is installed:</p><div class="language-bash codeBlockContainer_Ckt0 theme-code-block" style="--prism-color:#bfc7d5;--prism-background-color:#292d3e"><div class="codeBlockContent_biex"><pre tabindex="0" class="prism-code language-bash codeBlock_bY9V thin-scrollbar"><code class="codeBlockLines_e6Vv"><span class="token-line" style="color:#bfc7d5"><span class="token comment" style="color:rgb(105, 112, 152);font-style:italic"># Check installed packages</span><span class="token plain"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">pip list </span><span class="token operator" style="color:rgb(137, 221, 255)">|</span><span class="token plain"> </span><span class="token function" style="color:rgb(130, 170, 255)">grep</span><span class="token plain"> pyspark</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain"></span><span class="token comment" style="color:rgb(105, 112, 152);font-style:italic"># Test import in Python</span><span class="token plain"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">python -c </span><span class="token string" style="color:rgb(195, 232, 141)">&quot;import pyspark; print(pyspark.__version__)&quot;</span><br></span></code></pre><div class="buttonGroup__atx"><button type="button" aria-label="Copy code to clipboard" title="Copy" class="clean-btn"><span class="copyButtonIcons_eSgA" aria-hidden="true"><svg viewBox="0 0 24 24" class="copyButtonIcon_y97N"><path fill="currentColor" d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"></path></svg><svg viewBox="0 0 24 24" class="copyButtonSuccessIcon_LjdS"><path fill="currentColor" d="M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z"></path></svg></span></button></div></div></div><p>Expected output:</p><ul><li>Standard installation (<code>s3</code>): Shows <code>pyspark 3.5.x</code></li><li>Slim installation (<code>s3-slim</code>): Import fails or package not found</li></ul><h2 class="anchor anchorWithStickyNavbar_LWe7" id="migration-guide">Migration Guide<a href="#migration-guide" class="hash-link" aria-label="Direct link to Migration Guide" title="Direct link to Migration Guide"></a></h2><h3 class="anchor anchorWithStickyNavbar_LWe7" id="upgrading-from-previous-versions">Upgrading from Previous Versions<a href="#upgrading-from-previous-versions" class="hash-link" aria-label="Direct link to Upgrading from Previous Versions" title="Direct link to Upgrading from Previous Versions"></a></h3><p><strong>No action required!</strong> This change is fully backward compatible:</p><div class="language-bash codeBlockContainer_Ckt0 theme-code-block" style="--prism-color:#bfc7d5;--prism-background-color:#292d3e"><div class="codeBlockContent_biex"><pre tabindex="0" class="prism-code language-bash codeBlock_bY9V thin-scrollbar"><code class="codeBlockLines_e6Vv"><span class="token-line" style="color:#bfc7d5"><span class="token comment" style="color:rgb(105, 112, 152);font-style:italic"># Existing installations continue to work exactly as before</span><span class="token plain"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">pip </span><span class="token function" style="color:rgb(130, 170, 255)">install</span><span class="token plain"> </span><span class="token string" style="color:rgb(195, 232, 141)">&#x27;acryl-datahub[s3]&#x27;</span><span class="token plain"> </span><span class="token comment" style="color:rgb(105, 112, 152);font-style:italic"># Still includes PySpark by default (profiling supported)</span><br></span></code></pre><div class="buttonGroup__atx"><button type="button" aria-label="Copy code to clipboard" title="Copy" class="clean-btn"><span class="copyButtonIcons_eSgA" aria-hidden="true"><svg viewBox="0 0 24 24" class="copyButtonIcon_y97N"><path fill="currentColor" d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"></path></svg><svg viewBox="0 0 24 24" class="copyButtonSuccessIcon_LjdS"><path fill="currentColor" d="M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z"></path></svg></span></button></div></div></div><p><strong>Recommended: Optimize installations</strong></p><ul><li><strong>S3 with profiling:</strong> Keep using <code>acryl-datahub[s3]</code> (includes PySpark)</li><li><strong>S3 without profiling:</strong> Switch to <code>acryl-datahub[s3-slim]</code> to save ~500MB</li></ul><div class="language-bash codeBlockContainer_Ckt0 theme-code-block" style="--prism-color:#bfc7d5;--prism-background-color:#292d3e"><div class="codeBlockContent_biex"><pre tabindex="0" class="prism-code language-bash codeBlock_bY9V thin-scrollbar"><code class="codeBlockLines_e6Vv"><span class="token-line" style="color:#bfc7d5"><span class="token comment" style="color:rgb(105, 112, 152);font-style:italic"># Recommended installations</span><span class="token plain"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">pip </span><span class="token function" style="color:rgb(130, 170, 255)">install</span><span class="token plain"> </span><span class="token string" style="color:rgb(195, 232, 141)">&#x27;acryl-datahub[s3]&#x27;</span><span class="token plain"> </span><span class="token comment" style="color:rgb(105, 112, 152);font-style:italic"># S3 with profiling support</span><span class="token plain"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">pip </span><span class="token function" style="color:rgb(130, 170, 255)">install</span><span class="token plain"> </span><span class="token string" style="color:rgb(195, 232, 141)">&#x27;acryl-datahub[s3-slim]&#x27;</span><span class="token plain"> </span><span class="token comment" style="color:rgb(105, 112, 152);font-style:italic"># S3 metadata only (no profiling)</span><br></span></code></pre><div class="buttonGroup__atx"><button type="button" aria-label="Copy code to clipboard" title="Copy" class="clean-btn"><span class="copyButtonIcons_eSgA" aria-hidden="true"><svg viewBox="0 0 24 24" class="copyButtonIcon_y97N"><path fill="currentColor" d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"></path></svg><svg viewBox="0 0 24 24" class="copyButtonSuccessIcon_LjdS"><path fill="currentColor" d="M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z"></path></svg></span></button></div></div></div><h3 class="anchor anchorWithStickyNavbar_LWe7" id="no-breaking-changes">No Breaking Changes<a href="#no-breaking-changes" class="hash-link" aria-label="Direct link to No Breaking Changes" title="Direct link to No Breaking Changes"></a></h3><p>This implementation maintains full backward compatibility:</p><ul><li>Standard <code>s3</code> extra includes PySpark (unchanged behavior)</li><li>All existing recipes and configs continue to work</li><li>New <code>s3-slim</code> variant available for users who want smaller installations</li><li>Future DataHub 2.0 may flip defaults, but provides migration path</li></ul><h2 class="anchor anchorWithStickyNavbar_LWe7" id="benefits-for-datahub-actions">Benefits for DataHub Actions<a href="#benefits-for-datahub-actions" class="hash-link" aria-label="Direct link to Benefits for DataHub Actions" title="Direct link to Benefits for DataHub Actions"></a></h2><p><a href="https://github.com/datahub-project/datahub/tree/master/datahub-actions" target="_blank" rel="noopener noreferrer">DataHub Actions</a> depends on <code>acryl-datahub</code> and can benefit from <code>s3-slim</code> when profiling is not needed:</p><h3 class="anchor anchorWithStickyNavbar_LWe7" id="reduced-installation-size">Reduced Installation Size<a href="#reduced-installation-size" class="hash-link" aria-label="Direct link to Reduced Installation Size" title="Direct link to Reduced Installation Size"></a></h3><p>DataHub Actions typically doesn&#x27;t need data lake profiling capabilities since it focuses on reacting to metadata events, not extracting metadata from data lakes. Use <code>s3-slim</code> to reduce footprint:</p><div class="language-bash codeBlockContainer_Ckt0 theme-code-block" style="--prism-color:#bfc7d5;--prism-background-color:#292d3e"><div class="codeBlockContent_biex"><pre tabindex="0" class="prism-code language-bash codeBlock_bY9V thin-scrollbar"><code class="codeBlockLines_e6Vv"><span class="token-line" style="color:#bfc7d5"><span class="token comment" style="color:rgb(105, 112, 152);font-style:italic"># If Actions needs S3 metadata access but not profiling</span><span class="token plain"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">pip </span><span class="token function" style="color:rgb(130, 170, 255)">install</span><span class="token plain"> acryl-datahub-actions</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">pip </span><span class="token function" style="color:rgb(130, 170, 255)">install</span><span class="token plain"> </span><span class="token string" style="color:rgb(195, 232, 141)">&#x27;acryl-datahub[s3-slim]&#x27;</span><span class="token plain"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain"></span><span class="token comment" style="color:rgb(105, 112, 152);font-style:italic"># Result: ~500MB smaller than standard s3 extra</span><span class="token plain"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain"></span><span class="token comment" style="color:rgb(105, 112, 152);font-style:italic"># If Actions needs full S3 with profiling</span><span class="token plain"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">pip </span><span class="token function" style="color:rgb(130, 170, 255)">install</span><span class="token plain"> acryl-datahub-actions</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">pip </span><span class="token function" style="color:rgb(130, 170, 255)">install</span><span class="token plain"> </span><span class="token string" style="color:rgb(195, 232, 141)">&#x27;acryl-datahub[s3]&#x27;</span><span class="token plain"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain"></span><span class="token comment" style="color:rgb(105, 112, 152);font-style:italic"># Result: Includes PySpark for profiling capabilities</span><br></span></code></pre><div class="buttonGroup__atx"><button type="button" aria-label="Copy code to clipboard" title="Copy" class="clean-btn"><span class="copyButtonIcons_eSgA" aria-hidden="true"><svg viewBox="0 0 24 24" class="copyButtonIcon_y97N"><path fill="currentColor" d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"></path></svg><svg viewBox="0 0 24 24" class="copyButtonSuccessIcon_LjdS"><path fill="currentColor" d="M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z"></path></svg></span></button></div></div></div><h3 class="anchor anchorWithStickyNavbar_LWe7" id="faster-deployment">Faster Deployment<a href="#faster-deployment" class="hash-link" aria-label="Direct link to Faster Deployment" title="Direct link to Faster Deployment"></a></h3><p>Actions services using <code>s3-slim</code> deploy faster in containerized environments:</p><ul><li><strong>Faster pip install</strong>: No PySpark compilation required</li><li><strong>Smaller Docker images</strong>: Reduced base image size</li><li><strong>Quicker cold starts</strong>: Less code to load and initialize</li></ul><h3 class="anchor anchorWithStickyNavbar_LWe7" id="fewer-dependency-conflicts">Fewer Dependency Conflicts<a href="#fewer-dependency-conflicts" class="hash-link" aria-label="Direct link to Fewer Dependency Conflicts" title="Direct link to Fewer Dependency Conflicts"></a></h3><p>Actions workflows often integrate with other tools (Slack, Teams, email services). Using <code>s3-slim</code> reduces:</p><ul><li>Python version constraint conflicts</li><li>Java/Spark runtime conflicts in restricted environments</li><li>Transitive dependency version mismatches</li></ul><h3 class="anchor anchorWithStickyNavbar_LWe7" id="when-actions-needs-profiling">When Actions Needs Profiling<a href="#when-actions-needs-profiling" class="hash-link" aria-label="Direct link to When Actions Needs Profiling" title="Direct link to When Actions Needs Profiling"></a></h3><p>If your Actions workflow needs to trigger data lake profiling jobs, use the standard extra:</p><div class="language-bash codeBlockContainer_Ckt0 theme-code-block" style="--prism-color:#bfc7d5;--prism-background-color:#292d3e"><div class="codeBlockContent_biex"><pre tabindex="0" class="prism-code language-bash codeBlock_bY9V thin-scrollbar"><code class="codeBlockLines_e6Vv"><span class="token-line" style="color:#bfc7d5"><span class="token comment" style="color:rgb(105, 112, 152);font-style:italic"># Actions with data lake profiling capability</span><span class="token plain"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">pip </span><span class="token function" style="color:rgb(130, 170, 255)">install</span><span class="token plain"> </span><span class="token string" style="color:rgb(195, 232, 141)">&#x27;acryl-datahub-actions&#x27;</span><span class="token plain"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">pip </span><span class="token function" style="color:rgb(130, 170, 255)">install</span><span class="token plain"> </span><span class="token string" style="color:rgb(195, 232, 141)">&#x27;acryl-datahub[s3]&#x27;</span><span class="token plain"> </span><span class="token comment" style="color:rgb(105, 112, 152);font-style:italic"># Includes PySpark by default</span><br></span></code></pre><div class="buttonGroup__atx"><button type="button" aria-label="Copy code to clipboard" title="Copy" class="clean-btn"><span class="copyButtonIcons_eSgA" aria-hidden="true"><svg viewBox="0 0 24 24" class="copyButtonIcon_y97N"><path fill="currentColor" d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"></path></svg><svg viewBox="0 0 24 24" class="copyButtonSuccessIcon_LjdS"><path fill="currentColor" d="M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z"></path></svg></span></button></div></div></div><p><strong>Common Actions use cases that DON&#x27;T need PySpark:</strong></p><ul><li>Slack notifications on schema changes</li><li>Propagating tags and terms to downstream systems</li><li>Triggering dbt runs on metadata updates</li><li>Sending emails on data quality failures</li><li>Creating Jira tickets for governance issues</li><li>Updating external catalogs (e.g., Alation, Collibra)</li></ul><p><strong>Rare Actions use cases that MIGHT need PySpark:</strong></p><ul><li>Custom actions that programmatically trigger S3 profiling</li><li>Actions that directly process data lake files (not typical)</li></ul><h2 class="anchor anchorWithStickyNavbar_LWe7" id="benefits-summary">Benefits Summary<a href="#benefits-summary" class="hash-link" aria-label="Direct link to Benefits Summary" title="Direct link to Benefits Summary"></a></h2><p><strong>Backward compatible</strong>: Standard <code>s3</code> extra unchanged, existing users unaffected
<strong>Smaller installations</strong>: Save ~500MB with <code>s3-slim</code>
<strong>Faster setup</strong>: No PySpark compilation with <code>s3-slim</code>
<strong>Flexible deployment</strong>: Choose based on profiling needs
<strong>Type safety maintained</strong>: Refactored with proper code layer separation (mypy passes)
<strong>Clear error messages</strong>: Runtime errors guide users to correct installation
<strong>Actions-friendly</strong>: DataHub Actions benefits from reduced footprint with <code>s3-slim</code></p><p><strong>Key Takeaways:</strong></p><ul><li>Use <code>s3</code> if you need S3 profiling, <code>s3-slim</code> if you don&#x27;t</li><li>Pattern can be applied to other sources (ABS, etc.) in future PRs</li><li>Existing installations continue working without changes</li></ul></div><footer class="theme-doc-footer docusaurus-mt-lg"><div class="slackUtm_uoBr"><div class="slackUtm_uoBr"><hr>Need more help? Join the conversation in <a href="https://datahub.com/slack?utm_source=docs&amp;utm_medium=footer&amp;utm_campaign=docs_footer&amp;utm_content=docs/PYSPARK">Slack!</a></div></div><div class="theme-doc-footer-edit-meta-row row"><div class="col"><a href="https://github.com/datahub-project/datahub/blob/master/docs/PYSPARK.md" target="_blank" rel="noreferrer noopener" class="theme-edit-this-page"><svg fill="currentColor" height="20" width="20" viewBox="0 0 40 40" class="iconEdit_Z9Sw" aria-hidden="true"><g><path d="m34.5 11.7l-3 3.1-6.3-6.3 3.1-3q0.5-0.5 1.2-0.5t1.1 0.5l3.9 3.9q0.5 0.4 0.5 1.1t-0.5 1.2z m-29.5 17.1l18.4-18.5 6.3 6.3-18.4 18.4h-6.3v-6.2z"></path></g></svg>Edit this page</a></div><div class="col lastUpdated_VsjB"></div></div></footer><div class="feedbackWrapper_mUHF"><div class="feedbackWidget_PX4d"><div class="feedbackButtons_wn3V"><strong>Is this page helpful?</strong><div><button class="feedbackButton_UgQs"><span role="img" aria-label="like" class="anticon anticon-like"><svg viewBox="64 64 896 896" focusable="false" data-icon="like" width="1em" height="1em" fill="currentColor" aria-hidden="true"><path d="M885.9 533.7c16.8-22.2 26.1-49.4 26.1-77.7 0-44.9-25.1-87.4-65.5-111.1a67.67 67.67 0 00-34.3-9.3H572.4l6-122.9c1.4-29.7-9.1-57.9-29.5-79.4A106.62 106.62 0 00471 99.9c-52 0-98 35-111.8 85.1l-85.9 311H144c-17.7 0-32 14.3-32 32v364c0 17.7 14.3 32 32 32h601.3c9.2 0 18.2-1.8 26.5-5.4 47.6-20.3 78.3-66.8 78.3-118.4 0-12.6-1.8-25-5.4-37 16.8-22.2 26.1-49.4 26.1-77.7 0-12.6-1.8-25-5.4-37 16.8-22.2 26.1-49.4 26.1-77.7-.2-12.6-2-25.1-5.6-37.1zM184 852V568h81v284h-81zm636.4-353l-21.9 19 13.9 25.4a56.2 56.2 0 016.9 27.3c0 16.5-7.2 32.2-19.6 43l-21.9 19 13.9 25.4a56.2 56.2 0 016.9 27.3c0 16.5-7.2 32.2-19.6 43l-21.9 19 13.9 25.4a56.2 56.2 0 016.9 27.3c0 22.4-13.2 42.6-33.6 51.8H329V564.8l99.5-360.5a44.1 44.1 0 0142.2-32.3c7.6 0 15.1 2.2 21.1 6.7 9.9 7.4 15.2 18.6 14.6 30.5l-9.6 198.4h314.4C829 418.5 840 436.9 840 456c0 16.5-7.2 32.1-19.6 43z"></path></svg></span></button><button class="feedbackButton_UgQs"><span role="img" aria-label="dislike" class="anticon anticon-dislike"><svg viewBox="64 64 896 896" focusable="false" data-icon="dislike" width="1em" height="1em" fill="currentColor" aria-hidden="true"><path d="M885.9 490.3c3.6-12 5.4-24.4 5.4-37 0-28.3-9.3-55.5-26.1-77.7 3.6-12 5.4-24.4 5.4-37 0-28.3-9.3-55.5-26.1-77.7 3.6-12 5.4-24.4 5.4-37 0-51.6-30.7-98.1-78.3-118.4a66.1 66.1 0 00-26.5-5.4H144c-17.7 0-32 14.3-32 32v364c0 17.7 14.3 32 32 32h129.3l85.8 310.8C372.9 889 418.9 924 470.9 924c29.7 0 57.4-11.8 77.9-33.4 20.5-21.5 31-49.7 29.5-79.4l-6-122.9h239.9c12.1 0 23.9-3.2 34.3-9.3 40.4-23.5 65.5-66.1 65.5-111 0-28.3-9.3-55.5-26.1-77.7zM184 456V172h81v284h-81zm627.2 160.4H496.8l9.6 198.4c.6 11.9-4.7 23.1-14.6 30.5-6.1 4.5-13.6 6.8-21.1 6.7a44.28 44.28 0 01-42.2-32.3L329 459.2V172h415.4a56.85 56.85 0 0133.6 51.8c0 9.7-2.3 18.9-6.9 27.3l-13.9 25.4 21.9 19a56.76 56.76 0 0119.6 43c0 9.7-2.3 18.9-6.9 27.3l-13.9 25.4 21.9 19a56.76 56.76 0 0119.6 43c0 9.7-2.3 18.9-6.9 27.3l-14 25.5 21.9 19a56.76 56.76 0 0119.6 43c0 19.1-11 37.5-28.8 48.4z"></path></svg></span></button></div></div></div></div></article><nav class="pagination-nav docusaurus-mt-lg" aria-label="Docs pages"></nav></div></div><div class="col col--3"><div class="tableOfContents_bqdL thin-scrollbar theme-doc-toc-desktop"><ul class="table-of-contents table-of-contents__left-border"><li><a href="#overview" class="table-of-contents__link toc-highlight">Overview</a></li><li><a href="#pyspark-version" class="table-of-contents__link toc-highlight">PySpark Version</a></li><li><a href="#installation-options" class="table-of-contents__link toc-highlight">Installation Options</a><ul><li><a href="#standard-installation-includes-pyspark" class="table-of-contents__link toc-highlight">Standard Installation (includes PySpark)</a></li><li><a href="#lightweight-installation-without-pyspark" class="table-of-contents__link toc-highlight">Lightweight Installation (without PySpark)</a></li><li><a href="#whats-included" class="table-of-contents__link toc-highlight">What&#39;s Included</a></li></ul></li><li><a href="#feature-comparison" class="table-of-contents__link toc-highlight">Feature Comparison</a></li><li><a href="#configuration" class="table-of-contents__link toc-highlight">Configuration</a><ul><li><a href="#with-standard-installation-pyspark-included" class="table-of-contents__link toc-highlight">With Standard Installation (PySpark included)</a></li><li><a href="#with-slim-installation-no-pyspark" class="table-of-contents__link toc-highlight">With Slim Installation (no PySpark)</a></li></ul></li><li><a href="#developer-guide" class="table-of-contents__link toc-highlight">Developer Guide</a><ul><li><a href="#implementation-pattern" class="table-of-contents__link toc-highlight">Implementation Pattern</a></li></ul></li><li><a href="#troubleshooting" class="table-of-contents__link toc-highlight">Troubleshooting</a><ul><li><a href="#error-pyspark-is-not-installed-but-is-required-for-profiling" class="table-of-contents__link toc-highlight">Error: &quot;PySpark is not installed, but is required for profiling&quot;</a></li><li><a href="#verifying-installation" class="table-of-contents__link toc-highlight">Verifying Installation</a></li></ul></li><li><a href="#migration-guide" class="table-of-contents__link toc-highlight">Migration Guide</a><ul><li><a href="#upgrading-from-previous-versions" class="table-of-contents__link toc-highlight">Upgrading from Previous Versions</a></li><li><a href="#no-breaking-changes" class="table-of-contents__link toc-highlight">No Breaking Changes</a></li></ul></li><li><a href="#benefits-for-datahub-actions" class="table-of-contents__link toc-highlight">Benefits for DataHub Actions</a><ul><li><a href="#reduced-installation-size" class="table-of-contents__link toc-highlight">Reduced Installation Size</a></li><li><a href="#faster-deployment" class="table-of-contents__link toc-highlight">Faster Deployment</a></li><li><a href="#fewer-dependency-conflicts" class="table-of-contents__link toc-highlight">Fewer Dependency Conflicts</a></li><li><a href="#when-actions-needs-profiling" class="table-of-contents__link toc-highlight">When Actions Needs Profiling</a></li></ul></li><li><a href="#benefits-summary" class="table-of-contents__link toc-highlight">Benefits Summary</a></li></ul></div></div></div></div></main></div></div><footer class="footer footer--dark"><div class="container container-fluid"><div class="row footer__links"><div class="col footer__col"><div class="footer__title">Docs</div><ul class="footer__items clean-list"><li class="footer__item"><a class="footer__link-item" href="/docs/">Introduction</a></li><li class="footer__item"><a class="footer__link-item" href="/docs/quickstart">Quickstart</a></li></ul></div><div class="col footer__col"><div class="footer__title">Community</div><ul class="footer__items clean-list"><li class="footer__item"><a href="https://datahub.com/slack" target="_blank" rel="noopener noreferrer" class="footer__link-item">Slack<svg width="13.5" height="13.5" aria-hidden="true" viewBox="0 0 24 24" class="iconExternalLink_nPIU"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg></a></li><li class="footer__item"><a href="https://www.youtube.com/channel/UC3qFQC5IiwR5fvWEqi_tJ5w" target="_blank" rel="noopener noreferrer" class="footer__link-item">YouTube<svg width="13.5" height="13.5" aria-hidden="true" viewBox="0 0 24 24" class="iconExternalLink_nPIU"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg></a></li><li class="footer__item"><a href="https://medium.com/datahub-project" target="_blank" rel="noopener noreferrer" class="footer__link-item">Blog<svg width="13.5" height="13.5" aria-hidden="true" viewBox="0 0 24 24" class="iconExternalLink_nPIU"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg></a></li><li class="footer__item"><a class="footer__link-item" href="/docs/townhalls">Town Halls</a></li><li class="footer__item"><a href="https://datahub.com/resources/?2004611554=dh-stories" target="_blank" rel="noopener noreferrer" class="footer__link-item">Customer Stories<svg width="13.5" height="13.5" aria-hidden="true" viewBox="0 0 24 24" class="iconExternalLink_nPIU"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg></a></li></ul></div><div class="col footer__col"><div class="footer__title">More</div><ul class="footer__items clean-list"><li class="footer__item"><a href="https://demo.datahub.com/" target="_blank" rel="noopener noreferrer" class="footer__link-item">Demo</a></li><li class="footer__item"><a href="https://feature-requests.datahubproject.io/roadmap" target="_blank" rel="noopener noreferrer" class="footer__link-item">Roadmap<svg width="13.5" height="13.5" aria-hidden="true" viewBox="0 0 24 24" class="iconExternalLink_nPIU"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg></a></li><li class="footer__item"><a class="footer__link-item" href="/docs/contributing">Contributing</a></li><li class="footer__item"><a href="https://github.com/datahub-project/datahub" target="_blank" rel="noopener noreferrer" class="footer__link-item">GitHub<svg width="13.5" height="13.5" aria-hidden="true" viewBox="0 0 24 24" class="iconExternalLink_nPIU"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg></a></li><li class="footer__item"><a href="https://feature-requests.datahubproject.io/" target="_blank" rel="noopener noreferrer" class="footer__link-item">Feature Requests<svg width="13.5" height="13.5" aria-hidden="true" viewBox="0 0 24 24" class="iconExternalLink_nPIU"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg></a></li></ul></div></div><div class="footer__bottom text--center"><div class="footer__copyright">Copyright © 2015-2025 DataHub Project Authors.</div></div></div></footer></div>
<script src="/assets/js/runtime~main.7bcf907e.js"></script>
<script src="/assets/js/main.99352010.js"></script>
</body>
</html>