datahub/assets/js/179e51a2.e08f1be8.js
2025-08-08 21:37:06 +00:00

1 line
43 KiB
JavaScript

"use strict";(self.webpackChunkdocs_website=self.webpackChunkdocs_website||[]).push([[77374],{60025:(e,t,a)=>{a.r(t),a.d(t,{assets:()=>d,contentTitle:()=>p,default:()=>f,frontMatter:()=>g,metadata:()=>c,toc:()=>u});a(96540);var n=a(15680),i=a(53720),r=a(5400);function l(e,t,a){return t in e?Object.defineProperty(e,t,{value:a,enumerable:!0,configurable:!0,writable:!0}):e[t]=a,e}function o(e,t){return t=null!=t?t:{},Object.getOwnPropertyDescriptors?Object.defineProperties(e,Object.getOwnPropertyDescriptors(t)):function(e,t){var a=Object.keys(e);if(Object.getOwnPropertySymbols){var n=Object.getOwnPropertySymbols(e);t&&(n=n.filter((function(t){return Object.getOwnPropertyDescriptor(e,t).enumerable}))),a.push.apply(a,n)}return a}(Object(t)).forEach((function(a){Object.defineProperty(e,a,Object.getOwnPropertyDescriptor(t,a))})),e}function s(e,t){if(null==e)return{};var a,n,i=function(e,t){if(null==e)return{};var a,n,i={},r=Object.keys(e);for(n=0;n<r.length;n++)a=r[n],t.indexOf(a)>=0||(i[a]=e[a]);return i}(e,t);if(Object.getOwnPropertySymbols){var r=Object.getOwnPropertySymbols(e);for(n=0;n<r.length;n++)a=r[n],t.indexOf(a)>=0||Object.prototype.propertyIsEnumerable.call(e,a)&&(i[a]=e[a])}return i}const g={sidebar_position:31,title:"Iceberg",slug:"/generated/ingestion/sources/iceberg",custom_edit_url:"https://github.com/datahub-project/datahub/blob/master/docs/generated/ingestion/sources/iceberg.md"},p="Iceberg",c={unversionedId:"docs/generated/ingestion/sources/iceberg",id:"docs/generated/ingestion/sources/iceberg",title:"Iceberg",description:"Testing",source:"@site/genDocs/docs/generated/ingestion/sources/iceberg.md",sourceDirName:"docs/generated/ingestion/sources",slug:"/generated/ingestion/sources/iceberg",permalink:"/docs/generated/ingestion/sources/iceberg",draft:!1,editUrl:"https://github.com/datahub-project/datahub/blob/master/docs/generated/ingestion/sources/iceberg.md",tags:[],version:"current",sidebarPosition:31,frontMatter:{sidebar_position:31,title:"Iceberg",slug:"/generated/ingestion/sources/iceberg",custom_edit_url:"https://github.com/datahub-project/datahub/blob/master/docs/generated/ingestion/sources/iceberg.md"},sidebar:"overviewSidebar",previous:{title:"Hive Metastore",permalink:"/docs/generated/ingestion/sources/hive-metastore"},next:{title:"JSON Schemas",permalink:"/docs/generated/ingestion/sources/json-schema"}},d={},u=[{value:"Important Capabilities",id:"important-capabilities",level:3},{value:"Integration Details",id:"integration-details",level:2},{value:"CLI based Ingestion",id:"cli-based-ingestion",level:3},{value:"Config Details",id:"config-details",level:3},{value:"Setting up connection to an Iceberg catalog",id:"setting-up-connection-to-an-iceberg-catalog",level:2},{value:"Glue catalog + S3 warehouse",id:"glue-catalog--s3-warehouse",level:3},{value:"Glue and S3 permissions required",id:"glue-and-s3-permissions-required",level:4},{value:"Iceberg REST Catalog + MinIO",id:"iceberg-rest-catalog--minio",level:3},{value:"Iceberg REST Catalog (with authentication) + S3",id:"iceberg-rest-catalog-with-authentication--s3",level:3},{value:"Special REST connection parameters for resiliency",id:"special-rest-connection-parameters-for-resiliency",level:4},{value:"SQL catalog + Azure DLS as the warehouse",id:"sql-catalog--azure-dls-as-the-warehouse",level:3},{value:"Concept Mapping",id:"concept-mapping",level:2},{value:"Troubleshooting",id:"troubleshooting",level:2},{value:"Exceptions while increasing <code>processing_threads</code>",id:"exceptions-while-increasing-processing_threads",level:3},{value:"DataHub Iceberg REST Catalog",id:"datahub-iceberg-rest-catalog",level:2},{value:"Code Coordinates",id:"code-coordinates",level:3}],m={toc:u},y="wrapper";function f(e){var{components:t}=e,a=s(e,["components"]);return(0,n.yg)(y,o(function(e){for(var t=1;t<arguments.length;t++){var a=null!=arguments[t]?arguments[t]:{},n=Object.keys(a);"function"==typeof Object.getOwnPropertySymbols&&(n=n.concat(Object.getOwnPropertySymbols(a).filter((function(e){return Object.getOwnPropertyDescriptor(a,e).enumerable})))),n.forEach((function(t){l(e,t,a[t])}))}return e}({},m,a),{components:t,mdxType:"MDXLayout"}),(0,n.yg)("h1",{id:"iceberg"},"Iceberg"),(0,n.yg)("p",null,(0,n.yg)("img",{parentName:"p",src:"https://img.shields.io/badge/support%20status-testing-lightgrey",alt:"Testing"})),(0,n.yg)("h3",{id:"important-capabilities"},"Important Capabilities"),(0,n.yg)("table",null,(0,n.yg)("thead",{parentName:"table"},(0,n.yg)("tr",{parentName:"thead"},(0,n.yg)("th",{parentName:"tr",align:null},"Capability"),(0,n.yg)("th",{parentName:"tr",align:null},"Status"),(0,n.yg)("th",{parentName:"tr",align:null},"Notes"))),(0,n.yg)("tbody",{parentName:"table"},(0,n.yg)("tr",{parentName:"tbody"},(0,n.yg)("td",{parentName:"tr",align:null},(0,n.yg)("a",{parentName:"td",href:"/docs/metadata-ingestion/docs/dev_guides/sql_profiles"},"Data Profiling")),(0,n.yg)("td",{parentName:"tr",align:null},"\u2705"),(0,n.yg)("td",{parentName:"tr",align:null},"Optionally enabled via configuration.")),(0,n.yg)("tr",{parentName:"tbody"},(0,n.yg)("td",{parentName:"tr",align:null},"Descriptions"),(0,n.yg)("td",{parentName:"tr",align:null},"\u2705"),(0,n.yg)("td",{parentName:"tr",align:null},"Enabled by default.")),(0,n.yg)("tr",{parentName:"tbody"},(0,n.yg)("td",{parentName:"tr",align:null},(0,n.yg)("a",{parentName:"td",href:"/docs/metadata-ingestion/docs/dev_guides/stateful#stale-entity-removal"},"Detect Deleted Entities")),(0,n.yg)("td",{parentName:"tr",align:null},"\u2705"),(0,n.yg)("td",{parentName:"tr",align:null},"Enabled by default via stateful ingestion.")),(0,n.yg)("tr",{parentName:"tbody"},(0,n.yg)("td",{parentName:"tr",align:null},(0,n.yg)("a",{parentName:"td",href:"/docs/domains"},"Domains")),(0,n.yg)("td",{parentName:"tr",align:null},"\u274c"),(0,n.yg)("td",{parentName:"tr",align:null},"Currently not supported.")),(0,n.yg)("tr",{parentName:"tbody"},(0,n.yg)("td",{parentName:"tr",align:null},"Extract Ownership"),(0,n.yg)("td",{parentName:"tr",align:null},"\u2705"),(0,n.yg)("td",{parentName:"tr",align:null},"Automatically ingests ownership information from table properties based on ",(0,n.yg)("inlineCode",{parentName:"td"},"user_ownership_property")," and ",(0,n.yg)("inlineCode",{parentName:"td"},"group_ownership_property"),".")),(0,n.yg)("tr",{parentName:"tbody"},(0,n.yg)("td",{parentName:"tr",align:null},"Partition Support"),(0,n.yg)("td",{parentName:"tr",align:null},"\u274c"),(0,n.yg)("td",{parentName:"tr",align:null},"Currently not supported.")),(0,n.yg)("tr",{parentName:"tbody"},(0,n.yg)("td",{parentName:"tr",align:null},(0,n.yg)("a",{parentName:"td",href:"/docs/platform-instances"},"Platform Instance")),(0,n.yg)("td",{parentName:"tr",align:null},"\u2705"),(0,n.yg)("td",{parentName:"tr",align:null},"Optionally enabled via configuration, an Iceberg instance represents the catalog name where the table is stored.")))),(0,n.yg)("h2",{id:"integration-details"},"Integration Details"),(0,n.yg)("p",null,"The DataHub Iceberg source plugin extracts metadata from ",(0,n.yg)("a",{parentName:"p",href:"https://iceberg.apache.org/spec/"},"Iceberg tables")," stored in a distributed or local file system.\nTypically, Iceberg tables are stored in a distributed file system like S3 or Azure Data Lake Storage (ADLS) and registered in a catalog. There are various catalog\nimplementations like Filesystem-based, RDBMS-based or even REST-based catalogs. This Iceberg source plugin relies on the\n",(0,n.yg)("a",{parentName:"p",href:"https://py.iceberg.apache.org/"},"pyiceberg library"),"."),(0,n.yg)("h3",{id:"cli-based-ingestion"},"CLI based Ingestion"),(0,n.yg)("h3",{id:"config-details"},"Config Details"),(0,n.yg)(i.A,{mdxType:"Tabs"},(0,n.yg)(r.A,{value:"options",label:"Options",default:!0,mdxType:"TabItem"},(0,n.yg)("p",null,"Note that a ",(0,n.yg)("inlineCode",{parentName:"p"},".")," is used to denote nested fields in the YAML recipe."),(0,n.yg)("div",{className:"config-table"},(0,n.yg)("table",null,(0,n.yg)("thead",{parentName:"table"},(0,n.yg)("tr",{parentName:"thead"},(0,n.yg)("th",{parentName:"tr",align:"left"},"Field"),(0,n.yg)("th",{parentName:"tr",align:"left"},"Description"))),(0,n.yg)("tbody",{parentName:"table"},(0,n.yg)("tr",{parentName:"tbody"},(0,n.yg)("td",{parentName:"tr",align:"left"},(0,n.yg)("div",{className:"path-line"},(0,n.yg)("span",{className:"path-main"},"catalog"),"\xa0",(0,n.yg)("abbr",{title:"Required"},"\u2705"))," ",(0,n.yg)("div",{className:"type-name-line"},(0,n.yg)("span",{className:"type-name"},"map(str,object)"))),(0,n.yg)("td",{parentName:"tr",align:"left"})),(0,n.yg)("tr",{parentName:"tbody"},(0,n.yg)("td",{parentName:"tr",align:"left"},(0,n.yg)("div",{className:"path-line"},(0,n.yg)("span",{className:"path-main"},"group_ownership_property"))," ",(0,n.yg)("div",{className:"type-name-line"},(0,n.yg)("span",{className:"type-name"},"string"))),(0,n.yg)("td",{parentName:"tr",align:"left"},"Iceberg table property to look for a ",(0,n.yg)("inlineCode",{parentName:"td"},"CorpGroup")," owner. Can only hold a single group value. If property has no value, no owner information will be emitted.")),(0,n.yg)("tr",{parentName:"tbody"},(0,n.yg)("td",{parentName:"tr",align:"left"},(0,n.yg)("div",{className:"path-line"},(0,n.yg)("span",{className:"path-main"},"namespace_pattern"))," ",(0,n.yg)("div",{className:"type-name-line"},(0,n.yg)("span",{className:"type-name"},"AllowDenyPattern"))),(0,n.yg)("td",{parentName:"tr",align:"left"},"Regex patterns for namespaces to filter in ingestion. ",(0,n.yg)("div",{className:"default-line default-line-with-docs"},"Default: ",(0,n.yg)("span",{className:"default-value"},"{","'","allow","'",": ","[","'",".","*","'","]",", ","'","deny","'",": ","[","]",", ","'","ignoreCase","'",": True","}")))),(0,n.yg)("tr",{parentName:"tbody"},(0,n.yg)("td",{parentName:"tr",align:"left"},(0,n.yg)("div",{className:"path-line"},(0,n.yg)("span",{className:"path-main"},"platform_instance"))," ",(0,n.yg)("div",{className:"type-name-line"},(0,n.yg)("span",{className:"type-name"},"string"))),(0,n.yg)("td",{parentName:"tr",align:"left"},"The instance of the platform that all assets produced by this recipe belong to. This should be unique within the platform. See ",(0,n.yg)("a",{parentName:"td",href:"https://docs.datahub.com/docs/platform-instances/"},"https://docs.datahub.com/docs/platform-instances/")," for more details.")),(0,n.yg)("tr",{parentName:"tbody"},(0,n.yg)("td",{parentName:"tr",align:"left"},(0,n.yg)("div",{className:"path-line"},(0,n.yg)("span",{className:"path-main"},"processing_threads"))," ",(0,n.yg)("div",{className:"type-name-line"},(0,n.yg)("span",{className:"type-name"},"integer"))),(0,n.yg)("td",{parentName:"tr",align:"left"},"How many threads will be processing tables ",(0,n.yg)("div",{className:"default-line default-line-with-docs"},"Default: ",(0,n.yg)("span",{className:"default-value"},"1")))),(0,n.yg)("tr",{parentName:"tbody"},(0,n.yg)("td",{parentName:"tr",align:"left"},(0,n.yg)("div",{className:"path-line"},(0,n.yg)("span",{className:"path-main"},"table_pattern"))," ",(0,n.yg)("div",{className:"type-name-line"},(0,n.yg)("span",{className:"type-name"},"AllowDenyPattern"))),(0,n.yg)("td",{parentName:"tr",align:"left"},"Regex patterns for tables to filter in ingestion. ",(0,n.yg)("div",{className:"default-line default-line-with-docs"},"Default: ",(0,n.yg)("span",{className:"default-value"},"{","'","allow","'",": ","[","'",".","*","'","]",", ","'","deny","'",": ","[","]",", ","'","ignoreCase","'",": True","}")))),(0,n.yg)("tr",{parentName:"tbody"},(0,n.yg)("td",{parentName:"tr",align:"left"},(0,n.yg)("div",{className:"path-line"},(0,n.yg)("span",{className:"path-main"},"user_ownership_property"))," ",(0,n.yg)("div",{className:"type-name-line"},(0,n.yg)("span",{className:"type-name"},"string"))),(0,n.yg)("td",{parentName:"tr",align:"left"},"Iceberg table property to look for a ",(0,n.yg)("inlineCode",{parentName:"td"},"CorpUser")," owner. Can only hold a single user value. If property has no value, no owner information will be emitted. ",(0,n.yg)("div",{className:"default-line default-line-with-docs"},"Default: ",(0,n.yg)("span",{className:"default-value"},"owner")))),(0,n.yg)("tr",{parentName:"tbody"},(0,n.yg)("td",{parentName:"tr",align:"left"},(0,n.yg)("div",{className:"path-line"},(0,n.yg)("span",{className:"path-main"},"env"))," ",(0,n.yg)("div",{className:"type-name-line"},(0,n.yg)("span",{className:"type-name"},"string"))),(0,n.yg)("td",{parentName:"tr",align:"left"},"The environment that all assets produced by this connector belong to ",(0,n.yg)("div",{className:"default-line default-line-with-docs"},"Default: ",(0,n.yg)("span",{className:"default-value"},"PROD")))),(0,n.yg)("tr",{parentName:"tbody"},(0,n.yg)("td",{parentName:"tr",align:"left"},(0,n.yg)("div",{className:"path-line"},(0,n.yg)("span",{className:"path-main"},"profiling"))," ",(0,n.yg)("div",{className:"type-name-line"},(0,n.yg)("span",{className:"type-name"},"IcebergProfilingConfig"))),(0,n.yg)("td",{parentName:"tr",align:"left"},(0,n.yg)("div",{className:"default-line "},"Default: ",(0,n.yg)("span",{className:"default-value"},"{","'","enabled","'",": False, ","'","include","_","field","_","null","_","count","'",": Tru...")))),(0,n.yg)("tr",{parentName:"tbody"},(0,n.yg)("td",{parentName:"tr",align:"left"},(0,n.yg)("div",{className:"path-line"},(0,n.yg)("span",{className:"path-prefix"},"profiling."),(0,n.yg)("span",{className:"path-main"},"enabled"))," ",(0,n.yg)("div",{className:"type-name-line"},(0,n.yg)("span",{className:"type-name"},"boolean"))),(0,n.yg)("td",{parentName:"tr",align:"left"},"Whether profiling should be done. ",(0,n.yg)("div",{className:"default-line default-line-with-docs"},"Default: ",(0,n.yg)("span",{className:"default-value"},"False")))),(0,n.yg)("tr",{parentName:"tbody"},(0,n.yg)("td",{parentName:"tr",align:"left"},(0,n.yg)("div",{className:"path-line"},(0,n.yg)("span",{className:"path-prefix"},"profiling."),(0,n.yg)("span",{className:"path-main"},"include_field_max_value"))," ",(0,n.yg)("div",{className:"type-name-line"},(0,n.yg)("span",{className:"type-name"},"boolean"))),(0,n.yg)("td",{parentName:"tr",align:"left"},"Whether to profile for the max value of numeric columns. ",(0,n.yg)("div",{className:"default-line default-line-with-docs"},"Default: ",(0,n.yg)("span",{className:"default-value"},"True")))),(0,n.yg)("tr",{parentName:"tbody"},(0,n.yg)("td",{parentName:"tr",align:"left"},(0,n.yg)("div",{className:"path-line"},(0,n.yg)("span",{className:"path-prefix"},"profiling."),(0,n.yg)("span",{className:"path-main"},"include_field_min_value"))," ",(0,n.yg)("div",{className:"type-name-line"},(0,n.yg)("span",{className:"type-name"},"boolean"))),(0,n.yg)("td",{parentName:"tr",align:"left"},"Whether to profile for the min value of numeric columns. ",(0,n.yg)("div",{className:"default-line default-line-with-docs"},"Default: ",(0,n.yg)("span",{className:"default-value"},"True")))),(0,n.yg)("tr",{parentName:"tbody"},(0,n.yg)("td",{parentName:"tr",align:"left"},(0,n.yg)("div",{className:"path-line"},(0,n.yg)("span",{className:"path-prefix"},"profiling."),(0,n.yg)("span",{className:"path-main"},"include_field_null_count"))," ",(0,n.yg)("div",{className:"type-name-line"},(0,n.yg)("span",{className:"type-name"},"boolean"))),(0,n.yg)("td",{parentName:"tr",align:"left"},"Whether to profile for the number of nulls for each column. ",(0,n.yg)("div",{className:"default-line default-line-with-docs"},"Default: ",(0,n.yg)("span",{className:"default-value"},"True")))),(0,n.yg)("tr",{parentName:"tbody"},(0,n.yg)("td",{parentName:"tr",align:"left"},(0,n.yg)("div",{className:"path-line"},(0,n.yg)("span",{className:"path-prefix"},"profiling."),(0,n.yg)("span",{className:"path-main"},"operation_config"))," ",(0,n.yg)("div",{className:"type-name-line"},(0,n.yg)("span",{className:"type-name"},"OperationConfig"))),(0,n.yg)("td",{parentName:"tr",align:"left"},"Experimental feature. To specify operation configs.")),(0,n.yg)("tr",{parentName:"tbody"},(0,n.yg)("td",{parentName:"tr",align:"left"},(0,n.yg)("div",{className:"path-line"},(0,n.yg)("span",{className:"path-prefix"},"profiling.operation_config."),(0,n.yg)("span",{className:"path-main"},"lower_freq_profile_enabled"))," ",(0,n.yg)("div",{className:"type-name-line"},(0,n.yg)("span",{className:"type-name"},"boolean"))),(0,n.yg)("td",{parentName:"tr",align:"left"},"Whether to do profiling at lower freq or not. This does not do any scheduling just adds additional checks to when not to run profiling. ",(0,n.yg)("div",{className:"default-line default-line-with-docs"},"Default: ",(0,n.yg)("span",{className:"default-value"},"False")))),(0,n.yg)("tr",{parentName:"tbody"},(0,n.yg)("td",{parentName:"tr",align:"left"},(0,n.yg)("div",{className:"path-line"},(0,n.yg)("span",{className:"path-prefix"},"profiling.operation_config."),(0,n.yg)("span",{className:"path-main"},"profile_date_of_month"))," ",(0,n.yg)("div",{className:"type-name-line"},(0,n.yg)("span",{className:"type-name"},"integer"))),(0,n.yg)("td",{parentName:"tr",align:"left"},"Number between 1 to 31 for date of month (both inclusive). If not specified, defaults to Nothing and this field does not take affect.")),(0,n.yg)("tr",{parentName:"tbody"},(0,n.yg)("td",{parentName:"tr",align:"left"},(0,n.yg)("div",{className:"path-line"},(0,n.yg)("span",{className:"path-prefix"},"profiling.operation_config."),(0,n.yg)("span",{className:"path-main"},"profile_day_of_week"))," ",(0,n.yg)("div",{className:"type-name-line"},(0,n.yg)("span",{className:"type-name"},"integer"))),(0,n.yg)("td",{parentName:"tr",align:"left"},"Number between 0 to 6 for day of week (both inclusive). 0 is Monday and 6 is Sunday. If not specified, defaults to Nothing and this field does not take affect.")),(0,n.yg)("tr",{parentName:"tbody"},(0,n.yg)("td",{parentName:"tr",align:"left"},(0,n.yg)("div",{className:"path-line"},(0,n.yg)("span",{className:"path-main"},"stateful_ingestion"))," ",(0,n.yg)("div",{className:"type-name-line"},(0,n.yg)("span",{className:"type-name"},"StatefulStaleMetadataRemovalConfig"))),(0,n.yg)("td",{parentName:"tr",align:"left"},"Iceberg Stateful Ingestion Config.")),(0,n.yg)("tr",{parentName:"tbody"},(0,n.yg)("td",{parentName:"tr",align:"left"},(0,n.yg)("div",{className:"path-line"},(0,n.yg)("span",{className:"path-prefix"},"stateful_ingestion."),(0,n.yg)("span",{className:"path-main"},"enabled"))," ",(0,n.yg)("div",{className:"type-name-line"},(0,n.yg)("span",{className:"type-name"},"boolean"))),(0,n.yg)("td",{parentName:"tr",align:"left"},"Whether or not to enable stateful ingest. Default: True if a pipeline_name is set and either a datahub-rest sink or ",(0,n.yg)("inlineCode",{parentName:"td"},"datahub_api")," is specified, otherwise False ",(0,n.yg)("div",{className:"default-line default-line-with-docs"},"Default: ",(0,n.yg)("span",{className:"default-value"},"False")))),(0,n.yg)("tr",{parentName:"tbody"},(0,n.yg)("td",{parentName:"tr",align:"left"},(0,n.yg)("div",{className:"path-line"},(0,n.yg)("span",{className:"path-prefix"},"stateful_ingestion."),(0,n.yg)("span",{className:"path-main"},"fail_safe_threshold"))," ",(0,n.yg)("div",{className:"type-name-line"},(0,n.yg)("span",{className:"type-name"},"number"))),(0,n.yg)("td",{parentName:"tr",align:"left"},"Prevents large amount of soft deletes & the state from committing from accidental changes to the source configuration if the relative change percent in entities compared to the previous state is above the 'fail_safe_threshold'. ",(0,n.yg)("div",{className:"default-line default-line-with-docs"},"Default: ",(0,n.yg)("span",{className:"default-value"},"75.0")))),(0,n.yg)("tr",{parentName:"tbody"},(0,n.yg)("td",{parentName:"tr",align:"left"},(0,n.yg)("div",{className:"path-line"},(0,n.yg)("span",{className:"path-prefix"},"stateful_ingestion."),(0,n.yg)("span",{className:"path-main"},"remove_stale_metadata"))," ",(0,n.yg)("div",{className:"type-name-line"},(0,n.yg)("span",{className:"type-name"},"boolean"))),(0,n.yg)("td",{parentName:"tr",align:"left"},"Soft-deletes the entities present in the last successful run but missing in the current run with stateful_ingestion enabled. ",(0,n.yg)("div",{className:"default-line default-line-with-docs"},"Default: ",(0,n.yg)("span",{className:"default-value"},"True")))))))),(0,n.yg)(r.A,{value:"schema",label:"Schema",mdxType:"TabItem"},(0,n.yg)("p",null,"The ",(0,n.yg)("a",{parentName:"p",href:"https://json-schema.org/"},"JSONSchema")," for this configuration is inlined below."),(0,n.yg)("pre",null,(0,n.yg)("code",{parentName:"pre",className:"language-javascript"},'{\n "title": "IcebergSourceConfig",\n "description": "Base configuration class for stateful ingestion for source configs to inherit from.",\n "type": "object",\n "properties": {\n "env": {\n "title": "Env",\n "description": "The environment that all assets produced by this connector belong to",\n "default": "PROD",\n "type": "string"\n },\n "platform_instance": {\n "title": "Platform Instance",\n "description": "The instance of the platform that all assets produced by this recipe belong to. This should be unique within the platform. See https://docs.datahub.com/docs/platform-instances/ for more details.",\n "type": "string"\n },\n "stateful_ingestion": {\n "title": "Stateful Ingestion",\n "description": "Iceberg Stateful Ingestion Config.",\n "allOf": [\n {\n "$ref": "#/definitions/StatefulStaleMetadataRemovalConfig"\n }\n ]\n },\n "catalog": {\n "title": "Catalog",\n "description": "Catalog configuration where to find Iceberg tables. Only one catalog specification is supported. The format is the same as [pyiceberg\'s catalog configuration](https://py.iceberg.apache.org/configuration/), where the catalog name is specified as the object name and attributes are set as key-value pairs.",\n "type": "object",\n "additionalProperties": {\n "type": "object"\n }\n },\n "table_pattern": {\n "title": "Table Pattern",\n "description": "Regex patterns for tables to filter in ingestion.",\n "default": {\n "allow": [\n ".*"\n ],\n "deny": [],\n "ignoreCase": true\n },\n "allOf": [\n {\n "$ref": "#/definitions/AllowDenyPattern"\n }\n ]\n },\n "namespace_pattern": {\n "title": "Namespace Pattern",\n "description": "Regex patterns for namespaces to filter in ingestion.",\n "default": {\n "allow": [\n ".*"\n ],\n "deny": [],\n "ignoreCase": true\n },\n "allOf": [\n {\n "$ref": "#/definitions/AllowDenyPattern"\n }\n ]\n },\n "user_ownership_property": {\n "title": "User Ownership Property",\n "description": "Iceberg table property to look for a `CorpUser` owner. Can only hold a single user value. If property has no value, no owner information will be emitted.",\n "default": "owner",\n "type": "string"\n },\n "group_ownership_property": {\n "title": "Group Ownership Property",\n "description": "Iceberg table property to look for a `CorpGroup` owner. Can only hold a single group value. If property has no value, no owner information will be emitted.",\n "type": "string"\n },\n "profiling": {\n "title": "Profiling",\n "default": {\n "enabled": false,\n "include_field_null_count": true,\n "include_field_min_value": true,\n "include_field_max_value": true,\n "operation_config": {\n "lower_freq_profile_enabled": false,\n "profile_day_of_week": null,\n "profile_date_of_month": null\n }\n },\n "allOf": [\n {\n "$ref": "#/definitions/IcebergProfilingConfig"\n }\n ]\n },\n "processing_threads": {\n "title": "Processing Threads",\n "description": "How many threads will be processing tables",\n "default": 1,\n "type": "integer"\n }\n },\n "required": [\n "catalog"\n ],\n "additionalProperties": false,\n "definitions": {\n "DynamicTypedStateProviderConfig": {\n "title": "DynamicTypedStateProviderConfig",\n "type": "object",\n "properties": {\n "type": {\n "title": "Type",\n "description": "The type of the state provider to use. For DataHub use `datahub`",\n "type": "string"\n },\n "config": {\n "title": "Config",\n "description": "The configuration required for initializing the state provider. Default: The datahub_api config if set at pipeline level. Otherwise, the default DatahubClientConfig. See the defaults (https://github.com/datahub-project/datahub/blob/master/metadata-ingestion/src/datahub/ingestion/graph/client.py#L19).",\n "default": {},\n "type": "object"\n }\n },\n "required": [\n "type"\n ],\n "additionalProperties": false\n },\n "StatefulStaleMetadataRemovalConfig": {\n "title": "StatefulStaleMetadataRemovalConfig",\n "description": "Base specialized config for Stateful Ingestion with stale metadata removal capability.",\n "type": "object",\n "properties": {\n "enabled": {\n "title": "Enabled",\n "description": "Whether or not to enable stateful ingest. Default: True if a pipeline_name is set and either a datahub-rest sink or `datahub_api` is specified, otherwise False",\n "default": false,\n "type": "boolean"\n },\n "remove_stale_metadata": {\n "title": "Remove Stale Metadata",\n "description": "Soft-deletes the entities present in the last successful run but missing in the current run with stateful_ingestion enabled.",\n "default": true,\n "type": "boolean"\n },\n "fail_safe_threshold": {\n "title": "Fail Safe Threshold",\n "description": "Prevents large amount of soft deletes & the state from committing from accidental changes to the source configuration if the relative change percent in entities compared to the previous state is above the \'fail_safe_threshold\'.",\n "default": 75.0,\n "minimum": 0.0,\n "maximum": 100.0,\n "type": "number"\n }\n },\n "additionalProperties": false\n },\n "AllowDenyPattern": {\n "title": "AllowDenyPattern",\n "description": "A class to store allow deny regexes",\n "type": "object",\n "properties": {\n "allow": {\n "title": "Allow",\n "description": "List of regex patterns to include in ingestion",\n "default": [\n ".*"\n ],\n "type": "array",\n "items": {\n "type": "string"\n }\n },\n "deny": {\n "title": "Deny",\n "description": "List of regex patterns to exclude from ingestion.",\n "default": [],\n "type": "array",\n "items": {\n "type": "string"\n }\n },\n "ignoreCase": {\n "title": "Ignorecase",\n "description": "Whether to ignore case sensitivity during pattern matching.",\n "default": true,\n "type": "boolean"\n }\n },\n "additionalProperties": false\n },\n "OperationConfig": {\n "title": "OperationConfig",\n "type": "object",\n "properties": {\n "lower_freq_profile_enabled": {\n "title": "Lower Freq Profile Enabled",\n "description": "Whether to do profiling at lower freq or not. This does not do any scheduling just adds additional checks to when not to run profiling.",\n "default": false,\n "type": "boolean"\n },\n "profile_day_of_week": {\n "title": "Profile Day Of Week",\n "description": "Number between 0 to 6 for day of week (both inclusive). 0 is Monday and 6 is Sunday. If not specified, defaults to Nothing and this field does not take affect.",\n "type": "integer"\n },\n "profile_date_of_month": {\n "title": "Profile Date Of Month",\n "description": "Number between 1 to 31 for date of month (both inclusive). If not specified, defaults to Nothing and this field does not take affect.",\n "type": "integer"\n }\n },\n "additionalProperties": false\n },\n "IcebergProfilingConfig": {\n "title": "IcebergProfilingConfig",\n "type": "object",\n "properties": {\n "enabled": {\n "title": "Enabled",\n "description": "Whether profiling should be done.",\n "default": false,\n "type": "boolean"\n },\n "include_field_null_count": {\n "title": "Include Field Null Count",\n "description": "Whether to profile for the number of nulls for each column.",\n "default": true,\n "type": "boolean"\n },\n "include_field_min_value": {\n "title": "Include Field Min Value",\n "description": "Whether to profile for the min value of numeric columns.",\n "default": true,\n "type": "boolean"\n },\n "include_field_max_value": {\n "title": "Include Field Max Value",\n "description": "Whether to profile for the max value of numeric columns.",\n "default": true,\n "type": "boolean"\n },\n "operation_config": {\n "title": "Operation Config",\n "description": "Experimental feature. To specify operation configs.",\n "allOf": [\n {\n "$ref": "#/definitions/OperationConfig"\n }\n ]\n }\n },\n "additionalProperties": false\n }\n }\n}\n')))),(0,n.yg)("h2",{id:"setting-up-connection-to-an-iceberg-catalog"},"Setting up connection to an Iceberg catalog"),(0,n.yg)("p",null,"There are multiple servers compatible with the Iceberg Catalog specification. DataHub's ",(0,n.yg)("inlineCode",{parentName:"p"},"iceberg")," connector uses ",(0,n.yg)("inlineCode",{parentName:"p"},"pyiceberg"),"\nlibrary to extract metadata from them. The recipe for the source consists of 2 parts:"),(0,n.yg)("ol",null,(0,n.yg)("li",{parentName:"ol"},(0,n.yg)("inlineCode",{parentName:"li"},"catalog")," part which is passed as-is to the ",(0,n.yg)("inlineCode",{parentName:"li"},"pyiceberg")," library and configures the connection and its details (i.e. authentication).\nThe name of catalog specified in the recipe has no consequence, it is just a formal requirement from the library.\nOnly one catalog will be considered for the ingestion."),(0,n.yg)("li",{parentName:"ol"},"The remaining configuration consists of parameters, such as ",(0,n.yg)("inlineCode",{parentName:"li"},"env")," or ",(0,n.yg)("inlineCode",{parentName:"li"},"stateful_ingestion")," which are standard\nDataHub's ingestor configuration parameters and are described in the ",(0,n.yg)("a",{parentName:"li",href:"#config-details"},"Config Details")," chapter.")),(0,n.yg)("p",null,"This chapter showcases several examples of setting up connections to an Iceberg catalog, varying based on the underlying\nimplementation. Iceberg is designed to have catalog and warehouse separated, which is reflected in how we configure it.\nIt is especially visible when using Iceberg REST Catalog - which can use many blob storages\n(AWS S3, Azure Blob Storage, MinIO) as a warehouse."),(0,n.yg)("p",null,"Note that, for advanced users, it is possible to specify a custom catalog client implementation via ",(0,n.yg)("inlineCode",{parentName:"p"},"py-catalog-impl"),"\nconfiguration option - refer to ",(0,n.yg)("inlineCode",{parentName:"p"},"pyiceberg")," documentation on details."),(0,n.yg)("h3",{id:"glue-catalog--s3-warehouse"},"Glue catalog + S3 warehouse"),(0,n.yg)("p",null,"The minimal configuration for connecting to Glue catalog with S3 warehouse:"),(0,n.yg)("pre",null,(0,n.yg)("code",{parentName:"pre",className:"language-yaml"},'source:\n type: "iceberg"\n config:\n env: dev\n catalog:\n my_catalog:\n type: "glue"\n s3.region: "us-west-2"\n region_name: "us-west-2"\n')),(0,n.yg)("p",null,"Where ",(0,n.yg)("inlineCode",{parentName:"p"},"us-west-2")," is the region from which you want to ingest. The above configuration will work assuming your pod or environment in which\nyou run your datahub CLI is already authenticated to AWS and has proper permissions granted (see below). If you need\nto specify secrets directly, use the following configuration as the template:"),(0,n.yg)("pre",null,(0,n.yg)("code",{parentName:"pre",className:"language-yaml"},'source:\n type: "iceberg"\n config:\n env: dev\n catalog:\n demo:\n type: "glue"\n s3.region: "us-west-2"\n s3.access-key-id: "${AWS_ACCESS_KEY_ID}"\n s3.secret-access-key: "${AWS_SECRET_ACCESS_KEY}"\n s3.session-token: "${AWS_SESSION_TOKEN}"\n\n aws_access_key_id: "${AWS_ACCESS_KEY_ID}"\n aws_secret_access_key: "${AWS_SECRET_ACCESS_KEY}"\n aws_session_token: "${AWS_SESSION_TOKEN}"\n region_name: "us-west-2"\n')),(0,n.yg)("p",null,"This example uses references to fill credentials (either from Secrets defined in Managed Ingestion or environmental variables).\nIt is possible (but not recommended due to security concerns) to provide those values in plaintext, directly in the recipe."),(0,n.yg)("h4",{id:"glue-and-s3-permissions-required"},"Glue and S3 permissions required"),(0,n.yg)("p",null,"The role used by the ingestor for ingesting metadata from Glue Iceberg Catalog and S3 warehouse is:"),(0,n.yg)("pre",null,(0,n.yg)("code",{parentName:"pre",className:"language-json"},'{\n "Version": "2012-10-17",\n "Statement": [\n {\n "Effect": "Allow",\n "Action": ["glue:GetDatabases", "glue:GetTables", "glue:GetTable"],\n "Resource": "*"\n },\n {\n "Effect": "Allow",\n "Action": ["s3:GetObject", "s3:ListBucket", "s3:GetObjectVersion"],\n "Resource": [\n "arn:aws:s3:::<bucket used by the warehouse>",\n "arn:aws:s3:::<bucket used by the warehouse>/*"\n ]\n }\n ]\n}\n')),(0,n.yg)("h3",{id:"iceberg-rest-catalog--minio"},"Iceberg REST Catalog + MinIO"),(0,n.yg)("p",null,"The following configuration assumes MinIO defines authentication using the ",(0,n.yg)("inlineCode",{parentName:"p"},"s3.*")," prefix. Note the specification of ",(0,n.yg)("inlineCode",{parentName:"p"},"s3.endpoint"),", assuming\nMinIO listens on port ",(0,n.yg)("inlineCode",{parentName:"p"},"9000")," at ",(0,n.yg)("inlineCode",{parentName:"p"},"minio-host"),". The ",(0,n.yg)("inlineCode",{parentName:"p"},"uri")," parameter points at Iceberg REST Catalog (IRC) endpoint (in this case ",(0,n.yg)("inlineCode",{parentName:"p"},"iceberg-catalog:8181"),")."),(0,n.yg)("pre",null,(0,n.yg)("code",{parentName:"pre",className:"language-yaml"},'source:\n type: "iceberg"\n config:\n env: dev\n catalog:\n demo:\n type: "rest"\n uri: "http://iceberg-catalog:8181"\n s3.access-key-id: "${AWS_ACCESS_KEY_ID}"\n s3.secret-access-key: "${AWS_SECRET_ACCESS_KEY}"\n s3.region: "eu-east-1"\n s3.endpoint: "http://minio-host:9000"\n')),(0,n.yg)("h3",{id:"iceberg-rest-catalog-with-authentication--s3"},"Iceberg REST Catalog (with authentication) + S3"),(0,n.yg)("p",null,"This example assumes IRC requires token authentication (via ",(0,n.yg)("inlineCode",{parentName:"p"},"Authorization")," header). There are more options available,\nsee ",(0,n.yg)("a",{parentName:"p",href:"https://py.iceberg.apache.org/configuration/#rest-catalog"},"https://py.iceberg.apache.org/configuration/#rest-catalog")," for details. Moreover, the assumption here is that the\nenvironment (i.e. pod) is already authenticated to perform actions against AWS S3."),(0,n.yg)("pre",null,(0,n.yg)("code",{parentName:"pre",className:"language-yaml"},'source:\n type: "iceberg"\n config:\n env: dev\n catalog:\n demo:\n type: "rest"\n uri: "http://iceberg-catalog-uri"\n token: "token-value"\n s3.region: "us-west-2"\n')),(0,n.yg)("h4",{id:"special-rest-connection-parameters-for-resiliency"},"Special REST connection parameters for resiliency"),(0,n.yg)("p",null,"Unlike other parameters provided in the dictionary under the ",(0,n.yg)("inlineCode",{parentName:"p"},"catalog")," key, ",(0,n.yg)("inlineCode",{parentName:"p"},"connection")," parameter is a custom feature in\nDataHub, allowing to inject connection resiliency parameters to the REST connection made by the ingestor. ",(0,n.yg)("inlineCode",{parentName:"p"},"connection"),"\nallows for 2 parameters:"),(0,n.yg)("ul",null,(0,n.yg)("li",{parentName:"ul"},(0,n.yg)("inlineCode",{parentName:"li"},"timeout")," is provided as amount of seconds, it needs to be whole number (or ",(0,n.yg)("inlineCode",{parentName:"li"},"null")," to turn it off)"),(0,n.yg)("li",{parentName:"ul"},(0,n.yg)("inlineCode",{parentName:"li"},"retry")," is a complex object representing parameters used to create ",(0,n.yg)("a",{parentName:"li",href:"https://urllib3.readthedocs.io/en/latest/reference/urllib3.util.html#module-urllib3.util.retry"},"urllib3 Retry object"),".\nThere are many possible parameters, most important would be ",(0,n.yg)("inlineCode",{parentName:"li"},"total")," (total retries) and ",(0,n.yg)("inlineCode",{parentName:"li"},"backoff_factor"),". See the linked docs\nfor the details.")),(0,n.yg)("pre",null,(0,n.yg)("code",{parentName:"pre",className:"language-yaml"},'source:\n type: "iceberg"\n config:\n env: dev\n catalog:\n demo:\n type: "rest"\n uri: "http://iceberg-catalog-uri"\n connection:\n retry:\n backoff_factor: 0.5\n total: 3\n timeout: 120\n')),(0,n.yg)("h3",{id:"sql-catalog--azure-dls-as-the-warehouse"},"SQL catalog + Azure DLS as the warehouse"),(0,n.yg)("p",null,"This example targets ",(0,n.yg)("inlineCode",{parentName:"p"},"Postgres")," as the sql-type ",(0,n.yg)("inlineCode",{parentName:"p"},"Iceberg")," catalog and uses Azure DLS as the warehouse."),(0,n.yg)("pre",null,(0,n.yg)("code",{parentName:"pre",className:"language-yaml"},'source:\n type: "iceberg"\n config:\n env: dev\n catalog:\n demo:\n type: sql\n uri: postgresql+psycopg2://user:password@sqldatabase.postgres.database.azure.com:5432/icebergcatalog\n adlfs.tenant-id: <Azure tenant ID>\n adlfs.account-name: <Azure storage account name>\n adlfs.client-id: <Azure Client/Application ID>\n adlfs.client-secret: <Azure Client Secret>\n')),(0,n.yg)("h2",{id:"concept-mapping"},"Concept Mapping"),(0,n.yg)("p",null,"This ingestion source maps the following Source System Concepts to DataHub Concepts:"),(0,n.yg)("table",null,(0,n.yg)("thead",{parentName:"table"},(0,n.yg)("tr",{parentName:"thead"},(0,n.yg)("th",{parentName:"tr",align:null},"Source Concept"),(0,n.yg)("th",{parentName:"tr",align:null},"DataHub Concept"),(0,n.yg)("th",{parentName:"tr",align:null},"Notes"))),(0,n.yg)("tbody",{parentName:"table"},(0,n.yg)("tr",{parentName:"tbody"},(0,n.yg)("td",{parentName:"tr",align:null},(0,n.yg)("inlineCode",{parentName:"td"},"iceberg")),(0,n.yg)("td",{parentName:"tr",align:null},(0,n.yg)("a",{parentName:"td",href:"/docs/generated/metamodel/entities/dataplatform"},"Data Platform")),(0,n.yg)("td",{parentName:"tr",align:null})),(0,n.yg)("tr",{parentName:"tbody"},(0,n.yg)("td",{parentName:"tr",align:null},"Table"),(0,n.yg)("td",{parentName:"tr",align:null},(0,n.yg)("a",{parentName:"td",href:"/docs/generated/metamodel/entities/dataset"},"Dataset")),(0,n.yg)("td",{parentName:"tr",align:null},"An Iceberg table is registered inside a catalog using a name, where the catalog is responsible for creating, dropping and renaming tables. Catalogs manage a collection of tables that are usually grouped into namespaces. The name of a table is mapped to a Dataset name. If a ",(0,n.yg)("a",{parentName:"td",href:"/docs/platform-instances/"},"Platform Instance")," is configured, it will be used as a prefix: ",(0,n.yg)("inlineCode",{parentName:"td"},"<platform_instance>.my.namespace.table"),".")),(0,n.yg)("tr",{parentName:"tbody"},(0,n.yg)("td",{parentName:"tr",align:null},(0,n.yg)("a",{parentName:"td",href:"https://iceberg.apache.org/docs/latest/configuration/#table-properties"},"Table property")),(0,n.yg)("td",{parentName:"tr",align:null},(0,n.yg)("a",{parentName:"td",href:"/docs/generated/metamodel/entities/corpuser"},"User (a.k.a CorpUser)")),(0,n.yg)("td",{parentName:"tr",align:null},"The value of a table property can be used as the name of a CorpUser owner. This table property name can be configured with the source option ",(0,n.yg)("inlineCode",{parentName:"td"},"user_ownership_property"),".")),(0,n.yg)("tr",{parentName:"tbody"},(0,n.yg)("td",{parentName:"tr",align:null},(0,n.yg)("a",{parentName:"td",href:"https://iceberg.apache.org/docs/latest/configuration/#table-properties"},"Table property")),(0,n.yg)("td",{parentName:"tr",align:null},"CorpGroup"),(0,n.yg)("td",{parentName:"tr",align:null},"The value of a table property can be used as the name of a CorpGroup owner. This table property name can be configured with the source option ",(0,n.yg)("inlineCode",{parentName:"td"},"group_ownership_property"),".")),(0,n.yg)("tr",{parentName:"tbody"},(0,n.yg)("td",{parentName:"tr",align:null},"Table parent folders (excluding ",(0,n.yg)("a",{parentName:"td",href:"https://iceberg.apache.org/docs/latest/configuration/#catalog-properties"},"warehouse catalog location"),")"),(0,n.yg)("td",{parentName:"tr",align:null},"Container"),(0,n.yg)("td",{parentName:"tr",align:null},"Available in a future release")),(0,n.yg)("tr",{parentName:"tbody"},(0,n.yg)("td",{parentName:"tr",align:null},(0,n.yg)("a",{parentName:"td",href:"https://iceberg.apache.org/spec/#schemas-and-data-types"},"Table schema")),(0,n.yg)("td",{parentName:"tr",align:null},"SchemaField"),(0,n.yg)("td",{parentName:"tr",align:null},"Maps to the fields defined within the Iceberg table schema definition.")))),(0,n.yg)("h2",{id:"troubleshooting"},"Troubleshooting"),(0,n.yg)("h3",{id:"exceptions-while-increasing-processing_threads"},"Exceptions while increasing ",(0,n.yg)("inlineCode",{parentName:"h3"},"processing_threads")),(0,n.yg)("p",null,"Each processing thread will open several files/sockets to download manifest files from blob storage. If you experience\nexceptions appearing when increasing ",(0,n.yg)("inlineCode",{parentName:"p"},"processing_threads")," configuration parameter, try to increase limit of open\nfiles (e.g. using ",(0,n.yg)("inlineCode",{parentName:"p"},"ulimit")," in Linux)."),(0,n.yg)("h2",{id:"datahub-iceberg-rest-catalog"},"DataHub Iceberg REST Catalog"),(0,n.yg)("p",null,"DataHub also implements the Iceberg REST Catalog. See the ",(0,n.yg)("a",{parentName:"p",href:"/docs/iceberg-catalog"},"Iceberg Catalog documentation")," for more details."),(0,n.yg)("h3",{id:"code-coordinates"},"Code Coordinates"),(0,n.yg)("ul",null,(0,n.yg)("li",{parentName:"ul"},"Class Name: ",(0,n.yg)("inlineCode",{parentName:"li"},"datahub.ingestion.source.iceberg.iceberg.IcebergSource")),(0,n.yg)("li",{parentName:"ul"},"Browse on ",(0,n.yg)("a",{parentName:"li",href:"https://github.com/datahub-project/datahub/blob/master/metadata-ingestion/src/datahub/ingestion/source/iceberg/iceberg.py"},"GitHub"))),(0,n.yg)("h2",null,"Questions"),(0,n.yg)("p",null,"If you've got any questions on configuring ingestion for Iceberg, feel free to ping us on ",(0,n.yg)("a",{parentName:"p",href:"https://datahub.com/slack"},"our Slack"),"."))}f.isMDXComponent=!0}}]);