"use strict";(self.webpackChunkdocs_website=self.webpackChunkdocs_website||[]).push([[77374],{60025:(e,t,a)=>{a.r(t),a.d(t,{assets:()=>d,contentTitle:()=>p,default:()=>f,frontMatter:()=>g,metadata:()=>c,toc:()=>u});a(96540);var n=a(15680),i=a(53720),r=a(5400);function l(e,t,a){return t in e?Object.defineProperty(e,t,{value:a,enumerable:!0,configurable:!0,writable:!0}):e[t]=a,e}function o(e,t){return t=null!=t?t:{},Object.getOwnPropertyDescriptors?Object.defineProperties(e,Object.getOwnPropertyDescriptors(t)):function(e,t){var a=Object.keys(e);if(Object.getOwnPropertySymbols){var n=Object.getOwnPropertySymbols(e);t&&(n=n.filter((function(t){return Object.getOwnPropertyDescriptor(e,t).enumerable}))),a.push.apply(a,n)}return a}(Object(t)).forEach((function(a){Object.defineProperty(e,a,Object.getOwnPropertyDescriptor(t,a))})),e}function s(e,t){if(null==e)return{};var a,n,i=function(e,t){if(null==e)return{};var a,n,i={},r=Object.keys(e);for(n=0;n=0||(i[a]=e[a]);return i}(e,t);if(Object.getOwnPropertySymbols){var r=Object.getOwnPropertySymbols(e);for(n=0;n=0||Object.prototype.propertyIsEnumerable.call(e,a)&&(i[a]=e[a])}return i}const g={sidebar_position:31,title:"Iceberg",slug:"/generated/ingestion/sources/iceberg",custom_edit_url:"https://github.com/datahub-project/datahub/blob/master/docs/generated/ingestion/sources/iceberg.md"},p="Iceberg",c={unversionedId:"docs/generated/ingestion/sources/iceberg",id:"docs/generated/ingestion/sources/iceberg",title:"Iceberg",description:"Testing",source:"@site/genDocs/docs/generated/ingestion/sources/iceberg.md",sourceDirName:"docs/generated/ingestion/sources",slug:"/generated/ingestion/sources/iceberg",permalink:"/docs/generated/ingestion/sources/iceberg",draft:!1,editUrl:"https://github.com/datahub-project/datahub/blob/master/docs/generated/ingestion/sources/iceberg.md",tags:[],version:"current",sidebarPosition:31,frontMatter:{sidebar_position:31,title:"Iceberg",slug:"/generated/ingestion/sources/iceberg",custom_edit_url:"https://github.com/datahub-project/datahub/blob/master/docs/generated/ingestion/sources/iceberg.md"},sidebar:"overviewSidebar",previous:{title:"Hive Metastore",permalink:"/docs/generated/ingestion/sources/hive-metastore"},next:{title:"JSON Schemas",permalink:"/docs/generated/ingestion/sources/json-schema"}},d={},u=[{value:"Important Capabilities",id:"important-capabilities",level:3},{value:"Integration Details",id:"integration-details",level:2},{value:"CLI based Ingestion",id:"cli-based-ingestion",level:3},{value:"Config Details",id:"config-details",level:3},{value:"Setting up connection to an Iceberg catalog",id:"setting-up-connection-to-an-iceberg-catalog",level:2},{value:"Glue catalog + S3 warehouse",id:"glue-catalog--s3-warehouse",level:3},{value:"Glue and S3 permissions required",id:"glue-and-s3-permissions-required",level:4},{value:"Iceberg REST Catalog + MinIO",id:"iceberg-rest-catalog--minio",level:3},{value:"Iceberg REST Catalog (with authentication) + S3",id:"iceberg-rest-catalog-with-authentication--s3",level:3},{value:"Special REST connection parameters for resiliency",id:"special-rest-connection-parameters-for-resiliency",level:4},{value:"SQL catalog + Azure DLS as the warehouse",id:"sql-catalog--azure-dls-as-the-warehouse",level:3},{value:"Concept Mapping",id:"concept-mapping",level:2},{value:"Troubleshooting",id:"troubleshooting",level:2},{value:"Exceptions while increasing processing_threads",id:"exceptions-while-increasing-processing_threads",level:3},{value:"DataHub Iceberg REST Catalog",id:"datahub-iceberg-rest-catalog",level:2},{value:"Code Coordinates",id:"code-coordinates",level:3}],m={toc:u},y="wrapper";function f(e){var{components:t}=e,a=s(e,["components"]);return(0,n.yg)(y,o(function(e){for(var t=1;t",\n "arn:aws:s3:::/*"\n ]\n }\n ]\n}\n')),(0,n.yg)("h3",{id:"iceberg-rest-catalog--minio"},"Iceberg REST Catalog + MinIO"),(0,n.yg)("p",null,"The following configuration assumes MinIO defines authentication using the ",(0,n.yg)("inlineCode",{parentName:"p"},"s3.*")," prefix. Note the specification of ",(0,n.yg)("inlineCode",{parentName:"p"},"s3.endpoint"),", assuming\nMinIO listens on port ",(0,n.yg)("inlineCode",{parentName:"p"},"9000")," at ",(0,n.yg)("inlineCode",{parentName:"p"},"minio-host"),". The ",(0,n.yg)("inlineCode",{parentName:"p"},"uri")," parameter points at Iceberg REST Catalog (IRC) endpoint (in this case ",(0,n.yg)("inlineCode",{parentName:"p"},"iceberg-catalog:8181"),")."),(0,n.yg)("pre",null,(0,n.yg)("code",{parentName:"pre",className:"language-yaml"},'source:\n type: "iceberg"\n config:\n env: dev\n catalog:\n demo:\n type: "rest"\n uri: "http://iceberg-catalog:8181"\n s3.access-key-id: "${AWS_ACCESS_KEY_ID}"\n s3.secret-access-key: "${AWS_SECRET_ACCESS_KEY}"\n s3.region: "eu-east-1"\n s3.endpoint: "http://minio-host:9000"\n')),(0,n.yg)("h3",{id:"iceberg-rest-catalog-with-authentication--s3"},"Iceberg REST Catalog (with authentication) + S3"),(0,n.yg)("p",null,"This example assumes IRC requires token authentication (via ",(0,n.yg)("inlineCode",{parentName:"p"},"Authorization")," header). There are more options available,\nsee ",(0,n.yg)("a",{parentName:"p",href:"https://py.iceberg.apache.org/configuration/#rest-catalog"},"https://py.iceberg.apache.org/configuration/#rest-catalog")," for details. Moreover, the assumption here is that the\nenvironment (i.e. pod) is already authenticated to perform actions against AWS S3."),(0,n.yg)("pre",null,(0,n.yg)("code",{parentName:"pre",className:"language-yaml"},'source:\n type: "iceberg"\n config:\n env: dev\n catalog:\n demo:\n type: "rest"\n uri: "http://iceberg-catalog-uri"\n token: "token-value"\n s3.region: "us-west-2"\n')),(0,n.yg)("h4",{id:"special-rest-connection-parameters-for-resiliency"},"Special REST connection parameters for resiliency"),(0,n.yg)("p",null,"Unlike other parameters provided in the dictionary under the ",(0,n.yg)("inlineCode",{parentName:"p"},"catalog")," key, ",(0,n.yg)("inlineCode",{parentName:"p"},"connection")," parameter is a custom feature in\nDataHub, allowing to inject connection resiliency parameters to the REST connection made by the ingestor. ",(0,n.yg)("inlineCode",{parentName:"p"},"connection"),"\nallows for 2 parameters:"),(0,n.yg)("ul",null,(0,n.yg)("li",{parentName:"ul"},(0,n.yg)("inlineCode",{parentName:"li"},"timeout")," is provided as amount of seconds, it needs to be whole number (or ",(0,n.yg)("inlineCode",{parentName:"li"},"null")," to turn it off)"),(0,n.yg)("li",{parentName:"ul"},(0,n.yg)("inlineCode",{parentName:"li"},"retry")," is a complex object representing parameters used to create ",(0,n.yg)("a",{parentName:"li",href:"https://urllib3.readthedocs.io/en/latest/reference/urllib3.util.html#module-urllib3.util.retry"},"urllib3 Retry object"),".\nThere are many possible parameters, most important would be ",(0,n.yg)("inlineCode",{parentName:"li"},"total")," (total retries) and ",(0,n.yg)("inlineCode",{parentName:"li"},"backoff_factor"),". See the linked docs\nfor the details.")),(0,n.yg)("pre",null,(0,n.yg)("code",{parentName:"pre",className:"language-yaml"},'source:\n type: "iceberg"\n config:\n env: dev\n catalog:\n demo:\n type: "rest"\n uri: "http://iceberg-catalog-uri"\n connection:\n retry:\n backoff_factor: 0.5\n total: 3\n timeout: 120\n')),(0,n.yg)("h3",{id:"sql-catalog--azure-dls-as-the-warehouse"},"SQL catalog + Azure DLS as the warehouse"),(0,n.yg)("p",null,"This example targets ",(0,n.yg)("inlineCode",{parentName:"p"},"Postgres")," as the sql-type ",(0,n.yg)("inlineCode",{parentName:"p"},"Iceberg")," catalog and uses Azure DLS as the warehouse."),(0,n.yg)("pre",null,(0,n.yg)("code",{parentName:"pre",className:"language-yaml"},'source:\n type: "iceberg"\n config:\n env: dev\n catalog:\n demo:\n type: sql\n uri: postgresql+psycopg2://user:password@sqldatabase.postgres.database.azure.com:5432/icebergcatalog\n adlfs.tenant-id: \n adlfs.account-name: \n adlfs.client-id: \n adlfs.client-secret: \n')),(0,n.yg)("h2",{id:"concept-mapping"},"Concept Mapping"),(0,n.yg)("p",null,"This ingestion source maps the following Source System Concepts to DataHub Concepts:"),(0,n.yg)("table",null,(0,n.yg)("thead",{parentName:"table"},(0,n.yg)("tr",{parentName:"thead"},(0,n.yg)("th",{parentName:"tr",align:null},"Source Concept"),(0,n.yg)("th",{parentName:"tr",align:null},"DataHub Concept"),(0,n.yg)("th",{parentName:"tr",align:null},"Notes"))),(0,n.yg)("tbody",{parentName:"table"},(0,n.yg)("tr",{parentName:"tbody"},(0,n.yg)("td",{parentName:"tr",align:null},(0,n.yg)("inlineCode",{parentName:"td"},"iceberg")),(0,n.yg)("td",{parentName:"tr",align:null},(0,n.yg)("a",{parentName:"td",href:"/docs/generated/metamodel/entities/dataplatform"},"Data Platform")),(0,n.yg)("td",{parentName:"tr",align:null})),(0,n.yg)("tr",{parentName:"tbody"},(0,n.yg)("td",{parentName:"tr",align:null},"Table"),(0,n.yg)("td",{parentName:"tr",align:null},(0,n.yg)("a",{parentName:"td",href:"/docs/generated/metamodel/entities/dataset"},"Dataset")),(0,n.yg)("td",{parentName:"tr",align:null},"An Iceberg table is registered inside a catalog using a name, where the catalog is responsible for creating, dropping and renaming tables. Catalogs manage a collection of tables that are usually grouped into namespaces. The name of a table is mapped to a Dataset name. If a ",(0,n.yg)("a",{parentName:"td",href:"/docs/platform-instances/"},"Platform Instance")," is configured, it will be used as a prefix: ",(0,n.yg)("inlineCode",{parentName:"td"},".my.namespace.table"),".")),(0,n.yg)("tr",{parentName:"tbody"},(0,n.yg)("td",{parentName:"tr",align:null},(0,n.yg)("a",{parentName:"td",href:"https://iceberg.apache.org/docs/latest/configuration/#table-properties"},"Table property")),(0,n.yg)("td",{parentName:"tr",align:null},(0,n.yg)("a",{parentName:"td",href:"/docs/generated/metamodel/entities/corpuser"},"User (a.k.a CorpUser)")),(0,n.yg)("td",{parentName:"tr",align:null},"The value of a table property can be used as the name of a CorpUser owner. This table property name can be configured with the source option ",(0,n.yg)("inlineCode",{parentName:"td"},"user_ownership_property"),".")),(0,n.yg)("tr",{parentName:"tbody"},(0,n.yg)("td",{parentName:"tr",align:null},(0,n.yg)("a",{parentName:"td",href:"https://iceberg.apache.org/docs/latest/configuration/#table-properties"},"Table property")),(0,n.yg)("td",{parentName:"tr",align:null},"CorpGroup"),(0,n.yg)("td",{parentName:"tr",align:null},"The value of a table property can be used as the name of a CorpGroup owner. This table property name can be configured with the source option ",(0,n.yg)("inlineCode",{parentName:"td"},"group_ownership_property"),".")),(0,n.yg)("tr",{parentName:"tbody"},(0,n.yg)("td",{parentName:"tr",align:null},"Table parent folders (excluding ",(0,n.yg)("a",{parentName:"td",href:"https://iceberg.apache.org/docs/latest/configuration/#catalog-properties"},"warehouse catalog location"),")"),(0,n.yg)("td",{parentName:"tr",align:null},"Container"),(0,n.yg)("td",{parentName:"tr",align:null},"Available in a future release")),(0,n.yg)("tr",{parentName:"tbody"},(0,n.yg)("td",{parentName:"tr",align:null},(0,n.yg)("a",{parentName:"td",href:"https://iceberg.apache.org/spec/#schemas-and-data-types"},"Table schema")),(0,n.yg)("td",{parentName:"tr",align:null},"SchemaField"),(0,n.yg)("td",{parentName:"tr",align:null},"Maps to the fields defined within the Iceberg table schema definition.")))),(0,n.yg)("h2",{id:"troubleshooting"},"Troubleshooting"),(0,n.yg)("h3",{id:"exceptions-while-increasing-processing_threads"},"Exceptions while increasing ",(0,n.yg)("inlineCode",{parentName:"h3"},"processing_threads")),(0,n.yg)("p",null,"Each processing thread will open several files/sockets to download manifest files from blob storage. If you experience\nexceptions appearing when increasing ",(0,n.yg)("inlineCode",{parentName:"p"},"processing_threads")," configuration parameter, try to increase limit of open\nfiles (e.g. using ",(0,n.yg)("inlineCode",{parentName:"p"},"ulimit")," in Linux)."),(0,n.yg)("h2",{id:"datahub-iceberg-rest-catalog"},"DataHub Iceberg REST Catalog"),(0,n.yg)("p",null,"DataHub also implements the Iceberg REST Catalog. See the ",(0,n.yg)("a",{parentName:"p",href:"/docs/iceberg-catalog"},"Iceberg Catalog documentation")," for more details."),(0,n.yg)("h3",{id:"code-coordinates"},"Code Coordinates"),(0,n.yg)("ul",null,(0,n.yg)("li",{parentName:"ul"},"Class Name: ",(0,n.yg)("inlineCode",{parentName:"li"},"datahub.ingestion.source.iceberg.iceberg.IcebergSource")),(0,n.yg)("li",{parentName:"ul"},"Browse on ",(0,n.yg)("a",{parentName:"li",href:"https://github.com/datahub-project/datahub/blob/master/metadata-ingestion/src/datahub/ingestion/source/iceberg/iceberg.py"},"GitHub"))),(0,n.yg)("h2",null,"Questions"),(0,n.yg)("p",null,"If you've got any questions on configuring ingestion for Iceberg, feel free to ping us on ",(0,n.yg)("a",{parentName:"p",href:"https://datahub.com/slack"},"our Slack"),"."))}f.isMDXComponent=!0}}]);