datahub/assets/js/b4dca44a.41e08335.js
2025-08-22 14:09:31 +00:00

1 line
42 KiB
JavaScript

"use strict";(self.webpackChunkdocs_website=self.webpackChunkdocs_website||[]).push([[53912],{79531:(e,a,t)=>{t.r(a),t.d(a,{assets:()=>g,contentTitle:()=>p,default:()=>f,frontMatter:()=>d,metadata:()=>c,toc:()=>m});t(96540);var n=t(15680),s=t(53720),i=t(5400);function l(e,a,t){return a in e?Object.defineProperty(e,a,{value:t,enumerable:!0,configurable:!0,writable:!0}):e[a]=t,e}function r(e,a){return a=null!=a?a:{},Object.getOwnPropertyDescriptors?Object.defineProperties(e,Object.getOwnPropertyDescriptors(a)):function(e,a){var t=Object.keys(e);if(Object.getOwnPropertySymbols){var n=Object.getOwnPropertySymbols(e);a&&(n=n.filter((function(a){return Object.getOwnPropertyDescriptor(e,a).enumerable}))),t.push.apply(t,n)}return t}(Object(a)).forEach((function(t){Object.defineProperty(e,t,Object.getOwnPropertyDescriptor(a,t))})),e}function o(e,a){if(null==e)return{};var t,n,s=function(e,a){if(null==e)return{};var t,n,s={},i=Object.keys(e);for(n=0;n<i.length;n++)t=i[n],a.indexOf(t)>=0||(s[t]=e[t]);return s}(e,a);if(Object.getOwnPropertySymbols){var i=Object.getOwnPropertySymbols(e);for(n=0;n<i.length;n++)t=i[n],a.indexOf(t)>=0||Object.prototype.propertyIsEnumerable.call(e,t)&&(s[t]=e[t])}return s}const d={sidebar_position:11,title:"DataHub",slug:"/generated/ingestion/sources/datahub",custom_edit_url:"https://github.com/datahub-project/datahub/blob/master/docs/generated/ingestion/sources/datahub.md"},p="DataHub",c={unversionedId:"docs/generated/ingestion/sources/datahub",id:"version-1.1.0/docs/generated/ingestion/sources/datahub",title:"DataHub",description:"Migrate data from one DataHub instance to another.",source:"@site/versioned_docs/version-1.1.0/docs/generated/ingestion/sources/datahub.md",sourceDirName:"docs/generated/ingestion/sources",slug:"/generated/ingestion/sources/datahub",permalink:"/docs/1.1.0/generated/ingestion/sources/datahub",draft:!1,editUrl:"https://github.com/datahub-project/datahub/blob/master/docs/generated/ingestion/sources/datahub.md",tags:[],version:"1.1.0",sidebarPosition:11,frontMatter:{sidebar_position:11,title:"DataHub",slug:"/generated/ingestion/sources/datahub",custom_edit_url:"https://github.com/datahub-project/datahub/blob/master/docs/generated/ingestion/sources/datahub.md"},sidebar:"overviewSidebar",previous:{title:"Databricks",permalink:"/docs/1.1.0/generated/ingestion/sources/databricks"},next:{title:"DataHubApply",permalink:"/docs/1.1.0/generated/ingestion/sources/datahubapply"}},g={},m=[{value:"Important Capabilities",id:"important-capabilities",level:3},{value:"Overview",id:"overview",level:3},{value:"Stateful Ingestion",id:"stateful-ingestion",level:4},{value:"Limitations",id:"limitations",level:4},{value:"Performance",id:"performance",level:4},{value:"Exclusions",id:"exclusions",level:4},{value:"CLI based Ingestion",id:"cli-based-ingestion",level:3},{value:"Starter Recipe",id:"starter-recipe",level:3},{value:"Config Details",id:"config-details",level:3},{value:"Code Coordinates",id:"code-coordinates",level:3}],y={toc:m},u="wrapper";function f(e){var{components:a}=e,t=o(e,["components"]);return(0,n.yg)(u,r(function(e){for(var a=1;a<arguments.length;a++){var t=null!=arguments[a]?arguments[a]:{},n=Object.keys(t);"function"==typeof Object.getOwnPropertySymbols&&(n=n.concat(Object.getOwnPropertySymbols(t).filter((function(e){return Object.getOwnPropertyDescriptor(t,e).enumerable})))),n.forEach((function(a){l(e,a,t[a])}))}return e}({},y,t),{components:a,mdxType:"MDXLayout"}),(0,n.yg)("h1",{id:"datahub"},"DataHub"),(0,n.yg)("p",null,"Migrate data from one DataHub instance to another."),(0,n.yg)("p",null,"Requires direct access to the database, kafka broker, and kafka schema registry\nof the source DataHub instance.\n",(0,n.yg)("img",{parentName:"p",src:"https://img.shields.io/badge/support%20status-testing-lightgrey",alt:"Testing"})),(0,n.yg)("h3",{id:"important-capabilities"},"Important Capabilities"),(0,n.yg)("table",null,(0,n.yg)("thead",{parentName:"table"},(0,n.yg)("tr",{parentName:"thead"},(0,n.yg)("th",{parentName:"tr",align:null},"Capability"),(0,n.yg)("th",{parentName:"tr",align:null},"Status"),(0,n.yg)("th",{parentName:"tr",align:null},"Notes"))),(0,n.yg)("tbody",{parentName:"table"},(0,n.yg)("tr",{parentName:"tbody"},(0,n.yg)("td",{parentName:"tr",align:null},(0,n.yg)("a",{parentName:"td",href:"/docs/1.1.0/metadata-ingestion/docs/dev_guides/stateful#stale-entity-removal"},"Detect Deleted Entities")),(0,n.yg)("td",{parentName:"tr",align:null},"\u2705"),(0,n.yg)("td",{parentName:"tr",align:null},"Optionally enabled via ",(0,n.yg)("inlineCode",{parentName:"td"},"stateful_ingestion.remove_stale_metadata"))))),(0,n.yg)("h3",{id:"overview"},"Overview"),(0,n.yg)("p",null,"This source pulls data from two locations:"),(0,n.yg)("ul",null,(0,n.yg)("li",{parentName:"ul"},"The DataHub database, containing a single table holding all versioned aspects"),(0,n.yg)("li",{parentName:"ul"},"The DataHub Kafka cluster, reading from the ",(0,n.yg)("a",{parentName:"li",href:"/docs/1.1.0/what/mxe#metadata-change-log-mcl"},"MCL Log"),"\ntopic for timeseries aspects.")),(0,n.yg)("p",null,"All data is first read from the database, before timeseries data is ingested from kafka.\nTo prevent this source from potentially running forever, it will not ingest data produced after the\ndatahub_source ingestion job is started. This ",(0,n.yg)("inlineCode",{parentName:"p"},"stop_time")," is reflected in the report."),(0,n.yg)("p",null,"Data from the database and kafka are read in chronological order, specifically by the\ncreatedon timestamp in the database and by kafka offset per partition. In order to\nproperly read from the database, please ensure that the ",(0,n.yg)("inlineCode",{parentName:"p"},"createdon")," column is indexed.\nNewly created databases should have this index, named ",(0,n.yg)("inlineCode",{parentName:"p"},"timeIndex"),", by default, but older\nones you may have to create yourself, with the statement:"),(0,n.yg)("pre",null,(0,n.yg)("code",{parentName:"pre"},"CREATE INDEX timeIndex ON metadata_aspect_v2 (createdon);\n")),(0,n.yg)("p",null,(0,n.yg)("em",{parentName:"p"},"If you do not have this index, the source may run incredibly slowly and produce\nsignificant database load.")),(0,n.yg)("h4",{id:"stateful-ingestion"},"Stateful Ingestion"),(0,n.yg)("p",null,"On first run, the source will read from the earliest data in the database and the earliest\nkafka offsets. Every ",(0,n.yg)("inlineCode",{parentName:"p"},"commit_state_interval")," (default 1000) records, the source will store\na checkpoint to remember its place, i.e. the last createdon timestamp and kafka offsets.\nThis allows you to stop and restart the source without losing much progress, but note that\nyou will re-ingest some data at the start of the new run."),(0,n.yg)("p",null,"If any errors are encountered in the ingestion process, e.g. we are unable to emit an aspect\ndue to network errors, the source will keep running, but will stop committing checkpoints,\nunless ",(0,n.yg)("inlineCode",{parentName:"p"},"commit_with_parse_errors")," (default ",(0,n.yg)("inlineCode",{parentName:"p"},"false"),") is set. Thus, if you re-run the ingestion,\nyou can re-ingest the data that was missed, but note it will all re-ingest all subsequent data."),(0,n.yg)("p",null,"If you want to re-ingest all data, you can set a different ",(0,n.yg)("inlineCode",{parentName:"p"},"pipeline_name")," in your recipe,\nor set ",(0,n.yg)("inlineCode",{parentName:"p"},"stateful_ingestion.ignore_old_state"),":"),(0,n.yg)("pre",null,(0,n.yg)("code",{parentName:"pre",className:"language-yaml"},"source:\n config:\n # ... connection config, etc.\n stateful_ingestion:\n enabled: true\n ignore_old_state: true\n urn_pattern: # URN pattern to ignore/include in the ingestion\n deny:\n # Ignores all datahub metadata where the urn matches the regex\n - ^denied.urn.*\n allow:\n # Ingests all datahub metadata where the urn matches the regex.\n - ^allowed.urn.*\n")),(0,n.yg)("h4",{id:"limitations"},"Limitations"),(0,n.yg)("ul",null,(0,n.yg)("li",{parentName:"ul"},"Can only pull timeseries aspects retained by Kafka, which by default lasts 90 days."),(0,n.yg)("li",{parentName:"ul"},"Does not detect hard timeseries deletions, e.g. if via a ",(0,n.yg)("inlineCode",{parentName:"li"},"datahub delete")," command using the CLI.\nTherefore, if you deleted data in this way, it will still exist in the destination instance."),(0,n.yg)("li",{parentName:"ul"},"If you have a significant amount of aspects with the exact same ",(0,n.yg)("inlineCode",{parentName:"li"},"createdon")," timestamp,\nstateful ingestion will not be able to save checkpoints partially through that timestamp.\nOn a subsequent run, all aspects for that timestamp will be ingested.")),(0,n.yg)("h4",{id:"performance"},"Performance"),(0,n.yg)("p",null,"On your destination DataHub instance, we suggest the following settings:"),(0,n.yg)("ul",null,(0,n.yg)("li",{parentName:"ul"},"Enable ",(0,n.yg)("a",{parentName:"li",href:"/docs/1.1.0/deploy/environment-vars#ingestion"},"async ingestion")),(0,n.yg)("li",{parentName:"ul"},"Use standalone consumers\n(",(0,n.yg)("a",{parentName:"li",href:"/docs/1.1.0/metadata-jobs/mae-consumer-job"},"mae-consumer"),"\nand ",(0,n.yg)("a",{parentName:"li",href:"/docs/1.1.0/metadata-jobs/mce-consumer-job"},"mce-consumer"),")",(0,n.yg)("ul",{parentName:"li"},(0,n.yg)("li",{parentName:"ul"},"If you are migrating large amounts of data, consider scaling consumer replicas."))),(0,n.yg)("li",{parentName:"ul"},"Increase the number of gms pods to add redundancy and increase resilience to node evictions",(0,n.yg)("ul",{parentName:"li"},(0,n.yg)("li",{parentName:"ul"},"If you are migrating large amounts of data, consider increasing elasticsearch's\nthread count via the ",(0,n.yg)("inlineCode",{parentName:"li"},"ELASTICSEARCH_THREAD_COUNT")," environment variable.")))),(0,n.yg)("h4",{id:"exclusions"},"Exclusions"),(0,n.yg)("p",null,"You will likely want to exclude some urn types from your ingestion, as they contain instance-specific\nmetadata, such as settings, roles, policies, ingestion sources, and ingestion runs. For example, you\nwill likely want to start with this:"),(0,n.yg)("pre",null,(0,n.yg)("code",{parentName:"pre",className:"language-yaml"},"source:\n config:\n urn_pattern: # URN pattern to ignore/include in the ingestion\n deny:\n # Ignores all datahub metadata where the urn matches the regex\n - ^urn:li:role.* # Only exclude if you do not want to ingest roles\n - ^urn:li:dataHubRole.* # Only exclude if you do not want to ingest roles\n - ^urn:li:dataHubPolicy.* # Only exclude if you do not want to ingest policies\n - ^urn:li:dataHubIngestionSource.* # Only exclude if you do not want to ingest ingestion sources\n - ^urn:li:dataHubSecret.*\n - ^urn:li:dataHubExecutionRequest.*\n - ^urn:li:dataHubAccessToken.*\n - ^urn:li:dataHubUpgrade.*\n - ^urn:li:inviteToken.*\n - ^urn:li:globalSettings.*\n - ^urn:li:dataHubStepState.*\n")),(0,n.yg)("h3",{id:"cli-based-ingestion"},"CLI based Ingestion"),(0,n.yg)("h3",{id:"starter-recipe"},"Starter Recipe"),(0,n.yg)("p",null,"Check out the following recipe to get started with ingestion! See ",(0,n.yg)("a",{parentName:"p",href:"#config-details"},"below")," for full configuration options."),(0,n.yg)("p",null,"For general pointers on writing and running a recipe, see our ",(0,n.yg)("a",{parentName:"p",href:"/docs/1.1.0/metadata-ingestion#recipes"},"main recipe guide"),"."),(0,n.yg)("pre",null,(0,n.yg)("code",{parentName:"pre",className:"language-yaml"},'pipeline_name: datahub_source_1\ndatahub_api:\n server: "http://localhost:8080" # Migrate data from DataHub instance on localhost:8080\n token: "<token>"\nsource:\n type: datahub\n config:\n include_all_versions: false\n database_connection:\n scheme: "mysql+pymysql" # or "postgresql+psycopg2" for Postgres\n host_port: "<database_host>:<database_port>"\n username: "<username>"\n password: "<password>"\n database: "<database>"\n kafka_connection:\n bootstrap: "<boostrap_url>:9092"\n schema_registry_url: "<schema_registry_url>:8081"\n stateful_ingestion:\n enabled: true\n ignore_old_state: false\n urn_pattern:\n deny:\n # Ignores all datahub metadata where the urn matches the regex\n - ^denied.urn.*\n allow:\n # Ingests all datahub metadata where the urn matches the regex.\n - ^allowed.urn.*\n\nflags:\n set_system_metadata: false # Replicate system metadata\n\n# Here, we write to a DataHub instance\n# You can also use a different sink, e.g. to write the data to a file instead\nsink:\n type: datahub-rest\n config:\n server: "<destination_gms_url>"\n token: "<token>"\n\n')),(0,n.yg)("h3",{id:"config-details"},"Config Details"),(0,n.yg)(s.A,{mdxType:"Tabs"},(0,n.yg)(i.A,{value:"options",label:"Options",default:!0,mdxType:"TabItem"},(0,n.yg)("p",null,"Note that a ",(0,n.yg)("inlineCode",{parentName:"p"},".")," is used to denote nested fields in the YAML recipe."),(0,n.yg)("div",{className:"config-table"},(0,n.yg)("table",null,(0,n.yg)("thead",{parentName:"table"},(0,n.yg)("tr",{parentName:"thead"},(0,n.yg)("th",{parentName:"tr",align:"left"},"Field"),(0,n.yg)("th",{parentName:"tr",align:"left"},"Description"))),(0,n.yg)("tbody",{parentName:"table"},(0,n.yg)("tr",{parentName:"tbody"},(0,n.yg)("td",{parentName:"tr",align:"left"},(0,n.yg)("div",{className:"path-line"},(0,n.yg)("span",{className:"path-main"},"commit_state_interval"))," ",(0,n.yg)("div",{className:"type-name-line"},(0,n.yg)("span",{className:"type-name"},"integer"))),(0,n.yg)("td",{parentName:"tr",align:"left"},"Number of records to process before committing state ",(0,n.yg)("div",{className:"default-line default-line-with-docs"},"Default: ",(0,n.yg)("span",{className:"default-value"},"1000")))),(0,n.yg)("tr",{parentName:"tbody"},(0,n.yg)("td",{parentName:"tr",align:"left"},(0,n.yg)("div",{className:"path-line"},(0,n.yg)("span",{className:"path-main"},"commit_with_parse_errors"))," ",(0,n.yg)("div",{className:"type-name-line"},(0,n.yg)("span",{className:"type-name"},"boolean"))),(0,n.yg)("td",{parentName:"tr",align:"left"},"Whether to update createdon timestamp and kafka offset despite parse errors. Enable if you want to ignore the errors. ",(0,n.yg)("div",{className:"default-line default-line-with-docs"},"Default: ",(0,n.yg)("span",{className:"default-value"},"False")))),(0,n.yg)("tr",{parentName:"tbody"},(0,n.yg)("td",{parentName:"tr",align:"left"},(0,n.yg)("div",{className:"path-line"},(0,n.yg)("span",{className:"path-main"},"database_query_batch_size"))," ",(0,n.yg)("div",{className:"type-name-line"},(0,n.yg)("span",{className:"type-name"},"integer"))),(0,n.yg)("td",{parentName:"tr",align:"left"},"Number of records to fetch from the database at a time ",(0,n.yg)("div",{className:"default-line default-line-with-docs"},"Default: ",(0,n.yg)("span",{className:"default-value"},"10000")))),(0,n.yg)("tr",{parentName:"tbody"},(0,n.yg)("td",{parentName:"tr",align:"left"},(0,n.yg)("div",{className:"path-line"},(0,n.yg)("span",{className:"path-main"},"database_table_name"))," ",(0,n.yg)("div",{className:"type-name-line"},(0,n.yg)("span",{className:"type-name"},"string"))),(0,n.yg)("td",{parentName:"tr",align:"left"},"Name of database table containing all versioned aspects ",(0,n.yg)("div",{className:"default-line default-line-with-docs"},"Default: ",(0,n.yg)("span",{className:"default-value"},"metadata","_","aspect","_","v2")))),(0,n.yg)("tr",{parentName:"tbody"},(0,n.yg)("td",{parentName:"tr",align:"left"},(0,n.yg)("div",{className:"path-line"},(0,n.yg)("span",{className:"path-main"},"drop_duplicate_schema_fields"))," ",(0,n.yg)("div",{className:"type-name-line"},(0,n.yg)("span",{className:"type-name"},"boolean"))),(0,n.yg)("td",{parentName:"tr",align:"left"},"Whether to drop duplicate schema fields in the schemaMetadata aspect. Useful if the source system has duplicate field paths in the db, but we're pushing to a system with server-side duplicate checking. ",(0,n.yg)("div",{className:"default-line default-line-with-docs"},"Default: ",(0,n.yg)("span",{className:"default-value"},"False")))),(0,n.yg)("tr",{parentName:"tbody"},(0,n.yg)("td",{parentName:"tr",align:"left"},(0,n.yg)("div",{className:"path-line"},(0,n.yg)("span",{className:"path-main"},"include_all_versions"))," ",(0,n.yg)("div",{className:"type-name-line"},(0,n.yg)("span",{className:"type-name"},"boolean"))),(0,n.yg)("td",{parentName:"tr",align:"left"},"If enabled, include all versions of each aspect. Otherwise, only include the latest version of each aspect. ",(0,n.yg)("div",{className:"default-line default-line-with-docs"},"Default: ",(0,n.yg)("span",{className:"default-value"},"False")))),(0,n.yg)("tr",{parentName:"tbody"},(0,n.yg)("td",{parentName:"tr",align:"left"},(0,n.yg)("div",{className:"path-line"},(0,n.yg)("span",{className:"path-main"},"include_soft_deleted_entities"))," ",(0,n.yg)("div",{className:"type-name-line"},(0,n.yg)("span",{className:"type-name"},"boolean"))),(0,n.yg)("td",{parentName:"tr",align:"left"},"If enabled, include entities that have been soft deleted. Otherwise, include all entities regardless of removal status. ",(0,n.yg)("div",{className:"default-line default-line-with-docs"},"Default: ",(0,n.yg)("span",{className:"default-value"},"True")))),(0,n.yg)("tr",{parentName:"tbody"},(0,n.yg)("td",{parentName:"tr",align:"left"},(0,n.yg)("div",{className:"path-line"},(0,n.yg)("span",{className:"path-main"},"kafka_topic_name"))," ",(0,n.yg)("div",{className:"type-name-line"},(0,n.yg)("span",{className:"type-name"},"string"))),(0,n.yg)("td",{parentName:"tr",align:"left"},"Name of kafka topic containing timeseries MCLs ",(0,n.yg)("div",{className:"default-line default-line-with-docs"},"Default: ",(0,n.yg)("span",{className:"default-value"},"MetadataChangeLog","_","Timeseries","_","v1")))),(0,n.yg)("tr",{parentName:"tbody"},(0,n.yg)("td",{parentName:"tr",align:"left"},(0,n.yg)("div",{className:"path-line"},(0,n.yg)("span",{className:"path-main"},"max_workers"))," ",(0,n.yg)("div",{className:"type-name-line"},(0,n.yg)("span",{className:"type-name"},"integer"))),(0,n.yg)("td",{parentName:"tr",align:"left"},"Number of worker threads to use for datahub api ingestion. ",(0,n.yg)("div",{className:"default-line default-line-with-docs"},"Default: ",(0,n.yg)("span",{className:"default-value"},"70")))),(0,n.yg)("tr",{parentName:"tbody"},(0,n.yg)("td",{parentName:"tr",align:"left"},(0,n.yg)("div",{className:"path-line"},(0,n.yg)("span",{className:"path-main"},"pull_from_datahub_api"))," ",(0,n.yg)("div",{className:"type-name-line"},(0,n.yg)("span",{className:"type-name"},"boolean"))),(0,n.yg)("td",{parentName:"tr",align:"left"},"Use the DataHub API to fetch versioned aspects. ",(0,n.yg)("div",{className:"default-line default-line-with-docs"},"Default: ",(0,n.yg)("span",{className:"default-value"},"False")))),(0,n.yg)("tr",{parentName:"tbody"},(0,n.yg)("td",{parentName:"tr",align:"left"},(0,n.yg)("div",{className:"path-line"},(0,n.yg)("span",{className:"path-main"},"database_connection"))," ",(0,n.yg)("div",{className:"type-name-line"},(0,n.yg)("span",{className:"type-name"},"SQLAlchemyConnectionConfig"))),(0,n.yg)("td",{parentName:"tr",align:"left"},"Database connection config")),(0,n.yg)("tr",{parentName:"tbody"},(0,n.yg)("td",{parentName:"tr",align:"left"},(0,n.yg)("div",{className:"path-line"},(0,n.yg)("span",{className:"path-prefix"},"database_connection."),(0,n.yg)("span",{className:"path-main"},"host_port"),"\xa0",(0,n.yg)("abbr",{title:"Required if database_connection is set"},"\u2753"))," ",(0,n.yg)("div",{className:"type-name-line"},(0,n.yg)("span",{className:"type-name"},"string"))),(0,n.yg)("td",{parentName:"tr",align:"left"},"host URL")),(0,n.yg)("tr",{parentName:"tbody"},(0,n.yg)("td",{parentName:"tr",align:"left"},(0,n.yg)("div",{className:"path-line"},(0,n.yg)("span",{className:"path-prefix"},"database_connection."),(0,n.yg)("span",{className:"path-main"},"scheme"),"\xa0",(0,n.yg)("abbr",{title:"Required if database_connection is set"},"\u2753"))," ",(0,n.yg)("div",{className:"type-name-line"},(0,n.yg)("span",{className:"type-name"},"string"))),(0,n.yg)("td",{parentName:"tr",align:"left"},"scheme")),(0,n.yg)("tr",{parentName:"tbody"},(0,n.yg)("td",{parentName:"tr",align:"left"},(0,n.yg)("div",{className:"path-line"},(0,n.yg)("span",{className:"path-prefix"},"database_connection."),(0,n.yg)("span",{className:"path-main"},"database"))," ",(0,n.yg)("div",{className:"type-name-line"},(0,n.yg)("span",{className:"type-name"},"string"))),(0,n.yg)("td",{parentName:"tr",align:"left"},"database (catalog)")),(0,n.yg)("tr",{parentName:"tbody"},(0,n.yg)("td",{parentName:"tr",align:"left"},(0,n.yg)("div",{className:"path-line"},(0,n.yg)("span",{className:"path-prefix"},"database_connection."),(0,n.yg)("span",{className:"path-main"},"options"))," ",(0,n.yg)("div",{className:"type-name-line"},(0,n.yg)("span",{className:"type-name"},"object"))),(0,n.yg)("td",{parentName:"tr",align:"left"},"Any options specified here will be passed to ",(0,n.yg)("a",{parentName:"td",href:"https://docs.sqlalchemy.org/en/14/core/engines.html#sqlalchemy.create_engine"},"SQLAlchemy.create_engine")," as kwargs. To set connection arguments in the URL, specify them under ",(0,n.yg)("inlineCode",{parentName:"td"},"connect_args"),".")),(0,n.yg)("tr",{parentName:"tbody"},(0,n.yg)("td",{parentName:"tr",align:"left"},(0,n.yg)("div",{className:"path-line"},(0,n.yg)("span",{className:"path-prefix"},"database_connection."),(0,n.yg)("span",{className:"path-main"},"password"))," ",(0,n.yg)("div",{className:"type-name-line"},(0,n.yg)("span",{className:"type-name"},"string(password)"))),(0,n.yg)("td",{parentName:"tr",align:"left"},"password")),(0,n.yg)("tr",{parentName:"tbody"},(0,n.yg)("td",{parentName:"tr",align:"left"},(0,n.yg)("div",{className:"path-line"},(0,n.yg)("span",{className:"path-prefix"},"database_connection."),(0,n.yg)("span",{className:"path-main"},"sqlalchemy_uri"))," ",(0,n.yg)("div",{className:"type-name-line"},(0,n.yg)("span",{className:"type-name"},"string"))),(0,n.yg)("td",{parentName:"tr",align:"left"},"URI of database to connect to. See ",(0,n.yg)("a",{parentName:"td",href:"https://docs.sqlalchemy.org/en/14/core/engines.html#database-urls"},"https://docs.sqlalchemy.org/en/14/core/engines.html#database-urls"),". Takes precedence over other connection parameters.")),(0,n.yg)("tr",{parentName:"tbody"},(0,n.yg)("td",{parentName:"tr",align:"left"},(0,n.yg)("div",{className:"path-line"},(0,n.yg)("span",{className:"path-prefix"},"database_connection."),(0,n.yg)("span",{className:"path-main"},"username"))," ",(0,n.yg)("div",{className:"type-name-line"},(0,n.yg)("span",{className:"type-name"},"string"))),(0,n.yg)("td",{parentName:"tr",align:"left"},"username")),(0,n.yg)("tr",{parentName:"tbody"},(0,n.yg)("td",{parentName:"tr",align:"left"},(0,n.yg)("div",{className:"path-line"},(0,n.yg)("span",{className:"path-main"},"exclude_aspects"))," ",(0,n.yg)("div",{className:"type-name-line"},(0,n.yg)("span",{className:"type-name"},"array"))),(0,n.yg)("td",{parentName:"tr",align:"left"},"Set of aspect names to exclude from ingestion ",(0,n.yg)("div",{className:"default-line default-line-with-docs"},"Default: ",(0,n.yg)("span",{className:"default-value"},"[","'","globalSettingsInfo","'",", ","'","dataHubExecutionRequestSig...")))),(0,n.yg)("tr",{parentName:"tbody"},(0,n.yg)("td",{parentName:"tr",align:"left"},(0,n.yg)("div",{className:"path-line"},(0,n.yg)("span",{className:"path-prefix"},"exclude_aspects."),(0,n.yg)("span",{className:"path-main"},"string"))," ",(0,n.yg)("div",{className:"type-name-line"},(0,n.yg)("span",{className:"type-name"},"string"))),(0,n.yg)("td",{parentName:"tr",align:"left"})),(0,n.yg)("tr",{parentName:"tbody"},(0,n.yg)("td",{parentName:"tr",align:"left"},(0,n.yg)("div",{className:"path-line"},(0,n.yg)("span",{className:"path-main"},"kafka_connection"))," ",(0,n.yg)("div",{className:"type-name-line"},(0,n.yg)("span",{className:"type-name"},"KafkaConsumerConnectionConfig"))),(0,n.yg)("td",{parentName:"tr",align:"left"},"Kafka connection config")),(0,n.yg)("tr",{parentName:"tbody"},(0,n.yg)("td",{parentName:"tr",align:"left"},(0,n.yg)("div",{className:"path-line"},(0,n.yg)("span",{className:"path-prefix"},"kafka_connection."),(0,n.yg)("span",{className:"path-main"},"bootstrap"))," ",(0,n.yg)("div",{className:"type-name-line"},(0,n.yg)("span",{className:"type-name"},"string"))),(0,n.yg)("td",{parentName:"tr",align:"left"},(0,n.yg)("div",{className:"default-line "},"Default: ",(0,n.yg)("span",{className:"default-value"},"localhost:9092")))),(0,n.yg)("tr",{parentName:"tbody"},(0,n.yg)("td",{parentName:"tr",align:"left"},(0,n.yg)("div",{className:"path-line"},(0,n.yg)("span",{className:"path-prefix"},"kafka_connection."),(0,n.yg)("span",{className:"path-main"},"client_timeout_seconds"))," ",(0,n.yg)("div",{className:"type-name-line"},(0,n.yg)("span",{className:"type-name"},"integer"))),(0,n.yg)("td",{parentName:"tr",align:"left"},"The request timeout used when interacting with the Kafka APIs. ",(0,n.yg)("div",{className:"default-line default-line-with-docs"},"Default: ",(0,n.yg)("span",{className:"default-value"},"60")))),(0,n.yg)("tr",{parentName:"tbody"},(0,n.yg)("td",{parentName:"tr",align:"left"},(0,n.yg)("div",{className:"path-line"},(0,n.yg)("span",{className:"path-prefix"},"kafka_connection."),(0,n.yg)("span",{className:"path-main"},"consumer_config"))," ",(0,n.yg)("div",{className:"type-name-line"},(0,n.yg)("span",{className:"type-name"},"object"))),(0,n.yg)("td",{parentName:"tr",align:"left"},"Extra consumer config serialized as JSON. These options will be passed into Kafka's DeserializingConsumer. See ",(0,n.yg)("a",{parentName:"td",href:"https://docs.confluent.io/platform/current/clients/confluent-kafka-python/html/index.html#deserializingconsumer"},"https://docs.confluent.io/platform/current/clients/confluent-kafka-python/html/index.html#deserializingconsumer")," and ",(0,n.yg)("a",{parentName:"td",href:"https://github.com/edenhill/librdkafka/blob/master/CONFIGURATION.md"},"https://github.com/edenhill/librdkafka/blob/master/CONFIGURATION.md")," .")),(0,n.yg)("tr",{parentName:"tbody"},(0,n.yg)("td",{parentName:"tr",align:"left"},(0,n.yg)("div",{className:"path-line"},(0,n.yg)("span",{className:"path-prefix"},"kafka_connection."),(0,n.yg)("span",{className:"path-main"},"schema_registry_config"))," ",(0,n.yg)("div",{className:"type-name-line"},(0,n.yg)("span",{className:"type-name"},"object"))),(0,n.yg)("td",{parentName:"tr",align:"left"},"Extra schema registry config serialized as JSON. These options will be passed into Kafka's SchemaRegistryClient. ",(0,n.yg)("a",{parentName:"td",href:"https://docs.confluent.io/platform/current/clients/confluent-kafka-python/html/index.html?#schemaregistryclient"},"https://docs.confluent.io/platform/current/clients/confluent-kafka-python/html/index.html?#schemaregistryclient"))),(0,n.yg)("tr",{parentName:"tbody"},(0,n.yg)("td",{parentName:"tr",align:"left"},(0,n.yg)("div",{className:"path-line"},(0,n.yg)("span",{className:"path-prefix"},"kafka_connection."),(0,n.yg)("span",{className:"path-main"},"schema_registry_url"))," ",(0,n.yg)("div",{className:"type-name-line"},(0,n.yg)("span",{className:"type-name"},"string"))),(0,n.yg)("td",{parentName:"tr",align:"left"},(0,n.yg)("div",{className:"default-line "},"Default: ",(0,n.yg)("span",{className:"default-value"},"http://localhost:8080/schema-registry/api/")))),(0,n.yg)("tr",{parentName:"tbody"},(0,n.yg)("td",{parentName:"tr",align:"left"},(0,n.yg)("div",{className:"path-line"},(0,n.yg)("span",{className:"path-main"},"urn_pattern"))," ",(0,n.yg)("div",{className:"type-name-line"},(0,n.yg)("span",{className:"type-name"},"AllowDenyPattern"))),(0,n.yg)("td",{parentName:"tr",align:"left"},(0,n.yg)("div",{className:"default-line "},"Default: ",(0,n.yg)("span",{className:"default-value"},"{","'","allow","'",": ","[","'",".","*","'","]",", ","'","deny","'",": ","[","]",", ","'","ignoreCase","'",": True","}")))),(0,n.yg)("tr",{parentName:"tbody"},(0,n.yg)("td",{parentName:"tr",align:"left"},(0,n.yg)("div",{className:"path-line"},(0,n.yg)("span",{className:"path-prefix"},"urn_pattern."),(0,n.yg)("span",{className:"path-main"},"ignoreCase"))," ",(0,n.yg)("div",{className:"type-name-line"},(0,n.yg)("span",{className:"type-name"},"boolean"))),(0,n.yg)("td",{parentName:"tr",align:"left"},"Whether to ignore case sensitivity during pattern matching. ",(0,n.yg)("div",{className:"default-line default-line-with-docs"},"Default: ",(0,n.yg)("span",{className:"default-value"},"True")))),(0,n.yg)("tr",{parentName:"tbody"},(0,n.yg)("td",{parentName:"tr",align:"left"},(0,n.yg)("div",{className:"path-line"},(0,n.yg)("span",{className:"path-prefix"},"urn_pattern."),(0,n.yg)("span",{className:"path-main"},"allow"))," ",(0,n.yg)("div",{className:"type-name-line"},(0,n.yg)("span",{className:"type-name"},"array"))),(0,n.yg)("td",{parentName:"tr",align:"left"},"List of regex patterns to include in ingestion ",(0,n.yg)("div",{className:"default-line default-line-with-docs"},"Default: ",(0,n.yg)("span",{className:"default-value"},"[","'",".","*","'","]")))),(0,n.yg)("tr",{parentName:"tbody"},(0,n.yg)("td",{parentName:"tr",align:"left"},(0,n.yg)("div",{className:"path-line"},(0,n.yg)("span",{className:"path-prefix"},"urn_pattern.allow."),(0,n.yg)("span",{className:"path-main"},"string"))," ",(0,n.yg)("div",{className:"type-name-line"},(0,n.yg)("span",{className:"type-name"},"string"))),(0,n.yg)("td",{parentName:"tr",align:"left"})),(0,n.yg)("tr",{parentName:"tbody"},(0,n.yg)("td",{parentName:"tr",align:"left"},(0,n.yg)("div",{className:"path-line"},(0,n.yg)("span",{className:"path-prefix"},"urn_pattern."),(0,n.yg)("span",{className:"path-main"},"deny"))," ",(0,n.yg)("div",{className:"type-name-line"},(0,n.yg)("span",{className:"type-name"},"array"))),(0,n.yg)("td",{parentName:"tr",align:"left"},"List of regex patterns to exclude from ingestion. ",(0,n.yg)("div",{className:"default-line default-line-with-docs"},"Default: ",(0,n.yg)("span",{className:"default-value"},"[","]")))),(0,n.yg)("tr",{parentName:"tbody"},(0,n.yg)("td",{parentName:"tr",align:"left"},(0,n.yg)("div",{className:"path-line"},(0,n.yg)("span",{className:"path-prefix"},"urn_pattern.deny."),(0,n.yg)("span",{className:"path-main"},"string"))," ",(0,n.yg)("div",{className:"type-name-line"},(0,n.yg)("span",{className:"type-name"},"string"))),(0,n.yg)("td",{parentName:"tr",align:"left"})),(0,n.yg)("tr",{parentName:"tbody"},(0,n.yg)("td",{parentName:"tr",align:"left"},(0,n.yg)("div",{className:"path-line"},(0,n.yg)("span",{className:"path-main"},"stateful_ingestion"))," ",(0,n.yg)("div",{className:"type-name-line"},(0,n.yg)("span",{className:"type-name"},"StatefulIngestionConfig"))),(0,n.yg)("td",{parentName:"tr",align:"left"},"Stateful Ingestion Config ",(0,n.yg)("div",{className:"default-line default-line-with-docs"},"Default: ",(0,n.yg)("span",{className:"default-value"},"{","'","enabled","'",": True, ","'","max","_","checkpoint","_","state","_","size","'",": 167...")))),(0,n.yg)("tr",{parentName:"tbody"},(0,n.yg)("td",{parentName:"tr",align:"left"},(0,n.yg)("div",{className:"path-line"},(0,n.yg)("span",{className:"path-prefix"},"stateful_ingestion."),(0,n.yg)("span",{className:"path-main"},"enabled"))," ",(0,n.yg)("div",{className:"type-name-line"},(0,n.yg)("span",{className:"type-name"},"boolean"))),(0,n.yg)("td",{parentName:"tr",align:"left"},"Whether or not to enable stateful ingest. Default: True if a pipeline_name is set and either a datahub-rest sink or ",(0,n.yg)("inlineCode",{parentName:"td"},"datahub_api")," is specified, otherwise False ",(0,n.yg)("div",{className:"default-line default-line-with-docs"},"Default: ",(0,n.yg)("span",{className:"default-value"},"False")))))))),(0,n.yg)(i.A,{value:"schema",label:"Schema",mdxType:"TabItem"},(0,n.yg)("p",null,"The ",(0,n.yg)("a",{parentName:"p",href:"https://json-schema.org/"},"JSONSchema")," for this configuration is inlined below."),(0,n.yg)("pre",null,(0,n.yg)("code",{parentName:"pre",className:"language-javascript"},'{\n "title": "DataHubSourceConfig",\n "description": "Base configuration class for stateful ingestion for source configs to inherit from.",\n "type": "object",\n "properties": {\n "stateful_ingestion": {\n "title": "Stateful Ingestion",\n "description": "Stateful Ingestion Config",\n "default": {\n "enabled": true,\n "max_checkpoint_state_size": 16777216,\n "state_provider": {\n "type": "datahub",\n "config": {}\n },\n "ignore_old_state": false,\n "ignore_new_state": false\n },\n "allOf": [\n {\n "$ref": "#/definitions/StatefulIngestionConfig"\n }\n ]\n },\n "database_connection": {\n "title": "Database Connection",\n "description": "Database connection config",\n "allOf": [\n {\n "$ref": "#/definitions/SQLAlchemyConnectionConfig"\n }\n ]\n },\n "kafka_connection": {\n "title": "Kafka Connection",\n "description": "Kafka connection config",\n "allOf": [\n {\n "$ref": "#/definitions/KafkaConsumerConnectionConfig"\n }\n ]\n },\n "include_all_versions": {\n "title": "Include All Versions",\n "description": "If enabled, include all versions of each aspect. Otherwise, only include the latest version of each aspect. ",\n "default": false,\n "type": "boolean"\n },\n "include_soft_deleted_entities": {\n "title": "Include Soft Deleted Entities",\n "description": "If enabled, include entities that have been soft deleted. Otherwise, include all entities regardless of removal status. ",\n "default": true,\n "type": "boolean"\n },\n "exclude_aspects": {\n "title": "Exclude Aspects",\n "description": "Set of aspect names to exclude from ingestion",\n "default": [\n "globalSettingsInfo",\n "dataHubExecutionRequestSignal",\n "globalSettingsKey",\n "dataHubExecutionRequestInput",\n "datahubIngestionRunSummary",\n "dataHubSecretValue",\n "testResults",\n "datahubIngestionCheckpoint",\n "dataHubExecutionRequestKey",\n "dataHubExecutionRequestResult",\n "dataHubIngestionSourceKey",\n "dataHubSecretKey",\n "dataHubIngestionSourceInfo"\n ],\n "type": "array",\n "items": {\n "type": "string"\n },\n "uniqueItems": true\n },\n "database_query_batch_size": {\n "title": "Database Query Batch Size",\n "description": "Number of records to fetch from the database at a time",\n "default": 10000,\n "type": "integer"\n },\n "database_table_name": {\n "title": "Database Table Name",\n "description": "Name of database table containing all versioned aspects",\n "default": "metadata_aspect_v2",\n "type": "string"\n },\n "kafka_topic_name": {\n "title": "Kafka Topic Name",\n "description": "Name of kafka topic containing timeseries MCLs",\n "default": "MetadataChangeLog_Timeseries_v1",\n "type": "string"\n },\n "commit_state_interval": {\n "title": "Commit State Interval",\n "description": "Number of records to process before committing state",\n "default": 1000,\n "type": "integer"\n },\n "commit_with_parse_errors": {\n "title": "Commit With Parse Errors",\n "description": "Whether to update createdon timestamp and kafka offset despite parse errors. Enable if you want to ignore the errors.",\n "default": false,\n "type": "boolean"\n },\n "pull_from_datahub_api": {\n "title": "Pull From Datahub Api",\n "description": "Use the DataHub API to fetch versioned aspects.",\n "default": false,\n "hidden_from_docs": true,\n "type": "boolean"\n },\n "max_workers": {\n "title": "Max Workers",\n "description": "Number of worker threads to use for datahub api ingestion.",\n "default": 70,\n "hidden_from_docs": true,\n "type": "integer"\n },\n "urn_pattern": {\n "title": "Urn Pattern",\n "default": {\n "allow": [\n ".*"\n ],\n "deny": [],\n "ignoreCase": true\n },\n "allOf": [\n {\n "$ref": "#/definitions/AllowDenyPattern"\n }\n ]\n },\n "drop_duplicate_schema_fields": {\n "title": "Drop Duplicate Schema Fields",\n "description": "Whether to drop duplicate schema fields in the schemaMetadata aspect. Useful if the source system has duplicate field paths in the db, but we\'re pushing to a system with server-side duplicate checking.",\n "default": false,\n "type": "boolean"\n }\n },\n "definitions": {\n "DynamicTypedStateProviderConfig": {\n "title": "DynamicTypedStateProviderConfig",\n "type": "object",\n "properties": {\n "type": {\n "title": "Type",\n "description": "The type of the state provider to use. For DataHub use `datahub`",\n "type": "string"\n },\n "config": {\n "title": "Config",\n "description": "The configuration required for initializing the state provider. Default: The datahub_api config if set at pipeline level. Otherwise, the default DatahubClientConfig. See the defaults (https://github.com/datahub-project/datahub/blob/master/metadata-ingestion/src/datahub/ingestion/graph/client.py#L19).",\n "default": {},\n "type": "object"\n }\n },\n "required": [\n "type"\n ],\n "additionalProperties": false\n },\n "StatefulIngestionConfig": {\n "title": "StatefulIngestionConfig",\n "description": "Basic Stateful Ingestion Specific Configuration for any source.",\n "type": "object",\n "properties": {\n "enabled": {\n "title": "Enabled",\n "description": "Whether or not to enable stateful ingest. Default: True if a pipeline_name is set and either a datahub-rest sink or `datahub_api` is specified, otherwise False",\n "default": false,\n "type": "boolean"\n }\n },\n "additionalProperties": false\n },\n "SQLAlchemyConnectionConfig": {\n "title": "SQLAlchemyConnectionConfig",\n "type": "object",\n "properties": {\n "username": {\n "title": "Username",\n "description": "username",\n "type": "string"\n },\n "password": {\n "title": "Password",\n "description": "password",\n "type": "string",\n "writeOnly": true,\n "format": "password"\n },\n "host_port": {\n "title": "Host Port",\n "description": "host URL",\n "type": "string"\n },\n "database": {\n "title": "Database",\n "description": "database (catalog)",\n "type": "string"\n },\n "scheme": {\n "title": "Scheme",\n "description": "scheme",\n "type": "string"\n },\n "sqlalchemy_uri": {\n "title": "Sqlalchemy Uri",\n "description": "URI of database to connect to. See https://docs.sqlalchemy.org/en/14/core/engines.html#database-urls. Takes precedence over other connection parameters.",\n "type": "string"\n },\n "options": {\n "title": "Options",\n "description": "Any options specified here will be passed to [SQLAlchemy.create_engine](https://docs.sqlalchemy.org/en/14/core/engines.html#sqlalchemy.create_engine) as kwargs. To set connection arguments in the URL, specify them under `connect_args`.",\n "type": "object"\n }\n },\n "required": [\n "host_port",\n "scheme"\n ],\n "additionalProperties": false\n },\n "KafkaConsumerConnectionConfig": {\n "title": "KafkaConsumerConnectionConfig",\n "description": "Configuration class for holding connectivity information for Kafka consumers",\n "type": "object",\n "properties": {\n "bootstrap": {\n "title": "Bootstrap",\n "default": "localhost:9092",\n "type": "string"\n },\n "schema_registry_url": {\n "title": "Schema Registry Url",\n "default": "http://localhost:8080/schema-registry/api/",\n "type": "string"\n },\n "schema_registry_config": {\n "title": "Schema Registry Config",\n "description": "Extra schema registry config serialized as JSON. These options will be passed into Kafka\'s SchemaRegistryClient. https://docs.confluent.io/platform/current/clients/confluent-kafka-python/html/index.html?#schemaregistryclient",\n "type": "object"\n },\n "client_timeout_seconds": {\n "title": "Client Timeout Seconds",\n "description": "The request timeout used when interacting with the Kafka APIs.",\n "default": 60,\n "type": "integer"\n },\n "consumer_config": {\n "title": "Consumer Config",\n "description": "Extra consumer config serialized as JSON. These options will be passed into Kafka\'s DeserializingConsumer. See https://docs.confluent.io/platform/current/clients/confluent-kafka-python/html/index.html#deserializingconsumer and https://github.com/edenhill/librdkafka/blob/master/CONFIGURATION.md .",\n "type": "object"\n }\n },\n "additionalProperties": false\n },\n "AllowDenyPattern": {\n "title": "AllowDenyPattern",\n "description": "A class to store allow deny regexes",\n "type": "object",\n "properties": {\n "allow": {\n "title": "Allow",\n "description": "List of regex patterns to include in ingestion",\n "default": [\n ".*"\n ],\n "type": "array",\n "items": {\n "type": "string"\n }\n },\n "deny": {\n "title": "Deny",\n "description": "List of regex patterns to exclude from ingestion.",\n "default": [],\n "type": "array",\n "items": {\n "type": "string"\n }\n },\n "ignoreCase": {\n "title": "Ignorecase",\n "description": "Whether to ignore case sensitivity during pattern matching.",\n "default": true,\n "type": "boolean"\n }\n },\n "additionalProperties": false\n }\n }\n}\n')))),(0,n.yg)("h3",{id:"code-coordinates"},"Code Coordinates"),(0,n.yg)("ul",null,(0,n.yg)("li",{parentName:"ul"},"Class Name: ",(0,n.yg)("inlineCode",{parentName:"li"},"datahub.ingestion.source.datahub.datahub_source.DataHubSource")),(0,n.yg)("li",{parentName:"ul"},"Browse on ",(0,n.yg)("a",{parentName:"li",href:"https://github.com/datahub-project/datahub/blob/master/metadata-ingestion/src/datahub/ingestion/source/datahub/datahub_source.py"},"GitHub"))),(0,n.yg)("h2",null,"Questions"),(0,n.yg)("p",null,"If you've got any questions on configuring ingestion for DataHub, feel free to ping us on ",(0,n.yg)("a",{parentName:"p",href:"https://datahub.com/slack"},"our Slack"),"."))}f.isMDXComponent=!0}}]);