"use strict";(self.webpackChunkdocs_website=self.webpackChunkdocs_website||[]).push([[53912],{79531:(e,a,t)=>{t.r(a),t.d(a,{assets:()=>g,contentTitle:()=>p,default:()=>f,frontMatter:()=>d,metadata:()=>c,toc:()=>m});t(96540);var n=t(15680),s=t(53720),i=t(5400);function l(e,a,t){return a in e?Object.defineProperty(e,a,{value:t,enumerable:!0,configurable:!0,writable:!0}):e[a]=t,e}function r(e,a){return a=null!=a?a:{},Object.getOwnPropertyDescriptors?Object.defineProperties(e,Object.getOwnPropertyDescriptors(a)):function(e,a){var t=Object.keys(e);if(Object.getOwnPropertySymbols){var n=Object.getOwnPropertySymbols(e);a&&(n=n.filter((function(a){return Object.getOwnPropertyDescriptor(e,a).enumerable}))),t.push.apply(t,n)}return t}(Object(a)).forEach((function(t){Object.defineProperty(e,t,Object.getOwnPropertyDescriptor(a,t))})),e}function o(e,a){if(null==e)return{};var t,n,s=function(e,a){if(null==e)return{};var t,n,s={},i=Object.keys(e);for(n=0;n=0||(s[t]=e[t]);return s}(e,a);if(Object.getOwnPropertySymbols){var i=Object.getOwnPropertySymbols(e);for(n=0;n=0||Object.prototype.propertyIsEnumerable.call(e,t)&&(s[t]=e[t])}return s}const d={sidebar_position:11,title:"DataHub",slug:"/generated/ingestion/sources/datahub",custom_edit_url:"https://github.com/datahub-project/datahub/blob/master/docs/generated/ingestion/sources/datahub.md"},p="DataHub",c={unversionedId:"docs/generated/ingestion/sources/datahub",id:"version-1.1.0/docs/generated/ingestion/sources/datahub",title:"DataHub",description:"Migrate data from one DataHub instance to another.",source:"@site/versioned_docs/version-1.1.0/docs/generated/ingestion/sources/datahub.md",sourceDirName:"docs/generated/ingestion/sources",slug:"/generated/ingestion/sources/datahub",permalink:"/docs/1.1.0/generated/ingestion/sources/datahub",draft:!1,editUrl:"https://github.com/datahub-project/datahub/blob/master/docs/generated/ingestion/sources/datahub.md",tags:[],version:"1.1.0",sidebarPosition:11,frontMatter:{sidebar_position:11,title:"DataHub",slug:"/generated/ingestion/sources/datahub",custom_edit_url:"https://github.com/datahub-project/datahub/blob/master/docs/generated/ingestion/sources/datahub.md"},sidebar:"overviewSidebar",previous:{title:"Databricks",permalink:"/docs/1.1.0/generated/ingestion/sources/databricks"},next:{title:"DataHubApply",permalink:"/docs/1.1.0/generated/ingestion/sources/datahubapply"}},g={},m=[{value:"Important Capabilities",id:"important-capabilities",level:3},{value:"Overview",id:"overview",level:3},{value:"Stateful Ingestion",id:"stateful-ingestion",level:4},{value:"Limitations",id:"limitations",level:4},{value:"Performance",id:"performance",level:4},{value:"Exclusions",id:"exclusions",level:4},{value:"CLI based Ingestion",id:"cli-based-ingestion",level:3},{value:"Starter Recipe",id:"starter-recipe",level:3},{value:"Config Details",id:"config-details",level:3},{value:"Code Coordinates",id:"code-coordinates",level:3}],y={toc:m},u="wrapper";function f(e){var{components:a}=e,t=o(e,["components"]);return(0,n.yg)(u,r(function(e){for(var a=1;a"\nsource:\n type: datahub\n config:\n include_all_versions: false\n database_connection:\n scheme: "mysql+pymysql" # or "postgresql+psycopg2" for Postgres\n host_port: ":"\n username: ""\n password: ""\n database: ""\n kafka_connection:\n bootstrap: ":9092"\n schema_registry_url: ":8081"\n stateful_ingestion:\n enabled: true\n ignore_old_state: false\n urn_pattern:\n deny:\n # Ignores all datahub metadata where the urn matches the regex\n - ^denied.urn.*\n allow:\n # Ingests all datahub metadata where the urn matches the regex.\n - ^allowed.urn.*\n\nflags:\n set_system_metadata: false # Replicate system metadata\n\n# Here, we write to a DataHub instance\n# You can also use a different sink, e.g. to write the data to a file instead\nsink:\n type: datahub-rest\n config:\n server: ""\n token: ""\n\n')),(0,n.yg)("h3",{id:"config-details"},"Config Details"),(0,n.yg)(s.A,{mdxType:"Tabs"},(0,n.yg)(i.A,{value:"options",label:"Options",default:!0,mdxType:"TabItem"},(0,n.yg)("p",null,"Note that a ",(0,n.yg)("inlineCode",{parentName:"p"},".")," is used to denote nested fields in the YAML recipe."),(0,n.yg)("div",{className:"config-table"},(0,n.yg)("table",null,(0,n.yg)("thead",{parentName:"table"},(0,n.yg)("tr",{parentName:"thead"},(0,n.yg)("th",{parentName:"tr",align:"left"},"Field"),(0,n.yg)("th",{parentName:"tr",align:"left"},"Description"))),(0,n.yg)("tbody",{parentName:"table"},(0,n.yg)("tr",{parentName:"tbody"},(0,n.yg)("td",{parentName:"tr",align:"left"},(0,n.yg)("div",{className:"path-line"},(0,n.yg)("span",{className:"path-main"},"commit_state_interval"))," ",(0,n.yg)("div",{className:"type-name-line"},(0,n.yg)("span",{className:"type-name"},"integer"))),(0,n.yg)("td",{parentName:"tr",align:"left"},"Number of records to process before committing state ",(0,n.yg)("div",{className:"default-line default-line-with-docs"},"Default: ",(0,n.yg)("span",{className:"default-value"},"1000")))),(0,n.yg)("tr",{parentName:"tbody"},(0,n.yg)("td",{parentName:"tr",align:"left"},(0,n.yg)("div",{className:"path-line"},(0,n.yg)("span",{className:"path-main"},"commit_with_parse_errors"))," ",(0,n.yg)("div",{className:"type-name-line"},(0,n.yg)("span",{className:"type-name"},"boolean"))),(0,n.yg)("td",{parentName:"tr",align:"left"},"Whether to update createdon timestamp and kafka offset despite parse errors. Enable if you want to ignore the errors. ",(0,n.yg)("div",{className:"default-line default-line-with-docs"},"Default: ",(0,n.yg)("span",{className:"default-value"},"False")))),(0,n.yg)("tr",{parentName:"tbody"},(0,n.yg)("td",{parentName:"tr",align:"left"},(0,n.yg)("div",{className:"path-line"},(0,n.yg)("span",{className:"path-main"},"database_query_batch_size"))," ",(0,n.yg)("div",{className:"type-name-line"},(0,n.yg)("span",{className:"type-name"},"integer"))),(0,n.yg)("td",{parentName:"tr",align:"left"},"Number of records to fetch from the database at a time ",(0,n.yg)("div",{className:"default-line default-line-with-docs"},"Default: ",(0,n.yg)("span",{className:"default-value"},"10000")))),(0,n.yg)("tr",{parentName:"tbody"},(0,n.yg)("td",{parentName:"tr",align:"left"},(0,n.yg)("div",{className:"path-line"},(0,n.yg)("span",{className:"path-main"},"database_table_name"))," ",(0,n.yg)("div",{className:"type-name-line"},(0,n.yg)("span",{className:"type-name"},"string"))),(0,n.yg)("td",{parentName:"tr",align:"left"},"Name of database table containing all versioned aspects ",(0,n.yg)("div",{className:"default-line default-line-with-docs"},"Default: ",(0,n.yg)("span",{className:"default-value"},"metadata","_","aspect","_","v2")))),(0,n.yg)("tr",{parentName:"tbody"},(0,n.yg)("td",{parentName:"tr",align:"left"},(0,n.yg)("div",{className:"path-line"},(0,n.yg)("span",{className:"path-main"},"drop_duplicate_schema_fields"))," ",(0,n.yg)("div",{className:"type-name-line"},(0,n.yg)("span",{className:"type-name"},"boolean"))),(0,n.yg)("td",{parentName:"tr",align:"left"},"Whether to drop duplicate schema fields in the schemaMetadata aspect. Useful if the source system has duplicate field paths in the db, but we're pushing to a system with server-side duplicate checking. ",(0,n.yg)("div",{className:"default-line default-line-with-docs"},"Default: ",(0,n.yg)("span",{className:"default-value"},"False")))),(0,n.yg)("tr",{parentName:"tbody"},(0,n.yg)("td",{parentName:"tr",align:"left"},(0,n.yg)("div",{className:"path-line"},(0,n.yg)("span",{className:"path-main"},"include_all_versions"))," ",(0,n.yg)("div",{className:"type-name-line"},(0,n.yg)("span",{className:"type-name"},"boolean"))),(0,n.yg)("td",{parentName:"tr",align:"left"},"If enabled, include all versions of each aspect. Otherwise, only include the latest version of each aspect. ",(0,n.yg)("div",{className:"default-line default-line-with-docs"},"Default: ",(0,n.yg)("span",{className:"default-value"},"False")))),(0,n.yg)("tr",{parentName:"tbody"},(0,n.yg)("td",{parentName:"tr",align:"left"},(0,n.yg)("div",{className:"path-line"},(0,n.yg)("span",{className:"path-main"},"include_soft_deleted_entities"))," ",(0,n.yg)("div",{className:"type-name-line"},(0,n.yg)("span",{className:"type-name"},"boolean"))),(0,n.yg)("td",{parentName:"tr",align:"left"},"If enabled, include entities that have been soft deleted. Otherwise, include all entities regardless of removal status. ",(0,n.yg)("div",{className:"default-line default-line-with-docs"},"Default: ",(0,n.yg)("span",{className:"default-value"},"True")))),(0,n.yg)("tr",{parentName:"tbody"},(0,n.yg)("td",{parentName:"tr",align:"left"},(0,n.yg)("div",{className:"path-line"},(0,n.yg)("span",{className:"path-main"},"kafka_topic_name"))," ",(0,n.yg)("div",{className:"type-name-line"},(0,n.yg)("span",{className:"type-name"},"string"))),(0,n.yg)("td",{parentName:"tr",align:"left"},"Name of kafka topic containing timeseries MCLs ",(0,n.yg)("div",{className:"default-line default-line-with-docs"},"Default: ",(0,n.yg)("span",{className:"default-value"},"MetadataChangeLog","_","Timeseries","_","v1")))),(0,n.yg)("tr",{parentName:"tbody"},(0,n.yg)("td",{parentName:"tr",align:"left"},(0,n.yg)("div",{className:"path-line"},(0,n.yg)("span",{className:"path-main"},"max_workers"))," ",(0,n.yg)("div",{className:"type-name-line"},(0,n.yg)("span",{className:"type-name"},"integer"))),(0,n.yg)("td",{parentName:"tr",align:"left"},"Number of worker threads to use for datahub api ingestion. ",(0,n.yg)("div",{className:"default-line default-line-with-docs"},"Default: ",(0,n.yg)("span",{className:"default-value"},"70")))),(0,n.yg)("tr",{parentName:"tbody"},(0,n.yg)("td",{parentName:"tr",align:"left"},(0,n.yg)("div",{className:"path-line"},(0,n.yg)("span",{className:"path-main"},"pull_from_datahub_api"))," ",(0,n.yg)("div",{className:"type-name-line"},(0,n.yg)("span",{className:"type-name"},"boolean"))),(0,n.yg)("td",{parentName:"tr",align:"left"},"Use the DataHub API to fetch versioned aspects. ",(0,n.yg)("div",{className:"default-line default-line-with-docs"},"Default: ",(0,n.yg)("span",{className:"default-value"},"False")))),(0,n.yg)("tr",{parentName:"tbody"},(0,n.yg)("td",{parentName:"tr",align:"left"},(0,n.yg)("div",{className:"path-line"},(0,n.yg)("span",{className:"path-main"},"database_connection"))," ",(0,n.yg)("div",{className:"type-name-line"},(0,n.yg)("span",{className:"type-name"},"SQLAlchemyConnectionConfig"))),(0,n.yg)("td",{parentName:"tr",align:"left"},"Database connection config")),(0,n.yg)("tr",{parentName:"tbody"},(0,n.yg)("td",{parentName:"tr",align:"left"},(0,n.yg)("div",{className:"path-line"},(0,n.yg)("span",{className:"path-prefix"},"database_connection."),(0,n.yg)("span",{className:"path-main"},"host_port"),"\xa0",(0,n.yg)("abbr",{title:"Required if database_connection is set"},"\u2753"))," ",(0,n.yg)("div",{className:"type-name-line"},(0,n.yg)("span",{className:"type-name"},"string"))),(0,n.yg)("td",{parentName:"tr",align:"left"},"host URL")),(0,n.yg)("tr",{parentName:"tbody"},(0,n.yg)("td",{parentName:"tr",align:"left"},(0,n.yg)("div",{className:"path-line"},(0,n.yg)("span",{className:"path-prefix"},"database_connection."),(0,n.yg)("span",{className:"path-main"},"scheme"),"\xa0",(0,n.yg)("abbr",{title:"Required if database_connection is set"},"\u2753"))," ",(0,n.yg)("div",{className:"type-name-line"},(0,n.yg)("span",{className:"type-name"},"string"))),(0,n.yg)("td",{parentName:"tr",align:"left"},"scheme")),(0,n.yg)("tr",{parentName:"tbody"},(0,n.yg)("td",{parentName:"tr",align:"left"},(0,n.yg)("div",{className:"path-line"},(0,n.yg)("span",{className:"path-prefix"},"database_connection."),(0,n.yg)("span",{className:"path-main"},"database"))," ",(0,n.yg)("div",{className:"type-name-line"},(0,n.yg)("span",{className:"type-name"},"string"))),(0,n.yg)("td",{parentName:"tr",align:"left"},"database (catalog)")),(0,n.yg)("tr",{parentName:"tbody"},(0,n.yg)("td",{parentName:"tr",align:"left"},(0,n.yg)("div",{className:"path-line"},(0,n.yg)("span",{className:"path-prefix"},"database_connection."),(0,n.yg)("span",{className:"path-main"},"options"))," ",(0,n.yg)("div",{className:"type-name-line"},(0,n.yg)("span",{className:"type-name"},"object"))),(0,n.yg)("td",{parentName:"tr",align:"left"},"Any options specified here will be passed to ",(0,n.yg)("a",{parentName:"td",href:"https://docs.sqlalchemy.org/en/14/core/engines.html#sqlalchemy.create_engine"},"SQLAlchemy.create_engine")," as kwargs. To set connection arguments in the URL, specify them under ",(0,n.yg)("inlineCode",{parentName:"td"},"connect_args"),".")),(0,n.yg)("tr",{parentName:"tbody"},(0,n.yg)("td",{parentName:"tr",align:"left"},(0,n.yg)("div",{className:"path-line"},(0,n.yg)("span",{className:"path-prefix"},"database_connection."),(0,n.yg)("span",{className:"path-main"},"password"))," ",(0,n.yg)("div",{className:"type-name-line"},(0,n.yg)("span",{className:"type-name"},"string(password)"))),(0,n.yg)("td",{parentName:"tr",align:"left"},"password")),(0,n.yg)("tr",{parentName:"tbody"},(0,n.yg)("td",{parentName:"tr",align:"left"},(0,n.yg)("div",{className:"path-line"},(0,n.yg)("span",{className:"path-prefix"},"database_connection."),(0,n.yg)("span",{className:"path-main"},"sqlalchemy_uri"))," ",(0,n.yg)("div",{className:"type-name-line"},(0,n.yg)("span",{className:"type-name"},"string"))),(0,n.yg)("td",{parentName:"tr",align:"left"},"URI of database to connect to. See ",(0,n.yg)("a",{parentName:"td",href:"https://docs.sqlalchemy.org/en/14/core/engines.html#database-urls"},"https://docs.sqlalchemy.org/en/14/core/engines.html#database-urls"),". Takes precedence over other connection parameters.")),(0,n.yg)("tr",{parentName:"tbody"},(0,n.yg)("td",{parentName:"tr",align:"left"},(0,n.yg)("div",{className:"path-line"},(0,n.yg)("span",{className:"path-prefix"},"database_connection."),(0,n.yg)("span",{className:"path-main"},"username"))," ",(0,n.yg)("div",{className:"type-name-line"},(0,n.yg)("span",{className:"type-name"},"string"))),(0,n.yg)("td",{parentName:"tr",align:"left"},"username")),(0,n.yg)("tr",{parentName:"tbody"},(0,n.yg)("td",{parentName:"tr",align:"left"},(0,n.yg)("div",{className:"path-line"},(0,n.yg)("span",{className:"path-main"},"exclude_aspects"))," ",(0,n.yg)("div",{className:"type-name-line"},(0,n.yg)("span",{className:"type-name"},"array"))),(0,n.yg)("td",{parentName:"tr",align:"left"},"Set of aspect names to exclude from ingestion ",(0,n.yg)("div",{className:"default-line default-line-with-docs"},"Default: ",(0,n.yg)("span",{className:"default-value"},"[","'","globalSettingsInfo","'",", ","'","dataHubExecutionRequestSig...")))),(0,n.yg)("tr",{parentName:"tbody"},(0,n.yg)("td",{parentName:"tr",align:"left"},(0,n.yg)("div",{className:"path-line"},(0,n.yg)("span",{className:"path-prefix"},"exclude_aspects."),(0,n.yg)("span",{className:"path-main"},"string"))," ",(0,n.yg)("div",{className:"type-name-line"},(0,n.yg)("span",{className:"type-name"},"string"))),(0,n.yg)("td",{parentName:"tr",align:"left"})),(0,n.yg)("tr",{parentName:"tbody"},(0,n.yg)("td",{parentName:"tr",align:"left"},(0,n.yg)("div",{className:"path-line"},(0,n.yg)("span",{className:"path-main"},"kafka_connection"))," ",(0,n.yg)("div",{className:"type-name-line"},(0,n.yg)("span",{className:"type-name"},"KafkaConsumerConnectionConfig"))),(0,n.yg)("td",{parentName:"tr",align:"left"},"Kafka connection config")),(0,n.yg)("tr",{parentName:"tbody"},(0,n.yg)("td",{parentName:"tr",align:"left"},(0,n.yg)("div",{className:"path-line"},(0,n.yg)("span",{className:"path-prefix"},"kafka_connection."),(0,n.yg)("span",{className:"path-main"},"bootstrap"))," ",(0,n.yg)("div",{className:"type-name-line"},(0,n.yg)("span",{className:"type-name"},"string"))),(0,n.yg)("td",{parentName:"tr",align:"left"},(0,n.yg)("div",{className:"default-line "},"Default: ",(0,n.yg)("span",{className:"default-value"},"localhost:9092")))),(0,n.yg)("tr",{parentName:"tbody"},(0,n.yg)("td",{parentName:"tr",align:"left"},(0,n.yg)("div",{className:"path-line"},(0,n.yg)("span",{className:"path-prefix"},"kafka_connection."),(0,n.yg)("span",{className:"path-main"},"client_timeout_seconds"))," ",(0,n.yg)("div",{className:"type-name-line"},(0,n.yg)("span",{className:"type-name"},"integer"))),(0,n.yg)("td",{parentName:"tr",align:"left"},"The request timeout used when interacting with the Kafka APIs. ",(0,n.yg)("div",{className:"default-line default-line-with-docs"},"Default: ",(0,n.yg)("span",{className:"default-value"},"60")))),(0,n.yg)("tr",{parentName:"tbody"},(0,n.yg)("td",{parentName:"tr",align:"left"},(0,n.yg)("div",{className:"path-line"},(0,n.yg)("span",{className:"path-prefix"},"kafka_connection."),(0,n.yg)("span",{className:"path-main"},"consumer_config"))," ",(0,n.yg)("div",{className:"type-name-line"},(0,n.yg)("span",{className:"type-name"},"object"))),(0,n.yg)("td",{parentName:"tr",align:"left"},"Extra consumer config serialized as JSON. These options will be passed into Kafka's DeserializingConsumer. See ",(0,n.yg)("a",{parentName:"td",href:"https://docs.confluent.io/platform/current/clients/confluent-kafka-python/html/index.html#deserializingconsumer"},"https://docs.confluent.io/platform/current/clients/confluent-kafka-python/html/index.html#deserializingconsumer")," and ",(0,n.yg)("a",{parentName:"td",href:"https://github.com/edenhill/librdkafka/blob/master/CONFIGURATION.md"},"https://github.com/edenhill/librdkafka/blob/master/CONFIGURATION.md")," .")),(0,n.yg)("tr",{parentName:"tbody"},(0,n.yg)("td",{parentName:"tr",align:"left"},(0,n.yg)("div",{className:"path-line"},(0,n.yg)("span",{className:"path-prefix"},"kafka_connection."),(0,n.yg)("span",{className:"path-main"},"schema_registry_config"))," ",(0,n.yg)("div",{className:"type-name-line"},(0,n.yg)("span",{className:"type-name"},"object"))),(0,n.yg)("td",{parentName:"tr",align:"left"},"Extra schema registry config serialized as JSON. These options will be passed into Kafka's SchemaRegistryClient. ",(0,n.yg)("a",{parentName:"td",href:"https://docs.confluent.io/platform/current/clients/confluent-kafka-python/html/index.html?#schemaregistryclient"},"https://docs.confluent.io/platform/current/clients/confluent-kafka-python/html/index.html?#schemaregistryclient"))),(0,n.yg)("tr",{parentName:"tbody"},(0,n.yg)("td",{parentName:"tr",align:"left"},(0,n.yg)("div",{className:"path-line"},(0,n.yg)("span",{className:"path-prefix"},"kafka_connection."),(0,n.yg)("span",{className:"path-main"},"schema_registry_url"))," ",(0,n.yg)("div",{className:"type-name-line"},(0,n.yg)("span",{className:"type-name"},"string"))),(0,n.yg)("td",{parentName:"tr",align:"left"},(0,n.yg)("div",{className:"default-line "},"Default: ",(0,n.yg)("span",{className:"default-value"},"http://localhost:8080/schema-registry/api/")))),(0,n.yg)("tr",{parentName:"tbody"},(0,n.yg)("td",{parentName:"tr",align:"left"},(0,n.yg)("div",{className:"path-line"},(0,n.yg)("span",{className:"path-main"},"urn_pattern"))," ",(0,n.yg)("div",{className:"type-name-line"},(0,n.yg)("span",{className:"type-name"},"AllowDenyPattern"))),(0,n.yg)("td",{parentName:"tr",align:"left"},(0,n.yg)("div",{className:"default-line "},"Default: ",(0,n.yg)("span",{className:"default-value"},"{","'","allow","'",": ","[","'",".","*","'","]",", ","'","deny","'",": ","[","]",", ","'","ignoreCase","'",": True","}")))),(0,n.yg)("tr",{parentName:"tbody"},(0,n.yg)("td",{parentName:"tr",align:"left"},(0,n.yg)("div",{className:"path-line"},(0,n.yg)("span",{className:"path-prefix"},"urn_pattern."),(0,n.yg)("span",{className:"path-main"},"ignoreCase"))," ",(0,n.yg)("div",{className:"type-name-line"},(0,n.yg)("span",{className:"type-name"},"boolean"))),(0,n.yg)("td",{parentName:"tr",align:"left"},"Whether to ignore case sensitivity during pattern matching. ",(0,n.yg)("div",{className:"default-line default-line-with-docs"},"Default: ",(0,n.yg)("span",{className:"default-value"},"True")))),(0,n.yg)("tr",{parentName:"tbody"},(0,n.yg)("td",{parentName:"tr",align:"left"},(0,n.yg)("div",{className:"path-line"},(0,n.yg)("span",{className:"path-prefix"},"urn_pattern."),(0,n.yg)("span",{className:"path-main"},"allow"))," ",(0,n.yg)("div",{className:"type-name-line"},(0,n.yg)("span",{className:"type-name"},"array"))),(0,n.yg)("td",{parentName:"tr",align:"left"},"List of regex patterns to include in ingestion ",(0,n.yg)("div",{className:"default-line default-line-with-docs"},"Default: ",(0,n.yg)("span",{className:"default-value"},"[","'",".","*","'","]")))),(0,n.yg)("tr",{parentName:"tbody"},(0,n.yg)("td",{parentName:"tr",align:"left"},(0,n.yg)("div",{className:"path-line"},(0,n.yg)("span",{className:"path-prefix"},"urn_pattern.allow."),(0,n.yg)("span",{className:"path-main"},"string"))," ",(0,n.yg)("div",{className:"type-name-line"},(0,n.yg)("span",{className:"type-name"},"string"))),(0,n.yg)("td",{parentName:"tr",align:"left"})),(0,n.yg)("tr",{parentName:"tbody"},(0,n.yg)("td",{parentName:"tr",align:"left"},(0,n.yg)("div",{className:"path-line"},(0,n.yg)("span",{className:"path-prefix"},"urn_pattern."),(0,n.yg)("span",{className:"path-main"},"deny"))," ",(0,n.yg)("div",{className:"type-name-line"},(0,n.yg)("span",{className:"type-name"},"array"))),(0,n.yg)("td",{parentName:"tr",align:"left"},"List of regex patterns to exclude from ingestion. ",(0,n.yg)("div",{className:"default-line default-line-with-docs"},"Default: ",(0,n.yg)("span",{className:"default-value"},"[","]")))),(0,n.yg)("tr",{parentName:"tbody"},(0,n.yg)("td",{parentName:"tr",align:"left"},(0,n.yg)("div",{className:"path-line"},(0,n.yg)("span",{className:"path-prefix"},"urn_pattern.deny."),(0,n.yg)("span",{className:"path-main"},"string"))," ",(0,n.yg)("div",{className:"type-name-line"},(0,n.yg)("span",{className:"type-name"},"string"))),(0,n.yg)("td",{parentName:"tr",align:"left"})),(0,n.yg)("tr",{parentName:"tbody"},(0,n.yg)("td",{parentName:"tr",align:"left"},(0,n.yg)("div",{className:"path-line"},(0,n.yg)("span",{className:"path-main"},"stateful_ingestion"))," ",(0,n.yg)("div",{className:"type-name-line"},(0,n.yg)("span",{className:"type-name"},"StatefulIngestionConfig"))),(0,n.yg)("td",{parentName:"tr",align:"left"},"Stateful Ingestion Config ",(0,n.yg)("div",{className:"default-line default-line-with-docs"},"Default: ",(0,n.yg)("span",{className:"default-value"},"{","'","enabled","'",": True, ","'","max","_","checkpoint","_","state","_","size","'",": 167...")))),(0,n.yg)("tr",{parentName:"tbody"},(0,n.yg)("td",{parentName:"tr",align:"left"},(0,n.yg)("div",{className:"path-line"},(0,n.yg)("span",{className:"path-prefix"},"stateful_ingestion."),(0,n.yg)("span",{className:"path-main"},"enabled"))," ",(0,n.yg)("div",{className:"type-name-line"},(0,n.yg)("span",{className:"type-name"},"boolean"))),(0,n.yg)("td",{parentName:"tr",align:"left"},"Whether or not to enable stateful ingest. Default: True if a pipeline_name is set and either a datahub-rest sink or ",(0,n.yg)("inlineCode",{parentName:"td"},"datahub_api")," is specified, otherwise False ",(0,n.yg)("div",{className:"default-line default-line-with-docs"},"Default: ",(0,n.yg)("span",{className:"default-value"},"False")))))))),(0,n.yg)(i.A,{value:"schema",label:"Schema",mdxType:"TabItem"},(0,n.yg)("p",null,"The ",(0,n.yg)("a",{parentName:"p",href:"https://json-schema.org/"},"JSONSchema")," for this configuration is inlined below."),(0,n.yg)("pre",null,(0,n.yg)("code",{parentName:"pre",className:"language-javascript"},'{\n "title": "DataHubSourceConfig",\n "description": "Base configuration class for stateful ingestion for source configs to inherit from.",\n "type": "object",\n "properties": {\n "stateful_ingestion": {\n "title": "Stateful Ingestion",\n "description": "Stateful Ingestion Config",\n "default": {\n "enabled": true,\n "max_checkpoint_state_size": 16777216,\n "state_provider": {\n "type": "datahub",\n "config": {}\n },\n "ignore_old_state": false,\n "ignore_new_state": false\n },\n "allOf": [\n {\n "$ref": "#/definitions/StatefulIngestionConfig"\n }\n ]\n },\n "database_connection": {\n "title": "Database Connection",\n "description": "Database connection config",\n "allOf": [\n {\n "$ref": "#/definitions/SQLAlchemyConnectionConfig"\n }\n ]\n },\n "kafka_connection": {\n "title": "Kafka Connection",\n "description": "Kafka connection config",\n "allOf": [\n {\n "$ref": "#/definitions/KafkaConsumerConnectionConfig"\n }\n ]\n },\n "include_all_versions": {\n "title": "Include All Versions",\n "description": "If enabled, include all versions of each aspect. Otherwise, only include the latest version of each aspect. ",\n "default": false,\n "type": "boolean"\n },\n "include_soft_deleted_entities": {\n "title": "Include Soft Deleted Entities",\n "description": "If enabled, include entities that have been soft deleted. Otherwise, include all entities regardless of removal status. ",\n "default": true,\n "type": "boolean"\n },\n "exclude_aspects": {\n "title": "Exclude Aspects",\n "description": "Set of aspect names to exclude from ingestion",\n "default": [\n "globalSettingsInfo",\n "dataHubExecutionRequestSignal",\n "globalSettingsKey",\n "dataHubExecutionRequestInput",\n "datahubIngestionRunSummary",\n "dataHubSecretValue",\n "testResults",\n "datahubIngestionCheckpoint",\n "dataHubExecutionRequestKey",\n "dataHubExecutionRequestResult",\n "dataHubIngestionSourceKey",\n "dataHubSecretKey",\n "dataHubIngestionSourceInfo"\n ],\n "type": "array",\n "items": {\n "type": "string"\n },\n "uniqueItems": true\n },\n "database_query_batch_size": {\n "title": "Database Query Batch Size",\n "description": "Number of records to fetch from the database at a time",\n "default": 10000,\n "type": "integer"\n },\n "database_table_name": {\n "title": "Database Table Name",\n "description": "Name of database table containing all versioned aspects",\n "default": "metadata_aspect_v2",\n "type": "string"\n },\n "kafka_topic_name": {\n "title": "Kafka Topic Name",\n "description": "Name of kafka topic containing timeseries MCLs",\n "default": "MetadataChangeLog_Timeseries_v1",\n "type": "string"\n },\n "commit_state_interval": {\n "title": "Commit State Interval",\n "description": "Number of records to process before committing state",\n "default": 1000,\n "type": "integer"\n },\n "commit_with_parse_errors": {\n "title": "Commit With Parse Errors",\n "description": "Whether to update createdon timestamp and kafka offset despite parse errors. Enable if you want to ignore the errors.",\n "default": false,\n "type": "boolean"\n },\n "pull_from_datahub_api": {\n "title": "Pull From Datahub Api",\n "description": "Use the DataHub API to fetch versioned aspects.",\n "default": false,\n "hidden_from_docs": true,\n "type": "boolean"\n },\n "max_workers": {\n "title": "Max Workers",\n "description": "Number of worker threads to use for datahub api ingestion.",\n "default": 70,\n "hidden_from_docs": true,\n "type": "integer"\n },\n "urn_pattern": {\n "title": "Urn Pattern",\n "default": {\n "allow": [\n ".*"\n ],\n "deny": [],\n "ignoreCase": true\n },\n "allOf": [\n {\n "$ref": "#/definitions/AllowDenyPattern"\n }\n ]\n },\n "drop_duplicate_schema_fields": {\n "title": "Drop Duplicate Schema Fields",\n "description": "Whether to drop duplicate schema fields in the schemaMetadata aspect. Useful if the source system has duplicate field paths in the db, but we\'re pushing to a system with server-side duplicate checking.",\n "default": false,\n "type": "boolean"\n }\n },\n "definitions": {\n "DynamicTypedStateProviderConfig": {\n "title": "DynamicTypedStateProviderConfig",\n "type": "object",\n "properties": {\n "type": {\n "title": "Type",\n "description": "The type of the state provider to use. For DataHub use `datahub`",\n "type": "string"\n },\n "config": {\n "title": "Config",\n "description": "The configuration required for initializing the state provider. Default: The datahub_api config if set at pipeline level. Otherwise, the default DatahubClientConfig. See the defaults (https://github.com/datahub-project/datahub/blob/master/metadata-ingestion/src/datahub/ingestion/graph/client.py#L19).",\n "default": {},\n "type": "object"\n }\n },\n "required": [\n "type"\n ],\n "additionalProperties": false\n },\n "StatefulIngestionConfig": {\n "title": "StatefulIngestionConfig",\n "description": "Basic Stateful Ingestion Specific Configuration for any source.",\n "type": "object",\n "properties": {\n "enabled": {\n "title": "Enabled",\n "description": "Whether or not to enable stateful ingest. Default: True if a pipeline_name is set and either a datahub-rest sink or `datahub_api` is specified, otherwise False",\n "default": false,\n "type": "boolean"\n }\n },\n "additionalProperties": false\n },\n "SQLAlchemyConnectionConfig": {\n "title": "SQLAlchemyConnectionConfig",\n "type": "object",\n "properties": {\n "username": {\n "title": "Username",\n "description": "username",\n "type": "string"\n },\n "password": {\n "title": "Password",\n "description": "password",\n "type": "string",\n "writeOnly": true,\n "format": "password"\n },\n "host_port": {\n "title": "Host Port",\n "description": "host URL",\n "type": "string"\n },\n "database": {\n "title": "Database",\n "description": "database (catalog)",\n "type": "string"\n },\n "scheme": {\n "title": "Scheme",\n "description": "scheme",\n "type": "string"\n },\n "sqlalchemy_uri": {\n "title": "Sqlalchemy Uri",\n "description": "URI of database to connect to. See https://docs.sqlalchemy.org/en/14/core/engines.html#database-urls. Takes precedence over other connection parameters.",\n "type": "string"\n },\n "options": {\n "title": "Options",\n "description": "Any options specified here will be passed to [SQLAlchemy.create_engine](https://docs.sqlalchemy.org/en/14/core/engines.html#sqlalchemy.create_engine) as kwargs. To set connection arguments in the URL, specify them under `connect_args`.",\n "type": "object"\n }\n },\n "required": [\n "host_port",\n "scheme"\n ],\n "additionalProperties": false\n },\n "KafkaConsumerConnectionConfig": {\n "title": "KafkaConsumerConnectionConfig",\n "description": "Configuration class for holding connectivity information for Kafka consumers",\n "type": "object",\n "properties": {\n "bootstrap": {\n "title": "Bootstrap",\n "default": "localhost:9092",\n "type": "string"\n },\n "schema_registry_url": {\n "title": "Schema Registry Url",\n "default": "http://localhost:8080/schema-registry/api/",\n "type": "string"\n },\n "schema_registry_config": {\n "title": "Schema Registry Config",\n "description": "Extra schema registry config serialized as JSON. These options will be passed into Kafka\'s SchemaRegistryClient. https://docs.confluent.io/platform/current/clients/confluent-kafka-python/html/index.html?#schemaregistryclient",\n "type": "object"\n },\n "client_timeout_seconds": {\n "title": "Client Timeout Seconds",\n "description": "The request timeout used when interacting with the Kafka APIs.",\n "default": 60,\n "type": "integer"\n },\n "consumer_config": {\n "title": "Consumer Config",\n "description": "Extra consumer config serialized as JSON. These options will be passed into Kafka\'s DeserializingConsumer. See https://docs.confluent.io/platform/current/clients/confluent-kafka-python/html/index.html#deserializingconsumer and https://github.com/edenhill/librdkafka/blob/master/CONFIGURATION.md .",\n "type": "object"\n }\n },\n "additionalProperties": false\n },\n "AllowDenyPattern": {\n "title": "AllowDenyPattern",\n "description": "A class to store allow deny regexes",\n "type": "object",\n "properties": {\n "allow": {\n "title": "Allow",\n "description": "List of regex patterns to include in ingestion",\n "default": [\n ".*"\n ],\n "type": "array",\n "items": {\n "type": "string"\n }\n },\n "deny": {\n "title": "Deny",\n "description": "List of regex patterns to exclude from ingestion.",\n "default": [],\n "type": "array",\n "items": {\n "type": "string"\n }\n },\n "ignoreCase": {\n "title": "Ignorecase",\n "description": "Whether to ignore case sensitivity during pattern matching.",\n "default": true,\n "type": "boolean"\n }\n },\n "additionalProperties": false\n }\n }\n}\n')))),(0,n.yg)("h3",{id:"code-coordinates"},"Code Coordinates"),(0,n.yg)("ul",null,(0,n.yg)("li",{parentName:"ul"},"Class Name: ",(0,n.yg)("inlineCode",{parentName:"li"},"datahub.ingestion.source.datahub.datahub_source.DataHubSource")),(0,n.yg)("li",{parentName:"ul"},"Browse on ",(0,n.yg)("a",{parentName:"li",href:"https://github.com/datahub-project/datahub/blob/master/metadata-ingestion/src/datahub/ingestion/source/datahub/datahub_source.py"},"GitHub"))),(0,n.yg)("h2",null,"Questions"),(0,n.yg)("p",null,"If you've got any questions on configuring ingestion for DataHub, feel free to ping us on ",(0,n.yg)("a",{parentName:"p",href:"https://datahub.com/slack"},"our Slack"),"."))}f.isMDXComponent=!0}}]);