diff --git a/metadata-ingestion/docs/sources/csv/csv-enricher_recipe.yml b/metadata-ingestion/docs/sources/csv/csv-enricher_recipe.yml
new file mode 100644
index 0000000000..c176166a92
--- /dev/null
+++ b/metadata-ingestion/docs/sources/csv/csv-enricher_recipe.yml
@@ -0,0 +1,8 @@
+source:
+  type: csv-enricher
+  config:
+    # relative path to your csv file to ingest
+    filename: ./path/to/your/file.csv
+
+# Default sink is datahub-rest and doesn't need to be configured
+# See https://datahubproject.io/docs/metadata-ingestion/sink_docs/datahub for customization options
diff --git a/metadata-ingestion/src/datahub/ingestion/source/csv_enricher.py b/metadata-ingestion/src/datahub/ingestion/source/csv_enricher.py
index 1172877899..869d5a0254 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/csv_enricher.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/csv_enricher.py
@@ -98,22 +98,24 @@ class CSVEnricherReport(SourceReport):
 @support_status(SupportStatus.INCUBATING)
 class CSVEnricherSource(Source):
     """
-    This plugin is used to apply glossary terms, tags, owners and domain at the entity level. It can also be used to apply tags
-    and glossary terms at the column level. These values are read from a CSV file and can be used to either overwrite
-    or append the above aspects to entities.
+    This plugin is used to bulk upload metadata to DataHub.
+    It will apply glossary terms, tags, description, owners and domain at the entity level. It can also be used to apply tags,
+    glossary terms, and documentation at the column level. These values are read from a CSV file. You have the option to either overwrite
+    or append existing values.
 
-    The format of the CSV must be like so, with a few example rows.
+    The format of the CSV is demonstrated below. The header is required, and URNs must be surrounded by quotes when they contain commas (most URNs contain commas).
 
-    |resource                                                        |subresource|glossary_terms                      |tags               |owners                                             |ownership_type |description    |domain                     |
-    |----------------------------------------------------------------|-----------|------------------------------------|-------------------|---------------------------------------------------|---------------|---------------|---------------------------|
-    |urn:li:dataset:(urn:li:dataPlatform:hive,SampleHiveDataset,PROD)|           |[urn:li:glossaryTerm:AccountBalance]|[urn:li:tag:Legacy]|[urn:li:corpuser:datahub|urn:li:corpuser:jdoe]     |TECHNICAL_OWNER|new description|urn:li:domain:Engineering  |
-    |urn:li:dataset:(urn:li:dataPlatform:hive,SampleHiveDataset,PROD)|field_foo  |[urn:li:glossaryTerm:AccountBalance]|                   |                                                   |               |field_foo!     |                           |
-    |urn:li:dataset:(urn:li:dataPlatform:hive,SampleHiveDataset,PROD)|field_bar  |                                    |[urn:li:tag:Legacy]|                                                   |               |field_bar?     |                           |
+    ```
+    resource,subresource,glossary_terms,tags,owners,ownership_type,description,domain
+    "urn:li:dataset:(urn:li:dataPlatform:snowflake,datahub.growth.users,PROD)",,[urn:li:glossaryTerm:Users],[urn:li:tag:HighQuality],[urn:li:corpuser:lfoe;urn:li:corpuser:jdoe],TECHNICAL_OWNER,"description for users table",urn:li:domain:Engineering
+    "urn:li:dataset:(urn:li:dataPlatform:hive,datahub.growth.users,PROD)",first_name,[urn:li:glossaryTerm:FirstName],,,,"first_name description",
+    "urn:li:dataset:(urn:li:dataPlatform:hive,datahub.growth.users,PROD)",last_name,[urn:li:glossaryTerm:LastName],,,,"last_name description",
+    ```
 
     Note that the first row does not have a subresource populated. That means any glossary terms, tags, and owners will
-    be applied at the entity field. If a subresource IS populated (as it is for the second and third rows), glossary
-    terms and tags will be applied on the subresource. Every row MUST have a resource. Also note that owners can only
-    be applied at the resource level and will be ignored if populated for a row with a subresource.
+    be applied at the entity level. If a subresource is populated (as it is for the second and third rows), glossary
+    terms and tags will be applied on the column. Every row MUST have a resource. Also note that owners can only
+    be applied at the resource level.
 
     :::note
     This source will not work on very large csv files that do not fit in memory.
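
For anyone trying the new example end to end, here is a minimal sketch (not part of this diff) that writes a CSV matching the layout documented in the docstring, using only the Python standard library. The output path mirrors the recipe's `filename` placeholder, and the URNs and multi-value `;` separator are the illustrative ones from the docstring example; adjust all of them to your own environment.

```python
import csv
from pathlib import Path

# Columns must match the documented header exactly.
HEADER = [
    "resource", "subresource", "glossary_terms", "tags",
    "owners", "ownership_type", "description", "domain",
]

ROWS = [
    # Entity-level row: subresource is empty, so the terms, tags, owners,
    # description, and domain apply to the dataset itself.
    [
        "urn:li:dataset:(urn:li:dataPlatform:snowflake,datahub.growth.users,PROD)",
        "",
        "[urn:li:glossaryTerm:Users]",
        "[urn:li:tag:HighQuality]",
        "[urn:li:corpuser:lfoe;urn:li:corpuser:jdoe]",
        "TECHNICAL_OWNER",
        "description for users table",
        "urn:li:domain:Engineering",
    ],
    # Column-level row: subresource names a column, so the glossary term and
    # description apply to that column.
    [
        "urn:li:dataset:(urn:li:dataPlatform:hive,datahub.growth.users,PROD)",
        "first_name",
        "[urn:li:glossaryTerm:FirstName]",
        "",
        "",
        "",
        "first_name description",
        "",
    ],
]

# Path is the placeholder from the recipe above; change it to suit.
out = Path("./path/to/your/file.csv")
out.parent.mkdir(parents=True, exist_ok=True)
with out.open("w", newline="") as f:
    # The default QUOTE_MINIMAL dialect quotes any field containing a comma,
    # which is what the docstring requires for URNs.
    writer = csv.writer(f)
    writer.writerow(HEADER)
    writer.writerows(ROWS)
```

With the file in place, the recipe at the top of this diff can be run with the DataHub CLI, e.g. `datahub ingest -c csv-enricher_recipe.yml`.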