From 3fdaf13f2ca1729b6f740fa1d4adc403e64d97f7 Mon Sep 17 00:00:00 2001
From: Harshal Sheth
Date: Tue, 22 Nov 2022 12:53:34 -0500
Subject: [PATCH] feat(ingest/csv-enrich): handle BOM character (#6509)

This has come up a number of times e.g. https://datahubspace.slack.com/archives/C029A3M079U/p1669000226732159?thread_ts=1669000226.732159&cid=C029A3M079U
---
 .../src/datahub/ingestion/source/csv_enricher.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/metadata-ingestion/src/datahub/ingestion/source/csv_enricher.py b/metadata-ingestion/src/datahub/ingestion/source/csv_enricher.py
index b0f4c3f707..88e170ae8f 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/csv_enricher.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/csv_enricher.py
@@ -534,7 +534,12 @@ class CSVEnricherSource(Source):
         return owners
 
     def get_workunits(self) -> Iterable[MetadataWorkUnit]:
-        with open(self.config.filename, "r") as f:
+        # As per https://stackoverflow.com/a/49150749/5004662, we want to use
+        # the 'utf-8-sig' encoding to handle any BOM character that may be
+        # present in the file. Excel is known to add a BOM to CSV files.
+        # As per https://stackoverflow.com/a/63508823/5004662,
+        # this is also safe with normal files that don't have a BOM.
+        with open(self.config.filename, mode="r", encoding="utf-8-sig") as f:
             rows = csv.DictReader(f, delimiter=self.config.delimiter)
             for row in rows:
                 # We need the resource to move forward