From 3fdaf13f2ca1729b6f740fa1d4adc403e64d97f7 Mon Sep 17 00:00:00 2001
From: Harshal Sheth <hsheth2@gmail.com>
Date: Tue, 22 Nov 2022 12:53:34 -0500
Subject: [PATCH] feat(ingest/csv-enrich): handle BOM character (#6509)

This has come up a number of times, e.g. https://datahubspace.slack.com/archives/C029A3M079U/p1669000226732159?thread_ts=1669000226.732159&cid=C029A3M079U
---
 .../src/datahub/ingestion/source/csv_enricher.py           | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/metadata-ingestion/src/datahub/ingestion/source/csv_enricher.py b/metadata-ingestion/src/datahub/ingestion/source/csv_enricher.py
index b0f4c3f707..88e170ae8f 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/csv_enricher.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/csv_enricher.py
@@ -534,7 +534,12 @@ class CSVEnricherSource(Source):
         return owners
 
     def get_workunits(self) -> Iterable[MetadataWorkUnit]:
-        with open(self.config.filename, "r") as f:
+        # As per https://stackoverflow.com/a/49150749/5004662, we want to use
+        # the 'utf-8-sig' encoding to handle any BOM character that may be
+        # present in the file. Excel is known to add a BOM to CSV files.
+        # As per https://stackoverflow.com/a/63508823/5004662,
+        # this is also safe with normal files that don't have a BOM.
+        with open(self.config.filename, mode="r", encoding="utf-8-sig") as f:
             rows = csv.DictReader(f, delimiter=self.config.delimiter)
             for row in rows:
                 # We need the resource to move forward