From 38457777faa50091d2d1917f7779851a7740ece7 Mon Sep 17 00:00:00 2001 From: Matt Robinson Date: Wed, 28 Jun 2023 13:22:23 -0400 Subject: [PATCH] fix: ignore escaped commas in CSV checks (#832) * fix file content checking bug * skip counting commas in quotes for csv detection * add test for comma count * change file content grab to -1 * version and changelog * add csv to extension check * add file to tests * ingest-test-fixtures-update * Update ingest test fixtures (#833) Co-authored-by: MthwRobinson * fix typo * fix changelog wording --------- Co-authored-by: MthwRobinson --- CHANGELOG.md | 2 ++ test_unstructured/file_utils/test_filetype.py | 12 ++++++++++++ .../example-docs/english-and-korean.png.json | 10 +++++----- unstructured/file_utils/exploration.py | 2 +- unstructured/file_utils/filetype.py | 17 ++++++++++++----- 5 files changed, 32 insertions(+), 11 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f94b5f9bf..559796dad 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,8 @@ ### Fixes +* CSV check now ignores escaped commas. +* Fix for filetype exploration util when file content does not have a comma. * Adds negative lookahead to bullet pattern to avoid detecting plain text line breaks like `-------` as list items. 
* Fix pre tag parsing for `partition_html` diff --git a/test_unstructured/file_utils/test_filetype.py b/test_unstructured/file_utils/test_filetype.py index e918ecc4d..9a1302a4a 100644 --- a/test_unstructured/file_utils/test_filetype.py +++ b/test_unstructured/file_utils/test_filetype.py @@ -424,3 +424,15 @@ def test_detect_filetype_detects_empty_filename(filename="example-docs/empty.txt def test_detect_filetype_detects_empty_file(filename="example-docs/empty.txt"): with open(filename, "rb") as f: assert detect_filetype(file=f) == FileType.EMPTY + + +def test_detect_filetype_skips_escape_commas_for_csv(tmpdir): + text = 'A,A,A,A,A\nA,A,A,"A,A",A\nA,A,A,"A,A",A' + filename = os.path.join(tmpdir.dirname, "csv-with-escaped-commas.csv") + with open(filename, "w") as f: + f.write(text) + + assert detect_filetype(filename=filename) == FileType.CSV + + with open(filename, "rb") as f: + assert detect_filetype(file=f) == FileType.CSV diff --git a/test_unstructured_ingest/expected-structured-output/parameterized-ingest-output/example-docs/english-and-korean.png.json b/test_unstructured_ingest/expected-structured-output/parameterized-ingest-output/example-docs/english-and-korean.png.json index 11027a8dc..4f50077dd 100644 --- a/test_unstructured_ingest/expected-structured-output/parameterized-ingest-output/example-docs/english-and-korean.png.json +++ b/test_unstructured_ingest/expected-structured-output/parameterized-ingest-output/example-docs/english-and-korean.png.json @@ -27,13 +27,13 @@ "text": "Note: Remember to write your own \"OPENING MESSAGE\" before you copy and paste the template. please always include [TREASURE HARUTO] for example:" }, { - "type": "NarrativeText", - "element_id": "577b0ea5cd4f62a19b9d14dd0bd7272e", + "type": "UncategorizedText", + "element_id": "23a37aa2d5f39d5e2275dec011be76be", "metadata": { "data_source": {}, "filetype": "image/png" }, - "text": "안녕하세요, 저 희 는 YGEAS 그룹 TREASUREWH HARUTOM|2] 팬 입니다. 
팬 으 로서, HARUTO 씨 받 는 대 우 에 대해 의 구 심 과 불 공 평 함 을 LRU, 이 메 일 을 통해 저 희 의 의 혹 을 전 달 하여 귀 사 의 진지한 고 민 과 적극적인 답 변 을 받을 수 있 기 를 바랍니다." + "text": "StS ofAl2, AS|E YGEAS 1B TREASUREMH HARUTOM| 2] BHEYLICH, BHO AY, HARUTO M BE = WSO Hol Wat SSGstsS LRU, O| Wil BS SH ASP ASS AZopO} HAS] TAI St a Bat FSAel GAS HS + U7/S HLICh." }, { "type": "NarrativeText", @@ -46,11 +46,11 @@ }, { "type": "NarrativeText", - "element_id": "da91cb5310386aafd5aaf6fd988b9616", + "element_id": "c37248b6d436997d36acc7852f502a8e", "metadata": { "data_source": {}, "filetype": "image/png" }, - "text": "4. Use the hashtag of Haruto on your tweet to show that vou have sent vour email]" + "text": "4. Use the hashtag of Haruto on your tweet to show that vou have sent your email]" } ] \ No newline at end of file diff --git a/unstructured/file_utils/exploration.py b/unstructured/file_utils/exploration.py index 43b74ae79..55d9719b1 100644 --- a/unstructured/file_utils/exploration.py +++ b/unstructured/file_utils/exploration.py @@ -61,7 +61,7 @@ def get_file_info_from_file_contents( data["filename"] = [] for i, file_content in enumerate(file_contents): - _, content_string = file_content.split(",") + content_string = file_content.split(",")[-1] content_bytes = base64.b64decode(content_string) f = io.BytesIO(content_bytes) filetype = detect_filetype(file=f) diff --git a/unstructured/file_utils/filetype.py b/unstructured/file_utils/filetype.py index 3e75b0593..970317b94 100644 --- a/unstructured/file_utils/filetype.py +++ b/unstructured/file_utils/filetype.py @@ -284,6 +284,9 @@ def detect_filetype( encoding = "utf-8" formatted_encoding = format_encoding_str(encoding) + if extension in [".eml", ".md", ".rtf", ".html", ".rst", ".org", ".csv", ".tsv", ".json"]: + return EXT_TO_FILETYPE.get(extension) + # NOTE(crag): for older versions of the OS libmagic package, such as is currently # installed on the Unstructured docker image, .json files resolve to "text/plain" # rather than "application/json". 
this corrects for that case. @@ -296,9 +299,6 @@ def detect_filetype( if file and _check_eml_from_buffer(file=file) is True: return FileType.EML - if extension in [".eml", ".md", ".rtf", ".html", ".rst", ".org", ".tsv", ".json"]: - return EXT_TO_FILETYPE.get(extension) - # Safety catch if mime_type in STR_TO_FILETYPE: return STR_TO_FILETYPE[mime_type] @@ -404,6 +404,13 @@ def _is_text_file_a_json( return re.match(LIST_OF_DICTS_PATTERN, file_text) is not None +def _count_commas(text: str): + """Counts the number of commas in a line, excluding commas in quotes.""" + pattern = r"(?=(?:[^\"]*\"[^\"]*\")*[^\"]*$)," + matches = re.findall(pattern, text) + return len(matches) + + def _is_text_file_a_csv( filename: Optional[str] = None, file: Optional[IO] = None, @@ -415,10 +422,10 @@ def _is_text_file_a_csv( if len(lines) < 2: return False lines = lines[: len(lines)] if len(lines) < 10 else lines[:10] - header = lines[0].split(",") + header_count = _count_commas(lines[0]) if any("," not in line for line in lines): return False - return all(len(line.split(",")) == len(header) for line in lines[:-1]) + return all(_count_commas(line) == header_count for line in lines[:-1]) def _check_eml_from_buffer(file: IO) -> bool: