From 21df17f7fa7b9e6c7c94a9f88e1088d61fee6ad7 Mon Sep 17 00:00:00 2001 From: M Bharat lal Date: Sat, 14 Oct 2023 02:06:05 +0530 Subject: [PATCH] fix: consider all the required lines instead of first line to detect file type as CSV (#1728) The current CSV file detection logic in file_utils/filetype.py does not consider all the lines when counting the number of commas; it compares only the first line against itself, so the check always returns true ``` lines = lines[: len(lines)] if len(lines) < 10 else lines[:10] header_count = _count_commas(lines[0]) if any("," not in line for line in lines): return False return all(_count_commas(line) == header_count for line in lines[:1]) ``` Fixed the issue by considering all the lines except the first line, as shown below ``` lines = lines[: len(lines)] if len(lines) < 10 else lines[:10] header_count = _count_commas(lines[0]) if any("," not in line for line in lines): return False return all(_count_commas(line) == header_count for line in lines[1:]) ``` --- CHANGELOG.md | 1 + unstructured/file_utils/filetype.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index fd0ee7f15..222afb7f0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,7 @@ ### Enhancements * **Add functionality to limit precision when serializing to json** Precision for `points` is limited to 1 decimal point if coordinates["system"] == "PixelSpace" (otherwise 2 decimal points?). Precision for `detection_class_prob` is limited to 5 decimal points. +* **Fix csv file detection logic when mime-type is text/plain** Previously the logic to detect csv file type was considering only first row's comma count comparing with the header_row comma count and both the rows being same line the result was always true, Now the logic is changed to consider the comma's count for all the lines except first line and compare with header_row comma count. 
### Features diff --git a/unstructured/file_utils/filetype.py b/unstructured/file_utils/filetype.py index 0b786b2b9..90d610b80 100644 --- a/unstructured/file_utils/filetype.py +++ b/unstructured/file_utils/filetype.py @@ -496,7 +496,7 @@ def _is_text_file_a_csv( header_count = _count_commas(lines[0]) if any("," not in line for line in lines): return False - return all(_count_commas(line) == header_count for line in lines[:1]) + return all(_count_commas(line) == header_count for line in lines[1:]) def _check_eml_from_buffer(file: IO[bytes]) -> bool: