diff --git a/CHANGELOG.md b/CHANGELOG.md index fd0ee7f15..222afb7f0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,7 @@ ### Enhancements * **Add functionality to limit precision when serializing to json** Precision for `points` is limited to 1 decimal point if coordinates["system"] == "PixelSpace" (otherwise 2 decimal points?). Precision for `detection_class_prob` is limited to 5 decimal points. +* **Fix csv file detection logic when mime-type is text/plain** Previously, the logic to detect the csv file type compared only the first row's comma count with the header_row comma count; because both are the same line, the result was always true. Now the logic compares the comma count of every line except the first with the header_row comma count. ### Features diff --git a/unstructured/file_utils/filetype.py b/unstructured/file_utils/filetype.py index 0b786b2b9..90d610b80 100644 --- a/unstructured/file_utils/filetype.py +++ b/unstructured/file_utils/filetype.py @@ -496,7 +496,7 @@ def _is_text_file_a_csv( header_count = _count_commas(lines[0]) if any("," not in line for line in lines): return False - return all(_count_commas(line) == header_count for line in lines[:1]) + return all(_count_commas(line) == header_count for line in lines[1:]) def _check_eml_from_buffer(file: IO[bytes]) -> bool: