mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-08-18 21:57:58 +00:00
fix: ignore escaped commas in CSV checks (#832)
* fix file content checking bug
* skip counting commas in quotes for csv detection
* add test for comma count
* change file content grab to -1
* version and changelog
* add csv to extension check
* add file to tests
* ingest-test-fixtures-update
* Update ingest test fixtures (#833)
  Co-authored-by: MthwRobinson <MthwRobinson@users.noreply.github.com>
* fix typo
* fix changelog wording
---------
Co-authored-by: MthwRobinson <MthwRobinson@users.noreply.github.com>
This commit is contained in:
parent
06077b09ee
commit
38457777fa
@ -10,6 +10,8 @@
|
|||||||
|
|
||||||
### Fixes
|
### Fixes
|
||||||
|
|
||||||
|
* CSV check now ignores escaped commas.
|
||||||
|
* Fix for filetype exploration util when file content does not have a comma.
|
||||||
* Adds negative lookahead to bullet pattern to avoid detecting plain text line
|
* Adds negative lookahead to bullet pattern to avoid detecting plain text line
|
||||||
breaks like `-------` as list items.
|
breaks like `-------` as list items.
|
||||||
* Fix pre tag parsing for `partition_html`
|
* Fix pre tag parsing for `partition_html`
|
||||||
|
@ -424,3 +424,15 @@ def test_detect_filetype_detects_empty_filename(filename="example-docs/empty.txt
|
|||||||
def test_detect_filetype_detects_empty_file(filename="example-docs/empty.txt"):
|
def test_detect_filetype_detects_empty_file(filename="example-docs/empty.txt"):
|
||||||
with open(filename, "rb") as f:
|
with open(filename, "rb") as f:
|
||||||
assert detect_filetype(file=f) == FileType.EMPTY
|
assert detect_filetype(file=f) == FileType.EMPTY
|
||||||
|
|
||||||
|
|
||||||
|
def test_detect_filetype_skips_escape_commas_for_csv(tmpdir):
    """CSV detection must not count commas that sit inside quoted fields.

    The second and third rows contain a quoted field with an embedded comma
    (``"A,A"``), so a naive ``split(",")`` would see six columns against the
    header's five. Detection is checked both by filename and by file object.
    """
    text = 'A,A,A,A,A\nA,A,A,"A,A",A\nA,A,A,"A,A",A'
    # Write inside this test's own temporary directory. ``tmpdir.dirname`` is
    # the parent directory of ``tmpdir``, so writing there can collide with
    # same-named files created by other tests.
    filename = os.path.join(str(tmpdir), "csv-with-escaped-commas.csv")
    with open(filename, "w") as f:
        f.write(text)

    assert detect_filetype(filename=filename) == FileType.CSV

    with open(filename, "rb") as f:
        assert detect_filetype(file=f) == FileType.CSV
|
||||||
|
@ -27,13 +27,13 @@
|
|||||||
"text": "Note: Remember to write your own \"OPENING MESSAGE\" before you copy and paste the template. please always include [TREASURE HARUTO] for example:"
|
"text": "Note: Remember to write your own \"OPENING MESSAGE\" before you copy and paste the template. please always include [TREASURE HARUTO] for example:"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"type": "NarrativeText",
|
"type": "UncategorizedText",
|
||||||
"element_id": "577b0ea5cd4f62a19b9d14dd0bd7272e",
|
"element_id": "23a37aa2d5f39d5e2275dec011be76be",
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"data_source": {},
|
"data_source": {},
|
||||||
"filetype": "image/png"
|
"filetype": "image/png"
|
||||||
},
|
},
|
||||||
"text": "안녕하세요, 저 희 는 YGEAS 그룹 TREASUREWH HARUTOM|2] 팬 입니다. 팬 으 로서, HARUTO 씨 받 는 대 우 에 대해 의 구 심 과 불 공 평 함 을 LRU, 이 메 일 을 통해 저 희 의 의 혹 을 전 달 하여 귀 사 의 진지한 고 민 과 적극적인 답 변 을 받을 수 있 기 를 바랍니다."
|
"text": "StS ofAl2, AS|E YGEAS 1B TREASUREMH HARUTOM| 2] BHEYLICH, BHO AY, HARUTO M BE = WSO Hol Wat SSGstsS LRU, O| Wil BS SH ASP ASS AZopO} HAS] TAI St a Bat FSAel GAS HS + U7/S HLICh."
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"type": "NarrativeText",
|
"type": "NarrativeText",
|
||||||
@ -46,11 +46,11 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"type": "NarrativeText",
|
"type": "NarrativeText",
|
||||||
"element_id": "da91cb5310386aafd5aaf6fd988b9616",
|
"element_id": "c37248b6d436997d36acc7852f502a8e",
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"data_source": {},
|
"data_source": {},
|
||||||
"filetype": "image/png"
|
"filetype": "image/png"
|
||||||
},
|
},
|
||||||
"text": "4. Use the hashtag of Haruto on your tweet to show that vou have sent vour email]"
|
"text": "4. Use the hashtag of Haruto on your tweet to show that vou have sent your email]"
|
||||||
}
|
}
|
||||||
]
|
]
|
@ -61,7 +61,7 @@ def get_file_info_from_file_contents(
|
|||||||
data["filename"] = []
|
data["filename"] = []
|
||||||
|
|
||||||
for i, file_content in enumerate(file_contents):
|
for i, file_content in enumerate(file_contents):
|
||||||
_, content_string = file_content.split(",")
|
content_string = file_content.split(",")[-1]
|
||||||
content_bytes = base64.b64decode(content_string)
|
content_bytes = base64.b64decode(content_string)
|
||||||
f = io.BytesIO(content_bytes)
|
f = io.BytesIO(content_bytes)
|
||||||
filetype = detect_filetype(file=f)
|
filetype = detect_filetype(file=f)
|
||||||
|
@ -284,6 +284,9 @@ def detect_filetype(
|
|||||||
encoding = "utf-8"
|
encoding = "utf-8"
|
||||||
formatted_encoding = format_encoding_str(encoding)
|
formatted_encoding = format_encoding_str(encoding)
|
||||||
|
|
||||||
|
if extension in [".eml", ".md", ".rtf", ".html", ".rst", ".org", ".csv", ".tsv", ".json"]:
|
||||||
|
return EXT_TO_FILETYPE.get(extension)
|
||||||
|
|
||||||
# NOTE(crag): for older versions of the OS libmagic package, such as is currently
|
# NOTE(crag): for older versions of the OS libmagic package, such as is currently
|
||||||
# installed on the Unstructured docker image, .json files resolve to "text/plain"
|
# installed on the Unstructured docker image, .json files resolve to "text/plain"
|
||||||
# rather than "application/json". this corrects for that case.
|
# rather than "application/json". this corrects for that case.
|
||||||
@ -296,9 +299,6 @@ def detect_filetype(
|
|||||||
if file and _check_eml_from_buffer(file=file) is True:
|
if file and _check_eml_from_buffer(file=file) is True:
|
||||||
return FileType.EML
|
return FileType.EML
|
||||||
|
|
||||||
if extension in [".eml", ".md", ".rtf", ".html", ".rst", ".org", ".tsv", ".json"]:
|
|
||||||
return EXT_TO_FILETYPE.get(extension)
|
|
||||||
|
|
||||||
# Safety catch
|
# Safety catch
|
||||||
if mime_type in STR_TO_FILETYPE:
|
if mime_type in STR_TO_FILETYPE:
|
||||||
return STR_TO_FILETYPE[mime_type]
|
return STR_TO_FILETYPE[mime_type]
|
||||||
@ -404,6 +404,13 @@ def _is_text_file_a_json(
|
|||||||
return re.match(LIST_OF_DICTS_PATTERN, file_text) is not None
|
return re.match(LIST_OF_DICTS_PATTERN, file_text) is not None
|
||||||
|
|
||||||
|
|
||||||
|
def _count_commas(text: str):
|
||||||
|
"""Counts the number of commas in a line, excluding commas in quotes."""
|
||||||
|
pattern = r"(?=(?:[^\"]*\"[^\"]*\")*[^\"]*$),"
|
||||||
|
matches = re.findall(pattern, text)
|
||||||
|
return len(matches)
|
||||||
|
|
||||||
|
|
||||||
def _is_text_file_a_csv(
|
def _is_text_file_a_csv(
|
||||||
filename: Optional[str] = None,
|
filename: Optional[str] = None,
|
||||||
file: Optional[IO] = None,
|
file: Optional[IO] = None,
|
||||||
@ -415,10 +422,10 @@ def _is_text_file_a_csv(
|
|||||||
if len(lines) < 2:
|
if len(lines) < 2:
|
||||||
return False
|
return False
|
||||||
lines = lines[: len(lines)] if len(lines) < 10 else lines[:10]
|
lines = lines[: len(lines)] if len(lines) < 10 else lines[:10]
|
||||||
header = lines[0].split(",")
|
header_count = _count_commas(lines[0])
|
||||||
if any("," not in line for line in lines):
|
if any("," not in line for line in lines):
|
||||||
return False
|
return False
|
||||||
return all(len(line.split(",")) == len(header) for line in lines[:-1])
|
return all(_count_commas(line) == header_count for line in lines[:1])
|
||||||
|
|
||||||
|
|
||||||
def _check_eml_from_buffer(file: IO) -> bool:
|
def _check_eml_from_buffer(file: IO) -> bool:
|
||||||
|
Loading…
x
Reference in New Issue
Block a user