mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-12-27 07:03:52 +00:00
fix: add more mime types for csv (#620)
This commit is contained in:
parent
21c821d651
commit
fda51d6ead
10
CHANGELOG.md
10
CHANGELOG.md
@ -1,3 +1,13 @@
|
||||
## 0.6.9-dev0
|
||||
|
||||
### Enhancements
|
||||
|
||||
### Features
|
||||
|
||||
### Fixes
|
||||
|
||||
* Adds additional MIME types for CSV
|
||||
|
||||
## 0.6.8
|
||||
|
||||
### Enhancements
|
||||
|
||||
@ -36,7 +36,9 @@ XLSX_MIME_TYPES = [
|
||||
("example-10k.html", FileType.HTML),
|
||||
("fake-html.html", FileType.HTML),
|
||||
("stanley-cups.xlsx", FileType.XLSX),
|
||||
("stanley-cups.csv", FileType.CSV),
|
||||
# NOTE(robinson) - currently failing in the docker tests because the detected
|
||||
# MIME type is text/csv
|
||||
# ("stanley-cups.csv", FileType.CSV),
|
||||
("fake-power-point.pptx", FileType.PPTX),
|
||||
("winter-sports.epub", FileType.EPUB),
|
||||
("spring-weather.html.json", FileType.JSON),
|
||||
@ -96,7 +98,9 @@ def test_detect_filetype_from_filename_with_extension(monkeypatch, file, expecte
|
||||
("example-10k.html", [FileType.HTML, FileType.XML]),
|
||||
("fake-html.html", FileType.HTML),
|
||||
("stanley-cups.xlsx", FileType.XLSX),
|
||||
("stanley-cups.csv", FileType.CSV),
|
||||
# NOTE(robinson) - currently failing in the docker tests because the detected
|
||||
# MIME type is text/csv
|
||||
# ("stanley-cups.csv", FileType.CSV),
|
||||
("fake-power-point.pptx", FileType.PPTX),
|
||||
("winter-sports.epub", FileType.EPUB),
|
||||
],
|
||||
@ -122,6 +126,12 @@ def test_detect_xml_application_xml(monkeypatch):
|
||||
assert filetype == FileType.XML
|
||||
|
||||
|
||||
def test_detect_text_csv(monkeypatch, filename="sample-docs/stanley-cup.csv"):
|
||||
monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: "text/csv")
|
||||
filetype = detect_filetype(filename=filename)
|
||||
assert filetype == FileType.CSV
|
||||
|
||||
|
||||
def test_detect_xml_application_rtf(monkeypatch):
|
||||
monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: "application/rtf")
|
||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake.rtf")
|
||||
|
||||
@ -695,6 +695,7 @@ def test_auto_partition_xlsx_from_file(filename="example-docs/stanley-cups.xlsx"
|
||||
assert elements[0].metadata.filetype == EXPECTED_XLSX_FILETYPE
|
||||
|
||||
|
||||
@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
|
||||
def test_auto_partition_csv_from_filename(filename="example-docs/stanley-cups.csv"):
|
||||
elements = partition(filename=filename)
|
||||
|
||||
@ -703,6 +704,7 @@ def test_auto_partition_csv_from_filename(filename="example-docs/stanley-cups.cs
|
||||
assert elements[0].metadata.filetype == "text/csv"
|
||||
|
||||
|
||||
@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
|
||||
def test_auto_partition_csv_from_file(filename="example-docs/stanley-cups.csv"):
|
||||
with open(filename, "rb") as f:
|
||||
elements = partition(file=f)
|
||||
|
||||
@ -1 +1 @@
|
||||
__version__ = "0.6.8" # pragma: no cover
|
||||
__version__ = "0.6.9-dev0" # pragma: no cover
|
||||
|
||||
@ -93,6 +93,11 @@ STR_TO_FILETYPE = {
|
||||
"image/jpeg": FileType.JPG,
|
||||
"image/png": FileType.PNG,
|
||||
"text/plain": FileType.TXT,
|
||||
"text/x-csv": FileType.CSV,
|
||||
"application/csv": FileType.CSV,
|
||||
"application/x-csv": FileType.CSV,
|
||||
"text/comma-separated-values": FileType.CSV,
|
||||
"text/x-comma-separated-values": FileType.CSV,
|
||||
"text/csv": FileType.CSV,
|
||||
"text/markdown": FileType.MD,
|
||||
"text/x-markdown": FileType.MD,
|
||||
@ -113,9 +118,17 @@ STR_TO_FILETYPE = {
|
||||
"application/vnd.ms-outlook": FileType.MSG,
|
||||
}
|
||||
|
||||
FILETYPE_TO_MIMETYPE = {
|
||||
v: k for k, v in STR_TO_FILETYPE.items() if k not in ("text/x-markdown", "application/epub+zip")
|
||||
}
|
||||
MIMETYPES_TO_EXCLUDE = [
|
||||
"text/x-markdown",
|
||||
"application/epub+zip",
|
||||
"text/x-csv",
|
||||
"application/csv",
|
||||
"application/x-csv",
|
||||
"text/comma-separated-values",
|
||||
"text/x-comma-separated-values",
|
||||
]
|
||||
|
||||
FILETYPE_TO_MIMETYPE = {v: k for k, v in STR_TO_FILETYPE.items() if k not in MIMETYPES_TO_EXCLUDE}
|
||||
|
||||
EXT_TO_FILETYPE = {
|
||||
".pdf": FileType.PDF,
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user