rfctr(file): improve filetype tests (#3402)

**Summary**
Improve file-detection tests in preparation for additional work and bug
fixes.

**Additional Context**
- Add type annotations.
- Use mocks instead of `monkeypatch` in most cases and verify the calls
made to each mock. This revealed a dozen broken tests: their mocks were
never called, so a different code path than intended was being
exercised.
- Use `example_doc_path()` instead of hard-coded paths.
- Add actual test files for cases where they were being constructed in
temporary directories.
- Make test names consistent and more descriptive of behavior under
test.
This commit is contained in:
Steve Canny 2024-07-15 21:04:34 -07:00 committed by GitHub
parent 0057f9dea8
commit 56ca39ca7f
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
9 changed files with 381 additions and 293 deletions

View File

@ -1,4 +1,4 @@
## 0.15.0-dev11 ## 0.15.0-dev12
### Enhancements ### Enhancements

BIN
example-docs/bmp_24.bmp Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 117 KiB

View File

@ -0,0 +1,3 @@
A,A,A,A,A
A,A,A,"A,A",A
A,A,A,"A,A",A
1 A A A A A
2 A A A A,A A
3 A A A A,A A

7
example-docs/fake.go Normal file
View File

@ -0,0 +1,7 @@
package main
import "fmt"
// main prints a fixed greeting to standard output.
func main() {
	fmt.Println("Hello Go!")
}

19
example-docs/logger.py Normal file
View File

@ -0,0 +1,19 @@
import logging
from typing import Any
logger = logging.getLogger("unstructured")
trace_logger = logging.getLogger("unstructured.trace")
# Create a custom logging level
DETAIL = 15
logging.addLevelName(DETAIL, "DETAIL")
# Create a custom log method for the "DETAIL" level
def detail(self: logging.Logger, message: str, *args: Any, **kwargs: Any):
    """Log `message` at the custom DETAIL level when that level is enabled for this logger."""
    # -- nothing to emit when the logger is not enabled for DETAIL --
    if not self.isEnabledFor(DETAIL):
        return
    # -- positional `args` are forwarded as a single tuple, keyword args are passed through --
    self._log(DETAIL, message, args, **kwargs)
# Add the custom log method to the logging.Logger class
logging.Logger.detail = detail # type: ignore

19
example-docs/simple.yaml Normal file
View File

@ -0,0 +1,19 @@
---
doe: "a deer, a female deer"
ray: "a drop of golden sun"
pi: 3.14159
xmas: true
french-hens: 3
calling-birds:
- huey
- dewey
- louie
- fred
xmas-fifth-day:
calling-birds: four
french-hens: 3
golden-rings: 5
partridges:
count: 1
location: "a pear tree"
turtle-doves: two

BIN
example-docs/simple.zip Normal file

Binary file not shown.

View File

@ -1,12 +1,24 @@
# pyright: reportPrivateUsage=false
"""Test suite for `unstructured.file_utils.filetype`."""
from __future__ import annotations
import io
import os import os
import pathlib import pathlib
import zipfile
import magic
import pytest import pytest
import yaml
from PIL import Image
from test_unstructured.unit_utils import (
FixtureRequest,
LogCaptureFixture,
Mock,
MonkeyPatch,
call,
example_doc_path,
function_mock,
)
from unstructured.file_utils import filetype from unstructured.file_utils import filetype
from unstructured.file_utils.filetype import ( from unstructured.file_utils.filetype import (
FileType, FileType,
@ -17,22 +29,11 @@ from unstructured.file_utils.filetype import (
detect_filetype, detect_filetype,
) )
FILE_DIRECTORY = pathlib.Path(__file__).parent.resolve()
EXAMPLE_DOCS_DIRECTORY = os.path.join(FILE_DIRECTORY, "..", "..", "example-docs")
is_in_docker = os.path.exists("/.dockerenv") is_in_docker = os.path.exists("/.dockerenv")
DOCX_MIME_TYPES = [
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
]
XLSX_MIME_TYPES = [
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
]
@pytest.mark.parametrize( @pytest.mark.parametrize(
("file", "expected"), ("file_name", "expected_value"),
[ [
("layout-parser-paper-fast.pdf", FileType.PDF), ("layout-parser-paper-fast.pdf", FileType.PDF),
("fake.docx", FileType.DOCX), ("fake.docx", FileType.DOCX),
@ -57,13 +58,12 @@ XLSX_MIME_TYPES = [
("fake-incomplete-json.txt", FileType.TXT), ("fake-incomplete-json.txt", FileType.TXT),
], ],
) )
def test_detect_filetype_from_filename(file, expected): def test_detect_filetype_from_filename(file_name: str, expected_value: FileType):
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, file) assert detect_filetype(example_doc_path(file_name)) == expected_value
assert detect_filetype(filename) == expected
@pytest.mark.parametrize( @pytest.mark.parametrize(
("file", "expected"), ("file_name", "expected_value"),
[ [
("layout-parser-paper-fast.pdf", FileType.PDF), ("layout-parser-paper-fast.pdf", FileType.PDF),
("fake.docx", FileType.DOCX), ("fake.docx", FileType.DOCX),
@ -84,340 +84,375 @@ def test_detect_filetype_from_filename(file, expected):
("fake-incomplete-json.txt", FileType.TXT), ("fake-incomplete-json.txt", FileType.TXT),
], ],
) )
def test_detect_filetype_from_filename_with_extension(monkeypatch, file, expected): def test_detect_filetype_from_filename_with_extension(
"""Test that we detect the filetype from the filename extension when libmagic is not available file_name: str, expected_value: FileType, monkeypatch: MonkeyPatch
or the file does not exist.""" ):
# Test when libmagic is not available """File-type is detected from extension when libmagic not available or file does not exist."""
# -- when libmagic is not available --
monkeypatch.setattr(filetype, "LIBMAGIC_AVAILABLE", False) monkeypatch.setattr(filetype, "LIBMAGIC_AVAILABLE", False)
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, file) assert detect_filetype(example_doc_path(file_name)) == expected_value
assert detect_filetype(filename) == expected # -- when file does not exist --
# Test when the file does not exist
monkeypatch.setattr(filetype, "LIBMAGIC_AVAILABLE", True) monkeypatch.setattr(filetype, "LIBMAGIC_AVAILABLE", True)
extension = pathlib.Path(file).suffix extension = pathlib.Path(file_name).suffix
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "not-on-disk" + extension) assert detect_filetype(example_doc_path("not-on-disk" + extension)) == expected_value
assert detect_filetype(filename) == expected
@pytest.mark.parametrize( @pytest.mark.parametrize(
("file", "expected"), ("file_name", "expected_value"),
[ [
("layout-parser-paper-fast.pdf", FileType.PDF), ("layout-parser-paper-fast.pdf", [FileType.PDF]),
("fake.docx", FileType.DOCX), ("fake.docx", [FileType.DOCX]),
("example.jpg", FileType.JPG), ("example.jpg", [FileType.JPG]),
("fake-text.txt", FileType.TXT), ("fake-text.txt", [FileType.TXT]),
("eml/fake-email.eml", FileType.EML), ("eml/fake-email.eml", [FileType.EML]),
("factbook.xml", FileType.XML), ("factbook.xml", [FileType.XML]),
# NOTE(robinson) - For the document, some operating systems return # NOTE(robinson) - For the document, some operating systems return
# */xml and some return */html. Either could be acceptable depending on the OS # */xml and some return */html. Either could be acceptable depending on the OS
("example-10k.html", [FileType.HTML, FileType.XML]), ("example-10k.html", [FileType.HTML, FileType.XML]),
("fake-html.html", FileType.HTML), ("fake-html.html", [FileType.HTML]),
("stanley-cups.xlsx", FileType.XLSX), ("stanley-cups.xlsx", [FileType.XLSX]),
# NOTE(robinson) - currently failing in the docker tests because the detected # NOTE(robinson) - currently failing in the docker tests because the detected
# MIME type is text/csv # MIME type is text/csv
# ("stanley-cups.csv", FileType.CSV), # ("stanley-cups.csv", [FileType.CSV]),
("stanley-cups.tsv", FileType.TSV), ("stanley-cups.tsv", [FileType.TSV]),
("fake-power-point.pptx", FileType.PPTX), ("fake-power-point.pptx", [FileType.PPTX]),
("winter-sports.epub", FileType.EPUB), ("winter-sports.epub", [FileType.EPUB]),
("fake-incomplete-json.txt", FileType.TXT), ("fake-incomplete-json.txt", [FileType.TXT]),
], ],
) )
def test_detect_filetype_from_file(file, expected): def test_detect_filetype_from_file(file_name: str, expected_value: list[FileType]):
expected = expected if isinstance(expected, list) else [expected] with open(example_doc_path(file_name), "rb") as f:
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, file) assert detect_filetype(file=f) in expected_value
with open(filename, "rb") as f:
assert detect_filetype(file=f) in expected
def test_detect_filetype_from_file_warning_without_libmagic(monkeypatch, caplog): def test_detect_filetype_from_file_warns_when_libmagic_is_not_installed(
monkeypatch: MonkeyPatch, caplog: LogCaptureFixture
):
monkeypatch.setattr(filetype, "LIBMAGIC_AVAILABLE", False) monkeypatch.setattr(filetype, "LIBMAGIC_AVAILABLE", False)
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-text.txt") with open(example_doc_path("fake-text.txt"), "rb") as f:
with open(filename, "rb") as f:
detect_filetype(file=f) detect_filetype(file=f)
assert "WARNING" in caplog.text assert "WARNING" in caplog.text
def test_detect_xml_application_xml(monkeypatch): def test_detect_XML_from_application_xml_file_path(magic_from_file_: Mock):
monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: "application/xml") magic_from_file_.return_value = "application/xml"
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake.xml") file_path = example_doc_path("factbook.xml")
filetype = detect_filetype(filename=filename)
filetype = detect_filetype(file_path)
magic_from_file_.assert_called_once_with(file_path, mime=True)
assert filetype == FileType.XML assert filetype == FileType.XML
def test_detect_text_csv(monkeypatch, filename="example-docs/stanley-cup.csv"): def test_detect_CSV_from_text_csv_file_path(magic_from_file_: Mock):
monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: "text/csv") magic_from_file_.return_value = "text/csv"
filetype = detect_filetype(filename=filename) file_path = example_doc_path("stanley-cups.csv")
filetype = detect_filetype(file_path)
magic_from_file_.assert_called_once_with(file_path, mime=True)
assert filetype == FileType.CSV assert filetype == FileType.CSV
def test_detect_text_python_from_filename(monkeypatch, filename="unstructured/logger.py"): def test_detect_TXT_from_text_x_script_python_file_path(magic_from_file_: Mock):
monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: "text/x-script.python") magic_from_file_.return_value = "text/x-script.python"
filetype = detect_filetype(filename=filename) file_path = example_doc_path("logger.py")
filetype = detect_filetype(file_path)
magic_from_file_.assert_called_once_with(file_path, mime=True)
assert filetype == FileType.TXT assert filetype == FileType.TXT
def test_detect_text_python_from_file(monkeypatch, filename="unstructured/logger.py"): def test_detect_TXT_from_text_x_script_python_file(magic_from_buffer_: Mock):
monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: "text/x-script.python") magic_from_buffer_.return_value = "text/x-script.python"
with open(filename, "rb") as f: file_path = example_doc_path("logger.py")
with open(file_path, "rb") as f:
head = f.read(4096)
f.seek(0)
filetype = detect_filetype(file=f) filetype = detect_filetype(file=f)
magic_from_buffer_.assert_called_once_with(head, mime=True)
assert filetype == FileType.TXT assert filetype == FileType.TXT
def test_detects_go_mime_type(): def test_is_code_mime_type_for_Go():
assert _is_code_mime_type("text/x-go") is True assert _is_code_mime_type("text/x-go") is True
def test_detect_xml_application_go(monkeypatch, tmpdir): def test_detect_TXT_from_text_go_file(magic_from_buffer_: Mock):
monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: "text/x-go") magic_from_buffer_.return_value = "text/x-go"
file_path = example_doc_path("fake.go")
filename = os.path.join(tmpdir, "fake.go") with open(file_path, "rb") as f:
with open(filename, "w") as f: head = f.read(4096)
f.write("") f.seek(0)
filetype = detect_filetype(file=f)
with open(filename, "rb") as f: magic_from_buffer_.assert_called_once_with(head, mime=True)
assert detect_filetype(filename=filename) == FileType.TXT assert filetype == FileType.TXT
def test_detect_xml_application_rtf(monkeypatch): def test_detect_RTF_from_application_rtf_file_path(magic_from_file_: Mock):
monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: "application/rtf") magic_from_file_.return_value = "application/rtf"
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake.rtf") file_path = example_doc_path("fake-doc.rtf")
filetype = detect_filetype(filename=filename)
filetype = detect_filetype(file_path)
magic_from_file_.assert_called_once_with(file_path, mime=True)
assert filetype == FileType.RTF assert filetype == FileType.RTF
def test_detect_xml_text_xml(monkeypatch): def test_detect_XML_from_text_xml_file_path(magic_from_file_: Mock):
monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: "text/xml") magic_from_file_.return_value = "text/xml"
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake.xml") file_path = example_doc_path("factbook.xml")
filetype = detect_filetype(filename=filename)
filetype = detect_filetype(file_path)
magic_from_file_.assert_called_once_with(file_path, mime=True)
assert filetype == FileType.XML assert filetype == FileType.XML
def test_detect_html_application_xml(monkeypatch): def test_detect_HTML_from_application_xml_file_path_with_html_extension(magic_from_file_: Mock):
monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: "application/xml") magic_from_file_.return_value = "application/xml"
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake.html") file_path = example_doc_path("fake-html.html")
filetype = detect_filetype(filename=filename)
filetype = detect_filetype(file_path)
magic_from_file_.assert_called_once_with(file_path, mime=True)
assert filetype == FileType.HTML assert filetype == FileType.HTML
def test_detect_html_text_xml(monkeypatch): def test_detect_HTML_from_text_xml_file_path_with_html_extension(magic_from_file_: Mock):
monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: "text/xml") magic_from_file_.return_value = "text/xml"
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake.html") file_path = example_doc_path("fake-html.html")
filetype = detect_filetype(filename=filename)
filetype = detect_filetype(file_path)
magic_from_file_.assert_called_once_with(file_path, mime=True)
assert filetype == FileType.HTML assert filetype == FileType.HTML
def test_detect_docx_filetype_application_octet_stream(monkeypatch): def test_detect_DOCX_from_application_octet_stream_file_no_extension(magic_from_buffer_: Mock):
monkeypatch.setattr( magic_from_buffer_.return_value = "application/octet-stream"
magic, with open(example_doc_path("simple.docx"), "rb") as f:
"from_buffer", file = io.BytesIO(f.read())
lambda *args, **kwargs: "application/octet-stream",
) filetype = detect_filetype(file=file)
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake.docx")
with open(filename, "rb") as f: magic_from_buffer_.assert_called_once_with(file.getvalue()[:4096], mime=True)
filetype = detect_filetype(file=f)
assert filetype == FileType.DOCX assert filetype == FileType.DOCX
def test_detect_docx_filetype_application_octet_stream_with_filename(monkeypatch): def test_detect_DOCX_from_application_octet_stream_file_path(magic_from_file_: Mock):
monkeypatch.setattr( magic_from_file_.return_value = "application/octet-stream"
magic, file_path = example_doc_path("simple.docx")
"from_file",
lambda *args, **kwargs: "application/octet-stream", filetype = detect_filetype(file_path)
)
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake.docx") magic_from_file_.assert_called_once_with(file_path, mime=True)
filetype = detect_filetype(filename=filename)
assert filetype == FileType.DOCX assert filetype == FileType.DOCX
def test_detect_docx_filetype_application_zip(monkeypatch): def test_detect_DOCX_from_application_zip_file_path(magic_from_file_: Mock):
monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: "application/zip") magic_from_file_.return_value = "application/zip"
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake.docx") file_path = example_doc_path("simple.docx")
filetype = detect_filetype(filename=filename)
filetype = detect_filetype(file_path)
magic_from_file_.assert_called_once_with(file_path, mime=True)
assert filetype == FileType.DOCX assert filetype == FileType.DOCX
def test_detect_application_zip_files(monkeypatch, tmpdir): def test_detect_ZIP_from_application_zip_file_path(magic_from_file_: Mock):
monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: "application/zip") magic_from_file_.return_value = "application/zip"
filename = os.path.join(tmpdir, "test.zip") file_path = example_doc_path("simple.zip")
zf = zipfile.ZipFile(filename, "w")
zf.close() filetype = detect_filetype(file_path)
filetype = detect_filetype(filename=filename)
magic_from_file_.assert_called_once_with(file_path, mime=True)
assert filetype == FileType.ZIP assert filetype == FileType.ZIP
def test_detect_doc_file_from_mime_type(monkeypatch): def test_detect_DOC_from_application_msword_file_path(magic_from_file_: Mock):
monkeypatch.setattr( magic_from_file_.return_value = "application/msword"
magic, file_path = example_doc_path("fake.doc")
"from_file",
lambda *args, **kwargs: "application/msword", filetype = detect_filetype(file_path)
)
filetype = detect_filetype(filename="fake.doc") magic_from_file_.assert_called_once_with(file_path, mime=True)
assert filetype == FileType.DOC assert filetype == FileType.DOC
def test_detect_ppt_file_from_mime_type(monkeypatch): def test_detect_PPT_from_application_vnd_ms_powerpoint_file_path(magic_from_file_: Mock):
monkeypatch.setattr( magic_from_file_.return_value = "application/vnd.ms-powerpoint"
magic, file_path = example_doc_path("fake-power-point.ppt")
"from_file",
lambda *args, **kwargs: "application/vnd.ms-powerpoint", filetype = detect_filetype(file_path)
)
filetype = detect_filetype(filename="fake.ppt") magic_from_file_.assert_called_once_with(file_path, mime=True)
assert filetype == FileType.PPT assert filetype == FileType.PPT
def test_detect_xls_file_from_mime_type(monkeypatch): def test_detect_XLS_from_application_vnd_ms_excel_file_path(magic_from_file_: Mock):
monkeypatch.setattr( magic_from_file_.return_value = "application/vnd.ms-excel"
magic, file_path = example_doc_path("tests-example.xls")
"from_file",
lambda *args, **kwargs: "application/vnd.ms-excel", filetype = detect_filetype(file_path)
)
filetype = detect_filetype(filename="fake.xls") magic_from_file_.assert_called_once_with(file_path, mime=True)
assert filetype == FileType.XLS assert filetype == FileType.XLS
def test_detect_xlsx_filetype_application_octet_stream(monkeypatch): def test_detect_XLSX_from_application_octet_stream_file_no_extension(magic_from_buffer_: Mock):
monkeypatch.setattr( magic_from_buffer_.return_value = "application/octet-stream"
magic, with open(example_doc_path("stanley-cups.xlsx"), "rb") as f:
"from_buffer", file = io.BytesIO(f.read())
lambda *args, **kwargs: "application/octet-stream",
) filetype = detect_filetype(file=file)
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "stanley-cups.xlsx")
with open(filename, "rb") as f: magic_from_buffer_.assert_called_once_with(file.getvalue()[:4096], mime=True)
filetype = detect_filetype(file=f)
assert filetype == FileType.XLSX assert filetype == FileType.XLSX
def test_detect_xlsx_filetype_application_octet_stream_with_filename(monkeypatch): def test_detect_XLSX_from_application_octet_stream_file_path(magic_from_file_: Mock):
monkeypatch.setattr( magic_from_file_.return_value = "application/octet-stream"
magic, file_path = example_doc_path("stanley-cups.xlsx")
"from_file",
lambda *args, **kwargs: "application/octet-stream", filetype = detect_filetype(file_path)
)
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "stanley-cups.xlsx") magic_from_file_.assert_called_once_with(file_path, mime=True)
filetype = detect_filetype(filename=filename)
assert filetype == FileType.XLSX assert filetype == FileType.XLSX
def test_detect_pptx_filetype_application_octet_stream(monkeypatch): def test_detect_PPTX_from_application_octet_stream_file_no_extension(magic_from_buffer_: Mock):
monkeypatch.setattr( magic_from_buffer_.return_value = "application/octet-stream"
magic, with open(example_doc_path("fake-power-point.pptx"), "rb") as f:
"from_buffer", file = io.BytesIO(f.read())
lambda *args, **kwargs: "application/octet-stream",
) filetype = detect_filetype(file=file)
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point.pptx")
with open(filename, "rb") as f: magic_from_buffer_.assert_called_once_with(file.getvalue()[:4096], mime=True)
filetype = detect_filetype(file=f)
assert filetype == FileType.PPTX assert filetype == FileType.PPTX
def test_detect_pptx_filetype_application_octet_stream_with_filename(monkeypatch): def test_detect_PPTX_from_application_octet_stream_file_path(magic_from_file_: Mock):
monkeypatch.setattr( magic_from_file_.return_value = "application/octet-stream"
magic, file_path = example_doc_path("fake-power-point.pptx")
"from_file",
lambda *args, **kwargs: "application/octet-stream", filetype = detect_filetype(file_path)
)
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point.pptx") magic_from_file_.assert_called_once_with(file_path, mime=True)
filetype = detect_filetype(filename=filename)
assert filetype == FileType.PPTX assert filetype == FileType.PPTX
def test_detect_application_octet_stream_returns_none_with_unknown(monkeypatch): def test_detect_UNK_from_application_octet_stream_text_file_no_extension(magic_from_buffer_: Mock):
monkeypatch.setattr( magic_from_buffer_.return_value = "application/octet-stream"
magic, with open(example_doc_path("fake-text.txt"), "rb") as f:
"from_buffer", file = io.BytesIO(f.read())
lambda *args, **kwargs: "application/octet-stream",
) filetype = detect_filetype(file=file)
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-text.txt")
with open(filename, "rb") as f: assert magic_from_buffer_.call_args_list == [
filetype = detect_filetype(file=f) call(file.getvalue()[:4096], mime=True),
call(b"", mime=True),
]
assert filetype == FileType.UNK assert filetype == FileType.UNK
def test_detect_application_zip_returns_zip_with_unknown(monkeypatch): def test_detect_ZIP_from_application_zip_not_a_zip_file(magic_from_buffer_: Mock):
monkeypatch.setattr(magic, "from_buffer", lambda *args, **kwargs: "application/zip") magic_from_buffer_.return_value = "application/zip"
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-text.txt")
with open(filename, "rb") as f: with open(example_doc_path("fake-text.txt"), "rb") as f:
head = f.read(4096)
f.seek(0)
filetype = detect_filetype(file=f) filetype = detect_filetype(file=f)
assert magic_from_buffer_.call_args_list == [
call(head, mime=True),
call(b"", mime=True),
]
assert filetype == FileType.ZIP assert filetype == FileType.ZIP
def test_detect_docx_filetype_word_mime_type(monkeypatch): def test_detect_DOCX_from_docx_mime_type_file_no_extension(magic_from_buffer_: Mock):
monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: DOCX_MIME_TYPES[0]) magic_from_buffer_.return_value = (
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake.docx") "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
with open(filename, "rb") as f: )
filetype = detect_filetype(file=f) with open(example_doc_path("simple.docx"), "rb") as f:
file = io.BytesIO(f.read())
filetype = detect_filetype(file=file)
magic_from_buffer_.assert_called_once_with(file.getvalue()[:4096], mime=True)
assert filetype == FileType.DOCX assert filetype == FileType.DOCX
def test_detect_xlsx_filetype_word_mime_type(monkeypatch): def test_detect_XLSX_from_xlsx_mime_type_file_no_extension(magic_from_buffer_: Mock):
monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: XLSX_MIME_TYPES[0]) magic_from_buffer_.return_value = (
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "stanley-cups.xlsx") "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
with open(filename, "rb") as f: )
filetype = detect_filetype(file=f) with open(example_doc_path("stanley-cups.xlsx"), "rb") as f:
file = io.BytesIO(f.read())
filetype = detect_filetype(file=file)
magic_from_buffer_.assert_called_once_with(file.getvalue()[:4096], mime=True)
assert filetype == FileType.XLSX assert filetype == FileType.XLSX
def test_detect_filetype_returns_none_with_unknown(monkeypatch): def test_detect_UNK_from_extension_of_non_existent_file_path():
monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: "application/fake") assert detect_filetype(example_doc_path("made_up.fake")) == FileType.UNK
assert detect_filetype(filename="made_up.fake") == FileType.UNK
def test_detect_filetype_detects_png(monkeypatch): def test_detect_PNG_from_extension_of_non_existent_file_path():
monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: "image/png") assert detect_filetype(example_doc_path("made_up.png")) == FileType.PNG
assert detect_filetype(filename="made_up.png") == FileType.PNG
def test_detect_filetype_detects_unknown_text_types_as_txt(monkeypatch, tmpdir): def test_detect_TXT_from_unknown_text_subtype_file_no_extension(magic_from_buffer_: Mock):
monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: "text/new-type") magic_from_buffer_.return_value = "text/new-type"
monkeypatch.setattr(os.path, "isfile", lambda *args, **kwargs: True) with open(example_doc_path("fake-text.txt"), "rb") as f:
file = io.BytesIO(f.read())
filename = os.path.join(tmpdir.dirname, "made_up.png") filetype = detect_filetype(file=file)
with open(filename, "w") as f:
f.write("here is a fake file!")
assert detect_filetype(filename=filename) == FileType.TXT magic_from_buffer_.assert_called_once_with(file.getvalue()[:4096], mime=True)
assert filetype == FileType.TXT
def test_detect_filetype_detects_bmp_from_filename( def test_detect_BMP_from_file_path():
tmpdir, assert detect_filetype(example_doc_path("bmp_24.bmp")) == FileType.BMP
filename="example-docs/layout-parser-paper-with-table.jpg",
):
bmp_filename = os.path.join(tmpdir.dirname, "example.bmp")
img = Image.open(filename)
img.save(bmp_filename)
detect_filetype(filename=bmp_filename) == FileType.BMP
@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container") def test_detect_BMP_from_file_no_extension():
def test_detect_filetype_detects_bmp_from_file( with open(example_doc_path("bmp_24.bmp"), "rb") as f:
tmpdir, file = io.BytesIO(f.read())
filename="example-docs/layout-parser-paper-with-table.jpg", assert detect_filetype(file=file) == FileType.BMP
):
bmp_filename = os.path.join(tmpdir.dirname, "example.bmp")
img = Image.open(filename)
img.save(bmp_filename)
with open(bmp_filename, "rb") as f:
assert detect_filetype(file=f) == FileType.BMP
def test_detect_filetype_raises_with_both_specified(): def test_detect_filetype_raises_when_both_path_and_file_like_object_are_specified():
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "eml/fake-email.eml") file_path = example_doc_path("fake-email.eml")
with open(filename, "rb") as f, pytest.raises(ValueError): with open(example_doc_path(file_path), "rb") as f:
detect_filetype(filename=filename, file=f) file = io.BytesIO(f.read())
with pytest.raises(ValueError, match="Exactly one of filename and file must be specified."):
detect_filetype(filename=file_path, file=file)
def test_detect_filetype_raises_with_none_specified(): def test_detect_filetype_raises_with_neither_path_or_file_like_object_specified():
with pytest.raises(ValueError): with pytest.raises(ValueError, match="Exactly one of filename and file must be specified."):
detect_filetype() detect_filetype()
def test_filetype_order(): def test_FileType_is_ordererd_by_name():
assert FileType.HTML < FileType.XML """FileType is a total order on name, e.g. FileType.A < FileType.B."""
assert FileType.EML < FileType.HTML < FileType.XML
@pytest.mark.parametrize( @pytest.mark.parametrize(
("content", "expected"), ("content", "expected_value"),
[ [
(b"d\xe2\x80", False), # Invalid JSON (b"d\xe2\x80", False), # Invalid JSON
(b'[{"key": "value"}]', True), # Valid JSON (b'[{"key": "value"}]', True), # Valid JSON
@ -425,15 +460,13 @@ def test_filetype_order():
(b'"This is not a JSON"', False), # Serializable as JSON, but we want to treat it as txt (b'"This is not a JSON"', False), # Serializable as JSON, but we want to treat it as txt
], ],
) )
def test_is_text_file_a_json(content, expected): def test_is_text_file_a_json_distinguishes_JSON_from_text(content: bytes, expected_value: bool):
from io import BytesIO with io.BytesIO(content) as f:
assert _is_text_file_a_json(file=f) == expected_value
with BytesIO(content) as f:
assert _is_text_file_a_json(file=f) == expected
@pytest.mark.parametrize( @pytest.mark.parametrize(
("content", "expected"), ("content", "expected_value"),
[ [
(b"d\xe2\x80", False), # Invalid CSV (b"d\xe2\x80", False), # Invalid CSV
(b'[{"key": "value"}]', False), # Invalid CSV (b'[{"key": "value"}]', False), # Invalid CSV
@ -441,76 +474,83 @@ def test_is_text_file_a_json(content, expected):
(b"", False), # Empty content (b"", False), # Empty content
], ],
) )
def test_is_text_file_a_csv(content, expected): def test_is_text_file_a_csv_distinguishes_CSV_from_text(content: bytes, expected_value: bool):
from io import BytesIO with io.BytesIO(content) as f:
assert _is_text_file_a_csv(file=f) == expected_value
with BytesIO(content) as f:
assert _is_text_file_a_csv(file=f) == expected
def test_csv_json_check_with_filename_and_utf_32(filename="example-docs/fake-text-utf-32.txt"): def test_csv_and_json_checks_with_filename_accommodate_utf_32_encoded_file():
assert _is_text_file_a_csv(filename=filename) is False file_path = example_doc_path("fake-text-utf-32.txt")
assert _is_text_file_a_json(filename=filename) is False assert _is_text_file_a_csv(filename=file_path) is False
assert _is_text_file_a_json(filename=file_path) is False
def test_csv_json_check_with_file_and_utf_32(filename="example-docs/fake-text-utf-32.txt"): def test_csv_and_json_checks_with_file_accommodate_utf_32_encoded_content():
with open(filename, "rb") as f: with open(example_doc_path("fake-text-utf-32.txt"), "rb") as f:
assert _is_text_file_a_csv(file=f) is False file = io.BytesIO(f.read())
with open(filename, "rb") as f: assert _is_text_file_a_csv(file=file) is False
assert _is_text_file_a_json(file=f) is False file.seek(0)
assert _is_text_file_a_json(file=file) is False
def test_detect_filetype_detects_empty_filename(filename="example-docs/empty.txt"): def test_detect_EMPTY_from_file_path_to_empty_file():
assert detect_filetype(filename=filename) == FileType.EMPTY assert detect_filetype(example_doc_path("empty.txt")) == FileType.EMPTY
def test_detect_filetype_detects_empty_file(filename="example-docs/empty.txt"): def test_detect_EMPTY_from_file_that_is_empty():
with open(filename, "rb") as f: with open(example_doc_path("empty.txt"), "rb") as f:
assert detect_filetype(file=f) == FileType.EMPTY assert detect_filetype(file=f) == FileType.EMPTY
def test_detect_filetype_skips_escape_commas_for_csv(tmpdir): def test_detect_CSV_from_path_and_file_when_content_contains_escaped_commas():
text = 'A,A,A,A,A\nA,A,A,"A,A",A\nA,A,A,"A,A",A' file_path = example_doc_path("csv-with-escaped-commas.csv")
filename = os.path.join(tmpdir.dirname, "csv-with-escaped-commas.csv")
with open(filename, "w") as f:
f.write(text)
assert detect_filetype(filename=filename) == FileType.CSV assert detect_filetype(filename=file_path) == FileType.CSV
with open(file_path, "rb") as f:
with open(filename, "rb") as f:
assert detect_filetype(file=f) == FileType.CSV assert detect_filetype(file=f) == FileType.CSV
def test_detect_filetype_from_octet_stream(filename="example-docs/emoji.xlsx"): def test_detect_filetype_from_octet_stream():
with open(filename, "rb") as f: with open(example_doc_path("emoji.xlsx"), "rb") as f:
assert _detect_filetype_from_octet_stream(file=f) == FileType.XLSX assert _detect_filetype_from_octet_stream(file=f) == FileType.XLSX
def test_detect_wav_from_filename(filename="example-docs/CantinaBand3.wav"): def test_detect_WAV_from_filename():
assert detect_filetype(filename=filename) == FileType.WAV assert detect_filetype(example_doc_path("CantinaBand3.wav")) == FileType.WAV
def test_detect_wav_from_file(filename="example-docs/CantinaBand3.wav"): def test_detect_wav_from_file():
with open(filename, "rb") as f: with open(example_doc_path("CantinaBand3.wav"), "rb") as f:
assert detect_filetype(file=f) == FileType.WAV assert detect_filetype(file=f) == FileType.WAV
def test_detect_yaml_as_text_from_filename(tmpdir): def test_detect_TXT_from_file_path_to_yaml():
data = {"hi": "there", "this is": "yaml"} assert detect_filetype(example_doc_path("simple.yaml")) == FileType.TXT
filename = os.path.join(tmpdir.dirname, "test.yaml")
with open(filename, "w") as f:
yaml.dump(data, f)
assert detect_filetype(filename=filename) == FileType.TXT
def test_detect_yaml_as_text_from_file(tmpdir, monkeypatch): def test_detect_TXT_from_yaml_file(magic_from_buffer_: Mock):
monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: "text/yaml") magic_from_buffer_.return_value = "text/yaml"
data = {"hi": "there", "this is": "yaml"}
filename = os.path.join(tmpdir.dirname, "test.yaml")
with open(filename, "w") as f:
yaml.dump(data, f)
with open(filename, "rb") as f: with open(example_doc_path("simple.yaml"), "rb") as f:
assert detect_filetype(file=f) == FileType.TXT head = f.read(4096)
f.seek(0)
file_type = detect_filetype(file=f)
magic_from_buffer_.assert_called_once_with(head, mime=True)
assert file_type == FileType.TXT
# ================================================================================================
# MODULE-LEVEL FIXTURES
# ================================================================================================
@pytest.fixture()
def magic_from_buffer_(request: FixtureRequest):
    """Return the mock `function_mock()` creates for `magic.from_buffer` in the filetype module."""
    target = "unstructured.file_utils.filetype.magic.from_buffer"
    return function_mock(request, target)
@pytest.fixture()
def magic_from_file_(request: FixtureRequest):
    """Return the mock `function_mock()` creates for `magic.from_file` in the filetype module."""
    target = "unstructured.file_utils.filetype.magic.from_file"
    return function_mock(request, target)

View File

@ -1 +1 @@
__version__ = "0.15.0-dev11" # pragma: no cover __version__ = "0.15.0-dev12" # pragma: no cover