Adding content_type and file_filename to autopartition (#394)

Co-authored-by: cragwolfe <crag@unstructured.io>
2025-12-27 15:13:35 +00:00 · 2023-03-24 16:32:45 -07:00 · 2023-03-24 16:32:45 -07:00 · 71e035c34c
commit 71e035c34c
parent 8ffd31029e
6 changed files with 110 additions and 23 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -1,9 +1,10 @@
-## 0.5.7-dev3
+## 0.5.7

 ### Enhancements

 * Refactored codebase using `exactly_one`
 * Adds ability to pass headers when passing a url in partition_html()
+* Added optional `content_type` and `file_filename` parameters to `partition()` to bypass file detection

 ### Features

--- a/docs/source/bricks.rst
+++ b/docs/source/bricks.rst
@ -30,6 +30,7 @@ In cases where ``libmagic`` is not available, filetype detection will fall back
 As shown in the examples below, the ``partition`` function accepts both filenames and file-like objects as input.
 ``partition`` also has some optional kwargs.
 For example, if you set ``include_page_breaks=True``, the output will include ``PageBreak`` elements if the filetype supports it.
+Additionally, you can bypass the filetype detection logic with the optional ``content_type`` argument, which may be specified with either the ``filename`` or the file-like object, ``file``.
 You can find a full listing of optional kwargs in the documentation below.

 .. code:: python
@ -38,7 +39,7 @@ You can find a full listing of optional kwargs in the documentation below.


  filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.pdf")
-  elements = partition(filename=filename)
+  elements = partition(filename=filename, content_type="application/pdf")
  print("\n\n".join([str(el) for el in elements][:10]))


@ -57,7 +58,7 @@ The ``unstructured`` library also includes partitioning bricks targeted at speci
 The ``partition`` brick uses these document-specific partitioning bricks under the hood.
 There are a few reasons you may want to use a document-specific partitioning brick instead of ``partition``:

-* If you already know the document type, filetype detection is unnecessary. Using the document-specific brick directly will make your program run faster.
+* If you already know the document type, filetype detection is unnecessary. Using the document-specific brick directly, or passing in the ``content_type``, will make your program run faster.
 * Fewer dependencies. You don't need to install ``libmagic`` for filetype detection if you're only using document-specific bricks.
 * Additional features. The API for partition is the least common denominator for all document types. Certain document-specific brick include extra features that you may want to take advantage of. For example, ``partition_html`` allows you to pass in a URL so you don't have to store the ``.html`` file locally. See the documentation below learn about the options available in each partitioning brick.

--- a/test_unstructured/partition/test_auto.py
+++ b/test_unstructured/partition/test_auto.py
@ -105,13 +105,27 @@ def test_auto_partition_docx_with_file(mock_docx_document, expected_docx_element
    assert elements == expected_docx_elements


-def test_auto_partition_doc_with_filename(mock_docx_document, expected_docx_elements, tmpdir):
+@pytest.mark.parametrize(
+    ("pass_file_filename", "content_type"),
+    [(False, None), (False, "application/msword"), (True, "application/msword"), (True, None)],
+)
+def test_auto_partition_doc_with_filename(
+    mock_docx_document,
+    expected_docx_elements,
+    tmpdir,
+    pass_file_filename,
+    content_type,
+):
    docx_filename = os.path.join(tmpdir.dirname, "mock_document.docx")
    doc_filename = os.path.join(tmpdir.dirname, "mock_document.doc")
    mock_docx_document.save(docx_filename)
    convert_office_doc(docx_filename, tmpdir.dirname, "doc")
-
-    elements = partition(filename=doc_filename)
+    file_filename = doc_filename if pass_file_filename else None
+    elements = partition(
+        filename=doc_filename,
+        file_filename=file_filename,
+        content_type=content_type,
+    )
    assert elements == expected_docx_elements
    assert elements[0].metadata.filename == doc_filename

@ -130,17 +144,27 @@ def test_auto_partition_doc_with_file(mock_docx_document, expected_docx_elements
    assert elements == expected_docx_elements


-def test_auto_partition_html_from_filename():
+@pytest.mark.parametrize(
+    ("pass_file_filename", "content_type"),
+    [(False, None), (False, "text/html"), (True, "text/html"), (True, None)],
+)
+def test_auto_partition_html_from_filename(pass_file_filename, content_type):
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "example-10k.html")
-    elements = partition(filename=filename)
+    file_filename = filename if pass_file_filename else None
+    elements = partition(filename=filename, file_filename=file_filename, content_type=content_type)
    assert len(elements) > 0
    assert elements[0].metadata.filename == filename


-def test_auto_partition_html_from_file():
+@pytest.mark.parametrize(
+    ("pass_file_filename", "content_type"),
+    [(False, None), (False, "text/html"), (True, "text/html"), (True, None)],
+)
+def test_auto_partition_html_from_file(pass_file_filename, content_type):
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-html.html")
+    file_filename = filename if pass_file_filename else None
    with open(filename) as f:
-        elements = partition(file=f)
+        elements = partition(file=f, file_filename=file_filename, content_type=content_type)
    assert len(elements) > 0


@ -177,9 +201,15 @@ def test_auto_partition_text_from_file():
    assert elements == EXPECTED_TEXT_OUTPUT


-def test_auto_partition_pdf_from_filename():
+@pytest.mark.parametrize(
+    ("pass_file_filename", "content_type"),
+    [(False, None), (False, "application/pdf"), (True, "application/pdf"), (True, None)],
+)
+def test_auto_partition_pdf_from_filename(pass_file_filename, content_type):
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.pdf")
-    elements = partition(filename=filename)
+    file_filename = filename if pass_file_filename else None
+
+    elements = partition(filename=filename, file_filename=file_filename, content_type=content_type)

    assert isinstance(elements[0], Title)
    assert elements[0].text.startswith("LayoutParser")
@ -207,10 +237,16 @@ def test_auto_partition_pdf_with_fast_strategy():
    )


-def test_auto_partition_pdf_from_file():
+@pytest.mark.parametrize(
+    ("pass_file_filename", "content_type"),
+    [(False, None), (False, "application/pdf"), (True, "application/pdf"), (True, None)],
+)
+def test_auto_partition_pdf_from_file(pass_file_filename, content_type):
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.pdf")
+    file_filename = filename if pass_file_filename else None
+
    with open(filename, "rb") as f:
-        elements = partition(file=f)
+        elements = partition(file=f, file_filename=file_filename, content_type=content_type)

    assert isinstance(elements[0], Title)
    assert elements[0].text.startswith("LayoutParser")
@ -230,16 +266,26 @@ def test_partition_pdf_doesnt_raise_warning():
        partition(filename=filename)


-def test_auto_partition_jpg():
+@pytest.mark.parametrize(
+    ("pass_file_filename", "content_type"),
+    [(False, None), (False, "image/jpeg"), (True, "image/jpeg"), (True, None)],
+)
+def test_auto_partition_jpg(pass_file_filename, content_type):
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "example.jpg")
-    elements = partition(filename=filename)
+    file_filename = filename if pass_file_filename else None
+    elements = partition(filename=filename, file_filename=file_filename, content_type=content_type)
    assert len(elements) > 0


-def test_auto_partition_jpg_from_file():
+@pytest.mark.parametrize(
+    ("pass_file_filename", "content_type"),
+    [(False, None), (False, "image/jpeg"), (True, "image/jpeg"), (True, None)],
+)
+def test_auto_partition_jpg_from_file(pass_file_filename, content_type):
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "example.jpg")
+    file_filename = filename if pass_file_filename else None
    with open(filename, "rb") as f:
-        elements = partition(file=f)
+        elements = partition(file=f, file_filename=file_filename, content_type=content_type)
    assert len(elements) > 0


--- a/unstructured/version.py
+++ b/unstructured/version.py
@ -1 +1 @@
-__version__ = "0.5.7-dev3"  # pragma: no cover
+__version__ = "0.5.7"  # pragma: no cover
--- a/unstructured/file_utils/filetype.py
+++ b/unstructured/file_utils/filetype.py
@ -111,6 +111,24 @@ class FileType(Enum):
        return self.name < other.name


+STR_TO_FILETYPE = {
+    "application/pdf": FileType.PDF,
+    "application/msword": FileType.DOC,
+    "image/jpeg": FileType.JPG,
+    "image/png": FileType.PNG,
+    "text/markdown": FileType.MD,
+    "text/x-markdown": FileType.MD,
+    "application/epub": FileType.EPUB,
+    "application/epub+zip": FileType.EPUB,
+    "text/html": FileType.HTML,
+    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": FileType.XLSX,
+    "application/vnd.ms-excel": FileType.XLS,
+    "application/vnd.openxmlformats-officedocument.presentationml.presentation": FileType.PPTX,
+    "application/vnd.ms-powerpoint": FileType.PPT,
+    "application/xml": FileType.XML,
+}
+
+
 EXT_TO_FILETYPE = {
    ".pdf": FileType.PDF,
    ".docx": FileType.DOCX,
@ -138,18 +156,26 @@ EXT_TO_FILETYPE = {

 def detect_filetype(
    filename: Optional[str] = None,
+    content_type: Optional[str] = None,
    file: Optional[IO] = None,
+    file_filename: Optional[str] = None,
 ) -> Optional[FileType]:
    """Use libmagic to determine a file's type. Helps determine which partition brick
    to use for a given file. A return value of None indicates a non-supported file type."""
    exactly_one(filename=filename, file=file)

-    if filename:
-        _, extension = os.path.splitext(filename)
+    if content_type:
+        filetype = STR_TO_FILETYPE.get(content_type)
+        if filetype:
+            return filetype
+
+    if filename or file_filename:
+        _, extension = os.path.splitext(filename or file_filename or "")
        extension = extension.lower()
        if LIBMAGIC_AVAILABLE:
-            mime_type = magic.from_file(filename, mime=True)
+            mime_type = magic.from_file(filename or file_filename, mime=True)  # type: ignore
        else:
+            # might not need this
            return EXT_TO_FILETYPE.get(extension.lower(), FileType.UNK)
    elif file is not None:
        extension = None
@ -164,6 +190,8 @@ def detect_filetype(
                "Filetype detection on file-like objects requires libmagic. "
                "Please install libmagic and try again.",
            )
+    else:
+        raise ValueError("No filename, file, nor file_filename were specified.")

    if mime_type == "application/pdf":
        return FileType.PDF
--- a/unstructured/partition/auto.py
+++ b/unstructured/partition/auto.py
@ -17,7 +17,9 @@ from unstructured.partition.text import partition_text

 def partition(
    filename: Optional[str] = None,
+    content_type: Optional[str] = None,
    file: Optional[IO] = None,
+    file_filename: Optional[str] = None,
    include_page_breaks: bool = False,
    strategy: str = "hi_res",
    encoding: str = "utf-8",
@ -31,8 +33,12 @@ def partition(
    ----------
     filename
        A string defining the target filename path.
+    content_type
+        A string defining the content type of the file as a MIME type, e.g. "application/pdf"
    file
        A file-like object using "rb" mode --> open(filename, "rb").
+    file_filename
+        When file is not None, the filename (string) to store in element metadata. E.g. "foo.txt"
    include_page_breaks
        If True, the output will include page breaks if the filetype supports it
    strategy
@ -42,7 +48,12 @@ def partition(
    encoding
        The encoding method used to decode the text input. If None, utf-8 will be used.
    """
-    filetype = detect_filetype(filename=filename, file=file)
+    filetype = detect_filetype(
+        filename=filename,
+        file=file,
+        file_filename=file_filename,
+        content_type=content_type,
+    )

    if file is not None:
        file.seek(0)