chore: add support for SpooledTemporaryFiles (#569)

2025-12-04 11:10:22 +00:00 · 2023-05-09 21:39:07 -07:00 · 2023-05-09 21:39:07 -07:00 · b52638f8e3
commit b52638f8e3
parent 19beb24e03
11 changed files with 98 additions and 25 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,8 +1,8 @@
-## 0.6.5-dev0
+## 0.6.5

 ### Enhancements

-* PLACEHOLDER - delete this line when there is an actual changelog item to report for 0.6.5
+* Added support for SpooledTemporaryFile file argument.

 ### Features

--- a/test_unstructured/partition/test_docx.py
+++ b/test_unstructured/partition/test_docx.py
@@ -62,6 +62,21 @@ def test_partition_docx_with_filename(mock_document, expected_elements, tmpdir):
    assert elements == expected_elements


+def test_partition_docx_with_spooled_file(mock_document, expected_elements, tmpdir):
+    # Test that the partition_docx function can handle a SpooledTemporaryFile
+    filename = os.path.join(tmpdir.dirname, "mock_document.docx")
+    mock_document.save(filename)
+
+    from tempfile import SpooledTemporaryFile
+
+    with open(filename, "rb") as test_file:
+        spooled_temp_file = SpooledTemporaryFile()
+        spooled_temp_file.write(test_file.read())
+        spooled_temp_file.seek(0)
+        elements = partition_docx(file=spooled_temp_file)
+        assert elements == expected_elements
+
+
 def test_partition_docx_with_file(mock_document, expected_elements, tmpdir):
    filename = os.path.join(tmpdir.dirname, "mock_document.docx")
    mock_document.save(filename)
--- a/test_unstructured/partition/test_pdf.py
+++ b/test_unstructured/partition/test_pdf.py
@@ -1,4 +1,5 @@
 import os
+from tempfile import SpooledTemporaryFile
 from unittest import mock

 import pytest
@@ -172,6 +173,24 @@ def test_partition_pdf(url, api_called, local_called, monkeypatch):
        assert pdf._partition_pdf_or_image_local.called == local_called


+@pytest.mark.parametrize(
+    ("strategy"),
+    [("fast"), ("hi_res"), ("ocr_only")],
+)
+def test_partition_pdf_with_spooled_file(
+    strategy,
+    filename="example-docs/layout-parser-paper-fast.pdf",
+):
+    # Test that the partition_pdf function can handle a SpooledTemporaryFile
+    with open(filename, "rb") as test_file:
+        spooled_temp_file = SpooledTemporaryFile()
+        spooled_temp_file.write(test_file.read())
+        spooled_temp_file.seek(0)
+        result = pdf.partition_pdf(file=spooled_temp_file, strategy=strategy)
+        # validate that the result is a non-empty list of dicts
+        assert len(result) > 10
+
+
 @pytest.mark.parametrize(
    ("url", "api_called", "local_called"),
    [("fakeurl", True, False), (None, False, True)],
--- a/test_unstructured/partition/test_pptx.py
+++ b/test_unstructured/partition/test_pptx.py
@@ -32,6 +32,19 @@ def test_partition_pptx_from_filename():
    assert elements == EXPECTED_PPTX_OUTPUT


+def test_partition_pptx_with_spooled_file():
+    # Test that the partition_pptx function can handle a SpooledTemporaryFile
+    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point.pptx")
+    from tempfile import SpooledTemporaryFile
+
+    with open(filename, "rb") as test_file:
+        spooled_temp_file = SpooledTemporaryFile()
+        spooled_temp_file.write(test_file.read())
+        spooled_temp_file.seek(0)
+        elements = partition_pptx(file=spooled_temp_file)
+        assert elements == EXPECTED_PPTX_OUTPUT
+
+
 def test_partition_pptx_from_file():
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point.pptx")
    with open(filename, "rb") as f:
--- a/unstructured/version.py
+++ b/unstructured/version.py
@ -1 +1 @@
-__version__ = "0.6.5-dev0"  # pragma: no cover
+__version__ = "0.6.5"  # pragma: no cover
--- a/unstructured/partition/__init__.py
+++ b/unstructured/partition/__init__.py
@@ -8,7 +8,7 @@ from unstructured.documents.elements import Element

 def _partition_via_api(
    filename: str = "",
-    file: Optional[bytes] = None,
+    file: Optional[Union[BinaryIO, bytes]] = None,
    url: str = "https://ml.unstructured.io/layout/pdf",
    token: Optional[str] = None,
    data: Optional[dict] = None,  # NOTE(alan): Remove after different models are handled by routing
--- a/unstructured/partition/common.py
+++ b/unstructured/partition/common.py
@@ -1,5 +1,7 @@
 import subprocess
-from typing import List, Optional, Tuple, Union
+from io import BytesIO
+from tempfile import SpooledTemporaryFile
+from typing import BinaryIO, List, Optional, Tuple, Union

 from unstructured.documents.elements import (
    TYPE_TO_TEXT_ELEMENT_MAP,
@@ -157,3 +159,15 @@ def exactly_one(**kwargs) -> None:
        else:
            message = f"{names[0]} must be specified."
        raise ValueError(message)
+
+
+def spooled_to_bytes_io_if_needed(
+    file_obj: Optional[Union[bytes, BinaryIO, SpooledTemporaryFile]],
+) -> Optional[Union[bytes, BinaryIO]]:
+    if isinstance(file_obj, SpooledTemporaryFile):
+        file_obj.seek(0)
+        contents = file_obj.read()
+        return BytesIO(contents)
+    else:
+        # Return the original file object if it's not a SpooledTemporaryFile
+        return file_obj
--- a/unstructured/partition/docx.py
+++ b/unstructured/partition/docx.py
@@ -1,6 +1,7 @@
 import os
 import tempfile
-from typing import IO, List, Optional
+from tempfile import SpooledTemporaryFile
+from typing import IO, BinaryIO, List, Optional, Union, cast

 import docx
 import pypandoc
@@ -15,7 +16,7 @@ from unstructured.documents.elements import (
    Text,
    Title,
 )
-from unstructured.partition.common import exactly_one
+from unstructured.partition.common import exactly_one, spooled_to_bytes_io_if_needed
 from unstructured.partition.text_type import (
    is_bulleted_text,
    is_possible_narrative_text,
@@ -62,7 +63,7 @@ STYLE_TO_ELEMENT_MAPPING = {

 def partition_docx(
    filename: Optional[str] = None,
-    file: Optional[IO] = None,
+    file: Optional[Union[IO, SpooledTemporaryFile]] = None,
    metadata_filename: Optional[str] = None,
 ) -> List[Element]:
    """Partitions Microsoft Word Documents in .docx format into its document elements.
@@ -85,7 +86,9 @@ def partition_docx(
    if filename is not None:
        document = docx.Document(filename)
    elif file is not None:
-        document = docx.Document(file)
+        document = docx.Document(
+            spooled_to_bytes_io_if_needed(cast(Union[BinaryIO, SpooledTemporaryFile], file)),
+        )

    metadata_filename = metadata_filename or filename
    elements: List[Element] = []
--- a/unstructured/partition/pdf.py
+++ b/unstructured/partition/pdf.py
@@ -1,6 +1,7 @@
 import re
 import warnings
-from typing import BinaryIO, List, Optional, cast
+from tempfile import SpooledTemporaryFile
+from typing import BinaryIO, List, Optional, Union, cast

 import pdf2image
 import pytesseract
@@ -16,6 +17,7 @@ from unstructured.partition.common import (
    add_element_metadata,
    document_to_element_list,
    exactly_one,
+    spooled_to_bytes_io_if_needed,
 )
 from unstructured.partition.strategies import determine_pdf_or_image_strategy
 from unstructured.partition.text import partition_text
@@ -24,7 +26,7 @@ from unstructured.utils import requires_dependencies

 def partition_pdf(
    filename: str = "",
-    file: Optional[bytes] = None,
+    file: Optional[Union[BinaryIO, SpooledTemporaryFile]] = None,
    url: Optional[str] = None,
    template: str = "layout/pdf",
    token: Optional[str] = None,
@@ -86,7 +88,7 @@ def partition_pdf(

 def partition_pdf_or_image(
    filename: str = "",
-    file: Optional[bytes] = None,
+    file: Optional[Union[bytes, BinaryIO, SpooledTemporaryFile]] = None,
    url: Optional[str] = "https://ml.unstructured.io/",
    template: str = "layout/pdf",
    token: Optional[str] = None,
@@ -122,7 +124,7 @@ def partition_pdf_or_image(
                warnings.simplefilter("ignore")
                layout_elements = _partition_pdf_or_image_local(
                    filename=filename,
-                    file=file,
+                    file=spooled_to_bytes_io_if_needed(file),
                    template=out_template,
                    is_image=is_image,
                    infer_table_structure=infer_table_structure,
@@ -133,7 +135,7 @@ def partition_pdf_or_image(
        elif strategy == "fast":
            return _partition_pdf_with_pdfminer(
                filename=filename,
-                file=file,
+                file=spooled_to_bytes_io_if_needed(file),
                include_page_breaks=include_page_breaks,
                encoding=encoding,
            )
@@ -159,7 +161,7 @@ def partition_pdf_or_image(
        # NOTE(alan): Remove "data=data" after different models are handled by routing
        layout_elements = _partition_via_api(
            filename=filename,
-            file=file,
+            file=cast(BinaryIO, file),
            url=url,
            token=token,
            data=data,
@@ -175,7 +177,7 @@ def partition_pdf_or_image(

 def _partition_pdf_or_image_local(
    filename: str = "",
-    file: Optional[bytes] = None,
+    file: Optional[Union[bytes, BinaryIO]] = None,
    template: Optional[str] = None,
    is_image: bool = False,
    infer_table_structure: bool = False,
@@ -226,7 +228,7 @@ def _partition_pdf_or_image_local(
 @requires_dependencies("pdfminer", "local-inference")
 def _partition_pdf_with_pdfminer(
    filename: str = "",
-    file: Optional[bytes] = None,
+    file: Optional[BinaryIO] = None,
    include_page_breaks: bool = False,
    encoding: str = "utf-8",
 ) -> List[Element]:
@@ -300,7 +302,7 @@ def _process_pdfminer_pages(

 def _partition_pdf_or_image_with_ocr(
    filename: str = "",
-    file: Optional[bytes] = None,
+    file: Optional[Union[bytes, BinaryIO, SpooledTemporaryFile]] = None,
    include_page_breaks: bool = False,
    ocr_languages: str = "eng",
    is_image: bool = False,
--- a/unstructured/partition/pptx.py
+++ b/unstructured/partition/pptx.py
@@ -1,4 +1,5 @@
-from typing import IO, List, Optional
+from tempfile import SpooledTemporaryFile
+from typing import IO, BinaryIO, List, Optional, Union, cast

 import pptx

@@ -11,7 +12,7 @@ from unstructured.documents.elements import (
    Text,
    Title,
 )
-from unstructured.partition.common import exactly_one
+from unstructured.partition.common import exactly_one, spooled_to_bytes_io_if_needed
 from unstructured.partition.text_type import (
    is_possible_narrative_text,
    is_possible_title,
@@ -22,7 +23,7 @@ OPENXML_SCHEMA_NAME = "{http://schemas.openxmlformats.org/drawingml/2006/main}"

 def partition_pptx(
    filename: Optional[str] = None,
-    file: Optional[IO] = None,
+    file: Optional[Union[IO, SpooledTemporaryFile]] = None,
    include_page_breaks: bool = True,
    metadata_filename: Optional[str] = None,
 ) -> List[Element]:
@@ -48,7 +49,9 @@ def partition_pptx(
    if filename is not None:
        presentation = pptx.Presentation(filename)
    elif file is not None:
-        presentation = pptx.Presentation(file)
+        presentation = pptx.Presentation(
+            spooled_to_bytes_io_if_needed(cast(Union[BinaryIO, SpooledTemporaryFile], file)),
+        )

    elements: List[Element] = []
    metadata_filename = metadata_filename or filename
--- a/unstructured/partition/strategies.py
+++ b/unstructured/partition/strategies.py
@@ -1,4 +1,5 @@
-from typing import BinaryIO, Dict, List, Optional, cast
+from tempfile import SpooledTemporaryFile
+from typing import BinaryIO, Dict, List, Optional, Union, cast

 from pdfminer.pdfpage import PDFPage, PDFTextExtractionNotAllowed
 from pdfminer.utils import open_filename
@@ -31,7 +32,10 @@ def validate_strategy(strategy: str, filetype: str):
        raise ValueError(f"{strategy} is not a valid strategy for filetype {filetype}.")


-def is_pdf_text_extractable(filename: str = "", file: Optional[bytes] = None):
+def is_pdf_text_extractable(
+    filename: str = "",
+    file: Optional[Union[bytes, BinaryIO, SpooledTemporaryFile]] = None,
+):
    """Checks to see if the text from a PDF document is extractable. Sometimes the
    text is not extractable due to PDF security settings."""
    exactly_one(filename=filename, file=file)
@@ -56,7 +60,7 @@ def is_pdf_text_extractable(filename: str = "", file: Optional[bytes] = None):
 def determine_pdf_or_image_strategy(
    strategy: str,
    filename: str = "",
-    file: Optional[bytes] = None,
+    file: Optional[Union[bytes, BinaryIO, SpooledTemporaryFile]] = None,
    is_image: bool = False,
 ):
    """Determines what strategy to use for processing PDFs or images, accounting for fallback