From b52638f8e31d62680722fdc73083cad8c55780a0 Mon Sep 17 00:00:00 2001
From: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Date: Tue, 9 May 2023 21:39:07 -0700
Subject: [PATCH] chore: add support for SpooledTemporaryFiles (#569)

---
 CHANGELOG.md                             |  4 ++--
 test_unstructured/partition/test_docx.py | 15 +++++++++++++++
 test_unstructured/partition/test_pdf.py  | 19 +++++++++++++++++++
 test_unstructured/partition/test_pptx.py | 13 +++++++++++++
 unstructured/__version__.py              |  2 +-
 unstructured/partition/__init__.py       |  2 +-
 unstructured/partition/common.py         | 16 +++++++++++++++-
 unstructured/partition/docx.py           | 11 +++++++----
 unstructured/partition/pdf.py            | 20 +++++++++++---------
 unstructured/partition/pptx.py           | 11 +++++++----
 unstructured/partition/strategies.py     | 10 +++++++---
 11 files changed, 98 insertions(+), 25 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index d2672e193..ce62c2e75 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,8 +1,8 @@
-## 0.6.5-dev0
+## 0.6.5
 
 ### Enhancements
 
-* PLACEHOLDER - delete this line when there is an actual changelog item to report for 0.6.5
+* Added support for passing a `SpooledTemporaryFile` as the `file` argument.
 
 ### Features
 
diff --git a/test_unstructured/partition/test_docx.py b/test_unstructured/partition/test_docx.py
index 8171db171..e65272ce4 100644
--- a/test_unstructured/partition/test_docx.py
+++ b/test_unstructured/partition/test_docx.py
@@ -62,6 +62,21 @@ def test_partition_docx_with_filename(mock_document, expected_elements, tmpdir):
     assert elements == expected_elements
 
 
+def test_partition_docx_with_spooled_file(mock_document, expected_elements, tmpdir):
+    # Test that the partition_docx function can handle a SpooledTemporaryFile
+    filename = os.path.join(tmpdir.dirname, "mock_document.docx")
+    mock_document.save(filename)
+
+    from tempfile import SpooledTemporaryFile
+
+    with open(filename, "rb") as test_file:
+        spooled_temp_file = SpooledTemporaryFile()
+        spooled_temp_file.write(test_file.read())
+        spooled_temp_file.seek(0)
+        elements = partition_docx(file=spooled_temp_file)
+        assert elements == expected_elements
+
+
 def test_partition_docx_with_file(mock_document, expected_elements, tmpdir):
     filename = os.path.join(tmpdir.dirname, "mock_document.docx")
     mock_document.save(filename)
diff --git a/test_unstructured/partition/test_pdf.py b/test_unstructured/partition/test_pdf.py
index 50a059ad5..a950d97af 100644
--- a/test_unstructured/partition/test_pdf.py
+++ b/test_unstructured/partition/test_pdf.py
@@ -1,4 +1,5 @@
 import os
+from tempfile import SpooledTemporaryFile
 from unittest import mock
 
 import pytest
@@ -172,6 +173,24 @@ def test_partition_pdf(url, api_called, local_called, monkeypatch):
         assert pdf._partition_pdf_or_image_local.called == local_called
 
 
+@pytest.mark.parametrize(
+    ("strategy"),
+    [("fast"), ("hi_res"), ("ocr_only")],
+)
+def test_partition_pdf_with_spooled_file(
+    strategy,
+    filename="example-docs/layout-parser-paper-fast.pdf",
+):
+    # Test that the partition_pdf function can handle a SpooledTemporaryFile
+    with open(filename, "rb") as test_file:
+        spooled_temp_file = SpooledTemporaryFile()
+        spooled_temp_file.write(test_file.read())
+        spooled_temp_file.seek(0)
+        result = pdf.partition_pdf(file=spooled_temp_file, strategy=strategy)
+        # validate that the result contains more than 10 elements
+        assert len(result) > 10
+
+
 @pytest.mark.parametrize(
     ("url", "api_called", "local_called"),
     [("fakeurl", True, False), (None, False, True)],
diff --git a/test_unstructured/partition/test_pptx.py b/test_unstructured/partition/test_pptx.py
index c54d54dca..8a868e95d 100644
--- a/test_unstructured/partition/test_pptx.py
+++ b/test_unstructured/partition/test_pptx.py
@@ -32,6 +32,19 @@ def test_partition_pptx_from_filename():
     assert elements == EXPECTED_PPTX_OUTPUT
 
 
+def test_partition_pptx_with_spooled_file():
+    # Test that the partition_pptx function can handle a SpooledTemporaryFile
+    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point.pptx")
+    from tempfile import SpooledTemporaryFile
+
+    with open(filename, "rb") as test_file:
+        spooled_temp_file = SpooledTemporaryFile()
+        spooled_temp_file.write(test_file.read())
+        spooled_temp_file.seek(0)
+        elements = partition_pptx(file=spooled_temp_file)
+        assert elements == EXPECTED_PPTX_OUTPUT
+
+
 def test_partition_pptx_from_file():
     filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point.pptx")
     with open(filename, "rb") as f:
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
index 530da768d..36225a4ca 100644
--- a/unstructured/__version__.py
+++ b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.6.5-dev0"  # pragma: no cover
+__version__ = "0.6.5"  # pragma: no cover
diff --git a/unstructured/partition/__init__.py b/unstructured/partition/__init__.py
index 6d406a49f..b3ed9e2fe 100644
--- a/unstructured/partition/__init__.py
+++ b/unstructured/partition/__init__.py
@@ -8,7 +8,7 @@ from unstructured.documents.elements import Element
 
 def _partition_via_api(
     filename: str = "",
-    file: Optional[bytes] = None,
+    file: Optional[Union[BinaryIO, bytes]] = None,
     url: str = "https://ml.unstructured.io/layout/pdf",
     token: Optional[str] = None,
     data: Optional[dict] = None,  # NOTE(alan): Remove after different models are handled by routing
diff --git a/unstructured/partition/common.py b/unstructured/partition/common.py
index da325edce..0c60bfaee 100644
--- a/unstructured/partition/common.py
+++ b/unstructured/partition/common.py
@@ -1,5 +1,7 @@
 import subprocess
-from typing import List, Optional, Tuple, Union
+from io import BytesIO
+from tempfile import SpooledTemporaryFile
+from typing import BinaryIO, List, Optional, Tuple, Union
 
 from unstructured.documents.elements import (
     TYPE_TO_TEXT_ELEMENT_MAP,
@@ -157,3 +159,15 @@ def exactly_one(**kwargs) -> None:
         else:
             message = f"{names[0]} must be specified."
         raise ValueError(message)
+
+
+def spooled_to_bytes_io_if_needed(
+    file_obj: Optional[Union[bytes, BinaryIO, SpooledTemporaryFile]],
+) -> Optional[Union[bytes, BinaryIO]]:
+    if isinstance(file_obj, SpooledTemporaryFile):
+        file_obj.seek(0)
+        contents = file_obj.read()
+        return BytesIO(contents)
+    else:
+        # Return the original file object if it's not a SpooledTemporaryFile
+        return file_obj
diff --git a/unstructured/partition/docx.py b/unstructured/partition/docx.py
index 14c71e2e1..42e86d45f 100644
--- a/unstructured/partition/docx.py
+++ b/unstructured/partition/docx.py
@@ -1,6 +1,7 @@
 import os
 import tempfile
-from typing import IO, List, Optional
+from tempfile import SpooledTemporaryFile
+from typing import IO, BinaryIO, List, Optional, Union, cast
 
 import docx
 import pypandoc
@@ -15,7 +16,7 @@ from unstructured.documents.elements import (
     Text,
     Title,
 )
-from unstructured.partition.common import exactly_one
+from unstructured.partition.common import exactly_one, spooled_to_bytes_io_if_needed
 from unstructured.partition.text_type import (
     is_bulleted_text,
     is_possible_narrative_text,
@@ -62,7 +63,7 @@ STYLE_TO_ELEMENT_MAPPING = {
 
 def partition_docx(
     filename: Optional[str] = None,
-    file: Optional[IO] = None,
+    file: Optional[Union[IO, SpooledTemporaryFile]] = None,
     metadata_filename: Optional[str] = None,
 ) -> List[Element]:
     """Partitions Microsoft Word Documents in .docx format into its document elements.
@@ -85,7 +86,9 @@ def partition_docx(
     if filename is not None:
         document = docx.Document(filename)
     elif file is not None:
-        document = docx.Document(file)
+        document = docx.Document(
+            spooled_to_bytes_io_if_needed(cast(Union[BinaryIO, SpooledTemporaryFile], file)),
+        )
 
     metadata_filename = metadata_filename or filename
     elements: List[Element] = []
diff --git a/unstructured/partition/pdf.py b/unstructured/partition/pdf.py
index 0b7bb9742..84a3838a8 100644
--- a/unstructured/partition/pdf.py
+++ b/unstructured/partition/pdf.py
@@ -1,6 +1,7 @@
 import re
 import warnings
-from typing import BinaryIO, List, Optional, cast
+from tempfile import SpooledTemporaryFile
+from typing import BinaryIO, List, Optional, Union, cast
 
 import pdf2image
 import pytesseract
@@ -16,6 +17,7 @@ from unstructured.partition.common import (
     add_element_metadata,
     document_to_element_list,
     exactly_one,
+    spooled_to_bytes_io_if_needed,
 )
 from unstructured.partition.strategies import determine_pdf_or_image_strategy
 from unstructured.partition.text import partition_text
@@ -24,7 +26,7 @@ from unstructured.utils import requires_dependencies
 
 def partition_pdf(
     filename: str = "",
-    file: Optional[bytes] = None,
+    file: Optional[Union[BinaryIO, SpooledTemporaryFile]] = None,
     url: Optional[str] = None,
     template: str = "layout/pdf",
     token: Optional[str] = None,
@@ -86,7 +88,7 @@ def partition_pdf(
 
 def partition_pdf_or_image(
     filename: str = "",
-    file: Optional[bytes] = None,
+    file: Optional[Union[bytes, BinaryIO, SpooledTemporaryFile]] = None,
     url: Optional[str] = "https://ml.unstructured.io/",
     template: str = "layout/pdf",
     token: Optional[str] = None,
@@ -122,7 +124,7 @@ def partition_pdf_or_image(
                 warnings.simplefilter("ignore")
                 layout_elements = _partition_pdf_or_image_local(
                     filename=filename,
-                    file=file,
+                    file=spooled_to_bytes_io_if_needed(file),
                     template=out_template,
                     is_image=is_image,
                     infer_table_structure=infer_table_structure,
@@ -133,7 +135,7 @@ def partition_pdf_or_image(
         elif strategy == "fast":
             return _partition_pdf_with_pdfminer(
                 filename=filename,
-                file=file,
+                file=spooled_to_bytes_io_if_needed(file),
                 include_page_breaks=include_page_breaks,
                 encoding=encoding,
             )
@@ -159,7 +161,7 @@ def partition_pdf_or_image(
         # NOTE(alan): Remove "data=data" after different models are handled by routing
         layout_elements = _partition_via_api(
             filename=filename,
-            file=file,
+            file=cast(BinaryIO, file),
             url=url,
             token=token,
             data=data,
@@ -175,7 +177,7 @@ def partition_pdf_or_image(
 
 def _partition_pdf_or_image_local(
     filename: str = "",
-    file: Optional[bytes] = None,
+    file: Optional[Union[bytes, BinaryIO]] = None,
     template: Optional[str] = None,
     is_image: bool = False,
     infer_table_structure: bool = False,
@@ -226,7 +228,7 @@ def _partition_pdf_or_image_local(
 @requires_dependencies("pdfminer", "local-inference")
 def _partition_pdf_with_pdfminer(
     filename: str = "",
-    file: Optional[bytes] = None,
+    file: Optional[BinaryIO] = None,
     include_page_breaks: bool = False,
     encoding: str = "utf-8",
 ) -> List[Element]:
@@ -300,7 +302,7 @@ def _process_pdfminer_pages(
 
 def _partition_pdf_or_image_with_ocr(
     filename: str = "",
-    file: Optional[bytes] = None,
+    file: Optional[Union[bytes, BinaryIO, SpooledTemporaryFile]] = None,
     include_page_breaks: bool = False,
     ocr_languages: str = "eng",
     is_image: bool = False,
diff --git a/unstructured/partition/pptx.py b/unstructured/partition/pptx.py
index 0105cc0a5..f8cd2fdfd 100644
--- a/unstructured/partition/pptx.py
+++ b/unstructured/partition/pptx.py
@@ -1,4 +1,5 @@
-from typing import IO, List, Optional
+from tempfile import SpooledTemporaryFile
+from typing import IO, BinaryIO, List, Optional, Union, cast
 
 import pptx
 
@@ -11,7 +12,7 @@ from unstructured.documents.elements import (
     Text,
     Title,
 )
-from unstructured.partition.common import exactly_one
+from unstructured.partition.common import exactly_one, spooled_to_bytes_io_if_needed
 from unstructured.partition.text_type import (
     is_possible_narrative_text,
     is_possible_title,
@@ -22,7 +23,7 @@ OPENXML_SCHEMA_NAME = "{http://schemas.openxmlformats.org/drawingml/2006/main}"
 
 def partition_pptx(
     filename: Optional[str] = None,
-    file: Optional[IO] = None,
+    file: Optional[Union[IO, SpooledTemporaryFile]] = None,
     include_page_breaks: bool = True,
     metadata_filename: Optional[str] = None,
 ) -> List[Element]:
@@ -48,7 +49,9 @@ def partition_pptx(
     if filename is not None:
         presentation = pptx.Presentation(filename)
     elif file is not None:
-        presentation = pptx.Presentation(file)
+        presentation = pptx.Presentation(
+            spooled_to_bytes_io_if_needed(cast(Union[BinaryIO, SpooledTemporaryFile], file)),
+        )
 
     elements: List[Element] = []
     metadata_filename = metadata_filename or filename
diff --git a/unstructured/partition/strategies.py b/unstructured/partition/strategies.py
index a54d8f546..1838f98de 100644
--- a/unstructured/partition/strategies.py
+++ b/unstructured/partition/strategies.py
@@ -1,4 +1,5 @@
-from typing import BinaryIO, Dict, List, Optional, cast
+from tempfile import SpooledTemporaryFile
+from typing import BinaryIO, Dict, List, Optional, Union, cast
 
 from pdfminer.pdfpage import PDFPage, PDFTextExtractionNotAllowed
 from pdfminer.utils import open_filename
@@ -31,7 +32,10 @@ def validate_strategy(strategy: str, filetype: str):
         raise ValueError(f"{strategy} is not a valid strategy for filetype {filetype}.")
 
 
-def is_pdf_text_extractable(filename: str = "", file: Optional[bytes] = None):
+def is_pdf_text_extractable(
+    filename: str = "",
+    file: Optional[Union[bytes, BinaryIO, SpooledTemporaryFile]] = None,
+):
     """Checks to see if the text from a PDF document is extractable. Sometimes the
     text is not extractable due to PDF security settings."""
     exactly_one(filename=filename, file=file)
@@ -56,7 +60,7 @@ def is_pdf_text_extractable(filename: str = "", file: Optional[bytes] = None):
 def determine_pdf_or_image_strategy(
     strategy: str,
     filename: str = "",
-    file: Optional[bytes] = None,
+    file: Optional[Union[bytes, BinaryIO, SpooledTemporaryFile]] = None,
     is_image: bool = False,
 ):
     """Determines what strategy to use for processing PDFs or images, accounting for fallback