From b52638f8e31d62680722fdc73083cad8c55780a0 Mon Sep 17 00:00:00 2001 From: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> Date: Tue, 9 May 2023 21:39:07 -0700 Subject: [PATCH] chore: add support for SpooledTemporaryFiles (#569) --- CHANGELOG.md | 4 ++-- test_unstructured/partition/test_docx.py | 15 +++++++++++++++ test_unstructured/partition/test_pdf.py | 19 +++++++++++++++++++ test_unstructured/partition/test_pptx.py | 13 +++++++++++++ unstructured/__version__.py | 2 +- unstructured/partition/__init__.py | 2 +- unstructured/partition/common.py | 16 +++++++++++++++- unstructured/partition/docx.py | 11 +++++++---- unstructured/partition/pdf.py | 20 +++++++++++--------- unstructured/partition/pptx.py | 11 +++++++---- unstructured/partition/strategies.py | 10 +++++++--- 11 files changed, 98 insertions(+), 25 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d2672e193..ce62c2e75 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,8 +1,8 @@ -## 0.6.5-dev0 +## 0.6.5 ### Enhancements -* PLACEHOLDER - delete this line when there is an actual changelog item to report for 0.6.5 +* Added support for SpooledTemporaryFile file argument. ### Features diff --git a/test_unstructured/partition/test_docx.py b/test_unstructured/partition/test_docx.py index 8171db171..e65272ce4 100644 --- a/test_unstructured/partition/test_docx.py +++ b/test_unstructured/partition/test_docx.py @@ -62,6 +62,21 @@ def test_partition_docx_with_filename(mock_document, expected_elements, tmpdir): assert elements == expected_elements +def test_partition_docx_with_spooled_file(mock_document, expected_elements, tmpdir): + # Test that the partition_docx function can handle a SpooledTemporaryFile + filename = os.path.join(tmpdir.dirname, "mock_document.docx") + mock_document.save(filename) + + from tempfile import SpooledTemporaryFile + + with open(filename, "rb") as test_file: + spooled_temp_file = SpooledTemporaryFile() + spooled_temp_file.write(test_file.read()) + spooled_temp_file.seek(0) + elements = partition_docx(file=spooled_temp_file) + assert elements == expected_elements + + def test_partition_docx_with_file(mock_document, expected_elements, tmpdir): filename = os.path.join(tmpdir.dirname, "mock_document.docx") mock_document.save(filename) diff --git a/test_unstructured/partition/test_pdf.py b/test_unstructured/partition/test_pdf.py index 50a059ad5..a950d97af 100644 --- a/test_unstructured/partition/test_pdf.py +++ b/test_unstructured/partition/test_pdf.py @@ -1,4 +1,5 @@ import os +from tempfile import SpooledTemporaryFile from unittest import mock import pytest @@ -172,6 +173,24 @@ def test_partition_pdf(url, api_called, local_called, monkeypatch): assert pdf._partition_pdf_or_image_local.called == local_called +@pytest.mark.parametrize( + ("strategy"), + [("fast"), ("hi_res"), ("ocr_only")], +) +def test_partition_pdf_with_spooled_file( + strategy, + filename="example-docs/layout-parser-paper-fast.pdf", +): + # Test that the partition_pdf function can handle a SpooledTemporaryFile + with open(filename, "rb") as test_file: + spooled_temp_file = SpooledTemporaryFile() + spooled_temp_file.write(test_file.read()) + spooled_temp_file.seek(0) + result = pdf.partition_pdf(file=spooled_temp_file, strategy=strategy) + # validate that the result is a non-empty list of dicts + assert len(result) > 10 + + @pytest.mark.parametrize( ("url", "api_called", "local_called"), [("fakeurl", True, False), (None, False, True)], diff --git a/test_unstructured/partition/test_pptx.py b/test_unstructured/partition/test_pptx.py index c54d54dca..8a868e95d 100644 --- a/test_unstructured/partition/test_pptx.py +++ b/test_unstructured/partition/test_pptx.py @@ -32,6 +32,19 @@ def test_partition_pptx_from_filename(): assert elements == EXPECTED_PPTX_OUTPUT +def test_partition_pptx_with_spooled_file(): + # Test that the partition_pptx function can handle a SpooledTemporaryFile + filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point.pptx") + from tempfile import SpooledTemporaryFile + + with open(filename, "rb") as test_file: + spooled_temp_file = SpooledTemporaryFile() + spooled_temp_file.write(test_file.read()) + spooled_temp_file.seek(0) + elements = partition_pptx(file=spooled_temp_file) + assert elements == EXPECTED_PPTX_OUTPUT + + def test_partition_pptx_from_file(): filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point.pptx") with open(filename, "rb") as f: diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 530da768d..36225a4ca 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.6.5-dev0" # pragma: no cover +__version__ = "0.6.5" # pragma: no cover diff --git a/unstructured/partition/__init__.py b/unstructured/partition/__init__.py index 6d406a49f..b3ed9e2fe 100644 --- a/unstructured/partition/__init__.py +++ b/unstructured/partition/__init__.py @@ -8,7 +8,7 @@ from unstructured.documents.elements import Element def _partition_via_api( filename: str = "", - file: Optional[bytes] = None, + file: Optional[Union[BinaryIO, bytes]] = None, url: str = "https://ml.unstructured.io/layout/pdf", token: Optional[str] = None, data: Optional[dict] = None, # NOTE(alan): Remove after different models are handled by routing diff --git a/unstructured/partition/common.py b/unstructured/partition/common.py index da325edce..0c60bfaee 100644 --- a/unstructured/partition/common.py +++ b/unstructured/partition/common.py @@ -1,5 +1,7 @@ import subprocess -from typing import List, Optional, Tuple, Union +from io import BytesIO +from tempfile import SpooledTemporaryFile +from typing import BinaryIO, List, Optional, Tuple, Union from unstructured.documents.elements import ( TYPE_TO_TEXT_ELEMENT_MAP, @@ -157,3 +159,15 @@ def exactly_one(**kwargs) -> None: else: message = f"{names[0]} must be specified." raise ValueError(message) + + +def spooled_to_bytes_io_if_needed( + file_obj: Optional[Union[bytes, BinaryIO, SpooledTemporaryFile]], +) -> Optional[Union[bytes, BinaryIO]]: + if isinstance(file_obj, SpooledTemporaryFile): + file_obj.seek(0) + contents = file_obj.read() + return BytesIO(contents) + else: + # Return the original file object if it's not a SpooledTemporaryFile + return file_obj diff --git a/unstructured/partition/docx.py b/unstructured/partition/docx.py index 14c71e2e1..42e86d45f 100644 --- a/unstructured/partition/docx.py +++ b/unstructured/partition/docx.py @@ -1,6 +1,7 @@ import os import tempfile -from typing import IO, List, Optional +from tempfile import SpooledTemporaryFile +from typing import IO, BinaryIO, List, Optional, Union, cast import docx import pypandoc @@ -15,7 +16,7 @@ from unstructured.documents.elements import ( Text, Title, ) -from unstructured.partition.common import exactly_one +from unstructured.partition.common import exactly_one, spooled_to_bytes_io_if_needed from unstructured.partition.text_type import ( is_bulleted_text, is_possible_narrative_text, @@ -62,7 +63,7 @@ STYLE_TO_ELEMENT_MAPPING = { def partition_docx( filename: Optional[str] = None, - file: Optional[IO] = None, + file: Optional[Union[IO, SpooledTemporaryFile]] = None, metadata_filename: Optional[str] = None, ) -> List[Element]: """Partitions Microsoft Word Documents in .docx format into its document elements. @@ -85,7 +86,9 @@ def partition_docx( if filename is not None: document = docx.Document(filename) elif file is not None: - document = docx.Document(file) + document = docx.Document( + spooled_to_bytes_io_if_needed(cast(Union[BinaryIO, SpooledTemporaryFile], file)), + ) metadata_filename = metadata_filename or filename elements: List[Element] = [] diff --git a/unstructured/partition/pdf.py b/unstructured/partition/pdf.py index 0b7bb9742..84a3838a8 100644 --- a/unstructured/partition/pdf.py +++ b/unstructured/partition/pdf.py @@ -1,6 +1,7 @@ import re import warnings -from typing import BinaryIO, List, Optional, cast +from tempfile import SpooledTemporaryFile +from typing import BinaryIO, List, Optional, Union, cast import pdf2image import pytesseract @@ -16,6 +17,7 @@ from unstructured.partition.common import ( add_element_metadata, document_to_element_list, exactly_one, + spooled_to_bytes_io_if_needed, ) from unstructured.partition.strategies import determine_pdf_or_image_strategy from unstructured.partition.text import partition_text @@ -24,7 +26,7 @@ from unstructured.utils import requires_dependencies def partition_pdf( filename: str = "", - file: Optional[bytes] = None, + file: Optional[Union[BinaryIO, SpooledTemporaryFile]] = None, url: Optional[str] = None, template: str = "layout/pdf", token: Optional[str] = None, @@ -86,7 +88,7 @@ def partition_pdf( def partition_pdf_or_image( filename: str = "", - file: Optional[bytes] = None, + file: Optional[Union[bytes, BinaryIO, SpooledTemporaryFile]] = None, url: Optional[str] = "https://ml.unstructured.io/", template: str = "layout/pdf", token: Optional[str] = None, @@ -122,7 +124,7 @@ def partition_pdf_or_image( warnings.simplefilter("ignore") layout_elements = _partition_pdf_or_image_local( filename=filename, - file=file, + file=spooled_to_bytes_io_if_needed(file), template=out_template, is_image=is_image, infer_table_structure=infer_table_structure, @@ -133,7 +135,7 @@ def partition_pdf_or_image( elif strategy == "fast": return _partition_pdf_with_pdfminer( filename=filename, - file=file, + file=spooled_to_bytes_io_if_needed(file), include_page_breaks=include_page_breaks, encoding=encoding, ) @@ -159,7 +161,7 @@ def partition_pdf_or_image( # NOTE(alan): Remove "data=data" after different models are handled by routing layout_elements = _partition_via_api( filename=filename, - file=file, + file=cast(BinaryIO, file), url=url, token=token, data=data, @@ -175,7 +177,7 @@ def partition_pdf_or_image( def _partition_pdf_or_image_local( filename: str = "", - file: Optional[bytes] = None, + file: Optional[Union[bytes, BinaryIO]] = None, template: Optional[str] = None, is_image: bool = False, infer_table_structure: bool = False, @@ -226,7 +228,7 @@ def _partition_pdf_or_image_local( @requires_dependencies("pdfminer", "local-inference") def _partition_pdf_with_pdfminer( filename: str = "", - file: Optional[bytes] = None, + file: Optional[BinaryIO] = None, include_page_breaks: bool = False, encoding: str = "utf-8", ) -> List[Element]: @@ -300,7 +302,7 @@ def _process_pdfminer_pages( def _partition_pdf_or_image_with_ocr( filename: str = "", - file: Optional[bytes] = None, + file: Optional[Union[bytes, BinaryIO, SpooledTemporaryFile]] = None, include_page_breaks: bool = False, ocr_languages: str = "eng", is_image: bool = False, diff --git a/unstructured/partition/pptx.py b/unstructured/partition/pptx.py index 0105cc0a5..f8cd2fdfd 100644 --- a/unstructured/partition/pptx.py +++ b/unstructured/partition/pptx.py @@ -1,4 +1,5 @@ -from typing import IO, List, Optional +from tempfile import SpooledTemporaryFile +from typing import IO, BinaryIO, List, Optional, Union, cast import pptx @@ -11,7 +12,7 @@ from unstructured.documents.elements import ( Text, Title, ) -from unstructured.partition.common import exactly_one +from unstructured.partition.common import exactly_one, spooled_to_bytes_io_if_needed from unstructured.partition.text_type import ( is_possible_narrative_text, is_possible_title, @@ -22,7 +23,7 @@ OPENXML_SCHEMA_NAME = "{http://schemas.openxmlformats.org/drawingml/2006/main}" def partition_pptx( filename: Optional[str] = None, - file: Optional[IO] = None, + file: Optional[Union[IO, SpooledTemporaryFile]] = None, include_page_breaks: bool = True, metadata_filename: Optional[str] = None, ) -> List[Element]: @@ -48,7 +49,9 @@ def partition_pptx( if filename is not None: presentation = pptx.Presentation(filename) elif file is not None: - presentation = pptx.Presentation(file) + presentation = pptx.Presentation( + spooled_to_bytes_io_if_needed(cast(Union[BinaryIO, SpooledTemporaryFile], file)), + ) elements: List[Element] = [] metadata_filename = metadata_filename or filename diff --git a/unstructured/partition/strategies.py b/unstructured/partition/strategies.py index a54d8f546..1838f98de 100644 --- a/unstructured/partition/strategies.py +++ b/unstructured/partition/strategies.py @@ -1,4 +1,5 @@ -from typing import BinaryIO, Dict, List, Optional, cast +from tempfile import SpooledTemporaryFile +from typing import BinaryIO, Dict, List, Optional, Union, cast from pdfminer.pdfpage import PDFPage, PDFTextExtractionNotAllowed from pdfminer.utils import open_filename @@ -31,7 +32,10 @@ def validate_strategy(strategy: str, filetype: str): raise ValueError(f"{strategy} is not a valid strategy for filetype {filetype}.") -def is_pdf_text_extractable(filename: str = "", file: Optional[bytes] = None): +def is_pdf_text_extractable( + filename: str = "", + file: Optional[Union[bytes, BinaryIO, SpooledTemporaryFile]] = None, +): """Checks to see if the text from a PDF document is extractable. Sometimes the text is not extractable due to PDF security settings.""" exactly_one(filename=filename, file=file) @@ -56,7 +60,7 @@ def is_pdf_text_extractable(filename: str = "", file: Optional[bytes] = None): def determine_pdf_or_image_strategy( strategy: str, filename: str = "", - file: Optional[bytes] = None, + file: Optional[Union[bytes, BinaryIO, SpooledTemporaryFile]] = None, is_image: bool = False, ): """Determines what strategy to use for processing PDFs or images, accounting for fallback