chore: add support for SpooledTemporaryFiles (#569)

This commit is contained in:
ryannikolaidis 2023-05-09 21:39:07 -07:00 committed by GitHub
parent 19beb24e03
commit b52638f8e3
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
11 changed files with 98 additions and 25 deletions

View File

@ -1,8 +1,8 @@
## 0.6.5-dev0
## 0.6.5
### Enhancements
* PLACEHOLDER - delete this line when there is an actual changelog item to report for 0.6.5
* Added support for SpooledTemporaryFile file argument.
### Features

View File

@ -62,6 +62,21 @@ def test_partition_docx_with_filename(mock_document, expected_elements, tmpdir):
assert elements == expected_elements
def test_partition_docx_with_spooled_file(mock_document, expected_elements, tmpdir):
    """Check that partition_docx accepts a SpooledTemporaryFile as ``file``.

    The mock document is saved to disk, its bytes are copied into a
    SpooledTemporaryFile, and the partitioned output is compared with
    ``expected_elements``.
    """
    from tempfile import SpooledTemporaryFile

    filename = os.path.join(tmpdir.dirname, "mock_document.docx")
    mock_document.save(filename)

    with open(filename, "rb") as test_file:
        contents = test_file.read()

    # Use the spooled file as a context manager so it is always closed,
    # even when partitioning raises.
    with SpooledTemporaryFile() as spooled_temp_file:
        spooled_temp_file.write(contents)
        # Rewind so the consumer sees the document from its first byte.
        spooled_temp_file.seek(0)
        elements = partition_docx(file=spooled_temp_file)

    assert elements == expected_elements
def test_partition_docx_with_file(mock_document, expected_elements, tmpdir):
filename = os.path.join(tmpdir.dirname, "mock_document.docx")
mock_document.save(filename)

View File

@ -1,4 +1,5 @@
import os
from tempfile import SpooledTemporaryFile
from unittest import mock
import pytest
@ -172,6 +173,24 @@ def test_partition_pdf(url, api_called, local_called, monkeypatch):
assert pdf._partition_pdf_or_image_local.called == local_called
@pytest.mark.parametrize(
    "strategy",
    ["fast", "hi_res", "ocr_only"],
)
def test_partition_pdf_with_spooled_file(
    strategy,
    filename="example-docs/layout-parser-paper-fast.pdf",
):
    """Check that partition_pdf accepts a SpooledTemporaryFile for each strategy.

    The example PDF's bytes are copied into a SpooledTemporaryFile which is
    then passed as ``file`` together with the parametrized ``strategy``.
    """
    with open(filename, "rb") as test_file:
        contents = test_file.read()

    # Use the spooled file as a context manager so it is always closed,
    # even when partitioning raises.
    with SpooledTemporaryFile() as spooled_temp_file:
        spooled_temp_file.write(contents)
        # Rewind so the consumer sees the document from its first byte.
        spooled_temp_file.seek(0)
        result = pdf.partition_pdf(file=spooled_temp_file, strategy=strategy)

    # Validate that partitioning produced more than 10 items.
    assert len(result) > 10
@pytest.mark.parametrize(
("url", "api_called", "local_called"),
[("fakeurl", True, False), (None, False, True)],

View File

@ -32,6 +32,19 @@ def test_partition_pptx_from_filename():
assert elements == EXPECTED_PPTX_OUTPUT
def test_partition_pptx_with_spooled_file():
    """Check that partition_pptx accepts a SpooledTemporaryFile as ``file``.

    The example presentation's bytes are copied into a SpooledTemporaryFile
    and the partitioned output is compared with ``EXPECTED_PPTX_OUTPUT``.
    """
    from tempfile import SpooledTemporaryFile

    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point.pptx")

    with open(filename, "rb") as test_file:
        contents = test_file.read()

    # Use the spooled file as a context manager so it is always closed,
    # even when partitioning raises.
    with SpooledTemporaryFile() as spooled_temp_file:
        spooled_temp_file.write(contents)
        # Rewind so the consumer sees the presentation from its first byte.
        spooled_temp_file.seek(0)
        elements = partition_pptx(file=spooled_temp_file)

    assert elements == EXPECTED_PPTX_OUTPUT
def test_partition_pptx_from_file():
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point.pptx")
with open(filename, "rb") as f:

View File

@ -1 +1 @@
__version__ = "0.6.5-dev0" # pragma: no cover
__version__ = "0.6.5" # pragma: no cover

View File

@ -8,7 +8,7 @@ from unstructured.documents.elements import Element
def _partition_via_api(
filename: str = "",
file: Optional[bytes] = None,
file: Optional[Union[BinaryIO, bytes]] = None,
url: str = "https://ml.unstructured.io/layout/pdf",
token: Optional[str] = None,
data: Optional[dict] = None, # NOTE(alan): Remove after different models are handled by routing

View File

@ -1,5 +1,7 @@
import subprocess
from typing import List, Optional, Tuple, Union
from io import BytesIO
from tempfile import SpooledTemporaryFile
from typing import BinaryIO, List, Optional, Tuple, Union
from unstructured.documents.elements import (
TYPE_TO_TEXT_ELEMENT_MAP,
@ -157,3 +159,15 @@ def exactly_one(**kwargs) -> None:
else:
message = f"{names[0]} must be specified."
raise ValueError(message)
def spooled_to_bytes_io_if_needed(
    file_obj: Optional[Union[bytes, BinaryIO, SpooledTemporaryFile]],
) -> Optional[Union[bytes, BinaryIO]]:
    """Return a ``BytesIO`` copy of ``file_obj`` when it is a SpooledTemporaryFile.

    Any other value (raw bytes, another binary stream, or ``None``) is handed
    back untouched. A spooled file is rewound to its start and read in full,
    so its stream position is at the end of the data afterwards.
    """
    # Anything that is not a spooled file passes straight through.
    if not isinstance(file_obj, SpooledTemporaryFile):
        return file_obj

    # Rewind first so the copy holds the whole payload regardless of where
    # the caller left the stream position.
    file_obj.seek(0)
    return BytesIO(file_obj.read())

View File

@ -1,6 +1,7 @@
import os
import tempfile
from typing import IO, List, Optional
from tempfile import SpooledTemporaryFile
from typing import IO, BinaryIO, List, Optional, Union, cast
import docx
import pypandoc
@ -15,7 +16,7 @@ from unstructured.documents.elements import (
Text,
Title,
)
from unstructured.partition.common import exactly_one
from unstructured.partition.common import exactly_one, spooled_to_bytes_io_if_needed
from unstructured.partition.text_type import (
is_bulleted_text,
is_possible_narrative_text,
@ -62,7 +63,7 @@ STYLE_TO_ELEMENT_MAPPING = {
def partition_docx(
filename: Optional[str] = None,
file: Optional[IO] = None,
file: Optional[Union[IO, SpooledTemporaryFile]] = None,
metadata_filename: Optional[str] = None,
) -> List[Element]:
"""Partitions Microsoft Word Documents in .docx format into its document elements.
@ -85,7 +86,9 @@ def partition_docx(
if filename is not None:
document = docx.Document(filename)
elif file is not None:
document = docx.Document(file)
document = docx.Document(
spooled_to_bytes_io_if_needed(cast(Union[BinaryIO, SpooledTemporaryFile], file)),
)
metadata_filename = metadata_filename or filename
elements: List[Element] = []

View File

@ -1,6 +1,7 @@
import re
import warnings
from typing import BinaryIO, List, Optional, cast
from tempfile import SpooledTemporaryFile
from typing import BinaryIO, List, Optional, Union, cast
import pdf2image
import pytesseract
@ -16,6 +17,7 @@ from unstructured.partition.common import (
add_element_metadata,
document_to_element_list,
exactly_one,
spooled_to_bytes_io_if_needed,
)
from unstructured.partition.strategies import determine_pdf_or_image_strategy
from unstructured.partition.text import partition_text
@ -24,7 +26,7 @@ from unstructured.utils import requires_dependencies
def partition_pdf(
filename: str = "",
file: Optional[bytes] = None,
file: Optional[Union[BinaryIO, SpooledTemporaryFile]] = None,
url: Optional[str] = None,
template: str = "layout/pdf",
token: Optional[str] = None,
@ -86,7 +88,7 @@ def partition_pdf(
def partition_pdf_or_image(
filename: str = "",
file: Optional[bytes] = None,
file: Optional[Union[bytes, BinaryIO, SpooledTemporaryFile]] = None,
url: Optional[str] = "https://ml.unstructured.io/",
template: str = "layout/pdf",
token: Optional[str] = None,
@ -122,7 +124,7 @@ def partition_pdf_or_image(
warnings.simplefilter("ignore")
layout_elements = _partition_pdf_or_image_local(
filename=filename,
file=file,
file=spooled_to_bytes_io_if_needed(file),
template=out_template,
is_image=is_image,
infer_table_structure=infer_table_structure,
@ -133,7 +135,7 @@ def partition_pdf_or_image(
elif strategy == "fast":
return _partition_pdf_with_pdfminer(
filename=filename,
file=file,
file=spooled_to_bytes_io_if_needed(file),
include_page_breaks=include_page_breaks,
encoding=encoding,
)
@ -159,7 +161,7 @@ def partition_pdf_or_image(
# NOTE(alan): Remove "data=data" after different models are handled by routing
layout_elements = _partition_via_api(
filename=filename,
file=file,
file=cast(BinaryIO, file),
url=url,
token=token,
data=data,
@ -175,7 +177,7 @@ def partition_pdf_or_image(
def _partition_pdf_or_image_local(
filename: str = "",
file: Optional[bytes] = None,
file: Optional[Union[bytes, BinaryIO]] = None,
template: Optional[str] = None,
is_image: bool = False,
infer_table_structure: bool = False,
@ -226,7 +228,7 @@ def _partition_pdf_or_image_local(
@requires_dependencies("pdfminer", "local-inference")
def _partition_pdf_with_pdfminer(
filename: str = "",
file: Optional[bytes] = None,
file: Optional[BinaryIO] = None,
include_page_breaks: bool = False,
encoding: str = "utf-8",
) -> List[Element]:
@ -300,7 +302,7 @@ def _process_pdfminer_pages(
def _partition_pdf_or_image_with_ocr(
filename: str = "",
file: Optional[bytes] = None,
file: Optional[Union[bytes, BinaryIO, SpooledTemporaryFile]] = None,
include_page_breaks: bool = False,
ocr_languages: str = "eng",
is_image: bool = False,

View File

@ -1,4 +1,5 @@
from typing import IO, List, Optional
from tempfile import SpooledTemporaryFile
from typing import IO, BinaryIO, List, Optional, Union, cast
import pptx
@ -11,7 +12,7 @@ from unstructured.documents.elements import (
Text,
Title,
)
from unstructured.partition.common import exactly_one
from unstructured.partition.common import exactly_one, spooled_to_bytes_io_if_needed
from unstructured.partition.text_type import (
is_possible_narrative_text,
is_possible_title,
@ -22,7 +23,7 @@ OPENXML_SCHEMA_NAME = "{http://schemas.openxmlformats.org/drawingml/2006/main}"
def partition_pptx(
filename: Optional[str] = None,
file: Optional[IO] = None,
file: Optional[Union[IO, SpooledTemporaryFile]] = None,
include_page_breaks: bool = True,
metadata_filename: Optional[str] = None,
) -> List[Element]:
@ -48,7 +49,9 @@ def partition_pptx(
if filename is not None:
presentation = pptx.Presentation(filename)
elif file is not None:
presentation = pptx.Presentation(file)
presentation = pptx.Presentation(
spooled_to_bytes_io_if_needed(cast(Union[BinaryIO, SpooledTemporaryFile], file)),
)
elements: List[Element] = []
metadata_filename = metadata_filename or filename

View File

@ -1,4 +1,5 @@
from typing import BinaryIO, Dict, List, Optional, cast
from tempfile import SpooledTemporaryFile
from typing import BinaryIO, Dict, List, Optional, Union, cast
from pdfminer.pdfpage import PDFPage, PDFTextExtractionNotAllowed
from pdfminer.utils import open_filename
@ -31,7 +32,10 @@ def validate_strategy(strategy: str, filetype: str):
raise ValueError(f"{strategy} is not a valid strategy for filetype {filetype}.")
def is_pdf_text_extractable(filename: str = "", file: Optional[bytes] = None):
def is_pdf_text_extractable(
filename: str = "",
file: Optional[Union[bytes, BinaryIO, SpooledTemporaryFile]] = None,
):
"""Checks to see if the text from a PDF document is extractable. Sometimes the
text is not extractable due to PDF security settings."""
exactly_one(filename=filename, file=file)
@ -56,7 +60,7 @@ def is_pdf_text_extractable(filename: str = "", file: Optional[bytes] = None):
def determine_pdf_or_image_strategy(
strategy: str,
filename: str = "",
file: Optional[bytes] = None,
file: Optional[Union[bytes, BinaryIO, SpooledTemporaryFile]] = None,
is_image: bool = False,
):
"""Determines what strategy to use for processing PDFs or images, accounting for fallback