mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-12-04 11:10:22 +00:00
chore: add support for SpooledTemporaryFiles (#569)
This commit is contained in:
parent
19beb24e03
commit
b52638f8e3
@ -1,8 +1,8 @@
|
||||
## 0.6.5-dev0
|
||||
## 0.6.5
|
||||
|
||||
### Enhancements
|
||||
|
||||
* PLACEHOLDER - delete this line when there is an actual changelog item to report for 0.6.5
|
||||
* Added support for passing a `SpooledTemporaryFile` as the `file` argument.
|
||||
|
||||
### Features
|
||||
|
||||
|
||||
@ -62,6 +62,21 @@ def test_partition_docx_with_filename(mock_document, expected_elements, tmpdir):
|
||||
assert elements == expected_elements
|
||||
|
||||
|
||||
def test_partition_docx_with_spooled_file(mock_document, expected_elements, tmpdir):
    """Check that partition_docx accepts a SpooledTemporaryFile as ``file``.

    The mock document is saved to disk, copied into a SpooledTemporaryFile,
    and the partitioned result is compared against ``expected_elements``.
    """
    # Kept function-local: the module-level imports of this test file are not
    # changed by this test.
    from tempfile import SpooledTemporaryFile

    filename = os.path.join(tmpdir.dirname, "mock_document.docx")
    mock_document.save(filename)

    # Both handles are context-managed so the spooled buffer is closed even
    # when partitioning or the assertion fails.
    with open(filename, "rb") as test_file, SpooledTemporaryFile() as spooled_temp_file:
        spooled_temp_file.write(test_file.read())
        spooled_temp_file.seek(0)
        elements = partition_docx(file=spooled_temp_file)
        assert elements == expected_elements
|
||||
|
||||
|
||||
def test_partition_docx_with_file(mock_document, expected_elements, tmpdir):
|
||||
filename = os.path.join(tmpdir.dirname, "mock_document.docx")
|
||||
mock_document.save(filename)
|
||||
|
||||
@ -1,4 +1,5 @@
|
||||
import os
|
||||
from tempfile import SpooledTemporaryFile
|
||||
from unittest import mock
|
||||
|
||||
import pytest
|
||||
@ -172,6 +173,24 @@ def test_partition_pdf(url, api_called, local_called, monkeypatch):
|
||||
assert pdf._partition_pdf_or_image_local.called == local_called
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "strategy",
    ["fast", "hi_res", "ocr_only"],
)
def test_partition_pdf_with_spooled_file(
    strategy,
    filename="example-docs/layout-parser-paper-fast.pdf",
):
    """Check that partition_pdf accepts a SpooledTemporaryFile for each strategy.

    The example PDF is copied into a SpooledTemporaryFile and partitioned with
    the parametrized ``strategy``; the test only checks that more than ten
    items come back, not their exact content.
    """
    # Both handles are context-managed so the spooled buffer is closed even
    # when partitioning or the assertion fails.
    with open(filename, "rb") as test_file, SpooledTemporaryFile() as spooled_temp_file:
        spooled_temp_file.write(test_file.read())
        spooled_temp_file.seek(0)
        result = pdf.partition_pdf(file=spooled_temp_file, strategy=strategy)
        # Sanity check: the document should yield more than ten partitioned items.
        assert len(result) > 10
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
("url", "api_called", "local_called"),
|
||||
[("fakeurl", True, False), (None, False, True)],
|
||||
|
||||
@ -32,6 +32,19 @@ def test_partition_pptx_from_filename():
|
||||
assert elements == EXPECTED_PPTX_OUTPUT
|
||||
|
||||
|
||||
def test_partition_pptx_with_spooled_file():
    """Check that partition_pptx accepts a SpooledTemporaryFile as ``file``.

    The example presentation is copied into a SpooledTemporaryFile and the
    partitioned result is compared against ``EXPECTED_PPTX_OUTPUT``.
    """
    # Kept function-local: the module-level imports of this test file are not
    # changed by this test.
    from tempfile import SpooledTemporaryFile

    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point.pptx")

    # Both handles are context-managed so the spooled buffer is closed even
    # when partitioning or the assertion fails.
    with open(filename, "rb") as test_file, SpooledTemporaryFile() as spooled_temp_file:
        spooled_temp_file.write(test_file.read())
        spooled_temp_file.seek(0)
        elements = partition_pptx(file=spooled_temp_file)
        assert elements == EXPECTED_PPTX_OUTPUT
|
||||
|
||||
|
||||
def test_partition_pptx_from_file():
|
||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point.pptx")
|
||||
with open(filename, "rb") as f:
|
||||
|
||||
@ -1 +1 @@
|
||||
__version__ = "0.6.5-dev0" # pragma: no cover
|
||||
__version__ = "0.6.5" # pragma: no cover
|
||||
|
||||
@ -8,7 +8,7 @@ from unstructured.documents.elements import Element
|
||||
|
||||
def _partition_via_api(
|
||||
filename: str = "",
|
||||
file: Optional[bytes] = None,
|
||||
file: Optional[Union[BinaryIO, bytes]] = None,
|
||||
url: str = "https://ml.unstructured.io/layout/pdf",
|
||||
token: Optional[str] = None,
|
||||
data: Optional[dict] = None, # NOTE(alan): Remove after different models are handled by routing
|
||||
|
||||
@ -1,5 +1,7 @@
|
||||
import subprocess
|
||||
from typing import List, Optional, Tuple, Union
|
||||
from io import BytesIO
|
||||
from tempfile import SpooledTemporaryFile
|
||||
from typing import BinaryIO, List, Optional, Tuple, Union
|
||||
|
||||
from unstructured.documents.elements import (
|
||||
TYPE_TO_TEXT_ELEMENT_MAP,
|
||||
@ -157,3 +159,15 @@ def exactly_one(**kwargs) -> None:
|
||||
else:
|
||||
message = f"{names[0]} must be specified."
|
||||
raise ValueError(message)
|
||||
|
||||
|
||||
def spooled_to_bytes_io_if_needed(
    file_obj: Optional[Union[bytes, BinaryIO, SpooledTemporaryFile]],
) -> Optional[Union[bytes, BinaryIO]]:
    """Copy a SpooledTemporaryFile into a BytesIO; pass any other input through.

    Args:
        file_obj: A SpooledTemporaryFile, another binary file-like object,
            raw bytes, or None.

    Returns:
        A new BytesIO holding the full contents of ``file_obj`` when it is a
        SpooledTemporaryFile; otherwise ``file_obj`` itself, unchanged.

    Note:
        The spooled file is rewound before reading, so the copy always starts
        at the beginning of the data; the spooled file itself is left
        positioned at its end.
    """
    # Anything that is not a SpooledTemporaryFile is handed back untouched.
    if not isinstance(file_obj, SpooledTemporaryFile):
        return file_obj

    # Rewind first so the copy contains everything, regardless of where the
    # caller left the stream position.
    file_obj.seek(0)
    return BytesIO(file_obj.read())
|
||||
|
||||
@ -1,6 +1,7 @@
|
||||
import os
|
||||
import tempfile
|
||||
from typing import IO, List, Optional
|
||||
from tempfile import SpooledTemporaryFile
|
||||
from typing import IO, BinaryIO, List, Optional, Union, cast
|
||||
|
||||
import docx
|
||||
import pypandoc
|
||||
@ -15,7 +16,7 @@ from unstructured.documents.elements import (
|
||||
Text,
|
||||
Title,
|
||||
)
|
||||
from unstructured.partition.common import exactly_one
|
||||
from unstructured.partition.common import exactly_one, spooled_to_bytes_io_if_needed
|
||||
from unstructured.partition.text_type import (
|
||||
is_bulleted_text,
|
||||
is_possible_narrative_text,
|
||||
@ -62,7 +63,7 @@ STYLE_TO_ELEMENT_MAPPING = {
|
||||
|
||||
def partition_docx(
|
||||
filename: Optional[str] = None,
|
||||
file: Optional[IO] = None,
|
||||
file: Optional[Union[IO, SpooledTemporaryFile]] = None,
|
||||
metadata_filename: Optional[str] = None,
|
||||
) -> List[Element]:
|
||||
"""Partitions Microsoft Word Documents in .docx format into its document elements.
|
||||
@ -85,7 +86,9 @@ def partition_docx(
|
||||
if filename is not None:
|
||||
document = docx.Document(filename)
|
||||
elif file is not None:
|
||||
document = docx.Document(file)
|
||||
document = docx.Document(
|
||||
spooled_to_bytes_io_if_needed(cast(Union[BinaryIO, SpooledTemporaryFile], file)),
|
||||
)
|
||||
|
||||
metadata_filename = metadata_filename or filename
|
||||
elements: List[Element] = []
|
||||
|
||||
@ -1,6 +1,7 @@
|
||||
import re
|
||||
import warnings
|
||||
from typing import BinaryIO, List, Optional, cast
|
||||
from tempfile import SpooledTemporaryFile
|
||||
from typing import BinaryIO, List, Optional, Union, cast
|
||||
|
||||
import pdf2image
|
||||
import pytesseract
|
||||
@ -16,6 +17,7 @@ from unstructured.partition.common import (
|
||||
add_element_metadata,
|
||||
document_to_element_list,
|
||||
exactly_one,
|
||||
spooled_to_bytes_io_if_needed,
|
||||
)
|
||||
from unstructured.partition.strategies import determine_pdf_or_image_strategy
|
||||
from unstructured.partition.text import partition_text
|
||||
@ -24,7 +26,7 @@ from unstructured.utils import requires_dependencies
|
||||
|
||||
def partition_pdf(
|
||||
filename: str = "",
|
||||
file: Optional[bytes] = None,
|
||||
file: Optional[Union[BinaryIO, SpooledTemporaryFile]] = None,
|
||||
url: Optional[str] = None,
|
||||
template: str = "layout/pdf",
|
||||
token: Optional[str] = None,
|
||||
@ -86,7 +88,7 @@ def partition_pdf(
|
||||
|
||||
def partition_pdf_or_image(
|
||||
filename: str = "",
|
||||
file: Optional[bytes] = None,
|
||||
file: Optional[Union[bytes, BinaryIO, SpooledTemporaryFile]] = None,
|
||||
url: Optional[str] = "https://ml.unstructured.io/",
|
||||
template: str = "layout/pdf",
|
||||
token: Optional[str] = None,
|
||||
@ -122,7 +124,7 @@ def partition_pdf_or_image(
|
||||
warnings.simplefilter("ignore")
|
||||
layout_elements = _partition_pdf_or_image_local(
|
||||
filename=filename,
|
||||
file=file,
|
||||
file=spooled_to_bytes_io_if_needed(file),
|
||||
template=out_template,
|
||||
is_image=is_image,
|
||||
infer_table_structure=infer_table_structure,
|
||||
@ -133,7 +135,7 @@ def partition_pdf_or_image(
|
||||
elif strategy == "fast":
|
||||
return _partition_pdf_with_pdfminer(
|
||||
filename=filename,
|
||||
file=file,
|
||||
file=spooled_to_bytes_io_if_needed(file),
|
||||
include_page_breaks=include_page_breaks,
|
||||
encoding=encoding,
|
||||
)
|
||||
@ -159,7 +161,7 @@ def partition_pdf_or_image(
|
||||
# NOTE(alan): Remove "data=data" after different models are handled by routing
|
||||
layout_elements = _partition_via_api(
|
||||
filename=filename,
|
||||
file=file,
|
||||
file=cast(BinaryIO, file),
|
||||
url=url,
|
||||
token=token,
|
||||
data=data,
|
||||
@ -175,7 +177,7 @@ def partition_pdf_or_image(
|
||||
|
||||
def _partition_pdf_or_image_local(
|
||||
filename: str = "",
|
||||
file: Optional[bytes] = None,
|
||||
file: Optional[Union[bytes, BinaryIO]] = None,
|
||||
template: Optional[str] = None,
|
||||
is_image: bool = False,
|
||||
infer_table_structure: bool = False,
|
||||
@ -226,7 +228,7 @@ def _partition_pdf_or_image_local(
|
||||
@requires_dependencies("pdfminer", "local-inference")
|
||||
def _partition_pdf_with_pdfminer(
|
||||
filename: str = "",
|
||||
file: Optional[bytes] = None,
|
||||
file: Optional[BinaryIO] = None,
|
||||
include_page_breaks: bool = False,
|
||||
encoding: str = "utf-8",
|
||||
) -> List[Element]:
|
||||
@ -300,7 +302,7 @@ def _process_pdfminer_pages(
|
||||
|
||||
def _partition_pdf_or_image_with_ocr(
|
||||
filename: str = "",
|
||||
file: Optional[bytes] = None,
|
||||
file: Optional[Union[bytes, BinaryIO, SpooledTemporaryFile]] = None,
|
||||
include_page_breaks: bool = False,
|
||||
ocr_languages: str = "eng",
|
||||
is_image: bool = False,
|
||||
|
||||
@ -1,4 +1,5 @@
|
||||
from typing import IO, List, Optional
|
||||
from tempfile import SpooledTemporaryFile
|
||||
from typing import IO, BinaryIO, List, Optional, Union, cast
|
||||
|
||||
import pptx
|
||||
|
||||
@ -11,7 +12,7 @@ from unstructured.documents.elements import (
|
||||
Text,
|
||||
Title,
|
||||
)
|
||||
from unstructured.partition.common import exactly_one
|
||||
from unstructured.partition.common import exactly_one, spooled_to_bytes_io_if_needed
|
||||
from unstructured.partition.text_type import (
|
||||
is_possible_narrative_text,
|
||||
is_possible_title,
|
||||
@ -22,7 +23,7 @@ OPENXML_SCHEMA_NAME = "{http://schemas.openxmlformats.org/drawingml/2006/main}"
|
||||
|
||||
def partition_pptx(
|
||||
filename: Optional[str] = None,
|
||||
file: Optional[IO] = None,
|
||||
file: Optional[Union[IO, SpooledTemporaryFile]] = None,
|
||||
include_page_breaks: bool = True,
|
||||
metadata_filename: Optional[str] = None,
|
||||
) -> List[Element]:
|
||||
@ -48,7 +49,9 @@ def partition_pptx(
|
||||
if filename is not None:
|
||||
presentation = pptx.Presentation(filename)
|
||||
elif file is not None:
|
||||
presentation = pptx.Presentation(file)
|
||||
presentation = pptx.Presentation(
|
||||
spooled_to_bytes_io_if_needed(cast(Union[BinaryIO, SpooledTemporaryFile], file)),
|
||||
)
|
||||
|
||||
elements: List[Element] = []
|
||||
metadata_filename = metadata_filename or filename
|
||||
|
||||
@ -1,4 +1,5 @@
|
||||
from typing import BinaryIO, Dict, List, Optional, cast
|
||||
from tempfile import SpooledTemporaryFile
|
||||
from typing import BinaryIO, Dict, List, Optional, Union, cast
|
||||
|
||||
from pdfminer.pdfpage import PDFPage, PDFTextExtractionNotAllowed
|
||||
from pdfminer.utils import open_filename
|
||||
@ -31,7 +32,10 @@ def validate_strategy(strategy: str, filetype: str):
|
||||
raise ValueError(f"{strategy} is not a valid strategy for filetype {filetype}.")
|
||||
|
||||
|
||||
def is_pdf_text_extractable(filename: str = "", file: Optional[bytes] = None):
|
||||
def is_pdf_text_extractable(
|
||||
filename: str = "",
|
||||
file: Optional[Union[bytes, BinaryIO, SpooledTemporaryFile]] = None,
|
||||
):
|
||||
"""Checks to see if the text from a PDF document is extractable. Sometimes the
|
||||
text is not extractable due to PDF security settings."""
|
||||
exactly_one(filename=filename, file=file)
|
||||
@ -56,7 +60,7 @@ def is_pdf_text_extractable(filename: str = "", file: Optional[bytes] = None):
|
||||
def determine_pdf_or_image_strategy(
|
||||
strategy: str,
|
||||
filename: str = "",
|
||||
file: Optional[bytes] = None,
|
||||
file: Optional[Union[bytes, BinaryIO, SpooledTemporaryFile]] = None,
|
||||
is_image: bool = False,
|
||||
):
|
||||
"""Determines what strategy to use for processing PDFs or images, accounting for fallback
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user