mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-06-27 02:30:08 +00:00

This PR aims to improve the organization and readability of our example documents used in unit tests, specifically focusing on PDF and image files.

### Summary

- Created two new subdirectories in the `example-docs` folder:
  - `pdf/`: for all PDF example files
  - `img/`: for all image example files
- Moved relevant PDF files from `example-docs/` to `example-docs/pdf/`
- Moved relevant image files from `example-docs/` to `example-docs/img/`
- Updated file paths in affected unit & ingest tests to reflect the new directory structure

### Testing

All unit & ingest tests should be updated and verified to work with the new file structure.

## Notes

Other file types (e.g., office documents, HTML files) remain in the root of `example-docs/` for now.

## Next Steps

Consider similar reorganization for other file types if this structure proves to be beneficial.

---------

Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: christinestraub <christinestraub@users.noreply.github.com>
131 lines
4.2 KiB
Python
131 lines
4.2 KiB
Python
import pytest
|
|
|
|
from test_unstructured.unit_utils import example_doc_path
|
|
from unstructured.documents.elements import Text
|
|
from unstructured.partition import pdf, strategies
|
|
from unstructured.partition.utils.constants import PartitionStrategy
|
|
|
|
|
|
@pytest.mark.parametrize(
    "strategy",
    [
        PartitionStrategy.AUTO,
        PartitionStrategy.FAST,
        PartitionStrategy.OCR_ONLY,
        PartitionStrategy.HI_RES,
    ],
)
def test_validate_strategy(strategy):
    """`validate_strategy` accepts each listed strategy without raising."""
    # The call itself is the assertion: any exception fails the test.
    strategies.validate_strategy(strategy=strategy)
|
|
|
|
|
|
def test_validate_strategy_raises_for_fast_strategy():
    """`validate_strategy` raises `ValueError` for FAST when `is_image=True`."""
    with pytest.raises(ValueError):
        strategies.validate_strategy(strategy=PartitionStrategy.FAST, is_image=True)
|
|
|
|
|
|
def test_validate_strategy_raises_for_bad_strategy():
    """`validate_strategy` raises `ValueError` for an unrecognized strategy string."""
    with pytest.raises(ValueError):
        strategies.validate_strategy("totally_guess_the_text")
|
|
|
|
|
|
@pytest.mark.parametrize(
    ("filename", "from_file", "expected"),
    [
        ("layout-parser-paper-fast.pdf", True, True),
        ("copy-protected.pdf", True, True),
        ("loremipsum-flat.pdf", True, False),
        ("layout-parser-paper-fast.pdf", False, True),
        ("copy-protected.pdf", False, True),
        ("loremipsum-flat.pdf", False, False),
    ],
)
def test_is_pdf_text_extractable(filename, from_file, expected):
    """Check whether `pdf.extractable_elements` yields any non-blank `Text` element.

    Each example PDF is exercised twice: once passed as an open binary file
    object (`from_file=True`) and once passed by path (`from_file=False`).
    `expected` states whether at least one `Text` element with non-whitespace
    text should be found.
    """
    pdf_path = example_doc_path(f"pdf/{filename}")

    if not from_file:
        pages = pdf.extractable_elements(filename=pdf_path)
    else:
        with open(pdf_path, "rb") as pdf_file:
            pages = pdf.extractable_elements(file=pdf_file)

    # `pages` is iterated as a collection of per-page element collections.
    has_text = any(
        isinstance(element, Text) and element.text.strip()
        for page in pages
        for element in page
    )

    assert has_text is expected
|
|
|
|
|
|
@pytest.mark.parametrize(
    ("pdf_text_extractable", "infer_table_structure"),
    [
        (True, True),
        (False, True),
        (True, False),
        (False, False),
    ],
)
def test_determine_pdf_or_image_fast_strategy(pdf_text_extractable, infer_table_structure):
    """An explicit FAST request comes back as FAST for every flag combination."""
    resolved = strategies.determine_pdf_or_image_strategy(
        strategy=PartitionStrategy.FAST,
        pdf_text_extractable=pdf_text_extractable,
        infer_table_structure=infer_table_structure,
    )

    assert resolved == PartitionStrategy.FAST
|
|
|
|
|
|
@pytest.mark.parametrize(
    (
        "pdf_text_extractable",
        "infer_table_structure",
        "extract_images_in_pdf",
        "extract_image_block_types",
        "expected",
    ),
    [
        (True, True, True, ["Image"], PartitionStrategy.HI_RES),
        (True, True, True, [], PartitionStrategy.HI_RES),
        (True, True, False, ["Image"], PartitionStrategy.HI_RES),
        (True, True, False, [], PartitionStrategy.HI_RES),
        (True, False, True, ["Image"], PartitionStrategy.HI_RES),
        (True, False, True, [], PartitionStrategy.HI_RES),
        (True, False, False, ["Image"], PartitionStrategy.HI_RES),
        (True, False, False, [], PartitionStrategy.FAST),
        (False, True, True, ["Image"], PartitionStrategy.HI_RES),
        (False, True, True, [], PartitionStrategy.HI_RES),
        (False, True, False, ["Image"], PartitionStrategy.HI_RES),
        (False, True, False, [], PartitionStrategy.HI_RES),
        (False, False, True, ["Image"], PartitionStrategy.HI_RES),
        (False, False, True, [], PartitionStrategy.HI_RES),
        (False, False, False, ["Image"], PartitionStrategy.HI_RES),
        (False, False, False, [], PartitionStrategy.OCR_ONLY),
    ],
)
def test_determine_pdf_auto_strategy(
    pdf_text_extractable,
    infer_table_structure,
    extract_images_in_pdf,
    extract_image_block_types,
    expected,
):
    """AUTO on a PDF resolves to the strategy listed in the parameter table.

    Per the table: when table inference and image extraction are both off and
    no image block types are requested, the expected result is FAST if the PDF
    text is extractable and OCR_ONLY if it is not. Every other combination
    expects HI_RES.
    """
    resolved = strategies.determine_pdf_or_image_strategy(
        strategy=PartitionStrategy.AUTO,
        is_image=False,
        pdf_text_extractable=pdf_text_extractable,
        infer_table_structure=infer_table_structure,
        extract_images_in_pdf=extract_images_in_pdf,
        extract_image_block_types=extract_image_block_types,
    )

    assert resolved == expected
|
|
|
|
|
|
def test_determine_image_auto_strategy():
    """AUTO with `is_image=True` resolves to HI_RES."""
    resolved = strategies.determine_pdf_or_image_strategy(
        strategy=PartitionStrategy.AUTO,
        is_image=True,
    )

    assert resolved == PartitionStrategy.HI_RES
|