# unstructured/test_unstructured/partition/test_strategies.py

import pytest

from test_unstructured.unit_utils import example_doc_path
from unstructured.documents.elements import Text
from unstructured.partition import pdf, strategies
from unstructured.partition.utils.constants import PartitionStrategy


@pytest.mark.parametrize(
    "strategy",
    (
        PartitionStrategy.AUTO,
        PartitionStrategy.FAST,
        PartitionStrategy.OCR_ONLY,
        PartitionStrategy.HI_RES,
    ),
)
def test_validate_strategy(strategy):
    """Each of the listed strategies passes validation with default arguments."""
    # Returning without an exception is the entire assertion here.
    strategies.validate_strategy(strategy=strategy)


def test_validate_strategy_raises_for_fast_strategy():
    """Validation raises ValueError for the FAST strategy when `is_image` is set."""
    with pytest.raises(ValueError):
        strategies.validate_strategy(is_image=True, strategy=PartitionStrategy.FAST)


def test_validate_strategy_raises_for_bad_strategy():
    """Validation raises ValueError for a strategy name that is not recognized."""
    unknown_strategy = "totally_guess_the_text"
    with pytest.raises(ValueError):
        strategies.validate_strategy(unknown_strategy)


@pytest.mark.parametrize(
    "filename,from_file,expected",
    [
        ("layout-parser-paper-fast.pdf", True, True),
        ("copy-protected.pdf", True, True),
        ("loremipsum-flat.pdf", True, False),
        ("layout-parser-paper-fast.pdf", False, True),
        ("copy-protected.pdf", False, True),
        ("loremipsum-flat.pdf", False, False),
    ],
)
def test_is_pdf_text_extractable(filename, from_file, expected):
    """Check whether any non-blank `Text` element is extracted from each example PDF.

    Every document is exercised twice: once passed as an open binary file object
    (`from_file=True`) and once passed by path (`from_file=False`).
    """
    pdf_path = example_doc_path(f"pdf/{filename}")

    if not from_file:
        pages = pdf.extractable_elements(filename=pdf_path)
    else:
        with open(pdf_path, "rb") as pdf_file:
            pages = pdf.extractable_elements(file=pdf_file)

    # Stop at the first element carrying non-whitespace text; one is enough to
    # consider the document text-extractable.
    found_text = False
    for page_elements in pages:
        if any(isinstance(el, Text) and el.text.strip() for el in page_elements):
            found_text = True
            break

    assert found_text is expected


@pytest.mark.parametrize(
    "pdf_text_extractable,infer_table_structure",
    [
        (True, True),
        (False, True),
        (True, False),
        (False, False),
    ],
)
def test_determine_pdf_or_image_fast_strategy(pdf_text_extractable, infer_table_structure):
    """An explicit FAST request stays FAST for every combination of the two flags."""
    flags = {
        "pdf_text_extractable": pdf_text_extractable,
        "infer_table_structure": infer_table_structure,
    }
    resolved = strategies.determine_pdf_or_image_strategy(
        strategy=PartitionStrategy.FAST, **flags
    )
    assert resolved == PartitionStrategy.FAST


@pytest.mark.parametrize(
    (
        "pdf_text_extractable",
        "infer_table_structure",
        "extract_images_in_pdf",
        "extract_image_block_types",
        "expected",
    ),
    [
        # -- text is extractable --
        (True, True, True, ["Image"], PartitionStrategy.HI_RES),
        (True, True, True, [], PartitionStrategy.HI_RES),
        (True, True, False, ["Image"], PartitionStrategy.HI_RES),
        (True, True, False, [], PartitionStrategy.HI_RES),
        (True, False, True, ["Image"], PartitionStrategy.HI_RES),
        (True, False, True, [], PartitionStrategy.HI_RES),
        (True, False, False, ["Image"], PartitionStrategy.HI_RES),
        (True, False, False, [], PartitionStrategy.FAST),
        # -- text is not extractable --
        (False, True, True, ["Image"], PartitionStrategy.HI_RES),
        (False, True, True, [], PartitionStrategy.HI_RES),
        (False, True, False, ["Image"], PartitionStrategy.HI_RES),
        (False, True, False, [], PartitionStrategy.HI_RES),
        (False, False, True, ["Image"], PartitionStrategy.HI_RES),
        (False, False, True, [], PartitionStrategy.HI_RES),
        (False, False, False, ["Image"], PartitionStrategy.HI_RES),
        (False, False, False, [], PartitionStrategy.OCR_ONLY),
    ],
)
def test_determine_pdf_auto_strategy(
    pdf_text_extractable,
    infer_table_structure,
    extract_images_in_pdf,
    extract_image_block_types,
    expected,
):
    """AUTO on a PDF resolves to the strategy expected for each option combination.

    Per the table: HI_RES whenever table inference or any image extraction option
    is requested; otherwise FAST if text is extractable and OCR_ONLY if it is not.
    """
    options = {
        "pdf_text_extractable": pdf_text_extractable,
        "infer_table_structure": infer_table_structure,
        "extract_images_in_pdf": extract_images_in_pdf,
        "extract_image_block_types": extract_image_block_types,
    }
    resolved = strategies.determine_pdf_or_image_strategy(
        strategy=PartitionStrategy.AUTO,
        is_image=False,
        **options,
    )
    assert resolved == expected


def test_determine_image_auto_strategy():
    """AUTO resolves to HI_RES when the input is flagged as an image."""
    resolved = strategies.determine_pdf_or_image_strategy(
        is_image=True, strategy=PartitionStrategy.AUTO
    )
    assert resolved == PartitionStrategy.HI_RES