unstructured/test_unstructured/partition/test_strategies.py

import os

import pytest

from unstructured.partition import pdf, strategies


def test_validate_strategy_validates():
    """A valid strategy/filetype pair ("hi_res" for "pdf") must not raise."""
    strategy, filetype = "hi_res", "pdf"
    # Passing means simply returning without an exception.
    strategies.validate_strategy(strategy, filetype)


def test_validate_strategy_raises_for_bad_filetype():
    """Requesting "fast" for the "image" filetype is expected to raise ``ValueError``."""
    strategy, filetype = "fast", "image"
    with pytest.raises(ValueError):
        strategies.validate_strategy(strategy, filetype)


def test_validate_strategy_raises_for_bad_strategy():
    """An unrecognized strategy name is expected to raise ``ValueError``."""
    strategy, filetype = "totally_guess_the_text", "image"
    with pytest.raises(ValueError):
        strategies.validate_strategy(strategy, filetype)


@pytest.mark.parametrize(
    ("filename", "from_file", "expected"),
    [
        ("layout-parser-paper-fast.pdf", True, True),
        ("copy-protected.pdf", True, True),
        ("loremipsum-flat.pdf", True, False),
        ("layout-parser-paper-fast.pdf", False, True),
        ("copy-protected.pdf", False, True),
        ("loremipsum-flat.pdf", False, False),
    ],
)
def test_is_pdf_text_extractable(filename, from_file, expected):
    """Check the truthiness of ``pdf.extractable_elements`` for each sample PDF.

    Every document is exercised twice: once passed as an open binary file
    object (``from_file=True``) and once passed by path (``from_file=False``).
    ``expected`` is the truthiness the returned value should have.
    """
    # The path is relative, so the example documents are resolved against the
    # current working directory.
    doc_path = os.path.join("example-docs", filename)

    if not from_file:
        elements = pdf.extractable_elements(filename=doc_path)
    else:
        with open(doc_path, "rb") as handle:
            elements = pdf.extractable_elements(file=handle)

    assert bool(elements) is expected


@pytest.mark.parametrize(
    ("infer_table_structure", "expected"),
    [
        (True, "hi_res"),
        (False, "ocr_only"),
    ],
)
def test_determine_image_auto_strategy(infer_table_structure, expected):
    """Check the strategy ``_determine_image_auto_strategy`` picks for images.

    With ``infer_table_structure=True`` the expected strategy is "hi_res";
    with ``False`` it is "ocr_only".
    """
    strategy = strategies._determine_image_auto_strategy(
        infer_table_structure=infer_table_structure,
    )
    # Compare by value: ``is`` on strings only passes when both happen to be
    # the same interned object, which is an implementation detail.
    assert strategy == expected


@pytest.mark.parametrize(
    ("pdf_text_extractable", "infer_table_structure", "expected"),
    [
        (True, True, "hi_res"),
        (False, True, "hi_res"),
        (True, False, "fast"),
        (False, False, "ocr_only"),
    ],
)
def test_determine_image_pdf_strategy(pdf_text_extractable, infer_table_structure, expected):
    """Check the strategy ``_determine_pdf_auto_strategy`` picks for PDFs.

    The cases cover every combination of the two flags: "hi_res" is expected
    whenever ``infer_table_structure`` is true; otherwise "fast" when the PDF
    text is extractable and "ocr_only" when it is not.
    """
    strategy = strategies._determine_pdf_auto_strategy(
        pdf_text_extractable=pdf_text_extractable,
        infer_table_structure=infer_table_structure,
    )
    # Compare by value: ``is`` on strings only passes when both happen to be
    # the same interned object, which is an implementation detail.
    assert strategy == expected


def test_determine_pdf_or_image_strategy_fallback_ocr():
    """Requesting "fast" for an image is expected to resolve to "ocr_only"."""
    requested = {"strategy": "fast", "is_image": True}
    resolved = strategies.determine_pdf_or_image_strategy(**requested)
    assert resolved == "ocr_only"
enhancement: add "ocr_only" strategy for PDFs (#553) * add tests for validating strategy * refactor into determine_pdf_strategy function * refactor pdf strategies into strategies * remove commented out code * remove unreachable code * add in handling for image types * a little more refactoring * import ocr partioning for images * catch warnings, partition type for valid strategies * fallback to ocr_only from fast * fallback logic for hi_res * test for fallback to ocr only * fallback logic ofr ocr_only * more tests for fallback logic * update doc strings * version and changelog * linting, linting, linting * update docs to include notes about strategy * fix typos * change back patched filename 2023-05-08 13:21:24 -04:00			`import os`

			`import pytest`

fix: better extractable check (#900) auto strategy was choosing the fast strategy in cases where the pdf contents were just a flat image, resulting in no output. This PR changes the behavior of auto so that elements that can be extracted by fast are extracted, a cursory examination of the elements is made to see if there are elements with text present, and if so then these elements are used as the output. Otherwise fallback strategies come into play. 2023-07-07 23:41:37 -05:00			`from unstructured.partition import pdf, strategies`
enhancement: add "ocr_only" strategy for PDFs (#553) * add tests for validating strategy * refactor into determine_pdf_strategy function * refactor pdf strategies into strategies * remove commented out code * remove unreachable code * add in handling for image types * a little more refactoring * import ocr partioning for images * catch warnings, partition type for valid strategies * fallback to ocr_only from fast * fallback logic for hi_res * test for fallback to ocr only * fallback logic ofr ocr_only * more tests for fallback logic * update doc strings * version and changelog * linting, linting, linting * update docs to include notes about strategy * fix typos * change back patched filename 2023-05-08 13:21:24 -04:00

			`def test_validate_strategy_validates():`
			`# Nothing should raise for a valid strategy`
			`strategies.validate_strategy("hi_res", "pdf")`


			`def test_validate_strategy_raises_for_bad_filetype():`
			`with pytest.raises(ValueError):`
			`strategies.validate_strategy("fast", "image")`


			`def test_validate_strategy_raises_for_bad_strategy():`
			`with pytest.raises(ValueError):`
			`strategies.validate_strategy("totally_guess_the_text", "image")`


			`@pytest.mark.parametrize(`
			`("filename", "from_file", "expected"),`
			`[`
			`("layout-parser-paper-fast.pdf", True, True),`
fix: better extractable check (#900) auto strategy was choosing the fast strategy in cases where the pdf contents were just a flat image, resulting in no output. This PR changes the behavior of auto so that elements that can be extracted by fast are extracted, a cursory examination of the elements is made to see if there are elements with text present, and if so then these elements are used as the output. Otherwise fallback strategies come into play. 2023-07-07 23:41:37 -05:00			`("copy-protected.pdf", True, True),`
			`("loremipsum-flat.pdf", True, False),`
enhancement: add "ocr_only" strategy for PDFs (#553) * add tests for validating strategy * refactor into determine_pdf_strategy function * refactor pdf strategies into strategies * remove commented out code * remove unreachable code * add in handling for image types * a little more refactoring * import ocr partioning for images * catch warnings, partition type for valid strategies * fallback to ocr_only from fast * fallback logic for hi_res * test for fallback to ocr only * fallback logic ofr ocr_only * more tests for fallback logic * update doc strings * version and changelog * linting, linting, linting * update docs to include notes about strategy * fix typos * change back patched filename 2023-05-08 13:21:24 -04:00			`("layout-parser-paper-fast.pdf", False, True),`
fix: better extractable check (#900) auto strategy was choosing the fast strategy in cases where the pdf contents were just a flat image, resulting in no output. This PR changes the behavior of auto so that elements that can be extracted by fast are extracted, a cursory examination of the elements is made to see if there are elements with text present, and if so then these elements are used as the output. Otherwise fallback strategies come into play. 2023-07-07 23:41:37 -05:00			`("copy-protected.pdf", False, True),`
			`("loremipsum-flat.pdf", False, False),`
enhancement: add "ocr_only" strategy for PDFs (#553) * add tests for validating strategy * refactor into determine_pdf_strategy function * refactor pdf strategies into strategies * remove commented out code * remove unreachable code * add in handling for image types * a little more refactoring * import ocr partioning for images * catch warnings, partition type for valid strategies * fallback to ocr_only from fast * fallback logic for hi_res * test for fallback to ocr only * fallback logic ofr ocr_only * more tests for fallback logic * update doc strings * version and changelog * linting, linting, linting * update docs to include notes about strategy * fix typos * change back patched filename 2023-05-08 13:21:24 -04:00			`],`
			`)`
			`def test_is_pdf_text_extractable(filename, from_file, expected):`
			`filename = os.path.join("example-docs", filename)`

			`if from_file:`
			`with open(filename, "rb") as f:`
fix: better extractable check (#900) auto strategy was choosing the fast strategy in cases where the pdf contents were just a flat image, resulting in no output. This PR changes the behavior of auto so that elements that can be extracted by fast are extracted, a cursory examination of the elements is made to see if there are elements with text present, and if so then these elements are used as the output. Otherwise fallback strategies come into play. 2023-07-07 23:41:37 -05:00			`extractable = pdf.extractable_elements(file=f)`
enhancement: add "ocr_only" strategy for PDFs (#553) * add tests for validating strategy * refactor into determine_pdf_strategy function * refactor pdf strategies into strategies * remove commented out code * remove unreachable code * add in handling for image types * a little more refactoring * import ocr partioning for images * catch warnings, partition type for valid strategies * fallback to ocr_only from fast * fallback logic for hi_res * test for fallback to ocr only * fallback logic ofr ocr_only * more tests for fallback logic * update doc strings * version and changelog * linting, linting, linting * update docs to include notes about strategy * fix typos * change back patched filename 2023-05-08 13:21:24 -04:00			`else:`
fix: better extractable check (#900) auto strategy was choosing the fast strategy in cases where the pdf contents were just a flat image, resulting in no output. This PR changes the behavior of auto so that elements that can be extracted by fast are extracted, a cursory examination of the elements is made to see if there are elements with text present, and if so then these elements are used as the output. Otherwise fallback strategies come into play. 2023-07-07 23:41:37 -05:00			`extractable = pdf.extractable_elements(filename=filename)`
enhancement: add "ocr_only" strategy for PDFs (#553) * add tests for validating strategy * refactor into determine_pdf_strategy function * refactor pdf strategies into strategies * remove commented out code * remove unreachable code * add in handling for image types * a little more refactoring * import ocr partioning for images * catch warnings, partition type for valid strategies * fallback to ocr_only from fast * fallback logic for hi_res * test for fallback to ocr only * fallback logic ofr ocr_only * more tests for fallback logic * update doc strings * version and changelog * linting, linting, linting * update docs to include notes about strategy * fix typos * change back patched filename 2023-05-08 13:21:24 -04:00
fix: better extractable check (#900) auto strategy was choosing the fast strategy in cases where the pdf contents were just a flat image, resulting in no output. This PR changes the behavior of auto so that elements that can be extracted by fast are extracted, a cursory examination of the elements is made to see if there are elements with text present, and if so then these elements are used as the output. Otherwise fallback strategies come into play. 2023-07-07 23:41:37 -05:00			`assert bool(extractable) is expected`
enhancement: auto strategy for PDFs and images (#578) * added functions for determining auto stratgy * change default strategy to auto * tests for auto strategy * update docs * changelog and version * bump version * remove ingest file in wrong location * update jpg output * typo fix 2023-05-12 13:45:08 -04:00

			`@pytest.mark.parametrize(`
			`("infer_table_structure", "expected"),`
			`[`
			`(True, "hi_res"),`
			`(False, "ocr_only"),`
			`],`
			`)`
			`def test_determine_image_auto_strategy(infer_table_structure, expected):`
			`strategy = strategies._determine_image_auto_strategy(`
			`infer_table_structure=infer_table_structure,`
			`)`
			`assert strategy is expected`


			`@pytest.mark.parametrize(`
			`("pdf_text_extractable", "infer_table_structure", "expected"),`
			`[`
			`(True, True, "hi_res"),`
			`(False, True, "hi_res"),`
			`(True, False, "fast"),`
			`(False, False, "ocr_only"),`
			`],`
			`)`
			`def test_determine_image_pdf_strategy(pdf_text_extractable, infer_table_structure, expected):`
			`strategy = strategies._determine_pdf_auto_strategy(`
			`pdf_text_extractable=pdf_text_extractable,`
			`infer_table_structure=infer_table_structure,`
			`)`
			`assert strategy is expected`
Chore: convert fast strategy to ocr_only for images (#735) * fall back to ocr only * more note * add test case * maybe remove skipping dockertest for kor ocr? * bump again * clean up flag * empty commit 2023-06-16 10:59:13 -04:00

			`def test_determine_pdf_or_image_strategy_fallback_ocr():`
			`strategy = strategies.determine_pdf_or_image_strategy(`
			`strategy="fast",`
			`is_image=True,`
			`)`
			`assert strategy == "ocr_only"`