2023-05-08 13:21:24 -04:00
|
|
|
import os
|
|
|
|
|
|
|
|
import pytest
|
|
|
|
|
2023-12-01 12:56:31 -08:00
|
|
|
from unstructured.partition import strategies
|
|
|
|
from unstructured.partition.pdf_image import pdf
|
2023-11-15 21:41:02 -08:00
|
|
|
from unstructured.partition.utils.constants import PartitionStrategy
|
2023-05-08 13:21:24 -04:00
|
|
|
|
|
|
|
|
2023-11-15 21:41:02 -08:00
|
|
|
@pytest.mark.parametrize(
    "strategy",
    (
        PartitionStrategy.AUTO,
        PartitionStrategy.FAST,
        PartitionStrategy.OCR_ONLY,
        PartitionStrategy.HI_RES,
    ),
)
def test_validate_strategy(strategy):
    """Each listed strategy is accepted by `validate_strategy`.

    The test passes simply by the call completing: no exception may be
    raised for any of the parametrized strategies.
    """
    strategies.validate_strategy(strategy=strategy)
|
2023-05-08 13:21:24 -04:00
|
|
|
|
|
|
|
|
2023-11-14 10:46:41 -08:00
|
|
|
def test_validate_strategy_raises_for_fast_strategy():
    """`validate_strategy` raises ValueError for FAST when `is_image=True`."""
    image_call_kwargs = {"strategy": PartitionStrategy.FAST, "is_image": True}
    with pytest.raises(ValueError):
        strategies.validate_strategy(**image_call_kwargs)
|
2023-05-08 13:21:24 -04:00
|
|
|
|
|
|
|
|
|
|
|
def test_validate_strategy_raises_for_bad_strategy():
    """`validate_strategy` raises ValueError for an unrecognized strategy name."""
    unknown_strategy = "totally_guess_the_text"
    with pytest.raises(ValueError):
        strategies.validate_strategy(unknown_strategy)
|
2023-05-08 13:21:24 -04:00
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.parametrize(
    ("filename", "from_file", "expected"),
    [
        # Each document is checked twice: first through an open file object
        # (from_file=True), then through its path (from_file=False).
        (pdf_name, via_file_object, has_text)
        for via_file_object in (True, False)
        for pdf_name, has_text in (
            ("layout-parser-paper-fast.pdf", True),
            ("copy-protected.pdf", True),
            ("loremipsum-flat.pdf", False),
        )
    ],
)
def test_is_pdf_text_extractable(filename, from_file, expected):
    """Truthiness of `pdf.extractable_elements` matches `expected` per document.

    `filename` is resolved relative to the `example-docs` directory. When
    `from_file` is true the PDF is passed as a binary file object, otherwise
    it is passed by path.
    """
    pdf_path = os.path.join("example-docs", filename)

    if not from_file:
        elements = pdf.extractable_elements(filename=pdf_path)
    else:
        with open(pdf_path, "rb") as pdf_stream:
            elements = pdf.extractable_elements(file=pdf_stream)

    found_text = bool(elements)
    assert found_text is expected
|
2023-05-12 13:45:08 -04:00
|
|
|
|
|
|
|
|
2023-08-02 09:22:20 -07:00
|
|
|
def test_determine_image_auto_strategy():
    """`_determine_image_auto_strategy` resolves to HI_RES."""
    assert strategies._determine_image_auto_strategy() == PartitionStrategy.HI_RES
|
2023-05-12 13:45:08 -04:00
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.parametrize(
    ("pdf_text_extractable", "infer_table_structure", "expected"),
    [
        (True, True, PartitionStrategy.HI_RES),
        (False, True, PartitionStrategy.HI_RES),
        (True, False, PartitionStrategy.FAST),
        (False, False, PartitionStrategy.OCR_ONLY),
    ],
)
def test_determine_pdf_auto_strategy(pdf_text_extractable, infer_table_structure, expected):
    """`_determine_pdf_auto_strategy` returns the tabulated strategy.

    Each case supplies one combination of `pdf_text_extractable` and
    `infer_table_structure` together with the strategy the helper is
    expected to return for it.
    """
    strategy = strategies._determine_pdf_auto_strategy(
        pdf_text_extractable=pdf_text_extractable,
        infer_table_structure=infer_table_structure,
    )
    # Compare by value, like the other strategy assertions in this module.
    # An identity check (`is`) would only pass by virtue of object interning
    # if the strategy constants are plain strings.
    assert strategy == expected
|
2023-06-16 10:59:13 -04:00
|
|
|
|
|
|
|
|
2023-11-14 10:46:41 -08:00
|
|
|
@pytest.mark.parametrize(
    ("pdf_text_extractable", "infer_table_structure"),
    [
        # Full boolean grid; the table-inference flag varies slowest.
        (text_extractable, infer_tables)
        for infer_tables in (True, False)
        for text_extractable in (True, False)
    ],
)
def test_determine_pdf_or_image_fast_strategy(pdf_text_extractable, infer_table_structure):
    """An explicit FAST request is returned as FAST for every flag combination."""
    chosen = strategies.determine_pdf_or_image_strategy(
        strategy=PartitionStrategy.FAST,
        pdf_text_extractable=pdf_text_extractable,
        infer_table_structure=infer_table_structure,
    )
    assert chosen == PartitionStrategy.FAST
|