Christine Straub 69d0ee1aea
Refactor: support merging extracted layout with inferred layout (#2158)
### Summary
This PR is the second part of the `pdfminer` refactor, which moves it from
the `unstructured-inference` repo to the `unstructured` repo; the first part
was done in
https://github.com/Unstructured-IO/unstructured-inference/pull/294. This
PR adds logic to merge the extracted layout with the inferred layout.

The updated workflow for the `hi_res` strategy:
* pass the document (as data/filename) to the `inference` repo to get
`inferred_layout` (DocumentLayout)
* pass the `inferred_layout` returned from the `inference` repo and the
document (as data/filename) to the `pdfminer_processing` module, which
first opens the document (create temp file/dir as needed), and splits
the document by pages
  * if is_image is `True`, return the passed
`inferred_layout` (DocumentLayout)
  * if is_image is `False`:
    * get `extracted_layout` (TextRegions) from the passed
document (data/filename) by pdfminer
    * merge `extracted_layout` (TextRegions) with the passed
`inferred_layout` (DocumentLayout)
    * return the `inferred_layout` (DocumentLayout) with updated elements
(all merged LayoutElements) as `merged_layout` (DocumentLayout)
* pass merged_layout and the document (as data/filename) to the `OCR`
module, which first opens the document (create temp file/dir as needed),
and splits the document by pages (convert PDF pages to image pages for
PDF file)

### Note
This PR also fixes issue #2164 by using functionality similar to the one
implemented in the `fast` strategy workflow when extracting elements by
`pdfminer`.

### TODO
* image extraction refactor to move it from `unstructured-inference`
repo to `unstructured` repo
* improving natural reading order by applying the current default
`xycut` sorting to the elements extracted by `pdfminer`
2023-12-01 20:56:31 +00:00

95 lines
2.8 KiB
Python

import os
import pytest
from unstructured.partition import strategies
from unstructured.partition.pdf_image import pdf
from unstructured.partition.utils.constants import PartitionStrategy
@pytest.mark.parametrize(
    "strategy",
    [
        PartitionStrategy.AUTO,
        PartitionStrategy.FAST,
        PartitionStrategy.OCR_ONLY,
        PartitionStrategy.HI_RES,
    ],
)
def test_validate_strategy(strategy):
    """Each listed partition strategy passes validation without raising."""
    # There is deliberately no assertion: the test fails only if the call raises.
    strategies.validate_strategy(strategy=strategy)
def test_validate_strategy_raises_for_fast_strategy():
    """Validating the FAST strategy with ``is_image=True`` raises ``ValueError``."""
    with pytest.raises(ValueError):
        strategies.validate_strategy(
            strategy=PartitionStrategy.FAST,
            is_image=True,
        )
def test_validate_strategy_raises_for_bad_strategy():
    """A value that is not one of the known strategies raises ``ValueError``."""
    unknown_strategy = "totally_guess_the_text"
    with pytest.raises(ValueError):
        strategies.validate_strategy(unknown_strategy)
@pytest.mark.parametrize(
    ("filename", "from_file", "expected"),
    [
        ("layout-parser-paper-fast.pdf", True, True),
        ("copy-protected.pdf", True, True),
        ("loremipsum-flat.pdf", True, False),
        ("layout-parser-paper-fast.pdf", False, True),
        ("copy-protected.pdf", False, True),
        ("loremipsum-flat.pdf", False, False),
    ],
)
def test_is_pdf_text_extractable(filename, from_file, expected):
    """Check that ``pdf.extractable_elements`` is truthy exactly when ``expected`` is True.

    Each sample document is exercised twice: once passed as an open binary
    file object (``from_file=True``) and once passed by path
    (``from_file=False``).
    """
    # The path is relative, so the test relies on the current working directory.
    pdf_path = os.path.join("example-docs", filename)
    if not from_file:
        elements = pdf.extractable_elements(filename=pdf_path)
    else:
        with open(pdf_path, "rb") as pdf_file:
            elements = pdf.extractable_elements(file=pdf_file)
    assert bool(elements) is expected
def test_determine_image_auto_strategy():
    """The auto strategy chosen for images is HI_RES."""
    assert strategies._determine_image_auto_strategy() == PartitionStrategy.HI_RES
@pytest.mark.parametrize(
    ("pdf_text_extractable", "infer_table_structure", "expected"),
    [
        (True, True, PartitionStrategy.HI_RES),
        (False, True, PartitionStrategy.HI_RES),
        (True, False, PartitionStrategy.FAST),
        (False, False, PartitionStrategy.OCR_ONLY),
    ],
)
def test_determine_pdf_auto_strategy(pdf_text_extractable, infer_table_structure, expected):
    """Check the strategy ``_determine_pdf_auto_strategy`` picks for each flag combination.

    The parametrized cases expect HI_RES whenever ``infer_table_structure`` is
    True; otherwise FAST when the PDF text is extractable and OCR_ONLY when
    it is not.
    """
    strategy = strategies._determine_pdf_auto_strategy(
        pdf_text_extractable=pdf_text_extractable,
        infer_table_structure=infer_table_structure,
    )
    # Compare by value, not identity: an identity check would only pass if the
    # function returned the very same object as the constant. The sibling
    # tests in this module also compare strategies with ``==``.
    assert strategy == expected
@pytest.mark.parametrize(
    ("pdf_text_extractable", "infer_table_structure"),
    [
        (True, True),
        (False, True),
        (True, False),
        (False, False),
    ],
)
def test_determine_pdf_or_image_fast_strategy(pdf_text_extractable, infer_table_structure):
    """Requesting FAST yields FAST for every combination of the two flags."""
    resolved = strategies.determine_pdf_or_image_strategy(
        strategy=PartitionStrategy.FAST,
        pdf_text_extractable=pdf_text_extractable,
        infer_table_structure=infer_table_structure,
    )
    assert resolved == PartitionStrategy.FAST