mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-07-07 09:02:59 +00:00

### Summary

This PR is the second part of the `pdfminer` refactor, which moves it from the `unstructured-inference` repo to the `unstructured` repo; the first part was done in https://github.com/Unstructured-IO/unstructured-inference/pull/294. This PR adds logic to merge the extracted layout with the inferred layout.

The updated workflow for the `hi_res` strategy:

* pass the document (as data/filename) to the `inference` repo to get `inferred_layout` (DocumentLayout)
* pass the `inferred_layout` returned from the `inference` repo and the document (as data/filename) to the `pdfminer_processing` module, which first opens the document (creating a temp file/dir as needed) and splits the document by pages
  * if is_image is `True`, return the passed `inferred_layout` (DocumentLayout)
  * if is_image is `False`:
    * get `extracted_layout` (TextRegions) from the passed document (data/filename) by pdfminer
    * merge `extracted_layout` (TextRegions) with the passed `inferred_layout` (DocumentLayout)
    * return the `inferred_layout` (DocumentLayout) with updated elements (all merged LayoutElements) as merged_layout (DocumentLayout)
* pass merged_layout and the document (as data/filename) to the `OCR` module, which first opens the document (creating a temp file/dir as needed) and splits the document by pages (converting PDF pages to image pages for a PDF file)

### Note

This PR also fixes issue #2164 by using functionality similar to the one implemented in the `fast` strategy workflow when extracting elements by `pdfminer`.

### TODO

* image extraction refactor to move it from the `unstructured-inference` repo to the `unstructured` repo
* improving natural reading order by applying the current default `xycut` sorting to the elements extracted by `pdfminer`
86 lines
3.3 KiB
Python
86 lines
3.3 KiB
Python
import pytest
|
|
from PIL import Image
|
|
from unstructured_inference.constants import Source as InferenceSource
|
|
from unstructured_inference.inference.elements import Rectangle
|
|
from unstructured_inference.inference.layout import DocumentLayout, LayoutElement, PageLayout
|
|
|
|
from unstructured.partition.utils.constants import Source
|
|
from unstructured.partition.utils.processing_elements import clean_pdfminer_inner_elements
|
|
|
|
# Fixture: one "Table" element followed by two elements whose source is
# Source.PDFMINER and whose boxes lie within the table's (0, 0, 100, 100) box.
deletable_elements_inside_table = [
    LayoutElement(bbox=Rectangle(0, 0, 100, 100), text="Table with inner elements", type="Table"),
    LayoutElement(
        bbox=Rectangle(50, 50, 70, 70),
        text="text1",
        source=Source.PDFMINER,
    ),
    LayoutElement(
        bbox=Rectangle(70, 70, 80, 80),
        text="text2",
        source=Source.PDFMINER,
    ),
]
|
|
|
|
# Fixture: a "Table" element plus two elements within its box, all with
# source=InferenceSource.YOLOX (i.e. nothing here has source=Source.PDFMINER).
no_deletable_elements_inside_table = [
    LayoutElement(
        bbox=Rectangle(0, 0, 100, 100),
        text="Table with inner elements",
        type="Table",
        source=InferenceSource.YOLOX,
    ),
    LayoutElement(
        bbox=Rectangle(50, 50, 70, 70),
        text="text1",
        source=InferenceSource.YOLOX,
    ),
    LayoutElement(
        bbox=Rectangle(70, 70, 80, 80),
        text="text2",
        source=InferenceSource.YOLOX,
    ),
]
|
|
# Fixture mixing several situations:
#   * two "Table" elements (source=InferenceSource.YOLOX);
#   * elements with source=Source.PDFMINER placed within a table's box;
#   * an element within table 1 that has no explicit source;
#   * elements with source=Source.PDFMINER that are outside both tables.
# NOTE(review): the last two boxes have a 4th coordinate (300, 400) smaller than
# their 2nd (510, 550); if Rectangle is (x1, y1, x2, y2) they are inverted --
# confirm this is intended before changing the values.
mix_elements_inside_table = [
    LayoutElement(
        bbox=Rectangle(0, 0, 100, 100),
        text="Table1 with inner elements",
        type="Table",
        source=InferenceSource.YOLOX,
    ),
    LayoutElement(
        bbox=Rectangle(50, 50, 70, 70),
        text="Inside table1",
    ),
    LayoutElement(
        bbox=Rectangle(70, 70, 80, 80),
        text="Inside table1",
        source=Source.PDFMINER,
    ),
    LayoutElement(bbox=Rectangle(150, 150, 170, 170), text="Outside tables", source=Source.PDFMINER),
    LayoutElement(bbox=Rectangle(180, 180, 200, 200), text="Outside tables", source=Source.PDFMINER),
    LayoutElement(
        bbox=Rectangle(0, 500, 100, 700),
        text="Table2 with inner elements",
        type="Table",
        source=InferenceSource.YOLOX,
    ),
    LayoutElement(
        bbox=Rectangle(0, 510, 50, 300),
        text="Inside table2",
        source=Source.PDFMINER,
    ),
    LayoutElement(
        bbox=Rectangle(0, 550, 70, 400),
        text="Inside table2",
        source=Source.PDFMINER,
    ),
]
|
|
|
|
|
|
@pytest.mark.parametrize(
    ("elements", "length_extra_info", "expected_document_length"),
    [
        (deletable_elements_inside_table, 1, 1),
        (no_deletable_elements_inside_table, 0, 3),
        (mix_elements_inside_table, 2, 5),
    ],
)
def test_clean_pdfminer_inner_elements(elements, length_extra_info, expected_document_length):
    """Check how many elements are left on the page after cleaning.

    A one-page ``DocumentLayout`` is built from ``elements``, passed through
    ``clean_pdfminer_inner_elements``, and the number of elements remaining on
    that page is compared with ``expected_document_length``.

    ``length_extra_info`` is supplied by the parametrization but is not
    asserted on in this test.
    """
    # A 1x1 placeholder image is enough to construct the page.
    page = PageLayout(number=1, image=Image.new("1", (1, 1)))
    # Copy the list so the module-level fixture is not aliased by the page
    # (protects the shared data in case the cleaning step edits it in place).
    page.elements = list(elements)
    document = DocumentLayout(pages=[page])

    cleaned_doc = clean_pdfminer_inner_elements(document)

    # Only the count of elements remaining on the page is verified.
    assert len(cleaned_doc.pages[0].elements) == expected_document_length
|