2023-10-31 16:02:00 -05:00
|
|
|
import pytest
|
|
|
|
|
2024-07-18 15:21:32 -07:00
|
|
|
from test_unstructured.unit_utils import example_doc_path
|
2023-12-15 14:29:58 -08:00
|
|
|
from unstructured.partition import pdf
|
2023-11-15 21:41:02 -08:00
|
|
|
from unstructured.partition.utils.constants import PartitionStrategy
|
2023-10-31 16:02:00 -05:00
|
|
|
|
|
|
|
|
|
|
|
@pytest.fixture(scope="session")
def chipper_results():
    """Partition the example layout-parser PDF using the "chipper" model.

    The fixture is session-scoped, so the partitioning call is made once and
    its result is shared by every test that requests it.

    Returns:
        Whatever ``pdf.partition_pdf`` returns for the example document when
        called with the hi-res strategy and ``model_name="chipper"``.
    """
    return pdf.partition_pdf(
        filename=example_doc_path("pdf/layout-parser-paper-fast.pdf"),
        strategy=PartitionStrategy.HI_RES,
        model_name="chipper",
    )
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.fixture(scope="session")
def chipper_children(chipper_results):
    """Return the elements of ``chipper_results`` that reference a parent.

    An element is kept when its ``metadata.parent_id`` is not ``None``; the
    original order of ``chipper_results`` is preserved.
    """

    def has_parent(element):
        # Only an explicit None means "no parent"; any other value is kept.
        return element.metadata.parent_id is not None

    return [element for element in chipper_results if has_parent(element)]
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.chipper()
def test_chipper_has_hierarchy(chipper_children):
    """Check that at least one element in the results has a parent."""
    # A non-empty (truthy) collection of children means some element carries
    # a parent_id.
    found_any_child = bool(chipper_children)
    assert found_any_child
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.chipper()
def test_chipper_not_losing_parents(chipper_results, chipper_children):
    """Check that every child's ``parent_id`` is the ``id`` of some result element.

    Args:
        chipper_results: all elements produced by the ``chipper_results`` fixture.
        chipper_children: the elements whose ``metadata.parent_id`` is not None.

    Raises:
        AssertionError: when at least one child references a parent id that no
            element in ``chipper_results`` has; the message lists those ids.
    """
    # Gather the ids once so each child costs a set lookup rather than a full
    # rescan of the results.
    known_ids = {el.id for el in chipper_results}
    missing_parent_ids = [
        child.metadata.parent_id
        for child in chipper_children
        if child.metadata.parent_id not in known_ids
    ]
    assert not missing_parent_ids, f"parent ids not found in results: {missing_parent_ids}"
|
Check chipper version and prevent running pdfminer with chipper (#2347)
We have added a new version of Chipper (Chipperv3), which requires
unstructured to work effectively with all the current Chipper versions.
This implies resizing images to the appropriate resolution and making
sure that Chipper elements are not sorted by unstructured.
In addition, it seems that PDFMiner is being called when calling
Chipper, which adds repeated elements from Chipper and PDFMiner.
To evaluate this PR, you can test the code below with the attached PDF.
The code writes a JSON file with the generated elements. The output can
be examined with `cat out.un.json | python -m json.tool`. There are
three things to check:
1. The size of the image passed to Chipper, which can be identified in
the layout_height and layout_width attributes, which should have values
3301 and 2550 as shown in the example below:
```
[
{
"element_id": "c0493a7872f227e4172c4192c5f48a06",
"metadata": {
"coordinates": {
"layout_height": 3301,
"layout_width": 2550,
```
2. There should be no repeated elements.
3. Order should be closer to reading order.
The script to run Chipper from unstructured is:
```
from unstructured import __version__
print(__version__.__version__)
import json
from unstructured.partition.auto import partition
from unstructured.staging.base import elements_to_json
elements = json.loads(elements_to_json(partition("Huang_Improving_Table_Structure_Recognition_With_Visual-Alignment_Sequential_Coordinate_Modeling_CVPR_2023_paper-p6.pdf", strategy="hi_res", model_name="chipperv3")))
with open('out.un.json', 'w') as w:
json.dump(elements, w)
```
[Huang_Improving_Table_Structure_Recognition_With_Visual-Alignment_Sequential_Coordinate_Modeling_CVPR_2023_paper-p6.pdf](https://github.com/Unstructured-IO/unstructured/files/13817273/Huang_Improving_Table_Structure_Recognition_With_Visual-Alignment_Sequential_Coordinate_Modeling_CVPR_2023_paper-p6.pdf)
---------
Co-authored-by: Antonio Jimeno Yepes <antonio@unstructured.io>
2024-01-25 13:33:32 +11:00
|
|
|
|
|
|
|
|
|
|
|
def chipper_test_pdfminer_repeated(chipper_results):
    """
    Test to verify that PDFMiner has not been run together with Chipper

    The check passes when no two elements share the same ``text``.

    Args:
        chipper_results: iterable of elements exposing a hashable ``text``.

    Raises:
        AssertionError: when any text occurs more than once; the message maps
            each repeated text to its number of occurrences.
    """
    # NOTE(review): this name does not start with "test_" and the function has
    # no `chipper` mark, so under pytest's default discovery rules it is
    # presumably never collected - confirm whether that is intentional.
    from collections import Counter

    # One pass over the elements; equivalent to comparing the length of the
    # list of texts with the length of the set of texts.
    occurrences = Counter(element.text for element in chipper_results)
    repeated = {text: count for text, count in occurrences.items() if count > 1}
    assert not repeated, f"element text repeated in Chipper output: {repeated}"
|