Check chipper version and prevent running pdfminer with chipper (#2347)

We have added a new version of Chipper (Chipperv3), so unstructured needs
to work effectively with all the current Chipper versions.
This implies resizing images to the appropriate resolution and making
sure that Chipper elements are not sorted by unstructured.

In addition, it seems that PDFMiner is being called when calling
Chipper, which adds repeated elements from Chipper and PDFMiner.

To evaluate this PR, you can test the code below with the attached PDF.
The code writes a JSON file with the generated elements. The output can
be examined with `cat out.un.json | python -m json.tool`. There are
three things to check:

1. The size of the image passed to Chipper, which can be identified in
the layout_height and layout_width attributes. These should have the values
3301 and 2550, as shown in the example below:

```
[
    {
        "element_id": "c0493a7872f227e4172c4192c5f48a06",
        "metadata": {
            "coordinates": {
                "layout_height": 3301,
                "layout_width": 2550,

```

2. There should be no repeated elements. 
3. Order should be closer to reading order.

The script to run Chipper from unstructured is:

```
from unstructured import __version__
print(__version__.__version__)

import json
from unstructured.partition.auto import partition
from unstructured.staging.base import elements_to_json

elements = json.loads(elements_to_json(partition("Huang_Improving_Table_Structure_Recognition_With_Visual-Alignment_Sequential_Coordinate_Modeling_CVPR_2023_paper-p6.pdf", strategy="hi_res", model_name="chipperv3")))

with open('out.un.json', 'w') as w:
    json.dump(elements, w)

```



[Huang_Improving_Table_Structure_Recognition_With_Visual-Alignment_Sequential_Coordinate_Modeling_CVPR_2023_paper-p6.pdf](https://github.com/Unstructured-IO/unstructured/files/13817273/Huang_Improving_Table_Structure_Recognition_With_Visual-Alignment_Sequential_Coordinate_Modeling_CVPR_2023_paper-p6.pdf)

---------

Co-authored-by: Antonio Jimeno Yepes <antonio@unstructured.io>
This commit is contained in:
Antonio Jose Jimeno Yepes 2024-01-25 13:33:32 +11:00 committed by GitHub
parent 4613e52e11
commit d8b3bdb919
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 56 additions and 40 deletions

View File

@ -1,4 +1,4 @@
## 0.12.3-dev4
## 0.12.3-dev5
### Enhancements
@ -10,6 +10,7 @@
* **Add Databricks Volumes destination connector** Databricks Volumes connector added to ingest CLI. Users may now use `unstructured-ingest` to write partitioned data to a Databricks Volumes storage service.
### Fixes
* **Fix support for different Chipper versions and prevent running PDFMiner with Chipper**
* **Treat YAML files as text.** Adds YAML MIME types to the file detection code and treats those
files as text.
* **Fix FSSpec destination connectors check_connection.** FSSpec destination connectors did not use `check_connection`. There was an error when trying to `ls` destination directory - it may not exist at the moment of connector creation. Now `check_connection` calls `ls` on bucket root and this method is called on `initialize` of destination connector.

View File

@ -30,3 +30,13 @@ def test_chipper_not_losing_parents(chipper_results, chipper_children):
[el for el in chipper_results if el.id == child.metadata.parent_id]
for child in chipper_children
)
def chipper_test_pdfminer_repeated(chipper_results):
    """
    Test to verify that PDFMiner has not been run together with Chipper.

    The check is a proxy: it asserts that no two elements in
    ``chipper_results`` carry the same ``text``. Repeated texts are what
    shows up when the same content is emitted twice.

    NOTE(review): this function name has no ``test_`` prefix, so with
    pytest's default discovery rules it would presumably not be collected.
    Confirm whether that is intentional.
    """
    seen = set()
    repeated = []
    for element in chipper_results:
        text = element.text
        if text in seen:
            # Record each repeated text once, in first-repeat order, so the
            # failure message is stable and readable.
            if text not in repeated:
                repeated.append(text)
        else:
            seen.add(text)
    # Same pass/fail condition as comparing the list length with the set
    # length, but the failure now names the offending texts.
    assert not repeated, f"element texts repeated in Chipper output: {repeated!r}"

View File

@ -1 +1 @@
__version__ = "0.12.3-dev4" # pragma: no cover
__version__ = "0.12.3-dev5" # pragma: no cover

View File

@ -298,8 +298,8 @@ def _partition_pdf_or_image_local(
hi_res_model_name or model_name or default_hi_res_model(infer_table_structure)
)
if pdf_image_dpi is None:
pdf_image_dpi = 300 if hi_res_model_name == "chipper" else 200
if (pdf_image_dpi < 300) and (hi_res_model_name == "chipper"):
pdf_image_dpi = 300 if hi_res_model_name.startswith("chipper") else 200
if (pdf_image_dpi < 300) and (hi_res_model_name.startswith("chipper")):
logger.warning(
"The Chipper model performs better when images are rendered with DPI >= 300 "
f"(currently {pdf_image_dpi}).",
@ -313,32 +313,33 @@ def _partition_pdf_or_image_local(
pdf_image_dpi=pdf_image_dpi,
)
extracted_layout = (
process_file_with_pdfminer(filename=filename, dpi=pdf_image_dpi)
if pdf_text_extractable
else []
)
if analysis:
annotate_layout_elements(
inferred_document_layout=inferred_document_layout,
extracted_layout=extracted_layout,
filename=filename,
output_dir_path=analyzed_image_output_dir_path,
pdf_image_dpi=pdf_image_dpi,
is_image=is_image,
)
# NOTE(christine): merged_document_layout = extracted_layout + inferred_layout
merged_document_layout = merge_inferred_with_extracted_layout(
inferred_document_layout=inferred_document_layout,
extracted_layout=extracted_layout,
)
if hi_res_model_name.startswith("chipper"):
# NOTE(alan): We shouldn't do OCR with chipper
final_document_layout = merged_document_layout
# NOTE(antonio): We shouldn't do PDFMiner with chipper
final_document_layout = inferred_document_layout
else:
extracted_layout = (
process_file_with_pdfminer(filename=filename, dpi=pdf_image_dpi)
if pdf_text_extractable
else []
)
if analysis:
annotate_layout_elements(
inferred_document_layout=inferred_document_layout,
extracted_layout=extracted_layout,
filename=filename,
output_dir_path=analyzed_image_output_dir_path,
pdf_image_dpi=pdf_image_dpi,
is_image=is_image,
)
# NOTE(christine): merged_document_layout = extracted_layout + inferred_layout
merged_document_layout = merge_inferred_with_extracted_layout(
inferred_document_layout=inferred_document_layout,
extracted_layout=extracted_layout,
)
final_document_layout = process_file_with_ocr(
filename,
merged_document_layout,
@ -355,23 +356,27 @@ def _partition_pdf_or_image_local(
model_name=hi_res_model_name,
pdf_image_dpi=pdf_image_dpi,
)
if hasattr(file, "seek"):
file.seek(0)
extracted_layout = (
process_data_with_pdfminer(file=file, dpi=pdf_image_dpi) if pdf_text_extractable else []
)
# NOTE(christine): merged_document_layout = extracted_layout + inferred_layout
merged_document_layout = merge_inferred_with_extracted_layout(
inferred_document_layout=inferred_document_layout,
extracted_layout=extracted_layout,
)
if hi_res_model_name.startswith("chipper"):
# NOTE(alan): We shouldn't do OCR with chipper
# NOTE(antonio): We shouldn't do PDFMiner with chipper
final_document_layout = merged_document_layout
else:
if hasattr(file, "seek"):
file.seek(0)
extracted_layout = (
process_data_with_pdfminer(file=file, dpi=pdf_image_dpi)
if pdf_text_extractable
else []
)
# NOTE(christine): merged_document_layout = extracted_layout + inferred_layout
merged_document_layout = merge_inferred_with_extracted_layout(
inferred_document_layout=inferred_document_layout,
extracted_layout=extracted_layout,
)
if hasattr(file, "seek"):
file.seek(0)
final_document_layout = process_data_with_ocr(
@ -385,7 +390,7 @@ def _partition_pdf_or_image_local(
)
# NOTE(alan): starting with v2, chipper sorts the elements itself.
if hi_res_model_name == "chipper":
if hi_res_model_name.startswith("chipper") and hi_res_model_name != "chipperv1":
kwargs["sort_mode"] = SORT_MODE_DONT
final_document_layout = clean_pdfminer_inner_elements(final_document_layout)