mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-11-01 18:43:04 +00:00
Check chipper version and prevent running pdfminer with chipper (#2347)
We have added a new version of Chipper (Chipperv3), so unstructured needs
to work effectively with all the current Chipper versions.
This implies resizing images to the appropriate resolution and making
sure that Chipper elements are not sorted by unstructured.
In addition, it seems that PDFMiner is also being run when Chipper is
called, which produces repeated elements (one from Chipper, one from PDFMiner).
To evaluate this PR, you can test the code below with the attached PDF.
The code writes a JSON file with the generated elements. The output can
be examined with `cat out.un.json | python -m json.tool`. There are
three things to check:
1. The size of the image passed to Chipper, which can be identified in
the layout_height and layout_width attributes; these should have the values
3301 and 2550, as shown in the example below:
```
[
{
"element_id": "c0493a7872f227e4172c4192c5f48a06",
"metadata": {
"coordinates": {
"layout_height": 3301,
"layout_width": 2550,
```
2. There should be no repeated elements.
3. Order should be closer to reading order.
The script to run Chipper from unstructured is:
```
from unstructured import __version__
print(__version__.__version__)
import json
from unstructured.partition.auto import partition
from unstructured.staging.base import elements_to_json
elements = json.loads(elements_to_json(partition("Huang_Improving_Table_Structure_Recognition_With_Visual-Alignment_Sequential_Coordinate_Modeling_CVPR_2023_paper-p6.pdf", strategy="hi_res", model_name="chipperv3")))
with open('out.un.json', 'w') as w:
json.dump(elements, w)
```
[Huang_Improving_Table_Structure_Recognition_With_Visual-Alignment_Sequential_Coordinate_Modeling_CVPR_2023_paper-p6.pdf](https://github.com/Unstructured-IO/unstructured/files/13817273/Huang_Improving_Table_Structure_Recognition_With_Visual-Alignment_Sequential_Coordinate_Modeling_CVPR_2023_paper-p6.pdf)
---------
Co-authored-by: Antonio Jimeno Yepes <antonio@unstructured.io>
This commit is contained in:
parent
4613e52e11
commit
d8b3bdb919
@ -1,4 +1,4 @@
|
||||
## 0.12.3-dev4
|
||||
## 0.12.3-dev5
|
||||
|
||||
### Enhancements
|
||||
|
||||
@ -10,6 +10,7 @@
|
||||
* **Add Databricks Volumes destination connector** Databricks Volumes connector added to ingest CLI. Users may now use `unstructured-ingest` to write partitioned data to a Databricks Volumes storage service.
|
||||
|
||||
### Fixes
|
||||
* **Fix support for different Chipper versions and prevent running PDFMiner with Chipper**
|
||||
* **Treat YAML files as text.** Adds YAML MIME types to the file detection code and treats those
|
||||
files as text.
|
||||
* **Fix FSSpec destination connectors check_connection.** FSSpec destination connectors did not use `check_connection`. There was an error when trying to `ls` destination directory - it may not exist at the moment of connector creation. Now `check_connection` calls `ls` on bucket root and this method is called on `initialize` of destination connector.
|
||||
|
||||
@ -30,3 +30,13 @@ def test_chipper_not_losing_parents(chipper_results, chipper_children):
|
||||
[el for el in chipper_results if el.id == child.metadata.parent_id]
|
||||
for child in chipper_children
|
||||
)
|
||||
|
||||
|
||||
def chipper_test_pdfminer_repeated(chipper_results):
|
||||
"""
|
||||
Test to verify that PDFMiner has not been run together with Chipper
|
||||
"""
|
||||
elements = chipper_results
|
||||
assert len([element.text for element in elements]) == len(
|
||||
{element.text for element in elements}
|
||||
)
|
||||
|
||||
@ -1 +1 @@
|
||||
__version__ = "0.12.3-dev4" # pragma: no cover
|
||||
__version__ = "0.12.3-dev5" # pragma: no cover
|
||||
|
||||
@ -298,8 +298,8 @@ def _partition_pdf_or_image_local(
|
||||
hi_res_model_name or model_name or default_hi_res_model(infer_table_structure)
|
||||
)
|
||||
if pdf_image_dpi is None:
|
||||
pdf_image_dpi = 300 if hi_res_model_name == "chipper" else 200
|
||||
if (pdf_image_dpi < 300) and (hi_res_model_name == "chipper"):
|
||||
pdf_image_dpi = 300 if hi_res_model_name.startswith("chipper") else 200
|
||||
if (pdf_image_dpi < 300) and (hi_res_model_name.startswith("chipper")):
|
||||
logger.warning(
|
||||
"The Chipper model performs better when images are rendered with DPI >= 300 "
|
||||
f"(currently {pdf_image_dpi}).",
|
||||
@ -313,32 +313,33 @@ def _partition_pdf_or_image_local(
|
||||
pdf_image_dpi=pdf_image_dpi,
|
||||
)
|
||||
|
||||
extracted_layout = (
|
||||
process_file_with_pdfminer(filename=filename, dpi=pdf_image_dpi)
|
||||
if pdf_text_extractable
|
||||
else []
|
||||
)
|
||||
|
||||
if analysis:
|
||||
annotate_layout_elements(
|
||||
inferred_document_layout=inferred_document_layout,
|
||||
extracted_layout=extracted_layout,
|
||||
filename=filename,
|
||||
output_dir_path=analyzed_image_output_dir_path,
|
||||
pdf_image_dpi=pdf_image_dpi,
|
||||
is_image=is_image,
|
||||
)
|
||||
|
||||
# NOTE(christine): merged_document_layout = extracted_layout + inferred_layout
|
||||
merged_document_layout = merge_inferred_with_extracted_layout(
|
||||
inferred_document_layout=inferred_document_layout,
|
||||
extracted_layout=extracted_layout,
|
||||
)
|
||||
|
||||
if hi_res_model_name.startswith("chipper"):
|
||||
# NOTE(alan): We shouldn't do OCR with chipper
|
||||
final_document_layout = merged_document_layout
|
||||
# NOTE(antonio): We shouldn't do PDFMiner with chipper
|
||||
final_document_layout = inferred_document_layout
|
||||
else:
|
||||
extracted_layout = (
|
||||
process_file_with_pdfminer(filename=filename, dpi=pdf_image_dpi)
|
||||
if pdf_text_extractable
|
||||
else []
|
||||
)
|
||||
|
||||
if analysis:
|
||||
annotate_layout_elements(
|
||||
inferred_document_layout=inferred_document_layout,
|
||||
extracted_layout=extracted_layout,
|
||||
filename=filename,
|
||||
output_dir_path=analyzed_image_output_dir_path,
|
||||
pdf_image_dpi=pdf_image_dpi,
|
||||
is_image=is_image,
|
||||
)
|
||||
|
||||
# NOTE(christine): merged_document_layout = extracted_layout + inferred_layout
|
||||
merged_document_layout = merge_inferred_with_extracted_layout(
|
||||
inferred_document_layout=inferred_document_layout,
|
||||
extracted_layout=extracted_layout,
|
||||
)
|
||||
|
||||
final_document_layout = process_file_with_ocr(
|
||||
filename,
|
||||
merged_document_layout,
|
||||
@ -355,23 +356,27 @@ def _partition_pdf_or_image_local(
|
||||
model_name=hi_res_model_name,
|
||||
pdf_image_dpi=pdf_image_dpi,
|
||||
)
|
||||
if hasattr(file, "seek"):
|
||||
file.seek(0)
|
||||
|
||||
extracted_layout = (
|
||||
process_data_with_pdfminer(file=file, dpi=pdf_image_dpi) if pdf_text_extractable else []
|
||||
)
|
||||
|
||||
# NOTE(christine): merged_document_layout = extracted_layout + inferred_layout
|
||||
merged_document_layout = merge_inferred_with_extracted_layout(
|
||||
inferred_document_layout=inferred_document_layout,
|
||||
extracted_layout=extracted_layout,
|
||||
)
|
||||
|
||||
if hi_res_model_name.startswith("chipper"):
|
||||
# NOTE(alan): We shouldn't do OCR with chipper
|
||||
# NOTE(antonio): We shouldn't do PDFMiner with chipper
|
||||
final_document_layout = merged_document_layout
|
||||
else:
|
||||
if hasattr(file, "seek"):
|
||||
file.seek(0)
|
||||
|
||||
extracted_layout = (
|
||||
process_data_with_pdfminer(file=file, dpi=pdf_image_dpi)
|
||||
if pdf_text_extractable
|
||||
else []
|
||||
)
|
||||
|
||||
# NOTE(christine): merged_document_layout = extracted_layout + inferred_layout
|
||||
merged_document_layout = merge_inferred_with_extracted_layout(
|
||||
inferred_document_layout=inferred_document_layout,
|
||||
extracted_layout=extracted_layout,
|
||||
)
|
||||
|
||||
if hasattr(file, "seek"):
|
||||
file.seek(0)
|
||||
final_document_layout = process_data_with_ocr(
|
||||
@ -385,7 +390,7 @@ def _partition_pdf_or_image_local(
|
||||
)
|
||||
|
||||
# NOTE(alan): starting with v2, chipper sorts the elements itself.
|
||||
if hi_res_model_name == "chipper":
|
||||
if hi_res_model_name.startswith("chipper") and hi_res_model_name != "chipperv1":
|
||||
kwargs["sort_mode"] = SORT_MODE_DONT
|
||||
|
||||
final_document_layout = clean_pdfminer_inner_elements(final_document_layout)
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user