diff --git a/CHANGELOG.md b/CHANGELOG.md
index 0e8ea1027..702919886 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,4 +1,4 @@
-## 0.12.3-dev4
+## 0.12.3-dev5
 
 ### Enhancements
 
@@ -10,6 +10,7 @@
 * **Add Databricks Volumes destination connector** Databricks Volumes connector added to ingest CLI. Users may now use `unstructured-ingest` to write partitioned data to a Databricks Volumes storage service.
 
 ### Fixes
+* **Fix support for different Chipper versions and prevent running PDFMiner with Chipper**
 
 * **Treat YAML files as text.** Adds YAML MIME types to the file detection code and treats those files as text.
 * **Fix FSSpec destination connectors check_connection.** FSSpec destination connectors did not use `check_connection`. There was an error when trying to `ls` destination directory - it may not exist at the moment of connector creation. Now `check_connection` calls `ls` on bucket root and this method is called on `initialize` of destination connector.
diff --git a/test_unstructured/partition/pdf_image/test_chipper.py b/test_unstructured/partition/pdf_image/test_chipper.py
index d625f9787..81f421159 100644
--- a/test_unstructured/partition/pdf_image/test_chipper.py
+++ b/test_unstructured/partition/pdf_image/test_chipper.py
@@ -30,3 +30,13 @@ def test_chipper_not_losing_parents(chipper_results, chipper_children):
         [el for el in chipper_results if el.id == child.metadata.parent_id]
         for child in chipper_children
     )
+
+
+def test_chipper_pdfminer_repeated(chipper_results):
+    """
+    Verify that PDFMiner was not run together with Chipper: no element text may be repeated.
+    """
+    elements = chipper_results
+    assert len([element.text for element in elements]) == len(
+        {element.text for element in elements}
+    )
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
index 5b743183b..230fc8d54 100644
--- a/unstructured/__version__.py
+++ b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.12.3-dev4"  # pragma: no cover
+__version__ = "0.12.3-dev5"  # pragma: no cover
diff --git a/unstructured/partition/pdf.py b/unstructured/partition/pdf.py
index 8a0b00302..867a53a07 100644
--- a/unstructured/partition/pdf.py
+++ b/unstructured/partition/pdf.py
@@ -298,8 +298,8 @@ def _partition_pdf_or_image_local(
         hi_res_model_name or model_name or default_hi_res_model(infer_table_structure)
     )
     if pdf_image_dpi is None:
-        pdf_image_dpi = 300 if hi_res_model_name == "chipper" else 200
-    if (pdf_image_dpi < 300) and (hi_res_model_name == "chipper"):
+        pdf_image_dpi = 300 if hi_res_model_name.startswith("chipper") else 200
+    if (pdf_image_dpi < 300) and (hi_res_model_name.startswith("chipper")):
         logger.warning(
             "The Chipper model performs better when images are rendered with DPI >= 300 "
             f"(currently {pdf_image_dpi}).",
@@ -313,32 +313,33 @@ def _partition_pdf_or_image_local(
             pdf_image_dpi=pdf_image_dpi,
         )
 
-        extracted_layout = (
-            process_file_with_pdfminer(filename=filename, dpi=pdf_image_dpi)
-            if pdf_text_extractable
-            else []
-        )
-
-        if analysis:
-            annotate_layout_elements(
-                inferred_document_layout=inferred_document_layout,
-                extracted_layout=extracted_layout,
-                filename=filename,
-                output_dir_path=analyzed_image_output_dir_path,
-                pdf_image_dpi=pdf_image_dpi,
-                is_image=is_image,
-            )
-
-        # NOTE(christine): merged_document_layout = extracted_layout + inferred_layout
-        merged_document_layout = merge_inferred_with_extracted_layout(
-            inferred_document_layout=inferred_document_layout,
-            extracted_layout=extracted_layout,
-        )
-
         if hi_res_model_name.startswith("chipper"):
             # NOTE(alan): We shouldn't do OCR with chipper
-            final_document_layout = merged_document_layout
+            # NOTE(antonio): We shouldn't do PDFMiner with chipper
+            final_document_layout = inferred_document_layout
         else:
+            extracted_layout = (
+                process_file_with_pdfminer(filename=filename, dpi=pdf_image_dpi)
+                if pdf_text_extractable
+                else []
+            )
+
+            if analysis:
+                annotate_layout_elements(
+                    inferred_document_layout=inferred_document_layout,
+                    extracted_layout=extracted_layout,
+                    filename=filename,
+                    output_dir_path=analyzed_image_output_dir_path,
+                    pdf_image_dpi=pdf_image_dpi,
+                    is_image=is_image,
+                )
+
+            # NOTE(christine): merged_document_layout = extracted_layout + inferred_layout
+            merged_document_layout = merge_inferred_with_extracted_layout(
+                inferred_document_layout=inferred_document_layout,
+                extracted_layout=extracted_layout,
+            )
+
             final_document_layout = process_file_with_ocr(
                 filename,
                 merged_document_layout,
@@ -355,23 +356,27 @@ def _partition_pdf_or_image_local(
             model_name=hi_res_model_name,
             pdf_image_dpi=pdf_image_dpi,
         )
-        if hasattr(file, "seek"):
-            file.seek(0)
-
-        extracted_layout = (
-            process_data_with_pdfminer(file=file, dpi=pdf_image_dpi) if pdf_text_extractable else []
-        )
-
-        # NOTE(christine): merged_document_layout = extracted_layout + inferred_layout
-        merged_document_layout = merge_inferred_with_extracted_layout(
-            inferred_document_layout=inferred_document_layout,
-            extracted_layout=extracted_layout,
-        )
 
         if hi_res_model_name.startswith("chipper"):
             # NOTE(alan): We shouldn't do OCR with chipper
+            # NOTE(antonio): We shouldn't do PDFMiner with chipper
-            final_document_layout = merged_document_layout
+            final_document_layout = inferred_document_layout
         else:
+            if hasattr(file, "seek"):
+                file.seek(0)
+
+            extracted_layout = (
+                process_data_with_pdfminer(file=file, dpi=pdf_image_dpi)
+                if pdf_text_extractable
+                else []
+            )
+
+            # NOTE(christine): merged_document_layout = extracted_layout + inferred_layout
+            merged_document_layout = merge_inferred_with_extracted_layout(
+                inferred_document_layout=inferred_document_layout,
+                extracted_layout=extracted_layout,
+            )
+
             if hasattr(file, "seek"):
                 file.seek(0)
             final_document_layout = process_data_with_ocr(
@@ -385,7 +390,7 @@ def _partition_pdf_or_image_local(
             )
 
     # NOTE(alan): starting with v2, chipper sorts the elements itself.
-    if hi_res_model_name == "chipper":
+    if hi_res_model_name.startswith("chipper") and hi_res_model_name != "chipperv1":
         kwargs["sort_mode"] = SORT_MODE_DONT
 
     final_document_layout = clean_pdfminer_inner_elements(final_document_layout)