From 9c66eab8a97417a6520abb37a72f91ab4222141c Mon Sep 17 00:00:00 2001 From: Christine Straub Date: Thu, 16 Nov 2023 21:42:36 -0800 Subject: [PATCH] Fix: handle pdf text extraction errors (#2101) Closes #2084. ### Summary Certain pdfs throw unexpected errors when being opened by `pdfminer`, causing `partition_pdf()` to fail. We expect to be able to partition smoothly using an alternative strategy if text extraction doesn't work. Added exception handling to handle unexpected errors when extracting pdf text and to help determine pdf strategy. ### Testing PDF: [NASA-SNA-8-D-027III-Rev2-CsmLmSpacecraftOperationalDataBook-Volume3-MassProperties-pg856.pdf](https://github.com/Unstructured-IO/unstructured/files/13383215/NASA-SNA-8-D-027III-Rev2-CsmLmSpacecraftOperationalDataBook-Volume3-MassProperties-pg856.pdf) ``` elements = partition_pdf( filename="NASA-SNA-8-D-027III-Rev2-CsmLmSpacecraftOperationalDataBook-Volume3-MassProperties-pg856.pdf", ) ``` --- CHANGELOG.md | 3 ++- unstructured/partition/pdf.py | 29 ++++++++++++++++------------- 2 files changed, 18 insertions(+), 14 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index feb7cf518..7548708e4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,7 +13,8 @@ ### Fixes -* **Fix `fast` strategy fall back to `ocr_only`.** The `fast` strategy should not fall back to a more expensive strategy. +* **Handle errors when extracting PDF text** Certain pdfs throw unexpected errors when being opened by `pdfminer`, causing `partition_pdf()` to fail. We expect to be able to partition smoothly using an alternative strategy if text extraction doesn't work. Added exception handling to handle unexpected errors when extracting pdf text and to help determine pdf strategy. +* **Fix `fast` strategy fall back to `ocr_only`** The `fast` strategy should not fall back to a more expensive strategy. * **Remove default user ./ssh folder** The default notebook user during image build would create the known_hosts file with incorrect ownership, this is legacy and no longer needed so it was removed. * **Include `languages` in metadata when partitioning strategy='hi_res' or 'fast'** User defined `languages` was previously used for text detection, but not included in the resulting element metadata for some strategies. `languages` will now be included in the metadata regardless of partition strategy for pdfs and images. * **Handle a case where Paddle returns a list item in ocr_data as None** In partition, while parsing PaddleOCR data, it was assumed that PaddleOCR does not return None for any list item in ocr_data. Removed the assumption by skipping the text region whenever this happens. diff --git a/unstructured/partition/pdf.py b/unstructured/partition/pdf.py index a9e3ea4ac..8e9c0e151 100644 --- a/unstructured/partition/pdf.py +++ b/unstructured/partition/pdf.py @@ -245,20 +245,23 @@ def partition_pdf_or_image( ) extracted_elements = [] + pdf_text_extractable = False if not is_image: - extracted_elements = extractable_elements( - filename=filename, - file=spooled_to_bytes_io_if_needed(file), - include_page_breaks=include_page_breaks, - languages=languages, - metadata_last_modified=metadata_last_modified or last_modification_date, - **kwargs, - ) - pdf_text_extractable = any( - isinstance(el, Text) and el.text.strip() for el in extracted_elements - ) - else: - pdf_text_extractable = False + try: + extracted_elements = extractable_elements( + filename=filename, + file=spooled_to_bytes_io_if_needed(file), + include_page_breaks=include_page_breaks, + languages=languages, + metadata_last_modified=metadata_last_modified or last_modification_date, + **kwargs, + ) + pdf_text_extractable = any( + isinstance(el, Text) and el.text.strip() for el in extracted_elements + ) + except Exception as e: + logger.error(e, exc_info=True) + logger.warning("PDF text extraction failed, skip text extraction...") strategy = determine_pdf_or_image_strategy( strategy,