diff --git a/CHANGELOG.md b/CHANGELOG.md index 6a26cd450..554ef679d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,7 @@ -## 0.16.21-dev3 +## 0.16.21-dev4 ### Enhancements +- **Use password** to load password-protected PDFs. `partition_pdf` accepts a new `password` argument that works with all strategies (`fast`, `hi_res`, `ocr_only`) and with both filename and file-object input - **use vectorized logic to merge inferred and extracted layouts**. Using the new `LayoutElements` data structure and numpy library to refactor the layout merging logic to improve compute performance as well as making logic more clear diff --git a/example-docs/pdf/password.pdf b/example-docs/pdf/password.pdf new file mode 100644 index 000000000..40f63f2af Binary files /dev/null and b/example-docs/pdf/password.pdf differ diff --git a/requirements/extra-pdf-image.in b/requirements/extra-pdf-image.in index 99df48105..816388cbe 100644 --- a/requirements/extra-pdf-image.in +++ b/requirements/extra-pdf-image.in @@ -11,5 +11,5 @@ google-cloud-vision effdet # Do not move to constraints.in, otherwise unstructured-inference will not be upgraded # when unstructured library is. 
-unstructured-inference>=0.8.6 +unstructured-inference>=0.8.7 unstructured.pytesseract>=0.3.12 diff --git a/requirements/extra-pdf-image.txt b/requirements/extra-pdf-image.txt index 910c2e279..f30252303 100644 --- a/requirements/extra-pdf-image.txt +++ b/requirements/extra-pdf-image.txt @@ -263,7 +263,7 @@ typing-extensions==4.12.2 # torch tzdata==2025.1 # via pandas -unstructured-inference==0.8.6 +unstructured-inference==0.8.7 # via -r ./extra-pdf-image.in unstructured-pytesseract==0.3.13 # via -r ./extra-pdf-image.in diff --git a/test_unstructured/partition/pdf_image/test_pdf.py b/test_unstructured/partition/pdf_image/test_pdf.py index 0746eab82..8ece56304 100644 --- a/test_unstructured/partition/pdf_image/test_pdf.py +++ b/test_unstructured/partition/pdf_image/test_pdf.py @@ -262,7 +262,7 @@ def test_partition_pdf_outputs_valid_amount_of_elements_and_metadata_values( strategy=strategy, starting_page_number=starting_page_number, ) - _test(result) + _test(result) @mock.patch.dict(os.environ, {"UNSTRUCTURED_HI_RES_MODEL_NAME": "checkbox"}) @@ -1545,3 +1545,43 @@ def test_document_to_element_list_sets_category_depth_titles(): assert elements[1].metadata.category_depth == 2 assert elements[2].metadata.category_depth is None assert elements[3].metadata.category_depth == 0 + + +@pytest.mark.parametrize("file_mode", ["filename", "rb", "spool"]) +@pytest.mark.parametrize( + "strategy", + # every strategy is expected to decrypt the file and yield the same single element, + # so no per-strategy expectations are needed + [ + PartitionStrategy.FAST, + PartitionStrategy.HI_RES, + PartitionStrategy.OCR_ONLY, + ], +) +def test_partition_pdf_with_password( + file_mode, + strategy, + filename=example_doc_path("pdf/password.pdf"), +): + # partition_pdf must decrypt the PDF given as a filename, file object, or spooled temp file + def _test(result): + # the protected document yields exactly one element carrying the known text + assert len(result) == 1 + assert result[0].text == "File with password" + + if file_mode == "filename": + result = 
pdf.partition_pdf(filename=filename, strategy=strategy, password="password") + _test(result) + elif file_mode == "rb": + with open(filename, "rb") as f: + result = pdf.partition_pdf(file=f, strategy=strategy, password="password") + _test(result) + else: + with open(filename, "rb") as test_file: + with SpooledTemporaryFile() as spooled_temp_file: + spooled_temp_file.write(test_file.read()) + spooled_temp_file.seek(0) + result = pdf.partition_pdf( + file=spooled_temp_file, strategy=strategy, password="password" + ) + _test(result) diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 15608835a..464c14198 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.16.21-dev3" # pragma: no cover +__version__ = "0.16.21-dev4" # pragma: no cover diff --git a/unstructured/partition/image.py b/unstructured/partition/image.py index 50ceaa118..712384e0d 100644 --- a/unstructured/partition/image.py +++ b/unstructured/partition/image.py @@ -32,6 +32,7 @@ def partition_image( starting_page_number: int = 1, extract_forms: bool = False, form_extraction_skip_tables: bool = True, + password: Optional[str] = None, **kwargs: Any, ) -> list[Element]: """Parses an image into a list of interpreted elements. @@ -91,6 +92,8 @@ def partition_image( (results in adding FormKeysValues elements to output). form_extraction_skip_tables Whether the form extraction logic should ignore regions designated as Tables. + password + The password to decrypt the PDF file. 
""" exactly_one(filename=filename, file=file) @@ -113,5 +116,6 @@ def partition_image( starting_page_number=starting_page_number, extract_forms=extract_forms, form_extraction_skip_tables=form_extraction_skip_tables, + password=password, **kwargs, ) diff --git a/unstructured/partition/pdf.py b/unstructured/partition/pdf.py index 55d3f3c03..9a2efcd65 100644 --- a/unstructured/partition/pdf.py +++ b/unstructured/partition/pdf.py @@ -144,6 +144,7 @@ def partition_pdf( starting_page_number: int = 1, extract_forms: bool = False, form_extraction_skip_tables: bool = True, + password: Optional[str] = None, **kwargs: Any, ) -> list[Element]: """Parses a pdf document into a list of interpreted elements. @@ -224,6 +225,7 @@ def partition_pdf( starting_page_number=starting_page_number, extract_forms=extract_forms, form_extraction_skip_tables=form_extraction_skip_tables, + password=password, **kwargs, ) @@ -245,6 +247,7 @@ def partition_pdf_or_image( starting_page_number: int = 1, extract_forms: bool = False, form_extraction_skip_tables: bool = True, + password: Optional[str] = None, **kwargs: Any, ) -> list[Element]: """Parses a pdf or image document into a list of interpreted elements.""" @@ -273,6 +276,7 @@ def partition_pdf_or_image( languages=languages, metadata_last_modified=metadata_last_modified or last_modified, starting_page_number=starting_page_number, + password=password, **kwargs, ) pdf_text_extractable = any( @@ -322,6 +326,7 @@ def partition_pdf_or_image( starting_page_number=starting_page_number, extract_forms=extract_forms, form_extraction_skip_tables=form_extraction_skip_tables, + password=password, **kwargs, ) out_elements = _process_uncategorized_text_elements(elements) @@ -347,6 +352,7 @@ def partition_pdf_or_image( is_image=is_image, metadata_last_modified=metadata_last_modified or last_modified, starting_page_number=starting_page_number, + password=password, **kwargs, ) out_elements = _process_uncategorized_text_elements(elements) @@ -360,6 +366,7 @@ def 
extractable_elements( languages: Optional[list[str]] = None, metadata_last_modified: Optional[str] = None, starting_page_number: int = 1, + password: Optional[str] = None, **kwargs: Any, ) -> list[list[Element]]: if isinstance(file, bytes): @@ -370,6 +377,7 @@ def extractable_elements( languages=languages, metadata_last_modified=metadata_last_modified, starting_page_number=starting_page_number, + password=password, **kwargs, ) @@ -380,6 +388,7 @@ def _partition_pdf_with_pdfminer( languages: list[str], metadata_last_modified: Optional[str], starting_page_number: int = 1, + password: Optional[str] = None, **kwargs: Any, ) -> list[list[Element]]: """Partitions a PDF using PDFMiner instead of using a layoutmodel. Used for faster @@ -403,6 +412,7 @@ def _partition_pdf_with_pdfminer( languages=languages, metadata_last_modified=metadata_last_modified, starting_page_number=starting_page_number, + password=password, **kwargs, ) @@ -413,6 +423,7 @@ def _partition_pdf_with_pdfminer( languages=languages, metadata_last_modified=metadata_last_modified, starting_page_number=starting_page_number, + password=password, **kwargs, ) @@ -427,6 +438,7 @@ def _process_pdfminer_pages( metadata_last_modified: Optional[str], annotation_threshold: Optional[float] = env_config.PDF_ANNOTATION_THRESHOLD, starting_page_number: int = 1, + password: Optional[str] = None, **kwargs, ) -> list[list[Element]]: """Uses PDFMiner to split a document into pages and process them.""" @@ -434,7 +446,8 @@ def _process_pdfminer_pages( elements = [] for page_number, (page, page_layout) in enumerate( - open_pdfminer_pages_generator(fp), start=starting_page_number + open_pdfminer_pages_generator(fp, password=password), + start=starting_page_number, ): width, height = page_layout.width, page_layout.height @@ -556,6 +569,7 @@ def _partition_pdf_or_image_local( extract_forms: bool = False, form_extraction_skip_tables: bool = True, pdf_hi_res_max_pages: Optional[int] = None, + password: Optional[str] = None, 
**kwargs: Any, ) -> list[Element]: """Partition using package installed locally""" @@ -592,10 +606,11 @@ def _partition_pdf_or_image_local( is_image=is_image, model_name=hi_res_model_name, pdf_image_dpi=pdf_image_dpi, + password=password, ) extracted_layout, layouts_links = ( - process_file_with_pdfminer(filename=filename, dpi=pdf_image_dpi) + process_file_with_pdfminer(filename=filename, dpi=pdf_image_dpi, password=password) if pdf_text_extractable else ([], []) ) @@ -635,6 +650,7 @@ def _partition_pdf_or_image_local( ocr_mode=ocr_mode, pdf_image_dpi=pdf_image_dpi, ocr_layout_dumper=ocr_layout_dumper, + password=password, ) else: inferred_document_layout = process_data_with_model( @@ -642,13 +658,14 @@ def _partition_pdf_or_image_local( is_image=is_image, model_name=hi_res_model_name, pdf_image_dpi=pdf_image_dpi, + password=password, ) if hasattr(file, "seek"): file.seek(0) extracted_layout, layouts_links = ( - process_data_with_pdfminer(file=file, dpi=pdf_image_dpi) + process_data_with_pdfminer(file=file, dpi=pdf_image_dpi, password=password) if pdf_text_extractable else ([], []) ) @@ -690,6 +707,7 @@ def _partition_pdf_or_image_local( ocr_mode=ocr_mode, pdf_image_dpi=pdf_image_dpi, ocr_layout_dumper=ocr_layout_dumper, + password=password, ) # vectorization of the data structure ends here @@ -837,6 +855,7 @@ def _partition_pdf_or_image_with_ocr( is_image: bool = False, metadata_last_modified: Optional[str] = None, starting_page_number: int = 1, + password: Optional[str] = None, **kwargs: Any, ): """Partitions an image or PDF using OCR. 
For PDFs, each page is converted @@ -861,7 +880,7 @@ def _partition_pdf_or_image_with_ocr( elements.extend(page_elements) else: for page_number, image in enumerate( - convert_pdf_to_images(filename, file), start=starting_page_number + convert_pdf_to_images(filename, file, password=password), start=starting_page_number ): page_elements = _partition_pdf_or_image_with_ocr_from_image( image=image, diff --git a/unstructured/partition/pdf_image/ocr.py b/unstructured/partition/pdf_image/ocr.py index 4642d5b59..953d36713 100644 --- a/unstructured/partition/pdf_image/ocr.py +++ b/unstructured/partition/pdf_image/ocr.py @@ -42,6 +42,7 @@ def process_data_with_ocr( ocr_mode: str = OCRMode.FULL_PAGE.value, pdf_image_dpi: int = 200, ocr_layout_dumper: Optional[OCRLayoutDumper] = None, + password: Optional[str] = None, ) -> "DocumentLayout": """ Process OCR data from a given data and supplement the output DocumentLayout @@ -89,6 +90,7 @@ def process_data_with_ocr( ocr_mode=ocr_mode, pdf_image_dpi=pdf_image_dpi, ocr_layout_dumper=ocr_layout_dumper, + password=password, ) return merged_layouts @@ -105,6 +107,7 @@ def process_file_with_ocr( ocr_mode: str = OCRMode.FULL_PAGE.value, pdf_image_dpi: int = 200, ocr_layout_dumper: Optional[OCRLayoutDumper] = None, + password: Optional[str] = None, ) -> "DocumentLayout": """ Process OCR data from a given file and supplement the output DocumentLayout @@ -165,6 +168,7 @@ def process_file_with_ocr( dpi=pdf_image_dpi, output_folder=temp_dir, paths_only=True, + userpw=password or "", ) image_paths = cast(List[str], _image_paths) for i, image_path in enumerate(image_paths): diff --git a/unstructured/partition/pdf_image/pdf_image_utils.py b/unstructured/partition/pdf_image/pdf_image_utils.py index a809c7f76..d57af9d53 100644 --- a/unstructured/partition/pdf_image/pdf_image_utils.py +++ b/unstructured/partition/pdf_image/pdf_image_utils.py @@ -58,6 +58,7 @@ def convert_pdf_to_image( dpi: int = 200, output_folder: Optional[Union[str, PurePath]] = 
None, path_only: bool = False, + password: Optional[str] = None, ) -> Union[List[Image.Image], List[str]]: """Get the image renderings of the pdf pages using pdf2image""" @@ -71,6 +72,7 @@ def convert_pdf_to_image( dpi=dpi, output_folder=output_folder, paths_only=path_only, + userpw=password, ) else: images = pdf2image.convert_from_path( @@ -125,6 +127,7 @@ def save_elements( is_image: bool = False, extract_image_block_to_payload: bool = False, output_dir_path: str | None = None, + password: Optional[str] = None, ): """ Saves specific elements from a PDF as images either to a directory or embeds them in the @@ -167,6 +170,7 @@ def save_elements( pdf_image_dpi, output_folder=temp_dir, path_only=True, + password=password, ) image_paths = cast(List[str], _image_paths) @@ -389,15 +393,16 @@ def convert_pdf_to_images( filename: str = "", file: Optional[bytes | IO[bytes]] = None, chunk_size: int = 10, + password: Optional[str] = None, ) -> Iterator[Image.Image]: # Convert a PDF in small chunks of pages at a time (e.g. 1-10, 11-20... 
and so on) exactly_one(filename=filename, file=file) if file is not None: f_bytes = convert_to_bytes(file) - info = pdf2image.pdfinfo_from_bytes(f_bytes) + info = pdf2image.pdfinfo_from_bytes(f_bytes, userpw=password) else: f_bytes = None - info = pdf2image.pdfinfo_from_path(filename) + info = pdf2image.pdfinfo_from_path(filename, userpw=password) total_pages = info["Pages"] for start_page in range(1, total_pages + 1, chunk_size): @@ -407,12 +412,14 @@ def convert_pdf_to_images( f_bytes, first_page=start_page, last_page=end_page, + userpw=password, ) else: chunk_images = pdf2image.convert_from_path( filename, first_page=start_page, last_page=end_page, + userpw=password, ) for image in chunk_images: diff --git a/unstructured/partition/pdf_image/pdfminer_processing.py b/unstructured/partition/pdf_image/pdfminer_processing.py index fbce8ec2a..724e34a81 100644 --- a/unstructured/partition/pdf_image/pdfminer_processing.py +++ b/unstructured/partition/pdf_image/pdfminer_processing.py @@ -38,12 +38,14 @@ DEFAULT_ROUND = 15 def process_file_with_pdfminer( filename: str = "", dpi: int = 200, + password: Optional[str] = None, ) -> tuple[List[List["TextRegion"]], List[List]]: with open_filename(filename, "rb") as fp: fp = cast(BinaryIO, fp) extracted_layout, layouts_links = process_data_with_pdfminer( file=fp, dpi=dpi, + password=password, ) return extracted_layout, layouts_links @@ -432,6 +434,7 @@ def process_page_layout_from_pdfminer( def process_data_with_pdfminer( file: Optional[Union[bytes, BinaryIO]] = None, dpi: int = 200, + password: Optional[str] = None, ) -> tuple[List[LayoutElements], List[List]]: """Loads the image and word objects from a pdf using pdfplumber and the image renderings of the pdf pages using pdf2image""" @@ -442,7 +445,9 @@ def process_data_with_pdfminer( layouts_links = [] # Coefficient to rescale bounding box to be compatible with images coef = dpi / 72 - for page_number, (page, page_layout) in enumerate(open_pdfminer_pages_generator(file)): + 
for page_number, (page, page_layout) in enumerate( + open_pdfminer_pages_generator(file, password=password) + ): width, height = page_layout.width, page_layout.height annotation_list = [] diff --git a/unstructured/partition/pdf_image/pdfminer_utils.py b/unstructured/partition/pdf_image/pdfminer_utils.py index 929affeaa..3544e2676 100644 --- a/unstructured/partition/pdf_image/pdfminer_utils.py +++ b/unstructured/partition/pdf_image/pdfminer_utils.py @@ -1,6 +1,6 @@ import os import tempfile -from typing import BinaryIO, List, Tuple +from typing import BinaryIO, List, Optional, Tuple from pdfminer.converter import PDFPageAggregator from pdfminer.layout import LAParams, LTContainer, LTImage, LTItem, LTTextLine @@ -73,6 +73,7 @@ def rect_to_bbox( @requires_dependencies(["pikepdf", "pypdf"]) def open_pdfminer_pages_generator( fp: BinaryIO, + password: Optional[str] = None, ): """Open PDF pages using PDFMiner, handling and repairing invalid dictionary constructs.""" @@ -84,7 +85,7 @@ def open_pdfminer_pages_generator( with tempfile.TemporaryDirectory() as tmp_dir_path: tmp_file_path = os.path.join(tmp_dir_path, "tmp_file") try: - pages = PDFPage.get_pages(fp) + pages = PDFPage.get_pages(fp, password=password or "") # Detect invalid dictionary construct for entire PDF for i, page in enumerate(pages): try: @@ -93,7 +94,7 @@ def open_pdfminer_pages_generator( page_layout = device.get_result() except PSSyntaxError: logger.info("Detected invalid dictionary construct for PDFminer") - logger.info(f"Repairing the PDF page {i+1} ...") + logger.info(f"Repairing the PDF page {i + 1} ...") # find the error page from binary data fp error_page_data = get_page_data(fp, page_number=i) # repair the error page with pikepdf