Add password support for PDF files (#3721)

Add password support for PDF files. Must be combined with [PR 392 in unstructured-inference](https://github.com/Unstructured-IO/unstructured-inference/pull/392). --------- Co-authored-by: John J <43506685+Coniferish@users.noreply.github.com>
2025-07-24 17:41:15 +00:00 · 2025-02-11 18:39:16 +01:00 · 2025-02-11 18:39:16 +01:00 · b521bce9c6
commit b521bce9c6
parent 92be4eb2dd
12 changed files with 96 additions and 15 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -1,6 +1,7 @@
-## 0.16.21-dev3
+## 0.16.21-dev4
 ### Enhancements
 - **Use password** to load password-protected PDF files in all modes
 - **use vectorized logic to merge inferred and extracted layouts**. Using the new `LayoutElements` data structure and numpy library to refactor the layout merging logic to improve compute performance as well as making logic more clear
--- a/example-docs/pdf/password.pdf
+++ b/example-docs/pdf/password.pdf
--- a/requirements/extra-pdf-image.in
+++ b/requirements/extra-pdf-image.in
@ -11,5 +11,5 @@ google-cloud-vision
 effdet
 # Do not move to constraints.in, otherwise unstructured-inference will not be upgraded
 # when unstructured library is.
-unstructured-inference>=0.8.6
+unstructured-inference>=0.8.7
 unstructured.pytesseract>=0.3.12
--- a/requirements/extra-pdf-image.txt
+++ b/requirements/extra-pdf-image.txt
@ -263,7 +263,7 @@ typing-extensions==4.12.2
    #   torch
 tzdata==2025.1
    # via pandas
-unstructured-inference==0.8.6
+unstructured-inference==0.8.7
    # via -r ./extra-pdf-image.in
 unstructured-pytesseract==0.3.13
    # via -r ./extra-pdf-image.in
--- a/test_unstructured/partition/pdf_image/test_pdf.py
+++ b/test_unstructured/partition/pdf_image/test_pdf.py
@ -1545,3 +1545,43 @@ def test_document_to_element_list_sets_category_depth_titles():
    assert elements[1].metadata.category_depth == 2
    assert elements[2].metadata.category_depth is None
    assert elements[3].metadata.category_depth == 0
@pytest.mark.parametrize("file_mode", ["filename", "rb", "spool"])
@pytest.mark.parametrize(
    "strategy",
    # strategies exercised against the password-protected document
    [
        PartitionStrategy.FAST,
        PartitionStrategy.HI_RES,
        PartitionStrategy.OCR_ONLY,
    ],
)
def test_partition_pdf_with_password(
    file_mode,
    strategy,
    filename=example_doc_path("pdf/password.pdf"),
):
    """Partition a password-protected PDF supplied by path, file handle, or spooled temp file.

    For each input mode and strategy, the outcome must be exactly one element whose text is
    "File with password".
    """
    partition_kwargs = {"strategy": strategy, "password": "password"}

    if file_mode == "filename":
        elements = pdf.partition_pdf(filename=filename, **partition_kwargs)
    elif file_mode == "rb":
        with open(filename, "rb") as pdf_file:
            elements = pdf.partition_pdf(file=pdf_file, **partition_kwargs)
    else:
        # "spool": copy the document into a SpooledTemporaryFile and rewind it before partitioning
        with open(filename, "rb") as source, SpooledTemporaryFile() as spooled:
            spooled.write(source.read())
            spooled.seek(0)
            elements = pdf.partition_pdf(file=spooled, **partition_kwargs)

    # the sample document yields a single element carrying its only line of text
    assert len(elements) == 1
    assert elements[0].text == "File with password"
--- a/unstructured/version.py
+++ b/unstructured/version.py
@ -1 +1 @@
-__version__ = "0.16.21-dev3"  # pragma: no cover
+__version__ = "0.16.21-dev4"  # pragma: no cover
--- a/unstructured/partition/image.py
+++ b/unstructured/partition/image.py
@ -32,6 +32,7 @@ def partition_image(
    starting_page_number: int = 1,
    extract_forms: bool = False,
    form_extraction_skip_tables: bool = True,
    password: Optional[str] = None,
    **kwargs: Any,
 ) -> list[Element]:
    """Parses an image into a list of interpreted elements.
@ -91,6 +92,8 @@ def partition_image(
        (results in adding FormKeysValues elements to output).
    form_extraction_skip_tables
        Whether the form extraction logic should ignore regions designated as Tables.
    password
        The password to decrypt the PDF file.
    """
    exactly_one(filename=filename, file=file)
@ -113,5 +116,6 @@ def partition_image(
        starting_page_number=starting_page_number,
        extract_forms=extract_forms,
        form_extraction_skip_tables=form_extraction_skip_tables,
        password=password,
        **kwargs,
    )
--- a/unstructured/partition/pdf.py
+++ b/unstructured/partition/pdf.py
@ -144,6 +144,7 @@ def partition_pdf(
    starting_page_number: int = 1,
    extract_forms: bool = False,
    form_extraction_skip_tables: bool = True,
    password: Optional[str] = None,
    **kwargs: Any,
 ) -> list[Element]:
    """Parses a pdf document into a list of interpreted elements.
@ -224,6 +225,7 @@ def partition_pdf(
        starting_page_number=starting_page_number,
        extract_forms=extract_forms,
        form_extraction_skip_tables=form_extraction_skip_tables,
        password=password,
        **kwargs,
    )
@ -245,6 +247,7 @@ def partition_pdf_or_image(
    starting_page_number: int = 1,
    extract_forms: bool = False,
    form_extraction_skip_tables: bool = True,
    password: Optional[str] = None,
    **kwargs: Any,
 ) -> list[Element]:
    """Parses a pdf or image document into a list of interpreted elements."""
@ -273,6 +276,7 @@ def partition_pdf_or_image(
                languages=languages,
                metadata_last_modified=metadata_last_modified or last_modified,
                starting_page_number=starting_page_number,
                password=password,
                **kwargs,
            )
            pdf_text_extractable = any(
@ -322,6 +326,7 @@ def partition_pdf_or_image(
                starting_page_number=starting_page_number,
                extract_forms=extract_forms,
                form_extraction_skip_tables=form_extraction_skip_tables,
                password=password,
                **kwargs,
            )
            out_elements = _process_uncategorized_text_elements(elements)
@ -347,6 +352,7 @@ def partition_pdf_or_image(
                is_image=is_image,
                metadata_last_modified=metadata_last_modified or last_modified,
                starting_page_number=starting_page_number,
                password=password,
                **kwargs,
            )
            out_elements = _process_uncategorized_text_elements(elements)
@ -360,6 +366,7 @@ def extractable_elements(
    languages: Optional[list[str]] = None,
    metadata_last_modified: Optional[str] = None,
    starting_page_number: int = 1,
    password: Optional[str] = None,
    **kwargs: Any,
 ) -> list[list[Element]]:
    if isinstance(file, bytes):
@ -370,6 +377,7 @@ def extractable_elements(
        languages=languages,
        metadata_last_modified=metadata_last_modified,
        starting_page_number=starting_page_number,
        password=password,
        **kwargs,
    )
@ -380,6 +388,7 @@ def _partition_pdf_with_pdfminer(
    languages: list[str],
    metadata_last_modified: Optional[str],
    starting_page_number: int = 1,
    password: Optional[str] = None,
    **kwargs: Any,
 ) -> list[list[Element]]:
    """Partitions a PDF using PDFMiner instead of using a layoutmodel. Used for faster
@ -403,6 +412,7 @@ def _partition_pdf_with_pdfminer(
                languages=languages,
                metadata_last_modified=metadata_last_modified,
                starting_page_number=starting_page_number,
                password=password,
                **kwargs,
            )
@ -413,6 +423,7 @@ def _partition_pdf_with_pdfminer(
            languages=languages,
            metadata_last_modified=metadata_last_modified,
            starting_page_number=starting_page_number,
            password=password,
            **kwargs,
        )
@ -427,6 +438,7 @@ def _process_pdfminer_pages(
    metadata_last_modified: Optional[str],
    annotation_threshold: Optional[float] = env_config.PDF_ANNOTATION_THRESHOLD,
    starting_page_number: int = 1,
    password: Optional[str] = None,
    **kwargs,
 ) -> list[list[Element]]:
    """Uses PDFMiner to split a document into pages and process them."""
@ -434,7 +446,8 @@ def _process_pdfminer_pages(
    elements = []
    for page_number, (page, page_layout) in enumerate(
-        open_pdfminer_pages_generator(fp), start=starting_page_number
+        open_pdfminer_pages_generator(fp, password=password),
        start=starting_page_number,
    ):
        width, height = page_layout.width, page_layout.height
@ -556,6 +569,7 @@ def _partition_pdf_or_image_local(
    extract_forms: bool = False,
    form_extraction_skip_tables: bool = True,
    pdf_hi_res_max_pages: Optional[int] = None,
    password: Optional[str] = None,
    **kwargs: Any,
 ) -> list[Element]:
    """Partition using package installed locally"""
@ -592,10 +606,11 @@ def _partition_pdf_or_image_local(
            is_image=is_image,
            model_name=hi_res_model_name,
            pdf_image_dpi=pdf_image_dpi,
            password=password,
        )
        extracted_layout, layouts_links = (
-            process_file_with_pdfminer(filename=filename, dpi=pdf_image_dpi)
+            process_file_with_pdfminer(filename=filename, dpi=pdf_image_dpi, password=password)
            if pdf_text_extractable
            else ([], [])
        )
@ -635,6 +650,7 @@ def _partition_pdf_or_image_local(
            ocr_mode=ocr_mode,
            pdf_image_dpi=pdf_image_dpi,
            ocr_layout_dumper=ocr_layout_dumper,
            password=password,
        )
    else:
        inferred_document_layout = process_data_with_model(
@ -642,13 +658,14 @@ def _partition_pdf_or_image_local(
            is_image=is_image,
            model_name=hi_res_model_name,
            pdf_image_dpi=pdf_image_dpi,
            password=password,
        )
        if hasattr(file, "seek"):
            file.seek(0)
        extracted_layout, layouts_links = (
-            process_data_with_pdfminer(file=file, dpi=pdf_image_dpi)
+            process_data_with_pdfminer(file=file, dpi=pdf_image_dpi, password=password)
            if pdf_text_extractable
            else ([], [])
        )
@ -690,6 +707,7 @@ def _partition_pdf_or_image_local(
            ocr_mode=ocr_mode,
            pdf_image_dpi=pdf_image_dpi,
            ocr_layout_dumper=ocr_layout_dumper,
            password=password,
        )
    # vectorization of the data structure ends here
@ -837,6 +855,7 @@ def _partition_pdf_or_image_with_ocr(
    is_image: bool = False,
    metadata_last_modified: Optional[str] = None,
    starting_page_number: int = 1,
    password: Optional[str] = None,
    **kwargs: Any,
 ):
    """Partitions an image or PDF using OCR. For PDFs, each page is converted
@ -861,7 +880,7 @@ def _partition_pdf_or_image_with_ocr(
            elements.extend(page_elements)
    else:
        for page_number, image in enumerate(
-            convert_pdf_to_images(filename, file), start=starting_page_number
+            convert_pdf_to_images(filename, file, password=password), start=starting_page_number
        ):
            page_elements = _partition_pdf_or_image_with_ocr_from_image(
                image=image,
--- a/unstructured/partition/pdf_image/ocr.py
+++ b/unstructured/partition/pdf_image/ocr.py
@ -42,6 +42,7 @@ def process_data_with_ocr(
    ocr_mode: str = OCRMode.FULL_PAGE.value,
    pdf_image_dpi: int = 200,
    ocr_layout_dumper: Optional[OCRLayoutDumper] = None,
    password: Optional[str] = None,
 ) -> "DocumentLayout":
    """
    Process OCR data from a given data and supplement the output DocumentLayout
@ -89,6 +90,7 @@ def process_data_with_ocr(
            ocr_mode=ocr_mode,
            pdf_image_dpi=pdf_image_dpi,
            ocr_layout_dumper=ocr_layout_dumper,
            password=password,
        )
    return merged_layouts
@ -105,6 +107,7 @@ def process_file_with_ocr(
    ocr_mode: str = OCRMode.FULL_PAGE.value,
    pdf_image_dpi: int = 200,
    ocr_layout_dumper: Optional[OCRLayoutDumper] = None,
    password: Optional[str] = None,
 ) -> "DocumentLayout":
    """
    Process OCR data from a given file and supplement the output DocumentLayout
@ -165,6 +168,7 @@ def process_file_with_ocr(
                    dpi=pdf_image_dpi,
                    output_folder=temp_dir,
                    paths_only=True,
                    userpw=password or "",
                )
                image_paths = cast(List[str], _image_paths)
                for i, image_path in enumerate(image_paths):
--- a/unstructured/partition/pdf_image/pdf_image_utils.py
+++ b/unstructured/partition/pdf_image/pdf_image_utils.py
@ -58,6 +58,7 @@ def convert_pdf_to_image(
    dpi: int = 200,
    output_folder: Optional[Union[str, PurePath]] = None,
    path_only: bool = False,
    password: Optional[str] = None,
 ) -> Union[List[Image.Image], List[str]]:
    """Get the image renderings of the pdf pages using pdf2image"""
@ -71,6 +72,7 @@ def convert_pdf_to_image(
            dpi=dpi,
            output_folder=output_folder,
            paths_only=path_only,
            userpw=password,
        )
    else:
        images = pdf2image.convert_from_path(
@ -125,6 +127,7 @@ def save_elements(
    is_image: bool = False,
    extract_image_block_to_payload: bool = False,
    output_dir_path: str | None = None,
    password: Optional[str] = None,
 ):
    """
    Saves specific elements from a PDF as images either to a directory or embeds them in the
@ -167,6 +170,7 @@ def save_elements(
                pdf_image_dpi,
                output_folder=temp_dir,
                path_only=True,
                password=password,
            )
            image_paths = cast(List[str], _image_paths)
@ -389,15 +393,16 @@ def convert_pdf_to_images(
    filename: str = "",
    file: Optional[bytes | IO[bytes]] = None,
    chunk_size: int = 10,
    password: Optional[str] = None,
 ) -> Iterator[Image.Image]:
    # Convert a PDF in small chunks of pages at a time (e.g. 1-10, 11-20... and so on)
    exactly_one(filename=filename, file=file)
    if file is not None:
        f_bytes = convert_to_bytes(file)
-        info = pdf2image.pdfinfo_from_bytes(f_bytes)
+        info = pdf2image.pdfinfo_from_bytes(f_bytes, userpw=password)
    else:
        f_bytes = None
-        info = pdf2image.pdfinfo_from_path(filename)
+        info = pdf2image.pdfinfo_from_path(filename, userpw=password)
    total_pages = info["Pages"]
    for start_page in range(1, total_pages + 1, chunk_size):
@ -407,12 +412,14 @@ def convert_pdf_to_images(
                f_bytes,
                first_page=start_page,
                last_page=end_page,
                userpw=password,
            )
        else:
            chunk_images = pdf2image.convert_from_path(
                filename,
                first_page=start_page,
                last_page=end_page,
                userpw=password,
            )
        for image in chunk_images:
--- a/unstructured/partition/pdf_image/pdfminer_processing.py
+++ b/unstructured/partition/pdf_image/pdfminer_processing.py
@ -38,12 +38,14 @@ DEFAULT_ROUND = 15
 def process_file_with_pdfminer(
    filename: str = "",
    dpi: int = 200,
    password: Optional[str] = None,
 ) -> tuple[List[List["TextRegion"]], List[List]]:
    with open_filename(filename, "rb") as fp:
        fp = cast(BinaryIO, fp)
        extracted_layout, layouts_links = process_data_with_pdfminer(
            file=fp,
            dpi=dpi,
            password=password,
        )
        return extracted_layout, layouts_links
@ -432,6 +434,7 @@ def process_page_layout_from_pdfminer(
 def process_data_with_pdfminer(
    file: Optional[Union[bytes, BinaryIO]] = None,
    dpi: int = 200,
    password: Optional[str] = None,
 ) -> tuple[List[LayoutElements], List[List]]:
    """Loads the image and word objects from a pdf using pdfplumber and the image renderings of the
    pdf pages using pdf2image"""
@ -442,7 +445,9 @@ def process_data_with_pdfminer(
    layouts_links = []
    # Coefficient to rescale bounding box to be compatible with images
    coef = dpi / 72
-    for page_number, (page, page_layout) in enumerate(open_pdfminer_pages_generator(file)):
+    for page_number, (page, page_layout) in enumerate(
        open_pdfminer_pages_generator(file, password=password)
    ):
        width, height = page_layout.width, page_layout.height
        annotation_list = []
--- a/unstructured/partition/pdf_image/pdfminer_utils.py
+++ b/unstructured/partition/pdf_image/pdfminer_utils.py
@ -1,6 +1,6 @@
 import os
 import tempfile
-from typing import BinaryIO, List, Tuple
+from typing import BinaryIO, List, Optional, Tuple
 from pdfminer.converter import PDFPageAggregator
 from pdfminer.layout import LAParams, LTContainer, LTImage, LTItem, LTTextLine
@ -73,6 +73,7 @@ def rect_to_bbox(
@requires_dependencies(["pikepdf", "pypdf"])
 def open_pdfminer_pages_generator(
    fp: BinaryIO,
    password: Optional[str] = None,
 ):
    """Open PDF pages using PDFMiner, handling and repairing invalid dictionary constructs."""
@ -84,7 +85,7 @@ def open_pdfminer_pages_generator(
    with tempfile.TemporaryDirectory() as tmp_dir_path:
        tmp_file_path = os.path.join(tmp_dir_path, "tmp_file")
        try:
-            pages = PDFPage.get_pages(fp)
+            pages = PDFPage.get_pages(fp, password=password or "")
            # Detect invalid dictionary construct for entire PDF
            for i, page in enumerate(pages):
                try:
`@ -1 +1 @@`
	`__version__ = "0.16.21-dev3" # pragma: no cover`	`__version__ = "0.16.21-dev4" # pragma: no cover`