Add password support for PDF files (#3721)

Add support for opening password-protected PDF files.
Must be combined with [PR 392 in
unstructured-inference](https://github.com/Unstructured-IO/unstructured-inference/pull/392)

---------

Co-authored-by: John J <43506685+Coniferish@users.noreply.github.com>
This commit is contained in:
Philippe PRADOS 2025-02-11 18:39:16 +01:00 committed by GitHub
parent 92be4eb2dd
commit b521bce9c6
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
12 changed files with 96 additions and 15 deletions

View File

@ -1,6 +1,7 @@
## 0.16.21-dev3 ## 0.16.21-dev4
### Enhancements ### Enhancements
- **Use password** to load password-protected PDFs in all modes
- **use vectorized logic to merge inferred and extracted layouts**. Using the new `LayoutElements` data structure and numpy library to refactor the layout merging logic to improve compute performance as well as making logic more clear - **use vectorized logic to merge inferred and extracted layouts**. Using the new `LayoutElements` data structure and numpy library to refactor the layout merging logic to improve compute performance as well as making logic more clear

Binary file not shown.

View File

@ -11,5 +11,5 @@ google-cloud-vision
effdet effdet
# Do not move to constraints.in, otherwise unstructured-inference will not be upgraded # Do not move to constraints.in, otherwise unstructured-inference will not be upgraded
# when unstructured library is. # when unstructured library is.
unstructured-inference>=0.8.6 unstructured-inference>=0.8.7
unstructured.pytesseract>=0.3.12 unstructured.pytesseract>=0.3.12

View File

@ -263,7 +263,7 @@ typing-extensions==4.12.2
# torch # torch
tzdata==2025.1 tzdata==2025.1
# via pandas # via pandas
unstructured-inference==0.8.6 unstructured-inference==0.8.7
# via -r ./extra-pdf-image.in # via -r ./extra-pdf-image.in
unstructured-pytesseract==0.3.13 unstructured-pytesseract==0.3.13
# via -r ./extra-pdf-image.in # via -r ./extra-pdf-image.in

View File

@ -1545,3 +1545,43 @@ def test_document_to_element_list_sets_category_depth_titles():
assert elements[1].metadata.category_depth == 2 assert elements[1].metadata.category_depth == 2
assert elements[2].metadata.category_depth is None assert elements[2].metadata.category_depth is None
assert elements[3].metadata.category_depth == 0 assert elements[3].metadata.category_depth == 0
@pytest.mark.parametrize("file_mode", ["filename", "rb", "spool"])
@pytest.mark.parametrize(
    "strategy",
    [
        PartitionStrategy.FAST,
        PartitionStrategy.HI_RES,
        PartitionStrategy.OCR_ONLY,
    ],
)
def test_partition_pdf_with_password(
    file_mode,
    strategy,
    filename=example_doc_path("pdf/password.pdf"),
):
    """Partition the `pdf/password.pdf` fixture while supplying `password="password"`.

    The document is handed to `partition_pdf` in one of three ways, selected by
    `file_mode`:

    - "filename": by path,
    - "rb": as a file object opened in binary mode,
    - anything else ("spool"): as a `SpooledTemporaryFile` holding a copy of the bytes.

    For every strategy the result must be exactly one element whose text is
    "File with password".
    """
    partition_kwargs = {"strategy": strategy, "password": "password"}

    if file_mode == "filename":
        elements = pdf.partition_pdf(filename=filename, **partition_kwargs)
    elif file_mode == "rb":
        with open(filename, "rb") as pdf_file:
            elements = pdf.partition_pdf(file=pdf_file, **partition_kwargs)
    else:
        with open(filename, "rb") as pdf_file, SpooledTemporaryFile() as spooled:
            # Copy the document into the spooled file and rewind it before partitioning.
            spooled.write(pdf_file.read())
            spooled.seek(0)
            elements = pdf.partition_pdf(file=spooled, **partition_kwargs)

    # The fixture is expected to yield a single element with this exact text.
    assert len(elements) == 1
    assert elements[0].text == "File with password"

View File

@ -1 +1 @@
__version__ = "0.16.21-dev3" # pragma: no cover __version__ = "0.16.21-dev4" # pragma: no cover

View File

@ -32,6 +32,7 @@ def partition_image(
starting_page_number: int = 1, starting_page_number: int = 1,
extract_forms: bool = False, extract_forms: bool = False,
form_extraction_skip_tables: bool = True, form_extraction_skip_tables: bool = True,
password: Optional[str] = None,
**kwargs: Any, **kwargs: Any,
) -> list[Element]: ) -> list[Element]:
"""Parses an image into a list of interpreted elements. """Parses an image into a list of interpreted elements.
@ -91,6 +92,8 @@ def partition_image(
(results in adding FormKeysValues elements to output). (results in adding FormKeysValues elements to output).
form_extraction_skip_tables form_extraction_skip_tables
Whether the form extraction logic should ignore regions designated as Tables. Whether the form extraction logic should ignore regions designated as Tables.
password
The password to decrypt the PDF file.
""" """
exactly_one(filename=filename, file=file) exactly_one(filename=filename, file=file)
@ -113,5 +116,6 @@ def partition_image(
starting_page_number=starting_page_number, starting_page_number=starting_page_number,
extract_forms=extract_forms, extract_forms=extract_forms,
form_extraction_skip_tables=form_extraction_skip_tables, form_extraction_skip_tables=form_extraction_skip_tables,
password=password,
**kwargs, **kwargs,
) )

View File

@ -144,6 +144,7 @@ def partition_pdf(
starting_page_number: int = 1, starting_page_number: int = 1,
extract_forms: bool = False, extract_forms: bool = False,
form_extraction_skip_tables: bool = True, form_extraction_skip_tables: bool = True,
password: Optional[str] = None,
**kwargs: Any, **kwargs: Any,
) -> list[Element]: ) -> list[Element]:
"""Parses a pdf document into a list of interpreted elements. """Parses a pdf document into a list of interpreted elements.
@ -224,6 +225,7 @@ def partition_pdf(
starting_page_number=starting_page_number, starting_page_number=starting_page_number,
extract_forms=extract_forms, extract_forms=extract_forms,
form_extraction_skip_tables=form_extraction_skip_tables, form_extraction_skip_tables=form_extraction_skip_tables,
password=password,
**kwargs, **kwargs,
) )
@ -245,6 +247,7 @@ def partition_pdf_or_image(
starting_page_number: int = 1, starting_page_number: int = 1,
extract_forms: bool = False, extract_forms: bool = False,
form_extraction_skip_tables: bool = True, form_extraction_skip_tables: bool = True,
password: Optional[str] = None,
**kwargs: Any, **kwargs: Any,
) -> list[Element]: ) -> list[Element]:
"""Parses a pdf or image document into a list of interpreted elements.""" """Parses a pdf or image document into a list of interpreted elements."""
@ -273,6 +276,7 @@ def partition_pdf_or_image(
languages=languages, languages=languages,
metadata_last_modified=metadata_last_modified or last_modified, metadata_last_modified=metadata_last_modified or last_modified,
starting_page_number=starting_page_number, starting_page_number=starting_page_number,
password=password,
**kwargs, **kwargs,
) )
pdf_text_extractable = any( pdf_text_extractable = any(
@ -322,6 +326,7 @@ def partition_pdf_or_image(
starting_page_number=starting_page_number, starting_page_number=starting_page_number,
extract_forms=extract_forms, extract_forms=extract_forms,
form_extraction_skip_tables=form_extraction_skip_tables, form_extraction_skip_tables=form_extraction_skip_tables,
password=password,
**kwargs, **kwargs,
) )
out_elements = _process_uncategorized_text_elements(elements) out_elements = _process_uncategorized_text_elements(elements)
@ -347,6 +352,7 @@ def partition_pdf_or_image(
is_image=is_image, is_image=is_image,
metadata_last_modified=metadata_last_modified or last_modified, metadata_last_modified=metadata_last_modified or last_modified,
starting_page_number=starting_page_number, starting_page_number=starting_page_number,
password=password,
**kwargs, **kwargs,
) )
out_elements = _process_uncategorized_text_elements(elements) out_elements = _process_uncategorized_text_elements(elements)
@ -360,6 +366,7 @@ def extractable_elements(
languages: Optional[list[str]] = None, languages: Optional[list[str]] = None,
metadata_last_modified: Optional[str] = None, metadata_last_modified: Optional[str] = None,
starting_page_number: int = 1, starting_page_number: int = 1,
password: Optional[str] = None,
**kwargs: Any, **kwargs: Any,
) -> list[list[Element]]: ) -> list[list[Element]]:
if isinstance(file, bytes): if isinstance(file, bytes):
@ -370,6 +377,7 @@ def extractable_elements(
languages=languages, languages=languages,
metadata_last_modified=metadata_last_modified, metadata_last_modified=metadata_last_modified,
starting_page_number=starting_page_number, starting_page_number=starting_page_number,
password=password,
**kwargs, **kwargs,
) )
@ -380,6 +388,7 @@ def _partition_pdf_with_pdfminer(
languages: list[str], languages: list[str],
metadata_last_modified: Optional[str], metadata_last_modified: Optional[str],
starting_page_number: int = 1, starting_page_number: int = 1,
password: Optional[str] = None,
**kwargs: Any, **kwargs: Any,
) -> list[list[Element]]: ) -> list[list[Element]]:
"""Partitions a PDF using PDFMiner instead of using a layoutmodel. Used for faster """Partitions a PDF using PDFMiner instead of using a layoutmodel. Used for faster
@ -403,6 +412,7 @@ def _partition_pdf_with_pdfminer(
languages=languages, languages=languages,
metadata_last_modified=metadata_last_modified, metadata_last_modified=metadata_last_modified,
starting_page_number=starting_page_number, starting_page_number=starting_page_number,
password=password,
**kwargs, **kwargs,
) )
@ -413,6 +423,7 @@ def _partition_pdf_with_pdfminer(
languages=languages, languages=languages,
metadata_last_modified=metadata_last_modified, metadata_last_modified=metadata_last_modified,
starting_page_number=starting_page_number, starting_page_number=starting_page_number,
password=password,
**kwargs, **kwargs,
) )
@ -427,6 +438,7 @@ def _process_pdfminer_pages(
metadata_last_modified: Optional[str], metadata_last_modified: Optional[str],
annotation_threshold: Optional[float] = env_config.PDF_ANNOTATION_THRESHOLD, annotation_threshold: Optional[float] = env_config.PDF_ANNOTATION_THRESHOLD,
starting_page_number: int = 1, starting_page_number: int = 1,
password: Optional[str] = None,
**kwargs, **kwargs,
) -> list[list[Element]]: ) -> list[list[Element]]:
"""Uses PDFMiner to split a document into pages and process them.""" """Uses PDFMiner to split a document into pages and process them."""
@ -434,7 +446,8 @@ def _process_pdfminer_pages(
elements = [] elements = []
for page_number, (page, page_layout) in enumerate( for page_number, (page, page_layout) in enumerate(
open_pdfminer_pages_generator(fp), start=starting_page_number open_pdfminer_pages_generator(fp, password=password),
start=starting_page_number,
): ):
width, height = page_layout.width, page_layout.height width, height = page_layout.width, page_layout.height
@ -556,6 +569,7 @@ def _partition_pdf_or_image_local(
extract_forms: bool = False, extract_forms: bool = False,
form_extraction_skip_tables: bool = True, form_extraction_skip_tables: bool = True,
pdf_hi_res_max_pages: Optional[int] = None, pdf_hi_res_max_pages: Optional[int] = None,
password: Optional[str] = None,
**kwargs: Any, **kwargs: Any,
) -> list[Element]: ) -> list[Element]:
"""Partition using package installed locally""" """Partition using package installed locally"""
@ -592,10 +606,11 @@ def _partition_pdf_or_image_local(
is_image=is_image, is_image=is_image,
model_name=hi_res_model_name, model_name=hi_res_model_name,
pdf_image_dpi=pdf_image_dpi, pdf_image_dpi=pdf_image_dpi,
password=password,
) )
extracted_layout, layouts_links = ( extracted_layout, layouts_links = (
process_file_with_pdfminer(filename=filename, dpi=pdf_image_dpi) process_file_with_pdfminer(filename=filename, dpi=pdf_image_dpi, password=password)
if pdf_text_extractable if pdf_text_extractable
else ([], []) else ([], [])
) )
@ -635,6 +650,7 @@ def _partition_pdf_or_image_local(
ocr_mode=ocr_mode, ocr_mode=ocr_mode,
pdf_image_dpi=pdf_image_dpi, pdf_image_dpi=pdf_image_dpi,
ocr_layout_dumper=ocr_layout_dumper, ocr_layout_dumper=ocr_layout_dumper,
password=password,
) )
else: else:
inferred_document_layout = process_data_with_model( inferred_document_layout = process_data_with_model(
@ -642,13 +658,14 @@ def _partition_pdf_or_image_local(
is_image=is_image, is_image=is_image,
model_name=hi_res_model_name, model_name=hi_res_model_name,
pdf_image_dpi=pdf_image_dpi, pdf_image_dpi=pdf_image_dpi,
password=password,
) )
if hasattr(file, "seek"): if hasattr(file, "seek"):
file.seek(0) file.seek(0)
extracted_layout, layouts_links = ( extracted_layout, layouts_links = (
process_data_with_pdfminer(file=file, dpi=pdf_image_dpi) process_data_with_pdfminer(file=file, dpi=pdf_image_dpi, password=password)
if pdf_text_extractable if pdf_text_extractable
else ([], []) else ([], [])
) )
@ -690,6 +707,7 @@ def _partition_pdf_or_image_local(
ocr_mode=ocr_mode, ocr_mode=ocr_mode,
pdf_image_dpi=pdf_image_dpi, pdf_image_dpi=pdf_image_dpi,
ocr_layout_dumper=ocr_layout_dumper, ocr_layout_dumper=ocr_layout_dumper,
password=password,
) )
# vectorization of the data structure ends here # vectorization of the data structure ends here
@ -837,6 +855,7 @@ def _partition_pdf_or_image_with_ocr(
is_image: bool = False, is_image: bool = False,
metadata_last_modified: Optional[str] = None, metadata_last_modified: Optional[str] = None,
starting_page_number: int = 1, starting_page_number: int = 1,
password: Optional[str] = None,
**kwargs: Any, **kwargs: Any,
): ):
"""Partitions an image or PDF using OCR. For PDFs, each page is converted """Partitions an image or PDF using OCR. For PDFs, each page is converted
@ -861,7 +880,7 @@ def _partition_pdf_or_image_with_ocr(
elements.extend(page_elements) elements.extend(page_elements)
else: else:
for page_number, image in enumerate( for page_number, image in enumerate(
convert_pdf_to_images(filename, file), start=starting_page_number convert_pdf_to_images(filename, file, password=password), start=starting_page_number
): ):
page_elements = _partition_pdf_or_image_with_ocr_from_image( page_elements = _partition_pdf_or_image_with_ocr_from_image(
image=image, image=image,

View File

@ -42,6 +42,7 @@ def process_data_with_ocr(
ocr_mode: str = OCRMode.FULL_PAGE.value, ocr_mode: str = OCRMode.FULL_PAGE.value,
pdf_image_dpi: int = 200, pdf_image_dpi: int = 200,
ocr_layout_dumper: Optional[OCRLayoutDumper] = None, ocr_layout_dumper: Optional[OCRLayoutDumper] = None,
password: Optional[str] = None,
) -> "DocumentLayout": ) -> "DocumentLayout":
""" """
Process OCR data from a given data and supplement the output DocumentLayout Process OCR data from a given data and supplement the output DocumentLayout
@ -89,6 +90,7 @@ def process_data_with_ocr(
ocr_mode=ocr_mode, ocr_mode=ocr_mode,
pdf_image_dpi=pdf_image_dpi, pdf_image_dpi=pdf_image_dpi,
ocr_layout_dumper=ocr_layout_dumper, ocr_layout_dumper=ocr_layout_dumper,
password=password,
) )
return merged_layouts return merged_layouts
@ -105,6 +107,7 @@ def process_file_with_ocr(
ocr_mode: str = OCRMode.FULL_PAGE.value, ocr_mode: str = OCRMode.FULL_PAGE.value,
pdf_image_dpi: int = 200, pdf_image_dpi: int = 200,
ocr_layout_dumper: Optional[OCRLayoutDumper] = None, ocr_layout_dumper: Optional[OCRLayoutDumper] = None,
password: Optional[str] = None,
) -> "DocumentLayout": ) -> "DocumentLayout":
""" """
Process OCR data from a given file and supplement the output DocumentLayout Process OCR data from a given file and supplement the output DocumentLayout
@ -165,6 +168,7 @@ def process_file_with_ocr(
dpi=pdf_image_dpi, dpi=pdf_image_dpi,
output_folder=temp_dir, output_folder=temp_dir,
paths_only=True, paths_only=True,
userpw=password or "",
) )
image_paths = cast(List[str], _image_paths) image_paths = cast(List[str], _image_paths)
for i, image_path in enumerate(image_paths): for i, image_path in enumerate(image_paths):

View File

@ -58,6 +58,7 @@ def convert_pdf_to_image(
dpi: int = 200, dpi: int = 200,
output_folder: Optional[Union[str, PurePath]] = None, output_folder: Optional[Union[str, PurePath]] = None,
path_only: bool = False, path_only: bool = False,
password: Optional[str] = None,
) -> Union[List[Image.Image], List[str]]: ) -> Union[List[Image.Image], List[str]]:
"""Get the image renderings of the pdf pages using pdf2image""" """Get the image renderings of the pdf pages using pdf2image"""
@ -71,6 +72,7 @@ def convert_pdf_to_image(
dpi=dpi, dpi=dpi,
output_folder=output_folder, output_folder=output_folder,
paths_only=path_only, paths_only=path_only,
userpw=password,
) )
else: else:
images = pdf2image.convert_from_path( images = pdf2image.convert_from_path(
@ -125,6 +127,7 @@ def save_elements(
is_image: bool = False, is_image: bool = False,
extract_image_block_to_payload: bool = False, extract_image_block_to_payload: bool = False,
output_dir_path: str | None = None, output_dir_path: str | None = None,
password: Optional[str] = None,
): ):
""" """
Saves specific elements from a PDF as images either to a directory or embeds them in the Saves specific elements from a PDF as images either to a directory or embeds them in the
@ -167,6 +170,7 @@ def save_elements(
pdf_image_dpi, pdf_image_dpi,
output_folder=temp_dir, output_folder=temp_dir,
path_only=True, path_only=True,
password=password,
) )
image_paths = cast(List[str], _image_paths) image_paths = cast(List[str], _image_paths)
@ -389,15 +393,16 @@ def convert_pdf_to_images(
filename: str = "", filename: str = "",
file: Optional[bytes | IO[bytes]] = None, file: Optional[bytes | IO[bytes]] = None,
chunk_size: int = 10, chunk_size: int = 10,
password: Optional[str] = None,
) -> Iterator[Image.Image]: ) -> Iterator[Image.Image]:
# Convert a PDF in small chunks of pages at a time (e.g. 1-10, 11-20... and so on) # Convert a PDF in small chunks of pages at a time (e.g. 1-10, 11-20... and so on)
exactly_one(filename=filename, file=file) exactly_one(filename=filename, file=file)
if file is not None: if file is not None:
f_bytes = convert_to_bytes(file) f_bytes = convert_to_bytes(file)
info = pdf2image.pdfinfo_from_bytes(f_bytes) info = pdf2image.pdfinfo_from_bytes(f_bytes, userpw=password)
else: else:
f_bytes = None f_bytes = None
info = pdf2image.pdfinfo_from_path(filename) info = pdf2image.pdfinfo_from_path(filename, userpw=password)
total_pages = info["Pages"] total_pages = info["Pages"]
for start_page in range(1, total_pages + 1, chunk_size): for start_page in range(1, total_pages + 1, chunk_size):
@ -407,12 +412,14 @@ def convert_pdf_to_images(
f_bytes, f_bytes,
first_page=start_page, first_page=start_page,
last_page=end_page, last_page=end_page,
userpw=password,
) )
else: else:
chunk_images = pdf2image.convert_from_path( chunk_images = pdf2image.convert_from_path(
filename, filename,
first_page=start_page, first_page=start_page,
last_page=end_page, last_page=end_page,
userpw=password,
) )
for image in chunk_images: for image in chunk_images:

View File

@ -38,12 +38,14 @@ DEFAULT_ROUND = 15
def process_file_with_pdfminer( def process_file_with_pdfminer(
filename: str = "", filename: str = "",
dpi: int = 200, dpi: int = 200,
password: Optional[str] = None,
) -> tuple[List[List["TextRegion"]], List[List]]: ) -> tuple[List[List["TextRegion"]], List[List]]:
with open_filename(filename, "rb") as fp: with open_filename(filename, "rb") as fp:
fp = cast(BinaryIO, fp) fp = cast(BinaryIO, fp)
extracted_layout, layouts_links = process_data_with_pdfminer( extracted_layout, layouts_links = process_data_with_pdfminer(
file=fp, file=fp,
dpi=dpi, dpi=dpi,
password=password,
) )
return extracted_layout, layouts_links return extracted_layout, layouts_links
@ -432,6 +434,7 @@ def process_page_layout_from_pdfminer(
def process_data_with_pdfminer( def process_data_with_pdfminer(
file: Optional[Union[bytes, BinaryIO]] = None, file: Optional[Union[bytes, BinaryIO]] = None,
dpi: int = 200, dpi: int = 200,
password: Optional[str] = None,
) -> tuple[List[LayoutElements], List[List]]: ) -> tuple[List[LayoutElements], List[List]]:
"""Loads the image and word objects from a pdf using pdfplumber and the image renderings of the """Loads the image and word objects from a pdf using pdfplumber and the image renderings of the
pdf pages using pdf2image""" pdf pages using pdf2image"""
@ -442,7 +445,9 @@ def process_data_with_pdfminer(
layouts_links = [] layouts_links = []
# Coefficient to rescale bounding box to be compatible with images # Coefficient to rescale bounding box to be compatible with images
coef = dpi / 72 coef = dpi / 72
for page_number, (page, page_layout) in enumerate(open_pdfminer_pages_generator(file)): for page_number, (page, page_layout) in enumerate(
open_pdfminer_pages_generator(file, password=password)
):
width, height = page_layout.width, page_layout.height width, height = page_layout.width, page_layout.height
annotation_list = [] annotation_list = []

View File

@ -1,6 +1,6 @@
import os import os
import tempfile import tempfile
from typing import BinaryIO, List, Tuple from typing import BinaryIO, List, Optional, Tuple
from pdfminer.converter import PDFPageAggregator from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams, LTContainer, LTImage, LTItem, LTTextLine from pdfminer.layout import LAParams, LTContainer, LTImage, LTItem, LTTextLine
@ -73,6 +73,7 @@ def rect_to_bbox(
@requires_dependencies(["pikepdf", "pypdf"]) @requires_dependencies(["pikepdf", "pypdf"])
def open_pdfminer_pages_generator( def open_pdfminer_pages_generator(
fp: BinaryIO, fp: BinaryIO,
password: Optional[str] = None,
): ):
"""Open PDF pages using PDFMiner, handling and repairing invalid dictionary constructs.""" """Open PDF pages using PDFMiner, handling and repairing invalid dictionary constructs."""
@ -84,7 +85,7 @@ def open_pdfminer_pages_generator(
with tempfile.TemporaryDirectory() as tmp_dir_path: with tempfile.TemporaryDirectory() as tmp_dir_path:
tmp_file_path = os.path.join(tmp_dir_path, "tmp_file") tmp_file_path = os.path.join(tmp_dir_path, "tmp_file")
try: try:
pages = PDFPage.get_pages(fp) pages = PDFPage.get_pages(fp, password=password or "")
# Detect invalid dictionary construct for entire PDF # Detect invalid dictionary construct for entire PDF
for i, page in enumerate(pages): for i, page in enumerate(pages):
try: try: