mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-06-27 02:30:08 +00:00
Add password support for PDF files (#3721)
Add password support for PDF files. Must be combined with [PR 392 in unstructured-inference](https://github.com/Unstructured-IO/unstructured-inference/pull/392). --------- Co-authored-by: John J <43506685+Coniferish@users.noreply.github.com>
This commit is contained in:
parent
92be4eb2dd
commit
b521bce9c6
@ -1,6 +1,7 @@
|
||||
## 0.16.21-dev3
|
||||
## 0.16.21-dev4
|
||||
|
||||
### Enhancements
|
||||
- **Use password** to load password-protected PDFs with every partitioning strategy and input mode (filename or file object)
|
||||
|
||||
- **use vectorized logic to merge inferred and extracted layouts**. Uses the new `LayoutElements` data structure and the numpy library to refactor the layout merging logic, improving compute performance and making the logic clearer
|
||||
|
||||
|
BIN
example-docs/pdf/password.pdf
Normal file
BIN
example-docs/pdf/password.pdf
Normal file
Binary file not shown.
@ -11,5 +11,5 @@ google-cloud-vision
|
||||
effdet
|
||||
# Do not move to constraints.in, otherwise unstructured-inference will not be upgraded
|
||||
# when unstructured library is.
|
||||
unstructured-inference>=0.8.6
|
||||
unstructured-inference>=0.8.7
|
||||
unstructured.pytesseract>=0.3.12
|
||||
|
@ -263,7 +263,7 @@ typing-extensions==4.12.2
|
||||
# torch
|
||||
tzdata==2025.1
|
||||
# via pandas
|
||||
unstructured-inference==0.8.6
|
||||
unstructured-inference==0.8.7
|
||||
# via -r ./extra-pdf-image.in
|
||||
unstructured-pytesseract==0.3.13
|
||||
# via -r ./extra-pdf-image.in
|
||||
|
@ -262,7 +262,7 @@ def test_partition_pdf_outputs_valid_amount_of_elements_and_metadata_values(
|
||||
strategy=strategy,
|
||||
starting_page_number=starting_page_number,
|
||||
)
|
||||
_test(result)
|
||||
_test(result)
|
||||
|
||||
|
||||
@mock.patch.dict(os.environ, {"UNSTRUCTURED_HI_RES_MODEL_NAME": "checkbox"})
|
||||
@ -1545,3 +1545,43 @@ def test_document_to_element_list_sets_category_depth_titles():
|
||||
assert elements[1].metadata.category_depth == 2
|
||||
assert elements[2].metadata.category_depth is None
|
||||
assert elements[3].metadata.category_depth == 0
|
||||
|
||||
|
||||
@pytest.mark.parametrize("file_mode", ["filename", "rb", "spool"])
|
||||
@pytest.mark.parametrize(
|
||||
"strategy",
|
||||
# every strategy must be able to open the password-protected PDF
|
||||
# and is expected to produce the same single text element
|
||||
[
|
||||
PartitionStrategy.FAST,
|
||||
PartitionStrategy.HI_RES,
|
||||
PartitionStrategy.OCR_ONLY,
|
||||
],
|
||||
)
|
||||
def test_partition_pdf_with_password(
|
||||
file_mode,
|
||||
strategy,
|
||||
filename=example_doc_path("pdf/password.pdf"),
|
||||
):
|
||||
# Test that partition_pdf can open a password-protected PDF passed as a filename, a binary file handle, or a SpooledTemporaryFile
|
||||
def _test(result):
|
||||
# validate that the result is exactly one element carrying the expected text
|
||||
assert len(result) == 1
|
||||
assert result[0].text == "File with password"
|
||||
|
||||
if file_mode == "filename":
|
||||
result = pdf.partition_pdf(filename=filename, strategy=strategy, password="password")
|
||||
_test(result)
|
||||
elif file_mode == "rb":
|
||||
with open(filename, "rb") as f:
|
||||
result = pdf.partition_pdf(file=f, strategy=strategy, password="password")
|
||||
_test(result)
|
||||
else:
|
||||
with open(filename, "rb") as test_file:
|
||||
with SpooledTemporaryFile() as spooled_temp_file:
|
||||
spooled_temp_file.write(test_file.read())
|
||||
spooled_temp_file.seek(0)
|
||||
result = pdf.partition_pdf(
|
||||
file=spooled_temp_file, strategy=strategy, password="password"
|
||||
)
|
||||
_test(result)
|
||||
|
@ -1 +1 @@
|
||||
__version__ = "0.16.21-dev3" # pragma: no cover
|
||||
__version__ = "0.16.21-dev4" # pragma: no cover
|
||||
|
@ -32,6 +32,7 @@ def partition_image(
|
||||
starting_page_number: int = 1,
|
||||
extract_forms: bool = False,
|
||||
form_extraction_skip_tables: bool = True,
|
||||
password: Optional[str] = None,
|
||||
**kwargs: Any,
|
||||
) -> list[Element]:
|
||||
"""Parses an image into a list of interpreted elements.
|
||||
@ -91,6 +92,8 @@ def partition_image(
|
||||
(results in adding FormKeysValues elements to output).
|
||||
form_extraction_skip_tables
|
||||
Whether the form extraction logic should ignore regions designated as Tables.
|
||||
password
|
||||
The password to decrypt the PDF file.
|
||||
"""
|
||||
exactly_one(filename=filename, file=file)
|
||||
|
||||
@ -113,5 +116,6 @@ def partition_image(
|
||||
starting_page_number=starting_page_number,
|
||||
extract_forms=extract_forms,
|
||||
form_extraction_skip_tables=form_extraction_skip_tables,
|
||||
password=password,
|
||||
**kwargs,
|
||||
)
|
||||
|
@ -144,6 +144,7 @@ def partition_pdf(
|
||||
starting_page_number: int = 1,
|
||||
extract_forms: bool = False,
|
||||
form_extraction_skip_tables: bool = True,
|
||||
password: Optional[str] = None,
|
||||
**kwargs: Any,
|
||||
) -> list[Element]:
|
||||
"""Parses a pdf document into a list of interpreted elements.
|
||||
@ -224,6 +225,7 @@ def partition_pdf(
|
||||
starting_page_number=starting_page_number,
|
||||
extract_forms=extract_forms,
|
||||
form_extraction_skip_tables=form_extraction_skip_tables,
|
||||
password=password,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
@ -245,6 +247,7 @@ def partition_pdf_or_image(
|
||||
starting_page_number: int = 1,
|
||||
extract_forms: bool = False,
|
||||
form_extraction_skip_tables: bool = True,
|
||||
password: Optional[str] = None,
|
||||
**kwargs: Any,
|
||||
) -> list[Element]:
|
||||
"""Parses a pdf or image document into a list of interpreted elements."""
|
||||
@ -273,6 +276,7 @@ def partition_pdf_or_image(
|
||||
languages=languages,
|
||||
metadata_last_modified=metadata_last_modified or last_modified,
|
||||
starting_page_number=starting_page_number,
|
||||
password=password,
|
||||
**kwargs,
|
||||
)
|
||||
pdf_text_extractable = any(
|
||||
@ -322,6 +326,7 @@ def partition_pdf_or_image(
|
||||
starting_page_number=starting_page_number,
|
||||
extract_forms=extract_forms,
|
||||
form_extraction_skip_tables=form_extraction_skip_tables,
|
||||
password=password,
|
||||
**kwargs,
|
||||
)
|
||||
out_elements = _process_uncategorized_text_elements(elements)
|
||||
@ -347,6 +352,7 @@ def partition_pdf_or_image(
|
||||
is_image=is_image,
|
||||
metadata_last_modified=metadata_last_modified or last_modified,
|
||||
starting_page_number=starting_page_number,
|
||||
password=password,
|
||||
**kwargs,
|
||||
)
|
||||
out_elements = _process_uncategorized_text_elements(elements)
|
||||
@ -360,6 +366,7 @@ def extractable_elements(
|
||||
languages: Optional[list[str]] = None,
|
||||
metadata_last_modified: Optional[str] = None,
|
||||
starting_page_number: int = 1,
|
||||
password: Optional[str] = None,
|
||||
**kwargs: Any,
|
||||
) -> list[list[Element]]:
|
||||
if isinstance(file, bytes):
|
||||
@ -370,6 +377,7 @@ def extractable_elements(
|
||||
languages=languages,
|
||||
metadata_last_modified=metadata_last_modified,
|
||||
starting_page_number=starting_page_number,
|
||||
password=password,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
@ -380,6 +388,7 @@ def _partition_pdf_with_pdfminer(
|
||||
languages: list[str],
|
||||
metadata_last_modified: Optional[str],
|
||||
starting_page_number: int = 1,
|
||||
password: Optional[str] = None,
|
||||
**kwargs: Any,
|
||||
) -> list[list[Element]]:
|
||||
"""Partitions a PDF using PDFMiner instead of using a layoutmodel. Used for faster
|
||||
@ -403,6 +412,7 @@ def _partition_pdf_with_pdfminer(
|
||||
languages=languages,
|
||||
metadata_last_modified=metadata_last_modified,
|
||||
starting_page_number=starting_page_number,
|
||||
password=password,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
@ -413,6 +423,7 @@ def _partition_pdf_with_pdfminer(
|
||||
languages=languages,
|
||||
metadata_last_modified=metadata_last_modified,
|
||||
starting_page_number=starting_page_number,
|
||||
password=password,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
@ -427,6 +438,7 @@ def _process_pdfminer_pages(
|
||||
metadata_last_modified: Optional[str],
|
||||
annotation_threshold: Optional[float] = env_config.PDF_ANNOTATION_THRESHOLD,
|
||||
starting_page_number: int = 1,
|
||||
password: Optional[str] = None,
|
||||
**kwargs,
|
||||
) -> list[list[Element]]:
|
||||
"""Uses PDFMiner to split a document into pages and process them."""
|
||||
@ -434,7 +446,8 @@ def _process_pdfminer_pages(
|
||||
elements = []
|
||||
|
||||
for page_number, (page, page_layout) in enumerate(
|
||||
open_pdfminer_pages_generator(fp), start=starting_page_number
|
||||
open_pdfminer_pages_generator(fp, password=password),
|
||||
start=starting_page_number,
|
||||
):
|
||||
width, height = page_layout.width, page_layout.height
|
||||
|
||||
@ -556,6 +569,7 @@ def _partition_pdf_or_image_local(
|
||||
extract_forms: bool = False,
|
||||
form_extraction_skip_tables: bool = True,
|
||||
pdf_hi_res_max_pages: Optional[int] = None,
|
||||
password: Optional[str] = None,
|
||||
**kwargs: Any,
|
||||
) -> list[Element]:
|
||||
"""Partition using package installed locally"""
|
||||
@ -592,10 +606,11 @@ def _partition_pdf_or_image_local(
|
||||
is_image=is_image,
|
||||
model_name=hi_res_model_name,
|
||||
pdf_image_dpi=pdf_image_dpi,
|
||||
password=password,
|
||||
)
|
||||
|
||||
extracted_layout, layouts_links = (
|
||||
process_file_with_pdfminer(filename=filename, dpi=pdf_image_dpi)
|
||||
process_file_with_pdfminer(filename=filename, dpi=pdf_image_dpi, password=password)
|
||||
if pdf_text_extractable
|
||||
else ([], [])
|
||||
)
|
||||
@ -635,6 +650,7 @@ def _partition_pdf_or_image_local(
|
||||
ocr_mode=ocr_mode,
|
||||
pdf_image_dpi=pdf_image_dpi,
|
||||
ocr_layout_dumper=ocr_layout_dumper,
|
||||
password=password,
|
||||
)
|
||||
else:
|
||||
inferred_document_layout = process_data_with_model(
|
||||
@ -642,13 +658,14 @@ def _partition_pdf_or_image_local(
|
||||
is_image=is_image,
|
||||
model_name=hi_res_model_name,
|
||||
pdf_image_dpi=pdf_image_dpi,
|
||||
password=password,
|
||||
)
|
||||
|
||||
if hasattr(file, "seek"):
|
||||
file.seek(0)
|
||||
|
||||
extracted_layout, layouts_links = (
|
||||
process_data_with_pdfminer(file=file, dpi=pdf_image_dpi)
|
||||
process_data_with_pdfminer(file=file, dpi=pdf_image_dpi, password=password)
|
||||
if pdf_text_extractable
|
||||
else ([], [])
|
||||
)
|
||||
@ -690,6 +707,7 @@ def _partition_pdf_or_image_local(
|
||||
ocr_mode=ocr_mode,
|
||||
pdf_image_dpi=pdf_image_dpi,
|
||||
ocr_layout_dumper=ocr_layout_dumper,
|
||||
password=password,
|
||||
)
|
||||
|
||||
# vectorization of the data structure ends here
|
||||
@ -837,6 +855,7 @@ def _partition_pdf_or_image_with_ocr(
|
||||
is_image: bool = False,
|
||||
metadata_last_modified: Optional[str] = None,
|
||||
starting_page_number: int = 1,
|
||||
password: Optional[str] = None,
|
||||
**kwargs: Any,
|
||||
):
|
||||
"""Partitions an image or PDF using OCR. For PDFs, each page is converted
|
||||
@ -861,7 +880,7 @@ def _partition_pdf_or_image_with_ocr(
|
||||
elements.extend(page_elements)
|
||||
else:
|
||||
for page_number, image in enumerate(
|
||||
convert_pdf_to_images(filename, file), start=starting_page_number
|
||||
convert_pdf_to_images(filename, file, password=password), start=starting_page_number
|
||||
):
|
||||
page_elements = _partition_pdf_or_image_with_ocr_from_image(
|
||||
image=image,
|
||||
|
@ -42,6 +42,7 @@ def process_data_with_ocr(
|
||||
ocr_mode: str = OCRMode.FULL_PAGE.value,
|
||||
pdf_image_dpi: int = 200,
|
||||
ocr_layout_dumper: Optional[OCRLayoutDumper] = None,
|
||||
password: Optional[str] = None,
|
||||
) -> "DocumentLayout":
|
||||
"""
|
||||
Process OCR data from a given data and supplement the output DocumentLayout
|
||||
@ -89,6 +90,7 @@ def process_data_with_ocr(
|
||||
ocr_mode=ocr_mode,
|
||||
pdf_image_dpi=pdf_image_dpi,
|
||||
ocr_layout_dumper=ocr_layout_dumper,
|
||||
password=password,
|
||||
)
|
||||
|
||||
return merged_layouts
|
||||
@ -105,6 +107,7 @@ def process_file_with_ocr(
|
||||
ocr_mode: str = OCRMode.FULL_PAGE.value,
|
||||
pdf_image_dpi: int = 200,
|
||||
ocr_layout_dumper: Optional[OCRLayoutDumper] = None,
|
||||
password: Optional[str] = None,
|
||||
) -> "DocumentLayout":
|
||||
"""
|
||||
Process OCR data from a given file and supplement the output DocumentLayout
|
||||
@ -165,6 +168,7 @@ def process_file_with_ocr(
|
||||
dpi=pdf_image_dpi,
|
||||
output_folder=temp_dir,
|
||||
paths_only=True,
|
||||
userpw=password or "",
|
||||
)
|
||||
image_paths = cast(List[str], _image_paths)
|
||||
for i, image_path in enumerate(image_paths):
|
||||
|
@ -58,6 +58,7 @@ def convert_pdf_to_image(
|
||||
dpi: int = 200,
|
||||
output_folder: Optional[Union[str, PurePath]] = None,
|
||||
path_only: bool = False,
|
||||
password: Optional[str] = None,
|
||||
) -> Union[List[Image.Image], List[str]]:
|
||||
"""Get the image renderings of the pdf pages using pdf2image"""
|
||||
|
||||
@ -71,6 +72,7 @@ def convert_pdf_to_image(
|
||||
dpi=dpi,
|
||||
output_folder=output_folder,
|
||||
paths_only=path_only,
|
||||
userpw=password,
|
||||
)
|
||||
else:
|
||||
images = pdf2image.convert_from_path(
|
||||
@ -125,6 +127,7 @@ def save_elements(
|
||||
is_image: bool = False,
|
||||
extract_image_block_to_payload: bool = False,
|
||||
output_dir_path: str | None = None,
|
||||
password: Optional[str] = None,
|
||||
):
|
||||
"""
|
||||
Saves specific elements from a PDF as images either to a directory or embeds them in the
|
||||
@ -167,6 +170,7 @@ def save_elements(
|
||||
pdf_image_dpi,
|
||||
output_folder=temp_dir,
|
||||
path_only=True,
|
||||
password=password,
|
||||
)
|
||||
image_paths = cast(List[str], _image_paths)
|
||||
|
||||
@ -389,15 +393,16 @@ def convert_pdf_to_images(
|
||||
filename: str = "",
|
||||
file: Optional[bytes | IO[bytes]] = None,
|
||||
chunk_size: int = 10,
|
||||
password: Optional[str] = None,
|
||||
) -> Iterator[Image.Image]:
|
||||
# Convert a PDF in small chunks of pages at a time (e.g. 1-10, 11-20... and so on)
|
||||
exactly_one(filename=filename, file=file)
|
||||
if file is not None:
|
||||
f_bytes = convert_to_bytes(file)
|
||||
info = pdf2image.pdfinfo_from_bytes(f_bytes)
|
||||
info = pdf2image.pdfinfo_from_bytes(f_bytes, userpw=password)
|
||||
else:
|
||||
f_bytes = None
|
||||
info = pdf2image.pdfinfo_from_path(filename)
|
||||
info = pdf2image.pdfinfo_from_path(filename, userpw=password)
|
||||
|
||||
total_pages = info["Pages"]
|
||||
for start_page in range(1, total_pages + 1, chunk_size):
|
||||
@ -407,12 +412,14 @@ def convert_pdf_to_images(
|
||||
f_bytes,
|
||||
first_page=start_page,
|
||||
last_page=end_page,
|
||||
userpw=password,
|
||||
)
|
||||
else:
|
||||
chunk_images = pdf2image.convert_from_path(
|
||||
filename,
|
||||
first_page=start_page,
|
||||
last_page=end_page,
|
||||
userpw=password,
|
||||
)
|
||||
|
||||
for image in chunk_images:
|
||||
|
@ -38,12 +38,14 @@ DEFAULT_ROUND = 15
|
||||
def process_file_with_pdfminer(
|
||||
filename: str = "",
|
||||
dpi: int = 200,
|
||||
password: Optional[str] = None,
|
||||
) -> tuple[List[List["TextRegion"]], List[List]]:
|
||||
with open_filename(filename, "rb") as fp:
|
||||
fp = cast(BinaryIO, fp)
|
||||
extracted_layout, layouts_links = process_data_with_pdfminer(
|
||||
file=fp,
|
||||
dpi=dpi,
|
||||
password=password,
|
||||
)
|
||||
return extracted_layout, layouts_links
|
||||
|
||||
@ -432,6 +434,7 @@ def process_page_layout_from_pdfminer(
|
||||
def process_data_with_pdfminer(
|
||||
file: Optional[Union[bytes, BinaryIO]] = None,
|
||||
dpi: int = 200,
|
||||
password: Optional[str] = None,
|
||||
) -> tuple[List[LayoutElements], List[List]]:
|
||||
"""Loads the image and word objects from a pdf using pdfplumber and the image renderings of the
|
||||
pdf pages using pdf2image"""
|
||||
@ -442,7 +445,9 @@ def process_data_with_pdfminer(
|
||||
layouts_links = []
|
||||
# Coefficient to rescale bounding box to be compatible with images
|
||||
coef = dpi / 72
|
||||
for page_number, (page, page_layout) in enumerate(open_pdfminer_pages_generator(file)):
|
||||
for page_number, (page, page_layout) in enumerate(
|
||||
open_pdfminer_pages_generator(file, password=password)
|
||||
):
|
||||
width, height = page_layout.width, page_layout.height
|
||||
|
||||
annotation_list = []
|
||||
|
@ -1,6 +1,6 @@
|
||||
import os
|
||||
import tempfile
|
||||
from typing import BinaryIO, List, Tuple
|
||||
from typing import BinaryIO, List, Optional, Tuple
|
||||
|
||||
from pdfminer.converter import PDFPageAggregator
|
||||
from pdfminer.layout import LAParams, LTContainer, LTImage, LTItem, LTTextLine
|
||||
@ -73,6 +73,7 @@ def rect_to_bbox(
|
||||
@requires_dependencies(["pikepdf", "pypdf"])
|
||||
def open_pdfminer_pages_generator(
|
||||
fp: BinaryIO,
|
||||
password: Optional[str] = None,
|
||||
):
|
||||
"""Open PDF pages using PDFMiner, handling and repairing invalid dictionary constructs."""
|
||||
|
||||
@ -84,7 +85,7 @@ def open_pdfminer_pages_generator(
|
||||
with tempfile.TemporaryDirectory() as tmp_dir_path:
|
||||
tmp_file_path = os.path.join(tmp_dir_path, "tmp_file")
|
||||
try:
|
||||
pages = PDFPage.get_pages(fp)
|
||||
pages = PDFPage.get_pages(fp, password=password or "")
|
||||
# Detect invalid dictionary construct for entire PDF
|
||||
for i, page in enumerate(pages):
|
||||
try:
|
||||
@ -93,7 +94,7 @@ def open_pdfminer_pages_generator(
|
||||
page_layout = device.get_result()
|
||||
except PSSyntaxError:
|
||||
logger.info("Detected invalid dictionary construct for PDFminer")
|
||||
logger.info(f"Repairing the PDF page {i+1} ...")
|
||||
logger.info(f"Repairing the PDF page {i + 1} ...")
|
||||
# find the error page from binary data fp
|
||||
error_page_data = get_page_data(fp, page_number=i)
|
||||
# repair the error page with pikepdf
|
||||
|
Loading…
x
Reference in New Issue
Block a user