Introduce starting_page_number argument to partitioning functions that assign element.metadata.page_number (#2884)

This small change will be useful for users who partition only fragments
of their PDF documents.
It's a small step towards addressing this issue:
https://github.com/Unstructured-IO/unstructured/issues/2461

Related PRs:
* https://github.com/Unstructured-IO/unstructured/pull/2842
* https://github.com/Unstructured-IO/unstructured/pull/2673
This commit is contained in:
Michał Martyniak 2024-04-15 23:03:42 +02:00 committed by GitHub
parent ba3f374268
commit cb1e91058e
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
14 changed files with 126 additions and 31 deletions

View File

@ -1,10 +1,11 @@
## 0.13.3-dev4 ## 0.13.3-dev5
### Enhancements ### Enhancements
* **Add support for `start_index` in `html` links extraction** * **Add support for `start_index` in `html` links extraction**
* **Add `strategy` arg value to `_PptxPartitionerOptions`.** This makes this partitioning option available for sub-partitioners to come that may optionally use inference or other expensive operations to improve the partitioning. * **Add `strategy` arg value to `_PptxPartitionerOptions`.** This makes this partitioning option available for sub-partitioners to come that may optionally use inference or other expensive operations to improve the partitioning.
* **Support pluggable sub-partitioner for PPTX Picture shapes.** Use a distinct sub-partitioner for partitioning PPTX Picture (image) shapes and allow the default picture sub-partitioner to be replaced at run-time by one of the user's choosing. * **Support pluggable sub-partitioner for PPTX Picture shapes.** Use a distinct sub-partitioner for partitioning PPTX Picture (image) shapes and allow the default picture sub-partitioner to be replaced at run-time by one of the user's choosing.
* **Introduce `starting_page_number` parameter to partitioning functions** It applies to those partitioners which support `page_number` in element's metadata: PDF, TIFF, XLSX, DOC, DOCX, PPT, PPTX.
### Features ### Features

View File

@ -377,11 +377,13 @@ def test_partition_docx_includes_page_numbers_when_page_break_elements_are_suppr
def test_partition_docx_includes_page_break_elements_when_so_instructed(): def test_partition_docx_includes_page_break_elements_when_so_instructed():
elements = partition_docx(example_doc_path("handbook-1p.docx"), include_page_breaks=True) elements = partition_docx(
example_doc_path("handbook-1p.docx"), include_page_breaks=True, starting_page_number=3
)
assert "PageBreak" in [type(e).__name__ for e in elements] assert "PageBreak" in [type(e).__name__ for e in elements]
assert elements[1].metadata.page_number == 1 assert elements[1].metadata.page_number == 3
assert elements[-2].metadata.page_number == 2 assert elements[-2].metadata.page_number == 4
# ------------------------------------------------------------------------------------------------ # ------------------------------------------------------------------------------------------------

View File

@ -148,19 +148,21 @@ def test_partition_pdf_local_raises_with_no_filename():
@pytest.mark.parametrize("file_mode", ["filename", "rb", "spool"]) @pytest.mark.parametrize("file_mode", ["filename", "rb", "spool"])
@pytest.mark.parametrize( @pytest.mark.parametrize(
("strategy", "expected", "origin"), ("strategy", "starting_page_number", "expected_page_numbers", "origin"),
# fast: can't capture the "intentionally left blank page" page # fast: can't capture the "intentionally left blank page" page
# others: will ignore the actual blank page # others: will ignore the actual blank page
[ [
(PartitionStrategy.FAST, {1, 4}, {"pdfminer"}), (PartitionStrategy.FAST, 1, {1, 4}, {"pdfminer"}),
(PartitionStrategy.HI_RES, {1, 3, 4}, {"yolox", "pdfminer"}), (PartitionStrategy.FAST, 3, {3, 6}, {"pdfminer"}),
(PartitionStrategy.OCR_ONLY, {1, 3, 4}, {"ocr_tesseract"}), (PartitionStrategy.HI_RES, 4, {4, 6, 7}, {"yolox", "pdfminer"}),
(PartitionStrategy.OCR_ONLY, 1, {1, 3, 4}, {"ocr_tesseract"}),
], ],
) )
def test_partition_pdf( def test_partition_pdf_outputs_valid_amount_of_elements_and_metadata_values(
file_mode, file_mode,
strategy, strategy,
expected, starting_page_number,
expected_page_numbers,
origin, origin,
filename=example_doc_path("layout-parser-paper-with-empty-pages.pdf"), filename=example_doc_path("layout-parser-paper-with-empty-pages.pdf"),
): ):
@ -169,23 +171,29 @@ def test_partition_pdf(
# validate that the result is a non-empty list of dicts # validate that the result is a non-empty list of dicts
assert len(result) > 10 assert len(result) > 10
# check that the pdf has multiple different page numbers # check that the pdf has multiple different page numbers
assert {element.metadata.page_number for element in result} == expected assert {element.metadata.page_number for element in result} == expected_page_numbers
if UNSTRUCTURED_INCLUDE_DEBUG_METADATA: if UNSTRUCTURED_INCLUDE_DEBUG_METADATA:
assert {element.metadata.detection_origin for element in result} == origin assert {element.metadata.detection_origin for element in result} == origin
if file_mode == "filename": if file_mode == "filename":
result = pdf.partition_pdf(filename=filename, strategy=strategy) result = pdf.partition_pdf(
filename=filename, strategy=strategy, starting_page_number=starting_page_number
)
_test(result) _test(result)
elif file_mode == "rb": elif file_mode == "rb":
with open(filename, "rb") as f: with open(filename, "rb") as f:
result = pdf.partition_pdf(file=f, strategy=strategy) result = pdf.partition_pdf(
file=f, strategy=strategy, starting_page_number=starting_page_number
)
_test(result) _test(result)
else: else:
with open(filename, "rb") as test_file: with open(filename, "rb") as test_file:
spooled_temp_file = SpooledTemporaryFile() spooled_temp_file = SpooledTemporaryFile()
spooled_temp_file.write(test_file.read()) spooled_temp_file.write(test_file.read())
spooled_temp_file.seek(0) spooled_temp_file.seek(0)
result = pdf.partition_pdf(file=spooled_temp_file, strategy=strategy) result = pdf.partition_pdf(
file=spooled_temp_file, strategy=strategy, starting_page_number=starting_page_number
)
_test(result) _test(result)
@ -298,10 +306,12 @@ def test_partition_pdf_with_no_page_breaks(
def test_partition_pdf_with_fast_strategy( def test_partition_pdf_with_fast_strategy(
filename=example_doc_path("layout-parser-paper-fast.pdf"), filename=example_doc_path("layout-parser-paper-fast.pdf"),
): ):
elements = pdf.partition_pdf(filename=filename, url=None, strategy=PartitionStrategy.FAST) elements = pdf.partition_pdf(
filename=filename, url=None, strategy=PartitionStrategy.FAST, starting_page_number=3
)
assert len(elements) > 10 assert len(elements) > 10
# check that the pdf has multiple different page numbers # check that the pdf has multiple different page numbers
assert {element.metadata.page_number for element in elements} == {1, 2} assert {element.metadata.page_number for element in elements} == {3, 4}
for element in elements: for element in elements:
assert element.metadata.filename == "layout-parser-paper-fast.pdf" assert element.metadata.filename == "layout-parser-paper-fast.pdf"

View File

@ -703,6 +703,21 @@ class Describe_PptxPartitionerOptions:
list(opts.increment_page_number()) list(opts.increment_page_number())
assert opts.page_number == 2 assert opts.page_number == 2
def it_assigns_the_correct_page_number_when_starting_page_number_is_given(
self, opts_args: dict[str, Any]
):
opts = _PptxPartitionerOptions(**opts_args, starting_page_number=3)
# -- move to the "first" slide --
list(opts.increment_page_number())
table_metadata = opts.table_metadata(text_as_html="<table><tr/></table>")
text_metadata = opts.text_metadata()
assert isinstance(table_metadata, ElementMetadata)
assert isinstance(text_metadata, ElementMetadata)
assert text_metadata.page_number == 3
assert table_metadata.page_number == 3
# -- .pptx_file ------------------------------ # -- .pptx_file ------------------------------
def it_uses_the_path_to_open_the_presentation_when_file_path_is_provided( def it_uses_the_path_to_open_the_presentation_when_file_path_is_provided(

View File

@ -363,6 +363,7 @@ def test_auto_partition_pdf_with_fast_strategy(monkeypatch):
extract_image_block_to_payload=False, extract_image_block_to_payload=False,
hi_res_model_name=None, hi_res_model_name=None,
date_from_file_object=False, date_from_file_object=False,
starting_page_number=1,
) )
@ -840,6 +841,11 @@ def test_auto_partition_xlsx_from_file(filename="example-docs/stanley-cups.xlsx"
assert elements[1].metadata.filetype == EXPECTED_XLSX_FILETYPE assert elements[1].metadata.filetype == EXPECTED_XLSX_FILETYPE
def test_auto_partition_respects_starting_page_number_argument_for_xlsx():
elements = partition("example-docs/stanley-cups.xlsx", starting_page_number=3)
assert elements[1].metadata.page_number == 3
EXPECTED_XLS_TEXT_LEN = 550 EXPECTED_XLS_TEXT_LEN = 550

View File

@ -1 +1 @@
__version__ = "0.13.3-dev4" # pragma: no cover __version__ = "0.13.3-dev5" # pragma: no cover

View File

@ -156,6 +156,7 @@ def partition(
hi_res_model_name: Optional[str] = None, hi_res_model_name: Optional[str] = None,
model_name: Optional[str] = None, # to be deprecated model_name: Optional[str] = None, # to be deprecated
date_from_file_object: bool = False, date_from_file_object: bool = False,
starting_page_number: int = 1,
**kwargs, **kwargs,
): ):
"""Partitions a document into its constituent elements. Will use libmagic to determine """Partitions a document into its constituent elements. Will use libmagic to determine
@ -243,6 +244,10 @@ def partition(
Applies only when providing file via `file` parameter. If this option is True and inference Applies only when providing file via `file` parameter. If this option is True and inference
from message header failed, attempt to infer last_modified metadata from bytes, from message header failed, attempt to infer last_modified metadata from bytes,
otherwise set it to None. otherwise set it to None.
starting_page_number
Indicates what page number should be assigned to the first page in the document.
This information will be reflected in elements' metadata and can be especially
useful when partitioning a document that is part of a larger document.
""" """
exactly_one(file=file, filename=filename, url=url) exactly_one(file=file, filename=filename, url=url)
@ -308,6 +313,7 @@ def partition(
infer_table_structure=infer_table_structure, infer_table_structure=infer_table_structure,
languages=languages, languages=languages,
detect_language_per_element=detect_language_per_element, detect_language_per_element=detect_language_per_element,
starting_page_number=starting_page_number,
**kwargs, **kwargs,
) )
elif filetype == FileType.DOCX: elif filetype == FileType.DOCX:
@ -318,6 +324,7 @@ def partition(
infer_table_structure=infer_table_structure, infer_table_structure=infer_table_structure,
languages=languages, languages=languages,
detect_language_per_element=detect_language_per_element, detect_language_per_element=detect_language_per_element,
starting_page_number=starting_page_number,
**kwargs, **kwargs,
) )
elif filetype == FileType.ODT: elif filetype == FileType.ODT:
@ -426,6 +433,7 @@ def partition(
extract_image_block_types=extract_image_block_types, extract_image_block_types=extract_image_block_types,
extract_image_block_output_dir=extract_image_block_output_dir, extract_image_block_output_dir=extract_image_block_output_dir,
extract_image_block_to_payload=extract_image_block_to_payload, extract_image_block_to_payload=extract_image_block_to_payload,
starting_page_number=starting_page_number,
**kwargs, **kwargs,
) )
elif filetype in IMAGE_FILETYPES: elif filetype in IMAGE_FILETYPES:
@ -485,6 +493,7 @@ def partition(
infer_table_structure=infer_table_structure, infer_table_structure=infer_table_structure,
languages=languages, languages=languages,
detect_language_per_element=detect_language_per_element, detect_language_per_element=detect_language_per_element,
starting_page_number=starting_page_number,
**kwargs, **kwargs,
) )
elif filetype == FileType.JSON: elif filetype == FileType.JSON:
@ -502,6 +511,7 @@ def partition(
infer_table_structure=infer_table_structure, infer_table_structure=infer_table_structure,
languages=languages, languages=languages,
detect_language_per_element=detect_language_per_element, detect_language_per_element=detect_language_per_element,
starting_page_number=starting_page_number,
**kwargs, **kwargs,
) )
elif filetype == FileType.CSV: elif filetype == FileType.CSV:

View File

@ -540,13 +540,14 @@ def document_to_element_list(
detection_origin: Optional[str] = None, detection_origin: Optional[str] = None,
sort_mode: str = SORT_MODE_XY_CUT, sort_mode: str = SORT_MODE_XY_CUT,
languages: Optional[List[str]] = None, languages: Optional[List[str]] = None,
starting_page_number: int = 1,
**kwargs: Any, **kwargs: Any,
) -> List[Element]: ) -> List[Element]:
"""Converts a DocumentLayout object to a list of unstructured elements.""" """Converts a DocumentLayout object to a list of unstructured elements."""
elements: List[Element] = [] elements: List[Element] = []
num_pages = len(document.pages) num_pages = len(document.pages)
for i, page in enumerate(document.pages): for page_number, page in enumerate(document.pages, start=starting_page_number):
page_elements: List[Element] = [] page_elements: List[Element] = []
page_image_metadata = _get_page_image_metadata(page) page_image_metadata = _get_page_image_metadata(page)
@ -571,7 +572,7 @@ def document_to_element_list(
for el in element: for el in element:
if last_modification_date: if last_modification_date:
el.metadata.last_modified = last_modification_date el.metadata.last_modified = last_modification_date
el.metadata.page_number = i + 1 el.metadata.page_number = page_number
page_elements.extend(element) page_elements.extend(element)
translation_mapping.extend([(layout_element, el) for el in element]) translation_mapping.extend([(layout_element, el) for el in element])
continue continue
@ -601,7 +602,7 @@ def document_to_element_list(
add_element_metadata( add_element_metadata(
element, element,
page_number=i + 1, page_number=page_number,
filetype=image_format, filetype=image_format,
coordinates=coordinates, coordinates=coordinates,
coordinate_system=coordinate_system, coordinate_system=coordinate_system,
@ -622,7 +623,7 @@ def document_to_element_list(
if sortable and sort_mode != SORT_MODE_DONT: if sortable and sort_mode != SORT_MODE_DONT:
sorted_page_elements = sort_page_elements(page_elements, sort_mode) sorted_page_elements = sort_page_elements(page_elements, sort_mode)
if include_page_breaks and i < num_pages - 1: if include_page_breaks and page_number < num_pages + starting_page_number:
sorted_page_elements.append(PageBreak(text="")) sorted_page_elements.append(PageBreak(text=""))
elements.extend(sorted_page_elements) elements.extend(sorted_page_elements)

View File

@ -29,6 +29,7 @@ def partition_doc(
languages: Optional[List[str]] = ["auto"], languages: Optional[List[str]] = ["auto"],
detect_language_per_element: bool = False, detect_language_per_element: bool = False,
date_from_file_object: bool = False, date_from_file_object: bool = False,
starting_page_number: int = 1,
**kwargs: Any, **kwargs: Any,
) -> List[Element]: ) -> List[Element]:
"""Partitions Microsoft Word Documents in .doc format into its document elements. """Partitions Microsoft Word Documents in .doc format into its document elements.
@ -55,6 +56,10 @@ def partition_doc(
date_from_file_object date_from_file_object
Applies only when providing file via `file` parameter. If this option is True, attempt Applies only when providing file via `file` parameter. If this option is True, attempt
infer last_modified metadata from bytes, otherwise set it to None. infer last_modified metadata from bytes, otherwise set it to None.
starting_page_number
Indicates what page number should be assigned to the first page in the document.
This information will be reflected in elements' metadata and can be especially
useful when partitioning a document that is part of a larger document.
""" """
# Verify that only one of the arguments was provided # Verify that only one of the arguments was provided
if filename is None: if filename is None:
@ -97,6 +102,7 @@ def partition_doc(
metadata_last_modified=metadata_last_modified or last_modification_date, metadata_last_modified=metadata_last_modified or last_modification_date,
languages=languages, languages=languages,
detect_language_per_element=detect_language_per_element, detect_language_per_element=detect_language_per_element,
starting_page_number=starting_page_number,
) )
# remove tmp.name from filename if parsing file # remove tmp.name from filename if parsing file
if file: if file:

View File

@ -181,6 +181,7 @@ def partition_docx(
languages: Optional[List[str]] = ["auto"], languages: Optional[List[str]] = ["auto"],
detect_language_per_element: bool = False, detect_language_per_element: bool = False,
date_from_file_object: bool = False, date_from_file_object: bool = False,
starting_page_number: int = 1,
**kwargs: Any, # used by decorator **kwargs: Any, # used by decorator
) -> List[Element]: ) -> List[Element]:
"""Partitions Microsoft Word Documents in .docx format into its document elements. """Partitions Microsoft Word Documents in .docx format into its document elements.
@ -212,6 +213,10 @@ def partition_docx(
date_from_file_object date_from_file_object
Applies only when providing file via `file` parameter. If this option is True, attempt Applies only when providing file via `file` parameter. If this option is True, attempt
infer last_modified metadata from bytes, otherwise set it to None. infer last_modified metadata from bytes, otherwise set it to None.
starting_page_number
Indicates what page number should be assigned to the first page in the document.
This information will be reflected in elements' metadata and can be especially
useful when partitioning a document that is part of a larger document.
""" """
# -- verify that only one file-specifier argument was provided -- # -- verify that only one file-specifier argument was provided --
exactly_one(filename=filename, file=file) exactly_one(filename=filename, file=file)
@ -224,6 +229,7 @@ def partition_docx(
infer_table_structure, infer_table_structure,
metadata_last_modified, metadata_last_modified,
date_from_file_object, date_from_file_object,
starting_page_number=starting_page_number,
) )
elements = apply_lang_metadata( elements = apply_lang_metadata(
elements=elements, elements=elements,
@ -249,6 +255,7 @@ class _DocxPartitioner:
infer_table_structure: bool = True, infer_table_structure: bool = True,
metadata_last_modified: Optional[str] = None, metadata_last_modified: Optional[str] = None,
date_from_file_object: bool = False, date_from_file_object: bool = False,
starting_page_number: int = 1,
) -> None: ) -> None:
self._filename = filename self._filename = filename
self._file = file self._file = file
@ -256,7 +263,7 @@ class _DocxPartitioner:
self._include_page_breaks = include_page_breaks self._include_page_breaks = include_page_breaks
self._infer_table_structure = infer_table_structure self._infer_table_structure = infer_table_structure
self._metadata_last_modified = metadata_last_modified self._metadata_last_modified = metadata_last_modified
self._page_counter: int = 1 self._page_counter = starting_page_number
self._date_from_file_object = date_from_file_object self._date_from_file_object = date_from_file_object
@classmethod @classmethod
@ -269,6 +276,7 @@ class _DocxPartitioner:
infer_table_structure: bool = True, infer_table_structure: bool = True,
metadata_last_modified: Optional[str] = None, metadata_last_modified: Optional[str] = None,
date_from_file_object: bool = False, date_from_file_object: bool = False,
starting_page_number: int = 1,
) -> Iterator[Element]: ) -> Iterator[Element]:
"""Partition MS Word documents (.docx format) into its document elements.""" """Partition MS Word documents (.docx format) into its document elements."""
self = cls( self = cls(
@ -279,6 +287,7 @@ class _DocxPartitioner:
infer_table_structure=infer_table_structure, infer_table_structure=infer_table_structure,
metadata_last_modified=metadata_last_modified, metadata_last_modified=metadata_last_modified,
date_from_file_object=date_from_file_object, date_from_file_object=date_from_file_object,
starting_page_number=starting_page_number,
) )
# NOTE(scanny): It's possible for a Word document to have no sections. In particular, a # NOTE(scanny): It's possible for a Word document to have no sections. In particular, a
# Microsoft Teams chat transcript exported to DOCX contains no sections. Such a # Microsoft Teams chat transcript exported to DOCX contains no sections. Such a

View File

@ -152,6 +152,7 @@ def partition_pdf(
extract_image_block_output_dir: Optional[str] = None, extract_image_block_output_dir: Optional[str] = None,
extract_image_block_to_payload: bool = False, extract_image_block_to_payload: bool = False,
date_from_file_object: bool = False, date_from_file_object: bool = False,
starting_page_number: int = 1,
**kwargs: Any, **kwargs: Any,
) -> List[Element]: ) -> List[Element]:
"""Parses a pdf document into a list of interpreted elements. """Parses a pdf document into a list of interpreted elements.
@ -228,6 +229,7 @@ def partition_pdf(
extract_image_block_output_dir=extract_image_block_output_dir, extract_image_block_output_dir=extract_image_block_output_dir,
extract_image_block_to_payload=extract_image_block_to_payload, extract_image_block_to_payload=extract_image_block_to_payload,
date_from_file_object=date_from_file_object, date_from_file_object=date_from_file_object,
starting_page_number=starting_page_number,
**kwargs, **kwargs,
) )
@ -248,6 +250,7 @@ def partition_pdf_or_image(
extract_image_block_output_dir: Optional[str] = None, extract_image_block_output_dir: Optional[str] = None,
extract_image_block_to_payload: bool = False, extract_image_block_to_payload: bool = False,
date_from_file_object: bool = False, date_from_file_object: bool = False,
starting_page_number: int = 1,
**kwargs, **kwargs,
) -> List[Element]: ) -> List[Element]:
"""Parses a pdf or image document into a list of interpreted elements.""" """Parses a pdf or image document into a list of interpreted elements."""
@ -277,6 +280,7 @@ def partition_pdf_or_image(
include_page_breaks=include_page_breaks, include_page_breaks=include_page_breaks,
languages=languages, languages=languages,
metadata_last_modified=metadata_last_modified or last_modification_date, metadata_last_modified=metadata_last_modified or last_modification_date,
starting_page_number=starting_page_number,
**kwargs, **kwargs,
) )
pdf_text_extractable = any( pdf_text_extractable = any(
@ -316,6 +320,7 @@ def partition_pdf_or_image(
extract_image_block_types=extract_image_block_types, extract_image_block_types=extract_image_block_types,
extract_image_block_output_dir=extract_image_block_output_dir, extract_image_block_output_dir=extract_image_block_output_dir,
extract_image_block_to_payload=extract_image_block_to_payload, extract_image_block_to_payload=extract_image_block_to_payload,
starting_page_number=starting_page_number,
**kwargs, **kwargs,
) )
out_elements = _process_uncategorized_text_elements(elements) out_elements = _process_uncategorized_text_elements(elements)
@ -333,6 +338,7 @@ def partition_pdf_or_image(
languages=languages, languages=languages,
is_image=is_image, is_image=is_image,
metadata_last_modified=metadata_last_modified or last_modification_date, metadata_last_modified=metadata_last_modified or last_modification_date,
starting_page_number=starting_page_number,
**kwargs, **kwargs,
) )
out_elements = _process_uncategorized_text_elements(elements) out_elements = _process_uncategorized_text_elements(elements)
@ -346,6 +352,7 @@ def extractable_elements(
include_page_breaks: bool = False, include_page_breaks: bool = False,
languages: Optional[List[str]] = None, languages: Optional[List[str]] = None,
metadata_last_modified: Optional[str] = None, metadata_last_modified: Optional[str] = None,
starting_page_number: int = 1,
**kwargs: Any, **kwargs: Any,
): ):
if isinstance(file, bytes): if isinstance(file, bytes):
@ -356,6 +363,7 @@ def extractable_elements(
include_page_breaks=include_page_breaks, include_page_breaks=include_page_breaks,
languages=languages, languages=languages,
metadata_last_modified=metadata_last_modified, metadata_last_modified=metadata_last_modified,
starting_page_number=starting_page_number,
**kwargs, **kwargs,
) )
@ -395,6 +403,7 @@ def _partition_pdf_or_image_local(
extract_image_block_to_payload: bool = False, extract_image_block_to_payload: bool = False,
analysis: bool = False, analysis: bool = False,
analyzed_image_output_dir_path: Optional[str] = None, analyzed_image_output_dir_path: Optional[str] = None,
starting_page_number: int = 1,
**kwargs, **kwargs,
) -> List[Element]: ) -> List[Element]:
"""Partition using package installed locally""" """Partition using package installed locally"""
@ -532,6 +541,7 @@ def _partition_pdf_or_image_local(
# unstructured.partition.common::layout_list_to_list_items often result in weird chunking. # unstructured.partition.common::layout_list_to_list_items often result in weird chunking.
infer_list_items=False, infer_list_items=False,
languages=languages, languages=languages,
starting_page_number=starting_page_number,
**kwargs, **kwargs,
) )
@ -610,6 +620,7 @@ def _partition_pdf_with_pdfminer(
include_page_breaks: bool, include_page_breaks: bool,
languages: List[str], languages: List[str],
metadata_last_modified: Optional[str], metadata_last_modified: Optional[str],
starting_page_number: int = 1,
**kwargs: Any, **kwargs: Any,
) -> List[Element]: ) -> List[Element]:
"""Partitions a PDF using PDFMiner instead of using a layoutmodel. Used for faster """Partitions a PDF using PDFMiner instead of using a layoutmodel. Used for faster
@ -633,6 +644,7 @@ def _partition_pdf_with_pdfminer(
include_page_breaks=include_page_breaks, include_page_breaks=include_page_breaks,
languages=languages, languages=languages,
metadata_last_modified=metadata_last_modified, metadata_last_modified=metadata_last_modified,
starting_page_number=starting_page_number,
**kwargs, **kwargs,
) )
@ -644,6 +656,7 @@ def _partition_pdf_with_pdfminer(
include_page_breaks=include_page_breaks, include_page_breaks=include_page_breaks,
languages=languages, languages=languages,
metadata_last_modified=metadata_last_modified, metadata_last_modified=metadata_last_modified,
starting_page_number=starting_page_number,
**kwargs, **kwargs,
) )
@ -691,13 +704,16 @@ def _process_pdfminer_pages(
metadata_last_modified: Optional[str], metadata_last_modified: Optional[str],
sort_mode: str = SORT_MODE_XY_CUT, sort_mode: str = SORT_MODE_XY_CUT,
annotation_threshold: Optional[float] = 0.9, annotation_threshold: Optional[float] = 0.9,
starting_page_number: int = 1,
**kwargs, **kwargs,
): ):
"""Uses PDFMiner to split a document into pages and process them.""" """Uses PDFMiner to split a document into pages and process them."""
elements: List[Element] = [] elements: List[Element] = []
for i, (page, page_layout) in enumerate(open_pdfminer_pages_generator(fp)): for page_number, (page, page_layout) in enumerate(
open_pdfminer_pages_generator(fp), start=starting_page_number
):
width, height = page_layout.width, page_layout.height width, height = page_layout.width, page_layout.height
page_elements: List[Element] = [] page_elements: List[Element] = []
@ -708,7 +724,7 @@ def _process_pdfminer_pages(
height=height, height=height,
) )
if page.annots: if page.annots:
annotation_list = get_uris(page.annots, height, coordinate_system, i + 1) annotation_list = get_uris(page.annots, height, coordinate_system, page_number)
for obj in page_layout: for obj in page_layout:
x1, y1, x2, y2 = rect_to_bbox(obj.bbox, height) x1, y1, x2, y2 = rect_to_bbox(obj.bbox, height)
@ -720,7 +736,7 @@ def _process_pdfminer_pages(
annotations_within_element = check_annotations_within_element( annotations_within_element = check_annotations_within_element(
annotation_list, annotation_list,
bbox, bbox,
i + 1, page_number,
annotation_threshold, annotation_threshold,
) )
_, words = get_word_bounding_box_from_element(obj, height) _, words = get_word_bounding_box_from_element(obj, height)
@ -750,7 +766,7 @@ def _process_pdfminer_pages(
element.metadata = ElementMetadata( element.metadata = ElementMetadata(
filename=filename, filename=filename,
page_number=i + 1, page_number=page_number,
coordinates=coordinates_metadata, coordinates=coordinates_metadata,
last_modified=metadata_last_modified, last_modified=metadata_last_modified,
links=links, links=links,
@ -892,6 +908,7 @@ def _partition_pdf_or_image_with_ocr(
languages: Optional[List[str]] = ["eng"], languages: Optional[List[str]] = ["eng"],
is_image: bool = False, is_image: bool = False,
metadata_last_modified: Optional[str] = None, metadata_last_modified: Optional[str] = None,
starting_page_number: int = 1,
**kwargs, **kwargs,
): ):
"""Partitions an image or PDF using OCR. For PDFs, each page is converted """Partitions an image or PDF using OCR. For PDFs, each page is converted
@ -903,18 +920,20 @@ def _partition_pdf_or_image_with_ocr(
image = PILImage.open(file) if file is not None else PILImage.open(filename) image = PILImage.open(file) if file is not None else PILImage.open(filename)
images.append(image) images.append(image)
for i, image in enumerate(images): for page_number, image in enumerate(images, start=starting_page_number):
page_elements = _partition_pdf_or_image_with_ocr_from_image( page_elements = _partition_pdf_or_image_with_ocr_from_image(
image=image, image=image,
languages=languages, languages=languages,
page_number=i + 1, page_number=page_number,
include_page_breaks=include_page_breaks, include_page_breaks=include_page_breaks,
metadata_last_modified=metadata_last_modified, metadata_last_modified=metadata_last_modified,
**kwargs, **kwargs,
) )
elements.extend(page_elements) elements.extend(page_elements)
else: else:
for page_number, image in enumerate(convert_pdf_to_images(filename, file), start=1): for page_number, image in enumerate(
convert_pdf_to_images(filename, file), start=starting_page_number
):
page_elements = _partition_pdf_or_image_with_ocr_from_image( page_elements = _partition_pdf_or_image_with_ocr_from_image(
image=image, image=image,
languages=languages, languages=languages,

View File

@ -29,6 +29,7 @@ def partition_ppt(
languages: Optional[List[str]] = ["auto"], languages: Optional[List[str]] = ["auto"],
detect_language_per_element: bool = False, detect_language_per_element: bool = False,
date_from_file_object: bool = False, date_from_file_object: bool = False,
starting_page_number: int = 1,
**kwargs, **kwargs,
) -> List[Element]: ) -> List[Element]:
"""Partitions Microsoft PowerPoint Documents in .ppt format into their document elements. """Partitions Microsoft PowerPoint Documents in .ppt format into their document elements.
@ -59,6 +60,10 @@ def partition_ppt(
date_from_file_object date_from_file_object
Applies only when providing file via `file` parameter. If this option is True, attempt Applies only when providing file via `file` parameter. If this option is True, attempt
infer last_modified metadata from bytes, otherwise set it to None. infer last_modified metadata from bytes, otherwise set it to None.
starting_page_number
Indicates what page number should be assigned to the first slide in the presentation.
This information will be reflected in elements' metadata and can be especially
useful when partitioning a document that is part of a larger document.
""" """
# Verify that only one of the arguments was provided # Verify that only one of the arguments was provided
if filename is None: if filename is None:
@ -100,6 +105,7 @@ def partition_ppt(
metadata_last_modified=metadata_last_modified or last_modification_date, metadata_last_modified=metadata_last_modified or last_modification_date,
languages=languages, languages=languages,
detect_language_per_element=detect_language_per_element, detect_language_per_element=detect_language_per_element,
starting_page_number=starting_page_number,
) )
# remove tmp.name from filename if parsing file # remove tmp.name from filename if parsing file

View File

@ -92,6 +92,7 @@ def partition_pptx(
metadata_filename: Optional[str] = None, metadata_filename: Optional[str] = None,
metadata_last_modified: Optional[str] = None, metadata_last_modified: Optional[str] = None,
strategy: str = PartitionStrategy.FAST, strategy: str = PartitionStrategy.FAST,
starting_page_number: int = 1,
**kwargs: Any, **kwargs: Any,
) -> list[Element]: ) -> list[Element]:
"""Partition PowerPoint document in .pptx format into its document elements. """Partition PowerPoint document in .pptx format into its document elements.
@ -128,6 +129,10 @@ def partition_pptx(
date_from_file_object date_from_file_object
Applies only when providing file via `file` parameter. If this option is True, attempt to Applies only when providing file via `file` parameter. If this option is True, attempt to
infer last_modified metadata from bytes, otherwise set it to None. infer last_modified metadata from bytes, otherwise set it to None.
starting_page_number
Indicates what page number should be assigned to the first slide in the presentation.
This information will be reflected in elements' metadata and can be especially
useful when partitioning a document that is part of a larger document.
""" """
opts = _PptxPartitionerOptions( opts = _PptxPartitionerOptions(
date_from_file_object=date_from_file_object, date_from_file_object=date_from_file_object,
@ -139,6 +144,7 @@ def partition_pptx(
metadata_file_path=metadata_filename, metadata_file_path=metadata_filename,
metadata_last_modified=metadata_last_modified, metadata_last_modified=metadata_last_modified,
strategy=strategy, strategy=strategy,
starting_page_number=starting_page_number,
) )
elements = _PptxPartitioner.iter_presentation_elements(opts) elements = _PptxPartitioner.iter_presentation_elements(opts)
@ -369,6 +375,7 @@ class _PptxPartitionerOptions:
metadata_file_path: Optional[str], metadata_file_path: Optional[str],
metadata_last_modified: Optional[str], metadata_last_modified: Optional[str],
strategy: str, strategy: str,
starting_page_number: int = 1,
): ):
self._date_from_file_object = date_from_file_object self._date_from_file_object = date_from_file_object
self._file = file self._file = file
@ -380,7 +387,7 @@ class _PptxPartitionerOptions:
self._metadata_last_modified = metadata_last_modified self._metadata_last_modified = metadata_last_modified
self._strategy = strategy self._strategy = strategy
# -- options object maintains page-number state -- # -- options object maintains page-number state --
self._page_counter = 0 self._page_counter = starting_page_number - 1
@classmethod @classmethod
def register_picture_partitioner(cls, picture_partitioner: AbstractPicturePartitioner): def register_picture_partitioner(cls, picture_partitioner: AbstractPicturePartitioner):

View File

@ -55,6 +55,7 @@ def partition_xlsx(
include_header: bool = False, include_header: bool = False,
find_subtable: bool = True, find_subtable: bool = True,
date_from_file_object: bool = False, date_from_file_object: bool = False,
starting_page_number: int = 1,
**kwargs: Any, **kwargs: Any,
) -> list[Element]: ) -> list[Element]:
"""Partitions Microsoft Excel Documents in .xlsx format into its document elements. """Partitions Microsoft Excel Documents in .xlsx format into its document elements.
@ -103,7 +104,9 @@ def partition_xlsx(
) )
elements: list[Element] = [] elements: list[Element] = []
for page_number, (sheet_name, sheet) in enumerate(opts.sheets.items(), start=1): for page_number, (sheet_name, sheet) in enumerate(
opts.sheets.items(), start=starting_page_number
):
if not opts.find_subtable: if not opts.find_subtable:
html_text = ( html_text = (
sheet.to_html( # pyright: ignore[reportUnknownMemberType] sheet.to_html( # pyright: ignore[reportUnknownMemberType]