mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-12-16 09:47:18 +00:00
Introduce `starting_page_number` argument to partitioning functions that assign element.metadata.page_number (#2884)
This small change will be useful for users who partition only fragments of their PDF documents. It's a small step towards addressing this issue: https://github.com/Unstructured-IO/unstructured/issues/2461 Related PRs: * https://github.com/Unstructured-IO/unstructured/pull/2842 * https://github.com/Unstructured-IO/unstructured/pull/2673
This commit is contained in:
parent
ba3f374268
commit
cb1e91058e
@ -1,10 +1,11 @@
|
|||||||
## 0.13.3-dev4
|
## 0.13.3-dev5
|
||||||
|
|
||||||
### Enhancements
|
### Enhancements
|
||||||
|
|
||||||
* **Add support for `start_index` in `html` links extraction**
|
* **Add support for `start_index` in `html` links extraction**
|
||||||
* **Add `strategy` arg value to `_PptxPartitionerOptions`.** This makes this partitioning option available for sub-partitioners to come that may optionally use inference or other expensive operations to improve the partitioning.
|
* **Add `strategy` arg value to `_PptxPartitionerOptions`.** This makes this partitioning option available for sub-partitioners to come that may optionally use inference or other expensive operations to improve the partitioning.
|
||||||
* **Support pluggable sub-partitioner for PPTX Picture shapes.** Use a distinct sub-partitioner for partitioning PPTX Picture (image) shapes and allow the default picture sub-partitioner to be replaced at run-time by one of the user's choosing.
|
* **Support pluggable sub-partitioner for PPTX Picture shapes.** Use a distinct sub-partitioner for partitioning PPTX Picture (image) shapes and allow the default picture sub-partitioner to be replaced at run-time by one of the user's choosing.
|
||||||
|
* **Introduce `starting_page_number` parameter to partitioning functions.** It applies to those partitioners that support `page_number` in an element's metadata: PDF, TIFF, XLSX, DOC, DOCX, PPT, PPTX.
|
||||||
|
|
||||||
### Features
|
### Features
|
||||||
|
|
||||||
|
|||||||
@ -377,11 +377,13 @@ def test_partition_docx_includes_page_numbers_when_page_break_elements_are_suppr
|
|||||||
|
|
||||||
|
|
||||||
def test_partition_docx_includes_page_break_elements_when_so_instructed():
|
def test_partition_docx_includes_page_break_elements_when_so_instructed():
|
||||||
elements = partition_docx(example_doc_path("handbook-1p.docx"), include_page_breaks=True)
|
elements = partition_docx(
|
||||||
|
example_doc_path("handbook-1p.docx"), include_page_breaks=True, starting_page_number=3
|
||||||
|
)
|
||||||
|
|
||||||
assert "PageBreak" in [type(e).__name__ for e in elements]
|
assert "PageBreak" in [type(e).__name__ for e in elements]
|
||||||
assert elements[1].metadata.page_number == 1
|
assert elements[1].metadata.page_number == 3
|
||||||
assert elements[-2].metadata.page_number == 2
|
assert elements[-2].metadata.page_number == 4
|
||||||
|
|
||||||
|
|
||||||
# ------------------------------------------------------------------------------------------------
|
# ------------------------------------------------------------------------------------------------
|
||||||
|
|||||||
@ -148,19 +148,21 @@ def test_partition_pdf_local_raises_with_no_filename():
|
|||||||
|
|
||||||
@pytest.mark.parametrize("file_mode", ["filename", "rb", "spool"])
|
@pytest.mark.parametrize("file_mode", ["filename", "rb", "spool"])
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
("strategy", "expected", "origin"),
|
("strategy", "starting_page_number", "expected_page_numbers", "origin"),
|
||||||
# fast: can't capture the "intentionally left blank page" page
|
# fast: can't capture the "intentionally left blank page" page
|
||||||
# others: will ignore the actual blank page
|
# others: will ignore the actual blank page
|
||||||
[
|
[
|
||||||
(PartitionStrategy.FAST, {1, 4}, {"pdfminer"}),
|
(PartitionStrategy.FAST, 1, {1, 4}, {"pdfminer"}),
|
||||||
(PartitionStrategy.HI_RES, {1, 3, 4}, {"yolox", "pdfminer"}),
|
(PartitionStrategy.FAST, 3, {3, 6}, {"pdfminer"}),
|
||||||
(PartitionStrategy.OCR_ONLY, {1, 3, 4}, {"ocr_tesseract"}),
|
(PartitionStrategy.HI_RES, 4, {4, 6, 7}, {"yolox", "pdfminer"}),
|
||||||
|
(PartitionStrategy.OCR_ONLY, 1, {1, 3, 4}, {"ocr_tesseract"}),
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
def test_partition_pdf(
|
def test_partition_pdf_outputs_valid_amount_of_elements_and_metadata_values(
|
||||||
file_mode,
|
file_mode,
|
||||||
strategy,
|
strategy,
|
||||||
expected,
|
starting_page_number,
|
||||||
|
expected_page_numbers,
|
||||||
origin,
|
origin,
|
||||||
filename=example_doc_path("layout-parser-paper-with-empty-pages.pdf"),
|
filename=example_doc_path("layout-parser-paper-with-empty-pages.pdf"),
|
||||||
):
|
):
|
||||||
@ -169,23 +171,29 @@ def test_partition_pdf(
|
|||||||
# validate that the result is a non-empty list of dicts
|
# validate that the result is a non-empty list of dicts
|
||||||
assert len(result) > 10
|
assert len(result) > 10
|
||||||
# check that the pdf has multiple different page numbers
|
# check that the pdf has multiple different page numbers
|
||||||
assert {element.metadata.page_number for element in result} == expected
|
assert {element.metadata.page_number for element in result} == expected_page_numbers
|
||||||
if UNSTRUCTURED_INCLUDE_DEBUG_METADATA:
|
if UNSTRUCTURED_INCLUDE_DEBUG_METADATA:
|
||||||
assert {element.metadata.detection_origin for element in result} == origin
|
assert {element.metadata.detection_origin for element in result} == origin
|
||||||
|
|
||||||
if file_mode == "filename":
|
if file_mode == "filename":
|
||||||
result = pdf.partition_pdf(filename=filename, strategy=strategy)
|
result = pdf.partition_pdf(
|
||||||
|
filename=filename, strategy=strategy, starting_page_number=starting_page_number
|
||||||
|
)
|
||||||
_test(result)
|
_test(result)
|
||||||
elif file_mode == "rb":
|
elif file_mode == "rb":
|
||||||
with open(filename, "rb") as f:
|
with open(filename, "rb") as f:
|
||||||
result = pdf.partition_pdf(file=f, strategy=strategy)
|
result = pdf.partition_pdf(
|
||||||
|
file=f, strategy=strategy, starting_page_number=starting_page_number
|
||||||
|
)
|
||||||
_test(result)
|
_test(result)
|
||||||
else:
|
else:
|
||||||
with open(filename, "rb") as test_file:
|
with open(filename, "rb") as test_file:
|
||||||
spooled_temp_file = SpooledTemporaryFile()
|
spooled_temp_file = SpooledTemporaryFile()
|
||||||
spooled_temp_file.write(test_file.read())
|
spooled_temp_file.write(test_file.read())
|
||||||
spooled_temp_file.seek(0)
|
spooled_temp_file.seek(0)
|
||||||
result = pdf.partition_pdf(file=spooled_temp_file, strategy=strategy)
|
result = pdf.partition_pdf(
|
||||||
|
file=spooled_temp_file, strategy=strategy, starting_page_number=starting_page_number
|
||||||
|
)
|
||||||
_test(result)
|
_test(result)
|
||||||
|
|
||||||
|
|
||||||
@ -298,10 +306,12 @@ def test_partition_pdf_with_no_page_breaks(
|
|||||||
def test_partition_pdf_with_fast_strategy(
|
def test_partition_pdf_with_fast_strategy(
|
||||||
filename=example_doc_path("layout-parser-paper-fast.pdf"),
|
filename=example_doc_path("layout-parser-paper-fast.pdf"),
|
||||||
):
|
):
|
||||||
elements = pdf.partition_pdf(filename=filename, url=None, strategy=PartitionStrategy.FAST)
|
elements = pdf.partition_pdf(
|
||||||
|
filename=filename, url=None, strategy=PartitionStrategy.FAST, starting_page_number=3
|
||||||
|
)
|
||||||
assert len(elements) > 10
|
assert len(elements) > 10
|
||||||
# check that the pdf has multiple different page numbers
|
# check that the pdf has multiple different page numbers
|
||||||
assert {element.metadata.page_number for element in elements} == {1, 2}
|
assert {element.metadata.page_number for element in elements} == {3, 4}
|
||||||
for element in elements:
|
for element in elements:
|
||||||
assert element.metadata.filename == "layout-parser-paper-fast.pdf"
|
assert element.metadata.filename == "layout-parser-paper-fast.pdf"
|
||||||
|
|
||||||
|
|||||||
@ -703,6 +703,21 @@ class Describe_PptxPartitionerOptions:
|
|||||||
list(opts.increment_page_number())
|
list(opts.increment_page_number())
|
||||||
assert opts.page_number == 2
|
assert opts.page_number == 2
|
||||||
|
|
||||||
|
def it_assigns_the_correct_page_number_when_starting_page_number_is_given(
|
||||||
|
self, opts_args: dict[str, Any]
|
||||||
|
):
|
||||||
|
opts = _PptxPartitionerOptions(**opts_args, starting_page_number=3)
|
||||||
|
# -- move to the "first" slide --
|
||||||
|
list(opts.increment_page_number())
|
||||||
|
|
||||||
|
table_metadata = opts.table_metadata(text_as_html="<table><tr/></table>")
|
||||||
|
text_metadata = opts.text_metadata()
|
||||||
|
|
||||||
|
assert isinstance(table_metadata, ElementMetadata)
|
||||||
|
assert isinstance(text_metadata, ElementMetadata)
|
||||||
|
assert text_metadata.page_number == 3
|
||||||
|
assert table_metadata.page_number == 3
|
||||||
|
|
||||||
# -- .pptx_file ------------------------------
|
# -- .pptx_file ------------------------------
|
||||||
|
|
||||||
def it_uses_the_path_to_open_the_presentation_when_file_path_is_provided(
|
def it_uses_the_path_to_open_the_presentation_when_file_path_is_provided(
|
||||||
|
|||||||
@ -363,6 +363,7 @@ def test_auto_partition_pdf_with_fast_strategy(monkeypatch):
|
|||||||
extract_image_block_to_payload=False,
|
extract_image_block_to_payload=False,
|
||||||
hi_res_model_name=None,
|
hi_res_model_name=None,
|
||||||
date_from_file_object=False,
|
date_from_file_object=False,
|
||||||
|
starting_page_number=1,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@ -840,6 +841,11 @@ def test_auto_partition_xlsx_from_file(filename="example-docs/stanley-cups.xlsx"
|
|||||||
assert elements[1].metadata.filetype == EXPECTED_XLSX_FILETYPE
|
assert elements[1].metadata.filetype == EXPECTED_XLSX_FILETYPE
|
||||||
|
|
||||||
|
|
||||||
|
def test_auto_partition_respects_starting_page_number_argument_for_xlsx():
|
||||||
|
elements = partition("example-docs/stanley-cups.xlsx", starting_page_number=3)
|
||||||
|
assert elements[1].metadata.page_number == 3
|
||||||
|
|
||||||
|
|
||||||
EXPECTED_XLS_TEXT_LEN = 550
|
EXPECTED_XLS_TEXT_LEN = 550
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -1 +1 @@
|
|||||||
__version__ = "0.13.3-dev4" # pragma: no cover
|
__version__ = "0.13.3-dev5" # pragma: no cover
|
||||||
|
|||||||
@ -156,6 +156,7 @@ def partition(
|
|||||||
hi_res_model_name: Optional[str] = None,
|
hi_res_model_name: Optional[str] = None,
|
||||||
model_name: Optional[str] = None, # to be deprecated
|
model_name: Optional[str] = None, # to be deprecated
|
||||||
date_from_file_object: bool = False,
|
date_from_file_object: bool = False,
|
||||||
|
starting_page_number: int = 1,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
):
|
):
|
||||||
"""Partitions a document into its constituent elements. Will use libmagic to determine
|
"""Partitions a document into its constituent elements. Will use libmagic to determine
|
||||||
@ -243,6 +244,10 @@ def partition(
|
|||||||
Applies only when providing file via `file` parameter. If this option is True and inference
|
Applies only when providing file via `file` parameter. If this option is True and inference
|
||||||
from message header failed, attempt to infer last_modified metadata from bytes,
|
from message header failed, attempt to infer last_modified metadata from bytes,
|
||||||
otherwise set it to None.
|
otherwise set it to None.
|
||||||
|
starting_page_number
|
||||||
|
Indicates what page number should be assigned to the first page in the document.
|
||||||
|
This information will be reflected in elements' metadata and can be especially
|
||||||
|
useful when partitioning a document that is part of a larger document.
|
||||||
"""
|
"""
|
||||||
exactly_one(file=file, filename=filename, url=url)
|
exactly_one(file=file, filename=filename, url=url)
|
||||||
|
|
||||||
@ -308,6 +313,7 @@ def partition(
|
|||||||
infer_table_structure=infer_table_structure,
|
infer_table_structure=infer_table_structure,
|
||||||
languages=languages,
|
languages=languages,
|
||||||
detect_language_per_element=detect_language_per_element,
|
detect_language_per_element=detect_language_per_element,
|
||||||
|
starting_page_number=starting_page_number,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
)
|
)
|
||||||
elif filetype == FileType.DOCX:
|
elif filetype == FileType.DOCX:
|
||||||
@ -318,6 +324,7 @@ def partition(
|
|||||||
infer_table_structure=infer_table_structure,
|
infer_table_structure=infer_table_structure,
|
||||||
languages=languages,
|
languages=languages,
|
||||||
detect_language_per_element=detect_language_per_element,
|
detect_language_per_element=detect_language_per_element,
|
||||||
|
starting_page_number=starting_page_number,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
)
|
)
|
||||||
elif filetype == FileType.ODT:
|
elif filetype == FileType.ODT:
|
||||||
@ -426,6 +433,7 @@ def partition(
|
|||||||
extract_image_block_types=extract_image_block_types,
|
extract_image_block_types=extract_image_block_types,
|
||||||
extract_image_block_output_dir=extract_image_block_output_dir,
|
extract_image_block_output_dir=extract_image_block_output_dir,
|
||||||
extract_image_block_to_payload=extract_image_block_to_payload,
|
extract_image_block_to_payload=extract_image_block_to_payload,
|
||||||
|
starting_page_number=starting_page_number,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
)
|
)
|
||||||
elif filetype in IMAGE_FILETYPES:
|
elif filetype in IMAGE_FILETYPES:
|
||||||
@ -485,6 +493,7 @@ def partition(
|
|||||||
infer_table_structure=infer_table_structure,
|
infer_table_structure=infer_table_structure,
|
||||||
languages=languages,
|
languages=languages,
|
||||||
detect_language_per_element=detect_language_per_element,
|
detect_language_per_element=detect_language_per_element,
|
||||||
|
starting_page_number=starting_page_number,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
)
|
)
|
||||||
elif filetype == FileType.JSON:
|
elif filetype == FileType.JSON:
|
||||||
@ -502,6 +511,7 @@ def partition(
|
|||||||
infer_table_structure=infer_table_structure,
|
infer_table_structure=infer_table_structure,
|
||||||
languages=languages,
|
languages=languages,
|
||||||
detect_language_per_element=detect_language_per_element,
|
detect_language_per_element=detect_language_per_element,
|
||||||
|
starting_page_number=starting_page_number,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
)
|
)
|
||||||
elif filetype == FileType.CSV:
|
elif filetype == FileType.CSV:
|
||||||
|
|||||||
@ -540,13 +540,14 @@ def document_to_element_list(
|
|||||||
detection_origin: Optional[str] = None,
|
detection_origin: Optional[str] = None,
|
||||||
sort_mode: str = SORT_MODE_XY_CUT,
|
sort_mode: str = SORT_MODE_XY_CUT,
|
||||||
languages: Optional[List[str]] = None,
|
languages: Optional[List[str]] = None,
|
||||||
|
starting_page_number: int = 1,
|
||||||
**kwargs: Any,
|
**kwargs: Any,
|
||||||
) -> List[Element]:
|
) -> List[Element]:
|
||||||
"""Converts a DocumentLayout object to a list of unstructured elements."""
|
"""Converts a DocumentLayout object to a list of unstructured elements."""
|
||||||
elements: List[Element] = []
|
elements: List[Element] = []
|
||||||
|
|
||||||
num_pages = len(document.pages)
|
num_pages = len(document.pages)
|
||||||
for i, page in enumerate(document.pages):
|
for page_number, page in enumerate(document.pages, start=starting_page_number):
|
||||||
page_elements: List[Element] = []
|
page_elements: List[Element] = []
|
||||||
|
|
||||||
page_image_metadata = _get_page_image_metadata(page)
|
page_image_metadata = _get_page_image_metadata(page)
|
||||||
@ -571,7 +572,7 @@ def document_to_element_list(
|
|||||||
for el in element:
|
for el in element:
|
||||||
if last_modification_date:
|
if last_modification_date:
|
||||||
el.metadata.last_modified = last_modification_date
|
el.metadata.last_modified = last_modification_date
|
||||||
el.metadata.page_number = i + 1
|
el.metadata.page_number = page_number
|
||||||
page_elements.extend(element)
|
page_elements.extend(element)
|
||||||
translation_mapping.extend([(layout_element, el) for el in element])
|
translation_mapping.extend([(layout_element, el) for el in element])
|
||||||
continue
|
continue
|
||||||
@ -601,7 +602,7 @@ def document_to_element_list(
|
|||||||
|
|
||||||
add_element_metadata(
|
add_element_metadata(
|
||||||
element,
|
element,
|
||||||
page_number=i + 1,
|
page_number=page_number,
|
||||||
filetype=image_format,
|
filetype=image_format,
|
||||||
coordinates=coordinates,
|
coordinates=coordinates,
|
||||||
coordinate_system=coordinate_system,
|
coordinate_system=coordinate_system,
|
||||||
@ -622,7 +623,7 @@ def document_to_element_list(
|
|||||||
if sortable and sort_mode != SORT_MODE_DONT:
|
if sortable and sort_mode != SORT_MODE_DONT:
|
||||||
sorted_page_elements = sort_page_elements(page_elements, sort_mode)
|
sorted_page_elements = sort_page_elements(page_elements, sort_mode)
|
||||||
|
|
||||||
if include_page_breaks and i < num_pages - 1:
|
if include_page_breaks and page_number < num_pages + starting_page_number:
|
||||||
sorted_page_elements.append(PageBreak(text=""))
|
sorted_page_elements.append(PageBreak(text=""))
|
||||||
elements.extend(sorted_page_elements)
|
elements.extend(sorted_page_elements)
|
||||||
|
|
||||||
|
|||||||
@ -29,6 +29,7 @@ def partition_doc(
|
|||||||
languages: Optional[List[str]] = ["auto"],
|
languages: Optional[List[str]] = ["auto"],
|
||||||
detect_language_per_element: bool = False,
|
detect_language_per_element: bool = False,
|
||||||
date_from_file_object: bool = False,
|
date_from_file_object: bool = False,
|
||||||
|
starting_page_number: int = 1,
|
||||||
**kwargs: Any,
|
**kwargs: Any,
|
||||||
) -> List[Element]:
|
) -> List[Element]:
|
||||||
"""Partitions Microsoft Word Documents in .doc format into its document elements.
|
"""Partitions Microsoft Word Documents in .doc format into its document elements.
|
||||||
@ -55,6 +56,10 @@ def partition_doc(
|
|||||||
date_from_file_object
|
date_from_file_object
|
||||||
Applies only when providing file via `file` parameter. If this option is True, attempt
|
Applies only when providing file via `file` parameter. If this option is True, attempt
|
||||||
to infer last_modified metadata from bytes, otherwise set it to None.
|
to infer last_modified metadata from bytes, otherwise set it to None.
|
||||||
|
starting_page_number
|
||||||
|
Indicates what page number should be assigned to the first page in the document.
|
||||||
|
This information will be reflected in elements' metadata and can be especially
|
||||||
|
useful when partitioning a document that is part of a larger document.
|
||||||
"""
|
"""
|
||||||
# Verify that only one of the arguments was provided
|
# Verify that only one of the arguments was provided
|
||||||
if filename is None:
|
if filename is None:
|
||||||
@ -97,6 +102,7 @@ def partition_doc(
|
|||||||
metadata_last_modified=metadata_last_modified or last_modification_date,
|
metadata_last_modified=metadata_last_modified or last_modification_date,
|
||||||
languages=languages,
|
languages=languages,
|
||||||
detect_language_per_element=detect_language_per_element,
|
detect_language_per_element=detect_language_per_element,
|
||||||
|
starting_page_number=starting_page_number,
|
||||||
)
|
)
|
||||||
# remove tmp.name from filename if parsing file
|
# remove tmp.name from filename if parsing file
|
||||||
if file:
|
if file:
|
||||||
|
|||||||
@ -181,6 +181,7 @@ def partition_docx(
|
|||||||
languages: Optional[List[str]] = ["auto"],
|
languages: Optional[List[str]] = ["auto"],
|
||||||
detect_language_per_element: bool = False,
|
detect_language_per_element: bool = False,
|
||||||
date_from_file_object: bool = False,
|
date_from_file_object: bool = False,
|
||||||
|
starting_page_number: int = 1,
|
||||||
**kwargs: Any, # used by decorator
|
**kwargs: Any, # used by decorator
|
||||||
) -> List[Element]:
|
) -> List[Element]:
|
||||||
"""Partitions Microsoft Word Documents in .docx format into its document elements.
|
"""Partitions Microsoft Word Documents in .docx format into its document elements.
|
||||||
@ -212,6 +213,10 @@ def partition_docx(
|
|||||||
date_from_file_object
|
date_from_file_object
|
||||||
Applies only when providing file via `file` parameter. If this option is True, attempt
|
Applies only when providing file via `file` parameter. If this option is True, attempt
|
||||||
to infer last_modified metadata from bytes, otherwise set it to None.
|
to infer last_modified metadata from bytes, otherwise set it to None.
|
||||||
|
starting_page_number
|
||||||
|
Indicates what page number should be assigned to the first page in the document.
|
||||||
|
This information will be reflected in elements' metadata and can be especially
|
||||||
|
useful when partitioning a document that is part of a larger document.
|
||||||
"""
|
"""
|
||||||
# -- verify that only one file-specifier argument was provided --
|
# -- verify that only one file-specifier argument was provided --
|
||||||
exactly_one(filename=filename, file=file)
|
exactly_one(filename=filename, file=file)
|
||||||
@ -224,6 +229,7 @@ def partition_docx(
|
|||||||
infer_table_structure,
|
infer_table_structure,
|
||||||
metadata_last_modified,
|
metadata_last_modified,
|
||||||
date_from_file_object,
|
date_from_file_object,
|
||||||
|
starting_page_number=starting_page_number,
|
||||||
)
|
)
|
||||||
elements = apply_lang_metadata(
|
elements = apply_lang_metadata(
|
||||||
elements=elements,
|
elements=elements,
|
||||||
@ -249,6 +255,7 @@ class _DocxPartitioner:
|
|||||||
infer_table_structure: bool = True,
|
infer_table_structure: bool = True,
|
||||||
metadata_last_modified: Optional[str] = None,
|
metadata_last_modified: Optional[str] = None,
|
||||||
date_from_file_object: bool = False,
|
date_from_file_object: bool = False,
|
||||||
|
starting_page_number: int = 1,
|
||||||
) -> None:
|
) -> None:
|
||||||
self._filename = filename
|
self._filename = filename
|
||||||
self._file = file
|
self._file = file
|
||||||
@ -256,7 +263,7 @@ class _DocxPartitioner:
|
|||||||
self._include_page_breaks = include_page_breaks
|
self._include_page_breaks = include_page_breaks
|
||||||
self._infer_table_structure = infer_table_structure
|
self._infer_table_structure = infer_table_structure
|
||||||
self._metadata_last_modified = metadata_last_modified
|
self._metadata_last_modified = metadata_last_modified
|
||||||
self._page_counter: int = 1
|
self._page_counter = starting_page_number
|
||||||
self._date_from_file_object = date_from_file_object
|
self._date_from_file_object = date_from_file_object
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
@ -269,6 +276,7 @@ class _DocxPartitioner:
|
|||||||
infer_table_structure: bool = True,
|
infer_table_structure: bool = True,
|
||||||
metadata_last_modified: Optional[str] = None,
|
metadata_last_modified: Optional[str] = None,
|
||||||
date_from_file_object: bool = False,
|
date_from_file_object: bool = False,
|
||||||
|
starting_page_number: int = 1,
|
||||||
) -> Iterator[Element]:
|
) -> Iterator[Element]:
|
||||||
"""Partition MS Word documents (.docx format) into its document elements."""
|
"""Partition MS Word documents (.docx format) into its document elements."""
|
||||||
self = cls(
|
self = cls(
|
||||||
@ -279,6 +287,7 @@ class _DocxPartitioner:
|
|||||||
infer_table_structure=infer_table_structure,
|
infer_table_structure=infer_table_structure,
|
||||||
metadata_last_modified=metadata_last_modified,
|
metadata_last_modified=metadata_last_modified,
|
||||||
date_from_file_object=date_from_file_object,
|
date_from_file_object=date_from_file_object,
|
||||||
|
starting_page_number=starting_page_number,
|
||||||
)
|
)
|
||||||
# NOTE(scanny): It's possible for a Word document to have no sections. In particular, a
|
# NOTE(scanny): It's possible for a Word document to have no sections. In particular, a
|
||||||
# Microsoft Teams chat transcript exported to DOCX contains no sections. Such a
|
# Microsoft Teams chat transcript exported to DOCX contains no sections. Such a
|
||||||
|
|||||||
@ -152,6 +152,7 @@ def partition_pdf(
|
|||||||
extract_image_block_output_dir: Optional[str] = None,
|
extract_image_block_output_dir: Optional[str] = None,
|
||||||
extract_image_block_to_payload: bool = False,
|
extract_image_block_to_payload: bool = False,
|
||||||
date_from_file_object: bool = False,
|
date_from_file_object: bool = False,
|
||||||
|
starting_page_number: int = 1,
|
||||||
**kwargs: Any,
|
**kwargs: Any,
|
||||||
) -> List[Element]:
|
) -> List[Element]:
|
||||||
"""Parses a pdf document into a list of interpreted elements.
|
"""Parses a pdf document into a list of interpreted elements.
|
||||||
@ -228,6 +229,7 @@ def partition_pdf(
|
|||||||
extract_image_block_output_dir=extract_image_block_output_dir,
|
extract_image_block_output_dir=extract_image_block_output_dir,
|
||||||
extract_image_block_to_payload=extract_image_block_to_payload,
|
extract_image_block_to_payload=extract_image_block_to_payload,
|
||||||
date_from_file_object=date_from_file_object,
|
date_from_file_object=date_from_file_object,
|
||||||
|
starting_page_number=starting_page_number,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -248,6 +250,7 @@ def partition_pdf_or_image(
|
|||||||
extract_image_block_output_dir: Optional[str] = None,
|
extract_image_block_output_dir: Optional[str] = None,
|
||||||
extract_image_block_to_payload: bool = False,
|
extract_image_block_to_payload: bool = False,
|
||||||
date_from_file_object: bool = False,
|
date_from_file_object: bool = False,
|
||||||
|
starting_page_number: int = 1,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
) -> List[Element]:
|
) -> List[Element]:
|
||||||
"""Parses a pdf or image document into a list of interpreted elements."""
|
"""Parses a pdf or image document into a list of interpreted elements."""
|
||||||
@ -277,6 +280,7 @@ def partition_pdf_or_image(
|
|||||||
include_page_breaks=include_page_breaks,
|
include_page_breaks=include_page_breaks,
|
||||||
languages=languages,
|
languages=languages,
|
||||||
metadata_last_modified=metadata_last_modified or last_modification_date,
|
metadata_last_modified=metadata_last_modified or last_modification_date,
|
||||||
|
starting_page_number=starting_page_number,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
)
|
)
|
||||||
pdf_text_extractable = any(
|
pdf_text_extractable = any(
|
||||||
@ -316,6 +320,7 @@ def partition_pdf_or_image(
|
|||||||
extract_image_block_types=extract_image_block_types,
|
extract_image_block_types=extract_image_block_types,
|
||||||
extract_image_block_output_dir=extract_image_block_output_dir,
|
extract_image_block_output_dir=extract_image_block_output_dir,
|
||||||
extract_image_block_to_payload=extract_image_block_to_payload,
|
extract_image_block_to_payload=extract_image_block_to_payload,
|
||||||
|
starting_page_number=starting_page_number,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
)
|
)
|
||||||
out_elements = _process_uncategorized_text_elements(elements)
|
out_elements = _process_uncategorized_text_elements(elements)
|
||||||
@ -333,6 +338,7 @@ def partition_pdf_or_image(
|
|||||||
languages=languages,
|
languages=languages,
|
||||||
is_image=is_image,
|
is_image=is_image,
|
||||||
metadata_last_modified=metadata_last_modified or last_modification_date,
|
metadata_last_modified=metadata_last_modified or last_modification_date,
|
||||||
|
starting_page_number=starting_page_number,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
)
|
)
|
||||||
out_elements = _process_uncategorized_text_elements(elements)
|
out_elements = _process_uncategorized_text_elements(elements)
|
||||||
@ -346,6 +352,7 @@ def extractable_elements(
|
|||||||
include_page_breaks: bool = False,
|
include_page_breaks: bool = False,
|
||||||
languages: Optional[List[str]] = None,
|
languages: Optional[List[str]] = None,
|
||||||
metadata_last_modified: Optional[str] = None,
|
metadata_last_modified: Optional[str] = None,
|
||||||
|
starting_page_number: int = 1,
|
||||||
**kwargs: Any,
|
**kwargs: Any,
|
||||||
):
|
):
|
||||||
if isinstance(file, bytes):
|
if isinstance(file, bytes):
|
||||||
@ -356,6 +363,7 @@ def extractable_elements(
|
|||||||
include_page_breaks=include_page_breaks,
|
include_page_breaks=include_page_breaks,
|
||||||
languages=languages,
|
languages=languages,
|
||||||
metadata_last_modified=metadata_last_modified,
|
metadata_last_modified=metadata_last_modified,
|
||||||
|
starting_page_number=starting_page_number,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -395,6 +403,7 @@ def _partition_pdf_or_image_local(
|
|||||||
extract_image_block_to_payload: bool = False,
|
extract_image_block_to_payload: bool = False,
|
||||||
analysis: bool = False,
|
analysis: bool = False,
|
||||||
analyzed_image_output_dir_path: Optional[str] = None,
|
analyzed_image_output_dir_path: Optional[str] = None,
|
||||||
|
starting_page_number: int = 1,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
) -> List[Element]:
|
) -> List[Element]:
|
||||||
"""Partition using package installed locally"""
|
"""Partition using package installed locally"""
|
||||||
@ -532,6 +541,7 @@ def _partition_pdf_or_image_local(
|
|||||||
# unstructured.partition.common::layout_list_to_list_items often result in weird chunking.
|
# unstructured.partition.common::layout_list_to_list_items often result in weird chunking.
|
||||||
infer_list_items=False,
|
infer_list_items=False,
|
||||||
languages=languages,
|
languages=languages,
|
||||||
|
starting_page_number=starting_page_number,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -610,6 +620,7 @@ def _partition_pdf_with_pdfminer(
|
|||||||
include_page_breaks: bool,
|
include_page_breaks: bool,
|
||||||
languages: List[str],
|
languages: List[str],
|
||||||
metadata_last_modified: Optional[str],
|
metadata_last_modified: Optional[str],
|
||||||
|
starting_page_number: int = 1,
|
||||||
**kwargs: Any,
|
**kwargs: Any,
|
||||||
) -> List[Element]:
|
) -> List[Element]:
|
||||||
"""Partitions a PDF using PDFMiner instead of using a layoutmodel. Used for faster
|
"""Partitions a PDF using PDFMiner instead of using a layoutmodel. Used for faster
|
||||||
@ -633,6 +644,7 @@ def _partition_pdf_with_pdfminer(
|
|||||||
include_page_breaks=include_page_breaks,
|
include_page_breaks=include_page_breaks,
|
||||||
languages=languages,
|
languages=languages,
|
||||||
metadata_last_modified=metadata_last_modified,
|
metadata_last_modified=metadata_last_modified,
|
||||||
|
starting_page_number=starting_page_number,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -644,6 +656,7 @@ def _partition_pdf_with_pdfminer(
|
|||||||
include_page_breaks=include_page_breaks,
|
include_page_breaks=include_page_breaks,
|
||||||
languages=languages,
|
languages=languages,
|
||||||
metadata_last_modified=metadata_last_modified,
|
metadata_last_modified=metadata_last_modified,
|
||||||
|
starting_page_number=starting_page_number,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -691,13 +704,16 @@ def _process_pdfminer_pages(
|
|||||||
metadata_last_modified: Optional[str],
|
metadata_last_modified: Optional[str],
|
||||||
sort_mode: str = SORT_MODE_XY_CUT,
|
sort_mode: str = SORT_MODE_XY_CUT,
|
||||||
annotation_threshold: Optional[float] = 0.9,
|
annotation_threshold: Optional[float] = 0.9,
|
||||||
|
starting_page_number: int = 1,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
):
|
):
|
||||||
"""Uses PDFMiner to split a document into pages and process them."""
|
"""Uses PDFMiner to split a document into pages and process them."""
|
||||||
|
|
||||||
elements: List[Element] = []
|
elements: List[Element] = []
|
||||||
|
|
||||||
for i, (page, page_layout) in enumerate(open_pdfminer_pages_generator(fp)):
|
for page_number, (page, page_layout) in enumerate(
|
||||||
|
open_pdfminer_pages_generator(fp), start=starting_page_number
|
||||||
|
):
|
||||||
width, height = page_layout.width, page_layout.height
|
width, height = page_layout.width, page_layout.height
|
||||||
|
|
||||||
page_elements: List[Element] = []
|
page_elements: List[Element] = []
|
||||||
@ -708,7 +724,7 @@ def _process_pdfminer_pages(
|
|||||||
height=height,
|
height=height,
|
||||||
)
|
)
|
||||||
if page.annots:
|
if page.annots:
|
||||||
annotation_list = get_uris(page.annots, height, coordinate_system, i + 1)
|
annotation_list = get_uris(page.annots, height, coordinate_system, page_number)
|
||||||
|
|
||||||
for obj in page_layout:
|
for obj in page_layout:
|
||||||
x1, y1, x2, y2 = rect_to_bbox(obj.bbox, height)
|
x1, y1, x2, y2 = rect_to_bbox(obj.bbox, height)
|
||||||
@ -720,7 +736,7 @@ def _process_pdfminer_pages(
|
|||||||
annotations_within_element = check_annotations_within_element(
|
annotations_within_element = check_annotations_within_element(
|
||||||
annotation_list,
|
annotation_list,
|
||||||
bbox,
|
bbox,
|
||||||
i + 1,
|
page_number,
|
||||||
annotation_threshold,
|
annotation_threshold,
|
||||||
)
|
)
|
||||||
_, words = get_word_bounding_box_from_element(obj, height)
|
_, words = get_word_bounding_box_from_element(obj, height)
|
||||||
@ -750,7 +766,7 @@ def _process_pdfminer_pages(
|
|||||||
|
|
||||||
element.metadata = ElementMetadata(
|
element.metadata = ElementMetadata(
|
||||||
filename=filename,
|
filename=filename,
|
||||||
page_number=i + 1,
|
page_number=page_number,
|
||||||
coordinates=coordinates_metadata,
|
coordinates=coordinates_metadata,
|
||||||
last_modified=metadata_last_modified,
|
last_modified=metadata_last_modified,
|
||||||
links=links,
|
links=links,
|
||||||
@ -892,6 +908,7 @@ def _partition_pdf_or_image_with_ocr(
|
|||||||
languages: Optional[List[str]] = ["eng"],
|
languages: Optional[List[str]] = ["eng"],
|
||||||
is_image: bool = False,
|
is_image: bool = False,
|
||||||
metadata_last_modified: Optional[str] = None,
|
metadata_last_modified: Optional[str] = None,
|
||||||
|
starting_page_number: int = 1,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
):
|
):
|
||||||
"""Partitions an image or PDF using OCR. For PDFs, each page is converted
|
"""Partitions an image or PDF using OCR. For PDFs, each page is converted
|
||||||
@ -903,18 +920,20 @@ def _partition_pdf_or_image_with_ocr(
|
|||||||
image = PILImage.open(file) if file is not None else PILImage.open(filename)
|
image = PILImage.open(file) if file is not None else PILImage.open(filename)
|
||||||
images.append(image)
|
images.append(image)
|
||||||
|
|
||||||
for i, image in enumerate(images):
|
for page_number, image in enumerate(images, start=starting_page_number):
|
||||||
page_elements = _partition_pdf_or_image_with_ocr_from_image(
|
page_elements = _partition_pdf_or_image_with_ocr_from_image(
|
||||||
image=image,
|
image=image,
|
||||||
languages=languages,
|
languages=languages,
|
||||||
page_number=i + 1,
|
page_number=page_number,
|
||||||
include_page_breaks=include_page_breaks,
|
include_page_breaks=include_page_breaks,
|
||||||
metadata_last_modified=metadata_last_modified,
|
metadata_last_modified=metadata_last_modified,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
)
|
)
|
||||||
elements.extend(page_elements)
|
elements.extend(page_elements)
|
||||||
else:
|
else:
|
||||||
for page_number, image in enumerate(convert_pdf_to_images(filename, file), start=1):
|
for page_number, image in enumerate(
|
||||||
|
convert_pdf_to_images(filename, file), start=starting_page_number
|
||||||
|
):
|
||||||
page_elements = _partition_pdf_or_image_with_ocr_from_image(
|
page_elements = _partition_pdf_or_image_with_ocr_from_image(
|
||||||
image=image,
|
image=image,
|
||||||
languages=languages,
|
languages=languages,
|
||||||
|
|||||||
@ -29,6 +29,7 @@ def partition_ppt(
|
|||||||
languages: Optional[List[str]] = ["auto"],
|
languages: Optional[List[str]] = ["auto"],
|
||||||
detect_language_per_element: bool = False,
|
detect_language_per_element: bool = False,
|
||||||
date_from_file_object: bool = False,
|
date_from_file_object: bool = False,
|
||||||
|
starting_page_number: int = 1,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
) -> List[Element]:
|
) -> List[Element]:
|
||||||
"""Partitions Microsoft PowerPoint Documents in .ppt format into their document elements.
|
"""Partitions Microsoft PowerPoint Documents in .ppt format into their document elements.
|
||||||
@ -59,6 +60,10 @@ def partition_ppt(
|
|||||||
date_from_file_object
|
date_from_file_object
|
||||||
Applies only when providing file via `file` parameter. If this option is True, attempt
|
Applies only when providing file via `file` parameter. If this option is True, attempt
|
||||||
infer last_modified metadata from bytes, otherwise set it to None.
|
infer last_modified metadata from bytes, otherwise set it to None.
|
||||||
|
starting_page_number
|
||||||
|
Indicates what page number should be assigned to the first slide in the presentation.
|
||||||
|
This information will be reflected in elements' metadata and can be especially
|
||||||
|
useful when partitioning a document that is part of a larger document.
|
||||||
"""
|
"""
|
||||||
# Verify that only one of the arguments was provided
|
# Verify that only one of the arguments was provided
|
||||||
if filename is None:
|
if filename is None:
|
||||||
@ -100,6 +105,7 @@ def partition_ppt(
|
|||||||
metadata_last_modified=metadata_last_modified or last_modification_date,
|
metadata_last_modified=metadata_last_modified or last_modification_date,
|
||||||
languages=languages,
|
languages=languages,
|
||||||
detect_language_per_element=detect_language_per_element,
|
detect_language_per_element=detect_language_per_element,
|
||||||
|
starting_page_number=starting_page_number,
|
||||||
)
|
)
|
||||||
|
|
||||||
# remove tmp.name from filename if parsing file
|
# remove tmp.name from filename if parsing file
|
||||||
|
|||||||
@ -92,6 +92,7 @@ def partition_pptx(
|
|||||||
metadata_filename: Optional[str] = None,
|
metadata_filename: Optional[str] = None,
|
||||||
metadata_last_modified: Optional[str] = None,
|
metadata_last_modified: Optional[str] = None,
|
||||||
strategy: str = PartitionStrategy.FAST,
|
strategy: str = PartitionStrategy.FAST,
|
||||||
|
starting_page_number: int = 1,
|
||||||
**kwargs: Any,
|
**kwargs: Any,
|
||||||
) -> list[Element]:
|
) -> list[Element]:
|
||||||
"""Partition PowerPoint document in .pptx format into its document elements.
|
"""Partition PowerPoint document in .pptx format into its document elements.
|
||||||
@ -128,6 +129,10 @@ def partition_pptx(
|
|||||||
date_from_file_object
|
date_from_file_object
|
||||||
Applies only when providing file via `file` parameter. If this option is True, attempt
|
Applies only when providing file via `file` parameter. If this option is True, attempt
|
||||||
infer last_modified metadata from bytes, otherwise set it to None.
|
infer last_modified metadata from bytes, otherwise set it to None.
|
||||||
|
starting_page_number
|
||||||
|
Indicates what page number should be assigned to the first slide in the presentation.
|
||||||
|
This information will be reflected in elements' metadata and can be especially
|
||||||
|
useful when partitioning a document that is part of a larger document.
|
||||||
"""
|
"""
|
||||||
opts = _PptxPartitionerOptions(
|
opts = _PptxPartitionerOptions(
|
||||||
date_from_file_object=date_from_file_object,
|
date_from_file_object=date_from_file_object,
|
||||||
@ -139,6 +144,7 @@ def partition_pptx(
|
|||||||
metadata_file_path=metadata_filename,
|
metadata_file_path=metadata_filename,
|
||||||
metadata_last_modified=metadata_last_modified,
|
metadata_last_modified=metadata_last_modified,
|
||||||
strategy=strategy,
|
strategy=strategy,
|
||||||
|
starting_page_number=starting_page_number,
|
||||||
)
|
)
|
||||||
|
|
||||||
elements = _PptxPartitioner.iter_presentation_elements(opts)
|
elements = _PptxPartitioner.iter_presentation_elements(opts)
|
||||||
@ -369,6 +375,7 @@ class _PptxPartitionerOptions:
|
|||||||
metadata_file_path: Optional[str],
|
metadata_file_path: Optional[str],
|
||||||
metadata_last_modified: Optional[str],
|
metadata_last_modified: Optional[str],
|
||||||
strategy: str,
|
strategy: str,
|
||||||
|
starting_page_number: int = 1,
|
||||||
):
|
):
|
||||||
self._date_from_file_object = date_from_file_object
|
self._date_from_file_object = date_from_file_object
|
||||||
self._file = file
|
self._file = file
|
||||||
@ -380,7 +387,7 @@ class _PptxPartitionerOptions:
|
|||||||
self._metadata_last_modified = metadata_last_modified
|
self._metadata_last_modified = metadata_last_modified
|
||||||
self._strategy = strategy
|
self._strategy = strategy
|
||||||
# -- options object maintains page-number state --
|
# -- options object maintains page-number state --
|
||||||
self._page_counter = 0
|
self._page_counter = starting_page_number - 1
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def register_picture_partitioner(cls, picture_partitioner: AbstractPicturePartitioner):
|
def register_picture_partitioner(cls, picture_partitioner: AbstractPicturePartitioner):
|
||||||
|
|||||||
@ -55,6 +55,7 @@ def partition_xlsx(
|
|||||||
include_header: bool = False,
|
include_header: bool = False,
|
||||||
find_subtable: bool = True,
|
find_subtable: bool = True,
|
||||||
date_from_file_object: bool = False,
|
date_from_file_object: bool = False,
|
||||||
|
starting_page_number: int = 1,
|
||||||
**kwargs: Any,
|
**kwargs: Any,
|
||||||
) -> list[Element]:
|
) -> list[Element]:
|
||||||
"""Partitions Microsoft Excel Documents in .xlsx format into its document elements.
|
"""Partitions Microsoft Excel Documents in .xlsx format into its document elements.
|
||||||
@ -103,7 +104,9 @@ def partition_xlsx(
|
|||||||
)
|
)
|
||||||
|
|
||||||
elements: list[Element] = []
|
elements: list[Element] = []
|
||||||
for page_number, (sheet_name, sheet) in enumerate(opts.sheets.items(), start=1):
|
for page_number, (sheet_name, sheet) in enumerate(
|
||||||
|
opts.sheets.items(), start=starting_page_number
|
||||||
|
):
|
||||||
if not opts.find_subtable:
|
if not opts.find_subtable:
|
||||||
html_text = (
|
html_text = (
|
||||||
sheet.to_html( # pyright: ignore[reportUnknownMemberType]
|
sheet.to_html( # pyright: ignore[reportUnknownMemberType]
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user