rfctr(pptx): extract _PptxPartitionerOptions (#2853)

**Reviewers:** Likely quicker to review commit-by-commit.

**Summary**

In preparation for adding a PPTX `Picture` shape _sub-partitioner_,
extract management of PPTX partitioning-run options to a separate
`_PptxPartitionerOptions` object similar to those used in chunking and
XLSX partitioning. This provides several benefits:
- Extract code dealing with applying defaults and computing derived
values from the main partitioning code, leaving it less cluttered and
focused on the partitioning algorithm itself.
- Allow the options set to be passed to helper objects, prominently
including sub-partitioners, without requiring a long list of parameters
or requiring the caller to couple itself to the particular option values
the helper object requires.
- Allow options behaviors to be thoroughly and efficiently tested in
isolation.
This commit is contained in:
Steve Canny 2024-04-08 12:01:03 -07:00 committed by GitHub
parent a9b6506724
commit 2c7e0289aa
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
11 changed files with 537 additions and 289 deletions

View File

@ -116,13 +116,20 @@ jobs:
- name: Test
env:
UNS_API_KEY: ${{ secrets.UNS_API_KEY }}
TESSERACT_VERSION : "5.3.4"
run: |
source .venv/bin/activate
sudo apt-get update
sudo apt-get install -y libmagic-dev poppler-utils libreoffice
sudo add-apt-repository -y ppa:alex-p/tesseract-ocr5
sudo apt-get update
sudo apt-get install -y tesseract-ocr tesseract-ocr-kor
tesseract --version
installed_tesseract_version=$(tesseract --version | grep -oP '(?<=tesseract )\d+\.\d+\.\d+')
if [ "$installed_tesseract_version" != "${{env.TESSERACT_VERSION}}" ]; then
echo "Tesseract version ${{env.TESSERACT_VERSION}} is required but found version $installed_tesseract_version"
exit 1
fi
# FIXME (yao): sometimes there is cache but we still miss argilla in the env; so we add make install-ci again
make install-ci
make test CI=true UNSTRUCTURED_INCLUDE_DEBUG_METADATA=true
@ -156,6 +163,7 @@ jobs:
sudo apt-get install -y poppler-utils
make install-pandoc
sudo add-apt-repository -y ppa:alex-p/tesseract-ocr5
sudo apt-get update
sudo apt-get install -y tesseract-ocr tesseract-ocr-kor
tesseract --version
make test-chipper CI=true UNSTRUCTURED_INCLUDE_DEBUG_METADATA=true
@ -224,6 +232,7 @@ jobs:
sudo apt-get install -y libmagic-dev poppler-utils libreoffice
make install-pandoc
sudo add-apt-repository -y ppa:alex-p/tesseract-ocr5
sudo apt-get update
sudo apt-get install -y tesseract-ocr tesseract-ocr-kor
tesseract --version
make test-extra-${{ matrix.extra }} CI=true
@ -343,6 +352,7 @@ jobs:
sudo apt-get install -y libmagic-dev poppler-utils libreoffice
make install-pandoc
sudo add-apt-repository -y ppa:alex-p/tesseract-ocr5
sudo apt-get update
sudo apt-get install -y tesseract-ocr
sudo apt-get install -y tesseract-ocr-kor
sudo apt-get install diffstat
@ -408,6 +418,7 @@ jobs:
sudo apt-get install -y libmagic-dev poppler-utils libreoffice
make install-pandoc
sudo add-apt-repository -y ppa:alex-p/tesseract-ocr5
sudo apt-get update
sudo apt-get install -y tesseract-ocr
sudo apt-get install -y tesseract-ocr-kor
sudo apt-get install diffstat
@ -454,6 +465,7 @@ jobs:
make install-ci
sudo apt-get update && sudo apt-get install --yes poppler-utils libreoffice
sudo add-apt-repository -y ppa:alex-p/tesseract-ocr5
sudo apt-get update
sudo apt-get install -y tesseract-ocr tesseract-ocr-kor
tesseract --version
make install-nltk-models

View File

@ -398,7 +398,7 @@ check-flake8-print:
.PHONY: check-ruff
check-ruff:
# -- ruff options are determined by pyproject.toml --
ruff .
ruff check .
.PHONY: check-autoflake
check-autoflake:

View File

@ -1,3 +0,0 @@
PPTX files in this directory are made by hand using PowerPoint. This most faithfully represents source files likely to be used by users of this library. Files produced by `python-pptx` are "cleaner" in certain respects and may pass tests that a "real" PowerPoint file would not.
We may also wish to add files made with LibreOffice since the XML it generates, while consistent with the ISO 29500 spec, can differ enough from that generated by Microsoft PowerPoint to produce different results on certain tests.

View File

@ -2,28 +2,35 @@
"""Test suite for `unstructured.partition.pptx` module."""
import os
from __future__ import annotations
import io
import pathlib
from tempfile import SpooledTemporaryFile
import tempfile
from typing import Any
import pptx
import pytest
from pptx.util import Inches
from pytest_mock import MockFixture
from test_unstructured.unit_utils import assert_round_trips_through_JSON, example_doc_path
from test_unstructured.unit_utils import (
FixtureRequest,
Mock,
assert_round_trips_through_JSON,
example_doc_path,
function_mock,
)
from unstructured.chunking.title import chunk_by_title
from unstructured.documents.elements import (
ElementMetadata,
ListItem,
NarrativeText,
PageBreak,
Text,
Title,
)
from unstructured.partition.pptx import _PptxPartitioner, partition_pptx
DIRECTORY = pathlib.Path(__file__).parent.resolve()
EXAMPLE_DOCS_DIRECTORY = os.path.join(DIRECTORY, "..", "..", "..", "example-docs")
from unstructured.partition.pptx import _PptxPartitionerOptions, partition_pptx
EXPECTED_PPTX_OUTPUT = [
Title(text="Adding a Bullet Slide"),
@ -35,24 +42,18 @@ EXPECTED_PPTX_OUTPUT = [
]
def get_test_file_path(filename: str) -> str:
return str(pathlib.Path(__file__).parent / "test_files" / filename)
# == DescribePptxPartitionerSourceFileBehaviors ==================================================
# == document file behaviors =====================================================================
def test_partition_pptx_from_filename():
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point.pptx")
elements = partition_pptx(filename=filename)
elements = partition_pptx(example_doc_path("fake-power-point.pptx"))
assert elements == EXPECTED_PPTX_OUTPUT
for element in elements:
assert element.metadata.filename == "fake-power-point.pptx"
def test_partition_pptx_from_filename_with_metadata_filename():
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point.pptx")
elements = partition_pptx(filename=filename, metadata_filename="test")
elements = partition_pptx(example_doc_path("fake-power-point.pptx"), metadata_filename="test")
assert elements == EXPECTED_PPTX_OUTPUT
for element in elements:
assert element.metadata.filename == "test"
@ -63,11 +64,8 @@ def test_partition_pptx_with_spooled_file():
Including one that does not have its read-pointer set to the start.
"""
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point.pptx")
from tempfile import SpooledTemporaryFile
with open(filename, "rb") as test_file:
spooled_temp_file = SpooledTemporaryFile()
with open(example_doc_path("fake-power-point.pptx"), "rb") as test_file:
spooled_temp_file = tempfile.SpooledTemporaryFile()
spooled_temp_file.write(test_file.read())
elements = partition_pptx(file=spooled_temp_file)
assert elements == EXPECTED_PPTX_OUTPUT
@ -76,8 +74,7 @@ def test_partition_pptx_with_spooled_file():
def test_partition_pptx_from_file():
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point.pptx")
with open(filename, "rb") as f:
with open(example_doc_path("fake-power-point.pptx"), "rb") as f:
elements = partition_pptx(file=f)
assert elements == EXPECTED_PPTX_OUTPUT
for element in elements:
@ -85,37 +82,24 @@ def test_partition_pptx_from_file():
def test_partition_pptx_from_file_with_metadata_filename():
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point.pptx")
with open(filename, "rb") as f:
with open(example_doc_path("fake-power-point.pptx"), "rb") as f:
elements = partition_pptx(file=f, metadata_filename="test")
assert elements == EXPECTED_PPTX_OUTPUT
for element in elements:
assert element.metadata.filename == "test"
def test_partition_pptx_raises_with_both_specified():
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point.pptx")
with open(filename, "rb") as f, pytest.raises(ValueError):
partition_pptx(filename=filename, file=f)
def test_partition_pptx_raises_with_neither():
with pytest.raises(ValueError):
partition_pptx()
class DescribePptxPartitionerShapeOrderingBehaviors:
"""Tests related to shape inclusion and ordering based on position."""
def it_recurses_into_group_shapes(self):
elements = _PptxPartitioner(
get_test_file_path("group-shapes-nested.pptx")
)._iter_presentation_elements()
assert [e.text for e in elements] == ["A", "B", "C", "D", "E", "F", "G", "H", "I", "J"]
def test_partition_pptx_recurses_into_group_shapes():
elements = partition_pptx(example_doc_path("group-shapes-nested.pptx"))
assert [e.text for e in elements] == ["A", "B", "C", "D", "E", "F", "G", "H", "I", "J"]
# == DescribePptxPartitionerPageBreakBehaviors ===================================================
# == page-break behaviors ========================================================================
def test_partition_pptx_adds_page_breaks(tmp_path: pathlib.Path):
@ -180,8 +164,7 @@ def test_partition_pptx_page_breaks_toggle_off(tmp_path: pathlib.Path):
def test_partition_pptx_many_pages():
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point-many-pages.pptx")
elements = partition_pptx(filename=filename)
elements = partition_pptx(example_doc_path("fake-power-point-many-pages.pptx"))
# The page_number of PageBreak is None
assert set(filter(None, (elt.metadata.page_number for elt in elements))) == {1, 2}
@ -189,7 +172,7 @@ def test_partition_pptx_many_pages():
assert element.metadata.filename == "fake-power-point-many-pages.pptx"
# == DescribePptxPartitionerMiscellaneousBehaviors ===============================================
# == miscellaneous behaviors =====================================================================
def test_partition_pptx_orders_elements(tmp_path: pathlib.Path):
@ -237,38 +220,31 @@ def test_partition_pptx_orders_elements(tmp_path: pathlib.Path):
assert element.metadata.filename == "test-ordering.pptx"
EXPECTED_HTML_TABLE = """<table>
<thead>
<tr><th>Column 1 </th><th>Column 2 </th><th>Column 3 </th></tr>
</thead>
<tbody>
<tr><td>Red </td><td>Green </td><td>Blue </td></tr>
<tr><td>Purple </td><td>Orange </td><td>Yellow </td></tr>
<tr><td>Tangerine </td><td>Pink </td><td>Aqua </td></tr>
</tbody>
</table>"""
def test_partition_pptx_grabs_tables():
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point-table.pptx")
elements = partition_pptx(filename=filename)
elements = partition_pptx(example_doc_path("fake-power-point-table.pptx"))
assert elements[1].text.startswith("Column 1")
assert elements[1].text.strip().endswith("Aqua")
assert elements[1].metadata.text_as_html == EXPECTED_HTML_TABLE
assert elements[1].metadata.text_as_html == (
"<table>\n"
"<thead>\n"
"<tr><th>Column 1 </th><th>Column 2 </th><th>Column 3 </th></tr>\n"
"</thead>\n"
"<tbody>\n"
"<tr><td>Red </td><td>Green </td><td>Blue </td></tr>\n"
"<tr><td>Purple </td><td>Orange </td><td>Yellow </td></tr>\n"
"<tr><td>Tangerine </td><td>Pink </td><td>Aqua </td></tr>\n"
"</tbody>\n"
"</table>"
)
assert elements[1].metadata.filename == "fake-power-point-table.pptx"
@pytest.mark.parametrize(
"infer_table_structure",
[
True,
False,
],
)
def test_partition_pptx_infer_table_structure(infer_table_structure):
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point-table.pptx")
elements = partition_pptx(filename=filename, infer_table_structure=infer_table_structure)
@pytest.mark.parametrize("infer_table_structure", [True, False])
def test_partition_pptx_infer_table_structure(infer_table_structure: bool):
elements = partition_pptx(
example_doc_path("fake-power-point-table.pptx"), infer_table_structure=infer_table_structure
)
table_element_has_text_as_html_field = (
hasattr(elements[1].metadata, "text_as_html")
and elements[1].metadata.text_as_html is not None
@ -277,8 +253,7 @@ def test_partition_pptx_infer_table_structure(infer_table_structure):
def test_partition_pptx_malformed():
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point-malformed.pptx")
elements = partition_pptx(filename=filename)
elements = partition_pptx(example_doc_path("fake-power-point-malformed.pptx"))
assert elements[0].text == "Problem Date Placeholder"
assert elements[1].text == "Test Slide"
@ -289,106 +264,69 @@ def test_partition_pptx_malformed():
# == DescribePptxPartitionerMetadataBehaviors ====================================================
def test_partition_pptx_from_filename_exclude_metadata():
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point.pptx")
elements = partition_pptx(filename=filename, include_metadata=False)
assert elements == EXPECTED_PPTX_OUTPUT
def test_partition_pptx_from_file_exclude_metadata():
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point.pptx")
with open(filename, "rb") as f:
elements = partition_pptx(file=f, include_metadata=False)
assert elements == EXPECTED_PPTX_OUTPUT
def test_partition_pptx_metadata_date(mocker: MockFixture):
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point-malformed.pptx")
mocked_last_modification_date = "2029-07-05T09:24:28"
mocker.patch(
"unstructured.partition.pptx.get_last_modified_date",
return_value=mocked_last_modification_date,
"unstructured.partition.pptx.get_last_modified_date", return_value="2029-07-05T09:24:28"
)
elements = partition_pptx(
filename=filename,
)
elements = partition_pptx(example_doc_path("fake-power-point-malformed.pptx"))
assert elements[0].metadata.last_modified == mocked_last_modification_date
assert elements[0].metadata.last_modified == "2029-07-05T09:24:28"
def test_partition_pptx_with_custom_metadata_date(mocker: MockFixture):
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point-malformed.pptx")
mocked_last_modification_date = "2029-07-05T09:24:28"
expected_last_modification_date = "2020-07-05T09:24:28"
mocker.patch(
"unstructured.partition.pptx.get_last_modified_date",
return_value=mocked_last_modification_date,
"unstructured.partition.pptx.get_last_modified_date", return_value="2022-11-22T11:22:33"
)
elements = partition_pptx(
filename=filename,
metadata_last_modified=expected_last_modification_date,
example_doc_path("fake-power-point-malformed.pptx"),
metadata_last_modified="2024-04-03T20:16:03",
)
assert elements[0].metadata.last_modified == expected_last_modification_date
assert elements[0].metadata.last_modified == "2024-04-03T20:16:03"
def test_partition_pptx_from_file_metadata_date(mocker: MockFixture):
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point-malformed.pptx")
mocked_last_modification_date = "2029-07-05T09:24:28"
mocker.patch(
"unstructured.partition.pptx.get_last_modified_date_from_file",
return_value=mocked_last_modification_date,
return_value="2029-07-05T09:24:28",
)
with open(filename, "rb") as f:
elements = partition_pptx(
file=f,
)
with open(example_doc_path("fake-power-point-malformed.pptx"), "rb") as f:
elements = partition_pptx(file=f)
assert elements[0].metadata.last_modified is None
def test_partition_pptx_from_file_explicit_get_metadata_date(mocker: MockFixture):
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point-malformed.pptx")
mocked_last_modification_date = "2029-07-05T09:24:28"
mocker.patch(
"unstructured.partition.pptx.get_last_modified_date_from_file",
return_value=mocked_last_modification_date,
return_value="2029-07-05T09:24:28",
)
with open(filename, "rb") as f:
with open(example_doc_path("fake-power-point-malformed.pptx"), "rb") as f:
elements = partition_pptx(file=f, date_from_file_object=True)
assert elements[0].metadata.last_modified == mocked_last_modification_date
assert elements[0].metadata.last_modified == "2029-07-05T09:24:28"
def test_partition_pptx_from_file_with_custom_metadata_date(mocker: MockFixture):
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point-malformed.pptx")
mocked_last_modification_date = "2029-07-05T09:24:28"
expected_last_modification_date = "2020-07-05T09:24:28"
mocker.patch(
"unstructured.partition.pptx.get_last_modified_date_from_file",
return_value=mocked_last_modification_date,
return_value="2022-11-22T11:22:33",
)
with open(filename, "rb") as f:
elements = partition_pptx(file=f, metadata_last_modified=expected_last_modification_date)
with open(example_doc_path("fake-power-point-malformed.pptx"), "rb") as f:
elements = partition_pptx(file=f, metadata_last_modified="2024-04-03T20:16:03")
assert elements[0].metadata.last_modified == expected_last_modification_date
assert elements[0].metadata.last_modified == "2024-04-03T20:16:03"
def test_partition_pptx_from_file_without_metadata_date():
"""Test partition_pptx() with a file from which the last-modified date cannot be determined."""
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point-malformed.pptx")
with open(filename, "rb") as f:
sf = SpooledTemporaryFile()
with open(example_doc_path("fake-power-point-malformed.pptx"), "rb") as f:
sf = tempfile.SpooledTemporaryFile()
sf.write(f.read())
sf.seek(0)
elements = partition_pptx(file=sf, date_from_file_object=True)
@ -397,14 +335,15 @@ def test_partition_pptx_from_file_without_metadata_date():
def test_partition_pptx_element_metadata_has_languages():
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point.pptx")
elements = partition_pptx(filename=filename)
elements = partition_pptx(example_doc_path("fake-power-point.pptx"))
assert elements[0].metadata.languages == ["eng"]
def test_partition_pptx_respects_detect_language_per_element():
filename = "example-docs/language-docs/eng_spa_mult.pptx"
elements = partition_pptx(filename=filename, detect_language_per_element=True)
elements = partition_pptx(
example_doc_path("language-docs/eng_spa_mult.pptx"), detect_language_per_element=True
)
langs = [element.metadata.languages for element in elements]
# languages other than English and Spanish are detected by this partitioner,
# so this test is slightly different from the other partition tests
@ -415,8 +354,7 @@ def test_partition_pptx_respects_detect_language_per_element():
def test_partition_pptx_raises_TypeError_for_invalid_languages():
with pytest.raises(TypeError):
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point.pptx")
partition_pptx(filename=filename, languages="eng") # type: ignore
partition_pptx(example_doc_path("fake-power-point.pptx"), languages="eng") # type: ignore
# == DescribePptxPartitionerDownstreamBehaviors ==================================================
@ -428,10 +366,12 @@ def test_partition_pptx_with_json():
def test_add_chunking_strategy_by_title_on_partition_pptx():
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "science-exploration-1p.pptx")
filename = example_doc_path("science-exploration-1p.pptx")
elements = partition_pptx(filename=filename)
chunk_elements = partition_pptx(filename, chunking_strategy="by_title")
chunks = chunk_by_title(elements)
assert chunk_elements != elements
assert chunk_elements == chunks
@ -444,6 +384,7 @@ def test_partition_pptx_title_shape_detection(tmp_path: pathlib.Path):
prs = pptx.Presentation()
slide = prs.slides.add_slide(prs.slide_layouts[0])
title_shape = slide.shapes.title
assert title_shape is not None
title_shape.text = (
"This is a title, it's a bit long so we can make sure it's not narrative text"
)
@ -479,10 +420,11 @@ def test_partition_pptx_level_detection(tmp_path: pathlib.Path):
slide = prs.slides.add_slide(blank_slide_layout)
shapes = slide.shapes
title_shape = shapes.title
body_shape = shapes.placeholders[1]
assert title_shape is not None
title_shape.text = (
"This is a title, it's a bit long so we can make sure it's not narrative text"
)
body_shape = shapes.placeholders[1]
tf = body_shape.text_frame
tf.text = "this is the root level bullet"
@ -517,7 +459,7 @@ def test_partition_pptx_level_detection(tmp_path: pathlib.Path):
assert isinstance(
element,
test_case[1],
), f"expected {test_case[1]}, got {element.category} for {element.text}"
), f"expected {test_case[1]}, got {type(element).__name__} for {element.text}"
assert (
element.metadata.category_depth == test_case[0]
), f"expected {test_case[0]}, got {element.metadata.category_depth} for {element.text}"
@ -525,8 +467,7 @@ def test_partition_pptx_level_detection(tmp_path: pathlib.Path):
def test_partition_pptx_hierarchy_sample_document():
"""This tests if the hierarchy of the sample document is correctly detected"""
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "sample-presentation.pptx")
elements = partition_pptx(filename=filename)
elements = partition_pptx(example_doc_path("sample-presentation.pptx"))
test_cases = [
# (expected category depth, parent id, child id)
@ -564,3 +505,275 @@ def test_partition_pptx_hierarchy_sample_document():
assert element.metadata.category_depth == test_case[0]
assert element.metadata.parent_id == test_case[1]
assert element.id == test_case[2]
# ================================================================================================
# ISOLATED UNIT TESTS
# ================================================================================================
# These test components used by `partition_pptx()` in isolation such that all edge cases can be
# exercised.
# ================================================================================================
class Describe_PptxPartitionerOptions:
"""Unit-test suite for `unstructured.partition.pptx._PptxPartitionerOptions` objects."""
@pytest.mark.parametrize("arg_value", [True, False])
def it_knows_whether_to_emit_PageBreak_elements_as_part_of_the_output_element_stream(
self, arg_value: bool, opts_args: dict[str, Any]
):
opts_args["include_page_breaks"] = arg_value
opts = _PptxPartitionerOptions(**opts_args)
assert opts.include_page_breaks is arg_value
@pytest.mark.parametrize("arg_value", [True, False])
def it_knows_whether_to_partition_content_found_in_slide_notes(
self, arg_value: bool, opts_args: dict[str, Any]
):
opts_args["include_slide_notes"] = arg_value
opts = _PptxPartitionerOptions(**opts_args)
assert opts.include_slide_notes is arg_value
@pytest.mark.parametrize("arg_value", [True, False])
def it_knows_whether_to_include_text_as_html_in_Table_metadata(
self, arg_value: bool, opts_args: dict[str, Any]
):
opts_args["infer_table_structure"] = arg_value
opts = _PptxPartitionerOptions(**opts_args)
assert opts.infer_table_structure is arg_value
# -- .increment_page_number() ----------------
def it_generates_a_PageBreak_element_when_the_page_number_is_incremented(
self, opts_args: dict[str, Any]
):
opts = _PptxPartitionerOptions(**opts_args)
# -- move to the first slide --
list(opts.increment_page_number())
page_break_iter = opts.increment_page_number()
assert isinstance(next(page_break_iter, None), PageBreak)
assert opts.page_number == 2
with pytest.raises(StopIteration):
next(page_break_iter)
def but_it_does_not_generate_a_PageBreak_element_for_the_first_slide(
self, opts_args: dict[str, Any]
):
opts = _PptxPartitionerOptions(**opts_args)
page_break_iter = opts.increment_page_number()
with pytest.raises(StopIteration):
next(page_break_iter)
assert opts.page_number == 1
def and_it_does_not_generate_a_PageBreak_element_when_include_page_breaks_option_is_off(
self, opts_args: dict[str, Any]
):
opts_args["include_page_breaks"] = False
opts = _PptxPartitionerOptions(**opts_args)
# -- move to the first slide --
list(opts.increment_page_number())
page_break_iter = opts.increment_page_number()
with pytest.raises(StopIteration):
next(page_break_iter)
assert opts.page_number == 2
# -- .last_modified --------------------------
def it_gets_the_last_modified_date_of_the_document_from_the_caller_when_provided(
self, opts_args: dict[str, Any]
):
opts_args["metadata_last_modified"] = "2024-03-05T17:02:53"
opts = _PptxPartitionerOptions(**opts_args)
assert opts.last_modified == "2024-03-05T17:02:53"
def and_it_falls_back_to_the_last_modified_date_of_the_file_when_a_path_is_provided(
self, opts_args: dict[str, Any], get_last_modified_date_: Mock
):
opts_args["file_path"] = "a/b/spreadsheet.pptx"
get_last_modified_date_.return_value = "2024-04-02T20:32:35"
opts = _PptxPartitionerOptions(**opts_args)
last_modified = opts.last_modified
get_last_modified_date_.assert_called_once_with("a/b/spreadsheet.pptx")
assert last_modified == "2024-04-02T20:32:35"
def and_it_falls_back_to_the_last_modified_date_of_the_file_when_a_file_like_object_is_provided(
self, opts_args: dict[str, Any], get_last_modified_date_from_file_: Mock
):
file = io.BytesIO(b"abcdefg")
opts_args["file"] = file
opts_args["date_from_file_object"] = True
get_last_modified_date_from_file_.return_value = "2024-04-02T20:42:07"
opts = _PptxPartitionerOptions(**opts_args)
last_modified = opts.last_modified
get_last_modified_date_from_file_.assert_called_once_with(file)
assert last_modified == "2024-04-02T20:42:07"
def but_it_falls_back_to_None_for_the_last_modified_date_when_date_from_file_object_is_False(
self, opts_args: dict[str, Any], get_last_modified_date_from_file_: Mock
):
file = io.BytesIO(b"abcdefg")
opts_args["file"] = file
opts_args["date_from_file_object"] = False
get_last_modified_date_from_file_.return_value = "2024-04-02T20:42:07"
opts = _PptxPartitionerOptions(**opts_args)
last_modified = opts.last_modified
get_last_modified_date_from_file_.assert_not_called()
assert last_modified is None
# -- .metadata_file_path ---------------------
def it_uses_the_user_provided_file_path_in_the_metadata_when_provided(
self, opts_args: dict[str, Any]
):
opts_args["file_path"] = "x/y/z.pptx"
opts_args["metadata_file_path"] = "a/b/c.pptx"
opts = _PptxPartitionerOptions(**opts_args)
assert opts.metadata_file_path == "a/b/c.pptx"
@pytest.mark.parametrize("file_path", ["u/v/w.pptx", None])
def and_it_falls_back_to_the_document_file_path_otherwise(
self, file_path: str | None, opts_args: dict[str, Any]
):
opts_args["file_path"] = file_path
opts_args["metadata_file_path"] = None
opts = _PptxPartitionerOptions(**opts_args)
assert opts.metadata_file_path == file_path
# -- .page_number ----------------------------
def it_keeps_track_of_the_page_number(self, opts_args: dict[str, Any]):
"""In PPTX, page-number is the slide number."""
opts = _PptxPartitionerOptions(**opts_args)
assert opts.page_number == 0
list(opts.increment_page_number())
assert opts.page_number == 1
list(opts.increment_page_number())
assert opts.page_number == 2
# -- .pptx_file ------------------------------
def it_uses_the_path_to_open_the_presentation_when_file_path_is_provided(
self, opts_args: dict[str, Any]
):
opts_args["file_path"] = "l/m/n.pptx"
opts = _PptxPartitionerOptions(**opts_args)
assert opts.pptx_file == "l/m/n.pptx"
def and_it_uses_a_BytesIO_file_to_replaces_a_SpooledTemporaryFile_provided(
self, opts_args: dict[str, Any]
):
spooled_temp_file = tempfile.SpooledTemporaryFile()
spooled_temp_file.write(b"abcdefg")
opts_args["file"] = spooled_temp_file
opts = _PptxPartitionerOptions(**opts_args)
pptx_file = opts.pptx_file
assert pptx_file is not spooled_temp_file
assert isinstance(pptx_file, io.BytesIO)
assert pptx_file.getvalue() == b"abcdefg"
def and_it_uses_the_provided_file_directly_when_not_a_SpooledTemporaryFile(
self, opts_args: dict[str, Any]
):
file = io.BytesIO(b"abcdefg")
opts_args["file"] = file
opts = _PptxPartitionerOptions(**opts_args)
pptx_file = opts.pptx_file
assert pptx_file is file
assert isinstance(pptx_file, io.BytesIO)
assert pptx_file.getvalue() == b"abcdefg"
def but_it_raises_ValueError_when_neither_a_file_path_or_file_is_provided(
self, opts_args: dict[str, Any]
):
opts = _PptxPartitionerOptions(**opts_args)
with pytest.raises(ValueError, match="No PPTX document specified, either `filename` or "):
opts.pptx_file
# -- .table_metadata -------------------------
def it_can_create_table_metadata(self, opts_args: dict[str, Any]):
opts_args["metadata_file_path"] = "d/e/f.pptx"
opts_args["metadata_last_modified"] = "2024-04-02T19:51:55"
opts = _PptxPartitionerOptions(**opts_args)
# -- move to the first slide --
list(opts.increment_page_number())
metadata = opts.table_metadata(text_as_html="<table><tr/></table>")
assert isinstance(metadata, ElementMetadata)
assert metadata.filename == "f.pptx"
assert metadata.last_modified == "2024-04-02T19:51:55"
assert metadata.page_number == 1
assert metadata.text_as_html == "<table><tr/></table>"
# -- .text_metadata -------------------------
def it_can_create_text_metadata(self, opts_args: dict[str, Any]):
opts_args["metadata_file_path"] = "d/e/f.pptx"
opts_args["metadata_last_modified"] = "2024-04-02T19:56:40"
opts = _PptxPartitionerOptions(**opts_args)
# -- move to the first slide --
list(opts.increment_page_number())
metadata = opts.text_metadata(category_depth=2)
assert isinstance(metadata, ElementMetadata)
assert metadata.filename == "f.pptx"
assert metadata.last_modified == "2024-04-02T19:56:40"
assert metadata.page_number == 1
assert metadata.category_depth == 2
# -- fixtures --------------------------------------------------------------------------------
@pytest.fixture()
def get_last_modified_date_(self, request: FixtureRequest):
return function_mock(request, "unstructured.partition.pptx.get_last_modified_date")
@pytest.fixture()
def get_last_modified_date_from_file_(self, request: FixtureRequest):
return function_mock(
request, "unstructured.partition.pptx.get_last_modified_date_from_file"
)
@pytest.fixture()
def opts_args(self) -> dict[str, Any]:
"""All default arguments for `_PptxPartitionerOptions`.
Individual argument values can be changed to suit each test. Makes construction of opts more
compact for testing purposes.
"""
return {
"date_from_file_object": False,
"file": None,
"file_path": None,
"include_page_breaks": True,
"include_slide_notes": False,
"infer_table_structure": True,
"metadata_file_path": None,
"metadata_last_modified": None,
}

View File

@ -2,5 +2,6 @@ from pptx.shapes.base import BaseShape
from pptx.text.text import TextFrame
class Shape(BaseShape):
text: str
@property
def text_frame(self) -> TextFrame: ...

View File

@ -14,7 +14,12 @@ class _BaseGroupShapes(_BaseShapes):
class GroupShapes(_BaseGroupShapes): ...
class NotesSlideShapes(_BaseShapes): ...
class SlidePlaceholders(ParentedElementProxy):
def __getitem__(self, idx: int) -> Shape: ...
class SlideShapes(_BaseGroupShapes):
def __iter__(self) -> Iterator[BaseShape]: ...
@property
def placeholders(self) -> SlidePlaceholders: ...
@property
def title(self) -> Shape | None: ...

View File

@ -26,6 +26,7 @@ class SlideLayouts(ParentedElementProxy):
def __len__(self) -> int: ...
class Slides(ParentedElementProxy):
def __getitem__(self, idx: int) -> Slide: ...
def __iter__(self) -> Iterator[Slide]: ...
def __len__(self) -> int: ...
def add_slide(self, slide_layout: SlideLayout) -> Slide: ...

View File

@ -5,6 +5,7 @@ from pptx.shapes import Subshape
class TextFrame(Subshape):
text: str
def add_paragraph(self) -> _Paragraph: ...
@property
def paragraphs(self) -> Sequence[_Paragraph]: ...

View File

@ -478,15 +478,14 @@ def convert_ms_office_table_to_text(table: PptxTable, as_html: bool = True) -> s
Returns:
str: A string representation of the input table.
"""
fmt = "html" if as_html else "plain"
rows = list(table.rows)
if len(rows) > 0:
headers = [cell.text for cell in rows[0].cells]
data = [[cell.text for cell in row.cells] for row in rows[1:]]
table_text = tabulate(data, headers=headers, tablefmt=fmt)
else:
table_text = ""
return table_text
if not rows:
return ""
headers = [cell.text for cell in rows[0].cells]
data = [[cell.text for cell in row.cells] for row in rows[1:]]
return tabulate(data, headers=headers, tablefmt="html" if as_html else "plain")
def contains_emoji(s: str) -> bool:

View File

@ -2,7 +2,7 @@ from __future__ import annotations
import io
from tempfile import SpooledTemporaryFile
from typing import IO, Any, Iterator, List, Optional, Sequence, Tuple, Union
from typing import IO, Any, Iterator, Optional, Sequence
import pptx
from pptx.presentation import Presentation
@ -30,7 +30,6 @@ from unstructured.documents.elements import (
from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
from unstructured.partition.common import (
convert_ms_office_table_to_text,
exactly_one,
get_last_modified_date,
get_last_modified_date_from_file,
)
@ -53,16 +52,14 @@ def partition_pptx(
file: Optional[IO[bytes]] = None,
include_page_breaks: bool = True,
metadata_filename: Optional[str] = None,
include_metadata: bool = True,
metadata_last_modified: Optional[str] = None,
include_slide_notes: bool = False,
infer_table_structure: bool = True,
chunking_strategy: Optional[str] = None,
languages: Optional[List[str]] = ["auto"],
languages: Optional[list[str]] = ["auto"],
detect_language_per_element: bool = False,
date_from_file_object: bool = False,
**kwargs: Any,
) -> List[Element]:
) -> list[Element]:
"""Partition PowerPoint document in .pptx format into its document elements.
Parameters
@ -98,28 +95,18 @@ def partition_pptx(
Applies only when providing file via `file` parameter. If this option is True, attempt
infer last_modified metadata from bytes, otherwise set it to None.
"""
# -- verify only one source-file argument was provided --
exactly_one(filename=filename, file=file)
# -- In Python <3.11 SpooledTemporaryFile does not implement ".seekable" which triggers an
# -- exception when Zipfile tries to open it. Both the docx and pptx formats are zip archives,
# -- so we need to work around that bug here.
if isinstance(file, SpooledTemporaryFile):
file.seek(0)
file = io.BytesIO(file.read())
source_file = file or filename
assert source_file is not None
elements = _PptxPartitioner.iter_presentation_elements(
source_file,
include_page_breaks,
include_slide_notes,
infer_table_structure,
metadata_filename,
metadata_last_modified,
opts = _PptxPartitionerOptions(
date_from_file_object=date_from_file_object,
file=file,
file_path=filename,
include_page_breaks=include_page_breaks,
include_slide_notes=include_slide_notes,
infer_table_structure=infer_table_structure,
metadata_file_path=metadata_filename,
metadata_last_modified=metadata_last_modified,
)
elements = _PptxPartitioner.iter_presentation_elements(opts)
elements = apply_lang_metadata(
elements=elements,
languages=languages,
@ -128,52 +115,16 @@ def partition_pptx(
return list(elements)
class _PptxPartitioner: # pyright: ignore[reportUnusedClass]
class _PptxPartitioner:
"""Provides `.partition()` for PowerPoint 2007+ (.pptx) files."""
def __init__(
self,
file: Union[str, IO[bytes]],
# -- having default values for these arguments is not necessary for production uses because
# -- this object is always created by the classmethod. However it simplifies constructing
# -- this object in tests and makes them less sensitive to signature changes.
include_page_breaks: bool = True,
include_slide_notes: bool = False,
infer_table_structure: bool = True,
metadata_filename: Optional[str] = None,
metadata_last_modified: Optional[str] = None,
date_from_file_object: bool = False,
) -> None:
self._file = file
self._include_page_breaks = include_page_breaks
self._include_slide_notes = include_slide_notes
self._infer_table_structure = infer_table_structure
self._metadata_filename = metadata_filename
self._metadata_last_modified = metadata_last_modified
self._page_counter = 0
self._date_from_file_object = date_from_file_object
def __init__(self, opts: _PptxPartitionerOptions):
self._opts = opts
@classmethod
def iter_presentation_elements(
cls,
file: Union[str, IO[bytes]],
include_page_breaks: bool,
include_slide_notes: bool,
infer_table_structure: bool,
metadata_filename: Optional[str],
metadata_last_modified: Optional[str],
date_from_file_object: bool = False,
) -> Iterator[Element]:
def iter_presentation_elements(cls, opts: _PptxPartitionerOptions) -> Iterator[Element]:
"""Partition MS Word documents (.docx format) into its document elements."""
return cls(
file,
include_page_breaks,
include_slide_notes,
infer_table_structure,
metadata_filename,
metadata_last_modified,
date_from_file_object,
)._iter_presentation_elements()
return cls(opts)._iter_presentation_elements()
def _iter_presentation_elements(self) -> Iterator[Element]:
"""Generate each document-element in presentation in document order."""
@ -190,7 +141,7 @@ class _PptxPartitioner: # pyright: ignore[reportUnusedClass]
# -- characteristic of a generator avoids repeated code to form interim results into lists.
for slide in self._presentation.slides:
yield from self._increment_page_number()
yield from self._opts.increment_page_number()
yield from self._iter_maybe_slide_notes(slide)
title_shape, shapes = self._order_shapes(slide)
@ -208,25 +159,6 @@ class _PptxPartitioner: # pyright: ignore[reportUnusedClass]
# -- otherwise ditch it, this would include pictures, charts, connectors (lines),
# -- and free-form shapes (squiggly lines). Lines don't have text.
@lazyproperty
def _filename(self) -> Optional[str]:
"""Suitable for use as metadata.filename, does not necessarily name source-file."""
return (
self._metadata_filename
if self._metadata_filename
else self._file if isinstance(self._file, str) else None
)
def _increment_page_number(self) -> Iterator[PageBreak]:
"""Increment page-number by 1 and generate a PageBreak element if enabled."""
self._page_counter += 1
# -- no page-break before first page --
if self._page_counter < 2:
return
# -- only emit page-breaks when enabled --
if self._include_page_breaks:
yield PageBreak("", detection_origin=DETECTION_ORIGIN)
def _is_bulleted_paragraph(self, paragraph: _Paragraph) -> bool:
"""True when `paragraph` has a bullet-charcter prefix.
@ -238,7 +170,7 @@ class _PptxPartitioner: # pyright: ignore[reportUnusedClass]
def _iter_maybe_slide_notes(self, slide: Slide) -> Iterator[NarrativeText]:
"""Generate zero-or-one NarrativeText element for the slide-notes."""
# -- only emit slide-notes elements when enabled --
if not self._include_slide_notes:
if not self._opts.include_slide_notes:
return
# -- not all slides have a notes slide --
@ -258,22 +190,15 @@ class _PptxPartitioner: # pyright: ignore[reportUnusedClass]
return
yield NarrativeText(
text=notes_text,
metadata=self._text_metadata(),
detection_origin=DETECTION_ORIGIN,
text=notes_text, metadata=self._opts.text_metadata(), detection_origin=DETECTION_ORIGIN
)
def _is_invalid_shape(self, shape: Shape) -> bool:
# NOTE(robinson) - avoid processing shapes that are not on the actual slide
# NOTE - skip check if no top or left position (shape displayed top left)
return bool((shape.top and shape.left) and (shape.top < 0 or shape.left < 0))
def _iter_title_shape_element(self, shape: Shape) -> Iterator[Element]:
"""Generate Title element for each paragraph in title `shape`.
Text is most likely a title, but in the rare case that the title shape was used
for the slide body text, also check for bulleted paragraphs."""
if self._is_invalid_shape(shape):
if self._shape_is_off_slide(shape):
return
depth = 0
@ -286,7 +211,7 @@ class _PptxPartitioner: # pyright: ignore[reportUnusedClass]
bullet_depth = paragraph.level or 0
yield ListItem(
text=text,
metadata=self._text_metadata(category_depth=bullet_depth),
metadata=self._opts.text_metadata(category_depth=bullet_depth),
detection_origin=DETECTION_ORIGIN,
)
elif is_email_address(text):
@ -295,14 +220,14 @@ class _PptxPartitioner: # pyright: ignore[reportUnusedClass]
# increment the category depth by the paragraph increment in the shape
yield Title(
text=text,
metadata=self._text_metadata(category_depth=depth),
metadata=self._opts.text_metadata(category_depth=depth),
detection_origin=DETECTION_ORIGIN,
)
depth += 1 # Cannot enumerate because we want to skip empty paragraphs
def _iter_shape_elements(self, shape: Shape) -> Iterator[Element]:
"""Generate Text or subtype element for each paragraph in `shape`."""
if self._is_invalid_shape(shape):
if self._shape_is_off_slide(shape):
return
for paragraph in shape.text_frame.paragraphs:
@ -311,7 +236,7 @@ class _PptxPartitioner: # pyright: ignore[reportUnusedClass]
continue
level = paragraph.level or 0
metadata = self._text_metadata(category_depth=level)
metadata = self._opts.text_metadata(category_depth=level)
if self._is_bulleted_paragraph(paragraph):
yield ListItem(text=text, metadata=metadata, detection_origin=DETECTION_ORIGIN)
@ -325,7 +250,7 @@ class _PptxPartitioner: # pyright: ignore[reportUnusedClass]
)
elif is_possible_title(text):
# If text is a title but not in the title shape, increment the category depth
metadata = self._text_metadata(category_depth=level + 1)
metadata = self._opts.text_metadata(category_depth=level + 1)
yield Title(text=text, metadata=metadata, detection_origin=DETECTION_ORIGIN)
else:
yield Text(text=text, metadata=metadata, detection_origin=DETECTION_ORIGIN)
@ -339,36 +264,15 @@ class _PptxPartitioner: # pyright: ignore[reportUnusedClass]
if not text_table:
return
html_table = None
if self._infer_table_structure:
if self._opts.infer_table_structure:
html_table = convert_ms_office_table_to_text(graphfrm.table, as_html=True)
yield Table(
text=text_table,
metadata=self._table_metadata(html_table),
metadata=self._opts.table_metadata(html_table),
detection_origin=DETECTION_ORIGIN,
)
@lazyproperty
def _last_modified(self) -> Optional[str]:
"""Last-modified date suitable for use in element metadata."""
# -- if this file was converted from another format, any last-modified date for the file
# -- will be today, so we get it from the conversion step in `._metadata_last_modified`.
if self._metadata_last_modified:
return self._metadata_last_modified
file = self._file
# -- if the file is on the filesystem, get its date from there --
if isinstance(file, str):
return None if file.startswith("/tmp") else get_last_modified_date(file)
# -- otherwise try getting it from the file-like object; this can work if `file` comes from
# -- `with open(abc.pptx, "rb") as file:`, but I can't see folks doing that much when they
# -- can just send us "abc.pptx" instead.
if self._date_from_file_object:
return get_last_modified_date_from_file(file)
return None
def _order_shapes(self, slide: Slide) -> Tuple[Optional[Shape], Sequence[BaseShape]]:
def _order_shapes(self, slide: Slide) -> tuple[Optional[Shape], Sequence[BaseShape]]:
"""Orders the shapes on `slide` from top to bottom and left to right.
Returns the title shape if it exists and the ordered shapes."""
@ -380,38 +284,153 @@ class _PptxPartitioner: # pyright: ignore[reportUnusedClass]
else:
yield shape
def sort_key(shape: BaseShape) -> Tuple[int, int]:
def sort_key(shape: BaseShape) -> tuple[int, int]:
return shape.top or 0, shape.left or 0
return slide.shapes.title, sorted(iter_shapes(slide.shapes), key=sort_key)
@lazyproperty
def _presentation(self) -> Presentation:
    """The python-pptx `Presentation` object for the document being partitioned.

    The source handed to `pptx.Presentation()` is whatever `self._opts.pptx_file` provides.
    """
    pptx_source = self._opts.pptx_file
    return pptx.Presentation(pptx_source)
def _shape_is_off_slide(self, shape: Shape) -> bool:
    """True when `shape` has a negative `top` or `left` position.

    Such a shape starts above or to the left of the slide area and is skipped by the partitioner.
    Only the top and left edges are examined; nothing is checked against the right or bottom
    edge of the slide.

    Returns False when either `shape.top` or `shape.left` is `None`, since the position cannot
    be judged without both values.
    """
    # NOTE(robinson) - avoid processing shapes that are not on the actual slide
    top, left = shape.top, shape.left

    # -- skip check if no top or left position (shape displayed top left). Test against `None`
    # -- explicitly rather than by truthiness; a coordinate of exactly 0 is a real position and
    # -- must not suppress the check of the other coordinate.
    if top is None or left is None:
        return False

    return top < 0 or left < 0
class _PptxPartitionerOptions:
"""Encapsulates partitioning option validation, computation, and application of defaults."""
def __init__(
    self,
    *,
    date_from_file_object: bool,
    file: Optional[IO[bytes]],
    file_path: Optional[str],
    include_page_breaks: bool,
    include_slide_notes: bool,
    infer_table_structure: bool,
    metadata_file_path: Optional[str],
    metadata_last_modified: Optional[str],
):
    """Record the option values for one partitioning run.

    All arguments are keyword-only and are stored as-is; no validation or defaulting happens
    here. The slide (page) counter starts at zero.

    Args:
        date_from_file_object: Whether a last-modified date may be taken from `file`.
        file: File-like object containing the PPTX document, if any.
        file_path: Path to the PPTX document, if any.
        include_page_breaks: Whether `PageBreak` elements are emitted between slides.
        include_slide_notes: Whether slide-notes text is partitioned.
        infer_table_structure: Whether `text_as_html` is computed for tables.
        metadata_file_path: Caller-specified file-path to report in element metadata.
        metadata_last_modified: Caller-specified last-modified date for element metadata.
    """
    # -- document source --
    self._file_path = file_path
    self._file = file

    # -- partitioning behaviors --
    self._include_page_breaks = include_page_breaks
    self._include_slide_notes = include_slide_notes
    self._infer_table_structure = infer_table_structure

    # -- metadata sources and overrides --
    self._metadata_file_path = metadata_file_path
    self._metadata_last_modified = metadata_last_modified
    self._date_from_file_object = date_from_file_object

    # -- mutable state: number of slides started so far --
    self._page_counter = 0
@lazyproperty
def include_page_breaks(self) -> bool:
    """True when `PageBreak` elements should appear in the element-stream.

    This setting governs only whether distinct (text-less) `PageBreak` elements are emitted.
    Page-breaks are still detected, and the page-number is still tracked and recorded in element
    metadata, when it is False.
    """
    emit_page_breaks = self._include_page_breaks
    return emit_page_breaks
@lazyproperty
def include_slide_notes(self) -> bool:
    """True when text found in a slide's notes should be partitioned along with the slide."""
    partition_notes = self._include_slide_notes
    return partition_notes
def increment_page_number(self) -> Iterator[PageBreak]:
    """Advance the page-number by one, generating a `PageBreak` element when appropriate.

    This is a generator, so the counter is only advanced once the caller starts iterating it
    (e.g. with `yield from`). At most one `PageBreak` is generated, and only when page-breaks
    are enabled and this is not the first page.
    """
    self._page_counter += 1

    # -- the first page is not preceded by a page-break --
    is_first_page = self._page_counter < 2

    # -- page-breaks are only emitted when enabled --
    if not is_first_page and self._include_page_breaks:
        yield PageBreak("", detection_origin=DETECTION_ORIGIN)
@lazyproperty
def infer_table_structure(self) -> bool:
    """True when `text_as_html` metadata should be computed and applied for tables."""
    wants_html_tables = self._infer_table_structure
    return wants_html_tables
@lazyproperty
def last_modified(self) -> Optional[str]:
    """Best available last-modified date for the document, `None` when no source provides one.

    Sources are consulted in this order:
      1. the `metadata_last_modified` value given by the caller,
      2. the file named by `file_path`, via `get_last_modified_date()`,
      3. the file-like object, via `get_last_modified_date_from_file()`, but only when
         `date_from_file_object` is True.
    """
    # -- A caller-specified value wins. This covers, for example, a file converted from
    # -- another format, where any last-modified date on the file would be just now.
    caller_value = self._metadata_last_modified
    if caller_value:
        return caller_value

    path = self._file_path
    if path:
        # -- no date is reported for a path under "/tmp" (presumably a transient working
        # -- copy whose filesystem date is not meaningful - TODO confirm) --
        if path.startswith("/tmp"):
            return None
        return get_last_modified_date(path)

    # -- a file-like object is only consulted for a date when the caller opted in --
    if self._file and self._date_from_file_object:
        return get_last_modified_date_from_file(self._file)

    return None
@lazyproperty
def metadata_file_path(self) -> str | None:
    """File-path to report for this document in metadata, `None` when none is available.

    The caller-specified `metadata_file_path` is preferred; the path of the source file is
    used when that value is empty or `None`.
    """
    caller_path = self._metadata_file_path
    return caller_path if caller_path else self._file_path
@property
def _page_number(self) -> Optional[int]:
def page_number(self) -> int:
"""The current page (slide) number."""
return self._page_counter
@lazyproperty
def _presentation(self) -> Presentation:
"""The python-pptx `Presentation` object loaded from the provided source file."""
return pptx.Presentation(self._file)
def pptx_file(self) -> str | IO[bytes]:
"""The PowerPoint document file to be partitioned.
def _table_metadata(self, text_as_html: str):
This is either a str path or a file-like object. `python-pptx` accepts either for opening a
presentation file.
"""
if self._file_path:
return self._file_path
# -- In Python <3.11 SpooledTemporaryFile does not implement ".seekable" which triggers an
# -- exception when Zipfile tries to open it. The pptx format is a zip archive so we need
# -- to work around that bug here.
if isinstance(self._file, SpooledTemporaryFile):
self._file.seek(0)
return io.BytesIO(self._file.read())
if self._file:
return self._file
raise ValueError(
"No PPTX document specified, either `filename` or `file` argument must be provided"
)
def table_metadata(self, text_as_html: str | None):
"""ElementMetadata instance suitable for use with Table element."""
element_metadata = ElementMetadata(
filename=self._filename,
last_modified=self._last_modified,
page_number=self._page_number,
filename=self.metadata_file_path,
last_modified=self.last_modified,
page_number=self.page_number,
text_as_html=text_as_html,
)
element_metadata.detection_origin = DETECTION_ORIGIN
return element_metadata
def _text_metadata(self, category_depth: int = 0) -> ElementMetadata:
def text_metadata(self, category_depth: int = 0) -> ElementMetadata:
"""ElementMetadata instance suitable for use with Text and subtypes."""
element_metadata = ElementMetadata(
filename=self._filename,
last_modified=self._last_modified,
page_number=self._page_number,
filename=self.metadata_file_path,
last_modified=self.last_modified,
page_number=self.page_number,
category_depth=category_depth,
)
element_metadata.detection_origin = DETECTION_ORIGIN