mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-12-24 21:55:33 +00:00
rfctr(pptx): extract _PptxPartitionerOptions (#2853)
**Reviewers:** Likely quicker to review commit-by-commit.

**Summary**

In preparation for adding a PPTX `Picture` shape _sub-partitioner_, extract management of PPTX partitioning-run options to a separate `_PptxPartitionerOptions` object similar to those used in chunking and XLSX partitioning. This provides several benefits:

- Extract code dealing with applying defaults and computing derived values from the main partitioning code, leaving it less cluttered and focused on the partitioning algorithm itself.
- Allow the options set to be passed to helper objects, prominently including sub-partitioners, without requiring a long list of parameters or requiring the caller to couple itself to the particular option values the helper object requires.
- Allow options behaviors to be thoroughly and efficiently tested in isolation.
This commit is contained in:
parent
a9b6506724
commit
2c7e0289aa
12
.github/workflows/ci.yml
vendored
12
.github/workflows/ci.yml
vendored
@ -116,13 +116,20 @@ jobs:
|
||||
- name: Test
|
||||
env:
|
||||
UNS_API_KEY: ${{ secrets.UNS_API_KEY }}
|
||||
TESSERACT_VERSION : "5.3.4"
|
||||
run: |
|
||||
source .venv/bin/activate
|
||||
sudo apt-get update
|
||||
sudo apt-get install -y libmagic-dev poppler-utils libreoffice
|
||||
sudo add-apt-repository -y ppa:alex-p/tesseract-ocr5
|
||||
sudo apt-get update
|
||||
sudo apt-get install -y tesseract-ocr tesseract-ocr-kor
|
||||
tesseract --version
|
||||
installed_tesseract_version=$(tesseract --version | grep -oP '(?<=tesseract )\d+\.\d+\.\d+')
|
||||
if [ "$installed_tesseract_version" != "${{env.TESSERACT_VERSION}}" ]; then
|
||||
echo "Tesseract version ${{env.TESSERACT_VERSION}} is required but found version $installed_tesseract_version"
|
||||
exit 1
|
||||
fi
|
||||
# FIXME (yao): sometimes the cache is restored but argilla is still missing from the env, so we run `make install-ci` again
|
||||
make install-ci
|
||||
make test CI=true UNSTRUCTURED_INCLUDE_DEBUG_METADATA=true
|
||||
@ -156,6 +163,7 @@ jobs:
|
||||
sudo apt-get install -y poppler-utils
|
||||
make install-pandoc
|
||||
sudo add-apt-repository -y ppa:alex-p/tesseract-ocr5
|
||||
sudo apt-get update
|
||||
sudo apt-get install -y tesseract-ocr tesseract-ocr-kor
|
||||
tesseract --version
|
||||
make test-chipper CI=true UNSTRUCTURED_INCLUDE_DEBUG_METADATA=true
|
||||
@ -224,6 +232,7 @@ jobs:
|
||||
sudo apt-get install -y libmagic-dev poppler-utils libreoffice
|
||||
make install-pandoc
|
||||
sudo add-apt-repository -y ppa:alex-p/tesseract-ocr5
|
||||
sudo apt-get update
|
||||
sudo apt-get install -y tesseract-ocr tesseract-ocr-kor
|
||||
tesseract --version
|
||||
make test-extra-${{ matrix.extra }} CI=true
|
||||
@ -343,6 +352,7 @@ jobs:
|
||||
sudo apt-get install -y libmagic-dev poppler-utils libreoffice
|
||||
make install-pandoc
|
||||
sudo add-apt-repository -y ppa:alex-p/tesseract-ocr5
|
||||
sudo apt-get update
|
||||
sudo apt-get install -y tesseract-ocr
|
||||
sudo apt-get install -y tesseract-ocr-kor
|
||||
sudo apt-get install diffstat
|
||||
@ -408,6 +418,7 @@ jobs:
|
||||
sudo apt-get install -y libmagic-dev poppler-utils libreoffice
|
||||
make install-pandoc
|
||||
sudo add-apt-repository -y ppa:alex-p/tesseract-ocr5
|
||||
sudo apt-get update
|
||||
sudo apt-get install -y tesseract-ocr
|
||||
sudo apt-get install -y tesseract-ocr-kor
|
||||
sudo apt-get install diffstat
|
||||
@ -454,6 +465,7 @@ jobs:
|
||||
make install-ci
|
||||
sudo apt-get update && sudo apt-get install --yes poppler-utils libreoffice
|
||||
sudo add-apt-repository -y ppa:alex-p/tesseract-ocr5
|
||||
sudo apt-get update
|
||||
sudo apt-get install -y tesseract-ocr tesseract-ocr-kor
|
||||
tesseract --version
|
||||
make install-nltk-models
|
||||
|
||||
2
Makefile
2
Makefile
@ -398,7 +398,7 @@ check-flake8-print:
|
||||
.PHONY: check-ruff
|
||||
check-ruff:
|
||||
# -- ruff options are determined by pyproject.toml --
|
||||
ruff .
|
||||
ruff check .
|
||||
|
||||
.PHONY: check-autoflake
|
||||
check-autoflake:
|
||||
|
||||
@ -1,3 +0,0 @@
|
||||
PPTX files in this directory are made by hand using PowerPoint. This most faithfully represents source files likely to be used by users of this library. Files produced by `python-pptx` are "cleaner" in certain respects and may pass tests that a "real" PowerPoint file would not.
|
||||
|
||||
We may also wish to add files made with LibreOffice since the XML it generates, while consistent with the ISO 29500 spec, can differ enough from that generated by Microsoft PowerPoint to produce different results on certain tests.
|
||||
@ -2,28 +2,35 @@
|
||||
|
||||
"""Test suite for `unstructured.partition.pptx` module."""
|
||||
|
||||
import os
|
||||
from __future__ import annotations
|
||||
|
||||
import io
|
||||
import pathlib
|
||||
from tempfile import SpooledTemporaryFile
|
||||
import tempfile
|
||||
from typing import Any
|
||||
|
||||
import pptx
|
||||
import pytest
|
||||
from pptx.util import Inches
|
||||
from pytest_mock import MockFixture
|
||||
|
||||
from test_unstructured.unit_utils import assert_round_trips_through_JSON, example_doc_path
|
||||
from test_unstructured.unit_utils import (
|
||||
FixtureRequest,
|
||||
Mock,
|
||||
assert_round_trips_through_JSON,
|
||||
example_doc_path,
|
||||
function_mock,
|
||||
)
|
||||
from unstructured.chunking.title import chunk_by_title
|
||||
from unstructured.documents.elements import (
|
||||
ElementMetadata,
|
||||
ListItem,
|
||||
NarrativeText,
|
||||
PageBreak,
|
||||
Text,
|
||||
Title,
|
||||
)
|
||||
from unstructured.partition.pptx import _PptxPartitioner, partition_pptx
|
||||
|
||||
DIRECTORY = pathlib.Path(__file__).parent.resolve()
|
||||
EXAMPLE_DOCS_DIRECTORY = os.path.join(DIRECTORY, "..", "..", "..", "example-docs")
|
||||
from unstructured.partition.pptx import _PptxPartitionerOptions, partition_pptx
|
||||
|
||||
EXPECTED_PPTX_OUTPUT = [
|
||||
Title(text="Adding a Bullet Slide"),
|
||||
@ -35,24 +42,18 @@ EXPECTED_PPTX_OUTPUT = [
|
||||
]
|
||||
|
||||
|
||||
def get_test_file_path(filename: str) -> str:
|
||||
return str(pathlib.Path(__file__).parent / "test_files" / filename)
|
||||
|
||||
|
||||
# == DescribePptxPartitionerSourceFileBehaviors ==================================================
|
||||
# == document file behaviors =====================================================================
|
||||
|
||||
|
||||
def test_partition_pptx_from_filename():
|
||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point.pptx")
|
||||
elements = partition_pptx(filename=filename)
|
||||
elements = partition_pptx(example_doc_path("fake-power-point.pptx"))
|
||||
assert elements == EXPECTED_PPTX_OUTPUT
|
||||
for element in elements:
|
||||
assert element.metadata.filename == "fake-power-point.pptx"
|
||||
|
||||
|
||||
def test_partition_pptx_from_filename_with_metadata_filename():
|
||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point.pptx")
|
||||
elements = partition_pptx(filename=filename, metadata_filename="test")
|
||||
elements = partition_pptx(example_doc_path("fake-power-point.pptx"), metadata_filename="test")
|
||||
assert elements == EXPECTED_PPTX_OUTPUT
|
||||
for element in elements:
|
||||
assert element.metadata.filename == "test"
|
||||
@ -63,11 +64,8 @@ def test_partition_pptx_with_spooled_file():
|
||||
|
||||
Including one that does not have its read-pointer set to the start.
|
||||
"""
|
||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point.pptx")
|
||||
from tempfile import SpooledTemporaryFile
|
||||
|
||||
with open(filename, "rb") as test_file:
|
||||
spooled_temp_file = SpooledTemporaryFile()
|
||||
with open(example_doc_path("fake-power-point.pptx"), "rb") as test_file:
|
||||
spooled_temp_file = tempfile.SpooledTemporaryFile()
|
||||
spooled_temp_file.write(test_file.read())
|
||||
elements = partition_pptx(file=spooled_temp_file)
|
||||
assert elements == EXPECTED_PPTX_OUTPUT
|
||||
@ -76,8 +74,7 @@ def test_partition_pptx_with_spooled_file():
|
||||
|
||||
|
||||
def test_partition_pptx_from_file():
|
||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point.pptx")
|
||||
with open(filename, "rb") as f:
|
||||
with open(example_doc_path("fake-power-point.pptx"), "rb") as f:
|
||||
elements = partition_pptx(file=f)
|
||||
assert elements == EXPECTED_PPTX_OUTPUT
|
||||
for element in elements:
|
||||
@ -85,37 +82,24 @@ def test_partition_pptx_from_file():
|
||||
|
||||
|
||||
def test_partition_pptx_from_file_with_metadata_filename():
|
||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point.pptx")
|
||||
with open(filename, "rb") as f:
|
||||
with open(example_doc_path("fake-power-point.pptx"), "rb") as f:
|
||||
elements = partition_pptx(file=f, metadata_filename="test")
|
||||
assert elements == EXPECTED_PPTX_OUTPUT
|
||||
for element in elements:
|
||||
assert element.metadata.filename == "test"
|
||||
|
||||
|
||||
def test_partition_pptx_raises_with_both_specified():
|
||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point.pptx")
|
||||
with open(filename, "rb") as f, pytest.raises(ValueError):
|
||||
partition_pptx(filename=filename, file=f)
|
||||
|
||||
|
||||
def test_partition_pptx_raises_with_neither():
|
||||
with pytest.raises(ValueError):
|
||||
partition_pptx()
|
||||
|
||||
|
||||
class DescribePptxPartitionerShapeOrderingBehaviors:
|
||||
"""Tests related to shape inclusion and ordering based on position."""
|
||||
|
||||
def it_recurses_into_group_shapes(self):
|
||||
elements = _PptxPartitioner(
|
||||
get_test_file_path("group-shapes-nested.pptx")
|
||||
)._iter_presentation_elements()
|
||||
|
||||
assert [e.text for e in elements] == ["A", "B", "C", "D", "E", "F", "G", "H", "I", "J"]
|
||||
def test_partition_pptx_recurses_into_group_shapes():
|
||||
elements = partition_pptx(example_doc_path("group-shapes-nested.pptx"))
|
||||
assert [e.text for e in elements] == ["A", "B", "C", "D", "E", "F", "G", "H", "I", "J"]
|
||||
|
||||
|
||||
# == DescribePptxPartitionerPageBreakBehaviors ===================================================
|
||||
# == page-break behaviors ========================================================================
|
||||
|
||||
|
||||
def test_partition_pptx_adds_page_breaks(tmp_path: pathlib.Path):
|
||||
@ -180,8 +164,7 @@ def test_partition_pptx_page_breaks_toggle_off(tmp_path: pathlib.Path):
|
||||
|
||||
|
||||
def test_partition_pptx_many_pages():
|
||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point-many-pages.pptx")
|
||||
elements = partition_pptx(filename=filename)
|
||||
elements = partition_pptx(example_doc_path("fake-power-point-many-pages.pptx"))
|
||||
|
||||
# The page_number of PageBreak is None
|
||||
assert set(filter(None, (elt.metadata.page_number for elt in elements))) == {1, 2}
|
||||
@ -189,7 +172,7 @@ def test_partition_pptx_many_pages():
|
||||
assert element.metadata.filename == "fake-power-point-many-pages.pptx"
|
||||
|
||||
|
||||
# == DescribePptxPartitionerMiscellaneousBehaviors ===============================================
|
||||
# == miscellaneous behaviors =====================================================================
|
||||
|
||||
|
||||
def test_partition_pptx_orders_elements(tmp_path: pathlib.Path):
|
||||
@ -237,38 +220,31 @@ def test_partition_pptx_orders_elements(tmp_path: pathlib.Path):
|
||||
assert element.metadata.filename == "test-ordering.pptx"
|
||||
|
||||
|
||||
EXPECTED_HTML_TABLE = """<table>
|
||||
<thead>
|
||||
<tr><th>Column 1 </th><th>Column 2 </th><th>Column 3 </th></tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
<tr><td>Red </td><td>Green </td><td>Blue </td></tr>
|
||||
<tr><td>Purple </td><td>Orange </td><td>Yellow </td></tr>
|
||||
<tr><td>Tangerine </td><td>Pink </td><td>Aqua </td></tr>
|
||||
</tbody>
|
||||
</table>"""
|
||||
|
||||
|
||||
def test_partition_pptx_grabs_tables():
|
||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point-table.pptx")
|
||||
elements = partition_pptx(filename=filename)
|
||||
elements = partition_pptx(example_doc_path("fake-power-point-table.pptx"))
|
||||
|
||||
assert elements[1].text.startswith("Column 1")
|
||||
assert elements[1].text.strip().endswith("Aqua")
|
||||
assert elements[1].metadata.text_as_html == EXPECTED_HTML_TABLE
|
||||
assert elements[1].metadata.text_as_html == (
|
||||
"<table>\n"
|
||||
"<thead>\n"
|
||||
"<tr><th>Column 1 </th><th>Column 2 </th><th>Column 3 </th></tr>\n"
|
||||
"</thead>\n"
|
||||
"<tbody>\n"
|
||||
"<tr><td>Red </td><td>Green </td><td>Blue </td></tr>\n"
|
||||
"<tr><td>Purple </td><td>Orange </td><td>Yellow </td></tr>\n"
|
||||
"<tr><td>Tangerine </td><td>Pink </td><td>Aqua </td></tr>\n"
|
||||
"</tbody>\n"
|
||||
"</table>"
|
||||
)
|
||||
assert elements[1].metadata.filename == "fake-power-point-table.pptx"
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"infer_table_structure",
|
||||
[
|
||||
True,
|
||||
False,
|
||||
],
|
||||
)
|
||||
def test_partition_pptx_infer_table_structure(infer_table_structure):
|
||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point-table.pptx")
|
||||
elements = partition_pptx(filename=filename, infer_table_structure=infer_table_structure)
|
||||
@pytest.mark.parametrize("infer_table_structure", [True, False])
|
||||
def test_partition_pptx_infer_table_structure(infer_table_structure: bool):
|
||||
elements = partition_pptx(
|
||||
example_doc_path("fake-power-point-table.pptx"), infer_table_structure=infer_table_structure
|
||||
)
|
||||
table_element_has_text_as_html_field = (
|
||||
hasattr(elements[1].metadata, "text_as_html")
|
||||
and elements[1].metadata.text_as_html is not None
|
||||
@ -277,8 +253,7 @@ def test_partition_pptx_infer_table_structure(infer_table_structure):
|
||||
|
||||
|
||||
def test_partition_pptx_malformed():
|
||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point-malformed.pptx")
|
||||
elements = partition_pptx(filename=filename)
|
||||
elements = partition_pptx(example_doc_path("fake-power-point-malformed.pptx"))
|
||||
|
||||
assert elements[0].text == "Problem Date Placeholder"
|
||||
assert elements[1].text == "Test Slide"
|
||||
@ -289,106 +264,69 @@ def test_partition_pptx_malformed():
|
||||
# == DescribePptxPartitionerMetadataBehaviors ====================================================
|
||||
|
||||
|
||||
def test_partition_pptx_from_filename_exclude_metadata():
|
||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point.pptx")
|
||||
elements = partition_pptx(filename=filename, include_metadata=False)
|
||||
assert elements == EXPECTED_PPTX_OUTPUT
|
||||
|
||||
|
||||
def test_partition_pptx_from_file_exclude_metadata():
|
||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point.pptx")
|
||||
with open(filename, "rb") as f:
|
||||
elements = partition_pptx(file=f, include_metadata=False)
|
||||
assert elements == EXPECTED_PPTX_OUTPUT
|
||||
|
||||
|
||||
def test_partition_pptx_metadata_date(mocker: MockFixture):
|
||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point-malformed.pptx")
|
||||
mocked_last_modification_date = "2029-07-05T09:24:28"
|
||||
|
||||
mocker.patch(
|
||||
"unstructured.partition.pptx.get_last_modified_date",
|
||||
return_value=mocked_last_modification_date,
|
||||
"unstructured.partition.pptx.get_last_modified_date", return_value="2029-07-05T09:24:28"
|
||||
)
|
||||
|
||||
elements = partition_pptx(
|
||||
filename=filename,
|
||||
)
|
||||
elements = partition_pptx(example_doc_path("fake-power-point-malformed.pptx"))
|
||||
|
||||
assert elements[0].metadata.last_modified == mocked_last_modification_date
|
||||
assert elements[0].metadata.last_modified == "2029-07-05T09:24:28"
|
||||
|
||||
|
||||
def test_partition_pptx_with_custom_metadata_date(mocker: MockFixture):
|
||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point-malformed.pptx")
|
||||
mocked_last_modification_date = "2029-07-05T09:24:28"
|
||||
expected_last_modification_date = "2020-07-05T09:24:28"
|
||||
|
||||
mocker.patch(
|
||||
"unstructured.partition.pptx.get_last_modified_date",
|
||||
return_value=mocked_last_modification_date,
|
||||
"unstructured.partition.pptx.get_last_modified_date", return_value="2022-11-22T11:22:33"
|
||||
)
|
||||
|
||||
elements = partition_pptx(
|
||||
filename=filename,
|
||||
metadata_last_modified=expected_last_modification_date,
|
||||
example_doc_path("fake-power-point-malformed.pptx"),
|
||||
metadata_last_modified="2024-04-03T20:16:03",
|
||||
)
|
||||
|
||||
assert elements[0].metadata.last_modified == expected_last_modification_date
|
||||
assert elements[0].metadata.last_modified == "2024-04-03T20:16:03"
|
||||
|
||||
|
||||
def test_partition_pptx_from_file_metadata_date(mocker: MockFixture):
|
||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point-malformed.pptx")
|
||||
mocked_last_modification_date = "2029-07-05T09:24:28"
|
||||
|
||||
mocker.patch(
|
||||
"unstructured.partition.pptx.get_last_modified_date_from_file",
|
||||
return_value=mocked_last_modification_date,
|
||||
return_value="2029-07-05T09:24:28",
|
||||
)
|
||||
|
||||
with open(filename, "rb") as f:
|
||||
elements = partition_pptx(
|
||||
file=f,
|
||||
)
|
||||
with open(example_doc_path("fake-power-point-malformed.pptx"), "rb") as f:
|
||||
elements = partition_pptx(file=f)
|
||||
|
||||
assert elements[0].metadata.last_modified is None
|
||||
|
||||
|
||||
def test_partition_pptx_from_file_explicit_get_metadata_date(mocker: MockFixture):
|
||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point-malformed.pptx")
|
||||
mocked_last_modification_date = "2029-07-05T09:24:28"
|
||||
|
||||
mocker.patch(
|
||||
"unstructured.partition.pptx.get_last_modified_date_from_file",
|
||||
return_value=mocked_last_modification_date,
|
||||
return_value="2029-07-05T09:24:28",
|
||||
)
|
||||
|
||||
with open(filename, "rb") as f:
|
||||
with open(example_doc_path("fake-power-point-malformed.pptx"), "rb") as f:
|
||||
elements = partition_pptx(file=f, date_from_file_object=True)
|
||||
|
||||
assert elements[0].metadata.last_modified == mocked_last_modification_date
|
||||
assert elements[0].metadata.last_modified == "2029-07-05T09:24:28"
|
||||
|
||||
|
||||
def test_partition_pptx_from_file_with_custom_metadata_date(mocker: MockFixture):
|
||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point-malformed.pptx")
|
||||
mocked_last_modification_date = "2029-07-05T09:24:28"
|
||||
expected_last_modification_date = "2020-07-05T09:24:28"
|
||||
|
||||
mocker.patch(
|
||||
"unstructured.partition.pptx.get_last_modified_date_from_file",
|
||||
return_value=mocked_last_modification_date,
|
||||
return_value="2022-11-22T11:22:33",
|
||||
)
|
||||
|
||||
with open(filename, "rb") as f:
|
||||
elements = partition_pptx(file=f, metadata_last_modified=expected_last_modification_date)
|
||||
with open(example_doc_path("fake-power-point-malformed.pptx"), "rb") as f:
|
||||
elements = partition_pptx(file=f, metadata_last_modified="2024-04-03T20:16:03")
|
||||
|
||||
assert elements[0].metadata.last_modified == expected_last_modification_date
|
||||
assert elements[0].metadata.last_modified == "2024-04-03T20:16:03"
|
||||
|
||||
|
||||
def test_partition_pptx_from_file_without_metadata_date():
|
||||
"""Test partition_pptx() with a file from which the last-modified date cannot be obtained."""
|
||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point-malformed.pptx")
|
||||
with open(filename, "rb") as f:
|
||||
sf = SpooledTemporaryFile()
|
||||
with open(example_doc_path("fake-power-point-malformed.pptx"), "rb") as f:
|
||||
sf = tempfile.SpooledTemporaryFile()
|
||||
sf.write(f.read())
|
||||
sf.seek(0)
|
||||
elements = partition_pptx(file=sf, date_from_file_object=True)
|
||||
@ -397,14 +335,15 @@ def test_partition_pptx_from_file_without_metadata_date():
|
||||
|
||||
|
||||
def test_partition_pptx_element_metadata_has_languages():
|
||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point.pptx")
|
||||
elements = partition_pptx(filename=filename)
|
||||
elements = partition_pptx(example_doc_path("fake-power-point.pptx"))
|
||||
assert elements[0].metadata.languages == ["eng"]
|
||||
|
||||
|
||||
def test_partition_pptx_respects_detect_language_per_element():
|
||||
filename = "example-docs/language-docs/eng_spa_mult.pptx"
|
||||
elements = partition_pptx(filename=filename, detect_language_per_element=True)
|
||||
elements = partition_pptx(
|
||||
example_doc_path("language-docs/eng_spa_mult.pptx"), detect_language_per_element=True
|
||||
)
|
||||
|
||||
langs = [element.metadata.languages for element in elements]
|
||||
# languages other than English and Spanish are detected by this partitioner,
|
||||
# so this test is slightly different from the other partition tests
|
||||
@ -415,8 +354,7 @@ def test_partition_pptx_respects_detect_language_per_element():
|
||||
|
||||
def test_partition_pptx_raises_TypeError_for_invalid_languages():
|
||||
with pytest.raises(TypeError):
|
||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point.pptx")
|
||||
partition_pptx(filename=filename, languages="eng") # type: ignore
|
||||
partition_pptx(example_doc_path("fake-power-point.pptx"), languages="eng") # type: ignore
|
||||
|
||||
|
||||
# == DescribePptxPartitionerDownstreamBehaviors ==================================================
|
||||
@ -428,10 +366,12 @@ def test_partition_pptx_with_json():
|
||||
|
||||
|
||||
def test_add_chunking_strategy_by_title_on_partition_pptx():
|
||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "science-exploration-1p.pptx")
|
||||
filename = example_doc_path("science-exploration-1p.pptx")
|
||||
|
||||
elements = partition_pptx(filename=filename)
|
||||
chunk_elements = partition_pptx(filename, chunking_strategy="by_title")
|
||||
chunks = chunk_by_title(elements)
|
||||
|
||||
assert chunk_elements != elements
|
||||
assert chunk_elements == chunks
|
||||
|
||||
@ -444,6 +384,7 @@ def test_partition_pptx_title_shape_detection(tmp_path: pathlib.Path):
|
||||
prs = pptx.Presentation()
|
||||
slide = prs.slides.add_slide(prs.slide_layouts[0])
|
||||
title_shape = slide.shapes.title
|
||||
assert title_shape is not None
|
||||
title_shape.text = (
|
||||
"This is a title, it's a bit long so we can make sure it's not narrative text"
|
||||
)
|
||||
@ -479,10 +420,11 @@ def test_partition_pptx_level_detection(tmp_path: pathlib.Path):
|
||||
slide = prs.slides.add_slide(blank_slide_layout)
|
||||
shapes = slide.shapes
|
||||
title_shape = shapes.title
|
||||
body_shape = shapes.placeholders[1]
|
||||
assert title_shape is not None
|
||||
title_shape.text = (
|
||||
"This is a title, it's a bit long so we can make sure it's not narrative text"
|
||||
)
|
||||
body_shape = shapes.placeholders[1]
|
||||
|
||||
tf = body_shape.text_frame
|
||||
tf.text = "this is the root level bullet"
|
||||
@ -517,7 +459,7 @@ def test_partition_pptx_level_detection(tmp_path: pathlib.Path):
|
||||
assert isinstance(
|
||||
element,
|
||||
test_case[1],
|
||||
), f"expected {test_case[1]}, got {element.category} for {element.text}"
|
||||
), f"expected {test_case[1]}, got {type(element).__name__} for {element.text}"
|
||||
assert (
|
||||
element.metadata.category_depth == test_case[0]
|
||||
), f"expected {test_case[0]}, got {element.metadata.category_depth} for {element.text}"
|
||||
@ -525,8 +467,7 @@ def test_partition_pptx_level_detection(tmp_path: pathlib.Path):
|
||||
|
||||
def test_partition_pptx_hierarchy_sample_document():
|
||||
"""This tests if the hierarchy of the sample document is correctly detected"""
|
||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "sample-presentation.pptx")
|
||||
elements = partition_pptx(filename=filename)
|
||||
elements = partition_pptx(example_doc_path("sample-presentation.pptx"))
|
||||
|
||||
test_cases = [
|
||||
# (expected category depth, parent id, child id)
|
||||
@ -564,3 +505,275 @@ def test_partition_pptx_hierarchy_sample_document():
|
||||
assert element.metadata.category_depth == test_case[0]
|
||||
assert element.metadata.parent_id == test_case[1]
|
||||
assert element.id == test_case[2]
|
||||
|
||||
|
||||
# ================================================================================================
|
||||
# ISOLATED UNIT TESTS
|
||||
# ================================================================================================
|
||||
# These tests exercise components used by `partition_pptx()` in isolation so that all edge cases can be
|
||||
# exercised.
|
||||
# ================================================================================================
|
||||
|
||||
|
||||
class Describe_PptxPartitionerOptions:
|
||||
"""Unit-test suite for `unstructured.partition.pptx._PptxPartitionerOptions` objects."""
|
||||
|
||||
@pytest.mark.parametrize("arg_value", [True, False])
|
||||
def it_knows_whether_to_emit_PageBreak_elements_as_part_of_the_output_element_stream(
|
||||
self, arg_value: bool, opts_args: dict[str, Any]
|
||||
):
|
||||
opts_args["include_page_breaks"] = arg_value
|
||||
opts = _PptxPartitionerOptions(**opts_args)
|
||||
|
||||
assert opts.include_page_breaks is arg_value
|
||||
|
||||
@pytest.mark.parametrize("arg_value", [True, False])
|
||||
def it_knows_whether_to_partition_content_found_in_slide_notes(
|
||||
self, arg_value: bool, opts_args: dict[str, Any]
|
||||
):
|
||||
opts_args["include_slide_notes"] = arg_value
|
||||
opts = _PptxPartitionerOptions(**opts_args)
|
||||
|
||||
assert opts.include_slide_notes is arg_value
|
||||
|
||||
@pytest.mark.parametrize("arg_value", [True, False])
|
||||
def it_knows_whether_to_include_text_as_html_in_Table_metadata(
|
||||
self, arg_value: bool, opts_args: dict[str, Any]
|
||||
):
|
||||
opts_args["infer_table_structure"] = arg_value
|
||||
opts = _PptxPartitionerOptions(**opts_args)
|
||||
|
||||
assert opts.infer_table_structure is arg_value
|
||||
|
||||
# -- .increment_page_number() ----------------
|
||||
|
||||
def it_generates_a_PageBreak_element_when_the_page_number_is_incremented(
|
||||
self, opts_args: dict[str, Any]
|
||||
):
|
||||
opts = _PptxPartitionerOptions(**opts_args)
|
||||
# -- move to the first slide --
|
||||
list(opts.increment_page_number())
|
||||
|
||||
page_break_iter = opts.increment_page_number()
|
||||
|
||||
assert isinstance(next(page_break_iter, None), PageBreak)
|
||||
assert opts.page_number == 2
|
||||
with pytest.raises(StopIteration):
|
||||
next(page_break_iter)
|
||||
|
||||
def but_it_does_not_generate_a_PageBreak_element_for_the_first_slide(
|
||||
self, opts_args: dict[str, Any]
|
||||
):
|
||||
opts = _PptxPartitionerOptions(**opts_args)
|
||||
|
||||
page_break_iter = opts.increment_page_number()
|
||||
|
||||
with pytest.raises(StopIteration):
|
||||
next(page_break_iter)
|
||||
assert opts.page_number == 1
|
||||
|
||||
def and_it_does_not_generate_a_PageBreak_element_when_include_page_breaks_option_is_off(
|
||||
self, opts_args: dict[str, Any]
|
||||
):
|
||||
opts_args["include_page_breaks"] = False
|
||||
opts = _PptxPartitionerOptions(**opts_args)
|
||||
# -- move to the first slide --
|
||||
list(opts.increment_page_number())
|
||||
|
||||
page_break_iter = opts.increment_page_number()
|
||||
|
||||
with pytest.raises(StopIteration):
|
||||
next(page_break_iter)
|
||||
assert opts.page_number == 2
|
||||
|
||||
# -- .last_modified --------------------------
|
||||
|
||||
def it_gets_the_last_modified_date_of_the_document_from_the_caller_when_provided(
|
||||
self, opts_args: dict[str, Any]
|
||||
):
|
||||
opts_args["metadata_last_modified"] = "2024-03-05T17:02:53"
|
||||
opts = _PptxPartitionerOptions(**opts_args)
|
||||
|
||||
assert opts.last_modified == "2024-03-05T17:02:53"
|
||||
|
||||
def and_it_falls_back_to_the_last_modified_date_of_the_file_when_a_path_is_provided(
|
||||
self, opts_args: dict[str, Any], get_last_modified_date_: Mock
|
||||
):
|
||||
opts_args["file_path"] = "a/b/spreadsheet.pptx"
|
||||
get_last_modified_date_.return_value = "2024-04-02T20:32:35"
|
||||
opts = _PptxPartitionerOptions(**opts_args)
|
||||
|
||||
last_modified = opts.last_modified
|
||||
|
||||
get_last_modified_date_.assert_called_once_with("a/b/spreadsheet.pptx")
|
||||
assert last_modified == "2024-04-02T20:32:35"
|
||||
|
||||
def and_it_falls_back_to_the_last_modified_date_of_the_file_when_a_file_like_object_is_provided(
|
||||
self, opts_args: dict[str, Any], get_last_modified_date_from_file_: Mock
|
||||
):
|
||||
file = io.BytesIO(b"abcdefg")
|
||||
opts_args["file"] = file
|
||||
opts_args["date_from_file_object"] = True
|
||||
get_last_modified_date_from_file_.return_value = "2024-04-02T20:42:07"
|
||||
opts = _PptxPartitionerOptions(**opts_args)
|
||||
|
||||
last_modified = opts.last_modified
|
||||
|
||||
get_last_modified_date_from_file_.assert_called_once_with(file)
|
||||
assert last_modified == "2024-04-02T20:42:07"
|
||||
|
||||
def but_it_falls_back_to_None_for_the_last_modified_date_when_date_from_file_object_is_False(
|
||||
self, opts_args: dict[str, Any], get_last_modified_date_from_file_: Mock
|
||||
):
|
||||
file = io.BytesIO(b"abcdefg")
|
||||
opts_args["file"] = file
|
||||
opts_args["date_from_file_object"] = False
|
||||
get_last_modified_date_from_file_.return_value = "2024-04-02T20:42:07"
|
||||
opts = _PptxPartitionerOptions(**opts_args)
|
||||
|
||||
last_modified = opts.last_modified
|
||||
|
||||
get_last_modified_date_from_file_.assert_not_called()
|
||||
assert last_modified is None
|
||||
|
||||
# -- .metadata_file_path ---------------------
|
||||
|
||||
def it_uses_the_user_provided_file_path_in_the_metadata_when_provided(
|
||||
self, opts_args: dict[str, Any]
|
||||
):
|
||||
opts_args["file_path"] = "x/y/z.pptx"
|
||||
opts_args["metadata_file_path"] = "a/b/c.pptx"
|
||||
opts = _PptxPartitionerOptions(**opts_args)
|
||||
|
||||
assert opts.metadata_file_path == "a/b/c.pptx"
|
||||
|
||||
@pytest.mark.parametrize("file_path", ["u/v/w.pptx", None])
|
||||
def and_it_falls_back_to_the_document_file_path_otherwise(
|
||||
self, file_path: str | None, opts_args: dict[str, Any]
|
||||
):
|
||||
opts_args["file_path"] = file_path
|
||||
opts_args["metadata_file_path"] = None
|
||||
opts = _PptxPartitionerOptions(**opts_args)
|
||||
|
||||
assert opts.metadata_file_path == file_path
|
||||
|
||||
# -- .page_number ----------------------------
|
||||
|
||||
def it_keeps_track_of_the_page_number(self, opts_args: dict[str, Any]):
|
||||
"""In PPTX, page-number is the slide number."""
|
||||
opts = _PptxPartitionerOptions(**opts_args)
|
||||
|
||||
assert opts.page_number == 0
|
||||
list(opts.increment_page_number())
|
||||
assert opts.page_number == 1
|
||||
list(opts.increment_page_number())
|
||||
assert opts.page_number == 2
|
||||
|
||||
# -- .pptx_file ------------------------------
|
||||
|
||||
def it_uses_the_path_to_open_the_presentation_when_file_path_is_provided(
|
||||
self, opts_args: dict[str, Any]
|
||||
):
|
||||
opts_args["file_path"] = "l/m/n.pptx"
|
||||
opts = _PptxPartitionerOptions(**opts_args)
|
||||
|
||||
assert opts.pptx_file == "l/m/n.pptx"
|
||||
|
||||
def and_it_uses_a_BytesIO_file_to_replaces_a_SpooledTemporaryFile_provided(
|
||||
self, opts_args: dict[str, Any]
|
||||
):
|
||||
spooled_temp_file = tempfile.SpooledTemporaryFile()
|
||||
spooled_temp_file.write(b"abcdefg")
|
||||
opts_args["file"] = spooled_temp_file
|
||||
opts = _PptxPartitionerOptions(**opts_args)
|
||||
|
||||
pptx_file = opts.pptx_file
|
||||
|
||||
assert pptx_file is not spooled_temp_file
|
||||
assert isinstance(pptx_file, io.BytesIO)
|
||||
assert pptx_file.getvalue() == b"abcdefg"
|
||||
|
||||
def and_it_uses_the_provided_file_directly_when_not_a_SpooledTemporaryFile(
    self, opts_args: dict[str, Any]
):
    """Any other file-like object is handed back unchanged (same object, same content)."""
    payload = b"abcdefg"
    file = io.BytesIO(payload)
    opts_args["file"] = file

    pptx_file = _PptxPartitionerOptions(**opts_args).pptx_file

    assert pptx_file is file
    assert isinstance(pptx_file, io.BytesIO)
    assert pptx_file.getvalue() == payload
|
||||
|
||||
def but_it_raises_ValueError_when_neither_a_file_path_or_file_is_provided(
    self, opts_args: dict[str, Any]
):
    """With the default (all-`None`) source arguments, reading `.pptx_file` raises."""
    expected_message = "No PPTX document specified, either `filename` or "
    opts = _PptxPartitionerOptions(**opts_args)

    with pytest.raises(ValueError, match=expected_message):
        opts.pptx_file
|
||||
|
||||
# -- .table_metadata -------------------------
|
||||
|
||||
def it_can_create_table_metadata(self, opts_args: dict[str, Any]):
    """`.table_metadata()` carries filename, last-modified, page-number and the table HTML."""
    html = "<table><tr/></table>"
    last_modified = "2024-04-02T19:51:55"
    opts_args.update(metadata_file_path="d/e/f.pptx", metadata_last_modified=last_modified)
    opts = _PptxPartitionerOptions(**opts_args)
    # -- move to the first slide --
    list(opts.increment_page_number())

    metadata = opts.table_metadata(text_as_html=html)

    assert isinstance(metadata, ElementMetadata)
    assert metadata.filename == "f.pptx"
    assert metadata.last_modified == last_modified
    assert metadata.page_number == 1
    assert metadata.text_as_html == html
|
||||
|
||||
# -- .text_metadata -------------------------
|
||||
|
||||
def it_can_create_text_metadata(self, opts_args: dict[str, Any]):
    """`.text_metadata()` carries filename, last-modified, page-number and category-depth."""
    last_modified = "2024-04-02T19:56:40"
    opts_args.update(metadata_file_path="d/e/f.pptx", metadata_last_modified=last_modified)
    opts = _PptxPartitionerOptions(**opts_args)
    # -- move to the first slide --
    list(opts.increment_page_number())

    metadata = opts.text_metadata(category_depth=2)

    assert isinstance(metadata, ElementMetadata)
    assert metadata.filename == "f.pptx"
    assert metadata.last_modified == last_modified
    assert metadata.page_number == 1
    assert metadata.category_depth == 2
|
||||
|
||||
# -- fixtures --------------------------------------------------------------------------------
|
||||
|
||||
@pytest.fixture()
|
||||
def get_last_modified_date_(self, request: FixtureRequest):
    """Result of `function_mock()` for `get_last_modified_date` in the PPTX partition module."""
    target = "unstructured.partition.pptx.get_last_modified_date"
    return function_mock(request, target)
|
||||
|
||||
@pytest.fixture()
|
||||
def get_last_modified_date_from_file_(self, request: FixtureRequest):
    """Result of `function_mock()` for `get_last_modified_date_from_file` in the PPTX module."""
    target = "unstructured.partition.pptx.get_last_modified_date_from_file"
    return function_mock(request, target)
|
||||
|
||||
@pytest.fixture()
|
||||
def opts_args(self) -> dict[str, Any]:
    """All default arguments for `_PptxPartitionerOptions`.

    Individual argument values can be changed to suit each test. Makes construction of opts more
    compact for testing purposes.

    A new dict is built on every call, so a test that mutates its copy does not affect others.
    """
    return {
        "date_from_file_object": False,
        "file": None,
        "file_path": None,
        "include_page_breaks": True,
        "include_slide_notes": False,
        "infer_table_structure": True,
        "metadata_file_path": None,
        "metadata_last_modified": None,
    }
|
||||
|
||||
@ -2,5 +2,6 @@ from pptx.shapes.base import BaseShape
|
||||
from pptx.text.text import TextFrame
|
||||
|
||||
class Shape(BaseShape):
|
||||
text: str
|
||||
@property
|
||||
def text_frame(self) -> TextFrame: ...
|
||||
|
||||
@ -14,7 +14,12 @@ class _BaseGroupShapes(_BaseShapes):
|
||||
class GroupShapes(_BaseGroupShapes): ...
|
||||
class NotesSlideShapes(_BaseShapes): ...
|
||||
|
||||
class SlidePlaceholders(ParentedElementProxy):
|
||||
def __getitem__(self, idx: int) -> Shape: ...
|
||||
|
||||
class SlideShapes(_BaseGroupShapes):
|
||||
def __iter__(self) -> Iterator[BaseShape]: ...
|
||||
@property
|
||||
def placeholders(self) -> SlidePlaceholders: ...
|
||||
@property
|
||||
def title(self) -> Shape | None: ...
|
||||
|
||||
@ -26,6 +26,7 @@ class SlideLayouts(ParentedElementProxy):
|
||||
def __len__(self) -> int: ...
|
||||
|
||||
class Slides(ParentedElementProxy):
|
||||
def __getitem__(self, idx: int) -> Slide: ...
|
||||
def __iter__(self) -> Iterator[Slide]: ...
|
||||
def __len__(self) -> int: ...
|
||||
def add_slide(self, slide_layout: SlideLayout) -> Slide: ...
|
||||
|
||||
@ -5,6 +5,7 @@ from pptx.shapes import Subshape
|
||||
|
||||
class TextFrame(Subshape):
|
||||
text: str
|
||||
def add_paragraph(self) -> _Paragraph: ...
|
||||
@property
|
||||
def paragraphs(self) -> Sequence[_Paragraph]: ...
|
||||
|
||||
|
||||
@ -478,15 +478,14 @@ def convert_ms_office_table_to_text(table: PptxTable, as_html: bool = True) -> s
|
||||
Returns:
|
||||
str: A string representation of the input table.
|
||||
"""
|
||||
fmt = "html" if as_html else "plain"
|
||||
rows = list(table.rows)
|
||||
if len(rows) > 0:
|
||||
headers = [cell.text for cell in rows[0].cells]
|
||||
data = [[cell.text for cell in row.cells] for row in rows[1:]]
|
||||
table_text = tabulate(data, headers=headers, tablefmt=fmt)
|
||||
else:
|
||||
table_text = ""
|
||||
return table_text
|
||||
|
||||
if not rows:
|
||||
return ""
|
||||
|
||||
headers = [cell.text for cell in rows[0].cells]
|
||||
data = [[cell.text for cell in row.cells] for row in rows[1:]]
|
||||
return tabulate(data, headers=headers, tablefmt="html" if as_html else "plain")
|
||||
|
||||
|
||||
def contains_emoji(s: str) -> bool:
|
||||
|
||||
@ -2,7 +2,7 @@ from __future__ import annotations
|
||||
|
||||
import io
|
||||
from tempfile import SpooledTemporaryFile
|
||||
from typing import IO, Any, Iterator, List, Optional, Sequence, Tuple, Union
|
||||
from typing import IO, Any, Iterator, Optional, Sequence
|
||||
|
||||
import pptx
|
||||
from pptx.presentation import Presentation
|
||||
@ -30,7 +30,6 @@ from unstructured.documents.elements import (
|
||||
from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
|
||||
from unstructured.partition.common import (
|
||||
convert_ms_office_table_to_text,
|
||||
exactly_one,
|
||||
get_last_modified_date,
|
||||
get_last_modified_date_from_file,
|
||||
)
|
||||
@ -53,16 +52,14 @@ def partition_pptx(
|
||||
file: Optional[IO[bytes]] = None,
|
||||
include_page_breaks: bool = True,
|
||||
metadata_filename: Optional[str] = None,
|
||||
include_metadata: bool = True,
|
||||
metadata_last_modified: Optional[str] = None,
|
||||
include_slide_notes: bool = False,
|
||||
infer_table_structure: bool = True,
|
||||
chunking_strategy: Optional[str] = None,
|
||||
languages: Optional[List[str]] = ["auto"],
|
||||
languages: Optional[list[str]] = ["auto"],
|
||||
detect_language_per_element: bool = False,
|
||||
date_from_file_object: bool = False,
|
||||
**kwargs: Any,
|
||||
) -> List[Element]:
|
||||
) -> list[Element]:
|
||||
"""Partition PowerPoint document in .pptx format into its document elements.
|
||||
|
||||
Parameters
|
||||
@ -98,28 +95,18 @@ def partition_pptx(
|
||||
Applies only when providing file via `file` parameter. If this option is True, attempt
|
||||
to infer last_modified metadata from bytes, otherwise set it to None.
|
||||
"""
|
||||
# -- verify only one source-file argument was provided --
|
||||
exactly_one(filename=filename, file=file)
|
||||
|
||||
# -- In Python <3.11 SpooledTemporaryFile does not implement ".seekable" which triggers an
|
||||
# -- exception when Zipfile tries to open it. Both the docx and pptx formats are zip archives,
|
||||
# -- so we need to work around that bug here.
|
||||
if isinstance(file, SpooledTemporaryFile):
|
||||
file.seek(0)
|
||||
file = io.BytesIO(file.read())
|
||||
|
||||
source_file = file or filename
|
||||
assert source_file is not None
|
||||
|
||||
elements = _PptxPartitioner.iter_presentation_elements(
|
||||
source_file,
|
||||
include_page_breaks,
|
||||
include_slide_notes,
|
||||
infer_table_structure,
|
||||
metadata_filename,
|
||||
metadata_last_modified,
|
||||
opts = _PptxPartitionerOptions(
|
||||
date_from_file_object=date_from_file_object,
|
||||
file=file,
|
||||
file_path=filename,
|
||||
include_page_breaks=include_page_breaks,
|
||||
include_slide_notes=include_slide_notes,
|
||||
infer_table_structure=infer_table_structure,
|
||||
metadata_file_path=metadata_filename,
|
||||
metadata_last_modified=metadata_last_modified,
|
||||
)
|
||||
|
||||
elements = _PptxPartitioner.iter_presentation_elements(opts)
|
||||
elements = apply_lang_metadata(
|
||||
elements=elements,
|
||||
languages=languages,
|
||||
@ -128,52 +115,16 @@ def partition_pptx(
|
||||
return list(elements)
|
||||
|
||||
|
||||
class _PptxPartitioner: # pyright: ignore[reportUnusedClass]
|
||||
class _PptxPartitioner:
|
||||
"""Provides `.iter_presentation_elements()` for PowerPoint 2007+ (.pptx) files."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
file: Union[str, IO[bytes]],
|
||||
# -- having default values for these arguments is not necessary for production uses because
|
||||
# -- this object is always created by the classmethod. However it simplifies constructing
|
||||
# -- this object in tests and makes them less sensitive to signature changes.
|
||||
include_page_breaks: bool = True,
|
||||
include_slide_notes: bool = False,
|
||||
infer_table_structure: bool = True,
|
||||
metadata_filename: Optional[str] = None,
|
||||
metadata_last_modified: Optional[str] = None,
|
||||
date_from_file_object: bool = False,
|
||||
) -> None:
|
||||
self._file = file
|
||||
self._include_page_breaks = include_page_breaks
|
||||
self._include_slide_notes = include_slide_notes
|
||||
self._infer_table_structure = infer_table_structure
|
||||
self._metadata_filename = metadata_filename
|
||||
self._metadata_last_modified = metadata_last_modified
|
||||
self._page_counter = 0
|
||||
self._date_from_file_object = date_from_file_object
|
||||
def __init__(self, opts: _PptxPartitionerOptions):
|
||||
self._opts = opts
|
||||
|
||||
@classmethod
|
||||
def iter_presentation_elements(
|
||||
cls,
|
||||
file: Union[str, IO[bytes]],
|
||||
include_page_breaks: bool,
|
||||
include_slide_notes: bool,
|
||||
infer_table_structure: bool,
|
||||
metadata_filename: Optional[str],
|
||||
metadata_last_modified: Optional[str],
|
||||
date_from_file_object: bool = False,
|
||||
) -> Iterator[Element]:
|
||||
def iter_presentation_elements(cls, opts: _PptxPartitionerOptions) -> Iterator[Element]:
|
||||
"""Partition a PowerPoint presentation (.pptx format) into its document elements."""
|
||||
return cls(
|
||||
file,
|
||||
include_page_breaks,
|
||||
include_slide_notes,
|
||||
infer_table_structure,
|
||||
metadata_filename,
|
||||
metadata_last_modified,
|
||||
date_from_file_object,
|
||||
)._iter_presentation_elements()
|
||||
return cls(opts)._iter_presentation_elements()
|
||||
|
||||
def _iter_presentation_elements(self) -> Iterator[Element]:
|
||||
"""Generate each document-element in presentation in document order."""
|
||||
@ -190,7 +141,7 @@ class _PptxPartitioner: # pyright: ignore[reportUnusedClass]
|
||||
# -- characteristic of a generator avoids repeated code to form interim results into lists.
|
||||
|
||||
for slide in self._presentation.slides:
|
||||
yield from self._increment_page_number()
|
||||
yield from self._opts.increment_page_number()
|
||||
yield from self._iter_maybe_slide_notes(slide)
|
||||
|
||||
title_shape, shapes = self._order_shapes(slide)
|
||||
@ -208,25 +159,6 @@ class _PptxPartitioner: # pyright: ignore[reportUnusedClass]
|
||||
# -- otherwise ditch it, this would include pictures, charts, connectors (lines),
|
||||
# -- and free-form shapes (squiggly lines). Lines don't have text.
|
||||
|
||||
@lazyproperty
|
||||
def _filename(self) -> Optional[str]:
|
||||
"""Suitable for use as metadata.filename, does not necessarily name source-file."""
|
||||
return (
|
||||
self._metadata_filename
|
||||
if self._metadata_filename
|
||||
else self._file if isinstance(self._file, str) else None
|
||||
)
|
||||
|
||||
def _increment_page_number(self) -> Iterator[PageBreak]:
|
||||
"""Increment page-number by 1 and generate a PageBreak element if enabled."""
|
||||
self._page_counter += 1
|
||||
# -- no page-break before first page --
|
||||
if self._page_counter < 2:
|
||||
return
|
||||
# -- only emit page-breaks when enabled --
|
||||
if self._include_page_breaks:
|
||||
yield PageBreak("", detection_origin=DETECTION_ORIGIN)
|
||||
|
||||
def _is_bulleted_paragraph(self, paragraph: _Paragraph) -> bool:
|
||||
"""True when `paragraph` has a bullet-character prefix.
|
||||
|
||||
@ -238,7 +170,7 @@ class _PptxPartitioner: # pyright: ignore[reportUnusedClass]
|
||||
def _iter_maybe_slide_notes(self, slide: Slide) -> Iterator[NarrativeText]:
|
||||
"""Generate zero-or-one NarrativeText element for the slide-notes."""
|
||||
# -- only emit slide-notes elements when enabled --
|
||||
if not self._include_slide_notes:
|
||||
if not self._opts.include_slide_notes:
|
||||
return
|
||||
|
||||
# -- not all slides have a notes slide --
|
||||
@ -258,22 +190,15 @@ class _PptxPartitioner: # pyright: ignore[reportUnusedClass]
|
||||
return
|
||||
|
||||
yield NarrativeText(
|
||||
text=notes_text,
|
||||
metadata=self._text_metadata(),
|
||||
detection_origin=DETECTION_ORIGIN,
|
||||
text=notes_text, metadata=self._opts.text_metadata(), detection_origin=DETECTION_ORIGIN
|
||||
)
|
||||
|
||||
def _is_invalid_shape(self, shape: Shape) -> bool:
|
||||
# NOTE(robinson) - avoid processing shapes that are not on the actual slide
|
||||
# NOTE - skip check if no top or left position (shape displayed top left)
|
||||
return bool((shape.top and shape.left) and (shape.top < 0 or shape.left < 0))
|
||||
|
||||
def _iter_title_shape_element(self, shape: Shape) -> Iterator[Element]:
|
||||
"""Generate Title element for each paragraph in title `shape`.
|
||||
|
||||
Text is most likely a title, but in the rare case that the title shape was used
|
||||
for the slide body text, also check for bulleted paragraphs."""
|
||||
if self._is_invalid_shape(shape):
|
||||
if self._shape_is_off_slide(shape):
|
||||
return
|
||||
|
||||
depth = 0
|
||||
@ -286,7 +211,7 @@ class _PptxPartitioner: # pyright: ignore[reportUnusedClass]
|
||||
bullet_depth = paragraph.level or 0
|
||||
yield ListItem(
|
||||
text=text,
|
||||
metadata=self._text_metadata(category_depth=bullet_depth),
|
||||
metadata=self._opts.text_metadata(category_depth=bullet_depth),
|
||||
detection_origin=DETECTION_ORIGIN,
|
||||
)
|
||||
elif is_email_address(text):
|
||||
@ -295,14 +220,14 @@ class _PptxPartitioner: # pyright: ignore[reportUnusedClass]
|
||||
# increment the category depth by the paragraph increment in the shape
|
||||
yield Title(
|
||||
text=text,
|
||||
metadata=self._text_metadata(category_depth=depth),
|
||||
metadata=self._opts.text_metadata(category_depth=depth),
|
||||
detection_origin=DETECTION_ORIGIN,
|
||||
)
|
||||
depth += 1 # Cannot enumerate because we want to skip empty paragraphs
|
||||
|
||||
def _iter_shape_elements(self, shape: Shape) -> Iterator[Element]:
|
||||
"""Generate Text or subtype element for each paragraph in `shape`."""
|
||||
if self._is_invalid_shape(shape):
|
||||
if self._shape_is_off_slide(shape):
|
||||
return
|
||||
|
||||
for paragraph in shape.text_frame.paragraphs:
|
||||
@ -311,7 +236,7 @@ class _PptxPartitioner: # pyright: ignore[reportUnusedClass]
|
||||
continue
|
||||
|
||||
level = paragraph.level or 0
|
||||
metadata = self._text_metadata(category_depth=level)
|
||||
metadata = self._opts.text_metadata(category_depth=level)
|
||||
|
||||
if self._is_bulleted_paragraph(paragraph):
|
||||
yield ListItem(text=text, metadata=metadata, detection_origin=DETECTION_ORIGIN)
|
||||
@ -325,7 +250,7 @@ class _PptxPartitioner: # pyright: ignore[reportUnusedClass]
|
||||
)
|
||||
elif is_possible_title(text):
|
||||
# If text is a title but not in the title shape, increment the category depth
|
||||
metadata = self._text_metadata(category_depth=level + 1)
|
||||
metadata = self._opts.text_metadata(category_depth=level + 1)
|
||||
yield Title(text=text, metadata=metadata, detection_origin=DETECTION_ORIGIN)
|
||||
else:
|
||||
yield Text(text=text, metadata=metadata, detection_origin=DETECTION_ORIGIN)
|
||||
@ -339,36 +264,15 @@ class _PptxPartitioner: # pyright: ignore[reportUnusedClass]
|
||||
if not text_table:
|
||||
return
|
||||
html_table = None
|
||||
if self._infer_table_structure:
|
||||
if self._opts.infer_table_structure:
|
||||
html_table = convert_ms_office_table_to_text(graphfrm.table, as_html=True)
|
||||
yield Table(
|
||||
text=text_table,
|
||||
metadata=self._table_metadata(html_table),
|
||||
metadata=self._opts.table_metadata(html_table),
|
||||
detection_origin=DETECTION_ORIGIN,
|
||||
)
|
||||
|
||||
@lazyproperty
|
||||
def _last_modified(self) -> Optional[str]:
|
||||
"""Last-modified date suitable for use in element metadata."""
|
||||
# -- if this file was converted from another format, any last-modified date for the file
|
||||
# -- will be today, so we get it from the conversion step in `._metadata_last_modified`.
|
||||
if self._metadata_last_modified:
|
||||
return self._metadata_last_modified
|
||||
|
||||
file = self._file
|
||||
|
||||
# -- if the file is on the filesystem, get its date from there --
|
||||
if isinstance(file, str):
|
||||
return None if file.startswith("/tmp") else get_last_modified_date(file)
|
||||
|
||||
# -- otherwise try getting it from the file-like object; this can work if `file` comes from
|
||||
# -- `with open(abc.pptx, "rb") as file:`, but I can't see folks doing that much when they
|
||||
# -- can just send us "abc.pptx" instead.
|
||||
if self._date_from_file_object:
|
||||
return get_last_modified_date_from_file(file)
|
||||
return None
|
||||
|
||||
def _order_shapes(self, slide: Slide) -> Tuple[Optional[Shape], Sequence[BaseShape]]:
|
||||
def _order_shapes(self, slide: Slide) -> tuple[Optional[Shape], Sequence[BaseShape]]:
|
||||
"""Orders the shapes on `slide` from top to bottom and left to right.
|
||||
|
||||
Returns the title shape if it exists and the ordered shapes."""
|
||||
@ -380,38 +284,153 @@ class _PptxPartitioner: # pyright: ignore[reportUnusedClass]
|
||||
else:
|
||||
yield shape
|
||||
|
||||
def sort_key(shape: BaseShape) -> Tuple[int, int]:
|
||||
def sort_key(shape: BaseShape) -> tuple[int, int]:
|
||||
return shape.top or 0, shape.left or 0
|
||||
|
||||
return slide.shapes.title, sorted(iter_shapes(slide.shapes), key=sort_key)
|
||||
|
||||
@lazyproperty
|
||||
def _presentation(self) -> Presentation:
|
||||
"""The python-pptx `Presentation` object loaded from the provided source file."""
|
||||
return pptx.Presentation(self._opts.pptx_file)
|
||||
|
||||
def _shape_is_off_slide(self, shape: Shape) -> bool:
|
||||
# NOTE(robinson) - avoid processing shapes that are not on the actual slide
|
||||
# NOTE - skip check if no top or left position (shape displayed top left)
|
||||
return bool((shape.top and shape.left) and (shape.top < 0 or shape.left < 0))
|
||||
|
||||
|
||||
class _PptxPartitionerOptions:
|
||||
"""Encapsulates partitioning option validation, computation, and application of defaults."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
*,
|
||||
date_from_file_object: bool,
|
||||
file: Optional[IO[bytes]],
|
||||
file_path: Optional[str],
|
||||
include_page_breaks: bool,
|
||||
include_slide_notes: bool,
|
||||
infer_table_structure: bool,
|
||||
metadata_file_path: Optional[str],
|
||||
metadata_last_modified: Optional[str],
|
||||
):
|
||||
self._date_from_file_object = date_from_file_object
|
||||
self._file = file
|
||||
self._file_path = file_path
|
||||
self._include_page_breaks = include_page_breaks
|
||||
self._include_slide_notes = include_slide_notes
|
||||
self._infer_table_structure = infer_table_structure
|
||||
self._metadata_file_path = metadata_file_path
|
||||
self._metadata_last_modified = metadata_last_modified
|
||||
self._page_counter = 0
|
||||
|
||||
@lazyproperty
|
||||
def include_page_breaks(self) -> bool:
|
||||
"""When True, include `PageBreak` elements in element-stream.
|
||||
|
||||
Note that regardless of this setting, page-breaks are detected, and page-number is tracked
|
||||
and included in element metadata. Only the presence of distinct `PageBreak` elements (which
|
||||
contain no text) in the element stream is affected.
|
||||
"""
|
||||
return self._include_page_breaks
|
||||
|
||||
@lazyproperty
|
||||
def include_slide_notes(self) -> bool:
|
||||
"""When True, also partition any text found in slide notes as part of each slide."""
|
||||
return self._include_slide_notes
|
||||
|
||||
def increment_page_number(self) -> Iterator[PageBreak]:
    """Advance the page counter by one, emitting a `PageBreak` element when appropriate.

    This is a generator, so the counter is only advanced once the caller iterates it. No
    page-break is generated before the first page, and none at all unless page-breaks are
    enabled.
    """
    self._page_counter += 1
    past_first_page = self._page_counter >= 2
    if past_first_page and self._include_page_breaks:
        yield PageBreak("", detection_origin=DETECTION_ORIGIN)
|
||||
|
||||
@lazyproperty
|
||||
def infer_table_structure(self) -> bool:
|
||||
"""True when partitioner should compute and apply `text_as_html` metadata for tables."""
|
||||
return self._infer_table_structure
|
||||
|
||||
@lazyproperty
|
||||
def last_modified(self) -> Optional[str]:
    """The best last-modified date available, None if no sources are available.

    Sources are consulted in order: the caller-specified value, then the file-path, then the
    file-like object.
    """
    # -- Value explicitly specified by caller takes precedence. This is used for example when
    # -- this file was converted from another format, and any last-modified date for the file
    # -- would be just now.
    explicit = self._metadata_last_modified
    if explicit:
        return explicit

    # -- a path is next in line; paths starting with "/tmp" get no date --
    path = self._file_path
    if path:
        if path.startswith("/tmp"):
            return None
        return get_last_modified_date(path)

    # -- a file-like object is only consulted when the caller opted in --
    if self._file and self._date_from_file_object:
        return get_last_modified_date_from_file(self._file)

    return None
|
||||
|
||||
@lazyproperty
|
||||
def metadata_file_path(self) -> str | None:
|
||||
"""The best available file-path for this document or `None` if unavailable."""
|
||||
return self._metadata_file_path or self._file_path
|
||||
|
||||
@property
|
||||
def _page_number(self) -> Optional[int]:
|
||||
def page_number(self) -> int:
|
||||
"""The current page (slide) number."""
|
||||
return self._page_counter
|
||||
|
||||
@lazyproperty
|
||||
def _presentation(self) -> Presentation:
|
||||
"""The python-pptx `Presentation` object loaded from the provided source file."""
|
||||
return pptx.Presentation(self._file)
|
||||
def pptx_file(self) -> str | IO[bytes]:
|
||||
"""The PowerPoint document file to be partitioned.
|
||||
|
||||
def _table_metadata(self, text_as_html: str):
|
||||
This is either a str path or a file-like object. `python-pptx` accepts either for opening a
|
||||
presentation file.
|
||||
"""
|
||||
if self._file_path:
|
||||
return self._file_path
|
||||
|
||||
# -- In Python <3.11 SpooledTemporaryFile does not implement ".seekable" which triggers an
|
||||
# -- exception when Zipfile tries to open it. The pptx format is a zip archive so we need
|
||||
# -- to work around that bug here.
|
||||
if isinstance(self._file, SpooledTemporaryFile):
|
||||
self._file.seek(0)
|
||||
return io.BytesIO(self._file.read())
|
||||
|
||||
if self._file:
|
||||
return self._file
|
||||
|
||||
raise ValueError(
|
||||
"No PPTX document specified, either `filename` or `file` argument must be provided"
|
||||
)
|
||||
|
||||
def table_metadata(self, text_as_html: str | None):
|
||||
"""ElementMetadata instance suitable for use with Table element."""
|
||||
element_metadata = ElementMetadata(
|
||||
filename=self._filename,
|
||||
last_modified=self._last_modified,
|
||||
page_number=self._page_number,
|
||||
filename=self.metadata_file_path,
|
||||
last_modified=self.last_modified,
|
||||
page_number=self.page_number,
|
||||
text_as_html=text_as_html,
|
||||
)
|
||||
element_metadata.detection_origin = DETECTION_ORIGIN
|
||||
return element_metadata
|
||||
|
||||
def _text_metadata(self, category_depth: int = 0) -> ElementMetadata:
|
||||
def text_metadata(self, category_depth: int = 0) -> ElementMetadata:
|
||||
"""ElementMetadata instance suitable for use with Text and subtypes."""
|
||||
element_metadata = ElementMetadata(
|
||||
filename=self._filename,
|
||||
last_modified=self._last_modified,
|
||||
page_number=self._page_number,
|
||||
filename=self.metadata_file_path,
|
||||
last_modified=self.last_modified,
|
||||
page_number=self.page_number,
|
||||
category_depth=category_depth,
|
||||
)
|
||||
element_metadata.detection_origin = DETECTION_ORIGIN
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user