mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-12-27 23:24:27 +00:00
rfctr: prepare docx partitioner and tests for nested tables PR to follow (#1978)
*Reviewer:* It may be quicker to review commit by commit, as the commits are quite distinct and each is groomed to focus on a single clean-up task. Clean up odds-and-ends in the docx partitioner in preparation for adding nested-tables support in a closely following PR. 1. Remove obsolete TODOs that are now tracked in GitHub issues, which is probably where they belong in future anyway. 2. Remove local DOCX "workaround" code that has since been implemented upstream and is now obsolete. 3. "Clean" the docx tests, introducing strict typing, extracting a fixture or two, and generally tightening things up. 4. Extract docx-local versions of `unstructured.partition.common.convert_ms_office_table_to_text()`, which will be the base for adding nested-table support. More information on why this is required is in that commit.
This commit is contained in:
parent
51d07b6434
commit
4e40999070
@ -1,4 +1,4 @@
|
||||
## 0.10.29-dev7
|
||||
## 0.10.29-dev8
|
||||
|
||||
### Enhancements
|
||||
|
||||
@ -16,7 +16,7 @@
|
||||
* **Handle empty string for `ocr_languages` with values for `languages`** Some API users ran into an issue with sending `languages` params because the API defaulted to also using an empty string for `ocr_languages`. This update handles situations where `languages` is defined and `ocr_languages` is an empty string.
|
||||
* **Fix PDF tried to loop through None** Previously the PDF annotation extraction tried to loop through `annots` even when it resolved to None. A check was added to avoid this error.
|
||||
* **Ingest session handler not being shared correctly** All ingest docs that leverage the session handler should only need to set it once per process. The handler was being recreated each time because the right values were neither set nor available, given how dataclasses work in Python.
|
||||
* **Ingest download-only fix** Previously the download only flag was being checked after the doc factory pipeline step, which occurs before the files are actually downloaded by the source node. This check was moved after the source node to allow for the files to be downloaded first before exiting the pipeline.
|
||||
* **Ingest download-only fix.** Previously the download only flag was being checked after the doc factory pipeline step, which occurs before the files are actually downloaded by the source node. This check was moved after the source node to allow for the files to be downloaded first before exiting the pipeline.
|
||||
* **Fix flaky chunk-metadata.** Prior implementation was sensitive to element order in the section resulting in metadata values sometimes being dropped. Also, not all metadata items can be consolidated across multiple elements (e.g. coordinates) and so are now dropped from consolidated metadata.
|
||||
|
||||
## 0.10.28
|
||||
|
||||
@ -10,6 +10,7 @@ from unstructured.documents.elements import (
|
||||
Address,
|
||||
ListItem,
|
||||
NarrativeText,
|
||||
Table,
|
||||
Text,
|
||||
Title,
|
||||
)
|
||||
@ -271,6 +272,25 @@ def test_partition_doc_from_file_without_metadata_date(
|
||||
assert elements[0].metadata.date == "2020-07-05"
|
||||
|
||||
|
||||
def test_partition_doc_grabs_emphasized_texts():
|
||||
expected_emphasized_text_contents = ["bold", "italic", "bold-italic", "bold-italic"]
|
||||
expected_emphasized_text_tags = ["b", "i", "b", "i"]
|
||||
|
||||
elements = partition_doc("example-docs/fake-doc-emphasized-text.doc")
|
||||
|
||||
assert isinstance(elements[0], Table)
|
||||
assert elements[0].metadata.emphasized_text_contents == expected_emphasized_text_contents
|
||||
assert elements[0].metadata.emphasized_text_tags == expected_emphasized_text_tags
|
||||
|
||||
assert elements[1] == NarrativeText("I am a bold italic bold-italic text.")
|
||||
assert elements[1].metadata.emphasized_text_contents == expected_emphasized_text_contents
|
||||
assert elements[1].metadata.emphasized_text_tags == expected_emphasized_text_tags
|
||||
|
||||
assert elements[2] == NarrativeText("I am a normal text.")
|
||||
assert elements[2].metadata.emphasized_text_contents is None
|
||||
assert elements[2].metadata.emphasized_text_tags is None
|
||||
|
||||
|
||||
def test_partition_doc_with_json(mock_document, tmpdir):
|
||||
docx_filename = os.path.join(tmpdir.dirname, "mock_document.docx")
|
||||
doc_filename = os.path.join(tmpdir.dirname, "mock_document.doc")
|
||||
|
||||
@ -1,6 +1,5 @@
|
||||
# pyright: reportPrivateUsage=false
|
||||
|
||||
import os
|
||||
import pathlib
|
||||
from tempfile import SpooledTemporaryFile
|
||||
from typing import Dict, List, cast
|
||||
@ -8,27 +7,29 @@ from typing import Dict, List, cast
|
||||
import docx
|
||||
import pytest
|
||||
from docx.document import Document
|
||||
from pytest_mock import MockFixture
|
||||
|
||||
from test_unstructured.unit_utils import assert_round_trips_through_JSON
|
||||
from unstructured.chunking.title import chunk_by_title
|
||||
from unstructured.documents.elements import (
|
||||
Address,
|
||||
CompositeElement,
|
||||
Element,
|
||||
Footer,
|
||||
Header,
|
||||
ListItem,
|
||||
NarrativeText,
|
||||
Table,
|
||||
TableChunk,
|
||||
Text,
|
||||
Title,
|
||||
)
|
||||
from unstructured.partition.doc import partition_doc
|
||||
from unstructured.partition.docx import _DocxPartitioner, partition_docx
|
||||
from unstructured.partition.utils.constants import UNSTRUCTURED_INCLUDE_DEBUG_METADATA
|
||||
|
||||
|
||||
def test_parition_docx_from_team_chat():
|
||||
elements = partition_docx(filename="example-docs/teams_chat.docx")
|
||||
elements = cast(List[Text], partition_docx(example_doc_path("teams_chat.docx")))
|
||||
assert [element.text for element in elements] == [
|
||||
"0:0:0.0 --> 0:0:1.510\nSome Body\nOK. Yeah.",
|
||||
"0:0:3.270 --> 0:0:4.250\nJames Bond\nUmm.",
|
||||
@ -37,10 +38,10 @@ def test_parition_docx_from_team_chat():
|
||||
|
||||
|
||||
def test_partition_docx_from_filename(
|
||||
mock_document_filename: str,
|
||||
mock_document_file_path: str,
|
||||
expected_elements: List[Element],
|
||||
):
|
||||
elements = partition_docx(filename=mock_document_filename)
|
||||
elements = partition_docx(mock_document_file_path)
|
||||
|
||||
assert elements == expected_elements
|
||||
assert elements[0].metadata.page_number is None
|
||||
@ -50,19 +51,20 @@ def test_partition_docx_from_filename(
|
||||
assert {element.metadata.detection_origin for element in elements} == {"docx"}
|
||||
|
||||
|
||||
def test_partition_docx_from_filename_with_metadata_filename(mock_document, tmpdir):
|
||||
filename = os.path.join(tmpdir.dirname, "mock_document.docx")
|
||||
mock_document.save(filename)
|
||||
elements = partition_docx(filename=filename, metadata_filename="test")
|
||||
def test_partition_docx_from_filename_with_metadata_filename(mock_document_file_path: str):
|
||||
elements = partition_docx(mock_document_file_path, metadata_filename="test")
|
||||
assert all(element.metadata.filename == "test" for element in elements)
|
||||
|
||||
|
||||
def test_partition_docx_with_spooled_file(mock_document, expected_elements, tmpdir):
|
||||
# Test that the partition_docx function can handle a SpooledTemporaryFile
|
||||
filename = os.path.join(tmpdir.dirname, "mock_document.docx")
|
||||
mock_document.save(filename)
|
||||
def test_partition_docx_with_spooled_file(
|
||||
mock_document_file_path: str, expected_elements: List[Text]
|
||||
):
|
||||
"""`partition_docx()` accepts a SpooledTemporaryFile as its `file` argument.
|
||||
|
||||
with open(filename, "rb") as test_file:
|
||||
`python-docx` will NOT accept a `SpooledTemporaryFile` in Python versions before 3.11 so we need
|
||||
to ensure the source file is appropriately converted in this case.
|
||||
"""
|
||||
with open(mock_document_file_path, "rb") as test_file:
|
||||
spooled_temp_file = SpooledTemporaryFile()
|
||||
spooled_temp_file.write(test_file.read())
|
||||
spooled_temp_file.seek(0)
|
||||
@ -72,28 +74,18 @@ def test_partition_docx_with_spooled_file(mock_document, expected_elements, tmpd
|
||||
assert element.metadata.filename is None
|
||||
|
||||
|
||||
def test_partition_docx_from_file(mock_document, expected_elements, tmpdir):
|
||||
filename = os.path.join(tmpdir.dirname, "mock_document.docx")
|
||||
mock_document.save(filename)
|
||||
|
||||
with open(filename, "rb") as f:
|
||||
def test_partition_docx_from_file(mock_document_file_path: str, expected_elements: List[Text]):
|
||||
with open(mock_document_file_path, "rb") as f:
|
||||
elements = partition_docx(file=f)
|
||||
assert elements == expected_elements
|
||||
for element in elements:
|
||||
assert element.metadata.filename is None
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"infer_table_structure",
|
||||
[
|
||||
True,
|
||||
False,
|
||||
],
|
||||
)
|
||||
def test_partition_docx_infer_table_structure(infer_table_structure):
|
||||
@pytest.mark.parametrize("infer_table_structure", [True, False])
|
||||
def test_partition_docx_infer_table_structure(infer_table_structure: bool):
|
||||
elements = partition_docx(
|
||||
filename="example-docs/fake_table.docx",
|
||||
infer_table_structure=infer_table_structure,
|
||||
example_doc_path("fake_table.docx"), infer_table_structure=infer_table_structure
|
||||
)
|
||||
table_element_has_text_as_html_field = (
|
||||
hasattr(elements[0].metadata, "text_as_html")
|
||||
@ -102,58 +94,58 @@ def test_partition_docx_infer_table_structure(infer_table_structure):
|
||||
assert table_element_has_text_as_html_field == infer_table_structure
|
||||
|
||||
|
||||
def test_partition_docx_from_file_with_metadata_filename(mock_document, expected_elements, tmpdir):
|
||||
filename = os.path.join(tmpdir.dirname, "mock_document.docx")
|
||||
mock_document.save(filename)
|
||||
|
||||
with open(filename, "rb") as f:
|
||||
def test_partition_docx_from_file_with_metadata_filename(
|
||||
mock_document_file_path: str, expected_elements: List[Text]
|
||||
):
|
||||
with open(mock_document_file_path, "rb") as f:
|
||||
elements = partition_docx(file=f, metadata_filename="test")
|
||||
assert elements == expected_elements
|
||||
for element in elements:
|
||||
assert element.metadata.filename == "test"
|
||||
|
||||
|
||||
def test_partition_docx_raises_with_both_specified(mock_document, tmpdir):
|
||||
filename = os.path.join(tmpdir.dirname, "mock_document.docx")
|
||||
mock_document.save(filename)
|
||||
|
||||
with open(filename, "rb") as f, pytest.raises(ValueError):
|
||||
partition_docx(filename=filename, file=f)
|
||||
def test_partition_docx_raises_with_both_specified(mock_document_file_path: str):
|
||||
with open(mock_document_file_path, "rb") as f:
|
||||
with pytest.raises(ValueError, match="Exactly one of filename and file must be specified"):
|
||||
partition_docx(filename=mock_document_file_path, file=f)
|
||||
|
||||
|
||||
def test_partition_docx_raises_with_neither():
|
||||
with pytest.raises(ValueError):
|
||||
with pytest.raises(ValueError, match="Exactly one of filename and file must be specified"):
|
||||
partition_docx()
|
||||
|
||||
|
||||
def test_partition_docx_processes_table(filename="example-docs/fake_table.docx"):
|
||||
elements = partition_docx(filename=filename)
|
||||
def test_partition_docx_processes_table():
|
||||
elements = partition_docx(example_doc_path("fake_table.docx"))
|
||||
|
||||
assert isinstance(elements[0], Table)
|
||||
assert (
|
||||
elements[0].metadata.text_as_html
|
||||
== """<table>
|
||||
<thead>
|
||||
<tr><th>Header Col 1 </th><th>Header Col 2 </th></tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
<tr><td>Lorem ipsum </td><td>A Link example</td></tr>
|
||||
</tbody>
|
||||
</table>"""
|
||||
assert elements[0].metadata.text_as_html == (
|
||||
"<table>\n"
|
||||
"<thead>\n"
|
||||
"<tr><th>Header Col 1 </th><th>Header Col 2 </th></tr>\n"
|
||||
"</thead>\n"
|
||||
"<tbody>\n"
|
||||
"<tr><td>Lorem ipsum </td><td>A Link example</td></tr>\n"
|
||||
"</tbody>\n"
|
||||
"</table>"
|
||||
)
|
||||
assert elements[0].metadata.filename == "fake_table.docx"
|
||||
|
||||
|
||||
def test_partition_docx_grabs_header_and_footer(filename="example-docs/handbook-1p.docx"):
|
||||
elements = partition_docx(filename=filename)
|
||||
def test_partition_docx_grabs_header_and_footer():
|
||||
elements = partition_docx(example_doc_path("handbook-1p.docx"))
|
||||
|
||||
assert elements[0] == Header("US Trustee Handbook")
|
||||
assert elements[-1] == Footer("Copyright")
|
||||
for element in elements:
|
||||
assert element.metadata.filename == "handbook-1p.docx"
|
||||
|
||||
|
||||
def test_partition_docx_includes_pages_if_present(filename="example-docs/handbook-1p.docx"):
|
||||
elements = partition_docx(filename=filename, include_page_breaks=False)
|
||||
def test_partition_docx_includes_pages_if_present():
|
||||
elements = cast(
|
||||
List[Text], partition_docx(example_doc_path("handbook-1p.docx"), include_page_breaks=False)
|
||||
)
|
||||
|
||||
assert "PageBreak" not in [elem.category for elem in elements]
|
||||
assert elements[1].metadata.page_number == 1
|
||||
assert elements[-2].metadata.page_number == 2
|
||||
@ -161,8 +153,11 @@ def test_partition_docx_includes_pages_if_present(filename="example-docs/handboo
|
||||
assert element.metadata.filename == "handbook-1p.docx"
|
||||
|
||||
|
||||
def test_partition_docx_includes_page_breaks(filename="example-docs/handbook-1p.docx"):
|
||||
elements = partition_docx(filename=filename, include_page_breaks=True)
|
||||
def test_partition_docx_includes_page_breaks():
|
||||
elements = cast(
|
||||
List[Text], partition_docx(example_doc_path("handbook-1p.docx"), include_page_breaks=True)
|
||||
)
|
||||
|
||||
assert "PageBreak" in [elem.category for elem in elements]
|
||||
assert elements[1].metadata.page_number == 1
|
||||
assert elements[-2].metadata.page_number == 2
|
||||
@ -170,115 +165,81 @@ def test_partition_docx_includes_page_breaks(filename="example-docs/handbook-1p.
|
||||
assert element.metadata.filename == "handbook-1p.docx"
|
||||
|
||||
|
||||
def test_partition_docx_detects_lists(filename="example-docs/example-list-items-multiple.docx"):
|
||||
elements = partition_docx(filename=filename)
|
||||
list_elements = []
|
||||
narrative_elements = []
|
||||
for element in elements:
|
||||
if isinstance(element, ListItem):
|
||||
list_elements.append(element)
|
||||
else:
|
||||
narrative_elements.append(element)
|
||||
def test_partition_docx_detects_lists():
|
||||
elements = partition_docx(example_doc_path("example-list-items-multiple.docx"))
|
||||
|
||||
assert elements[-1] == ListItem(
|
||||
"This is simply dummy text of the printing and typesetting industry.",
|
||||
)
|
||||
assert len(list_elements) == 10
|
||||
assert sum(1 for e in elements if isinstance(e, ListItem)) == 10
|
||||
|
||||
|
||||
def test_partition_docx_from_filename_exclude_metadata(filename="example-docs/handbook-1p.docx"):
|
||||
elements = partition_docx(filename=filename, include_metadata=False)
|
||||
def test_partition_docx_from_filename_exclude_metadata():
|
||||
elements = partition_docx(example_doc_path("handbook-1p.docx"), include_metadata=False)
|
||||
|
||||
assert elements[0].metadata.filetype is None
|
||||
assert elements[0].metadata.page_name is None
|
||||
assert elements[0].metadata.filename is None
|
||||
|
||||
|
||||
def test_partition_docx_from_file_exclude_metadata(mock_document, tmpdir):
|
||||
filename = os.path.join(tmpdir.dirname, "mock_document.docx")
|
||||
mock_document.save(filename)
|
||||
|
||||
with open(filename, "rb") as f:
|
||||
def test_partition_docx_from_file_exclude_metadata(mock_document_file_path: str):
|
||||
with open(mock_document_file_path, "rb") as f:
|
||||
elements = partition_docx(file=f, include_metadata=False)
|
||||
|
||||
assert elements[0].metadata.filetype is None
|
||||
assert elements[0].metadata.page_name is None
|
||||
assert elements[0].metadata.filename is None
|
||||
|
||||
|
||||
def test_partition_docx_metadata_date(
|
||||
mocker,
|
||||
filename="example-docs/fake.docx",
|
||||
):
|
||||
mocked_last_modification_date = "2029-07-05T09:24:28"
|
||||
|
||||
def test_partition_docx_metadata_date(mocker: MockFixture):
|
||||
mocker.patch(
|
||||
"unstructured.partition.docx.get_last_modified_date",
|
||||
return_value=mocked_last_modification_date,
|
||||
"unstructured.partition.docx.get_last_modified_date", return_value="2029-07-05T09:24:28"
|
||||
)
|
||||
|
||||
elements = partition_docx(filename=filename)
|
||||
elements = partition_docx(example_doc_path("fake.docx"))
|
||||
|
||||
assert elements[0].metadata.last_modified == mocked_last_modification_date
|
||||
assert elements[0].metadata.last_modified == "2029-07-05T09:24:28"
|
||||
|
||||
|
||||
def test_partition_docx_metadata_date_with_custom_metadata(
|
||||
mocker,
|
||||
filename="example-docs/fake.docx",
|
||||
):
|
||||
mocked_last_modification_date = "2029-07-05T09:24:28"
|
||||
expected_last_modified_date = "2020-07-05T09:24:28"
|
||||
|
||||
def test_partition_docx_metadata_date_with_custom_metadata(mocker: MockFixture):
|
||||
mocker.patch(
|
||||
"unstructured.partition.docx.get_last_modified_date",
|
||||
return_value=mocked_last_modification_date,
|
||||
"unstructured.partition.docx.get_last_modified_date", return_value="2023-11-01T14:13:07"
|
||||
)
|
||||
|
||||
elements = partition_docx(
|
||||
filename=filename,
|
||||
metadata_last_modified=expected_last_modified_date,
|
||||
example_doc_path("fake.docx"), metadata_last_modified="2020-07-05T09:24:28"
|
||||
)
|
||||
|
||||
assert elements[0].metadata.last_modified == expected_last_modified_date
|
||||
assert elements[0].metadata.last_modified == "2020-07-05T09:24:28"
|
||||
|
||||
|
||||
def test_partition_docx_from_file_metadata_date(
|
||||
mocker,
|
||||
filename="example-docs/fake.docx",
|
||||
):
|
||||
mocked_last_modification_date = "2029-07-05T09:24:28"
|
||||
|
||||
def test_partition_docx_from_file_metadata_date(mocker: MockFixture):
|
||||
mocker.patch(
|
||||
"unstructured.partition.docx.get_last_modified_date_from_file",
|
||||
return_value=mocked_last_modification_date,
|
||||
return_value="2029-07-05T09:24:28",
|
||||
)
|
||||
|
||||
with open(filename, "rb") as f:
|
||||
with open(example_doc_path("fake.docx"), "rb") as f:
|
||||
elements = partition_docx(file=f)
|
||||
|
||||
assert elements[0].metadata.last_modified == mocked_last_modification_date
|
||||
assert elements[0].metadata.last_modified == "2029-07-05T09:24:28"
|
||||
|
||||
|
||||
def test_partition_docx_from_file_metadata_date_with_custom_metadata(
|
||||
mocker,
|
||||
filename="example-docs/fake.docx",
|
||||
):
|
||||
mocked_last_modification_date = "2029-07-05T09:24:28"
|
||||
expected_last_modified_date = "2020-07-05T09:24:28"
|
||||
|
||||
def test_partition_docx_from_file_metadata_date_with_custom_metadata(mocker: MockFixture):
|
||||
mocker.patch(
|
||||
"unstructured.partition.docx.get_last_modified_date_from_file",
|
||||
return_value=mocked_last_modification_date,
|
||||
return_value="2023-11-01T14:13:07",
|
||||
)
|
||||
with open(filename, "rb") as f:
|
||||
elements = partition_docx(file=f, metadata_last_modified=expected_last_modified_date)
|
||||
|
||||
assert elements[0].metadata.last_modified == expected_last_modified_date
|
||||
with open(example_doc_path("fake.docx"), "rb") as f:
|
||||
elements = partition_docx(file=f, metadata_last_modified="2020-07-05T09:24:28")
|
||||
|
||||
assert elements[0].metadata.last_modified == "2020-07-05T09:24:28"
|
||||
|
||||
|
||||
def test_partition_docx_from_file_without_metadata_date(
|
||||
filename="example-docs/fake.docx",
|
||||
):
|
||||
def test_partition_docx_from_file_without_metadata_date():
|
||||
"""Test partition_docx() with file that are not possible to get last modified date"""
|
||||
|
||||
with open(filename, "rb") as f:
|
||||
with open(example_doc_path("fake.docx"), "rb") as f:
|
||||
sf = SpooledTemporaryFile()
|
||||
sf.write(f.read())
|
||||
sf.seek(0)
|
||||
@ -344,20 +305,11 @@ def test_table_emphasis(
|
||||
assert emphasized_text_tags == expected_emphasized_text_tags
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
("filename", "partition_func"),
|
||||
[
|
||||
("fake-doc-emphasized-text.docx", partition_docx),
|
||||
("fake-doc-emphasized-text.doc", partition_doc),
|
||||
],
|
||||
)
|
||||
def test_partition_docx_grabs_emphasized_texts(
|
||||
filename,
|
||||
partition_func,
|
||||
expected_emphasized_text_contents,
|
||||
expected_emphasized_text_tags,
|
||||
expected_emphasized_text_contents: List[str],
|
||||
expected_emphasized_text_tags: List[str],
|
||||
):
|
||||
elements = partition_func(filename=f"example-docs/{filename}")
|
||||
elements = partition_docx(example_doc_path("fake-doc-emphasized-text.docx"))
|
||||
|
||||
assert isinstance(elements[0], Table)
|
||||
assert elements[0].metadata.emphasized_text_contents == expected_emphasized_text_contents
|
||||
@ -372,11 +324,8 @@ def test_partition_docx_grabs_emphasized_texts(
|
||||
assert elements[2].metadata.emphasized_text_tags is None
|
||||
|
||||
|
||||
def test_partition_docx_with_json(mock_document, tmpdir):
|
||||
filename = os.path.join(tmpdir.dirname, "mock_document.docx")
|
||||
mock_document.save(filename)
|
||||
|
||||
elements = partition_docx(filename=filename)
|
||||
def test_partition_docx_with_json(mock_document_file_path: str):
|
||||
elements = partition_docx(mock_document_file_path)
|
||||
assert_round_trips_through_JSON(elements)
|
||||
|
||||
|
||||
@ -448,33 +397,30 @@ def test_parse_category_depth_by_style_ilvl():
|
||||
assert partitioner._parse_category_depth_by_style_ilvl() == 0
|
||||
|
||||
|
||||
def test_add_chunking_strategy_on_partition_docx_default_args(
|
||||
filename="example-docs/handbook-1p.docx",
|
||||
):
|
||||
chunk_elements = partition_docx(filename, chunking_strategy="by_title")
|
||||
elements = partition_docx(filename)
|
||||
def test_add_chunking_strategy_on_partition_docx_default_args():
|
||||
chunk_elements = partition_docx(
|
||||
example_doc_path("handbook-1p.docx"), chunking_strategy="by_title"
|
||||
)
|
||||
elements = partition_docx(example_doc_path("handbook-1p.docx"))
|
||||
chunks = chunk_by_title(elements)
|
||||
|
||||
assert chunk_elements != elements
|
||||
assert chunk_elements == chunks
|
||||
|
||||
|
||||
def test_add_chunking_strategy_on_partition_docx(
|
||||
filename="example-docs/fake-doc-emphasized-text.docx",
|
||||
):
|
||||
def test_add_chunking_strategy_on_partition_docx():
|
||||
docx_path = example_doc_path("fake-doc-emphasized-text.docx")
|
||||
|
||||
chunk_elements = partition_docx(
|
||||
filename,
|
||||
chunking_strategy="by_title",
|
||||
max_characters=9,
|
||||
combine_text_under_n_chars=5,
|
||||
docx_path, chunking_strategy="by_title", max_characters=9, combine_text_under_n_chars=5
|
||||
)
|
||||
elements = partition_docx(filename)
|
||||
elements = partition_docx(docx_path)
|
||||
chunks = chunk_by_title(elements, max_characters=9, combine_text_under_n_chars=5)
|
||||
|
||||
assert chunk_elements == chunks
|
||||
assert elements != chunk_elements
|
||||
|
||||
for chunk in chunks:
|
||||
assert isinstance(chunk, (CompositeElement, TableChunk))
|
||||
assert len(chunk.text) <= 9
|
||||
|
||||
|
||||
@ -507,7 +453,7 @@ def test_partition_docx_raises_TypeError_for_invalid_languages():
|
||||
|
||||
|
||||
def test_partition_docx_includes_hyperlink_metadata():
|
||||
elements = cast(List[Text], partition_docx(get_test_file_path("hlink-meta.docx")))
|
||||
elements = cast(List[Text], partition_docx(example_doc_path("hlink-meta.docx")))
|
||||
|
||||
# -- regular paragraph, no hyperlinks --
|
||||
element = elements[0]
|
||||
@ -593,8 +539,13 @@ def test_partition_docx_includes_hyperlink_metadata():
|
||||
# -- module-level fixtures -----------------------------------------------------------------------
|
||||
|
||||
|
||||
def example_doc_path(filename: str) -> str:
|
||||
"""String path to a file in the example-docs/ directory."""
|
||||
return str(pathlib.Path(__file__).parent.parent.parent.parent / "example-docs" / filename)
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
def expected_elements():
|
||||
def expected_elements() -> List[Text]:
|
||||
return [
|
||||
Title("These are a few of my favorite things:"),
|
||||
ListItem("Parrots"),
|
||||
@ -608,12 +559,12 @@ def expected_elements():
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
def expected_emphasized_text_contents():
|
||||
def expected_emphasized_text_contents() -> List[str]:
|
||||
return ["bold", "italic", "bold-italic", "bold-italic"]
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
def expected_emphasized_text_tags():
|
||||
def expected_emphasized_text_tags() -> List[str]:
|
||||
return ["b", "i", "b", "i"]
|
||||
|
||||
|
||||
@ -627,12 +578,6 @@ def expected_emphasized_texts():
|
||||
]
|
||||
|
||||
|
||||
def get_test_file_path(filename: str) -> str:
|
||||
"""String path to a file in the docx/test_files directory."""
|
||||
# -- needs the `get_` prefix on name so this doesn't get picked up as a test-function --
|
||||
return str(pathlib.Path(__file__).parent / "test_files" / filename)
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
def mock_document():
|
||||
document = docx.Document()
|
||||
@ -661,8 +606,7 @@ def mock_document():
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
def mock_document_filename(mock_document: Document, tmp_path: pathlib.Path) -> str:
|
||||
def mock_document_file_path(mock_document: Document, tmp_path: pathlib.Path) -> str:
|
||||
filename = str(tmp_path / "mock_document.docx")
|
||||
print(f"filename = {filename}")
|
||||
mock_document.save(filename)
|
||||
return filename
|
||||
|
||||
@ -1,8 +1,10 @@
|
||||
from typing import Sequence
|
||||
from typing import Iterator, Sequence
|
||||
|
||||
from docx.blkcntnr import BlockItemContainer
|
||||
from docx.enum.section import WD_SECTION
|
||||
from docx.oxml.section import CT_SectPr
|
||||
from docx.table import Table
|
||||
from docx.text.paragraph import Paragraph
|
||||
|
||||
class Section:
|
||||
_sectPr: CT_SectPr
|
||||
@ -20,6 +22,7 @@ class Section:
|
||||
def footer(self) -> _Footer: ...
|
||||
@property
|
||||
def header(self) -> _Header: ...
|
||||
def iter_inner_content(self) -> Iterator[Paragraph | Table]: ...
|
||||
@property
|
||||
def start_type(self) -> WD_SECTION: ...
|
||||
|
||||
|
||||
@ -1 +1 @@
|
||||
__version__ = "0.10.29-dev7" # pragma: no cover
|
||||
__version__ = "0.10.29-dev8" # pragma: no cover
|
||||
|
||||
@ -14,7 +14,6 @@ from typing import (
|
||||
Iterator,
|
||||
List,
|
||||
Optional,
|
||||
Sequence,
|
||||
Tuple,
|
||||
Type,
|
||||
Union,
|
||||
@ -25,18 +24,15 @@ from typing import (
|
||||
import docx
|
||||
from docx.document import Document
|
||||
from docx.enum.section import WD_SECTION_START
|
||||
from docx.oxml.ns import nsmap, qn
|
||||
from docx.oxml.section import CT_SectPr
|
||||
from docx.oxml.table import CT_Tbl
|
||||
from docx.oxml.text.paragraph import CT_P
|
||||
from docx.oxml.text.run import CT_R
|
||||
from docx.oxml.xmlchemy import BaseOxmlElement
|
||||
from docx.section import Section, _Footer, _Header
|
||||
from docx.table import Table as DocxTable
|
||||
from docx.text.hyperlink import Hyperlink
|
||||
from docx.text.paragraph import Paragraph
|
||||
from docx.text.run import Run
|
||||
from lxml import etree
|
||||
from tabulate import tabulate
|
||||
from typing_extensions import TypeAlias
|
||||
|
||||
from unstructured.chunking.title import add_chunking_strategy
|
||||
@ -59,7 +55,6 @@ from unstructured.documents.elements import (
|
||||
)
|
||||
from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
|
||||
from unstructured.partition.common import (
|
||||
convert_ms_office_table_to_text,
|
||||
exactly_one,
|
||||
get_last_modified_date,
|
||||
get_last_modified_date_from_file,
|
||||
@ -234,28 +229,6 @@ def partition_docx(
|
||||
class _DocxPartitioner:
|
||||
"""Provides `.partition()` for MS-Word 2007+ (.docx) files."""
|
||||
|
||||
# TODO: I think we can do better on metadata.filename. Should that only be populated when a
|
||||
# `metadata_filename` argument was provided to `partition_docx()`? What about when not but
|
||||
# we do get a `filename` arg or a `file` arg that has a `.name` attribute?
|
||||
# TODO: get last-modified date from document-properties (stored in docx package) rather than
|
||||
# relying on last filesystem-write date; maybe fall-back to filesystem-date.
|
||||
# TODO: improve `._element_contains_pagebreak()`. It uses substring matching on the rendered
|
||||
# XML text which is error-prone and not performant. Use XPath instead with the specific
|
||||
# locations a page-break can be located. Also, there can be more than one, so return a
|
||||
# count instead of a boolean.
|
||||
# TODO: Improve document-contains-pagebreaks algorithm to use XPath and to search for
|
||||
# `w:lastRenderedPageBreak` alone. Make it independent and don't rely on anything like
|
||||
# the "_element_contains_pagebreak()" function.
|
||||
# TODO: Improve ._is_list_item() to include list-styles such that telling whether a paragraph is
|
||||
# a list-item is encapsulated in a single place rather than distributed around the code.
|
||||
# TODO: Improve ._is_list_item() method of detecting a numbered-list-item to use XPath instead
|
||||
# of a substring match on the rendered XML. Include all permutations of how a numbered
|
||||
# list can be manually applied (as opposed to by using a style).
|
||||
# TODO: Move _SectBlockIterator upstream into `python-docx`. It requires too much
|
||||
# domain-specific knowledge to be comfortable here and is of general use so welcome in the
|
||||
# library.
|
||||
# TODO: Move Paragraph._get_paragraph_runs() monkey-patch upstream to `python-docx`.
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
filename: Optional[str],
|
||||
@ -315,18 +288,62 @@ class _DocxPartitioner:
|
||||
yield from self._iter_section_page_breaks(section_idx, section)
|
||||
yield from self._iter_section_headers(section)
|
||||
|
||||
for block_item in _SectBlockItemIterator.iter_sect_block_items(section, self._document):
|
||||
# -- a block-item can only be a Paragraph ... --
|
||||
for block_item in section.iter_inner_content():
|
||||
# -- a block-item can be a Paragraph or a Table, maybe others later so elif here.
|
||||
# -- Paragraph is more common so check that first.
|
||||
if isinstance(block_item, Paragraph):
|
||||
yield from self._iter_paragraph_elements(block_item)
|
||||
# -- a paragraph can contain a page-break --
|
||||
yield from self._iter_maybe_paragraph_page_breaks(block_item)
|
||||
# -- ... or a Table --
|
||||
else:
|
||||
elif isinstance( # pyright: ignore[reportUnnecessaryIsInstance]
|
||||
block_item, DocxTable
|
||||
):
|
||||
yield from self._iter_table_element(block_item)
|
||||
|
||||
yield from self._iter_section_footers(section)
|
||||
|
||||
@staticmethod
|
||||
def _convert_table_to_html(table: DocxTable) -> str:
|
||||
"""HTML string version of `table`.
|
||||
|
||||
Example:
|
||||
|
||||
<table>
|
||||
<tbody>
|
||||
<tr><th>item </th><th style="text-align: right;"> qty</th></tr>
|
||||
<tr><td>spam </td><td style="text-align: right;"> 42</td></tr>
|
||||
<tr><td>eggs </td><td style="text-align: right;"> 451</td></tr>
|
||||
<tr><td>bacon </td><td style="text-align: right;"> 0</td></tr>
|
||||
</tbody>
|
||||
</table>
|
||||
|
||||
"""
|
||||
return tabulate(
|
||||
[[cell.text for cell in row.cells] for row in table.rows],
|
||||
headers="firstrow",
|
||||
tablefmt="html",
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def _convert_table_to_plain_text(table: DocxTable) -> str:
|
||||
"""Plain-text version of `table`.
|
||||
|
||||
Each row appears on its own line. Cells in a column are aligned using spaces as padding:
|
||||
|
||||
item qty
|
||||
spam 42
|
||||
eggs 451
|
||||
bacon 0
|
||||
|
||||
The first row is unconditionally considered column headings, although the column headings
|
||||
row is not differentiated in this format.
|
||||
"""
|
||||
return tabulate(
|
||||
[[cell.text for cell in row.cells] for row in table.rows],
|
||||
headers="firstrow",
|
||||
tablefmt="plain",
|
||||
)
|
||||
|
||||
@lazyproperty
|
||||
def _document(self) -> Document:
|
||||
"""The python-docx `Document` object loaded from file or filename."""
|
||||
@ -562,8 +579,8 @@ class _DocxPartitioner:
|
||||
# -- to skip, for example, an empty table, or accommodate nested tables.
|
||||
html_table = None
|
||||
if self._infer_table_structure:
|
||||
html_table = convert_ms_office_table_to_text(table, as_html=True)
|
||||
text_table = convert_ms_office_table_to_text(table, as_html=False)
|
||||
html_table = self._convert_table_to_html(table)
|
||||
text_table = self._convert_table_to_plain_text(table)
|
||||
emphasized_text_contents, emphasized_text_tags = self._table_emphasis(table)
|
||||
|
||||
yield Table(
|
||||
@ -800,161 +817,3 @@ class _DocxPartitioner:
|
||||
def _parse_category_depth_by_style_ilvl(self) -> int:
|
||||
# TODO(newelh) Parsing category depth by style ilvl is not yet implemented
|
||||
return 0
|
||||
|
||||
|
||||
class _SectBlockItemIterator:
    """Produces the block-items of a section.

    A block item is a docx `Paragraph` or table (`DocxTable`). This class only wraps XML
    elements in their proxy objects; locating those elements is delegated to
    `_SectBlockElementIterator`. The two are kept apart because they are expected to live in
    different places upstream, which makes them easier to transplant.
    """

    @classmethod
    def iter_sect_block_items(cls, section: Section, document: Document) -> Iterator[BlockItem]:
        """Generate a `Paragraph` or `DocxTable` for each block element in `section`.

        `document` is passed to each proxy object as its second constructor argument.
        """
        block_elements = _SectBlockElementIterator.iter_sect_block_elements(section._sectPr)
        for block_element in block_elements:
            # -- a `CT_P` element becomes a Paragraph, anything else is wrapped as a table --
            if isinstance(block_element, CT_P):
                yield Paragraph(block_element, document)
            else:
                yield DocxTable(block_element, document)
|
||||
|
||||
|
||||
class _SectBlockElementIterator:
    """Generates the block-item XML elements in a section.

    A block-item element is a `CT_P` (paragraph) or a `CT_Tbl` (table).

    Use the `iter_sect_block_elements()` classmethod as the entry point; it creates a
    short-lived instance for the `sectPr` it is given.
    """

    # -- Compiled XPath expressions are cached on the *class*. A fresh instance is created on
    # -- each call to `.iter_sect_block_elements()`, so a cache stored on the instance would
    # -- be discarded every time and the expressions recompiled for each section.
    _compiled_blocks_xpath: Optional[etree.XPath] = None
    _compiled_count_xpath: Optional[etree.XPath] = None

    def __init__(self, sectPr: CT_SectPr):
        self._sectPr = sectPr

    @classmethod
    def iter_sect_block_elements(cls, sectPr: CT_SectPr) -> Iterator[BlockElement]:
        """Generate each CT_P or CT_Tbl element within the extents governed by `sectPr`."""
        return cls(sectPr)._iter_sect_block_elements()

    def _iter_sect_block_elements(self) -> Iterator[BlockElement]:
        """Generate each CT_P or CT_Tbl element in section."""
        # -- General strategy is to get all block (<w:p> and <w:tbl>) elements from start of doc
        # -- to and including this section, then compute the count of those elements that came
        # -- from prior sections and skip that many to leave only the ones in this section. It's
        # -- possible to express this "between here and there" (end of prior section and end of
        # -- this one) concept in XPath, but it would be harder to follow because there are
        # -- special cases (e.g. no prior section) and the boundary expressions are fairly hairy.
        # -- I also believe it would be computationally more expensive than doing it this
        # -- straightforward albeit (theoretically) slightly wasteful way.

        sectPr, sectPrs = self._sectPr, self._sectPrs
        sectPr_idx = sectPrs.index(sectPr)

        # -- count block items belonging to prior sections; the first section has none --
        n_blks_to_skip = (
            0
            if sectPr_idx == 0
            else self._count_of_blocks_in_and_above_section(sectPrs[sectPr_idx - 1])
        )

        # -- and skip those in set of all blks from doc start to end of this section --
        yield from self._blocks_in_and_above_section(sectPr)[n_blks_to_skip:]

    def _blocks_in_and_above_section(self, sectPr: CT_SectPr) -> Sequence[BlockElement]:
        """All ps and tbls in section defined by `sectPr` and all prior sections."""
        cls = type(self)
        if cls._compiled_blocks_xpath is None:
            # -- compile once and store on the class so later instances reuse it --
            cls._compiled_blocks_xpath = etree.XPath(
                self._blocks_in_and_above_section_xpath,
                namespaces=nsmap,
                regexp=False,
            )
        xpath = cls._compiled_blocks_xpath
        # -- XPath callable results are Any (basically), so need a cast --
        return cast(Sequence[BlockElement], xpath(sectPr))

    @lazyproperty
    def _blocks_in_and_above_section_xpath(self) -> str:
        """XPath expr for ps and tbls in context of a sectPr and all prior sectPrs."""
        # -- "p_sect" is a section with sectPr located at w:p/w:pPr/w:sectPr. "body_sect" is a
        # -- section with sectPr located at w:body/w:sectPr. The last section in the document is a
        # -- "body_sect". All others are of the "p_sect" variety. "term" means "terminal", like
        # -- the last p or tbl in the section. "pred" means "predecessor", like a preceding p or
        # -- tbl in the section.

        # -- the terminal block in a p-based sect is the p the sectPr appears in --
        p_sect_term_block = "./parent::w:pPr/parent::w:p"
        # -- the terminus of a body-based sect is the sectPr itself (not a block) --
        body_sect_term = "self::w:sectPr[parent::w:body]"
        # -- all the ps and tbls preceding (but not including) the context node --
        pred_ps_and_tbls = "preceding-sibling::*[self::w:p | self::w:tbl]"

        # -- p_sect_term_block and body_sect_term(inus) are mutually exclusive. So the result is
        # -- either the union of nodes found by the first two selectors or the nodes found by the
        # -- last selector, never both.
        return (
            # -- include the p containing a sectPr --
            f"{p_sect_term_block}"
            # -- along with all the blocks that precede it --
            f" | {p_sect_term_block}/{pred_ps_and_tbls}"
            # -- or all the preceding blocks if sectPr is body-based (last sectPr) --
            f" | {body_sect_term}/{pred_ps_and_tbls}"
        )

    def _count_of_blocks_in_and_above_section(self, sectPr: CT_SectPr) -> int:
        """Count of ps and tbls in section defined by `sectPr` and all prior sections."""
        cls = type(self)
        if cls._compiled_count_xpath is None:
            # -- compile once and store on the class so later instances reuse it --
            cls._compiled_count_xpath = etree.XPath(
                f"count({self._blocks_in_and_above_section_xpath})",
                namespaces=nsmap,
                regexp=False,
            )
        xpath = cls._compiled_count_xpath
        # -- numeric XPath results are always float, so need an int() conversion --
        return int(cast(float, xpath(sectPr)))

    @lazyproperty
    def _sectPrs(self) -> Sequence[CT_SectPr]:
        """All w:sectPr elements in document, in document-order."""
        # -- paragraph-based sectPrs (all but the last section) plus the body-level sectPr --
        return self._sectPr.xpath(
            "/w:document/w:body/w:p/w:pPr/w:sectPr | /w:document/w:body/w:sectPr",
        )
|
||||
|
||||
|
||||
# == monkey-patch docx.text.Paragraph.runs ===========================================
|
||||
|
||||
|
||||
def _get_paragraph_runs(paragraph: Paragraph) -> Sequence[Run]:
    """Collect every run in `paragraph`, including runs nested inside hyperlinks.

    This exists because the default python-docx `runs` behavior skips over hyperlinks.

    Args:
        paragraph (Paragraph): A Paragraph object.

    Returns:
        list: `Run` objects in the order their elements are encountered.
    """

    def _iter_runs(element: BaseOxmlElement) -> Iterator[Run]:
        """Yield a `Run` for each `w:r` under `element`, descending into `w:hyperlink`s."""
        for child in element:
            tag = child.tag
            if tag == qn("w:r"):
                # -- runs directly under the current element are wrapped as-is --
                yield Run(cast(CT_R, child), paragraph)
            elif tag == qn("w:hyperlink"):
                # -- a hyperlink holds runs of its own, so walk into it --
                yield from _iter_runs(child)
            # -- any other kind of child element is ignored --

    return list(_iter_runs(paragraph._element))
|
||||
|
||||
|
||||
# -- Replace `Paragraph.runs` with a read-only property backed by `_get_paragraph_runs()`.
# -- The lambda looks the function up by name on each access rather than binding it once.
Paragraph.runs = property(  # pyright: ignore[reportGeneralTypeIssues]
    lambda paragraph: _get_paragraph_runs(paragraph),
)
|
||||
|
||||
# ====================================================================================
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user