fix: detect list items in MS Word documents (#909)

* fix merge conflict
* update changelog and version
2025-06-27 02:30:08 +00:00 · 2023-07-10 10:29:08 -05:00 · 2023-07-10 10:29:08 -05:00 · 6173362620
commit 6173362620
parent 79f734d3f9
5 changed files with 32 additions and 8 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -1,4 +1,4 @@
-## 0.8.1-dev0
+## 0.8.1-dev1

 ### Enhancements

@ -7,6 +7,7 @@
 ### Fixes

 * Fixed `auto` strategy detected scanned document as having extractable text and using `fast` strategy, resulting in no output.
+* Fix list detection in MS Word documents.

 ## 0.8.0

--- a/example-docs/example-list-items-multiple.docx
+++ b/example-docs/example-list-items-multiple.docx
--- a/test_unstructured/partition/test_docx.py
+++ b/test_unstructured/partition/test_docx.py
@ -171,6 +171,21 @@ def test_partition_docx_includes_page_breaks(filename="example-docs/handbook-1p.
        assert element.metadata.filename == "handbook-1p.docx"


+def test_partition_docx_detects_lists(filename="example-docs/example-list-items-multiple.docx"):
+    elements = partition_docx(filename=filename)
+    list_elements = []
+    narrative_elements = []
+    for element in elements:
+        if isinstance(element, ListItem):
+            list_elements.append(element)
+        else:
+            narrative_elements.append(element)
+    assert elements[-1] == ListItem(
+        "This is simply dummy text of the printing and typesetting industry.",
+    )
+    assert len(list_elements) == 10
+
+
 def test_partition_docx_from_filename_exclude_metadata(filename="example-docs/handbook-1p.docx"):
    elements = partition_docx(filename=filename, include_metadata=False)
    assert elements[0].metadata.filetype is None
--- a/unstructured/version.py
+++ b/unstructured/version.py
@ -1 +1 @@
-__version__ = "0.8.1-dev0"  # pragma: no cover
+__version__ = "0.8.1-dev1"  # pragma: no cover
--- a/unstructured/partition/docx.py
+++ b/unstructured/partition/docx.py
@ -148,8 +148,8 @@ def partition_docx(

    document_contains_pagebreaks = _element_contains_pagebreak(document._element)
    page_number = 1 if document_contains_pagebreaks else None
-
    section = 0
+    is_list = False
    for element_item in document.element.body:
        if element_item.tag.endswith("tbl"):
            table = document.tables[table_index]
@ -165,14 +165,17 @@ def partition_docx(
                elements.append(element)
            table_index += 1
        elif element_item.tag.endswith("p"):
+            if "<w:numPr>" in element_item.xml:
+                is_list = True
            paragraph = docx.text.paragraph.Paragraph(element_item, document)
-            para_element: Optional[Text] = _paragraph_to_element(paragraph)
+            para_element: Optional[Text] = _paragraph_to_element(paragraph, is_list)
            if para_element is not None:
                para_element.metadata = ElementMetadata(
                    filename=metadata_filename,
                    page_number=page_number,
                )
                elements.append(para_element)
+            is_list = False
        elif element_item.tag.endswith("sectPr"):
            if len(headers_and_footers) > section:
                footers = headers_and_footers[section][1]
@ -191,7 +194,10 @@ def partition_docx(
    return elements


-def _paragraph_to_element(paragraph: docx.text.paragraph.Paragraph) -> Optional[Text]:
+def _paragraph_to_element(
+    paragraph: docx.text.paragraph.Paragraph,
+    is_list=False,
+) -> Optional[Text]:
    """Converts a docx Paragraph object into the appropriate unstructured document element.
    If the paragraph style is "Normal" or unknown, we try to predict the element type from the
    raw text."""
@ -205,7 +211,9 @@ def _paragraph_to_element(paragraph: docx.text.paragraph.Paragraph) -> Optional[

    # NOTE(robinson) - The "Normal" style name will return None since it's in the mapping.
    # Unknown style names will also return None
-    if element_class is None:
+    if is_list:
+        return _text_to_element(text, is_list)
+    elif element_class is None:
        return _text_to_element(text)
    else:
        return element_class(text)
@ -227,9 +235,9 @@ def _element_contains_pagebreak(element) -> bool:
    return False


-def _text_to_element(text: str) -> Optional[Text]:
+def _text_to_element(text: str, is_list=False) -> Optional[Text]:
    """Converts raw text into an unstructured Text element."""
-    if is_bulleted_text(text):
+    if is_bulleted_text(text) or is_list:
        clean_text = clean_bullets(text).strip()
        return ListItem(text=clean_bullets(text)) if clean_text else None