feat: add partition_docx for Word documents (#131)

* first pass on docx parsing * linting, linting, linting * test docx with filename * added documentation * more tests; version bump * typo * another typo * another typo! * it -> its * save -> saved * remove None since it's the default argument
2025-11-30 09:09:53 +00:00 · 2023-01-05 15:13:39 -05:00 · 2023-01-05 15:13:39 -05:00 · fee95b643c
commit fee95b643c
parent 33b983fbf0
5 changed files with 218 additions and 3 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -1,9 +1,10 @@
-## 0.3.6-dev0
+## 0.3.6-dev1

 * Cleaning brick for removing ordered bullets `clean_ordered_bullets`.
 * Extract brick method for ordered bullets `extract_ordered_bullets`.
 * Test for `clean_ordered_bullets`.
 * Test for `extract_ordered_bullets`.
+* Added `partition_docx` for pre-processing Word Documents.

 ## 0.3.5

--- a/docs/source/bricks.rst
+++ b/docs/source/bricks.rst
@ -14,6 +14,36 @@ The partitioning bricks in ``unstructured`` differentiate between different sect
 of text in a document. For example, the partitioning bricks can help distinguish between
 titles, narrative text, and tables.

+
+``partition_docx``
+------------------
+
+The ``partition_docx`` partitioning brick pre-processes Microsoft Word documents
+saved in the ``.docx`` format. This partitioning brick uses a combination of the styling
+information in the document and the structure of the text to determine the type
+of a text element. The ``partition_docx`` brick can take a filename or file-like object
+as input, as shown in the two examples below.
+
+Examples:
+
+.. code:: python
+
+  import docx
+
+  from unstructured.partition.docx import partition_docx
+
+  document = docx.Document()
+  document.add_paragraph("Important Analysis", style="Heading 1")
+  document.add_paragraph("Here is my first thought.", style="Body Text")
+  document.add_paragraph("Here is my second thought.", style="Normal")
+  document.save("mydoc.docx")
+
+  elements = partition_docx(filename="mydoc.docx")
+
+  with open("mydoc.docx", "rb") as f:
+      elements = partition_docx(file=f)
+
+
 ``partition_html``
 ---------------------

@ -329,7 +359,7 @@ Examples:


 ``clean_ordered_bullets``
-----------------
+-------------------------

 Remove alpha-numeric bullets from the beginning of text up to three “sub-section” levels.

--- a/test_unstructured/partition/test_docx.py
+++ b/test_unstructured/partition/test_docx.py
@ -0,0 +1,72 @@
+import os
+import pytest
+
+import docx
+
+from unstructured.documents.elements import ListItem, NarrativeText, Title, Text
+from unstructured.partition.docx import partition_docx
+
+
+@pytest.fixture
+def mock_document():
+    document = docx.Document()
+
+    document.add_paragraph("These are a few of my favorite things:", style="Heading 1")
+    # NOTE(robinson) - this should get picked up as a list item due to the •
+    document.add_paragraph("• Parrots", style="Normal")
+    document.add_paragraph("Hockey", style="List Bullet")
+    # NOTE(robinson) - this should get picked up as a title
+    document.add_paragraph("Analysis", style="Normal")
+    # NOTE(robinson) - this should get dropped because it is empty
+    document.add_paragraph("", style="Normal")
+    # NOTE(robinson) - this should get picked up as a narrative text
+    document.add_paragraph("This is my first thought. This is my second thought.", style="Normal")
+    document.add_paragraph("This is my third thought.", style="Body Text")
+    # NOTE(robinson) - this should just be regular text
+    document.add_paragraph("2023")
+
+    return document
+
+
+@pytest.fixture
+def expected_elements():
+    return [
+        Title("These are a few of my favorite things:"),
+        ListItem("Parrots"),
+        ListItem("Hockey"),
+        Title("Analysis"),
+        NarrativeText("This is my first thought. This is my second thought."),
+        NarrativeText("This is my third thought."),
+        Text("2023"),
+    ]
+
+
+def test_partition_docx_with_filename(mock_document, expected_elements, tmpdir):
+    filename = os.path.join(tmpdir.dirname, "mock_document.docx")
+    mock_document.save(filename)
+
+    elements = partition_docx(filename=filename)
+    assert elements == expected_elements
+
+
+def test_partition_docx_with_file(mock_document, expected_elements, tmpdir):
+    filename = os.path.join(tmpdir.dirname, "mock_document.docx")
+    mock_document.save(filename)
+
+    with open(filename, "rb") as f:
+        elements = partition_docx(file=f)
+    assert elements == expected_elements
+
+
+def test_partition_docx_raises_with_both_specified(mock_document, tmpdir):
+    filename = os.path.join(tmpdir.dirname, "mock_document.docx")
+    mock_document.save(filename)
+
+    with open(filename, "rb") as f:
+        with pytest.raises(ValueError):
+            partition_docx(filename=filename, file=f)
+
+
+def test_partition_docx_raises_with_neither():
+    with pytest.raises(ValueError):
+        partition_docx()
--- a/unstructured/version.py
+++ b/unstructured/version.py
@ -1 +1 @@
-__version__ = "0.3.6-dev0"  # pragma: no cover
+__version__ = "0.3.6-dev1"  # pragma: no cover
--- a/unstructured/partition/docx.py
+++ b/unstructured/partition/docx.py
@ -0,0 +1,112 @@
+from typing import IO, List, Optional
+
+import docx
+
+from unstructured.cleaners.core import clean_bullets
+from unstructured.documents.elements import Element, ListItem, NarrativeText, Text, Title
+from unstructured.partition.text_type import (
+    is_bulleted_text,
+    is_possible_narrative_text,
+    is_possible_title,
+)
+
+# NOTE(robinson) - documentation on built in styles can be found at the link below
+# ref: https://python-docx.readthedocs.io/en/latest/user/
+#   styles-understanding.html#paragraph-styles-in-default-template
+STYLE_TO_ELEMENT_MAPPING = {
+    "Body Text": NarrativeText,
+    "Body Text 2": NarrativeText,
+    "Body Text 3": NarrativeText,
+    "Caption": Text,  # TODO(robinson) - add caption element type
+    "Heading 1": Title,
+    "Heading 2": Title,
+    "Heading 3": Title,
+    "Heading 4": Title,
+    "Heading 5": Title,
+    "Heading 6": Title,
+    "Heading 7": Title,
+    "Heading 8": Title,
+    "Heading 9": Title,
+    "Intense Quote": Text,  # TODO(robinson) - add quote element type
+    "List": ListItem,
+    "List 2": ListItem,
+    "List 3": ListItem,
+    "List Bullet": ListItem,
+    "List Bullet 2": ListItem,
+    "List Bullet 3": ListItem,
+    "List Continue": ListItem,
+    "List Continue 2": ListItem,
+    "List Continue 3": ListItem,
+    "List Number": ListItem,
+    "List Number 2": ListItem,
+    "List Number 3": ListItem,
+    "List Paragraph": ListItem,
+    "Macro Text": Text,
+    "No Spacing": Text,
+    "Quote": Text,  # TODO(robinson) - add quote element type
+    "Subtitle": Title,
+    "TOCHeading": Title,
+    "Title": Title,
+}
+
+
+def partition_docx(filename: Optional[str] = None, file: Optional[IO] = None) -> List[Element]:
+    """Partitions Microsoft Word Documents in .docx format into its document elements.
+
+    Parameters
+    ----------
+    filename
+        A string defining the target filename path.
+    file
+        A file-like object using "rb" mode --> open(filename, "rb").
+    """
+
+    if not any([filename, file]):
+        raise ValueError("One of filename or file must be specified.")
+
+    if filename is not None and not file:
+        document = docx.Document(filename)
+    elif file is not None and not filename:
+        document = docx.Document(file)
+    else:
+        raise ValueError("Only one of filename or file can be specified.")
+
+    elements: List[Element] = []
+    for paragraph in document.paragraphs:
+        element = _paragraph_to_element(paragraph)
+        if element is not None:
+            elements.append(element)
+
+    return elements
+
+
+def _paragraph_to_element(paragraph: docx.text.paragraph.Paragraph) -> Optional[Text]:
+    """Converts a docx Paragraph object into the appropriate unstructured document element.
+    If the paragraph style is "Normal" or unknown, we try to predict the element type from the
+    raw text."""
+    text = paragraph.text
+    style_name = paragraph.style.name
+
+    element_class = STYLE_TO_ELEMENT_MAPPING.get(style_name)
+
+    # NOTE(robinson) - The "Normal" style name will return None since it's not in the mapping.
+    # Unknown style names will also return None.
+    if element_class is None:
+        return _text_to_element(text)
+    else:
+        return element_class(text)
+
+
+def _text_to_element(text: str) -> Optional[Text]:
+    """Converts raw text into an unstructured Text element."""
+    if is_bulleted_text(text):
+        return ListItem(text=clean_bullets(text))
+
+    if len(text) < 2:
+        return None
+    elif is_possible_narrative_text(text):
+        return NarrativeText(text)
+    elif is_possible_title(text):
+        return Title(text)
+    else:
+        return Text(text)