feat: add partition_doc for .doc files (#236)

* first pass on doc partitioning * add libreoffice to deps * update docs and readme * add .doc to auto * changelog bump * value error with missing doc * doc updates
2025-06-27 02:30:08 +00:00 · 2023-02-17 09:30:23 -05:00 · 2023-02-17 09:30:23 -05:00 · 6036af33e7
commit 6036af33e7
parent 9bbd4a1d56
13 changed files with 238 additions and 8 deletions
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@ -98,7 +98,7 @@ jobs:
        source .venv/bin/activate
        make install-nltk-models
        make install-detectron2
-        sudo apt-get install -y libmagic-dev poppler-utils tesseract-ocr
+        sudo apt-get install -y libmagic-dev poppler-utils tesseract-ocr libreoffice
        make test
        make check-coverage
        make install-ingest-s3
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -1,3 +1,7 @@
+## 0.4.11-dev0
+
+* Adds `partition_doc` for partitioning Word documents in `.doc` format. Requires `libreoffice`.
+
 ## 0.4.10

 * Fixes `ElementMetadata` so that it's JSON serializable when the filename is a `Path` object.
--- a/README.md
+++ b/README.md
@ -78,7 +78,7 @@ To install the library, run `pip install unstructured`.
 You can run this [Colab notebook](https://colab.research.google.com/drive/1U8VCjY2-x8c6y5TYMbSFtQGlQVFHCVIW) to run the examples below.

 The following examples show how to get started with the `unstructured` library.
-You can parse **TXT**, **HTML**, **PDF**, **EML** and **DOCX** documents with one line of code!
+You can parse **TXT**, **HTML**, **PDF**, **EML**, **DOC** and **DOCX** documents with one line of code!
 <br></br>
 See our [documentation page](https://unstructured-io.github.io/unstructured) for a full description
 of the features in the library.
@ -92,7 +92,7 @@ If you are using the `partition` brick, you may need to install additional param
 instructions outlined [here](https://unstructured-io.github.io/unstructured/installing.html#filetype-detection)
 `partition` will always apply the default arguments. If you need
 advanced features, use a document-specific brick. The `partition` brick currently works for
-`.txt`, `.docx`, `.pptx`, `.jpg`, `.png`, `.eml`, `.html`, and `.pdf` documents.
+`.txt`, `.doc`, `.docx`, `.pptx`, `.jpg`, `.png`, `.eml`, `.html`, and `.pdf` documents.

 ```python
 from unstructured.partition.auto import partition
--- a/docs/source/bricks.rst
+++ b/docs/source/bricks.rst
@ -22,7 +22,7 @@ If you call the ``partition`` function, ``unstructured`` will attempt to detect
 file type and route it to the appropriate partitioning brick. All partitioning bricks
 called within ``partition`` are called using the defualt kwargs. Use the document-type
 specific bricks if you need to apply non-default settings.
-``partition`` currently supports ``.docx``, ``.pptx``, ``.eml``, ``.html``, ``.pdf``,
+``partition`` currently supports ``.docx``, ``.doc``, ``.pptx``, ``.eml``, ``.html``, ``.pdf``,
 ``.png``, ``.jpg``, and ``.txt`` files.
 If you set the ``include_page_breaks`` kwarg to ``True``, the output will include page breaks. This is only supported for ``.pptx``, ``.html``, ``.pdf``,
 ``.png``, and ``.jpg``.
@ -81,6 +81,28 @@ Examples:
  with open("mydoc.docx", "rb") as f:
      elements = partition_docx(file=f)

+
+``partition_doc``
+------------------
+
+The ``partition_doc`` partitioning brick pre-processes Microsoft Word documents
+saved in the ``.doc`` format. This partitioning brick uses a combination of the styling
+information in the document and the structure of the text to determine the type
+of a text element. ``partition_doc`` can take a filename or file-like object
+as input; the example below uses a filename. ``partition_doc``
+uses ``libreoffice`` to convert the file to ``.docx`` and then
+calls ``partition_docx``. Ensure you have ``libreoffice`` installed
+before using ``partition_doc``.
+
+Examples:
+
+.. code:: python
+
+  from unstructured.partition.doc import partition_doc
+
+  elements = partition_doc(filename="example-docs/fake.doc")
+
+
 ``partition_pptx``
 ---------------------

--- a/example-docs/fake.doc
+++ b/example-docs/fake.doc
--- a/test_unstructured/partition/test_auto.py
+++ b/test_unstructured/partition/test_auto.py
@ -8,6 +8,7 @@ import docx
 from unstructured.documents.elements import Address, NarrativeText, PageBreak, Title, Text, ListItem
 from unstructured.partition.auto import partition
 import unstructured.partition.auto as auto
+from unstructured.partition.common import convert_office_doc

 DIRECTORY = pathlib.Path(__file__).parent.resolve()
 EXAMPLE_DOCS_DIRECTORY = os.path.join(DIRECTORY, "..", "..", "example-docs")
@ -96,6 +97,30 @@ def test_auto_partition_docx_with_file(mock_docx_document, expected_docx_element
    assert elements == expected_docx_elements


+def test_auto_partition_doc_with_filename(mock_docx_document, expected_docx_elements, tmpdir):
+    # Save the mocked document as .docx, then convert it to produce the .doc input.
+    source_path = os.path.join(tmpdir.dirname, "mock_document.docx")
+    mock_docx_document.save(source_path)
+    convert_office_doc(source_path, tmpdir.dirname, "doc")
+    converted_path = os.path.join(tmpdir.dirname, "mock_document.doc")
+
+    assert partition(filename=converted_path) == expected_docx_elements
+
+
+# NOTE(robinson) - the application/x-ole-storage mime type is not specific enough to
+# determine that the file is a .doc document
+@pytest.mark.xfail
+def test_auto_partition_doc_with_file(mock_docx_document, expected_docx_elements, tmpdir):
+    # Save the mocked document as .docx, then convert it to produce the .doc input.
+    source_path = os.path.join(tmpdir.dirname, "mock_document.docx")
+    mock_docx_document.save(source_path)
+    convert_office_doc(source_path, tmpdir.dirname, "doc")
+    converted_path = os.path.join(tmpdir.dirname, "mock_document.doc")
+
+    with open(converted_path, "rb") as doc_file:
+        assert partition(file=doc_file) == expected_docx_elements
+
+
 def test_auto_partition_html_from_filename():
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "example-10k.html")
    elements = partition(filename=filename)
--- a/test_unstructured/partition/test_doc.py
+++ b/test_unstructured/partition/test_doc.py
@ -0,0 +1,103 @@
+import os
+import pytest
+
+import docx
+
+from unstructured.documents.elements import Address, ListItem, NarrativeText, Title, Text
+from unstructured.partition.common import convert_office_doc
+from unstructured.partition.doc import partition_doc
+from unstructured.partition.docx import partition_docx
+
+
+@pytest.fixture
+def mock_document():
+    document = docx.Document()
+    # NOTE - the non-empty paragraphs below follow the order of the expected_elements fixture
+    document.add_paragraph("These are a few of my favorite things:", style="Heading 1")
+    # NOTE(robinson) - this should get picked up as a list item due to the •
+    document.add_paragraph("• Parrots", style="Normal")
+    # NOTE(robinson) - this should get dropped because it's empty
+    document.add_paragraph("• ", style="Normal")
+    document.add_paragraph("Hockey", style="List Bullet")
+    # NOTE(robinson) - this should get dropped because it's empty
+    document.add_paragraph("", style="List Bullet")
+    # NOTE(robinson) - this should get picked up as a title
+    document.add_paragraph("Analysis", style="Normal")
+    # NOTE(robinson) - this should get dropped because it is empty
+    document.add_paragraph("", style="Normal")
+    # NOTE(robinson) - this should get picked up as a narrative text
+    document.add_paragraph("This is my first thought. This is my second thought.", style="Normal")
+    document.add_paragraph("This is my third thought.", style="Body Text")
+    # NOTE(robinson) - this should just be regular text
+    document.add_paragraph("2023")
+    # NOTE(robinson) - this should be an address
+    document.add_paragraph("DOYLESTOWN, PA 18901")
+
+    return document
+
+
+@pytest.fixture
+def expected_elements():
+    return [  # elements the tests expect from partitioning the mock_document fixture, in order
+        Title("These are a few of my favorite things:"),
+        ListItem("Parrots"),
+        ListItem("Hockey"),
+        Title("Analysis"),
+        NarrativeText("This is my first thought. This is my second thought."),
+        NarrativeText("This is my third thought."),
+        Text("2023"),
+        Address("DOYLESTOWN, PA 18901"),
+    ]
+
+
+def test_partition_doc_with_filename(mock_document, expected_elements, tmpdir):
+    # Save the mocked document as .docx, then convert it to produce the .doc input.
+    source_path = os.path.join(tmpdir.dirname, "mock_document.docx")
+    mock_document.save(source_path)
+    convert_office_doc(source_path, tmpdir.dirname, "doc")
+    converted_path = os.path.join(tmpdir.dirname, "mock_document.doc")
+
+    assert partition_doc(filename=converted_path) == expected_elements
+
+
+def test_partition_doc_matches_partition_docx(mock_document, expected_elements, tmpdir):
+    docx_filename = os.path.join(tmpdir.dirname, "mock_document.docx")
+    doc_filename = os.path.join(tmpdir.dirname, "mock_document.doc")
+    mock_document.save(docx_filename)
+    convert_office_doc(docx_filename, tmpdir.dirname, "doc")
+    # NOTE - the comparison must be asserted, otherwise this test can never fail
+    assert partition_doc(filename=doc_filename) == partition_docx(filename=docx_filename)
+
+
+def test_partition_raises_with_missing_doc(mock_document, expected_elements, tmpdir):
+    # This test never writes anything to the path below.
+    missing_path = os.path.join(tmpdir.dirname, "asdf.doc")
+    with pytest.raises(ValueError):
+        partition_doc(filename=missing_path)
+
+
+def test_partition_doc_with_file(mock_document, expected_elements, tmpdir):
+    # Save the mocked document as .docx, then convert it to produce the .doc input.
+    source_path = os.path.join(tmpdir.dirname, "mock_document.docx")
+    mock_document.save(source_path)
+    convert_office_doc(source_path, tmpdir.dirname, "doc")
+    converted_path = os.path.join(tmpdir.dirname, "mock_document.doc")
+
+    with open(converted_path, "rb") as doc_file:
+        assert partition_doc(file=doc_file) == expected_elements
+
+
+def test_partition_doc_raises_with_both_specified(mock_document, tmpdir):
+    # Save the mocked document as .docx, then convert it to produce the .doc input.
+    source_path = os.path.join(tmpdir.dirname, "mock_document.docx")
+    mock_document.save(source_path)
+    convert_office_doc(source_path, tmpdir.dirname, "doc")
+    converted_path = os.path.join(tmpdir.dirname, "mock_document.doc")
+
+    with open(converted_path, "rb") as doc_file, pytest.raises(ValueError):
+        partition_doc(filename=converted_path, file=doc_file)
+
+
+def test_partition_doc_raises_with_neither():
+    with pytest.raises(ValueError):
+        partition_doc()  # neither filename nor file is supplied
--- a/unstructured/version.py
+++ b/unstructured/version.py
@ -1 +1 @@
-__version__ = "0.4.10"  # pragma: no cover
+__version__ = "0.4.11-dev0"  # pragma: no cover
--- a/unstructured/partition/auto.py
+++ b/unstructured/partition/auto.py
@ -1,6 +1,7 @@
 from typing import IO, Optional

 from unstructured.file_utils.filetype import detect_filetype, FileType
+from unstructured.partition.doc import partition_doc
 from unstructured.partition.docx import partition_docx
 from unstructured.partition.email import partition_email
 from unstructured.partition.html import partition_html
@ -34,6 +35,8 @@ def partition(
    if file is not None:
        file.seek(0)

+    if filetype == FileType.DOC:
+        return partition_doc(filename=filename, file=file)
    if filetype == FileType.DOCX:
        return partition_docx(filename=filename, file=file)
    elif filetype == FileType.EML:
--- a/unstructured/partition/common.py
+++ b/unstructured/partition/common.py
@ -1,3 +1,4 @@
+import subprocess
 from typing import List, Optional, Union

 from unstructured.documents.elements import (
@ -101,3 +102,32 @@ def add_element_metadata(
            element.metadata = metadata
            elements.append(element)
    return elements
+
+
+def convert_office_doc(input_filename: str, output_directory: str, target_format: str):
+    """Converts input_filename to target_format (e.g. "docx") using the libreoffice CLI and
+    writes the result to output_directory. Raises FileNotFoundError if soffice is missing."""
+    # NOTE(robinson) - win32com could be added as a fallback for windows users without LibreOffice
+    # ref: https://stackoverflow.com/questions/38468442/
+    #       multiple-doc-to-docx-file-conversion-using-python
+    try:
+        subprocess.call(
+            [
+                "soffice",
+                "--headless",
+                "--convert-to",
+                target_format,
+                "--outdir",
+                output_directory,
+                input_filename,
+            ]
+        )
+    except FileNotFoundError:
+        raise FileNotFoundError(
+            """soffice command was not found. Please install libreoffice
+on your system and try again.
+
+- Install instructions: https://www.libreoffice.org/get-help/install-howto/
+- Mac: https://formulae.brew.sh/cask/libreoffice
+- Debian: https://wiki.debian.org/LibreOffice"""
+        )
--- a/unstructured/partition/doc
+++ b/unstructured/partition/doc
--- a/unstructured/partition/doc.py
+++ b/unstructured/partition/doc.py
@ -0,0 +1,45 @@
+import os
+import tempfile
+from typing import IO, List, Optional
+
+from unstructured.documents.elements import Element
+from unstructured.partition.common import convert_office_doc
+from unstructured.partition.docx import partition_docx
+
+
+def partition_doc(filename: Optional[str] = None, file: Optional[IO] = None) -> List[Element]:
+    """Partitions Microsoft Word Documents in .doc format into its document elements.
+
+    Parameters
+    ----------
+    filename
+        A string defining the target filename path.
+    file
+        A file-like object using "rb" mode --> open(filename, "rb").
+
+    Raises
+    ------
+    ValueError
+        If neither or both of filename and file are specified, or the file does not exist.
+    """
+    if not any([filename, file]):
+        raise ValueError("One of filename or file must be specified.")
+    if filename and file:
+        raise ValueError("Only one of filename or file can be specified.")
+
+    # NOTE - all scratch files live in one temporary directory so that both the staged
+    # copy of ``file`` and the converted .docx are deleted when the block exits
+    with tempfile.TemporaryDirectory() as tmpdir:
+        if file:
+            with tempfile.NamedTemporaryFile(dir=tmpdir, delete=False) as tmp:
+                tmp.write(file.read())
+            filename = tmp.name
+        if not os.path.exists(filename):
+            raise ValueError(f"The file {filename} does not exist.")
+
+        base_filename, _ = os.path.splitext(os.path.basename(os.path.abspath(filename)))
+        convert_office_doc(filename, tmpdir, target_format="docx")
+        docx_filename = os.path.join(tmpdir, f"{base_filename}.docx")
+        elements = partition_docx(filename=docx_filename)
+
+    return elements
--- a/unstructured/partition/docx.py
+++ b/unstructured/partition/docx.py
@ -56,9 +56,7 @@ STYLE_TO_ELEMENT_MAPPING = {
 }


-def partition_docx(
-    filename: Optional[str] = None, file: Optional[IO] = None, **kwargs
-) -> List[Element]:
+def partition_docx(filename: Optional[str] = None, file: Optional[IO] = None) -> List[Element]:
    """Partitions Microsoft Word Documents in .docx format into its document elements.

    Parameters