feat: add partition_odt for open office docs (#548)

* added filetype detection for odt * add function for partition odt documents * add odt files to auto * changelog and version * docs and readme * update installation docs * skip tests if not supported or in docker * import pytest * fix docs typos
2025-12-27 15:13:35 +00:00 · 2023-05-04 15:28:08 -04:00 · 2023-05-04 15:28:08 -04:00 · fae5f8fdde
commit fae5f8fdde
parent 981805e435
13 changed files with 154 additions and 5 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -1,4 +1,4 @@
-## 0.6.3-dev2
+## 0.6.3-dev3

 ### Enhancements

@ -7,6 +7,7 @@
 * Added `partition_multiple_via_api` for partitioning multiple documents in a single REST
  API call.
 * Added `stage_for_baseplate` function to prepare outputs for ingestion into Baseplate.
+* Added `partition_odt` for processing Open Office documents.

 ### Fixes

--- a/README.md
+++ b/README.md
@ -181,7 +181,8 @@ you can also uninstall the hooks with `pre-commit uninstall`.
 You can run this [Colab notebook](https://colab.research.google.com/drive/1U8VCjY2-x8c6y5TYMbSFtQGlQVFHCVIW) to run the examples below.

 The following examples show how to get started with the `unstructured` library.
-You can parse **TXT**, **HTML**, **PDF**, **EML**, **MSG**, **RTF**, **EPUB**, **DOC**, **DOCX**, **PPT**, **PPTX**, **JPG**,
+You can parse **TXT**, **HTML**, **PDF**, **EML**, **MSG**, **RTF**, **EPUB**, **DOC**, **DOCX**,
+**ODT**, **PPT**, **PPTX**, **JPG**,
 and **PNG** documents with one line of code!
 <br></br>
 See our [documentation page](https://unstructured-io.github.io/unstructured) for a full description
--- a/docs/source/bricks.rst
+++ b/docs/source/bricks.rst
@ -83,7 +83,7 @@ If you call the ``partition`` function, ``unstructured`` will attempt to detect
 file type and route it to the appropriate partitioning brick. All partitioning bricks
 called within ``partition`` are called using the default kwargs. Use the document-type
 specific bricks if you need to apply non-default settings.
-``partition`` currently supports ``.docx``, ``.doc``, ``.pptx``, ``.ppt``, ``.eml``, ``.msg``, ``.rtf``, ``.epub``, ``.html``, ``.pdf``,
+``partition`` currently supports ``.docx``, ``.doc``, ``.odt``, ``.pptx``, ``.ppt``, ``.eml``, ``.msg``, ``.rtf``, ``.epub``, ``.html``, ``.pdf``,
 ``.png``, ``.jpg``, and ``.txt`` files.
 If you set the ``include_page_breaks`` kwarg to ``True``, the output will include page breaks. This is only supported for ``.pptx``, ``.html``, ``.pdf``,
 ``.png``, and ``.jpg``.
@ -251,6 +251,22 @@ Examples:
  elements = partition_doc(filename="example-docs/fake.doc")


+``partition_odt``
+------------------
+
+The ``partition_odt`` partitioning brick pre-processes Open Office documents
+saved in the ``.odt`` format. The function first converts the document
+to ``.docx`` using ``pandoc`` and then processes it using ``partition_docx``.
+
+Examples:
+
+.. code:: python
+
+  from unstructured.partition.odt import partition_odt
+
+  elements = partition_odt(filename="example-docs/fake.odt")
+
+
 ``partition_pptx``
 ---------------------

--- a/docs/source/installing.rst
+++ b/docs/source/installing.rst
@ -15,7 +15,7 @@ installation.
 	* ``poppler-utils`` (images and PDFs)
 	* ``tesseract-ocr`` (images and PDFs)
 	* ``libreoffice`` (MS Office docs)
-	* ``pandocs`` (EPUBs)
+	* ``pandoc`` (EPUBs, RTFs and Open Office docs)

 * If you are parsing PDFs, run the following to install the ``detectron2`` model, which ``unstructured`` uses for layout detection:
 	* ``pip install "detectron2@git+https://github.com/facebookresearch/detectron2.git@e2ce8dc#egg=detectron2"``
--- a/example-docs/fake.odt
+++ b/example-docs/fake.odt
--- a/test_unstructured/file_utils/test_filetype.py
+++ b/test_unstructured/file_utils/test_filetype.py
@ -32,6 +32,7 @@ EXAMPLE_DOCS_DIRECTORY = os.path.join(FILE_DIRECTORY, "..", "..", "example-docs"
        ("fake-power-point.pptx", FileType.PPTX),
        ("winter-sports.epub", FileType.EPUB),
        ("spring-weather.html.json", FileType.JSON),
+        ("fake.odt", FileType.ODT),
    ],
 )
 def test_detect_filetype_from_filename(file, expected):
@ -55,6 +56,7 @@ def test_detect_filetype_from_filename(file, expected):
        ("winter-sports.epub", FileType.EPUB),
        ("fake-doc.rtf", FileType.RTF),
        ("spring-weather.html.json", FileType.JSON),
+        ("fake.odt", FileType.ODT),
    ],
 )
 def test_detect_filetype_from_filename_with_extension(monkeypatch, file, expected):
--- a/test_unstructured/partition/test_auto.py
+++ b/test_unstructured/partition/test_auto.py
@ -33,6 +33,7 @@ EXPECTED_EMAIL_OUTPUT = [

 is_in_docker = os.path.exists("/.dockerenv")
 rtf_not_supported = "rtf" not in pypandoc.get_pandoc_formats()[0]
+odt_not_supported = "odt" not in pypandoc.get_pandoc_formats()[0]


 def test_auto_partition_email_from_filename():
@ -461,3 +462,21 @@ def test_auto_partition_works_with_unstructured_jsons_from_file():
    with open(filename, "rb") as f:
        elements = partition(file=f)
    assert elements[0].text == "News Around NOAA"
+
+
+@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
+@pytest.mark.skipif(odt_not_supported, reason="odt not supported in this version of pypandoc.")
+def test_auto_partition_odt_from_filename():
+    # Auto partitioning by path should yield the example document's single title element.
+    odt_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake.odt")
+    assert partition(filename=odt_path) == [Title("Lorem ipsum dolor sit amet.")]
+
+
+@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
+@pytest.mark.skipif(odt_not_supported, reason="odt not supported in this version of pypandoc.")
+def test_auto_partition_odt_from_file():
+    # Auto partitioning should also work when the .odt document is passed as a binary stream.
+    odt_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake.odt")
+    with open(odt_path, "rb") as odt_stream:
+        parsed = partition(file=odt_stream)
+    assert parsed == [Title("Lorem ipsum dolor sit amet.")]
--- a/test_unstructured/partition/test_odt.py
+++ b/test_unstructured/partition/test_odt.py
@ -0,0 +1,32 @@
+import os
+import pathlib
+
+import pypandoc
+import pytest
+
+from unstructured.documents.elements import Title
+from unstructured.partition.odt import partition_odt
+
+DIRECTORY = pathlib.Path(__file__).parent.resolve()
+EXAMPLE_DOCS_DIRECTORY = os.path.join(DIRECTORY, "..", "..", "example-docs")
+
+odt_not_supported = "odt" not in pypandoc.get_pandoc_formats()[0]
+is_in_docker = os.path.exists("/.dockerenv")
+
+
+@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
+@pytest.mark.skipif(odt_not_supported, reason="odt not supported in this version of pypandoc.")
+def test_partition_odt_from_filename():
+    # Partitioning the example document by path should yield its single title element.
+    odt_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake.odt")
+    assert partition_odt(filename=odt_path) == [Title("Lorem ipsum dolor sit amet.")]
+
+
+@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
+@pytest.mark.skipif(odt_not_supported, reason="odt not supported in this version of pypandoc.")
+def test_partition_odt_from_file():
+    # Partitioning from an open binary stream should yield the single title element.
+    odt_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake.odt")
+    with open(odt_path, "rb") as odt_stream:
+        parsed = partition_odt(file=odt_stream)
+    assert parsed == [Title("Lorem ipsum dolor sit amet.")]
--- a/unstructured/version.py
+++ b/unstructured/version.py
@ -1 +1 @@
-__version__ = "0.6.3-dev2"  # pragma: no cover
+__version__ = "0.6.3-dev3"  # pragma: no cover
--- a/unstructured/file_utils/filetype.py
+++ b/unstructured/file_utils/filetype.py
@ -25,6 +25,10 @@ DOC_MIME_TYPES = [
    "application/msword",
 ]

+ODT_MIME_TYPES = [
+    "application/vnd.oasis.opendocument.text",
+]
+
 XLSX_MIME_TYPES = [
    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
 ]
@ -114,6 +118,9 @@ class FileType(Enum):
    # Compressed Types
    ZIP = 60

+    # Open Office Types
+    ODT = 70
+
    # NOTE(robinson) - This is to support sorting for pandas groupby functions
    def __lt__(self, other):
        return self.name < other.name
@ -135,6 +142,7 @@ STR_TO_FILETYPE = {
    "application/vnd.openxmlformats-officedocument.presentationml.presentation": FileType.PPTX,
    "application/vnd.ms-powerpoint": FileType.PPT,
    "application/xml": FileType.XML,
+    "application/vnd.oasis.opendocument.text": FileType.ODT,
 }


@ -160,6 +168,7 @@ EXT_TO_FILETYPE = {
    ".json": FileType.JSON,
    ".epub": FileType.EPUB,
    ".msg": FileType.MSG,
+    ".odt": FileType.ODT,
    None: FileType.UNK,
 }

@ -221,6 +230,9 @@ def detect_filetype(
    elif mime_type in DOC_MIME_TYPES:
        return FileType.DOC

+    elif mime_type in ODT_MIME_TYPES:
+        return FileType.ODT
+
    elif mime_type in MSG_MIME_TYPES:
        return FileType.MSG

--- a/unstructured/partition/auto.py
+++ b/unstructured/partition/auto.py
@ -15,6 +15,7 @@ from unstructured.partition.image import partition_image
 from unstructured.partition.json import partition_json
 from unstructured.partition.md import partition_md
 from unstructured.partition.msg import partition_msg
+from unstructured.partition.odt import partition_odt
 from unstructured.partition.pdf import partition_pdf
 from unstructured.partition.ppt import partition_ppt
 from unstructured.partition.pptx import partition_pptx
@ -106,6 +107,8 @@ def partition(
        elements = partition_doc(filename=filename, file=file)
    elif filetype == FileType.DOCX:
        elements = partition_docx(filename=filename, file=file)
+    elif filetype == FileType.ODT:
+        elements = partition_odt(filename=filename, file=file)
    elif filetype == FileType.EML:
        elements = partition_email(filename=filename, file=file, encoding=encoding)
    elif filetype == FileType.MSG:
--- a/unstructured/partition/docx.py
+++ b/unstructured/partition/docx.py
@ -1,6 +1,9 @@
+import os
+import tempfile
 from typing import IO, List, Optional

 import docx
+import pypandoc

 from unstructured.cleaners.core import clean_bullets
 from unstructured.documents.elements import (
@ -132,3 +135,46 @@ def _text_to_element(text: str) -> Optional[Text]:
        return Title(text)
    else:
        return Text(text)
+
+
+def convert_and_partition_docx(
+    source_format: str,
+    filename: Optional[str] = None,
+    file: Optional[IO] = None,
+) -> List[Element]:
+    """Converts a document to DOCX and then partitions it using partition_docx. Works with
+    any file format supported by pandoc.
+
+    Parameters
+    ----------
+    source_format
+        The format of the source document, e.g. odt
+    filename
+        A string defining the target filename path.
+    file
+        A file-like object using "rb" mode --> open(filename, "rb").
+
+    Returns
+    -------
+    The elements produced by partition_docx for the converted document.
+
+    Raises
+    ------
+    ValueError
+        If filename is provided but does not exist on disk.
+    """
+    if filename is None:
+        filename = ""
+    exactly_one(filename=filename, file=file)
+
+    # pandoc converts from a path, so a file-like input is first copied to a named temporary
+    # file. Its path is remembered so it can be deleted once partitioning is done.
+    tmp_path: Optional[str] = None
+    if len(filename) > 0:
+        if not os.path.exists(filename):
+            raise ValueError(f"The file {filename} does not exist.")
+    elif file is not None:
+        with tempfile.NamedTemporaryFile(delete=False) as tmp:
+            tmp.write(file.read())
+        filename = tmp_path = tmp.name
+
+    base_filename, _ = os.path.splitext(os.path.basename(os.path.abspath(filename)))
+
+    try:
+        # The intermediate .docx only needs to live until partition_docx has read it.
+        with tempfile.TemporaryDirectory() as tmpdir:
+            docx_filename = os.path.join(tmpdir, f"{base_filename}.docx")
+            pypandoc.convert_file(filename, "docx", format=source_format, outputfile=docx_filename)
+            return partition_docx(filename=docx_filename, metadata_filename=filename)
+    finally:
+        if tmp_path is not None:
+            os.unlink(tmp_path)
--- a/unstructured/partition/odt.py
+++ b/unstructured/partition/odt.py
@ -0,0 +1,17 @@
+from typing import IO, List, Optional
+
+from unstructured.documents.elements import Element
+from unstructured.partition.docx import convert_and_partition_docx
+
+
+def partition_odt(filename: Optional[str] = None, file: Optional[IO] = None) -> List[Element]:
+    """Splits an Open Office document in .odt format into its document elements.
+
+    Parameters
+    ----------
+    filename
+        Path of the .odt file to partition.
+    file
+        A file-like object opened in binary mode, e.g. open(filename, "rb").
+    """
+    return convert_and_partition_docx("odt", filename=filename, file=file)