feat: add partition_odt for open office docs (#548)

* added filetype detection for odt

* add function for partition odt documents

* add odt files to auto

* changelog and version

* docs and readme

* update installation docs

* skip tests if not supported or in docker

* import pytest

* fix docs typos
This commit is contained in:
Matt Robinson 2023-05-04 15:28:08 -04:00 committed by GitHub
parent 981805e435
commit fae5f8fdde
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
13 changed files with 154 additions and 5 deletions

View File

@ -1,4 +1,4 @@
## 0.6.3-dev2
## 0.6.3-dev3
### Enhancements
@ -7,6 +7,7 @@
* Added `partition_multiple_via_api` for partitioning multiple documents in a single REST
API call.
* Added `stage_for_baseplate` function to prepare outputs for ingestion into Baseplate.
* Added `partition_odt` for processing Open Office documents.
### Fixes

View File

@ -181,7 +181,8 @@ you can also uninstall the hooks with `pre-commit uninstall`.
You can run this [Colab notebook](https://colab.research.google.com/drive/1U8VCjY2-x8c6y5TYMbSFtQGlQVFHCVIW) to run the examples below.
The following examples show how to get started with the `unstructured` library.
You can parse **TXT**, **HTML**, **PDF**, **EML**, **MSG**, **RTF**, **EPUB**, **DOC**, **DOCX**, **PPT**, **PPTX**, **JPG**,
You can parse **TXT**, **HTML**, **PDF**, **EML**, **MSG**, **RTF**, **EPUB**, **DOC**, **DOCX**,
**ODT**, **PPT**, **PPTX**, **JPG**,
and **PNG** documents with one line of code!
<br></br>
See our [documentation page](https://unstructured-io.github.io/unstructured) for a full description

View File

@ -83,7 +83,7 @@ If you call the ``partition`` function, ``unstructured`` will attempt to detect
file type and route it to the appropriate partitioning brick. All partitioning bricks
called within ``partition`` are called using the default kwargs. Use the document-type
specific bricks if you need to apply non-default settings.
``partition`` currently supports ``.docx``, ``.doc``, ``.pptx``, ``.ppt``, ``.eml``, ``.msg``, ``.rtf``, ``.epub``, ``.html``, ``.pdf``,
``partition`` currently supports ``.docx``, ``.doc``, ``.odt``, ``.pptx``, ``.ppt``, ``.eml``, ``.msg``, ``.rtf``, ``.epub``, ``.html``, ``.pdf``,
``.png``, ``.jpg``, and ``.txt`` files.
If you set the ``include_page_breaks`` kwarg to ``True``, the output will include page breaks. This is only supported for ``.pptx``, ``.html``, ``.pdf``,
``.png``, and ``.jpg``.
@ -251,6 +251,22 @@ Examples:
elements = partition_doc(filename="example-docs/fake.doc")
``partition_odt``
------------------
The ``partition_odt`` partitioning brick pre-processes Open Office documents
saved in the ``.odt`` format. The function first converts the document
to ``.docx`` using ``pandoc`` and then processes it using ``partition_docx``.
Examples:
.. code:: python
from unstructured.partition.odt import partition_odt
elements = partition_odt(filename="example-docs/fake.odt")
``partition_pptx``
---------------------

View File

@ -15,7 +15,7 @@ installation.
* ``poppler-utils`` (images and PDFs)
* ``tesseract-ocr`` (images and PDFs)
* ``libreoffice`` (MS Office docs)
* ``pandocs`` (EPUBs)
* ``pandoc`` (EPUBs, RTFs and Open Office docs)
* If you are parsing PDFs, run the following to install the ``detectron2`` model, which ``unstructured`` uses for layout detection:
* ``pip install "detectron2@git+https://github.com/facebookresearch/detectron2.git@e2ce8dc#egg=detectron2"``

BIN
example-docs/fake.odt Normal file

Binary file not shown.

View File

@ -32,6 +32,7 @@ EXAMPLE_DOCS_DIRECTORY = os.path.join(FILE_DIRECTORY, "..", "..", "example-docs"
("fake-power-point.pptx", FileType.PPTX),
("winter-sports.epub", FileType.EPUB),
("spring-weather.html.json", FileType.JSON),
("fake.odt", FileType.ODT),
],
)
def test_detect_filetype_from_filename(file, expected):
@ -55,6 +56,7 @@ def test_detect_filetype_from_filename(file, expected):
("winter-sports.epub", FileType.EPUB),
("fake-doc.rtf", FileType.RTF),
("spring-weather.html.json", FileType.JSON),
("fake.odt", FileType.ODT),
],
)
def test_detect_filetype_from_filename_with_extension(monkeypatch, file, expected):

View File

@ -33,6 +33,7 @@ EXPECTED_EMAIL_OUTPUT = [
is_in_docker = os.path.exists("/.dockerenv")
rtf_not_supported = "rtf" not in pypandoc.get_pandoc_formats()[0]
odt_not_supported = "odt" not in pypandoc.get_pandoc_formats()[0]
def test_auto_partition_email_from_filename():
@ -461,3 +462,21 @@ def test_auto_partition_works_with_unstructured_jsons_from_file():
with open(filename, "rb") as f:
elements = partition(file=f)
assert elements[0].text == "News Around NOAA"
@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
@pytest.mark.skipif(odt_not_supported, reason="odt not supported in this version of pypandoc.")
def test_auto_partition_odt_from_filename():
    """Auto-partitioning ``fake.odt`` by path yields its single title element."""
    expected = [Title("Lorem ipsum dolor sit amet.")]
    odt_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake.odt")
    assert partition(filename=odt_path) == expected
@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
@pytest.mark.skipif(odt_not_supported, reason="odt not supported in this version of pypandoc.")
def test_auto_partition_odt_from_file():
    """Auto-partitioning ``fake.odt`` from a binary file handle yields its single title element."""
    expected = [Title("Lorem ipsum dolor sit amet.")]
    odt_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake.odt")
    with open(odt_path, "rb") as odt_file:
        parsed = partition(file=odt_file)
    assert parsed == expected

View File

@ -0,0 +1,32 @@
import os
import pathlib
import pypandoc
import pytest
from unstructured.documents.elements import Title
from unstructured.partition.odt import partition_odt
# Resolved directory that contains this test module.
DIRECTORY = pathlib.Path(__file__).parent.resolve()
# The sample documents live in ``example-docs``, two levels above this module.
EXAMPLE_DOCS_DIRECTORY = os.path.join(DIRECTORY, "..", "..", "example-docs")
# True when "odt" is absent from the first list returned by pypandoc.get_pandoc_formats()
# (presumably pandoc's supported *input* formats -- confirm against the pypandoc docs).
odt_not_supported = "odt" not in pypandoc.get_pandoc_formats()[0]
# True when /.dockerenv exists; the tests below use it to skip inside a Docker container.
is_in_docker = os.path.exists("/.dockerenv")
@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
@pytest.mark.skipif(odt_not_supported, reason="odt not supported in this version of pypandoc.")
def test_partition_odt_from_filename():
    """``partition_odt`` given a path to ``fake.odt`` returns its single title element."""
    expected = [Title("Lorem ipsum dolor sit amet.")]
    odt_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake.odt")
    assert partition_odt(filename=odt_path) == expected
@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
@pytest.mark.skipif(odt_not_supported, reason="odt not supported in this version of pypandoc.")
def test_partition_odt_from_file():
    """``partition_odt`` given a binary handle on ``fake.odt`` returns its single title element."""
    expected = [Title("Lorem ipsum dolor sit amet.")]
    odt_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake.odt")
    with open(odt_path, "rb") as odt_file:
        parsed = partition_odt(file=odt_file)
    assert parsed == expected

View File

@ -1 +1 @@
__version__ = "0.6.3-dev2" # pragma: no cover
__version__ = "0.6.3-dev3" # pragma: no cover

View File

@ -25,6 +25,10 @@ DOC_MIME_TYPES = [
"application/msword",
]
# MIME types that detect_filetype maps to FileType.ODT (OpenDocument text, ``.odt``).
ODT_MIME_TYPES = [
    "application/vnd.oasis.opendocument.text",
]
XLSX_MIME_TYPES = [
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
]
@ -114,6 +118,9 @@ class FileType(Enum):
# Compressed Types
ZIP = 60
# Open Office Types
ODT = 70
# NOTE(robinson) - This is to support sorting for pandas groupby functions
def __lt__(self, other):
return self.name < other.name
@ -135,6 +142,7 @@ STR_TO_FILETYPE = {
"application/vnd.openxmlformats-officedocument.presentationml.presentation": FileType.PPTX,
"application/vnd.ms-powerpoint": FileType.PPT,
"application/xml": FileType.XML,
"application/vnd.oasis.opendocument.text": FileType.ODT,
}
@ -160,6 +168,7 @@ EXT_TO_FILETYPE = {
".json": FileType.JSON,
".epub": FileType.EPUB,
".msg": FileType.MSG,
".odt": FileType.ODT,
None: FileType.UNK,
}
@ -221,6 +230,9 @@ def detect_filetype(
elif mime_type in DOC_MIME_TYPES:
return FileType.DOC
elif mime_type in ODT_MIME_TYPES:
return FileType.ODT
elif mime_type in MSG_MIME_TYPES:
return FileType.MSG

View File

@ -15,6 +15,7 @@ from unstructured.partition.image import partition_image
from unstructured.partition.json import partition_json
from unstructured.partition.md import partition_md
from unstructured.partition.msg import partition_msg
from unstructured.partition.odt import partition_odt
from unstructured.partition.pdf import partition_pdf
from unstructured.partition.ppt import partition_ppt
from unstructured.partition.pptx import partition_pptx
@ -106,6 +107,8 @@ def partition(
elements = partition_doc(filename=filename, file=file)
elif filetype == FileType.DOCX:
elements = partition_docx(filename=filename, file=file)
elif filetype == FileType.ODT:
elements = partition_odt(filename=filename, file=file)
elif filetype == FileType.EML:
elements = partition_email(filename=filename, file=file, encoding=encoding)
elif filetype == FileType.MSG:

View File

@ -1,6 +1,9 @@
import os
import tempfile
from typing import IO, List, Optional
import docx
import pypandoc
from unstructured.cleaners.core import clean_bullets
from unstructured.documents.elements import (
@ -132,3 +135,46 @@ def _text_to_element(text: str) -> Optional[Text]:
return Title(text)
else:
return Text(text)
def convert_and_partition_docx(
    source_format: str,
    filename: Optional[str] = None,
    file: Optional[IO] = None,
) -> List[Element]:
    """Converts a document to DOCX and then partitions it using partition_docx. Works with
    any file format supported by pandoc.

    Parameters
    ----------
    source_format
        The format of the source document, e.g. odt
    filename
        A string defining the target filename path.
    file
        A file-like object using "rb" mode --> open(filename, "rb").

    Returns
    -------
    The elements produced by partition_docx for the converted document.

    Raises
    ------
    ValueError
        If ``filename`` is given but does not exist on disk. ``exactly_one`` is also
        called to validate that only one of ``filename``/``file`` was supplied.
    """
    if filename is None:
        filename = ""
    exactly_one(filename=filename, file=file)

    # Path of the scratch copy written for file-like input; stays None for path input
    # so that a caller-owned file is never deleted.
    tmp_path: Optional[str] = None
    try:
        if len(filename) > 0:
            if not os.path.exists(filename):
                raise ValueError(f"The file {filename} does not exist.")
            source_path = filename
        elif file is not None:
            # pandoc works on paths, so spill the stream to disk. delete=False keeps the
            # file around after it is closed; it is removed in the ``finally`` below.
            with tempfile.NamedTemporaryFile(delete=False) as tmp:
                tmp_path = tmp.name
                tmp.write(file.read())
            source_path = tmp_path

        base_filename, _ = os.path.splitext(os.path.basename(os.path.abspath(source_path)))

        # The intermediate .docx only needs to live long enough to be partitioned.
        with tempfile.TemporaryDirectory() as tmpdir:
            docx_filename = os.path.join(tmpdir, f"{base_filename}.docx")
            pypandoc.convert_file(
                source_path,
                "docx",
                format=source_format,
                outputfile=docx_filename,
            )
            elements = partition_docx(filename=docx_filename, metadata_filename=source_path)
    finally:
        # Previously the scratch copy was leaked on every file-like call.
        if tmp_path is not None and os.path.exists(tmp_path):
            os.remove(tmp_path)

    return elements

View File

@ -0,0 +1,17 @@
from typing import IO, List, Optional
from unstructured.documents.elements import Element
from unstructured.partition.docx import convert_and_partition_docx
def partition_odt(filename: Optional[str] = None, file: Optional[IO] = None) -> List[Element]:
    """Splits an Open Office document in .odt format into its document elements by
    delegating to convert_and_partition_docx with "odt" as the source format.

    Parameters
    ----------
    filename
        A string defining the target filename path.
    file
        A file-like object using "rb" mode --> open(filename, "rb").
    """
    odt_elements = convert_and_partition_docx(source_format="odt", filename=filename, file=file)
    return odt_elements