mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-12-27 07:03:52 +00:00
feat: add partition_odt for open office docs (#548)
* added filetype detection for odt * add function for partition odt documents * add odt files to auto * changelog and version * docs and readme * update installation docs * skip tests if not supported or in docker * import pytest * fix docs typos
This commit is contained in:
parent
981805e435
commit
fae5f8fdde
@ -1,4 +1,4 @@
|
||||
## 0.6.3-dev2
|
||||
## 0.6.3-dev3
|
||||
|
||||
### Enhancements
|
||||
|
||||
@ -7,6 +7,7 @@
|
||||
* Added `partition_multiple_via_api` for partitioning multiple documents in a single REST
|
||||
API call.
|
||||
* Added `stage_for_baseplate` function to prepare outputs for ingestion into Baseplate.
|
||||
* Added `partition_odt` for processing Open Office documents.
|
||||
|
||||
### Fixes
|
||||
|
||||
|
||||
@ -181,7 +181,8 @@ you can also uninstall the hooks with `pre-commit uninstall`.
|
||||
You can run this [Colab notebook](https://colab.research.google.com/drive/1U8VCjY2-x8c6y5TYMbSFtQGlQVFHCVIW) to run the examples below.
|
||||
|
||||
The following examples show how to get started with the `unstructured` library.
|
||||
You can parse **TXT**, **HTML**, **PDF**, **EML**, **MSG**, **RTF**, **EPUB**, **DOC**, **DOCX**, **PPT**, **PPTX**, **JPG**,
|
||||
You can parse **TXT**, **HTML**, **PDF**, **EML**, **MSG**, **RTF**, **EPUB**, **DOC**, **DOCX**,
|
||||
**ODT**, **PPT**, **PPTX**, **JPG**,
|
||||
and **PNG** documents with one line of code!
|
||||
<br></br>
|
||||
See our [documentation page](https://unstructured-io.github.io/unstructured) for a full description
|
||||
|
||||
@ -83,7 +83,7 @@ If you call the ``partition`` function, ``unstructured`` will attempt to detect
|
||||
file type and route it to the appropriate partitioning brick. All partitioning bricks
|
||||
called within ``partition`` are called using the default kwargs. Use the document-type
|
||||
specific bricks if you need to apply non-default settings.
|
||||
``partition`` currently supports ``.docx``, ``.doc``, ``.pptx``, ``.ppt``, ``.eml``, ``.msg``, ``.rtf``, ``.epub``, ``.html``, ``.pdf``,
|
||||
``partition`` currently supports ``.docx``, ``.doc``, ``.odt``, ``.pptx``, ``.ppt``, ``.eml``, ``.msg``, ``.rtf``, ``.epub``, ``.html``, ``.pdf``,
|
||||
``.png``, ``.jpg``, and ``.txt`` files.
|
||||
If you set the ``include_page_breaks`` kwarg to ``True``, the output will include page breaks. This is only supported for ``.pptx``, ``.html``, ``.pdf``,
|
||||
``.png``, and ``.jpg``.
|
||||
@ -251,6 +251,22 @@ Examples:
|
||||
elements = partition_doc(filename="example-docs/fake.doc")
|
||||
|
||||
|
||||
``partition_odt``
|
||||
------------------
|
||||
|
||||
The ``partition_odt`` partitioning brick pre-processes Open Office documents
|
||||
saved in the ``.odt`` format. The function first converts the document
|
||||
to ``.docx`` using ``pandoc`` and then processes it using ``partition_docx``.
|
||||
|
||||
Examples:
|
||||
|
||||
.. code:: python
|
||||
|
||||
from unstructured.partition.odt import partition_odt
|
||||
|
||||
elements = partition_odt(filename="example-docs/fake.odt")
|
||||
|
||||
|
||||
``partition_pptx``
|
||||
---------------------
|
||||
|
||||
|
||||
@ -15,7 +15,7 @@ installation.
|
||||
* ``poppler-utils`` (images and PDFs)
|
||||
* ``tesseract-ocr`` (images and PDFs)
|
||||
* ``libreoffice`` (MS Office docs)
|
||||
* ``pandocs`` (EPUBs)
|
||||
* ``pandoc`` (EPUBs, RTFs and Open Office docs)
|
||||
|
||||
* If you are parsing PDFs, run the following to install the ``detectron2`` model, which ``unstructured`` uses for layout detection:
|
||||
* ``pip install "detectron2@git+https://github.com/facebookresearch/detectron2.git@e2ce8dc#egg=detectron2"``
|
||||
|
||||
BIN
example-docs/fake.odt
Normal file
BIN
example-docs/fake.odt
Normal file
Binary file not shown.
@ -32,6 +32,7 @@ EXAMPLE_DOCS_DIRECTORY = os.path.join(FILE_DIRECTORY, "..", "..", "example-docs"
|
||||
("fake-power-point.pptx", FileType.PPTX),
|
||||
("winter-sports.epub", FileType.EPUB),
|
||||
("spring-weather.html.json", FileType.JSON),
|
||||
("fake.odt", FileType.ODT),
|
||||
],
|
||||
)
|
||||
def test_detect_filetype_from_filename(file, expected):
|
||||
@ -55,6 +56,7 @@ def test_detect_filetype_from_filename(file, expected):
|
||||
("winter-sports.epub", FileType.EPUB),
|
||||
("fake-doc.rtf", FileType.RTF),
|
||||
("spring-weather.html.json", FileType.JSON),
|
||||
("fake.odt", FileType.ODT),
|
||||
],
|
||||
)
|
||||
def test_detect_filetype_from_filename_with_extension(monkeypatch, file, expected):
|
||||
|
||||
@ -33,6 +33,7 @@ EXPECTED_EMAIL_OUTPUT = [
|
||||
|
||||
is_in_docker = os.path.exists("/.dockerenv")
|
||||
rtf_not_supported = "rtf" not in pypandoc.get_pandoc_formats()[0]
|
||||
odt_not_supported = "odt" not in pypandoc.get_pandoc_formats()[0]
|
||||
|
||||
|
||||
def test_auto_partition_email_from_filename():
|
||||
@ -461,3 +462,21 @@ def test_auto_partition_works_with_unstructured_jsons_from_file():
|
||||
with open(filename, "rb") as f:
|
||||
elements = partition(file=f)
|
||||
assert elements[0].text == "News Around NOAA"
|
||||
|
||||
|
||||
@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
@pytest.mark.skipif(odt_not_supported, reason="odt not supported in this version of pypandoc.")
def test_auto_partition_odt_from_filename():
    """``partition`` should handle the sample .odt document when given its path."""
    odt_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake.odt")
    expected = [Title("Lorem ipsum dolor sit amet.")]
    assert partition(filename=odt_path) == expected
|
||||
|
||||
|
||||
@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
@pytest.mark.skipif(odt_not_supported, reason="odt not supported in this version of pypandoc.")
def test_auto_partition_odt_from_file():
    """``partition`` should handle the sample .odt document when given an open binary file."""
    odt_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake.odt")
    expected = [Title("Lorem ipsum dolor sit amet.")]

    with open(odt_path, "rb") as odt_file:
        elements = partition(file=odt_file)

    assert elements == expected
|
||||
|
||||
32
test_unstructured/partition/test_odt.py
Normal file
32
test_unstructured/partition/test_odt.py
Normal file
@ -0,0 +1,32 @@
|
||||
import os
|
||||
import pathlib
|
||||
|
||||
import pypandoc
|
||||
import pytest
|
||||
|
||||
from unstructured.documents.elements import Title
|
||||
from unstructured.partition.odt import partition_odt
|
||||
|
||||
DIRECTORY = pathlib.Path(__file__).parent.resolve()
|
||||
EXAMPLE_DOCS_DIRECTORY = os.path.join(DIRECTORY, "..", "..", "example-docs")
|
||||
|
||||
odt_not_supported = "odt" not in pypandoc.get_pandoc_formats()[0]
|
||||
is_in_docker = os.path.exists("/.dockerenv")
|
||||
|
||||
|
||||
@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
@pytest.mark.skipif(odt_not_supported, reason="odt not supported in this version of pypandoc.")
def test_partition_odt_from_filename():
    """``partition_odt`` should parse the sample document when given its path."""
    odt_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake.odt")
    expected = [Title("Lorem ipsum dolor sit amet.")]
    assert partition_odt(filename=odt_path) == expected
|
||||
|
||||
|
||||
@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
@pytest.mark.skipif(odt_not_supported, reason="odt not supported in this version of pypandoc.")
def test_partition_odt_from_file():
    """``partition_odt`` should parse the sample document when given an open binary file."""
    odt_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake.odt")
    expected = [Title("Lorem ipsum dolor sit amet.")]

    with open(odt_path, "rb") as odt_file:
        elements = partition_odt(file=odt_file)

    assert elements == expected
|
||||
@ -1 +1 @@
|
||||
__version__ = "0.6.3-dev2" # pragma: no cover
|
||||
__version__ = "0.6.3-dev3" # pragma: no cover
|
||||
|
||||
@ -25,6 +25,10 @@ DOC_MIME_TYPES = [
|
||||
"application/msword",
|
||||
]
|
||||
|
||||
ODT_MIME_TYPES = [
|
||||
"application/vnd.oasis.opendocument.text",
|
||||
]
|
||||
|
||||
XLSX_MIME_TYPES = [
|
||||
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
|
||||
]
|
||||
@ -114,6 +118,9 @@ class FileType(Enum):
|
||||
# Compressed Types
|
||||
ZIP = 60
|
||||
|
||||
# Open Office Types
|
||||
ODT = 70
|
||||
|
||||
# NOTE(robinson) - This is to support sorting for pandas groupby functions
|
||||
def __lt__(self, other):
|
||||
return self.name < other.name
|
||||
@ -135,6 +142,7 @@ STR_TO_FILETYPE = {
|
||||
"application/vnd.openxmlformats-officedocument.presentationml.presentation": FileType.PPTX,
|
||||
"application/vnd.ms-powerpoint": FileType.PPT,
|
||||
"application/xml": FileType.XML,
|
||||
"application/vnd.oasis.opendocument.text": FileType.ODT,
|
||||
}
|
||||
|
||||
|
||||
@ -160,6 +168,7 @@ EXT_TO_FILETYPE = {
|
||||
".json": FileType.JSON,
|
||||
".epub": FileType.EPUB,
|
||||
".msg": FileType.MSG,
|
||||
".odt": FileType.ODT,
|
||||
None: FileType.UNK,
|
||||
}
|
||||
|
||||
@ -221,6 +230,9 @@ def detect_filetype(
|
||||
elif mime_type in DOC_MIME_TYPES:
|
||||
return FileType.DOC
|
||||
|
||||
elif mime_type in ODT_MIME_TYPES:
|
||||
return FileType.ODT
|
||||
|
||||
elif mime_type in MSG_MIME_TYPES:
|
||||
return FileType.MSG
|
||||
|
||||
|
||||
@ -15,6 +15,7 @@ from unstructured.partition.image import partition_image
|
||||
from unstructured.partition.json import partition_json
|
||||
from unstructured.partition.md import partition_md
|
||||
from unstructured.partition.msg import partition_msg
|
||||
from unstructured.partition.odt import partition_odt
|
||||
from unstructured.partition.pdf import partition_pdf
|
||||
from unstructured.partition.ppt import partition_ppt
|
||||
from unstructured.partition.pptx import partition_pptx
|
||||
@ -106,6 +107,8 @@ def partition(
|
||||
elements = partition_doc(filename=filename, file=file)
|
||||
elif filetype == FileType.DOCX:
|
||||
elements = partition_docx(filename=filename, file=file)
|
||||
elif filetype == FileType.ODT:
|
||||
elements = partition_odt(filename=filename, file=file)
|
||||
elif filetype == FileType.EML:
|
||||
elements = partition_email(filename=filename, file=file, encoding=encoding)
|
||||
elif filetype == FileType.MSG:
|
||||
|
||||
@ -1,6 +1,9 @@
|
||||
import os
|
||||
import tempfile
|
||||
from typing import IO, List, Optional
|
||||
|
||||
import docx
|
||||
import pypandoc
|
||||
|
||||
from unstructured.cleaners.core import clean_bullets
|
||||
from unstructured.documents.elements import (
|
||||
@ -132,3 +135,46 @@ def _text_to_element(text: str) -> Optional[Text]:
|
||||
return Title(text)
|
||||
else:
|
||||
return Text(text)
|
||||
|
||||
|
||||
def convert_and_partition_docx(
    source_format: str,
    filename: Optional[str] = None,
    file: Optional[IO] = None,
) -> List[Element]:
    """Converts a document to DOCX and then partitions it using partition_docx. Works with
    any file format supported by pandoc.

    Parameters
    ----------
    source_format
        The format of the source document, e.g. odt
    filename
        A string defining the target filename path.
    file
        A file-like object using "rb" mode --> open(filename, "rb").

    Returns
    -------
    The elements produced by partition_docx for the converted document.

    Raises
    ------
    ValueError
        If a filename is given but no file exists at that path.
    """
    if filename is None:
        filename = ""
    # NOTE(review): exactly_one is defined elsewhere; presumably it raises unless exactly
    # one of filename/file was supplied -- confirm against its definition.
    exactly_one(filename=filename, file=file)

    # Path of the scratch copy made for file-like input; stays None when the caller
    # supplied a real path, so the caller's file is never deleted.
    tmp_input_path: Optional[str] = None

    if len(filename) > 0:
        if not os.path.exists(filename):
            raise ValueError(f"The file {filename} does not exist.")
    elif file is not None:
        # pypandoc.convert_file works on a path, so spool the stream to disk first.
        # delete=False keeps the file after the handle is closed; it is removed
        # explicitly in the finally block below.
        with tempfile.NamedTemporaryFile(delete=False) as tmp:
            tmp.write(file.read())
        tmp_input_path = tmp.name
        filename = tmp_input_path

    _, filename_no_path = os.path.split(os.path.abspath(filename))
    base_filename, _ = os.path.splitext(filename_no_path)

    try:
        # The converted .docx only needs to exist while partition_docx reads it.
        with tempfile.TemporaryDirectory() as tmpdir:
            docx_filename = os.path.join(tmpdir, f"{base_filename}.docx")
            pypandoc.convert_file(filename, "docx", format=source_format, outputfile=docx_filename)
            elements = partition_docx(filename=docx_filename, metadata_filename=filename)
    finally:
        # Previously the scratch copy was never removed and leaked on every call
        # that passed a file-like object.
        if tmp_input_path is not None:
            os.unlink(tmp_input_path)

    return elements
|
||||
|
||||
17
unstructured/partition/odt.py
Normal file
17
unstructured/partition/odt.py
Normal file
@ -0,0 +1,17 @@
|
||||
from typing import IO, List, Optional
|
||||
|
||||
from unstructured.documents.elements import Element
|
||||
from unstructured.partition.docx import convert_and_partition_docx
|
||||
|
||||
|
||||
def partition_odt(filename: Optional[str] = None, file: Optional[IO] = None) -> List[Element]:
    """Splits an Open Office document in .odt format into its document elements.

    The work is delegated to convert_and_partition_docx with "odt" as the source format.

    Parameters
    ----------
    filename
        A string defining the target filename path.
    file
        A file-like object using "rb" mode --> open(filename, "rb").
    """
    elements = convert_and_partition_docx(
        source_format="odt",
        filename=filename,
        file=file,
    )
    return elements
|
||||
Loading…
x
Reference in New Issue
Block a user