feat: add partition_doc for .doc files (#236)

* first pass on doc partitioning

* add libreoffice to deps

* update docs and readme

* add .doc to auto

* changelog bump

* value error with missing doc

* doc updates
This commit is contained in:
Matt Robinson 2023-02-17 09:30:23 -05:00 committed by GitHub
parent 9bbd4a1d56
commit 6036af33e7
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
13 changed files with 238 additions and 8 deletions

View File

@ -98,7 +98,7 @@ jobs:
source .venv/bin/activate
make install-nltk-models
make install-detectron2
sudo apt-get install -y libmagic-dev poppler-utils tesseract-ocr
sudo apt-get install -y libmagic-dev poppler-utils tesseract-ocr libreoffice
make test
make check-coverage
make install-ingest-s3

View File

@ -1,3 +1,7 @@
## 0.4.11-dev0
* Adds `partition_doc` for partitioning Word documents in `.doc` format. Requires `libreoffice`.
## 0.4.10
* Fixes `ElementMetadata` so that it's JSON serializable when the filename is a `Path` object.

View File

@ -78,7 +78,7 @@ To install the library, run `pip install unstructured`.
You can run this [Colab notebook](https://colab.research.google.com/drive/1U8VCjY2-x8c6y5TYMbSFtQGlQVFHCVIW) to run the examples below.
The following examples show how to get started with the `unstructured` library.
You can parse **TXT**, **HTML**, **PDF**, **EML** and **DOCX** documents with one line of code!
You can parse **TXT**, **HTML**, **PDF**, **EML**, **DOC**, and **DOCX** documents with one line of code!
<br></br>
See our [documentation page](https://unstructured-io.github.io/unstructured) for a full description
of the features in the library.
@ -92,7 +92,7 @@ If you are using the `partition` brick, you may need to install additional param
instructions outlined [here](https://unstructured-io.github.io/unstructured/installing.html#filetype-detection)
`partition` will always apply the default arguments. If you need
advanced features, use a document-specific brick. The `partition` brick currently works for
`.txt`, `.docx`, `.pptx`, `.jpg`, `.png`, `.eml`, `.html`, and `.pdf` documents.
`.txt`, `.doc`, `.docx`, `.pptx`, `.jpg`, `.png`, `.eml`, `.html`, and `.pdf` documents.
```python
from unstructured.partition.auto import partition

View File

@ -22,7 +22,7 @@ If you call the ``partition`` function, ``unstructured`` will attempt to detect
file type and route it to the appropriate partitioning brick. All partitioning bricks
called within ``partition`` are called using the default kwargs. Use the document-type
specific bricks if you need to apply non-default settings.
``partition`` currently supports ``.docx``, ``.pptx``, ``.eml``, ``.html``, ``.pdf``,
``partition`` currently supports ``.docx``, ``.doc``, ``.pptx``, ``.eml``, ``.html``, ``.pdf``,
``.png``, ``.jpg``, and ``.txt`` files.
If you set the ``include_page_breaks`` kwarg to ``True``, the output will include page breaks. This is only supported for ``.pptx``, ``.html``, ``.pdf``,
``.png``, and ``.jpg``.
@ -81,6 +81,28 @@ Examples:
with open("mydoc.docx", "rb") as f:
elements = partition_docx(file=f)
``partition_doc``
------------------
The ``partition_doc`` partitioning brick pre-processes Microsoft Word documents
saved in the ``.doc`` format. This partitioning brick uses a combination of the styling
information in the document and the structure of the text to determine the type
of a text element. The ``partition_doc`` can take a filename or file-like object
as input, as shown in the two examples below. ``partition_doc``
uses ``libreoffice`` to convert the file to ``.docx`` and then
calls ``partition_docx``. Ensure you have ``libreoffice`` installed
before using ``partition_doc``.
Examples:
.. code:: python
from unstructured.partition.doc import partition_doc
elements = partition_doc(filename="example-docs/fake.doc")
``partition_pptx``
---------------------

BIN
example-docs/fake.doc Normal file

Binary file not shown.

View File

@ -8,6 +8,7 @@ import docx
from unstructured.documents.elements import Address, NarrativeText, PageBreak, Title, Text, ListItem
from unstructured.partition.auto import partition
import unstructured.partition.auto as auto
from unstructured.partition.common import convert_office_doc
DIRECTORY = pathlib.Path(__file__).parent.resolve()
EXAMPLE_DOCS_DIRECTORY = os.path.join(DIRECTORY, "..", "..", "example-docs")
@ -96,6 +97,30 @@ def test_auto_partition_docx_with_file(mock_docx_document, expected_docx_element
assert elements == expected_docx_elements
def test_auto_partition_doc_with_filename(mock_docx_document, expected_docx_elements, tmpdir):
    """``partition`` called with a ``.doc`` filename returns the expected elements."""
    # NOTE: write into the per-test directory itself. ``tmpdir.dirname`` is its parent,
    # which is shared between tests, so a stale ``mock_document.doc`` left by another
    # test could hide a failed conversion.
    workdir = str(tmpdir)
    docx_filename = os.path.join(workdir, "mock_document.docx")
    doc_filename = os.path.join(workdir, "mock_document.doc")

    # Build the .doc input by converting the generated .docx with libreoffice.
    mock_docx_document.save(docx_filename)
    convert_office_doc(docx_filename, workdir, "doc")

    elements = partition(filename=doc_filename)
    assert elements == expected_docx_elements
# NOTE(robinson) - the application/x-ole-storage mime type is not specific enough to
# determine that the file is a .doc document
@pytest.mark.xfail(
    reason="application/x-ole-storage mime type is not specific enough to detect .doc",
)
def test_auto_partition_doc_with_file(mock_docx_document, expected_docx_elements, tmpdir):
    """``partition`` called with an open ``.doc`` file object (currently expected to fail)."""
    # NOTE: write into the per-test directory itself. ``tmpdir.dirname`` is its parent,
    # which is shared between tests, so a stale ``mock_document.doc`` left by another
    # test could hide a failed conversion.
    workdir = str(tmpdir)
    docx_filename = os.path.join(workdir, "mock_document.docx")
    doc_filename = os.path.join(workdir, "mock_document.doc")

    # Build the .doc input by converting the generated .docx with libreoffice.
    mock_docx_document.save(docx_filename)
    convert_office_doc(docx_filename, workdir, "doc")

    with open(doc_filename, "rb") as f:
        elements = partition(file=f)
    assert elements == expected_docx_elements
def test_auto_partition_html_from_filename():
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "example-10k.html")
elements = partition(filename=filename)

View File

@ -0,0 +1,103 @@
import os
import pytest
import docx
from unstructured.documents.elements import Address, ListItem, NarrativeText, Title, Text
from unstructured.partition.common import convert_office_doc
from unstructured.partition.doc import partition_doc
from unstructured.partition.docx import partition_docx
@pytest.fixture
def mock_document():
    """Returns an in-memory Word document with one paragraph per case under test."""
    # (text, style) pairs in document order. ``None`` is ``add_paragraph``'s default
    # value for ``style``.
    paragraphs = [
        ("These are a few of my favorite things:", "Heading 1"),
        # NOTE(robinson) - this should get picked up as a list item due to the •
        ("• Parrots", "Normal"),
        # NOTE(robinson) - this should get dropped because it's empty
        ("", "Normal"),
        ("Hockey", "List Bullet"),
        # NOTE(robinson) - this should get dropped because it's empty
        ("", "List Bullet"),
        # NOTE(robinson) - this should get picked up as a title
        ("Analysis", "Normal"),
        # NOTE(robinson) - this should get dropped because it is empty
        ("", "Normal"),
        # NOTE(robinson) - this should get picked up as a narrative text
        ("This is my first thought. This is my second thought.", "Normal"),
        ("This is my third thought.", "Body Text"),
        # NOTE(robinson) - this should just be regular text
        ("2023", None),
        # NOTE(robinson) - this should be an address
        ("DOYLESTOWN, PA 18901", None),
    ]

    doc = docx.Document()
    for text, style in paragraphs:
        doc.add_paragraph(text, style=style)
    return doc
@pytest.fixture
def expected_elements():
    """Returns the elements the tests expect from partitioning ``mock_document``."""
    title = Title("These are a few of my favorite things:")
    list_items = [ListItem("Parrots"), ListItem("Hockey")]
    narrative = [
        NarrativeText("This is my first thought. This is my second thought."),
        NarrativeText("This is my third thought."),
    ]
    # Order matters: the tests compare this list directly against the partition output.
    return [
        title,
        *list_items,
        Title("Analysis"),
        *narrative,
        Text("2023"),
        Address("DOYLESTOWN, PA 18901"),
    ]
def test_partition_doc_with_filename(mock_document, expected_elements, tmpdir):
    """``partition_doc`` called with a filename returns the expected elements."""
    # NOTE: write into the per-test directory itself. ``tmpdir.dirname`` is its parent,
    # which is shared between tests, so a stale ``mock_document.doc`` left by another
    # test could hide a failed conversion.
    workdir = str(tmpdir)
    docx_filename = os.path.join(workdir, "mock_document.docx")
    doc_filename = os.path.join(workdir, "mock_document.doc")

    # Build the .doc input by converting the generated .docx with libreoffice.
    mock_document.save(docx_filename)
    convert_office_doc(docx_filename, workdir, "doc")

    elements = partition_doc(filename=doc_filename)
    assert elements == expected_elements
def test_partition_doc_matches_partition_docx(mock_document, expected_elements, tmpdir):
    """``partition_doc`` and ``partition_docx`` agree on the same document content."""
    # NOTE: write into the per-test directory itself. ``tmpdir.dirname`` is its parent,
    # which is shared between tests, so a stale ``mock_document.doc`` left by another
    # test could hide a failed conversion.
    workdir = str(tmpdir)
    docx_filename = os.path.join(workdir, "mock_document.docx")
    doc_filename = os.path.join(workdir, "mock_document.doc")

    # Build the .doc input by converting the generated .docx with libreoffice.
    mock_document.save(docx_filename)
    convert_office_doc(docx_filename, workdir, "doc")

    # The comparison must be asserted; a bare ``==`` expression is evaluated and
    # discarded, so the test could never fail.
    assert partition_doc(filename=doc_filename) == partition_docx(filename=docx_filename)
def test_partition_raises_with_missing_doc(mock_document, expected_elements, tmpdir):
    """``partition_doc`` raises ``ValueError`` for a path where no file was created."""
    # Nothing is ever written to this path, so the file does not exist.
    missing_path = os.path.join(tmpdir.dirname, "asdf.doc")

    with pytest.raises(ValueError):
        partition_doc(filename=missing_path)
def test_partition_doc_with_file(mock_document, expected_elements, tmpdir):
    """``partition_doc`` called with an open file object returns the expected elements."""
    # NOTE: write into the per-test directory itself. ``tmpdir.dirname`` is its parent,
    # which is shared between tests, so a stale ``mock_document.doc`` left by another
    # test could hide a failed conversion.
    workdir = str(tmpdir)
    docx_filename = os.path.join(workdir, "mock_document.docx")
    doc_filename = os.path.join(workdir, "mock_document.doc")

    # Build the .doc input by converting the generated .docx with libreoffice.
    mock_document.save(docx_filename)
    convert_office_doc(docx_filename, workdir, "doc")

    with open(doc_filename, "rb") as f:
        elements = partition_doc(file=f)
    assert elements == expected_elements
def test_partition_doc_raises_with_both_specified(mock_document, tmpdir):
    """``partition_doc`` raises ``ValueError`` when given both a filename and a file."""
    # NOTE: write into the per-test directory itself. ``tmpdir.dirname`` is its parent,
    # which is shared between tests, so files from other tests could be picked up.
    workdir = str(tmpdir)
    docx_filename = os.path.join(workdir, "mock_document.docx")
    doc_filename = os.path.join(workdir, "mock_document.doc")

    # Build the .doc input by converting the generated .docx with libreoffice.
    mock_document.save(docx_filename)
    convert_office_doc(docx_filename, workdir, "doc")

    with open(doc_filename, "rb") as f:
        with pytest.raises(ValueError):
            partition_doc(filename=doc_filename, file=f)
def test_partition_doc_raises_with_neither():
    """``partition_doc`` raises ``ValueError`` when called with no arguments."""
    expect_value_error = pytest.raises(ValueError)
    with expect_value_error:
        partition_doc()

View File

@ -1 +1 @@
__version__ = "0.4.10" # pragma: no cover
__version__ = "0.4.11-dev0" # pragma: no cover

View File

@ -1,6 +1,7 @@
from typing import IO, Optional
from unstructured.file_utils.filetype import detect_filetype, FileType
from unstructured.partition.doc import partition_doc
from unstructured.partition.docx import partition_docx
from unstructured.partition.email import partition_email
from unstructured.partition.html import partition_html
@ -34,6 +35,8 @@ def partition(
if file is not None:
file.seek(0)
if filetype == FileType.DOC:
return partition_doc(filename=filename, file=file)
if filetype == FileType.DOCX:
return partition_docx(filename=filename, file=file)
elif filetype == FileType.EML:

View File

@ -1,3 +1,4 @@
import subprocess
from typing import List, Optional, Union
from unstructured.documents.elements import (
@ -101,3 +102,32 @@ def add_element_metadata(
element.metadata = metadata
elements.append(element)
return elements
def convert_office_doc(input_filename: str, output_directory: str, target_format: str):
    """Converts an office document to ``target_format`` using the libreoffice CLI.

    Runs ``soffice --headless --convert-to <target_format> --outdir <output_directory>
    <input_filename>`` and waits for the command to finish. The conversion is not
    limited to .doc -> .docx; any format accepted by ``--convert-to`` can be requested.

    Parameters
    ----------
    input_filename
        Path of the document to convert.
    output_directory
        Directory passed to ``--outdir``.
    target_format
        Format passed to ``--convert-to``, for example "docx" or "doc".

    Raises
    ------
    FileNotFoundError
        If the ``soffice`` command is not available on the system.
    """
    # Imported locally so this function does not depend on a module-level logger.
    import logging

    # NOTE(robinson) - In the future can also include win32com client as a fallback for windows
    # users who do not have LibreOffice installed
    # ref: https://stackoverflow.com/questions/38468442/
    #       multiple-doc-to-docx-file-conversion-using-python
    command = [
        "soffice",
        "--headless",
        "--convert-to",
        target_format,
        "--outdir",
        output_directory,
        input_filename,
    ]
    try:
        return_code = subprocess.call(command)
    except FileNotFoundError as error:
        raise FileNotFoundError(
            """soffice command was not found. Please install libreoffice
on your system and try again.
- Install instructions: https://www.libreoffice.org/get-help/install-howto/
- Mac: https://formulae.brew.sh/cask/libreoffice
- Debian: https://wiki.debian.org/LibreOffice"""
        ) from error

    # A non-zero exit status used to be dropped silently. Report it, but do not raise,
    # so existing callers keep their current behavior.
    if return_code != 0:
        logging.getLogger(__name__).warning(
            "soffice exited with status %s while converting %s to %s",
            return_code,
            input_filename,
            target_format,
        )

View File

View File

@ -0,0 +1,45 @@
import os
import tempfile
from typing import IO, List, Optional
from unstructured.documents.elements import Element
from unstructured.partition.common import convert_office_doc
from unstructured.partition.docx import partition_docx
def partition_doc(filename: Optional[str] = None, file: Optional[IO] = None) -> List[Element]:
    """Partitions Microsoft Word Documents in .doc format into its document elements.

    The document is converted to .docx with ``convert_office_doc`` inside a temporary
    directory, and the converted file is passed to ``partition_docx``.

    Parameters
    ----------
    filename
        A string defining the target filename path.
    file
        A file-like object using "rb" mode --> open(filename, "rb").

    Raises
    ------
    ValueError
        If neither or both of ``filename`` and ``file`` are specified, or if
        ``filename`` does not exist.
    """
    if not any([filename, file]):
        raise ValueError("One of filename or file must be specified.")

    # The temporary directory holds the converted .docx and, for file-like input, the
    # copy of the input as well, so everything is removed when the block exits.
    with tempfile.TemporaryDirectory() as tmpdir:
        if filename is not None and not file:
            if not os.path.exists(filename):
                raise ValueError(f"The file {filename} does not exist.")
            source_filename = filename
        elif file is not None and not filename:
            # soffice needs a path on disk, so copy the file object's contents out.
            # delete=False keeps the copy after it is closed; it lives in tmpdir, so it
            # is still cleaned up rather than leaked.
            with tempfile.NamedTemporaryFile(dir=tmpdir, delete=False) as tmp:
                tmp.write(file.read())
            source_filename = tmp.name
        else:
            raise ValueError("Only one of filename or file can be specified.")

        # soffice names its output after the input's base name plus the new extension.
        filename_no_path = os.path.basename(os.path.abspath(source_filename))
        base_filename, _ = os.path.splitext(filename_no_path)

        convert_office_doc(source_filename, tmpdir, target_format="docx")
        docx_filename = os.path.join(tmpdir, f"{base_filename}.docx")
        elements = partition_docx(filename=docx_filename)

    return elements

View File

@ -56,9 +56,7 @@ STYLE_TO_ELEMENT_MAPPING = {
}
def partition_docx(
filename: Optional[str] = None, file: Optional[IO] = None, **kwargs
) -> List[Element]:
def partition_docx(filename: Optional[str] = None, file: Optional[IO] = None) -> List[Element]:
"""Partitions Microsoft Word Documents in .docx format into its document elements.
Parameters