mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-06-27 02:30:08 +00:00
feat: add partition_doc
for .doc
files (#236)
* first pass on doc partitioning * add libreoffice to deps * update docs and readme * add .doc to auto * changelog bump * value error with missing doc * doc updates
This commit is contained in:
parent
9bbd4a1d56
commit
6036af33e7
2
.github/workflows/ci.yml
vendored
2
.github/workflows/ci.yml
vendored
@ -98,7 +98,7 @@ jobs:
|
||||
source .venv/bin/activate
|
||||
make install-nltk-models
|
||||
make install-detectron2
|
||||
sudo apt-get install -y libmagic-dev poppler-utils tesseract-ocr
|
||||
sudo apt-get install -y libmagic-dev poppler-utils tesseract-ocr libreoffice
|
||||
make test
|
||||
make check-coverage
|
||||
make install-ingest-s3
|
||||
|
@ -1,3 +1,7 @@
|
||||
## 0.4.11-dev0
|
||||
|
||||
* Adds `partition_doc` for partitioning Word documents in `.doc` format. Requires `libreoffice`.
|
||||
|
||||
## 0.4.10
|
||||
|
||||
* Fixes `ElementMetadata` so that it's JSON serializable when the filename is a `Path` object.
|
||||
|
@ -78,7 +78,7 @@ To install the library, run `pip install unstructured`.
|
||||
You can run this [Colab notebook](https://colab.research.google.com/drive/1U8VCjY2-x8c6y5TYMbSFtQGlQVFHCVIW) to run the examples below.
|
||||
|
||||
The following examples show how to get started with the `unstructured` library.
|
||||
You can parse **TXT**, **HTML**, **PDF**, **EML** and **DOCX** documents with one line of code!
|
||||
You can parse **TXT**, **HTML**, **PDF**, **EML**, **DOC** and **DOCX** documents with one line of code!
|
||||
<br></br>
|
||||
See our [documentation page](https://unstructured-io.github.io/unstructured) for a full description
|
||||
of the features in the library.
|
||||
@ -92,7 +92,7 @@ If you are using the `partition` brick, you may need to install additional param
|
||||
instructions outlined [here](https://unstructured-io.github.io/unstructured/installing.html#filetype-detection)
|
||||
`partition` will always apply the default arguments. If you need
|
||||
advanced features, use a document-specific brick. The `partition` brick currently works for
|
||||
`.txt`, `.docx`, `.pptx`, `.jpg`, `.png`, `.eml`, `.html`, and `.pdf` documents.
|
||||
`.txt`, `.doc`, `.docx`, `.pptx`, `.jpg`, `.png`, `.eml`, `.html`, and `.pdf` documents.
|
||||
|
||||
```python
|
||||
from unstructured.partition.auto import partition
|
||||
|
@ -22,7 +22,7 @@ If you call the ``partition`` function, ``unstructured`` will attempt to detect
|
||||
file type and route it to the appropriate partitioning brick. All partitioning bricks
|
||||
called within ``partition`` are called using the default kwargs. Use the document-type
|
||||
specific bricks if you need to apply non-default settings.
|
||||
``partition`` currently supports ``.docx``, ``.pptx``, ``.eml``, ``.html``, ``.pdf``,
|
||||
``partition`` currently supports ``.docx``, ``.doc``, ``.pptx``, ``.eml``, ``.html``, ``.pdf``,
|
||||
``.png``, ``.jpg``, and ``.txt`` files.
|
||||
If you set the ``include_page_breaks`` kwarg to ``True``, the output will include page breaks. This is only supported for ``.pptx``, ``.html``, ``.pdf``,
|
||||
``.png``, and ``.jpg``.
|
||||
@ -81,6 +81,28 @@ Examples:
|
||||
with open("mydoc.docx", "rb") as f:
|
||||
elements = partition_docx(file=f)
|
||||
|
||||
|
||||
``partition_doc``
|
||||
------------------
|
||||
|
||||
The ``partition_doc`` partitioning brick pre-processes Microsoft Word documents
|
||||
saved in the ``.doc`` format. This partitioning brick uses a combination of the styling
|
||||
information in the document and the structure of the text to determine the type
|
||||
of a text element. The ``partition_doc`` can take a filename or file-like object
|
||||
as input, as shown in the two examples below. ``partition_doc``
|
||||
uses ``libreoffice`` to convert the file to ``.docx`` and then
|
||||
calls ``partition_docx``. Ensure you have ``libreoffice`` installed
|
||||
before using ``partition_doc``.
|
||||
|
||||
Examples:
|
||||
|
||||
.. code:: python
|
||||
|
||||
from unstructured.partition.doc import partition_doc
|
||||
|
||||
elements = partition_doc(filename="example-docs/fake.doc")
|
||||
|
||||
|
||||
``partition_pptx``
|
||||
---------------------
|
||||
|
||||
|
BIN
example-docs/fake.doc
Normal file
BIN
example-docs/fake.doc
Normal file
Binary file not shown.
@ -8,6 +8,7 @@ import docx
|
||||
from unstructured.documents.elements import Address, NarrativeText, PageBreak, Title, Text, ListItem
|
||||
from unstructured.partition.auto import partition
|
||||
import unstructured.partition.auto as auto
|
||||
from unstructured.partition.common import convert_office_doc
|
||||
|
||||
DIRECTORY = pathlib.Path(__file__).parent.resolve()
|
||||
EXAMPLE_DOCS_DIRECTORY = os.path.join(DIRECTORY, "..", "..", "example-docs")
|
||||
@ -96,6 +97,30 @@ def test_auto_partition_docx_with_file(mock_docx_document, expected_docx_element
|
||||
assert elements == expected_docx_elements
|
||||
|
||||
|
||||
def test_auto_partition_doc_with_filename(mock_docx_document, expected_docx_elements, tmpdir):
    """Check that ``partition`` handles a ``.doc`` file passed by filename."""
    workdir = tmpdir.dirname
    source_docx = os.path.join(workdir, "mock_document.docx")
    mock_docx_document.save(source_docx)
    # Produce the .doc input by converting the saved .docx fixture.
    convert_office_doc(source_docx, workdir, "doc")

    converted_doc = os.path.join(workdir, "mock_document.doc")
    result = partition(filename=converted_doc)
    assert result == expected_docx_elements
|
||||
|
||||
|
||||
# NOTE(robinson) - the application/x-ole-storage mime type is not specific enough to
# determine that the file is a .doc document
@pytest.mark.xfail
def test_auto_partition_doc_with_file(mock_docx_document, expected_docx_elements, tmpdir):
    """Check ``partition`` on a ``.doc`` passed as a file object (currently expected to fail)."""
    workdir = tmpdir.dirname
    source_docx = os.path.join(workdir, "mock_document.docx")
    mock_docx_document.save(source_docx)
    # Produce the .doc input by converting the saved .docx fixture.
    convert_office_doc(source_docx, workdir, "doc")

    converted_doc = os.path.join(workdir, "mock_document.doc")
    with open(converted_doc, "rb") as doc_file:
        result = partition(file=doc_file)
    assert result == expected_docx_elements
|
||||
|
||||
|
||||
def test_auto_partition_html_from_filename():
|
||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "example-10k.html")
|
||||
elements = partition(filename=filename)
|
||||
|
103
test_unstructured/partition/test_doc.py
Normal file
103
test_unstructured/partition/test_doc.py
Normal file
@ -0,0 +1,103 @@
|
||||
import os
|
||||
import pytest
|
||||
|
||||
import docx
|
||||
|
||||
from unstructured.documents.elements import Address, ListItem, NarrativeText, Title, Text
|
||||
from unstructured.partition.common import convert_office_doc
|
||||
from unstructured.partition.doc import partition_doc
|
||||
from unstructured.partition.docx import partition_docx
|
||||
|
||||
|
||||
@pytest.fixture
def mock_document():
    """Build a Word document containing one paragraph per case the tests check."""
    # (text, style) pairs in document order; a style of None uses the default.
    paragraphs = [
        ("These are a few of my favorite things:", "Heading 1"),
        # NOTE(robinson) - this should get picked up as a list item due to the •
        ("• Parrots", "Normal"),
        # NOTE(robinson) - this should get dropped because it's empty
        ("• ", "Normal"),
        ("Hockey", "List Bullet"),
        # NOTE(robinson) - this should get dropped because it's empty
        ("", "List Bullet"),
        # NOTE(robinson) - this should get picked up as a title
        ("Analysis", "Normal"),
        # NOTE(robinson) - this should get dropped because it is empty
        ("", "Normal"),
        # NOTE(robinson) - this should get picked up as a narrative text
        ("This is my first thought. This is my second thought.", "Normal"),
        ("This is my third thought.", "Body Text"),
        # NOTE(robinson) - this should just be regular text
        ("2023", None),
        # NOTE(robinson) - this should be an address
        ("DOYLESTOWN, PA 18901", None),
    ]

    document = docx.Document()
    for text, style in paragraphs:
        document.add_paragraph(text, style=style)
    return document
|
||||
|
||||
|
||||
@pytest.fixture
def expected_elements():
    """Return the elements the tests expect from partitioning ``mock_document``, in order."""
    expected = [Title("These are a few of my favorite things:")]
    expected.extend(ListItem(text) for text in ("Parrots", "Hockey"))
    expected.append(Title("Analysis"))
    expected.extend(
        NarrativeText(text)
        for text in (
            "This is my first thought. This is my second thought.",
            "This is my third thought.",
        )
    )
    expected.append(Text("2023"))
    expected.append(Address("DOYLESTOWN, PA 18901"))
    return expected
|
||||
|
||||
|
||||
def test_partition_doc_with_filename(mock_document, expected_elements, tmpdir):
    """Check ``partition_doc`` on a ``.doc`` file passed by filename."""
    workdir = tmpdir.dirname
    source_docx = os.path.join(workdir, "mock_document.docx")
    mock_document.save(source_docx)
    # Produce the .doc input by converting the saved .docx fixture.
    convert_office_doc(source_docx, workdir, "doc")

    converted_doc = os.path.join(workdir, "mock_document.doc")
    result = partition_doc(filename=converted_doc)
    assert result == expected_elements
|
||||
|
||||
|
||||
def test_partition_doc_matches_partition_docx(mock_document, expected_elements, tmpdir):
    """Check that ``partition_doc`` and ``partition_docx`` agree on the same document.

    The fixture document is saved as ``.docx`` and converted to ``.doc``; both
    files are then partitioned and the results compared.
    """
    docx_filename = os.path.join(tmpdir.dirname, "mock_document.docx")
    doc_filename = os.path.join(tmpdir.dirname, "mock_document.doc")
    mock_document.save(docx_filename)
    convert_office_doc(docx_filename, tmpdir.dirname, "doc")

    # The comparison must be asserted; as a bare expression its result was
    # discarded and the test could never fail.
    assert partition_doc(filename=doc_filename) == partition_docx(filename=docx_filename)
|
||||
|
||||
|
||||
def test_partition_raises_with_missing_doc(mock_document, expected_elements, tmpdir):
    """Check that ``partition_doc`` raises ``ValueError`` for a path that was never created."""
    missing_path = os.path.join(tmpdir.dirname, "asdf.doc")
    with pytest.raises(ValueError):
        partition_doc(filename=missing_path)
|
||||
|
||||
|
||||
def test_partition_doc_with_file(mock_document, expected_elements, tmpdir):
    """Check ``partition_doc`` on a ``.doc`` passed as an open binary file object."""
    workdir = tmpdir.dirname
    source_docx = os.path.join(workdir, "mock_document.docx")
    mock_document.save(source_docx)
    # Produce the .doc input by converting the saved .docx fixture.
    convert_office_doc(source_docx, workdir, "doc")

    converted_doc = os.path.join(workdir, "mock_document.doc")
    with open(converted_doc, "rb") as doc_file:
        result = partition_doc(file=doc_file)
    assert result == expected_elements
|
||||
|
||||
|
||||
def test_partition_doc_raises_with_both_specified(mock_document, tmpdir):
    """Check that passing both ``filename`` and ``file`` raises ``ValueError``."""
    workdir = tmpdir.dirname
    source_docx = os.path.join(workdir, "mock_document.docx")
    mock_document.save(source_docx)
    # Produce the .doc input by converting the saved .docx fixture.
    convert_office_doc(source_docx, workdir, "doc")

    converted_doc = os.path.join(workdir, "mock_document.doc")
    with open(converted_doc, "rb") as doc_file, pytest.raises(ValueError):
        partition_doc(filename=converted_doc, file=doc_file)
|
||||
|
||||
|
||||
def test_partition_doc_raises_with_neither():
    """Check that calling ``partition_doc`` with no arguments raises ``ValueError``."""
    expect_value_error = pytest.raises(ValueError)
    with expect_value_error:
        partition_doc()
|
@ -1 +1 @@
|
||||
__version__ = "0.4.10" # pragma: no cover
|
||||
__version__ = "0.4.11-dev0" # pragma: no cover
|
||||
|
@ -1,6 +1,7 @@
|
||||
from typing import IO, Optional
|
||||
|
||||
from unstructured.file_utils.filetype import detect_filetype, FileType
|
||||
from unstructured.partition.doc import partition_doc
|
||||
from unstructured.partition.docx import partition_docx
|
||||
from unstructured.partition.email import partition_email
|
||||
from unstructured.partition.html import partition_html
|
||||
@ -34,6 +35,8 @@ def partition(
|
||||
if file is not None:
|
||||
file.seek(0)
|
||||
|
||||
if filetype == FileType.DOC:
|
||||
return partition_doc(filename=filename, file=file)
|
||||
if filetype == FileType.DOCX:
|
||||
return partition_docx(filename=filename, file=file)
|
||||
elif filetype == FileType.EML:
|
||||
|
@ -1,3 +1,4 @@
|
||||
import subprocess
|
||||
from typing import List, Optional, Union
|
||||
|
||||
from unstructured.documents.elements import (
|
||||
@ -101,3 +102,32 @@ def add_element_metadata(
|
||||
element.metadata = metadata
|
||||
elements.append(element)
|
||||
return elements
|
||||
|
||||
|
||||
def convert_office_doc(input_filename: str, output_directory: str, target_format: str) -> None:
    """Converts an office document to ``target_format`` using the libreoffice CLI.

    Runs ``soffice --headless --convert-to <target_format> --outdir
    <output_directory> <input_filename>``.

    Parameters
    ----------
    input_filename
        Path of the document to convert.
    output_directory
        Directory passed to ``soffice`` as ``--outdir``.
    target_format
        Format passed to ``soffice`` as ``--convert-to`` (for example "docx" or "doc").

    Raises
    ------
    FileNotFoundError
        If the ``soffice`` executable cannot be found.
    """
    # NOTE(robinson) - In the future can also include win32com client as a fallback for windows
    # users who do not have LibreOffice installed
    # ref: https://stackoverflow.com/questions/38468442/
    #       multiple-doc-to-docx-file-conversion-using-python
    command = [
        "soffice",
        "--headless",
        "--convert-to",
        target_format,
        "--outdir",
        output_directory,
        input_filename,
    ]
    try:
        # NOTE: the exit status of soffice is not inspected here.
        subprocess.call(command)
    except FileNotFoundError as err:
        # Re-raise with install guidance, keeping the original error as the cause.
        raise FileNotFoundError(
            """soffice command was not found. Please install libreoffice
on your system and try again.

- Install instructions: https://www.libreoffice.org/get-help/install-howto/
- Mac: https://formulae.brew.sh/cask/libreoffice
- Debian: https://wiki.debian.org/LibreOffice"""
        ) from err
|
||||
|
0
unstructured/partition/doc
Normal file
0
unstructured/partition/doc
Normal file
45
unstructured/partition/doc.py
Normal file
45
unstructured/partition/doc.py
Normal file
@ -0,0 +1,45 @@
|
||||
import os
|
||||
import tempfile
|
||||
from typing import IO, List, Optional
|
||||
|
||||
from unstructured.documents.elements import Element
|
||||
from unstructured.partition.common import convert_office_doc
|
||||
from unstructured.partition.docx import partition_docx
|
||||
|
||||
|
||||
def partition_doc(filename: Optional[str] = None, file: Optional[IO] = None) -> List[Element]:
    """Partitions Microsoft Word Documents in .doc format into its document elements.

    The input is converted to ``.docx`` in a temporary directory with
    ``convert_office_doc`` and the result is passed to ``partition_docx``.

    Parameters
    ----------
    filename
        A string defining the target filename path.
    file
        A file-like object using "rb" mode --> open(filename, "rb").

    Returns
    -------
    The elements returned by ``partition_docx`` for the converted document.

    Raises
    ------
    ValueError
        If neither or both of ``filename`` and ``file`` are given, or if the
        file to convert does not exist.
    """
    if not any([filename, file]):
        raise ValueError("One of filename or file must be specified.")

    # Path of the on-disk copy made for a file-like input; None when the caller
    # supplied a filename, so the caller's own file is never deleted.
    tmp_path = None
    if filename is not None and not file:
        source_filename = filename
    elif file is not None and not filename:
        # soffice takes a path, so spool the file-like object to disk first.
        with tempfile.NamedTemporaryFile(delete=False) as tmp:
            tmp.write(file.read())
        tmp_path = source_filename = tmp.name
    else:
        raise ValueError("Only one of filename or file can be specified.")

    try:
        if not os.path.exists(source_filename):
            raise ValueError(f"The file {source_filename} does not exist.")

        # The converted file is looked up under the input's base name.
        filename_no_path = os.path.basename(os.path.abspath(source_filename))
        base_filename, _ = os.path.splitext(filename_no_path)

        with tempfile.TemporaryDirectory() as tmpdir:
            convert_office_doc(source_filename, tmpdir, target_format="docx")
            docx_filename = os.path.join(tmpdir, f"{base_filename}.docx")
            elements = partition_docx(filename=docx_filename)
    finally:
        # Remove the spooled copy so repeated calls do not leak temp files.
        if tmp_path is not None:
            os.unlink(tmp_path)

    return elements
|
@ -56,9 +56,7 @@ STYLE_TO_ELEMENT_MAPPING = {
|
||||
}
|
||||
|
||||
|
||||
def partition_docx(
|
||||
filename: Optional[str] = None, file: Optional[IO] = None, **kwargs
|
||||
) -> List[Element]:
|
||||
def partition_docx(filename: Optional[str] = None, file: Optional[IO] = None) -> List[Element]:
|
||||
"""Partitions Microsoft Word Documents in .docx format into its document elements.
|
||||
|
||||
Parameters
|
||||
|
Loading…
x
Reference in New Issue
Block a user