mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-08-16 04:34:54 +00:00
feat: add partition_docx
for Word documents (#131)
* first pass on docx parsing * linting, linting, linting * test docx with filename * added documentation * more tests; version bump * typo * another typo * another typo! * it -> its * save -> saved * remove None since it's the default argument
This commit is contained in:
parent
33b983fbf0
commit
fee95b643c
@ -1,9 +1,10 @@
|
|||||||
## 0.3.6-dev0
|
## 0.3.6-dev1
|
||||||
|
|
||||||
* Cleaning brick for removing ordered bullets `clean_ordered_bullets`.
|
* Cleaning brick for removing ordered bullets `clean_ordered_bullets`.
|
||||||
* Extract brick method for ordered bullets `extract_ordered_bullets`.
|
* Extract brick method for ordered bullets `extract_ordered_bullets`.
|
||||||
* Test for `clean_ordered_bullets`.
|
* Test for `clean_ordered_bullets`.
|
||||||
* Test for `extract_ordered_bullets`.
|
* Test for `extract_ordered_bullets`.
|
||||||
|
* Added `partition_docx` for pre-processing Word Documents.
|
||||||
|
|
||||||
## 0.3.5
|
## 0.3.5
|
||||||
|
|
||||||
|
@ -14,6 +14,36 @@ The partitioning bricks in ``unstructured`` differentiate between different sect
|
|||||||
of text in a document. For example, the partitioning bricks can help distinguish between
|
of text in a document. For example, the partitioning bricks can help distinguish between
|
||||||
titles, narrative text, and tables.
|
titles, narrative text, and tables.
|
||||||
|
|
||||||
|
|
||||||
|
``partition_docx``
|
||||||
|
------------------
|
||||||
|
|
||||||
|
The ``partition_docx`` partitioning brick pre-processes Microsoft Word documents
|
||||||
|
saved in the ``.docx`` format. This partitioning brick uses a combination of the styling
|
||||||
|
information in the document and the structure of the text to determine the type
|
||||||
|
of a text element. The ``partition_docx`` brick can take a filename or file-like object
|
||||||
|
as input, as shown in the two examples below.
|
||||||
|
|
||||||
|
Examples:
|
||||||
|
|
||||||
|
.. code:: python
|
||||||
|
|
||||||
|
import docx
|
||||||
|
|
||||||
|
from unstructured.partition.docx import partition_docx
|
||||||
|
|
||||||
|
document = docx.Document()
|
||||||
|
document.add_paragraph("Important Analysis", style="Heading 1")
|
||||||
|
document.add_paragraph("Here is my first thought.", style="Body Text")
|
||||||
|
document.add_paragraph("Here is my second thought.", style="Normal")
|
||||||
|
document.save("mydoc.docx")
|
||||||
|
|
||||||
|
elements = partition_docx(filename="mydoc.docx")
|
||||||
|
|
||||||
|
with open("mydoc.docx", "rb") as f:
|
||||||
|
elements = partition_docx(file=f)
|
||||||
|
|
||||||
|
|
||||||
``partition_html``
|
``partition_html``
|
||||||
---------------------
|
---------------------
|
||||||
|
|
||||||
@ -329,7 +359,7 @@ Examples:
|
|||||||
|
|
||||||
|
|
||||||
``clean_ordered_bullets``
|
``clean_ordered_bullets``
|
||||||
-----------------
|
-------------------------
|
||||||
|
|
||||||
Remove alpha-numeric bullets from the beginning of text up to three “sub-section” levels.
|
Remove alpha-numeric bullets from the beginning of text up to three “sub-section” levels.
|
||||||
|
|
||||||
|
72
test_unstructured/partition/test_docx.py
Normal file
72
test_unstructured/partition/test_docx.py
Normal file
@ -0,0 +1,72 @@
|
|||||||
|
import os
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
import docx
|
||||||
|
|
||||||
|
from unstructured.documents.elements import ListItem, NarrativeText, Title, Text
|
||||||
|
from unstructured.partition.docx import partition_docx
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def mock_document():
    """Builds an in-memory Word document with one paragraph per case the tests exercise.

    Each entry below is a (text, style) pair. A style of None adds the paragraph
    without an explicit style, which is the default for ``add_paragraph``."""
    paragraphs = [
        ("These are a few of my favorite things:", "Heading 1"),
        # NOTE(robinson) - this should get picked up as a list item due to the •
        ("• Parrots", "Normal"),
        ("Hockey", "List Bullet"),
        # NOTE(robinson) - this should get picked up as a title
        ("Analysis", "Normal"),
        # NOTE(robinson) - this should get dropped because it is empty
        ("", "Normal"),
        # NOTE(robinson) - this should get picked up as a narrative text
        ("This is my first thought. This is my second thought.", "Normal"),
        ("This is my third thought.", "Body Text"),
        # NOTE(robinson) - this should just be regular text
        ("2023", None),
    ]

    document = docx.Document()
    for text, style in paragraphs:
        document.add_paragraph(text, style=style)
    return document
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def expected_elements():
    """Returns the elements expected from partitioning the ``mock_document`` fixture.

    The order mirrors the paragraph order in ``mock_document``; the empty paragraph
    has no corresponding entry."""
    title = Title("These are a few of my favorite things:")
    list_items = [ListItem("Parrots"), ListItem("Hockey")]
    section_title = Title("Analysis")
    narrative = [
        NarrativeText("This is my first thought. This is my second thought."),
        NarrativeText("This is my third thought."),
    ]
    year = Text("2023")
    return [title, *list_items, section_title, *narrative, year]
|
||||||
|
|
||||||
|
|
||||||
|
def test_partition_docx_with_filename(mock_document, expected_elements, tmpdir):
    """Partitioning by filename yields the expected elements."""
    # NOTE - write into tmpdir itself, which is unique to this test. tmpdir.dirname is
    # the parent directory shared by every test in the session, so files saved there
    # can collide between tests.
    filename = os.path.join(str(tmpdir), "mock_document.docx")
    mock_document.save(filename)

    elements = partition_docx(filename=filename)
    assert elements == expected_elements
|
||||||
|
|
||||||
|
|
||||||
|
def test_partition_docx_with_file(mock_document, expected_elements, tmpdir):
    """Partitioning from a binary file-like object yields the expected elements."""
    # NOTE - write into tmpdir itself, which is unique to this test. tmpdir.dirname is
    # the parent directory shared by every test in the session, so files saved there
    # can collide between tests.
    filename = os.path.join(str(tmpdir), "mock_document.docx")
    mock_document.save(filename)

    with open(filename, "rb") as f:
        elements = partition_docx(file=f)
    assert elements == expected_elements
|
||||||
|
|
||||||
|
|
||||||
|
def test_partition_docx_raises_with_both_specified(mock_document, tmpdir):
    """Passing both a filename and a file object raises a ValueError."""
    # NOTE - write into tmpdir itself, which is unique to this test. tmpdir.dirname is
    # the parent directory shared by every test in the session, so files saved there
    # can collide between tests.
    filename = os.path.join(str(tmpdir), "mock_document.docx")
    mock_document.save(filename)

    with open(filename, "rb") as f:
        with pytest.raises(ValueError):
            partition_docx(filename=filename, file=f)
|
||||||
|
|
||||||
|
|
||||||
|
def test_partition_docx_raises_with_neither():
    """Calling partition_docx with no arguments raises a ValueError."""
    # Callable form of pytest.raises: invokes partition_docx() and expects ValueError.
    pytest.raises(ValueError, partition_docx)
|
@ -1 +1 @@
|
|||||||
__version__ = "0.3.6-dev0" # pragma: no cover
|
__version__ = "0.3.6-dev1" # pragma: no cover
|
||||||
|
112
unstructured/partition/docx.py
Normal file
112
unstructured/partition/docx.py
Normal file
@ -0,0 +1,112 @@
|
|||||||
|
from typing import IO, List, Optional
|
||||||
|
|
||||||
|
import docx
|
||||||
|
|
||||||
|
from unstructured.cleaners.core import clean_bullets
|
||||||
|
from unstructured.documents.elements import Element, ListItem, NarrativeText, Text, Title
|
||||||
|
from unstructured.partition.text_type import (
|
||||||
|
is_bulleted_text,
|
||||||
|
is_possible_narrative_text,
|
||||||
|
is_possible_title,
|
||||||
|
)
|
||||||
|
|
||||||
|
# NOTE(robinson) - documentation on built in styles can be found at the link below
|
||||||
|
# ref: https://python-docx.readthedocs.io/en/latest/user/
|
||||||
|
# styles-understanding.html#paragraph-styles-in-default-template
|
||||||
|
# Maps built-in Word paragraph style names to the element class used for paragraphs
# with that style. Styles that are absent here (including "Normal") are looked up with
# .get() by the caller and fall back to text-based classification.
STYLE_TO_ELEMENT_MAPPING = {
    **dict.fromkeys(["Body Text", "Body Text 2", "Body Text 3"], NarrativeText),
    "Caption": Text,  # TODO(robinson) - add caption element type
    # "Heading 1" through "Heading 9"
    **{f"Heading {level}": Title for level in range(1, 10)},
    "Intense Quote": Text,  # TODO(robinson) - add quote element type
    **dict.fromkeys(
        [
            "List",
            "List 2",
            "List 3",
            "List Bullet",
            "List Bullet 2",
            "List Bullet 3",
            "List Continue",
            "List Continue 2",
            "List Continue 3",
            "List Number",
            "List Number 2",
            "List Number 3",
            "List Paragraph",
        ],
        ListItem,
    ),
    "Macro Text": Text,
    "No Spacing": Text,
    "Quote": Text,  # TODO(robinson) - add quote element type
    "Subtitle": Title,
    "TOCHeading": Title,
    "Title": Title,
}
|
||||||
|
|
||||||
|
|
||||||
|
def partition_docx(filename: Optional[str] = None, file: Optional[IO] = None) -> List[Element]:
    """Partitions Microsoft Word Documents in .docx format into its document elements.

    Exactly one of ``filename`` or ``file`` must be provided. Paragraphs that do not
    convert to an element are left out of the result.

    Parameters
    ----------
    filename
        A string defining the target filename path.
    file
        A file-like object using "rb" mode --> open(filename, "rb").

    Raises
    ------
    ValueError
        If neither or both of ``filename`` and ``file`` are specified.
    """
    has_filename, has_file = bool(filename), bool(file)
    if not (has_filename or has_file):
        raise ValueError("One of filename or file must be specified.")
    if has_filename and has_file:
        raise ValueError("Only one of filename or file can be specified.")

    document = docx.Document(filename if has_filename else file)

    converted = (_paragraph_to_element(paragraph) for paragraph in document.paragraphs)
    return [element for element in converted if element is not None]
|
||||||
|
|
||||||
|
|
||||||
|
def _paragraph_to_element(paragraph: docx.text.paragraph.Paragraph) -> Optional[Text]:
    """Converts a docx Paragraph object into the appropriate unstructured document element.
    If the paragraph style is "Normal" or unknown, we try to predict the element type from the
    raw text. Returns None for paragraphs with no text content, whatever their style."""
    text = paragraph.text

    # Drop empty paragraphs before consulting the style. Otherwise an empty paragraph
    # with a mapped style (e.g. an empty "Heading 1") would produce an element with no
    # text, while the same empty paragraph in "Normal" style would be dropped.
    if not text.strip():
        return None

    # NOTE(robinson) - The "Normal" style name will return None since it's not in the mapping.
    # Unknown style names will also return None
    element_class = STYLE_TO_ELEMENT_MAPPING.get(paragraph.style.name)
    if element_class is None:
        return _text_to_element(text)
    return element_class(text)
|
||||||
|
|
||||||
|
|
||||||
|
def _text_to_element(text: str) -> Optional[Text]:
    """Converts raw text into an unstructured Text element.

    Checks are applied in order and the first match wins: bulleted text becomes a
    ListItem (with the bullet cleaned off), text shorter than two characters is
    dropped (None), then narrative text, then title, and finally plain Text."""
    if is_bulleted_text(text):
        return ListItem(text=clean_bullets(text))

    # Anything under two characters is treated as empty and produces no element.
    if len(text) < 2:
        return None

    if is_possible_narrative_text(text):
        return NarrativeText(text)
    if is_possible_title(text):
        return Title(text)
    return Text(text)
|
Loading…
x
Reference in New Issue
Block a user