diff --git a/CHANGELOG.md b/CHANGELOG.md index d3b92ecaf..687f7dd2f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,9 +1,10 @@ -## 0.3.6-dev0 +## 0.3.6-dev1 * Cleaning brick for removing ordered bullets `clean_ordered_bullets`. * Extract brick method for ordered bullets `extract_ordered_bullets`. * Test for `clean_ordered_bullets`. * Test for `extract_ordered_bullets`. +* Added `partition_docx` for pre-processing Word Documents. ## 0.3.5 diff --git a/docs/source/bricks.rst b/docs/source/bricks.rst index 12d87d67c..413f3b01a 100644 --- a/docs/source/bricks.rst +++ b/docs/source/bricks.rst @@ -14,6 +14,36 @@ The partitioning bricks in ``unstructured`` differentiate between different sect of text in a document. For example, the partitioning bricks can help distinguish between titles, narrative text, and tables. + +``partition_docx`` +------------------ + +The ``partition_docx`` partitioning brick pre-processes Microsoft Word documents +saved in the ``.docx`` format. This partitioning brick uses a combination of the styling +information in the document and the structure of the text to determine the type +of a text element. The ``partition_docx`` brick can take a filename or file-like object +as input, as shown in the two examples below. + +Examples: + +.. code:: python + + import docx + + from unstructured.partition.docx import partition_docx + + document = docx.Document() + document.add_paragraph("Important Analysis", style="Heading 1") + document.add_paragraph("Here is my first thought.", style="Body Text") + document.add_paragraph("Here is my second thought.", style="Normal") + document.save("mydoc.docx") + + elements = partition_docx(filename="mydoc.docx") + + with open("mydoc.docx", "rb") as f: + elements = partition_docx(file=f) + + ``partition_html`` --------------------- @@ -329,7 +359,7 @@ Examples: ``clean_ordered_bullets`` ------------------ +------------------------- Remove alpha-numeric bullets from the beginning of text up to three “sub-section” levels. 
diff --git a/test_unstructured/partition/test_docx.py b/test_unstructured/partition/test_docx.py new file mode 100644 index 000000000..aaa9f94c5 --- /dev/null +++ b/test_unstructured/partition/test_docx.py @@ -0,0 +1,72 @@ +import os +import pytest + +import docx + +from unstructured.documents.elements import ListItem, NarrativeText, Title, Text +from unstructured.partition.docx import partition_docx + + +@pytest.fixture +def mock_document(): + document = docx.Document() + + document.add_paragraph("These are a few of my favorite things:", style="Heading 1") + # NOTE(robinson) - this should get picked up as a list item due to the • + document.add_paragraph("• Parrots", style="Normal") + document.add_paragraph("Hockey", style="List Bullet") + # NOTE(robinson) - this should get picked up as a title + document.add_paragraph("Analysis", style="Normal") + # NOTE(robinson) - this should get dropped because it is empty + document.add_paragraph("", style="Normal") + # NOTE(robinson) - this should get picked up as a narrative text + document.add_paragraph("This is my first thought. This is my second thought.", style="Normal") + document.add_paragraph("This is my third thought.", style="Body Text") + # NOTE(robinson) - this should just be regular text + document.add_paragraph("2023") + + return document + + +@pytest.fixture +def expected_elements(): + return [ + Title("These are a few of my favorite things:"), + ListItem("Parrots"), + ListItem("Hockey"), + Title("Analysis"), + NarrativeText("This is my first thought. 
This is my second thought."), + NarrativeText("This is my third thought."), + Text("2023"), + ] + + +def test_partition_docx_with_filename(mock_document, expected_elements, tmpdir): + filename = os.path.join(tmpdir.dirname, "mock_document.docx") + mock_document.save(filename) + + elements = partition_docx(filename=filename) + assert elements == expected_elements + + +def test_partition_docx_with_file(mock_document, expected_elements, tmpdir): + filename = os.path.join(tmpdir.dirname, "mock_document.docx") + mock_document.save(filename) + + with open(filename, "rb") as f: + elements = partition_docx(file=f) + assert elements == expected_elements + + +def test_partition_docx_raises_with_both_specified(mock_document, tmpdir): + filename = os.path.join(tmpdir.dirname, "mock_document.docx") + mock_document.save(filename) + + with open(filename, "rb") as f: + with pytest.raises(ValueError): + partition_docx(filename=filename, file=f) + + +def test_partition_docx_raises_with_neither(): + with pytest.raises(ValueError): + partition_docx() diff --git a/unstructured/__version__.py b/unstructured/__version__.py index ca55e46be..272bf81d4 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.3.6-dev0" # pragma: no cover +__version__ = "0.3.6-dev1" # pragma: no cover diff --git a/unstructured/partition/docx.py b/unstructured/partition/docx.py new file mode 100644 index 000000000..6558de261 --- /dev/null +++ b/unstructured/partition/docx.py @@ -0,0 +1,112 @@ +from typing import IO, List, Optional + +import docx + +from unstructured.cleaners.core import clean_bullets +from unstructured.documents.elements import Element, ListItem, NarrativeText, Text, Title +from unstructured.partition.text_type import ( + is_bulleted_text, + is_possible_narrative_text, + is_possible_title, +) + +# NOTE(robinson) - documentation on built in styles can be found at the link below +# ref: https://python-docx.readthedocs.io/en/latest/user/ +# 
styles-understanding.html#paragraph-styles-in-default-template +STYLE_TO_ELEMENT_MAPPING = { + "Body Text": NarrativeText, + "Body Text 2": NarrativeText, + "Body Text 3": NarrativeText, + "Caption": Text, # TODO(robinson) - add caption element type + "Heading 1": Title, + "Heading 2": Title, + "Heading 3": Title, + "Heading 4": Title, + "Heading 5": Title, + "Heading 6": Title, + "Heading 7": Title, + "Heading 8": Title, + "Heading 9": Title, + "Intense Quote": Text, # TODO(robinson) - add quote element type + "List": ListItem, + "List 2": ListItem, + "List 3": ListItem, + "List Bullet": ListItem, + "List Bullet 2": ListItem, + "List Bullet 3": ListItem, + "List Continue": ListItem, + "List Continue 2": ListItem, + "List Continue 3": ListItem, + "List Number": ListItem, + "List Number 2": ListItem, + "List Number 3": ListItem, + "List Paragraph": ListItem, + "Macro Text": Text, + "No Spacing": Text, + "Quote": Text, # TODO(robinson) - add quote element type + "Subtitle": Title, + "TOCHeading": Title, + "Title": Title, +} + + +def partition_docx(filename: Optional[str] = None, file: Optional[IO] = None) -> List[Element]: + """Partitions Microsoft Word Documents in .docx format into its document elements. + + Parameters + ---------- + filename + A string defining the target filename path. + file + A file-like object using "rb" mode --> open(filename, "rb"). 
+ """ + + if not any([filename, file]): + raise ValueError("One of filename or file must be specified.") + + if filename is not None and not file: + document = docx.Document(filename) + elif file is not None and not filename: + document = docx.Document(file) + else: + raise ValueError("Only one of filename or file can be specified.") + + elements: List[Element] = [] + for paragraph in document.paragraphs: + element = _paragraph_to_element(paragraph) + if element is not None: + elements.append(element) + + return elements + + +def _paragraph_to_element(paragraph: docx.text.paragraph.Paragraph) -> Optional[Text]: + """Converts a docx Paragraph object into the appropriate unstructured document element. + If the paragraph style is "Normal" or unknown, we try to predict the element type from the + raw text.""" + text = paragraph.text + style_name = paragraph.style.name + + element_class = STYLE_TO_ELEMENT_MAPPING.get(style_name) + + # NOTE(robinson) - The "Normal" style name will return None since it's not in the mapping. + # Unknown style names will also return None + if element_class is None: + return _text_to_element(text) + else: + return element_class(text) + + +def _text_to_element(text: str) -> Optional[Text]: + """Converts raw text into an unstructured Text element.""" + if is_bulleted_text(text): + return ListItem(text=clean_bullets(text)) + + if len(text) < 2: + return None + elif is_possible_narrative_text(text): + return NarrativeText(text) + elif is_possible_title(text): + return Title(text) + else: + return Text(text)