Matt Robinson 339c133326
fix: cleanup from live .docx tests (#177)
* add env var for cap threshold; raise default threshold

* update docs and tests

* added check for ending in a comma

* update docs

* no caps check for all upper text

* capture Text in html and text

* check category in Text equality check

* lower case all caps before checking for verbs

* added check for us city/state/zip

* added address type

* add address to html

* add address to text

* fix for text tests; escape for large text segments

* refactor regex for readability

* update comment

* additional test for text with linebreaks

* update docs

* update changelog

* update elements docs

* remove old comment

* case -> cast

* type fix
2023-01-26 15:52:25 +00:00

80 lines
2.8 KiB
Python

import os
import pytest
import docx
from unstructured.documents.elements import Address, ListItem, NarrativeText, Title, Text
from unstructured.partition.docx import partition_docx
@pytest.fixture
def mock_document():
    """Build an in-memory .docx document used as input by the partition tests.

    The paragraphs are listed as ``(text, style)`` pairs and appended in order.
    A style of ``None`` is the value ``add_paragraph`` uses when no style is given.
    """
    paragraphs = [
        ("These are a few of my favorite things:", "Heading 1"),
        # NOTE(robinson) - this should get picked up as a list item due to the •
        ("• Parrots", "Normal"),
        # NOTE(robinson) - this should get dropped because it's empty
        ("", "Normal"),
        ("Hockey", "List Bullet"),
        # NOTE(robinson) - this should get dropped because it's empty
        ("", "List Bullet"),
        # NOTE(robinson) - this should get picked up as a title
        ("Analysis", "Normal"),
        # NOTE(robinson) - this should get dropped because it is empty
        ("", "Normal"),
        # NOTE(robinson) - this should get picked up as a narrative text
        ("This is my first thought. This is my second thought.", "Normal"),
        ("This is my third thought.", "Body Text"),
        # NOTE(robinson) - this should just be regular text
        ("2023", None),
        # NOTE(robinson) - this should be an address
        ("DOYLESTOWN, PA 18901", None),
    ]

    document = docx.Document()
    for text, style in paragraphs:
        document.add_paragraph(text, style=style)
    return document
@pytest.fixture
def expected_elements():
    """Return the elements the tests expect from partitioning ``mock_document``.

    The list is in document order. The empty paragraphs added by ``mock_document``
    have no entry here, and the list item text omits the leading bullet character.
    """
    elements = [Title("These are a few of my favorite things:")]
    elements += [ListItem(text) for text in ("Parrots", "Hockey")]
    elements.append(Title("Analysis"))
    elements += [
        NarrativeText("This is my first thought. This is my second thought."),
        NarrativeText("This is my third thought."),
    ]
    elements.append(Text("2023"))
    elements.append(Address("DOYLESTOWN, PA 18901"))
    return elements
def test_partition_docx_with_filename(mock_document, expected_elements, tmpdir):
    """Partitioning a saved .docx by ``filename`` yields the expected elements."""
    # NOTE - save inside this test's own temporary directory. `tmpdir.dirname` is
    # the parent of that directory, so files written there are not isolated per test.
    filename = os.path.join(str(tmpdir), "mock_document.docx")
    mock_document.save(filename)

    elements = partition_docx(filename=filename)
    assert elements == expected_elements
def test_partition_docx_with_file(mock_document, expected_elements, tmpdir):
    """Partitioning an open binary ``file`` object yields the expected elements."""
    # NOTE - save inside this test's own temporary directory. `tmpdir.dirname` is
    # the parent of that directory, so files written there are not isolated per test.
    filename = os.path.join(str(tmpdir), "mock_document.docx")
    mock_document.save(filename)

    with open(filename, "rb") as f:
        elements = partition_docx(file=f)
    assert elements == expected_elements
def test_partition_docx_raises_with_both_specified(mock_document, tmpdir):
    """Passing both ``filename`` and ``file`` to partition_docx raises ValueError."""
    # NOTE - save inside this test's own temporary directory. `tmpdir.dirname` is
    # the parent of that directory, so files written there are not isolated per test.
    filename = os.path.join(str(tmpdir), "mock_document.docx")
    mock_document.save(filename)

    with open(filename, "rb") as f, pytest.raises(ValueError):
        partition_docx(filename=filename, file=f)
def test_partition_docx_raises_with_neither():
    """Calling partition_docx with neither ``filename`` nor ``file`` raises ValueError."""
    expect_value_error = pytest.raises(ValueError)
    with expect_value_error:
        partition_docx()