rfctr(doc): organize test_doc.py (#3017)

**Summary**
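Organize DOC tests into related groups, each introduced by a
section-marker comment (e.g. `# -- .metadata.filename ---`; these are
plain comments, not pytest markers). This makes it easier to assess
coverage and find tests related to particular behaviors.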

This is in preparation for adding tests related to DOC image extraction.

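No changes to test logic, purely line-block moves plus the new
section-marker comments. The only other changes are the version bump
to 0.13.8-dev7 in the changelog heading and `__version__`.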

- Move module-level fixtures to the bottom.
- Organize tests into related groups with markers.
This commit is contained in:
Steve Canny 2024-05-14 13:57:31 -07:00 committed by GitHub
parent b4a6009c09
commit db186dc23b
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 140 additions and 119 deletions

View File

@ -1,4 +1,4 @@
## 0.13.8-dev6
## 0.13.8-dev7
### Enhancements

View File

@ -20,59 +20,15 @@ from unstructured.partition.doc import partition_doc
from unstructured.partition.docx import partition_docx
def test_partition_doc_for_deterministic_and_unique_ids():
ids = [element.id for element in partition_doc("example-docs/duplicate-paragraphs.doc")]
assert ids == [
"ade273c622c48d67a7be7b3816d5b4d8",
"7d0b32fdf169f9578723486cb4bc1235",
"1feb6e8e9c1662cfaef75907aeeb0900",
"aa2a8ac10143b12f0fe2087837ea11d2",
"da31ba7ed3919067d2c6572dc1617271",
"1914359c179a160df921b769acf8c353",
"f9d0d379fc791bae487b7a45f65caa50",
]
def test_partition_doc_matches_partition_docx(mock_document, expected_elements, tmpdir):
docx_filename = os.path.join(tmpdir.dirname, "mock_document.docx")
doc_filename = os.path.join(tmpdir.dirname, "mock_document.doc")
mock_document.save(docx_filename)
convert_office_doc(docx_filename, tmpdir.dirname, "doc")
assert partition_doc(filename=doc_filename) == partition_docx(filename=docx_filename)
@pytest.fixture()
def mock_document():
document = docx.Document()
document.add_paragraph("These are a few of my favorite things:", style="Heading 1")
# NOTE(robinson) - this should get picked up as a list item due to the •
document.add_paragraph("• Parrots", style="Normal")
# NOTE(robinson) - this should get dropped because it's empty
document.add_paragraph("", style="Normal")
document.add_paragraph("Hockey", style="List Bullet")
# NOTE(robinson) - this should get dropped because it's empty
document.add_paragraph("", style="List Bullet")
# NOTE(robinson) - this should get picked up as a title
document.add_paragraph("Analysis", style="Normal")
# NOTE(robinson) - this should get dropped because it is empty
document.add_paragraph("", style="Normal")
# NOTE(robinson) - this should get picked up as a narrative text
document.add_paragraph("This is my first thought. This is my second thought.", style="Normal")
document.add_paragraph("This is my third thought.", style="Body Text")
# NOTE(robinson) - this should just be regular text
document.add_paragraph("2023")
# NOTE(robinson) - this should be an address
document.add_paragraph("DOYLESTOWN, PA 18901")
return document
@pytest.fixture()
def expected_elements():
return [
Title("These are a few of my favorite things:"),
ListItem("Parrots"),
ListItem("Hockey"),
Title("Analysis"),
NarrativeText("This is my first thought. This is my second thought."),
NarrativeText("This is my third thought."),
Text("2023"),
Address("DOYLESTOWN, PA 18901"),
]
# -- document-source (file or filename) ----------------------------------------------------------
def test_partition_doc_from_filename(mock_document, expected_elements, tmpdir, capsys):
@ -88,36 +44,6 @@ def test_partition_doc_from_filename(mock_document, expected_elements, tmpdir, c
assert capsys.readouterr().err == ""
def test_partition_doc_from_filename_with_metadata_filename(
mock_document,
expected_elements,
tmpdir,
):
docx_filename = os.path.join(tmpdir.dirname, "mock_document.docx")
doc_filename = os.path.join(tmpdir.dirname, "mock_document.doc")
mock_document.save(docx_filename)
convert_office_doc(docx_filename, tmpdir.dirname, "doc")
elements = partition_doc(filename=doc_filename, metadata_filename="test")
assert elements == expected_elements
assert all(element.metadata.filename == "test" for element in elements)
def test_partition_doc_matches_partition_docx(mock_document, expected_elements, tmpdir):
docx_filename = os.path.join(tmpdir.dirname, "mock_document.docx")
doc_filename = os.path.join(tmpdir.dirname, "mock_document.doc")
mock_document.save(docx_filename)
convert_office_doc(docx_filename, tmpdir.dirname, "doc")
assert partition_doc(filename=doc_filename) == partition_docx(filename=docx_filename)
def test_partition_raises_with_missing_doc(mock_document, expected_elements, tmpdir):
doc_filename = os.path.join(tmpdir.dirname, "asdf.doc")
with pytest.raises(ValueError):
partition_doc(filename=doc_filename)
def test_partition_doc_from_file_with_filter(mock_document, expected_elements, tmpdir, capsys):
docx_filename = os.path.join(tmpdir.dirname, "mock_document.docx")
doc_filename = os.path.join(tmpdir.dirname, "mock_document.doc")
@ -148,18 +74,6 @@ def test_partition_doc_from_file_with_no_filter(mock_document, expected_elements
assert element.metadata.filename is None
def test_partition_doc_from_file_with_metadata_filename(mock_document, tmpdir):
docx_filename = os.path.join(tmpdir.dirname, "mock_document.docx")
doc_filename = os.path.join(tmpdir.dirname, "mock_document.doc")
mock_document.save(docx_filename)
convert_office_doc(docx_filename, tmpdir.dirname, "doc")
with open(doc_filename, "rb") as f:
elements = partition_doc(file=f, metadata_filename="test")
for element in elements:
assert element.metadata.filename == "test"
def test_partition_doc_raises_with_both_specified(mock_document, tmpdir):
docx_filename = os.path.join(tmpdir.dirname, "mock_document.docx")
doc_filename = os.path.join(tmpdir.dirname, "mock_document.doc")
@ -175,6 +89,29 @@ def test_partition_doc_raises_with_neither():
partition_doc()
def test_partition_raises_with_missing_doc(mock_document, expected_elements, tmpdir):
doc_filename = os.path.join(tmpdir.dirname, "asdf.doc")
with pytest.raises(ValueError):
partition_doc(filename=doc_filename)
# -- `include_metadata` arg ----------------------------------------------------------------------
def test_partition_doc_from_filename_exclude_metadata(mock_document, tmpdir):
docx_filename = os.path.join(tmpdir.dirname, "mock_document.docx")
doc_filename = os.path.join(tmpdir.dirname, "mock_document.doc")
mock_document.save(docx_filename)
convert_office_doc(docx_filename, tmpdir.dirname, "doc")
elements = partition_doc(filename=doc_filename, include_metadata=False)
assert elements[0].metadata.filetype is None
assert elements[0].metadata.page_name is None
assert elements[0].metadata.filename is None
def test_partition_doc_from_file_exclude_metadata(mock_document, tmpdir):
docx_filename = os.path.join(tmpdir.dirname, "mock_document.docx")
doc_filename = os.path.join(tmpdir.dirname, "mock_document.doc")
@ -189,17 +126,37 @@ def test_partition_doc_from_file_exclude_metadata(mock_document, tmpdir):
assert elements[0].metadata.filename is None
def test_partition_doc_from_filename_exclude_metadata(mock_document, tmpdir):
# -- .metadata.filename --------------------------------------------------------------------------
def test_partition_doc_from_filename_with_metadata_filename(
mock_document,
expected_elements,
tmpdir,
):
docx_filename = os.path.join(tmpdir.dirname, "mock_document.docx")
doc_filename = os.path.join(tmpdir.dirname, "mock_document.doc")
mock_document.save(docx_filename)
convert_office_doc(docx_filename, tmpdir.dirname, "doc")
elements = partition_doc(filename=doc_filename, include_metadata=False)
elements = partition_doc(filename=doc_filename, metadata_filename="test")
assert elements == expected_elements
assert all(element.metadata.filename == "test" for element in elements)
assert elements[0].metadata.filetype is None
assert elements[0].metadata.page_name is None
assert elements[0].metadata.filename is None
def test_partition_doc_from_file_with_metadata_filename(mock_document, tmpdir):
docx_filename = os.path.join(tmpdir.dirname, "mock_document.docx")
doc_filename = os.path.join(tmpdir.dirname, "mock_document.doc")
mock_document.save(docx_filename)
convert_office_doc(docx_filename, tmpdir.dirname, "doc")
with open(doc_filename, "rb") as f:
elements = partition_doc(file=f, metadata_filename="test")
for element in elements:
assert element.metadata.filename == "test"
# -- .metadata.last_modified ---------------------------------------------------------------------
def test_partition_doc_metadata_date(
@ -283,6 +240,19 @@ def test_partition_doc_from_file_explicit_get_metadata_date(
assert elements[0].metadata.last_modified == mocked_last_modification_date
def test_partition_doc_from_file_without_metadata_date(
filename="example-docs/fake.doc",
):
"""Test partition_doc() with file that are not possible to get last modified date"""
with open(filename, "rb") as f:
sf = SpooledTemporaryFile()
sf.write(f.read())
sf.seek(0)
elements = partition_doc(file=sf, date_from_file_object=True)
assert elements[0].metadata.last_modified is None
def test_partition_doc_from_file_metadata_date_with_custom_metadata(
mocker,
filename="example-docs/fake.doc",
@ -302,17 +272,23 @@ def test_partition_doc_from_file_metadata_date_with_custom_metadata(
assert elements[0].metadata.last_modified == expected_last_modified_date
def test_partition_doc_from_file_without_metadata_date(
filename="example-docs/fake.doc",
):
"""Test partition_doc() with file that are not possible to get last modified date"""
with open(filename, "rb") as f:
sf = SpooledTemporaryFile()
sf.write(f.read())
sf.seek(0)
elements = partition_doc(file=sf, date_from_file_object=True)
# -- language-recognition metadata ---------------------------------------------------------------
assert elements[0].metadata.last_modified is None
def test_partition_doc_element_metadata_has_languages():
filename = "example-docs/fake-doc-emphasized-text.doc"
elements = partition_doc(filename=filename)
assert elements[0].metadata.languages == ["eng"]
def test_partition_doc_respects_detect_language_per_element():
filename = "example-docs/language-docs/eng_spa_mult.doc"
elements = partition_doc(filename=filename, detect_language_per_element=True)
langs = [element.metadata.languages for element in elements]
assert langs == [["eng"], ["spa", "eng"], ["eng"], ["eng"], ["spa"]]
# -- miscellaneous -------------------------------------------------------------------------------
def test_partition_doc_grabs_emphasized_texts():
@ -352,14 +328,59 @@ def test_add_chunking_strategy_on_partition_doc(filename="example-docs/fake.doc"
assert chunk_elements == chunks
def test_partition_doc_element_metadata_has_languages():
filename = "example-docs/fake-doc-emphasized-text.doc"
elements = partition_doc(filename=filename)
assert elements[0].metadata.languages == ["eng"]
def test_partition_doc_for_deterministic_and_unique_ids():
ids = [element.id for element in partition_doc("example-docs/duplicate-paragraphs.doc")]
assert ids == [
"ade273c622c48d67a7be7b3816d5b4d8",
"7d0b32fdf169f9578723486cb4bc1235",
"1feb6e8e9c1662cfaef75907aeeb0900",
"aa2a8ac10143b12f0fe2087837ea11d2",
"da31ba7ed3919067d2c6572dc1617271",
"1914359c179a160df921b769acf8c353",
"f9d0d379fc791bae487b7a45f65caa50",
]
def test_partition_doc_respects_detect_language_per_element():
filename = "example-docs/language-docs/eng_spa_mult.doc"
elements = partition_doc(filename=filename, detect_language_per_element=True)
langs = [element.metadata.languages for element in elements]
assert langs == [["eng"], ["spa", "eng"], ["eng"], ["eng"], ["spa"]]
# == module-level fixtures =======================================================================
@pytest.fixture()
def expected_elements():
return [
Title("These are a few of my favorite things:"),
ListItem("Parrots"),
ListItem("Hockey"),
Title("Analysis"),
NarrativeText("This is my first thought. This is my second thought."),
NarrativeText("This is my third thought."),
Text("2023"),
Address("DOYLESTOWN, PA 18901"),
]
@pytest.fixture()
def mock_document():
document = docx.Document()
document.add_paragraph("These are a few of my favorite things:", style="Heading 1")
# NOTE(robinson) - this should get picked up as a list item due to the •
document.add_paragraph("• Parrots", style="Normal")
# NOTE(robinson) - this should get dropped because it's empty
document.add_paragraph("", style="Normal")
document.add_paragraph("Hockey", style="List Bullet")
# NOTE(robinson) - this should get dropped because it's empty
document.add_paragraph("", style="List Bullet")
# NOTE(robinson) - this should get picked up as a title
document.add_paragraph("Analysis", style="Normal")
# NOTE(robinson) - this should get dropped because it is empty
document.add_paragraph("", style="Normal")
# NOTE(robinson) - this should get picked up as a narrative text
document.add_paragraph("This is my first thought. This is my second thought.", style="Normal")
document.add_paragraph("This is my third thought.", style="Body Text")
# NOTE(robinson) - this should just be regular text
document.add_paragraph("2023")
# NOTE(robinson) - this should be an address
document.add_paragraph("DOYLESTOWN, PA 18901")
return document

View File

@ -1 +1 @@
__version__ = "0.13.8-dev6" # pragma: no cover
__version__ = "0.13.8-dev7" # pragma: no cover