Matt Robinson 331c7faf38
build(deps): split up dependencies by document type (#986)
* split dependencies by document type

* make pip-compile with new requirements

* add extra requirements to setup.py

* add in all docs; re pip-compile

* extra for all docs

* add pandas to xlsx

* dependency requires for tsv and csv

* handling for doc, docx and odt

* dependency check for pypandoc

* required dependencies for pandoc files

* xml and html

* markdown

* msg

* add in pdf

* add in pptx

* add in excel

* add lxml as base req

* extra all docs for local inference

* local inference installs all

* pin pillow version

* fixes for plain text tests

* fixes for doc

* update make commands

* changelog and version

* add xlrd

* update pip-compile

* pin numpy for python 3.8 support

* more constraints

* contraint on scipy

* update install docs

* constrain ipython

* add outlook to pip-compile

* more ipython constraints

* add extras to dockerfile

* pin office365 client

* few doc tweaks

* types as strings

* last pip-compile

* re pip-comple

* make tidy

* make tidy
2023-08-01 11:31:13 -04:00

273 lines
9.7 KiB
Python

import os
from tempfile import SpooledTemporaryFile
import docx
import pytest
from unstructured.documents.elements import (
Address,
ListItem,
NarrativeText,
Text,
Title,
)
from unstructured.partition.common import convert_office_doc
from unstructured.partition.doc import partition_doc
from unstructured.partition.docx import partition_docx
@pytest.fixture()
def mock_document():
document = docx.Document()
document.add_paragraph("These are a few of my favorite things:", style="Heading 1")
# NOTE(robinson) - this should get picked up as a list item due to the •
document.add_paragraph("• Parrots", style="Normal")
# NOTE(robinson) - this should get dropped because it's empty
document.add_paragraph("", style="Normal")
document.add_paragraph("Hockey", style="List Bullet")
# NOTE(robinson) - this should get dropped because it's empty
document.add_paragraph("", style="List Bullet")
# NOTE(robinson) - this should get picked up as a title
document.add_paragraph("Analysis", style="Normal")
# NOTE(robinson) - this should get dropped because it is empty
document.add_paragraph("", style="Normal")
# NOTE(robinson) - this should get picked up as a narrative text
document.add_paragraph(
"This is my first thought. This is my second thought.",
style="Normal",
)
document.add_paragraph("This is my third thought.", style="Body Text")
# NOTE(robinson) - this should just be regular text
document.add_paragraph("2023")
# NOTE(robinson) - this should be an address
document.add_paragraph("DOYLESTOWN, PA 18901")
return document
@pytest.fixture()
def expected_elements():
return [
Title("These are a few of my favorite things:"),
ListItem("Parrots"),
ListItem("Hockey"),
Title("Analysis"),
NarrativeText("This is my first thought. This is my second thought."),
NarrativeText("This is my third thought."),
Text("2023"),
Address("DOYLESTOWN, PA 18901"),
]
def test_partition_doc_from_filename(mock_document, expected_elements, tmpdir, capsys):
docx_filename = os.path.join(tmpdir.dirname, "mock_document.docx")
doc_filename = os.path.join(tmpdir.dirname, "mock_document.doc")
mock_document.save(docx_filename)
convert_office_doc(docx_filename, tmpdir.dirname, "doc")
elements = partition_doc(filename=doc_filename)
assert elements == expected_elements
assert elements[0].metadata.filename == "mock_document.doc"
assert elements[0].metadata.file_directory == tmpdir.dirname
assert capsys.readouterr().out == ""
assert capsys.readouterr().err == ""
def test_partition_doc_from_filename_with_metadata_filename(
mock_document,
expected_elements,
tmpdir,
):
docx_filename = os.path.join(tmpdir.dirname, "mock_document.docx")
doc_filename = os.path.join(tmpdir.dirname, "mock_document.doc")
mock_document.save(docx_filename)
convert_office_doc(docx_filename, tmpdir.dirname, "doc")
elements = partition_doc(filename=doc_filename, metadata_filename="test")
assert elements == expected_elements
assert all(element.metadata.filename == "test" for element in elements)
def test_partition_doc_matches_partition_docx(mock_document, expected_elements, tmpdir):
docx_filename = os.path.join(tmpdir.dirname, "mock_document.docx")
doc_filename = os.path.join(tmpdir.dirname, "mock_document.doc")
mock_document.save(docx_filename)
convert_office_doc(docx_filename, tmpdir.dirname, "doc")
assert partition_doc(filename=doc_filename) == partition_docx(filename=docx_filename)
def test_partition_raises_with_missing_doc(mock_document, expected_elements, tmpdir):
doc_filename = os.path.join(tmpdir.dirname, "asdf.doc")
with pytest.raises(ValueError):
partition_doc(filename=doc_filename)
def test_partition_doc_from_file_with_filter(mock_document, expected_elements, tmpdir, capsys):
docx_filename = os.path.join(tmpdir.dirname, "mock_document.docx")
doc_filename = os.path.join(tmpdir.dirname, "mock_document.doc")
mock_document.save(docx_filename)
convert_office_doc(docx_filename, tmpdir.dirname, "doc")
with open(doc_filename, "rb") as f:
elements = partition_doc(file=f, libre_office_filter="MS Word 2007 XML")
assert elements == expected_elements
assert capsys.readouterr().out == ""
assert capsys.readouterr().err == ""
for element in elements:
assert element.metadata.filename is None
def test_partition_doc_from_file_with_no_filter(mock_document, expected_elements, tmpdir, capsys):
docx_filename = os.path.join(tmpdir.dirname, "mock_document.docx")
doc_filename = os.path.join(tmpdir.dirname, "mock_document.doc")
mock_document.save(docx_filename)
convert_office_doc(docx_filename, tmpdir.dirname, "doc")
with open(doc_filename, "rb") as f:
elements = partition_doc(file=f, libre_office_filter=None)
assert elements == expected_elements
assert capsys.readouterr().out == ""
assert capsys.readouterr().err == ""
for element in elements:
assert element.metadata.filename is None
def test_partition_doc_from_file_with_metadata_filename(mock_document, tmpdir):
docx_filename = os.path.join(tmpdir.dirname, "mock_document.docx")
doc_filename = os.path.join(tmpdir.dirname, "mock_document.doc")
mock_document.save(docx_filename)
convert_office_doc(docx_filename, tmpdir.dirname, "doc")
with open(doc_filename, "rb") as f:
elements = partition_doc(file=f, metadata_filename="test")
for element in elements:
assert element.metadata.filename == "test"
def test_partition_doc_raises_with_both_specified(mock_document, tmpdir):
docx_filename = os.path.join(tmpdir.dirname, "mock_document.docx")
doc_filename = os.path.join(tmpdir.dirname, "mock_document.doc")
mock_document.save(docx_filename)
convert_office_doc(docx_filename, tmpdir.dirname, "doc")
with open(doc_filename, "rb") as f, pytest.raises(ValueError):
partition_doc(filename=doc_filename, file=f)
def test_partition_doc_raises_with_neither():
with pytest.raises(ValueError):
partition_doc()
def test_partition_doc_from_file_exclude_metadata(mock_document, tmpdir):
docx_filename = os.path.join(tmpdir.dirname, "mock_document.docx")
doc_filename = os.path.join(tmpdir.dirname, "mock_document.doc")
mock_document.save(docx_filename)
convert_office_doc(docx_filename, tmpdir.dirname, "doc")
with open(doc_filename, "rb") as f:
elements = partition_doc(file=f, include_metadata=False)
assert elements[0].metadata.filetype is None
assert elements[0].metadata.page_name is None
assert elements[0].metadata.filename is None
def test_partition_doc_from_filename_exclude_metadata(mock_document, tmpdir):
docx_filename = os.path.join(tmpdir.dirname, "mock_document.docx")
doc_filename = os.path.join(tmpdir.dirname, "mock_document.doc")
mock_document.save(docx_filename)
convert_office_doc(docx_filename, tmpdir.dirname, "doc")
elements = partition_doc(filename=doc_filename, include_metadata=False)
assert elements[0].metadata.filetype is None
assert elements[0].metadata.page_name is None
assert elements[0].metadata.filename is None
def test_partition_doc_metadata_date(
mocker,
filename="example-docs/fake.doc",
):
mocked_last_modification_date = "2029-07-05T09:24:28"
mocker.patch(
"unstructured.partition.doc.get_last_modified_date",
return_value=mocked_last_modification_date,
)
elements = partition_doc(filename=filename)
assert elements[0].metadata.last_modified == mocked_last_modification_date
def test_partition_doc_metadata_date_with_custom_metadata(
mocker,
filename="example-docs/fake.doc",
):
mocked_last_modification_date = "2029-07-05T09:24:28"
expected_last_modified_date = "2020-07-05T09:24:28"
mocker.patch(
"unstructured.partition.doc.get_last_modified_date",
return_value=mocked_last_modification_date,
)
elements = partition_doc(
filename=filename,
metadata_last_modified=expected_last_modified_date,
)
assert elements[0].metadata.last_modified == expected_last_modified_date
def test_partition_doc_from_file_metadata_date(
mocker,
filename="example-docs/fake.doc",
):
mocked_last_modification_date = "2029-07-05T09:24:28"
mocker.patch(
"unstructured.partition.doc.get_last_modified_date_from_file",
return_value=mocked_last_modification_date,
)
with open(filename, "rb") as f:
elements = partition_doc(file=f)
assert elements[0].metadata.last_modified == mocked_last_modification_date
def test_partition_doc_from_file_metadata_date_with_custom_metadata(
mocker,
filename="example-docs/fake.doc",
):
mocked_last_modification_date = "2029-07-05T09:24:28"
expected_last_modified_date = "2020-07-05T09:24:28"
mocker.patch(
"unstructured.partition.doc.get_last_modified_date_from_file",
return_value=mocked_last_modification_date,
)
with open(filename, "rb") as f:
elements = partition_doc(file=f, metadata_last_modified=expected_last_modified_date)
assert elements[0].metadata.last_modified == expected_last_modified_date
@pytest.mark.xfail(reason="handling of last_modified for file vs. filename to be refined later")
def test_partition_doc_from_file_without_metadata_date(
filename="example-docs/fake.doc",
):
"""Test partition_doc() with file that are not possible to get last modified date"""
with open(filename, "rb") as f:
sf = SpooledTemporaryFile()
sf.write(f.read())
sf.seek(0)
elements = partition_doc(file=sf, metadata_date="2020-07-05")
assert elements[0].metadata.date == "2020-07-05"