diff --git a/CHANGELOG.md b/CHANGELOG.md index 4b8e29ebf..354be628f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.7.3-dev0 +## 0.7.3-dev1 ### Enhancements @@ -8,6 +8,7 @@ ### Fixes +* `convert_office_doc` no longer prints file conversion info messages to stdout. * `partition_via_api` reflects the actual filetype for the file processed in the API. ## 0.7.2 diff --git a/test_unstructured/partition/test_common.py b/test_unstructured/partition/test_common.py index 6da8e449b..48188aced 100644 --- a/test_unstructured/partition/test_common.py +++ b/test_unstructured/partition/test_common.py @@ -147,3 +147,19 @@ def test_normalize_layout_element_bulleted_list(): ListItem(text="You're cool too.", coordinates=((1, 2), (1, 4), (3, 4), (3, 2))), ListItem(text="We're all cool!", coordinates=((1, 2), (1, 4), (3, 4), (3, 2))), ] + + +class MockPopenWithError: + def __init__(self, *args, **kwargs): + pass + + def communicate(self): + return b"", b"an error occurred" + + +def test_convert_office_doc_captures_errors(monkeypatch, caplog): + import subprocess + + monkeypatch.setattr(subprocess, "Popen", MockPopenWithError) + common.convert_office_doc("no-real.docx", "fake-directory", target_format="docx") + assert "an error occurred" in caplog.text diff --git a/test_unstructured/partition/test_doc.py b/test_unstructured/partition/test_doc.py index 19b2e2b65..5c16ce144 100644 --- a/test_unstructured/partition/test_doc.py +++ b/test_unstructured/partition/test_doc.py @@ -56,7 +56,7 @@ def expected_elements(): ] -def test_partition_doc_with_filename(mock_document, expected_elements, tmpdir): +def test_partition_doc_with_filename(mock_document, expected_elements, tmpdir, capsys): docx_filename = os.path.join(tmpdir.dirname, "mock_document.docx") doc_filename = os.path.join(tmpdir.dirname, "mock_document.doc") mock_document.save(docx_filename) @@ -67,6 +67,9 @@ def test_partition_doc_with_filename(mock_document, expected_elements, tmpdir): assert 
elements[0].metadata.filename == "mock_document.doc" assert elements[0].metadata.file_directory == tmpdir.dirname + assert capsys.readouterr().out == "" + assert capsys.readouterr().err == "" + def test_partition_doc_matches_partition_docx(mock_document, expected_elements, tmpdir): docx_filename = os.path.join(tmpdir.dirname, "mock_document.docx") @@ -84,7 +87,7 @@ def test_partition_raises_with_missing_doc(mock_document, expected_elements, tmp partition_doc(filename=doc_filename) -def test_partition_doc_with_file(mock_document, expected_elements, tmpdir): +def test_partition_doc_with_file(mock_document, expected_elements, tmpdir, capsys): docx_filename = os.path.join(tmpdir.dirname, "mock_document.docx") doc_filename = os.path.join(tmpdir.dirname, "mock_document.doc") mock_document.save(docx_filename) @@ -94,6 +97,9 @@ def test_partition_doc_with_file(mock_document, expected_elements, tmpdir): elements = partition_doc(file=f) assert elements == expected_elements + assert capsys.readouterr().out == "" + assert capsys.readouterr().err == "" + def test_partition_doc_raises_with_both_specified(mock_document, tmpdir): docx_filename = os.path.join(tmpdir.dirname, "mock_document.docx") diff --git a/unstructured/__version__.py b/unstructured/__version__.py index a2a03f340..37b4438d8 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.7.3-dev0" # pragma: no cover +__version__ = "0.7.3-dev1" # pragma: no cover diff --git a/unstructured/partition/common.py b/unstructured/partition/common.py index 2aa61814a..879bc5310 100644 --- a/unstructured/partition/common.py +++ b/unstructured/partition/common.py @@ -15,6 +15,7 @@ from unstructured.documents.elements import ( PageBreak, Text, ) +from unstructured.logger import logger from unstructured.nlp.patterns import ENUMERATED_BULLETS_RE, UNICODE_BULLETS_RE @@ -139,18 +140,22 @@ def convert_office_doc(input_filename: str, output_directory: str, target_format # users who do not 
have LibreOffice installed # ref: https://stackoverflow.com/questions/38468442/ # multiple-doc-to-docx-file-conversion-using-python + command = [ + "soffice", + "--headless", + "--convert-to", + target_format, + "--outdir", + output_directory, + input_filename, + ] try: - subprocess.call( - [ - "soffice", - "--headless", - "--convert-to", - target_format, - "--outdir", - output_directory, - input_filename, - ], + process = subprocess.Popen( + command, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, ) + output, error = process.communicate() except FileNotFoundError: raise FileNotFoundError( """soffice command was not found. Please install libreoffice @@ -161,6 +166,10 @@ on your system and try again. - Debian: https://wiki.debian.org/LibreOffice""", ) + logger.info(output.decode().strip()) + if error: + logger.error(error.decode().strip()) + def exactly_one(**kwargs) -> None: """