fix: suppress file conversion warnings in convert_office_doc (#703)

* test that output is suppressed

* add test for error output

* changelog and version
This commit is contained in:
Matt Robinson 2023-06-08 12:33:06 -04:00 committed by GitHub
parent 559a5578ba
commit c1ba090c34
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 46 additions and 14 deletions

View File

@ -1,4 +1,4 @@
## 0.7.3-dev0
## 0.7.3-dev1
### Enhancements
@ -8,6 +8,7 @@
### Fixes
* `convert_office_doc` no longer prints file conversion info messages to stdout.
* `partition_via_api` reflects the actual filetype for the file processed in the API.
## 0.7.2

View File

@ -147,3 +147,19 @@ def test_normalize_layout_element_bulleted_list():
ListItem(text="You're cool too.", coordinates=((1, 2), (1, 4), (3, 4), (3, 2))),
ListItem(text="We're all cool!", coordinates=((1, 2), (1, 4), (3, 4), (3, 2))),
]
class MockPopenWithError:
    """Stand-in for ``subprocess.Popen`` that always reports error output.

    Any positional or keyword arguments are accepted and ignored, so the
    mock can be constructed the same way a real ``Popen`` would be.
    """

    def __init__(self, *args, **kwargs):
        """Accept and discard whatever arguments are supplied."""

    def communicate(self):
        """Return a two-item tuple: empty bytes, then a fixed error message."""
        stdout_bytes, stderr_bytes = b"", b"an error occurred"
        return stdout_bytes, stderr_bytes
def test_convert_office_doc_captures_errors(monkeypatch, caplog):
    """Error bytes returned by the conversion process should show up in the log."""
    import subprocess

    # Replace Popen with the mock so no real process is launched.
    monkeypatch.setattr(subprocess, "Popen", MockPopenWithError)

    common.convert_office_doc(
        input_filename="no-real.docx",
        output_directory="fake-directory",
        target_format="docx",
    )

    expected_message = "an error occurred"
    assert expected_message in caplog.text

View File

@ -56,7 +56,7 @@ def expected_elements():
]
def test_partition_doc_with_filename(mock_document, expected_elements, tmpdir):
def test_partition_doc_with_filename(mock_document, expected_elements, tmpdir, capsys):
docx_filename = os.path.join(tmpdir.dirname, "mock_document.docx")
doc_filename = os.path.join(tmpdir.dirname, "mock_document.doc")
mock_document.save(docx_filename)
@ -67,6 +67,9 @@ def test_partition_doc_with_filename(mock_document, expected_elements, tmpdir):
assert elements[0].metadata.filename == "mock_document.doc"
assert elements[0].metadata.file_directory == tmpdir.dirname
assert capsys.readouterr().out == ""
assert capsys.readouterr().err == ""
def test_partition_doc_matches_partition_docx(mock_document, expected_elements, tmpdir):
docx_filename = os.path.join(tmpdir.dirname, "mock_document.docx")
@ -84,7 +87,7 @@ def test_partition_raises_with_missing_doc(mock_document, expected_elements, tmp
partition_doc(filename=doc_filename)
def test_partition_doc_with_file(mock_document, expected_elements, tmpdir):
def test_partition_doc_with_file(mock_document, expected_elements, tmpdir, capsys):
docx_filename = os.path.join(tmpdir.dirname, "mock_document.docx")
doc_filename = os.path.join(tmpdir.dirname, "mock_document.doc")
mock_document.save(docx_filename)
@ -94,6 +97,9 @@ def test_partition_doc_with_file(mock_document, expected_elements, tmpdir):
elements = partition_doc(file=f)
assert elements == expected_elements
assert capsys.readouterr().out == ""
assert capsys.readouterr().err == ""
def test_partition_doc_raises_with_both_specified(mock_document, tmpdir):
docx_filename = os.path.join(tmpdir.dirname, "mock_document.docx")

View File

@ -1 +1 @@
__version__ = "0.7.3-dev0" # pragma: no cover
__version__ = "0.7.3-dev1" # pragma: no cover

View File

@ -15,6 +15,7 @@ from unstructured.documents.elements import (
PageBreak,
Text,
)
from unstructured.logger import logger
from unstructured.nlp.patterns import ENUMERATED_BULLETS_RE, UNICODE_BULLETS_RE
@ -139,18 +140,22 @@ def convert_office_doc(input_filename: str, output_directory: str, target_format
# users who do not have LibreOffice installed
# ref: https://stackoverflow.com/questions/38468442/
# multiple-doc-to-docx-file-conversion-using-python
command = [
"soffice",
"--headless",
"--convert-to",
target_format,
"--outdir",
output_directory,
input_filename,
]
try:
subprocess.call(
[
"soffice",
"--headless",
"--convert-to",
target_format,
"--outdir",
output_directory,
input_filename,
],
process = subprocess.Popen(
command,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
)
output, error = process.communicate()
except FileNotFoundError:
raise FileNotFoundError(
"""soffice command was not found. Please install libreoffice
@ -161,6 +166,10 @@ on your system and try again.
- Debian: https://wiki.debian.org/LibreOffice""",
)
logger.info(output.decode().strip())
if error:
logger.error(error.decode().strip())
def exactly_one(**kwargs) -> None:
"""