mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-11-02 02:53:31 +00:00
fix: suppress file conversion warnings in convert_office_doc (#703)
* test that output is suppressed * add test for error output * changelog and version
This commit is contained in:
parent
559a5578ba
commit
c1ba090c34
@ -1,4 +1,4 @@
|
||||
## 0.7.3-dev0
|
||||
## 0.7.3-dev1
|
||||
|
||||
### Enhancements
|
||||
|
||||
@ -8,6 +8,7 @@
|
||||
|
||||
### Fixes
|
||||
|
||||
* `convert_office_doc` no longer prints file conversion info messages to stdout.
|
||||
* `partition_via_api` reflects the actual filetype for the file processed in the API.
|
||||
|
||||
## 0.7.2
|
||||
|
||||
@ -147,3 +147,19 @@ def test_normalize_layout_element_bulleted_list():
|
||||
ListItem(text="You're cool too.", coordinates=((1, 2), (1, 4), (3, 4), (3, 2))),
|
||||
ListItem(text="We're all cool!", coordinates=((1, 2), (1, 4), (3, 4), (3, 2))),
|
||||
]
|
||||
|
||||
|
||||
class MockPopenWithError:
    """Stand-in for ``subprocess.Popen`` whose process reports an error.

    The constructor accepts and ignores any arguments, so the class can be
    swapped in for ``subprocess.Popen`` without launching a real process.
    """

    def __init__(self, *args, **kwargs):
        pass

    def communicate(self, input=None, timeout=None):
        """Return empty stdout bytes and a fixed error message as stderr bytes.

        ``input`` and ``timeout`` mirror the signature of the real
        ``Popen.communicate`` and are ignored.
        """
        return b"", b"an error occurred"
|
||||
|
||||
|
||||
def test_convert_office_doc_captures_errors(monkeypatch, caplog):
    """Check that stderr from the conversion subprocess shows up in the log."""
    import subprocess

    # Swap in the mock so that no real soffice process is started.
    monkeypatch.setattr(subprocess, "Popen", MockPopenWithError)
    common.convert_office_doc("no-real.docx", "fake-directory", target_format="docx")
    assert "an error occurred" in caplog.text
|
||||
|
||||
@ -56,7 +56,7 @@ def expected_elements():
|
||||
]
|
||||
|
||||
|
||||
def test_partition_doc_with_filename(mock_document, expected_elements, tmpdir):
|
||||
def test_partition_doc_with_filename(mock_document, expected_elements, tmpdir, capsys):
|
||||
docx_filename = os.path.join(tmpdir.dirname, "mock_document.docx")
|
||||
doc_filename = os.path.join(tmpdir.dirname, "mock_document.doc")
|
||||
mock_document.save(docx_filename)
|
||||
@ -67,6 +67,9 @@ def test_partition_doc_with_filename(mock_document, expected_elements, tmpdir):
|
||||
assert elements[0].metadata.filename == "mock_document.doc"
|
||||
assert elements[0].metadata.file_directory == tmpdir.dirname
|
||||
|
||||
assert capsys.readouterr().out == ""
|
||||
assert capsys.readouterr().err == ""
|
||||
|
||||
|
||||
def test_partition_doc_matches_partition_docx(mock_document, expected_elements, tmpdir):
|
||||
docx_filename = os.path.join(tmpdir.dirname, "mock_document.docx")
|
||||
@ -84,7 +87,7 @@ def test_partition_raises_with_missing_doc(mock_document, expected_elements, tmp
|
||||
partition_doc(filename=doc_filename)
|
||||
|
||||
|
||||
def test_partition_doc_with_file(mock_document, expected_elements, tmpdir):
|
||||
def test_partition_doc_with_file(mock_document, expected_elements, tmpdir, capsys):
|
||||
docx_filename = os.path.join(tmpdir.dirname, "mock_document.docx")
|
||||
doc_filename = os.path.join(tmpdir.dirname, "mock_document.doc")
|
||||
mock_document.save(docx_filename)
|
||||
@ -94,6 +97,9 @@ def test_partition_doc_with_file(mock_document, expected_elements, tmpdir):
|
||||
elements = partition_doc(file=f)
|
||||
assert elements == expected_elements
|
||||
|
||||
assert capsys.readouterr().out == ""
|
||||
assert capsys.readouterr().err == ""
|
||||
|
||||
|
||||
def test_partition_doc_raises_with_both_specified(mock_document, tmpdir):
|
||||
docx_filename = os.path.join(tmpdir.dirname, "mock_document.docx")
|
||||
|
||||
@ -1 +1 @@
|
||||
__version__ = "0.7.3-dev0" # pragma: no cover
|
||||
__version__ = "0.7.3-dev1" # pragma: no cover
|
||||
|
||||
@ -15,6 +15,7 @@ from unstructured.documents.elements import (
|
||||
PageBreak,
|
||||
Text,
|
||||
)
|
||||
from unstructured.logger import logger
|
||||
from unstructured.nlp.patterns import ENUMERATED_BULLETS_RE, UNICODE_BULLETS_RE
|
||||
|
||||
|
||||
@ -139,18 +140,22 @@ def convert_office_doc(input_filename: str, output_directory: str, target_format
|
||||
# users who do not have LibreOffice installed
|
||||
# ref: https://stackoverflow.com/questions/38468442/
|
||||
# multiple-doc-to-docx-file-conversion-using-python
|
||||
command = [
|
||||
"soffice",
|
||||
"--headless",
|
||||
"--convert-to",
|
||||
target_format,
|
||||
"--outdir",
|
||||
output_directory,
|
||||
input_filename,
|
||||
]
|
||||
try:
|
||||
subprocess.call(
|
||||
[
|
||||
"soffice",
|
||||
"--headless",
|
||||
"--convert-to",
|
||||
target_format,
|
||||
"--outdir",
|
||||
output_directory,
|
||||
input_filename,
|
||||
],
|
||||
process = subprocess.Popen(
|
||||
command,
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE,
|
||||
)
|
||||
output, error = process.communicate()
|
||||
except FileNotFoundError:
|
||||
raise FileNotFoundError(
|
||||
"""soffice command was not found. Please install libreoffice
|
||||
@ -161,6 +166,10 @@ on your system and try again.
|
||||
- Debian: https://wiki.debian.org/LibreOffice""",
|
||||
)
|
||||
|
||||
logger.info(output.decode().strip())
|
||||
if error:
|
||||
logger.error(error.decode().strip())
|
||||
|
||||
|
||||
def exactly_one(**kwargs) -> None:
|
||||
"""
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user