mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-11-11 07:57:21 +00:00
fix: suppress file conversion warnings in convert_office_doc (#703)
* test that output is suppressed * add test for error output * changelog and version
This commit is contained in:
parent
559a5578ba
commit
c1ba090c34
@ -1,4 +1,4 @@
|
|||||||
## 0.7.3-dev0
|
## 0.7.3-dev1
|
||||||
|
|
||||||
### Enhancements
|
### Enhancements
|
||||||
|
|
||||||
@ -8,6 +8,7 @@
|
|||||||
|
|
||||||
### Fixes
|
### Fixes
|
||||||
|
|
||||||
|
* `convert_office_doc` no longer prints file conversion info messages to stdout.
|
||||||
* `partition_via_api` reflects the actual filetype for the file processed in the API.
|
* `partition_via_api` reflects the actual filetype for the file processed in the API.
|
||||||
|
|
||||||
## 0.7.2
|
## 0.7.2
|
||||||
|
|||||||
@ -147,3 +147,19 @@ def test_normalize_layout_element_bulleted_list():
|
|||||||
ListItem(text="You're cool too.", coordinates=((1, 2), (1, 4), (3, 4), (3, 2))),
|
ListItem(text="You're cool too.", coordinates=((1, 2), (1, 4), (3, 4), (3, 2))),
|
||||||
ListItem(text="We're all cool!", coordinates=((1, 2), (1, 4), (3, 4), (3, 2))),
|
ListItem(text="We're all cool!", coordinates=((1, 2), (1, 4), (3, 4), (3, 2))),
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
class MockPopenWithError:
|
||||||
|
def __init__(self, *args, **kwargs):
|
||||||
|
pass
|
||||||
|
|
||||||
|
def communicate(self):
|
||||||
|
return b"", b"an error occurred"
|
||||||
|
|
||||||
|
|
||||||
|
def test_convert_office_doc_captures_errors(monkeypatch, caplog):
|
||||||
|
import subprocess
|
||||||
|
|
||||||
|
monkeypatch.setattr(subprocess, "Popen", MockPopenWithError)
|
||||||
|
common.convert_office_doc("no-real.docx", "fake-directory", target_format="docx")
|
||||||
|
assert "an error occurred" in caplog.text
|
||||||
|
|||||||
@ -56,7 +56,7 @@ def expected_elements():
|
|||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
def test_partition_doc_with_filename(mock_document, expected_elements, tmpdir):
|
def test_partition_doc_with_filename(mock_document, expected_elements, tmpdir, capsys):
|
||||||
docx_filename = os.path.join(tmpdir.dirname, "mock_document.docx")
|
docx_filename = os.path.join(tmpdir.dirname, "mock_document.docx")
|
||||||
doc_filename = os.path.join(tmpdir.dirname, "mock_document.doc")
|
doc_filename = os.path.join(tmpdir.dirname, "mock_document.doc")
|
||||||
mock_document.save(docx_filename)
|
mock_document.save(docx_filename)
|
||||||
@ -67,6 +67,9 @@ def test_partition_doc_with_filename(mock_document, expected_elements, tmpdir):
|
|||||||
assert elements[0].metadata.filename == "mock_document.doc"
|
assert elements[0].metadata.filename == "mock_document.doc"
|
||||||
assert elements[0].metadata.file_directory == tmpdir.dirname
|
assert elements[0].metadata.file_directory == tmpdir.dirname
|
||||||
|
|
||||||
|
assert capsys.readouterr().out == ""
|
||||||
|
assert capsys.readouterr().err == ""
|
||||||
|
|
||||||
|
|
||||||
def test_partition_doc_matches_partition_docx(mock_document, expected_elements, tmpdir):
|
def test_partition_doc_matches_partition_docx(mock_document, expected_elements, tmpdir):
|
||||||
docx_filename = os.path.join(tmpdir.dirname, "mock_document.docx")
|
docx_filename = os.path.join(tmpdir.dirname, "mock_document.docx")
|
||||||
@ -84,7 +87,7 @@ def test_partition_raises_with_missing_doc(mock_document, expected_elements, tmp
|
|||||||
partition_doc(filename=doc_filename)
|
partition_doc(filename=doc_filename)
|
||||||
|
|
||||||
|
|
||||||
def test_partition_doc_with_file(mock_document, expected_elements, tmpdir):
|
def test_partition_doc_with_file(mock_document, expected_elements, tmpdir, capsys):
|
||||||
docx_filename = os.path.join(tmpdir.dirname, "mock_document.docx")
|
docx_filename = os.path.join(tmpdir.dirname, "mock_document.docx")
|
||||||
doc_filename = os.path.join(tmpdir.dirname, "mock_document.doc")
|
doc_filename = os.path.join(tmpdir.dirname, "mock_document.doc")
|
||||||
mock_document.save(docx_filename)
|
mock_document.save(docx_filename)
|
||||||
@ -94,6 +97,9 @@ def test_partition_doc_with_file(mock_document, expected_elements, tmpdir):
|
|||||||
elements = partition_doc(file=f)
|
elements = partition_doc(file=f)
|
||||||
assert elements == expected_elements
|
assert elements == expected_elements
|
||||||
|
|
||||||
|
assert capsys.readouterr().out == ""
|
||||||
|
assert capsys.readouterr().err == ""
|
||||||
|
|
||||||
|
|
||||||
def test_partition_doc_raises_with_both_specified(mock_document, tmpdir):
|
def test_partition_doc_raises_with_both_specified(mock_document, tmpdir):
|
||||||
docx_filename = os.path.join(tmpdir.dirname, "mock_document.docx")
|
docx_filename = os.path.join(tmpdir.dirname, "mock_document.docx")
|
||||||
|
|||||||
@ -1 +1 @@
|
|||||||
__version__ = "0.7.3-dev0" # pragma: no cover
|
__version__ = "0.7.3-dev1" # pragma: no cover
|
||||||
|
|||||||
@ -15,6 +15,7 @@ from unstructured.documents.elements import (
|
|||||||
PageBreak,
|
PageBreak,
|
||||||
Text,
|
Text,
|
||||||
)
|
)
|
||||||
|
from unstructured.logger import logger
|
||||||
from unstructured.nlp.patterns import ENUMERATED_BULLETS_RE, UNICODE_BULLETS_RE
|
from unstructured.nlp.patterns import ENUMERATED_BULLETS_RE, UNICODE_BULLETS_RE
|
||||||
|
|
||||||
|
|
||||||
@ -139,9 +140,7 @@ def convert_office_doc(input_filename: str, output_directory: str, target_format
|
|||||||
# users who do not have LibreOffice installed
|
# users who do not have LibreOffice installed
|
||||||
# ref: https://stackoverflow.com/questions/38468442/
|
# ref: https://stackoverflow.com/questions/38468442/
|
||||||
# multiple-doc-to-docx-file-conversion-using-python
|
# multiple-doc-to-docx-file-conversion-using-python
|
||||||
try:
|
command = [
|
||||||
subprocess.call(
|
|
||||||
[
|
|
||||||
"soffice",
|
"soffice",
|
||||||
"--headless",
|
"--headless",
|
||||||
"--convert-to",
|
"--convert-to",
|
||||||
@ -149,8 +148,14 @@ def convert_office_doc(input_filename: str, output_directory: str, target_format
|
|||||||
"--outdir",
|
"--outdir",
|
||||||
output_directory,
|
output_directory,
|
||||||
input_filename,
|
input_filename,
|
||||||
],
|
]
|
||||||
|
try:
|
||||||
|
process = subprocess.Popen(
|
||||||
|
command,
|
||||||
|
stdout=subprocess.PIPE,
|
||||||
|
stderr=subprocess.PIPE,
|
||||||
)
|
)
|
||||||
|
output, error = process.communicate()
|
||||||
except FileNotFoundError:
|
except FileNotFoundError:
|
||||||
raise FileNotFoundError(
|
raise FileNotFoundError(
|
||||||
"""soffice command was not found. Please install libreoffice
|
"""soffice command was not found. Please install libreoffice
|
||||||
@ -161,6 +166,10 @@ on your system and try again.
|
|||||||
- Debian: https://wiki.debian.org/LibreOffice""",
|
- Debian: https://wiki.debian.org/LibreOffice""",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
logger.info(output.decode().strip())
|
||||||
|
if error:
|
||||||
|
logger.error(error.decode().strip())
|
||||||
|
|
||||||
|
|
||||||
def exactly_one(**kwargs) -> None:
|
def exactly_one(**kwargs) -> None:
|
||||||
"""
|
"""
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user