mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-11-10 15:37:58 +00:00
fix: enable passing filters to partition_doc for libreoffice conversion (#934)
* add optional filter to docx conversion * add filters to tests * changelog and version * update filter for power point
This commit is contained in:
parent
067eb5701f
commit
0d332743eb
@ -1,4 +1,4 @@
|
|||||||
## 0.8.2-dev1
|
## 0.8.2-dev2
|
||||||
|
|
||||||
### Enhancements
|
### Enhancements
|
||||||
|
|
||||||
@ -6,6 +6,7 @@
|
|||||||
|
|
||||||
### Fixes
|
### Fixes
|
||||||
|
|
||||||
|
* Enables filters to be passed to `partition_doc` so it doesn't error with LibreOffice7.
|
||||||
* Adds Onedrive connector.
|
* Adds Onedrive connector.
|
||||||
|
|
||||||
## 0.8.2-dev0
|
## 0.8.2-dev0
|
||||||
|
|||||||
@ -99,14 +99,29 @@ def test_partition_raises_with_missing_doc(mock_document, expected_elements, tmp
|
|||||||
partition_doc(filename=doc_filename)
|
partition_doc(filename=doc_filename)
|
||||||
|
|
||||||
|
|
||||||
def test_partition_doc_from_file(mock_document, expected_elements, tmpdir, capsys):
|
def test_partition_doc_from_file_with_filter(mock_document, expected_elements, tmpdir, capsys):
|
||||||
docx_filename = os.path.join(tmpdir.dirname, "mock_document.docx")
|
docx_filename = os.path.join(tmpdir.dirname, "mock_document.docx")
|
||||||
doc_filename = os.path.join(tmpdir.dirname, "mock_document.doc")
|
doc_filename = os.path.join(tmpdir.dirname, "mock_document.doc")
|
||||||
mock_document.save(docx_filename)
|
mock_document.save(docx_filename)
|
||||||
convert_office_doc(docx_filename, tmpdir.dirname, "doc")
|
convert_office_doc(docx_filename, tmpdir.dirname, "doc")
|
||||||
|
|
||||||
with open(doc_filename, "rb") as f:
|
with open(doc_filename, "rb") as f:
|
||||||
elements = partition_doc(file=f)
|
elements = partition_doc(file=f, libre_office_filter="MS Word 2007 XML")
|
||||||
|
assert elements == expected_elements
|
||||||
|
assert capsys.readouterr().out == ""
|
||||||
|
assert capsys.readouterr().err == ""
|
||||||
|
for element in elements:
|
||||||
|
assert element.metadata.filename is None
|
||||||
|
|
||||||
|
|
||||||
|
def test_partition_doc_from_file_with_no_filter(mock_document, expected_elements, tmpdir, capsys):
|
||||||
|
docx_filename = os.path.join(tmpdir.dirname, "mock_document.docx")
|
||||||
|
doc_filename = os.path.join(tmpdir.dirname, "mock_document.doc")
|
||||||
|
mock_document.save(docx_filename)
|
||||||
|
convert_office_doc(docx_filename, tmpdir.dirname, "doc")
|
||||||
|
|
||||||
|
with open(doc_filename, "rb") as f:
|
||||||
|
elements = partition_doc(file=f, libre_office_filter=None)
|
||||||
assert elements == expected_elements
|
assert elements == expected_elements
|
||||||
assert capsys.readouterr().out == ""
|
assert capsys.readouterr().out == ""
|
||||||
assert capsys.readouterr().err == ""
|
assert capsys.readouterr().err == ""
|
||||||
|
|||||||
@ -1 +1 @@
|
|||||||
__version__ = "0.8.2-dev1" # pragma: no cover
|
__version__ = "0.8.2-dev2" # pragma: no cover
|
||||||
|
|||||||
@ -152,8 +152,34 @@ def _remove_element_metadata(
|
|||||||
return elements
|
return elements
|
||||||
|
|
||||||
|
|
||||||
def convert_office_doc(input_filename: str, output_directory: str, target_format: str):
|
def convert_office_doc(
|
||||||
"""Converts a .doc file to a .docx file using the libreoffice CLI."""
|
input_filename: str,
|
||||||
|
output_directory: str,
|
||||||
|
target_format: str = "docx",
|
||||||
|
target_filter: Optional[str] = None,
|
||||||
|
):
|
||||||
|
"""Converts a .doc file to a .docx file using the libreoffice CLI.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
input_filename: str
|
||||||
|
The name of the .doc file to convert to .docx
|
||||||
|
output_directory: str
|
||||||
|
The output directory for the converted .docx file
|
||||||
|
target_format: str
|
||||||
|
The desired output format
|
||||||
|
target_filter: Optional[str]
|
||||||
|
The output filter name to use when converting. See references below
|
||||||
|
for details.
|
||||||
|
|
||||||
|
References
|
||||||
|
----------
|
||||||
|
https://stackoverflow.com/questions/52277264/convert-doc-to-docx-using-soffice-not-working
|
||||||
|
https://git.libreoffice.org/core/+/refs/heads/master/filter/source/config/fragments/filters
|
||||||
|
|
||||||
|
"""
|
||||||
|
if target_filter is not None:
|
||||||
|
target_format = f"{target_format}:{target_filter}"
|
||||||
# NOTE(robinson) - In the future can also include win32com client as a fallback for windows
|
# NOTE(robinson) - In the future can also include win32com client as a fallback for windows
|
||||||
# users who do not have LibreOffice installed
|
# users who do not have LibreOffice installed
|
||||||
# ref: https://stackoverflow.com/questions/38468442/
|
# ref: https://stackoverflow.com/questions/38468442/
|
||||||
|
|||||||
@ -16,6 +16,7 @@ def partition_doc(
|
|||||||
include_page_breaks: bool = True,
|
include_page_breaks: bool = True,
|
||||||
include_metadata: bool = True,
|
include_metadata: bool = True,
|
||||||
metadata_filename: Optional[str] = None,
|
metadata_filename: Optional[str] = None,
|
||||||
|
libre_office_filter: Optional[str] = "MS Word 2007 XML",
|
||||||
**kwargs,
|
**kwargs,
|
||||||
) -> List[Element]:
|
) -> List[Element]:
|
||||||
"""Partitions Microsoft Word Documents in .doc format into its document elements.
|
"""Partitions Microsoft Word Documents in .doc format into its document elements.
|
||||||
@ -26,6 +27,10 @@ def partition_doc(
|
|||||||
A string defining the target filename path.
|
A string defining the target filename path.
|
||||||
file
|
file
|
||||||
A file-like object using "rb" mode --> open(filename, "rb").
|
A file-like object using "rb" mode --> open(filename, "rb").
|
||||||
|
libre_office_filter
|
||||||
|
The filter to use when converting to .docx. The default is the
|
||||||
|
filter that is required when using LibreOffice7. Pass in None
|
||||||
|
if you do not want to apply any filter.
|
||||||
"""
|
"""
|
||||||
# Verify that only one of the arguments was provided
|
# Verify that only one of the arguments was provided
|
||||||
if filename is None:
|
if filename is None:
|
||||||
@ -46,7 +51,12 @@ def partition_doc(
|
|||||||
base_filename, _ = os.path.splitext(filename_no_path)
|
base_filename, _ = os.path.splitext(filename_no_path)
|
||||||
|
|
||||||
with tempfile.TemporaryDirectory() as tmpdir:
|
with tempfile.TemporaryDirectory() as tmpdir:
|
||||||
convert_office_doc(filename, tmpdir, target_format="docx")
|
convert_office_doc(
|
||||||
|
filename,
|
||||||
|
tmpdir,
|
||||||
|
target_format="docx",
|
||||||
|
target_filter=libre_office_filter,
|
||||||
|
)
|
||||||
docx_filename = os.path.join(tmpdir, f"{base_filename}.docx")
|
docx_filename = os.path.join(tmpdir, f"{base_filename}.docx")
|
||||||
elements = partition_docx(
|
elements = partition_docx(
|
||||||
filename=docx_filename,
|
filename=docx_filename,
|
||||||
|
|||||||
@ -49,7 +49,12 @@ def partition_ppt(
|
|||||||
base_filename, _ = os.path.splitext(filename_no_path)
|
base_filename, _ = os.path.splitext(filename_no_path)
|
||||||
|
|
||||||
with tempfile.TemporaryDirectory() as tmpdir:
|
with tempfile.TemporaryDirectory() as tmpdir:
|
||||||
convert_office_doc(filename, tmpdir, target_format="pptx")
|
convert_office_doc(
|
||||||
|
filename,
|
||||||
|
tmpdir,
|
||||||
|
target_format="pptx",
|
||||||
|
target_filter="Impress MS PowerPoint 2007 XML",
|
||||||
|
)
|
||||||
pptx_filename = os.path.join(tmpdir, f"{base_filename}.pptx")
|
pptx_filename = os.path.join(tmpdir, f"{base_filename}.pptx")
|
||||||
elements = partition_pptx(filename=pptx_filename, metadata_filename=metadata_filename)
|
elements = partition_pptx(filename=pptx_filename, metadata_filename=metadata_filename)
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user