mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-11-02 02:53:31 +00:00
fix: enable passing filters to partition_doc for libreoffice conversion (#934)
* add optional filter to docx conversion * add filters to tests * changelog and version * update filter for power point
This commit is contained in:
parent
067eb5701f
commit
0d332743eb
@ -1,4 +1,4 @@
|
||||
## 0.8.2-dev1
|
||||
## 0.8.2-dev2
|
||||
|
||||
### Enhancements
|
||||
|
||||
@ -6,6 +6,7 @@
|
||||
|
||||
### Fixes
|
||||
|
||||
* Enables filters to be passed to `partition_doc` so it doesn't error with LibreOffice7.
|
||||
* Adds OneDrive connector.
|
||||
|
||||
## 0.8.2-dev0
|
||||
@ -18,7 +19,7 @@
|
||||
|
||||
### Fixes
|
||||
|
||||
* Add functionality to write images to computer storage temporarily instead of keeping them in memory for `ocr_only` strategy
|
||||
* Add functionality to write images to computer storage temporarily instead of keeping them in memory for `ocr_only` strategy
|
||||
* Add functionality to convert a PDF in small chunks of pages at a time for `ocr_only` strategy
|
||||
* Adds `.txt`, `.text`, and `.tab` to list of extensions to check if file
|
||||
has a `text/plain` MIME type.
|
||||
|
||||
@ -99,14 +99,29 @@ def test_partition_raises_with_missing_doc(mock_document, expected_elements, tmp
|
||||
partition_doc(filename=doc_filename)
|
||||
|
||||
|
||||
def test_partition_doc_from_file(mock_document, expected_elements, tmpdir, capsys):
|
||||
def test_partition_doc_from_file_with_filter(mock_document, expected_elements, tmpdir, capsys):
|
||||
docx_filename = os.path.join(tmpdir.dirname, "mock_document.docx")
|
||||
doc_filename = os.path.join(tmpdir.dirname, "mock_document.doc")
|
||||
mock_document.save(docx_filename)
|
||||
convert_office_doc(docx_filename, tmpdir.dirname, "doc")
|
||||
|
||||
with open(doc_filename, "rb") as f:
|
||||
elements = partition_doc(file=f)
|
||||
elements = partition_doc(file=f, libre_office_filter="MS Word 2007 XML")
|
||||
assert elements == expected_elements
|
||||
assert capsys.readouterr().out == ""
|
||||
assert capsys.readouterr().err == ""
|
||||
for element in elements:
|
||||
assert element.metadata.filename is None
|
||||
|
||||
|
||||
def test_partition_doc_from_file_with_no_filter(mock_document, expected_elements, tmpdir, capsys):
|
||||
docx_filename = os.path.join(tmpdir.dirname, "mock_document.docx")
|
||||
doc_filename = os.path.join(tmpdir.dirname, "mock_document.doc")
|
||||
mock_document.save(docx_filename)
|
||||
convert_office_doc(docx_filename, tmpdir.dirname, "doc")
|
||||
|
||||
with open(doc_filename, "rb") as f:
|
||||
elements = partition_doc(file=f, libre_office_filter=None)
|
||||
assert elements == expected_elements
|
||||
assert capsys.readouterr().out == ""
|
||||
assert capsys.readouterr().err == ""
|
||||
|
||||
@ -1 +1 @@
|
||||
__version__ = "0.8.2-dev1" # pragma: no cover
|
||||
__version__ = "0.8.2-dev2" # pragma: no cover
|
||||
|
||||
@ -152,8 +152,34 @@ def _remove_element_metadata(
|
||||
return elements
|
||||
|
||||
|
||||
def convert_office_doc(input_filename: str, output_directory: str, target_format: str):
|
||||
"""Converts a .doc file to a .docx file using the libreoffice CLI."""
|
||||
def convert_office_doc(
|
||||
input_filename: str,
|
||||
output_directory: str,
|
||||
target_format: str = "docx",
|
||||
target_filter: Optional[str] = None,
|
||||
):
|
||||
"""Converts a .doc file to a .docx file using the libreoffice CLI.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
input_filename: str
|
||||
The name of the .doc file to convert to .docx
|
||||
output_directory: str
|
||||
The output directory for the converted .docx file
|
||||
target_format: str
|
||||
The desired output format
|
||||
target_filter: str
|
||||
The output filter name to use when converting. See references below
|
||||
for details.
|
||||
|
||||
References
|
||||
----------
|
||||
https://stackoverflow.com/questions/52277264/convert-doc-to-docx-using-soffice-not-working
|
||||
https://git.libreoffice.org/core/+/refs/heads/master/filter/source/config/fragments/filters
|
||||
|
||||
"""
|
||||
if target_filter is not None:
|
||||
target_format = f"{target_format}:{target_filter}"
|
||||
# NOTE(robinson) - In the future can also include win32com client as a fallback for windows
|
||||
# users who do not have LibreOffice installed
|
||||
# ref: https://stackoverflow.com/questions/38468442/
|
||||
|
||||
@ -16,6 +16,7 @@ def partition_doc(
|
||||
include_page_breaks: bool = True,
|
||||
include_metadata: bool = True,
|
||||
metadata_filename: Optional[str] = None,
|
||||
libre_office_filter: Optional[str] = "MS Word 2007 XML",
|
||||
**kwargs,
|
||||
) -> List[Element]:
|
||||
"""Partitions Microsoft Word Documents in .doc format into its document elements.
|
||||
@ -26,6 +27,10 @@ def partition_doc(
|
||||
A string defining the target filename path.
|
||||
file
|
||||
A file-like object using "rb" mode --> open(filename, "rb").
|
||||
libre_office_filter
|
||||
The filter to use when converting to .docx. The default is the
|
||||
filter that is required when using LibreOffice7. Pass in None
|
||||
if you do not want to apply any filter.
|
||||
"""
|
||||
# Verify that only one of the arguments was provided
|
||||
if filename is None:
|
||||
@ -46,7 +51,12 @@ def partition_doc(
|
||||
base_filename, _ = os.path.splitext(filename_no_path)
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
convert_office_doc(filename, tmpdir, target_format="docx")
|
||||
convert_office_doc(
|
||||
filename,
|
||||
tmpdir,
|
||||
target_format="docx",
|
||||
target_filter=libre_office_filter,
|
||||
)
|
||||
docx_filename = os.path.join(tmpdir, f"{base_filename}.docx")
|
||||
elements = partition_docx(
|
||||
filename=docx_filename,
|
||||
|
||||
@ -49,7 +49,12 @@ def partition_ppt(
|
||||
base_filename, _ = os.path.splitext(filename_no_path)
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
convert_office_doc(filename, tmpdir, target_format="pptx")
|
||||
convert_office_doc(
|
||||
filename,
|
||||
tmpdir,
|
||||
target_format="pptx",
|
||||
target_filter="Impress MS PowerPoint 2007 XML",
|
||||
)
|
||||
pptx_filename = os.path.join(tmpdir, f"{base_filename}.pptx")
|
||||
elements = partition_pptx(filename=pptx_filename, metadata_filename=metadata_filename)
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user