fix: enable passing filters to partition_doc for libreoffice conversion (#934)

* add optional filter to docx conversion

* add filters to tests

* changelog and version

* update filter for power point
This commit is contained in:
Matt Robinson 2023-07-17 13:54:44 -04:00 committed by GitHub
parent 067eb5701f
commit 0d332743eb
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 66 additions and 9 deletions

View File

@ -1,4 +1,4 @@
## 0.8.2-dev1
## 0.8.2-dev2
### Enhancements
@ -6,6 +6,7 @@
### Fixes
* Enables filters to be passed to `partition_doc` so it doesn't error with LibreOffice7.
* Adds Onedrive connector.
## 0.8.2-dev0
@ -18,7 +19,7 @@
### Fixes
* Add functionality to write images to computer storage temporarily instead of keeping them in memory for `ocr_only` strategy
* Add functionality to write images to computer storage temporarily instead of keeping them in memory for `ocr_only` strategy
* Add functionality to convert a PDF in small chunks of pages at a time for `ocr_only` strategy
* Adds `.txt`, `.text`, and `.tab` to list of extensions to check if file
has a `text/plain` MIME type.

View File

@ -99,14 +99,29 @@ def test_partition_raises_with_missing_doc(mock_document, expected_elements, tmp
partition_doc(filename=doc_filename)
def test_partition_doc_from_file(mock_document, expected_elements, tmpdir, capsys):
def test_partition_doc_from_file_with_filter(mock_document, expected_elements, tmpdir, capsys):
docx_filename = os.path.join(tmpdir.dirname, "mock_document.docx")
doc_filename = os.path.join(tmpdir.dirname, "mock_document.doc")
mock_document.save(docx_filename)
convert_office_doc(docx_filename, tmpdir.dirname, "doc")
with open(doc_filename, "rb") as f:
elements = partition_doc(file=f)
elements = partition_doc(file=f, libre_office_filter="MS Word 2007 XML")
assert elements == expected_elements
assert capsys.readouterr().out == ""
assert capsys.readouterr().err == ""
for element in elements:
assert element.metadata.filename is None
def test_partition_doc_from_file_with_no_filter(mock_document, expected_elements, tmpdir, capsys):
    """Partition a .doc file object with the LibreOffice filter explicitly disabled.

    The mock document is saved as .docx, converted to .doc with
    ``convert_office_doc``, and then partitioned from an open binary file
    handle with ``libre_office_filter=None``. The resulting elements must
    equal ``expected_elements`` and nothing may have been written to the
    captured stdout or stderr.
    """
    docx_filename = os.path.join(tmpdir.dirname, "mock_document.docx")
    doc_filename = os.path.join(tmpdir.dirname, "mock_document.doc")
    mock_document.save(docx_filename)
    convert_office_doc(docx_filename, tmpdir.dirname, "doc")

    with open(doc_filename, "rb") as f:
        elements = partition_doc(file=f, libre_office_filter=None)
    assert elements == expected_elements

    # readouterr() resets the capture buffers on every call, so a second call
    # would always report an empty stderr. Read once and check both streams.
    captured = capsys.readouterr()
    assert captured.out == ""
    assert captured.err == ""

View File

@ -1 +1 @@
__version__ = "0.8.2-dev1" # pragma: no cover
__version__ = "0.8.2-dev2" # pragma: no cover

View File

@ -152,8 +152,34 @@ def _remove_element_metadata(
return elements
def convert_office_doc(input_filename: str, output_directory: str, target_format: str):
"""Converts a .doc file to a .docx file using the libreoffice CLI."""
def convert_office_doc(
input_filename: str,
output_directory: str,
target_format: str = "docx",
target_filter: Optional[str] = None,
):
"""Converts a .doc file to a .docx file using the libreoffice CLI.
Parameters
----------
input_filename: str
The name of the .doc file to convert to .docx
output_directory: str
The output directory for the converted .docx file
target_format: str
The desired output format
target_filter: str
The output filter name to use when converting. See references below
for details.
References
----------
https://stackoverflow.com/questions/52277264/convert-doc-to-docx-using-soffice-not-working
https://git.libreoffice.org/core/+/refs/heads/master/filter/source/config/fragments/filters
"""
if target_filter is not None:
target_format = f"{target_format}:{target_filter}"
# NOTE(robinson) - In the future can also include win32com client as a fallback for windows
# users who do not have LibreOffice installed
# ref: https://stackoverflow.com/questions/38468442/

View File

@ -16,6 +16,7 @@ def partition_doc(
include_page_breaks: bool = True,
include_metadata: bool = True,
metadata_filename: Optional[str] = None,
libre_office_filter: Optional[str] = "MS Word 2007 XML",
**kwargs,
) -> List[Element]:
"""Partitions Microsoft Word Documents in .doc format into its document elements.
@ -26,6 +27,10 @@ def partition_doc(
A string defining the target filename path.
file
A file-like object using "rb" mode --> open(filename, "rb").
libre_office_filter
The filter to use when converting to .docx. The default is the
filter that is required when using LibreOffice7. Pass in None
if you do not want to apply any filter.
"""
# Verify that only one of the arguments was provided
if filename is None:
@ -46,7 +51,12 @@ def partition_doc(
base_filename, _ = os.path.splitext(filename_no_path)
with tempfile.TemporaryDirectory() as tmpdir:
convert_office_doc(filename, tmpdir, target_format="docx")
convert_office_doc(
filename,
tmpdir,
target_format="docx",
target_filter=libre_office_filter,
)
docx_filename = os.path.join(tmpdir, f"{base_filename}.docx")
elements = partition_docx(
filename=docx_filename,

View File

@ -49,7 +49,12 @@ def partition_ppt(
base_filename, _ = os.path.splitext(filename_no_path)
with tempfile.TemporaryDirectory() as tmpdir:
convert_office_doc(filename, tmpdir, target_format="pptx")
convert_office_doc(
filename,
tmpdir,
target_format="pptx",
target_filter="Impress MS PowerPoint 2007 XML",
)
pptx_filename = os.path.join(tmpdir, f"{base_filename}.pptx")
elements = partition_pptx(filename=pptx_filename, metadata_filename=metadata_filename)