diff --git a/CHANGELOG.md b/CHANGELOG.md index cf8affa03..7e255bfe5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.8.2-dev1 +## 0.8.2-dev2 ### Enhancements @@ -6,6 +6,7 @@ ### Fixes +* Enables filters to be passed to `partition_doc` so it doesn't error with LibreOffice7. * Adds Onedrive connector. ## 0.8.2-dev0 @@ -18,7 +19,7 @@ ### Fixes -* Add functionality to write images to computer storage temporarily instead of keeping them in memory for `ocr_only` strategy +* Add functionality to write images to computer storage temporarily instead of keeping them in memory for `ocr_only` strategy * Add functionality to convert a PDF in small chunks of pages at a time for `ocr_only` strategy * Adds `.txt`, `.text`, and `.tab` to list of extensions to check if file has a `text/plain` MIME type. diff --git a/test_unstructured/partition/test_doc.py b/test_unstructured/partition/test_doc.py index 207973914..b71da3e03 100644 --- a/test_unstructured/partition/test_doc.py +++ b/test_unstructured/partition/test_doc.py @@ -99,14 +99,29 @@ def test_partition_raises_with_missing_doc(mock_document, expected_elements, tmp partition_doc(filename=doc_filename) -def test_partition_doc_from_file(mock_document, expected_elements, tmpdir, capsys): +def test_partition_doc_from_file_with_filter(mock_document, expected_elements, tmpdir, capsys): docx_filename = os.path.join(tmpdir.dirname, "mock_document.docx") doc_filename = os.path.join(tmpdir.dirname, "mock_document.doc") mock_document.save(docx_filename) convert_office_doc(docx_filename, tmpdir.dirname, "doc") with open(doc_filename, "rb") as f: - elements = partition_doc(file=f) + elements = partition_doc(file=f, libre_office_filter="MS Word 2007 XML") + assert elements == expected_elements + assert capsys.readouterr().out == "" + assert capsys.readouterr().err == "" + for element in elements: + assert element.metadata.filename is None + + +def test_partition_doc_from_file_with_no_filter(mock_document, expected_elements, 
tmpdir, capsys): + docx_filename = os.path.join(tmpdir.dirname, "mock_document.docx") + doc_filename = os.path.join(tmpdir.dirname, "mock_document.doc") + mock_document.save(docx_filename) + convert_office_doc(docx_filename, tmpdir.dirname, "doc") + + with open(doc_filename, "rb") as f: + elements = partition_doc(file=f, libre_office_filter=None) assert elements == expected_elements assert capsys.readouterr().out == "" assert capsys.readouterr().err == "" diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 840834c4e..0955fc049 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.8.2-dev1" # pragma: no cover +__version__ = "0.8.2-dev2" # pragma: no cover diff --git a/unstructured/partition/common.py b/unstructured/partition/common.py index a4f7b64c1..8e7b5f8ee 100644 --- a/unstructured/partition/common.py +++ b/unstructured/partition/common.py @@ -152,8 +152,34 @@ def _remove_element_metadata( return elements -def convert_office_doc(input_filename: str, output_directory: str, target_format: str): - """Converts a .doc file to a .docx file using the libreoffice CLI.""" +def convert_office_doc( + input_filename: str, + output_directory: str, + target_format: str = "docx", + target_filter: Optional[str] = None, +): + """Converts a .doc file to a .docx file using the libreoffice CLI. + + Parameters + ---------- + input_filename: str + The name of the .doc file to convert to .docx + output_directory: str + The output directory for the converted .docx file + target_format: str + The desired output format + target_filter: str + The output filter name to use when converting. See references below + for details. 
+ + References + ---------- + https://stackoverflow.com/questions/52277264/convert-doc-to-docx-using-soffice-not-working + https://git.libreoffice.org/core/+/refs/heads/master/filter/source/config/fragments/filters + + """ + if target_filter is not None: + target_format = f"{target_format}:{target_filter}" # NOTE(robinson) - In the future can also include win32com client as a fallback for windows # users who do not have LibreOffice installed # ref: https://stackoverflow.com/questions/38468442/ diff --git a/unstructured/partition/doc.py b/unstructured/partition/doc.py index df9501a8e..dc939c2ef 100644 --- a/unstructured/partition/doc.py +++ b/unstructured/partition/doc.py @@ -16,6 +16,7 @@ def partition_doc( include_page_breaks: bool = True, include_metadata: bool = True, metadata_filename: Optional[str] = None, + libre_office_filter: Optional[str] = "MS Word 2007 XML", **kwargs, ) -> List[Element]: """Partitions Microsoft Word Documents in .doc format into its document elements. @@ -26,6 +27,10 @@ def partition_doc( A string defining the target filename path. file A file-like object using "rb" mode --> open(filename, "rb"). + libre_office_filter + The filter to use when converting to .docx. The default is the + filter that is required when using LibreOffice7. Pass in None + if you do not want to apply any filter. 
""" # Verify that only one of the arguments was provided if filename is None: @@ -46,7 +51,12 @@ def partition_doc( base_filename, _ = os.path.splitext(filename_no_path) with tempfile.TemporaryDirectory() as tmpdir: - convert_office_doc(filename, tmpdir, target_format="docx") + convert_office_doc( + filename, + tmpdir, + target_format="docx", + target_filter=libre_office_filter, + ) docx_filename = os.path.join(tmpdir, f"{base_filename}.docx") elements = partition_docx( filename=docx_filename, diff --git a/unstructured/partition/ppt.py b/unstructured/partition/ppt.py index d98f1ce6d..307945a43 100644 --- a/unstructured/partition/ppt.py +++ b/unstructured/partition/ppt.py @@ -49,7 +49,12 @@ def partition_ppt( base_filename, _ = os.path.splitext(filename_no_path) with tempfile.TemporaryDirectory() as tmpdir: - convert_office_doc(filename, tmpdir, target_format="pptx") + convert_office_doc( + filename, + tmpdir, + target_format="pptx", + target_filter="Impress MS PowerPoint 2007 XML", + ) pptx_filename = os.path.join(tmpdir, f"{base_filename}.pptx") elements = partition_pptx(filename=pptx_filename, metadata_filename=metadata_filename)