fix(odt): fix disk-space leak in partition_odt() (#3037)

Remedy disk-space leak where `partition_odt()` would leave an on-disk copy of each `.odt` file passed as a file-like object. `partition_odt()` creates a temporary file in which it writes each source-document provided as a file-like object. This file is not deleted and disk consumption grows without bound. The `convert_and_partition_docx()` function used to convert ODT->DOCX uses `pandoc` (a command-line program) to do the conversion. Because this command-line program operates in a different memory space, the source file cannot be passed as an in-memory object and needs to be on the filesystem. When the ODT source-document is passed as a file-like object, it is written to disk so the conversion program has access to it. It is not deleted afterward. Fix this by writing the temporary source ODT file in a `TemporaryDirectory` and also use that location to write the conversion-target DOCX file. That directory is automatically removed when `partition_odt()` completes. While we're in there, improve the factoring of `partition_odt()`. - Extract `convert_and_partition_docx()` from `partition.docx` (used only by `partition_odt()`) to `_convert_odt_to_docx()` in `partition.odt` where it is used. Decouple file conversion from calling `partition_docx()` with the converted file as the `partition_docx()` call is `partition_odt()`'s natural responsibility. - Improve docstrings, typing, and comments. - All tests pass both before and after.
2025-12-12 23:51:47 +00:00 · 2024-05-16 13:04:10 -07:00 · 2024-05-16 13:04:10 -07:00 · 8644a3b09a
commit 8644a3b09a
parent 0de9215db4
5 changed files with 95 additions and 132 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -1,4 +1,4 @@
-## 0.13.8-dev14
+## 0.13.8-dev15

 ### Enhancements

@ -20,6 +20,7 @@
 * **Improve CSV delimeter detection.** `partition_csv()` would raise on CSV files with very long lines.
 * **Fix disk-space leak in `partition_doc()`.** Remove temporary file created but not removed when `file` argument is passed to `partition_doc()`.
 * **Fix possible `SyntaxError` or `SyntaxWarning` on regex patterns.** Change regex patterns to raw strings to avoid these warnings/errors in Python 3.11+.
+* **Fix disk-space leak in `partition_odt()`.** Remove temporary file created but not removed when `file` argument is passed to `partition_odt()`.

 ## 0.13.7

--- a/typings/pypandoc/init.pyi
+++ b/typings/pypandoc/init.pyi
@ -0,0 +1,5 @@
+import pathlib
+
+def convert_file(
+    source_file: str, to: str, format: str | None, outputfile: str | pathlib.Path | None
+) -> str: ...
--- a/unstructured/version.py
+++ b/unstructured/version.py
@ -1 +1 @@
-__version__ = "0.13.8-dev14"  # pragma: no cover
+__version__ = "0.13.8-dev15"  # pragma: no cover
--- a/unstructured/partition/docx.py
+++ b/unstructured/partition/docx.py
@ -5,9 +5,8 @@ from __future__ import annotations
 import html
 import io
 import itertools
-import os
 import tempfile
-from typing import IO, Any, Iterator, Optional, Type, cast
+from typing import IO, Any, Iterator, Optional, Type

 # -- CT_* stands for "complex-type", an XML element type in docx parlance --
 import docx
@ -45,7 +44,6 @@ from unstructured.documents.elements import (
 )
 from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
 from unstructured.partition.common import (
-    exactly_one,
    get_last_modified_date,
    get_last_modified_date_from_file,
 )
@ -58,114 +56,13 @@ from unstructured.partition.text_type import (
    is_us_city_state_zip,
 )
 from unstructured.partition.utils.constants import PartitionStrategy
-from unstructured.utils import (
-    dependency_exists,
-    is_temp_file_path,
-    lazyproperty,
-    requires_dependencies,
-)
-
-if dependency_exists("pypandoc"):
-    import pypandoc
+from unstructured.utils import is_temp_file_path, lazyproperty

 DETECTION_ORIGIN: str = "docx"
 BlockElement: TypeAlias = "CT_P | CT_Tbl"
 BlockItem: TypeAlias = "Paragraph | DocxTable"


-@requires_dependencies("pypandoc")
-def convert_and_partition_docx(
-    source_format: str,
-    filename: Optional[str] = None,
-    file: Optional[IO[bytes]] = None,
-    include_metadata: bool = True,
-    infer_table_structure: bool = True,
-    metadata_filename: Optional[str] = None,
-    metadata_last_modified: Optional[str] = None,
-    languages: Optional[list[str]] = ["auto"],
-    detect_language_per_element: bool = False,
-    starting_page_number: int = 1,
-) -> list[Element]:
-    """Converts a document to DOCX and then partitions it using partition_docx.
-
-    Works with any file format support by pandoc.
-
-    Parameters
-    ----------
-    source_format
-        The format of the source document, .e.g. odt
-    filename
-        A string defining the target filename path.
-    file
-        A file-like object using "rb" mode --> open(filename, "rb").
-    include_metadata
-        Determines whether or not metadata is included in the metadata attribute on the elements in
-        the output.
-    infer_table_structure
-        If True, any Table elements that are extracted will also have a metadata field
-        named "text_as_html" where the table's text content is rendered into an html string.
-        I.e., rows and cells are preserved.
-        Whether True or False, the "text" field is always present in any Table element
-        and is the text content of the table (no structure).
-    languages
-        User defined value for `metadata.languages` if provided. Otherwise language is detected
-        using naive Bayesian filter via `langdetect`. Multiple languages indicates text could be
-        in either language.
-        Additional Parameters:
-            detect_language_per_element
-                Detect language per element instead of at the document level.
-    starting_page_number
-        Indicates what page number should be assigned to the first page in the document.
-        This information will be reflected in elements' metadata and can be be especially
-        useful when partitioning a document that is part of a larger document.
-    """
-    exactly_one(filename=filename, file=file)
-
-    def validate_filename(filename: str) -> str:
-        """Return path to a file confirmed to exist on the filesystem."""
-        if not os.path.exists(filename):
-            raise ValueError(f"The file {filename} does not exist.")
-        return filename
-
-    def copy_to_tempfile(file: IO[bytes]) -> str:
-        """Return path to temporary copy of file to be converted."""
-        with tempfile.NamedTemporaryFile(delete=False) as tmp:
-            tmp.write(file.read())
-            return tmp.name
-
-    def extract_docx_filename(file_path: str) -> str:
-        """Return a filename like "foo.docx" from a path like "a/b/foo.odt" """
-        # -- a/b/foo.odt -> foo.odt --
-        filename = os.path.basename(file_path)
-        # -- foo.odt -> foo --
-        root_name, _ = os.path.splitext(filename)
-        # -- foo -> foo.docx --
-        return f"{root_name}.docx"
-
-    file_path = validate_filename(filename) if filename else copy_to_tempfile(cast(IO[bytes], file))
-
-    with tempfile.TemporaryDirectory() as tmpdir:
-        docx_path = os.path.join(tmpdir, extract_docx_filename(file_path))
-        pypandoc.convert_file(  # pyright: ignore
-            file_path,
-            "docx",
-            format=source_format,
-            outputfile=docx_path,
-        )
-        elements = partition_docx(
-            filename=docx_path,
-            metadata_filename=metadata_filename,
-            include_metadata=include_metadata,
-            infer_table_structure=infer_table_structure,
-            metadata_last_modified=metadata_last_modified,
-            languages=languages,
-            detect_language_per_element=detect_language_per_element,
-            starting_page_number=starting_page_number,
-        )
-
-    return elements
-
-
@process_metadata()
@add_metadata_with_filetype(FileType.DOCX)
@add_chunking_strategy
--- a/unstructured/partition/odt.py
+++ b/unstructured/partition/odt.py
@ -1,12 +1,19 @@
 from __future__ import annotations

-from typing import IO, Any, Optional
+import os
+import tempfile
+from typing import IO, Any, Optional, cast

 from unstructured.chunking import add_chunking_strategy
 from unstructured.documents.elements import Element, process_metadata
 from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
-from unstructured.partition.common import get_last_modified_date, get_last_modified_date_from_file
-from unstructured.partition.docx import convert_and_partition_docx
+from unstructured.partition.common import (
+    exactly_one,
+    get_last_modified_date,
+    get_last_modified_date_from_file,
+)
+from unstructured.partition.docx import partition_docx
+from unstructured.utils import requires_dependencies


@process_metadata()
@ -14,15 +21,14 @@ from unstructured.partition.docx import convert_and_partition_docx
@add_chunking_strategy
 def partition_odt(
    filename: Optional[str] = None,
+    *,
+    date_from_file_object: bool = False,
+    detect_language_per_element: bool = False,
    file: Optional[IO[bytes]] = None,
-    include_metadata: bool = True,
    infer_table_structure: bool = True,
+    languages: Optional[list[str]] = ["auto"],
    metadata_filename: Optional[str] = None,
    metadata_last_modified: Optional[str] = None,
-    chunking_strategy: Optional[str] = None,
-    languages: Optional[list[str]] = ["auto"],
-    detect_language_per_element: bool = False,
-    date_from_file_object: bool = False,
    starting_page_number: int = 1,
    **kwargs: Any,
 ) -> list[Element]:
@ -51,25 +57,79 @@ def partition_odt(
                Detect language per element instead of at the document level.
    date_from_file_object
        Applies only when providing file via `file` parameter. If this option is True, attempt
-        infer last_modified metadata from bytes, otherwise set it to None.
+        infer last_modified metadata from the file-like object, otherwise set it to None.
    """

-    last_modification_date = None
-    if filename:
-        last_modification_date = get_last_modified_date(filename)
-    elif file:
-        last_modification_date = (
-            get_last_modified_date_from_file(file) if date_from_file_object else None
+    last_modification_date = (
+        get_last_modified_date(filename)
+        if filename
+        else get_last_modified_date_from_file(file) if file and date_from_file_object else None
+    )
+
+    with tempfile.TemporaryDirectory() as target_dir:
+        docx_path = _convert_odt_to_docx(target_dir, filename, file)
+        elements = partition_docx(
+            filename=docx_path,
+            detect_language_per_element=detect_language_per_element,
+            infer_table_structure=infer_table_structure,
+            languages=languages,
+            metadata_filename=metadata_filename,
+            metadata_last_modified=metadata_last_modified or last_modification_date,
+            starting_page_number=starting_page_number,
        )

-    return convert_and_partition_docx(
-        source_format="odt",
-        filename=filename,
-        file=file,
-        infer_table_structure=infer_table_structure,
-        metadata_filename=metadata_filename,
-        metadata_last_modified=metadata_last_modified or last_modification_date,
-        languages=languages,
-        detect_language_per_element=detect_language_per_element,
-        starting_page_number=starting_page_number,
+    return elements
+
+
+@requires_dependencies("pypandoc")
+def _convert_odt_to_docx(
+    target_dir: str, filename: Optional[str], file: Optional[IO[bytes]]
+) -> str:
+    """Convert ODT document to DOCX returning the new .docx file's path.
+
+    Parameters
+    ----------
+    target_dir
+        The str directory-path to use for conversion purposes. The new DOCX file is written to this
+        directory. When passed as a file-like object, a copy of the source file is written here as
+        well. It is the caller's responsibility to remove this directory and its contents when
+        they are no longer needed.
+    filename
+        A str file-path specifying the location of the source ODT file on the local filesystem.
+    file
+        A file-like object open for reading in binary mode ("rb" mode).
+    """
+    exactly_one(filename=filename, file=file)
+
+    # -- validate file-path when provided so we can provide a more meaningful error than whatever
+    # -- would come from pandoc.
+    if filename is not None and not os.path.exists(filename):
+        raise ValueError(f"The file {filename} does not exist.")
+
+    # -- Pandoc is a command-line program running in its own memory-space. It can therefore only
+    # -- operate on files on the filesystem. If the source document was passed as `file`, write
+    # -- it to `target_dir/document.odt` and use that path as the source-path.
+    source_file_path = f"{target_dir}/document.odt" if file is not None else cast(str, filename)
+    if file is not None:
+        with open(source_file_path, "wb") as f:
+            f.write(file.read())
+
+    # -- Compute the path of the resulting .docx document. We want its file-name to be preserved
+    # -- if the source-document was provided as `filename`.
+    # -- a/b/foo.odt -> foo.odt --
+    file_name = os.path.basename(source_file_path)
+    # -- foo.odt -> foo --
+    base_name, _ = os.path.splitext(file_name)
+    # -- foo -> foo.docx --
+    target_docx_path = os.path.join(target_dir, f"{base_name}.docx")
+
+    import pypandoc
+
+    pypandoc.convert_file(
+        source_file_path,
+        "docx",
+        format="odt",
+        outputfile=target_docx_path,
    )
+
+    return target_docx_path