Mirror of https://github.com/Unstructured-IO/unstructured.git, synced 2025-12-14 08:44:29 +00:00
fix(odt): fix disk-space leak in partition_odt() (#3037)
Remedy a disk-space leak where `partition_odt()` would leave an on-disk copy of each `.odt` file passed as a file-like object.

`partition_odt()` creates a temporary file to which it writes each source document provided as a file-like object. That file is never deleted, so disk consumption grows without bound. The `convert_and_partition_docx()` function used to convert ODT -> DOCX delegates the conversion to `pandoc`, a command-line program. Because that program runs in a separate memory space, the source file cannot be passed as an in-memory object and must exist on the filesystem. When the ODT source document arrives as a file-like object it is therefore written to disk so the conversion program can read it, but it is not deleted afterward.

Fix this by writing the temporary source ODT file inside a `TemporaryDirectory` and using that same location for the conversion-target DOCX file. The directory is removed automatically when `partition_odt()` completes (the pattern change is sketched below the commit metadata).

While we're in there, improve the factoring of `partition_odt()`:

- Extract `convert_and_partition_docx()` from `partition.docx` (where it was used only by `partition_odt()`) into `_convert_odt_to_docx()` in `partition.odt`, where it is used. Decouple file conversion from calling `partition_docx()` on the converted file, since the `partition_docx()` call is `partition_odt()`'s natural responsibility.
- Improve docstrings, typing, and comments.
- All tests pass both before and after.
Parent: 0de9215db4
Commit: 8644a3b09a
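The heart of the fix is a change of temporary-file pattern. Below is a minimal sketch, not the library's actual code (helper names are illustrative): the old approach copies the file-like object to a `NamedTemporaryFile(delete=False)` whose path is never removed, while the new approach keeps both the source copy and the conversion target inside a `TemporaryDirectory` that cleans up on exit.

```python
import tempfile
from typing import IO, List


def leaky_copy(file: IO[bytes]) -> str:
    """Old pattern: the on-disk copy outlives the call and nothing ever deletes it."""
    with tempfile.NamedTemporaryFile(delete=False, suffix=".odt") as tmp:
        tmp.write(file.read())
        return tmp.name  # pandoc reads this path later; the file is left behind


def leak_free_partition(file: IO[bytes]) -> List[str]:
    """New pattern: source copy and conversion target both live in a TemporaryDirectory."""
    with tempfile.TemporaryDirectory() as target_dir:
        source_path = f"{target_dir}/document.odt"
        with open(source_path, "wb") as f:
            f.write(file.read())
        # -- convert source_path to f"{target_dir}/document.docx" and partition it here --
        results = ["...elements from the converted .docx..."]
    # -- the directory and both files are gone before we return --
    return results
```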
CHANGELOG.md

```diff
@@ -1,4 +1,4 @@
-## 0.13.8-dev14
+## 0.13.8-dev15
 
 ### Enhancements
 
@@ -20,6 +20,7 @@
 * **Improve CSV delimeter detection.** `partition_csv()` would raise on CSV files with very long lines.
 * **Fix disk-space leak in `partition_doc()`.** Remove temporary file created but not removed when `file` argument is passed to `partition_doc()`.
 * **Fix possible `SyntaxError` or `SyntaxWarning` on regex patterns.** Change regex patterns to raw strings to avoid these warnings/errors in Python 3.11+.
+* **Fix disk-space leak in `partition_odt()`.** Remove temporary file created but not removed when `file` argument is passed to `partition_odt()`.
 
 ## 0.13.7
 
```

New file: `pypandoc` type stub

```diff
@@ -0,0 +1,5 @@
+import pathlib
+
+def convert_file(
+    source_file: str, to: str, format: str | None, outputfile: str | pathlib.Path | None
+) -> str: ...
```
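The stub above types the single `pypandoc` call the conversion relies on, presumably so the `pypandoc.convert_file()` call in the new `_convert_odt_to_docx()` passes type checking without the old `# pyright: ignore`. A hedged usage sketch matching that signature (paths are illustrative; pandoc itself must be installed):

```python
import pypandoc

# convert_file(source_file, to, format=..., outputfile=...) as declared in the stub.
pypandoc.convert_file(
    "document.odt",  # illustrative source path
    "docx",  # target format
    format="odt",  # source format
    outputfile="document.docx",  # illustrative output path
)
```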

unstructured/__version__.py

```diff
@@ -1 +1 @@
-__version__ = "0.13.8-dev14"  # pragma: no cover
+__version__ = "0.13.8-dev15"  # pragma: no cover
```

unstructured/partition/docx.py

```diff
@@ -5,9 +5,8 @@ from __future__ import annotations
 import html
 import io
 import itertools
-import os
 import tempfile
-from typing import IO, Any, Iterator, Optional, Type, cast
+from typing import IO, Any, Iterator, Optional, Type
 
 # -- CT_* stands for "complex-type", an XML element type in docx parlance --
 import docx
@@ -45,7 +44,6 @@ from unstructured.documents.elements import (
 )
 from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
 from unstructured.partition.common import (
-    exactly_one,
     get_last_modified_date,
     get_last_modified_date_from_file,
 )
@@ -58,114 +56,13 @@ from unstructured.partition.text_type import (
     is_us_city_state_zip,
 )
 from unstructured.partition.utils.constants import PartitionStrategy
-from unstructured.utils import (
-    dependency_exists,
-    is_temp_file_path,
-    lazyproperty,
-    requires_dependencies,
-)
-
-if dependency_exists("pypandoc"):
-    import pypandoc
+from unstructured.utils import is_temp_file_path, lazyproperty
 
 DETECTION_ORIGIN: str = "docx"
 BlockElement: TypeAlias = "CT_P | CT_Tbl"
 BlockItem: TypeAlias = "Paragraph | DocxTable"
 
 
-@requires_dependencies("pypandoc")
-def convert_and_partition_docx(
-    source_format: str,
-    filename: Optional[str] = None,
-    file: Optional[IO[bytes]] = None,
-    include_metadata: bool = True,
-    infer_table_structure: bool = True,
-    metadata_filename: Optional[str] = None,
-    metadata_last_modified: Optional[str] = None,
-    languages: Optional[list[str]] = ["auto"],
-    detect_language_per_element: bool = False,
-    starting_page_number: int = 1,
-) -> list[Element]:
-    """Converts a document to DOCX and then partitions it using partition_docx.
-
-    Works with any file format support by pandoc.
-
-    Parameters
-    ----------
-    source_format
-        The format of the source document, .e.g. odt
-    filename
-        A string defining the target filename path.
-    file
-        A file-like object using "rb" mode --> open(filename, "rb").
-    include_metadata
-        Determines whether or not metadata is included in the metadata attribute on the elements in
-        the output.
-    infer_table_structure
-        If True, any Table elements that are extracted will also have a metadata field
-        named "text_as_html" where the table's text content is rendered into an html string.
-        I.e., rows and cells are preserved.
-        Whether True or False, the "text" field is always present in any Table element
-        and is the text content of the table (no structure).
-    languages
-        User defined value for `metadata.languages` if provided. Otherwise language is detected
-        using naive Bayesian filter via `langdetect`. Multiple languages indicates text could be
-        in either language.
-    Additional Parameters:
-        detect_language_per_element
-            Detect language per element instead of at the document level.
-    starting_page_number
-        Indicates what page number should be assigned to the first page in the document.
-        This information will be reflected in elements' metadata and can be be especially
-        useful when partitioning a document that is part of a larger document.
-    """
-    exactly_one(filename=filename, file=file)
-
-    def validate_filename(filename: str) -> str:
-        """Return path to a file confirmed to exist on the filesystem."""
-        if not os.path.exists(filename):
-            raise ValueError(f"The file {filename} does not exist.")
-        return filename
-
-    def copy_to_tempfile(file: IO[bytes]) -> str:
-        """Return path to temporary copy of file to be converted."""
-        with tempfile.NamedTemporaryFile(delete=False) as tmp:
-            tmp.write(file.read())
-            return tmp.name
-
-    def extract_docx_filename(file_path: str) -> str:
-        """Return a filename like "foo.docx" from a path like "a/b/foo.odt" """
-        # -- a/b/foo.odt -> foo.odt --
-        filename = os.path.basename(file_path)
-        # -- foo.odt -> foo --
-        root_name, _ = os.path.splitext(filename)
-        # -- foo -> foo.docx --
-        return f"{root_name}.docx"
-
-    file_path = validate_filename(filename) if filename else copy_to_tempfile(cast(IO[bytes], file))
-
-    with tempfile.TemporaryDirectory() as tmpdir:
-        docx_path = os.path.join(tmpdir, extract_docx_filename(file_path))
-        pypandoc.convert_file(  # pyright: ignore
-            file_path,
-            "docx",
-            format=source_format,
-            outputfile=docx_path,
-        )
-        elements = partition_docx(
-            filename=docx_path,
-            metadata_filename=metadata_filename,
-            include_metadata=include_metadata,
-            infer_table_structure=infer_table_structure,
-            metadata_last_modified=metadata_last_modified,
-            languages=languages,
-            detect_language_per_element=detect_language_per_element,
-            starting_page_number=starting_page_number,
-        )
-
-    return elements
-
-
 @process_metadata()
 @add_metadata_with_filetype(FileType.DOCX)
 @add_chunking_strategy
```

unstructured/partition/odt.py

```diff
@@ -1,12 +1,19 @@
 from __future__ import annotations
 
-from typing import IO, Any, Optional
+import os
+import tempfile
+from typing import IO, Any, Optional, cast
 
 from unstructured.chunking import add_chunking_strategy
 from unstructured.documents.elements import Element, process_metadata
 from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
-from unstructured.partition.common import get_last_modified_date, get_last_modified_date_from_file
-from unstructured.partition.docx import convert_and_partition_docx
+from unstructured.partition.common import (
+    exactly_one,
+    get_last_modified_date,
+    get_last_modified_date_from_file,
+)
+from unstructured.partition.docx import partition_docx
+from unstructured.utils import requires_dependencies
 
 
 @process_metadata()
@@ -14,15 +21,14 @@ from unstructured.partition.docx import convert_and_partition_docx
 @add_chunking_strategy
 def partition_odt(
     filename: Optional[str] = None,
+    *,
+    date_from_file_object: bool = False,
+    detect_language_per_element: bool = False,
     file: Optional[IO[bytes]] = None,
-    include_metadata: bool = True,
     infer_table_structure: bool = True,
+    languages: Optional[list[str]] = ["auto"],
     metadata_filename: Optional[str] = None,
     metadata_last_modified: Optional[str] = None,
-    chunking_strategy: Optional[str] = None,
-    languages: Optional[list[str]] = ["auto"],
-    detect_language_per_element: bool = False,
-    date_from_file_object: bool = False,
     starting_page_number: int = 1,
     **kwargs: Any,
 ) -> list[Element]:
@@ -51,25 +57,79 @@ def partition_odt(
             Detect language per element instead of at the document level.
         date_from_file_object
             Applies only when providing file via `file` parameter. If this option is True, attempt
-            infer last_modified metadata from bytes, otherwise set it to None.
+            infer last_modified metadata from the file-like object, otherwise set it to None.
     """
 
-    last_modification_date = None
-    if filename:
-        last_modification_date = get_last_modified_date(filename)
-    elif file:
-        last_modification_date = (
-            get_last_modified_date_from_file(file) if date_from_file_object else None
-        )
+    last_modification_date = (
+        get_last_modified_date(filename)
+        if filename
+        else get_last_modified_date_from_file(file) if file and date_from_file_object else None
+    )
 
-    return convert_and_partition_docx(
-        source_format="odt",
-        filename=filename,
-        file=file,
-        infer_table_structure=infer_table_structure,
-        metadata_filename=metadata_filename,
-        metadata_last_modified=metadata_last_modified or last_modification_date,
-        languages=languages,
-        detect_language_per_element=detect_language_per_element,
-        starting_page_number=starting_page_number,
-    )
+    with tempfile.TemporaryDirectory() as target_dir:
+        docx_path = _convert_odt_to_docx(target_dir, filename, file)
+        elements = partition_docx(
+            filename=docx_path,
+            detect_language_per_element=detect_language_per_element,
+            infer_table_structure=infer_table_structure,
+            languages=languages,
+            metadata_filename=metadata_filename,
+            metadata_last_modified=metadata_last_modified or last_modification_date,
+            starting_page_number=starting_page_number,
+        )
+
+    return elements
+
+
+@requires_dependencies("pypandoc")
+def _convert_odt_to_docx(
+    target_dir: str, filename: Optional[str], file: Optional[IO[bytes]]
+) -> str:
+    """Convert ODT document to DOCX returning the new .docx file's path.
+
+    Parameters
+    ----------
+    target_dir
+        The str directory-path to use for conversion purposes. The new DOCX file is written to this
+        directory. When passed as a file-like object, a copy of the source file is written here as
+        well. It is the caller's responsibility to remove this directory and its contents when
+        they are no longer needed.
+    filename
+        A str file-path specifying the location of the source ODT file on the local filesystem.
+    file
+        A file-like object open for reading in binary mode ("rb" mode).
+    """
+    exactly_one(filename=filename, file=file)
+
+    # -- validate file-path when provided so we can provide a more meaningful error than whatever
+    # -- would come from pandoc.
+    if filename is not None and not os.path.exists(filename):
+        raise ValueError(f"The file {filename} does not exist.")
+
+    # -- Pandoc is a command-line program running in its own memory-space. It can therefore only
+    # -- operate on files on the filesystem. If the source document was passed as `file`, write
+    # -- it to `target_dir/document.odt` and use that path as the source-path.
+    source_file_path = f"{target_dir}/document.odt" if file is not None else cast(str, filename)
+    if file is not None:
+        with open(source_file_path, "wb") as f:
+            f.write(file.read())
+
+    # -- Compute the path of the resulting .docx document. We want its file-name to be preserved
+    # -- if the source-document was provided as `filename`.
+    # -- a/b/foo.odt -> foo.odt --
+    file_name = os.path.basename(source_file_path)
+    # -- foo.odt -> foo --
+    base_name, _ = os.path.splitext(file_name)
+    # -- foo -> foo.docx --
+    target_docx_path = os.path.join(target_dir, f"{base_name}.docx")
+
+    import pypandoc
+
+    pypandoc.convert_file(
+        source_file_path,
+        "docx",
+        format="odt",
+        outputfile=target_docx_path,
+    )
+
+    return target_docx_path
```
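For reference, a hedged end-to-end usage example of the changed code path: calling `partition_odt()` with a file-like object now leaves nothing behind on disk once the call returns. The document path is illustrative, and pandoc/pypandoc must be installed for the ODT conversion to run.

```python
from unstructured.partition.odt import partition_odt

# Illustrative path; any .odt document works.
with open("example-docs/fake.odt", "rb") as f:
    elements = partition_odt(file=f)

# The TemporaryDirectory holding the ODT copy and the converted DOCX has already
# been removed by the time partition_odt() returns.
print([type(e).__name__ for e in elements])
```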