mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-12-12 23:51:47 +00:00
fix(odt): fix disk-space leak in partition_odt() (#3037)
Remedy disk-space leak where `partition_odt()` would leave an on-disk copy of each `.odt` file passed as a file-like object. `partition_odt()` creates a temporary file in which it writes each source-document provided as a file-like object. This file is not deleted and disk consumption grows without bound. The `convert_and_partition_docx()` function used to convert ODT->DOCX uses `pandoc` (a command-line program) to do the conversion. Because this command-line program operates in a different memory space, the source file cannot be passed as an in-memory object and needs to be on the filesystem. When the ODT source-document is passed as a file-like object, it is written to disk so the conversion program has access to it. It is not deleted afterward. Fix this by writing the temporary source ODT file in a `TemporaryDirectory` and also use that location to write the conversion-target DOCX file. That directory is automatically removed when `partition_odt()` completes. While we're in there, improve the factoring of `partition_odt()`. - Extract `convert_and_partition_docx()` from `partition.docx` (used only by `partition_odt()`) to `_convert_odt_to_docx()` in `partition.odt` where it is used. Decouple file conversion from calling `partition_docx()` with the converted file as the `partition_docx()` call is `partition_odt()`'s natural responsibility. - Improve docstrings, typing, and comments. - All tests pass both before and after.
This commit is contained in:
parent
0de9215db4
commit
8644a3b09a
@ -1,4 +1,4 @@
|
||||
## 0.13.8-dev14
|
||||
## 0.13.8-dev15
|
||||
|
||||
### Enhancements
|
||||
|
||||
@ -20,6 +20,7 @@
|
||||
* **Improve CSV delimeter detection.** `partition_csv()` would raise on CSV files with very long lines.
|
||||
* **Fix disk-space leak in `partition_doc()`.** Remove temporary file created but not removed when `file` argument is passed to `partition_doc()`.
|
||||
* **Fix possible `SyntaxError` or `SyntaxWarning` on regex patterns.** Change regex patterns to raw strings to avoid these warnings/errors in Python 3.11+.
|
||||
* **Fix disk-space leak in `partition_odt()`.** Remove temporary file created but not removed when `file` argument is passed to `partition_odt()`.
|
||||
|
||||
## 0.13.7
|
||||
|
||||
|
||||
@ -0,0 +1,5 @@
|
||||
import pathlib
|
||||
|
||||
def convert_file(
|
||||
source_file: str, to: str, format: str | None, outputfile: str | pathlib.Path | None
|
||||
) -> str: ...
|
||||
@ -1 +1 @@
|
||||
__version__ = "0.13.8-dev14" # pragma: no cover
|
||||
__version__ = "0.13.8-dev15" # pragma: no cover
|
||||
|
||||
@ -5,9 +5,8 @@ from __future__ import annotations
|
||||
import html
|
||||
import io
|
||||
import itertools
|
||||
import os
|
||||
import tempfile
|
||||
from typing import IO, Any, Iterator, Optional, Type, cast
|
||||
from typing import IO, Any, Iterator, Optional, Type
|
||||
|
||||
# -- CT_* stands for "complex-type", an XML element type in docx parlance --
|
||||
import docx
|
||||
@ -45,7 +44,6 @@ from unstructured.documents.elements import (
|
||||
)
|
||||
from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
|
||||
from unstructured.partition.common import (
|
||||
exactly_one,
|
||||
get_last_modified_date,
|
||||
get_last_modified_date_from_file,
|
||||
)
|
||||
@ -58,114 +56,13 @@ from unstructured.partition.text_type import (
|
||||
is_us_city_state_zip,
|
||||
)
|
||||
from unstructured.partition.utils.constants import PartitionStrategy
|
||||
from unstructured.utils import (
|
||||
dependency_exists,
|
||||
is_temp_file_path,
|
||||
lazyproperty,
|
||||
requires_dependencies,
|
||||
)
|
||||
|
||||
if dependency_exists("pypandoc"):
|
||||
import pypandoc
|
||||
from unstructured.utils import is_temp_file_path, lazyproperty
|
||||
|
||||
DETECTION_ORIGIN: str = "docx"
|
||||
BlockElement: TypeAlias = "CT_P | CT_Tbl"
|
||||
BlockItem: TypeAlias = "Paragraph | DocxTable"
|
||||
|
||||
|
||||
@requires_dependencies("pypandoc")
|
||||
def convert_and_partition_docx(
|
||||
source_format: str,
|
||||
filename: Optional[str] = None,
|
||||
file: Optional[IO[bytes]] = None,
|
||||
include_metadata: bool = True,
|
||||
infer_table_structure: bool = True,
|
||||
metadata_filename: Optional[str] = None,
|
||||
metadata_last_modified: Optional[str] = None,
|
||||
languages: Optional[list[str]] = ["auto"],
|
||||
detect_language_per_element: bool = False,
|
||||
starting_page_number: int = 1,
|
||||
) -> list[Element]:
|
||||
"""Converts a document to DOCX and then partitions it using partition_docx.
|
||||
|
||||
Works with any file format support by pandoc.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
source_format
|
||||
The format of the source document, .e.g. odt
|
||||
filename
|
||||
A string defining the target filename path.
|
||||
file
|
||||
A file-like object using "rb" mode --> open(filename, "rb").
|
||||
include_metadata
|
||||
Determines whether or not metadata is included in the metadata attribute on the elements in
|
||||
the output.
|
||||
infer_table_structure
|
||||
If True, any Table elements that are extracted will also have a metadata field
|
||||
named "text_as_html" where the table's text content is rendered into an html string.
|
||||
I.e., rows and cells are preserved.
|
||||
Whether True or False, the "text" field is always present in any Table element
|
||||
and is the text content of the table (no structure).
|
||||
languages
|
||||
User defined value for `metadata.languages` if provided. Otherwise language is detected
|
||||
using naive Bayesian filter via `langdetect`. Multiple languages indicates text could be
|
||||
in either language.
|
||||
Additional Parameters:
|
||||
detect_language_per_element
|
||||
Detect language per element instead of at the document level.
|
||||
starting_page_number
|
||||
Indicates what page number should be assigned to the first page in the document.
|
||||
This information will be reflected in elements' metadata and can be be especially
|
||||
useful when partitioning a document that is part of a larger document.
|
||||
"""
|
||||
exactly_one(filename=filename, file=file)
|
||||
|
||||
def validate_filename(filename: str) -> str:
|
||||
"""Return path to a file confirmed to exist on the filesystem."""
|
||||
if not os.path.exists(filename):
|
||||
raise ValueError(f"The file {filename} does not exist.")
|
||||
return filename
|
||||
|
||||
def copy_to_tempfile(file: IO[bytes]) -> str:
|
||||
"""Return path to temporary copy of file to be converted."""
|
||||
with tempfile.NamedTemporaryFile(delete=False) as tmp:
|
||||
tmp.write(file.read())
|
||||
return tmp.name
|
||||
|
||||
def extract_docx_filename(file_path: str) -> str:
|
||||
"""Return a filename like "foo.docx" from a path like "a/b/foo.odt" """
|
||||
# -- a/b/foo.odt -> foo.odt --
|
||||
filename = os.path.basename(file_path)
|
||||
# -- foo.odt -> foo --
|
||||
root_name, _ = os.path.splitext(filename)
|
||||
# -- foo -> foo.docx --
|
||||
return f"{root_name}.docx"
|
||||
|
||||
file_path = validate_filename(filename) if filename else copy_to_tempfile(cast(IO[bytes], file))
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
docx_path = os.path.join(tmpdir, extract_docx_filename(file_path))
|
||||
pypandoc.convert_file( # pyright: ignore
|
||||
file_path,
|
||||
"docx",
|
||||
format=source_format,
|
||||
outputfile=docx_path,
|
||||
)
|
||||
elements = partition_docx(
|
||||
filename=docx_path,
|
||||
metadata_filename=metadata_filename,
|
||||
include_metadata=include_metadata,
|
||||
infer_table_structure=infer_table_structure,
|
||||
metadata_last_modified=metadata_last_modified,
|
||||
languages=languages,
|
||||
detect_language_per_element=detect_language_per_element,
|
||||
starting_page_number=starting_page_number,
|
||||
)
|
||||
|
||||
return elements
|
||||
|
||||
|
||||
@process_metadata()
|
||||
@add_metadata_with_filetype(FileType.DOCX)
|
||||
@add_chunking_strategy
|
||||
|
||||
@ -1,12 +1,19 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import IO, Any, Optional
|
||||
import os
|
||||
import tempfile
|
||||
from typing import IO, Any, Optional, cast
|
||||
|
||||
from unstructured.chunking import add_chunking_strategy
|
||||
from unstructured.documents.elements import Element, process_metadata
|
||||
from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
|
||||
from unstructured.partition.common import get_last_modified_date, get_last_modified_date_from_file
|
||||
from unstructured.partition.docx import convert_and_partition_docx
|
||||
from unstructured.partition.common import (
|
||||
exactly_one,
|
||||
get_last_modified_date,
|
||||
get_last_modified_date_from_file,
|
||||
)
|
||||
from unstructured.partition.docx import partition_docx
|
||||
from unstructured.utils import requires_dependencies
|
||||
|
||||
|
||||
@process_metadata()
|
||||
@ -14,15 +21,14 @@ from unstructured.partition.docx import convert_and_partition_docx
|
||||
@add_chunking_strategy
|
||||
def partition_odt(
|
||||
filename: Optional[str] = None,
|
||||
*,
|
||||
date_from_file_object: bool = False,
|
||||
detect_language_per_element: bool = False,
|
||||
file: Optional[IO[bytes]] = None,
|
||||
include_metadata: bool = True,
|
||||
infer_table_structure: bool = True,
|
||||
languages: Optional[list[str]] = ["auto"],
|
||||
metadata_filename: Optional[str] = None,
|
||||
metadata_last_modified: Optional[str] = None,
|
||||
chunking_strategy: Optional[str] = None,
|
||||
languages: Optional[list[str]] = ["auto"],
|
||||
detect_language_per_element: bool = False,
|
||||
date_from_file_object: bool = False,
|
||||
starting_page_number: int = 1,
|
||||
**kwargs: Any,
|
||||
) -> list[Element]:
|
||||
@ -51,25 +57,79 @@ def partition_odt(
|
||||
Detect language per element instead of at the document level.
|
||||
date_from_file_object
|
||||
Applies only when providing file via `file` parameter. If this option is True, attempt
|
||||
infer last_modified metadata from bytes, otherwise set it to None.
|
||||
infer last_modified metadata from the file-like object, otherwise set it to None.
|
||||
"""
|
||||
|
||||
last_modification_date = None
|
||||
if filename:
|
||||
last_modification_date = get_last_modified_date(filename)
|
||||
elif file:
|
||||
last_modification_date = (
|
||||
get_last_modified_date_from_file(file) if date_from_file_object else None
|
||||
last_modification_date = (
|
||||
get_last_modified_date(filename)
|
||||
if filename
|
||||
else get_last_modified_date_from_file(file) if file and date_from_file_object else None
|
||||
)
|
||||
|
||||
with tempfile.TemporaryDirectory() as target_dir:
|
||||
docx_path = _convert_odt_to_docx(target_dir, filename, file)
|
||||
elements = partition_docx(
|
||||
filename=docx_path,
|
||||
detect_language_per_element=detect_language_per_element,
|
||||
infer_table_structure=infer_table_structure,
|
||||
languages=languages,
|
||||
metadata_filename=metadata_filename,
|
||||
metadata_last_modified=metadata_last_modified or last_modification_date,
|
||||
starting_page_number=starting_page_number,
|
||||
)
|
||||
|
||||
return convert_and_partition_docx(
|
||||
source_format="odt",
|
||||
filename=filename,
|
||||
file=file,
|
||||
infer_table_structure=infer_table_structure,
|
||||
metadata_filename=metadata_filename,
|
||||
metadata_last_modified=metadata_last_modified or last_modification_date,
|
||||
languages=languages,
|
||||
detect_language_per_element=detect_language_per_element,
|
||||
starting_page_number=starting_page_number,
|
||||
return elements
|
||||
|
||||
|
||||
@requires_dependencies("pypandoc")
|
||||
def _convert_odt_to_docx(
|
||||
target_dir: str, filename: Optional[str], file: Optional[IO[bytes]]
|
||||
) -> str:
|
||||
"""Convert ODT document to DOCX returning the new .docx file's path.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
target_dir
|
||||
The str directory-path to use for conversion purposes. The new DOCX file is written to this
|
||||
directory. When passed as a file-like object, a copy of the source file is written here as
|
||||
well. It is the caller's responsibility to remove this directory and its contents when
|
||||
they are no longer needed.
|
||||
filename
|
||||
A str file-path specifying the location of the source ODT file on the local filesystem.
|
||||
file
|
||||
A file-like object open for reading in binary mode ("rb" mode).
|
||||
"""
|
||||
exactly_one(filename=filename, file=file)
|
||||
|
||||
# -- validate file-path when provided so we can provide a more meaningful error than whatever
|
||||
# -- would come from pandoc.
|
||||
if filename is not None and not os.path.exists(filename):
|
||||
raise ValueError(f"The file {filename} does not exist.")
|
||||
|
||||
# -- Pandoc is a command-line program running in its own memory-space. It can therefore only
|
||||
# -- operate on files on the filesystem. If the source document was passed as `file`, write
|
||||
# -- it to `target_dir/document.odt` and use that path as the source-path.
|
||||
source_file_path = f"{target_dir}/document.odt" if file is not None else cast(str, filename)
|
||||
if file is not None:
|
||||
with open(source_file_path, "wb") as f:
|
||||
f.write(file.read())
|
||||
|
||||
# -- Compute the path of the resulting .docx document. We want its file-name to be preserved
|
||||
# -- if the source-document was provided as `filename`.
|
||||
# -- a/b/foo.odt -> foo.odt --
|
||||
file_name = os.path.basename(source_file_path)
|
||||
# -- foo.odt -> foo --
|
||||
base_name, _ = os.path.splitext(file_name)
|
||||
# -- foo -> foo.docx --
|
||||
target_docx_path = os.path.join(target_dir, f"{base_name}.docx")
|
||||
|
||||
import pypandoc
|
||||
|
||||
pypandoc.convert_file(
|
||||
source_file_path,
|
||||
"docx",
|
||||
format="odt",
|
||||
outputfile=target_docx_path,
|
||||
)
|
||||
|
||||
return target_docx_path
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user