Mirror of https://github.com/Unstructured-IO/unstructured.git, synced 2025-12-14 08:44:29 +00:00
fix(odt): fix disk-space leak in partition_odt() (#3037)
Remedy a disk-space leak where `partition_odt()` would leave an on-disk copy of each `.odt` file passed as a file-like object.

`partition_odt()` creates a temporary file to which it writes each source document provided as a file-like object. That file is never deleted, so disk consumption grows without bound. The `convert_and_partition_docx()` function used to convert ODT -> DOCX delegates the conversion to `pandoc`, a command-line program. Because that program runs in a separate memory space, the source file cannot be passed as an in-memory object and must exist on the filesystem. When the ODT source document arrives as a file-like object it is therefore written to disk so the conversion program can read it, but it is not deleted afterward.

Fix this by writing the temporary source ODT file inside a `TemporaryDirectory` and using that same location for the conversion-target DOCX file. The directory is removed automatically when `partition_odt()` completes (the pattern change is sketched below the commit metadata).

While we're in there, improve the factoring of `partition_odt()`:

- Extract `convert_and_partition_docx()` from `partition.docx` (where it was used only by `partition_odt()`) into `_convert_odt_to_docx()` in `partition.odt`, where it is used. Decouple file conversion from calling `partition_docx()` on the converted file, since the `partition_docx()` call is `partition_odt()`'s natural responsibility.
- Improve docstrings, typing, and comments.
- All tests pass both before and after.
Parent: 0de9215db4
Commit: 8644a3b09a
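The heart of the fix is a change of temporary-file pattern. Below is a minimal sketch, not the library's actual code (helper names are illustrative): the old approach copies the file-like object to a `NamedTemporaryFile(delete=False)` whose path is never removed, while the new approach keeps both the source copy and the conversion target inside a `TemporaryDirectory` that cleans up on exit.

```python
import tempfile
from typing import IO, List


def leaky_copy(file: IO[bytes]) -> str:
    """Old pattern: the on-disk copy outlives the call and nothing ever deletes it."""
    with tempfile.NamedTemporaryFile(delete=False, suffix=".odt") as tmp:
        tmp.write(file.read())
        return tmp.name  # pandoc reads this path later; the file is left behind


def leak_free_partition(file: IO[bytes]) -> List[str]:
    """New pattern: source copy and conversion target both live in a TemporaryDirectory."""
    with tempfile.TemporaryDirectory() as target_dir:
        source_path = f"{target_dir}/document.odt"
        with open(source_path, "wb") as f:
            f.write(file.read())
        # -- convert source_path to f"{target_dir}/document.docx" and partition it here --
        results = ["...elements from the converted .docx..."]
    # -- the directory and both files are gone before we return --
    return results
```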
CHANGELOG.md

```diff
@@ -1,4 +1,4 @@
-## 0.13.8-dev14
+## 0.13.8-dev15
 
 ### Enhancements
 
@@ -20,6 +20,7 @@
 * **Improve CSV delimeter detection.** `partition_csv()` would raise on CSV files with very long lines.
 * **Fix disk-space leak in `partition_doc()`.** Remove temporary file created but not removed when `file` argument is passed to `partition_doc()`.
 * **Fix possible `SyntaxError` or `SyntaxWarning` on regex patterns.** Change regex patterns to raw strings to avoid these warnings/errors in Python 3.11+.
+* **Fix disk-space leak in `partition_odt()`.** Remove temporary file created but not removed when `file` argument is passed to `partition_odt()`.
 
 ## 0.13.7
 
```

New file: `pypandoc` type stub

```diff
@@ -0,0 +1,5 @@
+import pathlib
+
+def convert_file(
+    source_file: str, to: str, format: str | None, outputfile: str | pathlib.Path | None
+) -> str: ...
```
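The stub above types the single `pypandoc` call the conversion relies on, presumably so the `pypandoc.convert_file()` call in the new `_convert_odt_to_docx()` passes type checking without the old `# pyright: ignore`. A hedged usage sketch matching that signature (paths are illustrative; pandoc itself must be installed):

```python
import pypandoc

# convert_file(source_file, to, format=..., outputfile=...) as declared in the stub.
pypandoc.convert_file(
    "document.odt",  # illustrative source path
    "docx",  # target format
    format="odt",  # source format
    outputfile="document.docx",  # illustrative output path
)
```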

unstructured/__version__.py

```diff
@@ -1 +1 @@
-__version__ = "0.13.8-dev14"  # pragma: no cover
+__version__ = "0.13.8-dev15"  # pragma: no cover
```

unstructured/partition/docx.py

```diff
@@ -5,9 +5,8 @@ from __future__ import annotations
 import html
 import io
 import itertools
-import os
 import tempfile
-from typing import IO, Any, Iterator, Optional, Type, cast
+from typing import IO, Any, Iterator, Optional, Type
 
 # -- CT_* stands for "complex-type", an XML element type in docx parlance --
 import docx
@@ -45,7 +44,6 @@ from unstructured.documents.elements import (
 )
 from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
 from unstructured.partition.common import (
-    exactly_one,
     get_last_modified_date,
     get_last_modified_date_from_file,
 )
@@ -58,114 +56,13 @@ from unstructured.partition.text_type import (
     is_us_city_state_zip,
 )
 from unstructured.partition.utils.constants import PartitionStrategy
-from unstructured.utils import (
-    dependency_exists,
-    is_temp_file_path,
-    lazyproperty,
-    requires_dependencies,
-)
-
-if dependency_exists("pypandoc"):
-    import pypandoc
+from unstructured.utils import is_temp_file_path, lazyproperty
 
 DETECTION_ORIGIN: str = "docx"
 BlockElement: TypeAlias = "CT_P | CT_Tbl"
 BlockItem: TypeAlias = "Paragraph | DocxTable"
 
 
-@requires_dependencies("pypandoc")
-def convert_and_partition_docx(
-    source_format: str,
-    filename: Optional[str] = None,
-    file: Optional[IO[bytes]] = None,
-    include_metadata: bool = True,
-    infer_table_structure: bool = True,
-    metadata_filename: Optional[str] = None,
-    metadata_last_modified: Optional[str] = None,
-    languages: Optional[list[str]] = ["auto"],
-    detect_language_per_element: bool = False,
-    starting_page_number: int = 1,
-) -> list[Element]:
-    """Converts a document to DOCX and then partitions it using partition_docx.
-
-    Works with any file format support by pandoc.
-
-    Parameters
-    ----------
-    source_format
-        The format of the source document, .e.g. odt
-    filename
-        A string defining the target filename path.
-    file
-        A file-like object using "rb" mode --> open(filename, "rb").
-    include_metadata
-        Determines whether or not metadata is included in the metadata attribute on the elements in
-        the output.
-    infer_table_structure
-        If True, any Table elements that are extracted will also have a metadata field
-        named "text_as_html" where the table's text content is rendered into an html string.
-        I.e., rows and cells are preserved.
-        Whether True or False, the "text" field is always present in any Table element
-        and is the text content of the table (no structure).
-    languages
-        User defined value for `metadata.languages` if provided. Otherwise language is detected
-        using naive Bayesian filter via `langdetect`. Multiple languages indicates text could be
-        in either language.
-    Additional Parameters:
-        detect_language_per_element
-            Detect language per element instead of at the document level.
-    starting_page_number
-        Indicates what page number should be assigned to the first page in the document.
-        This information will be reflected in elements' metadata and can be be especially
-        useful when partitioning a document that is part of a larger document.
-    """
-    exactly_one(filename=filename, file=file)
-
-    def validate_filename(filename: str) -> str:
-        """Return path to a file confirmed to exist on the filesystem."""
-        if not os.path.exists(filename):
-            raise ValueError(f"The file {filename} does not exist.")
-        return filename
-
-    def copy_to_tempfile(file: IO[bytes]) -> str:
-        """Return path to temporary copy of file to be converted."""
-        with tempfile.NamedTemporaryFile(delete=False) as tmp:
-            tmp.write(file.read())
-            return tmp.name
-
-    def extract_docx_filename(file_path: str) -> str:
-        """Return a filename like "foo.docx" from a path like "a/b/foo.odt" """
-        # -- a/b/foo.odt -> foo.odt --
-        filename = os.path.basename(file_path)
-        # -- foo.odt -> foo --
-        root_name, _ = os.path.splitext(filename)
-        # -- foo -> foo.docx --
-        return f"{root_name}.docx"
-
-    file_path = validate_filename(filename) if filename else copy_to_tempfile(cast(IO[bytes], file))
-
-    with tempfile.TemporaryDirectory() as tmpdir:
-        docx_path = os.path.join(tmpdir, extract_docx_filename(file_path))
-        pypandoc.convert_file(  # pyright: ignore
-            file_path,
-            "docx",
-            format=source_format,
-            outputfile=docx_path,
-        )
-        elements = partition_docx(
-            filename=docx_path,
-            metadata_filename=metadata_filename,
-            include_metadata=include_metadata,
-            infer_table_structure=infer_table_structure,
-            metadata_last_modified=metadata_last_modified,
-            languages=languages,
-            detect_language_per_element=detect_language_per_element,
-            starting_page_number=starting_page_number,
-        )
-
-    return elements
-
-
 @process_metadata()
 @add_metadata_with_filetype(FileType.DOCX)
 @add_chunking_strategy
```

unstructured/partition/odt.py

```diff
@@ -1,12 +1,19 @@
 from __future__ import annotations
 
-from typing import IO, Any, Optional
+import os
+import tempfile
+from typing import IO, Any, Optional, cast
 
 from unstructured.chunking import add_chunking_strategy
 from unstructured.documents.elements import Element, process_metadata
 from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
-from unstructured.partition.common import get_last_modified_date, get_last_modified_date_from_file
-from unstructured.partition.docx import convert_and_partition_docx
+from unstructured.partition.common import (
+    exactly_one,
+    get_last_modified_date,
+    get_last_modified_date_from_file,
+)
+from unstructured.partition.docx import partition_docx
+from unstructured.utils import requires_dependencies
 
 
 @process_metadata()
@@ -14,15 +21,14 @@ from unstructured.partition.docx import convert_and_partition_docx
 @add_chunking_strategy
 def partition_odt(
     filename: Optional[str] = None,
+    *,
+    date_from_file_object: bool = False,
+    detect_language_per_element: bool = False,
     file: Optional[IO[bytes]] = None,
-    include_metadata: bool = True,
     infer_table_structure: bool = True,
+    languages: Optional[list[str]] = ["auto"],
     metadata_filename: Optional[str] = None,
     metadata_last_modified: Optional[str] = None,
-    chunking_strategy: Optional[str] = None,
-    languages: Optional[list[str]] = ["auto"],
-    detect_language_per_element: bool = False,
-    date_from_file_object: bool = False,
     starting_page_number: int = 1,
     **kwargs: Any,
 ) -> list[Element]:
@@ -51,25 +57,79 @@ def partition_odt(
             Detect language per element instead of at the document level.
         date_from_file_object
             Applies only when providing file via `file` parameter. If this option is True, attempt
-            infer last_modified metadata from bytes, otherwise set it to None.
+            infer last_modified metadata from the file-like object, otherwise set it to None.
     """
 
-    last_modification_date = None
-    if filename:
-        last_modification_date = get_last_modified_date(filename)
-    elif file:
-        last_modification_date = (
-            get_last_modified_date_from_file(file) if date_from_file_object else None
-        )
+    last_modification_date = (
+        get_last_modified_date(filename)
+        if filename
+        else get_last_modified_date_from_file(file) if file and date_from_file_object else None
+    )
 
-    return convert_and_partition_docx(
-        source_format="odt",
-        filename=filename,
-        file=file,
-        infer_table_structure=infer_table_structure,
-        metadata_filename=metadata_filename,
-        metadata_last_modified=metadata_last_modified or last_modification_date,
-        languages=languages,
-        detect_language_per_element=detect_language_per_element,
-        starting_page_number=starting_page_number,
-    )
+    with tempfile.TemporaryDirectory() as target_dir:
+        docx_path = _convert_odt_to_docx(target_dir, filename, file)
+        elements = partition_docx(
+            filename=docx_path,
+            detect_language_per_element=detect_language_per_element,
+            infer_table_structure=infer_table_structure,
+            languages=languages,
+            metadata_filename=metadata_filename,
+            metadata_last_modified=metadata_last_modified or last_modification_date,
+            starting_page_number=starting_page_number,
+        )
+
+    return elements
+
+
+@requires_dependencies("pypandoc")
+def _convert_odt_to_docx(
+    target_dir: str, filename: Optional[str], file: Optional[IO[bytes]]
+) -> str:
+    """Convert ODT document to DOCX returning the new .docx file's path.
+
+    Parameters
+    ----------
+    target_dir
+        The str directory-path to use for conversion purposes. The new DOCX file is written to this
+        directory. When passed as a file-like object, a copy of the source file is written here as
+        well. It is the caller's responsibility to remove this directory and its contents when
+        they are no longer needed.
+    filename
+        A str file-path specifying the location of the source ODT file on the local filesystem.
+    file
+        A file-like object open for reading in binary mode ("rb" mode).
+    """
+    exactly_one(filename=filename, file=file)
+
+    # -- validate file-path when provided so we can provide a more meaningful error than whatever
+    # -- would come from pandoc.
+    if filename is not None and not os.path.exists(filename):
+        raise ValueError(f"The file {filename} does not exist.")
+
+    # -- Pandoc is a command-line program running in its own memory-space. It can therefore only
+    # -- operate on files on the filesystem. If the source document was passed as `file`, write
+    # -- it to `target_dir/document.odt` and use that path as the source-path.
+    source_file_path = f"{target_dir}/document.odt" if file is not None else cast(str, filename)
+    if file is not None:
+        with open(source_file_path, "wb") as f:
+            f.write(file.read())
+
+    # -- Compute the path of the resulting .docx document. We want its file-name to be preserved
+    # -- if the source-document was provided as `filename`.
+    # -- a/b/foo.odt -> foo.odt --
+    file_name = os.path.basename(source_file_path)
+    # -- foo.odt -> foo --
+    base_name, _ = os.path.splitext(file_name)
+    # -- foo -> foo.docx --
+    target_docx_path = os.path.join(target_dir, f"{base_name}.docx")
+
+    import pypandoc
+
+    pypandoc.convert_file(
+        source_file_path,
+        "docx",
+        format="odt",
+        outputfile=target_docx_path,
+    )
+
+    return target_docx_path
```
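For reference, a hedged end-to-end usage example of the changed code path: calling `partition_odt()` with a file-like object now leaves nothing behind on disk once the call returns. The document path is illustrative, and pandoc/pypandoc must be installed for the ODT conversion to run.

```python
from unstructured.partition.odt import partition_odt

# Illustrative path; any .odt document works.
with open("example-docs/fake.odt", "rb") as f:
    elements = partition_odt(file=f)

# The TemporaryDirectory holding the ODT copy and the converted DOCX has already
# been removed by the time partition_odt() returns.
print([type(e).__name__ for e in elements])
```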