mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-08-16 12:49:12 +00:00
feat(docx): differentiate no-file from not-ZIP (#3306)
**Summary** The `python-docx` error `docx.opc.exceptions.PackageNotFoundError` arises both when no file exists at the given path and when the file exists but is not a ZIP archive (and so is not a DOCX file). This ambiguity is unwelcome when diagnosing the error as the two possible conditions generally indicate a different course of action to resolve the error. Add detailed validation to `DocxPartitionerOptions` to distinguish these two and provide more precise exception messages. **Additional Context** - `python-pptx` shares the same OPC-Package (file) loading code used by `python-docx`, so the same ambiguity will be present in `python-pptx`. - It would be preferable for this distinguished exception behavior to be upstream in `python-docx` and `python-pptx`. If we're willing to take the version bump it might be worth considering doing that instead.
This commit is contained in:
parent
54ec311c55
commit
087adb218f
@ -1,11 +1,9 @@
|
|||||||
## 0.14.9-dev6
|
## 0.14.9-dev7
|
||||||
|
|
||||||
### Enhancements
|
### Enhancements
|
||||||
|
|
||||||
* **Added visualization and OD model result dump for PDF** In PDF `hi_res` strategy the `analysis` parameter can be used
|
* **Added visualization and OD model result dump for PDF** In PDF `hi_res` strategy the `analysis` parameter can be used to visualize the result of the OD model and dump the result to a file. Additionally, the visualization of bounding boxes of each layout source is rendered and saved for each page.
|
||||||
to visualize the result of the OD model and dump the result to a file.
|
* **`partition_docx()` distinguishes "file not found" from "not a ZIP archive" error.** `partition_docx()` now provides different error messages for "file not found" and "file is not a ZIP archive (and therefore not a DOCX file)". This aids diagnosis since these two conditions generally point in different directions as to the cause and fix.
|
||||||
Additionally, the visualization of bounding boxes of each layout source is rendered and saved
|
|
||||||
for each page.
|
|
||||||
|
|
||||||
### Features
|
### Features
|
||||||
|
|
||||||
|
@ -770,6 +770,19 @@ def opts_args() -> dict[str, Any]:
|
|||||||
class DescribeDocxPartitionerOptions:
|
class DescribeDocxPartitionerOptions:
|
||||||
"""Unit-test suite for `unstructured.partition.docx.DocxPartitionerOptions` objects."""
|
"""Unit-test suite for `unstructured.partition.docx.DocxPartitionerOptions` objects."""
|
||||||
|
|
||||||
|
# -- .load() ---------------------------------
|
||||||
|
|
||||||
|
def it_provides_a_validating_constructor(self, opts_args: dict[str, Any]):
|
||||||
|
opts_args["file_path"] = example_doc_path("simple.docx")
|
||||||
|
|
||||||
|
opts = DocxPartitionerOptions.load(**opts_args)
|
||||||
|
|
||||||
|
assert isinstance(opts, DocxPartitionerOptions)
|
||||||
|
|
||||||
|
def and_it_raises_when_options_are_not_valid(self, opts_args: dict[str, Any]):
|
||||||
|
with pytest.raises(ValueError, match="no DOCX document specified, "):
|
||||||
|
DocxPartitionerOptions.load(**opts_args)
|
||||||
|
|
||||||
# -- .document -------------------------------
|
# -- .document -------------------------------
|
||||||
|
|
||||||
def it_loads_the_docx_document(
|
def it_loads_the_docx_document(
|
||||||
@ -1024,13 +1037,31 @@ class DescribeDocxPartitionerOptions:
|
|||||||
assert isinstance(docx_file, io.BytesIO)
|
assert isinstance(docx_file, io.BytesIO)
|
||||||
assert docx_file.getvalue() == b"abcdefg"
|
assert docx_file.getvalue() == b"abcdefg"
|
||||||
|
|
||||||
def but_it_raises_ValueError_when_neither_a_file_path_or_file_is_provided(
|
# -- ._validate() ----------------------------
|
||||||
|
|
||||||
|
def it_raises_when_no_file_exists_at_file_path(self, opts_args: dict[str, Any]):
|
||||||
|
opts_args["file_path"] = "l/m/n.docx"
|
||||||
|
with pytest.raises(FileNotFoundError, match="no such file or directory: 'l/m/n.docx'"):
|
||||||
|
DocxPartitionerOptions.load(**opts_args)
|
||||||
|
|
||||||
|
def and_it_raises_when_the_file_at_file_path_is_not_a_ZIP_archive(
|
||||||
self, opts_args: dict[str, Any]
|
self, opts_args: dict[str, Any]
|
||||||
):
|
):
|
||||||
opts = DocxPartitionerOptions(**opts_args)
|
opts_args["file_path"] = example_doc_path("simple.doc")
|
||||||
|
with pytest.raises(ValueError, match=r"not a ZIP archive \(so not a DOCX file\): "):
|
||||||
|
DocxPartitionerOptions.load(**opts_args)
|
||||||
|
|
||||||
with pytest.raises(ValueError, match="No DOCX document specified, either `filename` or "):
|
def and_it_raises_when_the_file_like_object_is_not_a_ZIP_archive(
|
||||||
opts._docx_file
|
self, opts_args: dict[str, Any]
|
||||||
|
):
|
||||||
|
with open(example_doc_path("simple.doc"), "rb") as f:
|
||||||
|
opts_args["file"] = f
|
||||||
|
with pytest.raises(ValueError, match=r"not a ZIP archive \(so not a DOCX file\): "):
|
||||||
|
DocxPartitionerOptions.load(**opts_args)
|
||||||
|
|
||||||
|
def and_it_raises_when_neither_a_file_path_or_file_is_provided(self, opts_args: dict[str, Any]):
|
||||||
|
with pytest.raises(ValueError, match="no DOCX document specified, either `filename` or "):
|
||||||
|
DocxPartitionerOptions.load(**opts_args)
|
||||||
|
|
||||||
# -- fixtures --------------------------------------------------------------------------------
|
# -- fixtures --------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
@ -1 +1 @@
|
|||||||
__version__ = "0.14.9-dev6" # pragma: no cover
|
__version__ = "0.14.9-dev7" # pragma: no cover
|
||||||
|
@ -5,7 +5,9 @@ from __future__ import annotations
|
|||||||
import html
|
import html
|
||||||
import io
|
import io
|
||||||
import itertools
|
import itertools
|
||||||
|
import os
|
||||||
import tempfile
|
import tempfile
|
||||||
|
import zipfile
|
||||||
from typing import IO, Any, Iterator, Optional, Protocol, Type
|
from typing import IO, Any, Iterator, Optional, Protocol, Type
|
||||||
|
|
||||||
# -- CT_* stands for "complex-type", an XML element type in docx parlance --
|
# -- CT_* stands for "complex-type", an XML element type in docx parlance --
|
||||||
@ -155,7 +157,7 @@ def partition_docx(
|
|||||||
Assign this number to the first page of this document and increment the page number from
|
Assign this number to the first page of this document and increment the page number from
|
||||||
there.
|
there.
|
||||||
"""
|
"""
|
||||||
opts = DocxPartitionerOptions(
|
opts = DocxPartitionerOptions.load(
|
||||||
date_from_file_object=date_from_file_object,
|
date_from_file_object=date_from_file_object,
|
||||||
file=file,
|
file=file,
|
||||||
file_path=filename,
|
file_path=filename,
|
||||||
@ -214,6 +216,11 @@ class DocxPartitionerOptions:
|
|||||||
# -- options object maintains page-number state --
|
# -- options object maintains page-number state --
|
||||||
self._page_counter = starting_page_number
|
self._page_counter = starting_page_number
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def load(cls, **kwargs: Any) -> DocxPartitionerOptions:
|
||||||
|
"""Construct and validate an instance."""
|
||||||
|
return cls(**kwargs)._validate()
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def register_picture_partitioner(cls, picture_partitioner: PicturePartitionerT):
|
def register_picture_partitioner(cls, picture_partitioner: PicturePartitionerT):
|
||||||
"""Specify a pluggable sub-partitioner to extract images from DOCX paragraphs."""
|
"""Specify a pluggable sub-partitioner to extract images from DOCX paragraphs."""
|
||||||
@ -358,13 +365,27 @@ class DocxPartitionerOptions:
|
|||||||
self._file.seek(0)
|
self._file.seek(0)
|
||||||
return io.BytesIO(self._file.read())
|
return io.BytesIO(self._file.read())
|
||||||
|
|
||||||
if self._file:
|
assert self._file is not None # -- assured by `._validate()` --
|
||||||
return self._file
|
return self._file
|
||||||
|
|
||||||
|
def _validate(self) -> DocxPartitionerOptions:
|
||||||
|
"""Raise on first invalide option, return self otherwise."""
|
||||||
|
# -- provide distinguished error between "file-not-found" and "not-a-DOCX-file" --
|
||||||
|
if self._file_path:
|
||||||
|
if not os.path.isfile(self._file_path):
|
||||||
|
raise FileNotFoundError(f"no such file or directory: {repr(self._file_path)}")
|
||||||
|
if not zipfile.is_zipfile(self._file_path):
|
||||||
|
raise ValueError(f"not a ZIP archive (so not a DOCX file): {repr(self._file_path)}")
|
||||||
|
elif self._file:
|
||||||
|
if not zipfile.is_zipfile(self._file):
|
||||||
|
raise ValueError(f"not a ZIP archive (so not a DOCX file): {repr(self._file)}")
|
||||||
|
else:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
"No DOCX document specified, either `filename` or `file` argument must be provided"
|
"no DOCX document specified, either `filename` or `file` argument must be provided"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
return self
|
||||||
|
|
||||||
|
|
||||||
class _DocxPartitioner:
|
class _DocxPartitioner:
|
||||||
"""Provides `.partition()` for MS-Word 2007+ (.docx) files."""
|
"""Provides `.partition()` for MS-Word 2007+ (.docx) files."""
|
||||||
|
Loading…
x
Reference in New Issue
Block a user