feat(docx): differentiate no-file from not-ZIP (#3306)

**Summary**
The `python-docx` error `docx.opc.exceptions.PackageNotFoundError`
arises both when no file exists at the given path and when the file
exists but is not a ZIP archive (and so is not a DOCX file).

This ambiguity is unwelcome when diagnosing the error as the two
possible conditions generally indicate a different course of action to
resolve the error.

Add detailed validation to `DocxPartitionerOptions` to distinguish these
two and provide more precise exception messages.

**Additional Context**
- `python-pptx` shares the same OPC-Package (file) loading code used by
`python-docx`, so the same ambiguity will be present in `python-pptx`.
- It would be preferable for this distinguished exception behavior to be
upstream in `python-docx` and `python-pptx`. If we're willing to take
the version bump it might be worth considering doing that instead.
This commit is contained in:
Steve Canny 2024-06-26 17:18:56 -07:00 committed by GitHub
parent 54ec311c55
commit 087adb218f
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 66 additions and 16 deletions

View File

@ -1,11 +1,9 @@
## 0.14.9-dev6 ## 0.14.9-dev7
### Enhancements ### Enhancements
* **Added visualization and OD model result dump for PDF** In PDF `hi_res` strategy the `analysis` parameter can be used * **Added visualization and OD model result dump for PDF** In PDF `hi_res` strategy the `analysis` parameter can be used to visualize the result of the OD model and dump the result to a file. Additionally, the visualization of bounding boxes of each layout source is rendered and saved for each page.
to visualize the result of the OD model and dump the result to a file. * **`partition_docx()` distinguishes "file not found" from "not a ZIP archive" error.** `partition_docx()` now provides different error messages for "file not found" and "file is not a ZIP archive (and therefore not a DOCX file)". This aids diagnosis since these two conditions generally point in different directions as to the cause and fix.
Additionally, the visualization of bounding boxes of each layout source is rendered and saved
for each page.
### Features ### Features

View File

@ -770,6 +770,19 @@ def opts_args() -> dict[str, Any]:
class DescribeDocxPartitionerOptions: class DescribeDocxPartitionerOptions:
"""Unit-test suite for `unstructured.partition.docx.DocxPartitionerOptions` objects.""" """Unit-test suite for `unstructured.partition.docx.DocxPartitionerOptions` objects."""
# -- .load() ---------------------------------
def it_provides_a_validating_constructor(self, opts_args: dict[str, Any]):
opts_args["file_path"] = example_doc_path("simple.docx")
opts = DocxPartitionerOptions.load(**opts_args)
assert isinstance(opts, DocxPartitionerOptions)
def and_it_raises_when_options_are_not_valid(self, opts_args: dict[str, Any]):
with pytest.raises(ValueError, match="no DOCX document specified, "):
DocxPartitionerOptions.load(**opts_args)
# -- .document ------------------------------- # -- .document -------------------------------
def it_loads_the_docx_document( def it_loads_the_docx_document(
@ -1024,13 +1037,31 @@ class DescribeDocxPartitionerOptions:
assert isinstance(docx_file, io.BytesIO) assert isinstance(docx_file, io.BytesIO)
assert docx_file.getvalue() == b"abcdefg" assert docx_file.getvalue() == b"abcdefg"
def but_it_raises_ValueError_when_neither_a_file_path_or_file_is_provided( # -- ._validate() ----------------------------
def it_raises_when_no_file_exists_at_file_path(self, opts_args: dict[str, Any]):
opts_args["file_path"] = "l/m/n.docx"
with pytest.raises(FileNotFoundError, match="no such file or directory: 'l/m/n.docx'"):
DocxPartitionerOptions.load(**opts_args)
def and_it_raises_when_the_file_at_file_path_is_not_a_ZIP_archive(
self, opts_args: dict[str, Any] self, opts_args: dict[str, Any]
): ):
opts = DocxPartitionerOptions(**opts_args) opts_args["file_path"] = example_doc_path("simple.doc")
with pytest.raises(ValueError, match=r"not a ZIP archive \(so not a DOCX file\): "):
DocxPartitionerOptions.load(**opts_args)
with pytest.raises(ValueError, match="No DOCX document specified, either `filename` or "): def and_it_raises_when_the_file_like_object_is_not_a_ZIP_archive(
opts._docx_file self, opts_args: dict[str, Any]
):
with open(example_doc_path("simple.doc"), "rb") as f:
opts_args["file"] = f
with pytest.raises(ValueError, match=r"not a ZIP archive \(so not a DOCX file\): "):
DocxPartitionerOptions.load(**opts_args)
def and_it_raises_when_neither_a_file_path_or_file_is_provided(self, opts_args: dict[str, Any]):
with pytest.raises(ValueError, match="no DOCX document specified, either `filename` or "):
DocxPartitionerOptions.load(**opts_args)
# -- fixtures -------------------------------------------------------------------------------- # -- fixtures --------------------------------------------------------------------------------

View File

@ -1 +1 @@
__version__ = "0.14.9-dev6" # pragma: no cover __version__ = "0.14.9-dev7" # pragma: no cover

View File

@ -5,7 +5,9 @@ from __future__ import annotations
import html import html
import io import io
import itertools import itertools
import os
import tempfile import tempfile
import zipfile
from typing import IO, Any, Iterator, Optional, Protocol, Type from typing import IO, Any, Iterator, Optional, Protocol, Type
# -- CT_* stands for "complex-type", an XML element type in docx parlance -- # -- CT_* stands for "complex-type", an XML element type in docx parlance --
@ -155,7 +157,7 @@ def partition_docx(
Assign this number to the first page of this document and increment the page number from Assign this number to the first page of this document and increment the page number from
there. there.
""" """
opts = DocxPartitionerOptions( opts = DocxPartitionerOptions.load(
date_from_file_object=date_from_file_object, date_from_file_object=date_from_file_object,
file=file, file=file,
file_path=filename, file_path=filename,
@ -214,6 +216,11 @@ class DocxPartitionerOptions:
# -- options object maintains page-number state -- # -- options object maintains page-number state --
self._page_counter = starting_page_number self._page_counter = starting_page_number
@classmethod
def load(cls, **kwargs: Any) -> DocxPartitionerOptions:
"""Construct and validate an instance."""
return cls(**kwargs)._validate()
@classmethod @classmethod
def register_picture_partitioner(cls, picture_partitioner: PicturePartitionerT): def register_picture_partitioner(cls, picture_partitioner: PicturePartitionerT):
"""Specify a pluggable sub-partitioner to extract images from DOCX paragraphs.""" """Specify a pluggable sub-partitioner to extract images from DOCX paragraphs."""
@ -358,12 +365,26 @@ class DocxPartitionerOptions:
self._file.seek(0) self._file.seek(0)
return io.BytesIO(self._file.read()) return io.BytesIO(self._file.read())
if self._file: assert self._file is not None # -- assured by `._validate()` --
return self._file return self._file
raise ValueError( def _validate(self) -> DocxPartitionerOptions:
"No DOCX document specified, either `filename` or `file` argument must be provided" """Raise on first invalide option, return self otherwise."""
) # -- provide distinguished error between "file-not-found" and "not-a-DOCX-file" --
if self._file_path:
if not os.path.isfile(self._file_path):
raise FileNotFoundError(f"no such file or directory: {repr(self._file_path)}")
if not zipfile.is_zipfile(self._file_path):
raise ValueError(f"not a ZIP archive (so not a DOCX file): {repr(self._file_path)}")
elif self._file:
if not zipfile.is_zipfile(self._file):
raise ValueError(f"not a ZIP archive (so not a DOCX file): {repr(self._file)}")
else:
raise ValueError(
"no DOCX document specified, either `filename` or `file` argument must be provided"
)
return self
class _DocxPartitioner: class _DocxPartitioner: