feat(docx): differentiate no-file from not-ZIP (#3306)

**Summary** The `python-docx` error `docx.opc.exceptions.PackageNotFoundError` arises both when no file exists at the given path and when the file exists but is not a ZIP archive (and so is not a DOCX file). This ambiguity hampers diagnosis because the two conditions generally call for different courses of action to resolve the error. Add detailed validation to `DocxPartitionerOptions` to distinguish the two conditions and provide more precise exception messages. **Additional Context** - `python-pptx` shares the same OPC-Package (file) loading code used by `python-docx`, so the same ambiguity will be present in `python-pptx`. - It would be preferable for these distinct exceptions to be raised upstream in `python-docx` and `python-pptx`. If we're willing to take the version bump, it might be worth considering doing that instead.
2025-10-07 14:27:19 +00:00 · 2024-06-26 17:18:56 -07:00 · 2024-06-26 17:18:56 -07:00 · 087adb218f
commit 087adb218f
parent 54ec311c55
4 changed files with 66 additions and 16 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -1,11 +1,9 @@
-## 0.14.9-dev6
+## 0.14.9-dev7
 ### Enhancements
-* **Added visualization and OD model result dump for PDF** In PDF `hi_res` strategy the `analysis` parameter can be used
+* **Added visualization and OD model result dump for PDF** In PDF `hi_res` strategy the `analysis` parameter can be used to visualize the result of the OD model and dump the result to a file. Additionally, the visualization of bounding boxes of each layout source is rendered and saved for each page.
-  to visualize the result of the OD model and dump the result to a file.
+* **`partition_docx()` distinguishes "file not found" from "not a ZIP archive" errors.** `partition_docx()` now provides different error messages for "file not found" and "file is not a ZIP archive (and therefore not a DOCX file)". This aids diagnosis since these two conditions generally point in different directions as to the cause and fix.
  Additionally, the visualization of bounding boxes of each layout source is rendered and saved
  for each page.
 ### Features
--- a/test_unstructured/partition/test_docx.py
+++ b/test_unstructured/partition/test_docx.py
@ -770,6 +770,19 @@ def opts_args() -> dict[str, Any]:
 class DescribeDocxPartitionerOptions:
    """Unit-test suite for `unstructured.partition.docx.DocxPartitionerOptions` objects."""
    # -- .load() ---------------------------------
    def it_provides_a_validating_constructor(self, opts_args: dict[str, Any]):
        # -- happy path: with a `file_path` supplied, `.load()` must return an options object --
        opts_args["file_path"] = example_doc_path("simple.docx")
        opts = DocxPartitionerOptions.load(**opts_args)
        assert isinstance(opts, DocxPartitionerOptions)
    def and_it_raises_when_options_are_not_valid(self, opts_args: dict[str, Any]):
        # -- `opts_args` is passed through unmodified; assumes the fixture supplies neither a
        # -- `file` nor a `file_path` value (TODO confirm against the fixture definition) --
        with pytest.raises(ValueError, match="no DOCX document specified, "):
            DocxPartitionerOptions.load(**opts_args)
    # -- .document -------------------------------
    def it_loads_the_docx_document(
@ -1024,13 +1037,31 @@ class DescribeDocxPartitionerOptions:
        assert isinstance(docx_file, io.BytesIO)
        assert docx_file.getvalue() == b"abcdefg"
-    def but_it_raises_ValueError_when_neither_a_file_path_or_file_is_provided(
+    # -- ._validate() ----------------------------
    def it_raises_when_no_file_exists_at_file_path(self, opts_args: dict[str, Any]):
        # -- the expected message includes the offending path wrapped in single quotes --
        opts_args["file_path"] = "l/m/n.docx"
        with pytest.raises(FileNotFoundError, match="no such file or directory: 'l/m/n.docx'"):
            DocxPartitionerOptions.load(**opts_args)
    def and_it_raises_when_the_file_at_file_path_is_not_a_ZIP_archive(
        self, opts_args: dict[str, Any]
    ):
-        opts = DocxPartitionerOptions(**opts_args)
+        opts_args["file_path"] = example_doc_path("simple.doc")
        with pytest.raises(ValueError, match=r"not a ZIP archive \(so not a DOCX file\): "):
            DocxPartitionerOptions.load(**opts_args)
-        with pytest.raises(ValueError, match="No DOCX document specified, either `filename` or "):
+    def and_it_raises_when_the_file_like_object_is_not_a_ZIP_archive(
-            opts._docx_file
+        self, opts_args: dict[str, Any]
    ):
        with open(example_doc_path("simple.doc"), "rb") as f:
            opts_args["file"] = f
            with pytest.raises(ValueError, match=r"not a ZIP archive \(so not a DOCX file\): "):
                DocxPartitionerOptions.load(**opts_args)
    def and_it_raises_when_neither_a_file_path_or_file_is_provided(self, opts_args: dict[str, Any]):
        # -- `opts_args` is used as provided; assumes the fixture leaves both `file` and
        # -- `file_path` unset (TODO confirm against the fixture definition) --
        with pytest.raises(ValueError, match="no DOCX document specified, either `filename` or "):
            DocxPartitionerOptions.load(**opts_args)
    # -- fixtures --------------------------------------------------------------------------------
--- a/unstructured/version.py
+++ b/unstructured/version.py
@ -1 +1 @@
-__version__ = "0.14.9-dev6"  # pragma: no cover
+__version__ = "0.14.9-dev7"  # pragma: no cover
--- a/unstructured/partition/docx.py
+++ b/unstructured/partition/docx.py
@ -5,7 +5,9 @@ from __future__ import annotations
 import html
 import io
 import itertools
 import os
 import tempfile
 import zipfile
 from typing import IO, Any, Iterator, Optional, Protocol, Type
 # -- CT_* stands for "complex-type", an XML element type in docx parlance --
@ -155,7 +157,7 @@ def partition_docx(
        Assign this number to the first page of this document and increment the page number from
        there.
    """
-    opts = DocxPartitionerOptions(
+    opts = DocxPartitionerOptions.load(
        date_from_file_object=date_from_file_object,
        file=file,
        file_path=filename,
@ -214,6 +216,11 @@ class DocxPartitionerOptions:
        # -- options object maintains page-number state --
        self._page_counter = starting_page_number
    @classmethod
    def load(cls, **kwargs: Any) -> DocxPartitionerOptions:
        """Construct and validate an instance.

        All keyword arguments are passed unchanged to the constructor. The new instance is then
        checked with `._validate()` and the value that call returns is handed back to the caller.

        Any exception raised by `._validate()` propagates to the caller.
        """
        # -- validation runs eagerly, as part of construction --
        return cls(**kwargs)._validate()
    @classmethod
    def register_picture_partitioner(cls, picture_partitioner: PicturePartitionerT):
        """Specify a pluggable sub-partitioner to extract images from DOCX paragraphs."""
@ -358,13 +365,27 @@ class DocxPartitionerOptions:
            self._file.seek(0)
            return io.BytesIO(self._file.read())
-        if self._file:
+        assert self._file is not None  # -- assured by `._validate()` --
        return self._file
    def _validate(self) -> DocxPartitionerOptions:
        """Raise on first invalid option, return self otherwise."""
        # -- provide distinguished error between "file-not-found" and "not-a-DOCX-file" --
        if self._file_path:
            if not os.path.isfile(self._file_path):
                raise FileNotFoundError(f"no such file or directory: {repr(self._file_path)}")
            if not zipfile.is_zipfile(self._file_path):
                raise ValueError(f"not a ZIP archive (so not a DOCX file): {repr(self._file_path)}")
        elif self._file:
            if not zipfile.is_zipfile(self._file):
                raise ValueError(f"not a ZIP archive (so not a DOCX file): {repr(self._file)}")
        else:
            raise ValueError(
-            "No DOCX document specified, either `filename` or `file` argument must be provided"
+                "no DOCX document specified, either `filename` or `file` argument must be provided"
            )
        return self
 class _DocxPartitioner:
    """Provides `.partition()` for MS-Word 2007+ (.docx) files."""
`@ -1 +1 @@`
	`__version__ = "0.14.9-dev6" # pragma: no cover`	`__version__ = "0.14.9-dev7" # pragma: no cover`