From 087adb218fa2cb4b674ec9ee806071c7b8052ef3 Mon Sep 17 00:00:00 2001 From: Steve Canny Date: Wed, 26 Jun 2024 17:18:56 -0700 Subject: [PATCH] feat(docx): differentiate no-file from not-ZIP (#3306) **Summary** The `python-docx` error `docx.opc.exceptions.PackageNotFoundError` arises both when no file exists at the given path and when the file exists but is not a ZIP archive (and so is not a DOCX file). This ambiguity is unwelcome when diagnosing the error as the two possible conditions generally indicate a different course of action to resolve the error. Add detailed validation to `DocxPartitionerOptions` to distinguish these two and provide more precise exception messages. **Additional Context** - `python-pptx` shares the same OPC-Package (file) loading code used by `python-docx`, so the same ambiguity will be present in `python-pptx`. - It would be preferable for this distinguished exception behavior to be upstream in `python-docx` and `python-pptx`. If we're willing to take the version bump it might be worth considering doing that instead. --- CHANGELOG.md | 8 ++--- test_unstructured/partition/test_docx.py | 39 +++++++++++++++++++++--- unstructured/__version__.py | 2 +- unstructured/partition/docx.py | 33 ++++++++++++++++---- 4 files changed, 66 insertions(+), 16 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4c342ada6..06c108f98 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,11 +1,9 @@ -## 0.14.9-dev6 +## 0.14.9-dev7 ### Enhancements -* **Added visualization and OD model result dump for PDF** In PDF `hi_res` strategy the `analysis` parameter can be used - to visualize the result of the OD model and dump the result to a file. - Additionally, the visualization of bounding boxes of each layout source is rendered and saved - for each page. +* **Added visualization and OD model result dump for PDF** In PDF `hi_res` strategy the `analysis` parameter can be used to visualize the result of the OD model and dump the result to a file. Additionally, the visualization of bounding boxes of each layout source is rendered and saved for each page. +* **`partition_docx()` distinguishes "file not found" from "not a ZIP archive" error.** `partition_docx()` now provides different error messages for "file not found" and "file is not a ZIP archive (and therefore not a DOCX file)". This aids diagnosis since these two conditions generally point in different directions as to the cause and fix. ### Features diff --git a/test_unstructured/partition/test_docx.py b/test_unstructured/partition/test_docx.py index c5b4462c4..996dbe219 100644 --- a/test_unstructured/partition/test_docx.py +++ b/test_unstructured/partition/test_docx.py @@ -770,6 +770,19 @@ def opts_args() -> dict[str, Any]: class DescribeDocxPartitionerOptions: """Unit-test suite for `unstructured.partition.docx.DocxPartitionerOptions` objects.""" + # -- .load() --------------------------------- + + def it_provides_a_validating_constructor(self, opts_args: dict[str, Any]): + opts_args["file_path"] = example_doc_path("simple.docx") + + opts = DocxPartitionerOptions.load(**opts_args) + + assert isinstance(opts, DocxPartitionerOptions) + + def and_it_raises_when_options_are_not_valid(self, opts_args: dict[str, Any]): + with pytest.raises(ValueError, match="no DOCX document specified, "): + DocxPartitionerOptions.load(**opts_args) + # -- .document ------------------------------- def it_loads_the_docx_document( @@ -1024,13 +1037,31 @@ class DescribeDocxPartitionerOptions: assert isinstance(docx_file, io.BytesIO) assert docx_file.getvalue() == b"abcdefg" - def but_it_raises_ValueError_when_neither_a_file_path_or_file_is_provided( + # -- ._validate() ---------------------------- + + def it_raises_when_no_file_exists_at_file_path(self, opts_args: dict[str, Any]): + opts_args["file_path"] = "l/m/n.docx" + with pytest.raises(FileNotFoundError, match="no such file or directory: 'l/m/n.docx'"): + DocxPartitionerOptions.load(**opts_args) + + def and_it_raises_when_the_file_at_file_path_is_not_a_ZIP_archive( self, opts_args: dict[str, Any] ): - opts = DocxPartitionerOptions(**opts_args) + opts_args["file_path"] = example_doc_path("simple.doc") + with pytest.raises(ValueError, match=r"not a ZIP archive \(so not a DOCX file\): "): + DocxPartitionerOptions.load(**opts_args) - with pytest.raises(ValueError, match="No DOCX document specified, either `filename` or "): - opts._docx_file + def and_it_raises_when_the_file_like_object_is_not_a_ZIP_archive( + self, opts_args: dict[str, Any] + ): + with open(example_doc_path("simple.doc"), "rb") as f: + opts_args["file"] = f + with pytest.raises(ValueError, match=r"not a ZIP archive \(so not a DOCX file\): "): + DocxPartitionerOptions.load(**opts_args) + + def and_it_raises_when_neither_a_file_path_or_file_is_provided(self, opts_args: dict[str, Any]): + with pytest.raises(ValueError, match="no DOCX document specified, either `filename` or "): + DocxPartitionerOptions.load(**opts_args) # -- fixtures -------------------------------------------------------------------------------- diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 76c077cd3..9a5c41d6d 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.14.9-dev6" # pragma: no cover +__version__ = "0.14.9-dev7" # pragma: no cover diff --git a/unstructured/partition/docx.py b/unstructured/partition/docx.py index 9ccf2ee9a..8092ac321 100644 --- a/unstructured/partition/docx.py +++ b/unstructured/partition/docx.py @@ -5,7 +5,9 @@ from __future__ import annotations import html import io import itertools +import os import tempfile +import zipfile from typing import IO, Any, Iterator, Optional, Protocol, Type # -- CT_* stands for "complex-type", an XML element type in docx parlance -- @@ -155,7 +157,7 @@ def partition_docx( Assign this number to the first page of this document and increment the page number from there. """ - opts = DocxPartitionerOptions( + opts = DocxPartitionerOptions.load( date_from_file_object=date_from_file_object, file=file, file_path=filename, @@ -214,6 +216,11 @@ class DocxPartitionerOptions: # -- options object maintains page-number state -- self._page_counter = starting_page_number + @classmethod + def load(cls, **kwargs: Any) -> DocxPartitionerOptions: + """Construct and validate an instance.""" + return cls(**kwargs)._validate() + @classmethod def register_picture_partitioner(cls, picture_partitioner: PicturePartitionerT): """Specify a pluggable sub-partitioner to extract images from DOCX paragraphs.""" @@ -358,12 +365,26 @@ class DocxPartitionerOptions: self._file.seek(0) return io.BytesIO(self._file.read()) - if self._file: - return self._file + assert self._file is not None # -- assured by `._validate()` -- + return self._file - raise ValueError( - "No DOCX document specified, either `filename` or `file` argument must be provided" - ) + def _validate(self) -> DocxPartitionerOptions: + """Raise on first invalide option, return self otherwise.""" + # -- provide distinguished error between "file-not-found" and "not-a-DOCX-file" -- + if self._file_path: + if not os.path.isfile(self._file_path): + raise FileNotFoundError(f"no such file or directory: {repr(self._file_path)}") + if not zipfile.is_zipfile(self._file_path): + raise ValueError(f"not a ZIP archive (so not a DOCX file): {repr(self._file_path)}") + elif self._file: + if not zipfile.is_zipfile(self._file): + raise ValueError(f"not a ZIP archive (so not a DOCX file): {repr(self._file)}") + else: + raise ValueError( + "no DOCX document specified, either `filename` or `file` argument must be provided" + ) + + return self class _DocxPartitioner: