diff --git a/CHANGELOG.md b/CHANGELOG.md index d451a6ba7..f402924e8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,11 @@ +## 0.15.1-dev0 + +### Enhancements + +### Features + +### Fixes + ## 0.15.0 ### Enhancements diff --git a/test_unstructured/file_utils/test_model.py b/test_unstructured/file_utils/test_model.py index b3a67f152..91d2b8bec 100644 --- a/test_unstructured/file_utils/test_model.py +++ b/test_unstructured/file_utils/test_model.py @@ -50,6 +50,68 @@ class DescribeFileType: def but_not_when_that_mime_type_is_not_registered_by_a_file_type(self, mime_type: str): assert FileType.from_mime_type(mime_type) is None + @pytest.mark.parametrize( + ("file_type", "expected_value"), + [ + (FileType.BMP, ("unstructured_inference",)), + (FileType.CSV, ("pandas",)), + (FileType.DOC, ("docx",)), + (FileType.EMPTY, ()), + (FileType.HTML, ()), + (FileType.ODT, ("docx", "pypandoc")), + (FileType.PDF, ("pdf2image", "pdfminer", "PIL")), + (FileType.UNK, ()), + (FileType.WAV, ()), + (FileType.ZIP, ()), + ], + ) + def it_knows_which_importable_packages_its_partitioner_depends_on( + self, file_type: FileType, expected_value: tuple[str, ...] 
+ ): + assert file_type.importable_package_dependencies == expected_value + + @pytest.mark.parametrize( + ("file_type", "expected_value"), + [ + (FileType.BMP, "image"), + (FileType.DOC, "doc"), + (FileType.DOCX, "docx"), + (FileType.EML, None), + (FileType.EMPTY, None), + (FileType.MSG, "msg"), + (FileType.PDF, "pdf"), + (FileType.XLS, "xlsx"), + (FileType.UNK, None), + (FileType.WAV, None), + (FileType.ZIP, None), + ], + ) + def and_it_knows_which_pip_extra_needs_to_be_installed_to_get_those_dependencies( + self, file_type: FileType, expected_value: str | None + ): + assert file_type.extra_name == expected_value + + @pytest.mark.parametrize( + ("file_type", "expected_value"), + [ + (FileType.BMP, True), + (FileType.CSV, True), + (FileType.DOC, True), + (FileType.EML, True), + (FileType.JPG, True), + (FileType.PDF, True), + (FileType.PPTX, True), + (FileType.WAV, False), + (FileType.ZIP, False), + (FileType.EMPTY, False), + (FileType.UNK, False), + ], + ) + def it_knows_whether_files_of_its_type_are_directly_partitionable( + self, file_type: FileType, expected_value: str + ): + assert file_type.is_partitionable is expected_value + @pytest.mark.parametrize( ("file_type", "mime_type"), [ @@ -68,3 +130,72 @@ class DescribeFileType: ) def it_knows_its_canonical_MIME_type(self, file_type: FileType, mime_type: str): assert file_type.mime_type == mime_type + + @pytest.mark.parametrize( + ("file_type", "expected_value"), + [ + (FileType.BMP, "partition_image"), + (FileType.CSV, "partition_csv"), + (FileType.DOC, "partition_doc"), + (FileType.DOCX, "partition_docx"), + (FileType.JPG, "partition_image"), + (FileType.PNG, "partition_image"), + (FileType.TIFF, "partition_image"), + ], + ) + def it_knows_its_partitioner_function_name(self, file_type: FileType, expected_value: str): + assert file_type.partitioner_function_name == expected_value + + @pytest.mark.parametrize( + "file_type", [FileType.WAV, FileType.ZIP, FileType.EMPTY, FileType.UNK] + ) + def 
but_it_raises_on_partitioner_function_name_access_when_the_file_type_is_not_partitionable( + self, file_type: FileType + ): + with pytest.raises(ValueError, match="`.partitioner_function_name` is undefined because "): + file_type.partitioner_function_name + + @pytest.mark.parametrize( + ("file_type", "expected_value"), + [ + (FileType.BMP, "unstructured.partition.image"), + (FileType.CSV, "unstructured.partition.csv"), + (FileType.DOC, "unstructured.partition.doc"), + (FileType.DOCX, "unstructured.partition.docx"), + (FileType.JPG, "unstructured.partition.image"), + (FileType.PNG, "unstructured.partition.image"), + (FileType.TIFF, "unstructured.partition.image"), + ], + ) + def it_knows_the_fully_qualified_name_of_its_partitioner_module( + self, file_type: FileType, expected_value: str + ): + assert file_type.partitioner_module_qname == expected_value + + @pytest.mark.parametrize( + "file_type", [FileType.WAV, FileType.ZIP, FileType.EMPTY, FileType.UNK] + ) + def but_it_raises_on_partitioner_module_qname_access_when_the_file_type_is_not_partitionable( + self, file_type: FileType + ): + with pytest.raises(ValueError, match="`.partitioner_module_qname` is undefined because "): + file_type.partitioner_module_qname + + @pytest.mark.parametrize( + ("file_type", "expected_value"), + [ + (FileType.BMP, "image"), + (FileType.CSV, "csv"), + (FileType.DOC, "doc"), + (FileType.DOCX, "docx"), + (FileType.JPG, "image"), + (FileType.PNG, "image"), + (FileType.TIFF, "image"), + (FileType.XLS, "xlsx"), + (FileType.XLSX, "xlsx"), + ], + ) + def it_provides_access_to_the_partitioner_shortname( + self, file_type: FileType, expected_value: str + ): + assert file_type.partitioner_shortname == expected_value diff --git a/test_unstructured/partition/test_auto.py b/test_unstructured/partition/test_auto.py index ef10b9ede..a09d45f2d 100644 --- a/test_unstructured/partition/test_auto.py +++ b/test_unstructured/partition/test_auto.py @@ -10,7 +10,7 @@ import sys import tempfile import 
warnings from importlib import import_module -from typing import Callable, Iterator, cast +from typing import Iterator, cast from unittest.mock import patch import pytest @@ -27,7 +27,6 @@ from test_unstructured.unit_utils import ( ANY, FixtureRequest, LogCaptureFixture, - MonkeyPatch, example_doc_path, function_mock, method_mock, @@ -46,8 +45,7 @@ from unstructured.documents.elements import ( Title, ) from unstructured.file_utils.model import FileType -from unstructured.partition import auto -from unstructured.partition.auto import IMAGE_FILETYPES, _get_partition_with_extras, partition +from unstructured.partition.auto import _PartitionerLoader, partition from unstructured.partition.utils.constants import PartitionStrategy from unstructured.staging.base import elements_from_json, elements_to_dicts, elements_to_json @@ -570,16 +568,21 @@ def test_auto_partition_pdf_from_file(pass_metadata_filename: bool, content_type assert e.text.startswith("Zejiang Shen") -def test_auto_partition_pdf_with_fast_strategy(monkeypatch: MonkeyPatch): +def test_auto_partition_pdf_with_fast_strategy(request: FixtureRequest): + partition_pdf_ = function_mock( + request, + "unstructured.partition.pdf.partition_pdf", + return_value=[NarrativeText("Hello there!")], + ) + partitioner_loader_get_ = method_mock( + request, _PartitionerLoader, "get", return_value=partition_pdf_ + ) file_path = example_doc_path("pdf/layout-parser-paper-fast.pdf") - mock_return = [NarrativeText("Hello there!")] - with patch.object(auto, "partition_pdf", return_value=mock_return) as mock_partition: - mock_partition_with_extras_map = {"pdf": mock_partition} - monkeypatch.setattr(auto, "PARTITION_WITH_EXTRAS_MAP", mock_partition_with_extras_map) - partition(filename=file_path, strategy=PartitionStrategy.FAST) + partition(file_path, strategy=PartitionStrategy.FAST) - mock_partition.assert_called_once_with( + partitioner_loader_get_.assert_called_once_with(ANY, FileType.PDF) + partition_pdf_.assert_called_once_with( 
filename=file_path, file=None, url=None, @@ -919,10 +922,10 @@ def test_auto_partition_xml_from_file_with_tags(): def test_auto_partition_raises_with_bad_type(request: FixtureRequest): detect_filetype_ = function_mock( - request, "unstructured.partition.auto.detect_filetype", return_value=None + request, "unstructured.partition.auto.detect_filetype", return_value=FileType.UNK ) - with pytest.raises(ValueError, match="Invalid file made-up.fake. The None file type is not "): + with pytest.raises(ValueError, match="Invalid file made-up.fake. The FileType.UNK file type "): partition(filename="made-up.fake", strategy=PartitionStrategy.HI_RES) detect_filetype_.assert_called_once_with( @@ -1026,23 +1029,7 @@ def test_auto_partition_respects_detect_language_per_element_arg(): @pytest.mark.parametrize( - "file_extension", - [ - "doc", - "docx", - "eml", - "epub", - "html", - "md", - "odt", - "org", - "ppt", - "pptx", - "rst", - "rtf", - "txt", - "xml", - ], + "file_extension", "doc docx eml epub html md odt org ppt pptx rst rtf txt xml".split() ) def test_auto_partition_respects_language_arg(file_extension: str): elements = partition( @@ -1167,7 +1154,7 @@ def test_auto_partition_respects_skip_infer_table_types( @pytest.mark.parametrize( - ("content_type", "filetype_shortname", "expected_value"), + ("content_type", "shortname", "expected_value"), [ ("text/csv", "csv", "text/csv"), ("text/html", "html", "text/html"), @@ -1177,22 +1164,23 @@ def test_auto_partition_respects_skip_infer_table_types( def test_auto_partition_adds_filetype_to_metadata( request: FixtureRequest, content_type: str, - filetype_shortname: str, + shortname: str, expected_value: str | None, - monkeypatch: MonkeyPatch, ): partition_fn_ = function_mock( request, - f"unstructured.partition.auto.partition_{filetype_shortname}", + f"unstructured.partition.{shortname}.partition_{shortname}", return_value=[Text("text 1"), Text("text 2")], ) - mock_partition_with_extras_map = {filetype_shortname: partition_fn_} 
- monkeypatch.setattr(auto, "PARTITION_WITH_EXTRAS_MAP", mock_partition_with_extras_map) + partitioner_loader_get_ = method_mock( + request, _PartitionerLoader, "get", return_value=partition_fn_ + ) elements = partition( example_doc_path("pdf/layout-parser-paper-fast.pdf"), content_type=content_type ) + partitioner_loader_get_.assert_called_once() assert len(elements) == 2 assert all(e.metadata.filetype == expected_value for e in elements) @@ -1207,20 +1195,23 @@ def test_auto_partition_adds_filetype_to_metadata( ], ) def test_auto_partition_overwrites_any_filetype_applied_by_file_specific_partitioner( - request: FixtureRequest, content_type: str | None, monkeypatch: MonkeyPatch + request: FixtureRequest, content_type: str | None ): metadata = ElementMetadata(filetype="imapdf") partition_pdf_ = function_mock( request, - "unstructured.partition.auto.partition_pdf", + "unstructured.partition.pdf.partition_pdf", return_value=[Text("text 1", metadata=metadata), Text("text 2", metadata=metadata)], ) - monkeypatch.setattr(auto, "PARTITION_WITH_EXTRAS_MAP", {"pdf": partition_pdf_}) + partitioner_loader_get_ = method_mock( + request, _PartitionerLoader, "get", return_value=partition_pdf_ + ) elements = partition( example_doc_path("pdf/layout-parser-paper-fast.pdf"), content_type=content_type ) + partitioner_loader_get_.assert_called_once_with(ANY, FileType.PDF) assert len(elements) == 2 assert all(e.metadata.filetype == "application/pdf" for e in elements) @@ -1231,7 +1222,7 @@ def test_auto_partition_overwrites_any_filetype_applied_by_file_specific_partiti t for t in FileType if t not in (FileType.EMPTY, FileType.UNK, FileType.WAV, FileType.XLS, FileType.ZIP) - and t not in IMAGE_FILETYPES + and t.partitioner_function_name != "partition_image" ], ) def test_auto_partition_applies_the_correct_filetype_for_all_filetypes(filetype: FileType): @@ -1305,10 +1296,18 @@ def test_auto_partition_from_file_works_on_empty_file(): assert partition(file=f) == [] -def 
test_auto_partition_requiring_extras_prompts_to_install_missing_dependencies(): - partition_with_extras_map: dict[str, Callable[..., list[Element]]] = {} - with pytest.raises(ImportError, match="partition_pdf is not available. Install the pdf depen"): - _get_partition_with_extras("pdf", partition_with_extras_map) +def test_auto_partition_that_requires_extras_raises_when_dependencies_are_not_installed( + request: FixtureRequest, +): + _PartitionerLoader._partitioners.pop(FileType.PDF, None) + dependency_exists_ = function_mock( + request, "unstructured.partition.auto.dependency_exists", return_value=False + ) + match = r"partition_pdf\(\) is not available because one or more dependencies are not installed" + with pytest.raises(ImportError, match=match): + partition(example_doc_path("layout-parser-paper-fast.pdf")) + + dependency_exists_.assert_called_once_with("pdf2image") # ================================================================================================ diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 25f64ffb0..a9a0aaf8c 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.15.0" # pragma: no cover +__version__ = "0.15.1-dev0" # pragma: no cover diff --git a/unstructured/file_utils/filetype.py b/unstructured/file_utils/filetype.py index 9472a10b1..c11d81c6c 100644 --- a/unstructured/file_utils/filetype.py +++ b/unstructured/file_utils/filetype.py @@ -32,7 +32,7 @@ def detect_filetype( file: Optional[IO[bytes]] = None, file_filename: Optional[str] = None, encoding: Optional[str] = "utf-8", -) -> Optional[FileType]: +) -> FileType: """Use libmagic to determine a file's type. Helps determine which partition brick to use for a given file. 
A return value of None indicates @@ -122,7 +122,7 @@ def detect_filetype( ".tsv", ".json", ]: - return FileType.from_extension(extension) + return FileType.from_extension(extension) or FileType.TXT # NOTE(crag): for older versions of the OS libmagic package, such as is currently # installed on the Unstructured docker image, .json files resolve to "text/plain" diff --git a/unstructured/file_utils/model.py b/unstructured/file_utils/model.py index 0ec769b7c..6c285a704 100644 --- a/unstructured/file_utils/model.py +++ b/unstructured/file_utils/model.py @@ -12,7 +12,17 @@ class FileType(enum.Enum): Note not all of these can be partitioned, e.g. WAV and ZIP have no partitioner. """ + _partitioner_shortname: str | None + """Like "docx", from which partitioner module and function-name can be derived via template.""" + + _importable_package_dependencies: tuple[str, ...] + """Packages that must be available for import for this file-type's partitioner to work.""" + + _extra_name: str | None + """`pip install` extra that provides package dependencies for this file-type.""" + _extensions: tuple[str, ...] + """Filename-extensions recognized as this file-type. 
Use for secondary identification only.""" _canonical_mime_type: str """The MIME-type used as `.metadata.filetype` for this file-type.""" @@ -23,12 +33,18 @@ class FileType(enum.Enum): def __new__( cls, value: str, + partitioner_shortname: str | None, + importable_package_dependencies: Iterable[str], + extra_name: str | None, extensions: Iterable[str], canonical_mime_type: str, alias_mime_types: Iterable[str], ): self = object.__new__(cls) self._value_ = value + self._partitioner_shortname = partitioner_shortname + self._importable_package_dependencies = tuple(importable_package_dependencies) + self._extra_name = extra_name self._extensions = tuple(extensions) self._canonical_mime_type = canonical_mime_type self._alias_mime_types = tuple(alias_mime_types) @@ -41,8 +57,150 @@ class FileType(enum.Enum): """ return self.name < other.name - BMP = ("bmp", [".bmp"], "image/bmp", cast(list[str], [])) + @classmethod + def from_extension(cls, extension: str | None) -> FileType | None: + """Select a FileType member based on an extension. + + `extension` must include the leading period, like `".pdf"`. Extension is suitable as a + secondary file-type identification method but is unreliable for primary identification. + + Returns `None` when `extension` is not registered for any supported file-type. + """ + if extension in (None, "", "."): + return None + # -- not super efficient but plenty fast enough for once-or-twice-per-file use and avoids + # -- limitations on defining a class variable on an Enum. + for m in cls.__members__.values(): + if extension in m._extensions: + return m + return None + + @classmethod + def from_mime_type(cls, mime_type: str) -> FileType | None: + """Select a FileType member based on a MIME-type. + + Returns `None` when `mime_type` is `None` or does not map to the canonical MIME-type of a + `FileType` member or one of its alias MIME-types. 
+ """ + # -- not super efficient but plenty fast enough for once-or-twice-per-file use and avoids + # -- limitations on defining a class variable on an Enum. + for m in cls.__members__.values(): + if mime_type == m._canonical_mime_type or mime_type in m._alias_mime_types: + return m + return None + + @property + def extra_name(self) -> str | None: + """The `pip` "extra" that must be installed to provide this file-type's dependencies. + + Like "image" for PNG, as in `pip install "unstructured[image]"`. + + `None` when partitioning this file-type requires only the base `unstructured` install. + """ + return self._extra_name + + @property + def importable_package_dependencies(self) -> tuple[str, ...]: + """Packages that must be importable for this file-type's partitioner to work. + + In general, these are the packages provided by the `pip install` "extra" for this file-type, + like `pip install "unstructured[docx]"` loads the `python-docx` package. + + Note that these names are the ones used in an `import` statement, which is not necessarily + the same as the _distribution_ package name used by `pip`. For example, the DOCX + distribution package name is `"python-docx"` whereas the _importable_ package name is + `"docx"`. This latter name as it appears like `import docx` is what is provided by this + property. + + The return value is an empty tuple for file-types that do not require optional dependencies. + + Note this property does not complain when accessed on a non-partitionable file-type, it + simply returns an empty tuple because file-types that are not partitionable require no + optional dependencies. + """ + return self._importable_package_dependencies + + @property + def is_partitionable(self) -> bool: + """True when there is a partitioner for this file-type. + + Note this does not check whether the dependencies for this file-type are installed so + attempting to partition a file of this type may still fail. 
This is meant for + distinguishing file-types like WAV, ZIP, EMPTY, and UNK which are legitimate file-types + but have no associated partitioner. + """ + return bool(self._partitioner_shortname) + + @property + def mime_type(self) -> str: + """The canonical MIME-type for this file-type, suitable for use in metadata. + + This value is used in `.metadata.filetype` for elements partitioned from files of this + type. In general it is the "official", "recommended", or "de facto standard" MIME-type for + files of this type, in that order, as available. + """ + return self._canonical_mime_type + + @property + def partitioner_function_name(self) -> str: + """Name of partitioner function for this file-type. Like "partition_docx". + + Raises when this property is accessed on a file-type that is not partitionable. Use + `.is_partitionable` to avoid exceptions when partitionability is unknown. + """ + # -- Raise when this property is accessed on a FileType member that has no partitioner + # -- shortname. This prevents a harder-to-find bug from appearing far away from this call + # -- when code would try to `getattr(module, None)` or whatever. + if (shortname := self._partitioner_shortname) is None: + raise ValueError( + f"`.partitioner_function_name` is undefined because FileType.{self.name} is not" + f" partitionable. Use `.is_partitionable` to determine whether a `FileType`" + f" is partitionable." + ) + return f"partition_{shortname}" + + @property + def partitioner_module_qname(self) -> str: + """Fully-qualified name of module providing partitioner for this file-type. + + e.g. "unstructured.partition.docx" for FileType.DOCX. + """ + # -- Raise when this property is accessed on a FileType member that has no partitioner + # -- shortname. This prevents a harder-to-find bug from appearing far away from this call + # -- when code would try to `importlib.import_module(None)` or whatever. 
+ if (shortname := self._partitioner_shortname) is None: + raise ValueError( + f"`.partitioner_module_qname` is undefined because FileType.{self.name} is not" + f" partitionable. Use `.is_partitionable` to determine whether a `FileType`" + f" is partitionable." + ) + return f"unstructured.partition.{shortname}" + + @property + def partitioner_shortname(self) -> str | None: + """Familiar name of partitioner, like "image" for file-types that use `partition_image()`. + + One use is to determine whether a file-type is one of the five image types, all of which + are processed by `partition_image()`. + + `None` for file-types that are not partitionable, although `.is_partitionable` is the + preferred way of discovering that. + """ + return self._partitioner_shortname + + BMP = ( + "bmp", # -- value for this Enum member, like BMP = "bmp" in a simple enum -- + "image", # -- partitioner_shortname -- + ["unstructured_inference"], # -- importable_package_dependencies -- + "image", # -- extra_name - like `pip install "unstructured[image]"` in this case -- + [".bmp"], # -- extensions - filename extensions that map to this file-type -- + "image/bmp", # -- canonical_mime_type - MIME-type written to `.metadata.filetype` -- + cast(list[str], []), # -- alias_mime_types - other MIME-types that map to this file-type -- + ) CSV = ( + "csv", + "csv", + ["pandas"], "csv", [".csv"], "text/csv", @@ -54,38 +212,143 @@ class FileType(enum.Enum): "text/x-csv", ], ) - DOC = ("doc", [".doc"], "application/msword", cast(list[str], [])) + DOC = ("doc", "doc", ["docx"], "doc", [".doc"], "application/msword", cast(list[str], [])) DOCX = ( + "docx", + "docx", + ["docx"], "docx", [".docx"], "application/vnd.openxmlformats-officedocument.wordprocessingml.document", cast(list[str], []), ) - EML = ("eml", [".eml", ".p7s"], "message/rfc822", cast(list[str], [])) - EPUB = ("epub", [".epub"], "application/epub", ["application/epub+zip"]) - HEIC = ("heic", [".heic"], "image/heic", cast(list[str], [])) - 
HTML = ("html", [".html", ".htm"], "text/html", cast(list[str], [])) - JPG = ("jpg", [".jpeg", ".jpg"], "image/jpeg", cast(list[str], [])) - JSON = ("json", [".json"], "application/json", cast(list[str], [])) - MD = ("md", [".md"], "text/markdown", ["text/x-markdown"]) - MSG = ("msg", [".msg"], "application/vnd.ms-outlook", ["application/x-ole-storage"]) - ODT = ("odt", [".odt"], "application/vnd.oasis.opendocument.text", cast(list[str], [])) - ORG = ("org", [".org"], "text/org", cast(list[str], [])) - PDF = ("pdf", [".pdf"], "application/pdf", cast(list[str], [])) - PNG = ("png", [".png"], "image/png", cast(list[str], [])) - PPT = ("ppt", [".ppt"], "application/vnd.ms-powerpoint", cast(list[str], [])) + EML = ( + "eml", + "email", + cast(list[str], []), + None, + [".eml", ".p7s"], + "message/rfc822", + cast(list[str], []), + ) + EPUB = ( + "epub", + "epub", + ["pypandoc"], + "epub", + [".epub"], + "application/epub", + ["application/epub+zip"], + ) + HEIC = ( + "heic", + "image", + ["unstructured_inference"], + "image", + [".heic"], + "image/heic", + cast(list[str], []), + ) + HTML = ( + "html", + "html", + cast(list[str], []), + None, + [".html", ".htm"], + "text/html", + cast(list[str], []), + ) + JPG = ( + "jpg", + "image", + ["unstructured_inference"], + "image", + [".jpeg", ".jpg"], + "image/jpeg", + cast(list[str], []), + ) + JSON = ( + "json", + "json", + cast(list[str], []), + None, + [".json"], + "application/json", + cast(list[str], []), + ) + MD = ("md", "md", ["markdown"], "md", [".md"], "text/markdown", ["text/x-markdown"]) + MSG = ( + "msg", + "msg", + ["oxmsg"], + "msg", + [".msg"], + "application/vnd.ms-outlook", + ["application/x-ole-storage"], + ) + ODT = ( + "odt", + "odt", + ["docx", "pypandoc"], + "odt", + [".odt"], + "application/vnd.oasis.opendocument.text", + cast(list[str], []), + ) + ORG = ("org", "org", ["pypandoc"], "org", [".org"], "text/org", cast(list[str], [])) + PDF = ( + "pdf", + "pdf", + ["pdf2image", "pdfminer", "PIL"], + "pdf", 
+ [".pdf"], + "application/pdf", + cast(list[str], []), + ) + PNG = ( + "png", + "image", + ["unstructured_inference"], + "image", + [".png"], + "image/png", + cast(list[str], []), + ) + PPT = ( + "ppt", + "ppt", + ["pptx"], + "ppt", + [".ppt"], + "application/vnd.ms-powerpoint", + cast(list[str], []), + ) PPTX = ( + "pptx", + "pptx", + ["pptx"], "pptx", [".pptx"], "application/vnd.openxmlformats-officedocument.presentationml.presentation", cast(list[str], []), ) - RST = ("rst", [".rst"], "text/x-rst", cast(list[str], [])) - RTF = ("rtf", [".rtf"], "text/rtf", ["application/rtf"]) - TIFF = ("tiff", [".tiff"], "image/tiff", cast(list[str], [])) - TSV = ("tsv", [".tab", ".tsv"], "text/tsv", cast(list[str], [])) + RST = ("rst", "rst", ["pypandoc"], "rst", [".rst"], "text/x-rst", cast(list[str], [])) + RTF = ("rtf", "rtf", ["pypandoc"], "rtf", [".rtf"], "text/rtf", ["application/rtf"]) + TIFF = ( + "tiff", + "image", + ["unstructured_inference"], + "image", + [".tiff"], + "image/tiff", + cast(list[str], []), + ) + TSV = ("tsv", "tsv", ["pandas"], "tsv", [".tab", ".tsv"], "text/tsv", cast(list[str], [])) TXT = ( "txt", + "text", + cast(list[str], []), + None, [ ".txt", ".text", @@ -119,6 +382,9 @@ class FileType(enum.Enum): ) WAV = ( "wav", + None, + cast(list[str], []), + None, [".wav"], "audio/wav", [ @@ -129,60 +395,45 @@ class FileType(enum.Enum): "audio/x-wav", ], ) - XLS = ("xls", [".xls"], "application/vnd.ms-excel", cast(list[str], [])) + XLS = ( + "xls", + "xlsx", + ["pandas", "openpyxl"], + "xlsx", + [".xls"], + "application/vnd.ms-excel", + cast(list[str], []), + ) XLSX = ( + "xlsx", + "xlsx", + ["pandas", "openpyxl"], "xlsx", [".xlsx"], "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", cast(list[str], []), ) - XML = ("xml", [".xml"], "application/xml", ["text/xml"]) - ZIP = ("zip", [".zip"], "application/zip", cast(list[str], [])) + XML = ("xml", "xml", cast(list[str], []), None, [".xml"], "application/xml", ["text/xml"]) + ZIP = ("zip", 
None, cast(list[str], []), None, [".zip"], "application/zip", cast(list[str], [])) - UNK = ("unk", cast(list[str], []), "application/octet-stream", cast(list[str], [])) - EMPTY = ("empty", cast(list[str], []), "inode/x-empty", cast(list[str], [])) - - @classmethod - def from_extension(cls, extension: str | None) -> FileType | None: - """Select a FileType member based on an extension. - - `extension` must include the leading period, like `".pdf"`. Extension is suitable as a - secondary file-type identification method but is unreliable for primary identification.. - - Returns `None` when `extension` is not registered for any supported file-type. - """ - if extension in (None, "", "."): - return None - # -- not super efficient but plenty fast enough for once-or-twice-per-file use and avoids - # -- limitations on defining a class variable on an Enum. - for m in cls.__members__.values(): - if extension in m._extensions: - return m - return None - - @classmethod - def from_mime_type(cls, mime_type: str) -> FileType | None: - """Select a FileType member based on a MIME-type. - - `extension` must include the leading period, like `".pdf"`. Extension is suitable as a - secondary file-type identification method but is unreliable for primary identification.. - """ - # -- not super efficient but plenty fast enough for once-or-twice-per-file use and avoids - # -- limitations on defining a class variable on an Enum. - for m in cls.__members__.values(): - if mime_type == m._canonical_mime_type or mime_type in m._alias_mime_types: - return m - return None - - @property - def mime_type(self) -> str: - """The canonical MIME-type for this file-type, suitable for use in metadata. - - This value is used in `.metadata.filetype` for elements partitioned from files of this - type. In general it is the "offical", "recommended", or "defacto-standard" MIME-type for - files of this type, in that order, as available. 
- """ - return self._canonical_mime_type + UNK = ( + "unk", + None, + cast(list[str], []), + None, + cast(list[str], []), + "application/octet-stream", + cast(list[str], []), + ) + EMPTY = ( + "empty", + None, + cast(list[str], []), + None, + cast(list[str], []), + "inode/x-empty", + cast(list[str], []), + ) PLAIN_TEXT_EXTENSIONS = ".csv .eml .html .json .md .org .p7s .rst .rtf .tab .text .tsv .txt".split() diff --git a/unstructured/partition/auto.py b/unstructured/partition/auto.py index fe13d26fe..b7cad8055 100644 --- a/unstructured/partition/auto.py +++ b/unstructured/partition/auto.py @@ -2,131 +2,28 @@ from __future__ import annotations +import importlib import io from typing import IO, Any, Callable, Literal, Optional import requests +from typing_extensions import TypeAlias from unstructured.documents.elements import DataSourceMetadata, Element from unstructured.file_utils.filetype import detect_filetype, is_json_processable from unstructured.file_utils.model import FileType from unstructured.logger import logger from unstructured.partition.common import exactly_one -from unstructured.partition.email import partition_email -from unstructured.partition.html import partition_html -from unstructured.partition.json import partition_json from unstructured.partition.lang import check_language_args -from unstructured.partition.text import partition_text from unstructured.partition.utils.constants import PartitionStrategy -from unstructured.partition.xml import partition_xml from unstructured.utils import dependency_exists -PARTITION_WITH_EXTRAS_MAP: dict[str, Callable[..., list[Element]]] = {} - -if dependency_exists("pandas"): - from unstructured.partition.csv import partition_csv - from unstructured.partition.tsv import partition_tsv - - PARTITION_WITH_EXTRAS_MAP["csv"] = partition_csv - PARTITION_WITH_EXTRAS_MAP["tsv"] = partition_tsv - - -if dependency_exists("docx"): - from unstructured.partition.doc import partition_doc - from unstructured.partition.docx 
import partition_docx - - PARTITION_WITH_EXTRAS_MAP["doc"] = partition_doc - PARTITION_WITH_EXTRAS_MAP["docx"] = partition_docx - - -if dependency_exists("docx") and dependency_exists("pypandoc"): - from unstructured.partition.odt import partition_odt - - PARTITION_WITH_EXTRAS_MAP["odt"] = partition_odt - - -if dependency_exists("pypandoc"): - from unstructured.partition.epub import partition_epub - - PARTITION_WITH_EXTRAS_MAP["epub"] = partition_epub - - -if dependency_exists("pypandoc"): - from unstructured.partition.org import partition_org - from unstructured.partition.rst import partition_rst - from unstructured.partition.rtf import partition_rtf - - PARTITION_WITH_EXTRAS_MAP["org"] = partition_org - PARTITION_WITH_EXTRAS_MAP["rst"] = partition_rst - PARTITION_WITH_EXTRAS_MAP["rtf"] = partition_rtf - - -if dependency_exists("markdown"): - from unstructured.partition.md import partition_md - - PARTITION_WITH_EXTRAS_MAP["md"] = partition_md - - -if dependency_exists("oxmsg"): - from unstructured.partition.msg import partition_msg - - PARTITION_WITH_EXTRAS_MAP["msg"] = partition_msg - - -pdf_imports = ["pdf2image", "pdfminer", "PIL"] -if all(dependency_exists(dep) for dep in pdf_imports): - from unstructured.partition.pdf import partition_pdf - - PARTITION_WITH_EXTRAS_MAP["pdf"] = partition_pdf - - -if dependency_exists("unstructured_inference"): - from unstructured.partition.image import partition_image - - PARTITION_WITH_EXTRAS_MAP["image"] = partition_image - - -if dependency_exists("pptx"): - from unstructured.partition.ppt import partition_ppt - from unstructured.partition.pptx import partition_pptx - - PARTITION_WITH_EXTRAS_MAP["ppt"] = partition_ppt - PARTITION_WITH_EXTRAS_MAP["pptx"] = partition_pptx - - -if dependency_exists("pandas") and dependency_exists("openpyxl"): - from unstructured.partition.xlsx import partition_xlsx - - PARTITION_WITH_EXTRAS_MAP["xlsx"] = partition_xlsx - - -IMAGE_FILETYPES = [ - FileType.HEIC, - FileType.PNG, - FileType.JPG, - 
FileType.TIFF, - FileType.BMP, -] - - -def _get_partition_with_extras( - doc_type: str, - partition_with_extras_map: Optional[dict[str, Callable[..., list[Element]]]] = None, -): - if partition_with_extras_map is None: - partition_with_extras_map = PARTITION_WITH_EXTRAS_MAP - _partition_func = partition_with_extras_map.get(doc_type) - if _partition_func is None: - raise ImportError( - f"partition_{doc_type} is not available. " - f"Install the {doc_type} dependencies with " - f'pip install "unstructured[{doc_type}]"', - ) - return _partition_func +Partitioner: TypeAlias = Callable[..., list[Element]] def partition( filename: Optional[str] = None, + *, content_type: Optional[str] = None, file: Optional[IO[bytes]] = None, file_filename: Optional[str] = None, @@ -156,10 +53,11 @@ def partition( starting_page_number: int = 1, **kwargs: Any, ): - """Partitions a document into its constituent elements. Will use libmagic to determine - the file's type and route it to the appropriate partitioning function. Applies the default - parameters for each partitioning function. Use the document-type specific partitioning - functions if you need access to additional kwarg options. + """Partitions a document into its constituent elements. + + Uses libmagic to determine the file's type and route it to the appropriate partitioning + function. Applies the default parameters for each partitioning function. Use the document-type + specific partitioning functions if you need access to additional kwarg options. Parameters ---------- @@ -272,7 +170,7 @@ def partition( languages = check_language_args(languages or [], ocr_languages) if url is not None: - file, filetype = file_and_type_from_url( + file, file_type = file_and_type_from_url( url=url, content_type=content_type, headers=headers, @@ -285,7 +183,7 @@ def partition( "The headers kwarg is set but the url kwarg is not. 
" "The headers kwarg will be ignored.", ) - filetype = detect_filetype( + file_type = detect_filetype( filename=filename, file=file, file_filename=metadata_filename, @@ -297,14 +195,16 @@ def partition( file.seek(0) infer_table_structure = decide_table_extraction( - filetype, + file_type, skip_infer_table_types, pdf_infer_table_structure, ) - if filetype == FileType.CSV: - _partition_csv = _get_partition_with_extras("csv") - elements = _partition_csv( + partitioner_loader = _PartitionerLoader() + + if file_type == FileType.CSV: + partition_csv = partitioner_loader.get(file_type) + elements = partition_csv( filename=filename, file=file, infer_table_structure=infer_table_structure, @@ -312,9 +212,9 @@ def partition( detect_language_per_element=detect_language_per_element, **kwargs, ) - elif filetype == FileType.DOC: - _partition_doc = _get_partition_with_extras("doc") - elements = _partition_doc( + elif file_type == FileType.DOC: + partition_doc = partitioner_loader.get(file_type) + elements = partition_doc( filename=filename, file=file, infer_table_structure=infer_table_structure, @@ -324,9 +224,9 @@ def partition( strategy=strategy, **kwargs, ) - elif filetype == FileType.DOCX: - _partition_docx = _get_partition_with_extras("docx") - elements = _partition_docx( + elif file_type == FileType.DOCX: + partition_docx = partitioner_loader.get(file_type) + elements = partition_docx( filename=filename, file=file, infer_table_structure=infer_table_structure, @@ -336,7 +236,8 @@ def partition( strategy=strategy, **kwargs, ) - elif filetype == FileType.EML: + elif file_type == FileType.EML: + partition_email = partitioner_loader.get(file_type) elements = partition_email( filename=filename, file=file, @@ -345,9 +246,9 @@ def partition( detect_language_per_element=detect_language_per_element, **kwargs, ) - elif filetype == FileType.EPUB: - _partition_epub = _get_partition_with_extras("epub") - elements = _partition_epub( + elif file_type == FileType.EPUB: + partition_epub = 
partitioner_loader.get(file_type) + elements = partition_epub( filename=filename, file=file, include_page_breaks=include_page_breaks, @@ -356,7 +257,8 @@ def partition( detect_language_per_element=detect_language_per_element, **kwargs, ) - elif filetype == FileType.HTML: + elif file_type == FileType.HTML: + partition_html = partitioner_loader.get(file_type) elements = partition_html( filename=filename, file=file, @@ -366,9 +268,9 @@ def partition( detect_language_per_element=detect_language_per_element, **kwargs, ) - elif filetype in IMAGE_FILETYPES: - _partition_image = _get_partition_with_extras("image") - elements = _partition_image( + elif file_type.partitioner_shortname == "image": + partition_image = partitioner_loader.get(file_type) + elements = partition_image( filename=filename, file=file, url=None, @@ -384,16 +286,17 @@ def partition( starting_page_number=starting_page_number, **kwargs, ) - elif filetype == FileType.JSON: + elif file_type == FileType.JSON: if not is_json_processable(filename=filename, file=file): raise ValueError( "Detected a JSON file that does not conform to the Unstructured schema. 
" "partition_json currently only processes serialized Unstructured output.", ) + partition_json = partitioner_loader.get(file_type) elements = partition_json(filename=filename, file=file, **kwargs) - elif filetype == FileType.MD: - _partition_md = _get_partition_with_extras("md") - elements = _partition_md( + elif file_type == FileType.MD: + partition_md = partitioner_loader.get(file_type) + elements = partition_md( filename=filename, file=file, include_page_breaks=include_page_breaks, @@ -402,18 +305,18 @@ def partition( detect_language_per_element=detect_language_per_element, **kwargs, ) - elif filetype == FileType.MSG: - _partition_msg = _get_partition_with_extras("msg") - elements = _partition_msg( + elif file_type == FileType.MSG: + partition_msg = partitioner_loader.get(file_type) + elements = partition_msg( filename=filename, file=file, languages=languages, detect_language_per_element=detect_language_per_element, **kwargs, ) - elif filetype == FileType.ODT: - _partition_odt = _get_partition_with_extras("odt") - elements = _partition_odt( + elif file_type == FileType.ODT: + partition_odt = partitioner_loader.get(file_type) + elements = partition_odt( filename=filename, file=file, infer_table_structure=infer_table_structure, @@ -423,9 +326,9 @@ def partition( strategy=strategy, **kwargs, ) - elif filetype == FileType.ORG: - _partition_org = _get_partition_with_extras("org") - elements = _partition_org( + elif file_type == FileType.ORG: + partition_org = partitioner_loader.get(file_type) + elements = partition_org( filename=filename, file=file, include_page_breaks=include_page_breaks, @@ -433,9 +336,9 @@ def partition( detect_language_per_element=detect_language_per_element, **kwargs, ) - elif filetype == FileType.PDF: - _partition_pdf = _get_partition_with_extras("pdf") - elements = _partition_pdf( + elif file_type == FileType.PDF: + partition_pdf = partitioner_loader.get(file_type) + elements = partition_pdf( filename=filename, file=file, url=None, @@ -451,9 
+354,9 @@ def partition( starting_page_number=starting_page_number, **kwargs, ) - elif filetype == FileType.PPT: - _partition_ppt = _get_partition_with_extras("ppt") - elements = _partition_ppt( + elif file_type == FileType.PPT: + partition_ppt = partitioner_loader.get(file_type) + elements = partition_ppt( filename=filename, file=file, include_page_breaks=include_page_breaks, @@ -463,9 +366,9 @@ def partition( strategy=strategy, **kwargs, ) - elif filetype == FileType.PPTX: - _partition_pptx = _get_partition_with_extras("pptx") - elements = _partition_pptx( + elif file_type == FileType.PPTX: + partition_pptx = partitioner_loader.get(file_type) + elements = partition_pptx( filename=filename, file=file, include_page_breaks=include_page_breaks, @@ -476,9 +379,9 @@ def partition( strategy=strategy, **kwargs, ) - elif filetype == FileType.RST: - _partition_rst = _get_partition_with_extras("rst") - elements = _partition_rst( + elif file_type == FileType.RST: + partition_rst = partitioner_loader.get(file_type) + elements = partition_rst( filename=filename, file=file, include_page_breaks=include_page_breaks, @@ -487,9 +390,9 @@ def partition( detect_language_per_element=detect_language_per_element, **kwargs, ) - elif filetype == FileType.RTF: - _partition_rtf = _get_partition_with_extras("rtf") - elements = _partition_rtf( + elif file_type == FileType.RTF: + partition_rtf = partitioner_loader.get(file_type) + elements = partition_rtf( filename=filename, file=file, include_page_breaks=include_page_breaks, @@ -498,16 +401,17 @@ def partition( detect_language_per_element=detect_language_per_element, **kwargs, ) - elif filetype == FileType.TSV: - _partition_tsv = _get_partition_with_extras("tsv") - elements = _partition_tsv( + elif file_type == FileType.TSV: + partition_tsv = partitioner_loader.get(file_type) + elements = partition_tsv( filename=filename, file=file, languages=languages, detect_language_per_element=detect_language_per_element, **kwargs, ) - elif filetype == 
FileType.TXT: + elif file_type == FileType.TXT: + partition_text = partitioner_loader.get(file_type) elements = partition_text( filename=filename, file=file, @@ -517,9 +421,9 @@ def partition( detect_language_per_element=detect_language_per_element, **kwargs, ) - elif filetype in (FileType.XLS, FileType.XLSX): - _partition_xlsx = _get_partition_with_extras("xlsx") - elements = _partition_xlsx( + elif file_type in (FileType.XLS, FileType.XLSX): + partition_xlsx = partitioner_loader.get(file_type) + elements = partition_xlsx( filename=filename, file=file, infer_table_structure=infer_table_structure, @@ -528,7 +432,8 @@ def partition( starting_page_number=starting_page_number, **kwargs, ) - elif filetype == FileType.XML: + elif file_type == FileType.XML: + partition_xml = partitioner_loader.get(file_type) elements = partition_xml( filename=filename, file=file, @@ -538,11 +443,11 @@ def partition( detect_language_per_element=detect_language_per_element, **kwargs, ) - elif filetype == FileType.EMPTY: + elif file_type == FileType.EMPTY: elements = [] else: msg = "Invalid file" if not filename else f"Invalid file {filename}" - raise ValueError(f"{msg}. The {filetype} file type is not supported in partition.") + raise ValueError(f"{msg}. 
The {file_type} file type is not supported in partition.") for element in elements: element.metadata.url = url @@ -551,7 +456,7 @@ def partition( out_filetype = FileType.from_mime_type(content_type) element.metadata.filetype = out_filetype.mime_type if out_filetype is not None else None else: - element.metadata.filetype = filetype.mime_type + element.metadata.filetype = file_type.mime_type return elements @@ -562,7 +467,7 @@ def file_and_type_from_url( headers: dict[str, str] = {}, ssl_verify: bool = True, request_timeout: Optional[int] = None, -) -> tuple[io.BytesIO, Optional[FileType]]: +) -> tuple[io.BytesIO, FileType]: response = requests.get(url, headers=headers, verify=ssl_verify, timeout=request_timeout) file = io.BytesIO(response.content) @@ -590,3 +495,51 @@ def decide_table_extraction( return pdf_infer_table_structure or doc_type not in skip_infer_table_types return doc_type not in skip_infer_table_types + + +class _PartitionerLoader: + """Provides uniform helpful error when a partitioner dependency is not installed. + + Used by `partition()` to encapsulate coping with the possibility the Python + environment it is executing in may not have all dependencies installed for a + particular partitioner. + + Provides `.get()` to access partitioners by file-type, which raises when one or + more dependencies for that partitioner are not installed. + + The error message indicates what extra needs to be installed to enable that + partitioner. This avoids an inconsistent variety of possibly puzzling exceptions + arising from much deeper in the partitioner when access to the missing dependency is + first attempted. + """ + + # -- module-lifetime cache for partitioners once loaded -- + _partitioners: dict[FileType, Partitioner] = {} + + def get(self, file_type: FileType) -> Partitioner: + """Return partitioner for `file_type`. + + Raises when one or more package dependencies for that file-type have not been + installed. 
+ """ + if file_type not in self._partitioners: + self._partitioners[file_type] = self._load_partitioner(file_type) + + return self._partitioners[file_type] + + def _load_partitioner(self, file_type: FileType) -> Partitioner: + """Load the partitioner for `file_type` after verifying dependencies.""" + # -- verify all package dependencies are installed -- + for pkg_name in file_type.importable_package_dependencies: + if not dependency_exists(pkg_name): + raise ImportError( + f"{file_type.partitioner_function_name}() is not available because one or" + f" more dependencies are not installed. Use:" + f' pip install "unstructured[{file_type.extra_name}]" (including quotes)' + f" to install the required dependencies", + ) + + # -- load the partitioner and return it -- + assert file_type.is_partitionable # -- would be a programming error if this failed -- + partitioner_module = importlib.import_module(file_type.partitioner_module_qname) + return getattr(partitioner_module, file_type.partitioner_function_name)