mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-07-24 17:41:15 +00:00

**Summary** Elaborate the `FileType` enum to be a complete descriptor of file-types. Add methods to allow `STR_TO_FILETYPE`, `EXT_TO_FILETYPE` and `FILETYPE_TO_MIMETYPE` mappings to be replaced, removing those redundant and noisy declarations. In the process, fix some lingering file-type identification and `.metadata.filetype` errors that had been skipped in the tests. **Additional Context** Gathering the various attributes of a file-type into the `FileType` enum eliminates the duplication inherent in the separate `STR_TO_FILETYPE` etc. mappings and makes access to those values convenient for callers. These attributes include what MIME-type a file-type should record in metadata and what MIME-types and extensions map to that file-type. These values and others are made available as methods and properties directly on the `FileType` class and members. Because all attributes are defined in the `FileType` enum there is no risk of inconsistency across multiple locations and any changes happen in one and only one place. Further attributes and methods will be added in later commits to support other file-type related operations like mapping to a partitioner and verifying its dependencies are installed.
1324 lines
49 KiB
Python
1324 lines
49 KiB
Python
# pyright: reportPrivateUsage=false
|
|
|
|
from __future__ import annotations
|
|
|
|
import io
|
|
import json
|
|
import os
|
|
import pathlib
|
|
import sys
|
|
import tempfile
|
|
import warnings
|
|
from importlib import import_module
|
|
from typing import Callable, Iterator, cast
|
|
from unittest.mock import patch
|
|
|
|
import pytest
|
|
from PIL import Image
|
|
|
|
from test_unstructured.partition.pdf_image.test_pdf import assert_element_extraction
|
|
from test_unstructured.partition.test_constants import (
|
|
EXPECTED_TABLE,
|
|
EXPECTED_TABLE_XLSX,
|
|
EXPECTED_TEXT,
|
|
EXPECTED_XLS_TABLE,
|
|
)
|
|
from test_unstructured.unit_utils import (
|
|
ANY,
|
|
FixtureRequest,
|
|
LogCaptureFixture,
|
|
MonkeyPatch,
|
|
example_doc_path,
|
|
function_mock,
|
|
method_mock,
|
|
)
|
|
from unstructured.cleaners.core import clean_extra_whitespace
|
|
from unstructured.documents.elements import (
|
|
Address,
|
|
CompositeElement,
|
|
Element,
|
|
ElementMetadata,
|
|
ListItem,
|
|
NarrativeText,
|
|
Table,
|
|
TableChunk,
|
|
Text,
|
|
Title,
|
|
)
|
|
from unstructured.file_utils.model import FileType
|
|
from unstructured.partition import auto
|
|
from unstructured.partition.auto import IMAGE_FILETYPES, _get_partition_with_extras, partition
|
|
from unstructured.partition.utils.constants import PartitionStrategy
|
|
from unstructured.staging.base import elements_from_json, elements_to_dicts, elements_to_json
|
|
|
|
is_in_docker = os.path.exists("/.dockerenv")
|
|
|
|
|
|
# ================================================================================================
|
|
# CSV
|
|
# ================================================================================================
|
|
|
|
|
|
def test_auto_partition_csv_from_filename():
    """A CSV path partitions to the expected text, HTML table and `text/csv` file-type."""
    csv_path = example_doc_path("stanley-cups.csv")

    first = partition(csv_path)[0]

    assert clean_extra_whitespace(first.text) == EXPECTED_TEXT
    assert first.metadata.text_as_html == EXPECTED_TABLE
    assert first.metadata.filetype == "text/csv"
|
|
|
|
|
|
def test_auto_partition_csv_from_file():
    """An open CSV file partitions to a `Table` carrying the `text/csv` file-type."""
    with open(example_doc_path("stanley-cups.csv"), "rb") as csv_file:
        first = partition(file=csv_file)[0]

    assert clean_extra_whitespace(first.text) == EXPECTED_TEXT
    assert isinstance(first, Table)
    assert first.metadata.text_as_html == EXPECTED_TABLE
    assert first.metadata.filetype == "text/csv"
|
|
|
|
|
|
# ================================================================================================
|
|
# DOC
|
|
# ================================================================================================
|
|
|
|
|
|
@pytest.mark.parametrize(
    ("pass_metadata_filename", "content_type"),
    [(False, None), (False, "application/msword"), (True, "application/msword"), (True, None)],
)
def test_auto_partition_doc_from_filename(
    pass_metadata_filename: bool, content_type: str | None, expected_docx_elements: list[Element]
):
    """A DOC file given by path partitions to the expected elements.

    Exercised both with and without an explicit `metadata_filename` and `content_type`; in every
    case the filename and directory recorded in metadata are those of the source file.
    """
    file_path = example_doc_path("simple.doc")
    metadata_filename = file_path if pass_metadata_filename else None

    elements = partition(
        filename=file_path,
        metadata_filename=metadata_filename,
        content_type=content_type,
        strategy=PartitionStrategy.HI_RES,
    )

    assert elements == expected_docx_elements
    assert all(e.metadata.filename == "simple.doc" for e in elements)
    assert all(e.metadata.file_directory == example_doc_path("") for e in elements)
|
|
|
|
|
|
@pytest.mark.skipif(is_in_docker, reason="Passes in CI but not Docker. Remove skip on #3364 fix.")
@pytest.mark.xfail(sys.platform == "darwin", reason="#3364", raises=KeyError, strict=True)
def test_auto_partition_doc_from_file(expected_docx_elements: list[Element]):
    # -- NOTE(scanny): https://github.com/Unstructured-IO/unstructured/issues/3364
    # -- `detect_filetype()` reports a .doc file as `application/x-ole-storage`. That is accurate
    # -- but too coarse: `FileType.MSG` (also an OLE container) gets assigned, so `partition()`
    # -- dispatches to `partition_msg`, which is the origin of the `KeyError`.
    # -- The xfail only shows up locally and not in CI, possibly because the two environments use
    # -- different `libmagic` sources (`libmagic` on CI, `libmagic1` on Mac). Once file-type
    # -- disambiguation is added both should identify the file correctly.
    doc_path = example_doc_path("simple.doc")

    with open(doc_path, "rb") as doc_file:
        elements = partition(file=doc_file)

    assert elements == expected_docx_elements
|
|
|
|
|
|
# ================================================================================================
|
|
# DOCX
|
|
# ================================================================================================
|
|
|
|
|
|
def test_auto_partition_docx_from_filename(expected_docx_elements: list[Element]):
    """A DOCX path partitions to the expected elements, each tagged with the source filename."""
    docx_path = example_doc_path("simple.docx")

    elements = partition(docx_path, strategy=PartitionStrategy.HI_RES)

    assert elements == expected_docx_elements
    recorded_filenames = [e.metadata.filename for e in elements]
    assert all(name == "simple.docx" for name in recorded_filenames)
|
|
|
|
|
|
def test_auto_partition_docx_from_file(expected_docx_elements: list[Element]):
    """An open DOCX file partitions to the expected elements."""
    docx_path = example_doc_path("simple.docx")

    with open(docx_path, "rb") as docx_file:
        elements = partition(file=docx_file, strategy=PartitionStrategy.HI_RES)

    assert elements == expected_docx_elements
|
|
|
|
|
|
@pytest.mark.parametrize("file_name", ["simple.docx", "simple.doc", "simple.odt"])
@pytest.mark.parametrize(
    "strategy",
    [
        PartitionStrategy.AUTO,
        PartitionStrategy.FAST,
        PartitionStrategy.HI_RES,
        PartitionStrategy.OCR_ONLY,
    ],
)
def test_partition_forwards_strategy_arg_to_partition_docx_and_its_brokers(
    request: FixtureRequest, file_name: str, strategy: str
):
    """The `strategy` value passed to `partition()` arrives at `partition_docx()`.

    For the brokering partitioners (DOC, ODT) the value has to travel through `partition_doc()`
    or `partition_odt()`, each of which must hand it on to `partition_docx()`. The replacement
    element generator below echoes the strategy the DOCX partitioner actually received, so the
    final assertion proves the value survived the whole trip.

    Note this is 3 file-types X 4 strategies = 12 test-cases.
    """
    from unstructured.partition.docx import _DocxPartitioner

    def echo_strategy(self: _DocxPartitioner) -> Iterator[Element]:
        # -- emit a single element whose text reveals the strategy seen by the partitioner --
        yield Text(f"strategy=={self._opts.strategy}")

    iter_document_elements_ = method_mock(
        request, _DocxPartitioner, "_iter_document_elements", side_effect=echo_strategy
    )

    # -- exactly one element is expected; unpacking fails loudly otherwise --
    [element] = partition(example_doc_path(file_name), strategy=strategy)

    iter_document_elements_.assert_called_once_with(ANY)
    assert element.text == f"strategy=={strategy}"
|
|
|
|
|
|
# ================================================================================================
|
|
# EML
|
|
# ================================================================================================
|
|
|
|
# -- elements the EML tests below expect `partition()` to produce for `eml/fake-email.eml` --
EXPECTED_EMAIL_OUTPUT: list[Element] = [
    NarrativeText("This is a test email to use for unit tests."),
    Title("Important points:"),
    ListItem("Roses are red"),
    ListItem("Violets are blue"),
]
|
|
|
|
|
|
def test_auto_partition_email_from_filename():
    """An EML path partitions to the expected elements and records its filename and directory."""
    eml_path = example_doc_path("eml/fake-email.eml")
    expected_directory, expected_filename = os.path.split(eml_path)

    elements = partition(eml_path, strategy=PartitionStrategy.HI_RES)

    assert len(elements) > 0
    assert elements == EXPECTED_EMAIL_OUTPUT
    assert elements[0].metadata.filename == expected_filename
    assert elements[0].metadata.file_directory == expected_directory
|
|
|
|
|
|
def test_auto_partition_email_from_file():
    """An open EML file partitions to the expected elements."""
    eml_path = example_doc_path("eml/fake-email.eml")

    with open(eml_path, "rb") as eml_file:
        elements = partition(file=eml_file, strategy=PartitionStrategy.HI_RES)

    assert len(elements) > 0
    assert elements == EXPECTED_EMAIL_OUTPUT
|
|
|
|
|
|
def test_auto_partition_eml_add_signature_to_metadata():
    """Partitioning `signed-doc.p7s` gives one element with the signature in its metadata."""
    signed_path = example_doc_path("eml/signed-doc.p7s")

    elements = partition(signed_path)

    assert len(elements) == 1
    only = elements[0]
    assert only.text == "This is a test"
    assert only.metadata.signature == "<SIGNATURE>\n"
|
|
|
|
|
|
# ================================================================================================
|
|
# EPUB
|
|
# ================================================================================================
|
|
|
|
|
|
def test_auto_partition_epub_from_filename():
    """An EPUB path partitions to elements beginning with the book's heading text."""
    epub_path = example_doc_path("winter-sports.epub")

    elements = partition(epub_path, strategy=PartitionStrategy.HI_RES)

    assert len(elements) > 0
    opening_text = elements[0].text
    assert opening_text.startswith("The Project Gutenberg eBook of Winter Sports")
|
|
|
|
|
|
def test_auto_partition_epub_from_file():
    """An open EPUB file partitions to elements beginning with the book's heading text."""
    epub_path = example_doc_path("winter-sports.epub")

    with open(epub_path, "rb") as epub_file:
        elements = partition(file=epub_file, strategy=PartitionStrategy.HI_RES)

    assert len(elements) > 0
    opening_text = elements[0].text
    assert opening_text.startswith("The Project Gutenberg eBook of Winter Sports")
|
|
|
|
|
|
# ================================================================================================
|
|
# HTML
|
|
# ================================================================================================
|
|
|
|
|
|
@pytest.mark.parametrize(
    ("pass_metadata_filename", "content_type"),
    [(False, None), (False, "text/html"), (True, "text/html"), (True, None)],
)
def test_auto_partition_html_from_filename(pass_metadata_filename: bool, content_type: str | None):
    """An HTML path partitions to elements that all record the source filename and directory.

    Runs with and without an explicit `metadata_filename` and `content_type`.
    """
    html_path = example_doc_path("example-10k-1p.html")
    expected_directory, expected_filename = os.path.split(html_path)

    elements = partition(
        filename=html_path,
        metadata_filename=html_path if pass_metadata_filename else None,
        content_type=content_type,
        strategy=PartitionStrategy.HI_RES,
    )

    assert elements
    assert all(e.metadata.filename == expected_filename for e in elements)
    assert all(e.metadata.file_directory == expected_directory for e in elements)
|
|
|
|
|
|
@pytest.mark.parametrize(
    ("pass_metadata_filename", "content_type"),
    [(False, None), (False, "text/html"), (True, "text/html"), (True, None)],
)
def test_auto_partition_html_from_file(pass_metadata_filename: bool, content_type: str | None):
    """An open HTML file partitions to at least one element.

    Runs with and without an explicit `metadata_filename` and `content_type`.
    """
    html_path = example_doc_path("example-10k-1p.html")
    partition_kwargs = {
        "metadata_filename": html_path if pass_metadata_filename else None,
        "content_type": content_type,
        "strategy": PartitionStrategy.HI_RES,
    }

    with open(html_path, "rb") as html_file:
        elements = partition(file=html_file, **partition_kwargs)

    assert len(elements) > 0
|
|
|
|
|
|
def test_auto_partition_html_pre_from_file():
    """The `fake-html-pre.htm` example partitions as HTML with no page-break elements."""
    elements = partition(example_doc_path("fake-html-pre.htm"))

    assert len(elements) > 0
    assert all(elem.category != "PageBreak" for elem in elements)

    first = elements[0]
    assert clean_extra_whitespace(first.text).startswith("[107th Congress Public Law 56]")
    assert isinstance(first, NarrativeText)

    # -- every element carries the HTML MIME-type and the source filename --
    assert all(e.metadata.filetype == "text/html" for e in elements)
    assert all(e.metadata.filename == "fake-html-pre.htm" for e in elements)
|
|
|
|
|
|
# ================================================================================================
|
|
# IMAGE
|
|
# ================================================================================================
|
|
|
|
|
|
@pytest.mark.parametrize(
    ("pass_metadata_filename", "content_type"),
    [(False, None), (False, "image/jpeg"), (True, "image/jpeg"), (True, None)],
)
def test_auto_partition_jpeg_from_filename(pass_metadata_filename: bool, content_type: str | None):
    """A JPEG path partitions so its third element is the paper title, with coordinates.

    Runs with and without an explicit `metadata_filename` and `content_type`.
    """
    image_path = example_doc_path("layout-parser-paper-fast.jpg")

    elements = partition(
        filename=image_path,
        metadata_filename=image_path if pass_metadata_filename else None,
        content_type=content_type,
        strategy=PartitionStrategy.AUTO,
    )

    third = elements[2]
    assert third.text == (
        "LayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis"
    )
    assert third.metadata.coordinates is not None
|
|
|
|
|
|
@pytest.mark.parametrize(
    ("pass_metadata_filename", "content_type"),
    [(False, None), (False, "image/jpeg"), (True, "image/jpeg"), (True, None)],
)
def test_auto_partition_jpeg_from_file(pass_metadata_filename: bool, content_type: str | None):
    """An open JPEG file partitions so its third element is the paper title, with coordinates.

    Runs with and without an explicit `metadata_filename` and `content_type`.
    """
    image_path = example_doc_path("layout-parser-paper-fast.jpg")
    partition_kwargs = {
        "metadata_filename": image_path if pass_metadata_filename else None,
        "content_type": content_type,
        "strategy": PartitionStrategy.AUTO,
    }

    with open(image_path, "rb") as image_file:
        elements = partition(file=image_file, **partition_kwargs)

    third = elements[2]
    assert third.text == (
        "LayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis"
    )
    assert third.metadata.coordinates is not None
|
|
|
|
|
|
def test_auto_partition_bmp_from_filename(tmp_path: pathlib.Path):
    """A BMP converted from the table example JPEG partitions to exactly one HTML table."""
    bmp_path = str(tmp_path / "example.bmp")
    # -- there is no BMP example document, so create one from an existing JPEG --
    with Image.open(example_doc_path("layout-parser-paper-with-table.jpg")) as source_image:
        source_image.save(bmp_path)

    elements = partition(filename=bmp_path, strategy=PartitionStrategy.HI_RES)

    # -- keep only the elements that actually carry an HTML rendering --
    html_tables = list(filter(None, (e.metadata.text_as_html for e in elements)))
    assert len(html_tables) == 1
    html = html_tables[0]
    assert "<table><thead><tr>" in html
    assert "</thead><tbody><tr>" in html
|
|
|
|
|
|
@pytest.mark.parametrize("extract_image_block_to_payload", [False, True])
def test_auto_partition_image_element_extraction(extract_image_block_to_payload: bool):
    """Image and table blocks are extracted from a JPEG, to payload or to an output directory."""
    block_types = ["Image", "Table"]
    image_path = example_doc_path("embedded-images-tables.jpg")

    with tempfile.TemporaryDirectory() as output_dir:
        elements = partition(
            filename=image_path,
            extract_image_block_types=block_types,
            extract_image_block_to_payload=extract_image_block_to_payload,
            extract_image_block_output_dir=output_dir,
        )

        # -- checked while the temporary directory still exists --
        assert_element_extraction(
            elements, block_types, extract_image_block_to_payload, output_dir
        )
|
|
|
|
|
|
# ================================================================================================
|
|
# JSON
|
|
# ================================================================================================
|
|
|
|
|
|
# TODO(scanny): This test should go away when we fix #3365. This test glosses over several
# important JSON "rehydration" behaviors, in particular that the metadata should match exactly.
# The following test `test_auto_partition_json_from_file_preserves_original_elements` will be the
# replacement for this test.
def test_auto_partitioned_json_output_maintains_consistency_with_fixture_elements():
    """Test auto-processing an unstructured json output file by filename."""
    json_file_path = example_doc_path("spring-weather.html.json")
    original_file_name = "spring-weather.html"
    # -- read the fixture as UTF-8 explicitly rather than with the platform's locale encoding --
    with open(json_file_path, encoding="utf-8") as json_f:
        expected_result = json.load(json_f)

    elements = partition(
        filename=str(json_file_path),
        # -- use the original file name to get the same element IDs (hashes) --
        metadata_filename=original_file_name,
        strategy=PartitionStrategy.HI_RES,
    )
    partitioning_result = json.loads(cast(str, elements_to_json(elements)))

    # -- metadata is dropped from both sides before comparing (see the TODO above) --
    for elem in partitioning_result:
        elem.pop("metadata")
    for elem in expected_result:
        elem.pop("metadata")

    assert expected_result == partitioning_result
|
|
|
|
|
|
@pytest.mark.xfail(
    reason=(
        "https://github.com/Unstructured-IO/unstructured/issues/3365"
        " partition_json() does not preserve original element-id or metadata"
    ),
    raises=AssertionError,
    strict=True,
)
def test_auto_partition_json_from_file_preserves_original_elements():
    """Partitioning a serialized-elements JSON file should round-trip the elements exactly."""
    json_path = example_doc_path("simple.json")
    expected_dicts = elements_to_dicts(elements_from_json(json_path))

    with open(json_path, "rb") as json_file:
        partitioned_elements = partition(file=json_file)

    assert elements_to_dicts(partitioned_elements) == expected_dicts
|
|
|
|
|
|
def test_auto_partition_json_raises_with_unprocessable_json(tmp_path: pathlib.Path):
    """JSON that is not in the Unstructured serialization format is rejected with `ValueError`."""
    # NOTE(robinson) - A single JSON object is unprocessable: the Unstructured JSON
    # serialization format is a list of dicts.
    json_path = tmp_path / "unprocessable.json"
    json_path.write_text('{"hi": "there"}')

    with pytest.raises(ValueError, match="Detected a JSON file that does not conform to the Unst"):
        partition(filename=str(json_path))
|
|
|
|
|
|
# ================================================================================================
|
|
# MD
|
|
# ================================================================================================
|
|
|
|
|
|
def test_partition_md_from_url_works_with_embedded_html():
    """Markdown fetched from a URL partitions; the first element mentions "unstructured"."""
    readme_url = "https://raw.githubusercontent.com/Unstructured-IO/unstructured/main/README.md"

    elements = partition(
        url=readme_url,
        content_type="text/markdown",
        strategy=PartitionStrategy.HI_RES,
    )

    first_text = elements[0].text
    assert "unstructured" in first_text
|
|
|
|
|
|
# ================================================================================================
|
|
# MSG
|
|
# ================================================================================================
|
|
|
|
|
|
def test_auto_partition_msg_from_filename():
    """An MSG path partitions to the four expected email-body elements."""
    expected_elements = [
        NarrativeText(text="This is a test email to use for unit tests."),
        Title(text="Important points:"),
        ListItem(text="Roses are red"),
        ListItem(text="Violets are blue"),
    ]

    elements = partition(example_doc_path("fake-email.msg"), strategy=PartitionStrategy.HI_RES)

    assert elements == expected_elements
|
|
|
|
|
|
# ================================================================================================
|
|
# ODT
|
|
# ================================================================================================
|
|
|
|
|
|
def test_auto_partition_odt_from_filename(expected_docx_elements: list[Element]):
    """An ODT path partitions to the same elements expected of the DOCX example."""
    odt_path = example_doc_path("simple.odt")

    elements = partition(odt_path, strategy=PartitionStrategy.HI_RES)

    assert elements == expected_docx_elements
|
|
|
|
|
|
def test_auto_partition_odt_from_file(expected_docx_elements: list[Element]):
    """An open ODT file partitions to the same elements expected of the DOCX example."""
    odt_path = example_doc_path("simple.odt")

    with open(odt_path, "rb") as odt_file:
        elements = partition(file=odt_file, strategy=PartitionStrategy.HI_RES)

    assert elements == expected_docx_elements
|
|
|
|
|
|
# ================================================================================================
|
|
# ORG
|
|
# ================================================================================================
|
|
|
|
|
|
def test_auto_partition_org_from_filename():
    """An Org-mode path partitions to a leading title tagged with the `text/org` file-type."""
    org_path = example_doc_path("README.org")

    first = partition(org_path)[0]

    assert first == Title("Example Docs")
    assert first.metadata.filetype == "text/org"
|
|
|
|
|
|
def test_auto_partition_org_from_file():
    """An open Org-mode file with explicit content-type partitions to a `text/org` title."""
    org_path = example_doc_path("README.org")

    with open(org_path, "rb") as org_file:
        first = partition(file=org_file, content_type="text/org")[0]

    assert first == Title("Example Docs")
    assert first.metadata.filetype == "text/org"
|
|
|
|
|
|
# ================================================================================================
|
|
# PDF
|
|
# ================================================================================================
|
|
|
|
|
|
@pytest.mark.parametrize(
    ("pass_metadata_filename", "content_type"),
    [(False, None), (False, "application/pdf"), (True, "application/pdf"), (True, None)],
)
def test_auto_partition_pdf_from_filename(pass_metadata_filename: bool, content_type: str | None):
    """A PDF path partitions to a title followed by the author line, with file metadata.

    Runs with and without an explicit `metadata_filename` and `content_type`.
    """
    pdf_path = example_doc_path("layout-parser-paper-fast.pdf")
    expected_directory, expected_filename = os.path.split(pdf_path)

    elements = partition(
        filename=pdf_path,
        metadata_filename=pdf_path if pass_metadata_filename else None,
        content_type=content_type,
        strategy=PartitionStrategy.HI_RES,
    )

    # NOTE(scanny): this file partitions differently locally (on Mac) than in CI and the cause
    # was never tracked down. The first element produced locally comes out as two elements on CI;
    # apart from that split the text is identical. Hence the platform-dependent offset.
    title_idx = 2 if sys.platform == "darwin" else 3

    title = elements[title_idx]
    assert isinstance(title, Title)
    assert title.text.startswith("LayoutParser")
    assert title.metadata.filename == expected_filename
    assert title.metadata.file_directory == expected_directory

    following = elements[title_idx + 1]
    assert isinstance(following, NarrativeText)
    assert following.text.startswith("Zejiang Shen")
|
|
|
|
|
|
@pytest.mark.parametrize(
    ("pass_metadata_filename", "content_type"),
    [(False, None), (False, "application/pdf"), (True, "application/pdf"), (True, None)],
)
def test_auto_partition_pdf_from_file(pass_metadata_filename: bool, content_type: str | None):
    """An open PDF file partitions to a title followed by the author line.

    Runs with and without an explicit `metadata_filename` and `content_type`.
    """
    pdf_path = example_doc_path("layout-parser-paper-fast.pdf")
    partition_kwargs = {
        "metadata_filename": pdf_path if pass_metadata_filename else None,
        "content_type": content_type,
        "strategy": PartitionStrategy.HI_RES,
    }

    with open(pdf_path, "rb") as pdf_file:
        elements = partition(file=pdf_file, **partition_kwargs)

    # NOTE(scanny): the element offset differs between Mac and CI; the "from_filename" variant
    # of this test explains the oddness.
    title_idx = 2 if sys.platform == "darwin" else 3

    title = elements[title_idx]
    assert isinstance(title, Title)
    assert title.text.startswith("LayoutParser")

    following = elements[title_idx + 1]
    assert isinstance(following, NarrativeText)
    assert following.text.startswith("Zejiang Shen")
|
|
|
|
|
|
def test_auto_partition_pdf_with_fast_strategy(monkeypatch: MonkeyPatch):
    """`partition()` calls the PDF partitioner with the fast strategy and default options."""
    pdf_path = example_doc_path("layout-parser-paper-fast.pdf")
    stub_elements = [NarrativeText("Hello there!")]

    with patch.object(auto, "partition_pdf", return_value=stub_elements) as partition_pdf_:
        # -- route the "pdf" entry of the extras map to the mock as well --
        monkeypatch.setattr(auto, "PARTITION_WITH_EXTRAS_MAP", {"pdf": partition_pdf_})
        partition(filename=pdf_path, strategy=PartitionStrategy.FAST)

    expected_kwargs = {
        "filename": pdf_path,
        "file": None,
        "url": None,
        "strategy": PartitionStrategy.FAST,
        "languages": None,
        "metadata_filename": None,
        "include_page_breaks": False,
        "infer_table_structure": False,
        "extract_images_in_pdf": False,
        "extract_image_block_types": None,
        "extract_image_block_output_dir": None,
        "extract_image_block_to_payload": False,
        "hi_res_model_name": None,
        "date_from_file_object": False,
        "starting_page_number": 1,
    }
    partition_pdf_.assert_called_once_with(**expected_kwargs)
|
|
|
|
|
|
def test_auto_partition_pdf_uses_pdf_infer_table_structure_argument():
    """`pdf_infer_table_structure=True` reaches the OCR step as a truthy `infer_table_structure`."""
    ocr_target = "unstructured.partition.pdf_image.ocr.process_file_with_ocr"

    with patch(ocr_target) as process_file_with_ocr_:
        partition(
            example_doc_path("layout-parser-paper-fast.pdf"),
            pdf_infer_table_structure=True,
            strategy=PartitionStrategy.HI_RES,
        )
        # -- `call_args` is an (args, kwargs) pair for the most recent call --
        _, call_kwargs = process_file_with_ocr_.call_args
        assert call_kwargs["infer_table_structure"]
|
|
|
|
|
|
@pytest.mark.parametrize("extract_image_block_to_payload", [False, True])
def test_auto_partition_pdf_element_extraction(extract_image_block_to_payload: bool):
    """Image and table blocks are extracted from a PDF, to payload or to an output directory."""
    block_types = ["Image", "Table"]
    pdf_path = example_doc_path("embedded-images-tables.pdf")

    with tempfile.TemporaryDirectory() as output_dir:
        elements = partition(
            pdf_path,
            extract_image_block_types=block_types,
            extract_image_block_to_payload=extract_image_block_to_payload,
            extract_image_block_output_dir=output_dir,
        )

        # -- checked while the temporary directory still exists --
        assert_element_extraction(
            elements, block_types, extract_image_block_to_payload, output_dir
        )
|
|
|
|
|
|
def test_partition_pdf_does_not_raise_warning():
    """Partitioning the example PDF emits no warnings."""
    # NOTE(robinson): Escalating every warning to an error is the approach the pytest docs
    # recommend for asserting that no warning is emitted.
    # ref: https://docs.pytest.org/en/7.0.x/how-to/capture-warnings.html
    #      #additional-use-cases-of-warnings-in-tests
    with warnings.catch_warnings():
        warnings.simplefilter("error")
        pdf_path = example_doc_path("layout-parser-paper-fast.pdf")
        partition(pdf_path, strategy=PartitionStrategy.HI_RES)
|
|
|
|
|
|
# ================================================================================================
|
|
# PPT
|
|
# ================================================================================================
|
|
|
|
|
|
def test_auto_partition_ppt_from_filename():
    """A PPT path partitions to the expected slide elements with file metadata on each."""
    expected_elements = [
        Title(text="Adding a Bullet Slide"),
        ListItem(text="Find the bullet slide layout"),
        ListItem(text="Use _TextFrame.text for first bullet"),
        ListItem(text="Use _TextFrame.add_paragraph() for subsequent bullets"),
        NarrativeText(text="Here is a lot of text!"),
        NarrativeText(text="Here is some text in a text box!"),
    ]

    elements = partition(
        example_doc_path("fake-power-point.ppt"), strategy=PartitionStrategy.HI_RES
    )

    assert elements == expected_elements
    assert all(e.metadata.filename == "fake-power-point.ppt" for e in elements)
    assert all(e.metadata.file_directory == example_doc_path("") for e in elements)
|
|
|
|
|
|
# ================================================================================================
|
|
# PPTX
|
|
# ================================================================================================
|
|
|
|
|
|
def test_auto_partition_pptx_from_filename():
    """A PPTX path partitions to the expected slide elements with file metadata on each."""
    expected_elements = [
        Title(text="Adding a Bullet Slide"),
        ListItem(text="Find the bullet slide layout"),
        ListItem(text="Use _TextFrame.text for first bullet"),
        ListItem(text="Use _TextFrame.add_paragraph() for subsequent bullets"),
        NarrativeText(text="Here is a lot of text!"),
        NarrativeText(text="Here is some text in a text box!"),
    ]

    elements = partition(
        example_doc_path("fake-power-point.pptx"), strategy=PartitionStrategy.HI_RES
    )

    assert elements == expected_elements
    assert all(e.metadata.filename == "fake-power-point.pptx" for e in elements)
    assert all(e.metadata.file_directory == example_doc_path("") for e in elements)
|
|
|
|
|
|
@pytest.mark.parametrize("file_name", ["simple.pptx", "fake-power-point.ppt"])
@pytest.mark.parametrize(
    "strategy",
    [
        PartitionStrategy.AUTO,
        PartitionStrategy.FAST,
        PartitionStrategy.HI_RES,
        PartitionStrategy.OCR_ONLY,
    ],
)
def test_partition_forwards_strategy_arg_to_partition_pptx_and_its_brokers(
    request: FixtureRequest, file_name: str, strategy: str
):
    """The `strategy` value passed to `partition()` arrives at `partition_pptx()`.

    For the brokering partitioner (PPT) the value has to travel through `partition_ppt()`, which
    must hand it on to `partition_pptx()`. The replacement element generator below echoes the
    strategy the PPTX partitioner actually received, so the final assertion proves the value
    survived the whole trip.

    Note this is 2 file-types X 4 strategies = 8 test-cases.
    """
    from unstructured.partition.pptx import _PptxPartitioner

    def echo_strategy(self: _PptxPartitioner) -> Iterator[Element]:
        # -- emit a single element whose text reveals the strategy seen by the partitioner --
        yield Text(f"strategy=={self._opts.strategy}")

    iter_presentation_elements_ = method_mock(
        request, _PptxPartitioner, "_iter_presentation_elements", side_effect=echo_strategy
    )

    # -- exactly one element is expected; unpacking fails loudly otherwise --
    [element] = partition(example_doc_path(file_name), strategy=strategy)

    iter_presentation_elements_.assert_called_once_with(ANY)
    assert element.text == f"strategy=={strategy}"
|
|
|
|
|
|
# ================================================================================================
|
|
# RST
|
|
# ================================================================================================
|
|
|
|
|
|
def test_auto_partition_rst_from_filename():
    """An RST path partitions to a leading title tagged with the `text/x-rst` file-type."""
    rst_path = example_doc_path("README.rst")

    first = partition(rst_path)[0]

    assert first == Title("Example Docs")
    assert first.metadata.filetype == "text/x-rst"
|
|
|
|
|
|
def test_auto_partition_rst_from_file():
    """An open RST file with explicit content-type partitions to a `text/x-rst` title."""
    rst_path = example_doc_path("README.rst")

    with open(rst_path, "rb") as rst_file:
        first = partition(file=rst_file, content_type="text/x-rst")[0]

    assert first == Title("Example Docs")
    assert first.metadata.filetype == "text/x-rst"
|
|
|
|
|
|
# ================================================================================================
|
|
# RTF
|
|
# ================================================================================================
|
|
|
|
|
|
def test_auto_partition_rtf_from_filename():
    """An RTF path partitions with the document heading as its first element."""
    rtf_path = example_doc_path("fake-doc.rtf")

    first = partition(rtf_path, strategy=PartitionStrategy.HI_RES)[0]

    assert first == Title("My First Heading")
|
|
|
|
|
|
# ================================================================================================
|
|
# TSV
|
|
# ================================================================================================
|
|
|
|
|
|
def test_auto_partition_tsv_from_filename():
    """A TSV path partitions to the expected text, HTML table and `text/tsv` file-type."""
    tsv_path = example_doc_path("stanley-cups.tsv")

    first = partition(tsv_path)[0]

    assert clean_extra_whitespace(first.text) == EXPECTED_TEXT
    assert first.metadata.text_as_html == EXPECTED_TABLE
    assert first.metadata.filetype == "text/tsv"
|
|
|
|
|
|
# ================================================================================================
|
|
# TXT
|
|
# ================================================================================================
|
|
|
|
|
|
def test_auto_partition_text_from_filename():
    """A plain-text path partitions to the expected elements with file metadata on each."""
    expected_elements = [
        NarrativeText(text="This is a test document to use for unit tests."),
        Address(text="Doylestown, PA 18901"),
        Title(text="Important points:"),
        ListItem(text="Hamburgers are delicious"),
        ListItem(text="Dogs are the best"),
        ListItem(text="I love fuzzy blankets"),
    ]

    elements = partition(
        filename=example_doc_path("fake-text.txt"), strategy=PartitionStrategy.HI_RES
    )

    assert elements == expected_elements
    assert all(e.metadata.filename == "fake-text.txt" for e in elements)
    assert all(e.metadata.file_directory == example_doc_path("") for e in elements)
|
|
|
|
|
|
def test_auto_partition_text_from_file():
    """An open plain-text file partitions to the expected elements."""
    expected_elements = [
        NarrativeText(text="This is a test document to use for unit tests."),
        Address(text="Doylestown, PA 18901"),
        Title(text="Important points:"),
        ListItem(text="Hamburgers are delicious"),
        ListItem(text="Dogs are the best"),
        ListItem(text="I love fuzzy blankets"),
    ]

    with open(example_doc_path("fake-text.txt"), "rb") as text_file:
        elements = partition(file=text_file, strategy=PartitionStrategy.HI_RES)

    assert len(elements) > 0
    assert elements == expected_elements
|
|
|
|
|
|
# ================================================================================================
|
|
# XLS
|
|
# ================================================================================================
|
|
|
|
|
|
def test_auto_partition_xls_from_filename():
    """Partitioning `tests-example.xls` by path gives the expected element counts and table."""
    xls_path = example_doc_path("tests-example.xls")

    elements = partition(xls_path, include_header=False, skip_infer_table_types=[])

    tables = [e for e in elements if isinstance(e, Table)]
    assert len(tables) == 2
    assert len(elements) == 14

    first_element = elements[0]
    assert clean_extra_whitespace(first_element.text)[:45] == (
        "MC What is 2+2? 4 correct 3 incorrect MA What"
    )
    # NOTE(crag): if the beautifulsoup4 package is installed, some (but not all) additional
    # whitespace is removed, so the expected text length is less than is the case when
    # beautifulsoup4 is *not* installed. E.g.
    # "\n\n\nMA\nWhat C datatypes are 8 bits"
    # vs. '\n \n \n MA\n What C datatypes are 8 bits?... "
    assert len(first_element.text) == 550
    assert first_element.metadata.text_as_html == EXPECTED_XLS_TABLE
|
|
|
|
|
|
# ================================================================================================
|
|
# XLSX
|
|
# ================================================================================================
|
|
|
|
|
|
def test_auto_partition_xlsx_from_filename():
    """Partitioning `stanley-cups.xlsx` by path gives the expected elements and metadata."""
    xlsx_mime_type = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"

    elements = partition(
        example_doc_path("stanley-cups.xlsx"), include_header=False, skip_infer_table_types=[]
    )

    assert len(elements) == 4
    assert len([e for e in elements if isinstance(e, Table)]) == 2
    assert len([e for e in elements if isinstance(e, Title)]) == 2

    first, second = elements[0], elements[1]
    assert clean_extra_whitespace(first.text) == "Stanley Cups"
    assert clean_extra_whitespace(second.text) == (
        "Team Location Stanley Cups Blues STL 1 Flyers PHI 2 Maple Leafs TOR 13"
    )
    assert second.metadata.text_as_html == EXPECTED_TABLE_XLSX

    # -- the first two elements are on page 1, the remainder on page 2 --
    for e in elements[:2]:
        assert e.metadata.page_number == 1
    for e in elements[2:]:
        assert e.metadata.page_number == 2
    for e in elements:
        assert e.metadata.filetype == xlsx_mime_type
|
|
|
|
|
|
def test_auto_partition_xlsx_from_file():
    """Partitioning `stanley-cups.xlsx` from an open file gives the expected elements."""
    xlsx_mime_type = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"

    with open(example_doc_path("stanley-cups.xlsx"), "rb") as xlsx_file:
        elements = partition(file=xlsx_file, include_header=False, skip_infer_table_types=[])

    assert len(elements) == 4
    assert len([e for e in elements if isinstance(e, Table)]) == 2
    assert len([e for e in elements if isinstance(e, Title)]) == 2

    first, second = elements[0], elements[1]
    assert clean_extra_whitespace(first.text) == "Stanley Cups"
    assert clean_extra_whitespace(second.text) == (
        "Team Location Stanley Cups Blues STL 1 Flyers PHI 2 Maple Leafs TOR 13"
    )
    assert second.metadata.text_as_html == EXPECTED_TABLE_XLSX

    # -- the first two elements are on page 1, the remainder on page 2 --
    for e in elements[:2]:
        assert e.metadata.page_number == 1
    for e in elements[2:]:
        assert e.metadata.page_number == 2
    for e in elements:
        assert e.metadata.filetype == xlsx_mime_type
|
|
|
|
|
|
def test_auto_partition_xlsx_respects_starting_page_number_argument():
    """Page numbers in the XLSX metadata are counted from `starting_page_number`."""
    xlsx_path = example_doc_path("stanley-cups.xlsx")

    elements = partition(xlsx_path, starting_page_number=3)

    leading, trailing = elements[:2], elements[2:]
    for e in leading:
        assert e.metadata.page_number == 3
    for e in trailing:
        assert e.metadata.page_number == 4
|
|
|
|
|
|
# ================================================================================================
|
|
# XML
|
|
# ================================================================================================
|
|
|
|
|
|
def test_auto_partition_xml_from_filename():
    """Partitioning `factbook.xml` by path without tags gives plain text and the filename."""
    xml_path = example_doc_path("factbook.xml")

    elements = partition(xml_path, xml_keep_tags=False)

    assert elements[0].text == "United States"
    for e in elements:
        assert e.metadata.filename == "factbook.xml"
|
|
|
|
|
|
def test_auto_partition_xml_from_file():
    """Partitioning `factbook.xml` from an open file without tags gives plain text."""
    with open(example_doc_path("factbook.xml"), "rb") as xml_file:
        elements = partition(file=xml_file, xml_keep_tags=False)

    first_text = elements[0].text
    assert first_text == "United States"
|
|
|
|
|
|
def test_auto_partition_xml_from_filename_with_tags():
    """With `xml_keep_tags=True`, XML markup is kept in the text when partitioning by path."""
    xml_path = example_doc_path("factbook.xml")

    first_element = partition(xml_path, xml_keep_tags=True)[0]

    assert "<leader>Joe Biden</leader>" in first_element.text
    assert first_element.metadata.filename == "factbook.xml"
|
|
|
|
|
|
def test_auto_partition_xml_from_file_with_tags():
    """With `xml_keep_tags=True`, XML markup is kept in the text when partitioning a file."""
    with open(example_doc_path("factbook.xml"), "rb") as xml_file:
        elements = partition(file=xml_file, xml_keep_tags=True)

    first_text = elements[0].text
    assert "<leader>Joe Biden</leader>" in first_text
|
|
|
|
|
|
# ================================================================================================
|
|
# FILE_TYPE NOT RECOGNIZED OR NOT SUPPORTED
|
|
# ================================================================================================
|
|
|
|
|
|
def test_auto_partition_raises_with_bad_type(request: FixtureRequest):
    """`partition()` raises `ValueError` when `detect_filetype()` returns `None`."""
    detect_filetype_mock = function_mock(
        request, "unstructured.partition.auto.detect_filetype", return_value=None
    )
    expected_message = "Invalid file made-up.fake. The None file type is not "

    with pytest.raises(ValueError, match=expected_message):
        partition(filename="made-up.fake", strategy=PartitionStrategy.HI_RES)

    # -- file-type detection was consulted exactly once, with only the filename provided --
    detect_filetype_mock.assert_called_once_with(
        content_type=None, encoding=None, file=None, file_filename=None, filename="made-up.fake"
    )
|
|
|
|
|
|
# ================================================================================================
|
|
# LOAD FROM URL
|
|
# ================================================================================================
|
|
|
|
|
|
def test_auto_partition_from_url():
    """Partitioning a URL with an explicit `text/plain` content-type records the URL."""
    license_url = "https://raw.githubusercontent.com/Unstructured-IO/unstructured/main/LICENSE.md"

    elements = partition(
        url=license_url, content_type="text/plain", strategy=PartitionStrategy.HI_RES
    )

    assert elements[0] == Title("Apache License")
    for e in elements:
        assert e.metadata.url == license_url
|
|
|
|
|
|
def test_auto_partition_from_url_with_rfc9110_content_type():
    """A content-type that carries a `; charset=...` parameter is accepted for a URL."""
    license_url = "https://raw.githubusercontent.com/Unstructured-IO/unstructured/main/LICENSE.md"
    content_type = "text/plain; charset=utf-8"

    elements = partition(
        url=license_url, content_type=content_type, strategy=PartitionStrategy.HI_RES
    )

    assert elements[0] == Title("Apache License")
    for e in elements:
        assert e.metadata.url == license_url
|
|
|
|
|
|
def test_auto_partition_from_url_without_providing_content_type():
    """Partitioning a URL works when no `content_type` argument is given."""
    license_url = "https://raw.githubusercontent.com/Unstructured-IO/unstructured/main/LICENSE.md"

    elements = partition(url=license_url, strategy=PartitionStrategy.HI_RES)

    assert elements[0] == Title("Apache License")
    for e in elements:
        assert e.metadata.url == license_url
|
|
|
|
|
|
def test_auto_partition_warns_if_header_set_and_not_url(caplog: LogCaptureFixture):
    """Passing `headers` without `url` logs a warning as the first log record."""
    eml_path = example_doc_path("eml/fake-email.eml")
    request_headers = {"Accept": "application/pdf"}

    partition(eml_path, headers=request_headers, strategy=PartitionStrategy.HI_RES)

    first_record = caplog.records[0]
    assert first_record.levelname == "WARNING"
    assert "headers kwarg is set but the url kwarg is not. The headers kwarg will b" in caplog.text
|
|
|
|
|
|
def test_auto_partition_from_url_routes_timeout_to_HTTP_request(request: FixtureRequest):
    """`request_timeout` is passed through to `file_and_type_from_url()`."""
    # -- the mock raises so partitioning stops right after the call under inspection --
    fetch_mock = function_mock(
        request,
        "unstructured.partition.auto.file_and_type_from_url",
        side_effect=ConnectionError("Trouble on the wire ..."),
    )
    target_url = "http://eie.io"

    with pytest.raises(ConnectionError, match="Trouble on the wire ..."):
        partition(url=target_url, request_timeout=326)

    fetch_mock.assert_called_once_with(
        url="http://eie.io", content_type=None, headers={}, ssl_verify=True, request_timeout=326
    )
|
|
|
|
|
|
# ================================================================================================
|
|
# OTHER ARGS
|
|
# ================================================================================================
|
|
|
|
# -- chunking_strategy ----------------------------------------------------
|
|
|
|
|
|
def test_auto_partition_forwards_chunking_strategy_via_kwargs():
    """With `chunking_strategy="by_title"`, every returned element is a chunk type."""
    html_path = example_doc_path("example-10k-1p.html")
    chunk_types = (CompositeElement, Table, TableChunk)

    chunks = partition(html_path, chunking_strategy="by_title")

    for chunk in chunks:
        assert isinstance(chunk, chunk_types)
|
|
|
|
|
|
def test_auto_partition_forwards_max_characters_via_kwargs():
    """With `max_characters=250`, no chunk's text is longer than 250 characters."""
    html_path = example_doc_path("example-10k-1p.html")

    chunks = partition(html_path, chunking_strategy="by_title", max_characters=250)

    for chunk in chunks:
        assert len(chunk.text) <= 250
|
|
|
|
|
|
# -- detect_language_per_element ------------------------------------------
|
|
|
|
|
|
def test_auto_partition_respects_detect_language_per_element_arg():
    """With `detect_language_per_element=True`, each element gets its own languages."""
    mixed_language_path = example_doc_path("language-docs/eng_spa_mult.txt")
    expected_langs = [["eng"], ["spa", "eng"], ["eng"], ["eng"], ["spa"]]

    elements = partition(mixed_language_path, detect_language_per_element=True)

    langs = [e.metadata.languages for e in elements]
    assert langs == expected_langs
|
|
|
|
|
|
# -- languages ------------------------------------------------------------
|
|
|
|
|
|
@pytest.mark.parametrize(
    "file_extension",
    [
        "doc",
        "docx",
        "eml",
        "epub",
        "html",
        "md",
        "odt",
        "org",
        "ppt",
        "pptx",
        "rst",
        "rtf",
        "txt",
        "xml",
    ],
)
def test_auto_partition_respects_language_arg(file_extension: str):
    """An explicit `languages` argument is recorded on every element for each file type."""
    doc_path = example_doc_path(f"language-docs/eng_spa_mult.{file_extension}")

    elements = partition(doc_path, languages=["deu"])

    for element in elements:
        assert element.metadata.languages == ["deu"]
|
|
|
|
|
|
# -- include_page_breaks --------------------------------------------------
|
|
|
|
|
|
def test_auto_partition_forwards_include_page_breaks_to_partition_pdf():
    """With `include_page_breaks=True`, the PDF elements include a "PageBreak" category."""
    pdf_path = example_doc_path("layout-parser-paper-fast.pdf")

    elements = partition(
        pdf_path, include_page_breaks=True, strategy=PartitionStrategy.HI_RES
    )

    categories = [elem.category for elem in elements]
    assert "PageBreak" in categories
|
|
|
|
|
|
# -- metadata_filename ----------------------------------------------------
|
|
|
|
|
|
def test_auto_partition_forwards_metadata_filename_via_kwargs():
    """`metadata_filename` becomes the filename recorded in each element's metadata."""
    replacement_name = "much-more-interesting-name.txt"

    with open(example_doc_path("fake-text.txt"), "rb") as txt_file:
        elements = partition(file=txt_file, metadata_filename="much-more-interesting-name.txt")

    for e in elements:
        assert e.metadata.filename == replacement_name
|
|
|
|
|
|
def test_auto_partition_warns_about_file_filename_deprecation(caplog: LogCaptureFixture):
    """Using `file_filename` still sets the filename but logs a deprecation warning."""
    txt_path = example_doc_path("fake-text.txt")

    with open(txt_path, "rb") as txt_file:
        elements = partition(file=txt_file, file_filename=txt_path)

    for e in elements:
        assert e.metadata.filename == "fake-text.txt"
    first_record = caplog.records[0]
    assert first_record.levelname == "WARNING"
    assert "The file_filename kwarg will be deprecated" in caplog.text
|
|
|
|
|
|
def test_auto_partition_raises_when_both_file_filename_and_metadata_filename_args_are_used():
    """Supplying both `file_filename` and `metadata_filename` raises `ValueError`."""
    txt_path = example_doc_path("fake-text.txt")
    with open(txt_path, "rb") as txt_file:
        in_memory_file = io.BytesIO(txt_file.read())
    expected_message = "Only one of metadata_filename and file_filename is spe"

    with pytest.raises(ValueError, match=expected_message):
        partition(file=in_memory_file, file_filename=txt_path, metadata_filename=txt_path)
|
|
|
|
|
|
# -- ocr_languages --------------------------------------------------------
|
|
|
|
|
|
def test_auto_partition_image_formats_languages_for_tesseract(request: FixtureRequest):
    """`languages=["zh"]` reaches `process_file_with_ocr()` as a "+"-joined `ocr_languages`."""
    ocr_mock = function_mock(request, "unstructured.partition.pdf_image.ocr.process_file_with_ocr")
    image_path = example_doc_path("chi_sim_image.jpeg")

    partition(image_path, strategy=PartitionStrategy.HI_RES, languages=["zh"])

    # -- inspect the keyword arguments of the first call made to the OCR function --
    first_call_kwargs = ocr_mock.call_args_list[0].kwargs
    assert first_call_kwargs["ocr_languages"] == "chi_sim+chi_sim_vert+chi_tra+chi_tra_vert"
|
|
|
|
|
|
@pytest.mark.parametrize(
    ("languages", "ocr_languages"),
    [
        (["auto"], ""),
        (["eng"], ""),
    ],
)
def test_auto_partition_ignores_empty_string_for_ocr_languages(
    languages: list[str], ocr_languages: str
):
    """An empty-string `ocr_languages` leaves every element with `["eng"]` as its languages."""
    txt_path = example_doc_path("book-war-and-peace-1p.txt")

    elements = partition(
        txt_path,
        strategy=PartitionStrategy.OCR_ONLY,
        ocr_languages=ocr_languages,
        languages=languages,
    )

    for e in elements:
        assert e.metadata.languages == ["eng"]
|
|
|
|
|
|
def test_auto_partition_warns_with_ocr_languages(caplog: LogCaptureFixture):
    """Passing `ocr_languages` logs a deprecation warning as the first log record."""
    pdf_path = example_doc_path("chevron-page.pdf")

    partition(pdf_path, strategy=PartitionStrategy.HI_RES, ocr_languages="eng")

    first_record = caplog.records[0]
    assert first_record.levelname == "WARNING"
    assert "The ocr_languages kwarg will be deprecated" in caplog.text
|
|
|
|
|
|
# -- skip_infer_table_types -----------------------------------------------
|
|
|
|
|
|
@pytest.mark.parametrize(
    ("skip_infer_table_types", "filename", "has_text_as_html"),
    [
        (["xlsx"], "stanley-cups.xlsx", False),
        ([], "stanley-cups.xlsx", True),
        (["odt"], "fake.odt", False),
        ([], "fake.odt", True),
    ],
)
def test_auto_partition_respects_skip_infer_table_types(
    skip_infer_table_types: list[str], filename: str, has_text_as_html: bool
):
    """Tables have `text_as_html` only when their type is not in `skip_infer_table_types`."""
    doc_path = example_doc_path(filename)
    with open(doc_path, "rb") as doc_file:
        elements = partition(file=doc_file, skip_infer_table_types=skip_infer_table_types)

    tables = [element for element in elements if isinstance(element, Table)]
    # -- guard against a vacuous pass: the document must contain at least one table --
    assert tables
    for table in tables:
        html_is_present = table.metadata.text_as_html is not None
        assert html_is_present == has_text_as_html
|
|
|
|
|
|
# ================================================================================================
|
|
# METADATA BEHAVIORS
|
|
# ================================================================================================
|
|
|
|
# -- .filetype ------------------------------------------------------------
|
|
|
|
|
|
@pytest.mark.parametrize(
    ("content_type", "filetype_shortname", "expected_value"),
    [
        ("text/csv", "csv", "text/csv"),
        ("text/html", "html", "text/html"),
        ("jdsfjdfsjkds", "pdf", None),
    ],
)
def test_auto_partition_adds_filetype_to_metadata(
    request: FixtureRequest,
    content_type: str,
    filetype_shortname: str,
    expected_value: str | None,
    monkeypatch: MonkeyPatch,
):
    """`partition()` sets `.metadata.filetype` on elements returned by a mocked partitioner."""
    # -- replace the delegate partitioner with a mock returning two metadata-free elements --
    partitioner_target = f"unstructured.partition.auto.partition_{filetype_shortname}"
    stub_elements = [Text("text 1"), Text("text 2")]
    partition_fn_ = function_mock(request, partitioner_target, return_value=stub_elements)
    monkeypatch.setattr(auto, "PARTITION_WITH_EXTRAS_MAP", {filetype_shortname: partition_fn_})
    pdf_path = example_doc_path("layout-parser-paper-fast.pdf")

    elements = partition(pdf_path, content_type=content_type)

    assert len(elements) == 2
    for e in elements:
        assert e.metadata.filetype == expected_value
|
|
|
|
|
|
@pytest.mark.parametrize(
    "content_type",
    [
        # -- content-type provided as argument --
        "application/pdf",
        # -- auto-detected content-type --
        None,
    ],
)
def test_auto_partition_overwrites_any_filetype_applied_by_file_specific_partitioner(
    request: FixtureRequest, content_type: str | None, monkeypatch: MonkeyPatch
):
    """`partition()` replaces a filetype set by the delegate partitioner with the PDF MIME-type.

    The mocked `partition_pdf()` returns elements whose metadata already carries the bogus
    filetype "imapdf"; after `partition()` every element must report "application/pdf".
    """
    metadata = ElementMetadata(filetype="imapdf")
    partition_pdf_ = function_mock(
        request,
        "unstructured.partition.auto.partition_pdf",
        return_value=[Text("text 1", metadata=metadata), Text("text 2", metadata=metadata)],
    )
    monkeypatch.setattr(auto, "PARTITION_WITH_EXTRAS_MAP", {"pdf": partition_pdf_})

    # -- resolve the document via `example_doc_path()` like the other tests in this module so
    # -- the test does not depend on the current working directory --
    elements = partition(
        example_doc_path("layout-parser-paper-fast.pdf"), content_type=content_type
    )

    assert len(elements) == 2
    assert all(e.metadata.filetype == "application/pdf" for e in elements)
|
|
|
|
|
|
@pytest.mark.parametrize(
    "filetype",
    [
        t
        for t in FileType
        if not (
            t in (FileType.EMPTY, FileType.UNK, FileType.WAV, FileType.XLS, FileType.ZIP)
            or t in IMAGE_FILETYPES
        )
    ],
)
def test_auto_partition_applies_the_correct_filetype_for_all_filetypes(filetype: FileType):
    """Each file-type's own partitioner records that file-type's MIME-type, when it sets one."""
    extension = filetype.name.lower()
    # -- except for two oddballs, the shortname is the extension --
    oddball_shortnames = {FileType.TXT: "text", FileType.EML: "email"}
    partitioner_shortname = oddball_shortnames.get(filetype, extension)
    module = import_module(f"unstructured.partition.{partitioner_shortname}")
    partition_fn = getattr(module, f"partition_{partitioner_shortname}")

    # -- partition the first example-doc with the extension for this filetype --
    suffix = f".{extension}"
    first_match = next(
        (
            path
            for path in pathlib.Path(example_doc_path("")).iterdir()
            if path.is_file() and path.suffix == suffix
        ),
        None,
    )
    elements: list[Element] = [] if first_match is None else partition_fn(str(first_match))

    assert elements
    # -- elements with no filetype recorded are not checked --
    for e in elements:
        if e.metadata.filetype is not None:
            assert e.metadata.filetype == filetype.mime_type
|
|
|
|
|
|
# -- .languages -----------------------------------------------------------
|
|
|
|
|
|
def test_auto_partition_passes_user_provided_languages_arg_to_PDF():
    """A caller-supplied `languages` value is recorded on every element of a PDF."""
    pdf_path = example_doc_path("chevron-page.pdf")

    elements = partition(pdf_path, strategy=PartitionStrategy.OCR_ONLY, languages=["eng"])

    for e in elements:
        assert e.metadata.languages == ["eng"]
|
|
|
|
|
|
def test_auto_partition_languages_argument_default_to_None_when_omitted():
    """Only elements with empty text may have `None` as their languages."""
    docx_path = example_doc_path("handbook-1p.docx")

    elements = partition(docx_path, detect_language_per_element=True)

    # -- PageBreak and any other element with no text is assigned `None` --
    elements_without_languages = [e for e in elements if e.metadata.languages is None]
    for e in elements_without_languages:
        assert e.text == ""
|
|
|
|
|
|
def test_auto_partition_default_does_not_overwrite_other_defaults():
    """`partition()` ["eng"] default does not overwrite ["auto"] default in other partitioners."""
    # -- the default for `languages` is ["auto"] in partition_text --
    from unstructured.partition.text import partition_text

    # -- use a document that is primarily in a language other than English --
    non_english_path = example_doc_path("language-docs/UDHR_first_article_all.txt")

    languages_from_text = partition_text(non_english_path)[0].metadata.languages
    assert languages_from_text != ["eng"]

    # -- auto-partitioning must detect the same languages as the direct partitioner --
    languages_from_auto = partition(non_english_path)[0].metadata.languages
    assert languages_from_auto != ["eng"]
    assert languages_from_auto == languages_from_text
|
|
|
|
|
|
# ================================================================================================
|
|
# MISCELLANEOUS BEHAVIORS
|
|
# ================================================================================================
|
|
|
|
|
|
def test_auto_partition_from_filename_works_on_empty_file():
    """Partitioning `empty.txt` by path returns an empty list."""
    elements = partition(example_doc_path("empty.txt"))

    assert elements == []
|
|
|
|
|
|
def test_auto_partition_from_file_works_on_empty_file():
    """Partitioning `empty.txt` from an open file returns an empty list."""
    with open(example_doc_path("empty.txt"), "rb") as empty_file:
        elements = partition(file=empty_file)
        assert elements == []
|
|
|
|
|
|
def test_auto_partition_requiring_extras_prompts_to_install_missing_dependencies():
    """Looking up "pdf" in an empty partitioner map raises an `ImportError` with a hint."""
    empty_partitioner_map: dict[str, Callable[..., list[Element]]] = {}
    expected_message = "partition_pdf is not available. Install the pdf depen"

    with pytest.raises(ImportError, match=expected_message):
        _get_partition_with_extras("pdf", empty_partitioner_map)
|
|
|
|
|
|
# ================================================================================================
|
|
# MODULE-LEVEL FIXTURES
|
|
# ================================================================================================
|
|
|
|
|
|
@pytest.fixture()
def expected_docx_elements():
    """Fixture returning a fresh list of eight expected DOCX elements, in order."""
    elements = [
        Title("These are a few of my favorite things:"),
        ListItem("Parrots"),
        ListItem("Hockey"),
        Title("Analysis"),
        NarrativeText("This is my first thought. This is my second thought."),
        NarrativeText("This is my third thought."),
        Text("2023"),
        Address("DOYLESTOWN, PA 18901"),
    ]
    return elements
|