mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-09-08 16:16:35 +00:00

**Summary** Prepare auto-partitioning for pluggable partitioners. Move toward a uniform partitioner call signature in `auto/partition()` such that a custom or override partitioner can be registered without requiring code changes. **Additional Context** The central job of `auto/partition()` is to detect the file-type of the given file and use that to dispatch partitioning to the corresponding partitioner function, e.g. `partition_pdf()` or `partition_docx()`. In the existing code, each partitioner function is called with parameters "hand-picked" from the available parameters passed to the `partition()` function. This is unnecessary and couples those partitioners tightly with the dispatch function. The desired state is that all available arguments are passed as `kwargs` and the partitioner function "self-selects" the arguments it will be sensitive to, applies its own appropriate default values when an argument is omitted, and simply ignores any arguments it doesn't use. Note that achieving this requires no changes to partitioner functions because they already do precisely this. So the job is to pass all arguments (other than `filename` and `file`) to the partitioner as `kwargs`. This will allow additional or alternate partitioners to be registered at runtime and dispatched to, because as long as they have the signature `partition_x(filename, file, kwargs) -> list[Element]` they can be dispatched to without customization.
1295 lines
47 KiB
Python
1295 lines
47 KiB
Python
# pyright: reportPrivateUsage=false
|
||
|
||
from __future__ import annotations
|
||
|
||
import json
|
||
import os
|
||
import pathlib
|
||
import tempfile
|
||
import warnings
|
||
from importlib import import_module
|
||
from typing import Iterator
|
||
from unittest.mock import patch
|
||
|
||
import pytest
|
||
from PIL import Image
|
||
|
||
from test_unstructured.partition.pdf_image.test_pdf import assert_element_extraction
|
||
from test_unstructured.partition.test_constants import (
|
||
EXPECTED_TABLE,
|
||
EXPECTED_TABLE_XLSX,
|
||
EXPECTED_TEXT,
|
||
EXPECTED_XLS_TABLE,
|
||
)
|
||
from test_unstructured.unit_utils import (
|
||
ANY,
|
||
FixtureRequest,
|
||
LogCaptureFixture,
|
||
example_doc_path,
|
||
function_mock,
|
||
method_mock,
|
||
)
|
||
from unstructured.cleaners.core import clean_extra_whitespace
|
||
from unstructured.documents.elements import (
|
||
Address,
|
||
CompositeElement,
|
||
Element,
|
||
ElementMetadata,
|
||
ListItem,
|
||
NarrativeText,
|
||
Table,
|
||
TableChunk,
|
||
Text,
|
||
Title,
|
||
)
|
||
from unstructured.file_utils.model import FileType
|
||
from unstructured.partition.auto import _PartitionerLoader, partition
|
||
from unstructured.partition.common import UnsupportedFileFormatError
|
||
from unstructured.partition.utils.constants import PartitionStrategy
|
||
from unstructured.staging.base import elements_from_json, elements_to_dicts, elements_to_json
|
||
|
||
# -- the presence of `/.dockerenv` is used as the signal that the tests are running inside a
# -- Docker container; consumed by `skipif` markers in this module --
is_in_docker = os.path.exists("/.dockerenv")
|
||
|
||
|
||
# ================================================================================================
|
||
# CSV
|
||
# ================================================================================================
|
||
|
||
|
||
def test_auto_partition_csv_from_filename():
    """A CSV file given by path produces the expected table text, HTML and filetype."""
    csv_path = example_doc_path("stanley-cups.csv")

    elements = partition(csv_path)

    table = elements[0]
    assert clean_extra_whitespace(table.text) == EXPECTED_TEXT
    assert table.metadata.text_as_html == EXPECTED_TABLE
    assert table.metadata.filetype == "text/csv"
|
||
|
||
|
||
def test_auto_partition_csv_from_file():
    """A CSV supplied as an open binary file produces the expected `Table` element."""
    csv_path = example_doc_path("stanley-cups.csv")
    with open(csv_path, "rb") as csv_file:
        elements = partition(file=csv_file)

    table = elements[0]
    assert clean_extra_whitespace(table.text) == EXPECTED_TEXT
    assert isinstance(table, Table)
    assert table.metadata.text_as_html == EXPECTED_TABLE
    assert table.metadata.filetype == "text/csv"
|
||
|
||
|
||
# ================================================================================================
|
||
# DOC
|
||
# ================================================================================================
|
||
|
||
|
||
@pytest.mark.parametrize(
    ("pass_metadata_filename", "content_type"),
    [(False, None), (False, "application/msword"), (True, "application/msword"), (True, None)],
)
def test_auto_partition_doc_from_filename(
    pass_metadata_filename: bool, content_type: str | None, expected_docx_elements: list[Element]
):
    """A DOC file given by path partitions to the shared expected elements.

    Parametrized over whether `metadata_filename` and/or `content_type` are supplied; the
    resulting elements and their filename/file_directory metadata must be the same in all cases.
    """
    file_path = example_doc_path("simple.doc")
    metadata_filename = file_path if pass_metadata_filename else None

    elements = partition(
        filename=file_path,
        metadata_filename=metadata_filename,
        content_type=content_type,
        strategy=PartitionStrategy.HI_RES,
    )

    # -- no debug printing here; a failing equality assertion already reports the diff --
    assert elements == expected_docx_elements
    assert all(e.metadata.filename == "simple.doc" for e in elements)
    assert all(e.metadata.file_directory == example_doc_path("") for e in elements)
|
||
|
||
|
||
@pytest.mark.skipif(is_in_docker, reason="Passes in CI but not Docker. Remove skip on #3364 fix.")
def test_auto_partition_doc_from_file(expected_docx_elements: list[Element]):
    """A DOC supplied as an open binary file partitions to the shared expected elements."""
    doc_path = example_doc_path("simple.doc")

    with open(doc_path, "rb") as doc_file:
        elements = partition(file=doc_file)

    assert elements == expected_docx_elements
|
||
|
||
|
||
# ================================================================================================
|
||
# DOCX
|
||
# ================================================================================================
|
||
|
||
|
||
def test_auto_partition_docx_from_filename(expected_docx_elements: list[Element]):
    """A DOCX file given by path yields the expected elements, each tagged with its filename."""
    docx_path = example_doc_path("simple.docx")

    elements = partition(docx_path, strategy=PartitionStrategy.HI_RES)

    assert elements == expected_docx_elements
    assert all(element.metadata.filename == "simple.docx" for element in elements)
|
||
|
||
|
||
def test_auto_partition_docx_from_file(expected_docx_elements: list[Element]):
    """A DOCX supplied as an open binary file partitions to the shared expected elements."""
    docx_path = example_doc_path("simple.docx")

    with open(docx_path, "rb") as docx_file:
        elements = partition(file=docx_file, strategy=PartitionStrategy.HI_RES)

    assert elements == expected_docx_elements
|
||
|
||
|
||
@pytest.mark.parametrize("file_name", ["simple.docx", "simple.doc", "simple.odt"])
@pytest.mark.parametrize(
    "strategy",
    [
        PartitionStrategy.AUTO,
        PartitionStrategy.FAST,
        PartitionStrategy.HI_RES,
        PartitionStrategy.OCR_ONLY,
    ],
)
def test_partition_forwards_strategy_arg_to_partition_docx_and_its_brokers(
    request: FixtureRequest, file_name: str, strategy: str
):
    """The `strategy` value passed to `partition()` reaches `partition_docx()`.

    For the brokering partitioners (DOC, ODT) the value has to travel through `partition_doc()`
    or `partition_odt()`, which must in turn hand it on to `partition_docx()`. The element
    generator of `_DocxPartitioner` is replaced with a fake that reports the strategy it
    observes, so the assertion proves the value arrived intact.

    Note this is 3 file-types X 4 strategies = 12 test-cases.
    """
    from unstructured.partition.docx import _DocxPartitioner

    def report_strategy(self: _DocxPartitioner) -> Iterator[Element]:
        # -- emit a single element whose text records the strategy seen by the partitioner --
        yield Text(f"strategy=={self._opts.strategy}")

    _iter_document_elements_ = method_mock(
        request,
        _DocxPartitioner,
        "_iter_document_elements",
        side_effect=report_strategy,
    )
    file_path = example_doc_path(file_name)

    # -- unpacking also requires that exactly one element came back --
    [element] = partition(file_path, strategy=strategy)

    _iter_document_elements_.assert_called_once_with(ANY)
    assert element.text == f"strategy=={strategy}"
|
||
|
||
|
||
# ================================================================================================
|
||
# EML
|
||
# ================================================================================================
|
||
|
||
# -- elements expected from partitioning the `eml/fake-email.eml` example document --
EXPECTED_EMAIL_OUTPUT = [
    NarrativeText(text="This is a test email to use for unit tests."),
    Title(text="Important points:"),
    ListItem(text="Roses are red"),
    ListItem(text="Violets are blue"),
]
|
||
|
||
|
||
def test_auto_partition_email_from_filename():
    """An EML file given by path yields the expected elements and filename metadata."""
    file_path = example_doc_path("eml/fake-email.eml")
    expected_directory, expected_filename = os.path.split(file_path)

    elements = partition(file_path, strategy=PartitionStrategy.HI_RES)

    assert len(elements) > 0
    assert elements == EXPECTED_EMAIL_OUTPUT
    first = elements[0]
    assert first.metadata.filename == expected_filename
    assert first.metadata.file_directory == expected_directory
|
||
|
||
|
||
def test_auto_partition_email_from_file():
    """An EML supplied as an open binary file yields the expected email elements."""
    eml_path = example_doc_path("eml/fake-email.eml")

    with open(eml_path, "rb") as eml_file:
        elements = partition(file=eml_file, strategy=PartitionStrategy.HI_RES)

    assert len(elements) > 0
    assert elements == EXPECTED_EMAIL_OUTPUT
|
||
|
||
|
||
# ================================================================================================
|
||
# EPUB
|
||
# ================================================================================================
|
||
|
||
|
||
def test_auto_partition_epub_from_filename():
    """An EPUB file given by path yields elements starting with the book's title line."""
    epub_path = example_doc_path("winter-sports.epub")

    elements = partition(epub_path, strategy=PartitionStrategy.HI_RES)

    assert len(elements) > 0
    opening_text = elements[0].text
    assert opening_text.startswith("The Project Gutenberg eBook of Winter Sports")
|
||
|
||
|
||
def test_auto_partition_epub_from_file():
    """An EPUB supplied as an open binary file yields elements starting with the title line."""
    epub_path = example_doc_path("winter-sports.epub")

    with open(epub_path, "rb") as epub_file:
        elements = partition(file=epub_file, strategy=PartitionStrategy.HI_RES)

    assert len(elements) > 0
    opening_text = elements[0].text
    assert opening_text.startswith("The Project Gutenberg eBook of Winter Sports")
|
||
|
||
|
||
# ================================================================================================
|
||
# HTML
|
||
# ================================================================================================
|
||
|
||
|
||
@pytest.mark.parametrize(
    ("pass_metadata_filename", "content_type"),
    [(False, None), (False, "text/html"), (True, "text/html"), (True, None)],
)
def test_auto_partition_html_from_filename(pass_metadata_filename: bool, content_type: str | None):
    """An HTML file given by path yields elements carrying that file's name and directory.

    Parametrized over whether `metadata_filename` and/or `content_type` are supplied.
    """
    file_path = example_doc_path("example-10k-1p.html")
    # -- `os.path.split()` returns (directory, basename) --
    expected_directory, expected_filename = os.path.split(file_path)

    elements = partition(
        filename=file_path,
        metadata_filename=file_path if pass_metadata_filename else None,
        content_type=content_type,
        strategy=PartitionStrategy.HI_RES,
    )

    assert elements
    assert all(e.metadata.filename == expected_filename for e in elements)
    assert all(e.metadata.file_directory == expected_directory for e in elements)
|
||
|
||
|
||
@pytest.mark.parametrize(
    ("pass_metadata_filename", "content_type"),
    [(False, None), (False, "text/html"), (True, "text/html"), (True, None)],
)
def test_auto_partition_html_from_file(pass_metadata_filename: bool, content_type: str | None):
    """An HTML document supplied as an open binary file yields at least one element.

    Parametrized over whether `metadata_filename` and/or `content_type` are supplied.
    """
    file_path = example_doc_path("example-10k-1p.html")
    partition_kwargs = {
        "metadata_filename": file_path if pass_metadata_filename else None,
        "content_type": content_type,
        "strategy": PartitionStrategy.HI_RES,
    }

    with open(file_path, "rb") as html_file:
        elements = partition(file=html_file, **partition_kwargs)

    assert len(elements) > 0
|
||
|
||
|
||
def test_auto_partition_html_pre_from_file():
    """An `.htm` document given by path is partitioned as HTML with no `PageBreak` elements."""
    htm_path = example_doc_path("fake-html-pre.htm")

    elements = partition(htm_path)

    assert len(elements) > 0
    categories = [element.category for element in elements]
    assert "PageBreak" not in categories
    first = elements[0]
    assert clean_extra_whitespace(first.text).startswith("[107th Congress Public Law 56]")
    assert isinstance(first, NarrativeText)
    assert all(element.metadata.filetype == "text/html" for element in elements)
    assert all(element.metadata.filename == "fake-html-pre.htm" for element in elements)
|
||
|
||
|
||
# ================================================================================================
|
||
# IMAGE
|
||
# ================================================================================================
|
||
|
||
|
||
@pytest.mark.parametrize(
    ("pass_metadata_filename", "content_type"),
    [(False, None), (False, "image/jpeg"), (True, "image/jpeg"), (True, None)],
)
def test_auto_partition_jpeg_from_filename(pass_metadata_filename: bool, content_type: str | None):
    """A JPEG given by path yields the paper title as its third element, with coordinates.

    Parametrized over whether `metadata_filename` and/or `content_type` are supplied.
    """
    file_path = example_doc_path("img/layout-parser-paper-fast.jpg")

    elements = partition(
        filename=file_path,
        metadata_filename=file_path if pass_metadata_filename else None,
        content_type=content_type,
        strategy=PartitionStrategy.AUTO,
    )

    third = elements[2]
    assert third.text == (
        "LayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis"
    )
    assert third.metadata.coordinates is not None
|
||
|
||
|
||
@pytest.mark.parametrize(
    ("pass_metadata_filename", "content_type"),
    [(False, None), (False, "image/jpeg"), (True, "image/jpeg"), (True, None)],
)
def test_auto_partition_jpeg_from_file(pass_metadata_filename: bool, content_type: str | None):
    """A JPEG supplied as an open file yields the paper title as its third element.

    Parametrized over whether `metadata_filename` and/or `content_type` are supplied.
    """
    file_path = example_doc_path("img/layout-parser-paper-fast.jpg")
    partition_kwargs = {
        "metadata_filename": file_path if pass_metadata_filename else None,
        "content_type": content_type,
        "strategy": PartitionStrategy.AUTO,
    }

    with open(file_path, "rb") as image_file:
        elements = partition(file=image_file, **partition_kwargs)

    third = elements[2]
    assert third.text == (
        "LayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis"
    )
    assert third.metadata.coordinates is not None
|
||
|
||
|
||
def test_auto_partition_bmp_from_filename(tmp_path: pathlib.Path):
    """A BMP image given by path is partitioned and exactly one element carries table HTML."""
    # -- build the BMP input by re-saving an example JPEG under a `.bmp` name --
    source_jpg = example_doc_path("img/layout-parser-paper-with-table.jpg")
    bmp_filename = str(tmp_path / "example.bmp")
    with Image.open(source_jpg) as img:
        img.save(bmp_filename)

    elements = partition(filename=bmp_filename, strategy=PartitionStrategy.HI_RES)

    tables_html = [
        element.metadata.text_as_html for element in elements if element.metadata.text_as_html
    ]
    assert len(tables_html) == 1
    html = tables_html[0]
    assert "<table><thead><tr>" in html
    assert "</thead><tbody><tr>" in html
|
||
|
||
|
||
@pytest.mark.parametrize("extract_image_block_to_payload", [False, True])
def test_auto_partition_image_element_extraction(extract_image_block_to_payload: bool):
    """Image-block extraction options given to `partition()` take effect for an image file.

    The outcome is checked by the shared `assert_element_extraction()` helper.
    """
    extract_image_block_types = ["Image", "Table"]
    image_path = example_doc_path("img/embedded-images-tables.jpg")

    # -- verification happens while the temporary output directory still exists --
    with tempfile.TemporaryDirectory() as tmpdir:
        elements = partition(
            filename=image_path,
            extract_image_block_types=extract_image_block_types,
            extract_image_block_to_payload=extract_image_block_to_payload,
            extract_image_block_output_dir=tmpdir,
        )

        assert_element_extraction(
            elements, extract_image_block_types, extract_image_block_to_payload, tmpdir
        )
|
||
|
||
|
||
# ================================================================================================
|
||
# JSON
|
||
# ================================================================================================
|
||
|
||
|
||
# TODO(scanny): This test should go away when we fix #3365. This test glosses over several
# important JSON "rehydration" behaviors, in particular that the metadata should match exactly.
# The following test `test_auto_partition_json_from_file_preserves_original_elements` will be the
# replacement for this test.
def test_auto_partitioned_json_output_maintains_consistency_with_fixture_elements():
    """Test auto-processing an unstructured json output file by filename.

    The JSON fixture is partitioned, serialized back to JSON and compared with the fixture's
    own contents. Metadata is removed from both sides before comparing, so only the
    non-metadata fields of each element are checked.
    """
    json_file_path = example_doc_path("spring-weather.html.json")
    original_file_name = "spring-weather.html"
    # -- read as UTF-8 explicitly so the result does not depend on the platform default --
    with open(json_file_path, encoding="utf-8") as json_f:
        expected_result = json.load(json_f)

    partitioning_result = json.loads(
        elements_to_json(
            partition(
                filename=str(json_file_path),
                # -- use the original file name to get the same element IDs (hashes) --
                metadata_filename=original_file_name,
                strategy=PartitionStrategy.HI_RES,
            )
        )
    )
    for elem in partitioning_result:
        elem.pop("metadata")
    for elem in expected_result:
        elem.pop("metadata")

    assert expected_result == partitioning_result
|
||
|
||
|
||
@pytest.mark.xfail(
    reason=(
        "https://github.com/Unstructured-IO/unstructured/issues/3365"
        " partition_json() does not preserve original element-id or metadata"
    ),
    raises=AssertionError,
    strict=True,
)
def test_auto_partition_json_from_file_preserves_original_elements():
    """Partitioning a JSON file should reproduce the elements serialized in it.

    Marked as a strict expected-failure; see the issue linked in the marker.
    """
    file_path = example_doc_path("simple.json")
    expected_dicts = elements_to_dicts(elements_from_json(file_path))

    with open(file_path, "rb") as json_file:
        partitioned_elements = partition(file=json_file)

    assert elements_to_dicts(partitioned_elements) == expected_dicts
|
||
|
||
|
||
def test_auto_partition_json_raises_with_unprocessable_json(tmp_path: pathlib.Path):
    """`partition()` raises `ValueError` for JSON that is not in the Unstructured format."""
    # NOTE(robinson) - This is unprocessable because it is not a list of dicts, per the
    # Unstructured JSON serialization format
    json_path = tmp_path / "unprocessable.json"
    json_path.write_text('{"hi": "there"}')
    file_path = str(json_path)

    with pytest.raises(ValueError, match="Detected a JSON file that does not conform to the Unst"):
        partition(filename=file_path)
|
||
|
||
|
||
# ================================================================================================
|
||
# MD
|
||
# ================================================================================================
|
||
|
||
|
||
def test_partition_md_from_url_works_with_embedded_html():
    """A Markdown document fetched by URL is partitioned when `content_type` is given."""
    readme_url = "https://raw.githubusercontent.com/Unstructured-IO/unstructured/main/README.md"

    elements = partition(
        url=readme_url, content_type="text/markdown", strategy=PartitionStrategy.HI_RES
    )

    first_text = elements[0].text
    assert "unstructured" in first_text
|
||
|
||
|
||
# ================================================================================================
|
||
# MSG
|
||
# ================================================================================================
|
||
|
||
|
||
def test_auto_partition_msg_from_filename():
    """An MSG file given by path yields the expected email elements."""
    msg_path = example_doc_path("fake-email.msg")
    expected_elements = [
        NarrativeText(text="This is a test email to use for unit tests."),
        Title(text="Important points:"),
        ListItem(text="Roses are red"),
        ListItem(text="Violets are blue"),
    ]

    elements = partition(msg_path, strategy=PartitionStrategy.HI_RES)

    assert elements == expected_elements
|
||
|
||
|
||
# ================================================================================================
|
||
# ODT
|
||
# ================================================================================================
|
||
|
||
|
||
def test_auto_partition_odt_from_filename(expected_docx_elements: list[Element]):
    """An ODT file given by path partitions to the shared expected elements."""
    odt_path = example_doc_path("simple.odt")

    elements = partition(odt_path, strategy=PartitionStrategy.HI_RES)

    assert elements == expected_docx_elements
|
||
|
||
|
||
def test_auto_partition_odt_from_file(expected_docx_elements: list[Element]):
    """An ODT supplied as an open binary file partitions to the shared expected elements."""
    odt_path = example_doc_path("simple.odt")

    with open(odt_path, "rb") as odt_file:
        elements = partition(file=odt_file, strategy=PartitionStrategy.HI_RES)

    assert elements == expected_docx_elements
|
||
|
||
|
||
# ================================================================================================
|
||
# ORG
|
||
# ================================================================================================
|
||
|
||
|
||
def test_auto_partition_org_from_filename():
    """An Org file given by path yields its heading as a `Title` with the org filetype."""
    org_path = example_doc_path("README.org")

    elements = partition(org_path)

    heading = elements[0]
    assert heading == Title("Example Docs")
    assert heading.metadata.filetype == "text/org"
|
||
|
||
|
||
def test_auto_partition_org_from_file():
    """An Org document supplied as an open file with an explicit content-type is partitioned."""
    org_path = example_doc_path("README.org")

    with open(org_path, "rb") as org_file:
        elements = partition(file=org_file, content_type="text/org")

    heading = elements[0]
    assert heading == Title("Example Docs")
    assert heading.metadata.filetype == "text/org"
|
||
|
||
|
||
# ================================================================================================
|
||
# PDF
|
||
# ================================================================================================
|
||
|
||
|
||
@pytest.mark.parametrize(
    ("pass_metadata_filename", "content_type"),
    [(False, None), (False, "application/pdf"), (True, "application/pdf"), (True, None)],
)
def test_auto_partition_pdf_from_filename(pass_metadata_filename: bool, content_type: str | None):
    """A PDF given by path yields the expected leading elements and filename metadata.

    Parametrized over whether `metadata_filename` and/or `content_type` are supplied.
    """
    file_path = example_doc_path("pdf/chevron-page.pdf")
    # -- `os.path.split()` returns (directory, basename) --
    expected_directory, expected_filename = os.path.split(file_path)

    elements = partition(
        filename=file_path,
        metadata_filename=file_path if pass_metadata_filename else None,
        content_type=content_type,
        strategy=PartitionStrategy.HI_RES,
    )

    title = elements[0]
    assert isinstance(title, Title)
    assert title.text.startswith("eastern mediterranean")
    assert title.metadata.filename == expected_filename
    assert title.metadata.file_directory == expected_directory

    narrative = elements[1]
    assert isinstance(narrative, NarrativeText)
    assert narrative.text.startswith("We’re investing")
|
||
|
||
|
||
@pytest.mark.parametrize(
    ("pass_metadata_filename", "content_type"),
    [(False, None), (False, "application/pdf"), (True, "application/pdf"), (True, None)],
)
def test_auto_partition_pdf_from_file(pass_metadata_filename: bool, content_type: str | None):
    """A PDF supplied as an open binary file yields the expected leading elements.

    Parametrized over whether `metadata_filename` and/or `content_type` are supplied.
    """
    file_path = example_doc_path("pdf/chevron-page.pdf")
    partition_kwargs = {
        "metadata_filename": file_path if pass_metadata_filename else None,
        "content_type": content_type,
        "strategy": PartitionStrategy.HI_RES,
    }

    with open(file_path, "rb") as pdf_file:
        elements = partition(file=pdf_file, **partition_kwargs)

    title = elements[0]
    assert isinstance(title, Title)
    assert title.text.startswith("eastern mediterranean")

    narrative = elements[1]
    assert isinstance(narrative, NarrativeText)
    assert narrative.text.startswith("We’re investing")
|
||
|
||
|
||
def test_auto_partition_pdf_with_fast_strategy(request: FixtureRequest):
    """`partition()` looks up the PDF partitioner and calls it with the expected arguments.

    Both `partition_pdf()` and `_PartitionerLoader.get()` are mocked, so this checks only the
    dispatch and the keyword arguments passed on; no PDF is actually parsed.
    """
    partition_pdf_ = function_mock(
        request,
        "unstructured.partition.pdf.partition_pdf",
        return_value=[NarrativeText("Hello there!")],
    )
    # -- the loader hands back the mock so the dispatch lands on it --
    get_ = method_mock(request, _PartitionerLoader, "get", return_value=partition_pdf_)
    file_path = example_doc_path("pdf/layout-parser-paper-fast.pdf")
    expected_kwargs = {
        "filename": file_path,
        "file": None,
        "url": None,
        "strategy": PartitionStrategy.FAST,
        "languages": None,
        "metadata_filename": None,
        "infer_table_structure": False,
        "extract_images_in_pdf": False,
        "extract_image_block_types": None,
        "extract_image_block_output_dir": None,
        "extract_image_block_to_payload": False,
        "hi_res_model_name": None,
        "starting_page_number": 1,
    }

    partition(file_path, strategy=PartitionStrategy.FAST)

    get_.assert_called_once_with(ANY, FileType.PDF)
    partition_pdf_.assert_called_once_with(**expected_kwargs)
|
||
|
||
|
||
def test_auto_partition_pdf_uses_pdf_infer_table_structure_argument():
    """`pdf_infer_table_structure=True` arrives at the OCR step as `infer_table_structure`."""
    pdf_path = example_doc_path("pdf/layout-parser-paper-fast.pdf")
    ocr_target = "unstructured.partition.pdf_image.ocr.process_file_with_ocr"

    with patch(ocr_target) as process_file_with_ocr_:
        partition(pdf_path, pdf_infer_table_structure=True, strategy=PartitionStrategy.HI_RES)

        # -- `call_args` unpacks to (positional-args, keyword-args) of the most recent call --
        _, call_kwargs = process_file_with_ocr_.call_args
        assert call_kwargs["infer_table_structure"]
|
||
|
||
|
||
@pytest.mark.parametrize("extract_image_block_to_payload", [False, True])
def test_auto_partition_pdf_element_extraction(extract_image_block_to_payload: bool):
    """Image-block extraction options given to `partition()` take effect for a PDF file.

    The outcome is checked by the shared `assert_element_extraction()` helper.
    """
    extract_image_block_types = ["Image", "Table"]
    pdf_path = example_doc_path("pdf/embedded-images-tables.pdf")

    # -- verification happens while the temporary output directory still exists --
    with tempfile.TemporaryDirectory() as tmpdir:
        elements = partition(
            pdf_path,
            extract_image_block_types=extract_image_block_types,
            extract_image_block_to_payload=extract_image_block_to_payload,
            extract_image_block_output_dir=tmpdir,
        )

        assert_element_extraction(
            elements, extract_image_block_types, extract_image_block_to_payload, tmpdir
        )
|
||
|
||
|
||
def test_partition_pdf_does_not_raise_warning():
    """Partitioning a PDF with the hi-res strategy emits no warnings."""
    pdf_path = example_doc_path("pdf/layout-parser-paper-fast.pdf")

    # NOTE(robinson): This is the recommended way to check that no warning is emitted,
    # per the pytest docs.
    # ref: https://docs.pytest.org/en/7.0.x/how-to/capture-warnings.html
    #      #additional-use-cases-of-warnings-in-tests
    with warnings.catch_warnings():
        # -- turn any warning into an exception so it fails the test --
        warnings.simplefilter("error")
        partition(pdf_path, strategy=PartitionStrategy.HI_RES)
|
||
|
||
|
||
# ================================================================================================
|
||
# PPT
|
||
# ================================================================================================
|
||
|
||
|
||
def test_auto_partition_ppt_from_filename():
    """A PPT file given by path yields the expected slide elements and filename metadata."""
    file_path = example_doc_path("fake-power-point.ppt")
    expected_elements = [
        Title(text="Adding a Bullet Slide"),
        ListItem(text="Find the bullet slide layout"),
        ListItem(text="Use _TextFrame.text for first bullet"),
        ListItem(text="Use _TextFrame.add_paragraph() for subsequent bullets"),
        NarrativeText(text="Here is a lot of text!"),
        NarrativeText(text="Here is some text in a text box!"),
    ]

    elements = partition(file_path, strategy=PartitionStrategy.HI_RES)

    assert elements == expected_elements
    assert all(e.metadata.filename == "fake-power-point.ppt" for e in elements)
    assert all(e.metadata.file_directory == example_doc_path("") for e in elements)
|
||
|
||
|
||
# ================================================================================================
|
||
# PPTX
|
||
# ================================================================================================
|
||
|
||
|
||
def test_auto_partition_pptx_from_filename():
    """A PPTX file given by path yields the expected slide elements and filename metadata."""
    file_path = example_doc_path("fake-power-point.pptx")
    expected_elements = [
        Title(text="Adding a Bullet Slide"),
        ListItem(text="Find the bullet slide layout"),
        ListItem(text="Use _TextFrame.text for first bullet"),
        ListItem(text="Use _TextFrame.add_paragraph() for subsequent bullets"),
        NarrativeText(text="Here is a lot of text!"),
        NarrativeText(text="Here is some text in a text box!"),
    ]

    elements = partition(file_path, strategy=PartitionStrategy.HI_RES)

    assert elements == expected_elements
    assert all(e.metadata.filename == "fake-power-point.pptx" for e in elements)
    assert all(e.metadata.file_directory == example_doc_path("") for e in elements)
|
||
|
||
|
||
@pytest.mark.parametrize("file_name", ["simple.pptx", "fake-power-point.ppt"])
@pytest.mark.parametrize(
    "strategy",
    [
        PartitionStrategy.AUTO,
        PartitionStrategy.FAST,
        PartitionStrategy.HI_RES,
        PartitionStrategy.OCR_ONLY,
    ],
)
def test_partition_forwards_strategy_arg_to_partition_pptx_and_its_brokers(
    request: FixtureRequest, file_name: str, strategy: str
):
    """The `strategy` value passed to `partition()` reaches `partition_pptx()`.

    For the brokering partitioner (PPT) the value has to travel through `partition_ppt()`,
    which must in turn hand it on to `partition_pptx()`. The element generator of
    `_PptxPartitioner` is replaced with a fake that reports the strategy it observes, so the
    assertion proves the value arrived intact.

    Note this is 2 file-types X 4 strategies = 8 test-cases.
    """
    from unstructured.partition.pptx import _PptxPartitioner

    def report_strategy(self: _PptxPartitioner) -> Iterator[Element]:
        # -- emit a single element whose text records the strategy seen by the partitioner --
        yield Text(f"strategy=={self._opts.strategy}")

    _iter_presentation_elements_ = method_mock(
        request,
        _PptxPartitioner,
        "_iter_presentation_elements",
        side_effect=report_strategy,
    )
    file_path = example_doc_path(file_name)

    # -- unpacking also requires that exactly one element came back --
    [element] = partition(file_path, strategy=strategy)

    _iter_presentation_elements_.assert_called_once_with(ANY)
    assert element.text == f"strategy=={strategy}"
|
||
|
||
|
||
# ================================================================================================
|
||
# RST
|
||
# ================================================================================================
|
||
|
||
|
||
def test_auto_partition_rst_from_filename():
    """An RST file given by path yields its heading as a `Title` with the rst filetype."""
    rst_path = example_doc_path("README.rst")

    elements = partition(rst_path)

    heading = elements[0]
    assert heading == Title("Example Docs")
    assert heading.metadata.filetype == "text/x-rst"
|
||
|
||
|
||
def test_auto_partition_rst_from_file():
    """An RST document supplied as an open file with an explicit content-type is partitioned."""
    rst_path = example_doc_path("README.rst")

    with open(rst_path, "rb") as rst_file:
        elements = partition(file=rst_file, content_type="text/x-rst")

    heading = elements[0]
    assert heading == Title("Example Docs")
    assert heading.metadata.filetype == "text/x-rst"
|
||
|
||
|
||
# ================================================================================================
|
||
# RTF
|
||
# ================================================================================================
|
||
|
||
|
||
def test_auto_partition_rtf_from_filename():
    """An RTF file given by path yields its heading as the first element."""
    rtf_path = example_doc_path("fake-doc.rtf")

    elements = partition(rtf_path, strategy=PartitionStrategy.HI_RES)

    heading = elements[0]
    assert heading == Title("My First Heading")
|
||
|
||
|
||
# ================================================================================================
|
||
# TSV
|
||
# ================================================================================================
|
||
|
||
|
||
def test_auto_partition_tsv_from_filename():
    """A TSV file given by path produces the expected table text, HTML and filetype."""
    tsv_path = example_doc_path("stanley-cups.tsv")

    elements = partition(tsv_path)

    table = elements[0]
    assert clean_extra_whitespace(table.text) == EXPECTED_TEXT
    assert table.metadata.text_as_html == EXPECTED_TABLE
    assert table.metadata.filetype == "text/tsv"
|
||
|
||
|
||
# ================================================================================================
|
||
# TXT
|
||
# ================================================================================================
|
||
@pytest.mark.parametrize(
    ("filename", "expected_elements"),
    [
        (
            "fake-text.txt",
            [
                NarrativeText(text="This is a test document to use for unit tests."),
                Address(text="Doylestown, PA 18901"),
                Title(text="Important points:"),
                ListItem(text="Hamburgers are delicious"),
                ListItem(text="Dogs are the best"),
                ListItem(text="I love fuzzy blankets"),
            ],
        ),
        ("fake-text-all-whitespace.txt", []),
    ],
)
def test_auto_partition_text_from_filename(filename: str, expected_elements: list[Element]):
    """A text file given by path yields the expected elements and filename metadata.

    The all-whitespace case expects an empty element list.
    """
    file_path = example_doc_path(filename)
    expected_directory = example_doc_path("")

    elements = partition(filename=file_path, strategy=PartitionStrategy.HI_RES)

    assert elements == expected_elements
    assert all(element.metadata.filename == filename for element in elements)
    assert all(element.metadata.file_directory == expected_directory for element in elements)
|
||
|
||
|
||
def test_auto_partition_text_from_file():
    """A text document supplied as an open binary file yields the expected elements."""
    txt_path = example_doc_path("fake-text.txt")
    expected_elements = [
        NarrativeText(text="This is a test document to use for unit tests."),
        Address(text="Doylestown, PA 18901"),
        Title(text="Important points:"),
        ListItem(text="Hamburgers are delicious"),
        ListItem(text="Dogs are the best"),
        ListItem(text="I love fuzzy blankets"),
    ]

    with open(txt_path, "rb") as txt_file:
        elements = partition(file=txt_file, strategy=PartitionStrategy.HI_RES)

    assert len(elements) > 0
    assert elements == expected_elements
|
||
|
||
|
||
# ================================================================================================
|
||
# XLS
|
||
# ================================================================================================
|
||
|
||
|
||
def test_auto_partition_xls_from_filename():
    """An `.xls` workbook given by path produces 14 elements, two of which are tables."""
    xls_path = example_doc_path("tests-example.xls")

    elements = partition(xls_path, include_header=False, skip_infer_table_types=[])

    assert len(elements) == 14
    tables = [element for element in elements if isinstance(element, Table)]
    assert len(tables) == 2
    # -- the first element carries the HTML rendering and the full text of its table --
    first_element = elements[0]
    assert first_element.metadata.text_as_html == EXPECTED_XLS_TABLE
    assert len(first_element.text) == 507
|
||
|
||
|
||
# ================================================================================================
|
||
# XLSX
|
||
# ================================================================================================
|
||
|
||
|
||
def test_auto_partition_xlsx_from_filename():
    """An `.xlsx` workbook given by path yields a title and a table for each of its two sheets."""
    xlsx_path = example_doc_path("stanley-cups.xlsx")

    elements = partition(xlsx_path, include_header=False, skip_infer_table_types=[])

    assert len(elements) == 4
    assert len([e for e in elements if isinstance(e, Table)]) == 2
    assert len([e for e in elements if isinstance(e, Title)]) == 2

    sheet_title, sheet_table = elements[0], elements[1]
    assert clean_extra_whitespace(sheet_title.text) == "Stanley Cups"
    assert clean_extra_whitespace(sheet_table.text) == (
        "Team Location Stanley Cups Blues STL 1 Flyers PHI 2 Maple Leafs TOR 13"
    )
    assert sheet_table.metadata.text_as_html == EXPECTED_TABLE_XLSX

    # -- the first two elements are on page 1 and the remaining ones on page 2 --
    for element in elements[:2]:
        assert element.metadata.page_number == 1
    for element in elements[2:]:
        assert element.metadata.page_number == 2

    for element in elements:
        assert (
            element.metadata.filetype
            == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
        )
|
||
|
||
|
||
def test_auto_partition_xlsx_from_file():
    """An `.xlsx` workbook supplied as an open file partitions the same as one given by path."""
    xlsx_mime_type = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"

    with open(example_doc_path("stanley-cups.xlsx"), "rb") as workbook:
        elements = partition(file=workbook, include_header=False, skip_infer_table_types=[])

    assert len(elements) == 4
    table_count = sum(1 for element in elements if isinstance(element, Table))
    title_count = sum(1 for element in elements if isinstance(element, Title))
    assert table_count == 2
    assert title_count == 2

    assert clean_extra_whitespace(elements[0].text) == "Stanley Cups"
    table_text = clean_extra_whitespace(elements[1].text)
    assert table_text == "Team Location Stanley Cups Blues STL 1 Flyers PHI 2 Maple Leafs TOR 13"
    assert elements[1].metadata.text_as_html == EXPECTED_TABLE_XLSX

    # -- page numbers follow the split: first two elements on page 1, the rest on page 2 --
    leading, trailing = elements[:2], elements[2:]
    assert all(element.metadata.page_number == 1 for element in leading)
    assert all(element.metadata.page_number == 2 for element in trailing)
    assert all(element.metadata.filetype == xlsx_mime_type for element in elements)
|
||
|
||
|
||
def test_auto_partition_xlsx_respects_starting_page_number_argument():
    """`starting_page_number=3` numbers the first two elements 3 and all later ones 4."""
    xlsx_path = example_doc_path("stanley-cups.xlsx")

    elements = partition(xlsx_path, starting_page_number=3)

    for element in elements[:2]:
        assert element.metadata.page_number == 3
    for element in elements[2:]:
        assert element.metadata.page_number == 4
|
||
|
||
|
||
# ================================================================================================
|
||
# XML
|
||
# ================================================================================================
|
||
|
||
|
||
def test_auto_partition_xml_from_filename():
    """With `xml_keep_tags=False`, an XML file given by path yields plain text elements."""
    xml_path = example_doc_path("factbook.xml")

    elements = partition(xml_path, xml_keep_tags=False)

    first_element = elements[0]
    assert first_element.text == "United States"
    for element in elements:
        assert element.metadata.filename == "factbook.xml"
|
||
|
||
|
||
def test_auto_partition_xml_from_file():
    """With `xml_keep_tags=False`, an XML document supplied as an open file yields plain text."""
    with open(example_doc_path("factbook.xml"), "rb") as xml_file:
        elements = partition(file=xml_file, xml_keep_tags=False)

    first_element = elements[0]
    assert first_element.text == "United States"
|
||
|
||
|
||
def test_auto_partition_xml_from_filename_with_tags():
    """With `xml_keep_tags=True`, the XML markup is retained in the element text."""
    xml_path = example_doc_path("factbook.xml")

    elements = partition(xml_path, xml_keep_tags=True)

    first_element = elements[0]
    assert "<leader>Joe Biden</leader>" in first_element.text
    assert first_element.metadata.filename == "factbook.xml"
|
||
|
||
|
||
def test_auto_partition_xml_from_file_with_tags():
    """With `xml_keep_tags=True`, markup is retained when the XML is supplied as an open file."""
    with open(example_doc_path("factbook.xml"), "rb") as xml_file:
        elements = partition(file=xml_file, xml_keep_tags=True)

    first_element = elements[0]
    assert "<leader>Joe Biden</leader>" in first_element.text
|
||
|
||
|
||
# ================================================================================================
|
||
# FILE_TYPE NOT RECOGNIZED OR NOT SUPPORTED
|
||
# ================================================================================================
|
||
|
||
|
||
def test_auto_partition_raises_with_bad_type(request: FixtureRequest):
    """`partition()` raises `UnsupportedFileFormatError` when detection reports `FileType.UNK`."""
    # -- force file-type detection to report an unknown type --
    detect_filetype_ = function_mock(
        request, "unstructured.partition.auto.detect_filetype", return_value=FileType.UNK
    )
    expected_message = "Partitioning is not supported for the FileType.UNK file type."

    with pytest.raises(UnsupportedFileFormatError, match=expected_message):
        partition(filename="made-up.fake", strategy=PartitionStrategy.HI_RES)

    # -- detection was consulted exactly once, with only the file path populated --
    expected_kwargs = {
        "file_path": "made-up.fake",
        "file": None,
        "encoding": None,
        "content_type": None,
        "metadata_file_path": None,
    }
    detect_filetype_.assert_called_once_with(**expected_kwargs)
|
||
|
||
|
||
# ================================================================================================
|
||
# LOAD FROM URL
|
||
# ================================================================================================
|
||
|
||
|
||
def test_auto_partition_from_url():
    """A document fetched by URL with an explicit content-type records that URL in metadata."""
    url = "https://raw.githubusercontent.com/Unstructured-IO/unstructured/main/LICENSE.md"

    elements = partition(url=url, content_type="text/plain", strategy=PartitionStrategy.HI_RES)

    first_element = elements[0]
    assert first_element == Title("Apache License")
    for element in elements:
        assert element.metadata.url == url
|
||
|
||
|
||
def test_auto_partition_from_url_with_rfc9110_content_type():
    """A content-type carrying a `; charset=...` parameter is accepted when loading from a URL."""
    url = "https://raw.githubusercontent.com/Unstructured-IO/unstructured/main/LICENSE.md"
    content_type = "text/plain; charset=utf-8"

    elements = partition(url=url, content_type=content_type, strategy=PartitionStrategy.HI_RES)

    first_element = elements[0]
    assert first_element == Title("Apache License")
    for element in elements:
        assert element.metadata.url == url
|
||
|
||
|
||
def test_auto_partition_from_url_without_providing_content_type():
    """Loading from a URL works when the caller supplies no content-type."""
    url = "https://raw.githubusercontent.com/Unstructured-IO/unstructured/main/LICENSE.md"

    elements = partition(url=url, strategy=PartitionStrategy.HI_RES)

    first_element = elements[0]
    assert first_element == Title("Apache License")
    for element in elements:
        assert element.metadata.url == url
|
||
|
||
|
||
def test_auto_partition_warns_if_header_set_and_not_url(caplog: LogCaptureFixture):
    """Passing `headers` without `url` logs a warning as the first captured record."""
    email_path = example_doc_path("eml/fake-email.eml")
    request_headers = {"Accept": "application/pdf"}

    partition(email_path, headers=request_headers, strategy=PartitionStrategy.HI_RES)

    first_record = caplog.records[0]
    assert first_record.levelname == "WARNING"
    fragment = "headers kwarg is set but the url kwarg is not. The headers kwarg will b"
    assert fragment in caplog.text
|
||
|
||
|
||
def test_auto_partition_from_url_routes_timeout_to_HTTP_request(request: FixtureRequest):
    """`request_timeout` is handed to `file_and_type_from_url()` along with the URL arguments."""
    # -- the mocked fetch raises, so the test never touches the network --
    file_and_type_from_url_ = function_mock(
        request,
        "unstructured.partition.auto.file_and_type_from_url",
        side_effect=ConnectionError("Trouble on the wire ..."),
    )

    with pytest.raises(ConnectionError, match="Trouble on the wire ..."):
        partition(url="http://eie.io", request_timeout=326)

    expected_kwargs = {
        "url": "http://eie.io",
        "content_type": None,
        "headers": {},
        "ssl_verify": True,
        "request_timeout": 326,
    }
    file_and_type_from_url_.assert_called_once_with(**expected_kwargs)
|
||
|
||
|
||
# ================================================================================================
|
||
# OTHER ARGS
|
||
# ================================================================================================
|
||
|
||
# -- chunking_strategy ----------------------------------------------------
|
||
|
||
|
||
def test_auto_partition_forwards_chunking_strategy_via_kwargs():
    """With `chunking_strategy="by_title"`, every returned item is a chunk type."""
    chunk_types = (CompositeElement, Table, TableChunk)
    html_path = example_doc_path("example-10k-1p.html")

    chunks = partition(html_path, chunking_strategy="by_title")

    for chunk in chunks:
        assert isinstance(chunk, chunk_types)
|
||
|
||
|
||
def test_auto_partition_forwards_max_characters_via_kwargs():
    """With `max_characters=250`, no chunk's text exceeds 250 characters."""
    html_path = example_doc_path("example-10k-1p.html")

    chunks = partition(html_path, chunking_strategy="by_title", max_characters=250)

    for chunk in chunks:
        assert len(chunk.text) <= 250
|
||
|
||
|
||
# -- detect_language_per_element ------------------------------------------
|
||
|
||
|
||
def test_auto_partition_respects_detect_language_per_element_arg():
    """`detect_language_per_element=True` assigns languages element-by-element."""
    expected_languages = [["eng"], ["spa", "eng"], ["eng"], ["eng"], ["spa"]]
    mixed_language_path = example_doc_path("language-docs/eng_spa_mult.txt")

    elements = partition(mixed_language_path, detect_language_per_element=True)

    assert [element.metadata.languages for element in elements] == expected_languages
|
||
|
||
|
||
# -- languages ------------------------------------------------------------
|
||
|
||
|
||
@pytest.mark.parametrize(
    "file_extension", "doc docx eml epub html md odt org ppt pptx rst rtf txt xml".split()
)
def test_auto_partition_respects_language_arg(file_extension: str):
    """An explicit `languages` argument is applied to every element for each file type."""
    requested_languages = ["deu"]
    doc_path = example_doc_path(f"language-docs/eng_spa_mult.{file_extension}")

    elements = partition(doc_path, languages=requested_languages)

    for element in elements:
        assert element.metadata.languages == ["deu"]
|
||
|
||
|
||
# -- include_page_breaks --------------------------------------------------
|
||
|
||
|
||
def test_auto_partition_forwards_include_page_breaks_to_partition_pdf():
    """With `include_page_breaks=True`, the PDF's elements include a "PageBreak" category."""
    pdf_path = example_doc_path("pdf/layout-parser-paper-fast.pdf")

    elements = partition(pdf_path, include_page_breaks=True, strategy=PartitionStrategy.HI_RES)

    categories = [element.category for element in elements]
    assert "PageBreak" in categories
|
||
|
||
|
||
# -- metadata_filename ----------------------------------------------------
|
||
|
||
|
||
def test_auto_partition_forwards_metadata_filename_via_kwargs():
    """`metadata_filename` supplies the file name recorded on elements of a file-like input."""
    replacement_name = "much-more-interesting-name.txt"

    with open(example_doc_path("fake-text.txt"), "rb") as text_file:
        elements = partition(file=text_file, metadata_filename=replacement_name)

    for element in elements:
        assert element.metadata.filename == "much-more-interesting-name.txt"
|
||
|
||
|
||
# -- ocr_languages --------------------------------------------------------
|
||
|
||
|
||
def test_auto_partition_image_formats_languages_for_tesseract(request: FixtureRequest):
    """`languages=["zh"]` reaches the OCR call as the `+`-joined Chinese language string."""
    process_file_with_ocr_ = function_mock(
        request, "unstructured.partition.pdf_image.ocr.process_file_with_ocr"
    )
    image_path = example_doc_path("img/chi_sim_image.jpeg")

    partition(image_path, strategy=PartitionStrategy.HI_RES, languages=["zh"])

    # -- inspect the keyword arguments of the first call made to the mocked OCR function --
    first_call = process_file_with_ocr_.call_args_list[0]
    assert first_call.kwargs["ocr_languages"] == "chi_sim+chi_sim_vert+chi_tra+chi_tra_vert"
|
||
|
||
|
||
@pytest.mark.parametrize(("languages", "ocr_languages"), [(["auto"], ""), (["eng"], "")])
def test_auto_partition_ignores_empty_string_for_ocr_languages(
    languages: list[str], ocr_languages: str
):
    """An empty `ocr_languages` string still results in `["eng"]` on every element."""
    book_path = example_doc_path("book-war-and-peace-1p.txt")
    partition_kwargs = {
        "strategy": PartitionStrategy.OCR_ONLY,
        "ocr_languages": ocr_languages,
        "languages": languages,
    }

    elements = partition(book_path, **partition_kwargs)

    for element in elements:
        assert element.metadata.languages == ["eng"]
|
||
|
||
|
||
def test_auto_partition_warns_with_ocr_languages(caplog: LogCaptureFixture):
    """Using the `ocr_languages` argument logs a deprecation warning as the first record."""
    pdf_path = example_doc_path("pdf/chevron-page.pdf")

    partition(pdf_path, strategy=PartitionStrategy.HI_RES, ocr_languages="eng")

    first_record = caplog.records[0]
    assert first_record.levelname == "WARNING"
    assert "The ocr_languages kwarg will be deprecated" in caplog.text
|
||
|
||
|
||
# -- skip_infer_table_types -----------------------------------------------
|
||
|
||
|
||
@pytest.mark.parametrize(
    ("skip_infer_table_types", "filename", "has_text_as_html"),
    [
        (["xlsx"], "stanley-cups.xlsx", False),
        ([], "stanley-cups.xlsx", True),
        (["odt"], "fake.odt", False),
        ([], "fake.odt", True),
    ],
)
def test_auto_partition_respects_skip_infer_table_types(
    skip_infer_table_types: list[str], filename: str, has_text_as_html: bool
):
    """Tables carry `text_as_html` only when their file type is not in the skip list."""
    with open(example_doc_path(filename), "rb") as doc_file:
        elements = partition(file=doc_file, skip_infer_table_types=skip_infer_table_types)

    tables = [element for element in elements if isinstance(element, Table)]
    # -- guard against a vacuous pass: the document must contain at least one table --
    assert len(tables) > 0
    for table in tables:
        html_is_present = table.metadata.text_as_html is not None
        assert html_is_present == has_text_as_html
|
||
|
||
|
||
# ================================================================================================
|
||
# METADATA BEHAVIORS
|
||
# ================================================================================================
|
||
|
||
# -- .filetype ------------------------------------------------------------
|
||
|
||
|
||
@pytest.mark.parametrize(
    ("content_type", "shortname", "expected_value"),
    [
        ("text/csv", "csv", "text/csv"),
        ("text/html", "html", "text/html"),
        ("jdsfjdfsjkds", "pdf", None),
    ],
)
def test_auto_partition_adds_filetype_to_metadata(
    request: FixtureRequest,
    content_type: str,
    shortname: str,
    expected_value: str | None,
):
    """`partition()` stamps the expected `.metadata.filetype` on elements from the partitioner."""
    # -- replace the file-type-specific partitioner with a mock returning two bare elements --
    partitioner_qname = f"unstructured.partition.{shortname}.partition_{shortname}"
    stub_elements = [Text("text 1"), Text("text 2")]
    partition_fn_ = function_mock(request, partitioner_qname, return_value=stub_elements)
    # -- and make the loader hand that mock back regardless of what is requested --
    partitioner_loader_get_ = method_mock(
        request, _PartitionerLoader, "get", return_value=partition_fn_
    )
    pdf_path = example_doc_path("pdf/layout-parser-paper-fast.pdf")

    elements = partition(pdf_path, content_type=content_type)

    partitioner_loader_get_.assert_called_once()
    assert len(elements) == 2
    for element in elements:
        assert element.metadata.filetype == expected_value
|
||
|
||
|
||
@pytest.mark.parametrize(
    "content_type",
    [
        # -- content-type provided as argument --
        "application/pdf",
        # -- auto-detected content-type --
        None,
    ],
)
def test_auto_partition_overwrites_any_filetype_applied_by_file_specific_partitioner(
    request: FixtureRequest, content_type: str | None
):
    """A filetype set by the delegate partitioner is replaced with "application/pdf"."""
    # -- the mocked partitioner returns elements pre-stamped with a bogus filetype --
    metadata = ElementMetadata(filetype="imapdf")
    stamped_elements = [Text("text 1", metadata=metadata), Text("text 2", metadata=metadata)]
    partition_pdf_ = function_mock(
        request, "unstructured.partition.pdf.partition_pdf", return_value=stamped_elements
    )
    partitioner_loader_get_ = method_mock(
        request, _PartitionerLoader, "get", return_value=partition_pdf_
    )
    pdf_path = example_doc_path("pdf/layout-parser-paper-fast.pdf")

    elements = partition(pdf_path, content_type=content_type)

    partitioner_loader_get_.assert_called_once_with(ANY, FileType.PDF)
    assert len(elements) == 2
    for element in elements:
        assert element.metadata.filetype == "application/pdf"
|
||
|
||
|
||
@pytest.mark.parametrize(
    ("file_name", "file_type"),
    [
        ("stanley-cups.csv", FileType.CSV),
        ("simple.doc", FileType.DOC),
        ("simple.docx", FileType.DOCX),
        ("fake-email.eml", FileType.EML),
        ("simple.epub", FileType.EPUB),
        ("fake-html.html", FileType.HTML),
        ("README.md", FileType.MD),
        ("fake-email.msg", FileType.MSG),
        ("simple.odt", FileType.ODT),
        ("pdf/DA-1p.pdf", FileType.PDF),
        ("fake-power-point.ppt", FileType.PPT),
        ("simple.pptx", FileType.PPTX),
        ("README.rst", FileType.RST),
        ("fake-doc.rtf", FileType.RTF),
        ("stanley-cups.tsv", FileType.TSV),
        ("fake-text.txt", FileType.TXT),
        ("tests-example.xls", FileType.XLSX),
        ("stanley-cups.xlsx", FileType.XLSX),
        ("factbook.xml", FileType.XML),
    ],
)
def test_auto_partition_applies_the_correct_filetype_for_all_filetypes(
    file_name: str, file_type: FileType
):
    """Each file-type's own partitioner stamps that type's MIME-type on its elements."""
    # -- resolve the partitioner function for this file-type from its module --
    partitioner_module = import_module(file_type.partitioner_module_qname)
    partition_fn = getattr(partitioner_module, file_type.partitioner_function_name)

    # -- partition the example-doc for this filetype --
    elements = partition_fn(example_doc_path(file_name), process_attachments=False)

    assert elements
    # -- elements that carry no filetype at all are not checked --
    for element in elements:
        if element.metadata.filetype is None:
            continue
        assert element.metadata.filetype == file_type.mime_type
|
||
|
||
|
||
# -- .languages -----------------------------------------------------------
|
||
|
||
|
||
def test_auto_partition_passes_user_provided_languages_arg_to_PDF():
    """A caller-supplied `languages` list appears on every element of an OCR-only PDF run."""
    pdf_path = example_doc_path("pdf/chevron-page.pdf")

    elements = partition(pdf_path, strategy=PartitionStrategy.OCR_ONLY, languages=["eng"])

    for element in elements:
        assert element.metadata.languages == ["eng"]
|
||
|
||
|
||
def test_auto_partition_languages_argument_default_to_None_when_omitted():
    """When `languages` is omitted, only text-less elements end up with no languages."""
    docx_path = example_doc_path("handbook-1p.docx")

    elements = partition(docx_path, detect_language_per_element=True)

    # -- PageBreak and any other element with no text is assigned `None` --
    for element in elements:
        if element.metadata.languages is None:
            assert element.text == ""
|
||
|
||
|
||
def test_auto_partition_default_does_not_overwrite_other_defaults():
    """`partition()` ["eng"] default does not overwrite ["auto"] default in other partitioners."""
    # -- the default for `languages` is ["auto"] in partition_text --
    from unstructured.partition.text import partition_text

    # -- use a document that is primarily in a language other than English --
    non_english_path = example_doc_path("language-docs/UDHR_first_article_all.txt")

    # -- baseline: the text partitioner called directly does not report English --
    text_elements = partition_text(non_english_path)
    direct_languages = text_elements[0].metadata.languages
    assert direct_languages != ["eng"]

    # -- auto-partitioning must produce the same languages as the direct call --
    auto_elements = partition(non_english_path)
    auto_languages = auto_elements[0].metadata.languages
    assert auto_languages != ["eng"]
    assert auto_languages == text_elements[0].metadata.languages
|
||
|
||
|
||
# ================================================================================================
|
||
# MISCELLANEOUS BEHAVIORS
|
||
# ================================================================================================
|
||
|
||
|
||
def test_auto_partition_from_filename_works_on_empty_file():
    """An empty text file given by path partitions to an empty list."""
    elements = partition(example_doc_path("empty.txt"))

    assert elements == []
|
||
|
||
|
||
def test_auto_partition_from_file_works_on_empty_file():
    """An empty text file supplied as an open binary file partitions to an empty list."""
    with open(example_doc_path("empty.txt"), "rb") as empty_file:
        elements = partition(file=empty_file)
        assert elements == []
|
||
|
||
|
||
def test_auto_partition_that_requires_extras_raises_when_dependencies_are_not_installed(
    request: FixtureRequest,
):
    """`partition()` raises `ImportError` for a PDF when the dependency check reports False."""
    # -- drop any PDF entry from the loader's `_partitioners` mapping first; presumably this
    # -- is a cache that would otherwise bypass the dependency check (confirm in the loader)
    _PartitionerLoader._partitioners.pop(FileType.PDF, None)
    dependency_exists_ = function_mock(
        request, "unstructured.partition.auto.dependency_exists", return_value=False
    )
    error_pattern = (
        r"partition_pdf\(\) is not available because one or more dependencies are not installed"
    )
    pdf_path = example_doc_path("pdf/layout-parser-paper-fast.pdf")

    with pytest.raises(ImportError, match=error_pattern):
        partition(pdf_path)

    dependency_exists_.assert_called_once_with("pdf2image")
|
||
|
||
|
||
# ================================================================================================
|
||
# MODULE-LEVEL FIXTURES
|
||
# ================================================================================================
|
||
|
||
|
||
@pytest.fixture()
def expected_docx_elements():
    """Provide a fixed list of eight elements for tests expecting DOCX partitioning output."""
    # -- (element-class, text) pairs in document order --
    element_specs = (
        (Title, "These are a few of my favorite things:"),
        (ListItem, "Parrots"),
        (ListItem, "Hockey"),
        (Title, "Analysis"),
        (NarrativeText, "This is my first thought. This is my second thought."),
        (NarrativeText, "This is my third thought."),
        (Text, "2023"),
        (Address, "DOYLESTOWN, PA 18901"),
    )
    return [element_cls(text) for element_cls, text in element_specs]
|