Yao You 7de630e45e
Feat/bump numpy to 2 (#3961)
This PR updates a few dependencies so that they are compatible with
`numpy>=2`.
2025-03-18 21:33:48 +00:00

1384 lines
49 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# pyright: reportPrivateUsage=false
from __future__ import annotations
import json
import os
import pathlib
import tempfile
import warnings
from importlib import import_module
from typing import Iterator
from unittest.mock import MagicMock, patch
import pytest
from PIL import Image
from test_unstructured.partition.pdf_image.test_pdf import assert_element_extraction
from test_unstructured.partition.test_constants import (
EXPECTED_TABLE,
EXPECTED_TABLE_XLSX,
EXPECTED_TEXT,
EXPECTED_XLS_TABLE,
)
from test_unstructured.unit_utils import (
ANY,
FixtureRequest,
LogCaptureFixture,
example_doc_path,
function_mock,
method_mock,
)
from unstructured.cleaners.core import clean_extra_whitespace
from unstructured.documents.elements import (
Address,
CompositeElement,
Element,
ElementMetadata,
ListItem,
NarrativeText,
Table,
TableChunk,
Text,
Title,
)
from unstructured.file_utils.filetype import detect_filetype
from unstructured.file_utils.model import FileType, create_file_type, register_partitioner
from unstructured.partition.auto import _PartitionerLoader, partition
from unstructured.partition.common import UnsupportedFileFormatError
from unstructured.partition.utils.constants import PartitionStrategy
from unstructured.staging.base import elements_from_json, elements_to_dicts, elements_to_json
# -- True when the `/.dockerenv` marker file exists; used below in a `skipif` to skip a test
# -- that does not pass when run inside Docker.
is_in_docker = os.path.exists("/.dockerenv")
# ================================================================================================
# CSV
# ================================================================================================
def test_auto_partition_csv_from_filename():
    """Partitioning a CSV by path yields the expected text, HTML table and file-type."""
    csv_path = example_doc_path("stanley-cups.csv")

    first = partition(csv_path)[0]

    assert clean_extra_whitespace(first.text) == EXPECTED_TEXT
    metadata = first.metadata
    assert metadata.text_as_html == EXPECTED_TABLE
    assert metadata.filetype == "text/csv"
def test_auto_partition_csv_from_file():
    """Partitioning a CSV from an open binary file yields a `Table` with the expected content."""
    csv_path = example_doc_path("stanley-cups.csv")
    with open(csv_path, "rb") as csv_file:
        first = partition(file=csv_file)[0]

    assert clean_extra_whitespace(first.text) == EXPECTED_TEXT
    assert isinstance(first, Table)
    metadata = first.metadata
    assert metadata.text_as_html == EXPECTED_TABLE
    assert metadata.filetype == "text/csv"
# ================================================================================================
# DOC
# ================================================================================================
@pytest.mark.parametrize(
    ("pass_metadata_filename", "content_type"),
    [(False, None), (False, "application/msword"), (True, "application/msword"), (True, None)],
)
def test_auto_partition_doc_from_filename(
    pass_metadata_filename: bool, content_type: str | None, expected_docx_elements: list[Element]
):
    """A DOC file partitioned by path matches the expected elements and carries path metadata.

    Runs with and without an explicit `metadata_filename` and `content_type`.
    """
    doc_path = example_doc_path("simple.doc")

    elements = partition(
        filename=doc_path,
        metadata_filename=doc_path if pass_metadata_filename else None,
        content_type=content_type,
        strategy=PartitionStrategy.HI_RES,
    )

    # -- print each element's type and text repr to stdout --
    for element in elements:
        print(f"{type(element).__name__}({element.text!r})")

    assert elements == expected_docx_elements
    filenames = [element.metadata.filename for element in elements]
    assert all(name == "simple.doc" for name in filenames)
    directories = [element.metadata.file_directory for element in elements]
    assert all(directory == example_doc_path("") for directory in directories)
@pytest.mark.skipif(is_in_docker, reason="Passes in CI but not Docker. Remove skip on #3364 fix.")
def test_auto_partition_doc_from_file(expected_docx_elements: list[Element]):
    """A DOC file partitioned from an open binary file matches the expected elements."""
    doc_path = example_doc_path("simple.doc")
    with open(doc_path, "rb") as doc_file:
        actual = partition(file=doc_file)

    assert actual == expected_docx_elements
# ================================================================================================
# DOCX
# ================================================================================================
def test_auto_partition_docx_from_filename(expected_docx_elements: list[Element]):
    """A DOCX file partitioned by path matches the expected elements and records its filename."""
    docx_path = example_doc_path("simple.docx")

    actual = partition(docx_path, strategy=PartitionStrategy.HI_RES)

    assert actual == expected_docx_elements
    filenames = [element.metadata.filename for element in actual]
    assert all(name == "simple.docx" for name in filenames)
def test_auto_partition_docx_from_file(expected_docx_elements: list[Element]):
    """A DOCX file partitioned from an open binary file matches the expected elements."""
    docx_path = example_doc_path("simple.docx")
    with open(docx_path, "rb") as docx_file:
        actual = partition(file=docx_file, strategy=PartitionStrategy.HI_RES)

    assert actual == expected_docx_elements
@pytest.mark.parametrize("file_name", ["simple.docx", "simple.doc", "simple.odt"])
@pytest.mark.parametrize(
    "strategy",
    [
        PartitionStrategy.AUTO,
        PartitionStrategy.FAST,
        PartitionStrategy.HI_RES,
        PartitionStrategy.OCR_ONLY,
    ],
)
def test_partition_forwards_strategy_arg_to_partition_docx_and_its_brokers(
    request: FixtureRequest, file_name: str, strategy: str
):
    """The `strategy` value passed to `partition()` arrives at `partition_docx()`.

    For the brokering partitioners (DOC, ODT) the value has to travel through `partition_doc()`
    or `partition_odt()`, each of which must pass it on to `partition_docx()`. The mocked
    element iterator echoes the strategy it sees so the test can confirm it arrived intact.

    Note this is 3 file-types X 4 strategies = 12 test-cases.
    """
    from unstructured.partition.docx import _DocxPartitioner

    def echo_strategy(self: _DocxPartitioner) -> Iterator[Element]:
        # -- emit a single element whose text records the strategy held by the partitioner --
        yield Text(f"strategy=={self._opts.strategy}")

    iter_document_elements_ = method_mock(
        request, _DocxPartitioner, "_iter_document_elements", side_effect=echo_strategy
    )
    docx_like_path = example_doc_path(file_name)

    (element,) = partition(docx_like_path, strategy=strategy)

    iter_document_elements_.assert_called_once_with(ANY)
    assert element.text == f"strategy=={strategy}"
# ================================================================================================
# EML
# ================================================================================================
# -- elements the EML tests below expect `partition()` to produce for `eml/fake-email.eml` --
EXPECTED_EMAIL_OUTPUT = [
    NarrativeText("This is a test email to use for unit tests."),
    Text("Important points:"),
    ListItem("Roses are red"),
    ListItem("Violets are blue"),
]
def test_auto_partition_email_from_filename():
    """An EML file partitioned by path gives the expected elements and path metadata."""
    eml_path = example_doc_path("eml/fake-email.eml")
    expected_directory, expected_filename = os.path.split(eml_path)

    elements = partition(eml_path, strategy=PartitionStrategy.HI_RES)

    assert len(elements) > 0
    assert elements == EXPECTED_EMAIL_OUTPUT
    first_metadata = elements[0].metadata
    assert first_metadata.filename == expected_filename
    assert first_metadata.file_directory == expected_directory
def test_auto_partition_email_from_file():
    """An EML file partitioned from an open binary file gives the expected elements."""
    eml_path = example_doc_path("eml/fake-email.eml")
    with open(eml_path, "rb") as eml_file:
        actual = partition(file=eml_file, strategy=PartitionStrategy.HI_RES)

    assert len(actual) > 0
    assert actual == EXPECTED_EMAIL_OUTPUT
# ================================================================================================
# EPUB
# ================================================================================================
def test_auto_partition_epub_from_filename():
    """An EPUB partitioned by path is non-empty and its third element has the expected text."""
    epub_path = example_doc_path("winter-sports.epub")

    elements = partition(epub_path, strategy=PartitionStrategy.HI_RES)

    assert len(elements) > 0
    third_text = elements[2].text
    assert third_text.startswith("The Project Gutenberg eBook of Winter Sports")
def test_auto_partition_epub_from_file():
    """An EPUB partitioned from an open file is non-empty with the expected third element."""
    epub_path = example_doc_path("winter-sports.epub")
    with open(epub_path, "rb") as epub_file:
        elements = partition(file=epub_file, strategy=PartitionStrategy.HI_RES)

    assert len(elements) > 0
    third_text = elements[2].text
    assert third_text.startswith("The Project Gutenberg eBook of Winter Sports")
# ================================================================================================
# HTML
# ================================================================================================
@pytest.mark.parametrize(
    ("pass_metadata_filename", "content_type"),
    [(False, None), (False, "text/html"), (True, "text/html"), (True, None)],
)
def test_auto_partition_html_from_filename(pass_metadata_filename: bool, content_type: str | None):
    """An HTML file partitioned by path is non-empty and every element has path metadata."""
    html_path = example_doc_path("example-10k-1p.html")
    expected_directory, expected_filename = os.path.split(html_path)

    elements = partition(
        filename=html_path,
        metadata_filename=html_path if pass_metadata_filename else None,
        content_type=content_type,
        strategy=PartitionStrategy.HI_RES,
    )

    assert elements
    filenames = [element.metadata.filename for element in elements]
    assert all(name == expected_filename for name in filenames)
    directories = [element.metadata.file_directory for element in elements]
    assert all(directory == expected_directory for directory in directories)
@pytest.mark.parametrize(
    ("pass_metadata_filename", "content_type"),
    [(False, None), (False, "text/html"), (True, "text/html"), (True, None)],
)
def test_auto_partition_html_from_file(pass_metadata_filename: bool, content_type: str | None):
    """An HTML file partitioned from an open binary file produces at least one element."""
    html_path = example_doc_path("example-10k-1p.html")
    optional_metadata_filename = html_path if pass_metadata_filename else None

    with open(html_path, "rb") as html_file:
        elements = partition(
            file=html_file,
            metadata_filename=optional_metadata_filename,
            content_type=content_type,
            strategy=PartitionStrategy.HI_RES,
        )

    assert len(elements) > 0
def test_auto_partition_html_pre_from_file():
    """An `.htm` document is partitioned as HTML with no `PageBreak` elements.

    Despite the test name, the document is passed to `partition()` by path.
    """
    elements = partition(example_doc_path("fake-html-pre.htm"))

    assert len(elements) > 0
    assert not any(element.category == "PageBreak" for element in elements)

    first = elements[0]
    assert clean_extra_whitespace(first.text).startswith("[107th Congress Public Law 56]")
    assert isinstance(first, NarrativeText)

    filetypes = [element.metadata.filetype for element in elements]
    assert all(filetype == "text/html" for filetype in filetypes)
    filenames = [element.metadata.filename for element in elements]
    assert all(name == "fake-html-pre.htm" for name in filenames)
# ================================================================================================
# IMAGE
# ================================================================================================
@pytest.mark.parametrize(
    ("pass_metadata_filename", "content_type"),
    [(False, None), (False, "image/jpeg"), (True, "image/jpeg"), (True, None)],
)
def test_auto_partition_jpeg_from_filename(pass_metadata_filename: bool, content_type: str | None):
    """A JPEG partitioned by path has the paper title as its third element, with coordinates."""
    jpg_path = example_doc_path("img/layout-parser-paper-fast.jpg")
    expected_text = (
        "LayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis"
    )

    elements = partition(
        filename=jpg_path,
        metadata_filename=jpg_path if pass_metadata_filename else None,
        content_type=content_type,
        strategy=PartitionStrategy.AUTO,
    )

    third = elements[2]
    assert third.text == expected_text
    assert third.metadata.coordinates is not None
@pytest.mark.parametrize(
    ("pass_metadata_filename", "content_type"),
    [(False, None), (False, "image/jpeg"), (True, "image/jpeg"), (True, None)],
)
def test_auto_partition_jpeg_from_file(pass_metadata_filename: bool, content_type: str | None):
    """A JPEG partitioned from an open file has the paper title third, with coordinates."""
    jpg_path = example_doc_path("img/layout-parser-paper-fast.jpg")
    optional_metadata_filename = jpg_path if pass_metadata_filename else None
    expected_text = (
        "LayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis"
    )

    with open(jpg_path, "rb") as jpg_file:
        elements = partition(
            file=jpg_file,
            metadata_filename=optional_metadata_filename,
            content_type=content_type,
            strategy=PartitionStrategy.AUTO,
        )

    third = elements[2]
    assert third.text == expected_text
    assert third.metadata.coordinates is not None
def test_auto_partition_bmp_from_filename(tmp_path: pathlib.Path):
    """A BMP partitioned by path with hi-res strategy yields exactly one HTML table.

    The BMP is produced by re-saving an example JPEG into the pytest temporary directory.
    """
    bmp_filename = str(tmp_path / "example.bmp")
    jpg_path = example_doc_path("img/layout-parser-paper-with-table.jpg")
    with Image.open(jpg_path) as jpg_image:
        jpg_image.save(bmp_filename)

    elements = partition(filename=bmp_filename, strategy=PartitionStrategy.HI_RES)

    # -- keep only the truthy `text_as_html` values --
    html_tables = list(filter(None, (element.metadata.text_as_html for element in elements)))
    assert len(html_tables) == 1
    table_html = html_tables[0]
    assert "<table><thead><tr>" in table_html
    assert "</thead><tbody><tr>" in table_html
@pytest.mark.parametrize("extract_image_block_to_payload", [False, True])
def test_auto_partition_image_element_extraction(extract_image_block_to_payload: bool):
    """Image and Table blocks are extracted from a JPEG, to payload or to an output directory."""
    block_types = ["Image", "Table"]
    image_path = example_doc_path("img/embedded-images-tables.jpg")

    with tempfile.TemporaryDirectory() as output_dir:
        elements = partition(
            filename=image_path,
            extract_image_block_types=block_types,
            extract_image_block_to_payload=extract_image_block_to_payload,
            extract_image_block_output_dir=output_dir,
        )

        # -- checked while the temporary directory still exists --
        assert_element_extraction(
            elements, block_types, extract_image_block_to_payload, output_dir
        )
# ================================================================================================
# JSON
# ================================================================================================
# TODO(scanny): This test should go away when we fix #3365. This test glosses over several
# important JSON "rehydration" behaviors, in particular that the metadata should match exactly.
# The following test `test_auto_partition_json_from_file_preserves_original_elements` will be the
# replacement for this test.
def test_auto_partitioned_json_output_maintains_consistency_with_fixture_elements():
    """Re-partitioning an unstructured JSON output file by path reproduces its elements.

    Metadata is removed from both sides before comparing, so only the non-metadata fields of
    each element are checked.
    """
    json_file_path = example_doc_path("spring-weather.html.json")
    with open(json_file_path) as json_f:
        expected_result = json.load(json_f)

    elements = partition(
        filename=str(json_file_path),
        # -- use the original file name to get the same element IDs (hashes) --
        metadata_filename="spring-weather.html",
        strategy=PartitionStrategy.HI_RES,
    )
    partitioning_result = json.loads(elements_to_json(elements))

    # -- drop metadata from the partitioned dicts first, then from the fixture dicts --
    for element_dict in (*partitioning_result, *expected_result):
        del element_dict["metadata"]

    assert expected_result == partitioning_result
@pytest.mark.xfail(
    reason=(
        "https://github.com/Unstructured-IO/unstructured/issues/3365"
        " partition_json() does not preserve original element-id or metadata"
    ),
    raises=AssertionError,
    strict=True,
)
def test_auto_partition_json_from_file_preserves_original_elements():
    """Partitioning a JSON file should round-trip its elements (strict xfail, see reason)."""
    json_path = example_doc_path("simple.json")
    expected_dicts = elements_to_dicts(elements_from_json(json_path))

    with open(json_path, "rb") as json_file:
        partitioned_elements = partition(file=json_file)

    assert elements_to_dicts(partitioned_elements) == expected_dicts
def test_auto_partition_processes_simple_ndjson(tmp_path: pathlib.Path):
    """A file holding a single JSON object line partitions into one `NarrativeText` element."""
    ndjson_path = tmp_path / "unprocessable.json"
    ndjson_path.write_text('{"text": "hello", "type": "NarrativeText"}')

    elements = partition(filename=str(ndjson_path))

    assert len(elements) == 1
    only_element = elements[0]
    assert isinstance(only_element, NarrativeText)
    assert "hello" in only_element.text
# ================================================================================================
# MD
# ================================================================================================
def test_partition_md_from_url_works_with_embedded_html():
    """Markdown fetched from a URL partitions so its second element mentions "unstructured"."""
    readme_url = "https://raw.githubusercontent.com/Unstructured-IO/unstructured/main/README.md"

    elements = partition(
        url=readme_url, content_type="text/markdown", strategy=PartitionStrategy.HI_RES
    )

    second_text = elements[1].text
    assert "unstructured" in second_text
# ================================================================================================
# MSG
# ================================================================================================
def test_auto_partition_msg_from_filename():
    """An MSG file partitioned by path produces the four expected email-body elements."""
    actual = partition(example_doc_path("fake-email.msg"), strategy=PartitionStrategy.HI_RES)

    expected = [
        NarrativeText(text="This is a test email to use for unit tests."),
        Text(text="Important points:"),
        ListItem(text="Roses are red"),
        ListItem(text="Violets are blue"),
    ]
    assert actual == expected
# ================================================================================================
# ODT
# ================================================================================================
def test_auto_partition_odt_from_filename(expected_docx_elements: list[Element]):
    """An ODT file partitioned by path matches the shared expected DOCX elements."""
    odt_path = example_doc_path("simple.odt")

    actual = partition(odt_path, strategy=PartitionStrategy.HI_RES)

    assert actual == expected_docx_elements
def test_auto_partition_odt_from_file(expected_docx_elements: list[Element]):
    """An ODT file partitioned from an open binary file matches the expected DOCX elements."""
    odt_path = example_doc_path("simple.odt")
    with open(odt_path, "rb") as odt_file:
        actual = partition(file=odt_file, strategy=PartitionStrategy.HI_RES)

    assert actual == expected_docx_elements
# ================================================================================================
# ORG
# ================================================================================================
def test_auto_partition_org_from_filename():
    """An Org file partitioned by path starts with the expected title and file-type."""
    first = partition(example_doc_path("README.org"))[0]

    assert first == Title("Example Docs")
    assert first.metadata.filetype == "text/org"
def test_auto_partition_org_from_file():
    """An Org file partitioned from an open file with an explicit content-type is recognized."""
    org_path = example_doc_path("README.org")
    with open(org_path, "rb") as org_file:
        first = partition(file=org_file, content_type="text/org")[0]

    assert first == Title("Example Docs")
    assert first.metadata.filetype == "text/org"
# ================================================================================================
# PDF
# ================================================================================================
@pytest.mark.parametrize(
    ("pass_metadata_filename", "content_type"),
    [(False, None), (False, "application/pdf"), (True, "application/pdf"), (True, None)],
)
def test_auto_partition_pdf_from_filename(pass_metadata_filename: bool, content_type: str | None):
    """A PDF partitioned by path starts with the expected title and narrative text.

    Runs with and without an explicit `metadata_filename` and `content_type`. The first element
    must also carry the file name and directory of the source path in its metadata.
    """
    file_path = example_doc_path("pdf/chevron-page.pdf")
    metadata_filename = file_path if pass_metadata_filename else None

    elements = partition(
        filename=file_path,
        metadata_filename=metadata_filename,
        content_type=content_type,
        strategy=PartitionStrategy.HI_RES,
    )

    e = elements[0]
    assert isinstance(e, Title)
    assert e.text.startswith("eastern mediterranean")
    assert e.metadata.filename == os.path.basename(file_path)
    assert e.metadata.file_directory == os.path.split(file_path)[0]

    e = elements[1]
    assert isinstance(e, NarrativeText)
    # -- the expected text uses a typographic apostrophe (U+2019); it is written as an escape
    # -- so the character cannot be silently dropped or confused with an ASCII quote.
    assert e.text.startswith("We\u2019re investing")
@pytest.mark.parametrize(
    ("pass_metadata_filename", "content_type"),
    [(False, None), (False, "application/pdf"), (True, "application/pdf"), (True, None)],
)
def test_auto_partition_pdf_from_file(pass_metadata_filename: bool, content_type: str | None):
    """A PDF partitioned from an open binary file starts with the expected title and text.

    Runs with and without an explicit `metadata_filename` and `content_type`.
    """
    file_path = example_doc_path("pdf/chevron-page.pdf")
    metadata_filename = file_path if pass_metadata_filename else None

    with open(file_path, "rb") as f:
        elements = partition(
            file=f,
            metadata_filename=metadata_filename,
            content_type=content_type,
            strategy=PartitionStrategy.HI_RES,
        )

    e = elements[0]
    assert isinstance(e, Title)
    assert e.text.startswith("eastern mediterranean")

    e = elements[1]
    assert isinstance(e, NarrativeText)
    # -- the expected text uses a typographic apostrophe (U+2019); it is written as an escape
    # -- so the character cannot be silently dropped or confused with an ASCII quote.
    assert e.text.startswith("We\u2019re investing")
def test_auto_partition_pdf_with_fast_strategy(request: FixtureRequest):
    """`partition()` looks up the PDF partitioner and calls it with the expected keyword args."""
    partition_pdf_ = function_mock(
        request,
        "unstructured.partition.pdf.partition_pdf",
        return_value=[NarrativeText("Hello there!")],
    )
    # -- make the partitioner loader hand back the mocked `partition_pdf` --
    loader_get_ = method_mock(request, _PartitionerLoader, "get", return_value=partition_pdf_)
    pdf_path = example_doc_path("pdf/layout-parser-paper-fast.pdf")
    expected_kwargs = {
        "filename": pdf_path,
        "file": None,
        "url": None,
        "strategy": PartitionStrategy.FAST,
        "languages": None,
        "metadata_filename": None,
        "infer_table_structure": False,
        "extract_images_in_pdf": False,
        "extract_image_block_types": None,
        "extract_image_block_output_dir": None,
        "extract_image_block_to_payload": False,
        "hi_res_model_name": None,
        "starting_page_number": 1,
    }

    partition(pdf_path, strategy=PartitionStrategy.FAST)

    loader_get_.assert_called_once_with(ANY, FileType.PDF)
    partition_pdf_.assert_called_once_with(**expected_kwargs)
@pytest.mark.parametrize("infer_bool", [True, False])
def test_auto_handles_kwarg_with_infer_table_structure(infer_bool: bool):
    """An explicit `infer_table_structure` value is what `process_file_with_ocr()` receives.

    `pdf_infer_table_structure=True` is passed on every run; the assertion checks that the
    OCR entry point still receives the explicit `infer_table_structure` value.
    """
    with patch(
        "unstructured.partition.pdf_image.ocr.process_file_with_ocr",
    ) as process_file_with_ocr_:
        partition(
            example_doc_path("pdf/layout-parser-paper-fast.pdf"),
            pdf_infer_table_structure=True,
            strategy=PartitionStrategy.HI_RES,
            infer_table_structure=infer_bool,
        )

        # -- `call_args[1]` is the keyword-argument dict of the most recent call --
        assert process_file_with_ocr_.call_args[1]["infer_table_structure"] is infer_bool
def test_auto_handles_kwarg_with_infer_table_structure_when_none():
    """With `infer_table_structure=None`, `pdf_infer_table_structure=True` reaches the OCR call."""
    pdf_path = example_doc_path("pdf/layout-parser-paper-fast.pdf")
    ocr_target = "unstructured.partition.pdf_image.ocr.process_file_with_ocr"

    with patch(ocr_target) as process_file_with_ocr_:
        partition(
            pdf_path,
            pdf_infer_table_structure=True,
            strategy=PartitionStrategy.HI_RES,
            infer_table_structure=None,
        )

        # -- a mock's `call_args` unpacks to `(args, kwargs)` of the most recent call --
        _, call_kwargs = process_file_with_ocr_.call_args
        assert call_kwargs["infer_table_structure"] is True
def test_auto_partition_pdf_uses_pdf_infer_table_structure_argument():
    """`pdf_infer_table_structure=True` alone gives the OCR call a truthy `infer_table_structure`."""
    pdf_path = example_doc_path("pdf/layout-parser-paper-fast.pdf")
    ocr_target = "unstructured.partition.pdf_image.ocr.process_file_with_ocr"

    with patch(ocr_target) as process_file_with_ocr_:
        partition(
            pdf_path,
            pdf_infer_table_structure=True,
            strategy=PartitionStrategy.HI_RES,
        )

        # -- a mock's `call_args` unpacks to `(args, kwargs)` of the most recent call --
        _, call_kwargs = process_file_with_ocr_.call_args
        assert call_kwargs["infer_table_structure"]
@pytest.mark.parametrize("extract_image_block_to_payload", [False, True])
def test_auto_partition_pdf_element_extraction(extract_image_block_to_payload: bool):
    """Image and Table blocks are extracted from a PDF, to payload or to an output directory."""
    block_types = ["Image", "Table"]
    pdf_path = example_doc_path("pdf/embedded-images-tables.pdf")

    with tempfile.TemporaryDirectory() as output_dir:
        elements = partition(
            pdf_path,
            extract_image_block_types=block_types,
            extract_image_block_to_payload=extract_image_block_to_payload,
            extract_image_block_output_dir=output_dir,
        )

        # -- checked while the temporary directory still exists --
        assert_element_extraction(
            elements, block_types, extract_image_block_to_payload, output_dir
        )
def test_auto_partition_html_element_extraction():
    """Image blocks in an HTML document with a base64 image are extracted to payload.

    No output directory is passed to `partition()`; the temporary directory is only handed to
    `assert_element_extraction()`.
    """
    block_types = ["Image"]
    html_path = example_doc_path("fake-html-with-base64-image.html")

    with tempfile.TemporaryDirectory() as output_dir:
        elements = partition(
            html_path,
            extract_image_block_types=block_types,
            extract_image_block_to_payload=True,
        )

        assert_element_extraction(elements, block_types, True, output_dir)
def test_auto_partition_html_image_with_url():
    """The second element of the URL-image HTML example carries an `image_url` in its metadata."""
    html_path = example_doc_path("fake-html-with-image-from-url.html")

    elements = partition(html_path)

    second_metadata = elements[1].metadata
    assert second_metadata.image_url is not None
def test_partition_pdf_does_not_raise_warning():
    """Hi-res partitioning of a PDF emits no warnings."""
    pdf_path = example_doc_path("pdf/layout-parser-paper-fast.pdf")

    # NOTE(robinson): This is the recommended way to check that no warning is emitted,
    # per the pytest docs.
    # ref: https://docs.pytest.org/en/7.0.x/how-to/capture-warnings.html
    #      #additional-use-cases-of-warnings-in-tests
    with warnings.catch_warnings():
        # -- promote every warning to an exception so any warning fails the test --
        warnings.simplefilter("error")
        partition(pdf_path, strategy=PartitionStrategy.HI_RES)
# ================================================================================================
# PPT
# ================================================================================================
def test_auto_partition_ppt_from_filename():
    """A PPT file partitioned by path gives the expected slide elements and path metadata."""
    ppt_path = example_doc_path("fake-power-point.ppt")

    elements = partition(ppt_path, strategy=PartitionStrategy.HI_RES)

    expected = [
        Title(text="Adding a Bullet Slide"),
        ListItem(text="Find the bullet slide layout"),
        ListItem(text="Use _TextFrame.text for first bullet"),
        ListItem(text="Use _TextFrame.add_paragraph() for subsequent bullets"),
        NarrativeText(text="Here is a lot of text!"),
        NarrativeText(text="Here is some text in a text box!"),
    ]
    assert elements == expected
    filenames = [element.metadata.filename for element in elements]
    assert all(name == "fake-power-point.ppt" for name in filenames)
    directories = [element.metadata.file_directory for element in elements]
    assert all(directory == example_doc_path("") for directory in directories)
# ================================================================================================
# PPTX
# ================================================================================================
def test_auto_partition_pptx_from_filename():
    """A PPTX file partitioned by path gives the expected slide elements and path metadata."""
    pptx_path = example_doc_path("fake-power-point.pptx")

    elements = partition(pptx_path, strategy=PartitionStrategy.HI_RES)

    expected = [
        Title(text="Adding a Bullet Slide"),
        ListItem(text="Find the bullet slide layout"),
        ListItem(text="Use _TextFrame.text for first bullet"),
        ListItem(text="Use _TextFrame.add_paragraph() for subsequent bullets"),
        NarrativeText(text="Here is a lot of text!"),
        NarrativeText(text="Here is some text in a text box!"),
    ]
    assert elements == expected
    filenames = [element.metadata.filename for element in elements]
    assert all(name == "fake-power-point.pptx" for name in filenames)
    directories = [element.metadata.file_directory for element in elements]
    assert all(directory == example_doc_path("") for directory in directories)
@pytest.mark.parametrize("file_name", ["simple.pptx", "fake-power-point.ppt"])
@pytest.mark.parametrize(
    "strategy",
    [
        PartitionStrategy.AUTO,
        PartitionStrategy.FAST,
        PartitionStrategy.HI_RES,
        PartitionStrategy.OCR_ONLY,
    ],
)
def test_partition_forwards_strategy_arg_to_partition_pptx_and_its_brokers(
    request: FixtureRequest, file_name: str, strategy: str
):
    """The `strategy` value passed to `partition()` arrives at `partition_pptx()`.

    For the brokering partitioner (PPT) the value has to travel through `partition_ppt()`,
    which must pass it on to `partition_pptx()`. The mocked element iterator echoes the
    strategy it sees so the test can confirm it arrived intact.

    Note this is 2 file-types X 4 strategies = 8 test-cases.
    """
    from unstructured.partition.pptx import _PptxPartitioner

    def echo_strategy(self: _PptxPartitioner) -> Iterator[Element]:
        # -- emit a single element whose text records the strategy held by the partitioner --
        yield Text(f"strategy=={self._opts.strategy}")

    iter_presentation_elements_ = method_mock(
        request, _PptxPartitioner, "_iter_presentation_elements", side_effect=echo_strategy
    )
    pptx_like_path = example_doc_path(file_name)

    (element,) = partition(pptx_like_path, strategy=strategy)

    iter_presentation_elements_.assert_called_once_with(ANY)
    assert element.text == f"strategy=={strategy}"
# ================================================================================================
# RST
# ================================================================================================
def test_auto_partition_rst_from_filename():
    """An RST file partitioned by path starts with the expected title and file-type."""
    first = partition(example_doc_path("README.rst"))[0]

    assert first == Title("Example Docs")
    assert first.metadata.filetype == "text/x-rst"
def test_auto_partition_rst_from_file():
    """An RST file partitioned from an open file with an explicit content-type is recognized."""
    rst_path = example_doc_path("README.rst")
    with open(rst_path, "rb") as rst_file:
        first = partition(file=rst_file, content_type="text/x-rst")[0]

    assert first == Title("Example Docs")
    assert first.metadata.filetype == "text/x-rst"
# ================================================================================================
# RTF
# ================================================================================================
def test_auto_partition_rtf_from_filename():
    """An RTF file partitioned by path starts with the expected heading as a `Title`."""
    rtf_path = example_doc_path("fake-doc.rtf")

    first = partition(rtf_path, strategy=PartitionStrategy.HI_RES)[0]

    assert first == Title("My First Heading")
# ================================================================================================
# TSV
# ================================================================================================
def test_auto_partition_tsv_from_filename():
    """Partitioning a TSV by path yields the expected text, HTML table and file-type."""
    tsv_path = example_doc_path("stanley-cups.tsv")

    first = partition(tsv_path)[0]

    assert clean_extra_whitespace(first.text) == EXPECTED_TEXT
    metadata = first.metadata
    assert metadata.text_as_html == EXPECTED_TABLE
    assert metadata.filetype == "text/tsv"
# ================================================================================================
# TXT
# ================================================================================================
@pytest.mark.parametrize(
    ("filename", "expected_elements"),
    [
        (
            "fake-text.txt",
            [
                NarrativeText(text="This is a test document to use for unit tests."),
                Address(text="Doylestown, PA 18901"),
                Title(text="Important points:"),
                ListItem(text="Hamburgers are delicious"),
                ListItem(text="Dogs are the best"),
                ListItem(text="I love fuzzy blankets"),
            ],
        ),
        ("fake-text-all-whitespace.txt", []),
    ],
)
def test_auto_partition_text_from_filename(filename: str, expected_elements: list[Element]):
    """A text file partitioned by path gives the expected elements and path metadata.

    The all-whitespace case expects no elements at all.
    """
    txt_path = example_doc_path(filename)

    actual = partition(filename=txt_path, strategy=PartitionStrategy.HI_RES)

    assert actual == expected_elements
    actual_filenames = [element.metadata.filename for element in actual]
    assert all(name == filename for name in actual_filenames)
    actual_directories = [element.metadata.file_directory for element in actual]
    assert all(directory == example_doc_path("") for directory in actual_directories)
def test_auto_partition_text_from_file():
    """A text file partitioned from an open binary file gives the expected six elements."""
    txt_path = example_doc_path("fake-text.txt")
    with open(txt_path, "rb") as txt_file:
        actual = partition(file=txt_file, strategy=PartitionStrategy.HI_RES)

    assert len(actual) > 0
    expected = [
        NarrativeText(text="This is a test document to use for unit tests."),
        Address(text="Doylestown, PA 18901"),
        Title(text="Important points:"),
        ListItem(text="Hamburgers are delicious"),
        ListItem(text="Dogs are the best"),
        ListItem(text="I love fuzzy blankets"),
    ]
    assert actual == expected
# ================================================================================================
# XLS
# ================================================================================================
def test_auto_partition_xls_from_filename():
    """An XLS file partitioned by path yields 14 elements, two of them tables."""
    xls_path = example_doc_path("tests-example.xls")

    elements = partition(xls_path, include_header=False, skip_infer_table_types=[])

    assert len(elements) == 14
    tables = [element for element in elements if isinstance(element, Table)]
    assert len(tables) == 2
    first = elements[0]
    assert first.metadata.text_as_html == EXPECTED_XLS_TABLE
    assert len(first.text) == 507
# ================================================================================================
# XLSX
# ================================================================================================
def test_auto_partition_xlsx_from_filename():
    """An XLSX file partitioned by path yields two titles and two tables across two pages."""
    xlsx_mime_type = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
    xlsx_path = example_doc_path("stanley-cups.xlsx")

    elements = partition(xlsx_path, include_header=False, skip_infer_table_types=[])

    assert len(elements) == 4
    assert len([element for element in elements if isinstance(element, Table)]) == 2
    assert len([element for element in elements if isinstance(element, Title)]) == 2

    title, table = elements[0], elements[1]
    assert clean_extra_whitespace(title.text) == "Stanley Cups"
    assert clean_extra_whitespace(table.text) == (
        "Team Location Stanley Cups Blues STL 1 Flyers PHI 2 Maple Leafs TOR 13"
    )
    assert table.metadata.text_as_html == EXPECTED_TABLE_XLSX

    # -- the first two elements are on page 1, the remaining ones on page 2 --
    leading_pair, remainder = elements[:2], elements[2:]
    assert all(element.metadata.page_number == 1 for element in leading_pair)
    assert all(element.metadata.page_number == 2 for element in remainder)
    assert all(element.metadata.filetype == xlsx_mime_type for element in elements)
def test_auto_partition_xlsx_from_file():
    """An XLSX file partitioned from an open file yields two titles and two tables."""
    xlsx_mime_type = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
    xlsx_path = example_doc_path("stanley-cups.xlsx")

    with open(xlsx_path, "rb") as xlsx_file:
        elements = partition(file=xlsx_file, include_header=False, skip_infer_table_types=[])

    assert len(elements) == 4
    assert len([element for element in elements if isinstance(element, Table)]) == 2
    assert len([element for element in elements if isinstance(element, Title)]) == 2

    title, table = elements[0], elements[1]
    assert clean_extra_whitespace(title.text) == "Stanley Cups"
    assert clean_extra_whitespace(table.text) == (
        "Team Location Stanley Cups Blues STL 1 Flyers PHI 2 Maple Leafs TOR 13"
    )
    assert table.metadata.text_as_html == EXPECTED_TABLE_XLSX

    # -- the first two elements are on page 1, the remaining ones on page 2 --
    leading_pair, remainder = elements[:2], elements[2:]
    assert all(element.metadata.page_number == 1 for element in leading_pair)
    assert all(element.metadata.page_number == 2 for element in remainder)
    assert all(element.metadata.filetype == xlsx_mime_type for element in elements)
def test_auto_partition_xlsx_respects_starting_page_number_argument():
    """With `starting_page_number=3`, XLSX page numbers begin at 3 instead of 1."""
    xlsx_path = example_doc_path("stanley-cups.xlsx")

    elements = partition(xlsx_path, starting_page_number=3)

    leading_pair, remainder = elements[:2], elements[2:]
    assert all(element.metadata.page_number == 3 for element in leading_pair)
    assert all(element.metadata.page_number == 4 for element in remainder)
# ================================================================================================
# XML
# ================================================================================================
def test_auto_partition_xml_from_filename():
    """An XML file partitioned by path without tags starts with the expected text."""
    xml_path = example_doc_path("factbook.xml")

    elements = partition(xml_path, xml_keep_tags=False)

    assert elements[0].text == "United States"
    filenames = [element.metadata.filename for element in elements]
    assert all(name == "factbook.xml" for name in filenames)
def test_auto_partition_xml_from_file():
    """An XML file partitioned from an open file without tags starts with the expected text."""
    xml_path = example_doc_path("factbook.xml")
    with open(xml_path, "rb") as xml_file:
        first = partition(file=xml_file, xml_keep_tags=False)[0]

    assert first.text == "United States"
def test_auto_partition_xml_from_filename_with_tags():
    """With `xml_keep_tags=True`, the first element's text retains the XML markup."""
    xml_path = example_doc_path("factbook.xml")

    first = partition(xml_path, xml_keep_tags=True)[0]

    assert "<leader>Joe Biden</leader>" in first.text
    assert first.metadata.filename == "factbook.xml"
def test_auto_partition_xml_from_file_with_tags():
    """With `xml_keep_tags=True`, an XML file read from an open file retains its markup."""
    xml_path = example_doc_path("factbook.xml")
    with open(xml_path, "rb") as xml_file:
        first = partition(file=xml_file, xml_keep_tags=True)[0]

    assert "<leader>Joe Biden</leader>" in first.text
# ================================================================================================
# FILE_TYPE NOT RECOGNIZED OR NOT SUPPORTED
# ================================================================================================
def test_auto_partition_raises_with_bad_type(request: FixtureRequest):
    """`partition()` raises `UnsupportedFileFormatError` when detection yields `FileType.UNK`.

    `detect_filetype()` as referenced from `unstructured.partition.auto` is replaced with a mock
    that returns `FileType.UNK`. The test also verifies the arguments `partition()` passes to it.
    """
    detect_filetype_ = function_mock(
        request, "unstructured.partition.auto.detect_filetype", return_value=FileType.UNK
    )

    # -- `match` is a regex applied with `re.search()`; the dots are escaped so the message
    # -- must contain them literally rather than matching any character in those positions --
    with pytest.raises(
        UnsupportedFileFormatError,
        match=r"Partitioning is not supported for the FileType\.UNK file type\.",
    ):
        partition(filename="made-up.fake", strategy=PartitionStrategy.HI_RES)

    detect_filetype_.assert_called_once_with(
        file_path="made-up.fake",
        file=None,
        encoding=None,
        content_type=None,
        metadata_file_path=None,
    )
# ================================================================================================
# LOAD FROM URL
# ================================================================================================
def test_auto_partition_from_url():
    """Partitioning a URL with an explicit `text/plain` content type records the URL."""
    license_url = "https://raw.githubusercontent.com/Unstructured-IO/unstructured/main/LICENSE.md"

    elements = partition(
        url=license_url, content_type="text/plain", strategy=PartitionStrategy.HI_RES
    )

    assert elements[0] == Title("Apache License")
    for element in elements:
        assert element.metadata.url == license_url
def test_auto_partition_from_url_with_rfc9110_content_type():
    """A content type carrying a `charset` parameter is accepted when partitioning a URL."""
    license_url = "https://raw.githubusercontent.com/Unstructured-IO/unstructured/main/LICENSE.md"
    content_type_with_charset = "text/plain; charset=utf-8"

    elements = partition(
        url=license_url,
        content_type=content_type_with_charset,
        strategy=PartitionStrategy.HI_RES,
    )

    assert elements[0] == Title("Apache License")
    for element in elements:
        assert element.metadata.url == license_url
def test_auto_partition_from_url_without_providing_content_type():
    """Partitioning a URL works when no `content_type` argument is supplied."""
    license_url = "https://raw.githubusercontent.com/Unstructured-IO/unstructured/main/LICENSE.md"

    elements = partition(url=license_url, strategy=PartitionStrategy.HI_RES)

    assert elements[0] == Title("Apache License")
    for element in elements:
        assert element.metadata.url == license_url
def test_auto_partition_warns_if_header_set_and_not_url(caplog: LogCaptureFixture):
    """Passing `headers` without `url` logs a warning as the first captured record."""
    eml_path = example_doc_path("eml/fake-email.eml")
    request_headers = {"Accept": "application/pdf"}

    partition(eml_path, headers=request_headers, strategy=PartitionStrategy.HI_RES)

    first_record = caplog.records[0]
    assert first_record.levelname == "WARNING"
    assert "headers kwarg is set but the url kwarg is not. The headers kwarg will b" in caplog.text
def test_auto_partition_from_url_routes_timeout_to_HTTP_request(request: FixtureRequest):
    """`request_timeout` passed to `partition()` is forwarded to `file_and_type_from_url()`.

    `file_and_type_from_url()` is replaced with a mock that raises `ConnectionError`, so the
    test only needs to observe the exception and the arguments the mock was called with.
    """
    file_and_type_from_url_ = function_mock(
        request,
        "unstructured.partition.auto.file_and_type_from_url",
        side_effect=ConnectionError("Trouble on the wire ..."),
    )

    # -- `match` is a regex applied with `re.search()`; escape the dots so the ellipsis must
    # -- appear literally instead of matching any three characters --
    with pytest.raises(ConnectionError, match=r"Trouble on the wire \.\.\."):
        partition(url="http://eie.io", request_timeout=326)

    file_and_type_from_url_.assert_called_once_with(
        url="http://eie.io", content_type=None, headers={}, ssl_verify=True, request_timeout=326
    )
# ================================================================================================
# OTHER ARGS
# ================================================================================================
# -- chunking_strategy ----------------------------------------------------
def test_auto_partition_forwards_chunking_strategy_via_kwargs():
    """With `chunking_strategy="by_title"` every returned element is a chunk type."""
    chunk_types = (CompositeElement, Table, TableChunk)
    html_path = example_doc_path("example-10k-1p.html")

    chunks = partition(html_path, chunking_strategy="by_title")

    for chunk in chunks:
        assert isinstance(chunk, chunk_types)
def test_auto_partition_forwards_max_characters_via_kwargs():
    """No chunk's text exceeds the `max_characters` value passed to `partition()`."""
    html_path = example_doc_path("example-10k-1p.html")

    chunks = partition(html_path, chunking_strategy="by_title", max_characters=250)

    for chunk in chunks:
        assert len(chunk.text) <= 250
# -- detect_language_per_element ------------------------------------------
def test_auto_partition_respects_detect_language_per_element_arg():
    """Each element gets its own detected languages when `detect_language_per_element=True`."""
    doc_path = example_doc_path("language-docs/eng_spa_mult.txt")
    expected_languages = [["eng"], ["spa", "eng"], ["eng"], ["eng"], ["spa"]]

    elements = partition(doc_path, detect_language_per_element=True)

    assert [e.metadata.languages for e in elements] == expected_languages
# -- languages ------------------------------------------------------------
@pytest.mark.parametrize(
    "file_extension",
    [
        "doc",
        "docx",
        "eml",
        "epub",
        "html",
        "md",
        "odt",
        "org",
        "ppt",
        "pptx",
        "rst",
        "rtf",
        "txt",
        "xml",
    ],
)
def test_auto_partition_respects_language_arg(file_extension: str):
    """A caller-supplied `languages` value is applied to every element for each file format."""
    doc_path = example_doc_path(f"language-docs/eng_spa_mult.{file_extension}")

    elements = partition(doc_path, languages=["deu"])

    for element in elements:
        assert element.metadata.languages == ["deu"]
# -- include_page_breaks --------------------------------------------------
def test_auto_partition_forwards_include_page_breaks_to_partition_pdf():
    """A "PageBreak" element appears in PDF output when `include_page_breaks=True`."""
    pdf_path = example_doc_path("pdf/layout-parser-paper-fast.pdf")

    elements = partition(pdf_path, include_page_breaks=True, strategy=PartitionStrategy.HI_RES)

    categories = [element.category for element in elements]
    assert "PageBreak" in categories
# -- metadata_filename ----------------------------------------------------
def test_auto_partition_forwards_metadata_filename_via_kwargs():
    """`metadata_filename` becomes the filename recorded on elements partitioned from a file."""
    replacement_name = "much-more-interesting-name.txt"
    with open(example_doc_path("fake-text.txt"), "rb") as text_file:
        elements = partition(file=text_file, metadata_filename=replacement_name)

    for element in elements:
        assert element.metadata.filename == "much-more-interesting-name.txt"
# -- ocr_languages --------------------------------------------------------
def test_auto_partition_image_formats_languages_for_tesseract(request: FixtureRequest):
    """`languages=["zh"]` reaches `process_file_with_ocr()` as a "+"-joined `ocr_languages`."""
    process_file_with_ocr_ = function_mock(
        request, "unstructured.partition.pdf_image.ocr.process_file_with_ocr"
    )
    image_path = example_doc_path("img/chi_sim_image.jpeg")

    partition(image_path, strategy=PartitionStrategy.HI_RES, languages=["zh"])

    # -- inspect the keyword arguments of the first call made to the mocked OCR function --
    first_call = process_file_with_ocr_.call_args_list[0]
    assert first_call.kwargs["ocr_languages"] == "chi_sim+chi_sim_vert+chi_tra+chi_tra_vert"
@pytest.mark.parametrize(("languages", "ocr_languages"), [(["auto"], ""), (["eng"], "")])
def test_auto_partition_ignores_empty_string_for_ocr_languages(
    languages: list[str], ocr_languages: str
):
    """An empty-string `ocr_languages` still leaves every element with `["eng"]` languages."""
    text_path = example_doc_path("book-war-and-peace-1p.txt")

    elements = partition(
        text_path,
        strategy=PartitionStrategy.OCR_ONLY,
        ocr_languages=ocr_languages,
        languages=languages,
    )

    for element in elements:
        assert element.metadata.languages == ["eng"]
def test_auto_partition_warns_with_ocr_languages(caplog: LogCaptureFixture):
    """Using the `ocr_languages` argument logs a deprecation warning as the first record."""
    pdf_path = example_doc_path("pdf/chevron-page.pdf")

    partition(pdf_path, strategy=PartitionStrategy.HI_RES, ocr_languages="eng")

    first_record = caplog.records[0]
    assert first_record.levelname == "WARNING"
    assert "The ocr_languages kwarg will be deprecated" in caplog.text
# -- skip_infer_table_types -----------------------------------------------
@pytest.mark.parametrize(
    ("skip_infer_table_types", "filename", "has_text_as_html"),
    [
        (["xlsx"], "stanley-cups.xlsx", False),
        ([], "stanley-cups.xlsx", True),
        (["odt"], "fake.odt", False),
        ([], "fake.odt", True),
    ],
)
def test_auto_partition_respects_skip_infer_table_types(
    skip_infer_table_types: list[str], filename: str, has_text_as_html: bool
):
    """Tables carry `text_as_html` only when their file type is not in `skip_infer_table_types`."""
    with open(example_doc_path(filename), "rb") as doc_file:
        elements = partition(file=doc_file, skip_infer_table_types=skip_infer_table_types)

    tables = [element for element in elements if isinstance(element, Table)]
    # -- guard against a vacuous pass when the document produced no tables --
    assert tables
    for table in tables:
        html_is_present = table.metadata.text_as_html is not None
        assert html_is_present == has_text_as_html
# ================================================================================================
# METADATA BEHAVIORS
# ================================================================================================
# -- .filetype ------------------------------------------------------------
@pytest.mark.parametrize(
    ("content_type", "shortname", "expected_value"),
    [
        ("text/csv", "csv", "text/csv"),
        ("text/html", "html", "text/html"),
        ("jdsfjdfsjkds", "pdf", None),
    ],
)
def test_auto_partition_adds_filetype_to_metadata(
    request: FixtureRequest,
    content_type: str,
    shortname: str,
    expected_value: str | None,
):
    """`partition()` sets `.metadata.filetype` on elements returned by the partitioner.

    Both the file-type-specific partitioner and `_PartitionerLoader.get()` are mocked, so the
    elements under test are the two stub `Text` elements defined here.
    """
    stub_elements = [Text("text 1"), Text("text 2")]
    partitioner_qname = f"unstructured.partition.{shortname}.partition_{shortname}"
    partition_fn_ = function_mock(request, partitioner_qname, return_value=stub_elements)
    partitioner_loader_get_ = method_mock(
        request, _PartitionerLoader, "get", return_value=partition_fn_
    )
    pdf_path = example_doc_path("pdf/layout-parser-paper-fast.pdf")

    elements = partition(pdf_path, content_type=content_type)

    partitioner_loader_get_.assert_called_once()
    assert len(elements) == 2
    for element in elements:
        assert element.metadata.filetype == expected_value
@pytest.mark.parametrize(
    "content_type",
    [
        # -- content-type provided as argument --
        "application/pdf",
        # -- auto-detected content-type --
        None,
    ],
)
def test_auto_partition_overwrites_any_filetype_applied_by_file_specific_partitioner(
    request: FixtureRequest, content_type: str | None
):
    """A filetype set by the delegate partitioner is replaced with "application/pdf".

    The mocked `partition_pdf()` returns elements whose metadata already has a bogus filetype;
    the elements returned by `partition()` must not retain it.
    """
    bogus_metadata = ElementMetadata(filetype="imapdf")
    stub_elements = [
        Text("text 1", metadata=bogus_metadata),
        Text("text 2", metadata=bogus_metadata),
    ]
    partition_pdf_ = function_mock(
        request, "unstructured.partition.pdf.partition_pdf", return_value=stub_elements
    )
    partitioner_loader_get_ = method_mock(
        request, _PartitionerLoader, "get", return_value=partition_pdf_
    )
    pdf_path = example_doc_path("pdf/layout-parser-paper-fast.pdf")

    elements = partition(pdf_path, content_type=content_type)

    partitioner_loader_get_.assert_called_once_with(ANY, FileType.PDF)
    assert len(elements) == 2
    for element in elements:
        assert element.metadata.filetype == "application/pdf"
@pytest.mark.parametrize(
    ("file_name", "file_type"),
    [
        ("stanley-cups.csv", FileType.CSV),
        ("simple.doc", FileType.DOC),
        ("simple.docx", FileType.DOCX),
        ("fake-email.eml", FileType.EML),
        ("simple.epub", FileType.EPUB),
        ("fake-html.html", FileType.HTML),
        ("README.md", FileType.MD),
        ("fake-email.msg", FileType.MSG),
        ("simple.odt", FileType.ODT),
        ("pdf/DA-1p.pdf", FileType.PDF),
        ("fake-power-point.ppt", FileType.PPT),
        ("simple.pptx", FileType.PPTX),
        ("README.rst", FileType.RST),
        ("fake-doc.rtf", FileType.RTF),
        ("stanley-cups.tsv", FileType.TSV),
        ("fake-text.txt", FileType.TXT),
        # NOTE(review): the `.xls` example is paired with `FileType.XLSX` -- confirm intended.
        ("tests-example.xls", FileType.XLSX),
        ("stanley-cups.xlsx", FileType.XLSX),
        ("factbook.xml", FileType.XML),
    ],
)
def test_auto_partition_applies_the_correct_filetype_for_all_filetypes(
    file_name: str, file_type: FileType
):
    """Each file-type-specific partitioner stamps its file type's MIME type on its elements.

    The partitioner function is resolved from `file_type` and called directly rather than via
    `partition()`. Elements whose `.metadata.filetype` is `None` are excluded from the check.
    """
    file_path = example_doc_path(file_name)
    partitioner_module = import_module(file_type.partitioner_module_qname)
    partition_fn = getattr(partitioner_module, file_type.partitioner_function_name)

    # -- partition the example-doc for this filetype --
    elements = partition_fn(file_path, process_attachments=False)

    assert elements
    stamped_filetypes = [
        e.metadata.filetype for e in elements if e.metadata.filetype is not None
    ]
    for filetype in stamped_filetypes:
        assert filetype == file_type.mime_type
def test_detect_filetype_maps_file_to_bytes_io_when_spooled_temp_file_used(mocker):
    """`detect_filetype()` passes a readable context when given a `SpooledTemporaryFile`.

    `_FileTypeDetector` is replaced with a mock so the context object handed to its
    `file_type()` call can be captured and its `text_head` compared to the bytes written.
    """
    detector_mock = MagicMock(return_value=FileType.JSON)
    mocker.patch("unstructured.file_utils.filetype._FileTypeDetector", detector_mock)

    with tempfile.SpooledTemporaryFile() as spooled_file:
        spooled_file.write(b'{"text": Hello, world!}')
        spooled_file.seek(0)

        detect_filetype(file=spooled_file)

        # -- first positional argument of the most recent `.file_type()` call --
        captured_context = detector_mock.file_type.call_args.args[0]
        assert captured_context.text_head == '{"text": Hello, world!}'
# -- .languages -----------------------------------------------------------
def test_auto_partition_passes_user_provided_languages_arg_to_PDF():
    """A caller-supplied `languages` value shows up on every element of a partitioned PDF."""
    pdf_path = example_doc_path("pdf/chevron-page.pdf")

    elements = partition(pdf_path, strategy=PartitionStrategy.OCR_ONLY, languages=["eng"])

    for element in elements:
        assert element.metadata.languages == ["eng"]
def test_auto_partition_languages_argument_default_to_None_when_omitted():
    """When `languages` is omitted, only elements with empty text have `None` languages."""
    docx_path = example_doc_path("handbook-1p.docx")

    elements = partition(docx_path, detect_language_per_element=True)

    # -- PageBreak and any other element with no text is assigned `None` --
    elements_without_languages = [e for e in elements if e.metadata.languages is None]
    for element in elements_without_languages:
        assert element.text == ""
def test_auto_partition_default_does_not_overwrite_other_defaults():
    """`partition()` without `languages` yields the same languages as `partition_text()`."""
    # -- the default for `languages` is ["auto"] in partition_text --
    from unstructured.partition.text import partition_text

    # -- use a document that is primarily in a language other than English --
    file_path = example_doc_path("language-docs/UDHR_first_article_all.txt")

    languages_from_text = partition_text(file_path)[0].metadata.languages
    assert languages_from_text != ["eng"]

    languages_from_auto = partition(file_path)[0].metadata.languages
    assert languages_from_auto != ["eng"]
    assert languages_from_auto == languages_from_text
# ================================================================================================
# MISCELLANEOUS BEHAVIORS
# ================================================================================================
def test_auto_partition_from_filename_works_on_empty_file():
    """Partitioning an empty file by path returns an empty list."""
    elements = partition(example_doc_path("empty.txt"))
    assert elements == []
def test_auto_partition_from_file_works_on_empty_file():
    """Partitioning an empty file given as a binary file object returns an empty list."""
    with open(example_doc_path("empty.txt"), "rb") as empty_file:
        elements = partition(file=empty_file)
        assert elements == []
def test_auto_partition_that_requires_extras_raises_when_dependencies_are_not_installed(
    request: FixtureRequest,
):
    """`partition()` raises `ImportError` for a PDF when `dependency_exists()` reports False."""
    # -- drop any PDF entry from the loader's class-level `_partitioners` mapping first --
    _PartitionerLoader._partitioners.pop(FileType.PDF, None)
    dependency_exists_ = function_mock(
        request, "unstructured.partition.auto.dependency_exists", return_value=False
    )
    pdf_path = example_doc_path("pdf/layout-parser-paper-fast.pdf")
    expected_message = (
        r"partition_pdf\(\) is not available because one or more dependencies are not installed"
    )

    with pytest.raises(ImportError, match=expected_message):
        partition(pdf_path)

    dependency_exists_.assert_called_once_with("pdf2image")
# ================================================================================================
# MODULE-LEVEL FIXTURES
# ================================================================================================
@pytest.fixture()
def expected_docx_elements():
    """Return the list of elements tests compare against for the example DOCX content."""
    element_specs = (
        (Title, "These are a few of my favorite things:"),
        (ListItem, "Parrots"),
        (ListItem, "Hockey"),
        (Text, "Analysis"),
        (NarrativeText, "This is my first thought. This is my second thought."),
        (NarrativeText, "This is my third thought."),
        (Text, "2023"),
        (Address, "DOYLESTOWN, PA 18901"),
    )
    return [element_cls(text) for element_cls, text in element_specs]
def _test_partition_foo():
pass
def test_auto_partition_works_with_custom_types(
    request: FixtureRequest,
):
    """A partitioner registered for a custom file type is what `_PartitionerLoader.get()` returns."""
    foo_file_type = create_file_type(
        "FOO", canonical_mime_type="application/foo", extensions=[".foo"]
    )
    register = register_partitioner(foo_file_type)
    register(_test_partition_foo)

    resolved_partitioner = _PartitionerLoader().get(foo_file_type)

    assert resolved_partitioner is _test_partition_foo