1384 lines
49 KiB
Python
Raw Permalink Normal View History

# pyright: reportPrivateUsage=false
from __future__ import annotations
import json
import os
import pathlib
import tempfile
import warnings
from importlib import import_module
rfctr(part): prepare for pluggable auto-partitioners 1 (#3655) **Summary** In preparation for pluggable auto-partitioners simplify metadata as discussed. **Additional Context** - Pluggable auto-partitioners requires partitioners to have a consistent call signature. An arbitrary partitioner provided at runtime needs to have a call signature that is known and consistent. Basically `partition_x(filename, *, file, **kwargs)`. - The current `auto.partition()` is highly coupled to each distinct file-type partitioner, deciding which arguments to forward to each. - This is driven by the existence of "delegating" partitioners, those that convert their file-type and then call a second partitioner to do the actual partitioning. Both the delegating and proxy partitioners are decorated with metadata-post-processing decorators and those decorators are not idempotent. We call the situation where those decorators would run twice "double-decorating". For example, EPUB converts to HTML and calls `partition_html()` and both `partition_epub()` and `partition_html()` are decorated. - The way double-decorating has been avoided in the past is to avoid sending the arguments the metadata decorators are sensitive to to the proxy partitioner. This is very obscure, complex to reason about, error-prone, and just overall not a viable strategy. The better solution is to not decorate delegating partitioners and let the proxy partitioner handle all the metadata. - This first step in preparation for that is part of simplifying the metadata processing by removing unused or unwanted legacy parameters. - `date_from_file_object` is a misnomer because a file-object never contains last-modified data. - It can never produce useful results in the API where last-modified information must be provided by `metadata_last_modified`. - It is an undocumented parameter so not in use. - Using it can produce incorrect metadata.
2024-09-23 15:23:10 -07:00
from typing import Iterator
from unittest.mock import MagicMock, patch
import pytest
from PIL import Image
from test_unstructured.partition.pdf_image.test_pdf import assert_element_extraction
from test_unstructured.partition.test_constants import (
EXPECTED_TABLE,
EXPECTED_TABLE_XLSX,
EXPECTED_TEXT,
EXPECTED_XLS_TABLE,
)
from test_unstructured.unit_utils import (
ANY,
FixtureRequest,
LogCaptureFixture,
example_doc_path,
function_mock,
method_mock,
)
from unstructured.cleaners.core import clean_extra_whitespace
from unstructured.documents.elements import (
Address,
CompositeElement,
Element,
ElementMetadata,
ListItem,
NarrativeText,
Table,
chore: Table chunking (#1540) This change is adding to our `add_chunking_strategy` logic so that we are able to chunk Table elements' `text` and `text_as_html` params. In order to keep the functionality under the same `by_title` chunking strategy we have renamed the `combine_under_n_chars` to `max_characters`. It functions the same way for the combining elements under Title's, as well as specifying a chunk size (in chars) for TableChunk elements. *renaming the variable to `max_characters` will also reflect the 'hard max' we will implement for large elements in followup PRs Additionally -> some lint changes snuck in when I ran `make tidy` hence the minor changes in unrelated files :) TODO: ✅ add unit tests --> note: added where I could to unit tests! Some unit tests I just clarified that the chunking strategy was now 'by_title' because we don't have a file example that has Table elements to test the 'by_num_characters' chunking strategy ✅ update changelog To manually test: ``` In [1]: filename="example-docs/example-10k.html" In [2]: from unstructured.chunking.title import chunk_table_element In [3]: from unstructured.partition.auto import partition In [4]: elements = partition(filename) # element at -2 happens to be a Table, and we'll get chunks of char size 4 here In [5]: chunks = chunk_table_element(elements[-2], 4) # examine text and text_as_html params ln [6]: for c in chunks: print(c.text) print(c.metadata.text_as_html) ``` --------- Co-authored-by: Yao You <theyaoyou@gmail.com>
2023-10-03 09:40:34 -07:00
TableChunk,
Text,
Title,
)
from unstructured.file_utils.filetype import detect_filetype
from unstructured.file_utils.model import FileType, create_file_type, register_partitioner
from unstructured.partition.auto import _PartitionerLoader, partition
rfctr(email): eml partitioner rewrite (#3694) **Summary** Initial attempts to incrementally refactor `partition_email()` into shape to allow pluggable partitioning quickly became too complex for ready code-review. Prepare separate rewritten module and tests and swap them out whole. **Additional Context** - Uses the modern stdlib `email` module to reliably accomplish several manual decoding steps in the legacy code. - Remove obsolete email-specific element-types which were replaced 18 months or so ago with email-specific metadata fields for things like Cc: addresses, subject, etc. - Remove accepting an email as `text: str` because MIME-email is inherently a binary format which can and often does contain multiple and contradictory character-encodings. - Remove `encoding` parameters as it is now unused. An email file is not a text file and as such does not have a single overall encoding. Character encoding is specified individually for each MIME-part within the message and often varies from one part to another in the same message. - Remove the need for a caller to specify `attachment_partitioner`. There is only one reasonable choice for this which is `auto.partition()`, consistent with the same interface and operation in `partition_msg()`. - Fixes #3671 along the way by silently skipping attachments with a file-type for which there is no partitioner. - Substantially extend the test-suite to cover multiple transport-encoding/charset combinations. --------- Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> Co-authored-by: scanny <scanny@users.noreply.github.com>
2024-10-15 19:02:33 -07:00
from unstructured.partition.common import UnsupportedFileFormatError
from unstructured.partition.utils.constants import PartitionStrategy
from unstructured.staging.base import elements_from_json, elements_to_dicts, elements_to_json
# -- the presence of `/.dockerenv` is taken to mean the tests are running inside Docker --
_DOCKERENV_PATH = "/.dockerenv"
is_in_docker = os.path.exists(_DOCKERENV_PATH)
# ================================================================================================
# CSV
# ================================================================================================
def test_auto_partition_csv_from_filename():
    """A CSV file given by path yields the expected text, HTML table and `text/csv` filetype."""
    csv_path = example_doc_path("stanley-cups.csv")

    first = partition(csv_path)[0]

    assert clean_extra_whitespace(first.text) == EXPECTED_TEXT
    assert first.metadata.text_as_html == EXPECTED_TABLE
    assert first.metadata.filetype == "text/csv"
def test_auto_partition_csv_from_file():
    """A CSV file given as a binary file-object yields a `Table` with the expected content."""
    with open(example_doc_path("stanley-cups.csv"), "rb") as csv_file:
        elements = partition(file=csv_file)

    first = elements[0]
    assert clean_extra_whitespace(first.text) == EXPECTED_TEXT
    assert isinstance(first, Table)
    assert first.metadata.text_as_html == EXPECTED_TABLE
    assert first.metadata.filetype == "text/csv"
# ================================================================================================
# DOC
# ================================================================================================
@pytest.mark.parametrize(
    ("pass_metadata_filename", "content_type"),
    [
        (False, None),
        (False, "application/msword"),
        (True, "application/msword"),
        (True, None),
    ],
)
def test_auto_partition_doc_from_filename(
    pass_metadata_filename: bool, content_type: str | None, expected_docx_elements: list[Element]
):
    """A DOC file given by path partitions to the expected elements and path metadata.

    Exercised with and without `metadata_filename` and with and without an explicit
    `content_type`.
    """
    doc_path = example_doc_path("simple.doc")

    elements = partition(
        filename=doc_path,
        metadata_filename=doc_path if pass_metadata_filename else None,
        content_type=content_type,
        strategy=PartitionStrategy.HI_RES,
    )

    # -- print each element to stdout as a debugging aid --
    for element in elements:
        print(f"{type(element).__name__}({repr(element.text)})")

    assert elements == expected_docx_elements
    assert all(element.metadata.filename == "simple.doc" for element in elements)
    expected_directory = example_doc_path("")
    assert all(element.metadata.file_directory == expected_directory for element in elements)
@pytest.mark.skipif(is_in_docker, reason="Passes in CI but not Docker. Remove skip on #3364 fix.")
def test_auto_partition_doc_from_file(expected_docx_elements: list[Element]):
    """A DOC file given as a binary file-object partitions to the expected elements."""
    doc_path = example_doc_path("simple.doc")

    with open(doc_path, "rb") as doc_file:
        elements = partition(file=doc_file)

    assert elements == expected_docx_elements
# ================================================================================================
# DOCX
# ================================================================================================
def test_auto_partition_docx_from_filename(expected_docx_elements: list[Element]):
    """A DOCX file given by path yields the expected elements, each with `filename` metadata."""
    docx_path = example_doc_path("simple.docx")

    elements = partition(docx_path, strategy=PartitionStrategy.HI_RES)

    assert elements == expected_docx_elements
    assert all(element.metadata.filename == "simple.docx" for element in elements)
def test_auto_partition_docx_from_file(expected_docx_elements: list[Element]):
    """A DOCX file given as a binary file-object partitions to the expected elements."""
    docx_path = example_doc_path("simple.docx")

    with open(docx_path, "rb") as docx_file:
        elements = partition(file=docx_file, strategy=PartitionStrategy.HI_RES)

    assert elements == expected_docx_elements
@pytest.mark.parametrize("file_name", ["simple.docx", "simple.doc", "simple.odt"])
@pytest.mark.parametrize(
    "strategy",
    [
        PartitionStrategy.AUTO,
        PartitionStrategy.FAST,
        PartitionStrategy.HI_RES,
        PartitionStrategy.OCR_ONLY,
    ],
)
def test_partition_forwards_strategy_arg_to_partition_docx_and_its_brokers(
    request: FixtureRequest, file_name: str, strategy: str
):
    """The `strategy` value given to `partition()` arrives at `partition_docx()`.

    For the brokering partitioners (DOC, ODT) the value must first reach `partition_doc()` or
    `partition_odt()`, which must in turn forward it to `partition_docx()`. This test confirms it
    makes the whole trip.

    Note this is 3 file-types X 4 strategies = 12 test-cases.
    """
    from unstructured.partition.docx import _DocxPartitioner

    def report_strategy(self: _DocxPartitioner) -> Iterator[Element]:
        # -- emit a single element whose text records the strategy the partitioner received --
        yield Text(f"strategy=={self._opts.strategy}")

    _iter_document_elements_ = method_mock(
        request,
        _DocxPartitioner,
        "_iter_document_elements",
        side_effect=report_strategy,
    )
    doc_path = example_doc_path(file_name)

    (only_element,) = partition(doc_path, strategy=strategy)

    _iter_document_elements_.assert_called_once_with(ANY)
    assert only_element.text == f"strategy=={strategy}"
# ================================================================================================
# EML
# ================================================================================================
# -- elements the EML tests below expect from partitioning `eml/fake-email.eml` --
EXPECTED_EMAIL_OUTPUT = [
    NarrativeText("This is a test email to use for unit tests."),
    Text("Important points:"),
    ListItem("Roses are red"),
    ListItem("Violets are blue"),
]
def test_auto_partition_email_from_filename():
    """An EML file given by path yields the expected elements and path metadata."""
    eml_path = example_doc_path("eml/fake-email.eml")

    elements = partition(eml_path, strategy=PartitionStrategy.HI_RES)

    assert len(elements) > 0
    assert elements == EXPECTED_EMAIL_OUTPUT
    expected_directory, expected_filename = os.path.split(eml_path)
    first_metadata = elements[0].metadata
    assert first_metadata.filename == expected_filename
    assert first_metadata.file_directory == expected_directory
def test_auto_partition_email_from_file():
    """An EML file given as a binary file-object yields the expected elements."""
    eml_path = example_doc_path("eml/fake-email.eml")

    with open(eml_path, "rb") as eml_file:
        elements = partition(file=eml_file, strategy=PartitionStrategy.HI_RES)

    assert len(elements) > 0
    assert elements == EXPECTED_EMAIL_OUTPUT
# ================================================================================================
# EPUB
# ================================================================================================
def test_auto_partition_epub_from_filename():
    """An EPUB file given by path partitions; the third element is the eBook heading text."""
    epub_path = example_doc_path("winter-sports.epub")

    elements = partition(epub_path, strategy=PartitionStrategy.HI_RES)

    assert len(elements) > 0
    third = elements[2]
    assert third.text.startswith("The Project Gutenberg eBook of Winter Sports")
def test_auto_partition_epub_from_file():
    """An EPUB given as a binary file-object partitions; the third element is the heading."""
    epub_path = example_doc_path("winter-sports.epub")

    with open(epub_path, "rb") as epub_file:
        elements = partition(file=epub_file, strategy=PartitionStrategy.HI_RES)

    assert len(elements) > 0
    third = elements[2]
    assert third.text.startswith("The Project Gutenberg eBook of Winter Sports")
# ================================================================================================
# HTML
# ================================================================================================
@pytest.mark.parametrize(
    ("pass_metadata_filename", "content_type"),
    [
        (False, None),
        (False, "text/html"),
        (True, "text/html"),
        (True, None),
    ],
)
def test_auto_partition_html_from_filename(pass_metadata_filename: bool, content_type: str | None):
    """An HTML file given by path yields elements carrying its file name and directory.

    Exercised with and without `metadata_filename` and with and without an explicit
    `content_type`.
    """
    html_path = example_doc_path("example-10k-1p.html")

    elements = partition(
        filename=html_path,
        metadata_filename=html_path if pass_metadata_filename else None,
        content_type=content_type,
        strategy=PartitionStrategy.HI_RES,
    )

    assert elements
    expected_directory, expected_filename = os.path.split(html_path)
    assert all(element.metadata.filename == expected_filename for element in elements)
    assert all(element.metadata.file_directory == expected_directory for element in elements)
@pytest.mark.parametrize(
    ("pass_metadata_filename", "content_type"),
    [
        (False, None),
        (False, "text/html"),
        (True, "text/html"),
        (True, None),
    ],
)
def test_auto_partition_html_from_file(pass_metadata_filename: bool, content_type: str | None):
    """An HTML file given as a binary file-object produces at least one element.

    Exercised with and without `metadata_filename` and with and without an explicit
    `content_type`.
    """
    html_path = example_doc_path("example-10k-1p.html")
    partition_kwargs = {
        "metadata_filename": html_path if pass_metadata_filename else None,
        "content_type": content_type,
        "strategy": PartitionStrategy.HI_RES,
    }

    with open(html_path, "rb") as html_file:
        elements = partition(file=html_file, **partition_kwargs)

    assert len(elements) > 0
def test_auto_partition_html_pre_from_file():
    """An `.htm` file partitions as HTML: no page-breaks and a leading `NarrativeText`."""
    htm_path = example_doc_path("fake-html-pre.htm")

    elements = partition(htm_path)

    assert len(elements) > 0
    assert all(element.category != "PageBreak" for element in elements)
    first = elements[0]
    assert clean_extra_whitespace(first.text).startswith("[107th Congress Public Law 56]")
    assert isinstance(first, NarrativeText)
    assert all(element.metadata.filetype == "text/html" for element in elements)
    assert all(element.metadata.filename == "fake-html-pre.htm" for element in elements)
# ================================================================================================
# IMAGE
# ================================================================================================
@pytest.mark.parametrize(
    ("pass_metadata_filename", "content_type"),
    [
        (False, None),
        (False, "image/jpeg"),
        (True, "image/jpeg"),
        (True, None),
    ],
)
def test_auto_partition_jpeg_from_filename(pass_metadata_filename: bool, content_type: str | None):
    """A JPEG given by path partitions; the third element is the paper title with coordinates."""
    expected_title_text = (
        "LayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis"
    )
    jpg_path = example_doc_path("img/layout-parser-paper-fast.jpg")

    elements = partition(
        filename=jpg_path,
        metadata_filename=jpg_path if pass_metadata_filename else None,
        content_type=content_type,
        strategy=PartitionStrategy.AUTO,
    )

    third = elements[2]
    assert third.text == expected_title_text
    assert third.metadata.coordinates is not None
@pytest.mark.parametrize(
    ("pass_metadata_filename", "content_type"),
    [
        (False, None),
        (False, "image/jpeg"),
        (True, "image/jpeg"),
        (True, None),
    ],
)
def test_auto_partition_jpeg_from_file(pass_metadata_filename: bool, content_type: str | None):
    """A JPEG given as a file-object partitions; the third element is the paper title."""
    expected_title_text = (
        "LayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis"
    )
    jpg_path = example_doc_path("img/layout-parser-paper-fast.jpg")
    metadata_filename = jpg_path if pass_metadata_filename else None

    with open(jpg_path, "rb") as jpg_file:
        elements = partition(
            file=jpg_file,
            metadata_filename=metadata_filename,
            content_type=content_type,
            strategy=PartitionStrategy.AUTO,
        )

    third = elements[2]
    assert third.text == expected_title_text
    assert third.metadata.coordinates is not None
def test_auto_partition_bmp_from_filename(tmp_path: pathlib.Path):
    """A BMP image partitions with HI_RES and yields exactly one element with table HTML."""
    # -- produce the BMP by re-saving an example JPEG under a `.bmp` name --
    bmp_filename = str(tmp_path / "example.bmp")
    with Image.open(example_doc_path("img/layout-parser-paper-with-table.jpg")) as source_image:
        source_image.save(bmp_filename)

    elements = partition(filename=bmp_filename, strategy=PartitionStrategy.HI_RES)

    html_tables = [
        element.metadata.text_as_html for element in elements if element.metadata.text_as_html
    ]
    assert len(html_tables) == 1
    table_html = html_tables[0]
    assert "<table><thead><tr>" in table_html
    assert "</thead><tbody><tr>" in table_html
@pytest.mark.parametrize("extract_image_block_to_payload", [False, True])
def test_auto_partition_image_element_extraction(extract_image_block_to_payload: bool):
    """Image and Table blocks extracted from a JPEG pass `assert_element_extraction()`."""
    block_types = ["Image", "Table"]
    image_path = example_doc_path("img/embedded-images-tables.jpg")

    with tempfile.TemporaryDirectory() as output_dir:
        elements = partition(
            filename=image_path,
            extract_image_block_types=block_types,
            extract_image_block_to_payload=extract_image_block_to_payload,
            extract_image_block_output_dir=output_dir,
        )

        # -- verify while the temporary output directory still exists --
        assert_element_extraction(
            elements, block_types, extract_image_block_to_payload, output_dir
        )
# ================================================================================================
# JSON
# ================================================================================================
# TODO(scanny): This test should go away when we fix #3365. This test glosses over several
# important JSON "rehydration" behaviors, in particular that the metadata should match exactly.
# The following test `test_auto_partition_json_from_file_preserves_original_elements` will be the
# replacement for this test.
def test_auto_partitioned_json_output_maintains_consistency_with_fixture_elements():
    """Test auto-processing an unstructured json output file by filename.

    The JSON fixture is partitioned, serialized back to JSON and compared with the fixture's own
    contents. The `metadata` entry is dropped from every element on both sides before comparing,
    so only the remaining fields are checked.
    """
    json_file_path = example_doc_path("spring-weather.html.json")
    original_file_name = "spring-weather.html"
    # -- read as UTF-8 explicitly so the result does not depend on the platform locale --
    with open(json_file_path, encoding="utf-8") as json_f:
        expected_result = json.load(json_f)

    partitioning_result = json.loads(
        elements_to_json(
            partition(
                filename=str(json_file_path),
                # -- use the original file name to get the same element IDs (hashes) --
                metadata_filename=original_file_name,
                strategy=PartitionStrategy.HI_RES,
            )
        )
    )

    # -- metadata is intentionally excluded from the comparison --
    for elem in partitioning_result:
        elem.pop("metadata")
    for elem in expected_result:
        elem.pop("metadata")

    assert expected_result == partitioning_result
@pytest.mark.xfail(
    reason=(
        "https://github.com/Unstructured-IO/unstructured/issues/3365"
        " partition_json() does not preserve original element-id or metadata"
    ),
    raises=AssertionError,
    strict=True,
)
def test_auto_partition_json_from_file_preserves_original_elements():
    """Partitioning a JSON file should reproduce the elements it was serialized from.

    Marked as a strict expected-failure: the comparison must fail with `AssertionError`.
    """
    json_path = example_doc_path("simple.json")
    expected_dicts = elements_to_dicts(elements_from_json(json_path))

    with open(json_path, "rb") as json_file:
        partitioned_elements = partition(file=json_file)

    assert elements_to_dicts(partitioned_elements) == expected_dicts
def test_auto_partition_processes_simple_ndjson(tmp_path: pathlib.Path):
    """A file holding a single JSON object line partitions to one `NarrativeText` element."""
    json_path = tmp_path / "unprocessable.json"
    json_path.write_text('{"text": "hello", "type": "NarrativeText"}')

    result = partition(filename=str(json_path))

    assert len(result) == 1
    only_element = result[0]
    assert isinstance(only_element, NarrativeText)
    assert "hello" in only_element.text
# ================================================================================================
# MD
# ================================================================================================
def test_partition_md_from_url_works_with_embedded_html():
    """Markdown fetched from a URL partitions; the second element mentions "unstructured"."""
    readme_url = "https://raw.githubusercontent.com/Unstructured-IO/unstructured/main/README.md"

    elements = partition(
        url=readme_url,
        content_type="text/markdown",
        strategy=PartitionStrategy.HI_RES,
    )

    second = elements[1]
    assert "unstructured" in second.text
# ================================================================================================
# MSG
# ================================================================================================
def test_auto_partition_msg_from_filename():
    """An MSG file given by path partitions to the four expected elements."""
    msg_path = example_doc_path("fake-email.msg")

    elements = partition(msg_path, strategy=PartitionStrategy.HI_RES)

    assert elements == [
        NarrativeText("This is a test email to use for unit tests."),
        Text("Important points:"),
        ListItem("Roses are red"),
        ListItem("Violets are blue"),
    ]
# ================================================================================================
# ODT
# ================================================================================================
def test_auto_partition_odt_from_filename(expected_docx_elements: list[Element]):
    """An ODT file given by path partitions to the same elements expected for DOCX."""
    odt_path = example_doc_path("simple.odt")

    elements = partition(odt_path, strategy=PartitionStrategy.HI_RES)

    assert elements == expected_docx_elements
def test_auto_partition_odt_from_file(expected_docx_elements: list[Element]):
    """An ODT file given as a binary file-object partitions to the expected elements."""
    odt_path = example_doc_path("simple.odt")

    with open(odt_path, "rb") as odt_file:
        elements = partition(file=odt_file, strategy=PartitionStrategy.HI_RES)

    assert elements == expected_docx_elements
# ================================================================================================
# ORG
# ================================================================================================
def test_auto_partition_org_from_filename():
    """An Org file given by path starts with the expected `Title` and has `text/org` filetype."""
    org_path = example_doc_path("README.org")

    first = partition(org_path)[0]

    assert first == Title("Example Docs")
    assert first.metadata.filetype == "text/org"
def test_auto_partition_org_from_file():
    """An Org file-object with explicit `content_type` starts with the expected `Title`."""
    org_path = example_doc_path("README.org")

    with open(org_path, "rb") as org_file:
        elements = partition(file=org_file, content_type="text/org")

    first = elements[0]
    assert first == Title("Example Docs")
    assert first.metadata.filetype == "text/org"
# ================================================================================================
# PDF
# ================================================================================================
@pytest.mark.parametrize(
    ("pass_metadata_filename", "content_type"),
    [
        (False, None),
        (False, "application/pdf"),
        (True, "application/pdf"),
        (True, None),
    ],
)
def test_auto_partition_pdf_from_filename(pass_metadata_filename: bool, content_type: str | None):
    """A PDF given by path yields a `Title` then a `NarrativeText`, with path metadata.

    Exercised with and without `metadata_filename` and with and without an explicit
    `content_type`.
    """
    pdf_path = example_doc_path("pdf/chevron-page.pdf")

    elements = partition(
        filename=pdf_path,
        metadata_filename=pdf_path if pass_metadata_filename else None,
        content_type=content_type,
        strategy=PartitionStrategy.HI_RES,
    )

    title = elements[0]
    assert isinstance(title, Title)
    assert title.text.startswith("eastern mediterranean")
    expected_directory, expected_filename = os.path.split(pdf_path)
    assert title.metadata.filename == expected_filename
    assert title.metadata.file_directory == expected_directory

    narrative = elements[1]
    assert isinstance(narrative, NarrativeText)
    assert narrative.text.startswith("Were investing")
@pytest.mark.parametrize(
    ("pass_metadata_filename", "content_type"),
    [
        (False, None),
        (False, "application/pdf"),
        (True, "application/pdf"),
        (True, None),
    ],
)
def test_auto_partition_pdf_from_file(pass_metadata_filename: bool, content_type: str | None):
    """A PDF given as a binary file-object yields a `Title` then a `NarrativeText`.

    Exercised with and without `metadata_filename` and with and without an explicit
    `content_type`.
    """
    pdf_path = example_doc_path("pdf/chevron-page.pdf")
    metadata_filename = pdf_path if pass_metadata_filename else None

    with open(pdf_path, "rb") as pdf_file:
        elements = partition(
            file=pdf_file,
            metadata_filename=metadata_filename,
            content_type=content_type,
            strategy=PartitionStrategy.HI_RES,
        )

    title = elements[0]
    assert isinstance(title, Title)
    assert title.text.startswith("eastern mediterranean")

    narrative = elements[1]
    assert isinstance(narrative, NarrativeText)
    assert narrative.text.startswith("Were investing")
def test_auto_partition_pdf_with_fast_strategy(request: FixtureRequest):
    """`partition()` looks up the PDF partitioner and calls it with the expected arguments.

    Both `partition_pdf()` and `_PartitionerLoader.get()` are mocked, so no PDF is actually
    partitioned; only the lookup and the forwarded keyword arguments are checked.
    """
    pdf_path = example_doc_path("pdf/layout-parser-paper-fast.pdf")
    partition_pdf_ = function_mock(
        request,
        "unstructured.partition.pdf.partition_pdf",
        return_value=[NarrativeText("Hello there!")],
    )
    loader_get_ = method_mock(request, _PartitionerLoader, "get", return_value=partition_pdf_)
    expected_kwargs = {
        "filename": pdf_path,
        "file": None,
        "url": None,
        "strategy": PartitionStrategy.FAST,
        "languages": None,
        "metadata_filename": None,
        "infer_table_structure": False,
        "extract_images_in_pdf": False,
        "extract_image_block_types": None,
        "extract_image_block_output_dir": None,
        "extract_image_block_to_payload": False,
        "hi_res_model_name": None,
        "starting_page_number": 1,
    }

    partition(pdf_path, strategy=PartitionStrategy.FAST)

    loader_get_.assert_called_once_with(ANY, FileType.PDF)
    partition_pdf_.assert_called_once_with(**expected_kwargs)
@pytest.mark.parametrize("infer_bool", [True, False])
def test_auto_handles_kwarg_with_infer_table_structure(infer_bool: bool):
    """An explicit `infer_table_structure` value is what reaches `process_file_with_ocr()`.

    `pdf_infer_table_structure=True` is passed as well; the explicit `infer_table_structure`
    value is the one expected in the mocked call.
    """
    pdf_path = example_doc_path("pdf/layout-parser-paper-fast.pdf")

    with patch("unstructured.partition.pdf_image.ocr.process_file_with_ocr") as process_file_:
        partition(
            pdf_path,
            pdf_infer_table_structure=True,
            strategy=PartitionStrategy.HI_RES,
            infer_table_structure=infer_bool,
        )

        assert process_file_.call_args.kwargs["infer_table_structure"] is infer_bool
def test_auto_handles_kwarg_with_infer_table_structure_when_none():
    """With `infer_table_structure=None`, `pdf_infer_table_structure=True` takes effect.

    The mocked `process_file_with_ocr()` is expected to receive `infer_table_structure=True`.
    """
    pdf_path = example_doc_path("pdf/layout-parser-paper-fast.pdf")

    with patch("unstructured.partition.pdf_image.ocr.process_file_with_ocr") as process_file_:
        partition(
            pdf_path,
            pdf_infer_table_structure=True,
            strategy=PartitionStrategy.HI_RES,
            infer_table_structure=None,
        )

        assert process_file_.call_args.kwargs["infer_table_structure"] is True
def test_auto_partition_pdf_uses_pdf_infer_table_structure_argument():
    """`pdf_infer_table_structure=True` alone results in a truthy `infer_table_structure`.

    The value is read from the keyword arguments of the mocked `process_file_with_ocr()` call.
    """
    pdf_path = example_doc_path("pdf/layout-parser-paper-fast.pdf")

    with patch("unstructured.partition.pdf_image.ocr.process_file_with_ocr") as process_file_:
        partition(
            pdf_path,
            pdf_infer_table_structure=True,
            strategy=PartitionStrategy.HI_RES,
        )

        assert process_file_.call_args.kwargs["infer_table_structure"]
@pytest.mark.parametrize("extract_image_block_to_payload", [False, True])
def test_auto_partition_pdf_element_extraction(extract_image_block_to_payload: bool):
    """Image and Table blocks extracted from a PDF pass `assert_element_extraction()`."""
    block_types = ["Image", "Table"]
    pdf_path = example_doc_path("pdf/embedded-images-tables.pdf")

    with tempfile.TemporaryDirectory() as output_dir:
        elements = partition(
            pdf_path,
            extract_image_block_types=block_types,
            extract_image_block_to_payload=extract_image_block_to_payload,
            extract_image_block_output_dir=output_dir,
        )

        # -- verify while the temporary output directory still exists --
        assert_element_extraction(
            elements, block_types, extract_image_block_to_payload, output_dir
        )
def test_auto_partition_html_element_extraction():
    """Image blocks extracted to payload from HTML pass `assert_element_extraction()`.

    The temporary directory is handed only to the assertion helper; it is not passed to
    `partition()`.
    """
    block_types = ["Image"]
    html_path = example_doc_path("fake-html-with-base64-image.html")

    with tempfile.TemporaryDirectory() as output_dir:
        elements = partition(
            html_path,
            extract_image_block_types=block_types,
            extract_image_block_to_payload=True,
        )

        assert_element_extraction(elements, block_types, True, output_dir)
def test_auto_partition_html_image_with_url():
    """The second element of the image-from-URL HTML example has `image_url` metadata."""
    html_path = example_doc_path("fake-html-with-image-from-url.html")

    elements = partition(html_path)

    second = elements[1]
    assert second.metadata.image_url is not None
def test_partition_pdf_does_not_raise_warning():
    """Partitioning a PDF with HI_RES emits no warning."""
    # NOTE(robinson): Turning warnings into errors inside `catch_warnings()` is the recommended
    # way to check that no warning is emitted, per the pytest docs.
    # ref: https://docs.pytest.org/en/7.0.x/how-to/capture-warnings.html
    #      #additional-use-cases-of-warnings-in-tests
    with warnings.catch_warnings():
        warnings.simplefilter("error")
        pdf_path = example_doc_path("pdf/layout-parser-paper-fast.pdf")
        partition(pdf_path, strategy=PartitionStrategy.HI_RES)
# ================================================================================================
# PPT
# ================================================================================================
def test_auto_partition_ppt_from_filename():
    """A PPT file given by path partitions to the expected elements and path metadata."""
    expected_elements = [
        Title("Adding a Bullet Slide"),
        ListItem("Find the bullet slide layout"),
        ListItem("Use _TextFrame.text for first bullet"),
        ListItem("Use _TextFrame.add_paragraph() for subsequent bullets"),
        NarrativeText("Here is a lot of text!"),
        NarrativeText("Here is some text in a text box!"),
    ]
    ppt_path = example_doc_path("fake-power-point.ppt")

    elements = partition(ppt_path, strategy=PartitionStrategy.HI_RES)

    assert elements == expected_elements
    assert all(element.metadata.filename == "fake-power-point.ppt" for element in elements)
    expected_directory = example_doc_path("")
    assert all(element.metadata.file_directory == expected_directory for element in elements)
# ================================================================================================
# PPTX
# ================================================================================================
def test_auto_partition_pptx_from_filename():
    """A PPTX file given by path partitions to the expected elements and path metadata."""
    expected_elements = [
        Title("Adding a Bullet Slide"),
        ListItem("Find the bullet slide layout"),
        ListItem("Use _TextFrame.text for first bullet"),
        ListItem("Use _TextFrame.add_paragraph() for subsequent bullets"),
        NarrativeText("Here is a lot of text!"),
        NarrativeText("Here is some text in a text box!"),
    ]
    pptx_path = example_doc_path("fake-power-point.pptx")

    elements = partition(pptx_path, strategy=PartitionStrategy.HI_RES)

    assert elements == expected_elements
    assert all(element.metadata.filename == "fake-power-point.pptx" for element in elements)
    expected_directory = example_doc_path("")
    assert all(element.metadata.file_directory == expected_directory for element in elements)
@pytest.mark.parametrize("file_name", ["simple.pptx", "fake-power-point.ppt"])
@pytest.mark.parametrize(
    "strategy",
    [
        PartitionStrategy.AUTO,
        PartitionStrategy.FAST,
        PartitionStrategy.HI_RES,
        PartitionStrategy.OCR_ONLY,
    ],
)
def test_partition_forwards_strategy_arg_to_partition_pptx_and_its_brokers(
    request: FixtureRequest, file_name: str, strategy: str
):
    """The `strategy` value given to `partition()` arrives at `partition_pptx()`.

    For the brokering partitioner (PPT) the value must first reach `partition_ppt()`, which must
    in turn forward it to `partition_pptx()`. This test confirms it makes the whole trip.

    Note this is 2 file-types X 4 strategies = 8 test-cases.
    """
    from unstructured.partition.pptx import _PptxPartitioner

    def report_strategy(self: _PptxPartitioner) -> Iterator[Element]:
        # -- emit a single element whose text records the strategy the partitioner received --
        yield Text(f"strategy=={self._opts.strategy}")

    _iter_presentation_elements_ = method_mock(
        request,
        _PptxPartitioner,
        "_iter_presentation_elements",
        side_effect=report_strategy,
    )
    presentation_path = example_doc_path(file_name)

    (only_element,) = partition(presentation_path, strategy=strategy)

    _iter_presentation_elements_.assert_called_once_with(ANY)
    assert only_element.text == f"strategy=={strategy}"
# ================================================================================================
# RST
# ================================================================================================
def test_auto_partition_rst_from_filename():
    """Partitioning the RST example by path gives the expected first element and MIME-type."""
    rst_path = example_doc_path("README.rst")

    first = partition(rst_path)[0]

    assert first == Title("Example Docs")
    assert first.metadata.filetype == "text/x-rst"
def test_auto_partition_rst_from_file():
    """Partitioning the RST example from a file-object with an explicit content-type works."""
    rst_path = example_doc_path("README.rst")

    with open(rst_path, "rb") as rst_file:
        elements = partition(file=rst_file, content_type="text/x-rst")

    first = elements[0]
    assert first == Title("Example Docs")
    assert first.metadata.filetype == "text/x-rst"
# ================================================================================================
# RTF
# ================================================================================================
def test_auto_partition_rtf_from_filename():
    """Partitioning the RTF example by path produces the expected leading title."""
    rtf_path = example_doc_path("fake-doc.rtf")

    elements = partition(rtf_path, strategy=PartitionStrategy.HI_RES)

    assert elements[0] == Title("My First Heading")
# ================================================================================================
# TSV
# ================================================================================================
fix: parse URL response Content-Type according to RFC 9110 (#2950) Currently, `file_and_type_from_url()` does not correctly handle the `Content-Type` header. Specifically, it assumes that the header contains only the mime-type (e.g. `text/html`), however, [RFC 9110](https://www.rfc-editor.org/rfc/rfc9110#field.content-type) allows for additional directives — specifically the `charset` — to be returned in the header. This leads to a `ValueError` when loading a URL with a response Content-Type header such as `text/html; charset=UTF-8`. To reproduce the issue: ```python from unstructured.partition.auto import partition url = "https://arstechnica.com/space/2024/04/nasa-still-doesnt-understand-root-cause-of-orion-heat-shield-issue/" partition(url=url) ``` Which will result in the following exception: ```python { "name": "ValueError", "message": "Invalid file. The FileType.UNK file type is not supported in partition.", "stack": "--------------------------------------------------------------------------- ValueError Traceback (most recent call last) Cell In[1], line 4 1 from unstructured.partition.auto import partition 3 url = \"https://arstechnica.com/space/2024/04/nasa-still-doesnt-understand-root-cause-of-orion-heat-shield-issue/\" ----> 4 partition(url=url) File ~/miniconda3/envs/ai-tasks/lib/python3.11/site-packages/unstructured/partition/auto.py:541, in partition(filename, content_type, file, file_filename, url, include_page_breaks, strategy, encoding, paragraph_grouper, headers, skip_infer_table_types, ssl_verify, ocr_languages, languages, detect_language_per_element, pdf_infer_table_structure, extract_images_in_pdf, extract_image_block_types, extract_image_block_output_dir, extract_image_block_to_payload, xml_keep_tags, data_source_metadata, metadata_filename, request_timeout, hi_res_model_name, model_name, date_from_file_object, starting_page_number, **kwargs) 539 else: 540 msg = \"Invalid file\" if not filename else f\"Invalid file {filename}\" --> 541 raise 
ValueError(f\"{msg}. The {filetype} file type is not supported in partition.\") 543 for element in elements: 544 element.metadata.url = url ValueError: Invalid file. The FileType.UNK file type is not supported in partition." } ``` This PR fixes the issue by parsing the mime-type out of the `Content-Type` header string. Closes #2257
2024-04-30 07:53:44 +02:00
def test_auto_partition_tsv_from_filename():
    """Partitioning the TSV example by path yields the expected text, HTML and MIME-type."""
    tsv_path = example_doc_path("stanley-cups.tsv")

    first = partition(tsv_path)[0]

    assert clean_extra_whitespace(first.text) == EXPECTED_TEXT
    first_metadata = first.metadata
    assert first_metadata.text_as_html == EXPECTED_TABLE
    assert first_metadata.filetype == "text/tsv"
fix: parse URL response Content-Type according to RFC 9110 (#2950) Currently, `file_and_type_from_url()` does not correctly handle the `Content-Type` header. Specifically, it assumes that the header contains only the mime-type (e.g. `text/html`), however, [RFC 9110](https://www.rfc-editor.org/rfc/rfc9110#field.content-type) allows for additional directives — specifically the `charset` — to be returned in the header. This leads to a `ValueError` when loading a URL with a response Content-Type header such as `text/html; charset=UTF-8`. To reproduce the issue: ```python from unstructured.partition.auto import partition url = "https://arstechnica.com/space/2024/04/nasa-still-doesnt-understand-root-cause-of-orion-heat-shield-issue/" partition(url=url) ``` Which will result in the following exception: ```python { "name": "ValueError", "message": "Invalid file. The FileType.UNK file type is not supported in partition.", "stack": "--------------------------------------------------------------------------- ValueError Traceback (most recent call last) Cell In[1], line 4 1 from unstructured.partition.auto import partition 3 url = \"https://arstechnica.com/space/2024/04/nasa-still-doesnt-understand-root-cause-of-orion-heat-shield-issue/\" ----> 4 partition(url=url) File ~/miniconda3/envs/ai-tasks/lib/python3.11/site-packages/unstructured/partition/auto.py:541, in partition(filename, content_type, file, file_filename, url, include_page_breaks, strategy, encoding, paragraph_grouper, headers, skip_infer_table_types, ssl_verify, ocr_languages, languages, detect_language_per_element, pdf_infer_table_structure, extract_images_in_pdf, extract_image_block_types, extract_image_block_output_dir, extract_image_block_to_payload, xml_keep_tags, data_source_metadata, metadata_filename, request_timeout, hi_res_model_name, model_name, date_from_file_object, starting_page_number, **kwargs) 539 else: 540 msg = \"Invalid file\" if not filename else f\"Invalid file {filename}\" --> 541 raise 
ValueError(f\"{msg}. The {filetype} file type is not supported in partition.\") 543 for element in elements: 544 element.metadata.url = url ValueError: Invalid file. The FileType.UNK file type is not supported in partition." } ``` This PR fixes the issue by parsing the mime-type out of the `Content-Type` header string. Closes #2257
2024-04-30 07:53:44 +02:00
# ================================================================================================
# TXT
# ================================================================================================
@pytest.mark.parametrize(
    ("filename", "expected_elements"),
    [
        (
            "fake-text.txt",
            [
                NarrativeText(text="This is a test document to use for unit tests."),
                Address(text="Doylestown, PA 18901"),
                Title(text="Important points:"),
                ListItem(text="Hamburgers are delicious"),
                ListItem(text="Dogs are the best"),
                ListItem(text="I love fuzzy blankets"),
            ],
        ),
        ("fake-text-all-whitespace.txt", []),
    ],
)
def test_auto_partition_text_from_filename(filename: str, expected_elements: list[Element]):
    """Partitioning a text file by path gives the expected elements and file metadata."""
    text_path = example_doc_path(filename)

    elements = partition(filename=text_path, strategy=PartitionStrategy.HI_RES)

    assert elements == expected_elements
    # -- every element records the bare file name and the directory it was loaded from --
    assert all(element.metadata.filename == filename for element in elements)
    assert all(
        element.metadata.file_directory == example_doc_path("") for element in elements
    )
def test_auto_partition_text_from_file():
    """Partitioning the text example from a file-object gives the expected elements."""
    expected_elements = [
        NarrativeText(text="This is a test document to use for unit tests."),
        Address(text="Doylestown, PA 18901"),
        Title(text="Important points:"),
        ListItem(text="Hamburgers are delicious"),
        ListItem(text="Dogs are the best"),
        ListItem(text="I love fuzzy blankets"),
    ]

    with open(example_doc_path("fake-text.txt"), "rb") as text_file:
        elements = partition(file=text_file, strategy=PartitionStrategy.HI_RES)

    assert len(elements) > 0
    assert elements == expected_elements
# ================================================================================================
# XLS
# ================================================================================================
def test_auto_partition_xls_from_filename():
    """Partitioning the XLS example by path gives the expected element counts and first table."""
    xls_path = example_doc_path("tests-example.xls")

    elements = partition(xls_path, include_header=False, skip_infer_table_types=[])

    assert len(elements) == 14
    tables = [element for element in elements if isinstance(element, Table)]
    assert len(tables) == 2
    first = elements[0]
    assert first.metadata.text_as_html == EXPECTED_XLS_TABLE
    assert len(first.text) == 507
# ================================================================================================
# XLSX
# ================================================================================================
def test_auto_partition_xlsx_from_filename():
    """Partitioning the XLSX example by path gives two titles and two tables over two pages."""
    xlsx_mime_type = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
    xlsx_path = example_doc_path("stanley-cups.xlsx")

    elements = partition(xlsx_path, include_header=False, skip_infer_table_types=[])

    assert len(elements) == 4
    assert len([e for e in elements if isinstance(e, Table)]) == 2
    assert len([e for e in elements if isinstance(e, Title)]) == 2
    first, second = elements[0], elements[1]
    assert clean_extra_whitespace(first.text) == "Stanley Cups"
    assert clean_extra_whitespace(second.text) == (
        "Team Location Stanley Cups Blues STL 1 Flyers PHI 2 Maple Leafs TOR 13"
    )
    assert second.metadata.text_as_html == EXPECTED_TABLE_XLSX
    # -- the first two elements are on page 1, everything after that is on page 2 --
    assert all(e.metadata.page_number == 1 for e in elements[:2])
    assert all(e.metadata.page_number == 2 for e in elements[2:])
    assert all(e.metadata.filetype == xlsx_mime_type for e in elements)
def test_auto_partition_xlsx_from_file():
    """Partitioning the XLSX example from a file-object matches the by-path expectations."""
    xlsx_mime_type = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"

    with open(example_doc_path("stanley-cups.xlsx"), "rb") as xlsx_file:
        elements = partition(file=xlsx_file, include_header=False, skip_infer_table_types=[])

    assert len(elements) == 4
    assert len([e for e in elements if isinstance(e, Table)]) == 2
    assert len([e for e in elements if isinstance(e, Title)]) == 2
    first, second = elements[0], elements[1]
    assert clean_extra_whitespace(first.text) == "Stanley Cups"
    assert clean_extra_whitespace(second.text) == (
        "Team Location Stanley Cups Blues STL 1 Flyers PHI 2 Maple Leafs TOR 13"
    )
    assert second.metadata.text_as_html == EXPECTED_TABLE_XLSX
    # -- the first two elements are on page 1, everything after that is on page 2 --
    assert all(e.metadata.page_number == 1 for e in elements[:2])
    assert all(e.metadata.page_number == 2 for e in elements[2:])
    assert all(e.metadata.filetype == xlsx_mime_type for e in elements)
def test_auto_partition_xlsx_respects_starting_page_number_argument():
    """Page numbers in the XLSX example start from the `starting_page_number` argument."""
    xlsx_path = example_doc_path("stanley-cups.xlsx")

    elements = partition(xlsx_path, starting_page_number=3)

    first_two, remainder = elements[:2], elements[2:]
    assert all(e.metadata.page_number == 3 for e in first_two)
    assert all(e.metadata.page_number == 4 for e in remainder)
# ================================================================================================
# XML
# ================================================================================================
def test_auto_partition_xml_from_filename():
    """Partitioning the XML example by path without tags gives plain text and the file name."""
    xml_path = example_doc_path("factbook.xml")

    elements = partition(xml_path, xml_keep_tags=False)

    assert elements[0].text == "United States"
    assert all(element.metadata.filename == "factbook.xml" for element in elements)
def test_auto_partition_xml_from_file():
    """Partitioning the XML example from a file-object without tags gives plain text."""
    xml_path = example_doc_path("factbook.xml")

    with open(xml_path, "rb") as xml_file:
        elements = partition(file=xml_file, xml_keep_tags=False)

    first = elements[0]
    assert first.text == "United States"
def test_auto_partition_xml_from_filename_with_tags():
    """With `xml_keep_tags=True` the first element's text retains the XML markup."""
    xml_path = example_doc_path("factbook.xml")

    first = partition(xml_path, xml_keep_tags=True)[0]

    assert "<leader>Joe Biden</leader>" in first.text
    assert first.metadata.filename == "factbook.xml"
def test_auto_partition_xml_from_file_with_tags():
    """With `xml_keep_tags=True` a file-object source also retains the XML markup."""
    xml_path = example_doc_path("factbook.xml")

    with open(xml_path, "rb") as xml_file:
        elements = partition(file=xml_file, xml_keep_tags=True)

    first_text = elements[0].text
    assert "<leader>Joe Biden</leader>" in first_text
# ================================================================================================
# FILE_TYPE NOT RECOGNIZED OR NOT SUPPORTED
# ================================================================================================
def test_auto_partition_raises_with_bad_type(request: FixtureRequest):
    """`partition()` raises when file-type detection reports `FileType.UNK`."""
    detect_filetype_ = function_mock(
        request, "unstructured.partition.auto.detect_filetype", return_value=FileType.UNK
    )
    expected_message = "Partitioning is not supported for the FileType.UNK file type."

    with pytest.raises(UnsupportedFileFormatError, match=expected_message):
        partition(filename="made-up.fake", strategy=PartitionStrategy.HI_RES)

    # -- detection is consulted exactly once, with only the file path populated --
    detect_filetype_.assert_called_once_with(
        file_path="made-up.fake",
        file=None,
        encoding=None,
        content_type=None,
        metadata_file_path=None,
    )
# ================================================================================================
# LOAD FROM URL
# ================================================================================================
def test_auto_partition_from_url():
    """Partitioning from a URL with an explicit content-type records the URL in metadata."""
    url = "https://raw.githubusercontent.com/Unstructured-IO/unstructured/main/LICENSE.md"

    elements = partition(url=url, content_type="text/plain", strategy=PartitionStrategy.HI_RES)

    first = elements[0]
    assert first == Title("Apache License")
    assert all(element.metadata.url == url for element in elements)
def test_auto_partition_from_url_with_rfc9110_content_type():
    """A content-type that carries a `charset` directive is accepted when loading from a URL."""
    url = "https://raw.githubusercontent.com/Unstructured-IO/unstructured/main/LICENSE.md"
    content_type = "text/plain; charset=utf-8"

    elements = partition(url=url, content_type=content_type, strategy=PartitionStrategy.HI_RES)

    first = elements[0]
    assert first == Title("Apache License")
    assert all(element.metadata.url == url for element in elements)
def test_auto_partition_from_url_without_providing_content_type():
    """Partitioning from a URL works when the caller gives no content-type."""
    url = "https://raw.githubusercontent.com/Unstructured-IO/unstructured/main/LICENSE.md"

    elements = partition(url=url, strategy=PartitionStrategy.HI_RES)

    first = elements[0]
    assert first == Title("Apache License")
    assert all(element.metadata.url == url for element in elements)
def test_auto_partition_warns_if_header_set_and_not_url(caplog: LogCaptureFixture):
    """A warning is logged when `headers` is passed while partitioning a local file."""
    eml_path = example_doc_path("eml/fake-email.eml")
    request_headers = {"Accept": "application/pdf"}

    partition(eml_path, headers=request_headers, strategy=PartitionStrategy.HI_RES)

    first_record = caplog.records[0]
    assert first_record.levelname == "WARNING"
    assert "headers kwarg is set but the url kwarg is not. The headers kwarg will b" in caplog.text
def test_auto_partition_from_url_routes_timeout_to_HTTP_request(request: FixtureRequest):
    """`request_timeout` given to `partition()` is passed on to `file_and_type_from_url()`."""
    # -- the mocked fetch raises, so the call ends as soon as the fetch has been requested --
    file_and_type_from_url_ = function_mock(
        request,
        "unstructured.partition.auto.file_and_type_from_url",
        side_effect=ConnectionError("Trouble on the wire ..."),
    )
    target_url = "http://eie.io"

    with pytest.raises(ConnectionError, match="Trouble on the wire ..."):
        partition(url=target_url, request_timeout=326)

    file_and_type_from_url_.assert_called_once_with(
        url="http://eie.io", content_type=None, headers={}, ssl_verify=True, request_timeout=326
    )
# ================================================================================================
# OTHER ARGS
# ================================================================================================
# -- chunking_strategy ----------------------------------------------------
def test_auto_partition_forwards_chunking_strategy_via_kwargs():
    """Passing `chunking_strategy` makes `partition()` return only chunk-type elements."""
    chunk_types = (CompositeElement, Table, TableChunk)
    html_path = example_doc_path("example-10k-1p.html")

    chunks = partition(html_path, chunking_strategy="by_title")

    assert all(isinstance(chunk, chunk_types) for chunk in chunks)
chore: Table chunking (#1540) This change is adding to our `add_chunking_strategy` logic so that we are able to chunk Table elements' `text` and `text_as_html` params. In order to keep the functionality under the same `by_title` chunking strategy we have renamed the `combine_under_n_chars` to `max_characters`. It functions the same way for the combining elements under Title's, as well as specifying a chunk size (in chars) for TableChunk elements. *renaming the variable to `max_characters` will also reflect the 'hard max' we will implement for large elements in followup PRs Additionally -> some lint changes snuck in when I ran `make tidy` hence the minor changes in unrelated files :) TODO: ✅ add unit tests --> note: added where I could to unit tests! Some unit tests I just clarified that the chunking strategy was now 'by_title' because we don't have a file example that has Table elements to test the 'by_num_characters' chunking strategy ✅ update changelog To manually test: ``` In [1]: filename="example-docs/example-10k.html" In [2]: from unstructured.chunking.title import chunk_table_element In [3]: from unstructured.partition.auto import partition In [4]: elements = partition(filename) # element at -2 happens to be a Table, and we'll get chunks of char size 4 here In [5]: chunks = chunk_table_element(elements[-2], 4) # examine text and text_as_html params ln [6]: for c in chunks: print(c.text) print(c.metadata.text_as_html) ``` --------- Co-authored-by: Yao You <theyaoyou@gmail.com>
2023-10-03 09:40:34 -07:00
def test_auto_partition_forwards_max_characters_via_kwargs():
    """With `max_characters=250` no returned chunk has text longer than 250 characters."""
    html_path = example_doc_path("example-10k-1p.html")

    chunks = partition(html_path, chunking_strategy="by_title", max_characters=250)

    chunk_lengths = [len(chunk.text) for chunk in chunks]
    assert all(length <= 250 for length in chunk_lengths)
# -- detect_language_per_element ------------------------------------------
def test_auto_partition_respects_detect_language_per_element_arg():
    """With `detect_language_per_element=True` each element gets its own language list."""
    mixed_language_path = example_doc_path("language-docs/eng_spa_mult.txt")

    elements = partition(mixed_language_path, detect_language_per_element=True)

    languages_by_element = [element.metadata.languages for element in elements]
    assert languages_by_element == [["eng"], ["spa", "eng"], ["eng"], ["eng"], ["spa"]]
# -- languages ------------------------------------------------------------
@pytest.mark.parametrize(
    "file_extension",
    [
        "doc",
        "docx",
        "eml",
        "epub",
        "html",
        "md",
        "odt",
        "org",
        "ppt",
        "pptx",
        "rst",
        "rtf",
        "txt",
        "xml",
    ],
)
def test_auto_partition_respects_language_arg(file_extension: str):
    """The `languages` argument is applied to every element for each listed file extension."""
    doc_path = example_doc_path(f"language-docs/eng_spa_mult.{file_extension}")

    elements = partition(doc_path, languages=["deu"])

    # -- the caller-specified language appears on every element's metadata --
    assert all(element.metadata.languages == ["deu"] for element in elements)
# -- include_page_breaks --------------------------------------------------
def test_auto_partition_forwards_include_page_breaks_to_partition_pdf():
    """With `include_page_breaks=True` the PDF example yields a `PageBreak` element."""
    pdf_path = example_doc_path("pdf/layout-parser-paper-fast.pdf")

    elements = partition(pdf_path, include_page_breaks=True, strategy=PartitionStrategy.HI_RES)

    categories = [element.category for element in elements]
    assert "PageBreak" in categories
# -- metadata_filename ----------------------------------------------------
def test_auto_partition_forwards_metadata_filename_via_kwargs():
    """`metadata_filename` is used as the file name in the metadata of every element."""
    text_path = example_doc_path("fake-text.txt")

    with open(text_path, "rb") as text_file:
        elements = partition(file=text_file, metadata_filename="much-more-interesting-name.txt")

    assert all(
        element.metadata.filename == "much-more-interesting-name.txt" for element in elements
    )
# -- ocr_languages --------------------------------------------------------
def test_auto_partition_image_formats_languages_for_tesseract(request: FixtureRequest):
    """The `languages` value `["zh"]` is passed to the OCR step as a Tesseract language string."""
    process_file_with_ocr_ = function_mock(
        request, "unstructured.partition.pdf_image.ocr.process_file_with_ocr"
    )
    image_path = example_doc_path("img/chi_sim_image.jpeg")

    partition(image_path, strategy=PartitionStrategy.HI_RES, languages=["zh"])

    # -- index 1 of a recorded call holds the keyword arguments of that call --
    first_call = process_file_with_ocr_.call_args_list[0]
    ocr_kwargs = first_call[1]
    assert ocr_kwargs["ocr_languages"] == "chi_sim+chi_sim_vert+chi_tra+chi_tra_vert"
@pytest.mark.parametrize(("languages", "ocr_languages"), [(["auto"], ""), (["eng"], "")])
def test_auto_partition_ignores_empty_string_for_ocr_languages(
    languages: list[str], ocr_languages: str
):
    """Elements are still tagged `eng` when `ocr_languages` is the empty string."""
    text_path = example_doc_path("book-war-and-peace-1p.txt")

    elements = partition(
        text_path,
        strategy=PartitionStrategy.OCR_ONLY,
        ocr_languages=ocr_languages,
        languages=languages,
    )

    assert all(element.metadata.languages == ["eng"] for element in elements)
def test_auto_partition_warns_with_ocr_languages(caplog: LogCaptureFixture):
    """A deprecation warning is logged when the `ocr_languages` argument is used."""
    pdf_path = example_doc_path("pdf/chevron-page.pdf")

    partition(pdf_path, strategy=PartitionStrategy.HI_RES, ocr_languages="eng")

    first_record = caplog.records[0]
    assert first_record.levelname == "WARNING"
    assert "The ocr_languages kwarg will be deprecated" in caplog.text
# -- skip_infer_table_types -----------------------------------------------
@pytest.mark.parametrize(
    ("skip_infer_table_types", "filename", "has_text_as_html"),
    [
        (["xlsx"], "stanley-cups.xlsx", False),
        ([], "stanley-cups.xlsx", True),
        (["odt"], "fake.odt", False),
        ([], "fake.odt", True),
    ],
)
def test_auto_partition_respects_skip_infer_table_types(
    skip_infer_table_types: list[str], filename: str, has_text_as_html: bool
):
    """Tables carry `text_as_html` only when their file-type is not in `skip_infer_table_types`."""
    doc_path = example_doc_path(filename)

    with open(doc_path, "rb") as doc_file:
        elements = partition(file=doc_file, skip_infer_table_types=skip_infer_table_types)

    tables = [element for element in elements if isinstance(element, Table)]
    # -- guard against a vacuous pass when the document produced no tables at all --
    assert tables
    for table in tables:
        html_is_present = table.metadata.text_as_html is not None
        assert html_is_present == has_text_as_html
# ================================================================================================
# METADATA BEHAVIORS
# ================================================================================================
# -- .filetype ------------------------------------------------------------
@pytest.mark.parametrize(
    ("content_type", "shortname", "expected_value"),
    [
        ("text/csv", "csv", "text/csv"),
        ("text/html", "html", "text/html"),
        ("jdsfjdfsjkds", "pdf", None),
    ],
)
def test_auto_partition_adds_filetype_to_metadata(
    request: FixtureRequest,
    content_type: str,
    shortname: str,
    expected_value: str | None,
):
    """`metadata.filetype` on every element equals the value expected for the content-type."""
    # -- the file-specific partitioner is replaced by a mock returning two bare elements --
    stub_elements = [Text("text 1"), Text("text 2")]
    partition_fn_ = function_mock(
        request,
        f"unstructured.partition.{shortname}.partition_{shortname}",
        return_value=stub_elements,
    )
    partitioner_loader_get_ = method_mock(
        request, _PartitionerLoader, "get", return_value=partition_fn_
    )
    pdf_path = example_doc_path("pdf/layout-parser-paper-fast.pdf")

    elements = partition(pdf_path, content_type=content_type)

    partitioner_loader_get_.assert_called_once()
    assert len(elements) == 2
    assert all(element.metadata.filetype == expected_value for element in elements)
@pytest.mark.parametrize(
    "content_type",
    [
        # -- content-type provided as argument --
        "application/pdf",
        # -- auto-detected content-type --
        None,
    ],
)
def test_auto_partition_overwrites_any_filetype_applied_by_file_specific_partitioner(
    request: FixtureRequest, content_type: str | None
):
    """A `filetype` set by the file-specific partitioner is replaced by `partition()`."""
    # -- both stub elements share one metadata object carrying a bogus filetype --
    metadata = ElementMetadata(filetype="imapdf")
    stub_elements = [Text("text 1", metadata=metadata), Text("text 2", metadata=metadata)]
    partition_pdf_ = function_mock(
        request,
        "unstructured.partition.pdf.partition_pdf",
        return_value=stub_elements,
    )
    partitioner_loader_get_ = method_mock(
        request, _PartitionerLoader, "get", return_value=partition_pdf_
    )
    pdf_path = example_doc_path("pdf/layout-parser-paper-fast.pdf")

    elements = partition(pdf_path, content_type=content_type)

    partitioner_loader_get_.assert_called_once_with(ANY, FileType.PDF)
    assert len(elements) == 2
    assert all(element.metadata.filetype == "application/pdf" for element in elements)
@pytest.mark.parametrize(
    ("file_name", "file_type"),
    [
        ("stanley-cups.csv", FileType.CSV),
        ("simple.doc", FileType.DOC),
        ("simple.docx", FileType.DOCX),
        ("fake-email.eml", FileType.EML),
        ("simple.epub", FileType.EPUB),
        ("fake-html.html", FileType.HTML),
        ("README.md", FileType.MD),
        ("fake-email.msg", FileType.MSG),
        ("simple.odt", FileType.ODT),
        ("pdf/DA-1p.pdf", FileType.PDF),
        ("fake-power-point.ppt", FileType.PPT),
        ("simple.pptx", FileType.PPTX),
        ("README.rst", FileType.RST),
        ("fake-doc.rtf", FileType.RTF),
        ("stanley-cups.tsv", FileType.TSV),
        ("fake-text.txt", FileType.TXT),
        ("tests-example.xls", FileType.XLSX),
        ("stanley-cups.xlsx", FileType.XLSX),
        ("factbook.xml", FileType.XML),
    ],
)
def test_auto_partition_applies_the_correct_filetype_for_all_filetypes(
    file_name: str, file_type: FileType
):
    """Each file-type's own partitioner assigns that file-type's MIME-type to its elements."""
    file_path = example_doc_path(file_name)
    # -- look up the partitioner function for this file-type in its module --
    partitioner_module = import_module(file_type.partitioner_module_qname)
    partition_fn = getattr(partitioner_module, file_type.partitioner_function_name)

    # -- partition the example-doc for this filetype --
    elements = partition_fn(file_path, process_attachments=False)

    assert elements
    # -- elements with no filetype assigned are excluded from the comparison --
    assigned_filetypes = [
        element.metadata.filetype
        for element in elements
        if element.metadata.filetype is not None
    ]
    assert all(filetype == file_type.mime_type for filetype in assigned_filetypes)
def test_detect_filetype_maps_file_to_bytes_io_when_spooled_temp_file_used(mocker):
    """`detect_filetype()` hands the detector a context whose text-head is the spooled content."""
    # -- replace the detector class so the context passed to `.file_type()` can be inspected --
    detector_cls_ = MagicMock(return_value=FileType.JSON)
    mocker.patch("unstructured.file_utils.filetype._FileTypeDetector", detector_cls_)

    with tempfile.SpooledTemporaryFile() as spooled_file:
        spooled_file.write(b'{"text": Hello, world!}')
        spooled_file.seek(0)
        detect_filetype(file=spooled_file)

    # -- first positional argument of the recorded `.file_type()` call --
    file_detection_context = detector_cls_.file_type.call_args.args[0]
    assert file_detection_context.text_head == '{"text": Hello, world!}'
# -- .languages -----------------------------------------------------------
def test_auto_partition_passes_user_provided_languages_arg_to_PDF():
    """Every element from an OCR-only PDF partition reports the caller-supplied languages."""
    pdf_path = example_doc_path("pdf/chevron-page.pdf")

    elements = partition(pdf_path, strategy=PartitionStrategy.OCR_ONLY, languages=["eng"])

    for element in elements:
        assert element.metadata.languages == ["eng"]
def test_auto_partition_languages_argument_default_to_None_when_omitted():
    """Only text-less elements end up with `languages` of `None` when the arg is omitted."""
    docx_path = example_doc_path("handbook-1p.docx")

    elements = partition(docx_path, detect_language_per_element=True)

    # -- PageBreak and any other element with no text is assigned `None` --
    elements_without_languages = [e for e in elements if e.metadata.languages is None]
    for element in elements_without_languages:
        assert element.text == ""
def test_auto_partition_default_does_not_overwrite_other_defaults():
    """`partition()` ["eng"] default does not overwrite ["auto"] default in other partitioners.

    The first element produced by `partition()` must report the same languages as the first
    element produced by calling `partition_text()` directly, and neither may be ["eng"].
    """
    # -- the default for `languages` is ["auto"] in partition_text --
    from unstructured.partition.text import partition_text

    # -- use a document that is primarily in a language other than English --
    file_path = example_doc_path("language-docs/UDHR_first_article_all.txt")

    languages_from_text = partition_text(file_path)[0].metadata.languages
    assert languages_from_text != ["eng"]

    languages_from_auto = partition(file_path)[0].metadata.languages
    assert languages_from_auto != ["eng"]
    assert languages_from_auto == languages_from_text
# ================================================================================================
# MISCELLANEOUS BEHAVIORS
# ================================================================================================
def test_auto_partition_from_filename_works_on_empty_file():
    """Partitioning the empty example text file by path yields an empty list."""
    elements = partition(example_doc_path("empty.txt"))

    assert elements == []
def test_auto_partition_from_file_works_on_empty_file():
    """Partitioning the empty example text file from a binary file-object yields an empty list."""
    empty_path = example_doc_path("empty.txt")

    with open(empty_path, "rb") as f:
        elements = partition(file=f)

    assert elements == []
def test_auto_partition_that_requires_extras_raises_when_dependencies_are_not_installed(
    request: FixtureRequest,
):
    """`partition()` raises `ImportError` for a PDF when `dependency_exists()` reports False."""
    # -- drop any PDF entry from the loader's class-level mapping; presumably this forces the
    # -- dependency check to run again rather than reusing an already-loaded partitioner
    _PartitionerLoader._partitioners.pop(FileType.PDF, None)
    dependency_exists_ = function_mock(
        request, "unstructured.partition.auto.dependency_exists", return_value=False
    )
    pdf_path = example_doc_path("pdf/layout-parser-paper-fast.pdf")
    expected_message = (
        r"partition_pdf\(\) is not available because one or more dependencies are not installed"
    )

    with pytest.raises(ImportError, match=expected_message):
        partition(pdf_path)

    dependency_exists_.assert_called_once_with("pdf2image")
# ================================================================================================
# MODULE-LEVEL FIXTURES
# ================================================================================================
@pytest.fixture()
def expected_docx_elements():
    """Ordered list of freshly constructed elements used as an expected DOCX result."""
    # -- (element-class, text) pairs, in document order --
    element_specs = (
        (Title, "These are a few of my favorite things:"),
        (ListItem, "Parrots"),
        (ListItem, "Hockey"),
        (Text, "Analysis"),
        (NarrativeText, "This is my first thought. This is my second thought."),
        (NarrativeText, "This is my third thought."),
        (Text, "2023"),
        (Address, "DOYLESTOWN, PA 18901"),
    )
    return [element_cls(text) for element_cls, text in element_specs]
def _test_partition_foo():
pass
def test_auto_partition_works_with_custom_types(
    request: FixtureRequest,
):
    """A partitioner registered for a newly created file-type is what the loader returns."""
    foo_file_type = create_file_type(
        "FOO", canonical_mime_type="application/foo", extensions=[".foo"]
    )

    # -- `register_partitioner()` is used in decorator form, applied by hand here --
    register_for_foo = register_partitioner(foo_file_type)
    register_for_foo(_test_partition_foo)

    assert _PartitionerLoader().get(foo_file_type) is _test_partition_foo