2024-06-21 17:16:39 -07:00
|
|
|
|
# pyright: reportPrivateUsage=false
|
|
|
|
|
|
2024-05-22 17:51:08 -07:00
|
|
|
|
from __future__ import annotations
|
|
|
|
|
|
2023-04-17 23:11:21 -07:00
|
|
|
|
import json
|
2023-01-09 16:15:14 -05:00
|
|
|
|
import os
|
|
|
|
|
import pathlib
|
2024-01-04 09:52:00 -08:00
|
|
|
|
import tempfile
|
2023-01-27 12:08:18 -05:00
|
|
|
|
import warnings
|
2023-05-15 13:23:19 -05:00
|
|
|
|
from importlib import import_module
|
2024-09-23 15:23:10 -07:00
|
|
|
|
from typing import Iterator
|
2025-02-20 14:00:25 +01:00
|
|
|
|
from unittest.mock import MagicMock, patch
|
2023-01-09 16:15:14 -05:00
|
|
|
|
|
2023-02-27 17:30:54 +01:00
|
|
|
|
import pytest
|
2024-01-17 17:50:36 -05:00
|
|
|
|
from PIL import Image
|
2023-01-09 16:15:14 -05:00
|
|
|
|
|
2024-01-04 09:52:00 -08:00
|
|
|
|
from test_unstructured.partition.pdf_image.test_pdf import assert_element_extraction
|
fix: stop csv and tsv dropping the first line of the file (#1530)
The current code assumes the first line of csv and tsv files is a
header line. Most csv and tsv files don't have a header line, and even
for those that do, dropping this line may not be the desired behavior.
Here is a snippet of code that demonstrates the current behavior and the
proposed fix
```
import pandas as pd
from lxml.html.soupparser import fromstring as soupparser_fromstring
c1 = """
Stanley Cups,,
Team,Location,Stanley Cups
Blues,STL,1
Flyers,PHI,2
Maple Leafs,TOR,13
"""
f = "./test.csv"
with open(f, 'w') as ff:
ff.write(c1)
print("Suggested Improvement Keep First Line")
table = pd.read_csv(f, header=None)
html_text = table.to_html(index=False, header=False, na_rep="")
text = soupparser_fromstring(html_text).text_content()
print(text)
print("\n\nOriginal Looses First Line")
table = pd.read_csv(f)
html_text = table.to_html(index=False, header=False, na_rep="")
text = soupparser_fromstring(html_text).text_content()
print(text)
```
---------
Co-authored-by: cragwolfe <crag@unstructured.io>
Co-authored-by: Yao You <theyaoyou@gmail.com>
Co-authored-by: Yao You <yao@unstructured.io>
2023-10-17 00:59:35 +02:00
|
|
|
|
from test_unstructured.partition.test_constants import (
|
|
|
|
|
EXPECTED_TABLE,
|
|
|
|
|
EXPECTED_TABLE_XLSX,
|
|
|
|
|
EXPECTED_TEXT,
|
2024-07-11 12:57:28 -07:00
|
|
|
|
EXPECTED_XLS_TABLE,
|
fix: stop csv and tsv dropping the first line of the file (#1530)
The current code assumes the first line of csv and tsv files is a
header line. Most csv and tsv files don't have a header line, and even
for those that do, dropping this line may not be the desired behavior.
Here is a snippet of code that demonstrates the current behavior and the
proposed fix
```
import pandas as pd
from lxml.html.soupparser import fromstring as soupparser_fromstring
c1 = """
Stanley Cups,,
Team,Location,Stanley Cups
Blues,STL,1
Flyers,PHI,2
Maple Leafs,TOR,13
"""
f = "./test.csv"
with open(f, 'w') as ff:
ff.write(c1)
print("Suggested Improvement Keep First Line")
table = pd.read_csv(f, header=None)
html_text = table.to_html(index=False, header=False, na_rep="")
text = soupparser_fromstring(html_text).text_content()
print(text)
print("\n\nOriginal Looses First Line")
table = pd.read_csv(f)
html_text = table.to_html(index=False, header=False, na_rep="")
text = soupparser_fromstring(html_text).text_content()
print(text)
```
---------
Co-authored-by: cragwolfe <crag@unstructured.io>
Co-authored-by: Yao You <theyaoyou@gmail.com>
Co-authored-by: Yao You <yao@unstructured.io>
2023-10-17 00:59:35 +02:00
|
|
|
|
)
|
2024-07-08 14:25:17 -07:00
|
|
|
|
from test_unstructured.unit_utils import (
|
|
|
|
|
ANY,
|
|
|
|
|
FixtureRequest,
|
|
|
|
|
LogCaptureFixture,
|
|
|
|
|
example_doc_path,
|
|
|
|
|
function_mock,
|
|
|
|
|
method_mock,
|
|
|
|
|
)
|
2023-05-16 15:40:40 -04:00
|
|
|
|
from unstructured.cleaners.core import clean_extra_whitespace
|
2023-02-27 17:30:54 +01:00
|
|
|
|
from unstructured.documents.elements import (
|
|
|
|
|
Address,
|
2024-07-11 12:57:28 -07:00
|
|
|
|
CompositeElement,
|
2024-06-21 17:16:39 -07:00
|
|
|
|
Element,
|
2023-05-15 13:23:19 -05:00
|
|
|
|
ElementMetadata,
|
2023-02-27 17:30:54 +01:00
|
|
|
|
ListItem,
|
|
|
|
|
NarrativeText,
|
2023-05-16 15:40:40 -04:00
|
|
|
|
Table,
|
2023-10-03 09:40:34 -07:00
|
|
|
|
TableChunk,
|
2023-02-27 17:30:54 +01:00
|
|
|
|
Text,
|
|
|
|
|
Title,
|
|
|
|
|
)
|
2025-02-20 14:00:25 +01:00
|
|
|
|
from unstructured.file_utils.filetype import detect_filetype
|
2025-03-06 17:09:42 -05:00
|
|
|
|
from unstructured.file_utils.model import FileType, create_file_type, register_partitioner
|
2024-07-21 23:03:55 -07:00
|
|
|
|
from unstructured.partition.auto import _PartitionerLoader, partition
|
2024-10-15 19:02:33 -07:00
|
|
|
|
from unstructured.partition.common import UnsupportedFileFormatError
|
2023-11-15 21:41:02 -08:00
|
|
|
|
from unstructured.partition.utils.constants import PartitionStrategy
|
2024-07-09 22:29:07 -07:00
|
|
|
|
from unstructured.staging.base import elements_from_json, elements_to_dicts, elements_to_json
|
2023-06-16 17:52:13 -07:00
|
|
|
|
|
2023-03-30 16:54:29 -04:00
|
|
|
|
# -- True when the `/.dockerenv` marker file exists; used below to skip a Docker-only failure --
is_in_docker = pathlib.Path("/.dockerenv").exists()
|
|
|
|
|
|
2023-01-09 16:15:14 -05:00
|
|
|
|
|
2024-07-08 14:25:17 -07:00
|
|
|
|
# ================================================================================================
|
|
|
|
|
# CSV
|
|
|
|
|
# ================================================================================================
|
2023-01-09 16:15:14 -05:00
|
|
|
|
|
|
|
|
|
|
2024-07-08 14:25:17 -07:00
|
|
|
|
def test_auto_partition_csv_from_filename():
    """`partition()` given a CSV file path produces the expected leading `Table` element.

    The first element's whitespace-normalized text, HTML rendering and MIME type are checked
    against the shared Stanley Cups expectations.
    """
    elements = partition(example_doc_path("stanley-cups.csv"))

    table = elements[0]
    assert clean_extra_whitespace(table.text) == EXPECTED_TEXT
    # -- same element-type check the file-object variant of this test performs --
    assert isinstance(table, Table)
    assert table.metadata.text_as_html == EXPECTED_TABLE
    assert table.metadata.filetype == "text/csv"
|
2023-01-09 16:15:14 -05:00
|
|
|
|
|
|
|
|
|
|
2024-07-08 14:25:17 -07:00
|
|
|
|
def test_auto_partition_csv_from_file():
    """`partition()` given an open CSV file object produces the expected leading `Table`."""
    csv_path = example_doc_path("stanley-cups.csv")
    with open(csv_path, "rb") as csv_file:
        elements = partition(file=csv_file)

    first = elements[0]
    assert clean_extra_whitespace(first.text) == EXPECTED_TEXT
    assert isinstance(first, Table)
    assert first.metadata.text_as_html == EXPECTED_TABLE
    assert first.metadata.filetype == "text/csv"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# ================================================================================================
|
|
|
|
|
# DOC
|
|
|
|
|
# ================================================================================================
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.parametrize(
    ("pass_metadata_filename", "content_type"),
    [
        (False, None),
        (False, "application/msword"),
        (True, "application/msword"),
        (True, None),
    ],
)
def test_auto_partition_doc_from_filename(
    pass_metadata_filename: bool,
    content_type: str | None,
    expected_docx_elements: list[Element],
):
    """A `.doc` path partitions to the expected elements for each filename/content-type combo.

    Each element must also report "simple.doc" as its filename and the example-docs directory
    as its file directory.
    """
    file_path = example_doc_path("simple.doc")
    metadata_filename = None
    if pass_metadata_filename:
        metadata_filename = file_path
    expected_directory = example_doc_path("")

    elements = partition(
        filename=file_path,
        metadata_filename=metadata_filename,
        content_type=content_type,
        strategy=PartitionStrategy.HI_RES,
    )

    # -- dump what was produced; pytest shows this captured output when the test fails --
    for element in elements:
        print(f"{type(element).__name__}({element.text!r})")

    assert elements == expected_docx_elements
    for element in elements:
        assert element.metadata.filename == "simple.doc"
    for element in elements:
        assert element.metadata.file_directory == expected_directory
|
2024-07-08 14:25:17 -07:00
|
|
|
|
|
|
|
|
|
|
2024-07-09 22:29:07 -07:00
|
|
|
|
@pytest.mark.skipif(is_in_docker, reason="Passes in CI but not Docker. Remove skip on #3364 fix.")
def test_auto_partition_doc_from_file(expected_docx_elements: list[Element]):
    """`partition()` given an open `.doc` file object yields the expected elements."""
    doc_path = example_doc_path("simple.doc")
    with open(doc_path, "rb") as doc_file:
        elements = partition(file=doc_file)

    assert elements == expected_docx_elements
|
2024-07-08 14:25:17 -07:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# ================================================================================================
|
|
|
|
|
# DOCX
|
|
|
|
|
# ================================================================================================
|
2023-01-09 16:15:14 -05:00
|
|
|
|
|
|
|
|
|
|
2024-07-11 12:57:28 -07:00
|
|
|
|
def test_auto_partition_docx_from_filename(expected_docx_elements: list[Element]):
    """A `.docx` path partitions to the expected elements, each tagged with its filename."""
    docx_path = example_doc_path("simple.docx")

    elements = partition(docx_path, strategy=PartitionStrategy.HI_RES)

    assert elements == expected_docx_elements
    for element in elements:
        assert element.metadata.filename == "simple.docx"
|
2023-01-09 16:15:14 -05:00
|
|
|
|
|
|
|
|
|
|
2024-07-11 12:57:28 -07:00
|
|
|
|
def test_auto_partition_docx_from_file(expected_docx_elements: list[Element]):
    """`partition()` given an open `.docx` file object yields the expected elements."""
    docx_path = example_doc_path("simple.docx")
    with open(docx_path, "rb") as docx_file:
        elements = partition(file=docx_file, strategy=PartitionStrategy.HI_RES)

    assert elements == expected_docx_elements
|
|
|
|
|
|
|
|
|
|
|
2024-06-25 17:29:47 -07:00
|
|
|
|
@pytest.mark.parametrize("file_name", ["simple.docx", "simple.doc", "simple.odt"])
@pytest.mark.parametrize(
    "strategy",
    [
        PartitionStrategy.AUTO,
        PartitionStrategy.FAST,
        PartitionStrategy.HI_RES,
        PartitionStrategy.OCR_ONLY,
    ],
)
def test_partition_forwards_strategy_arg_to_partition_docx_and_its_brokers(
    request: FixtureRequest, file_name: str, strategy: str
):
    """The `strategy` value passed to `partition()` arrives intact at `partition_docx()`.

    For the brokering partitioners (DOC, ODT) the value first has to reach `partition_doc()` or
    `partition_odt()`, which must in turn hand it on to `partition_docx()`. This test confirms
    the value survives the whole trip.

    Three file-types times four strategies gives 12 test-cases.
    """
    from unstructured.partition.docx import _DocxPartitioner

    def emit_strategy_marker(self: _DocxPartitioner) -> Iterator[Element]:
        # -- expose the strategy the partitioner's options carry as the only element --
        yield Text(f"strategy=={self._opts.strategy}")

    iter_document_elements_mock = method_mock(
        request,
        _DocxPartitioner,
        "_iter_document_elements",
        side_effect=emit_strategy_marker,
    )
    expected_text = f"strategy=={strategy}"

    [element] = partition(example_doc_path(file_name), strategy=strategy)

    iter_document_elements_mock.assert_called_once_with(ANY)
    assert element.text == expected_text
|
|
|
|
|
|
|
|
|
|
|
2024-07-08 14:25:17 -07:00
|
|
|
|
# ================================================================================================
|
|
|
|
|
# EML
|
|
|
|
|
# ================================================================================================
|
|
|
|
|
|
2024-07-09 22:29:07 -07:00
|
|
|
|
# -- elements the EML tests below expect from the fake-email example document --
EXPECTED_EMAIL_OUTPUT = [
    NarrativeText("This is a test email to use for unit tests."),
    Text("Important points:"),
    ListItem("Roses are red"),
    ListItem("Violets are blue"),
]
|
|
|
|
|
|
2024-07-08 14:25:17 -07:00
|
|
|
|
|
|
|
|
|
def test_auto_partition_email_from_filename():
    """An `.eml` path partitions to the expected elements with filename metadata populated."""
    eml_path = example_doc_path("eml/fake-email.eml")
    expected_directory, expected_filename = os.path.split(eml_path)

    elements = partition(eml_path, strategy=PartitionStrategy.HI_RES)

    assert len(elements) > 0
    assert elements == EXPECTED_EMAIL_OUTPUT
    first_metadata = elements[0].metadata
    assert first_metadata.filename == expected_filename
    assert first_metadata.file_directory == expected_directory
|
2024-07-08 14:25:17 -07:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_auto_partition_email_from_file():
    """`partition()` given an open `.eml` file object yields the expected email elements."""
    eml_path = example_doc_path("eml/fake-email.eml")
    with open(eml_path, "rb") as eml_file:
        elements = partition(file=eml_file, strategy=PartitionStrategy.HI_RES)

    assert len(elements) > 0
    assert elements == EXPECTED_EMAIL_OUTPUT
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# ================================================================================================
|
|
|
|
|
# EPUB
|
|
|
|
|
# ================================================================================================
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_auto_partition_epub_from_filename():
    """An `.epub` path partitions to elements whose third entry opens with the book title."""
    epub_path = example_doc_path("winter-sports.epub")

    elements = partition(epub_path, strategy=PartitionStrategy.HI_RES)

    assert len(elements) > 0
    third_text = elements[2].text
    assert third_text.startswith("The Project Gutenberg eBook of Winter Sports")
|
2024-07-08 14:25:17 -07:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_auto_partition_epub_from_file():
    """An open `.epub` file object partitions to elements whose third entry is the title line."""
    epub_path = example_doc_path("winter-sports.epub")
    with open(epub_path, "rb") as epub_file:
        elements = partition(file=epub_file, strategy=PartitionStrategy.HI_RES)

    assert len(elements) > 0
    third_text = elements[2].text
    assert third_text.startswith("The Project Gutenberg eBook of Winter Sports")
|
2024-07-08 14:25:17 -07:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# ================================================================================================
|
|
|
|
|
# HTML
|
|
|
|
|
# ================================================================================================
|
|
|
|
|
|
|
|
|
|
|
2023-03-24 16:32:45 -07:00
|
|
|
|
@pytest.mark.parametrize(
    ("pass_metadata_filename", "content_type"),
    [
        (False, None),
        (False, "text/html"),
        (True, "text/html"),
        (True, None),
    ],
)
def test_auto_partition_html_from_filename(
    pass_metadata_filename: bool, content_type: str | None
):
    """An HTML path partitions to a non-empty result carrying the source file's name and dir."""
    file_path = example_doc_path("example-10k-1p.html")
    metadata_filename = None
    if pass_metadata_filename:
        metadata_filename = file_path

    elements = partition(
        filename=file_path,
        metadata_filename=metadata_filename,
        content_type=content_type,
        strategy=PartitionStrategy.HI_RES,
    )

    assert elements
    expected_directory, expected_filename = os.path.split(file_path)
    for element in elements:
        assert element.metadata.filename == expected_filename
    for element in elements:
        assert element.metadata.file_directory == expected_directory
|
2023-01-09 16:15:14 -05:00
|
|
|
|
|
|
|
|
|
|
2023-03-24 16:32:45 -07:00
|
|
|
|
@pytest.mark.parametrize(
    ("pass_metadata_filename", "content_type"),
    [
        (False, None),
        (False, "text/html"),
        (True, "text/html"),
        (True, None),
    ],
)
def test_auto_partition_html_from_file(
    pass_metadata_filename: bool, content_type: str | None
):
    """An open HTML file object partitions to a non-empty list of elements."""
    file_path = example_doc_path("example-10k-1p.html")
    metadata_filename = None
    if pass_metadata_filename:
        metadata_filename = file_path

    with open(file_path, "rb") as html_file:
        elements = partition(
            file=html_file,
            metadata_filename=metadata_filename,
            content_type=content_type,
            strategy=PartitionStrategy.HI_RES,
        )

    assert len(elements) > 0
|
|
|
|
|
|
|
|
|
|
|
2024-07-08 14:25:17 -07:00
|
|
|
|
def test_auto_partition_html_pre_from_file():
    """The `fake-html-pre.htm` example partitions as HTML with no page-break elements."""
    elements = partition(example_doc_path("fake-html-pre.htm"))

    assert len(elements) > 0
    assert all(element.category != "PageBreak" for element in elements)

    first = elements[0]
    assert clean_extra_whitespace(first.text).startswith("[107th Congress Public Law 56]")
    assert isinstance(first, NarrativeText)

    for element in elements:
        assert element.metadata.filetype == "text/html"
    for element in elements:
        assert element.metadata.filename == "fake-html-pre.htm"
|
2024-07-08 14:25:17 -07:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# ================================================================================================
|
|
|
|
|
# IMAGE
|
|
|
|
|
# ================================================================================================
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.parametrize(
    ("pass_metadata_filename", "content_type"),
    [
        (False, None),
        (False, "image/jpeg"),
        (True, "image/jpeg"),
        (True, None),
    ],
)
def test_auto_partition_jpeg_from_filename(
    pass_metadata_filename: bool, content_type: str | None
):
    """A JPEG path partitions so the third element is the paper title and has coordinates."""
    file_path = example_doc_path("img/layout-parser-paper-fast.jpg")
    metadata_filename = None
    if pass_metadata_filename:
        metadata_filename = file_path
    expected_title = (
        "LayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis"
    )

    elements = partition(
        filename=file_path,
        metadata_filename=metadata_filename,
        content_type=content_type,
        strategy=PartitionStrategy.AUTO,
    )

    title_element = elements[2]
    assert title_element.text == expected_title
    assert title_element.metadata.coordinates is not None
|
2024-07-08 14:25:17 -07:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.parametrize(
    ("pass_metadata_filename", "content_type"),
    [
        (False, None),
        (False, "image/jpeg"),
        (True, "image/jpeg"),
        (True, None),
    ],
)
def test_auto_partition_jpeg_from_file(
    pass_metadata_filename: bool, content_type: str | None
):
    """An open JPEG file object partitions so the third element is the titled, located text."""
    file_path = example_doc_path("img/layout-parser-paper-fast.jpg")
    metadata_filename = None
    if pass_metadata_filename:
        metadata_filename = file_path
    expected_title = (
        "LayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis"
    )

    with open(file_path, "rb") as image_file:
        elements = partition(
            file=image_file,
            metadata_filename=metadata_filename,
            content_type=content_type,
            strategy=PartitionStrategy.AUTO,
        )

    title_element = elements[2]
    assert title_element.text == expected_title
    assert title_element.metadata.coordinates is not None
|
2024-07-08 14:25:17 -07:00
|
|
|
|
|
2024-07-11 12:57:28 -07:00
|
|
|
|
|
|
|
|
|
def test_auto_partition_bmp_from_filename(tmp_path: pathlib.Path):
    """A BMP re-saved from the table example JPEG partitions to exactly one HTML table."""
    source_jpg = example_doc_path("img/layout-parser-paper-with-table.jpg")
    bmp_filename = str(tmp_path / "example.bmp")
    # -- convert the example JPEG to BMP so partitioning has a BMP file to work from --
    with Image.open(source_jpg) as img:
        img.save(bmp_filename)

    elements = partition(filename=bmp_filename, strategy=PartitionStrategy.HI_RES)

    tables_html = []
    for element in elements:
        html = element.metadata.text_as_html
        if html:
            tables_html.append(html)
    assert len(tables_html) == 1
    table_html = tables_html[0]
    assert "<table><thead><tr>" in table_html
    assert "</thead><tbody><tr>" in table_html
|
|
|
|
|
|
|
|
|
|
|
2024-07-11 12:57:28 -07:00
|
|
|
|
@pytest.mark.parametrize("extract_image_block_to_payload", [False, True])
def test_auto_partition_image_element_extraction(extract_image_block_to_payload: bool):
    """Image and Table blocks extracted from a JPEG pass the shared extraction assertions."""
    block_types = ["Image", "Table"]
    image_path = example_doc_path("img/embedded-images-tables.jpg")

    with tempfile.TemporaryDirectory() as output_dir:
        elements = partition(
            filename=image_path,
            extract_image_block_types=block_types,
            extract_image_block_to_payload=extract_image_block_to_payload,
            extract_image_block_output_dir=output_dir,
        )

        # -- verify inside the `with` so the temporary output directory still exists --
        assert_element_extraction(
            elements, block_types, extract_image_block_to_payload, output_dir
        )
|
|
|
|
|
|
|
|
|
|
|
2024-07-08 14:25:17 -07:00
|
|
|
|
# ================================================================================================
|
|
|
|
|
# JSON
|
|
|
|
|
# ================================================================================================
|
|
|
|
|
|
|
|
|
|
|
2024-07-11 12:57:28 -07:00
|
|
|
|
# TODO(scanny): This test should go away when we fix #3365. This test glosses over several
|
|
|
|
|
# important JSON "rehydration" behaviors, in particular that the metadata should match exactly.
|
|
|
|
|
# The following test `test_auto_partition_json_from_file_preserves_original_elements` will be the
|
|
|
|
|
# replacement for this test.
|
2024-04-24 09:05:20 +02:00
|
|
|
|
def test_auto_partitioned_json_output_maintains_consistency_with_fixture_elements():
    """Test auto-processing an unstructured json output file by filename.

    The fixture JSON is loaded as the expectation, the same file is partitioned and
    re-serialized, and the two are compared after the "metadata" entry is dropped from every
    element on both sides.
    """
    json_file_path = example_doc_path("spring-weather.html.json")
    original_file_name = "spring-weather.html"
    # -- read the fixture as UTF-8 explicitly rather than relying on the locale default --
    with open(json_file_path, encoding="utf-8") as json_f:
        expected_result = json.load(json_f)

    partitioning_result = json.loads(
        elements_to_json(
            partition(
                filename=str(json_file_path),
                # -- use the original file name to get the same element IDs (hashes) --
                metadata_filename=original_file_name,
                strategy=PartitionStrategy.HI_RES,
            )
        )
    )
    # -- metadata is not compared by this test, so drop it from both sides --
    for elem in partitioning_result:
        elem.pop("metadata")
    for elem in expected_result:
        elem.pop("metadata")

    assert expected_result == partitioning_result
|
2023-07-25 15:59:45 -04:00
|
|
|
|
|
|
|
|
|
|
2023-04-17 23:11:21 -07:00
|
|
|
|
@pytest.mark.xfail(
    reason=(
        "https://github.com/Unstructured-IO/unstructured/issues/3365"
        " partition_json() does not preserve original element-id or metadata"
    ),
    raises=AssertionError,
    strict=True,
)
def test_auto_partition_json_from_file_preserves_original_elements():
    """Partitioning a JSON file should give back the elements that file deserializes to.

    Marked as a strict expected-failure; see the `xfail` reason for the tracking issue.
    """
    json_path = example_doc_path("simple.json")
    expected_dicts = elements_to_dicts(elements_from_json(json_path))

    with open(json_path, "rb") as json_file:
        partitioned_elements = partition(file=json_file)

    assert elements_to_dicts(partitioned_elements) == expected_dicts
|
2023-04-17 23:11:21 -07:00
|
|
|
|
|
|
|
|
|
|
2025-03-07 11:33:33 +01:00
|
|
|
|
def test_auto_partition_processes_simple_ndjson(tmp_path: pathlib.Path):
    """A `.json` file holding one bare element object partitions to one `NarrativeText`."""
    payload = '{"text": "hello", "type": "NarrativeText"}'
    target = tmp_path / "unprocessable.json"
    target.write_text(payload)

    result = partition(filename=str(target))

    assert len(result) == 1
    only_element = result[0]
    assert isinstance(only_element, NarrativeText)
    assert "hello" in only_element.text
|
2024-07-08 14:25:17 -07:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# ================================================================================================
|
|
|
|
|
# MD
|
|
|
|
|
# ================================================================================================
|
|
|
|
|
|
|
|
|
|
|
2024-07-11 12:57:28 -07:00
|
|
|
|
def test_partition_md_from_url_works_with_embedded_html():
    """Markdown partitioned from a URL yields a second element mentioning "unstructured".

    NOTE: the document is addressed by a remote URL, so this presumably needs network access.
    """
    readme_url = "https://raw.githubusercontent.com/Unstructured-IO/unstructured/main/README.md"

    elements = partition(
        url=readme_url,
        content_type="text/markdown",
        strategy=PartitionStrategy.HI_RES,
    )

    second_text = elements[1].text
    assert "unstructured" in second_text
|
2024-07-08 14:25:17 -07:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# ================================================================================================
|
|
|
|
|
# MSG
|
|
|
|
|
# ================================================================================================
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_auto_partition_msg_from_filename():
    """A `.msg` path partitions to the four expected email body elements."""
    expected_elements = [
        NarrativeText(text="This is a test email to use for unit tests."),
        Text(text="Important points:"),
        ListItem(text="Roses are red"),
        ListItem(text="Violets are blue"),
    ]

    elements = partition(example_doc_path("fake-email.msg"), strategy=PartitionStrategy.HI_RES)

    assert elements == expected_elements
|
2023-01-13 16:39:53 -05:00
|
|
|
|
|
|
|
|
|
|
2024-07-08 14:25:17 -07:00
|
|
|
|
# ================================================================================================
|
|
|
|
|
# ODT
|
|
|
|
|
# ================================================================================================
|
|
|
|
|
|
|
|
|
|
|
2024-07-11 12:57:28 -07:00
|
|
|
|
def test_auto_partition_odt_from_filename(expected_docx_elements: list[Element]):
    """An `.odt` path partitions to the same elements expected of the DOCX example."""
    odt_path = example_doc_path("simple.odt")

    elements = partition(odt_path, strategy=PartitionStrategy.HI_RES)

    assert elements == expected_docx_elements
|
2024-07-08 14:25:17 -07:00
|
|
|
|
|
|
|
|
|
|
2024-07-11 12:57:28 -07:00
|
|
|
|
def test_auto_partition_odt_from_file(expected_docx_elements: list[Element]):
    """An open `.odt` file object partitions to the same elements expected of DOCX."""
    odt_path = example_doc_path("simple.odt")
    with open(odt_path, "rb") as odt_file:
        elements = partition(file=odt_file, strategy=PartitionStrategy.HI_RES)

    assert elements == expected_docx_elements
|
2024-07-08 14:25:17 -07:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# ================================================================================================
|
|
|
|
|
# ORG
|
|
|
|
|
# ================================================================================================
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_auto_partition_org_from_filename():
    """An `.org` path partitions with the document title first and the org MIME type set."""
    org_path = example_doc_path("README.org")

    elements = partition(org_path)

    first = elements[0]
    assert first == Title("Example Docs")
    assert first.metadata.filetype == "text/org"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_auto_partition_org_from_file():
    """An open `.org` file object with an explicit content-type partitions title-first."""
    org_path = example_doc_path("README.org")
    with open(org_path, "rb") as org_file:
        elements = partition(file=org_file, content_type="text/org")

    first = elements[0]
    assert first == Title("Example Docs")
    assert first.metadata.filetype == "text/org"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# ================================================================================================
|
|
|
|
|
# PDF
|
|
|
|
|
# ================================================================================================
|
2023-01-27 12:08:18 -05:00
|
|
|
|
|
|
|
|
|
|
2023-03-24 16:32:45 -07:00
|
|
|
|
@pytest.mark.parametrize(
    ("pass_metadata_filename", "content_type"),
    [
        (False, None),
        (False, "application/pdf"),
        (True, "application/pdf"),
        (True, None),
    ],
)
def test_auto_partition_pdf_from_filename(
    pass_metadata_filename: bool, content_type: str | None
):
    """A PDF path partitions to a leading `Title` then `NarrativeText` with file metadata."""
    file_path = example_doc_path("pdf/chevron-page.pdf")
    metadata_filename = None
    if pass_metadata_filename:
        metadata_filename = file_path
    expected_directory, expected_filename = os.path.split(file_path)

    elements = partition(
        filename=file_path,
        metadata_filename=metadata_filename,
        content_type=content_type,
        strategy=PartitionStrategy.HI_RES,
    )

    title = elements[0]
    assert isinstance(title, Title)
    assert title.text.startswith("eastern mediterranean")
    assert title.metadata.filename == expected_filename
    assert title.metadata.file_directory == expected_directory

    narrative = elements[1]
    assert isinstance(narrative, NarrativeText)
    assert narrative.text.startswith("We’re investing")
|
2023-05-31 13:50:15 -05:00
|
|
|
|
|
2023-01-09 16:15:14 -05:00
|
|
|
|
|
2024-07-11 12:57:28 -07:00
|
|
|
|
@pytest.mark.parametrize(
    ("pass_metadata_filename", "content_type"),
    [
        (False, None),
        (False, "application/pdf"),
        (True, "application/pdf"),
        (True, None),
    ],
)
def test_auto_partition_pdf_from_file(
    pass_metadata_filename: bool, content_type: str | None
):
    """An open PDF file object partitions to a leading `Title` followed by `NarrativeText`."""
    file_path = example_doc_path("pdf/chevron-page.pdf")
    metadata_filename = None
    if pass_metadata_filename:
        metadata_filename = file_path

    with open(file_path, "rb") as pdf_file:
        elements = partition(
            file=pdf_file,
            metadata_filename=metadata_filename,
            content_type=content_type,
            strategy=PartitionStrategy.HI_RES,
        )

    title = elements[0]
    assert isinstance(title, Title)
    assert title.text.startswith("eastern mediterranean")

    narrative = elements[1]
    assert isinstance(narrative, NarrativeText)
    assert narrative.text.startswith("We’re investing")
|
2023-04-21 12:01:29 -05:00
|
|
|
|
|
|
|
|
|
|
2024-07-21 23:03:55 -07:00
|
|
|
|
def test_auto_partition_pdf_with_fast_strategy(request: FixtureRequest):
    """`partition()` looks up the PDF partitioner and calls it with the expected keyword args."""
    pdf_path = example_doc_path("pdf/layout-parser-paper-fast.pdf")
    partition_pdf_ = function_mock(
        request,
        "unstructured.partition.pdf.partition_pdf",
        return_value=[NarrativeText("Hello there!")],
    )
    # -- the loader hands back the mocked partitioner, so no real PDF processing occurs --
    loader_get_ = method_mock(request, _PartitionerLoader, "get", return_value=partition_pdf_)
    expected_kwargs = {
        "filename": pdf_path,
        "file": None,
        "url": None,
        "strategy": PartitionStrategy.FAST,
        "languages": None,
        "metadata_filename": None,
        "infer_table_structure": False,
        "extract_images_in_pdf": False,
        "extract_image_block_types": None,
        "extract_image_block_output_dir": None,
        "extract_image_block_to_payload": False,
        "hi_res_model_name": None,
        "starting_page_number": 1,
    }

    partition(pdf_path, strategy=PartitionStrategy.FAST)

    loader_get_.assert_called_once_with(ANY, FileType.PDF)
    partition_pdf_.assert_called_once_with(**expected_kwargs)
|
|
|
|
|
|
|
|
|
|
|
2025-01-17 12:41:04 -06:00
|
|
|
|
@pytest.mark.parametrize("infer_bool", [True, False])
def test_auto_handles_kwarg_with_infer_table_structure(infer_bool):
    """An explicit boolean `infer_table_structure` is the value the OCR step receives.

    `pdf_infer_table_structure=True` is also passed; the assertion shows the explicit
    `infer_table_structure` value is the one forwarded.
    """
    pdf_path = example_doc_path("pdf/layout-parser-paper-fast.pdf")
    ocr_target = "unstructured.partition.pdf_image.ocr.process_file_with_ocr"

    with patch(ocr_target) as process_file_with_ocr_:
        partition(
            pdf_path,
            pdf_infer_table_structure=True,
            strategy=PartitionStrategy.HI_RES,
            infer_table_structure=infer_bool,
        )
        # -- `call_args` is an (args, kwargs) pair for the most recent call --
        _, ocr_kwargs = process_file_with_ocr_.call_args
        assert ocr_kwargs["infer_table_structure"] is infer_bool
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_auto_handles_kwarg_with_infer_table_structure_when_none():
    """With `infer_table_structure=None`, the OCR step receives `True`.

    `pdf_infer_table_structure=True` is the only other table-structure argument passed.
    """
    pdf_path = example_doc_path("pdf/layout-parser-paper-fast.pdf")
    ocr_target = "unstructured.partition.pdf_image.ocr.process_file_with_ocr"

    with patch(ocr_target) as process_file_with_ocr_:
        partition(
            pdf_path,
            pdf_infer_table_structure=True,
            strategy=PartitionStrategy.HI_RES,
            infer_table_structure=None,
        )
        # -- `call_args` is an (args, kwargs) pair for the most recent call --
        _, ocr_kwargs = process_file_with_ocr_.call_args
        assert ocr_kwargs["infer_table_structure"] is True
|
|
|
|
|
|
|
|
|
|
|
2024-07-11 12:57:28 -07:00
|
|
|
|
def test_auto_partition_pdf_uses_pdf_infer_table_structure_argument():
    """`pdf_infer_table_structure=True` reaches the OCR step as a truthy `infer_table_structure`."""
    pdf_path = example_doc_path("pdf/layout-parser-paper-fast.pdf")
    ocr_target = "unstructured.partition.pdf_image.ocr.process_file_with_ocr"

    with patch(ocr_target) as process_file_with_ocr_:
        partition(
            pdf_path,
            pdf_infer_table_structure=True,
            strategy=PartitionStrategy.HI_RES,
        )
        # -- `call_args` is an (args, kwargs) pair for the most recent call --
        _, ocr_kwargs = process_file_with_ocr_.call_args
        assert ocr_kwargs["infer_table_structure"]
|
2023-01-09 16:15:14 -05:00
|
|
|
|
|
|
|
|
|
|
2024-01-04 09:52:00 -08:00
|
|
|
|
@pytest.mark.parametrize("extract_image_block_to_payload", [False, True])
def test_auto_partition_pdf_element_extraction(extract_image_block_to_payload: bool):
    """Image and Table blocks of a PDF are extracted, either to the payload or to a directory."""
    block_types = ["Image", "Table"]
    pdf_path = example_doc_path("pdf/embedded-images-tables.pdf")

    with tempfile.TemporaryDirectory() as output_dir:
        elements = partition(
            pdf_path,
            extract_image_block_types=block_types,
            extract_image_block_to_payload=extract_image_block_to_payload,
            extract_image_block_output_dir=output_dir,
        )

        # -- verified while the temporary directory still exists --
        assert_element_extraction(
            elements, block_types, extract_image_block_to_payload, output_dir
        )
|
|
|
|
|
|
|
|
|
|
|
2025-03-09 21:15:08 -07:00
|
|
|
|
def test_auto_partition_html_element_extraction():
    """Image blocks of an HTML document with a base64 image are extracted to the payload."""
    block_types = ["Image"]
    html_path = example_doc_path("fake-html-with-base64-image.html")

    with tempfile.TemporaryDirectory() as output_dir:
        elements = partition(
            html_path,
            extract_image_block_types=block_types,
            extract_image_block_to_payload=True,
        )

        assert_element_extraction(elements, block_types, True, output_dir)
|
|
|
|
|
|
|
|
|
|
|
2025-03-13 15:41:10 -07:00
|
|
|
|
def test_auto_partition_html_image_with_url():
    """The second element of the URL-image HTML document carries an `image_url`."""
    html_path = example_doc_path("fake-html-with-image-from-url.html")

    elements = partition(html_path)

    second_element = elements[1]
    assert second_element.metadata.image_url is not None
|
|
|
|
|
|
|
|
|
|
|
2024-07-11 12:57:28 -07:00
|
|
|
|
def test_partition_pdf_does_not_raise_warning():
    """Hi-res partitioning of a PDF emits no warnings."""
    pdf_path = example_doc_path("pdf/layout-parser-paper-fast.pdf")

    # NOTE(robinson): Turning warnings into errors inside `catch_warnings()` is the way the
    # pytest docs recommend for checking that no warning is emitted.
    # ref: https://docs.pytest.org/en/7.0.x/how-to/capture-warnings.html
    #      #additional-use-cases-of-warnings-in-tests
    with warnings.catch_warnings():
        warnings.simplefilter("error")
        partition(pdf_path, strategy=PartitionStrategy.HI_RES)
|
|
|
|
|
|
|
|
|
|
|
2024-07-08 14:25:17 -07:00
|
|
|
|
# ================================================================================================
|
|
|
|
|
# PPT
|
|
|
|
|
# ================================================================================================
|
2023-01-13 22:24:13 -06:00
|
|
|
|
|
|
|
|
|
|
2024-07-08 14:25:17 -07:00
|
|
|
|
def test_auto_partition_ppt_from_filename():
    """A `.ppt` file partitions to the expected elements with filename metadata populated."""
    ppt_path = example_doc_path("fake-power-point.ppt")
    expected_elements = [
        Title(text="Adding a Bullet Slide"),
        ListItem(text="Find the bullet slide layout"),
        ListItem(text="Use _TextFrame.text for first bullet"),
        ListItem(text="Use _TextFrame.add_paragraph() for subsequent bullets"),
        NarrativeText(text="Here is a lot of text!"),
        NarrativeText(text="Here is some text in a text box!"),
    ]

    elements = partition(ppt_path, strategy=PartitionStrategy.HI_RES)

    assert elements == expected_elements
    for element in elements:
        assert element.metadata.filename == "fake-power-point.ppt"
    for element in elements:
        assert element.metadata.file_directory == example_doc_path("")
|
2023-01-13 22:24:13 -06:00
|
|
|
|
|
|
|
|
|
|
2024-07-08 14:25:17 -07:00
|
|
|
|
# ================================================================================================
|
|
|
|
|
# PPTX
|
|
|
|
|
# ================================================================================================
|
2023-01-23 12:03:09 -05:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_auto_partition_pptx_from_filename():
    """A `.pptx` file partitions to the expected elements with filename metadata populated."""
    pptx_path = example_doc_path("fake-power-point.pptx")
    expected_elements = [
        Title(text="Adding a Bullet Slide"),
        ListItem(text="Find the bullet slide layout"),
        ListItem(text="Use _TextFrame.text for first bullet"),
        ListItem(text="Use _TextFrame.add_paragraph() for subsequent bullets"),
        NarrativeText(text="Here is a lot of text!"),
        NarrativeText(text="Here is some text in a text box!"),
    ]

    elements = partition(pptx_path, strategy=PartitionStrategy.HI_RES)

    assert elements == expected_elements
    for element in elements:
        assert element.metadata.filename == "fake-power-point.pptx"
    for element in elements:
        assert element.metadata.file_directory == example_doc_path("")
|
2023-02-08 10:11:15 -05:00
|
|
|
|
|
|
|
|
|
|
2024-06-25 17:29:47 -07:00
|
|
|
|
@pytest.mark.parametrize("file_name", ["simple.pptx", "fake-power-point.ppt"])
@pytest.mark.parametrize(
    "strategy",
    [
        PartitionStrategy.AUTO,
        PartitionStrategy.FAST,
        PartitionStrategy.HI_RES,
        PartitionStrategy.OCR_ONLY,
    ],
)
def test_partition_forwards_strategy_arg_to_partition_pptx_and_its_brokers(
    request: FixtureRequest, file_name: str, strategy: str
):
    """The `strategy` value given to `partition()` arrives at `partition_pptx()`.

    In the brokering-partitioner case (PPT), the strategy argument has to reach
    `partition_ppt()`, which must in turn pass it on to `partition_pptx()`. This test confirms
    the value survives the whole trip.

    Note this is 2 file-types X 4 strategies = 8 test-cases.
    """
    from unstructured.partition.pptx import _PptxPartitioner

    def emit_strategy_element(self: _PptxPartitioner) -> Iterator[Element]:
        # -- report the strategy the partitioner actually received as the element text --
        received_strategy = self._opts.strategy
        yield Text(f"strategy=={received_strategy}")

    iter_presentation_elements_ = method_mock(
        request,
        _PptxPartitioner,
        "_iter_presentation_elements",
        side_effect=emit_strategy_element,
    )
    file_path = example_doc_path(file_name)

    # -- unpacking fails unless exactly one element is produced --
    [element] = partition(file_path, strategy=strategy)

    iter_presentation_elements_.assert_called_once_with(ANY)
    assert element.text == f"strategy=={strategy}"
|
|
|
|
|
|
|
|
|
|
|
2024-07-08 14:25:17 -07:00
|
|
|
|
# ================================================================================================
|
|
|
|
|
# RST
|
|
|
|
|
# ================================================================================================
|
2023-03-14 11:52:21 -04:00
|
|
|
|
|
|
|
|
|
|
2024-07-08 14:25:17 -07:00
|
|
|
|
def test_auto_partition_rst_from_filename():
    """An `.rst` file given by path yields the expected title and an RST filetype."""
    rst_path = example_doc_path("README.rst")

    elements = partition(rst_path)

    first_element = elements[0]
    assert first_element == Title("Example Docs")
    assert first_element.metadata.filetype == "text/x-rst"
|
2023-03-14 11:52:21 -04:00
|
|
|
|
|
2023-03-28 16:15:22 -04:00
|
|
|
|
|
2024-07-08 14:25:17 -07:00
|
|
|
|
def test_auto_partition_rst_from_file():
    """An `.rst` file-object with an explicit content-type yields the expected title."""
    rst_path = example_doc_path("README.rst")

    with open(rst_path, "rb") as rst_file:
        elements = partition(file=rst_file, content_type="text/x-rst")

    first_element = elements[0]
    assert first_element == Title("Example Docs")
    assert first_element.metadata.filetype == "text/x-rst"
|
2023-03-28 16:15:22 -04:00
|
|
|
|
|
|
|
|
|
|
2024-07-08 14:25:17 -07:00
|
|
|
|
# ================================================================================================
|
|
|
|
|
# RTF
|
|
|
|
|
# ================================================================================================
|
2023-04-10 17:25:03 -04:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_auto_partition_rtf_from_filename():
    """An `.rtf` file given by path yields the expected first heading."""
    rtf_path = example_doc_path("fake-doc.rtf")

    elements = partition(rtf_path, strategy=PartitionStrategy.HI_RES)

    first_element = elements[0]
    assert first_element == Title("My First Heading")
|
2023-04-12 14:31:01 -04:00
|
|
|
|
|
|
|
|
|
|
2024-07-08 14:25:17 -07:00
|
|
|
|
# ================================================================================================
|
|
|
|
|
# TSV
|
|
|
|
|
# ================================================================================================
|
2023-04-13 11:04:15 -04:00
|
|
|
|
|
fix: parse URL response Content-Type according to RFC 9110 (#2950)
Currently, `file_and_type_from_url()` does not correctly handle the
`Content-Type` header. Specifically, it assumes that the header contains
only the mime-type (e.g. `text/html`), however, [RFC
9110](https://www.rfc-editor.org/rfc/rfc9110#field.content-type) allows
for additional directives — specifically the `charset` — to be returned
in the header. This leads to a `ValueError` when loading a URL with a
response Content-Type header such as `text/html; charset=UTF-8`.
To reproduce the issue:
```python
from unstructured.partition.auto import partition
url = "https://arstechnica.com/space/2024/04/nasa-still-doesnt-understand-root-cause-of-orion-heat-shield-issue/"
partition(url=url)
```
Which will result in the following exception:
```python
{
"name": "ValueError",
"message": "Invalid file. The FileType.UNK file type is not supported in partition.",
"stack": "---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Cell In[1], line 4
1 from unstructured.partition.auto import partition
3 url = \"https://arstechnica.com/space/2024/04/nasa-still-doesnt-understand-root-cause-of-orion-heat-shield-issue/\"
----> 4 partition(url=url)
File ~/miniconda3/envs/ai-tasks/lib/python3.11/site-packages/unstructured/partition/auto.py:541, in partition(filename, content_type, file, file_filename, url, include_page_breaks, strategy, encoding, paragraph_grouper, headers, skip_infer_table_types, ssl_verify, ocr_languages, languages, detect_language_per_element, pdf_infer_table_structure, extract_images_in_pdf, extract_image_block_types, extract_image_block_output_dir, extract_image_block_to_payload, xml_keep_tags, data_source_metadata, metadata_filename, request_timeout, hi_res_model_name, model_name, date_from_file_object, starting_page_number, **kwargs)
539 else:
540 msg = \"Invalid file\" if not filename else f\"Invalid file {filename}\"
--> 541 raise ValueError(f\"{msg}. The {filetype} file type is not supported in partition.\")
543 for element in elements:
544 element.metadata.url = url
ValueError: Invalid file. The FileType.UNK file type is not supported in partition."
}
```
This PR fixes the issue by parsing the mime-type out of the
`Content-Type` header string.
Closes #2257
2024-04-30 07:53:44 +02:00
|
|
|
|
|
2024-07-08 14:25:17 -07:00
|
|
|
|
def test_auto_partition_tsv_from_filename():
    """A `.tsv` file given by path yields the expected table text, HTML and filetype."""
    tsv_path = example_doc_path("stanley-cups.tsv")

    elements = partition(tsv_path)

    table = elements[0]
    assert clean_extra_whitespace(table.text) == EXPECTED_TEXT
    assert table.metadata.text_as_html == EXPECTED_TABLE
    assert table.metadata.filetype == "text/tsv"
|
fix: parse URL response Content-Type according to RFC 9110 (#2950)
Currently, `file_and_type_from_url()` does not correctly handle the
`Content-Type` header. Specifically, it assumes that the header contains
only the mime-type (e.g. `text/html`), however, [RFC
9110](https://www.rfc-editor.org/rfc/rfc9110#field.content-type) allows
for additional directives — specifically the `charset` — to be returned
in the header. This leads to a `ValueError` when loading a URL with a
response Content-Type header such as `text/html; charset=UTF-8`.
To reproduce the issue:
```python
from unstructured.partition.auto import partition
url = "https://arstechnica.com/space/2024/04/nasa-still-doesnt-understand-root-cause-of-orion-heat-shield-issue/"
partition(url=url)
```
Which will result in the following exception:
```python
{
"name": "ValueError",
"message": "Invalid file. The FileType.UNK file type is not supported in partition.",
"stack": "---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Cell In[1], line 4
1 from unstructured.partition.auto import partition
3 url = \"https://arstechnica.com/space/2024/04/nasa-still-doesnt-understand-root-cause-of-orion-heat-shield-issue/\"
----> 4 partition(url=url)
File ~/miniconda3/envs/ai-tasks/lib/python3.11/site-packages/unstructured/partition/auto.py:541, in partition(filename, content_type, file, file_filename, url, include_page_breaks, strategy, encoding, paragraph_grouper, headers, skip_infer_table_types, ssl_verify, ocr_languages, languages, detect_language_per_element, pdf_infer_table_structure, extract_images_in_pdf, extract_image_block_types, extract_image_block_output_dir, extract_image_block_to_payload, xml_keep_tags, data_source_metadata, metadata_filename, request_timeout, hi_res_model_name, model_name, date_from_file_object, starting_page_number, **kwargs)
539 else:
540 msg = \"Invalid file\" if not filename else f\"Invalid file {filename}\"
--> 541 raise ValueError(f\"{msg}. The {filetype} file type is not supported in partition.\")
543 for element in elements:
544 element.metadata.url = url
ValueError: Invalid file. The FileType.UNK file type is not supported in partition."
}
```
This PR fixes the issue by parsing the mime-type out of the
`Content-Type` header string.
Closes #2257
2024-04-30 07:53:44 +02:00
|
|
|
|
|
|
|
|
|
|
2024-07-08 14:25:17 -07:00
|
|
|
|
# ================================================================================================
|
|
|
|
|
# TXT
|
|
|
|
|
# ================================================================================================
|
2024-09-29 06:16:33 +02:00
|
|
|
|
@pytest.mark.parametrize(
    ("filename", "expected_elements"),
    [
        (
            "fake-text.txt",
            [
                NarrativeText(text="This is a test document to use for unit tests."),
                Address(text="Doylestown, PA 18901"),
                Title(text="Important points:"),
                ListItem(text="Hamburgers are delicious"),
                ListItem(text="Dogs are the best"),
                ListItem(text="I love fuzzy blankets"),
            ],
        ),
        ("fake-text-all-whitespace.txt", []),
    ],
)
def test_auto_partition_text_from_filename(filename: str, expected_elements: list[Element]):
    """A text file given by path partitions to the expected elements and filename metadata.

    The all-whitespace case expects no elements at all.
    """
    text_path = example_doc_path(filename)

    elements = partition(filename=text_path, strategy=PartitionStrategy.HI_RES)

    assert elements == expected_elements
    for element in elements:
        assert element.metadata.filename == filename
    for element in elements:
        assert element.metadata.file_directory == example_doc_path("")
|
2023-04-26 13:52:47 -04:00
|
|
|
|
|
|
|
|
|
|
2024-07-08 14:25:17 -07:00
|
|
|
|
def test_auto_partition_text_from_file():
    """A text file passed as an open binary file partitions to the expected elements."""
    text_path = example_doc_path("fake-text.txt")
    expected_elements = [
        NarrativeText(text="This is a test document to use for unit tests."),
        Address(text="Doylestown, PA 18901"),
        Title(text="Important points:"),
        ListItem(text="Hamburgers are delicious"),
        ListItem(text="Dogs are the best"),
        ListItem(text="I love fuzzy blankets"),
    ]

    with open(text_path, "rb") as text_file:
        elements = partition(file=text_file, strategy=PartitionStrategy.HI_RES)

    assert len(elements) > 0
    assert elements == expected_elements
|
2023-05-19 15:57:42 -04:00
|
|
|
|
|
|
|
|
|
|
2024-07-08 14:25:17 -07:00
|
|
|
|
# ================================================================================================
|
|
|
|
|
# XLS
|
|
|
|
|
# ================================================================================================
|
2024-04-15 23:03:42 +02:00
|
|
|
|
|
2023-05-26 01:55:32 -07:00
|
|
|
|
|
2024-07-08 14:25:17 -07:00
|
|
|
|
def test_auto_partition_xls_from_filename():
    """An `.xls` workbook yields the expected element count, table count and first table."""
    xls_path = example_doc_path("tests-example.xls")

    elements = partition(xls_path, include_header=False, skip_infer_table_types=[])

    tables = [e for e in elements if isinstance(e, Table)]
    first_element = elements[0]
    assert len(elements) == 14
    assert len(tables) == 2
    assert first_element.metadata.text_as_html == EXPECTED_XLS_TABLE
    assert len(first_element.text) == 507
|
2023-05-26 01:55:32 -07:00
|
|
|
|
|
|
|
|
|
|
2024-07-08 14:25:17 -07:00
|
|
|
|
# ================================================================================================
|
|
|
|
|
# XLSX
|
|
|
|
|
# ================================================================================================
|
2023-05-19 15:57:42 -04:00
|
|
|
|
|
|
|
|
|
|
2024-07-08 14:25:17 -07:00
|
|
|
|
def test_auto_partition_xlsx_from_filename():
    """An `.xlsx` workbook given by path yields a title and a table on each of two pages."""
    xlsx_path = example_doc_path("stanley-cups.xlsx")
    xlsx_mime_type = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"

    elements = partition(xlsx_path, include_header=False, skip_infer_table_types=[])

    assert len(elements) == 4
    assert len([e for e in elements if isinstance(e, Table)]) == 2
    assert len([e for e in elements if isinstance(e, Title)]) == 2

    title_text = clean_extra_whitespace(elements[0].text)
    assert title_text == "Stanley Cups"
    table_text = clean_extra_whitespace(elements[1].text)
    assert table_text == "Team Location Stanley Cups Blues STL 1 Flyers PHI 2 Maple Leafs TOR 13"
    assert elements[1].metadata.text_as_html == EXPECTED_TABLE_XLSX

    # -- first two elements are on page 1, the rest on page 2 --
    for element in elements[:2]:
        assert element.metadata.page_number == 1
    for element in elements[2:]:
        assert element.metadata.page_number == 2
    for element in elements:
        assert element.metadata.filetype == xlsx_mime_type
|
2023-05-19 15:57:42 -04:00
|
|
|
|
|
2023-06-09 16:07:50 -04:00
|
|
|
|
|
2024-07-08 14:25:17 -07:00
|
|
|
|
def test_auto_partition_xlsx_from_file():
    """An `.xlsx` workbook passed as a file yields a title and a table on each of two pages."""
    xlsx_path = example_doc_path("stanley-cups.xlsx")
    xlsx_mime_type = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"

    with open(xlsx_path, "rb") as xlsx_file:
        elements = partition(file=xlsx_file, include_header=False, skip_infer_table_types=[])

    assert len(elements) == 4
    assert len([e for e in elements if isinstance(e, Table)]) == 2
    assert len([e for e in elements if isinstance(e, Title)]) == 2

    title_text = clean_extra_whitespace(elements[0].text)
    assert title_text == "Stanley Cups"
    table_text = clean_extra_whitespace(elements[1].text)
    assert table_text == "Team Location Stanley Cups Blues STL 1 Flyers PHI 2 Maple Leafs TOR 13"
    assert elements[1].metadata.text_as_html == EXPECTED_TABLE_XLSX

    # -- first two elements are on page 1, the rest on page 2 --
    for element in elements[:2]:
        assert element.metadata.page_number == 1
    for element in elements[2:]:
        assert element.metadata.page_number == 2
    for element in elements:
        assert element.metadata.filetype == xlsx_mime_type
|
2023-06-27 21:52:39 +03:00
|
|
|
|
|
|
|
|
|
|
2024-07-11 12:57:28 -07:00
|
|
|
|
def test_auto_partition_xlsx_respects_starting_page_number_argument():
    """With `starting_page_number=3`, page numbers start at 3 instead of 1."""
    xlsx_path = example_doc_path("stanley-cups.xlsx")

    elements = partition(xlsx_path, starting_page_number=3)

    # -- first two elements are on page 3, the rest on page 4 --
    for element in elements[:2]:
        assert element.metadata.page_number == 3
    for element in elements[2:]:
        assert element.metadata.page_number == 4
|
2023-06-09 16:07:50 -04:00
|
|
|
|
|
|
|
|
|
|
2024-07-08 14:25:17 -07:00
|
|
|
|
# ================================================================================================
|
|
|
|
|
# XML
|
|
|
|
|
# ================================================================================================
|
2023-06-12 15:31:10 -04:00
|
|
|
|
|
|
|
|
|
|
2024-07-08 14:25:17 -07:00
|
|
|
|
def test_auto_partition_xml_from_filename():
    """An XML file given by path, with tags dropped, yields the expected first text."""
    xml_path = example_doc_path("factbook.xml")

    elements = partition(xml_path, xml_keep_tags=False)

    assert elements[0].text == "United States"
    for element in elements:
        assert element.metadata.filename == "factbook.xml"
|
2023-06-23 20:45:31 +02:00
|
|
|
|
|
|
|
|
|
|
2024-07-08 14:25:17 -07:00
|
|
|
|
def test_auto_partition_xml_from_file():
    """XML partitioned from an open binary file with `xml_keep_tags=False` yields plain text."""
    xml_path = example_doc_path("factbook.xml")
    with open(xml_path, "rb") as xml_file:
        elements = partition(file=xml_file, xml_keep_tags=False)

    first = elements[0]
    assert first.text == "United States"
|
2023-06-23 20:45:31 +02:00
|
|
|
|
|
2023-06-12 15:31:10 -04:00
|
|
|
|
|
2024-07-08 14:25:17 -07:00
|
|
|
|
def test_auto_partition_xml_from_filename_with_tags():
    """With `xml_keep_tags=True` the first element's text retains the XML markup."""
    xml_path = example_doc_path("factbook.xml")

    elements = partition(xml_path, xml_keep_tags=True)

    first = elements[0]
    assert "<leader>Joe Biden</leader>" in first.text
    assert first.metadata.filename == "factbook.xml"
|
2023-06-12 15:31:10 -04:00
|
|
|
|
|
|
|
|
|
|
2024-07-08 14:25:17 -07:00
|
|
|
|
def test_auto_partition_xml_from_file_with_tags():
    """An open XML file partitioned with `xml_keep_tags=True` retains the XML markup."""
    xml_path = example_doc_path("factbook.xml")
    with open(xml_path, "rb") as xml_file:
        elements = partition(file=xml_file, xml_keep_tags=True)

    first = elements[0]
    assert "<leader>Joe Biden</leader>" in first.text
|
2023-08-02 18:14:15 -07:00
|
|
|
|
|
2023-08-24 03:02:47 -04:00
|
|
|
|
|
2024-07-08 14:25:17 -07:00
|
|
|
|
# ================================================================================================
|
|
|
|
|
# FILE_TYPE NOT RECOGNIZED OR NOT SUPPORTED
|
|
|
|
|
# ================================================================================================
|
2023-08-24 03:02:47 -04:00
|
|
|
|
|
|
|
|
|
|
2024-07-08 14:25:17 -07:00
|
|
|
|
def test_auto_partition_raises_with_bad_type(request: FixtureRequest):
    """Raises `UnsupportedFileFormatError` when the detected file-type is `FileType.UNK`."""
    file_path = "made-up.fake"
    expected_message = "Partitioning is not supported for the FileType.UNK file type."
    detect_filetype_ = function_mock(
        request, "unstructured.partition.auto.detect_filetype", return_value=FileType.UNK
    )

    with pytest.raises(UnsupportedFileFormatError, match=expected_message):
        partition(filename=file_path, strategy=PartitionStrategy.HI_RES)

    # -- the detector is consulted exactly once, with these keyword arguments --
    detect_filetype_.assert_called_once_with(
        file_path=file_path, file=None, encoding=None, content_type=None, metadata_file_path=None
    )
|
2023-08-21 23:00:21 -04:00
|
|
|
|
|
|
|
|
|
|
2024-07-08 14:25:17 -07:00
|
|
|
|
# ================================================================================================
|
|
|
|
|
# LOAD FROM URL
|
|
|
|
|
# ================================================================================================
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_auto_partition_from_url():
    """Partitioning by URL with a "text/plain" content-type records the URL on each element."""
    doc_url = "https://raw.githubusercontent.com/Unstructured-IO/unstructured/main/LICENSE.md"

    elements = partition(
        url=doc_url,
        content_type="text/plain",
        strategy=PartitionStrategy.HI_RES,
    )

    first = elements[0]
    assert first == Title("Apache License")
    assert all(e.metadata.url == doc_url for e in elements)
|
2024-07-08 14:25:17 -07:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_auto_partition_from_url_with_rfc9110_content_type():
    """A content-type that carries a `charset` parameter is accepted when partitioning a URL."""
    doc_url = "https://raw.githubusercontent.com/Unstructured-IO/unstructured/main/LICENSE.md"
    content_type = "text/plain; charset=utf-8"

    elements = partition(
        url=doc_url, content_type=content_type, strategy=PartitionStrategy.HI_RES
    )

    first = elements[0]
    assert first == Title("Apache License")
    assert all(e.metadata.url == doc_url for e in elements)
|
2024-07-08 14:25:17 -07:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_auto_partition_from_url_without_providing_content_type():
    """Partitioning by URL works when no `content_type` argument is supplied."""
    doc_url = "https://raw.githubusercontent.com/Unstructured-IO/unstructured/main/LICENSE.md"

    elements = partition(url=doc_url, strategy=PartitionStrategy.HI_RES)

    first = elements[0]
    assert first == Title("Apache License")
    assert all(e.metadata.url == doc_url for e in elements)
|
2024-07-08 14:25:17 -07:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_auto_partition_warns_if_header_set_and_not_url(caplog: LogCaptureFixture):
    """A warning is logged when `headers` is passed while partitioning a file path, not a URL."""
    eml_path = example_doc_path("eml/fake-email.eml")
    request_headers = {"Accept": "application/pdf"}

    partition(eml_path, headers=request_headers, strategy=PartitionStrategy.HI_RES)

    first_record = caplog.records[0]
    assert first_record.levelname == "WARNING"
    assert "headers kwarg is set but the url kwarg is not. The headers kwarg will b" in caplog.text
|
2024-07-08 14:25:17 -07:00
|
|
|
|
|
|
|
|
|
|
2024-07-11 12:57:28 -07:00
|
|
|
|
def test_auto_partition_from_url_routes_timeout_to_HTTP_request(request: FixtureRequest):
    """`request_timeout` is passed through to `file_and_type_from_url()`."""
    target_url = "http://eie.io"
    file_and_type_from_url_ = function_mock(
        request,
        "unstructured.partition.auto.file_and_type_from_url",
        side_effect=ConnectionError("Trouble on the wire ..."),
    )

    # -- the mocked fetch raises, so only the arguments it received can be inspected --
    with pytest.raises(ConnectionError, match="Trouble on the wire ..."):
        partition(url=target_url, request_timeout=326)

    file_and_type_from_url_.assert_called_once_with(
        url=target_url, content_type=None, headers={}, ssl_verify=True, request_timeout=326
    )
|
2024-07-08 14:25:17 -07:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# ================================================================================================
|
|
|
|
|
# OTHER ARGS
|
|
|
|
|
# ================================================================================================
|
|
|
|
|
|
|
|
|
|
# -- chunking_strategy ----------------------------------------------------
|
2023-09-11 16:00:14 -05:00
|
|
|
|
|
|
|
|
|
|
2024-07-11 12:57:28 -07:00
|
|
|
|
def test_auto_partition_forwards_chunking_strategy_via_kwargs():
    """With `chunking_strategy="by_title"` every returned element is a chunk type."""
    chunk_types = (CompositeElement, Table, TableChunk)
    html_path = example_doc_path("example-10k-1p.html")

    chunks = partition(html_path, chunking_strategy="by_title")

    assert all(isinstance(chunk, chunk_types) for chunk in chunks)
|
2023-10-03 09:40:34 -07:00
|
|
|
|
|
|
|
|
|
|
2024-07-11 12:57:28 -07:00
|
|
|
|
def test_auto_partition_forwards_max_characters_via_kwargs():
    """With `max_characters=250` no chunk's text is longer than 250 characters."""
    html_path = example_doc_path("example-10k-1p.html")
    chunking_kwargs = {"chunking_strategy": "by_title", "max_characters": 250}

    chunks = partition(html_path, **chunking_kwargs)

    assert all(len(chunk.text) <= 250 for chunk in chunks)
|
2023-10-10 20:47:56 -05:00
|
|
|
|
|
|
|
|
|
|
2024-07-08 14:25:17 -07:00
|
|
|
|
# -- detect_language_per_element ------------------------------------------
|
|
|
|
|
|
|
|
|
|
|
2024-07-11 12:57:28 -07:00
|
|
|
|
def test_auto_partition_respects_detect_language_per_element_arg():
    """With `detect_language_per_element=True` each element carries its own language list."""
    txt_path = example_doc_path("language-docs/eng_spa_mult.txt")
    expected_langs = [["eng"], ["spa", "eng"], ["eng"], ["eng"], ["spa"]]

    elements = partition(txt_path, detect_language_per_element=True)

    assert [e.metadata.languages for e in elements] == expected_langs
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# -- languages ------------------------------------------------------------
|
2023-10-10 20:47:56 -05:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.parametrize(
    "file_extension",
    [
        "doc",
        "docx",
        "eml",
        "epub",
        "html",
        "md",
        "odt",
        "org",
        "ppt",
        "pptx",
        "rst",
        "rtf",
        "txt",
        "xml",
    ],
)
def test_auto_partition_respects_language_arg(file_extension: str):
    """An explicit `languages` argument is applied to every element, for each listed format."""
    doc_path = example_doc_path(f"language-docs/eng_spa_mult.{file_extension}")

    elements = partition(doc_path, languages=["deu"])

    assert all(e.metadata.languages == ["deu"] for e in elements)
|
|
|
|
|
|
|
|
|
|
|
2024-07-08 14:25:17 -07:00
|
|
|
|
# -- include_page_breaks --------------------------------------------------
|
|
|
|
|
|
|
|
|
|
|
2024-07-11 12:57:28 -07:00
|
|
|
|
def test_auto_partition_forwards_include_page_breaks_to_partition_pdf():
    """With `include_page_breaks=True` at least one "PageBreak" element is produced for a PDF."""
    pdf_path = example_doc_path("pdf/layout-parser-paper-fast.pdf")

    elements = partition(pdf_path, include_page_breaks=True, strategy=PartitionStrategy.HI_RES)

    categories = [element.category for element in elements]
    assert "PageBreak" in categories
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# -- metadata_filename ----------------------------------------------------
|
|
|
|
|
|
|
|
|
|
|
2024-07-11 12:57:28 -07:00
|
|
|
|
def test_auto_partition_forwards_metadata_filename_via_kwargs():
    """`metadata_filename` becomes the `.metadata.filename` of each element of a file-like doc."""
    assigned_name = "much-more-interesting-name.txt"
    with open(example_doc_path("fake-text.txt"), "rb") as text_file:
        elements = partition(file=text_file, metadata_filename=assigned_name)

    assert all(e.metadata.filename == "much-more-interesting-name.txt" for e in elements)
|
2024-07-08 14:25:17 -07:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# -- ocr_languages --------------------------------------------------------
|
|
|
|
|
|
|
|
|
|
|
2024-07-11 12:57:28 -07:00
|
|
|
|
def test_auto_partition_image_formats_languages_for_tesseract(request: FixtureRequest):
    """`languages=["zh"]` reaches `process_file_with_ocr()` as a "+"-joined `ocr_languages`."""
    process_file_with_ocr_ = function_mock(
        request, "unstructured.partition.pdf_image.ocr.process_file_with_ocr"
    )
    image_path = example_doc_path("img/chi_sim_image.jpeg")

    partition(image_path, strategy=PartitionStrategy.HI_RES, languages=["zh"])

    # -- item [1] of a recorded mock call is its keyword-arguments dict --
    first_call = process_file_with_ocr_.call_args_list[0]
    ocr_kwargs = first_call[1]
    assert ocr_kwargs["ocr_languages"] == "chi_sim+chi_sim_vert+chi_tra+chi_tra_vert"
|
2024-07-08 14:25:17 -07:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.parametrize(("languages", "ocr_languages"), [(["auto"], ""), (["eng"], "")])
def test_auto_partition_ignores_empty_string_for_ocr_languages(
    languages: list[str], ocr_languages: str
):
    """An empty `ocr_languages` string still leaves every element tagged `["eng"]`."""
    txt_path = example_doc_path("book-war-and-peace-1p.txt")
    language_kwargs = {"ocr_languages": ocr_languages, "languages": languages}

    elements = partition(txt_path, strategy=PartitionStrategy.OCR_ONLY, **language_kwargs)

    assert all(e.metadata.languages == ["eng"] for e in elements)
|
2024-07-08 14:25:17 -07:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_auto_partition_warns_with_ocr_languages(caplog: LogCaptureFixture):
    """Passing `ocr_languages` logs a deprecation warning as the first log record."""
    pdf_path = example_doc_path("pdf/chevron-page.pdf")

    partition(pdf_path, strategy=PartitionStrategy.HI_RES, ocr_languages="eng")

    first_record = caplog.records[0]
    assert first_record.levelname == "WARNING"
    assert "The ocr_languages kwarg will be deprecated" in caplog.text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# -- skip_infer_table_types -----------------------------------------------
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.parametrize(
    ("skip_infer_table_types", "filename", "has_text_as_html"),
    [
        (["xlsx"], "stanley-cups.xlsx", False),
        ([], "stanley-cups.xlsx", True),
        (["odt"], "fake.odt", False),
        ([], "fake.odt", True),
    ],
)
def test_auto_partition_respects_skip_infer_table_types(
    skip_infer_table_types: list[str], filename: str, has_text_as_html: bool
):
    """Tables have `.metadata.text_as_html` only when their type is not in the skip list."""
    with open(example_doc_path(filename), "rb") as doc_file:
        elements = partition(file=doc_file, skip_infer_table_types=skip_infer_table_types)

    tables = [element for element in elements if isinstance(element, Table)]
    # -- guard against a vacuous pass: the document must contain at least one table --
    assert tables
    assert all((table.metadata.text_as_html is not None) == has_text_as_html for table in tables)
|
2024-07-08 14:25:17 -07:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# ================================================================================================
|
|
|
|
|
# METADATA BEHAVIORS
|
|
|
|
|
# ================================================================================================
|
|
|
|
|
|
|
|
|
|
# -- .filetype ------------------------------------------------------------
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.parametrize(
    ("content_type", "shortname", "expected_value"),
    [
        ("text/csv", "csv", "text/csv"),
        ("text/html", "html", "text/html"),
        ("jdsfjdfsjkds", "pdf", None),
    ],
)
def test_auto_partition_adds_filetype_to_metadata(
    request: FixtureRequest,
    content_type: str,
    shortname: str,
    expected_value: str | None,
):
    """Elements returned by a (mocked) partitioner get `.metadata.filetype` set as expected."""
    partitioner_qname = f"unstructured.partition.{shortname}.partition_{shortname}"
    stub_elements = [Text("text 1"), Text("text 2")]
    partition_fn_ = function_mock(request, partitioner_qname, return_value=stub_elements)
    # -- the loader is patched so the mocked partitioner is what `partition()` dispatches to --
    partitioner_loader_get_ = method_mock(
        request, _PartitionerLoader, "get", return_value=partition_fn_
    )
    pdf_path = example_doc_path("pdf/layout-parser-paper-fast.pdf")

    elements = partition(pdf_path, content_type=content_type)

    partitioner_loader_get_.assert_called_once()
    assert len(elements) == 2
    assert all(e.metadata.filetype == expected_value for e in elements)
|
2024-07-08 14:25:17 -07:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.parametrize(
    "content_type",
    [
        # -- content-type provided as argument --
        "application/pdf",
        # -- auto-detected content-type --
        None,
    ],
)
def test_auto_partition_overwrites_any_filetype_applied_by_file_specific_partitioner(
    request: FixtureRequest, content_type: str | None
):
    """A `filetype` set by the partitioner itself is replaced with "application/pdf"."""
    # -- both stub elements share one metadata object carrying a bogus filetype --
    bogus_metadata = ElementMetadata(filetype="imapdf")
    stub_elements = [
        Text("text 1", metadata=bogus_metadata),
        Text("text 2", metadata=bogus_metadata),
    ]
    partition_pdf_ = function_mock(
        request, "unstructured.partition.pdf.partition_pdf", return_value=stub_elements
    )
    partitioner_loader_get_ = method_mock(
        request, _PartitionerLoader, "get", return_value=partition_pdf_
    )
    pdf_path = example_doc_path("pdf/layout-parser-paper-fast.pdf")

    elements = partition(pdf_path, content_type=content_type)

    partitioner_loader_get_.assert_called_once_with(ANY, FileType.PDF)
    assert len(elements) == 2
    assert all(e.metadata.filetype == "application/pdf" for e in elements)
|
2024-07-08 14:25:17 -07:00
|
|
|
|
|
|
|
|
|
|
2024-07-11 12:57:28 -07:00
|
|
|
|
@pytest.mark.parametrize(
    ("file_name", "file_type"),
    [
        ("stanley-cups.csv", FileType.CSV),
        ("simple.doc", FileType.DOC),
        ("simple.docx", FileType.DOCX),
        # NOTE(review): another test in this module loads "eml/fake-email.eml" -- confirm that
        # this un-prefixed path also resolves.
        ("fake-email.eml", FileType.EML),
        ("simple.epub", FileType.EPUB),
        ("fake-html.html", FileType.HTML),
        ("README.md", FileType.MD),
        ("fake-email.msg", FileType.MSG),
        ("simple.odt", FileType.ODT),
        ("pdf/DA-1p.pdf", FileType.PDF),
        ("fake-power-point.ppt", FileType.PPT),
        ("simple.pptx", FileType.PPTX),
        ("README.rst", FileType.RST),
        ("fake-doc.rtf", FileType.RTF),
        ("stanley-cups.tsv", FileType.TSV),
        ("fake-text.txt", FileType.TXT),
        # NOTE(review): an .xls document is paired with FileType.XLSX -- confirm this is intended.
        ("tests-example.xls", FileType.XLSX),
        ("stanley-cups.xlsx", FileType.XLSX),
        ("factbook.xml", FileType.XML),
    ],
)
def test_auto_partition_applies_the_correct_filetype_for_all_filetypes(
    file_name: str, file_type: FileType
):
    """Each file-type's own partitioner tags its elements with that file-type's MIME-type."""
    # -- resolve the partitioner function from the names recorded on the file-type --
    partitioner_module = import_module(file_type.partitioner_module_qname)
    partition_fn = getattr(partitioner_module, file_type.partitioner_function_name)

    # -- partition the example-doc for this filetype --
    elements = partition_fn(example_doc_path(file_name), process_attachments=False)

    assert elements
    # -- elements without a filetype are not checked --
    typed_elements = [e for e in elements if e.metadata.filetype is not None]
    assert all(e.metadata.filetype == file_type.mime_type for e in typed_elements)
|
|
|
|
|
|
2024-07-08 14:25:17 -07:00
|
|
|
|
|
2025-02-20 14:00:25 +01:00
|
|
|
|
def test_detect_filetype_maps_file_to_bytes_io_when_spooled_temp_file_used(mocker):
    """`detect_filetype()` accepts a `SpooledTemporaryFile` and exposes its content as text.

    `_FileTypeDetector` is replaced with a mock, so the assertion is made on the detection-context
    argument that mock receives rather than on the detected file-type.
    """
    detect_filetype_mock = MagicMock(return_value=FileType.JSON)
    mocker.patch("unstructured.file_utils.filetype._FileTypeDetector", detect_filetype_mock)
    with tempfile.SpooledTemporaryFile() as f:
        f.write(b'{"text": Hello, world!}')
        # -- rewind so the content just written can be read back from the start --
        f.seek(0)
        detect_filetype(file=f)
        # -- first positional argument of the recorded `.file_type(...)` call --
        file_detection_context = detect_filetype_mock.file_type.call_args[0][0]
        assert file_detection_context.text_head == '{"text": Hello, world!}'
|
|
|
|
|
|
|
|
|
|
|
2024-07-08 14:25:17 -07:00
|
|
|
|
# -- .languages -----------------------------------------------------------
|
|
|
|
|
|
|
|
|
|
|
2024-07-11 12:57:28 -07:00
|
|
|
|
def test_auto_partition_passes_user_provided_languages_arg_to_PDF():
    """An explicit `languages=["eng"]` ends up on every element of an OCR-only PDF partition."""
    pdf_path = example_doc_path("pdf/chevron-page.pdf")

    elements = partition(pdf_path, strategy=PartitionStrategy.OCR_ONLY, languages=["eng"])

    assert all(e.metadata.languages == ["eng"] for e in elements)
|
2024-07-08 14:25:17 -07:00
|
|
|
|
|
|
|
|
|
|
2024-07-11 12:57:28 -07:00
|
|
|
|
def test_auto_partition_languages_argument_default_to_None_when_omitted():
    """Only elements with empty text are left with `.metadata.languages` of `None`."""
    docx_path = example_doc_path("handbook-1p.docx")

    elements = partition(docx_path, detect_language_per_element=True)

    # -- PageBreak and any other element with no text is assigned `None` --
    without_languages = [e for e in elements if e.metadata.languages is None]
    assert all(e.text == "" for e in without_languages)
|
2023-10-10 20:47:56 -05:00
|
|
|
|
|
|
|
|
|
|
2024-07-11 12:57:28 -07:00
|
|
|
|
def test_auto_partition_default_does_not_overwrite_other_defaults():
    """`partition()` ["eng"] default does not overwrite ["auto"] default in other partitioners."""
    # the default for `languages` is ["auto"] in partition_text
    from unstructured.partition.text import partition_text

    # Use a document that is primarily in a language other than English
    file_path = example_doc_path("language-docs/UDHR_first_article_all.txt")

    # -- baseline: languages assigned when the text partitioner is called directly --
    direct_languages = partition_text(file_path)[0].metadata.languages
    assert direct_languages != ["eng"]

    # -- going through `partition()` must give the same languages --
    auto_languages = partition(file_path)[0].metadata.languages
    assert auto_languages != ["eng"]
    assert auto_languages == direct_languages
|
2023-10-14 17:46:24 -05:00
|
|
|
|
|
|
|
|
|
|
2024-07-08 14:25:17 -07:00
|
|
|
|
# ================================================================================================
|
|
|
|
|
# MISCELLANEOUS BEHAVIORS
|
|
|
|
|
# ================================================================================================
|
2023-11-07 18:44:58 -06:00
|
|
|
|
|
|
|
|
|
|
2024-07-11 12:57:28 -07:00
|
|
|
|
def test_auto_partition_from_filename_works_on_empty_file():
    """Partitioning an empty text file by path returns an empty list."""
    elements = partition(example_doc_path("empty.txt"))

    assert elements == []
|
2024-01-17 17:50:36 -05:00
|
|
|
|
|
|
|
|
|
|
2024-07-11 12:57:28 -07:00
|
|
|
|
def test_auto_partition_from_file_works_on_empty_file():
    """Partitioning an empty text file from an open binary file returns an empty list."""
    with open(example_doc_path("empty.txt"), "rb") as empty_file:
        elements = partition(file=empty_file)

    assert elements == []
|
2024-01-17 17:50:36 -05:00
|
|
|
|
|
2024-02-07 17:31:49 -05:00
|
|
|
|
|
2024-07-21 23:03:55 -07:00
|
|
|
|
def test_auto_partition_that_requires_extras_raises_when_dependencies_are_not_installed(
    request: FixtureRequest,
):
    """`partition()` raises `ImportError` for a PDF when `dependency_exists()` reports False."""
    # -- drop any stored PDF partitioner entry from the loader before partitioning --
    _PartitionerLoader._partitioners.pop(FileType.PDF, None)
    dependency_exists_ = function_mock(
        request, "unstructured.partition.auto.dependency_exists", return_value=False
    )
    pdf_path = example_doc_path("pdf/layout-parser-paper-fast.pdf")
    msg = r"partition_pdf\(\) is not available because one or more dependencies are not installed"

    with pytest.raises(ImportError, match=msg):
        partition(pdf_path)

    dependency_exists_.assert_called_once_with("pdf2image")
|
2024-02-07 17:31:49 -05:00
|
|
|
|
|
2024-07-11 12:57:28 -07:00
|
|
|
|
|
|
|
|
|
# ================================================================================================
|
|
|
|
|
# MODULE-LEVEL FIXTURES
|
|
|
|
|
# ================================================================================================
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.fixture()
def expected_docx_elements():
    """Fixture returning a fixed list of eight expected elements, in order."""
    elements = [
        Title("These are a few of my favorite things:"),
        ListItem("Parrots"),
        ListItem("Hockey"),
        Text("Analysis"),
        NarrativeText("This is my first thought. This is my second thought."),
        NarrativeText("This is my third thought."),
        Text("2023"),
        Address("DOYLESTOWN, PA 18901"),
    ]
    return elements
|
2025-03-06 17:09:42 -05:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _test_partition_foo():
    """Do-nothing stand-in used as the partitioner function for a custom file type."""
    return None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_auto_partition_works_with_custom_types(
    request: FixtureRequest,
):
    """A partitioner registered for a custom file type is what `_PartitionerLoader.get()` returns."""
    foo_type = create_file_type(
        "FOO", canonical_mime_type="application/foo", extensions=[".foo"]
    )

    # -- `register_partitioner()` is a decorator factory; apply it by hand to the stub --
    decorate = register_partitioner(foo_type)
    decorate(_test_partition_foo)

    resolved = _PartitionerLoader().get(foo_type)
    assert resolved is _test_partition_foo
|