mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-12-04 19:16:03 +00:00
rfctr(auto): fix auto-partition test xfails and skips (#3367)
**Summary** Improve expression in auto-partition tests and fix xfails and skips. Add issues for the two hard-fails where xfail needed to stay.
This commit is contained in:
parent
543057317f
commit
0c562d8050
@ -1,4 +1,4 @@
|
||||
## 0.14.11-dev4
|
||||
## 0.14.11-dev5
|
||||
|
||||
### Enhancements
|
||||
|
||||
|
||||
127
example-docs/simple.json
Normal file
127
example-docs/simple.json
Normal file
@ -0,0 +1,127 @@
|
||||
[
|
||||
{
|
||||
"element_id": "a06d2d9e65212d4aa955c3ab32950ffa",
|
||||
"metadata": {
|
||||
"category_depth": 0,
|
||||
"file_directory": "unstructured/example-docs",
|
||||
"filename": "simple.docx",
|
||||
"filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||
"languages": [
|
||||
"eng"
|
||||
],
|
||||
"last_modified": "2024-07-06T16:44:51"
|
||||
},
|
||||
"text": "These are a few of my favorite things:",
|
||||
"type": "Title"
|
||||
},
|
||||
{
|
||||
"element_id": "b334c93e9b1cbca3b6f6d78ce8bc2484",
|
||||
"metadata": {
|
||||
"category_depth": 0,
|
||||
"file_directory": "unstructured/example-docs",
|
||||
"filename": "simple.docx",
|
||||
"filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||
"languages": [
|
||||
"eng"
|
||||
],
|
||||
"last_modified": "2024-07-06T16:44:51",
|
||||
"parent_id": "a06d2d9e65212d4aa955c3ab32950ffa"
|
||||
},
|
||||
"text": "Parrots",
|
||||
"type": "ListItem"
|
||||
},
|
||||
{
|
||||
"element_id": "76469ecb9f1459943c8d8cca1a550b5a",
|
||||
"metadata": {
|
||||
"category_depth": 0,
|
||||
"file_directory": "unstructured/example-docs",
|
||||
"filename": "simple.docx",
|
||||
"filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||
"languages": [
|
||||
"eng"
|
||||
],
|
||||
"last_modified": "2024-07-06T16:44:51",
|
||||
"parent_id": "a06d2d9e65212d4aa955c3ab32950ffa"
|
||||
},
|
||||
"text": "Hockey",
|
||||
"type": "ListItem"
|
||||
},
|
||||
{
|
||||
"element_id": "261fac731945a138415adc2dd4434b17",
|
||||
"metadata": {
|
||||
"category_depth": 0,
|
||||
"file_directory": "unstructured/example-docs",
|
||||
"filename": "simple.docx",
|
||||
"filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||
"languages": [
|
||||
"eng"
|
||||
],
|
||||
"last_modified": "2024-07-06T16:44:51"
|
||||
},
|
||||
"text": "Analysis",
|
||||
"type": "Title"
|
||||
},
|
||||
{
|
||||
"element_id": "95f392d32c5271bfdb30eaef45921e59",
|
||||
"metadata": {
|
||||
"category_depth": 0,
|
||||
"file_directory": "unstructured/example-docs",
|
||||
"filename": "simple.docx",
|
||||
"filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||
"languages": [
|
||||
"eng"
|
||||
],
|
||||
"last_modified": "2024-07-06T16:44:51",
|
||||
"parent_id": "261fac731945a138415adc2dd4434b17"
|
||||
},
|
||||
"text": "This is my first thought. This is my second thought.",
|
||||
"type": "NarrativeText"
|
||||
},
|
||||
{
|
||||
"element_id": "0de25bd6f0d74bc4f909f2678f385736",
|
||||
"metadata": {
|
||||
"category_depth": 0,
|
||||
"file_directory": "unstructured/example-docs",
|
||||
"filename": "simple.docx",
|
||||
"filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||
"languages": [
|
||||
"eng"
|
||||
],
|
||||
"last_modified": "2024-07-06T16:44:51",
|
||||
"parent_id": "261fac731945a138415adc2dd4434b17"
|
||||
},
|
||||
"text": "This is my third thought.",
|
||||
"type": "NarrativeText"
|
||||
},
|
||||
{
|
||||
"element_id": "f296a3bc8a901f19199fda1da92829b6",
|
||||
"metadata": {
|
||||
"category_depth": 0,
|
||||
"file_directory": "unstructured/example-docs",
|
||||
"filename": "simple.docx",
|
||||
"filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||
"languages": [
|
||||
"eng"
|
||||
],
|
||||
"last_modified": "2024-07-06T16:44:51",
|
||||
"parent_id": "261fac731945a138415adc2dd4434b17"
|
||||
},
|
||||
"text": "2023",
|
||||
"type": "UncategorizedText"
|
||||
},
|
||||
{
|
||||
"element_id": "78c62edbc674fdca0f6a0e3ffb459f86",
|
||||
"metadata": {
|
||||
"category_depth": 0,
|
||||
"file_directory": "unstructured/example-docs",
|
||||
"filename": "simple.docx",
|
||||
"filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||
"languages": [
|
||||
"eng"
|
||||
],
|
||||
"last_modified": "2024-07-06T16:44:51"
|
||||
},
|
||||
"text": "DOYLESTOWN, PA 18901",
|
||||
"type": "Address"
|
||||
}
|
||||
]
|
||||
File diff suppressed because it is too large
Load Diff
@ -5,6 +5,7 @@ from __future__ import annotations
|
||||
import json
|
||||
import os
|
||||
import pathlib
|
||||
import sys
|
||||
import tempfile
|
||||
import warnings
|
||||
from importlib import import_module
|
||||
@ -51,19 +52,7 @@ from unstructured.partition import auto
|
||||
from unstructured.partition.auto import _get_partition_with_extras, partition
|
||||
from unstructured.partition.common import convert_office_doc
|
||||
from unstructured.partition.utils.constants import PartitionStrategy
|
||||
from unstructured.staging.base import elements_to_json
|
||||
|
||||
DIRECTORY = pathlib.Path(__file__).parent.resolve()
|
||||
EXAMPLE_DOCS_DIRECTORY = os.path.join(DIRECTORY, "..", "..", "example-docs")
|
||||
|
||||
EXPECTED_EMAIL_OUTPUT = [
|
||||
NarrativeText(text="This is a test email to use for unit tests."),
|
||||
Title(text="Important points:"),
|
||||
ListItem(text="Roses are red"),
|
||||
ListItem(text="Violets are blue"),
|
||||
]
|
||||
|
||||
EML_TEST_FILE = "eml/fake-email.eml"
|
||||
from unstructured.staging.base import elements_from_json, elements_to_dicts, elements_to_json
|
||||
|
||||
is_in_docker = os.path.exists("/.dockerenv")
|
||||
|
||||
@ -98,7 +87,6 @@ def test_auto_partition_csv_from_file():
|
||||
# ================================================================================================
|
||||
|
||||
|
||||
@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
|
||||
@pytest.mark.parametrize(
|
||||
("pass_metadata_filename", "content_type"),
|
||||
[(False, None), (False, "application/msword"), (True, "application/msword"), (True, None)],
|
||||
@ -126,20 +114,30 @@ def test_auto_partition_doc_with_filename(
|
||||
assert elements[0].metadata.file_directory == str(tmp_path)
|
||||
|
||||
|
||||
# NOTE(robinson) - the application/x-ole-storage mime type is not specific enough to
|
||||
# determine that the file is a .doc document
|
||||
@pytest.mark.xfail()
|
||||
def test_auto_partition_doc_with_file(
|
||||
mock_docx_document: Document, expected_docx_elements: list[Element], tmp_path: pathlib.Path
|
||||
):
|
||||
docx_filename = str(tmp_path / "mock_document.docx")
|
||||
doc_filename = str(tmp_path / "mock_document.doc")
|
||||
mock_docx_document.save(docx_filename)
|
||||
convert_office_doc(docx_filename, str(tmp_path), "doc")
|
||||
@pytest.mark.skipif(is_in_docker, reason="Passes in CI but not Docker. Remove skip on #3364 fix.")
|
||||
@pytest.mark.xfail(sys.platform == "darwin", reason="#3364", raises=KeyError, strict=True)
|
||||
def test_auto_partition_doc_with_file():
|
||||
# -- NOTE(scanny): https://github.com/Unstructured-IO/unstructured/issues/3364
|
||||
# -- detect_filetype() identifies .doc as `application/x-ole-storage` which is true but not
|
||||
# -- specific enough. The `FileType.MSG` file-type is assigned (which is also an OLE file)
|
||||
# -- and `partition()` routes the document to `partition_msg` which is where the `KeyError`
|
||||
# -- comes from.
|
||||
# -- For some reason, this xfail problem only occurs locally, not in CI, possibly because we
|
||||
# -- use two different `libmagic` sources (`libmagic` on CI and `libmagic1` on Mac). Doesn't
|
||||
# -- matter much though because when we add disambiguation they'll both get it right.
|
||||
with open(example_doc_path("simple.doc"), "rb") as f:
|
||||
elements = partition(file=f)
|
||||
|
||||
with open(doc_filename, "rb") as f:
|
||||
elements = partition(file=f, strategy=PartitionStrategy.HI_RES)
|
||||
assert elements == expected_docx_elements
|
||||
assert elements == [
|
||||
Title("These are a few of my favorite things:"),
|
||||
ListItem("Parrots"),
|
||||
ListItem("Hockey"),
|
||||
Title("Analysis"),
|
||||
NarrativeText("This is my first thought. This is my second thought."),
|
||||
NarrativeText("This is my third thought."),
|
||||
Text("2023"),
|
||||
Address("DOYLESTOWN, PA 18901"),
|
||||
]
|
||||
|
||||
|
||||
# ================================================================================================
|
||||
@ -184,21 +182,21 @@ def expected_docx_elements():
|
||||
def test_auto_partition_docx_with_filename(
|
||||
mock_docx_document: Document, expected_docx_elements: list[Element], tmp_path: pathlib.Path
|
||||
):
|
||||
filename = str(tmp_path / "mock_document.docx")
|
||||
mock_docx_document.save(filename)
|
||||
file_path = str(tmp_path / "mock_document.docx")
|
||||
mock_docx_document.save(file_path)
|
||||
|
||||
elements = partition(filename=filename, strategy=PartitionStrategy.HI_RES)
|
||||
elements = partition(filename=file_path, strategy=PartitionStrategy.HI_RES)
|
||||
assert elements == expected_docx_elements
|
||||
assert elements[0].metadata.filename == os.path.basename(filename)
|
||||
assert elements[0].metadata.filename == os.path.basename(file_path)
|
||||
|
||||
|
||||
def test_auto_partition_docx_with_file(
|
||||
mock_docx_document: Document, expected_docx_elements: list[Element], tmp_path: pathlib.Path
|
||||
):
|
||||
filename = str(tmp_path / "mock_document.docx")
|
||||
mock_docx_document.save(filename)
|
||||
file_path = str(tmp_path / "mock_document.docx")
|
||||
mock_docx_document.save(file_path)
|
||||
|
||||
with open(filename, "rb") as f:
|
||||
with open(file_path, "rb") as f:
|
||||
elements = partition(file=f, strategy=PartitionStrategy.HI_RES)
|
||||
assert elements == expected_docx_elements
|
||||
|
||||
@ -246,34 +244,32 @@ def test_partition_forwards_strategy_arg_to_partition_docx_and_its_brokers(
|
||||
# EML
|
||||
# ================================================================================================
|
||||
|
||||
EXPECTED_EMAIL_OUTPUT = [
|
||||
NarrativeText(text="This is a test email to use for unit tests."),
|
||||
Title(text="Important points:"),
|
||||
ListItem(text="Roses are red"),
|
||||
ListItem(text="Violets are blue"),
|
||||
]
|
||||
|
||||
|
||||
def test_auto_partition_email_from_filename():
|
||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, EML_TEST_FILE)
|
||||
elements = partition(filename=filename, strategy=PartitionStrategy.HI_RES)
|
||||
file_path = example_doc_path("eml/fake-email.eml")
|
||||
elements = partition(file_path, strategy=PartitionStrategy.HI_RES)
|
||||
assert len(elements) > 0
|
||||
assert elements == EXPECTED_EMAIL_OUTPUT
|
||||
assert elements[0].metadata.filename == os.path.basename(filename)
|
||||
assert elements[0].metadata.file_directory == os.path.split(filename)[0]
|
||||
assert elements[0].metadata.filename == os.path.basename(file_path)
|
||||
assert elements[0].metadata.file_directory == os.path.split(file_path)[0]
|
||||
|
||||
|
||||
def test_auto_partition_email_from_file():
|
||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, EML_TEST_FILE)
|
||||
with open(filename, "rb") as f:
|
||||
elements = partition(file=f, strategy=PartitionStrategy.HI_RES)
|
||||
assert len(elements) > 0
|
||||
assert elements == EXPECTED_EMAIL_OUTPUT
|
||||
|
||||
|
||||
def test_auto_partition_email_from_file_rb():
|
||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, EML_TEST_FILE)
|
||||
with open(filename, "rb") as f:
|
||||
with open(example_doc_path("eml/fake-email.eml"), "rb") as f:
|
||||
elements = partition(file=f, strategy=PartitionStrategy.HI_RES)
|
||||
assert len(elements) > 0
|
||||
assert elements == EXPECTED_EMAIL_OUTPUT
|
||||
|
||||
|
||||
def test_auto_partition_eml_add_signature_to_metadata():
|
||||
elements = partition(filename="example-docs/eml/signed-doc.p7s")
|
||||
elements = partition(example_doc_path("eml/signed-doc.p7s"))
|
||||
assert len(elements) == 1
|
||||
assert elements[0].text == "This is a test"
|
||||
assert elements[0].metadata.signature == "<SIGNATURE>\n"
|
||||
@ -285,15 +281,13 @@ def test_auto_partition_eml_add_signature_to_metadata():
|
||||
|
||||
|
||||
def test_auto_partition_epub_from_filename():
|
||||
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "winter-sports.epub")
|
||||
elements = partition(filename=filename, strategy=PartitionStrategy.HI_RES)
|
||||
elements = partition(example_doc_path("winter-sports.epub"), strategy=PartitionStrategy.HI_RES)
|
||||
assert len(elements) > 0
|
||||
assert elements[0].text.startswith("The Project Gutenberg eBook of Winter Sports")
|
||||
|
||||
|
||||
def test_auto_partition_epub_from_file():
|
||||
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "winter-sports.epub")
|
||||
with open(filename, "rb") as f:
|
||||
with open(example_doc_path("winter-sports.epub"), "rb") as f:
|
||||
elements = partition(file=f, strategy=PartitionStrategy.HI_RES)
|
||||
assert len(elements) > 0
|
||||
assert elements[0].text.startswith("The Project Gutenberg eBook of Winter Sports")
|
||||
@ -309,17 +303,17 @@ def test_auto_partition_epub_from_file():
|
||||
[(False, None), (False, "text/html"), (True, "text/html"), (True, None)],
|
||||
)
|
||||
def test_auto_partition_html_from_filename(pass_metadata_filename: bool, content_type: str | None):
|
||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "example-10k.html")
|
||||
metadata_filename = filename if pass_metadata_filename else None
|
||||
file_path = example_doc_path("example-10k.html")
|
||||
metadata_filename = file_path if pass_metadata_filename else None
|
||||
elements = partition(
|
||||
filename=filename,
|
||||
filename=file_path,
|
||||
metadata_filename=metadata_filename,
|
||||
content_type=content_type,
|
||||
strategy=PartitionStrategy.HI_RES,
|
||||
)
|
||||
assert len(elements) > 0
|
||||
assert elements[0].metadata.filename == os.path.basename(filename)
|
||||
assert elements[0].metadata.file_directory == os.path.split(filename)[0]
|
||||
assert elements[0].metadata.filename == os.path.basename(file_path)
|
||||
assert elements[0].metadata.file_directory == os.path.split(file_path)[0]
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
@ -327,9 +321,9 @@ def test_auto_partition_html_from_filename(pass_metadata_filename: bool, content
|
||||
[(False, None), (False, "text/html"), (True, "text/html"), (True, None)],
|
||||
)
|
||||
def test_auto_partition_html_from_file(pass_metadata_filename: bool, content_type: str | None):
|
||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-html.html")
|
||||
metadata_filename = filename if pass_metadata_filename else None
|
||||
with open(filename, "rb") as f:
|
||||
file_path = example_doc_path("fake-html.html")
|
||||
metadata_filename = file_path if pass_metadata_filename else None
|
||||
with open(file_path, "rb") as f:
|
||||
elements = partition(
|
||||
file=f,
|
||||
metadata_filename=metadata_filename,
|
||||
@ -340,8 +334,7 @@ def test_auto_partition_html_from_file(pass_metadata_filename: bool, content_typ
|
||||
|
||||
|
||||
def test_auto_partition_html_from_file_rb():
|
||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-html.html")
|
||||
with open(filename, "rb") as f:
|
||||
with open(example_doc_path("fake-html.html"), "rb") as f:
|
||||
elements = partition(file=f, strategy=PartitionStrategy.HI_RES)
|
||||
assert len(elements) > 0
|
||||
|
||||
@ -367,10 +360,10 @@ def test_auto_partition_html_pre_from_file():
|
||||
[(False, None), (False, "image/jpeg"), (True, "image/jpeg"), (True, None)],
|
||||
)
|
||||
def test_auto_partition_image(pass_metadata_filename: bool, content_type: str | None):
|
||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.jpg")
|
||||
metadata_filename = filename if pass_metadata_filename else None
|
||||
file_path = example_doc_path("layout-parser-paper-fast.jpg")
|
||||
metadata_filename = file_path if pass_metadata_filename else None
|
||||
elements = partition(
|
||||
filename=filename,
|
||||
filename=file_path,
|
||||
metadata_filename=metadata_filename,
|
||||
content_type=content_type,
|
||||
strategy=PartitionStrategy.AUTO,
|
||||
@ -405,10 +398,10 @@ def test_auto_partition_image_element_extraction(extract_image_block_to_payload:
|
||||
[(False, None), (False, "image/jpeg"), (True, "image/jpeg"), (True, None)],
|
||||
)
|
||||
def test_auto_partition_jpg(pass_metadata_filename: bool, content_type: str | None):
|
||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.jpg")
|
||||
metadata_filename = filename if pass_metadata_filename else None
|
||||
file_path = example_doc_path("layout-parser-paper-fast.jpg")
|
||||
metadata_filename = file_path if pass_metadata_filename else None
|
||||
elements = partition(
|
||||
filename=filename,
|
||||
filename=file_path,
|
||||
metadata_filename=metadata_filename,
|
||||
content_type=content_type,
|
||||
strategy=PartitionStrategy.AUTO,
|
||||
@ -421,9 +414,9 @@ def test_auto_partition_jpg(pass_metadata_filename: bool, content_type: str | No
|
||||
[(False, None), (False, "image/jpeg"), (True, "image/jpeg"), (True, None)],
|
||||
)
|
||||
def test_auto_partition_jpg_from_file(pass_metadata_filename: bool, content_type: str | None):
|
||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.jpg")
|
||||
metadata_filename = filename if pass_metadata_filename else None
|
||||
with open(filename, "rb") as f:
|
||||
file_path = example_doc_path("layout-parser-paper-fast.jpg")
|
||||
metadata_filename = file_path if pass_metadata_filename else None
|
||||
with open(file_path, "rb") as f:
|
||||
elements = partition(
|
||||
file=f,
|
||||
metadata_filename=metadata_filename,
|
||||
@ -454,19 +447,10 @@ def test_partition_image_with_bmp_with_auto(tmp_path: pathlib.Path):
|
||||
# ================================================================================================
|
||||
|
||||
|
||||
# NOTE(robinson) - skipping this test with docker image to avoid putting the
|
||||
# test fixtures into the image
|
||||
@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
|
||||
def test_auto_partitioned_json_output_maintains_consistency_with_fixture_elements():
|
||||
"""Test auto-processing an unstructured json output file by filename."""
|
||||
json_file_path = example_doc_path("spring-weather.html.json")
|
||||
original_file_name = "spring-weather.html"
|
||||
json_file_path = (
|
||||
pathlib.Path(DIRECTORY).parents[1]
|
||||
/ "test_unstructured_ingest"
|
||||
/ "expected-structured-output"
|
||||
/ "azure"
|
||||
/ f"{original_file_name}.json"
|
||||
)
|
||||
with open(json_file_path) as json_f:
|
||||
expected_result = json.load(json_f)
|
||||
|
||||
@ -495,52 +479,41 @@ def test_auto_partition_json_raises_with_unprocessable_json(tmp_path: pathlib.Pa
|
||||
# per the Unstructured ISD format
|
||||
text = '{"hi": "there"}'
|
||||
|
||||
filename = str(tmp_path / "unprocessable.json")
|
||||
with open(filename, "w") as f:
|
||||
file_path = str(tmp_path / "unprocessable.json")
|
||||
with open(file_path, "w") as f:
|
||||
f.write(text)
|
||||
|
||||
with pytest.raises(ValueError):
|
||||
partition(filename=filename)
|
||||
partition(filename=file_path)
|
||||
|
||||
|
||||
@pytest.mark.xfail(
|
||||
reason="parsed as text not json, https://github.com/Unstructured-IO/unstructured/issues/492",
|
||||
reason=(
|
||||
"https://github.com/Unstructured-IO/unstructured/issues/3365"
|
||||
" partition_json() does not preserve original element-id or metadata"
|
||||
),
|
||||
raises=AssertionError,
|
||||
strict=True,
|
||||
)
|
||||
def test_auto_partition_json_from_file():
|
||||
"""Test auto-processing an unstructured json output file by file handle."""
|
||||
filename = os.path.join(
|
||||
EXAMPLE_DOCS_DIRECTORY,
|
||||
"..",
|
||||
"test_unstructured_ingest",
|
||||
"expected-structured-output",
|
||||
"azure-blob-storage",
|
||||
"spring-weather.html.json",
|
||||
)
|
||||
with open(filename) as json_f:
|
||||
json_data = json.load(json_f)
|
||||
with open(filename, "rb") as partition_f:
|
||||
json_elems = json.loads(
|
||||
cast(
|
||||
str,
|
||||
elements_to_json(partition(file=partition_f, strategy=PartitionStrategy.HI_RES)),
|
||||
)
|
||||
)
|
||||
for elem in json_elems:
|
||||
# coordinates are always in the element data structures, even if None
|
||||
elem.pop("coordinates")
|
||||
elem.pop("coordinate_system")
|
||||
assert json_data == json_elems
|
||||
def test_auto_partition_json_from_file_preserves_original_elements():
|
||||
file_path = example_doc_path("simple.json")
|
||||
original_elements = elements_from_json(file_path)
|
||||
|
||||
with open(file_path, "rb") as f:
|
||||
partitioned_elements = partition(file=f)
|
||||
|
||||
assert elements_to_dicts(partitioned_elements) == elements_to_dicts(original_elements)
|
||||
|
||||
|
||||
def test_auto_partition_works_with_unstructured_jsons():
|
||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "spring-weather.html.json")
|
||||
elements = partition(filename=filename, strategy=PartitionStrategy.HI_RES)
|
||||
elements = partition(
|
||||
example_doc_path("spring-weather.html.json"), strategy=PartitionStrategy.HI_RES
|
||||
)
|
||||
assert elements[0].text == "News Around NOAA"
|
||||
|
||||
|
||||
def test_auto_partition_works_with_unstructured_jsons_from_file():
|
||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "spring-weather.html.json")
|
||||
with open(filename, "rb") as f:
|
||||
with open(example_doc_path("spring-weather.html.json"), "rb") as f:
|
||||
elements = partition(file=f, strategy=PartitionStrategy.HI_RES)
|
||||
assert elements[0].text == "News Around NOAA"
|
||||
|
||||
@ -570,8 +543,7 @@ EXPECTED_MSG_OUTPUT = [
|
||||
|
||||
|
||||
def test_auto_partition_msg_from_filename():
|
||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.msg")
|
||||
elements = partition(filename=filename, strategy=PartitionStrategy.HI_RES)
|
||||
elements = partition(example_doc_path("fake-email.msg"), strategy=PartitionStrategy.HI_RES)
|
||||
assert elements == EXPECTED_MSG_OUTPUT
|
||||
|
||||
|
||||
@ -581,14 +553,12 @@ def test_auto_partition_msg_from_filename():
|
||||
|
||||
|
||||
def test_auto_partition_odt_from_filename():
|
||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake.odt")
|
||||
elements = partition(filename=filename, strategy=PartitionStrategy.HI_RES)
|
||||
elements = partition(example_doc_path("fake.odt"), strategy=PartitionStrategy.HI_RES)
|
||||
assert elements[0] == Title("Lorem ipsum dolor sit amet.")
|
||||
|
||||
|
||||
def test_auto_partition_odt_from_file():
|
||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake.odt")
|
||||
with open(filename, "rb") as f:
|
||||
with open(example_doc_path("fake.odt"), "rb") as f:
|
||||
elements = partition(file=f, strategy=PartitionStrategy.HI_RES)
|
||||
|
||||
assert elements[0] == Title("Lorem ipsum dolor sit amet.")
|
||||
@ -623,54 +593,56 @@ def test_auto_partition_org_from_file():
|
||||
("pass_metadata_filename", "content_type"),
|
||||
[(False, None), (False, "application/pdf"), (True, "application/pdf"), (True, None)],
|
||||
)
|
||||
def test_auto_partition_pdf_from_filename(
|
||||
request: FixtureRequest, pass_metadata_filename: bool, content_type: str | None
|
||||
):
|
||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.pdf")
|
||||
metadata_filename = filename if pass_metadata_filename else None
|
||||
def test_auto_partition_pdf_from_filename(pass_metadata_filename: bool, content_type: str | None):
|
||||
file_path = example_doc_path("layout-parser-paper-fast.pdf")
|
||||
metadata_filename = file_path if pass_metadata_filename else None
|
||||
|
||||
elements = partition(
|
||||
filename=filename,
|
||||
filename=file_path,
|
||||
metadata_filename=metadata_filename,
|
||||
content_type=content_type,
|
||||
strategy=PartitionStrategy.HI_RES,
|
||||
)
|
||||
|
||||
# NOTE(alan): Xfail since new model skips the word Zejiang
|
||||
request.applymarker(pytest.mark.xfail)
|
||||
# NOTE(scanny): gave up trying to figure out why, but this file partitions differently locally
|
||||
# (on Mac) than it does in CI. Basically the first element when partitioning locally is split
|
||||
# in two when partitioning on CI. Other than that split the text is exactly the same.
|
||||
idx = 2 if sys.platform == "darwin" else 3
|
||||
|
||||
idx = 3
|
||||
assert isinstance(elements[idx], Title)
|
||||
assert elements[idx].text.startswith("LayoutParser")
|
||||
e = elements[idx]
|
||||
assert isinstance(e, Title)
|
||||
assert e.text.startswith("LayoutParser")
|
||||
assert e.metadata.filename == os.path.basename(file_path)
|
||||
assert e.metadata.file_directory == os.path.split(file_path)[0]
|
||||
|
||||
assert elements[idx].metadata.filename == os.path.basename(filename)
|
||||
assert elements[idx].metadata.file_directory == os.path.split(filename)[0]
|
||||
|
||||
idx += 1
|
||||
assert isinstance(elements[idx], NarrativeText)
|
||||
assert elements[idx].text.startswith("Zejiang Shen")
|
||||
e = elements[idx + 1]
|
||||
assert isinstance(e, NarrativeText)
|
||||
assert e.text.startswith("Zejiang Shen")
|
||||
|
||||
|
||||
def test_auto_partition_pdf_uses_table_extraction():
|
||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.pdf")
|
||||
with patch(
|
||||
"unstructured.partition.pdf_image.ocr.process_file_with_ocr",
|
||||
) as mock_process_file_with_model:
|
||||
partition(filename, pdf_infer_table_structure=True, strategy=PartitionStrategy.HI_RES)
|
||||
partition(
|
||||
example_doc_path("layout-parser-paper-fast.pdf"),
|
||||
pdf_infer_table_structure=True,
|
||||
strategy=PartitionStrategy.HI_RES,
|
||||
)
|
||||
assert mock_process_file_with_model.call_args[1]["infer_table_structure"]
|
||||
|
||||
|
||||
def test_auto_partition_pdf_with_fast_strategy(monkeypatch: MonkeyPatch):
|
||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.pdf")
|
||||
file_path = example_doc_path("layout-parser-paper-fast.pdf")
|
||||
|
||||
mock_return = [NarrativeText("Hello there!")]
|
||||
with patch.object(auto, "partition_pdf", return_value=mock_return) as mock_partition:
|
||||
mock_partition_with_extras_map = {"pdf": mock_partition}
|
||||
monkeypatch.setattr(auto, "PARTITION_WITH_EXTRAS_MAP", mock_partition_with_extras_map)
|
||||
partition(filename=filename, strategy=PartitionStrategy.FAST)
|
||||
partition(filename=file_path, strategy=PartitionStrategy.FAST)
|
||||
|
||||
mock_partition.assert_called_once_with(
|
||||
filename=filename,
|
||||
filename=file_path,
|
||||
file=None,
|
||||
url=None,
|
||||
strategy=PartitionStrategy.FAST,
|
||||
@ -692,13 +664,11 @@ def test_auto_partition_pdf_with_fast_strategy(monkeypatch: MonkeyPatch):
|
||||
("pass_metadata_filename", "content_type"),
|
||||
[(False, None), (False, "application/pdf"), (True, "application/pdf"), (True, None)],
|
||||
)
|
||||
def test_auto_partition_pdf_from_file(
|
||||
request: FixtureRequest, pass_metadata_filename: bool, content_type: str | None
|
||||
):
|
||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.pdf")
|
||||
metadata_filename = filename if pass_metadata_filename else None
|
||||
def test_auto_partition_pdf_from_file(pass_metadata_filename: bool, content_type: str | None):
|
||||
file_path = example_doc_path("layout-parser-paper-fast.pdf")
|
||||
metadata_filename = file_path if pass_metadata_filename else None
|
||||
|
||||
with open(filename, "rb") as f:
|
||||
with open(file_path, "rb") as f:
|
||||
elements = partition(
|
||||
file=f,
|
||||
metadata_filename=metadata_filename,
|
||||
@ -706,27 +676,28 @@ def test_auto_partition_pdf_from_file(
|
||||
strategy=PartitionStrategy.HI_RES,
|
||||
)
|
||||
|
||||
# NOTE(alan): Xfail since new model skips the word Zejiang
|
||||
request.applymarker(pytest.mark.xfail)
|
||||
# NOTE(scanny): see "with_filename" version of this test above for more on this oddness
|
||||
idx = 2 if sys.platform == "darwin" else 3
|
||||
|
||||
idx = 3
|
||||
assert isinstance(elements[idx], Title)
|
||||
assert elements[idx].text.startswith("LayoutParser")
|
||||
e = elements[idx]
|
||||
assert isinstance(e, Title)
|
||||
assert e.text.startswith("LayoutParser")
|
||||
|
||||
idx += 1
|
||||
assert isinstance(elements[idx], NarrativeText)
|
||||
assert elements[idx].text.startswith("Zejiang Shen")
|
||||
e = elements[idx + 1]
|
||||
assert isinstance(e, NarrativeText)
|
||||
assert e.text.startswith("Zejiang Shen")
|
||||
|
||||
|
||||
def test_partition_pdf_does_not_raise_warning():
|
||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.pdf")
|
||||
# NOTE(robinson): This is the recommended way to check that no warning is emitted,
|
||||
# per the pytest docs.
|
||||
# ref: https://docs.pytest.org/en/7.0.x/how-to/capture-warnings.html
|
||||
# #additional-use-cases-of-warnings-in-tests
|
||||
with warnings.catch_warnings():
|
||||
warnings.simplefilter("error")
|
||||
partition(filename=filename, strategy=PartitionStrategy.HI_RES)
|
||||
partition(
|
||||
example_doc_path("layout-parser-paper-fast.pdf"), strategy=PartitionStrategy.HI_RES
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("extract_image_block_to_payload", [False, True])
|
||||
@ -753,11 +724,11 @@ def test_auto_partition_pdf_element_extraction(extract_image_block_to_payload: b
|
||||
|
||||
@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
|
||||
def test_auto_partition_ppt_from_filename():
|
||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point.ppt")
|
||||
elements = partition(filename=filename, strategy=PartitionStrategy.HI_RES)
|
||||
file_path = example_doc_path("fake-power-point.ppt")
|
||||
elements = partition(file_path, strategy=PartitionStrategy.HI_RES)
|
||||
assert elements == EXPECTED_PPTX_OUTPUT
|
||||
assert elements[0].metadata.filename == os.path.basename(filename)
|
||||
assert elements[0].metadata.file_directory == os.path.split(filename)[0]
|
||||
assert elements[0].metadata.filename == os.path.basename(file_path)
|
||||
assert elements[0].metadata.file_directory == os.path.split(file_path)[0]
|
||||
|
||||
|
||||
# ================================================================================================
|
||||
@ -776,11 +747,11 @@ EXPECTED_PPTX_OUTPUT = [
|
||||
|
||||
|
||||
def test_auto_partition_pptx_from_filename():
|
||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point.pptx")
|
||||
elements = partition(filename=filename, strategy=PartitionStrategy.HI_RES)
|
||||
file_path = example_doc_path("fake-power-point.pptx")
|
||||
elements = partition(file_path, strategy=PartitionStrategy.HI_RES)
|
||||
assert elements == EXPECTED_PPTX_OUTPUT
|
||||
assert elements[0].metadata.filename == os.path.basename(filename)
|
||||
assert elements[0].metadata.file_directory == os.path.split(filename)[0]
|
||||
assert elements[0].metadata.filename == os.path.basename(file_path)
|
||||
assert elements[0].metadata.file_directory == os.path.split(file_path)[0]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("file_name", ["simple.pptx", "fake-power-point.ppt"])
|
||||
@ -848,8 +819,7 @@ def test_auto_partition_rst_from_file():
|
||||
|
||||
|
||||
def test_auto_partition_rtf_from_filename():
|
||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-doc.rtf")
|
||||
elements = partition(filename=filename, strategy=PartitionStrategy.HI_RES)
|
||||
elements = partition(example_doc_path("fake-doc.rtf"), strategy=PartitionStrategy.HI_RES)
|
||||
assert elements[0] == Title("My First Heading")
|
||||
|
||||
|
||||
@ -883,17 +853,16 @@ EXPECTED_TEXT_OUTPUT = [
|
||||
|
||||
|
||||
def test_auto_partition_text_from_filename():
|
||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-text.txt")
|
||||
elements = partition(filename=filename, strategy=PartitionStrategy.HI_RES)
|
||||
file_path = example_doc_path("fake-text.txt")
|
||||
elements = partition(filename=file_path, strategy=PartitionStrategy.HI_RES)
|
||||
assert len(elements) > 0
|
||||
assert elements == EXPECTED_TEXT_OUTPUT
|
||||
assert elements[0].metadata.filename == os.path.basename(filename)
|
||||
assert elements[0].metadata.file_directory == os.path.split(filename)[0]
|
||||
assert elements[0].metadata.filename == os.path.basename(file_path)
|
||||
assert elements[0].metadata.file_directory == os.path.split(file_path)[0]
|
||||
|
||||
|
||||
def test_auto_partition_text_from_file():
|
||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-text.txt")
|
||||
with open(filename, "rb") as f:
|
||||
with open(example_doc_path("fake-text.txt"), "rb") as f:
|
||||
elements = partition(file=f, strategy=PartitionStrategy.HI_RES)
|
||||
assert len(elements) > 0
|
||||
assert elements == EXPECTED_TEXT_OUTPUT
|
||||
@ -903,10 +872,8 @@ def test_auto_partition_text_from_file():
|
||||
# XLS
|
||||
# ================================================================================================
|
||||
|
||||
|
||||
EXPECTED_XLS_TEXT_LEN = 550
|
||||
|
||||
|
||||
EXPECTED_XLS_INITIAL_45_CLEAN_TEXT = "MC What is 2+2? 4 correct 3 incorrect MA What"
|
||||
|
||||
EXPECTED_XLS_TABLE = (
|
||||
@ -1054,7 +1021,7 @@ def test_auto_partition_xlsx_from_file():
|
||||
|
||||
|
||||
def test_auto_partition_respects_starting_page_number_argument_for_xlsx():
|
||||
elements = partition("example-docs/stanley-cups.xlsx", starting_page_number=3)
|
||||
elements = partition(example_doc_path("stanley-cups.xlsx"), starting_page_number=3)
|
||||
assert elements[1].metadata.page_number == 3
|
||||
|
||||
|
||||
@ -1140,9 +1107,10 @@ def test_auto_partition_from_url_without_providing_content_type():
|
||||
|
||||
|
||||
def test_auto_partition_warns_if_header_set_and_not_url(caplog: LogCaptureFixture):
|
||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, EML_TEST_FILE)
|
||||
partition(
|
||||
filename=filename, headers={"Accept": "application/pdf"}, strategy=PartitionStrategy.HI_RES
|
||||
example_doc_path("eml/fake-email.eml"),
|
||||
headers={"Accept": "application/pdf"},
|
||||
strategy=PartitionStrategy.HI_RES,
|
||||
)
|
||||
assert caplog.records[0].levelname == "WARNING"
|
||||
|
||||
@ -1169,22 +1137,22 @@ def test_partition_timeout_gets_routed():
|
||||
|
||||
|
||||
def test_add_chunking_strategy_on_partition_auto():
|
||||
filename = "example-docs/example-10k-1p.html"
|
||||
elements = partition(filename)
|
||||
chunk_elements = partition(filename, chunking_strategy="by_title")
|
||||
file_path = example_doc_path("example-10k-1p.html")
|
||||
elements = partition(file_path)
|
||||
chunk_elements = partition(file_path, chunking_strategy="by_title")
|
||||
chunks = chunk_by_title(elements)
|
||||
assert chunk_elements != elements
|
||||
assert chunk_elements == chunks
|
||||
|
||||
|
||||
def test_add_chunking_strategy_on_partition_auto_respects_max_chars():
|
||||
filename = "example-docs/example-10k-1p.html"
|
||||
file_path = example_doc_path("example-10k-1p.html")
|
||||
|
||||
# default chunk size in chars is 200
|
||||
partitioned_table_elements_200_chars = [
|
||||
e
|
||||
for e in partition(
|
||||
filename,
|
||||
file_path,
|
||||
chunking_strategy="by_title",
|
||||
max_characters=200,
|
||||
combine_text_under_n_chars=5,
|
||||
@ -1195,7 +1163,7 @@ def test_add_chunking_strategy_on_partition_auto_respects_max_chars():
|
||||
partitioned_table_elements_5_chars = [
|
||||
e
|
||||
for e in partition(
|
||||
filename,
|
||||
file_path,
|
||||
chunking_strategy="by_title",
|
||||
max_characters=5,
|
||||
combine_text_under_n_chars=5,
|
||||
@ -1203,7 +1171,7 @@ def test_add_chunking_strategy_on_partition_auto_respects_max_chars():
|
||||
if isinstance(e, (Table, TableChunk))
|
||||
]
|
||||
|
||||
elements = partition(filename)
|
||||
elements = partition(file_path)
|
||||
|
||||
table_elements = [e for e in elements if isinstance(e, Table)]
|
||||
|
||||
@ -1224,12 +1192,12 @@ def test_add_chunking_strategy_on_partition_auto_respects_max_chars():
|
||||
|
||||
|
||||
def test_add_chunking_strategy_chars_on_partition_auto_adds_is_continuation():
|
||||
filename = "example-docs/example-10k-1p.html"
|
||||
file_path = example_doc_path("example-10k-1p.html")
|
||||
|
||||
table_elements = [e for e in partition(filename) if isinstance(e, Table)]
|
||||
table_elements = [e for e in partition(file_path) if isinstance(e, Table)]
|
||||
table_chunks = [
|
||||
e
|
||||
for e in partition(filename, chunking_strategy="by_title")
|
||||
for e in partition(file_path, chunking_strategy="by_title")
|
||||
if isinstance(e, (Table, TableChunk))
|
||||
]
|
||||
|
||||
@ -1249,8 +1217,9 @@ def test_add_chunking_strategy_chars_on_partition_auto_adds_is_continuation():
|
||||
|
||||
|
||||
def test_partition_respects_detect_language_per_element_arg():
|
||||
filename = "example-docs/language-docs/eng_spa_mult.txt"
|
||||
elements = partition(filename=filename, detect_language_per_element=True)
|
||||
elements = partition(
|
||||
example_doc_path("language-docs/eng_spa_mult.txt"), detect_language_per_element=True
|
||||
)
|
||||
langs = [element.metadata.languages for element in elements]
|
||||
assert langs == [["eng"], ["spa", "eng"], ["eng"], ["eng"], ["spa"]]
|
||||
|
||||
@ -1288,9 +1257,10 @@ def test_partition_respects_language_arg(file_extension: str):
|
||||
|
||||
|
||||
def test_auto_with_page_breaks():
|
||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.pdf")
|
||||
elements = partition(
|
||||
filename=filename, include_page_breaks=True, strategy=PartitionStrategy.HI_RES
|
||||
example_doc_path("layout-parser-paper-fast.pdf"),
|
||||
include_page_breaks=True,
|
||||
strategy=PartitionStrategy.HI_RES,
|
||||
)
|
||||
assert "PageBreak" in [elem.category for elem in elements]
|
||||
|
||||
@ -1299,36 +1269,39 @@ def test_auto_with_page_breaks():
|
||||
|
||||
|
||||
def test_auto_partition_metadata_filename():
|
||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-text.txt")
|
||||
with open(filename, "rb") as f:
|
||||
elements = partition(file=f, metadata_filename=filename)
|
||||
assert elements[0].metadata.filename == os.path.split(filename)[-1]
|
||||
file_path = example_doc_path("fake-text.txt")
|
||||
with open(file_path, "rb") as f:
|
||||
elements = partition(file=f, metadata_filename=file_path)
|
||||
assert elements[0].metadata.filename == os.path.split(file_path)[-1]
|
||||
|
||||
|
||||
def test_auto_partition_warns_about_file_filename_deprecation(caplog: LogCaptureFixture):
|
||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-text.txt")
|
||||
with open(filename, "rb") as f:
|
||||
elements = partition(file=f, file_filename=filename)
|
||||
assert elements[0].metadata.filename == os.path.split(filename)[-1]
|
||||
file_path = example_doc_path("fake-text.txt")
|
||||
with open(file_path, "rb") as f:
|
||||
elements = partition(file=f, file_filename=file_path)
|
||||
assert elements[0].metadata.filename == os.path.split(file_path)[-1]
|
||||
assert "WARNING" in caplog.text
|
||||
assert "The file_filename kwarg will be deprecated" in caplog.text
|
||||
|
||||
|
||||
def test_auto_partition_raises_with_file_and_metadata_filename():
|
||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-text.txt")
|
||||
with open(filename, "rb") as f, pytest.raises(ValueError):
|
||||
partition(file=f, file_filename=filename, metadata_filename=filename)
|
||||
file_path = example_doc_path("fake-text.txt")
|
||||
with open(file_path, "rb") as f, pytest.raises(ValueError):
|
||||
partition(file=f, file_filename=file_path, metadata_filename=file_path)
|
||||
|
||||
|
||||
# -- ocr_languages --------------------------------------------------------
|
||||
|
||||
|
||||
def test_auto_partition_formats_languages_for_tesseract():
|
||||
filename = "example-docs/chi_sim_image.jpeg"
|
||||
with patch(
|
||||
"unstructured.partition.pdf_image.ocr.process_file_with_ocr",
|
||||
) as mock_process_file_with_ocr:
|
||||
partition(filename, strategy=PartitionStrategy.HI_RES, languages=["zh"])
|
||||
partition(
|
||||
example_doc_path("chi_sim_image.jpeg"),
|
||||
strategy=PartitionStrategy.HI_RES,
|
||||
languages=["zh"],
|
||||
)
|
||||
_, kwargs = mock_process_file_with_ocr.call_args_list[0]
|
||||
assert "ocr_languages" in kwargs
|
||||
assert kwargs["ocr_languages"] == "chi_sim+chi_sim_vert+chi_tra+chi_tra_vert"
|
||||
@ -1338,9 +1311,8 @@ def test_auto_partition_formats_languages_for_tesseract():
|
||||
def test_auto_partition_ignores_empty_string_for_ocr_languages(
|
||||
languages: list[str], ocr_languages: str
|
||||
):
|
||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "book-war-and-peace-1p.txt")
|
||||
elements = partition(
|
||||
filename=filename,
|
||||
example_doc_path("book-war-and-peace-1p.txt"),
|
||||
strategy=PartitionStrategy.OCR_ONLY,
|
||||
ocr_languages=ocr_languages,
|
||||
languages=languages,
|
||||
@ -1349,8 +1321,9 @@ def test_auto_partition_ignores_empty_string_for_ocr_languages(
|
||||
|
||||
|
||||
def test_auto_partition_warns_with_ocr_languages(caplog: LogCaptureFixture):
|
||||
filename = "example-docs/chevron-page.pdf"
|
||||
partition(filename=filename, strategy=PartitionStrategy.HI_RES, ocr_languages="eng")
|
||||
partition(
|
||||
example_doc_path("chevron-page.pdf"), strategy=PartitionStrategy.HI_RES, ocr_languages="eng"
|
||||
)
|
||||
assert "The ocr_languages kwarg will be deprecated" in caplog.text
|
||||
|
||||
|
||||
@ -1463,7 +1436,7 @@ def test_file_specific_produces_correct_filetype(filetype: FileType):
|
||||
fun_name = "partition_" + filetype_module
|
||||
module = import_module(f"unstructured.partition.{filetype_module}")
|
||||
fun = getattr(module, fun_name)
|
||||
for file in pathlib.Path("example-docs").iterdir():
|
||||
for file in pathlib.Path(example_doc_path("")).iterdir():
|
||||
if file.is_file() and file.suffix == f".{extension}":
|
||||
elements = fun(str(file))
|
||||
assert all(
|
||||
@ -1478,8 +1451,11 @@ def test_file_specific_produces_correct_filetype(filetype: FileType):
|
||||
|
||||
|
||||
def test_auto_partition_element_metadata_user_provided_languages():
|
||||
filename = "example-docs/chevron-page.pdf"
|
||||
elements = partition(filename=filename, strategy=PartitionStrategy.OCR_ONLY, languages=["eng"])
|
||||
elements = partition(
|
||||
example_doc_path("chevron-page.pdf"),
|
||||
strategy=PartitionStrategy.OCR_ONLY,
|
||||
languages=["eng"],
|
||||
)
|
||||
assert elements[0].metadata.languages == ["eng"]
|
||||
|
||||
|
||||
@ -1495,8 +1471,7 @@ def test_partition_languages_incorrectly_defaults_to_English(tmp_path: pathlib.P
|
||||
|
||||
|
||||
def test_partition_languages_default_to_None():
|
||||
filename = "example-docs/handbook-1p.docx"
|
||||
elements = partition(filename=filename, detect_language_per_element=True)
|
||||
elements = partition(example_doc_path("handbook-1p.docx"), detect_language_per_element=True)
|
||||
# PageBreak and other elements with no text will have `None` for `languages`
|
||||
none_langs = [element for element in elements if element.metadata.languages is None]
|
||||
assert none_langs[0].text == ""
|
||||
@ -1508,11 +1483,11 @@ def test_partition_default_does_not_overwrite_other_defaults():
|
||||
from unstructured.partition.text import partition_text
|
||||
|
||||
# Use a document that is primarily in a language other than English
|
||||
filename = "example-docs/language-docs/UDHR_first_article_all.txt"
|
||||
text_elements = partition_text(filename)
|
||||
file_path = example_doc_path("language-docs/UDHR_first_article_all.txt")
|
||||
text_elements = partition_text(file_path)
|
||||
assert text_elements[0].metadata.languages != ["eng"]
|
||||
|
||||
auto_elements = partition(filename)
|
||||
auto_elements = partition(file_path)
|
||||
assert auto_elements[0].metadata.languages != ["eng"]
|
||||
assert auto_elements[0].metadata.languages == text_elements[0].metadata.languages
|
||||
|
||||
|
||||
@ -1 +1 @@
|
||||
__version__ = "0.14.11-dev4" # pragma: no cover
|
||||
__version__ = "0.14.11-dev5" # pragma: no cover
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user