mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-06-27 02:30:08 +00:00

This PR aims to improve the organization and readability of our example documents used in unit tests, specifically focusing on PDF and image files.

### Summary

- Created two new subdirectories in the `example-docs` folder:
  - `pdf/`: for all PDF example files
  - `img/`: for all image example files
- Moved relevant PDF files from `example-docs/` to `example-docs/pdf/`
- Moved relevant image files from `example-docs/` to `example-docs/img/`
- Updated file paths in affected unit & ingest tests to reflect the new directory structure

### Testing

All unit & ingest tests should be updated and verified to work with the new file structure.

## Notes

Other file types (e.g., office documents, HTML files) remain in the root of `example-docs/` for now.

## Next Steps

Consider similar reorganization for other file types if this structure proves to be beneficial.

---------

Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: christinestraub <christinestraub@users.noreply.github.com>
44 lines
1.2 KiB
Python
44 lines
1.2 KiB
Python
import pytest
|
|
|
|
from test_unstructured.unit_utils import example_doc_path
|
|
from unstructured.partition import pdf
|
|
from unstructured.partition.utils.constants import PartitionStrategy
|
|
|
|
|
|
@pytest.fixture(scope="session")
def chipper_results():
    """Partition the sample layout-parser PDF with the ``chipper`` model.

    Session-scoped, so the partitioning call is made once and its result is
    shared by every test that requests this fixture.
    """
    sample_pdf = example_doc_path("pdf/layout-parser-paper-fast.pdf")
    return pdf.partition_pdf(
        filename=sample_pdf,
        strategy=PartitionStrategy.HI_RES,
        model_name="chipper",
    )
|
|
|
|
|
|
@pytest.fixture(scope="session")
def chipper_children(chipper_results):
    """Return the elements of ``chipper_results`` whose metadata has a parent id set."""

    def _has_parent(element):
        return element.metadata.parent_id is not None

    return list(filter(_has_parent, chipper_results))
|
|
|
|
|
|
@pytest.mark.chipper()
def test_chipper_has_hierarchy(chipper_children):
    """At least one partitioned element must report a parent."""
    found_children = bool(chipper_children)
    assert found_children
|
|
|
|
|
|
@pytest.mark.chipper()
def test_chipper_not_losing_parents(chipper_results, chipper_children):
    """Every child's ``parent_id`` must match the ``id`` of an element in the results.

    Args:
        chipper_results: all elements produced by the chipper partitioning run.
        chipper_children: the subset of those elements that have a ``parent_id``.
    """
    # Build the id lookup once instead of rescanning every result for each child.
    # NOTE(review): assumes element ids are hashable (e.g. strings) -- confirm.
    known_ids = {el.id for el in chipper_results}
    orphans = [
        child for child in chipper_children if child.metadata.parent_id not in known_ids
    ]
    assert not orphans, (
        f"{len(orphans)} child element(s) reference a parent_id missing from the results"
    )
|
|
|
|
|
|
def chipper_test_pdfminer_repeated(chipper_results):
    """
    Test to verify that PDFMiner has not been run together with Chipper

    The check performed is that no two elements share the same ``text``.

    Args:
        chipper_results: iterable of elements exposing a ``text`` attribute.

    Raises:
        AssertionError: if any element text occurs more than once.
    """
    # NOTE(review): this name does not start with ``test_``, so pytest's default
    # discovery rules will not collect it -- confirm whether that is intentional.
    # Collect the texts once so a one-shot iterable is not exhausted before the
    # uniqueness comparison.
    texts = [element.text for element in chipper_results]
    distinct_texts = set(texts)
    assert len(texts) == len(distinct_texts), (
        f"{len(texts) - len(distinct_texts)} of {len(texts)} element texts are duplicates"
    )
|