mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-06-27 02:30:08 +00:00

### Description

An alternative to https://github.com/Unstructured-IO/unstructured/pull/3572 that keeps all ingest tests and runs them against the latest version of unstructured-ingest.

---------

Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: rbiseck3 <rbiseck3@users.noreply.github.com>
Co-authored-by: Christine Straub <christinemstraub@gmail.com>
Co-authored-by: christinestraub <christinestraub@users.noreply.github.com>
334 lines
11 KiB
Python
334 lines
11 KiB
Python
from __future__ import annotations
|
||
|
||
import json
|
||
import os
|
||
|
||
import pytest
|
||
|
||
from unstructured import utils
|
||
from unstructured.documents.coordinates import PixelSpace
|
||
from unstructured.documents.elements import ElementMetadata, NarrativeText, Title
|
||
|
||
|
||
@pytest.fixture()
def input_data():
    """Provide two sample records: one with only text, one with nested metadata."""
    text_only_record = {"text": "This is a sentence."}
    record_with_meta = {"text": "This is another sentence.", "meta": {"score": 0.1}}
    return [text_only_record, record_with_meta]
|
||
|
||
|
||
@pytest.fixture()
def output_jsonl_file(tmp_path):
    """Provide the string path of `output.jsonl` inside the per-test `tmp_path`."""
    output_path = os.path.join(tmp_path, "output.jsonl")
    return output_path
|
||
|
||
|
||
@pytest.fixture()
def input_jsonl_file(tmp_path, input_data):
    """Write `input_data` as one JSON document per line and return the file's path."""
    jsonl_path = os.path.join(tmp_path, "input.jsonl")
    with open(jsonl_path, "w+") as jsonl_file:
        serialized_lines = [json.dumps(record) + "\n" for record in input_data]
        jsonl_file.writelines(serialized_lines)
    return jsonl_path
|
||
|
||
|
||
def test_save_as_jsonl(input_data, output_jsonl_file):
    """Records saved with `save_as_jsonl` parse back, line by line, to the input."""
    utils.save_as_jsonl(input_data, output_jsonl_file)

    with open(output_jsonl_file) as saved_file:
        round_tripped = list(map(json.loads, saved_file))

    assert round_tripped == input_data
|
||
|
||
|
||
def test_read_as_jsonl(input_jsonl_file, input_data):
    """`read_from_jsonl` returns the records that were written to the input file."""
    loaded_records = utils.read_from_jsonl(input_jsonl_file)

    assert loaded_records == input_data
|
||
|
||
|
||
def test_requires_dependencies_decorator():
    """A function guarded by one dependency name can be called without error.

    Assumes numpy is importable in the test environment.
    """

    @utils.requires_dependencies(dependencies="numpy")
    def needs_numpy():
        import numpy  # noqa: F401

    needs_numpy()
|
||
|
||
|
||
def test_requires_dependencies_decorator_multiple():
    """A function guarded by a list of dependency names can be called without error.

    Assumes numpy and pandas are importable in the test environment.
    """

    @utils.requires_dependencies(dependencies=["numpy", "pandas"])
    def needs_numpy_and_pandas():
        import numpy  # noqa: F401
        import pandas  # noqa: F401

    needs_numpy_and_pandas()
|
||
|
||
|
||
def test_requires_dependencies_decorator_import_error():
    """Calling a function whose declared dependency is missing raises ImportError."""

    @utils.requires_dependencies(dependencies="not_a_package")
    def needs_missing_package():
        import not_a_package  # noqa: F401

    with pytest.raises(ImportError):
        needs_missing_package()
|
||
|
||
|
||
def test_requires_dependencies_decorator_import_error_multiple():
    """ImportError is raised when one of several declared dependencies is missing."""

    @utils.requires_dependencies(dependencies=["not_a_package", "numpy"])
    def needs_missing_and_numpy():
        import not_a_package  # noqa: F401
        import numpy  # noqa: F401

    with pytest.raises(ImportError):
        needs_missing_and_numpy()
|
||
|
||
|
||
def test_requires_dependencies_decorator_in_class():
    """The decorator can be applied to a class, which can then be instantiated.

    Assumes numpy is importable in the test environment.
    """

    @utils.requires_dependencies(dependencies="numpy")
    class NumpyBacked:
        def __init__(self):
            import numpy  # noqa: F401

    NumpyBacked()
|
||
|
||
|
||
@pytest.mark.parametrize(
    "iterator",
    [
        [0, 1],
        (0, 1),
        range(10),
        [0],
        (0,),
        range(1),
    ],
)
def test_first_gives_first(iterator):
    """`utils.first` returns the leading element of lists, tuples and ranges."""
    leading_item = utils.first(iterator)

    assert leading_item == 0
|
||
|
||
|
||
@pytest.mark.parametrize(
    "iterator",
    [
        [],
        (),
    ],
)
def test_first_raises_if_empty(iterator):
    """`utils.first` raises ValueError when the iterable has no elements."""
    with pytest.raises(ValueError):
        utils.first(iterator)
|
||
|
||
|
||
@pytest.mark.parametrize("iterator", [[0], (0,), range(1)])
def test_only_gives_only(iterator):
    """`utils.only` returns the sole element of a one-item list, tuple or range.

    This must call `utils.only` (not `utils.first`) so that the single-element
    success path of `only` is actually covered for each iterable kind.
    """
    assert utils.only(iterator) == 0
|
||
|
||
|
||
@pytest.mark.parametrize(
    "iterator",
    [
        [0, 1],
        (0, 1),
        range(10),
    ],
)
def test_only_raises_when_len_more_than_1(iterator):
    """`utils.only` raises ValueError when the iterable has more than one element."""
    with pytest.raises(ValueError):
        utils.only(iterator)
|
||
|
||
|
||
@pytest.mark.parametrize(
    "iterator",
    [
        [],
        (),
    ],
)
def test_only_raises_if_empty(iterator):
    """`utils.only` raises ValueError when the iterable has no elements."""
    with pytest.raises(ValueError):
        utils.only(iterator)
|
||
|
||
|
||
@pytest.mark.parametrize(
    ("coords1", "coords2", "text1", "text2", "nested_error_tolerance_px", "expectation"),
    [
        (
            ((4, 5), (4, 8), (7, 8), (7, 5)),
            ((2, 3), (2, 6), (5, 6), (5, 3)),
            "Some lovely title",
            "Some lovely text",
            5,  # large nested_error_tolerance_px
            {
                "overlapping_elements": ["Title(ix=0)", "NarrativeText(ix=1)"],
                "parent_element": "Title(ix=0)",
                "overlapping_case": "nested NarrativeText in Title",
                "overlap_percentage": "100%",
                "metadata": {
                    "largest_ngram_percentage": 0,
                    "overlap_percentage_total": "5.88%",
                    "max_area": "9pxˆ2",
                    "min_area": "9pxˆ2",
                    "total_area": "18pxˆ2",
                },
            },
        ),
        (
            ((4, 5), (4, 8), (7, 8), (7, 5)),
            ((2, 3), (2, 6), (5, 6), (5, 3)),
            "Some lovely title",
            "Some lovely text",
            1,  # small nested_error_tolerance_px
            {
                "overlapping_elements": ["0. Title(ix=0)", "1. NarrativeText(ix=1)"],
                "parent_element": None,
                "overlapping_case": "partial overlap sharing 50.0% of the text from1. "
                "NarrativeText(2-gram)",
                "overlap_percentage": "11.11%",
                "metadata": {
                    "largest_ngram_percentage": 50.0,
                    "overlap_percentage_total": "5.88%",
                    "max_area": "9pxˆ2",
                    "min_area": "9pxˆ2",
                    "total_area": "18pxˆ2",
                },
            },
        ),
        (
            ((4, 5), (4, 8), (7, 8), (7, 5)),
            ((2, 3), (2, 6), (5, 6), (5, 3)),
            "Some lovely title",
            "Some lovely title",  # same title
            1,
            {
                "overlapping_elements": ["0. Title(ix=0)", "1. NarrativeText(ix=1)"],
                "parent_element": None,
                "overlapping_case": "partial overlap with duplicate text",
                "overlap_percentage": "11.11%",
                "metadata": {
                    "largest_ngram_percentage": 0,
                    "overlap_percentage_total": "5.88%",
                    "max_area": "9pxˆ2",
                    "min_area": "9pxˆ2",
                    "total_area": "18pxˆ2",
                },
            },
        ),
        (
            ((4, 5), (4, 8), (7, 8), (7, 5)),
            ((2, 3), (2, 6), (5, 6), (5, 3)),
            "Some lovely title",
            "",  # empty title
            1,
            {
                "overlapping_elements": ["1. NarrativeText(ix=1)", "0. Title(ix=0)"],
                "parent_element": None,
                "overlapping_case": ("partial overlap with empty content in 1. NarrativeText"),
                "overlap_percentage": "11.11%",
                "metadata": {
                    "largest_ngram_percentage": 0,
                    "overlap_percentage_total": "5.88%",
                    "max_area": "9pxˆ2",
                    "min_area": "9pxˆ2",
                    "total_area": "18pxˆ2",
                },
            },
        ),
        (
            ((4, 5), (4, 8), (7, 8), (7, 5)),
            ((2, 3), (2, 6), (5, 6), (5, 3)),
            "",  # empty 1st title
            "Some lovely title",
            1,
            {
                "overlapping_elements": ["0. Title(ix=0)", "1. NarrativeText(ix=1)"],
                "parent_element": None,
                "overlapping_case": "partial overlap with empty content in 0. Title",
                "overlap_percentage": "11.11%",
                "metadata": {
                    "largest_ngram_percentage": 0,
                    "overlap_percentage_total": "5.88%",
                    "max_area": "9pxˆ2",
                    "min_area": "9pxˆ2",
                    "total_area": "18pxˆ2",
                },
            },
        ),
        (
            ((4, 5), (4, 8), (7, 8), (7, 5)),
            ((2, 3), (2, 6), (5, 6), (5, 3)),
            "Some lovely title",
            "Something totally different here",  # diff text
            1,
            {
                "overlapping_elements": ["0. Title(ix=0)", "1. NarrativeText(ix=1)"],
                "parent_element": None,
                "overlapping_case": "partial overlap without sharing text",
                "overlap_percentage": "11.11%",
                "metadata": {
                    "largest_ngram_percentage": 0,
                    "overlap_percentage_total": "5.88%",
                    "max_area": "9pxˆ2",
                    "min_area": "9pxˆ2",
                    "total_area": "18pxˆ2",
                },
            },
        ),
        (
            ((5, 6), (5, 10), (8, 10), (8, 6)),  # diff coordinates
            ((1, 3), (2, 7), (6, 7), (5, 3)),
            "Some lovely title",
            "Some lovely text",
            1,
            {
                "overlapping_elements": ["0. Title(ix=0)", "1. NarrativeText(ix=1)"],
                "parent_element": None,
                "overlapping_case": "Small partial overlap",
                "overlap_percentage": "8.33%",
                "metadata": {
                    "largest_ngram_percentage": 0,
                    "overlap_percentage_total": "3.23%",
                    "max_area": "20pxˆ2",
                    "min_area": "12pxˆ2",
                    "total_area": "32pxˆ2",
                },
            },
        ),
    ],
)
def test_catch_overlapping_and_nested_bboxes(
    coords1, coords2, text1, text2, nested_error_tolerance_px, expectation
):
    """The first reported overlap case for a Title/NarrativeText pair matches `expectation`.

    Each parameter set supplies the two bounding boxes, the two texts and the
    nesting tolerance; both elements sit on page 1 of a 20x20 pixel space.
    """
    title = Title(
        text=text1,
        coordinates=coords1,
        coordinate_system=PixelSpace(width=20, height=20),
        metadata=ElementMetadata(page_number=1),
    )
    narrative = NarrativeText(
        text=text2,
        coordinates=coords2,
        coordinate_system=PixelSpace(width=20, height=20),
        metadata=ElementMetadata(page_number=1),
    )

    has_overlap, reported_cases = utils.catch_overlapping_and_nested_bboxes(
        [title, narrative],
        nested_error_tolerance_px,
        sm_overlap_threshold=10.0,
    )

    assert has_overlap is True
    assert reported_cases[0] == expectation
|
||
|
||
|
||
def test_catch_overlapping_and_nested_bboxes_non_overlapping_case():
    """Two disjoint boxes on the same page yield a False flag and no overlap cases."""
    title = Title(
        text="Some lovely title",
        coordinates=((4, 6), (4, 7), (7, 7), (7, 6)),
        coordinate_system=PixelSpace(width=20, height=20),
        metadata=ElementMetadata(page_number=1),
    )
    narrative = NarrativeText(
        text="Some lovely text",
        coordinates=((6, 8), (6, 9), (9, 9), (9, 8)),
        coordinate_system=PixelSpace(width=20, height=20),
        metadata=ElementMetadata(page_number=1),
    )

    has_overlap, reported_cases = utils.catch_overlapping_and_nested_bboxes(
        [title, narrative],
        1,
        sm_overlap_threshold=10.0,
    )

    assert has_overlap is False
    assert reported_cases == []
|
||
|
||
|
||
def test_only_returns_singleton_iterable():
    """`utils.only` returns the element of a one-item list."""
    sole_item = utils.only([42])

    assert sole_item == 42
|
||
|
||
|
||
def test_only_raises_on_non_singleton_iterable():
    """`utils.only` raises ValueError for a two-item list."""
    two_item_iterable = [42, 0]

    with pytest.raises(ValueError):
        utils.only(two_item_iterable)
|
||
|
||
|
||
def test_calculate_shared_ngram_percentage_returns_null_vals_for_empty_str():
    """An empty first string yields a zero percentage and no shared n-grams."""
    empty_text = ""
    other_text = "banana orange pineapple"
    ngram_size = 2

    percent, common_ngrams = utils.calculate_shared_ngram_percentage(
        empty_text, other_text, ngram_size
    )

    assert percent == 0
    assert not common_ngrams
|