113 lines
2.9 KiB
Python
Raw Normal View History

import json
import os
import pytest
from unstructured import utils
@pytest.fixture()
def input_data():
    """Provide two sample records: one with only text, one that also carries metadata."""
    plain_record = {"text": "This is a sentence."}
    scored_record = {"text": "This is another sentence.", "meta": {"score": 0.1}}
    return [plain_record, scored_record]
@pytest.fixture()
def output_jsonl_file(tmp_path):
    """Return the path of ``output.jsonl`` inside ``tmp_path``; the file itself is not created."""
    destination = os.path.join(tmp_path, "output.jsonl")
    return destination
@pytest.fixture()
def input_jsonl_file(tmp_path, input_data):
    """Write ``input_data`` to ``input.jsonl`` under ``tmp_path`` and return that path.

    Each record is serialized with ``json.dumps`` and terminated by a newline,
    so the file holds one JSON object per line.
    """
    file_path = os.path.join(tmp_path, "input.jsonl")
    serialized_lines = [f"{json.dumps(record)}\n" for record in input_data]
    with open(file_path, "w+") as handle:
        handle.write("".join(serialized_lines))
    return file_path
def test_save_as_jsonl(input_data, output_jsonl_file):
    """Save the sample records, then check each output line parses back to its record."""
    utils.save_as_jsonl(input_data, output_jsonl_file)

    parsed_records = []
    with open(output_jsonl_file) as saved:
        for raw_line in saved:
            parsed_records.append(json.loads(raw_line))

    assert parsed_records == input_data
def test_read_as_jsonl(input_jsonl_file, input_data):
    """Reading the fixture-written JSONL file must yield data equal to ``input_data``."""
    assert utils.read_from_jsonl(input_jsonl_file) == input_data
def test_requires_dependencies_decorator():
    """Call a function guarded on ``numpy``; the test passes if nothing is raised."""

    def uses_numpy():
        import numpy  # noqa: F401

    # Apply the decorator explicitly rather than with ``@`` syntax.
    guarded = utils.requires_dependencies(dependencies="numpy")(uses_numpy)
    guarded()
def test_requires_dependencies_decorator_multiple():
    """Call a function guarded on a list of packages; the test passes if nothing is raised."""
    needed = ["numpy", "pandas"]

    @utils.requires_dependencies(dependencies=needed)
    def uses_numpy_and_pandas():
        import numpy  # noqa: F401
        import pandas  # noqa: F401

    uses_numpy_and_pandas()
def test_requires_dependencies_decorator_import_error():
    """Expect ``ImportError`` when calling a function guarded on a missing package."""

    def needs_missing_package():
        import not_a_package  # noqa: F401

    guarded = utils.requires_dependencies(dependencies="not_a_package")(
        needs_missing_package
    )
    with pytest.raises(ImportError):
        guarded()
def test_requires_dependencies_decorator_import_error_multiple():
    """Expect ``ImportError`` when one package in the dependency list is missing."""
    needed = ["not_a_package", "numpy"]

    @utils.requires_dependencies(dependencies=needed)
    def needs_missing_and_numpy():
        import not_a_package  # noqa: F401
        import numpy  # noqa: F401

    expect_import_error = pytest.raises(ImportError)
    with expect_import_error:
        needs_missing_and_numpy()
def test_requires_dependencies_decorator_in_class():
    """Decorate a class instead of a function; instantiating it must not raise."""

    class NumpyUser:
        def __init__(self):
            import numpy  # noqa: F401

    # Apply the decorator explicitly to the class object.
    guarded_class = utils.requires_dependencies(dependencies="numpy")(NumpyUser)
    guarded_class()
chore: process chipper hierarchy (#1634) PR to support schema changes introduced from [PR 232](https://github.com/Unstructured-IO/unstructured-inference/pull/232) in `unstructured-inference`. Specifically what needs to be supported is: * Change to the way `LayoutElement` from `unstructured-inference` is structured, specifically that this class is no longer a subclass of `Rectangle`, and instead `LayoutElement` has a `bbox` property that captures the location information and a `from_coords` method that allows construction of a `LayoutElement` directly from coordinates. * Removal of `LocationlessLayoutElement` since chipper now exports bounding boxes, and if we need to support elements without bounding boxes, we can make the `bbox` property mentioned above optional. * Getting hierarchy data directly from the inference elements rather than in post-processing * Don't try to reorder elements received from chipper v2, as they should already be ordered. #### Testing: The following demonstrates that the new version of chipper is inferring hierarchy. ```python from unstructured.partition.pdf import partition_pdf elements = partition_pdf("example-docs/layout-parser-paper-fast.pdf", strategy="hi_res", model_name="chipper") children = [el for el in elements if el.metadata.parent_id is not None] print(children) ``` Also verify that running the traditional `hi_res` gives different results: ```python from unstructured.partition.pdf import partition_pdf elements = partition_pdf("example-docs/layout-parser-paper-fast.pdf", strategy="hi_res") ``` --------- Co-authored-by: Sebastian Laverde Alfonso <lavmlk20201@gmail.com> Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> Co-authored-by: christinestraub <christinemstraub@gmail.com>
2023-10-12 20:28:46 -05:00
@pytest.mark.parametrize(
    "iterator",
    [
        [0, 1],
        (0, 1),
        range(10),
        [0],
        (0,),
        range(1),
    ],
)
def test_first_gives_first(iterator):
    """``utils.first`` should return 0, the leading item of every parametrized input."""
    leading = utils.first(iterator)
    assert leading == 0
@pytest.mark.parametrize("iterator", [[], ()])
def test_first_raises_if_empty(iterator):
    """``utils.first`` is expected to raise ``ValueError`` for an empty list or tuple."""
    expect_value_error = pytest.raises(ValueError)
    with expect_value_error:
        utils.first(iterator)
@pytest.mark.parametrize("iterator", [[0], (0,), range(1)])
def test_only_gives_only(iterator):
    """``utils.only`` should return the sole item (0) of each single-element input.

    The test previously called ``utils.first``, so ``utils.only`` was never
    exercised on its success path.
    """
    assert utils.only(iterator) == 0
@pytest.mark.parametrize("iterator", [[0, 1], (0, 1), range(10)])
def test_only_raises_when_len_more_than_1(iterator):
    """``utils.only`` is expected to raise ``ValueError`` when given more than one item."""
    with pytest.raises(ValueError):
        # Call only: a comparison on the result would be discarded
        # (flake8-bugbear B015) and never evaluated once the call raises.
        utils.only(iterator)
@pytest.mark.parametrize("iterator", [[], ()])
def test_only_raises_if_empty(iterator):
    """``utils.only`` is expected to raise ``ValueError`` for an empty list or tuple."""
    expect_value_error = pytest.raises(ValueError)
    with expect_value_error:
        utils.only(iterator)