111 lines
3.4 KiB
Python
Raw Normal View History

import os
import pathlib
import tempfile
import pytest
from unstructured.partition.auto import partition
from unstructured.partition.json import partition_json
from unstructured.staging.base import elements_to_json
# Directory that holds this test module; example documents are located relative to it.
DIRECTORY = pathlib.Path(__file__).parent.resolve()

# File names used to parametrize the JSON round-trip tests in this module.
test_files = [
    "fake-text.txt",
    "layout-parser-paper-fast.pdf",
    "fake-html.html",
    "fake.doc",
    "fake-email.eml",
    # Expected-failure case; see the xfail reason string for the recorded rationale.
    pytest.param(
        "fake-power-point.ppt",
        marks=pytest.mark.xfail(reason="Requirements mismatch, should only fail in docker test"),
    ),
    "fake.docx",
    "fake-power-point.pptx",
]
@pytest.mark.parametrize("filename", test_files)
def test_partition_json_from_filename(filename: str):
    """Round-trip an example document through JSON, reloading it by file name.

    The document is partitioned, its elements are written to a JSON file in a
    temporary directory, and that file is read back with
    ``partition_json(filename=...)``. The reloaded elements must match the
    originals one for one.
    """
    source_path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
    elements = partition(filename=source_path)

    with tempfile.TemporaryDirectory() as scratch_dir:
        json_path = os.path.join(scratch_dir, filename + ".json")
        elements_to_json(elements, filename=json_path, indent=2)
        reloaded = partition_json(filename=json_path)

    # Guard against a vacuous pass on an empty partition result.
    assert len(elements) > 0
    assert len(str(elements[0])) > 0

    # Lengths are checked first, so the pairwise zip below covers every element.
    assert len(elements) == len(reloaded)
    for expected, actual in zip(elements, reloaded):
        assert expected == actual
@pytest.mark.parametrize("filename", test_files)
def test_partition_json_from_file(filename: str):
    """Round-trip an example document through JSON, reloading it from a file object.

    The document is partitioned, its elements are written to a JSON file in a
    temporary directory, and the file is reopened and handed to
    ``partition_json(file=...)``. The reloaded elements must match the
    originals one for one.
    """
    source_path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
    elements = partition(filename=source_path)

    with tempfile.TemporaryDirectory() as scratch_dir:
        json_path = os.path.join(scratch_dir, filename + ".json")
        elements_to_json(elements, filename=json_path, indent=2)
        with open(json_path) as handle:
            reloaded = partition_json(file=handle)

    # Guard against a vacuous pass on an empty partition result.
    assert len(elements) > 0
    assert len(str(elements[0])) > 0

    # Lengths are checked first, so the pairwise zip below covers every element.
    assert len(elements) == len(reloaded)
    for expected, actual in zip(elements, reloaded):
        assert expected == actual
@pytest.mark.parametrize("filename", test_files)
def test_partition_json_from_text(filename: str):
    """Round-trip an example document through JSON, reloading it from a string.

    The document is partitioned, its elements are written to a JSON file in a
    temporary directory, and the file's contents are read into memory and
    passed to ``partition_json(text=...)``. The reloaded elements must match
    the originals one for one.
    """
    source_path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
    elements = partition(filename=source_path)

    with tempfile.TemporaryDirectory() as scratch_dir:
        json_path = os.path.join(scratch_dir, filename + ".json")
        elements_to_json(elements, filename=json_path, indent=2)
        with open(json_path) as handle:
            serialized = handle.read()
        reloaded = partition_json(text=serialized)

    # Guard against a vacuous pass on an empty partition result.
    assert len(elements) > 0
    assert len(str(elements[0])) > 0

    # Lengths are checked first, so the pairwise zip below covers every element.
    assert len(elements) == len(reloaded)
    for expected, actual in zip(elements, reloaded):
        assert expected == actual
def test_partition_json_raises_with_none_specified():
    """``partition_json`` called with no arguments must raise ``ValueError``."""
    with pytest.raises(ValueError):
        partition_json()
def test_partition_json_raises_with_too_many_specified():
    """``partition_json`` must raise ``ValueError`` when given more than one input.

    A JSON serialization of ``fake-text.txt`` is prepared in a temporary
    directory, then every combination of two or three of ``filename``,
    ``file`` and ``text`` is passed and each call is expected to raise.
    """
    source_path = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-text.txt")
    elements = partition(filename=source_path)

    with tempfile.TemporaryDirectory() as scratch_dir:
        json_path = os.path.join(scratch_dir, "fake-text.txt.json")
        elements_to_json(elements, filename=json_path, indent=2)
        with open(json_path) as handle:
            serialized = handle.read()

        # Same keyword combinations, in the same order, as the individual checks.
        conflicting_inputs = (
            {"filename": json_path, "file": handle},
            {"filename": json_path, "text": serialized},
            {"file": handle, "text": serialized},
            {"filename": json_path, "file": handle, "text": serialized},
        )
        for kwargs in conflicting_inputs:
            with pytest.raises(ValueError):
                partition_json(**kwargs)