2023-03-09 03:36:01 +09:00
|
|
|
import os
|
|
|
|
import pathlib
|
|
|
|
import tempfile
|
|
|
|
|
|
|
|
import pytest
|
|
|
|
|
|
|
|
from unstructured.partition.auto import partition
|
|
|
|
from unstructured.partition.json import partition_json
|
|
|
|
from unstructured.staging.base import elements_to_json
|
|
|
|
|
|
|
|
# Directory containing this test module; example documents are located relative to it.
DIRECTORY = pathlib.Path(__file__).parent.resolve()

# True when the "/.dockerenv" marker exists; used below to skip the legacy .ppt case.
# (This was previously assigned a second time after ``test_files`` with the same
# expression; the redundant reassignment has been dropped.)
is_in_docker = os.path.exists("/.dockerenv")

# Documents, relative to the example-docs directory, fed to the parametrized tests.
test_files = [
    "fake-text.txt",
    "layout-parser-paper-fast.pdf",
    "fake-html.html",
    "fake.doc",
    "eml/fake-email.eml",
    pytest.param(
        "fake-power-point.ppt",
        marks=pytest.mark.skipif(
            is_in_docker,
            reason="Skipping this test in Docker container",
        ),
    ),
    "fake.docx",
    "fake-power-point.pptx",
]
|
|
|
|
|
2023-03-09 03:36:01 +09:00
|
|
|
|
|
|
|
@pytest.mark.parametrize("filename", test_files)
def test_partition_json_from_filename(filename: str):
    """Serialize partitioned elements to JSON on disk and reload them by file path."""
    source_path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
    original = partition(filename=source_path)

    with tempfile.TemporaryDirectory() as workdir:
        # Entries such as "eml/fake-email.eml" carry a sub-directory; only the leaf
        # name is used for the JSON file written into the temp dir.
        json_path = os.path.join(workdir, os.path.basename(filename) + ".json")
        elements_to_json(original, filename=json_path, indent=2)
        reloaded = partition_json(filename=json_path)

    assert len(original) > 0
    assert len(str(original[0])) > 0

    assert len(original) == len(reloaded)
    expected_name = filename.split("/")[-1]
    for before, after in zip(original, reloaded):
        assert before == after
        assert before.metadata.filename == expected_name
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.parametrize("filename", test_files)
def test_partition_json_from_filename_with_metadata_filename(filename: str):
    """Reloading by path with ``metadata_filename`` stamps that name on every element."""
    source_path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
    original = partition(filename=source_path)

    with tempfile.TemporaryDirectory() as workdir:
        json_path = os.path.join(workdir, os.path.basename(filename) + ".json")
        elements_to_json(original, filename=json_path, indent=2)
        reloaded = partition_json(filename=json_path, metadata_filename="test")

    assert len(reloaded) > 0
    assert len(str(reloaded[0])) > 0
    reported_names = [item.metadata.filename for item in reloaded]
    assert all(name == "test" for name in reported_names)
|
2023-03-09 03:36:01 +09:00
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.parametrize("filename", test_files)
def test_partition_json_from_file(filename: str):
    """Serialize partitioned elements to JSON and reload them from an open file object."""
    source_path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
    original = partition(filename=source_path)

    with tempfile.TemporaryDirectory() as workdir:
        json_path = os.path.join(workdir, os.path.basename(filename) + ".json")
        elements_to_json(original, filename=json_path, indent=2)
        with open(json_path) as handle:
            reloaded = partition_json(file=handle)

    assert len(original) > 0
    assert len(str(original[0])) > 0
    assert len(original) == len(reloaded)
    expected_name = filename.split("/")[-1]
    for before, after in zip(original, reloaded):
        assert before == after
        assert before.metadata.filename == expected_name
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.parametrize("filename", test_files)
def test_partition_json_from_file_with_metadata_filename(filename: str):
    """Reloading from a file object with ``metadata_filename`` stamps that name on every element."""
    path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
    elements = partition(filename=path)

    with tempfile.TemporaryDirectory() as tmpdir:
        _filename = os.path.basename(filename)
        test_path = os.path.join(tmpdir, _filename + ".json")
        elements_to_json(elements, filename=test_path, indent=2)
        with open(test_path) as f:
            test_elements = partition_json(file=f, metadata_filename="test")

    # Guard against a vacuous pass: without this, an empty result would satisfy the
    # per-element check below. The filename-based variant of this test asserts the same.
    assert len(test_elements) > 0
    assert all(element.metadata.filename == "test" for element in test_elements)
|
2023-03-09 03:36:01 +09:00
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.parametrize("filename", test_files)
def test_partition_json_from_text(filename: str):
    """Serialize partitioned elements to JSON and reload them from the raw JSON string."""
    source_path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
    original = partition(filename=source_path)

    with tempfile.TemporaryDirectory() as workdir:
        json_path = os.path.join(workdir, os.path.basename(filename) + ".json")
        elements_to_json(original, filename=json_path, indent=2)
        with open(json_path) as handle:
            payload = handle.read()
        reloaded = partition_json(text=payload)

    assert len(original) > 0
    assert len(str(original[0])) > 0
    assert len(original) == len(reloaded)
    expected_name = filename.split("/")[-1]
    for before, after in zip(original, reloaded):
        assert before == after
        assert before.metadata.filename == expected_name
|
2023-03-09 03:36:01 +09:00
|
|
|
|
|
|
|
|
|
|
|
def test_partition_json_raises_with_none_specified():
    """Calling ``partition_json`` with no input source at all raises ``ValueError``."""
    with pytest.raises(ValueError):
        partition_json()
|
|
|
|
|
|
|
|
|
2023-03-28 17:03:51 -04:00
|
|
|
def test_partition_json_works_with_empty_string():
    """An empty text payload partitions to an empty list."""
    parsed = partition_json(text="")
    assert parsed == []
|
|
|
|
|
|
|
|
|
|
|
|
def test_partition_json_works_with_empty_list():
    """A JSON payload holding an empty array partitions to an empty list."""
    parsed = partition_json(text="[]")
    assert parsed == []
|
|
|
|
|
|
|
|
|
2023-03-09 03:36:01 +09:00
|
|
|
def test_partition_json_raises_with_too_many_specified():
    """Passing more than one of ``filename``, ``file`` and ``text`` raises ``ValueError``."""
    path = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-text.txt")
    elements = partition(filename=path)

    with tempfile.TemporaryDirectory() as tmpdir:
        test_path = os.path.join(tmpdir, "fake-text.txt.json")
        elements_to_json(elements, filename=test_path, indent=2)

        # Run the checks while the JSON file still exists and the handle is open.
        # I/O on a closed file raises ValueError as well, so asserting after the
        # ``with`` blocks exit could pass without the argument check ever firing.
        with open(test_path) as f:
            text = f.read()
            # Rewind so ``f`` is a fully readable input for the calls below.
            f.seek(0)

            with pytest.raises(ValueError):
                partition_json(filename=test_path, file=f)

            with pytest.raises(ValueError):
                partition_json(filename=test_path, text=text)

            with pytest.raises(ValueError):
                partition_json(file=f, text=text)

            with pytest.raises(ValueError):
                partition_json(filename=test_path, file=f, text=text)
|
2023-06-30 09:44:46 -05:00
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.parametrize("filename", test_files)
def test_partition_json_from_filename_exclude_metadata(filename: str):
    """Reloading by path with ``include_metadata=False`` leaves each element's metadata empty."""
    source_path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
    original = partition(filename=source_path)

    with tempfile.TemporaryDirectory() as workdir:
        json_path = os.path.join(workdir, os.path.basename(filename) + ".json")
        elements_to_json(original, filename=json_path, indent=2)
        reloaded = partition_json(filename=json_path, include_metadata=False)

    for item in reloaded:
        metadata_dict = item.metadata.to_dict()
        assert any(metadata_dict) is False
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.parametrize("filename", test_files)
def test_partition_json_from_file_exclude_metadata(filename: str):
    """Reloading from a file object with ``include_metadata=False`` leaves metadata empty."""
    source_path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
    original = partition(filename=source_path)

    with tempfile.TemporaryDirectory() as workdir:
        json_path = os.path.join(workdir, os.path.basename(filename) + ".json")
        elements_to_json(original, filename=json_path, indent=2)
        with open(json_path) as handle:
            reloaded = partition_json(file=handle, include_metadata=False)

    for item in reloaded:
        metadata_dict = item.metadata.to_dict()
        assert any(metadata_dict) is False
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.parametrize("filename", test_files)
def test_partition_json_from_text_exclude_metadata(filename: str):
    """Reloading from a JSON string with ``include_metadata=False`` leaves metadata empty."""
    source_path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
    original = partition(filename=source_path)

    with tempfile.TemporaryDirectory() as workdir:
        json_path = os.path.join(workdir, os.path.basename(filename) + ".json")
        elements_to_json(original, filename=json_path, indent=2)
        with open(json_path) as handle:
            payload = handle.read()
        reloaded = partition_json(text=payload, include_metadata=False)

    for item in reloaded:
        metadata_dict = item.metadata.to_dict()
        assert any(metadata_dict) is False
|
2023-07-25 15:59:45 -04:00
|
|
|
|
|
|
|
|
2023-07-26 15:10:14 -04:00
|
|
|
def test_partition_json_metadata_date(
    mocker,
    filename="example-docs/spring-weather.html.json",
):
    """With no override, ``last_modified`` equals the patched ``get_last_modified_date`` value."""
    patched_date = "2029-07-05T09:24:28"
    mocker.patch(
        "unstructured.partition.json.get_last_modified_date",
        return_value=patched_date,
    )

    parsed = partition_json(filename=filename)

    assert parsed[0].metadata.last_modified == patched_date
|
2023-07-26 15:10:14 -04:00
|
|
|
|
|
|
|
|
|
|
|
def test_partition_json_with_custom_metadata_date(
    mocker,
    filename="example-docs/spring-weather.html.json",
):
    """An explicit ``metadata_last_modified`` wins over the patched file-system date."""
    patched_date = "2029-07-05T09:24:28"
    override_date = "2020-07-05T09:24:28"
    mocker.patch(
        "unstructured.partition.json.get_last_modified_date",
        return_value=patched_date,
    )

    parsed = partition_json(filename=filename, metadata_last_modified=override_date)

    assert parsed[0].metadata.last_modified == override_date
|
2023-07-26 15:10:14 -04:00
|
|
|
|
|
|
|
|
|
|
|
def test_partition_json_from_file_metadata_date(
    mocker,
    filename="example-docs/spring-weather.html.json",
):
    """From a file object, ``last_modified`` equals the patched ``get_last_modified_date_from_file``."""
    patched_date = "2029-07-05T09:24:28"
    mocker.patch(
        "unstructured.partition.json.get_last_modified_date_from_file",
        return_value=patched_date,
    )

    with open(filename, "rb") as handle:
        parsed = partition_json(file=handle)

    assert parsed[0].metadata.last_modified == patched_date
|
2023-07-26 15:10:14 -04:00
|
|
|
|
|
|
|
|
|
|
|
def test_partition_json_from_file_with_custom_metadata_date(
    mocker,
    filename="example-docs/spring-weather.html.json",
):
    """From a file object, an explicit ``metadata_last_modified`` wins over the patched date."""
    patched_date = "2029-07-05T09:24:28"
    override_date = "2020-07-05T09:24:28"
    mocker.patch(
        "unstructured.partition.json.get_last_modified_date_from_file",
        return_value=patched_date,
    )

    with open(filename, "rb") as handle:
        parsed = partition_json(file=handle, metadata_last_modified=override_date)

    assert parsed[0].metadata.last_modified == override_date
|
2023-07-26 15:10:14 -04:00
|
|
|
|
|
|
|
|
|
|
|
def test_partition_json_from_text_metadata_date(
    filename="example-docs/spring-weather.html.json",
):
    """Partitioning from a raw string with no override yields ``last_modified is None``."""
    with open(filename) as handle:
        payload = handle.read()

    parsed = partition_json(text=payload)

    assert parsed[0].metadata.last_modified is None
|
2023-07-26 15:10:14 -04:00
|
|
|
|
|
|
|
|
|
|
|
def test_partition_json_from_text_with_custom_metadata_date(
    filename="example-docs/spring-weather.html.json",
):
    """Partitioning from a raw string reports the supplied ``metadata_last_modified``."""
    override_date = "2020-07-05T09:24:28"

    with open(filename) as handle:
        payload = handle.read()

    parsed = partition_json(text=payload, metadata_last_modified=override_date)

    assert parsed[0].metadata.last_modified == override_date
|
2023-07-26 15:10:14 -04:00
|
|
|
|
|
|
|
|
2023-07-25 15:59:45 -04:00
|
|
|
def test_partition_json_raises_with_unprocessable_json():
    """Well-formed JSON that is a single object rather than a list raises ``ValueError``."""
    # NOTE(robinson) - This is unprocessable because it is not a list of dicts,
    # per the Unstructured ISD format
    not_a_list = '{"hi": "there"}'
    with pytest.raises(ValueError):
        partition_json(text=not_a_list)
|
|
|
|
|
|
|
|
|
|
|
|
def test_partition_json_raises_with_invalid_json():
    """Syntactically broken JSON (a stray closing bracket) raises ``ValueError``."""
    malformed = '[{"hi": "there"}]]'
    with pytest.raises(ValueError):
        partition_json(text=malformed)
|