Steve Canny 3fe5c094fa
rfctr(file): refactor detect_filetype() (#3429)
**Summary**
In preparation for fixing a cluster of bugs with automatic file-type
detection, and to pave the way for some reliability improvements, refactor
the `unstructured.file_utils.filetype` module and improve the thoroughness
of its tests.

**Additional Context**
Factor the type-recognition process into three distinct strategies that
are attempted in order of preference. Type-recognition falls through to
the next strategy when the one before it is not applicable or cannot
determine the file-type. This provides a clear basis for organizing the
code and tests at the top level.

Consolidate the existing tests around these strategies, adding
additional cases to achieve better coverage.

Several bugs were uncovered in the process. Small ones were fixed here;
bigger ones will be remedied in follow-up PRs.
2024-07-23 23:18:48 +00:00

420 lines
15 KiB
Python

"""Test-suite for `unstructured.partition.json` module."""
from __future__ import annotations
import os
import pathlib
import tempfile
import pytest
from pytest_mock import MockFixture
from test_unstructured.unit_utils import example_doc_path
from unstructured.documents.elements import CompositeElement
from unstructured.file_utils.model import FileType
from unstructured.partition.email import partition_email
from unstructured.partition.html import partition_html
from unstructured.partition.json import partition_json
from unstructured.partition.text import partition_text
from unstructured.partition.xml import partition_xml
from unstructured.staging.base import elements_to_json
# Directory that contains this test module.
DIRECTORY = pathlib.Path(__file__).parent.resolve()

# True when the `/.dockerenv` marker file exists on this filesystem.
is_in_docker = os.path.exists("/.dockerenv")

# Example documents, named as they are passed to `example_doc_path()`, that the
# parametrized tests partition and then round-trip through JSON.
test_files = [
    "fake-text.txt",
    "fake-html.html",
    "eml/fake-email.eml",
]
def test_it_chunks_elements_when_a_chunking_strategy_is_specified():
    """`partition_json()` returns chunks when a `chunking_strategy` is given."""
    # Resolve the document with `example_doc_path()`, as the parametrized tests in
    # this module do, so the test does not depend on the current working directory.
    chunks = partition_json(
        example_doc_path("spring-weather.html.json"),
        chunking_strategy="basic",
        max_characters=1500,
    )

    assert len(chunks) == 10
    assert all(isinstance(ch, CompositeElement) for ch in chunks)
@pytest.mark.parametrize("filename", test_files)
def test_partition_json_from_filename(filename: str):
    """Elements written to a JSON file are recovered by `partition_json(filename=...)`."""
    source_path = example_doc_path(filename)
    file_type = FileType.from_extension(os.path.splitext(source_path)[1])

    # Partition the example document with the partitioner matching its extension.
    elements = []
    if file_type == FileType.TXT:
        elements = partition_text(filename=source_path)
    elif file_type == FileType.HTML:
        elements = partition_html(filename=source_path)
    elif file_type == FileType.XML:
        elements = partition_xml(filename=source_path)
    elif file_type == FileType.EML:
        elements = partition_email(filename=source_path)

    # Serialize those elements to a temporary JSON file and partition that file.
    with tempfile.TemporaryDirectory() as tmpdir:
        json_path = os.path.join(tmpdir, os.path.basename(filename) + ".json")
        elements_to_json(elements, filename=json_path, indent=2)
        test_elements = partition_json(filename=json_path)

    assert len(elements) > 0
    assert len(str(elements[0])) > 0
    assert len(elements) == len(test_elements)

    expected_filename = filename.split("/")[-1]
    for element, test_element in zip(elements, test_elements):
        assert element == test_element
        assert element.metadata.filename == expected_filename
@pytest.mark.parametrize("filename", test_files)
def test_partition_json_from_filename_with_metadata_filename(filename: str):
    """`metadata_filename` sets `metadata.filename` on every element read from a JSON file."""
    source_path = example_doc_path(filename)
    file_type = FileType.from_extension(os.path.splitext(source_path)[1])

    # Partition the example document with the partitioner matching its extension.
    elements = []
    if file_type == FileType.TXT:
        elements = partition_text(filename=source_path)
    elif file_type == FileType.HTML:
        elements = partition_html(filename=source_path)
    elif file_type == FileType.XML:
        elements = partition_xml(filename=source_path)
    elif file_type == FileType.EML:
        elements = partition_email(filename=source_path)

    # Serialize those elements to a temporary JSON file and partition that file.
    with tempfile.TemporaryDirectory() as tmpdir:
        json_path = os.path.join(tmpdir, os.path.basename(filename) + ".json")
        elements_to_json(elements, filename=json_path, indent=2)
        test_elements = partition_json(filename=json_path, metadata_filename="test")

    assert len(test_elements) > 0
    assert len(str(test_elements[0])) > 0
    for test_element in test_elements:
        assert test_element.metadata.filename == "test"
@pytest.mark.parametrize("filename", test_files)
def test_partition_json_from_file(filename: str):
    """Elements written to a JSON file are recovered by `partition_json(file=...)`."""
    source_path = example_doc_path(filename)
    file_type = FileType.from_extension(os.path.splitext(source_path)[1])

    # Partition the example document with the partitioner matching its extension.
    elements = []
    if file_type == FileType.TXT:
        elements = partition_text(filename=source_path)
    elif file_type == FileType.HTML:
        elements = partition_html(filename=source_path)
    elif file_type == FileType.XML:
        elements = partition_xml(filename=source_path)
    elif file_type == FileType.EML:
        elements = partition_email(filename=source_path)

    # Serialize those elements to a temporary JSON file and partition it from an
    # open binary file object.
    with tempfile.TemporaryDirectory() as tmpdir:
        json_path = os.path.join(tmpdir, os.path.basename(filename) + ".json")
        elements_to_json(elements, filename=json_path, indent=2)
        with open(json_path, "rb") as json_file:
            test_elements = partition_json(file=json_file)

    assert len(elements) > 0
    assert len(str(elements[0])) > 0
    assert len(elements) == len(test_elements)

    expected_filename = filename.split("/")[-1]
    for element, test_element in zip(elements, test_elements):
        assert element == test_element
        assert element.metadata.filename == expected_filename
@pytest.mark.parametrize("filename", test_files)
def test_partition_json_from_file_with_metadata_filename(filename: str):
    """`metadata_filename` sets `metadata.filename` on every element read from a file object."""
    path = example_doc_path(filename)
    filetype = FileType.from_extension(os.path.splitext(path)[1])

    # Partition the example document with the partitioner matching its extension.
    elements = []
    if filetype == FileType.TXT:
        elements = partition_text(filename=path)
    elif filetype == FileType.HTML:
        elements = partition_html(filename=path)
    elif filetype == FileType.XML:
        elements = partition_xml(filename=path)
    elif filetype == FileType.EML:
        elements = partition_email(filename=path)

    # Serialize those elements to a temporary JSON file and partition it from an
    # open binary file object.
    with tempfile.TemporaryDirectory() as tmpdir:
        test_path = os.path.join(tmpdir, os.path.basename(filename) + ".json")
        elements_to_json(elements, filename=test_path, indent=2)
        with open(test_path, "rb") as f:
            test_elements = partition_json(file=f, metadata_filename="test")

    # Guard against a vacuous pass: without this, an empty result would satisfy
    # the per-element check below.
    assert len(test_elements) > 0
    assert all(element.metadata.filename == "test" for element in test_elements)
@pytest.mark.parametrize("filename", test_files)
def test_partition_json_from_text(filename: str):
    """Elements serialized to JSON are recovered by `partition_json(text=...)`."""
    path = example_doc_path(filename)
    filetype = FileType.from_extension(os.path.splitext(path)[1])

    # Partition the example document with the partitioner matching its extension.
    elements = []
    if filetype == FileType.TXT:
        elements = partition_text(filename=path)
    elif filetype == FileType.HTML:
        elements = partition_html(filename=path)
    elif filetype == FileType.XML:
        elements = partition_xml(filename=path)
    elif filetype == FileType.EML:
        elements = partition_email(filename=path)

    # Serialize those elements to a temporary JSON file, read it back as a string
    # and partition that string.
    with tempfile.TemporaryDirectory() as tmpdir:
        test_path = os.path.join(tmpdir, os.path.basename(filename) + ".json")
        elements_to_json(elements, filename=test_path, indent=2)
        # Decode explicitly as UTF-8 rather than with the locale default, matching
        # how `test_partition_json_raises_with_too_many_specified` reads this file.
        with open(test_path, encoding="utf-8") as f:
            text = f.read()
        test_elements = partition_json(text=text)

    assert len(elements) > 0
    assert len(str(elements[0])) > 0
    assert len(elements) == len(test_elements)

    expected_filename = filename.split("/")[-1]
    for element, test_element in zip(elements, test_elements):
        assert element == test_element
        assert element.metadata.filename == expected_filename
def test_partition_json_raises_with_none_specified():
    """Calling `partition_json()` with no input source raises `ValueError`."""
    with pytest.raises(ValueError):
        partition_json()
def test_partition_json_works_with_empty_string():
    """An empty `text` argument produces an empty list of elements."""
    elements = partition_json(text="")

    assert elements == []
def test_partition_json_works_with_empty_list():
    """A `text` argument holding an empty JSON array produces an empty list of elements."""
    elements = partition_json(text="[]")

    assert elements == []
def test_partition_json_raises_with_too_many_specified():
    """`partition_json()` raises `ValueError` when more than one input source is given."""
    # The source document is always `fake-text.txt`, so it is partitioned as text
    # directly; no dispatch on file-type is needed.
    elements = partition_text(filename=example_doc_path("fake-text.txt"))

    with tempfile.TemporaryDirectory() as tmpdir:
        test_path = os.path.join(tmpdir, "fake-text.txt.json")
        elements_to_json(elements, filename=test_path, indent=2)

        with open(test_path, "rb") as f:
            text = f.read().decode("utf-8")

            # Every combination of two or more of `filename`, `file` and `text`
            # must be rejected.
            with pytest.raises(ValueError):
                partition_json(filename=test_path, file=f)

            with pytest.raises(ValueError):
                partition_json(filename=test_path, text=text)

            with pytest.raises(ValueError):
                partition_json(file=f, text=text)

            with pytest.raises(ValueError):
                partition_json(filename=test_path, file=f, text=text)
@pytest.mark.parametrize("filename", test_files)
def test_partition_json_from_filename_exclude_metadata(filename: str):
    """`include_metadata=False` yields elements with empty metadata when reading by filename."""
    path = example_doc_path(filename)
    filetype = FileType.from_extension(os.path.splitext(path)[1])

    # Partition the example document with the partitioner matching its extension.
    elements = []
    if filetype == FileType.TXT:
        elements = partition_text(filename=path)
    elif filetype == FileType.HTML:
        elements = partition_html(filename=path)
    elif filetype == FileType.XML:
        elements = partition_xml(filename=path)
    elif filetype == FileType.EML:
        elements = partition_email(filename=path)

    # Serialize those elements to a temporary JSON file and partition that file.
    with tempfile.TemporaryDirectory() as tmpdir:
        test_path = os.path.join(tmpdir, os.path.basename(filename) + ".json")
        elements_to_json(elements, filename=test_path, indent=2)
        test_elements = partition_json(filename=test_path, include_metadata=False)

    # Guard against a vacuous pass on an empty result.
    assert len(test_elements) > 0
    # Compare the dict itself; `any(dict)` would only inspect its keys.
    for element in test_elements:
        assert element.metadata.to_dict() == {}
@pytest.mark.parametrize("filename", test_files)
def test_partition_json_from_file_exclude_metadata(filename: str):
    """`include_metadata=False` yields elements with empty metadata when reading a file object."""
    path = example_doc_path(filename)
    filetype = FileType.from_extension(os.path.splitext(path)[1])

    # Partition the example document with the partitioner matching its extension.
    elements = []
    if filetype == FileType.TXT:
        elements = partition_text(filename=path)
    elif filetype == FileType.HTML:
        elements = partition_html(filename=path)
    elif filetype == FileType.XML:
        elements = partition_xml(filename=path)
    elif filetype == FileType.EML:
        elements = partition_email(filename=path)

    # Serialize those elements to a temporary JSON file and partition it from an
    # open binary file object.
    with tempfile.TemporaryDirectory() as tmpdir:
        test_path = os.path.join(tmpdir, os.path.basename(filename) + ".json")
        elements_to_json(elements, filename=test_path, indent=2)
        with open(test_path, "rb") as f:
            test_elements = partition_json(file=f, include_metadata=False)

    # Guard against a vacuous pass on an empty result.
    assert len(test_elements) > 0
    # Compare the dict itself; `any(dict)` would only inspect its keys.
    for element in test_elements:
        assert element.metadata.to_dict() == {}
@pytest.mark.parametrize("filename", test_files)
def test_partition_json_from_text_exclude_metadata(filename: str):
    """`include_metadata=False` yields elements with empty metadata when reading from text."""
    path = example_doc_path(filename)
    filetype = FileType.from_extension(os.path.splitext(path)[1])

    # Partition the example document with the partitioner matching its extension.
    elements = []
    if filetype == FileType.TXT:
        elements = partition_text(filename=path)
    elif filetype == FileType.HTML:
        elements = partition_html(filename=path)
    elif filetype == FileType.XML:
        elements = partition_xml(filename=path)
    elif filetype == FileType.EML:
        elements = partition_email(filename=path)

    # Serialize those elements to a temporary JSON file, read it back as a string
    # and partition that string.
    with tempfile.TemporaryDirectory() as tmpdir:
        test_path = os.path.join(tmpdir, os.path.basename(filename) + ".json")
        elements_to_json(elements, filename=test_path, indent=2)
        # Decode explicitly as UTF-8 rather than with the locale default, matching
        # how `test_partition_json_raises_with_too_many_specified` reads this file.
        with open(test_path, encoding="utf-8") as f:
            text = f.read()
        test_elements = partition_json(text=text, include_metadata=False)

    # Guard against a vacuous pass on an empty result.
    assert len(test_elements) > 0
    # Compare the dict itself; `any(dict)` would only inspect its keys.
    for element in test_elements:
        assert element.metadata.to_dict() == {}
def test_partition_json_metadata_date(mocker: MockFixture):
    """`last_modified` takes the value of `get_last_modified_date()` when partitioning a path."""
    mocked_last_modification_date = "2029-07-05T09:24:28"
    mocker.patch(
        "unstructured.partition.json.get_last_modified_date",
        return_value=mocked_last_modification_date,
    )

    # Resolve the document with `example_doc_path()` so the test does not depend on
    # the current working directory.
    elements = partition_json(example_doc_path("spring-weather.html.json"))

    assert elements[0].metadata.last_modified == mocked_last_modification_date
def test_partition_json_with_custom_metadata_date(mocker: MockFixture):
    """`metadata_last_modified` takes precedence over `get_last_modified_date()` for a path."""
    mocked_last_modification_date = "2029-07-05T09:24:28"
    expected_last_modification_date = "2020-07-05T09:24:28"
    mocker.patch(
        "unstructured.partition.json.get_last_modified_date",
        return_value=mocked_last_modification_date,
    )

    # Resolve the document with `example_doc_path()` so the test does not depend on
    # the current working directory.
    elements = partition_json(
        example_doc_path("spring-weather.html.json"),
        metadata_last_modified=expected_last_modification_date,
    )

    assert elements[0].metadata.last_modified == expected_last_modification_date
def test_partition_json_from_file_metadata_date(mocker: MockFixture):
    """`last_modified` is None for a file object when `date_from_file_object` is not given."""
    mocked_last_modification_date = "2029-07-05T09:24:28"
    mocker.patch(
        "unstructured.partition.json.get_last_modified_date_from_file",
        return_value=mocked_last_modification_date,
    )

    # Resolve the document with `example_doc_path()` so the test does not depend on
    # the current working directory.
    with open(example_doc_path("spring-weather.html.json"), "rb") as f:
        elements = partition_json(file=f)

    # The patched helper's value must not appear in the metadata here.
    assert elements[0].metadata.last_modified is None
def test_partition_json_from_file_explicit_get_metadata_date(
    mocker: MockFixture,
    filename: str = "example-docs/spring-weather.html.json",
):
    """`date_from_file_object=True` takes `last_modified` from the file object.

    `get_last_modified_date_from_file()` is patched, and its return value is
    expected to appear as `metadata.last_modified`.
    """
    mocked_last_modification_date = "2029-07-05T09:24:28"
    mocker.patch(
        "unstructured.partition.json.get_last_modified_date_from_file",
        return_value=mocked_last_modification_date,
    )

    with open(filename, "rb") as f:
        elements = partition_json(
            file=f,
            date_from_file_object=True,
        )

    assert elements[0].metadata.last_modified == mocked_last_modification_date
def test_partition_json_from_file_with_custom_metadata_date(mocker: MockFixture):
    """`metadata_last_modified` sets `last_modified` when partitioning a file object."""
    mocked_last_modification_date = "2029-07-05T09:24:28"
    expected_last_modification_date = "2020-07-05T09:24:28"
    mocker.patch(
        "unstructured.partition.json.get_last_modified_date_from_file",
        return_value=mocked_last_modification_date,
    )

    # Resolve the document with `example_doc_path()` so the test does not depend on
    # the current working directory.
    with open(example_doc_path("spring-weather.html.json"), "rb") as f:
        elements = partition_json(file=f, metadata_last_modified=expected_last_modification_date)

    assert elements[0].metadata.last_modified == expected_last_modification_date
def test_partition_json_from_text_metadata_date():
    """`last_modified` is None when partitioning from a `text` string."""
    # Resolve the document with `example_doc_path()` so the test does not depend on
    # the current working directory, and decode as UTF-8 rather than with the
    # locale default encoding.
    with open(example_doc_path("spring-weather.html.json"), encoding="utf-8") as f:
        text = f.read()

    elements = partition_json(text=text)

    assert elements[0].metadata.last_modified is None
def test_partition_json_from_text_with_custom_metadata_date():
    """`metadata_last_modified` sets `last_modified` when partitioning from `text`."""
    expected_last_modification_date = "2020-07-05T09:24:28"

    # Resolve the document with `example_doc_path()` so the test does not depend on
    # the current working directory, and decode as UTF-8 rather than with the
    # locale default encoding.
    with open(example_doc_path("spring-weather.html.json"), encoding="utf-8") as f:
        text = f.read()

    elements = partition_json(text=text, metadata_last_modified=expected_last_modification_date)

    assert elements[0].metadata.last_modified == expected_last_modification_date
def test_partition_json_from_file_without_metadata_date(
    filename: str = "example-docs/spring-weather.html.json",
):
    """`last_modified` is None for a file object that offers no last-modified date.

    The document's bytes are copied into a `SpooledTemporaryFile`, which is then
    partitioned with `date_from_file_object=True`.
    """
    # Open both files in one `with` so the spooled file is closed on exit; it was
    # previously left open.
    with open(filename, "rb") as f, tempfile.SpooledTemporaryFile() as sf:
        sf.write(f.read())
        sf.seek(0)
        elements = partition_json(file=sf, date_from_file_object=True)

    assert elements[0].metadata.last_modified is None
def test_partition_json_raises_with_unprocessable_json():
    """Well-formed JSON that is not a list of dicts is rejected with `ValueError`."""
    # NOTE(robinson) - This is unprocessable because it is not a list of dicts,
    # per the Unstructured ISD format
    not_a_list_of_dicts = '{"hi": "there"}'

    with pytest.raises(ValueError):
        partition_json(text=not_a_list_of_dicts)
def test_partition_json_raises_with_invalid_json():
    """Malformed JSON text is rejected with `ValueError`."""
    # The trailing extra `]` makes this string invalid JSON.
    malformed_json = '[{"hi": "there"}]]'

    with pytest.raises(ValueError):
        partition_json(text=malformed_json)