2023-12-05 15:55:19 -05:00
|
|
|
import json
|
|
|
|
import typing as t
|
|
|
|
from dataclasses import dataclass, field
|
2024-03-15 07:01:05 -07:00
|
|
|
from datetime import datetime
|
|
|
|
|
|
|
|
import pytest
|
|
|
|
import pytz
|
2023-12-05 15:55:19 -05:00
|
|
|
|
|
|
|
from unstructured.ingest.cli.utils import extract_config
|
|
|
|
from unstructured.ingest.interfaces import BaseConfig
|
2024-03-15 07:01:05 -07:00
|
|
|
from unstructured.ingest.utils.string_and_date_utils import ensure_isoformat_datetime, json_to_dict
|
2023-12-05 15:55:19 -05:00
|
|
|
|
|
|
|
|
|
|
|
@dataclass
class A(BaseConfig):
    """Innermost test config: a single string field named ``a``."""

    a: str
|
|
|
|
|
|
|
|
|
|
|
|
@dataclass
class B(BaseConfig):
    """Mid-level test config: wraps an ``A`` config plus an integer ``b``."""

    a: A
    b: int
|
|
|
|
|
|
|
|
|
|
|
|
# Flat key/value input used by the tests below that do not define their own
# local input mapping.
flat_data = {
    "a": "test",
    "b": 4,
    "c": True,
}
|
|
|
|
|
|
|
|
|
|
|
|
def test_extract_config_concrete():
    """Required nested dataclass fields are populated from the flat input."""

    @dataclass
    class C(BaseConfig):
        b: B
        c: bool

    config = extract_config(flat_data=flat_data, config=C)
    want = {"b": {"a": {"a": "test"}, "b": 4}, "c": True}
    assert config.to_json(sort_keys=True) == json.dumps(want, sort_keys=True)
|
|
|
|
|
|
|
|
|
|
|
|
def test_extract_config_optional():
    """An ``Optional[B]`` field is still built as a nested ``B`` config."""

    @dataclass
    class C(BaseConfig):
        c: bool
        b: t.Optional[B] = None

    config = extract_config(flat_data=flat_data, config=C)
    want = {"b": {"a": {"a": "test"}, "b": 4}, "c": True}
    assert config.to_json(sort_keys=True) == json.dumps(want, sort_keys=True)
|
|
|
|
|
|
|
|
|
|
|
|
def test_extract_config_union():
    """With ``Optional[Union[B, int]]`` the flat value of ``b`` is kept as the int 4."""

    @dataclass
    class C(BaseConfig):
        c: bool
        b: t.Optional[t.Union[B, int]] = None

    config = extract_config(flat_data=flat_data, config=C)
    # No nested B is built here: the expected JSON carries the plain integer.
    want = {"b": 4, "c": True}
    assert config.to_json(sort_keys=True) == json.dumps(want, sort_keys=True)
|
|
|
|
|
|
|
|
|
|
|
|
def test_extract_config_list():
    """A ``List[int]`` field is passed through alongside a nested ``B`` config."""

    @dataclass
    class C(BaseConfig):
        c: t.List[int]
        b: B

    flat_input = {"a": "test", "b": 4, "c": [1, 2, 3]}
    config = extract_config(flat_data=flat_input, config=C)
    want = {"b": {"a": {"a": "test"}, "b": 4}, "c": [1, 2, 3]}
    assert config.to_json(sort_keys=True) == json.dumps(want, sort_keys=True)
|
|
|
|
|
|
|
|
|
|
|
|
def test_extract_config_optional_list():
    """An ``Optional[List[int]]`` field receives the list given in the flat input."""

    @dataclass
    class C(BaseConfig):
        b: B
        c: t.Optional[t.List[int]] = None

    flat_input = {"a": "test", "b": 4, "c": [1, 2, 3]}
    config = extract_config(flat_data=flat_input, config=C)
    want = {"b": {"a": {"a": "test"}, "b": 4}, "c": [1, 2, 3]}
    assert config.to_json(sort_keys=True) == json.dumps(want, sort_keys=True)
|
|
|
|
|
|
|
|
|
|
|
|
def test_extract_config_dataclass_list():
    """A ``List[B]`` field with a list default factory serializes as an empty list."""

    @dataclass
    class C(BaseConfig):
        c: bool
        b: t.List[B] = field(default_factory=list)

    # The flat input has no "b" key; the expected output keeps the default [].
    flat_input = {"a": "test", "c": True}
    config = extract_config(flat_data=flat_input, config=C)
    want = {"b": [], "c": True}
    assert config.to_json(sort_keys=True) == json.dumps(want, sort_keys=True)
|
|
|
|
|
|
|
|
|
|
|
|
def test_extract_config_dict():
    """A ``Dict[str, B]`` field with a dict default factory serializes as ``{}``."""

    @dataclass
    class C(BaseConfig):
        c: bool
        b: t.Dict[str, B] = field(default_factory=dict)

    flat_input = {"c": True}
    config = extract_config(flat_data=flat_input, config=C)
    want = {"c": True, "b": {}}
    assert config.to_json(sort_keys=True) == json.dumps(want, sort_keys=True)
|
2024-03-03 07:30:35 -08:00
|
|
|
|
|
|
|
|
|
|
|
def test_json_to_dict_valid_json():
    """A well-formed JSON object string comes back as the equivalent dict."""
    raw = '{"key": "value"}'
    want = {"key": "value"}
    assert json_to_dict(raw) == want
    assert isinstance(json_to_dict(raw), dict)
|
|
|
|
|
|
|
|
|
|
|
|
def test_json_to_dict_malformed_json():
    """A JSON string missing its closing brace is returned unchanged as a str."""
    raw = '{"key": "value"'
    want = '{"key": "value"'
    assert json_to_dict(raw) == want
    assert isinstance(json_to_dict(raw), str)
|
|
|
|
|
|
|
|
|
|
|
|
def test_json_to_dict_single_quotes():
    """A single-quoted, dict-like string is still converted into a dict."""
    raw = "{'key': 'value'}"
    want = {"key": "value"}
    assert json_to_dict(raw) == want
    assert isinstance(json_to_dict(raw), dict)
|
|
|
|
|
|
|
|
|
|
|
|
def test_json_to_dict_path():
    """A filesystem-path string is returned unchanged as a str."""
    raw = "/path/to/file.json"
    want = "/path/to/file.json"
    assert json_to_dict(raw) == want
    assert isinstance(json_to_dict(raw), str)
|
2024-03-15 07:01:05 -07:00
|
|
|
|
|
|
|
|
|
|
|
def test_ensure_isoformat_datetime_for_datetime():
    """A naive datetime is rendered as an ISO-format string without an offset."""
    naive = datetime(2021, 1, 1, 12, 0, 0)
    rendered = ensure_isoformat_datetime(naive)
    assert rendered == "2021-01-01T12:00:00"
|
|
|
|
|
|
|
|
|
|
|
|
def test_ensure_isoformat_datetime_for_datetime_with_tz():
    """A UTC-aware datetime is rendered with a ``+00:00`` offset suffix."""
    aware = datetime(2021, 1, 1, 12, 0, 0, tzinfo=pytz.UTC)
    rendered = ensure_isoformat_datetime(aware)
    assert rendered == "2021-01-01T12:00:00+00:00"
|
|
|
|
|
|
|
|
|
|
|
|
def test_ensure_isoformat_datetime_for_string():
    """An ISO-format string without an offset is returned as the same text."""
    stamp = "2021-01-01T12:00:00"
    rendered = ensure_isoformat_datetime(stamp)
    assert rendered == "2021-01-01T12:00:00"
|
|
|
|
|
|
|
|
|
|
|
|
def test_ensure_isoformat_datetime_for_string2():
    """An ISO-format string carrying a ``+00:00`` offset is returned as the same text."""
    stamp = "2021-01-01T12:00:00+00:00"
    rendered = ensure_isoformat_datetime(stamp)
    assert rendered == "2021-01-01T12:00:00+00:00"
|
|
|
|
|
|
|
|
|
|
|
|
def test_ensure_isoformat_datetime_fails_on_string():
    """A string that is not a timestamp must raise ``ValueError``."""
    unparseable = "bad timestamp"
    with pytest.raises(ValueError):
        ensure_isoformat_datetime(unparseable)
|
|
|
|
|
|
|
|
|
|
|
|
def test_ensure_isoformat_datetime_fails_on_int():
    """An integer argument must raise ``TypeError``."""
    not_a_timestamp = 1111
    with pytest.raises(TypeError):
        ensure_isoformat_datetime(not_a_timestamp)
|