David Potter 5b92e0bb6b
bug CORE-4089: Onedrive partitioning fails - datetime formatting error (#2638)
Fixes the OneDrive bug the same way Ryan fixed the SharePoint error (both
are Microsoft products).
https://github.com/Unstructured-IO/unstructured/pull/2591
https://github.com/Unstructured-IO/unstructured/pull/2592/files

We are seeing occurrences of inconsistency in the timestamps returned by
Onedrive when fetching created and modified dates. Furthermore, in
future versions of this library, a datetime object will be returned
rather than a string.

Changes
This adds logic to guarantee OneDrive dates will be properly formatted
as ISO, regardless of the format provided by the OneDrive library.
Bumps timestamp format output to include the timezone offset (as we do
with others).

Adds unit tests for isoformat.

json_to_dict already unit tested here:

https://github.com/Unstructured-IO/unstructured/blob/main/test_unstructured_ingest/unit/test_utils.py

Adds a small change for AstraDB to allow them to see which source called
their API.
2024-03-15 14:01:05 +00:00

165 lines
4.7 KiB
Python

import json
import typing as t
from dataclasses import dataclass, field
from datetime import datetime
import pytest
import pytz
from unstructured.ingest.cli.utils import extract_config
from unstructured.ingest.interfaces import BaseConfig
from unstructured.ingest.utils.string_and_date_utils import ensure_isoformat_datetime, json_to_dict
# Innermost fixture config: a single string field named ``a``.
@dataclass
class A(BaseConfig):
    a: str
# Middle fixture config: nests an ``A`` alongside an integer field ``b``.
@dataclass
class B(BaseConfig):
    a: A
    b: int
# Shared flat input mapping used by the extract_config tests below.
flat_data = {
    "a": "test",
    "b": 4,
    "c": True,
}
def test_extract_config_concrete():
    """Required nested dataclass fields are built from the flat mapping."""

    @dataclass
    class C(BaseConfig):
        b: B
        c: bool

    expected_json = json.dumps(
        {"b": {"a": {"a": "test"}, "b": 4}, "c": True},
        sort_keys=True,
    )
    config = extract_config(flat_data=flat_data, config=C)
    assert config.to_json(sort_keys=True) == expected_json
def test_extract_config_optional():
    """An Optional[B] field is still populated when the flat data provides it."""

    @dataclass
    class C(BaseConfig):
        c: bool
        b: t.Optional[B] = None

    expected_json = json.dumps(
        {"b": {"a": {"a": "test"}, "b": 4}, "c": True},
        sort_keys=True,
    )
    config = extract_config(flat_data=flat_data, config=C)
    assert config.to_json(sort_keys=True) == expected_json
def test_extract_config_union():
    """An Optional[Union[B, int]] field receives the plain int from the flat data."""

    @dataclass
    class C(BaseConfig):
        c: bool
        b: t.Optional[t.Union[B, int]] = None

    expected_json = json.dumps({"b": 4, "c": True}, sort_keys=True)
    config = extract_config(flat_data=flat_data, config=C)
    assert config.to_json(sort_keys=True) == expected_json
def test_extract_config_list():
    """A List[int] field is carried over alongside a nested dataclass field."""

    @dataclass
    class C(BaseConfig):
        c: t.List[int]
        b: B

    # Local input (not the module-level mapping): "c" is a list here.
    flat_input = {"a": "test", "b": 4, "c": [1, 2, 3]}
    expected_json = json.dumps(
        {"b": {"a": {"a": "test"}, "b": 4}, "c": [1, 2, 3]},
        sort_keys=True,
    )
    config = extract_config(flat_data=flat_input, config=C)
    assert config.to_json(sort_keys=True) == expected_json
def test_extract_config_optional_list():
    """An Optional[List[int]] field takes the list supplied in the flat data."""

    @dataclass
    class C(BaseConfig):
        b: B
        c: t.Optional[t.List[int]] = None

    # Local input (not the module-level mapping): "c" is a list here.
    flat_input = {"a": "test", "b": 4, "c": [1, 2, 3]}
    expected_json = json.dumps(
        {"b": {"a": {"a": "test"}, "b": 4}, "c": [1, 2, 3]},
        sort_keys=True,
    )
    config = extract_config(flat_data=flat_input, config=C)
    assert config.to_json(sort_keys=True) == expected_json
def test_extract_config_dataclass_list():
    """A List[B] field with a default factory serializes as an empty list here."""

    @dataclass
    class C(BaseConfig):
        c: bool
        b: t.List[B] = field(default_factory=list)

    # No "b" key is supplied, so the expected output shows the default.
    flat_input = {"a": "test", "c": True}
    expected_json = json.dumps({"b": [], "c": True}, sort_keys=True)
    config = extract_config(flat_data=flat_input, config=C)
    assert config.to_json(sort_keys=True) == expected_json
def test_extract_config_dict():
    """A Dict[str, B] field with a default factory serializes as an empty dict here."""

    @dataclass
    class C(BaseConfig):
        c: bool
        b: t.Dict[str, B] = field(default_factory=dict)

    # Only "c" is supplied, so the expected output shows the default for "b".
    flat_input = {"c": True}
    expected_json = json.dumps({"c": True, "b": {}}, sort_keys=True)
    config = extract_config(flat_data=flat_input, config=C)
    assert config.to_json(sort_keys=True) == expected_json
def test_json_to_dict_valid_json():
    """Well-formed JSON text is converted into the matching dict."""
    raw = '{"key": "value"}'
    assert json_to_dict(raw) == {"key": "value"}
    assert isinstance(json_to_dict(raw), dict)
def test_json_to_dict_malformed_json():
    """Malformed JSON text (missing closing brace) is returned as the same string."""
    raw = '{"key": "value"'
    # The expected value is the input itself, unchanged.
    assert json_to_dict(raw) == raw
    assert isinstance(json_to_dict(raw), str)
def test_json_to_dict_single_quotes():
    """A single-quoted dict-style string is still converted into a dict."""
    raw = "{'key': 'value'}"
    assert json_to_dict(raw) == {"key": "value"}
    assert isinstance(json_to_dict(raw), dict)
def test_json_to_dict_path():
    """A filesystem-path string is returned as the same string."""
    raw = "/path/to/file.json"
    # The expected value is the input itself, unchanged.
    assert json_to_dict(raw) == raw
    assert isinstance(json_to_dict(raw), str)
def test_ensure_isoformat_datetime_for_datetime():
    """A naive datetime is rendered as an ISO string with no offset."""
    naive = datetime(year=2021, month=1, day=1, hour=12, minute=0, second=0)
    assert ensure_isoformat_datetime(naive) == "2021-01-01T12:00:00"
def test_ensure_isoformat_datetime_for_datetime_with_tz():
    """A UTC-aware datetime is rendered as an ISO string with a +00:00 offset."""
    aware = datetime(2021, 1, 1, 12, 0, 0, tzinfo=pytz.UTC)
    assert ensure_isoformat_datetime(aware) == "2021-01-01T12:00:00+00:00"
def test_ensure_isoformat_datetime_for_string():
    """An ISO string without an offset comes back as the same text."""
    iso_text = "2021-01-01T12:00:00"
    assert ensure_isoformat_datetime(iso_text) == "2021-01-01T12:00:00"
def test_ensure_isoformat_datetime_for_string2():
    """An ISO string with a +00:00 offset comes back as the same text."""
    iso_text = "2021-01-01T12:00:00+00:00"
    assert ensure_isoformat_datetime(iso_text) == "2021-01-01T12:00:00+00:00"
def test_ensure_isoformat_datetime_fails_on_string():
    """A string that is not a timestamp is expected to raise ValueError."""
    not_a_timestamp = "bad timestamp"
    with pytest.raises(ValueError):
        ensure_isoformat_datetime(not_a_timestamp)
def test_ensure_isoformat_datetime_fails_on_int():
    """An integer argument is expected to raise TypeError."""
    not_a_datetime = 1111
    with pytest.raises(TypeError):
        ensure_isoformat_datetime(not_a_datetime)