mirror of
				https://github.com/Unstructured-IO/unstructured.git
				synced 2025-10-31 10:03:07 +00:00 
			
		
		
		
	 5b92e0bb6b
			
		
	
	
		5b92e0bb6b
		
			
		
	
	
	
	
		
			
			Fixes the OneDrive bug the same way Ryan fixed the SharePoint error (both are Microsoft products); see https://github.com/Unstructured-IO/unstructured/pull/2591 and https://github.com/Unstructured-IO/unstructured/pull/2592/files for reference. We are seeing inconsistencies in the timestamps returned by OneDrive when fetching created and modified dates. Furthermore, in future versions of this library, a datetime object will be returned rather than a string. Changes: Adds logic to guarantee OneDrive dates will be properly formatted as ISO, regardless of the format provided by the OneDrive library. Bumps the timestamp format output to include the timezone offset (as we do with others). Adds unit tests for isoformat; json_to_dict is already unit tested here: https://github.com/Unstructured-IO/unstructured/blob/main/test_unstructured_ingest/unit/test_utils.py Adds a small change for AstraDB to allow them to see what source called their API.
		
			
				
	
	
		
			165 lines
		
	
	
		
			4.7 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			165 lines
		
	
	
		
			4.7 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| import json
 | |
| import typing as t
 | |
| from dataclasses import dataclass, field
 | |
| from datetime import datetime
 | |
| 
 | |
| import pytest
 | |
| import pytz
 | |
| 
 | |
| from unstructured.ingest.cli.utils import extract_config
 | |
| from unstructured.ingest.interfaces import BaseConfig
 | |
| from unstructured.ingest.utils.string_and_date_utils import ensure_isoformat_datetime, json_to_dict
 | |
| 
 | |
| 
 | |
@dataclass
class A(BaseConfig):
    """Innermost test config: a ``BaseConfig`` dataclass with one string field."""

    a: str
 | |
| 
 | |
| 
 | |
@dataclass
class B(BaseConfig):
    """Test config that nests an ``A`` config next to an integer field."""

    a: A  # nested config dataclass
    b: int
 | |
| 
 | |
| 
 | |
# Flat input passed to extract_config by the tests that do not define their own data.
flat_data = {"a": "test", "b": 4, "c": True}
 | |
| 
 | |
| 
 | |
def test_extract_config_concrete():
    """Required nested fields: the flat input is expected to serialize as C -> B -> A."""

    @dataclass
    class C(BaseConfig):
        b: B
        c: bool

    config = extract_config(flat_data=flat_data, config=C)

    actual_json = config.to_json(sort_keys=True)
    expected_json = json.dumps(
        {"b": {"a": {"a": "test"}, "b": 4}, "c": True},
        sort_keys=True,
    )
    assert actual_json == expected_json
 | |
| 
 | |
| 
 | |
def test_extract_config_optional():
    """An ``Optional[B]`` field is expected to serialize as a fully nested B."""

    @dataclass
    class C(BaseConfig):
        c: bool
        b: t.Optional[B] = None

    config = extract_config(flat_data=flat_data, config=C)

    actual_json = config.to_json(sort_keys=True)
    expected_json = json.dumps(
        {"b": {"a": {"a": "test"}, "b": 4}, "c": True},
        sort_keys=True,
    )
    assert actual_json == expected_json
 | |
| 
 | |
| 
 | |
def test_extract_config_union():
    """With a ``Union[B, int]`` field, ``b`` is expected to hold the flat int, not a nested B."""

    @dataclass
    class C(BaseConfig):
        c: bool
        b: t.Optional[t.Union[B, int]] = None

    config = extract_config(flat_data=flat_data, config=C)

    actual_json = config.to_json(sort_keys=True)
    expected_json = json.dumps({"b": 4, "c": True}, sort_keys=True)
    assert actual_json == expected_json
 | |
| 
 | |
| 
 | |
def test_extract_config_list():
    """A ``List[int]`` field is expected to be carried through next to the nested B."""

    @dataclass
    class C(BaseConfig):
        c: t.List[int]
        b: B

    # Local input: same keys as the module-level fixture, but "c" is a list.
    data = {"a": "test", "b": 4, "c": [1, 2, 3]}
    config = extract_config(flat_data=data, config=C)

    actual_json = config.to_json(sort_keys=True)
    expected_json = json.dumps(
        {"b": {"a": {"a": "test"}, "b": 4}, "c": [1, 2, 3]},
        sort_keys=True,
    )
    assert actual_json == expected_json
 | |
| 
 | |
| 
 | |
def test_extract_config_optional_list():
    """An ``Optional[List[int]]`` field is expected to keep the list supplied in the input."""

    @dataclass
    class C(BaseConfig):
        b: B
        c: t.Optional[t.List[int]] = None

    # Local input: same keys as the module-level fixture, but "c" is a list.
    data = {"a": "test", "b": 4, "c": [1, 2, 3]}
    config = extract_config(flat_data=data, config=C)

    actual_json = config.to_json(sort_keys=True)
    expected_json = json.dumps(
        {"b": {"a": {"a": "test"}, "b": 4}, "c": [1, 2, 3]},
        sort_keys=True,
    )
    assert actual_json == expected_json
 | |
| 
 | |
| 
 | |
def test_extract_config_dataclass_list():
    """A ``List[B]`` field with a list default factory is expected to serialize as ``[]``."""

    @dataclass
    class C(BaseConfig):
        c: bool
        b: t.List[B] = field(default_factory=list)

    # The input carries no "b" key.
    data = {"a": "test", "c": True}
    config = extract_config(flat_data=data, config=C)

    actual_json = config.to_json(sort_keys=True)
    expected_json = json.dumps({"b": [], "c": True}, sort_keys=True)
    assert actual_json == expected_json
 | |
| 
 | |
| 
 | |
def test_extract_config_dict():
    """A ``Dict[str, B]`` field with a dict default factory is expected to serialize as ``{}``."""

    @dataclass
    class C(BaseConfig):
        c: bool
        b: t.Dict[str, B] = field(default_factory=dict)

    # The input carries only the "c" key.
    data = {"c": True}
    config = extract_config(flat_data=data, config=C)

    actual_json = config.to_json(sort_keys=True)
    expected_json = json.dumps({"c": True, "b": {}}, sort_keys=True)
    assert actual_json == expected_json
 | |
| 
 | |
| 
 | |
def test_json_to_dict_valid_json():
    """A well-formed JSON object string is expected to come back as a dict."""
    raw = '{"key": "value"}'
    assert json_to_dict(raw) == {"key": "value"}
    assert isinstance(json_to_dict(raw), dict)
 | |
| 
 | |
| 
 | |
def test_json_to_dict_malformed_json():
    """A JSON string missing its closing brace is expected to come back unchanged, as a str."""
    raw = '{"key": "value"'
    assert json_to_dict(raw) == '{"key": "value"'
    assert isinstance(json_to_dict(raw), str)
 | |
| 
 | |
| 
 | |
def test_json_to_dict_single_quotes():
    """A single-quoted, dict-like string is expected to be converted to a dict."""
    raw = "{'key': 'value'}"
    assert json_to_dict(raw) == {"key": "value"}
    assert isinstance(json_to_dict(raw), dict)
 | |
| 
 | |
| 
 | |
def test_json_to_dict_path():
    """A file-path string is expected to come back unchanged, as a str."""
    raw = "/path/to/file.json"
    assert json_to_dict(raw) == "/path/to/file.json"
    assert isinstance(json_to_dict(raw), str)
 | |
| 
 | |
| 
 | |
def test_ensure_isoformat_datetime_for_datetime():
    """A naive datetime is expected to be rendered as an ISO string without an offset."""
    naive = datetime(2021, 1, 1, 12)  # minute and second default to 0
    result = ensure_isoformat_datetime(naive)
    assert result == "2021-01-01T12:00:00"
 | |
| 
 | |
| 
 | |
def test_ensure_isoformat_datetime_for_datetime_with_tz():
    """A UTC-aware datetime is expected to be rendered with a ``+00:00`` offset."""
    aware = datetime(2021, 1, 1, 12, tzinfo=pytz.UTC)  # minute and second default to 0
    result = ensure_isoformat_datetime(aware)
    assert result == "2021-01-01T12:00:00+00:00"
 | |
| 
 | |
| 
 | |
def test_ensure_isoformat_datetime_for_string():
    """An ISO string without an offset is expected to come back with the same text."""
    stamp = "2021-01-01T12:00:00"
    result = ensure_isoformat_datetime(stamp)
    assert result == "2021-01-01T12:00:00"
 | |
| 
 | |
| 
 | |
def test_ensure_isoformat_datetime_for_string2():
    """An ISO string with a ``+00:00`` offset is expected to come back with the same text."""
    stamp = "2021-01-01T12:00:00+00:00"
    result = ensure_isoformat_datetime(stamp)
    assert result == "2021-01-01T12:00:00+00:00"
 | |
| 
 | |
| 
 | |
def test_ensure_isoformat_datetime_fails_on_string():
    """A string that is not a timestamp is expected to raise ``ValueError``."""
    unparseable = "bad timestamp"
    with pytest.raises(ValueError):
        ensure_isoformat_datetime(unparseable)
 | |
| 
 | |
| 
 | |
def test_ensure_isoformat_datetime_fails_on_int():
    """An integer argument is expected to raise ``TypeError``."""
    not_a_timestamp = 1111
    with pytest.raises(TypeError):
        ensure_isoformat_datetime(not_a_timestamp)
 |