| 
									
										
										
										
											2024-02-20 17:35:16 -08:00
										 |  |  | """Test-suite for `unstructured.partition.json` module.""" | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | from __future__ import annotations | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-03-09 03:36:01 +09:00
										 |  |  | import os | 
					
						
							|  |  |  | import pathlib | 
					
						
							|  |  |  | import tempfile | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | import pytest | 
					
						
							| 
									
										
										
										
											2024-02-20 17:35:16 -08:00
										 |  |  | from pytest_mock import MockFixture | 
					
						
							| 
									
										
										
										
											2023-03-09 03:36:01 +09:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-02-20 17:35:16 -08:00
										 |  |  | from unstructured.documents.elements import CompositeElement | 
					
						
							| 
									
										
										
										
											2023-08-29 16:59:26 -04:00
										 |  |  | from unstructured.file_utils.filetype import FileType, detect_filetype | 
					
						
							|  |  |  | from unstructured.partition.email import partition_email | 
					
						
							|  |  |  | from unstructured.partition.html import partition_html | 
					
						
							| 
									
										
										
										
											2023-03-09 03:36:01 +09:00
										 |  |  | from unstructured.partition.json import partition_json | 
					
						
							| 
									
										
										
										
											2023-08-29 16:59:26 -04:00
										 |  |  | from unstructured.partition.text import partition_text | 
					
						
							|  |  |  | from unstructured.partition.xml import partition_xml | 
					
						
							| 
									
										
										
										
											2023-03-09 03:36:01 +09:00
										 |  |  | from unstructured.staging.base import elements_to_json | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | DIRECTORY = pathlib.Path(__file__).parent.resolve() | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-03-30 16:54:29 -04:00
										 |  |  | is_in_docker = os.path.exists("/.dockerenv") | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-03-09 03:36:01 +09:00
										 |  |  | test_files = [ | 
					
						
							|  |  |  |     "fake-text.txt", | 
					
						
							|  |  |  |     "fake-html.html", | 
					
						
							| 
									
										
										
										
											2023-06-16 17:52:13 -07:00
										 |  |  |     "eml/fake-email.eml", | 
					
						
							| 
									
										
										
										
											2023-03-09 03:36:01 +09:00
										 |  |  | ] | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-03-30 16:54:29 -04:00
										 |  |  | is_in_docker = os.path.exists("/.dockerenv") | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-03-09 03:36:01 +09:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-02-20 17:35:16 -08:00
										 |  |  | def test_it_chunks_elements_when_a_chunking_strategy_is_specified(): | 
					
						
							|  |  |  |     chunks = partition_json( | 
					
						
							|  |  |  |         "example-docs/spring-weather.html.json", chunking_strategy="basic", max_characters=1500 | 
					
						
							|  |  |  |     ) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     assert len(chunks) == 10 | 
					
						
							|  |  |  |     assert all(isinstance(ch, CompositeElement) for ch in chunks) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-03-09 03:36:01 +09:00
										 |  |  | @pytest.mark.parametrize("filename", test_files) | 
					
						
							|  |  |  | def test_partition_json_from_filename(filename: str): | 
					
						
							|  |  |  |     path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename) | 
					
						
							| 
									
										
										
										
											2024-02-20 17:35:16 -08:00
										 |  |  |     elements = [] | 
					
						
							| 
									
										
										
										
											2023-08-29 16:59:26 -04:00
										 |  |  |     filetype = detect_filetype(filename=path) | 
					
						
							|  |  |  |     if filetype == FileType.TXT: | 
					
						
							|  |  |  |         elements = partition_text(filename=path) | 
					
						
							|  |  |  |     if filetype == FileType.HTML: | 
					
						
							|  |  |  |         elements = partition_html(filename=path) | 
					
						
							|  |  |  |     if filetype == FileType.XML: | 
					
						
							|  |  |  |         elements = partition_xml(filename=path) | 
					
						
							|  |  |  |     if filetype == FileType.EML: | 
					
						
							|  |  |  |         elements = partition_email(filename=path) | 
					
						
							| 
									
										
										
										
											2023-03-09 03:36:01 +09:00
										 |  |  | 
 | 
					
						
							|  |  |  |     with tempfile.TemporaryDirectory() as tmpdir: | 
					
						
							| 
									
										
										
										
											2023-06-16 17:52:13 -07:00
										 |  |  |         _filename = os.path.basename(filename) | 
					
						
							|  |  |  |         test_path = os.path.join(tmpdir, _filename + ".json") | 
					
						
							| 
									
										
										
										
											2023-03-09 03:36:01 +09:00
										 |  |  |         elements_to_json(elements, filename=test_path, indent=2) | 
					
						
							|  |  |  |         test_elements = partition_json(filename=test_path) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     assert len(elements) > 0 | 
					
						
							|  |  |  |     assert len(str(elements[0])) > 0 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     assert len(elements) == len(test_elements) | 
					
						
							|  |  |  |     for i in range(len(elements)): | 
					
						
							|  |  |  |         assert elements[i] == test_elements[i] | 
					
						
							| 
									
										
										
										
											2023-07-05 15:02:22 -05:00
										 |  |  |         assert elements[i].metadata.filename == filename.split("/")[-1] | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | @pytest.mark.parametrize("filename", test_files) | 
					
						
							|  |  |  | def test_partition_json_from_filename_with_metadata_filename(filename: str): | 
					
						
							|  |  |  |     path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename) | 
					
						
							| 
									
										
										
										
											2024-02-20 17:35:16 -08:00
										 |  |  |     elements = [] | 
					
						
							| 
									
										
										
										
											2023-08-29 16:59:26 -04:00
										 |  |  |     filetype = detect_filetype(filename=path) | 
					
						
							|  |  |  |     if filetype == FileType.TXT: | 
					
						
							|  |  |  |         elements = partition_text(filename=path) | 
					
						
							|  |  |  |     if filetype == FileType.HTML: | 
					
						
							|  |  |  |         elements = partition_html(filename=path) | 
					
						
							|  |  |  |     if filetype == FileType.XML: | 
					
						
							|  |  |  |         elements = partition_xml(filename=path) | 
					
						
							|  |  |  |     if filetype == FileType.EML: | 
					
						
							|  |  |  |         elements = partition_email(filename=path) | 
					
						
							| 
									
										
										
										
											2023-07-05 15:02:22 -05:00
										 |  |  | 
 | 
					
						
							|  |  |  |     with tempfile.TemporaryDirectory() as tmpdir: | 
					
						
							|  |  |  |         _filename = os.path.basename(filename) | 
					
						
							|  |  |  |         test_path = os.path.join(tmpdir, _filename + ".json") | 
					
						
							|  |  |  |         elements_to_json(elements, filename=test_path, indent=2) | 
					
						
							|  |  |  |         test_elements = partition_json(filename=test_path, metadata_filename="test") | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     assert len(test_elements) > 0 | 
					
						
							|  |  |  |     assert len(str(test_elements[0])) > 0 | 
					
						
							|  |  |  |     assert all(element.metadata.filename == "test" for element in test_elements) | 
					
						
							| 
									
										
										
										
											2023-03-09 03:36:01 +09:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | @pytest.mark.parametrize("filename", test_files) | 
					
						
							|  |  |  | def test_partition_json_from_file(filename: str): | 
					
						
							|  |  |  |     path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename) | 
					
						
							| 
									
										
										
										
											2024-02-20 17:35:16 -08:00
										 |  |  |     elements = [] | 
					
						
							| 
									
										
										
										
											2023-08-29 16:59:26 -04:00
										 |  |  |     filetype = detect_filetype(filename=path) | 
					
						
							|  |  |  |     if filetype == FileType.TXT: | 
					
						
							|  |  |  |         elements = partition_text(filename=path) | 
					
						
							|  |  |  |     if filetype == FileType.HTML: | 
					
						
							|  |  |  |         elements = partition_html(filename=path) | 
					
						
							|  |  |  |     if filetype == FileType.XML: | 
					
						
							|  |  |  |         elements = partition_xml(filename=path) | 
					
						
							|  |  |  |     if filetype == FileType.EML: | 
					
						
							|  |  |  |         elements = partition_email(filename=path) | 
					
						
							| 
									
										
										
										
											2023-03-09 03:36:01 +09:00
										 |  |  | 
 | 
					
						
							|  |  |  |     with tempfile.TemporaryDirectory() as tmpdir: | 
					
						
							| 
									
										
										
										
											2023-06-16 17:52:13 -07:00
										 |  |  |         _filename = os.path.basename(filename) | 
					
						
							|  |  |  |         test_path = os.path.join(tmpdir, _filename + ".json") | 
					
						
							| 
									
										
										
										
											2023-03-09 03:36:01 +09:00
										 |  |  |         elements_to_json(elements, filename=test_path, indent=2) | 
					
						
							| 
									
										
										
										
											2024-02-20 17:35:16 -08:00
										 |  |  |         with open(test_path, "rb") as f: | 
					
						
							| 
									
										
										
										
											2023-03-09 03:36:01 +09:00
										 |  |  |             test_elements = partition_json(file=f) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     assert len(elements) > 0 | 
					
						
							|  |  |  |     assert len(str(elements[0])) > 0 | 
					
						
							|  |  |  |     assert len(elements) == len(test_elements) | 
					
						
							|  |  |  |     for i in range(len(elements)): | 
					
						
							|  |  |  |         assert elements[i] == test_elements[i] | 
					
						
							| 
									
										
										
										
											2023-07-05 15:02:22 -05:00
										 |  |  |         assert elements[i].metadata.filename == filename.split("/")[-1] | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | @pytest.mark.parametrize("filename", test_files) | 
					
						
							|  |  |  | def test_partition_json_from_file_with_metadata_filename(filename: str): | 
					
						
							|  |  |  |     path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename) | 
					
						
							| 
									
										
										
										
											2024-02-20 17:35:16 -08:00
										 |  |  |     elements = [] | 
					
						
							| 
									
										
										
										
											2023-08-29 16:59:26 -04:00
										 |  |  |     filetype = detect_filetype(filename=path) | 
					
						
							|  |  |  |     if filetype == FileType.TXT: | 
					
						
							|  |  |  |         elements = partition_text(filename=path) | 
					
						
							|  |  |  |     if filetype == FileType.HTML: | 
					
						
							|  |  |  |         elements = partition_html(filename=path) | 
					
						
							|  |  |  |     if filetype == FileType.XML: | 
					
						
							|  |  |  |         elements = partition_xml(filename=path) | 
					
						
							|  |  |  |     if filetype == FileType.EML: | 
					
						
							|  |  |  |         elements = partition_email(filename=path) | 
					
						
							| 
									
										
										
										
											2023-07-05 15:02:22 -05:00
										 |  |  |     with tempfile.TemporaryDirectory() as tmpdir: | 
					
						
							|  |  |  |         _filename = os.path.basename(filename) | 
					
						
							|  |  |  |         test_path = os.path.join(tmpdir, _filename + ".json") | 
					
						
							|  |  |  |         elements_to_json(elements, filename=test_path, indent=2) | 
					
						
							| 
									
										
										
										
											2024-02-20 17:35:16 -08:00
										 |  |  |         with open(test_path, "rb") as f: | 
					
						
							| 
									
										
										
										
											2023-07-05 15:02:22 -05:00
										 |  |  |             test_elements = partition_json(file=f, metadata_filename="test") | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     for i in range(len(test_elements)): | 
					
						
							|  |  |  |         assert test_elements[i].metadata.filename == "test" | 
					
						
							| 
									
										
										
										
											2023-03-09 03:36:01 +09:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | @pytest.mark.parametrize("filename", test_files) | 
					
						
							|  |  |  | def test_partition_json_from_text(filename: str): | 
					
						
							|  |  |  |     path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename) | 
					
						
							| 
									
										
										
										
											2024-02-20 17:35:16 -08:00
										 |  |  |     elements = [] | 
					
						
							| 
									
										
										
										
											2023-08-29 16:59:26 -04:00
										 |  |  |     filetype = detect_filetype(filename=path) | 
					
						
							|  |  |  |     if filetype == FileType.TXT: | 
					
						
							|  |  |  |         elements = partition_text(filename=path) | 
					
						
							|  |  |  |     if filetype == FileType.HTML: | 
					
						
							|  |  |  |         elements = partition_html(filename=path) | 
					
						
							|  |  |  |     if filetype == FileType.XML: | 
					
						
							|  |  |  |         elements = partition_xml(filename=path) | 
					
						
							|  |  |  |     if filetype == FileType.EML: | 
					
						
							|  |  |  |         elements = partition_email(filename=path) | 
					
						
							| 
									
										
										
										
											2023-03-09 03:36:01 +09:00
										 |  |  | 
 | 
					
						
							|  |  |  |     with tempfile.TemporaryDirectory() as tmpdir: | 
					
						
							| 
									
										
										
										
											2023-06-16 17:52:13 -07:00
										 |  |  |         _filename = os.path.basename(filename) | 
					
						
							|  |  |  |         test_path = os.path.join(tmpdir, _filename + ".json") | 
					
						
							| 
									
										
										
										
											2023-03-09 03:36:01 +09:00
										 |  |  |         elements_to_json(elements, filename=test_path, indent=2) | 
					
						
							|  |  |  |         with open(test_path) as f: | 
					
						
							|  |  |  |             text = f.read() | 
					
						
							|  |  |  |         test_elements = partition_json(text=text) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     assert len(elements) > 0 | 
					
						
							|  |  |  |     assert len(str(elements[0])) > 0 | 
					
						
							|  |  |  |     assert len(elements) == len(test_elements) | 
					
						
							|  |  |  |     for i in range(len(elements)): | 
					
						
							|  |  |  |         assert elements[i] == test_elements[i] | 
					
						
							| 
									
										
										
										
											2023-07-05 15:02:22 -05:00
										 |  |  |         assert elements[i].metadata.filename == filename.split("/")[-1] | 
					
						
							| 
									
										
										
										
											2023-03-09 03:36:01 +09:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def test_partition_json_raises_with_none_specified(): | 
					
						
							|  |  |  |     with pytest.raises(ValueError): | 
					
						
							|  |  |  |         partition_json() | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-03-28 17:03:51 -04:00
										 |  |  | def test_partition_json_works_with_empty_string(): | 
					
						
							|  |  |  |     assert partition_json(text="") == [] | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def test_partition_json_works_with_empty_list(): | 
					
						
							|  |  |  |     assert partition_json(text="[]") == [] | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-03-09 03:36:01 +09:00
										 |  |  | def test_partition_json_raises_with_too_many_specified(): | 
					
						
							|  |  |  |     path = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-text.txt") | 
					
						
							| 
									
										
										
										
											2024-02-20 17:35:16 -08:00
										 |  |  |     elements = [] | 
					
						
							| 
									
										
										
										
											2023-08-29 16:59:26 -04:00
										 |  |  |     filetype = detect_filetype(filename=path) | 
					
						
							|  |  |  |     if filetype == FileType.TXT: | 
					
						
							|  |  |  |         elements = partition_text(filename=path) | 
					
						
							|  |  |  |     if filetype == FileType.HTML: | 
					
						
							|  |  |  |         elements = partition_html(filename=path) | 
					
						
							|  |  |  |     if filetype == FileType.XML: | 
					
						
							|  |  |  |         elements = partition_xml(filename=path) | 
					
						
							|  |  |  |     if filetype == FileType.EML: | 
					
						
							|  |  |  |         elements = partition_email(filename=path) | 
					
						
							| 
									
										
										
										
											2023-03-09 03:36:01 +09:00
										 |  |  | 
 | 
					
						
							|  |  |  |     with tempfile.TemporaryDirectory() as tmpdir: | 
					
						
							|  |  |  |         test_path = os.path.join(tmpdir, "fake-text.txt.json") | 
					
						
							|  |  |  |         elements_to_json(elements, filename=test_path, indent=2) | 
					
						
							| 
									
										
										
										
											2024-02-20 17:35:16 -08:00
										 |  |  |         with open(test_path, "rb") as f: | 
					
						
							|  |  |  |             text = f.read().decode("utf-8") | 
					
						
							| 
									
										
										
										
											2023-03-09 03:36:01 +09:00
										 |  |  | 
 | 
					
						
							|  |  |  |     with pytest.raises(ValueError): | 
					
						
							|  |  |  |         partition_json(filename=test_path, file=f) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     with pytest.raises(ValueError): | 
					
						
							|  |  |  |         partition_json(filename=test_path, text=text) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     with pytest.raises(ValueError): | 
					
						
							|  |  |  |         partition_json(file=f, text=text) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     with pytest.raises(ValueError): | 
					
						
							|  |  |  |         partition_json(filename=test_path, file=f, text=text) | 
					
						
							| 
									
										
										
										
											2023-06-30 09:44:46 -05:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | @pytest.mark.parametrize("filename", test_files) | 
					
						
							|  |  |  | def test_partition_json_from_filename_exclude_metadata(filename: str): | 
					
						
							|  |  |  |     path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename) | 
					
						
							| 
									
										
										
										
											2024-02-20 17:35:16 -08:00
										 |  |  |     elements = [] | 
					
						
							| 
									
										
										
										
											2023-08-29 16:59:26 -04:00
										 |  |  |     filetype = detect_filetype(filename=path) | 
					
						
							|  |  |  |     if filetype == FileType.TXT: | 
					
						
							|  |  |  |         elements = partition_text(filename=path) | 
					
						
							|  |  |  |     if filetype == FileType.HTML: | 
					
						
							|  |  |  |         elements = partition_html(filename=path) | 
					
						
							|  |  |  |     if filetype == FileType.XML: | 
					
						
							|  |  |  |         elements = partition_xml(filename=path) | 
					
						
							|  |  |  |     if filetype == FileType.EML: | 
					
						
							|  |  |  |         elements = partition_email(filename=path) | 
					
						
							| 
									
										
										
										
											2023-06-30 09:44:46 -05:00
										 |  |  | 
 | 
					
						
							|  |  |  |     with tempfile.TemporaryDirectory() as tmpdir: | 
					
						
							|  |  |  |         _filename = os.path.basename(filename) | 
					
						
							|  |  |  |         test_path = os.path.join(tmpdir, _filename + ".json") | 
					
						
							|  |  |  |         elements_to_json(elements, filename=test_path, indent=2) | 
					
						
							|  |  |  |         test_elements = partition_json(filename=test_path, include_metadata=False) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     for i in range(len(test_elements)): | 
					
						
							|  |  |  |         assert any(test_elements[i].metadata.to_dict()) is False | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | @pytest.mark.parametrize("filename", test_files) | 
					
						
							|  |  |  | def test_partition_json_from_file_exclude_metadata(filename: str): | 
					
						
							|  |  |  |     path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename) | 
					
						
							| 
									
										
										
										
											2024-02-20 17:35:16 -08:00
										 |  |  |     elements = [] | 
					
						
							| 
									
										
										
										
											2023-08-29 16:59:26 -04:00
										 |  |  |     filetype = detect_filetype(filename=path) | 
					
						
							|  |  |  |     if filetype == FileType.TXT: | 
					
						
							|  |  |  |         elements = partition_text(filename=path) | 
					
						
							|  |  |  |     if filetype == FileType.HTML: | 
					
						
							|  |  |  |         elements = partition_html(filename=path) | 
					
						
							|  |  |  |     if filetype == FileType.XML: | 
					
						
							|  |  |  |         elements = partition_xml(filename=path) | 
					
						
							|  |  |  |     if filetype == FileType.EML: | 
					
						
							|  |  |  |         elements = partition_email(filename=path) | 
					
						
							| 
									
										
										
										
											2023-06-30 09:44:46 -05:00
										 |  |  | 
 | 
					
						
							|  |  |  |     with tempfile.TemporaryDirectory() as tmpdir: | 
					
						
							|  |  |  |         _filename = os.path.basename(filename) | 
					
						
							|  |  |  |         test_path = os.path.join(tmpdir, _filename + ".json") | 
					
						
							|  |  |  |         elements_to_json(elements, filename=test_path, indent=2) | 
					
						
							| 
									
										
										
										
											2024-02-20 17:35:16 -08:00
										 |  |  |         with open(test_path, "rb") as f: | 
					
						
							| 
									
										
										
										
											2023-06-30 09:44:46 -05:00
										 |  |  |             test_elements = partition_json(file=f, include_metadata=False) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     for i in range(len(test_elements)): | 
					
						
							|  |  |  |         assert any(test_elements[i].metadata.to_dict()) is False | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | @pytest.mark.parametrize("filename", test_files) | 
					
						
							|  |  |  | def test_partition_json_from_text_exclude_metadata(filename: str): | 
					
						
							|  |  |  |     path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename) | 
					
						
							| 
									
										
										
										
											2024-02-20 17:35:16 -08:00
										 |  |  |     elements = [] | 
					
						
							| 
									
										
										
										
											2023-08-29 16:59:26 -04:00
										 |  |  |     filetype = detect_filetype(filename=path) | 
					
						
							|  |  |  |     if filetype == FileType.TXT: | 
					
						
							|  |  |  |         elements = partition_text(filename=path) | 
					
						
							|  |  |  |     if filetype == FileType.HTML: | 
					
						
							|  |  |  |         elements = partition_html(filename=path) | 
					
						
							|  |  |  |     if filetype == FileType.XML: | 
					
						
							|  |  |  |         elements = partition_xml(filename=path) | 
					
						
							|  |  |  |     if filetype == FileType.EML: | 
					
						
							|  |  |  |         elements = partition_email(filename=path) | 
					
						
							| 
									
										
										
										
											2024-02-20 17:35:16 -08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-06-30 09:44:46 -05:00
										 |  |  |     with tempfile.TemporaryDirectory() as tmpdir: | 
					
						
							|  |  |  |         _filename = os.path.basename(filename) | 
					
						
							|  |  |  |         test_path = os.path.join(tmpdir, _filename + ".json") | 
					
						
							|  |  |  |         elements_to_json(elements, filename=test_path, indent=2) | 
					
						
							|  |  |  |         with open(test_path) as f: | 
					
						
							|  |  |  |             text = f.read() | 
					
						
							|  |  |  |         test_elements = partition_json(text=text, include_metadata=False) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     for i in range(len(test_elements)): | 
					
						
							|  |  |  |         assert any(test_elements[i].metadata.to_dict()) is False | 
					
						
							| 
									
										
										
										
											2023-07-25 15:59:45 -04:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-02-20 17:35:16 -08:00
										 |  |  | def test_partition_json_metadata_date(mocker: MockFixture): | 
					
						
							| 
									
										
										
										
											2023-07-26 15:10:14 -04:00
										 |  |  |     mocked_last_modification_date = "2029-07-05T09:24:28" | 
					
						
							|  |  |  |     mocker.patch( | 
					
						
							|  |  |  |         "unstructured.partition.json.get_last_modified_date", | 
					
						
							|  |  |  |         return_value=mocked_last_modification_date, | 
					
						
							|  |  |  |     ) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-02-20 17:35:16 -08:00
										 |  |  |     elements = partition_json("example-docs/spring-weather.html.json") | 
					
						
							| 
									
										
										
										
											2023-07-26 15:10:14 -04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-07-31 19:55:43 -07:00
										 |  |  |     assert elements[0].metadata.last_modified == mocked_last_modification_date | 
					
						
							| 
									
										
										
										
											2023-07-26 15:10:14 -04:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-02-20 17:35:16 -08:00
										 |  |  | def test_partition_json_with_custom_metadata_date(mocker: MockFixture): | 
					
						
							| 
									
										
										
										
											2023-07-26 15:10:14 -04:00
										 |  |  |     mocked_last_modification_date = "2029-07-05T09:24:28" | 
					
						
							|  |  |  |     expected_last_modification_date = "2020-07-05T09:24:28" | 
					
						
							|  |  |  |     mocker.patch( | 
					
						
							|  |  |  |         "unstructured.partition.json.get_last_modified_date", | 
					
						
							|  |  |  |         return_value=mocked_last_modification_date, | 
					
						
							|  |  |  |     ) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     elements = partition_json( | 
					
						
							| 
									
										
										
										
											2024-02-20 17:35:16 -08:00
										 |  |  |         "example-docs/spring-weather.html.json", | 
					
						
							| 
									
										
										
										
											2023-07-31 19:55:43 -07:00
										 |  |  |         metadata_last_modified=expected_last_modification_date, | 
					
						
							| 
									
										
										
										
											2023-07-26 15:10:14 -04:00
										 |  |  |     ) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-07-31 19:55:43 -07:00
										 |  |  |     assert elements[0].metadata.last_modified == expected_last_modification_date | 
					
						
							| 
									
										
										
										
											2023-07-26 15:10:14 -04:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-02-20 17:35:16 -08:00
										 |  |  | def test_partition_json_from_file_metadata_date(mocker: MockFixture): | 
					
						
							| 
									
										
										
										
											2023-07-26 15:10:14 -04:00
										 |  |  |     mocked_last_modification_date = "2029-07-05T09:24:28" | 
					
						
							|  |  |  |     mocker.patch( | 
					
						
							|  |  |  |         "unstructured.partition.json.get_last_modified_date_from_file", | 
					
						
							|  |  |  |         return_value=mocked_last_modification_date, | 
					
						
							|  |  |  |     ) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-02-20 17:35:16 -08:00
										 |  |  |     with open("example-docs/spring-weather.html.json", "rb") as f: | 
					
						
							|  |  |  |         elements = partition_json(file=f) | 
					
						
							| 
									
										
										
										
											2023-07-26 15:10:14 -04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-03-18 02:09:44 +01:00
										 |  |  |     assert elements[0].metadata.last_modified is None | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def test_partition_json_from_file_explicit_get_metadata_date( | 
					
						
							|  |  |  |     mocker, | 
					
						
							|  |  |  |     filename="example-docs/spring-weather.html.json", | 
					
						
							|  |  |  | ): | 
					
						
							|  |  |  |     mocked_last_modification_date = "2029-07-05T09:24:28" | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     mocker.patch( | 
					
						
							|  |  |  |         "unstructured.partition.json.get_last_modified_date_from_file", | 
					
						
							|  |  |  |         return_value=mocked_last_modification_date, | 
					
						
							|  |  |  |     ) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     with open(filename, "rb") as f: | 
					
						
							|  |  |  |         elements = partition_json( | 
					
						
							|  |  |  |             file=f, | 
					
						
							|  |  |  |             date_from_file_object=True, | 
					
						
							|  |  |  |         ) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-07-31 19:55:43 -07:00
										 |  |  |     assert elements[0].metadata.last_modified == mocked_last_modification_date | 
					
						
							| 
									
										
										
										
											2023-07-26 15:10:14 -04:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-02-20 17:35:16 -08:00
										 |  |  | def test_partition_json_from_file_with_custom_metadata_date(mocker: MockFixture): | 
					
						
							| 
									
										
										
										
											2023-07-26 15:10:14 -04:00
										 |  |  |     mocked_last_modification_date = "2029-07-05T09:24:28" | 
					
						
							|  |  |  |     expected_last_modification_date = "2020-07-05T09:24:28" | 
					
						
							|  |  |  |     mocker.patch( | 
					
						
							|  |  |  |         "unstructured.partition.json.get_last_modified_date_from_file", | 
					
						
							|  |  |  |         return_value=mocked_last_modification_date, | 
					
						
							|  |  |  |     ) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-02-20 17:35:16 -08:00
										 |  |  |     with open("example-docs/spring-weather.html.json", "rb") as f: | 
					
						
							| 
									
										
										
										
											2023-07-31 19:55:43 -07:00
										 |  |  |         elements = partition_json(file=f, metadata_last_modified=expected_last_modification_date) | 
					
						
							| 
									
										
										
										
											2023-07-26 15:10:14 -04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-07-31 19:55:43 -07:00
										 |  |  |     assert elements[0].metadata.last_modified == expected_last_modification_date | 
					
						
							| 
									
										
										
										
											2023-07-26 15:10:14 -04:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-02-20 17:35:16 -08:00
										 |  |  | def test_partition_json_from_text_metadata_date(): | 
					
						
							|  |  |  |     with open("example-docs/spring-weather.html.json") as f: | 
					
						
							| 
									
										
										
										
											2023-07-26 15:10:14 -04:00
										 |  |  |         text = f.read() | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-02-20 17:35:16 -08:00
										 |  |  |     elements = partition_json(text=text) | 
					
						
							| 
									
										
										
										
											2023-07-26 15:10:14 -04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-07-31 19:55:43 -07:00
										 |  |  |     assert elements[0].metadata.last_modified is None | 
					
						
							| 
									
										
										
										
											2023-07-26 15:10:14 -04:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-02-20 17:35:16 -08:00
										 |  |  | def test_partition_json_from_text_with_custom_metadata_date(): | 
					
						
							| 
									
										
										
										
											2023-07-26 15:10:14 -04:00
										 |  |  |     expected_last_modification_date = "2020-07-05T09:24:28" | 
					
						
							| 
									
										
										
										
											2024-02-20 17:35:16 -08:00
										 |  |  |     with open("example-docs/spring-weather.html.json") as f: | 
					
						
							| 
									
										
										
										
											2023-07-26 15:10:14 -04:00
										 |  |  |         text = f.read() | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-07-31 19:55:43 -07:00
										 |  |  |     elements = partition_json(text=text, metadata_last_modified=expected_last_modification_date) | 
					
						
							| 
									
										
										
										
											2023-07-26 15:10:14 -04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-07-31 19:55:43 -07:00
										 |  |  |     assert elements[0].metadata.last_modified == expected_last_modification_date | 
					
						
							| 
									
										
										
										
											2023-07-26 15:10:14 -04:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-03-18 02:09:44 +01:00
										 |  |  | def test_partition_json_from_file_without_metadata_date( | 
					
						
							|  |  |  |     filename="example-docs/spring-weather.html.json", | 
					
						
							|  |  |  | ): | 
					
						
							|  |  |  |     """Test partition_json() with file that are not possible to get last modified date""" | 
					
						
							|  |  |  |     with open(filename, "rb") as f: | 
					
						
							|  |  |  |         sf = tempfile.SpooledTemporaryFile() | 
					
						
							|  |  |  |         sf.write(f.read()) | 
					
						
							|  |  |  |         sf.seek(0) | 
					
						
							|  |  |  |         elements = partition_json(file=sf, date_from_file_object=True) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     assert elements[0].metadata.last_modified is None | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-07-25 15:59:45 -04:00
										 |  |  | def test_partition_json_raises_with_unprocessable_json(): | 
					
						
							|  |  |  |     # NOTE(robinson) - This is unprocessable because it is not a list of dicts, | 
					
						
							|  |  |  |     # per the Unstructured ISD format | 
					
						
							|  |  |  |     text = '{"hi": "there"}' | 
					
						
							|  |  |  |     with pytest.raises(ValueError): | 
					
						
							|  |  |  |         partition_json(text=text) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def test_partition_json_raises_with_invalid_json(): | 
					
						
							|  |  |  |     text = '[{"hi": "there"}]]' | 
					
						
							|  |  |  |     with pytest.raises(ValueError): | 
					
						
							|  |  |  |         partition_json(text=text) |