| 
									
										
										
										
											2023-05-03 15:06:06 -04:00
										 |  |  | import contextlib | 
					
						
							|  |  |  | import json | 
					
						
							| 
									
										
										
										
											2023-04-26 09:05:35 -04:00
										 |  |  | import os | 
					
						
							|  |  |  | import pathlib | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | import pytest | 
					
						
							|  |  |  | import requests | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | from unstructured.documents.elements import NarrativeText | 
					
						
							| 
									
										
										
										
											2023-05-03 15:06:06 -04:00
										 |  |  | from unstructured.partition.api import partition_multiple_via_api, partition_via_api | 
					
						
							| 
									
										
										
										
											2023-04-26 09:05:35 -04:00
										 |  |  | 
 | 
					
						
							|  |  |  | DIRECTORY = pathlib.Path(__file__).parent.resolve() | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-06-16 17:52:13 -07:00
										 |  |  | EML_TEST_FILE = "eml/fake-email.eml" | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-06-29 10:31:01 -07:00
										 |  |  | skip_outside_ci = os.getenv("CI", "").lower() in {"", "false", "f", "0"} | 
					
						
							| 
									
										
										
										
											2023-06-30 09:44:46 -05:00
										 |  |  | skip_not_on_main = os.getenv("GITHUB_REF_NAME", "").lower() != "main" | 
					
						
							| 
									
										
										
										
											2023-06-29 10:31:01 -07:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-04-26 09:05:35 -04:00
										 |  |  | 
 | 
					
						
							|  |  |  | class MockResponse: | 
					
						
							|  |  |  |     def __init__(self, status_code): | 
					
						
							|  |  |  |         self.status_code = status_code | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     @property | 
					
						
							|  |  |  |     def text(self): | 
					
						
							|  |  |  |         return """[
 | 
					
						
							|  |  |  |     { | 
					
						
							|  |  |  |         "element_id": "f49fbd614ddf5b72e06f59e554e6ae2b", | 
					
						
							|  |  |  |         "text": "This is a test email to use for unit tests.", | 
					
						
							|  |  |  |         "type": "NarrativeText", | 
					
						
							|  |  |  |         "metadata": { | 
					
						
							|  |  |  |             "sent_from": [ | 
					
						
							|  |  |  |                 "Matthew Robinson <mrobinson@unstructured.io>" | 
					
						
							|  |  |  |             ], | 
					
						
							|  |  |  |             "sent_to": [ | 
					
						
							|  |  |  |                 "Matthew Robinson <mrobinson@unstructured.io>" | 
					
						
							|  |  |  |             ], | 
					
						
							|  |  |  |             "subject": "Test Email", | 
					
						
							| 
									
										
										
										
											2023-06-08 09:24:16 -04:00
										 |  |  |             "filename": "fake-email.eml", | 
					
						
							|  |  |  |             "filetype": "message/rfc822" | 
					
						
							| 
									
										
										
										
											2023-04-26 09:05:35 -04:00
										 |  |  |         } | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | ]"""
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-06-08 09:24:16 -04:00
										 |  |  |     def json(self): | 
					
						
							|  |  |  |         return json.loads(self.text) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-04-26 09:05:35 -04:00
										 |  |  | 
 | 
					
						
							|  |  |  | def test_partition_via_api_from_filename(monkeypatch): | 
					
						
							|  |  |  |     monkeypatch.setattr( | 
					
						
							|  |  |  |         requests, | 
					
						
							|  |  |  |         "post", | 
					
						
							|  |  |  |         lambda *args, **kwargs: MockResponse(status_code=200), | 
					
						
							|  |  |  |     ) | 
					
						
							| 
									
										
										
										
											2023-06-16 17:52:13 -07:00
										 |  |  |     filename = os.path.join(DIRECTORY, "..", "..", "example-docs", EML_TEST_FILE) | 
					
						
							| 
									
										
										
										
											2023-06-15 15:18:22 -04:00
										 |  |  |     elements = partition_via_api(filename=filename) | 
					
						
							| 
									
										
										
										
											2023-04-26 09:05:35 -04:00
										 |  |  |     assert elements[0] == NarrativeText("This is a test email to use for unit tests.") | 
					
						
							| 
									
										
										
										
											2023-06-08 09:24:16 -04:00
										 |  |  |     assert elements[0].metadata.filetype == "message/rfc822" | 
					
						
							| 
									
										
										
										
											2023-04-26 09:05:35 -04:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def test_partition_via_api_from_file(monkeypatch): | 
					
						
							|  |  |  |     monkeypatch.setattr( | 
					
						
							|  |  |  |         requests, | 
					
						
							|  |  |  |         "post", | 
					
						
							|  |  |  |         lambda *args, **kwargs: MockResponse(status_code=200), | 
					
						
							|  |  |  |     ) | 
					
						
							| 
									
										
										
										
											2023-06-16 17:52:13 -07:00
										 |  |  |     filename = os.path.join(DIRECTORY, "..", "..", "example-docs", EML_TEST_FILE) | 
					
						
							| 
									
										
										
										
											2023-04-26 09:05:35 -04:00
										 |  |  | 
 | 
					
						
							|  |  |  |     with open(filename, "rb") as f: | 
					
						
							| 
									
										
										
										
											2023-08-24 03:02:47 -04:00
										 |  |  |         elements = partition_via_api(file=f, metadata_filename=filename) | 
					
						
							| 
									
										
										
										
											2023-04-26 09:05:35 -04:00
										 |  |  |     assert elements[0] == NarrativeText("This is a test email to use for unit tests.") | 
					
						
							| 
									
										
										
										
											2023-06-08 09:24:16 -04:00
										 |  |  |     assert elements[0].metadata.filetype == "message/rfc822" | 
					
						
							| 
									
										
										
										
											2023-04-26 09:05:35 -04:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-08-24 03:02:47 -04:00
										 |  |  | def test_partition_via_api_from_file_warns_with_file_filename(monkeypatch, caplog): | 
					
						
							|  |  |  |     monkeypatch.setattr( | 
					
						
							|  |  |  |         requests, | 
					
						
							|  |  |  |         "post", | 
					
						
							|  |  |  |         lambda *args, **kwargs: MockResponse(status_code=200), | 
					
						
							|  |  |  |     ) | 
					
						
							|  |  |  |     filename = os.path.join(DIRECTORY, "..", "..", "example-docs", EML_TEST_FILE) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     with open(filename, "rb") as f: | 
					
						
							|  |  |  |         partition_via_api(file=f, file_filename=filename) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     assert "WARNING" in caplog.text | 
					
						
							|  |  |  |     assert "The file_filename kwarg will be deprecated" in caplog.text | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def test_partition_via_api_from_file_raises_with_metadata_and_file_filename(monkeypatch): | 
					
						
							|  |  |  |     monkeypatch.setattr( | 
					
						
							|  |  |  |         requests, | 
					
						
							|  |  |  |         "post", | 
					
						
							|  |  |  |         lambda *args, **kwargs: MockResponse(status_code=200), | 
					
						
							|  |  |  |     ) | 
					
						
							|  |  |  |     filename = os.path.join(DIRECTORY, "..", "..", "example-docs", EML_TEST_FILE) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     with open(filename, "rb") as f, pytest.raises(ValueError): | 
					
						
							|  |  |  |         partition_via_api(file=f, file_filename=filename, metadata_filename=filename) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-05-31 14:09:58 -04:00
										 |  |  | def test_partition_via_api_from_file_raises_without_filename(monkeypatch): | 
					
						
							|  |  |  |     monkeypatch.setattr( | 
					
						
							|  |  |  |         requests, | 
					
						
							|  |  |  |         "post", | 
					
						
							|  |  |  |         lambda *args, **kwargs: MockResponse(status_code=200), | 
					
						
							|  |  |  |     ) | 
					
						
							| 
									
										
										
										
											2023-06-16 17:52:13 -07:00
										 |  |  |     filename = os.path.join(DIRECTORY, "..", "..", "example-docs", EML_TEST_FILE) | 
					
						
							| 
									
										
										
										
											2023-05-31 14:09:58 -04:00
										 |  |  | 
 | 
					
						
							|  |  |  |     with open(filename, "rb") as f, pytest.raises(ValueError): | 
					
						
							| 
									
										
										
										
											2023-06-15 15:18:22 -04:00
										 |  |  |         partition_via_api(file=f) | 
					
						
							| 
									
										
										
										
											2023-05-31 14:09:58 -04:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-04-26 09:05:35 -04:00
										 |  |  | def test_partition_via_api_raises_with_bad_response(monkeypatch): | 
					
						
							|  |  |  |     monkeypatch.setattr( | 
					
						
							|  |  |  |         requests, | 
					
						
							|  |  |  |         "post", | 
					
						
							|  |  |  |         lambda *args, **kwargs: MockResponse(status_code=500), | 
					
						
							|  |  |  |     ) | 
					
						
							| 
									
										
										
										
											2023-06-16 17:52:13 -07:00
										 |  |  |     filename = os.path.join(DIRECTORY, "..", "..", "example-docs", EML_TEST_FILE) | 
					
						
							| 
									
										
										
										
											2023-04-26 09:05:35 -04:00
										 |  |  | 
 | 
					
						
							|  |  |  |     with pytest.raises(ValueError): | 
					
						
							| 
									
										
										
										
											2023-06-15 15:18:22 -04:00
										 |  |  |         partition_via_api(filename=filename) | 
					
						
							| 
									
										
										
										
											2023-05-03 15:06:06 -04:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-10-24 17:17:54 -05:00
										 |  |  | @pytest.mark.skipif(skip_outside_ci, reason="Skipping test run outside of CI") | 
					
						
							|  |  |  | @pytest.mark.skipif(skip_not_on_main, reason="Skipping test run outside of main branch") | 
					
						
							| 
									
										
										
										
											2023-07-26 09:56:39 -07:00
										 |  |  | def test_partition_via_api_with_no_strategy(): | 
					
						
							| 
									
										
										
										
											2023-10-24 17:17:54 -05:00
										 |  |  |     filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "layout-parser-paper-fast.pdf") | 
					
						
							| 
									
										
										
										
											2023-07-26 09:56:39 -07:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-08-18 10:06:17 -07:00
										 |  |  |     elements_no_strategy = partition_via_api( | 
					
						
							|  |  |  |         filename=filename, | 
					
						
							|  |  |  |         strategy="auto", | 
					
						
							|  |  |  |         api_key=get_api_key(), | 
					
						
							|  |  |  |     ) | 
					
						
							| 
									
										
										
										
											2023-07-26 09:56:39 -07:00
										 |  |  |     elements_hi_res = partition_via_api(filename=filename, strategy="hi_res", api_key=get_api_key()) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-08-18 10:06:17 -07:00
										 |  |  |     # confirm that hi_res strategy was not passed as default to partition by comparing outputs | 
					
						
							| 
									
										
										
										
											2023-10-24 17:17:54 -05:00
										 |  |  |     # elements_hi_res[3].text = | 
					
						
							|  |  |  |     #     'LayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis' | 
					
						
							|  |  |  |     # while elements_no_strategy[3].text = ']' (as of this writing) | 
					
						
							|  |  |  |     assert elements_no_strategy[3].text != elements_hi_res[3].text | 
					
						
							| 
									
										
										
										
											2023-07-26 09:56:39 -07:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-07-31 19:55:43 -07:00
										 |  |  | @pytest.mark.skipif(skip_outside_ci, reason="Skipping test run outside of CI") | 
					
						
							|  |  |  | @pytest.mark.skipif(skip_not_on_main, reason="Skipping test run outside of main branch") | 
					
						
							| 
									
										
										
										
											2023-07-26 09:56:39 -07:00
										 |  |  | def test_partition_via_api_with_image_hi_res_strategy_includes_coordinates(): | 
					
						
							|  |  |  |     filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "layout-parser-paper-fast.jpg") | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # coordinates not included by default to limit payload size | 
					
						
							|  |  |  |     elements = partition_via_api( | 
					
						
							|  |  |  |         filename=filename, | 
					
						
							|  |  |  |         strategy="hi_res", | 
					
						
							|  |  |  |         coordinates="true", | 
					
						
							|  |  |  |         api_key=get_api_key(), | 
					
						
							|  |  |  |     ) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     assert elements[0].metadata.coordinates is not None | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-06-29 17:47:51 -07:00
										 |  |  | @pytest.mark.skipif(skip_outside_ci, reason="Skipping test run outside of CI") | 
					
						
							| 
									
										
										
										
											2023-06-30 09:44:46 -05:00
										 |  |  | @pytest.mark.skipif(skip_not_on_main, reason="Skipping test run outside of main branch") | 
					
						
							| 
									
										
										
										
											2023-06-12 12:39:58 -04:00
										 |  |  | def test_partition_via_api_valid_request_data_kwargs(): | 
					
						
							|  |  |  |     filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "layout-parser-paper-fast.pdf") | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-06-29 10:31:01 -07:00
										 |  |  |     elements = partition_via_api(filename=filename, strategy="fast", api_key=get_api_key()) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-06-12 12:39:58 -04:00
										 |  |  |     assert isinstance(elements, list) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def test_partition_via_api_invalid_request_data_kwargs(): | 
					
						
							|  |  |  |     filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "layout-parser-paper-fast.pdf") | 
					
						
							|  |  |  |     with pytest.raises(ValueError): | 
					
						
							| 
									
										
										
										
											2023-06-29 10:31:01 -07:00
										 |  |  |         partition_via_api(filename=filename, strategy="not_a_strategy", api_key=get_api_key()) | 
					
						
							| 
									
										
										
										
											2023-06-12 12:39:58 -04:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-05-03 15:06:06 -04:00
										 |  |  | class MockMultipleResponse: | 
					
						
							|  |  |  |     def __init__(self, status_code): | 
					
						
							|  |  |  |         self.status_code = status_code | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     def json(self): | 
					
						
							|  |  |  |         return json.loads(self.text) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     @property | 
					
						
							|  |  |  |     def text(self): | 
					
						
							|  |  |  |         return """[
 | 
					
						
							|  |  |  |     [ | 
					
						
							|  |  |  |         { | 
					
						
							|  |  |  |             "element_id": "f49fbd614ddf5b72e06f59e554e6ae2b", | 
					
						
							|  |  |  |             "text": "This is a test email to use for unit tests.", | 
					
						
							|  |  |  |             "type": "NarrativeText", | 
					
						
							|  |  |  |             "metadata": { | 
					
						
							|  |  |  |                 "sent_from": [ | 
					
						
							|  |  |  |                     "Matthew Robinson <mrobinson@unstructured.io>" | 
					
						
							|  |  |  |                 ], | 
					
						
							|  |  |  |                 "sent_to": [ | 
					
						
							|  |  |  |                     "Matthew Robinson <mrobinson@unstructured.io>" | 
					
						
							|  |  |  |                 ], | 
					
						
							|  |  |  |                 "subject": "Test Email", | 
					
						
							| 
									
										
										
										
											2023-06-08 09:24:16 -04:00
										 |  |  |                 "filename": "fake-email.eml", | 
					
						
							|  |  |  |                 "filetype": "message/rfc822" | 
					
						
							| 
									
										
										
										
											2023-05-03 15:06:06 -04:00
										 |  |  |             } | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  |     ], | 
					
						
							|  |  |  |     [ | 
					
						
							|  |  |  |         { | 
					
						
							|  |  |  |             "element_id": "f49fbd614ddf5b72e06f59e554e6ae2b", | 
					
						
							|  |  |  |             "text": "This is a test email to use for unit tests.", | 
					
						
							|  |  |  |             "type": "NarrativeText", | 
					
						
							|  |  |  |             "metadata": { | 
					
						
							|  |  |  |                 "sent_from": [ | 
					
						
							|  |  |  |                     "Matthew Robinson <mrobinson@unstructured.io>" | 
					
						
							|  |  |  |                 ], | 
					
						
							|  |  |  |                 "sent_to": [ | 
					
						
							|  |  |  |                     "Matthew Robinson <mrobinson@unstructured.io>" | 
					
						
							|  |  |  |                 ], | 
					
						
							|  |  |  |                 "subject": "Test Email", | 
					
						
							| 
									
										
										
										
											2023-06-08 09:24:16 -04:00
										 |  |  |                 "filename": "fake-email.eml", | 
					
						
							|  |  |  |                 "filetype": "message/rfc822" | 
					
						
							| 
									
										
										
										
											2023-05-03 15:06:06 -04:00
										 |  |  |             } | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  |     ] | 
					
						
							|  |  |  | ]"""
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-06-08 09:24:16 -04:00
										 |  |  | def test_partition_multiple_via_api_with_single_filename(monkeypatch): | 
					
						
							|  |  |  |     monkeypatch.setattr( | 
					
						
							|  |  |  |         requests, | 
					
						
							|  |  |  |         "post", | 
					
						
							|  |  |  |         lambda *args, **kwargs: MockResponse(status_code=200), | 
					
						
							|  |  |  |     ) | 
					
						
							| 
									
										
										
										
											2023-06-16 17:52:13 -07:00
										 |  |  |     filename = os.path.join(DIRECTORY, "..", "..", "example-docs", EML_TEST_FILE) | 
					
						
							| 
									
										
										
										
											2023-06-08 09:24:16 -04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-06-15 15:18:22 -04:00
										 |  |  |     elements = partition_multiple_via_api(filenames=[filename]) | 
					
						
							| 
									
										
										
										
											2023-06-08 09:24:16 -04:00
										 |  |  |     assert elements[0][0] == NarrativeText("This is a test email to use for unit tests.") | 
					
						
							|  |  |  |     assert elements[0][0].metadata.filetype == "message/rfc822" | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-05-03 15:06:06 -04:00
										 |  |  | def test_partition_multiple_via_api_from_filenames(monkeypatch): | 
					
						
							|  |  |  |     monkeypatch.setattr( | 
					
						
							|  |  |  |         requests, | 
					
						
							|  |  |  |         "post", | 
					
						
							|  |  |  |         lambda *args, **kwargs: MockMultipleResponse(status_code=200), | 
					
						
							|  |  |  |     ) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     filenames = [ | 
					
						
							| 
									
										
										
										
											2023-06-16 17:52:13 -07:00
										 |  |  |         os.path.join(DIRECTORY, "..", "..", "example-docs", "eml/fake-email.eml"), | 
					
						
							| 
									
										
										
										
											2023-05-03 15:06:06 -04:00
										 |  |  |         os.path.join(DIRECTORY, "..", "..", "example-docs", "fake.docx"), | 
					
						
							|  |  |  |     ] | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-06-15 15:18:22 -04:00
										 |  |  |     elements = partition_multiple_via_api(filenames=filenames) | 
					
						
							| 
									
										
										
										
											2023-05-03 15:06:06 -04:00
										 |  |  |     assert len(elements) == 2 | 
					
						
							|  |  |  |     assert elements[0][0] == NarrativeText("This is a test email to use for unit tests.") | 
					
						
							| 
									
										
										
										
											2023-06-08 09:24:16 -04:00
										 |  |  |     assert elements[0][0].metadata.filetype == "message/rfc822" | 
					
						
							| 
									
										
										
										
											2023-05-03 15:06:06 -04:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def test_partition_multiple_via_api_from_files(monkeypatch): | 
					
						
							|  |  |  |     monkeypatch.setattr( | 
					
						
							|  |  |  |         requests, | 
					
						
							|  |  |  |         "post", | 
					
						
							|  |  |  |         lambda *args, **kwargs: MockMultipleResponse(status_code=200), | 
					
						
							|  |  |  |     ) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     filenames = [ | 
					
						
							| 
									
										
										
										
											2023-06-16 17:52:13 -07:00
										 |  |  |         os.path.join(DIRECTORY, "..", "..", "example-docs", EML_TEST_FILE), | 
					
						
							| 
									
										
										
										
											2023-05-03 15:06:06 -04:00
										 |  |  |         os.path.join(DIRECTORY, "..", "..", "example-docs", "fake.docx"), | 
					
						
							|  |  |  |     ] | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     with contextlib.ExitStack() as stack: | 
					
						
							|  |  |  |         files = [stack.enter_context(open(filename, "rb")) for filename in filenames] | 
					
						
							|  |  |  |         elements = partition_multiple_via_api( | 
					
						
							|  |  |  |             files=files, | 
					
						
							| 
									
										
										
										
											2023-08-24 03:02:47 -04:00
										 |  |  |             metadata_filenames=filenames, | 
					
						
							| 
									
										
										
										
											2023-05-03 15:06:06 -04:00
										 |  |  |         ) | 
					
						
							|  |  |  |     assert len(elements) == 2 | 
					
						
							|  |  |  |     assert elements[0][0] == NarrativeText("This is a test email to use for unit tests.") | 
					
						
							| 
									
										
										
										
											2023-06-08 09:24:16 -04:00
										 |  |  |     assert elements[0][0].metadata.filetype == "message/rfc822" | 
					
						
							| 
									
										
										
										
											2023-05-03 15:06:06 -04:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-08-24 03:02:47 -04:00
										 |  |  | def test_partition_multiple_via_api_warns_with_file_filename(monkeypatch, caplog): | 
					
						
							|  |  |  |     monkeypatch.setattr( | 
					
						
							|  |  |  |         requests, | 
					
						
							|  |  |  |         "post", | 
					
						
							|  |  |  |         lambda *args, **kwargs: MockMultipleResponse(status_code=200), | 
					
						
							|  |  |  |     ) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     filenames = [ | 
					
						
							|  |  |  |         os.path.join(DIRECTORY, "..", "..", "example-docs", EML_TEST_FILE), | 
					
						
							|  |  |  |         os.path.join(DIRECTORY, "..", "..", "example-docs", "fake.docx"), | 
					
						
							|  |  |  |     ] | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     with contextlib.ExitStack() as stack: | 
					
						
							|  |  |  |         files = [stack.enter_context(open(filename, "rb")) for filename in filenames] | 
					
						
							|  |  |  |         partition_multiple_via_api( | 
					
						
							|  |  |  |             files=files, | 
					
						
							|  |  |  |             file_filenames=filenames, | 
					
						
							|  |  |  |         ) | 
					
						
							|  |  |  |     assert "WARNING" in caplog.text | 
					
						
							|  |  |  |     assert "The file_filenames kwarg will be deprecated" in caplog.text | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def test_partition_multiple_via_api_warns_with_file_and_metadata_filename(monkeypatch): | 
					
						
							|  |  |  |     monkeypatch.setattr( | 
					
						
							|  |  |  |         requests, | 
					
						
							|  |  |  |         "post", | 
					
						
							|  |  |  |         lambda *args, **kwargs: MockMultipleResponse(status_code=200), | 
					
						
							|  |  |  |     ) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     filenames = [ | 
					
						
							|  |  |  |         os.path.join(DIRECTORY, "..", "..", "example-docs", EML_TEST_FILE), | 
					
						
							|  |  |  |         os.path.join(DIRECTORY, "..", "..", "example-docs", "fake.docx"), | 
					
						
							|  |  |  |     ] | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     with contextlib.ExitStack() as stack: | 
					
						
							|  |  |  |         files = [stack.enter_context(open(filename, "rb")) for filename in filenames] | 
					
						
							|  |  |  |         with pytest.raises(ValueError): | 
					
						
							|  |  |  |             partition_multiple_via_api( | 
					
						
							|  |  |  |                 files=files, | 
					
						
							|  |  |  |                 metadata_filenames=filenames, | 
					
						
							|  |  |  |                 file_filenames=filenames, | 
					
						
							|  |  |  |             ) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-05-03 15:06:06 -04:00
										 |  |  | def test_partition_multiple_via_api_raises_with_bad_response(monkeypatch): | 
					
						
							|  |  |  |     monkeypatch.setattr( | 
					
						
							|  |  |  |         requests, | 
					
						
							|  |  |  |         "post", | 
					
						
							|  |  |  |         lambda *args, **kwargs: MockMultipleResponse(status_code=500), | 
					
						
							|  |  |  |     ) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     filenames = [ | 
					
						
							| 
									
										
										
										
											2023-06-16 17:52:13 -07:00
										 |  |  |         os.path.join(DIRECTORY, "..", "..", "example-docs", EML_TEST_FILE), | 
					
						
							| 
									
										
										
										
											2023-05-03 15:06:06 -04:00
										 |  |  |         os.path.join(DIRECTORY, "..", "..", "example-docs", "fake.docx"), | 
					
						
							|  |  |  |     ] | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     with pytest.raises(ValueError): | 
					
						
							| 
									
										
										
										
											2023-06-15 15:18:22 -04:00
										 |  |  |         partition_multiple_via_api(filenames=filenames) | 
					
						
							| 
									
										
										
										
											2023-05-03 15:06:06 -04:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def test_partition_multiple_via_api_raises_with_content_types_size_mismatch(monkeypatch): | 
					
						
							|  |  |  |     monkeypatch.setattr( | 
					
						
							|  |  |  |         requests, | 
					
						
							|  |  |  |         "post", | 
					
						
							|  |  |  |         lambda *args, **kwargs: MockMultipleResponse(status_code=500), | 
					
						
							|  |  |  |     ) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     filenames = [ | 
					
						
							|  |  |  |         os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-email.eml"), | 
					
						
							|  |  |  |         os.path.join(DIRECTORY, "..", "..", "example-docs", "fake.docx"), | 
					
						
							|  |  |  |     ] | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     with pytest.raises(ValueError): | 
					
						
							|  |  |  |         partition_multiple_via_api( | 
					
						
							|  |  |  |             filenames=filenames, | 
					
						
							|  |  |  |             content_types=["text/plain"], | 
					
						
							|  |  |  |         ) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def test_partition_multiple_via_api_from_files_raises_with_size_mismatch(monkeypatch): | 
					
						
							|  |  |  |     monkeypatch.setattr( | 
					
						
							|  |  |  |         requests, | 
					
						
							|  |  |  |         "post", | 
					
						
							|  |  |  |         lambda *args, **kwargs: MockMultipleResponse(status_code=200), | 
					
						
							|  |  |  |     ) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     filenames = [ | 
					
						
							| 
									
										
										
										
											2023-06-16 17:52:13 -07:00
										 |  |  |         os.path.join(DIRECTORY, "..", "..", "example-docs", EML_TEST_FILE), | 
					
						
							| 
									
										
										
										
											2023-05-03 15:06:06 -04:00
										 |  |  |         os.path.join(DIRECTORY, "..", "..", "example-docs", "fake.docx"), | 
					
						
							|  |  |  |     ] | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     with contextlib.ExitStack() as stack: | 
					
						
							|  |  |  |         files = [stack.enter_context(open(filename, "rb")) for filename in filenames] | 
					
						
							|  |  |  |         with pytest.raises(ValueError): | 
					
						
							|  |  |  |             partition_multiple_via_api( | 
					
						
							|  |  |  |                 files=files, | 
					
						
							| 
									
										
										
										
											2023-08-24 03:02:47 -04:00
										 |  |  |                 metadata_filenames=filenames, | 
					
						
							| 
									
										
										
										
											2023-05-03 15:06:06 -04:00
										 |  |  |                 content_types=["text/plain"], | 
					
						
							|  |  |  |             ) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def test_partition_multiple_via_api_from_files_raises_without_filenames(monkeypatch): | 
					
						
							|  |  |  |     monkeypatch.setattr( | 
					
						
							|  |  |  |         requests, | 
					
						
							|  |  |  |         "post", | 
					
						
							|  |  |  |         lambda *args, **kwargs: MockMultipleResponse(status_code=200), | 
					
						
							|  |  |  |     ) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     filenames = [ | 
					
						
							| 
									
										
										
										
											2023-06-16 17:52:13 -07:00
										 |  |  |         os.path.join(DIRECTORY, "..", "..", "example-docs", EML_TEST_FILE), | 
					
						
							| 
									
										
										
										
											2023-05-03 15:06:06 -04:00
										 |  |  |         os.path.join(DIRECTORY, "..", "..", "example-docs", "fake.docx"), | 
					
						
							|  |  |  |     ] | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     with contextlib.ExitStack() as stack: | 
					
						
							|  |  |  |         files = [stack.enter_context(open(filename, "rb")) for filename in filenames] | 
					
						
							|  |  |  |         with pytest.raises(ValueError): | 
					
						
							|  |  |  |             partition_multiple_via_api( | 
					
						
							|  |  |  |                 files=files, | 
					
						
							|  |  |  |             ) | 
					
						
							| 
									
										
										
										
											2023-06-12 12:39:58 -04:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-06-29 10:31:01 -07:00
										 |  |  | def get_api_key(): | 
					
						
							|  |  |  |     api_key = os.getenv("UNS_API_KEY") | 
					
						
							|  |  |  |     if api_key is None: | 
					
						
							|  |  |  |         raise ValueError("UNS_API_KEY environment variable not set") | 
					
						
							|  |  |  |     return api_key | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | @pytest.mark.skipif(skip_outside_ci, reason="Skipping test run outside of CI") | 
					
						
							| 
									
										
										
										
											2023-06-30 09:44:46 -05:00
										 |  |  | @pytest.mark.skipif(skip_not_on_main, reason="Skipping test run outside of main branch") | 
					
						
							| 
									
										
										
										
											2023-06-12 12:39:58 -04:00
										 |  |  | def test_partition_multiple_via_api_valid_request_data_kwargs(): | 
					
						
							|  |  |  |     filenames = [ | 
					
						
							|  |  |  |         os.path.join(DIRECTORY, "..", "..", "example-docs", "layout-parser-paper-fast.pdf"), | 
					
						
							|  |  |  |         os.path.join(DIRECTORY, "..", "..", "example-docs", "layout-parser-paper-fast.jpg"), | 
					
						
							|  |  |  |     ] | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-06-29 10:31:01 -07:00
										 |  |  |     elements = partition_multiple_via_api( | 
					
						
							|  |  |  |         filenames=filenames, | 
					
						
							|  |  |  |         strategy="fast", | 
					
						
							|  |  |  |         api_key=get_api_key(), | 
					
						
							|  |  |  |     ) | 
					
						
							| 
									
										
										
										
											2023-06-12 12:39:58 -04:00
										 |  |  |     assert isinstance(elements, list) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-06-29 10:31:01 -07:00
										 |  |  | @pytest.mark.skipif(skip_outside_ci, reason="Skipping test run outside of CI") | 
					
						
							| 
									
										
										
										
											2023-06-12 12:39:58 -04:00
										 |  |  | def test_partition_multiple_via_api_invalid_request_data_kwargs(): | 
					
						
							|  |  |  |     filenames = [ | 
					
						
							|  |  |  |         os.path.join(DIRECTORY, "..", "..", "example-docs", "layout-parser-paper-fast.pdf"), | 
					
						
							|  |  |  |         os.path.join(DIRECTORY, "..", "..", "example-docs", "layout-parser-paper-fast.jpg"), | 
					
						
							|  |  |  |     ] | 
					
						
							|  |  |  |     with pytest.raises(ValueError): | 
					
						
							|  |  |  |         partition_multiple_via_api( | 
					
						
							|  |  |  |             filenames=filenames, | 
					
						
							|  |  |  |             strategy="not_a_strategy", | 
					
						
							| 
									
										
										
										
											2023-06-29 10:31:01 -07:00
										 |  |  |             api_key=get_api_key(), | 
					
						
							| 
									
										
										
										
											2023-06-12 12:39:58 -04:00
										 |  |  |         ) |