| 
									
										
										
										
											2023-04-21 17:35:43 -04:00
										 |  |  | import os | 
					
						
							| 
									
										
										
										
											2023-05-09 21:39:07 -07:00
										 |  |  | from tempfile import SpooledTemporaryFile | 
					
						
							| 
									
										
										
										
											2023-02-27 17:30:54 +01:00
										 |  |  | from unittest import mock | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-11-30 16:34:24 -05:00
										 |  |  | import pytest | 
					
						
							| 
									
										
										
										
											2023-07-05 11:25:11 -07:00
										 |  |  | from PIL import Image | 
					
						
							| 
									
										
										
										
											2023-02-27 17:30:54 +01:00
										 |  |  | from unstructured_inference.inference import layout | 
					
						
							| 
									
										
										
										
											2022-11-30 16:34:24 -05:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-06-20 11:19:55 -05:00
										 |  |  | from unstructured.documents.coordinates import PixelSpace | 
					
						
							| 
									
										
										
										
											2023-07-05 11:25:11 -07:00
										 |  |  | from unstructured.documents.elements import ( | 
					
						
							|  |  |  |     CoordinatesMetadata, | 
					
						
							|  |  |  |     ElementMetadata, | 
					
						
							|  |  |  |     NarrativeText, | 
					
						
							|  |  |  |     Text, | 
					
						
							|  |  |  |     Title, | 
					
						
							|  |  |  | ) | 
					
						
							| 
									
										
										
										
											2023-05-08 13:21:24 -04:00
										 |  |  | from unstructured.partition import pdf, strategies | 
					
						
							| 
									
										
										
										
											2022-11-21 17:27:23 -05:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-11-30 16:34:24 -05:00
										 |  |  | class MockResponse: | 
					
						
							|  |  |  |     def __init__(self, status_code, response): | 
					
						
							|  |  |  |         self.status_code = status_code | 
					
						
							|  |  |  |         self.response = response | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     def json(self): | 
					
						
							|  |  |  |         return self.response | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def mock_healthy_get(url, **kwargs): | 
					
						
							|  |  |  |     return MockResponse(status_code=200, response={}) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def mock_unhealthy_get(url, **kwargs): | 
					
						
							|  |  |  |     return MockResponse(status_code=500, response={}) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def mock_unsuccessful_post(url, **kwargs): | 
					
						
							|  |  |  |     return MockResponse(status_code=500, response={}) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def mock_successful_post(url, **kwargs): | 
					
						
							|  |  |  |     response = { | 
					
						
							|  |  |  |         "pages": [ | 
					
						
							|  |  |  |             { | 
					
						
							|  |  |  |                 "number": 0, | 
					
						
							| 
									
										
										
										
											2023-05-31 13:50:15 -05:00
										 |  |  |                 "elements": [ | 
					
						
							|  |  |  |                     {"type": "Title", "text": "Charlie Brown and the Great Pumpkin"}, | 
					
						
							|  |  |  |                 ], | 
					
						
							| 
									
										
										
										
											2023-02-08 10:11:15 -05:00
										 |  |  |             }, | 
					
						
							|  |  |  |             { | 
					
						
							|  |  |  |                 "number": 1, | 
					
						
							|  |  |  |                 "elements": [{"type": "Title", "text": "A Charlie Brown Christmas"}], | 
					
						
							|  |  |  |             }, | 
					
						
							| 
									
										
										
										
											2023-02-27 17:30:54 +01:00
										 |  |  |         ], | 
					
						
							| 
									
										
										
										
											2022-11-30 16:34:24 -05:00
										 |  |  |     } | 
					
						
							|  |  |  |     return MockResponse(status_code=200, response=response) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-01-04 16:19:05 -06:00
										 |  |  | class MockPageLayout(layout.PageLayout): | 
					
						
							| 
									
										
										
										
											2023-07-05 11:25:11 -07:00
										 |  |  |     def __init__(self, number: int, image: Image): | 
					
						
							|  |  |  |         self.number = number | 
					
						
							|  |  |  |         self.image = image | 
					
						
							| 
									
										
										
										
											2023-01-04 16:19:05 -06:00
										 |  |  | 
 | 
					
						
							|  |  |  |     @property | 
					
						
							|  |  |  |     def elements(self): | 
					
						
							|  |  |  |         return [ | 
					
						
							|  |  |  |             layout.LayoutElement( | 
					
						
							|  |  |  |                 type="Title", | 
					
						
							| 
									
										
										
										
											2023-04-04 19:59:06 -07:00
										 |  |  |                 x1=0, | 
					
						
							|  |  |  |                 y1=0, | 
					
						
							|  |  |  |                 x2=2, | 
					
						
							|  |  |  |                 y2=2, | 
					
						
							| 
									
										
										
										
											2023-01-04 16:19:05 -06:00
										 |  |  |                 text="Charlie Brown and the Great Pumpkin", | 
					
						
							| 
									
										
										
										
											2023-02-27 17:30:54 +01:00
										 |  |  |             ), | 
					
						
							| 
									
										
										
										
											2023-01-04 16:19:05 -06:00
										 |  |  |         ] | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | class MockDocumentLayout(layout.DocumentLayout): | 
					
						
							|  |  |  |     @property | 
					
						
							|  |  |  |     def pages(self): | 
					
						
							|  |  |  |         return [ | 
					
						
							| 
									
										
										
										
											2023-07-05 11:25:11 -07:00
										 |  |  |             MockPageLayout(number=0, image=Image.new("1", (1, 1))), | 
					
						
							| 
									
										
										
										
											2023-01-04 16:19:05 -06:00
										 |  |  |         ] | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | @pytest.mark.parametrize( | 
					
						
							| 
									
										
										
										
											2023-02-27 17:30:54 +01:00
										 |  |  |     ("filename", "file"), | 
					
						
							|  |  |  |     [("example-docs/layout-parser-paper-fast.pdf", None), (None, b"0000")], | 
					
						
							| 
									
										
										
										
											2023-01-04 16:19:05 -06:00
										 |  |  | ) | 
					
						
							|  |  |  | def test_partition_pdf_local(monkeypatch, filename, file): | 
					
						
							| 
									
										
										
										
											2023-01-13 22:24:13 -06:00
										 |  |  |     monkeypatch.setattr( | 
					
						
							| 
									
										
										
										
											2023-02-27 17:30:54 +01:00
										 |  |  |         layout, | 
					
						
							|  |  |  |         "process_data_with_model", | 
					
						
							|  |  |  |         lambda *args, **kwargs: MockDocumentLayout(), | 
					
						
							| 
									
										
										
										
											2023-01-13 22:24:13 -06:00
										 |  |  |     ) | 
					
						
							| 
									
										
										
										
											2023-01-04 16:19:05 -06:00
										 |  |  |     monkeypatch.setattr( | 
					
						
							| 
									
										
										
										
											2023-02-27 17:30:54 +01:00
										 |  |  |         layout, | 
					
						
							|  |  |  |         "process_file_with_model", | 
					
						
							|  |  |  |         lambda *args, **kwargs: MockDocumentLayout(), | 
					
						
							| 
									
										
										
										
											2023-01-04 16:19:05 -06:00
										 |  |  |     ) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-01-13 22:24:13 -06:00
										 |  |  |     partition_pdf_response = pdf._partition_pdf_or_image_local(filename, file) | 
					
						
							| 
									
										
										
										
											2023-01-04 16:19:05 -06:00
										 |  |  |     assert partition_pdf_response[0].text == "Charlie Brown and the Great Pumpkin" | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-01-13 22:24:13 -06:00
										 |  |  | def test_partition_pdf_local_raises_with_no_filename(): | 
					
						
							| 
									
										
										
										
											2022-11-30 16:34:24 -05:00
										 |  |  |     with pytest.raises(FileNotFoundError): | 
					
						
							| 
									
										
										
										
											2023-01-13 22:24:13 -06:00
										 |  |  |         pdf._partition_pdf_or_image_local(filename="", file=None, is_image=False) | 
					
						
							| 
									
										
										
										
											2022-11-30 16:34:24 -05:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-05-09 21:39:07 -07:00
										 |  |  | @pytest.mark.parametrize( | 
					
						
							| 
									
										
										
										
											2023-07-14 13:08:33 -07:00
										 |  |  |     "strategy", | 
					
						
							|  |  |  |     ["fast", "hi_res", "ocr_only"], | 
					
						
							|  |  |  | ) | 
					
						
							|  |  |  | def test_partition_pdf_with_filename( | 
					
						
							|  |  |  |     strategy, | 
					
						
							|  |  |  |     filename="example-docs/layout-parser-paper-fast.pdf", | 
					
						
							|  |  |  | ): | 
					
						
							|  |  |  |     # Test that the partition_pdf function can handle filename | 
					
						
							|  |  |  |     result = pdf.partition_pdf(filename=filename, strategy=strategy) | 
					
						
							|  |  |  |     # validate that the result is a non-empty list of dicts | 
					
						
							|  |  |  |     assert len(result) > 10 | 
					
						
							|  |  |  |     # check that the pdf has multiple different page numbers | 
					
						
							|  |  |  |     assert {element.metadata.page_number for element in result} == {1, 2} | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | @pytest.mark.parametrize( | 
					
						
							|  |  |  |     "strategy", | 
					
						
							|  |  |  |     ["fast", "hi_res", "ocr_only"], | 
					
						
							|  |  |  | ) | 
					
						
							|  |  |  | def test_partition_pdf_with_file_rb( | 
					
						
							|  |  |  |     strategy, | 
					
						
							|  |  |  |     filename="example-docs/layout-parser-paper-fast.pdf", | 
					
						
							|  |  |  | ): | 
					
						
							|  |  |  |     # Test that the partition_pdf function can handle BufferedReader | 
					
						
							|  |  |  |     with open(filename, "rb") as f: | 
					
						
							|  |  |  |         result = pdf.partition_pdf(file=f, strategy=strategy) | 
					
						
							|  |  |  |         # validate that the result is a non-empty list of dicts | 
					
						
							|  |  |  |         assert len(result) > 10 | 
					
						
							|  |  |  |         # check that the pdf has multiple different page numbers | 
					
						
							|  |  |  |         assert {element.metadata.page_number for element in result} == {1, 2} | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | @pytest.mark.parametrize( | 
					
						
							|  |  |  |     "strategy", | 
					
						
							|  |  |  |     ["fast", "hi_res", "ocr_only"], | 
					
						
							| 
									
										
										
										
											2023-05-09 21:39:07 -07:00
										 |  |  | ) | 
					
						
							|  |  |  | def test_partition_pdf_with_spooled_file( | 
					
						
							|  |  |  |     strategy, | 
					
						
							|  |  |  |     filename="example-docs/layout-parser-paper-fast.pdf", | 
					
						
							|  |  |  | ): | 
					
						
							|  |  |  |     # Test that the partition_pdf function can handle a SpooledTemporaryFile | 
					
						
							|  |  |  |     with open(filename, "rb") as test_file: | 
					
						
							|  |  |  |         spooled_temp_file = SpooledTemporaryFile() | 
					
						
							|  |  |  |         spooled_temp_file.write(test_file.read()) | 
					
						
							|  |  |  |         spooled_temp_file.seek(0) | 
					
						
							|  |  |  |         result = pdf.partition_pdf(file=spooled_temp_file, strategy=strategy) | 
					
						
							|  |  |  |         # validate that the result is a non-empty list of dicts | 
					
						
							|  |  |  |         assert len(result) > 10 | 
					
						
							| 
									
										
										
										
											2023-05-30 15:10:14 -04:00
										 |  |  |         # check that the pdf has multiple different page numbers | 
					
						
							| 
									
										
										
										
											2023-06-15 12:21:17 -04:00
										 |  |  |         assert {element.metadata.page_number for element in result} == {1, 2} | 
					
						
							| 
									
										
										
										
											2023-05-09 21:39:07 -07:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-06-27 23:06:08 -05:00
										 |  |  | @mock.patch.dict(os.environ, {"UNSTRUCTURED_HI_RES_MODEL_NAME": "checkbox"}) | 
					
						
							| 
									
										
										
										
											2023-07-07 11:16:55 -04:00
										 |  |  | def test_partition_pdf_with_model_name_env_var( | 
					
						
							| 
									
										
										
										
											2023-05-31 13:50:15 -05:00
										 |  |  |     monkeypatch, | 
					
						
							|  |  |  |     filename="example-docs/layout-parser-paper-fast.pdf", | 
					
						
							|  |  |  | ): | 
					
						
							| 
									
										
										
										
											2023-07-07 23:41:37 -05:00
										 |  |  |     monkeypatch.setattr(pdf, "extractable_elements", lambda *args, **kwargs: []) | 
					
						
							| 
									
										
										
										
											2023-06-27 23:06:08 -05:00
										 |  |  |     with mock.patch.object(layout, "process_file_with_model", mock.MagicMock()) as mock_process: | 
					
						
							| 
									
										
										
										
											2023-07-07 23:41:37 -05:00
										 |  |  |         pdf.partition_pdf(filename=filename, strategy="hi_res") | 
					
						
							| 
									
										
										
										
											2023-06-27 23:06:08 -05:00
										 |  |  |         mock_process.assert_called_once_with( | 
					
						
							|  |  |  |             filename, | 
					
						
							|  |  |  |             is_image=False, | 
					
						
							|  |  |  |             ocr_languages="eng", | 
					
						
							|  |  |  |             extract_tables=False, | 
					
						
							|  |  |  |             model_name="checkbox", | 
					
						
							|  |  |  |         ) | 
					
						
							| 
									
										
										
										
											2023-02-08 10:11:15 -05:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-07-07 11:16:55 -04:00
										 |  |  | def test_partition_pdf_with_model_name( | 
					
						
							|  |  |  |     monkeypatch, | 
					
						
							|  |  |  |     filename="example-docs/layout-parser-paper-fast.pdf", | 
					
						
							|  |  |  | ): | 
					
						
							| 
									
										
										
										
											2023-07-07 23:41:37 -05:00
										 |  |  |     monkeypatch.setattr(pdf, "extractable_elements", lambda *args, **kwargs: []) | 
					
						
							| 
									
										
										
										
											2023-07-07 11:16:55 -04:00
										 |  |  |     with mock.patch.object(layout, "process_file_with_model", mock.MagicMock()) as mock_process: | 
					
						
							|  |  |  |         pdf.partition_pdf(filename=filename, strategy="hi_res", model_name="checkbox") | 
					
						
							|  |  |  |         mock_process.assert_called_once_with( | 
					
						
							|  |  |  |             filename, | 
					
						
							|  |  |  |             is_image=False, | 
					
						
							|  |  |  |             ocr_languages="eng", | 
					
						
							|  |  |  |             extract_tables=False, | 
					
						
							|  |  |  |             model_name="checkbox", | 
					
						
							|  |  |  |         ) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-05-31 13:50:15 -05:00
										 |  |  | def test_partition_pdf_with_auto_strategy( | 
					
						
							|  |  |  |     filename="example-docs/layout-parser-paper-fast.pdf", | 
					
						
							|  |  |  | ): | 
					
						
							| 
									
										
										
										
											2023-05-12 13:45:08 -04:00
										 |  |  |     elements = pdf.partition_pdf(filename=filename, strategy="auto") | 
					
						
							|  |  |  |     title = "LayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis" | 
					
						
							| 
									
										
										
										
											2023-06-22 11:19:54 -04:00
										 |  |  |     assert elements[0].text == title | 
					
						
							|  |  |  |     assert elements[0].metadata.filename == "layout-parser-paper-fast.pdf" | 
					
						
							|  |  |  |     assert elements[0].metadata.file_directory == "example-docs" | 
					
						
							| 
									
										
										
										
											2023-05-12 13:45:08 -04:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-05-31 13:50:15 -05:00
										 |  |  | def test_partition_pdf_with_page_breaks( | 
					
						
							|  |  |  |     filename="example-docs/layout-parser-paper-fast.pdf", | 
					
						
							|  |  |  | ): | 
					
						
							| 
									
										
										
										
											2023-02-08 10:11:15 -05:00
										 |  |  |     elements = pdf.partition_pdf(filename=filename, url=None, include_page_breaks=True) | 
					
						
							| 
									
										
										
										
											2023-06-28 23:14:05 -04:00
										 |  |  |     assert "PageBreak" in [elem.category for elem in elements] | 
					
						
							| 
									
										
										
										
											2023-02-08 10:11:15 -05:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-05-31 13:50:15 -05:00
										 |  |  | def test_partition_pdf_with_no_page_breaks( | 
					
						
							|  |  |  |     filename="example-docs/layout-parser-paper-fast.pdf", | 
					
						
							|  |  |  | ): | 
					
						
							| 
									
										
										
										
											2023-02-08 10:11:15 -05:00
										 |  |  |     elements = pdf.partition_pdf(filename=filename, url=None) | 
					
						
							| 
									
										
										
										
											2023-06-28 23:14:05 -04:00
										 |  |  |     assert "PageBreak" not in [elem.category for elem in elements] | 
					
						
							| 
									
										
										
										
											2023-03-10 22:16:05 -05:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-05-31 13:50:15 -05:00
										 |  |  | def test_partition_pdf_with_fast_strategy( | 
					
						
							|  |  |  |     filename="example-docs/layout-parser-paper-fast.pdf", | 
					
						
							|  |  |  | ): | 
					
						
							| 
									
										
										
										
											2023-03-10 22:16:05 -05:00
										 |  |  |     elements = pdf.partition_pdf(filename=filename, url=None, strategy="fast") | 
					
						
							|  |  |  |     assert len(elements) > 10 | 
					
						
							| 
									
										
										
										
											2023-05-30 15:10:14 -04:00
										 |  |  |     # check that the pdf has multiple different page numbers | 
					
						
							| 
									
										
										
										
											2023-06-15 12:21:17 -04:00
										 |  |  |     assert {element.metadata.page_number for element in elements} == {1, 2} | 
					
						
							| 
									
										
										
										
											2023-07-05 15:02:22 -05:00
										 |  |  |     for element in elements: | 
					
						
							|  |  |  |         assert element.metadata.filename == "layout-parser-paper-fast.pdf" | 
					
						
							| 
									
										
										
										
											2023-03-10 22:16:05 -05:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-05-31 13:50:15 -05:00
										 |  |  | def test_partition_pdf_with_fast_groups_text( | 
					
						
							|  |  |  |     filename="example-docs/layout-parser-paper-fast.pdf", | 
					
						
							|  |  |  | ): | 
					
						
							| 
									
										
										
										
											2023-04-19 13:54:17 -04:00
										 |  |  |     elements = pdf.partition_pdf(filename=filename, url=None, strategy="fast") | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     first_narrative_element = None | 
					
						
							|  |  |  |     for element in elements: | 
					
						
							|  |  |  |         if isinstance(element, NarrativeText): | 
					
						
							|  |  |  |             first_narrative_element = element | 
					
						
							|  |  |  |             break | 
					
						
							|  |  |  |     assert len(first_narrative_element.text) > 1000 | 
					
						
							|  |  |  |     assert first_narrative_element.text.startswith("Abstract. Recent advances") | 
					
						
							|  |  |  |     assert first_narrative_element.text.endswith("https://layout-parser.github.io.") | 
					
						
							| 
									
										
										
										
											2023-07-05 15:02:22 -05:00
										 |  |  |     assert first_narrative_element.metadata.filename == "layout-parser-paper-fast.pdf" | 
					
						
							| 
									
										
										
										
											2023-04-19 13:54:17 -04:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-03-10 22:16:05 -05:00
										 |  |  | def test_partition_pdf_with_fast_strategy_from_file( | 
					
						
							|  |  |  |     filename="example-docs/layout-parser-paper-fast.pdf", | 
					
						
							|  |  |  | ): | 
					
						
							|  |  |  |     with open(filename, "rb") as f: | 
					
						
							|  |  |  |         elements = pdf.partition_pdf(file=f, url=None, strategy="fast") | 
					
						
							|  |  |  |     assert len(elements) > 10 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def test_partition_pdf_with_fast_strategy_and_page_breaks( | 
					
						
							| 
									
										
										
										
											2023-04-13 11:46:35 -04:00
										 |  |  |     caplog, | 
					
						
							| 
									
										
										
										
											2023-03-10 22:16:05 -05:00
										 |  |  |     filename="example-docs/layout-parser-paper-fast.pdf", | 
					
						
							|  |  |  | ): | 
					
						
							|  |  |  |     elements = pdf.partition_pdf( | 
					
						
							|  |  |  |         filename=filename, | 
					
						
							|  |  |  |         url=None, | 
					
						
							|  |  |  |         strategy="fast", | 
					
						
							|  |  |  |         include_page_breaks=True, | 
					
						
							|  |  |  |     ) | 
					
						
							|  |  |  |     assert len(elements) > 10 | 
					
						
							| 
									
										
										
										
											2023-06-28 23:14:05 -04:00
										 |  |  |     assert "PageBreak" in [elem.category for elem in elements] | 
					
						
							| 
									
										
										
										
											2023-03-10 22:16:05 -05:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-05-31 13:50:15 -05:00
										 |  |  |     assert "unstructured_inference is not installed" not in caplog.text | 
					
						
							| 
									
										
										
										
											2023-07-05 15:02:22 -05:00
										 |  |  |     for element in elements: | 
					
						
							|  |  |  |         assert element.metadata.filename == "layout-parser-paper-fast.pdf" | 
					
						
							| 
									
										
										
										
											2023-04-13 11:46:35 -04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-03-10 22:16:05 -05:00
										 |  |  | 
 | 
					
						
							|  |  |  | def test_partition_pdf_raises_with_bad_strategy( | 
					
						
							|  |  |  |     filename="example-docs/layout-parser-paper-fast.pdf", | 
					
						
							|  |  |  | ): | 
					
						
							|  |  |  |     with pytest.raises(ValueError): | 
					
						
							|  |  |  |         pdf.partition_pdf(filename=filename, url=None, strategy="made_up") | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def test_partition_pdf_falls_back_to_fast( | 
					
						
							|  |  |  |     monkeypatch, | 
					
						
							| 
									
										
										
										
											2023-04-13 11:46:35 -04:00
										 |  |  |     caplog, | 
					
						
							| 
									
										
										
										
											2023-03-10 22:16:05 -05:00
										 |  |  |     filename="example-docs/layout-parser-paper-fast.pdf", | 
					
						
							|  |  |  | ): | 
					
						
							| 
									
										
										
										
											2023-05-08 13:21:24 -04:00
										 |  |  |     def mock_exists(dep): | 
					
						
							| 
									
										
										
										
											2023-05-31 13:50:15 -05:00
										 |  |  |         return dep not in ["unstructured_inference", "pytesseract"] | 
					
						
							| 
									
										
										
										
											2023-05-08 13:21:24 -04:00
										 |  |  | 
 | 
					
						
							|  |  |  |     monkeypatch.setattr(strategies, "dependency_exists", mock_exists) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     mock_return = [Text("Hello there!")] | 
					
						
							|  |  |  |     with mock.patch.object( | 
					
						
							|  |  |  |         pdf, | 
					
						
							| 
									
										
										
										
											2023-07-07 23:41:37 -05:00
										 |  |  |         "extractable_elements", | 
					
						
							| 
									
										
										
										
											2023-05-08 13:21:24 -04:00
										 |  |  |         return_value=mock_return, | 
					
						
							|  |  |  |     ) as mock_partition: | 
					
						
							|  |  |  |         pdf.partition_pdf(filename=filename, url=None, strategy="hi_res") | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     mock_partition.assert_called_once() | 
					
						
							| 
									
										
										
										
											2023-05-31 13:50:15 -05:00
										 |  |  |     assert "unstructured_inference is not installed" in caplog.text | 
					
						
							| 
									
										
										
										
											2023-05-08 13:21:24 -04:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def test_partition_pdf_falls_back_to_fast_from_ocr_only( | 
					
						
							|  |  |  |     monkeypatch, | 
					
						
							|  |  |  |     caplog, | 
					
						
							|  |  |  |     filename="example-docs/layout-parser-paper-fast.pdf", | 
					
						
							|  |  |  | ): | 
					
						
							|  |  |  |     def mock_exists(dep): | 
					
						
							|  |  |  |         return dep not in ["pytesseract"] | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     monkeypatch.setattr(strategies, "dependency_exists", mock_exists) | 
					
						
							| 
									
										
										
										
											2023-03-10 22:16:05 -05:00
										 |  |  | 
 | 
					
						
							|  |  |  |     mock_return = [Text("Hello there!")] | 
					
						
							|  |  |  |     with mock.patch.object( | 
					
						
							|  |  |  |         pdf, | 
					
						
							| 
									
										
										
										
											2023-07-07 23:41:37 -05:00
										 |  |  |         "extractable_elements", | 
					
						
							| 
									
										
										
										
											2023-03-10 22:16:05 -05:00
										 |  |  |         return_value=mock_return, | 
					
						
							| 
									
										
										
										
											2023-05-08 13:21:24 -04:00
										 |  |  |     ) as mock_partition: | 
					
						
							|  |  |  |         pdf.partition_pdf(filename=filename, url=None, strategy="ocr_only") | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     mock_partition.assert_called_once() | 
					
						
							|  |  |  |     assert "pytesseract is not installed" in caplog.text | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def test_partition_pdf_falls_back_to_hi_res_from_ocr_only( | 
					
						
							|  |  |  |     monkeypatch, | 
					
						
							|  |  |  |     caplog, | 
					
						
							|  |  |  |     filename="example-docs/layout-parser-paper-fast.pdf", | 
					
						
							|  |  |  | ): | 
					
						
							|  |  |  |     def mock_exists(dep): | 
					
						
							|  |  |  |         return dep not in ["pytesseract"] | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     monkeypatch.setattr(strategies, "dependency_exists", mock_exists) | 
					
						
							| 
									
										
										
										
											2023-07-07 23:41:37 -05:00
										 |  |  |     monkeypatch.setattr(pdf, "extractable_elements", lambda *args, **kwargs: []) | 
					
						
							| 
									
										
										
										
											2023-05-08 13:21:24 -04:00
										 |  |  | 
 | 
					
						
							|  |  |  |     mock_return = [Text("Hello there!")] | 
					
						
							|  |  |  |     with mock.patch.object( | 
					
						
							|  |  |  |         pdf, | 
					
						
							|  |  |  |         "_partition_pdf_or_image_local", | 
					
						
							|  |  |  |         return_value=mock_return, | 
					
						
							|  |  |  |     ) as mock_partition: | 
					
						
							|  |  |  |         pdf.partition_pdf(filename=filename, url=None, strategy="ocr_only") | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     mock_partition.assert_called_once() | 
					
						
							|  |  |  |     assert "pytesseract is not installed" in caplog.text | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def test_partition_pdf_falls_back_to_ocr_only( | 
					
						
							|  |  |  |     monkeypatch, | 
					
						
							|  |  |  |     caplog, | 
					
						
							|  |  |  |     filename="example-docs/layout-parser-paper-fast.pdf", | 
					
						
							|  |  |  | ): | 
					
						
							|  |  |  |     def mock_exists(dep): | 
					
						
							| 
									
										
										
										
											2023-05-31 13:50:15 -05:00
										 |  |  |         return dep not in ["unstructured_inference"] | 
					
						
							| 
									
										
										
										
											2023-05-08 13:21:24 -04:00
										 |  |  | 
 | 
					
						
							|  |  |  |     monkeypatch.setattr(strategies, "dependency_exists", mock_exists) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     mock_return = [Text("Hello there!")] | 
					
						
							|  |  |  |     with mock.patch.object( | 
					
						
							|  |  |  |         pdf, | 
					
						
							|  |  |  |         "_partition_pdf_or_image_with_ocr", | 
					
						
							|  |  |  |         return_value=mock_return, | 
					
						
							| 
									
										
										
										
											2023-03-10 22:16:05 -05:00
										 |  |  |     ) as mock_partition: | 
					
						
							| 
									
										
										
										
											2023-04-13 11:46:35 -04:00
										 |  |  |         pdf.partition_pdf(filename=filename, url=None, strategy="hi_res") | 
					
						
							| 
									
										
										
										
											2023-03-10 22:16:05 -05:00
										 |  |  | 
 | 
					
						
							|  |  |  |     mock_partition.assert_called_once() | 
					
						
							| 
									
										
										
										
											2023-05-31 13:50:15 -05:00
										 |  |  |     assert "unstructured_inference is not installed" in caplog.text | 
					
						
							| 
									
										
										
										
											2023-04-21 12:01:29 -05:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def test_partition_pdf_uses_table_extraction(): | 
					
						
							|  |  |  |     filename = "example-docs/layout-parser-paper-fast.pdf" | 
					
						
							|  |  |  |     with mock.patch( | 
					
						
							|  |  |  |         "unstructured_inference.inference.layout.process_file_with_model", | 
					
						
							|  |  |  |     ) as mock_process_file_with_model: | 
					
						
							| 
									
										
										
										
											2023-04-21 13:48:19 -05:00
										 |  |  |         pdf.partition_pdf(filename, infer_table_structure=True) | 
					
						
							| 
									
										
										
										
											2023-04-21 12:01:29 -05:00
										 |  |  |         assert mock_process_file_with_model.call_args[1]["extract_tables"] | 
					
						
							| 
									
										
										
										
											2023-04-21 17:35:43 -04:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def test_partition_pdf_with_copy_protection(): | 
					
						
							|  |  |  |     filename = os.path.join("example-docs", "copy-protected.pdf") | 
					
						
							|  |  |  |     elements = pdf.partition_pdf(filename=filename, strategy="hi_res") | 
					
						
							|  |  |  |     elements[0] == Title("LayoutParser: A Unified Toolkit for Deep Based Document Image Analysis") | 
					
						
							| 
									
										
										
										
											2023-05-30 15:10:14 -04:00
										 |  |  |     # check that the pdf has multiple different page numbers | 
					
						
							| 
									
										
										
										
											2023-06-15 12:21:17 -04:00
										 |  |  |     assert {element.metadata.page_number for element in elements} == {1, 2} | 
					
						
							| 
									
										
										
										
											2023-07-26 09:26:06 -04:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def test_partition_pdf_with_dpi(): | 
					
						
							|  |  |  |     filename = os.path.join("example-docs", "copy-protected.pdf") | 
					
						
							|  |  |  |     with mock.patch.object(layout, "process_file_with_model", mock.MagicMock()) as mock_process: | 
					
						
							|  |  |  |         pdf.partition_pdf(filename=filename, strategy="hi_res", pdf_image_dpi=100) | 
					
						
							|  |  |  |         mock_process.assert_called_once_with( | 
					
						
							|  |  |  |             filename, | 
					
						
							|  |  |  |             is_image=False, | 
					
						
							|  |  |  |             ocr_languages="eng", | 
					
						
							|  |  |  |             extract_tables=False, | 
					
						
							|  |  |  |             model_name=None, | 
					
						
							|  |  |  |             pdf_image_dpi=100, | 
					
						
							|  |  |  |         ) | 
					
						
							| 
									
										
										
										
											2023-04-21 17:35:43 -04:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-06-22 11:19:54 -04:00
										 |  |  | def test_partition_pdf_requiring_recursive_text_grab(filename="example-docs/reliance.pdf"): | 
					
						
							|  |  |  |     elements = pdf.partition_pdf(filename=filename, strategy="fast") | 
					
						
							|  |  |  |     assert len(elements) > 50 | 
					
						
							|  |  |  |     assert elements[0].metadata.page_number == 1 | 
					
						
							|  |  |  |     assert elements[-1].metadata.page_number == 3 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-04-21 17:35:43 -04:00
										 |  |  | def test_partition_pdf_with_copy_protection_fallback_to_hi_res(caplog): | 
					
						
							| 
									
										
										
										
											2023-07-07 23:41:37 -05:00
										 |  |  |     filename = os.path.join("example-docs", "loremipsum-flat.pdf") | 
					
						
							| 
									
										
										
										
											2023-04-21 17:35:43 -04:00
										 |  |  |     elements = pdf.partition_pdf(filename=filename, strategy="fast") | 
					
						
							| 
									
										
										
										
											2023-05-31 13:50:15 -05:00
										 |  |  |     elements[0] == Title( | 
					
						
							|  |  |  |         "LayoutParser: A Unified Toolkit for Deep Based Document Image Analysis", | 
					
						
							|  |  |  |     ) | 
					
						
							| 
									
										
										
										
											2023-04-21 17:35:43 -04:00
										 |  |  |     assert "PDF text is not extractable" in caplog.text | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def test_partition_pdf_fails_if_pdf_not_processable( | 
					
						
							|  |  |  |     monkeypatch, | 
					
						
							|  |  |  |     filename="example-docs/layout-parser-paper-fast.pdf", | 
					
						
							|  |  |  | ): | 
					
						
							| 
									
										
										
										
											2023-05-08 13:21:24 -04:00
										 |  |  |     def mock_exists(dep): | 
					
						
							| 
									
										
										
										
											2023-05-31 13:50:15 -05:00
										 |  |  |         return dep not in ["unstructured_inference", "pytesseract"] | 
					
						
							| 
									
										
										
										
											2023-05-08 13:21:24 -04:00
										 |  |  | 
 | 
					
						
							|  |  |  |     monkeypatch.setattr(strategies, "dependency_exists", mock_exists) | 
					
						
							| 
									
										
										
										
											2023-07-07 23:41:37 -05:00
										 |  |  |     monkeypatch.setattr(pdf, "extractable_elements", lambda *args, **kwargs: []) | 
					
						
							| 
									
										
										
										
											2023-04-21 17:35:43 -04:00
										 |  |  | 
 | 
					
						
							|  |  |  |     with pytest.raises(ValueError): | 
					
						
							|  |  |  |         pdf.partition_pdf(filename=filename) | 
					
						
							| 
									
										
										
										
											2023-05-03 18:33:24 -04:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def test_partition_pdf_fast_groups_text_in_text_box(): | 
					
						
							|  |  |  |     filename = os.path.join("example-docs", "chevron-page.pdf") | 
					
						
							|  |  |  |     elements = pdf.partition_pdf(filename=filename, strategy="fast") | 
					
						
							| 
									
										
										
										
											2023-07-05 11:25:11 -07:00
										 |  |  |     expected_coordinate_points_0 = ( | 
					
						
							|  |  |  |         (193.1741, 71.94000000000005), | 
					
						
							|  |  |  |         (193.1741, 91.94000000000005), | 
					
						
							|  |  |  |         (418.6881, 91.94000000000005), | 
					
						
							|  |  |  |         (418.6881, 71.94000000000005), | 
					
						
							|  |  |  |     ) | 
					
						
							|  |  |  |     expected_coordinate_system_0 = PixelSpace(width=612, height=792) | 
					
						
							|  |  |  |     expected_elem_metadata_0 = ElementMetadata( | 
					
						
							|  |  |  |         coordinates=CoordinatesMetadata( | 
					
						
							|  |  |  |             points=expected_coordinate_points_0, | 
					
						
							|  |  |  |             system=expected_coordinate_system_0, | 
					
						
							| 
									
										
										
										
											2023-05-20 16:26:55 -05:00
										 |  |  |         ), | 
					
						
							|  |  |  |     ) | 
					
						
							| 
									
										
										
										
											2023-07-05 11:25:11 -07:00
										 |  |  |     assert elements[0] == Title("eastern mediterranean", metadata=expected_elem_metadata_0) | 
					
						
							| 
									
										
										
										
											2023-05-03 18:33:24 -04:00
										 |  |  |     assert isinstance(elements[1], NarrativeText) | 
					
						
							|  |  |  |     assert str(elements[1]).startswith("We") | 
					
						
							|  |  |  |     assert str(elements[1]).endswith("Jordan and Egypt.") | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-07-05 11:25:11 -07:00
										 |  |  |     expected_coordinate_points_3 = ( | 
					
						
							|  |  |  |         (273.9929, 181.16470000000004), | 
					
						
							|  |  |  |         (273.9929, 226.16470000000004), | 
					
						
							|  |  |  |         (333.59990000000005, 226.16470000000004), | 
					
						
							|  |  |  |         (333.59990000000005, 181.16470000000004), | 
					
						
							|  |  |  |     ) | 
					
						
							|  |  |  |     expected_coordinate_system_3 = PixelSpace(width=612, height=792) | 
					
						
							|  |  |  |     expected_elem_metadata_3 = ElementMetadata( | 
					
						
							|  |  |  |         coordinates=CoordinatesMetadata( | 
					
						
							|  |  |  |             points=expected_coordinate_points_3, | 
					
						
							|  |  |  |             system=expected_coordinate_system_3, | 
					
						
							| 
									
										
										
										
											2023-05-20 16:26:55 -05:00
										 |  |  |         ), | 
					
						
							| 
									
										
										
										
											2023-05-03 18:33:24 -04:00
										 |  |  |     ) | 
					
						
							| 
									
										
										
										
											2023-07-05 11:25:11 -07:00
										 |  |  |     assert elements[3] == Title("1st", metadata=expected_elem_metadata_3) | 
					
						
							| 
									
										
										
										
											2023-06-30 09:44:46 -05:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-07-05 15:02:22 -05:00
										 |  |  | def test_partition_pdf_with_metadata_filename( | 
					
						
							|  |  |  |     filename="example-docs/layout-parser-paper-fast.pdf", | 
					
						
							|  |  |  | ): | 
					
						
							|  |  |  |     elements = pdf.partition_pdf( | 
					
						
							|  |  |  |         filename=filename, | 
					
						
							|  |  |  |         url=None, | 
					
						
							|  |  |  |         include_page_breaks=True, | 
					
						
							|  |  |  |         metadata_filename="test", | 
					
						
							|  |  |  |     ) | 
					
						
							|  |  |  |     for element in elements: | 
					
						
							|  |  |  |         assert element.metadata.filename == "test" | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def test_partition_pdf_with_fast_strategy_from_file_with_metadata_filename( | 
					
						
							|  |  |  |     filename="example-docs/layout-parser-paper-fast.pdf", | 
					
						
							|  |  |  | ): | 
					
						
							|  |  |  |     with open(filename, "rb") as f: | 
					
						
							|  |  |  |         elements = pdf.partition_pdf(file=f, url=None, strategy="fast", metadata_filename="test") | 
					
						
							|  |  |  |     for element in elements: | 
					
						
							|  |  |  |         assert element.metadata.filename == "test" | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-06-30 09:44:46 -05:00
										 |  |  | def test_partition_pdf_with_auto_strategy_exclude_metadata( | 
					
						
							|  |  |  |     filename="example-docs/layout-parser-paper-fast.pdf", | 
					
						
							|  |  |  | ): | 
					
						
							|  |  |  |     elements = pdf.partition_pdf(filename=filename, strategy="auto", include_metadata=False) | 
					
						
							|  |  |  |     title = "LayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis" | 
					
						
							|  |  |  |     assert elements[0].text == title | 
					
						
							|  |  |  |     for i in range(len(elements)): | 
					
						
							|  |  |  |         assert elements[i].metadata.to_dict() == {} | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def test_partition_pdf_with_fast_strategy_from_file_exclude_metadata( | 
					
						
							|  |  |  |     filename="example-docs/layout-parser-paper-fast.pdf", | 
					
						
							|  |  |  | ): | 
					
						
							|  |  |  |     with open(filename, "rb") as f: | 
					
						
							|  |  |  |         elements = pdf.partition_pdf(file=f, url=None, strategy="fast", include_metadata=False) | 
					
						
							|  |  |  |     for i in range(len(elements)): | 
					
						
							|  |  |  |         assert elements[i].metadata.to_dict() == {} |