| 
									
										
										
										
											2023-05-04 16:23:51 -04:00
										 |  |  | import os | 
					
						
							|  |  |  | import pathlib | 
					
						
							| 
									
										
										
										
											2023-02-27 17:30:54 +01:00
										 |  |  | from unittest import mock | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-01-13 22:24:13 -06:00
										 |  |  | import pytest | 
					
						
							| 
									
										
										
										
											2023-07-05 11:25:11 -07:00
										 |  |  | from PIL import Image | 
					
						
							| 
									
										
										
										
											2023-04-21 09:41:26 -04:00
										 |  |  | from pytesseract import TesseractError | 
					
						
							| 
									
										
										
										
											2023-02-27 17:30:54 +01:00
										 |  |  | from unstructured_inference.inference import layout | 
					
						
							| 
									
										
										
										
											2023-01-13 22:24:13 -06:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-05-04 16:23:51 -04:00
										 |  |  | from unstructured.documents.elements import Title | 
					
						
							| 
									
										
										
										
											2023-02-27 17:30:54 +01:00
										 |  |  | from unstructured.partition import image, pdf | 
					
						
							| 
									
										
										
										
											2023-01-13 22:24:13 -06:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-05-04 16:23:51 -04:00
										 |  |  | DIRECTORY = pathlib.Path(__file__).parent.resolve() | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-01-13 22:24:13 -06:00
										 |  |  | 
 | 
					
						
							|  |  |  | class MockResponse: | 
					
						
							|  |  |  |     def __init__(self, status_code, response): | 
					
						
							|  |  |  |         self.status_code = status_code | 
					
						
							|  |  |  |         self.response = response | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     def json(self): | 
					
						
							|  |  |  |         return self.response | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def mock_healthy_get(url, **kwargs): | 
					
						
							|  |  |  |     return MockResponse(status_code=200, response={}) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def mock_unhealthy_get(url, **kwargs): | 
					
						
							|  |  |  |     return MockResponse(status_code=500, response={}) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def mock_unsuccessful_post(url, **kwargs): | 
					
						
							|  |  |  |     return MockResponse(status_code=500, response={}) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def mock_successful_post(url, **kwargs): | 
					
						
							|  |  |  |     response = { | 
					
						
							|  |  |  |         "pages": [ | 
					
						
							|  |  |  |             { | 
					
						
							|  |  |  |                 "number": 0, | 
					
						
							|  |  |  |                 "elements": [{"type": "Title", "text": "Charlie Brown and the Great Pumpkin"}], | 
					
						
							| 
									
										
										
										
											2023-02-08 10:11:15 -05:00
										 |  |  |             }, | 
					
						
							|  |  |  |             { | 
					
						
							|  |  |  |                 "number": 1, | 
					
						
							|  |  |  |                 "elements": [{"type": "Title", "text": "A Charlie Brown Christmas"}], | 
					
						
							|  |  |  |             }, | 
					
						
							| 
									
										
										
										
											2023-02-27 17:30:54 +01:00
										 |  |  |         ], | 
					
						
							| 
									
										
										
										
											2023-01-13 22:24:13 -06:00
										 |  |  |     } | 
					
						
							|  |  |  |     return MockResponse(status_code=200, response=response) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | class MockPageLayout(layout.PageLayout): | 
					
						
							| 
									
										
										
										
											2023-07-05 11:25:11 -07:00
										 |  |  |     def __init__(self, number: int, image: Image): | 
					
						
							|  |  |  |         self.number = number | 
					
						
							|  |  |  |         self.image = image | 
					
						
							| 
									
										
										
										
											2023-01-13 22:24:13 -06:00
										 |  |  | 
 | 
					
						
							|  |  |  |     @property | 
					
						
							|  |  |  |     def elements(self): | 
					
						
							|  |  |  |         return [ | 
					
						
							|  |  |  |             layout.LayoutElement( | 
					
						
							|  |  |  |                 type="Title", | 
					
						
							| 
									
										
										
										
											2023-04-04 19:59:06 -07:00
										 |  |  |                 x1=0, | 
					
						
							|  |  |  |                 y1=0, | 
					
						
							|  |  |  |                 x2=2, | 
					
						
							|  |  |  |                 y2=2, | 
					
						
							| 
									
										
										
										
											2023-01-13 22:24:13 -06:00
										 |  |  |                 text="Charlie Brown and the Great Pumpkin", | 
					
						
							| 
									
										
										
										
											2023-02-27 17:30:54 +01:00
										 |  |  |             ), | 
					
						
							| 
									
										
										
										
											2023-01-13 22:24:13 -06:00
										 |  |  |         ] | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | class MockDocumentLayout(layout.DocumentLayout): | 
					
						
							|  |  |  |     @property | 
					
						
							|  |  |  |     def pages(self): | 
					
						
							|  |  |  |         return [ | 
					
						
							| 
									
										
										
										
											2023-07-05 11:25:11 -07:00
										 |  |  |             MockPageLayout(number=0, image=Image.new("1", (1, 1))), | 
					
						
							| 
									
										
										
										
											2023-01-13 22:24:13 -06:00
										 |  |  |         ] | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-02-27 17:30:54 +01:00
										 |  |  | @pytest.mark.parametrize( | 
					
						
							|  |  |  |     ("filename", "file"), | 
					
						
							|  |  |  |     [("example-docs/example.jpg", None), (None, b"0000")], | 
					
						
							|  |  |  | ) | 
					
						
							| 
									
										
										
										
											2023-01-13 22:24:13 -06:00
										 |  |  | def test_partition_image_local(monkeypatch, filename, file): | 
					
						
							|  |  |  |     monkeypatch.setattr( | 
					
						
							| 
									
										
										
										
											2023-02-27 17:30:54 +01:00
										 |  |  |         layout, | 
					
						
							|  |  |  |         "process_data_with_model", | 
					
						
							|  |  |  |         lambda *args, **kwargs: MockDocumentLayout(), | 
					
						
							| 
									
										
										
										
											2023-01-13 22:24:13 -06:00
										 |  |  |     ) | 
					
						
							|  |  |  |     monkeypatch.setattr( | 
					
						
							| 
									
										
										
										
											2023-02-27 17:30:54 +01:00
										 |  |  |         layout, | 
					
						
							|  |  |  |         "process_file_with_model", | 
					
						
							|  |  |  |         lambda *args, **kwargs: MockDocumentLayout(), | 
					
						
							| 
									
										
										
										
											2023-01-13 22:24:13 -06:00
										 |  |  |     ) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     partition_image_response = pdf._partition_pdf_or_image_local(filename, file, is_image=True) | 
					
						
							|  |  |  |     assert partition_image_response[0].text == "Charlie Brown and the Great Pumpkin" | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | @pytest.mark.skip("Needs to be fixed upstream in unstructured-inference") | 
					
						
							|  |  |  | def test_partition_image_local_raises_with_no_filename(): | 
					
						
							|  |  |  |     with pytest.raises(FileNotFoundError): | 
					
						
							|  |  |  |         pdf._partition_pdf_or_image_local(filename="", file=None, is_image=True) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-05-12 13:45:08 -04:00
										 |  |  | def test_partition_image_with_auto_strategy(filename="example-docs/layout-parser-paper-fast.jpg"): | 
					
						
							|  |  |  |     elements = image.partition_image(filename=filename, strategy="auto") | 
					
						
							|  |  |  |     titles = [el for el in elements if el.category == "Title" and len(el.text.split(" ")) > 10] | 
					
						
							|  |  |  |     title = "LayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis" | 
					
						
							|  |  |  |     assert titles[0].text == title | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-04-21 09:41:26 -04:00
										 |  |  | def test_partition_image_with_language_passed(filename="example-docs/example.jpg"): | 
					
						
							|  |  |  |     with mock.patch.object(layout, "process_file_with_model", mock.MagicMock()) as mock_partition: | 
					
						
							| 
									
										
										
										
											2023-05-12 13:45:08 -04:00
										 |  |  |         image.partition_image(filename=filename, strategy="hi_res", ocr_languages="eng+swe") | 
					
						
							| 
									
										
										
										
											2023-04-21 09:41:26 -04:00
										 |  |  | 
 | 
					
						
							|  |  |  |     assert mock_partition.call_args.kwargs.get("ocr_languages") == "eng+swe" | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def test_partition_image_from_file_with_language_passed(filename="example-docs/example.jpg"): | 
					
						
							|  |  |  |     with mock.patch.object(layout, "process_data_with_model", mock.MagicMock()) as mock_partition: | 
					
						
							|  |  |  |         with open(filename, "rb") as f: | 
					
						
							| 
									
										
										
										
											2023-05-12 13:45:08 -04:00
										 |  |  |             image.partition_image(file=f, strategy="hi_res", ocr_languages="eng+swe") | 
					
						
							| 
									
										
										
										
											2023-04-21 09:41:26 -04:00
										 |  |  | 
 | 
					
						
							|  |  |  |     assert mock_partition.call_args.kwargs.get("ocr_languages") == "eng+swe" | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def test_partition_image_raises_with_invalid_language(filename="example-docs/example.jpg"): | 
					
						
							|  |  |  |     with pytest.raises(TesseractError): | 
					
						
							| 
									
										
										
										
											2023-05-12 13:45:08 -04:00
										 |  |  |         image.partition_image(filename=filename, strategy="hi_res", ocr_languages="fakeroo") | 
					
						
							| 
									
										
										
										
											2023-05-04 16:23:51 -04:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def test_partition_image_with_ocr_detects_korean(): | 
					
						
							|  |  |  |     filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "english-and-korean.png") | 
					
						
							|  |  |  |     elements = image.partition_image( | 
					
						
							|  |  |  |         filename=filename, | 
					
						
							|  |  |  |         ocr_languages="eng+kor", | 
					
						
							|  |  |  |         strategy="ocr_only", | 
					
						
							|  |  |  |     ) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     assert elements[0] == Title("RULES AND INSTRUCTIONS") | 
					
						
							| 
									
										
										
										
											2023-05-15 13:23:19 -05:00
										 |  |  |     assert elements[3].text.replace(" ", "").startswith("안녕하세요") | 
					
						
							| 
									
										
										
										
											2023-05-04 16:23:51 -04:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def test_partition_image_with_ocr_detects_korean_from_file(): | 
					
						
							|  |  |  |     filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "english-and-korean.png") | 
					
						
							|  |  |  |     with open(filename, "rb") as f: | 
					
						
							|  |  |  |         elements = image.partition_image( | 
					
						
							|  |  |  |             file=f, | 
					
						
							|  |  |  |             ocr_languages="eng+kor", | 
					
						
							|  |  |  |             strategy="ocr_only", | 
					
						
							|  |  |  |         ) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     assert elements[0] == Title("RULES AND INSTRUCTIONS") | 
					
						
							| 
									
										
										
										
											2023-05-15 13:23:19 -05:00
										 |  |  |     assert elements[3].text.replace(" ", "").startswith("안녕하세요") | 
					
						
							| 
									
										
										
										
											2023-05-04 16:23:51 -04:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def test_partition_image_raises_with_bad_strategy(): | 
					
						
							|  |  |  |     filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "english-and-korean.png") | 
					
						
							|  |  |  |     with pytest.raises(ValueError): | 
					
						
							|  |  |  |         image.partition_image(filename=filename, strategy="fakeroo") |