mirror of
				https://github.com/Unstructured-IO/unstructured.git
				synced 2025-10-31 01:54:25 +00:00 
			
		
		
		
	 5eb1466acc
			
		
	
	
		5eb1466acc
		
			
		
	
	
	
	
		
			
			* Apply import sorting ruff . --select I --fix * Remove unnecessary open mode parameter ruff . --select UP015 --fix * Use f-string formatting rather than .format * Remove extraneous parentheses Also use "" instead of str() * Resolve missing trailing commas ruff . --select COM --fix * Rewrite list() and dict() calls using literals ruff . --select C4 --fix * Add () to pytest.fixture, use tuples for parametrize, etc. ruff . --select PT --fix * Simplify code: merge conditionals, context managers ruff . --select SIM --fix * Import without unnecessary alias ruff . --select PLR0402 --fix * Apply formatting via black * Rewrite ValueError somewhat Slightly unrelated to the rest of the PR * Apply formatting to tests via black * Update expected exception message to match 0d81564 * Satisfy E501 line too long in test * Update changelog & version * Add ruff to make tidy and test deps * Run 'make tidy' * Update changelog & version * Update changelog & version * Add ruff to 'check' target Doing so required me to also fix some non-auto-fixable issues. Two of them I fixed with a noqa: SIM115, but especially the one in __init__ may need some attention. That said, that refactor is out of scope of this PR.
		
			
				
	
	
		
			193 lines
		
	
	
		
			6.2 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			193 lines
		
	
	
		
			6.2 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| from unittest import mock
 | |
| 
 | |
| import pytest
 | |
| import requests
 | |
| from unstructured_inference.inference import layout
 | |
| 
 | |
| from unstructured.documents.elements import PageBreak
 | |
| from unstructured.partition import pdf
 | |
| 
 | |
| 
 | |
class MockResponse:
    """Minimal stand-in for the response object returned by ``requests`` calls.

    Holds a status code and a payload; ``json()`` hands the payload back unchanged.
    """

    def __init__(self, status_code, response):
        self.response = response
        self.status_code = status_code

    def json(self):
        """Return the payload supplied at construction time."""
        payload = self.response
        return payload
 | |
| 
 | |
| 
 | |
def mock_healthy_get(url, **kwargs):
    """Replacement for ``requests.get`` that always yields status 200 and an empty payload."""
    healthy = MockResponse(status_code=200, response={})
    return healthy
 | |
| 
 | |
| 
 | |
def mock_unhealthy_get(url, **kwargs):
    """Replacement for ``requests.get`` that always yields status 500 and an empty payload."""
    unhealthy = MockResponse(status_code=500, response={})
    return unhealthy
 | |
| 
 | |
| 
 | |
def mock_unsuccessful_post(url, **kwargs):
    """Replacement for ``requests.post`` that always yields status 500 and an empty payload."""
    failure = MockResponse(status_code=500, response={})
    return failure
 | |
| 
 | |
| 
 | |
def mock_successful_post(url, **kwargs):
    """Replacement for ``requests.post`` that yields status 200 and a two-page payload.

    Each page carries its zero-based ``number`` and a single ``Title`` element.
    """
    titles = ("Charlie Brown and the Great Pumpkin", "A Charlie Brown Christmas")
    pages = [
        {"number": number, "elements": [{"type": "Title", "text": title}]}
        for number, title in enumerate(titles)
    ]
    return MockResponse(status_code=200, response={"pages": pages})
 | |
| 
 | |
| 
 | |
class MockPageLayout(layout.PageLayout):
    """Page layout double whose ``elements`` is always one fixed ``Title`` element."""

    def __init__(self, number: int):
        """Accept ``number`` and do nothing; the parent initializer is not invoked."""

    @property
    def elements(self):
        """Return a one-item list holding the canned title element."""
        title = layout.LayoutElement(
            type="Title",
            coordinates=[(0, 0), (2, 2)],
            text="Charlie Brown and the Great Pumpkin",
        )
        return [title]
 | |
| 
 | |
| 
 | |
class MockDocumentLayout(layout.DocumentLayout):
    """Document layout double whose ``pages`` is always a single ``MockPageLayout``."""

    @property
    def pages(self):
        """Return a one-item list holding a freshly built mock page numbered 0."""
        only_page = MockPageLayout(number=0)
        return [only_page]
 | |
| 
 | |
| 
 | |
def test_partition_pdf_api(monkeypatch, filename="example-docs/layout-parser-paper-fast.pdf"):
    """With healthy GET and successful POST fakes, the API path returns both titles in order."""
    monkeypatch.setattr(requests, "post", mock_successful_post)
    monkeypatch.setattr(requests, "get", mock_healthy_get)

    elements = pdf._partition_via_api(filename)

    expected_titles = (
        "Charlie Brown and the Great Pumpkin",
        "A Charlie Brown Christmas",
    )
    for index, title in enumerate(expected_titles):
        assert elements[index]["type"] == "Title"
        assert elements[index]["text"] == title
 | |
| 
 | |
| 
 | |
def test_partition_pdf_api_page_breaks(
    monkeypatch,
    filename="example-docs/layout-parser-paper-fast.pdf",
):
    """With ``include_page_breaks=True`` a ``PageBreak`` entry sits between the two titles."""
    for verb, fake in (("post", mock_successful_post), ("get", mock_healthy_get)):
        monkeypatch.setattr(requests, verb, fake)

    elements = pdf._partition_via_api(filename, include_page_breaks=True)

    # First page's title.
    assert elements[0]["type"] == "Title"
    assert elements[0]["text"] == "Charlie Brown and the Great Pumpkin"
    # Separator between the pages.
    assert elements[1]["type"] == "PageBreak"
    # Second page's title.
    assert elements[2]["type"] == "Title"
    assert elements[2]["text"] == "A Charlie Brown Christmas"
 | |
| 
 | |
| 
 | |
@pytest.mark.parametrize(
    ("filename", "file"),
    [("example-docs/layout-parser-paper-fast.pdf", None), (None, b"0000")],
)
def test_partition_pdf_local(monkeypatch, filename, file):
    """Local partitioning surfaces the mocked title for both filename and file inputs."""

    def fake_layout(*args, **kwargs):
        # Ignore whatever the partitioner passes and hand back the canned document.
        return MockDocumentLayout()

    for hook in ("process_data_with_model", "process_file_with_model"):
        monkeypatch.setattr(layout, hook, fake_layout)

    elements = pdf._partition_pdf_or_image_local(filename, file)
    assert elements[0].type == "Title"
    assert elements[0].text == "Charlie Brown and the Great Pumpkin"
 | |
| 
 | |
| 
 | |
def test_partition_pdf_api_raises_with_no_filename(monkeypatch):
    """The API path raises ``FileNotFoundError`` when neither filename nor file is given."""
    for verb, fake in (("post", mock_successful_post), ("get", mock_healthy_get)):
        monkeypatch.setattr(requests, verb, fake)

    nothing_supplied = {"filename": None, "file": None}
    with pytest.raises(FileNotFoundError):
        pdf._partition_via_api(**nothing_supplied)
 | |
| 
 | |
| 
 | |
def test_partition_pdf_local_raises_with_no_filename():
    """The local path raises ``FileNotFoundError`` for an empty filename and no file."""
    expect_missing_file = pytest.raises(FileNotFoundError)
    with expect_missing_file:
        pdf._partition_pdf_or_image_local(filename="", file=None, is_image=False)
 | |
| 
 | |
| 
 | |
def test_partition_pdf_api_raises_with_failed_healthcheck(
    monkeypatch,
    filename="example-docs/layout-parser-paper-fast.pdf",
):
    """A 500 from the GET fake makes the API path raise ``ValueError``."""
    for verb, fake in (("post", mock_successful_post), ("get", mock_unhealthy_get)):
        monkeypatch.setattr(requests, verb, fake)

    expect_value_error = pytest.raises(ValueError)
    with expect_value_error:
        pdf._partition_via_api(filename=filename)
 | |
| 
 | |
| 
 | |
def test_partition_pdf_api_raises_with_failed_api_call(
    monkeypatch,
    filename="example-docs/layout-parser-paper-fast.pdf",
):
    """A 500 from the POST fake makes the API path raise ``ValueError``."""
    for verb, fake in (("post", mock_unsuccessful_post), ("get", mock_healthy_get)):
        monkeypatch.setattr(requests, verb, fake)

    expect_value_error = pytest.raises(ValueError)
    with expect_value_error:
        pdf._partition_via_api(filename=filename)
 | |
| 
 | |
| 
 | |
@pytest.mark.parametrize(
    ("url", "api_called", "local_called"),
    [("fakeurl", True, False), (None, False, True)],
)
def test_partition_pdf(url, api_called, local_called):
    """``partition_pdf`` calls the API helper when a URL is given, else the local helper."""
    api_patch = mock.patch.object(pdf, attribute="_partition_via_api", new=mock.MagicMock())
    local_patch = mock.patch.object(pdf, "_partition_pdf_or_image_local", mock.MagicMock())

    # Entering each patch yields the MagicMock that was installed on the module.
    with api_patch as fake_api, local_patch as fake_local:
        pdf.partition_pdf(filename="fake.pdf", url=url)
        assert fake_api.called == api_called
        assert fake_local.called == local_called
 | |
| 
 | |
| 
 | |
@pytest.mark.parametrize(
    ("url", "api_called", "local_called"),
    [("fakeurl", True, False), (None, False, True)],
)
def test_partition_pdf_with_template(url, api_called, local_called):
    """Passing ``template="checkbox"`` leaves the API-versus-local dispatch unchanged."""
    api_patch = mock.patch.object(pdf, attribute="_partition_via_api", new=mock.MagicMock())
    local_patch = mock.patch.object(pdf, "_partition_pdf_or_image_local", mock.MagicMock())

    # Entering each patch yields the MagicMock that was installed on the module.
    with api_patch as fake_api, local_patch as fake_local:
        pdf.partition_pdf(filename="fake.pdf", url=url, template="checkbox")
        assert fake_api.called == api_called
        assert fake_local.called == local_called
 | |
| 
 | |
| 
 | |
def test_partition_pdf_with_page_breaks(filename="example-docs/layout-parser-paper-fast.pdf"):
    """Partitioning the sample PDF with ``include_page_breaks=True`` yields a ``PageBreak``."""
    partitioned = pdf.partition_pdf(filename=filename, url=None, include_page_breaks=True)
    page_break = PageBreak()
    assert page_break in partitioned
 | |
| 
 | |
| 
 | |
def test_partition_pdf_with_no_page_breaks(filename="example-docs/layout-parser-paper-fast.pdf"):
    """Partitioning the sample PDF without ``include_page_breaks`` yields no ``PageBreak``."""
    partitioned = pdf.partition_pdf(filename=filename, url=None)
    page_break = PageBreak()
    assert page_break not in partitioned
 |