From b0dbd71affb90de736ff1ad32dfef72165f53f96 Mon Sep 17 00:00:00 2001 From: jiajun-unstructured Date: Mon, 16 Jun 2025 16:29:35 -0700 Subject: [PATCH] Parallelize tests (#4024) --- Makefile | 24 ++++---- requirements/test.in | 1 + requirements/test.txt | 61 ++++++++++--------- test_unstructured/partition/conftest.py | 18 ++++++ .../partition/pdf_image/test_ocr.py | 26 +++++--- .../partition/pdf_image/test_pdf.py | 16 +++-- 6 files changed, 92 insertions(+), 54 deletions(-) create mode 100644 test_unstructured/partition/conftest.py diff --git a/Makefile b/Makefile index fe1350d5f..cff7fdfdc 100644 --- a/Makefile +++ b/Makefile @@ -142,7 +142,7 @@ export UNSTRUCTURED_INCLUDE_DEBUG_METADATA ?= false .PHONY: test test: PYTHONPATH=. CI=$(CI) \ - UNSTRUCTURED_INCLUDE_DEBUG_METADATA=$(UNSTRUCTURED_INCLUDE_DEBUG_METADATA) ${PYTHON} -m pytest test_${PACKAGE_NAME} --cov=${PACKAGE_NAME} --cov-report term-missing --durations=40 + UNSTRUCTURED_INCLUDE_DEBUG_METADATA=$(UNSTRUCTURED_INCLUDE_DEBUG_METADATA) ${PYTHON} -m pytest -n auto test_${PACKAGE_NAME} --cov=${PACKAGE_NAME} --cov-report term-missing --durations=40 .PHONY: test-unstructured-api-unit test-unstructured-api-unit: @@ -151,7 +151,7 @@ test-unstructured-api-unit: .PHONY: test-no-extras test-no-extras: PYTHONPATH=. CI=$(CI) \ - UNSTRUCTURED_INCLUDE_DEBUG_METADATA=$(UNSTRUCTURED_INCLUDE_DEBUG_METADATA) ${PYTHON} -m pytest \ + UNSTRUCTURED_INCLUDE_DEBUG_METADATA=$(UNSTRUCTURED_INCLUDE_DEBUG_METADATA) ${PYTHON} -m pytest -n auto \ test_${PACKAGE_NAME}/partition/test_text.py \ test_${PACKAGE_NAME}/partition/test_email.py \ test_${PACKAGE_NAME}/partition/html/test_partition.py \ @@ -159,52 +159,52 @@ test-no-extras: .PHONY: test-extra-csv test-extra-csv: - PYTHONPATH=. CI=$(CI) ${PYTHON} -m pytest \ + PYTHONPATH=. CI=$(CI) ${PYTHON} -m pytest -n auto \ test_unstructured/partition/test_csv.py \ test_unstructured/partition/test_tsv.py .PHONY: test-extra-docx test-extra-docx: - PYTHONPATH=. CI=$(CI) ${PYTHON} -m pytest \ + PYTHONPATH=. CI=$(CI) ${PYTHON} -m pytest -n auto \ test_unstructured/partition/test_doc.py \ test_unstructured/partition/test_docx.py .PHONY: test-extra-epub test-extra-epub: - PYTHONPATH=. CI=$(CI) ${PYTHON} -m pytest test_unstructured/partition/test_epub.py + PYTHONPATH=. CI=$(CI) ${PYTHON} -m pytest -n auto test_unstructured/partition/test_epub.py .PHONY: test-extra-markdown test-extra-markdown: - PYTHONPATH=. CI=$(CI) ${PYTHON} -m pytest test_unstructured/partition/test_md.py + PYTHONPATH=. CI=$(CI) ${PYTHON} -m pytest -n auto test_unstructured/partition/test_md.py .PHONY: test-extra-odt test-extra-odt: - PYTHONPATH=. CI=$(CI) ${PYTHON} -m pytest test_unstructured/partition/test_odt.py + PYTHONPATH=. CI=$(CI) ${PYTHON} -m pytest -n auto test_unstructured/partition/test_odt.py .PHONY: test-extra-pdf-image test-extra-pdf-image: - PYTHONPATH=. CI=$(CI) ${PYTHON} -m pytest test_unstructured/partition/pdf_image + PYTHONPATH=. CI=$(CI) ${PYTHON} -m pytest -n auto test_unstructured/partition/pdf_image .PHONY: test-extra-pptx test-extra-pptx: - PYTHONPATH=. CI=$(CI) ${PYTHON} -m pytest \ + PYTHONPATH=. CI=$(CI) ${PYTHON} -m pytest -n auto \ test_unstructured/partition/test_ppt.py \ test_unstructured/partition/test_pptx.py .PHONY: test-extra-pypandoc test-extra-pypandoc: - PYTHONPATH=. CI=$(CI) ${PYTHON} -m pytest \ + PYTHONPATH=. CI=$(CI) ${PYTHON} -m pytest -n auto \ test_unstructured/partition/test_org.py \ test_unstructured/partition/test_rst.py \ test_unstructured/partition/test_rtf.py .PHONY: test-extra-xlsx test-extra-xlsx: - PYTHONPATH=. CI=$(CI) ${PYTHON} -m pytest test_unstructured/partition/test_xlsx.py + PYTHONPATH=. CI=$(CI) ${PYTHON} -m pytest -n auto test_unstructured/partition/test_xlsx.py .PHONY: test-text-extraction-evaluate test-text-extraction-evaluate: - PYTHONPATH=. CI=$(CI) ${PYTHON} -m pytest test_unstructured/metrics/test_text_extraction.py + PYTHONPATH=. CI=$(CI) ${PYTHON} -m pytest -n auto test_unstructured/metrics/test_text_extraction.py ## check: runs linters (includes tests) .PHONY: check diff --git a/requirements/test.in b/requirements/test.in index e9b8fadbf..3bf9c5edf 100644 --- a/requirements/test.in +++ b/requirements/test.in @@ -10,6 +10,7 @@ mypy pydantic pytest-cov pytest-mock +pytest-xdist ruff types-Markdown types-requests diff --git a/requirements/test.txt b/requirements/test.txt index 2e38a48d8..6492b3dcd 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -2,54 +2,56 @@ # This file is autogenerated by pip-compile with Python 3.10 # by the following command: # -# pip-compile ./test.in +# pip-compile requirements/test.in # annotated-types==0.7.0 # via pydantic autoflake==2.3.1 - # via -r ./test.in + # via -r requirements/test.in black==25.1.0 - # via -r ./test.in + # via -r requirements/test.in click==8.2.1 # via - # -c ./base.txt + # -c requirements/base.txt # black coverage[toml]==7.9.0 # via - # -r ./test.in + # -r requirements/test.in # pytest-cov exceptiongroup==1.3.0 # via - # -c ./base.txt + # -c requirements/base.txt # pytest +execnet==2.1.1 + # via pytest-xdist flake8==7.2.0 # via - # -r ./test.in + # -r requirements/test.in # flake8-print flake8-print==5.0.0 - # via -r ./test.in + # via -r requirements/test.in freezegun==1.5.2 - # via -r ./test.in + # via -r requirements/test.in grpcio==1.73.0 # via - # -c ././deps/constraints.txt - # -r ./test.in + # -c requirements/./deps/constraints.txt + # -r requirements/test.in iniconfig==2.1.0 # via pytest liccheck==0.9.2 - # via -r ./test.in + # via -r requirements/test.in mccabe==0.7.0 # via flake8 mypy==1.16.0 - # via -r ./test.in + # via -r requirements/test.in mypy-extensions==1.1.0 # via - # -c ./base.txt + # -c requirements/base.txt # black # mypy packaging==25.0 # via - # -c ./base.txt + # -c requirements/base.txt # black # pytest pathspec==0.12.1 @@ -67,7 +69,7 @@ pycodestyle==2.13.0 # flake8 # flake8-print pydantic==2.11.5 - # via -r ./test.in + # via -r requirements/test.in pydantic-core==2.33.2 # via pydantic pyflakes==3.3.2 @@ -80,21 +82,24 @@ pytest==8.4.0 # via # pytest-cov # pytest-mock + # pytest-xdist pytest-cov==6.2.1 - # via -r ./test.in + # via -r requirements/test.in pytest-mock==3.14.1 - # via -r ./test.in + # via -r requirements/test.in +pytest-xdist==3.7.0 + # via -r requirements/test.in python-dateutil==2.9.0.post0 # via - # -c ./base.txt + # -c requirements/base.txt # freezegun ruff==0.11.13 - # via -r ./test.in + # via -r requirements/test.in semantic-version==2.10.0 # via liccheck six==1.17.0 # via - # -c ./base.txt + # -c requirements/base.txt # python-dateutil toml==0.10.2 # via liccheck @@ -106,16 +111,16 @@ tomli==2.2.1 # mypy # pytest types-click==7.1.8 - # via -r ./test.in + # via -r requirements/test.in types-markdown==3.8.0.20250415 - # via -r ./test.in + # via -r requirements/test.in types-requests==2.32.4.20250611 - # via -r ./test.in + # via -r requirements/test.in types-tabulate==0.9.0.20241207 - # via -r ./test.in + # via -r requirements/test.in typing-extensions==4.14.0 # via - # -c ./base.txt + # -c requirements/base.txt # black # exceptiongroup # mypy @@ -126,6 +131,6 @@ typing-inspection==0.4.1 # via pydantic urllib3==2.4.0 # via - # -c ././deps/constraints.txt - # -c ./base.txt + # -c requirements/./deps/constraints.txt + # -c requirements/base.txt # types-requests diff --git a/test_unstructured/partition/conftest.py b/test_unstructured/partition/conftest.py new file mode 100644 index 000000000..f512a294b --- /dev/null +++ b/test_unstructured/partition/conftest.py @@ -0,0 +1,18 @@ +import pytest + +from unstructured.partition.utils.constants import OCR_AGENT_PADDLE, OCR_AGENT_TESSERACT + + +@pytest.fixture +def mock_ocr_get_instance(mocker): + """Fixture that mocks OCRAgent.get_instance to prevent real OCR agent instantiation.""" + + def mock_get_instance(ocr_agent_module, language): + if ocr_agent_module in (OCR_AGENT_TESSERACT, OCR_AGENT_PADDLE): + return mocker.MagicMock() + else: + raise ValueError(f"Unknown OCR agent: {ocr_agent_module}") + + from unstructured.partition.pdf_image.ocr import OCRAgent + + return mocker.patch.object(OCRAgent, "get_instance", side_effect=mock_get_instance) diff --git a/test_unstructured/partition/pdf_image/test_ocr.py b/test_unstructured/partition/pdf_image/test_ocr.py index 3b8585626..88649bce0 100644 --- a/test_unstructured/partition/pdf_image/test_ocr.py +++ b/test_unstructured/partition/pdf_image/test_ocr.py @@ -622,11 +622,10 @@ def mock_page(mock_ocr_layout, mock_layout): return mock_page -def test_supplement_layout_with_ocr(mocker, mock_page): +def test_supplement_layout_with_ocr(mock_ocr_get_instance, mocker, mock_page): from unstructured.partition.pdf_image.ocr import OCRAgent mocker.patch.object(OCRAgent, "get_layout_from_image", return_value=mock_ocr_layout) - spy = mocker.spy(OCRAgent, "get_instance") ocr.supplement_page_layout_with_ocr( mock_page, @@ -637,16 +636,21 @@ def test_supplement_layout_with_ocr(mocker, mock_page): table_ocr_agent=OCR_AGENT_PADDLE, ) - assert spy.call_args_list[0][1] == {"language": "eng", "ocr_agent_module": OCR_AGENT_TESSERACT} - assert spy.call_args_list[1][1] == {"language": "en", "ocr_agent_module": OCR_AGENT_PADDLE} + assert mock_ocr_get_instance.call_args_list[0][1] == { + "language": "eng", + "ocr_agent_module": OCR_AGENT_TESSERACT, + } + assert mock_ocr_get_instance.call_args_list[1][1] == { + "language": "en", + "ocr_agent_module": OCR_AGENT_PADDLE, + } -def test_pass_down_agents(mocker, mock_page): +def test_pass_down_agents(mock_ocr_get_instance, mocker, mock_page): from unstructured.partition.pdf_image.ocr import OCRAgent, PILImage mocker.patch.object(OCRAgent, "get_layout_from_image", return_value=mock_ocr_layout) mocker.patch.object(PILImage, "open", return_value=Image.new("RGB", (100, 100))) - spy = mocker.spy(OCRAgent, "get_instance") doc = MagicMock(DocumentLayout) doc.pages = [mock_page] @@ -661,5 +665,11 @@ def test_pass_down_agents(mocker, mock_page): table_ocr_agent=OCR_AGENT_TESSERACT, ) - assert spy.call_args_list[0][1] == {"language": "en", "ocr_agent_module": OCR_AGENT_PADDLE} - assert spy.call_args_list[1][1] == {"language": "eng", "ocr_agent_module": OCR_AGENT_TESSERACT} + assert mock_ocr_get_instance.call_args_list[0][1] == { + "language": "en", + "ocr_agent_module": OCR_AGENT_PADDLE, + } + assert mock_ocr_get_instance.call_args_list[1][1] == { + "language": "eng", + "ocr_agent_module": OCR_AGENT_TESSERACT, + } diff --git a/test_unstructured/partition/pdf_image/test_pdf.py b/test_unstructured/partition/pdf_image/test_pdf.py index 70eec35fd..85b7d9428 100644 --- a/test_unstructured/partition/pdf_image/test_pdf.py +++ b/test_unstructured/partition/pdf_image/test_pdf.py @@ -1588,10 +1588,7 @@ def test_partition_pdf_with_password( _test(result) -def test_partition_pdf_with_specified_ocr_agents(mocker): - from unstructured.partition.pdf_image.ocr import OCRAgent - - spy = mocker.spy(OCRAgent, "get_instance") +def test_partition_pdf_with_specified_ocr_agents(mock_ocr_get_instance, mocker): pdf.partition_pdf( filename=example_doc_path("pdf/layout-parser-paper-with-table.pdf"), @@ -1601,8 +1598,15 @@ def test_partition_pdf_with_specified_ocr_agents(mocker): table_ocr_agent=OCR_AGENT_PADDLE, ) - assert spy.call_args_list[0][1] == {"language": "eng", "ocr_agent_module": OCR_AGENT_TESSERACT} - assert spy.call_args_list[1][1] == {"language": "en", "ocr_agent_module": OCR_AGENT_PADDLE} + # Verify get_instance was called with correct parameters + assert mock_ocr_get_instance.call_args_list[0][1] == { + "language": "eng", + "ocr_agent_module": OCR_AGENT_TESSERACT, + } + assert mock_ocr_get_instance.call_args_list[1][1] == { + "language": "en", + "ocr_agent_module": OCR_AGENT_PADDLE, + } def test_reproductible_pdf_loader():