Parallelize tests (#4024)

This commit is contained in:
jiajun-unstructured 2025-06-16 16:29:35 -07:00 committed by GitHub
parent 531490d013
commit b0dbd71aff
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 92 additions and 54 deletions

View File

@ -142,7 +142,7 @@ export UNSTRUCTURED_INCLUDE_DEBUG_METADATA ?= false
.PHONY: test
test:
PYTHONPATH=. CI=$(CI) \
UNSTRUCTURED_INCLUDE_DEBUG_METADATA=$(UNSTRUCTURED_INCLUDE_DEBUG_METADATA) ${PYTHON} -m pytest test_${PACKAGE_NAME} --cov=${PACKAGE_NAME} --cov-report term-missing --durations=40
UNSTRUCTURED_INCLUDE_DEBUG_METADATA=$(UNSTRUCTURED_INCLUDE_DEBUG_METADATA) ${PYTHON} -m pytest -n auto test_${PACKAGE_NAME} --cov=${PACKAGE_NAME} --cov-report term-missing --durations=40
.PHONY: test-unstructured-api-unit
test-unstructured-api-unit:
@ -151,7 +151,7 @@ test-unstructured-api-unit:
.PHONY: test-no-extras
test-no-extras:
PYTHONPATH=. CI=$(CI) \
UNSTRUCTURED_INCLUDE_DEBUG_METADATA=$(UNSTRUCTURED_INCLUDE_DEBUG_METADATA) ${PYTHON} -m pytest \
UNSTRUCTURED_INCLUDE_DEBUG_METADATA=$(UNSTRUCTURED_INCLUDE_DEBUG_METADATA) ${PYTHON} -m pytest -n auto \
test_${PACKAGE_NAME}/partition/test_text.py \
test_${PACKAGE_NAME}/partition/test_email.py \
test_${PACKAGE_NAME}/partition/html/test_partition.py \
@ -159,52 +159,52 @@ test-no-extras:
.PHONY: test-extra-csv
test-extra-csv:
PYTHONPATH=. CI=$(CI) ${PYTHON} -m pytest \
PYTHONPATH=. CI=$(CI) ${PYTHON} -m pytest -n auto \
test_unstructured/partition/test_csv.py \
test_unstructured/partition/test_tsv.py
.PHONY: test-extra-docx
test-extra-docx:
PYTHONPATH=. CI=$(CI) ${PYTHON} -m pytest \
PYTHONPATH=. CI=$(CI) ${PYTHON} -m pytest -n auto \
test_unstructured/partition/test_doc.py \
test_unstructured/partition/test_docx.py
.PHONY: test-extra-epub
test-extra-epub:
PYTHONPATH=. CI=$(CI) ${PYTHON} -m pytest test_unstructured/partition/test_epub.py
PYTHONPATH=. CI=$(CI) ${PYTHON} -m pytest -n auto test_unstructured/partition/test_epub.py
.PHONY: test-extra-markdown
test-extra-markdown:
PYTHONPATH=. CI=$(CI) ${PYTHON} -m pytest test_unstructured/partition/test_md.py
PYTHONPATH=. CI=$(CI) ${PYTHON} -m pytest -n auto test_unstructured/partition/test_md.py
.PHONY: test-extra-odt
test-extra-odt:
PYTHONPATH=. CI=$(CI) ${PYTHON} -m pytest test_unstructured/partition/test_odt.py
PYTHONPATH=. CI=$(CI) ${PYTHON} -m pytest -n auto test_unstructured/partition/test_odt.py
.PHONY: test-extra-pdf-image
test-extra-pdf-image:
PYTHONPATH=. CI=$(CI) ${PYTHON} -m pytest test_unstructured/partition/pdf_image
PYTHONPATH=. CI=$(CI) ${PYTHON} -m pytest -n auto test_unstructured/partition/pdf_image
.PHONY: test-extra-pptx
test-extra-pptx:
PYTHONPATH=. CI=$(CI) ${PYTHON} -m pytest \
PYTHONPATH=. CI=$(CI) ${PYTHON} -m pytest -n auto \
test_unstructured/partition/test_ppt.py \
test_unstructured/partition/test_pptx.py
.PHONY: test-extra-pypandoc
test-extra-pypandoc:
PYTHONPATH=. CI=$(CI) ${PYTHON} -m pytest \
PYTHONPATH=. CI=$(CI) ${PYTHON} -m pytest -n auto \
test_unstructured/partition/test_org.py \
test_unstructured/partition/test_rst.py \
test_unstructured/partition/test_rtf.py
.PHONY: test-extra-xlsx
test-extra-xlsx:
PYTHONPATH=. CI=$(CI) ${PYTHON} -m pytest test_unstructured/partition/test_xlsx.py
PYTHONPATH=. CI=$(CI) ${PYTHON} -m pytest -n auto test_unstructured/partition/test_xlsx.py
.PHONY: test-text-extraction-evaluate
test-text-extraction-evaluate:
PYTHONPATH=. CI=$(CI) ${PYTHON} -m pytest test_unstructured/metrics/test_text_extraction.py
PYTHONPATH=. CI=$(CI) ${PYTHON} -m pytest -n auto test_unstructured/metrics/test_text_extraction.py
## check: runs linters (includes tests)
.PHONY: check

View File

@ -10,6 +10,7 @@ mypy
pydantic
pytest-cov
pytest-mock
pytest-xdist
ruff
types-Markdown
types-requests

View File

@ -2,54 +2,56 @@
# This file is autogenerated by pip-compile with Python 3.10
# by the following command:
#
# pip-compile ./test.in
# pip-compile requirements/test.in
#
annotated-types==0.7.0
# via pydantic
autoflake==2.3.1
# via -r ./test.in
# via -r requirements/test.in
black==25.1.0
# via -r ./test.in
# via -r requirements/test.in
click==8.2.1
# via
# -c ./base.txt
# -c requirements/base.txt
# black
coverage[toml]==7.9.0
# via
# -r ./test.in
# -r requirements/test.in
# pytest-cov
exceptiongroup==1.3.0
# via
# -c ./base.txt
# -c requirements/base.txt
# pytest
execnet==2.1.1
# via pytest-xdist
flake8==7.2.0
# via
# -r ./test.in
# -r requirements/test.in
# flake8-print
flake8-print==5.0.0
# via -r ./test.in
# via -r requirements/test.in
freezegun==1.5.2
# via -r ./test.in
# via -r requirements/test.in
grpcio==1.73.0
# via
# -c ././deps/constraints.txt
# -r ./test.in
# -c requirements/./deps/constraints.txt
# -r requirements/test.in
iniconfig==2.1.0
# via pytest
liccheck==0.9.2
# via -r ./test.in
# via -r requirements/test.in
mccabe==0.7.0
# via flake8
mypy==1.16.0
# via -r ./test.in
# via -r requirements/test.in
mypy-extensions==1.1.0
# via
# -c ./base.txt
# -c requirements/base.txt
# black
# mypy
packaging==25.0
# via
# -c ./base.txt
# -c requirements/base.txt
# black
# pytest
pathspec==0.12.1
@ -67,7 +69,7 @@ pycodestyle==2.13.0
# flake8
# flake8-print
pydantic==2.11.5
# via -r ./test.in
# via -r requirements/test.in
pydantic-core==2.33.2
# via pydantic
pyflakes==3.3.2
@ -80,21 +82,24 @@ pytest==8.4.0
# via
# pytest-cov
# pytest-mock
# pytest-xdist
pytest-cov==6.2.1
# via -r ./test.in
# via -r requirements/test.in
pytest-mock==3.14.1
# via -r ./test.in
# via -r requirements/test.in
pytest-xdist==3.7.0
# via -r requirements/test.in
python-dateutil==2.9.0.post0
# via
# -c ./base.txt
# -c requirements/base.txt
# freezegun
ruff==0.11.13
# via -r ./test.in
# via -r requirements/test.in
semantic-version==2.10.0
# via liccheck
six==1.17.0
# via
# -c ./base.txt
# -c requirements/base.txt
# python-dateutil
toml==0.10.2
# via liccheck
@ -106,16 +111,16 @@ tomli==2.2.1
# mypy
# pytest
types-click==7.1.8
# via -r ./test.in
# via -r requirements/test.in
types-markdown==3.8.0.20250415
# via -r ./test.in
# via -r requirements/test.in
types-requests==2.32.4.20250611
# via -r ./test.in
# via -r requirements/test.in
types-tabulate==0.9.0.20241207
# via -r ./test.in
# via -r requirements/test.in
typing-extensions==4.14.0
# via
# -c ./base.txt
# -c requirements/base.txt
# black
# exceptiongroup
# mypy
@ -126,6 +131,6 @@ typing-inspection==0.4.1
# via pydantic
urllib3==2.4.0
# via
# -c ././deps/constraints.txt
# -c ./base.txt
# -c requirements/./deps/constraints.txt
# -c requirements/base.txt
# types-requests

View File

@ -0,0 +1,18 @@
import pytest
from unstructured.partition.utils.constants import OCR_AGENT_PADDLE, OCR_AGENT_TESSERACT
@pytest.fixture
def mock_ocr_get_instance(mocker):
    """Replace ``OCRAgent.get_instance`` with a recording stub for the duration of a test.

    The stub hands back a fresh ``MagicMock`` when asked for the Tesseract or Paddle agent
    module and raises ``ValueError`` for any other module name. The patch object is returned
    so tests can inspect ``call_args_list``.
    """
    from unstructured.partition.pdf_image.ocr import OCRAgent

    def _fake_get_instance(ocr_agent_module, language):
        # Guard first: only the two known agent modules are accepted.
        if ocr_agent_module not in (OCR_AGENT_TESSERACT, OCR_AGENT_PADDLE):
            raise ValueError(f"Unknown OCR agent: {ocr_agent_module}")
        return mocker.MagicMock()

    return mocker.patch.object(OCRAgent, "get_instance", side_effect=_fake_get_instance)

View File

@ -622,11 +622,10 @@ def mock_page(mock_ocr_layout, mock_layout):
return mock_page
def test_supplement_layout_with_ocr(mocker, mock_page):
def test_supplement_layout_with_ocr(mock_ocr_get_instance, mocker, mock_page):
from unstructured.partition.pdf_image.ocr import OCRAgent
mocker.patch.object(OCRAgent, "get_layout_from_image", return_value=mock_ocr_layout)
spy = mocker.spy(OCRAgent, "get_instance")
ocr.supplement_page_layout_with_ocr(
mock_page,
@ -637,16 +636,21 @@ def test_supplement_layout_with_ocr(mocker, mock_page):
table_ocr_agent=OCR_AGENT_PADDLE,
)
assert spy.call_args_list[0][1] == {"language": "eng", "ocr_agent_module": OCR_AGENT_TESSERACT}
assert spy.call_args_list[1][1] == {"language": "en", "ocr_agent_module": OCR_AGENT_PADDLE}
assert mock_ocr_get_instance.call_args_list[0][1] == {
"language": "eng",
"ocr_agent_module": OCR_AGENT_TESSERACT,
}
assert mock_ocr_get_instance.call_args_list[1][1] == {
"language": "en",
"ocr_agent_module": OCR_AGENT_PADDLE,
}
def test_pass_down_agents(mocker, mock_page):
def test_pass_down_agents(mock_ocr_get_instance, mocker, mock_page):
from unstructured.partition.pdf_image.ocr import OCRAgent, PILImage
mocker.patch.object(OCRAgent, "get_layout_from_image", return_value=mock_ocr_layout)
mocker.patch.object(PILImage, "open", return_value=Image.new("RGB", (100, 100)))
spy = mocker.spy(OCRAgent, "get_instance")
doc = MagicMock(DocumentLayout)
doc.pages = [mock_page]
@ -661,5 +665,11 @@ def test_pass_down_agents(mocker, mock_page):
table_ocr_agent=OCR_AGENT_TESSERACT,
)
assert spy.call_args_list[0][1] == {"language": "en", "ocr_agent_module": OCR_AGENT_PADDLE}
assert spy.call_args_list[1][1] == {"language": "eng", "ocr_agent_module": OCR_AGENT_TESSERACT}
assert mock_ocr_get_instance.call_args_list[0][1] == {
"language": "en",
"ocr_agent_module": OCR_AGENT_PADDLE,
}
assert mock_ocr_get_instance.call_args_list[1][1] == {
"language": "eng",
"ocr_agent_module": OCR_AGENT_TESSERACT,
}

View File

@ -1588,10 +1588,7 @@ def test_partition_pdf_with_password(
_test(result)
def test_partition_pdf_with_specified_ocr_agents(mocker):
from unstructured.partition.pdf_image.ocr import OCRAgent
spy = mocker.spy(OCRAgent, "get_instance")
def test_partition_pdf_with_specified_ocr_agents(mock_ocr_get_instance, mocker):
pdf.partition_pdf(
filename=example_doc_path("pdf/layout-parser-paper-with-table.pdf"),
@ -1601,8 +1598,15 @@ def test_partition_pdf_with_specified_ocr_agents(mocker):
table_ocr_agent=OCR_AGENT_PADDLE,
)
assert spy.call_args_list[0][1] == {"language": "eng", "ocr_agent_module": OCR_AGENT_TESSERACT}
assert spy.call_args_list[1][1] == {"language": "en", "ocr_agent_module": OCR_AGENT_PADDLE}
# Verify get_instance was called with correct parameters
assert mock_ocr_get_instance.call_args_list[0][1] == {
"language": "eng",
"ocr_agent_module": OCR_AGENT_TESSERACT,
}
assert mock_ocr_get_instance.call_args_list[1][1] == {
"language": "en",
"ocr_agent_module": OCR_AGENT_PADDLE,
}
def test_reproductible_pdf_loader():