Parallelize tests (#4024)

This commit is contained in:
jiajun-unstructured 2025-06-16 16:29:35 -07:00 committed by GitHub
parent 531490d013
commit b0dbd71aff
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 92 additions and 54 deletions

View File

@ -142,7 +142,7 @@ export UNSTRUCTURED_INCLUDE_DEBUG_METADATA ?= false
.PHONY: test .PHONY: test
test: test:
PYTHONPATH=. CI=$(CI) \ PYTHONPATH=. CI=$(CI) \
UNSTRUCTURED_INCLUDE_DEBUG_METADATA=$(UNSTRUCTURED_INCLUDE_DEBUG_METADATA) ${PYTHON} -m pytest test_${PACKAGE_NAME} --cov=${PACKAGE_NAME} --cov-report term-missing --durations=40 UNSTRUCTURED_INCLUDE_DEBUG_METADATA=$(UNSTRUCTURED_INCLUDE_DEBUG_METADATA) ${PYTHON} -m pytest -n auto test_${PACKAGE_NAME} --cov=${PACKAGE_NAME} --cov-report term-missing --durations=40
.PHONY: test-unstructured-api-unit .PHONY: test-unstructured-api-unit
test-unstructured-api-unit: test-unstructured-api-unit:
@ -151,7 +151,7 @@ test-unstructured-api-unit:
.PHONY: test-no-extras .PHONY: test-no-extras
test-no-extras: test-no-extras:
PYTHONPATH=. CI=$(CI) \ PYTHONPATH=. CI=$(CI) \
UNSTRUCTURED_INCLUDE_DEBUG_METADATA=$(UNSTRUCTURED_INCLUDE_DEBUG_METADATA) ${PYTHON} -m pytest \ UNSTRUCTURED_INCLUDE_DEBUG_METADATA=$(UNSTRUCTURED_INCLUDE_DEBUG_METADATA) ${PYTHON} -m pytest -n auto \
test_${PACKAGE_NAME}/partition/test_text.py \ test_${PACKAGE_NAME}/partition/test_text.py \
test_${PACKAGE_NAME}/partition/test_email.py \ test_${PACKAGE_NAME}/partition/test_email.py \
test_${PACKAGE_NAME}/partition/html/test_partition.py \ test_${PACKAGE_NAME}/partition/html/test_partition.py \
@ -159,52 +159,52 @@ test-no-extras:
.PHONY: test-extra-csv .PHONY: test-extra-csv
test-extra-csv: test-extra-csv:
PYTHONPATH=. CI=$(CI) ${PYTHON} -m pytest \ PYTHONPATH=. CI=$(CI) ${PYTHON} -m pytest -n auto \
test_unstructured/partition/test_csv.py \ test_unstructured/partition/test_csv.py \
test_unstructured/partition/test_tsv.py test_unstructured/partition/test_tsv.py
.PHONY: test-extra-docx .PHONY: test-extra-docx
test-extra-docx: test-extra-docx:
PYTHONPATH=. CI=$(CI) ${PYTHON} -m pytest \ PYTHONPATH=. CI=$(CI) ${PYTHON} -m pytest -n auto \
test_unstructured/partition/test_doc.py \ test_unstructured/partition/test_doc.py \
test_unstructured/partition/test_docx.py test_unstructured/partition/test_docx.py
.PHONY: test-extra-epub .PHONY: test-extra-epub
test-extra-epub: test-extra-epub:
PYTHONPATH=. CI=$(CI) ${PYTHON} -m pytest test_unstructured/partition/test_epub.py PYTHONPATH=. CI=$(CI) ${PYTHON} -m pytest -n auto test_unstructured/partition/test_epub.py
.PHONY: test-extra-markdown .PHONY: test-extra-markdown
test-extra-markdown: test-extra-markdown:
PYTHONPATH=. CI=$(CI) ${PYTHON} -m pytest test_unstructured/partition/test_md.py PYTHONPATH=. CI=$(CI) ${PYTHON} -m pytest -n auto test_unstructured/partition/test_md.py
.PHONY: test-extra-odt .PHONY: test-extra-odt
test-extra-odt: test-extra-odt:
PYTHONPATH=. CI=$(CI) ${PYTHON} -m pytest test_unstructured/partition/test_odt.py PYTHONPATH=. CI=$(CI) ${PYTHON} -m pytest -n auto test_unstructured/partition/test_odt.py
.PHONY: test-extra-pdf-image .PHONY: test-extra-pdf-image
test-extra-pdf-image: test-extra-pdf-image:
PYTHONPATH=. CI=$(CI) ${PYTHON} -m pytest test_unstructured/partition/pdf_image PYTHONPATH=. CI=$(CI) ${PYTHON} -m pytest -n auto test_unstructured/partition/pdf_image
.PHONY: test-extra-pptx .PHONY: test-extra-pptx
test-extra-pptx: test-extra-pptx:
PYTHONPATH=. CI=$(CI) ${PYTHON} -m pytest \ PYTHONPATH=. CI=$(CI) ${PYTHON} -m pytest -n auto \
test_unstructured/partition/test_ppt.py \ test_unstructured/partition/test_ppt.py \
test_unstructured/partition/test_pptx.py test_unstructured/partition/test_pptx.py
.PHONY: test-extra-pypandoc .PHONY: test-extra-pypandoc
test-extra-pypandoc: test-extra-pypandoc:
PYTHONPATH=. CI=$(CI) ${PYTHON} -m pytest \ PYTHONPATH=. CI=$(CI) ${PYTHON} -m pytest -n auto \
test_unstructured/partition/test_org.py \ test_unstructured/partition/test_org.py \
test_unstructured/partition/test_rst.py \ test_unstructured/partition/test_rst.py \
test_unstructured/partition/test_rtf.py test_unstructured/partition/test_rtf.py
.PHONY: test-extra-xlsx .PHONY: test-extra-xlsx
test-extra-xlsx: test-extra-xlsx:
PYTHONPATH=. CI=$(CI) ${PYTHON} -m pytest test_unstructured/partition/test_xlsx.py PYTHONPATH=. CI=$(CI) ${PYTHON} -m pytest -n auto test_unstructured/partition/test_xlsx.py
.PHONY: test-text-extraction-evaluate .PHONY: test-text-extraction-evaluate
test-text-extraction-evaluate: test-text-extraction-evaluate:
PYTHONPATH=. CI=$(CI) ${PYTHON} -m pytest test_unstructured/metrics/test_text_extraction.py PYTHONPATH=. CI=$(CI) ${PYTHON} -m pytest -n auto test_unstructured/metrics/test_text_extraction.py
## check: runs linters (includes tests) ## check: runs linters (includes tests)
.PHONY: check .PHONY: check

View File

@ -10,6 +10,7 @@ mypy
pydantic pydantic
pytest-cov pytest-cov
pytest-mock pytest-mock
pytest-xdist
ruff ruff
types-Markdown types-Markdown
types-requests types-requests

View File

@ -2,54 +2,56 @@
# This file is autogenerated by pip-compile with Python 3.10 # This file is autogenerated by pip-compile with Python 3.10
# by the following command: # by the following command:
# #
# pip-compile ./test.in # pip-compile requirements/test.in
# #
annotated-types==0.7.0 annotated-types==0.7.0
# via pydantic # via pydantic
autoflake==2.3.1 autoflake==2.3.1
# via -r ./test.in # via -r requirements/test.in
black==25.1.0 black==25.1.0
# via -r ./test.in # via -r requirements/test.in
click==8.2.1 click==8.2.1
# via # via
# -c ./base.txt # -c requirements/base.txt
# black # black
coverage[toml]==7.9.0 coverage[toml]==7.9.0
# via # via
# -r ./test.in # -r requirements/test.in
# pytest-cov # pytest-cov
exceptiongroup==1.3.0 exceptiongroup==1.3.0
# via # via
# -c ./base.txt # -c requirements/base.txt
# pytest # pytest
execnet==2.1.1
# via pytest-xdist
flake8==7.2.0 flake8==7.2.0
# via # via
# -r ./test.in # -r requirements/test.in
# flake8-print # flake8-print
flake8-print==5.0.0 flake8-print==5.0.0
# via -r ./test.in # via -r requirements/test.in
freezegun==1.5.2 freezegun==1.5.2
# via -r ./test.in # via -r requirements/test.in
grpcio==1.73.0 grpcio==1.73.0
# via # via
# -c ././deps/constraints.txt # -c requirements/./deps/constraints.txt
# -r ./test.in # -r requirements/test.in
iniconfig==2.1.0 iniconfig==2.1.0
# via pytest # via pytest
liccheck==0.9.2 liccheck==0.9.2
# via -r ./test.in # via -r requirements/test.in
mccabe==0.7.0 mccabe==0.7.0
# via flake8 # via flake8
mypy==1.16.0 mypy==1.16.0
# via -r ./test.in # via -r requirements/test.in
mypy-extensions==1.1.0 mypy-extensions==1.1.0
# via # via
# -c ./base.txt # -c requirements/base.txt
# black # black
# mypy # mypy
packaging==25.0 packaging==25.0
# via # via
# -c ./base.txt # -c requirements/base.txt
# black # black
# pytest # pytest
pathspec==0.12.1 pathspec==0.12.1
@ -67,7 +69,7 @@ pycodestyle==2.13.0
# flake8 # flake8
# flake8-print # flake8-print
pydantic==2.11.5 pydantic==2.11.5
# via -r ./test.in # via -r requirements/test.in
pydantic-core==2.33.2 pydantic-core==2.33.2
# via pydantic # via pydantic
pyflakes==3.3.2 pyflakes==3.3.2
@ -80,21 +82,24 @@ pytest==8.4.0
# via # via
# pytest-cov # pytest-cov
# pytest-mock # pytest-mock
# pytest-xdist
pytest-cov==6.2.1 pytest-cov==6.2.1
# via -r ./test.in # via -r requirements/test.in
pytest-mock==3.14.1 pytest-mock==3.14.1
# via -r ./test.in # via -r requirements/test.in
pytest-xdist==3.7.0
# via -r requirements/test.in
python-dateutil==2.9.0.post0 python-dateutil==2.9.0.post0
# via # via
# -c ./base.txt # -c requirements/base.txt
# freezegun # freezegun
ruff==0.11.13 ruff==0.11.13
# via -r ./test.in # via -r requirements/test.in
semantic-version==2.10.0 semantic-version==2.10.0
# via liccheck # via liccheck
six==1.17.0 six==1.17.0
# via # via
# -c ./base.txt # -c requirements/base.txt
# python-dateutil # python-dateutil
toml==0.10.2 toml==0.10.2
# via liccheck # via liccheck
@ -106,16 +111,16 @@ tomli==2.2.1
# mypy # mypy
# pytest # pytest
types-click==7.1.8 types-click==7.1.8
# via -r ./test.in # via -r requirements/test.in
types-markdown==3.8.0.20250415 types-markdown==3.8.0.20250415
# via -r ./test.in # via -r requirements/test.in
types-requests==2.32.4.20250611 types-requests==2.32.4.20250611
# via -r ./test.in # via -r requirements/test.in
types-tabulate==0.9.0.20241207 types-tabulate==0.9.0.20241207
# via -r ./test.in # via -r requirements/test.in
typing-extensions==4.14.0 typing-extensions==4.14.0
# via # via
# -c ./base.txt # -c requirements/base.txt
# black # black
# exceptiongroup # exceptiongroup
# mypy # mypy
@ -126,6 +131,6 @@ typing-inspection==0.4.1
# via pydantic # via pydantic
urllib3==2.4.0 urllib3==2.4.0
# via # via
# -c ././deps/constraints.txt # -c requirements/./deps/constraints.txt
# -c ./base.txt # -c requirements/base.txt
# types-requests # types-requests

View File

@ -0,0 +1,18 @@
import pytest
from unstructured.partition.utils.constants import OCR_AGENT_PADDLE, OCR_AGENT_TESSERACT
@pytest.fixture
def mock_ocr_get_instance(mocker):
    """Patch ``OCRAgent.get_instance`` so tests never build a real OCR agent.

    The replacement accepts the same ``ocr_agent_module`` and ``language``
    arguments. For the Tesseract and Paddle agent modules it hands back a fresh
    ``MagicMock``; any other module name raises ``ValueError``.

    Returns the mock installed by ``mocker.patch.object`` so tests can inspect
    how ``get_instance`` was called (e.g. via ``call_args_list``).
    """
    # Imported inside the fixture, as in the original, rather than at module import time.
    from unstructured.partition.pdf_image.ocr import OCRAgent

    known_agents = (OCR_AGENT_TESSERACT, OCR_AGENT_PADDLE)

    def _fake_get_instance(ocr_agent_module, language):
        # Guard clause: reject anything that is not one of the two known agents.
        if ocr_agent_module not in known_agents:
            raise ValueError(f"Unknown OCR agent: {ocr_agent_module}")
        return mocker.MagicMock()

    return mocker.patch.object(OCRAgent, "get_instance", side_effect=_fake_get_instance)

View File

@ -622,11 +622,10 @@ def mock_page(mock_ocr_layout, mock_layout):
return mock_page return mock_page
def test_supplement_layout_with_ocr(mocker, mock_page): def test_supplement_layout_with_ocr(mock_ocr_get_instance, mocker, mock_page):
from unstructured.partition.pdf_image.ocr import OCRAgent from unstructured.partition.pdf_image.ocr import OCRAgent
mocker.patch.object(OCRAgent, "get_layout_from_image", return_value=mock_ocr_layout) mocker.patch.object(OCRAgent, "get_layout_from_image", return_value=mock_ocr_layout)
spy = mocker.spy(OCRAgent, "get_instance")
ocr.supplement_page_layout_with_ocr( ocr.supplement_page_layout_with_ocr(
mock_page, mock_page,
@ -637,16 +636,21 @@ def test_supplement_layout_with_ocr(mocker, mock_page):
table_ocr_agent=OCR_AGENT_PADDLE, table_ocr_agent=OCR_AGENT_PADDLE,
) )
assert spy.call_args_list[0][1] == {"language": "eng", "ocr_agent_module": OCR_AGENT_TESSERACT} assert mock_ocr_get_instance.call_args_list[0][1] == {
assert spy.call_args_list[1][1] == {"language": "en", "ocr_agent_module": OCR_AGENT_PADDLE} "language": "eng",
"ocr_agent_module": OCR_AGENT_TESSERACT,
}
assert mock_ocr_get_instance.call_args_list[1][1] == {
"language": "en",
"ocr_agent_module": OCR_AGENT_PADDLE,
}
def test_pass_down_agents(mocker, mock_page): def test_pass_down_agents(mock_ocr_get_instance, mocker, mock_page):
from unstructured.partition.pdf_image.ocr import OCRAgent, PILImage from unstructured.partition.pdf_image.ocr import OCRAgent, PILImage
mocker.patch.object(OCRAgent, "get_layout_from_image", return_value=mock_ocr_layout) mocker.patch.object(OCRAgent, "get_layout_from_image", return_value=mock_ocr_layout)
mocker.patch.object(PILImage, "open", return_value=Image.new("RGB", (100, 100))) mocker.patch.object(PILImage, "open", return_value=Image.new("RGB", (100, 100)))
spy = mocker.spy(OCRAgent, "get_instance")
doc = MagicMock(DocumentLayout) doc = MagicMock(DocumentLayout)
doc.pages = [mock_page] doc.pages = [mock_page]
@ -661,5 +665,11 @@ def test_pass_down_agents(mocker, mock_page):
table_ocr_agent=OCR_AGENT_TESSERACT, table_ocr_agent=OCR_AGENT_TESSERACT,
) )
assert spy.call_args_list[0][1] == {"language": "en", "ocr_agent_module": OCR_AGENT_PADDLE} assert mock_ocr_get_instance.call_args_list[0][1] == {
assert spy.call_args_list[1][1] == {"language": "eng", "ocr_agent_module": OCR_AGENT_TESSERACT} "language": "en",
"ocr_agent_module": OCR_AGENT_PADDLE,
}
assert mock_ocr_get_instance.call_args_list[1][1] == {
"language": "eng",
"ocr_agent_module": OCR_AGENT_TESSERACT,
}

View File

@ -1588,10 +1588,7 @@ def test_partition_pdf_with_password(
_test(result) _test(result)
def test_partition_pdf_with_specified_ocr_agents(mocker): def test_partition_pdf_with_specified_ocr_agents(mock_ocr_get_instance, mocker):
from unstructured.partition.pdf_image.ocr import OCRAgent
spy = mocker.spy(OCRAgent, "get_instance")
pdf.partition_pdf( pdf.partition_pdf(
filename=example_doc_path("pdf/layout-parser-paper-with-table.pdf"), filename=example_doc_path("pdf/layout-parser-paper-with-table.pdf"),
@ -1601,8 +1598,15 @@ def test_partition_pdf_with_specified_ocr_agents(mocker):
table_ocr_agent=OCR_AGENT_PADDLE, table_ocr_agent=OCR_AGENT_PADDLE,
) )
assert spy.call_args_list[0][1] == {"language": "eng", "ocr_agent_module": OCR_AGENT_TESSERACT} # Verify get_instance was called with correct parameters
assert spy.call_args_list[1][1] == {"language": "en", "ocr_agent_module": OCR_AGENT_PADDLE} assert mock_ocr_get_instance.call_args_list[0][1] == {
"language": "eng",
"ocr_agent_module": OCR_AGENT_TESSERACT,
}
assert mock_ocr_get_instance.call_args_list[1][1] == {
"language": "en",
"ocr_agent_module": OCR_AGENT_PADDLE,
}
def test_reproductible_pdf_loader(): def test_reproductible_pdf_loader():