mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-06-27 02:30:08 +00:00
Parallelize tests (#4024)
This commit is contained in:
parent
531490d013
commit
b0dbd71aff
24
Makefile
24
Makefile
@ -142,7 +142,7 @@ export UNSTRUCTURED_INCLUDE_DEBUG_METADATA ?= false
|
||||
.PHONY: test
|
||||
test:
|
||||
PYTHONPATH=. CI=$(CI) \
|
||||
UNSTRUCTURED_INCLUDE_DEBUG_METADATA=$(UNSTRUCTURED_INCLUDE_DEBUG_METADATA) ${PYTHON} -m pytest test_${PACKAGE_NAME} --cov=${PACKAGE_NAME} --cov-report term-missing --durations=40
|
||||
UNSTRUCTURED_INCLUDE_DEBUG_METADATA=$(UNSTRUCTURED_INCLUDE_DEBUG_METADATA) ${PYTHON} -m pytest -n auto test_${PACKAGE_NAME} --cov=${PACKAGE_NAME} --cov-report term-missing --durations=40
|
||||
|
||||
.PHONY: test-unstructured-api-unit
|
||||
test-unstructured-api-unit:
|
||||
@ -151,7 +151,7 @@ test-unstructured-api-unit:
|
||||
.PHONY: test-no-extras
|
||||
test-no-extras:
|
||||
PYTHONPATH=. CI=$(CI) \
|
||||
UNSTRUCTURED_INCLUDE_DEBUG_METADATA=$(UNSTRUCTURED_INCLUDE_DEBUG_METADATA) ${PYTHON} -m pytest \
|
||||
UNSTRUCTURED_INCLUDE_DEBUG_METADATA=$(UNSTRUCTURED_INCLUDE_DEBUG_METADATA) ${PYTHON} -m pytest -n auto \
|
||||
test_${PACKAGE_NAME}/partition/test_text.py \
|
||||
test_${PACKAGE_NAME}/partition/test_email.py \
|
||||
test_${PACKAGE_NAME}/partition/html/test_partition.py \
|
||||
@ -159,52 +159,52 @@ test-no-extras:
|
||||
|
||||
.PHONY: test-extra-csv
|
||||
test-extra-csv:
|
||||
PYTHONPATH=. CI=$(CI) ${PYTHON} -m pytest \
|
||||
PYTHONPATH=. CI=$(CI) ${PYTHON} -m pytest -n auto \
|
||||
test_unstructured/partition/test_csv.py \
|
||||
test_unstructured/partition/test_tsv.py
|
||||
|
||||
.PHONY: test-extra-docx
|
||||
test-extra-docx:
|
||||
PYTHONPATH=. CI=$(CI) ${PYTHON} -m pytest \
|
||||
PYTHONPATH=. CI=$(CI) ${PYTHON} -m pytest -n auto \
|
||||
test_unstructured/partition/test_doc.py \
|
||||
test_unstructured/partition/test_docx.py
|
||||
|
||||
.PHONY: test-extra-epub
|
||||
test-extra-epub:
|
||||
PYTHONPATH=. CI=$(CI) ${PYTHON} -m pytest test_unstructured/partition/test_epub.py
|
||||
PYTHONPATH=. CI=$(CI) ${PYTHON} -m pytest -n auto test_unstructured/partition/test_epub.py
|
||||
|
||||
.PHONY: test-extra-markdown
|
||||
test-extra-markdown:
|
||||
PYTHONPATH=. CI=$(CI) ${PYTHON} -m pytest test_unstructured/partition/test_md.py
|
||||
PYTHONPATH=. CI=$(CI) ${PYTHON} -m pytest -n auto test_unstructured/partition/test_md.py
|
||||
|
||||
.PHONY: test-extra-odt
|
||||
test-extra-odt:
|
||||
PYTHONPATH=. CI=$(CI) ${PYTHON} -m pytest test_unstructured/partition/test_odt.py
|
||||
PYTHONPATH=. CI=$(CI) ${PYTHON} -m pytest -n auto test_unstructured/partition/test_odt.py
|
||||
|
||||
.PHONY: test-extra-pdf-image
|
||||
test-extra-pdf-image:
|
||||
PYTHONPATH=. CI=$(CI) ${PYTHON} -m pytest test_unstructured/partition/pdf_image
|
||||
PYTHONPATH=. CI=$(CI) ${PYTHON} -m pytest -n auto test_unstructured/partition/pdf_image
|
||||
|
||||
.PHONY: test-extra-pptx
|
||||
test-extra-pptx:
|
||||
PYTHONPATH=. CI=$(CI) ${PYTHON} -m pytest \
|
||||
PYTHONPATH=. CI=$(CI) ${PYTHON} -m pytest -n auto \
|
||||
test_unstructured/partition/test_ppt.py \
|
||||
test_unstructured/partition/test_pptx.py
|
||||
|
||||
.PHONY: test-extra-pypandoc
|
||||
test-extra-pypandoc:
|
||||
PYTHONPATH=. CI=$(CI) ${PYTHON} -m pytest \
|
||||
PYTHONPATH=. CI=$(CI) ${PYTHON} -m pytest -n auto \
|
||||
test_unstructured/partition/test_org.py \
|
||||
test_unstructured/partition/test_rst.py \
|
||||
test_unstructured/partition/test_rtf.py
|
||||
|
||||
.PHONY: test-extra-xlsx
|
||||
test-extra-xlsx:
|
||||
PYTHONPATH=. CI=$(CI) ${PYTHON} -m pytest test_unstructured/partition/test_xlsx.py
|
||||
PYTHONPATH=. CI=$(CI) ${PYTHON} -m pytest -n auto test_unstructured/partition/test_xlsx.py
|
||||
|
||||
.PHONY: test-text-extraction-evaluate
|
||||
test-text-extraction-evaluate:
|
||||
PYTHONPATH=. CI=$(CI) ${PYTHON} -m pytest test_unstructured/metrics/test_text_extraction.py
|
||||
PYTHONPATH=. CI=$(CI) ${PYTHON} -m pytest -n auto test_unstructured/metrics/test_text_extraction.py
|
||||
|
||||
## check: runs linters (includes tests)
|
||||
.PHONY: check
|
||||
|
@ -10,6 +10,7 @@ mypy
|
||||
pydantic
|
||||
pytest-cov
|
||||
pytest-mock
|
||||
pytest-xdist
|
||||
ruff
|
||||
types-Markdown
|
||||
types-requests
|
||||
|
@ -2,54 +2,56 @@
|
||||
# This file is autogenerated by pip-compile with Python 3.10
|
||||
# by the following command:
|
||||
#
|
||||
# pip-compile ./test.in
|
||||
# pip-compile requirements/test.in
|
||||
#
|
||||
annotated-types==0.7.0
|
||||
# via pydantic
|
||||
autoflake==2.3.1
|
||||
# via -r ./test.in
|
||||
# via -r requirements/test.in
|
||||
black==25.1.0
|
||||
# via -r ./test.in
|
||||
# via -r requirements/test.in
|
||||
click==8.2.1
|
||||
# via
|
||||
# -c ./base.txt
|
||||
# -c requirements/base.txt
|
||||
# black
|
||||
coverage[toml]==7.9.0
|
||||
# via
|
||||
# -r ./test.in
|
||||
# -r requirements/test.in
|
||||
# pytest-cov
|
||||
exceptiongroup==1.3.0
|
||||
# via
|
||||
# -c ./base.txt
|
||||
# -c requirements/base.txt
|
||||
# pytest
|
||||
execnet==2.1.1
|
||||
# via pytest-xdist
|
||||
flake8==7.2.0
|
||||
# via
|
||||
# -r ./test.in
|
||||
# -r requirements/test.in
|
||||
# flake8-print
|
||||
flake8-print==5.0.0
|
||||
# via -r ./test.in
|
||||
# via -r requirements/test.in
|
||||
freezegun==1.5.2
|
||||
# via -r ./test.in
|
||||
# via -r requirements/test.in
|
||||
grpcio==1.73.0
|
||||
# via
|
||||
# -c ././deps/constraints.txt
|
||||
# -r ./test.in
|
||||
# -c requirements/./deps/constraints.txt
|
||||
# -r requirements/test.in
|
||||
iniconfig==2.1.0
|
||||
# via pytest
|
||||
liccheck==0.9.2
|
||||
# via -r ./test.in
|
||||
# via -r requirements/test.in
|
||||
mccabe==0.7.0
|
||||
# via flake8
|
||||
mypy==1.16.0
|
||||
# via -r ./test.in
|
||||
# via -r requirements/test.in
|
||||
mypy-extensions==1.1.0
|
||||
# via
|
||||
# -c ./base.txt
|
||||
# -c requirements/base.txt
|
||||
# black
|
||||
# mypy
|
||||
packaging==25.0
|
||||
# via
|
||||
# -c ./base.txt
|
||||
# -c requirements/base.txt
|
||||
# black
|
||||
# pytest
|
||||
pathspec==0.12.1
|
||||
@ -67,7 +69,7 @@ pycodestyle==2.13.0
|
||||
# flake8
|
||||
# flake8-print
|
||||
pydantic==2.11.5
|
||||
# via -r ./test.in
|
||||
# via -r requirements/test.in
|
||||
pydantic-core==2.33.2
|
||||
# via pydantic
|
||||
pyflakes==3.3.2
|
||||
@ -80,21 +82,24 @@ pytest==8.4.0
|
||||
# via
|
||||
# pytest-cov
|
||||
# pytest-mock
|
||||
# pytest-xdist
|
||||
pytest-cov==6.2.1
|
||||
# via -r ./test.in
|
||||
# via -r requirements/test.in
|
||||
pytest-mock==3.14.1
|
||||
# via -r ./test.in
|
||||
# via -r requirements/test.in
|
||||
pytest-xdist==3.7.0
|
||||
# via -r requirements/test.in
|
||||
python-dateutil==2.9.0.post0
|
||||
# via
|
||||
# -c ./base.txt
|
||||
# -c requirements/base.txt
|
||||
# freezegun
|
||||
ruff==0.11.13
|
||||
# via -r ./test.in
|
||||
# via -r requirements/test.in
|
||||
semantic-version==2.10.0
|
||||
# via liccheck
|
||||
six==1.17.0
|
||||
# via
|
||||
# -c ./base.txt
|
||||
# -c requirements/base.txt
|
||||
# python-dateutil
|
||||
toml==0.10.2
|
||||
# via liccheck
|
||||
@ -106,16 +111,16 @@ tomli==2.2.1
|
||||
# mypy
|
||||
# pytest
|
||||
types-click==7.1.8
|
||||
# via -r ./test.in
|
||||
# via -r requirements/test.in
|
||||
types-markdown==3.8.0.20250415
|
||||
# via -r ./test.in
|
||||
# via -r requirements/test.in
|
||||
types-requests==2.32.4.20250611
|
||||
# via -r ./test.in
|
||||
# via -r requirements/test.in
|
||||
types-tabulate==0.9.0.20241207
|
||||
# via -r ./test.in
|
||||
# via -r requirements/test.in
|
||||
typing-extensions==4.14.0
|
||||
# via
|
||||
# -c ./base.txt
|
||||
# -c requirements/base.txt
|
||||
# black
|
||||
# exceptiongroup
|
||||
# mypy
|
||||
@ -126,6 +131,6 @@ typing-inspection==0.4.1
|
||||
# via pydantic
|
||||
urllib3==2.4.0
|
||||
# via
|
||||
# -c ././deps/constraints.txt
|
||||
# -c ./base.txt
|
||||
# -c requirements/./deps/constraints.txt
|
||||
# -c requirements/base.txt
|
||||
# types-requests
|
||||
|
18
test_unstructured/partition/conftest.py
Normal file
18
test_unstructured/partition/conftest.py
Normal file
@ -0,0 +1,18 @@
|
||||
import pytest
|
||||
|
||||
from unstructured.partition.utils.constants import OCR_AGENT_PADDLE, OCR_AGENT_TESSERACT
|
||||
|
||||
|
||||
@pytest.fixture
def mock_ocr_get_instance(mocker):
    """Patch `OCRAgent.get_instance` so that no real OCR agent is instantiated.

    While the patch is active, requesting the Tesseract or Paddle agent module yields a fresh
    `MagicMock`; any other module name raises `ValueError`. The value returned by
    `mocker.patch.object` is handed to the test so it can inspect the recorded calls
    (e.g. via `call_args_list`).
    """
    # Imported inside the fixture body, as the original does, rather than at module import time.
    from unstructured.partition.pdf_image.ocr import OCRAgent

    known_agents = (OCR_AGENT_TESSERACT, OCR_AGENT_PADDLE)

    def _fake_get_instance(ocr_agent_module, language):
        # Parameter names are part of the contract: the stand-in must accept
        # `ocr_agent_module` and `language` when they are supplied as keyword arguments.
        if ocr_agent_module not in known_agents:
            raise ValueError(f"Unknown OCR agent: {ocr_agent_module}")
        return mocker.MagicMock()

    return mocker.patch.object(OCRAgent, "get_instance", side_effect=_fake_get_instance)
|
@ -622,11 +622,10 @@ def mock_page(mock_ocr_layout, mock_layout):
|
||||
return mock_page
|
||||
|
||||
|
||||
def test_supplement_layout_with_ocr(mocker, mock_page):
|
||||
def test_supplement_layout_with_ocr(mock_ocr_get_instance, mocker, mock_page):
|
||||
from unstructured.partition.pdf_image.ocr import OCRAgent
|
||||
|
||||
mocker.patch.object(OCRAgent, "get_layout_from_image", return_value=mock_ocr_layout)
|
||||
spy = mocker.spy(OCRAgent, "get_instance")
|
||||
|
||||
ocr.supplement_page_layout_with_ocr(
|
||||
mock_page,
|
||||
@ -637,16 +636,21 @@ def test_supplement_layout_with_ocr(mocker, mock_page):
|
||||
table_ocr_agent=OCR_AGENT_PADDLE,
|
||||
)
|
||||
|
||||
assert spy.call_args_list[0][1] == {"language": "eng", "ocr_agent_module": OCR_AGENT_TESSERACT}
|
||||
assert spy.call_args_list[1][1] == {"language": "en", "ocr_agent_module": OCR_AGENT_PADDLE}
|
||||
assert mock_ocr_get_instance.call_args_list[0][1] == {
|
||||
"language": "eng",
|
||||
"ocr_agent_module": OCR_AGENT_TESSERACT,
|
||||
}
|
||||
assert mock_ocr_get_instance.call_args_list[1][1] == {
|
||||
"language": "en",
|
||||
"ocr_agent_module": OCR_AGENT_PADDLE,
|
||||
}
|
||||
|
||||
|
||||
def test_pass_down_agents(mocker, mock_page):
|
||||
def test_pass_down_agents(mock_ocr_get_instance, mocker, mock_page):
|
||||
from unstructured.partition.pdf_image.ocr import OCRAgent, PILImage
|
||||
|
||||
mocker.patch.object(OCRAgent, "get_layout_from_image", return_value=mock_ocr_layout)
|
||||
mocker.patch.object(PILImage, "open", return_value=Image.new("RGB", (100, 100)))
|
||||
spy = mocker.spy(OCRAgent, "get_instance")
|
||||
doc = MagicMock(DocumentLayout)
|
||||
doc.pages = [mock_page]
|
||||
|
||||
@ -661,5 +665,11 @@ def test_pass_down_agents(mocker, mock_page):
|
||||
table_ocr_agent=OCR_AGENT_TESSERACT,
|
||||
)
|
||||
|
||||
assert spy.call_args_list[0][1] == {"language": "en", "ocr_agent_module": OCR_AGENT_PADDLE}
|
||||
assert spy.call_args_list[1][1] == {"language": "eng", "ocr_agent_module": OCR_AGENT_TESSERACT}
|
||||
assert mock_ocr_get_instance.call_args_list[0][1] == {
|
||||
"language": "en",
|
||||
"ocr_agent_module": OCR_AGENT_PADDLE,
|
||||
}
|
||||
assert mock_ocr_get_instance.call_args_list[1][1] == {
|
||||
"language": "eng",
|
||||
"ocr_agent_module": OCR_AGENT_TESSERACT,
|
||||
}
|
||||
|
@ -1588,10 +1588,7 @@ def test_partition_pdf_with_password(
|
||||
_test(result)
|
||||
|
||||
|
||||
def test_partition_pdf_with_specified_ocr_agents(mocker):
|
||||
from unstructured.partition.pdf_image.ocr import OCRAgent
|
||||
|
||||
spy = mocker.spy(OCRAgent, "get_instance")
|
||||
def test_partition_pdf_with_specified_ocr_agents(mock_ocr_get_instance, mocker):
|
||||
|
||||
pdf.partition_pdf(
|
||||
filename=example_doc_path("pdf/layout-parser-paper-with-table.pdf"),
|
||||
@ -1601,8 +1598,15 @@ def test_partition_pdf_with_specified_ocr_agents(mocker):
|
||||
table_ocr_agent=OCR_AGENT_PADDLE,
|
||||
)
|
||||
|
||||
assert spy.call_args_list[0][1] == {"language": "eng", "ocr_agent_module": OCR_AGENT_TESSERACT}
|
||||
assert spy.call_args_list[1][1] == {"language": "en", "ocr_agent_module": OCR_AGENT_PADDLE}
|
||||
# Verify get_instance was called with correct parameters
|
||||
assert mock_ocr_get_instance.call_args_list[0][1] == {
|
||||
"language": "eng",
|
||||
"ocr_agent_module": OCR_AGENT_TESSERACT,
|
||||
}
|
||||
assert mock_ocr_get_instance.call_args_list[1][1] == {
|
||||
"language": "en",
|
||||
"ocr_agent_module": OCR_AGENT_PADDLE,
|
||||
}
|
||||
|
||||
|
||||
def test_reproductible_pdf_loader():
|
||||
|
Loading…
x
Reference in New Issue
Block a user