mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-06-27 02:30:08 +00:00
Parallelize tests (#4024)
This commit is contained in:
parent
531490d013
commit
b0dbd71aff
24
Makefile
24
Makefile
@ -142,7 +142,7 @@ export UNSTRUCTURED_INCLUDE_DEBUG_METADATA ?= false
|
|||||||
.PHONY: test
|
.PHONY: test
|
||||||
test:
|
test:
|
||||||
PYTHONPATH=. CI=$(CI) \
|
PYTHONPATH=. CI=$(CI) \
|
||||||
UNSTRUCTURED_INCLUDE_DEBUG_METADATA=$(UNSTRUCTURED_INCLUDE_DEBUG_METADATA) ${PYTHON} -m pytest test_${PACKAGE_NAME} --cov=${PACKAGE_NAME} --cov-report term-missing --durations=40
|
UNSTRUCTURED_INCLUDE_DEBUG_METADATA=$(UNSTRUCTURED_INCLUDE_DEBUG_METADATA) ${PYTHON} -m pytest -n auto test_${PACKAGE_NAME} --cov=${PACKAGE_NAME} --cov-report term-missing --durations=40
|
||||||
|
|
||||||
.PHONY: test-unstructured-api-unit
|
.PHONY: test-unstructured-api-unit
|
||||||
test-unstructured-api-unit:
|
test-unstructured-api-unit:
|
||||||
@ -151,7 +151,7 @@ test-unstructured-api-unit:
|
|||||||
.PHONY: test-no-extras
|
.PHONY: test-no-extras
|
||||||
test-no-extras:
|
test-no-extras:
|
||||||
PYTHONPATH=. CI=$(CI) \
|
PYTHONPATH=. CI=$(CI) \
|
||||||
UNSTRUCTURED_INCLUDE_DEBUG_METADATA=$(UNSTRUCTURED_INCLUDE_DEBUG_METADATA) ${PYTHON} -m pytest \
|
UNSTRUCTURED_INCLUDE_DEBUG_METADATA=$(UNSTRUCTURED_INCLUDE_DEBUG_METADATA) ${PYTHON} -m pytest -n auto \
|
||||||
test_${PACKAGE_NAME}/partition/test_text.py \
|
test_${PACKAGE_NAME}/partition/test_text.py \
|
||||||
test_${PACKAGE_NAME}/partition/test_email.py \
|
test_${PACKAGE_NAME}/partition/test_email.py \
|
||||||
test_${PACKAGE_NAME}/partition/html/test_partition.py \
|
test_${PACKAGE_NAME}/partition/html/test_partition.py \
|
||||||
@ -159,52 +159,52 @@ test-no-extras:
|
|||||||
|
|
||||||
.PHONY: test-extra-csv
|
.PHONY: test-extra-csv
|
||||||
test-extra-csv:
|
test-extra-csv:
|
||||||
PYTHONPATH=. CI=$(CI) ${PYTHON} -m pytest \
|
PYTHONPATH=. CI=$(CI) ${PYTHON} -m pytest -n auto \
|
||||||
test_unstructured/partition/test_csv.py \
|
test_unstructured/partition/test_csv.py \
|
||||||
test_unstructured/partition/test_tsv.py
|
test_unstructured/partition/test_tsv.py
|
||||||
|
|
||||||
.PHONY: test-extra-docx
|
.PHONY: test-extra-docx
|
||||||
test-extra-docx:
|
test-extra-docx:
|
||||||
PYTHONPATH=. CI=$(CI) ${PYTHON} -m pytest \
|
PYTHONPATH=. CI=$(CI) ${PYTHON} -m pytest -n auto \
|
||||||
test_unstructured/partition/test_doc.py \
|
test_unstructured/partition/test_doc.py \
|
||||||
test_unstructured/partition/test_docx.py
|
test_unstructured/partition/test_docx.py
|
||||||
|
|
||||||
.PHONY: test-extra-epub
|
.PHONY: test-extra-epub
|
||||||
test-extra-epub:
|
test-extra-epub:
|
||||||
PYTHONPATH=. CI=$(CI) ${PYTHON} -m pytest test_unstructured/partition/test_epub.py
|
PYTHONPATH=. CI=$(CI) ${PYTHON} -m pytest -n auto test_unstructured/partition/test_epub.py
|
||||||
|
|
||||||
.PHONY: test-extra-markdown
|
.PHONY: test-extra-markdown
|
||||||
test-extra-markdown:
|
test-extra-markdown:
|
||||||
PYTHONPATH=. CI=$(CI) ${PYTHON} -m pytest test_unstructured/partition/test_md.py
|
PYTHONPATH=. CI=$(CI) ${PYTHON} -m pytest -n auto test_unstructured/partition/test_md.py
|
||||||
|
|
||||||
.PHONY: test-extra-odt
|
.PHONY: test-extra-odt
|
||||||
test-extra-odt:
|
test-extra-odt:
|
||||||
PYTHONPATH=. CI=$(CI) ${PYTHON} -m pytest test_unstructured/partition/test_odt.py
|
PYTHONPATH=. CI=$(CI) ${PYTHON} -m pytest -n auto test_unstructured/partition/test_odt.py
|
||||||
|
|
||||||
.PHONY: test-extra-pdf-image
|
.PHONY: test-extra-pdf-image
|
||||||
test-extra-pdf-image:
|
test-extra-pdf-image:
|
||||||
PYTHONPATH=. CI=$(CI) ${PYTHON} -m pytest test_unstructured/partition/pdf_image
|
PYTHONPATH=. CI=$(CI) ${PYTHON} -m pytest -n auto test_unstructured/partition/pdf_image
|
||||||
|
|
||||||
.PHONY: test-extra-pptx
|
.PHONY: test-extra-pptx
|
||||||
test-extra-pptx:
|
test-extra-pptx:
|
||||||
PYTHONPATH=. CI=$(CI) ${PYTHON} -m pytest \
|
PYTHONPATH=. CI=$(CI) ${PYTHON} -m pytest -n auto \
|
||||||
test_unstructured/partition/test_ppt.py \
|
test_unstructured/partition/test_ppt.py \
|
||||||
test_unstructured/partition/test_pptx.py
|
test_unstructured/partition/test_pptx.py
|
||||||
|
|
||||||
.PHONY: test-extra-pypandoc
|
.PHONY: test-extra-pypandoc
|
||||||
test-extra-pypandoc:
|
test-extra-pypandoc:
|
||||||
PYTHONPATH=. CI=$(CI) ${PYTHON} -m pytest \
|
PYTHONPATH=. CI=$(CI) ${PYTHON} -m pytest -n auto \
|
||||||
test_unstructured/partition/test_org.py \
|
test_unstructured/partition/test_org.py \
|
||||||
test_unstructured/partition/test_rst.py \
|
test_unstructured/partition/test_rst.py \
|
||||||
test_unstructured/partition/test_rtf.py
|
test_unstructured/partition/test_rtf.py
|
||||||
|
|
||||||
.PHONY: test-extra-xlsx
|
.PHONY: test-extra-xlsx
|
||||||
test-extra-xlsx:
|
test-extra-xlsx:
|
||||||
PYTHONPATH=. CI=$(CI) ${PYTHON} -m pytest test_unstructured/partition/test_xlsx.py
|
PYTHONPATH=. CI=$(CI) ${PYTHON} -m pytest -n auto test_unstructured/partition/test_xlsx.py
|
||||||
|
|
||||||
.PHONY: test-text-extraction-evaluate
|
.PHONY: test-text-extraction-evaluate
|
||||||
test-text-extraction-evaluate:
|
test-text-extraction-evaluate:
|
||||||
PYTHONPATH=. CI=$(CI) ${PYTHON} -m pytest test_unstructured/metrics/test_text_extraction.py
|
PYTHONPATH=. CI=$(CI) ${PYTHON} -m pytest -n auto test_unstructured/metrics/test_text_extraction.py
|
||||||
|
|
||||||
## check: runs linters (includes tests)
|
## check: runs linters (includes tests)
|
||||||
.PHONY: check
|
.PHONY: check
|
||||||
|
@ -10,6 +10,7 @@ mypy
|
|||||||
pydantic
|
pydantic
|
||||||
pytest-cov
|
pytest-cov
|
||||||
pytest-mock
|
pytest-mock
|
||||||
|
pytest-xdist
|
||||||
ruff
|
ruff
|
||||||
types-Markdown
|
types-Markdown
|
||||||
types-requests
|
types-requests
|
||||||
|
@ -2,54 +2,56 @@
|
|||||||
# This file is autogenerated by pip-compile with Python 3.10
|
# This file is autogenerated by pip-compile with Python 3.10
|
||||||
# by the following command:
|
# by the following command:
|
||||||
#
|
#
|
||||||
# pip-compile ./test.in
|
# pip-compile requirements/test.in
|
||||||
#
|
#
|
||||||
annotated-types==0.7.0
|
annotated-types==0.7.0
|
||||||
# via pydantic
|
# via pydantic
|
||||||
autoflake==2.3.1
|
autoflake==2.3.1
|
||||||
# via -r ./test.in
|
# via -r requirements/test.in
|
||||||
black==25.1.0
|
black==25.1.0
|
||||||
# via -r ./test.in
|
# via -r requirements/test.in
|
||||||
click==8.2.1
|
click==8.2.1
|
||||||
# via
|
# via
|
||||||
# -c ./base.txt
|
# -c requirements/base.txt
|
||||||
# black
|
# black
|
||||||
coverage[toml]==7.9.0
|
coverage[toml]==7.9.0
|
||||||
# via
|
# via
|
||||||
# -r ./test.in
|
# -r requirements/test.in
|
||||||
# pytest-cov
|
# pytest-cov
|
||||||
exceptiongroup==1.3.0
|
exceptiongroup==1.3.0
|
||||||
# via
|
# via
|
||||||
# -c ./base.txt
|
# -c requirements/base.txt
|
||||||
# pytest
|
# pytest
|
||||||
|
execnet==2.1.1
|
||||||
|
# via pytest-xdist
|
||||||
flake8==7.2.0
|
flake8==7.2.0
|
||||||
# via
|
# via
|
||||||
# -r ./test.in
|
# -r requirements/test.in
|
||||||
# flake8-print
|
# flake8-print
|
||||||
flake8-print==5.0.0
|
flake8-print==5.0.0
|
||||||
# via -r ./test.in
|
# via -r requirements/test.in
|
||||||
freezegun==1.5.2
|
freezegun==1.5.2
|
||||||
# via -r ./test.in
|
# via -r requirements/test.in
|
||||||
grpcio==1.73.0
|
grpcio==1.73.0
|
||||||
# via
|
# via
|
||||||
# -c ././deps/constraints.txt
|
# -c requirements/./deps/constraints.txt
|
||||||
# -r ./test.in
|
# -r requirements/test.in
|
||||||
iniconfig==2.1.0
|
iniconfig==2.1.0
|
||||||
# via pytest
|
# via pytest
|
||||||
liccheck==0.9.2
|
liccheck==0.9.2
|
||||||
# via -r ./test.in
|
# via -r requirements/test.in
|
||||||
mccabe==0.7.0
|
mccabe==0.7.0
|
||||||
# via flake8
|
# via flake8
|
||||||
mypy==1.16.0
|
mypy==1.16.0
|
||||||
# via -r ./test.in
|
# via -r requirements/test.in
|
||||||
mypy-extensions==1.1.0
|
mypy-extensions==1.1.0
|
||||||
# via
|
# via
|
||||||
# -c ./base.txt
|
# -c requirements/base.txt
|
||||||
# black
|
# black
|
||||||
# mypy
|
# mypy
|
||||||
packaging==25.0
|
packaging==25.0
|
||||||
# via
|
# via
|
||||||
# -c ./base.txt
|
# -c requirements/base.txt
|
||||||
# black
|
# black
|
||||||
# pytest
|
# pytest
|
||||||
pathspec==0.12.1
|
pathspec==0.12.1
|
||||||
@ -67,7 +69,7 @@ pycodestyle==2.13.0
|
|||||||
# flake8
|
# flake8
|
||||||
# flake8-print
|
# flake8-print
|
||||||
pydantic==2.11.5
|
pydantic==2.11.5
|
||||||
# via -r ./test.in
|
# via -r requirements/test.in
|
||||||
pydantic-core==2.33.2
|
pydantic-core==2.33.2
|
||||||
# via pydantic
|
# via pydantic
|
||||||
pyflakes==3.3.2
|
pyflakes==3.3.2
|
||||||
@ -80,21 +82,24 @@ pytest==8.4.0
|
|||||||
# via
|
# via
|
||||||
# pytest-cov
|
# pytest-cov
|
||||||
# pytest-mock
|
# pytest-mock
|
||||||
|
# pytest-xdist
|
||||||
pytest-cov==6.2.1
|
pytest-cov==6.2.1
|
||||||
# via -r ./test.in
|
# via -r requirements/test.in
|
||||||
pytest-mock==3.14.1
|
pytest-mock==3.14.1
|
||||||
# via -r ./test.in
|
# via -r requirements/test.in
|
||||||
|
pytest-xdist==3.7.0
|
||||||
|
# via -r requirements/test.in
|
||||||
python-dateutil==2.9.0.post0
|
python-dateutil==2.9.0.post0
|
||||||
# via
|
# via
|
||||||
# -c ./base.txt
|
# -c requirements/base.txt
|
||||||
# freezegun
|
# freezegun
|
||||||
ruff==0.11.13
|
ruff==0.11.13
|
||||||
# via -r ./test.in
|
# via -r requirements/test.in
|
||||||
semantic-version==2.10.0
|
semantic-version==2.10.0
|
||||||
# via liccheck
|
# via liccheck
|
||||||
six==1.17.0
|
six==1.17.0
|
||||||
# via
|
# via
|
||||||
# -c ./base.txt
|
# -c requirements/base.txt
|
||||||
# python-dateutil
|
# python-dateutil
|
||||||
toml==0.10.2
|
toml==0.10.2
|
||||||
# via liccheck
|
# via liccheck
|
||||||
@ -106,16 +111,16 @@ tomli==2.2.1
|
|||||||
# mypy
|
# mypy
|
||||||
# pytest
|
# pytest
|
||||||
types-click==7.1.8
|
types-click==7.1.8
|
||||||
# via -r ./test.in
|
# via -r requirements/test.in
|
||||||
types-markdown==3.8.0.20250415
|
types-markdown==3.8.0.20250415
|
||||||
# via -r ./test.in
|
# via -r requirements/test.in
|
||||||
types-requests==2.32.4.20250611
|
types-requests==2.32.4.20250611
|
||||||
# via -r ./test.in
|
# via -r requirements/test.in
|
||||||
types-tabulate==0.9.0.20241207
|
types-tabulate==0.9.0.20241207
|
||||||
# via -r ./test.in
|
# via -r requirements/test.in
|
||||||
typing-extensions==4.14.0
|
typing-extensions==4.14.0
|
||||||
# via
|
# via
|
||||||
# -c ./base.txt
|
# -c requirements/base.txt
|
||||||
# black
|
# black
|
||||||
# exceptiongroup
|
# exceptiongroup
|
||||||
# mypy
|
# mypy
|
||||||
@ -126,6 +131,6 @@ typing-inspection==0.4.1
|
|||||||
# via pydantic
|
# via pydantic
|
||||||
urllib3==2.4.0
|
urllib3==2.4.0
|
||||||
# via
|
# via
|
||||||
# -c ././deps/constraints.txt
|
# -c requirements/./deps/constraints.txt
|
||||||
# -c ./base.txt
|
# -c requirements/base.txt
|
||||||
# types-requests
|
# types-requests
|
||||||
|
18
test_unstructured/partition/conftest.py
Normal file
18
test_unstructured/partition/conftest.py
Normal file
@ -0,0 +1,18 @@
|
|||||||
|
import pytest
|
||||||
|
|
||||||
|
from unstructured.partition.utils.constants import OCR_AGENT_PADDLE, OCR_AGENT_TESSERACT
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def mock_ocr_get_instance(mocker):
|
||||||
|
"""Fixture that mocks OCRAgent.get_instance to prevent real OCR agent instantiation."""
|
||||||
|
|
||||||
|
def mock_get_instance(ocr_agent_module, language):
|
||||||
|
if ocr_agent_module in (OCR_AGENT_TESSERACT, OCR_AGENT_PADDLE):
|
||||||
|
return mocker.MagicMock()
|
||||||
|
else:
|
||||||
|
raise ValueError(f"Unknown OCR agent: {ocr_agent_module}")
|
||||||
|
|
||||||
|
from unstructured.partition.pdf_image.ocr import OCRAgent
|
||||||
|
|
||||||
|
return mocker.patch.object(OCRAgent, "get_instance", side_effect=mock_get_instance)
|
@ -622,11 +622,10 @@ def mock_page(mock_ocr_layout, mock_layout):
|
|||||||
return mock_page
|
return mock_page
|
||||||
|
|
||||||
|
|
||||||
def test_supplement_layout_with_ocr(mocker, mock_page):
|
def test_supplement_layout_with_ocr(mock_ocr_get_instance, mocker, mock_page):
|
||||||
from unstructured.partition.pdf_image.ocr import OCRAgent
|
from unstructured.partition.pdf_image.ocr import OCRAgent
|
||||||
|
|
||||||
mocker.patch.object(OCRAgent, "get_layout_from_image", return_value=mock_ocr_layout)
|
mocker.patch.object(OCRAgent, "get_layout_from_image", return_value=mock_ocr_layout)
|
||||||
spy = mocker.spy(OCRAgent, "get_instance")
|
|
||||||
|
|
||||||
ocr.supplement_page_layout_with_ocr(
|
ocr.supplement_page_layout_with_ocr(
|
||||||
mock_page,
|
mock_page,
|
||||||
@ -637,16 +636,21 @@ def test_supplement_layout_with_ocr(mocker, mock_page):
|
|||||||
table_ocr_agent=OCR_AGENT_PADDLE,
|
table_ocr_agent=OCR_AGENT_PADDLE,
|
||||||
)
|
)
|
||||||
|
|
||||||
assert spy.call_args_list[0][1] == {"language": "eng", "ocr_agent_module": OCR_AGENT_TESSERACT}
|
assert mock_ocr_get_instance.call_args_list[0][1] == {
|
||||||
assert spy.call_args_list[1][1] == {"language": "en", "ocr_agent_module": OCR_AGENT_PADDLE}
|
"language": "eng",
|
||||||
|
"ocr_agent_module": OCR_AGENT_TESSERACT,
|
||||||
|
}
|
||||||
|
assert mock_ocr_get_instance.call_args_list[1][1] == {
|
||||||
|
"language": "en",
|
||||||
|
"ocr_agent_module": OCR_AGENT_PADDLE,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
def test_pass_down_agents(mocker, mock_page):
|
def test_pass_down_agents(mock_ocr_get_instance, mocker, mock_page):
|
||||||
from unstructured.partition.pdf_image.ocr import OCRAgent, PILImage
|
from unstructured.partition.pdf_image.ocr import OCRAgent, PILImage
|
||||||
|
|
||||||
mocker.patch.object(OCRAgent, "get_layout_from_image", return_value=mock_ocr_layout)
|
mocker.patch.object(OCRAgent, "get_layout_from_image", return_value=mock_ocr_layout)
|
||||||
mocker.patch.object(PILImage, "open", return_value=Image.new("RGB", (100, 100)))
|
mocker.patch.object(PILImage, "open", return_value=Image.new("RGB", (100, 100)))
|
||||||
spy = mocker.spy(OCRAgent, "get_instance")
|
|
||||||
doc = MagicMock(DocumentLayout)
|
doc = MagicMock(DocumentLayout)
|
||||||
doc.pages = [mock_page]
|
doc.pages = [mock_page]
|
||||||
|
|
||||||
@ -661,5 +665,11 @@ def test_pass_down_agents(mocker, mock_page):
|
|||||||
table_ocr_agent=OCR_AGENT_TESSERACT,
|
table_ocr_agent=OCR_AGENT_TESSERACT,
|
||||||
)
|
)
|
||||||
|
|
||||||
assert spy.call_args_list[0][1] == {"language": "en", "ocr_agent_module": OCR_AGENT_PADDLE}
|
assert mock_ocr_get_instance.call_args_list[0][1] == {
|
||||||
assert spy.call_args_list[1][1] == {"language": "eng", "ocr_agent_module": OCR_AGENT_TESSERACT}
|
"language": "en",
|
||||||
|
"ocr_agent_module": OCR_AGENT_PADDLE,
|
||||||
|
}
|
||||||
|
assert mock_ocr_get_instance.call_args_list[1][1] == {
|
||||||
|
"language": "eng",
|
||||||
|
"ocr_agent_module": OCR_AGENT_TESSERACT,
|
||||||
|
}
|
||||||
|
@ -1588,10 +1588,7 @@ def test_partition_pdf_with_password(
|
|||||||
_test(result)
|
_test(result)
|
||||||
|
|
||||||
|
|
||||||
def test_partition_pdf_with_specified_ocr_agents(mocker):
|
def test_partition_pdf_with_specified_ocr_agents(mock_ocr_get_instance, mocker):
|
||||||
from unstructured.partition.pdf_image.ocr import OCRAgent
|
|
||||||
|
|
||||||
spy = mocker.spy(OCRAgent, "get_instance")
|
|
||||||
|
|
||||||
pdf.partition_pdf(
|
pdf.partition_pdf(
|
||||||
filename=example_doc_path("pdf/layout-parser-paper-with-table.pdf"),
|
filename=example_doc_path("pdf/layout-parser-paper-with-table.pdf"),
|
||||||
@ -1601,8 +1598,15 @@ def test_partition_pdf_with_specified_ocr_agents(mocker):
|
|||||||
table_ocr_agent=OCR_AGENT_PADDLE,
|
table_ocr_agent=OCR_AGENT_PADDLE,
|
||||||
)
|
)
|
||||||
|
|
||||||
assert spy.call_args_list[0][1] == {"language": "eng", "ocr_agent_module": OCR_AGENT_TESSERACT}
|
# Verify get_instance was called with correct parameters
|
||||||
assert spy.call_args_list[1][1] == {"language": "en", "ocr_agent_module": OCR_AGENT_PADDLE}
|
assert mock_ocr_get_instance.call_args_list[0][1] == {
|
||||||
|
"language": "eng",
|
||||||
|
"ocr_agent_module": OCR_AGENT_TESSERACT,
|
||||||
|
}
|
||||||
|
assert mock_ocr_get_instance.call_args_list[1][1] == {
|
||||||
|
"language": "en",
|
||||||
|
"ocr_agent_module": OCR_AGENT_PADDLE,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
def test_reproductible_pdf_loader():
|
def test_reproductible_pdf_loader():
|
||||||
|
Loading…
x
Reference in New Issue
Block a user