tests: separate chipper tests (#1939)

Separates chipper tests to speed up testing and CI.
This commit is contained in:
qued 2023-10-31 16:02:00 -05:00 committed by GitHub
parent 123ad20f4c
commit b08562ba1a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 83 additions and 30 deletions

View File

@ -104,7 +104,6 @@ jobs:
runs-on: ubuntu-latest
env:
NLTK_DATA: ${{ github.workspace }}/nltk_data
UNSTRUCTURED_HF_TOKEN: ${{ secrets.HF_TOKEN }}
needs: [setup, lint]
steps:
- uses: actions/checkout@v3
@ -142,6 +141,48 @@ jobs:
make test CI=true UNSTRUCTURED_INCLUDE_DEBUG_METADATA=true
make check-coverage
# CI job that runs only the tests selected by `make test-chipper`,
# as a job separate from the main `make test` run.
test_chipper:
strategy:
matrix:
python-version: ["3.10"]
runs-on: ubuntu-latest
# UNSTRUCTURED_HF_TOKEN is populated from the HF_TOKEN repository secret.
# NOTE(review): presumably required to download the chipper model - confirm.
env:
NLTK_DATA: ${{ github.workspace }}/nltk_data
UNSTRUCTURED_HF_TOKEN: ${{ secrets.HF_TOKEN }}
needs: [setup, lint]
steps:
- uses: actions/checkout@v3
# Restore the cached .venv and nltk_data directories. The cache key combines the
# runner OS, Python version, GHA_CACHE_KEY_VERSION and a hash of requirements/*.txt.
- uses: actions/cache/restore@v3
id: virtualenv-cache
with:
path: |
.venv
nltk_data
key: unstructured-${{ runner.os }}-${{ matrix.python-version }}-${{ env.GHA_CACHE_KEY_VERSION }}-${{ hashFiles('requirements/*.txt') }}
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}
# Fallback: build the virtualenv from scratch only when the cache restore step
# above did not report a hit.
- name: Setup virtual environment (no cache hit)
if: steps.virtualenv-cache.outputs.cache-hit != 'true'
run: |
python${{ matrix.python-version}} -m venv .venv
source .venv/bin/activate
mkdir "$NLTK_DATA"
make install-ci
- name: Test
# Installs system dependencies (poppler-utils, pandoc, tesseract from the
# alex-p/tesseract-ocr5 PPA plus the tesseract-ocr-kor package), then runs
# the chipper-only make target.
env:
UNS_API_KEY: ${{ secrets.UNS_API_KEY }}
run: |
source .venv/bin/activate
sudo apt-get update
sudo apt-get install -y poppler-utils
make install-pandoc
sudo add-apt-repository -y ppa:alex-p/tesseract-ocr5
sudo apt-get install -y tesseract-ocr tesseract-ocr-kor
tesseract --version
make test-chipper CI=true UNSTRUCTURED_INCLUDE_DEBUG_METADATA=true
test_unit_no_extras:
strategy:
matrix:
@ -463,6 +504,5 @@ jobs:
run: |
source .venv/bin/activate
echo "UNS_API_KEY=${{ secrets.UNS_API_KEY }}" > uns_test_env_file
echo "UNSTRUCTURED_HF_TOKEN=${{ secrets.HF_TOKEN }}" > uns_test_env_file
make docker-build
make docker-test CI=true UNSTRUCTURED_INCLUDE_DEBUG_METADATA=true

View File

@ -2,6 +2,7 @@
### Enhancements
* **Separate chipper tests** Chipper tests are long-running and require special access, so the tests have been separated into their own file under their own marker, and now have a separate `make` target.
* **Add include_header argument for partition_csv and partition_tsv** Now supports retaining header rows when partitioning CSV and TSV documents into elements.
* **Add retry logic for all source connectors** All HTTP calls made by the ingest source connectors have been isolated and wrapped by the `SourceConnectionNetworkError` custom error, which triggers the retry logic, if enabled, in the ingest pipeline.
* **Google Drive source connector supports credentials from memory** Originally, the connector expected a filepath to pull the credentials from when creating the client. This was expanded to support passing that information from memory as a dict if access to the file system might not be available.

View File

@ -256,7 +256,12 @@ export UNSTRUCTURED_INCLUDE_DEBUG_METADATA ?= false
# Run pytest over test_${PACKAGE_NAME} with coverage for ${PACKAGE_NAME}, a
# term-missing coverage report, and the 40 slowest test durations.
# CI and UNSTRUCTURED_INCLUDE_DEBUG_METADATA are forwarded into the pytest environment.
# NOTE(review): two pytest invocations appear back to back below and differ only by
# `-m "not chipper"`. They look like the before/after lines of a diff hunk rather
# than two intended commands - confirm that only the `-m "not chipper"` form
# (which deselects chipper-marked tests) is meant to remain.
.PHONY: test
test:
PYTHONPATH=. CI=$(CI) \
UNSTRUCTURED_INCLUDE_DEBUG_METADATA=$(UNSTRUCTURED_INCLUDE_DEBUG_METADATA) pytest test_${PACKAGE_NAME} --cov=${PACKAGE_NAME} --cov-report term-missing --durations=40
UNSTRUCTURED_INCLUDE_DEBUG_METADATA=$(UNSTRUCTURED_INCLUDE_DEBUG_METADATA) pytest test_${PACKAGE_NAME} -m "not chipper" --cov=${PACKAGE_NAME} --cov-report term-missing --durations=40
# Run only the tests selected by the pytest marker expression "chipper" under
# test_${PACKAGE_NAME}, with coverage for ${PACKAGE_NAME}, a term-missing report,
# and the 40 slowest durations.
# CI and UNSTRUCTURED_INCLUDE_DEBUG_METADATA are forwarded into the pytest environment.
.PHONY: test-chipper
test-chipper:
PYTHONPATH=. CI=$(CI) \
UNSTRUCTURED_INCLUDE_DEBUG_METADATA=$(UNSTRUCTURED_INCLUDE_DEBUG_METADATA) pytest test_${PACKAGE_NAME} -m "chipper" --cov=${PACKAGE_NAME} --cov-report term-missing --durations=40
.PHONY: test-unstructured-api-unit
test-unstructured-api-unit:
@ -408,7 +413,7 @@ docker-test:
$(DOCKER_IMAGE) \
bash -c "CI=$(CI) \
UNSTRUCTURED_INCLUDE_DEBUG_METADATA=$(UNSTRUCTURED_INCLUDE_DEBUG_METADATA) \
pytest $(if $(TEST_FILE),$(TEST_FILE),test_unstructured)"
pytest -m 'not chipper' $(if $(TEST_FILE),$(TEST_FILE),test_unstructured)"
.PHONY: docker-smoke-test
docker-smoke-test:

View File

@ -11,6 +11,8 @@ filterwarnings =
ignore::DeprecationWarning
python_classes = Test Describe
python_functions = test_ it_ they_ but_ and_
markers =
chipper: mark a test as running chipper, which tends to be slow and compute-heavy.
[autoflake]
expand_star_imports=true

View File

@ -0,0 +1,31 @@
import pytest
from unstructured.partition import pdf
@pytest.fixture(scope="session")
def chipper_results():
    """Session-scoped fixture: partition the sample PDF with the chipper model.

    Calls ``pdf.partition_pdf`` on the layout-parser example document using the
    ``hi_res`` strategy and returns whatever that call produces.
    """
    return pdf.partition_pdf(
        "example-docs/layout-parser-paper-fast.pdf",
        model_name="chipper",
        strategy="hi_res",
    )
@pytest.fixture(scope="session")
def chipper_children(chipper_results):
    """Session-scoped fixture: the chipper elements whose metadata has a parent_id set."""
    children = []
    for element in chipper_results:
        # Keep only elements that point at a parent.
        if element.metadata.parent_id is not None:
            children.append(element)
    return children
@pytest.mark.chipper()
def test_chipper_has_hierarchy(chipper_children):
    """The chipper output must contain at least one element with a parent_id."""
    has_children = bool(chipper_children)
    assert has_children
@pytest.mark.chipper()
def test_chipper_not_losing_parents(chipper_results, chipper_children):
    """Every child's parent_id must equal the id of some element in the results."""
    for child in chipper_children:
        # A child passes when at least one result element carries the referenced id.
        assert any(el.id == child.metadata.parent_id for el in chipper_results)

View File

@ -892,32 +892,6 @@ def test_check_annotations_within_element(threshold, expected):
assert results == expected
@pytest.fixture(scope="session")
def chipper_results():
    """Session-scoped fixture returning the chipper partitioning of the sample PDF."""
    sample_pdf = "example-docs/layout-parser-paper-fast.pdf"
    return pdf.partition_pdf(sample_pdf, strategy="hi_res", model_name="chipper")
@pytest.fixture(scope="session")
def chipper_children(chipper_results):
    """Session-scoped fixture: elements of the chipper output that reference a parent."""
    with_parent = []
    for candidate in chipper_results:
        if candidate.metadata.parent_id is None:
            continue
        with_parent.append(candidate)
    return with_parent
def test_chipper_has_hierarchy(chipper_children):
    """At least one chipper element must have a parent_id."""
    found_any_child = bool(chipper_children)
    assert found_any_child
def test_chipper_not_losing_parents(chipper_results, chipper_children):
    """Each child's parent_id must match the id of an element present in the results."""
    assert all(
        any(candidate.id == child.metadata.parent_id for candidate in chipper_results)
        for child in chipper_children
    )
@pytest.mark.parametrize(
("infer_table_structure", "env", "expected"),
[