diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index fbb07776f..3aebc54bf 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -104,7 +104,6 @@ jobs: runs-on: ubuntu-latest env: NLTK_DATA: ${{ github.workspace }}/nltk_data - UNSTRUCTURED_HF_TOKEN: ${{ secrets.HF_TOKEN }} needs: [setup, lint] steps: - uses: actions/checkout@v3 @@ -142,6 +141,48 @@ jobs: make test CI=true UNSTRUCTURED_INCLUDE_DEBUG_METADATA=true make check-coverage + test_chipper: + strategy: + matrix: + python-version: ["3.10"] + runs-on: ubuntu-latest + env: + NLTK_DATA: ${{ github.workspace }}/nltk_data + UNSTRUCTURED_HF_TOKEN: ${{ secrets.HF_TOKEN }} + needs: [setup, lint] + steps: + - uses: actions/checkout@v3 + - uses: actions/cache/restore@v3 + id: virtualenv-cache + with: + path: | + .venv + nltk_data + key: unstructured-${{ runner.os }}-${{ matrix.python-version }}-${{ env.GHA_CACHE_KEY_VERSION }}-${{ hashFiles('requirements/*.txt') }} + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + - name: Setup virtual environment (no cache hit) + if: steps.virtualenv-cache.outputs.cache-hit != 'true' + run: | + python${{ matrix.python-version}} -m venv .venv + source .venv/bin/activate + mkdir "$NLTK_DATA" + make install-ci + - name: Test + env: + UNS_API_KEY: ${{ secrets.UNS_API_KEY }} + run: | + source .venv/bin/activate + sudo apt-get update + sudo apt-get install -y poppler-utils + make install-pandoc + sudo add-apt-repository -y ppa:alex-p/tesseract-ocr5 + sudo apt-get install -y tesseract-ocr tesseract-ocr-kor + tesseract --version + make test-chipper CI=true UNSTRUCTURED_INCLUDE_DEBUG_METADATA=true + test_unit_no_extras: strategy: matrix: @@ -463,6 +504,5 @@ jobs: run: | source .venv/bin/activate echo "UNS_API_KEY=${{ secrets.UNS_API_KEY }}" > uns_test_env_file - echo "UNSTRUCTURED_HF_TOKEN=${{ secrets.HF_TOKEN }}" > uns_test_env_file make docker-build make docker-test CI=true UNSTRUCTURED_INCLUDE_DEBUG_METADATA=true diff --git a/CHANGELOG.md b/CHANGELOG.md index d33d34b24..a9ee03f50 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,7 @@ ### Enhancements +* **Separate chipper tests** Chipper tests are long-running and require special access, so the tests have been separated into their own file under their own marker, and now have a separate `make` target. * **Add include_header argument for partition_csv and partition_tsv** Now supports retaining header rows in CSV and TSV documents element partitioning. * **Add retry logic for all source connectors** All http calls being made by the ingest source connectors have been isolated and wrapped by the `SourceConnectionNetworkError` custom error, which triggers the retry logic, if enabled, in the ingest pipeline. * **Google Drive source connector supports credentials from memory** Originally, the connector expected a filepath to pull the credentials from when creating the client. This was expanded to support passing that information from memory as a dict if access to the file system might not be available. diff --git a/Makefile b/Makefile index ff31c388e..439e813e6 100644 --- a/Makefile +++ b/Makefile @@ -256,7 +256,12 @@ export UNSTRUCTURED_INCLUDE_DEBUG_METADATA ?= false .PHONY: test test: PYTHONPATH=. CI=$(CI) \ - UNSTRUCTURED_INCLUDE_DEBUG_METADATA=$(UNSTRUCTURED_INCLUDE_DEBUG_METADATA) pytest test_${PACKAGE_NAME} --cov=${PACKAGE_NAME} --cov-report term-missing --durations=40 + UNSTRUCTURED_INCLUDE_DEBUG_METADATA=$(UNSTRUCTURED_INCLUDE_DEBUG_METADATA) pytest test_${PACKAGE_NAME} -m "not chipper" --cov=${PACKAGE_NAME} --cov-report term-missing --durations=40 + +.PHONY: test-chipper +test-chipper: + PYTHONPATH=. CI=$(CI) \ + UNSTRUCTURED_INCLUDE_DEBUG_METADATA=$(UNSTRUCTURED_INCLUDE_DEBUG_METADATA) pytest test_${PACKAGE_NAME} -m "chipper" --cov=${PACKAGE_NAME} --cov-report term-missing --durations=40 .PHONY: test-unstructured-api-unit test-unstructured-api-unit: @@ -408,7 +413,7 @@ docker-test: $(DOCKER_IMAGE) \ bash -c "CI=$(CI) \ UNSTRUCTURED_INCLUDE_DEBUG_METADATA=$(UNSTRUCTURED_INCLUDE_DEBUG_METADATA) \ - pytest $(if $(TEST_FILE),$(TEST_FILE),test_unstructured)" + pytest -m 'not chipper' $(if $(TEST_FILE),$(TEST_FILE),test_unstructured)" .PHONY: docker-smoke-test docker-smoke-test: diff --git a/setup.cfg b/setup.cfg index ea20540b0..66bda74e3 100644 --- a/setup.cfg +++ b/setup.cfg @@ -11,6 +11,8 @@ filterwarnings = ignore::DeprecationWarning python_classes = Test Describe python_functions = test_ it_ they_ but_ and_ +markers = + chipper: mark a test as running chipper, which tends to be slow and compute-heavy. [autoflake] expand_star_imports=true diff --git a/test_unstructured/partition/pdf_image/test_chipper.py b/test_unstructured/partition/pdf_image/test_chipper.py new file mode 100644 index 000000000..a8080fa06 --- /dev/null +++ b/test_unstructured/partition/pdf_image/test_chipper.py @@ -0,0 +1,31 @@ +import pytest + +from unstructured.partition import pdf + + +@pytest.fixture(scope="session") +def chipper_results(): + elements = pdf.partition_pdf( + "example-docs/layout-parser-paper-fast.pdf", + strategy="hi_res", + model_name="chipper", + ) + return elements + + +@pytest.fixture(scope="session") +def chipper_children(chipper_results): + return [el for el in chipper_results if el.metadata.parent_id is not None] + + +@pytest.mark.chipper() +def test_chipper_has_hierarchy(chipper_children): + assert chipper_children + + +@pytest.mark.chipper() +def test_chipper_not_losing_parents(chipper_results, chipper_children): + assert all( + [el for el in chipper_results if el.id == child.metadata.parent_id] + for child in chipper_children + ) diff --git a/test_unstructured/partition/pdf_image/test_pdf.py b/test_unstructured/partition/pdf_image/test_pdf.py index fe386546c..60771f63f 100644 --- a/test_unstructured/partition/pdf_image/test_pdf.py +++ b/test_unstructured/partition/pdf_image/test_pdf.py @@ -892,32 +892,6 @@ def test_check_annotations_within_element(threshold, expected): assert results == expected -@pytest.fixture(scope="session") -def chipper_results(): - elements = pdf.partition_pdf( - "example-docs/layout-parser-paper-fast.pdf", - strategy="hi_res", - model_name="chipper", - ) - return elements - - -@pytest.fixture(scope="session") -def chipper_children(chipper_results): - return [el for el in chipper_results if el.metadata.parent_id is not None] - - -def test_chipper_has_hierarchy(chipper_children): - assert chipper_children - - -def test_chipper_not_losing_parents(chipper_results, chipper_children): - assert all( - [el for el in chipper_results if el.id == child.metadata.parent_id] - for child in chipper_children - ) - - @pytest.mark.parametrize( ("infer_table_structure", "env", "expected"), [