mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-11-02 02:53:31 +00:00
tests: separate chipper tests (#1939)
Separates chipper tests to speed up testing and CI.
This commit is contained in:
parent
123ad20f4c
commit
b08562ba1a
44
.github/workflows/ci.yml
vendored
44
.github/workflows/ci.yml
vendored
@ -104,7 +104,6 @@ jobs:
|
||||
runs-on: ubuntu-latest
|
||||
env:
|
||||
NLTK_DATA: ${{ github.workspace }}/nltk_data
|
||||
UNSTRUCTURED_HF_TOKEN: ${{ secrets.HF_TOKEN }}
|
||||
needs: [setup, lint]
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
@ -142,6 +141,48 @@ jobs:
|
||||
make test CI=true UNSTRUCTURED_INCLUDE_DEBUG_METADATA=true
|
||||
make check-coverage
|
||||
|
||||
# CI job that runs the chipper test target (`make test-chipper`) separately
# from the main test job.
test_chipper:
  strategy:
    matrix:
      python-version:
        - "3.10"
  runs-on: ubuntu-latest
  env:
    NLTK_DATA: ${{ github.workspace }}/nltk_data
    UNSTRUCTURED_HF_TOKEN: ${{ secrets.HF_TOKEN }}
  needs:
    - setup
    - lint
  steps:
    - uses: actions/checkout@v3
    # Restore the cached virtualenv and NLTK data directories.
    - uses: actions/cache/restore@v3
      id: virtualenv-cache
      with:
        path: |
          .venv
          nltk_data
        key: unstructured-${{ runner.os }}-${{ matrix.python-version }}-${{ env.GHA_CACHE_KEY_VERSION }}-${{ hashFiles('requirements/*.txt') }}
    - name: Set up Python ${{ matrix.python-version }}
      uses: actions/setup-python@v4
      with:
        python-version: ${{ matrix.python-version }}
    # Only rebuild the environment when the cache restore step missed.
    - name: Setup virtual environment (no cache hit)
      if: steps.virtualenv-cache.outputs.cache-hit != 'true'
      run: |
        python${{ matrix.python-version}} -m venv .venv
        source .venv/bin/activate
        mkdir "$NLTK_DATA"
        make install-ci
    - name: Test
      env:
        UNS_API_KEY: ${{ secrets.UNS_API_KEY }}
      run: |
        source .venv/bin/activate
        sudo apt-get update
        sudo apt-get install -y poppler-utils
        make install-pandoc
        sudo add-apt-repository -y ppa:alex-p/tesseract-ocr5
        sudo apt-get install -y tesseract-ocr tesseract-ocr-kor
        tesseract --version
        make test-chipper CI=true UNSTRUCTURED_INCLUDE_DEBUG_METADATA=true
|
||||
|
||||
test_unit_no_extras:
|
||||
strategy:
|
||||
matrix:
|
||||
@ -463,6 +504,5 @@ jobs:
|
||||
run: |
|
||||
source .venv/bin/activate
|
||||
echo "UNS_API_KEY=${{ secrets.UNS_API_KEY }}" > uns_test_env_file
|
||||
echo "UNSTRUCTURED_HF_TOKEN=${{ secrets.HF_TOKEN }}" > uns_test_env_file
|
||||
make docker-build
|
||||
make docker-test CI=true UNSTRUCTURED_INCLUDE_DEBUG_METADATA=true
|
||||
|
||||
@ -2,6 +2,7 @@
|
||||
|
||||
### Enhancements
|
||||
|
||||
* **Separate chipper tests** Chipper tests are long-running and require special access, so the tests have been separated into their own file under their own marker, and now have a separate `make` target.
|
||||
* **Add include_header argument for partition_csv and partition_tsv** Now supports retaining header rows when partitioning CSV and TSV documents into elements.
|
||||
* **Add retry logic for all source connectors** All HTTP calls being made by the ingest source connectors have been isolated and wrapped by the `SourceConnectionNetworkError` custom error, which triggers the retry logic, if enabled, in the ingest pipeline.
|
||||
* **Google Drive source connector supports credentials from memory** Originally, the connector expected a filepath to pull the credentials from when creating the client. This was expanded to support passing that information from memory as a dict if access to the file system might not be available.
|
||||
|
||||
9
Makefile
9
Makefile
@ -256,7 +256,12 @@ export UNSTRUCTURED_INCLUDE_DEBUG_METADATA ?= false
|
||||
# Run the test suite with coverage, deselecting tests that carry the `chipper`
# pytest marker (those are run by the `test-chipper` target instead).
# Only the marker-filtered pytest invocation is kept; the earlier unfiltered
# invocation is superseded by it.
.PHONY: test
test:
	PYTHONPATH=. CI=$(CI) \
	UNSTRUCTURED_INCLUDE_DEBUG_METADATA=$(UNSTRUCTURED_INCLUDE_DEBUG_METADATA) pytest test_${PACKAGE_NAME} -m "not chipper" --cov=${PACKAGE_NAME} --cov-report term-missing --durations=40
|
||||
|
||||
# Run only the tests selected by the `chipper` pytest marker, reporting
# coverage for the package and the 40 slowest test durations.
.PHONY: test-chipper
test-chipper:
	PYTHONPATH=. CI=$(CI) \
	UNSTRUCTURED_INCLUDE_DEBUG_METADATA=$(UNSTRUCTURED_INCLUDE_DEBUG_METADATA) \
	pytest test_$(PACKAGE_NAME) -m "chipper" --cov=$(PACKAGE_NAME) \
		--cov-report term-missing --durations=40
|
||||
|
||||
.PHONY: test-unstructured-api-unit
|
||||
test-unstructured-api-unit:
|
||||
@ -408,7 +413,7 @@ docker-test:
|
||||
$(DOCKER_IMAGE) \
|
||||
bash -c "CI=$(CI) \
|
||||
UNSTRUCTURED_INCLUDE_DEBUG_METADATA=$(UNSTRUCTURED_INCLUDE_DEBUG_METADATA) \
|
||||
pytest $(if $(TEST_FILE),$(TEST_FILE),test_unstructured)"
|
||||
pytest -m 'not chipper' $(if $(TEST_FILE),$(TEST_FILE),test_unstructured)"
|
||||
|
||||
.PHONY: docker-smoke-test
|
||||
docker-smoke-test:
|
||||
|
||||
@ -11,6 +11,8 @@ filterwarnings =
|
||||
ignore::DeprecationWarning
|
||||
python_classes = Test Describe
|
||||
python_functions = test_ it_ they_ but_ and_
|
||||
markers =
|
||||
chipper: mark a test as running chipper, which tends to be slow and compute-heavy.
|
||||
|
||||
[autoflake]
|
||||
expand_star_imports=true
|
||||
|
||||
31
test_unstructured/partition/pdf_image/test_chipper.py
Normal file
31
test_unstructured/partition/pdf_image/test_chipper.py
Normal file
@ -0,0 +1,31 @@
|
||||
import pytest
|
||||
|
||||
from unstructured.partition import pdf
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
def chipper_results():
    """Partition the sample PDF with the chipper model.

    The fixture is session-scoped, so the `hi_res` partitioning call runs once
    and its result is shared by every test that requests it.
    """
    sample_pdf = "example-docs/layout-parser-paper-fast.pdf"
    return pdf.partition_pdf(sample_pdf, strategy="hi_res", model_name="chipper")
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
def chipper_children(chipper_results):
    """Return the elements of `chipper_results` whose metadata has a parent id set."""
    children = []
    for element in chipper_results:
        if element.metadata.parent_id is not None:
            children.append(element)
    return children
|
||||
|
||||
|
||||
@pytest.mark.chipper()
def test_chipper_has_hierarchy(chipper_children):
    """The chipper output must contain at least one element that has a parent."""
    has_children = bool(chipper_children)
    assert has_children
|
||||
|
||||
|
||||
@pytest.mark.chipper()
def test_chipper_not_losing_parents(chipper_results, chipper_children):
    """Every child's `parent_id` must equal the `id` of some element in the results."""
    for child in chipper_children:
        parent_id = child.metadata.parent_id
        # Passes for this child as soon as one element with a matching id is found.
        assert any(element.id == parent_id for element in chipper_results)
|
||||
@ -892,32 +892,6 @@ def test_check_annotations_within_element(threshold, expected):
|
||||
assert results == expected
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def chipper_results():
|
||||
elements = pdf.partition_pdf(
|
||||
"example-docs/layout-parser-paper-fast.pdf",
|
||||
strategy="hi_res",
|
||||
model_name="chipper",
|
||||
)
|
||||
return elements
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def chipper_children(chipper_results):
|
||||
return [el for el in chipper_results if el.metadata.parent_id is not None]
|
||||
|
||||
|
||||
def test_chipper_has_hierarchy(chipper_children):
|
||||
assert chipper_children
|
||||
|
||||
|
||||
def test_chipper_not_losing_parents(chipper_results, chipper_children):
|
||||
assert all(
|
||||
[el for el in chipper_results if el.id == child.metadata.parent_id]
|
||||
for child in chipper_children
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
("infer_table_structure", "env", "expected"),
|
||||
[
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user