mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-11-10 23:47:36 +00:00
tests: separate chipper tests (#1939)
Separates chipper tests to speed up testing and CI.
This commit is contained in:
parent
123ad20f4c
commit
b08562ba1a
44
.github/workflows/ci.yml
vendored
44
.github/workflows/ci.yml
vendored
@ -104,7 +104,6 @@ jobs:
|
|||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
env:
|
env:
|
||||||
NLTK_DATA: ${{ github.workspace }}/nltk_data
|
NLTK_DATA: ${{ github.workspace }}/nltk_data
|
||||||
UNSTRUCTURED_HF_TOKEN: ${{ secrets.HF_TOKEN }}
|
|
||||||
needs: [setup, lint]
|
needs: [setup, lint]
|
||||||
steps:
|
steps:
|
||||||
- uses: actions/checkout@v3
|
- uses: actions/checkout@v3
|
||||||
@ -142,6 +141,48 @@ jobs:
|
|||||||
make test CI=true UNSTRUCTURED_INCLUDE_DEBUG_METADATA=true
|
make test CI=true UNSTRUCTURED_INCLUDE_DEBUG_METADATA=true
|
||||||
make check-coverage
|
make check-coverage
|
||||||
|
|
||||||
|
test_chipper:
|
||||||
|
strategy:
|
||||||
|
matrix:
|
||||||
|
python-version: ["3.10"]
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
env:
|
||||||
|
NLTK_DATA: ${{ github.workspace }}/nltk_data
|
||||||
|
UNSTRUCTURED_HF_TOKEN: ${{ secrets.HF_TOKEN }}
|
||||||
|
needs: [setup, lint]
|
||||||
|
steps:
|
||||||
|
- uses: actions/checkout@v3
|
||||||
|
- uses: actions/cache/restore@v3
|
||||||
|
id: virtualenv-cache
|
||||||
|
with:
|
||||||
|
path: |
|
||||||
|
.venv
|
||||||
|
nltk_data
|
||||||
|
key: unstructured-${{ runner.os }}-${{ matrix.python-version }}-${{ env.GHA_CACHE_KEY_VERSION }}-${{ hashFiles('requirements/*.txt') }}
|
||||||
|
- name: Set up Python ${{ matrix.python-version }}
|
||||||
|
uses: actions/setup-python@v4
|
||||||
|
with:
|
||||||
|
python-version: ${{ matrix.python-version }}
|
||||||
|
- name: Setup virtual environment (no cache hit)
|
||||||
|
if: steps.virtualenv-cache.outputs.cache-hit != 'true'
|
||||||
|
run: |
|
||||||
|
python${{ matrix.python-version}} -m venv .venv
|
||||||
|
source .venv/bin/activate
|
||||||
|
mkdir "$NLTK_DATA"
|
||||||
|
make install-ci
|
||||||
|
- name: Test
|
||||||
|
env:
|
||||||
|
UNS_API_KEY: ${{ secrets.UNS_API_KEY }}
|
||||||
|
run: |
|
||||||
|
source .venv/bin/activate
|
||||||
|
sudo apt-get update
|
||||||
|
sudo apt-get install -y poppler-utils
|
||||||
|
make install-pandoc
|
||||||
|
sudo add-apt-repository -y ppa:alex-p/tesseract-ocr5
|
||||||
|
sudo apt-get install -y tesseract-ocr tesseract-ocr-kor
|
||||||
|
tesseract --version
|
||||||
|
make test-chipper CI=true UNSTRUCTURED_INCLUDE_DEBUG_METADATA=true
|
||||||
|
|
||||||
test_unit_no_extras:
|
test_unit_no_extras:
|
||||||
strategy:
|
strategy:
|
||||||
matrix:
|
matrix:
|
||||||
@ -463,6 +504,5 @@ jobs:
|
|||||||
run: |
|
run: |
|
||||||
source .venv/bin/activate
|
source .venv/bin/activate
|
||||||
echo "UNS_API_KEY=${{ secrets.UNS_API_KEY }}" > uns_test_env_file
|
echo "UNS_API_KEY=${{ secrets.UNS_API_KEY }}" > uns_test_env_file
|
||||||
echo "UNSTRUCTURED_HF_TOKEN=${{ secrets.HF_TOKEN }}" > uns_test_env_file
|
|
||||||
make docker-build
|
make docker-build
|
||||||
make docker-test CI=true UNSTRUCTURED_INCLUDE_DEBUG_METADATA=true
|
make docker-test CI=true UNSTRUCTURED_INCLUDE_DEBUG_METADATA=true
|
||||||
|
|||||||
@ -2,6 +2,7 @@
|
|||||||
|
|
||||||
### Enhancements
|
### Enhancements
|
||||||
|
|
||||||
|
* **Separate chipper tests** Chipper tests are long-running and require special access, so the tests have been separated into their own file under their own marker, and now have a separate `make` target.
|
||||||
* **Add include_header argument for partition_csv and partition_tsv** Now supports retaining header rows in CSV and TSV documents element partitioning.
|
* **Add include_header argument for partition_csv and partition_tsv** Now supports retaining header rows in CSV and TSV documents element partitioning.
|
||||||
* **Add retry logic for all source connectors** All http calls being made by the ingest source connectors have been isolated and wrapped by the `SourceConnectionNetworkError` custom error, which triggers the retry logic, if enabled, in the ingest pipeline.
|
* **Add retry logic for all source connectors** All http calls being made by the ingest source connectors have been isolated and wrapped by the `SourceConnectionNetworkError` custom error, which triggers the retry logic, if enabled, in the ingest pipeline.
|
||||||
* **Google Drive source connector supports credentials from memory** Originally, the connector expected a filepath to pull the credentials from when creating the client. This was expanded to support passing that information from memory as a dict if access to the file system might not be available.
|
* **Google Drive source connector supports credentials from memory** Originally, the connector expected a filepath to pull the credentials from when creating the client. This was expanded to support passing that information from memory as a dict if access to the file system might not be available.
|
||||||
|
|||||||
9
Makefile
9
Makefile
@ -256,7 +256,12 @@ export UNSTRUCTURED_INCLUDE_DEBUG_METADATA ?= false
|
|||||||
.PHONY: test
|
.PHONY: test
|
||||||
test:
|
test:
|
||||||
PYTHONPATH=. CI=$(CI) \
|
PYTHONPATH=. CI=$(CI) \
|
||||||
UNSTRUCTURED_INCLUDE_DEBUG_METADATA=$(UNSTRUCTURED_INCLUDE_DEBUG_METADATA) pytest test_${PACKAGE_NAME} --cov=${PACKAGE_NAME} --cov-report term-missing --durations=40
|
UNSTRUCTURED_INCLUDE_DEBUG_METADATA=$(UNSTRUCTURED_INCLUDE_DEBUG_METADATA) pytest test_${PACKAGE_NAME} -m "not chipper" --cov=${PACKAGE_NAME} --cov-report term-missing --durations=40
|
||||||
|
|
||||||
|
.PHONY: test-chipper
|
||||||
|
test-chipper:
|
||||||
|
PYTHONPATH=. CI=$(CI) \
|
||||||
|
UNSTRUCTURED_INCLUDE_DEBUG_METADATA=$(UNSTRUCTURED_INCLUDE_DEBUG_METADATA) pytest test_${PACKAGE_NAME} -m "chipper" --cov=${PACKAGE_NAME} --cov-report term-missing --durations=40
|
||||||
|
|
||||||
.PHONY: test-unstructured-api-unit
|
.PHONY: test-unstructured-api-unit
|
||||||
test-unstructured-api-unit:
|
test-unstructured-api-unit:
|
||||||
@ -408,7 +413,7 @@ docker-test:
|
|||||||
$(DOCKER_IMAGE) \
|
$(DOCKER_IMAGE) \
|
||||||
bash -c "CI=$(CI) \
|
bash -c "CI=$(CI) \
|
||||||
UNSTRUCTURED_INCLUDE_DEBUG_METADATA=$(UNSTRUCTURED_INCLUDE_DEBUG_METADATA) \
|
UNSTRUCTURED_INCLUDE_DEBUG_METADATA=$(UNSTRUCTURED_INCLUDE_DEBUG_METADATA) \
|
||||||
pytest $(if $(TEST_FILE),$(TEST_FILE),test_unstructured)"
|
pytest -m 'not chipper' $(if $(TEST_FILE),$(TEST_FILE),test_unstructured)"
|
||||||
|
|
||||||
.PHONY: docker-smoke-test
|
.PHONY: docker-smoke-test
|
||||||
docker-smoke-test:
|
docker-smoke-test:
|
||||||
|
|||||||
@ -11,6 +11,8 @@ filterwarnings =
|
|||||||
ignore::DeprecationWarning
|
ignore::DeprecationWarning
|
||||||
python_classes = Test Describe
|
python_classes = Test Describe
|
||||||
python_functions = test_ it_ they_ but_ and_
|
python_functions = test_ it_ they_ but_ and_
|
||||||
|
markers =
|
||||||
|
chipper: mark a test as running chipper, which tends to be slow and compute-heavy.
|
||||||
|
|
||||||
[autoflake]
|
[autoflake]
|
||||||
expand_star_imports=true
|
expand_star_imports=true
|
||||||
|
|||||||
31
test_unstructured/partition/pdf_image/test_chipper.py
Normal file
31
test_unstructured/partition/pdf_image/test_chipper.py
Normal file
@ -0,0 +1,31 @@
|
|||||||
|
import pytest
|
||||||
|
|
||||||
|
from unstructured.partition import pdf
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="session")
|
||||||
|
def chipper_results():
|
||||||
|
elements = pdf.partition_pdf(
|
||||||
|
"example-docs/layout-parser-paper-fast.pdf",
|
||||||
|
strategy="hi_res",
|
||||||
|
model_name="chipper",
|
||||||
|
)
|
||||||
|
return elements
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="session")
|
||||||
|
def chipper_children(chipper_results):
|
||||||
|
return [el for el in chipper_results if el.metadata.parent_id is not None]
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.chipper()
|
||||||
|
def test_chipper_has_hierarchy(chipper_children):
|
||||||
|
assert chipper_children
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.chipper()
|
||||||
|
def test_chipper_not_losing_parents(chipper_results, chipper_children):
|
||||||
|
assert all(
|
||||||
|
[el for el in chipper_results if el.id == child.metadata.parent_id]
|
||||||
|
for child in chipper_children
|
||||||
|
)
|
||||||
@ -892,32 +892,6 @@ def test_check_annotations_within_element(threshold, expected):
|
|||||||
assert results == expected
|
assert results == expected
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="session")
|
|
||||||
def chipper_results():
|
|
||||||
elements = pdf.partition_pdf(
|
|
||||||
"example-docs/layout-parser-paper-fast.pdf",
|
|
||||||
strategy="hi_res",
|
|
||||||
model_name="chipper",
|
|
||||||
)
|
|
||||||
return elements
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="session")
|
|
||||||
def chipper_children(chipper_results):
|
|
||||||
return [el for el in chipper_results if el.metadata.parent_id is not None]
|
|
||||||
|
|
||||||
|
|
||||||
def test_chipper_has_hierarchy(chipper_children):
|
|
||||||
assert chipper_children
|
|
||||||
|
|
||||||
|
|
||||||
def test_chipper_not_losing_parents(chipper_results, chipper_children):
|
|
||||||
assert all(
|
|
||||||
[el for el in chipper_results if el.id == child.metadata.parent_id]
|
|
||||||
for child in chipper_children
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
("infer_table_structure", "env", "expected"),
|
("infer_table_structure", "env", "expected"),
|
||||||
[
|
[
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user