test: create CI pipelines for verifying base and extras pass respective tests (#1137)

**Summary**
Closes #747
* Create CI Pipeline for running text, xml, email, and html doc tests
against the library installed without extras
* Create CI Pipeline for running each library extra against its
respective tests
This commit is contained in:
Newel H 2023-08-19 12:56:13 -04:00 committed by GitHub
parent 69edffb0c0
commit e4aa7373e2
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
20 changed files with 190 additions and 32 deletions

View File

@ -152,6 +152,87 @@ jobs:
make test CI=true
make check-coverage
# Job: run the no-extras test subset (`make test-no-extras`) inside a virtualenv
# named .venv-base that is populated with `make install-base-ci`.
test_unit_no_extras:
strategy:
matrix:
python-version: ["3.8"]
runs-on: ubuntu-latest
env:
NLTK_DATA: ${{ github.workspace }}/nltk_data
needs: [setup, lint]
steps:
- uses: actions/checkout@v3
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}
# Restore-only cache step (actions/cache/restore): this job never saves a cache itself.
# NOTE(review): presumably the `setup` job saves a cache under this `-base` key - confirm.
- uses: actions/cache/restore@v3
id: virtualenv-cache
with:
path: |
nltk_data
.venv-base
key: unstructured-${{ runner.os }}-${{ matrix.python-version }}-${{ env.GHA_CACHE_KEY_VERSION }}-${{ hashFiles('requirements/*.txt') }}-base
# Only runs when the cache restore above did not report a hit; builds .venv-base from scratch.
- name: Setup virtual environment
if: steps.virtualenv-cache.outputs.cache-hit != 'true'
run: |
python${{ matrix.python-version}} -m venv .venv-base
source .venv-base/bin/activate
mkdir "$NLTK_DATA"
make install-base-ci
# Runs the no-extras test target with CI=true inside the base virtualenv.
- name: Test
env:
UNS_API_KEY: ${{ secrets.UNS_API_KEY }}
run: |
source .venv-base/bin/activate
make test-no-extras CI=true
# Job: for each entry in the `extra` matrix, build a dedicated virtualenv
# (.venv-<extra>), install the base package plus that one extra, and run
# `make test-extra-<extra>`. Runs only after test_unit_no_extras has finished.
test_unit_dependency_extras:
# NOTE(newelh) - Split extras into separate steps in the same pipeline (avoid using matrix)
# NOTE(review): the job currently does use a matrix over `extra`; the note above
# presumably describes intended follow-up work - confirm.
strategy:
matrix:
python-version: ["3.8"]
extra: ["csv", "docx", "odt", "markdown", "pypandoc", "msg", "pdf-image", "pptx", "xlsx"]
runs-on: ubuntu-latest
env:
NLTK_DATA: ${{ github.workspace }}/nltk_data
needs: [setup, lint, test_unit_no_extras]
steps:
- uses: actions/checkout@v3
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}
# Restores nltk_data only; the per-extra virtualenv is not cached.
# NOTE(review): this reuses the `-base` key that test_unit_no_extras pairs with
# the path list `nltk_data` + `.venv-base`. actions/cache derives a cache version
# from the path list, so a cache saved with the two-path list may never match
# here - confirm whether this step ever reports a hit.
# NOTE(review): no later step reads the `virtualenv-cache` outputs in this job.
- uses: actions/cache/restore@v3
id: virtualenv-cache
with:
path: |
nltk_data
key: unstructured-${{ runner.os }}-${{ matrix.python-version }}-${{ env.GHA_CACHE_KEY_VERSION }}-${{ hashFiles('requirements/*.txt') }}-base
# Always runs (no `if:` guard): creates .venv-<extra>, then installs the base
# package followed by the single extra under test.
- name: Setup virtual environment
run: |
python${{ matrix.python-version}} -m venv .venv-${{ matrix.extra }}
source .venv-${{ matrix.extra }}/bin/activate
make install-base-ci
make install-${{ matrix.extra }}
# Installs system packages (libmagic, poppler, libreoffice, pandoc, tesseract) and
# two extra pip packages for every matrix entry, regardless of which extra is
# being tested, then runs that extra's test target.
- name: Test
env:
UNS_API_KEY: ${{ secrets.UNS_API_KEY }}
run: |
source .venv-${{ matrix.extra }}/bin/activate
# NOTE(newelh) - determine what needs to be installed here
sudo apt-get update
sudo apt-get install -y libmagic-dev poppler-utils libreoffice
make install-pandoc
sudo add-apt-repository -y ppa:alex-p/tesseract-ocr5
sudo apt-get install -y tesseract-ocr tesseract-ocr-kor
tesseract --version
# NOTE(robinson) - Installing weaviate-client separately here because the requests
# version conflicts with label_studio_sdk
pip install weaviate-client
pip install argilla
make test-extra-${{ matrix.extra }} CI=true
test_ingest:
strategy:
matrix:

View File

@ -1,6 +1,11 @@
## 0.10.5-dev0
### Enhancements
* Create new CI Pipelines
- Checking text, xml, email, and html doc tests against the library installed without extras
- Checking each library extra against its respective tests
## 0.10.3
* Adds ability to reuse connections per process in unstructured-ingest
* Pass ocr_mode in partition_pdf and set the default back to individual pages for now

View File

@ -23,6 +23,9 @@ install: install-base-pip-packages install-dev install-nltk-models install-test
.PHONY: install-ci
install-ci: install-base-pip-packages install-nltk-models install-huggingface install-all-docs install-test
# install-base-ci: same prerequisites as install-ci, minus install-huggingface
# and install-all-docs. The CI workflow uses it to populate the no-extras and
# per-extra virtualenvs.
.PHONY: install-base-ci
install-base-ci: install-base-pip-packages install-nltk-models install-test
.PHONY: install-base-pip-packages
install-base-pip-packages:
python3 -m pip install pip==${PIP_VERSION}
@ -253,6 +256,65 @@ test:
test-unstructured-api-unit:
scripts/test-unstructured-api-unit.sh
# test-no-extras: runs pytest on the text, email, HTML and XML partition test
# modules only. CI runs this target against the library installed without extras.
.PHONY: test-no-extras
# TODO(newelh): add the JSON partition test to this list once it is fixed.
test-no-extras:
PYTHONPATH=. CI=$(CI) pytest \
test_${PACKAGE_NAME}/partition/test_text.py \
test_${PACKAGE_NAME}/partition/test_email.py \
test_${PACKAGE_NAME}/partition/test_html_partition.py \
test_${PACKAGE_NAME}/partition/test_xml_partition.py
# Per-extra test targets. Each one runs pytest on a single directory under
# test_${PACKAGE_NAME}/partition; the CI workflow invokes them as
# `make test-extra-<extra> CI=true`, so the target suffix must match the name
# used in the workflow's `extra` matrix.
.PHONY: test-extra-csv
test-extra-csv:
PYTHONPATH=. CI=$(CI) pytest \
test_${PACKAGE_NAME}/partition/csv
.PHONY: test-extra-docx
test-extra-docx:
PYTHONPATH=. CI=$(CI) pytest \
test_${PACKAGE_NAME}/partition/docx
.PHONY: test-extra-markdown
test-extra-markdown:
PYTHONPATH=. CI=$(CI) pytest \
test_${PACKAGE_NAME}/partition/markdown
.PHONY: test-extra-msg
test-extra-msg:
PYTHONPATH=. CI=$(CI) pytest \
test_${PACKAGE_NAME}/partition/msg
.PHONY: test-extra-odt
test-extra-odt:
PYTHONPATH=. CI=$(CI) pytest \
test_${PACKAGE_NAME}/partition/odt
.PHONY: test-extra-pdf-image
test-extra-pdf-image:
PYTHONPATH=. CI=$(CI) pytest \
test_${PACKAGE_NAME}/partition/pdf-image
.PHONY: test-extra-pptx
test-extra-pptx:
PYTHONPATH=. CI=$(CI) pytest \
test_${PACKAGE_NAME}/partition/pptx
# test-extra-epub: runs pytest on the epub partition tests. The rule name must
# match the .PHONY declaration and stay distinct from test-extra-pypandoc;
# two rules with the same name make GNU make discard the earlier recipe.
.PHONY: test-extra-epub
test-extra-epub:
	PYTHONPATH=. CI=$(CI) pytest \
		test_${PACKAGE_NAME}/partition/epub
# test-extra-pypandoc: runs pytest on test_${PACKAGE_NAME}/partition/pypandoc.
.PHONY: test-extra-pypandoc
test-extra-pypandoc:
PYTHONPATH=. CI=$(CI) pytest \
test_${PACKAGE_NAME}/partition/pypandoc
# test-extra-xlsx: runs pytest on test_${PACKAGE_NAME}/partition/xlsx.
.PHONY: test-extra-xlsx
test-extra-xlsx:
PYTHONPATH=. CI=$(CI) pytest \
test_${PACKAGE_NAME}/partition/xlsx
## check: runs linters (includes tests)
.PHONY: check
check: check-src check-tests check-version

View File

@ -19,7 +19,7 @@ expected_sections = {
def test_partition_epub_from_filename():
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "winter-sports.epub")
filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "winter-sports.epub")
elements = partition_epub(filename=filename)
assert len(elements) > 0
assert elements[0].text.startswith("The Project Gutenberg eBook of Winter Sports")
@ -32,7 +32,7 @@ def test_partition_epub_from_filename():
def test_partition_epub_from_filename_with_metadata_filename():
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "winter-sports.epub")
filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "winter-sports.epub")
elements = partition_epub(filename=filename, metadata_filename="test")
assert len(elements) > 0
assert all(element.metadata.filename == "test" for element in elements)
@ -40,7 +40,7 @@ def test_partition_epub_from_filename_with_metadata_filename():
def test_partition_epub_from_file():
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "winter-sports.epub")
filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "winter-sports.epub")
with open(filename, "rb") as f:
elements = partition_epub(file=f)
assert len(elements) > 0
@ -53,7 +53,7 @@ def test_partition_epub_from_file():
def test_partition_epub_from_file_with_metadata_filename():
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "winter-sports.epub")
filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "winter-sports.epub")
with open(filename, "rb") as f:
elements = partition_epub(file=f, metadata_filename="test")
assert len(elements) > 0
@ -62,7 +62,7 @@ def test_partition_epub_from_file_with_metadata_filename():
def test_partition_epub_from_filename_exclude_metadata():
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "winter-sports.epub")
filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "winter-sports.epub")
elements = partition_epub(filename=filename, include_metadata=False)
assert elements[0].metadata.filetype is None
assert elements[0].metadata.page_name is None
@ -71,7 +71,7 @@ def test_partition_epub_from_filename_exclude_metadata():
def test_partition_epub_from_file_exlcude_metadata():
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "winter-sports.epub")
filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "winter-sports.epub")
with open(filename, "rb") as f:
elements = partition_epub(file=f, include_metadata=False)
assert elements[0].metadata.filetype is None

View File

@ -11,7 +11,7 @@ DIRECTORY = pathlib.Path(__file__).parent.resolve()
def test_partition_md_from_filename():
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "README.md")
filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "README.md")
elements = partition_md(filename=filename)
assert "PageBreak" not in [elem.category for elem in elements]
assert len(elements) > 0
@ -20,7 +20,7 @@ def test_partition_md_from_filename():
def test_partition_md_from_filename_with_metadata_filename():
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "README.md")
filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "README.md")
elements = partition_md(filename=filename, metadata_filename="test")
assert "PageBreak" not in [elem.category for elem in elements]
assert len(elements) > 0
@ -29,7 +29,7 @@ def test_partition_md_from_filename_with_metadata_filename():
def test_partition_md_from_file():
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "README.md")
filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "README.md")
with open(filename) as f:
elements = partition_md(file=f)
assert len(elements) > 0
@ -38,7 +38,7 @@ def test_partition_md_from_file():
def test_partition_md_from_file_with_metadata_filename():
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "README.md")
filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "README.md")
with open(filename) as f:
elements = partition_md(file=f, metadata_filename="test")
assert len(elements) > 0
@ -46,7 +46,7 @@ def test_partition_md_from_file_with_metadata_filename():
def test_partition_md_from_text():
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "README.md")
filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "README.md")
with open(filename) as f:
text = f.read()
elements = partition_md(text=text)
@ -64,7 +64,7 @@ class MockResponse:
def test_partition_md_from_url():
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "README.md")
filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "README.md")
with open(filename) as f:
text = f.read()
@ -82,7 +82,7 @@ def test_partition_md_from_url():
def test_partition_md_from_url_raises_with_bad_status_code():
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "README.md")
filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "README.md")
with open(filename) as f:
text = f.read()
@ -97,7 +97,7 @@ def test_partition_md_from_url_raises_with_bad_status_code():
def test_partition_md_from_url_raises_with_bad_content_type():
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "README.md")
filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "README.md")
with open(filename) as f:
text = f.read()
@ -117,7 +117,7 @@ def test_partition_md_raises_with_none_specified():
def test_partition_md_raises_with_too_many_specified():
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "README.md")
filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "README.md")
with open(filename) as f:
text = f.read()
@ -126,14 +126,14 @@ def test_partition_md_raises_with_too_many_specified():
def test_partition_md_from_filename_exclude_metadata():
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "README.md")
filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "README.md")
elements = partition_md(filename=filename, include_metadata=False)
for i in range(len(elements)):
assert elements[i].metadata.to_dict() == {}
def test_partition_md_from_file_exclude_metadata():
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "README.md")
filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "README.md")
with open(filename) as f:
elements = partition_md(file=f, include_metadata=False)
for i in range(len(elements)):
@ -141,7 +141,7 @@ def test_partition_md_from_file_exclude_metadata():
def test_partition_md_from_text_exclude_metadata():
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "README.md")
filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "README.md")
with open(filename) as f:
text = f.read()
elements = partition_md(text=text, include_metadata=False)

View File

@ -14,7 +14,7 @@ from unstructured.partition.msg import extract_msg_attachment_info, partition_ms
from unstructured.partition.text import partition_text
DIRECTORY = pathlib.Path(__file__).parent.resolve()
EXAMPLE_DOCS_DIRECTORY = os.path.join(DIRECTORY, "..", "..", "example-docs")
EXAMPLE_DOCS_DIRECTORY = os.path.join(DIRECTORY, "..", "..", "..", "example-docs")
EXPECTED_MSG_OUTPUT = [
NarrativeText(text="This is a test email to use for unit tests."),
@ -104,6 +104,7 @@ def test_extract_attachment_info():
DIRECTORY,
"..",
"..",
"..",
"example-docs",
"fake-email-attachment.msg",
)

View File

@ -5,7 +5,7 @@ from unstructured.documents.elements import Title
from unstructured.partition.odt import partition_odt
DIRECTORY = pathlib.Path(__file__).parent.resolve()
EXAMPLE_DOCS_DIRECTORY = os.path.join(DIRECTORY, "..", "..", "example-docs")
EXAMPLE_DOCS_DIRECTORY = os.path.join(DIRECTORY, "..", "..", "..", "example-docs")
def test_partition_odt_from_filename():

View File

@ -177,6 +177,7 @@ def test_partition_image_with_ocr_detects_korean():
DIRECTORY,
"..",
"..",
"..",
"example-docs",
"english-and-korean.png",
)
@ -191,7 +192,7 @@ def test_partition_image_with_ocr_detects_korean():
def test_partition_image_with_ocr_detects_korean_from_file():
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "english-and-korean.png")
filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "english-and-korean.png")
with open(filename, "rb") as f:
elements = image.partition_image(
file=f,
@ -208,6 +209,7 @@ def test_partition_image_raises_with_bad_strategy():
DIRECTORY,
"..",
"..",
"..",
"example-docs",
"english-and-korean.png",
)
@ -216,7 +218,14 @@ def test_partition_image_raises_with_bad_strategy():
def test_partition_image_default_strategy_hi_res():
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "layout-parser-paper-fast.jpg")
filename = os.path.join(
DIRECTORY,
"..",
"..",
"..",
"example-docs",
"layout-parser-paper-fast.jpg",
)
with open(filename, "rb") as f:
elements = image.partition_image(file=f)

View File

@ -7,7 +7,7 @@ from unstructured.documents.elements import ListItem, NarrativeText, Title
from unstructured.partition.ppt import partition_ppt
DIRECTORY = pathlib.Path(__file__).parent.resolve()
EXAMPLE_DOCS_DIRECTORY = os.path.join(DIRECTORY, "..", "..", "example-docs")
EXAMPLE_DOCS_DIRECTORY = os.path.join(DIRECTORY, "..", "..", "..", "example-docs")
EXPECTED_PPT_OUTPUT = [
Title(text="Adding a Bullet Slide"),

View File

@ -14,7 +14,7 @@ from unstructured.documents.elements import (
from unstructured.partition.pptx import partition_pptx
DIRECTORY = pathlib.Path(__file__).parent.resolve()
EXAMPLE_DOCS_DIRECTORY = os.path.join(DIRECTORY, "..", "..", "example-docs")
EXAMPLE_DOCS_DIRECTORY = os.path.join(DIRECTORY, "..", "..", "..", "example-docs")
EXPECTED_PPTX_OUTPUT = [
Title(text="Adding a Bullet Slide"),

View File

@ -8,7 +8,7 @@ DIRECTORY = pathlib.Path(__file__).parent.resolve()
def test_partition_rtf_from_filename():
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-doc.rtf")
filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "fake-doc.rtf")
elements = partition_rtf(filename=filename)
assert len(elements) > 0
assert elements[0] == Title("My First Heading")
@ -17,14 +17,14 @@ def test_partition_rtf_from_filename():
def test_partition_rtf_from_filename_with_metadata_filename():
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-doc.rtf")
filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "fake-doc.rtf")
elements = partition_rtf(filename=filename, metadata_filename="test")
assert len(elements) > 0
assert all(element.metadata.filename == "test" for element in elements)
def test_partition_rtf_from_file():
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-doc.rtf")
filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "fake-doc.rtf")
with open(filename, "rb") as f:
elements = partition_rtf(file=f)
assert len(elements) > 0
@ -34,7 +34,7 @@ def test_partition_rtf_from_file():
def test_partition_rtf_from_file_with_metadata_filename():
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-doc.rtf")
filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "fake-doc.rtf")
with open(filename, "rb") as f:
elements = partition_rtf(file=f, metadata_filename="test")
assert elements[0] == Title("My First Heading")
@ -43,14 +43,14 @@ def test_partition_rtf_from_file_with_metadata_filename():
def test_partition_rtf_from_filename_exclude_metadata():
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-doc.rtf")
filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "fake-doc.rtf")
elements = partition_rtf(filename=filename, include_metadata=False)
for i in range(len(elements)):
assert elements[i].metadata.to_dict() == {}
def test_partition_rtf_from_file_exclude_metadata():
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-doc.rtf")
filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "fake-doc.rtf")
with open(filename, "rb") as f:
elements = partition_rtf(file=f, include_metadata=False)
for i in range(len(elements)):

View File

@ -25,8 +25,8 @@ from unstructured.logger import logger
from unstructured.nlp.patterns import ENUMERATED_BULLETS_RE, UNICODE_BULLETS_RE
from unstructured.utils import dependency_exists
if dependency_exists("docx"):
import docx.table as docxtable
if dependency_exists("docx") and dependency_exists("docx.table"):
from docx.table import Table as docxtable
if TYPE_CHECKING:
from unstructured_inference.inference.layoutelement import (