mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-12-31 01:03:30 +00:00
449 lines
17 KiB
YAML
449 lines
17 KiB
YAML
# Workflow-level settings: display name, triggering events, and shared environment.
name: CI

# Triggered by pushes to main, pull requests targeting main, and merge-queue groups for main.
on:
  push:
    branches:
      - main
  pull_request:
    branches:
      - main
  merge_group:
    branches:
      - main

env:
  # Interpolated into every cache key in this workflow; a different value yields different keys.
  GHA_CACHE_KEY_VERSION: "v1"
|
|
|
|
jobs:
|
|
# Prepares `.venv` and `nltk_data` for each Python version. The cache step only
# looks the key up (lookup-only); Python setup and the install are skipped when
# that lookup reports a hit.
setup:
  runs-on: ubuntu-latest
  strategy:
    matrix:
      python-version:
        - "3.8"
        - "3.9"
        - "3.10"
        - "3.11"
  env:
    NLTK_DATA: ${{ github.workspace }}/nltk_data
  steps:
    - uses: actions/checkout@v3
    - id: virtualenv-cache
      uses: actions/cache@v3
      with:
        # Check for the key without downloading the cached files.
        lookup-only: true
        key: unstructured-${{ runner.os }}-${{ matrix.python-version }}-${{ env.GHA_CACHE_KEY_VERSION }}-${{ hashFiles('requirements/*.txt') }}
        path: |
          .venv
          nltk_data
    - name: Set up Python ${{ matrix.python-version }}
      if: steps.virtualenv-cache.outputs.cache-hit != 'true'
      uses: actions/setup-python@v4
      with:
        python-version: ${{ matrix.python-version }}
    - name: Setup virtual environment (no cache hit)
      if: steps.virtualenv-cache.outputs.cache-hit != 'true'
      run: |
        python${{ matrix.python-version }} -m venv .venv
        source .venv/bin/activate
        [ ! -d "$NLTK_DATA" ] && mkdir "$NLTK_DATA"
        make install-ci
|
|
|
|
# Runs `make check-deps` under each Python version. Declares no `needs`, so it
# does not wait on any other job.
check-deps:
  runs-on: ubuntu-latest
  strategy:
    matrix:
      python-version:
        - "3.8"
        - "3.9"
        - "3.10"
        - "3.11"
  steps:
    - uses: actions/checkout@v3
    - name: Set up Python ${{ matrix.python-version }}
      uses: actions/setup-python@v4
      with:
        python-version: ${{ matrix.python-version }}
    - name: Check for dependency conflicts
      run: make check-deps
|
|
|
|
# Runs `make check` inside the shared virtualenv for each Python version.
# The virtualenv is restored from the cache keyed like the one in `setup`;
# it is rebuilt in place when the restore does not report an exact hit.
lint:
  needs: setup
  runs-on: ubuntu-latest
  strategy:
    matrix:
      python-version:
        - "3.8"
        - "3.9"
        - "3.10"
        - "3.11"
  steps:
    - uses: actions/checkout@v3
    - id: virtualenv-cache
      uses: actions/cache/restore@v3
      with:
        key: unstructured-${{ runner.os }}-${{ matrix.python-version }}-${{ env.GHA_CACHE_KEY_VERSION }}-${{ hashFiles('requirements/*.txt') }}
        path: |
          .venv
          nltk_data
    - name: Set up Python ${{ matrix.python-version }}
      uses: actions/setup-python@v4
      with:
        python-version: ${{ matrix.python-version }}
    - name: Setup virtual environment (no cache hit)
      if: steps.virtualenv-cache.outputs.cache-hit != 'true'
      run: |
        python${{ matrix.python-version }} -m venv .venv
        source .venv/bin/activate
        make install-ci
    - name: Lint
      run: |
        source .venv/bin/activate
        make check
|
|
|
|
# Runs ShellCheck over the checked-out repository via a third-party action.
shellcheck:
  runs-on: ubuntu-latest
  steps:
    - uses: actions/checkout@v3
    - name: ShellCheck
      # Pinned to a release tag instead of the mutable `master` branch so that
      # upstream pushes cannot silently change what runs in CI.
      # NOTE(review): confirm 2.0.0 is the intended release; pinning to the
      # full commit SHA of that release is stricter still.
      uses: ludeeus/action-shellcheck@2.0.0
|
|
|
|
# For each Python version: restores the shared virtualenv (rebuilding it when
# the restore is not an exact hit), installs system packages, then runs
# `make test` and `make check-coverage`.
test_unit:
  needs: [setup, lint]
  runs-on: ubuntu-latest
  strategy:
    matrix:
      python-version:
        - "3.8"
        - "3.9"
        - "3.10"
        - "3.11"
  env:
    NLTK_DATA: ${{ github.workspace }}/nltk_data
    UNSTRUCTURED_HF_TOKEN: ${{ secrets.HF_TOKEN }}
  steps:
    - uses: actions/checkout@v3
    - id: virtualenv-cache
      uses: actions/cache/restore@v3
      with:
        key: unstructured-${{ runner.os }}-${{ matrix.python-version }}-${{ env.GHA_CACHE_KEY_VERSION }}-${{ hashFiles('requirements/*.txt') }}
        path: |
          .venv
          nltk_data
    - name: Set up Python ${{ matrix.python-version }}
      uses: actions/setup-python@v4
      with:
        python-version: ${{ matrix.python-version }}
    - name: Setup virtual environment (no cache hit)
      if: steps.virtualenv-cache.outputs.cache-hit != 'true'
      run: |
        python${{ matrix.python-version}} -m venv .venv
        source .venv/bin/activate
        mkdir "$NLTK_DATA"
        make install-ci
    - name: Test
      env:
        UNS_API_KEY: ${{ secrets.UNS_API_KEY }}
      run: |
        source .venv/bin/activate
        sudo apt-get update
        sudo apt-get install -y libmagic-dev poppler-utils libreoffice
        make install-pandoc
        sudo add-apt-repository -y ppa:alex-p/tesseract-ocr5
        sudo apt-get install -y tesseract-ocr tesseract-ocr-kor
        tesseract --version
        # FIXME (yao): sometimes there is cache but we still miss argilla in the env; so we add make install-ci again
        make install-ci
        make test CI=true UNSTRUCTURED_INCLUDE_DEBUG_METADATA=true
        make check-coverage
|
|
|
|
# Runs `make test-no-extras` on Python 3.8 in a separate `.venv-base`
# virtualenv that is populated with `make install-base-ci`.
test_unit_no_extras:
  needs: [setup, lint]
  runs-on: ubuntu-latest
  strategy:
    matrix:
      python-version:
        - "3.8"
  env:
    NLTK_DATA: ${{ github.workspace }}/nltk_data
  steps:
    - uses: actions/checkout@v3
    - name: Set up Python ${{ matrix.python-version }}
      uses: actions/setup-python@v4
      with:
        python-version: ${{ matrix.python-version }}
    # Restore-only step; its key carries a `-base` suffix, unlike the key used by `setup`.
    - id: virtualenv-cache
      uses: actions/cache/restore@v3
      with:
        key: unstructured-${{ runner.os }}-${{ matrix.python-version }}-${{ env.GHA_CACHE_KEY_VERSION }}-${{ hashFiles('requirements/*.txt') }}-base
        path: |
          nltk_data
          .venv-base
    - name: Setup virtual environment
      if: steps.virtualenv-cache.outputs.cache-hit != 'true'
      run: |
        python${{ matrix.python-version}} -m venv .venv-base
        source .venv-base/bin/activate
        mkdir "$NLTK_DATA"
        make install-base-ci
    - name: Test
      env:
        UNS_API_KEY: ${{ secrets.UNS_API_KEY }}
      run: |
        source .venv-base/bin/activate
        make test-no-extras CI=true
|
|
|
|
# Fans out over the `extra` matrix: each entry gets its own `.venv-<extra>`
# with the base install plus `make install-<extra>`, then runs
# `make test-extra-<extra>`.
test_unit_dependency_extras:
  # NOTE(newelh) - Split extras into separate steps in the same pipeline (avoid using matrix)
  needs: [setup, lint, test_unit_no_extras]
  runs-on: ubuntu-latest
  strategy:
    matrix:
      python-version:
        - "3.8"
      extra:
        - "csv"
        - "docx"
        - "odt"
        - "markdown"
        - "pypandoc"
        - "msg"
        - "pdf-image"
        - "pptx"
        - "xlsx"
  env:
    NLTK_DATA: ${{ github.workspace }}/nltk_data
  steps:
    - uses: actions/checkout@v3
    - name: Set up Python ${{ matrix.python-version }}
      uses: actions/setup-python@v4
      with:
        python-version: ${{ matrix.python-version }}
    # Only `nltk_data` is restored here. The step's cache-hit output is not
    # consulted: the virtualenv setup below runs unconditionally.
    - id: virtualenv-cache
      uses: actions/cache/restore@v3
      with:
        key: unstructured-${{ runner.os }}-${{ matrix.python-version }}-${{ env.GHA_CACHE_KEY_VERSION }}-${{ hashFiles('requirements/*.txt') }}-base
        path: |
          nltk_data
    - name: Setup virtual environment
      run: |
        python${{ matrix.python-version}} -m venv .venv-${{ matrix.extra }}
        source .venv-${{ matrix.extra }}/bin/activate
        make install-base-ci
        make install-${{ matrix.extra }}
    - name: Test
      env:
        UNS_API_KEY: ${{ secrets.UNS_API_KEY }}
        UNSTRUCTURED_HF_TOKEN: ${{ secrets.HF_TOKEN }}
      run: |
        source .venv-${{ matrix.extra }}/bin/activate
        # NOTE(newelh) - determine what needs to be installed here
        sudo apt-get update
        sudo apt-get install -y libmagic-dev poppler-utils libreoffice
        make install-pandoc
        sudo add-apt-repository -y ppa:alex-p/tesseract-ocr5
        sudo apt-get install -y tesseract-ocr tesseract-ocr-kor
        tesseract --version
        make test-extra-${{ matrix.extra }} CI=true
|
|
|
|
# Prepares the ingest virtualenv for each Python version. When no ingest cache
# entry exists yet, the base virtualenv from `setup` is restored and
# `make install-all-ingest` is run on top of it, under the ingest cache key.
setup_ingest:
  needs: [setup]
  runs-on: ubuntu-latest
  strategy:
    matrix:
      python-version:
        - "3.8"
        - "3.9"
        - "3.10"
        - "3.11"
  env:
    NLTK_DATA: ${{ github.workspace }}/nltk_data
  steps:
    - uses: actions/checkout@v3
    # Probe for an existing ingest cache entry without downloading it.
    # If one exists, every remaining step is skipped, including the base-cache download.
    - id: ingest-cache-exists
      uses: actions/cache@v3
      with:
        lookup-only: true
        key: unstructured-ingest-${{ runner.os }}-${{ matrix.python-version }}-${{ env.GHA_CACHE_KEY_VERSION }}-${{ hashFiles('requirements/ingest/*.txt') }}
        path: |
          .venv
          nltk_data
    - id: base-virtualenv-cache
      if: steps.ingest-cache-exists.outputs.cache-hit != 'true'
      uses: actions/cache/restore@v3
      with:
        key: unstructured-${{ runner.os }}-${{ matrix.python-version }}-${{ env.GHA_CACHE_KEY_VERSION }}-${{ hashFiles('requirements/*.txt') }}
        path: |
          .venv
          nltk_data
        # This job depends on setup, so the base cache should already exist
        # by the time this step runs. Fail the job if it is not found.
        fail-on-cache-miss: true
    - id: virtualenv-cache
      if: steps.ingest-cache-exists.outputs.cache-hit != 'true'
      uses: actions/cache@v3
      with:
        key: unstructured-ingest-${{ runner.os }}-${{ matrix.python-version }}-${{ env.GHA_CACHE_KEY_VERSION }}-${{ hashFiles('requirements/ingest/*.txt') }}
        path: |
          .venv
          nltk_data
    - name: Set up Python ${{ matrix.python-version }}
      if: steps.ingest-cache-exists.outputs.cache-hit != 'true'
      uses: actions/setup-python@v4
      with:
        python-version: ${{ matrix.python-version }}
    - name: Setup virtual environment (no cache hit)
      if: steps.ingest-cache-exists.outputs.cache-hit != 'true'
      run: |
        python${{ matrix.python-version }} -m venv .venv
        source .venv/bin/activate
        make install-all-ingest
|
|
|
|
|
|
# Runs the ingest unit tests with pytest and then the end-to-end ingest script
# for each Python version, using the virtualenv cached under the ingest key.
test_ingest:
  needs: [setup_ingest, lint]
  runs-on: ubuntu-latest
  strategy:
    matrix:
      python-version: ["3.8", "3.9", "3.10", "3.11"]
  env:
    NLTK_DATA: ${{ github.workspace }}/nltk_data
    MAX_PROCESSES: "2"
  steps:
    - uses: actions/checkout@v3
    - name: Set up Python ${{ matrix.python-version }}
      uses: actions/setup-python@v4
      with:
        python-version: ${{ matrix.python-version }}
    - name: Get full Python version
      id: full-python-version
      # Both expansions are quoted so the shell cannot word-split or glob them
      # (ShellCheck SC2046 / SC2086).
      run: echo "version=$(python -c "import sys; print('-'.join(str(v) for v in sys.version_info))")" >> "$GITHUB_OUTPUT"
    - uses: actions/cache/restore@v3
      id: virtualenv-cache
      with:
        path: |
          .venv
          nltk_data
        key: unstructured-ingest-${{ runner.os }}-${{ matrix.python-version }}-${{ env.GHA_CACHE_KEY_VERSION }}-${{ hashFiles('requirements/ingest/*.txt') }}
        # This job depends on setup_ingest, so the cache should already exist
        # by the time this step runs. Fail the job if it is not found.
        fail-on-cache-miss: true
    - name: Setup docker-compose
      uses: KengoTODA/actions-setup-docker-compose@v1
      with:
        version: '2.22.0'
    - name: Test Ingest (unit)
      run: |
        source .venv/bin/activate
        PYTHONPATH=. pytest test_unstructured_ingest/unit
    - name: Test (end-to-end)
      # Credentials for the end-to-end script, supplied from repository secrets.
      env:
        AIRTABLE_PERSONAL_ACCESS_TOKEN: ${{ secrets.AIRTABLE_PERSONAL_ACCESS_TOKEN }}
        BOX_APP_CONFIG: ${{ secrets.BOX_APP_CONFIG }}
        CONFLUENCE_API_TOKEN: ${{ secrets.CONFLUENCE_API_TOKEN }}
        CONFLUENCE_USER_EMAIL: ${{ secrets.CONFLUENCE_USER_EMAIL }}
        DISCORD_TOKEN: ${{ secrets.DISCORD_TOKEN }}
        DROPBOX_APP_KEY: ${{ secrets.DROPBOX_APP_KEY }}
        DROPBOX_APP_SECRET: ${{ secrets.DROPBOX_APP_SECRET }}
        DROPBOX_REFRESH_TOKEN: ${{ secrets.DROPBOX_REFRESH_TOKEN }}
        GCP_INGEST_SERVICE_KEY: ${{ secrets.GCP_INGEST_SERVICE_KEY }}
        GH_READ_ONLY_ACCESS_TOKEN: ${{ secrets.GH_READ_ONLY_ACCESS_TOKEN }}
        JIRA_INGEST_API_TOKEN: ${{ secrets.JIRA_INGEST_API_TOKEN }}
        JIRA_INGEST_USER_EMAIL: ${{ secrets.JIRA_INGEST_USER_EMAIL }}
        MS_CLIENT_CRED: ${{ secrets.MS_CLIENT_CRED }}
        MS_CLIENT_ID: ${{ secrets.MS_CLIENT_ID }}
        MS_TENANT_ID: ${{ secrets.MS_TENANT_ID }}
        MS_USER_EMAIL: ${{ secrets.MS_USER_EMAIL }}
        MS_USER_PNAME: ${{ secrets.MS_USER_PNAME }}
        SALESFORCE_USERNAME: ${{ secrets.SALESFORCE_USERNAME }}
        SALESFORCE_CONSUMER_KEY: ${{ secrets.SALESFORCE_CONSUMER_KEY }}
        SALESFORCE_PRIVATE_KEY: ${{ secrets.SALESFORCE_PRIVATE_KEY }}
        SHAREPOINT_CLIENT_ID: ${{ secrets.SHAREPOINT_CLIENT_ID }}
        SHAREPOINT_CRED: ${{ secrets.SHAREPOINT_CRED }}
        SHAREPOINT_SITE: ${{ secrets.SHAREPOINT_SITE }}
        SHAREPOINT_PERMISSIONS_APP_ID: ${{ secrets.SHAREPOINT_PERMISSIONS_APP_ID }}
        SHAREPOINT_PERMISSIONS_APP_CRED: ${{ secrets.SHAREPOINT_PERMISSIONS_APP_CRED }}
        SHAREPOINT_PERMISSIONS_TENANT: ${{ secrets.SHAREPOINT_PERMISSIONS_TENANT }}
        SLACK_TOKEN: ${{ secrets.SLACK_TOKEN }}
        UNS_API_KEY: ${{ secrets.UNS_API_KEY }}
        NOTION_API_KEY: ${{ secrets.NOTION_API_KEY }}
        AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
        AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
        AZURE_SEARCH_ENDPOINT: ${{ secrets.AZURE_SEARCH_ENDPOINT }}
        AZURE_SEARCH_API_KEY: ${{ secrets.AZURE_SEARCH_API_KEY }}
        OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
        TABLE_OCR: "tesseract"
        OCR_AGENT: "tesseract"
        CI: "true"
      run: |
        source .venv/bin/activate
        sudo apt-get update
        sudo apt-get install -y libmagic-dev poppler-utils libreoffice pandoc
        sudo add-apt-repository -y ppa:alex-p/tesseract-ocr5
        sudo apt-get install -y tesseract-ocr
        sudo apt-get install -y tesseract-ocr-kor
        # -y keeps this install non-interactive, like the other installs in this step.
        sudo apt-get install -y diffstat
        tesseract --version
        ./test_unstructured_ingest/test-ingest.sh
|
|
|
|
# Runs `make test-unstructured-api-unit` on Python 3.8. Every step after the
# flag step is gated on SKIP_API_UNIT_FOR_BREAKING_CHANGE being 'false'; the
# flag step currently writes 'true', so those steps are skipped.
test_unstructured_api_unit:
  needs: [setup, lint]
  runs-on: ubuntu-latest
  strategy:
    matrix:
      # NOTE(yuming): Unstructured API only use Python 3.8
      python-version:
        - "3.8"
  env:
    NLTK_DATA: ${{ github.workspace }}/nltk_data
  steps:
    - uses: actions/checkout@v3
    - id: virtualenv-cache
      uses: actions/cache/restore@v3
      with:
        key: unstructured-${{ runner.os }}-${{ matrix.python-version }}-${{ env.GHA_CACHE_KEY_VERSION }}-${{ hashFiles('requirements/*.txt') }}
        path: |
          .venv
          nltk_data
    - name: Set up flag for running Unstructured API unit tests
      run: |
        # NOTE: Change env `SKIP_API_UNIT_FOR_BREAKING_CHANGE` to true if there is a breaking change in Unstructured repo that will break unstructured api unit tests
        # TODO: Change env back to false once API unit tests is in sync with unstructured repo
        echo "SKIP_API_UNIT_FOR_BREAKING_CHANGE=true" >> $GITHUB_ENV
    - name: Set up Python ${{ matrix.python-version }}
      if: env.SKIP_API_UNIT_FOR_BREAKING_CHANGE == 'false'
      uses: actions/setup-python@v4
      with:
        python-version: ${{ matrix.python-version }}
    - name: Setup virtual environment (no cache hit)
      if: steps.virtualenv-cache.outputs.cache-hit != 'true' && env.SKIP_API_UNIT_FOR_BREAKING_CHANGE == 'false'
      run: |
        python${{ matrix.python-version}} -m venv .venv
        source .venv/bin/activate
        mkdir "$NLTK_DATA"
        make install-ci
    - name: Test Unstructured API Unit
      if: env.SKIP_API_UNIT_FOR_BREAKING_CHANGE == 'false'
      run: |
        source .venv/bin/activate
        # FIXME (yao): sometimes there is cache but we still miss argilla in the env; so we add make install-ci again
        make install-ci
        sudo apt-get update && sudo apt-get install --yes poppler-utils libreoffice
        make install-pandoc
        sudo add-apt-repository -y ppa:alex-p/tesseract-ocr5
        sudo apt-get install -y tesseract-ocr tesseract-ocr-kor
        tesseract --version
        make install-nltk-models
        make test-unstructured-api-unit
|
|
|
|
# Off main, runs the changelog enforcer when the paths filter reports changes
# under `unstructured/**`. Both conditional steps are skipped on refs/heads/main.
changelog:
  runs-on: ubuntu-latest
  steps:
    # A checkout is required; without it paths-filter fails on the merge-queue trigger.
    - uses: actions/checkout@v3
    - id: changes
      if: github.ref != 'refs/heads/main'
      uses: dorny/paths-filter@v2
      with:
        filters: |
          src:
            - 'unstructured/**'
    - if: steps.changes.outputs.src == 'true' && github.ref != 'refs/heads/main'
      uses: dangoslen/changelog-enforcer@v3
|
|
|
|
# TODO - figure out best practice for caching docker images
|
|
# (Using the virtualenv to get pytest)
|
|
# Runs `make docker-build` followed by `make docker-test`, passing credentials
# through the `uns_test_env_file` file written in the last step.
#
# NOTE(review): this job declares no `strategy.matrix`, so every
# `matrix.python-version` below evaluates to an empty string. The cache key
# therefore differs from the one the `setup` job uses, and the interpreter
# invoked is plain `python`. Confirm whether a matrix should be added; doing so
# would change the name of this job's status check.
test_dockerfile:
  runs-on: ubuntu-latest
  needs: [setup, lint]
  steps:
    - uses: actions/checkout@v3
    - uses: actions/cache/restore@v3
      id: virtualenv-cache
      with:
        path: |
          .venv
          nltk_data
        key: unstructured-${{ runner.os }}-${{ matrix.python-version }}-${{ env.GHA_CACHE_KEY_VERSION }}-${{ hashFiles('requirements/*.txt') }}
    - name: Set up Python ${{ matrix.python-version }}
      uses: actions/setup-python@v4
      with:
        python-version: ${{ matrix.python-version }}
    - name: Setup virtual environment (no cache hit)
      if: steps.virtualenv-cache.outputs.cache-hit != 'true'
      run: |
        python${{ matrix.python-version }} -m venv .venv
    - name: Test Dockerfile
      run: |
        source .venv/bin/activate
        # The first write creates/truncates the env file; the second must append
        # (>>), otherwise the UNS_API_KEY line is overwritten and lost.
        echo "UNS_API_KEY=${{ secrets.UNS_API_KEY }}" > uns_test_env_file
        echo "UNSTRUCTURED_HF_TOKEN=${{ secrets.HF_TOKEN }}" >> uns_test_env_file
        make docker-build
        make docker-test CI=true UNSTRUCTURED_INCLUDE_DEBUG_METADATA=true
|