Matt Robinson 6036af33e7
feat: add partition_doc for .doc files (#236)
* first pass on doc partitioning

* add libreoffice to deps

* update docs and readme

* add .doc to auto

* changelog bump

* value error with missing doc

* doc updates
2023-02-17 09:30:23 -05:00

120 lines
3.5 KiB
YAML

name: CI
on:
# NOTE(robinson) - We are limiting when we run CI avoid exceeding our 2,000 min/month limt.
# We can switch to running on push if we make this repo public or are fine with
# paying for CI minutes.
push:
branches: [ main ]
pull_request:
branches: [ main ]
env:
PYTHON_VERSION: 3.8
jobs:
setup:
runs-on: ubuntu-latest
env:
NLTK_DATA: ${{ github.workspace }}/nltk_data
steps:
- uses: actions/checkout@v3
- uses: actions/cache@v3
id: virtualenv-cache
with:
path: |
.venv
nltk_data
key: unstructured-${{ runner.os }}-${{ env.PYTHON_VERSION }}-${{ hashFiles('requirements/*.txt') }}
- name: Set up Python ${{ env.PYTHON_VERSION }}
uses: actions/setup-python@v4
with:
python-version: ${{ env.PYTHON_VERSION }}
- name: Setup virtual environment (no cache hit)
if: steps.virtualenv-cache.outputs.cache-hit != 'true'
run: |
python${{ env.PYTHON_VERSION }} -m venv .venv
source .venv/bin/activate
make install-ci
lint:
runs-on: ubuntu-latest
needs: setup
steps:
- uses: actions/checkout@v3
- uses: actions/cache@v3
id: virtualenv-cache
with:
path: .venv
key: unstructured-${{ runner.os }}-${{ env.PYTHON_VERSION }}-${{ hashFiles('requirements/*.txt') }}
# NOTE(robinson) - This is a fallback in case the lint job does not find the cache.
# We can take this out when we implement the fix in CORE-99
- name: Set up Python ${{ env.PYTHON_VERSION }}
uses: actions/setup-python@v4
with:
python-version: ${{ env.PYTHON_VERSION }}
- name: Setup virtual environment (no cache hit)
if: steps.virtualenv-cache.outputs.cache-hit != 'true'
run: |
python${{ env.PYTHON_VERSION }} -m venv .venv
source .venv/bin/activate
make install-ci
- name: Lint
run: |
source .venv/bin/activate
make check
shellcheck:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- name: ShellCheck
uses: ludeeus/action-shellcheck@master
test:
runs-on: ubuntu-latest
env:
NLTK_DATA: ${{ github.workspace }}/nltk_data
needs: [setup, lint]
steps:
- uses: actions/checkout@v3
- uses: actions/cache@v3
id: virtualenv-cache
with:
path: |
.venv
nltk_data
key: unstructured-${{ runner.os }}-${{ env.PYTHON_VERSION }}-${{ hashFiles('requirements/*.txt') }}
# NOTE(robinson) - This is a fallback in case the lint job does not find the cache.
# We can take this out when we implement the fix in CORE-99
- name: Setup virtual environment (no cache hit)
if: steps.virtualenv-cache.outputs.cache-hit != 'true'
run: |
python${{ env.PYTHON_VERSION }} -m venv .venv
source .venv/bin/activate
make install-ci
- name: Test
run: |
source .venv/bin/activate
make install-nltk-models
make install-detectron2
sudo apt-get install -y libmagic-dev poppler-utils tesseract-ocr libreoffice
make test
make check-coverage
make install-ingest-s3
./test_unstructured_ingest/test-ingest.sh
changelog:
runs-on: ubuntu-latest
steps:
- if: github.ref != 'refs/heads/main'
uses: dorny/paths-filter@v2
id: changes
with:
filters: |
src:
- 'unstructured/**'
- if: steps.changes.outputs.src == 'true' && github.ref != 'refs/heads/main'
uses: dangoslen/changelog-enforcer@v3