add requirements files to ingest cache hash key (#3641)

This PR adds the requirement files for base and extras to the ingest
cache's hash key.

- The current workflow uses only the ingest requirements to generate the
hash key for the GitHub Actions cache.
- Sometimes only the base or extra requirements (like extra-pdf.txt) are
updated, but not any ingest requirements. In that case the ingest test would
fetch a cache with outdated non-ingest dependencies.
- When we generate a new ingest cache, we first check the base and
extra requirements and generate a base env before layering the ingest
dependencies on top.
- This PR allows the ingest step to recognize changes to non-ingest
dependencies and trigger new cache generation when either the ingest
or the base/extra requirement files change.

This PR also bumps the `actions/setup-python` version in the cache actions.
It also adds installation of `virtualenv` to the ingest cache action to
avoid errors like this one:
https://github.com/Unstructured-IO/unstructured/actions/runs/10905551870/job/30265057515?pr=3641#step:3:111
This commit is contained in:
Yao You 2024-09-18 18:39:14 -05:00 committed by GitHub
parent 2d3cd45b23
commit 22998354db
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 114 additions and 93 deletions

View File

@ -22,7 +22,7 @@ runs:
lookup-only: ${{ inputs.check-only }} lookup-only: ${{ inputs.check-only }}
- name: Set up Python ${{ inputs.python-version }} - name: Set up Python ${{ inputs.python-version }}
if: steps.virtualenv-cache-restore.outputs.cache-hit != 'true' if: steps.virtualenv-cache-restore.outputs.cache-hit != 'true'
uses: actions/setup-python@v4 uses: actions/setup-python@v5
with: with:
python-version: ${{ inputs.python-version }} python-version: ${{ inputs.python-version }}
- name: Setup virtual environment (no cache hit) - name: Setup virtual environment (no cache hit)

View File

@ -18,24 +18,27 @@ runs:
path: | path: |
.venv .venv
nltk_data nltk_data
key: unstructured-ingest-${{ runner.os }}-${{ inputs.python-version }}-${{ hashFiles('requirements/ingest/*.txt') }} key: unstructured-ingest-${{ runner.os }}-${{ inputs.python-version }}-${{ hashFiles('requirements/ingest/*.txt') }}-${{ hashFiles('requirements/*.txt') }}
lookup-only: ${{ inputs.check-only }} lookup-only: ${{ inputs.check-only }}
- name: Restore base virtual environment
if: steps.ingest-virtualenv-cache-restore.outputs.cache-hit != 'true'
uses: ./.github/actions/base-cache
with:
python-version: ${{ inputs.python-version }}
- name: Set up Python ${{ inputs.python-version }} - name: Set up Python ${{ inputs.python-version }}
if: steps.ingest-virtualenv-cache-restore.outputs.cache-hit != 'true' if: steps.ingest-virtualenv-cache-restore.outputs.cache-hit != 'true'
uses: actions/setup-python@v4 uses: actions/setup-python@v5
with: with:
python-version: ${{ inputs.python-version }} python-version: ${{ inputs.python-version }}
- name: Setup virtual environment (no cache hit) - name: Setup virtual environment (no cache hit)
if: steps.ingest-virtualenv-cache-restore.outputs.cache-hit != 'true' if: steps.ingest-virtualenv-cache-restore.outputs.cache-hit != 'true'
shell: bash shell: bash
run: | run: |
python${{ inputs.python-version }} -m venv .venv python${{ inputs.python-version }} -m pip install --upgrade virtualenv
if [ ! -d ".venv" ]; then
python${{ inputs.python-version }} -m venv .venv
fi
source .venv/bin/activate source .venv/bin/activate
if [ "${{ inputs.python-version == '3.12' }}" == "true" ]; then
python -m ensurepip --upgrade
python -m pip install --upgrade setuptools
fi
make install-ci
make install-all-ingest make install-all-ingest
- name: Save Ingest Cache - name: Save Ingest Cache
if: steps.ingest-virtualenv-cache-restore.outputs.cache-hit != 'true' if: steps.ingest-virtualenv-cache-restore.outputs.cache-hit != 'true'
@ -45,5 +48,5 @@ runs:
path: | path: |
.venv .venv
nltk_data nltk_data
key: unstructured-ingest-${{ runner.os }}-${{ inputs.python-version }}-${{ hashFiles('requirements/ingest/*.txt') }} key: unstructured-ingest-${{ runner.os }}-${{ inputs.python-version }}-${{ hashFiles('requirements/ingest/*.txt') }}-${{ hashFiles('requirements/*.txt') }}

View File

@ -96,6 +96,7 @@ jobs:
- name: Lint - name: Lint
run: | run: |
source .venv/bin/activate source .venv/bin/activate
make install-ci
make check make check
shellcheck: shellcheck:
@ -174,9 +175,10 @@ jobs:
- name: Test - name: Test
env: env:
UNS_API_KEY: ${{ secrets.UNS_API_KEY }} UNS_API_KEY: ${{ secrets.UNS_API_KEY }}
PYTHON: python${{ matrix.python-version }}
NLTK_DATA: ${{ github.workspace }}/nltk_data
run: | run: |
source .venv/bin/activate source .venv/bin/activate
make install-nltk-models
sudo apt-get update sudo apt-get update
sudo apt-get install -y poppler-utils sudo apt-get install -y poppler-utils
make install-pandoc install-test make install-pandoc install-test
@ -207,6 +209,7 @@ jobs:
UNS_API_KEY: ${{ secrets.UNS_API_KEY }} UNS_API_KEY: ${{ secrets.UNS_API_KEY }}
run: | run: |
source .venv/bin/activate source .venv/bin/activate
make install-ci
make install-nltk-models make install-nltk-models
make test-no-extras CI=true make test-no-extras CI=true
@ -267,6 +270,7 @@ jobs:
- uses: ./.github/actions/base-ingest-cache - uses: ./.github/actions/base-ingest-cache
with: with:
python-version: ${{ matrix.python-version }} python-version: ${{ matrix.python-version }}
check-only: 'true'
test_ingest_unit: test_ingest_unit:
strategy: strategy:
@ -289,10 +293,14 @@ jobs:
with: with:
python-version: ${{ matrix.python-version }} python-version: ${{ matrix.python-version }}
- name: Test Ingest (unit) - name: Test Ingest (unit)
env:
NLTK_DATA: ${{ github.workspace }}/nltk_data
PYTHON: python${{ matrix.python-version }}
run: | run: |
source .venv/bin/activate source .venv/bin/activate
make install-nltk-models make install-ci
PYTHONPATH=. pytest test_unstructured_ingest/unit make install-all-ingest
PYTHONPATH=. ${PYTHON} -m pytest test_unstructured_ingest/unit
test_ingest_src: test_ingest_src:
@ -365,9 +373,12 @@ jobs:
MXBAI_API_KEY: ${{secrets.MXBAI_API_KEY}} MXBAI_API_KEY: ${{secrets.MXBAI_API_KEY}}
OCR_AGENT: "unstructured.partition.utils.ocr_models.tesseract_ocr.OCRAgentTesseract" OCR_AGENT: "unstructured.partition.utils.ocr_models.tesseract_ocr.OCRAgentTesseract"
CI: "true" CI: "true"
NLTK_DATA: ${{ github.workspace }}/nltk_data
PYTHON: python${{ matrix.python-version }}
run: | run: |
source .venv/bin/activate source .venv/bin/activate
make install-nltk-models make install-ci
make install-all-ingest
sudo apt-get update sudo apt-get update
sudo apt-get install -y libmagic-dev poppler-utils libreoffice sudo apt-get install -y libmagic-dev poppler-utils libreoffice
make install-pandoc make install-pandoc
@ -435,9 +446,12 @@ jobs:
DATABRICKS_CATALOG: ${{secrets.DATABRICKS_CATALOG}} DATABRICKS_CATALOG: ${{secrets.DATABRICKS_CATALOG}}
OCR_AGENT: "unstructured.partition.utils.ocr_models.tesseract_ocr.OCRAgentTesseract" OCR_AGENT: "unstructured.partition.utils.ocr_models.tesseract_ocr.OCRAgentTesseract"
CI: "true" CI: "true"
NLTK_DATA: ${{ github.workspace }}/nltk_data
PYTHON: python${{ matrix.python-version }}
run: | run: |
source .venv/bin/activate source .venv/bin/activate
make install-nltk-models make install-ci
make install-all-ingest
sudo apt-get update sudo apt-get update
sudo apt-get install -y libmagic-dev poppler-utils libreoffice sudo apt-get install -y libmagic-dev poppler-utils libreoffice
make install-pandoc make install-pandoc
@ -469,6 +483,8 @@ jobs:
- name: Validate --help - name: Validate --help
run: | run: |
source .venv/bin/activate source .venv/bin/activate
make install-ci
make install-all-ingest
./test_unstructured_ingest/test-help.sh ./test_unstructured_ingest/test-help.sh

155
Makefile
View File

@ -2,6 +2,7 @@ PACKAGE_NAME := unstructured
PIP_VERSION := 23.2.1 PIP_VERSION := 23.2.1
CURRENT_DIR := $(shell pwd) CURRENT_DIR := $(shell pwd)
ARCH := $(shell uname -m) ARCH := $(shell uname -m)
PYTHON ?= python3
.PHONY: help .PHONY: help
help: Makefile help: Makefile
@ -28,177 +29,177 @@ install-base-ci: install-base-pip-packages install-nltk-models install-test inst
.PHONY: install-base-pip-packages .PHONY: install-base-pip-packages
install-base-pip-packages: install-base-pip-packages:
python3 -m pip install pip==${PIP_VERSION} ${PYTHON} -m pip install pip==${PIP_VERSION}
python3 -m pip install -r requirements/base.txt ${PYTHON} -m pip install -r requirements/base.txt
.PHONY: install-huggingface .PHONY: install-huggingface
install-huggingface: install-huggingface:
python3 -m pip install pip==${PIP_VERSION} ${PYTHON} -m pip install pip==${PIP_VERSION}
python3 -m pip install -r requirements/huggingface.txt ${PYTHON} -m pip install -r requirements/huggingface.txt
.PHONY: install-nltk-models .PHONY: install-nltk-models
install-nltk-models: install-nltk-models:
python3 -c "from unstructured.nlp.tokenize import download_nltk_packages; download_nltk_packages()" ${PYTHON} -c "from unstructured.nlp.tokenize import download_nltk_packages; download_nltk_packages()"
.PHONY: install-test .PHONY: install-test
install-test: install-test:
python3 -m pip install -r requirements/test.txt ${PYTHON} -m pip install -r requirements/test.txt
# NOTE(yao) - CI seem to always install tesseract to test so it would make sense to also require # NOTE(yao) - CI seem to always install tesseract to test so it would make sense to also require
# pytesseract installation into the virtual env for testing # pytesseract installation into the virtual env for testing
python3 -m pip install unstructured_pytesseract ${PYTHON} -m pip install unstructured_pytesseract
# python3 -m pip install argilla==1.28.0 -c requirements/deps/constraints.txt # ${PYTHON} -m pip install argilla==1.28.0 -c requirements/deps/constraints.txt
# NOTE(robinson) - Installing weaviate-client separately here because the requests # NOTE(robinson) - Installing weaviate-client separately here because the requests
# version conflicts with label_studio_sdk # version conflicts with label_studio_sdk
python3 -m pip install weaviate-client -c requirements/deps/constraints.txt ${PYTHON} -m pip install weaviate-client -c requirements/deps/constraints.txt
.PHONY: install-dev .PHONY: install-dev
install-dev: install-dev:
python3 -m pip install -r requirements/dev.txt ${PYTHON} -m pip install -r requirements/dev.txt
.PHONY: install-build .PHONY: install-build
install-build: install-build:
python3 -m pip install -r requirements/build.txt ${PYTHON} -m pip install -r requirements/build.txt
.PHONY: install-csv .PHONY: install-csv
install-csv: install-csv:
python3 -m pip install -r requirements/extra-csv.txt ${PYTHON} -m pip install -r requirements/extra-csv.txt
.PHONY: install-docx .PHONY: install-docx
install-docx: install-docx:
python3 -m pip install -r requirements/extra-docx.txt ${PYTHON} -m pip install -r requirements/extra-docx.txt
.PHONY: install-epub .PHONY: install-epub
install-epub: install-epub:
python3 -m pip install -r requirements/extra-epub.txt ${PYTHON} -m pip install -r requirements/extra-epub.txt
.PHONY: install-odt .PHONY: install-odt
install-odt: install-odt:
python3 -m pip install -r requirements/extra-odt.txt ${PYTHON} -m pip install -r requirements/extra-odt.txt
.PHONY: install-pypandoc .PHONY: install-pypandoc
install-pypandoc: install-pypandoc:
python3 -m pip install -r requirements/extra-pandoc.txt ${PYTHON} -m pip install -r requirements/extra-pandoc.txt
.PHONY: install-markdown .PHONY: install-markdown
install-markdown: install-markdown:
python3 -m pip install -r requirements/extra-markdown.txt ${PYTHON} -m pip install -r requirements/extra-markdown.txt
.PHONY: install-pdf-image .PHONY: install-pdf-image
install-pdf-image: install-pdf-image:
python3 -m pip install -r requirements/extra-pdf-image.txt ${PYTHON} -m pip install -r requirements/extra-pdf-image.txt
.PHONY: install-pptx .PHONY: install-pptx
install-pptx: install-pptx:
python3 -m pip install -r requirements/extra-pptx.txt ${PYTHON} -m pip install -r requirements/extra-pptx.txt
.PHONY: install-xlsx .PHONY: install-xlsx
install-xlsx: install-xlsx:
python3 -m pip install -r requirements/extra-xlsx.txt ${PYTHON} -m pip install -r requirements/extra-xlsx.txt
.PHONY: install-all-docs .PHONY: install-all-docs
install-all-docs: install-base install-csv install-docx install-epub install-odt install-pypandoc install-markdown install-pdf-image install-pptx install-xlsx install-all-docs: install-base install-csv install-docx install-epub install-odt install-pypandoc install-markdown install-pdf-image install-pptx install-xlsx
.PHONY: install-all-ingest .PHONY: install-all-ingest
install-all-ingest: install-all-ingest:
find requirements/ingest -type f -name "*.txt" -exec python3 -m pip install -r '{}' ';' find requirements/ingest -type f -name "*.txt" -exec ${PYTHON} -m pip install -r '{}' ';'
.PHONY: install-ingest-google-drive .PHONY: install-ingest-google-drive
install-ingest-google-drive: install-ingest-google-drive:
python3 -m pip install -r requirements/ingest/google-drive.txt ${PYTHON} -m pip install -r requirements/ingest/google-drive.txt
## install-ingest-s3: install requirements for the s3 connector ## install-ingest-s3: install requirements for the s3 connector
.PHONY: install-ingest-s3 .PHONY: install-ingest-s3
install-ingest-s3: install-ingest-s3:
python3 -m pip install -r requirements/ingest/s3.txt ${PYTHON} -m pip install -r requirements/ingest/s3.txt
.PHONY: install-ingest-gcs .PHONY: install-ingest-gcs
install-ingest-gcs: install-ingest-gcs:
python3 -m pip install -r requirements/ingest/gcs.txt ${PYTHON} -m pip install -r requirements/ingest/gcs.txt
.PHONY: install-ingest-dropbox .PHONY: install-ingest-dropbox
install-ingest-dropbox: install-ingest-dropbox:
python3 -m pip install -r requirements/ingest/dropbox.txt ${PYTHON} -m pip install -r requirements/ingest/dropbox.txt
.PHONY: install-ingest-azure .PHONY: install-ingest-azure
install-ingest-azure: install-ingest-azure:
python3 -m pip install -r requirements/ingest/azure.txt ${PYTHON} -m pip install -r requirements/ingest/azure.txt
.PHONY: install-ingest-box .PHONY: install-ingest-box
install-ingest-box: install-ingest-box:
python3 -m pip install -r requirements/ingest/box.txt ${PYTHON} -m pip install -r requirements/ingest/box.txt
.PHONY: install-ingest-delta-table .PHONY: install-ingest-delta-table
install-ingest-delta-table: install-ingest-delta-table:
python3 -m pip install -r requirements/ingest/delta-table.txt ${PYTHON} -m pip install -r requirements/ingest/delta-table.txt
.PHONY: install-ingest-discord .PHONY: install-ingest-discord
install-ingest-discord: install-ingest-discord:
pip install -r requirements/ingest/discord.txt ${PYTHON} -m pip install -r requirements/ingest/discord.txt
.PHONY: install-ingest-github .PHONY: install-ingest-github
install-ingest-github: install-ingest-github:
python3 -m pip install -r requirements/ingest/github.txt ${PYTHON} -m pip install -r requirements/ingest/github.txt
.PHONY: install-ingest-biomed .PHONY: install-ingest-biomed
install-ingest-biomed: install-ingest-biomed:
python3 -m pip install -r requirements/ingest/biomed.txt ${PYTHON} -m pip install -r requirements/ingest/biomed.txt
.PHONY: install-ingest-gitlab .PHONY: install-ingest-gitlab
install-ingest-gitlab: install-ingest-gitlab:
python3 -m pip install -r requirements/ingest/gitlab.txt ${PYTHON} -m pip install -r requirements/ingest/gitlab.txt
.PHONY: install-ingest-onedrive .PHONY: install-ingest-onedrive
install-ingest-onedrive: install-ingest-onedrive:
python3 -m pip install -r requirements/ingest/onedrive.txt ${PYTHON} -m pip install -r requirements/ingest/onedrive.txt
.PHONY: install-ingest-outlook .PHONY: install-ingest-outlook
install-ingest-outlook: install-ingest-outlook:
python3 -m pip install -r requirements/ingest/outlook.txt ${PYTHON} -m pip install -r requirements/ingest/outlook.txt
.PHONY: install-ingest-reddit .PHONY: install-ingest-reddit
install-ingest-reddit: install-ingest-reddit:
python3 -m pip install -r requirements/ingest/reddit.txt ${PYTHON} -m pip install -r requirements/ingest/reddit.txt
.PHONY: install-ingest-slack .PHONY: install-ingest-slack
install-ingest-slack: install-ingest-slack:
pip install -r requirements/ingest/slack.txt ${PYTHON} -m pip install -r requirements/ingest/slack.txt
.PHONY: install-ingest-kafka .PHONY: install-ingest-kafka
install-ingest-kafka: install-ingest-kafka:
python3 -m pip install -r requirements/ingest/kafka.txt ${PYTHON} -m pip install -r requirements/ingest/kafka.txt
.PHONY: install-ingest-wikipedia .PHONY: install-ingest-wikipedia
install-ingest-wikipedia: install-ingest-wikipedia:
python3 -m pip install -r requirements/ingest/wikipedia.txt ${PYTHON} -m pip install -r requirements/ingest/wikipedia.txt
.PHONY: install-ingest-elasticsearch .PHONY: install-ingest-elasticsearch
install-ingest-elasticsearch: install-ingest-elasticsearch:
python3 -m pip install -r requirements/ingest/elasticsearch.txt ${PYTHON} -m pip install -r requirements/ingest/elasticsearch.txt
.PHONY: install-ingest-opensearch .PHONY: install-ingest-opensearch
install-ingest-opensearch: install-ingest-opensearch:
python3 -m pip install -r requirements/ingest/opensearch.txt ${PYTHON} -m pip install -r requirements/ingest/opensearch.txt
.PHONY: install-ingest-confluence .PHONY: install-ingest-confluence
install-ingest-confluence: install-ingest-confluence:
python3 -m pip install -r requirements/ingest/confluence.txt ${PYTHON} -m pip install -r requirements/ingest/confluence.txt
.PHONY: install-ingest-airtable .PHONY: install-ingest-airtable
install-ingest-airtable: install-ingest-airtable:
python3 -m pip install -r requirements/ingest/airtable.txt ${PYTHON} -m pip install -r requirements/ingest/airtable.txt
.PHONY: install-ingest-sharepoint .PHONY: install-ingest-sharepoint
install-ingest-sharepoint: install-ingest-sharepoint:
python3 -m pip install -r requirements/ingest/sharepoint.txt ${PYTHON} -m pip install -r requirements/ingest/sharepoint.txt
.PHONY: install-ingest-singlestore .PHONY: install-ingest-singlestore
install-ingest-singlestore: install-ingest-singlestore:
python3 -m pip install -r requirements/ingest/singlestore.txt ${PYTHON} -m pip install -r requirements/ingest/singlestore.txt
.PHONY: install-ingest-weaviate .PHONY: install-ingest-weaviate
install-ingest-weaviate: install-ingest-weaviate:
python3 -m pip install -r requirements/ingest/weaviate.txt ${PYTHON} -m pip install -r requirements/ingest/weaviate.txt
.PHONY: install-ingest-local .PHONY: install-ingest-local
install-ingest-local: install-ingest-local:
@ -206,63 +207,63 @@ install-ingest-local:
.PHONY: install-ingest-notion .PHONY: install-ingest-notion
install-ingest-notion: install-ingest-notion:
python3 -m pip install -r requirements/ingest/notion.txt ${PYTHON} -m pip install -r requirements/ingest/notion.txt
.PHONY: install-ingest-salesforce .PHONY: install-ingest-salesforce
install-ingest-salesforce: install-ingest-salesforce:
python3 -m pip install -r requirements/ingest/salesforce.txt ${PYTHON} -m pip install -r requirements/ingest/salesforce.txt
.PHONY: install-ingest-jira .PHONY: install-ingest-jira
install-ingest-jira: install-ingest-jira:
python3 -m pip install -r requirements/ingest/jira.txt ${PYTHON} -m pip install -r requirements/ingest/jira.txt
.PHONY: install-ingest-hubspot .PHONY: install-ingest-hubspot
install-ingest-hubspot: install-ingest-hubspot:
python3 -m pip install -r requirements/ingest/hubspot.txt ${PYTHON} -m pip install -r requirements/ingest/hubspot.txt
.PHONY: install-ingest-sftp .PHONY: install-ingest-sftp
install-ingest-sftp: install-ingest-sftp:
python3 -m pip install -r requirements/ingest/sftp.txt ${PYTHON} -m pip install -r requirements/ingest/sftp.txt
.PHONY: install-ingest-pinecone .PHONY: install-ingest-pinecone
install-ingest-pinecone: install-ingest-pinecone:
python3 -m pip install -r requirements/ingest/pinecone.txt ${PYTHON} -m pip install -r requirements/ingest/pinecone.txt
.PHONY: install-ingest-qdrant .PHONY: install-ingest-qdrant
install-ingest-qdrant: install-ingest-qdrant:
python3 -m pip install -r requirements/ingest/qdrant.txt ${PYTHON} -m pip install -r requirements/ingest/qdrant.txt
.PHONY: install-ingest-chroma .PHONY: install-ingest-chroma
install-ingest-chroma: install-ingest-chroma:
python3 -m pip install -r requirements/ingest/chroma.txt ${PYTHON} -m pip install -r requirements/ingest/chroma.txt
.PHONY: install-ingest-postgres .PHONY: install-ingest-postgres
install-ingest-postgres: install-ingest-postgres:
python3 -m pip install -r requirements/ingest/postgres.txt ${PYTHON} -m pip install -r requirements/ingest/postgres.txt
.PHONY: install-ingest-mongodb .PHONY: install-ingest-mongodb
install-ingest-mongodb: install-ingest-mongodb:
python3 -m pip install -r requirements/ingest/mongodb.txt ${PYTHON} -m pip install -r requirements/ingest/mongodb.txt
.PHONY: install-ingest-databricks-volumes .PHONY: install-ingest-databricks-volumes
install-ingest-databricks-volumes: install-ingest-databricks-volumes:
python3 -m pip install -r requirements/ingest/databricks-volumes.txt ${PYTHON} -m pip install -r requirements/ingest/databricks-volumes.txt
.PHONY: install-ingest-astradb .PHONY: install-ingest-astradb
install-ingest-astradb: install-ingest-astradb:
python3 -m pip install -r requirements/ingest/astradb.txt ${PYTHON} -m pip install -r requirements/ingest/astradb.txt
.PHONY: install-ingest-clarifai .PHONY: install-ingest-clarifai
install-ingest-clarifai: install-ingest-clarifai:
python3 -m pip install -r requirements/ingest/clarifai.txt ${PYTHON} -m pip install -r requirements/ingest/clarifai.txt
.PHONY: install-embed-huggingface .PHONY: install-embed-huggingface
install-embed-huggingface: install-embed-huggingface:
python3 -m pip install -r requirements/ingest/embed-huggingface.txt ${PYTHON} -m pip install -r requirements/ingest/embed-huggingface.txt
.PHONY: install-unstructured-inference .PHONY: install-unstructured-inference
install-unstructured-inference: install-unstructured-inference:
python3 -m pip install -r requirements/ingest/local-inference.txt ${PYTHON} -m pip install -r requirements/ingest/local-inference.txt
## install-local-inference: installs requirements for local inference ## install-local-inference: installs requirements for local inference
.PHONY: install-local-inference .PHONY: install-local-inference
@ -281,12 +282,12 @@ pip-compile:
.PHONY: install-project-local .PHONY: install-project-local
install-project-local: install install-project-local: install
# MAYBE TODO: fail if already exists? # MAYBE TODO: fail if already exists?
pip install -e . ${PYTHON} -m pip install -e .
## uninstall-project-local: uninstall unstructured from your local python environment ## uninstall-project-local: uninstall unstructured from your local python environment
.PHONY: uninstall-project-local .PHONY: uninstall-project-local
uninstall-project-local: uninstall-project-local:
pip uninstall ${PACKAGE_NAME} ${PYTHON} -m pip uninstall ${PACKAGE_NAME}
################# #################
# Test and Lint # # Test and Lint #
@ -299,12 +300,12 @@ export UNSTRUCTURED_INCLUDE_DEBUG_METADATA ?= false
.PHONY: test .PHONY: test
test: test:
PYTHONPATH=. CI=$(CI) \ PYTHONPATH=. CI=$(CI) \
UNSTRUCTURED_INCLUDE_DEBUG_METADATA=$(UNSTRUCTURED_INCLUDE_DEBUG_METADATA) pytest test_${PACKAGE_NAME} -m "not chipper" --cov=${PACKAGE_NAME} --cov-report term-missing --durations=40 UNSTRUCTURED_INCLUDE_DEBUG_METADATA=$(UNSTRUCTURED_INCLUDE_DEBUG_METADATA) ${PYTHON} -m pytest test_${PACKAGE_NAME} -m "not chipper" --cov=${PACKAGE_NAME} --cov-report term-missing --durations=40
.PHONY: test-chipper .PHONY: test-chipper
test-chipper: test-chipper:
PYTHONPATH=. CI=$(CI) \ PYTHONPATH=. CI=$(CI) \
UNSTRUCTURED_INCLUDE_DEBUG_METADATA=$(UNSTRUCTURED_INCLUDE_DEBUG_METADATA) pytest test_${PACKAGE_NAME} -m "chipper" --cov=${PACKAGE_NAME} --cov-report term-missing --durations=40 UNSTRUCTURED_INCLUDE_DEBUG_METADATA=$(UNSTRUCTURED_INCLUDE_DEBUG_METADATA) ${PYTHON} -m pytest test_${PACKAGE_NAME} -m "chipper" --cov=${PACKAGE_NAME} --cov-report term-missing --durations=40
.PHONY: test-unstructured-api-unit .PHONY: test-unstructured-api-unit
test-unstructured-api-unit: test-unstructured-api-unit:
@ -313,7 +314,7 @@ test-unstructured-api-unit:
.PHONY: test-no-extras .PHONY: test-no-extras
test-no-extras: test-no-extras:
PYTHONPATH=. CI=$(CI) \ PYTHONPATH=. CI=$(CI) \
UNSTRUCTURED_INCLUDE_DEBUG_METADATA=$(UNSTRUCTURED_INCLUDE_DEBUG_METADATA) pytest \ UNSTRUCTURED_INCLUDE_DEBUG_METADATA=$(UNSTRUCTURED_INCLUDE_DEBUG_METADATA) ${PYTHON} -m pytest \
test_${PACKAGE_NAME}/partition/test_text.py \ test_${PACKAGE_NAME}/partition/test_text.py \
test_${PACKAGE_NAME}/partition/test_email.py \ test_${PACKAGE_NAME}/partition/test_email.py \
test_${PACKAGE_NAME}/partition/html/test_partition.py \ test_${PACKAGE_NAME}/partition/html/test_partition.py \
@ -321,48 +322,48 @@ test-no-extras:
.PHONY: test-extra-csv .PHONY: test-extra-csv
test-extra-csv: test-extra-csv:
PYTHONPATH=. CI=$(CI) pytest \ PYTHONPATH=. CI=$(CI) ${PYTHON} -m pytest \
test_unstructured/partition/test_csv.py \ test_unstructured/partition/test_csv.py \
test_unstructured/partition/test_tsv.py test_unstructured/partition/test_tsv.py
.PHONY: test-extra-docx .PHONY: test-extra-docx
test-extra-docx: test-extra-docx:
PYTHONPATH=. CI=$(CI) pytest \ PYTHONPATH=. CI=$(CI) ${PYTHON} -m pytest \
test_unstructured/partition/test_doc.py \ test_unstructured/partition/test_doc.py \
test_unstructured/partition/test_docx.py test_unstructured/partition/test_docx.py
.PHONY: test-extra-epub .PHONY: test-extra-epub
test-extra-epub: test-extra-epub:
PYTHONPATH=. CI=$(CI) pytest test_unstructured/partition/test_epub.py PYTHONPATH=. CI=$(CI) ${PYTHON} -m pytest test_unstructured/partition/test_epub.py
.PHONY: test-extra-markdown .PHONY: test-extra-markdown
test-extra-markdown: test-extra-markdown:
PYTHONPATH=. CI=$(CI) pytest test_unstructured/partition/test_md.py PYTHONPATH=. CI=$(CI) ${PYTHON} -m pytest test_unstructured/partition/test_md.py
.PHONY: test-extra-odt .PHONY: test-extra-odt
test-extra-odt: test-extra-odt:
PYTHONPATH=. CI=$(CI) pytest test_unstructured/partition/test_odt.py PYTHONPATH=. CI=$(CI) ${PYTHON} -m pytest test_unstructured/partition/test_odt.py
.PHONY: test-extra-pdf-image .PHONY: test-extra-pdf-image
test-extra-pdf-image: test-extra-pdf-image:
PYTHONPATH=. CI=$(CI) pytest test_unstructured/partition/pdf_image PYTHONPATH=. CI=$(CI) ${PYTHON} -m pytest test_unstructured/partition/pdf_image
.PHONY: test-extra-pptx .PHONY: test-extra-pptx
test-extra-pptx: test-extra-pptx:
PYTHONPATH=. CI=$(CI) pytest \ PYTHONPATH=. CI=$(CI) ${PYTHON} -m pytest \
test_unstructured/partition/test_ppt.py \ test_unstructured/partition/test_ppt.py \
test_unstructured/partition/test_pptx.py test_unstructured/partition/test_pptx.py
.PHONY: test-extra-pypandoc .PHONY: test-extra-pypandoc
test-extra-pypandoc: test-extra-pypandoc:
PYTHONPATH=. CI=$(CI) pytest \ PYTHONPATH=. CI=$(CI) ${PYTHON} -m pytest \
test_unstructured/partition/test_org.py \ test_unstructured/partition/test_org.py \
test_unstructured/partition/test_rst.py \ test_unstructured/partition/test_rst.py \
test_unstructured/partition/test_rtf.py test_unstructured/partition/test_rtf.py
.PHONY: test-extra-xlsx .PHONY: test-extra-xlsx
test-extra-xlsx: test-extra-xlsx:
PYTHONPATH=. CI=$(CI) pytest test_unstructured/partition/test_xlsx.py PYTHONPATH=. CI=$(CI) ${PYTHON} -m pytest test_unstructured/partition/test_xlsx.py
## check: runs linters (includes tests) ## check: runs linters (includes tests)
.PHONY: check .PHONY: check
@ -374,11 +375,11 @@ check-shfmt:
.PHONY: check-black .PHONY: check-black
check-black: check-black:
black . --check --line-length=100 ${PYTHON} -m black . --check --line-length=100
.PHONY: check-flake8 .PHONY: check-flake8
check-flake8: check-flake8:
flake8 . ${PYTHON} -m flake8 .
.PHONY: check-licenses .PHONY: check-licenses
check-licenses: check-licenses:
@ -434,7 +435,7 @@ version-sync:
.PHONY: check-coverage .PHONY: check-coverage
check-coverage: check-coverage:
coverage report --fail-under=90 ${PYTHON} -m coverage report --fail-under=90
## check-deps: check consistency of dependencies ## check-deps: check consistency of dependencies
.PHONY: check-deps .PHONY: check-deps

View File

@ -4,6 +4,7 @@ set -u -o pipefail
SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd) SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)
SKIPPED_FILES_LOG=$SCRIPT_DIR/skipped-files.txt SKIPPED_FILES_LOG=$SCRIPT_DIR/skipped-files.txt
PYTHON=${PYTHON:-python}
# If the file already exists, reset it # If the file already exists, reset it
if [ -f "$SKIPPED_FILES_LOG" ]; then if [ -f "$SKIPPED_FILES_LOG" ]; then
rm "$SKIPPED_FILES_LOG" rm "$SKIPPED_FILES_LOG"
@ -92,7 +93,7 @@ function print_last_run() {
trap print_last_run EXIT trap print_last_run EXIT
python_version=$(python --version 2>&1) python_version=$(${PYTHON} --version 2>&1)
tests_to_ignore=( tests_to_ignore=(
'notion.sh' 'notion.sh'