# Makefile for the `unstructured` package.
# Mirror of https://github.com/Unstructured-IO/unstructured.git
# (synced 2025-06-27 02:30:08 +00:00; original file: 345 lines, 10 KiB).
# Name of the Python package; the test recipes also use it to build `test_<name>`.
PACKAGE_NAME := unstructured
# pip release that the install recipes pin before installing requirements.
PIP_VERSION := 23.2.1
# Absolute path of the directory make is running in. $(CURDIR) is maintained by
# GNU make itself, so no shell has to be forked at parse time to obtain it.
CURRENT_DIR := $(CURDIR)
# Output of `uname -m`; handed to scripts/install-pandoc.sh by install-pandoc.
ARCH := $(shell uname -m)
# Interpreter used by the recipes; `?=` lets the environment or command line override it.
PYTHON ?= python3
|
|
|
|
# Print the documented targets of this Makefile.
# sed prints every line that begins with `## ` followed by a letter, with the
# `## ` prefix removed; all other lines are suppressed by -n. $< is the
# Makefile prerequisite, i.e. the file being scanned.
.PHONY: help
help: Makefile
	@sed -n 's/^\(## \)\([a-zA-Z]\)/\2/p' $<
|
|
|
|
|
|
###########
|
|
# Install #
|
|
###########
|
|
|
|
## install-base: installs core requirements needed for text processing bricks
# Aggregate target with no recipe of its own; all work is done by the
# prerequisites listed below.
.PHONY: install-base
install-base: \
    install-base-pip-packages \
    install-nltk-models
|
|
|
|
## install: installs all test, dev, and experimental requirements
# Aggregate target with no recipe of its own; all work is done by the
# prerequisites listed below.
.PHONY: install
install: \
    install-base-pip-packages \
    install-dev \
    install-nltk-models \
    install-test \
    install-huggingface \
    install-all-docs
|
|
|
|
# Aggregate install target (named for CI use): base packages, NLTK models,
# huggingface, all document extras, test requirements, pandoc and paddleocr.
.PHONY: install-ci
install-ci: \
    install-base-pip-packages \
    install-nltk-models \
    install-huggingface \
    install-all-docs \
    install-test \
    install-pandoc \
    install-paddleocr
|
|
|
|
# Reduced aggregate install target: base packages, NLTK models, test
# requirements and pandoc only.
.PHONY: install-base-ci
install-base-ci: \
    install-base-pip-packages \
    install-nltk-models \
    install-test \
    install-pandoc
|
|
|
|
# Pin pip to PIP_VERSION, then install requirements/base.txt.
.PHONY: install-base-pip-packages
install-base-pip-packages:
	$(PYTHON) -m pip install pip==$(PIP_VERSION)
	$(PYTHON) -m pip install -r requirements/base.txt
|
|
|
|
# Pin pip to PIP_VERSION, then install requirements/huggingface.txt.
.PHONY: install-huggingface
install-huggingface:
	$(PYTHON) -m pip install pip==$(PIP_VERSION)
	$(PYTHON) -m pip install -r requirements/huggingface.txt
|
|
|
|
# Run download_nltk_packages() from unstructured.nlp.tokenize with $(PYTHON).
.PHONY: install-nltk-models
install-nltk-models:
	$(PYTHON) -c "from unstructured.nlp.tokenize import download_nltk_packages; download_nltk_packages()"
|
|
|
|
# Install requirements/test.txt, then two packages that are installed on their
# own: unstructured_pytesseract, and weaviate-client under the constraints in
# requirements/deps/constraints.txt. The reasons are in the notes inside the recipe.
.PHONY: install-test
install-test:
	$(PYTHON) -m pip install -r requirements/test.txt
	# NOTE(yao) - CI seem to always install tesseract to test so it would make sense to also require
	# pytesseract installation into the virtual env for testing
	$(PYTHON) -m pip install unstructured_pytesseract
	# ${PYTHON} -m pip install argilla==1.28.0 -c requirements/deps/constraints.txt
	# NOTE(robinson) - Installing weaviate-client separately here because the requests
	# version conflicts with label_studio_sdk
	$(PYTHON) -m pip install weaviate-client -c requirements/deps/constraints.txt
|
|
|
|
# Install requirements/dev.txt with pip.
.PHONY: install-dev
install-dev:
	$(PYTHON) -m pip install -r requirements/dev.txt
|
|
|
|
# Install requirements/build.txt with pip.
.PHONY: install-build
install-build:
	$(PYTHON) -m pip install -r requirements/build.txt
|
|
|
|
# One target per optional extra; each installs the matching
# requirements/extra-*.txt file with pip. Note that install-pypandoc reads
# extra-pandoc.txt (the requirements file name has no `py` prefix).

.PHONY: install-csv
install-csv:
	$(PYTHON) -m pip install -r requirements/extra-csv.txt

.PHONY: install-docx
install-docx:
	$(PYTHON) -m pip install -r requirements/extra-docx.txt

.PHONY: install-epub
install-epub:
	$(PYTHON) -m pip install -r requirements/extra-epub.txt

.PHONY: install-odt
install-odt:
	$(PYTHON) -m pip install -r requirements/extra-odt.txt

.PHONY: install-pypandoc
install-pypandoc:
	$(PYTHON) -m pip install -r requirements/extra-pandoc.txt

.PHONY: install-paddleocr
install-paddleocr:
	$(PYTHON) -m pip install -r requirements/extra-paddleocr.txt

.PHONY: install-markdown
install-markdown:
	$(PYTHON) -m pip install -r requirements/extra-markdown.txt

.PHONY: install-pdf-image
install-pdf-image:
	$(PYTHON) -m pip install -r requirements/extra-pdf-image.txt

.PHONY: install-pptx
install-pptx:
	$(PYTHON) -m pip install -r requirements/extra-pptx.txt

.PHONY: install-xlsx
install-xlsx:
	$(PYTHON) -m pip install -r requirements/extra-xlsx.txt
|
|
|
|
# Aggregate target: install-base plus the per-document-type extras listed
# below (install-paddleocr is not part of this list).
.PHONY: install-all-docs
install-all-docs: \
    install-base \
    install-csv \
    install-docx \
    install-epub \
    install-odt \
    install-pypandoc \
    install-markdown \
    install-pdf-image \
    install-pptx \
    install-xlsx
|
|
|
|
# Install requirements/ingest/ingest.txt with pip.
# Uses $(PYTHON) rather than a hard-coded `python3`, so an interpreter chosen
# with `make PYTHON=...` is honoured here as in the other install targets.
.PHONY: install-ingest
install-ingest:
	$(PYTHON) -m pip install -r requirements/ingest/ingest.txt
|
|
|
|
## install-local-inference: installs requirements for local inference
# Aggregate target with no recipe; depends on `install` and `install-all-docs`.
.PHONY: install-local-inference
install-local-inference: \
    install \
    install-all-docs
|
|
|
|
# Run scripts/install-pandoc.sh with ARCH (the `uname -m` value) in its environment.
.PHONY: install-pandoc
install-pandoc:
	ARCH=$(ARCH) ./scripts/install-pandoc.sh
|
|
|
|
## pip-compile: compiles all base/dev/test requirements
# Delegates to scripts/pip-compile.sh; the leading `@` keeps make from echoing
# the command line.
.PHONY: pip-compile
pip-compile:
	@scripts/pip-compile.sh
|
|
|
|
## install-project-local: install unstructured into your local python environment
# Depends on `install`, then installs the current directory in editable mode (-e).
.PHONY: install-project-local
install-project-local: install
	# MAYBE TODO: fail if already exists?
	$(PYTHON) -m pip install -e .
|
|
|
|
## uninstall-project-local: uninstall unstructured from your local python environment
# Runs `pip uninstall` on PACKAGE_NAME; no `-y` flag is passed.
.PHONY: uninstall-project-local
uninstall-project-local:
	$(PYTHON) -m pip uninstall $(PACKAGE_NAME)
|
|
|
|
#################
|
|
# Test and Lint #
|
|
#################
|
|
|
|
# Defaults for the test recipes. `?=` keeps a value already provided by the
# environment or the command line; both variables are exported so that every
# recipe sees them in its environment.
CI ?= false
UNSTRUCTURED_INCLUDE_DEBUG_METADATA ?= false
export CI
export UNSTRUCTURED_INCLUDE_DEBUG_METADATA
|
|
|
|
## test: runs all unittests
# Runs pytest on test_$(PACKAGE_NAME) with `-n auto`, coverage for
# $(PACKAGE_NAME) reported with missing lines, and the 40 slowest durations.
.PHONY: test
test:
	PYTHONPATH=. \
	CI=$(CI) \
	UNSTRUCTURED_INCLUDE_DEBUG_METADATA=$(UNSTRUCTURED_INCLUDE_DEBUG_METADATA) \
	$(PYTHON) -m pytest -n auto test_$(PACKAGE_NAME) \
	--cov=$(PACKAGE_NAME) \
	--cov-report term-missing \
	--durations=40
|
|
|
|
# Delegates entirely to scripts/test-unstructured-api-unit.sh.
.PHONY: test-unstructured-api-unit
test-unstructured-api-unit:
	scripts/test-unstructured-api-unit.sh
|
|
|
|
# Run only the text, email, HTML and XML partition test modules.
.PHONY: test-no-extras
test-no-extras:
	PYTHONPATH=. \
	CI=$(CI) \
	UNSTRUCTURED_INCLUDE_DEBUG_METADATA=$(UNSTRUCTURED_INCLUDE_DEBUG_METADATA) \
	$(PYTHON) -m pytest -n auto \
	test_$(PACKAGE_NAME)/partition/test_text.py \
	test_$(PACKAGE_NAME)/partition/test_email.py \
	test_$(PACKAGE_NAME)/partition/html/test_partition.py \
	test_$(PACKAGE_NAME)/partition/test_xml.py
|
|
|
|
# One pytest target per optional extra. Each runs only the partition test
# modules (or directory) listed in its recipe, with PYTHONPATH=. and `-n auto`.

.PHONY: test-extra-csv
test-extra-csv:
	PYTHONPATH=. CI=$(CI) $(PYTHON) -m pytest -n auto \
	test_unstructured/partition/test_csv.py \
	test_unstructured/partition/test_tsv.py

.PHONY: test-extra-docx
test-extra-docx:
	PYTHONPATH=. CI=$(CI) $(PYTHON) -m pytest -n auto \
	test_unstructured/partition/test_doc.py \
	test_unstructured/partition/test_docx.py

.PHONY: test-extra-epub
test-extra-epub:
	PYTHONPATH=. CI=$(CI) $(PYTHON) -m pytest -n auto test_unstructured/partition/test_epub.py

.PHONY: test-extra-markdown
test-extra-markdown:
	PYTHONPATH=. CI=$(CI) $(PYTHON) -m pytest -n auto test_unstructured/partition/test_md.py

.PHONY: test-extra-odt
test-extra-odt:
	PYTHONPATH=. CI=$(CI) $(PYTHON) -m pytest -n auto test_unstructured/partition/test_odt.py

.PHONY: test-extra-pdf-image
test-extra-pdf-image:
	PYTHONPATH=. CI=$(CI) $(PYTHON) -m pytest -n auto test_unstructured/partition/pdf_image

.PHONY: test-extra-pptx
test-extra-pptx:
	PYTHONPATH=. CI=$(CI) $(PYTHON) -m pytest -n auto \
	test_unstructured/partition/test_ppt.py \
	test_unstructured/partition/test_pptx.py

.PHONY: test-extra-pypandoc
test-extra-pypandoc:
	PYTHONPATH=. CI=$(CI) $(PYTHON) -m pytest -n auto \
	test_unstructured/partition/test_org.py \
	test_unstructured/partition/test_rst.py \
	test_unstructured/partition/test_rtf.py

.PHONY: test-extra-xlsx
test-extra-xlsx:
	PYTHONPATH=. CI=$(CI) $(PYTHON) -m pytest -n auto test_unstructured/partition/test_xlsx.py
|
|
|
|
# Run only test_unstructured/metrics/test_text_extraction.py under pytest.
.PHONY: test-text-extraction-evaluate
test-text-extraction-evaluate:
	PYTHONPATH=. CI=$(CI) $(PYTHON) -m pytest -n auto test_unstructured/metrics/test_text_extraction.py
|
|
|
|
## check: runs linters (includes tests)
# Aggregate target with no recipe: ruff, black, flake8 and the version check.
.PHONY: check
check: \
    check-ruff \
    check-black \
    check-flake8 \
    check-version
|
|
|
|
# Individual lint/format checks. Only check-ruff, check-black and check-flake8
# are prerequisites of `check`; the others are run on their own.

.PHONY: check-shfmt
check-shfmt:
	shfmt -i 2 -d .

.PHONY: check-black
check-black:
	$(PYTHON) -m black . --check --line-length=100

.PHONY: check-flake8
check-flake8:
	$(PYTHON) -m flake8 .

.PHONY: check-licenses
check-licenses:
	@scripts/check-licenses.sh

.PHONY: check-ruff
check-ruff:
	# -- ruff options are determined by pyproject.toml --
	ruff check .

.PHONY: check-autoflake
check-autoflake:
	autoflake --check-diff .
|
|
|
|
## check-scripts: run shellcheck
# Delegates to scripts/shellcheck.sh.
.PHONY: check-scripts
check-scripts:
	# Fail if any of these files have warnings
	scripts/shellcheck.sh
|
|
|
|
## check-version: run check to ensure version in CHANGELOG.md matches version in package
# Same script and arguments as `version-sync`, plus the `-c` flag.
.PHONY: check-version
check-version:
	# Fail if syncing version would produce changes
	scripts/version-sync.sh -c -f "unstructured/__version__.py" semver
|
|
|
|
## tidy: auto-fix lint findings and reformat Python code (ruff, autoflake, black)
# Alias for tidy-python. The help text above names all three tools that
# tidy-python runs, not just black.
.PHONY: tidy
tidy: tidy-python
|
|
|
|
# Run shfmt over the tree with 2-space indentation (-i 2), -l and -w.
# The .PHONY name must match the target name exactly (hyphen, not underscore);
# otherwise a file named `tidy-shell` would stop this recipe from running.
.PHONY: tidy-shell
tidy-shell:
	shfmt -i 2 -l -w .
|
|
|
|
# Apply automatic Python fixes in three passes: ruff (--fix-only), autoflake
# (--in-place) and black (line length 100).
# `|| true` makes the ruff line always succeed, so a non-zero exit from ruff
# never stops the two passes after it.
.PHONY: tidy-python
tidy-python:
	ruff check . --fix-only || true
	autoflake --in-place .
	black --line-length=100 .
|
|
|
|
## version-sync: update __version__.py with most recent version from CHANGELOG.md
# Same script and arguments as `check-version`, without the `-c` flag.
.PHONY: version-sync
version-sync:
	scripts/version-sync.sh -f "unstructured/__version__.py" semver
|
|
|
|
# Print the coverage report; --fail-under=90 sets the threshold passed to coverage.
.PHONY: check-coverage
check-coverage:
	$(PYTHON) -m coverage report --fail-under=90
|
|
|
|
## check-deps: check consistency of dependencies
# Delegates to scripts/consistent-deps.sh.
.PHONY: check-deps
check-deps:
	scripts/consistent-deps.sh
|
|
|
|
# Delegates to scripts/check-extras.sh.
.PHONY: check-extras
check-extras:
	scripts/check-extras.sh
|
|
|
|
##########
|
|
# Docker #
|
|
##########
|
|
|
|
# Docker targets are provided for convenience only and are not required in a standard development environment

# Image reference used by the docker-* targets. `?=` keeps a value already
# supplied by the environment or the command line.
DOCKER_IMAGE ?= unstructured:dev
|
|
|
|
# Run scripts/docker-build.sh with PIP_VERSION and DOCKER_IMAGE_NAME (taken
# from DOCKER_IMAGE) set in its environment.
.PHONY: docker-build
docker-build:
	PIP_VERSION=$(PIP_VERSION) DOCKER_IMAGE_NAME=$(DOCKER_IMAGE) ./scripts/docker-build.sh
|
|
|
|
# Start an interactive (-ti) container from DOCKER_IMAGE; --rm removes it on exit.
# No command is given, so the image's own default command is used.
.PHONY: docker-start-bash
docker-start-bash:
	docker run -ti --rm $(DOCKER_IMAGE)
|
|
|
|
# Start an interactive container with CURRENT_DIR bind-mounted into it.
# NOTE(review): the mount point is spelled `local_unstructued` - this looks like
# a typo; confirm nothing relies on the path before renaming it.
.PHONY: docker-start-dev
docker-start-dev:
	docker run --rm \
	-v $(CURRENT_DIR):/mnt/local_unstructued \
	-ti $(DOCKER_IMAGE)
|
|
|
|
# Run the test suite inside a throw-away container of DOCKER_IMAGE.
#  - the local test_unstructured and test_unstructured_ingest directories are
#    bind-mounted under /home/notebook-user;
#  - `--env-file uns_test_env_file` is added only when that file exists in the
#    directory make runs in (the $(wildcard) test);
#  - inside the container, test and dev requirements are installed first, then
#    pytest runs on TEST_FILE if it is set, otherwise on test_unstructured.
.PHONY: docker-test
docker-test:
	docker run --rm \
	-v $(CURRENT_DIR)/test_unstructured:/home/notebook-user/test_unstructured \
	-v $(CURRENT_DIR)/test_unstructured_ingest:/home/notebook-user/test_unstructured_ingest \
	$(if $(wildcard uns_test_env_file),--env-file uns_test_env_file,) \
	$(DOCKER_IMAGE) \
	bash -c "pip install -r requirements/test.txt -r requirements/dev.txt && \
	CI=$(CI) \
	UNSTRUCTURED_INCLUDE_DEBUG_METADATA=$(UNSTRUCTURED_INCLUDE_DEBUG_METADATA) \
	python3 -m pytest $(if $(TEST_FILE),$(TEST_FILE),test_unstructured)"
|
|
|
|
# Run scripts/docker-smoke-test.sh with DOCKER_IMAGE set in its environment.
.PHONY: docker-smoke-test
docker-smoke-test:
	DOCKER_IMAGE=$(DOCKER_IMAGE) ./scripts/docker-smoke-test.sh
|
|
|
|
|
|
###########
|
|
# Jupyter #
|
|
###########
|
|
|
|
# Start jupyter-notebook as the entrypoint of a DOCKER_IMAGE container.
# Port 8888 is published, the directory make runs in is bind-mounted at /home,
# and both the notebook token and password are set to empty strings.
.PHONY: docker-jupyter-notebook
docker-jupyter-notebook:
	docker run -p 8888:8888 \
	--mount type=bind,source=$(realpath .),target=/home \
	--entrypoint jupyter-notebook \
	-t --rm $(DOCKER_IMAGE) \
	--allow-root --port 8888 --ip 0.0.0.0 \
	--NotebookApp.token='' --NotebookApp.password=''
|
|
|
|
|
|
# Start jupyter-notebook on the host with PYTHONPATH and JUPYTER_PATH pointing
# at the directory make runs in; token and password are set to empty strings.
.PHONY: run-jupyter
run-jupyter:
	PYTHONPATH=$(realpath .) \
	JUPYTER_PATH=$(realpath .) \
	jupyter-notebook --NotebookApp.token='' --NotebookApp.password=''
|
|
|
|
|
|
###########
|
|
# Other #
|
|
###########
|
|
|
|
# Delete the expected-HTML fixture directory, then run
# structured-json-to-html.sh with that directory as its argument.
# `rm -rf` is used instead of `rm -r` so that a missing directory (for example
# on a fresh checkout) does not fail the `&&` chain before the script runs.
.PHONY: html-fixtures-update
html-fixtures-update:
	rm -rf test_unstructured_ingest/expected-structured-output-html && \
	test_unstructured_ingest/structured-json-to-html.sh test_unstructured_ingest/expected-structured-output-html
|