# unstructured/Makefile
# Name of the Python package; the test directories are derived from it
# (test_$(PACKAGE_NAME), test_$(PACKAGE_NAME)_ingest).
PACKAGE_NAME := unstructured
# pip release pinned by the install targets and passed to the docker build.
PIP_VERSION := 23.2.1
# Directory make is running in. $(CURDIR) is maintained by make itself, so no
# shell has to be forked at parse time just to call `pwd`.
CURRENT_DIR := $(CURDIR)
# Machine architecture as reported by `uname -m`; handed to the pandoc and
# paddleocr install scripts.
ARCH := $(shell uname -m)
# help: list the documented targets. Every line of this Makefile that starts
# with two hashes, a space and a letter is printed with that marker removed.
.PHONY: help
help: Makefile
	@sed -n 's/^\(## \)\([a-zA-Z]\)/\2/p' $<
###########
# Install #
###########
# Aggregate install targets. They have no recipe of their own; each one only
# chains the finer-grained install-* targets listed as prerequisites.

## install-base: installs core requirements needed for text processing bricks
.PHONY: install-base
install-base: \
    install-base-pip-packages \
    install-nltk-models

## install: installs all test, dev, and experimental requirements
.PHONY: install
install: \
    install-base-pip-packages \
    install-dev \
    install-nltk-models \
    install-test \
    install-huggingface \
    install-all-docs

# install-ci: dependency set for CI (same as install, minus install-dev).
.PHONY: install-ci
install-ci: \
    install-base-pip-packages \
    install-nltk-models \
    install-huggingface \
    install-all-docs \
    install-test

# install-base-ci: base requirements plus the test requirements.
.PHONY: install-base-ci
install-base-ci: \
    install-base-pip-packages \
    install-nltk-models \
    install-test
# install-base-pip-packages: pin pip to $(PIP_VERSION), then install the base
# requirements file.
.PHONY: install-base-pip-packages
install-base-pip-packages:
	python3 -m pip install pip==$(PIP_VERSION)
	python3 -m pip install -r requirements/base.txt
# install-huggingface: pin pip to $(PIP_VERSION), then install the huggingface
# requirements file.
.PHONY: install-huggingface
install-huggingface:
	python3 -m pip install pip==$(PIP_VERSION)
	python3 -m pip install -r requirements/huggingface.txt
# install-nltk-models: download the NLTK resources "punkt" and
# "averaged_perceptron_tagger".
# The declaration previously read ".PHONE", which make treats as an ordinary
# target name, so this target was never actually phony.
# The recipe uses python3 so the downloads run under the same interpreter the
# pip install targets install into.
.PHONY: install-nltk-models
install-nltk-models:
	python3 -c "import nltk; nltk.download('punkt')"
	python3 -c "import nltk; nltk.download('averaged_perceptron_tagger')"
# install-test: install the test requirements, then a handful of extra
# packages that are installed one by one (see the notes in the recipe).
.PHONY: install-test
install-test:
	python3 -m pip install -r requirements/test.txt
	# NOTE(yao) - CI seem to always install tesseract to test so it would make sense to also require
	# pytesseract installation into the virtual env for testing
	python3 -m pip install unstructured.pytesseract -c requirements/constraints.in
	python3 -m pip install argilla -c requirements/constraints.in
	# NOTE(robinson) - Installing weaviate-client separately here because the requests
	# version conflicts with label_studio_sdk
	python3 -m pip install weaviate-client -c requirements/constraints.in
	# TODO (yao): find out how to constrain argilla properly without causing conflicts
	python3 -m pip install argilla
# install-dev: install the development requirements.
.PHONY: install-dev
install-dev:
	python3 -m pip install -r requirements/dev.txt

# install-build: install the build requirements.
.PHONY: install-build
install-build:
	python3 -m pip install -r requirements/build.txt
# Per-document-type extras. Each target installs one requirements/extra-*.txt
# file. Note that install-pypandoc maps to extra-pandoc.txt.

.PHONY: install-csv
install-csv:
	python3 -m pip install -r requirements/extra-csv.txt

.PHONY: install-docx
install-docx:
	python3 -m pip install -r requirements/extra-docx.txt

.PHONY: install-epub
install-epub:
	python3 -m pip install -r requirements/extra-epub.txt

.PHONY: install-odt
install-odt:
	python3 -m pip install -r requirements/extra-odt.txt

.PHONY: install-pypandoc
install-pypandoc:
	python3 -m pip install -r requirements/extra-pandoc.txt

.PHONY: install-markdown
install-markdown:
	python3 -m pip install -r requirements/extra-markdown.txt

.PHONY: install-msg
install-msg:
	python3 -m pip install -r requirements/extra-msg.txt

.PHONY: install-pdf-image
install-pdf-image:
	python3 -m pip install -r requirements/extra-pdf-image.txt

.PHONY: install-pptx
install-pptx:
	python3 -m pip install -r requirements/extra-pptx.txt

.PHONY: install-xlsx
install-xlsx:
	python3 -m pip install -r requirements/extra-xlsx.txt
# install-all-docs: base requirements plus every per-document-type extra.
.PHONY: install-all-docs
install-all-docs: \
    install-base \
    install-csv \
    install-docx \
    install-epub \
    install-odt \
    install-pypandoc \
    install-markdown \
    install-msg \
    install-pdf-image \
    install-pptx \
    install-xlsx
# Ingest connector dependencies. Each target installs a single
# requirements/ingest-<connector>.txt file. All of them go through
# `python3 -m pip` so the packages land in the same interpreter; the discord
# target previously called bare `pip`, which may resolve to another one.

.PHONY: install-ingest-google-drive
install-ingest-google-drive:
	python3 -m pip install -r requirements/ingest-google-drive.txt

## install-ingest-s3: install requirements for the s3 connector
.PHONY: install-ingest-s3
install-ingest-s3:
	python3 -m pip install -r requirements/ingest-s3.txt

.PHONY: install-ingest-gcs
install-ingest-gcs:
	python3 -m pip install -r requirements/ingest-gcs.txt

.PHONY: install-ingest-dropbox
install-ingest-dropbox:
	python3 -m pip install -r requirements/ingest-dropbox.txt

.PHONY: install-ingest-azure
install-ingest-azure:
	python3 -m pip install -r requirements/ingest-azure.txt

.PHONY: install-ingest-box
install-ingest-box:
	python3 -m pip install -r requirements/ingest-box.txt

.PHONY: install-ingest-delta-table
install-ingest-delta-table:
	python3 -m pip install -r requirements/ingest-delta-table.txt

.PHONY: install-ingest-discord
install-ingest-discord:
	python3 -m pip install -r requirements/ingest-discord.txt

.PHONY: install-ingest-github
install-ingest-github:
	python3 -m pip install -r requirements/ingest-github.txt

.PHONY: install-ingest-biomed
install-ingest-biomed:
	python3 -m pip install -r requirements/ingest-biomed.txt

.PHONY: install-ingest-gitlab
install-ingest-gitlab:
	python3 -m pip install -r requirements/ingest-gitlab.txt
# More ingest connector dependencies, one requirements file per target.
# The slack target previously called bare `pip`; it now uses `python3 -m pip`
# like its siblings so it installs into the same interpreter.

.PHONY: install-ingest-onedrive
install-ingest-onedrive:
	python3 -m pip install -r requirements/ingest-onedrive.txt

.PHONY: install-ingest-outlook
install-ingest-outlook:
	python3 -m pip install -r requirements/ingest-outlook.txt

.PHONY: install-ingest-reddit
install-ingest-reddit:
	python3 -m pip install -r requirements/ingest-reddit.txt

.PHONY: install-ingest-slack
install-ingest-slack:
	python3 -m pip install -r requirements/ingest-slack.txt

.PHONY: install-ingest-wikipedia
install-ingest-wikipedia:
	python3 -m pip install -r requirements/ingest-wikipedia.txt
# Remaining ingest connector dependencies, one requirements file per target.
# install-ingest-local installs nothing and only prints a message.

.PHONY: install-ingest-elasticsearch
install-ingest-elasticsearch:
	python3 -m pip install -r requirements/ingest-elasticsearch.txt

.PHONY: install-ingest-confluence
install-ingest-confluence:
	python3 -m pip install -r requirements/ingest-confluence.txt

.PHONY: install-ingest-airtable
install-ingest-airtable:
	python3 -m pip install -r requirements/ingest-airtable.txt

.PHONY: install-ingest-sharepoint
install-ingest-sharepoint:
	python3 -m pip install -r requirements/ingest-sharepoint.txt

.PHONY: install-ingest-local
install-ingest-local:
	echo "no unique dependencies for local connector"

.PHONY: install-ingest-notion
install-ingest-notion:
	python3 -m pip install -r requirements/ingest-notion.txt

.PHONY: install-ingest-salesforce
install-ingest-salesforce:
	python3 -m pip install -r requirements/ingest-salesforce.txt

.PHONY: install-ingest-jira
install-ingest-jira:
	python3 -m pip install -r requirements/ingest-jira.txt
# install-unstructured-inference: install the local-inference requirements file.
.PHONY: install-unstructured-inference
install-unstructured-inference:
	python3 -m pip install -r requirements/local-inference.txt

# NOTE(review): this aggregate does not depend on
# install-unstructured-inference, and `install` already pulls in
# install-all-docs - confirm that both are intended.
## install-local-inference: installs requirements for local inference
.PHONY: install-local-inference
install-local-inference: \
    install \
    install-all-docs
# install-pandoc / install-paddleocr: delegate to helper scripts, passing the
# detected machine architecture through the ARCH environment variable.

.PHONY: install-pandoc
install-pandoc:
	ARCH=$(ARCH) ./scripts/install-pandoc.sh

.PHONY: install-paddleocr
install-paddleocr:
	ARCH=$(ARCH) ./scripts/install-paddleocr.sh
# Runs `pip-compile --upgrade` on every requirements/*.in file whose path does
# not contain "constraints", then copies the compiled build requirements to
# docs/requirements.txt.
#
# Implementation notes:
#  - The file list comes from $(wildcard) (sorted for a stable order) instead
#    of forking `ls`.
#  - The "constraints" filter is a POSIX `case`, so the recipe also works when
#    make's shell is a plain /bin/sh rather than bash.
#  - `|| exit 1` stops at the first failing compile; without it the loop kept
#    going and only the last file's exit status reached make.
## pip-compile: compiles all base/dev/test requirements
.PHONY: pip-compile
pip-compile:
	@for file in $(sort $(wildcard requirements/*.in)); do \
		case "$${file}" in \
			*constraints*) continue ;; \
		esac; \
		echo "running: pip-compile --upgrade $${file}"; \
		pip-compile --upgrade "$${file}" || exit 1; \
	done
	cp requirements/build.txt docs/requirements.txt
# Both targets call pip through `python3 -m pip`, matching the other install
# targets, so the editable install and the uninstall act on the same
# interpreter that received the requirements.

## install-project-local: install unstructured into your local python environment
.PHONY: install-project-local
install-project-local: install
	# MAYBE TODO: fail if already exists?
	python3 -m pip install -e .

## uninstall-project-local: uninstall unstructured from your local python environment
.PHONY: uninstall-project-local
uninstall-project-local:
	python3 -m pip uninstall $(PACKAGE_NAME)
#################
# Test and Lint #
#################
# CI defaults to "false" unless it is already set; it is exported so every
# recipe sees it in its environment.
export CI ?= false

## test: runs all unittests
.PHONY: test
test:
	PYTHONPATH=. CI=$(CI) pytest test_$(PACKAGE_NAME) \
		--cov=$(PACKAGE_NAME) \
		--cov-report term-missing
# test-unstructured-api-unit: delegates to scripts/test-unstructured-api-unit.sh.
.PHONY: test-unstructured-api-unit
test-unstructured-api-unit:
	scripts/test-unstructured-api-unit.sh
# test-no-extras: run only the text, email, html and xml partition tests.
# TODO(newelh) Add json test when fixed
.PHONY: test-no-extras
test-no-extras:
	PYTHONPATH=. CI=$(CI) pytest \
		test_$(PACKAGE_NAME)/partition/test_text.py \
		test_$(PACKAGE_NAME)/partition/test_email.py \
		test_$(PACKAGE_NAME)/partition/test_html_partition.py \
		test_$(PACKAGE_NAME)/partition/test_xml_partition.py
# Per-extra test targets. Each one runs pytest on a single directory under
# test_$(PACKAGE_NAME)/partition/.

.PHONY: test-extra-csv
test-extra-csv:
	PYTHONPATH=. CI=$(CI) pytest test_$(PACKAGE_NAME)/partition/csv

.PHONY: test-extra-docx
test-extra-docx:
	PYTHONPATH=. CI=$(CI) pytest test_$(PACKAGE_NAME)/partition/docx

.PHONY: test-extra-markdown
test-extra-markdown:
	PYTHONPATH=. CI=$(CI) pytest test_$(PACKAGE_NAME)/partition/markdown

.PHONY: test-extra-msg
test-extra-msg:
	PYTHONPATH=. CI=$(CI) pytest test_$(PACKAGE_NAME)/partition/msg

.PHONY: test-extra-odt
test-extra-odt:
	PYTHONPATH=. CI=$(CI) pytest test_$(PACKAGE_NAME)/partition/odt

.PHONY: test-extra-pdf-image
test-extra-pdf-image:
	PYTHONPATH=. CI=$(CI) pytest test_$(PACKAGE_NAME)/partition/pdf-image

.PHONY: test-extra-pptx
test-extra-pptx:
	PYTHONPATH=. CI=$(CI) pytest test_$(PACKAGE_NAME)/partition/pptx

.PHONY: test-extra-epub
test-extra-epub:
	PYTHONPATH=. CI=$(CI) pytest test_$(PACKAGE_NAME)/partition/epub

.PHONY: test-extra-pypandoc
test-extra-pypandoc:
	PYTHONPATH=. CI=$(CI) pytest test_$(PACKAGE_NAME)/partition/pypandoc

.PHONY: test-extra-xlsx
test-extra-xlsx:
	PYTHONPATH=. CI=$(CI) pytest test_$(PACKAGE_NAME)/partition/xlsx
# Aggregate lint target: source checks, test-code checks and the version check.
## check: runs linters (includes tests)
.PHONY: check
check: \
    check-src \
    check-tests \
    check-version
# Runs ruff, black (check mode), flake8 and mypy over the package sources.
# Each tool is its own recipe line, so the first failure stops the target.
## check-src: runs linters (source only, no tests)
.PHONY: check-src
check-src:
	ruff . \
		--select I,UP015,UP032,UP034,UP018,COM,C4,PT,SIM,PLR0402 \
		--ignore COM812,PT011,PT012,SIM117
	black --line-length 100 $(PACKAGE_NAME) --check
	flake8 $(PACKAGE_NAME)
	mypy $(PACKAGE_NAME) --ignore-missing-imports --check-untyped-defs
# check-tests: black (check mode) and flake8 over both test directories.
.PHONY: check-tests
check-tests:
	black --line-length 100 test_$(PACKAGE_NAME) --check
	black --line-length 100 test_$(PACKAGE_NAME)_ingest --check
	flake8 test_$(PACKAGE_NAME)
	flake8 test_$(PACKAGE_NAME)_ingest
## check-scripts: run shellcheck
.PHONY: check-scripts
check-scripts:
	# Fail if any of these files have warnings
	scripts/shellcheck.sh

## check-version: run check to ensure version in CHANGELOG.md matches version in package
.PHONY: check-version
check-version:
	# Fail if syncing version would produce changes
	scripts/version-sync.sh -c -f "unstructured/__version__.py" semver
# Auto-formats the package and both test directories. The help line used to
# say "run black" only, although the recipe applies ruff autofixes first.
# The `|| true` on the ruff step is kept from the original: a non-zero ruff
# exit does not stop black from running.
## tidy: run ruff autofixes and black
.PHONY: tidy
tidy:
	ruff . --select I,UP015,UP032,UP034,UP018,COM,C4,PT,SIM,PLR0402 --fix-only || true
	black --line-length 100 $(PACKAGE_NAME)
	black --line-length 100 test_$(PACKAGE_NAME)
	black --line-length 100 test_$(PACKAGE_NAME)_ingest
# Same script as check-version, but without the -c flag.
## version-sync: update __version__.py with most recent version from CHANGELOG.md
.PHONY: version-sync
version-sync:
	scripts/version-sync.sh -f "unstructured/__version__.py" semver
# check-coverage: fail when the reported coverage is below 95 percent.
.PHONY: check-coverage
check-coverage:
	coverage report --fail-under=95

## check-deps: check consistency of dependencies
.PHONY: check-deps
check-deps:
	scripts/consistent-deps.sh
##########
# Docker #
##########
# Docker targets are provided for convenience only and are not required in a standard development environment
# Image name used by the docker-* targets. `?=` keeps any value already
# supplied by the caller.
DOCKER_IMAGE ?= unstructured:dev

# docker-build: run the build script with the pinned pip version and the
# image name passed in through its environment.
.PHONY: docker-build
docker-build:
	PIP_VERSION=$(PIP_VERSION) DOCKER_IMAGE_NAME=$(DOCKER_IMAGE) ./scripts/docker-build.sh
# docker-start-bash: start an interactive, throw-away container from the image.
.PHONY: docker-start-bash
docker-start-bash:
	docker run -ti --rm $(DOCKER_IMAGE)

# docker-start-dev: same, with the working directory bind-mounted into the
# container.
# NOTE(review): "local_unstructued" looks like a misspelling of
# "local_unstructured"; it is left unchanged because the in-container path may
# be relied on elsewhere - confirm before renaming.
.PHONY: docker-start-dev
docker-start-dev:
	docker run --rm \
		-v $(CURRENT_DIR):/mnt/local_unstructued \
		-ti $(DOCKER_IMAGE)
# docker-test: run pytest on test_unstructured inside a throw-away container.
#  - Both test directories are bind-mounted under /home/notebook-user.
#  - `--env-file uns_test_env_file` is added only when that file exists in the
#    working directory.
#  - Setting TEST_NAME adds `-k $(TEST_NAME)` to the pytest command line.
.PHONY: docker-test
docker-test:
	docker run --rm \
		-v $(CURRENT_DIR)/test_unstructured:/home/notebook-user/test_unstructured \
		-v $(CURRENT_DIR)/test_unstructured_ingest:/home/notebook-user/test_unstructured_ingest \
		$(if $(wildcard uns_test_env_file),--env-file uns_test_env_file,) \
		$(DOCKER_IMAGE) \
		bash -c "CI=$(CI) pytest $(if $(TEST_NAME),-k $(TEST_NAME),) test_unstructured"
# docker-smoke-test: run the smoke-test script with the image name passed in
# through its environment.
.PHONY: docker-smoke-test
docker-smoke-test:
	DOCKER_IMAGE=$(DOCKER_IMAGE) ./scripts/docker-smoke-test.sh
###########
# Jupyter #
###########
# NOTE(review): both targets pass an empty NotebookApp token and password,
# which presumably disables notebook authentication, and the docker variant
# listens on 0.0.0.0 - confirm these are only used on trusted machines.

# docker-jupyter-notebook: run jupyter-notebook from the image on port 8888,
# with the working directory bind-mounted at /home.
.PHONY: docker-jupyter-notebook
docker-jupyter-notebook:
	docker run -p 8888:8888 \
		--mount type=bind,source=$(realpath .),target=/home \
		--entrypoint jupyter-notebook \
		-t --rm $(DOCKER_IMAGE) \
		--allow-root --port 8888 --ip 0.0.0.0 \
		--NotebookApp.token='' --NotebookApp.password=''

# run-jupyter: run jupyter-notebook on the host, with PYTHONPATH and
# JUPYTER_PATH set to the working directory.
.PHONY: run-jupyter
run-jupyter:
	PYTHONPATH=$(realpath .) JUPYTER_PATH=$(realpath .) \
		jupyter-notebook --NotebookApp.token='' --NotebookApp.password=''