mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2026-01-08 05:10:11 +00:00
**Summary** In preparation for pluggable auto-partitioners, simplify metadata as discussed. **Additional Context** - Pluggable auto-partitioners require partitioners to have a consistent call signature. An arbitrary partitioner provided at runtime needs to have a call signature that is known and consistent. Basically `partition_x(filename, *, file, **kwargs)`. - The current `auto.partition()` is highly coupled to each distinct file-type partitioner, deciding which arguments to forward to each. - This is driven by the existence of "delegating" partitioners, those that convert their file-type and then call a second partitioner to do the actual partitioning. Both the delegating and proxy partitioners are decorated with metadata-post-processing decorators and those decorators are not idempotent. We call the situation where those decorators would run twice "double-decorating". For example, EPUB converts to HTML and calls `partition_html()` and both `partition_epub()` and `partition_html()` are decorated. - The way double-decorating has been avoided in the past is to avoid sending the arguments the metadata decorators are sensitive to to the proxy partitioner. This is very obscure, complex to reason about, error-prone, and just overall not a viable strategy. The better solution is to not decorate delegating partitioners and let the proxy partitioner handle all the metadata. - This first step in preparation for that is part of simplifying the metadata processing by removing unused or unwanted legacy parameters. - `date_from_file_object` is a misnomer because a file-object never contains last-modified data. - It can never produce useful results in the API where last-modified information must be provided by `metadata_last_modified`. - It is an undocumented parameter so not in use. - Using it can produce incorrect metadata.
499 lines
15 KiB
Makefile
499 lines
15 KiB
Makefile
# Name of the Python package; recipes also derive the test directory
# (test_<name>) and the coverage target from it.
PACKAGE_NAME := unstructured
# pip release pinned by the install recipes and handed to the Docker build script.
PIP_VERSION := 23.2.1
# Absolute path of the directory make runs in. Uses make's built-in CURDIR
# rather than forking a shell for `pwd` every time the Makefile is parsed.
CURRENT_DIR := $(CURDIR)
# Machine hardware name as printed by `uname -m`; passed to scripts/install-pandoc.sh.
ARCH := $(shell uname -m)
# Python interpreter used by the pip/pytest recipes; override with `make PYTHON=...`.
PYTHON ?= python3
|
|
|
|
# Self-documentation: every Makefile line that begins with a double hash, a
# space and a letter is printed with that leading marker stripped.
.PHONY: help
help: Makefile
	@sed -n 's/^## \([a-zA-Z]\)/\1/p' $<
|
|
|
|
|
|
###########
|
|
# Install #
|
|
###########
|
|
|
|
## install-base: installs core requirements needed for text processing bricks
.PHONY: install-base
# Aggregate target: base pip requirements first, then the NLTK data download.
install-base: \
    install-base-pip-packages \
    install-nltk-models
|
|
|
|
## install: installs all test, dev, and experimental requirements
.PHONY: install
# Aggregate target with no recipe of its own; prerequisites are listed in the
# order they run in a serial build.
install: \
    install-base-pip-packages \
    install-dev \
    install-nltk-models \
    install-test \
    install-huggingface \
    install-all-docs
|
|
|
|
# CI environment with every document extra, plus the pandoc binary.
.PHONY: install-ci
install-ci: \
    install-base-pip-packages \
    install-nltk-models \
    install-huggingface \
    install-all-docs \
    install-test \
    install-pandoc

# CI environment without the huggingface and document-extra requirements.
.PHONY: install-base-ci
install-base-ci: \
    install-base-pip-packages \
    install-nltk-models \
    install-test \
    install-pandoc
|
|
|
|
# Pin pip to PIP_VERSION, then install the base requirements file.
.PHONY: install-base-pip-packages
install-base-pip-packages:
	$(PYTHON) -m pip install pip==$(PIP_VERSION)
	$(PYTHON) -m pip install -r requirements/base.txt
|
|
|
|
# Pin pip to PIP_VERSION, then install the huggingface requirements file.
.PHONY: install-huggingface
install-huggingface:
	$(PYTHON) -m pip install pip==$(PIP_VERSION)
	$(PYTHON) -m pip install -r requirements/huggingface.txt
|
|
|
|
# Calls download_nltk_packages() from unstructured.nlp.tokenize, so the
# package must already be importable by $(PYTHON).
.PHONY: install-nltk-models
install-nltk-models:
	$(PYTHON) -c "from unstructured.nlp.tokenize import download_nltk_packages; download_nltk_packages()"
|
|
|
|
# Test requirements, plus two packages that are installed outside of
# requirements/test.txt (reasons noted next to each command).
.PHONY: install-test
install-test:
	$(PYTHON) -m pip install -r requirements/test.txt
# NOTE(yao) - CI seems to always install tesseract to test, so it makes sense to also
# require the pytesseract installation in the virtual env used for testing
	$(PYTHON) -m pip install unstructured_pytesseract
# ${PYTHON} -m pip install argilla==1.28.0 -c requirements/deps/constraints.txt
# NOTE(robinson) - weaviate-client is installed separately here because its requests
# version conflicts with label_studio_sdk
	$(PYTHON) -m pip install weaviate-client -c requirements/deps/constraints.txt
|
|
|
|
# Each target below installs exactly one requirements file with pip.

# -- developer and build tooling --
.PHONY: install-dev
install-dev:
	$(PYTHON) -m pip install -r requirements/dev.txt

.PHONY: install-build
install-build:
	$(PYTHON) -m pip install -r requirements/build.txt

# -- per-document-type extras --
.PHONY: install-csv
install-csv:
	$(PYTHON) -m pip install -r requirements/extra-csv.txt

.PHONY: install-docx
install-docx:
	$(PYTHON) -m pip install -r requirements/extra-docx.txt

.PHONY: install-epub
install-epub:
	$(PYTHON) -m pip install -r requirements/extra-epub.txt

.PHONY: install-odt
install-odt:
	$(PYTHON) -m pip install -r requirements/extra-odt.txt

# The target is named "pypandoc" but the requirements file is extra-pandoc.txt.
.PHONY: install-pypandoc
install-pypandoc:
	$(PYTHON) -m pip install -r requirements/extra-pandoc.txt

.PHONY: install-markdown
install-markdown:
	$(PYTHON) -m pip install -r requirements/extra-markdown.txt

.PHONY: install-pdf-image
install-pdf-image:
	$(PYTHON) -m pip install -r requirements/extra-pdf-image.txt

.PHONY: install-pptx
install-pptx:
	$(PYTHON) -m pip install -r requirements/extra-pptx.txt

.PHONY: install-xlsx
install-xlsx:
	$(PYTHON) -m pip install -r requirements/extra-xlsx.txt
|
|
|
|
# Aggregate target: the base install followed by every document-type extra.
.PHONY: install-all-docs
install-all-docs: \
    install-base \
    install-csv \
    install-docx \
    install-epub \
    install-odt \
    install-pypandoc \
    install-markdown \
    install-pdf-image \
    install-pptx \
    install-xlsx
|
|
|
|
# Run one `pip install -r` per *.txt file found anywhere under requirements/ingest.
.PHONY: install-all-ingest
install-all-ingest:
	find requirements/ingest -type f -name "*.txt" -exec $(PYTHON) -m pip install -r '{}' ';'
|
|
|
|
|
|
# Connectors whose pip requirements live at requirements/ingest/<name>.txt.
# Every name listed here gets a phony `install-ingest-<name>` target from the
# single rule below, so adding a connector is a one-word change.
ingest_connectors := \
    google-drive \
    s3 \
    gcs \
    dropbox \
    azure \
    box \
    delta-table \
    discord \
    github \
    biomed \
    gitlab \
    onedrive \
    outlook \
    reddit \
    slack \
    kafka \
    wikipedia \
    elasticsearch \
    opensearch \
    confluence \
    airtable \
    sharepoint \
    singlestore \
    weaviate \
    notion \
    salesforce \
    jira \
    hubspot \
    sftp \
    pinecone \
    qdrant \
    chroma \
    postgres \
    mongodb \
    databricks-volumes \
    astradb \
    clarifai

ingest_install_targets := $(addprefix install-ingest-,$(ingest_connectors))

## install-ingest-s3: install requirements for the s3 connector
.PHONY: $(ingest_install_targets)
# One rule shared by all listed targets; the connector name is recovered from
# the target name ($@) by stripping the "install-ingest-" prefix.
$(ingest_install_targets):
	${PYTHON} -m pip install -r requirements/ingest/$(patsubst install-ingest-%,%,$@).txt

# The local connector has no requirements file, so it keeps its own recipe.
.PHONY: install-ingest-local
install-ingest-local:
	echo "no unique dependencies for local connector"
|
|
|
|
# Requirements for the huggingface embedder (kept under requirements/ingest).
.PHONY: install-embed-huggingface
install-embed-huggingface:
	$(PYTHON) -m pip install -r requirements/ingest/embed-huggingface.txt

# Installs requirements/ingest/local-inference.txt; note the target name and
# the file name differ.
.PHONY: install-unstructured-inference
install-unstructured-inference:
	$(PYTHON) -m pip install -r requirements/ingest/local-inference.txt
|
|
|
|
## install-local-inference: installs requirements for local inference
.PHONY: install-local-inference
# Aggregate target with no recipe: the full install plus all document extras.
install-local-inference: \
    install \
    install-all-docs
|
|
|
|
# Run the pandoc install script with the detected machine architecture in
# its environment.
.PHONY: install-pandoc
install-pandoc:
	ARCH=$(ARCH) ./scripts/install-pandoc.sh
|
|
|
|
## pip-compile: compiles all base/dev/test requirements
.PHONY: pip-compile
# Delegates to the helper script; the command line itself is not echoed (@).
pip-compile:
	@scripts/pip-compile.sh
|
|
|
|
## install-project-local: install unstructured into your local python environment
.PHONY: install-project-local
# Runs the full `install` target first, then installs this checkout in
# editable mode.
# MAYBE TODO: fail if already exists?
install-project-local: install
	$(PYTHON) -m pip install -e .
|
|
|
|
## uninstall-project-local: uninstall unstructured from your local python environment
.PHONY: uninstall-project-local
# No -y flag is passed, so pip asks for confirmation before removing the package.
uninstall-project-local:
	$(PYTHON) -m pip uninstall $(PACKAGE_NAME)
|
|
|
|
#################
|
|
# Test and Lint #
|
|
#################
|
|
|
|
# Both settings default to "false" unless already set (environment or command
# line) and are exported into the environment of every recipe.
CI ?= false
UNSTRUCTURED_INCLUDE_DEBUG_METADATA ?= false
export CI UNSTRUCTURED_INCLUDE_DEBUG_METADATA
|
|
|
|
## test: runs all unittests
.PHONY: test
# Runs pytest over test_<package> with the repo root on PYTHONPATH, deselecting
# tests marked "chipper" and printing a coverage report with missing lines.
test:
	PYTHONPATH=. \
	CI=$(CI) \
	UNSTRUCTURED_INCLUDE_DEBUG_METADATA=$(UNSTRUCTURED_INCLUDE_DEBUG_METADATA) \
	$(PYTHON) -m pytest test_$(PACKAGE_NAME) \
	  -m "not chipper" \
	  --cov=$(PACKAGE_NAME) \
	  --cov-report term-missing \
	  --durations=40
|
|
|
|
# Same invocation as `test`, but selects only the tests marked "chipper".
.PHONY: test-chipper
test-chipper:
	PYTHONPATH=. \
	CI=$(CI) \
	UNSTRUCTURED_INCLUDE_DEBUG_METADATA=$(UNSTRUCTURED_INCLUDE_DEBUG_METADATA) \
	$(PYTHON) -m pytest test_$(PACKAGE_NAME) \
	  -m "chipper" \
	  --cov=$(PACKAGE_NAME) \
	  --cov-report term-missing \
	  --durations=40
|
|
|
|
# Delegates entirely to the helper script.
.PHONY: test-unstructured-api-unit
test-unstructured-api-unit:
	scripts/test-unstructured-api-unit.sh
|
|
|
|
# Runs only the text, email, html and xml partition test modules.
.PHONY: test-no-extras
test-no-extras:
	PYTHONPATH=. \
	CI=$(CI) \
	UNSTRUCTURED_INCLUDE_DEBUG_METADATA=$(UNSTRUCTURED_INCLUDE_DEBUG_METADATA) \
	$(PYTHON) -m pytest \
	  test_$(PACKAGE_NAME)/partition/test_text.py \
	  test_$(PACKAGE_NAME)/partition/test_email.py \
	  test_$(PACKAGE_NAME)/partition/html/test_partition.py \
	  test_$(PACKAGE_NAME)/partition/test_xml.py
|
|
|
|
# Per-extra test targets: each runs pytest on the partition test modules of one
# document-type extra, with the repo root on PYTHONPATH.

.PHONY: test-extra-csv
test-extra-csv:
	PYTHONPATH=. CI=$(CI) $(PYTHON) -m pytest \
	  test_unstructured/partition/test_csv.py \
	  test_unstructured/partition/test_tsv.py

.PHONY: test-extra-docx
test-extra-docx:
	PYTHONPATH=. CI=$(CI) $(PYTHON) -m pytest \
	  test_unstructured/partition/test_doc.py \
	  test_unstructured/partition/test_docx.py

.PHONY: test-extra-epub
test-extra-epub:
	PYTHONPATH=. CI=$(CI) $(PYTHON) -m pytest test_unstructured/partition/test_epub.py

.PHONY: test-extra-markdown
test-extra-markdown:
	PYTHONPATH=. CI=$(CI) $(PYTHON) -m pytest test_unstructured/partition/test_md.py

.PHONY: test-extra-odt
test-extra-odt:
	PYTHONPATH=. CI=$(CI) $(PYTHON) -m pytest test_unstructured/partition/test_odt.py

# Runs the whole pdf_image test directory rather than a single module.
.PHONY: test-extra-pdf-image
test-extra-pdf-image:
	PYTHONPATH=. CI=$(CI) $(PYTHON) -m pytest test_unstructured/partition/pdf_image

.PHONY: test-extra-pptx
test-extra-pptx:
	PYTHONPATH=. CI=$(CI) $(PYTHON) -m pytest \
	  test_unstructured/partition/test_ppt.py \
	  test_unstructured/partition/test_pptx.py

.PHONY: test-extra-pypandoc
test-extra-pypandoc:
	PYTHONPATH=. CI=$(CI) $(PYTHON) -m pytest \
	  test_unstructured/partition/test_org.py \
	  test_unstructured/partition/test_rst.py \
	  test_unstructured/partition/test_rtf.py

.PHONY: test-extra-xlsx
test-extra-xlsx:
	PYTHONPATH=. CI=$(CI) $(PYTHON) -m pytest test_unstructured/partition/test_xlsx.py
|
|
|
|
## check: runs linters (includes tests)
.PHONY: check
# Aggregate target with no recipe; prerequisites are listed in the order they
# run in a serial build.
check: \
    check-ruff \
    check-black \
    check-flake8 \
    check-version \
    check-flake8-print
|
|
|
|
# Shell formatting check: 2-space indent, print a diff instead of rewriting.
.PHONY: check-shfmt
check-shfmt:
	shfmt -i 2 -d .

# Python formatting check (no files are modified), 100-column line length.
.PHONY: check-black
check-black:
	$(PYTHON) -m black . --check --line-length=100

# flake8 over the whole tree.
.PHONY: check-flake8
check-flake8:
	$(PYTHON) -m flake8 .

# Delegates to the helper script; the command line itself is not echoed (@).
.PHONY: check-licenses
check-licenses:
	@scripts/check-licenses.sh
|
|
|
|
# Check for print statements in ingest since anything going to console should be using the
# ingest logger, as it has a built in filter to redact sensitive information.
# The empty --per-file-ignores value replaces any configured per-file ignores for this run.
# flake8 is invoked through $(PYTHON), as in check-flake8, so both targets use
# the same interpreter when PYTHON is overridden.
.PHONY: check-flake8-print
check-flake8-print:
	${PYTHON} -m flake8 --per-file-ignores "" ./unstructured/ingest
|
|
|
|
# Lint the whole tree with ruff.
# -- ruff options are determined by pyproject.toml --
.PHONY: check-ruff
check-ruff:
	ruff check .
|
|
|
|
# Show what autoflake would change without modifying any file.
.PHONY: check-autoflake
check-autoflake:
	autoflake --check-diff .
|
|
|
|
## check-scripts: run shellcheck
.PHONY: check-scripts
# Fail if any of these files have warnings
check-scripts:
	scripts/shellcheck.sh
|
|
|
|
## check-version: run check to ensure version in CHANGELOG.md matches version in package
.PHONY: check-version
# Fail if syncing version would produce changes
check-version:
	scripts/version-sync.sh -c -f "unstructured/__version__.py" semver
|
|
|
|
## tidy: auto-fix and format python code (ruff fixes, autoflake, black)
.PHONY: tidy
# Aggregate target with no recipe; all work is done by tidy-python.
tidy: tidy-python
|
|
|
|
# Reformat shell scripts in place with shfmt: 2-space indent (-i 2), list the
# files that were changed (-l) and write the result back (-w).
# The .PHONY name must match the target name exactly (hyphen, not underscore);
# otherwise a file named "tidy-shell" would stop this recipe from running.
.PHONY: tidy-shell
tidy-shell:
	shfmt -i 2 -l -w .
|
|
|
|
# Apply automatic fixes and formatting to the Python sources, in this order:
# ruff autofixes, autoflake, then black (100-column line length).
.PHONY: tidy-python
tidy-python:
# Use the explicit `check` subcommand, as check-ruff does: newer ruff releases
# reject the bare `ruff <path>` form, and `|| true` would hide that failure so
# no fixes would be applied. `|| true` is kept so leftover issues stay best-effort.
	ruff check . --fix-only || true
	autoflake --in-place .
	black --line-length=100 .
|
|
|
|
## version-sync: update __version__.py with most recent version from CHANGELOG.md
.PHONY: version-sync
# Same script and arguments as check-version, but without the -c flag.
version-sync:
	scripts/version-sync.sh -f "unstructured/__version__.py" semver
|
|
|
|
# Print the coverage report and fail when total coverage is below 90%.
.PHONY: check-coverage
check-coverage:
	$(PYTHON) -m coverage report --fail-under=90
|
|
|
|
## check-deps: check consistency of dependencies
.PHONY: check-deps
# Delegates entirely to the helper script.
check-deps:
	scripts/consistent-deps.sh

# Delegates entirely to the helper script.
.PHONY: check-extras
check-extras:
	scripts/check-extras.sh
|
|
|
|
##########
|
|
# Docker #
|
|
##########
|
|
|
|
# The Docker targets are a convenience only; a standard development
# environment does not need them.

# Image name (and tag) used by every docker-* target; override with
# `make DOCKER_IMAGE=...` or from the environment.
DOCKER_IMAGE ?= unstructured:dev
|
|
|
|
# Build the image via the helper script, passing the pinned pip version and
# the image name through its environment.
.PHONY: docker-build
docker-build:
	PIP_VERSION=$(PIP_VERSION) DOCKER_IMAGE_NAME=$(DOCKER_IMAGE) ./scripts/docker-build.sh
|
|
|
|
# Start an interactive, throw-away (--rm) container from the image.
.PHONY: docker-start-bash
docker-start-bash:
	docker run -ti --rm $(DOCKER_IMAGE)
|
|
|
|
# Like docker-start-bash, but with the working directory bind-mounted into the
# container.
# NOTE(review): the mount point is spelled "local_unstructued"; it looks like a
# typo but is kept as-is because the container-side path may be relied upon.
.PHONY: docker-start-dev
docker-start-dev:
	docker run --rm -v $(CURRENT_DIR):/mnt/local_unstructued -ti $(DOCKER_IMAGE)
|
|
|
|
# Run pytest inside a throw-away container with the two local test directories
# bind-mounted under /home/notebook-user.
# - --env-file is added only when a file named uns_test_env_file exists here.
# - TEST_FILE, when set, replaces the default test path (test_unstructured).
# The continuation lines inside the double-quoted bash -c string carry no extra
# indentation, so the command string passed to bash stays unchanged.
.PHONY: docker-test
docker-test:
	docker run --rm \
	  -v $(CURRENT_DIR)/test_unstructured:/home/notebook-user/test_unstructured \
	  -v $(CURRENT_DIR)/test_unstructured_ingest:/home/notebook-user/test_unstructured_ingest \
	  $(if $(wildcard uns_test_env_file),--env-file uns_test_env_file,) \
	  $(DOCKER_IMAGE) \
	  bash -c "CI=$(CI) \
	UNSTRUCTURED_INCLUDE_DEBUG_METADATA=$(UNSTRUCTURED_INCLUDE_DEBUG_METADATA) \
	pytest -m 'not chipper' $(if $(TEST_FILE),$(TEST_FILE),test_unstructured)"
|
|
|
|
# Run the smoke-test script against the configured image.
.PHONY: docker-smoke-test
docker-smoke-test:
	DOCKER_IMAGE=$(DOCKER_IMAGE) ./scripts/docker-smoke-test.sh
|
|
|
|
|
|
###########
|
|
# Jupyter #
|
|
###########
|
|
|
|
# Serve Jupyter from a throw-away container on port 8888, with the repo root
# bind-mounted at /home.
# NOTE: the notebook token and password are both set to empty strings, so the
# server starts without authentication.
.PHONY: docker-jupyter-notebook
docker-jupyter-notebook:
	docker run -p 8888:8888 \
	  --mount type=bind,source=$(realpath .),target=/home \
	  --entrypoint jupyter-notebook \
	  -t --rm $(DOCKER_IMAGE) \
	  --allow-root --port 8888 --ip 0.0.0.0 \
	  --NotebookApp.token='' --NotebookApp.password=''
|
|
|
|
|
|
# Start a local Jupyter notebook server with the repo root on PYTHONPATH and
# JUPYTER_PATH; token and password are set to empty strings.
.PHONY: run-jupyter
run-jupyter:
	PYTHONPATH=$(realpath .) JUPYTER_PATH=$(realpath .) \
	  jupyter-notebook --NotebookApp.token='' --NotebookApp.password=''
|