2022-06-29 14:35:19 -04:00
|
|
|
# Name of the Python package; recipes in this file use it for test paths,
# coverage and uninstall.
PACKAGE_NAME := unstructured
# pip release that the install recipes pin before installing requirements.
PIP_VERSION := 23.2.1
# Absolute path of the directory make is running in. $(CURDIR) is maintained
# by GNU make itself, so no `pwd` subshell has to be forked at parse time.
CURRENT_DIR := $(CURDIR)
# Machine architecture as reported by `uname -m`.
ARCH := $(shell uname -m)
|
2022-06-29 14:35:19 -04:00
|
|
|
|
|
|
|
# Print every line of the Makefile that starts with "## " followed by a
# letter, with the "## " marker stripped.
.PHONY: help
help: Makefile
	@sed -n 's/^\(## \)\([a-zA-Z]\)/\2/p' $<
|
|
|
|
|
|
|
|
|
|
|
|
###########
|
|
|
|
# Install #
|
|
|
|
###########
|
|
|
|
|
|
|
|
## install-base: installs core requirements needed for text processing bricks
# Aggregate target: it has no recipe of its own, only prerequisites.
.PHONY: install-base
install-base: \
  install-base-pip-packages \
  install-nltk-models
|
|
|
|
|
|
|
|
## install: installs all test, dev, and experimental requirements
# Aggregate target: it has no recipe of its own, only prerequisites.
.PHONY: install
install: \
  install-base-pip-packages \
  install-dev \
  install-nltk-models \
  install-test \
  install-huggingface \
  install-all-docs
|
2022-06-29 14:35:19 -04:00
|
|
|
|
|
|
|
.PHONY: install-ci
|
fix: Install pandoc consistently, via Makefile recipe (version that supports .rtf files as input format) (#2593)
## Problem Description
In some cases you might find yourselves in a situation when pandoc won't
be able to process an `rtf` as input file format, because older versions
simply do not support that.
```
RuntimeError: Invalid input format! Got "rtf" but expected one of these: commonmark, creole, csv, docbook, docx, dokuwiki, epub, fb2, gfm, haddock, html, ipynb, jats, jira, json, latex, man, markdown, markdown_github, markdown_mmd, markdown_phpextra, markdown_strict, mediawiki, muse, native, odt, opml, org, rst, t2t, textile, tikiwiki, twiki, vimwiki
```
Basically, some users may install the wrong version. The `README.md` is
not precise enough when describing RTF file support:
https://github.com/Unstructured-IO/unstructured/blob/47b35ccdd61ffbc376c86e9bb08a2039b042cc2b/README.md?plain=1#L120-L122
## Example
Installing `pandoc` from a [stable repository, like
Debian](https://packages.debian.org/source/bullseye/pandoc) will give
you `2.9` and the official documentation shows clearly that support for
rtf was introduced in `2.14`
https://pandoc.org/releases.html#pandoc-2.14.2-2021-08-21

### Note that `rtf` is not there

### More detail

## Proposed Solution
- [x] I've simply added/copied `make install-pandoc` calls, mimicking
other recipes in order to ensure that `3.1.2` will be installed in all
cases. **Side note**: `make install-pandoc` calls
`./scripts/install-pandoc.sh` under the hood.
- [x] Update README file - mention that `make install-pandoc` is
recommended (`>=2.14.2`)
- [x] Verify tests that cover `rtf` cases:
https://github.com/Unstructured-IO/unstructured/blob/47b35ccdd61ffbc376c86e9bb08a2039b042cc2b/test_unstructured/file_utils/test_file_conversion.py#L14
- [x] Update `setup_ubuntu.sh` if needed?:
https://github.com/Unstructured-IO/unstructured/blob/47b35ccdd61ffbc376c86e9bb08a2039b042cc2b/scripts/setup_ubuntu.sh#L87
-
2024-03-04 12:02:32 +01:00
|
|
|
# Aggregate target for CI: no recipe of its own, only prerequisites.
install-ci: \
  install-base-pip-packages \
  install-nltk-models \
  install-huggingface \
  install-all-docs \
  install-test \
  install-pandoc
|
2022-06-29 14:35:19 -04:00
|
|
|
|
2023-08-19 12:56:13 -04:00
|
|
|
.PHONY: install-base-ci
|
fix: Install pandoc consistently, via Makefile recipe (version that supports .rtf files as input format) (#2593)
## Problem Description
In some cases you might find yourselves in a situation when pandoc won't
be able to process an `rtf` as input file format, because older versions
simply do not support that.
```
RuntimeError: Invalid input format! Got "rtf" but expected one of these: commonmark, creole, csv, docbook, docx, dokuwiki, epub, fb2, gfm, haddock, html, ipynb, jats, jira, json, latex, man, markdown, markdown_github, markdown_mmd, markdown_phpextra, markdown_strict, mediawiki, muse, native, odt, opml, org, rst, t2t, textile, tikiwiki, twiki, vimwiki
```
Basically, some users may install the wrong version. The `README.md` is
not precise enough when describing RTF file support:
https://github.com/Unstructured-IO/unstructured/blob/47b35ccdd61ffbc376c86e9bb08a2039b042cc2b/README.md?plain=1#L120-L122
## Example
Installing `pandoc` from a [stable repository, like
Debian](https://packages.debian.org/source/bullseye/pandoc) will give
you `2.9` and the official documentation shows clearly that support for
rtf was introduced in `2.14`
https://pandoc.org/releases.html#pandoc-2.14.2-2021-08-21

### Note that `rtf` is not there

### More detail

## Proposed Solution
- [x] I've simply added/copied `make install-pandoc` calls, mimicking
other recipes in order to ensure that `3.1.2` will be installed in all
cases. **Side note**: `make install-pandoc` calls
`./scripts/install-pandoc.sh` under the hood.
- [x] Update README file - mention that `make install-pandoc` is
recommended (`>=2.14.2`)
- [x] Verify tests that cover `rtf` cases:
https://github.com/Unstructured-IO/unstructured/blob/47b35ccdd61ffbc376c86e9bb08a2039b042cc2b/test_unstructured/file_utils/test_file_conversion.py#L14
- [x] Update `setup_ubuntu.sh` if needed?:
https://github.com/Unstructured-IO/unstructured/blob/47b35ccdd61ffbc376c86e9bb08a2039b042cc2b/scripts/setup_ubuntu.sh#L87
-
2024-03-04 12:02:32 +01:00
|
|
|
# Aggregate target for the base CI job: no recipe of its own, only prerequisites.
install-base-ci: \
  install-base-pip-packages \
  install-nltk-models \
  install-test \
  install-pandoc
|
2023-08-19 12:56:13 -04:00
|
|
|
|
2022-06-29 14:35:19 -04:00
|
|
|
# Pin pip to PIP_VERSION, then install the packages listed in requirements/base.txt.
.PHONY: install-base-pip-packages
install-base-pip-packages:
	python3 -m pip install pip==$(PIP_VERSION)
	python3 -m pip install --requirement requirements/base.txt
|
2022-06-29 14:35:19 -04:00
|
|
|
|
2022-10-13 11:18:27 -04:00
|
|
|
# Pin pip to PIP_VERSION, then install the packages listed in requirements/huggingface.txt.
.PHONY: install-huggingface
install-huggingface:
	python3 -m pip install pip==$(PIP_VERSION)
	python3 -m pip install --requirement requirements/huggingface.txt
|
2022-10-13 11:18:27 -04:00
|
|
|
|
2024-01-30 12:12:35 -06:00
|
|
|
# Download the NLTK data sets `punkt` and `averaged_perceptron_tagger`.
# Invoked through python3, the same interpreter name the pip recipes in this
# file use, so the download targets the environment nltk was installed into.
.PHONY: install-nltk-models
install-nltk-models:
	python3 -c "import nltk; nltk.download('punkt')"
	python3 -c "import nltk; nltk.download('averaged_perceptron_tagger')"
|
|
|
|
|
|
|
|
# Install the test requirements plus a few packages that are installed
# individually (see the notes inside the recipe).
# NOTE(review): argilla is installed twice below, once with the constraints
# file and once without; confirm whether the unconstrained call is still needed.
.PHONY: install-test
install-test:
	python3 -m pip install --requirement requirements/test.txt
	# NOTE(yao) - CI seem to always install tesseract to test so it would make sense to also require
	# pytesseract installation into the virtual env for testing
	python3 -m pip install unstructured.pytesseract --constraint requirements/deps/constraints.txt
	python3 -m pip install argilla --constraint requirements/deps/constraints.txt
	# NOTE(robinson) - Installing weaviate-client separately here because the requests
	# version conflicts with label_studio_sdk
	python3 -m pip install weaviate-client --constraint requirements/deps/constraints.txt
	# TODO (yao): find out if how to constrain argilla properly without causing conflicts
	python3 -m pip install argilla
|
2022-06-29 14:35:19 -04:00
|
|
|
|
|
|
|
# Install the packages listed in requirements/dev.txt.
.PHONY: install-dev
install-dev:
	python3 -m pip install --requirement requirements/dev.txt
|
2022-06-29 14:35:19 -04:00
|
|
|
|
|
|
|
# Install the packages listed in requirements/build.txt.
.PHONY: install-build
install-build:
	python3 -m pip install --requirement requirements/build.txt
|
2022-06-29 14:35:19 -04:00
|
|
|
|
2023-08-01 11:31:13 -04:00
|
|
|
# Install the packages listed in requirements/extra-csv.txt.
.PHONY: install-csv
install-csv:
	python3 -m pip install --requirement requirements/extra-csv.txt
|
|
|
|
|
|
|
|
# Install the packages listed in requirements/extra-docx.txt.
.PHONY: install-docx
install-docx:
	python3 -m pip install --requirement requirements/extra-docx.txt
|
|
|
|
|
2023-08-12 16:02:06 -05:00
|
|
|
# Install the packages listed in requirements/extra-epub.txt.
.PHONY: install-epub
install-epub:
	python3 -m pip install --requirement requirements/extra-epub.txt
|
|
|
|
|
2023-08-01 11:31:13 -04:00
|
|
|
# Install the packages listed in requirements/extra-odt.txt.
.PHONY: install-odt
install-odt:
	python3 -m pip install --requirement requirements/extra-odt.txt
|
|
|
|
|
|
|
|
# Install the packages listed in requirements/extra-pandoc.txt.
.PHONY: install-pypandoc
install-pypandoc:
	python3 -m pip install --requirement requirements/extra-pandoc.txt
|
|
|
|
|
|
|
|
# Install the packages listed in requirements/extra-markdown.txt.
.PHONY: install-markdown
install-markdown:
	python3 -m pip install --requirement requirements/extra-markdown.txt
|
|
|
|
|
|
|
|
# Install the packages listed in requirements/extra-msg.txt.
.PHONY: install-msg
install-msg:
	python3 -m pip install --requirement requirements/extra-msg.txt
|
|
|
|
|
|
|
|
# Install the packages listed in requirements/extra-pdf-image.txt.
.PHONY: install-pdf-image
install-pdf-image:
	python3 -m pip install --requirement requirements/extra-pdf-image.txt
|
|
|
|
|
|
|
|
# Install the packages listed in requirements/extra-pptx.txt.
.PHONY: install-pptx
install-pptx:
	python3 -m pip install --requirement requirements/extra-pptx.txt
|
|
|
|
|
|
|
|
# Install the packages listed in requirements/extra-xlsx.txt.
.PHONY: install-xlsx
install-xlsx:
	python3 -m pip install --requirement requirements/extra-xlsx.txt
|
|
|
|
|
|
|
|
# Aggregate target: install-base plus every per-document-type extra.
.PHONY: install-all-docs
install-all-docs: \
  install-base \
  install-csv \
  install-docx \
  install-epub \
  install-odt \
  install-pypandoc \
  install-markdown \
  install-msg \
  install-pdf-image \
  install-pptx \
  install-xlsx
|
2023-08-01 11:31:13 -04:00
|
|
|
|
2023-10-24 10:54:00 -04:00
|
|
|
# Install every *.txt requirements file found under requirements/ingest,
# one pip invocation per file.
# NOTE(review): with `-exec ... ;` find's own exit status may not reflect a
# failed pip run, so a broken requirements file could go unnoticed; confirm
# whether that best-effort behaviour is intended.
.PHONY: install-all-ingest
install-all-ingest:
	find requirements/ingest -type f -name '*.txt' -exec python3 -m pip install --requirement {} \;
|
|
|
|
|
|
|
|
|
2023-03-07 06:01:02 +00:00
|
|
|
# Install the packages listed in requirements/ingest/google-drive.txt.
.PHONY: install-ingest-google-drive
install-ingest-google-drive:
	python3 -m pip install --requirement requirements/ingest/google-drive.txt
|
2023-03-07 06:01:02 +00:00
|
|
|
|
2023-02-14 12:27:45 -08:00
|
|
|
## install-ingest-s3: install requirements for the s3 connector
# Installs the packages listed in requirements/ingest/s3.txt.
.PHONY: install-ingest-s3
install-ingest-s3:
	python3 -m pip install --requirement requirements/ingest/s3.txt
|
2023-02-14 12:27:45 -08:00
|
|
|
|
2023-06-21 15:14:50 -07:00
|
|
|
# Install the packages listed in requirements/ingest/gcs.txt.
.PHONY: install-ingest-gcs
install-ingest-gcs:
	python3 -m pip install --requirement requirements/ingest/gcs.txt
|
2023-06-21 15:14:50 -07:00
|
|
|
|
2023-06-30 17:08:27 -07:00
|
|
|
# Install the packages listed in requirements/ingest/dropbox.txt.
.PHONY: install-ingest-dropbox
install-ingest-dropbox:
	python3 -m pip install --requirement requirements/ingest/dropbox.txt
|
2023-06-30 17:08:27 -07:00
|
|
|
|
2023-03-11 00:43:40 +01:00
|
|
|
# Install the packages listed in requirements/ingest/azure.txt.
.PHONY: install-ingest-azure
install-ingest-azure:
	python3 -m pip install --requirement requirements/ingest/azure.txt
|
2023-03-11 00:43:40 +01:00
|
|
|
|
2023-07-31 18:10:10 -07:00
|
|
|
# Install the packages listed in requirements/ingest/box.txt.
.PHONY: install-ingest-box
install-ingest-box:
	python3 -m pip install --requirement requirements/ingest/box.txt
|
2023-07-31 18:10:10 -07:00
|
|
|
|
2023-08-22 10:19:46 -04:00
|
|
|
# Install the packages listed in requirements/ingest/delta-table.txt.
.PHONY: install-ingest-delta-table
install-ingest-delta-table:
	python3 -m pip install --requirement requirements/ingest/delta-table.txt
|
2023-08-22 10:19:46 -04:00
|
|
|
|
2023-05-16 11:46:30 -07:00
|
|
|
# Install the packages listed in requirements/ingest/discord.txt.
# pip is run as `python3 -m pip`, as in the other install-ingest-* recipes, so
# the packages land in the python3 environment rather than whichever `pip`
# happens to be first on PATH.
.PHONY: install-ingest-discord
install-ingest-discord:
	python3 -m pip install --requirement requirements/ingest/discord.txt
|
2023-05-16 11:46:30 -07:00
|
|
|
|
2023-02-27 23:36:44 +01:00
|
|
|
# Install the packages listed in requirements/ingest/github.txt.
.PHONY: install-ingest-github
install-ingest-github:
	python3 -m pip install --requirement requirements/ingest/github.txt
|
2023-02-27 23:36:44 +01:00
|
|
|
|
2023-08-21 15:16:50 -04:00
|
|
|
# Install the packages listed in requirements/ingest/biomed.txt.
.PHONY: install-ingest-biomed
install-ingest-biomed:
	python3 -m pip install --requirement requirements/ingest/biomed.txt
|
2023-08-21 15:16:50 -04:00
|
|
|
|
2023-03-08 09:15:21 +01:00
|
|
|
# Install the packages listed in requirements/ingest/gitlab.txt.
.PHONY: install-ingest-gitlab
install-ingest-gitlab:
	python3 -m pip install --requirement requirements/ingest/gitlab.txt
|
2023-03-08 09:15:21 +01:00
|
|
|
|
2023-07-13 14:57:54 -06:00
|
|
|
# Install the packages listed in requirements/ingest/onedrive.txt.
.PHONY: install-ingest-onedrive
install-ingest-onedrive:
	python3 -m pip install --requirement requirements/ingest/onedrive.txt
|
2023-07-13 14:57:54 -06:00
|
|
|
|
2023-07-25 21:09:26 -07:00
|
|
|
# Install the packages listed in requirements/ingest/outlook.txt.
.PHONY: install-ingest-outlook
install-ingest-outlook:
	python3 -m pip install --requirement requirements/ingest/outlook.txt
|
2023-07-25 21:09:26 -07:00
|
|
|
|
2023-02-27 09:11:04 +01:00
|
|
|
# Install the packages listed in requirements/ingest/reddit.txt.
.PHONY: install-ingest-reddit
install-ingest-reddit:
	python3 -m pip install --requirement requirements/ingest/reddit.txt
|
2023-02-27 09:11:04 +01:00
|
|
|
|
2023-04-16 12:34:43 -07:00
|
|
|
# Install the packages listed in requirements/ingest/slack.txt.
# pip is run as `python3 -m pip`, as in the other install-ingest-* recipes, so
# the packages land in the python3 environment rather than whichever `pip`
# happens to be first on PATH.
.PHONY: install-ingest-slack
install-ingest-slack:
	python3 -m pip install --requirement requirements/ingest/slack.txt
|
2023-04-16 12:34:43 -07:00
|
|
|
|
2023-02-28 09:25:11 +01:00
|
|
|
# Install the packages listed in requirements/ingest/wikipedia.txt.
.PHONY: install-ingest-wikipedia
install-ingest-wikipedia:
	python3 -m pip install --requirement requirements/ingest/wikipedia.txt
|
2023-02-28 09:25:11 +01:00
|
|
|
|
2023-07-01 18:45:28 +01:00
|
|
|
# Install the packages listed in requirements/ingest/elasticsearch.txt.
.PHONY: install-ingest-elasticsearch
install-ingest-elasticsearch:
	python3 -m pip install --requirement requirements/ingest/elasticsearch.txt
|
2023-07-18 19:29:41 +01:00
|
|
|
|
2024-01-16 20:31:49 -08:00
|
|
|
# Install the packages listed in requirements/ingest/opensearch.txt.
.PHONY: install-ingest-opensearch
install-ingest-opensearch:
	python3 -m pip install --requirement requirements/ingest/opensearch.txt
|
|
|
|
|
2023-07-18 19:29:41 +01:00
|
|
|
# Install the packages listed in requirements/ingest/confluence.txt.
.PHONY: install-ingest-confluence
install-ingest-confluence:
	python3 -m pip install --requirement requirements/ingest/confluence.txt
|
2023-07-18 19:29:41 +01:00
|
|
|
|
2023-08-11 22:02:51 +03:00
|
|
|
# Install the packages listed in requirements/ingest/airtable.txt.
.PHONY: install-ingest-airtable
install-ingest-airtable:
	python3 -m pip install --requirement requirements/ingest/airtable.txt
|
2023-08-11 22:02:51 +03:00
|
|
|
|
2023-08-15 00:15:44 -05:00
|
|
|
# Install the packages listed in requirements/ingest/sharepoint.txt.
.PHONY: install-ingest-sharepoint
install-ingest-sharepoint:
	python3 -m pip install --requirement requirements/ingest/sharepoint.txt
|
2023-08-15 00:15:44 -05:00
|
|
|
|
2023-12-01 16:27:41 -06:00
|
|
|
# Install the packages listed in requirements/ingest/weaviate.txt.
.PHONY: install-ingest-weaviate
install-ingest-weaviate:
	python3 -m pip install --requirement requirements/ingest/weaviate.txt
|
|
|
|
|
2023-08-21 15:16:50 -04:00
|
|
|
# Placeholder target: installs nothing and only prints a message.
.PHONY: install-ingest-local
install-ingest-local:
	echo "no unique dependencies for local connector"
|
|
|
|
|
|
|
|
# Install the packages listed in requirements/ingest/notion.txt.
.PHONY: install-ingest-notion
install-ingest-notion:
	python3 -m pip install --requirement requirements/ingest/notion.txt
|
2023-08-21 15:16:50 -04:00
|
|
|
|
2023-09-02 08:50:31 -07:00
|
|
|
# Install the packages listed in requirements/ingest/salesforce.txt.
.PHONY: install-ingest-salesforce
install-ingest-salesforce:
	python3 -m pip install --requirement requirements/ingest/salesforce.txt
|
2023-09-02 08:50:31 -07:00
|
|
|
|
feat: jira connector (cloud) (#1238)
This connector:
- takes a Jira Cloud URL, user email and api token; to authenticate into
Jira Cloud
- ingests:
- either all issues in all projects in a Jira Cloud Organization
- or
- issues in user specified projects, boards
- user specified issues
- processes this kind of data:
- text fields such as issue summary, description, and comments
- dropdown fields such as issue type, status, priority, assignee,
reporter, labels, and components
- other data such as issue id, issue key, project id, information on
subtasks
- notes down attachment URLs, however does not process attachments
- stores each downloaded issue in a txt file, in a predefined template
form (consisting of the data above)
- then processes each downloaded issue document into elements using
unstructured library
- related to: https://github.com/Unstructured-IO/unstructured/issues/263
To test the changes, make the necessary setups and run the relevant
ingest test scripts.
---------
Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: ahmetmeleq <ahmetmeleq@users.noreply.github.com>
2023-09-06 13:10:48 +03:00
|
|
|
# Install the packages listed in requirements/ingest/jira.txt.
.PHONY: install-ingest-jira
install-ingest-jira:
	python3 -m pip install --requirement requirements/ingest/jira.txt
|
feat: jira connector (cloud) (#1238)
This connector:
- takes a Jira Cloud URL, user email and api token; to authenticate into
Jira Cloud
- ingests:
- either all issues in all projects in a Jira Cloud Organization
- or
- issues in user specified projects, boards
- user specified issues
- processes this kind of data:
- text fields such as issue summary, description, and comments
- dropdown fields such as issue type, status, priority, assignee,
reporter, labels, and components
- other data such as issue id, issue key, project id, information on
subtasks
- notes down attachment URLs, however does not process attachments
- stores each downloaded issue in a txt file, in a predefined template
form (consisting of the data above)
- then processes each downloaded issue document into elements using
unstructured library
- related to: https://github.com/Unstructured-IO/unstructured/issues/263
To test the changes, make the necessary setups and run the relevant
ingest test scripts.
---------
Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: ahmetmeleq <ahmetmeleq@users.noreply.github.com>
2023-09-06 13:10:48 +03:00
|
|
|
|
2023-11-28 17:07:57 -06:00
|
|
|
# Install the packages listed in requirements/ingest/hubspot.txt.
.PHONY: install-ingest-hubspot
install-ingest-hubspot:
	python3 -m pip install --requirement requirements/ingest/hubspot.txt
|
|
|
|
|
|
|
|
# Install the packages listed in requirements/ingest/sftp.txt.
.PHONY: install-ingest-sftp
install-ingest-sftp:
	python3 -m pip install --requirement requirements/ingest/sftp.txt
|
2023-11-28 17:07:57 -06:00
|
|
|
|
2023-11-29 22:37:32 +00:00
|
|
|
# Install the packages listed in requirements/ingest/pinecone.txt.
.PHONY: install-ingest-pinecone
install-ingest-pinecone:
	python3 -m pip install --requirement requirements/ingest/pinecone.txt
|
|
|
|
|
2024-01-02 14:08:20 -08:00
|
|
|
# Install the packages listed in requirements/ingest/qdrant.txt.
.PHONY: install-ingest-qdrant
install-ingest-qdrant:
	python3 -m pip install --requirement requirements/ingest/qdrant.txt
|
|
|
|
|
2023-12-19 08:58:23 -08:00
|
|
|
# Install the packages listed in requirements/ingest/chroma.txt.
.PHONY: install-ingest-chroma
install-ingest-chroma:
	python3 -m pip install --requirement requirements/ingest/chroma.txt
|
2023-11-29 22:37:32 +00:00
|
|
|
|
2024-01-04 13:33:16 -06:00
|
|
|
# Install the packages listed in requirements/ingest/postgres.txt.
.PHONY: install-ingest-postgres
install-ingest-postgres:
	python3 -m pip install --requirement requirements/ingest/postgres.txt
|
2024-01-04 13:33:16 -06:00
|
|
|
|
2024-01-16 12:56:29 -08:00
|
|
|
# Install the packages listed in requirements/ingest/mongodb.txt.
.PHONY: install-ingest-mongodb
install-ingest-mongodb:
	python3 -m pip install --requirement requirements/ingest/mongodb.txt
|
|
|
|
|
|
|
|
# Install the packages listed in requirements/ingest/databricks-volumes.txt.
.PHONY: install-ingest-databricks-volumes
install-ingest-databricks-volumes:
	python3 -m pip install --requirement requirements/ingest/databricks-volumes.txt
|
2024-01-16 12:56:29 -08:00
|
|
|
|
2024-02-23 12:50:50 -08:00
|
|
|
# Install the packages listed in requirements/ingest/astra.txt.
.PHONY: install-ingest-astra
install-ingest-astra:
	python3 -m pip install --requirement requirements/ingest/astra.txt
|
|
|
|
|
2024-03-21 09:36:21 -07:00
|
|
|
# Install the packages listed in requirements/ingest/clarifai.txt.
.PHONY: install-ingest-clarifai
install-ingest-clarifai:
	python3 -m pip install --requirement requirements/ingest/clarifai.txt
|
|
|
|
|
2023-10-19 11:51:36 -05:00
|
|
|
# Install the packages listed in requirements/ingest/embed-huggingface.txt.
.PHONY: install-embed-huggingface
install-embed-huggingface:
	python3 -m pip install --requirement requirements/ingest/embed-huggingface.txt
|
2023-10-19 11:51:36 -05:00
|
|
|
|
2023-01-04 16:19:05 -06:00
|
|
|
# Install the packages listed in requirements/ingest/local-inference.txt.
.PHONY: install-unstructured-inference
install-unstructured-inference:
	python3 -m pip install --requirement requirements/ingest/local-inference.txt
|
2023-01-04 16:19:05 -06:00
|
|
|
|
|
|
|
## install-local-inference: installs requirements for local inference
# Aggregate target: it has no recipe of its own, only prerequisites.
.PHONY: install-local-inference
install-local-inference: \
  install \
  install-all-docs
|
2023-01-04 16:19:05 -06:00
|
|
|
|
2023-05-26 15:38:48 -04:00
|
|
|
# Run scripts/install-pandoc.sh with ARCH passed through its environment.
.PHONY: install-pandoc
install-pandoc:
	ARCH=$(ARCH) ./scripts/install-pandoc.sh
|
|
|
|
|
2023-09-15 17:05:48 -07:00
|
|
|
# Run scripts/install-paddleocr.sh with ARCH passed through its environment.
.PHONY: install-paddleocr
install-paddleocr:
	ARCH=$(ARCH) ./scripts/install-paddleocr.sh
|
2023-05-26 15:38:48 -04:00
|
|
|
|
2022-06-29 14:35:19 -04:00
|
|
|
## pip-compile: compiles all base/dev/test requirements
# Delegates to scripts/pip-compile.sh; the command itself is not echoed.
.PHONY: pip-compile
pip-compile:
	@scripts/pip-compile.sh
|
2023-08-31 18:19:53 -04:00
|
|
|
|
2022-06-29 14:35:19 -04:00
|
|
|
## install-project-local: install unstructured into your local python environment
# Runs the `install` target first, then installs this checkout in editable
# mode. pip is invoked as `python3 -m pip`, matching the other install
# recipes, so the package goes into the same python3 environment.
.PHONY: install-project-local
install-project-local: install
	# MAYBE TODO: fail if already exists?
	python3 -m pip install --editable .
|
|
|
|
|
|
|
|
## uninstall-project-local: uninstall unstructured from your local python environment
# pip is invoked as `python3 -m pip`, matching the install recipes, so the
# package is removed from the same python3 environment it was installed into.
# No --yes flag is passed, so pip may still ask for confirmation.
.PHONY: uninstall-project-local
uninstall-project-local:
	python3 -m pip uninstall $(PACKAGE_NAME)
|
|
|
|
|
|
|
|
#################
|
|
|
|
# Test and Lint #
|
|
|
|
#################
|
|
|
|
|
2023-06-29 10:31:01 -07:00
|
|
|
# Defaults for two variables exported to every recipe's environment; `?=`
# keeps any value that is already set.
export CI ?= false
export UNSTRUCTURED_INCLUDE_DEBUG_METADATA ?= false
|
2023-06-29 10:31:01 -07:00
|
|
|
|
2022-06-29 14:35:19 -04:00
|
|
|
## test: runs all unittests
# Runs pytest on the package's test directory, deselecting tests marked
# "chipper", with coverage reporting and the 40 slowest durations.
.PHONY: test
test:
	PYTHONPATH=. \
	CI=$(CI) \
	UNSTRUCTURED_INCLUDE_DEBUG_METADATA=$(UNSTRUCTURED_INCLUDE_DEBUG_METADATA) \
	pytest test_$(PACKAGE_NAME) \
		-m "not chipper" \
		--cov=$(PACKAGE_NAME) \
		--cov-report term-missing \
		--durations=40
|
|
|
|
|
|
|
|
# Same pytest invocation as `test`, but selecting only tests marked "chipper".
.PHONY: test-chipper
test-chipper:
	PYTHONPATH=. \
	CI=$(CI) \
	UNSTRUCTURED_INCLUDE_DEBUG_METADATA=$(UNSTRUCTURED_INCLUDE_DEBUG_METADATA) \
	pytest test_$(PACKAGE_NAME) \
		-m "chipper" \
		--cov=$(PACKAGE_NAME) \
		--cov-report term-missing \
		--durations=40
|
2022-06-29 14:35:19 -04:00
|
|
|
|
2023-07-26 16:55:35 -04:00
|
|
|
# Delegates to scripts/test-unstructured-api-unit.sh.
.PHONY: test-unstructured-api-unit
test-unstructured-api-unit:
	scripts/test-unstructured-api-unit.sh
|
|
|
|
|
2023-08-19 12:56:13 -04:00
|
|
|
# Runs pytest on the text, email, HTML and XML partition test modules only.
# TODO(newelh) Add json test when fixed
.PHONY: test-no-extras
test-no-extras:
	PYTHONPATH=. \
	CI=$(CI) \
	UNSTRUCTURED_INCLUDE_DEBUG_METADATA=$(UNSTRUCTURED_INCLUDE_DEBUG_METADATA) \
	pytest \
		test_$(PACKAGE_NAME)/partition/test_text.py \
		test_$(PACKAGE_NAME)/partition/test_email.py \
		test_$(PACKAGE_NAME)/partition/test_html_partition.py \
		test_$(PACKAGE_NAME)/partition/test_xml_partition.py
|
2023-08-19 12:56:13 -04:00
|
|
|
|
|
|
|
# Run pytest on the partition/csv test directory.
.PHONY: test-extra-csv
test-extra-csv:
	PYTHONPATH=. CI=$(CI) pytest test_$(PACKAGE_NAME)/partition/csv
|
|
|
|
|
|
|
|
# Run pytest on the partition/docx test directory.
.PHONY: test-extra-docx
test-extra-docx:
	PYTHONPATH=. CI=$(CI) pytest test_$(PACKAGE_NAME)/partition/docx
|
|
|
|
|
|
|
|
# Run pytest on the partition/markdown test directory.
.PHONY: test-extra-markdown
test-extra-markdown:
	PYTHONPATH=. CI=$(CI) pytest test_$(PACKAGE_NAME)/partition/markdown
|
|
|
|
|
|
|
|
# Run pytest on the partition/msg test directory.
.PHONY: test-extra-msg
test-extra-msg:
	PYTHONPATH=. CI=$(CI) pytest test_$(PACKAGE_NAME)/partition/msg
|
|
|
|
|
|
|
|
# Run pytest on the partition/odt test directory.
.PHONY: test-extra-odt
test-extra-odt:
	PYTHONPATH=. CI=$(CI) pytest test_$(PACKAGE_NAME)/partition/odt
|
|
|
|
|
|
|
|
# Run pytest on the partition/pdf_image test directory.
.PHONY: test-extra-pdf-image
test-extra-pdf-image:
	PYTHONPATH=. CI=$(CI) pytest test_$(PACKAGE_NAME)/partition/pdf_image
|
2023-08-19 12:56:13 -04:00
|
|
|
|
|
|
|
# Run pytest on the partition/pptx test directory.
.PHONY: test-extra-pptx
test-extra-pptx:
	PYTHONPATH=. CI=$(CI) pytest test_$(PACKAGE_NAME)/partition/pptx
|
|
|
|
|
|
|
|
# Run pytest on the partition/epub test directory.
.PHONY: test-extra-epub
test-extra-epub:
	PYTHONPATH=. CI=$(CI) pytest test_$(PACKAGE_NAME)/partition/epub
|
|
|
|
|
|
|
|
# Run pytest on the partition/pypandoc test directory.
.PHONY: test-extra-pypandoc
test-extra-pypandoc:
	PYTHONPATH=. CI=$(CI) pytest test_$(PACKAGE_NAME)/partition/pypandoc
|
|
|
|
|
|
|
|
# Run pytest on the partition/xlsx test directory.
.PHONY: test-extra-xlsx
test-extra-xlsx:
	PYTHONPATH=. CI=$(CI) pytest test_$(PACKAGE_NAME)/partition/xlsx
|
|
|
|
|
2022-06-29 14:35:19 -04:00
|
|
|
## check: runs linters (includes tests)
# Aggregate target: it has no recipe of its own, only prerequisites.
.PHONY: check
check: \
  check-ruff \
  check-black \
  check-flake8 \
  check-version \
  check-flake8-print
|
2022-06-29 14:35:19 -04:00
|
|
|
|
2023-12-11 20:04:15 -05:00
|
|
|
# Run shfmt in diff mode (-d) with a 2-space indent over the whole tree.
.PHONY: check-shfmt
check-shfmt:
	shfmt -i 2 -d .
|
2023-12-11 20:04:15 -05:00
|
|
|
|
2023-10-17 08:45:12 -04:00
|
|
|
# Run black in check-only mode with a 100-column line length.
.PHONY: check-black
check-black:
	black --check --line-length=100 .
|
2023-10-17 08:45:12 -04:00
|
|
|
|
|
|
|
# Run flake8 over the whole tree.
.PHONY: check-flake8
check-flake8:
	flake8 .
|
|
|
|
|
2023-12-05 11:42:23 -05:00
|
|
|
# Look for print statements in the ingest code: console output there should go
# through the ingest logger, which has a built-in filter that redacts
# sensitive information. The per-file ignores are passed as an empty string.
.PHONY: check-flake8-print
check-flake8-print:
	flake8 --per-file-ignores "" ./unstructured/ingest
|
|
|
|
|
2023-10-17 08:45:12 -04:00
|
|
|
# Run ruff over the whole tree.
.PHONY: check-ruff
check-ruff:
	# -- ruff options are determined by pyproject.toml --
	ruff .
|
2022-06-29 14:35:19 -04:00
|
|
|
|
2023-10-17 08:45:12 -04:00
|
|
|
# Run autoflake with --check-diff over the whole tree.
.PHONY: check-autoflake
check-autoflake:
	autoflake --check-diff .
|
2022-06-29 14:35:19 -04:00
|
|
|
|
2022-09-29 15:24:28 -04:00
|
|
|
## check-scripts: run shellcheck
# Delegates to scripts/shellcheck.sh.
.PHONY: check-scripts
check-scripts:
	# Fail if any of these files have warnings
	scripts/shellcheck.sh
|
|
|
|
|
2022-10-10 13:11:48 -05:00
|
|
|
## check-version: run check to ensure version in CHANGELOG.md matches version in package
# Calls scripts/version-sync.sh with -c against unstructured/__version__.py.
.PHONY: check-version
check-version:
	# Fail if syncing version would produce changes
	scripts/version-sync.sh -c -f "unstructured/__version__.py" semver
|
2022-10-10 13:11:48 -05:00
|
|
|
|
2022-06-29 14:35:19 -04:00
|
|
|
## tidy: run the Python formatters (ruff --fix-only, autoflake, black) via tidy-python
# Aggregate target: it only depends on tidy-python and has no recipe.
.PHONY: tidy
tidy: tidy-python
|
|
|
|
|
|
|
|
# Reformat shell scripts in place with shfmt (2-space indent), listing the
# files it rewrites. The .PHONY name must match the target name exactly,
# otherwise a file called `tidy-shell` would stop the recipe from running.
.PHONY: tidy-shell
tidy-shell:
	shfmt -i 2 -l -w .
|
2023-12-11 20:04:15 -05:00
|
|
|
|
|
|
|
# Apply ruff fixes, then autoflake and black, in place over the whole tree.
# The `|| true` keeps the recipe going when ruff exits non-zero.
.PHONY: tidy-python
tidy-python:
	ruff . --fix-only || true
	autoflake --in-place .
	black --line-length=100 .
|
2022-06-29 14:35:19 -04:00
|
|
|
|
2022-10-10 13:11:48 -05:00
|
|
|
## version-sync: update __version__.py with most recent version from CHANGELOG.md
# Calls scripts/version-sync.sh against unstructured/__version__.py.
.PHONY: version-sync
version-sync:
	scripts/version-sync.sh -f "unstructured/__version__.py" semver
|
2022-10-10 13:11:48 -05:00
|
|
|
|
2022-06-29 14:35:19 -04:00
|
|
|
# Fail when the recorded coverage total is below 95 percent.
.PHONY: check-coverage
check-coverage:
	coverage report --fail-under=95
|
2023-03-14 13:40:01 -07:00
|
|
|
|
2023-05-24 17:29:35 -05:00
|
|
|
## check-deps: check consistency of dependencies
# Delegates to scripts/consistent-deps.sh.
.PHONY: check-deps
check-deps:
	scripts/consistent-deps.sh
|
|
|
|
|
2024-03-06 13:59:08 -05:00
|
|
|
# Delegates to scripts/check-extras.sh.
.PHONY: check-extras
check-extras:
	scripts/check-extras.sh
|
|
|
|
|
2023-03-14 13:40:01 -07:00
|
|
|
##########
|
|
|
|
# Docker #
|
|
|
|
##########
|
|
|
|
|
|
|
|
# The Docker targets are a convenience; a standard development environment
# does not need them.
# Image name used by the docker-* targets; `?=` keeps any value already set.
DOCKER_IMAGE ?= unstructured:dev
|
2023-03-29 00:02:39 -07:00
|
|
|
|
2023-03-14 13:40:01 -07:00
|
|
|
# Run scripts/docker-build.sh with PIP_VERSION and DOCKER_IMAGE_NAME passed
# through its environment.
.PHONY: docker-build
docker-build:
	PIP_VERSION=$(PIP_VERSION) DOCKER_IMAGE_NAME=$(DOCKER_IMAGE) ./scripts/docker-build.sh
|
2023-03-14 13:40:01 -07:00
|
|
|
|
|
|
|
# Start an interactive container from DOCKER_IMAGE; it is removed on exit.
.PHONY: docker-start-bash
docker-start-bash:
	docker run -ti --rm $(DOCKER_IMAGE)
|
2023-03-21 13:46:09 -07:00
|
|
|
|
2023-08-31 14:26:29 -05:00
|
|
|
# Start an interactive container with the current directory bind-mounted
# inside it; the container is removed on exit.
# NOTE(review): the mount point is spelled "local_unstructued" — confirm
# whether anything relies on that spelling before renaming it.
.PHONY: docker-start-dev
docker-start-dev:
	docker run --rm -v $(CURRENT_DIR):/mnt/local_unstructued -ti $(DOCKER_IMAGE)
|
|
|
|
|
2023-03-21 13:46:09 -07:00
|
|
|
# Run pytest inside a throw-away container with the two local test
# directories bind-mounted. The file uns_test_env_file is passed as
# --env-file only when it exists. TEST_FILE selects what pytest runs and
# falls back to test_unstructured when empty.
.PHONY: docker-test
docker-test:
	docker run --rm \
		-v $(CURRENT_DIR)/test_unstructured:/home/notebook-user/test_unstructured \
		-v $(CURRENT_DIR)/test_unstructured_ingest:/home/notebook-user/test_unstructured_ingest \
		$(if $(wildcard uns_test_env_file),--env-file uns_test_env_file) \
		$(DOCKER_IMAGE) \
		bash -c "CI=$(CI) \
		UNSTRUCTURED_INCLUDE_DEBUG_METADATA=$(UNSTRUCTURED_INCLUDE_DEBUG_METADATA) \
		pytest -m 'not chipper' $(or $(TEST_FILE),test_unstructured)"
|
2023-03-30 13:23:30 -07:00
|
|
|
|
|
|
|
# Run scripts/docker-smoke-test.sh with DOCKER_IMAGE passed through its environment.
.PHONY: docker-smoke-test
docker-smoke-test:
	DOCKER_IMAGE=$(DOCKER_IMAGE) ./scripts/docker-smoke-test.sh
|
2023-05-31 17:01:23 +03:00
|
|
|
|
|
|
|
|
|
|
|
###########
|
|
|
|
# Jupyter #
|
|
|
|
###########
|
|
|
|
|
|
|
|
# Start jupyter-notebook in a throw-away container: port 8888 is published and
# the current directory is bind-mounted at /home.
# The notebook token and password are both set to empty strings.
.PHONY: docker-jupyter-notebook
docker-jupyter-notebook:
	docker run -p 8888:8888 \
		--mount type=bind,source=$(realpath .),target=/home \
		--entrypoint jupyter-notebook \
		-t --rm $(DOCKER_IMAGE) \
		--allow-root --port 8888 --ip 0.0.0.0 \
		--NotebookApp.token='' --NotebookApp.password=''
|
|
|
|
|
|
|
|
|
|
|
|
# Start jupyter-notebook locally with PYTHONPATH and JUPYTER_PATH set to the
# current directory. The notebook token and password are both empty strings.
.PHONY: run-jupyter
run-jupyter:
	PYTHONPATH=$(realpath .) JUPYTER_PATH=$(realpath .) \
	jupyter-notebook --NotebookApp.token='' --NotebookApp.password=''
|