diff --git a/CHANGELOG.md b/CHANGELOG.md index 580b0234c..d57e4a1b1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,9 @@ +## 0.9.0 + +### Enhancements + +* Dependencies are now split by document type, creating a slimmer base installation. + ## 0.8.8 ### Enhancements @@ -6,6 +12,7 @@ ### Fixes + * Rename "date" field to "last_modified" * Adds Box connector diff --git a/Dockerfile b/Dockerfile index f84cfb6ec..d3ec67f0e 100644 --- a/Dockerfile +++ b/Dockerfile @@ -30,7 +30,15 @@ RUN python3.8 -m pip install pip==${PIP_VERSION} && \ pip install --no-cache -r requirements/ingest-s3.txt && \ pip install --no-cache -r requirements/ingest-slack.txt && \ pip install --no-cache -r requirements/ingest-wikipedia.txt && \ - pip install --no-cache -r requirements/local-inference.txt && \ + pip install --no-cache -r requirements/extra-csv.txt && \ + pip install --no-cache -r requirements/extra-docx.txt && \ + pip install --no-cache -r requirements/extra-markdown.txt && \ + pip install --no-cache -r requirements/extra-msg.txt && \ + pip install --no-cache -r requirements/extra-odt.txt && \ + pip install --no-cache -r requirements/extra-pandoc.txt && \ + pip install --no-cache -r requirements/extra-pdf-image.txt && \ + pip install --no-cache -r requirements/extra-pptx.txt && \ + pip install --no-cache -r requirements/extra-xlsx.txt && \ dnf -y groupremove "Development Tools" && \ dnf clean all diff --git a/Makefile b/Makefile index fd5f73a68..10c77f045 100644 --- a/Makefile +++ b/Makefile @@ -18,10 +18,10 @@ install-base: install-base-pip-packages install-nltk-models ## install: installs all test, dev, and experimental requirements .PHONY: install -install: install-base-pip-packages install-dev install-nltk-models install-test install-huggingface install-unstructured-inference +install: install-base-pip-packages install-dev install-nltk-models install-test install-huggingface install-all-docs .PHONY: install-ci -install-ci: install-base-pip-packages install-nltk-models 
install-huggingface install-unstructured-inference install-test +install-ci: install-base-pip-packages install-nltk-models install-huggingface install-all-docs install-test .PHONY: install-base-pip-packages install-base-pip-packages: @@ -53,6 +53,45 @@ install-dev: install-build: python3 -m pip install -r requirements/build.txt +.PHONY: install-csv +install-csv: + python3 -m pip install -r requirements/extra-csv.txt + +.PHONY: install-docx +install-docx: + python3 -m pip install -r requirements/extra-docx.txt + +.PHONY: install-odt +install-odt: + python3 -m pip install -r requirements/extra-odt.txt + +.PHONY: install-pypandoc +install-pypandoc: + python3 -m pip install -r requirements/extra-pandoc.txt + +.PHONY: install-markdown +install-markdown: + python3 -m pip install -r requirements/extra-markdown.txt + +.PHONY: install-msg +install-msg: + python3 -m pip install -r requirements/extra-msg.txt + +.PHONY: install-pdf-image +install-pdf-image: + python3 -m pip install -r requirements/extra-pdf-image.txt + +.PHONY: install-pptx +install-pptx: + python3 -m pip install -r requirements/extra-pptx.txt + +.PHONY: install-xlsx +install-xlsx: + python3 -m pip install -r requirements/extra-xlsx.txt + +.PHONY: install-all-docs +install-all-docs: install-base install-csv install-docx install-odt install-pypandoc install-markdown install-msg install-pdf-image install-pptx install-xlsx + .PHONY: install-ingest-google-drive install-ingest-google-drive: python3 -m pip install -r requirements/ingest-google-drive.txt @@ -124,7 +163,7 @@ install-unstructured-inference: ## install-local-inference: installs requirements for local inference .PHONY: install-local-inference -install-local-inference: install install-unstructured-inference +install-local-inference: install install-all-docs .PHONY: install-pandoc install-pandoc: @@ -135,12 +174,23 @@ install-pandoc: .PHONY: pip-compile pip-compile: pip-compile --upgrade requirements/base.in + + # Extra requirements that are
specific to document types + pip-compile --upgrade requirements/extra-csv.in + pip-compile --upgrade requirements/extra-docx.in + pip-compile --upgrade requirements/extra-pandoc.in + pip-compile --upgrade requirements/extra-markdown.in + pip-compile --upgrade requirements/extra-msg.in + pip-compile --upgrade requirements/extra-odt.in + pip-compile --upgrade requirements/extra-pdf-image.in + pip-compile --upgrade requirements/extra-pptx.in + pip-compile --upgrade requirements/extra-xlsx.in + # Extra requirements for huggingface staging functions pip-compile --upgrade requirements/huggingface.in pip-compile --upgrade requirements/test.in pip-compile --upgrade requirements/dev.in pip-compile --upgrade requirements/build.in - pip-compile --upgrade requirements/local-inference.in # NOTE(robinson) - doc/requirements.txt is where the GitHub action for building # sphinx docs looks for additional requirements cp requirements/build.txt docs/requirements.txt @@ -158,6 +208,7 @@ pip-compile: pip-compile --upgrade requirements/ingest-google-drive.in pip-compile --upgrade requirements/ingest-elasticsearch.in pip-compile --upgrade requirements/ingest-onedrive.in + pip-compile --upgrade requirements/ingest-outlook.in pip-compile --upgrade requirements/ingest-confluence.in ## install-project-local: install unstructured into your local python environment diff --git a/docs/requirements.txt b/docs/requirements.txt index f831436ff..958e379c9 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.8 +# This file is autogenerated by pip-compile with Python 3.11 # by the following command: # # pip-compile requirements/build.in @@ -12,10 +12,14 @@ beautifulsoup4==4.12.2 # via furo certifi==2023.7.22 # via + # -c requirements/base.txt + # -c requirements/constraints.in # -r requirements/build.in # requests charset-normalizer==3.2.0 - # via requests + # via + # -c requirements/base.txt + # requests 
docutils==0.18.1 # via # sphinx @@ -23,11 +27,11 @@ docutils==0.18.1 furo==2023.7.26 # via -r requirements/build.in idna==3.4 - # via requests + # via + # -c requirements/base.txt + # requests imagesize==1.4.1 # via sphinx -importlib-metadata==6.8.0 - # via sphinx jinja2==3.1.2 # via sphinx markupsafe==2.1.3 @@ -38,10 +42,10 @@ pygments==2.15.1 # via # furo # sphinx -pytz==2023.3 - # via babel requests==2.31.0 - # via sphinx + # via + # -c requirements/base.txt + # sphinx snowballstemmer==2.2.0 # via sphinx soupsieve==2.4.1 @@ -71,7 +75,8 @@ sphinxcontrib-qthelp==1.0.3 # via sphinx sphinxcontrib-serializinghtml==1.1.5 # via sphinx -urllib3==2.0.4 - # via requests -zipp==3.16.2 - # via importlib-metadata +urllib3==1.26.16 + # via + # -c requirements/base.txt + # -c requirements/constraints.in + # requests diff --git a/docs/source/installing.rst b/docs/source/installing.rst index 49d465078..5171ebed9 100644 --- a/docs/source/installing.rst +++ b/docs/source/installing.rst @@ -7,8 +7,15 @@ Quick Start Use the following instructions to get up and running with ``unstructured`` and test your installation. -* Install the Python SDK with ``pip install "unstructured[local-inference]"`` - * If you do not need to process PDFs or images, you can run ``pip install unstructured`` +* Install the Python SDK with ``pip install unstructured`` + * Plain text files, HTML, XML, JSON and Emails do not require any extra dependencies. + * If you need to process other document types, you can install the extras required for those documents + with ``pip install "unstructured[docx,pptx]"``. + * To install the extras for every document type, use ``pip install "unstructured[all-docs]"``. + * For ``unstructured<0.9.0``, you can install the extras for all document types with + ``pip install "unstructured[local-inference]"``. The ``local-inference`` extra is still + supported in newer versions for backward compatibility, but may be deprecated in a future version. 
+ The ``all-docs`` extra is the officially supported installation pattern. * Install the following system dependencies if they are not already available on your system. Depending on what document types you're parsing, you may not need all of these. * ``libmagic-dev`` (filetype detection) diff --git a/requirements/base.in b/requirements/base.in index 5ac5957e3..acc32d982 100644 --- a/requirements/base.in +++ b/requirements/base.in @@ -1,19 +1,8 @@ -c "constraints.in" chardet filetype -lxml -msg_parser -nltk -openpyxl -pandas -pdf2image -pdfminer.six -pillow -pypandoc -python-docx -python-pptx python-magic -markdown -requests +lxml +nltk tabulate -xlrd +requests diff --git a/requirements/base.txt b/requirements/base.txt index 03125fc3c..d4f27a422 100644 --- a/requirements/base.txt +++ b/requirements/base.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.8 +# This file is autogenerated by pip-compile with Python 3.11 # by the following command: # # pip-compile requirements/base.in @@ -8,89 +8,33 @@ certifi==2023.7.22 # via # -c requirements/constraints.in # requests -cffi==1.15.1 - # via cryptography chardet==5.1.0 # via -r requirements/base.in charset-normalizer==3.2.0 - # via - # pdfminer-six - # requests + # via requests click==8.1.6 # via nltk -cryptography==41.0.2 - # via pdfminer-six -et-xmlfile==1.1.0 - # via openpyxl filetype==1.2.0 # via -r requirements/base.in idna==3.4 # via requests -importlib-metadata==6.8.0 - # via markdown joblib==1.3.1 # via nltk lxml==4.9.3 - # via - # -r requirements/base.in - # python-docx - # python-pptx -markdown==3.4.4 - # via -r requirements/base.in -msg-parser==1.2.0 # via -r requirements/base.in nltk==3.8.1 # via -r requirements/base.in -numpy==1.24.4 - # via pandas -olefile==0.46 - # via msg-parser -openpyxl==3.1.2 - # via -r requirements/base.in -pandas==2.0.3 - # via -r requirements/base.in -pdf2image==1.16.3 - # via -r requirements/base.in -pdfminer-six==20221105 - # via -r 
requirements/base.in -pillow==10.0.0 - # via - # -r requirements/base.in - # pdf2image - # python-pptx -pycparser==2.21 - # via cffi -pypandoc==1.11 - # via -r requirements/base.in -python-dateutil==2.8.2 - # via pandas -python-docx==0.8.11 - # via -r requirements/base.in python-magic==0.4.27 # via -r requirements/base.in -python-pptx==0.6.21 - # via -r requirements/base.in -pytz==2023.3 - # via pandas regex==2023.6.3 # via nltk requests==2.31.0 # via -r requirements/base.in -six==1.16.0 - # via python-dateutil tabulate==0.9.0 # via -r requirements/base.in tqdm==4.65.0 # via nltk -tzdata==2023.3 - # via pandas urllib3==1.26.16 # via # -c requirements/constraints.in # requests -xlrd==2.0.1 - # via -r requirements/base.in -xlsxwriter==3.1.2 - # via python-pptx -zipp==3.16.2 - # via importlib-metadata diff --git a/requirements/build.in b/requirements/build.in index 0ba653ba8..dba8fb1f4 100644 --- a/requirements/build.in +++ b/requirements/build.in @@ -1,3 +1,6 @@ +-c base.txt +-c constraints.in + sphinx # NOTE(alan) - Pinning to resolve a conflict with sphinx. We can unpin on next sphinx_rtd_theme release. 
sphinx_rtd_theme==1.2.2 diff --git a/requirements/build.txt b/requirements/build.txt index f831436ff..958e379c9 100644 --- a/requirements/build.txt +++ b/requirements/build.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.8 +# This file is autogenerated by pip-compile with Python 3.11 # by the following command: # # pip-compile requirements/build.in @@ -12,10 +12,14 @@ beautifulsoup4==4.12.2 # via furo certifi==2023.7.22 # via + # -c requirements/base.txt + # -c requirements/constraints.in # -r requirements/build.in # requests charset-normalizer==3.2.0 - # via requests + # via + # -c requirements/base.txt + # requests docutils==0.18.1 # via # sphinx @@ -23,11 +27,11 @@ docutils==0.18.1 furo==2023.7.26 # via -r requirements/build.in idna==3.4 - # via requests + # via + # -c requirements/base.txt + # requests imagesize==1.4.1 # via sphinx -importlib-metadata==6.8.0 - # via sphinx jinja2==3.1.2 # via sphinx markupsafe==2.1.3 @@ -38,10 +42,10 @@ pygments==2.15.1 # via # furo # sphinx -pytz==2023.3 - # via babel requests==2.31.0 - # via sphinx + # via + # -c requirements/base.txt + # sphinx snowballstemmer==2.2.0 # via sphinx soupsieve==2.4.1 @@ -71,7 +75,8 @@ sphinxcontrib-qthelp==1.0.3 # via sphinx sphinxcontrib-serializinghtml==1.1.5 # via sphinx -urllib3==2.0.4 - # via requests -zipp==3.16.2 - # via importlib-metadata +urllib3==1.26.16 + # via + # -c requirements/base.txt + # -c requirements/constraints.in + # requests diff --git a/requirements/constraints.in b/requirements/constraints.in index 65faee62a..fbe60d75f 100644 --- a/requirements/constraints.in +++ b/requirements/constraints.in @@ -12,6 +12,13 @@ jupyter-core>=4.11.2 wheel>=0.38.1 # NOTE(robinson) - The following pins are to address # vulnerabilities in dependency scans -certifi>=2022.12.07 +certifi>=2023.7.22 # From pycocotools in local-inference pyparsing<3.1.0 +# NOTE(robinson) - Numpy dropped Python 3.8 support in 1.25.0 +numpy<1.25.0 +scipy<1.11.0 +IPython<8.13 +# 
NOTE(robinson) - See this issue here +# https://github.com/facebookresearch/detectron2/issues/5010 +Pillow<10.0.0 diff --git a/requirements/dev.txt b/requirements/dev.txt index 89f19697d..c976f81ba 100644 --- a/requirements/dev.txt +++ b/requirements/dev.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.8 +# This file is autogenerated by pip-compile with Python 3.11 # by the following command: # # pip-compile requirements/dev.in @@ -41,9 +41,7 @@ certifi==2023.7.22 # -c requirements/test.txt # requests cffi==1.15.1 - # via - # -c requirements/base.txt - # argon2-cffi-bindings + # via argon2-cffi-bindings cfgv==3.3.1 # via pre-commit charset-normalizer==3.2.0 @@ -57,7 +55,9 @@ click==8.1.6 # -c requirements/test.txt # pip-tools comm==0.1.3 - # via ipykernel + # via + # ipykernel + # ipywidgets debugpy==1.6.7 # via ipykernel decorator==5.1.1 @@ -66,10 +66,6 @@ defusedxml==0.7.1 # via nbconvert distlib==0.3.7 # via virtualenv -exceptiongroup==1.1.2 - # via - # -c requirements/test.txt - # anyio executing==1.2.0 # via stack-data fastjsonschema==2.18.0 @@ -87,40 +83,26 @@ idna==3.4 # anyio # jsonschema # requests -importlib-metadata==6.8.0 - # via - # -c requirements/base.txt - # jupyter-client - # jupyter-lsp - # jupyterlab - # jupyterlab-server - # nbconvert -importlib-resources==6.0.0 - # via - # jsonschema - # jsonschema-specifications - # jupyterlab - # notebook ipykernel==6.25.0 # via - # ipywidgets # jupyter # jupyter-console # jupyterlab # qtconsole ipython==8.12.2 # via + # -c requirements/constraints.in # -r requirements/dev.in # ipykernel # ipywidgets # jupyter-console ipython-genutils==0.2.0 # via qtconsole -ipywidgets==8.0.7 +ipywidgets==8.1.0 # via jupyter isoduration==20.11.0 # via jsonschema -jedi==0.18.2 +jedi==0.19.0 # via ipython jinja2==3.1.2 # via @@ -162,7 +144,7 @@ jupyter-core==5.3.1 # nbconvert # nbformat # qtconsole -jupyter-events==0.6.3 +jupyter-events==0.7.0 # via jupyter-server jupyter-lsp==2.2.0 # via 
jupyterlab @@ -201,16 +183,16 @@ nbconvert==7.7.3 # via # jupyter # jupyter-server -nbformat==5.9.1 +nbformat==5.9.2 # via # jupyter-server # nbclient # nbconvert -nest-asyncio==1.5.6 +nest-asyncio==1.5.7 # via ipykernel nodeenv==1.8.0 # via pre-commit -notebook==7.0.0 +notebook==7.0.1 # via jupyter notebook-shim==0.2.3 # via @@ -239,9 +221,7 @@ pickleshare==0.7.5 # via ipython pip-tools==7.1.0 # via -r requirements/dev.in -pkgutil-resolve-name==1.3.10 - # via jsonschema -platformdirs==3.9.1 +platformdirs==3.10.0 # via # -c requirements/test.txt # jupyter-core @@ -263,9 +243,7 @@ ptyprocess==0.7.0 pure-eval==0.2.2 # via stack-data pycparser==2.21 - # via - # -c requirements/base.txt - # cffi + # via cffi pygments==2.15.1 # via # ipython @@ -276,16 +254,11 @@ pyproject-hooks==1.0.0 # via build python-dateutil==2.8.2 # via - # -c requirements/base.txt # -c requirements/test.txt # arrow # jupyter-client python-json-logger==2.0.7 # via jupyter-events -pytz==2023.3 - # via - # -c requirements/base.txt - # babel pyyaml==6.0.1 # via # -c requirements/test.txt @@ -306,6 +279,7 @@ referencing==0.30.0 # via # jsonschema # jsonschema-specifications + # jupyter-events requests==2.31.0 # via # -c requirements/base.txt @@ -327,7 +301,6 @@ send2trash==1.8.2 # via jupyter-server six==1.16.0 # via - # -c requirements/base.txt # -c requirements/test.txt # asttokens # bleach @@ -345,13 +318,6 @@ terminado==0.17.1 # jupyter-server-terminals tinycss2==1.2.1 # via nbconvert -tomli==2.0.1 - # via - # -c requirements/test.txt - # build - # jupyterlab - # pip-tools - # pyproject-hooks tornado==6.3.2 # via # ipykernel @@ -377,11 +343,6 @@ traitlets==5.9.0 # nbconvert # nbformat # qtconsole -typing-extensions==4.7.1 - # via - # -c requirements/test.txt - # async-lru - # ipython uri-template==1.3.0 # via jsonschema urllib3==1.26.16 @@ -408,11 +369,6 @@ wheel==0.41.0 # pip-tools widgetsnbextension==4.0.8 # via ipywidgets -zipp==3.16.2 - # via - # -c requirements/base.txt - # importlib-metadata 
- # importlib-resources # The following packages are considered to be unsafe in a requirements file: # pip diff --git a/requirements/local-inference.in b/requirements/extra-csv.in similarity index 50% rename from requirements/local-inference.in rename to requirements/extra-csv.in index 9ac43c9a7..fed74c14e 100644 --- a/requirements/local-inference.in +++ b/requirements/extra-csv.in @@ -1,3 +1,4 @@ -c constraints.in -c base.txt -unstructured-inference==0.5.7 + +pandas diff --git a/requirements/extra-csv.txt b/requirements/extra-csv.txt new file mode 100644 index 000000000..74886420a --- /dev/null +++ b/requirements/extra-csv.txt @@ -0,0 +1,20 @@ +# +# This file is autogenerated by pip-compile with Python 3.11 +# by the following command: +# +# pip-compile requirements/extra-csv.in +# +numpy==1.24.4 + # via + # -c requirements/constraints.in + # pandas +pandas==2.0.3 + # via -r requirements/extra-csv.in +python-dateutil==2.8.2 + # via pandas +pytz==2023.3 + # via pandas +six==1.16.0 + # via python-dateutil +tzdata==2023.3 + # via pandas diff --git a/requirements/extra-docx.in b/requirements/extra-docx.in new file mode 100644 index 000000000..46569e09c --- /dev/null +++ b/requirements/extra-docx.in @@ -0,0 +1,4 @@ +-c constraints.in +-c base.txt + +python-docx diff --git a/requirements/extra-docx.txt b/requirements/extra-docx.txt new file mode 100644 index 000000000..c5ddbc39d --- /dev/null +++ b/requirements/extra-docx.txt @@ -0,0 +1,12 @@ +# +# This file is autogenerated by pip-compile with Python 3.11 +# by the following command: +# +# pip-compile requirements/extra-docx.in +# +lxml==4.9.3 + # via + # -c requirements/base.txt + # python-docx +python-docx==0.8.11 + # via -r requirements/extra-docx.in diff --git a/requirements/extra-markdown.in b/requirements/extra-markdown.in new file mode 100644 index 000000000..44e817488 --- /dev/null +++ b/requirements/extra-markdown.in @@ -0,0 +1,4 @@ +-c "constraints.in" +-c "base.txt" + +markdown diff --git 
a/requirements/extra-markdown.txt b/requirements/extra-markdown.txt new file mode 100644 index 000000000..e09266a16 --- /dev/null +++ b/requirements/extra-markdown.txt @@ -0,0 +1,8 @@ +# +# This file is autogenerated by pip-compile with Python 3.11 +# by the following command: +# +# pip-compile requirements/extra-markdown.in +# +markdown==3.4.4 + # via -r requirements/extra-markdown.in diff --git a/requirements/extra-msg.in b/requirements/extra-msg.in new file mode 100644 index 000000000..01021471b --- /dev/null +++ b/requirements/extra-msg.in @@ -0,0 +1,4 @@ +-c constraints.in +-c base.txt + +msg_parser diff --git a/requirements/extra-msg.txt b/requirements/extra-msg.txt new file mode 100644 index 000000000..722b0980e --- /dev/null +++ b/requirements/extra-msg.txt @@ -0,0 +1,10 @@ +# +# This file is autogenerated by pip-compile with Python 3.11 +# by the following command: +# +# pip-compile requirements/extra-msg.in +# +msg-parser==1.2.0 + # via -r requirements/extra-msg.in +olefile==0.46 + # via msg-parser diff --git a/requirements/extra-odt.in b/requirements/extra-odt.in new file mode 100644 index 000000000..6076a76a1 --- /dev/null +++ b/requirements/extra-odt.in @@ -0,0 +1,5 @@ +-c constraints.in +-c base.txt + +python-docx +pypandoc diff --git a/requirements/extra-odt.txt b/requirements/extra-odt.txt new file mode 100644 index 000000000..dea5ebb68 --- /dev/null +++ b/requirements/extra-odt.txt @@ -0,0 +1,14 @@ +# +# This file is autogenerated by pip-compile with Python 3.11 +# by the following command: +# +# pip-compile requirements/extra-odt.in +# +lxml==4.9.3 + # via + # -c requirements/base.txt + # python-docx +pypandoc==1.11 + # via -r requirements/extra-odt.in +python-docx==0.8.11 + # via -r requirements/extra-odt.in diff --git a/requirements/extra-pandoc.in b/requirements/extra-pandoc.in new file mode 100644 index 000000000..21720efda --- /dev/null +++ b/requirements/extra-pandoc.in @@ -0,0 +1,4 @@ +-c constraints.in +-c base.txt + +pypandoc diff --git 
a/requirements/extra-pandoc.txt b/requirements/extra-pandoc.txt new file mode 100644 index 000000000..b0804f16d --- /dev/null +++ b/requirements/extra-pandoc.txt @@ -0,0 +1,8 @@ +# +# This file is autogenerated by pip-compile with Python 3.11 +# by the following command: +# +# pip-compile requirements/extra-pandoc.in +# +pypandoc==1.11 + # via -r requirements/extra-pandoc.in diff --git a/requirements/extra-pdf-image.in b/requirements/extra-pdf-image.in new file mode 100644 index 000000000..812d902ce --- /dev/null +++ b/requirements/extra-pdf-image.in @@ -0,0 +1,9 @@ +-c constraints.in +-c base.txt + +pdf2image +pdfminer.six +# NOTE(robinson) - See this issue here +# https://github.com/facebookresearch/detectron2/issues/5010 +Pillow<10 +unstructured-inference==0.5.7 diff --git a/requirements/local-inference.txt b/requirements/extra-pdf-image.txt similarity index 80% rename from requirements/local-inference.txt rename to requirements/extra-pdf-image.txt index 110e2d0a3..130b774c9 100644 --- a/requirements/local-inference.txt +++ b/requirements/extra-pdf-image.txt @@ -1,8 +1,8 @@ # -# This file is autogenerated by pip-compile with Python 3.8 +# This file is autogenerated by pip-compile with Python 3.11 # by the following command: # -# pip-compile requirements/local-inference.in +# pip-compile requirements/extra-pdf-image.in # antlr4-python3-runtime==4.9.3 # via omegaconf @@ -12,9 +12,7 @@ certifi==2023.7.22 # -c requirements/constraints.in # requests cffi==1.15.1 - # via - # -c requirements/base.txt - # cryptography + # via cryptography charset-normalizer==3.2.0 # via # -c requirements/base.txt @@ -25,9 +23,7 @@ coloredlogs==15.0.1 contourpy==1.1.0 # via matplotlib cryptography==41.0.2 - # via - # -c requirements/base.txt - # pdfminer-six + # via pdfminer-six cycler==0.11.0 # via matplotlib effdet==0.4.1 @@ -54,8 +50,6 @@ idna==3.4 # via # -c requirements/base.txt # requests -importlib-resources==6.0.0 - # via matplotlib iopath==0.1.10 # via layoutparser jinja2==3.1.2 
@@ -74,7 +68,7 @@ networkx==3.1 # via torch numpy==1.24.4 # via - # -c requirements/base.txt + # -c requirements/constraints.in # contourpy # layoutparser # matplotlib @@ -101,22 +95,21 @@ packaging==23.1 # pytesseract # transformers pandas==2.0.3 - # via - # -c requirements/base.txt - # layoutparser + # via layoutparser pdf2image==1.16.3 # via - # -c requirements/base.txt + # -r requirements/extra-pdf-image.in # layoutparser pdfminer-six==20221105 # via - # -c requirements/base.txt + # -r requirements/extra-pdf-image.in # pdfplumber -pdfplumber==0.10.1 +pdfplumber==0.10.2 # via layoutparser -pillow==10.0.0 +pillow==9.5.0 # via - # -c requirements/base.txt + # -c requirements/constraints.in + # -r requirements/extra-pdf-image.in # layoutparser # matplotlib # pdf2image @@ -132,9 +125,7 @@ protobuf==4.23.4 pycocotools==2.0.6 # via effdet pycparser==2.21 - # via - # -c requirements/base.txt - # cffi + # via cffi pyparsing==3.0.9 # via # -c requirements/constraints.in @@ -145,15 +136,12 @@ pytesseract==0.3.10 # via layoutparser python-dateutil==2.8.2 # via - # -c requirements/base.txt # matplotlib # pandas python-multipart==0.0.6 # via unstructured-inference pytz==2023.3 - # via - # -c requirements/base.txt - # pandas + # via pandas pyyaml==6.0.1 # via # huggingface-hub @@ -176,11 +164,11 @@ safetensors==0.3.1 # timm # transformers scipy==1.10.1 - # via layoutparser -six==1.16.0 # via - # -c requirements/base.txt - # python-dateutil + # -c requirements/constraints.in + # layoutparser +six==1.16.0 + # via python-dateutil sympy==1.12 # via # onnxruntime @@ -214,17 +202,11 @@ typing-extensions==4.7.1 # iopath # torch tzdata==2023.3 - # via - # -c requirements/base.txt - # pandas + # via pandas unstructured-inference==0.5.7 - # via -r requirements/local-inference.in + # via -r requirements/extra-pdf-image.in urllib3==1.26.16 # via # -c requirements/base.txt # -c requirements/constraints.in # requests -zipp==3.16.2 - # via - # -c requirements/base.txt - # 
importlib-resources diff --git a/requirements/extra-pptx.in b/requirements/extra-pptx.in new file mode 100644 index 000000000..9f5499c68 --- /dev/null +++ b/requirements/extra-pptx.in @@ -0,0 +1,3 @@ +-c "constraints.in" + +python-pptx diff --git a/requirements/extra-pptx.txt b/requirements/extra-pptx.txt new file mode 100644 index 000000000..550afe1a4 --- /dev/null +++ b/requirements/extra-pptx.txt @@ -0,0 +1,16 @@ +# +# This file is autogenerated by pip-compile with Python 3.11 +# by the following command: +# +# pip-compile requirements/extra-pptx.in +# +lxml==4.9.3 + # via python-pptx +pillow==9.5.0 + # via + # -c requirements/constraints.in + # python-pptx +python-pptx==0.6.21 + # via -r requirements/extra-pptx.in +xlsxwriter==3.1.2 + # via python-pptx diff --git a/requirements/extra-xlsx.in b/requirements/extra-xlsx.in new file mode 100644 index 000000000..5e296abf8 --- /dev/null +++ b/requirements/extra-xlsx.in @@ -0,0 +1,6 @@ +-c constraints.in +-c base.txt + +openpyxl +pandas +xlrd diff --git a/requirements/extra-xlsx.txt b/requirements/extra-xlsx.txt new file mode 100644 index 000000000..b9be1f037 --- /dev/null +++ b/requirements/extra-xlsx.txt @@ -0,0 +1,26 @@ +# +# This file is autogenerated by pip-compile with Python 3.11 +# by the following command: +# +# pip-compile requirements/extra-xlsx.in +# +et-xmlfile==1.1.0 + # via openpyxl +numpy==1.24.4 + # via + # -c requirements/constraints.in + # pandas +openpyxl==3.1.2 + # via -r requirements/extra-xlsx.in +pandas==2.0.3 + # via -r requirements/extra-xlsx.in +python-dateutil==2.8.2 + # via pandas +pytz==2023.3 + # via pandas +six==1.16.0 + # via python-dateutil +tzdata==2023.3 + # via pandas +xlrd==2.0.1 + # via -r requirements/extra-xlsx.in diff --git a/requirements/huggingface.txt b/requirements/huggingface.txt index 21686e2a3..b5ffcacfd 100644 --- a/requirements/huggingface.txt +++ b/requirements/huggingface.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.8 +# This file 
is autogenerated by pip-compile with Python 3.11 # by the following command: # # pip-compile requirements/huggingface.in @@ -46,7 +46,7 @@ networkx==3.1 # via torch numpy==1.24.4 # via - # -c requirements/base.txt + # -c requirements/constraints.in # transformers packaging==23.1 # via @@ -74,7 +74,6 @@ sentencepiece==0.1.99 # via -r requirements/huggingface.in six==1.16.0 # via - # -c requirements/base.txt # langdetect # sacremoses sympy==1.12 diff --git a/requirements/ingest-azure.txt b/requirements/ingest-azure.txt index feba108e9..1d3d50436 100644 --- a/requirements/ingest-azure.txt +++ b/requirements/ingest-azure.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.8 +# This file is autogenerated by pip-compile with Python 3.11 # by the following command: # # pip-compile requirements/ingest-azure.in @@ -32,7 +32,6 @@ certifi==2023.7.22 # requests cffi==1.15.1 # via - # -c requirements/base.txt # azure-datalake-store # cryptography charset-normalizer==3.2.0 @@ -42,7 +41,6 @@ charset-normalizer==3.2.0 # requests cryptography==41.0.2 # via - # -c requirements/base.txt # azure-identity # azure-storage-blob # msal @@ -76,9 +74,7 @@ multidict==6.0.4 portalocker==2.7.0 # via msal-extensions pycparser==2.21 - # via - # -c requirements/base.txt - # cffi + # via cffi pyjwt[crypto]==2.8.0 # via msal requests==2.31.0 @@ -89,7 +85,6 @@ requests==2.31.0 # msal six==1.16.0 # via - # -c requirements/base.txt # azure-core # azure-identity # isodate diff --git a/requirements/ingest-box.txt b/requirements/ingest-box.txt index 81f70b2fa..a00a671c2 100644 --- a/requirements/ingest-box.txt +++ b/requirements/ingest-box.txt @@ -8,7 +8,7 @@ attrs==23.1.0 # via boxsdk boxfs==0.2.0 # via -r requirements/ingest-box.in -boxsdk[jwt]==3.8.0 +boxsdk[jwt]==3.8.1 # via boxfs certifi==2023.7.22 # via @@ -16,17 +16,13 @@ certifi==2023.7.22 # -c requirements/constraints.in # requests cffi==1.15.1 - # via - # -c requirements/base.txt - # cryptography + # via 
cryptography charset-normalizer==3.2.0 # via # -c requirements/base.txt # requests cryptography==41.0.2 - # via - # -c requirements/base.txt - # boxsdk + # via boxsdk fsspec==2023.6.0 # via # -r requirements/ingest-box.in @@ -36,15 +32,11 @@ idna==3.4 # -c requirements/base.txt # requests pycparser==2.21 - # via - # -c requirements/base.txt - # cffi + # via cffi pyjwt==2.8.0 # via boxsdk python-dateutil==2.8.2 - # via - # -c requirements/base.txt - # boxsdk + # via boxsdk requests==2.31.0 # via # -c requirements/base.txt @@ -53,9 +45,7 @@ requests==2.31.0 requests-toolbelt==1.0.0 # via boxsdk six==1.16.0 - # via - # -c requirements/base.txt - # python-dateutil + # via python-dateutil urllib3==1.26.16 # via # -c requirements/base.txt diff --git a/requirements/ingest-confluence.txt b/requirements/ingest-confluence.txt index 31a80b3f8..6239278e5 100644 --- a/requirements/ingest-confluence.txt +++ b/requirements/ingest-confluence.txt @@ -1,10 +1,10 @@ # -# This file is autogenerated by pip-compile with Python 3.8 +# This file is autogenerated by pip-compile with Python 3.11 # by the following command: # # pip-compile requirements/ingest-confluence.in # -atlassian-python-api==3.39.0 +atlassian-python-api==3.40.0 # via -r requirements/ingest-confluence.in certifi==2023.7.22 # via @@ -33,9 +33,7 @@ requests==2.31.0 requests-oauthlib==1.3.1 # via atlassian-python-api six==1.16.0 - # via - # -c requirements/base.txt - # atlassian-python-api + # via atlassian-python-api urllib3==1.26.16 # via # -c requirements/base.txt diff --git a/requirements/ingest-discord.txt b/requirements/ingest-discord.txt index 544398b4c..3f68d17af 100644 --- a/requirements/ingest-discord.txt +++ b/requirements/ingest-discord.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.8 +# This file is autogenerated by pip-compile with Python 3.11 # by the following command: # # pip-compile requirements/ingest-discord.in diff --git a/requirements/ingest-dropbox.txt 
b/requirements/ingest-dropbox.txt index 1bd06426e..42afe8a64 100644 --- a/requirements/ingest-dropbox.txt +++ b/requirements/ingest-dropbox.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.8 +# This file is autogenerated by pip-compile with Python 3.11 # by the following command: # # pip-compile requirements/ingest-dropbox.in @@ -34,7 +34,6 @@ requests==2.31.0 # dropboxdrivefs six==1.16.0 # via - # -c requirements/base.txt # dropbox # stone stone==3.3.1 diff --git a/requirements/ingest-elasticsearch.txt b/requirements/ingest-elasticsearch.txt index 78b7d9623..ccd9575c5 100644 --- a/requirements/ingest-elasticsearch.txt +++ b/requirements/ingest-elasticsearch.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.8 +# This file is autogenerated by pip-compile with Python 3.11 # by the following command: # # pip-compile requirements/ingest-elasticsearch.in @@ -11,7 +11,7 @@ certifi==2023.7.22 # elastic-transport elastic-transport==8.4.0 # via elasticsearch -elasticsearch==8.8.2 +elasticsearch==8.9.0 # via -r requirements/ingest-elasticsearch.in jq==1.4.1 # via -r requirements/ingest-elasticsearch.in diff --git a/requirements/ingest-gcs.txt b/requirements/ingest-gcs.txt index 129eae20a..eba10abe5 100644 --- a/requirements/ingest-gcs.txt +++ b/requirements/ingest-gcs.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.8 +# This file is autogenerated by pip-compile with Python 3.11 # by the following command: # # pip-compile requirements/ingest-gcs.in @@ -57,7 +57,7 @@ google-crc32c==1.5.0 # via google-resumable-media google-resumable-media==2.5.0 # via google-cloud-storage -googleapis-common-protos==1.59.1 +googleapis-common-protos==1.60.0 # via google-api-core idna==3.4 # via @@ -74,7 +74,6 @@ protobuf==4.23.4 # via # -c requirements/constraints.in # google-api-core - # googleapis-common-protos pyasn1==0.5.0 # via # pyasn1-modules @@ -93,9 +92,7 @@ requests-oauthlib==1.3.1 rsa==4.9 
# via google-auth six==1.16.0 - # via - # -c requirements/base.txt - # google-auth + # via google-auth urllib3==1.26.16 # via # -c requirements/base.txt diff --git a/requirements/ingest-github.txt b/requirements/ingest-github.txt index 3c1504b9c..6f580e3a2 100644 --- a/requirements/ingest-github.txt +++ b/requirements/ingest-github.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.8 +# This file is autogenerated by pip-compile with Python 3.11 # by the following command: # # pip-compile requirements/ingest-github.in @@ -11,7 +11,6 @@ certifi==2023.7.22 # requests cffi==1.15.1 # via - # -c requirements/base.txt # cryptography # pynacl charset-normalizer==3.2.0 @@ -19,9 +18,7 @@ charset-normalizer==3.2.0 # -c requirements/base.txt # requests cryptography==41.0.2 - # via - # -c requirements/base.txt - # pyjwt + # via pyjwt deprecated==1.2.14 # via pygithub idna==3.4 @@ -29,9 +26,7 @@ idna==3.4 # -c requirements/base.txt # requests pycparser==2.21 - # via - # -c requirements/base.txt - # cffi + # via cffi pygithub==1.58.2 # via -r requirements/ingest-github.in pyjwt[crypto]==2.8.0 diff --git a/requirements/ingest-gitlab.txt b/requirements/ingest-gitlab.txt index dbff64042..b0f34d769 100644 --- a/requirements/ingest-gitlab.txt +++ b/requirements/ingest-gitlab.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.8 +# This file is autogenerated by pip-compile with Python 3.11 # by the following command: # # pip-compile requirements/ingest-gitlab.in diff --git a/requirements/ingest-google-drive.txt b/requirements/ingest-google-drive.txt index fc48a89ca..8400e3597 100644 --- a/requirements/ingest-google-drive.txt +++ b/requirements/ingest-google-drive.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.8 +# This file is autogenerated by pip-compile with Python 3.11 # by the following command: # # pip-compile requirements/ingest-google-drive.in @@ -26,7 +26,7 @@ google-auth==2.22.0 # 
google-auth-httplib2 google-auth-httplib2==0.1.0 # via google-api-python-client -googleapis-common-protos==1.59.1 +googleapis-common-protos==1.60.0 # via google-api-core httplib2==0.22.0 # via @@ -59,7 +59,6 @@ rsa==4.9 # via google-auth six==1.16.0 # via - # -c requirements/base.txt # google-auth # google-auth-httplib2 uritemplate==4.1.1 diff --git a/requirements/ingest-onedrive.txt b/requirements/ingest-onedrive.txt index 7502a3bdc..c87c1494e 100644 --- a/requirements/ingest-onedrive.txt +++ b/requirements/ingest-onedrive.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.8 +# This file is autogenerated by pip-compile with Python 3.11 # by the following command: # # pip-compile requirements/ingest-onedrive.in @@ -10,16 +10,13 @@ certifi==2023.7.22 # -c requirements/constraints.in # requests cffi==1.15.1 - # via - # -c requirements/base.txt - # cryptography + # via cryptography charset-normalizer==3.2.0 # via # -c requirements/base.txt # requests cryptography==41.0.2 # via - # -c requirements/base.txt # -r requirements/ingest-onedrive.in # msal # pyjwt @@ -34,15 +31,11 @@ msal==1.23.0 office365-rest-python-client==2.4.2 # via -r requirements/ingest-onedrive.in pycparser==2.21 - # via - # -c requirements/base.txt - # cffi + # via cffi pyjwt[crypto]==2.8.0 # via msal pytz==2023.3 - # via - # -c requirements/base.txt - # office365-rest-python-client + # via office365-rest-python-client requests==2.31.0 # via # -c requirements/base.txt diff --git a/requirements/ingest-outlook.txt b/requirements/ingest-outlook.txt index f8984bf4a..fd89eab61 100644 --- a/requirements/ingest-outlook.txt +++ b/requirements/ingest-outlook.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.8 +# This file is autogenerated by pip-compile with Python 3.11 # by the following command: # # pip-compile requirements/ingest-outlook.in @@ -10,16 +10,13 @@ certifi==2023.7.22 # -c requirements/constraints.in # requests cffi==1.15.1 - # via - # 
-c requirements/base.txt - # cryptography + # via cryptography charset-normalizer==3.2.0 # via # -c requirements/base.txt # requests cryptography==41.0.2 # via - # -c requirements/base.txt # -r requirements/ingest-outlook.in # msal # pyjwt @@ -34,15 +31,11 @@ msal==1.23.0 office365-rest-python-client==2.4.2 # via -r requirements/ingest-outlook.in pycparser==2.21 - # via - # -c requirements/base.txt - # cffi + # via cffi pyjwt[crypto]==2.8.0 # via msal pytz==2023.3 - # via - # -c requirements/base.txt - # office365-rest-python-client + # via office365-rest-python-client requests==2.31.0 # via # -c requirements/base.txt diff --git a/requirements/ingest-reddit.txt b/requirements/ingest-reddit.txt index 42bffeadd..c7f364fc2 100644 --- a/requirements/ingest-reddit.txt +++ b/requirements/ingest-reddit.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.8 +# This file is autogenerated by pip-compile with Python 3.11 # by the following command: # # pip-compile requirements/ingest-reddit.in diff --git a/requirements/ingest-s3.txt b/requirements/ingest-s3.txt index e9801a500..26c6ee4fd 100644 --- a/requirements/ingest-s3.txt +++ b/requirements/ingest-s3.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.8 +# This file is autogenerated by pip-compile with Python 3.11 # by the following command: # # pip-compile requirements/ingest-s3.in @@ -43,17 +43,11 @@ multidict==6.0.4 # aiohttp # yarl python-dateutil==2.8.2 - # via - # -c requirements/base.txt - # botocore + # via botocore s3fs==2023.6.0 # via -r requirements/ingest-s3.in six==1.16.0 - # via - # -c requirements/base.txt - # python-dateutil -typing-extensions==4.7.1 - # via aioitertools + # via python-dateutil urllib3==1.26.16 # via # -c requirements/base.txt diff --git a/requirements/ingest-slack.txt b/requirements/ingest-slack.txt index b8c94147b..f88d94e18 100644 --- a/requirements/ingest-slack.txt +++ b/requirements/ingest-slack.txt @@ -1,5 +1,5 @@ # -# This 
file is autogenerated by pip-compile with Python 3.8 +# This file is autogenerated by pip-compile with Python 3.11 # by the following command: # # pip-compile requirements/ingest-slack.in diff --git a/requirements/ingest-wikipedia.txt b/requirements/ingest-wikipedia.txt index 7455fb82c..e25272b66 100644 --- a/requirements/ingest-wikipedia.txt +++ b/requirements/ingest-wikipedia.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.8 +# This file is autogenerated by pip-compile with Python 3.11 # by the following command: # # pip-compile requirements/ingest-wikipedia.in diff --git a/requirements/test.txt b/requirements/test.txt index 1e35908d7..11d41d6da 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.8 +# This file is autogenerated by pip-compile with Python 3.11 # by the following command: # # pip-compile requirements/test.in @@ -26,9 +26,7 @@ coverage[toml]==7.2.7 # via # -r requirements/test.in # pytest-cov -exceptiongroup==1.1.2 - # via pytest -flake8==6.0.0 +flake8==6.1.0 # via -r requirements/test.in freezegun==1.2.2 # via -r requirements/test.in @@ -64,17 +62,17 @@ packaging==23.1 # via # black # pytest -pathspec==0.11.1 +pathspec==0.11.2 # via black -platformdirs==3.9.1 +platformdirs==3.10.0 # via black pluggy==1.2.0 # via pytest -pycodestyle==2.10.0 +pycodestyle==2.11.0 # via flake8 pydantic==1.10.12 # via label-studio-sdk -pyflakes==3.0.1 +pyflakes==3.1.0 # via flake8 pytest==7.4.0 # via @@ -85,28 +83,17 @@ pytest-cov==4.1.0 pytest-mock==3.11.1 # via -r requirements/test.in python-dateutil==2.8.2 - # via - # -c requirements/base.txt - # freezegun + # via freezegun pyyaml==6.0.1 # via vcrpy requests==2.31.0 # via # -c requirements/base.txt # label-studio-sdk -ruff==0.0.280 +ruff==0.0.281 # via -r requirements/test.in six==1.16.0 - # via - # -c requirements/base.txt - # python-dateutil - # vcrpy -tomli==2.0.1 - # via - # black - # coverage 
- # mypy - # pytest + # via python-dateutil types-click==7.1.8 # via -r requirements/test.in types-markdown==3.4.2.10 @@ -119,7 +106,6 @@ types-urllib3==1.26.25.14 # via types-requests typing-extensions==4.7.1 # via - # black # mypy # pydantic urllib3==1.26.16 @@ -127,8 +113,7 @@ urllib3==1.26.16 # -c requirements/base.txt # -c requirements/constraints.in # requests - # vcrpy -vcrpy==5.0.0 +vcrpy==5.1.0 # via -r requirements/test.in wrapt==1.15.0 # via vcrpy diff --git a/setup.py b/setup.py index c05f592d4..eaaec0ce1 100644 --- a/setup.py +++ b/setup.py @@ -34,11 +34,48 @@ def load_requirements(file_list: Optional[Union[str, List[str]]] = None) -> List with open(file, encoding="utf-8") as f: requirements.extend(f.readlines()) requirements = [ - req for req in requirements if not req.startswith("#") and not req.startswith("-") + req + for req in requirements + if not req.startswith("#") and not req.startswith("-") ] return requirements +csv_reqs = load_requirements("requirements/extra-csv.in") +docx_reqs = load_requirements("requirements/extra-docx.in") +epub_reqs = load_requirements("requirements/extra-pandoc.in") +image_reqs = load_requirements("requirements/extra-pdf-image.in") +markdown_reqs = load_requirements("requirements/extra-markdown.in") +msg_reqs = load_requirements("requirements/extra-msg.in") +odt_reqs = load_requirements("requirements/extra-odt.in") +org_reqs = load_requirements("requirements/extra-pandoc.in") +pdf_reqs = load_requirements("requirements/extra-pdf-image.in") +pptx_reqs = load_requirements("requirements/extra-pptx.in") +rtf_reqs = load_requirements("requirements/extra-pandoc.in") +rst_reqs = load_requirements("requirements/extra-pandoc.in") +tsv_reqs = load_requirements("requirements/extra-csv.in") +xlsx_reqs = load_requirements("requirements/extra-xlsx.in") + +all_doc_reqs = list( + set( + csv_reqs + + docx_reqs + + epub_reqs + + image_reqs + + markdown_reqs + + msg_reqs + + odt_reqs + + org_reqs + + pdf_reqs + + pptx_reqs + + rtf_reqs 
+ + rst_reqs + + tsv_reqs + + xlsx_reqs, + ), +) + + setup( name="unstructured", description="A library that prepares raw documents for downstream ML tasks.", @@ -71,8 +108,23 @@ setup( }, install_requires=load_requirements(), extras_require={ - "huggingface": load_requirements("requirements/huggingface.in"), - "local-inference": load_requirements("requirements/local-inference.in"), + # Document specific extra requirements + "all-docs": all_doc_reqs, + "csv": csv_reqs, + "docx": docx_reqs, + "epub": epub_reqs, + "image": image_reqs, + "md": markdown_reqs, + "msg": msg_reqs, + "odt": odt_reqs, + "org": org_reqs, + "pdf": pdf_reqs, + "pptx": pptx_reqs, + "rtf": rtf_reqs, + "rst": rst_reqs, + "tsv": tsv_reqs, + "xlsx": xlsx_reqs, + # Extra requirements for data connectors "s3": load_requirements("requirements/ingest-s3.in"), "azure": load_requirements("requirements/ingest-azure.in"), "discord": load_requirements("requirements/ingest-discord.in"), @@ -89,6 +141,9 @@ setup( "onedrive": load_requirements("requirements/ingest-onedrive.in"), "outlook": load_requirements("requirements/ingest-outlook.in"), "confluence": load_requirements("requirements/ingest-confluence.in"), + # Legacy extra requirements + "huggingface": load_requirements("requirements/huggingface.in"), + "local-inference": all_doc_reqs, }, package_dir={"unstructured": "unstructured"}, package_data={"unstructured": ["nlp/*.txt"]}, diff --git a/test_unstructured/file_utils/test_filetype.py b/test_unstructured/file_utils/test_filetype.py index 6aee31a73..d96401b65 100644 --- a/test_unstructured/file_utils/test_filetype.py +++ b/test_unstructured/file_utils/test_filetype.py @@ -478,4 +478,4 @@ def test_document_to_element_list_omits_coord_system_when_coord_points_absent(): def test_get_page_image_metadata_and_coordinate_system(): doc = MockDocumentLayout() metadata = _get_page_image_metadata(doc.pages[0]) - assert type(metadata) == dict + assert isinstance(metadata, dict) diff --git 
a/test_unstructured/partition/test_doc.py b/test_unstructured/partition/test_doc.py index 271b17ff7..bd47749e0 100644 --- a/test_unstructured/partition/test_doc.py +++ b/test_unstructured/partition/test_doc.py @@ -267,6 +267,6 @@ def test_partition_doc_from_file_without_metadata_date( sf = SpooledTemporaryFile() sf.write(f.read()) sf.seek(0) - elements = partition_doc(file=sf, metadata_date="2020-07-05") + elements = partition_doc(file=sf, metadata_last_modified=None) - assert elements[0].metadata.date == "2020-07-05" + assert elements[0].metadata.last_modified is None diff --git a/test_unstructured_ingest/unit/test_interfaces.py b/test_unstructured_ingest/unit/test_interfaces.py index 1fd948f33..2dacd4161 100644 --- a/test_unstructured_ingest/unit/test_interfaces.py +++ b/test_unstructured_ingest/unit/test_interfaces.py @@ -216,8 +216,8 @@ def test_process_file_metadata_exclude_filename_pagenum(mocker, partition_test_r isd_elems = test_ingest_doc.process_file() assert len(isd_elems) for elem in isd_elems: - assert "filename" not in elem["metadata"].keys() - assert "page_number" not in elem["metadata"].keys() + assert "filename" not in elem["metadata"] + assert "page_number" not in elem["metadata"] def test_process_file_flatten_metadata(mocker, partition_test_results): diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 844274171..e0b78aab5 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.8.8" # pragma: no cover +__version__ = "0.9.0" # pragma: no cover diff --git a/unstructured/file_utils/file_conversion.py b/unstructured/file_utils/file_conversion.py index e92b47043..23b803ecf 100644 --- a/unstructured/file_utils/file_conversion.py +++ b/unstructured/file_utils/file_conversion.py @@ -1,11 +1,14 @@ import tempfile from typing import IO, Optional -import pypandoc - from unstructured.partition.common import exactly_one +from unstructured.utils import dependency_exists,
requires_dependencies + +if dependency_exists("pypandoc"): + import pypandoc +@requires_dependencies(["pypandoc"]) def convert_file_to_text(filename: str, source_format: str, target_format: str) -> str: """Uses pandoc to convert the source document to a raw text string.""" try: diff --git a/unstructured/ingest/connector/google_drive.py b/unstructured/ingest/connector/google_drive.py index 67e905e60..02b5a07be 100644 --- a/unstructured/ingest/connector/google_drive.py +++ b/unstructured/ingest/connector/google_drive.py @@ -76,7 +76,7 @@ class SimpleGoogleDriveConfig(BaseConnectorConfig): recursive: bool = False def __post_init__(self): - if self.extension and self.extension not in EXT_TO_FILETYPE.keys(): + if self.extension and self.extension not in EXT_TO_FILETYPE: raise ValueError( f"Extension not supported. " f"Value MUST be one of {', '.join([k for k in EXT_TO_FILETYPE if k is not None])}.", diff --git a/unstructured/ingest/connector/onedrive.py b/unstructured/ingest/connector/onedrive.py index 313ebeafd..24d0e5edc 100644 --- a/unstructured/ingest/connector/onedrive.py +++ b/unstructured/ingest/connector/onedrive.py @@ -65,7 +65,7 @@ class OneDriveIngestDoc(IngestDocCleanupMixin, BaseIngestDoc): if not self.ext: raise ValueError("Unsupported file without extension.") - if self.ext not in EXT_TO_FILETYPE.keys(): + if self.ext not in EXT_TO_FILETYPE: raise ValueError( f"Extension not supported. 
" f"Value MUST be one of {', '.join([k for k in EXT_TO_FILETYPE if k is not None])}.", diff --git a/unstructured/partition/auto.py b/unstructured/partition/auto.py index ea56b0ee0..9383256d1 100644 --- a/unstructured/partition/auto.py +++ b/unstructured/partition/auto.py @@ -13,27 +13,58 @@ from unstructured.file_utils.filetype import ( ) from unstructured.logger import logger from unstructured.partition.common import exactly_one -from unstructured.partition.csv import partition_csv -from unstructured.partition.doc import partition_doc -from unstructured.partition.docx import partition_docx from unstructured.partition.email import partition_email -from unstructured.partition.epub import partition_epub from unstructured.partition.html import partition_html -from unstructured.partition.image import partition_image from unstructured.partition.json import partition_json -from unstructured.partition.md import partition_md -from unstructured.partition.msg import partition_msg -from unstructured.partition.odt import partition_odt -from unstructured.partition.org import partition_org -from unstructured.partition.pdf import partition_pdf -from unstructured.partition.ppt import partition_ppt -from unstructured.partition.pptx import partition_pptx -from unstructured.partition.rst import partition_rst -from unstructured.partition.rtf import partition_rtf from unstructured.partition.text import partition_text -from unstructured.partition.tsv import partition_tsv -from unstructured.partition.xlsx import partition_xlsx from unstructured.partition.xml import partition_xml +from unstructured.utils import dependency_exists + +if dependency_exists("pandas"): + from unstructured.partition.csv import partition_csv + from unstructured.partition.tsv import partition_tsv + + +if dependency_exists("docx"): + from unstructured.partition.doc import partition_doc + from unstructured.partition.docx import partition_docx + + +if dependency_exists("docx") and dependency_exists("pypandoc"): + 
from unstructured.partition.odt import partition_odt + + +if dependency_exists("pypandoc"): + from unstructured.partition.epub import partition_epub + from unstructured.partition.org import partition_org + from unstructured.partition.rst import partition_rst + from unstructured.partition.rtf import partition_rtf + + +if dependency_exists("markdown"): + from unstructured.partition.md import partition_md + + +if dependency_exists("msg_parser"): + from unstructured.partition.msg import partition_msg + + +pdf_imports = ["pdf2image", "pdfminer", "PIL"] +if all(dependency_exists(dep) for dep in pdf_imports): + from unstructured.partition.pdf import partition_pdf + + +if dependency_exists("unstructured_inference"): + from unstructured.partition.image import partition_image + + +if dependency_exists("pptx"): + from unstructured.partition.ppt import partition_ppt + from unstructured.partition.pptx import partition_pptx + + +if dependency_exists("pandas") and dependency_exists("openpyxl"): + from unstructured.partition.xlsx import partition_xlsx def partition( diff --git a/unstructured/partition/common.py b/unstructured/partition/common.py index 2b38a5d67..5e75387e1 100644 --- a/unstructured/partition/common.py +++ b/unstructured/partition/common.py @@ -7,7 +7,6 @@ from io import BufferedReader, BytesIO, TextIOWrapper from tempfile import SpooledTemporaryFile from typing import IO, TYPE_CHECKING, Any, BinaryIO, Dict, List, Optional, Tuple, Union -from docx import table as docxtable from tabulate import tabulate from unstructured.documents.coordinates import CoordinateSystem @@ -23,6 +22,10 @@ from unstructured.documents.elements import ( ) from unstructured.logger import logger from unstructured.nlp.patterns import ENUMERATED_BULLETS_RE, UNICODE_BULLETS_RE +from unstructured.utils import dependency_exists + +if dependency_exists("docx"): + import docx.table as docxtable if TYPE_CHECKING: from unstructured_inference.inference.layoutelement import ( @@ -303,12 +306,12 @@ def 
convert_to_bytes( return f_bytes -def convert_ms_office_table_to_text(table: docxtable.Table, as_html: bool = True): +def convert_ms_office_table_to_text(table: "docxtable.Table", as_html: bool = True) -> str: """ Convert a table object from a Word document to an HTML table string using the tabulate library. Args: - table (Table): A Table object. + table (Table): A docx.table.Table object. as_html (bool): Whether to return the table as an HTML string (True) or a plain text string (False) diff --git a/unstructured/partition/docx.py b/unstructured/partition/docx.py index fadee72d7..26ae804d7 100644 --- a/unstructured/partition/docx.py +++ b/unstructured/partition/docx.py @@ -4,7 +4,6 @@ from tempfile import SpooledTemporaryFile from typing import IO, BinaryIO, List, Optional, Tuple, Union, cast import docx -import pypandoc from docx.oxml.shared import qn from docx.text.paragraph import Paragraph from docx.text.run import Run @@ -38,6 +37,10 @@ from unstructured.partition.text_type import ( is_possible_title, is_us_city_state_zip, ) +from unstructured.utils import dependency_exists + +if dependency_exists("pypandoc"): + import pypandoc # NOTE(robinson) - documentation on built in styles can be found at the link below # ref: https://python-docx.readthedocs.io/en/latest/user/ @@ -314,7 +317,7 @@ def convert_and_partition_docx( metadata_filename: Optional[str] = None, metadata_last_modified: Optional[str] = None, ) -> List[Element]: - """Converts a document to DOCX and then partitions it using partition_html. Works with + """Converts a document to DOCX and then partitions it using partition_docx. Works with any file format support by pandoc. 
Parameters diff --git a/unstructured/staging/base.py b/unstructured/staging/base.py index 5c1a691bc..6dd5667f9 100644 --- a/unstructured/staging/base.py +++ b/unstructured/staging/base.py @@ -3,8 +3,6 @@ import io import json from typing import Any, Dict, List, Optional -import pandas as pd - from unstructured.documents.elements import ( TYPE_TO_TEXT_ELEMENT_MAP, CheckBox, @@ -13,6 +11,10 @@ from unstructured.documents.elements import ( NoID, ) from unstructured.partition.common import exactly_one +from unstructured.utils import dependency_exists, requires_dependencies + +if dependency_exists("pandas"): + import pandas as pd def _get_metadata_table_fieldnames(): @@ -161,7 +163,7 @@ def convert_to_isd_csv(elements: List[Element]) -> str: if row.get("sent_from"): row["sender"] = row.get("sent_from") - if type(row["sender"]) == list: + if isinstance(row["sender"], list): row["sender"] = row["sender"][0] with io.StringIO() as buffer: @@ -176,11 +178,14 @@ def convert_to_csv(elements: List[Element]) -> str: return convert_to_isd_csv(elements) -def convert_to_dataframe(elements: List[Element], drop_empty_cols: bool = True) -> pd.DataFrame: +@requires_dependencies(["pandas"]) +def convert_to_dataframe(elements: List[Element], drop_empty_cols: bool = True) -> "pd.DataFrame": """Converts document elements to a pandas DataFrame. The dataframe contains the following columns: text: the element text type: the text type (NarrativeText, Title, etc) + + Output is pd.DataFrame """ csv_string = convert_to_isd_csv(elements) csv_string_io = io.StringIO(csv_string)