build(deps): split up dependencies by document type (#986)

* split dependencies by document type

* make pip-compile with new requirements

* add extra requirements to setup.py

* add in all docs; re pip-compile

* extra for all docs

* add pandas to xlsx

* dependency requirements for tsv and csv

* handling for doc, docx and odt

* dependency check for pypandoc

* required dependencies for pandoc files

* xml and html

* markdown

* msg

* add in pdf

* add in pptx

* add in excel

* add lxml as base req

* extra all docs for local inference

* local inference installs all

* pin pillow version

* fixes for plain text tests

* fixes for doc

* update make commands

* changelog and version

* add xlrd

* update pip-compile

* pin numpy for python 3.8 support

* more constraints

* constraint on scipy

* update install docs

* constrain ipython

* add outlook to pip-compile

* more ipython constraints

* add extras to dockerfile

* pin office365 client

* few doc tweaks

* types as strings

* last pip-compile

* re pip-compile

* make tidy

* make tidy
Matt Robinson 2023-08-01 11:31:13 -04:00 committed by GitHub
parent 13d3559fa4
commit 331c7faf38
59 changed files with 508 additions and 353 deletions

View File

@ -1,3 +1,9 @@
## 0.9.0
### Enhancements
* Dependencies are now split by document type, creating a slimmer base installation.
## 0.8.8
### Enhancements
@ -6,6 +12,7 @@
### Fixes
* Rename "date" field to "last_modified"
* Adds Box connector

View File

@ -30,7 +30,15 @@ RUN python3.8 -m pip install pip==${PIP_VERSION} && \
pip install --no-cache -r requirements/ingest-s3.txt && \
pip install --no-cache -r requirements/ingest-slack.txt && \
pip install --no-cache -r requirements/ingest-wikipedia.txt && \
pip install --no-cache -r requirements/local-inference.txt && \
pip install --no-cache -r requirements/extra-csv.txt && \
pip install --no-cache -r requirements/extra-docx.txt && \
pip install --no-cache -r requirements/extra-markdown.txt && \
pip install --no-cache -r requirements/extra-msg.txt && \
pip install --no-cache -r requirements/extra-odt.txt && \
pip install --no-cache -r requirements/extra-pandoc.txt && \
pip install --no-cache -r requirements/extra-pdf-image.txt && \
pip install --no-cache -r requirements/extra-pptx.txt && \
pip install --no-cache -r requirements/extra-xlsx.txt && \
dnf -y groupremove "Development Tools" && \
dnf clean all

View File

@ -18,10 +18,10 @@ install-base: install-base-pip-packages install-nltk-models
## install: installs all test, dev, and experimental requirements
.PHONY: install
install: install-base-pip-packages install-dev install-nltk-models install-test install-huggingface install-unstructured-inference
install: install-base-pip-packages install-dev install-nltk-models install-test install-huggingface install-all-docs
.PHONY: install-ci
install-ci: install-base-pip-packages install-nltk-models install-huggingface install-unstructured-inference install-test
install-ci: install-base-pip-packages install-nltk-models install-huggingface install-all-docs install-test
.PHONY: install-base-pip-packages
install-base-pip-packages:
@ -53,6 +53,45 @@ install-dev:
install-build:
python3 -m pip install -r requirements/build.txt
.PHONY: install-csv
install-csv:
python3 -m pip install -r requirements/extra-csv.txt
.PHONY: install-docx
install-docx:
python3 -m pip install -r requirements/extra-docx.txt
.PHONY: install-odt
install-odt:
python3 -m pip install -r requirements/extra-odt.txt
.PHONY: install-pypandoc
install-pypandoc:
python3 -m pip install -r requirements/extra-pandoc.txt
.PHONY: install-markdown
install-markdown:
python3 -m pip install -r requirements/extra-markdown.txt
.PHONY: install-msg
install-msg:
python3 -m pip install -r requirements/extra-msg.txt
.PHONY: install-pdf-image
install-pdf-image:
python3 -m pip install -r requirements/extra-pdf-image.txt
.PHONY: install-pptx
install-pptx:
python3 -m pip install -r requirements/extra-pptx.txt
.PHONY: install-xlsx
install-xlsx:
python3 -m pip install -r requirements/extra-xlsx.txt
.PHONY: install-all-docs
install-all-docs: install-base install-csv install-docx install-odt install-pypandoc install-markdown install-msg install-pdf-image install-pptx install-xlsx
.PHONY: install-ingest-google-drive
install-ingest-google-drive:
python3 -m pip install -r requirements/ingest-google-drive.txt
@ -124,7 +163,7 @@ install-unstructured-inference:
## install-local-inference: installs requirements for local inference
.PHONY: install-local-inference
install-local-inference: install install-unstructured-inference
install-local-inference: install install-all-docs
.PHONY: install-pandoc
install-pandoc:
@ -135,12 +174,23 @@ install-pandoc:
.PHONY: pip-compile
pip-compile:
pip-compile --upgrade requirements/base.in
# Extra requirements that are specific to document types
pip-compile --upgrade requirements/extra-csv.in
pip-compile --upgrade requirements/extra-docx.in
pip-compile --upgrade requirements/extra-pandoc.in
pip-compile --upgrade requirements/extra-markdown.in
pip-compile --upgrade requirements/extra-msg.in
pip-compile --upgrade requirements/extra-odt.in
pip-compile --upgrade requirements/extra-pdf-image.in
pip-compile --upgrade requirements/extra-pptx.in
pip-compile --upgrade requirements/extra-xlsx.in
# Extra requirements for huggingface staging functions
pip-compile --upgrade requirements/huggingface.in
pip-compile --upgrade requirements/test.in
pip-compile --upgrade requirements/dev.in
pip-compile --upgrade requirements/build.in
pip-compile --upgrade requirements/local-inference.in
# NOTE(robinson) - doc/requirements.txt is where the GitHub action for building
# sphinx docs looks for additional requirements
cp requirements/build.txt docs/requirements.txt
@ -158,6 +208,7 @@ pip-compile:
pip-compile --upgrade requirements/ingest-google-drive.in
pip-compile --upgrade requirements/ingest-elasticsearch.in
pip-compile --upgrade requirements/ingest-onedrive.in
pip-compile --upgrade requirements/ingest-outlook.in
pip-compile --upgrade requirements/ingest-confluence.in
## install-project-local: install unstructured into your local python environment

View File

@ -1,5 +1,5 @@
#
# This file is autogenerated by pip-compile with Python 3.8
# This file is autogenerated by pip-compile with Python 3.11
# by the following command:
#
# pip-compile requirements/build.in
@ -12,10 +12,14 @@ beautifulsoup4==4.12.2
# via furo
certifi==2023.7.22
# via
# -c requirements/base.txt
# -c requirements/constraints.in
# -r requirements/build.in
# requests
charset-normalizer==3.2.0
# via requests
# via
# -c requirements/base.txt
# requests
docutils==0.18.1
# via
# sphinx
@ -23,11 +27,11 @@ docutils==0.18.1
furo==2023.7.26
# via -r requirements/build.in
idna==3.4
# via requests
# via
# -c requirements/base.txt
# requests
imagesize==1.4.1
# via sphinx
importlib-metadata==6.8.0
# via sphinx
jinja2==3.1.2
# via sphinx
markupsafe==2.1.3
@ -38,10 +42,10 @@ pygments==2.15.1
# via
# furo
# sphinx
pytz==2023.3
# via babel
requests==2.31.0
# via sphinx
# via
# -c requirements/base.txt
# sphinx
snowballstemmer==2.2.0
# via sphinx
soupsieve==2.4.1
@ -71,7 +75,8 @@ sphinxcontrib-qthelp==1.0.3
# via sphinx
sphinxcontrib-serializinghtml==1.1.5
# via sphinx
urllib3==2.0.4
# via requests
zipp==3.16.2
# via importlib-metadata
urllib3==1.26.16
# via
# -c requirements/base.txt
# -c requirements/constraints.in
# requests

View File

@ -7,8 +7,15 @@ Quick Start
Use the following instructions to get up and running with ``unstructured`` and test your
installation.
* Install the Python SDK with ``pip install "unstructured[local-inference]"``
* If you do not need to process PDFs or images, you can run ``pip install unstructured``
* Install the Python SDK with ``pip install unstructured``
* Plain text files, HTML, XML, JSON, and emails do not require any extra dependencies.
* If you need to process other document types, you can install the extras required for those documents
with ``pip install "unstructured[docx,pptx]"``.
* To install the extras for every document type, use ``pip install "unstructured[all-docs]"``.
* For ``unstructured<0.9.0``, you can install the extras for all document types with
``pip install "unstructured[local-inference]"``. The ``local-inference`` extra is still
supported in newer versions for backward compatibility, but may be deprecated in a future version.
The ``all-docs`` extra is the officially supported installation pattern.
* Install the following system dependencies if they are not already available on your system. Depending on what document types you're parsing, you may not need all of these.
* ``libmagic-dev`` (filetype detection)
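Beyond the system packages, a quick way to confirm which optional Python dependencies ended up in your environment is a check like the following. This is an illustrative sketch, not part of this change; the mapping of document types to module names simply mirrors the dependency_exists guards added around the partition imports later in this commit.

import importlib.util

# Illustrative mapping of document types to the optional modules their
# partitioners import; adjust to match the extras you actually installed.
OPTIONAL_DEPS = {
    "csv/tsv/xlsx": ["pandas", "openpyxl"],
    "doc/docx/odt": ["docx", "pypandoc"],
    "md": ["markdown"],
    "msg": ["msg_parser"],
    "pdf/image": ["pdf2image", "pdfminer", "PIL", "unstructured_inference"],
    "pptx": ["pptx"],
}

for doc_type, modules in OPTIONAL_DEPS.items():
    missing = [m for m in modules if importlib.util.find_spec(m) is None]
    print(f"{doc_type}: {'ok' if not missing else 'missing ' + ', '.join(missing)}")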

View File

@ -1,19 +1,8 @@
-c "constraints.in"
chardet
filetype
lxml
msg_parser
nltk
openpyxl
pandas
pdf2image
pdfminer.six
pillow
pypandoc
python-docx
python-pptx
python-magic
markdown
requests
lxml
nltk
tabulate
xlrd
requests

View File

@ -1,5 +1,5 @@
#
# This file is autogenerated by pip-compile with Python 3.8
# This file is autogenerated by pip-compile with Python 3.11
# by the following command:
#
# pip-compile requirements/base.in
@ -8,89 +8,33 @@ certifi==2023.7.22
# via
# -c requirements/constraints.in
# requests
cffi==1.15.1
# via cryptography
chardet==5.1.0
# via -r requirements/base.in
charset-normalizer==3.2.0
# via
# pdfminer-six
# requests
# via requests
click==8.1.6
# via nltk
cryptography==41.0.2
# via pdfminer-six
et-xmlfile==1.1.0
# via openpyxl
filetype==1.2.0
# via -r requirements/base.in
idna==3.4
# via requests
importlib-metadata==6.8.0
# via markdown
joblib==1.3.1
# via nltk
lxml==4.9.3
# via
# -r requirements/base.in
# python-docx
# python-pptx
markdown==3.4.4
# via -r requirements/base.in
msg-parser==1.2.0
# via -r requirements/base.in
nltk==3.8.1
# via -r requirements/base.in
numpy==1.24.4
# via pandas
olefile==0.46
# via msg-parser
openpyxl==3.1.2
# via -r requirements/base.in
pandas==2.0.3
# via -r requirements/base.in
pdf2image==1.16.3
# via -r requirements/base.in
pdfminer-six==20221105
# via -r requirements/base.in
pillow==10.0.0
# via
# -r requirements/base.in
# pdf2image
# python-pptx
pycparser==2.21
# via cffi
pypandoc==1.11
# via -r requirements/base.in
python-dateutil==2.8.2
# via pandas
python-docx==0.8.11
# via -r requirements/base.in
python-magic==0.4.27
# via -r requirements/base.in
python-pptx==0.6.21
# via -r requirements/base.in
pytz==2023.3
# via pandas
regex==2023.6.3
# via nltk
requests==2.31.0
# via -r requirements/base.in
six==1.16.0
# via python-dateutil
tabulate==0.9.0
# via -r requirements/base.in
tqdm==4.65.0
# via nltk
tzdata==2023.3
# via pandas
urllib3==1.26.16
# via
# -c requirements/constraints.in
# requests
xlrd==2.0.1
# via -r requirements/base.in
xlsxwriter==3.1.2
# via python-pptx
zipp==3.16.2
# via importlib-metadata

View File

@ -1,3 +1,6 @@
-c base.txt
-c constraints.in
sphinx
# NOTE(alan) - Pinning to resolve a conflict with sphinx. We can unpin on next sphinx_rtd_theme release.
sphinx_rtd_theme==1.2.2

View File

@ -1,5 +1,5 @@
#
# This file is autogenerated by pip-compile with Python 3.8
# This file is autogenerated by pip-compile with Python 3.11
# by the following command:
#
# pip-compile requirements/build.in
@ -12,10 +12,14 @@ beautifulsoup4==4.12.2
# via furo
certifi==2023.7.22
# via
# -c requirements/base.txt
# -c requirements/constraints.in
# -r requirements/build.in
# requests
charset-normalizer==3.2.0
# via requests
# via
# -c requirements/base.txt
# requests
docutils==0.18.1
# via
# sphinx
@ -23,11 +27,11 @@ docutils==0.18.1
furo==2023.7.26
# via -r requirements/build.in
idna==3.4
# via requests
# via
# -c requirements/base.txt
# requests
imagesize==1.4.1
# via sphinx
importlib-metadata==6.8.0
# via sphinx
jinja2==3.1.2
# via sphinx
markupsafe==2.1.3
@ -38,10 +42,10 @@ pygments==2.15.1
# via
# furo
# sphinx
pytz==2023.3
# via babel
requests==2.31.0
# via sphinx
# via
# -c requirements/base.txt
# sphinx
snowballstemmer==2.2.0
# via sphinx
soupsieve==2.4.1
@ -71,7 +75,8 @@ sphinxcontrib-qthelp==1.0.3
# via sphinx
sphinxcontrib-serializinghtml==1.1.5
# via sphinx
urllib3==2.0.4
# via requests
zipp==3.16.2
# via importlib-metadata
urllib3==1.26.16
# via
# -c requirements/base.txt
# -c requirements/constraints.in
# requests

View File

@ -12,6 +12,13 @@ jupyter-core>=4.11.2
wheel>=0.38.1
# NOTE(robinson) - The following pins are to address
# vulnerabilities in dependency scans
certifi>=2022.12.07
certifi>=2023.7.22
# From pycocotools in local-inference
pyparsing<3.1.0
# NOTE(robinson) - Numpy dropped Python 3.8 support in 1.25.0
numpy<1.25.0
scipy<1.11.0
IPython<8.13
# NOTE(robinson) - See this issue here
# https://github.com/facebookresearch/detectron2/issues/5010
Pillow<10.0.0

View File

@ -1,5 +1,5 @@
#
# This file is autogenerated by pip-compile with Python 3.8
# This file is autogenerated by pip-compile with Python 3.11
# by the following command:
#
# pip-compile requirements/dev.in
@ -41,9 +41,7 @@ certifi==2023.7.22
# -c requirements/test.txt
# requests
cffi==1.15.1
# via
# -c requirements/base.txt
# argon2-cffi-bindings
# via argon2-cffi-bindings
cfgv==3.3.1
# via pre-commit
charset-normalizer==3.2.0
@ -57,7 +55,9 @@ click==8.1.6
# -c requirements/test.txt
# pip-tools
comm==0.1.3
# via ipykernel
# via
# ipykernel
# ipywidgets
debugpy==1.6.7
# via ipykernel
decorator==5.1.1
@ -66,10 +66,6 @@ defusedxml==0.7.1
# via nbconvert
distlib==0.3.7
# via virtualenv
exceptiongroup==1.1.2
# via
# -c requirements/test.txt
# anyio
executing==1.2.0
# via stack-data
fastjsonschema==2.18.0
@ -87,40 +83,26 @@ idna==3.4
# anyio
# jsonschema
# requests
importlib-metadata==6.8.0
# via
# -c requirements/base.txt
# jupyter-client
# jupyter-lsp
# jupyterlab
# jupyterlab-server
# nbconvert
importlib-resources==6.0.0
# via
# jsonschema
# jsonschema-specifications
# jupyterlab
# notebook
ipykernel==6.25.0
# via
# ipywidgets
# jupyter
# jupyter-console
# jupyterlab
# qtconsole
ipython==8.12.2
# via
# -c requirements/constraints.in
# -r requirements/dev.in
# ipykernel
# ipywidgets
# jupyter-console
ipython-genutils==0.2.0
# via qtconsole
ipywidgets==8.0.7
ipywidgets==8.1.0
# via jupyter
isoduration==20.11.0
# via jsonschema
jedi==0.18.2
jedi==0.19.0
# via ipython
jinja2==3.1.2
# via
@ -162,7 +144,7 @@ jupyter-core==5.3.1
# nbconvert
# nbformat
# qtconsole
jupyter-events==0.6.3
jupyter-events==0.7.0
# via jupyter-server
jupyter-lsp==2.2.0
# via jupyterlab
@ -201,16 +183,16 @@ nbconvert==7.7.3
# via
# jupyter
# jupyter-server
nbformat==5.9.1
nbformat==5.9.2
# via
# jupyter-server
# nbclient
# nbconvert
nest-asyncio==1.5.6
nest-asyncio==1.5.7
# via ipykernel
nodeenv==1.8.0
# via pre-commit
notebook==7.0.0
notebook==7.0.1
# via jupyter
notebook-shim==0.2.3
# via
@ -239,9 +221,7 @@ pickleshare==0.7.5
# via ipython
pip-tools==7.1.0
# via -r requirements/dev.in
pkgutil-resolve-name==1.3.10
# via jsonschema
platformdirs==3.9.1
platformdirs==3.10.0
# via
# -c requirements/test.txt
# jupyter-core
@ -263,9 +243,7 @@ ptyprocess==0.7.0
pure-eval==0.2.2
# via stack-data
pycparser==2.21
# via
# -c requirements/base.txt
# cffi
# via cffi
pygments==2.15.1
# via
# ipython
@ -276,16 +254,11 @@ pyproject-hooks==1.0.0
# via build
python-dateutil==2.8.2
# via
# -c requirements/base.txt
# -c requirements/test.txt
# arrow
# jupyter-client
python-json-logger==2.0.7
# via jupyter-events
pytz==2023.3
# via
# -c requirements/base.txt
# babel
pyyaml==6.0.1
# via
# -c requirements/test.txt
@ -306,6 +279,7 @@ referencing==0.30.0
# via
# jsonschema
# jsonschema-specifications
# jupyter-events
requests==2.31.0
# via
# -c requirements/base.txt
@ -327,7 +301,6 @@ send2trash==1.8.2
# via jupyter-server
six==1.16.0
# via
# -c requirements/base.txt
# -c requirements/test.txt
# asttokens
# bleach
@ -345,13 +318,6 @@ terminado==0.17.1
# jupyter-server-terminals
tinycss2==1.2.1
# via nbconvert
tomli==2.0.1
# via
# -c requirements/test.txt
# build
# jupyterlab
# pip-tools
# pyproject-hooks
tornado==6.3.2
# via
# ipykernel
@ -377,11 +343,6 @@ traitlets==5.9.0
# nbconvert
# nbformat
# qtconsole
typing-extensions==4.7.1
# via
# -c requirements/test.txt
# async-lru
# ipython
uri-template==1.3.0
# via jsonschema
urllib3==1.26.16
@ -408,11 +369,6 @@ wheel==0.41.0
# pip-tools
widgetsnbextension==4.0.8
# via ipywidgets
zipp==3.16.2
# via
# -c requirements/base.txt
# importlib-metadata
# importlib-resources
# The following packages are considered to be unsafe in a requirements file:
# pip

View File

@ -1,3 +1,4 @@
-c constraints.in
-c base.txt
unstructured-inference==0.5.7
pandas

View File

@ -0,0 +1,20 @@
#
# This file is autogenerated by pip-compile with Python 3.11
# by the following command:
#
# pip-compile requirements/extra-csv.in
#
numpy==1.24.4
# via
# -c requirements/constraints.in
# pandas
pandas==2.0.3
# via -r requirements/extra-csv.in
python-dateutil==2.8.2
# via pandas
pytz==2023.3
# via pandas
six==1.16.0
# via python-dateutil
tzdata==2023.3
# via pandas

View File

@ -0,0 +1,4 @@
-c constraints.in
-c base.txt
python-docx

View File

@ -0,0 +1,12 @@
#
# This file is autogenerated by pip-compile with Python 3.11
# by the following command:
#
# pip-compile requirements/extra-docx.in
#
lxml==4.9.3
# via
# -c requirements/base.txt
# python-docx
python-docx==0.8.11
# via -r requirements/extra-docx.in

View File

@ -0,0 +1,4 @@
-c "constraints.in"
-c "base.txt"
markdown

View File

@ -0,0 +1,8 @@
#
# This file is autogenerated by pip-compile with Python 3.11
# by the following command:
#
# pip-compile requirements/extra-markdown.in
#
markdown==3.4.4
# via -r requirements/extra-markdown.in

View File

@ -0,0 +1,4 @@
-c constraints.in
-c base.txt
msg_parser

View File

@ -0,0 +1,10 @@
#
# This file is autogenerated by pip-compile with Python 3.11
# by the following command:
#
# pip-compile requirements/extra-msg.in
#
msg-parser==1.2.0
# via -r requirements/extra-msg.in
olefile==0.46
# via msg-parser

View File

@ -0,0 +1,5 @@
-c constraints.in
-c base.txt
python-docx
pypandoc

View File

@ -0,0 +1,14 @@
#
# This file is autogenerated by pip-compile with Python 3.11
# by the following command:
#
# pip-compile requirements/extra-odt.in
#
lxml==4.9.3
# via
# -c requirements/base.txt
# python-docx
pypandoc==1.11
# via -r requirements/extra-odt.in
python-docx==0.8.11
# via -r requirements/extra-odt.in

View File

@ -0,0 +1,4 @@
-c constraints.in
-c base.txt
pypandoc

View File

@ -0,0 +1,8 @@
#
# This file is autogenerated by pip-compile with Python 3.11
# by the following command:
#
# pip-compile requirements/extra-pandoc.in
#
pypandoc==1.11
# via -r requirements/extra-pandoc.in

View File

@ -0,0 +1,9 @@
-c constraints.in
-c base.txt
pdf2image
pdfminer.six
# NOTE(robinson) - See this issue here
# https://github.com/facebookresearch/detectron2/issues/5010
Pillow<10
unstructured-inference==0.5.7

View File

@ -1,8 +1,8 @@
#
# This file is autogenerated by pip-compile with Python 3.8
# This file is autogenerated by pip-compile with Python 3.11
# by the following command:
#
# pip-compile requirements/local-inference.in
# pip-compile requirements/extra-pdf-image.in
#
antlr4-python3-runtime==4.9.3
# via omegaconf
@ -12,9 +12,7 @@ certifi==2023.7.22
# -c requirements/constraints.in
# requests
cffi==1.15.1
# via
# -c requirements/base.txt
# cryptography
# via cryptography
charset-normalizer==3.2.0
# via
# -c requirements/base.txt
@ -25,9 +23,7 @@ coloredlogs==15.0.1
contourpy==1.1.0
# via matplotlib
cryptography==41.0.2
# via
# -c requirements/base.txt
# pdfminer-six
# via pdfminer-six
cycler==0.11.0
# via matplotlib
effdet==0.4.1
@ -54,8 +50,6 @@ idna==3.4
# via
# -c requirements/base.txt
# requests
importlib-resources==6.0.0
# via matplotlib
iopath==0.1.10
# via layoutparser
jinja2==3.1.2
@ -74,7 +68,7 @@ networkx==3.1
# via torch
numpy==1.24.4
# via
# -c requirements/base.txt
# -c requirements/constraints.in
# contourpy
# layoutparser
# matplotlib
@ -101,22 +95,21 @@ packaging==23.1
# pytesseract
# transformers
pandas==2.0.3
# via
# -c requirements/base.txt
# layoutparser
# via layoutparser
pdf2image==1.16.3
# via
# -c requirements/base.txt
# -r requirements/extra-pdf-image.in
# layoutparser
pdfminer-six==20221105
# via
# -c requirements/base.txt
# -r requirements/extra-pdf-image.in
# pdfplumber
pdfplumber==0.10.1
pdfplumber==0.10.2
# via layoutparser
pillow==10.0.0
pillow==9.5.0
# via
# -c requirements/base.txt
# -c requirements/constraints.in
# -r requirements/extra-pdf-image.in
# layoutparser
# matplotlib
# pdf2image
@ -132,9 +125,7 @@ protobuf==4.23.4
pycocotools==2.0.6
# via effdet
pycparser==2.21
# via
# -c requirements/base.txt
# cffi
# via cffi
pyparsing==3.0.9
# via
# -c requirements/constraints.in
@ -145,15 +136,12 @@ pytesseract==0.3.10
# via layoutparser
python-dateutil==2.8.2
# via
# -c requirements/base.txt
# matplotlib
# pandas
python-multipart==0.0.6
# via unstructured-inference
pytz==2023.3
# via
# -c requirements/base.txt
# pandas
# via pandas
pyyaml==6.0.1
# via
# huggingface-hub
@ -176,11 +164,11 @@ safetensors==0.3.1
# timm
# transformers
scipy==1.10.1
# via layoutparser
six==1.16.0
# via
# -c requirements/base.txt
# python-dateutil
# -c requirements/constraints.in
# layoutparser
six==1.16.0
# via python-dateutil
sympy==1.12
# via
# onnxruntime
@ -214,17 +202,11 @@ typing-extensions==4.7.1
# iopath
# torch
tzdata==2023.3
# via
# -c requirements/base.txt
# pandas
# via pandas
unstructured-inference==0.5.7
# via -r requirements/local-inference.in
# via -r requirements/extra-pdf-image.in
urllib3==1.26.16
# via
# -c requirements/base.txt
# -c requirements/constraints.in
# requests
zipp==3.16.2
# via
# -c requirements/base.txt
# importlib-resources

View File

@ -0,0 +1,3 @@
-c "constraints.in"
python-pptx

View File

@ -0,0 +1,16 @@
#
# This file is autogenerated by pip-compile with Python 3.11
# by the following command:
#
# pip-compile requirements/extra-pptx.in
#
lxml==4.9.3
# via python-pptx
pillow==9.5.0
# via
# -c requirements/constraints.in
# python-pptx
python-pptx==0.6.21
# via -r requirements/extra-pptx.in
xlsxwriter==3.1.2
# via python-pptx

View File

@ -0,0 +1,6 @@
-c constraints.in
-c base.txt
openpyxl
pandas
xlrd

View File

@ -0,0 +1,26 @@
#
# This file is autogenerated by pip-compile with Python 3.11
# by the following command:
#
# pip-compile requirements/extra-xlsx.in
#
et-xmlfile==1.1.0
# via openpyxl
numpy==1.24.4
# via
# -c requirements/constraints.in
# pandas
openpyxl==3.1.2
# via -r requirements/extra-xlsx.in
pandas==2.0.3
# via -r requirements/extra-xlsx.in
python-dateutil==2.8.2
# via pandas
pytz==2023.3
# via pandas
six==1.16.0
# via python-dateutil
tzdata==2023.3
# via pandas
xlrd==2.0.1
# via -r requirements/extra-xlsx.in

View File

@ -1,5 +1,5 @@
#
# This file is autogenerated by pip-compile with Python 3.8
# This file is autogenerated by pip-compile with Python 3.11
# by the following command:
#
# pip-compile requirements/huggingface.in
@ -46,7 +46,7 @@ networkx==3.1
# via torch
numpy==1.24.4
# via
# -c requirements/base.txt
# -c requirements/constraints.in
# transformers
packaging==23.1
# via
@ -74,7 +74,6 @@ sentencepiece==0.1.99
# via -r requirements/huggingface.in
six==1.16.0
# via
# -c requirements/base.txt
# langdetect
# sacremoses
sympy==1.12

View File

@ -1,5 +1,5 @@
#
# This file is autogenerated by pip-compile with Python 3.8
# This file is autogenerated by pip-compile with Python 3.11
# by the following command:
#
# pip-compile requirements/ingest-azure.in
@ -32,7 +32,6 @@ certifi==2023.7.22
# requests
cffi==1.15.1
# via
# -c requirements/base.txt
# azure-datalake-store
# cryptography
charset-normalizer==3.2.0
@ -42,7 +41,6 @@ charset-normalizer==3.2.0
# requests
cryptography==41.0.2
# via
# -c requirements/base.txt
# azure-identity
# azure-storage-blob
# msal
@ -76,9 +74,7 @@ multidict==6.0.4
portalocker==2.7.0
# via msal-extensions
pycparser==2.21
# via
# -c requirements/base.txt
# cffi
# via cffi
pyjwt[crypto]==2.8.0
# via msal
requests==2.31.0
@ -89,7 +85,6 @@ requests==2.31.0
# msal
six==1.16.0
# via
# -c requirements/base.txt
# azure-core
# azure-identity
# isodate

View File

@ -8,7 +8,7 @@ attrs==23.1.0
# via boxsdk
boxfs==0.2.0
# via -r requirements/ingest-box.in
boxsdk[jwt]==3.8.0
boxsdk[jwt]==3.8.1
# via boxfs
certifi==2023.7.22
# via
@ -16,17 +16,13 @@ certifi==2023.7.22
# -c requirements/constraints.in
# requests
cffi==1.15.1
# via
# -c requirements/base.txt
# cryptography
# via cryptography
charset-normalizer==3.2.0
# via
# -c requirements/base.txt
# requests
cryptography==41.0.2
# via
# -c requirements/base.txt
# boxsdk
# via boxsdk
fsspec==2023.6.0
# via
# -r requirements/ingest-box.in
@ -36,15 +32,11 @@ idna==3.4
# -c requirements/base.txt
# requests
pycparser==2.21
# via
# -c requirements/base.txt
# cffi
# via cffi
pyjwt==2.8.0
# via boxsdk
python-dateutil==2.8.2
# via
# -c requirements/base.txt
# boxsdk
# via boxsdk
requests==2.31.0
# via
# -c requirements/base.txt
@ -53,9 +45,7 @@ requests==2.31.0
requests-toolbelt==1.0.0
# via boxsdk
six==1.16.0
# via
# -c requirements/base.txt
# python-dateutil
# via python-dateutil
urllib3==1.26.16
# via
# -c requirements/base.txt

View File

@ -1,10 +1,10 @@
#
# This file is autogenerated by pip-compile with Python 3.8
# This file is autogenerated by pip-compile with Python 3.11
# by the following command:
#
# pip-compile requirements/ingest-confluence.in
#
atlassian-python-api==3.39.0
atlassian-python-api==3.40.0
# via -r requirements/ingest-confluence.in
certifi==2023.7.22
# via
@ -33,9 +33,7 @@ requests==2.31.0
requests-oauthlib==1.3.1
# via atlassian-python-api
six==1.16.0
# via
# -c requirements/base.txt
# atlassian-python-api
# via atlassian-python-api
urllib3==1.26.16
# via
# -c requirements/base.txt

View File

@ -1,5 +1,5 @@
#
# This file is autogenerated by pip-compile with Python 3.8
# This file is autogenerated by pip-compile with Python 3.11
# by the following command:
#
# pip-compile requirements/ingest-discord.in

View File

@ -1,5 +1,5 @@
#
# This file is autogenerated by pip-compile with Python 3.8
# This file is autogenerated by pip-compile with Python 3.11
# by the following command:
#
# pip-compile requirements/ingest-dropbox.in
@ -34,7 +34,6 @@ requests==2.31.0
# dropboxdrivefs
six==1.16.0
# via
# -c requirements/base.txt
# dropbox
# stone
stone==3.3.1

View File

@ -1,5 +1,5 @@
#
# This file is autogenerated by pip-compile with Python 3.8
# This file is autogenerated by pip-compile with Python 3.11
# by the following command:
#
# pip-compile requirements/ingest-elasticsearch.in
@ -11,7 +11,7 @@ certifi==2023.7.22
# elastic-transport
elastic-transport==8.4.0
# via elasticsearch
elasticsearch==8.8.2
elasticsearch==8.9.0
# via -r requirements/ingest-elasticsearch.in
jq==1.4.1
# via -r requirements/ingest-elasticsearch.in

View File

@ -1,5 +1,5 @@
#
# This file is autogenerated by pip-compile with Python 3.8
# This file is autogenerated by pip-compile with Python 3.11
# by the following command:
#
# pip-compile requirements/ingest-gcs.in
@ -57,7 +57,7 @@ google-crc32c==1.5.0
# via google-resumable-media
google-resumable-media==2.5.0
# via google-cloud-storage
googleapis-common-protos==1.59.1
googleapis-common-protos==1.60.0
# via google-api-core
idna==3.4
# via
@ -74,7 +74,6 @@ protobuf==4.23.4
# via
# -c requirements/constraints.in
# google-api-core
# googleapis-common-protos
pyasn1==0.5.0
# via
# pyasn1-modules
@ -93,9 +92,7 @@ requests-oauthlib==1.3.1
rsa==4.9
# via google-auth
six==1.16.0
# via
# -c requirements/base.txt
# google-auth
# via google-auth
urllib3==1.26.16
# via
# -c requirements/base.txt

View File

@ -1,5 +1,5 @@
#
# This file is autogenerated by pip-compile with Python 3.8
# This file is autogenerated by pip-compile with Python 3.11
# by the following command:
#
# pip-compile requirements/ingest-github.in
@ -11,7 +11,6 @@ certifi==2023.7.22
# requests
cffi==1.15.1
# via
# -c requirements/base.txt
# cryptography
# pynacl
charset-normalizer==3.2.0
@ -19,9 +18,7 @@ charset-normalizer==3.2.0
# -c requirements/base.txt
# requests
cryptography==41.0.2
# via
# -c requirements/base.txt
# pyjwt
# via pyjwt
deprecated==1.2.14
# via pygithub
idna==3.4
@ -29,9 +26,7 @@ idna==3.4
# -c requirements/base.txt
# requests
pycparser==2.21
# via
# -c requirements/base.txt
# cffi
# via cffi
pygithub==1.58.2
# via -r requirements/ingest-github.in
pyjwt[crypto]==2.8.0

View File

@ -1,5 +1,5 @@
#
# This file is autogenerated by pip-compile with Python 3.8
# This file is autogenerated by pip-compile with Python 3.11
# by the following command:
#
# pip-compile requirements/ingest-gitlab.in

View File

@ -1,5 +1,5 @@
#
# This file is autogenerated by pip-compile with Python 3.8
# This file is autogenerated by pip-compile with Python 3.11
# by the following command:
#
# pip-compile requirements/ingest-google-drive.in
@ -26,7 +26,7 @@ google-auth==2.22.0
# google-auth-httplib2
google-auth-httplib2==0.1.0
# via google-api-python-client
googleapis-common-protos==1.59.1
googleapis-common-protos==1.60.0
# via google-api-core
httplib2==0.22.0
# via
@ -59,7 +59,6 @@ rsa==4.9
# via google-auth
six==1.16.0
# via
# -c requirements/base.txt
# google-auth
# google-auth-httplib2
uritemplate==4.1.1

View File

@ -1,5 +1,5 @@
#
# This file is autogenerated by pip-compile with Python 3.8
# This file is autogenerated by pip-compile with Python 3.11
# by the following command:
#
# pip-compile requirements/ingest-onedrive.in
@ -10,16 +10,13 @@ certifi==2023.7.22
# -c requirements/constraints.in
# requests
cffi==1.15.1
# via
# -c requirements/base.txt
# cryptography
# via cryptography
charset-normalizer==3.2.0
# via
# -c requirements/base.txt
# requests
cryptography==41.0.2
# via
# -c requirements/base.txt
# -r requirements/ingest-onedrive.in
# msal
# pyjwt
@ -34,15 +31,11 @@ msal==1.23.0
office365-rest-python-client==2.4.2
# via -r requirements/ingest-onedrive.in
pycparser==2.21
# via
# -c requirements/base.txt
# cffi
# via cffi
pyjwt[crypto]==2.8.0
# via msal
pytz==2023.3
# via
# -c requirements/base.txt
# office365-rest-python-client
# via office365-rest-python-client
requests==2.31.0
# via
# -c requirements/base.txt

View File

@ -1,5 +1,5 @@
#
# This file is autogenerated by pip-compile with Python 3.8
# This file is autogenerated by pip-compile with Python 3.11
# by the following command:
#
# pip-compile requirements/ingest-outlook.in
@ -10,16 +10,13 @@ certifi==2023.7.22
# -c requirements/constraints.in
# requests
cffi==1.15.1
# via
# -c requirements/base.txt
# cryptography
# via cryptography
charset-normalizer==3.2.0
# via
# -c requirements/base.txt
# requests
cryptography==41.0.2
# via
# -c requirements/base.txt
# -r requirements/ingest-outlook.in
# msal
# pyjwt
@ -34,15 +31,11 @@ msal==1.23.0
office365-rest-python-client==2.4.2
# via -r requirements/ingest-outlook.in
pycparser==2.21
# via
# -c requirements/base.txt
# cffi
# via cffi
pyjwt[crypto]==2.8.0
# via msal
pytz==2023.3
# via
# -c requirements/base.txt
# office365-rest-python-client
# via office365-rest-python-client
requests==2.31.0
# via
# -c requirements/base.txt

View File

@ -1,5 +1,5 @@
#
# This file is autogenerated by pip-compile with Python 3.8
# This file is autogenerated by pip-compile with Python 3.11
# by the following command:
#
# pip-compile requirements/ingest-reddit.in

View File

@ -1,5 +1,5 @@
#
# This file is autogenerated by pip-compile with Python 3.8
# This file is autogenerated by pip-compile with Python 3.11
# by the following command:
#
# pip-compile requirements/ingest-s3.in
@ -43,17 +43,11 @@ multidict==6.0.4
# aiohttp
# yarl
python-dateutil==2.8.2
# via
# -c requirements/base.txt
# botocore
# via botocore
s3fs==2023.6.0
# via -r requirements/ingest-s3.in
six==1.16.0
# via
# -c requirements/base.txt
# python-dateutil
typing-extensions==4.7.1
# via aioitertools
# via python-dateutil
urllib3==1.26.16
# via
# -c requirements/base.txt

View File

@ -1,5 +1,5 @@
#
# This file is autogenerated by pip-compile with Python 3.8
# This file is autogenerated by pip-compile with Python 3.11
# by the following command:
#
# pip-compile requirements/ingest-slack.in

View File

@ -1,5 +1,5 @@
#
# This file is autogenerated by pip-compile with Python 3.8
# This file is autogenerated by pip-compile with Python 3.11
# by the following command:
#
# pip-compile requirements/ingest-wikipedia.in

View File

@ -1,5 +1,5 @@
#
# This file is autogenerated by pip-compile with Python 3.8
# This file is autogenerated by pip-compile with Python 3.11
# by the following command:
#
# pip-compile requirements/test.in
@ -26,9 +26,7 @@ coverage[toml]==7.2.7
# via
# -r requirements/test.in
# pytest-cov
exceptiongroup==1.1.2
# via pytest
flake8==6.0.0
flake8==6.1.0
# via -r requirements/test.in
freezegun==1.2.2
# via -r requirements/test.in
@ -64,17 +62,17 @@ packaging==23.1
# via
# black
# pytest
pathspec==0.11.1
pathspec==0.11.2
# via black
platformdirs==3.9.1
platformdirs==3.10.0
# via black
pluggy==1.2.0
# via pytest
pycodestyle==2.10.0
pycodestyle==2.11.0
# via flake8
pydantic==1.10.12
# via label-studio-sdk
pyflakes==3.0.1
pyflakes==3.1.0
# via flake8
pytest==7.4.0
# via
@ -85,28 +83,17 @@ pytest-cov==4.1.0
pytest-mock==3.11.1
# via -r requirements/test.in
python-dateutil==2.8.2
# via
# -c requirements/base.txt
# freezegun
# via freezegun
pyyaml==6.0.1
# via vcrpy
requests==2.31.0
# via
# -c requirements/base.txt
# label-studio-sdk
ruff==0.0.280
ruff==0.0.281
# via -r requirements/test.in
six==1.16.0
# via
# -c requirements/base.txt
# python-dateutil
# vcrpy
tomli==2.0.1
# via
# black
# coverage
# mypy
# pytest
# via python-dateutil
types-click==7.1.8
# via -r requirements/test.in
types-markdown==3.4.2.10
@ -119,7 +106,6 @@ types-urllib3==1.26.25.14
# via types-requests
typing-extensions==4.7.1
# via
# black
# mypy
# pydantic
urllib3==1.26.16
@ -127,8 +113,7 @@ urllib3==1.26.16
# -c requirements/base.txt
# -c requirements/constraints.in
# requests
# vcrpy
vcrpy==5.0.0
vcrpy==5.1.0
# via -r requirements/test.in
wrapt==1.15.0
# via vcrpy

View File

@ -34,11 +34,48 @@ def load_requirements(file_list: Optional[Union[str, List[str]]] = None) -> List
with open(file, encoding="utf-8") as f:
requirements.extend(f.readlines())
requirements = [
req for req in requirements if not req.startswith("#") and not req.startswith("-")
req
for req in requirements
if not req.startswith("#") and not req.startswith("-")
]
return requirements
csv_reqs = load_requirements("requirements/extra-csv.in")
docx_reqs = load_requirements("requirements/extra-docx.in")
epub_reqs = load_requirements("requirements/extra-pandoc.in")
image_reqs = load_requirements("requirements/extra-pdf-image.in")
markdown_reqs = load_requirements("requirements/extra-markdown.in")
msg_reqs = load_requirements("requirements/extra-msg.in")
odt_reqs = load_requirements("requirements/extra-odt.in")
org_reqs = load_requirements("requirements/extra-pandoc.in")
pdf_reqs = load_requirements("requirements/extra-pdf-image.in")
pptx_reqs = load_requirements("requirements/extra-pptx.in")
rtf_reqs = load_requirements("requirements/extra-pandoc.in")
rst_reqs = load_requirements("requirements/extra-pandoc.in")
tsv_reqs = load_requirements("requirements/extra-csv.in")
xlsx_reqs = load_requirements("requirements/extra-xlsx.in")
all_doc_reqs = list(
set(
csv_reqs
+ docx_reqs
+ epub_reqs
+ image_reqs
+ markdown_reqs
+ msg_reqs
+ odt_reqs
+ org_reqs
+ pdf_reqs
+ pptx_reqs
+ rtf_reqs
+ rst_reqs
+ tsv_reqs
+ xlsx_reqs,
),
)
setup(
name="unstructured",
description="A library that prepares raw documents for downstream ML tasks.",
@ -71,8 +108,23 @@ setup(
},
install_requires=load_requirements(),
extras_require={
"huggingface": load_requirements("requirements/huggingface.in"),
"local-inference": load_requirements("requirements/local-inference.in"),
# Document specific extra requirements
"all-docs": all_doc_reqs,
"csv": csv_reqs,
"docx": docx_reqs,
"epub": epub_reqs,
"image": image_reqs,
"md": markdown_reqs,
"msg": msg_reqs,
"odt": odt_reqs,
"org": org_reqs,
"pdf": pdf_reqs,
"pptx": pptx_reqs,
"rtf": rtf_reqs,
"rst": rst_reqs,
"tsv": tsv_reqs,
"xlsx": xlsx_reqs,
# Extra requirements for data connectors
"s3": load_requirements("requirements/ingest-s3.in"),
"azure": load_requirements("requirements/ingest-azure.in"),
"discord": load_requirements("requirements/ingest-discord.in"),
@ -89,6 +141,9 @@ setup(
"onedrive": load_requirements("requirements/ingest-onedrive.in"),
"outlook": load_requirements("requirements/ingest-outlook.in"),
"confluence": load_requirements("requirements/ingest-confluence.in"),
# Legacy extra requirements
"huggingface": load_requirements("requirements/huggingface.in"),
"local-inference": all_doc_reqs,
},
package_dir={"unstructured": "unstructured"},
package_data={"unstructured": ["nlp/*.txt"]},

View File

@ -478,4 +478,4 @@ def test_document_to_element_list_omits_coord_system_when_coord_points_absent():
def test_get_page_image_metadata_and_coordinate_system():
doc = MockDocumentLayout()
metadata = _get_page_image_metadata(doc.pages[0])
assert type(metadata) == dict
assert isinstance(metadata, dict)

View File

@ -267,6 +267,6 @@ def test_partition_doc_from_file_without_metadata_date(
sf = SpooledTemporaryFile()
sf.write(f.read())
sf.seek(0)
elements = partition_doc(file=sf, metadata_last_modified=None)
elements = partition_doc(file=sf, metadata_date="2020-07-05")
assert elements[0].metadata.last_modified is None
assert elements[0].metadata.date == "2020-07-05"

View File

@ -216,8 +216,8 @@ def test_process_file_metadata_exclude_filename_pagenum(mocker, partition_test_r
isd_elems = test_ingest_doc.process_file()
assert len(isd_elems)
for elem in isd_elems:
assert "filename" not in elem["metadata"].keys()
assert "page_number" not in elem["metadata"].keys()
assert "filename" not in elem["metadata"]
assert "page_number" not in elem["metadata"]
def test_process_file_flatten_metadata(mocker, partition_test_results):

View File

@ -1 +1 @@
__version__ = "0.8.8" # pragma: no cover
__version__ = "0.9.0" # pragma: no cover

View File

@ -1,11 +1,14 @@
import tempfile
from typing import IO, Optional
import pypandoc
from unstructured.partition.common import exactly_one
from unstructured.utils import dependency_exists, requires_dependencies
if dependency_exists("pypandoc"):
import pypandoc
@requires_dependencies(["pypandoc"])
def convert_file_to_text(filename: str, source_format: str, target_format: str) -> str:
"""Uses pandoc to convert the source document to a raw text string."""
try:
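The dependency_exists and requires_dependencies helpers imported above come from unstructured.utils and are not shown in this diff. The minimal sketch below captures only the behavior the call sites rely on; the exact implementation, error message, and any extras hint are assumptions.

import importlib
from functools import wraps
from typing import Callable, List

def dependency_exists(dependency: str) -> bool:
    # True when the optional module can be imported in this environment.
    try:
        importlib.import_module(dependency)
        return True
    except ImportError:
        return False

def requires_dependencies(dependencies: List[str]) -> Callable:
    # Decorator that defers the ImportError from import time to call time,
    # so the base install can import the module without the optional deps.
    def decorator(func: Callable) -> Callable:
        @wraps(func)
        def wrapper(*args, **kwargs):
            missing = [dep for dep in dependencies if not dependency_exists(dep)]
            if missing:
                raise ImportError(
                    f"{func.__name__} requires optional dependencies {missing}; "
                    'install the matching extra, e.g. pip install "unstructured[all-docs]".'
                )
            return func(*args, **kwargs)
        return wrapper
    return decorator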

View File

@ -76,7 +76,7 @@ class SimpleGoogleDriveConfig(BaseConnectorConfig):
recursive: bool = False
def __post_init__(self):
if self.extension and self.extension not in EXT_TO_FILETYPE.keys():
if self.extension and self.extension not in EXT_TO_FILETYPE:
raise ValueError(
f"Extension not supported. "
f"Value MUST be one of {', '.join([k for k in EXT_TO_FILETYPE if k is not None])}.",

View File

@ -65,7 +65,7 @@ class OneDriveIngestDoc(IngestDocCleanupMixin, BaseIngestDoc):
if not self.ext:
raise ValueError("Unsupported file without extension.")
if self.ext not in EXT_TO_FILETYPE.keys():
if self.ext not in EXT_TO_FILETYPE:
raise ValueError(
f"Extension not supported. "
f"Value MUST be one of {', '.join([k for k in EXT_TO_FILETYPE if k is not None])}.",

View File

@ -13,27 +13,58 @@ from unstructured.file_utils.filetype import (
)
from unstructured.logger import logger
from unstructured.partition.common import exactly_one
from unstructured.partition.csv import partition_csv
from unstructured.partition.doc import partition_doc
from unstructured.partition.docx import partition_docx
from unstructured.partition.email import partition_email
from unstructured.partition.epub import partition_epub
from unstructured.partition.html import partition_html
from unstructured.partition.image import partition_image
from unstructured.partition.json import partition_json
from unstructured.partition.md import partition_md
from unstructured.partition.msg import partition_msg
from unstructured.partition.odt import partition_odt
from unstructured.partition.org import partition_org
from unstructured.partition.pdf import partition_pdf
from unstructured.partition.ppt import partition_ppt
from unstructured.partition.pptx import partition_pptx
from unstructured.partition.rst import partition_rst
from unstructured.partition.rtf import partition_rtf
from unstructured.partition.text import partition_text
from unstructured.partition.tsv import partition_tsv
from unstructured.partition.xlsx import partition_xlsx
from unstructured.partition.xml import partition_xml
from unstructured.utils import dependency_exists
if dependency_exists("pandas"):
from unstructured.partition.csv import partition_csv
from unstructured.partition.tsv import partition_tsv
if dependency_exists("docx"):
from unstructured.partition.doc import partition_doc
from unstructured.partition.docx import partition_docx
if dependency_exists("docx") and dependency_exists("pypandoc"):
from unstructured.partition.odt import partition_odt
if dependency_exists("pypandoc"):
from unstructured.partition.epub import partition_epub
from unstructured.partition.org import partition_org
from unstructured.partition.rst import partition_rst
from unstructured.partition.rtf import partition_rtf
if dependency_exists("markdown"):
from unstructured.partition.md import partition_md
if dependency_exists("msg_parser"):
from unstructured.partition.msg import partition_msg
pdf_imports = ["pdf2image", "pdfminer", "PIL"]
if all(dependency_exists(dep) for dep in pdf_imports):
from unstructured.partition.pdf import partition_pdf
if dependency_exists("unstructured_inference"):
from unstructured.partition.image import partition_image
if dependency_exists("pptx"):
from unstructured.partition.ppt import partition_ppt
from unstructured.partition.pptx import partition_pptx
if dependency_exists("pandas") and dependency_exists("openpyxl"):
from unstructured.partition.xlsx import partition_xlsx
def partition(

View File

@ -7,7 +7,6 @@ from io import BufferedReader, BytesIO, TextIOWrapper
from tempfile import SpooledTemporaryFile
from typing import IO, TYPE_CHECKING, Any, BinaryIO, Dict, List, Optional, Tuple, Union
from docx import table as docxtable
from tabulate import tabulate
from unstructured.documents.coordinates import CoordinateSystem
@ -23,6 +22,10 @@ from unstructured.documents.elements import (
)
from unstructured.logger import logger
from unstructured.nlp.patterns import ENUMERATED_BULLETS_RE, UNICODE_BULLETS_RE
from unstructured.utils import dependency_exists
if dependency_exists("docx"):
import docx.table as docxtable
if TYPE_CHECKING:
from unstructured_inference.inference.layoutelement import (
@ -303,12 +306,12 @@ def convert_to_bytes(
return f_bytes
def convert_ms_office_table_to_text(table: docxtable.Table, as_html: bool = True):
def convert_ms_office_table_to_text(table: "docxtable.Table", as_html: bool = True) -> str:
"""
Convert a table object from a Word document to an HTML table string using the tabulate library.
Args:
table (Table): A Table object.
table (Table): A docx.table.Table object.
as_html (bool): Whether to return the table as an HTML string (True) or a
plain text string (False)
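For reference, a short usage sketch of the function documented above. The import path is assumed (this hunk appears to be unstructured/partition/common.py) and the table contents are made up.

from docx import Document

# Import path assumed for this illustration.
from unstructured.partition.common import convert_ms_office_table_to_text

document = Document()
table = document.add_table(rows=2, cols=2)
table.cell(0, 0).text = "Name"
table.cell(0, 1).text = "Type"
table.cell(1, 0).text = "report.docx"
table.cell(1, 1).text = "DOCX"

# Per the docstring: as_html=True returns an HTML table string,
# as_html=False returns a plain text rendering of the same table.
print(convert_ms_office_table_to_text(table, as_html=True))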

View File

@ -4,7 +4,6 @@ from tempfile import SpooledTemporaryFile
from typing import IO, BinaryIO, List, Optional, Tuple, Union, cast
import docx
import pypandoc
from docx.oxml.shared import qn
from docx.text.paragraph import Paragraph
from docx.text.run import Run
@ -38,6 +37,10 @@ from unstructured.partition.text_type import (
is_possible_title,
is_us_city_state_zip,
)
from unstructured.utils import dependency_exists
if dependency_exists("pypandoc"):
import pypandoc
# NOTE(robinson) - documentation on built in styles can be found at the link below
# ref: https://python-docx.readthedocs.io/en/latest/user/
@ -314,7 +317,7 @@ def convert_and_partition_docx(
metadata_filename: Optional[str] = None,
metadata_last_modified: Optional[str] = None,
) -> List[Element]:
"""Converts a document to DOCX and then partitions it using partition_html. Works with
"""Converts a document to DOCX and then partitions it using partition_docx. Works with
any file format supported by pandoc.
Parameters

View File

@ -3,8 +3,6 @@ import io
import json
from typing import Any, Dict, List, Optional
import pandas as pd
from unstructured.documents.elements import (
TYPE_TO_TEXT_ELEMENT_MAP,
CheckBox,
@ -13,6 +11,10 @@ from unstructured.documents.elements import (
NoID,
)
from unstructured.partition.common import exactly_one
from unstructured.utils import dependency_exists, requires_dependencies
if dependency_exists("pandas"):
import pandas as pd
def _get_metadata_table_fieldnames():
@ -161,7 +163,7 @@ def convert_to_isd_csv(elements: List[Element]) -> str:
if row.get("sent_from"):
row["sender"] = row.get("sent_from")
if type(row["sender"]) == list:
if isinstance(row["sender"], list):
row["sender"] = row["sender"][0]
with io.StringIO() as buffer:
@ -176,11 +178,14 @@ def convert_to_csv(elements: List[Element]) -> str:
return convert_to_isd_csv(elements)
def convert_to_dataframe(elements: List[Element], drop_empty_cols: bool = True) -> pd.DataFrame:
@requires_dependencies(["pandas"])
def convert_to_dataframe(elements: List[Element], drop_empty_cols: bool = True) -> "pd.DataFrame":
"""Converts document elements to a pandas DataFrame. The dataframe contains the
following columns:
text: the element text
type: the text type (NarrativeText, Title, etc)
Output is pd.DataFrame
"""
csv_string = convert_to_isd_csv(elements)
csv_string_io = io.StringIO(csv_string)
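As a usage note (not part of the diff), the decorated convert_to_dataframe keeps its call signature, so downstream code along the lines of the sketch below should keep working once pandas is installed via an extra such as csv or xlsx. The module path and the input file name are assumptions.

from unstructured.partition.text import partition_text
from unstructured.staging.base import convert_to_dataframe  # module path assumed

elements = partition_text(filename="example.txt")  # hypothetical input file
df = convert_to_dataframe(elements)
print(df[["type", "text"]].head())  # columns documented in the docstring above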