build(deps): split up dependencies by document type (#986)

* split dependencies by document type

* make pip-compile with new requirements

* add extra requirements to setup.py

* add in all docs; re pip-compile

* extra for all docs

* add pandas to xlsx

* dependency requirements for tsv and csv

* handling for doc, docx and odt

* dependency check for pypandoc

* required dependencies for pandoc files

* xml and html

* markdown

* msg

* add in pdf

* add in pptx

* add in excel

* add lxml as base req

* extra all docs for local inference

* local inference installs all

* pin pillow version

* fixes for plain text tests

* fixes for doc

* update make commands

* changelog and version

* add xlrd

* update pip-compile

* pin numpy for python 3.8 support

* more constraints

* constraint on scipy

* update install docs

* constrain ipython

* add outlook to pip-compile

* more ipython constraints

* add extras to dockerfile

* pin office365 client

* few doc tweaks

* types as strings

* last pip-compile

* re pip-compile

* make tidy

* make tidy
Matt Robinson 2023-08-01 11:31:13 -04:00 committed by GitHub
parent 13d3559fa4
commit 331c7faf38
59 changed files with 508 additions and 353 deletions

View File

@ -1,3 +1,9 @@
## 0.9.0
### Enhancements
* Dependencies are now split by document type, creating a slimmer base installation.
## 0.8.8
### Enhancements
@ -6,6 +12,7 @@
### Fixes
* Rename "date" field to "last_modified"
* Adds Box connector

View File

@ -30,7 +30,15 @@ RUN python3.8 -m pip install pip==${PIP_VERSION} && \
pip install --no-cache -r requirements/ingest-s3.txt && \
pip install --no-cache -r requirements/ingest-slack.txt && \
pip install --no-cache -r requirements/ingest-wikipedia.txt && \
pip install --no-cache -r requirements/local-inference.txt && \
pip install --no-cache -r requirements/extra-csv.txt && \
pip install --no-cache -r requirements/extra-docx.txt && \
pip install --no-cache -r requirements/extra-markdown.txt && \
pip install --no-cache -r requirements/extra-msg.txt && \
pip install --no-cache -r requirements/extra-odt.txt && \
pip install --no-cache -r requirements/extra-pandoc.txt && \
pip install --no-cache -r requirements/extra-pdf-image.txt && \
pip install --no-cache -r requirements/extra-pptx.txt && \
pip install --no-cache -r requirements/extra-xlsx.txt && \
dnf -y groupremove "Development Tools" && \
dnf clean all

View File

@ -18,10 +18,10 @@ install-base: install-base-pip-packages install-nltk-models
## install: installs all test, dev, and experimental requirements
.PHONY: install
install: install-base-pip-packages install-dev install-nltk-models install-test install-huggingface install-unstructured-inference
install: install-base-pip-packages install-dev install-nltk-models install-test install-huggingface install-all-docs
.PHONY: install-ci
install-ci: install-base-pip-packages install-nltk-models install-huggingface install-unstructured-inference install-test
install-ci: install-base-pip-packages install-nltk-models install-huggingface install-all-docs install-test
.PHONY: install-base-pip-packages
install-base-pip-packages:
@ -53,6 +53,45 @@ install-dev:
install-build:
python3 -m pip install -r requirements/build.txt
.PHONY: install-csv
install-csv:
python3 -m pip install -r requirements/extra-csv.txt
.PHONY: install-docx
install-docx:
python3 -m pip install -r requirements/extra-docx.txt
.PHONY: install-odt
install-odt:
python3 -m pip install -r requirements/extra-odt.txt
.PHONY: install-pypandoc
install-pypandoc:
python3 -m pip install -r requirements/extra-pandoc.txt
.PHONY: install-markdown
install-markdown:
python3 -m pip install -r requirements/extra-markdown.txt
.PHONY: install-msg
install-msg:
python3 -m pip install -r requirements/extra-msg.txt
.PHONY: install-pdf-image
install-pdf-image:
python3 -m pip install -r requirements/extra-pdf-image.txt
.PHONY: install-pptx
install-pptx:
python3 -m pip install -r requirements/extra-pptx.txt
.PHONY: install-xlsx
install-xlsx:
python3 -m pip install -r requirements/extra-xlsx.txt
.PHONY: install-all-docs
install-all-docs: install-base install-csv install-docx install-odt install-pypandoc install-markdown install-msg install-pdf-image install-pptx install-xlsx
.PHONY: install-ingest-google-drive
install-ingest-google-drive:
python3 -m pip install -r requirements/ingest-google-drive.txt
@ -124,7 +163,7 @@ install-unstructured-inference:
## install-local-inference: installs requirements for local inference
.PHONY: install-local-inference
install-local-inference: install install-unstructured-inference
install-local-inference: install install-all-docs
.PHONY: install-pandoc
install-pandoc:
@ -135,12 +174,23 @@ install-pandoc:
.PHONY: pip-compile
pip-compile:
pip-compile --upgrade requirements/base.in
# Extra requirements that are specific to document types
pip-compile --upgrade requirements/extra-csv.in
pip-compile --upgrade requirements/extra-docx.in
pip-compile --upgrade requirements/extra-pandoc.in
pip-compile --upgrade requirements/extra-markdown.in
pip-compile --upgrade requirements/extra-msg.in
pip-compile --upgrade requirements/extra-odt.in
pip-compile --upgrade requirements/extra-pdf-image.in
pip-compile --upgrade requirements/extra-pptx.in
pip-compile --upgrade requirements/extra-xlsx.in
# Extra requirements for huggingface staging functions
pip-compile --upgrade requirements/huggingface.in
pip-compile --upgrade requirements/test.in
pip-compile --upgrade requirements/dev.in
pip-compile --upgrade requirements/build.in
pip-compile --upgrade requirements/local-inference.in
# NOTE(robinson) - doc/requirements.txt is where the GitHub action for building
# sphinx docs looks for additional requirements
cp requirements/build.txt docs/requirements.txt
@ -158,6 +208,7 @@ pip-compile:
pip-compile --upgrade requirements/ingest-google-drive.in
pip-compile --upgrade requirements/ingest-elasticsearch.in
pip-compile --upgrade requirements/ingest-onedrive.in
pip-compile --upgrade requirements/ingest-outlook.in
pip-compile --upgrade requirements/ingest-confluence.in
## install-project-local: install unstructured into your local python environment

View File

@ -1,5 +1,5 @@
#
# This file is autogenerated by pip-compile with Python 3.8
# This file is autogenerated by pip-compile with Python 3.11
# by the following command:
#
# pip-compile requirements/build.in
@ -12,10 +12,14 @@ beautifulsoup4==4.12.2
# via furo
certifi==2023.7.22
# via
# -c requirements/base.txt
# -c requirements/constraints.in
# -r requirements/build.in
# requests
charset-normalizer==3.2.0
# via requests
# via
# -c requirements/base.txt
# requests
docutils==0.18.1
# via
# sphinx
@ -23,11 +27,11 @@ docutils==0.18.1
furo==2023.7.26
# via -r requirements/build.in
idna==3.4
# via requests
# via
# -c requirements/base.txt
# requests
imagesize==1.4.1
# via sphinx
importlib-metadata==6.8.0
# via sphinx
jinja2==3.1.2
# via sphinx
markupsafe==2.1.3
@ -38,10 +42,10 @@ pygments==2.15.1
# via
# furo
# sphinx
pytz==2023.3
# via babel
requests==2.31.0
# via sphinx
# via
# -c requirements/base.txt
# sphinx
snowballstemmer==2.2.0
# via sphinx
soupsieve==2.4.1
@ -71,7 +75,8 @@ sphinxcontrib-qthelp==1.0.3
# via sphinx
sphinxcontrib-serializinghtml==1.1.5
# via sphinx
urllib3==2.0.4
# via requests
zipp==3.16.2
# via importlib-metadata
urllib3==1.26.16
# via
# -c requirements/base.txt
# -c requirements/constraints.in
# requests

View File

@ -7,8 +7,15 @@ Quick Start
Use the following instructions to get up and running with ``unstructured`` and test your
installation.
* Install the Python SDK with ``pip install "unstructured[local-inference]"``
* If you do not need to process PDFs or images, you can run ``pip install unstructured``
* Install the Python SDK with ``pip install unstructured``
* Plain text files, HTML, XML, JSON, and emails do not require any extra dependencies.
* If you need to process other document types, you can install the extras required for those documents
with ``pip install "unstructured[docx,pptx]"``.
* To install the extras for every document type, use ``pip install "unstructured[all-docs]"``.
* For ``unstructured<0.9.0``, you can install the extras for all document types with
``pip install "unstructured[local-inference]"``. The ``local-inference`` extra is still
supported in newer versions for backward compatibility, but may be deprecated in a future version.
The ``all-docs`` extra is the officially supported installation pattern.
* Install the following system dependencies if they are not already available on your system. Depending on what document types you're parsing, you may not need all of these.
* ``libmagic-dev`` (filetype detection)
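Beyond the system packages, a quick way to confirm which optional Python dependencies ended up in your environment is a check like the following. This is an illustrative sketch, not part of this change; the mapping of document types to module names simply mirrors the dependency_exists guards added around the partition imports later in this commit.

import importlib.util

# Illustrative mapping of document types to the optional modules their
# partitioners import; adjust to match the extras you actually installed.
OPTIONAL_DEPS = {
    "csv/tsv/xlsx": ["pandas", "openpyxl"],
    "doc/docx/odt": ["docx", "pypandoc"],
    "md": ["markdown"],
    "msg": ["msg_parser"],
    "pdf/image": ["pdf2image", "pdfminer", "PIL", "unstructured_inference"],
    "pptx": ["pptx"],
}

for doc_type, modules in OPTIONAL_DEPS.items():
    missing = [m for m in modules if importlib.util.find_spec(m) is None]
    print(f"{doc_type}: {'ok' if not missing else 'missing ' + ', '.join(missing)}")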

View File

@ -1,19 +1,8 @@
-c "constraints.in"
chardet
filetype
lxml
msg_parser
nltk
openpyxl
pandas
pdf2image
pdfminer.six
pillow
pypandoc
python-docx
python-pptx
python-magic
markdown
requests
lxml
nltk
tabulate
xlrd
requests

View File

@ -1,5 +1,5 @@
#
# This file is autogenerated by pip-compile with Python 3.8
# This file is autogenerated by pip-compile with Python 3.11
# by the following command:
#
# pip-compile requirements/base.in
@ -8,89 +8,33 @@ certifi==2023.7.22
# via
# -c requirements/constraints.in
# requests
cffi==1.15.1
# via cryptography
chardet==5.1.0
# via -r requirements/base.in
charset-normalizer==3.2.0
# via
# pdfminer-six
# requests
# via requests
click==8.1.6
# via nltk
cryptography==41.0.2
# via pdfminer-six
et-xmlfile==1.1.0
# via openpyxl
filetype==1.2.0
# via -r requirements/base.in
idna==3.4
# via requests
importlib-metadata==6.8.0
# via markdown
joblib==1.3.1
# via nltk
lxml==4.9.3
# via
# -r requirements/base.in
# python-docx
# python-pptx
markdown==3.4.4
# via -r requirements/base.in
msg-parser==1.2.0
# via -r requirements/base.in
nltk==3.8.1
# via -r requirements/base.in
numpy==1.24.4
# via pandas
olefile==0.46
# via msg-parser
openpyxl==3.1.2
# via -r requirements/base.in
pandas==2.0.3
# via -r requirements/base.in
pdf2image==1.16.3
# via -r requirements/base.in
pdfminer-six==20221105
# via -r requirements/base.in
pillow==10.0.0
# via
# -r requirements/base.in
# pdf2image
# python-pptx
pycparser==2.21
# via cffi
pypandoc==1.11
# via -r requirements/base.in
python-dateutil==2.8.2
# via pandas
python-docx==0.8.11
# via -r requirements/base.in
python-magic==0.4.27
# via -r requirements/base.in
python-pptx==0.6.21
# via -r requirements/base.in
pytz==2023.3
# via pandas
regex==2023.6.3
# via nltk
requests==2.31.0
# via -r requirements/base.in
six==1.16.0
# via python-dateutil
tabulate==0.9.0
# via -r requirements/base.in
tqdm==4.65.0
# via nltk
tzdata==2023.3
# via pandas
urllib3==1.26.16
# via
# -c requirements/constraints.in
# requests
xlrd==2.0.1
# via -r requirements/base.in
xlsxwriter==3.1.2
# via python-pptx
zipp==3.16.2
# via importlib-metadata

View File

@ -1,3 +1,6 @@
-c base.txt
-c constraints.in
sphinx
# NOTE(alan) - Pinning to resolve a conflict with sphinx. We can unpin on next sphinx_rtd_theme release.
sphinx_rtd_theme==1.2.2

View File

@ -1,5 +1,5 @@
#
# This file is autogenerated by pip-compile with Python 3.8
# This file is autogenerated by pip-compile with Python 3.11
# by the following command:
#
# pip-compile requirements/build.in
@ -12,10 +12,14 @@ beautifulsoup4==4.12.2
# via furo
certifi==2023.7.22
# via
# -c requirements/base.txt
# -c requirements/constraints.in
# -r requirements/build.in
# requests
charset-normalizer==3.2.0
# via requests
# via
# -c requirements/base.txt
# requests
docutils==0.18.1
# via
# sphinx
@ -23,11 +27,11 @@ docutils==0.18.1
furo==2023.7.26
# via -r requirements/build.in
idna==3.4
# via requests
# via
# -c requirements/base.txt
# requests
imagesize==1.4.1
# via sphinx
importlib-metadata==6.8.0
# via sphinx
jinja2==3.1.2
# via sphinx
markupsafe==2.1.3
@ -38,10 +42,10 @@ pygments==2.15.1
# via
# furo
# sphinx
pytz==2023.3
# via babel
requests==2.31.0
# via sphinx
# via
# -c requirements/base.txt
# sphinx
snowballstemmer==2.2.0
# via sphinx
soupsieve==2.4.1
@ -71,7 +75,8 @@ sphinxcontrib-qthelp==1.0.3
# via sphinx
sphinxcontrib-serializinghtml==1.1.5
# via sphinx
urllib3==2.0.4
# via requests
zipp==3.16.2
# via importlib-metadata
urllib3==1.26.16
# via
# -c requirements/base.txt
# -c requirements/constraints.in
# requests

View File

@ -12,6 +12,13 @@ jupyter-core>=4.11.2
wheel>=0.38.1
# NOTE(robinson) - The following pins are to address
# vulnerabilities in dependency scans
certifi>=2022.12.07
certifi>=2023.7.22
# From pycocotools in local-inference
pyparsing<3.1.0
# NOTE(robinson) - Numpy dropped Python 3.8 support in 1.25.0
numpy<1.25.0
scipy<1.11.0
IPython<8.13
# NOTE(robinson) - See this issue here
# https://github.com/facebookresearch/detectron2/issues/5010
Pillow<10.0.0

View File

@ -1,5 +1,5 @@
#
# This file is autogenerated by pip-compile with Python 3.8
# This file is autogenerated by pip-compile with Python 3.11
# by the following command:
#
# pip-compile requirements/dev.in
@ -41,9 +41,7 @@ certifi==2023.7.22
# -c requirements/test.txt
# requests
cffi==1.15.1
# via
# -c requirements/base.txt
# argon2-cffi-bindings
# via argon2-cffi-bindings
cfgv==3.3.1
# via pre-commit
charset-normalizer==3.2.0
@ -57,7 +55,9 @@ click==8.1.6
# -c requirements/test.txt
# pip-tools
comm==0.1.3
# via ipykernel
# via
# ipykernel
# ipywidgets
debugpy==1.6.7
# via ipykernel
decorator==5.1.1
@ -66,10 +66,6 @@ defusedxml==0.7.1
# via nbconvert
distlib==0.3.7
# via virtualenv
exceptiongroup==1.1.2
# via
# -c requirements/test.txt
# anyio
executing==1.2.0
# via stack-data
fastjsonschema==2.18.0
@ -87,40 +83,26 @@ idna==3.4
# anyio
# jsonschema
# requests
importlib-metadata==6.8.0
# via
# -c requirements/base.txt
# jupyter-client
# jupyter-lsp
# jupyterlab
# jupyterlab-server
# nbconvert
importlib-resources==6.0.0
# via
# jsonschema
# jsonschema-specifications
# jupyterlab
# notebook
ipykernel==6.25.0
# via
# ipywidgets
# jupyter
# jupyter-console
# jupyterlab
# qtconsole
ipython==8.12.2
# via
# -c requirements/constraints.in
# -r requirements/dev.in
# ipykernel
# ipywidgets
# jupyter-console
ipython-genutils==0.2.0
# via qtconsole
ipywidgets==8.0.7
ipywidgets==8.1.0
# via jupyter
isoduration==20.11.0
# via jsonschema
jedi==0.18.2
jedi==0.19.0
# via ipython
jinja2==3.1.2
# via
@ -162,7 +144,7 @@ jupyter-core==5.3.1
# nbconvert
# nbformat
# qtconsole
jupyter-events==0.6.3
jupyter-events==0.7.0
# via jupyter-server
jupyter-lsp==2.2.0
# via jupyterlab
@ -201,16 +183,16 @@ nbconvert==7.7.3
# via
# jupyter
# jupyter-server
nbformat==5.9.1
nbformat==5.9.2
# via
# jupyter-server
# nbclient
# nbconvert
nest-asyncio==1.5.6
nest-asyncio==1.5.7
# via ipykernel
nodeenv==1.8.0
# via pre-commit
notebook==7.0.0
notebook==7.0.1
# via jupyter
notebook-shim==0.2.3
# via
@ -239,9 +221,7 @@ pickleshare==0.7.5
# via ipython
pip-tools==7.1.0
# via -r requirements/dev.in
pkgutil-resolve-name==1.3.10
# via jsonschema
platformdirs==3.9.1
platformdirs==3.10.0
# via
# -c requirements/test.txt
# jupyter-core
@ -263,9 +243,7 @@ ptyprocess==0.7.0
pure-eval==0.2.2
# via stack-data
pycparser==2.21
# via
# -c requirements/base.txt
# cffi
# via cffi
pygments==2.15.1
# via
# ipython
@ -276,16 +254,11 @@ pyproject-hooks==1.0.0
# via build
python-dateutil==2.8.2
# via
# -c requirements/base.txt
# -c requirements/test.txt
# arrow
# jupyter-client
python-json-logger==2.0.7
# via jupyter-events
pytz==2023.3
# via
# -c requirements/base.txt
# babel
pyyaml==6.0.1
# via
# -c requirements/test.txt
@ -306,6 +279,7 @@ referencing==0.30.0
# via
# jsonschema
# jsonschema-specifications
# jupyter-events
requests==2.31.0
# via
# -c requirements/base.txt
@ -327,7 +301,6 @@ send2trash==1.8.2
# via jupyter-server
six==1.16.0
# via
# -c requirements/base.txt
# -c requirements/test.txt
# asttokens
# bleach
@ -345,13 +318,6 @@ terminado==0.17.1
# jupyter-server-terminals
tinycss2==1.2.1
# via nbconvert
tomli==2.0.1
# via
# -c requirements/test.txt
# build
# jupyterlab
# pip-tools
# pyproject-hooks
tornado==6.3.2
# via
# ipykernel
@ -377,11 +343,6 @@ traitlets==5.9.0
# nbconvert
# nbformat
# qtconsole
typing-extensions==4.7.1
# via
# -c requirements/test.txt
# async-lru
# ipython
uri-template==1.3.0
# via jsonschema
urllib3==1.26.16
@ -408,11 +369,6 @@ wheel==0.41.0
# pip-tools
widgetsnbextension==4.0.8
# via ipywidgets
zipp==3.16.2
# via
# -c requirements/base.txt
# importlib-metadata
# importlib-resources
# The following packages are considered to be unsafe in a requirements file:
# pip

View File

@ -1,3 +1,4 @@
-c constraints.in
-c base.txt
unstructured-inference==0.5.7
pandas

View File

@ -0,0 +1,20 @@
#
# This file is autogenerated by pip-compile with Python 3.11
# by the following command:
#
# pip-compile requirements/extra-csv.in
#
numpy==1.24.4
# via
# -c requirements/constraints.in
# pandas
pandas==2.0.3
# via -r requirements/extra-csv.in
python-dateutil==2.8.2
# via pandas
pytz==2023.3
# via pandas
six==1.16.0
# via python-dateutil
tzdata==2023.3
# via pandas

View File

@ -0,0 +1,4 @@
-c constraints.in
-c base.txt
python-docx

View File

@ -0,0 +1,12 @@
#
# This file is autogenerated by pip-compile with Python 3.11
# by the following command:
#
# pip-compile requirements/extra-docx.in
#
lxml==4.9.3
# via
# -c requirements/base.txt
# python-docx
python-docx==0.8.11
# via -r requirements/extra-docx.in

View File

@ -0,0 +1,4 @@
-c "constraints.in"
-c "base.txt"
markdown

View File

@ -0,0 +1,8 @@
#
# This file is autogenerated by pip-compile with Python 3.11
# by the following command:
#
# pip-compile requirements/extra-markdown.in
#
markdown==3.4.4
# via -r requirements/extra-markdown.in

View File

@ -0,0 +1,4 @@
-c constraints.in
-c base.txt
msg_parser

View File

@ -0,0 +1,10 @@
#
# This file is autogenerated by pip-compile with Python 3.11
# by the following command:
#
# pip-compile requirements/extra-msg.in
#
msg-parser==1.2.0
# via -r requirements/extra-msg.in
olefile==0.46
# via msg-parser

View File

@ -0,0 +1,5 @@
-c constraints.in
-c base.txt
python-docx
pypandoc

View File

@ -0,0 +1,14 @@
#
# This file is autogenerated by pip-compile with Python 3.11
# by the following command:
#
# pip-compile requirements/extra-odt.in
#
lxml==4.9.3
# via
# -c requirements/base.txt
# python-docx
pypandoc==1.11
# via -r requirements/extra-odt.in
python-docx==0.8.11
# via -r requirements/extra-odt.in

View File

@ -0,0 +1,4 @@
-c constraints.in
-c base.txt
pypandoc

View File

@ -0,0 +1,8 @@
#
# This file is autogenerated by pip-compile with Python 3.11
# by the following command:
#
# pip-compile requirements/extra-pandoc.in
#
pypandoc==1.11
# via -r requirements/extra-pandoc.in

View File

@ -0,0 +1,9 @@
-c constraints.in
-c base.txt
pdf2image
pdfminer.six
# NOTE(robinson) - See this issue here
# https://github.com/facebookresearch/detectron2/issues/5010
Pillow<10
unstructured-inference==0.5.7

View File

@ -1,8 +1,8 @@
#
# This file is autogenerated by pip-compile with Python 3.8
# This file is autogenerated by pip-compile with Python 3.11
# by the following command:
#
# pip-compile requirements/local-inference.in
# pip-compile requirements/extra-pdf-image.in
#
antlr4-python3-runtime==4.9.3
# via omegaconf
@ -12,9 +12,7 @@ certifi==2023.7.22
# -c requirements/constraints.in
# requests
cffi==1.15.1
# via
# -c requirements/base.txt
# cryptography
# via cryptography
charset-normalizer==3.2.0
# via
# -c requirements/base.txt
@ -25,9 +23,7 @@ coloredlogs==15.0.1
contourpy==1.1.0
# via matplotlib
cryptography==41.0.2
# via
# -c requirements/base.txt
# pdfminer-six
# via pdfminer-six
cycler==0.11.0
# via matplotlib
effdet==0.4.1
@ -54,8 +50,6 @@ idna==3.4
# via
# -c requirements/base.txt
# requests
importlib-resources==6.0.0
# via matplotlib
iopath==0.1.10
# via layoutparser
jinja2==3.1.2
@ -74,7 +68,7 @@ networkx==3.1
# via torch
numpy==1.24.4
# via
# -c requirements/base.txt
# -c requirements/constraints.in
# contourpy
# layoutparser
# matplotlib
@ -101,22 +95,21 @@ packaging==23.1
# pytesseract
# transformers
pandas==2.0.3
# via
# -c requirements/base.txt
# layoutparser
# via layoutparser
pdf2image==1.16.3
# via
# -c requirements/base.txt
# -r requirements/extra-pdf-image.in
# layoutparser
pdfminer-six==20221105
# via
# -c requirements/base.txt
# -r requirements/extra-pdf-image.in
# pdfplumber
pdfplumber==0.10.1
pdfplumber==0.10.2
# via layoutparser
pillow==10.0.0
pillow==9.5.0
# via
# -c requirements/base.txt
# -c requirements/constraints.in
# -r requirements/extra-pdf-image.in
# layoutparser
# matplotlib
# pdf2image
@ -132,9 +125,7 @@ protobuf==4.23.4
pycocotools==2.0.6
# via effdet
pycparser==2.21
# via
# -c requirements/base.txt
# cffi
# via cffi
pyparsing==3.0.9
# via
# -c requirements/constraints.in
@ -145,15 +136,12 @@ pytesseract==0.3.10
# via layoutparser
python-dateutil==2.8.2
# via
# -c requirements/base.txt
# matplotlib
# pandas
python-multipart==0.0.6
# via unstructured-inference
pytz==2023.3
# via
# -c requirements/base.txt
# pandas
# via pandas
pyyaml==6.0.1
# via
# huggingface-hub
@ -176,11 +164,11 @@ safetensors==0.3.1
# timm
# transformers
scipy==1.10.1
# via layoutparser
six==1.16.0
# via
# -c requirements/base.txt
# python-dateutil
# -c requirements/constraints.in
# layoutparser
six==1.16.0
# via python-dateutil
sympy==1.12
# via
# onnxruntime
@ -214,17 +202,11 @@ typing-extensions==4.7.1
# iopath
# torch
tzdata==2023.3
# via
# -c requirements/base.txt
# pandas
# via pandas
unstructured-inference==0.5.7
# via -r requirements/local-inference.in
# via -r requirements/extra-pdf-image.in
urllib3==1.26.16
# via
# -c requirements/base.txt
# -c requirements/constraints.in
# requests
zipp==3.16.2
# via
# -c requirements/base.txt
# importlib-resources

View File

@ -0,0 +1,3 @@
-c "constraints.in"
python-pptx

View File

@ -0,0 +1,16 @@
#
# This file is autogenerated by pip-compile with Python 3.11
# by the following command:
#
# pip-compile requirements/extra-pptx.in
#
lxml==4.9.3
# via python-pptx
pillow==9.5.0
# via
# -c requirements/constraints.in
# python-pptx
python-pptx==0.6.21
# via -r requirements/extra-pptx.in
xlsxwriter==3.1.2
# via python-pptx

View File

@ -0,0 +1,6 @@
-c constraints.in
-c base.txt
openpyxl
pandas
xlrd

View File

@ -0,0 +1,26 @@
#
# This file is autogenerated by pip-compile with Python 3.11
# by the following command:
#
# pip-compile requirements/extra-xlsx.in
#
et-xmlfile==1.1.0
# via openpyxl
numpy==1.24.4
# via
# -c requirements/constraints.in
# pandas
openpyxl==3.1.2
# via -r requirements/extra-xlsx.in
pandas==2.0.3
# via -r requirements/extra-xlsx.in
python-dateutil==2.8.2
# via pandas
pytz==2023.3
# via pandas
six==1.16.0
# via python-dateutil
tzdata==2023.3
# via pandas
xlrd==2.0.1
# via -r requirements/extra-xlsx.in

View File

@ -1,5 +1,5 @@
#
# This file is autogenerated by pip-compile with Python 3.8
# This file is autogenerated by pip-compile with Python 3.11
# by the following command:
#
# pip-compile requirements/huggingface.in
@ -46,7 +46,7 @@ networkx==3.1
# via torch
numpy==1.24.4
# via
# -c requirements/base.txt
# -c requirements/constraints.in
# transformers
packaging==23.1
# via
@ -74,7 +74,6 @@ sentencepiece==0.1.99
# via -r requirements/huggingface.in
six==1.16.0
# via
# -c requirements/base.txt
# langdetect
# sacremoses
sympy==1.12

View File

@ -1,5 +1,5 @@
#
# This file is autogenerated by pip-compile with Python 3.8
# This file is autogenerated by pip-compile with Python 3.11
# by the following command:
#
# pip-compile requirements/ingest-azure.in
@ -32,7 +32,6 @@ certifi==2023.7.22
# requests
cffi==1.15.1
# via
# -c requirements/base.txt
# azure-datalake-store
# cryptography
charset-normalizer==3.2.0
@ -42,7 +41,6 @@ charset-normalizer==3.2.0
# requests
cryptography==41.0.2
# via
# -c requirements/base.txt
# azure-identity
# azure-storage-blob
# msal
@ -76,9 +74,7 @@ multidict==6.0.4
portalocker==2.7.0
# via msal-extensions
pycparser==2.21
# via
# -c requirements/base.txt
# cffi
# via cffi
pyjwt[crypto]==2.8.0
# via msal
requests==2.31.0
@ -89,7 +85,6 @@ requests==2.31.0
# msal
six==1.16.0
# via
# -c requirements/base.txt
# azure-core
# azure-identity
# isodate

View File

@ -8,7 +8,7 @@ attrs==23.1.0
# via boxsdk
boxfs==0.2.0
# via -r requirements/ingest-box.in
boxsdk[jwt]==3.8.0
boxsdk[jwt]==3.8.1
# via boxfs
certifi==2023.7.22
# via
@ -16,17 +16,13 @@ certifi==2023.7.22
# -c requirements/constraints.in
# requests
cffi==1.15.1
# via
# -c requirements/base.txt
# cryptography
# via cryptography
charset-normalizer==3.2.0
# via
# -c requirements/base.txt
# requests
cryptography==41.0.2
# via
# -c requirements/base.txt
# boxsdk
# via boxsdk
fsspec==2023.6.0
# via
# -r requirements/ingest-box.in
@ -36,15 +32,11 @@ idna==3.4
# -c requirements/base.txt
# requests
pycparser==2.21
# via
# -c requirements/base.txt
# cffi
# via cffi
pyjwt==2.8.0
# via boxsdk
python-dateutil==2.8.2
# via
# -c requirements/base.txt
# boxsdk
# via boxsdk
requests==2.31.0
# via
# -c requirements/base.txt
@ -53,9 +45,7 @@ requests==2.31.0
requests-toolbelt==1.0.0
# via boxsdk
six==1.16.0
# via
# -c requirements/base.txt
# python-dateutil
# via python-dateutil
urllib3==1.26.16
# via
# -c requirements/base.txt

View File

@ -1,10 +1,10 @@
#
# This file is autogenerated by pip-compile with Python 3.8
# This file is autogenerated by pip-compile with Python 3.11
# by the following command:
#
# pip-compile requirements/ingest-confluence.in
#
atlassian-python-api==3.39.0
atlassian-python-api==3.40.0
# via -r requirements/ingest-confluence.in
certifi==2023.7.22
# via
@ -33,9 +33,7 @@ requests==2.31.0
requests-oauthlib==1.3.1
# via atlassian-python-api
six==1.16.0
# via
# -c requirements/base.txt
# atlassian-python-api
# via atlassian-python-api
urllib3==1.26.16
# via
# -c requirements/base.txt

View File

@ -1,5 +1,5 @@
#
# This file is autogenerated by pip-compile with Python 3.8
# This file is autogenerated by pip-compile with Python 3.11
# by the following command:
#
# pip-compile requirements/ingest-discord.in

View File

@ -1,5 +1,5 @@
#
# This file is autogenerated by pip-compile with Python 3.8
# This file is autogenerated by pip-compile with Python 3.11
# by the following command:
#
# pip-compile requirements/ingest-dropbox.in
@ -34,7 +34,6 @@ requests==2.31.0
# dropboxdrivefs
six==1.16.0
# via
# -c requirements/base.txt
# dropbox
# stone
stone==3.3.1

View File

@ -1,5 +1,5 @@
#
# This file is autogenerated by pip-compile with Python 3.8
# This file is autogenerated by pip-compile with Python 3.11
# by the following command:
#
# pip-compile requirements/ingest-elasticsearch.in
@ -11,7 +11,7 @@ certifi==2023.7.22
# elastic-transport
elastic-transport==8.4.0
# via elasticsearch
elasticsearch==8.8.2
elasticsearch==8.9.0
# via -r requirements/ingest-elasticsearch.in
jq==1.4.1
# via -r requirements/ingest-elasticsearch.in

View File

@ -1,5 +1,5 @@
#
# This file is autogenerated by pip-compile with Python 3.8
# This file is autogenerated by pip-compile with Python 3.11
# by the following command:
#
# pip-compile requirements/ingest-gcs.in
@ -57,7 +57,7 @@ google-crc32c==1.5.0
# via google-resumable-media
google-resumable-media==2.5.0
# via google-cloud-storage
googleapis-common-protos==1.59.1
googleapis-common-protos==1.60.0
# via google-api-core
idna==3.4
# via
@ -74,7 +74,6 @@ protobuf==4.23.4
# via
# -c requirements/constraints.in
# google-api-core
# googleapis-common-protos
pyasn1==0.5.0
# via
# pyasn1-modules
@ -93,9 +92,7 @@ requests-oauthlib==1.3.1
rsa==4.9
# via google-auth
six==1.16.0
# via
# -c requirements/base.txt
# google-auth
# via google-auth
urllib3==1.26.16
# via
# -c requirements/base.txt

View File

@ -1,5 +1,5 @@
#
# This file is autogenerated by pip-compile with Python 3.8
# This file is autogenerated by pip-compile with Python 3.11
# by the following command:
#
# pip-compile requirements/ingest-github.in
@ -11,7 +11,6 @@ certifi==2023.7.22
# requests
cffi==1.15.1
# via
# -c requirements/base.txt
# cryptography
# pynacl
charset-normalizer==3.2.0
@ -19,9 +18,7 @@ charset-normalizer==3.2.0
# -c requirements/base.txt
# requests
cryptography==41.0.2
# via
# -c requirements/base.txt
# pyjwt
# via pyjwt
deprecated==1.2.14
# via pygithub
idna==3.4
@ -29,9 +26,7 @@ idna==3.4
# -c requirements/base.txt
# requests
pycparser==2.21
# via
# -c requirements/base.txt
# cffi
# via cffi
pygithub==1.58.2
# via -r requirements/ingest-github.in
pyjwt[crypto]==2.8.0

View File

@ -1,5 +1,5 @@
#
# This file is autogenerated by pip-compile with Python 3.8
# This file is autogenerated by pip-compile with Python 3.11
# by the following command:
#
# pip-compile requirements/ingest-gitlab.in

View File

@ -1,5 +1,5 @@
#
# This file is autogenerated by pip-compile with Python 3.8
# This file is autogenerated by pip-compile with Python 3.11
# by the following command:
#
# pip-compile requirements/ingest-google-drive.in
@ -26,7 +26,7 @@ google-auth==2.22.0
# google-auth-httplib2
google-auth-httplib2==0.1.0
# via google-api-python-client
googleapis-common-protos==1.59.1
googleapis-common-protos==1.60.0
# via google-api-core
httplib2==0.22.0
# via
@ -59,7 +59,6 @@ rsa==4.9
# via google-auth
six==1.16.0
# via
# -c requirements/base.txt
# google-auth
# google-auth-httplib2
uritemplate==4.1.1

View File

@ -1,5 +1,5 @@
#
# This file is autogenerated by pip-compile with Python 3.8
# This file is autogenerated by pip-compile with Python 3.11
# by the following command:
#
# pip-compile requirements/ingest-onedrive.in
@ -10,16 +10,13 @@ certifi==2023.7.22
# -c requirements/constraints.in
# requests
cffi==1.15.1
# via
# -c requirements/base.txt
# cryptography
# via cryptography
charset-normalizer==3.2.0
# via
# -c requirements/base.txt
# requests
cryptography==41.0.2
# via
# -c requirements/base.txt
# -r requirements/ingest-onedrive.in
# msal
# pyjwt
@ -34,15 +31,11 @@ msal==1.23.0
office365-rest-python-client==2.4.2
# via -r requirements/ingest-onedrive.in
pycparser==2.21
# via
# -c requirements/base.txt
# cffi
# via cffi
pyjwt[crypto]==2.8.0
# via msal
pytz==2023.3
# via
# -c requirements/base.txt
# office365-rest-python-client
# via office365-rest-python-client
requests==2.31.0
# via
# -c requirements/base.txt

View File

@ -1,5 +1,5 @@
#
# This file is autogenerated by pip-compile with Python 3.8
# This file is autogenerated by pip-compile with Python 3.11
# by the following command:
#
# pip-compile requirements/ingest-outlook.in
@ -10,16 +10,13 @@ certifi==2023.7.22
# -c requirements/constraints.in
# requests
cffi==1.15.1
# via
# -c requirements/base.txt
# cryptography
# via cryptography
charset-normalizer==3.2.0
# via
# -c requirements/base.txt
# requests
cryptography==41.0.2
# via
# -c requirements/base.txt
# -r requirements/ingest-outlook.in
# msal
# pyjwt
@ -34,15 +31,11 @@ msal==1.23.0
office365-rest-python-client==2.4.2
# via -r requirements/ingest-outlook.in
pycparser==2.21
# via
# -c requirements/base.txt
# cffi
# via cffi
pyjwt[crypto]==2.8.0
# via msal
pytz==2023.3
# via
# -c requirements/base.txt
# office365-rest-python-client
# via office365-rest-python-client
requests==2.31.0
# via
# -c requirements/base.txt

View File

@ -1,5 +1,5 @@
#
# This file is autogenerated by pip-compile with Python 3.8
# This file is autogenerated by pip-compile with Python 3.11
# by the following command:
#
# pip-compile requirements/ingest-reddit.in

View File

@ -1,5 +1,5 @@
#
# This file is autogenerated by pip-compile with Python 3.8
# This file is autogenerated by pip-compile with Python 3.11
# by the following command:
#
# pip-compile requirements/ingest-s3.in
@ -43,17 +43,11 @@ multidict==6.0.4
# aiohttp
# yarl
python-dateutil==2.8.2
# via
# -c requirements/base.txt
# botocore
# via botocore
s3fs==2023.6.0
# via -r requirements/ingest-s3.in
six==1.16.0
# via
# -c requirements/base.txt
# python-dateutil
typing-extensions==4.7.1
# via aioitertools
# via python-dateutil
urllib3==1.26.16
# via
# -c requirements/base.txt

View File

@ -1,5 +1,5 @@
#
# This file is autogenerated by pip-compile with Python 3.8
# This file is autogenerated by pip-compile with Python 3.11
# by the following command:
#
# pip-compile requirements/ingest-slack.in

View File

@ -1,5 +1,5 @@
#
# This file is autogenerated by pip-compile with Python 3.8
# This file is autogenerated by pip-compile with Python 3.11
# by the following command:
#
# pip-compile requirements/ingest-wikipedia.in

View File

@ -1,5 +1,5 @@
#
# This file is autogenerated by pip-compile with Python 3.8
# This file is autogenerated by pip-compile with Python 3.11
# by the following command:
#
# pip-compile requirements/test.in
@ -26,9 +26,7 @@ coverage[toml]==7.2.7
# via
# -r requirements/test.in
# pytest-cov
exceptiongroup==1.1.2
# via pytest
flake8==6.0.0
flake8==6.1.0
# via -r requirements/test.in
freezegun==1.2.2
# via -r requirements/test.in
@ -64,17 +62,17 @@ packaging==23.1
# via
# black
# pytest
pathspec==0.11.1
pathspec==0.11.2
# via black
platformdirs==3.9.1
platformdirs==3.10.0
# via black
pluggy==1.2.0
# via pytest
pycodestyle==2.10.0
pycodestyle==2.11.0
# via flake8
pydantic==1.10.12
# via label-studio-sdk
pyflakes==3.0.1
pyflakes==3.1.0
# via flake8
pytest==7.4.0
# via
@ -85,28 +83,17 @@ pytest-cov==4.1.0
pytest-mock==3.11.1
# via -r requirements/test.in
python-dateutil==2.8.2
# via
# -c requirements/base.txt
# freezegun
# via freezegun
pyyaml==6.0.1
# via vcrpy
requests==2.31.0
# via
# -c requirements/base.txt
# label-studio-sdk
ruff==0.0.280
ruff==0.0.281
# via -r requirements/test.in
six==1.16.0
# via
# -c requirements/base.txt
# python-dateutil
# vcrpy
tomli==2.0.1
# via
# black
# coverage
# mypy
# pytest
# via python-dateutil
types-click==7.1.8
# via -r requirements/test.in
types-markdown==3.4.2.10
@ -119,7 +106,6 @@ types-urllib3==1.26.25.14
# via types-requests
typing-extensions==4.7.1
# via
# black
# mypy
# pydantic
urllib3==1.26.16
@ -127,8 +113,7 @@ urllib3==1.26.16
# -c requirements/base.txt
# -c requirements/constraints.in
# requests
# vcrpy
vcrpy==5.0.0
vcrpy==5.1.0
# via -r requirements/test.in
wrapt==1.15.0
# via vcrpy

View File

@ -34,11 +34,48 @@ def load_requirements(file_list: Optional[Union[str, List[str]]] = None) -> List
with open(file, encoding="utf-8") as f:
requirements.extend(f.readlines())
requirements = [
req for req in requirements if not req.startswith("#") and not req.startswith("-")
req
for req in requirements
if not req.startswith("#") and not req.startswith("-")
]
return requirements
csv_reqs = load_requirements("requirements/extra-csv.in")
docx_reqs = load_requirements("requirements/extra-docx.in")
epub_reqs = load_requirements("requirements/extra-pandoc.in")
image_reqs = load_requirements("requirements/extra-pdf-image.in")
markdown_reqs = load_requirements("requirements/extra-markdown.in")
msg_reqs = load_requirements("requirements/extra-msg.in")
odt_reqs = load_requirements("requirements/extra-odt.in")
org_reqs = load_requirements("requirements/extra-pandoc.in")
pdf_reqs = load_requirements("requirements/extra-pdf-image.in")
pptx_reqs = load_requirements("requirements/extra-pptx.in")
rtf_reqs = load_requirements("requirements/extra-pandoc.in")
rst_reqs = load_requirements("requirements/extra-pandoc.in")
tsv_reqs = load_requirements("requirements/extra-csv.in")
xlsx_reqs = load_requirements("requirements/extra-xlsx.in")
all_doc_reqs = list(
set(
csv_reqs
+ docx_reqs
+ epub_reqs
+ image_reqs
+ markdown_reqs
+ msg_reqs
+ odt_reqs
+ org_reqs
+ pdf_reqs
+ pptx_reqs
+ rtf_reqs
+ rst_reqs
+ tsv_reqs
+ xlsx_reqs,
),
)
setup(
name="unstructured",
description="A library that prepares raw documents for downstream ML tasks.",
@ -71,8 +108,23 @@ setup(
},
install_requires=load_requirements(),
extras_require={
"huggingface": load_requirements("requirements/huggingface.in"),
"local-inference": load_requirements("requirements/local-inference.in"),
# Document specific extra requirements
"all-docs": all_doc_reqs,
"csv": csv_reqs,
"docx": docx_reqs,
"epub": epub_reqs,
"image": image_reqs,
"md": markdown_reqs,
"msg": msg_reqs,
"odt": odt_reqs,
"org": org_reqs,
"pdf": pdf_reqs,
"pptx": pptx_reqs,
"rtf": rtf_reqs,
"rst": rst_reqs,
"tsv": tsv_reqs,
"xlsx": xlsx_reqs,
# Extra requirements for data connectors
"s3": load_requirements("requirements/ingest-s3.in"),
"azure": load_requirements("requirements/ingest-azure.in"),
"discord": load_requirements("requirements/ingest-discord.in"),
@ -89,6 +141,9 @@ setup(
"onedrive": load_requirements("requirements/ingest-onedrive.in"),
"outlook": load_requirements("requirements/ingest-outlook.in"),
"confluence": load_requirements("requirements/ingest-confluence.in"),
# Legacy extra requirements
"huggingface": load_requirements("requirements/huggingface.in"),
"local-inference": all_doc_reqs,
},
package_dir={"unstructured": "unstructured"},
package_data={"unstructured": ["nlp/*.txt"]},

View File

@ -478,4 +478,4 @@ def test_document_to_element_list_omits_coord_system_when_coord_points_absent():
def test_get_page_image_metadata_and_coordinate_system():
doc = MockDocumentLayout()
metadata = _get_page_image_metadata(doc.pages[0])
assert type(metadata) == dict
assert isinstance(metadata, dict)

View File

@ -267,6 +267,6 @@ def test_partition_doc_from_file_without_metadata_date(
sf = SpooledTemporaryFile()
sf.write(f.read())
sf.seek(0)
elements = partition_doc(file=sf, metadata_last_modified=None)
elements = partition_doc(file=sf, metadata_date="2020-07-05")
assert elements[0].metadata.last_modified is None
assert elements[0].metadata.date == "2020-07-05"

View File

@ -216,8 +216,8 @@ def test_process_file_metadata_exclude_filename_pagenum(mocker, partition_test_r
isd_elems = test_ingest_doc.process_file()
assert len(isd_elems)
for elem in isd_elems:
assert "filename" not in elem["metadata"].keys()
assert "page_number" not in elem["metadata"].keys()
assert "filename" not in elem["metadata"]
assert "page_number" not in elem["metadata"]
def test_process_file_flatten_metadata(mocker, partition_test_results):

View File

@ -1 +1 @@
__version__ = "0.8.8" # pragma: no cover
__version__ = "0.9.0" # pragma: no cover

View File

@ -1,11 +1,14 @@
import tempfile
from typing import IO, Optional
import pypandoc
from unstructured.partition.common import exactly_one
from unstructured.utils import dependency_exists, requires_dependencies
if dependency_exists("pypandoc"):
import pypandoc
@requires_dependencies(["pypandoc"])
def convert_file_to_text(filename: str, source_format: str, target_format: str) -> str:
"""Uses pandoc to convert the source document to a raw text string."""
try:
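The dependency_exists and requires_dependencies helpers imported above come from unstructured.utils and are not shown in this diff. The minimal sketch below captures only the behavior the call sites rely on; the exact implementation, error message, and any extras hint are assumptions.

import importlib
from functools import wraps
from typing import Callable, List

def dependency_exists(dependency: str) -> bool:
    # True when the optional module can be imported in this environment.
    try:
        importlib.import_module(dependency)
        return True
    except ImportError:
        return False

def requires_dependencies(dependencies: List[str]) -> Callable:
    # Decorator that defers the ImportError from import time to call time,
    # so the base install can import the module without the optional deps.
    def decorator(func: Callable) -> Callable:
        @wraps(func)
        def wrapper(*args, **kwargs):
            missing = [dep for dep in dependencies if not dependency_exists(dep)]
            if missing:
                raise ImportError(
                    f"{func.__name__} requires optional dependencies {missing}; "
                    'install the matching extra, e.g. pip install "unstructured[all-docs]".'
                )
            return func(*args, **kwargs)
        return wrapper
    return decorator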

View File

@ -76,7 +76,7 @@ class SimpleGoogleDriveConfig(BaseConnectorConfig):
recursive: bool = False
def __post_init__(self):
if self.extension and self.extension not in EXT_TO_FILETYPE.keys():
if self.extension and self.extension not in EXT_TO_FILETYPE:
raise ValueError(
f"Extension not supported. "
f"Value MUST be one of {', '.join([k for k in EXT_TO_FILETYPE if k is not None])}.",

View File

@ -65,7 +65,7 @@ class OneDriveIngestDoc(IngestDocCleanupMixin, BaseIngestDoc):
if not self.ext:
raise ValueError("Unsupported file without extension.")
if self.ext not in EXT_TO_FILETYPE.keys():
if self.ext not in EXT_TO_FILETYPE:
raise ValueError(
f"Extension not supported. "
f"Value MUST be one of {', '.join([k for k in EXT_TO_FILETYPE if k is not None])}.",

View File

@ -13,27 +13,58 @@ from unstructured.file_utils.filetype import (
)
from unstructured.logger import logger
from unstructured.partition.common import exactly_one
from unstructured.partition.csv import partition_csv
from unstructured.partition.doc import partition_doc
from unstructured.partition.docx import partition_docx
from unstructured.partition.email import partition_email
from unstructured.partition.epub import partition_epub
from unstructured.partition.html import partition_html
from unstructured.partition.image import partition_image
from unstructured.partition.json import partition_json
from unstructured.partition.md import partition_md
from unstructured.partition.msg import partition_msg
from unstructured.partition.odt import partition_odt
from unstructured.partition.org import partition_org
from unstructured.partition.pdf import partition_pdf
from unstructured.partition.ppt import partition_ppt
from unstructured.partition.pptx import partition_pptx
from unstructured.partition.rst import partition_rst
from unstructured.partition.rtf import partition_rtf
from unstructured.partition.text import partition_text
from unstructured.partition.tsv import partition_tsv
from unstructured.partition.xlsx import partition_xlsx
from unstructured.partition.xml import partition_xml
from unstructured.utils import dependency_exists
if dependency_exists("pandas"):
from unstructured.partition.csv import partition_csv
from unstructured.partition.tsv import partition_tsv
if dependency_exists("docx"):
from unstructured.partition.doc import partition_doc
from unstructured.partition.docx import partition_docx
if dependency_exists("docx") and dependency_exists("pypandoc"):
from unstructured.partition.odt import partition_odt
if dependency_exists("pypandoc"):
from unstructured.partition.epub import partition_epub
from unstructured.partition.org import partition_org
from unstructured.partition.rst import partition_rst
from unstructured.partition.rtf import partition_rtf
if dependency_exists("markdown"):
from unstructured.partition.md import partition_md
if dependency_exists("msg_parser"):
from unstructured.partition.msg import partition_msg
pdf_imports = ["pdf2image", "pdfminer", "PIL"]
if all(dependency_exists(dep) for dep in pdf_imports):
from unstructured.partition.pdf import partition_pdf
if dependency_exists("unstructured_inference"):
from unstructured.partition.image import partition_image
if dependency_exists("pptx"):
from unstructured.partition.ppt import partition_ppt
from unstructured.partition.pptx import partition_pptx
if dependency_exists("pandas") and dependency_exists("openpyxl"):
from unstructured.partition.xlsx import partition_xlsx
def partition(

View File

@ -7,7 +7,6 @@ from io import BufferedReader, BytesIO, TextIOWrapper
from tempfile import SpooledTemporaryFile
from typing import IO, TYPE_CHECKING, Any, BinaryIO, Dict, List, Optional, Tuple, Union
from docx import table as docxtable
from tabulate import tabulate
from unstructured.documents.coordinates import CoordinateSystem
@ -23,6 +22,10 @@ from unstructured.documents.elements import (
)
from unstructured.logger import logger
from unstructured.nlp.patterns import ENUMERATED_BULLETS_RE, UNICODE_BULLETS_RE
from unstructured.utils import dependency_exists
if dependency_exists("docx"):
import docx.table as docxtable
if TYPE_CHECKING:
from unstructured_inference.inference.layoutelement import (
@ -303,12 +306,12 @@ def convert_to_bytes(
return f_bytes
def convert_ms_office_table_to_text(table: docxtable.Table, as_html: bool = True):
def convert_ms_office_table_to_text(table: "docxtable.Table", as_html: bool = True) -> str:
"""
Convert a table object from a Word document to an HTML table string using the tabulate library.
Args:
table (Table): A Table object.
table (Table): A docx.table.Table object.
as_html (bool): Whether to return the table as an HTML string (True) or a
plain text string (False)
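For reference, a short usage sketch of the function documented above. The import path is assumed (this hunk appears to be unstructured/partition/common.py) and the table contents are made up.

from docx import Document

# Import path assumed for this illustration.
from unstructured.partition.common import convert_ms_office_table_to_text

document = Document()
table = document.add_table(rows=2, cols=2)
table.cell(0, 0).text = "Name"
table.cell(0, 1).text = "Type"
table.cell(1, 0).text = "report.docx"
table.cell(1, 1).text = "DOCX"

# Per the docstring: as_html=True returns an HTML table string,
# as_html=False returns a plain text rendering of the same table.
print(convert_ms_office_table_to_text(table, as_html=True))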

View File

@ -4,7 +4,6 @@ from tempfile import SpooledTemporaryFile
from typing import IO, BinaryIO, List, Optional, Tuple, Union, cast
import docx
import pypandoc
from docx.oxml.shared import qn
from docx.text.paragraph import Paragraph
from docx.text.run import Run
@ -38,6 +37,10 @@ from unstructured.partition.text_type import (
is_possible_title,
is_us_city_state_zip,
)
from unstructured.utils import dependency_exists
if dependency_exists("pypandoc"):
import pypandoc
# NOTE(robinson) - documentation on built in styles can be found at the link below
# ref: https://python-docx.readthedocs.io/en/latest/user/
@ -314,7 +317,7 @@ def convert_and_partition_docx(
metadata_filename: Optional[str] = None,
metadata_last_modified: Optional[str] = None,
) -> List[Element]:
"""Converts a document to DOCX and then partitions it using partition_html. Works with
"""Converts a document to DOCX and then partitions it using partition_docx. Works with
any file format supported by pandoc.
Parameters

View File

@ -3,8 +3,6 @@ import io
import json
from typing import Any, Dict, List, Optional
import pandas as pd
from unstructured.documents.elements import (
TYPE_TO_TEXT_ELEMENT_MAP,
CheckBox,
@ -13,6 +11,10 @@ from unstructured.documents.elements import (
NoID,
)
from unstructured.partition.common import exactly_one
from unstructured.utils import dependency_exists, requires_dependencies
if dependency_exists("pandas"):
import pandas as pd
def _get_metadata_table_fieldnames():
@ -161,7 +163,7 @@ def convert_to_isd_csv(elements: List[Element]) -> str:
if row.get("sent_from"):
row["sender"] = row.get("sent_from")
if type(row["sender"]) == list:
if isinstance(row["sender"], list):
row["sender"] = row["sender"][0]
with io.StringIO() as buffer:
@ -176,11 +178,14 @@ def convert_to_csv(elements: List[Element]) -> str:
return convert_to_isd_csv(elements)
def convert_to_dataframe(elements: List[Element], drop_empty_cols: bool = True) -> pd.DataFrame:
@requires_dependencies(["pandas"])
def convert_to_dataframe(elements: List[Element], drop_empty_cols: bool = True) -> "pd.DataFrame":
"""Converts document elements to a pandas DataFrame. The dataframe contains the
following columns:
text: the element text
type: the text type (NarrativeText, Title, etc)
Output is pd.DataFrame
"""
csv_string = convert_to_isd_csv(elements)
csv_string_io = io.StringIO(csv_string)
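As a usage note (not part of the diff), the decorated convert_to_dataframe keeps its call signature, so downstream code along the lines of the sketch below should keep working once pandas is installed via an extra such as csv or xlsx. The module path and the input file name are assumptions.

from unstructured.partition.text import partition_text
from unstructured.staging.base import convert_to_dataframe  # module path assumed

elements = partition_text(filename="example.txt")  # hypothetical input file
df = convert_to_dataframe(elements)
print(df[["type", "text"]].head())  # columns documented in the docstring above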