mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-06-27 02:30:08 +00:00
build(deps): split up dependencies by document type (#986)
* split dependencies by document type * make pip-compile with new requirements * add extra requirements to setup.py * add in all docs; re pip-compile * extra for all docs * add pandas to xlsx * dependency requires for tsv and csv * handling for doc, docx and odt * dependency check for pypandoc * required dependencies for pandoc files * xml and html * markdown * msg * add in pdf * add in pptx * add in excel * add lxml as base req * extra all docs for local inference * local inference installs all * pin pillow version * fixes for plain text tests * fixes for doc * update make commands * changelog and version * add xlrd * update pip-compile * pin numpy for python 3.8 support * more constraints * constraint on scipy * update install docs * constrain ipython * add outlook to pip-compile * more ipython constraints * add extras to dockerfile * pin office365 client * few doc tweaks * types as strings * last pip-compile * re pip-compile * make tidy * make tidy
This commit is contained in:
parent
13d3559fa4
commit
331c7faf38
@ -1,3 +1,9 @@
|
||||
## 0.9.0
|
||||
|
||||
### Enhancements
|
||||
|
||||
* Dependencies are now split by document type, creating a slimmer base installation.
|
||||
|
||||
## 0.8.8
|
||||
|
||||
### Enhancements
|
||||
@ -6,6 +12,7 @@
|
||||
|
||||
### Fixes
|
||||
|
||||
|
||||
* Rename "date" field to "last_modified"
|
||||
* Adds Box connector
|
||||
|
||||
|
10
Dockerfile
10
Dockerfile
@ -30,7 +30,15 @@ RUN python3.8 -m pip install pip==${PIP_VERSION} && \
|
||||
pip install --no-cache -r requirements/ingest-s3.txt && \
|
||||
pip install --no-cache -r requirements/ingest-slack.txt && \
|
||||
pip install --no-cache -r requirements/ingest-wikipedia.txt && \
|
||||
pip install --no-cache -r requirements/local-inference.txt && \
|
||||
pip install --no-cache -r requirements/extra-csv.txt && \
|
||||
pip install --no-cache -r requirements/extra-docx.txt && \
|
||||
pip install --no-cache -r requirements/extra-markdown.txt && \
|
||||
pip install --no-cache -r requirements/extra-msg.txt && \
|
||||
pip install --no-cache -r requirements/extra-odt.txt && \
|
||||
pip install --no-cache -r requirements/extra-pandoc.txt && \
|
||||
pip install --no-cache -r requirements/extra-pdf-image.txt && \
|
||||
pip install --no-cache -r requirements/extra-pptx.txt && \
|
||||
pip install --no-cache -r requirements/extra-xlsx.txt && \
|
||||
dnf -y groupremove "Development Tools" && \
|
||||
dnf clean all
|
||||
|
||||
|
59
Makefile
59
Makefile
@ -18,10 +18,10 @@ install-base: install-base-pip-packages install-nltk-models
|
||||
|
||||
## install: installs all test, dev, and experimental requirements
|
||||
.PHONY: install
|
||||
install: install-base-pip-packages install-dev install-nltk-models install-test install-huggingface install-unstructured-inference
|
||||
install: install-base-pip-packages install-dev install-nltk-models install-test install-huggingface install-all-docs
|
||||
|
||||
.PHONY: install-ci
|
||||
install-ci: install-base-pip-packages install-nltk-models install-huggingface install-unstructured-inference install-test
|
||||
install-ci: install-base-pip-packages install-nltk-models install-huggingface install-all-docs install-test
|
||||
|
||||
.PHONY: install-base-pip-packages
|
||||
install-base-pip-packages:
|
||||
@ -53,6 +53,45 @@ install-dev:
|
||||
install-build:
|
||||
python3 -m pip install -r requirements/build.txt
|
||||
|
||||
.PHONY: install-csv
|
||||
install-csv:
|
||||
python3 -m pip install -r requirements/extra-csv.txt
|
||||
|
||||
.PHONY: install-docx
|
||||
install-docx:
|
||||
python3 -m pip install -r requirements/extra-docx.txt
|
||||
|
||||
.PHONY: install-odt
|
||||
install-odt:
|
||||
python3 -m pip install -r requirements/extra-odt.txt
|
||||
|
||||
.PHONY: install-pypandoc
|
||||
install-pypandoc:
|
||||
python3 -m pip install -r requirements/extra-pandoc.txt
|
||||
|
||||
.PHONY: install-markdown
|
||||
install-markdown:
|
||||
python3 -m pip install -r requirements/extra-markdown.txt
|
||||
|
||||
.PHONY: install-msg
|
||||
install-msg:
|
||||
python3 -m pip install -r requirements/extra-msg.txt
|
||||
|
||||
.PHONY: install-pdf-image
|
||||
install-pdf-image:
|
||||
python3 -m pip install -r requirements/extra-pdf-image.txt
|
||||
|
||||
.PHONY: install-pptx
|
||||
install-pptx:
|
||||
python3 -m pip install -r requirements/extra-pptx.txt
|
||||
|
||||
.PHONY: install-xlsx
|
||||
install-xlsx:
|
||||
python3 -m pip install -r requirements/extra-xlsx.txt
|
||||
|
||||
.PHONY: install-all-docs
|
||||
install-all-docs: install-base install-csv install-docx install-docx install-odt install-pypandoc install-markdown install-msg install-pdf-image install-pptx install-xlsx
|
||||
|
||||
.PHONY: install-ingest-google-drive
|
||||
install-ingest-google-drive:
|
||||
python3 -m pip install -r requirements/ingest-google-drive.txt
|
||||
@ -124,7 +163,7 @@ install-unstructured-inference:
|
||||
|
||||
## install-local-inference: installs requirements for local inference
|
||||
.PHONY: install-local-inference
|
||||
install-local-inference: install install-unstructured-inference
|
||||
install-local-inference: install install-all-docs
|
||||
|
||||
.PHONY: install-pandoc
|
||||
install-pandoc:
|
||||
@ -135,12 +174,23 @@ install-pandoc:
|
||||
.PHONY: pip-compile
|
||||
pip-compile:
|
||||
pip-compile --upgrade requirements/base.in
|
||||
|
||||
# Extra requirements that are specific to document types
|
||||
pip-compile --upgrade requirements/extra-csv.in
|
||||
pip-compile --upgrade requirements/extra-docx.in
|
||||
pip-compile --upgrade requirements/extra-pandoc.in
|
||||
pip-compile --upgrade requirements/extra-markdown.in
|
||||
pip-compile --upgrade requirements/extra-msg.in
|
||||
pip-compile --upgrade requirements/extra-odt.in
|
||||
pip-compile --upgrade requirements/extra-pdf-image.in
|
||||
pip-compile --upgrade requirements/extra-pptx.in
|
||||
pip-compile --upgrade requirements/extra-xlsx.in
|
||||
|
||||
# Extra requirements for huggingface staging functions
|
||||
pip-compile --upgrade requirements/huggingface.in
|
||||
pip-compile --upgrade requirements/test.in
|
||||
pip-compile --upgrade requirements/dev.in
|
||||
pip-compile --upgrade requirements/build.in
|
||||
pip-compile --upgrade requirements/local-inference.in
|
||||
# NOTE(robinson) - docs/requirements.txt is where the GitHub action for building
|
||||
# sphinx docs looks for additional requirements
|
||||
cp requirements/build.txt docs/requirements.txt
|
||||
@ -158,6 +208,7 @@ pip-compile:
|
||||
pip-compile --upgrade requirements/ingest-google-drive.in
|
||||
pip-compile --upgrade requirements/ingest-elasticsearch.in
|
||||
pip-compile --upgrade requirements/ingest-onedrive.in
|
||||
pip-compile --upgrade requirements/ingest-outlook.in
|
||||
pip-compile --upgrade requirements/ingest-confluence.in
|
||||
|
||||
## install-project-local: install unstructured into your local python environment
|
||||
|
@ -1,5 +1,5 @@
|
||||
#
|
||||
# This file is autogenerated by pip-compile with Python 3.8
|
||||
# This file is autogenerated by pip-compile with Python 3.11
|
||||
# by the following command:
|
||||
#
|
||||
# pip-compile requirements/build.in
|
||||
@ -12,10 +12,14 @@ beautifulsoup4==4.12.2
|
||||
# via furo
|
||||
certifi==2023.7.22
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# -c requirements/constraints.in
|
||||
# -r requirements/build.in
|
||||
# requests
|
||||
charset-normalizer==3.2.0
|
||||
# via requests
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# requests
|
||||
docutils==0.18.1
|
||||
# via
|
||||
# sphinx
|
||||
@ -23,11 +27,11 @@ docutils==0.18.1
|
||||
furo==2023.7.26
|
||||
# via -r requirements/build.in
|
||||
idna==3.4
|
||||
# via requests
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# requests
|
||||
imagesize==1.4.1
|
||||
# via sphinx
|
||||
importlib-metadata==6.8.0
|
||||
# via sphinx
|
||||
jinja2==3.1.2
|
||||
# via sphinx
|
||||
markupsafe==2.1.3
|
||||
@ -38,10 +42,10 @@ pygments==2.15.1
|
||||
# via
|
||||
# furo
|
||||
# sphinx
|
||||
pytz==2023.3
|
||||
# via babel
|
||||
requests==2.31.0
|
||||
# via sphinx
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# sphinx
|
||||
snowballstemmer==2.2.0
|
||||
# via sphinx
|
||||
soupsieve==2.4.1
|
||||
@ -71,7 +75,8 @@ sphinxcontrib-qthelp==1.0.3
|
||||
# via sphinx
|
||||
sphinxcontrib-serializinghtml==1.1.5
|
||||
# via sphinx
|
||||
urllib3==2.0.4
|
||||
# via requests
|
||||
zipp==3.16.2
|
||||
# via importlib-metadata
|
||||
urllib3==1.26.16
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# -c requirements/constraints.in
|
||||
# requests
|
||||
|
@ -7,8 +7,15 @@ Quick Start
|
||||
Use the following instructions to get up and running with ``unstructured`` and test your
|
||||
installation.
|
||||
|
||||
* Install the Python SDK with ``pip install "unstructured[local-inference]"``
|
||||
* If you do not need to process PDFs or images, you can run ``pip install unstructured``
|
||||
* Install the Python SDK with ``pip install unstructured``
|
||||
* Plain text files, HTML, XML, JSON, and emails do not require any extra dependencies.
|
||||
* If you need to process other document types, you can install the extras required for those documents
|
||||
with ``pip install "unstructured[docx,pptx]"``.
|
||||
* To install the extras for every document type, use ``pip install "unstructured[all-docs]"``.
|
||||
* For ``unstructured<0.9.0``, you can install the extras for all document types with
|
||||
``pip install "unstructured[local-inference]"``. The ``local-inference`` extra is still
|
||||
supported in newer versions for backward compatibility, but may be deprecated in a future version.
|
||||
The ``all-docs`` extra is the officially supported installation pattern.
|
||||
|
||||
* Install the following system dependencies if they are not already available on your system. Depending on what document types you're parsing, you may not need all of these.
|
||||
* ``libmagic-dev`` (filetype detection)
|
||||
|
@ -1,19 +1,8 @@
|
||||
-c "constraints.in"
|
||||
chardet
|
||||
filetype
|
||||
lxml
|
||||
msg_parser
|
||||
nltk
|
||||
openpyxl
|
||||
pandas
|
||||
pdf2image
|
||||
pdfminer.six
|
||||
pillow
|
||||
pypandoc
|
||||
python-docx
|
||||
python-pptx
|
||||
python-magic
|
||||
markdown
|
||||
requests
|
||||
lxml
|
||||
nltk
|
||||
tabulate
|
||||
xlrd
|
||||
requests
|
||||
|
@ -1,5 +1,5 @@
|
||||
#
|
||||
# This file is autogenerated by pip-compile with Python 3.8
|
||||
# This file is autogenerated by pip-compile with Python 3.11
|
||||
# by the following command:
|
||||
#
|
||||
# pip-compile requirements/base.in
|
||||
@ -8,89 +8,33 @@ certifi==2023.7.22
|
||||
# via
|
||||
# -c requirements/constraints.in
|
||||
# requests
|
||||
cffi==1.15.1
|
||||
# via cryptography
|
||||
chardet==5.1.0
|
||||
# via -r requirements/base.in
|
||||
charset-normalizer==3.2.0
|
||||
# via
|
||||
# pdfminer-six
|
||||
# requests
|
||||
# via requests
|
||||
click==8.1.6
|
||||
# via nltk
|
||||
cryptography==41.0.2
|
||||
# via pdfminer-six
|
||||
et-xmlfile==1.1.0
|
||||
# via openpyxl
|
||||
filetype==1.2.0
|
||||
# via -r requirements/base.in
|
||||
idna==3.4
|
||||
# via requests
|
||||
importlib-metadata==6.8.0
|
||||
# via markdown
|
||||
joblib==1.3.1
|
||||
# via nltk
|
||||
lxml==4.9.3
|
||||
# via
|
||||
# -r requirements/base.in
|
||||
# python-docx
|
||||
# python-pptx
|
||||
markdown==3.4.4
|
||||
# via -r requirements/base.in
|
||||
msg-parser==1.2.0
|
||||
# via -r requirements/base.in
|
||||
nltk==3.8.1
|
||||
# via -r requirements/base.in
|
||||
numpy==1.24.4
|
||||
# via pandas
|
||||
olefile==0.46
|
||||
# via msg-parser
|
||||
openpyxl==3.1.2
|
||||
# via -r requirements/base.in
|
||||
pandas==2.0.3
|
||||
# via -r requirements/base.in
|
||||
pdf2image==1.16.3
|
||||
# via -r requirements/base.in
|
||||
pdfminer-six==20221105
|
||||
# via -r requirements/base.in
|
||||
pillow==10.0.0
|
||||
# via
|
||||
# -r requirements/base.in
|
||||
# pdf2image
|
||||
# python-pptx
|
||||
pycparser==2.21
|
||||
# via cffi
|
||||
pypandoc==1.11
|
||||
# via -r requirements/base.in
|
||||
python-dateutil==2.8.2
|
||||
# via pandas
|
||||
python-docx==0.8.11
|
||||
# via -r requirements/base.in
|
||||
python-magic==0.4.27
|
||||
# via -r requirements/base.in
|
||||
python-pptx==0.6.21
|
||||
# via -r requirements/base.in
|
||||
pytz==2023.3
|
||||
# via pandas
|
||||
regex==2023.6.3
|
||||
# via nltk
|
||||
requests==2.31.0
|
||||
# via -r requirements/base.in
|
||||
six==1.16.0
|
||||
# via python-dateutil
|
||||
tabulate==0.9.0
|
||||
# via -r requirements/base.in
|
||||
tqdm==4.65.0
|
||||
# via nltk
|
||||
tzdata==2023.3
|
||||
# via pandas
|
||||
urllib3==1.26.16
|
||||
# via
|
||||
# -c requirements/constraints.in
|
||||
# requests
|
||||
xlrd==2.0.1
|
||||
# via -r requirements/base.in
|
||||
xlsxwriter==3.1.2
|
||||
# via python-pptx
|
||||
zipp==3.16.2
|
||||
# via importlib-metadata
|
||||
|
@ -1,3 +1,6 @@
|
||||
-c base.txt
|
||||
-c constraints.in
|
||||
|
||||
sphinx
|
||||
# NOTE(alan) - Pinning to resolve a conflict with sphinx. We can unpin on next sphinx_rtd_theme release.
|
||||
sphinx_rtd_theme==1.2.2
|
||||
|
@ -1,5 +1,5 @@
|
||||
#
|
||||
# This file is autogenerated by pip-compile with Python 3.8
|
||||
# This file is autogenerated by pip-compile with Python 3.11
|
||||
# by the following command:
|
||||
#
|
||||
# pip-compile requirements/build.in
|
||||
@ -12,10 +12,14 @@ beautifulsoup4==4.12.2
|
||||
# via furo
|
||||
certifi==2023.7.22
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# -c requirements/constraints.in
|
||||
# -r requirements/build.in
|
||||
# requests
|
||||
charset-normalizer==3.2.0
|
||||
# via requests
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# requests
|
||||
docutils==0.18.1
|
||||
# via
|
||||
# sphinx
|
||||
@ -23,11 +27,11 @@ docutils==0.18.1
|
||||
furo==2023.7.26
|
||||
# via -r requirements/build.in
|
||||
idna==3.4
|
||||
# via requests
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# requests
|
||||
imagesize==1.4.1
|
||||
# via sphinx
|
||||
importlib-metadata==6.8.0
|
||||
# via sphinx
|
||||
jinja2==3.1.2
|
||||
# via sphinx
|
||||
markupsafe==2.1.3
|
||||
@ -38,10 +42,10 @@ pygments==2.15.1
|
||||
# via
|
||||
# furo
|
||||
# sphinx
|
||||
pytz==2023.3
|
||||
# via babel
|
||||
requests==2.31.0
|
||||
# via sphinx
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# sphinx
|
||||
snowballstemmer==2.2.0
|
||||
# via sphinx
|
||||
soupsieve==2.4.1
|
||||
@ -71,7 +75,8 @@ sphinxcontrib-qthelp==1.0.3
|
||||
# via sphinx
|
||||
sphinxcontrib-serializinghtml==1.1.5
|
||||
# via sphinx
|
||||
urllib3==2.0.4
|
||||
# via requests
|
||||
zipp==3.16.2
|
||||
# via importlib-metadata
|
||||
urllib3==1.26.16
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# -c requirements/constraints.in
|
||||
# requests
|
||||
|
@ -12,6 +12,13 @@ jupyter-core>=4.11.2
|
||||
wheel>=0.38.1
|
||||
# NOTE(robinson) - The following pins are to address
|
||||
# vulnerabilities in dependency scans
|
||||
certifi>=2022.12.07
|
||||
certifi>=2023.7.22
|
||||
# From pycocotools in local-inference
|
||||
pyparsing<3.1.0
|
||||
# NOTE(robinson) - Numpy dropped Python 3.8 support in 1.25.0
|
||||
numpy<1.25.0
|
||||
scipy<1.11.0
|
||||
IPython<8.13
|
||||
# NOTE(robinson) - See this issue here
|
||||
# https://github.com/facebookresearch/detectron2/issues/5010
|
||||
Pillow<10.0.0
|
||||
|
@ -1,5 +1,5 @@
|
||||
#
|
||||
# This file is autogenerated by pip-compile with Python 3.8
|
||||
# This file is autogenerated by pip-compile with Python 3.11
|
||||
# by the following command:
|
||||
#
|
||||
# pip-compile requirements/dev.in
|
||||
@ -41,9 +41,7 @@ certifi==2023.7.22
|
||||
# -c requirements/test.txt
|
||||
# requests
|
||||
cffi==1.15.1
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# argon2-cffi-bindings
|
||||
# via argon2-cffi-bindings
|
||||
cfgv==3.3.1
|
||||
# via pre-commit
|
||||
charset-normalizer==3.2.0
|
||||
@ -57,7 +55,9 @@ click==8.1.6
|
||||
# -c requirements/test.txt
|
||||
# pip-tools
|
||||
comm==0.1.3
|
||||
# via ipykernel
|
||||
# via
|
||||
# ipykernel
|
||||
# ipywidgets
|
||||
debugpy==1.6.7
|
||||
# via ipykernel
|
||||
decorator==5.1.1
|
||||
@ -66,10 +66,6 @@ defusedxml==0.7.1
|
||||
# via nbconvert
|
||||
distlib==0.3.7
|
||||
# via virtualenv
|
||||
exceptiongroup==1.1.2
|
||||
# via
|
||||
# -c requirements/test.txt
|
||||
# anyio
|
||||
executing==1.2.0
|
||||
# via stack-data
|
||||
fastjsonschema==2.18.0
|
||||
@ -87,40 +83,26 @@ idna==3.4
|
||||
# anyio
|
||||
# jsonschema
|
||||
# requests
|
||||
importlib-metadata==6.8.0
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# jupyter-client
|
||||
# jupyter-lsp
|
||||
# jupyterlab
|
||||
# jupyterlab-server
|
||||
# nbconvert
|
||||
importlib-resources==6.0.0
|
||||
# via
|
||||
# jsonschema
|
||||
# jsonschema-specifications
|
||||
# jupyterlab
|
||||
# notebook
|
||||
ipykernel==6.25.0
|
||||
# via
|
||||
# ipywidgets
|
||||
# jupyter
|
||||
# jupyter-console
|
||||
# jupyterlab
|
||||
# qtconsole
|
||||
ipython==8.12.2
|
||||
# via
|
||||
# -c requirements/constraints.in
|
||||
# -r requirements/dev.in
|
||||
# ipykernel
|
||||
# ipywidgets
|
||||
# jupyter-console
|
||||
ipython-genutils==0.2.0
|
||||
# via qtconsole
|
||||
ipywidgets==8.0.7
|
||||
ipywidgets==8.1.0
|
||||
# via jupyter
|
||||
isoduration==20.11.0
|
||||
# via jsonschema
|
||||
jedi==0.18.2
|
||||
jedi==0.19.0
|
||||
# via ipython
|
||||
jinja2==3.1.2
|
||||
# via
|
||||
@ -162,7 +144,7 @@ jupyter-core==5.3.1
|
||||
# nbconvert
|
||||
# nbformat
|
||||
# qtconsole
|
||||
jupyter-events==0.6.3
|
||||
jupyter-events==0.7.0
|
||||
# via jupyter-server
|
||||
jupyter-lsp==2.2.0
|
||||
# via jupyterlab
|
||||
@ -201,16 +183,16 @@ nbconvert==7.7.3
|
||||
# via
|
||||
# jupyter
|
||||
# jupyter-server
|
||||
nbformat==5.9.1
|
||||
nbformat==5.9.2
|
||||
# via
|
||||
# jupyter-server
|
||||
# nbclient
|
||||
# nbconvert
|
||||
nest-asyncio==1.5.6
|
||||
nest-asyncio==1.5.7
|
||||
# via ipykernel
|
||||
nodeenv==1.8.0
|
||||
# via pre-commit
|
||||
notebook==7.0.0
|
||||
notebook==7.0.1
|
||||
# via jupyter
|
||||
notebook-shim==0.2.3
|
||||
# via
|
||||
@ -239,9 +221,7 @@ pickleshare==0.7.5
|
||||
# via ipython
|
||||
pip-tools==7.1.0
|
||||
# via -r requirements/dev.in
|
||||
pkgutil-resolve-name==1.3.10
|
||||
# via jsonschema
|
||||
platformdirs==3.9.1
|
||||
platformdirs==3.10.0
|
||||
# via
|
||||
# -c requirements/test.txt
|
||||
# jupyter-core
|
||||
@ -263,9 +243,7 @@ ptyprocess==0.7.0
|
||||
pure-eval==0.2.2
|
||||
# via stack-data
|
||||
pycparser==2.21
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# cffi
|
||||
# via cffi
|
||||
pygments==2.15.1
|
||||
# via
|
||||
# ipython
|
||||
@ -276,16 +254,11 @@ pyproject-hooks==1.0.0
|
||||
# via build
|
||||
python-dateutil==2.8.2
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# -c requirements/test.txt
|
||||
# arrow
|
||||
# jupyter-client
|
||||
python-json-logger==2.0.7
|
||||
# via jupyter-events
|
||||
pytz==2023.3
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# babel
|
||||
pyyaml==6.0.1
|
||||
# via
|
||||
# -c requirements/test.txt
|
||||
@ -306,6 +279,7 @@ referencing==0.30.0
|
||||
# via
|
||||
# jsonschema
|
||||
# jsonschema-specifications
|
||||
# jupyter-events
|
||||
requests==2.31.0
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
@ -327,7 +301,6 @@ send2trash==1.8.2
|
||||
# via jupyter-server
|
||||
six==1.16.0
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# -c requirements/test.txt
|
||||
# asttokens
|
||||
# bleach
|
||||
@ -345,13 +318,6 @@ terminado==0.17.1
|
||||
# jupyter-server-terminals
|
||||
tinycss2==1.2.1
|
||||
# via nbconvert
|
||||
tomli==2.0.1
|
||||
# via
|
||||
# -c requirements/test.txt
|
||||
# build
|
||||
# jupyterlab
|
||||
# pip-tools
|
||||
# pyproject-hooks
|
||||
tornado==6.3.2
|
||||
# via
|
||||
# ipykernel
|
||||
@ -377,11 +343,6 @@ traitlets==5.9.0
|
||||
# nbconvert
|
||||
# nbformat
|
||||
# qtconsole
|
||||
typing-extensions==4.7.1
|
||||
# via
|
||||
# -c requirements/test.txt
|
||||
# async-lru
|
||||
# ipython
|
||||
uri-template==1.3.0
|
||||
# via jsonschema
|
||||
urllib3==1.26.16
|
||||
@ -408,11 +369,6 @@ wheel==0.41.0
|
||||
# pip-tools
|
||||
widgetsnbextension==4.0.8
|
||||
# via ipywidgets
|
||||
zipp==3.16.2
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# importlib-metadata
|
||||
# importlib-resources
|
||||
|
||||
# The following packages are considered to be unsafe in a requirements file:
|
||||
# pip
|
||||
|
@ -1,3 +1,4 @@
|
||||
-c constraints.in
|
||||
-c base.txt
|
||||
unstructured-inference==0.5.7
|
||||
|
||||
pandas
|
20
requirements/extra-csv.txt
Normal file
20
requirements/extra-csv.txt
Normal file
@ -0,0 +1,20 @@
|
||||
#
|
||||
# This file is autogenerated by pip-compile with Python 3.11
|
||||
# by the following command:
|
||||
#
|
||||
# pip-compile requirements/extra-csv.in
|
||||
#
|
||||
numpy==1.24.4
|
||||
# via
|
||||
# -c requirements/constraints.in
|
||||
# pandas
|
||||
pandas==2.0.3
|
||||
# via -r requirements/extra-csv.in
|
||||
python-dateutil==2.8.2
|
||||
# via pandas
|
||||
pytz==2023.3
|
||||
# via pandas
|
||||
six==1.16.0
|
||||
# via python-dateutil
|
||||
tzdata==2023.3
|
||||
# via pandas
|
4
requirements/extra-docx.in
Normal file
4
requirements/extra-docx.in
Normal file
@ -0,0 +1,4 @@
|
||||
-c constraints.in
|
||||
-c base.txt
|
||||
|
||||
python-docx
|
12
requirements/extra-docx.txt
Normal file
12
requirements/extra-docx.txt
Normal file
@ -0,0 +1,12 @@
|
||||
#
|
||||
# This file is autogenerated by pip-compile with Python 3.11
|
||||
# by the following command:
|
||||
#
|
||||
# pip-compile requirements/extra-docx.in
|
||||
#
|
||||
lxml==4.9.3
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# python-docx
|
||||
python-docx==0.8.11
|
||||
# via -r requirements/extra-docx.in
|
4
requirements/extra-markdown.in
Normal file
4
requirements/extra-markdown.in
Normal file
@ -0,0 +1,4 @@
|
||||
-c "constraints.in"
|
||||
-c "base.txt"
|
||||
|
||||
markdown
|
8
requirements/extra-markdown.txt
Normal file
8
requirements/extra-markdown.txt
Normal file
@ -0,0 +1,8 @@
|
||||
#
|
||||
# This file is autogenerated by pip-compile with Python 3.11
|
||||
# by the following command:
|
||||
#
|
||||
# pip-compile requirements/extra-markdown.in
|
||||
#
|
||||
markdown==3.4.4
|
||||
# via -r requirements/extra-markdown.in
|
4
requirements/extra-msg.in
Normal file
4
requirements/extra-msg.in
Normal file
@ -0,0 +1,4 @@
|
||||
-c constraints.in
|
||||
-c base.txt
|
||||
|
||||
msg_parser
|
10
requirements/extra-msg.txt
Normal file
10
requirements/extra-msg.txt
Normal file
@ -0,0 +1,10 @@
|
||||
#
|
||||
# This file is autogenerated by pip-compile with Python 3.11
|
||||
# by the following command:
|
||||
#
|
||||
# pip-compile requirements/extra-msg.in
|
||||
#
|
||||
msg-parser==1.2.0
|
||||
# via -r requirements/extra-msg.in
|
||||
olefile==0.46
|
||||
# via msg-parser
|
5
requirements/extra-odt.in
Normal file
5
requirements/extra-odt.in
Normal file
@ -0,0 +1,5 @@
|
||||
-c constraints.in
|
||||
-c base.txt
|
||||
|
||||
python-docx
|
||||
pypandoc
|
14
requirements/extra-odt.txt
Normal file
14
requirements/extra-odt.txt
Normal file
@ -0,0 +1,14 @@
|
||||
#
|
||||
# This file is autogenerated by pip-compile with Python 3.11
|
||||
# by the following command:
|
||||
#
|
||||
# pip-compile requirements/extra-odt.in
|
||||
#
|
||||
lxml==4.9.3
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# python-docx
|
||||
pypandoc==1.11
|
||||
# via -r requirements/extra-odt.in
|
||||
python-docx==0.8.11
|
||||
# via -r requirements/extra-odt.in
|
4
requirements/extra-pandoc.in
Normal file
4
requirements/extra-pandoc.in
Normal file
@ -0,0 +1,4 @@
|
||||
-c constraints.in
|
||||
-c base.txt
|
||||
|
||||
pypandoc
|
8
requirements/extra-pandoc.txt
Normal file
8
requirements/extra-pandoc.txt
Normal file
@ -0,0 +1,8 @@
|
||||
#
|
||||
# This file is autogenerated by pip-compile with Python 3.11
|
||||
# by the following command:
|
||||
#
|
||||
# pip-compile requirements/extra-pandoc.in
|
||||
#
|
||||
pypandoc==1.11
|
||||
# via -r requirements/extra-pandoc.in
|
9
requirements/extra-pdf-image.in
Normal file
9
requirements/extra-pdf-image.in
Normal file
@ -0,0 +1,9 @@
|
||||
-c constraints.in
|
||||
-c base.txt
|
||||
|
||||
pdf2image
|
||||
pdfminer.six
|
||||
# NOTE(robinson) - See this issue here
|
||||
# https://github.com/facebookresearch/detectron2/issues/5010
|
||||
Pillow<10
|
||||
unstructured-inference==0.5.7
|
@ -1,8 +1,8 @@
|
||||
#
|
||||
# This file is autogenerated by pip-compile with Python 3.8
|
||||
# This file is autogenerated by pip-compile with Python 3.11
|
||||
# by the following command:
|
||||
#
|
||||
# pip-compile requirements/local-inference.in
|
||||
# pip-compile requirements/extra-pdf-image.in
|
||||
#
|
||||
antlr4-python3-runtime==4.9.3
|
||||
# via omegaconf
|
||||
@ -12,9 +12,7 @@ certifi==2023.7.22
|
||||
# -c requirements/constraints.in
|
||||
# requests
|
||||
cffi==1.15.1
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# cryptography
|
||||
# via cryptography
|
||||
charset-normalizer==3.2.0
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
@ -25,9 +23,7 @@ coloredlogs==15.0.1
|
||||
contourpy==1.1.0
|
||||
# via matplotlib
|
||||
cryptography==41.0.2
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# pdfminer-six
|
||||
# via pdfminer-six
|
||||
cycler==0.11.0
|
||||
# via matplotlib
|
||||
effdet==0.4.1
|
||||
@ -54,8 +50,6 @@ idna==3.4
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# requests
|
||||
importlib-resources==6.0.0
|
||||
# via matplotlib
|
||||
iopath==0.1.10
|
||||
# via layoutparser
|
||||
jinja2==3.1.2
|
||||
@ -74,7 +68,7 @@ networkx==3.1
|
||||
# via torch
|
||||
numpy==1.24.4
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# -c requirements/constraints.in
|
||||
# contourpy
|
||||
# layoutparser
|
||||
# matplotlib
|
||||
@ -101,22 +95,21 @@ packaging==23.1
|
||||
# pytesseract
|
||||
# transformers
|
||||
pandas==2.0.3
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# layoutparser
|
||||
# via layoutparser
|
||||
pdf2image==1.16.3
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# -r requirements/extra-pdf-image.in
|
||||
# layoutparser
|
||||
pdfminer-six==20221105
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# -r requirements/extra-pdf-image.in
|
||||
# pdfplumber
|
||||
pdfplumber==0.10.1
|
||||
pdfplumber==0.10.2
|
||||
# via layoutparser
|
||||
pillow==10.0.0
|
||||
pillow==9.5.0
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# -c requirements/constraints.in
|
||||
# -r requirements/extra-pdf-image.in
|
||||
# layoutparser
|
||||
# matplotlib
|
||||
# pdf2image
|
||||
@ -132,9 +125,7 @@ protobuf==4.23.4
|
||||
pycocotools==2.0.6
|
||||
# via effdet
|
||||
pycparser==2.21
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# cffi
|
||||
# via cffi
|
||||
pyparsing==3.0.9
|
||||
# via
|
||||
# -c requirements/constraints.in
|
||||
@ -145,15 +136,12 @@ pytesseract==0.3.10
|
||||
# via layoutparser
|
||||
python-dateutil==2.8.2
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# matplotlib
|
||||
# pandas
|
||||
python-multipart==0.0.6
|
||||
# via unstructured-inference
|
||||
pytz==2023.3
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# pandas
|
||||
# via pandas
|
||||
pyyaml==6.0.1
|
||||
# via
|
||||
# huggingface-hub
|
||||
@ -176,11 +164,11 @@ safetensors==0.3.1
|
||||
# timm
|
||||
# transformers
|
||||
scipy==1.10.1
|
||||
# via layoutparser
|
||||
six==1.16.0
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# python-dateutil
|
||||
# -c requirements/constraints.in
|
||||
# layoutparser
|
||||
six==1.16.0
|
||||
# via python-dateutil
|
||||
sympy==1.12
|
||||
# via
|
||||
# onnxruntime
|
||||
@ -214,17 +202,11 @@ typing-extensions==4.7.1
|
||||
# iopath
|
||||
# torch
|
||||
tzdata==2023.3
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# pandas
|
||||
# via pandas
|
||||
unstructured-inference==0.5.7
|
||||
# via -r requirements/local-inference.in
|
||||
# via -r requirements/extra-pdf-image.in
|
||||
urllib3==1.26.16
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# -c requirements/constraints.in
|
||||
# requests
|
||||
zipp==3.16.2
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# importlib-resources
|
3
requirements/extra-pptx.in
Normal file
3
requirements/extra-pptx.in
Normal file
@ -0,0 +1,3 @@
|
||||
-c "constraints.in"
|
||||
|
||||
python-pptx
|
16
requirements/extra-pptx.txt
Normal file
16
requirements/extra-pptx.txt
Normal file
@ -0,0 +1,16 @@
|
||||
#
|
||||
# This file is autogenerated by pip-compile with Python 3.11
|
||||
# by the following command:
|
||||
#
|
||||
# pip-compile requirements/extra-pptx.in
|
||||
#
|
||||
lxml==4.9.3
|
||||
# via python-pptx
|
||||
pillow==9.5.0
|
||||
# via
|
||||
# -c requirements/constraints.in
|
||||
# python-pptx
|
||||
python-pptx==0.6.21
|
||||
# via -r requirements/extra-pptx.in
|
||||
xlsxwriter==3.1.2
|
||||
# via python-pptx
|
6
requirements/extra-xlsx.in
Normal file
6
requirements/extra-xlsx.in
Normal file
@ -0,0 +1,6 @@
|
||||
-c constraints.in
|
||||
-c base.txt
|
||||
|
||||
openpyxl
|
||||
pandas
|
||||
xlrd
|
26
requirements/extra-xlsx.txt
Normal file
26
requirements/extra-xlsx.txt
Normal file
@ -0,0 +1,26 @@
|
||||
#
|
||||
# This file is autogenerated by pip-compile with Python 3.11
|
||||
# by the following command:
|
||||
#
|
||||
# pip-compile requirements/extra-xlsx.in
|
||||
#
|
||||
et-xmlfile==1.1.0
|
||||
# via openpyxl
|
||||
numpy==1.24.4
|
||||
# via
|
||||
# -c requirements/constraints.in
|
||||
# pandas
|
||||
openpyxl==3.1.2
|
||||
# via -r requirements/extra-xlsx.in
|
||||
pandas==2.0.3
|
||||
# via -r requirements/extra-xlsx.in
|
||||
python-dateutil==2.8.2
|
||||
# via pandas
|
||||
pytz==2023.3
|
||||
# via pandas
|
||||
six==1.16.0
|
||||
# via python-dateutil
|
||||
tzdata==2023.3
|
||||
# via pandas
|
||||
xlrd==2.0.1
|
||||
# via -r requirements/extra-xlsx.in
|
@ -1,5 +1,5 @@
|
||||
#
|
||||
# This file is autogenerated by pip-compile with Python 3.8
|
||||
# This file is autogenerated by pip-compile with Python 3.11
|
||||
# by the following command:
|
||||
#
|
||||
# pip-compile requirements/huggingface.in
|
||||
@ -46,7 +46,7 @@ networkx==3.1
|
||||
# via torch
|
||||
numpy==1.24.4
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# -c requirements/constraints.in
|
||||
# transformers
|
||||
packaging==23.1
|
||||
# via
|
||||
@ -74,7 +74,6 @@ sentencepiece==0.1.99
|
||||
# via -r requirements/huggingface.in
|
||||
six==1.16.0
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# langdetect
|
||||
# sacremoses
|
||||
sympy==1.12
|
||||
|
@ -1,5 +1,5 @@
|
||||
#
|
||||
# This file is autogenerated by pip-compile with Python 3.8
|
||||
# This file is autogenerated by pip-compile with Python 3.11
|
||||
# by the following command:
|
||||
#
|
||||
# pip-compile requirements/ingest-azure.in
|
||||
@ -32,7 +32,6 @@ certifi==2023.7.22
|
||||
# requests
|
||||
cffi==1.15.1
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# azure-datalake-store
|
||||
# cryptography
|
||||
charset-normalizer==3.2.0
|
||||
@ -42,7 +41,6 @@ charset-normalizer==3.2.0
|
||||
# requests
|
||||
cryptography==41.0.2
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# azure-identity
|
||||
# azure-storage-blob
|
||||
# msal
|
||||
@ -76,9 +74,7 @@ multidict==6.0.4
|
||||
portalocker==2.7.0
|
||||
# via msal-extensions
|
||||
pycparser==2.21
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# cffi
|
||||
# via cffi
|
||||
pyjwt[crypto]==2.8.0
|
||||
# via msal
|
||||
requests==2.31.0
|
||||
@ -89,7 +85,6 @@ requests==2.31.0
|
||||
# msal
|
||||
six==1.16.0
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# azure-core
|
||||
# azure-identity
|
||||
# isodate
|
||||
|
@ -8,7 +8,7 @@ attrs==23.1.0
|
||||
# via boxsdk
|
||||
boxfs==0.2.0
|
||||
# via -r requirements/ingest-box.in
|
||||
boxsdk[jwt]==3.8.0
|
||||
boxsdk[jwt]==3.8.1
|
||||
# via boxfs
|
||||
certifi==2023.7.22
|
||||
# via
|
||||
@ -16,17 +16,13 @@ certifi==2023.7.22
|
||||
# -c requirements/constraints.in
|
||||
# requests
|
||||
cffi==1.15.1
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# cryptography
|
||||
# via cryptography
|
||||
charset-normalizer==3.2.0
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# requests
|
||||
cryptography==41.0.2
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# boxsdk
|
||||
# via boxsdk
|
||||
fsspec==2023.6.0
|
||||
# via
|
||||
# -r requirements/ingest-box.in
|
||||
@ -36,15 +32,11 @@ idna==3.4
|
||||
# -c requirements/base.txt
|
||||
# requests
|
||||
pycparser==2.21
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# cffi
|
||||
# via cffi
|
||||
pyjwt==2.8.0
|
||||
# via boxsdk
|
||||
python-dateutil==2.8.2
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# boxsdk
|
||||
# via boxsdk
|
||||
requests==2.31.0
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
@ -53,9 +45,7 @@ requests==2.31.0
|
||||
requests-toolbelt==1.0.0
|
||||
# via boxsdk
|
||||
six==1.16.0
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# python-dateutil
|
||||
# via python-dateutil
|
||||
urllib3==1.26.16
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
|
@ -1,10 +1,10 @@
|
||||
#
|
||||
# This file is autogenerated by pip-compile with Python 3.8
|
||||
# This file is autogenerated by pip-compile with Python 3.11
|
||||
# by the following command:
|
||||
#
|
||||
# pip-compile requirements/ingest-confluence.in
|
||||
#
|
||||
atlassian-python-api==3.39.0
|
||||
atlassian-python-api==3.40.0
|
||||
# via -r requirements/ingest-confluence.in
|
||||
certifi==2023.7.22
|
||||
# via
|
||||
@ -33,9 +33,7 @@ requests==2.31.0
|
||||
requests-oauthlib==1.3.1
|
||||
# via atlassian-python-api
|
||||
six==1.16.0
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# atlassian-python-api
|
||||
# via atlassian-python-api
|
||||
urllib3==1.26.16
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
|
@ -1,5 +1,5 @@
|
||||
#
|
||||
# This file is autogenerated by pip-compile with Python 3.8
|
||||
# This file is autogenerated by pip-compile with Python 3.11
|
||||
# by the following command:
|
||||
#
|
||||
# pip-compile requirements/ingest-discord.in
|
||||
|
@ -1,5 +1,5 @@
|
||||
#
|
||||
# This file is autogenerated by pip-compile with Python 3.8
|
||||
# This file is autogenerated by pip-compile with Python 3.11
|
||||
# by the following command:
|
||||
#
|
||||
# pip-compile requirements/ingest-dropbox.in
|
||||
@ -34,7 +34,6 @@ requests==2.31.0
|
||||
# dropboxdrivefs
|
||||
six==1.16.0
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# dropbox
|
||||
# stone
|
||||
stone==3.3.1
|
||||
|
@ -1,5 +1,5 @@
|
||||
#
|
||||
# This file is autogenerated by pip-compile with Python 3.8
|
||||
# This file is autogenerated by pip-compile with Python 3.11
|
||||
# by the following command:
|
||||
#
|
||||
# pip-compile requirements/ingest-elasticsearch.in
|
||||
@ -11,7 +11,7 @@ certifi==2023.7.22
|
||||
# elastic-transport
|
||||
elastic-transport==8.4.0
|
||||
# via elasticsearch
|
||||
elasticsearch==8.8.2
|
||||
elasticsearch==8.9.0
|
||||
# via -r requirements/ingest-elasticsearch.in
|
||||
jq==1.4.1
|
||||
# via -r requirements/ingest-elasticsearch.in
|
||||
|
@ -1,5 +1,5 @@
|
||||
#
|
||||
# This file is autogenerated by pip-compile with Python 3.8
|
||||
# This file is autogenerated by pip-compile with Python 3.11
|
||||
# by the following command:
|
||||
#
|
||||
# pip-compile requirements/ingest-gcs.in
|
||||
@ -57,7 +57,7 @@ google-crc32c==1.5.0
|
||||
# via google-resumable-media
|
||||
google-resumable-media==2.5.0
|
||||
# via google-cloud-storage
|
||||
googleapis-common-protos==1.59.1
|
||||
googleapis-common-protos==1.60.0
|
||||
# via google-api-core
|
||||
idna==3.4
|
||||
# via
|
||||
@ -74,7 +74,6 @@ protobuf==4.23.4
|
||||
# via
|
||||
# -c requirements/constraints.in
|
||||
# google-api-core
|
||||
# googleapis-common-protos
|
||||
pyasn1==0.5.0
|
||||
# via
|
||||
# pyasn1-modules
|
||||
@ -93,9 +92,7 @@ requests-oauthlib==1.3.1
|
||||
rsa==4.9
|
||||
# via google-auth
|
||||
six==1.16.0
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# google-auth
|
||||
# via google-auth
|
||||
urllib3==1.26.16
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
|
@ -1,5 +1,5 @@
|
||||
#
|
||||
# This file is autogenerated by pip-compile with Python 3.8
|
||||
# This file is autogenerated by pip-compile with Python 3.11
|
||||
# by the following command:
|
||||
#
|
||||
# pip-compile requirements/ingest-github.in
|
||||
@ -11,7 +11,6 @@ certifi==2023.7.22
|
||||
# requests
|
||||
cffi==1.15.1
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# cryptography
|
||||
# pynacl
|
||||
charset-normalizer==3.2.0
|
||||
@ -19,9 +18,7 @@ charset-normalizer==3.2.0
|
||||
# -c requirements/base.txt
|
||||
# requests
|
||||
cryptography==41.0.2
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# pyjwt
|
||||
# via pyjwt
|
||||
deprecated==1.2.14
|
||||
# via pygithub
|
||||
idna==3.4
|
||||
@ -29,9 +26,7 @@ idna==3.4
|
||||
# -c requirements/base.txt
|
||||
# requests
|
||||
pycparser==2.21
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# cffi
|
||||
# via cffi
|
||||
pygithub==1.58.2
|
||||
# via -r requirements/ingest-github.in
|
||||
pyjwt[crypto]==2.8.0
|
||||
|
@ -1,5 +1,5 @@
|
||||
#
|
||||
# This file is autogenerated by pip-compile with Python 3.8
|
||||
# This file is autogenerated by pip-compile with Python 3.11
|
||||
# by the following command:
|
||||
#
|
||||
# pip-compile requirements/ingest-gitlab.in
|
||||
|
@ -1,5 +1,5 @@
|
||||
#
|
||||
# This file is autogenerated by pip-compile with Python 3.8
|
||||
# This file is autogenerated by pip-compile with Python 3.11
|
||||
# by the following command:
|
||||
#
|
||||
# pip-compile requirements/ingest-google-drive.in
|
||||
@ -26,7 +26,7 @@ google-auth==2.22.0
|
||||
# google-auth-httplib2
|
||||
google-auth-httplib2==0.1.0
|
||||
# via google-api-python-client
|
||||
googleapis-common-protos==1.59.1
|
||||
googleapis-common-protos==1.60.0
|
||||
# via google-api-core
|
||||
httplib2==0.22.0
|
||||
# via
|
||||
@ -59,7 +59,6 @@ rsa==4.9
|
||||
# via google-auth
|
||||
six==1.16.0
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# google-auth
|
||||
# google-auth-httplib2
|
||||
uritemplate==4.1.1
|
||||
|
@ -1,5 +1,5 @@
|
||||
#
|
||||
# This file is autogenerated by pip-compile with Python 3.8
|
||||
# This file is autogenerated by pip-compile with Python 3.11
|
||||
# by the following command:
|
||||
#
|
||||
# pip-compile requirements/ingest-onedrive.in
|
||||
@ -10,16 +10,13 @@ certifi==2023.7.22
|
||||
# -c requirements/constraints.in
|
||||
# requests
|
||||
cffi==1.15.1
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# cryptography
|
||||
# via cryptography
|
||||
charset-normalizer==3.2.0
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# requests
|
||||
cryptography==41.0.2
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# -r requirements/ingest-onedrive.in
|
||||
# msal
|
||||
# pyjwt
|
||||
@ -34,15 +31,11 @@ msal==1.23.0
|
||||
office365-rest-python-client==2.4.2
|
||||
# via -r requirements/ingest-onedrive.in
|
||||
pycparser==2.21
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# cffi
|
||||
# via cffi
|
||||
pyjwt[crypto]==2.8.0
|
||||
# via msal
|
||||
pytz==2023.3
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# office365-rest-python-client
|
||||
# via office365-rest-python-client
|
||||
requests==2.31.0
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
|
@ -1,5 +1,5 @@
|
||||
#
|
||||
# This file is autogenerated by pip-compile with Python 3.8
|
||||
# This file is autogenerated by pip-compile with Python 3.11
|
||||
# by the following command:
|
||||
#
|
||||
# pip-compile requirements/ingest-outlook.in
|
||||
@ -10,16 +10,13 @@ certifi==2023.7.22
|
||||
# -c requirements/constraints.in
|
||||
# requests
|
||||
cffi==1.15.1
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# cryptography
|
||||
# via cryptography
|
||||
charset-normalizer==3.2.0
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# requests
|
||||
cryptography==41.0.2
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# -r requirements/ingest-outlook.in
|
||||
# msal
|
||||
# pyjwt
|
||||
@ -34,15 +31,11 @@ msal==1.23.0
|
||||
office365-rest-python-client==2.4.2
|
||||
# via -r requirements/ingest-outlook.in
|
||||
pycparser==2.21
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# cffi
|
||||
# via cffi
|
||||
pyjwt[crypto]==2.8.0
|
||||
# via msal
|
||||
pytz==2023.3
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# office365-rest-python-client
|
||||
# via office365-rest-python-client
|
||||
requests==2.31.0
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
|
@ -1,5 +1,5 @@
|
||||
#
|
||||
# This file is autogenerated by pip-compile with Python 3.8
|
||||
# This file is autogenerated by pip-compile with Python 3.11
|
||||
# by the following command:
|
||||
#
|
||||
# pip-compile requirements/ingest-reddit.in
|
||||
|
@ -1,5 +1,5 @@
|
||||
#
|
||||
# This file is autogenerated by pip-compile with Python 3.8
|
||||
# This file is autogenerated by pip-compile with Python 3.11
|
||||
# by the following command:
|
||||
#
|
||||
# pip-compile requirements/ingest-s3.in
|
||||
@ -43,17 +43,11 @@ multidict==6.0.4
|
||||
# aiohttp
|
||||
# yarl
|
||||
python-dateutil==2.8.2
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# botocore
|
||||
# via botocore
|
||||
s3fs==2023.6.0
|
||||
# via -r requirements/ingest-s3.in
|
||||
six==1.16.0
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# python-dateutil
|
||||
typing-extensions==4.7.1
|
||||
# via aioitertools
|
||||
# via python-dateutil
|
||||
urllib3==1.26.16
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
|
@ -1,5 +1,5 @@
|
||||
#
|
||||
# This file is autogenerated by pip-compile with Python 3.8
|
||||
# This file is autogenerated by pip-compile with Python 3.11
|
||||
# by the following command:
|
||||
#
|
||||
# pip-compile requirements/ingest-slack.in
|
||||
|
@ -1,5 +1,5 @@
|
||||
#
|
||||
# This file is autogenerated by pip-compile with Python 3.8
|
||||
# This file is autogenerated by pip-compile with Python 3.11
|
||||
# by the following command:
|
||||
#
|
||||
# pip-compile requirements/ingest-wikipedia.in
|
||||
|
@ -1,5 +1,5 @@
|
||||
#
|
||||
# This file is autogenerated by pip-compile with Python 3.8
|
||||
# This file is autogenerated by pip-compile with Python 3.11
|
||||
# by the following command:
|
||||
#
|
||||
# pip-compile requirements/test.in
|
||||
@ -26,9 +26,7 @@ coverage[toml]==7.2.7
|
||||
# via
|
||||
# -r requirements/test.in
|
||||
# pytest-cov
|
||||
exceptiongroup==1.1.2
|
||||
# via pytest
|
||||
flake8==6.0.0
|
||||
flake8==6.1.0
|
||||
# via -r requirements/test.in
|
||||
freezegun==1.2.2
|
||||
# via -r requirements/test.in
|
||||
@ -64,17 +62,17 @@ packaging==23.1
|
||||
# via
|
||||
# black
|
||||
# pytest
|
||||
pathspec==0.11.1
|
||||
pathspec==0.11.2
|
||||
# via black
|
||||
platformdirs==3.9.1
|
||||
platformdirs==3.10.0
|
||||
# via black
|
||||
pluggy==1.2.0
|
||||
# via pytest
|
||||
pycodestyle==2.10.0
|
||||
pycodestyle==2.11.0
|
||||
# via flake8
|
||||
pydantic==1.10.12
|
||||
# via label-studio-sdk
|
||||
pyflakes==3.0.1
|
||||
pyflakes==3.1.0
|
||||
# via flake8
|
||||
pytest==7.4.0
|
||||
# via
|
||||
@ -85,28 +83,17 @@ pytest-cov==4.1.0
|
||||
pytest-mock==3.11.1
|
||||
# via -r requirements/test.in
|
||||
python-dateutil==2.8.2
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# freezegun
|
||||
# via freezegun
|
||||
pyyaml==6.0.1
|
||||
# via vcrpy
|
||||
requests==2.31.0
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# label-studio-sdk
|
||||
ruff==0.0.280
|
||||
ruff==0.0.281
|
||||
# via -r requirements/test.in
|
||||
six==1.16.0
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# python-dateutil
|
||||
# vcrpy
|
||||
tomli==2.0.1
|
||||
# via
|
||||
# black
|
||||
# coverage
|
||||
# mypy
|
||||
# pytest
|
||||
# via python-dateutil
|
||||
types-click==7.1.8
|
||||
# via -r requirements/test.in
|
||||
types-markdown==3.4.2.10
|
||||
@ -119,7 +106,6 @@ types-urllib3==1.26.25.14
|
||||
# via types-requests
|
||||
typing-extensions==4.7.1
|
||||
# via
|
||||
# black
|
||||
# mypy
|
||||
# pydantic
|
||||
urllib3==1.26.16
|
||||
@ -127,8 +113,7 @@ urllib3==1.26.16
|
||||
# -c requirements/base.txt
|
||||
# -c requirements/constraints.in
|
||||
# requests
|
||||
# vcrpy
|
||||
vcrpy==5.0.0
|
||||
vcrpy==5.1.0
|
||||
# via -r requirements/test.in
|
||||
wrapt==1.15.0
|
||||
# via vcrpy
|
||||
|
61
setup.py
61
setup.py
@ -34,11 +34,48 @@ def load_requirements(file_list: Optional[Union[str, List[str]]] = None) -> List
|
||||
with open(file, encoding="utf-8") as f:
|
||||
requirements.extend(f.readlines())
|
||||
requirements = [
|
||||
req for req in requirements if not req.startswith("#") and not req.startswith("-")
|
||||
req
|
||||
for req in requirements
|
||||
if not req.startswith("#") and not req.startswith("-")
|
||||
]
|
||||
return requirements
|
||||
|
||||
|
||||
csv_reqs = load_requirements("requirements/extra-csv.in")
|
||||
docx_reqs = load_requirements("requirements/extra-docx.in")
|
||||
epub_reqs = load_requirements("requirements/extra-pandoc.in")
|
||||
image_reqs = load_requirements("requirements/extra-pdf-image.in")
|
||||
markdown_reqs = load_requirements("requirements/extra-markdown.in")
|
||||
msg_reqs = load_requirements("requirements/extra-msg.in")
|
||||
odt_reqs = load_requirements("requirements/extra-odt.in")
|
||||
org_reqs = load_requirements("requirements/extra-pandoc.in")
|
||||
pdf_reqs = load_requirements("requirements/extra-pdf-image.in")
|
||||
pptx_reqs = load_requirements("requirements/extra-pptx.in")
|
||||
rtf_reqs = load_requirements("requirements/extra-pandoc.in")
|
||||
rst_reqs = load_requirements("requirements/extra-pandoc.in")
|
||||
tsv_reqs = load_requirements("requirements/extra-csv.in")
|
||||
xlsx_reqs = load_requirements("requirements/extra-xlsx.in")
|
||||
|
||||
all_doc_reqs = list(
|
||||
set(
|
||||
csv_reqs
|
||||
+ docx_reqs
|
||||
+ epub_reqs
|
||||
+ image_reqs
|
||||
+ markdown_reqs
|
||||
+ msg_reqs
|
||||
+ odt_reqs
|
||||
+ org_reqs
|
||||
+ pdf_reqs
|
||||
+ pptx_reqs
|
||||
+ rtf_reqs
|
||||
+ rst_reqs
|
||||
+ tsv_reqs
|
||||
+ xlsx_reqs,
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
setup(
|
||||
name="unstructured",
|
||||
description="A library that prepares raw documents for downstream ML tasks.",
|
||||
@ -71,8 +108,23 @@ setup(
|
||||
},
|
||||
install_requires=load_requirements(),
|
||||
extras_require={
|
||||
"huggingface": load_requirements("requirements/huggingface.in"),
|
||||
"local-inference": load_requirements("requirements/local-inference.in"),
|
||||
# Document specific extra requirements
|
||||
"all-docs": all_doc_reqs,
|
||||
"csv": csv_reqs,
|
||||
"docx": docx_reqs,
|
||||
"epub": epub_reqs,
|
||||
"image": image_reqs,
|
||||
"md": markdown_reqs,
|
||||
"msg": msg_reqs,
|
||||
"odt": odt_reqs,
|
||||
"org": org_reqs,
|
||||
"pdf": pdf_reqs,
|
||||
"pptx": pptx_reqs,
|
||||
"rtf": rtf_reqs,
|
||||
"rst": rst_reqs,
|
||||
"tsv": tsv_reqs,
|
||||
"xlsx": xlsx_reqs,
|
||||
# Extra requirements for data connectors
|
||||
"s3": load_requirements("requirements/ingest-s3.in"),
|
||||
"azure": load_requirements("requirements/ingest-azure.in"),
|
||||
"discord": load_requirements("requirements/ingest-discord.in"),
|
||||
@ -89,6 +141,9 @@ setup(
|
||||
"onedrive": load_requirements("requirements/ingest-onedrive.in"),
|
||||
"outlook": load_requirements("requirements/ingest-outlook.in"),
|
||||
"confluence": load_requirements("requirements/ingest-confluence.in"),
|
||||
# Legacy extra requirements
|
||||
"huggingface": load_requirements("requirements/huggingface.in"),
|
||||
"local-inference": all_doc_reqs,
|
||||
},
|
||||
package_dir={"unstructured": "unstructured"},
|
||||
package_data={"unstructured": ["nlp/*.txt"]},
|
||||
|
@ -478,4 +478,4 @@ def test_document_to_element_list_omits_coord_system_when_coord_points_absent():
|
||||
def test_get_page_image_metadata_and_coordinate_system():
|
||||
doc = MockDocumentLayout()
|
||||
metadata = _get_page_image_metadata(doc.pages[0])
|
||||
assert type(metadata) == dict
|
||||
assert isinstance(metadata, dict)
|
||||
|
@ -267,6 +267,6 @@ def test_partition_doc_from_file_without_metadata_date(
|
||||
sf = SpooledTemporaryFile()
|
||||
sf.write(f.read())
|
||||
sf.seek(0)
|
||||
elements = partition_doc(file=sf, metadata_last_modified=None)
|
||||
elements = partition_doc(file=sf, metadata_date="2020-07-05")
|
||||
|
||||
assert elements[0].metadata.last_modified is None
|
||||
assert elements[0].metadata.date == "2020-07-05"
|
||||
|
@ -216,8 +216,8 @@ def test_process_file_metadata_exclude_filename_pagenum(mocker, partition_test_r
|
||||
isd_elems = test_ingest_doc.process_file()
|
||||
assert len(isd_elems)
|
||||
for elem in isd_elems:
|
||||
assert "filename" not in elem["metadata"].keys()
|
||||
assert "page_number" not in elem["metadata"].keys()
|
||||
assert "filename" not in elem["metadata"]
|
||||
assert "page_number" not in elem["metadata"]
|
||||
|
||||
|
||||
def test_process_file_flatten_metadata(mocker, partition_test_results):
|
||||
|
@ -1 +1 @@
|
||||
__version__ = "0.8.8" # pragma: no cover
|
||||
__version__ = "0.9.0" # pragma: no cover
|
||||
|
@ -1,11 +1,14 @@
|
||||
import tempfile
|
||||
from typing import IO, Optional
|
||||
|
||||
import pypandoc
|
||||
|
||||
from unstructured.partition.common import exactly_one
|
||||
from unstructured.utils import dependency_exists, requires_dependencies
|
||||
|
||||
if dependency_exists("pypandoc"):
|
||||
import pypandoc
|
||||
|
||||
|
||||
@requires_dependencies(["pypandoc"])
|
||||
def convert_file_to_text(filename: str, source_format: str, target_format: str) -> str:
|
||||
"""Uses pandoc to convert the source document to a raw text string."""
|
||||
try:
|
||||
|
@ -76,7 +76,7 @@ class SimpleGoogleDriveConfig(BaseConnectorConfig):
|
||||
recursive: bool = False
|
||||
|
||||
def __post_init__(self):
|
||||
if self.extension and self.extension not in EXT_TO_FILETYPE.keys():
|
||||
if self.extension and self.extension not in EXT_TO_FILETYPE:
|
||||
raise ValueError(
|
||||
f"Extension not supported. "
|
||||
f"Value MUST be one of {', '.join([k for k in EXT_TO_FILETYPE if k is not None])}.",
|
||||
|
@ -65,7 +65,7 @@ class OneDriveIngestDoc(IngestDocCleanupMixin, BaseIngestDoc):
|
||||
if not self.ext:
|
||||
raise ValueError("Unsupported file without extension.")
|
||||
|
||||
if self.ext not in EXT_TO_FILETYPE.keys():
|
||||
if self.ext not in EXT_TO_FILETYPE:
|
||||
raise ValueError(
|
||||
f"Extension not supported. "
|
||||
f"Value MUST be one of {', '.join([k for k in EXT_TO_FILETYPE if k is not None])}.",
|
||||
|
@ -13,27 +13,58 @@ from unstructured.file_utils.filetype import (
|
||||
)
|
||||
from unstructured.logger import logger
|
||||
from unstructured.partition.common import exactly_one
|
||||
from unstructured.partition.csv import partition_csv
|
||||
from unstructured.partition.doc import partition_doc
|
||||
from unstructured.partition.docx import partition_docx
|
||||
from unstructured.partition.email import partition_email
|
||||
from unstructured.partition.epub import partition_epub
|
||||
from unstructured.partition.html import partition_html
|
||||
from unstructured.partition.image import partition_image
|
||||
from unstructured.partition.json import partition_json
|
||||
from unstructured.partition.md import partition_md
|
||||
from unstructured.partition.msg import partition_msg
|
||||
from unstructured.partition.odt import partition_odt
|
||||
from unstructured.partition.org import partition_org
|
||||
from unstructured.partition.pdf import partition_pdf
|
||||
from unstructured.partition.ppt import partition_ppt
|
||||
from unstructured.partition.pptx import partition_pptx
|
||||
from unstructured.partition.rst import partition_rst
|
||||
from unstructured.partition.rtf import partition_rtf
|
||||
from unstructured.partition.text import partition_text
|
||||
from unstructured.partition.tsv import partition_tsv
|
||||
from unstructured.partition.xlsx import partition_xlsx
|
||||
from unstructured.partition.xml import partition_xml
|
||||
from unstructured.utils import dependency_exists
|
||||
|
||||
if dependency_exists("pandas"):
|
||||
from unstructured.partition.csv import partition_csv
|
||||
from unstructured.partition.tsv import partition_tsv
|
||||
|
||||
|
||||
if dependency_exists("docx"):
|
||||
from unstructured.partition.doc import partition_doc
|
||||
from unstructured.partition.docx import partition_docx
|
||||
|
||||
|
||||
if dependency_exists("docx") and dependency_exists("pypandoc"):
|
||||
from unstructured.partition.odt import partition_odt
|
||||
|
||||
|
||||
if dependency_exists("pypandoc"):
|
||||
from unstructured.partition.epub import partition_epub
|
||||
from unstructured.partition.org import partition_org
|
||||
from unstructured.partition.rst import partition_rst
|
||||
from unstructured.partition.rtf import partition_rtf
|
||||
|
||||
|
||||
if dependency_exists("markdown"):
|
||||
from unstructured.partition.md import partition_md
|
||||
|
||||
|
||||
if dependency_exists("msg_parser"):
|
||||
from unstructured.partition.msg import partition_msg
|
||||
|
||||
|
||||
pdf_imports = ["pdf2image", "pdfminer", "PIL"]
|
||||
if all(dependency_exists(dep) for dep in pdf_imports):
|
||||
from unstructured.partition.pdf import partition_pdf
|
||||
|
||||
|
||||
if dependency_exists("unstructured_inference"):
|
||||
from unstructured.partition.image import partition_image
|
||||
|
||||
|
||||
if dependency_exists("pptx"):
|
||||
from unstructured.partition.ppt import partition_ppt
|
||||
from unstructured.partition.pptx import partition_pptx
|
||||
|
||||
|
||||
if dependency_exists("pandas") and dependency_exists("openpyxl"):
|
||||
from unstructured.partition.xlsx import partition_xlsx
|
||||
|
||||
|
||||
def partition(
|
||||
|
@ -7,7 +7,6 @@ from io import BufferedReader, BytesIO, TextIOWrapper
|
||||
from tempfile import SpooledTemporaryFile
|
||||
from typing import IO, TYPE_CHECKING, Any, BinaryIO, Dict, List, Optional, Tuple, Union
|
||||
|
||||
from docx import table as docxtable
|
||||
from tabulate import tabulate
|
||||
|
||||
from unstructured.documents.coordinates import CoordinateSystem
|
||||
@ -23,6 +22,10 @@ from unstructured.documents.elements import (
|
||||
)
|
||||
from unstructured.logger import logger
|
||||
from unstructured.nlp.patterns import ENUMERATED_BULLETS_RE, UNICODE_BULLETS_RE
|
||||
from unstructured.utils import dependency_exists
|
||||
|
||||
if dependency_exists("docx"):
|
||||
import docx.table as docxtable
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from unstructured_inference.inference.layoutelement import (
|
||||
@ -303,12 +306,12 @@ def convert_to_bytes(
|
||||
return f_bytes
|
||||
|
||||
|
||||
def convert_ms_office_table_to_text(table: docxtable.Table, as_html: bool = True):
|
||||
def convert_ms_office_table_to_text(table: "docxtable.Table", as_html: bool = True) -> str:
|
||||
"""
|
||||
Convert a table object from a Word document to an HTML table string using the tabulate library.
|
||||
|
||||
Args:
|
||||
table (Table): A Table object.
|
||||
table (Table): A docx.table.Table object.
|
||||
as_html (bool): Whether to return the table as an HTML string (True) or a
|
||||
plain text string (False)
|
||||
|
||||
|
@ -4,7 +4,6 @@ from tempfile import SpooledTemporaryFile
|
||||
from typing import IO, BinaryIO, List, Optional, Tuple, Union, cast
|
||||
|
||||
import docx
|
||||
import pypandoc
|
||||
from docx.oxml.shared import qn
|
||||
from docx.text.paragraph import Paragraph
|
||||
from docx.text.run import Run
|
||||
@ -38,6 +37,10 @@ from unstructured.partition.text_type import (
|
||||
is_possible_title,
|
||||
is_us_city_state_zip,
|
||||
)
|
||||
from unstructured.utils import dependency_exists
|
||||
|
||||
if dependency_exists("pypandoc"):
|
||||
import pypandoc
|
||||
|
||||
# NOTE(robinson) - documentation on built in styles can be found at the link below
|
||||
# ref: https://python-docx.readthedocs.io/en/latest/user/
|
||||
@ -314,7 +317,7 @@ def convert_and_partition_docx(
|
||||
metadata_filename: Optional[str] = None,
|
||||
metadata_last_modified: Optional[str] = None,
|
||||
) -> List[Element]:
|
||||
"""Converts a document to DOCX and then partitions it using partition_html. Works with
|
||||
"""Converts a document to DOCX and then partitions it using partition_docx. Works with
|
||||
any file format supported by pandoc.
|
||||
|
||||
Parameters
|
||||
|
@ -3,8 +3,6 @@ import io
|
||||
import json
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from unstructured.documents.elements import (
|
||||
TYPE_TO_TEXT_ELEMENT_MAP,
|
||||
CheckBox,
|
||||
@ -13,6 +11,10 @@ from unstructured.documents.elements import (
|
||||
NoID,
|
||||
)
|
||||
from unstructured.partition.common import exactly_one
|
||||
from unstructured.utils import dependency_exists, requires_dependencies
|
||||
|
||||
if dependency_exists("pandas"):
|
||||
import pandas as pd
|
||||
|
||||
|
||||
def _get_metadata_table_fieldnames():
|
||||
@ -161,7 +163,7 @@ def convert_to_isd_csv(elements: List[Element]) -> str:
|
||||
|
||||
if row.get("sent_from"):
|
||||
row["sender"] = row.get("sent_from")
|
||||
if type(row["sender"]) == list:
|
||||
if isinstance(row["sender"], list):
|
||||
row["sender"] = row["sender"][0]
|
||||
|
||||
with io.StringIO() as buffer:
|
||||
@ -176,11 +178,14 @@ def convert_to_csv(elements: List[Element]) -> str:
|
||||
return convert_to_isd_csv(elements)
|
||||
|
||||
|
||||
def convert_to_dataframe(elements: List[Element], drop_empty_cols: bool = True) -> pd.DataFrame:
|
||||
@requires_dependencies(["pandas"])
|
||||
def convert_to_dataframe(elements: List[Element], drop_empty_cols: bool = True) -> "pd.DataFrame":
|
||||
"""Converts document elements to a pandas DataFrame. The dataframe contains the
|
||||
following columns:
|
||||
text: the element text
|
||||
type: the text type (NarrativeText, Title, etc)
|
||||
|
||||
Output is pd.DataFrame
|
||||
"""
|
||||
csv_string = convert_to_isd_csv(elements)
|
||||
csv_string_io = io.StringIO(csv_string)
|
||||
|
Loading…
x
Reference in New Issue
Block a user