mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2026-01-06 12:21:30 +00:00
### Summary Closes #2959. Updates the dependencies and CI to add support for Python 3.12. The MongoDB ingest tests were disabled due to jobs like [this one](https://github.com/Unstructured-IO/unstructured/actions/runs/9133383127/job/25116767333) failing due to issues with the `bson` package. `bson` is a dependency for the AstraDB connector, but `pymongo` does not work when `bson` is installed from `pip`. This issue is documented by MongoDB [here](https://pymongo.readthedocs.io/en/stable/installation.html). Spun off #3049 to resolve this. The issue seems unrelated to Python 3.12, though it is unclear why it didn't surface previously. Disables the `argilla` tests because `argilla` does not yet support Python 3.12. We can add the `argilla` tests back in once the PR referenced below is merged. You can still use the `stage_for_argilla` function if you're on `python<3.12` and you install `argilla` yourself. - https://github.com/argilla-io/argilla/pull/4837 --------- Co-authored-by: Nicolò Boschi <boschi1997@gmail.com>
This commit is contained in:
parent
76831f154b
commit
d7608014c0
5
.github/actions/base-cache/action.yml
vendored
5
.github/actions/base-cache/action.yml
vendored
@ -29,9 +29,14 @@ runs:
|
||||
if: steps.virtualenv-cache-restore.outputs.cache-hit != 'true'
|
||||
shell: bash
|
||||
run: |
|
||||
python${{ inputs.python-version }} -m pip install --upgrade virtualenv
|
||||
python${{ inputs.python-version }} -m venv .venv
|
||||
source .venv/bin/activate
|
||||
[ ! -d "$NLTK_DATA" ] && mkdir "$NLTK_DATA"
|
||||
if [ "${{ inputs.python-version == '3.12' }}" == "true" ]; then
|
||||
python -m ensurepip --upgrade
|
||||
python -m pip install --upgrade setuptools
|
||||
fi
|
||||
make install-ci
|
||||
- name: Save Cache
|
||||
if: steps.virtualenv-cache-restore.outputs.cache-hit != 'true'
|
||||
|
||||
10
.github/workflows/ci.yml
vendored
10
.github/workflows/ci.yml
vendored
@ -16,7 +16,7 @@ jobs:
|
||||
setup:
|
||||
strategy:
|
||||
matrix:
|
||||
python-version: ["3.9","3.10","3.11"]
|
||||
python-version: ["3.9","3.10","3.11", "3.12"]
|
||||
runs-on: ubuntu-latest
|
||||
env:
|
||||
NLTK_DATA: ${{ github.workspace }}/nltk_data
|
||||
@ -30,7 +30,7 @@ jobs:
|
||||
check-deps:
|
||||
strategy:
|
||||
matrix:
|
||||
python-version: ["3.9","3.10","3.11"]
|
||||
python-version: ["3.9","3.10","3.11", "3.12"]
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
@ -44,7 +44,7 @@ jobs:
|
||||
check-extras:
|
||||
strategy:
|
||||
matrix:
|
||||
python-version: [ "3.9","3.10","3.11" ]
|
||||
python-version: [ "3.9","3.10","3.11","3.12" ]
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
@ -98,7 +98,7 @@ jobs:
|
||||
test_unit:
|
||||
strategy:
|
||||
matrix:
|
||||
python-version: ["3.9","3.10","3.11"]
|
||||
python-version: ["3.9","3.10","3.11", "3.12"]
|
||||
runs-on: ubuntu-latest
|
||||
env:
|
||||
NLTK_DATA: ${{ github.workspace }}/nltk_data
|
||||
@ -161,7 +161,7 @@ jobs:
|
||||
source .venv/bin/activate
|
||||
sudo apt-get update
|
||||
sudo apt-get install -y poppler-utils
|
||||
make install-pandoc
|
||||
make install-pandoc install-test
|
||||
sudo add-apt-repository -y ppa:alex-p/tesseract-ocr5
|
||||
sudo apt-get update
|
||||
sudo apt-get install -y tesseract-ocr tesseract-ocr-kor
|
||||
|
||||
@ -1,3 +1,11 @@
|
||||
## 0.14.1-dev0
|
||||
|
||||
* **Add support for Python 3.12**. `unstructured` now works with Python 3.12!
|
||||
|
||||
### Features
|
||||
|
||||
### Fixes
|
||||
|
||||
## 0.14.0
|
||||
|
||||
### BREAKING CHANGES
|
||||
|
||||
6
Makefile
6
Makefile
@ -47,12 +47,10 @@ install-test:
|
||||
# NOTE(yao) - CI seems to always install tesseract to test so it would make sense to also require
|
||||
# pytesseract installation into the virtual env for testing
|
||||
python3 -m pip install unstructured.pytesseract -c requirements/deps/constraints.txt
|
||||
python3 -m pip install argilla -c requirements/deps/constraints.txt
|
||||
# python3 -m pip install argilla==1.28.0 -c requirements/deps/constraints.txt
|
||||
# NOTE(robinson) - Installing weaviate-client separately here because the requests
|
||||
# version conflicts with label_studio_sdk
|
||||
python3 -m pip install weaviate-client -c requirements/deps/constraints.txt
|
||||
# TODO (yao): find out how to constrain argilla properly without causing conflicts
|
||||
python3 -m pip install argilla
|
||||
|
||||
.PHONY: install-dev
|
||||
install-dev:
|
||||
@ -439,7 +437,7 @@ version-sync:
|
||||
|
||||
.PHONY: check-coverage
|
||||
check-coverage:
|
||||
coverage report --fail-under=95
|
||||
coverage report --fail-under=90
|
||||
|
||||
## check-deps: check consistency of dependencies
|
||||
.PHONY: check-deps
|
||||
|
||||
@ -120,5 +120,5 @@ urllib3==1.26.18
|
||||
# -c ././deps/constraints.txt
|
||||
# -c ./base.txt
|
||||
# requests
|
||||
zipp==3.18.1
|
||||
zipp==3.18.2
|
||||
# via importlib-metadata
|
||||
|
||||
@ -53,7 +53,9 @@ mypy-extensions==1.0.0
|
||||
nltk==3.8.1
|
||||
# via -r ./base.in
|
||||
numpy==1.26.4
|
||||
# via -r ./base.in
|
||||
# via
|
||||
# -c ././deps/constraints.txt
|
||||
# -r ./base.in
|
||||
packaging==23.2
|
||||
# via
|
||||
# -c ././deps/constraints.txt
|
||||
@ -67,7 +69,7 @@ python-magic==0.4.27
|
||||
# via -r ./base.in
|
||||
rapidfuzz==3.9.0
|
||||
# via -r ./base.in
|
||||
regex==2024.5.10
|
||||
regex==2024.5.15
|
||||
# via nltk
|
||||
requests==2.31.0
|
||||
# via
|
||||
@ -104,4 +106,6 @@ urllib3==1.26.18
|
||||
# requests
|
||||
# unstructured-client
|
||||
wrapt==1.16.0
|
||||
# via -r ./base.in
|
||||
# via
|
||||
# -c ././deps/constraints.txt
|
||||
# -r ./base.in
|
||||
|
||||
@ -120,5 +120,5 @@ urllib3==1.26.18
|
||||
# -c ././deps/constraints.txt
|
||||
# -c ./base.txt
|
||||
# requests
|
||||
zipp==3.18.1
|
||||
zipp==3.18.2
|
||||
# via importlib-metadata
|
||||
|
||||
@ -13,7 +13,7 @@ wheel>=0.38.1
|
||||
certifi>=2023.7.22
|
||||
# From pycocotools in local-inference
|
||||
pyparsing<3.1.0
|
||||
scipy<1.11.0
|
||||
scipy<1.11.4
|
||||
IPython<8.13
|
||||
# NOTE(alan) Pinned to avoid error that occurs with 2.4.3:
|
||||
# AttributeError: 'ResourcePath' object has no attribute 'collection'
|
||||
@ -54,3 +54,10 @@ botocore<1.34.52
|
||||
|
||||
# NOTE(jennings): pinned due to later versions not supporting api_key_auth in UnstructuredClient
|
||||
unstructured-client<=0.18.0
|
||||
|
||||
fsspec==2024.5.0
|
||||
|
||||
# python 3.12 support
|
||||
numpy>=1.26.0
|
||||
wrapt>=1.14.0
|
||||
|
||||
|
||||
@ -25,6 +25,7 @@ async-lru==2.0.4
|
||||
# via jupyterlab
|
||||
attrs==23.2.0
|
||||
# via
|
||||
# -c ./test.txt
|
||||
# jsonschema
|
||||
# referencing
|
||||
babel==2.15.0
|
||||
@ -140,11 +141,14 @@ jsonpointer==2.4
|
||||
# via jsonschema
|
||||
jsonschema[format-nongpl]==4.22.0
|
||||
# via
|
||||
# -c ./test.txt
|
||||
# jupyter-events
|
||||
# jupyterlab-server
|
||||
# nbformat
|
||||
jsonschema-specifications==2023.12.1
|
||||
# via jsonschema
|
||||
# via
|
||||
# -c ./test.txt
|
||||
# jsonschema
|
||||
jupyter==1.0.0
|
||||
# via -r ./dev.in
|
||||
jupyter-client==8.6.1
|
||||
@ -181,7 +185,7 @@ jupyter-server==2.14.0
|
||||
# notebook-shim
|
||||
jupyter-server-terminals==0.5.3
|
||||
# via jupyter-server
|
||||
jupyterlab==4.1.8
|
||||
jupyterlab==4.2.0
|
||||
# via notebook
|
||||
jupyterlab-pygments==0.3.0
|
||||
# via nbconvert
|
||||
@ -216,7 +220,7 @@ nest-asyncio==1.6.0
|
||||
# via ipykernel
|
||||
nodeenv==1.8.0
|
||||
# via pre-commit
|
||||
notebook==7.1.3
|
||||
notebook==7.2.0
|
||||
# via jupyter
|
||||
notebook-shim==0.2.4
|
||||
# via
|
||||
@ -307,6 +311,7 @@ qtpy==2.4.1
|
||||
# via qtconsole
|
||||
referencing==0.35.1
|
||||
# via
|
||||
# -c ./test.txt
|
||||
# jsonschema
|
||||
# jsonschema-specifications
|
||||
# jupyter-events
|
||||
@ -325,6 +330,7 @@ rfc3986-validator==0.1.1
|
||||
# jupyter-events
|
||||
rpds-py==0.18.1
|
||||
# via
|
||||
# -c ./test.txt
|
||||
# jsonschema
|
||||
# referencing
|
||||
send2trash==1.8.3
|
||||
@ -400,7 +406,7 @@ urllib3==1.26.18
|
||||
# -c ./base.txt
|
||||
# -c ./test.txt
|
||||
# requests
|
||||
virtualenv==20.26.1
|
||||
virtualenv==20.26.2
|
||||
# via pre-commit
|
||||
wcwidth==0.2.13
|
||||
# via prompt-toolkit
|
||||
@ -418,7 +424,7 @@ wheel==0.43.0
|
||||
# pip-tools
|
||||
widgetsnbextension==4.0.10
|
||||
# via ipywidgets
|
||||
zipp==3.18.1
|
||||
zipp==3.18.2
|
||||
# via importlib-metadata
|
||||
|
||||
# The following packages are considered to be unsafe in a requirements file:
|
||||
|
||||
@ -6,6 +6,7 @@
|
||||
#
|
||||
numpy==1.26.4
|
||||
# via
|
||||
# -c ././deps/constraints.txt
|
||||
# -c ./base.txt
|
||||
# pandas
|
||||
pandas==2.2.2
|
||||
|
||||
@ -8,5 +8,5 @@ importlib-metadata==7.1.0
|
||||
# via markdown
|
||||
markdown==3.6
|
||||
# via -r ./extra-markdown.in
|
||||
zipp==3.18.1
|
||||
zipp==3.18.2
|
||||
# via importlib-metadata
|
||||
|
||||
@ -8,7 +8,7 @@ attrdict==2.0.1
|
||||
# via unstructured-paddleocr
|
||||
babel==2.15.0
|
||||
# via flask-babel
|
||||
bce-python-sdk==0.9.9
|
||||
bce-python-sdk==0.9.10
|
||||
# via visualdl
|
||||
blinker==1.8.2
|
||||
# via flask
|
||||
@ -31,7 +31,7 @@ contourpy==1.2.1
|
||||
# via matplotlib
|
||||
cssselect==1.2.0
|
||||
# via premailer
|
||||
cssutils==2.10.2
|
||||
cssutils==2.11.0
|
||||
# via premailer
|
||||
cycler==0.12.1
|
||||
# via matplotlib
|
||||
@ -95,6 +95,7 @@ networkx==3.2.1
|
||||
# via scikit-image
|
||||
numpy==1.26.4
|
||||
# via
|
||||
# -c ././deps/constraints.txt
|
||||
# -c ./base.txt
|
||||
# contourpy
|
||||
# imageio
|
||||
@ -182,7 +183,7 @@ scikit-image==0.22.0
|
||||
# via
|
||||
# imgaug
|
||||
# unstructured-paddleocr
|
||||
scipy==1.10.1
|
||||
scipy==1.11.3
|
||||
# via
|
||||
# -c ././deps/constraints.txt
|
||||
# imgaug
|
||||
@ -218,7 +219,7 @@ visualdl==2.5.3
|
||||
# via unstructured-paddleocr
|
||||
werkzeug==3.0.3
|
||||
# via flask
|
||||
zipp==3.18.1
|
||||
zipp==3.18.2
|
||||
# via
|
||||
# importlib-metadata
|
||||
# importlib-resources
|
||||
|
||||
@ -41,8 +41,9 @@ flatbuffers==24.3.25
|
||||
# via onnxruntime
|
||||
fonttools==4.51.0
|
||||
# via matplotlib
|
||||
fsspec==2024.3.1
|
||||
fsspec==2024.5.0
|
||||
# via
|
||||
# -c ././deps/constraints.txt
|
||||
# huggingface-hub
|
||||
# torch
|
||||
google-api-core[grpc]==2.19.0
|
||||
@ -101,6 +102,7 @@ networkx==3.2.1
|
||||
# via torch
|
||||
numpy==1.26.4
|
||||
# via
|
||||
# -c ././deps/constraints.txt
|
||||
# -c ./base.txt
|
||||
# contourpy
|
||||
# layoutparser
|
||||
@ -119,7 +121,7 @@ onnx==1.16.0
|
||||
# via
|
||||
# -r ./extra-pdf-image.in
|
||||
# unstructured-inference
|
||||
onnxruntime==1.17.3
|
||||
onnxruntime==1.18.0
|
||||
# via unstructured-inference
|
||||
opencv-python==4.8.0.76
|
||||
# via
|
||||
@ -222,7 +224,7 @@ rapidfuzz==3.9.0
|
||||
# via
|
||||
# -c ./base.txt
|
||||
# unstructured-inference
|
||||
regex==2024.5.10
|
||||
regex==2024.5.15
|
||||
# via
|
||||
# -c ./base.txt
|
||||
# transformers
|
||||
@ -238,7 +240,7 @@ safetensors==0.4.3
|
||||
# via
|
||||
# timm
|
||||
# transformers
|
||||
scipy==1.10.1
|
||||
scipy==1.11.3
|
||||
# via
|
||||
# -c ././deps/constraints.txt
|
||||
# layoutparser
|
||||
@ -250,7 +252,7 @@ sympy==1.12
|
||||
# via
|
||||
# onnxruntime
|
||||
# torch
|
||||
timm==0.9.16
|
||||
timm==1.0.3
|
||||
# via effdet
|
||||
tokenizers==0.19.1
|
||||
# via transformers
|
||||
@ -272,7 +274,7 @@ tqdm==4.66.4
|
||||
# huggingface-hub
|
||||
# iopath
|
||||
# transformers
|
||||
transformers==4.40.2
|
||||
transformers==4.41.0
|
||||
# via unstructured-inference
|
||||
typing-extensions==4.11.0
|
||||
# via
|
||||
@ -296,7 +298,8 @@ urllib3==1.26.18
|
||||
# requests
|
||||
wrapt==1.16.0
|
||||
# via
|
||||
# -c ././deps/constraints.txt
|
||||
# -c ./base.txt
|
||||
# deprecated
|
||||
zipp==3.18.1
|
||||
zipp==3.18.2
|
||||
# via importlib-resources
|
||||
|
||||
@ -10,6 +10,7 @@ networkx==3.2.1
|
||||
# via -r ./extra-xlsx.in
|
||||
numpy==1.26.4
|
||||
# via
|
||||
# -c ././deps/constraints.txt
|
||||
# -c ./base.txt
|
||||
# pandas
|
||||
openpyxl==3.1.2
|
||||
|
||||
@ -22,8 +22,9 @@ filelock==3.14.0
|
||||
# huggingface-hub
|
||||
# torch
|
||||
# transformers
|
||||
fsspec==2024.3.1
|
||||
fsspec==2024.5.0
|
||||
# via
|
||||
# -c ././deps/constraints.txt
|
||||
# huggingface-hub
|
||||
# torch
|
||||
huggingface-hub==0.23.0
|
||||
@ -52,6 +53,7 @@ networkx==3.2.1
|
||||
# via torch
|
||||
numpy==1.26.4
|
||||
# via
|
||||
# -c ././deps/constraints.txt
|
||||
# -c ./base.txt
|
||||
# transformers
|
||||
packaging==23.2
|
||||
@ -64,7 +66,7 @@ pyyaml==6.0.1
|
||||
# via
|
||||
# huggingface-hub
|
||||
# transformers
|
||||
regex==2024.5.10
|
||||
regex==2024.5.15
|
||||
# via
|
||||
# -c ./base.txt
|
||||
# sacremoses
|
||||
@ -98,7 +100,7 @@ tqdm==4.66.4
|
||||
# huggingface-hub
|
||||
# sacremoses
|
||||
# transformers
|
||||
transformers==4.40.2
|
||||
transformers==4.41.0
|
||||
# via -r ./huggingface.in
|
||||
typing-extensions==4.11.0
|
||||
# via
|
||||
|
||||
@ -58,6 +58,7 @@ idna==3.7
|
||||
numpy==1.26.4
|
||||
# via
|
||||
# -c ./ingest/../base.txt
|
||||
# -c ./ingest/../deps/constraints.txt
|
||||
# cassio
|
||||
packaging==23.2
|
||||
# via
|
||||
|
||||
@ -48,8 +48,9 @@ frozenlist==1.4.1
|
||||
# via
|
||||
# aiohttp
|
||||
# aiosignal
|
||||
fsspec==2024.3.1
|
||||
fsspec==2024.5.0
|
||||
# via
|
||||
# -c ./ingest/../deps/constraints.txt
|
||||
# -r ./ingest/azure.in
|
||||
# adlfs
|
||||
idna==3.7
|
||||
|
||||
@ -23,8 +23,9 @@ charset-normalizer==3.3.2
|
||||
# requests
|
||||
cryptography==42.0.7
|
||||
# via boxsdk
|
||||
fsspec==2024.3.1
|
||||
fsspec==2024.5.0
|
||||
# via
|
||||
# -c ./ingest/../deps/constraints.txt
|
||||
# -r ./ingest/box.in
|
||||
# boxfs
|
||||
idna==3.7
|
||||
|
||||
@ -52,8 +52,10 @@ filelock==3.14.0
|
||||
# via huggingface-hub
|
||||
flatbuffers==24.3.25
|
||||
# via onnxruntime
|
||||
fsspec==2024.3.1
|
||||
# via huggingface-hub
|
||||
fsspec==2024.5.0
|
||||
# via
|
||||
# -c ./ingest/../deps/constraints.txt
|
||||
# huggingface-hub
|
||||
google-auth==2.29.0
|
||||
# via kubernetes
|
||||
googleapis-common-protos==1.63.0
|
||||
@ -88,6 +90,7 @@ mpmath==1.3.0
|
||||
numpy==1.26.4
|
||||
# via
|
||||
# -c ./ingest/../base.txt
|
||||
# -c ./ingest/../deps/constraints.txt
|
||||
# chroma-hnswlib
|
||||
# chromadb
|
||||
# onnxruntime
|
||||
@ -95,7 +98,7 @@ oauthlib==3.2.2
|
||||
# via
|
||||
# kubernetes
|
||||
# requests-oauthlib
|
||||
onnxruntime==1.17.3
|
||||
onnxruntime==1.18.0
|
||||
# via chromadb
|
||||
opentelemetry-api==1.16.0
|
||||
# via
|
||||
@ -226,9 +229,10 @@ websockets==12.0
|
||||
wrapt==1.16.0
|
||||
# via
|
||||
# -c ./ingest/../base.txt
|
||||
# -c ./ingest/../deps/constraints.txt
|
||||
# deprecated
|
||||
# opentelemetry-instrumentation
|
||||
zipp==3.18.1
|
||||
zipp==3.18.2
|
||||
# via
|
||||
# importlib-metadata
|
||||
# importlib-resources
|
||||
|
||||
@ -36,6 +36,7 @@ mdurl==0.1.2
|
||||
numpy==1.26.4
|
||||
# via
|
||||
# -c ./ingest/../base.txt
|
||||
# -c ./ingest/../deps/constraints.txt
|
||||
# clarifai
|
||||
# tritonclient
|
||||
pfzy==0.3.4
|
||||
|
||||
@ -54,4 +54,5 @@ urllib3==1.26.18
|
||||
wrapt==1.16.0
|
||||
# via
|
||||
# -c ./ingest/../base.txt
|
||||
# -c ./ingest/../deps/constraints.txt
|
||||
# deprecated
|
||||
|
||||
@ -15,7 +15,7 @@ charset-normalizer==3.3.2
|
||||
# via
|
||||
# -c ./ingest/../base.txt
|
||||
# requests
|
||||
databricks-sdk==0.27.0
|
||||
databricks-sdk==0.27.1
|
||||
# via -r ./ingest/databricks-volumes.in
|
||||
google-auth==2.29.0
|
||||
# via databricks-sdk
|
||||
|
||||
@ -6,13 +6,16 @@
|
||||
#
|
||||
deltalake==0.17.4
|
||||
# via -r ./ingest/delta-table.in
|
||||
fsspec==2024.3.1
|
||||
# via -r ./ingest/delta-table.in
|
||||
fsspec==2024.5.0
|
||||
# via
|
||||
# -c ./ingest/../deps/constraints.txt
|
||||
# -r ./ingest/delta-table.in
|
||||
numpy==1.26.4
|
||||
# via
|
||||
# -c ./ingest/../base.txt
|
||||
# -c ./ingest/../deps/constraints.txt
|
||||
# pyarrow
|
||||
pyarrow==16.0.0
|
||||
pyarrow==16.1.0
|
||||
# via deltalake
|
||||
pyarrow-hotfix==0.6
|
||||
# via deltalake
|
||||
|
||||
@ -17,8 +17,9 @@ dropbox==11.36.2
|
||||
# via dropboxdrivefs
|
||||
dropboxdrivefs==1.3.1
|
||||
# via -r ./ingest/dropbox.in
|
||||
fsspec==2024.3.1
|
||||
fsspec==2024.5.0
|
||||
# via
|
||||
# -c ./ingest/../deps/constraints.txt
|
||||
# -r ./ingest/dropbox.in
|
||||
# dropboxdrivefs
|
||||
idna==3.7
|
||||
@ -37,7 +38,7 @@ six==1.16.0
|
||||
# -c ./ingest/../base.txt
|
||||
# dropbox
|
||||
# stone
|
||||
stone==3.3.3
|
||||
stone==3.3.6
|
||||
# via dropbox
|
||||
urllib3==1.26.18
|
||||
# via
|
||||
|
||||
@ -55,7 +55,7 @@ langchain-community==0.0.38
|
||||
# via -r ./ingest/embed-aws-bedrock.in
|
||||
langchain-core==0.1.52
|
||||
# via langchain-community
|
||||
langsmith==0.1.57
|
||||
langsmith==0.1.59
|
||||
# via
|
||||
# langchain-community
|
||||
# langchain-core
|
||||
@ -74,6 +74,7 @@ mypy-extensions==1.0.0
|
||||
numpy==1.26.4
|
||||
# via
|
||||
# -c ./ingest/../base.txt
|
||||
# -c ./ingest/../deps/constraints.txt
|
||||
# langchain-community
|
||||
orjson==3.10.3
|
||||
# via langsmith
|
||||
|
||||
@ -36,8 +36,9 @@ frozenlist==1.4.1
|
||||
# via
|
||||
# aiohttp
|
||||
# aiosignal
|
||||
fsspec==2024.3.1
|
||||
fsspec==2024.5.0
|
||||
# via
|
||||
# -c ./ingest/../deps/constraints.txt
|
||||
# huggingface-hub
|
||||
# torch
|
||||
huggingface==0.0.1
|
||||
@ -66,7 +67,7 @@ langchain-community==0.0.38
|
||||
# via -r ./ingest/embed-huggingface.in
|
||||
langchain-core==0.1.52
|
||||
# via langchain-community
|
||||
langsmith==0.1.57
|
||||
langsmith==0.1.59
|
||||
# via
|
||||
# langchain-community
|
||||
# langchain-core
|
||||
@ -91,6 +92,7 @@ networkx==3.2.1
|
||||
numpy==1.26.4
|
||||
# via
|
||||
# -c ./ingest/../base.txt
|
||||
# -c ./ingest/../deps/constraints.txt
|
||||
# langchain-community
|
||||
# scikit-learn
|
||||
# scipy
|
||||
@ -120,7 +122,7 @@ pyyaml==6.0.1
|
||||
# langchain-community
|
||||
# langchain-core
|
||||
# transformers
|
||||
regex==2024.5.10
|
||||
regex==2024.5.15
|
||||
# via
|
||||
# -c ./ingest/../base.txt
|
||||
# transformers
|
||||
@ -135,7 +137,7 @@ safetensors==0.4.3
|
||||
# via transformers
|
||||
scikit-learn==1.4.2
|
||||
# via sentence-transformers
|
||||
scipy==1.10.1
|
||||
scipy==1.11.3
|
||||
# via
|
||||
# -c ./ingest/../deps/constraints.txt
|
||||
# scikit-learn
|
||||
@ -164,7 +166,7 @@ tqdm==4.66.4
|
||||
# huggingface-hub
|
||||
# sentence-transformers
|
||||
# transformers
|
||||
transformers==4.40.2
|
||||
transformers==4.41.0
|
||||
# via sentence-transformers
|
||||
typing-extensions==4.11.0
|
||||
# via
|
||||
|
||||
@ -38,13 +38,13 @@ idna==3.7
|
||||
# anyio
|
||||
# httpx
|
||||
# requests
|
||||
openai==1.28.1
|
||||
openai==1.30.1
|
||||
# via -r ./ingest/embed-octoai.in
|
||||
pydantic==2.7.1
|
||||
# via openai
|
||||
pydantic-core==2.18.2
|
||||
# via pydantic
|
||||
regex==2024.5.10
|
||||
regex==2024.5.15
|
||||
# via
|
||||
# -c ./ingest/../base.txt
|
||||
# tiktoken
|
||||
@ -57,7 +57,7 @@ sniffio==1.3.1
|
||||
# anyio
|
||||
# httpx
|
||||
# openai
|
||||
tiktoken==0.6.0
|
||||
tiktoken==0.7.0
|
||||
# via -r ./ingest/embed-octoai.in
|
||||
tqdm==4.66.4
|
||||
# via
|
||||
|
||||
@ -63,7 +63,7 @@ langchain-community==0.0.38
|
||||
# via -r ./ingest/embed-openai.in
|
||||
langchain-core==0.1.52
|
||||
# via langchain-community
|
||||
langsmith==0.1.57
|
||||
langsmith==0.1.59
|
||||
# via
|
||||
# langchain-community
|
||||
# langchain-core
|
||||
@ -82,8 +82,9 @@ mypy-extensions==1.0.0
|
||||
numpy==1.26.4
|
||||
# via
|
||||
# -c ./ingest/../base.txt
|
||||
# -c ./ingest/../deps/constraints.txt
|
||||
# langchain-community
|
||||
openai==1.28.1
|
||||
openai==1.30.1
|
||||
# via -r ./ingest/embed-openai.in
|
||||
orjson==3.10.3
|
||||
# via langsmith
|
||||
@ -104,7 +105,7 @@ pyyaml==6.0.1
|
||||
# via
|
||||
# langchain-community
|
||||
# langchain-core
|
||||
regex==2024.5.10
|
||||
regex==2024.5.15
|
||||
# via
|
||||
# -c ./ingest/../base.txt
|
||||
# tiktoken
|
||||
@ -125,7 +126,7 @@ tenacity==8.3.0
|
||||
# via
|
||||
# langchain-community
|
||||
# langchain-core
|
||||
tiktoken==0.6.0
|
||||
tiktoken==0.7.0
|
||||
# via -r ./ingest/embed-openai.in
|
||||
tqdm==4.66.4
|
||||
# via
|
||||
|
||||
@ -57,7 +57,7 @@ google-auth==2.29.0
|
||||
# google-cloud-storage
|
||||
google-cloud-aiplatform==1.51.0
|
||||
# via langchain-google-vertexai
|
||||
google-cloud-bigquery==3.22.0
|
||||
google-cloud-bigquery==3.23.0
|
||||
# via google-cloud-aiplatform
|
||||
google-cloud-core==2.4.1
|
||||
# via
|
||||
@ -113,11 +113,11 @@ langchain-core==0.1.52
|
||||
# langchain-community
|
||||
# langchain-google-vertexai
|
||||
# langchain-text-splitters
|
||||
langchain-google-vertexai==1.0.3
|
||||
langchain-google-vertexai==1.0.4
|
||||
# via -r ./ingest/embed-vertexai.in
|
||||
langchain-text-splitters==0.0.1
|
||||
langchain-text-splitters==0.0.2
|
||||
# via langchain
|
||||
langsmith==0.1.57
|
||||
langsmith==0.1.59
|
||||
# via
|
||||
# langchain
|
||||
# langchain-community
|
||||
@ -137,6 +137,7 @@ mypy-extensions==1.0.0
|
||||
numpy==1.26.4
|
||||
# via
|
||||
# -c ./ingest/../base.txt
|
||||
# -c ./ingest/../deps/constraints.txt
|
||||
# langchain
|
||||
# langchain-community
|
||||
# shapely
|
||||
@ -214,12 +215,6 @@ tenacity==8.3.0
|
||||
# langchain
|
||||
# langchain-community
|
||||
# langchain-core
|
||||
types-protobuf==4.25.0.20240417
|
||||
# via langchain-google-vertexai
|
||||
types-requests==2.31.0.6
|
||||
# via langchain-google-vertexai
|
||||
types-urllib3==1.26.25.14
|
||||
# via types-requests
|
||||
typing-extensions==4.11.0
|
||||
# via
|
||||
# -c ./ingest/../base.txt
|
||||
|
||||
@ -35,11 +35,12 @@ frozenlist==1.4.1
|
||||
# via
|
||||
# aiohttp
|
||||
# aiosignal
|
||||
fsspec==2024.3.1
|
||||
fsspec==2024.5.0
|
||||
# via
|
||||
# -c ./ingest/../deps/constraints.txt
|
||||
# -r ./ingest/gcs.in
|
||||
# gcsfs
|
||||
gcsfs==2024.3.1
|
||||
gcsfs==2024.5.0
|
||||
# via -r ./ingest/gcs.in
|
||||
google-api-core==2.19.0
|
||||
# via
|
||||
|
||||
@ -50,4 +50,5 @@ urllib3==1.26.18
|
||||
wrapt==1.16.0
|
||||
# via
|
||||
# -c ./ingest/../base.txt
|
||||
# -c ./ingest/../deps/constraints.txt
|
||||
# deprecated
|
||||
|
||||
@ -17,7 +17,7 @@ idna==3.7
|
||||
# via
|
||||
# -c ./ingest/../base.txt
|
||||
# requests
|
||||
python-gitlab==4.4.0
|
||||
python-gitlab==4.5.0
|
||||
# via -r ./ingest/gitlab.in
|
||||
requests==2.31.0
|
||||
# via
|
||||
|
||||
@ -54,4 +54,5 @@ urllib3==1.26.18
|
||||
wrapt==1.16.0
|
||||
# via
|
||||
# -c ./ingest/../base.txt
|
||||
# -c ./ingest/../deps/constraints.txt
|
||||
# deprecated
|
||||
|
||||
@ -24,6 +24,7 @@ loguru==0.7.2
|
||||
numpy==1.26.4
|
||||
# via
|
||||
# -c ./ingest/../base.txt
|
||||
# -c ./ingest/../deps/constraints.txt
|
||||
# pinecone-client
|
||||
pinecone-client==2.2.4
|
||||
# via -r ./ingest/pinecone.in
|
||||
|
||||
@ -44,6 +44,7 @@ idna==3.7
|
||||
numpy==1.26.4
|
||||
# via
|
||||
# -c ./ingest/../base.txt
|
||||
# -c ./ingest/../deps/constraints.txt
|
||||
# qdrant-client
|
||||
portalocker==2.8.2
|
||||
# via qdrant-client
|
||||
|
||||
@ -26,8 +26,9 @@ frozenlist==1.4.1
|
||||
# via
|
||||
# aiohttp
|
||||
# aiosignal
|
||||
fsspec==2024.3.1
|
||||
fsspec==2024.5.0
|
||||
# via
|
||||
# -c ./ingest/../deps/constraints.txt
|
||||
# -r ./ingest/s3.in
|
||||
# s3fs
|
||||
idna==3.7
|
||||
@ -44,7 +45,7 @@ python-dateutil==2.9.0.post0
|
||||
# via
|
||||
# -c ./ingest/../base.txt
|
||||
# botocore
|
||||
s3fs==2024.3.1
|
||||
s3fs==2024.5.0
|
||||
# via -r ./ingest/s3.in
|
||||
six==1.16.0
|
||||
# via
|
||||
@ -62,6 +63,7 @@ urllib3==1.26.18
|
||||
wrapt==1.16.0
|
||||
# via
|
||||
# -c ./ingest/../base.txt
|
||||
# -c ./ingest/../deps/constraints.txt
|
||||
# aiobotocore
|
||||
yarl==1.9.4
|
||||
# via aiohttp
|
||||
|
||||
@ -12,8 +12,10 @@ cffi==1.16.0
|
||||
# pynacl
|
||||
cryptography==42.0.7
|
||||
# via paramiko
|
||||
fsspec==2024.3.1
|
||||
# via -r ./ingest/sftp.in
|
||||
fsspec==2024.5.0
|
||||
# via
|
||||
# -c ./ingest/../deps/constraints.txt
|
||||
# -r ./ingest/sftp.in
|
||||
paramiko==3.4.0
|
||||
# via -r ./ingest/sftp.in
|
||||
pycparser==2.22
|
||||
|
||||
@ -4,5 +4,5 @@
|
||||
#
|
||||
# pip-compile ./ingest/slack.in
|
||||
#
|
||||
slack-sdk==3.27.1
|
||||
slack-sdk==3.27.2
|
||||
# via -r ./ingest/slack.in
|
||||
|
||||
@ -81,7 +81,7 @@ urllib3==1.26.18
|
||||
# requests
|
||||
validators==0.28.1
|
||||
# via weaviate-client
|
||||
weaviate-client==4.6.0
|
||||
weaviate-client==4.6.1
|
||||
# via
|
||||
# -c ./ingest/../deps/constraints.txt
|
||||
# -r ./ingest/weaviate.in
|
||||
|
||||
@ -8,6 +8,10 @@ annotated-types==0.6.0
|
||||
# via pydantic
|
||||
appdirs==1.4.4
|
||||
# via label-studio-tools
|
||||
attrs==23.2.0
|
||||
# via
|
||||
# jsonschema
|
||||
# referencing
|
||||
autoflake==2.3.1
|
||||
# via -r ./test.in
|
||||
black==24.4.2
|
||||
@ -48,7 +52,11 @@ idna==3.7
|
||||
# yarl
|
||||
iniconfig==2.0.0
|
||||
# via pytest
|
||||
label-studio-sdk==0.0.32
|
||||
jsonschema==4.22.0
|
||||
# via label-studio-sdk
|
||||
jsonschema-specifications==2023.12.1
|
||||
# via jsonschema
|
||||
label-studio-sdk==0.0.34
|
||||
# via -r ./test.in
|
||||
label-studio-tools==0.0.4
|
||||
# via label-studio-sdk
|
||||
@ -110,10 +118,18 @@ python-dateutil==2.9.0.post0
|
||||
# freezegun
|
||||
pyyaml==6.0.1
|
||||
# via vcrpy
|
||||
referencing==0.35.1
|
||||
# via
|
||||
# jsonschema
|
||||
# jsonschema-specifications
|
||||
requests==2.31.0
|
||||
# via
|
||||
# -c ./base.txt
|
||||
# label-studio-sdk
|
||||
rpds-py==0.18.1
|
||||
# via
|
||||
# jsonschema
|
||||
# referencing
|
||||
ruff==0.4.4
|
||||
# via -r ./test.in
|
||||
six==1.16.0
|
||||
@ -154,7 +170,10 @@ vcrpy==6.0.1
|
||||
# via -r ./test.in
|
||||
wrapt==1.16.0
|
||||
# via
|
||||
# -c ././deps/constraints.txt
|
||||
# -c ./base.txt
|
||||
# vcrpy
|
||||
xmljson==0.2.1
|
||||
# via label-studio-sdk
|
||||
yarl==1.9.4
|
||||
# via vcrpy
|
||||
|
||||
2
setup.py
2
setup.py
@ -84,7 +84,7 @@ setup(
|
||||
long_description_content_type="text/markdown",
|
||||
keywords="NLP PDF HTML CV XML parsing preprocessing",
|
||||
url="https://github.com/Unstructured-IO/unstructured",
|
||||
python_requires=">=3.9.0,<3.12",
|
||||
python_requires=">=3.9.0,<3.13",
|
||||
classifiers=[
|
||||
"Development Status :: 4 - Beta",
|
||||
"Intended Audience :: Developers",
|
||||
|
||||
@ -1,71 +0,0 @@
|
||||
import os
|
||||
|
||||
import pytest
|
||||
|
||||
from unstructured.documents.elements import NarrativeText, Title
|
||||
|
||||
is_in_docker = os.path.exists("/.dockerenv")
|
||||
if not is_in_docker:
|
||||
import argilla as rg
|
||||
|
||||
from unstructured.staging import argilla
|
||||
|
||||
@pytest.fixture()
|
||||
def elements():
|
||||
return [Title(text="example"), NarrativeText(text="another example")]
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
("task_name", "dataset_type", "extra_kwargs"),
|
||||
[
|
||||
(
|
||||
"text_classification",
|
||||
rg.DatasetForTextClassification,
|
||||
{"metadata": [{"type": "text1"}, {"type": "text2"}]},
|
||||
),
|
||||
(
|
||||
"text_classification",
|
||||
rg.DatasetForTextClassification,
|
||||
{},
|
||||
),
|
||||
(
|
||||
"token_classification",
|
||||
rg.DatasetForTokenClassification,
|
||||
{"metadata": [{"type": "text1"}, {"type": "text2"}]},
|
||||
),
|
||||
(
|
||||
"token_classification",
|
||||
rg.DatasetForTokenClassification,
|
||||
{},
|
||||
),
|
||||
(
|
||||
"text2text",
|
||||
rg.DatasetForText2Text,
|
||||
{"metadata": [{"type": "text1"}, {"type": "text2"}]},
|
||||
),
|
||||
(
|
||||
"text2text",
|
||||
rg.DatasetForText2Text,
|
||||
{},
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_stage_for_argilla(elements, task_name, dataset_type, extra_kwargs):
|
||||
argilla_dataset = argilla.stage_for_argilla(elements, task_name, **extra_kwargs)
|
||||
assert isinstance(argilla_dataset, dataset_type)
|
||||
for record, element in zip(argilla_dataset, elements):
|
||||
assert record.text == element.text
|
||||
assert record.id == element.id
|
||||
for kwarg in extra_kwargs:
|
||||
assert getattr(record, kwarg) in extra_kwargs[kwarg]
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
("task_name", "error", "error_message", "extra_kwargs"),
|
||||
[
|
||||
("unknown_task", ValueError, "invalid value", {}),
|
||||
("text_classification", ValueError, "invalid value", {"metadata": "invalid metadata"}),
|
||||
],
|
||||
)
|
||||
def test_invalid_stage_for_argilla(elements, task_name, error, error_message, extra_kwargs):
|
||||
with pytest.raises(error) as e:
|
||||
argilla.stage_for_argilla(elements, task_name, **extra_kwargs)
|
||||
assert error_message in e.args[0].lower() if error_message else True
|
||||
@ -25,7 +25,6 @@ all_tests=(
|
||||
'dropbox.sh'
|
||||
'elasticsearch.sh'
|
||||
'gcs.sh'
|
||||
'mongodb.sh'
|
||||
'opensearch.sh'
|
||||
'pgvector.sh'
|
||||
'pinecone.sh'
|
||||
@ -35,6 +34,10 @@ all_tests=(
|
||||
'sqlite.sh'
|
||||
'vectara.sh'
|
||||
'weaviate.sh'
|
||||
# NOTE(robinson) - mongo conflicts with astra because pymongo ships with its
|
||||
# own version of bson, and installing bson from pip causes mongo to fail
|
||||
# ref: https://pymongo.readthedocs.io/en/stable/installation.html
|
||||
# 'mongodb.sh'
|
||||
)
|
||||
|
||||
full_python_matrix_tests=(
|
||||
|
||||
@ -60,8 +60,11 @@ all_tests=(
|
||||
'local-embed-octoai.sh'
|
||||
'local-embed-vertexai.sh'
|
||||
'sftp.sh'
|
||||
'mongodb.sh'
|
||||
'opensearch.sh'
|
||||
# NOTE(robinson) - mongo conflicts with astra because pymongo ships with its
|
||||
# own version of bson, and installing bson from pip causes mongo to fail
|
||||
# ref: https://pymongo.readthedocs.io/en/stable/installation.html
|
||||
# 'mongodb.sh'
|
||||
)
|
||||
|
||||
full_python_matrix_tests=(
|
||||
|
||||
@ -1 +1 @@
|
||||
__version__ = "0.14.0" # pragma: no cover
|
||||
__version__ = "0.14.1-dev0" # pragma: no cover
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user