Feat/bump numpy to 2 (#3961)

This PR updates a few dependencies so that they are compatible with
`numpy>=2`.
This commit is contained in:
Yao You 2025-03-18 16:33:48 -05:00 committed by GitHub
parent 4e424efd22
commit 7de630e45e
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
14 changed files with 146 additions and 167 deletions

View File

@ -1,10 +1,13 @@
## 0.17.1-dev1
## 0.17.1
### Enhancements
- **Add image_url of images in html partitioner** `<img>` tags with non-data content include a new image_url metadata field with the content of the src attribute.
- **Use `lxml` instead of `bs4` to parse hOCR data.** `lxml` is much faster than `bs4` given the hOCR data format is regular (guaranteed because it is programmatically generated)
- **Bump `numpy` to `>=2`.** Also upgrade `paddlepaddle`, `unstructured-paddleocr`, and `onnx` so they are compatible with `numpy>=2`.
### Features
### Fixes

View File

@ -10,9 +10,7 @@ emoji
dataclasses-json
python-iso639
langdetect
# NOTE(robinson) - numpy pin is because ONNX model weights are only compatible
# with numpy 1.x.x
numpy<2
numpy
rapidfuzz
backoff
typing-extensions

View File

@ -4,7 +4,7 @@
#
# pip-compile ./base.in
#
anyio==4.8.0
anyio==4.9.0
# via httpx
backoff==2.2.1
# via -r ./base.in
@ -34,7 +34,7 @@ dataclasses-json==0.6.7
# via
# -r ./base.in
# unstructured-client
deepdiff==8.3.0
deepdiff==8.4.2
# via unstructured-client
emoji==2.14.1
# via -r ./base.in
@ -76,7 +76,7 @@ nest-asyncio==1.6.0
# via unstructured-client
nltk==3.9.1
# via -r ./base.in
numpy==1.26.4
numpy==2.0.2
# via -r ./base.in
olefile==0.47
# via python-oxmsg
@ -90,7 +90,7 @@ psutil==7.0.0
# via -r ./base.in
pycparser==2.22
# via cffi
pypdf==5.3.1
pypdf==5.4.0
# via unstructured-client
python-dateutil==2.9.0.post0
# via unstructured-client

View File

@ -15,9 +15,9 @@ click==8.1.8
# pip-tools
distlib==0.3.9
# via virtualenv
filelock==3.17.0
filelock==3.18.0
# via virtualenv
identify==2.6.8
identify==2.6.9
# via pre-commit
importlib-metadata==8.6.1
# via

View File

@ -4,7 +4,7 @@
#
# pip-compile ./extra-csv.in
#
numpy==1.26.4
numpy==2.0.2
# via
# -c ./base.txt
# pandas

View File

@ -1,5 +1,5 @@
-c ./deps/constraints.txt
-c base.txt
paddlepaddle==3.0.0b1
unstructured.paddleocr==2.8.1.0
paddlepaddle>=3.0.0b1
unstructured.paddleocr==2.10.0

View File

@ -4,12 +4,24 @@
#
# pip-compile ./extra-paddleocr.in
#
anyio==4.8.0
albucore==0.0.23
# via
# albumentations
# unstructured-paddleocr
albumentations==2.0.5
# via unstructured-paddleocr
annotated-types==0.7.0
# via pydantic
anyio==4.9.0
# via
# -c ./base.txt
# httpx
astor==0.8.1
# via paddlepaddle
beautifulsoup4==4.13.3
# via
# -c ./base.txt
# unstructured-paddleocr
certifi==2025.1.31
# via
# -c ./base.txt
@ -20,20 +32,20 @@ charset-normalizer==3.4.1
# via
# -c ./base.txt
# requests
contourpy==1.3.0
# via matplotlib
cycler==0.12.1
# via matplotlib
cython==3.0.12
# via unstructured-paddleocr
decorator==5.2.1
# via paddlepaddle
eval-type-backport==0.2.2
# via albumentations
exceptiongroup==1.2.2
# via
# -c ./base.txt
# anyio
fire==0.7.0
# via unstructured-paddleocr
fonttools==4.56.0
# via matplotlib
# via unstructured-paddleocr
h11==0.14.0
# via
# -c ./base.txt
@ -53,32 +65,26 @@ idna==3.10
# httpx
# requests
imageio==2.37.0
# via
# imgaug
# scikit-image
imgaug==0.4.0
# via unstructured-paddleocr
importlib-resources==6.5.2
# via matplotlib
kiwisolver==1.4.7
# via matplotlib
# via scikit-image
lazy-loader==0.4
# via scikit-image
matplotlib==3.9.4
# via imgaug
lxml==5.3.1
# via
# -c ./base.txt
# python-docx
networkx==3.2.1
# via
# paddlepaddle
# scikit-image
numpy==1.26.4
numpy==2.0.2
# via
# -c ./base.txt
# contourpy
# albucore
# albumentations
# imageio
# imgaug
# matplotlib
# opencv-contrib-python
# opencv-python
# opencv-python-headless
# opt-einsum
# paddlepaddle
# scikit-image
@ -89,44 +95,42 @@ numpy==1.26.4
opencv-contrib-python==4.11.0.86
# via unstructured-paddleocr
opencv-python==4.11.0.86
# via unstructured-paddleocr
opencv-python-headless==4.11.0.86
# via
# imgaug
# unstructured-paddleocr
# albucore
# albumentations
opt-einsum==3.3.0
# via paddlepaddle
packaging==24.2
# via
# -c ./base.txt
# lazy-loader
# matplotlib
# scikit-image
paddlepaddle==3.0.0b1
paddlepaddle==3.0.0rc1
# via -r ./extra-paddleocr.in
pdf2image==1.17.0
# via unstructured-paddleocr
pillow==11.1.0
# via
# imageio
# imgaug
# matplotlib
# paddlepaddle
# pdf2image
# scikit-image
# unstructured-paddleocr
protobuf==6.30.0
protobuf==6.30.1
# via
# -c ././deps/constraints.txt
# paddlepaddle
pyclipper==1.3.0.post6
# via unstructured-paddleocr
pyparsing==3.2.1
# via matplotlib
python-dateutil==2.9.0.post0
# via
# -c ./base.txt
# matplotlib
pyyaml==6.0.2
pydantic==2.10.6
# via albumentations
pydantic-core==2.27.2
# via pydantic
python-docx==1.1.2
# via unstructured-paddleocr
pyyaml==6.0.2
# via
# albumentations
# unstructured-paddleocr
rapidfuzz==3.12.2
# via
# -c ./base.txt
@ -136,26 +140,27 @@ requests==2.32.3
# -c ./base.txt
# unstructured-paddleocr
scikit-image==0.24.0
# via
# imgaug
# unstructured-paddleocr
# via unstructured-paddleocr
scipy==1.13.1
# via
# imgaug
# albumentations
# scikit-image
shapely==2.0.7
# via
# imgaug
# unstructured-paddleocr
six==1.17.0
# via
# -c ./base.txt
# imgaug
# python-dateutil
# via unstructured-paddleocr
simsimd==6.2.1
# via albucore
sniffio==1.3.1
# via
# -c ./base.txt
# anyio
soupsieve==2.6
# via
# -c ./base.txt
# beautifulsoup4
stringzilla==3.12.3
# via albucore
termcolor==2.5.0
# via fire
tifffile==2024.8.30
# via scikit-image
tqdm==4.67.1
@ -165,14 +170,18 @@ tqdm==4.67.1
typing-extensions==4.12.2
# via
# -c ./base.txt
# albucore
# albumentations
# anyio
# beautifulsoup4
# paddlepaddle
unstructured-paddleocr==2.8.1.0
# pydantic
# pydantic-core
# python-docx
unstructured-paddleocr==2.10.0
# via -r ./extra-paddleocr.in
urllib3==1.26.20
# via
# -c ././deps/constraints.txt
# -c ./base.txt
# requests
zipp==3.21.0
# via importlib-resources

View File

@ -1,7 +1,8 @@
-c ./deps/constraints.txt
-c base.txt
onnx
onnx>=1.17.0
onnxruntime>=1.19.0
pdf2image
pdfminer.six
pikepdf
@ -11,5 +12,5 @@ google-cloud-vision
effdet
# Do not move to constraints.in, otherwise unstructured-inference will not be upgraded
# when unstructured library is.
unstructured-inference>=0.8.9
unstructured-inference>=0.8.10
unstructured.pytesseract>=0.3.12

View File

@ -35,7 +35,7 @@ deprecated==1.2.18
# via pikepdf
effdet==0.4.1
# via -r ./extra-pdf-image.in
filelock==3.17.0
filelock==3.18.0
# via
# huggingface-hub
# torch
@ -44,30 +44,30 @@ flatbuffers==25.2.10
# via onnxruntime
fonttools==4.56.0
# via matplotlib
fsspec==2025.2.0
fsspec==2025.3.0
# via
# huggingface-hub
# torch
google-api-core[grpc]==2.8.0
google-api-core[grpc]==2.24.2
# via google-cloud-vision
google-auth==2.38.0
# via
# google-api-core
# google-cloud-vision
google-cloud-vision==2.7.2
google-cloud-vision==3.10.1
# via -r ./extra-pdf-image.in
googleapis-common-protos==1.56.1
googleapis-common-protos==1.69.2
# via
# google-api-core
# grpcio-status
grpcio==1.70.0
grpcio==1.71.0
# via
# -c ././deps/constraints.txt
# google-api-core
# grpcio-status
grpcio-status==1.62.3
# via google-api-core
huggingface-hub==0.29.2
huggingface-hub==0.29.3
# via
# timm
# tokenizers
@ -99,7 +99,7 @@ mpmath==1.3.0
# via sympy
networkx==3.2.1
# via torch
numpy==1.26.4
numpy==2.0.2
# via
# -c ./base.txt
# contourpy
@ -120,7 +120,9 @@ onnx==1.17.0
# -r ./extra-pdf-image.in
# unstructured-inference
onnxruntime==1.19.2
# via unstructured-inference
# via
# -r ./extra-pdf-image.in
# unstructured-inference
opencv-python==4.11.0.86
# via unstructured-inference
packaging==24.2
@ -140,7 +142,7 @@ pdfminer-six==20240706
# via
# -r ./extra-pdf-image.in
# unstructured-inference
pi-heif==0.21.0
pi-heif==0.22.0
# via -r ./extra-pdf-image.in
pikepdf==9.5.2
# via -r ./extra-pdf-image.in
@ -152,12 +154,15 @@ pillow==11.1.0
# pikepdf
# torchvision
# unstructured-pytesseract
proto-plus==1.20.4
# via google-cloud-vision
protobuf==6.30.0
proto-plus==1.26.1
# via
# google-api-core
# google-cloud-vision
protobuf==6.30.1
# via
# -c ././deps/constraints.txt
# google-api-core
# google-cloud-vision
# googleapis-common-protos
# grpcio-status
# onnx
@ -177,7 +182,7 @@ pycparser==2.22
# cffi
pyparsing==3.2.1
# via matplotlib
pypdf==5.3.1
pypdf==5.4.0
# via
# -c ./base.txt
# -r ./extra-pdf-image.in
@ -232,7 +237,7 @@ timm==1.0.15
# via
# effdet
# unstructured-inference
tokenizers==0.21.0
tokenizers==0.21.1
# via
# -c ././deps/constraints.txt
# transformers
@ -261,7 +266,7 @@ typing-extensions==4.12.2
# torch
tzdata==2025.1
# via pandas
unstructured-inference==0.8.9
unstructured-inference==0.8.10
# via -r ./extra-pdf-image.in
unstructured-pytesseract==0.3.15
# via -r ./extra-pdf-image.in

View File

@ -8,7 +8,7 @@ et-xmlfile==2.0.0
# via openpyxl
networkx==3.2.1
# via -r ./extra-xlsx.in
numpy==1.26.4
numpy==2.0.2
# via
# -c ./base.txt
# pandas

View File

@ -16,16 +16,16 @@ click==8.1.8
# via
# -c ./base.txt
# sacremoses
filelock==3.17.0
filelock==3.18.0
# via
# huggingface-hub
# torch
# transformers
fsspec==2025.2.0
fsspec==2025.3.0
# via
# huggingface-hub
# torch
huggingface-hub==0.29.2
huggingface-hub==0.29.3
# via
# tokenizers
# transformers
@ -49,7 +49,7 @@ mpmath==1.3.0
# via sympy
networkx==3.2.1
# via torch
numpy==1.26.4
numpy==2.0.2
# via
# -c ./base.txt
# transformers
@ -84,7 +84,7 @@ six==1.17.0
# langdetect
sympy==1.13.1
# via torch
tokenizers==0.21.0
tokenizers==0.21.1
# via
# -c ././deps/constraints.txt
# transformers

View File

@ -6,24 +6,18 @@
#
annotated-types==0.7.0
# via pydantic
anyio==4.8.0
anyio==4.9.0
# via
# -c ./base.txt
# httpx
appdirs==1.4.4
# via label-studio-sdk
argcomplete==3.6.0
# via datamodel-code-generator
attrs==25.1.0
# via
# jsonschema
# referencing
attrs==25.3.0
# via jsonschema
autoflake==2.3.1
# via -r ./test.in
black==25.1.0
# via
# -r ./test.in
# datamodel-code-generator
# via -r ./test.in
certifi==2025.1.31
# via
# -c ./base.txt
@ -39,23 +33,15 @@ click==8.1.8
# -c ./base.txt
# black
# nltk
coverage[toml]==7.6.12
coverage[toml]==7.7.0
# via
# -r ./test.in
# pytest-cov
datamodel-code-generator==0.26.1
# via label-studio-sdk
dnspython==2.7.0
# via email-validator
email-validator==2.2.0
# via pydantic
exceptiongroup==1.2.2
# via
# -c ./base.txt
# anyio
# pytest
faker==36.2.2
# via jsf
flake8==7.1.2
# via
# -r ./test.in
@ -64,9 +50,7 @@ flake8-print==5.0.0
# via -r ./test.in
freezegun==1.5.1
# via -r ./test.in
genson==1.3.0
# via datamodel-code-generator
grpcio==1.70.0
grpcio==1.71.0
# via
# -c ././deps/constraints.txt
# -r ./test.in
@ -86,33 +70,20 @@ idna==3.10
# via
# -c ./base.txt
# anyio
# email-validator
# httpx
# requests
# yarl
ijson==3.3.0
# via label-studio-sdk
inflect==5.6.2
# via datamodel-code-generator
iniconfig==2.0.0
# via pytest
isort==5.13.2
# via datamodel-code-generator
jinja2==3.1.6
# via datamodel-code-generator
joblib==1.4.2
# via
# -c ./base.txt
# nltk
jsf==0.11.2
jsonschema==3.2.0
# via label-studio-sdk
jsonschema==4.23.0
# via
# jsf
# label-studio-sdk
jsonschema-specifications==2024.10.1
# via jsonschema
label-studio-sdk==1.0.10
label-studio-sdk==1.0.5
# via -r ./test.in
liccheck==0.9.2
# via -r ./test.in
@ -120,11 +91,9 @@ lxml==5.3.1
# via
# -c ./base.txt
# label-studio-sdk
markupsafe==3.0.2
# via jinja2
mccabe==0.7.0
# via flake8
multidict==6.1.0
multidict==6.2.0
# via yarl
mypy==1.15.0
# via -r ./test.in
@ -137,16 +106,14 @@ nltk==3.9.1
# via
# -c ./base.txt
# label-studio-sdk
numpy==1.26.4
numpy==2.0.2
# via
# -c ./base.txt
# label-studio-sdk
# pandas
packaging==24.2
# via
# -c ./base.txt
# black
# datamodel-code-generator
# pytest
pandas==2.2.3
# via label-studio-sdk
@ -164,20 +131,18 @@ pycodestyle==2.12.1
# via
# flake8
# flake8-print
pydantic[email]==2.10.6
pydantic==2.10.6
# via
# -r ./test.in
# datamodel-code-generator
# jsf
# label-studio-sdk
pydantic-core==2.27.2
# via
# label-studio-sdk
# pydantic
# via pydantic
pyflakes==3.2.0
# via
# autoflake
# flake8
pyrsistent==0.20.0
# via jsonschema
pytest==8.3.5
# via
# pytest-cov
@ -194,13 +159,7 @@ python-dateutil==2.9.0.post0
pytz==2025.1
# via pandas
pyyaml==6.0.2
# via
# datamodel-code-generator
# vcrpy
referencing==0.36.2
# via
# jsonschema
# jsonschema-specifications
# via vcrpy
regex==2024.11.6
# via
# -c ./base.txt
@ -210,33 +169,23 @@ requests==2.32.3
# -c ./base.txt
# label-studio-sdk
# requests-mock
# smart-open
requests-mock==1.12.1
# via label-studio-sdk
rpds-py==0.23.1
# via
# jsonschema
# referencing
rstr==3.2.2
# via jsf
ruff==0.9.9
ruff==0.11.0
# via -r ./test.in
semantic-version==2.10.0
# via liccheck
six==1.17.0
# via
# -c ./base.txt
# jsonschema
# python-dateutil
smart-open[http]==7.1.0
# via jsf
sniffio==1.3.1
# via
# -c ./base.txt
# anyio
toml==0.10.2
# via
# datamodel-code-generator
# liccheck
# via liccheck
tomli==2.2.1
# via
# autoflake
@ -263,17 +212,13 @@ typing-extensions==4.12.2
# -c ./base.txt
# anyio
# black
# jsf
# label-studio-sdk
# multidict
# mypy
# pydantic
# pydantic-core
# referencing
tzdata==2025.1
# via
# faker
# pandas
# via pandas
ujson==5.10.0
# via label-studio-sdk
urllib3==1.26.20
@ -287,9 +232,11 @@ vcrpy==7.0.0
wrapt==1.17.2
# via
# -c ./base.txt
# smart-open
# vcrpy
xmljson==0.2.1
# via label-studio-sdk
yarl==1.18.3
# via vcrpy
# The following packages are considered to be unsafe in a requirements file:
# setuptools

View File

@ -1052,7 +1052,23 @@ def test_auto_partition_respects_detect_language_per_element_arg():
@pytest.mark.parametrize(
"file_extension", "doc docx eml epub html md odt org ppt pptx rst rtf txt xml".split()
"file_extension",
[
"doc",
"docx",
"eml",
"epub",
"html",
"md",
"odt",
"org",
"ppt",
"pptx",
"rst",
"rtf",
"txt",
"xml",
],
)
def test_auto_partition_respects_language_arg(file_extension: str):
elements = partition(

View File

@ -1 +1 @@
__version__ = "0.17.1-dev1" # pragma: no cover
__version__ = "0.17.1" # pragma: no cover