From 7de630e45e1830fa1b22979258a64ebe5ff511bd Mon Sep 17 00:00:00 2001 From: Yao You Date: Tue, 18 Mar 2025 16:33:48 -0500 Subject: [PATCH] Feat/bump numpy to 2 (#3961) This PR updates a few dependencies so that they are compatible with `numpy>=2`. --- CHANGELOG.md | 5 +- requirements/base.in | 4 +- requirements/base.txt | 8 +- requirements/dev.txt | 4 +- requirements/extra-csv.txt | 2 +- requirements/extra-paddleocr.in | 4 +- requirements/extra-paddleocr.txt | 115 ++++++++++++----------- requirements/extra-pdf-image.in | 5 +- requirements/extra-pdf-image.txt | 37 ++++---- requirements/extra-xlsx.txt | 2 +- requirements/huggingface.txt | 10 +- requirements/test.txt | 97 +++++-------------- test_unstructured/partition/test_auto.py | 18 +++- unstructured/__version__.py | 2 +- 14 files changed, 146 insertions(+), 167 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index cbadb01de..bff09cf81 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,10 +1,13 @@ -## 0.17.1-dev1 +## 0.17.1 ### Enhancements - **Add image_url of images in html partitioner** `<img>` tags with non-data content include a new image_url metadata field with the content of the src attribute. + - **Use `lxml` instead of `bs4` to parse hOCR data.** `lxml` is much faster than `bs4` given the hOCR data format is regular (garanteed because it is programatically generated) +- **bump `numpy` to `>=2`**. Also upgrade `paddlepaddle`, `unstructured-paddleocr`, and `onnx` so they are compatible with `numpy>=2`. 
+ ### Features ### Fixes diff --git a/requirements/base.in b/requirements/base.in index cc2b27d8a..320a77226 100644 --- a/requirements/base.in +++ b/requirements/base.in @@ -10,9 +10,7 @@ emoji dataclasses-json python-iso639 langdetect -# NOTE(robinson) - numpy pin is because ONNX model weights are only compatible -# with numpy 1.x.x -numpy<2 +numpy rapidfuzz backoff typing-extensions diff --git a/requirements/base.txt b/requirements/base.txt index a29c4a1a3..17a25c4d4 100644 --- a/requirements/base.txt +++ b/requirements/base.txt @@ -4,7 +4,7 @@ # # pip-compile ./base.in # -anyio==4.8.0 +anyio==4.9.0 # via httpx backoff==2.2.1 # via -r ./base.in @@ -34,7 +34,7 @@ dataclasses-json==0.6.7 # via # -r ./base.in # unstructured-client -deepdiff==8.3.0 +deepdiff==8.4.2 # via unstructured-client emoji==2.14.1 # via -r ./base.in @@ -76,7 +76,7 @@ nest-asyncio==1.6.0 # via unstructured-client nltk==3.9.1 # via -r ./base.in -numpy==1.26.4 +numpy==2.0.2 # via -r ./base.in olefile==0.47 # via python-oxmsg @@ -90,7 +90,7 @@ psutil==7.0.0 # via -r ./base.in pycparser==2.22 # via cffi -pypdf==5.3.1 +pypdf==5.4.0 # via unstructured-client python-dateutil==2.9.0.post0 # via unstructured-client diff --git a/requirements/dev.txt b/requirements/dev.txt index ecc41e978..0de6c4eb0 100644 --- a/requirements/dev.txt +++ b/requirements/dev.txt @@ -15,9 +15,9 @@ click==8.1.8 # pip-tools distlib==0.3.9 # via virtualenv -filelock==3.17.0 +filelock==3.18.0 # via virtualenv -identify==2.6.8 +identify==2.6.9 # via pre-commit importlib-metadata==8.6.1 # via diff --git a/requirements/extra-csv.txt b/requirements/extra-csv.txt index cb7c7efc5..a5779f0a8 100644 --- a/requirements/extra-csv.txt +++ b/requirements/extra-csv.txt @@ -4,7 +4,7 @@ # # pip-compile ./extra-csv.in # -numpy==1.26.4 +numpy==2.0.2 # via # -c ./base.txt # pandas diff --git a/requirements/extra-paddleocr.in b/requirements/extra-paddleocr.in index b1cf3ee2e..ec0c3bf32 100644 --- a/requirements/extra-paddleocr.in +++ 
b/requirements/extra-paddleocr.in @@ -1,5 +1,5 @@ -c ./deps/constraints.txt -c base.txt -paddlepaddle==3.0.0b1 -unstructured.paddleocr==2.8.1.0 +paddlepaddle>=3.0.0b1 +unstructured.paddleocr==2.10.0 diff --git a/requirements/extra-paddleocr.txt b/requirements/extra-paddleocr.txt index 70bdb7b72..a5264d784 100644 --- a/requirements/extra-paddleocr.txt +++ b/requirements/extra-paddleocr.txt @@ -4,12 +4,24 @@ # # pip-compile ./extra-paddleocr.in # -anyio==4.8.0 +albucore==0.0.23 + # via + # albumentations + # unstructured-paddleocr +albumentations==2.0.5 + # via unstructured-paddleocr +annotated-types==0.7.0 + # via pydantic +anyio==4.9.0 # via # -c ./base.txt # httpx astor==0.8.1 # via paddlepaddle +beautifulsoup4==4.13.3 + # via + # -c ./base.txt + # unstructured-paddleocr certifi==2025.1.31 # via # -c ./base.txt @@ -20,20 +32,20 @@ charset-normalizer==3.4.1 # via # -c ./base.txt # requests -contourpy==1.3.0 - # via matplotlib -cycler==0.12.1 - # via matplotlib cython==3.0.12 # via unstructured-paddleocr decorator==5.2.1 # via paddlepaddle +eval-type-backport==0.2.2 + # via albumentations exceptiongroup==1.2.2 # via # -c ./base.txt # anyio +fire==0.7.0 + # via unstructured-paddleocr fonttools==4.56.0 - # via matplotlib + # via unstructured-paddleocr h11==0.14.0 # via # -c ./base.txt @@ -53,32 +65,26 @@ idna==3.10 # httpx # requests imageio==2.37.0 - # via - # imgaug - # scikit-image -imgaug==0.4.0 - # via unstructured-paddleocr -importlib-resources==6.5.2 - # via matplotlib -kiwisolver==1.4.7 - # via matplotlib + # via scikit-image lazy-loader==0.4 # via scikit-image -matplotlib==3.9.4 - # via imgaug +lxml==5.3.1 + # via + # -c ./base.txt + # python-docx networkx==3.2.1 # via # paddlepaddle # scikit-image -numpy==1.26.4 +numpy==2.0.2 # via # -c ./base.txt - # contourpy + # albucore + # albumentations # imageio - # imgaug - # matplotlib # opencv-contrib-python # opencv-python + # opencv-python-headless # opt-einsum # paddlepaddle # scikit-image @@ -89,44 +95,42 @@ 
numpy==1.26.4 opencv-contrib-python==4.11.0.86 # via unstructured-paddleocr opencv-python==4.11.0.86 + # via unstructured-paddleocr +opencv-python-headless==4.11.0.86 # via - # imgaug - # unstructured-paddleocr + # albucore + # albumentations opt-einsum==3.3.0 # via paddlepaddle packaging==24.2 # via # -c ./base.txt # lazy-loader - # matplotlib # scikit-image -paddlepaddle==3.0.0b1 +paddlepaddle==3.0.0rc1 # via -r ./extra-paddleocr.in -pdf2image==1.17.0 - # via unstructured-paddleocr pillow==11.1.0 # via # imageio - # imgaug - # matplotlib # paddlepaddle - # pdf2image # scikit-image # unstructured-paddleocr -protobuf==6.30.0 +protobuf==6.30.1 # via # -c ././deps/constraints.txt # paddlepaddle pyclipper==1.3.0.post6 # via unstructured-paddleocr -pyparsing==3.2.1 - # via matplotlib -python-dateutil==2.9.0.post0 - # via - # -c ./base.txt - # matplotlib -pyyaml==6.0.2 +pydantic==2.10.6 + # via albumentations +pydantic-core==2.27.2 + # via pydantic +python-docx==1.1.2 # via unstructured-paddleocr +pyyaml==6.0.2 + # via + # albumentations + # unstructured-paddleocr rapidfuzz==3.12.2 # via # -c ./base.txt @@ -136,26 +140,27 @@ requests==2.32.3 # -c ./base.txt # unstructured-paddleocr scikit-image==0.24.0 - # via - # imgaug - # unstructured-paddleocr + # via unstructured-paddleocr scipy==1.13.1 # via - # imgaug + # albumentations # scikit-image shapely==2.0.7 - # via - # imgaug - # unstructured-paddleocr -six==1.17.0 - # via - # -c ./base.txt - # imgaug - # python-dateutil + # via unstructured-paddleocr +simsimd==6.2.1 + # via albucore sniffio==1.3.1 # via # -c ./base.txt # anyio +soupsieve==2.6 + # via + # -c ./base.txt + # beautifulsoup4 +stringzilla==3.12.3 + # via albucore +termcolor==2.5.0 + # via fire tifffile==2024.8.30 # via scikit-image tqdm==4.67.1 @@ -165,14 +170,18 @@ tqdm==4.67.1 typing-extensions==4.12.2 # via # -c ./base.txt + # albucore + # albumentations # anyio + # beautifulsoup4 # paddlepaddle -unstructured-paddleocr==2.8.1.0 + # pydantic + # 
pydantic-core + # python-docx +unstructured-paddleocr==2.10.0 # via -r ./extra-paddleocr.in urllib3==1.26.20 # via # -c ././deps/constraints.txt # -c ./base.txt # requests -zipp==3.21.0 - # via importlib-resources diff --git a/requirements/extra-pdf-image.in b/requirements/extra-pdf-image.in index 332ca01b6..4f3aef930 100644 --- a/requirements/extra-pdf-image.in +++ b/requirements/extra-pdf-image.in @@ -1,7 +1,8 @@ -c ./deps/constraints.txt -c base.txt -onnx +onnx>=1.17.0 +onnxruntime>=1.19.0 pdf2image pdfminer.six pikepdf @@ -11,5 +12,5 @@ google-cloud-vision effdet # Do not move to constraints.in, otherwise unstructured-inference will not be upgraded # when unstructured library is. -unstructured-inference>=0.8.9 +unstructured-inference>=0.8.10 unstructured.pytesseract>=0.3.12 diff --git a/requirements/extra-pdf-image.txt b/requirements/extra-pdf-image.txt index 3e0561c16..0226cee3e 100644 --- a/requirements/extra-pdf-image.txt +++ b/requirements/extra-pdf-image.txt @@ -35,7 +35,7 @@ deprecated==1.2.18 # via pikepdf effdet==0.4.1 # via -r ./extra-pdf-image.in -filelock==3.17.0 +filelock==3.18.0 # via # huggingface-hub # torch @@ -44,30 +44,30 @@ flatbuffers==25.2.10 # via onnxruntime fonttools==4.56.0 # via matplotlib -fsspec==2025.2.0 +fsspec==2025.3.0 # via # huggingface-hub # torch -google-api-core[grpc]==2.8.0 +google-api-core[grpc]==2.24.2 # via google-cloud-vision google-auth==2.38.0 # via # google-api-core # google-cloud-vision -google-cloud-vision==2.7.2 +google-cloud-vision==3.10.1 # via -r ./extra-pdf-image.in -googleapis-common-protos==1.56.1 +googleapis-common-protos==1.69.2 # via # google-api-core # grpcio-status -grpcio==1.70.0 +grpcio==1.71.0 # via # -c ././deps/constraints.txt # google-api-core # grpcio-status grpcio-status==1.62.3 # via google-api-core -huggingface-hub==0.29.2 +huggingface-hub==0.29.3 # via # timm # tokenizers @@ -99,7 +99,7 @@ mpmath==1.3.0 # via sympy networkx==3.2.1 # via torch -numpy==1.26.4 +numpy==2.0.2 # via # -c ./base.txt 
# contourpy @@ -120,7 +120,9 @@ onnx==1.17.0 # -r ./extra-pdf-image.in # unstructured-inference onnxruntime==1.19.2 - # via unstructured-inference + # via + # -r ./extra-pdf-image.in + # unstructured-inference opencv-python==4.11.0.86 # via unstructured-inference packaging==24.2 @@ -140,7 +142,7 @@ pdfminer-six==20240706 # via # -r ./extra-pdf-image.in # unstructured-inference -pi-heif==0.21.0 +pi-heif==0.22.0 # via -r ./extra-pdf-image.in pikepdf==9.5.2 # via -r ./extra-pdf-image.in @@ -152,12 +154,15 @@ pillow==11.1.0 # pikepdf # torchvision # unstructured-pytesseract -proto-plus==1.20.4 - # via google-cloud-vision -protobuf==6.30.0 +proto-plus==1.26.1 + # via + # google-api-core + # google-cloud-vision +protobuf==6.30.1 # via # -c ././deps/constraints.txt # google-api-core + # google-cloud-vision # googleapis-common-protos # grpcio-status # onnx @@ -177,7 +182,7 @@ pycparser==2.22 # cffi pyparsing==3.2.1 # via matplotlib -pypdf==5.3.1 +pypdf==5.4.0 # via # -c ./base.txt # -r ./extra-pdf-image.in @@ -232,7 +237,7 @@ timm==1.0.15 # via # effdet # unstructured-inference -tokenizers==0.21.0 +tokenizers==0.21.1 # via # -c ././deps/constraints.txt # transformers @@ -261,7 +266,7 @@ typing-extensions==4.12.2 # torch tzdata==2025.1 # via pandas -unstructured-inference==0.8.9 +unstructured-inference==0.8.10 # via -r ./extra-pdf-image.in unstructured-pytesseract==0.3.15 # via -r ./extra-pdf-image.in diff --git a/requirements/extra-xlsx.txt b/requirements/extra-xlsx.txt index cc1bda37c..895935708 100644 --- a/requirements/extra-xlsx.txt +++ b/requirements/extra-xlsx.txt @@ -8,7 +8,7 @@ et-xmlfile==2.0.0 # via openpyxl networkx==3.2.1 # via -r ./extra-xlsx.in -numpy==1.26.4 +numpy==2.0.2 # via # -c ./base.txt # pandas diff --git a/requirements/huggingface.txt b/requirements/huggingface.txt index 68b3956c0..829a0448d 100644 --- a/requirements/huggingface.txt +++ b/requirements/huggingface.txt @@ -16,16 +16,16 @@ click==8.1.8 # via # -c ./base.txt # sacremoses 
-filelock==3.17.0 +filelock==3.18.0 # via # huggingface-hub # torch # transformers -fsspec==2025.2.0 +fsspec==2025.3.0 # via # huggingface-hub # torch -huggingface-hub==0.29.2 +huggingface-hub==0.29.3 # via # tokenizers # transformers @@ -49,7 +49,7 @@ mpmath==1.3.0 # via sympy networkx==3.2.1 # via torch -numpy==1.26.4 +numpy==2.0.2 # via # -c ./base.txt # transformers @@ -84,7 +84,7 @@ six==1.17.0 # langdetect sympy==1.13.1 # via torch -tokenizers==0.21.0 +tokenizers==0.21.1 # via # -c ././deps/constraints.txt # transformers diff --git a/requirements/test.txt b/requirements/test.txt index 9853be184..b64b5d52f 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -6,24 +6,18 @@ # annotated-types==0.7.0 # via pydantic -anyio==4.8.0 +anyio==4.9.0 # via # -c ./base.txt # httpx appdirs==1.4.4 # via label-studio-sdk -argcomplete==3.6.0 - # via datamodel-code-generator -attrs==25.1.0 - # via - # jsonschema - # referencing +attrs==25.3.0 + # via jsonschema autoflake==2.3.1 # via -r ./test.in black==25.1.0 - # via - # -r ./test.in - # datamodel-code-generator + # via -r ./test.in certifi==2025.1.31 # via # -c ./base.txt @@ -39,23 +33,15 @@ click==8.1.8 # -c ./base.txt # black # nltk -coverage[toml]==7.6.12 +coverage[toml]==7.7.0 # via # -r ./test.in # pytest-cov -datamodel-code-generator==0.26.1 - # via label-studio-sdk -dnspython==2.7.0 - # via email-validator -email-validator==2.2.0 - # via pydantic exceptiongroup==1.2.2 # via # -c ./base.txt # anyio # pytest -faker==36.2.2 - # via jsf flake8==7.1.2 # via # -r ./test.in @@ -64,9 +50,7 @@ flake8-print==5.0.0 # via -r ./test.in freezegun==1.5.1 # via -r ./test.in -genson==1.3.0 - # via datamodel-code-generator -grpcio==1.70.0 +grpcio==1.71.0 # via # -c ././deps/constraints.txt # -r ./test.in @@ -86,33 +70,20 @@ idna==3.10 # via # -c ./base.txt # anyio - # email-validator # httpx # requests # yarl ijson==3.3.0 # via label-studio-sdk -inflect==5.6.2 - # via datamodel-code-generator iniconfig==2.0.0 # via pytest 
-isort==5.13.2 - # via datamodel-code-generator -jinja2==3.1.6 - # via datamodel-code-generator joblib==1.4.2 # via # -c ./base.txt # nltk -jsf==0.11.2 +jsonschema==3.2.0 # via label-studio-sdk -jsonschema==4.23.0 - # via - # jsf - # label-studio-sdk -jsonschema-specifications==2024.10.1 - # via jsonschema -label-studio-sdk==1.0.10 +label-studio-sdk==1.0.5 # via -r ./test.in liccheck==0.9.2 # via -r ./test.in @@ -120,11 +91,9 @@ lxml==5.3.1 # via # -c ./base.txt # label-studio-sdk -markupsafe==3.0.2 - # via jinja2 mccabe==0.7.0 # via flake8 -multidict==6.1.0 +multidict==6.2.0 # via yarl mypy==1.15.0 # via -r ./test.in @@ -137,16 +106,14 @@ nltk==3.9.1 # via # -c ./base.txt # label-studio-sdk -numpy==1.26.4 +numpy==2.0.2 # via # -c ./base.txt - # label-studio-sdk # pandas packaging==24.2 # via # -c ./base.txt # black - # datamodel-code-generator # pytest pandas==2.2.3 # via label-studio-sdk @@ -164,20 +131,18 @@ pycodestyle==2.12.1 # via # flake8 # flake8-print -pydantic[email]==2.10.6 +pydantic==2.10.6 # via # -r ./test.in - # datamodel-code-generator - # jsf # label-studio-sdk pydantic-core==2.27.2 - # via - # label-studio-sdk - # pydantic + # via pydantic pyflakes==3.2.0 # via # autoflake # flake8 +pyrsistent==0.20.0 + # via jsonschema pytest==8.3.5 # via # pytest-cov @@ -194,13 +159,7 @@ python-dateutil==2.9.0.post0 pytz==2025.1 # via pandas pyyaml==6.0.2 - # via - # datamodel-code-generator - # vcrpy -referencing==0.36.2 - # via - # jsonschema - # jsonschema-specifications + # via vcrpy regex==2024.11.6 # via # -c ./base.txt @@ -210,33 +169,23 @@ requests==2.32.3 # -c ./base.txt # label-studio-sdk # requests-mock - # smart-open requests-mock==1.12.1 # via label-studio-sdk -rpds-py==0.23.1 - # via - # jsonschema - # referencing -rstr==3.2.2 - # via jsf -ruff==0.9.9 +ruff==0.11.0 # via -r ./test.in semantic-version==2.10.0 # via liccheck six==1.17.0 # via # -c ./base.txt + # jsonschema # python-dateutil -smart-open[http]==7.1.0 - # via jsf sniffio==1.3.1 # via # 
-c ./base.txt # anyio toml==0.10.2 - # via - # datamodel-code-generator - # liccheck + # via liccheck tomli==2.2.1 # via # autoflake @@ -263,17 +212,13 @@ typing-extensions==4.12.2 # -c ./base.txt # anyio # black - # jsf # label-studio-sdk # multidict # mypy # pydantic # pydantic-core - # referencing tzdata==2025.1 - # via - # faker - # pandas + # via pandas ujson==5.10.0 # via label-studio-sdk urllib3==1.26.20 @@ -287,9 +232,11 @@ vcrpy==7.0.0 wrapt==1.17.2 # via # -c ./base.txt - # smart-open # vcrpy xmljson==0.2.1 # via label-studio-sdk yarl==1.18.3 # via vcrpy + +# The following packages are considered to be unsafe in a requirements file: +# setuptools diff --git a/test_unstructured/partition/test_auto.py b/test_unstructured/partition/test_auto.py index f29f600b4..27701fcf8 100644 --- a/test_unstructured/partition/test_auto.py +++ b/test_unstructured/partition/test_auto.py @@ -1052,7 +1052,23 @@ def test_auto_partition_respects_detect_language_per_element_arg(): @pytest.mark.parametrize( - "file_extension", "doc docx eml epub html md odt org ppt pptx rst rtf txt xml".split() + "file_extension", + [ + "doc", + "docx", + "eml", + "epub", + "html", + "md", + "odt", + "org", + "ppt", + "pptx", + "rst", + "rtf", + "txt", + "xml", + ], ) def test_auto_partition_respects_language_arg(file_extension: str): elements = partition( diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 0389ef24a..80fa78aa7 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.17.1-dev1" # pragma: no cover +__version__ = "0.17.1" # pragma: no cover