diff --git a/CHANGELOG.md b/CHANGELOG.md index 1c4896e5d..549389ee3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,13 @@ +## 0.16.22 + +### Enhancements + +### Features + +### Fixes + +- **Fix open CVEs and bump dependencies** + ## 0.16.21 ### Enhancements diff --git a/requirements/Makefile b/requirements/Makefile index 9e6b685fc..acb046152 100644 --- a/requirements/Makefile +++ b/requirements/Makefile @@ -27,4 +27,4 @@ clean: clean-base .PHONY: clean-base clean-base: - rm $(BASE_REQUIREMENTSTXT) \ No newline at end of file + rm $(BASE_REQUIREMENTSTXT) diff --git a/requirements/base.txt b/requirements/base.txt index 9ec5c2d33..1b6194274 100644 --- a/requirements/base.txt +++ b/requirements/base.txt @@ -2,15 +2,15 @@ # This file is autogenerated by pip-compile with Python 3.9 # by the following command: # -# pip-compile ./base.in +# pip-compile base.in # anyio==4.8.0 # via httpx backoff==2.2.1 - # via -r ./base.in -beautifulsoup4==4.12.3 - # via -r ./base.in -certifi==2024.12.14 + # via -r base.in +beautifulsoup4==4.13.3 + # via -r base.in +certifi==2025.1.31 # via # httpcore # httpx @@ -19,7 +19,7 @@ certifi==2024.12.14 cffi==1.17.1 # via cryptography chardet==5.2.0 - # via -r ./base.in + # via -r base.in charset-normalizer==3.4.1 # via # requests @@ -28,24 +28,24 @@ click==8.1.8 # via # nltk # python-oxmsg -cryptography==44.0.0 +cryptography==44.0.1 # via unstructured-client dataclasses-json==0.6.7 # via - # -r ./base.in + # -r base.in # unstructured-client -deepdiff==8.1.1 +deepdiff==8.2.0 # via unstructured-client emoji==2.14.1 - # via -r ./base.in + # via -r base.in exceptiongroup==1.2.2 # via anyio filetype==1.2.0 - # via -r ./base.in + # via -r base.in h11==0.14.0 # via httpcore html5lib==1.1 - # via -r ./base.in + # via -r base.in httpcore==1.0.7 # via httpx httpx==0.28.1 @@ -61,10 +61,10 @@ joblib==1.4.2 jsonpath-python==1.0.6 # via unstructured-client langdetect==1.0.9 - # via -r ./base.in -lxml==5.3.0 - # via -r ./base.in -marshmallow==3.26.0 + 
# via -r base.in +lxml==5.3.1 + # via -r base.in +marshmallow==3.26.1 # via # dataclasses-json # unstructured-client @@ -75,38 +75,38 @@ mypy-extensions==1.0.0 nest-asyncio==1.6.0 # via unstructured-client nltk==3.9.1 - # via -r ./base.in + # via -r base.in numpy==1.26.4 - # via -r ./base.in + # via -r base.in olefile==0.47 # via python-oxmsg -orderly-set==5.2.3 +orderly-set==5.3.0 # via deepdiff packaging==24.2 # via # marshmallow # unstructured-client -psutil==6.1.1 - # via -r ./base.in +psutil==7.0.0 + # via -r base.in pycparser==2.22 # via cffi -pypdf==5.2.0 +pypdf==5.3.0 # via unstructured-client python-dateutil==2.9.0.post0 # via unstructured-client -python-iso639==2025.1.28 - # via -r ./base.in +python-iso639==2025.2.18 + # via -r base.in python-magic==0.4.27 - # via -r ./base.in -python-oxmsg==0.0.1 - # via -r ./base.in -rapidfuzz==3.11.0 - # via -r ./base.in + # via -r base.in +python-oxmsg==0.0.2 + # via -r base.in +rapidfuzz==3.12.1 + # via -r base.in regex==2024.11.6 # via nltk requests==2.32.3 # via - # -r ./base.in + # -r base.in # requests-toolbelt # unstructured-client requests-toolbelt==1.0.0 @@ -123,12 +123,13 @@ soupsieve==2.6 # via beautifulsoup4 tqdm==4.67.1 # via - # -r ./base.in + # -r base.in # nltk typing-extensions==4.12.2 # via - # -r ./base.in + # -r base.in # anyio + # beautifulsoup4 # pypdf # python-oxmsg # typing-inspect @@ -139,14 +140,14 @@ typing-inspect==0.9.0 # unstructured-client unstructured-client==0.25.9 # via - # -c ././deps/constraints.txt - # -r ./base.in + # -c ./deps/constraints.txt + # -r base.in urllib3==1.26.20 # via - # -c ././deps/constraints.txt + # -c ./deps/constraints.txt # requests # unstructured-client webencodings==0.5.1 # via html5lib wrapt==1.17.2 - # via -r ./base.in + # via -r base.in diff --git a/requirements/deps/constraints.txt b/requirements/deps/constraints.txt index 296dd366b..7f1cdc889 100644 --- a/requirements/deps/constraints.txt +++ b/requirements/deps/constraints.txt @@ -8,7 +8,7 @@ 
weaviate-client>=3.26.7,<4.0.0 # TODO: Constriant due to multiple versions being installed during pip-compile grpcio>=1.65.5 # TODO: Pinned in transformers package, remove when that gets updated (https://github.com/huggingface/transformers/blob/main/setup.py) -tokenizers>=0.19,<0.20 +tokenizers>=0.21,<0.22 # TODO: Constaint due to boto, with python before 3.10 not requiring openssl 1.1.1, remove when that gets # updated or we drop support for 3.9 urllib3<1.27 diff --git a/requirements/dev.txt b/requirements/dev.txt index a5ebd9921..52558def4 100644 --- a/requirements/dev.txt +++ b/requirements/dev.txt @@ -2,7 +2,7 @@ # This file is autogenerated by pip-compile with Python 3.9 # by the following command: # -# pip-compile ./dev.in +# pip-compile dev.in # build==1.2.2.post1 # via pip-tools @@ -10,48 +10,48 @@ cfgv==3.4.0 # via pre-commit click==8.1.8 # via - # -c ./base.txt - # -c ./test.txt + # -c base.txt + # -c test.txt # pip-tools distlib==0.3.9 # via virtualenv filelock==3.17.0 # via virtualenv -identify==2.6.6 +identify==2.6.7 # via pre-commit importlib-metadata==8.6.1 # via - # -c ././deps/constraints.txt + # -c ./deps/constraints.txt # build nodeenv==1.9.1 # via pre-commit packaging==24.2 # via - # -c ./base.txt - # -c ./test.txt + # -c base.txt + # -c test.txt # build pip-tools==7.4.1 - # via -r ./dev.in + # via -r dev.in platformdirs==4.3.6 # via - # -c ./test.txt + # -c test.txt # virtualenv pre-commit==4.1.0 - # via -r ./dev.in + # via -r dev.in pyproject-hooks==1.2.0 # via # build # pip-tools pyyaml==6.0.2 # via - # -c ./test.txt + # -c test.txt # pre-commit tomli==2.2.1 # via - # -c ./test.txt + # -c test.txt # build # pip-tools -virtualenv==20.29.1 +virtualenv==20.29.2 # via pre-commit wheel==0.45.1 # via pip-tools diff --git a/requirements/extra-csv.txt b/requirements/extra-csv.txt index d4d50645e..33f6d3cb1 100644 --- a/requirements/extra-csv.txt +++ b/requirements/extra-csv.txt @@ -2,23 +2,23 @@ # This file is autogenerated by pip-compile with Python 
3.9 # by the following command: # -# pip-compile ./extra-csv.in +# pip-compile extra-csv.in # numpy==1.26.4 # via - # -c ./base.txt + # -c base.txt # pandas pandas==2.2.3 - # via -r ./extra-csv.in + # via -r extra-csv.in python-dateutil==2.9.0.post0 # via - # -c ./base.txt + # -c base.txt # pandas -pytz==2024.2 +pytz==2025.1 # via pandas six==1.17.0 # via - # -c ./base.txt + # -c base.txt # python-dateutil tzdata==2025.1 # via pandas diff --git a/requirements/extra-docx.txt b/requirements/extra-docx.txt index 01e7e2e24..3b7e4b8d0 100644 --- a/requirements/extra-docx.txt +++ b/requirements/extra-docx.txt @@ -2,15 +2,15 @@ # This file is autogenerated by pip-compile with Python 3.9 # by the following command: # -# pip-compile ./extra-docx.in +# pip-compile extra-docx.in # -lxml==5.3.0 +lxml==5.3.1 # via - # -c ./base.txt + # -c base.txt # python-docx python-docx==1.1.2 - # via -r ./extra-docx.in + # via -r extra-docx.in typing-extensions==4.12.2 # via - # -c ./base.txt + # -c base.txt # python-docx diff --git a/requirements/extra-epub.txt b/requirements/extra-epub.txt index 460408c41..b8571eb4a 100644 --- a/requirements/extra-epub.txt +++ b/requirements/extra-epub.txt @@ -2,7 +2,7 @@ # This file is autogenerated by pip-compile with Python 3.9 # by the following command: # -# pip-compile ./extra-epub.in +# pip-compile extra-epub.in # pypandoc==1.15 - # via -r ./extra-epub.in + # via -r extra-epub.in diff --git a/requirements/extra-markdown.txt b/requirements/extra-markdown.txt index 9d0a14da5..2ec0670ae 100644 --- a/requirements/extra-markdown.txt +++ b/requirements/extra-markdown.txt @@ -2,13 +2,13 @@ # This file is autogenerated by pip-compile with Python 3.9 # by the following command: # -# pip-compile ./extra-markdown.in +# pip-compile extra-markdown.in # importlib-metadata==8.6.1 # via - # -c ././deps/constraints.txt + # -c ./deps/constraints.txt # markdown markdown==3.7 - # via -r ./extra-markdown.in + # via -r extra-markdown.in zipp==3.21.0 # via 
importlib-metadata diff --git a/requirements/extra-odt.txt b/requirements/extra-odt.txt index 362c53ed7..2c413a496 100644 --- a/requirements/extra-odt.txt +++ b/requirements/extra-odt.txt @@ -2,17 +2,17 @@ # This file is autogenerated by pip-compile with Python 3.9 # by the following command: # -# pip-compile ./extra-odt.in +# pip-compile extra-odt.in # -lxml==5.3.0 +lxml==5.3.1 # via - # -c ./base.txt + # -c base.txt # python-docx pypandoc==1.15 - # via -r ./extra-odt.in + # via -r extra-odt.in python-docx==1.1.2 - # via -r ./extra-odt.in + # via -r extra-odt.in typing-extensions==4.12.2 # via - # -c ./base.txt + # -c base.txt # python-docx diff --git a/requirements/extra-paddleocr.txt b/requirements/extra-paddleocr.txt index dcb1e2108..1a76fa76e 100644 --- a/requirements/extra-paddleocr.txt +++ b/requirements/extra-paddleocr.txt @@ -2,53 +2,53 @@ # This file is autogenerated by pip-compile with Python 3.9 # by the following command: # -# pip-compile ./extra-paddleocr.in +# pip-compile extra-paddleocr.in # anyio==4.8.0 # via - # -c ./base.txt + # -c base.txt # httpx astor==0.8.1 # via paddlepaddle -certifi==2024.12.14 +certifi==2025.1.31 # via - # -c ./base.txt + # -c base.txt # httpcore # httpx # requests charset-normalizer==3.4.1 # via - # -c ./base.txt + # -c base.txt # requests contourpy==1.3.0 # via matplotlib cycler==0.12.1 # via matplotlib -cython==3.0.11 +cython==3.0.12 # via unstructured-paddleocr decorator==5.1.1 # via paddlepaddle exceptiongroup==1.2.2 # via - # -c ./base.txt + # -c base.txt # anyio -fonttools==4.55.8 +fonttools==4.56.0 # via matplotlib h11==0.14.0 # via - # -c ./base.txt + # -c base.txt # httpcore httpcore==1.0.7 # via - # -c ./base.txt + # -c base.txt # httpx httpx==0.28.1 # via - # -c ./base.txt + # -c base.txt # paddlepaddle idna==3.10 # via - # -c ./base.txt + # -c base.txt # anyio # httpx # requests @@ -72,7 +72,7 @@ networkx==3.2.1 # scikit-image numpy==1.26.4 # via - # -c ./base.txt + # -c base.txt # contourpy # imageio # imgaug 
@@ -96,12 +96,12 @@ opt-einsum==3.3.0 # via paddlepaddle packaging==24.2 # via - # -c ./base.txt + # -c base.txt # lazy-loader # matplotlib # scikit-image paddlepaddle==3.0.0b1 - # via -r ./extra-paddleocr.in + # via -r extra-paddleocr.in pdf2image==1.17.0 # via unstructured-paddleocr pillow==11.1.0 @@ -121,17 +121,17 @@ pyparsing==3.2.1 # via matplotlib python-dateutil==2.9.0.post0 # via - # -c ./base.txt + # -c base.txt # matplotlib pyyaml==6.0.2 # via unstructured-paddleocr -rapidfuzz==3.11.0 +rapidfuzz==3.12.1 # via - # -c ./base.txt + # -c base.txt # unstructured-paddleocr requests==2.32.3 # via - # -c ./base.txt + # -c base.txt # unstructured-paddleocr scikit-image==0.24.0 # via @@ -141,36 +141,36 @@ scipy==1.13.1 # via # imgaug # scikit-image -shapely==2.0.6 +shapely==2.0.7 # via # imgaug # unstructured-paddleocr six==1.17.0 # via - # -c ./base.txt + # -c base.txt # imgaug # python-dateutil sniffio==1.3.1 # via - # -c ./base.txt + # -c base.txt # anyio tifffile==2024.8.30 # via scikit-image tqdm==4.67.1 # via - # -c ./base.txt + # -c base.txt # unstructured-paddleocr typing-extensions==4.12.2 # via - # -c ./base.txt + # -c base.txt # anyio # paddlepaddle unstructured-paddleocr==2.8.1.0 - # via -r ./extra-paddleocr.in + # via -r extra-paddleocr.in urllib3==1.26.20 # via - # -c ././deps/constraints.txt - # -c ./base.txt + # -c ./deps/constraints.txt + # -c base.txt # requests zipp==3.21.0 # via importlib-resources diff --git a/requirements/extra-pandoc.txt b/requirements/extra-pandoc.txt index dd397c384..8dbc066d2 100644 --- a/requirements/extra-pandoc.txt +++ b/requirements/extra-pandoc.txt @@ -2,7 +2,7 @@ # This file is autogenerated by pip-compile with Python 3.9 # by the following command: # -# pip-compile ./extra-pandoc.in +# pip-compile extra-pandoc.in # pypandoc==1.15 - # via -r ./extra-pandoc.in + # via -r extra-pandoc.in diff --git a/requirements/extra-pdf-image.txt b/requirements/extra-pdf-image.txt index f30252303..b7fe995f4 100644 --- 
a/requirements/extra-pdf-image.txt +++ b/requirements/extra-pdf-image.txt @@ -2,49 +2,49 @@ # This file is autogenerated by pip-compile with Python 3.9 # by the following command: # -# pip-compile ./extra-pdf-image.in +# pip-compile extra-pdf-image.in # antlr4-python3-runtime==4.9.3 # via omegaconf cachetools==5.5.1 # via google-auth -certifi==2024.12.14 +certifi==2025.1.31 # via - # -c ./base.txt + # -c base.txt # requests cffi==1.17.1 # via - # -c ./base.txt + # -c base.txt # cryptography charset-normalizer==3.4.1 # via - # -c ./base.txt + # -c base.txt # pdfminer-six # requests coloredlogs==15.0.1 # via onnxruntime contourpy==1.3.0 # via matplotlib -cryptography==44.0.0 +cryptography==44.0.1 # via - # -c ./base.txt + # -c base.txt # pdfminer-six cycler==0.12.1 # via matplotlib deprecated==1.2.18 # via pikepdf effdet==0.4.1 - # via -r ./extra-pdf-image.in + # via -r extra-pdf-image.in filelock==3.17.0 # via # huggingface-hub # torch # transformers -flatbuffers==25.1.24 +flatbuffers==25.2.10 # via onnxruntime -fonttools==4.55.8 +fonttools==4.56.0 # via matplotlib -fsspec==2024.12.0 +fsspec==2025.2.0 # via # huggingface-hub # torch @@ -54,20 +54,20 @@ google-auth==2.38.0 # via # google-api-core # google-cloud-vision -google-cloud-vision==3.9.0 - # via -r ./extra-pdf-image.in -googleapis-common-protos==1.66.0 +google-cloud-vision==3.10.0 + # via -r extra-pdf-image.in +googleapis-common-protos==1.67.0 # via # google-api-core # grpcio-status grpcio==1.70.0 # via - # -c ././deps/constraints.txt + # -c ./deps/constraints.txt # google-api-core # grpcio-status grpcio-status==1.70.0 # via google-api-core -huggingface-hub==0.28.0 +huggingface-hub==0.28.1 # via # timm # tokenizers @@ -77,7 +77,7 @@ humanfriendly==10.0 # via coloredlogs idna==3.10 # via - # -c ./base.txt + # -c base.txt # requests importlib-resources==6.5.2 # via matplotlib @@ -85,9 +85,9 @@ jinja2==3.1.5 # via torch kiwisolver==1.4.7 # via matplotlib -lxml==5.3.0 +lxml==5.3.1 # via - # -c ./base.txt + # -c 
base.txt # pikepdf markupsafe==3.0.2 # via jinja2 @@ -101,7 +101,7 @@ networkx==3.2.1 # via torch numpy==1.26.4 # via - # -c ./base.txt + # -c base.txt # contourpy # matplotlib # onnx @@ -117,7 +117,7 @@ omegaconf==2.3.0 # via effdet onnx==1.17.0 # via - # -r ./extra-pdf-image.in + # -r extra-pdf-image.in # unstructured-inference onnxruntime==1.19.2 # via unstructured-inference @@ -125,7 +125,7 @@ opencv-python==4.11.0.86 # via unstructured-inference packaging==24.2 # via - # -c ./base.txt + # -c base.txt # huggingface-hub # matplotlib # onnxruntime @@ -135,15 +135,15 @@ packaging==24.2 pandas==2.2.3 # via unstructured-inference pdf2image==1.17.0 - # via -r ./extra-pdf-image.in + # via -r extra-pdf-image.in pdfminer-six==20240706 # via - # -r ./extra-pdf-image.in + # -r extra-pdf-image.in # unstructured-inference pi-heif==0.21.0 - # via -r ./extra-pdf-image.in -pikepdf==9.5.1 - # via -r ./extra-pdf-image.in + # via -r extra-pdf-image.in +pikepdf==9.5.2 + # via -r extra-pdf-image.in pillow==11.1.0 # via # matplotlib @@ -175,24 +175,24 @@ pycocotools==2.0.8 # via effdet pycparser==2.22 # via - # -c ./base.txt + # -c base.txt # cffi pyparsing==3.2.1 # via matplotlib -pypdf==5.2.0 +pypdf==5.3.0 # via - # -c ./base.txt - # -r ./extra-pdf-image.in + # -c base.txt + # -r extra-pdf-image.in pypdfium2==4.30.1 # via unstructured-inference python-dateutil==2.9.0.post0 # via - # -c ./base.txt + # -c base.txt # matplotlib # pandas python-multipart==0.0.20 # via unstructured-inference -pytz==2024.2 +pytz==2025.1 # via pandas pyyaml==6.0.2 # via @@ -200,17 +200,17 @@ pyyaml==6.0.2 # omegaconf # timm # transformers -rapidfuzz==3.11.0 +rapidfuzz==3.12.1 # via - # -c ./base.txt + # -c base.txt # unstructured-inference regex==2024.11.6 # via - # -c ./base.txt + # -c base.txt # transformers requests==2.32.3 # via - # -c ./base.txt + # -c base.txt # google-api-core # huggingface-hub # transformers @@ -224,7 +224,7 @@ scipy==1.13.1 # via unstructured-inference six==1.17.0 # via - # -c 
./base.txt + # -c base.txt # python-dateutil sympy==1.13.1 # via @@ -234,9 +234,9 @@ timm==1.0.14 # via # effdet # unstructured-inference -tokenizers==0.19.1 +tokenizers==0.21.0 # via - # -c ././deps/constraints.txt + # -c ./deps/constraints.txt # transformers torch==2.6.0 # via @@ -250,31 +250,31 @@ torchvision==0.21.0 # timm tqdm==4.67.1 # via - # -c ./base.txt + # -c base.txt # huggingface-hub # transformers -transformers==4.44.2 +transformers==4.49.0 # via unstructured-inference typing-extensions==4.12.2 # via - # -c ./base.txt + # -c base.txt # huggingface-hub # pypdf # torch tzdata==2025.1 # via pandas unstructured-inference==0.8.7 - # via -r ./extra-pdf-image.in + # via -r extra-pdf-image.in unstructured-pytesseract==0.3.13 - # via -r ./extra-pdf-image.in + # via -r extra-pdf-image.in urllib3==1.26.20 # via - # -c ././deps/constraints.txt - # -c ./base.txt + # -c ./deps/constraints.txt + # -c base.txt # requests wrapt==1.17.2 # via - # -c ./base.txt + # -c base.txt # deprecated zipp==3.21.0 # via importlib-resources diff --git a/requirements/extra-pptx.txt b/requirements/extra-pptx.txt index 83ff09f01..3fd6f4648 100644 --- a/requirements/extra-pptx.txt +++ b/requirements/extra-pptx.txt @@ -2,14 +2,14 @@ # This file is autogenerated by pip-compile with Python 3.9 # by the following command: # -# pip-compile ./extra-pptx.in +# pip-compile extra-pptx.in # -lxml==5.3.0 +lxml==5.3.1 # via python-pptx pillow==11.1.0 # via python-pptx python-pptx==1.0.2 - # via -r ./extra-pptx.in + # via -r extra-pptx.in typing-extensions==4.12.2 # via python-pptx xlsxwriter==3.2.2 diff --git a/requirements/extra-xlsx.txt b/requirements/extra-xlsx.txt index b0c6cadbf..59f84a420 100644 --- a/requirements/extra-xlsx.txt +++ b/requirements/extra-xlsx.txt @@ -2,31 +2,31 @@ # This file is autogenerated by pip-compile with Python 3.9 # by the following command: # -# pip-compile ./extra-xlsx.in +# pip-compile extra-xlsx.in # et-xmlfile==2.0.0 # via openpyxl networkx==3.2.1 - # via -r 
./extra-xlsx.in + # via -r extra-xlsx.in numpy==1.26.4 # via - # -c ./base.txt + # -c base.txt # pandas openpyxl==3.1.5 - # via -r ./extra-xlsx.in + # via -r extra-xlsx.in pandas==2.2.3 - # via -r ./extra-xlsx.in + # via -r extra-xlsx.in python-dateutil==2.9.0.post0 # via - # -c ./base.txt + # -c base.txt # pandas -pytz==2024.2 +pytz==2025.1 # via pandas six==1.17.0 # via - # -c ./base.txt + # -c base.txt # python-dateutil tzdata==2025.1 # via pandas xlrd==2.0.1 - # via -r ./extra-xlsx.in + # via -r extra-xlsx.in diff --git a/requirements/huggingface.txt b/requirements/huggingface.txt index bd78de5cb..60c0788db 100644 --- a/requirements/huggingface.txt +++ b/requirements/huggingface.txt @@ -2,47 +2,47 @@ # This file is autogenerated by pip-compile with Python 3.9 # by the following command: # -# pip-compile ./huggingface.in +# pip-compile huggingface.in # -certifi==2024.12.14 +certifi==2025.1.31 # via - # -c ./base.txt + # -c base.txt # requests charset-normalizer==3.4.1 # via - # -c ./base.txt + # -c base.txt # requests click==8.1.8 # via - # -c ./base.txt + # -c base.txt # sacremoses filelock==3.17.0 # via # huggingface-hub # torch # transformers -fsspec==2024.12.0 +fsspec==2025.2.0 # via # huggingface-hub # torch -huggingface-hub==0.28.0 +huggingface-hub==0.28.1 # via # tokenizers # transformers idna==3.10 # via - # -c ./base.txt + # -c base.txt # requests jinja2==3.1.5 # via torch joblib==1.4.2 # via - # -c ./base.txt + # -c base.txt # sacremoses langdetect==1.0.9 # via - # -c ./base.txt - # -r ./huggingface.in + # -c base.txt + # -r huggingface.in markupsafe==3.0.2 # via jinja2 mpmath==1.3.0 @@ -51,11 +51,11 @@ networkx==3.2.1 # via torch numpy==1.26.4 # via - # -c ./base.txt + # -c base.txt # transformers packaging==24.2 # via - # -c ./base.txt + # -c base.txt # huggingface-hub # transformers pyyaml==6.0.2 @@ -64,47 +64,47 @@ pyyaml==6.0.2 # transformers regex==2024.11.6 # via - # -c ./base.txt + # -c base.txt # sacremoses # transformers requests==2.32.3 # 
via - # -c ./base.txt + # -c base.txt # huggingface-hub # transformers sacremoses==0.1.1 - # via -r ./huggingface.in + # via -r huggingface.in safetensors==0.5.2 # via transformers sentencepiece==0.2.0 - # via -r ./huggingface.in + # via -r huggingface.in six==1.17.0 # via - # -c ./base.txt + # -c base.txt # langdetect sympy==1.13.1 # via torch -tokenizers==0.19.1 +tokenizers==0.21.0 # via - # -c ././deps/constraints.txt + # -c ./deps/constraints.txt # transformers torch==2.6.0 - # via -r ./huggingface.in + # via -r huggingface.in tqdm==4.67.1 # via - # -c ./base.txt + # -c base.txt # huggingface-hub # sacremoses # transformers -transformers==4.44.2 - # via -r ./huggingface.in +transformers==4.49.0 + # via -r huggingface.in typing-extensions==4.12.2 # via - # -c ./base.txt + # -c base.txt # huggingface-hub # torch urllib3==1.26.20 # via - # -c ././deps/constraints.txt - # -c ./base.txt + # -c ./deps/constraints.txt + # -c base.txt # requests diff --git a/requirements/test.txt b/requirements/test.txt index 554846fff..13c7eb8ef 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -2,13 +2,13 @@ # This file is autogenerated by pip-compile with Python 3.9 # by the following command: # -# pip-compile ./test.in +# pip-compile test.in # annotated-types==0.7.0 # via pydantic anyio==4.8.0 # via - # -c ./base.txt + # -c base.txt # httpx appdirs==1.4.4 # via label-studio-sdk @@ -19,29 +19,29 @@ attrs==25.1.0 # jsonschema # referencing autoflake==2.3.1 - # via -r ./test.in + # via -r test.in black==25.1.0 # via - # -r ./test.in + # -r test.in # datamodel-code-generator -certifi==2024.12.14 +certifi==2025.1.31 # via - # -c ./base.txt + # -c base.txt # httpcore # httpx # requests charset-normalizer==3.4.1 # via - # -c ./base.txt + # -c base.txt # requests click==8.1.8 # via - # -c ./base.txt + # -c base.txt # black # nltk -coverage[toml]==7.6.10 +coverage[toml]==7.6.12 # via - # -r ./test.in + # -r test.in # pytest-cov datamodel-code-generator==0.26.1 # via 
label-studio-sdk @@ -51,40 +51,40 @@ email-validator==2.2.0 # via pydantic exceptiongroup==1.2.2 # via - # -c ./base.txt + # -c base.txt # anyio # pytest -faker==35.0.0 +faker==36.1.1 # via jsf -flake8==7.1.1 +flake8==7.1.2 # via - # -r ./test.in + # -r test.in # flake8-print flake8-print==5.0.0 - # via -r ./test.in + # via -r test.in freezegun==1.5.1 - # via -r ./test.in + # via -r test.in genson==1.3.0 # via datamodel-code-generator grpcio==1.70.0 # via - # -c ././deps/constraints.txt - # -r ./test.in + # -c ./deps/constraints.txt + # -r test.in h11==0.14.0 # via - # -c ./base.txt + # -c base.txt # httpcore httpcore==1.0.7 # via - # -c ./base.txt + # -c base.txt # httpx httpx==0.28.1 # via - # -c ./base.txt + # -c base.txt # label-studio-sdk idna==3.10 # via - # -c ./base.txt + # -c base.txt # anyio # email-validator # httpx @@ -102,7 +102,7 @@ jinja2==3.1.5 # via datamodel-code-generator joblib==1.4.2 # via - # -c ./base.txt + # -c base.txt # nltk jsf==0.11.2 # via label-studio-sdk @@ -112,13 +112,13 @@ jsonschema==4.23.0 # label-studio-sdk jsonschema-specifications==2024.10.1 # via jsonschema -label-studio-sdk==1.0.8 - # via -r ./test.in +label-studio-sdk==1.0.10 + # via -r test.in liccheck==0.9.2 - # via -r ./test.in -lxml==5.3.0 + # via -r test.in +lxml==5.3.1 # via - # -c ./base.txt + # -c base.txt # label-studio-sdk markupsafe==3.0.2 # via jinja2 @@ -126,25 +126,25 @@ mccabe==0.7.0 # via flake8 multidict==6.1.0 # via yarl -mypy==1.14.1 - # via -r ./test.in +mypy==1.15.0 + # via -r test.in mypy-extensions==1.0.0 # via - # -c ./base.txt + # -c base.txt # black # mypy nltk==3.9.1 # via - # -c ./base.txt + # -c base.txt # label-studio-sdk numpy==1.26.4 # via - # -c ./base.txt + # -c base.txt # label-studio-sdk # pandas packaging==24.2 # via - # -c ./base.txt + # -c base.txt # black # datamodel-code-generator # pytest @@ -166,12 +166,14 @@ pycodestyle==2.12.1 # flake8-print pydantic[email]==2.10.6 # via - # -r ./test.in + # -r test.in # datamodel-code-generator 
# jsf # label-studio-sdk pydantic-core==2.27.2 - # via pydantic + # via + # label-studio-sdk + # pydantic pyflakes==3.2.0 # via # autoflake @@ -181,16 +183,15 @@ pytest==8.3.4 # pytest-cov # pytest-mock pytest-cov==6.0.0 - # via -r ./test.in + # via -r test.in pytest-mock==3.14.0 - # via -r ./test.in + # via -r test.in python-dateutil==2.9.0.post0 # via - # -c ./base.txt - # faker + # -c base.txt # freezegun # pandas -pytz==2024.2 +pytz==2025.1 # via pandas pyyaml==6.0.2 # via @@ -202,11 +203,11 @@ referencing==0.36.2 # jsonschema-specifications regex==2024.11.6 # via - # -c ./base.txt + # -c base.txt # nltk requests==2.32.3 # via - # -c ./base.txt + # -c base.txt # label-studio-sdk # requests-mock # smart-open @@ -218,19 +219,19 @@ rpds-py==0.22.3 # referencing rstr==3.2.2 # via jsf -ruff==0.9.3 - # via -r ./test.in +ruff==0.9.6 + # via -r test.in semantic-version==2.10.0 # via liccheck six==1.17.0 # via - # -c ./base.txt + # -c base.txt # python-dateutil smart-open[http]==7.1.0 # via jsf sniffio==1.3.1 # via - # -c ./base.txt + # -c base.txt # anyio toml==0.10.2 # via @@ -245,24 +246,23 @@ tomli==2.2.1 # pytest tqdm==4.67.1 # via - # -c ./base.txt + # -c base.txt # nltk types-click==7.1.8 - # via -r ./test.in + # via -r test.in types-markdown==3.7.0.20241204 - # via -r ./test.in + # via -r test.in types-requests==2.31.0.6 - # via -r ./test.in + # via -r test.in types-tabulate==0.9.0.20241207 - # via -r ./test.in + # via -r test.in types-urllib3==1.26.25.14 # via types-requests typing-extensions==4.12.2 # via - # -c ./base.txt + # -c base.txt # anyio # black - # faker # jsf # label-studio-sdk # multidict @@ -271,20 +271,22 @@ typing-extensions==4.12.2 # pydantic-core # referencing tzdata==2025.1 - # via pandas + # via + # faker + # pandas ujson==5.10.0 # via label-studio-sdk urllib3==1.26.20 # via - # -c ././deps/constraints.txt - # -c ./base.txt + # -c ./deps/constraints.txt + # -c base.txt # requests # vcrpy vcrpy==7.0.0 - # via -r ./test.in + # via -r test.in 
wrapt==1.17.2 # via - # -c ./base.txt + # -c base.txt # smart-open # vcrpy xmljson==0.2.1 diff --git a/unstructured/__version__.py b/unstructured/__version__.py index fb8bd1ff8..268ff7e15 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.16.21" # pragma: no cover +__version__ = "0.16.22" # pragma: no cover diff --git a/unstructured/cleaners/translate.py b/unstructured/cleaners/translate.py index 0e38106d3..32fa500ac 100644 --- a/unstructured/cleaners/translate.py +++ b/unstructured/cleaners/translate.py @@ -52,6 +52,7 @@ def translate_text(text: str, source_lang: Optional[str] = None, target_lang: st return text model_name = _get_opus_mt_model_name(_source_lang, target_lang) + print(f"Using model: {model_name}") try: tokenizer = MarianTokenizer.from_pretrained(model_name) @@ -79,7 +80,7 @@ def _translate_text(text, model, tokenizer): with warnings.catch_warnings(): warnings.simplefilter("ignore") translated = model.generate( - **tokenizer([text], return_tensors="pt", padding="max_length", max_length=512), + **tokenizer([text], return_tensors="pt", padding=True, truncation=True), ) return [tokenizer.decode(t, max_new_tokens=512, skip_special_tokens=True) for t in translated][ 0