Mirror of https://github.com/Unstructured-IO/unstructured.git, synced 2025-12-24 05:34:58 +00:00
fix(CVE-2024-39705): update to latest nltk version (#3512)
### Summary

Addresses [CVE-2024-39705](https://nvd.nist.gov/vuln/detail/CVE-2024-39705) by updating to `nltk==3.8.2`; closes #3511. This CVE had previously been mitigated in #3361.

---------

Co-authored-by: Christine Straub <christinemstraub@gmail.com>
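A quick way for downstream users to confirm the fix landed in their environment (a minimal sketch, not part of this change; it assumes the third-party `packaging` distribution is installed):

```python
# Minimal sketch (not part of this commit): confirm the installed nltk
# carries the CVE-2024-39705 fix. Assumes `packaging` is available.
from importlib.metadata import version

from packaging.version import Version

installed = Version(version("nltk"))
assert installed >= Version("3.8.2"), f"nltk {installed} predates the CVE-2024-39705 fix"
```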
This commit is contained in: parent 1158d8f695 · commit 7437f0a084
.github/workflows/ci.yml (vendored, 16 changed lines)
@@ -120,8 +120,6 @@ jobs:
       matrix:
         python-version: ["3.9","3.10","3.11", "3.12"]
     runs-on: ubuntu-latest
-    env:
-      NLTK_DATA: ${{ github.workspace }}/nltk_data
     needs: [setup, lint]
     steps:
       - uses: actions/checkout@v4
@@ -161,7 +159,6 @@ jobs:
         python-version: ["3.10"]
     runs-on: ubuntu-latest
     env:
-      NLTK_DATA: ${{ github.workspace }}/nltk_data
       UNSTRUCTURED_HF_TOKEN: ${{ secrets.HF_TOKEN }}
     needs: [setup, lint]
     steps:
@@ -179,6 +176,7 @@ jobs:
           UNS_API_KEY: ${{ secrets.UNS_API_KEY }}
         run: |
           source .venv/bin/activate
+          make install-nltk-models
           sudo apt-get update
           sudo apt-get install -y poppler-utils
           make install-pandoc install-test
@@ -193,8 +191,6 @@ jobs:
       matrix:
         python-version: ["3.10"]
     runs-on: ubuntu-latest
-    env:
-      NLTK_DATA: ${{ github.workspace }}/nltk_data
     needs: [setup, lint]
     steps:
       - uses: actions/checkout@v4
@@ -211,6 +207,7 @@ jobs:
           UNS_API_KEY: ${{ secrets.UNS_API_KEY }}
         run: |
           source .venv/bin/activate
+          make install-nltk-models
           make test-no-extras CI=true

   test_unit_dependency_extras:
@@ -276,8 +273,6 @@ jobs:
       matrix:
         python-version: [ "3.9","3.10" ]
     runs-on: ubuntu-latest
-    env:
-      NLTK_DATA: ${{ github.workspace }}/nltk_data
     needs: [ setup_ingest, lint ]
     steps:
       # actions/checkout MUST come before auth
@@ -296,6 +291,7 @@ jobs:
       - name: Test Ingest (unit)
         run: |
           source .venv/bin/activate
+          make install-nltk-models
           PYTHONPATH=. pytest test_unstructured_ingest/unit

@@ -304,8 +300,6 @@ jobs:
       matrix:
         python-version: ["3.9","3.10"]
     runs-on: ubuntu-latest-m
-    env:
-      NLTK_DATA: ${{ github.workspace }}/nltk_data
     needs: [setup_ingest, lint]
     steps:
       # actions/checkout MUST come before auth
@@ -373,6 +367,7 @@ jobs:
           CI: "true"
         run: |
           source .venv/bin/activate
+          make install-nltk-models
           sudo apt-get update
           sudo apt-get install -y libmagic-dev poppler-utils libreoffice
           make install-pandoc
@@ -391,8 +386,6 @@ jobs:
       matrix:
         python-version: ["3.9","3.10"]
     runs-on: ubuntu-latest-m
-    env:
-      NLTK_DATA: ${{ github.workspace }}/nltk_data
     needs: [setup_ingest, lint]
     steps:
       # actions/checkout MUST come before auth
@@ -445,6 +438,7 @@ jobs:
           CI: "true"
         run: |
           source .venv/bin/activate
+          make install-nltk-models
           sudo apt-get update
           sudo apt-get install -y libmagic-dev poppler-utils libreoffice
           make install-pandoc
CHANGELOG.md

@@ -1,4 +1,4 @@
-## 0.15.2-dev8
+## 0.15.2

 ### Enhancements

@@ -10,6 +10,7 @@

 ### Fixes

+* **Updates NLTK data file for compatibility with `nltk>=3.8.2`.** The NLTK data file now contains `punkt_tab`, making it possible to upgrade to `nltk>=3.8.2`. The `nltk==3.8.2` release patches CVE-2024-39705.
 * **Renames Astra to Astra DB.** Conforms with DataStax internal naming conventions.
 * **Accommodate single-column CSV files.** Resolves a limitation of `partition_csv()` where delimiter detection would fail on a single-column CSV file (which naturally has no delimiters).
 * **Accommodate `image/jpg` in PPTX as alias for `image/jpeg`.** Resolves problem partitioning PPTX files having an invalid `image/jpg` (should be `image/jpeg`) MIME-type in the `[Content_Types].xml` member of the PPTX Zip archive.
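Since the fix above hinges on the `punkt_tab` data actually being present, here is a hedged sketch of probing for the new-style resources; the resource names are taken from the `tokenize.py` diff further down, and `nltk.data.find` is standard NLTK API:

```python
# Sketch: probe for the nltk>=3.8.2-era resources this release depends on.
import nltk

for resource in ("tokenizers/punkt_tab", "taggers/averaged_perceptron_tagger_eng"):
    try:
        nltk.data.find(resource)  # raises LookupError when absent
        print(f"{resource}: available")
    except LookupError:
        print(f"{resource}: missing; run `make install-nltk-models`")
```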
Makefile (3 changed lines)
@@ -38,8 +38,7 @@ install-huggingface:

 .PHONY: install-nltk-models
 install-nltk-models:
-	python3 -c "import nltk; nltk.download('punkt')"
-	python3 -c "import nltk; nltk.download('averaged_perceptron_tagger')"
+	python3 -c "from unstructured.nlp.tokenize import download_nltk_packages; download_nltk_packages()"

 .PHONY: install-test
 install-test:
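The rewritten target defers to the library's own helper, so the make recipe is equivalent to this Python two-liner:

```python
# What `make install-nltk-models` now runs: fetch the pinned,
# checksum-verified NLTK data bundle in one call.
from unstructured.nlp.tokenize import download_nltk_packages

download_nltk_packages()
```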
requirements/base.txt

@@ -57,7 +57,7 @@ jsonpath-python==1.0.6
     # via unstructured-client
 langdetect==1.0.9
     # via -r ./base.in
-lxml==5.2.2
+lxml==5.3.0
     # via -r ./base.in
 marshmallow==3.21.3
     # via
@@ -69,7 +69,7 @@ mypy-extensions==1.0.0
     #   unstructured-client
 nest-asyncio==1.6.0
     # via unstructured-client
-nltk==3.8.1
+nltk==3.8.2
     # via -r ./base.in
 numpy==1.26.4
     # via -r ./base.in
@@ -423,7 +423,7 @@ virtualenv==20.26.3
     # via pre-commit
 wcwidth==0.2.13
     # via prompt-toolkit
-webcolors==24.6.0
+webcolors==24.8.0
     # via jsonschema
 webencodings==0.5.1
     # via
@@ -437,7 +437,7 @@ wheel==0.44.0
     #   pip-tools
 widgetsnbextension==4.0.11
     # via ipywidgets
-zipp==3.19.2
+zipp==3.20.0
     # via importlib-metadata

 # The following packages are considered to be unsafe in a requirements file:
requirements/extra-docx.txt

@@ -4,7 +4,7 @@
 #
 #    pip-compile ./extra-docx.in
 #
-lxml==5.2.2
+lxml==5.3.0
     # via
     #   -c ./base.txt
     #   python-docx
requirements/extra-markdown.txt

@@ -8,5 +8,5 @@ importlib-metadata==8.2.0
     # via markdown
 markdown==3.6
     # via -r ./extra-markdown.in
-zipp==3.19.2
+zipp==3.20.0
     # via importlib-metadata
requirements/extra-odt.txt

@@ -4,7 +4,7 @@
 #
 #    pip-compile ./extra-odt.in
 #
-lxml==5.2.2
+lxml==5.3.0
     # via
     #   -c ./base.txt
     #   python-docx
requirements/extra-paddleocr.txt

@@ -78,7 +78,7 @@ lanms-neo==1.0.2
     # via unstructured-paddleocr
 lazy-loader==0.4
     # via scikit-image
-lxml==5.2.2
+lxml==5.3.0
     # via
     #   -c ./base.txt
     #   premailer
@@ -191,7 +191,7 @@ sniffio==1.3.1
     #   -c ./base.txt
     #   anyio
     #   httpx
-tifffile==2024.7.24
+tifffile==2024.8.10
     # via scikit-image
 tqdm==4.66.5
     # via
@@ -208,5 +208,5 @@ urllib3==1.26.19
     #   -c ././deps/constraints.txt
     #   -c ./base.txt
     #   requests
-zipp==3.19.2
+zipp==3.20.0
     # via importlib-resources
requirements/extra-pdf-image.txt

@@ -86,7 +86,7 @@ kiwisolver==1.4.5
     # via matplotlib
 layoutparser==0.3.4
     # via unstructured-inference
-lxml==5.2.2
+lxml==5.3.0
     # via
     #   -c ./base.txt
     #   pikepdf
@@ -249,7 +249,7 @@ six==1.16.0
     # via
     #   -c ./base.txt
     #   python-dateutil
-sympy==1.13.1
+sympy==1.13.2
     # via
     #   onnxruntime
     #   torch
@@ -301,5 +301,5 @@ wrapt==1.16.0
     #   -c ././deps/constraints.txt
     #   -c ./base.txt
     #   deprecated
-zipp==3.19.2
+zipp==3.20.0
     # via importlib-resources
requirements/extra-pptx.txt

@@ -4,7 +4,7 @@
 #
 #    pip-compile ./extra-pptx.in
 #
-lxml==5.2.2
+lxml==5.3.0
     # via python-pptx
 pillow==10.4.0
     # via python-pptx
@@ -85,7 +85,7 @@ six==1.16.0
     # via
     #   -c ./base.txt
     #   langdetect
-sympy==1.13.1
+sympy==1.13.2
     # via torch
 tokenizers==0.19.1
     # via
requirements/ingest/azure.txt

@@ -8,7 +8,7 @@ adlfs==2024.7.0
     # via -r ./ingest/azure.in
 aiohappyeyeballs==2.3.5
     # via aiohttp
-aiohttp==3.10.2
+aiohttp==3.10.3
     # via adlfs
 aiosignal==1.3.1
     # via aiohttp
@@ -194,7 +194,7 @@ sniffio==1.3.1
     #   httpx
 starlette==0.37.2
     # via fastapi
-sympy==1.13.1
+sympy==1.13.2
     # via onnxruntime
 tenacity==8.5.0
     # via
@@ -247,7 +247,7 @@ wrapt==1.16.0
     #   -c ./ingest/../deps/constraints.txt
     #   deprecated
     #   opentelemetry-instrumentation
-zipp==3.19.2
+zipp==3.20.0
     # via
     #   importlib-metadata
     #   importlib-resources
requirements/ingest/clarifai.txt

@@ -15,7 +15,7 @@ charset-normalizer==3.3.2
     #   requests
 clarifai==10.7.0
     # via -r ./ingest/clarifai.in
-clarifai-grpc==10.7.0
+clarifai-grpc==10.7.1
     # via clarifai
 contextlib2==21.6.0
     # via schema
requirements/ingest/discord.txt

@@ -6,7 +6,7 @@
 #
 aiohappyeyeballs==2.3.5
     # via aiohttp
-aiohttp==3.10.2
+aiohttp==3.10.3
     # via discord-py
 aiosignal==1.3.1
     # via aiohttp
requirements/ingest/elasticsearch.txt

@@ -6,7 +6,7 @@
 #
 aiohappyeyeballs==2.3.5
     # via aiohttp
-aiohttp==3.10.2
+aiohttp==3.10.3
     # via elasticsearch
 aiosignal==1.3.1
     # via aiohttp
@@ -19,7 +19,7 @@ certifi==2024.7.4
     #   -c ./ingest/../base.txt
     #   -c ./ingest/../deps/constraints.txt
     #   elastic-transport
-elastic-transport==8.13.1
+elastic-transport==8.15.0
     # via elasticsearch
 elasticsearch[async]==8.14.0
     # via -r ./ingest/elasticsearch.in
@@ -6,7 +6,7 @@
 #
 aiohappyeyeballs==2.3.5
     # via aiohttp
-aiohttp==3.10.2
+aiohttp==3.10.3
     # via
     #   langchain
     #   langchain-community
@@ -70,7 +70,7 @@ langchain-core==0.2.29
     #   langchain-text-splitters
 langchain-text-splitters==0.2.2
     # via langchain
-langsmith==0.1.98
+langsmith==0.1.99
     # via
     #   langchain
     #   langchain-community
requirements/ingest/embed-huggingface.txt

@@ -49,7 +49,7 @@ langchain-core==0.2.29
     # via langchain-huggingface
 langchain-huggingface==0.0.3
     # via -r ./ingest/embed-huggingface.in
-langsmith==0.1.98
+langsmith==0.1.99
     # via langchain-core
 markupsafe==2.1.5
     # via jinja2
@@ -107,7 +107,7 @@ scipy==1.11.3
     #   sentence-transformers
 sentence-transformers==3.0.1
     # via langchain-huggingface
-sympy==1.13.1
+sympy==1.13.2
     # via torch
 tenacity==8.5.0
     # via langchain-core
requirements/ingest/embed-octoai.txt

@@ -49,7 +49,7 @@ idna==3.7
     #   requests
 jiter==0.5.0
     # via openai
-openai==1.40.2
+openai==1.40.3
     # via -r ./ingest/embed-octoai.in
 pydantic==2.8.2
     # via openai
requirements/ingest/embed-openai.txt

@@ -55,11 +55,11 @@ jsonpointer==3.0.0
     # via jsonpatch
 langchain-core==0.2.29
     # via langchain-openai
-langchain-openai==0.1.20
+langchain-openai==0.1.21
     # via -r ./ingest/embed-openai.in
-langsmith==0.1.98
+langsmith==0.1.99
     # via langchain-core
-openai==1.40.2
+openai==1.40.3
     # via langchain-openai
 orjson==3.10.7
     # via langsmith
requirements/ingest/embed-vertexai.txt

@@ -6,7 +6,7 @@
 #
 aiohappyeyeballs==2.3.5
     # via aiohttp
-aiohttp==3.10.2
+aiohttp==3.10.3
     # via
     #   langchain
     #   langchain-community
@@ -120,7 +120,7 @@ langchain-google-vertexai==1.0.8
     # via -r ./ingest/embed-vertexai.in
 langchain-text-splitters==0.2.2
     # via langchain
-langsmith==0.1.98
+langsmith==0.1.99
     # via
     #   langchain
     #   langchain-community
requirements/ingest/embed-voyageai.txt

@@ -6,7 +6,7 @@
 #
 aiohappyeyeballs==2.3.5
     # via aiohttp
-aiohttp==3.10.2
+aiohttp==3.10.3
     # via
     #   langchain
     #   voyageai
@@ -55,7 +55,7 @@ langchain-text-splitters==0.2.2
     # via langchain
 langchain-voyageai==0.1.1
     # via -r ./ingest/embed-voyageai.in
-langsmith==0.1.98
+langsmith==0.1.99
     # via
     #   langchain
     #   langchain-core
requirements/ingest/gcs.txt

@@ -6,7 +6,7 @@
 #
 aiohappyeyeballs==2.3.5
     # via aiohttp
-aiohttp==3.10.2
+aiohttp==3.10.3
     # via gcsfs
 aiosignal==1.3.1
     # via aiohttp
requirements/ingest/s3.txt

@@ -4,11 +4,11 @@
 #
 #    pip-compile ./ingest/s3.in
 #
-aiobotocore==2.13.1
+aiobotocore==2.13.2
     # via s3fs
 aiohappyeyeballs==2.3.5
     # via aiohttp
-aiohttp==3.10.2
+aiohttp==3.10.3
     # via
     #   aiobotocore
     #   s3fs
@@ -25,7 +25,7 @@ idna==3.7
     #   requests
 isodate==0.6.1
     # via zeep
-lxml==5.2.2
+lxml==5.3.0
     # via
     #   -c ./ingest/../base.txt
     #   zeep
requirements/ingest/sftp.txt

@@ -16,7 +16,7 @@ fsspec==2024.5.0
     # via
     #   -c ./ingest/../deps/constraints.txt
     #   -r ./ingest/sftp.in
-paramiko==3.4.0
+paramiko==3.4.1
     # via -r ./ingest/sftp.in
 pycparser==2.22
     # via cffi
requirements/ingest/singlestore.txt

@@ -57,7 +57,7 @@ wheel==0.44.0
     # via
     #   -c ./ingest/../deps/constraints.txt
     #   singlestoredb
-zipp==3.19.2
+zipp==3.20.0
     # via importlib-metadata

 # The following packages are considered to be unsafe in a requirements file:
requirements/test.txt

@@ -66,7 +66,7 @@ label-studio-tools==0.0.4
     # via label-studio-sdk
 liccheck==0.9.2
     # via -r ./test.in
-lxml==5.2.2
+lxml==5.3.0
     # via
     #   -c ./base.txt
     #   label-studio-sdk
test_unstructured/nlp/test_tokenize.py

@@ -2,17 +2,11 @@ from typing import List, Tuple
 from unittest.mock import patch

 import nltk
-import pytest

 from test_unstructured.nlp.mock_nltk import mock_sent_tokenize, mock_word_tokenize
 from unstructured.nlp import tokenize


-def test_error_raised_on_nltk_download():
-    with pytest.raises(ValueError):
-        tokenize.nltk.download("tokenizers/punkt")
-
-
 def test_nltk_packages_download_if_not_present():
     with patch.object(nltk, "find", side_effect=LookupError):
         with patch.object(tokenize, "download_nltk_packages") as mock_download:
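The surviving test exercises the lazy-download path. A minimal sketch of that behavior with simplified control flow (`ensure_nltk_data` is a hypothetical name; the real logic is `_download_nltk_packages_if_not_present()` in the diff below):

```python
# Sketch of the check-then-download flow the test mocks out.
import nltk

from unstructured.nlp import tokenize


def ensure_nltk_data() -> None:
    try:
        # nltk raises LookupError when a resource is absent from every
        # nltk_data directory.
        nltk.data.find("tokenizers/punkt_tab")
        nltk.data.find("taggers/averaged_perceptron_tagger_eng")
    except LookupError:
        tokenize.download_nltk_packages()
```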
unstructured/__version__.py

@@ -1 +1 @@
-__version__ = "0.15.2-dev8"  # pragma: no cover
+__version__ = "0.15.2"  # pragma: no cover
unstructured/nlp/tokenize.py

@@ -7,7 +7,7 @@ import tarfile
 import tempfile
 import urllib.request
 from functools import lru_cache
-from typing import Any, Final, List, Tuple
+from typing import Final, List, Tuple

 import nltk
 from nltk import pos_tag as _pos_tag
@@ -16,15 +16,9 @@ from nltk import word_tokenize as _word_tokenize

 CACHE_MAX_SIZE: Final[int] = 128

-NLTK_DATA_URL = "https://utic-public-cf.s3.amazonaws.com/nltk_data.tgz"
-NLTK_DATA_SHA256 = "126faf671cd255a062c436b3d0f2d311dfeefcd92ffa43f7c3ab677309404d61"
-
-
-def _raise_on_nltk_download(*args: Any, **kwargs: Any):
-    raise ValueError("NLTK download disabled. See CVE-2024-39705")
-
-
-nltk.download = _raise_on_nltk_download
+NLTK_DATA_FILENAME = "nltk_data_3.8.2.tar.gz"
+NLTK_DATA_URL = f"https://utic-public-cf.s3.amazonaws.com/{NLTK_DATA_FILENAME}"
+NLTK_DATA_SHA256 = "ba2ca627c8fb1f1458c15d5a476377a5b664c19deeb99fd088ebf83e140c1663"


 # NOTE(robinson) - mimic default dir logic from NLTK
@@ -84,7 +78,7 @@ def download_nltk_packages():
         return sha256.hexdigest()

     with tempfile.TemporaryDirectory() as temp_dir_path:
-        tgz_file_path = os.path.join(temp_dir_path, "nltk_data.tgz")
+        tgz_file_path = os.path.join(temp_dir_path, NLTK_DATA_FILENAME)
         urllib.request.urlretrieve(NLTK_DATA_URL, tgz_file_path)

         file_hash = sha256_checksum(tgz_file_path)
@@ -120,10 +114,10 @@ def _download_nltk_packages_if_not_present():

     tagger_available = check_for_nltk_package(
         package_category="taggers",
-        package_name="averaged_perceptron_tagger",
+        package_name="averaged_perceptron_tagger_eng",
     )
     tokenizer_available = check_for_nltk_package(
-        package_category="tokenizers", package_name="punkt"
+        package_category="tokenizers", package_name="punkt_tab"
     )

     if not (tokenizer_available and tagger_available):
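The constants above pair a pinned URL with a SHA-256 digest. A self-contained sketch of that verify-before-extract pattern, using hypothetical names rather than this module's actual helpers:

```python
# Sketch: download a tarball and refuse to unpack it unless its SHA-256
# matches a pinned digest (the pattern download_nltk_packages() follows).
import hashlib
import os
import tarfile
import tempfile
import urllib.request


def fetch_verified_tarball(url: str, expected_sha256: str, dest_dir: str) -> None:
    with tempfile.TemporaryDirectory() as tmp_dir:
        tgz_path = os.path.join(tmp_dir, "bundle.tgz")
        urllib.request.urlretrieve(url, tgz_path)

        # Hash the file in chunks so large downloads stay memory-friendly.
        sha256 = hashlib.sha256()
        with open(tgz_path, "rb") as f:
            for chunk in iter(lambda: f.read(1 << 16), b""):
                sha256.update(chunk)

        if sha256.hexdigest() != expected_sha256:
            raise ValueError("SHA-256 mismatch; refusing to extract")

        with tarfile.open(tgz_path, "r:gz") as tar:
            tar.extractall(dest_dir)
```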